npm - @forwardimpact/libeval - Versions diffs - 0.1.50 → 0.1.51 - Mend

@forwardimpact/libeval 0.1.50 → 0.1.51

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38)

package/README.md +11 -8
package/bin/fit-benchmark.js +26 -27
package/bin/fit-eval.js +49 -30
package/bin/fit-trace.js +83 -57
package/package.json +1 -1
package/src/agent-runner.js +20 -12
package/src/benchmark/env-loader.js +35 -23
package/src/benchmark/{scorer.js → invariants.js} +14 -12
package/src/benchmark/judge.js +5 -8
package/src/benchmark/report.js +15 -15
package/src/benchmark/result.js +11 -11
package/src/benchmark/runner.js +11 -11
package/src/benchmark/task-family.js +6 -4
package/src/benchmark/workdir.js +18 -3
package/src/commands/assert.js +30 -22
package/src/commands/benchmark-invariants.js +74 -0
package/src/commands/benchmark-report.js +23 -15
package/src/commands/benchmark-run.js +15 -8
package/src/commands/by-discussion.js +29 -18
package/src/commands/callback.js +20 -11
package/src/commands/discuss.js +28 -11
package/src/commands/facilitate.js +18 -12
package/src/commands/output.js +11 -12
package/src/commands/run.js +22 -12
package/src/commands/supervise.js +27 -18
package/src/commands/task-input.js +10 -5
package/src/commands/trace.js +174 -97
package/src/discuss-tools.js +48 -2
package/src/discusser.js +49 -2
package/src/events/github.js +27 -5
package/src/inbox-poller.js +84 -0
package/src/judge.js +1 -1
package/src/message-bus.js +6 -0
package/src/orchestration-loop.js +14 -4
package/src/orchestration-toolkit.js +14 -0
package/src/redaction.js +31 -9
package/src/reply-emitter.js +47 -0
package/src/commands/benchmark-score.js +0 -68

package/src/benchmark/env-loader.js CHANGED Viewed

@@ -14,7 +14,6 @@
  * AND rendered (with resolved values) into the agent working directory.
  */
-import { readFile, writeFile } from "node:fs/promises";
 import { join } from "node:path";
 const ENV_FILES = [".env.local", ".env"];
@@ -48,12 +47,13 @@ export function parseEnvFile(content) {
 /**
  * Read and parse an env file, returning [] if the file does not exist.
+ * @param {object} fs - Async filesystem surface (`runtime.fs`).
  * @param {string} filePath
  * @returns {Promise<Array<{key: string, value: string}>>}
  */
-async function readEnvFile(filePath) {
+async function readEnvFile(fs, filePath) {
   try {
-    const content = await readFile(filePath, "utf8");
+    const content = await fs.readFile(filePath, "utf8");
     return parseEnvFile(content);
   } catch (e) {
     if (e.code === "ENOENT") return [];
@@ -62,32 +62,36 @@ async function readEnvFile(filePath) {
 }
 /**
- * Load entries into process.env. Existing keys are never overwritten.
+ * Load entries into the process env map. Existing keys are never overwritten.
+ * @param {Record<string, string|undefined>} env - The `runtime.proc.env` map.
  * @param {Array<{key: string, value: string}>} entries
  * @returns {string[]} var names that were loaded
  */
-function applyToProcessEnv(entries) {
+function applyToProcessEnv(env, entries) {
   const names = [];
   for (const { key, value } of entries) {
     names.push(key);
-    if (process.env[key] === undefined) {
-      process.env[key] = value;
+    if (env[key] === undefined) {
+      env[key] = value;
     }
   }
   return names;
 }
 /**
- * Load one env file: apply to process.env, record keys in the merged map.
+ * Load one env file: apply to the env map, record keys in the merged map.
+ * @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
  * @param {string} dir
  * @param {string} file
  * @param {Set<string>} names
  * @param {Map<string, Map<string, true>>} merged
  */
-async function loadOneEnvFile(dir, file, names, merged) {
-  const entries = await readEnvFile(join(dir, file));
+async function loadOneEnvFile(runtime, dir, file, names, merged) {
+  const entries = await readEnvFile(runtime.fs, join(dir, file));
   if (entries.length === 0) return;
-  for (const name of applyToProcessEnv(entries)) names.add(name);
+  for (const name of applyToProcessEnv(runtime.proc.env, entries)) {
+    names.add(name);
+  }
   if (!merged.has(file)) merged.set(file, new Map());
   const fileMap = merged.get(file);
   for (const { key } of entries) {
@@ -96,17 +100,18 @@ async function loadOneEnvFile(dir, file, names, merged) {
 }
 /**
- * Scan directories for env files, load into process.env, and collect
+ * Scan directories for env files, load into the env map, and collect
  * a merged key manifest per filename.
+ * @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
  * @param {string[]} dirs
  * @returns {Promise<{names: Set<string>, merged: Map<string, Map<string, true>>}>}
  */
-async function collectEnvEntries(dirs) {
+async function collectEnvEntries(runtime, dirs) {
   const names = new Set();
   const merged = new Map();
   for (const dir of dirs) {
     for (const file of ENV_FILES) {
-      await loadOneEnvFile(dir, file, names, merged);
+      await loadOneEnvFile(runtime, dir, file, names, merged);
     }
   }
   return { names, merged };
@@ -114,17 +119,22 @@ async function collectEnvEntries(dirs) {
 /**
  * Write resolved env files into the agent CWD and warn about empty values.
+ * @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
  * @param {Map<string, Map<string, true>>} merged
  * @param {string} agentCwd
  */
-async function renderEnvFiles(merged, agentCwd) {
+async function renderEnvFiles(runtime, merged, agentCwd) {
+  const env = runtime.proc.env;
   for (const [file, keyMap] of merged) {
     const keys = [...keyMap.keys()];
-    const resolved = keys.map((key) => `${key}=${process.env[key] ?? ""}`);
-    await writeFile(join(agentCwd, file), resolved.join("\n") + "\n");
-    const empty = keys.filter((key) => !process.env[key]);
+    const resolved = keys.map((key) => `${key}=${env[key] ?? ""}`);
+    await runtime.fs.writeFile(
+      join(agentCwd, file),
+      resolved.join("\n") + "\n",
+    );
+    const empty = keys.filter((key) => !env[key]);
     if (empty.length > 0) {
-      process.stderr.write(
+      runtime.proc.stderr.write(
         `libeval: env warning: ${file} declares vars with no value: ${empty.join(", ")}\n`,
       );
     }
@@ -133,14 +143,16 @@ async function renderEnvFiles(merged, agentCwd) {
 /**
  * Discover `.env` / `.env.local` in one or more directories, load them
- * into process.env, and render the resolved values into the agent CWD.
+ * into the process env map, and render the resolved values into the agent CWD.
  *
  * @param {string[]} dirs - Directories to scan (family root, task dir, etc.)
  * @param {string} agentCwd - Agent working directory to render into.
+ * @param {import("@forwardimpact/libutil/runtime").Runtime} runtime - Ambient
+ *   collaborators; uses `fs` (async read/write), `proc.env`, `proc.stderr`.
  * @returns {Promise<string[]>} All var names discovered (for redaction).
  */
-export async function loadEnv(dirs, agentCwd) {
-  const { names, merged } = await collectEnvEntries(dirs);
-  await renderEnvFiles(merged, agentCwd);
+export async function loadEnv(dirs, agentCwd, runtime) {
+  const { names, merged } = await collectEnvEntries(runtime, dirs);
+  await renderEnvFiles(runtime, merged, agentCwd);
   return [...names];
 }

package/src/benchmark/{scorer.js → invariants.js} RENAMED Viewed

@@ -1,7 +1,7 @@
 /**
- * Scorer — runs `<task.paths.hooks>/score.sh` from the template path against
- * the post-run agent CWD. The exit code is authoritative for the verdict;
- * structured per-test rows arrive on fd 3 (`$RESULTS_FD=3`) as NDJSON.
+ * Invariants — runs `<task.paths.hooks>/invariants.sh` from the template path
+ * against the post-run agent CWD. The exit code is authoritative for the
+ * verdict; structured per-check rows arrive on fd 3 (`$RESULTS_FD=3`) as NDJSON.
  */
 import { spawn } from "node:child_process";
@@ -15,31 +15,33 @@ import {
 import { join } from "node:path";
 /**
- * @typedef {object} ScoringResult
+ * @typedef {object} InvariantsResult
  * @property {"pass" | "fail"} verdict
  * @property {Array<object>} details
  * @property {number} exitCode
  */
 /**
- * Run the task's scoring script.
+ * Run the task's invariants script.
  * @param {import("./task-family.js").Task} task
  * @param {{cwd: string, port: number, runDir: string}} ctx
- * @returns {Promise<ScoringResult>}
+ * @returns {Promise<InvariantsResult>}
  */
-export function runScoring(task, ctx) {
-  if (!task.paths.score) {
+export function runInvariants(task, ctx) {
+  if (!task.paths.invariants) {
     return Promise.resolve({ verdict: "pass", details: [], exitCode: 0 });
   }
   return new Promise((res, rej) => {
-    const script = task.paths.score;
-    const stderrLog = createWriteStream(join(ctx.runDir, "scoring.stderr.log"));
+    const script = task.paths.invariants;
+    const stderrLog = createWriteStream(
+      join(ctx.runDir, "invariants.stderr.log"),
+    );
     // Bun's child_process pipe setup for fd >= 3 is racy under load (it
     // creates a unix socket pair and the connect() can return ENOENT). Use
     // a temp file as the fd-3 backing store instead — the script still
     // writes via `$RESULTS_FD`, but we hand it a real file descriptor.
-    const fd3Path = join(ctx.runDir, "scoring.fd3.ndjson");
+    const fd3Path = join(ctx.runDir, "invariants.fd3.ndjson");
     let fd3File;
     try {
       fd3File = openSync(fd3Path, "w+");
@@ -63,7 +65,7 @@ export function runScoring(task, ctx) {
       } catch {
         // already closed
       }
-      rej(new Error(`failed to spawn scoring script: ${script}`));
+      rej(new Error(`failed to spawn invariants script: ${script}`));
       return;
     }

package/src/benchmark/judge.js CHANGED Viewed

@@ -9,13 +9,11 @@
  *   {{AGENT_INSTRUCTIONS}}  — contents of agent.task.md
  *   {{AGENT_PROFILE}}       — agent profile body (empty string if none)
  *   {{AGENT_TRACE_PATH}}    — path to agent.ndjson
- *   {{SCORING_RESULT}}      — JSON scoring object
+ *   {{INVARIANTS_RESULT}}   — JSON invariants object
  *   {{SKILL_SET_HASH}}      — SHA-256 from apm.lock.yaml
  *   {{TASK_ID}}             — task name (directory under tasks/)
  *   {{TASK_DIR}}            — agent working directory path
  *
- * Legacy alias: {{SCORING}} is accepted as an alias for {{SCORING_RESULT}}.
- *
  * The judge verdict is captured from the orchestration context's
  * `concluded` flag directly — no trace parsing on the happy path.
  * `parseConcludeFromTrace` is preserved for offline analysis and as a
@@ -46,17 +44,16 @@ import { createRedactor } from "../redaction.js";
  * Run the judge over a completed task run.
  * @param {import("./task-family.js").Task} task
  * @param {import("./workdir.js").Workdir} workdir
- * @param {import("./scorer.js").ScoringResult} scoring
+ * @param {import("./invariants.js").InvariantsResult} invariants
  * @param {{query: Function, model: string, judgeProfile?: string, profilesDir?: string}} deps
  * @param {JudgeContext} [context]
  * @returns {Promise<JudgeVerdict>}
  */
-export async function runJudge(task, workdir, scoring, deps, context) {
+export async function runJudge(task, workdir, invariants, deps, context) {
   const template = await readFile(task.paths.judge, "utf8");
-  const scoringJson = JSON.stringify(scoring, null, 2);
+  const invariantsJson = JSON.stringify(invariants, null, 2);
   const taskText = template
-    .replaceAll("{{SCORING_RESULT}}", scoringJson)
-    .replaceAll("{{SCORING}}", scoringJson)
+    .replaceAll("{{INVARIANTS_RESULT}}", invariantsJson)
     .replaceAll("{{AGENT_TRACE_PATH}}", workdir.agentTracePath)
     .replaceAll("{{AGENT_INSTRUCTIONS}}", context?.agentInstructions ?? "")
     .replaceAll("{{AGENT_PROFILE}}", context?.agentProfile ?? "")

package/src/benchmark/report.js CHANGED Viewed

@@ -3,7 +3,7 @@
  * records by `taskId`, and compute pass@k via the OpenAI HumanEval
  * unbiased estimator: `1 - C(n-c, k) / C(n, k)`.
  *
- * When `includeRuns` is true, each task carries per-run detail (scoring
+ * When `includeRuns` is true, each task carries per-run detail (invariant
  * checks, judge commentary, cost, duration) and the text renderer produces
  * a full markdown report instead of just the pass@k table.
  *
@@ -22,7 +22,7 @@ import { validateResultRecord } from "./result.js";
  * @typedef {object} RunDetail
  * @property {number} runIndex
  * @property {"pass"|"fail"} verdict
- * @property {{verdict: string, details: unknown[], exitCode: number}} [scoring]
+ * @property {{verdict: string, details: unknown[], exitCode: number}} [invariants]
  * @property {{verdict: string, summary: string}} [judgeVerdict]
  * @property {number} costUsd
  * @property {number} turns
@@ -112,7 +112,7 @@ function buildRunDetail(r, acc) {
   return {
     runIndex: r.runIndex,
     verdict: r.verdict,
-    ...(r.scoring && { scoring: r.scoring }),
+    ...(r.invariants && { invariants: r.invariants }),
     ...(r.judgeVerdict && { judgeVerdict: r.judgeVerdict }),
     costUsd: r.costUsd ?? 0,
     turns: r.turns ?? 0,
@@ -262,7 +262,7 @@ function renderTaskDetail(task) {
   lines.push("", renderRunsTable(runs));
-  const checks = renderScoringChecks(runs, singleRun);
+  const checks = renderInvariantChecks(runs, singleRun);
   if (checks) lines.push("", checks);
   const commentary = renderJudgeCommentary(runs, singleRun);
@@ -278,7 +278,7 @@ function renderRunsTable(runs) {
   const header = [
     "Run",
     "Verdict",
-    "Scoring",
+    "Invariants",
     "Judge",
     "Cost",
     "Turns",
@@ -286,10 +286,10 @@ function renderRunsTable(runs) {
   ];
   const rows = [header, header.map(() => "---")];
   for (const r of runs) {
-    const scoringCell = r.preflightError
+    const invariantsCell = r.preflightError
       ? "preflight error"
-      : r.scoring
-        ? statusIcon(r.scoring.verdict === "pass")
+      : r.invariants
+        ? statusIcon(r.invariants.verdict === "pass")
         : "—";
     const judgeCell = r.preflightError
       ? "—"
@@ -299,7 +299,7 @@ function renderRunsTable(runs) {
     rows.push([
       String(r.runIndex),
       statusIcon(r.verdict === "pass"),
-      scoringCell,
+      invariantsCell,
       judgeCell,
       formatCost(r.costUsd),
       String(r.turns),
@@ -309,15 +309,15 @@ function renderRunsTable(runs) {
   return rows.map((r) => `| ${r.join(" | ")} |`).join("\n");
 }
-function renderScoringChecks(runs, singleRun) {
-  const rows = collectScoringRows(runs);
+function renderInvariantChecks(runs, singleRun) {
+  const rows = collectInvariantRows(runs);
   if (!rows.length) return null;
   const header = singleRun
     ? ["Check", "Result", "Message"]
     : ["Run", "Check", "Result", "Message"];
   const lines = [
-    "#### Scoring Checks",
+    "#### Invariant Checks",
     "",
     `| ${header.join(" | ")} |`,
     `| ${header.map(() => "---").join(" | ")} |`,
@@ -331,11 +331,11 @@ function renderScoringChecks(runs, singleRun) {
   return lines.join("\n");
 }
-function collectScoringRows(runs) {
+function collectInvariantRows(runs) {
   const rows = [];
   for (const r of runs) {
-    if (!r.scoring?.details?.length) continue;
-    for (const d of r.scoring.details) {
+    if (!r.invariants?.details?.length) continue;
+    for (const d of r.invariants.details) {
       rows.push({
         run: r.runIndex,
         check: escapeCell(String(d.test ?? "(unnamed)")),

package/src/benchmark/result.js CHANGED Viewed

@@ -3,10 +3,10 @@
  *
  * Two schemas live here:
  *   - RESULT_RECORD_SCHEMA — one record per (task, runIndex) from a full
- *     benchmark run. Has a happy branch (scoring + judge present) and a
- *     pre-flight-failure branch (scoring/judgeVerdict/submission absent).
- *   - SCORING_RECORD_SCHEMA — narrower output of `benchmark-score` (P7):
- *     ad-hoc grading without a full lifecycle.
+ *     benchmark run. Has a happy branch (invariants + judge present) and a
+ *     pre-flight-failure branch (invariants/judgeVerdict/submission absent).
+ *   - INVARIANTS_RECORD_SCHEMA — narrower output of `benchmark-invariants`
+ *     (P7): ad-hoc grading without a full lifecycle.
  *
  * Validation is throw-on-mismatch so the runner can wrap every JSONL append
  * in a guard and reject schema drift at write time.
@@ -16,7 +16,7 @@ import { z } from "zod";
 const VERDICT_ENUM = z.enum(["pass", "fail"]);
-const SCORING_SHAPE = z.object({
+const INVARIANTS_SHAPE = z.object({
   verdict: VERDICT_ENUM,
   details: z.array(z.unknown()),
   exitCode: z.number().int(),
@@ -63,7 +63,7 @@ const AGENT_ERROR_SHAPE = z.object({
 const HAPPY_RECORD = z.object({
   ...COMMON_FIELDS,
-  scoring: SCORING_SHAPE,
+  invariants: INVARIANTS_SHAPE,
   submission: z.string(),
   judgeVerdict: JUDGE_VERDICT_SHAPE.optional(),
   agentTracePath: z.string(),
@@ -83,7 +83,7 @@ const PREFLIGHT_RECORD = z.object({
   agentTracePath: z.string(),
   supervisorTracePath: z.string(),
   judgeTracePath: z.string(),
-  scoring: z.undefined().optional(),
+  invariants: z.undefined().optional(),
   submission: z.undefined().optional(),
   judgeVerdict: z.undefined().optional(),
   agentError: z.undefined().optional(),
@@ -91,9 +91,9 @@ const PREFLIGHT_RECORD = z.object({
 export const RESULT_RECORD_SCHEMA = z.union([HAPPY_RECORD, PREFLIGHT_RECORD]);
-export const SCORING_RECORD_SCHEMA = z.object({
+export const INVARIANTS_RECORD_SCHEMA = z.object({
   taskId: z.string().min(1),
-  scoring: SCORING_SHAPE,
+  invariants: INVARIANTS_SHAPE,
   exitCode: z.number().int(),
 });
@@ -109,6 +109,6 @@ export function validateResultRecord(record) {
  * Throw on schema mismatch.
  * @param {object} record
  */
-export function validateScoringRecord(record) {
-  SCORING_RECORD_SCHEMA.parse(record);
+export function validateInvariantsRecord(record) {
+  INVARIANTS_RECORD_SCHEMA.parse(record);
 }

package/src/benchmark/runner.js CHANGED Viewed

@@ -4,7 +4,7 @@
  * Phases per (task, runIndex):
  *   1. WorkdirManager.start → seed CWD + run pre-flight probe
  *   2. Supervisor session (agent + supervisor) → produce traces + submission
- *   3. Scorer.runScoring → exit-code-driven verdict via fd-3 NDJSON
+ *   3. Invariants.runInvariants → exit-code-driven verdict via fd-3 NDJSON
  *   4. Judge.runJudge → Conclude-driven verdict mapped to pass/fail
  *   5. WorkdirManager.teardown → process-group cleanup
  *
@@ -25,7 +25,7 @@ import { installApm as defaultInstallApm } from "./apm-installer.js";
 import { installNpm as defaultInstallNpm } from "./npm-installer.js";
 import { runJudge } from "./judge.js";
 import { validateResultRecord } from "./result.js";
-import { runScoring } from "./scorer.js";
+import { runInvariants } from "./invariants.js";
 import { assertJudgeProfileStaged, loadTaskFamily } from "./task-family.js";
 import { createWorkdirManager } from "./workdir.js";
@@ -60,10 +60,10 @@ export class BenchmarkRunner {
    *   write a valid NDJSON trace to `workdir.agentTracePath`. Default uses
    *   `createAgentRunner` with the harness `BASE_TOOLS` allowlist. Internal
    *   testing only — not part of the public API.
-   * @param {Function} [opts.runScoring] - Test seam: replaces `runScoring`.
-   *   Same contract as `runScoring(task, ctx)`. Internal testing only.
+   * @param {Function} [opts.runInvariants] - Test seam: replaces `runInvariants`.
+   *   Same contract as `runInvariants(task, ctx)`. Internal testing only.
    * @param {Function} [opts.runJudge] - Test seam: replaces `runJudge`. Same
-   *   contract as `runJudge(task, workdir, scoring, deps)`. Internal testing
+   *   contract as `runJudge(task, workdir, invariants, deps)`. Internal testing
    *   only.
    * @param {Function} [opts.installApm] - Test seam: replaces `installApm`.
    *   Same contract as `installApm(family, outputDir)`. Lets tests inject a
@@ -86,7 +86,7 @@ export class BenchmarkRunner {
     termGraceMs,
     // Test seams — default to the real implementations.
     runAgent,
-    runScoring: runScoringHook,
+    runInvariants: runInvariantsHook,
     runJudge: runJudgeHook,
     installApm: installApmHook,
     installNpm: installNpmHook,
@@ -112,7 +112,7 @@ export class BenchmarkRunner {
     this.maxTurns = maxTurns;
     this.termGraceMs = termGraceMs;
     this._runAgentHook = runAgent ?? null;
-    this._runScoringHook = runScoringHook ?? runScoring;
+    this._runInvariantsHook = runInvariantsHook ?? runInvariants;
     this._runJudgeHook = runJudgeHook ?? runJudge;
     this._installApmHook = installApmHook ?? defaultInstallApm;
     this._installNpmHook = installNpmHook ?? defaultInstallNpm;
@@ -191,7 +191,7 @@ export class BenchmarkRunner {
       }
       const { costUsd, turns, submission, agentError } =
         await this.#runAgentSafe(task, workdir);
-      const scoring = await this._runScoringHook(task, {
+      const invariants = await this._runInvariantsHook(task, {
         cwd: workdir.cwd,
         port: workdir.port,
         runDir: workdir.runDir,
@@ -206,7 +206,7 @@ export class BenchmarkRunner {
         judgeVerdict = await this._runJudgeHook(
           task,
           workdir,
-          scoring,
+          invariants,
           {
             query: this.query,
             model: this.judgeModel,
@@ -217,7 +217,7 @@ export class BenchmarkRunner {
         );
       }
       const verdict =
-        scoring.verdict === "pass" &&
+        invariants.verdict === "pass" &&
         (judgeVerdict === null || judgeVerdict.verdict === "pass")
           ? "pass"
           : "fail";
@@ -225,7 +225,7 @@ export class BenchmarkRunner {
         taskId: task.id,
         runIndex,
         verdict,
-        scoring,
+        invariants,
         submission,
         ...(judgeVerdict && { judgeVerdict }),
         costUsd,

package/src/benchmark/task-family.js CHANGED Viewed

@@ -9,7 +9,7 @@
  *       judge.task.md
  *       hooks/                # harness-only; never copied to agent CWD
  *         preflight.sh
- *         score.sh
+ *         invariants.sh
  *       specs/                # copied into agent CWD
  *       workdir/              # copied into agent CWD
  *
@@ -104,7 +104,7 @@ async function discoverTasks(rootPath) {
     const supervisorPath = join(taskDir, "supervisor.task.md");
     const judgePath = join(taskDir, "judge.task.md");
     const preflightPath = join(taskDir, "hooks", "preflight.sh");
-    const scorePath = join(taskDir, "hooks", "score.sh");
+    const invariantsPath = join(taskDir, "hooks", "invariants.sh");
     tasks.push({
       id: entry.name,
       paths: {
@@ -114,7 +114,9 @@ async function discoverTasks(rootPath) {
         judge: (await fileExists(judgePath)) ? judgePath : null,
         hooks: join(taskDir, "hooks"),
         preflight: (await fileExecutable(preflightPath)) ? preflightPath : null,
-        score: (await fileExecutable(scorePath)) ? scorePath : null,
+        invariants: (await fileExecutable(invariantsPath))
+          ? invariantsPath
+          : null,
         specs: join(taskDir, "specs"),
         workdir: join(taskDir, "workdir"),
       },
@@ -236,7 +238,7 @@ function run(cmd, args) {
 /**
  * @typedef {object} Task
  * @property {string} id - Task name (directory name under tasks/)
- * @property {{taskDir: string, instructions: string, supervisor: string|null, judge: string|null, hooks: string, preflight: string|null, score: string|null, specs: string, workdir: string}} paths
+ * @property {{taskDir: string, instructions: string, supervisor: string|null, judge: string|null, hooks: string, preflight: string|null, invariants: string|null, specs: string, workdir: string}} paths
  */
 /**

package/src/benchmark/workdir.js CHANGED Viewed

@@ -4,7 +4,7 @@
  * the pre-flight smoke probe, and tear down the process group at end of run.
  *
  * The Workdir handle threads `cwd`, `port`, `pgid`, and trace paths through
- * runAgent → score → judge → teardown.
+ * runAgent → invariants → judge → teardown.
  */
 import { spawn } from "node:child_process";
@@ -13,6 +13,8 @@ import { createServer } from "node:net";
 import { connect } from "node:net";
 import { join } from "node:path";
+import { createDefaultRuntime } from "@forwardimpact/libutil/runtime";
 import { loadEnv } from "./env-loader.js";
 const DEFAULT_TERM_GRACE_MS = 5_000;
@@ -38,13 +40,23 @@ export class WorkdirManager {
    * @param {string} deps.stagingDir - Output of `installApm(...)`.
    * @param {string} deps.runOutputDir - Root run-output directory (parent of `runs/`).
    */
-  constructor({ stagingDir, runOutputDir, termGraceMs, familyRootPath }) {
+  constructor({
+    stagingDir,
+    runOutputDir,
+    termGraceMs,
+    familyRootPath,
+    runtime,
+  }) {
     if (!stagingDir) throw new Error("stagingDir is required");
     if (!runOutputDir) throw new Error("runOutputDir is required");
     this.stagingDir = stagingDir;
     this.runOutputDir = runOutputDir;
     this.termGraceMs = termGraceMs ?? DEFAULT_TERM_GRACE_MS;
     this.familyRootPath = familyRootPath ?? null;
+    // `loadEnv` is the only collaborator routed through the runtime today; the
+    // rest of this manager still uses raw streaming/net/process-group APIs the
+    // runtime surface does not yet cover.
+    this.runtime = runtime ?? null;
   }
   /**
@@ -80,7 +92,10 @@ export class WorkdirManager {
       ...(this.familyRootPath ? [this.familyRootPath] : []),
       ...(task.paths.taskDir ? [task.paths.taskDir] : []),
     ];
-    const envNames = envDirs.length > 0 ? await loadEnv(envDirs, cwd) : [];
+    const envNames =
+      envDirs.length > 0
+        ? await loadEnv(envDirs, cwd, this.runtime ?? createDefaultRuntime())
+        : [];
     const port = await allocatePort();
     const agentTracePath = join(runDir, "agent.ndjson");