@forwardimpact/libeval 0.1.50 → 0.1.51

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38):
  1. package/README.md +11 -8
  2. package/bin/fit-benchmark.js +26 -27
  3. package/bin/fit-eval.js +49 -30
  4. package/bin/fit-trace.js +83 -57
  5. package/package.json +1 -1
  6. package/src/agent-runner.js +20 -12
  7. package/src/benchmark/env-loader.js +35 -23
  8. package/src/benchmark/{scorer.js → invariants.js} +14 -12
  9. package/src/benchmark/judge.js +5 -8
  10. package/src/benchmark/report.js +15 -15
  11. package/src/benchmark/result.js +11 -11
  12. package/src/benchmark/runner.js +11 -11
  13. package/src/benchmark/task-family.js +6 -4
  14. package/src/benchmark/workdir.js +18 -3
  15. package/src/commands/assert.js +30 -22
  16. package/src/commands/benchmark-invariants.js +74 -0
  17. package/src/commands/benchmark-report.js +23 -15
  18. package/src/commands/benchmark-run.js +15 -8
  19. package/src/commands/by-discussion.js +29 -18
  20. package/src/commands/callback.js +20 -11
  21. package/src/commands/discuss.js +28 -11
  22. package/src/commands/facilitate.js +18 -12
  23. package/src/commands/output.js +11 -12
  24. package/src/commands/run.js +22 -12
  25. package/src/commands/supervise.js +27 -18
  26. package/src/commands/task-input.js +10 -5
  27. package/src/commands/trace.js +174 -97
  28. package/src/discuss-tools.js +48 -2
  29. package/src/discusser.js +49 -2
  30. package/src/events/github.js +27 -5
  31. package/src/inbox-poller.js +84 -0
  32. package/src/judge.js +1 -1
  33. package/src/message-bus.js +6 -0
  34. package/src/orchestration-loop.js +14 -4
  35. package/src/orchestration-toolkit.js +14 -0
  36. package/src/redaction.js +31 -9
  37. package/src/reply-emitter.js +47 -0
  38. package/src/commands/benchmark-score.js +0 -68
@@ -14,7 +14,6 @@
14
14
  * AND rendered (with resolved values) into the agent working directory.
15
15
  */
16
16
 
17
- import { readFile, writeFile } from "node:fs/promises";
18
17
  import { join } from "node:path";
19
18
 
20
19
  const ENV_FILES = [".env.local", ".env"];
@@ -48,12 +47,13 @@ export function parseEnvFile(content) {
48
47
 
49
48
  /**
50
49
  * Read and parse an env file, returning [] if the file does not exist.
50
+ * @param {object} fs - Async filesystem surface (`runtime.fs`).
51
51
  * @param {string} filePath
52
52
  * @returns {Promise<Array<{key: string, value: string}>>}
53
53
  */
54
- async function readEnvFile(filePath) {
54
+ async function readEnvFile(fs, filePath) {
55
55
  try {
56
- const content = await readFile(filePath, "utf8");
56
+ const content = await fs.readFile(filePath, "utf8");
57
57
  return parseEnvFile(content);
58
58
  } catch (e) {
59
59
  if (e.code === "ENOENT") return [];
@@ -62,32 +62,36 @@ async function readEnvFile(filePath) {
62
62
  }
63
63
 
64
64
  /**
65
- * Load entries into process.env. Existing keys are never overwritten.
65
+ * Load entries into the process env map. Existing keys are never overwritten.
66
+ * @param {Record<string, string|undefined>} env - The `runtime.proc.env` map.
66
67
  * @param {Array<{key: string, value: string}>} entries
67
68
  * @returns {string[]} var names that were loaded
68
69
  */
69
- function applyToProcessEnv(entries) {
70
+ function applyToProcessEnv(env, entries) {
70
71
  const names = [];
71
72
  for (const { key, value } of entries) {
72
73
  names.push(key);
73
- if (process.env[key] === undefined) {
74
- process.env[key] = value;
74
+ if (env[key] === undefined) {
75
+ env[key] = value;
75
76
  }
76
77
  }
77
78
  return names;
78
79
  }
79
80
 
80
81
  /**
81
- * Load one env file: apply to process.env, record keys in the merged map.
82
+ * Load one env file: apply to the env map, record keys in the merged map.
83
+ * @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
82
84
  * @param {string} dir
83
85
  * @param {string} file
84
86
  * @param {Set<string>} names
85
87
  * @param {Map<string, Map<string, true>>} merged
86
88
  */
87
- async function loadOneEnvFile(dir, file, names, merged) {
88
- const entries = await readEnvFile(join(dir, file));
89
+ async function loadOneEnvFile(runtime, dir, file, names, merged) {
90
+ const entries = await readEnvFile(runtime.fs, join(dir, file));
89
91
  if (entries.length === 0) return;
90
- for (const name of applyToProcessEnv(entries)) names.add(name);
92
+ for (const name of applyToProcessEnv(runtime.proc.env, entries)) {
93
+ names.add(name);
94
+ }
91
95
  if (!merged.has(file)) merged.set(file, new Map());
92
96
  const fileMap = merged.get(file);
93
97
  for (const { key } of entries) {
@@ -96,17 +100,18 @@ async function loadOneEnvFile(dir, file, names, merged) {
96
100
  }
97
101
 
98
102
  /**
99
- * Scan directories for env files, load into process.env, and collect
103
+ * Scan directories for env files, load into the env map, and collect
100
104
  * a merged key manifest per filename.
105
+ * @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
101
106
  * @param {string[]} dirs
102
107
  * @returns {Promise<{names: Set<string>, merged: Map<string, Map<string, true>>}>}
103
108
  */
104
- async function collectEnvEntries(dirs) {
109
+ async function collectEnvEntries(runtime, dirs) {
105
110
  const names = new Set();
106
111
  const merged = new Map();
107
112
  for (const dir of dirs) {
108
113
  for (const file of ENV_FILES) {
109
- await loadOneEnvFile(dir, file, names, merged);
114
+ await loadOneEnvFile(runtime, dir, file, names, merged);
110
115
  }
111
116
  }
112
117
  return { names, merged };
@@ -114,17 +119,22 @@ async function collectEnvEntries(dirs) {
114
119
 
115
120
  /**
116
121
  * Write resolved env files into the agent CWD and warn about empty values.
122
+ * @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
117
123
  * @param {Map<string, Map<string, true>>} merged
118
124
  * @param {string} agentCwd
119
125
  */
120
- async function renderEnvFiles(merged, agentCwd) {
126
+ async function renderEnvFiles(runtime, merged, agentCwd) {
127
+ const env = runtime.proc.env;
121
128
  for (const [file, keyMap] of merged) {
122
129
  const keys = [...keyMap.keys()];
123
- const resolved = keys.map((key) => `${key}=${process.env[key] ?? ""}`);
124
- await writeFile(join(agentCwd, file), resolved.join("\n") + "\n");
125
- const empty = keys.filter((key) => !process.env[key]);
130
+ const resolved = keys.map((key) => `${key}=${env[key] ?? ""}`);
131
+ await runtime.fs.writeFile(
132
+ join(agentCwd, file),
133
+ resolved.join("\n") + "\n",
134
+ );
135
+ const empty = keys.filter((key) => !env[key]);
126
136
  if (empty.length > 0) {
127
- process.stderr.write(
137
+ runtime.proc.stderr.write(
128
138
  `libeval: env warning: ${file} declares vars with no value: ${empty.join(", ")}\n`,
129
139
  );
130
140
  }
@@ -133,14 +143,16 @@ async function renderEnvFiles(merged, agentCwd) {
133
143
 
134
144
  /**
135
145
  * Discover `.env` / `.env.local` in one or more directories, load them
136
- * into process.env, and render the resolved values into the agent CWD.
146
+ * into the process env map, and render the resolved values into the agent CWD.
137
147
  *
138
148
  * @param {string[]} dirs - Directories to scan (family root, task dir, etc.)
139
149
  * @param {string} agentCwd - Agent working directory to render into.
150
+ * @param {import("@forwardimpact/libutil/runtime").Runtime} runtime - Ambient
151
+ * collaborators; uses `fs` (async read/write), `proc.env`, `proc.stderr`.
140
152
  * @returns {Promise<string[]>} All var names discovered (for redaction).
141
153
  */
142
- export async function loadEnv(dirs, agentCwd) {
143
- const { names, merged } = await collectEnvEntries(dirs);
144
- await renderEnvFiles(merged, agentCwd);
154
+ export async function loadEnv(dirs, agentCwd, runtime) {
155
+ const { names, merged } = await collectEnvEntries(runtime, dirs);
156
+ await renderEnvFiles(runtime, merged, agentCwd);
145
157
  return [...names];
146
158
  }
@@ -1,7 +1,7 @@
1
1
  /**
2
- * Scorer — runs `<task.paths.hooks>/score.sh` from the template path against
3
- * the post-run agent CWD. The exit code is authoritative for the verdict;
4
- * structured per-test rows arrive on fd 3 (`$RESULTS_FD=3`) as NDJSON.
2
+ * Invariants — runs `<task.paths.hooks>/invariants.sh` from the template path
3
+ * against the post-run agent CWD. The exit code is authoritative for the
4
+ * verdict; structured per-check rows arrive on fd 3 (`$RESULTS_FD=3`) as NDJSON.
5
5
  */
6
6
 
7
7
  import { spawn } from "node:child_process";
@@ -15,31 +15,33 @@ import {
15
15
  import { join } from "node:path";
16
16
 
17
17
  /**
18
- * @typedef {object} ScoringResult
18
+ * @typedef {object} InvariantsResult
19
19
  * @property {"pass" | "fail"} verdict
20
20
  * @property {Array<object>} details
21
21
  * @property {number} exitCode
22
22
  */
23
23
 
24
24
  /**
25
- * Run the task's scoring script.
25
+ * Run the task's invariants script.
26
26
  * @param {import("./task-family.js").Task} task
27
27
  * @param {{cwd: string, port: number, runDir: string}} ctx
28
- * @returns {Promise<ScoringResult>}
28
+ * @returns {Promise<InvariantsResult>}
29
29
  */
30
- export function runScoring(task, ctx) {
31
- if (!task.paths.score) {
30
+ export function runInvariants(task, ctx) {
31
+ if (!task.paths.invariants) {
32
32
  return Promise.resolve({ verdict: "pass", details: [], exitCode: 0 });
33
33
  }
34
34
  return new Promise((res, rej) => {
35
- const script = task.paths.score;
36
- const stderrLog = createWriteStream(join(ctx.runDir, "scoring.stderr.log"));
35
+ const script = task.paths.invariants;
36
+ const stderrLog = createWriteStream(
37
+ join(ctx.runDir, "invariants.stderr.log"),
38
+ );
37
39
 
38
40
  // Bun's child_process pipe setup for fd >= 3 is racy under load (it
39
41
  // creates a unix socket pair and the connect() can return ENOENT). Use
40
42
  // a temp file as the fd-3 backing store instead — the script still
41
43
  // writes via `$RESULTS_FD`, but we hand it a real file descriptor.
42
- const fd3Path = join(ctx.runDir, "scoring.fd3.ndjson");
44
+ const fd3Path = join(ctx.runDir, "invariants.fd3.ndjson");
43
45
  let fd3File;
44
46
  try {
45
47
  fd3File = openSync(fd3Path, "w+");
@@ -63,7 +65,7 @@ export function runScoring(task, ctx) {
63
65
  } catch {
64
66
  // already closed
65
67
  }
66
- rej(new Error(`failed to spawn scoring script: ${script}`));
68
+ rej(new Error(`failed to spawn invariants script: ${script}`));
67
69
  return;
68
70
  }
69
71
 
@@ -9,13 +9,11 @@
9
9
  * {{AGENT_INSTRUCTIONS}} — contents of agent.task.md
10
10
  * {{AGENT_PROFILE}} — agent profile body (empty string if none)
11
11
  * {{AGENT_TRACE_PATH}} — path to agent.ndjson
12
- * {{SCORING_RESULT}} — JSON scoring object
12
+ * {{INVARIANTS_RESULT}} — JSON invariants object
13
13
  * {{SKILL_SET_HASH}} — SHA-256 from apm.lock.yaml
14
14
  * {{TASK_ID}} — task name (directory under tasks/)
15
15
  * {{TASK_DIR}} — agent working directory path
16
16
  *
17
- * Legacy alias: {{SCORING}} is accepted as an alias for {{SCORING_RESULT}}.
18
- *
19
17
  * The judge verdict is captured from the orchestration context's
20
18
  * `concluded` flag directly — no trace parsing on the happy path.
21
19
  * `parseConcludeFromTrace` is preserved for offline analysis and as a
@@ -46,17 +44,16 @@ import { createRedactor } from "../redaction.js";
46
44
  * Run the judge over a completed task run.
47
45
  * @param {import("./task-family.js").Task} task
48
46
  * @param {import("./workdir.js").Workdir} workdir
49
- * @param {import("./scorer.js").ScoringResult} scoring
47
+ * @param {import("./invariants.js").InvariantsResult} invariants
50
48
  * @param {{query: Function, model: string, judgeProfile?: string, profilesDir?: string}} deps
51
49
  * @param {JudgeContext} [context]
52
50
  * @returns {Promise<JudgeVerdict>}
53
51
  */
54
- export async function runJudge(task, workdir, scoring, deps, context) {
52
+ export async function runJudge(task, workdir, invariants, deps, context) {
55
53
  const template = await readFile(task.paths.judge, "utf8");
56
- const scoringJson = JSON.stringify(scoring, null, 2);
54
+ const invariantsJson = JSON.stringify(invariants, null, 2);
57
55
  const taskText = template
58
- .replaceAll("{{SCORING_RESULT}}", scoringJson)
59
- .replaceAll("{{SCORING}}", scoringJson)
56
+ .replaceAll("{{INVARIANTS_RESULT}}", invariantsJson)
60
57
  .replaceAll("{{AGENT_TRACE_PATH}}", workdir.agentTracePath)
61
58
  .replaceAll("{{AGENT_INSTRUCTIONS}}", context?.agentInstructions ?? "")
62
59
  .replaceAll("{{AGENT_PROFILE}}", context?.agentProfile ?? "")
@@ -3,7 +3,7 @@
3
3
  * records by `taskId`, and compute pass@k via the OpenAI HumanEval
4
4
  * unbiased estimator: `1 - C(n-c, k) / C(n, k)`.
5
5
  *
6
- * When `includeRuns` is true, each task carries per-run detail (scoring
6
+ * When `includeRuns` is true, each task carries per-run detail (invariant
7
7
  * checks, judge commentary, cost, duration) and the text renderer produces
8
8
  * a full markdown report instead of just the pass@k table.
9
9
  *
@@ -22,7 +22,7 @@ import { validateResultRecord } from "./result.js";
22
22
  * @typedef {object} RunDetail
23
23
  * @property {number} runIndex
24
24
  * @property {"pass"|"fail"} verdict
25
- * @property {{verdict: string, details: unknown[], exitCode: number}} [scoring]
25
+ * @property {{verdict: string, details: unknown[], exitCode: number}} [invariants]
26
26
  * @property {{verdict: string, summary: string}} [judgeVerdict]
27
27
  * @property {number} costUsd
28
28
  * @property {number} turns
@@ -112,7 +112,7 @@ function buildRunDetail(r, acc) {
112
112
  return {
113
113
  runIndex: r.runIndex,
114
114
  verdict: r.verdict,
115
- ...(r.scoring && { scoring: r.scoring }),
115
+ ...(r.invariants && { invariants: r.invariants }),
116
116
  ...(r.judgeVerdict && { judgeVerdict: r.judgeVerdict }),
117
117
  costUsd: r.costUsd ?? 0,
118
118
  turns: r.turns ?? 0,
@@ -262,7 +262,7 @@ function renderTaskDetail(task) {
262
262
 
263
263
  lines.push("", renderRunsTable(runs));
264
264
 
265
- const checks = renderScoringChecks(runs, singleRun);
265
+ const checks = renderInvariantChecks(runs, singleRun);
266
266
  if (checks) lines.push("", checks);
267
267
 
268
268
  const commentary = renderJudgeCommentary(runs, singleRun);
@@ -278,7 +278,7 @@ function renderRunsTable(runs) {
278
278
  const header = [
279
279
  "Run",
280
280
  "Verdict",
281
- "Scoring",
281
+ "Invariants",
282
282
  "Judge",
283
283
  "Cost",
284
284
  "Turns",
@@ -286,10 +286,10 @@ function renderRunsTable(runs) {
286
286
  ];
287
287
  const rows = [header, header.map(() => "---")];
288
288
  for (const r of runs) {
289
- const scoringCell = r.preflightError
289
+ const invariantsCell = r.preflightError
290
290
  ? "preflight error"
291
- : r.scoring
292
- ? statusIcon(r.scoring.verdict === "pass")
291
+ : r.invariants
292
+ ? statusIcon(r.invariants.verdict === "pass")
293
293
  : "—";
294
294
  const judgeCell = r.preflightError
295
295
  ? "—"
@@ -299,7 +299,7 @@ function renderRunsTable(runs) {
299
299
  rows.push([
300
300
  String(r.runIndex),
301
301
  statusIcon(r.verdict === "pass"),
302
- scoringCell,
302
+ invariantsCell,
303
303
  judgeCell,
304
304
  formatCost(r.costUsd),
305
305
  String(r.turns),
@@ -309,15 +309,15 @@ function renderRunsTable(runs) {
309
309
  return rows.map((r) => `| ${r.join(" | ")} |`).join("\n");
310
310
  }
311
311
 
312
- function renderScoringChecks(runs, singleRun) {
313
- const rows = collectScoringRows(runs);
312
+ function renderInvariantChecks(runs, singleRun) {
313
+ const rows = collectInvariantRows(runs);
314
314
  if (!rows.length) return null;
315
315
 
316
316
  const header = singleRun
317
317
  ? ["Check", "Result", "Message"]
318
318
  : ["Run", "Check", "Result", "Message"];
319
319
  const lines = [
320
- "#### Scoring Checks",
320
+ "#### Invariant Checks",
321
321
  "",
322
322
  `| ${header.join(" | ")} |`,
323
323
  `| ${header.map(() => "---").join(" | ")} |`,
@@ -331,11 +331,11 @@ function renderScoringChecks(runs, singleRun) {
331
331
  return lines.join("\n");
332
332
  }
333
333
 
334
- function collectScoringRows(runs) {
334
+ function collectInvariantRows(runs) {
335
335
  const rows = [];
336
336
  for (const r of runs) {
337
- if (!r.scoring?.details?.length) continue;
338
- for (const d of r.scoring.details) {
337
+ if (!r.invariants?.details?.length) continue;
338
+ for (const d of r.invariants.details) {
339
339
  rows.push({
340
340
  run: r.runIndex,
341
341
  check: escapeCell(String(d.test ?? "(unnamed)")),
@@ -3,10 +3,10 @@
3
3
  *
4
4
  * Two schemas live here:
5
5
  * - RESULT_RECORD_SCHEMA — one record per (task, runIndex) from a full
6
- * benchmark run. Has a happy branch (scoring + judge present) and a
7
- * pre-flight-failure branch (scoring/judgeVerdict/submission absent).
8
- * - SCORING_RECORD_SCHEMA — narrower output of `benchmark-score` (P7):
9
- * ad-hoc grading without a full lifecycle.
6
+ * benchmark run. Has a happy branch (invariants + judge present) and a
7
+ * pre-flight-failure branch (invariants/judgeVerdict/submission absent).
8
+ * - INVARIANTS_RECORD_SCHEMA — narrower output of `benchmark-invariants`
9
+ * (P7): ad-hoc grading without a full lifecycle.
10
10
  *
11
11
  * Validation is throw-on-mismatch so the runner can wrap every JSONL append
12
12
  * in a guard and reject schema drift at write time.
@@ -16,7 +16,7 @@ import { z } from "zod";
16
16
 
17
17
  const VERDICT_ENUM = z.enum(["pass", "fail"]);
18
18
 
19
- const SCORING_SHAPE = z.object({
19
+ const INVARIANTS_SHAPE = z.object({
20
20
  verdict: VERDICT_ENUM,
21
21
  details: z.array(z.unknown()),
22
22
  exitCode: z.number().int(),
@@ -63,7 +63,7 @@ const AGENT_ERROR_SHAPE = z.object({
63
63
 
64
64
  const HAPPY_RECORD = z.object({
65
65
  ...COMMON_FIELDS,
66
- scoring: SCORING_SHAPE,
66
+ invariants: INVARIANTS_SHAPE,
67
67
  submission: z.string(),
68
68
  judgeVerdict: JUDGE_VERDICT_SHAPE.optional(),
69
69
  agentTracePath: z.string(),
@@ -83,7 +83,7 @@ const PREFLIGHT_RECORD = z.object({
83
83
  agentTracePath: z.string(),
84
84
  supervisorTracePath: z.string(),
85
85
  judgeTracePath: z.string(),
86
- scoring: z.undefined().optional(),
86
+ invariants: z.undefined().optional(),
87
87
  submission: z.undefined().optional(),
88
88
  judgeVerdict: z.undefined().optional(),
89
89
  agentError: z.undefined().optional(),
@@ -91,9 +91,9 @@ const PREFLIGHT_RECORD = z.object({
91
91
 
92
92
  export const RESULT_RECORD_SCHEMA = z.union([HAPPY_RECORD, PREFLIGHT_RECORD]);
93
93
 
94
- export const SCORING_RECORD_SCHEMA = z.object({
94
+ export const INVARIANTS_RECORD_SCHEMA = z.object({
95
95
  taskId: z.string().min(1),
96
- scoring: SCORING_SHAPE,
96
+ invariants: INVARIANTS_SHAPE,
97
97
  exitCode: z.number().int(),
98
98
  });
99
99
 
@@ -109,6 +109,6 @@ export function validateResultRecord(record) {
109
109
  * Throw on schema mismatch.
110
110
  * @param {object} record
111
111
  */
112
- export function validateScoringRecord(record) {
113
- SCORING_RECORD_SCHEMA.parse(record);
112
+ export function validateInvariantsRecord(record) {
113
+ INVARIANTS_RECORD_SCHEMA.parse(record);
114
114
  }
@@ -4,7 +4,7 @@
4
4
  * Phases per (task, runIndex):
5
5
  * 1. WorkdirManager.start → seed CWD + run pre-flight probe
6
6
  * 2. Supervisor session (agent + supervisor) → produce traces + submission
7
- * 3. Scorer.runScoring → exit-code-driven verdict via fd-3 NDJSON
7
+ * 3. Invariants.runInvariants → exit-code-driven verdict via fd-3 NDJSON
8
8
  * 4. Judge.runJudge → Conclude-driven verdict mapped to pass/fail
9
9
  * 5. WorkdirManager.teardown → process-group cleanup
10
10
  *
@@ -25,7 +25,7 @@ import { installApm as defaultInstallApm } from "./apm-installer.js";
25
25
  import { installNpm as defaultInstallNpm } from "./npm-installer.js";
26
26
  import { runJudge } from "./judge.js";
27
27
  import { validateResultRecord } from "./result.js";
28
- import { runScoring } from "./scorer.js";
28
+ import { runInvariants } from "./invariants.js";
29
29
  import { assertJudgeProfileStaged, loadTaskFamily } from "./task-family.js";
30
30
  import { createWorkdirManager } from "./workdir.js";
31
31
 
@@ -60,10 +60,10 @@ export class BenchmarkRunner {
60
60
  * write a valid NDJSON trace to `workdir.agentTracePath`. Default uses
61
61
  * `createAgentRunner` with the harness `BASE_TOOLS` allowlist. Internal
62
62
  * testing only — not part of the public API.
63
- * @param {Function} [opts.runScoring] - Test seam: replaces `runScoring`.
64
- * Same contract as `runScoring(task, ctx)`. Internal testing only.
63
+ * @param {Function} [opts.runInvariants] - Test seam: replaces `runInvariants`.
64
+ * Same contract as `runInvariants(task, ctx)`. Internal testing only.
65
65
  * @param {Function} [opts.runJudge] - Test seam: replaces `runJudge`. Same
66
- * contract as `runJudge(task, workdir, scoring, deps)`. Internal testing
66
+ * contract as `runJudge(task, workdir, invariants, deps)`. Internal testing
67
67
  * only.
68
68
  * @param {Function} [opts.installApm] - Test seam: replaces `installApm`.
69
69
  * Same contract as `installApm(family, outputDir)`. Lets tests inject a
@@ -86,7 +86,7 @@ export class BenchmarkRunner {
86
86
  termGraceMs,
87
87
  // Test seams — default to the real implementations.
88
88
  runAgent,
89
- runScoring: runScoringHook,
89
+ runInvariants: runInvariantsHook,
90
90
  runJudge: runJudgeHook,
91
91
  installApm: installApmHook,
92
92
  installNpm: installNpmHook,
@@ -112,7 +112,7 @@ export class BenchmarkRunner {
112
112
  this.maxTurns = maxTurns;
113
113
  this.termGraceMs = termGraceMs;
114
114
  this._runAgentHook = runAgent ?? null;
115
- this._runScoringHook = runScoringHook ?? runScoring;
115
+ this._runInvariantsHook = runInvariantsHook ?? runInvariants;
116
116
  this._runJudgeHook = runJudgeHook ?? runJudge;
117
117
  this._installApmHook = installApmHook ?? defaultInstallApm;
118
118
  this._installNpmHook = installNpmHook ?? defaultInstallNpm;
@@ -191,7 +191,7 @@ export class BenchmarkRunner {
191
191
  }
192
192
  const { costUsd, turns, submission, agentError } =
193
193
  await this.#runAgentSafe(task, workdir);
194
- const scoring = await this._runScoringHook(task, {
194
+ const invariants = await this._runInvariantsHook(task, {
195
195
  cwd: workdir.cwd,
196
196
  port: workdir.port,
197
197
  runDir: workdir.runDir,
@@ -206,7 +206,7 @@ export class BenchmarkRunner {
206
206
  judgeVerdict = await this._runJudgeHook(
207
207
  task,
208
208
  workdir,
209
- scoring,
209
+ invariants,
210
210
  {
211
211
  query: this.query,
212
212
  model: this.judgeModel,
@@ -217,7 +217,7 @@ export class BenchmarkRunner {
217
217
  );
218
218
  }
219
219
  const verdict =
220
- scoring.verdict === "pass" &&
220
+ invariants.verdict === "pass" &&
221
221
  (judgeVerdict === null || judgeVerdict.verdict === "pass")
222
222
  ? "pass"
223
223
  : "fail";
@@ -225,7 +225,7 @@ export class BenchmarkRunner {
225
225
  taskId: task.id,
226
226
  runIndex,
227
227
  verdict,
228
- scoring,
228
+ invariants,
229
229
  submission,
230
230
  ...(judgeVerdict && { judgeVerdict }),
231
231
  costUsd,
@@ -9,7 +9,7 @@
9
9
  * judge.task.md
10
10
  * hooks/ # harness-only; never copied to agent CWD
11
11
  * preflight.sh
12
- * score.sh
12
+ * invariants.sh
13
13
  * specs/ # copied into agent CWD
14
14
  * workdir/ # copied into agent CWD
15
15
  *
@@ -104,7 +104,7 @@ async function discoverTasks(rootPath) {
104
104
  const supervisorPath = join(taskDir, "supervisor.task.md");
105
105
  const judgePath = join(taskDir, "judge.task.md");
106
106
  const preflightPath = join(taskDir, "hooks", "preflight.sh");
107
- const scorePath = join(taskDir, "hooks", "score.sh");
107
+ const invariantsPath = join(taskDir, "hooks", "invariants.sh");
108
108
  tasks.push({
109
109
  id: entry.name,
110
110
  paths: {
@@ -114,7 +114,9 @@ async function discoverTasks(rootPath) {
114
114
  judge: (await fileExists(judgePath)) ? judgePath : null,
115
115
  hooks: join(taskDir, "hooks"),
116
116
  preflight: (await fileExecutable(preflightPath)) ? preflightPath : null,
117
- score: (await fileExecutable(scorePath)) ? scorePath : null,
117
+ invariants: (await fileExecutable(invariantsPath))
118
+ ? invariantsPath
119
+ : null,
118
120
  specs: join(taskDir, "specs"),
119
121
  workdir: join(taskDir, "workdir"),
120
122
  },
@@ -236,7 +238,7 @@ function run(cmd, args) {
236
238
  /**
237
239
  * @typedef {object} Task
238
240
  * @property {string} id - Task name (directory name under tasks/)
239
- * @property {{taskDir: string, instructions: string, supervisor: string|null, judge: string|null, hooks: string, preflight: string|null, score: string|null, specs: string, workdir: string}} paths
241
+ * @property {{taskDir: string, instructions: string, supervisor: string|null, judge: string|null, hooks: string, preflight: string|null, invariants: string|null, specs: string, workdir: string}} paths
240
242
  */
241
243
 
242
244
  /**
@@ -4,7 +4,7 @@
4
4
  * the pre-flight smoke probe, and tear down the process group at end of run.
5
5
  *
6
6
  * The Workdir handle threads `cwd`, `port`, `pgid`, and trace paths through
7
- * runAgent → score → judge → teardown.
7
+ * runAgent → invariants → judge → teardown.
8
8
  */
9
9
 
10
10
  import { spawn } from "node:child_process";
@@ -13,6 +13,8 @@ import { createServer } from "node:net";
13
13
  import { connect } from "node:net";
14
14
  import { join } from "node:path";
15
15
 
16
+ import { createDefaultRuntime } from "@forwardimpact/libutil/runtime";
17
+
16
18
  import { loadEnv } from "./env-loader.js";
17
19
 
18
20
  const DEFAULT_TERM_GRACE_MS = 5_000;
@@ -38,13 +40,23 @@ export class WorkdirManager {
38
40
  * @param {string} deps.stagingDir - Output of `installApm(...)`.
39
41
  * @param {string} deps.runOutputDir - Root run-output directory (parent of `runs/`).
40
42
  */
41
- constructor({ stagingDir, runOutputDir, termGraceMs, familyRootPath }) {
43
+ constructor({
44
+ stagingDir,
45
+ runOutputDir,
46
+ termGraceMs,
47
+ familyRootPath,
48
+ runtime,
49
+ }) {
42
50
  if (!stagingDir) throw new Error("stagingDir is required");
43
51
  if (!runOutputDir) throw new Error("runOutputDir is required");
44
52
  this.stagingDir = stagingDir;
45
53
  this.runOutputDir = runOutputDir;
46
54
  this.termGraceMs = termGraceMs ?? DEFAULT_TERM_GRACE_MS;
47
55
  this.familyRootPath = familyRootPath ?? null;
56
+ // `loadEnv` is the only collaborator routed through the runtime today; the
57
+ // rest of this manager still uses raw streaming/net/process-group APIs the
58
+ // runtime surface does not yet cover.
59
+ this.runtime = runtime ?? null;
48
60
  }
49
61
 
50
62
  /**
@@ -80,7 +92,10 @@ export class WorkdirManager {
80
92
  ...(this.familyRootPath ? [this.familyRootPath] : []),
81
93
  ...(task.paths.taskDir ? [task.paths.taskDir] : []),
82
94
  ];
83
- const envNames = envDirs.length > 0 ? await loadEnv(envDirs, cwd) : [];
95
+ const envNames =
96
+ envDirs.length > 0
97
+ ? await loadEnv(envDirs, cwd, this.runtime ?? createDefaultRuntime())
98
+ : [];
84
99
 
85
100
  const port = await allocatePort();
86
101
  const agentTracePath = join(runDir, "agent.ndjson");