@forwardimpact/libeval 0.1.56 → 0.1.57

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@forwardimpact/libeval",
3
- "version": "0.1.56",
3
+ "version": "0.1.57",
4
4
  "description": "Agent evaluation framework — prove whether agent changes improved outcomes with reproducible evidence.",
5
5
  "keywords": [
6
6
  "eval",
@@ -0,0 +1,37 @@
1
+ /**
2
+ * Shared environment builder for the benchmark hook scripts (`preflight.sh` and
3
+ * `invariants.sh`). Keeping both spawns on one helper guarantees they expose the
4
+ * same variable set, so hook authors never have to wonder which vars a given
5
+ * hook receives.
6
+ *
7
+ * Path vars (TASK_DIR, FAMILY_DIR, HOOKS_DIR) let hooks reference real
8
+ * locations instead of reconstructing them from `$0`. They are paths, not
9
+ * secrets, so they need no redaction allowlist entry.
10
+ */
11
+
12
+ /**
13
+ * @param {Record<string, string>} baseEnv - Inherited env (`runtime.proc.env`).
14
+ * @param {object} vars
15
+ * @param {string} vars.cwd - Agent CWD → `$WORKDIR`.
16
+ * @param {number} vars.port - Allocated TCP port → `$PORT`.
17
+ * @param {string} vars.taskId - Task id → `$TASK_ID`.
18
+ * @param {string} vars.taskDir - Task directory on host → `$TASK_DIR`.
19
+ * @param {string} vars.hooksDir - Task `hooks/` dir on host → `$HOOKS_DIR`.
20
+ * @param {string|null} vars.familyDir - Family root on host → `$FAMILY_DIR`
21
+ * (null when the family root is unknown, e.g. a standalone task).
22
+ * @returns {Record<string, string>}
23
+ */
24
+ export function buildHookEnv(
25
+ baseEnv,
26
+ { cwd, port, taskId, taskDir, hooksDir, familyDir },
27
+ ) {
28
+ return {
29
+ ...baseEnv,
30
+ WORKDIR: cwd,
31
+ PORT: String(port),
32
+ TASK_ID: taskId,
33
+ TASK_DIR: taskDir,
34
+ HOOKS_DIR: hooksDir,
35
+ FAMILY_DIR: familyDir ?? "",
36
+ };
37
+ }
@@ -10,6 +10,8 @@
10
10
 
11
11
  import { join } from "node:path";
12
12
 
13
+ import { buildHookEnv } from "./hook-env.js";
14
+
13
15
  /**
14
16
  * @typedef {object} InvariantsResult
15
17
  * @property {"pass" | "fail"} verdict
@@ -20,7 +22,7 @@ import { join } from "node:path";
20
22
  /**
21
23
  * Run the task's invariants script.
22
24
  * @param {import("./task-family.js").Task} task
23
- * @param {{cwd: string, port: number, runDir: string}} ctx
25
+ * @param {{cwd: string, port: number, runDir: string, familyDir?: string|null}} ctx
24
26
  * @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
25
27
  * @returns {Promise<InvariantsResult>}
26
28
  */
@@ -44,9 +46,14 @@ export async function runInvariants(task, ctx, runtime) {
44
46
  try {
45
47
  child = runtime.subprocess.spawn(script, [], {
46
48
  env: {
47
- ...runtime.proc.env,
48
- WORKDIR: ctx.cwd,
49
- PORT: String(ctx.port),
49
+ ...buildHookEnv(runtime.proc.env, {
50
+ cwd: ctx.cwd,
51
+ port: ctx.port,
52
+ taskId: task.id,
53
+ taskDir: task.paths.taskDir,
54
+ hooksDir: task.paths.hooks,
55
+ familyDir: ctx.familyDir,
56
+ }),
50
57
  RESULTS_FD: "3",
51
58
  },
52
59
  stdio: ["inherit", "pipe", "pipe", fd3File],
@@ -201,6 +201,7 @@ export class BenchmarkRunner {
201
201
  cwd: workdir.cwd,
202
202
  port: workdir.port,
203
203
  runDir: workdir.runDir,
204
+ familyDir: family.rootPath,
204
205
  },
205
206
  this.runtime,
206
207
  );
@@ -17,6 +17,7 @@ import { connect } from "node:net";
17
17
  import { join } from "node:path";
18
18
 
19
19
  import { loadEnv } from "./env-loader.js";
20
+ import { buildHookEnv } from "./hook-env.js";
20
21
 
21
22
  const DEFAULT_TERM_GRACE_MS = 5_000;
22
23
 
@@ -73,6 +74,24 @@ export class WorkdirManager {
73
74
  const cwd = join(runDir, "cwd");
74
75
  await fs.mkdir(cwd, { recursive: true });
75
76
 
77
+ // Family-level shared fixtures: convention-over-configuration, copied if
78
+ // present. They form the shared base; the per-task workdir/specs below
79
+ // overlay on top (fs.cp defaults to force:true, so a per-task file wins).
80
+ if (this.familyRootPath) {
81
+ await fs
82
+ .cp(join(this.familyRootPath, "workdir"), cwd, { recursive: true })
83
+ .catch((e) => {
84
+ if (e.code !== "ENOENT") throw e;
85
+ });
86
+ await fs
87
+ .cp(join(this.familyRootPath, "specs"), join(cwd, "specs"), {
88
+ recursive: true,
89
+ })
90
+ .catch((e) => {
91
+ if (e.code !== "ENOENT") throw e;
92
+ });
93
+ }
94
+
76
95
  await fs.cp(task.paths.workdir, cwd, { recursive: true }).catch((e) => {
77
96
  if (e.code !== "ENOENT") throw e;
78
97
  });
@@ -107,7 +126,12 @@ export class WorkdirManager {
107
126
  const judgeTracePath = join(runDir, "judge.ndjson");
108
127
 
109
128
  const preflight = task.paths.preflight
110
- ? await runPreflight(this.runtime, task.paths.preflight, cwd, port)
129
+ ? await runPreflight(this.runtime, task.paths.preflight, cwd, port, {
130
+ taskId: task.id,
131
+ taskDir: task.paths.taskDir,
132
+ hooksDir: task.paths.hooks,
133
+ familyDir: this.familyRootPath,
134
+ })
111
135
  : { pgid: 0 };
112
136
 
113
137
  return {
@@ -163,12 +187,13 @@ export class WorkdirManager {
163
187
  * @param {string} script
164
188
  * @param {string} cwd - Agent CWD passed via $WORKDIR.
165
189
  * @param {number} port - Free TCP port passed via $PORT.
190
+ * @param {{taskId: string, taskDir: string, hooksDir: string, familyDir: string|null}} vars - Extra hook env vars.
166
191
  * @returns {Promise<{pgid: number, error?: {phase: string, message: string, exitCode: number}}>}
167
192
  */
168
- async function runPreflight(runtime, script, cwd, port) {
193
+ async function runPreflight(runtime, script, cwd, port, vars) {
169
194
  const child = runtime.subprocess.spawn(script, [], {
170
195
  cwd,
171
- env: { ...runtime.proc.env, WORKDIR: cwd, PORT: String(port) },
196
+ env: buildHookEnv(runtime.proc.env, { cwd, port, ...vars }),
172
197
  detached: true,
173
198
  stdio: ["ignore", "pipe", "pipe"],
174
199
  });