@forwardimpact/libeval 0.1.55 → 0.1.57
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/benchmark/hook-env.js +37 -0
- package/src/benchmark/invariants.js +11 -4
- package/src/benchmark/runner.js +1 -0
- package/src/benchmark/workdir.js +28 -3
- package/src/supervisor.js +2 -4
package/package.json
CHANGED
|
package/src/benchmark/hook-env.js
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared environment builder for the benchmark hook scripts (`preflight.sh` and
|
|
3
|
+
* `invariants.sh`). Keeping both spawns on one helper guarantees they expose the
|
|
4
|
+
* same variable set, so hook authors never have to wonder which vars a given
|
|
5
|
+
* hook receives.
|
|
6
|
+
*
|
|
7
|
+
* Path vars (TASK_DIR, FAMILY_DIR, HOOKS_DIR) let hooks reference real
|
|
8
|
+
* locations instead of reconstructing them from `$0`. They are paths, not
|
|
9
|
+
* secrets, so they need no redaction allowlist entry.
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
/**
 * Assemble the environment object handed to a benchmark hook script.
 *
 * The inherited environment is copied first and the hook-specific variables
 * are layered on top, so a hook variable always wins over an inherited entry
 * of the same name. The input `baseEnv` object is never modified.
 *
 * @param {Record<string, string>} baseEnv - Inherited env (`runtime.proc.env`).
 * @param {object} vars - Per-run values to expose to the hook.
 * @param {string} vars.cwd - Agent CWD, exposed as `$WORKDIR`.
 * @param {number} vars.port - Allocated TCP port, exposed (stringified) as `$PORT`.
 * @param {string} vars.taskId - Task id, exposed as `$TASK_ID`.
 * @param {string} vars.taskDir - Task directory on host, exposed as `$TASK_DIR`.
 * @param {string} vars.hooksDir - Task `hooks/` dir on host, exposed as `$HOOKS_DIR`.
 * @param {string|null} [vars.familyDir] - Family root on host, exposed as
 *   `$FAMILY_DIR`. A null or missing value (e.g. a standalone task with no
 *   known family root) is exposed as the empty string.
 * @returns {Record<string, string>} A new env object: `baseEnv` plus the hook vars.
 */
export function buildHookEnv(baseEnv, vars) {
  const { cwd, port, taskId, taskDir, hooksDir, familyDir } = vars;

  // Variables every hook receives, regardless of which hook is being spawned.
  const hookVars = {
    WORKDIR: cwd,
    PORT: String(port),
    TASK_ID: taskId,
    TASK_DIR: taskDir,
    HOOKS_DIR: hooksDir,
    // Env values must be strings, so an unknown family root becomes "".
    FAMILY_DIR: familyDir ?? "",
  };

  // Later spread wins: hook vars override same-named inherited entries.
  return { ...baseEnv, ...hookVars };
}
|
|
package/src/benchmark/invariants.js
CHANGED
|
@@ -10,6 +10,8 @@
|
|
|
10
10
|
|
|
11
11
|
import { join } from "node:path";
|
|
12
12
|
|
|
13
|
+
import { buildHookEnv } from "./hook-env.js";
|
|
14
|
+
|
|
13
15
|
/**
|
|
14
16
|
* @typedef {object} InvariantsResult
|
|
15
17
|
* @property {"pass" | "fail"} verdict
|
|
@@ -20,7 +22,7 @@ import { join } from "node:path";
|
|
|
20
22
|
/**
|
|
21
23
|
* Run the task's invariants script.
|
|
22
24
|
* @param {import("./task-family.js").Task} task
|
|
23
|
-
* @param {{cwd: string, port: number, runDir: string}} ctx
|
|
25
|
+
* @param {{cwd: string, port: number, runDir: string, familyDir?: string|null}} ctx
|
|
24
26
|
* @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
|
|
25
27
|
* @returns {Promise<InvariantsResult>}
|
|
26
28
|
*/
|
|
@@ -44,9 +46,14 @@ export async function runInvariants(task, ctx, runtime) {
|
|
|
44
46
|
try {
|
|
45
47
|
child = runtime.subprocess.spawn(script, [], {
|
|
46
48
|
env: {
|
|
47
|
-
...runtime.proc.env,
|
|
48
|
-
|
|
49
|
-
|
|
49
|
+
...buildHookEnv(runtime.proc.env, {
|
|
50
|
+
cwd: ctx.cwd,
|
|
51
|
+
port: ctx.port,
|
|
52
|
+
taskId: task.id,
|
|
53
|
+
taskDir: task.paths.taskDir,
|
|
54
|
+
hooksDir: task.paths.hooks,
|
|
55
|
+
familyDir: ctx.familyDir,
|
|
56
|
+
}),
|
|
50
57
|
RESULTS_FD: "3",
|
|
51
58
|
},
|
|
52
59
|
stdio: ["inherit", "pipe", "pipe", fd3File],
|
package/src/benchmark/runner.js
CHANGED
package/src/benchmark/workdir.js
CHANGED
|
@@ -17,6 +17,7 @@ import { connect } from "node:net";
|
|
|
17
17
|
import { join } from "node:path";
|
|
18
18
|
|
|
19
19
|
import { loadEnv } from "./env-loader.js";
|
|
20
|
+
import { buildHookEnv } from "./hook-env.js";
|
|
20
21
|
|
|
21
22
|
const DEFAULT_TERM_GRACE_MS = 5_000;
|
|
22
23
|
|
|
@@ -73,6 +74,24 @@ export class WorkdirManager {
|
|
|
73
74
|
const cwd = join(runDir, "cwd");
|
|
74
75
|
await fs.mkdir(cwd, { recursive: true });
|
|
75
76
|
|
|
77
|
+
// Family-level shared fixtures: convention-over-configuration, copied if
|
|
78
|
+
// present. They form the shared base; the per-task workdir/specs below
|
|
79
|
+
// overlay on top (fs.cp defaults to force:true, so a per-task file wins).
|
|
80
|
+
if (this.familyRootPath) {
|
|
81
|
+
await fs
|
|
82
|
+
.cp(join(this.familyRootPath, "workdir"), cwd, { recursive: true })
|
|
83
|
+
.catch((e) => {
|
|
84
|
+
if (e.code !== "ENOENT") throw e;
|
|
85
|
+
});
|
|
86
|
+
await fs
|
|
87
|
+
.cp(join(this.familyRootPath, "specs"), join(cwd, "specs"), {
|
|
88
|
+
recursive: true,
|
|
89
|
+
})
|
|
90
|
+
.catch((e) => {
|
|
91
|
+
if (e.code !== "ENOENT") throw e;
|
|
92
|
+
});
|
|
93
|
+
}
|
|
94
|
+
|
|
76
95
|
await fs.cp(task.paths.workdir, cwd, { recursive: true }).catch((e) => {
|
|
77
96
|
if (e.code !== "ENOENT") throw e;
|
|
78
97
|
});
|
|
@@ -107,7 +126,12 @@ export class WorkdirManager {
|
|
|
107
126
|
const judgeTracePath = join(runDir, "judge.ndjson");
|
|
108
127
|
|
|
109
128
|
const preflight = task.paths.preflight
|
|
110
|
-
? await runPreflight(this.runtime, task.paths.preflight, cwd, port)
|
|
129
|
+
? await runPreflight(this.runtime, task.paths.preflight, cwd, port, {
|
|
130
|
+
taskId: task.id,
|
|
131
|
+
taskDir: task.paths.taskDir,
|
|
132
|
+
hooksDir: task.paths.hooks,
|
|
133
|
+
familyDir: this.familyRootPath,
|
|
134
|
+
})
|
|
111
135
|
: { pgid: 0 };
|
|
112
136
|
|
|
113
137
|
return {
|
|
@@ -163,12 +187,13 @@ export class WorkdirManager {
|
|
|
163
187
|
* @param {string} script
|
|
164
188
|
* @param {string} cwd - Agent CWD passed via $WORKDIR.
|
|
165
189
|
* @param {number} port - Free TCP port passed via $PORT.
|
|
190
|
+
* @param {{taskId: string, taskDir: string, hooksDir: string, familyDir: string|null}} vars - Extra hook env vars.
|
|
166
191
|
* @returns {Promise<{pgid: number, error?: {phase: string, message: string, exitCode: number}}>}
|
|
167
192
|
*/
|
|
168
|
-
async function runPreflight(runtime, script, cwd, port) {
|
|
193
|
+
async function runPreflight(runtime, script, cwd, port, vars) {
|
|
169
194
|
const child = runtime.subprocess.spawn(script, [], {
|
|
170
195
|
cwd,
|
|
171
|
-
env:
|
|
196
|
+
env: buildHookEnv(runtime.proc.env, { cwd, port, ...vars }),
|
|
172
197
|
detached: true,
|
|
173
198
|
stdio: ["ignore", "pipe", "pipe"],
|
|
174
199
|
});
|
package/src/supervisor.js
CHANGED
|
@@ -30,8 +30,7 @@ import { OrchestrationLoop } from "./orchestration-loop.js";
|
|
|
30
30
|
/** System prompt for the supervisor lead. L0 mechanics only per COALIGNED. */
|
|
31
31
|
export const SUPERVISOR_SYSTEM_PROMPT =
|
|
32
32
|
"You supervise one agent.\n" +
|
|
33
|
-
"
|
|
34
|
-
"Use `Ask` to delegate work to the agent.\n" +
|
|
33
|
+
"Use `Ask` to delegate the agent's task to the agent.\n" +
|
|
35
34
|
"`Ask` is async and returns {askIds:[N]} immediately.\n" +
|
|
36
35
|
"The reply arrives on your next turn as `[answer#N] agent: <text>` in your inbox.\n" +
|
|
37
36
|
"End your turn while Asks are pending. The system resumes you when an answer arrives.\n" +
|
|
@@ -196,7 +195,6 @@ export function createSupervisor({
|
|
|
196
195
|
"Task",
|
|
197
196
|
"TaskOutput",
|
|
198
197
|
"TaskStop",
|
|
199
|
-
"Bash",
|
|
200
198
|
"Write",
|
|
201
199
|
"Edit",
|
|
202
200
|
];
|
|
@@ -210,7 +208,7 @@ export function createSupervisor({
|
|
|
210
208
|
output: devNull,
|
|
211
209
|
model: supervisorModel ?? model,
|
|
212
210
|
maxTurns: perRunBudget,
|
|
213
|
-
allowedTools: supervisorAllowedTools ?? ["Read", "Glob", "Grep"],
|
|
211
|
+
allowedTools: supervisorAllowedTools ?? ["Read", "Glob", "Grep", "Bash"],
|
|
214
212
|
disallowedTools,
|
|
215
213
|
onLine: (line) => supervisor.emitLine("supervisor", line),
|
|
216
214
|
settingSources: ["project"],
|