@forwardimpact/libeval 0.1.50 → 0.1.52
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +11 -8
- package/bin/fit-benchmark.js +26 -27
- package/bin/fit-eval.js +36 -30
- package/bin/fit-trace.js +83 -57
- package/package.json +1 -1
- package/src/agent-runner.js +20 -12
- package/src/benchmark/apm-installer.js +48 -44
- package/src/benchmark/env-loader.js +35 -23
- package/src/benchmark/invariants.js +128 -0
- package/src/benchmark/judge.js +18 -19
- package/src/benchmark/npm-installer.js +33 -33
- package/src/benchmark/report.js +40 -26
- package/src/benchmark/result.js +11 -11
- package/src/benchmark/runner.js +90 -46
- package/src/benchmark/task-family.js +78 -65
- package/src/benchmark/workdir.js +100 -93
- package/src/commands/assert.js +30 -22
- package/src/commands/benchmark-invariants.js +74 -0
- package/src/commands/benchmark-report.js +24 -15
- package/src/commands/benchmark-run.js +16 -9
- package/src/commands/by-discussion.js +33 -23
- package/src/commands/callback.js +20 -11
- package/src/commands/discuss.js +31 -13
- package/src/commands/facilitate.js +21 -14
- package/src/commands/output.js +15 -13
- package/src/commands/run.js +28 -14
- package/src/commands/supervise.js +29 -19
- package/src/commands/task-input.js +10 -5
- package/src/commands/tee.js +24 -9
- package/src/commands/trace.js +181 -99
- package/src/discuss-tools.js +48 -2
- package/src/discusser.js +53 -2
- package/src/events/github.js +27 -5
- package/src/facilitator.js +4 -0
- package/src/inbox-poller.js +84 -0
- package/src/judge.js +4 -1
- package/src/message-bus.js +6 -0
- package/src/orchestration-loop.js +14 -4
- package/src/orchestration-toolkit.js +14 -0
- package/src/profile-prompt.js +22 -9
- package/src/redaction.js +31 -9
- package/src/reply-emitter.js +47 -0
- package/src/supervisor.js +4 -0
- package/src/tee-writer.js +4 -2
- package/src/trace-collector.js +9 -2
- package/src/trace-github.js +47 -27
- package/src/benchmark/scorer.js +0 -138
- package/src/commands/benchmark-score.js +0 -68
|
@@ -4,26 +4,25 @@
|
|
|
4
4
|
* staging directory, and computes the manifest fingerprint from the lockfile.
|
|
5
5
|
* Per-task copy happens later in WorkdirManager.
|
|
6
6
|
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
9
|
-
*
|
|
10
|
-
*
|
|
7
|
+
* Subprocess and filesystem access route through the injected `runtime` bag
|
|
8
|
+
* (`runtime.subprocess.spawn` for the streaming `apm` child, `runtime.fs` for
|
|
9
|
+
* the async staging copies). See `createApmInstaller` for the real-dependency
|
|
10
|
+
* wiring; `installApm` is a thin free-function wrapper.
|
|
11
11
|
*/
|
|
12
12
|
|
|
13
|
-
import { spawn as nodeSpawn } from "node:child_process";
|
|
14
13
|
import { createHash } from "node:crypto";
|
|
15
|
-
import { access, cp, mkdir, readFile, rm } from "node:fs/promises";
|
|
16
14
|
import { join } from "node:path";
|
|
17
15
|
|
|
18
16
|
/** Installs apm and stages `.claude/` for a task family. */
|
|
19
17
|
export class ApmInstaller {
|
|
20
18
|
/**
|
|
21
|
-
* @param {object}
|
|
22
|
-
* @param {
|
|
23
|
-
*
|
|
19
|
+
* @param {object} deps
|
|
20
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} deps.runtime -
|
|
21
|
+
* Ambient collaborators; uses `subprocess.spawn` and `fs`.
|
|
24
22
|
*/
|
|
25
|
-
constructor({
|
|
26
|
-
|
|
23
|
+
constructor({ runtime }) {
|
|
24
|
+
if (!runtime) throw new Error("runtime is required");
|
|
25
|
+
this.runtime = runtime;
|
|
27
26
|
}
|
|
28
27
|
|
|
29
28
|
/**
|
|
@@ -32,19 +31,21 @@ export class ApmInstaller {
|
|
|
32
31
|
* @returns {Promise<{stagingDir: string, skillSetHash: string, judgeProfilesDir: string}>}
|
|
33
32
|
*/
|
|
34
33
|
async install(family, outputDir) {
|
|
34
|
+
const fs = this.runtime.fs;
|
|
35
35
|
const stagingDir = join(outputDir, ".apm-staging");
|
|
36
36
|
const stagedClaude = join(stagingDir, ".claude");
|
|
37
37
|
const sourceClaude = join(family.rootPath, ".claude");
|
|
38
38
|
const apmYml = join(family.rootPath, "apm.yml");
|
|
39
39
|
|
|
40
|
-
const hasApm = await
|
|
40
|
+
const hasApm = await fs
|
|
41
|
+
.access(apmYml)
|
|
41
42
|
.then(() => true)
|
|
42
43
|
.catch(() => false);
|
|
43
44
|
|
|
44
45
|
if (hasApm) {
|
|
45
46
|
await this.#runApmInstall(family.rootPath);
|
|
46
47
|
try {
|
|
47
|
-
await access(sourceClaude);
|
|
48
|
+
await fs.access(sourceClaude);
|
|
48
49
|
} catch {
|
|
49
50
|
throw new Error(
|
|
50
51
|
`apm install did not produce .claude/ at ${sourceClaude}; check the family's apm.yml`,
|
|
@@ -52,14 +53,15 @@ export class ApmInstaller {
|
|
|
52
53
|
}
|
|
53
54
|
}
|
|
54
55
|
|
|
55
|
-
await rm(stagingDir, { recursive: true, force: true });
|
|
56
|
-
const hasClaudeDir = await
|
|
56
|
+
await fs.rm(stagingDir, { recursive: true, force: true });
|
|
57
|
+
const hasClaudeDir = await fs
|
|
58
|
+
.access(sourceClaude)
|
|
57
59
|
.then(() => true)
|
|
58
60
|
.catch(() => false);
|
|
59
61
|
if (hasClaudeDir) {
|
|
60
|
-
await cp(sourceClaude, stagedClaude, { recursive: true });
|
|
62
|
+
await fs.cp(sourceClaude, stagedClaude, { recursive: true });
|
|
61
63
|
} else {
|
|
62
|
-
await mkdir(stagedClaude, { recursive: true });
|
|
64
|
+
await fs.mkdir(stagedClaude, { recursive: true });
|
|
63
65
|
}
|
|
64
66
|
|
|
65
67
|
// Stage the family-local judge profile outside .claude/ so it is available
|
|
@@ -67,15 +69,15 @@ export class ApmInstaller {
|
|
|
67
69
|
const judgeSource = join(family.rootPath, "judge.md");
|
|
68
70
|
const judgeProfilesDir = join(stagingDir, "judge-profiles");
|
|
69
71
|
try {
|
|
70
|
-
await access(judgeSource);
|
|
71
|
-
await mkdir(judgeProfilesDir, { recursive: true });
|
|
72
|
-
await cp(judgeSource, join(judgeProfilesDir, "judge.md"));
|
|
72
|
+
await fs.access(judgeSource);
|
|
73
|
+
await fs.mkdir(judgeProfilesDir, { recursive: true });
|
|
74
|
+
await fs.cp(judgeSource, join(judgeProfilesDir, "judge.md"));
|
|
73
75
|
} catch {}
|
|
74
76
|
|
|
75
77
|
const lockPath = join(family.rootPath, "apm.lock.yaml");
|
|
76
78
|
let skillSetHash = "";
|
|
77
79
|
try {
|
|
78
|
-
const lockBytes = await readFile(lockPath);
|
|
80
|
+
const lockBytes = await fs.readFile(lockPath);
|
|
79
81
|
skillSetHash =
|
|
80
82
|
"sha256:" +
|
|
81
83
|
createHash("sha256").update(normalizeLf(lockBytes)).digest("hex");
|
|
@@ -86,25 +88,26 @@ export class ApmInstaller {
|
|
|
86
88
|
return { stagingDir, skillSetHash, judgeProfilesDir };
|
|
87
89
|
}
|
|
88
90
|
|
|
89
|
-
#runApmInstall(cwd) {
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
91
|
+
async #runApmInstall(cwd) {
|
|
92
|
+
const child = this.runtime.subprocess.spawn(
|
|
93
|
+
"apm",
|
|
94
|
+
["install", "--target", "claude"],
|
|
95
|
+
{ cwd, stdio: ["ignore", "pipe", "pipe"] },
|
|
96
|
+
);
|
|
97
|
+
// Drain stdout concurrently so the child never blocks on backpressure;
|
|
98
|
+
// capture stderr for the failure message.
|
|
99
|
+
let stderr = "";
|
|
100
|
+
const drainStdout = (async () => {
|
|
101
|
+
for await (const _chunk of child.stdout) {
|
|
102
|
+
// discard
|
|
103
|
+
}
|
|
104
|
+
})();
|
|
105
|
+
for await (const chunk of child.stderr) stderr += chunk.toString();
|
|
106
|
+
await drainStdout;
|
|
107
|
+
const code = await child.exitCode;
|
|
108
|
+
if (code !== 0) {
|
|
109
|
+
throw new Error(`apm install exited ${code}: ${stderr}`);
|
|
110
|
+
}
|
|
108
111
|
}
|
|
109
112
|
}
|
|
110
113
|
|
|
@@ -119,7 +122,7 @@ function normalizeLf(buf) {
|
|
|
119
122
|
|
|
120
123
|
/**
|
|
121
124
|
* Factory function — wires real dependencies.
|
|
122
|
-
* @param {ConstructorParameters<typeof ApmInstaller>[0]}
|
|
125
|
+
* @param {ConstructorParameters<typeof ApmInstaller>[0]} deps
|
|
123
126
|
* @returns {ApmInstaller}
|
|
124
127
|
*/
|
|
125
128
|
export function createApmInstaller(deps) {
|
|
@@ -127,10 +130,11 @@ export function createApmInstaller(deps) {
|
|
|
127
130
|
}
|
|
128
131
|
|
|
129
132
|
/**
|
|
130
|
-
* Free-function shorthand for callers that
|
|
133
|
+
* Free-function shorthand for callers that thread a runtime bag.
|
|
131
134
|
* @param {import("./task-family.js").TaskFamily} family
|
|
132
135
|
* @param {string} outputDir
|
|
136
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
|
|
133
137
|
*/
|
|
134
|
-
export function installApm(family, outputDir) {
|
|
135
|
-
return new ApmInstaller().install(family, outputDir);
|
|
138
|
+
export function installApm(family, outputDir, runtime) {
|
|
139
|
+
return new ApmInstaller({ runtime }).install(family, outputDir);
|
|
136
140
|
}
|
|
@@ -14,7 +14,6 @@
|
|
|
14
14
|
* AND rendered (with resolved values) into the agent working directory.
|
|
15
15
|
*/
|
|
16
16
|
|
|
17
|
-
import { readFile, writeFile } from "node:fs/promises";
|
|
18
17
|
import { join } from "node:path";
|
|
19
18
|
|
|
20
19
|
const ENV_FILES = [".env.local", ".env"];
|
|
@@ -48,12 +47,13 @@ export function parseEnvFile(content) {
|
|
|
48
47
|
|
|
49
48
|
/**
|
|
50
49
|
* Read and parse an env file, returning [] if the file does not exist.
|
|
50
|
+
* @param {object} fs - Async filesystem surface (`runtime.fs`).
|
|
51
51
|
* @param {string} filePath
|
|
52
52
|
* @returns {Promise<Array<{key: string, value: string}>>}
|
|
53
53
|
*/
|
|
54
|
-
async function readEnvFile(filePath) {
|
|
54
|
+
async function readEnvFile(fs, filePath) {
|
|
55
55
|
try {
|
|
56
|
-
const content = await readFile(filePath, "utf8");
|
|
56
|
+
const content = await fs.readFile(filePath, "utf8");
|
|
57
57
|
return parseEnvFile(content);
|
|
58
58
|
} catch (e) {
|
|
59
59
|
if (e.code === "ENOENT") return [];
|
|
@@ -62,32 +62,36 @@ async function readEnvFile(filePath) {
|
|
|
62
62
|
}
|
|
63
63
|
|
|
64
64
|
/**
|
|
65
|
-
* Load entries into process
|
|
65
|
+
* Load entries into the process env map. Existing keys are never overwritten.
|
|
66
|
+
* @param {Record<string, string|undefined>} env - The `runtime.proc.env` map.
|
|
66
67
|
* @param {Array<{key: string, value: string}>} entries
|
|
67
68
|
* @returns {string[]} var names that were loaded
|
|
68
69
|
*/
|
|
69
|
-
function applyToProcessEnv(entries) {
|
|
70
|
+
function applyToProcessEnv(env, entries) {
|
|
70
71
|
const names = [];
|
|
71
72
|
for (const { key, value } of entries) {
|
|
72
73
|
names.push(key);
|
|
73
|
-
if (
|
|
74
|
-
|
|
74
|
+
if (env[key] === undefined) {
|
|
75
|
+
env[key] = value;
|
|
75
76
|
}
|
|
76
77
|
}
|
|
77
78
|
return names;
|
|
78
79
|
}
|
|
79
80
|
|
|
80
81
|
/**
|
|
81
|
-
* Load one env file: apply to
|
|
82
|
+
* Load one env file: apply to the env map, record keys in the merged map.
|
|
83
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
|
|
82
84
|
* @param {string} dir
|
|
83
85
|
* @param {string} file
|
|
84
86
|
* @param {Set<string>} names
|
|
85
87
|
* @param {Map<string, Map<string, true>>} merged
|
|
86
88
|
*/
|
|
87
|
-
async function loadOneEnvFile(dir, file, names, merged) {
|
|
88
|
-
const entries = await readEnvFile(join(dir, file));
|
|
89
|
+
async function loadOneEnvFile(runtime, dir, file, names, merged) {
|
|
90
|
+
const entries = await readEnvFile(runtime.fs, join(dir, file));
|
|
89
91
|
if (entries.length === 0) return;
|
|
90
|
-
for (const name of applyToProcessEnv(entries))
|
|
92
|
+
for (const name of applyToProcessEnv(runtime.proc.env, entries)) {
|
|
93
|
+
names.add(name);
|
|
94
|
+
}
|
|
91
95
|
if (!merged.has(file)) merged.set(file, new Map());
|
|
92
96
|
const fileMap = merged.get(file);
|
|
93
97
|
for (const { key } of entries) {
|
|
@@ -96,17 +100,18 @@ async function loadOneEnvFile(dir, file, names, merged) {
|
|
|
96
100
|
}
|
|
97
101
|
|
|
98
102
|
/**
|
|
99
|
-
* Scan directories for env files, load into
|
|
103
|
+
* Scan directories for env files, load into the env map, and collect
|
|
100
104
|
* a merged key manifest per filename.
|
|
105
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
|
|
101
106
|
* @param {string[]} dirs
|
|
102
107
|
* @returns {Promise<{names: Set<string>, merged: Map<string, Map<string, true>>}>}
|
|
103
108
|
*/
|
|
104
|
-
async function collectEnvEntries(dirs) {
|
|
109
|
+
async function collectEnvEntries(runtime, dirs) {
|
|
105
110
|
const names = new Set();
|
|
106
111
|
const merged = new Map();
|
|
107
112
|
for (const dir of dirs) {
|
|
108
113
|
for (const file of ENV_FILES) {
|
|
109
|
-
await loadOneEnvFile(dir, file, names, merged);
|
|
114
|
+
await loadOneEnvFile(runtime, dir, file, names, merged);
|
|
110
115
|
}
|
|
111
116
|
}
|
|
112
117
|
return { names, merged };
|
|
@@ -114,17 +119,22 @@ async function collectEnvEntries(dirs) {
|
|
|
114
119
|
|
|
115
120
|
/**
|
|
116
121
|
* Write resolved env files into the agent CWD and warn about empty values.
|
|
122
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
|
|
117
123
|
* @param {Map<string, Map<string, true>>} merged
|
|
118
124
|
* @param {string} agentCwd
|
|
119
125
|
*/
|
|
120
|
-
async function renderEnvFiles(merged, agentCwd) {
|
|
126
|
+
async function renderEnvFiles(runtime, merged, agentCwd) {
|
|
127
|
+
const env = runtime.proc.env;
|
|
121
128
|
for (const [file, keyMap] of merged) {
|
|
122
129
|
const keys = [...keyMap.keys()];
|
|
123
|
-
const resolved = keys.map((key) => `${key}=${
|
|
124
|
-
await writeFile(
|
|
125
|
-
|
|
130
|
+
const resolved = keys.map((key) => `${key}=${env[key] ?? ""}`);
|
|
131
|
+
await runtime.fs.writeFile(
|
|
132
|
+
join(agentCwd, file),
|
|
133
|
+
resolved.join("\n") + "\n",
|
|
134
|
+
);
|
|
135
|
+
const empty = keys.filter((key) => !env[key]);
|
|
126
136
|
if (empty.length > 0) {
|
|
127
|
-
|
|
137
|
+
runtime.proc.stderr.write(
|
|
128
138
|
`libeval: env warning: ${file} declares vars with no value: ${empty.join(", ")}\n`,
|
|
129
139
|
);
|
|
130
140
|
}
|
|
@@ -133,14 +143,16 @@ async function renderEnvFiles(merged, agentCwd) {
|
|
|
133
143
|
|
|
134
144
|
/**
|
|
135
145
|
* Discover `.env` / `.env.local` in one or more directories, load them
|
|
136
|
-
* into process
|
|
146
|
+
* into the process env map, and render the resolved values into the agent CWD.
|
|
137
147
|
*
|
|
138
148
|
* @param {string[]} dirs - Directories to scan (family root, task dir, etc.)
|
|
139
149
|
* @param {string} agentCwd - Agent working directory to render into.
|
|
150
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} runtime - Ambient
|
|
151
|
+
* collaborators; uses `fs` (async read/write), `proc.env`, `proc.stderr`.
|
|
140
152
|
* @returns {Promise<string[]>} All var names discovered (for redaction).
|
|
141
153
|
*/
|
|
142
|
-
export async function loadEnv(dirs, agentCwd) {
|
|
143
|
-
const { names, merged } = await collectEnvEntries(dirs);
|
|
144
|
-
await renderEnvFiles(merged, agentCwd);
|
|
154
|
+
export async function loadEnv(dirs, agentCwd, runtime) {
|
|
155
|
+
const { names, merged } = await collectEnvEntries(runtime, dirs);
|
|
156
|
+
await renderEnvFiles(runtime, merged, agentCwd);
|
|
145
157
|
return [...names];
|
|
146
158
|
}
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Invariants — runs `<task.paths.hooks>/invariants.sh` from the template path
|
|
3
|
+
* against the post-run agent CWD. The exit code is authoritative for the
|
|
4
|
+
* verdict; structured per-check rows arrive on fd 3 (`$RESULTS_FD=3`) as NDJSON.
|
|
5
|
+
*
|
|
6
|
+
* Subprocess access flows through `runtime.subprocess.spawn`; the fd-3 backing
|
|
7
|
+
* store and the stderr log use the sync filesystem surface (`runtime.fsSync`) —
|
|
8
|
+
* the only surface this module touches, per design Decision 7.
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import { join } from "node:path";
|
|
12
|
+
|
|
13
|
+
/**
|
|
14
|
+
* @typedef {object} InvariantsResult
|
|
15
|
+
* @property {"pass" | "fail"} verdict
|
|
16
|
+
* @property {Array<object>} details
|
|
17
|
+
* @property {number} exitCode
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
/**
 * Run the task's invariants script and collect its verdict.
 *
 * The script at `task.paths.invariants` is spawned with `WORKDIR`, `PORT` and
 * `RESULTS_FD=3` added to the runtime environment. Its stderr is written to
 * `invariants.stderr.log` in `ctx.runDir`; whatever it writes to fd 3 is
 * parsed as NDJSON into `details`. The exit code alone decides the verdict.
 *
 * When the task declares no invariants script the result is a trivial pass.
 *
 * @param {import("./task-family.js").Task} task
 * @param {{cwd: string, port: number, runDir: string}} ctx
 * @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
 * @returns {Promise<InvariantsResult>}
 * @throws {Error} If `runtime` is missing, or if spawning / draining the child
 *   fails (the fd-3 descriptor and its temp file are released first).
 */
export async function runInvariants(task, ctx, runtime) {
  if (!runtime) throw new Error("runtime is required");
  if (!task.paths.invariants) {
    return { verdict: "pass", details: [], exitCode: 0 };
  }
  const fsSync = runtime.fsSync;
  const script = task.paths.invariants;
  const stderrLogPath = join(ctx.runDir, "invariants.stderr.log");

  // Bun's child_process pipe setup for fd >= 3 is racy under load (it
  // creates a unix socket pair and the connect() can return ENOENT). Use
  // a temp file as the fd-3 backing store instead — the script still
  // writes via `$RESULTS_FD`, but we hand it a real file descriptor.
  const fd3Path = join(ctx.runDir, "invariants.fd3.ndjson");
  const fd3File = fsSync.openSync(fd3Path, "w+");

  let code;
  try {
    const child = runtime.subprocess.spawn(script, [], {
      env: {
        ...runtime.proc.env,
        WORKDIR: ctx.cwd,
        PORT: String(ctx.port),
        RESULTS_FD: "3",
      },
      stdio: ["inherit", "pipe", "pipe", fd3File],
    });

    // Drain stdout (do not require consumers to read it); capture stderr to
    // log. Both streams are awaited together so a failure on either one is
    // observed here instead of surfacing as an unhandled rejection.
    let stderr = "";
    const drainStdout = (async () => {
      for await (const _chunk of child.stdout) {
        // discard
      }
    })();
    const captureStderr = (async () => {
      for await (const chunk of child.stderr) stderr += chunk.toString();
    })();
    await Promise.all([drainStdout, captureStderr]);
    code = await child.exitCode;

    fsSync.writeFileSync(stderrLogPath, stderr);
  } catch (e) {
    // Release the descriptor and discard the temp file on every failure path
    // so an aborted run leaks neither.
    tryClose(fsSync, fd3File);
    readAndUnlink(fsSync, fd3Path);
    throw e;
  }

  // Close before reading so the rows are read back through a fresh handle.
  tryClose(fsSync, fd3File);
  const details = [];
  parseFd3Buffer(readAndUnlink(fsSync, fd3Path), details);

  // A non-numeric exit code is reported as -1, which maps to "fail".
  const exitCode = typeof code === "number" ? code : -1;
  return {
    verdict: exitCode === 0 ? "pass" : "fail",
    details,
    exitCode,
  };
}
|
|
83
|
+
|
|
84
|
+
/**
 * Append one NDJSON line to `details`.
 *
 * Blank lines are ignored. A line that is not valid JSON is still recorded,
 * as `{ raw, parseError: true }`, so malformed rows are not lost.
 *
 * @param {string} line - One line of text from the results stream.
 * @param {Array<object>} details - Accumulator, mutated in place.
 */
function pushRow(line, details) {
  const text = line.trim();
  if (text.length === 0) return;

  let row;
  try {
    row = JSON.parse(text);
  } catch {
    row = { raw: text, parseError: true };
  }
  details.push(row);
}
|
|
93
|
+
|
|
94
|
+
/**
 * Close a file descriptor, ignoring any error from `closeSync`.
 *
 * @param {{closeSync: Function}} fsSync - Sync filesystem surface.
 * @param {number} fd - Descriptor to close.
 */
function tryClose(fsSync, fd) {
  try {
    fsSync.closeSync(fd);
  } catch (_ignored) {
    // Best-effort: the descriptor may already be closed.
  }
}
|
|
101
|
+
|
|
102
|
+
/**
 * Read a file as UTF-8 text, then delete it.
 *
 * Both steps are best-effort: a failed read yields an empty string and a
 * failed delete is ignored.
 *
 * @param {{readFileSync: Function, unlinkSync: Function}} fsSync - Sync
 *   filesystem surface.
 * @param {string} path - File to consume.
 * @returns {string} File contents, or "" when the file could not be read.
 */
function readAndUnlink(fsSync, path) {
  const attempt = (operation, fallback) => {
    try {
      return operation();
    } catch {
      return fallback;
    }
  };

  const contents = attempt(() => fsSync.readFileSync(path, "utf8"), "");
  attempt(() => fsSync.unlinkSync(path));
  return contents;
}
|
|
116
|
+
|
|
117
|
+
/**
 * Split the fd-3 text (read back from its temp-file backing) on newlines and
 * hand each line to `pushRow`, which appends one entry to `details` per
 * non-blank line. An empty buffer leaves `details` untouched.
 *
 * @param {string} buf - Raw text captured from fd 3.
 * @param {Array<object>} details - Accumulator, mutated in place.
 */
function parseFd3Buffer(buf, details) {
  if (!buf) return;
  // `pushRow` skips blank lines itself, so a trailing newline (or a final
  // whitespace-only segment) needs no special handling here.
  for (const line of buf.split("\n")) {
    pushRow(line, details);
  }
}
|
package/src/benchmark/judge.js
CHANGED
|
@@ -9,13 +9,11 @@
|
|
|
9
9
|
* {{AGENT_INSTRUCTIONS}} — contents of agent.task.md
|
|
10
10
|
* {{AGENT_PROFILE}} — agent profile body (empty string if none)
|
|
11
11
|
* {{AGENT_TRACE_PATH}} — path to agent.ndjson
|
|
12
|
-
* {{
|
|
12
|
+
* {{INVARIANTS_RESULT}} — JSON invariants object
|
|
13
13
|
* {{SKILL_SET_HASH}} — SHA-256 from apm.lock.yaml
|
|
14
14
|
* {{TASK_ID}} — task name (directory under tasks/)
|
|
15
15
|
* {{TASK_DIR}} — agent working directory path
|
|
16
16
|
*
|
|
17
|
-
* Legacy alias: {{SCORING}} is accepted as an alias for {{SCORING_RESULT}}.
|
|
18
|
-
*
|
|
19
17
|
* The judge verdict is captured from the orchestration context's
|
|
20
18
|
* `concluded` flag directly — no trace parsing on the happy path.
|
|
21
19
|
* `parseConcludeFromTrace` is preserved for offline analysis and as a
|
|
@@ -23,9 +21,6 @@
|
|
|
23
21
|
* historical run from its judge.ndjson file).
|
|
24
22
|
*/
|
|
25
23
|
|
|
26
|
-
import { createReadStream, createWriteStream } from "node:fs";
|
|
27
|
-
import { readFile } from "node:fs/promises";
|
|
28
|
-
import { createInterface } from "node:readline";
|
|
29
24
|
import { createJudge } from "../judge.js";
|
|
30
25
|
import { createRedactor } from "../redaction.js";
|
|
31
26
|
|
|
@@ -46,17 +41,19 @@ import { createRedactor } from "../redaction.js";
|
|
|
46
41
|
* Run the judge over a completed task run.
|
|
47
42
|
* @param {import("./task-family.js").Task} task
|
|
48
43
|
* @param {import("./workdir.js").Workdir} workdir
|
|
49
|
-
* @param {import("./
|
|
50
|
-
* @param {{query: Function, model: string, judgeProfile?: string, profilesDir?: string}} deps
|
|
44
|
+
* @param {import("./invariants.js").InvariantsResult} invariants
|
|
45
|
+
* @param {{query: Function, model: string, judgeProfile?: string, profilesDir?: string, runtime: import("@forwardimpact/libutil/runtime").Runtime}} deps
|
|
51
46
|
* @param {JudgeContext} [context]
|
|
52
47
|
* @returns {Promise<JudgeVerdict>}
|
|
53
48
|
*/
|
|
54
|
-
export async function runJudge(task, workdir,
|
|
55
|
-
const
|
|
56
|
-
|
|
49
|
+
export async function runJudge(task, workdir, invariants, deps, context) {
|
|
50
|
+
const runtime = deps.runtime;
|
|
51
|
+
if (!runtime) throw new Error("runtime is required");
|
|
52
|
+
const fs = runtime.fs;
|
|
53
|
+
const template = await fs.readFile(task.paths.judge, "utf8");
|
|
54
|
+
const invariantsJson = JSON.stringify(invariants, null, 2);
|
|
57
55
|
const taskText = template
|
|
58
|
-
.replaceAll("{{
|
|
59
|
-
.replaceAll("{{SCORING}}", scoringJson)
|
|
56
|
+
.replaceAll("{{INVARIANTS_RESULT}}", invariantsJson)
|
|
60
57
|
.replaceAll("{{AGENT_TRACE_PATH}}", workdir.agentTracePath)
|
|
61
58
|
.replaceAll("{{AGENT_INSTRUCTIONS}}", context?.agentInstructions ?? "")
|
|
62
59
|
.replaceAll("{{AGENT_PROFILE}}", context?.agentProfile ?? "")
|
|
@@ -64,7 +61,7 @@ export async function runJudge(task, workdir, scoring, deps, context) {
|
|
|
64
61
|
.replaceAll("{{TASK_ID}}", task.id)
|
|
65
62
|
.replaceAll("{{TASK_DIR}}", workdir.cwd);
|
|
66
63
|
|
|
67
|
-
const output = createWriteStream(workdir.judgeTracePath);
|
|
64
|
+
const output = fs.createWriteStream(workdir.judgeTracePath);
|
|
68
65
|
const judge = createJudge({
|
|
69
66
|
cwd: workdir.cwd,
|
|
70
67
|
query: deps.query,
|
|
@@ -73,7 +70,8 @@ export async function runJudge(task, workdir, scoring, deps, context) {
|
|
|
73
70
|
judgeProfile: deps.judgeProfile,
|
|
74
71
|
profilesDir: deps.profilesDir,
|
|
75
72
|
maxTurns: 25,
|
|
76
|
-
redactor: createRedactor(),
|
|
73
|
+
redactor: createRedactor({ runtime }),
|
|
74
|
+
runtime,
|
|
77
75
|
});
|
|
78
76
|
|
|
79
77
|
let outcome;
|
|
@@ -98,13 +96,14 @@ export async function runJudge(task, workdir, scoring, deps, context) {
|
|
|
98
96
|
* and map the verdict (`success → pass`, `failure → fail`). Preserved for
|
|
99
97
|
* offline analysis; not used on the runtime happy path.
|
|
100
98
|
* @param {string} tracePath
|
|
99
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
|
|
101
100
|
* @returns {Promise<JudgeVerdict | null>}
|
|
102
101
|
*/
|
|
103
|
-
export async function parseConcludeFromTrace(tracePath) {
|
|
104
|
-
|
|
105
|
-
const
|
|
102
|
+
export async function parseConcludeFromTrace(tracePath, runtime) {
|
|
103
|
+
if (!runtime) throw new Error("runtime is required");
|
|
104
|
+
const content = await runtime.fs.readFile(tracePath, "utf8");
|
|
106
105
|
let last = null;
|
|
107
|
-
for
|
|
106
|
+
for (const line of content.split("\n")) {
|
|
108
107
|
const candidate = extractConcludeInput(line);
|
|
109
108
|
if (candidate) last = candidate;
|
|
110
109
|
}
|
|
@@ -3,23 +3,22 @@
|
|
|
3
3
|
* is present, then copies the resulting `node_modules/` into the staging
|
|
4
4
|
* directory so WorkdirManager can seed each per-task CWD.
|
|
5
5
|
*
|
|
6
|
-
* Symmetric to ApmInstaller:
|
|
7
|
-
*
|
|
6
|
+
* Symmetric to ApmInstaller: the subprocess and filesystem flow through the
|
|
7
|
+
* injected `runtime` bag (`runtime.subprocess.spawn` + `runtime.fs`).
|
|
8
8
|
*/
|
|
9
9
|
|
|
10
|
-
import { spawn as nodeSpawn } from "node:child_process";
|
|
11
|
-
import { access, cp } from "node:fs/promises";
|
|
12
10
|
import { join } from "node:path";
|
|
13
11
|
|
|
14
12
|
/** Run `bun install` in the family root and stage node_modules/ for per-task CWDs. */
|
|
15
13
|
export class NpmInstaller {
|
|
16
14
|
/**
|
|
17
|
-
* @param {object}
|
|
18
|
-
* @param {
|
|
19
|
-
*
|
|
15
|
+
* @param {object} deps
|
|
16
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} deps.runtime -
|
|
17
|
+
* Ambient collaborators; uses `subprocess.spawn` and `fs`.
|
|
20
18
|
*/
|
|
21
|
-
constructor({
|
|
22
|
-
|
|
19
|
+
constructor({ runtime }) {
|
|
20
|
+
if (!runtime) throw new Error("runtime is required");
|
|
21
|
+
this.runtime = runtime;
|
|
23
22
|
}
|
|
24
23
|
|
|
25
24
|
/**
|
|
@@ -28,8 +27,10 @@ export class NpmInstaller {
|
|
|
28
27
|
* @returns {Promise<void>}
|
|
29
28
|
*/
|
|
30
29
|
async install(family, stagingDir) {
|
|
30
|
+
const fs = this.runtime.fs;
|
|
31
31
|
const pkgJson = join(family.rootPath, "package.json");
|
|
32
|
-
const hasPkg = await
|
|
32
|
+
const hasPkg = await fs
|
|
33
|
+
.access(pkgJson)
|
|
33
34
|
.then(() => true)
|
|
34
35
|
.catch(() => false);
|
|
35
36
|
if (!hasPkg) return;
|
|
@@ -38,37 +39,35 @@ export class NpmInstaller {
|
|
|
38
39
|
|
|
39
40
|
const sourceModules = join(family.rootPath, "node_modules");
|
|
40
41
|
try {
|
|
41
|
-
await access(sourceModules);
|
|
42
|
+
await fs.access(sourceModules);
|
|
42
43
|
} catch {
|
|
43
44
|
throw new Error(
|
|
44
45
|
`bun install did not produce node_modules/ at ${sourceModules}; check the family's package.json`,
|
|
45
46
|
);
|
|
46
47
|
}
|
|
47
48
|
|
|
48
|
-
await cp(sourceModules, join(stagingDir, "node_modules"), {
|
|
49
|
+
await fs.cp(sourceModules, join(stagingDir, "node_modules"), {
|
|
49
50
|
recursive: true,
|
|
50
51
|
});
|
|
51
52
|
}
|
|
52
53
|
|
|
53
|
-
#runBunInstall(cwd) {
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
stdio: ["ignore", "pipe", "pipe"],
|
|
58
|
-
});
|
|
59
|
-
let stderr = "";
|
|
60
|
-
child.stdout.on("data", () => {});
|
|
61
|
-
child.stderr.on("data", (d) => {
|
|
62
|
-
stderr += d.toString();
|
|
63
|
-
});
|
|
64
|
-
child.on("error", (e) => {
|
|
65
|
-
rej(new Error(`failed to spawn bun: ${e.message}`));
|
|
66
|
-
});
|
|
67
|
-
child.on("close", (code) => {
|
|
68
|
-
if (code === 0) res();
|
|
69
|
-
else rej(new Error(`bun install exited ${code}: ${stderr}`));
|
|
70
|
-
});
|
|
54
|
+
async #runBunInstall(cwd) {
|
|
55
|
+
const child = this.runtime.subprocess.spawn("bun", ["install"], {
|
|
56
|
+
cwd,
|
|
57
|
+
stdio: ["ignore", "pipe", "pipe"],
|
|
71
58
|
});
|
|
59
|
+
let stderr = "";
|
|
60
|
+
const drainStdout = (async () => {
|
|
61
|
+
for await (const _chunk of child.stdout) {
|
|
62
|
+
// discard
|
|
63
|
+
}
|
|
64
|
+
})();
|
|
65
|
+
for await (const chunk of child.stderr) stderr += chunk.toString();
|
|
66
|
+
await drainStdout;
|
|
67
|
+
const code = await child.exitCode;
|
|
68
|
+
if (code !== 0) {
|
|
69
|
+
throw new Error(`bun install exited ${code}: ${stderr}`);
|
|
70
|
+
}
|
|
72
71
|
}
|
|
73
72
|
}
|
|
74
73
|
|
|
@@ -78,10 +77,11 @@ export function createNpmInstaller(deps) {
|
|
|
78
77
|
}
|
|
79
78
|
|
|
80
79
|
/**
|
|
81
|
-
* Free-function shorthand for callers that
|
|
80
|
+
* Free-function shorthand for callers that thread a runtime bag.
|
|
82
81
|
* @param {import("./task-family.js").TaskFamily} family
|
|
83
82
|
* @param {string} stagingDir
|
|
83
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
|
|
84
84
|
*/
|
|
85
|
-
export function installNpm(family, stagingDir) {
|
|
86
|
-
return new NpmInstaller().install(family, stagingDir);
|
|
85
|
+
export function installNpm(family, stagingDir, runtime) {
|
|
86
|
+
return new NpmInstaller({ runtime }).install(family, stagingDir);
|
|
87
87
|
}
|