@forwardimpact/libeval 0.1.51 → 0.1.53
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/fit-benchmark.js +8 -14
- package/bin/fit-eval.js +8 -28
- package/bin/fit-selfedit.js +6 -4
- package/bin/fit-trace.js +7 -14
- package/package.json +1 -1
- package/src/benchmark/apm-installer.js +48 -44
- package/src/benchmark/invariants.js +51 -63
- package/src/benchmark/judge.js +13 -11
- package/src/benchmark/npm-installer.js +33 -33
- package/src/benchmark/report.js +25 -11
- package/src/benchmark/result.js +2 -2
- package/src/benchmark/runner.js +82 -38
- package/src/benchmark/task-family.js +74 -63
- package/src/benchmark/workdir.js +91 -99
- package/src/commands/benchmark-invariants.js +3 -3
- package/src/commands/benchmark-report.js +1 -0
- package/src/commands/benchmark-run.js +1 -1
- package/src/commands/by-discussion.js +10 -11
- package/src/commands/discuss.js +3 -2
- package/src/commands/facilitate.js +3 -2
- package/src/commands/output.js +4 -1
- package/src/commands/run.js +6 -2
- package/src/commands/supervise.js +3 -2
- package/src/commands/tee.js +24 -9
- package/src/commands/trace.js +7 -2
- package/src/discusser.js +7 -5
- package/src/events/github.js +7 -1
- package/src/facilitator.js +6 -5
- package/src/inbox-poller.js +5 -8
- package/src/judge.js +12 -13
- package/src/profile-prompt.js +124 -26
- package/src/redaction.js +3 -16
- package/src/supervisor.js +7 -0
- package/src/tee-writer.js +4 -2
- package/src/trace-collector.js +9 -2
- package/src/trace-github.js +47 -27
package/bin/fit-benchmark.js
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
import "@forwardimpact/libpreflight/node22";
|
|
4
4
|
|
|
5
|
-
import {
|
|
5
|
+
import { realpathSync } from "node:fs";
|
|
6
6
|
import { createCli } from "@forwardimpact/libcli";
|
|
7
7
|
import { createDefaultRuntime } from "@forwardimpact/libutil/runtime";
|
|
8
8
|
import { createLogger } from "@forwardimpact/libtelemetry";
|
|
@@ -11,17 +11,8 @@ import { runBenchmarkRunCommand } from "../src/commands/benchmark-run.js";
|
|
|
11
11
|
import { runBenchmarkInvariantsCommand } from "../src/commands/benchmark-invariants.js";
|
|
12
12
|
import { runBenchmarkReportCommand } from "../src/commands/benchmark-report.js";
|
|
13
13
|
|
|
14
|
-
// `bun build --compile` injects FIT_BENCHMARK_VERSION via --define, eliminating
|
|
15
|
-
// the readFileSync branch in the compiled binary (which would ENOENT against
|
|
16
|
-
// the bunfs virtual mount). Source execution falls through to package.json.
|
|
17
|
-
const VERSION =
|
|
18
|
-
process.env.FIT_BENCHMARK_VERSION ||
|
|
19
|
-
JSON.parse(readFileSync(new URL("../package.json", import.meta.url), "utf8"))
|
|
20
|
-
.version;
|
|
21
|
-
|
|
22
14
|
export const definition = {
|
|
23
15
|
name: "fit-benchmark",
|
|
24
|
-
version: VERSION,
|
|
25
16
|
description:
|
|
26
17
|
"Run coding-agent task families, grade hidden tests, and aggregate pass@k across runs.",
|
|
27
18
|
commands: [
|
|
@@ -156,11 +147,14 @@ export const definition = {
|
|
|
156
147
|
],
|
|
157
148
|
};
|
|
158
149
|
|
|
159
|
-
const
|
|
150
|
+
const runtime = createDefaultRuntime();
|
|
151
|
+
const logger = createLogger("benchmark", runtime);
|
|
160
152
|
|
|
161
153
|
async function main() {
|
|
162
|
-
const
|
|
163
|
-
|
|
154
|
+
const cli = createCli(definition, {
|
|
155
|
+
runtime,
|
|
156
|
+
packageJsonUrl: new URL("../package.json", import.meta.url),
|
|
157
|
+
});
|
|
164
158
|
const parsed = cli.parse(runtime.proc.argv.slice(2));
|
|
165
159
|
if (!parsed) return runtime.proc.exit(0);
|
|
166
160
|
|
|
@@ -187,7 +181,7 @@ async function main() {
|
|
|
187
181
|
if (import.meta.url === `file://${realpathSync(process.argv[1])}`) {
|
|
188
182
|
main().catch((error) => {
|
|
189
183
|
logger.exception("main", error);
|
|
190
|
-
createCli(definition).error(error.message);
|
|
184
|
+
createCli(definition, { runtime }).error(error.message);
|
|
191
185
|
process.exit(1);
|
|
192
186
|
});
|
|
193
187
|
}
|
package/bin/fit-eval.js
CHANGED
|
@@ -2,7 +2,6 @@
|
|
|
2
2
|
|
|
3
3
|
import "@forwardimpact/libpreflight/node22";
|
|
4
4
|
|
|
5
|
-
import { readFileSync } from "node:fs";
|
|
6
5
|
import { createCli } from "@forwardimpact/libcli";
|
|
7
6
|
import { createDefaultRuntime } from "@forwardimpact/libutil/runtime";
|
|
8
7
|
import { createLogger } from "@forwardimpact/libtelemetry";
|
|
@@ -15,27 +14,6 @@ import { runFacilitateCommand } from "../src/commands/facilitate.js";
|
|
|
15
14
|
import { runDiscussCommand } from "../src/commands/discuss.js";
|
|
16
15
|
import { runCallbackCommand } from "../src/commands/callback.js";
|
|
17
16
|
|
|
18
|
-
// `tee` streams stdin→stdout via Node's `pipeline`, which needs real stream
|
|
19
|
-
// objects the runtime surface does not expose; it keeps the legacy
|
|
20
|
-
// `(values, args)` signature and this adapter bridges it into dispatch.
|
|
21
|
-
async function teeHandler(ctx) {
|
|
22
|
-
const out = ctx.args.output;
|
|
23
|
-
try {
|
|
24
|
-
await runTeeCommand(ctx.options, out ? [out] : []);
|
|
25
|
-
return { ok: true };
|
|
26
|
-
} catch (error) {
|
|
27
|
-
return { ok: false, code: 1, error: error.message };
|
|
28
|
-
}
|
|
29
|
-
}
|
|
30
|
-
|
|
31
|
-
// `bun build --compile` injects FIT_EVAL_VERSION via --define, eliminating
|
|
32
|
-
// the readFileSync branch in the compiled binary (which would ENOENT against
|
|
33
|
-
// the bunfs virtual mount). Source execution falls through to package.json.
|
|
34
|
-
const VERSION =
|
|
35
|
-
process.env.FIT_EVAL_VERSION ||
|
|
36
|
-
JSON.parse(readFileSync(new URL("../package.json", import.meta.url), "utf8"))
|
|
37
|
-
.version;
|
|
38
|
-
|
|
39
17
|
const LEAD_OPTIONS = {
|
|
40
18
|
"lead-profile": {
|
|
41
19
|
type: "string",
|
|
@@ -73,7 +51,6 @@ const TASK_INPUT_OPTIONS = {
|
|
|
73
51
|
|
|
74
52
|
const definition = {
|
|
75
53
|
name: "fit-eval",
|
|
76
|
-
version: VERSION,
|
|
77
54
|
description:
|
|
78
55
|
"Run agents and capture NDJSON traces — for agent evaluations or multi-agent collaboration",
|
|
79
56
|
commands: [
|
|
@@ -249,7 +226,7 @@ const definition = {
|
|
|
249
226
|
name: "tee",
|
|
250
227
|
args: ["output"],
|
|
251
228
|
argsUsage: "[output.ndjson]",
|
|
252
|
-
handler:
|
|
229
|
+
handler: runTeeCommand,
|
|
253
230
|
description:
|
|
254
231
|
"Stream readable text to stdout while saving raw NDJSON to a file",
|
|
255
232
|
},
|
|
@@ -326,11 +303,14 @@ const definition = {
|
|
|
326
303
|
],
|
|
327
304
|
};
|
|
328
305
|
|
|
329
|
-
const
|
|
306
|
+
const runtime = createDefaultRuntime();
|
|
307
|
+
const logger = createLogger("eval", runtime);
|
|
330
308
|
|
|
331
309
|
async function main() {
|
|
332
|
-
const
|
|
333
|
-
|
|
310
|
+
const cli = createCli(definition, {
|
|
311
|
+
runtime,
|
|
312
|
+
packageJsonUrl: new URL("../package.json", import.meta.url),
|
|
313
|
+
});
|
|
334
314
|
const parsed = cli.parse(runtime.proc.argv.slice(2));
|
|
335
315
|
if (!parsed) return runtime.proc.exit(0);
|
|
336
316
|
|
|
@@ -354,6 +334,6 @@ async function main() {
|
|
|
354
334
|
|
|
355
335
|
main().catch((error) => {
|
|
356
336
|
logger.exception("main", error);
|
|
357
|
-
createCli(definition).error(error.message);
|
|
337
|
+
createCli(definition, { runtime }).error(error.message);
|
|
358
338
|
process.exit(1);
|
|
359
339
|
});
|
package/bin/fit-selfedit.js
CHANGED
|
@@ -7,12 +7,11 @@
|
|
|
7
7
|
|
|
8
8
|
import "@forwardimpact/libpreflight/node22";
|
|
9
9
|
import { existsSync, readFileSync, writeFileSync } from "node:fs";
|
|
10
|
-
import fsPromises from "node:fs/promises";
|
|
11
10
|
import { parseArgs } from "node:util";
|
|
12
11
|
import { resolve, relative, dirname } from "node:path";
|
|
13
12
|
import { execFileSync } from "node:child_process";
|
|
14
13
|
|
|
15
|
-
import {
|
|
14
|
+
import { createDefaultRuntime } from "@forwardimpact/libutil/runtime";
|
|
16
15
|
import { minimatch } from "minimatch";
|
|
17
16
|
|
|
18
17
|
const HELP = `fit-selfedit — write stdin to a settings.json-allowed path on a non-main branch.
|
|
@@ -71,8 +70,11 @@ if (extra.length > 0) fail(`unexpected extra arguments: ${extra.join(" ")}`);
|
|
|
71
70
|
|
|
72
71
|
const absoluteTarget = resolve(process.cwd(), targetArg);
|
|
73
72
|
|
|
74
|
-
// Safeguard 1: settings.json must grant Edit() on this path.
|
|
75
|
-
|
|
73
|
+
// Safeguard 1: settings.json must grant Edit() on this path. The bin is the
|
|
74
|
+
// sole construction site for the runtime; resolve the finder off the bag
|
|
75
|
+
// rather than constructing a Finder here (Success Criterion 9).
|
|
76
|
+
const runtime = createDefaultRuntime();
|
|
77
|
+
const settingsPath = runtime.finder.findUpward(
|
|
76
78
|
dirname(absoluteTarget),
|
|
77
79
|
".claude/settings.json",
|
|
78
80
|
20,
|
package/bin/fit-trace.js
CHANGED
|
@@ -2,7 +2,6 @@
|
|
|
2
2
|
|
|
3
3
|
import "@forwardimpact/libpreflight/node22";
|
|
4
4
|
|
|
5
|
-
import { readFileSync } from "node:fs";
|
|
6
5
|
import { createCli } from "@forwardimpact/libcli";
|
|
7
6
|
import { createDefaultRuntime } from "@forwardimpact/libutil/runtime";
|
|
8
7
|
import { createScriptConfig } from "@forwardimpact/libconfig";
|
|
@@ -31,17 +30,8 @@ import {
|
|
|
31
30
|
import { runAssertCommand } from "../src/commands/assert.js";
|
|
32
31
|
import { runByDiscussionCommand } from "../src/commands/by-discussion.js";
|
|
33
32
|
|
|
34
|
-
// `bun build --compile` injects FIT_TRACE_VERSION via --define, eliminating
|
|
35
|
-
// the readFileSync branch in the compiled binary (which would ENOENT against
|
|
36
|
-
// the bunfs virtual mount). Source execution falls through to package.json.
|
|
37
|
-
const VERSION =
|
|
38
|
-
process.env.FIT_TRACE_VERSION ||
|
|
39
|
-
JSON.parse(readFileSync(new URL("../package.json", import.meta.url), "utf8"))
|
|
40
|
-
.version;
|
|
41
|
-
|
|
42
33
|
const definition = {
|
|
43
34
|
name: "fit-trace",
|
|
44
|
-
version: VERSION,
|
|
45
35
|
description:
|
|
46
36
|
"Download, query, and analyze agent execution traces — read NDJSON output from fit-eval as qualitative research",
|
|
47
37
|
commands: [
|
|
@@ -340,15 +330,18 @@ const definition = {
|
|
|
340
330
|
],
|
|
341
331
|
};
|
|
342
332
|
|
|
343
|
-
const
|
|
333
|
+
const runtime = createDefaultRuntime();
|
|
334
|
+
const logger = createLogger("trace", runtime);
|
|
344
335
|
|
|
345
336
|
// Commands that talk to the GitHub API need a config-backed token resolver;
|
|
346
337
|
// the rest only read local trace files through the runtime.
|
|
347
338
|
const NEEDS_CONFIG = new Set(["runs", "download"]);
|
|
348
339
|
|
|
349
340
|
async function main() {
|
|
350
|
-
const
|
|
351
|
-
|
|
341
|
+
const cli = createCli(definition, {
|
|
342
|
+
runtime,
|
|
343
|
+
packageJsonUrl: new URL("../package.json", import.meta.url),
|
|
344
|
+
});
|
|
352
345
|
const parsed = cli.parse(runtime.proc.argv.slice(2));
|
|
353
346
|
if (!parsed) return runtime.proc.exit(0);
|
|
354
347
|
|
|
@@ -376,6 +369,6 @@ async function main() {
|
|
|
376
369
|
|
|
377
370
|
main().catch((error) => {
|
|
378
371
|
logger.exception("main", error);
|
|
379
|
-
createCli(definition).error(error.message);
|
|
372
|
+
createCli(definition, { runtime }).error(error.message);
|
|
380
373
|
process.exit(1);
|
|
381
374
|
});
|
package/package.json
CHANGED
package/src/benchmark/apm-installer.js
CHANGED
|
@@ -4,26 +4,25 @@
|
|
|
4
4
|
* staging directory, and computes the manifest fingerprint from the lockfile.
|
|
5
5
|
* Per-task copy happens later in WorkdirManager.
|
|
6
6
|
*
|
|
7
|
-
*
|
|
8
|
-
*
|
|
9
|
-
*
|
|
10
|
-
*
|
|
7
|
+
* Subprocess and filesystem access route through the injected `runtime` bag
|
|
8
|
+
* (`runtime.subprocess.spawn` for the streaming `apm` child, `runtime.fs` for
|
|
9
|
+
* the async staging copies). See `createApmInstaller` for the real-dependency
|
|
10
|
+
* wiring; `installApm` is a thin free-function wrapper.
|
|
11
11
|
*/
|
|
12
12
|
|
|
13
|
-
import { spawn as nodeSpawn } from "node:child_process";
|
|
14
13
|
import { createHash } from "node:crypto";
|
|
15
|
-
import { access, cp, mkdir, readFile, rm } from "node:fs/promises";
|
|
16
14
|
import { join } from "node:path";
|
|
17
15
|
|
|
18
16
|
/** Installs apm and stages `.claude/` for a task family. */
|
|
19
17
|
export class ApmInstaller {
|
|
20
18
|
/**
|
|
21
|
-
* @param {object}
|
|
22
|
-
* @param {
|
|
23
|
-
*
|
|
19
|
+
* @param {object} deps
|
|
20
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} deps.runtime -
|
|
21
|
+
* Ambient collaborators; uses `subprocess.spawn` and `fs`.
|
|
24
22
|
*/
|
|
25
|
-
constructor({
|
|
26
|
-
|
|
23
|
+
constructor({ runtime }) {
|
|
24
|
+
if (!runtime) throw new Error("runtime is required");
|
|
25
|
+
this.runtime = runtime;
|
|
27
26
|
}
|
|
28
27
|
|
|
29
28
|
/**
|
|
@@ -32,19 +31,21 @@ export class ApmInstaller {
|
|
|
32
31
|
* @returns {Promise<{stagingDir: string, skillSetHash: string, judgeProfilesDir: string}>}
|
|
33
32
|
*/
|
|
34
33
|
async install(family, outputDir) {
|
|
34
|
+
const fs = this.runtime.fs;
|
|
35
35
|
const stagingDir = join(outputDir, ".apm-staging");
|
|
36
36
|
const stagedClaude = join(stagingDir, ".claude");
|
|
37
37
|
const sourceClaude = join(family.rootPath, ".claude");
|
|
38
38
|
const apmYml = join(family.rootPath, "apm.yml");
|
|
39
39
|
|
|
40
|
-
const hasApm = await
|
|
40
|
+
const hasApm = await fs
|
|
41
|
+
.access(apmYml)
|
|
41
42
|
.then(() => true)
|
|
42
43
|
.catch(() => false);
|
|
43
44
|
|
|
44
45
|
if (hasApm) {
|
|
45
46
|
await this.#runApmInstall(family.rootPath);
|
|
46
47
|
try {
|
|
47
|
-
await access(sourceClaude);
|
|
48
|
+
await fs.access(sourceClaude);
|
|
48
49
|
} catch {
|
|
49
50
|
throw new Error(
|
|
50
51
|
`apm install did not produce .claude/ at ${sourceClaude}; check the family's apm.yml`,
|
|
@@ -52,14 +53,15 @@ export class ApmInstaller {
|
|
|
52
53
|
}
|
|
53
54
|
}
|
|
54
55
|
|
|
55
|
-
await rm(stagingDir, { recursive: true, force: true });
|
|
56
|
-
const hasClaudeDir = await
|
|
56
|
+
await fs.rm(stagingDir, { recursive: true, force: true });
|
|
57
|
+
const hasClaudeDir = await fs
|
|
58
|
+
.access(sourceClaude)
|
|
57
59
|
.then(() => true)
|
|
58
60
|
.catch(() => false);
|
|
59
61
|
if (hasClaudeDir) {
|
|
60
|
-
await cp(sourceClaude, stagedClaude, { recursive: true });
|
|
62
|
+
await fs.cp(sourceClaude, stagedClaude, { recursive: true });
|
|
61
63
|
} else {
|
|
62
|
-
await mkdir(stagedClaude, { recursive: true });
|
|
64
|
+
await fs.mkdir(stagedClaude, { recursive: true });
|
|
63
65
|
}
|
|
64
66
|
|
|
65
67
|
// Stage the family-local judge profile outside .claude/ so it is available
|
|
@@ -67,15 +69,15 @@ export class ApmInstaller {
|
|
|
67
69
|
const judgeSource = join(family.rootPath, "judge.md");
|
|
68
70
|
const judgeProfilesDir = join(stagingDir, "judge-profiles");
|
|
69
71
|
try {
|
|
70
|
-
await access(judgeSource);
|
|
71
|
-
await mkdir(judgeProfilesDir, { recursive: true });
|
|
72
|
-
await cp(judgeSource, join(judgeProfilesDir, "judge.md"));
|
|
72
|
+
await fs.access(judgeSource);
|
|
73
|
+
await fs.mkdir(judgeProfilesDir, { recursive: true });
|
|
74
|
+
await fs.cp(judgeSource, join(judgeProfilesDir, "judge.md"));
|
|
73
75
|
} catch {}
|
|
74
76
|
|
|
75
77
|
const lockPath = join(family.rootPath, "apm.lock.yaml");
|
|
76
78
|
let skillSetHash = "";
|
|
77
79
|
try {
|
|
78
|
-
const lockBytes = await readFile(lockPath);
|
|
80
|
+
const lockBytes = await fs.readFile(lockPath);
|
|
79
81
|
skillSetHash =
|
|
80
82
|
"sha256:" +
|
|
81
83
|
createHash("sha256").update(normalizeLf(lockBytes)).digest("hex");
|
|
@@ -86,25 +88,26 @@ export class ApmInstaller {
|
|
|
86
88
|
return { stagingDir, skillSetHash, judgeProfilesDir };
|
|
87
89
|
}
|
|
88
90
|
|
|
89
|
-
#runApmInstall(cwd) {
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
91
|
+
async #runApmInstall(cwd) {
|
|
92
|
+
const child = this.runtime.subprocess.spawn(
|
|
93
|
+
"apm",
|
|
94
|
+
["install", "--target", "claude"],
|
|
95
|
+
{ cwd, stdio: ["ignore", "pipe", "pipe"] },
|
|
96
|
+
);
|
|
97
|
+
// Drain stdout concurrently so the child never blocks on backpressure;
|
|
98
|
+
// capture stderr for the failure message.
|
|
99
|
+
let stderr = "";
|
|
100
|
+
const drainStdout = (async () => {
|
|
101
|
+
for await (const _chunk of child.stdout) {
|
|
102
|
+
// discard
|
|
103
|
+
}
|
|
104
|
+
})();
|
|
105
|
+
for await (const chunk of child.stderr) stderr += chunk.toString();
|
|
106
|
+
await drainStdout;
|
|
107
|
+
const code = await child.exitCode;
|
|
108
|
+
if (code !== 0) {
|
|
109
|
+
throw new Error(`apm install exited ${code}: ${stderr}`);
|
|
110
|
+
}
|
|
108
111
|
}
|
|
109
112
|
}
|
|
110
113
|
|
|
@@ -119,7 +122,7 @@ function normalizeLf(buf) {
|
|
|
119
122
|
|
|
120
123
|
/**
|
|
121
124
|
* Factory function — wires real dependencies.
|
|
122
|
-
* @param {ConstructorParameters<typeof ApmInstaller>[0]}
|
|
125
|
+
* @param {ConstructorParameters<typeof ApmInstaller>[0]} deps
|
|
123
126
|
* @returns {ApmInstaller}
|
|
124
127
|
*/
|
|
125
128
|
export function createApmInstaller(deps) {
|
|
@@ -127,10 +130,11 @@ export function createApmInstaller(deps) {
|
|
|
127
130
|
}
|
|
128
131
|
|
|
129
132
|
/**
|
|
130
|
-
* Free-function shorthand for callers that
|
|
133
|
+
* Free-function shorthand for callers that thread a runtime bag.
|
|
131
134
|
* @param {import("./task-family.js").TaskFamily} family
|
|
132
135
|
* @param {string} outputDir
|
|
136
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
|
|
133
137
|
*/
|
|
134
|
-
export function installApm(family, outputDir) {
|
|
135
|
-
return new ApmInstaller().install(family, outputDir);
|
|
138
|
+
export function installApm(family, outputDir, runtime) {
|
|
139
|
+
return new ApmInstaller({ runtime }).install(family, outputDir);
|
|
136
140
|
}
|
package/src/benchmark/invariants.js
CHANGED
|
@@ -2,16 +2,12 @@
|
|
|
2
2
|
* Invariants — runs `<task.paths.hooks>/invariants.sh` from the template path
|
|
3
3
|
* against the post-run agent CWD. The exit code is authoritative for the
|
|
4
4
|
* verdict; structured per-check rows arrive on fd 3 (`$RESULTS_FD=3`) as NDJSON.
|
|
5
|
+
*
|
|
6
|
+
* Subprocess access flows through `runtime.subprocess.spawn`; the fd-3 backing
|
|
7
|
+
* store and the stderr log use the sync filesystem surface (`runtime.fsSync`) —
|
|
8
|
+
* the only surface this module touches, per design Decision 7.
|
|
5
9
|
*/
|
|
6
10
|
|
|
7
|
-
import { spawn } from "node:child_process";
|
|
8
|
-
import {
|
|
9
|
-
closeSync,
|
|
10
|
-
createWriteStream,
|
|
11
|
-
openSync,
|
|
12
|
-
readFileSync,
|
|
13
|
-
unlinkSync,
|
|
14
|
-
} from "node:fs";
|
|
15
11
|
import { join } from "node:path";
|
|
16
12
|
|
|
17
13
|
/**
|
|
@@ -25,72 +21,64 @@ import { join } from "node:path";
|
|
|
25
21
|
* Run the task's invariants script.
|
|
26
22
|
* @param {import("./task-family.js").Task} task
|
|
27
23
|
* @param {{cwd: string, port: number, runDir: string}} ctx
|
|
24
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
|
|
28
25
|
* @returns {Promise<InvariantsResult>}
|
|
29
26
|
*/
|
|
30
|
-
export function runInvariants(task, ctx) {
|
|
27
|
+
export async function runInvariants(task, ctx, runtime) {
|
|
28
|
+
if (!runtime) throw new Error("runtime is required");
|
|
31
29
|
if (!task.paths.invariants) {
|
|
32
|
-
return
|
|
30
|
+
return { verdict: "pass", details: [], exitCode: 0 };
|
|
33
31
|
}
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
join(ctx.runDir, "invariants.stderr.log"),
|
|
38
|
-
);
|
|
32
|
+
const fsSync = runtime.fsSync;
|
|
33
|
+
const script = task.paths.invariants;
|
|
34
|
+
const stderrLogPath = join(ctx.runDir, "invariants.stderr.log");
|
|
39
35
|
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
try {
|
|
47
|
-
fd3File = openSync(fd3Path, "w+");
|
|
48
|
-
} catch (e) {
|
|
49
|
-
rej(e);
|
|
50
|
-
return;
|
|
51
|
-
}
|
|
36
|
+
// Bun's child_process pipe setup for fd >= 3 is racy under load (it
|
|
37
|
+
// creates a unix socket pair and the connect() can return ENOENT). Use
|
|
38
|
+
// a temp file as the fd-3 backing store instead — the script still
|
|
39
|
+
// writes via `$RESULTS_FD`, but we hand it a real file descriptor.
|
|
40
|
+
const fd3Path = join(ctx.runDir, "invariants.fd3.ndjson");
|
|
41
|
+
const fd3File = fsSync.openSync(fd3Path, "w+");
|
|
52
42
|
|
|
53
|
-
|
|
43
|
+
let child;
|
|
44
|
+
try {
|
|
45
|
+
child = runtime.subprocess.spawn(script, [], {
|
|
54
46
|
env: {
|
|
55
|
-
...
|
|
47
|
+
...runtime.proc.env,
|
|
56
48
|
WORKDIR: ctx.cwd,
|
|
57
49
|
PORT: String(ctx.port),
|
|
58
50
|
RESULTS_FD: "3",
|
|
59
51
|
},
|
|
60
52
|
stdio: ["inherit", "pipe", "pipe", fd3File],
|
|
61
53
|
});
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
54
|
+
} catch (e) {
|
|
55
|
+
tryClose(fsSync, fd3File);
|
|
56
|
+
throw e;
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
// Drain stdout (do not require consumers to read it); capture stderr to log.
|
|
60
|
+
const drainStdout = (async () => {
|
|
61
|
+
for await (const _chunk of child.stdout) {
|
|
62
|
+
// discard
|
|
70
63
|
}
|
|
64
|
+
})();
|
|
65
|
+
let stderr = "";
|
|
66
|
+
for await (const chunk of child.stderr) stderr += chunk.toString();
|
|
67
|
+
await drainStdout;
|
|
68
|
+
const code = await child.exitCode;
|
|
71
69
|
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
child.stdout.on("data", () => {});
|
|
70
|
+
fsSync.writeFileSync(stderrLogPath, stderr);
|
|
71
|
+
tryClose(fsSync, fd3File);
|
|
75
72
|
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
parseFd3Buffer(raw, details);
|
|
86
|
-
const exitCode = typeof code === "number" ? code : -1;
|
|
87
|
-
res({
|
|
88
|
-
verdict: exitCode === 0 ? "pass" : "fail",
|
|
89
|
-
details,
|
|
90
|
-
exitCode,
|
|
91
|
-
});
|
|
92
|
-
});
|
|
93
|
-
});
|
|
73
|
+
const raw = readAndUnlink(fsSync, fd3Path);
|
|
74
|
+
const details = [];
|
|
75
|
+
parseFd3Buffer(raw, details);
|
|
76
|
+
const exitCode = typeof code === "number" ? code : -1;
|
|
77
|
+
return {
|
|
78
|
+
verdict: exitCode === 0 ? "pass" : "fail",
|
|
79
|
+
details,
|
|
80
|
+
exitCode,
|
|
81
|
+
};
|
|
94
82
|
}
|
|
95
83
|
|
|
96
84
|
function pushRow(line, details) {
|
|
@@ -103,23 +91,23 @@ function pushRow(line, details) {
|
|
|
103
91
|
}
|
|
104
92
|
}
|
|
105
93
|
|
|
106
|
-
function tryClose(fd) {
|
|
94
|
+
function tryClose(fsSync, fd) {
|
|
107
95
|
try {
|
|
108
|
-
closeSync(fd);
|
|
96
|
+
fsSync.closeSync(fd);
|
|
109
97
|
} catch {
|
|
110
98
|
// already closed
|
|
111
99
|
}
|
|
112
100
|
}
|
|
113
101
|
|
|
114
|
-
function readAndUnlink(path) {
|
|
102
|
+
function readAndUnlink(fsSync, path) {
|
|
115
103
|
let raw = "";
|
|
116
104
|
try {
|
|
117
|
-
raw = readFileSync(path, "utf8");
|
|
105
|
+
raw = fsSync.readFileSync(path, "utf8");
|
|
118
106
|
} catch {
|
|
119
107
|
// empty
|
|
120
108
|
}
|
|
121
109
|
try {
|
|
122
|
-
unlinkSync(path);
|
|
110
|
+
fsSync.unlinkSync(path);
|
|
123
111
|
} catch {
|
|
124
112
|
// best-effort cleanup
|
|
125
113
|
}
|
package/src/benchmark/judge.js
CHANGED
|
@@ -21,9 +21,6 @@
|
|
|
21
21
|
* historical run from its judge.ndjson file).
|
|
22
22
|
*/
|
|
23
23
|
|
|
24
|
-
import { createReadStream, createWriteStream } from "node:fs";
|
|
25
|
-
import { readFile } from "node:fs/promises";
|
|
26
|
-
import { createInterface } from "node:readline";
|
|
27
24
|
import { createJudge } from "../judge.js";
|
|
28
25
|
import { createRedactor } from "../redaction.js";
|
|
29
26
|
|
|
@@ -45,12 +42,15 @@ import { createRedactor } from "../redaction.js";
|
|
|
45
42
|
* @param {import("./task-family.js").Task} task
|
|
46
43
|
* @param {import("./workdir.js").Workdir} workdir
|
|
47
44
|
* @param {import("./invariants.js").InvariantsResult} invariants
|
|
48
|
-
* @param {{query: Function, model: string, judgeProfile?: string, profilesDir?: string}} deps
|
|
45
|
+
* @param {{query: Function, model: string, judgeProfile?: string, profilesDir?: string, runtime: import("@forwardimpact/libutil/runtime").Runtime}} deps
|
|
49
46
|
* @param {JudgeContext} [context]
|
|
50
47
|
* @returns {Promise<JudgeVerdict>}
|
|
51
48
|
*/
|
|
52
49
|
export async function runJudge(task, workdir, invariants, deps, context) {
|
|
53
|
-
const
|
|
50
|
+
const runtime = deps.runtime;
|
|
51
|
+
if (!runtime) throw new Error("runtime is required");
|
|
52
|
+
const fs = runtime.fs;
|
|
53
|
+
const template = await fs.readFile(task.paths.judge, "utf8");
|
|
54
54
|
const invariantsJson = JSON.stringify(invariants, null, 2);
|
|
55
55
|
const taskText = template
|
|
56
56
|
.replaceAll("{{INVARIANTS_RESULT}}", invariantsJson)
|
|
@@ -61,7 +61,7 @@ export async function runJudge(task, workdir, invariants, deps, context) {
|
|
|
61
61
|
.replaceAll("{{TASK_ID}}", task.id)
|
|
62
62
|
.replaceAll("{{TASK_DIR}}", workdir.cwd);
|
|
63
63
|
|
|
64
|
-
const output = createWriteStream(workdir.judgeTracePath);
|
|
64
|
+
const output = fs.createWriteStream(workdir.judgeTracePath);
|
|
65
65
|
const judge = createJudge({
|
|
66
66
|
cwd: workdir.cwd,
|
|
67
67
|
query: deps.query,
|
|
@@ -70,7 +70,8 @@ export async function runJudge(task, workdir, invariants, deps, context) {
|
|
|
70
70
|
judgeProfile: deps.judgeProfile,
|
|
71
71
|
profilesDir: deps.profilesDir,
|
|
72
72
|
maxTurns: 25,
|
|
73
|
-
redactor: createRedactor(),
|
|
73
|
+
redactor: createRedactor({ runtime }),
|
|
74
|
+
runtime,
|
|
74
75
|
});
|
|
75
76
|
|
|
76
77
|
let outcome;
|
|
@@ -95,13 +96,14 @@ export async function runJudge(task, workdir, invariants, deps, context) {
|
|
|
95
96
|
* and map the verdict (`success → pass`, `failure → fail`). Preserved for
|
|
96
97
|
* offline analysis; not used on the runtime happy path.
|
|
97
98
|
* @param {string} tracePath
|
|
99
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
|
|
98
100
|
* @returns {Promise<JudgeVerdict | null>}
|
|
99
101
|
*/
|
|
100
|
-
export async function parseConcludeFromTrace(tracePath) {
|
|
101
|
-
|
|
102
|
-
const
|
|
102
|
+
export async function parseConcludeFromTrace(tracePath, runtime) {
|
|
103
|
+
if (!runtime) throw new Error("runtime is required");
|
|
104
|
+
const content = await runtime.fs.readFile(tracePath, "utf8");
|
|
103
105
|
let last = null;
|
|
104
|
-
for
|
|
106
|
+
for (const line of content.split("\n")) {
|
|
105
107
|
const candidate = extractConcludeInput(line);
|
|
106
108
|
if (candidate) last = candidate;
|
|
107
109
|
}
|