@forwardimpact/libeval 0.1.49 → 0.1.51
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +11 -8
- package/bin/fit-benchmark.js +26 -27
- package/bin/fit-eval.js +76 -78
- package/bin/fit-trace.js +83 -57
- package/package.json +2 -2
- package/src/agent-runner.js +23 -13
- package/src/benchmark/env-loader.js +35 -23
- package/src/benchmark/{scorer.js → invariants.js} +14 -12
- package/src/benchmark/judge.js +5 -8
- package/src/benchmark/npm-installer.js +87 -0
- package/src/benchmark/report.js +15 -15
- package/src/benchmark/result.js +11 -11
- package/src/benchmark/runner.js +17 -11
- package/src/benchmark/task-family.js +6 -4
- package/src/benchmark/workdir.js +23 -3
- package/src/commands/assert.js +30 -22
- package/src/commands/benchmark-invariants.js +74 -0
- package/src/commands/benchmark-report.js +23 -15
- package/src/commands/benchmark-run.js +22 -7
- package/src/commands/by-discussion.js +29 -18
- package/src/commands/callback.js +20 -11
- package/src/commands/discuss.js +30 -21
- package/src/commands/facilitate.js +20 -21
- package/src/commands/output.js +11 -12
- package/src/commands/run.js +24 -21
- package/src/commands/supervise.js +27 -27
- package/src/commands/task-input.js +54 -0
- package/src/commands/trace.js +174 -97
- package/src/discuss-tools.js +48 -2
- package/src/discusser.js +49 -2
- package/src/events/github.js +155 -0
- package/src/inbox-poller.js +84 -0
- package/src/index.js +10 -0
- package/src/judge.js +1 -1
- package/src/message-bus.js +6 -0
- package/src/orchestration-loop.js +19 -5
- package/src/orchestration-toolkit.js +14 -0
- package/src/redaction.js +31 -9
- package/src/reply-emitter.js +47 -0
- package/src/commands/benchmark-score.js +0 -68
package/src/benchmark/runner.js
CHANGED
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
* Phases per (task, runIndex):
|
|
5
5
|
* 1. WorkdirManager.start → seed CWD + run pre-flight probe
|
|
6
6
|
* 2. Supervisor session (agent + supervisor) → produce traces + submission
|
|
7
|
-
* 3.
|
|
7
|
+
* 3. Invariants.runInvariants → exit-code-driven verdict via fd-3 NDJSON
|
|
8
8
|
* 4. Judge.runJudge → Conclude-driven verdict mapped to pass/fail
|
|
9
9
|
* 5. WorkdirManager.teardown → process-group cleanup
|
|
10
10
|
*
|
|
@@ -22,9 +22,10 @@ import { join, resolve as resolvePath } from "node:path";
|
|
|
22
22
|
import { DEFAULT_ENV_ALLOWLIST, createRedactor } from "../redaction.js";
|
|
23
23
|
import { createSupervisor } from "../supervisor.js";
|
|
24
24
|
import { installApm as defaultInstallApm } from "./apm-installer.js";
|
|
25
|
+
import { installNpm as defaultInstallNpm } from "./npm-installer.js";
|
|
25
26
|
import { runJudge } from "./judge.js";
|
|
26
27
|
import { validateResultRecord } from "./result.js";
|
|
27
|
-
import {
|
|
28
|
+
import { runInvariants } from "./invariants.js";
|
|
28
29
|
import { assertJudgeProfileStaged, loadTaskFamily } from "./task-family.js";
|
|
29
30
|
import { createWorkdirManager } from "./workdir.js";
|
|
30
31
|
|
|
@@ -59,15 +60,17 @@ export class BenchmarkRunner {
|
|
|
59
60
|
* write a valid NDJSON trace to `workdir.agentTracePath`. Default uses
|
|
60
61
|
* `createAgentRunner` with the harness `BASE_TOOLS` allowlist. Internal
|
|
61
62
|
* testing only — not part of the public API.
|
|
62
|
-
* @param {Function} [opts.
|
|
63
|
-
* Same contract as `
|
|
63
|
+
* @param {Function} [opts.runInvariants] - Test seam: replaces `runInvariants`.
|
|
64
|
+
* Same contract as `runInvariants(task, ctx)`. Internal testing only.
|
|
64
65
|
* @param {Function} [opts.runJudge] - Test seam: replaces `runJudge`. Same
|
|
65
|
-
* contract as `runJudge(task, workdir,
|
|
66
|
+
* contract as `runJudge(task, workdir, invariants, deps)`. Internal testing
|
|
66
67
|
* only.
|
|
67
68
|
* @param {Function} [opts.installApm] - Test seam: replaces `installApm`.
|
|
68
69
|
* Same contract as `installApm(family, outputDir)`. Lets tests inject a
|
|
69
70
|
* fake `apm` spawn (or skip the install entirely) so the suite never
|
|
70
71
|
* shells out to a real `apm` binary. Internal testing only.
|
|
72
|
+
* @param {Function} [opts.installNpm] - Test seam: replaces `installNpm`.
|
|
73
|
+
* Same contract as `installNpm(family, stagingDir)`. Internal testing only.
|
|
71
74
|
*/
|
|
72
75
|
constructor({
|
|
73
76
|
family,
|
|
@@ -83,9 +86,10 @@ export class BenchmarkRunner {
|
|
|
83
86
|
termGraceMs,
|
|
84
87
|
// Test seams — default to the real implementations.
|
|
85
88
|
runAgent,
|
|
86
|
-
|
|
89
|
+
runInvariants: runInvariantsHook,
|
|
87
90
|
runJudge: runJudgeHook,
|
|
88
91
|
installApm: installApmHook,
|
|
92
|
+
installNpm: installNpmHook,
|
|
89
93
|
}) {
|
|
90
94
|
if (!family) throw new Error("family is required");
|
|
91
95
|
if (!Number.isInteger(runs) || runs < 1)
|
|
@@ -108,9 +112,10 @@ export class BenchmarkRunner {
|
|
|
108
112
|
this.maxTurns = maxTurns;
|
|
109
113
|
this.termGraceMs = termGraceMs;
|
|
110
114
|
this._runAgentHook = runAgent ?? null;
|
|
111
|
-
this.
|
|
115
|
+
this._runInvariantsHook = runInvariantsHook ?? runInvariants;
|
|
112
116
|
this._runJudgeHook = runJudgeHook ?? runJudge;
|
|
113
117
|
this._installApmHook = installApmHook ?? defaultInstallApm;
|
|
118
|
+
this._installNpmHook = installNpmHook ?? defaultInstallNpm;
|
|
114
119
|
}
|
|
115
120
|
|
|
116
121
|
/**
|
|
@@ -126,6 +131,7 @@ export class BenchmarkRunner {
|
|
|
126
131
|
await mkdir(this.output, { recursive: true });
|
|
127
132
|
const { stagingDir, skillSetHash, judgeProfilesDir } =
|
|
128
133
|
await this._installApmHook(family, this.output);
|
|
134
|
+
await this._installNpmHook(family, stagingDir);
|
|
129
135
|
|
|
130
136
|
const tasks = family.tasks();
|
|
131
137
|
if (this.profiles.judge) {
|
|
@@ -185,7 +191,7 @@ export class BenchmarkRunner {
|
|
|
185
191
|
}
|
|
186
192
|
const { costUsd, turns, submission, agentError } =
|
|
187
193
|
await this.#runAgentSafe(task, workdir);
|
|
188
|
-
const
|
|
194
|
+
const invariants = await this._runInvariantsHook(task, {
|
|
189
195
|
cwd: workdir.cwd,
|
|
190
196
|
port: workdir.port,
|
|
191
197
|
runDir: workdir.runDir,
|
|
@@ -200,7 +206,7 @@ export class BenchmarkRunner {
|
|
|
200
206
|
judgeVerdict = await this._runJudgeHook(
|
|
201
207
|
task,
|
|
202
208
|
workdir,
|
|
203
|
-
|
|
209
|
+
invariants,
|
|
204
210
|
{
|
|
205
211
|
query: this.query,
|
|
206
212
|
model: this.judgeModel,
|
|
@@ -211,7 +217,7 @@ export class BenchmarkRunner {
|
|
|
211
217
|
);
|
|
212
218
|
}
|
|
213
219
|
const verdict =
|
|
214
|
-
|
|
220
|
+
invariants.verdict === "pass" &&
|
|
215
221
|
(judgeVerdict === null || judgeVerdict.verdict === "pass")
|
|
216
222
|
? "pass"
|
|
217
223
|
: "fail";
|
|
@@ -219,7 +225,7 @@ export class BenchmarkRunner {
|
|
|
219
225
|
taskId: task.id,
|
|
220
226
|
runIndex,
|
|
221
227
|
verdict,
|
|
222
|
-
|
|
228
|
+
invariants,
|
|
223
229
|
submission,
|
|
224
230
|
...(judgeVerdict && { judgeVerdict }),
|
|
225
231
|
costUsd,
|
|
package/src/benchmark/task-family.js
CHANGED
|
@@ -9,7 +9,7 @@
|
|
|
9
9
|
* judge.task.md
|
|
10
10
|
* hooks/ # harness-only; never copied to agent CWD
|
|
11
11
|
* preflight.sh
|
|
12
|
-
*
|
|
12
|
+
* invariants.sh
|
|
13
13
|
* specs/ # copied into agent CWD
|
|
14
14
|
* workdir/ # copied into agent CWD
|
|
15
15
|
*
|
|
@@ -104,7 +104,7 @@ async function discoverTasks(rootPath) {
|
|
|
104
104
|
const supervisorPath = join(taskDir, "supervisor.task.md");
|
|
105
105
|
const judgePath = join(taskDir, "judge.task.md");
|
|
106
106
|
const preflightPath = join(taskDir, "hooks", "preflight.sh");
|
|
107
|
-
const
|
|
107
|
+
const invariantsPath = join(taskDir, "hooks", "invariants.sh");
|
|
108
108
|
tasks.push({
|
|
109
109
|
id: entry.name,
|
|
110
110
|
paths: {
|
|
@@ -114,7 +114,9 @@ async function discoverTasks(rootPath) {
|
|
|
114
114
|
judge: (await fileExists(judgePath)) ? judgePath : null,
|
|
115
115
|
hooks: join(taskDir, "hooks"),
|
|
116
116
|
preflight: (await fileExecutable(preflightPath)) ? preflightPath : null,
|
|
117
|
-
|
|
117
|
+
invariants: (await fileExecutable(invariantsPath))
|
|
118
|
+
? invariantsPath
|
|
119
|
+
: null,
|
|
118
120
|
specs: join(taskDir, "specs"),
|
|
119
121
|
workdir: join(taskDir, "workdir"),
|
|
120
122
|
},
|
|
@@ -236,7 +238,7 @@ function run(cmd, args) {
|
|
|
236
238
|
/**
|
|
237
239
|
* @typedef {object} Task
|
|
238
240
|
* @property {string} id - Task name (directory name under tasks/)
|
|
239
|
-
* @property {{taskDir: string, instructions: string, supervisor: string|null, judge: string|null, hooks: string, preflight: string|null,
|
|
241
|
+
* @property {{taskDir: string, instructions: string, supervisor: string|null, judge: string|null, hooks: string, preflight: string|null, invariants: string|null, specs: string, workdir: string}} paths
|
|
240
242
|
*/
|
|
241
243
|
|
|
242
244
|
/**
|
package/src/benchmark/workdir.js
CHANGED
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
* the pre-flight smoke probe, and tear down the process group at end of run.
|
|
5
5
|
*
|
|
6
6
|
* The Workdir handle threads `cwd`, `port`, `pgid`, and trace paths through
|
|
7
|
-
* runAgent →
|
|
7
|
+
* runAgent → invariants → judge → teardown.
|
|
8
8
|
*/
|
|
9
9
|
|
|
10
10
|
import { spawn } from "node:child_process";
|
|
@@ -13,6 +13,8 @@ import { createServer } from "node:net";
|
|
|
13
13
|
import { connect } from "node:net";
|
|
14
14
|
import { join } from "node:path";
|
|
15
15
|
|
|
16
|
+
import { createDefaultRuntime } from "@forwardimpact/libutil/runtime";
|
|
17
|
+
|
|
16
18
|
import { loadEnv } from "./env-loader.js";
|
|
17
19
|
|
|
18
20
|
const DEFAULT_TERM_GRACE_MS = 5_000;
|
|
@@ -38,13 +40,23 @@ export class WorkdirManager {
|
|
|
38
40
|
* @param {string} deps.stagingDir - Output of `installApm(...)`.
|
|
39
41
|
* @param {string} deps.runOutputDir - Root run-output directory (parent of `runs/`).
|
|
40
42
|
*/
|
|
41
|
-
constructor({
|
|
43
|
+
constructor({
|
|
44
|
+
stagingDir,
|
|
45
|
+
runOutputDir,
|
|
46
|
+
termGraceMs,
|
|
47
|
+
familyRootPath,
|
|
48
|
+
runtime,
|
|
49
|
+
}) {
|
|
42
50
|
if (!stagingDir) throw new Error("stagingDir is required");
|
|
43
51
|
if (!runOutputDir) throw new Error("runOutputDir is required");
|
|
44
52
|
this.stagingDir = stagingDir;
|
|
45
53
|
this.runOutputDir = runOutputDir;
|
|
46
54
|
this.termGraceMs = termGraceMs ?? DEFAULT_TERM_GRACE_MS;
|
|
47
55
|
this.familyRootPath = familyRootPath ?? null;
|
|
56
|
+
// `loadEnv` is the only collaborator routed through the runtime today; the
|
|
57
|
+
// rest of this manager still uses raw streaming/net/process-group APIs the
|
|
58
|
+
// runtime surface does not yet cover.
|
|
59
|
+
this.runtime = runtime ?? null;
|
|
48
60
|
}
|
|
49
61
|
|
|
50
62
|
/**
|
|
@@ -70,12 +82,20 @@ export class WorkdirManager {
|
|
|
70
82
|
await cp(join(this.stagingDir, ".claude"), join(cwd, ".claude"), {
|
|
71
83
|
recursive: true,
|
|
72
84
|
});
|
|
85
|
+
await cp(join(this.stagingDir, "node_modules"), join(cwd, "node_modules"), {
|
|
86
|
+
recursive: true,
|
|
87
|
+
}).catch((e) => {
|
|
88
|
+
if (e.code !== "ENOENT") throw e;
|
|
89
|
+
});
|
|
73
90
|
|
|
74
91
|
const envDirs = [
|
|
75
92
|
...(this.familyRootPath ? [this.familyRootPath] : []),
|
|
76
93
|
...(task.paths.taskDir ? [task.paths.taskDir] : []),
|
|
77
94
|
];
|
|
78
|
-
const envNames =
|
|
95
|
+
const envNames =
|
|
96
|
+
envDirs.length > 0
|
|
97
|
+
? await loadEnv(envDirs, cwd, this.runtime ?? createDefaultRuntime())
|
|
98
|
+
: [];
|
|
79
99
|
|
|
80
100
|
const port = await allocatePort();
|
|
81
101
|
const agentTracePath = join(runDir, "agent.ndjson");
|
package/src/commands/assert.js
CHANGED
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import { existsSync, readFileSync } from "node:fs";
|
|
2
1
|
import { basename } from "node:path";
|
|
3
2
|
import jmespath from "jmespath";
|
|
4
3
|
|
|
@@ -6,10 +5,11 @@ import jmespath from "jmespath";
|
|
|
6
5
|
* Evaluate an assertion and return the structured result.
|
|
7
6
|
* @param {object} values - { grep?: string, query?: string, exists?: boolean, not?: boolean, message?: string }
|
|
8
7
|
* @param {string[]} args - [testName, file]
|
|
8
|
+
* @param {object} fsSync - Sync filesystem surface (`runtime.fsSync`): `existsSync`, `readFileSync`.
|
|
9
9
|
* @returns {{ test: string, pass: boolean, message?: string }}
|
|
10
10
|
*/
|
|
11
11
|
// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: assertion dispatch by type
|
|
12
|
-
export function evaluateAssertion(values, args) {
|
|
12
|
+
export function evaluateAssertion(values, args, fsSync) {
|
|
13
13
|
const testName = args[0];
|
|
14
14
|
if (!testName) throw new Error("assert: missing test name");
|
|
15
15
|
|
|
@@ -34,16 +34,16 @@ export function evaluateAssertion(values, args) {
|
|
|
34
34
|
let result;
|
|
35
35
|
if (values.exists) {
|
|
36
36
|
if (!file) throw new Error("assert: missing file argument");
|
|
37
|
-
result = assertExists(file);
|
|
37
|
+
result = assertExists(file, fsSync);
|
|
38
38
|
} else if (values.grep) {
|
|
39
39
|
if (!file) throw new Error("assert: missing file argument for --grep");
|
|
40
|
-
result = assertGrep(values.grep, file);
|
|
40
|
+
result = assertGrep(values.grep, file, fsSync);
|
|
41
41
|
} else if (values["cites-job"]) {
|
|
42
42
|
if (!file) throw new Error("assert: missing file argument for --cites-job");
|
|
43
|
-
result = assertCitesJob(values["cites-job"], file);
|
|
43
|
+
result = assertCitesJob(values["cites-job"], file, fsSync);
|
|
44
44
|
} else {
|
|
45
45
|
if (!file) throw new Error("assert: missing file argument for --query");
|
|
46
|
-
result = assertQuery(values.query, file);
|
|
46
|
+
result = assertQuery(values.query, file, fsSync);
|
|
47
47
|
}
|
|
48
48
|
|
|
49
49
|
if (values.not) {
|
|
@@ -66,23 +66,31 @@ export function evaluateAssertion(values, args) {
|
|
|
66
66
|
}
|
|
67
67
|
|
|
68
68
|
/**
|
|
69
|
-
* Run an assertion, write JSON to stdout, and
|
|
70
|
-
*
|
|
71
|
-
* @param {
|
|
69
|
+
* Run an assertion, write JSON to stdout, and return a failure envelope when
|
|
70
|
+
* the assertion does not pass.
|
|
71
|
+
* @param {import("@forwardimpact/libcli").InvocationContext} ctx
|
|
72
|
+
* @returns {Promise<{ok: true} | {ok: false, code: number, error: string}>}
|
|
72
73
|
*/
|
|
73
|
-
export async function runAssertCommand(
|
|
74
|
-
const
|
|
75
|
-
|
|
76
|
-
|
|
74
|
+
export async function runAssertCommand(ctx) {
|
|
75
|
+
const runtime = ctx.deps.runtime;
|
|
76
|
+
const args = [ctx.args["test-name"], ctx.args.file];
|
|
77
|
+
let result;
|
|
78
|
+
try {
|
|
79
|
+
result = evaluateAssertion(ctx.options, args, runtime.fsSync);
|
|
80
|
+
} catch (err) {
|
|
81
|
+
return { ok: false, code: 1, error: err.message };
|
|
82
|
+
}
|
|
83
|
+
runtime.proc.stdout.write(JSON.stringify(result) + "\n");
|
|
84
|
+
return result.pass ? { ok: true } : { ok: false, code: 1, error: "" };
|
|
77
85
|
}
|
|
78
86
|
|
|
79
|
-
function assertExists(file) {
|
|
80
|
-
if (existsSync(file)) return { pass: true };
|
|
87
|
+
function assertExists(file, fsSync) {
|
|
88
|
+
if (fsSync.existsSync(file)) return { pass: true };
|
|
81
89
|
return { pass: false, message: `${file} not found` };
|
|
82
90
|
}
|
|
83
91
|
|
|
84
|
-
function assertGrep(pattern, file) {
|
|
85
|
-
const content = readFileSync(file, "utf8");
|
|
92
|
+
function assertGrep(pattern, file, fsSync) {
|
|
93
|
+
const content = fsSync.readFileSync(file, "utf8");
|
|
86
94
|
const re = new RegExp(pattern, "im");
|
|
87
95
|
if (re.test(content)) return { pass: true };
|
|
88
96
|
return {
|
|
@@ -91,8 +99,8 @@ function assertGrep(pattern, file) {
|
|
|
91
99
|
};
|
|
92
100
|
}
|
|
93
101
|
|
|
94
|
-
function assertQuery(expression, file) {
|
|
95
|
-
const content = readFileSync(file, "utf8");
|
|
102
|
+
function assertQuery(expression, file, fsSync) {
|
|
103
|
+
const content = fsSync.readFileSync(file, "utf8");
|
|
96
104
|
const data = parseJsonOrNdjson(content);
|
|
97
105
|
const result = jmespath.search(data, expression);
|
|
98
106
|
const truthy =
|
|
@@ -109,8 +117,8 @@ function assertQuery(expression, file) {
|
|
|
109
117
|
|
|
110
118
|
const JOB_TAG_RE = /<job\s+user="([^"]*)"\s+goal="([^"]*)">/;
|
|
111
119
|
|
|
112
|
-
function assertCitesJob(jobFile, file) {
|
|
113
|
-
const jobContent = readFileSync(jobFile, "utf8");
|
|
120
|
+
function assertCitesJob(jobFile, file, fsSync) {
|
|
121
|
+
const jobContent = fsSync.readFileSync(jobFile, "utf8");
|
|
114
122
|
const match = JOB_TAG_RE.exec(jobContent);
|
|
115
123
|
if (!match) {
|
|
116
124
|
return {
|
|
@@ -119,7 +127,7 @@ function assertCitesJob(jobFile, file) {
|
|
|
119
127
|
};
|
|
120
128
|
}
|
|
121
129
|
const citation = `${match[1]}: ${match[2]}`;
|
|
122
|
-
const content = readFileSync(file, "utf8");
|
|
130
|
+
const content = fsSync.readFileSync(file, "utf8");
|
|
123
131
|
if (content.includes(citation)) return { pass: true };
|
|
124
132
|
return { pass: false, message: `missing "${citation}"` };
|
|
125
133
|
}
|
|
package/src/commands/benchmark-invariants.js
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `fit-benchmark invariants` — check a single task's invariants against a
|
|
3
|
+
* post-run workdir directory without invoking an agent (P6/P7). Useful for
|
|
4
|
+
* re-checking an agent's output against revised grading material.
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
import { join, resolve } from "node:path";
|
|
8
|
+
import { createServer } from "node:net";
|
|
9
|
+
|
|
10
|
+
import { validateInvariantsRecord } from "../benchmark/result.js";
|
|
11
|
+
import { runInvariants } from "../benchmark/invariants.js";
|
|
12
|
+
import { loadTaskFamily } from "../benchmark/task-family.js";
|
|
13
|
+
|
|
14
|
+
/**
|
|
15
|
+
* @param {import("@forwardimpact/libcli").InvocationContext} ctx
|
|
16
|
+
* @returns {Promise<{ok: true} | {ok: false, code: number, error: string}>}
|
|
17
|
+
*/
|
|
18
|
+
export async function runBenchmarkInvariantsCommand(ctx) {
|
|
19
|
+
const values = ctx.options;
|
|
20
|
+
const runtime = ctx.deps.runtime;
|
|
21
|
+
const familyInput = values.family;
|
|
22
|
+
if (!familyInput)
|
|
23
|
+
return { ok: false, code: 1, error: "--family is required" };
|
|
24
|
+
const taskId = values.task;
|
|
25
|
+
if (!taskId) return { ok: false, code: 1, error: "--task is required" };
|
|
26
|
+
const workdirArg = values.workdir;
|
|
27
|
+
if (!workdirArg)
|
|
28
|
+
return { ok: false, code: 1, error: "--workdir is required" };
|
|
29
|
+
|
|
30
|
+
const family = await loadTaskFamily(familyInput);
|
|
31
|
+
const task = family.tasks().find((t) => t.id === taskId);
|
|
32
|
+
if (!task)
|
|
33
|
+
return { ok: false, code: 1, error: `task not found in family: ${taskId}` };
|
|
34
|
+
|
|
35
|
+
const runDir = resolve(workdirArg);
|
|
36
|
+
const cwd = join(runDir, "cwd");
|
|
37
|
+
const port = await allocatePort();
|
|
38
|
+
|
|
39
|
+
const invariants = await runInvariants(task, { cwd, port, runDir });
|
|
40
|
+
const record = {
|
|
41
|
+
taskId: task.id,
|
|
42
|
+
invariants,
|
|
43
|
+
exitCode: invariants.exitCode,
|
|
44
|
+
};
|
|
45
|
+
validateInvariantsRecord(record);
|
|
46
|
+
|
|
47
|
+
const line = JSON.stringify(record) + "\n";
|
|
48
|
+
if (values.output) {
|
|
49
|
+
runtime.fsSync.writeFileSync(resolve(values.output), line);
|
|
50
|
+
} else {
|
|
51
|
+
runtime.proc.stdout.write(line);
|
|
52
|
+
}
|
|
53
|
+
return invariants.verdict === "pass"
|
|
54
|
+
? { ok: true }
|
|
55
|
+
: { ok: false, code: 1, error: "" };
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
function allocatePort() {
|
|
59
|
+
return new Promise((res, rej) => {
|
|
60
|
+
const server = createServer();
|
|
61
|
+
server.unref();
|
|
62
|
+
server.on("error", rej);
|
|
63
|
+
server.listen(0, "127.0.0.1", () => {
|
|
64
|
+
const addr = server.address();
|
|
65
|
+
if (!addr || typeof addr === "string") {
|
|
66
|
+
server.close();
|
|
67
|
+
rej(new Error("failed to allocate port"));
|
|
68
|
+
return;
|
|
69
|
+
}
|
|
70
|
+
const port = addr.port;
|
|
71
|
+
server.close(() => res(port));
|
|
72
|
+
});
|
|
73
|
+
});
|
|
74
|
+
}
|
|
package/src/commands/benchmark-report.js
CHANGED
|
@@ -9,24 +9,31 @@ import { resolve } from "node:path";
|
|
|
9
9
|
import { aggregate, renderTextReport } from "../benchmark/report.js";
|
|
10
10
|
|
|
11
11
|
/**
|
|
12
|
-
* @param {
|
|
13
|
-
* @
|
|
12
|
+
* @param {import("@forwardimpact/libcli").InvocationContext} ctx
|
|
13
|
+
* @returns {Promise<{ok: true} | {ok: false, code: number, error: string}>}
|
|
14
14
|
*/
|
|
15
|
-
export async function runBenchmarkReportCommand(
|
|
15
|
+
export async function runBenchmarkReportCommand(ctx) {
|
|
16
|
+
const values = ctx.options;
|
|
17
|
+
const runtime = ctx.deps.runtime;
|
|
16
18
|
const inputDir = values.input ?? "benchmark-runs";
|
|
17
19
|
const kRaw = values.k ?? "1,3,5";
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
20
|
+
let kValues;
|
|
21
|
+
try {
|
|
22
|
+
kValues = kRaw.split(",").map((t) => {
|
|
23
|
+
const n = Number.parseInt(t.trim(), 10);
|
|
24
|
+
if (!Number.isFinite(n) || n < 1) {
|
|
25
|
+
throw new Error(
|
|
26
|
+
"--k must be a comma-separated list of positive integers",
|
|
27
|
+
);
|
|
28
|
+
}
|
|
29
|
+
return n;
|
|
30
|
+
});
|
|
31
|
+
} catch (err) {
|
|
32
|
+
return { ok: false, code: 1, error: err.message };
|
|
33
|
+
}
|
|
27
34
|
const format = values.format ?? "json";
|
|
28
35
|
if (format !== "json" && format !== "text") {
|
|
29
|
-
|
|
36
|
+
return { ok: false, code: 1, error: "--format must be 'json' or 'text'" };
|
|
30
37
|
}
|
|
31
38
|
|
|
32
39
|
const report = await aggregate({
|
|
@@ -35,8 +42,9 @@ export async function runBenchmarkReportCommand(values, _args) {
|
|
|
35
42
|
includeRuns: format === "text",
|
|
36
43
|
});
|
|
37
44
|
if (format === "text") {
|
|
38
|
-
|
|
45
|
+
runtime.proc.stdout.write(renderTextReport(report, kValues) + "\n");
|
|
39
46
|
} else {
|
|
40
|
-
|
|
47
|
+
runtime.proc.stdout.write(JSON.stringify(report, null, 2) + "\n");
|
|
41
48
|
}
|
|
49
|
+
return { ok: true };
|
|
42
50
|
}
|
|
package/src/commands/benchmark-run.js
CHANGED
|
@@ -10,22 +10,37 @@ import { createConfig } from "@forwardimpact/libconfig";
|
|
|
10
10
|
import { createBenchmarkRunner } from "../benchmark/runner.js";
|
|
11
11
|
|
|
12
12
|
/**
|
|
13
|
-
* @param {
|
|
14
|
-
* @
|
|
13
|
+
* @param {import("@forwardimpact/libcli").InvocationContext} ctx
|
|
14
|
+
* @returns {Promise<{ok: true} | {ok: false, code: number, error: string}>}
|
|
15
15
|
*/
|
|
16
|
-
export async function runBenchmarkRunCommand(
|
|
17
|
-
const
|
|
16
|
+
export async function runBenchmarkRunCommand(ctx) {
|
|
17
|
+
const values = ctx.options;
|
|
18
|
+
const runtime = ctx.deps.runtime;
|
|
19
|
+
let opts;
|
|
20
|
+
try {
|
|
21
|
+
opts = parseRunOptions(values);
|
|
22
|
+
} catch (err) {
|
|
23
|
+
return { ok: false, code: 1, error: err.message };
|
|
24
|
+
}
|
|
18
25
|
const config = await createConfig("script", "benchmark");
|
|
19
|
-
|
|
26
|
+
runtime.proc.env.ANTHROPIC_API_KEY = await config.anthropicToken();
|
|
27
|
+
|
|
28
|
+
// The Claude Agent SDK spawns a `claude` subprocess that inherits
|
|
29
|
+
// process.env. NODE_EXTRA_CA_CERTS causes undici (the HTTP client
|
|
30
|
+
// inside that subprocess) to fail with UND_ERR_INVALID_ARG on
|
|
31
|
+
// Node 22+, aborting every API call after 10 retries. Strip it
|
|
32
|
+
// before the SDK loads so the subprocess gets a clean environment.
|
|
33
|
+
delete runtime.proc.env.NODE_EXTRA_CA_CERTS;
|
|
34
|
+
|
|
20
35
|
const { query } = await import("@anthropic-ai/claude-agent-sdk");
|
|
21
36
|
const runner = createBenchmarkRunner({ ...opts, query });
|
|
22
37
|
|
|
23
38
|
let anyFail = false;
|
|
24
39
|
for await (const record of runner.run()) {
|
|
25
|
-
|
|
40
|
+
runtime.proc.stdout.write(JSON.stringify(record) + "\n");
|
|
26
41
|
if (record.verdict !== "pass") anyFail = true;
|
|
27
42
|
}
|
|
28
|
-
|
|
43
|
+
return anyFail ? { ok: false, code: 1, error: "" } : { ok: true };
|
|
29
44
|
}
|
|
30
45
|
|
|
31
46
|
function parseRunOptions(values) {
|
|
package/src/commands/by-discussion.js
CHANGED
|
@@ -1,9 +1,16 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { closeSync, openSync, readSync } from "node:fs";
|
|
2
2
|
import { join } from "node:path";
|
|
3
3
|
|
|
4
|
+
const FIRST_LINE_CAP = 64 * 1024;
|
|
5
|
+
|
|
4
6
|
/**
|
|
5
|
-
* Read the first newline-terminated line of a file
|
|
6
|
-
*
|
|
7
|
+
* Read the first newline-terminated line of a file, bounded to the first
|
|
8
|
+
* {@link FIRST_LINE_CAP} bytes. Trace `.ndjson` files can be many MB; the
|
|
9
|
+
* Step 2.6 meta header is always small, so a bounded `readSync` avoids
|
|
10
|
+
* loading whole files into memory just to inspect the header. This uses
|
|
11
|
+
* `node:fs` directly because the `runtime.fsSync` surface exposes no
|
|
12
|
+
* positional `openSync`/`readSync` — the file is grandfathered for
|
|
13
|
+
* `import:fs` in `check-ambient-deps.deny.yml` until that seam exists.
|
|
7
14
|
*
|
|
8
15
|
* @param {string} path
|
|
9
16
|
* @returns {string}
|
|
@@ -11,11 +18,11 @@ import { join } from "node:path";
|
|
|
11
18
|
function readFirstLine(path) {
|
|
12
19
|
const fd = openSync(path, "r");
|
|
13
20
|
try {
|
|
14
|
-
const buf = Buffer.alloc(
|
|
21
|
+
const buf = Buffer.alloc(FIRST_LINE_CAP);
|
|
15
22
|
const bytes = readSync(fd, buf, 0, buf.length, 0);
|
|
16
|
-
const
|
|
17
|
-
const nl =
|
|
18
|
-
return nl === -1 ?
|
|
23
|
+
const text = buf.toString("utf8", 0, bytes);
|
|
24
|
+
const nl = text.indexOf("\n");
|
|
25
|
+
return nl === -1 ? text : text.slice(0, nl);
|
|
19
26
|
} finally {
|
|
20
27
|
closeSync(fd);
|
|
21
28
|
}
|
|
@@ -30,13 +37,14 @@ function readFirstLine(path) {
|
|
|
30
37
|
*
|
|
31
38
|
* @param {string} dir
|
|
32
39
|
* @param {string} discussionId
|
|
40
|
+
* @param {object} fsSync - Sync filesystem surface (`runtime.fsSync`).
|
|
33
41
|
* @returns {Array<{path: string, mtimeMs: number}>}
|
|
34
42
|
*/
|
|
35
|
-
export function findTracesByDiscussion(dir, discussionId) {
|
|
43
|
+
export function findTracesByDiscussion(dir, discussionId, fsSync) {
|
|
36
44
|
const matches = [];
|
|
37
45
|
let entries;
|
|
38
46
|
try {
|
|
39
|
-
entries = readdirSync(dir);
|
|
47
|
+
entries = fsSync.readdirSync(dir);
|
|
40
48
|
} catch {
|
|
41
49
|
return [];
|
|
42
50
|
}
|
|
@@ -58,7 +66,7 @@ export function findTracesByDiscussion(dir, discussionId) {
|
|
|
58
66
|
const event = parsed.event ?? parsed;
|
|
59
67
|
if (event?.type !== "meta") continue;
|
|
60
68
|
if (event.discussion_id !== discussionId) continue;
|
|
61
|
-
matches.push({ path, mtimeMs: statSync(path).mtimeMs });
|
|
69
|
+
matches.push({ path, mtimeMs: fsSync.statSync(path).mtimeMs });
|
|
62
70
|
}
|
|
63
71
|
matches.sort((a, b) => a.mtimeMs - b.mtimeMs);
|
|
64
72
|
return matches;
|
|
@@ -70,15 +78,18 @@ export function findTracesByDiscussion(dir, discussionId) {
|
|
|
70
78
|
* line, ordered by first-event timestamp (file mtime ascending). The
|
|
71
79
|
* result is usable with `xargs cat` for a chronological merge.
|
|
72
80
|
*
|
|
73
|
-
* @param {
|
|
74
|
-
* @
|
|
81
|
+
* @param {import("@forwardimpact/libcli").InvocationContext} ctx
|
|
82
|
+
* @returns {Promise<{ok: true} | {ok: false, code: number, error: string}>}
|
|
75
83
|
*/
|
|
76
|
-
export async function runByDiscussionCommand(
|
|
77
|
-
const
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
84
|
+
export async function runByDiscussionCommand(ctx) {
|
|
85
|
+
const runtime = ctx.deps.runtime;
|
|
86
|
+
const discussionId = ctx.args["discussion-id"];
|
|
87
|
+
if (!discussionId)
|
|
88
|
+
return { ok: false, code: 1, error: "<discussion-id> is required" };
|
|
89
|
+
const dir = ctx.args["trace-dir"] ?? ctx.options["trace-dir"] ?? "traces";
|
|
90
|
+
const matches = findTracesByDiscussion(dir, discussionId, runtime.fsSync);
|
|
81
91
|
for (const { path } of matches) {
|
|
82
|
-
|
|
92
|
+
runtime.proc.stdout.write(`${path}\n`);
|
|
83
93
|
}
|
|
94
|
+
return { ok: true };
|
|
84
95
|
}
|
package/src/commands/callback.js
CHANGED
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
import { readFileSync } from "node:fs";
|
|
2
|
-
|
|
3
1
|
/**
|
|
4
2
|
* Scan an NDJSON trace and return the last orchestrator summary event,
|
|
5
3
|
* the first `meta` event's `discussion_id`, and any structured replies
|
|
@@ -11,13 +9,14 @@ import { readFileSync } from "node:fs";
|
|
|
11
9
|
* its channel semantics.
|
|
12
10
|
*
|
|
13
11
|
* @param {string} traceFile
|
|
12
|
+
* @param {object} fsSync - Sync filesystem surface (`runtime.fsSync`).
|
|
14
13
|
* @returns {{verdict: string, summary: string, replies: object[], trigger?: object, discussionId?: string} | null}
|
|
15
14
|
*/
|
|
16
15
|
// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: NDJSON scan with malformed-line tolerance + meta/summary dual extraction
|
|
17
|
-
function readTraceSummary(traceFile) {
|
|
16
|
+
function readTraceSummary(traceFile, fsSync) {
|
|
18
17
|
let summary = null;
|
|
19
18
|
let metaDiscussionId = null;
|
|
20
|
-
for (const line of readFileSync(traceFile, "utf8").split("\n")) {
|
|
19
|
+
for (const line of fsSync.readFileSync(traceFile, "utf8").split("\n")) {
|
|
21
20
|
if (!line.trim()) continue;
|
|
22
21
|
let record;
|
|
23
22
|
try {
|
|
@@ -40,6 +39,9 @@ function readTraceSummary(traceFile) {
|
|
|
40
39
|
...(record.event.discussion_id && {
|
|
41
40
|
discussionId: record.event.discussion_id,
|
|
42
41
|
}),
|
|
42
|
+
...(typeof record.event.lastActedSeq === "number" && {
|
|
43
|
+
lastActedSeq: record.event.lastActedSeq,
|
|
44
|
+
}),
|
|
43
45
|
};
|
|
44
46
|
}
|
|
45
47
|
}
|
|
@@ -64,20 +66,24 @@ function readTraceSummary(traceFile) {
|
|
|
64
66
|
* }
|
|
65
67
|
* ```
|
|
66
68
|
*
|
|
67
|
-
* @param {
|
|
68
|
-
* @
|
|
69
|
+
* @param {import("@forwardimpact/libcli").InvocationContext} ctx
|
|
70
|
+
* @returns {Promise<{ok: true} | {ok: false, code: number, error: string}>}
|
|
69
71
|
*/
|
|
70
|
-
export async function runCallbackCommand(
|
|
72
|
+
export async function runCallbackCommand(ctx) {
|
|
73
|
+
const values = ctx.options;
|
|
74
|
+
const runtime = ctx.deps.runtime;
|
|
71
75
|
const traceFile = values["trace-file"];
|
|
72
76
|
const callbackUrl = values["callback-url"];
|
|
73
77
|
const correlationId = values["correlation-id"];
|
|
74
78
|
const runUrl = values["run-url"] ?? "";
|
|
75
79
|
const discussionIdOverride = values["discussion-id"] ?? null;
|
|
76
80
|
|
|
77
|
-
if (!traceFile)
|
|
78
|
-
|
|
81
|
+
if (!traceFile)
|
|
82
|
+
return { ok: false, code: 1, error: "--trace-file is required" };
|
|
83
|
+
if (!callbackUrl)
|
|
84
|
+
return { ok: false, code: 1, error: "--callback-url is required" };
|
|
79
85
|
|
|
80
|
-
const found = readTraceSummary(traceFile) ?? {
|
|
86
|
+
const found = readTraceSummary(traceFile, runtime.fsSync) ?? {
|
|
81
87
|
verdict: "failed",
|
|
82
88
|
summary: "Run ended without producing a summary.",
|
|
83
89
|
replies: [],
|
|
@@ -86,10 +92,12 @@ export async function runCallbackCommand(values, _args) {
|
|
|
86
92
|
const discussionId = found.discussionId ?? discussionIdOverride ?? null;
|
|
87
93
|
const payload = {
|
|
88
94
|
correlation_id: correlationId,
|
|
95
|
+
kind: "terminal",
|
|
89
96
|
verdict: found.verdict,
|
|
90
97
|
summary: found.summary,
|
|
91
98
|
run_url: runUrl,
|
|
92
99
|
replies: found.replies,
|
|
100
|
+
last_acted_seq: found.lastActedSeq ?? -1,
|
|
93
101
|
...(discussionId && { discussion_id: discussionId }),
|
|
94
102
|
...(found.trigger && { trigger: found.trigger }),
|
|
95
103
|
};
|
|
@@ -99,6 +107,7 @@ export async function runCallbackCommand(values, _args) {
|
|
|
99
107
|
body: JSON.stringify(payload),
|
|
100
108
|
});
|
|
101
109
|
if (!res.ok) {
|
|
102
|
-
|
|
110
|
+
return { ok: false, code: 1, error: `Callback POST failed: ${res.status}` };
|
|
103
111
|
}
|
|
112
|
+
return { ok: true };
|
|
104
113
|
}
|