@forwardimpact/libeval 0.1.51 → 0.1.53
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/fit-benchmark.js +8 -14
- package/bin/fit-eval.js +8 -28
- package/bin/fit-selfedit.js +6 -4
- package/bin/fit-trace.js +7 -14
- package/package.json +1 -1
- package/src/benchmark/apm-installer.js +48 -44
- package/src/benchmark/invariants.js +51 -63
- package/src/benchmark/judge.js +13 -11
- package/src/benchmark/npm-installer.js +33 -33
- package/src/benchmark/report.js +25 -11
- package/src/benchmark/result.js +2 -2
- package/src/benchmark/runner.js +82 -38
- package/src/benchmark/task-family.js +74 -63
- package/src/benchmark/workdir.js +91 -99
- package/src/commands/benchmark-invariants.js +3 -3
- package/src/commands/benchmark-report.js +1 -0
- package/src/commands/benchmark-run.js +1 -1
- package/src/commands/by-discussion.js +10 -11
- package/src/commands/discuss.js +3 -2
- package/src/commands/facilitate.js +3 -2
- package/src/commands/output.js +4 -1
- package/src/commands/run.js +6 -2
- package/src/commands/supervise.js +3 -2
- package/src/commands/tee.js +24 -9
- package/src/commands/trace.js +7 -2
- package/src/discusser.js +7 -5
- package/src/events/github.js +7 -1
- package/src/facilitator.js +6 -5
- package/src/inbox-poller.js +5 -8
- package/src/judge.js +12 -13
- package/src/profile-prompt.js +124 -26
- package/src/redaction.js +3 -16
- package/src/supervisor.js +7 -0
- package/src/tee-writer.js +4 -2
- package/src/trace-collector.js +9 -2
- package/src/trace-github.js +47 -27
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
* Task-family loader. A task family is a directory under
|
|
3
3
|
* <root>/
|
|
4
4
|
* apm.lock.yaml
|
|
5
|
-
* .claude/ # pre-staged skills + agents
|
|
5
|
+
* .claude/ # pre-staged skills + agents
|
|
6
6
|
* tasks/<task_name>/
|
|
7
7
|
* agent.task.md
|
|
8
8
|
* supervisor.task.md # optional; appended to the task as supervisor context
|
|
@@ -17,45 +17,55 @@
|
|
|
17
17
|
* a temp dir and `familyRevision` becomes `git:<sha>` of HEAD at clone time.
|
|
18
18
|
* Local paths use the canonical-tree algorithm from design § Family revision
|
|
19
19
|
* algorithm so the result is stable across operating systems.
|
|
20
|
+
*
|
|
21
|
+
* Filesystem and subprocess access route through the injected `runtime` bag
|
|
22
|
+
* (`runtime.fs` async, `runtime.subprocess.run` one-shot, `tmpdir` derived
|
|
23
|
+
* from `runtime.proc.env`).
|
|
20
24
|
*/
|
|
21
25
|
|
|
22
|
-
import { spawn } from "node:child_process";
|
|
23
26
|
import { createHash } from "node:crypto";
|
|
24
|
-
import {
|
|
25
|
-
access,
|
|
26
|
-
constants,
|
|
27
|
-
lstat,
|
|
28
|
-
mkdtemp,
|
|
29
|
-
readdir,
|
|
30
|
-
readFile,
|
|
31
|
-
realpath,
|
|
32
|
-
} from "node:fs/promises";
|
|
33
|
-
import { tmpdir } from "node:os";
|
|
34
27
|
import { join, posix, relative, resolve, sep } from "node:path";
|
|
35
28
|
|
|
36
29
|
const GIT_URL_RE = /^(git@|https?:\/\/|ssh:\/\/|git:\/\/)/;
|
|
37
30
|
const SKIP_DIRS = new Set([".git", "node_modules"]);
|
|
31
|
+
// POSIX `X_OK` (execute permission); node's fs honours the numeric mode, so we
|
|
32
|
+
// avoid importing `node:fs`'s `constants` (which would light the fs smell).
|
|
33
|
+
const X_OK = 1;
|
|
34
|
+
|
|
35
|
+
/**
|
|
36
|
+
* Derive the system temp dir from the env (node's `os.tmpdir()` is itself an
|
|
37
|
+
* env-respecting wrapper). The runtime bag has no `os` slot by design.
|
|
38
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
|
|
39
|
+
* @returns {string}
|
|
40
|
+
*/
|
|
41
|
+
function tmpdir(runtime) {
|
|
42
|
+
return runtime.proc.env.TMPDIR ?? "/tmp";
|
|
43
|
+
}
|
|
38
44
|
|
|
39
45
|
/**
|
|
40
46
|
* Load a task family from a local path or git URL.
|
|
41
47
|
* @param {string} rootPathOrGitUrl
|
|
48
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
|
|
42
49
|
* @returns {Promise<TaskFamily>}
|
|
43
50
|
*/
|
|
44
|
-
export async function loadTaskFamily(rootPathOrGitUrl) {
|
|
51
|
+
export async function loadTaskFamily(rootPathOrGitUrl, runtime) {
|
|
52
|
+
if (!runtime) throw new Error("runtime is required");
|
|
45
53
|
const isGit = GIT_URL_RE.test(rootPathOrGitUrl);
|
|
46
54
|
let rootPath;
|
|
47
55
|
let familyRevision;
|
|
48
56
|
if (isGit) {
|
|
49
|
-
const dir = await mkdtemp(
|
|
50
|
-
|
|
57
|
+
const dir = await runtime.fs.mkdtemp(
|
|
58
|
+
join(tmpdir(runtime), "fit-benchmark-family-"),
|
|
59
|
+
);
|
|
60
|
+
await gitClone(runtime, rootPathOrGitUrl, dir);
|
|
51
61
|
rootPath = dir;
|
|
52
|
-
familyRevision = "git:" + (await gitHead(dir));
|
|
62
|
+
familyRevision = "git:" + (await gitHead(runtime, dir));
|
|
53
63
|
} else {
|
|
54
64
|
rootPath = resolve(rootPathOrGitUrl);
|
|
55
|
-
familyRevision = "sha256:" + (await canonicalTreeHash(rootPath));
|
|
65
|
+
familyRevision = "sha256:" + (await canonicalTreeHash(runtime, rootPath));
|
|
56
66
|
}
|
|
57
67
|
|
|
58
|
-
const tasks = await discoverTasks(rootPath);
|
|
68
|
+
const tasks = await discoverTasks(runtime, rootPath);
|
|
59
69
|
|
|
60
70
|
return {
|
|
61
71
|
rootPath,
|
|
@@ -73,27 +83,30 @@ export async function loadTaskFamily(rootPathOrGitUrl) {
|
|
|
73
83
|
* @param {TaskFamily} _family
|
|
74
84
|
* @param {string} judgeProfilesDir
|
|
75
85
|
* @param {string} judgeProfile
|
|
86
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
|
|
76
87
|
* @returns {Promise<void>}
|
|
77
88
|
*/
|
|
78
89
|
export async function assertJudgeProfileStaged(
|
|
79
90
|
_family,
|
|
80
91
|
judgeProfilesDir,
|
|
81
92
|
judgeProfile,
|
|
93
|
+
runtime,
|
|
82
94
|
) {
|
|
83
95
|
const candidate = join(judgeProfilesDir, `${judgeProfile}.md`);
|
|
84
96
|
try {
|
|
85
|
-
await access(candidate);
|
|
97
|
+
await runtime.fs.access(candidate);
|
|
86
98
|
} catch {
|
|
87
99
|
throw new Error(`judge profile not staged: ${candidate}`);
|
|
88
100
|
}
|
|
89
101
|
}
|
|
90
102
|
|
|
91
|
-
async function discoverTasks(rootPath) {
|
|
103
|
+
async function discoverTasks(runtime, rootPath) {
|
|
104
|
+
const fs = runtime.fs;
|
|
92
105
|
const tasksRoot = join(rootPath, "tasks");
|
|
93
106
|
const tasks = [];
|
|
94
107
|
let entries;
|
|
95
108
|
try {
|
|
96
|
-
entries = await readdir(tasksRoot, { withFileTypes: true });
|
|
109
|
+
entries = await fs.readdir(tasksRoot, { withFileTypes: true });
|
|
97
110
|
} catch (e) {
|
|
98
111
|
if (e.code === "ENOENT") return tasks;
|
|
99
112
|
throw e;
|
|
@@ -110,11 +123,15 @@ async function discoverTasks(rootPath) {
|
|
|
110
123
|
paths: {
|
|
111
124
|
taskDir,
|
|
112
125
|
instructions: join(taskDir, "agent.task.md"),
|
|
113
|
-
supervisor: (await fileExists(supervisorPath))
|
|
114
|
-
|
|
126
|
+
supervisor: (await fileExists(fs, supervisorPath))
|
|
127
|
+
? supervisorPath
|
|
128
|
+
: null,
|
|
129
|
+
judge: (await fileExists(fs, judgePath)) ? judgePath : null,
|
|
115
130
|
hooks: join(taskDir, "hooks"),
|
|
116
|
-
preflight: (await fileExecutable(preflightPath))
|
|
117
|
-
|
|
131
|
+
preflight: (await fileExecutable(fs, preflightPath))
|
|
132
|
+
? preflightPath
|
|
133
|
+
: null,
|
|
134
|
+
invariants: (await fileExecutable(fs, invariantsPath))
|
|
118
135
|
? invariantsPath
|
|
119
136
|
: null,
|
|
120
137
|
specs: join(taskDir, "specs"),
|
|
@@ -126,18 +143,18 @@ async function discoverTasks(rootPath) {
|
|
|
126
143
|
return tasks;
|
|
127
144
|
}
|
|
128
145
|
|
|
129
|
-
async function fileExists(path) {
|
|
146
|
+
async function fileExists(fs, path) {
|
|
130
147
|
try {
|
|
131
|
-
await access(path);
|
|
148
|
+
await fs.access(path);
|
|
132
149
|
return true;
|
|
133
150
|
} catch {
|
|
134
151
|
return false;
|
|
135
152
|
}
|
|
136
153
|
}
|
|
137
154
|
|
|
138
|
-
async function fileExecutable(path) {
|
|
155
|
+
async function fileExecutable(fs, path) {
|
|
139
156
|
try {
|
|
140
|
-
await access(path,
|
|
157
|
+
await fs.access(path, X_OK);
|
|
141
158
|
return true;
|
|
142
159
|
} catch {
|
|
143
160
|
return false;
|
|
@@ -151,16 +168,18 @@ async function fileExecutable(path) {
|
|
|
151
168
|
* sort by NFC-normalised POSIX-style root-relative path
|
|
152
169
|
* row = <rel-path>\0<hex-sha256>\n
|
|
153
170
|
* sha256(concat(rows))
|
|
171
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
|
|
154
172
|
* @param {string} rootPath
|
|
155
173
|
* @returns {Promise<string>} hex digest
|
|
156
174
|
*/
|
|
157
|
-
async function canonicalTreeHash(rootPath) {
|
|
158
|
-
const
|
|
175
|
+
async function canonicalTreeHash(runtime, rootPath) {
|
|
176
|
+
const fs = runtime.fs;
|
|
177
|
+
const real = await fs.realpath(rootPath);
|
|
159
178
|
const rows = [];
|
|
160
|
-
for await (const filePath of walkFiles(real)) {
|
|
179
|
+
for await (const filePath of walkFiles(fs, real)) {
|
|
161
180
|
const rel = toPosix(relative(real, filePath)).normalize("NFC");
|
|
162
|
-
const target = await realpath(filePath);
|
|
163
|
-
const bytes = await readFile(target);
|
|
181
|
+
const target = await fs.realpath(filePath);
|
|
182
|
+
const bytes = await fs.readFile(target);
|
|
164
183
|
const hex = createHash("sha256").update(bytes).digest("hex");
|
|
165
184
|
rows.push({ rel, hex });
|
|
166
185
|
}
|
|
@@ -170,15 +189,15 @@ async function canonicalTreeHash(rootPath) {
|
|
|
170
189
|
return acc.digest("hex");
|
|
171
190
|
}
|
|
172
191
|
|
|
173
|
-
async function* walkFiles(dir) {
|
|
174
|
-
const entries = await readdir(dir, { withFileTypes: true });
|
|
192
|
+
async function* walkFiles(fs, dir) {
|
|
193
|
+
const entries = await fs.readdir(dir, { withFileTypes: true });
|
|
175
194
|
for (const entry of entries) {
|
|
176
195
|
const full = join(dir, entry.name);
|
|
177
196
|
if (entry.isDirectory()) {
|
|
178
197
|
if (SKIP_DIRS.has(entry.name)) continue;
|
|
179
|
-
yield* walkFiles(full);
|
|
198
|
+
yield* walkFiles(fs, full);
|
|
180
199
|
} else if (entry.isSymbolicLink()) {
|
|
181
|
-
const resolvedFile = await resolveSymlinkToFile(full);
|
|
200
|
+
const resolvedFile = await resolveSymlinkToFile(fs, full);
|
|
182
201
|
if (resolvedFile) yield full;
|
|
183
202
|
} else if (entry.isFile()) {
|
|
184
203
|
yield full;
|
|
@@ -190,12 +209,12 @@ async function* walkFiles(dir) {
|
|
|
190
209
|
* Return the resolved path if `linkPath` is a symlink to a regular file.
|
|
191
210
|
* Returns null for dangling symlinks or links to non-file targets.
|
|
192
211
|
*/
|
|
193
|
-
async function resolveSymlinkToFile(linkPath) {
|
|
194
|
-
const st = await lstat(linkPath);
|
|
212
|
+
async function resolveSymlinkToFile(fs, linkPath) {
|
|
213
|
+
const st = await fs.lstat(linkPath);
|
|
195
214
|
if (!st.isSymbolicLink()) return null;
|
|
196
215
|
try {
|
|
197
|
-
const resolved = await realpath(linkPath);
|
|
198
|
-
const tstat = await lstat(resolved);
|
|
216
|
+
const resolved = await fs.realpath(linkPath);
|
|
217
|
+
const tstat = await fs.lstat(resolved);
|
|
199
218
|
return tstat.isFile() ? resolved : null;
|
|
200
219
|
} catch {
|
|
201
220
|
return null;
|
|
@@ -207,32 +226,24 @@ function toPosix(p) {
|
|
|
207
226
|
return p.split(sep).join(posix.sep);
|
|
208
227
|
}
|
|
209
228
|
|
|
210
|
-
async function gitClone(url, dir) {
|
|
211
|
-
await
|
|
229
|
+
async function gitClone(runtime, url, dir) {
|
|
230
|
+
await git(runtime, ["clone", "--depth", "1", url, dir]);
|
|
212
231
|
}
|
|
213
232
|
|
|
214
|
-
async function gitHead(dir) {
|
|
215
|
-
const out = await
|
|
233
|
+
async function gitHead(runtime, dir) {
|
|
234
|
+
const out = await git(runtime, ["-C", dir, "rev-parse", "HEAD"]);
|
|
216
235
|
return out.trim();
|
|
217
236
|
}
|
|
218
237
|
|
|
219
|
-
function
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
stderr += d.toString();
|
|
229
|
-
});
|
|
230
|
-
child.on("error", rej);
|
|
231
|
-
child.on("close", (code) => {
|
|
232
|
-
if (code === 0) res(stdout);
|
|
233
|
-
else rej(new Error(`${cmd} ${args.join(" ")} exited ${code}: ${stderr}`));
|
|
234
|
-
});
|
|
235
|
-
});
|
|
238
|
+
async function git(runtime, args) {
|
|
239
|
+
const { stdout, stderr, exitCode } = await runtime.subprocess.run(
|
|
240
|
+
"git",
|
|
241
|
+
args,
|
|
242
|
+
);
|
|
243
|
+
if (exitCode !== 0) {
|
|
244
|
+
throw new Error(`git ${args.join(" ")} exited ${exitCode}: ${stderr}`);
|
|
245
|
+
}
|
|
246
|
+
return stdout;
|
|
236
247
|
}
|
|
237
248
|
|
|
238
249
|
/**
|
package/src/benchmark/workdir.js
CHANGED
|
@@ -5,16 +5,17 @@
|
|
|
5
5
|
*
|
|
6
6
|
* The Workdir handle threads `cwd`, `port`, `pgid`, and trace paths through
|
|
7
7
|
* runAgent → invariants → judge → teardown.
|
|
8
|
+
*
|
|
9
|
+
* Filesystem, subprocess, clock, and process-signal access all route through
|
|
10
|
+
* the injected `runtime` bag. Only raw TCP plumbing (`node:net`) stays direct —
|
|
11
|
+
* it is not an ambient-dependency smell and the runtime bag models no socket
|
|
12
|
+
* surface.
|
|
8
13
|
*/
|
|
9
14
|
|
|
10
|
-
import { spawn } from "node:child_process";
|
|
11
|
-
import { cp, mkdir } from "node:fs/promises";
|
|
12
15
|
import { createServer } from "node:net";
|
|
13
16
|
import { connect } from "node:net";
|
|
14
17
|
import { join } from "node:path";
|
|
15
18
|
|
|
16
|
-
import { createDefaultRuntime } from "@forwardimpact/libutil/runtime";
|
|
17
|
-
|
|
18
19
|
import { loadEnv } from "./env-loader.js";
|
|
19
20
|
|
|
20
21
|
const DEFAULT_TERM_GRACE_MS = 5_000;
|
|
@@ -39,6 +40,8 @@ export class WorkdirManager {
|
|
|
39
40
|
* @param {object} deps
|
|
40
41
|
* @param {string} deps.stagingDir - Output of `installApm(...)`.
|
|
41
42
|
* @param {string} deps.runOutputDir - Root run-output directory (parent of `runs/`).
|
|
43
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} deps.runtime -
|
|
44
|
+
* Ambient collaborators; uses `fs`, `subprocess`, `clock`, `proc`.
|
|
42
45
|
*/
|
|
43
46
|
constructor({
|
|
44
47
|
stagingDir,
|
|
@@ -49,14 +52,12 @@ export class WorkdirManager {
|
|
|
49
52
|
}) {
|
|
50
53
|
if (!stagingDir) throw new Error("stagingDir is required");
|
|
51
54
|
if (!runOutputDir) throw new Error("runOutputDir is required");
|
|
55
|
+
if (!runtime) throw new Error("runtime is required");
|
|
52
56
|
this.stagingDir = stagingDir;
|
|
53
57
|
this.runOutputDir = runOutputDir;
|
|
54
58
|
this.termGraceMs = termGraceMs ?? DEFAULT_TERM_GRACE_MS;
|
|
55
59
|
this.familyRootPath = familyRootPath ?? null;
|
|
56
|
-
|
|
57
|
-
// rest of this manager still uses raw streaming/net/process-group APIs the
|
|
58
|
-
// runtime surface does not yet cover.
|
|
59
|
-
this.runtime = runtime ?? null;
|
|
60
|
+
this.runtime = runtime;
|
|
60
61
|
}
|
|
61
62
|
|
|
62
63
|
/**
|
|
@@ -66,36 +67,39 @@ export class WorkdirManager {
|
|
|
66
67
|
* @returns {Promise<Workdir>}
|
|
67
68
|
*/
|
|
68
69
|
async start(task, runIndex) {
|
|
70
|
+
const fs = this.runtime.fs;
|
|
69
71
|
const slug = task.id.replace("/", "__");
|
|
70
72
|
const runDir = join(this.runOutputDir, "runs", slug, String(runIndex));
|
|
71
73
|
const cwd = join(runDir, "cwd");
|
|
72
|
-
await mkdir(cwd, { recursive: true });
|
|
74
|
+
await fs.mkdir(cwd, { recursive: true });
|
|
73
75
|
|
|
74
|
-
await cp(task.paths.workdir, cwd, { recursive: true }).catch((e) => {
|
|
75
|
-
if (e.code !== "ENOENT") throw e;
|
|
76
|
-
});
|
|
77
|
-
await cp(task.paths.specs, join(cwd, "specs"), {
|
|
78
|
-
recursive: true,
|
|
79
|
-
}).catch((e) => {
|
|
76
|
+
await fs.cp(task.paths.workdir, cwd, { recursive: true }).catch((e) => {
|
|
80
77
|
if (e.code !== "ENOENT") throw e;
|
|
81
78
|
});
|
|
82
|
-
await
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
79
|
+
await fs
|
|
80
|
+
.cp(task.paths.specs, join(cwd, "specs"), {
|
|
81
|
+
recursive: true,
|
|
82
|
+
})
|
|
83
|
+
.catch((e) => {
|
|
84
|
+
if (e.code !== "ENOENT") throw e;
|
|
85
|
+
});
|
|
86
|
+
await fs.cp(join(this.stagingDir, ".claude"), join(cwd, ".claude"), {
|
|
86
87
|
recursive: true,
|
|
87
|
-
}).catch((e) => {
|
|
88
|
-
if (e.code !== "ENOENT") throw e;
|
|
89
88
|
});
|
|
89
|
+
await fs
|
|
90
|
+
.cp(join(this.stagingDir, "node_modules"), join(cwd, "node_modules"), {
|
|
91
|
+
recursive: true,
|
|
92
|
+
})
|
|
93
|
+
.catch((e) => {
|
|
94
|
+
if (e.code !== "ENOENT") throw e;
|
|
95
|
+
});
|
|
90
96
|
|
|
91
97
|
const envDirs = [
|
|
92
98
|
...(this.familyRootPath ? [this.familyRootPath] : []),
|
|
93
99
|
...(task.paths.taskDir ? [task.paths.taskDir] : []),
|
|
94
100
|
];
|
|
95
101
|
const envNames =
|
|
96
|
-
envDirs.length > 0
|
|
97
|
-
? await loadEnv(envDirs, cwd, this.runtime ?? createDefaultRuntime())
|
|
98
|
-
: [];
|
|
102
|
+
envDirs.length > 0 ? await loadEnv(envDirs, cwd, this.runtime) : [];
|
|
99
103
|
|
|
100
104
|
const port = await allocatePort();
|
|
101
105
|
const agentTracePath = join(runDir, "agent.ndjson");
|
|
@@ -103,7 +107,7 @@ export class WorkdirManager {
|
|
|
103
107
|
const judgeTracePath = join(runDir, "judge.ndjson");
|
|
104
108
|
|
|
105
109
|
const preflight = task.paths.preflight
|
|
106
|
-
? await runPreflight(task.paths.preflight, cwd, port)
|
|
110
|
+
? await runPreflight(this.runtime, task.paths.preflight, cwd, port)
|
|
107
111
|
: { pgid: 0 };
|
|
108
112
|
|
|
109
113
|
return {
|
|
@@ -126,81 +130,71 @@ export class WorkdirManager {
|
|
|
126
130
|
* @returns {Promise<{portFree: boolean, descendants: number}>}
|
|
127
131
|
*/
|
|
128
132
|
async teardown(workdir) {
|
|
133
|
+
const { proc, clock } = this.runtime;
|
|
129
134
|
if (workdir.pgid && workdir.pgid > 0) {
|
|
130
135
|
try {
|
|
131
|
-
|
|
136
|
+
proc.kill(-workdir.pgid, "SIGTERM");
|
|
132
137
|
} catch {
|
|
133
138
|
// Process group already gone — fine.
|
|
134
139
|
}
|
|
135
|
-
await sleep(this.termGraceMs);
|
|
140
|
+
await clock.sleep(this.termGraceMs);
|
|
136
141
|
try {
|
|
137
|
-
|
|
142
|
+
proc.kill(-workdir.pgid, "SIGKILL");
|
|
138
143
|
} catch {
|
|
139
144
|
// Already exited.
|
|
140
145
|
}
|
|
141
146
|
// Poll briefly until the process group is empty — SIGKILL returns
|
|
142
147
|
// before the kernel finishes reaping descendants.
|
|
143
148
|
await waitFor(
|
|
144
|
-
|
|
149
|
+
this.runtime,
|
|
150
|
+
async () => (await countDescendants(this.runtime, workdir.pgid)) === 0,
|
|
145
151
|
2_000,
|
|
146
152
|
);
|
|
147
153
|
}
|
|
148
154
|
const portFree = await isPortFree(workdir.port);
|
|
149
|
-
const descendants = await countDescendants(workdir.pgid);
|
|
155
|
+
const descendants = await countDescendants(this.runtime, workdir.pgid);
|
|
150
156
|
return { portFree, descendants };
|
|
151
157
|
}
|
|
152
158
|
}
|
|
153
159
|
|
|
154
160
|
/**
|
|
155
161
|
* Spawn preflight. Stays detached so we can SIGTERM the whole process group.
|
|
162
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
|
|
156
163
|
* @param {string} script
|
|
157
164
|
* @param {string} cwd - Agent CWD passed via $WORKDIR.
|
|
158
165
|
* @param {number} port - Free TCP port passed via $PORT.
|
|
159
166
|
* @returns {Promise<{pgid: number, error?: {phase: string, message: string, exitCode: number}}>}
|
|
160
167
|
*/
|
|
161
|
-
function runPreflight(script, cwd, port) {
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
detached: true,
|
|
168
|
-
stdio: ["ignore", "pipe", "pipe"],
|
|
169
|
-
});
|
|
170
|
-
if (child.pid === undefined) {
|
|
171
|
-
rej(new Error(`failed to spawn preflight: ${script}`));
|
|
172
|
-
return;
|
|
173
|
-
}
|
|
174
|
-
const pgid = child.pid;
|
|
175
|
-
child.stderr.on("data", (d) => {
|
|
176
|
-
stderr += d.toString();
|
|
177
|
-
});
|
|
178
|
-
child.on("error", (e) => {
|
|
179
|
-
res({
|
|
180
|
-
pgid,
|
|
181
|
-
error: {
|
|
182
|
-
phase: "preflight",
|
|
183
|
-
message: `preflight failed to spawn: ${e.message}`,
|
|
184
|
-
exitCode: -1,
|
|
185
|
-
},
|
|
186
|
-
});
|
|
187
|
-
});
|
|
188
|
-
child.on("exit", (code, signal) => {
|
|
189
|
-
if (code === 0) {
|
|
190
|
-
res({ pgid });
|
|
191
|
-
return;
|
|
192
|
-
}
|
|
193
|
-
const message = stderr.trim() || `preflight exited with signal ${signal}`;
|
|
194
|
-
res({
|
|
195
|
-
pgid,
|
|
196
|
-
error: {
|
|
197
|
-
phase: "preflight",
|
|
198
|
-
message,
|
|
199
|
-
exitCode: typeof code === "number" ? code : -1,
|
|
200
|
-
},
|
|
201
|
-
});
|
|
202
|
-
});
|
|
168
|
+
async function runPreflight(runtime, script, cwd, port) {
|
|
169
|
+
const child = runtime.subprocess.spawn(script, [], {
|
|
170
|
+
cwd,
|
|
171
|
+
env: { ...runtime.proc.env, WORKDIR: cwd, PORT: String(port) },
|
|
172
|
+
detached: true,
|
|
173
|
+
stdio: ["ignore", "pipe", "pipe"],
|
|
203
174
|
});
|
|
175
|
+
if (child.pid === undefined) {
|
|
176
|
+
throw new Error(`failed to spawn preflight: ${script}`);
|
|
177
|
+
}
|
|
178
|
+
const pgid = child.pid;
|
|
179
|
+
let stderr = "";
|
|
180
|
+
const drainStdout = (async () => {
|
|
181
|
+
for await (const _chunk of child.stdout) {
|
|
182
|
+
// discard
|
|
183
|
+
}
|
|
184
|
+
})();
|
|
185
|
+
for await (const chunk of child.stderr) stderr += chunk.toString();
|
|
186
|
+
await drainStdout;
|
|
187
|
+
const code = await child.exitCode;
|
|
188
|
+
if (code === 0) return { pgid };
|
|
189
|
+
const message = stderr.trim() || `preflight exited with code ${code}`;
|
|
190
|
+
return {
|
|
191
|
+
pgid,
|
|
192
|
+
error: {
|
|
193
|
+
phase: "preflight",
|
|
194
|
+
message,
|
|
195
|
+
exitCode: typeof code === "number" ? code : -1,
|
|
196
|
+
},
|
|
197
|
+
};
|
|
204
198
|
}
|
|
205
199
|
|
|
206
200
|
function allocatePort() {
|
|
@@ -236,37 +230,35 @@ function isPortFree(port) {
|
|
|
236
230
|
});
|
|
237
231
|
}
|
|
238
232
|
|
|
239
|
-
function countDescendants(pgid) {
|
|
240
|
-
if (!pgid || pgid <= 0) return
|
|
241
|
-
|
|
242
|
-
|
|
233
|
+
async function countDescendants(runtime, pgid) {
|
|
234
|
+
if (!pgid || pgid <= 0) return 0;
|
|
235
|
+
const child = runtime.subprocess.spawn(
|
|
236
|
+
"ps",
|
|
237
|
+
["-o", "pid=", "-g", String(pgid)],
|
|
238
|
+
{
|
|
243
239
|
stdio: ["ignore", "pipe", "ignore"],
|
|
244
|
-
}
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
child.
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
}
|
|
260
|
-
|
|
261
|
-
function sleep(ms) {
|
|
262
|
-
return new Promise((r) => setTimeout(r, ms));
|
|
240
|
+
},
|
|
241
|
+
);
|
|
242
|
+
let out = "";
|
|
243
|
+
try {
|
|
244
|
+
for await (const chunk of child.stdout) out += chunk.toString();
|
|
245
|
+
await child.exitCode;
|
|
246
|
+
} catch {
|
|
247
|
+
return 0;
|
|
248
|
+
}
|
|
249
|
+
const pids = out
|
|
250
|
+
.split("\n")
|
|
251
|
+
.map((s) => s.trim())
|
|
252
|
+
.filter(Boolean)
|
|
253
|
+
.filter((s) => Number(s) !== runtime.proc.pid);
|
|
254
|
+
return pids.length;
|
|
263
255
|
}
|
|
264
256
|
|
|
265
|
-
async function waitFor(predicate, timeoutMs) {
|
|
266
|
-
const deadline =
|
|
267
|
-
while (
|
|
257
|
+
async function waitFor(runtime, predicate, timeoutMs) {
|
|
258
|
+
const deadline = runtime.clock.now() + timeoutMs;
|
|
259
|
+
while (runtime.clock.now() < deadline) {
|
|
268
260
|
if (await predicate()) return true;
|
|
269
|
-
await sleep(50);
|
|
261
|
+
await runtime.clock.sleep(50);
|
|
270
262
|
}
|
|
271
263
|
return false;
|
|
272
264
|
}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* `fit-benchmark invariants` — check a single task's invariants against a
|
|
3
|
-
* post-run workdir directory without invoking an agent
|
|
3
|
+
* post-run workdir directory without invoking an agent. Useful for
|
|
4
4
|
* re-checking an agent's output against revised grading material.
|
|
5
5
|
*/
|
|
6
6
|
|
|
@@ -27,7 +27,7 @@ export async function runBenchmarkInvariantsCommand(ctx) {
|
|
|
27
27
|
if (!workdirArg)
|
|
28
28
|
return { ok: false, code: 1, error: "--workdir is required" };
|
|
29
29
|
|
|
30
|
-
const family = await loadTaskFamily(familyInput);
|
|
30
|
+
const family = await loadTaskFamily(familyInput, runtime);
|
|
31
31
|
const task = family.tasks().find((t) => t.id === taskId);
|
|
32
32
|
if (!task)
|
|
33
33
|
return { ok: false, code: 1, error: `task not found in family: ${taskId}` };
|
|
@@ -36,7 +36,7 @@ export async function runBenchmarkInvariantsCommand(ctx) {
|
|
|
36
36
|
const cwd = join(runDir, "cwd");
|
|
37
37
|
const port = await allocatePort();
|
|
38
38
|
|
|
39
|
-
const invariants = await runInvariants(task, { cwd, port, runDir });
|
|
39
|
+
const invariants = await runInvariants(task, { cwd, port, runDir }, runtime);
|
|
40
40
|
const record = {
|
|
41
41
|
taskId: task.id,
|
|
42
42
|
invariants,
|
|
@@ -33,7 +33,7 @@ export async function runBenchmarkRunCommand(ctx) {
|
|
|
33
33
|
delete runtime.proc.env.NODE_EXTRA_CA_CERTS;
|
|
34
34
|
|
|
35
35
|
const { query } = await import("@anthropic-ai/claude-agent-sdk");
|
|
36
|
-
const runner = createBenchmarkRunner({ ...opts, query });
|
|
36
|
+
const runner = createBenchmarkRunner({ ...opts, query, runtime });
|
|
37
37
|
|
|
38
38
|
let anyFail = false;
|
|
39
39
|
for await (const record of runner.run()) {
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import { closeSync, openSync, readSync } from "node:fs";
|
|
2
1
|
import { join } from "node:path";
|
|
3
2
|
|
|
4
3
|
const FIRST_LINE_CAP = 64 * 1024;
|
|
@@ -6,25 +5,25 @@ const FIRST_LINE_CAP = 64 * 1024;
|
|
|
6
5
|
/**
|
|
7
6
|
* Read the first newline-terminated line of a file, bounded to the first
|
|
8
7
|
* {@link FIRST_LINE_CAP} bytes. Trace `.ndjson` files can be many MB; the
|
|
9
|
-
* Step 2.6 meta header is always small, so a bounded
|
|
10
|
-
* loading whole files into memory just to inspect the header.
|
|
11
|
-
* `
|
|
12
|
-
*
|
|
13
|
-
* `import:fs` in `check-ambient-deps.deny.yml` until that seam exists.
|
|
8
|
+
* Step 2.6 meta header is always small, so a bounded positional read avoids
|
|
9
|
+
* loading whole files into memory just to inspect the header. The positional
|
|
10
|
+
* `openSync`/`readSync`/`closeSync` trio is read off the injected
|
|
11
|
+
* `runtime.fsSync` surface.
|
|
14
12
|
*
|
|
13
|
+
* @param {object} fsSync - Sync filesystem surface (`runtime.fsSync`).
|
|
15
14
|
* @param {string} path
|
|
16
15
|
* @returns {string}
|
|
17
16
|
*/
|
|
18
|
-
function readFirstLine(path) {
|
|
19
|
-
const fd = openSync(path, "r");
|
|
17
|
+
function readFirstLine(fsSync, path) {
|
|
18
|
+
const fd = fsSync.openSync(path, "r");
|
|
20
19
|
try {
|
|
21
20
|
const buf = Buffer.alloc(FIRST_LINE_CAP);
|
|
22
|
-
const bytes = readSync(fd, buf, 0, buf.length, 0);
|
|
21
|
+
const bytes = fsSync.readSync(fd, buf, 0, buf.length, 0);
|
|
23
22
|
const text = buf.toString("utf8", 0, bytes);
|
|
24
23
|
const nl = text.indexOf("\n");
|
|
25
24
|
return nl === -1 ? text : text.slice(0, nl);
|
|
26
25
|
} finally {
|
|
27
|
-
closeSync(fd);
|
|
26
|
+
fsSync.closeSync(fd);
|
|
28
27
|
}
|
|
29
28
|
}
|
|
30
29
|
|
|
@@ -53,7 +52,7 @@ export function findTracesByDiscussion(dir, discussionId, fsSync) {
|
|
|
53
52
|
const path = join(dir, entry);
|
|
54
53
|
let firstLine;
|
|
55
54
|
try {
|
|
56
|
-
firstLine = readFirstLine(path);
|
|
55
|
+
firstLine = readFirstLine(fsSync, path);
|
|
57
56
|
} catch {
|
|
58
57
|
continue;
|
|
59
58
|
}
|
package/src/commands/discuss.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { createWriteStream } from "node:fs";
|
|
2
1
|
import { resolve } from "node:path";
|
|
2
|
+
import { isoTimestamp } from "@forwardimpact/libutil";
|
|
3
3
|
import { createDiscusser } from "../discusser.js";
|
|
4
4
|
import { createRedactor } from "../redaction.js";
|
|
5
5
|
import { createTeeWriter } from "../tee-writer.js";
|
|
@@ -80,13 +80,14 @@ export async function runDiscussCommand(ctx) {
|
|
|
80
80
|
const redactor = createRedactor({ runtime });
|
|
81
81
|
|
|
82
82
|
const fileStream = opts.outputPath
|
|
83
|
-
? createWriteStream(opts.outputPath)
|
|
83
|
+
? runtime.fs.createWriteStream(opts.outputPath)
|
|
84
84
|
: null;
|
|
85
85
|
const output = fileStream
|
|
86
86
|
? createTeeWriter({
|
|
87
87
|
fileStream,
|
|
88
88
|
textStream: runtime.proc.stdout,
|
|
89
89
|
mode: "supervised",
|
|
90
|
+
now: () => isoTimestamp(runtime.clock.now()),
|
|
90
91
|
})
|
|
91
92
|
: runtime.proc.stdout;
|
|
92
93
|
|