@forwardimpact/libeval 0.1.50 → 0.1.52
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +11 -8
- package/bin/fit-benchmark.js +26 -27
- package/bin/fit-eval.js +36 -30
- package/bin/fit-trace.js +83 -57
- package/package.json +1 -1
- package/src/agent-runner.js +20 -12
- package/src/benchmark/apm-installer.js +48 -44
- package/src/benchmark/env-loader.js +35 -23
- package/src/benchmark/invariants.js +128 -0
- package/src/benchmark/judge.js +18 -19
- package/src/benchmark/npm-installer.js +33 -33
- package/src/benchmark/report.js +40 -26
- package/src/benchmark/result.js +11 -11
- package/src/benchmark/runner.js +90 -46
- package/src/benchmark/task-family.js +78 -65
- package/src/benchmark/workdir.js +100 -93
- package/src/commands/assert.js +30 -22
- package/src/commands/benchmark-invariants.js +74 -0
- package/src/commands/benchmark-report.js +24 -15
- package/src/commands/benchmark-run.js +16 -9
- package/src/commands/by-discussion.js +33 -23
- package/src/commands/callback.js +20 -11
- package/src/commands/discuss.js +31 -13
- package/src/commands/facilitate.js +21 -14
- package/src/commands/output.js +15 -13
- package/src/commands/run.js +28 -14
- package/src/commands/supervise.js +29 -19
- package/src/commands/task-input.js +10 -5
- package/src/commands/tee.js +24 -9
- package/src/commands/trace.js +181 -99
- package/src/discuss-tools.js +48 -2
- package/src/discusser.js +53 -2
- package/src/events/github.js +27 -5
- package/src/facilitator.js +4 -0
- package/src/inbox-poller.js +84 -0
- package/src/judge.js +4 -1
- package/src/message-bus.js +6 -0
- package/src/orchestration-loop.js +14 -4
- package/src/orchestration-toolkit.js +14 -0
- package/src/profile-prompt.js +22 -9
- package/src/redaction.js +31 -9
- package/src/reply-emitter.js +47 -0
- package/src/supervisor.js +4 -0
- package/src/tee-writer.js +4 -2
- package/src/trace-collector.js +9 -2
- package/src/trace-github.js +47 -27
- package/src/benchmark/scorer.js +0 -138
- package/src/commands/benchmark-score.js +0 -68
package/src/trace-github.js
CHANGED
|
@@ -1,10 +1,9 @@
|
|
|
1
|
-
import { execSync } from "node:child_process";
|
|
2
|
-
import { createWriteStream } from "node:fs";
|
|
3
|
-
import { mkdir } from "node:fs/promises";
|
|
4
1
|
import path from "node:path";
|
|
5
2
|
import { pipeline } from "node:stream/promises";
|
|
6
3
|
import { Readable } from "node:stream";
|
|
7
4
|
|
|
5
|
+
import { isoTimestamp } from "@forwardimpact/libutil";
|
|
6
|
+
|
|
8
7
|
const API = "https://api.github.com";
|
|
9
8
|
|
|
10
9
|
/**
|
|
@@ -17,11 +16,15 @@ export class TraceGitHub {
|
|
|
17
16
|
* @param {string} deps.token - GitHub token
|
|
18
17
|
* @param {string} deps.owner - Repository owner
|
|
19
18
|
* @param {string} deps.repo - Repository name
|
|
19
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} deps.runtime -
|
|
20
|
+
* Ambient collaborators; uses `fs`, `subprocess`, `clock`.
|
|
20
21
|
*/
|
|
21
|
-
constructor({ token, owner, repo }) {
|
|
22
|
+
constructor({ token, owner, repo, runtime }) {
|
|
23
|
+
if (!runtime) throw new Error("runtime is required");
|
|
22
24
|
this.token = token;
|
|
23
25
|
this.owner = owner;
|
|
24
26
|
this.repo = repo;
|
|
27
|
+
this.runtime = runtime;
|
|
25
28
|
}
|
|
26
29
|
|
|
27
30
|
/**
|
|
@@ -35,7 +38,7 @@ export class TraceGitHub {
|
|
|
35
38
|
*/
|
|
36
39
|
async listRuns(opts = {}) {
|
|
37
40
|
const { pattern = "agent", limit = 50, lookback = "7d" } = opts;
|
|
38
|
-
const cutoff = parseLookback(lookback);
|
|
41
|
+
const cutoff = parseLookback(lookback, this.runtime.clock.now());
|
|
39
42
|
|
|
40
43
|
const params = new URLSearchParams({
|
|
41
44
|
per_page: String(Math.min(limit, 100)),
|
|
@@ -77,8 +80,9 @@ export class TraceGitHub {
|
|
|
77
80
|
* @returns {Promise<{dir: string, artifact: string, files: string[]}>}
|
|
78
81
|
*/
|
|
79
82
|
async downloadTrace(runId, opts = {}) {
|
|
83
|
+
const fs = this.runtime.fs;
|
|
80
84
|
const dir = opts.dir ?? `/tmp/trace-${runId}`;
|
|
81
|
-
await mkdir(dir, { recursive: true });
|
|
85
|
+
await fs.mkdir(dir, { recursive: true });
|
|
82
86
|
|
|
83
87
|
// List artifacts for this run.
|
|
84
88
|
const url = `${API}/repos/${this.owner}/${this.repo}/actions/runs/${runId}/artifacts`;
|
|
@@ -121,15 +125,27 @@ export class TraceGitHub {
|
|
|
121
125
|
}
|
|
122
126
|
|
|
123
127
|
// Stream to disk then extract.
|
|
124
|
-
await pipeline(
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
`unzip -o -q ${JSON.stringify(zipPath)} -d ${JSON.stringify(dir)}`,
|
|
128
|
+
await pipeline(
|
|
129
|
+
Readable.fromWeb(response.body),
|
|
130
|
+
fs.createWriteStream(zipPath),
|
|
128
131
|
);
|
|
129
132
|
|
|
133
|
+
const unzip = await this.runtime.subprocess.run("unzip", [
|
|
134
|
+
"-o",
|
|
135
|
+
"-q",
|
|
136
|
+
zipPath,
|
|
137
|
+
"-d",
|
|
138
|
+
dir,
|
|
139
|
+
]);
|
|
140
|
+
if (unzip.exitCode !== 0) {
|
|
141
|
+
throw new Error(
|
|
142
|
+
`unzip failed (${unzip.exitCode}): ${unzip.stderr || unzip.stdout}`,
|
|
143
|
+
);
|
|
144
|
+
}
|
|
145
|
+
|
|
130
146
|
// List extracted files.
|
|
131
|
-
const
|
|
132
|
-
const files =
|
|
147
|
+
const entries = await fs.readdir(dir);
|
|
148
|
+
const files = entries.filter((f) => !f.endsWith(".zip"));
|
|
133
149
|
|
|
134
150
|
return { dir, artifact: artifact.name, files };
|
|
135
151
|
}
|
|
@@ -160,14 +176,15 @@ export class TraceGitHub {
|
|
|
160
176
|
* Parse a lookback duration string into an ISO date string.
|
|
161
177
|
* Supports: Nd (days), Nh (hours), Nw (weeks).
|
|
162
178
|
* @param {string} lookback
|
|
179
|
+
* @param {number} nowMs - Current time in ms (`runtime.clock.now()`).
|
|
163
180
|
* @returns {string|null} ISO date string or null if unparseable
|
|
164
181
|
*/
|
|
165
|
-
function parseLookback(lookback) {
|
|
182
|
+
function parseLookback(lookback, nowMs) {
|
|
166
183
|
const match = lookback.match(/^(\d+)([dhw])$/);
|
|
167
184
|
if (!match) return null;
|
|
168
185
|
const [, val, unit] = match;
|
|
169
186
|
const ms = { d: 86400000, h: 3600000, w: 604800000 }[unit];
|
|
170
|
-
return
|
|
187
|
+
return isoTimestamp(nowMs - parseInt(val, 10) * ms);
|
|
171
188
|
}
|
|
172
189
|
|
|
173
190
|
/**
|
|
@@ -203,22 +220,23 @@ export function parseGitRemote(remote) {
|
|
|
203
220
|
* 1. `GITHUB_REPOSITORY` env var (set automatically by GitHub Actions).
|
|
204
221
|
* 2. `git remote get-url origin` in the current working directory.
|
|
205
222
|
*
|
|
206
|
-
* @
|
|
223
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
|
|
224
|
+
* @returns {Promise<{owner: string, repo: string}>}
|
|
207
225
|
* @throws {Error} with a clear message if neither source yields a parseable slug.
|
|
208
226
|
*/
|
|
209
|
-
export function detectRepoSlug() {
|
|
210
|
-
const env =
|
|
227
|
+
export async function detectRepoSlug(runtime) {
|
|
228
|
+
const env = runtime.proc.env.GITHUB_REPOSITORY;
|
|
211
229
|
if (env && env.trim()) {
|
|
212
230
|
return parseGitRemote(env.trim());
|
|
213
231
|
}
|
|
214
232
|
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
233
|
+
const result = await runtime.subprocess.run("git", [
|
|
234
|
+
"remote",
|
|
235
|
+
"get-url",
|
|
236
|
+
"origin",
|
|
237
|
+
]);
|
|
238
|
+
const remote = result.exitCode === 0 ? result.stdout.trim() : "";
|
|
239
|
+
if (result.exitCode !== 0) {
|
|
222
240
|
throw new Error(
|
|
223
241
|
"Cannot detect repository: set --repo <owner/repo>, export GITHUB_REPOSITORY, or run inside a git checkout with an 'origin' remote.",
|
|
224
242
|
);
|
|
@@ -245,10 +263,12 @@ export function detectRepoSlug() {
|
|
|
245
263
|
* @param {object} opts
|
|
246
264
|
* @param {string} opts.token - GitHub token (e.g. from `Config.ghToken()`)
|
|
247
265
|
* @param {string} [opts.repo] - "owner/repo" override (default: detect from git remote)
|
|
266
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} opts.runtime - Ambient collaborators.
|
|
248
267
|
* @returns {Promise<TraceGitHub>}
|
|
249
268
|
*/
|
|
250
269
|
export async function createTraceGitHub(opts = {}) {
|
|
251
|
-
const { token, repo: repoOverride } = opts;
|
|
270
|
+
const { token, repo: repoOverride, runtime } = opts;
|
|
271
|
+
if (!runtime) throw new Error("createTraceGitHub: runtime is required");
|
|
252
272
|
if (!token) {
|
|
253
273
|
throw new Error(
|
|
254
274
|
"createTraceGitHub: token is required (pass Config.ghToken())",
|
|
@@ -257,7 +277,7 @@ export async function createTraceGitHub(opts = {}) {
|
|
|
257
277
|
|
|
258
278
|
const { owner, repo } = repoOverride
|
|
259
279
|
? parseGitRemote(repoOverride)
|
|
260
|
-
: detectRepoSlug();
|
|
280
|
+
: await detectRepoSlug(runtime);
|
|
261
281
|
|
|
262
|
-
return new TraceGitHub({ token, owner, repo });
|
|
282
|
+
return new TraceGitHub({ token, owner, repo, runtime });
|
|
263
283
|
}
|
package/src/benchmark/scorer.js
DELETED
|
@@ -1,138 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Scorer — runs `<task.paths.hooks>/score.sh` from the template path against
|
|
3
|
-
* the post-run agent CWD. The exit code is authoritative for the verdict;
|
|
4
|
-
* structured per-test rows arrive on fd 3 (`$RESULTS_FD=3`) as NDJSON.
|
|
5
|
-
*/
|
|
6
|
-
|
|
7
|
-
import { spawn } from "node:child_process";
|
|
8
|
-
import {
|
|
9
|
-
closeSync,
|
|
10
|
-
createWriteStream,
|
|
11
|
-
openSync,
|
|
12
|
-
readFileSync,
|
|
13
|
-
unlinkSync,
|
|
14
|
-
} from "node:fs";
|
|
15
|
-
import { join } from "node:path";
|
|
16
|
-
|
|
17
|
-
/**
|
|
18
|
-
* @typedef {object} ScoringResult
|
|
19
|
-
* @property {"pass" | "fail"} verdict
|
|
20
|
-
* @property {Array<object>} details
|
|
21
|
-
* @property {number} exitCode
|
|
22
|
-
*/
|
|
23
|
-
|
|
24
|
-
/**
|
|
25
|
-
* Run the task's scoring script.
|
|
26
|
-
* @param {import("./task-family.js").Task} task
|
|
27
|
-
* @param {{cwd: string, port: number, runDir: string}} ctx
|
|
28
|
-
* @returns {Promise<ScoringResult>}
|
|
29
|
-
*/
|
|
30
|
-
export function runScoring(task, ctx) {
|
|
31
|
-
if (!task.paths.score) {
|
|
32
|
-
return Promise.resolve({ verdict: "pass", details: [], exitCode: 0 });
|
|
33
|
-
}
|
|
34
|
-
return new Promise((res, rej) => {
|
|
35
|
-
const script = task.paths.score;
|
|
36
|
-
const stderrLog = createWriteStream(join(ctx.runDir, "scoring.stderr.log"));
|
|
37
|
-
|
|
38
|
-
// Bun's child_process pipe setup for fd >= 3 is racy under load (it
|
|
39
|
-
// creates a unix socket pair and the connect() can return ENOENT). Use
|
|
40
|
-
// a temp file as the fd-3 backing store instead — the script still
|
|
41
|
-
// writes via `$RESULTS_FD`, but we hand it a real file descriptor.
|
|
42
|
-
const fd3Path = join(ctx.runDir, "scoring.fd3.ndjson");
|
|
43
|
-
let fd3File;
|
|
44
|
-
try {
|
|
45
|
-
fd3File = openSync(fd3Path, "w+");
|
|
46
|
-
} catch (e) {
|
|
47
|
-
rej(e);
|
|
48
|
-
return;
|
|
49
|
-
}
|
|
50
|
-
|
|
51
|
-
const child = spawn(script, [], {
|
|
52
|
-
env: {
|
|
53
|
-
...process.env,
|
|
54
|
-
WORKDIR: ctx.cwd,
|
|
55
|
-
PORT: String(ctx.port),
|
|
56
|
-
RESULTS_FD: "3",
|
|
57
|
-
},
|
|
58
|
-
stdio: ["inherit", "pipe", "pipe", fd3File],
|
|
59
|
-
});
|
|
60
|
-
if (child.pid === undefined) {
|
|
61
|
-
try {
|
|
62
|
-
closeSync(fd3File);
|
|
63
|
-
} catch {
|
|
64
|
-
// already closed
|
|
65
|
-
}
|
|
66
|
-
rej(new Error(`failed to spawn scoring script: ${script}`));
|
|
67
|
-
return;
|
|
68
|
-
}
|
|
69
|
-
|
|
70
|
-
child.stderr.pipe(stderrLog);
|
|
71
|
-
// Drain stdout (do not require consumers to read it).
|
|
72
|
-
child.stdout.on("data", () => {});
|
|
73
|
-
|
|
74
|
-
child.on("error", (e) => {
|
|
75
|
-
tryClose(fd3File);
|
|
76
|
-
rej(e);
|
|
77
|
-
});
|
|
78
|
-
child.on("close", (code) => {
|
|
79
|
-
stderrLog.end();
|
|
80
|
-
tryClose(fd3File);
|
|
81
|
-
const raw = readAndUnlink(fd3Path);
|
|
82
|
-
const details = [];
|
|
83
|
-
parseFd3Buffer(raw, details);
|
|
84
|
-
const exitCode = typeof code === "number" ? code : -1;
|
|
85
|
-
res({
|
|
86
|
-
verdict: exitCode === 0 ? "pass" : "fail",
|
|
87
|
-
details,
|
|
88
|
-
exitCode,
|
|
89
|
-
});
|
|
90
|
-
});
|
|
91
|
-
});
|
|
92
|
-
}
|
|
93
|
-
|
|
94
|
-
function pushRow(line, details) {
|
|
95
|
-
const trimmed = line.trim();
|
|
96
|
-
if (!trimmed) return;
|
|
97
|
-
try {
|
|
98
|
-
details.push(JSON.parse(trimmed));
|
|
99
|
-
} catch {
|
|
100
|
-
details.push({ raw: trimmed, parseError: true });
|
|
101
|
-
}
|
|
102
|
-
}
|
|
103
|
-
|
|
104
|
-
function tryClose(fd) {
|
|
105
|
-
try {
|
|
106
|
-
closeSync(fd);
|
|
107
|
-
} catch {
|
|
108
|
-
// already closed
|
|
109
|
-
}
|
|
110
|
-
}
|
|
111
|
-
|
|
112
|
-
function readAndUnlink(path) {
|
|
113
|
-
let raw = "";
|
|
114
|
-
try {
|
|
115
|
-
raw = readFileSync(path, "utf8");
|
|
116
|
-
} catch {
|
|
117
|
-
// empty
|
|
118
|
-
}
|
|
119
|
-
try {
|
|
120
|
-
unlinkSync(path);
|
|
121
|
-
} catch {
|
|
122
|
-
// best-effort cleanup
|
|
123
|
-
}
|
|
124
|
-
return raw;
|
|
125
|
-
}
|
|
126
|
-
|
|
127
|
-
/**
|
|
128
|
-
* Parse the fd-3 buffer (read from the temp-file backing) into one NDJSON
|
|
129
|
-
* row per detail entry.
|
|
130
|
-
*/
|
|
131
|
-
function parseFd3Buffer(buf, details) {
|
|
132
|
-
if (!buf) return;
|
|
133
|
-
const parts = buf.split("\n");
|
|
134
|
-
for (let i = 0; i < parts.length - 1; i++) pushRow(parts[i], details);
|
|
135
|
-
if (parts[parts.length - 1].trim()) {
|
|
136
|
-
pushRow(parts[parts.length - 1], details);
|
|
137
|
-
}
|
|
138
|
-
}
|
package/src/commands/benchmark-score.js
DELETED
|
@@ -1,68 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* `fit-benchmark score` — score a single task against a post-run workdir
|
|
3
|
-
* directory without invoking an agent (P6/P7). Useful for re-scoring an
|
|
4
|
-
* agent's output against revised grading material.
|
|
5
|
-
*/
|
|
6
|
-
|
|
7
|
-
import { writeFileSync } from "node:fs";
|
|
8
|
-
import { join, resolve } from "node:path";
|
|
9
|
-
import { createServer } from "node:net";
|
|
10
|
-
|
|
11
|
-
import { validateScoringRecord } from "../benchmark/result.js";
|
|
12
|
-
import { runScoring } from "../benchmark/scorer.js";
|
|
13
|
-
import { loadTaskFamily } from "../benchmark/task-family.js";
|
|
14
|
-
|
|
15
|
-
/**
|
|
16
|
-
* @param {object} values
|
|
17
|
-
* @param {string[]} _args
|
|
18
|
-
*/
|
|
19
|
-
export async function runBenchmarkScoreCommand(values, _args) {
|
|
20
|
-
const familyInput = values.family;
|
|
21
|
-
if (!familyInput) throw new Error("--family is required");
|
|
22
|
-
const taskId = values.task;
|
|
23
|
-
if (!taskId) throw new Error("--task is required");
|
|
24
|
-
const workdirArg = values.workdir;
|
|
25
|
-
if (!workdirArg) throw new Error("--workdir is required");
|
|
26
|
-
|
|
27
|
-
const family = await loadTaskFamily(familyInput);
|
|
28
|
-
const task = family.tasks().find((t) => t.id === taskId);
|
|
29
|
-
if (!task) throw new Error(`task not found in family: ${taskId}`);
|
|
30
|
-
|
|
31
|
-
const runDir = resolve(workdirArg);
|
|
32
|
-
const cwd = join(runDir, "cwd");
|
|
33
|
-
const port = await allocatePort();
|
|
34
|
-
|
|
35
|
-
const scoring = await runScoring(task, { cwd, port, runDir });
|
|
36
|
-
const record = {
|
|
37
|
-
taskId: task.id,
|
|
38
|
-
scoring,
|
|
39
|
-
exitCode: scoring.exitCode,
|
|
40
|
-
};
|
|
41
|
-
validateScoringRecord(record);
|
|
42
|
-
|
|
43
|
-
const line = JSON.stringify(record) + "\n";
|
|
44
|
-
if (values.output) {
|
|
45
|
-
writeFileSync(resolve(values.output), line);
|
|
46
|
-
} else {
|
|
47
|
-
process.stdout.write(line);
|
|
48
|
-
}
|
|
49
|
-
process.exit(scoring.verdict === "pass" ? 0 : 1);
|
|
50
|
-
}
|
|
51
|
-
|
|
52
|
-
function allocatePort() {
|
|
53
|
-
return new Promise((res, rej) => {
|
|
54
|
-
const server = createServer();
|
|
55
|
-
server.unref();
|
|
56
|
-
server.on("error", rej);
|
|
57
|
-
server.listen(0, "127.0.0.1", () => {
|
|
58
|
-
const addr = server.address();
|
|
59
|
-
if (!addr || typeof addr === "string") {
|
|
60
|
-
server.close();
|
|
61
|
-
rej(new Error("failed to allocate port"));
|
|
62
|
-
return;
|
|
63
|
-
}
|
|
64
|
-
const port = addr.port;
|
|
65
|
-
server.close(() => res(port));
|
|
66
|
-
});
|
|
67
|
-
});
|
|
68
|
-
}
|