@forwardimpact/libeval 0.1.50 → 0.1.52

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. package/README.md +11 -8
  2. package/bin/fit-benchmark.js +26 -27
  3. package/bin/fit-eval.js +36 -30
  4. package/bin/fit-trace.js +83 -57
  5. package/package.json +1 -1
  6. package/src/agent-runner.js +20 -12
  7. package/src/benchmark/apm-installer.js +48 -44
  8. package/src/benchmark/env-loader.js +35 -23
  9. package/src/benchmark/invariants.js +128 -0
  10. package/src/benchmark/judge.js +18 -19
  11. package/src/benchmark/npm-installer.js +33 -33
  12. package/src/benchmark/report.js +40 -26
  13. package/src/benchmark/result.js +11 -11
  14. package/src/benchmark/runner.js +90 -46
  15. package/src/benchmark/task-family.js +78 -65
  16. package/src/benchmark/workdir.js +100 -93
  17. package/src/commands/assert.js +30 -22
  18. package/src/commands/benchmark-invariants.js +74 -0
  19. package/src/commands/benchmark-report.js +24 -15
  20. package/src/commands/benchmark-run.js +16 -9
  21. package/src/commands/by-discussion.js +33 -23
  22. package/src/commands/callback.js +20 -11
  23. package/src/commands/discuss.js +31 -13
  24. package/src/commands/facilitate.js +21 -14
  25. package/src/commands/output.js +15 -13
  26. package/src/commands/run.js +28 -14
  27. package/src/commands/supervise.js +29 -19
  28. package/src/commands/task-input.js +10 -5
  29. package/src/commands/tee.js +24 -9
  30. package/src/commands/trace.js +181 -99
  31. package/src/discuss-tools.js +48 -2
  32. package/src/discusser.js +53 -2
  33. package/src/events/github.js +27 -5
  34. package/src/facilitator.js +4 -0
  35. package/src/inbox-poller.js +84 -0
  36. package/src/judge.js +4 -1
  37. package/src/message-bus.js +6 -0
  38. package/src/orchestration-loop.js +14 -4
  39. package/src/orchestration-toolkit.js +14 -0
  40. package/src/profile-prompt.js +22 -9
  41. package/src/redaction.js +31 -9
  42. package/src/reply-emitter.js +47 -0
  43. package/src/supervisor.js +4 -0
  44. package/src/tee-writer.js +4 -2
  45. package/src/trace-collector.js +9 -2
  46. package/src/trace-github.js +47 -27
  47. package/src/benchmark/scorer.js +0 -138
  48. package/src/commands/benchmark-score.js +0 -68
@@ -1,10 +1,9 @@
1
- import { execSync } from "node:child_process";
2
- import { createWriteStream } from "node:fs";
3
- import { mkdir } from "node:fs/promises";
4
1
  import path from "node:path";
5
2
  import { pipeline } from "node:stream/promises";
6
3
  import { Readable } from "node:stream";
7
4
 
5
+ import { isoTimestamp } from "@forwardimpact/libutil";
6
+
8
7
  const API = "https://api.github.com";
9
8
 
10
9
  /**
@@ -17,11 +16,15 @@ export class TraceGitHub {
17
16
  * @param {string} deps.token - GitHub token
18
17
  * @param {string} deps.owner - Repository owner
19
18
  * @param {string} deps.repo - Repository name
19
+ * @param {import("@forwardimpact/libutil/runtime").Runtime} deps.runtime -
20
+ * Ambient collaborators; uses `fs`, `subprocess`, `clock`.
20
21
  */
21
- constructor({ token, owner, repo }) {
22
+ constructor({ token, owner, repo, runtime }) {
23
+ if (!runtime) throw new Error("runtime is required");
22
24
  this.token = token;
23
25
  this.owner = owner;
24
26
  this.repo = repo;
27
+ this.runtime = runtime;
25
28
  }
26
29
 
27
30
  /**
@@ -35,7 +38,7 @@ export class TraceGitHub {
35
38
  */
36
39
  async listRuns(opts = {}) {
37
40
  const { pattern = "agent", limit = 50, lookback = "7d" } = opts;
38
- const cutoff = parseLookback(lookback);
41
+ const cutoff = parseLookback(lookback, this.runtime.clock.now());
39
42
 
40
43
  const params = new URLSearchParams({
41
44
  per_page: String(Math.min(limit, 100)),
@@ -77,8 +80,9 @@ export class TraceGitHub {
77
80
  * @returns {Promise<{dir: string, artifact: string, files: string[]}>}
78
81
  */
79
82
  async downloadTrace(runId, opts = {}) {
83
+ const fs = this.runtime.fs;
80
84
  const dir = opts.dir ?? `/tmp/trace-${runId}`;
81
- await mkdir(dir, { recursive: true });
85
+ await fs.mkdir(dir, { recursive: true });
82
86
 
83
87
  // List artifacts for this run.
84
88
  const url = `${API}/repos/${this.owner}/${this.repo}/actions/runs/${runId}/artifacts`;
@@ -121,15 +125,27 @@ export class TraceGitHub {
121
125
  }
122
126
 
123
127
  // Stream to disk then extract.
124
- await pipeline(Readable.fromWeb(response.body), createWriteStream(zipPath));
125
-
126
- execSync(
127
- `unzip -o -q ${JSON.stringify(zipPath)} -d ${JSON.stringify(dir)}`,
128
+ await pipeline(
129
+ Readable.fromWeb(response.body),
130
+ fs.createWriteStream(zipPath),
128
131
  );
129
132
 
133
+ const unzip = await this.runtime.subprocess.run("unzip", [
134
+ "-o",
135
+ "-q",
136
+ zipPath,
137
+ "-d",
138
+ dir,
139
+ ]);
140
+ if (unzip.exitCode !== 0) {
141
+ throw new Error(
142
+ `unzip failed (${unzip.exitCode}): ${unzip.stderr || unzip.stdout}`,
143
+ );
144
+ }
145
+
130
146
  // List extracted files.
131
- const { readdirSync } = await import("node:fs");
132
- const files = readdirSync(dir).filter((f) => !f.endsWith(".zip"));
147
+ const entries = await fs.readdir(dir);
148
+ const files = entries.filter((f) => !f.endsWith(".zip"));
133
149
 
134
150
  return { dir, artifact: artifact.name, files };
135
151
  }
@@ -160,14 +176,15 @@ export class TraceGitHub {
160
176
  * Parse a lookback duration string into an ISO date string.
161
177
  * Supports: Nd (days), Nh (hours), Nw (weeks).
162
178
  * @param {string} lookback
179
+ * @param {number} nowMs - Current time in ms (`runtime.clock.now()`).
163
180
  * @returns {string|null} ISO date string or null if unparseable
164
181
  */
165
- function parseLookback(lookback) {
182
+ function parseLookback(lookback, nowMs) {
166
183
  const match = lookback.match(/^(\d+)([dhw])$/);
167
184
  if (!match) return null;
168
185
  const [, val, unit] = match;
169
186
  const ms = { d: 86400000, h: 3600000, w: 604800000 }[unit];
170
- return new Date(Date.now() - parseInt(val, 10) * ms).toISOString();
187
+ return isoTimestamp(nowMs - parseInt(val, 10) * ms);
171
188
  }
172
189
 
173
190
  /**
@@ -203,22 +220,23 @@ export function parseGitRemote(remote) {
203
220
  * 1. `GITHUB_REPOSITORY` env var (set automatically by GitHub Actions).
204
221
  * 2. `git remote get-url origin` in the current working directory.
205
222
  *
206
- * @returns {{owner: string, repo: string}}
223
+ * @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
224
+ * @returns {Promise<{owner: string, repo: string}>}
207
225
  * @throws {Error} with a clear message if neither source yields a parseable slug.
208
226
  */
209
- export function detectRepoSlug() {
210
- const env = process.env.GITHUB_REPOSITORY;
227
+ export async function detectRepoSlug(runtime) {
228
+ const env = runtime.proc.env.GITHUB_REPOSITORY;
211
229
  if (env && env.trim()) {
212
230
  return parseGitRemote(env.trim());
213
231
  }
214
232
 
215
- let remote;
216
- try {
217
- remote = execSync("git remote get-url origin", {
218
- encoding: "utf8",
219
- stdio: ["ignore", "pipe", "ignore"],
220
- }).trim();
221
- } catch {
233
+ const result = await runtime.subprocess.run("git", [
234
+ "remote",
235
+ "get-url",
236
+ "origin",
237
+ ]);
238
+ const remote = result.exitCode === 0 ? result.stdout.trim() : "";
239
+ if (result.exitCode !== 0) {
222
240
  throw new Error(
223
241
  "Cannot detect repository: set --repo <owner/repo>, export GITHUB_REPOSITORY, or run inside a git checkout with an 'origin' remote.",
224
242
  );
@@ -245,10 +263,12 @@ export function detectRepoSlug() {
245
263
  * @param {object} opts
246
264
  * @param {string} opts.token - GitHub token (e.g. from `Config.ghToken()`)
247
265
  * @param {string} [opts.repo] - "owner/repo" override (default: detect from git remote)
266
+ * @param {import("@forwardimpact/libutil/runtime").Runtime} opts.runtime - Ambient collaborators.
248
267
  * @returns {Promise<TraceGitHub>}
249
268
  */
250
269
  export async function createTraceGitHub(opts = {}) {
251
- const { token, repo: repoOverride } = opts;
270
+ const { token, repo: repoOverride, runtime } = opts;
271
+ if (!runtime) throw new Error("createTraceGitHub: runtime is required");
252
272
  if (!token) {
253
273
  throw new Error(
254
274
  "createTraceGitHub: token is required (pass Config.ghToken())",
@@ -257,7 +277,7 @@ export async function createTraceGitHub(opts = {}) {
257
277
 
258
278
  const { owner, repo } = repoOverride
259
279
  ? parseGitRemote(repoOverride)
260
- : detectRepoSlug();
280
+ : await detectRepoSlug(runtime);
261
281
 
262
- return new TraceGitHub({ token, owner, repo });
282
+ return new TraceGitHub({ token, owner, repo, runtime });
263
283
  }
@@ -1,138 +0,0 @@
1
- /**
2
- * Scorer — runs `<task.paths.hooks>/score.sh` from the template path against
3
- * the post-run agent CWD. The exit code is authoritative for the verdict;
4
- * structured per-test rows arrive on fd 3 (`$RESULTS_FD=3`) as NDJSON.
5
- */
6
-
7
- import { spawn } from "node:child_process";
8
- import {
9
- closeSync,
10
- createWriteStream,
11
- openSync,
12
- readFileSync,
13
- unlinkSync,
14
- } from "node:fs";
15
- import { join } from "node:path";
16
-
17
- /**
18
- * @typedef {object} ScoringResult
19
- * @property {"pass" | "fail"} verdict
20
- * @property {Array<object>} details
21
- * @property {number} exitCode
22
- */
23
-
24
- /**
25
- * Run the task's scoring script.
26
- * @param {import("./task-family.js").Task} task
27
- * @param {{cwd: string, port: number, runDir: string}} ctx
28
- * @returns {Promise<ScoringResult>}
29
- */
30
- export function runScoring(task, ctx) {
31
- if (!task.paths.score) {
32
- return Promise.resolve({ verdict: "pass", details: [], exitCode: 0 });
33
- }
34
- return new Promise((res, rej) => {
35
- const script = task.paths.score;
36
- const stderrLog = createWriteStream(join(ctx.runDir, "scoring.stderr.log"));
37
-
38
- // Bun's child_process pipe setup for fd >= 3 is racy under load (it
39
- // creates a unix socket pair and the connect() can return ENOENT). Use
40
- // a temp file as the fd-3 backing store instead — the script still
41
- // writes via `$RESULTS_FD`, but we hand it a real file descriptor.
42
- const fd3Path = join(ctx.runDir, "scoring.fd3.ndjson");
43
- let fd3File;
44
- try {
45
- fd3File = openSync(fd3Path, "w+");
46
- } catch (e) {
47
- rej(e);
48
- return;
49
- }
50
-
51
- const child = spawn(script, [], {
52
- env: {
53
- ...process.env,
54
- WORKDIR: ctx.cwd,
55
- PORT: String(ctx.port),
56
- RESULTS_FD: "3",
57
- },
58
- stdio: ["inherit", "pipe", "pipe", fd3File],
59
- });
60
- if (child.pid === undefined) {
61
- try {
62
- closeSync(fd3File);
63
- } catch {
64
- // already closed
65
- }
66
- rej(new Error(`failed to spawn scoring script: ${script}`));
67
- return;
68
- }
69
-
70
- child.stderr.pipe(stderrLog);
71
- // Drain stdout (do not require consumers to read it).
72
- child.stdout.on("data", () => {});
73
-
74
- child.on("error", (e) => {
75
- tryClose(fd3File);
76
- rej(e);
77
- });
78
- child.on("close", (code) => {
79
- stderrLog.end();
80
- tryClose(fd3File);
81
- const raw = readAndUnlink(fd3Path);
82
- const details = [];
83
- parseFd3Buffer(raw, details);
84
- const exitCode = typeof code === "number" ? code : -1;
85
- res({
86
- verdict: exitCode === 0 ? "pass" : "fail",
87
- details,
88
- exitCode,
89
- });
90
- });
91
- });
92
- }
93
-
94
- function pushRow(line, details) {
95
- const trimmed = line.trim();
96
- if (!trimmed) return;
97
- try {
98
- details.push(JSON.parse(trimmed));
99
- } catch {
100
- details.push({ raw: trimmed, parseError: true });
101
- }
102
- }
103
-
104
- function tryClose(fd) {
105
- try {
106
- closeSync(fd);
107
- } catch {
108
- // already closed
109
- }
110
- }
111
-
112
- function readAndUnlink(path) {
113
- let raw = "";
114
- try {
115
- raw = readFileSync(path, "utf8");
116
- } catch {
117
- // empty
118
- }
119
- try {
120
- unlinkSync(path);
121
- } catch {
122
- // best-effort cleanup
123
- }
124
- return raw;
125
- }
126
-
127
- /**
128
- * Parse the fd-3 buffer (read from the temp-file backing) into one NDJSON
129
- * row per detail entry.
130
- */
131
- function parseFd3Buffer(buf, details) {
132
- if (!buf) return;
133
- const parts = buf.split("\n");
134
- for (let i = 0; i < parts.length - 1; i++) pushRow(parts[i], details);
135
- if (parts[parts.length - 1].trim()) {
136
- pushRow(parts[parts.length - 1], details);
137
- }
138
- }
@@ -1,68 +0,0 @@
1
- /**
2
- * `fit-benchmark score` — score a single task against a post-run workdir
3
- * directory without invoking an agent (P6/P7). Useful for re-scoring an
4
- * agent's output against revised grading material.
5
- */
6
-
7
- import { writeFileSync } from "node:fs";
8
- import { join, resolve } from "node:path";
9
- import { createServer } from "node:net";
10
-
11
- import { validateScoringRecord } from "../benchmark/result.js";
12
- import { runScoring } from "../benchmark/scorer.js";
13
- import { loadTaskFamily } from "../benchmark/task-family.js";
14
-
15
- /**
16
- * @param {object} values
17
- * @param {string[]} _args
18
- */
19
- export async function runBenchmarkScoreCommand(values, _args) {
20
- const familyInput = values.family;
21
- if (!familyInput) throw new Error("--family is required");
22
- const taskId = values.task;
23
- if (!taskId) throw new Error("--task is required");
24
- const workdirArg = values.workdir;
25
- if (!workdirArg) throw new Error("--workdir is required");
26
-
27
- const family = await loadTaskFamily(familyInput);
28
- const task = family.tasks().find((t) => t.id === taskId);
29
- if (!task) throw new Error(`task not found in family: ${taskId}`);
30
-
31
- const runDir = resolve(workdirArg);
32
- const cwd = join(runDir, "cwd");
33
- const port = await allocatePort();
34
-
35
- const scoring = await runScoring(task, { cwd, port, runDir });
36
- const record = {
37
- taskId: task.id,
38
- scoring,
39
- exitCode: scoring.exitCode,
40
- };
41
- validateScoringRecord(record);
42
-
43
- const line = JSON.stringify(record) + "\n";
44
- if (values.output) {
45
- writeFileSync(resolve(values.output), line);
46
- } else {
47
- process.stdout.write(line);
48
- }
49
- process.exit(scoring.verdict === "pass" ? 0 : 1);
50
- }
51
-
52
- function allocatePort() {
53
- return new Promise((res, rej) => {
54
- const server = createServer();
55
- server.unref();
56
- server.on("error", rej);
57
- server.listen(0, "127.0.0.1", () => {
58
- const addr = server.address();
59
- if (!addr || typeof addr === "string") {
60
- server.close();
61
- rej(new Error("failed to allocate port"));
62
- return;
63
- }
64
- const port = addr.port;
65
- server.close(() => res(port));
66
- });
67
- });
68
- }