npm - @forwardimpact/libeval - Versions diffs - 0.1.51 → 0.1.53 - Mend

@forwardimpact/libeval 0.1.51 → 0.1.53

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36)

package/bin/fit-benchmark.js +8 -14
package/bin/fit-eval.js +8 -28
package/bin/fit-selfedit.js +6 -4
package/bin/fit-trace.js +7 -14
package/package.json +1 -1
package/src/benchmark/apm-installer.js +48 -44
package/src/benchmark/invariants.js +51 -63
package/src/benchmark/judge.js +13 -11
package/src/benchmark/npm-installer.js +33 -33
package/src/benchmark/report.js +25 -11
package/src/benchmark/result.js +2 -2
package/src/benchmark/runner.js +82 -38
package/src/benchmark/task-family.js +74 -63
package/src/benchmark/workdir.js +91 -99
package/src/commands/benchmark-invariants.js +3 -3
package/src/commands/benchmark-report.js +1 -0
package/src/commands/benchmark-run.js +1 -1
package/src/commands/by-discussion.js +10 -11
package/src/commands/discuss.js +3 -2
package/src/commands/facilitate.js +3 -2
package/src/commands/output.js +4 -1
package/src/commands/run.js +6 -2
package/src/commands/supervise.js +3 -2
package/src/commands/tee.js +24 -9
package/src/commands/trace.js +7 -2
package/src/discusser.js +7 -5
package/src/events/github.js +7 -1
package/src/facilitator.js +6 -5
package/src/inbox-poller.js +5 -8
package/src/judge.js +12 -13
package/src/profile-prompt.js +124 -26
package/src/redaction.js +3 -16
package/src/supervisor.js +7 -0
package/src/tee-writer.js +4 -2
package/src/trace-collector.js +9 -2
package/src/trace-github.js +47 -27

package/src/benchmark/npm-installer.js CHANGED

@@ -3,23 +3,22 @@
  * is present, then copies the resulting `node_modules/` into the staging
  * directory so WorkdirManager can seed each per-task CWD.
  *
- * Symmetric to ApmInstaller: constructor injection of `spawn` for testability,
- * factory function, and a free-function shorthand.
+ * Symmetric to ApmInstaller: the subprocess and filesystem flow through the
+ * injected `runtime` bag (`runtime.subprocess.spawn` + `runtime.fs`).
  */
-import { spawn as nodeSpawn } from "node:child_process";
-import { access, cp } from "node:fs/promises";
 import { join } from "node:path";
 /** Run `bun install` in the family root and stage node_modules/ for per-task CWDs. */
 export class NpmInstaller {
   /**
-   * @param {object} [deps]
-   * @param {typeof nodeSpawn} [deps.spawn] - Spawn seam (defaults to
-   *   `node:child_process` spawn). Tests inject a fake to avoid shelling out.
+   * @param {object} deps
+   * @param {import("@forwardimpact/libutil/runtime").Runtime} deps.runtime -
+   *   Ambient collaborators; uses `subprocess.spawn` and `fs`.
    */
-  constructor({ spawn } = {}) {
-    this.spawn = spawn ?? nodeSpawn;
+  constructor({ runtime }) {
+    if (!runtime) throw new Error("runtime is required");
+    this.runtime = runtime;
   }
   /**
@@ -28,8 +27,10 @@ export class NpmInstaller {
    * @returns {Promise<void>}
    */
   async install(family, stagingDir) {
+    const fs = this.runtime.fs;
     const pkgJson = join(family.rootPath, "package.json");
-    const hasPkg = await access(pkgJson)
+    const hasPkg = await fs
+      .access(pkgJson)
       .then(() => true)
       .catch(() => false);
     if (!hasPkg) return;
@@ -38,37 +39,35 @@ export class NpmInstaller {
     const sourceModules = join(family.rootPath, "node_modules");
     try {
-      await access(sourceModules);
+      await fs.access(sourceModules);
     } catch {
       throw new Error(
         `bun install did not produce node_modules/ at ${sourceModules}; check the family's package.json`,
       );
     }
-    await cp(sourceModules, join(stagingDir, "node_modules"), {
+    await fs.cp(sourceModules, join(stagingDir, "node_modules"), {
       recursive: true,
     });
   }
-  #runBunInstall(cwd) {
-    return new Promise((res, rej) => {
-      const child = this.spawn("bun", ["install"], {
-        cwd,
-        stdio: ["ignore", "pipe", "pipe"],
-      });
-      let stderr = "";
-      child.stdout.on("data", () => {});
-      child.stderr.on("data", (d) => {
-        stderr += d.toString();
-      });
-      child.on("error", (e) => {
-        rej(new Error(`failed to spawn bun: ${e.message}`));
-      });
-      child.on("close", (code) => {
-        if (code === 0) res();
-        else rej(new Error(`bun install exited ${code}: ${stderr}`));
-      });
+  async #runBunInstall(cwd) {
+    const child = this.runtime.subprocess.spawn("bun", ["install"], {
+      cwd,
+      stdio: ["ignore", "pipe", "pipe"],
     });
+    let stderr = "";
+    const drainStdout = (async () => {
+      for await (const _chunk of child.stdout) {
+        // discard
+      }
+    })();
+    for await (const chunk of child.stderr) stderr += chunk.toString();
+    await drainStdout;
+    const code = await child.exitCode;
+    if (code !== 0) {
+      throw new Error(`bun install exited ${code}: ${stderr}`);
+    }
   }
 }
@@ -78,10 +77,11 @@ export function createNpmInstaller(deps) {
 }
 /**
- * Free-function shorthand for callers that don't need to inject a spawn seam.
+ * Free-function shorthand for callers that thread a runtime bag.
  * @param {import("./task-family.js").TaskFamily} family
  * @param {string} stagingDir
+ * @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
  */
-export function installNpm(family, stagingDir) {
-  return new NpmInstaller().install(family, stagingDir);
+export function installNpm(family, stagingDir, runtime) {
+  return new NpmInstaller({ runtime }).install(family, stagingDir);
 }

package/src/benchmark/report.js CHANGED

@@ -12,9 +12,7 @@
  * whole report.
  */
-import { createReadStream } from "node:fs";
 import { join } from "node:path";
-import { createInterface } from "node:readline";
 import { validateResultRecord } from "./result.js";
@@ -41,11 +39,17 @@ import { validateResultRecord } from "./result.js";
  */
 /**
- * @param {{inputDir: string, kValues: number[], includeRuns?: boolean}} opts
+ * @param {{inputDir: string, kValues: number[], includeRuns?: boolean, runtime: import("@forwardimpact/libutil/runtime").Runtime}} opts
  * @returns {Promise<{tasks: TaskReport[], totals: object}>}
  */
-export async function aggregate({ inputDir, kValues, includeRuns = false }) {
-  const records = await loadRecords(inputDir);
+export async function aggregate({
+  inputDir,
+  kValues,
+  includeRuns = false,
+  runtime,
+}) {
+  if (!runtime) throw new Error("runtime is required");
+  const records = await loadRecords(inputDir, runtime);
   const grouped = groupByTask(records.records);
   const tasks = [];
   let totalRuns = 0;
@@ -429,20 +433,30 @@ function median(arr) {
 // Record loading
 // ---------------------------------------------------------------------------
-async function loadRecords(inputDir) {
+async function loadRecords(inputDir, runtime) {
   const path = join(inputDir, "results.jsonl");
-  const stream = createReadStream(path);
-  const rl = createInterface({ input: stream, crlfDelay: Infinity });
+  let content;
+  try {
+    content = await runtime.fs.readFile(path, "utf8");
+  } catch (e) {
+    // Re-throw with the stack collapsed to the message line so the CLI's
+    // error rendering stays free of node-internal async `readFile` frames
+    // (matching the pre-1370 stream-error shape the golden captured).
+    const err = new Error(e.message);
+    if (e.code) err.code = e.code;
+    err.stack = `Error: ${e.message}`;
+    throw err;
+  }
   const records = [];
   let skipped = 0;
-  for await (const line of rl) {
+  for (const line of content.split("\n")) {
     const trimmed = line.trim();
     if (!trimmed) continue;
     let record;
     try {
       record = JSON.parse(trimmed);
     } catch (e) {
-      process.stderr.write(
+      runtime.proc.stderr.write(
         `benchmark report: skipped malformed JSON line — ${e.message}\n`,
       );
       skipped++;
@@ -451,7 +465,7 @@ async function loadRecords(inputDir) {
     try {
       validateResultRecord(record);
     } catch (e) {
-      process.stderr.write(
+      runtime.proc.stderr.write(
         `benchmark report: skipped record failing schema — ${describeError(e)}\n`,
       );
       skipped++;

package/src/benchmark/result.js CHANGED

@@ -5,8 +5,8 @@
  *   - RESULT_RECORD_SCHEMA — one record per (task, runIndex) from a full
  *     benchmark run. Has a happy branch (invariants + judge present) and a
  *     pre-flight-failure branch (invariants/judgeVerdict/submission absent).
- *   - INVARIANTS_RECORD_SCHEMA — narrower output of `benchmark-invariants`
- *     (P7): ad-hoc grading without a full lifecycle.
+ *   - INVARIANTS_RECORD_SCHEMA — narrower output of `benchmark-invariants`:
+ *     ad-hoc grading without a full lifecycle.
  *
  * Validation is throw-on-mismatch so the runner can wrap every JSONL append
  * in a guard and reject schema drift at write time.

package/src/benchmark/runner.js CHANGED

@@ -14,8 +14,6 @@
  * the JSONL append is the system of record.
  */
-import { createReadStream, createWriteStream } from "node:fs";
-import { mkdir, readFile, unlink } from "node:fs/promises";
 import { createInterface } from "node:readline";
 import { join, resolve as resolvePath } from "node:path";
@@ -60,17 +58,21 @@ export class BenchmarkRunner {
    *   write a valid NDJSON trace to `workdir.agentTracePath`. Default uses
    *   `createAgentRunner` with the harness `BASE_TOOLS` allowlist. Internal
    *   testing only — not part of the public API.
+   * @param {import("@forwardimpact/libutil/runtime").Runtime} opts.runtime -
+   *   Injected ambient collaborators (`fs`, `subprocess`, `clock`, `proc`),
+   *   threaded into the installers, workdir manager, invariants, and judge.
    * @param {Function} [opts.runInvariants] - Test seam: replaces `runInvariants`.
-   *   Same contract as `runInvariants(task, ctx)`. Internal testing only.
+   *   Same contract as `runInvariants(task, ctx, runtime)`. Internal testing only.
    * @param {Function} [opts.runJudge] - Test seam: replaces `runJudge`. Same
-   *   contract as `runJudge(task, workdir, invariants, deps)`. Internal testing
-   *   only.
+   *   contract as `runJudge(task, workdir, invariants, deps)` (deps carries
+   *   `runtime`). Internal testing only.
    * @param {Function} [opts.installApm] - Test seam: replaces `installApm`.
-   *   Same contract as `installApm(family, outputDir)`. Lets tests inject a
-   *   fake `apm` spawn (or skip the install entirely) so the suite never
-   *   shells out to a real `apm` binary. Internal testing only.
+   *   Same contract as `installApm(family, outputDir, runtime)`. Lets tests
+   *   inject a fake subprocess (or skip the install entirely) so the suite
+   *   never shells out to a real `apm` binary. Internal testing only.
    * @param {Function} [opts.installNpm] - Test seam: replaces `installNpm`.
-   *   Same contract as `installNpm(family, stagingDir)`. Internal testing only.
+   *   Same contract as `installNpm(family, stagingDir, runtime)`. Internal
+   *   testing only.
    */
   constructor({
     family,
@@ -84,6 +86,7 @@ export class BenchmarkRunner {
     allowedTools,
     maxTurns,
     termGraceMs,
+    runtime,
     // Test seams — default to the real implementations.
     runAgent,
     runInvariants: runInvariantsHook,
@@ -91,12 +94,8 @@ export class BenchmarkRunner {
     installApm: installApmHook,
     installNpm: installNpmHook,
   }) {
-    if (!family) throw new Error("family is required");
-    if (!Number.isInteger(runs) || runs < 1)
-      throw new Error("runs must be an integer ≥ 1");
-    if (!output) throw new Error("output is required");
-    if (!agentModel) throw new Error("agentModel is required");
-    if (!query) throw new Error("query is required");
+    validateRunnerArgs({ family, runs, output, agentModel, query, runtime });
+    this.runtime = runtime;
     this.familyInput = family;
     this.runs = runs;
     this.output = output;
@@ -123,15 +122,16 @@ export class BenchmarkRunner {
    * @returns {AsyncGenerator<object>}
    */
   async *run() {
+    const runtime = this.runtime;
     const family =
       typeof this.familyInput === "string"
-        ? await loadTaskFamily(this.familyInput)
+        ? await loadTaskFamily(this.familyInput, runtime)
         : this.familyInput;
-    await mkdir(this.output, { recursive: true });
+    await runtime.fs.mkdir(this.output, { recursive: true });
     const { stagingDir, skillSetHash, judgeProfilesDir } =
-      await this._installApmHook(family, this.output);
-    await this._installNpmHook(family, stagingDir);
+      await this._installApmHook(family, this.output, runtime);
+    await this._installNpmHook(family, stagingDir, runtime);
     const tasks = family.tasks();
     if (this.profiles.judge) {
@@ -139,6 +139,7 @@ export class BenchmarkRunner {
         family,
         judgeProfilesDir,
         this.profiles.judge,
+        runtime,
       );
     }
@@ -147,10 +148,13 @@ export class BenchmarkRunner {
       runOutputDir: this.output,
       termGraceMs: this.termGraceMs,
       familyRootPath: family.rootPath,
+      runtime,
     });
     const resultsPath = join(this.output, "results.jsonl");
-    const resultsStream = createWriteStream(resultsPath, { flags: "a" });
+    const resultsStream = runtime.fs.createWriteStream(resultsPath, {
+      flags: "a",
+    });
     try {
       for (const task of tasks) {
         for (let runIndex = 0; runIndex < this.runs; runIndex++) {
@@ -172,7 +176,7 @@ export class BenchmarkRunner {
   }
   async #runOne(family, wm, task, runIndex, skillSetHash, judgeProfilesDir) {
-    const t0 = Date.now();
+    const t0 = this.runtime.clock.now();
     const workdir = await wm.start(task, runIndex);
     try {
       if (workdir.preflightError) {
@@ -182,7 +186,7 @@ export class BenchmarkRunner {
           workdir,
           skillSetHash,
           familyRevision: family.familyRevision,
-          durationMs: Date.now() - t0,
+          durationMs: this.runtime.clock.now() - t0,
         });
         return this.#validateOrFallback(
           record,
@@ -191,11 +195,15 @@ export class BenchmarkRunner {
       }
       const { costUsd, turns, submission, agentError } =
         await this.#runAgentSafe(task, workdir);
-      const invariants = await this._runInvariantsHook(task, {
-        cwd: workdir.cwd,
-        port: workdir.port,
-        runDir: workdir.runDir,
-      });
+      const invariants = await this._runInvariantsHook(
+        task,
+        {
+          cwd: workdir.cwd,
+          port: workdir.port,
+          runDir: workdir.runDir,
+        },
+        this.runtime,
+      );
       let judgeVerdict = null;
       if (task.paths.judge) {
         const judgeContext = await this.#buildJudgeContext(
@@ -212,6 +220,7 @@ export class BenchmarkRunner {
             model: this.judgeModel,
             judgeProfile: this.profiles.judge ?? undefined,
             profilesDir: judgeProfilesDir,
+            runtime: this.runtime,
           },
           judgeContext,
         );
@@ -245,7 +254,7 @@ export class BenchmarkRunner {
         },
         skillSetHash,
         familyRevision: family.familyRevision,
-        durationMs: Date.now() - t0,
+        durationMs: this.runtime.clock.now() - t0,
         ...(agentError && { agentError }),
       };
       return this.#validateOrFallback(record, resultsRecordKey(task, runIndex));
@@ -283,10 +292,11 @@ export class BenchmarkRunner {
    * agent.ndjson and supervisor.ndjson and extract cost/turns/submission.
    */
   async #runAgent(task, workdir) {
+    const fs = this.runtime.fs;
     const combinedPath = join(workdir.runDir, ".combined.ndjson");
-    const combinedStream = createWriteStream(combinedPath);
+    const combinedStream = fs.createWriteStream(combinedPath);
     const supervisorInstructions = task.paths.supervisor
-      ? await readFile(task.paths.supervisor, "utf8").catch(() => null)
+      ? await fs.readFile(task.paths.supervisor, "utf8").catch(() => null)
       : null;
     const supervisor = createSupervisor({
       supervisorCwd: workdir.cwd,
@@ -301,9 +311,11 @@ export class BenchmarkRunner {
       ...(supervisorInstructions && { taskAmend: supervisorInstructions }),
       redactor: createRedactor({
         allowlist: [...DEFAULT_ENV_ALLOWLIST, ...(workdir.envNames ?? [])],
+        runtime: this.runtime,
       }),
+      runtime: this.runtime,
     });
-    const instructions = await readFile(task.paths.instructions, "utf8");
+    const instructions = await fs.readFile(task.paths.instructions, "utf8");
     let agentError = null;
     try {
       const result = await supervisor.run(instructions);
@@ -316,16 +328,21 @@ export class BenchmarkRunner {
       await new Promise((r) => combinedStream.end(r));
     }
     const summary = await splitAndSummarize(
+      this.runtime,
       combinedPath,
       workdir.agentTracePath,
       workdir.supervisorTracePath,
     );
-    await unlink(combinedPath).catch(() => {});
+    await fs.unlink(combinedPath).catch(() => {});
     return { ...summary, agentError };
   }
   async #buildJudgeContext(task, workdir, skillSetHash) {
-    const agentInstructions = await readFile(task.paths.instructions, "utf8");
+    const fs = this.runtime.fs;
+    const agentInstructions = await fs.readFile(
+      task.paths.instructions,
+      "utf8",
+    );
     let agentProfile = "";
     if (this.profiles.agent) {
       const profilePath = resolvePath(
@@ -333,7 +350,7 @@ export class BenchmarkRunner {
         ".claude/agents",
         `${this.profiles.agent}.md`,
       );
-      agentProfile = await readFile(profilePath, "utf8").catch(() => "");
+      agentProfile = await fs.readFile(profilePath, "utf8").catch(() => "");
     }
     return { agentInstructions, agentProfile, skillSetHash };
   }
@@ -390,6 +407,27 @@ export class BenchmarkRunner {
   }
 }
+/**
+ * Validate the required BenchmarkRunner constructor arguments. Extracted from
+ * the constructor to keep its cognitive complexity under the lint ceiling.
+ */
+function validateRunnerArgs({
+  family,
+  runs,
+  output,
+  agentModel,
+  query,
+  runtime,
+}) {
+  if (!family) throw new Error("family is required");
+  if (!Number.isInteger(runs) || runs < 1)
+    throw new Error("runs must be an integer ≥ 1");
+  if (!output) throw new Error("output is required");
+  if (!agentModel) throw new Error("agentModel is required");
+  if (!query) throw new Error("query is required");
+  if (!runtime) throw new Error("runtime is required");
+}
 function resultsRecordKey(task, runIndex) {
   return { taskId: task.id, runIndex };
 }
@@ -408,11 +446,17 @@ async function writeRecord(stream, record) {
  * `supervisorPath`.
  */
 // biome-ignore lint/complexity/noExcessiveCognitiveComplexity: stream-splitting state machine
-async function splitAndSummarize(combinedPath, agentPath, supervisorPath) {
-  const agentStream = createWriteStream(agentPath);
-  const supStream = createWriteStream(supervisorPath);
+async function splitAndSummarize(
+  runtime,
+  combinedPath,
+  agentPath,
+  supervisorPath,
+) {
+  const fs = runtime.fs;
+  const agentStream = fs.createWriteStream(agentPath);
+  const supStream = fs.createWriteStream(supervisorPath);
   const rl = createInterface({
-    input: createReadStream(combinedPath),
+    input: fs.createReadStream(combinedPath),
     crlfDelay: Infinity,
   });
   let agentCost = 0;