npm - @forwardimpact/libeval - Versions diffs - 0.1.50 → 0.1.52 - Mend

@forwardimpact/libeval 0.1.50 → 0.1.52

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48)

package/README.md +11 -8
package/bin/fit-benchmark.js +26 -27
package/bin/fit-eval.js +36 -30
package/bin/fit-trace.js +83 -57
package/package.json +1 -1
package/src/agent-runner.js +20 -12
package/src/benchmark/apm-installer.js +48 -44
package/src/benchmark/env-loader.js +35 -23
package/src/benchmark/invariants.js +128 -0
package/src/benchmark/judge.js +18 -19
package/src/benchmark/npm-installer.js +33 -33
package/src/benchmark/report.js +40 -26
package/src/benchmark/result.js +11 -11
package/src/benchmark/runner.js +90 -46
package/src/benchmark/task-family.js +78 -65
package/src/benchmark/workdir.js +100 -93
package/src/commands/assert.js +30 -22
package/src/commands/benchmark-invariants.js +74 -0
package/src/commands/benchmark-report.js +24 -15
package/src/commands/benchmark-run.js +16 -9
package/src/commands/by-discussion.js +33 -23
package/src/commands/callback.js +20 -11
package/src/commands/discuss.js +31 -13
package/src/commands/facilitate.js +21 -14
package/src/commands/output.js +15 -13
package/src/commands/run.js +28 -14
package/src/commands/supervise.js +29 -19
package/src/commands/task-input.js +10 -5
package/src/commands/tee.js +24 -9
package/src/commands/trace.js +181 -99
package/src/discuss-tools.js +48 -2
package/src/discusser.js +53 -2
package/src/events/github.js +27 -5
package/src/facilitator.js +4 -0
package/src/inbox-poller.js +84 -0
package/src/judge.js +4 -1
package/src/message-bus.js +6 -0
package/src/orchestration-loop.js +14 -4
package/src/orchestration-toolkit.js +14 -0
package/src/profile-prompt.js +22 -9
package/src/redaction.js +31 -9
package/src/reply-emitter.js +47 -0
package/src/supervisor.js +4 -0
package/src/tee-writer.js +4 -2
package/src/trace-collector.js +9 -2
package/src/trace-github.js +47 -27
package/src/benchmark/scorer.js +0 -138
package/src/commands/benchmark-score.js +0 -68

package/src/commands/run.js CHANGED Viewed

@@ -1,6 +1,6 @@
-import { createWriteStream } from "node:fs";
 import { Writable } from "node:stream";
 import { resolve } from "node:path";
+import { isoTimestamp } from "@forwardimpact/libutil";
 import { createAgentRunner } from "../agent-runner.js";
 import { composeProfilePrompt } from "../profile-prompt.js";
 import { createRedactor } from "../redaction.js";
@@ -12,10 +12,14 @@ import { createServiceConfig } from "@forwardimpact/libconfig";
 /**
  * Parse and validate run command options from parsed values.
  * @param {object} values - Parsed option values from cli.parse()
+ * @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
  * @returns {{ taskContent: string, cwd: string, model: string, maxTurns: number, outputPath: string|undefined, agentProfile: string|undefined, allowedTools: string[] }}
  */
-function parseRunOptions(values) {
-  const { task: taskContent, amend: taskAmend } = resolveTaskContent(values);
+function parseRunOptions(values, runtime) {
+  const { task: taskContent, amend: taskAmend } = resolveTaskContent(
+    values,
+    runtime,
+  );
   const maxTurnsRaw = values["max-turns"] ?? "50";
   return {
@@ -39,10 +43,11 @@ function parseRunOptions(values) {
  *
  * Usage: fit-eval run [options]
  *
- * @param {object} values - Parsed option values from cli.parse()
- * @param {string[]} args - Positional arguments
+ * @param {import("@forwardimpact/libcli").InvocationContext} ctx
+ * @returns {Promise<{ok: boolean, code?: number, error?: string}>}
  */
-export async function runRunCommand(values, _args) {
+export async function runRunCommand(ctx) {
+  const runtime = ctx.deps.runtime;
   const {
     taskContent,
     taskAmend,
@@ -53,19 +58,26 @@ export async function runRunCommand(values, _args) {
     agentProfile,
     allowedTools,
     mcpServer,
-  } = parseRunOptions(values);
+  } = parseRunOptions(ctx.options, runtime);
   // Build the redactor as the first observable side-effect after option
   // parsing — the env snapshot must freeze BEFORE any in-process
-  // process.env writes the command performs (e.g. LIBEVAL_AGENT_PROFILE).
-  const redactor = createRedactor();
+  // env writes the command performs (e.g. LIBEVAL_AGENT_PROFILE).
+  const redactor = createRedactor({ runtime });
   // When --output is specified, stream text to stdout while writing NDJSON to file.
   // Otherwise, write NDJSON directly to stdout (backwards-compatible).
-  const fileStream = outputPath ? createWriteStream(outputPath) : null;
+  const fileStream = outputPath
+    ? runtime.fs.createWriteStream(outputPath)
+    : null;
   const output = fileStream
-    ? createTeeWriter({ fileStream, textStream: process.stdout, mode: "raw" })
-    : process.stdout;
+    ? createTeeWriter({
+        fileStream,
+        textStream: runtime.proc.stdout,
+        mode: "raw",
+        now: () => isoTimestamp(runtime.clock.now()),
+      })
+    : runtime.proc.stdout;
   const counter = new SequenceCounter();
   const devNull = new Writable({
@@ -93,12 +105,13 @@ export async function runRunCommand(values, _args) {
   }
   if (agentProfile) {
-    process.env.LIBEVAL_AGENT_PROFILE = agentProfile;
+    runtime.proc.env.LIBEVAL_AGENT_PROFILE = agentProfile;
   }
   const systemPrompt = agentProfile
     ? composeProfilePrompt(agentProfile, {
         profilesDir: resolve(cwd, ".claude/agents"),
+        runtime,
       })
     : undefined;
@@ -116,6 +129,7 @@ export async function runRunCommand(values, _args) {
     taskAmend,
     mcpServers,
     redactor,
+    runtime,
   });
   const result = await runner.run(taskContent);
@@ -125,5 +139,5 @@ export async function runRunCommand(values, _args) {
     await new Promise((r) => fileStream.end(r));
   }
-  process.exit(result.success ? 0 : 1);
+  return result.success ? { ok: true } : { ok: false, code: 1, error: "" };
 }

package/src/commands/supervise.js CHANGED Viewed

@@ -1,6 +1,5 @@
-import { createWriteStream, mkdtempSync } from "node:fs";
 import { resolve, join } from "node:path";
-import { tmpdir } from "node:os";
+import { isoTimestamp } from "@forwardimpact/libutil";
 import { createSupervisor } from "../supervisor.js";
 import { createRedactor } from "../redaction.js";
 import { createTeeWriter } from "../tee-writer.js";
@@ -10,19 +9,27 @@ import { createServiceConfig } from "@forwardimpact/libconfig";
 /**
  * Parse all supervise flags from parsed values into an options object.
  * @param {object} values - Parsed option values from cli.parse()
- * @returns {object}
+ * @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
+ * @returns {Promise<object>}
  */
-export function parseSuperviseOptions(values) {
-  const { task: taskContent, amend: taskAmend } = resolveTaskContent(values);
+export async function parseSuperviseOptions(values, runtime) {
+  const { task: taskContent, amend: taskAmend } = resolveTaskContent(
+    values,
+    runtime,
+  );
   const supervisorAllowedToolsRaw = values["supervisor-allowed-tools"];
+  const tmpRoot = runtime.proc.env.TMPDIR ?? "/tmp";
+  const agentCwd = resolve(
+    values["agent-cwd"] ??
+      (await runtime.fs.mkdtemp(join(tmpRoot, "fit-eval-agent-"))),
+  );
   return {
     taskContent,
     taskAmend,
     supervisorCwd: resolve(values["supervisor-cwd"] ?? "."),
-    agentCwd: resolve(
-      values["agent-cwd"] ?? mkdtempSync(join(tmpdir(), "fit-eval-agent-")),
-    ),
+    agentCwd,
     agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
     supervisorModel: values["lead-model"] ?? "claude-opus-4-7[1m]",
     maxTurns: (() => {
@@ -50,29 +57,31 @@ export function parseSuperviseOptions(values) {
  *
  * Usage: fit-eval supervise [options]
  *
- * @param {object} values - Parsed option values from cli.parse()
- * @param {string[]} args - Positional arguments
+ * @param {import("@forwardimpact/libcli").InvocationContext} ctx
+ * @returns {Promise<{ok: boolean, code?: number, error?: string}>}
  */
-export async function runSuperviseCommand(values, _args) {
-  const opts = parseSuperviseOptions(values);
+export async function runSuperviseCommand(ctx) {
+  const runtime = ctx.deps.runtime;
+  const opts = await parseSuperviseOptions(ctx.options, runtime);
   // Build the redactor as the first observable side-effect after option
   // parsing — the env snapshot must freeze BEFORE any in-process
-  // process.env writes the command performs (e.g. LIBEVAL_AGENT_PROFILE).
-  const redactor = createRedactor();
+  // env writes the command performs (e.g. LIBEVAL_AGENT_PROFILE).
+  const redactor = createRedactor({ runtime });
   // When --output is specified, stream text to stdout while writing NDJSON to file.
   // Otherwise, write NDJSON directly to stdout (backwards-compatible).
   const fileStream = opts.outputPath
-    ? createWriteStream(opts.outputPath)
+    ? runtime.fs.createWriteStream(opts.outputPath)
     : null;
   const output = fileStream
     ? createTeeWriter({
         fileStream,
-        textStream: process.stdout,
+        textStream: runtime.proc.stdout,
         mode: "supervised",
+        now: () => isoTimestamp(runtime.clock.now()),
       })
-    : process.stdout;
+    : runtime.proc.stdout;
   let agentMcpServers = null;
   if (opts.mcpServer) {
@@ -88,7 +97,7 @@ export async function runSuperviseCommand(values, _args) {
   }
   if (opts.agentProfile) {
-    process.env.LIBEVAL_AGENT_PROFILE = opts.agentProfile;
+    runtime.proc.env.LIBEVAL_AGENT_PROFILE = opts.agentProfile;
   }
   const { query } = await import("@anthropic-ai/claude-agent-sdk");
@@ -107,6 +116,7 @@ export async function runSuperviseCommand(values, _args) {
     taskAmend: opts.taskAmend,
     agentMcpServers,
     redactor,
+    runtime,
   });
   const result = await supervisor.run(opts.taskContent);
@@ -116,5 +126,5 @@ export async function runSuperviseCommand(values, _args) {
     await new Promise((r) => fileStream.end(r));
   }
-  process.exit(result.success ? 0 : 1);
+  return result.success ? { ok: true } : { ok: false, code: 1, error: "" };
 }

package/src/commands/task-input.js CHANGED Viewed

@@ -1,4 +1,3 @@
-import { readFileSync } from "node:fs";
 import { composeTaskFromGitHubEvent } from "../events/github.js";
 /**
@@ -11,9 +10,12 @@ import { composeTaskFromGitHubEvent } from "../events/github.js";
  * works as before.
  *
  * @param {object} values - Parsed option values from cli.parse()
+ * @param {import("@forwardimpact/libutil/runtime").Runtime} runtime - Ambient
+ *   collaborators; `fsSync.readFileSync` loads `--task-file`/`--task-event`
+ *   and `proc.env` resolves `GITHUB_EVENT_NAME`.
  * @returns {{ task: string, amend: string | undefined }}
  */
-export function resolveTaskContent(values) {
+export function resolveTaskContent(values, runtime) {
   const taskFile = values["task-file"];
   const taskText = values["task-text"];
   const taskEvent = values["task-event"];
@@ -33,17 +35,20 @@ export function resolveTaskContent(values) {
   const amendFlag = values["task-amend"] ?? undefined;
   if (taskFile) {
-    return { task: readFileSync(taskFile, "utf8"), amend: amendFlag };
+    return {
+      task: runtime.fsSync.readFileSync(taskFile, "utf8"),
+      amend: amendFlag,
+    };
   }
   if (taskText) {
     return { task: taskText, amend: amendFlag };
   }
-  const eventName = process.env.GITHUB_EVENT_NAME;
+  const eventName = runtime.proc.env.GITHUB_EVENT_NAME;
   if (!eventName) {
     throw new Error("--task-event requires GITHUB_EVENT_NAME to be set");
   }
-  const payload = JSON.parse(readFileSync(taskEvent, "utf8"));
+  const payload = JSON.parse(runtime.fsSync.readFileSync(taskEvent, "utf8"));
   const composed = composeTaskFromGitHubEvent(payload, eventName);
   return { task: composed.task, amend: amendFlag ?? composed.amend };
 }

package/src/commands/tee.js CHANGED Viewed

@@ -1,32 +1,47 @@
-import { createWriteStream } from "fs";
 import { PassThrough } from "node:stream";
 import { pipeline } from "node:stream/promises";
+import { isoTimestamp } from "@forwardimpact/libutil";
 import { createTeeWriter } from "../tee-writer.js";
 /**
  * Tee command — stream text output to stdout while optionally saving the raw
- * NDJSON to a file. Processes stdin line-by-line for streaming output.
+ * NDJSON to a file. Reads stdin line-by-line through the injected runtime and
+ * re-delimits each record with a newline so the TeeWriter's line splitter sees
+ * the same framing the raw byte stream produced.
  *
  * Usage: fit-eval tee [output.ndjson] < trace.ndjson
  *
- * @param {object} values - Parsed option values from cli.parse()
- * @param {string[]} args - Positional arguments (optional output file path)
+ * @param {import("@forwardimpact/libcli").InvocationContext} ctx
+ * @returns {Promise<{ok: boolean, code?: number, error?: string}>}
  */
-export async function runTeeCommand(values, args) {
-  const outputPath = args.find((a) => !a.startsWith("-")) ?? null;
-  const fileStream = outputPath ? createWriteStream(outputPath) : null;
+export async function runTeeCommand(ctx) {
+  const runtime = ctx.deps.runtime;
+  const outputPath = ctx.args.output ?? null;
+  const fileStream = outputPath
+    ? runtime.fs.createWriteStream(outputPath)
+    : null;
   // TeeWriter requires a fileStream; when no output file is specified,
   // use a PassThrough as a no-op sink (NDJSON is not saved).
   const sink = fileStream ?? new PassThrough();
   const tee = createTeeWriter({
     fileStream: sink,
-    textStream: process.stdout,
+    textStream: runtime.proc.stdout,
     mode: "raw",
+    now: () => isoTimestamp(runtime.clock.now()),
   });
   try {
-    await pipeline(process.stdin, tee);
+    // `runtime.proc.stdin` yields newline-stripped lines; re-append `\n` so the
+    // TeeWriter's `_write` line splitter frames records exactly as it did when
+    // the raw byte stream was piped into it.
+    const lines = (async function* () {
+      for await (const line of runtime.proc.stdin) yield `${line}\n`;
+    })();
+    await pipeline(lines, tee);
+    return { ok: true };
+  } catch (error) {
+    return { ok: false, code: 1, error: error.message };
   } finally {
     if (fileStream) {
       await new Promise((resolve, reject) => {