npm - @forwardimpact/libeval - Versions diffs - 0.1.49 → 0.1.51 - Mend

@forwardimpact/libeval 0.1.49 → 0.1.51

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40) hide show

package/README.md +11 -8
package/bin/fit-benchmark.js +26 -27
package/bin/fit-eval.js +76 -78
package/bin/fit-trace.js +83 -57
package/package.json +2 -2
package/src/agent-runner.js +23 -13
package/src/benchmark/env-loader.js +35 -23
package/src/benchmark/{scorer.js → invariants.js} +14 -12
package/src/benchmark/judge.js +5 -8
package/src/benchmark/npm-installer.js +87 -0
package/src/benchmark/report.js +15 -15
package/src/benchmark/result.js +11 -11
package/src/benchmark/runner.js +17 -11
package/src/benchmark/task-family.js +6 -4
package/src/benchmark/workdir.js +23 -3
package/src/commands/assert.js +30 -22
package/src/commands/benchmark-invariants.js +74 -0
package/src/commands/benchmark-report.js +23 -15
package/src/commands/benchmark-run.js +22 -7
package/src/commands/by-discussion.js +29 -18
package/src/commands/callback.js +20 -11
package/src/commands/discuss.js +30 -21
package/src/commands/facilitate.js +20 -21
package/src/commands/output.js +11 -12
package/src/commands/run.js +24 -21
package/src/commands/supervise.js +27 -27
package/src/commands/task-input.js +54 -0
package/src/commands/trace.js +174 -97
package/src/discuss-tools.js +48 -2
package/src/discusser.js +49 -2
package/src/events/github.js +155 -0
package/src/inbox-poller.js +84 -0
package/src/index.js +10 -0
package/src/judge.js +1 -1
package/src/message-bus.js +6 -0
package/src/orchestration-loop.js +19 -5
package/src/orchestration-toolkit.js +14 -0
package/src/redaction.js +31 -9
package/src/reply-emitter.js +47 -0
package/src/commands/benchmark-score.js +0 -68

package/src/commands/discuss.js CHANGED Viewed

@@ -1,8 +1,9 @@
-import { readFileSync, createWriteStream } from "node:fs";
+import { createWriteStream } from "node:fs";
 import { resolve } from "node:path";
 import { createDiscusser } from "../discusser.js";
 import { createRedactor } from "../redaction.js";
 import { createTeeWriter } from "../tee-writer.js";
+import { resolveTaskContent } from "./task-input.js";
 function parseAgentProfiles(raw, cwd, maxTurns) {
   if (!raw) return [];
@@ -16,19 +17,14 @@ function parseAgentProfiles(raw, cwd, maxTurns) {
  * Parse and validate discuss command options. Exported so tests can verify
  * defaults and the legacy-flag clean break.
  * @param {object} values - Parsed option values
+ * @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
  * @returns {object}
  */
-// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: CLI option validation
-export function parseDiscussOptions(values) {
-  const taskFile = values["task-file"];
-  const taskText = values["task-text"];
-  if (taskFile && taskText)
-    throw new Error("--task-file and --task-text are mutually exclusive");
-  if (!taskFile && !taskText)
-    throw new Error("--task-file or --task-text is required");
-  const taskAmend = values["task-amend"] ?? undefined;
-  const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
+export function parseDiscussOptions(values, runtime) {
+  const { task: taskContent, amend: taskAmend } = resolveTaskContent(
+    values,
+    runtime,
+  );
   const profilesRaw = values["agent-profiles"];
   const agentCwd = resolve(values["agent-cwd"] ?? ".");
@@ -48,6 +44,9 @@ export function parseDiscussOptions(values) {
     }
   }
+  const maxLeadTurnsRaw = values["max-lead-turns"] ?? "200";
+  const maxLeadTurns = parseInt(maxLeadTurnsRaw, 10);
   return {
     taskContent,
     taskAmend,
@@ -56,9 +55,13 @@ export function parseDiscussOptions(values) {
     leadModel: values["lead-model"] ?? "claude-opus-4-7[1m]",
     agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
     maxTurns,
+    maxLeadTurns,
     outputPath: values.output,
     discussionId: values["discussion-id"] ?? null,
     resumeContext,
+    callbackUrl: runtime.proc.env.CALLBACK_URL ?? null,
+    inboxUrl: runtime.proc.env.INBOX_URL ?? null,
+    correlationId: runtime.proc.env.CORRELATION_ID ?? null,
   };
 }
@@ -67,13 +70,14 @@ export function parseDiscussOptions(values) {
  * semantics, threading `discussion_id` through the trace so multi-run
  * conversations are queryable as one.
  *
- * @param {object} values - Parsed option values
- * @param {string[]} _args - Positional arguments
+ * @param {import("@forwardimpact/libcli").InvocationContext} ctx
+ * @returns {Promise<{ok: boolean, code?: number, error?: string}>}
  */
-export async function runDiscussCommand(values, _args) {
-  const opts = parseDiscussOptions(values);
+export async function runDiscussCommand(ctx) {
+  const runtime = ctx.deps.runtime;
+  const opts = parseDiscussOptions(ctx.options, runtime);
-  const redactor = createRedactor();
+  const redactor = createRedactor({ runtime });
   const fileStream = opts.outputPath
     ? createWriteStream(opts.outputPath)
@@ -81,13 +85,13 @@ export async function runDiscussCommand(values, _args) {
   const output = fileStream
     ? createTeeWriter({
         fileStream,
-        textStream: process.stdout,
+        textStream: runtime.proc.stdout,
         mode: "supervised",
       })
-    : process.stdout;
+    : runtime.proc.stdout;
   if (opts.leadProfile) {
-    process.env.LIBEVAL_AGENT_PROFILE = opts.leadProfile;
+    runtime.proc.env.LIBEVAL_AGENT_PROFILE = opts.leadProfile;
   }
   const { query } = await import("@anthropic-ai/claude-agent-sdk");
@@ -101,8 +105,13 @@ export async function runDiscussCommand(values, _args) {
     query,
     output,
     maxTurns: opts.maxTurns,
+    maxLeadTurns: opts.maxLeadTurns,
     taskAmend: opts.taskAmend,
     redactor,
+    callbackUrl: opts.callbackUrl,
+    inboxUrl: opts.inboxUrl,
+    correlationId: opts.correlationId,
+    runtime,
   });
   const result = await discusser.run(opts.taskContent);
@@ -112,5 +121,5 @@ export async function runDiscussCommand(values, _args) {
     await new Promise((r) => fileStream.end(r));
   }
-  process.exit(result.success ? 0 : 1);
+  return result.success ? { ok: true } : { ok: false, code: 1, error: "" };
 }

package/src/commands/facilitate.js CHANGED Viewed

@@ -1,8 +1,9 @@
-import { readFileSync, createWriteStream } from "node:fs";
+import { createWriteStream } from "node:fs";
 import { resolve } from "node:path";
 import { createFacilitator } from "../facilitator.js";
 import { createRedactor } from "../redaction.js";
 import { createTeeWriter } from "../tee-writer.js";
+import { resolveTaskContent } from "./task-input.js";
 /**
  * Parse comma-separated agent profile names into structured configs.
@@ -22,18 +23,14 @@ function parseAgentProfiles(raw, cwd, maxTurns) {
  * coverage of the `--max-turns` → per-agent threading contract; not part
  * of the package's public API.
  * @param {object} values - Parsed option values
+ * @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
  * @returns {object} Parsed options
  */
-export function parseFacilitateOptions(values) {
-  const taskFile = values["task-file"];
-  const taskText = values["task-text"];
-  if (taskFile && taskText)
-    throw new Error("--task-file and --task-text are mutually exclusive");
-  if (!taskFile && !taskText)
-    throw new Error("--task-file or --task-text is required");
-  const taskAmend = values["task-amend"] ?? undefined;
-  const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
+export function parseFacilitateOptions(values, runtime) {
+  const { task: taskContent, amend: taskAmend } = resolveTaskContent(
+    values,
+    runtime,
+  );
   const profilesRaw = values["agent-profiles"];
   if (!profilesRaw) throw new Error("--agent-profiles is required");
@@ -66,16 +63,17 @@ export function parseFacilitateOptions(values) {
  *
  * Usage: fit-eval facilitate [options]
  *
- * @param {object} values - Parsed option values from cli.parse()
- * @param {string[]} _args - Positional arguments
+ * @param {import("@forwardimpact/libcli").InvocationContext} ctx
+ * @returns {Promise<{ok: boolean, code?: number, error?: string}>}
  */
-export async function runFacilitateCommand(values, _args) {
-  const opts = parseFacilitateOptions(values);
+export async function runFacilitateCommand(ctx) {
+  const runtime = ctx.deps.runtime;
+  const opts = parseFacilitateOptions(ctx.options, runtime);
   // Build the redactor as the first observable side-effect after option
   // parsing — the env snapshot must freeze BEFORE any in-process
-  // process.env writes the command performs (e.g. LIBEVAL_AGENT_PROFILE).
-  const redactor = createRedactor();
+  // env writes the command performs (e.g. LIBEVAL_AGENT_PROFILE).
+  const redactor = createRedactor({ runtime });
   const fileStream = opts.outputPath
     ? createWriteStream(opts.outputPath)
@@ -83,13 +81,13 @@ export async function runFacilitateCommand(values, _args) {
   const output = fileStream
     ? createTeeWriter({
         fileStream,
-        textStream: process.stdout,
+        textStream: runtime.proc.stdout,
         mode: "supervised",
       })
-    : process.stdout;
+    : runtime.proc.stdout;
   if (opts.facilitatorProfile) {
-    process.env.LIBEVAL_AGENT_PROFILE = opts.facilitatorProfile;
+    runtime.proc.env.LIBEVAL_AGENT_PROFILE = opts.facilitatorProfile;
   }
   const { query } = await import("@anthropic-ai/claude-agent-sdk");
@@ -104,6 +102,7 @@ export async function runFacilitateCommand(values, _args) {
     facilitatorProfile: opts.facilitatorProfile,
     taskAmend: opts.taskAmend,
     redactor,
+    runtime,
   });
   const result = await facilitator.run(opts.taskContent);
@@ -113,5 +112,5 @@ export async function runFacilitateCommand(values, _args) {
     await new Promise((r) => fileStream.end(r));
   }
-  process.exit(result.success ? 0 : 1);
+  return result.success ? { ok: true } : { ok: false, code: 1, error: "" };
 }

package/src/commands/output.js CHANGED Viewed

@@ -6,29 +6,28 @@ import { createTraceCollector } from "@forwardimpact/libeval";
  *
  * Usage: fit-eval output [--format=json|text] < trace.ndjson
  *
- * @param {object} values - Parsed option values from cli.parse()
- * @param {string[]} args - Positional arguments
+ * @param {import("@forwardimpact/libcli").InvocationContext} ctx
+ * @returns {Promise<{ok: true}>}
  */
-export async function runOutputCommand(values, _args) {
+export async function runOutputCommand(ctx) {
+  const values = ctx.options;
+  const runtime = ctx.deps.runtime;
   const format =
     values.format === "text" || values.format === "json"
       ? values.format
       : "json";
   const collector = createTraceCollector();
-  const chunks = [];
-  for await (const chunk of process.stdin) {
-    chunks.push(chunk);
-  }
-  const input = Buffer.concat(chunks).toString("utf8");
-  for (const line of input.split("\n")) {
+  // `runtime.proc.stdin` is an AsyncIterable of UTF-8 lines (newline-split by
+  // the runtime), so each yielded value is exactly one NDJSON record.
+  for await (const line of runtime.proc.stdin) {
     collector.addLine(line);
   }
   if (format === "text") {
-    process.stdout.write(collector.toText() + "\n");
+    runtime.proc.stdout.write(collector.toText() + "\n");
   } else {
-    process.stdout.write(JSON.stringify(collector.toJSON()) + "\n");
+    runtime.proc.stdout.write(JSON.stringify(collector.toJSON()) + "\n");
   }
+  return { ok: true };
 }

package/src/commands/run.js CHANGED Viewed

@@ -1,4 +1,4 @@
-import { readFileSync, createWriteStream } from "node:fs";
+import { createWriteStream } from "node:fs";
 import { Writable } from "node:stream";
 import { resolve } from "node:path";
 import { createAgentRunner } from "../agent-runner.js";
@@ -6,24 +6,21 @@ import { composeProfilePrompt } from "../profile-prompt.js";
 import { createRedactor } from "../redaction.js";
 import { createTeeWriter } from "../tee-writer.js";
 import { SequenceCounter } from "../sequence-counter.js";
+import { resolveTaskContent } from "./task-input.js";
 import { createServiceConfig } from "@forwardimpact/libconfig";
 /**
  * Parse and validate run command options from parsed values.
  * @param {object} values - Parsed option values from cli.parse()
+ * @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
  * @returns {{ taskContent: string, cwd: string, model: string, maxTurns: number, outputPath: string|undefined, agentProfile: string|undefined, allowedTools: string[] }}
  */
-function parseRunOptions(values) {
-  const taskFile = values["task-file"];
-  const taskText = values["task-text"];
-  if (taskFile && taskText)
-    throw new Error("--task-file and --task-text are mutually exclusive");
-  if (!taskFile && !taskText)
-    throw new Error("--task-file or --task-text is required");
+function parseRunOptions(values, runtime) {
+  const { task: taskContent, amend: taskAmend } = resolveTaskContent(
+    values,
+    runtime,
+  );
   const maxTurnsRaw = values["max-turns"] ?? "50";
-  const taskAmend = values["task-amend"] ?? undefined;
-  const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
   return {
     taskContent,
@@ -46,10 +43,11 @@ function parseRunOptions(values) {
  *
  * Usage: fit-eval run [options]
  *
- * @param {object} values - Parsed option values from cli.parse()
- * @param {string[]} args - Positional arguments
+ * @param {import("@forwardimpact/libcli").InvocationContext} ctx
+ * @returns {Promise<{ok: boolean, code?: number, error?: string}>}
  */
-export async function runRunCommand(values, _args) {
+export async function runRunCommand(ctx) {
+  const runtime = ctx.deps.runtime;
   const {
     taskContent,
     taskAmend,
@@ -60,19 +58,23 @@ export async function runRunCommand(values, _args) {
     agentProfile,
     allowedTools,
     mcpServer,
-  } = parseRunOptions(values);
+  } = parseRunOptions(ctx.options, runtime);
   // Build the redactor as the first observable side-effect after option
   // parsing — the env snapshot must freeze BEFORE any in-process
-  // process.env writes the command performs (e.g. LIBEVAL_AGENT_PROFILE).
-  const redactor = createRedactor();
+  // env writes the command performs (e.g. LIBEVAL_AGENT_PROFILE).
+  const redactor = createRedactor({ runtime });
   // When --output is specified, stream text to stdout while writing NDJSON to file.
   // Otherwise, write NDJSON directly to stdout (backwards-compatible).
   const fileStream = outputPath ? createWriteStream(outputPath) : null;
   const output = fileStream
-    ? createTeeWriter({ fileStream, textStream: process.stdout, mode: "raw" })
-    : process.stdout;
+    ? createTeeWriter({
+        fileStream,
+        textStream: runtime.proc.stdout,
+        mode: "raw",
+      })
+    : runtime.proc.stdout;
   const counter = new SequenceCounter();
   const devNull = new Writable({
@@ -100,7 +102,7 @@ export async function runRunCommand(values, _args) {
   }
   if (agentProfile) {
-    process.env.LIBEVAL_AGENT_PROFILE = agentProfile;
+    runtime.proc.env.LIBEVAL_AGENT_PROFILE = agentProfile;
   }
   const systemPrompt = agentProfile
@@ -123,6 +125,7 @@ export async function runRunCommand(values, _args) {
     taskAmend,
     mcpServers,
     redactor,
+    runtime,
   });
   const result = await runner.run(taskContent);
@@ -132,5 +135,5 @@ export async function runRunCommand(values, _args) {
     await new Promise((r) => fileStream.end(r));
   }
-  process.exit(result.success ? 0 : 1);
+  return result.success ? { ok: true } : { ok: false, code: 1, error: "" };
 }

package/src/commands/supervise.js CHANGED Viewed

@@ -1,37 +1,35 @@
-import { readFileSync, createWriteStream, mkdtempSync } from "node:fs";
+import { createWriteStream } from "node:fs";
 import { resolve, join } from "node:path";
-import { tmpdir } from "node:os";
 import { createSupervisor } from "../supervisor.js";
 import { createRedactor } from "../redaction.js";
 import { createTeeWriter } from "../tee-writer.js";
+import { resolveTaskContent } from "./task-input.js";
 import { createServiceConfig } from "@forwardimpact/libconfig";
 /**
  * Parse all supervise flags from parsed values into an options object.
  * @param {object} values - Parsed option values from cli.parse()
- * @returns {object}
+ * @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
+ * @returns {Promise<object>}
  */
-// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: CLI option validation
-export function parseSuperviseOptions(values) {
-  const taskFile = values["task-file"];
-  const taskText = values["task-text"];
-  if (taskFile && taskText)
-    throw new Error("--task-file and --task-text are mutually exclusive");
-  if (!taskFile && !taskText)
-    throw new Error("--task-file or --task-text is required");
+export async function parseSuperviseOptions(values, runtime) {
+  const { task: taskContent, amend: taskAmend } = resolveTaskContent(
+    values,
+    runtime,
+  );
   const supervisorAllowedToolsRaw = values["supervisor-allowed-tools"];
-  const taskAmend = values["task-amend"] ?? undefined;
-  const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
+  const tmpRoot = runtime.proc.env.TMPDIR ?? "/tmp";
+  const agentCwd = resolve(
+    values["agent-cwd"] ??
+      (await runtime.fs.mkdtemp(join(tmpRoot, "fit-eval-agent-"))),
+  );
   return {
     taskContent,
     taskAmend,
     supervisorCwd: resolve(values["supervisor-cwd"] ?? "."),
-    agentCwd: resolve(
-      values["agent-cwd"] ?? mkdtempSync(join(tmpdir(), "fit-eval-agent-")),
-    ),
+    agentCwd,
     agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
     supervisorModel: values["lead-model"] ?? "claude-opus-4-7[1m]",
     maxTurns: (() => {
@@ -59,16 +57,17 @@ export function parseSuperviseOptions(values) {
  *
  * Usage: fit-eval supervise [options]
  *
- * @param {object} values - Parsed option values from cli.parse()
- * @param {string[]} args - Positional arguments
+ * @param {import("@forwardimpact/libcli").InvocationContext} ctx
+ * @returns {Promise<{ok: boolean, code?: number, error?: string}>}
  */
-export async function runSuperviseCommand(values, _args) {
-  const opts = parseSuperviseOptions(values);
+export async function runSuperviseCommand(ctx) {
+  const runtime = ctx.deps.runtime;
+  const opts = await parseSuperviseOptions(ctx.options, runtime);
   // Build the redactor as the first observable side-effect after option
   // parsing — the env snapshot must freeze BEFORE any in-process
-  // process.env writes the command performs (e.g. LIBEVAL_AGENT_PROFILE).
-  const redactor = createRedactor();
+  // env writes the command performs (e.g. LIBEVAL_AGENT_PROFILE).
+  const redactor = createRedactor({ runtime });
   // When --output is specified, stream text to stdout while writing NDJSON to file.
   // Otherwise, write NDJSON directly to stdout (backwards-compatible).
@@ -78,10 +77,10 @@ export async function runSuperviseCommand(values, _args) {
   const output = fileStream
     ? createTeeWriter({
         fileStream,
-        textStream: process.stdout,
+        textStream: runtime.proc.stdout,
         mode: "supervised",
       })
-    : process.stdout;
+    : runtime.proc.stdout;
   let agentMcpServers = null;
   if (opts.mcpServer) {
@@ -97,7 +96,7 @@ export async function runSuperviseCommand(values, _args) {
   }
   if (opts.agentProfile) {
-    process.env.LIBEVAL_AGENT_PROFILE = opts.agentProfile;
+    runtime.proc.env.LIBEVAL_AGENT_PROFILE = opts.agentProfile;
   }
   const { query } = await import("@anthropic-ai/claude-agent-sdk");
@@ -116,6 +115,7 @@ export async function runSuperviseCommand(values, _args) {
     taskAmend: opts.taskAmend,
     agentMcpServers,
     redactor,
+    runtime,
   });
   const result = await supervisor.run(opts.taskContent);
@@ -125,5 +125,5 @@ export async function runSuperviseCommand(values, _args) {
     await new Promise((r) => fileStream.end(r));
   }
-  process.exit(result.success ? 0 : 1);
+  return result.success ? { ok: true } : { ok: false, code: 1, error: "" };
 }

package/src/commands/task-input.js ADDED Viewed

@@ -0,0 +1,54 @@
+import { composeTaskFromGitHubEvent } from "../events/github.js";
+/**
+ * Resolve `--task-file` / `--task-text` / `--task-event` into the task pair the
+ * runner consumes. Exactly one of the three must be set. For `--task-event`,
+ * libeval reads the event payload and extracts both the main task (from the
+ * template that matches `$GITHUB_EVENT_NAME` + `payload.action`) and the
+ * amendment (from `payload.inputs?.prompt`) — so the workflow doesn't need to
+ * wire `--task-amend` separately. For the other two modes, `--task-amend`
+ * works as before.
+ *
+ * A flag counts as "set" only when its value is truthy, so an empty string is
+ * treated the same as an omitted flag. In `--task-event` mode an explicit
+ * `--task-amend` still takes precedence; the event-derived amendment is used
+ * only when `--task-amend` is null or undefined.
+ *
+ * @param {object} values - Parsed option values from cli.parse()
+ * @param {import("@forwardimpact/libutil/runtime").Runtime} runtime - Ambient
+ *   collaborators; `fsSync.readFileSync` loads `--task-file`/`--task-event`
+ *   and `proc.env` resolves `GITHUB_EVENT_NAME`.
+ * @returns {{ task: string, amend: string | undefined }}
+ * @throws {Error} When none of the three task flags is set, or more than one.
+ * @throws {Error} When `--task-event` is used and `GITHUB_EVENT_NAME` is unset
+ *   or empty.
+ * @throws {SyntaxError} When the `--task-event` file does not contain valid
+ *   JSON (raised by `JSON.parse`).
+ */
+export function resolveTaskContent(values, runtime) {
+  const taskFile = values["task-file"];
+  const taskText = values["task-text"];
+  const taskEvent = values["task-event"];
+  const set = [taskFile, taskText, taskEvent].filter(Boolean).length;
+  if (set === 0) {
+    throw new Error(
+      "one of --task-file, --task-text, --task-event is required",
+    );
+  }
+  if (set > 1) {
+    throw new Error(
+      "--task-file, --task-text, --task-event are mutually exclusive",
+    );
+  }
+  const amendFlag = values["task-amend"] ?? undefined;
+  if (taskFile) {
+    return {
+      task: runtime.fsSync.readFileSync(taskFile, "utf8"),
+      amend: amendFlag,
+    };
+  }
+  if (taskText) {
+    return { task: taskText, amend: amendFlag };
+  }
+  const eventName = runtime.proc.env.GITHUB_EVENT_NAME;
+  if (!eventName) {
+    throw new Error("--task-event requires GITHUB_EVENT_NAME to be set");
+  }
+  const payload = JSON.parse(runtime.fsSync.readFileSync(taskEvent, "utf8"));
+  const composed = composeTaskFromGitHubEvent(payload, eventName);
+  return { task: composed.task, amend: amendFlag ?? composed.amend };
+}