npm - @forwardimpact/libeval - Versions diffs - 0.1.50 → 0.1.52 - Mend

@forwardimpact/libeval 0.1.50 → 0.1.52

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48)

package/README.md +11 -8
package/bin/fit-benchmark.js +26 -27
package/bin/fit-eval.js +36 -30
package/bin/fit-trace.js +83 -57
package/package.json +1 -1
package/src/agent-runner.js +20 -12
package/src/benchmark/apm-installer.js +48 -44
package/src/benchmark/env-loader.js +35 -23
package/src/benchmark/invariants.js +128 -0
package/src/benchmark/judge.js +18 -19
package/src/benchmark/npm-installer.js +33 -33
package/src/benchmark/report.js +40 -26
package/src/benchmark/result.js +11 -11
package/src/benchmark/runner.js +90 -46
package/src/benchmark/task-family.js +78 -65
package/src/benchmark/workdir.js +100 -93
package/src/commands/assert.js +30 -22
package/src/commands/benchmark-invariants.js +74 -0
package/src/commands/benchmark-report.js +24 -15
package/src/commands/benchmark-run.js +16 -9
package/src/commands/by-discussion.js +33 -23
package/src/commands/callback.js +20 -11
package/src/commands/discuss.js +31 -13
package/src/commands/facilitate.js +21 -14
package/src/commands/output.js +15 -13
package/src/commands/run.js +28 -14
package/src/commands/supervise.js +29 -19
package/src/commands/task-input.js +10 -5
package/src/commands/tee.js +24 -9
package/src/commands/trace.js +181 -99
package/src/discuss-tools.js +48 -2
package/src/discusser.js +53 -2
package/src/events/github.js +27 -5
package/src/facilitator.js +4 -0
package/src/inbox-poller.js +84 -0
package/src/judge.js +4 -1
package/src/message-bus.js +6 -0
package/src/orchestration-loop.js +14 -4
package/src/orchestration-toolkit.js +14 -0
package/src/profile-prompt.js +22 -9
package/src/redaction.js +31 -9
package/src/reply-emitter.js +47 -0
package/src/supervisor.js +4 -0
package/src/tee-writer.js +4 -2
package/src/trace-collector.js +9 -2
package/src/trace-github.js +47 -27
package/src/benchmark/scorer.js +0 -138
package/src/commands/benchmark-score.js +0 -68

package/README.md CHANGED Viewed

@@ -71,11 +71,12 @@ while participants work in parallel — nothing blocks the LLM thread.
 ### Discuss-mode replies
-In discussion mode, Answer calls routed to the lead are captured as
-thread replies delivered via the bridge callback. The lead delegates work
-via Ask; each agent's Answer becomes a separate reply posted to the
-discussion thread. No explicit reply tool is needed on the lead surface —
-the message bus intercepts answers and appends them to `ctx.replies[]`.
+In discussion mode, Answer calls routed to the lead are streamed to
+the discussion thread as they are produced — each agent's Answer becomes
+a separate reply posted immediately, not batched at session end. The
+lead and agents can also call `Acknowledge` to post brief messages
+directly to the thread (status updates, human follow-up responses).
+The message bus intercepts answers and appends them to `ctx.replies[]`.
 `RequestForComment` is a separate coordination tool available on agent
 roles (facilitated agents and discuss agents). It queues an intent to
@@ -104,8 +105,8 @@ only feeds the summary's `success`/`verdict`.
 | Fac. agent   | ✓   | ✓      | ✓        | ✓        |          | `RequestForComment`            |
 | Supervisor   | ✓   | ✓      | ✓        | ✓        | ✓        |                                |
 | Sup. agent   | ✓   | ✓      | ✓        | ✓        |          |                                |
-| Discuss lead | ✓   | ✓      | ✓        | ✓        |          | `Recess`, `Adjourn`            |
-| Discuss agt  | ✓   | ✓      | ✓        | ✓        |          | `RequestForComment`            |
+| Discuss lead | ✓   | ✓      | ✓        | ✓        |          | `Recess`, `Adjourn`, `Acknowledge` |
+| Discuss agt  | ✓   | ✓      | ✓        | ✓        |          | `RequestForComment`, `Acknowledge` |
 | Judge        |     |        |          |          | ✓        |                                |
 Ask's `to` accepts a participant name on multi-participant roles
@@ -169,7 +170,9 @@ downloadable through retention.
 | `orchestration-toolkit.js`                                  | Shared Ask/Answer/Announce/Conclude/RollCall/RequestForComment handlers + builders. |
 | `orchestration-loop.js`                                     | Unified lead+participant loop; reminder/violation handling.          |
 | `facilitator.js` / `supervisor.js` / `discusser.js` / `judge.js` | Per-mode class + factory + system prompt.                       |
-| `discuss-tools.js`                                          | Discuss-only `Recess`/`Adjourn`.                                     |
+| `discuss-tools.js`                                          | Discuss-only `Recess`/`Adjourn`/`Acknowledge`.                       |
+| `reply-emitter.js`                                          | Fire-and-forget POST of reply/ack events to the callback URL.        |
+| `inbox-poller.js`                                           | Long-poll the bridge inbox for injected human messages.              |
 | `trace-collector.js` / `trace-query.js` / `trace-github.js` | Trace ingestion / querying / GitHub-attachment helpers.              |
 | `redaction.js`                                              | Env-var allowlist + credential-shape pattern redaction.              |

package/bin/fit-benchmark.js CHANGED Viewed

@@ -4,10 +4,11 @@ import "@forwardimpact/libpreflight/node22";
 import { readFileSync, realpathSync } from "node:fs";
 import { createCli } from "@forwardimpact/libcli";
+import { createDefaultRuntime } from "@forwardimpact/libutil/runtime";
 import { createLogger } from "@forwardimpact/libtelemetry";
 import { runBenchmarkRunCommand } from "../src/commands/benchmark-run.js";
-import { runBenchmarkScoreCommand } from "../src/commands/benchmark-score.js";
+import { runBenchmarkInvariantsCommand } from "../src/commands/benchmark-invariants.js";
 import { runBenchmarkReportCommand } from "../src/commands/benchmark-report.js";
 // `bun build --compile` injects FIT_BENCHMARK_VERSION via --define, eliminating
@@ -26,7 +27,8 @@ export const definition = {
   commands: [
     {
       name: "run",
-      args: "",
+      args: [],
+      handler: runBenchmarkRunCommand,
       description:
         "Run every task in a family for N runs and emit one result record per (task, runIndex).",
       options: {
@@ -78,10 +80,11 @@ export const definition = {
       },
     },
     {
-      name: "score",
-      args: "",
+      name: "invariants",
+      args: [],
+      handler: runBenchmarkInvariantsCommand,
       description:
-        "Score a single task against a post-run workdir without invoking an agent.",
+        "Check a single task's invariants against a post-run workdir without invoking an agent.",
       options: {
         family: {
           type: "string",
@@ -94,7 +97,7 @@ export const definition = {
         workdir: {
           type: "string",
           description:
-            "Post-run directory; <workdir>/cwd/ is the agent CWD scoring runs against",
+            "Post-run directory; <workdir>/cwd/ is the agent CWD invariants run against",
         },
         output: {
           type: "string",
@@ -104,7 +107,8 @@ export const definition = {
     },
     {
       name: "report",
-      args: "",
+      args: [],
+      handler: runBenchmarkReportCommand,
       description:
         "Aggregate result records into pass@k via the OpenAI HumanEval estimator.",
       options: {
@@ -132,7 +136,7 @@ export const definition = {
   examples: [
     "fit-benchmark run --family=./families/coding",
     "fit-benchmark run --family=./families/coding --runs=10 --agent-model=claude-sonnet-4-6",
-    "fit-benchmark score --family=./families/coding --task=todo-api --workdir=./benchmark-runs/runs/todo-api/0",
+    "fit-benchmark invariants --family=./families/coding --task=todo-api --workdir=./benchmark-runs/runs/todo-api/0",
     "fit-benchmark report --format=text",
     "fit-benchmark report --input=./runs/today --k=1,3,5 --format=text",
   ],
@@ -152,35 +156,30 @@ export const definition = {
   ],
 };
-const cli = createCli(definition);
 const logger = createLogger("benchmark");
-const COMMANDS = {
-  run: runBenchmarkRunCommand,
-  score: runBenchmarkScoreCommand,
-  report: runBenchmarkReportCommand,
-};
 async function main() {
-  const parsed = cli.parse(process.argv.slice(2));
-  if (!parsed) process.exit(0);
-  const { values, positionals } = parsed;
+  const runtime = createDefaultRuntime();
+  const cli = createCli(definition, { runtime });
+  const parsed = cli.parse(runtime.proc.argv.slice(2));
+  if (!parsed) return runtime.proc.exit(0);
+  const { positionals } = parsed;
   if (positionals.length === 0) {
     cli.usageError("no command specified");
-    process.exit(2);
+    return runtime.proc.exit(2);
   }
-  const [command, ...args] = positionals;
-  const handler = COMMANDS[command];
-  if (!handler) {
+  const command = positionals[0];
+  if (!definition.commands.some((c) => c.name === command)) {
     cli.usageError(`unknown command "${command}"`);
-    process.exit(2);
+    return runtime.proc.exit(2);
   }
-  await handler(values, args);
+  const result = await cli.dispatch(parsed, { deps: { runtime } });
+  const envelope = result ?? { ok: true };
+  if (!envelope.ok && envelope.error) cli.error(envelope.error);
+  runtime.proc.exit(envelope.ok ? 0 : (envelope.code ?? 1));
 }
 // Run main only when invoked as a CLI. Importing for tests (e.g. parity)
@@ -188,7 +187,7 @@ async function main() {
 if (import.meta.url === `file://${realpathSync(process.argv[1])}`) {
   main().catch((error) => {
     logger.exception("main", error);
-    cli.error(error.message);
+    createCli(definition).error(error.message);
     process.exit(1);
   });
 }

package/bin/fit-eval.js CHANGED Viewed

@@ -4,6 +4,7 @@ import "@forwardimpact/libpreflight/node22";
 import { readFileSync } from "node:fs";
 import { createCli } from "@forwardimpact/libcli";
+import { createDefaultRuntime } from "@forwardimpact/libutil/runtime";
 import { createLogger } from "@forwardimpact/libtelemetry";
 import { runOutputCommand } from "../src/commands/output.js";
@@ -65,7 +66,9 @@ const definition = {
   commands: [
     {
       name: "run",
-      args: "",
+      args: [],
+      argsUsage: "",
+      handler: runRunCommand,
       description: "Run a single agent autonomously on a defined task",
       options: {
         ...TASK_INPUT_OPTIONS,
@@ -100,7 +103,9 @@ const definition = {
     },
     {
       name: "supervise",
-      args: "",
+      args: [],
+      argsUsage: "",
+      handler: runSuperviseCommand,
       description:
         "Run a supervisor–agent relay — typical shape for agent-as-judge evaluations",
       options: {
@@ -143,7 +148,9 @@ const definition = {
     },
     {
       name: "facilitate",
-      args: "",
+      args: [],
+      argsUsage: "",
+      handler: runFacilitateCommand,
       description:
         "Run a facilitator with N participants — typical shape for multi-agent collaboration",
       options: {
@@ -178,7 +185,9 @@ const definition = {
     },
     {
       name: "discuss",
-      args: "",
+      args: [],
+      argsUsage: "",
+      handler: runDiscussCommand,
       description:
         "Run an async, suspendable discussion — Chair + N participants + bridge callback",
       options: {
@@ -217,19 +226,25 @@ const definition = {
     },
     {
       name: "output",
-      args: "",
+      args: [],
+      argsUsage: "",
+      handler: runOutputCommand,
       description:
         "Read NDJSON from stdin and emit a structured or readable form",
     },
     {
       name: "tee",
-      args: "[output.ndjson]",
+      args: ["output"],
+      argsUsage: "[output.ndjson]",
+      handler: runTeeCommand,
       description:
         "Stream readable text to stdout while saving raw NDJSON to a file",
     },
     {
       name: "callback",
-      args: "",
+      args: [],
+      argsUsage: "",
+      handler: runCallbackCommand,
       description:
         "Extract the terminal summary from an NDJSON trace and POST it to a callback URL",
       options: {
@@ -298,43 +313,34 @@ const definition = {
   ],
 };
-const cli = createCli(definition);
 const logger = createLogger("eval");
-const COMMANDS = {
-  output: runOutputCommand,
-  tee: runTeeCommand,
-  run: runRunCommand,
-  supervise: runSuperviseCommand,
-  facilitate: runFacilitateCommand,
-  discuss: runDiscussCommand,
-  callback: runCallbackCommand,
-};
 async function main() {
-  const parsed = cli.parse(process.argv.slice(2));
-  if (!parsed) process.exit(0);
-  const { values, positionals } = parsed;
+  const runtime = createDefaultRuntime();
+  const cli = createCli(definition, { runtime });
+  const parsed = cli.parse(runtime.proc.argv.slice(2));
+  if (!parsed) return runtime.proc.exit(0);
+  const { positionals } = parsed;
   if (positionals.length === 0) {
     cli.usageError("no command specified");
-    process.exit(2);
+    return runtime.proc.exit(2);
   }
-  const [command, ...args] = positionals;
-  const handler = COMMANDS[command];
-  if (!handler) {
+  const command = positionals[0];
+  if (!definition.commands.some((c) => c.name === command)) {
     cli.usageError(`unknown command "${command}"`);
-    process.exit(2);
+    return runtime.proc.exit(2);
   }
-  await handler(values, args);
+  const result = await cli.dispatch(parsed, { deps: { runtime } });
+  const envelope = result ?? { ok: true };
+  if (!envelope.ok && envelope.error) cli.error(envelope.error);
+  runtime.proc.exit(envelope.ok ? 0 : (envelope.code ?? 1));
 }
 main().catch((error) => {
   logger.exception("main", error);
-  cli.error(error.message);
+  createCli(definition).error(error.message);
   process.exit(1);
 });

package/bin/fit-trace.js CHANGED Viewed

@@ -4,6 +4,7 @@ import "@forwardimpact/libpreflight/node22";
 import { readFileSync } from "node:fs";
 import { createCli } from "@forwardimpact/libcli";
+import { createDefaultRuntime } from "@forwardimpact/libutil/runtime";
 import { createScriptConfig } from "@forwardimpact/libconfig";
 import { createLogger } from "@forwardimpact/libtelemetry";
@@ -46,7 +47,9 @@ const definition = {
   commands: [
     {
       name: "runs",
-      args: "[pattern]",
+      args: ["pattern"],
+      argsUsage: "[pattern]",
+      handler: runRunsCommand,
       description:
         "List recent GitHub Actions workflow runs (default pattern: agent)",
       options: {
@@ -63,7 +66,9 @@ const definition = {
     },
     {
       name: "download",
-      args: "<run-id>",
+      args: ["run-id"],
+      argsUsage: "<run-id>",
+      handler: runDownloadCommand,
       description: "Download trace artifact and convert to structured JSON",
       options: {
         dir: { type: "string", description: "Output directory" },
@@ -77,32 +82,44 @@ const definition = {
     },
     {
       name: "overview",
-      args: "<file>",
+      args: ["file"],
+      argsUsage: "<file>",
+      handler: runOverviewCommand,
       description: "Metadata, summary, turn count, tool frequency",
     },
     {
       name: "count",
-      args: "<file>",
+      args: ["file"],
+      argsUsage: "<file>",
+      handler: runCountCommand,
       description: "Number of turns",
     },
     {
       name: "batch",
-      args: "<file> <from> <to>",
+      args: ["file", "from", "to"],
+      argsUsage: "<file> <from> <to>",
+      handler: runBatchCommand,
       description: "Turns in range [from, to) (zero-indexed)",
     },
     {
       name: "head",
-      args: "<file> [N]",
+      args: ["file", "n"],
+      argsUsage: "<file> [N]",
+      handler: runHeadCommand,
       description: "First N turns (default 10)",
     },
     {
       name: "tail",
-      args: "<file> [N]",
+      args: ["file", "n"],
+      argsUsage: "<file> [N]",
+      handler: runTailCommand,
       description: "Last N turns (default 10)",
     },
     {
       name: "search",
-      args: "<file> <pattern>",
+      args: ["file", "pattern"],
+      argsUsage: "<file> <pattern>",
+      handler: runSearchCommand,
       description: "Search all content for regex pattern",
       options: {
         limit: {
@@ -121,22 +138,30 @@ const definition = {
     },
     {
       name: "tools",
-      args: "<file>",
+      args: ["file"],
+      argsUsage: "<file>",
+      handler: runToolsCommand,
       description: "Tool usage frequency (descending)",
     },
     {
       name: "tool",
-      args: "<file> <name>",
+      args: ["file", "name"],
+      argsUsage: "<file> <name>",
+      handler: runToolCommand,
       description: "All turns involving a specific tool",
     },
     {
       name: "errors",
-      args: "<file>",
+      args: ["file"],
+      argsUsage: "<file>",
+      handler: runErrorsCommand,
       description: "Tool results with isError=true",
     },
     {
       name: "reasoning",
-      args: "<file>",
+      args: ["file"],
+      argsUsage: "<file>",
+      handler: runReasoningCommand,
       description: "Agent reasoning text only",
       options: {
         from: { type: "string", description: "Start at turn index" },
@@ -145,27 +170,37 @@ const definition = {
     },
     {
       name: "timeline",
-      args: "<file>",
+      args: ["file"],
+      argsUsage: "<file>",
+      handler: runTimelineCommand,
       description: "Compact one-line-per-turn overview",
     },
     {
       name: "stats",
-      args: "<file>",
+      args: ["file"],
+      argsUsage: "<file>",
+      handler: runStatsCommand,
       description: "Token usage and cost breakdown",
     },
     {
       name: "init",
-      args: "<file>",
+      args: ["file"],
+      argsUsage: "<file>",
+      handler: runInitCommand,
       description: "Full system/init event",
     },
     {
       name: "turn",
-      args: "<file> <index>",
+      args: ["file", "index"],
+      argsUsage: "<file> <index>",
+      handler: runTurnCommand,
       description: "Single turn by index",
     },
     {
       name: "by-discussion",
-      args: "<discussion-id> [trace-dir]",
+      args: ["discussion-id", "trace-dir"],
+      argsUsage: "<discussion-id> [trace-dir]",
+      handler: runByDiscussionCommand,
       description:
         "List trace files whose meta header carries the given discussion_id, ordered by first-event timestamp",
       options: {
@@ -177,7 +212,9 @@ const definition = {
     },
     {
       name: "filter",
-      args: "<file>",
+      args: ["file"],
+      argsUsage: "<file>",
+      handler: runFilterCommand,
       description: "Filter turns by role, tool, or error status",
       options: {
         role: {
@@ -196,7 +233,9 @@ const definition = {
     },
     {
       name: "split",
-      args: "<file>",
+      args: ["file"],
+      argsUsage: "<file>",
+      handler: runSplitCommand,
       description:
         "Split a combined trace into per-source files following the `trace--<case>--<participant>.<role>.ndjson` convention",
       options: {
@@ -217,9 +256,11 @@ const definition = {
     },
     {
       name: "assert",
-      args: "<test-name> <file>",
+      args: ["test-name", "file"],
+      argsUsage: "<test-name> <file>",
+      handler: runAssertCommand,
       description:
-        "Shell-friendly assertion — outputs structured JSON for scoring hooks",
+        "Shell-friendly assertion — outputs structured JSON for invariant hooks",
       options: {
         grep: {
           type: "string",
@@ -299,57 +340,42 @@ const definition = {
   ],
 };
-const cli = createCli(definition);
 const logger = createLogger("trace");
-const COMMANDS = {
-  runs: runRunsCommand,
-  download: runDownloadCommand,
-  overview: runOverviewCommand,
-  count: runCountCommand,
-  batch: runBatchCommand,
-  head: runHeadCommand,
-  tail: runTailCommand,
-  search: runSearchCommand,
-  tools: runToolsCommand,
-  tool: runToolCommand,
-  errors: runErrorsCommand,
-  reasoning: runReasoningCommand,
-  timeline: runTimelineCommand,
-  stats: runStatsCommand,
-  init: runInitCommand,
-  turn: runTurnCommand,
-  filter: runFilterCommand,
-  split: runSplitCommand,
-  assert: runAssertCommand,
-  "by-discussion": runByDiscussionCommand,
-};
+// Commands that talk to the GitHub API need a config-backed token resolver;
+// the rest only read local trace files through the runtime.
+const NEEDS_CONFIG = new Set(["runs", "download"]);
 async function main() {
-  const parsed = cli.parse(process.argv.slice(2));
-  if (!parsed) process.exit(0);
-  const { values, positionals } = parsed;
+  const runtime = createDefaultRuntime();
+  const cli = createCli(definition, { runtime });
+  const parsed = cli.parse(runtime.proc.argv.slice(2));
+  if (!parsed) return runtime.proc.exit(0);
+  const { positionals } = parsed;
   if (positionals.length === 0) {
     cli.usageError("no command specified");
-    process.exit(2);
+    return runtime.proc.exit(2);
   }
-  const [command, ...args] = positionals;
-  const handler = COMMANDS[command];
-  if (!handler) {
+  const command = positionals[0];
+  if (!definition.commands.some((c) => c.name === command)) {
     cli.usageError(`unknown command "${command}"`);
-    process.exit(2);
+    return runtime.proc.exit(2);
   }
-  const config = await createScriptConfig("eval");
-  await handler(values, args, { config });
+  const config = NEEDS_CONFIG.has(command)
+    ? await createScriptConfig("eval")
+    : undefined;
+  const result = await cli.dispatch(parsed, { deps: { runtime, config } });
+  const envelope = result ?? { ok: true };
+  if (!envelope.ok && envelope.error) cli.error(envelope.error);
+  runtime.proc.exit(envelope.ok ? 0 : (envelope.code ?? 1));
 }
 main().catch((error) => {
   logger.exception("main", error);
-  cli.error(error.message);
+  createCli(definition).error(error.message);
   process.exit(1);
 });

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@forwardimpact/libeval",
-  "version": "0.1.50",
+  "version": "0.1.52",
   "description": "Agent evaluation framework — prove whether agent changes improved outcomes with reproducible evidence.",
   "keywords": [
     "eval",

package/src/agent-runner.js CHANGED Viewed

@@ -29,12 +29,16 @@ export class AgentRunner {
    * @param {string[]} [deps.disallowedTools] - Tools to explicitly remove from the model's context
    * @param {Record<string, object>} [deps.mcpServers] - MCP server configs to pass to the SDK query
    * @param {object} deps.redactor
+   * @param {import("@forwardimpact/libutil/runtime").Runtime} [deps.runtime] -
+   *   Ambient collaborators. Only `proc.env` is read (to record Skill
+   *   invocations into `LIBEVAL_SKILL`); when absent the write is skipped.
    */
   constructor(deps) {
     if (!deps.cwd) throw new Error("cwd is required");
     if (!deps.query) throw new Error("query is required");
     if (!deps.output) throw new Error("output is required");
     if (!deps.redactor) throw new Error("redactor is required");
+    this.runtime = deps.runtime ?? null;
     this.cwd = deps.cwd;
     this.query = deps.query;
     this.output = deps.output;
@@ -179,20 +183,24 @@ export class AgentRunner {
     if (message.type === "system" && message.subtype === "init") {
       this.sessionId = message.session_id;
     }
-    if (message.type === "assistant") trackSkillInvocation(message);
+    if (message.type === "assistant") this.#trackSkillInvocation(message);
   }
-}
-function trackSkillInvocation(message) {
-  const content = message.message?.content ?? message.content;
-  if (!Array.isArray(content)) return;
-  for (const block of content) {
-    if (
-      block.type === "tool_use" &&
-      block.name === "Skill" &&
-      block.input?.skill
-    ) {
-      process.env.LIBEVAL_SKILL = block.input.skill;
+  #trackSkillInvocation(message) {
+    const content = message.message?.content ?? message.content;
+    if (!Array.isArray(content)) return;
+    // Skill metric is recorded into the env map; without a runtime there is
+    // no env surface to write to, so the side-effect is simply skipped.
+    const env = this.runtime?.proc?.env ?? null;
+    if (!env) return;
+    for (const block of content) {
+      if (
+        block.type === "tool_use" &&
+        block.name === "Skill" &&
+        block.input?.skill
+      ) {
+        env.LIBEVAL_SKILL = block.input.skill;
+      }
     }
   }
 }