npm - @forwardimpact/libeval - Versions diffs - 0.1.43 → 0.1.44 - Mend

@forwardimpact/libeval 0.1.43 → 0.1.44

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (18)

package/bin/fit-benchmark.js +2 -2
package/bin/fit-eval.js +101 -21
package/bin/fit-trace.js +14 -0
package/package.json +1 -1
package/src/commands/benchmark-run.js +1 -1
package/src/commands/by-discussion.js +84 -0
package/src/commands/callback.js +104 -0
package/src/commands/discuss.js +116 -0
package/src/commands/facilitate.js +2 -2
package/src/commands/supervise.js +3 -3
package/src/discuss-tools.js +203 -0
package/src/discusser.js +332 -0
package/src/facilitator.js +39 -333
package/src/index.js +14 -0
package/src/orchestration-loop.js +369 -0
package/src/redaction.js +10 -0
package/src/render/orchestrator-filter.js +1 -0
package/src/trace-collector.js +4 -0

package/bin/fit-benchmark.js CHANGED

@@ -46,10 +46,10 @@ export const definition = {
           description:
             "Claude model for the agent-under-test (default: claude-sonnet-4-6)",
         },
-        "supervisor-model": {
+        "lead-model": {
           type: "string",
           description:
-            "Claude model for the supervisor (default: claude-opus-4-7)",
+            "Claude model for the lead role (default: claude-opus-4-7)",
         },
         "judge-model": {
           type: "string",

package/bin/fit-eval.js CHANGED

@@ -9,6 +9,8 @@ import { runTeeCommand } from "../src/commands/tee.js";
 import { runRunCommand } from "../src/commands/run.js";
 import { runSuperviseCommand } from "../src/commands/supervise.js";
 import { runFacilitateCommand } from "../src/commands/facilitate.js";
+import { runDiscussCommand } from "../src/commands/discuss.js";
+import { runCallbackCommand } from "../src/commands/callback.js";
 // `bun build --compile` injects FIT_EVAL_VERSION via --define, eliminating
 // the readFileSync branch in the compiled binary (which would ENOENT against
@@ -18,6 +20,18 @@ const VERSION =
   JSON.parse(readFileSync(new URL("../package.json", import.meta.url), "utf8"))
     .version;
+const LEAD_OPTIONS = {
+  "lead-profile": {
+    type: "string",
+    description: "Lead role profile name (supervisor / facilitator / chair)",
+  },
+  "lead-model": {
+    type: "string",
+    description:
+      "Claude model for the lead role (default: claude-opus-4-7[1m])",
+  },
+};
 const definition = {
   name: "fit-eval",
   version: VERSION,
@@ -93,11 +107,7 @@ const definition = {
           description:
             "Claude model for the agent (default: claude-opus-4-7[1m])",
         },
-        "supervisor-model": {
-          type: "string",
-          description:
-            "Claude model for the supervisor (default: claude-opus-4-7[1m])",
-        },
+        ...LEAD_OPTIONS,
         "max-turns": {
           type: "string",
           description:
@@ -117,10 +127,6 @@ const definition = {
           description: "Supervisor working directory",
         },
         "agent-cwd": { type: "string", description: "Agent working directory" },
-        "supervisor-profile": {
-          type: "string",
-          description: "Supervisor (judge) profile name",
-        },
         "supervisor-allowed-tools": {
           type: "string",
           description: "Supervisor tool allowlist",
@@ -154,11 +160,7 @@ const definition = {
           type: "string",
           description: "Claude model for agents (default: claude-opus-4-7[1m])",
         },
-        "facilitator-model": {
-          type: "string",
-          description:
-            "Claude model for the facilitator (default: claude-opus-4-7[1m])",
-        },
+        ...LEAD_OPTIONS,
         "max-turns": {
           type: "string",
           description: "Max agentic turns (default: 20, 0 = unlimited)",
@@ -171,10 +173,6 @@ const definition = {
           type: "string",
           description: "Facilitator working directory",
         },
-        "facilitator-profile": {
-          type: "string",
-          description: "Facilitator profile name",
-        },
         "agent-profiles": {
           type: "string",
           description:
@@ -186,6 +184,56 @@ const definition = {
         },
       },
     },
+    {
+      name: "discuss",
+      args: "",
+      description:
+        "Run an async, suspendable discussion — Chair + N participants + bridge callback",
+      options: {
+        "task-file": {
+          type: "string",
+          description: "Path to a markdown task file",
+        },
+        "task-text": {
+          type: "string",
+          description: "Inline task text (alternative to --task-file)",
+        },
+        "task-amend": {
+          type: "string",
+          description: "Additional text appended to the task",
+        },
+        "agent-model": {
+          type: "string",
+          description: "Claude model for agents (default: claude-opus-4-7[1m])",
+        },
+        ...LEAD_OPTIONS,
+        "max-turns": {
+          type: "string",
+          description: "Max agentic turns (default: 40, 0 = unlimited)",
+        },
+        output: {
+          type: "string",
+          description: "Write the NDJSON trace to a file",
+        },
+        "agent-profiles": {
+          type: "string",
+          description: "Comma-separated participant profile names (optional)",
+        },
+        "agent-cwd": {
+          type: "string",
+          description: "Working directory shared by participants (default: .)",
+        },
+        "discussion-id": {
+          type: "string",
+          description:
+            "Stable id for the threaded conversation; carried through traces for linking",
+        },
+        "resume-context": {
+          type: "string",
+          description: "JSON-serialized prior state for a resumed run",
+        },
+      },
+    },
     {
       name: "output",
       args: "",
@@ -198,6 +246,35 @@ const definition = {
       description:
         "Stream readable text to stdout while saving raw NDJSON to a file",
     },
+    {
+      name: "callback",
+      args: "",
+      description:
+        "Extract the terminal summary from an NDJSON trace and POST it to a callback URL",
+      options: {
+        "trace-file": {
+          type: "string",
+          description: "Path to the NDJSON trace file",
+        },
+        "callback-url": {
+          type: "string",
+          description: "URL to POST the summary to",
+        },
+        "correlation-id": {
+          type: "string",
+          description: "Correlation ID to include in the payload",
+        },
+        "run-url": {
+          type: "string",
+          description: "GitHub Actions run URL (optional)",
+        },
+        "discussion-id": {
+          type: "string",
+          description:
+            "Discussion id (fallback when the trace lacks a meta event)",
+        },
+      },
+    },
   ],
   globalOptions: {
     format: { type: "string", description: "Output format (json|text)" },
@@ -207,8 +284,9 @@ const definition = {
   },
   examples: [
     "fit-eval run --task-file=task.md --output=trace.ndjson",
-    "fit-eval supervise --task-file=task.md --supervisor-profile=judge --agent-profile=coder --output=trace.ndjson",
-    'fit-eval facilitate --task-file=task.md --facilitator-profile=lead --agent-profiles="security-engineer,technical-writer" --output=trace.ndjson',
+    "fit-eval supervise --task-file=task.md --lead-profile=judge --agent-profile=coder --output=trace.ndjson",
+    'fit-eval facilitate --task-file=task.md --lead-profile=lead --agent-profiles="security-engineer,technical-writer" --output=trace.ndjson',
+    'fit-eval discuss --task-file=task.md --lead-profile=release-engineer --agent-profiles="staff-engineer,security-engineer" --discussion-id=GD_kw...',
     "fit-eval output --format=text < trace.ndjson",
   ],
   documentation: [
@@ -234,7 +312,7 @@ const definition = {
       title: "Agent Teams",
       url: "https://www.forwardimpact.team/docs/products/agent-teams/index.md",
       description:
-        "How to author the agent, supervisor, and facilitator profiles consumed by --agent-profile, --supervisor-profile, --facilitator-profile, and --agent-profiles.",
+        "How to author the profiles consumed by --agent-profile, --lead-profile, and --agent-profiles.",
     },
   ],
 };
@@ -248,6 +326,8 @@ const COMMANDS = {
   run: runRunCommand,
   supervise: runSuperviseCommand,
   facilitate: runFacilitateCommand,
+  discuss: runDiscussCommand,
+  callback: runCallbackCommand,
 };
 async function main() {

package/bin/fit-trace.js CHANGED

@@ -26,6 +26,7 @@ import {
   runSplitCommand,
 } from "../src/commands/trace.js";
 import { runAssertCommand } from "../src/commands/assert.js";
+import { runByDiscussionCommand } from "../src/commands/by-discussion.js";
 // `bun build --compile` injects FIT_TRACE_VERSION via --define, eliminating
 // the readFileSync branch in the compiled binary (which would ENOENT against
@@ -160,6 +161,18 @@ const definition = {
       args: "<file> <index>",
       description: "Single turn by index",
     },
+    {
+      name: "by-discussion",
+      args: "<discussion-id> [trace-dir]",
+      description:
+        "List trace files whose meta header carries the given discussion_id, ordered by first-event timestamp",
+      options: {
+        "trace-dir": {
+          type: "string",
+          description: "Directory to scan (default: traces)",
+        },
+      },
+    },
     {
       name: "filter",
       args: "<file>",
@@ -307,6 +320,7 @@ const COMMANDS = {
   filter: runFilterCommand,
   split: runSplitCommand,
   assert: runAssertCommand,
+  "by-discussion": runByDiscussionCommand,
 };
 async function main() {

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@forwardimpact/libeval",
-  "version": "0.1.43",
+  "version": "0.1.44",
   "description": "Agent evaluation framework — prove whether agent changes improved outcomes with reproducible evidence.",
   "keywords": [
     "eval",

package/src/commands/benchmark-run.js CHANGED

@@ -40,7 +40,7 @@ function parseRunOptions(values) {
     runs,
     output: resolve(output),
     agentModel: values["agent-model"] ?? "claude-sonnet-4-6",
-    supervisorModel: values["supervisor-model"] ?? "claude-opus-4-7",
+    supervisorModel: values["lead-model"] ?? "claude-opus-4-7",
     judgeModel: values["judge-model"] ?? "claude-opus-4-7",
     profiles: {
       agent: values["agent-profile"] ?? null,

package/src/commands/by-discussion.js ADDED

@@ -0,0 +1,84 @@
+import { readdirSync, statSync, openSync, readSync, closeSync } from "node:fs";
+import { join } from "node:path";
+/**
+ * Read the first newline-terminated line of a file. Bounded to 64 KiB
+ * which is well above any orchestrator envelope.
+ *
+ * @param {string} path
+ * @returns {string}
+ */
+function readFirstLine(path) {
+  const fd = openSync(path, "r");
+  try {
+    const buf = Buffer.alloc(65536);
+    const bytes = readSync(fd, buf, 0, buf.length, 0);
+    const slice = buf.slice(0, bytes).toString("utf8");
+    const nl = slice.indexOf("\n");
+    return nl === -1 ? slice : slice.slice(0, nl);
+  } finally {
+    closeSync(fd);
+  }
+}
+/**
+ * Scan a directory for `.ndjson` files whose meta header carries the
+ * given discussion_id. The Step 2.6 first-line guarantee makes the
+ * lookup cheap: we read only the first line per file. Files without a
+ * meta header (e.g. legacy supervise/facilitate traces) are skipped
+ * silently — not erroneous.
+ *
+ * @param {string} dir
+ * @param {string} discussionId
+ * @returns {Array<{path: string, mtimeMs: number}>}
+ */
+export function findTracesByDiscussion(dir, discussionId) {
+  const matches = [];
+  let entries;
+  try {
+    entries = readdirSync(dir);
+  } catch {
+    return [];
+  }
+  for (const entry of entries) {
+    if (!entry.endsWith(".ndjson")) continue;
+    const path = join(dir, entry);
+    let firstLine;
+    try {
+      firstLine = readFirstLine(path);
+    } catch {
+      continue;
+    }
+    let parsed;
+    try {
+      parsed = JSON.parse(firstLine);
+    } catch {
+      continue;
+    }
+    const event = parsed.event ?? parsed;
+    if (event?.type !== "meta") continue;
+    if (event.discussion_id !== discussionId) continue;
+    matches.push({ path, mtimeMs: statSync(path).mtimeMs });
+  }
+  matches.sort((a, b) => a.mtimeMs - b.mtimeMs);
+  return matches;
+}
+/**
+ * `fit-trace by-discussion <discussion-id> [trace-dir]` — list trace
+ * files whose meta header carries the given discussion_id, one per
+ * line, ordered by first-event timestamp (file mtime ascending). The
+ * result is usable with `xargs cat` for a chronological merge.
+ *
+ * @param {object} values
+ * @param {string[]} args
+ */
+export async function runByDiscussionCommand(values, args) {
+  const [discussionId, traceDirArg] = args;
+  if (!discussionId) throw new Error("<discussion-id> is required");
+  const dir = traceDirArg ?? values["trace-dir"] ?? "traces";
+  const matches = findTracesByDiscussion(dir, discussionId);
+  for (const { path } of matches) {
+    process.stdout.write(`${path}\n`);
+  }
+}

package/src/commands/callback.js ADDED

@@ -0,0 +1,104 @@
+import { readFileSync } from "node:fs";
+/**
+ * Scan an NDJSON trace and return the last orchestrator summary event,
+ * the first `meta` event's `discussion_id`, and any structured replies
+ * collected by the discusser. Skips malformed lines.
+ *
+ * The runner is verdict-agnostic — verbatim passthrough of whatever the
+ * trace carries ("success"/"failure" from supervise/facilitate; canonical
+ * "adjourned"/"recessed"/"failed" from discuss). The bridge layer maps to
+ * its channel semantics.
+ *
+ * @param {string} traceFile
+ * @returns {{verdict: string, summary: string, replies: object[], trigger?: object, discussionId?: string} | null}
+ */
+// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: NDJSON scan with malformed-line tolerance + meta/summary dual extraction
+function readTraceSummary(traceFile) {
+  let summary = null;
+  let metaDiscussionId = null;
+  for (const line of readFileSync(traceFile, "utf8").split("\n")) {
+    if (!line.trim()) continue;
+    let record;
+    try {
+      record = JSON.parse(line);
+    } catch {
+      continue;
+    }
+    if (record.source !== "orchestrator") continue;
+    if (record.event?.type === "meta" && !metaDiscussionId) {
+      metaDiscussionId = record.event.discussion_id ?? null;
+    }
+    if (record.event?.type === "summary") {
+      summary = {
+        verdict: record.event.verdict ?? "failed",
+        summary: record.event.summary ?? "",
+        replies: Array.isArray(record.event.replies)
+          ? record.event.replies
+          : [],
+        ...(record.event.trigger && { trigger: record.event.trigger }),
+        ...(record.event.discussion_id && {
+          discussionId: record.event.discussion_id,
+        }),
+      };
+    }
+  }
+  if (summary && !summary.discussionId && metaDiscussionId) {
+    summary.discussionId = metaDiscussionId;
+  }
+  return summary;
+}
+/**
+ * Callback command — read an NDJSON trace, extract the terminal
+ * orchestrator summary, and POST a canonical callback body to the
+ * configured URL. Used by `kata-dispatch.yml` to deliver the lead's
+ * conclusion to the bridge that dispatched the run.
+ *
+ * Wire shape (single shape across modes):
+ *
+ * ```
+ * {
+ *   correlation_id, verdict, summary, run_url,
+ *   discussion_id?, replies: [], trigger?
+ * }
+ * ```
+ *
+ * @param {object} values - Parsed option values from cli.parse()
+ * @param {string[]} _args - Positional arguments
+ */
+export async function runCallbackCommand(values, _args) {
+  const traceFile = values["trace-file"];
+  const callbackUrl = values["callback-url"];
+  const correlationId = values["correlation-id"];
+  const runUrl = values["run-url"] ?? "";
+  const discussionIdOverride = values["discussion-id"] ?? null;
+  if (!traceFile) throw new Error("--trace-file is required");
+  if (!callbackUrl) throw new Error("--callback-url is required");
+  const found = readTraceSummary(traceFile) ?? {
+    verdict: "failed",
+    summary: "Run ended without producing a summary.",
+    replies: [],
+  };
+  const discussionId = found.discussionId ?? discussionIdOverride ?? null;
+  const payload = {
+    correlation_id: correlationId,
+    verdict: found.verdict,
+    summary: found.summary,
+    run_url: runUrl,
+    replies: found.replies,
+    ...(discussionId && { discussion_id: discussionId }),
+    ...(found.trigger && { trigger: found.trigger }),
+  };
+  const res = await fetch(callbackUrl, {
+    method: "POST",
+    headers: { "Content-Type": "application/json" },
+    body: JSON.stringify(payload),
+  });
+  if (!res.ok) {
+    throw new Error(`Callback POST failed: ${res.status}`);
+  }
+}

package/src/commands/discuss.js ADDED

@@ -0,0 +1,116 @@
+import { readFileSync, createWriteStream } from "node:fs";
+import { resolve } from "node:path";
+import { createDiscusser } from "../discusser.js";
+import { createRedactor } from "../redaction.js";
+import { createTeeWriter } from "../tee-writer.js";
+function parseAgentProfiles(raw, cwd, maxTurns) {
+  if (!raw) return [];
+  return raw.split(",").map((entry) => {
+    const name = entry.trim();
+    return { name, role: name, cwd, agentProfile: name, maxTurns };
+  });
+}
+/**
+ * Parse and validate discuss command options. Exported so tests can verify
+ * defaults and the legacy-flag clean break.
+ * @param {object} values - Parsed option values
+ * @returns {object}
+ */
+// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: CLI option validation
+export function parseDiscussOptions(values) {
+  const taskFile = values["task-file"];
+  const taskText = values["task-text"];
+  if (taskFile && taskText)
+    throw new Error("--task-file and --task-text are mutually exclusive");
+  if (!taskFile && !taskText)
+    throw new Error("--task-file or --task-text is required");
+  const taskAmend = values["task-amend"] ?? undefined;
+  const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
+  const profilesRaw = values["agent-profiles"];
+  const agentCwd = resolve(values["agent-cwd"] ?? ".");
+  const maxTurnsRaw = values["max-turns"] ?? "40";
+  const maxTurns = maxTurnsRaw === "0" ? 0 : parseInt(maxTurnsRaw, 10);
+  const agentConfigs = parseAgentProfiles(profilesRaw, agentCwd, maxTurns);
+  const resumeContextRaw = values["resume-context"];
+  let resumeContext = null;
+  if (resumeContextRaw) {
+    try {
+      resumeContext = JSON.parse(resumeContextRaw);
+    } catch (err) {
+      throw new Error(`--resume-context is not valid JSON: ${err.message}`);
+    }
+  }
+  return {
+    taskContent,
+    taskAmend,
+    agentConfigs,
+    leadProfile: values["lead-profile"] ?? "release-engineer",
+    leadModel: values["lead-model"] ?? "claude-opus-4-7[1m]",
+    agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
+    maxTurns,
+    outputPath: values.output,
+    discussionId: values["discussion-id"] ?? null,
+    resumeContext,
+  };
+}
+/**
+ * Discuss command — run a discusser-led session with suspend/resume
+ * semantics, threading `discussion_id` through the trace so multi-run
+ * conversations are queryable as one.
+ *
+ * @param {object} values - Parsed option values
+ * @param {string[]} _args - Positional arguments
+ */
+export async function runDiscussCommand(values, _args) {
+  const opts = parseDiscussOptions(values);
+  const redactor = createRedactor();
+  const fileStream = opts.outputPath
+    ? createWriteStream(opts.outputPath)
+    : null;
+  const output = fileStream
+    ? createTeeWriter({
+        fileStream,
+        textStream: process.stdout,
+        mode: "supervised",
+      })
+    : process.stdout;
+  if (opts.leadProfile) {
+    process.env.LIBEVAL_AGENT_PROFILE = opts.leadProfile;
+  }
+  const { query } = await import("@anthropic-ai/claude-agent-sdk");
+  const discusser = createDiscusser({
+    leadProfile: opts.leadProfile,
+    leadModel: opts.leadModel,
+    agentModel: opts.agentModel,
+    agentConfigs: opts.agentConfigs,
+    discussionId: opts.discussionId,
+    resumeContext: opts.resumeContext,
+    query,
+    output,
+    maxTurns: opts.maxTurns,
+    taskAmend: opts.taskAmend,
+    redactor,
+  });
+  const result = await discusser.run(opts.taskContent);
+  if (fileStream) {
+    await new Promise((r) => output.end(r));
+    await new Promise((r) => fileStream.end(r));
+  }
+  process.exit(result.success ? 0 : 1);
+}

package/src/commands/facilitate.js CHANGED

@@ -54,10 +54,10 @@ export function parseFacilitateOptions(values) {
     agentConfigs,
     facilitatorCwd: resolve(values["facilitator-cwd"] ?? "."),
     agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
-    facilitatorModel: values["facilitator-model"] ?? "claude-opus-4-7[1m]",
+    facilitatorModel: values["lead-model"] ?? "claude-opus-4-7[1m]",
     maxTurns,
     outputPath: values.output,
-    facilitatorProfile: values["facilitator-profile"] ?? undefined,
+    facilitatorProfile: values["lead-profile"] ?? undefined,
   };
 }

package/src/commands/supervise.js CHANGED

@@ -12,7 +12,7 @@ import { createServiceConfig } from "@forwardimpact/libconfig";
  * @returns {object}
  */
 // biome-ignore lint/complexity/noExcessiveCognitiveComplexity: CLI option validation
-function parseSuperviseOptions(values) {
+export function parseSuperviseOptions(values) {
   const taskFile = values["task-file"];
   const taskText = values["task-text"];
   if (taskFile && taskText)
@@ -33,13 +33,13 @@ function parseSuperviseOptions(values) {
       values["agent-cwd"] ?? mkdtempSync(join(tmpdir(), "fit-eval-agent-")),
     ),
     agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
-    supervisorModel: values["supervisor-model"] ?? "claude-opus-4-7[1m]",
+    supervisorModel: values["lead-model"] ?? "claude-opus-4-7[1m]",
     maxTurns: (() => {
       const raw = values["max-turns"] ?? "200";
       return raw === "0" ? 0 : parseInt(raw, 10);
     })(),
     outputPath: values.output,
-    supervisorProfile: values["supervisor-profile"] ?? undefined,
+    supervisorProfile: values["lead-profile"] ?? undefined,
     agentProfile: values["agent-profile"] ?? undefined,
     allowedTools: (
       values["allowed-tools"] ??