npm - @forwardimpact/libeval - Versions diffs - 0.1.43 → 0.1.45 - Mend

@forwardimpact/libeval 0.1.43 → 0.1.45

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

package/README.md +212 -13
package/bin/fit-benchmark.js +2 -2
package/bin/fit-eval.js +101 -21
package/bin/fit-trace.js +14 -0
package/package.json +1 -1
package/src/agent-runner.js +45 -181
package/src/benchmark/runner.js +2 -2
package/src/commands/benchmark-run.js +1 -1
package/src/commands/by-discussion.js +84 -0
package/src/commands/callback.js +104 -0
package/src/commands/discuss.js +116 -0
package/src/commands/facilitate.js +2 -2
package/src/commands/supervise.js +6 -4
package/src/discuss-tools.js +135 -0
package/src/discusser.js +315 -0
package/src/facilitator.js +46 -357
package/src/index.js +12 -0
package/src/judge.js +1 -1
package/src/message-bus.js +27 -81
package/src/orchestration-loop.js +316 -0
package/src/orchestration-toolkit.js +272 -303
package/src/orchestrator-helpers.js +9 -45
package/src/redaction.js +12 -0
package/src/render/orchestrator-filter.js +1 -8
package/src/supervisor.js +79 -465
package/src/trace-collector.js +4 -0

package/README.md CHANGED Viewed

@@ -7,12 +7,188 @@ reproducible evidence.
 <!-- END:description -->
-## Getting Started
+`libeval` provides the runtime and tool surface for multi-LLM
+coordination: an agent talks to a supervisor, a facilitator chairs a
+team meeting, or a lead drives an asynchronous discussion across a
+human channel. Every conversation produces a structured NDJSON trace
+for analysis.
+## Modes
+| Mode          | Lead          | Participants  | Terminal tool          |
+| ------------- | ------------- | ------------- | ---------------------- |
+| `run`         | (none)        | one agent     | task completion        |
+| `supervise`   | `supervisor`  | one `agent`   | `Conclude`             |
+| `facilitate` | `facilitator` | N named       | `Conclude`             |
+| `discuss`     | `lead`        | N named       | `Adjourn` or `Recess`  |
+| `judge`       | `judge`       | (none)        | `Conclude`             |
+Every mode except `run` and `judge` shares one orchestration loop
+(`OrchestrationLoop`) and one tool surface (`Ask` / `Answer` /
+`Announce` / `RollCall`, plus a mode-specific terminal tool). The
+loop fires the lead's LLM, fans messages out to participants over an
+in-memory bus, wakes them when something lands, and emits the
+universal `{source, seq, event}` NDJSON envelope for every line.
+## The Ask / Answer protocol
+Coordination uses one async request/reply pattern with one piece of
+state per question — the `askId`. Every Ask returns immediately; the
+reply arrives later on the asker's inbox.
+### Ask
+```text
+Ask({ question, to? })  →  { askIds: [N, …] }
+```
+The handler registers a pending entry per addressee, posts the
+question on the bus, and returns immediately. Each pending entry is
+keyed by a numeric `askId`. Two Asks to the same addressee each get
+their own id, so they coexist without overwriting.
+Broadcast: omit `to` on a multi-participant lead's Ask to fan out to
+every other participant — the result `askIds` array has one entry
+per addressee.
+### Answer
+```text
+Answer({ message, askId? })  →  routed to the asker
+```
+The reply lands in the asker's bus inbox as
+`[answer#N] <participant>: <text>` on a later turn. `askId` is
+optional and the handler is forgiving:
+- **Provided + matches an ask owed by the caller** → routes the reply
+  to that specific asker.
+- **Provided but unknown or wrong addressee** → `isError` with a
+  pointed message. The caller tried to specify; we tell them why.
+- **Omitted + exactly one ask is owed by the caller** → auto-picks
+  that ask. (Forcing an Announce when the only owed ask is obvious
+  would be pedantic.)
+- **Omitted + 0 or many asks owed** → broadcasts as Announce so the
+  message still reaches every participant.
+### Announce
+```text
+Announce({ message })  →  broadcast, no reply expected
+```
+Lands on every other participant's queue as `[shared] <from>: <text>`.
+### Inbox format
+Every line a participant reads on a resume is one bus message rendered
+with its tag:
+```text
+[ask#42]     facilitator: What is your current condition?
+[answer#41]  agent-1:     We're at 7 out of 10.
+[shared]     agent-2:     FYI I'm switching to Bun 1.2.
+[system]     @orchestrator: You have an unanswered ask from facilitator (askId=42)…
+```
+The `[ask#N]` tag is what the participant quotes back in Answer's
+`askId` field.
+### Why async
+The lead can issue Asks, end its turn, and use the gap between turns
+for planning, reflection, or follow-up Asks while participants work
+in parallel. Nothing blocks the LLM thread waiting on a reply. The
+orchestrator wakes the lead whenever the inbox has new content.
+## The orchestration loop
+`OrchestrationLoop` runs one outer pattern for both the lead and each
+participant:
+1. Drain the bus queue, or wait for the first message.
+2. Run (first turn) or resume (every subsequent turn) the LLM with the
+   drained messages formatted as tagged lines.
+3. If the participant ended a turn with an unanswered Ask addressed to it,
+   inject one synthetic reminder and resume once more. If still
+   unanswered, emit a `protocol_violation` event and cancel the
+   pending entry with a synthetic null answer so the asker unblocks.
+The lead's first turn starts with the task as its initial prompt;
+participants' first runs are triggered by their first inbound message.
+Termination flips two flags:
+- `ctx.concluded` — explicit `Conclude` / `Adjourn` / `Recess`. The
+  handler also cancels any in-flight Asks with a synthetic null so
+  askers see why their question won't be answered.
+- `stopped` — broader: also true on a lead error, an agent crash, or
+  any abort path. Loops watch `stopped`; `ctx.concluded` is only used
+  for the summary's `success` / `verdict`.
+## Tool surface, by role
+| Role         | Ask | Answer | Announce | RollCall | Conclude | Other                                |
+| ------------ | --- | ------ | -------- | -------- | -------- | ------------------------------------ |
+| Facilitator  | ✓   | ✓      | ✓        | ✓        | ✓        |                                      |
+| Fac. agent   | ✓   | ✓      | ✓        | ✓        |          |                                      |
+| Supervisor   | ✓   | ✓      | ✓        | ✓        | ✓        |                                      |
+| Sup. agent   | ✓   | ✓      | ✓        | ✓        |          |                                      |
+| Discuss lead | ✓   | ✓      | ✓        | ✓        |          | `RequestForComment`, `Recess`, `Adjourn` |
+| Discuss agt  | ✓   | ✓      | ✓        | ✓        |          |                                      |
+| Judge        |     |        |          |          | ✓        |                                      |
+Ask's `to` accepts a participant name on multi-participant roles
+(facilitator, discuss lead, all participants); supervise's
+`supervisor` / `agent` pair doesn't accept `to` because there's only
+one possible target.
+## Minimal example: a two-participant facilitator
 ```js
-import { createTraceCollector, createTraceQuery, createAgentRunner } from '@forwardimpact/libeval';
+import { createFacilitator, createRedactor } from "@forwardimpact/libeval";
+import { query } from "@anthropic-ai/claude-agent-sdk";
+const facilitator = createFacilitator({
+  facilitatorCwd: process.cwd(),
+  agentConfigs: [
+    { name: "alice", role: "explorer", agentProfile: "alice" },
+    { name: "bob",   role: "tester",   agentProfile: "bob" },
+  ],
+  query,
+  output: process.stdout,
+  redactor: createRedactor(),
+  facilitatorProfile: "improvement-coach",
+});
+const result = await facilitator.run("Run a kata storyboard meeting.");
+// result.success / result.turns / NDJSON trace on process.stdout
 ```
+The facilitator's LLM, started with that task, has access to `Ask`,
+`Answer`, `Announce`, `RollCall`, and `Conclude`. Alice and Bob each
+get `Ask`, `Answer`, `Announce`, `RollCall`. Every tool call, every
+message routed through the bus, and every orchestrator event becomes a
+line in the trace.
+## Trace format
+Every line is one JSON object with three fields:
+```json
+{ "source": "facilitator", "seq": 42, "event": { … } }
+```
+- `source` — the participant whose LLM produced the line, or
+  `orchestrator` for loop-level events (`session_start`, `agent_start`,
+  `protocol_violation`, `lead_turn_limit`, `summary`).
+- `seq` — monotonically increasing across the whole trace; useful for
+  reconstructing the wall-clock order across concurrent participants.
+- `event` — the SDK event verbatim, or the orchestrator event payload.
+`fit-trace` consumes this format. See the trace analysis guide for the
+full schema.
 ## Trace redaction
 `fit-eval run`, `fit-eval supervise`, and `fit-eval facilitate` redact
@@ -21,14 +197,37 @@ secrets in trace artifacts before they reach disk. Two layers compose:
 - **Env-var allowlist**, defaulting to `ANTHROPIC_API_KEY`, `GH_TOKEN`,
   `GITHUB_TOKEN`. The runtime values of these vars are replaced with
   `[REDACTED:env:NAME]` wherever they appear in tool inputs, tool
-  outputs, assistant text, or orchestrator summaries. Override the list
-  with `LIBEVAL_REDACTION_ENV_VARS=NAME1,NAME2,…` (replaces, not extends).
-- **Credential-shape patterns**, covering Anthropic API keys (`sk-ant-`),
-  GitHub PATs (`ghp_`), installation tokens (`ghs_`), OAuth tokens
-  (`gho_`), and fine-grained PATs (`github_pat_`). Pattern hits become
-  `[REDACTED:pattern:KIND]`.
-Redaction is on by default. To disable, set `LIBEVAL_REDACTION_DISABLED=1`
-— a stderr warning fires once per run. Never set this in CI on a public
-repository: workflow artifacts there are downloadable through the
-retention window.
+  outputs, assistant text, or orchestrator summaries. Override the
+  list with `LIBEVAL_REDACTION_ENV_VARS=NAME1,NAME2,…` (replaces, not
+  extends).
+- **Credential-shape patterns**, covering Anthropic API keys
+  (`sk-ant-`), GitHub PATs (`ghp_`), installation tokens (`ghs_`),
+  OAuth tokens (`gho_`), and fine-grained PATs (`github_pat_`).
+  Pattern hits become `[REDACTED:pattern:KIND]`.
+Redaction is on by default. To disable, set
+`LIBEVAL_REDACTION_DISABLED=1` — a stderr warning fires once per run.
+Never set this in CI on a public repository: workflow artifacts there
+are downloadable through the retention window.
+## Module map
+| Module                       | Purpose                                                                 |
+| ---------------------------- | ----------------------------------------------------------------------- |
+| `agent-runner.js`            | One Claude Agent SDK session; emits NDJSON via the redactor.            |
+| `message-bus.js`             | In-memory per-participant queues + `waitForMessages` Promise wakeup.    |
+| `orchestration-toolkit.js`   | Shared Ask / Answer / Announce / Conclude / RollCall handlers + builders. |
+| `orchestration-loop.js`      | Unified lead+participant loop; reminder/violation handling.             |
+| `facilitator.js`             | `Facilitator` class + factory + system prompts.                         |
+| `supervisor.js`              | `Supervisor` class + factory + system prompts.                          |
+| `discuss-tools.js`           | Discuss-only RequestForComment / Recess / Adjourn handlers + tool servers. |
+| `discusser.js`               | `Discusser` class + factory + system prompt + resume hydration.         |
+| `judge.js`                   | One-shot post-hoc verdict via `Conclude`.                               |
+| `trace-collector.js` / `trace-query.js` / `trace-github.js` | Trace ingestion / querying / GitHub-attachment helpers. |
+| `redaction.js`               | Env-var allowlist + credential-shape pattern redaction.                 |
+## Documentation
+- [Agent Evaluations Guide](https://www.forwardimpact.team/docs/libraries/agent-evaluations/index.md) — how to run an eval and read its trace.
+- [Agent Collaboration Guide](https://www.forwardimpact.team/docs/libraries/agent-collaboration/index.md) — supervise / facilitate / discuss in depth.
+- [Trace Analysis Guide](https://www.forwardimpact.team/docs/libraries/trace-analysis/index.md) — analysing NDJSON traces with `fit-trace`.

package/bin/fit-benchmark.js CHANGED Viewed

@@ -46,10 +46,10 @@ export const definition = {
           description:
             "Claude model for the agent-under-test (default: claude-sonnet-4-6)",
         },
-        "supervisor-model": {
+        "lead-model": {
           type: "string",
           description:
-            "Claude model for the supervisor (default: claude-opus-4-7)",
+            "Claude model for the lead role (default: claude-opus-4-7)",
         },
         "judge-model": {
           type: "string",

package/bin/fit-eval.js CHANGED Viewed

@@ -9,6 +9,8 @@ import { runTeeCommand } from "../src/commands/tee.js";
 import { runRunCommand } from "../src/commands/run.js";
 import { runSuperviseCommand } from "../src/commands/supervise.js";
 import { runFacilitateCommand } from "../src/commands/facilitate.js";
+import { runDiscussCommand } from "../src/commands/discuss.js";
+import { runCallbackCommand } from "../src/commands/callback.js";
 // `bun build --compile` injects FIT_EVAL_VERSION via --define, eliminating
 // the readFileSync branch in the compiled binary (which would ENOENT against
@@ -18,6 +20,18 @@ const VERSION =
   JSON.parse(readFileSync(new URL("../package.json", import.meta.url), "utf8"))
     .version;
+const LEAD_OPTIONS = {
+  "lead-profile": {
+    type: "string",
+    description: "Lead role profile name (supervisor / facilitator / chair)",
+  },
+  "lead-model": {
+    type: "string",
+    description:
+      "Claude model for the lead role (default: claude-opus-4-7[1m])",
+  },
+};
 const definition = {
   name: "fit-eval",
   version: VERSION,
@@ -93,11 +107,7 @@ const definition = {
           description:
             "Claude model for the agent (default: claude-opus-4-7[1m])",
         },
-        "supervisor-model": {
-          type: "string",
-          description:
-            "Claude model for the supervisor (default: claude-opus-4-7[1m])",
-        },
+        ...LEAD_OPTIONS,
         "max-turns": {
           type: "string",
           description:
@@ -117,10 +127,6 @@ const definition = {
           description: "Supervisor working directory",
         },
         "agent-cwd": { type: "string", description: "Agent working directory" },
-        "supervisor-profile": {
-          type: "string",
-          description: "Supervisor (judge) profile name",
-        },
         "supervisor-allowed-tools": {
           type: "string",
           description: "Supervisor tool allowlist",
@@ -154,11 +160,7 @@ const definition = {
           type: "string",
           description: "Claude model for agents (default: claude-opus-4-7[1m])",
         },
-        "facilitator-model": {
-          type: "string",
-          description:
-            "Claude model for the facilitator (default: claude-opus-4-7[1m])",
-        },
+        ...LEAD_OPTIONS,
         "max-turns": {
           type: "string",
           description: "Max agentic turns (default: 20, 0 = unlimited)",
@@ -171,10 +173,6 @@ const definition = {
           type: "string",
           description: "Facilitator working directory",
         },
-        "facilitator-profile": {
-          type: "string",
-          description: "Facilitator profile name",
-        },
         "agent-profiles": {
           type: "string",
           description:
@@ -186,6 +184,56 @@ const definition = {
         },
       },
     },
+    {
+      name: "discuss",
+      args: "",
+      description:
+        "Run an async, suspendable discussion — Chair + N participants + bridge callback",
+      options: {
+        "task-file": {
+          type: "string",
+          description: "Path to a markdown task file",
+        },
+        "task-text": {
+          type: "string",
+          description: "Inline task text (alternative to --task-file)",
+        },
+        "task-amend": {
+          type: "string",
+          description: "Additional text appended to the task",
+        },
+        "agent-model": {
+          type: "string",
+          description: "Claude model for agents (default: claude-opus-4-7[1m])",
+        },
+        ...LEAD_OPTIONS,
+        "max-turns": {
+          type: "string",
+          description: "Max agentic turns (default: 40, 0 = unlimited)",
+        },
+        output: {
+          type: "string",
+          description: "Write the NDJSON trace to a file",
+        },
+        "agent-profiles": {
+          type: "string",
+          description: "Comma-separated participant profile names (optional)",
+        },
+        "agent-cwd": {
+          type: "string",
+          description: "Working directory shared by participants (default: .)",
+        },
+        "discussion-id": {
+          type: "string",
+          description:
+            "Stable id for the threaded conversation; carried through traces for linking",
+        },
+        "resume-context": {
+          type: "string",
+          description: "JSON-serialized prior state for a resumed run",
+        },
+      },
+    },
     {
       name: "output",
       args: "",
@@ -198,6 +246,35 @@ const definition = {
       description:
         "Stream readable text to stdout while saving raw NDJSON to a file",
     },
+    {
+      name: "callback",
+      args: "",
+      description:
+        "Extract the terminal summary from an NDJSON trace and POST it to a callback URL",
+      options: {
+        "trace-file": {
+          type: "string",
+          description: "Path to the NDJSON trace file",
+        },
+        "callback-url": {
+          type: "string",
+          description: "URL to POST the summary to",
+        },
+        "correlation-id": {
+          type: "string",
+          description: "Correlation ID to include in the payload",
+        },
+        "run-url": {
+          type: "string",
+          description: "GitHub Actions run URL (optional)",
+        },
+        "discussion-id": {
+          type: "string",
+          description:
+            "Discussion id (fallback when the trace lacks a meta event)",
+        },
+      },
+    },
   ],
   globalOptions: {
     format: { type: "string", description: "Output format (json|text)" },
@@ -207,8 +284,9 @@ const definition = {
   },
   examples: [
     "fit-eval run --task-file=task.md --output=trace.ndjson",
-    "fit-eval supervise --task-file=task.md --supervisor-profile=judge --agent-profile=coder --output=trace.ndjson",
-    'fit-eval facilitate --task-file=task.md --facilitator-profile=lead --agent-profiles="security-engineer,technical-writer" --output=trace.ndjson',
+    "fit-eval supervise --task-file=task.md --lead-profile=judge --agent-profile=coder --output=trace.ndjson",
+    'fit-eval facilitate --task-file=task.md --lead-profile=lead --agent-profiles="security-engineer,technical-writer" --output=trace.ndjson',
+    'fit-eval discuss --task-file=task.md --lead-profile=release-engineer --agent-profiles="staff-engineer,security-engineer" --discussion-id=GD_kw...',
     "fit-eval output --format=text < trace.ndjson",
   ],
   documentation: [
@@ -234,7 +312,7 @@ const definition = {
       title: "Agent Teams",
       url: "https://www.forwardimpact.team/docs/products/agent-teams/index.md",
       description:
-        "How to author the agent, supervisor, and facilitator profiles consumed by --agent-profile, --supervisor-profile, --facilitator-profile, and --agent-profiles.",
+        "How to author the profiles consumed by --agent-profile, --lead-profile, and --agent-profiles.",
     },
   ],
 };
@@ -248,6 +326,8 @@ const COMMANDS = {
   run: runRunCommand,
   supervise: runSuperviseCommand,
   facilitate: runFacilitateCommand,
+  discuss: runDiscussCommand,
+  callback: runCallbackCommand,
 };
 async function main() {

package/bin/fit-trace.js CHANGED Viewed

@@ -26,6 +26,7 @@ import {
   runSplitCommand,
 } from "../src/commands/trace.js";
 import { runAssertCommand } from "../src/commands/assert.js";
+import { runByDiscussionCommand } from "../src/commands/by-discussion.js";
 // `bun build --compile` injects FIT_TRACE_VERSION via --define, eliminating
 // the readFileSync branch in the compiled binary (which would ENOENT against
@@ -160,6 +161,18 @@ const definition = {
       args: "<file> <index>",
       description: "Single turn by index",
     },
+    {
+      name: "by-discussion",
+      args: "<discussion-id> [trace-dir]",
+      description:
+        "List trace files whose meta header carries the given discussion_id, ordered by first-event timestamp",
+      options: {
+        "trace-dir": {
+          type: "string",
+          description: "Directory to scan (default: traces)",
+        },
+      },
+    },
     {
       name: "filter",
       args: "<file>",
@@ -307,6 +320,7 @@ const COMMANDS = {
   filter: runFilterCommand,
   split: runSplitCommand,
   assert: runAssertCommand,
+  "by-discussion": runByDiscussionCommand,
 };
 async function main() {

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@forwardimpact/libeval",
-  "version": "0.1.43",
+  "version": "0.1.45",
   "description": "Agent evaluation framework — prove whether agent changes improved outcomes with reproducible evidence.",
   "keywords": [
     "eval",