npm - @forwardimpact/libeval - Versions diffs - 0.1.41 → 0.1.43 - Mend

@forwardimpact/libeval 0.1.41 → 0.1.43

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (6)

package/bin/fit-eval.js +2 -1
package/package.json +1 -1
package/src/commands/facilitate.js +14 -6
package/src/commands/supervise.js +1 -1
package/src/facilitator.js +22 -1
package/src/supervisor.js +20 -5

package/bin/fit-eval.js CHANGED Viewed

@@ -100,7 +100,8 @@ const definition = {
         },
         "max-turns": {
           type: "string",
-          description: "Max agentic turns (default: 20, 0 = unlimited)",
+          description:
+            "Max agentic turns per runner invocation (default: 200, 0 = unlimited)",
         },
         output: {
           type: "string",

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@forwardimpact/libeval",
-  "version": "0.1.41",
+  "version": "0.1.43",
   "description": "Agent evaluation framework — prove whether agent changes improved outcomes with reproducible evidence.",
   "keywords": [
     "eval",

package/src/commands/facilitate.js CHANGED Viewed

@@ -10,19 +10,21 @@ import { createTeeWriter } from "../tee-writer.js";
  * @param {string} cwd - Shared working directory for all agents
  * @returns {Array<{name: string, role: string, cwd: string, agentProfile: string}>}
  */
-function parseAgentProfiles(raw, cwd) {
+function parseAgentProfiles(raw, cwd, maxTurns) {
   return raw.split(",").map((entry) => {
     const name = entry.trim();
-    return { name, role: name, cwd, agentProfile: name };
+    return { name, role: name, cwd, agentProfile: name, maxTurns };
   });
 }
 /**
- * Parse and validate facilitate command options.
+ * Parse and validate facilitate command options. Exported for test
+ * coverage of the `--max-turns` → per-agent threading contract; not part
+ * of the package's public API.
  * @param {object} values - Parsed option values
  * @returns {object} Parsed options
  */
-function parseFacilitateOptions(values) {
+export function parseFacilitateOptions(values) {
   const taskFile = values["task-file"];
   const taskText = values["task-text"];
   if (taskFile && taskText)
@@ -36,9 +38,15 @@ function parseFacilitateOptions(values) {
   const profilesRaw = values["agent-profiles"];
   if (!profilesRaw) throw new Error("--agent-profiles is required");
   const agentCwd = resolve(values["agent-cwd"] ?? ".");
-  const agentConfigs = parseAgentProfiles(profilesRaw, agentCwd);
   const maxTurnsRaw = values["max-turns"] ?? "20";
+  const maxTurns = maxTurnsRaw === "0" ? 0 : parseInt(maxTurnsRaw, 10);
+  // Thread --max-turns into each participant: without this, every facilitated
+  // agent silently falls back to the 50-turn default in facilitator.js even
+  // when the caller raises the budget. Observed in run 26078312414 where
+  // staff-engineer terminated at 51 turns despite --max-turns=200.
+  const agentConfigs = parseAgentProfiles(profilesRaw, agentCwd, maxTurns);
   return {
     taskContent,
@@ -47,7 +55,7 @@ function parseFacilitateOptions(values) {
     facilitatorCwd: resolve(values["facilitator-cwd"] ?? "."),
     agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
     facilitatorModel: values["facilitator-model"] ?? "claude-opus-4-7[1m]",
-    maxTurns: maxTurnsRaw === "0" ? 0 : parseInt(maxTurnsRaw, 10),
+    maxTurns,
     outputPath: values.output,
     facilitatorProfile: values["facilitator-profile"] ?? undefined,
   };

package/src/commands/supervise.js CHANGED Viewed

@@ -35,7 +35,7 @@ function parseSuperviseOptions(values) {
     agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
     supervisorModel: values["supervisor-model"] ?? "claude-opus-4-7[1m]",
     maxTurns: (() => {
-      const raw = values["max-turns"] ?? "20";
+      const raw = values["max-turns"] ?? "200";
       return raw === "0" ? 0 : parseInt(raw, 10);
     })(),
     outputPath: values.output,

package/src/facilitator.js CHANGED Viewed

@@ -393,7 +393,9 @@ const devNull = new Writable({
  * @param {string} [deps.model] - Default model for all participants.
  * @param {string} [deps.agentModel] - Agent model override (falls back to `model`).
  * @param {string} [deps.facilitatorModel] - Facilitator model override (falls back to `model`).
- * @param {number} [deps.maxTurns]
+ * @param {number} [deps.maxTurns] - Facilitator's own per-invocation turn budget (default 20). Each participating agent's budget is taken from `config.maxTurns` on its entry in `agentConfigs` (default 50 when unset). The CLI command (`commands/facilitate.js`) threads `--max-turns` into both this parameter and every agent config so a single CLI value bounds all participants uniformly.
+ * @param {string[]} [deps.facilitatorAllowedTools] - Tools the facilitator may use; defaults to a read/write file-edit set.
+ * @param {string[]} [deps.facilitatorDisallowedTools] - Additional tools to block on the facilitator; merged with the sub-agent spawn defaults (Agent/Task/TaskOutput/TaskStop).
  * @param {string} [deps.facilitatorProfile] - Facilitator profile name; resolved into the main-thread system prompt via `composeProfilePrompt`.
  * @param {string} [deps.profilesDir] - Directory containing `<name>.md` profile files. Defaults to `<facilitatorCwd>/.claude/agents`. Resolved once from the facilitator's cwd so profiles travel with the project, not with per-agent sandboxes.
  * @param {string} [deps.taskAmend] - Opaque addendum appended to the task before delivery.
@@ -408,6 +410,8 @@ export function createFacilitator({
   agentModel,
   facilitatorModel,
   maxTurns,
+  facilitatorAllowedTools,
+  facilitatorDisallowedTools,
   facilitatorProfile,
   profilesDir,
   taskAmend,
@@ -467,12 +471,29 @@ export function createFacilitator({
     return { name: config.name, role: config.role, runner };
   });
+  // Block the SDK's sub-agent spawn tools on the facilitator: its job is to
+  // coordinate participants through the libeval orchestration harness, not
+  // to fan work out to ad-hoc Claude Code sub-agents. Mirrors the supervisor.
+  const defaultDisallowed = ["Agent", "Task", "TaskOutput", "TaskStop"];
+  const disallowedTools = facilitatorDisallowedTools
+    ? [...new Set([...defaultDisallowed, ...facilitatorDisallowedTools])]
+    : defaultDisallowed;
   const facilitatorRunner = createAgentRunner({
     cwd: facilitatorCwd,
     query,
     output: devNull,
     model: facilitatorModel ?? model,
     maxTurns: maxTurns ?? 20,
+    allowedTools: facilitatorAllowedTools ?? [
+      "Bash",
+      "Read",
+      "Glob",
+      "Grep",
+      "Write",
+      "Edit",
+    ],
+    disallowedTools,
     onLine: (line) => facilitator.emitLine("facilitator", line),
     mcpServers: { orchestration: facilitatorServer },
     settingSources: ["project"],

package/src/supervisor.js CHANGED Viewed

@@ -50,10 +50,17 @@ export const AGENT_SYSTEM_PROMPT =
  * Maximum number of mid-turn interventions allowed within a single agent turn.
  * Bounded so a looping supervisor exhausts its quota fast (observability) but
  * leaves headroom for legitimate "intervene, observe, intervene again" patterns.
- * The outer maxTurns budget still bounds overall runtime.
+ * The outer exchange budget still bounds overall runtime.
  */
 const MAX_INTERVENTIONS_PER_TURN = 5;
+/**
+ * Default cap on supervisor↔agent exchanges in a single run. Not exposed via
+ * CLI — `--max-turns` governs the per-runner invocation budget instead. When
+ * a `--max-exchanges` flag is added this becomes the default for that flag.
+ */
+const DEFAULT_MAX_EXCHANGES = 100;
 /** Orchestrate a relay loop between a supervisor LLM and an agent LLM with mid-turn review. */
 export class Supervisor {
   /**
@@ -485,7 +492,7 @@ const devNull = new Writable({
  * @param {string} [deps.model] - Default model for both runners.
  * @param {string} [deps.agentModel] - Agent model override (falls back to `model`).
  * @param {string} [deps.supervisorModel] - Supervisor model override (falls back to `model`).
- * @param {number} [deps.maxTurns]
+ * @param {number} [deps.maxTurns] - Per-runner invocation budget for both the supervisor and the agent (default 200; 0 = unlimited). Outer supervisor↔agent exchanges are bounded separately by `DEFAULT_MAX_EXCHANGES` (passes through to unlimited when `maxTurns === 0`).
  * @param {string[]} [deps.allowedTools]
  * @param {string[]} [deps.supervisorAllowedTools]
  * @param {string[]} [deps.supervisorDisallowedTools]
@@ -544,8 +551,13 @@ export function createSupervisor({
   const onLine = (line) => supervisor.emitLine(line);
-  const perInvocationTurns =
-    maxTurns === 0 ? 0 : Math.max(maxTurns ?? 100, 200);
+  // `maxTurns` is the per-runner invocation budget — matches `run` and
+  // `facilitate` semantics. The outer supervisor↔agent exchange loop is
+  // bounded separately by `DEFAULT_MAX_EXCHANGES`; when --max-exchanges is
+  // added it will become a parameter. `maxTurns === 0` propagates through
+  // to mean unlimited on both axes.
+  const perInvocationTurns = maxTurns ?? 200;
+  const exchangeBudget = maxTurns === 0 ? 0 : DEFAULT_MAX_EXCHANGES;
   const agentRunner = createAgentRunner({
     cwd: agentCwd,
@@ -561,6 +573,9 @@ export function createSupervisor({
     redactor,
   });
+  // Block the SDK's sub-agent spawn tools on the supervisor: its job is to
+  // coordinate the agent through the libeval orchestration harness, not to
+  // fan work out to ad-hoc Claude Code sub-agents. Mirrors the facilitator.
   const defaultDisallowed = ["Agent", "Task", "TaskOutput", "TaskStop"];
   const disallowedTools = supervisorDisallowedTools
     ? [...new Set([...defaultDisallowed, ...supervisorDisallowedTools])]
@@ -592,7 +607,7 @@ export function createSupervisor({
     agentRunner,
     supervisorRunner,
     output,
-    maxTurns,
+    maxTurns: exchangeBudget,
     ctx,
     messageBus,
     taskAmend,