@forwardimpact/libeval 0.1.41 → 0.1.43

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/bin/fit-eval.js CHANGED
@@ -100,7 +100,8 @@ const definition = {
100
100
  },
101
101
  "max-turns": {
102
102
  type: "string",
103
- description: "Max agentic turns (default: 20, 0 = unlimited)",
103
+ description:
104
+ "Max agentic turns per runner invocation (default: 200, 0 = unlimited)",
104
105
  },
105
106
  output: {
106
107
  type: "string",
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@forwardimpact/libeval",
3
- "version": "0.1.41",
3
+ "version": "0.1.43",
4
4
  "description": "Agent evaluation framework — prove whether agent changes improved outcomes with reproducible evidence.",
5
5
  "keywords": [
6
6
  "eval",
package/src/commands/facilitate.js CHANGED
@@ -10,19 +10,21 @@ import { createTeeWriter } from "../tee-writer.js";
10
10
  * @param {string} cwd - Shared working directory for all agents
11
11
  * @returns {Array<{name: string, role: string, cwd: string, agentProfile: string}>}
12
12
  */
13
- function parseAgentProfiles(raw, cwd) {
13
+ function parseAgentProfiles(raw, cwd, maxTurns) {
14
14
  return raw.split(",").map((entry) => {
15
15
  const name = entry.trim();
16
- return { name, role: name, cwd, agentProfile: name };
16
+ return { name, role: name, cwd, agentProfile: name, maxTurns };
17
17
  });
18
18
  }
19
19
 
20
20
  /**
21
- * Parse and validate facilitate command options.
21
+ * Parse and validate facilitate command options. Exported for test
22
+ * coverage of the `--max-turns` → per-agent threading contract; not part
23
+ * of the package's public API.
22
24
  * @param {object} values - Parsed option values
23
25
  * @returns {object} Parsed options
24
26
  */
25
- function parseFacilitateOptions(values) {
27
+ export function parseFacilitateOptions(values) {
26
28
  const taskFile = values["task-file"];
27
29
  const taskText = values["task-text"];
28
30
  if (taskFile && taskText)
@@ -36,9 +38,15 @@ function parseFacilitateOptions(values) {
36
38
  const profilesRaw = values["agent-profiles"];
37
39
  if (!profilesRaw) throw new Error("--agent-profiles is required");
38
40
  const agentCwd = resolve(values["agent-cwd"] ?? ".");
39
- const agentConfigs = parseAgentProfiles(profilesRaw, agentCwd);
40
41
 
41
42
  const maxTurnsRaw = values["max-turns"] ?? "20";
43
+ const maxTurns = maxTurnsRaw === "0" ? 0 : parseInt(maxTurnsRaw, 10);
44
+
45
+ // Thread --max-turns into each participant: without this, every facilitated
46
+ // agent silently falls back to the 50-turn default in facilitator.js even
47
+ // when the caller raises the budget. Observed in run 26078312414 where
48
+ // staff-engineer terminated at 51 turns despite --max-turns=200.
49
+ const agentConfigs = parseAgentProfiles(profilesRaw, agentCwd, maxTurns);
42
50
 
43
51
  return {
44
52
  taskContent,
@@ -47,7 +55,7 @@ function parseFacilitateOptions(values) {
47
55
  facilitatorCwd: resolve(values["facilitator-cwd"] ?? "."),
48
56
  agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
49
57
  facilitatorModel: values["facilitator-model"] ?? "claude-opus-4-7[1m]",
50
- maxTurns: maxTurnsRaw === "0" ? 0 : parseInt(maxTurnsRaw, 10),
58
+ maxTurns,
51
59
  outputPath: values.output,
52
60
  facilitatorProfile: values["facilitator-profile"] ?? undefined,
53
61
  };
package/src/commands/supervise.js CHANGED
@@ -35,7 +35,7 @@ function parseSuperviseOptions(values) {
35
35
  agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
36
36
  supervisorModel: values["supervisor-model"] ?? "claude-opus-4-7[1m]",
37
37
  maxTurns: (() => {
38
- const raw = values["max-turns"] ?? "20";
38
+ const raw = values["max-turns"] ?? "200";
39
39
  return raw === "0" ? 0 : parseInt(raw, 10);
40
40
  })(),
41
41
  outputPath: values.output,
package/src/facilitator.js CHANGED
@@ -393,7 +393,9 @@ const devNull = new Writable({
393
393
  * @param {string} [deps.model] - Default model for all participants.
394
394
  * @param {string} [deps.agentModel] - Agent model override (falls back to `model`).
395
395
  * @param {string} [deps.facilitatorModel] - Facilitator model override (falls back to `model`).
396
- * @param {number} [deps.maxTurns]
396
+ * @param {number} [deps.maxTurns] - Facilitator's own per-invocation turn budget (default 20). Each participating agent's budget is taken from `config.maxTurns` on its entry in `agentConfigs` (default 50 when unset). The CLI command (`commands/facilitate.js`) threads `--max-turns` into both this parameter and every agent config so a single CLI value bounds all participants uniformly.
397
+ * @param {string[]} [deps.facilitatorAllowedTools] - Tools the facilitator may use; defaults to a read/write file-edit set.
398
+ * @param {string[]} [deps.facilitatorDisallowedTools] - Additional tools to block on the facilitator; merged with the sub-agent spawn defaults (Agent/Task/TaskOutput/TaskStop).
397
399
  * @param {string} [deps.facilitatorProfile] - Facilitator profile name; resolved into the main-thread system prompt via `composeProfilePrompt`.
398
400
  * @param {string} [deps.profilesDir] - Directory containing `<name>.md` profile files. Defaults to `<facilitatorCwd>/.claude/agents`. Resolved once from the facilitator's cwd so profiles travel with the project, not with per-agent sandboxes.
399
401
  * @param {string} [deps.taskAmend] - Opaque addendum appended to the task before delivery.
@@ -408,6 +410,8 @@ export function createFacilitator({
408
410
  agentModel,
409
411
  facilitatorModel,
410
412
  maxTurns,
413
+ facilitatorAllowedTools,
414
+ facilitatorDisallowedTools,
411
415
  facilitatorProfile,
412
416
  profilesDir,
413
417
  taskAmend,
@@ -467,12 +471,29 @@ export function createFacilitator({
467
471
  return { name: config.name, role: config.role, runner };
468
472
  });
469
473
 
474
+ // Block the SDK's sub-agent spawn tools on the facilitator: its job is to
475
+ // coordinate participants through the libeval orchestration harness, not
476
+ // to fan work out to ad-hoc Claude Code sub-agents. Mirrors the supervisor.
477
+ const defaultDisallowed = ["Agent", "Task", "TaskOutput", "TaskStop"];
478
+ const disallowedTools = facilitatorDisallowedTools
479
+ ? [...new Set([...defaultDisallowed, ...facilitatorDisallowedTools])]
480
+ : defaultDisallowed;
481
+
470
482
  const facilitatorRunner = createAgentRunner({
471
483
  cwd: facilitatorCwd,
472
484
  query,
473
485
  output: devNull,
474
486
  model: facilitatorModel ?? model,
475
487
  maxTurns: maxTurns ?? 20,
488
+ allowedTools: facilitatorAllowedTools ?? [
489
+ "Bash",
490
+ "Read",
491
+ "Glob",
492
+ "Grep",
493
+ "Write",
494
+ "Edit",
495
+ ],
496
+ disallowedTools,
476
497
  onLine: (line) => facilitator.emitLine("facilitator", line),
477
498
  mcpServers: { orchestration: facilitatorServer },
478
499
  settingSources: ["project"],
package/src/supervisor.js CHANGED
@@ -50,10 +50,17 @@ export const AGENT_SYSTEM_PROMPT =
50
50
  * Maximum number of mid-turn interventions allowed within a single agent turn.
51
51
  * Bounded so a looping supervisor exhausts its quota fast (observability) but
52
52
  * leaves headroom for legitimate "intervene, observe, intervene again" patterns.
53
- * The outer maxTurns budget still bounds overall runtime.
53
+ * The outer exchange budget still bounds overall runtime.
54
54
  */
55
55
  const MAX_INTERVENTIONS_PER_TURN = 5;
56
56
 
57
+ /**
58
+ * Default cap on supervisor↔agent exchanges in a single run. Not exposed via
59
+ * CLI — `--max-turns` governs the per-runner invocation budget instead. When
60
+ * a `--max-exchanges` flag is added this becomes the default for that flag.
61
+ */
62
+ const DEFAULT_MAX_EXCHANGES = 100;
63
+
57
64
  /** Orchestrate a relay loop between a supervisor LLM and an agent LLM with mid-turn review. */
58
65
  export class Supervisor {
59
66
  /**
@@ -485,7 +492,7 @@ const devNull = new Writable({
485
492
  * @param {string} [deps.model] - Default model for both runners.
486
493
  * @param {string} [deps.agentModel] - Agent model override (falls back to `model`).
487
494
  * @param {string} [deps.supervisorModel] - Supervisor model override (falls back to `model`).
488
- * @param {number} [deps.maxTurns]
495
+ * @param {number} [deps.maxTurns] - Per-runner invocation budget for both the supervisor and the agent (default 200; 0 = unlimited). Outer supervisor↔agent exchanges are bounded separately by `DEFAULT_MAX_EXCHANGES` (passes through to unlimited when `maxTurns === 0`).
489
496
  * @param {string[]} [deps.allowedTools]
490
497
  * @param {string[]} [deps.supervisorAllowedTools]
491
498
  * @param {string[]} [deps.supervisorDisallowedTools]
@@ -544,8 +551,13 @@ export function createSupervisor({
544
551
 
545
552
  const onLine = (line) => supervisor.emitLine(line);
546
553
 
547
- const perInvocationTurns =
548
- maxTurns === 0 ? 0 : Math.max(maxTurns ?? 100, 200);
554
+ // `maxTurns` is the per-runner invocation budget — matches `run` and
555
+ // `facilitate` semantics. The outer supervisor↔agent exchange loop is
556
+ // bounded separately by `DEFAULT_MAX_EXCHANGES`; when --max-exchanges is
557
+ // added it will become a parameter. `maxTurns === 0` propagates through
558
+ // to mean unlimited on both axes.
559
+ const perInvocationTurns = maxTurns ?? 200;
560
+ const exchangeBudget = maxTurns === 0 ? 0 : DEFAULT_MAX_EXCHANGES;
549
561
 
550
562
  const agentRunner = createAgentRunner({
551
563
  cwd: agentCwd,
@@ -561,6 +573,9 @@ export function createSupervisor({
561
573
  redactor,
562
574
  });
563
575
 
576
+ // Block the SDK's sub-agent spawn tools on the supervisor: its job is to
577
+ // coordinate the agent through the libeval orchestration harness, not to
578
+ // fan work out to ad-hoc Claude Code sub-agents. Mirrors the facilitator.
564
579
  const defaultDisallowed = ["Agent", "Task", "TaskOutput", "TaskStop"];
565
580
  const disallowedTools = supervisorDisallowedTools
566
581
  ? [...new Set([...defaultDisallowed, ...supervisorDisallowedTools])]
@@ -592,7 +607,7 @@ export function createSupervisor({
592
607
  agentRunner,
593
608
  supervisorRunner,
594
609
  output,
595
- maxTurns,
610
+ maxTurns: exchangeBudget,
596
611
  ctx,
597
612
  messageBus,
598
613
  taskAmend,