npm - @forwardimpact/libeval - Versions diffs - 0.1.6 → 0.1.9 - Mend

@forwardimpact/libeval 0.1.6 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

package/bin/fit-eval.js +2 -2
package/index.js +2 -0
package/package.json +1 -1
package/src/agent-runner.js +178 -43
package/src/commands/run.js +43 -18
package/src/commands/supervise.js +59 -37
package/src/supervisor.js +298 -59
package/test/agent-runner-batching.test.js +271 -0
package/test/mock-runner.js +113 -0
package/test/supervisor-batching.test.js +175 -0
package/test/supervisor-intervention.test.js +365 -0
package/test/{supervisor.test.js → supervisor-output.test.js} +121 -306
package/test/supervisor-run.test.js +310 -0

package/test/agent-runner-batching.test.js ADDED Viewed

@@ -0,0 +1,271 @@
+import { describe, test } from "node:test";
+import assert from "node:assert";
+import { PassThrough } from "node:stream";
+import { AgentRunner } from "@forwardimpact/libeval";
+/**
+ * Build a stand-in for the `query` option handed to AgentRunner: a
+ * function that, when called, replays a fixed list of messages.
+ * @param {object[]} messages - Messages to emit, in order
+ * @returns {function} Zero-argument async generator function
+ */
+function mockQuery(messages) {
+  return async function* () {
+    // Delegate to the array's iterator; each element is yielded in turn.
+    yield* messages;
+  };
+}
+// Assistant message whose content is a single text block holding `t`.
+const textBlock = (t) => {
+  const block = { type: "text", text: t };
+  return { type: "assistant", message: { content: [block] } };
+};
+// Assistant message whose content is a single tool_use block (no text).
+// The block id is derived from the tool name.
+const toolOnly = (name) => {
+  const call = { type: "tool_use", id: `tu_${name}`, name, input: {} };
+  return { type: "assistant", message: { content: [call] } };
+};
+describe("AgentRunner - onBatch batching", () => {
+  // Runner over a scripted message list. Extra constructor options (such
+  // as batchSize) are appended after the common ones.
+  const scriptedRunner = (messages, extra = {}) =>
+    new AgentRunner({
+      cwd: "/tmp",
+      query: mockQuery(messages),
+      output: new PassThrough(),
+      ...extra,
+    });
+  // Install an onBatch hook that stores every flush as parsed messages.
+  const recordParsed = (runner) => {
+    const flushes = [];
+    runner.onBatch = async (lines) => {
+      flushes.push(lines.map((l) => JSON.parse(l)));
+    };
+    return flushes;
+  };
+  // Install an onBatch hook that stores only the line count of each flush.
+  const recordSizes = (runner) => {
+    const sizes = [];
+    runner.onBatch = async (lines) => {
+      sizes.push(lines.length);
+    };
+    return sizes;
+  };
+  const doneResult = () => ({
+    type: "result",
+    subtype: "success",
+    result: "Done.",
+  });
+  test("batchSize defaults to 3", () => {
+    const { batchSize } = new AgentRunner({
+      cwd: "/tmp",
+      query: async function* () {},
+      output: new PassThrough(),
+    });
+    assert.strictEqual(batchSize, 3);
+  });
+  test("onBatch fires every 3 assistant text-block messages by default", async () => {
+    // Script: init, five text-block messages, terminal result. The default
+    // batchSize of 3 should give one flush at the third text message and a
+    // second at the result, which sweeps up the remaining two.
+    const runner = scriptedRunner([
+      { type: "system", subtype: "init", session_id: "sess-batch" },
+      ...["one", "two", "three", "four", "five"].map((t) => textBlock(t)),
+      doneResult(),
+    ]);
+    const flushes = recordParsed(runner);
+    await runner.run("Task");
+    // Flush 1 = init + 3 text messages; flush 2 = 2 text messages + result.
+    assert.deepStrictEqual(
+      flushes.map((f) => f.length),
+      [4, 3],
+    );
+  });
+  test("onBatch honours custom batchSize", async () => {
+    // With batchSize = 2, four text messages give two flushes and the
+    // result adds a third.
+    const runner = scriptedRunner(
+      [...["a", "b", "c", "d"].map((t) => textBlock(t)), doneResult()],
+      { batchSize: 2 },
+    );
+    const sizes = recordSizes(runner);
+    await runner.run("Task");
+    assert.deepStrictEqual(sizes, [2, 2, 1]);
+  });
+  test("tool-only assistant messages ride along in the next flush", async () => {
+    // Tool-only assistant messages pile up without advancing the counter,
+    // so they are delivered together with the text message that finally
+    // triggers the flush.
+    const runner = scriptedRunner(
+      [
+        toolOnly("Read"),
+        toolOnly("Grep"),
+        textBlock("found it"),
+        doneResult(),
+      ],
+      { batchSize: 1 },
+    );
+    const flushes = recordParsed(runner);
+    await runner.run("Task");
+    // Flush 1 is triggered by the lone text-block message and carries the
+    // two tool-only messages ahead of it; flush 2 is the result alone.
+    assert.strictEqual(flushes.length, 2);
+    const [first, last] = flushes;
+    assert.deepStrictEqual(
+      first.map((m) => m.message.content[0].type),
+      ["tool_use", "tool_use", "text"],
+    );
+    assert.strictEqual(last.length, 1);
+    assert.strictEqual(last[0].type, "result");
+  });
+  test("terminal result always flushes even if batchSize not yet reached", async () => {
+    // One text-block message plus the result with batchSize = 5: the
+    // counter stops at 1, yet the terminal result must still flush both.
+    const runner = scriptedRunner([textBlock("only one"), doneResult()], {
+      batchSize: 5,
+    });
+    const sizes = recordSizes(runner);
+    await runner.run("Task");
+    assert.deepStrictEqual(sizes, [2]);
+  });
+});
+describe("AgentRunner - terminal flush on abnormal end", () => {
+  // Async generator function that yields `messages` and then throws an
+  // Error carrying `reason`.
+  const failingAfter = (messages, reason) =>
+    async function* () {
+      yield* messages;
+      throw new Error(reason);
+    };
+  // Runner whose query invokes `generatorFn` afresh on each call.
+  const runnerOver = (generatorFn, extra = {}) =>
+    new AgentRunner({
+      cwd: "/tmp",
+      query: () => generatorFn(),
+      output: new PassThrough(),
+      ...extra,
+    });
+  test("iterator crash before a flush boundary still delivers the pending batch", async () => {
+    // Default batchSize: init plus two text messages stay below the
+    // threshold, then the iterator throws. Whatever was pending has to be
+    // delivered in one terminal flush.
+    const runner = runnerOver(
+      failingAfter(
+        [
+          { type: "system", subtype: "init", session_id: "sess-crash" },
+          textBlock("step 1"),
+          textBlock("step 2"),
+        ],
+        "Claude Code process exited with code 1",
+      ),
+    );
+    const flushes = [];
+    runner.onBatch = async (lines) => {
+      flushes.push(lines.map((l) => JSON.parse(l)));
+    };
+    const result = await runner.run("Task");
+    assert.ok(result.error);
+    assert.match(result.error.message, /exited with code 1/);
+    assert.strictEqual(flushes.length, 1);
+    assert.deepStrictEqual(
+      flushes[0].map((m) => m.type),
+      ["system", "assistant", "assistant"],
+    );
+  });
+  test("iterator crash after a completed batch does not re-flush", async () => {
+    // batchSize = 2: the two text messages cause a regular flush that
+    // leaves nothing pending. When the iterator throws afterwards, the
+    // terminal flush must do nothing rather than call onBatch with no lines.
+    const runner = runnerOver(
+      failingAfter([textBlock("a"), textBlock("b")], "boom"),
+      { batchSize: 2 },
+    );
+    const sizes = [];
+    runner.onBatch = async (lines) => {
+      sizes.push(lines.length);
+    };
+    const result = await runner.run("Task");
+    assert.ok(result.error);
+    assert.match(result.error.message, /boom/);
+    assert.deepStrictEqual(sizes, [2]);
+  });
+  test("natural-end iterator without a result does not trigger terminal flush", async () => {
+    // The real SDK always finishes with `result`. A mock that simply runs
+    // out while lines are pending counts as an incomplete stub: a natural
+    // end gives no reason for another mid-turn review, so no flush fires.
+    const endsQuietly = async function* () {
+      // Neither a result nor an error follows these two messages.
+      yield* [textBlock("one"), textBlock("two")];
+    };
+    const runner = runnerOver(endsQuietly, { batchSize: 3 });
+    const sizes = [];
+    runner.onBatch = async (lines) => {
+      sizes.push(lines.length);
+    };
+    const result = await runner.run("Task");
+    assert.strictEqual(result.error, null);
+    assert.strictEqual(sizes.length, 0);
+  });
+  test("onBatch throw during terminal flush does not mask an earlier error", async () => {
+    // Two failures in sequence: the iterator throws, then the terminal
+    // flush throws as well. The caller must see the iterator's error,
+    // since that is the more actionable of the two.
+    const runner = runnerOver(
+      failingAfter([textBlock("partial")], "original failure"),
+      { batchSize: 3 },
+    );
+    runner.onBatch = async () => {
+      throw new Error("flush failure");
+    };
+    const result = await runner.run("Task");
+    assert.ok(result.error);
+    assert.match(result.error.message, /original failure/);
+  });
+});

package/test/mock-runner.js ADDED Viewed

@@ -0,0 +1,113 @@
+/**
+ * Test-only mock factory for AgentRunner. Yields pre-scripted responses,
+ * and (when an `onBatch` callback is set) fires it at the same boundaries
+ * the real AgentRunner would: every `runner.batchSize` assistant messages
+ * with a text block, and the terminal `result` message. Tool-only
+ * assistant messages accumulate into the pending batch without counting
+ * toward the threshold. If the callback calls `abort()`, the mock stops
+ * iterating that response's messages and reports `aborted: true` — any
+ * lines that never made it through a flush boundary then ship in a
+ * terminal batch, mirroring the real runner's finally-flush.
+ *
+ * Intentionally a regular module (not a test file), so any describe/test
+ * blocks placed here would not run. Lives under test/ to make its scope explicit.
+ */
+import { PassThrough } from "node:stream";
+import { AgentRunner } from "@forwardimpact/libeval";
+import { hasTextBlock } from "../src/agent-runner.js";
+/**
+ * Create a mock AgentRunner that replays pre-scripted responses. `run()`
+ * and `resume()` share one call counter: each call consumes the next
+ * entry of `responses` (and the entry of `messages` at the same index, if
+ * any). A call made after the script is exhausted rejects with an Error
+ * before any message is replayed.
+ * @param {object[]} responses - Array of {text, success} objects, one per
+ *   expected call; `success` defaults to true when omitted
+ * @param {Array<object[]|undefined>} [messages] - Message scripts indexed
+ *   like `responses`. A missing entry falls back to a single
+ *   `{ type: "assistant", content: resp.text }` message
+ * @returns {AgentRunner}
+ */
+export function createMockRunner(responses, messages) {
+  const output = new PassThrough();
+  let callIndex = 0;
+  const runner = new AgentRunner({
+    cwd: "/tmp",
+    query: async function* () {},
+    output,
+  });
+  // Replay one script through the runner's hooks (buffer, onLine, onBatch).
+  // Resolves to true when an abort handle passed to onBatch was invoked.
+  const consume = async (msgs) => {
+    let aborted = false;
+    const pendingBatch = [];
+    let assistantTextCount = 0;
+    // Drain everything pending into onBatch with a fresh abort handle.
+    const flush = async () => {
+      const batchLines = pendingBatch.splice(0);
+      await runner.onBatch(batchLines, {
+        abort: () => {
+          aborted = true;
+        },
+      });
+    };
+    for (const m of msgs) {
+      const line = JSON.stringify(m);
+      runner.buffer.push(line);
+      if (runner.onLine) runner.onLine(line);
+      // Lines are only queued while a batch consumer is attached.
+      if (runner.onBatch) pendingBatch.push(line);
+      // Only messages accepted by hasTextBlock advance the threshold
+      // counter; everything else just accumulates in the pending batch.
+      if (hasTextBlock(m)) {
+        assistantTextCount++;
+      }
+      const shouldFlush =
+        runner.onBatch &&
+        (m.type === "result" || assistantTextCount >= runner.batchSize);
+      if (shouldFlush) {
+        assistantTextCount = 0;
+        await flush();
+        if (aborted) break;
+      }
+    }
+    // Terminal flush: only after an abort, and only when lines are still
+    // pending. A natural end without a `result` message does not flush.
+    // NOTE(review): a flush whose callback aborts has just drained
+    // pendingBatch, so this branch appears to have work only when abort()
+    // is invoked later on a handle kept from an earlier flush — confirm
+    // that is the intended coverage.
+    if (aborted && runner.onBatch && pendingBatch.length > 0) {
+      await flush();
+    }
+    return aborted;
+  };
+  // Shared by run() and resume(): take the next scripted response, replay
+  // its messages, and report whether the replay was aborted.
+  const playNext = async () => {
+    const index = callIndex++;
+    const resp = responses[index];
+    if (resp == null) {
+      // Fail loudly instead of with a TypeError on `resp.text`.
+      throw new Error(
+        `createMockRunner: no scripted response for call ${index + 1} (${responses.length} provided)`,
+      );
+    }
+    const msgs = messages?.[index] ?? [
+      { type: "assistant", content: resp.text },
+    ];
+    const aborted = await consume(msgs);
+    return { resp, aborted };
+  };
+  runner.run = async (_task) => {
+    const { resp, aborted } = await playNext();
+    // The session id is assigned only after the script has been replayed.
+    runner.sessionId = "mock-session";
+    return {
+      success: resp.success ?? true,
+      text: resp.text,
+      sessionId: "mock-session",
+      aborted,
+      error: null,
+    };
+  };
+  runner.resume = async (_prompt) => {
+    const { resp, aborted } = await playNext();
+    return {
+      success: resp.success ?? true,
+      text: resp.text,
+      sessionId: runner.sessionId,
+      aborted,
+      error: null,
+    };
+  };
+  return runner;
+}

package/test/supervisor-batching.test.js ADDED Viewed

@@ -0,0 +1,175 @@
+import { describe, test } from "node:test";
+import assert from "node:assert";
+import { PassThrough } from "node:stream";
+import { Supervisor } from "@forwardimpact/libeval";
+import { createMockRunner } from "./mock-runner.js";
+// Assistant message whose content is a single text block holding `t`.
+const textBlock = (t) => {
+  const content = [{ type: "text", text: t }];
+  return { type: "assistant", message: { content } };
+};
+describe("Supervisor - batching at the default batchSize", () => {
+  // Build a Supervisor (maxTurns 10) over both runners and forward each
+  // runner's onLine to supervisor.emitLine.
+  const wireSupervisor = (agentRunner, supervisorRunner) => {
+    const output = new PassThrough();
+    const supervisor = new Supervisor({
+      agentRunner,
+      supervisorRunner,
+      output,
+      maxTurns: 10,
+    });
+    agentRunner.onLine = (line) => supervisor.emitLine(line);
+    supervisorRunner.onLine = (line) => supervisor.emitLine(line);
+    return { supervisor, output };
+  };
+  // Parse whatever is currently buffered on `output` as newline-delimited
+  // JSON and keep the entries whose source is "orchestrator".
+  const readOrchestratorEvents = (output) => {
+    const raw = output.read()?.toString() ?? "";
+    const events = [];
+    for (const l of raw.trim().split("\n")) {
+      if (l.length === 0) continue;
+      const entry = JSON.parse(l);
+      if (entry.source === "orchestrator") events.push(entry);
+    }
+    return events;
+  };
+  test("mid-turn review fires once per 3 agent text messages", async () => {
+    // One agent turn with 7 text-block assistant messages and a result.
+    // At the default batchSize of 3 the mid-turn review should run after
+    // messages 3 and 6, and once more when the terminal result flushes
+    // the leftover message — three reviews, not one per message.
+    const steps = Array.from({ length: 7 }, (_, i) =>
+      textBlock(`step ${i + 1}`),
+    );
+    const agentRunner = createMockRunner(
+      [{ text: "Finished." }],
+      [[...steps, { type: "result", subtype: "success", result: "Done." }]],
+    );
+    // batchSize is deliberately left untouched: the default (3) is the
+    // behaviour under test, end-to-end through the supervisor loop.
+    assert.strictEqual(agentRunner.batchSize, 3);
+    const supervisorRunner = createMockRunner([
+      { text: "Welcome. Begin." },
+      { text: "Keep going." }, // mid-turn batch 1 (messages 1-3)
+      { text: "Keep going." }, // mid-turn batch 2 (messages 4-6)
+      { text: "Keep going." }, // terminal result flush (message 7 + result)
+      { text: "Good work.\n\nEVALUATION_COMPLETE" }, // end-of-turn review
+    ]);
+    const { supervisor, output } = wireSupervisor(
+      agentRunner,
+      supervisorRunner,
+    );
+    const result = await supervisor.run("Do the task");
+    assert.strictEqual(result.success, true);
+    const midTurnReviews = readOrchestratorEvents(output).filter(
+      (e) => e.event?.type === "mid_turn_review",
+    );
+    // Expect 3 flushes: two at the threshold (messages 3 and 6) and one at
+    // the terminal result (trailing message + result marker).
+    assert.strictEqual(
+      midTurnReviews.length,
+      3,
+      "Supervisor should review 3 times per turn, not 7",
+    );
+  });
+  test("EVALUATION_INTERVENTION at the default batchSize still aborts and relays", async () => {
+    // Counterpart of the test above, combining 3-message batching with the
+    // intervention path.
+    //
+    // Agent call 1 emits 3 text-block messages, so a flush fires at the
+    // third. The supervisor intervenes, the agent run aborts, and the
+    // intervention text is passed on to resume(). Agent call 2 emits a
+    // single text block — under the threshold — so no further mid-turn
+    // flush happens and the supervisor goes straight to the end-of-turn
+    // review.
+    const agentRunner = createMockRunner(
+      [{ text: "wrong path" }, { text: "corrected" }],
+      [
+        [
+          textBlock("reading docs"),
+          textBlock("running Bash"),
+          textBlock("found the wrong path"),
+        ],
+        [textBlock("corrected, using the documented path")],
+      ],
+    );
+    assert.strictEqual(agentRunner.batchSize, 3);
+    const intervention = "EVALUATION_INTERVENTION Use the documented path.";
+    // Only the second supervisor call gets an explicit message script; the
+    // other two use the mock's default single message.
+    const supervisorRunner = createMockRunner(
+      [
+        { text: "Welcome. Begin." },
+        { text: intervention },
+        { text: "Good.\n\nEVALUATION_COMPLETE" },
+      ],
+      [
+        undefined,
+        [
+          {
+            type: "assistant",
+            message: { content: [{ type: "text", text: intervention }] },
+          },
+        ],
+        undefined,
+      ],
+    );
+    const { supervisor, output } = wireSupervisor(
+      agentRunner,
+      supervisorRunner,
+    );
+    // Spy on resume() to capture the prompt the supervisor relays.
+    let resumePrompt = null;
+    const origResume = agentRunner.resume;
+    agentRunner.resume = async (prompt) => {
+      resumePrompt = prompt;
+      return origResume.call(agentRunner, prompt);
+    };
+    const result = await supervisor.run("Install");
+    assert.strictEqual(result.success, true);
+    assert.strictEqual(result.turns, 1);
+    assert.ok(
+      resumePrompt?.includes("documented path"),
+      "Resume prompt should carry the supervisor's intervention text",
+    );
+    const eventTypes = new Set(
+      readOrchestratorEvents(output).map((e) => e.event?.type),
+    );
+    assert.ok(
+      eventTypes.has("intervention_requested"),
+      "Trace should contain intervention_requested",
+    );
+    assert.ok(
+      eventTypes.has("intervention_relayed"),
+      "Trace should contain intervention_relayed",
+    );
+  });
+});