npm - @forwardimpact/libeval - Versions diffs - 0.1.11 → 0.1.13 - Mend

@forwardimpact/libeval 0.1.11 → 0.1.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14)

package/package.json +11 -2
package/src/index.js +11 -0
package/src/supervisor.js +2 -2
package/index.js +0 -11
package/test/agent-runner-batching.test.js +0 -271
package/test/agent-runner.test.js +0 -317
package/test/fixtures/stream.ndjson +0 -7
package/test/mock-runner.js +0 -113
package/test/supervisor-batching.test.js +0 -175
package/test/supervisor-intervention.test.js +0 -365
package/test/supervisor-output.test.js +0 -369
package/test/supervisor-run.test.js +0 -310
package/test/tee-writer.test.js +0 -324
package/test/trace-collector.test.js +0 -424

package/test/mock-runner.js DELETED

@@ -1,113 +0,0 @@
-/**
- * Test-only mock factory for AgentRunner. Yields pre-scripted responses,
- * and (when an `onBatch` callback is set) fires it at the same boundaries
- * the real AgentRunner would: every `runner.batchSize` assistant messages
- * with a text block, and the terminal `result` message. Tool-only
- * assistant messages accumulate into the pending batch without counting
- * toward the threshold. If the callback calls `abort()`, the mock stops
- * iterating that response's messages and reports `aborted: true` — any
- * lines that never made it through a flush boundary then ship in a
- * terminal batch, mirroring the real runner's finally-flush.
- *
- * Intentionally a regular module (not a test file) so describe/test blocks
- * here would not run. Lives under test/ to make its scope explicit.
- */
-import { PassThrough } from "node:stream";
-import { AgentRunner } from "@forwardimpact/libeval";
-import { hasTextBlock } from "../src/agent-runner.js";
-/**
- * Create a mock AgentRunner that yields pre-scripted responses. Each call
- * to `run()` or `resume()` pops the next response from the array.
- * @param {object[]} responses - Array of {text, success} objects
- * @param {object[]} [messages] - Messages to buffer per response
- * @returns {AgentRunner}
- */
-export function createMockRunner(responses, messages) {
-  const output = new PassThrough();
-  let callIndex = 0;
-  const runner = new AgentRunner({
-    cwd: "/tmp",
-    query: async function* () {},
-    output,
-  });
-  const consume = async (msgs) => {
-    let aborted = false;
-    const pendingBatch = [];
-    let assistantTextCount = 0;
-    for (const m of msgs) {
-      const line = JSON.stringify(m);
-      runner.buffer.push(line);
-      if (runner.onLine) runner.onLine(line);
-      if (runner.onBatch) pendingBatch.push(line);
-      if (hasTextBlock(m)) {
-        assistantTextCount++;
-      }
-      const shouldFlush =
-        runner.onBatch &&
-        (m.type === "result" || assistantTextCount >= runner.batchSize);
-      if (shouldFlush) {
-        assistantTextCount = 0;
-        const batchLines = pendingBatch.splice(0);
-        await runner.onBatch(batchLines, {
-          abort: () => {
-            aborted = true;
-          },
-        });
-        if (aborted) break;
-      }
-    }
-    // Terminal flush: mirror the real AgentRunner's abnormal-end path —
-    // an aborted scripted run delivers any pending tail so the supervisor
-    // sees the partial state. Natural-end without a `result` marker is
-    // treated as a simplified stub (no phantom flush), matching the real
-    // runner's rule that terminal flush only fires on error/abort.
-    if (aborted && runner.onBatch && pendingBatch.length > 0) {
-      const batchLines = pendingBatch.splice(0);
-      await runner.onBatch(batchLines, {
-        abort: () => {
-          aborted = true;
-        },
-      });
-    }
-    return aborted;
-  };
-  runner.run = async (_task) => {
-    const resp = responses[callIndex++];
-    const msgs = messages?.[callIndex - 1] ?? [
-      { type: "assistant", content: resp.text },
-    ];
-    const aborted = await consume(msgs);
-    runner.sessionId = "mock-session";
-    return {
-      success: resp.success ?? true,
-      text: resp.text,
-      sessionId: "mock-session",
-      aborted,
-      error: null,
-    };
-  };
-  runner.resume = async (_prompt) => {
-    const resp = responses[callIndex++];
-    const msgs = messages?.[callIndex - 1] ?? [
-      { type: "assistant", content: resp.text },
-    ];
-    const aborted = await consume(msgs);
-    return {
-      success: resp.success ?? true,
-      text: resp.text,
-      sessionId: runner.sessionId,
-      aborted,
-      error: null,
-    };
-  };
-  return runner;
-}

package/test/supervisor-batching.test.js DELETED

@@ -1,175 +0,0 @@
-import { describe, test } from "node:test";
-import assert from "node:assert";
-import { PassThrough } from "node:stream";
-import { Supervisor } from "@forwardimpact/libeval";
-import { createMockRunner } from "./mock-runner.js";
-const textBlock = (t) => ({
-  type: "assistant",
-  message: { content: [{ type: "text", text: t }] },
-});
-describe("Supervisor - batching at the default batchSize", () => {
-  test("mid-turn review fires once per 3 agent text messages", async () => {
-    // Agent emits 7 text-block assistant messages in one turn. With the
-    // default batchSize of 3 the supervisor's mid-turn review should fire
-    // twice (after messages 3 and 6) plus once more from the terminal
-    // result flush carrying the remaining message — not seven times, as
-    // the old per-message flushing would have done.
-    const agentMessages = [
-      [
-        textBlock("step 1"),
-        textBlock("step 2"),
-        textBlock("step 3"),
-        textBlock("step 4"),
-        textBlock("step 5"),
-        textBlock("step 6"),
-        textBlock("step 7"),
-        { type: "result", subtype: "success", result: "Done." },
-      ],
-    ];
-    const agentRunner = createMockRunner(
-      [{ text: "Finished." }],
-      agentMessages,
-    );
-    // Leave batchSize at the default (3) — this is the behaviour we're
-    // verifying end-to-end through the supervisor loop.
-    assert.strictEqual(agentRunner.batchSize, 3);
-    const supervisorRunner = createMockRunner([
-      { text: "Welcome. Begin." },
-      { text: "Keep going." }, // mid-turn batch 1 (messages 1-3)
-      { text: "Keep going." }, // mid-turn batch 2 (messages 4-6)
-      { text: "Keep going." }, // terminal result flush (message 7 + result)
-      { text: "Good work.\n\nEVALUATION_COMPLETE" }, // end-of-turn review
-    ]);
-    const output = new PassThrough();
-    const supervisor = new Supervisor({
-      agentRunner,
-      supervisorRunner,
-      output,
-      maxTurns: 10,
-    });
-    agentRunner.onLine = (line) => supervisor.emitLine(line);
-    supervisorRunner.onLine = (line) => supervisor.emitLine(line);
-    const result = await supervisor.run("Do the task");
-    assert.strictEqual(result.success, true);
-    const midTurnReviews = (output.read()?.toString() ?? "")
-      .trim()
-      .split("\n")
-      .filter((l) => l.length > 0)
-      .map((l) => JSON.parse(l))
-      .filter(
-        (l) =>
-          l.source === "orchestrator" && l.event?.type === "mid_turn_review",
-      );
-    // 3 flushes total: two at the batchSize threshold (messages 3 and 6),
-    // one at the terminal result (trailing message + result marker).
-    assert.strictEqual(
-      midTurnReviews.length,
-      3,
-      "Supervisor should review 3 times per turn, not 7",
-    );
-  });
-  test("EVALUATION_INTERVENTION at the default batchSize still aborts and relays", async () => {
-    // Companion to the observation test above: the 3-message batching and
-    // the intervention path exercised together.
-    //
-    // Agent call 1 emits 3 text-block messages (triggering a flush at the
-    // 3rd). The supervisor intervenes; the agent SDK session aborts and
-    // the supervisor's intervention text is relayed into resume(). Agent
-    // call 2 has 1 text block — below the batchSize threshold — so no
-    // extra mid-turn flush fires, and the supervisor jumps straight to
-    // the end-of-turn review.
-    const agentMessages = [
-      [
-        textBlock("reading docs"),
-        textBlock("running Bash"),
-        textBlock("found the wrong path"),
-      ],
-      [textBlock("corrected, using the documented path")],
-    ];
-    const agentRunner = createMockRunner(
-      [{ text: "wrong path" }, { text: "corrected" }],
-      agentMessages,
-    );
-    assert.strictEqual(agentRunner.batchSize, 3);
-    const supervisorMessages = [
-      undefined,
-      [
-        {
-          type: "assistant",
-          message: {
-            content: [
-              {
-                type: "text",
-                text: "EVALUATION_INTERVENTION Use the documented path.",
-              },
-            ],
-          },
-        },
-      ],
-      undefined,
-    ];
-    const supervisorRunner = createMockRunner(
-      [
-        { text: "Welcome. Begin." },
-        { text: "EVALUATION_INTERVENTION Use the documented path." },
-        { text: "Good.\n\nEVALUATION_COMPLETE" },
-      ],
-      supervisorMessages,
-    );
-    const output = new PassThrough();
-    const supervisor = new Supervisor({
-      agentRunner,
-      supervisorRunner,
-      output,
-      maxTurns: 10,
-    });
-    agentRunner.onLine = (line) => supervisor.emitLine(line);
-    supervisorRunner.onLine = (line) => supervisor.emitLine(line);
-    let resumePrompt = null;
-    const origResume = agentRunner.resume;
-    agentRunner.resume = async (prompt) => {
-      resumePrompt = prompt;
-      return origResume.call(agentRunner, prompt);
-    };
-    const result = await supervisor.run("Install");
-    assert.strictEqual(result.success, true);
-    assert.strictEqual(result.turns, 1);
-    assert.ok(
-      resumePrompt && resumePrompt.includes("documented path"),
-      "Resume prompt should carry the supervisor's intervention text",
-    );
-    const orchestratorEvents = (output.read()?.toString() ?? "")
-      .trim()
-      .split("\n")
-      .filter((l) => l.length > 0)
-      .map((l) => JSON.parse(l))
-      .filter((e) => e.source === "orchestrator");
-    assert.ok(
-      orchestratorEvents.some(
-        (e) => e.event?.type === "intervention_requested",
-      ),
-      "Trace should contain intervention_requested",
-    );
-    assert.ok(
-      orchestratorEvents.some((e) => e.event?.type === "intervention_relayed"),
-      "Trace should contain intervention_relayed",
-    );
-  });
-});

package/test/supervisor-intervention.test.js DELETED

@@ -1,365 +0,0 @@
-import { describe, test } from "node:test";
-import assert from "node:assert";
-import { PassThrough } from "node:stream";
-import { Supervisor } from "@forwardimpact/libeval";
-import { isIntervention } from "../src/supervisor.js";
-import { createMockRunner } from "./mock-runner.js";
-describe("isIntervention", () => {
-  test("detects EVALUATION_INTERVENTION on its own line", () => {
-    assert.strictEqual(isIntervention("EVALUATION_INTERVENTION"), true);
-    assert.strictEqual(
-      isIntervention("Some text\nEVALUATION_INTERVENTION\nMore text"),
-      true,
-    );
-    assert.strictEqual(
-      isIntervention("Stop.\n\nEVALUATION_INTERVENTION"),
-      true,
-    );
-  });
-  test("tolerates markdown formatting around the signal", () => {
-    assert.strictEqual(isIntervention("**EVALUATION_INTERVENTION**"), true);
-    assert.strictEqual(isIntervention("*EVALUATION_INTERVENTION*"), true);
-    assert.strictEqual(isIntervention("__EVALUATION_INTERVENTION__"), true);
-    assert.strictEqual(isIntervention("_EVALUATION_INTERVENTION_"), true);
-    assert.strictEqual(isIntervention("`EVALUATION_INTERVENTION`"), true);
-    assert.strictEqual(
-      isIntervention(
-        "Wrong path.\n\n**EVALUATION_INTERVENTION**\n\nTry the documented one.",
-      ),
-      true,
-    );
-  });
-  test("matches EVALUATION_INTERVENTION inline", () => {
-    assert.strictEqual(
-      isIntervention("Stopping you with EVALUATION_INTERVENTION now."),
-      true,
-    );
-    assert.strictEqual(
-      isIntervention("Note: EVALUATION_INTERVENTION. Switch to Y."),
-      true,
-    );
-  });
-  test("does not match empty or unrelated text", () => {
-    assert.strictEqual(isIntervention(""), false);
-    assert.strictEqual(isIntervention("Stop and think."), false);
-    assert.strictEqual(isIntervention("INTERVENTION"), false);
-  });
-  test("does not match EVALUATION_COMPLETE alone", () => {
-    assert.strictEqual(isIntervention("EVALUATION_COMPLETE"), false);
-    assert.strictEqual(
-      isIntervention("Good work.\n\nEVALUATION_COMPLETE"),
-      false,
-    );
-  });
-});
-describe("Supervisor - mid-turn intervention", () => {
-  test("observation without intervention does not interrupt the agent", async () => {
-    // Agent emits one structured assistant text block — fires onBatch once.
-    // Supervisor responds with "Keep going." — neither signal flag is set,
-    // so the agent's SDK session completes naturally and the end-of-turn
-    // review then emits EVALUATION_COMPLETE.
-    //
-    // batchSize = 1 keeps this test focused on intervention semantics, not
-    // on the coarser default batching (3) exercised by agent-runner.test.js.
-    const agentMessages = [
-      [
-        {
-          type: "assistant",
-          message: {
-            content: [{ type: "text", text: "I'm working on it." }],
-          },
-        },
-      ],
-    ];
-    const agentRunner = createMockRunner(
-      [{ text: "I'm working on it." }],
-      agentMessages,
-    );
-    agentRunner.batchSize = 1;
-    const supervisorRunner = createMockRunner([
-      { text: "Welcome! Please install." },
-      { text: "Keep going." },
-      { text: "Good work.\n\nEVALUATION_COMPLETE" },
-    ]);
-    const output = new PassThrough();
-    const supervisor = new Supervisor({
-      agentRunner,
-      supervisorRunner,
-      output,
-      maxTurns: 10,
-    });
-    agentRunner.onLine = (line) => supervisor.emitLine(line);
-    supervisorRunner.onLine = (line) => supervisor.emitLine(line);
-    let agentResumeCalls = 0;
-    const origAgentResume = agentRunner.resume;
-    agentRunner.resume = async (prompt) => {
-      agentResumeCalls++;
-      return origAgentResume.call(agentRunner, prompt);
-    };
-    const result = await supervisor.run("Install");
-    assert.strictEqual(result.success, true);
-    assert.strictEqual(result.turns, 1);
-    assert.strictEqual(
-      agentResumeCalls,
-      0,
-      "Agent should not be resumed when supervisor never intervenes",
-    );
-    // Trace must contain a mid_turn_review marker but no intervention markers.
-    const data = output.read()?.toString() ?? "";
-    const orchestratorEvents = data
-      .trim()
-      .split("\n")
-      .filter((l) => l.length > 0)
-      .map((l) => JSON.parse(l))
-      .filter((e) => e.source === "orchestrator");
-    assert.ok(
-      orchestratorEvents.some((e) => e.event?.type === "mid_turn_review"),
-      "Trace should contain mid_turn_review when onBatch fires",
-    );
-    assert.ok(
-      !orchestratorEvents.some(
-        (e) => e.event?.type === "intervention_requested",
-      ),
-      "Trace should not contain intervention_requested when supervisor only observes",
-    );
-  });
-  test("EVALUATION_INTERVENTION from mid-turn batch interrupts and relays", async () => {
-    // Agent's first call fires onBatch on a structured assistant text block;
-    // supervisor responds with EVALUATION_INTERVENTION → abort + relay.
-    // Agent's second call (resume) finishes naturally; end-of-turn review
-    // then emits EVALUATION_COMPLETE.
-    const agentMessages = [
-      [
-        {
-          type: "assistant",
-          message: {
-            content: [{ type: "text", text: "I'll try the wrong path." }],
-          },
-        },
-      ],
-      [
-        {
-          type: "assistant",
-          message: {
-            content: [
-              { type: "text", text: "OK, switching to the documented path." },
-            ],
-          },
-        },
-      ],
-    ];
-    const agentRunner = createMockRunner(
-      [
-        { text: "I'll try the wrong path." },
-        { text: "OK, switching to the documented path." },
-      ],
-      agentMessages,
-    );
-    agentRunner.batchSize = 1;
-    // Supervisor responses (in order):
-    //   0: turn 0 introduction
-    //   1: mid-turn 1 batch 1 — intervene
-    //   2: mid-turn 1 batch 1 (post-resume) — keep going
-    //   3: end-of-turn 1 — EVALUATION_COMPLETE
-    const supervisorMessages = [
-      undefined,
-      [
-        {
-          type: "assistant",
-          message: {
-            content: [
-              {
-                type: "text",
-                text: "EVALUATION_INTERVENTION Stop and use the documented path.",
-              },
-            ],
-          },
-        },
-      ],
-      undefined,
-      undefined,
-    ];
-    const supervisorRunner = createMockRunner(
-      [
-        { text: "Welcome." },
-        { text: "EVALUATION_INTERVENTION Stop and use the documented path." },
-        { text: "Keep going." },
-        { text: "Good.\n\nEVALUATION_COMPLETE" },
-      ],
-      supervisorMessages,
-    );
-    const output = new PassThrough();
-    const supervisor = new Supervisor({
-      agentRunner,
-      supervisorRunner,
-      output,
-      maxTurns: 10,
-    });
-    agentRunner.onLine = (line) => supervisor.emitLine(line);
-    supervisorRunner.onLine = (line) => supervisor.emitLine(line);
-    let agentResumeCalls = 0;
-    let firstResumePrompt = null;
-    const origAgentResume = agentRunner.resume;
-    agentRunner.resume = async (prompt) => {
-      agentResumeCalls++;
-      if (agentResumeCalls === 1) firstResumePrompt = prompt;
-      return origAgentResume.call(agentRunner, prompt);
-    };
-    const result = await supervisor.run("Install");
-    assert.strictEqual(result.success, true);
-    assert.strictEqual(result.turns, 1);
-    assert.strictEqual(
-      agentResumeCalls,
-      1,
-      "Agent should be resumed exactly once after intervention",
-    );
-    assert.ok(
-      firstResumePrompt && firstResumePrompt.includes("documented path"),
-      "Resume prompt should carry the supervisor's intervention text",
-    );
-    const orchestratorEvents = (output.read()?.toString() ?? "")
-      .trim()
-      .split("\n")
-      .filter((l) => l.length > 0)
-      .map((l) => JSON.parse(l))
-      .filter((e) => e.source === "orchestrator");
-    assert.ok(
-      orchestratorEvents.some(
-        (e) => e.event?.type === "intervention_requested",
-      ),
-      "Trace should contain intervention_requested orchestrator event",
-    );
-    assert.ok(
-      orchestratorEvents.some((e) => e.event?.type === "intervention_relayed"),
-      "Trace should contain intervention_relayed orchestrator event",
-    );
-  });
-  test("EVALUATION_INTERVENTION and EVALUATION_COMPLETE in the same turn", async () => {
-    // Batch 1: supervisor intervenes (abort + relay).
-    // After resume, batch 1 of resume: supervisor writes EVALUATION_COMPLETE
-    // (mid-turn) — the loop must exit success without running an end-of-turn
-    // review.
-    const agentMessages = [
-      [
-        {
-          type: "assistant",
-          message: { content: [{ type: "text", text: "Trying X." }] },
-        },
-      ],
-      [
-        {
-          type: "assistant",
-          message: { content: [{ type: "text", text: "OK trying Y." }] },
-        },
-      ],
-    ];
-    const agentRunner = createMockRunner(
-      [{ text: "Trying X." }, { text: "Trying Y." }],
-      agentMessages,
-    );
-    agentRunner.batchSize = 1;
-    const supervisorMessages = [
-      undefined,
-      [
-        {
-          type: "assistant",
-          message: {
-            content: [
-              {
-                type: "text",
-                text: "EVALUATION_INTERVENTION Try Y instead.",
-              },
-            ],
-          },
-        },
-      ],
-      [
-        {
-          type: "assistant",
-          message: {
-            content: [{ type: "text", text: "Excellent. EVALUATION_COMPLETE" }],
-          },
-        },
-      ],
-    ];
-    const supervisorRunner = createMockRunner(
-      [
-        { text: "Welcome." },
-        { text: "EVALUATION_INTERVENTION Try Y instead." },
-        { text: "Excellent. EVALUATION_COMPLETE" },
-      ],
-      supervisorMessages,
-    );
-    const output = new PassThrough();
-    const supervisor = new Supervisor({
-      agentRunner,
-      supervisorRunner,
-      output,
-      maxTurns: 10,
-    });
-    agentRunner.onLine = (line) => supervisor.emitLine(line);
-    supervisorRunner.onLine = (line) => supervisor.emitLine(line);
-    let agentResumeCalls = 0;
-    const origAgentResume = agentRunner.resume;
-    agentRunner.resume = async (prompt) => {
-      agentResumeCalls++;
-      return origAgentResume.call(agentRunner, prompt);
-    };
-    const result = await supervisor.run("Install");
-    assert.strictEqual(result.success, true);
-    assert.strictEqual(result.turns, 1);
-    assert.strictEqual(
-      agentResumeCalls,
-      1,
-      "Agent.resume runs once (after intervention); EVALUATION_COMPLETE then ends the turn",
-    );
-    const orchestratorEvents = (output.read()?.toString() ?? "")
-      .trim()
-      .split("\n")
-      .filter((l) => l.length > 0)
-      .map((l) => JSON.parse(l))
-      .filter((e) => e.source === "orchestrator");
-    assert.ok(
-      orchestratorEvents.some(
-        (e) => e.event?.type === "intervention_requested",
-      ),
-      "Trace should contain intervention_requested",
-    );
-    assert.ok(
-      orchestratorEvents.some((e) => e.event?.type === "complete_requested"),
-      "Trace should contain complete_requested for mid-turn EVALUATION_COMPLETE",
-    );
-  });
-});