npm - @forwardimpact/libeval - Versions diffs - 0.1.0 → 0.1.2 - Mend

@forwardimpact/libeval 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

package/bin/fit-eval.js +26 -1
package/index.js +3 -0
package/package.json +6 -3
package/src/agent-runner.js +154 -0
package/src/commands/run.js +76 -0
package/src/commands/supervise.js +86 -0
package/src/commands/tee.js +13 -75
package/src/supervisor.js +186 -0
package/src/tee-writer.js +157 -0
package/test/agent-runner.test.js +317 -0
package/test/supervisor.test.js +342 -0
package/test/tee-writer.test.js +326 -0

package/test/supervisor.test.js ADDED Viewed

@@ -0,0 +1,342 @@
+import { describe, test } from "node:test";
+import assert from "node:assert";
+import { PassThrough } from "node:stream";
+import {
+  AgentRunner,
+  Supervisor,
+  createSupervisor,
+} from "@forwardimpact/libeval";
+import { isDone } from "../src/supervisor.js";
+/**
+ * Create a mock AgentRunner that yields pre-scripted responses.
+ * Each call to run() or resume() pops the next response from the array.
+ * @param {object[]} responses - Array of {text, success} objects
+ * @param {object[]} [messages] - Messages to buffer per turn
+ * @returns {AgentRunner}
+ */
+function createMockRunner(responses, messages) {
+  const output = new PassThrough();
+  let callIndex = 0;
+  const runner = new AgentRunner({
+    cwd: "/tmp",
+    query: async function* () {},
+    output,
+  });
+  // Override run and resume to return scripted responses
+  runner.run = async (_task) => {
+    const resp = responses[callIndex++];
+    const msgs = messages?.[callIndex - 1] ?? [
+      { type: "assistant", content: resp.text },
+    ];
+    for (const m of msgs) {
+      const line = JSON.stringify(m);
+      runner.buffer.push(line);
+      if (runner.onLine) runner.onLine(line);
+    }
+    runner.sessionId = "mock-session";
+    return {
+      success: resp.success ?? true,
+      text: resp.text,
+      sessionId: "mock-session",
+    };
+  };
+  runner.resume = async (_prompt) => {
+    const resp = responses[callIndex++];
+    const msgs = messages?.[callIndex - 1] ?? [
+      { type: "assistant", content: resp.text },
+    ];
+    for (const m of msgs) {
+      const line = JSON.stringify(m);
+      runner.buffer.push(line);
+      if (runner.onLine) runner.onLine(line);
+    }
+    return { success: resp.success ?? true, text: resp.text };
+  };
+  return runner;
+}
+describe("isDone", () => {
+  test("detects EVALUATION_COMPLETE on its own line", () => {
+    assert.strictEqual(isDone("EVALUATION_COMPLETE"), true);
+    assert.strictEqual(
+      isDone("Some text\nEVALUATION_COMPLETE\nMore text"),
+      true,
+    );
+    assert.strictEqual(isDone("Done.\n\nEVALUATION_COMPLETE"), true);
+  });
+  test("does not match EVALUATION_COMPLETE embedded in text", () => {
+    assert.strictEqual(isDone("not EVALUATION_COMPLETE yet"), false);
+    assert.strictEqual(isDone("The agent is EVALUATION_COMPLETE done"), false);
+    assert.strictEqual(isDone("EVALUATION_COMPLETE_EXTRA"), false);
+  });
+  test("does not match empty or unrelated text", () => {
+    assert.strictEqual(isDone(""), false);
+    assert.strictEqual(isDone("All done!"), false);
+    assert.strictEqual(isDone("DONE"), false);
+  });
+});
+describe("Supervisor", () => {
+  test("constructor throws on missing agentRunner", () => {
+    assert.throws(
+      () =>
+        new Supervisor({
+          supervisorRunner: createMockRunner([]),
+          output: new PassThrough(),
+        }),
+      /agentRunner is required/,
+    );
+  });
+  test("constructor throws on missing supervisorRunner", () => {
+    assert.throws(
+      () =>
+        new Supervisor({
+          agentRunner: createMockRunner([]),
+          output: new PassThrough(),
+        }),
+      /supervisorRunner is required/,
+    );
+  });
+  test("constructor throws on missing output", () => {
+    assert.throws(
+      () =>
+        new Supervisor({
+          agentRunner: createMockRunner([]),
+          supervisorRunner: createMockRunner([]),
+        }),
+      /output is required/,
+    );
+  });
+  test("completes on EVALUATION_COMPLETE from supervisor", async () => {
+    const agentRunner = createMockRunner([
+      { text: "I installed the packages." },
+    ]);
+    const supervisorRunner = createMockRunner([
+      { text: "Good work.\n\nEVALUATION_COMPLETE" },
+    ]);
+    const output = new PassThrough();
+    const supervisor = new Supervisor({
+      agentRunner,
+      supervisorRunner,
+      output,
+      maxTurns: 10,
+    });
+    const result = await supervisor.run("Install stuff");
+    assert.strictEqual(result.success, true);
+    assert.strictEqual(result.turns, 1);
+  });
+  test("runs multiple turns before completion", async () => {
+    const agentRunner = createMockRunner([
+      { text: "Started working." },
+      { text: "Made progress." },
+      { text: "Finished everything." },
+    ]);
+    const supervisorRunner = createMockRunner([
+      { text: "Keep going, you need to do more." },
+      { text: "Almost there, continue." },
+      { text: "EVALUATION_COMPLETE" },
+    ]);
+    const output = new PassThrough();
+    const supervisor = new Supervisor({
+      agentRunner,
+      supervisorRunner,
+      output,
+      maxTurns: 10,
+    });
+    const result = await supervisor.run("Do the work");
+    assert.strictEqual(result.success, true);
+    assert.strictEqual(result.turns, 3);
+  });
+  test("enforces maxTurns limit", async () => {
+    // Agent responds to every turn, supervisor never says done
+    const agentRunner = createMockRunner([
+      { text: "Turn 0" },
+      { text: "Turn 1" },
+      { text: "Turn 2" },
+    ]);
+    const supervisorRunner = createMockRunner([
+      { text: "Continue." },
+      { text: "Continue." },
+    ]);
+    const output = new PassThrough();
+    const supervisor = new Supervisor({
+      agentRunner,
+      supervisorRunner,
+      output,
+      maxTurns: 2,
+    });
+    const result = await supervisor.run("Endless task");
+    assert.strictEqual(result.success, false);
+    assert.strictEqual(result.turns, 2);
+  });
+  test("output contains tagged lines with correct source and turn", async () => {
+    const agentMessages = [[{ type: "assistant", content: "Working" }]];
+    const supervisorMessages = [
+      [{ type: "assistant", content: "EVALUATION_COMPLETE" }],
+    ];
+    const agentRunner = createMockRunner([{ text: "Working" }], agentMessages);
+    const supervisorRunner = createMockRunner(
+      [{ text: "EVALUATION_COMPLETE" }],
+      supervisorMessages,
+    );
+    const output = new PassThrough();
+    const supervisor = new Supervisor({
+      agentRunner,
+      supervisorRunner,
+      output,
+      maxTurns: 10,
+    });
+    agentRunner.onLine = (line) => supervisor.emitLine(line);
+    supervisorRunner.onLine = (line) => supervisor.emitLine(line);
+    await supervisor.run("Task");
+    const data = output.read()?.toString() ?? "";
+    const lines = data
+      .trim()
+      .split("\n")
+      .filter((l) => l.length > 0);
+    // Should have: agent turn 0, supervisor turn 1, orchestrator summary
+    assert.ok(lines.length >= 3);
+    const agentLine = JSON.parse(lines[0]);
+    assert.strictEqual(agentLine.source, "agent");
+    assert.strictEqual(agentLine.turn, 0);
+    assert.ok("event" in agentLine);
+    const supervisorLine = JSON.parse(lines[1]);
+    assert.strictEqual(supervisorLine.source, "supervisor");
+    assert.strictEqual(supervisorLine.turn, 1);
+    assert.ok("event" in supervisorLine);
+    const summaryLine = JSON.parse(lines[lines.length - 1]);
+    assert.strictEqual(summaryLine.source, "orchestrator");
+    assert.strictEqual(summaryLine.type, "summary");
+    assert.strictEqual(summaryLine.success, true);
+  });
+  test("events are nested under event key (no field collisions)", async () => {
+    const sourceEvent = {
+      type: "assistant",
+      source: "sdk-internal",
+      content: "test",
+    };
+    const agentRunner = createMockRunner([{ text: "Done" }], [[sourceEvent]]);
+    const supervisorRunner = createMockRunner(
+      [{ text: "EVALUATION_COMPLETE" }],
+      [[{ type: "assistant", content: "ok" }]],
+    );
+    const output = new PassThrough();
+    const supervisor = new Supervisor({
+      agentRunner,
+      supervisorRunner,
+      output,
+      maxTurns: 10,
+    });
+    agentRunner.onLine = (line) => supervisor.emitLine(line);
+    supervisorRunner.onLine = (line) => supervisor.emitLine(line);
+    await supervisor.run("Task");
+    const data = output.read()?.toString() ?? "";
+    const lines = data
+      .trim()
+      .split("\n")
+      .filter((l) => l.length > 0);
+    const tagged = JSON.parse(lines[0]);
+    // The original event's `source` field is preserved inside `event`
+    assert.strictEqual(tagged.source, "agent");
+    assert.strictEqual(tagged.event.source, "sdk-internal");
+  });
+  test("emits agent output and summary when agent errors on turn 0", async () => {
+    const agentMessages = [[{ type: "assistant", content: "Partial work" }]];
+    const agentRunner = createMockRunner(
+      [{ text: "Partial work", success: false }],
+      agentMessages,
+    );
+    // Override run to simulate an error return
+    const origRun = agentRunner.run;
+    agentRunner.run = async (task) => {
+      const result = await origRun.call(agentRunner, task);
+      return { ...result, error: new Error("Process exited with code 1") };
+    };
+    const supervisorRunner = createMockRunner([]);
+    const output = new PassThrough();
+    const supervisor = new Supervisor({
+      agentRunner,
+      supervisorRunner,
+      output,
+      maxTurns: 10,
+    });
+    agentRunner.onLine = (line) => supervisor.emitLine(line);
+    supervisorRunner.onLine = (line) => supervisor.emitLine(line);
+    const result = await supervisor.run("Task");
+    assert.strictEqual(result.success, false);
+    assert.strictEqual(result.turns, 0);
+    // Output should still contain the agent's buffered lines + summary
+    const data = output.read()?.toString() ?? "";
+    const lines = data
+      .trim()
+      .split("\n")
+      .filter((l) => l.length > 0);
+    assert.ok(lines.length >= 2, "Expected at least agent line + summary");
+    const agentLine = JSON.parse(lines[0]);
+    assert.strictEqual(agentLine.source, "agent");
+    assert.strictEqual(agentLine.turn, 0);
+    const summaryLine = JSON.parse(lines[lines.length - 1]);
+    assert.strictEqual(summaryLine.source, "orchestrator");
+    assert.strictEqual(summaryLine.success, false);
+    assert.strictEqual(summaryLine.turns, 0);
+  });
+  test("createSupervisor factory returns a Supervisor instance", () => {
+    const supervisor = createSupervisor({
+      supervisorCwd: "/tmp/sup",
+      agentCwd: "/tmp/agent",
+      query: async function* () {},
+      output: new PassThrough(),
+    });
+    assert.ok(supervisor instanceof Supervisor);
+  });
+});

package/test/tee-writer.test.js ADDED Viewed

@@ -0,0 +1,326 @@
+import { describe, test } from "node:test";
+import assert from "node:assert";
+import { PassThrough } from "node:stream";
+import { TeeWriter, createTeeWriter } from "@forwardimpact/libeval";
+/**
+ * Collect all data written to a PassThrough stream as a string.
+ * @param {PassThrough} stream
+ * @returns {string}
+ */
+function collect(stream) {
+  const data = stream.read();
+  return data ? data.toString() : "";
+}
+/**
+ * Write lines to a TeeWriter and wait for it to finish.
+ * @param {TeeWriter} writer
+ * @param {string[]} lines - JSON lines to write
+ */
+async function writeLines(writer, lines) {
+  for (const line of lines) {
+    writer.write(line + "\n");
+  }
+  await new Promise((resolve) => writer.end(resolve));
+}
+describe("TeeWriter", () => {
+  test("constructor throws on missing fileStream", () => {
+    assert.throws(
+      () => new TeeWriter({ textStream: new PassThrough() }),
+      /fileStream is required/,
+    );
+  });
+  test("constructor throws on missing textStream", () => {
+    assert.throws(
+      () => new TeeWriter({ fileStream: new PassThrough() }),
+      /textStream is required/,
+    );
+  });
+  test("writes NDJSON to fileStream and text to textStream in raw mode", async () => {
+    const fileStream = new PassThrough();
+    const textStream = new PassThrough();
+    const writer = new TeeWriter({ fileStream, textStream, mode: "raw" });
+    const events = [
+      JSON.stringify({
+        type: "system",
+        subtype: "init",
+        session_id: "s1",
+        model: "opus",
+      }),
+      JSON.stringify({
+        type: "assistant",
+        message: {
+          content: [{ type: "text", text: "Hello world" }],
+          usage: { input_tokens: 10, output_tokens: 5 },
+        },
+      }),
+      JSON.stringify({
+        type: "assistant",
+        message: {
+          content: [
+            {
+              type: "tool_use",
+              id: "t1",
+              name: "Bash",
+              input: { command: "ls" },
+            },
+          ],
+          usage: { input_tokens: 20, output_tokens: 10 },
+        },
+      }),
+      JSON.stringify({
+        type: "result",
+        subtype: "success",
+        duration_ms: 5000,
+        num_turns: 2,
+        total_cost_usd: 0.05,
+        usage: { input_tokens: 30, output_tokens: 15 },
+      }),
+    ];
+    await writeLines(writer, events);
+    const fileData = collect(fileStream);
+    const textData = collect(textStream);
+    // File should contain all NDJSON lines
+    const fileLines = fileData.trim().split("\n");
+    assert.strictEqual(fileLines.length, 4);
+    assert.deepStrictEqual(JSON.parse(fileLines[0]).type, "system");
+    assert.deepStrictEqual(JSON.parse(fileLines[3]).type, "result");
+    // Text should contain human-readable output
+    assert.ok(textData.includes("Hello world"));
+    assert.ok(textData.includes("> Tool: Bash"));
+    assert.ok(textData.includes("--- Result: success"));
+  });
+  test("streams text incrementally as events arrive", async () => {
+    const fileStream = new PassThrough();
+    const textStream = new PassThrough();
+    const writer = new TeeWriter({ fileStream, textStream, mode: "raw" });
+    // Write first assistant message
+    writer.write(
+      JSON.stringify({
+        type: "assistant",
+        message: {
+          content: [{ type: "text", text: "First message" }],
+          usage: { input_tokens: 10, output_tokens: 5 },
+        },
+      }) + "\n",
+    );
+    // Text should be available before stream ends
+    const firstText = collect(textStream);
+    assert.ok(firstText.includes("First message"));
+    writer.write(
+      JSON.stringify({
+        type: "assistant",
+        message: {
+          content: [{ type: "text", text: "Second message" }],
+          usage: { input_tokens: 20, output_tokens: 10 },
+        },
+      }) + "\n",
+    );
+    const secondText = collect(textStream);
+    assert.ok(secondText.includes("Second message"));
+    await new Promise((resolve) => writer.end(resolve));
+  });
+  test("supervised mode shows source labels and unwraps events", async () => {
+    const fileStream = new PassThrough();
+    const textStream = new PassThrough();
+    const writer = new TeeWriter({
+      fileStream,
+      textStream,
+      mode: "supervised",
+    });
+    const events = [
+      JSON.stringify({
+        source: "agent",
+        turn: 0,
+        event: {
+          type: "assistant",
+          message: {
+            content: [{ type: "text", text: "Working on it" }],
+            usage: { input_tokens: 10, output_tokens: 5 },
+          },
+        },
+      }),
+      JSON.stringify({
+        source: "supervisor",
+        turn: 1,
+        event: {
+          type: "assistant",
+          message: {
+            content: [{ type: "text", text: "Looks good" }],
+            usage: { input_tokens: 20, output_tokens: 10 },
+          },
+        },
+      }),
+      JSON.stringify({
+        source: "orchestrator",
+        type: "summary",
+        success: true,
+        turns: 1,
+      }),
+    ];
+    await writeLines(writer, events);
+    const fileData = collect(fileStream);
+    const textData = collect(textStream);
+    // File should contain all raw tagged NDJSON
+    const fileLines = fileData.trim().split("\n");
+    assert.strictEqual(fileLines.length, 3);
+    assert.strictEqual(JSON.parse(fileLines[0]).source, "agent");
+    // Text should show source labels
+    assert.ok(textData.includes("[agent]"));
+    assert.ok(textData.includes("Working on it"));
+    assert.ok(textData.includes("[supervisor]"));
+    assert.ok(textData.includes("Looks good"));
+    assert.ok(textData.includes("Evaluation completed after 1 turns"));
+  });
+  test("supervised mode shows incomplete status on failure", async () => {
+    const fileStream = new PassThrough();
+    const textStream = new PassThrough();
+    const writer = new TeeWriter({
+      fileStream,
+      textStream,
+      mode: "supervised",
+    });
+    await writeLines(writer, [
+      JSON.stringify({
+        source: "orchestrator",
+        type: "summary",
+        success: false,
+        turns: 5,
+      }),
+    ]);
+    const textData = collect(textStream);
+    assert.ok(textData.includes("Evaluation incomplete after 5 turns"));
+  });
+  test("supervised mode only shows source label on change", async () => {
+    const fileStream = new PassThrough();
+    const textStream = new PassThrough();
+    const writer = new TeeWriter({
+      fileStream,
+      textStream,
+      mode: "supervised",
+    });
+    const events = [
+      JSON.stringify({
+        source: "agent",
+        turn: 0,
+        event: {
+          type: "assistant",
+          message: {
+            content: [{ type: "text", text: "Step 1" }],
+            usage: { input_tokens: 10, output_tokens: 5 },
+          },
+        },
+      }),
+      JSON.stringify({
+        source: "agent",
+        turn: 0,
+        event: {
+          type: "assistant",
+          message: {
+            content: [{ type: "text", text: "Step 2" }],
+            usage: { input_tokens: 10, output_tokens: 5 },
+          },
+        },
+      }),
+    ];
+    await writeLines(writer, events);
+    const textData = collect(textStream);
+    // [agent] label should appear only once
+    const agentLabels = textData.split("[agent]").length - 1;
+    assert.strictEqual(agentLabels, 1);
+  });
+  test("handles partial lines across chunks", async () => {
+    const fileStream = new PassThrough();
+    const textStream = new PassThrough();
+    const writer = new TeeWriter({ fileStream, textStream, mode: "raw" });
+    const fullLine = JSON.stringify({
+      type: "assistant",
+      message: {
+        content: [{ type: "text", text: "Split message" }],
+        usage: { input_tokens: 10, output_tokens: 5 },
+      },
+    });
+    // Split the line across two chunks
+    const mid = Math.floor(fullLine.length / 2);
+    writer.write(fullLine.slice(0, mid));
+    writer.write(fullLine.slice(mid) + "\n");
+    await new Promise((resolve) => writer.end(resolve));
+    const textData = collect(textStream);
+    assert.ok(textData.includes("Split message"));
+  });
+  test("truncates long tool input", async () => {
+    const fileStream = new PassThrough();
+    const textStream = new PassThrough();
+    const writer = new TeeWriter({ fileStream, textStream, mode: "raw" });
+    const longInput = { command: "x".repeat(300) };
+    const event = JSON.stringify({
+      type: "assistant",
+      message: {
+        content: [
+          { type: "tool_use", id: "t1", name: "Bash", input: longInput },
+        ],
+        usage: { input_tokens: 10, output_tokens: 5 },
+      },
+    });
+    await writeLines(writer, [event]);
+    const textData = collect(textStream);
+    assert.ok(textData.includes("> Tool: Bash"));
+    assert.ok(textData.includes("..."));
+    // Truncated to ~200 chars
+    const toolLine = textData.split("\n").find((l) => l.startsWith("> Tool:"));
+    assert.ok(toolLine.length < 250);
+  });
+  test("defaults to raw mode", () => {
+    const writer = new TeeWriter({
+      fileStream: new PassThrough(),
+      textStream: new PassThrough(),
+    });
+    assert.strictEqual(writer.mode, "raw");
+  });
+  test("createTeeWriter factory returns a TeeWriter instance", () => {
+    const writer = createTeeWriter({
+      fileStream: new PassThrough(),
+      textStream: new PassThrough(),
+    });
+    assert.ok(writer instanceof TeeWriter);
+  });
+});