npm - @forwardimpact/libeval - Versions diffs - 0.1.0 → 0.1.1 - Mend

@forwardimpact/libeval 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

package/bin/fit-eval.js +26 -1
package/index.js +3 -0
package/package.json +6 -3
package/src/agent-runner.js +142 -0
package/src/commands/run.js +75 -0
package/src/commands/supervise.js +86 -0
package/src/supervisor.js +165 -0
package/src/tee-writer.js +157 -0
package/test/agent-runner.test.js +292 -0
package/test/supervisor.test.js +333 -0
package/test/tee-writer.test.js +326 -0

package/bin/fit-eval.js CHANGED Viewed

@@ -1,11 +1,15 @@
-#!/usr/bin/env node
+#!/usr/bin/env bun
 import { runOutputCommand } from "../src/commands/output.js";
 import { runTeeCommand } from "../src/commands/tee.js";
+import { runRunCommand } from "../src/commands/run.js";
+import { runSuperviseCommand } from "../src/commands/supervise.js";
 const COMMANDS = {
   output: runOutputCommand,
   tee: runTeeCommand,
+  run: runRunCommand,
+  supervise: runSuperviseCommand,
 };
 const HELP_TEXT = `
@@ -17,6 +21,25 @@ Usage:
 Commands:
   output [--format=json|text]    Process trace and output formatted result
   tee [output.ndjson]            Stream text to stdout, optionally save raw NDJSON
+  run [options]                  Run a single agent via the Claude Agent SDK
+  supervise [options]            Run a supervised agent ↔ supervisor relay loop
+Run options:
+  --task=PATH          Path to task file (required)
+  --cwd=DIR            Agent working directory (default: .)
+  --model=MODEL        Claude model to use (default: opus)
+  --max-turns=N        Maximum agentic turns (default: 50)
+  --output=PATH        Write NDJSON trace to file (default: stdout)
+  --allowed-tools=LIST Comma-separated tools (default: Bash,Read,Glob,Grep,Write,Edit)
+Supervise options:
+  --task=PATH               Path to task file (required)
+  --supervisor-cwd=DIR      Supervisor working directory (default: .)
+  --agent-cwd=DIR           Agent working directory (default: temp directory)
+  --model=MODEL             Claude model to use (default: opus)
+  --max-turns=N             Maximum supervisor ↔ agent exchanges (default: 20)
+  --output=PATH             Write NDJSON trace to file (default: stdout)
+  --allowed-tools=LIST      Comma-separated tools for agent (default: Bash,Read,Glob,Grep,Write,Edit)
 Options:
   --help      Show this help message
@@ -27,6 +50,8 @@ Examples:
   fit-eval output --format=json < trace.ndjson
   fit-eval tee < trace.ndjson
   fit-eval tee output.ndjson < trace.ndjson
+  fit-eval run --task=.github/tasks/security-audit.md --model=opus
+  fit-eval supervise --task=scenarios/guide-setup/task.md --supervisor-cwd=.
 `.trim();
 async function main() {

package/index.js CHANGED Viewed

@@ -1 +1,4 @@
 export { TraceCollector, createTraceCollector } from "./src/trace-collector.js";
+export { AgentRunner, createAgentRunner } from "./src/agent-runner.js";
+export { Supervisor, createSupervisor } from "./src/supervisor.js";
+export { TeeWriter, createTeeWriter } from "./src/tee-writer.js";

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@forwardimpact/libeval",
-  "version": "0.1.0",
+  "version": "0.1.1",
   "description": "Process Claude Code stream-json output into structured traces",
   "license": "Apache-2.0",
   "author": "D. Olsson <hi@senzilla.io>",
@@ -10,10 +10,13 @@
     "fit-eval": "./bin/fit-eval.js"
   },
   "engines": {
-    "node": ">=22.0.0"
+    "bun": ">=1.2.0"
   },
   "scripts": {
-    "test": "node --test test/*.test.js"
+    "test": "bun run node --test test/*.test.js"
+  },
+  "dependencies": {
+    "@anthropic-ai/claude-agent-sdk": "^0.1.0"
   },
   "publishConfig": {
     "access": "public"

package/src/agent-runner.js ADDED Viewed

@@ -0,0 +1,142 @@
+/**
+ * AgentRunner — runs a single Claude Agent SDK session and emits raw NDJSON
+ * events to an output stream. Building block for both `fit-eval run` and
+ * `fit-eval supervise`.
+ *
+ * Follows OO+DI: constructor injection, factory function, tests bypass factory.
+ */
+export class AgentRunner {
+  /**
+   * @param {object} deps
+   * @param {string} deps.cwd - Agent working directory
+   * @param {function} deps.query - SDK query function (injected for testing)
+   * @param {import("stream").Writable} deps.output - Stream to emit NDJSON to
+   * @param {string} [deps.model] - Claude model identifier
+   * @param {number} [deps.maxTurns] - Maximum agentic turns
+   * @param {string[]} [deps.allowedTools] - Tools the agent may use
+   * @param {string} [deps.permissionMode] - SDK permission mode
+   */
+  constructor({
+    cwd,
+    query,
+    output,
+    model,
+    maxTurns,
+    allowedTools,
+    permissionMode,
+  }) {
+    if (!cwd) throw new Error("cwd is required");
+    if (!query) throw new Error("query is required");
+    if (!output) throw new Error("output is required");
+    this.cwd = cwd;
+    this.query = query;
+    this.output = output;
+    this.model = model ?? "opus";
+    this.maxTurns = maxTurns ?? 50;
+    this.allowedTools = allowedTools ?? [
+      "Bash",
+      "Read",
+      "Glob",
+      "Grep",
+      "Write",
+      "Edit",
+    ];
+    this.permissionMode = permissionMode ?? "bypassPermissions";
+    this.sessionId = null;
+    this.buffer = [];
+  }
+  /**
+   * Run a new agent session with the given task.
+   * @param {string} task - The task prompt
+   * @returns {Promise<{success: boolean, text: string, sessionId: string|null}>}
+   */
+  async run(task) {
+    let text = "";
+    let stopReason = null;
+    let error = null;
+    try {
+      for await (const message of this.query({
+        prompt: task,
+        options: {
+          cwd: this.cwd,
+          allowedTools: this.allowedTools,
+          maxTurns: this.maxTurns,
+          model: this.model,
+          permissionMode: this.permissionMode,
+          allowDangerouslySkipPermissions: true,
+        },
+      })) {
+        const line = JSON.stringify(message);
+        this.output.write(line + "\n");
+        this.buffer.push(line);
+        if (message.type === "system" && message.subtype === "init") {
+          this.sessionId = message.session_id;
+        }
+        if (message.type === "result") {
+          text = message.result ?? "";
+          stopReason = message.subtype;
+        }
+      }
+    } catch (err) {
+      error = err;
+    }
+    const success = !error && stopReason === "success";
+    return { success, text, sessionId: this.sessionId, error };
+  }
+  /**
+   * Resume an existing session with a follow-up prompt.
+   * @param {string} prompt - The follow-up prompt
+   * @returns {Promise<{success: boolean, text: string}>}
+   */
+  async resume(prompt) {
+    let text = "";
+    let stopReason = null;
+    let error = null;
+    try {
+      for await (const message of this.query({
+        prompt,
+        options: { resume: this.sessionId },
+      })) {
+        const line = JSON.stringify(message);
+        this.output.write(line + "\n");
+        this.buffer.push(line);
+        if (message.type === "result") {
+          text = message.result ?? "";
+          stopReason = message.subtype;
+        }
+      }
+    } catch (err) {
+      error = err;
+    }
+    const success = !error && stopReason === "success";
+    return { success, text, error };
+  }
+  /**
+   * Drain buffered output lines. Used by Supervisor to tag and re-emit lines.
+   * @returns {string[]}
+   */
+  drainOutput() {
+    const lines = [...this.buffer];
+    this.buffer = [];
+    return lines;
+  }
+}
+/**
+ * Factory function — wires real dependencies.
+ * @param {object} deps - Same as AgentRunner constructor
+ * @returns {AgentRunner}
+ */
+export function createAgentRunner(deps) {
+  return new AgentRunner(deps);
+}

package/src/commands/run.js ADDED Viewed

@@ -0,0 +1,75 @@
+import { readFileSync, createWriteStream } from "node:fs";
+import { resolve } from "node:path";
+import { createAgentRunner } from "../agent-runner.js";
+import { createTeeWriter } from "../tee-writer.js";
+/**
+ * Parse a --key=value or --key value flag from args.
+ * @param {string[]} args
+ * @param {string} name - Flag name without --
+ * @returns {string|undefined}
+ */
+function parseFlag(args, name) {
+  const prefix = `--${name}=`;
+  for (let i = 0; i < args.length; i++) {
+    if (args[i].startsWith(prefix)) return args[i].slice(prefix.length);
+    if (args[i] === `--${name}` && i + 1 < args.length) return args[i + 1];
+  }
+  return undefined;
+}
+/**
+ * Run command — execute a single agent via the Claude Agent SDK.
+ *
+ * Usage: fit-eval run [options]
+ *
+ * Options:
+ *   --task=PATH          Path to task file (required)
+ *   --cwd=DIR            Agent working directory (default: .)
+ *   --model=MODEL        Claude model to use (default: opus)
+ *   --max-turns=N        Maximum agentic turns (default: 50)
+ *   --output=PATH        Write NDJSON trace to file (default: stdout)
+ *   --allowed-tools=LIST Comma-separated tools (default: Bash,Read,Glob,Grep,Write,Edit)
+ *
+ * @param {string[]} args - Command arguments
+ */
+export async function runRunCommand(args) {
+  const task = parseFlag(args, "task");
+  if (!task) throw new Error("--task is required");
+  const cwd = resolve(parseFlag(args, "cwd") ?? ".");
+  const model = parseFlag(args, "model") ?? "opus";
+  const maxTurns = parseInt(parseFlag(args, "max-turns") ?? "50", 10);
+  const outputPath = parseFlag(args, "output");
+  const allowedTools = (
+    parseFlag(args, "allowed-tools") ?? "Bash,Read,Glob,Grep,Write,Edit"
+  ).split(",");
+  const taskContent = readFileSync(task, "utf8");
+  // When --output is specified, stream text to stdout while writing NDJSON to file.
+  // Otherwise, write NDJSON directly to stdout (backwards-compatible).
+  const fileStream = outputPath ? createWriteStream(outputPath) : null;
+  const output = fileStream
+    ? createTeeWriter({ fileStream, textStream: process.stdout, mode: "raw" })
+    : process.stdout;
+  const { query } = await import("@anthropic-ai/claude-agent-sdk");
+  const runner = createAgentRunner({
+    cwd,
+    query,
+    output,
+    model,
+    maxTurns,
+    allowedTools,
+  });
+  const result = await runner.run(taskContent);
+  if (fileStream) {
+    await new Promise((r) => output.end(r));
+    await new Promise((r) => fileStream.end(r));
+  }
+  process.exit(result.success ? 0 : 1);
+}

package/src/commands/supervise.js ADDED Viewed

@@ -0,0 +1,86 @@
+import { readFileSync, createWriteStream, mkdtempSync } from "node:fs";
+import { resolve, join } from "node:path";
+import { tmpdir } from "node:os";
+import { createSupervisor } from "../supervisor.js";
+import { createTeeWriter } from "../tee-writer.js";
+/**
+ * Parse a --key=value or --key value flag from args.
+ * @param {string[]} args
+ * @param {string} name - Flag name without --
+ * @returns {string|undefined}
+ */
+function parseFlag(args, name) {
+  const prefix = `--${name}=`;
+  for (let i = 0; i < args.length; i++) {
+    if (args[i].startsWith(prefix)) return args[i].slice(prefix.length);
+    if (args[i] === `--${name}` && i + 1 < args.length) return args[i + 1];
+  }
+  return undefined;
+}
+/**
+ * Supervise command — run two agents in a relay loop via the Claude Agent SDK.
+ *
+ * Usage: fit-eval supervise [options]
+ *
+ * Options:
+ *   --task=PATH               Path to task file (required)
+ *   --supervisor-cwd=DIR      Supervisor working directory (default: .)
+ *   --agent-cwd=DIR           Agent working directory (default: temp directory)
+ *   --model=MODEL             Claude model to use (default: opus)
+ *   --max-turns=N             Maximum supervisor ↔ agent exchanges (default: 20)
+ *   --output=PATH             Write NDJSON trace to file (default: stdout)
+ *   --allowed-tools=LIST      Comma-separated tools for the agent (default: Bash,Read,Glob,Grep,Write,Edit)
+ *
+ * @param {string[]} args - Command arguments
+ */
+export async function runSuperviseCommand(args) {
+  const task = parseFlag(args, "task");
+  if (!task) throw new Error("--task is required");
+  const supervisorCwd = resolve(parseFlag(args, "supervisor-cwd") ?? ".");
+  const agentCwd = resolve(
+    parseFlag(args, "agent-cwd") ??
+      mkdtempSync(join(tmpdir(), "fit-eval-agent-")),
+  );
+  const model = parseFlag(args, "model") ?? "opus";
+  const maxTurns = parseInt(parseFlag(args, "max-turns") ?? "20", 10);
+  const outputPath = parseFlag(args, "output");
+  const allowedTools = (
+    parseFlag(args, "allowed-tools") ?? "Bash,Read,Glob,Grep,Write,Edit"
+  ).split(",");
+  const taskContent = readFileSync(task, "utf8");
+  // When --output is specified, stream text to stdout while writing NDJSON to file.
+  // Otherwise, write NDJSON directly to stdout (backwards-compatible).
+  const fileStream = outputPath ? createWriteStream(outputPath) : null;
+  const output = fileStream
+    ? createTeeWriter({
+        fileStream,
+        textStream: process.stdout,
+        mode: "supervised",
+      })
+    : process.stdout;
+  const { query } = await import("@anthropic-ai/claude-agent-sdk");
+  const supervisor = createSupervisor({
+    supervisorCwd,
+    agentCwd,
+    query,
+    output,
+    model,
+    maxTurns,
+    allowedTools,
+  });
+  const result = await supervisor.run(taskContent);
+  if (fileStream) {
+    await new Promise((r) => output.end(r));
+    await new Promise((r) => fileStream.end(r));
+  }
+  process.exit(result.success ? 0 : 1);
+}

package/src/supervisor.js ADDED Viewed

@@ -0,0 +1,165 @@
+/**
+ * Supervisor — orchestrates a relay loop between an agent and a supervisor,
+ * both running as AgentRunner instances. The agent works on a task while the
+ * supervisor observes and decides when the evaluation is complete.
+ *
+ * Follows OO+DI: constructor injection, factory function, tests bypass factory.
+ */
+import { PassThrough } from "node:stream";
+import { createAgentRunner } from "./agent-runner.js";
+/**
+ * Check if the supervisor's response signals evaluation completion.
+ * Uses a structured signal — `EVALUATION_COMPLETE` on its own line —
+ * to avoid false positives from natural language.
+ * @param {string} text
+ * @returns {boolean}
+ */
+export function isDone(text) {
+  return /^EVALUATION_COMPLETE$/m.test(text);
+}
+export class Supervisor {
+  /**
+   * @param {object} deps
+   * @param {import("./agent-runner.js").AgentRunner} deps.agentRunner - Runs the agent sessions
+   * @param {import("./agent-runner.js").AgentRunner} deps.supervisorRunner - Runs the supervisor sessions
+   * @param {import("stream").Writable} deps.output - Stream to emit tagged NDJSON to
+   * @param {number} [deps.maxTurns] - Maximum supervisor ↔ agent exchanges
+   */
+  constructor({ agentRunner, supervisorRunner, output, maxTurns }) {
+    if (!agentRunner) throw new Error("agentRunner is required");
+    if (!supervisorRunner) throw new Error("supervisorRunner is required");
+    if (!output) throw new Error("output is required");
+    this.agentRunner = agentRunner;
+    this.supervisorRunner = supervisorRunner;
+    this.output = output;
+    this.maxTurns = maxTurns ?? 20;
+  }
+  /**
+   * Run the supervisor ↔ agent relay loop.
+   * @param {string} task - The initial task for the agent
+   * @returns {Promise<{success: boolean, turns: number}>}
+   */
+  async run(task) {
+    // Turn 0: Agent receives the task and starts working
+    let agentResult = await this.agentRunner.run(task);
+    this.emitTagged("agent", 0);
+    if (agentResult.error) {
+      this.emitSummary({ success: false, turns: 0 });
+      return { success: false, turns: 0 };
+    }
+    for (let turn = 1; turn <= this.maxTurns; turn++) {
+      // Supervisor observes the agent's output
+      const supervisorPrompt =
+        `The agent reported:\n\n${agentResult.text}\n\n` +
+        `Decide: provide guidance, answer a question, or say EVALUATION_COMPLETE on its own line.`;
+      let supervisorResult;
+      if (turn === 1) {
+        supervisorResult = await this.supervisorRunner.run(supervisorPrompt);
+      } else {
+        supervisorResult = await this.supervisorRunner.resume(supervisorPrompt);
+      }
+      this.emitTagged("supervisor", turn);
+      if (supervisorResult.error) {
+        this.emitSummary({ success: false, turns: turn });
+        return { success: false, turns: turn };
+      }
+      if (isDone(supervisorResult.text)) {
+        this.emitSummary({ success: true, turns: turn });
+        return { success: true, turns: turn };
+      }
+      // Supervisor's response becomes the agent's next input
+      agentResult = await this.agentRunner.resume(supervisorResult.text);
+      this.emitTagged("agent", turn);
+      if (agentResult.error) {
+        this.emitSummary({ success: false, turns: turn });
+        return { success: false, turns: turn };
+      }
+    }
+    this.emitSummary({ success: false, turns: this.maxTurns });
+    return { success: false, turns: this.maxTurns };
+  }
+  /**
+   * Drain a runner's buffered output and re-emit each line tagged with
+   * source and turn metadata.
+   * @param {"agent"|"supervisor"} source
+   * @param {number} turn
+   */
+  emitTagged(source, turn) {
+    const runner =
+      source === "agent" ? this.agentRunner : this.supervisorRunner;
+    for (const line of runner.drainOutput()) {
+      const event = JSON.parse(line);
+      const tagged = { source, turn, event };
+      this.output.write(JSON.stringify(tagged) + "\n");
+    }
+  }
+  /**
+   * Emit a final orchestrator summary line.
+   * @param {{success: boolean, turns: number}} result
+   */
+  emitSummary(result) {
+    const summary = {
+      source: "orchestrator",
+      type: "summary",
+      success: result.success,
+      turns: result.turns,
+    };
+    this.output.write(JSON.stringify(summary) + "\n");
+  }
+}
+/**
+ * Factory function — wires both AgentRunners with their respective configs.
+ * @param {object} deps
+ * @param {string} deps.supervisorCwd - Supervisor working directory
+ * @param {string} deps.agentCwd - Agent working directory
+ * @param {function} deps.query - SDK query function
+ * @param {import("stream").Writable} deps.output - Final output stream
+ * @param {string} [deps.model] - Claude model identifier
+ * @param {number} [deps.maxTurns] - Maximum supervisor ↔ agent exchanges
+ * @param {string[]} [deps.allowedTools] - Tools the agent may use
+ * @returns {Supervisor}
+ */
+export function createSupervisor({
+  supervisorCwd,
+  agentCwd,
+  query,
+  output,
+  model,
+  maxTurns,
+  allowedTools,
+}) {
+  const agentRunner = createAgentRunner({
+    cwd: agentCwd,
+    query,
+    output: new PassThrough(),
+    model,
+    maxTurns: 50,
+    allowedTools,
+  });
+  const supervisorRunner = createAgentRunner({
+    cwd: supervisorCwd,
+    query,
+    output: new PassThrough(),
+    model,
+    maxTurns: 10,
+    allowedTools: ["Read", "Glob", "Grep"],
+  });
+  return new Supervisor({ agentRunner, supervisorRunner, output, maxTurns });
+}