npm - @forwardimpact/libeval - Versions diffs - 0.1.5 → 0.1.8 - Mend

@forwardimpact/libeval 0.1.5 → 0.1.8

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (13)

package/bin/fit-eval.js +2 -2
package/index.js +2 -0
package/package.json +1 -1
package/src/agent-runner.js +97 -39
package/src/commands/run.js +43 -18
package/src/commands/supervise.js +59 -37
package/src/supervisor.js +320 -48
package/src/trace-collector.js +7 -0
package/test/mock-runner.js +101 -0
package/test/supervisor-intervention.test.js +359 -0
package/test/{supervisor.test.js → supervisor-output.test.js} +120 -245
package/test/supervisor-run.test.js +310 -0
package/test/trace-collector.test.js +96 -0

package/bin/fit-eval.js CHANGED

@@ -29,7 +29,7 @@ Run options:
   --task-text=STRING   Inline task text (mutually exclusive with --task-file)
   --cwd=DIR            Agent working directory (default: .)
   --model=MODEL        Claude model to use (default: opus)
-  --max-turns=N        Maximum agentic turns (default: 50)
+  --max-turns=N        Maximum agentic turns (default: 50, 0 = unlimited)
   --output=PATH        Write NDJSON trace to file (default: stdout)
   --allowed-tools=LIST Comma-separated tools (default: Bash,Read,Glob,Grep,Write,Edit)
   --agent-profile=NAME Agent profile name (passed as --agent to Claude CLI)
@@ -40,7 +40,7 @@ Supervise options:
   --supervisor-cwd=DIR      Supervisor working directory (default: .)
   --agent-cwd=DIR           Agent working directory (default: temp directory)
   --model=MODEL             Claude model to use (default: opus)
-  --max-turns=N             Maximum supervisor ↔ agent exchanges (default: 20)
+  --max-turns=N             Maximum supervisor ↔ agent exchanges (default: 20, 0 = unlimited)
   --output=PATH             Write NDJSON trace to file (default: stdout)
   --allowed-tools=LIST      Comma-separated tools for agent (default: Bash,Read,Glob,Grep,Write,Edit)
   --supervisor-allowed-tools=LIST

package/index.js CHANGED

@@ -5,5 +5,7 @@ export {
   createSupervisor,
   SUPERVISOR_SYSTEM_PROMPT,
   AGENT_SYSTEM_PROMPT,
+  isComplete,
+  isIntervention,
 } from "./src/supervisor.js";
 export { TeeWriter, createTeeWriter } from "./src/tee-writer.js";

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@forwardimpact/libeval",
-  "version": "0.1.5",
+  "version": "0.1.8",
   "description": "Process Claude Code stream-json output into structured traces",
   "license": "Apache-2.0",
   "author": "D. Olsson <hi@senzilla.io>",

package/src/agent-runner.js CHANGED

@@ -17,6 +17,7 @@ export class AgentRunner {
    * @param {string[]} [deps.allowedTools] - Tools the agent may use
    * @param {string} [deps.permissionMode] - SDK permission mode
    * @param {function} [deps.onLine] - Callback invoked with each NDJSON line as it's produced
+   * @param {function} [deps.onBatch] - Async callback invoked with a batch of NDJSON lines at flush boundaries (assistant text blocks and result messages). Receives `(lines, { abort })` where calling `abort()` stops the in-flight SDK session via the AbortController. Optional; assignable at runtime so the Supervisor can swap it per turn.
    * @param {string[]} [deps.settingSources] - SDK setting sources (e.g. ['project'] to load CLAUDE.md)
    * @param {string} [deps.agentProfile] - Agent profile name to pass as --agent to the Claude CLI
    * @param {string|object} [deps.systemPrompt] - SDK system prompt (string replaces default; {type:'preset', preset:'claude_code', append} appends)
@@ -31,6 +32,7 @@ export class AgentRunner {
     allowedTools,
     permissionMode,
     onLine,
+    onBatch,
     settingSources,
     agentProfile,
     systemPrompt,
@@ -43,7 +45,7 @@ export class AgentRunner {
     this.query = query;
     this.output = output;
     this.model = model ?? "opus";
-    this.maxTurns = maxTurns ?? 50;
+    this.maxTurns = maxTurns ?? 50; // 0 means unlimited (omit from SDK)
     this.allowedTools = allowedTools ?? [
       "Bash",
       "Read",
@@ -54,101 +56,140 @@ export class AgentRunner {
     ];
     this.permissionMode = permissionMode ?? "bypassPermissions";
     this.onLine = onLine ?? null;
+    this.onBatch = onBatch ?? null;
     this.settingSources = settingSources ?? [];
     this.agentProfile = agentProfile ?? null;
     this.systemPrompt = systemPrompt ?? null;
     this.disallowedTools = disallowedTools ?? [];
     this.sessionId = null;
     this.buffer = [];
+    /** @type {AbortController|null} */
+    this.currentAbortController = null;
   }
   /**
    * Run a new agent session with the given task.
    * @param {string} task - The task prompt
-   * @returns {Promise<{success: boolean, text: string, sessionId: string|null}>}
+   * @returns {Promise<{success: boolean, text: string, sessionId: string|null, error: Error|null, aborted: boolean}>}
    */
   async run(task) {
-    let text = "";
-    let stopReason = null;
-    let error = null;
+    const abortController = new AbortController();
+    this.currentAbortController = abortController;
     try {
-      for await (const message of this.query({
+      const iterator = this.query({
         prompt: task,
         options: {
           cwd: this.cwd,
           allowedTools: this.allowedTools,
-          maxTurns: this.maxTurns,
+          ...(this.maxTurns > 0 && { maxTurns: this.maxTurns }),
           model: this.model,
           permissionMode: this.permissionMode,
           allowDangerouslySkipPermissions: true,
           settingSources: this.settingSources,
+          abortController,
           ...(this.disallowedTools.length > 0 && {
             disallowedTools: this.disallowedTools,
           }),
           ...(this.systemPrompt && { systemPrompt: this.systemPrompt }),
           ...(this.agentProfile && { extraArgs: { agent: this.agentProfile } }),
         },
-      })) {
-        const line = JSON.stringify(message);
-        this.output.write(line + "\n");
-        this.buffer.push(line);
-        if (this.onLine) this.onLine(line);
-        if (message.type === "system" && message.subtype === "init") {
-          this.sessionId = message.session_id;
-        }
-        if (message.type === "result") {
-          text = message.result ?? "";
-          stopReason = message.subtype;
-        }
-      }
-    } catch (err) {
-      error = err;
+      });
+      return await this.#consumeQuery(iterator);
+    } finally {
+      this.currentAbortController = null;
     }
-    // If the SDK already emitted a successful result, honour it even when the
-    // stream throws afterwards (e.g. "Credit balance is too low" during
-    // cleanup). Only treat errors as fatal when no result was received yet.
-    const success = stopReason === "success";
-    return { success, text, sessionId: this.sessionId, error };
   }
   /**
    * Resume an existing session with a follow-up prompt.
    * @param {string} prompt - The follow-up prompt
-   * @returns {Promise<{success: boolean, text: string}>}
+   * @returns {Promise<{success: boolean, text: string, sessionId: string|null, error: Error|null, aborted: boolean}>}
    */
   async resume(prompt) {
-    let text = "";
-    let stopReason = null;
-    let error = null;
+    const abortController = new AbortController();
+    this.currentAbortController = abortController;
     try {
-      for await (const message of this.query({
+      const iterator = this.query({
         prompt,
         options: {
           resume: this.sessionId,
           permissionMode: this.permissionMode,
           allowDangerouslySkipPermissions: true,
+          abortController,
         },
-      })) {
+      });
+      return await this.#consumeQuery(iterator);
+    } finally {
+      this.currentAbortController = null;
+    }
+  }
+  /**
+   * Shared consumer for both `run()` and `resume()`. Iterates the SDK query
+   * iterator, mirroring every line to the output stream / buffer / onLine
+   * callback, and — when `onBatch` is set — flushes accumulated lines to it
+   * at natural boundaries (assistant messages with text blocks, and the
+   * terminal `result` message).
+   *
+   * INVARIANT: the `await this.onBatch(...)` call below is the ONLY
+   * suspension point in this loop. While it is pending, no further lines
+   * are pulled from the SDK generator. The Supervisor relies on this — its
+   * onBatch callback flips `currentSource` to "supervisor" for the duration
+   * of its mid-turn LLM call, and the invariant guarantees no agent line
+   * can arrive concurrently and be mis-tagged.
+   *
+   * If the supervisor calls `abort()` from inside the callback, the next
+   * iteration of the for-await loop will throw. We catch the throw, check
+   * `currentAbortController.signal.aborted` (avoiding fragility around
+   * AbortError vs DOMException shapes), and report `aborted: true` so the
+   * caller can distinguish "supervisor asked us to stop" from a real error.
+   * @param {AsyncIterable<object>} iterator
+   * @returns {Promise<{success: boolean, text: string, sessionId: string|null, error: Error|null, aborted: boolean}>}
+   */
+  async #consumeQuery(iterator) {
+    let text = "";
+    let stopReason = null;
+    let error = null;
+    let aborted = false;
+    const pendingBatch = [];
+    try {
+      for await (const message of iterator) {
         const line = JSON.stringify(message);
         this.output.write(line + "\n");
         this.buffer.push(line);
         if (this.onLine) this.onLine(line);
+        if (this.onBatch) pendingBatch.push(line);
+        if (message.type === "system" && message.subtype === "init") {
+          this.sessionId = message.session_id;
+        }
         if (message.type === "result") {
           text = message.result ?? "";
           stopReason = message.subtype;
         }
+        const shouldFlush =
+          this.onBatch &&
+          (message.type === "result" ||
+            (message.type === "assistant" && hasTextBlock(message)));
+        if (shouldFlush) {
+          const batchLines = pendingBatch.splice(0, pendingBatch.length);
+          await this.onBatch(batchLines, {
+            abort: () => this.currentAbortController?.abort(),
+          });
+        }
       }
     } catch (err) {
-      error = err;
+      if (this.currentAbortController?.signal.aborted) {
+        aborted = true;
+      } else {
+        error = err;
+      }
     }
     const success = stopReason === "success";
-    return { success, text, error };
+    return { success, text, sessionId: this.sessionId, error, aborted };
   }
   /**
@@ -162,6 +203,23 @@ export class AgentRunner {
   }
 }
+/**
+ * Whether an SDK assistant message contains at least one text block.
+ * Tool-only assistant messages return false so they accumulate into the
+ * pending batch and flush with the next text block (or with the terminal
+ * `result` message), keeping supervisor LLM cost bounded.
+ * @param {object} message
+ * @returns {boolean}
+ */
+function hasTextBlock(message) {
+  const content = message.message?.content ?? message.content;
+  if (!Array.isArray(content)) return false;
+  for (const block of content) {
+    if (block.type === "text" && block.text) return true;
+  }
+  return false;
+}
 /**
  * Factory function — wires real dependencies.
  * @param {object} deps - Same as AgentRunner constructor

package/src/commands/run.js CHANGED

@@ -18,6 +18,38 @@ function parseFlag(args, name) {
   return undefined;
 }
+/**
+ * Parse and validate run command options from args.
+ * @param {string[]} args
+ * @returns {{ taskContent: string, cwd: string, model: string, maxTurns: number, outputPath: string|undefined, agentProfile: string|undefined, allowedTools: string[] }}
+ */
+function parseRunOptions(args) {
+  const taskFile = parseFlag(args, "task-file");
+  const taskText = parseFlag(args, "task-text");
+  if (taskFile && taskText)
+    throw new Error("--task-file and --task-text are mutually exclusive");
+  if (!taskFile && !taskText)
+    throw new Error("--task-file or --task-text is required");
+  const maxTurnsRaw = parseFlag(args, "max-turns") ?? "50";
+  const taskAmend = parseFlag(args, "task-amend") ?? undefined;
+  let taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
+  if (taskAmend) taskContent += `\n\n${taskAmend}`;
+  return {
+    taskContent,
+    cwd: resolve(parseFlag(args, "cwd") ?? "."),
+    model: parseFlag(args, "model") ?? "opus",
+    maxTurns: maxTurnsRaw === "0" ? 0 : parseInt(maxTurnsRaw, 10),
+    outputPath: parseFlag(args, "output"),
+    agentProfile: parseFlag(args, "agent-profile") ?? undefined,
+    allowedTools: (
+      parseFlag(args, "allowed-tools") ??
+      "Bash,Read,Glob,Grep,Write,Edit,Agent,TodoWrite"
+    ).split(","),
+  };
+}
 /**
  * Run command — execute a single agent via the Claude Agent SDK.
  *
@@ -28,31 +60,24 @@ function parseFlag(args, name) {
  *   --task-text=STRING   Inline task text (mutually exclusive with --task-file)
  *   --cwd=DIR            Agent working directory (default: .)
  *   --model=MODEL        Claude model to use (default: opus)
- *   --max-turns=N        Maximum agentic turns (default: 50)
+ *   --max-turns=N        Maximum agentic turns (default: 50, 0 = unlimited)
  *   --output=PATH        Write NDJSON trace to file (default: stdout)
  *   --allowed-tools=LIST Comma-separated tools (default: Bash,Read,Glob,Grep,Write,Edit)
  *   --agent-profile=NAME Agent profile name (passed as --agent to Claude CLI)
+ *   --task-amend=TEXT     Additional text appended to the task prompt
  *
  * @param {string[]} args - Command arguments
  */
 export async function runRunCommand(args) {
-  const taskFile = parseFlag(args, "task-file");
-  const taskText = parseFlag(args, "task-text");
-  if (taskFile && taskText)
-    throw new Error("--task-file and --task-text are mutually exclusive");
-  if (!taskFile && !taskText)
-    throw new Error("--task-file or --task-text is required");
-  const cwd = resolve(parseFlag(args, "cwd") ?? ".");
-  const model = parseFlag(args, "model") ?? "opus";
-  const maxTurns = parseInt(parseFlag(args, "max-turns") ?? "50", 10);
-  const outputPath = parseFlag(args, "output");
-  const agentProfile = parseFlag(args, "agent-profile") ?? undefined;
-  const allowedTools = (
-    parseFlag(args, "allowed-tools") ?? "Bash,Read,Glob,Grep,Write,Edit"
-  ).split(",");
-  const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
+  const {
+    taskContent,
+    cwd,
+    model,
+    maxTurns,
+    outputPath,
+    agentProfile,
+    allowedTools,
+  } = parseRunOptions(args);
   // When --output is specified, stream text to stdout while writing NDJSON to file.
   // Otherwise, write NDJSON directly to stdout (backwards-compatible).

package/src/commands/supervise.js CHANGED

@@ -19,6 +19,50 @@ function parseFlag(args, name) {
   return undefined;
 }
+/**
+ * Parse all supervise flags from args into an options object.
+ * @param {string[]} args
+ * @returns {object}
+ */
+function parseSuperviseOptions(args) {
+  const taskFile = parseFlag(args, "task-file");
+  const taskText = parseFlag(args, "task-text");
+  if (taskFile && taskText)
+    throw new Error("--task-file and --task-text are mutually exclusive");
+  if (!taskFile && !taskText)
+    throw new Error("--task-file or --task-text is required");
+  const supervisorAllowedToolsRaw = parseFlag(args, "supervisor-allowed-tools");
+  const taskAmend = parseFlag(args, "task-amend") ?? undefined;
+  let taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
+  if (taskAmend) taskContent += `\n\n${taskAmend}`;
+  return {
+    taskContent,
+    supervisorCwd: resolve(parseFlag(args, "supervisor-cwd") ?? "."),
+    agentCwd: resolve(
+      parseFlag(args, "agent-cwd") ??
+        mkdtempSync(join(tmpdir(), "fit-eval-agent-")),
+    ),
+    model: parseFlag(args, "model") ?? "opus",
+    maxTurns: (() => {
+      const raw = parseFlag(args, "max-turns") ?? "20";
+      return raw === "0" ? 0 : parseInt(raw, 10);
+    })(),
+    outputPath: parseFlag(args, "output"),
+    supervisorProfile: parseFlag(args, "supervisor-profile") ?? undefined,
+    agentProfile: parseFlag(args, "agent-profile") ?? undefined,
+    allowedTools: (
+      parseFlag(args, "allowed-tools") ??
+      "Bash,Read,Glob,Grep,Write,Edit,Agent,TodoWrite"
+    ).split(","),
+    supervisorAllowedTools: supervisorAllowedToolsRaw
+      ? supervisorAllowedToolsRaw.split(",")
+      : undefined,
+  };
+}
 /**
  * Supervise command — run two agents in a relay loop via the Claude Agent SDK.
  *
@@ -30,45 +74,23 @@ function parseFlag(args, name) {
  *   --supervisor-cwd=DIR      Supervisor working directory (default: .)
  *   --agent-cwd=DIR           Agent working directory (default: temp directory)
  *   --model=MODEL             Claude model to use (default: opus)
- *   --max-turns=N             Maximum supervisor ↔ agent exchanges (default: 20)
+ *   --max-turns=N             Maximum supervisor / agent exchanges (default: 20, 0 = unlimited)
  *   --output=PATH             Write NDJSON trace to file (default: stdout)
  *   --allowed-tools=LIST      Comma-separated tools for the agent (default: Bash,Read,Glob,Grep,Write,Edit)
  *   --supervisor-profile=NAME Supervisor agent profile name (passed as --agent to Claude CLI)
  *   --agent-profile=NAME      Agent profile name (passed as --agent to Claude CLI)
+ *   --task-amend=TEXT          Additional text appended to the task prompt
  *
  * @param {string[]} args - Command arguments
  */
 export async function runSuperviseCommand(args) {
-  const taskFile = parseFlag(args, "task-file");
-  const taskText = parseFlag(args, "task-text");
-  if (taskFile && taskText)
-    throw new Error("--task-file and --task-text are mutually exclusive");
-  if (!taskFile && !taskText)
-    throw new Error("--task-file or --task-text is required");
-  const supervisorCwd = resolve(parseFlag(args, "supervisor-cwd") ?? ".");
-  const agentCwd = resolve(
-    parseFlag(args, "agent-cwd") ??
-      mkdtempSync(join(tmpdir(), "fit-eval-agent-")),
-  );
-  const model = parseFlag(args, "model") ?? "opus";
-  const maxTurns = parseInt(parseFlag(args, "max-turns") ?? "20", 10);
-  const outputPath = parseFlag(args, "output");
-  const supervisorProfile = parseFlag(args, "supervisor-profile") ?? undefined;
-  const agentProfile = parseFlag(args, "agent-profile") ?? undefined;
-  const allowedTools = (
-    parseFlag(args, "allowed-tools") ?? "Bash,Read,Glob,Grep,Write,Edit"
-  ).split(",");
-  const supervisorAllowedToolsRaw = parseFlag(args, "supervisor-allowed-tools");
-  const supervisorAllowedTools = supervisorAllowedToolsRaw
-    ? supervisorAllowedToolsRaw.split(",")
-    : undefined;
-  const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
+  const opts = parseSuperviseOptions(args);
   // When --output is specified, stream text to stdout while writing NDJSON to file.
   // Otherwise, write NDJSON directly to stdout (backwards-compatible).
-  const fileStream = outputPath ? createWriteStream(outputPath) : null;
+  const fileStream = opts.outputPath
+    ? createWriteStream(opts.outputPath)
+    : null;
   const output = fileStream
     ? createTeeWriter({
         fileStream,
@@ -79,19 +101,19 @@ export async function runSuperviseCommand(args) {
   const { query } = await import("@anthropic-ai/claude-agent-sdk");
   const supervisor = createSupervisor({
-    supervisorCwd,
-    agentCwd,
+    supervisorCwd: opts.supervisorCwd,
+    agentCwd: opts.agentCwd,
     query,
     output,
-    model,
-    maxTurns,
-    allowedTools,
-    supervisorAllowedTools,
-    supervisorProfile,
-    agentProfile,
+    model: opts.model,
+    maxTurns: opts.maxTurns,
+    allowedTools: opts.allowedTools,
+    supervisorAllowedTools: opts.supervisorAllowedTools,
+    supervisorProfile: opts.supervisorProfile,
+    agentProfile: opts.agentProfile,
   });
-  const result = await supervisor.run(taskContent);
+  const result = await supervisor.run(opts.taskContent);
   if (fileStream) {
     await new Promise((r) => output.end(r));