npm - @forwardimpact/libeval - Versions diffs - 0.1.3 → 0.1.6 - Mend

@forwardimpact/libeval 0.1.3 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12)

package/bin/fit-eval.js +12 -4
package/index.js +6 -1
package/package.json +4 -3
package/src/agent-runner.js +19 -1
package/src/commands/run.js +12 -4
package/src/commands/supervise.js +20 -4
package/src/supervisor.js +141 -31
package/src/tee-writer.js +6 -3
package/src/trace-collector.js +7 -0
package/test/supervisor.test.js +261 -49
package/test/tee-writer.test.js +6 -8
package/test/trace-collector.test.js +96 -0

package/bin/fit-eval.js CHANGED Viewed

@@ -25,21 +25,28 @@ Commands:
   supervise [options]            Run a supervised agent ↔ supervisor relay loop
 Run options:
-  --task=PATH          Path to task file (required)
+  --task-file=PATH     Path to task file (mutually exclusive with --task-text)
+  --task-text=STRING   Inline task text (mutually exclusive with --task-file)
   --cwd=DIR            Agent working directory (default: .)
   --model=MODEL        Claude model to use (default: opus)
   --max-turns=N        Maximum agentic turns (default: 50)
   --output=PATH        Write NDJSON trace to file (default: stdout)
   --allowed-tools=LIST Comma-separated tools (default: Bash,Read,Glob,Grep,Write,Edit)
+  --agent-profile=NAME Agent profile name (passed as --agent to Claude CLI)
 Supervise options:
-  --task=PATH               Path to task file (required)
+  --task-file=PATH          Path to task file (mutually exclusive with --task-text)
+  --task-text=STRING        Inline task text (mutually exclusive with --task-file)
   --supervisor-cwd=DIR      Supervisor working directory (default: .)
   --agent-cwd=DIR           Agent working directory (default: temp directory)
   --model=MODEL             Claude model to use (default: opus)
   --max-turns=N             Maximum supervisor ↔ agent exchanges (default: 20)
   --output=PATH             Write NDJSON trace to file (default: stdout)
   --allowed-tools=LIST      Comma-separated tools for agent (default: Bash,Read,Glob,Grep,Write,Edit)
+  --supervisor-allowed-tools=LIST
+                            Comma-separated tools for supervisor (default: Bash,Read,Glob,Grep,Write,Edit)
+  --supervisor-profile=NAME Supervisor agent profile name (passed as --agent to Claude CLI)
+  --agent-profile=NAME      Agent profile name (passed as --agent to Claude CLI)
 Options:
   --help      Show this help message
@@ -50,8 +57,9 @@ Examples:
   fit-eval output --format=json < trace.ndjson
   fit-eval tee < trace.ndjson
   fit-eval tee output.ndjson < trace.ndjson
-  fit-eval run --task=.github/tasks/security-audit.md --model=opus
-  fit-eval supervise --task=scenarios/guide-setup/task.md --supervisor-cwd=.
+  fit-eval run --task-text="Perform a security audit of the repository." --model=opus
+  fit-eval run --task-file=scenarios/guide-setup/task.md --model=opus
+  fit-eval supervise --task-file=scenarios/guide-setup/task.md --supervisor-cwd=.
 `.trim();
 async function main() {

package/index.js CHANGED Viewed

@@ -1,4 +1,9 @@
 export { TraceCollector, createTraceCollector } from "./src/trace-collector.js";
 export { AgentRunner, createAgentRunner } from "./src/agent-runner.js";
-export { Supervisor, createSupervisor } from "./src/supervisor.js";
+export {
+  Supervisor,
+  createSupervisor,
+  SUPERVISOR_SYSTEM_PROMPT,
+  AGENT_SYSTEM_PROMPT,
+} from "./src/supervisor.js";
 export { TeeWriter, createTeeWriter } from "./src/tee-writer.js";

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@forwardimpact/libeval",
-  "version": "0.1.3",
+  "version": "0.1.6",
   "description": "Process Claude Code stream-json output into structured traces",
   "license": "Apache-2.0",
   "author": "D. Olsson <hi@senzilla.io>",
@@ -10,13 +10,14 @@
     "fit-eval": "./bin/fit-eval.js"
   },
   "engines": {
-    "bun": ">=1.2.0"
+    "bun": ">=1.2.0",
+    "node": ">=18.0.0"
   },
   "scripts": {
     "test": "bun run node --test test/*.test.js"
   },
   "dependencies": {
-    "@anthropic-ai/claude-agent-sdk": "^0.1.0"
+    "@anthropic-ai/claude-agent-sdk": "^0.2.91"
   },
   "publishConfig": {
     "access": "public"

package/src/agent-runner.js CHANGED Viewed

@@ -18,6 +18,9 @@ export class AgentRunner {
    * @param {string} [deps.permissionMode] - SDK permission mode
    * @param {function} [deps.onLine] - Callback invoked with each NDJSON line as it's produced
    * @param {string[]} [deps.settingSources] - SDK setting sources (e.g. ['project'] to load CLAUDE.md)
+   * @param {string} [deps.agentProfile] - Agent profile name to pass as --agent to the Claude CLI
+   * @param {string|object} [deps.systemPrompt] - SDK system prompt (string replaces default; {type:'preset', preset:'claude_code', append} appends)
+   * @param {string[]} [deps.disallowedTools] - Tools to explicitly remove from the model's context
    */
   constructor({
     cwd,
@@ -29,6 +32,9 @@ export class AgentRunner {
     permissionMode,
     onLine,
     settingSources,
+    agentProfile,
+    systemPrompt,
+    disallowedTools,
   }) {
     if (!cwd) throw new Error("cwd is required");
     if (!query) throw new Error("query is required");
@@ -49,6 +55,9 @@ export class AgentRunner {
     this.permissionMode = permissionMode ?? "bypassPermissions";
     this.onLine = onLine ?? null;
     this.settingSources = settingSources ?? [];
+    this.agentProfile = agentProfile ?? null;
+    this.systemPrompt = systemPrompt ?? null;
+    this.disallowedTools = disallowedTools ?? [];
     this.sessionId = null;
     this.buffer = [];
   }
@@ -74,6 +83,11 @@ export class AgentRunner {
           permissionMode: this.permissionMode,
           allowDangerouslySkipPermissions: true,
           settingSources: this.settingSources,
+          ...(this.disallowedTools.length > 0 && {
+            disallowedTools: this.disallowedTools,
+          }),
+          ...(this.systemPrompt && { systemPrompt: this.systemPrompt }),
+          ...(this.agentProfile && { extraArgs: { agent: this.agentProfile } }),
         },
       })) {
         const line = JSON.stringify(message);
@@ -113,7 +127,11 @@ export class AgentRunner {
     try {
       for await (const message of this.query({
         prompt,
-        options: { resume: this.sessionId },
+        options: {
+          resume: this.sessionId,
+          permissionMode: this.permissionMode,
+          allowDangerouslySkipPermissions: true,
+        },
       })) {
         const line = JSON.stringify(message);
         this.output.write(line + "\n");

package/src/commands/run.js CHANGED Viewed

@@ -24,28 +24,35 @@ function parseFlag(args, name) {
  * Usage: fit-eval run [options]
  *
  * Options:
- *   --task=PATH          Path to task file (required)
+ *   --task-file=PATH     Path to task file (mutually exclusive with --task-text)
+ *   --task-text=STRING   Inline task text (mutually exclusive with --task-file)
  *   --cwd=DIR            Agent working directory (default: .)
  *   --model=MODEL        Claude model to use (default: opus)
  *   --max-turns=N        Maximum agentic turns (default: 50)
  *   --output=PATH        Write NDJSON trace to file (default: stdout)
  *   --allowed-tools=LIST Comma-separated tools (default: Bash,Read,Glob,Grep,Write,Edit)
+ *   --agent-profile=NAME Agent profile name (passed as --agent to Claude CLI)
  *
  * @param {string[]} args - Command arguments
  */
 export async function runRunCommand(args) {
-  const task = parseFlag(args, "task");
-  if (!task) throw new Error("--task is required");
+  const taskFile = parseFlag(args, "task-file");
+  const taskText = parseFlag(args, "task-text");
+  if (taskFile && taskText)
+    throw new Error("--task-file and --task-text are mutually exclusive");
+  if (!taskFile && !taskText)
+    throw new Error("--task-file or --task-text is required");
   const cwd = resolve(parseFlag(args, "cwd") ?? ".");
   const model = parseFlag(args, "model") ?? "opus";
   const maxTurns = parseInt(parseFlag(args, "max-turns") ?? "50", 10);
   const outputPath = parseFlag(args, "output");
+  const agentProfile = parseFlag(args, "agent-profile") ?? undefined;
   const allowedTools = (
     parseFlag(args, "allowed-tools") ?? "Bash,Read,Glob,Grep,Write,Edit"
   ).split(",");
-  const taskContent = readFileSync(task, "utf8");
+  const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
   // When --output is specified, stream text to stdout while writing NDJSON to file.
   // Otherwise, write NDJSON directly to stdout (backwards-compatible).
@@ -63,6 +70,7 @@ export async function runRunCommand(args) {
     maxTurns,
     allowedTools,
     settingSources: ["project"],
+    agentProfile,
   });
   const result = await runner.run(taskContent);

package/src/commands/supervise.js CHANGED Viewed

@@ -25,19 +25,26 @@ function parseFlag(args, name) {
  * Usage: fit-eval supervise [options]
  *
  * Options:
- *   --task=PATH               Path to task file (required)
+ *   --task-file=PATH          Path to task file (mutually exclusive with --task-text)
+ *   --task-text=STRING        Inline task text (mutually exclusive with --task-file)
  *   --supervisor-cwd=DIR      Supervisor working directory (default: .)
  *   --agent-cwd=DIR           Agent working directory (default: temp directory)
  *   --model=MODEL             Claude model to use (default: opus)
  *   --max-turns=N             Maximum supervisor ↔ agent exchanges (default: 20)
  *   --output=PATH             Write NDJSON trace to file (default: stdout)
  *   --allowed-tools=LIST      Comma-separated tools for the agent (default: Bash,Read,Glob,Grep,Write,Edit)
+ *   --supervisor-profile=NAME Supervisor agent profile name (passed as --agent to Claude CLI)
+ *   --agent-profile=NAME      Agent profile name (passed as --agent to Claude CLI)
  *
  * @param {string[]} args - Command arguments
  */
 export async function runSuperviseCommand(args) {
-  const task = parseFlag(args, "task");
-  if (!task) throw new Error("--task is required");
+  const taskFile = parseFlag(args, "task-file");
+  const taskText = parseFlag(args, "task-text");
+  if (taskFile && taskText)
+    throw new Error("--task-file and --task-text are mutually exclusive");
+  if (!taskFile && !taskText)
+    throw new Error("--task-file or --task-text is required");
   const supervisorCwd = resolve(parseFlag(args, "supervisor-cwd") ?? ".");
   const agentCwd = resolve(
@@ -47,11 +54,17 @@ export async function runSuperviseCommand(args) {
   const model = parseFlag(args, "model") ?? "opus";
   const maxTurns = parseInt(parseFlag(args, "max-turns") ?? "20", 10);
   const outputPath = parseFlag(args, "output");
+  const supervisorProfile = parseFlag(args, "supervisor-profile") ?? undefined;
+  const agentProfile = parseFlag(args, "agent-profile") ?? undefined;
   const allowedTools = (
     parseFlag(args, "allowed-tools") ?? "Bash,Read,Glob,Grep,Write,Edit"
   ).split(",");
+  const supervisorAllowedToolsRaw = parseFlag(args, "supervisor-allowed-tools");
+  const supervisorAllowedTools = supervisorAllowedToolsRaw
+    ? supervisorAllowedToolsRaw.split(",")
+    : undefined;
-  const taskContent = readFileSync(task, "utf8");
+  const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
   // When --output is specified, stream text to stdout while writing NDJSON to file.
   // Otherwise, write NDJSON directly to stdout (backwards-compatible).
@@ -73,6 +86,9 @@ export async function runSuperviseCommand(args) {
     model,
     maxTurns,
     allowedTools,
+    supervisorAllowedTools,
+    supervisorProfile,
+    agentProfile,
   });
   const result = await supervisor.run(taskContent);

package/src/supervisor.js CHANGED Viewed

@@ -1,25 +1,38 @@
 /**
  * Supervisor — orchestrates a relay loop between an agent and a supervisor,
- * both running as AgentRunner instances. The agent works on a task while the
- * supervisor observes and decides when the evaluation is complete.
+ * both running as AgentRunner instances. The supervisor receives the task first,
+ * introduces itself, and delegates work to the agent. The loop then alternates:
+ * agent → supervisor → agent.
  *
  * Follows OO+DI: constructor injection, factory function, tests bypass factory.
  */
 import { PassThrough } from "node:stream";
 import { createAgentRunner } from "./agent-runner.js";
+import { TraceCollector } from "./trace-collector.js";
 /**
- * Check if the supervisor's response signals evaluation completion.
- * Uses a structured signal — `EVALUATION_COMPLETE` on its own line —
- * to avoid false positives from natural language.
+ * Check if the supervisor's response signals evaluation success.
+ * Matches EVALUATION_SUCCESSFUL anywhere in the text, tolerating markdown
+ * formatting (e.g. **EVALUATION_SUCCESSFUL**). Uses word boundaries to
+ * avoid matching inside longer identifiers.
  * @param {string} text
  * @returns {boolean}
  */
-export function isDone(text) {
-  return /^EVALUATION_COMPLETE$/m.test(text);
+export function isSuccessful(text) {
+  return /(?:^|[\s*_~`])EVALUATION_SUCCESSFUL(?:[\s*_~`.,!?]|$)/m.test(text);
 }
+/** System prompt appended for the supervisor runner in supervise mode. */
+export const SUPERVISOR_SYSTEM_PROMPT =
+  "You supervise another AI agent through a relay — your output becomes the agent's next input. " +
+  "Guide the agent, answer its questions, and write EVALUATION_SUCCESSFUL when their task is complete.";
+/** System prompt appended for the agent runner in supervise mode. */
+export const AGENT_SYSTEM_PROMPT =
+  "You are being supervised by another AI agent. " +
+  "When requirements are ambiguous or you are uncertain, stop and ask a clarifying question before proceeding.";
 export class Supervisor {
   /**
    * @param {object} deps
@@ -40,67 +53,113 @@ export class Supervisor {
     this.currentSource = "agent";
     /** @type {number} */
     this.currentTurn = 0;
+    /**
+     * Set to true when any supervisor message contains the success signal.
+     * The SDK result text only reflects the last assistant message, so when
+     * the supervisor writes EVALUATION_SUCCESSFUL in an early message and
+     * then continues with follow-up work, the result text won't contain it.
+     * This flag captures the signal from the full message stream.
+     * @type {boolean}
+     */
+    this.successSignalSeen = false;
   }
   /**
    * Run the supervisor ↔ agent relay loop.
-   * @param {string} task - The initial task for the agent
+   * The supervisor receives the task first, introduces itself, and delegates
+   * work to the agent. The loop then alternates: agent → supervisor → agent.
+   * @param {string} task - The initial task for the supervisor
    * @returns {Promise<{success: boolean, turns: number}>}
    */
   async run(task) {
-    // Turn 0: Agent receives the task and starts working
-    this.currentSource = "agent";
+    // Turn 0: Supervisor receives the task and introduces it to the agent
+    this.currentSource = "supervisor";
     this.currentTurn = 0;
-    let agentResult = await this.agentRunner.run(task);
+    this.successSignalSeen = false;
+    let supervisorResult = await this.supervisorRunner.run(task);
-    if (agentResult.error) {
+    if (supervisorResult.error) {
       this.emitSummary({ success: false, turns: 0 });
       return { success: false, turns: 0 };
     }
-    for (let turn = 1; turn <= this.maxTurns; turn++) {
-      // Supervisor observes the agent's output
-      const supervisorPrompt =
-        `The agent reported:\n\n${agentResult.text}\n\n` +
-        `Decide: provide guidance, answer a question, or say EVALUATION_COMPLETE on its own line.`;
+    // Check for the success signal in either the SDK result text or the
+    // streamed message content. The SDK result text only reflects the last
+    // assistant message, so when the supervisor writes EVALUATION_SUCCESSFUL
+    // early and then continues (e.g. filing issues), we must also check the
+    // flag set by emitLine during streaming.
+    if (this.successSignalSeen || isSuccessful(supervisorResult.text)) {
+      this.emitSummary({ success: true, turns: 0 });
+      return { success: true, turns: 0 };
+    }
-      this.currentSource = "supervisor";
+    for (let turn = 1; turn <= this.maxTurns; turn++) {
+      // Supervisor's output becomes the agent's input
+      this.currentSource = "agent";
       this.currentTurn = turn;
-      let supervisorResult;
+      let agentResult;
       if (turn === 1) {
-        supervisorResult = await this.supervisorRunner.run(supervisorPrompt);
+        agentResult = await this.agentRunner.run(supervisorResult.text);
       } else {
-        supervisorResult = await this.supervisorRunner.resume(supervisorPrompt);
+        agentResult = await this.agentRunner.resume(supervisorResult.text);
       }
-      if (supervisorResult.error) {
+      if (agentResult.error) {
         this.emitSummary({ success: false, turns: turn });
         return { success: false, turns: turn };
       }
-      if (isDone(supervisorResult.text)) {
-        this.emitSummary({ success: true, turns: turn });
-        return { success: true, turns: turn };
-      }
+      // Build the full agent transcript from buffered NDJSON events so the
+      // supervisor sees tool calls and reasoning, not just the SDK result summary.
+      const agentTranscript = this.extractTranscript(this.agentRunner);
-      // Supervisor's response becomes the agent's next input
-      this.currentSource = "agent";
+      const supervisorPrompt =
+        `The agent reported:\n\n${agentTranscript}\n\n` +
+        `Review the agent's work and decide how to proceed.`;
+      this.currentSource = "supervisor";
       this.currentTurn = turn;
-      agentResult = await this.agentRunner.resume(supervisorResult.text);
+      this.successSignalSeen = false;
+      supervisorResult = await this.supervisorRunner.resume(supervisorPrompt);
-      if (agentResult.error) {
+      if (supervisorResult.error) {
         this.emitSummary({ success: false, turns: turn });
         return { success: false, turns: turn };
       }
+      // The supervisor's turn is fully complete — check for success signal
+      // in either the SDK result text or streamed messages.
+      if (this.successSignalSeen || isSuccessful(supervisorResult.text)) {
+        this.emitSummary({ success: true, turns: turn });
+        return { success: true, turns: turn };
+      }
     }
     this.emitSummary({ success: false, turns: this.maxTurns });
     return { success: false, turns: this.maxTurns };
   }
+  /**
+   * Extract a human-readable transcript from an AgentRunner's buffered output.
+   * Drains the buffer and replays events through a TraceCollector.
+   * @param {import("./agent-runner.js").AgentRunner} runner
+   * @returns {string}
+   */
+  extractTranscript(runner) {
+    const lines = runner.drainOutput();
+    const collector = new TraceCollector();
+    for (const line of lines) {
+      collector.addLine(line);
+    }
+    return collector.toText() || "[The agent produced no output.]";
+  }
   /**
    * Emit a single NDJSON line tagged with the current source and turn.
    * Called in real-time via the AgentRunner onLine callback.
+   *
+   * When the current source is the supervisor, also scans assistant text
+   * content for the EVALUATION_SUCCESSFUL signal and sets successSignalSeen.
    * @param {string} line - Raw NDJSON line from the runner
    */
   emitLine(line) {
@@ -111,6 +170,21 @@ export class Supervisor {
       event,
     };
     this.output.write(JSON.stringify(tagged) + "\n");
+    // Scan supervisor assistant messages for the success signal in real time.
+    // The SDK result text only reflects the final assistant message, but the
+    // supervisor may write EVALUATION_SUCCESSFUL in an earlier message and
+    // then continue with follow-up tool calls.
+    if (this.currentSource === "supervisor" && event.type === "assistant") {
+      const content = event.message?.content ?? event.content ?? [];
+      if (Array.isArray(content)) {
+        for (const block of content) {
+          if (block.type === "text" && isSuccessful(block.text)) {
+            this.successSignalSeen = true;
+          }
+        }
+      }
+    }
   }
   /**
@@ -138,6 +212,10 @@ export class Supervisor {
  * @param {string} [deps.model] - Claude model identifier
  * @param {number} [deps.maxTurns] - Maximum supervisor ↔ agent exchanges
  * @param {string[]} [deps.allowedTools] - Tools the agent may use
+ * @param {string[]} [deps.supervisorAllowedTools] - Tools the supervisor may use (default: Bash, Read, Glob, Grep, Write, Edit)
+ * @param {string[]} [deps.supervisorDisallowedTools] - Tools to explicitly block from the supervisor
+ * @param {string} [deps.supervisorProfile] - Supervisor agent profile name
+ * @param {string} [deps.agentProfile] - Agent profile name
  * @returns {Supervisor}
  */
 export function createSupervisor({
@@ -148,6 +226,10 @@ export function createSupervisor({
   model,
   maxTurns,
   allowedTools,
+  supervisorDisallowedTools,
+  supervisorAllowedTools,
+  supervisorProfile,
+  agentProfile,
 }) {
   // Forward-reference: onLine captures `supervisor` before construction completes.
   // This is safe because onLine is only called during run(), after construction.
@@ -163,17 +245,45 @@ export function createSupervisor({
     allowedTools,
     onLine,
     settingSources: ["project"],
+    agentProfile,
+    systemPrompt: {
+      type: "preset",
+      preset: "claude_code",
+      append: AGENT_SYSTEM_PROMPT,
+    },
   });
+  // Block Task/TaskOutput so the supervisor cannot spawn its own sub-agents.
+  // The relay loop handles agent communication — letting the supervisor use
+  // Task would bypass the relay and produce an empty agent trace.
+  const defaultDisallowed = ["Task", "TaskOutput"];
+  const disallowedTools = supervisorDisallowedTools
+    ? [...new Set([...defaultDisallowed, ...supervisorDisallowedTools])]
+    : defaultDisallowed;
   const supervisorRunner = createAgentRunner({
     cwd: supervisorCwd,
     query,
     output: new PassThrough(),
     model,
     maxTurns: 10,
-    allowedTools: ["Read", "Glob", "Grep"],
+    allowedTools: supervisorAllowedTools ?? [
+      "Bash",
+      "Read",
+      "Glob",
+      "Grep",
+      "Write",
+      "Edit",
+    ],
+    disallowedTools,
     onLine,
     settingSources: ["project"],
+    agentProfile: supervisorProfile,
+    systemPrompt: {
+      type: "preset",
+      preset: "claude_code",
+      append: SUPERVISOR_SYSTEM_PROMPT,
+    },
   });
   supervisor = new Supervisor({

package/src/tee-writer.js CHANGED Viewed

@@ -107,7 +107,6 @@ export class TeeWriter extends Writable {
     if (parsed.event) {
       if (parsed.source && parsed.source !== this.lastSource) {
         this.lastSource = parsed.source;
-        this.textStream.write(`\n[${parsed.source}]\n`);
       }
       this.collector.addLine(JSON.stringify(parsed.event));
       this.flushTurns();
@@ -119,15 +118,19 @@ export class TeeWriter extends Writable {
    */
   flushTurns() {
     const turns = this.collector.turns;
+    const prefix =
+      this.mode === "supervised" && this.lastSource
+        ? `[${this.lastSource}] `
+        : "";
     while (this.turnsEmitted < turns.length) {
       const turn = turns[this.turnsEmitted++];
       if (turn.role === "assistant") {
         for (const block of turn.content) {
           if (block.type === "text") {
-            this.textStream.write(block.text + "\n");
+            this.textStream.write(`${prefix}${block.text}\n`);
           } else if (block.type === "tool_use") {
             const input = summarizeInput(block.input);
-            this.textStream.write(`> Tool: ${block.name} ${input}\n`);
+            this.textStream.write(`${prefix}> Tool: ${block.name} ${input}\n`);
           }
         }
       }

package/src/trace-collector.js CHANGED Viewed

@@ -38,6 +38,13 @@ export class TraceCollector {
       return;
     }
+    // Unwrap combined supervised trace format {source, turn, event}.
+    // The Supervisor emits this wrapper; when replayed through addLine the
+    // inner event is the one we need.
+    if (event.event && !event.type && typeof event.source === "string") {
+      event = event.event;
+    }
     switch (event.type) {
       case "system":
         this.handleSystem(event);

package/test/supervisor.test.js CHANGED Viewed

@@ -6,8 +6,10 @@ import {
   AgentRunner,
   Supervisor,
   createSupervisor,
+  SUPERVISOR_SYSTEM_PROMPT,
+  AGENT_SYSTEM_PROMPT,
 } from "@forwardimpact/libeval";
-import { isDone } from "../src/supervisor.js";
+import { isSuccessful } from "../src/supervisor.js";
 /**
  * Create a mock AgentRunner that yields pre-scripted responses.
@@ -61,26 +63,50 @@ function createMockRunner(responses, messages) {
   return runner;
 }
-describe("isDone", () => {
-  test("detects EVALUATION_COMPLETE on its own line", () => {
-    assert.strictEqual(isDone("EVALUATION_COMPLETE"), true);
+describe("isSuccessful", () => {
+  test("detects EVALUATION_SUCCESSFUL on its own line", () => {
+    assert.strictEqual(isSuccessful("EVALUATION_SUCCESSFUL"), true);
     assert.strictEqual(
-      isDone("Some text\nEVALUATION_COMPLETE\nMore text"),
+      isSuccessful("Some text\nEVALUATION_SUCCESSFUL\nMore text"),
       true,
     );
-    assert.strictEqual(isDone("Done.\n\nEVALUATION_COMPLETE"), true);
+    assert.strictEqual(isSuccessful("Done.\n\nEVALUATION_SUCCESSFUL"), true);
   });
-  test("does not match EVALUATION_COMPLETE embedded in text", () => {
-    assert.strictEqual(isDone("not EVALUATION_COMPLETE yet"), false);
-    assert.strictEqual(isDone("The agent is EVALUATION_COMPLETE done"), false);
-    assert.strictEqual(isDone("EVALUATION_COMPLETE_EXTRA"), false);
+  test("tolerates markdown formatting around the signal", () => {
+    assert.strictEqual(isSuccessful("**EVALUATION_SUCCESSFUL**"), true);
+    assert.strictEqual(isSuccessful("*EVALUATION_SUCCESSFUL*"), true);
+    assert.strictEqual(isSuccessful("__EVALUATION_SUCCESSFUL__"), true);
+    assert.strictEqual(isSuccessful("_EVALUATION_SUCCESSFUL_"), true);
+    assert.strictEqual(isSuccessful("`EVALUATION_SUCCESSFUL`"), true);
+    assert.strictEqual(
+      isSuccessful(
+        "Good work.\n\n**EVALUATION_SUCCESSFUL**\n\nNow filing issues.",
+      ),
+      true,
+    );
+  });
+  test("matches EVALUATION_SUCCESSFUL anywhere in text", () => {
+    assert.strictEqual(isSuccessful("not EVALUATION_SUCCESSFUL yet"), true);
+    assert.strictEqual(
+      isSuccessful("The agent is EVALUATION_SUCCESSFUL done"),
+      true,
+    );
+    assert.strictEqual(
+      isSuccessful("Great work! EVALUATION_SUCCESSFUL. Now filing issues."),
+      true,
+    );
   });
   test("does not match empty or unrelated text", () => {
-    assert.strictEqual(isDone(""), false);
-    assert.strictEqual(isDone("All done!"), false);
-    assert.strictEqual(isDone("DONE"), false);
+    assert.strictEqual(isSuccessful(""), false);
+    assert.strictEqual(isSuccessful("All done!"), false);
+    assert.strictEqual(isSuccessful("DONE"), false);
+  });
+  test("does not match old EVALUATION_COMPLETE signal", () => {
+    assert.strictEqual(isSuccessful("EVALUATION_COMPLETE"), false);
   });
 });
@@ -118,13 +144,35 @@ describe("Supervisor", () => {
     );
   });
-  test("completes on EVALUATION_COMPLETE from supervisor", async () => {
+  test("completes on EVALUATION_SUCCESSFUL from supervisor at turn 0", async () => {
+    const agentRunner = createMockRunner([]);
+    const supervisorRunner = createMockRunner([
+      { text: "EVALUATION_SUCCESSFUL" },
+    ]);
+    const output = new PassThrough();
+    const supervisor = new Supervisor({
+      agentRunner,
+      supervisorRunner,
+      output,
+      maxTurns: 10,
+    });
+    const result = await supervisor.run("Install stuff");
+    assert.strictEqual(result.success, true);
+    assert.strictEqual(result.turns, 0);
+  });
+  test("completes after one agent turn", async () => {
     const agentRunner = createMockRunner([
       { text: "I installed the packages." },
     ]);
     const supervisorRunner = createMockRunner([
-      { text: "Good work.\n\nEVALUATION_COMPLETE" },
+      { text: "Welcome! Please install the packages." },
+      { text: "Good work.\n\nEVALUATION_SUCCESSFUL" },
     ]);
     const output = new PassThrough();
@@ -141,6 +189,67 @@ describe("Supervisor", () => {
     assert.strictEqual(result.turns, 1);
   });
+  test("detects EVALUATION_SUCCESSFUL in streamed messages when result text differs", async () => {
+    // Simulates the real failure: supervisor writes EVALUATION_SUCCESSFUL in
+    // an early message, then continues with follow-up work (e.g. filing issues).
+    // The SDK result text reflects only the final message, which does NOT
+    // contain the signal.
+    const agentRunner = createMockRunner([
+      { text: "I installed the packages." },
+    ]);
+    // The supervisor's result text is the Summary (no signal), but messages
+    // include one with EVALUATION_SUCCESSFUL.
+    const supervisorMessages = [
+      undefined, // turn 0: use default
+      [
+        {
+          type: "assistant",
+          message: {
+            content: [
+              {
+                type: "text",
+                text: "Good work.\n\nEVALUATION_SUCCESSFUL\n\nNow filing issues.",
+              },
+            ],
+          },
+        },
+        {
+          type: "assistant",
+          message: {
+            content: [
+              { type: "text", text: "## Summary\n\nAll issues filed." },
+            ],
+          },
+        },
+      ],
+    ];
+    const supervisorRunner = createMockRunner(
+      [
+        { text: "Welcome! Please install the packages." },
+        // Result text is the final message — does NOT contain the signal
+        { text: "## Summary\n\nAll issues filed." },
+      ],
+      supervisorMessages,
+    );
+    const output = new PassThrough();
+    const supervisor = new Supervisor({
+      agentRunner,
+      supervisorRunner,
+      output,
+      maxTurns: 10,
+    });
+    agentRunner.onLine = (line) => supervisor.emitLine(line);
+    supervisorRunner.onLine = (line) => supervisor.emitLine(line);
+    const result = await supervisor.run("Install stuff");
+    assert.strictEqual(result.success, true);
+    assert.strictEqual(result.turns, 1);
+  });
   test("runs multiple turns before completion", async () => {
     const agentRunner = createMockRunner([
       { text: "Started working." },
@@ -149,9 +258,10 @@ describe("Supervisor", () => {
     ]);
     const supervisorRunner = createMockRunner([
+      { text: "Here is your task. Do the work." },
       { text: "Keep going, you need to do more." },
       { text: "Almost there, continue." },
-      { text: "EVALUATION_COMPLETE" },
+      { text: "EVALUATION_SUCCESSFUL" },
     ]);
     const output = new PassThrough();
@@ -169,14 +279,14 @@ describe("Supervisor", () => {
   });
   test("enforces maxTurns limit", async () => {
-    // Agent responds to every turn, supervisor never says done
+    // Supervisor starts, agent responds each turn, supervisor never says done
     const agentRunner = createMockRunner([
-      { text: "Turn 0" },
       { text: "Turn 1" },
       { text: "Turn 2" },
     ]);
     const supervisorRunner = createMockRunner([
+      { text: "Start working." },
       { text: "Continue." },
       { text: "Continue." },
     ]);
@@ -196,16 +306,17 @@ describe("Supervisor", () => {
   });
   test("output contains tagged lines with correct source and turn", async () => {
-    const agentMessages = [[{ type: "assistant", content: "Working" }]];
     const supervisorMessages = [
-      [{ type: "assistant", content: "EVALUATION_COMPLETE" }],
+      [{ type: "assistant", content: "Go ahead" }],
+      [{ type: "assistant", content: "EVALUATION_SUCCESSFUL" }],
     ];
+    const agentMessages = [[{ type: "assistant", content: "Working" }]];
-    const agentRunner = createMockRunner([{ text: "Working" }], agentMessages);
     const supervisorRunner = createMockRunner(
-      [{ text: "EVALUATION_COMPLETE" }],
+      [{ text: "Go ahead" }, { text: "EVALUATION_SUCCESSFUL" }],
       supervisorMessages,
     );
+    const agentRunner = createMockRunner([{ text: "Working" }], agentMessages);
     const output = new PassThrough();
     const supervisor = new Supervisor({
@@ -225,19 +336,19 @@ describe("Supervisor", () => {
       .split("\n")
       .filter((l) => l.length > 0);
-    // Should have: agent turn 0, supervisor turn 1, orchestrator summary
-    assert.ok(lines.length >= 3);
-    const agentLine = JSON.parse(lines[0]);
-    assert.strictEqual(agentLine.source, "agent");
-    assert.strictEqual(agentLine.turn, 0);
-    assert.ok("event" in agentLine);
+    // Should have: supervisor turn 0, agent turn 1, supervisor turn 1, orchestrator summary
+    assert.ok(lines.length >= 4);
-    const supervisorLine = JSON.parse(lines[1]);
+    const supervisorLine = JSON.parse(lines[0]);
     assert.strictEqual(supervisorLine.source, "supervisor");
-    assert.strictEqual(supervisorLine.turn, 1);
+    assert.strictEqual(supervisorLine.turn, 0);
     assert.ok("event" in supervisorLine);
+    const agentLine = JSON.parse(lines[1]);
+    assert.strictEqual(agentLine.source, "agent");
+    assert.strictEqual(agentLine.turn, 1);
+    assert.ok("event" in agentLine);
     const summaryLine = JSON.parse(lines[lines.length - 1]);
     assert.strictEqual(summaryLine.source, "orchestrator");
     assert.strictEqual(summaryLine.type, "summary");
@@ -250,11 +361,14 @@ describe("Supervisor", () => {
       source: "sdk-internal",
       content: "test",
     };
-    const agentRunner = createMockRunner([{ text: "Done" }], [[sourceEvent]]);
     const supervisorRunner = createMockRunner(
-      [{ text: "EVALUATION_COMPLETE" }],
-      [[{ type: "assistant", content: "ok" }]],
+      [{ text: "Go" }, { text: "EVALUATION_SUCCESSFUL" }],
+      [
+        [{ type: "assistant", content: "Go" }],
+        [{ type: "assistant", content: "ok" }],
+      ],
     );
+    const agentRunner = createMockRunner([{ text: "Done" }], [[sourceEvent]]);
     const output = new PassThrough();
     const supervisor = new Supervisor({
@@ -274,27 +388,30 @@ describe("Supervisor", () => {
       .split("\n")
       .filter((l) => l.length > 0);
-    const tagged = JSON.parse(lines[0]);
+    // First line is supervisor turn 0, second is agent turn 1
+    const tagged = JSON.parse(lines[1]);
     // The original event's `source` field is preserved inside `event`
     assert.strictEqual(tagged.source, "agent");
     assert.strictEqual(tagged.event.source, "sdk-internal");
   });
-  test("emits agent output and summary when agent errors on turn 0", async () => {
-    const agentMessages = [[{ type: "assistant", content: "Partial work" }]];
-    const agentRunner = createMockRunner(
-      [{ text: "Partial work", success: false }],
-      agentMessages,
+  test("emits supervisor output and summary when supervisor errors on turn 0", async () => {
+    const supervisorMessages = [
+      [{ type: "assistant", content: "Starting..." }],
+    ];
+    const supervisorRunner = createMockRunner(
+      [{ text: "Starting...", success: false }],
+      supervisorMessages,
     );
     // Override run to simulate an error return
-    const origRun = agentRunner.run;
-    agentRunner.run = async (task) => {
-      const result = await origRun.call(agentRunner, task);
+    const origRun = supervisorRunner.run;
+    supervisorRunner.run = async (task) => {
+      const result = await origRun.call(supervisorRunner, task);
       return { ...result, error: new Error("Process exited with code 1") };
     };
-    const supervisorRunner = createMockRunner([]);
+    const agentRunner = createMockRunner([]);
     const output = new PassThrough();
     const supervisor = new Supervisor({
@@ -311,18 +428,18 @@ describe("Supervisor", () => {
     assert.strictEqual(result.success, false);
     assert.strictEqual(result.turns, 0);
-    // Output should still contain the agent's buffered lines + summary
+    // Output should still contain the supervisor's buffered lines + summary
     const data = output.read()?.toString() ?? "";
     const lines = data
       .trim()
       .split("\n")
       .filter((l) => l.length > 0);
-    assert.ok(lines.length >= 2, "Expected at least agent line + summary");
+    assert.ok(lines.length >= 2, "Expected at least supervisor line + summary");
-    const agentLine = JSON.parse(lines[0]);
-    assert.strictEqual(agentLine.source, "agent");
-    assert.strictEqual(agentLine.turn, 0);
+    const supervisorLine = JSON.parse(lines[0]);
+    assert.strictEqual(supervisorLine.source, "supervisor");
+    assert.strictEqual(supervisorLine.turn, 0);
     const summaryLine = JSON.parse(lines[lines.length - 1]);
     assert.strictEqual(summaryLine.source, "orchestrator");
@@ -339,4 +456,99 @@ describe("Supervisor", () => {
     });
     assert.ok(supervisor instanceof Supervisor);
   });
+  test("createSupervisor uses default supervisor tools when none specified", () => {
+    const supervisor = createSupervisor({
+      supervisorCwd: "/tmp/sup",
+      agentCwd: "/tmp/agent",
+      query: async function* () {},
+      output: new PassThrough(),
+    });
+    assert.deepStrictEqual(supervisor.supervisorRunner.allowedTools, [
+      "Bash",
+      "Read",
+      "Glob",
+      "Grep",
+      "Write",
+      "Edit",
+    ]);
+  });
+  test("createSupervisor passes custom supervisor tools", () => {
+    const supervisor = createSupervisor({
+      supervisorCwd: "/tmp/sup",
+      agentCwd: "/tmp/agent",
+      query: async function* () {},
+      output: new PassThrough(),
+      supervisorAllowedTools: ["Read", "Glob", "Grep"],
+    });
+    assert.deepStrictEqual(supervisor.supervisorRunner.allowedTools, [
+      "Read",
+      "Glob",
+      "Grep",
+    ]);
+  });
+  test("createSupervisor wires system prompts to both runners", () => {
+    const supervisor = createSupervisor({
+      supervisorCwd: "/tmp/sup",
+      agentCwd: "/tmp/agent",
+      query: async function* () {},
+      output: new PassThrough(),
+    });
+    assert.deepStrictEqual(supervisor.agentRunner.systemPrompt, {
+      type: "preset",
+      preset: "claude_code",
+      append: AGENT_SYSTEM_PROMPT,
+    });
+    assert.deepStrictEqual(supervisor.supervisorRunner.systemPrompt, {
+      type: "preset",
+      preset: "claude_code",
+      append: SUPERVISOR_SYSTEM_PROMPT,
+    });
+  });
+  test("createSupervisor blocks Task and TaskOutput on supervisor by default", () => {
+    const supervisor = createSupervisor({
+      supervisorCwd: "/tmp/sup",
+      agentCwd: "/tmp/agent",
+      query: async function* () {},
+      output: new PassThrough(),
+    });
+    assert.deepStrictEqual(supervisor.supervisorRunner.disallowedTools, [
+      "Task",
+      "TaskOutput",
+    ]);
+    // Agent should not have disallowed tools
+    assert.deepStrictEqual(supervisor.agentRunner.disallowedTools, []);
+  });
+  test("createSupervisor merges custom supervisorDisallowedTools with defaults", () => {
+    const supervisor = createSupervisor({
+      supervisorCwd: "/tmp/sup",
+      agentCwd: "/tmp/agent",
+      query: async function* () {},
+      output: new PassThrough(),
+      supervisorDisallowedTools: ["WebSearch", "Task"],
+    });
+    const disallowed = supervisor.supervisorRunner.disallowedTools;
+    assert.ok(disallowed.includes("Task"));
+    assert.ok(disallowed.includes("TaskOutput"));
+    assert.ok(disallowed.includes("WebSearch"));
+    // No duplicates
+    assert.strictEqual(disallowed.length, new Set(disallowed).size);
+  });
+  test("system prompt constants are non-empty strings", () => {
+    assert.ok(typeof SUPERVISOR_SYSTEM_PROMPT === "string");
+    assert.ok(typeof AGENT_SYSTEM_PROMPT === "string");
+    assert.ok(SUPERVISOR_SYSTEM_PROMPT.length > 0);
+    assert.ok(AGENT_SYSTEM_PROMPT.length > 0);
+  });
+  test("SUPERVISOR_SYSTEM_PROMPT explains relay mechanism", () => {
+    assert.ok(SUPERVISOR_SYSTEM_PROMPT.includes("relay"));
+    assert.ok(SUPERVISOR_SYSTEM_PROMPT.includes("EVALUATION_SUCCESSFUL"));
+  });
 });

package/test/tee-writer.test.js CHANGED Viewed

@@ -187,11 +187,9 @@ describe("TeeWriter", () => {
     assert.strictEqual(fileLines.length, 3);
     assert.strictEqual(JSON.parse(fileLines[0]).source, "agent");
-    // Text should show source labels
-    assert.ok(textData.includes("[agent]"));
-    assert.ok(textData.includes("Working on it"));
-    assert.ok(textData.includes("[supervisor]"));
-    assert.ok(textData.includes("Looks good"));
+    // Text should show source prefixes on content lines
+    assert.ok(textData.includes("[agent] Working on it"));
+    assert.ok(textData.includes("[supervisor] Looks good"));
     assert.ok(textData.includes("Evaluation completed after 1 turns"));
   });
@@ -254,9 +252,9 @@ describe("TeeWriter", () => {
     await writeLines(writer, events);
     const textData = collect(textStream);
-    // [agent] label should appear only once
-    const agentLabels = textData.split("[agent]").length - 1;
-    assert.strictEqual(agentLabels, 1);
+    // [agent] prefix should appear on each content line
+    assert.ok(textData.includes("[agent] Step 1"));
+    assert.ok(textData.includes("[agent] Step 2"));
   });
   test("handles partial lines across chunks", async () => {

package/test/trace-collector.test.js CHANGED Viewed

@@ -149,6 +149,102 @@ describe("TraceCollector", () => {
       assert.strictEqual(trace.summary.tokenUsage.inputTokens, 5000);
     });
+    test("unwraps combined supervised trace format {source, turn, event}", () => {
+      const collector = new TraceCollector();
+      // System init wrapped in supervisor envelope
+      collector.addLine(
+        JSON.stringify({
+          source: "agent",
+          turn: 0,
+          event: {
+            type: "system",
+            subtype: "init",
+            session_id: "sess-supervised",
+            model: "claude-opus-4-6",
+            tools: ["Bash"],
+          },
+        }),
+      );
+      // Assistant message wrapped in supervisor envelope
+      collector.addLine(
+        JSON.stringify({
+          source: "agent",
+          turn: 1,
+          event: {
+            type: "assistant",
+            message: {
+              content: [{ type: "text", text: "I ran the tests." }],
+              usage: { input_tokens: 100, output_tokens: 50 },
+            },
+          },
+        }),
+      );
+      // Tool result wrapped in supervisor envelope
+      collector.addLine(
+        JSON.stringify({
+          source: "agent",
+          turn: 1,
+          event: {
+            type: "user",
+            message: {
+              role: "user",
+              content: [
+                {
+                  type: "tool_result",
+                  tool_use_id: "toolu_sup",
+                  content: "All tests passed",
+                },
+              ],
+            },
+          },
+        }),
+      );
+      // Result event wrapped in supervisor envelope
+      collector.addLine(
+        JSON.stringify({
+          source: "supervisor",
+          turn: 1,
+          event: {
+            type: "result",
+            subtype: "success",
+            total_cost_usd: 0.44,
+            duration_ms: 30000,
+            num_turns: 2,
+          },
+        }),
+      );
+      const trace = collector.toJSON();
+      assert.strictEqual(trace.metadata.sessionId, "sess-supervised");
+      assert.strictEqual(trace.turns.length, 2);
+      assert.strictEqual(trace.turns[0].role, "assistant");
+      assert.strictEqual(trace.turns[0].content[0].text, "I ran the tests.");
+      assert.strictEqual(trace.turns[1].role, "tool_result");
+      assert.strictEqual(trace.turns[1].content, "All tests passed");
+      assert.strictEqual(trace.summary.result, "success");
+      assert.strictEqual(trace.summary.totalCostUsd, 0.44);
+    });
+    test("skips orchestrator summary lines from supervised traces", () => {
+      const collector = new TraceCollector();
+      collector.addLine(
+        JSON.stringify({
+          source: "orchestrator",
+          type: "summary",
+          success: true,
+          turns: 3,
+        }),
+      );
+      // Orchestrator summaries have no inner event and no recognized type
+      // after unwrap — they should be silently skipped.
+      assert.strictEqual(collector.toJSON().turns.length, 0);
+    });
     test("skips rate_limit_event and unknown types", () => {
       const collector = new TraceCollector();
       collector.addLine(