npm - @forwardimpact/libeval - Versions diffs - 0.1.3 → 0.1.5 - Mend

@forwardimpact/libeval 0.1.3 → 0.1.5

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (10)

package/bin/fit-eval.js +12 -4
package/index.js +6 -1
package/package.json +4 -3
package/src/agent-runner.js +19 -1
package/src/commands/run.js +12 -4
package/src/commands/supervise.js +20 -4
package/src/supervisor.js +108 -31
package/src/tee-writer.js +6 -3
package/test/supervisor.test.js +200 -49
package/test/tee-writer.test.js +6 -8

package/bin/fit-eval.js CHANGED Viewed

@@ -25,21 +25,28 @@ Commands:
   supervise [options]            Run a supervised agent ↔ supervisor relay loop
 Run options:
-  --task=PATH          Path to task file (required)
+  --task-file=PATH     Path to task file (mutually exclusive with --task-text)
+  --task-text=STRING   Inline task text (mutually exclusive with --task-file)
   --cwd=DIR            Agent working directory (default: .)
   --model=MODEL        Claude model to use (default: opus)
   --max-turns=N        Maximum agentic turns (default: 50)
   --output=PATH        Write NDJSON trace to file (default: stdout)
   --allowed-tools=LIST Comma-separated tools (default: Bash,Read,Glob,Grep,Write,Edit)
+  --agent-profile=NAME Agent profile name (passed as --agent to Claude CLI)
 Supervise options:
-  --task=PATH               Path to task file (required)
+  --task-file=PATH          Path to task file (mutually exclusive with --task-text)
+  --task-text=STRING        Inline task text (mutually exclusive with --task-file)
   --supervisor-cwd=DIR      Supervisor working directory (default: .)
   --agent-cwd=DIR           Agent working directory (default: temp directory)
   --model=MODEL             Claude model to use (default: opus)
   --max-turns=N             Maximum supervisor ↔ agent exchanges (default: 20)
   --output=PATH             Write NDJSON trace to file (default: stdout)
   --allowed-tools=LIST      Comma-separated tools for agent (default: Bash,Read,Glob,Grep,Write,Edit)
+  --supervisor-allowed-tools=LIST
+                            Comma-separated tools for supervisor (default: Bash,Read,Glob,Grep,Write,Edit)
+  --supervisor-profile=NAME Supervisor agent profile name (passed as --agent to Claude CLI)
+  --agent-profile=NAME      Agent profile name (passed as --agent to Claude CLI)
 Options:
   --help      Show this help message
@@ -50,8 +57,9 @@ Examples:
   fit-eval output --format=json < trace.ndjson
   fit-eval tee < trace.ndjson
   fit-eval tee output.ndjson < trace.ndjson
-  fit-eval run --task=.github/tasks/security-audit.md --model=opus
-  fit-eval supervise --task=scenarios/guide-setup/task.md --supervisor-cwd=.
+  fit-eval run --task-text="Perform a security audit of the repository." --model=opus
+  fit-eval run --task-file=scenarios/guide-setup/task.md --model=opus
+  fit-eval supervise --task-file=scenarios/guide-setup/task.md --supervisor-cwd=.
 `.trim();
 async function main() {

package/index.js CHANGED Viewed

@@ -1,4 +1,9 @@
 export { TraceCollector, createTraceCollector } from "./src/trace-collector.js";
 export { AgentRunner, createAgentRunner } from "./src/agent-runner.js";
-export { Supervisor, createSupervisor } from "./src/supervisor.js";
+export {
+  Supervisor,
+  createSupervisor,
+  SUPERVISOR_SYSTEM_PROMPT,
+  AGENT_SYSTEM_PROMPT,
+} from "./src/supervisor.js";
 export { TeeWriter, createTeeWriter } from "./src/tee-writer.js";

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@forwardimpact/libeval",
-  "version": "0.1.3",
+  "version": "0.1.5",
   "description": "Process Claude Code stream-json output into structured traces",
   "license": "Apache-2.0",
   "author": "D. Olsson <hi@senzilla.io>",
@@ -10,13 +10,14 @@
     "fit-eval": "./bin/fit-eval.js"
   },
   "engines": {
-    "bun": ">=1.2.0"
+    "bun": ">=1.2.0",
+    "node": ">=18.0.0"
   },
   "scripts": {
     "test": "bun run node --test test/*.test.js"
   },
   "dependencies": {
-    "@anthropic-ai/claude-agent-sdk": "^0.1.0"
+    "@anthropic-ai/claude-agent-sdk": "^0.2.91"
   },
   "publishConfig": {
     "access": "public"

package/src/agent-runner.js CHANGED Viewed

@@ -18,6 +18,9 @@ export class AgentRunner {
    * @param {string} [deps.permissionMode] - SDK permission mode
    * @param {function} [deps.onLine] - Callback invoked with each NDJSON line as it's produced
    * @param {string[]} [deps.settingSources] - SDK setting sources (e.g. ['project'] to load CLAUDE.md)
+   * @param {string} [deps.agentProfile] - Agent profile name to pass as --agent to the Claude CLI
+   * @param {string|object} [deps.systemPrompt] - SDK system prompt (string replaces default; {type:'preset', preset:'claude_code', append} appends)
+   * @param {string[]} [deps.disallowedTools] - Tools to explicitly remove from the model's context
    */
   constructor({
     cwd,
@@ -29,6 +32,9 @@ export class AgentRunner {
     permissionMode,
     onLine,
     settingSources,
+    agentProfile,
+    systemPrompt,
+    disallowedTools,
   }) {
     if (!cwd) throw new Error("cwd is required");
     if (!query) throw new Error("query is required");
@@ -49,6 +55,9 @@ export class AgentRunner {
     this.permissionMode = permissionMode ?? "bypassPermissions";
     this.onLine = onLine ?? null;
     this.settingSources = settingSources ?? [];
+    this.agentProfile = agentProfile ?? null;
+    this.systemPrompt = systemPrompt ?? null;
+    this.disallowedTools = disallowedTools ?? [];
     this.sessionId = null;
     this.buffer = [];
   }
@@ -74,6 +83,11 @@ export class AgentRunner {
           permissionMode: this.permissionMode,
           allowDangerouslySkipPermissions: true,
           settingSources: this.settingSources,
+          ...(this.disallowedTools.length > 0 && {
+            disallowedTools: this.disallowedTools,
+          }),
+          ...(this.systemPrompt && { systemPrompt: this.systemPrompt }),
+          ...(this.agentProfile && { extraArgs: { agent: this.agentProfile } }),
         },
       })) {
         const line = JSON.stringify(message);
@@ -113,7 +127,11 @@ export class AgentRunner {
     try {
       for await (const message of this.query({
         prompt,
-        options: { resume: this.sessionId },
+        options: {
+          resume: this.sessionId,
+          permissionMode: this.permissionMode,
+          allowDangerouslySkipPermissions: true,
+        },
       })) {
         const line = JSON.stringify(message);
         this.output.write(line + "\n");

package/src/commands/run.js CHANGED Viewed

@@ -24,28 +24,35 @@ function parseFlag(args, name) {
  * Usage: fit-eval run [options]
  *
  * Options:
- *   --task=PATH          Path to task file (required)
+ *   --task-file=PATH     Path to task file (mutually exclusive with --task-text)
+ *   --task-text=STRING   Inline task text (mutually exclusive with --task-file)
  *   --cwd=DIR            Agent working directory (default: .)
  *   --model=MODEL        Claude model to use (default: opus)
  *   --max-turns=N        Maximum agentic turns (default: 50)
  *   --output=PATH        Write NDJSON trace to file (default: stdout)
  *   --allowed-tools=LIST Comma-separated tools (default: Bash,Read,Glob,Grep,Write,Edit)
+ *   --agent-profile=NAME Agent profile name (passed as --agent to Claude CLI)
  *
  * @param {string[]} args - Command arguments
  */
 export async function runRunCommand(args) {
-  const task = parseFlag(args, "task");
-  if (!task) throw new Error("--task is required");
+  const taskFile = parseFlag(args, "task-file");
+  const taskText = parseFlag(args, "task-text");
+  if (taskFile && taskText)
+    throw new Error("--task-file and --task-text are mutually exclusive");
+  if (!taskFile && !taskText)
+    throw new Error("--task-file or --task-text is required");
   const cwd = resolve(parseFlag(args, "cwd") ?? ".");
   const model = parseFlag(args, "model") ?? "opus";
   const maxTurns = parseInt(parseFlag(args, "max-turns") ?? "50", 10);
   const outputPath = parseFlag(args, "output");
+  const agentProfile = parseFlag(args, "agent-profile") ?? undefined;
   const allowedTools = (
     parseFlag(args, "allowed-tools") ?? "Bash,Read,Glob,Grep,Write,Edit"
   ).split(",");
-  const taskContent = readFileSync(task, "utf8");
+  const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
   // When --output is specified, stream text to stdout while writing NDJSON to file.
   // Otherwise, write NDJSON directly to stdout (backwards-compatible).
@@ -63,6 +70,7 @@ export async function runRunCommand(args) {
     maxTurns,
     allowedTools,
     settingSources: ["project"],
+    agentProfile,
   });
   const result = await runner.run(taskContent);

package/src/commands/supervise.js CHANGED Viewed

@@ -25,19 +25,26 @@ function parseFlag(args, name) {
  * Usage: fit-eval supervise [options]
  *
  * Options:
- *   --task=PATH               Path to task file (required)
+ *   --task-file=PATH          Path to task file (mutually exclusive with --task-text)
+ *   --task-text=STRING        Inline task text (mutually exclusive with --task-file)
  *   --supervisor-cwd=DIR      Supervisor working directory (default: .)
  *   --agent-cwd=DIR           Agent working directory (default: temp directory)
  *   --model=MODEL             Claude model to use (default: opus)
  *   --max-turns=N             Maximum supervisor ↔ agent exchanges (default: 20)
  *   --output=PATH             Write NDJSON trace to file (default: stdout)
  *   --allowed-tools=LIST      Comma-separated tools for the agent (default: Bash,Read,Glob,Grep,Write,Edit)
+ *   --supervisor-profile=NAME Supervisor agent profile name (passed as --agent to Claude CLI)
+ *   --agent-profile=NAME      Agent profile name (passed as --agent to Claude CLI)
  *
  * @param {string[]} args - Command arguments
  */
 export async function runSuperviseCommand(args) {
-  const task = parseFlag(args, "task");
-  if (!task) throw new Error("--task is required");
+  const taskFile = parseFlag(args, "task-file");
+  const taskText = parseFlag(args, "task-text");
+  if (taskFile && taskText)
+    throw new Error("--task-file and --task-text are mutually exclusive");
+  if (!taskFile && !taskText)
+    throw new Error("--task-file or --task-text is required");
   const supervisorCwd = resolve(parseFlag(args, "supervisor-cwd") ?? ".");
   const agentCwd = resolve(
@@ -47,11 +54,17 @@ export async function runSuperviseCommand(args) {
   const model = parseFlag(args, "model") ?? "opus";
   const maxTurns = parseInt(parseFlag(args, "max-turns") ?? "20", 10);
   const outputPath = parseFlag(args, "output");
+  const supervisorProfile = parseFlag(args, "supervisor-profile") ?? undefined;
+  const agentProfile = parseFlag(args, "agent-profile") ?? undefined;
   const allowedTools = (
     parseFlag(args, "allowed-tools") ?? "Bash,Read,Glob,Grep,Write,Edit"
   ).split(",");
+  const supervisorAllowedToolsRaw = parseFlag(args, "supervisor-allowed-tools");
+  const supervisorAllowedTools = supervisorAllowedToolsRaw
+    ? supervisorAllowedToolsRaw.split(",")
+    : undefined;
-  const taskContent = readFileSync(task, "utf8");
+  const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
   // When --output is specified, stream text to stdout while writing NDJSON to file.
   // Otherwise, write NDJSON directly to stdout (backwards-compatible).
@@ -73,6 +86,9 @@ export async function runSuperviseCommand(args) {
     model,
     maxTurns,
     allowedTools,
+    supervisorAllowedTools,
+    supervisorProfile,
+    agentProfile,
   });
   const result = await supervisor.run(taskContent);

package/src/supervisor.js CHANGED Viewed

@@ -1,25 +1,38 @@
 /**
  * Supervisor — orchestrates a relay loop between an agent and a supervisor,
- * both running as AgentRunner instances. The agent works on a task while the
- * supervisor observes and decides when the evaluation is complete.
+ * both running as AgentRunner instances. The supervisor receives the task first,
+ * introduces itself, and delegates work to the agent. The loop then alternates:
+ * agent → supervisor → agent.
  *
  * Follows OO+DI: constructor injection, factory function, tests bypass factory.
  */
 import { PassThrough } from "node:stream";
 import { createAgentRunner } from "./agent-runner.js";
+import { TraceCollector } from "./trace-collector.js";
 /**
- * Check if the supervisor's response signals evaluation completion.
- * Uses a structured signal — `EVALUATION_COMPLETE` on its own line —
- * to avoid false positives from natural language.
+ * Check if the supervisor's response signals evaluation success.
+ * Matches EVALUATION_SUCCESSFUL anywhere in the text, tolerating markdown
+ * formatting (e.g. **EVALUATION_SUCCESSFUL**). Uses word boundaries to
+ * avoid matching inside longer identifiers.
  * @param {string} text
  * @returns {boolean}
  */
-export function isDone(text) {
-  return /^EVALUATION_COMPLETE$/m.test(text);
+export function isSuccessful(text) {
+  return /(?:^|[\s*_~`])EVALUATION_SUCCESSFUL(?:[\s*_~`.,!?]|$)/m.test(text);
 }
+/** System prompt appended for the supervisor runner in supervise mode. */
+export const SUPERVISOR_SYSTEM_PROMPT =
+  "You supervise another AI agent through a relay — your output becomes the agent's next input. " +
+  "Guide the agent, answer its questions, and write EVALUATION_SUCCESSFUL when their task is complete.";
+/** System prompt appended for the agent runner in supervise mode. */
+export const AGENT_SYSTEM_PROMPT =
+  "You are being supervised by another AI agent. " +
+  "When requirements are ambiguous or you are uncertain, stop and ask a clarifying question before proceeding.";
 export class Supervisor {
   /**
    * @param {object} deps
@@ -44,60 +57,88 @@ export class Supervisor {
   /**
    * Run the supervisor ↔ agent relay loop.
-   * @param {string} task - The initial task for the agent
+   * The supervisor receives the task first, introduces itself, and delegates
+   * work to the agent. The loop then alternates: agent → supervisor → agent.
+   * @param {string} task - The initial task for the supervisor
    * @returns {Promise<{success: boolean, turns: number}>}
    */
   async run(task) {
-    // Turn 0: Agent receives the task and starts working
-    this.currentSource = "agent";
+    // Turn 0: Supervisor receives the task and introduces it to the agent
+    this.currentSource = "supervisor";
     this.currentTurn = 0;
-    let agentResult = await this.agentRunner.run(task);
+    let supervisorResult = await this.supervisorRunner.run(task);
-    if (agentResult.error) {
+    if (supervisorResult.error) {
       this.emitSummary({ success: false, turns: 0 });
       return { success: false, turns: 0 };
     }
-    for (let turn = 1; turn <= this.maxTurns; turn++) {
-      // Supervisor observes the agent's output
-      const supervisorPrompt =
-        `The agent reported:\n\n${agentResult.text}\n\n` +
-        `Decide: provide guidance, answer a question, or say EVALUATION_COMPLETE on its own line.`;
+    // The supervisor's turn is fully complete (all tool calls executed) by the
+    // time we check the signal — no work is interrupted.
+    if (isSuccessful(supervisorResult.text)) {
+      this.emitSummary({ success: true, turns: 0 });
+      return { success: true, turns: 0 };
+    }
-      this.currentSource = "supervisor";
+    for (let turn = 1; turn <= this.maxTurns; turn++) {
+      // Supervisor's output becomes the agent's input
+      this.currentSource = "agent";
       this.currentTurn = turn;
-      let supervisorResult;
+      let agentResult;
       if (turn === 1) {
-        supervisorResult = await this.supervisorRunner.run(supervisorPrompt);
+        agentResult = await this.agentRunner.run(supervisorResult.text);
       } else {
-        supervisorResult = await this.supervisorRunner.resume(supervisorPrompt);
+        agentResult = await this.agentRunner.resume(supervisorResult.text);
       }
-      if (supervisorResult.error) {
+      if (agentResult.error) {
         this.emitSummary({ success: false, turns: turn });
         return { success: false, turns: turn };
       }
-      if (isDone(supervisorResult.text)) {
-        this.emitSummary({ success: true, turns: turn });
-        return { success: true, turns: turn };
-      }
+      // Build the full agent transcript from buffered NDJSON events so the
+      // supervisor sees tool calls and reasoning, not just the SDK result summary.
+      const agentTranscript = this.extractTranscript(this.agentRunner);
-      // Supervisor's response becomes the agent's next input
-      this.currentSource = "agent";
+      const supervisorPrompt =
+        `The agent reported:\n\n${agentTranscript}\n\n` +
+        `Review the agent's work and decide how to proceed.`;
+      this.currentSource = "supervisor";
       this.currentTurn = turn;
-      agentResult = await this.agentRunner.resume(supervisorResult.text);
+      supervisorResult = await this.supervisorRunner.resume(supervisorPrompt);
-      if (agentResult.error) {
+      if (supervisorResult.error) {
         this.emitSummary({ success: false, turns: turn });
         return { success: false, turns: turn };
       }
+      // The supervisor's turn is fully complete — check for success signal.
+      if (isSuccessful(supervisorResult.text)) {
+        this.emitSummary({ success: true, turns: turn });
+        return { success: true, turns: turn };
+      }
     }
     this.emitSummary({ success: false, turns: this.maxTurns });
     return { success: false, turns: this.maxTurns };
   }
+  /**
+   * Extract a human-readable transcript from an AgentRunner's buffered output.
+   * Drains the buffer and replays events through a TraceCollector.
+   * @param {import("./agent-runner.js").AgentRunner} runner
+   * @returns {string}
+   */
+  extractTranscript(runner) {
+    const lines = runner.drainOutput();
+    const collector = new TraceCollector();
+    for (const line of lines) {
+      collector.addLine(line);
+    }
+    return collector.toText() || "[The agent produced no output.]";
+  }
   /**
    * Emit a single NDJSON line tagged with the current source and turn.
    * Called in real-time via the AgentRunner onLine callback.
@@ -138,6 +179,10 @@ export class Supervisor {
  * @param {string} [deps.model] - Claude model identifier
  * @param {number} [deps.maxTurns] - Maximum supervisor ↔ agent exchanges
  * @param {string[]} [deps.allowedTools] - Tools the agent may use
+ * @param {string[]} [deps.supervisorAllowedTools] - Tools the supervisor may use (default: Bash, Read, Glob, Grep, Write, Edit)
+ * @param {string[]} [deps.supervisorDisallowedTools] - Tools to explicitly block from the supervisor
+ * @param {string} [deps.supervisorProfile] - Supervisor agent profile name
+ * @param {string} [deps.agentProfile] - Agent profile name
  * @returns {Supervisor}
  */
 export function createSupervisor({
@@ -148,6 +193,10 @@ export function createSupervisor({
   model,
   maxTurns,
   allowedTools,
+  supervisorDisallowedTools,
+  supervisorAllowedTools,
+  supervisorProfile,
+  agentProfile,
 }) {
   // Forward-reference: onLine captures `supervisor` before construction completes.
   // This is safe because onLine is only called during run(), after construction.
@@ -163,17 +212,45 @@ export function createSupervisor({
     allowedTools,
     onLine,
     settingSources: ["project"],
+    agentProfile,
+    systemPrompt: {
+      type: "preset",
+      preset: "claude_code",
+      append: AGENT_SYSTEM_PROMPT,
+    },
   });
+  // Block Task/TaskOutput so the supervisor cannot spawn its own sub-agents.
+  // The relay loop handles agent communication — letting the supervisor use
+  // Task would bypass the relay and produce an empty agent trace.
+  const defaultDisallowed = ["Task", "TaskOutput"];
+  const disallowedTools = supervisorDisallowedTools
+    ? [...new Set([...defaultDisallowed, ...supervisorDisallowedTools])]
+    : defaultDisallowed;
   const supervisorRunner = createAgentRunner({
     cwd: supervisorCwd,
     query,
     output: new PassThrough(),
     model,
     maxTurns: 10,
-    allowedTools: ["Read", "Glob", "Grep"],
+    allowedTools: supervisorAllowedTools ?? [
+      "Bash",
+      "Read",
+      "Glob",
+      "Grep",
+      "Write",
+      "Edit",
+    ],
+    disallowedTools,
     onLine,
     settingSources: ["project"],
+    agentProfile: supervisorProfile,
+    systemPrompt: {
+      type: "preset",
+      preset: "claude_code",
+      append: SUPERVISOR_SYSTEM_PROMPT,
+    },
   });
   supervisor = new Supervisor({

package/src/tee-writer.js CHANGED Viewed

@@ -107,7 +107,6 @@ export class TeeWriter extends Writable {
     if (parsed.event) {
       if (parsed.source && parsed.source !== this.lastSource) {
         this.lastSource = parsed.source;
-        this.textStream.write(`\n[${parsed.source}]\n`);
       }
       this.collector.addLine(JSON.stringify(parsed.event));
       this.flushTurns();
@@ -119,15 +118,19 @@ export class TeeWriter extends Writable {
    */
   flushTurns() {
     const turns = this.collector.turns;
+    const prefix =
+      this.mode === "supervised" && this.lastSource
+        ? `[${this.lastSource}] `
+        : "";
     while (this.turnsEmitted < turns.length) {
       const turn = turns[this.turnsEmitted++];
       if (turn.role === "assistant") {
         for (const block of turn.content) {
           if (block.type === "text") {
-            this.textStream.write(block.text + "\n");
+            this.textStream.write(`${prefix}${block.text}\n`);
           } else if (block.type === "tool_use") {
             const input = summarizeInput(block.input);
-            this.textStream.write(`> Tool: ${block.name} ${input}\n`);
+            this.textStream.write(`${prefix}> Tool: ${block.name} ${input}\n`);
           }
         }
       }

package/test/supervisor.test.js CHANGED Viewed

@@ -6,8 +6,10 @@ import {
   AgentRunner,
   Supervisor,
   createSupervisor,
+  SUPERVISOR_SYSTEM_PROMPT,
+  AGENT_SYSTEM_PROMPT,
 } from "@forwardimpact/libeval";
-import { isDone } from "../src/supervisor.js";
+import { isSuccessful } from "../src/supervisor.js";
 /**
  * Create a mock AgentRunner that yields pre-scripted responses.
@@ -61,26 +63,50 @@ function createMockRunner(responses, messages) {
   return runner;
 }
-describe("isDone", () => {
-  test("detects EVALUATION_COMPLETE on its own line", () => {
-    assert.strictEqual(isDone("EVALUATION_COMPLETE"), true);
+describe("isSuccessful", () => {
+  test("detects EVALUATION_SUCCESSFUL on its own line", () => {
+    assert.strictEqual(isSuccessful("EVALUATION_SUCCESSFUL"), true);
     assert.strictEqual(
-      isDone("Some text\nEVALUATION_COMPLETE\nMore text"),
+      isSuccessful("Some text\nEVALUATION_SUCCESSFUL\nMore text"),
       true,
     );
-    assert.strictEqual(isDone("Done.\n\nEVALUATION_COMPLETE"), true);
+    assert.strictEqual(isSuccessful("Done.\n\nEVALUATION_SUCCESSFUL"), true);
   });
-  test("does not match EVALUATION_COMPLETE embedded in text", () => {
-    assert.strictEqual(isDone("not EVALUATION_COMPLETE yet"), false);
-    assert.strictEqual(isDone("The agent is EVALUATION_COMPLETE done"), false);
-    assert.strictEqual(isDone("EVALUATION_COMPLETE_EXTRA"), false);
+  test("tolerates markdown formatting around the signal", () => {
+    assert.strictEqual(isSuccessful("**EVALUATION_SUCCESSFUL**"), true);
+    assert.strictEqual(isSuccessful("*EVALUATION_SUCCESSFUL*"), true);
+    assert.strictEqual(isSuccessful("__EVALUATION_SUCCESSFUL__"), true);
+    assert.strictEqual(isSuccessful("_EVALUATION_SUCCESSFUL_"), true);
+    assert.strictEqual(isSuccessful("`EVALUATION_SUCCESSFUL`"), true);
+    assert.strictEqual(
+      isSuccessful(
+        "Good work.\n\n**EVALUATION_SUCCESSFUL**\n\nNow filing issues.",
+      ),
+      true,
+    );
+  });
+  test("matches EVALUATION_SUCCESSFUL anywhere in text", () => {
+    assert.strictEqual(isSuccessful("not EVALUATION_SUCCESSFUL yet"), true);
+    assert.strictEqual(
+      isSuccessful("The agent is EVALUATION_SUCCESSFUL done"),
+      true,
+    );
+    assert.strictEqual(
+      isSuccessful("Great work! EVALUATION_SUCCESSFUL. Now filing issues."),
+      true,
+    );
   });
   test("does not match empty or unrelated text", () => {
-    assert.strictEqual(isDone(""), false);
-    assert.strictEqual(isDone("All done!"), false);
-    assert.strictEqual(isDone("DONE"), false);
+    assert.strictEqual(isSuccessful(""), false);
+    assert.strictEqual(isSuccessful("All done!"), false);
+    assert.strictEqual(isSuccessful("DONE"), false);
+  });
+  test("does not match old EVALUATION_COMPLETE signal", () => {
+    assert.strictEqual(isSuccessful("EVALUATION_COMPLETE"), false);
   });
 });
@@ -118,13 +144,35 @@ describe("Supervisor", () => {
     );
   });
-  test("completes on EVALUATION_COMPLETE from supervisor", async () => {
+  test("completes on EVALUATION_SUCCESSFUL from supervisor at turn 0", async () => {
+    const agentRunner = createMockRunner([]);
+    const supervisorRunner = createMockRunner([
+      { text: "EVALUATION_SUCCESSFUL" },
+    ]);
+    const output = new PassThrough();
+    const supervisor = new Supervisor({
+      agentRunner,
+      supervisorRunner,
+      output,
+      maxTurns: 10,
+    });
+    const result = await supervisor.run("Install stuff");
+    assert.strictEqual(result.success, true);
+    assert.strictEqual(result.turns, 0);
+  });
+  test("completes after one agent turn", async () => {
     const agentRunner = createMockRunner([
       { text: "I installed the packages." },
     ]);
     const supervisorRunner = createMockRunner([
-      { text: "Good work.\n\nEVALUATION_COMPLETE" },
+      { text: "Welcome! Please install the packages." },
+      { text: "Good work.\n\nEVALUATION_SUCCESSFUL" },
     ]);
     const output = new PassThrough();
@@ -149,9 +197,10 @@ describe("Supervisor", () => {
     ]);
     const supervisorRunner = createMockRunner([
+      { text: "Here is your task. Do the work." },
       { text: "Keep going, you need to do more." },
       { text: "Almost there, continue." },
-      { text: "EVALUATION_COMPLETE" },
+      { text: "EVALUATION_SUCCESSFUL" },
     ]);
     const output = new PassThrough();
@@ -169,14 +218,14 @@ describe("Supervisor", () => {
   });
   test("enforces maxTurns limit", async () => {
-    // Agent responds to every turn, supervisor never says done
+    // Supervisor starts, agent responds each turn, supervisor never says done
     const agentRunner = createMockRunner([
-      { text: "Turn 0" },
       { text: "Turn 1" },
       { text: "Turn 2" },
     ]);
     const supervisorRunner = createMockRunner([
+      { text: "Start working." },
       { text: "Continue." },
       { text: "Continue." },
     ]);
@@ -196,16 +245,17 @@ describe("Supervisor", () => {
   });
   test("output contains tagged lines with correct source and turn", async () => {
-    const agentMessages = [[{ type: "assistant", content: "Working" }]];
     const supervisorMessages = [
-      [{ type: "assistant", content: "EVALUATION_COMPLETE" }],
+      [{ type: "assistant", content: "Go ahead" }],
+      [{ type: "assistant", content: "EVALUATION_SUCCESSFUL" }],
     ];
+    const agentMessages = [[{ type: "assistant", content: "Working" }]];
-    const agentRunner = createMockRunner([{ text: "Working" }], agentMessages);
     const supervisorRunner = createMockRunner(
-      [{ text: "EVALUATION_COMPLETE" }],
+      [{ text: "Go ahead" }, { text: "EVALUATION_SUCCESSFUL" }],
       supervisorMessages,
     );
+    const agentRunner = createMockRunner([{ text: "Working" }], agentMessages);
     const output = new PassThrough();
     const supervisor = new Supervisor({
@@ -225,19 +275,19 @@ describe("Supervisor", () => {
       .split("\n")
       .filter((l) => l.length > 0);
-    // Should have: agent turn 0, supervisor turn 1, orchestrator summary
-    assert.ok(lines.length >= 3);
-    const agentLine = JSON.parse(lines[0]);
-    assert.strictEqual(agentLine.source, "agent");
-    assert.strictEqual(agentLine.turn, 0);
-    assert.ok("event" in agentLine);
+    // Should have: supervisor turn 0, agent turn 1, supervisor turn 1, orchestrator summary
+    assert.ok(lines.length >= 4);
-    const supervisorLine = JSON.parse(lines[1]);
+    const supervisorLine = JSON.parse(lines[0]);
     assert.strictEqual(supervisorLine.source, "supervisor");
-    assert.strictEqual(supervisorLine.turn, 1);
+    assert.strictEqual(supervisorLine.turn, 0);
     assert.ok("event" in supervisorLine);
+    const agentLine = JSON.parse(lines[1]);
+    assert.strictEqual(agentLine.source, "agent");
+    assert.strictEqual(agentLine.turn, 1);
+    assert.ok("event" in agentLine);
     const summaryLine = JSON.parse(lines[lines.length - 1]);
     assert.strictEqual(summaryLine.source, "orchestrator");
     assert.strictEqual(summaryLine.type, "summary");
@@ -250,11 +300,14 @@ describe("Supervisor", () => {
       source: "sdk-internal",
       content: "test",
     };
-    const agentRunner = createMockRunner([{ text: "Done" }], [[sourceEvent]]);
     const supervisorRunner = createMockRunner(
-      [{ text: "EVALUATION_COMPLETE" }],
-      [[{ type: "assistant", content: "ok" }]],
+      [{ text: "Go" }, { text: "EVALUATION_SUCCESSFUL" }],
+      [
+        [{ type: "assistant", content: "Go" }],
+        [{ type: "assistant", content: "ok" }],
+      ],
     );
+    const agentRunner = createMockRunner([{ text: "Done" }], [[sourceEvent]]);
     const output = new PassThrough();
     const supervisor = new Supervisor({
@@ -274,27 +327,30 @@ describe("Supervisor", () => {
       .split("\n")
       .filter((l) => l.length > 0);
-    const tagged = JSON.parse(lines[0]);
+    // First line is supervisor turn 0, second is agent turn 1
+    const tagged = JSON.parse(lines[1]);
     // The original event's `source` field is preserved inside `event`
     assert.strictEqual(tagged.source, "agent");
     assert.strictEqual(tagged.event.source, "sdk-internal");
   });
-  test("emits agent output and summary when agent errors on turn 0", async () => {
-    const agentMessages = [[{ type: "assistant", content: "Partial work" }]];
-    const agentRunner = createMockRunner(
-      [{ text: "Partial work", success: false }],
-      agentMessages,
+  test("emits supervisor output and summary when supervisor errors on turn 0", async () => {
+    const supervisorMessages = [
+      [{ type: "assistant", content: "Starting..." }],
+    ];
+    const supervisorRunner = createMockRunner(
+      [{ text: "Starting...", success: false }],
+      supervisorMessages,
     );
     // Override run to simulate an error return
-    const origRun = agentRunner.run;
-    agentRunner.run = async (task) => {
-      const result = await origRun.call(agentRunner, task);
+    const origRun = supervisorRunner.run;
+    supervisorRunner.run = async (task) => {
+      const result = await origRun.call(supervisorRunner, task);
       return { ...result, error: new Error("Process exited with code 1") };
     };
-    const supervisorRunner = createMockRunner([]);
+    const agentRunner = createMockRunner([]);
     const output = new PassThrough();
     const supervisor = new Supervisor({
@@ -311,18 +367,18 @@ describe("Supervisor", () => {
     assert.strictEqual(result.success, false);
     assert.strictEqual(result.turns, 0);
-    // Output should still contain the agent's buffered lines + summary
+    // Output should still contain the supervisor's buffered lines + summary
     const data = output.read()?.toString() ?? "";
     const lines = data
       .trim()
       .split("\n")
       .filter((l) => l.length > 0);
-    assert.ok(lines.length >= 2, "Expected at least agent line + summary");
+    assert.ok(lines.length >= 2, "Expected at least supervisor line + summary");
-    const agentLine = JSON.parse(lines[0]);
-    assert.strictEqual(agentLine.source, "agent");
-    assert.strictEqual(agentLine.turn, 0);
+    const supervisorLine = JSON.parse(lines[0]);
+    assert.strictEqual(supervisorLine.source, "supervisor");
+    assert.strictEqual(supervisorLine.turn, 0);
     const summaryLine = JSON.parse(lines[lines.length - 1]);
     assert.strictEqual(summaryLine.source, "orchestrator");
@@ -339,4 +395,99 @@ describe("Supervisor", () => {
     });
     assert.ok(supervisor instanceof Supervisor);
   });
+  test("createSupervisor uses default supervisor tools when none specified", () => { // no supervisorAllowedTools option is passed below
+    const supervisor = createSupervisor({
+      supervisorCwd: "/tmp/sup",
+      agentCwd: "/tmp/agent",
+      query: async function* () {}, // stub async generator that yields nothing
+      output: new PassThrough(),
+    });
+    assert.deepStrictEqual(supervisor.supervisorRunner.allowedTools, [ // expected default list; the comparison is order-sensitive
+      "Bash",
+      "Read",
+      "Glob",
+      "Grep",
+      "Write",
+      "Edit",
+    ]);
+  });
+  test("createSupervisor passes custom supervisor tools", () => { // supervisorAllowedTools should reach the supervisor runner unchanged
+    const supervisor = createSupervisor({
+      supervisorCwd: "/tmp/sup",
+      agentCwd: "/tmp/agent",
+      query: async function* () {}, // stub async generator that yields nothing
+      output: new PassThrough(),
+      supervisorAllowedTools: ["Read", "Glob", "Grep"], // explicit tool list supplied by the caller
+    });
+    assert.deepStrictEqual(supervisor.supervisorRunner.allowedTools, [ // exactly the list passed in, same order
+      "Read",
+      "Glob",
+      "Grep",
+    ]);
+  });
+  test("createSupervisor wires system prompts to both runners", () => { // each runner gets its own prompt constant
+    const supervisor = createSupervisor({
+      supervisorCwd: "/tmp/sup",
+      agentCwd: "/tmp/agent",
+      query: async function* () {}, // stub async generator that yields nothing
+      output: new PassThrough(),
+    });
+    assert.deepStrictEqual(supervisor.agentRunner.systemPrompt, { // agent runner: expects AGENT_SYSTEM_PROMPT in the `append` field
+      type: "preset",
+      preset: "claude_code",
+      append: AGENT_SYSTEM_PROMPT,
+    });
+    assert.deepStrictEqual(supervisor.supervisorRunner.systemPrompt, { // supervisor runner: same shape, but with SUPERVISOR_SYSTEM_PROMPT
+      type: "preset",
+      preset: "claude_code",
+      append: SUPERVISOR_SYSTEM_PROMPT,
+    });
+  });
+  test("createSupervisor blocks Task and TaskOutput on supervisor by default", () => { // no supervisorDisallowedTools option is passed below
+    const supervisor = createSupervisor({
+      supervisorCwd: "/tmp/sup",
+      agentCwd: "/tmp/agent",
+      query: async function* () {}, // stub async generator that yields nothing
+      output: new PassThrough(),
+    });
+    assert.deepStrictEqual(supervisor.supervisorRunner.disallowedTools, [ // expected default block list for the supervisor runner
+      "Task",
+      "TaskOutput",
+    ]);
+    // Agent should not have disallowed tools (expects an empty array, not undefined)
+    assert.deepStrictEqual(supervisor.agentRunner.disallowedTools, []);
+  });
+  test("createSupervisor merges custom supervisorDisallowedTools with defaults", () => { // custom entries are added to the defaults, not substituted for them
+    const supervisor = createSupervisor({
+      supervisorCwd: "/tmp/sup",
+      agentCwd: "/tmp/agent",
+      query: async function* () {}, // stub async generator that yields nothing
+      output: new PassThrough(),
+      supervisorDisallowedTools: ["WebSearch", "Task"], // "Task" is also in the default block list asserted by the preceding test
+    });
+    const disallowed = supervisor.supervisorRunner.disallowedTools;
+    assert.ok(disallowed.includes("Task"));
+    assert.ok(disallowed.includes("TaskOutput")); // a default that was not passed explicitly must still be present
+    assert.ok(disallowed.includes("WebSearch")); // the custom-only entry must be present too
+    // No duplicates: the array length must equal the size of a Set built from it
+    assert.strictEqual(disallowed.length, new Set(disallowed).size);
+  });
+  test("system prompt constants are non-empty strings", () => { // fails if either constant is not a string or is the empty string
+    assert.ok(typeof SUPERVISOR_SYSTEM_PROMPT === "string");
+    assert.ok(typeof AGENT_SYSTEM_PROMPT === "string");
+    assert.ok(SUPERVISOR_SYSTEM_PROMPT.length > 0); // length check only; whitespace-only content would still pass
+    assert.ok(AGENT_SYSTEM_PROMPT.length > 0);
+  });
+  test("SUPERVISOR_SYSTEM_PROMPT explains relay mechanism", () => { // checks only that two literal substrings appear (case-sensitive)
+    assert.ok(SUPERVISOR_SYSTEM_PROMPT.includes("relay"));
+    assert.ok(SUPERVISOR_SYSTEM_PROMPT.includes("EVALUATION_SUCCESSFUL")); // same token the mocked supervisor replies with at L685
+  });
 });

package/test/tee-writer.test.js CHANGED Viewed

@@ -187,11 +187,9 @@ describe("TeeWriter", () => {
     assert.strictEqual(fileLines.length, 3);
     assert.strictEqual(JSON.parse(fileLines[0]).source, "agent");
-    // Text should show source labels
-    assert.ok(textData.includes("[agent]"));
-    assert.ok(textData.includes("Working on it"));
-    assert.ok(textData.includes("[supervisor]"));
-    assert.ok(textData.includes("Looks good"));
+    // Text should show source prefixes on content lines
+    assert.ok(textData.includes("[agent] Working on it"));
+    assert.ok(textData.includes("[supervisor] Looks good"));
     assert.ok(textData.includes("Evaluation completed after 1 turns"));
   });
@@ -254,9 +252,9 @@ describe("TeeWriter", () => {
     await writeLines(writer, events);
     const textData = collect(textStream);
-    // [agent] label should appear only once
-    const agentLabels = textData.split("[agent]").length - 1;
-    assert.strictEqual(agentLabels, 1);
+    // [agent] prefix should appear on each content line
+    assert.ok(textData.includes("[agent] Step 1"));
+    assert.ok(textData.includes("[agent] Step 2"));
   });
   test("handles partial lines across chunks", async () => {