@forwardimpact/libeval 0.1.3 → 0.1.5

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
package/bin/fit-eval.js CHANGED
@@ -25,21 +25,28 @@ Commands:
25
25
  supervise [options] Run a supervised agent ↔ supervisor relay loop
26
26
 
27
27
  Run options:
28
- --task=PATH Path to task file (required)
28
+ --task-file=PATH Path to task file (mutually exclusive with --task-text)
29
+ --task-text=STRING Inline task text (mutually exclusive with --task-file)
29
30
  --cwd=DIR Agent working directory (default: .)
30
31
  --model=MODEL Claude model to use (default: opus)
31
32
  --max-turns=N Maximum agentic turns (default: 50)
32
33
  --output=PATH Write NDJSON trace to file (default: stdout)
33
34
  --allowed-tools=LIST Comma-separated tools (default: Bash,Read,Glob,Grep,Write,Edit)
35
+ --agent-profile=NAME Agent profile name (passed as --agent to Claude CLI)
34
36
 
35
37
  Supervise options:
36
- --task=PATH Path to task file (required)
38
+ --task-file=PATH Path to task file (mutually exclusive with --task-text)
39
+ --task-text=STRING Inline task text (mutually exclusive with --task-file)
37
40
  --supervisor-cwd=DIR Supervisor working directory (default: .)
38
41
  --agent-cwd=DIR Agent working directory (default: temp directory)
39
42
  --model=MODEL Claude model to use (default: opus)
40
43
  --max-turns=N Maximum supervisor ↔ agent exchanges (default: 20)
41
44
  --output=PATH Write NDJSON trace to file (default: stdout)
42
45
  --allowed-tools=LIST Comma-separated tools for agent (default: Bash,Read,Glob,Grep,Write,Edit)
46
+ --supervisor-allowed-tools=LIST
47
+ Comma-separated tools for supervisor (default: Bash,Read,Glob,Grep,Write,Edit)
48
+ --supervisor-profile=NAME Supervisor agent profile name (passed as --agent to Claude CLI)
49
+ --agent-profile=NAME Agent profile name (passed as --agent to Claude CLI)
43
50
 
44
51
  Options:
45
52
  --help Show this help message
@@ -50,8 +57,9 @@ Examples:
50
57
  fit-eval output --format=json < trace.ndjson
51
58
  fit-eval tee < trace.ndjson
52
59
  fit-eval tee output.ndjson < trace.ndjson
53
- fit-eval run --task=.github/tasks/security-audit.md --model=opus
54
- fit-eval supervise --task=scenarios/guide-setup/task.md --supervisor-cwd=.
60
+ fit-eval run --task-text="Perform a security audit of the repository." --model=opus
61
+ fit-eval run --task-file=scenarios/guide-setup/task.md --model=opus
62
+ fit-eval supervise --task-file=scenarios/guide-setup/task.md --supervisor-cwd=.
55
63
  `.trim();
56
64
 
57
65
  async function main() {
package/index.js CHANGED
@@ -1,4 +1,9 @@
1
1
  export { TraceCollector, createTraceCollector } from "./src/trace-collector.js";
2
2
  export { AgentRunner, createAgentRunner } from "./src/agent-runner.js";
3
- export { Supervisor, createSupervisor } from "./src/supervisor.js";
3
+ export {
4
+ Supervisor,
5
+ createSupervisor,
6
+ SUPERVISOR_SYSTEM_PROMPT,
7
+ AGENT_SYSTEM_PROMPT,
8
+ } from "./src/supervisor.js";
4
9
  export { TeeWriter, createTeeWriter } from "./src/tee-writer.js";
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@forwardimpact/libeval",
3
- "version": "0.1.3",
3
+ "version": "0.1.5",
4
4
  "description": "Process Claude Code stream-json output into structured traces",
5
5
  "license": "Apache-2.0",
6
6
  "author": "D. Olsson <hi@senzilla.io>",
@@ -10,13 +10,14 @@
10
10
  "fit-eval": "./bin/fit-eval.js"
11
11
  },
12
12
  "engines": {
13
- "bun": ">=1.2.0"
13
+ "bun": ">=1.2.0",
14
+ "node": ">=18.0.0"
14
15
  },
15
16
  "scripts": {
16
17
  "test": "bun run node --test test/*.test.js"
17
18
  },
18
19
  "dependencies": {
19
- "@anthropic-ai/claude-agent-sdk": "^0.1.0"
20
+ "@anthropic-ai/claude-agent-sdk": "^0.2.91"
20
21
  },
21
22
  "publishConfig": {
22
23
  "access": "public"
@@ -18,6 +18,9 @@ export class AgentRunner {
18
18
  * @param {string} [deps.permissionMode] - SDK permission mode
19
19
  * @param {function} [deps.onLine] - Callback invoked with each NDJSON line as it's produced
20
20
  * @param {string[]} [deps.settingSources] - SDK setting sources (e.g. ['project'] to load CLAUDE.md)
21
+ * @param {string} [deps.agentProfile] - Agent profile name to pass as --agent to the Claude CLI
22
+ * @param {string|object} [deps.systemPrompt] - SDK system prompt (string replaces default; {type:'preset', preset:'claude_code', append} appends)
23
+ * @param {string[]} [deps.disallowedTools] - Tools to explicitly remove from the model's context
21
24
  */
22
25
  constructor({
23
26
  cwd,
@@ -29,6 +32,9 @@ export class AgentRunner {
29
32
  permissionMode,
30
33
  onLine,
31
34
  settingSources,
35
+ agentProfile,
36
+ systemPrompt,
37
+ disallowedTools,
32
38
  }) {
33
39
  if (!cwd) throw new Error("cwd is required");
34
40
  if (!query) throw new Error("query is required");
@@ -49,6 +55,9 @@ export class AgentRunner {
49
55
  this.permissionMode = permissionMode ?? "bypassPermissions";
50
56
  this.onLine = onLine ?? null;
51
57
  this.settingSources = settingSources ?? [];
58
+ this.agentProfile = agentProfile ?? null;
59
+ this.systemPrompt = systemPrompt ?? null;
60
+ this.disallowedTools = disallowedTools ?? [];
52
61
  this.sessionId = null;
53
62
  this.buffer = [];
54
63
  }
@@ -74,6 +83,11 @@ export class AgentRunner {
74
83
  permissionMode: this.permissionMode,
75
84
  allowDangerouslySkipPermissions: true,
76
85
  settingSources: this.settingSources,
86
+ ...(this.disallowedTools.length > 0 && {
87
+ disallowedTools: this.disallowedTools,
88
+ }),
89
+ ...(this.systemPrompt && { systemPrompt: this.systemPrompt }),
90
+ ...(this.agentProfile && { extraArgs: { agent: this.agentProfile } }),
77
91
  },
78
92
  })) {
79
93
  const line = JSON.stringify(message);
@@ -113,7 +127,11 @@ export class AgentRunner {
113
127
  try {
114
128
  for await (const message of this.query({
115
129
  prompt,
116
- options: { resume: this.sessionId },
130
+ options: {
131
+ resume: this.sessionId,
132
+ permissionMode: this.permissionMode,
133
+ allowDangerouslySkipPermissions: true,
134
+ },
117
135
  })) {
118
136
  const line = JSON.stringify(message);
119
137
  this.output.write(line + "\n");
@@ -24,28 +24,35 @@ function parseFlag(args, name) {
24
24
  * Usage: fit-eval run [options]
25
25
  *
26
26
  * Options:
27
- * --task=PATH Path to task file (required)
27
+ * --task-file=PATH Path to task file (mutually exclusive with --task-text)
28
+ * --task-text=STRING Inline task text (mutually exclusive with --task-file)
28
29
  * --cwd=DIR Agent working directory (default: .)
29
30
  * --model=MODEL Claude model to use (default: opus)
30
31
  * --max-turns=N Maximum agentic turns (default: 50)
31
32
  * --output=PATH Write NDJSON trace to file (default: stdout)
32
33
  * --allowed-tools=LIST Comma-separated tools (default: Bash,Read,Glob,Grep,Write,Edit)
34
+ * --agent-profile=NAME Agent profile name (passed as --agent to Claude CLI)
33
35
  *
34
36
  * @param {string[]} args - Command arguments
35
37
  */
36
38
  export async function runRunCommand(args) {
37
- const task = parseFlag(args, "task");
38
- if (!task) throw new Error("--task is required");
39
+ const taskFile = parseFlag(args, "task-file");
40
+ const taskText = parseFlag(args, "task-text");
41
+ if (taskFile && taskText)
42
+ throw new Error("--task-file and --task-text are mutually exclusive");
43
+ if (!taskFile && !taskText)
44
+ throw new Error("--task-file or --task-text is required");
39
45
 
40
46
  const cwd = resolve(parseFlag(args, "cwd") ?? ".");
41
47
  const model = parseFlag(args, "model") ?? "opus";
42
48
  const maxTurns = parseInt(parseFlag(args, "max-turns") ?? "50", 10);
43
49
  const outputPath = parseFlag(args, "output");
50
+ const agentProfile = parseFlag(args, "agent-profile") ?? undefined;
44
51
  const allowedTools = (
45
52
  parseFlag(args, "allowed-tools") ?? "Bash,Read,Glob,Grep,Write,Edit"
46
53
  ).split(",");
47
54
 
48
- const taskContent = readFileSync(task, "utf8");
55
+ const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
49
56
 
50
57
  // When --output is specified, stream text to stdout while writing NDJSON to file.
51
58
  // Otherwise, write NDJSON directly to stdout (backwards-compatible).
@@ -63,6 +70,7 @@ export async function runRunCommand(args) {
63
70
  maxTurns,
64
71
  allowedTools,
65
72
  settingSources: ["project"],
73
+ agentProfile,
66
74
  });
67
75
 
68
76
  const result = await runner.run(taskContent);
@@ -25,19 +25,26 @@ function parseFlag(args, name) {
25
25
  * Usage: fit-eval supervise [options]
26
26
  *
27
27
  * Options:
28
- * --task=PATH Path to task file (required)
28
+ * --task-file=PATH Path to task file (mutually exclusive with --task-text)
29
+ * --task-text=STRING Inline task text (mutually exclusive with --task-file)
29
30
  * --supervisor-cwd=DIR Supervisor working directory (default: .)
30
31
  * --agent-cwd=DIR Agent working directory (default: temp directory)
31
32
  * --model=MODEL Claude model to use (default: opus)
32
33
  * --max-turns=N Maximum supervisor ↔ agent exchanges (default: 20)
33
34
  * --output=PATH Write NDJSON trace to file (default: stdout)
34
35
  * --allowed-tools=LIST Comma-separated tools for the agent (default: Bash,Read,Glob,Grep,Write,Edit)
36
+ * --supervisor-profile=NAME Supervisor agent profile name (passed as --agent to Claude CLI)
37
+ * --agent-profile=NAME Agent profile name (passed as --agent to Claude CLI)
35
38
  *
36
39
  * @param {string[]} args - Command arguments
37
40
  */
38
41
  export async function runSuperviseCommand(args) {
39
- const task = parseFlag(args, "task");
40
- if (!task) throw new Error("--task is required");
42
+ const taskFile = parseFlag(args, "task-file");
43
+ const taskText = parseFlag(args, "task-text");
44
+ if (taskFile && taskText)
45
+ throw new Error("--task-file and --task-text are mutually exclusive");
46
+ if (!taskFile && !taskText)
47
+ throw new Error("--task-file or --task-text is required");
41
48
 
42
49
  const supervisorCwd = resolve(parseFlag(args, "supervisor-cwd") ?? ".");
43
50
  const agentCwd = resolve(
@@ -47,11 +54,17 @@ export async function runSuperviseCommand(args) {
47
54
  const model = parseFlag(args, "model") ?? "opus";
48
55
  const maxTurns = parseInt(parseFlag(args, "max-turns") ?? "20", 10);
49
56
  const outputPath = parseFlag(args, "output");
57
+ const supervisorProfile = parseFlag(args, "supervisor-profile") ?? undefined;
58
+ const agentProfile = parseFlag(args, "agent-profile") ?? undefined;
50
59
  const allowedTools = (
51
60
  parseFlag(args, "allowed-tools") ?? "Bash,Read,Glob,Grep,Write,Edit"
52
61
  ).split(",");
62
+ const supervisorAllowedToolsRaw = parseFlag(args, "supervisor-allowed-tools");
63
+ const supervisorAllowedTools = supervisorAllowedToolsRaw
64
+ ? supervisorAllowedToolsRaw.split(",")
65
+ : undefined;
53
66
 
54
- const taskContent = readFileSync(task, "utf8");
67
+ const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
55
68
 
56
69
  // When --output is specified, stream text to stdout while writing NDJSON to file.
57
70
  // Otherwise, write NDJSON directly to stdout (backwards-compatible).
@@ -73,6 +86,9 @@ export async function runSuperviseCommand(args) {
73
86
  model,
74
87
  maxTurns,
75
88
  allowedTools,
89
+ supervisorAllowedTools,
90
+ supervisorProfile,
91
+ agentProfile,
76
92
  });
77
93
 
78
94
  const result = await supervisor.run(taskContent);
package/src/supervisor.js CHANGED
@@ -1,25 +1,38 @@
1
1
  /**
2
2
  * Supervisor — orchestrates a relay loop between an agent and a supervisor,
3
- * both running as AgentRunner instances. The agent works on a task while the
4
- * supervisor observes and decides when the evaluation is complete.
3
+ * both running as AgentRunner instances. The supervisor receives the task first,
4
+ * introduces itself, and delegates work to the agent. The loop then alternates:
5
+ * agent → supervisor → agent.
5
6
  *
6
7
  * Follows OO+DI: constructor injection, factory function, tests bypass factory.
7
8
  */
8
9
 
9
10
  import { PassThrough } from "node:stream";
10
11
  import { createAgentRunner } from "./agent-runner.js";
12
+ import { TraceCollector } from "./trace-collector.js";
11
13
 
12
14
  /**
13
- * Check if the supervisor's response signals evaluation completion.
14
- * Uses a structured signal `EVALUATION_COMPLETE` on its own line —
15
- * to avoid false positives from natural language.
15
+ * Check if the supervisor's response signals evaluation success.
16
+ * Matches EVALUATION_SUCCESSFUL anywhere in the text, tolerating markdown
17
+ * formatting (e.g. **EVALUATION_SUCCESSFUL**). Uses word boundaries to
18
+ * avoid matching inside longer identifiers.
16
19
  * @param {string} text
17
20
  * @returns {boolean}
18
21
  */
19
- export function isDone(text) {
20
- return /^EVALUATION_COMPLETE$/m.test(text);
22
+ export function isSuccessful(text) {
23
+ return /(?:^|[\s*_~`])EVALUATION_SUCCESSFUL(?:[\s*_~`.,!?]|$)/m.test(text);
21
24
  }
22
25
 
26
+ /** System prompt appended for the supervisor runner in supervise mode. */
27
+ export const SUPERVISOR_SYSTEM_PROMPT =
28
+ "You supervise another AI agent through a relay — your output becomes the agent's next input. " +
29
+ "Guide the agent, answer its questions, and write EVALUATION_SUCCESSFUL when their task is complete.";
30
+
31
+ /** System prompt appended for the agent runner in supervise mode. */
32
+ export const AGENT_SYSTEM_PROMPT =
33
+ "You are being supervised by another AI agent. " +
34
+ "When requirements are ambiguous or you are uncertain, stop and ask a clarifying question before proceeding.";
35
+
23
36
  export class Supervisor {
24
37
  /**
25
38
  * @param {object} deps
@@ -44,60 +57,88 @@ export class Supervisor {
44
57
 
45
58
  /**
46
59
  * Run the supervisor ↔ agent relay loop.
47
- * @param {string} task - The initial task for the agent
60
+ * The supervisor receives the task first, introduces itself, and delegates
61
+ * work to the agent. The loop then alternates: agent → supervisor → agent.
62
+ * @param {string} task - The initial task for the supervisor
48
63
  * @returns {Promise<{success: boolean, turns: number}>}
49
64
  */
50
65
  async run(task) {
51
- // Turn 0: Agent receives the task and starts working
52
- this.currentSource = "agent";
66
+ // Turn 0: Supervisor receives the task and introduces it to the agent
67
+ this.currentSource = "supervisor";
53
68
  this.currentTurn = 0;
54
- let agentResult = await this.agentRunner.run(task);
69
+ let supervisorResult = await this.supervisorRunner.run(task);
55
70
 
56
- if (agentResult.error) {
71
+ if (supervisorResult.error) {
57
72
  this.emitSummary({ success: false, turns: 0 });
58
73
  return { success: false, turns: 0 };
59
74
  }
60
75
 
61
- for (let turn = 1; turn <= this.maxTurns; turn++) {
62
- // Supervisor observes the agent's output
63
- const supervisorPrompt =
64
- `The agent reported:\n\n${agentResult.text}\n\n` +
65
- `Decide: provide guidance, answer a question, or say EVALUATION_COMPLETE on its own line.`;
76
+ // The supervisor's turn is fully complete (all tool calls executed) by the
77
+ // time we check the signal — no work is interrupted.
78
+ if (isSuccessful(supervisorResult.text)) {
79
+ this.emitSummary({ success: true, turns: 0 });
80
+ return { success: true, turns: 0 };
81
+ }
66
82
 
67
- this.currentSource = "supervisor";
83
+ for (let turn = 1; turn <= this.maxTurns; turn++) {
84
+ // Supervisor's output becomes the agent's input
85
+ this.currentSource = "agent";
68
86
  this.currentTurn = turn;
69
- let supervisorResult;
87
+ let agentResult;
70
88
  if (turn === 1) {
71
- supervisorResult = await this.supervisorRunner.run(supervisorPrompt);
89
+ agentResult = await this.agentRunner.run(supervisorResult.text);
72
90
  } else {
73
- supervisorResult = await this.supervisorRunner.resume(supervisorPrompt);
91
+ agentResult = await this.agentRunner.resume(supervisorResult.text);
74
92
  }
75
93
 
76
- if (supervisorResult.error) {
94
+ if (agentResult.error) {
77
95
  this.emitSummary({ success: false, turns: turn });
78
96
  return { success: false, turns: turn };
79
97
  }
80
98
 
81
- if (isDone(supervisorResult.text)) {
82
- this.emitSummary({ success: true, turns: turn });
83
- return { success: true, turns: turn };
84
- }
99
+ // Build the full agent transcript from buffered NDJSON events so the
100
+ // supervisor sees tool calls and reasoning, not just the SDK result summary.
101
+ const agentTranscript = this.extractTranscript(this.agentRunner);
85
102
 
86
- // Supervisor's response becomes the agent's next input
87
- this.currentSource = "agent";
103
+ const supervisorPrompt =
104
+ `The agent reported:\n\n${agentTranscript}\n\n` +
105
+ `Review the agent's work and decide how to proceed.`;
106
+
107
+ this.currentSource = "supervisor";
88
108
  this.currentTurn = turn;
89
- agentResult = await this.agentRunner.resume(supervisorResult.text);
109
+ supervisorResult = await this.supervisorRunner.resume(supervisorPrompt);
90
110
 
91
- if (agentResult.error) {
111
+ if (supervisorResult.error) {
92
112
  this.emitSummary({ success: false, turns: turn });
93
113
  return { success: false, turns: turn };
94
114
  }
115
+
116
+ // The supervisor's turn is fully complete — check for success signal.
117
+ if (isSuccessful(supervisorResult.text)) {
118
+ this.emitSummary({ success: true, turns: turn });
119
+ return { success: true, turns: turn };
120
+ }
95
121
  }
96
122
 
97
123
  this.emitSummary({ success: false, turns: this.maxTurns });
98
124
  return { success: false, turns: this.maxTurns };
99
125
  }
100
126
 
127
+ /**
128
+ * Extract a human-readable transcript from an AgentRunner's buffered output.
129
+ * Drains the buffer and replays events through a TraceCollector.
130
+ * @param {import("./agent-runner.js").AgentRunner} runner
131
+ * @returns {string}
132
+ */
133
+ extractTranscript(runner) {
134
+ const lines = runner.drainOutput();
135
+ const collector = new TraceCollector();
136
+ for (const line of lines) {
137
+ collector.addLine(line);
138
+ }
139
+ return collector.toText() || "[The agent produced no output.]";
140
+ }
141
+
101
142
  /**
102
143
  * Emit a single NDJSON line tagged with the current source and turn.
103
144
  * Called in real-time via the AgentRunner onLine callback.
@@ -138,6 +179,10 @@ export class Supervisor {
138
179
  * @param {string} [deps.model] - Claude model identifier
139
180
  * @param {number} [deps.maxTurns] - Maximum supervisor ↔ agent exchanges
140
181
  * @param {string[]} [deps.allowedTools] - Tools the agent may use
182
+ * @param {string[]} [deps.supervisorAllowedTools] - Tools the supervisor may use (default: Bash, Read, Glob, Grep, Write, Edit)
183
+ * @param {string[]} [deps.supervisorDisallowedTools] - Tools to explicitly block from the supervisor
184
+ * @param {string} [deps.supervisorProfile] - Supervisor agent profile name
185
+ * @param {string} [deps.agentProfile] - Agent profile name
141
186
  * @returns {Supervisor}
142
187
  */
143
188
  export function createSupervisor({
@@ -148,6 +193,10 @@ export function createSupervisor({
148
193
  model,
149
194
  maxTurns,
150
195
  allowedTools,
196
+ supervisorDisallowedTools,
197
+ supervisorAllowedTools,
198
+ supervisorProfile,
199
+ agentProfile,
151
200
  }) {
152
201
  // Forward-reference: onLine captures `supervisor` before construction completes.
153
202
  // This is safe because onLine is only called during run(), after construction.
@@ -163,17 +212,45 @@ export function createSupervisor({
163
212
  allowedTools,
164
213
  onLine,
165
214
  settingSources: ["project"],
215
+ agentProfile,
216
+ systemPrompt: {
217
+ type: "preset",
218
+ preset: "claude_code",
219
+ append: AGENT_SYSTEM_PROMPT,
220
+ },
166
221
  });
167
222
 
223
+ // Block Task/TaskOutput so the supervisor cannot spawn its own sub-agents.
224
+ // The relay loop handles agent communication — letting the supervisor use
225
+ // Task would bypass the relay and produce an empty agent trace.
226
+ const defaultDisallowed = ["Task", "TaskOutput"];
227
+ const disallowedTools = supervisorDisallowedTools
228
+ ? [...new Set([...defaultDisallowed, ...supervisorDisallowedTools])]
229
+ : defaultDisallowed;
230
+
168
231
  const supervisorRunner = createAgentRunner({
169
232
  cwd: supervisorCwd,
170
233
  query,
171
234
  output: new PassThrough(),
172
235
  model,
173
236
  maxTurns: 10,
174
- allowedTools: ["Read", "Glob", "Grep"],
237
+ allowedTools: supervisorAllowedTools ?? [
238
+ "Bash",
239
+ "Read",
240
+ "Glob",
241
+ "Grep",
242
+ "Write",
243
+ "Edit",
244
+ ],
245
+ disallowedTools,
175
246
  onLine,
176
247
  settingSources: ["project"],
248
+ agentProfile: supervisorProfile,
249
+ systemPrompt: {
250
+ type: "preset",
251
+ preset: "claude_code",
252
+ append: SUPERVISOR_SYSTEM_PROMPT,
253
+ },
177
254
  });
178
255
 
179
256
  supervisor = new Supervisor({
package/src/tee-writer.js CHANGED
@@ -107,7 +107,6 @@ export class TeeWriter extends Writable {
107
107
  if (parsed.event) {
108
108
  if (parsed.source && parsed.source !== this.lastSource) {
109
109
  this.lastSource = parsed.source;
110
- this.textStream.write(`\n[${parsed.source}]\n`);
111
110
  }
112
111
  this.collector.addLine(JSON.stringify(parsed.event));
113
112
  this.flushTurns();
@@ -119,15 +118,19 @@ export class TeeWriter extends Writable {
119
118
  */
120
119
  flushTurns() {
121
120
  const turns = this.collector.turns;
121
+ const prefix =
122
+ this.mode === "supervised" && this.lastSource
123
+ ? `[${this.lastSource}] `
124
+ : "";
122
125
  while (this.turnsEmitted < turns.length) {
123
126
  const turn = turns[this.turnsEmitted++];
124
127
  if (turn.role === "assistant") {
125
128
  for (const block of turn.content) {
126
129
  if (block.type === "text") {
127
- this.textStream.write(block.text + "\n");
130
+ this.textStream.write(`${prefix}${block.text}\n`);
128
131
  } else if (block.type === "tool_use") {
129
132
  const input = summarizeInput(block.input);
130
- this.textStream.write(`> Tool: ${block.name} ${input}\n`);
133
+ this.textStream.write(`${prefix}> Tool: ${block.name} ${input}\n`);
131
134
  }
132
135
  }
133
136
  }
@@ -6,8 +6,10 @@ import {
6
6
  AgentRunner,
7
7
  Supervisor,
8
8
  createSupervisor,
9
+ SUPERVISOR_SYSTEM_PROMPT,
10
+ AGENT_SYSTEM_PROMPT,
9
11
  } from "@forwardimpact/libeval";
10
- import { isDone } from "../src/supervisor.js";
12
+ import { isSuccessful } from "../src/supervisor.js";
11
13
 
12
14
  /**
13
15
  * Create a mock AgentRunner that yields pre-scripted responses.
@@ -61,26 +63,50 @@ function createMockRunner(responses, messages) {
61
63
  return runner;
62
64
  }
63
65
 
64
- describe("isDone", () => {
65
- test("detects EVALUATION_COMPLETE on its own line", () => {
66
- assert.strictEqual(isDone("EVALUATION_COMPLETE"), true);
66
+ describe("isSuccessful", () => {
67
+ test("detects EVALUATION_SUCCESSFUL on its own line", () => {
68
+ assert.strictEqual(isSuccessful("EVALUATION_SUCCESSFUL"), true);
67
69
  assert.strictEqual(
68
- isDone("Some text\nEVALUATION_COMPLETE\nMore text"),
70
+ isSuccessful("Some text\nEVALUATION_SUCCESSFUL\nMore text"),
69
71
  true,
70
72
  );
71
- assert.strictEqual(isDone("Done.\n\nEVALUATION_COMPLETE"), true);
73
+ assert.strictEqual(isSuccessful("Done.\n\nEVALUATION_SUCCESSFUL"), true);
72
74
  });
73
75
 
74
- test("does not match EVALUATION_COMPLETE embedded in text", () => {
75
- assert.strictEqual(isDone("not EVALUATION_COMPLETE yet"), false);
76
- assert.strictEqual(isDone("The agent is EVALUATION_COMPLETE done"), false);
77
- assert.strictEqual(isDone("EVALUATION_COMPLETE_EXTRA"), false);
76
+ test("tolerates markdown formatting around the signal", () => {
77
+ assert.strictEqual(isSuccessful("**EVALUATION_SUCCESSFUL**"), true);
78
+ assert.strictEqual(isSuccessful("*EVALUATION_SUCCESSFUL*"), true);
79
+ assert.strictEqual(isSuccessful("__EVALUATION_SUCCESSFUL__"), true);
80
+ assert.strictEqual(isSuccessful("_EVALUATION_SUCCESSFUL_"), true);
81
+ assert.strictEqual(isSuccessful("`EVALUATION_SUCCESSFUL`"), true);
82
+ assert.strictEqual(
83
+ isSuccessful(
84
+ "Good work.\n\n**EVALUATION_SUCCESSFUL**\n\nNow filing issues.",
85
+ ),
86
+ true,
87
+ );
88
+ });
89
+
90
+ test("matches EVALUATION_SUCCESSFUL anywhere in text", () => {
91
+ assert.strictEqual(isSuccessful("not EVALUATION_SUCCESSFUL yet"), true);
92
+ assert.strictEqual(
93
+ isSuccessful("The agent is EVALUATION_SUCCESSFUL done"),
94
+ true,
95
+ );
96
+ assert.strictEqual(
97
+ isSuccessful("Great work! EVALUATION_SUCCESSFUL. Now filing issues."),
98
+ true,
99
+ );
78
100
  });
79
101
 
80
102
  test("does not match empty or unrelated text", () => {
81
- assert.strictEqual(isDone(""), false);
82
- assert.strictEqual(isDone("All done!"), false);
83
- assert.strictEqual(isDone("DONE"), false);
103
+ assert.strictEqual(isSuccessful(""), false);
104
+ assert.strictEqual(isSuccessful("All done!"), false);
105
+ assert.strictEqual(isSuccessful("DONE"), false);
106
+ });
107
+
108
+ test("does not match old EVALUATION_COMPLETE signal", () => {
109
+ assert.strictEqual(isSuccessful("EVALUATION_COMPLETE"), false);
84
110
  });
85
111
  });
86
112
 
@@ -118,13 +144,35 @@ describe("Supervisor", () => {
118
144
  );
119
145
  });
120
146
 
121
- test("completes on EVALUATION_COMPLETE from supervisor", async () => {
147
+ test("completes on EVALUATION_SUCCESSFUL from supervisor at turn 0", async () => {
148
+ const agentRunner = createMockRunner([]);
149
+
150
+ const supervisorRunner = createMockRunner([
151
+ { text: "EVALUATION_SUCCESSFUL" },
152
+ ]);
153
+
154
+ const output = new PassThrough();
155
+ const supervisor = new Supervisor({
156
+ agentRunner,
157
+ supervisorRunner,
158
+ output,
159
+ maxTurns: 10,
160
+ });
161
+
162
+ const result = await supervisor.run("Install stuff");
163
+
164
+ assert.strictEqual(result.success, true);
165
+ assert.strictEqual(result.turns, 0);
166
+ });
167
+
168
+ test("completes after one agent turn", async () => {
122
169
  const agentRunner = createMockRunner([
123
170
  { text: "I installed the packages." },
124
171
  ]);
125
172
 
126
173
  const supervisorRunner = createMockRunner([
127
- { text: "Good work.\n\nEVALUATION_COMPLETE" },
174
+ { text: "Welcome! Please install the packages." },
175
+ { text: "Good work.\n\nEVALUATION_SUCCESSFUL" },
128
176
  ]);
129
177
 
130
178
  const output = new PassThrough();
@@ -149,9 +197,10 @@ describe("Supervisor", () => {
149
197
  ]);
150
198
 
151
199
  const supervisorRunner = createMockRunner([
200
+ { text: "Here is your task. Do the work." },
152
201
  { text: "Keep going, you need to do more." },
153
202
  { text: "Almost there, continue." },
154
- { text: "EVALUATION_COMPLETE" },
203
+ { text: "EVALUATION_SUCCESSFUL" },
155
204
  ]);
156
205
 
157
206
  const output = new PassThrough();
@@ -169,14 +218,14 @@ describe("Supervisor", () => {
169
218
  });
170
219
 
171
220
  test("enforces maxTurns limit", async () => {
172
- // Agent responds to every turn, supervisor never says done
221
+ // Supervisor starts, agent responds each turn, supervisor never says done
173
222
  const agentRunner = createMockRunner([
174
- { text: "Turn 0" },
175
223
  { text: "Turn 1" },
176
224
  { text: "Turn 2" },
177
225
  ]);
178
226
 
179
227
  const supervisorRunner = createMockRunner([
228
+ { text: "Start working." },
180
229
  { text: "Continue." },
181
230
  { text: "Continue." },
182
231
  ]);
@@ -196,16 +245,17 @@ describe("Supervisor", () => {
196
245
  });
197
246
 
198
247
  test("output contains tagged lines with correct source and turn", async () => {
199
- const agentMessages = [[{ type: "assistant", content: "Working" }]];
200
248
  const supervisorMessages = [
201
- [{ type: "assistant", content: "EVALUATION_COMPLETE" }],
249
+ [{ type: "assistant", content: "Go ahead" }],
250
+ [{ type: "assistant", content: "EVALUATION_SUCCESSFUL" }],
202
251
  ];
252
+ const agentMessages = [[{ type: "assistant", content: "Working" }]];
203
253
 
204
- const agentRunner = createMockRunner([{ text: "Working" }], agentMessages);
205
254
  const supervisorRunner = createMockRunner(
206
- [{ text: "EVALUATION_COMPLETE" }],
255
+ [{ text: "Go ahead" }, { text: "EVALUATION_SUCCESSFUL" }],
207
256
  supervisorMessages,
208
257
  );
258
+ const agentRunner = createMockRunner([{ text: "Working" }], agentMessages);
209
259
 
210
260
  const output = new PassThrough();
211
261
  const supervisor = new Supervisor({
@@ -225,19 +275,19 @@ describe("Supervisor", () => {
225
275
  .split("\n")
226
276
  .filter((l) => l.length > 0);
227
277
 
228
- // Should have: agent turn 0, supervisor turn 1, orchestrator summary
229
- assert.ok(lines.length >= 3);
230
-
231
- const agentLine = JSON.parse(lines[0]);
232
- assert.strictEqual(agentLine.source, "agent");
233
- assert.strictEqual(agentLine.turn, 0);
234
- assert.ok("event" in agentLine);
278
+ // Should have: supervisor turn 0, agent turn 1, supervisor turn 1, orchestrator summary
279
+ assert.ok(lines.length >= 4);
235
280
 
236
- const supervisorLine = JSON.parse(lines[1]);
281
+ const supervisorLine = JSON.parse(lines[0]);
237
282
  assert.strictEqual(supervisorLine.source, "supervisor");
238
- assert.strictEqual(supervisorLine.turn, 1);
283
+ assert.strictEqual(supervisorLine.turn, 0);
239
284
  assert.ok("event" in supervisorLine);
240
285
 
286
+ const agentLine = JSON.parse(lines[1]);
287
+ assert.strictEqual(agentLine.source, "agent");
288
+ assert.strictEqual(agentLine.turn, 1);
289
+ assert.ok("event" in agentLine);
290
+
241
291
  const summaryLine = JSON.parse(lines[lines.length - 1]);
242
292
  assert.strictEqual(summaryLine.source, "orchestrator");
243
293
  assert.strictEqual(summaryLine.type, "summary");
@@ -250,11 +300,14 @@ describe("Supervisor", () => {
250
300
  source: "sdk-internal",
251
301
  content: "test",
252
302
  };
253
- const agentRunner = createMockRunner([{ text: "Done" }], [[sourceEvent]]);
254
303
  const supervisorRunner = createMockRunner(
255
- [{ text: "EVALUATION_COMPLETE" }],
256
- [[{ type: "assistant", content: "ok" }]],
304
+ [{ text: "Go" }, { text: "EVALUATION_SUCCESSFUL" }],
305
+ [
306
+ [{ type: "assistant", content: "Go" }],
307
+ [{ type: "assistant", content: "ok" }],
308
+ ],
257
309
  );
310
+ const agentRunner = createMockRunner([{ text: "Done" }], [[sourceEvent]]);
258
311
 
259
312
  const output = new PassThrough();
260
313
  const supervisor = new Supervisor({
@@ -274,27 +327,30 @@ describe("Supervisor", () => {
274
327
  .split("\n")
275
328
  .filter((l) => l.length > 0);
276
329
 
277
- const tagged = JSON.parse(lines[0]);
330
+ // First line is supervisor turn 0, second is agent turn 1
331
+ const tagged = JSON.parse(lines[1]);
278
332
  // The original event's `source` field is preserved inside `event`
279
333
  assert.strictEqual(tagged.source, "agent");
280
334
  assert.strictEqual(tagged.event.source, "sdk-internal");
281
335
  });
282
336
 
283
- test("emits agent output and summary when agent errors on turn 0", async () => {
284
- const agentMessages = [[{ type: "assistant", content: "Partial work" }]];
285
- const agentRunner = createMockRunner(
286
- [{ text: "Partial work", success: false }],
287
- agentMessages,
337
+ test("emits supervisor output and summary when supervisor errors on turn 0", async () => {
338
+ const supervisorMessages = [
339
+ [{ type: "assistant", content: "Starting..." }],
340
+ ];
341
+ const supervisorRunner = createMockRunner(
342
+ [{ text: "Starting...", success: false }],
343
+ supervisorMessages,
288
344
  );
289
345
 
290
346
  // Override run to simulate an error return
291
- const origRun = agentRunner.run;
292
- agentRunner.run = async (task) => {
293
- const result = await origRun.call(agentRunner, task);
347
+ const origRun = supervisorRunner.run;
348
+ supervisorRunner.run = async (task) => {
349
+ const result = await origRun.call(supervisorRunner, task);
294
350
  return { ...result, error: new Error("Process exited with code 1") };
295
351
  };
296
352
 
297
- const supervisorRunner = createMockRunner([]);
353
+ const agentRunner = createMockRunner([]);
298
354
 
299
355
  const output = new PassThrough();
300
356
  const supervisor = new Supervisor({
@@ -311,18 +367,18 @@ describe("Supervisor", () => {
311
367
  assert.strictEqual(result.success, false);
312
368
  assert.strictEqual(result.turns, 0);
313
369
 
314
- // Output should still contain the agent's buffered lines + summary
370
+ // Output should still contain the supervisor's buffered lines + summary
315
371
  const data = output.read()?.toString() ?? "";
316
372
  const lines = data
317
373
  .trim()
318
374
  .split("\n")
319
375
  .filter((l) => l.length > 0);
320
376
 
321
- assert.ok(lines.length >= 2, "Expected at least agent line + summary");
377
+ assert.ok(lines.length >= 2, "Expected at least supervisor line + summary");
322
378
 
323
- const agentLine = JSON.parse(lines[0]);
324
- assert.strictEqual(agentLine.source, "agent");
325
- assert.strictEqual(agentLine.turn, 0);
379
+ const supervisorLine = JSON.parse(lines[0]);
380
+ assert.strictEqual(supervisorLine.source, "supervisor");
381
+ assert.strictEqual(supervisorLine.turn, 0);
326
382
 
327
383
  const summaryLine = JSON.parse(lines[lines.length - 1]);
328
384
  assert.strictEqual(summaryLine.source, "orchestrator");
@@ -339,4 +395,99 @@ describe("Supervisor", () => {
339
395
  });
340
396
  assert.ok(supervisor instanceof Supervisor);
341
397
  });
398
+
399
+ test("createSupervisor uses default supervisor tools when none specified", () => {
400
+ const supervisor = createSupervisor({
401
+ supervisorCwd: "/tmp/sup",
402
+ agentCwd: "/tmp/agent",
403
+ query: async function* () {},
404
+ output: new PassThrough(),
405
+ });
406
+ assert.deepStrictEqual(supervisor.supervisorRunner.allowedTools, [
407
+ "Bash",
408
+ "Read",
409
+ "Glob",
410
+ "Grep",
411
+ "Write",
412
+ "Edit",
413
+ ]);
414
+ });
415
+
416
+ test("createSupervisor passes custom supervisor tools", () => {
417
+ const supervisor = createSupervisor({
418
+ supervisorCwd: "/tmp/sup",
419
+ agentCwd: "/tmp/agent",
420
+ query: async function* () {},
421
+ output: new PassThrough(),
422
+ supervisorAllowedTools: ["Read", "Glob", "Grep"],
423
+ });
424
+ assert.deepStrictEqual(supervisor.supervisorRunner.allowedTools, [
425
+ "Read",
426
+ "Glob",
427
+ "Grep",
428
+ ]);
429
+ });
430
+
431
+ test("createSupervisor wires system prompts to both runners", () => {
432
+ const supervisor = createSupervisor({
433
+ supervisorCwd: "/tmp/sup",
434
+ agentCwd: "/tmp/agent",
435
+ query: async function* () {},
436
+ output: new PassThrough(),
437
+ });
438
+
439
+ assert.deepStrictEqual(supervisor.agentRunner.systemPrompt, {
440
+ type: "preset",
441
+ preset: "claude_code",
442
+ append: AGENT_SYSTEM_PROMPT,
443
+ });
444
+ assert.deepStrictEqual(supervisor.supervisorRunner.systemPrompt, {
445
+ type: "preset",
446
+ preset: "claude_code",
447
+ append: SUPERVISOR_SYSTEM_PROMPT,
448
+ });
449
+ });
450
+
451
+ test("createSupervisor blocks Task and TaskOutput on supervisor by default", () => {
452
+ const supervisor = createSupervisor({
453
+ supervisorCwd: "/tmp/sup",
454
+ agentCwd: "/tmp/agent",
455
+ query: async function* () {},
456
+ output: new PassThrough(),
457
+ });
458
+ assert.deepStrictEqual(supervisor.supervisorRunner.disallowedTools, [
459
+ "Task",
460
+ "TaskOutput",
461
+ ]);
462
+ // Agent should not have disallowed tools
463
+ assert.deepStrictEqual(supervisor.agentRunner.disallowedTools, []);
464
+ });
465
+
466
+ test("createSupervisor merges custom supervisorDisallowedTools with defaults", () => {
467
+ const supervisor = createSupervisor({
468
+ supervisorCwd: "/tmp/sup",
469
+ agentCwd: "/tmp/agent",
470
+ query: async function* () {},
471
+ output: new PassThrough(),
472
+ supervisorDisallowedTools: ["WebSearch", "Task"],
473
+ });
474
+ const disallowed = supervisor.supervisorRunner.disallowedTools;
475
+ assert.ok(disallowed.includes("Task"));
476
+ assert.ok(disallowed.includes("TaskOutput"));
477
+ assert.ok(disallowed.includes("WebSearch"));
478
+ // No duplicates
479
+ assert.strictEqual(disallowed.length, new Set(disallowed).size);
480
+ });
481
+
482
+ test("system prompt constants are non-empty strings", () => {
483
+ assert.ok(typeof SUPERVISOR_SYSTEM_PROMPT === "string");
484
+ assert.ok(typeof AGENT_SYSTEM_PROMPT === "string");
485
+ assert.ok(SUPERVISOR_SYSTEM_PROMPT.length > 0);
486
+ assert.ok(AGENT_SYSTEM_PROMPT.length > 0);
487
+ });
488
+
489
+ test("SUPERVISOR_SYSTEM_PROMPT explains relay mechanism", () => {
490
+ assert.ok(SUPERVISOR_SYSTEM_PROMPT.includes("relay"));
491
+ assert.ok(SUPERVISOR_SYSTEM_PROMPT.includes("EVALUATION_SUCCESSFUL"));
492
+ });
342
493
  });
@@ -187,11 +187,9 @@ describe("TeeWriter", () => {
187
187
  assert.strictEqual(fileLines.length, 3);
188
188
  assert.strictEqual(JSON.parse(fileLines[0]).source, "agent");
189
189
 
190
- // Text should show source labels
191
- assert.ok(textData.includes("[agent]"));
192
- assert.ok(textData.includes("Working on it"));
193
- assert.ok(textData.includes("[supervisor]"));
194
- assert.ok(textData.includes("Looks good"));
190
+ // Text should show source prefixes on content lines
191
+ assert.ok(textData.includes("[agent] Working on it"));
192
+ assert.ok(textData.includes("[supervisor] Looks good"));
195
193
  assert.ok(textData.includes("Evaluation completed after 1 turns"));
196
194
  });
197
195
 
@@ -254,9 +252,9 @@ describe("TeeWriter", () => {
254
252
  await writeLines(writer, events);
255
253
 
256
254
  const textData = collect(textStream);
257
- // [agent] label should appear only once
258
- const agentLabels = textData.split("[agent]").length - 1;
259
- assert.strictEqual(agentLabels, 1);
255
+ // [agent] prefix should appear on each content line
256
+ assert.ok(textData.includes("[agent] Step 1"));
257
+ assert.ok(textData.includes("[agent] Step 2"));
260
258
  });
261
259
 
262
260
  test("handles partial lines across chunks", async () => {