@forwardimpact/libeval 0.1.3 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/bin/fit-eval.js CHANGED
@@ -25,21 +25,28 @@ Commands:
25
25
  supervise [options] Run a supervised agent ↔ supervisor relay loop
26
26
 
27
27
  Run options:
28
- --task=PATH Path to task file (required)
28
+ --task-file=PATH Path to task file (mutually exclusive with --task-text)
29
+ --task-text=STRING Inline task text (mutually exclusive with --task-file)
29
30
  --cwd=DIR Agent working directory (default: .)
30
31
  --model=MODEL Claude model to use (default: opus)
31
32
  --max-turns=N Maximum agentic turns (default: 50)
32
33
  --output=PATH Write NDJSON trace to file (default: stdout)
33
34
  --allowed-tools=LIST Comma-separated tools (default: Bash,Read,Glob,Grep,Write,Edit)
35
+ --agent-profile=NAME Agent profile name (passed as --agent to Claude CLI)
34
36
 
35
37
  Supervise options:
36
- --task=PATH Path to task file (required)
38
+ --task-file=PATH Path to task file (mutually exclusive with --task-text)
39
+ --task-text=STRING Inline task text (mutually exclusive with --task-file)
37
40
  --supervisor-cwd=DIR Supervisor working directory (default: .)
38
41
  --agent-cwd=DIR Agent working directory (default: temp directory)
39
42
  --model=MODEL Claude model to use (default: opus)
40
43
  --max-turns=N Maximum supervisor ↔ agent exchanges (default: 20)
41
44
  --output=PATH Write NDJSON trace to file (default: stdout)
42
45
  --allowed-tools=LIST Comma-separated tools for agent (default: Bash,Read,Glob,Grep,Write,Edit)
46
+ --supervisor-allowed-tools=LIST
47
+ Comma-separated tools for supervisor (default: Bash,Read,Glob,Grep,Write,Edit)
48
+ --supervisor-profile=NAME Supervisor agent profile name (passed as --agent to Claude CLI)
49
+ --agent-profile=NAME Agent profile name (passed as --agent to Claude CLI)
43
50
 
44
51
  Options:
45
52
  --help Show this help message
@@ -50,8 +57,9 @@ Examples:
50
57
  fit-eval output --format=json < trace.ndjson
51
58
  fit-eval tee < trace.ndjson
52
59
  fit-eval tee output.ndjson < trace.ndjson
53
- fit-eval run --task=.github/tasks/security-audit.md --model=opus
54
- fit-eval supervise --task=scenarios/guide-setup/task.md --supervisor-cwd=.
60
+ fit-eval run --task-text="Perform a security audit of the repository." --model=opus
61
+ fit-eval run --task-file=scenarios/guide-setup/task.md --model=opus
62
+ fit-eval supervise --task-file=scenarios/guide-setup/task.md --supervisor-cwd=.
55
63
  `.trim();
56
64
 
57
65
  async function main() {
package/index.js CHANGED
@@ -1,4 +1,9 @@
1
1
  export { TraceCollector, createTraceCollector } from "./src/trace-collector.js";
2
2
  export { AgentRunner, createAgentRunner } from "./src/agent-runner.js";
3
- export { Supervisor, createSupervisor } from "./src/supervisor.js";
3
+ export {
4
+ Supervisor,
5
+ createSupervisor,
6
+ SUPERVISOR_SYSTEM_PROMPT,
7
+ AGENT_SYSTEM_PROMPT,
8
+ } from "./src/supervisor.js";
4
9
  export { TeeWriter, createTeeWriter } from "./src/tee-writer.js";
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@forwardimpact/libeval",
3
- "version": "0.1.3",
3
+ "version": "0.1.6",
4
4
  "description": "Process Claude Code stream-json output into structured traces",
5
5
  "license": "Apache-2.0",
6
6
  "author": "D. Olsson <hi@senzilla.io>",
@@ -10,13 +10,14 @@
10
10
  "fit-eval": "./bin/fit-eval.js"
11
11
  },
12
12
  "engines": {
13
- "bun": ">=1.2.0"
13
+ "bun": ">=1.2.0",
14
+ "node": ">=18.0.0"
14
15
  },
15
16
  "scripts": {
16
17
  "test": "bun run node --test test/*.test.js"
17
18
  },
18
19
  "dependencies": {
19
- "@anthropic-ai/claude-agent-sdk": "^0.1.0"
20
+ "@anthropic-ai/claude-agent-sdk": "^0.2.91"
20
21
  },
21
22
  "publishConfig": {
22
23
  "access": "public"
@@ -18,6 +18,9 @@ export class AgentRunner {
18
18
  * @param {string} [deps.permissionMode] - SDK permission mode
19
19
  * @param {function} [deps.onLine] - Callback invoked with each NDJSON line as it's produced
20
20
  * @param {string[]} [deps.settingSources] - SDK setting sources (e.g. ['project'] to load CLAUDE.md)
21
+ * @param {string} [deps.agentProfile] - Agent profile name to pass as --agent to the Claude CLI
22
+ * @param {string|object} [deps.systemPrompt] - SDK system prompt (string replaces default; {type:'preset', preset:'claude_code', append} appends)
23
+ * @param {string[]} [deps.disallowedTools] - Tools to explicitly remove from the model's context
21
24
  */
22
25
  constructor({
23
26
  cwd,
@@ -29,6 +32,9 @@ export class AgentRunner {
29
32
  permissionMode,
30
33
  onLine,
31
34
  settingSources,
35
+ agentProfile,
36
+ systemPrompt,
37
+ disallowedTools,
32
38
  }) {
33
39
  if (!cwd) throw new Error("cwd is required");
34
40
  if (!query) throw new Error("query is required");
@@ -49,6 +55,9 @@ export class AgentRunner {
49
55
  this.permissionMode = permissionMode ?? "bypassPermissions";
50
56
  this.onLine = onLine ?? null;
51
57
  this.settingSources = settingSources ?? [];
58
+ this.agentProfile = agentProfile ?? null;
59
+ this.systemPrompt = systemPrompt ?? null;
60
+ this.disallowedTools = disallowedTools ?? [];
52
61
  this.sessionId = null;
53
62
  this.buffer = [];
54
63
  }
@@ -74,6 +83,11 @@ export class AgentRunner {
74
83
  permissionMode: this.permissionMode,
75
84
  allowDangerouslySkipPermissions: true,
76
85
  settingSources: this.settingSources,
86
+ ...(this.disallowedTools.length > 0 && {
87
+ disallowedTools: this.disallowedTools,
88
+ }),
89
+ ...(this.systemPrompt && { systemPrompt: this.systemPrompt }),
90
+ ...(this.agentProfile && { extraArgs: { agent: this.agentProfile } }),
77
91
  },
78
92
  })) {
79
93
  const line = JSON.stringify(message);
@@ -113,7 +127,11 @@ export class AgentRunner {
113
127
  try {
114
128
  for await (const message of this.query({
115
129
  prompt,
116
- options: { resume: this.sessionId },
130
+ options: {
131
+ resume: this.sessionId,
132
+ permissionMode: this.permissionMode,
133
+ allowDangerouslySkipPermissions: true,
134
+ },
117
135
  })) {
118
136
  const line = JSON.stringify(message);
119
137
  this.output.write(line + "\n");
@@ -24,28 +24,35 @@ function parseFlag(args, name) {
24
24
  * Usage: fit-eval run [options]
25
25
  *
26
26
  * Options:
27
- * --task=PATH Path to task file (required)
27
+ * --task-file=PATH Path to task file (mutually exclusive with --task-text)
28
+ * --task-text=STRING Inline task text (mutually exclusive with --task-file)
28
29
  * --cwd=DIR Agent working directory (default: .)
29
30
  * --model=MODEL Claude model to use (default: opus)
30
31
  * --max-turns=N Maximum agentic turns (default: 50)
31
32
  * --output=PATH Write NDJSON trace to file (default: stdout)
32
33
  * --allowed-tools=LIST Comma-separated tools (default: Bash,Read,Glob,Grep,Write,Edit)
34
+ * --agent-profile=NAME Agent profile name (passed as --agent to Claude CLI)
33
35
  *
34
36
  * @param {string[]} args - Command arguments
35
37
  */
36
38
  export async function runRunCommand(args) {
37
- const task = parseFlag(args, "task");
38
- if (!task) throw new Error("--task is required");
39
+ const taskFile = parseFlag(args, "task-file");
40
+ const taskText = parseFlag(args, "task-text");
41
+ if (taskFile && taskText)
42
+ throw new Error("--task-file and --task-text are mutually exclusive");
43
+ if (!taskFile && !taskText)
44
+ throw new Error("--task-file or --task-text is required");
39
45
 
40
46
  const cwd = resolve(parseFlag(args, "cwd") ?? ".");
41
47
  const model = parseFlag(args, "model") ?? "opus";
42
48
  const maxTurns = parseInt(parseFlag(args, "max-turns") ?? "50", 10);
43
49
  const outputPath = parseFlag(args, "output");
50
+ const agentProfile = parseFlag(args, "agent-profile") ?? undefined;
44
51
  const allowedTools = (
45
52
  parseFlag(args, "allowed-tools") ?? "Bash,Read,Glob,Grep,Write,Edit"
46
53
  ).split(",");
47
54
 
48
- const taskContent = readFileSync(task, "utf8");
55
+ const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
49
56
 
50
57
  // When --output is specified, stream text to stdout while writing NDJSON to file.
51
58
  // Otherwise, write NDJSON directly to stdout (backwards-compatible).
@@ -63,6 +70,7 @@ export async function runRunCommand(args) {
63
70
  maxTurns,
64
71
  allowedTools,
65
72
  settingSources: ["project"],
73
+ agentProfile,
66
74
  });
67
75
 
68
76
  const result = await runner.run(taskContent);
@@ -25,19 +25,26 @@ function parseFlag(args, name) {
25
25
  * Usage: fit-eval supervise [options]
26
26
  *
27
27
  * Options:
28
- * --task=PATH Path to task file (required)
28
+ * --task-file=PATH Path to task file (mutually exclusive with --task-text)
29
+ * --task-text=STRING Inline task text (mutually exclusive with --task-file)
29
30
  * --supervisor-cwd=DIR Supervisor working directory (default: .)
30
31
  * --agent-cwd=DIR Agent working directory (default: temp directory)
31
32
  * --model=MODEL Claude model to use (default: opus)
32
33
  * --max-turns=N Maximum supervisor ↔ agent exchanges (default: 20)
33
34
  * --output=PATH Write NDJSON trace to file (default: stdout)
34
35
  * --allowed-tools=LIST Comma-separated tools for the agent (default: Bash,Read,Glob,Grep,Write,Edit)
36
+ * --supervisor-profile=NAME Supervisor agent profile name (passed as --agent to Claude CLI)
37
+ * --agent-profile=NAME Agent profile name (passed as --agent to Claude CLI)
35
38
  *
36
39
  * @param {string[]} args - Command arguments
37
40
  */
38
41
  export async function runSuperviseCommand(args) {
39
- const task = parseFlag(args, "task");
40
- if (!task) throw new Error("--task is required");
42
+ const taskFile = parseFlag(args, "task-file");
43
+ const taskText = parseFlag(args, "task-text");
44
+ if (taskFile && taskText)
45
+ throw new Error("--task-file and --task-text are mutually exclusive");
46
+ if (!taskFile && !taskText)
47
+ throw new Error("--task-file or --task-text is required");
41
48
 
42
49
  const supervisorCwd = resolve(parseFlag(args, "supervisor-cwd") ?? ".");
43
50
  const agentCwd = resolve(
@@ -47,11 +54,17 @@ export async function runSuperviseCommand(args) {
47
54
  const model = parseFlag(args, "model") ?? "opus";
48
55
  const maxTurns = parseInt(parseFlag(args, "max-turns") ?? "20", 10);
49
56
  const outputPath = parseFlag(args, "output");
57
+ const supervisorProfile = parseFlag(args, "supervisor-profile") ?? undefined;
58
+ const agentProfile = parseFlag(args, "agent-profile") ?? undefined;
50
59
  const allowedTools = (
51
60
  parseFlag(args, "allowed-tools") ?? "Bash,Read,Glob,Grep,Write,Edit"
52
61
  ).split(",");
62
+ const supervisorAllowedToolsRaw = parseFlag(args, "supervisor-allowed-tools");
63
+ const supervisorAllowedTools = supervisorAllowedToolsRaw
64
+ ? supervisorAllowedToolsRaw.split(",")
65
+ : undefined;
53
66
 
54
- const taskContent = readFileSync(task, "utf8");
67
+ const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
55
68
 
56
69
  // When --output is specified, stream text to stdout while writing NDJSON to file.
57
70
  // Otherwise, write NDJSON directly to stdout (backwards-compatible).
@@ -73,6 +86,9 @@ export async function runSuperviseCommand(args) {
73
86
  model,
74
87
  maxTurns,
75
88
  allowedTools,
89
+ supervisorAllowedTools,
90
+ supervisorProfile,
91
+ agentProfile,
76
92
  });
77
93
 
78
94
  const result = await supervisor.run(taskContent);
package/src/supervisor.js CHANGED
@@ -1,25 +1,38 @@
1
1
  /**
2
2
  * Supervisor — orchestrates a relay loop between an agent and a supervisor,
3
- * both running as AgentRunner instances. The agent works on a task while the
4
- * supervisor observes and decides when the evaluation is complete.
3
+ * both running as AgentRunner instances. The supervisor receives the task first,
4
+ * introduces itself, and delegates work to the agent. The loop then alternates:
5
+ * agent → supervisor → agent.
5
6
  *
6
7
  * Follows OO+DI: constructor injection, factory function, tests bypass factory.
7
8
  */
8
9
 
9
10
  import { PassThrough } from "node:stream";
10
11
  import { createAgentRunner } from "./agent-runner.js";
12
+ import { TraceCollector } from "./trace-collector.js";
11
13
 
12
14
  /**
13
- * Check if the supervisor's response signals evaluation completion.
14
- * Uses a structured signal `EVALUATION_COMPLETE` on its own line —
15
- * to avoid false positives from natural language.
15
+ * Check if the supervisor's response signals evaluation success.
16
+ * Matches EVALUATION_SUCCESSFUL anywhere in the text, tolerating markdown
17
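+ * formatting (e.g. **EVALUATION_SUCCESSFUL**). The signal must be delimited on
18
+ * both sides by a line edge, whitespace, or one of * _ ~ ` (also . , ! ? after it); since _ is a delimiter, EVALUATION_SUCCESSFUL_EXTRA still matches.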
16
19
  * @param {string} text
17
20
  * @returns {boolean}
18
21
  */
19
- export function isDone(text) {
20
- return /^EVALUATION_COMPLETE$/m.test(text);
22
+ export function isSuccessful(text) {
23
+ return /(?:^|[\s*_~`])EVALUATION_SUCCESSFUL(?:[\s*_~`.,!?]|$)/m.test(text);
21
24
  }
22
25
 
26
+ /** System prompt appended for the supervisor runner in supervise mode. */
27
+ export const SUPERVISOR_SYSTEM_PROMPT =
28
+ "You supervise another AI agent through a relay — your output becomes the agent's next input. " +
29
+ "Guide the agent, answer its questions, and write EVALUATION_SUCCESSFUL when their task is complete.";
30
+
31
+ /** System prompt appended for the agent runner in supervise mode. */
32
+ export const AGENT_SYSTEM_PROMPT =
33
+ "You are being supervised by another AI agent. " +
34
+ "When requirements are ambiguous or you are uncertain, stop and ask a clarifying question before proceeding.";
35
+
23
36
  export class Supervisor {
24
37
  /**
25
38
  * @param {object} deps
@@ -40,67 +53,113 @@ export class Supervisor {
40
53
  this.currentSource = "agent";
41
54
  /** @type {number} */
42
55
  this.currentTurn = 0;
56
+ /**
57
+ * Set to true when any supervisor message contains the success signal.
58
+ * The SDK result text only reflects the last assistant message, so when
59
+ * the supervisor writes EVALUATION_SUCCESSFUL in an early message and
60
+ * then continues with follow-up work, the result text won't contain it.
61
+ * This flag captures the signal from the full message stream.
62
+ * @type {boolean}
63
+ */
64
+ this.successSignalSeen = false;
43
65
  }
44
66
 
45
67
  /**
46
68
  * Run the supervisor ↔ agent relay loop.
47
- * @param {string} task - The initial task for the agent
69
+ * The supervisor receives the task first, introduces itself, and delegates
70
+ * work to the agent. The loop then alternates: agent → supervisor → agent.
71
+ * @param {string} task - The initial task for the supervisor
48
72
  * @returns {Promise<{success: boolean, turns: number}>}
49
73
  */
50
74
  async run(task) {
51
- // Turn 0: Agent receives the task and starts working
52
- this.currentSource = "agent";
75
+ // Turn 0: Supervisor receives the task and introduces it to the agent
76
+ this.currentSource = "supervisor";
53
77
  this.currentTurn = 0;
54
- let agentResult = await this.agentRunner.run(task);
78
+ this.successSignalSeen = false;
79
+ let supervisorResult = await this.supervisorRunner.run(task);
55
80
 
56
- if (agentResult.error) {
81
+ if (supervisorResult.error) {
57
82
  this.emitSummary({ success: false, turns: 0 });
58
83
  return { success: false, turns: 0 };
59
84
  }
60
85
 
61
- for (let turn = 1; turn <= this.maxTurns; turn++) {
62
- // Supervisor observes the agent's output
63
- const supervisorPrompt =
64
- `The agent reported:\n\n${agentResult.text}\n\n` +
65
- `Decide: provide guidance, answer a question, or say EVALUATION_COMPLETE on its own line.`;
86
+ // Check for the success signal in either the SDK result text or the
87
+ // streamed message content. The SDK result text only reflects the last
88
+ // assistant message, so when the supervisor writes EVALUATION_SUCCESSFUL
89
+ // early and then continues (e.g. filing issues), we must also check the
90
+ // flag set by emitLine during streaming.
91
+ if (this.successSignalSeen || isSuccessful(supervisorResult.text)) {
92
+ this.emitSummary({ success: true, turns: 0 });
93
+ return { success: true, turns: 0 };
94
+ }
66
95
 
67
- this.currentSource = "supervisor";
96
+ for (let turn = 1; turn <= this.maxTurns; turn++) {
97
+ // Supervisor's output becomes the agent's input
98
+ this.currentSource = "agent";
68
99
  this.currentTurn = turn;
69
- let supervisorResult;
100
+ let agentResult;
70
101
  if (turn === 1) {
71
- supervisorResult = await this.supervisorRunner.run(supervisorPrompt);
102
+ agentResult = await this.agentRunner.run(supervisorResult.text);
72
103
  } else {
73
- supervisorResult = await this.supervisorRunner.resume(supervisorPrompt);
104
+ agentResult = await this.agentRunner.resume(supervisorResult.text);
74
105
  }
75
106
 
76
- if (supervisorResult.error) {
107
+ if (agentResult.error) {
77
108
  this.emitSummary({ success: false, turns: turn });
78
109
  return { success: false, turns: turn };
79
110
  }
80
111
 
81
- if (isDone(supervisorResult.text)) {
82
- this.emitSummary({ success: true, turns: turn });
83
- return { success: true, turns: turn };
84
- }
112
+ // Build the full agent transcript from buffered NDJSON events so the
113
+ // supervisor sees tool calls and reasoning, not just the SDK result summary.
114
+ const agentTranscript = this.extractTranscript(this.agentRunner);
85
115
 
86
- // Supervisor's response becomes the agent's next input
87
- this.currentSource = "agent";
116
+ const supervisorPrompt =
117
+ `The agent reported:\n\n${agentTranscript}\n\n` +
118
+ `Review the agent's work and decide how to proceed.`;
119
+
120
+ this.currentSource = "supervisor";
88
121
  this.currentTurn = turn;
89
- agentResult = await this.agentRunner.resume(supervisorResult.text);
122
+ this.successSignalSeen = false;
123
+ supervisorResult = await this.supervisorRunner.resume(supervisorPrompt);
90
124
 
91
- if (agentResult.error) {
125
+ if (supervisorResult.error) {
92
126
  this.emitSummary({ success: false, turns: turn });
93
127
  return { success: false, turns: turn };
94
128
  }
129
+
130
+ // The supervisor's turn is fully complete — check for success signal
131
+ // in either the SDK result text or streamed messages.
132
+ if (this.successSignalSeen || isSuccessful(supervisorResult.text)) {
133
+ this.emitSummary({ success: true, turns: turn });
134
+ return { success: true, turns: turn };
135
+ }
95
136
  }
96
137
 
97
138
  this.emitSummary({ success: false, turns: this.maxTurns });
98
139
  return { success: false, turns: this.maxTurns };
99
140
  }
100
141
 
142
+ /**
143
+ * Extract a human-readable transcript from an AgentRunner's buffered output.
144
+ * Drains the buffer and replays events through a TraceCollector.
145
+ * @param {import("./agent-runner.js").AgentRunner} runner
146
+ * @returns {string}
147
+ */
148
+ extractTranscript(runner) {
149
+ const lines = runner.drainOutput();
150
+ const collector = new TraceCollector();
151
+ for (const line of lines) {
152
+ collector.addLine(line);
153
+ }
154
+ return collector.toText() || "[The agent produced no output.]";
155
+ }
156
+
101
157
  /**
102
158
  * Emit a single NDJSON line tagged with the current source and turn.
103
159
  * Called in real-time via the AgentRunner onLine callback.
160
+ *
161
+ * When the current source is the supervisor, also scans assistant text
162
+ * content for the EVALUATION_SUCCESSFUL signal and sets successSignalSeen.
104
163
  * @param {string} line - Raw NDJSON line from the runner
105
164
  */
106
165
  emitLine(line) {
@@ -111,6 +170,21 @@ export class Supervisor {
111
170
  event,
112
171
  };
113
172
  this.output.write(JSON.stringify(tagged) + "\n");
173
+
174
+ // Scan supervisor assistant messages for the success signal in real time.
175
+ // The SDK result text only reflects the final assistant message, but the
176
+ // supervisor may write EVALUATION_SUCCESSFUL in an earlier message and
177
+ // then continue with follow-up tool calls.
178
+ if (this.currentSource === "supervisor" && event.type === "assistant") {
179
+ const content = event.message?.content ?? event.content ?? [];
180
+ if (Array.isArray(content)) {
181
+ for (const block of content) {
182
+ if (block.type === "text" && isSuccessful(block.text)) {
183
+ this.successSignalSeen = true;
184
+ }
185
+ }
186
+ }
187
+ }
114
188
  }
115
189
 
116
190
  /**
@@ -138,6 +212,10 @@ export class Supervisor {
138
212
  * @param {string} [deps.model] - Claude model identifier
139
213
  * @param {number} [deps.maxTurns] - Maximum supervisor ↔ agent exchanges
140
214
  * @param {string[]} [deps.allowedTools] - Tools the agent may use
215
+ * @param {string[]} [deps.supervisorAllowedTools] - Tools the supervisor may use (default: Bash, Read, Glob, Grep, Write, Edit)
216
+ * @param {string[]} [deps.supervisorDisallowedTools] - Tools to explicitly block from the supervisor
217
+ * @param {string} [deps.supervisorProfile] - Supervisor agent profile name
218
+ * @param {string} [deps.agentProfile] - Agent profile name
141
219
  * @returns {Supervisor}
142
220
  */
143
221
  export function createSupervisor({
@@ -148,6 +226,10 @@ export function createSupervisor({
148
226
  model,
149
227
  maxTurns,
150
228
  allowedTools,
229
+ supervisorDisallowedTools,
230
+ supervisorAllowedTools,
231
+ supervisorProfile,
232
+ agentProfile,
151
233
  }) {
152
234
  // Forward-reference: onLine captures `supervisor` before construction completes.
153
235
  // This is safe because onLine is only called during run(), after construction.
@@ -163,17 +245,45 @@ export function createSupervisor({
163
245
  allowedTools,
164
246
  onLine,
165
247
  settingSources: ["project"],
248
+ agentProfile,
249
+ systemPrompt: {
250
+ type: "preset",
251
+ preset: "claude_code",
252
+ append: AGENT_SYSTEM_PROMPT,
253
+ },
166
254
  });
167
255
 
256
+ // Block Task/TaskOutput so the supervisor cannot spawn its own sub-agents.
257
+ // The relay loop handles agent communication — letting the supervisor use
258
+ // Task would bypass the relay and produce an empty agent trace.
259
+ const defaultDisallowed = ["Task", "TaskOutput"];
260
+ const disallowedTools = supervisorDisallowedTools
261
+ ? [...new Set([...defaultDisallowed, ...supervisorDisallowedTools])]
262
+ : defaultDisallowed;
263
+
168
264
  const supervisorRunner = createAgentRunner({
169
265
  cwd: supervisorCwd,
170
266
  query,
171
267
  output: new PassThrough(),
172
268
  model,
173
269
  maxTurns: 10,
174
- allowedTools: ["Read", "Glob", "Grep"],
270
+ allowedTools: supervisorAllowedTools ?? [
271
+ "Bash",
272
+ "Read",
273
+ "Glob",
274
+ "Grep",
275
+ "Write",
276
+ "Edit",
277
+ ],
278
+ disallowedTools,
175
279
  onLine,
176
280
  settingSources: ["project"],
281
+ agentProfile: supervisorProfile,
282
+ systemPrompt: {
283
+ type: "preset",
284
+ preset: "claude_code",
285
+ append: SUPERVISOR_SYSTEM_PROMPT,
286
+ },
177
287
  });
178
288
 
179
289
  supervisor = new Supervisor({
package/src/tee-writer.js CHANGED
@@ -107,7 +107,6 @@ export class TeeWriter extends Writable {
107
107
  if (parsed.event) {
108
108
  if (parsed.source && parsed.source !== this.lastSource) {
109
109
  this.lastSource = parsed.source;
110
- this.textStream.write(`\n[${parsed.source}]\n`);
111
110
  }
112
111
  this.collector.addLine(JSON.stringify(parsed.event));
113
112
  this.flushTurns();
@@ -119,15 +118,19 @@ export class TeeWriter extends Writable {
119
118
  */
120
119
  flushTurns() {
121
120
  const turns = this.collector.turns;
121
+ const prefix =
122
+ this.mode === "supervised" && this.lastSource
123
+ ? `[${this.lastSource}] `
124
+ : "";
122
125
  while (this.turnsEmitted < turns.length) {
123
126
  const turn = turns[this.turnsEmitted++];
124
127
  if (turn.role === "assistant") {
125
128
  for (const block of turn.content) {
126
129
  if (block.type === "text") {
127
- this.textStream.write(block.text + "\n");
130
+ this.textStream.write(`${prefix}${block.text}\n`);
128
131
  } else if (block.type === "tool_use") {
129
132
  const input = summarizeInput(block.input);
130
- this.textStream.write(`> Tool: ${block.name} ${input}\n`);
133
+ this.textStream.write(`${prefix}> Tool: ${block.name} ${input}\n`);
131
134
  }
132
135
  }
133
136
  }
@@ -38,6 +38,13 @@ export class TraceCollector {
38
38
  return;
39
39
  }
40
40
 
41
+ // Unwrap combined supervised trace format {source, turn, event}.
42
+ // The Supervisor emits this wrapper; when replayed through addLine the
43
+ // inner event is the one we need.
44
+ if (event.event && !event.type && typeof event.source === "string") {
45
+ event = event.event;
46
+ }
47
+
41
48
  switch (event.type) {
42
49
  case "system":
43
50
  this.handleSystem(event);
@@ -6,8 +6,10 @@ import {
6
6
  AgentRunner,
7
7
  Supervisor,
8
8
  createSupervisor,
9
+ SUPERVISOR_SYSTEM_PROMPT,
10
+ AGENT_SYSTEM_PROMPT,
9
11
  } from "@forwardimpact/libeval";
10
- import { isDone } from "../src/supervisor.js";
12
+ import { isSuccessful } from "../src/supervisor.js";
11
13
 
12
14
  /**
13
15
  * Create a mock AgentRunner that yields pre-scripted responses.
@@ -61,26 +63,50 @@ function createMockRunner(responses, messages) {
61
63
  return runner;
62
64
  }
63
65
 
64
- describe("isDone", () => {
65
- test("detects EVALUATION_COMPLETE on its own line", () => {
66
- assert.strictEqual(isDone("EVALUATION_COMPLETE"), true);
66
+ describe("isSuccessful", () => {
67
+ test("detects EVALUATION_SUCCESSFUL on its own line", () => {
68
+ assert.strictEqual(isSuccessful("EVALUATION_SUCCESSFUL"), true);
67
69
  assert.strictEqual(
68
- isDone("Some text\nEVALUATION_COMPLETE\nMore text"),
70
+ isSuccessful("Some text\nEVALUATION_SUCCESSFUL\nMore text"),
69
71
  true,
70
72
  );
71
- assert.strictEqual(isDone("Done.\n\nEVALUATION_COMPLETE"), true);
73
+ assert.strictEqual(isSuccessful("Done.\n\nEVALUATION_SUCCESSFUL"), true);
72
74
  });
73
75
 
74
- test("does not match EVALUATION_COMPLETE embedded in text", () => {
75
- assert.strictEqual(isDone("not EVALUATION_COMPLETE yet"), false);
76
- assert.strictEqual(isDone("The agent is EVALUATION_COMPLETE done"), false);
77
- assert.strictEqual(isDone("EVALUATION_COMPLETE_EXTRA"), false);
76
+ test("tolerates markdown formatting around the signal", () => {
77
+ assert.strictEqual(isSuccessful("**EVALUATION_SUCCESSFUL**"), true);
78
+ assert.strictEqual(isSuccessful("*EVALUATION_SUCCESSFUL*"), true);
79
+ assert.strictEqual(isSuccessful("__EVALUATION_SUCCESSFUL__"), true);
80
+ assert.strictEqual(isSuccessful("_EVALUATION_SUCCESSFUL_"), true);
81
+ assert.strictEqual(isSuccessful("`EVALUATION_SUCCESSFUL`"), true);
82
+ assert.strictEqual(
83
+ isSuccessful(
84
+ "Good work.\n\n**EVALUATION_SUCCESSFUL**\n\nNow filing issues.",
85
+ ),
86
+ true,
87
+ );
88
+ });
89
+
90
+ test("matches EVALUATION_SUCCESSFUL anywhere in text", () => {
91
+ assert.strictEqual(isSuccessful("not EVALUATION_SUCCESSFUL yet"), true);
92
+ assert.strictEqual(
93
+ isSuccessful("The agent is EVALUATION_SUCCESSFUL done"),
94
+ true,
95
+ );
96
+ assert.strictEqual(
97
+ isSuccessful("Great work! EVALUATION_SUCCESSFUL. Now filing issues."),
98
+ true,
99
+ );
78
100
  });
79
101
 
80
102
  test("does not match empty or unrelated text", () => {
81
- assert.strictEqual(isDone(""), false);
82
- assert.strictEqual(isDone("All done!"), false);
83
- assert.strictEqual(isDone("DONE"), false);
103
+ assert.strictEqual(isSuccessful(""), false);
104
+ assert.strictEqual(isSuccessful("All done!"), false);
105
+ assert.strictEqual(isSuccessful("DONE"), false);
106
+ });
107
+
108
+ test("does not match old EVALUATION_COMPLETE signal", () => {
109
+ assert.strictEqual(isSuccessful("EVALUATION_COMPLETE"), false);
84
110
  });
85
111
  });
86
112
 
@@ -118,13 +144,35 @@ describe("Supervisor", () => {
118
144
  );
119
145
  });
120
146
 
121
- test("completes on EVALUATION_COMPLETE from supervisor", async () => {
147
+ test("completes on EVALUATION_SUCCESSFUL from supervisor at turn 0", async () => {
148
+ const agentRunner = createMockRunner([]);
149
+
150
+ const supervisorRunner = createMockRunner([
151
+ { text: "EVALUATION_SUCCESSFUL" },
152
+ ]);
153
+
154
+ const output = new PassThrough();
155
+ const supervisor = new Supervisor({
156
+ agentRunner,
157
+ supervisorRunner,
158
+ output,
159
+ maxTurns: 10,
160
+ });
161
+
162
+ const result = await supervisor.run("Install stuff");
163
+
164
+ assert.strictEqual(result.success, true);
165
+ assert.strictEqual(result.turns, 0);
166
+ });
167
+
168
+ test("completes after one agent turn", async () => {
122
169
  const agentRunner = createMockRunner([
123
170
  { text: "I installed the packages." },
124
171
  ]);
125
172
 
126
173
  const supervisorRunner = createMockRunner([
127
- { text: "Good work.\n\nEVALUATION_COMPLETE" },
174
+ { text: "Welcome! Please install the packages." },
175
+ { text: "Good work.\n\nEVALUATION_SUCCESSFUL" },
128
176
  ]);
129
177
 
130
178
  const output = new PassThrough();
@@ -141,6 +189,67 @@ describe("Supervisor", () => {
141
189
  assert.strictEqual(result.turns, 1);
142
190
  });
143
191
 
192
+ test("detects EVALUATION_SUCCESSFUL in streamed messages when result text differs", async () => {
193
+ // Simulates the real failure: supervisor writes EVALUATION_SUCCESSFUL in
194
+ // an early message, then continues with follow-up work (e.g. filing issues).
195
+ // The SDK result text reflects only the final message, which does NOT
196
+ // contain the signal.
197
+ const agentRunner = createMockRunner([
198
+ { text: "I installed the packages." },
199
+ ]);
200
+
201
+ // The supervisor's result text is the Summary (no signal), but messages
202
+ // include one with EVALUATION_SUCCESSFUL.
203
+ const supervisorMessages = [
204
+ undefined, // turn 0: use default
205
+ [
206
+ {
207
+ type: "assistant",
208
+ message: {
209
+ content: [
210
+ {
211
+ type: "text",
212
+ text: "Good work.\n\nEVALUATION_SUCCESSFUL\n\nNow filing issues.",
213
+ },
214
+ ],
215
+ },
216
+ },
217
+ {
218
+ type: "assistant",
219
+ message: {
220
+ content: [
221
+ { type: "text", text: "## Summary\n\nAll issues filed." },
222
+ ],
223
+ },
224
+ },
225
+ ],
226
+ ];
227
+
228
+ const supervisorRunner = createMockRunner(
229
+ [
230
+ { text: "Welcome! Please install the packages." },
231
+ // Result text is the final message — does NOT contain the signal
232
+ { text: "## Summary\n\nAll issues filed." },
233
+ ],
234
+ supervisorMessages,
235
+ );
236
+
237
+ const output = new PassThrough();
238
+ const supervisor = new Supervisor({
239
+ agentRunner,
240
+ supervisorRunner,
241
+ output,
242
+ maxTurns: 10,
243
+ });
244
+ agentRunner.onLine = (line) => supervisor.emitLine(line);
245
+ supervisorRunner.onLine = (line) => supervisor.emitLine(line);
246
+
247
+ const result = await supervisor.run("Install stuff");
248
+
249
+ assert.strictEqual(result.success, true);
250
+ assert.strictEqual(result.turns, 1);
251
+ });
252
+
144
253
  test("runs multiple turns before completion", async () => {
145
254
  const agentRunner = createMockRunner([
146
255
  { text: "Started working." },
@@ -149,9 +258,10 @@ describe("Supervisor", () => {
149
258
  ]);
150
259
 
151
260
  const supervisorRunner = createMockRunner([
261
+ { text: "Here is your task. Do the work." },
152
262
  { text: "Keep going, you need to do more." },
153
263
  { text: "Almost there, continue." },
154
- { text: "EVALUATION_COMPLETE" },
264
+ { text: "EVALUATION_SUCCESSFUL" },
155
265
  ]);
156
266
 
157
267
  const output = new PassThrough();
@@ -169,14 +279,14 @@ describe("Supervisor", () => {
169
279
  });
170
280
 
171
281
  test("enforces maxTurns limit", async () => {
172
- // Agent responds to every turn, supervisor never says done
282
+ // Supervisor starts, agent responds each turn, supervisor never says done
173
283
  const agentRunner = createMockRunner([
174
- { text: "Turn 0" },
175
284
  { text: "Turn 1" },
176
285
  { text: "Turn 2" },
177
286
  ]);
178
287
 
179
288
  const supervisorRunner = createMockRunner([
289
+ { text: "Start working." },
180
290
  { text: "Continue." },
181
291
  { text: "Continue." },
182
292
  ]);
@@ -196,16 +306,17 @@ describe("Supervisor", () => {
196
306
  });
197
307
 
198
308
  test("output contains tagged lines with correct source and turn", async () => {
199
- const agentMessages = [[{ type: "assistant", content: "Working" }]];
200
309
  const supervisorMessages = [
201
- [{ type: "assistant", content: "EVALUATION_COMPLETE" }],
310
+ [{ type: "assistant", content: "Go ahead" }],
311
+ [{ type: "assistant", content: "EVALUATION_SUCCESSFUL" }],
202
312
  ];
313
+ const agentMessages = [[{ type: "assistant", content: "Working" }]];
203
314
 
204
- const agentRunner = createMockRunner([{ text: "Working" }], agentMessages);
205
315
  const supervisorRunner = createMockRunner(
206
- [{ text: "EVALUATION_COMPLETE" }],
316
+ [{ text: "Go ahead" }, { text: "EVALUATION_SUCCESSFUL" }],
207
317
  supervisorMessages,
208
318
  );
319
+ const agentRunner = createMockRunner([{ text: "Working" }], agentMessages);
209
320
 
210
321
  const output = new PassThrough();
211
322
  const supervisor = new Supervisor({
@@ -225,19 +336,19 @@ describe("Supervisor", () => {
225
336
  .split("\n")
226
337
  .filter((l) => l.length > 0);
227
338
 
228
- // Should have: agent turn 0, supervisor turn 1, orchestrator summary
229
- assert.ok(lines.length >= 3);
230
-
231
- const agentLine = JSON.parse(lines[0]);
232
- assert.strictEqual(agentLine.source, "agent");
233
- assert.strictEqual(agentLine.turn, 0);
234
- assert.ok("event" in agentLine);
339
+ // Should have: supervisor turn 0, agent turn 1, supervisor turn 1, orchestrator summary
340
+ assert.ok(lines.length >= 4);
235
341
 
236
- const supervisorLine = JSON.parse(lines[1]);
342
+ const supervisorLine = JSON.parse(lines[0]);
237
343
  assert.strictEqual(supervisorLine.source, "supervisor");
238
- assert.strictEqual(supervisorLine.turn, 1);
344
+ assert.strictEqual(supervisorLine.turn, 0);
239
345
  assert.ok("event" in supervisorLine);
240
346
 
347
+ const agentLine = JSON.parse(lines[1]);
348
+ assert.strictEqual(agentLine.source, "agent");
349
+ assert.strictEqual(agentLine.turn, 1);
350
+ assert.ok("event" in agentLine);
351
+
241
352
  const summaryLine = JSON.parse(lines[lines.length - 1]);
242
353
  assert.strictEqual(summaryLine.source, "orchestrator");
243
354
  assert.strictEqual(summaryLine.type, "summary");
@@ -250,11 +361,14 @@ describe("Supervisor", () => {
250
361
  source: "sdk-internal",
251
362
  content: "test",
252
363
  };
253
- const agentRunner = createMockRunner([{ text: "Done" }], [[sourceEvent]]);
254
364
  const supervisorRunner = createMockRunner(
255
- [{ text: "EVALUATION_COMPLETE" }],
256
- [[{ type: "assistant", content: "ok" }]],
365
+ [{ text: "Go" }, { text: "EVALUATION_SUCCESSFUL" }],
366
+ [
367
+ [{ type: "assistant", content: "Go" }],
368
+ [{ type: "assistant", content: "ok" }],
369
+ ],
257
370
  );
371
+ const agentRunner = createMockRunner([{ text: "Done" }], [[sourceEvent]]);
258
372
 
259
373
  const output = new PassThrough();
260
374
  const supervisor = new Supervisor({
@@ -274,27 +388,30 @@ describe("Supervisor", () => {
274
388
  .split("\n")
275
389
  .filter((l) => l.length > 0);
276
390
 
277
- const tagged = JSON.parse(lines[0]);
391
+ // First line is supervisor turn 0, second is agent turn 1
392
+ const tagged = JSON.parse(lines[1]);
278
393
  // The original event's `source` field is preserved inside `event`
279
394
  assert.strictEqual(tagged.source, "agent");
280
395
  assert.strictEqual(tagged.event.source, "sdk-internal");
281
396
  });
282
397
 
283
- test("emits agent output and summary when agent errors on turn 0", async () => {
284
- const agentMessages = [[{ type: "assistant", content: "Partial work" }]];
285
- const agentRunner = createMockRunner(
286
- [{ text: "Partial work", success: false }],
287
- agentMessages,
398
+ test("emits supervisor output and summary when supervisor errors on turn 0", async () => {
399
+ const supervisorMessages = [
400
+ [{ type: "assistant", content: "Starting..." }],
401
+ ];
402
+ const supervisorRunner = createMockRunner(
403
+ [{ text: "Starting...", success: false }],
404
+ supervisorMessages,
288
405
  );
289
406
 
290
407
  // Override run to simulate an error return
291
- const origRun = agentRunner.run;
292
- agentRunner.run = async (task) => {
293
- const result = await origRun.call(agentRunner, task);
408
+ const origRun = supervisorRunner.run;
409
+ supervisorRunner.run = async (task) => {
410
+ const result = await origRun.call(supervisorRunner, task);
294
411
  return { ...result, error: new Error("Process exited with code 1") };
295
412
  };
296
413
 
297
- const supervisorRunner = createMockRunner([]);
414
+ const agentRunner = createMockRunner([]);
298
415
 
299
416
  const output = new PassThrough();
300
417
  const supervisor = new Supervisor({
@@ -311,18 +428,18 @@ describe("Supervisor", () => {
311
428
  assert.strictEqual(result.success, false);
312
429
  assert.strictEqual(result.turns, 0);
313
430
 
314
- // Output should still contain the agent's buffered lines + summary
431
+ // Output should still contain the supervisor's buffered lines + summary
315
432
  const data = output.read()?.toString() ?? "";
316
433
  const lines = data
317
434
  .trim()
318
435
  .split("\n")
319
436
  .filter((l) => l.length > 0);
320
437
 
321
- assert.ok(lines.length >= 2, "Expected at least agent line + summary");
438
+ assert.ok(lines.length >= 2, "Expected at least supervisor line + summary");
322
439
 
323
- const agentLine = JSON.parse(lines[0]);
324
- assert.strictEqual(agentLine.source, "agent");
325
- assert.strictEqual(agentLine.turn, 0);
440
+ const supervisorLine = JSON.parse(lines[0]);
441
+ assert.strictEqual(supervisorLine.source, "supervisor");
442
+ assert.strictEqual(supervisorLine.turn, 0);
326
443
 
327
444
  const summaryLine = JSON.parse(lines[lines.length - 1]);
328
445
  assert.strictEqual(summaryLine.source, "orchestrator");
@@ -339,4 +456,99 @@ describe("Supervisor", () => {
339
456
  });
340
457
  assert.ok(supervisor instanceof Supervisor);
341
458
  });
459
+
460
+ test("createSupervisor uses default supervisor tools when none specified", () => {
461
+ const supervisor = createSupervisor({
462
+ supervisorCwd: "/tmp/sup",
463
+ agentCwd: "/tmp/agent",
464
+ query: async function* () {},
465
+ output: new PassThrough(),
466
+ });
467
+ assert.deepStrictEqual(supervisor.supervisorRunner.allowedTools, [
468
+ "Bash",
469
+ "Read",
470
+ "Glob",
471
+ "Grep",
472
+ "Write",
473
+ "Edit",
474
+ ]);
475
+ });
476
+
477
+ test("createSupervisor passes custom supervisor tools", () => {
478
+ const supervisor = createSupervisor({
479
+ supervisorCwd: "/tmp/sup",
480
+ agentCwd: "/tmp/agent",
481
+ query: async function* () {},
482
+ output: new PassThrough(),
483
+ supervisorAllowedTools: ["Read", "Glob", "Grep"],
484
+ });
485
+ assert.deepStrictEqual(supervisor.supervisorRunner.allowedTools, [
486
+ "Read",
487
+ "Glob",
488
+ "Grep",
489
+ ]);
490
+ });
491
+
492
+ test("createSupervisor wires system prompts to both runners", () => {
493
+ const supervisor = createSupervisor({
494
+ supervisorCwd: "/tmp/sup",
495
+ agentCwd: "/tmp/agent",
496
+ query: async function* () {},
497
+ output: new PassThrough(),
498
+ });
499
+
500
+ assert.deepStrictEqual(supervisor.agentRunner.systemPrompt, {
501
+ type: "preset",
502
+ preset: "claude_code",
503
+ append: AGENT_SYSTEM_PROMPT,
504
+ });
505
+ assert.deepStrictEqual(supervisor.supervisorRunner.systemPrompt, {
506
+ type: "preset",
507
+ preset: "claude_code",
508
+ append: SUPERVISOR_SYSTEM_PROMPT,
509
+ });
510
+ });
511
+
512
+ test("createSupervisor blocks Task and TaskOutput on supervisor by default", () => {
513
+ const supervisor = createSupervisor({
514
+ supervisorCwd: "/tmp/sup",
515
+ agentCwd: "/tmp/agent",
516
+ query: async function* () {},
517
+ output: new PassThrough(),
518
+ });
519
+ assert.deepStrictEqual(supervisor.supervisorRunner.disallowedTools, [
520
+ "Task",
521
+ "TaskOutput",
522
+ ]);
523
+ // Agent should not have disallowed tools
524
+ assert.deepStrictEqual(supervisor.agentRunner.disallowedTools, []);
525
+ });
526
+
527
+ test("createSupervisor merges custom supervisorDisallowedTools with defaults", () => {
528
+ const supervisor = createSupervisor({
529
+ supervisorCwd: "/tmp/sup",
530
+ agentCwd: "/tmp/agent",
531
+ query: async function* () {},
532
+ output: new PassThrough(),
533
+ supervisorDisallowedTools: ["WebSearch", "Task"],
534
+ });
535
+ const disallowed = supervisor.supervisorRunner.disallowedTools;
536
+ assert.ok(disallowed.includes("Task"));
537
+ assert.ok(disallowed.includes("TaskOutput"));
538
+ assert.ok(disallowed.includes("WebSearch"));
539
+ // No duplicates
540
+ assert.strictEqual(disallowed.length, new Set(disallowed).size);
541
+ });
542
+
543
+ test("system prompt constants are non-empty strings", () => {
544
+ assert.ok(typeof SUPERVISOR_SYSTEM_PROMPT === "string");
545
+ assert.ok(typeof AGENT_SYSTEM_PROMPT === "string");
546
+ assert.ok(SUPERVISOR_SYSTEM_PROMPT.length > 0);
547
+ assert.ok(AGENT_SYSTEM_PROMPT.length > 0);
548
+ });
549
+
550
+ test("SUPERVISOR_SYSTEM_PROMPT explains relay mechanism", () => {
551
+ assert.ok(SUPERVISOR_SYSTEM_PROMPT.includes("relay"));
552
+ assert.ok(SUPERVISOR_SYSTEM_PROMPT.includes("EVALUATION_SUCCESSFUL"));
553
+ });
342
554
  });
@@ -187,11 +187,9 @@ describe("TeeWriter", () => {
187
187
  assert.strictEqual(fileLines.length, 3);
188
188
  assert.strictEqual(JSON.parse(fileLines[0]).source, "agent");
189
189
 
190
- // Text should show source labels
191
- assert.ok(textData.includes("[agent]"));
192
- assert.ok(textData.includes("Working on it"));
193
- assert.ok(textData.includes("[supervisor]"));
194
- assert.ok(textData.includes("Looks good"));
190
+ // Text should show source prefixes on content lines
191
+ assert.ok(textData.includes("[agent] Working on it"));
192
+ assert.ok(textData.includes("[supervisor] Looks good"));
195
193
  assert.ok(textData.includes("Evaluation completed after 1 turns"));
196
194
  });
197
195
 
@@ -254,9 +252,9 @@ describe("TeeWriter", () => {
254
252
  await writeLines(writer, events);
255
253
 
256
254
  const textData = collect(textStream);
257
- // [agent] label should appear only once
258
- const agentLabels = textData.split("[agent]").length - 1;
259
- assert.strictEqual(agentLabels, 1);
255
+ // [agent] prefix should appear on each content line
256
+ assert.ok(textData.includes("[agent] Step 1"));
257
+ assert.ok(textData.includes("[agent] Step 2"));
260
258
  });
261
259
 
262
260
  test("handles partial lines across chunks", async () => {
@@ -149,6 +149,102 @@ describe("TraceCollector", () => {
149
149
  assert.strictEqual(trace.summary.tokenUsage.inputTokens, 5000);
150
150
  });
151
151
 
152
+ test("unwraps combined supervised trace format {source, turn, event}", () => {
153
+ const collector = new TraceCollector();
154
+
155
+ // System init wrapped in supervisor envelope
156
+ collector.addLine(
157
+ JSON.stringify({
158
+ source: "agent",
159
+ turn: 0,
160
+ event: {
161
+ type: "system",
162
+ subtype: "init",
163
+ session_id: "sess-supervised",
164
+ model: "claude-opus-4-6",
165
+ tools: ["Bash"],
166
+ },
167
+ }),
168
+ );
169
+
170
+ // Assistant message wrapped in supervisor envelope
171
+ collector.addLine(
172
+ JSON.stringify({
173
+ source: "agent",
174
+ turn: 1,
175
+ event: {
176
+ type: "assistant",
177
+ message: {
178
+ content: [{ type: "text", text: "I ran the tests." }],
179
+ usage: { input_tokens: 100, output_tokens: 50 },
180
+ },
181
+ },
182
+ }),
183
+ );
184
+
185
+ // Tool result wrapped in supervisor envelope
186
+ collector.addLine(
187
+ JSON.stringify({
188
+ source: "agent",
189
+ turn: 1,
190
+ event: {
191
+ type: "user",
192
+ message: {
193
+ role: "user",
194
+ content: [
195
+ {
196
+ type: "tool_result",
197
+ tool_use_id: "toolu_sup",
198
+ content: "All tests passed",
199
+ },
200
+ ],
201
+ },
202
+ },
203
+ }),
204
+ );
205
+
206
+ // Result event wrapped in supervisor envelope
207
+ collector.addLine(
208
+ JSON.stringify({
209
+ source: "supervisor",
210
+ turn: 1,
211
+ event: {
212
+ type: "result",
213
+ subtype: "success",
214
+ total_cost_usd: 0.44,
215
+ duration_ms: 30000,
216
+ num_turns: 2,
217
+ },
218
+ }),
219
+ );
220
+
221
+ const trace = collector.toJSON();
222
+ assert.strictEqual(trace.metadata.sessionId, "sess-supervised");
223
+ assert.strictEqual(trace.turns.length, 2);
224
+ assert.strictEqual(trace.turns[0].role, "assistant");
225
+ assert.strictEqual(trace.turns[0].content[0].text, "I ran the tests.");
226
+ assert.strictEqual(trace.turns[1].role, "tool_result");
227
+ assert.strictEqual(trace.turns[1].content, "All tests passed");
228
+ assert.strictEqual(trace.summary.result, "success");
229
+ assert.strictEqual(trace.summary.totalCostUsd, 0.44);
230
+ });
231
+
232
+ test("skips orchestrator summary lines from supervised traces", () => {
233
+ const collector = new TraceCollector();
234
+ collector.addLine(
235
+ JSON.stringify({
236
+ source: "orchestrator",
237
+ type: "summary",
238
+ success: true,
239
+ turns: 3,
240
+ }),
241
+ );
242
+
243
+ // Orchestrator summaries have no inner event and no recognized type
244
+ // after unwrap — they should be silently skipped.
245
+ assert.strictEqual(collector.toJSON().turns.length, 0);
246
+ });
247
+
152
248
  test("skips rate_limit_event and unknown types", () => {
153
249
  const collector = new TraceCollector();
154
250
  collector.addLine(