@forwardimpact/libeval 0.1.3 → 0.1.6
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those versions as they appear in their respective public registries.
- package/bin/fit-eval.js +12 -4
- package/index.js +6 -1
- package/package.json +4 -3
- package/src/agent-runner.js +19 -1
- package/src/commands/run.js +12 -4
- package/src/commands/supervise.js +20 -4
- package/src/supervisor.js +141 -31
- package/src/tee-writer.js +6 -3
- package/src/trace-collector.js +7 -0
- package/test/supervisor.test.js +261 -49
- package/test/tee-writer.test.js +6 -8
- package/test/trace-collector.test.js +96 -0
package/bin/fit-eval.js
CHANGED
|
@@ -25,21 +25,28 @@ Commands:
|
|
|
25
25
|
supervise [options] Run a supervised agent ↔ supervisor relay loop
|
|
26
26
|
|
|
27
27
|
Run options:
|
|
28
|
-
--task=PATH
|
|
28
|
+
--task-file=PATH Path to task file (mutually exclusive with --task-text)
|
|
29
|
+
--task-text=STRING Inline task text (mutually exclusive with --task-file)
|
|
29
30
|
--cwd=DIR Agent working directory (default: .)
|
|
30
31
|
--model=MODEL Claude model to use (default: opus)
|
|
31
32
|
--max-turns=N Maximum agentic turns (default: 50)
|
|
32
33
|
--output=PATH Write NDJSON trace to file (default: stdout)
|
|
33
34
|
--allowed-tools=LIST Comma-separated tools (default: Bash,Read,Glob,Grep,Write,Edit)
|
|
35
|
+
--agent-profile=NAME Agent profile name (passed as --agent to Claude CLI)
|
|
34
36
|
|
|
35
37
|
Supervise options:
|
|
36
|
-
--task=PATH
|
|
38
|
+
--task-file=PATH Path to task file (mutually exclusive with --task-text)
|
|
39
|
+
--task-text=STRING Inline task text (mutually exclusive with --task-file)
|
|
37
40
|
--supervisor-cwd=DIR Supervisor working directory (default: .)
|
|
38
41
|
--agent-cwd=DIR Agent working directory (default: temp directory)
|
|
39
42
|
--model=MODEL Claude model to use (default: opus)
|
|
40
43
|
--max-turns=N Maximum supervisor ↔ agent exchanges (default: 20)
|
|
41
44
|
--output=PATH Write NDJSON trace to file (default: stdout)
|
|
42
45
|
--allowed-tools=LIST Comma-separated tools for agent (default: Bash,Read,Glob,Grep,Write,Edit)
|
|
46
|
+
--supervisor-allowed-tools=LIST
|
|
47
|
+
Comma-separated tools for supervisor (default: Bash,Read,Glob,Grep,Write,Edit)
|
|
48
|
+
--supervisor-profile=NAME Supervisor agent profile name (passed as --agent to Claude CLI)
|
|
49
|
+
--agent-profile=NAME Agent profile name (passed as --agent to Claude CLI)
|
|
43
50
|
|
|
44
51
|
Options:
|
|
45
52
|
--help Show this help message
|
|
@@ -50,8 +57,9 @@ Examples:
|
|
|
50
57
|
fit-eval output --format=json < trace.ndjson
|
|
51
58
|
fit-eval tee < trace.ndjson
|
|
52
59
|
fit-eval tee output.ndjson < trace.ndjson
|
|
53
|
-
fit-eval run --task
|
|
54
|
-
fit-eval
|
|
60
|
+
fit-eval run --task-text="Perform a security audit of the repository." --model=opus
|
|
61
|
+
fit-eval run --task-file=scenarios/guide-setup/task.md --model=opus
|
|
62
|
+
fit-eval supervise --task-file=scenarios/guide-setup/task.md --supervisor-cwd=.
|
|
55
63
|
`.trim();
|
|
56
64
|
|
|
57
65
|
async function main() {
|
package/index.js
CHANGED
|
@@ -1,4 +1,9 @@
|
|
|
1
1
|
export { TraceCollector, createTraceCollector } from "./src/trace-collector.js";
|
|
2
2
|
export { AgentRunner, createAgentRunner } from "./src/agent-runner.js";
|
|
3
|
-
export {
|
|
3
|
+
export {
|
|
4
|
+
Supervisor,
|
|
5
|
+
createSupervisor,
|
|
6
|
+
SUPERVISOR_SYSTEM_PROMPT,
|
|
7
|
+
AGENT_SYSTEM_PROMPT,
|
|
8
|
+
} from "./src/supervisor.js";
|
|
4
9
|
export { TeeWriter, createTeeWriter } from "./src/tee-writer.js";
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@forwardimpact/libeval",
|
|
3
|
-
"version": "0.1.3",
|
|
3
|
+
"version": "0.1.6",
|
|
4
4
|
"description": "Process Claude Code stream-json output into structured traces",
|
|
5
5
|
"license": "Apache-2.0",
|
|
6
6
|
"author": "D. Olsson <hi@senzilla.io>",
|
|
@@ -10,13 +10,14 @@
|
|
|
10
10
|
"fit-eval": "./bin/fit-eval.js"
|
|
11
11
|
},
|
|
12
12
|
"engines": {
|
|
13
|
-
"bun": ">=1.2.0"
|
|
13
|
+
"bun": ">=1.2.0",
|
|
14
|
+
"node": ">=18.0.0"
|
|
14
15
|
},
|
|
15
16
|
"scripts": {
|
|
16
17
|
"test": "bun run node --test test/*.test.js"
|
|
17
18
|
},
|
|
18
19
|
"dependencies": {
|
|
19
|
-
"@anthropic-ai/claude-agent-sdk": "^0.
|
|
20
|
+
"@anthropic-ai/claude-agent-sdk": "^0.2.91"
|
|
20
21
|
},
|
|
21
22
|
"publishConfig": {
|
|
22
23
|
"access": "public"
|
package/src/agent-runner.js
CHANGED
|
@@ -18,6 +18,9 @@ export class AgentRunner {
|
|
|
18
18
|
* @param {string} [deps.permissionMode] - SDK permission mode
|
|
19
19
|
* @param {function} [deps.onLine] - Callback invoked with each NDJSON line as it's produced
|
|
20
20
|
* @param {string[]} [deps.settingSources] - SDK setting sources (e.g. ['project'] to load CLAUDE.md)
|
|
21
|
+
* @param {string} [deps.agentProfile] - Agent profile name to pass as --agent to the Claude CLI
|
|
22
|
+
* @param {string|object} [deps.systemPrompt] - SDK system prompt (string replaces default; {type:'preset', preset:'claude_code', append} appends)
|
|
23
|
+
* @param {string[]} [deps.disallowedTools] - Tools to explicitly remove from the model's context
|
|
21
24
|
*/
|
|
22
25
|
constructor({
|
|
23
26
|
cwd,
|
|
@@ -29,6 +32,9 @@ export class AgentRunner {
|
|
|
29
32
|
permissionMode,
|
|
30
33
|
onLine,
|
|
31
34
|
settingSources,
|
|
35
|
+
agentProfile,
|
|
36
|
+
systemPrompt,
|
|
37
|
+
disallowedTools,
|
|
32
38
|
}) {
|
|
33
39
|
if (!cwd) throw new Error("cwd is required");
|
|
34
40
|
if (!query) throw new Error("query is required");
|
|
@@ -49,6 +55,9 @@ export class AgentRunner {
|
|
|
49
55
|
this.permissionMode = permissionMode ?? "bypassPermissions";
|
|
50
56
|
this.onLine = onLine ?? null;
|
|
51
57
|
this.settingSources = settingSources ?? [];
|
|
58
|
+
this.agentProfile = agentProfile ?? null;
|
|
59
|
+
this.systemPrompt = systemPrompt ?? null;
|
|
60
|
+
this.disallowedTools = disallowedTools ?? [];
|
|
52
61
|
this.sessionId = null;
|
|
53
62
|
this.buffer = [];
|
|
54
63
|
}
|
|
@@ -74,6 +83,11 @@ export class AgentRunner {
|
|
|
74
83
|
permissionMode: this.permissionMode,
|
|
75
84
|
allowDangerouslySkipPermissions: true,
|
|
76
85
|
settingSources: this.settingSources,
|
|
86
|
+
...(this.disallowedTools.length > 0 && {
|
|
87
|
+
disallowedTools: this.disallowedTools,
|
|
88
|
+
}),
|
|
89
|
+
...(this.systemPrompt && { systemPrompt: this.systemPrompt }),
|
|
90
|
+
...(this.agentProfile && { extraArgs: { agent: this.agentProfile } }),
|
|
77
91
|
},
|
|
78
92
|
})) {
|
|
79
93
|
const line = JSON.stringify(message);
|
|
@@ -113,7 +127,11 @@ export class AgentRunner {
|
|
|
113
127
|
try {
|
|
114
128
|
for await (const message of this.query({
|
|
115
129
|
prompt,
|
|
116
|
-
options: {
|
|
130
|
+
options: {
|
|
131
|
+
resume: this.sessionId,
|
|
132
|
+
permissionMode: this.permissionMode,
|
|
133
|
+
allowDangerouslySkipPermissions: true,
|
|
134
|
+
},
|
|
117
135
|
})) {
|
|
118
136
|
const line = JSON.stringify(message);
|
|
119
137
|
this.output.write(line + "\n");
|
package/src/commands/run.js
CHANGED
|
@@ -24,28 +24,35 @@ function parseFlag(args, name) {
|
|
|
24
24
|
* Usage: fit-eval run [options]
|
|
25
25
|
*
|
|
26
26
|
* Options:
|
|
27
|
-
* --task=PATH
|
|
27
|
+
* --task-file=PATH Path to task file (mutually exclusive with --task-text)
|
|
28
|
+
* --task-text=STRING Inline task text (mutually exclusive with --task-file)
|
|
28
29
|
* --cwd=DIR Agent working directory (default: .)
|
|
29
30
|
* --model=MODEL Claude model to use (default: opus)
|
|
30
31
|
* --max-turns=N Maximum agentic turns (default: 50)
|
|
31
32
|
* --output=PATH Write NDJSON trace to file (default: stdout)
|
|
32
33
|
* --allowed-tools=LIST Comma-separated tools (default: Bash,Read,Glob,Grep,Write,Edit)
|
|
34
|
+
* --agent-profile=NAME Agent profile name (passed as --agent to Claude CLI)
|
|
33
35
|
*
|
|
34
36
|
* @param {string[]} args - Command arguments
|
|
35
37
|
*/
|
|
36
38
|
export async function runRunCommand(args) {
|
|
37
|
-
const
|
|
38
|
-
|
|
39
|
+
const taskFile = parseFlag(args, "task-file");
|
|
40
|
+
const taskText = parseFlag(args, "task-text");
|
|
41
|
+
if (taskFile && taskText)
|
|
42
|
+
throw new Error("--task-file and --task-text are mutually exclusive");
|
|
43
|
+
if (!taskFile && !taskText)
|
|
44
|
+
throw new Error("--task-file or --task-text is required");
|
|
39
45
|
|
|
40
46
|
const cwd = resolve(parseFlag(args, "cwd") ?? ".");
|
|
41
47
|
const model = parseFlag(args, "model") ?? "opus";
|
|
42
48
|
const maxTurns = parseInt(parseFlag(args, "max-turns") ?? "50", 10);
|
|
43
49
|
const outputPath = parseFlag(args, "output");
|
|
50
|
+
const agentProfile = parseFlag(args, "agent-profile") ?? undefined;
|
|
44
51
|
const allowedTools = (
|
|
45
52
|
parseFlag(args, "allowed-tools") ?? "Bash,Read,Glob,Grep,Write,Edit"
|
|
46
53
|
).split(",");
|
|
47
54
|
|
|
48
|
-
const taskContent = readFileSync(
|
|
55
|
+
const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
|
|
49
56
|
|
|
50
57
|
// When --output is specified, stream text to stdout while writing NDJSON to file.
|
|
51
58
|
// Otherwise, write NDJSON directly to stdout (backwards-compatible).
|
|
@@ -63,6 +70,7 @@ export async function runRunCommand(args) {
|
|
|
63
70
|
maxTurns,
|
|
64
71
|
allowedTools,
|
|
65
72
|
settingSources: ["project"],
|
|
73
|
+
agentProfile,
|
|
66
74
|
});
|
|
67
75
|
|
|
68
76
|
const result = await runner.run(taskContent);
|
package/src/commands/supervise.js
CHANGED
|
@@ -25,19 +25,26 @@ function parseFlag(args, name) {
|
|
|
25
25
|
* Usage: fit-eval supervise [options]
|
|
26
26
|
*
|
|
27
27
|
* Options:
|
|
28
|
-
* --task=PATH
|
|
28
|
+
* --task-file=PATH Path to task file (mutually exclusive with --task-text)
|
|
29
|
+
* --task-text=STRING Inline task text (mutually exclusive with --task-file)
|
|
29
30
|
* --supervisor-cwd=DIR Supervisor working directory (default: .)
|
|
30
31
|
* --agent-cwd=DIR Agent working directory (default: temp directory)
|
|
31
32
|
* --model=MODEL Claude model to use (default: opus)
|
|
32
33
|
* --max-turns=N Maximum supervisor ↔ agent exchanges (default: 20)
|
|
33
34
|
* --output=PATH Write NDJSON trace to file (default: stdout)
|
|
34
35
|
* --allowed-tools=LIST Comma-separated tools for the agent (default: Bash,Read,Glob,Grep,Write,Edit)
|
|
36
|
+
* --supervisor-profile=NAME Supervisor agent profile name (passed as --agent to Claude CLI)
|
|
37
|
+
* --agent-profile=NAME Agent profile name (passed as --agent to Claude CLI)
|
|
35
38
|
*
|
|
36
39
|
* @param {string[]} args - Command arguments
|
|
37
40
|
*/
|
|
38
41
|
export async function runSuperviseCommand(args) {
|
|
39
|
-
const
|
|
40
|
-
|
|
42
|
+
const taskFile = parseFlag(args, "task-file");
|
|
43
|
+
const taskText = parseFlag(args, "task-text");
|
|
44
|
+
if (taskFile && taskText)
|
|
45
|
+
throw new Error("--task-file and --task-text are mutually exclusive");
|
|
46
|
+
if (!taskFile && !taskText)
|
|
47
|
+
throw new Error("--task-file or --task-text is required");
|
|
41
48
|
|
|
42
49
|
const supervisorCwd = resolve(parseFlag(args, "supervisor-cwd") ?? ".");
|
|
43
50
|
const agentCwd = resolve(
|
|
@@ -47,11 +54,17 @@ export async function runSuperviseCommand(args) {
|
|
|
47
54
|
const model = parseFlag(args, "model") ?? "opus";
|
|
48
55
|
const maxTurns = parseInt(parseFlag(args, "max-turns") ?? "20", 10);
|
|
49
56
|
const outputPath = parseFlag(args, "output");
|
|
57
|
+
const supervisorProfile = parseFlag(args, "supervisor-profile") ?? undefined;
|
|
58
|
+
const agentProfile = parseFlag(args, "agent-profile") ?? undefined;
|
|
50
59
|
const allowedTools = (
|
|
51
60
|
parseFlag(args, "allowed-tools") ?? "Bash,Read,Glob,Grep,Write,Edit"
|
|
52
61
|
).split(",");
|
|
62
|
+
const supervisorAllowedToolsRaw = parseFlag(args, "supervisor-allowed-tools");
|
|
63
|
+
const supervisorAllowedTools = supervisorAllowedToolsRaw
|
|
64
|
+
? supervisorAllowedToolsRaw.split(",")
|
|
65
|
+
: undefined;
|
|
53
66
|
|
|
54
|
-
const taskContent = readFileSync(
|
|
67
|
+
const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
|
|
55
68
|
|
|
56
69
|
// When --output is specified, stream text to stdout while writing NDJSON to file.
|
|
57
70
|
// Otherwise, write NDJSON directly to stdout (backwards-compatible).
|
|
@@ -73,6 +86,9 @@ export async function runSuperviseCommand(args) {
|
|
|
73
86
|
model,
|
|
74
87
|
maxTurns,
|
|
75
88
|
allowedTools,
|
|
89
|
+
supervisorAllowedTools,
|
|
90
|
+
supervisorProfile,
|
|
91
|
+
agentProfile,
|
|
76
92
|
});
|
|
77
93
|
|
|
78
94
|
const result = await supervisor.run(taskContent);
|
package/src/supervisor.js
CHANGED
|
@@ -1,25 +1,38 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Supervisor — orchestrates a relay loop between an agent and a supervisor,
|
|
3
|
-
* both running as AgentRunner instances. The
|
|
4
|
-
*
|
|
3
|
+
* both running as AgentRunner instances. The supervisor receives the task first,
|
|
4
|
+
* introduces itself, and delegates work to the agent. The loop then alternates:
|
|
5
|
+
* agent → supervisor → agent.
|
|
5
6
|
*
|
|
6
7
|
* Follows OO+DI: constructor injection, factory function, tests bypass factory.
|
|
7
8
|
*/
|
|
8
9
|
|
|
9
10
|
import { PassThrough } from "node:stream";
|
|
10
11
|
import { createAgentRunner } from "./agent-runner.js";
|
|
12
|
+
import { TraceCollector } from "./trace-collector.js";
|
|
11
13
|
|
|
12
14
|
/**
|
|
13
|
-
* Check if the supervisor's response signals evaluation
|
|
14
|
-
*
|
|
15
|
-
*
|
|
15
|
+
* Check if the supervisor's response signals evaluation success.
|
|
16
|
+
* Matches EVALUATION_SUCCESSFUL anywhere in the text, tolerating markdown
|
|
17
|
+
* formatting (e.g. **EVALUATION_SUCCESSFUL**). Uses word boundaries to
|
|
18
|
+
* avoid matching inside longer identifiers.
|
|
16
19
|
* @param {string} text
|
|
17
20
|
* @returns {boolean}
|
|
18
21
|
*/
|
|
19
|
-
export function
|
|
20
|
-
return
|
|
22
|
+
export function isSuccessful(text) {
|
|
23
|
+
return /(?:^|[\s*_~`])EVALUATION_SUCCESSFUL(?:[\s*_~`.,!?]|$)/m.test(text);
|
|
21
24
|
}
|
|
22
25
|
|
|
26
|
+
/** System prompt appended for the supervisor runner in supervise mode. */
|
|
27
|
+
export const SUPERVISOR_SYSTEM_PROMPT =
|
|
28
|
+
"You supervise another AI agent through a relay — your output becomes the agent's next input. " +
|
|
29
|
+
"Guide the agent, answer its questions, and write EVALUATION_SUCCESSFUL when their task is complete.";
|
|
30
|
+
|
|
31
|
+
/** System prompt appended for the agent runner in supervise mode. */
|
|
32
|
+
export const AGENT_SYSTEM_PROMPT =
|
|
33
|
+
"You are being supervised by another AI agent. " +
|
|
34
|
+
"When requirements are ambiguous or you are uncertain, stop and ask a clarifying question before proceeding.";
|
|
35
|
+
|
|
23
36
|
export class Supervisor {
|
|
24
37
|
/**
|
|
25
38
|
* @param {object} deps
|
|
@@ -40,67 +53,113 @@ export class Supervisor {
|
|
|
40
53
|
this.currentSource = "agent";
|
|
41
54
|
/** @type {number} */
|
|
42
55
|
this.currentTurn = 0;
|
|
56
|
+
/**
|
|
57
|
+
* Set to true when any supervisor message contains the success signal.
|
|
58
|
+
* The SDK result text only reflects the last assistant message, so when
|
|
59
|
+
* the supervisor writes EVALUATION_SUCCESSFUL in an early message and
|
|
60
|
+
* then continues with follow-up work, the result text won't contain it.
|
|
61
|
+
* This flag captures the signal from the full message stream.
|
|
62
|
+
* @type {boolean}
|
|
63
|
+
*/
|
|
64
|
+
this.successSignalSeen = false;
|
|
43
65
|
}
|
|
44
66
|
|
|
45
67
|
/**
|
|
46
68
|
* Run the supervisor ↔ agent relay loop.
|
|
47
|
-
*
|
|
69
|
+
* The supervisor receives the task first, introduces itself, and delegates
|
|
70
|
+
* work to the agent. The loop then alternates: agent → supervisor → agent.
|
|
71
|
+
* @param {string} task - The initial task for the supervisor
|
|
48
72
|
* @returns {Promise<{success: boolean, turns: number}>}
|
|
49
73
|
*/
|
|
50
74
|
async run(task) {
|
|
51
|
-
// Turn 0:
|
|
52
|
-
this.currentSource = "
|
|
75
|
+
// Turn 0: Supervisor receives the task and introduces it to the agent
|
|
76
|
+
this.currentSource = "supervisor";
|
|
53
77
|
this.currentTurn = 0;
|
|
54
|
-
|
|
78
|
+
this.successSignalSeen = false;
|
|
79
|
+
let supervisorResult = await this.supervisorRunner.run(task);
|
|
55
80
|
|
|
56
|
-
if (
|
|
81
|
+
if (supervisorResult.error) {
|
|
57
82
|
this.emitSummary({ success: false, turns: 0 });
|
|
58
83
|
return { success: false, turns: 0 };
|
|
59
84
|
}
|
|
60
85
|
|
|
61
|
-
for
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
86
|
+
// Check for the success signal in either the SDK result text or the
|
|
87
|
+
// streamed message content. The SDK result text only reflects the last
|
|
88
|
+
// assistant message, so when the supervisor writes EVALUATION_SUCCESSFUL
|
|
89
|
+
// early and then continues (e.g. filing issues), we must also check the
|
|
90
|
+
// flag set by emitLine during streaming.
|
|
91
|
+
if (this.successSignalSeen || isSuccessful(supervisorResult.text)) {
|
|
92
|
+
this.emitSummary({ success: true, turns: 0 });
|
|
93
|
+
return { success: true, turns: 0 };
|
|
94
|
+
}
|
|
66
95
|
|
|
67
|
-
|
|
96
|
+
for (let turn = 1; turn <= this.maxTurns; turn++) {
|
|
97
|
+
// Supervisor's output becomes the agent's input
|
|
98
|
+
this.currentSource = "agent";
|
|
68
99
|
this.currentTurn = turn;
|
|
69
|
-
let
|
|
100
|
+
let agentResult;
|
|
70
101
|
if (turn === 1) {
|
|
71
|
-
|
|
102
|
+
agentResult = await this.agentRunner.run(supervisorResult.text);
|
|
72
103
|
} else {
|
|
73
|
-
|
|
104
|
+
agentResult = await this.agentRunner.resume(supervisorResult.text);
|
|
74
105
|
}
|
|
75
106
|
|
|
76
|
-
if (
|
|
107
|
+
if (agentResult.error) {
|
|
77
108
|
this.emitSummary({ success: false, turns: turn });
|
|
78
109
|
return { success: false, turns: turn };
|
|
79
110
|
}
|
|
80
111
|
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
}
|
|
112
|
+
// Build the full agent transcript from buffered NDJSON events so the
|
|
113
|
+
// supervisor sees tool calls and reasoning, not just the SDK result summary.
|
|
114
|
+
const agentTranscript = this.extractTranscript(this.agentRunner);
|
|
85
115
|
|
|
86
|
-
|
|
87
|
-
|
|
116
|
+
const supervisorPrompt =
|
|
117
|
+
`The agent reported:\n\n${agentTranscript}\n\n` +
|
|
118
|
+
`Review the agent's work and decide how to proceed.`;
|
|
119
|
+
|
|
120
|
+
this.currentSource = "supervisor";
|
|
88
121
|
this.currentTurn = turn;
|
|
89
|
-
|
|
122
|
+
this.successSignalSeen = false;
|
|
123
|
+
supervisorResult = await this.supervisorRunner.resume(supervisorPrompt);
|
|
90
124
|
|
|
91
|
-
if (
|
|
125
|
+
if (supervisorResult.error) {
|
|
92
126
|
this.emitSummary({ success: false, turns: turn });
|
|
93
127
|
return { success: false, turns: turn };
|
|
94
128
|
}
|
|
129
|
+
|
|
130
|
+
// The supervisor's turn is fully complete — check for success signal
|
|
131
|
+
// in either the SDK result text or streamed messages.
|
|
132
|
+
if (this.successSignalSeen || isSuccessful(supervisorResult.text)) {
|
|
133
|
+
this.emitSummary({ success: true, turns: turn });
|
|
134
|
+
return { success: true, turns: turn };
|
|
135
|
+
}
|
|
95
136
|
}
|
|
96
137
|
|
|
97
138
|
this.emitSummary({ success: false, turns: this.maxTurns });
|
|
98
139
|
return { success: false, turns: this.maxTurns };
|
|
99
140
|
}
|
|
100
141
|
|
|
142
|
+
/**
|
|
143
|
+
* Extract a human-readable transcript from an AgentRunner's buffered output.
|
|
144
|
+
* Drains the buffer and replays events through a TraceCollector.
|
|
145
|
+
* @param {import("./agent-runner.js").AgentRunner} runner
|
|
146
|
+
* @returns {string}
|
|
147
|
+
*/
|
|
148
|
+
extractTranscript(runner) {
|
|
149
|
+
const lines = runner.drainOutput();
|
|
150
|
+
const collector = new TraceCollector();
|
|
151
|
+
for (const line of lines) {
|
|
152
|
+
collector.addLine(line);
|
|
153
|
+
}
|
|
154
|
+
return collector.toText() || "[The agent produced no output.]";
|
|
155
|
+
}
|
|
156
|
+
|
|
101
157
|
/**
|
|
102
158
|
* Emit a single NDJSON line tagged with the current source and turn.
|
|
103
159
|
* Called in real-time via the AgentRunner onLine callback.
|
|
160
|
+
*
|
|
161
|
+
* When the current source is the supervisor, also scans assistant text
|
|
162
|
+
* content for the EVALUATION_SUCCESSFUL signal and sets successSignalSeen.
|
|
104
163
|
* @param {string} line - Raw NDJSON line from the runner
|
|
105
164
|
*/
|
|
106
165
|
emitLine(line) {
|
|
@@ -111,6 +170,21 @@ export class Supervisor {
|
|
|
111
170
|
event,
|
|
112
171
|
};
|
|
113
172
|
this.output.write(JSON.stringify(tagged) + "\n");
|
|
173
|
+
|
|
174
|
+
// Scan supervisor assistant messages for the success signal in real time.
|
|
175
|
+
// The SDK result text only reflects the final assistant message, but the
|
|
176
|
+
// supervisor may write EVALUATION_SUCCESSFUL in an earlier message and
|
|
177
|
+
// then continue with follow-up tool calls.
|
|
178
|
+
if (this.currentSource === "supervisor" && event.type === "assistant") {
|
|
179
|
+
const content = event.message?.content ?? event.content ?? [];
|
|
180
|
+
if (Array.isArray(content)) {
|
|
181
|
+
for (const block of content) {
|
|
182
|
+
if (block.type === "text" && isSuccessful(block.text)) {
|
|
183
|
+
this.successSignalSeen = true;
|
|
184
|
+
}
|
|
185
|
+
}
|
|
186
|
+
}
|
|
187
|
+
}
|
|
114
188
|
}
|
|
115
189
|
|
|
116
190
|
/**
|
|
@@ -138,6 +212,10 @@ export class Supervisor {
|
|
|
138
212
|
* @param {string} [deps.model] - Claude model identifier
|
|
139
213
|
* @param {number} [deps.maxTurns] - Maximum supervisor ↔ agent exchanges
|
|
140
214
|
* @param {string[]} [deps.allowedTools] - Tools the agent may use
|
|
215
|
+
* @param {string[]} [deps.supervisorAllowedTools] - Tools the supervisor may use (default: Bash, Read, Glob, Grep, Write, Edit)
|
|
216
|
+
* @param {string[]} [deps.supervisorDisallowedTools] - Tools to explicitly block from the supervisor
|
|
217
|
+
* @param {string} [deps.supervisorProfile] - Supervisor agent profile name
|
|
218
|
+
* @param {string} [deps.agentProfile] - Agent profile name
|
|
141
219
|
* @returns {Supervisor}
|
|
142
220
|
*/
|
|
143
221
|
export function createSupervisor({
|
|
@@ -148,6 +226,10 @@ export function createSupervisor({
|
|
|
148
226
|
model,
|
|
149
227
|
maxTurns,
|
|
150
228
|
allowedTools,
|
|
229
|
+
supervisorDisallowedTools,
|
|
230
|
+
supervisorAllowedTools,
|
|
231
|
+
supervisorProfile,
|
|
232
|
+
agentProfile,
|
|
151
233
|
}) {
|
|
152
234
|
// Forward-reference: onLine captures `supervisor` before construction completes.
|
|
153
235
|
// This is safe because onLine is only called during run(), after construction.
|
|
@@ -163,17 +245,45 @@ export function createSupervisor({
|
|
|
163
245
|
allowedTools,
|
|
164
246
|
onLine,
|
|
165
247
|
settingSources: ["project"],
|
|
248
|
+
agentProfile,
|
|
249
|
+
systemPrompt: {
|
|
250
|
+
type: "preset",
|
|
251
|
+
preset: "claude_code",
|
|
252
|
+
append: AGENT_SYSTEM_PROMPT,
|
|
253
|
+
},
|
|
166
254
|
});
|
|
167
255
|
|
|
256
|
+
// Block Task/TaskOutput so the supervisor cannot spawn its own sub-agents.
|
|
257
|
+
// The relay loop handles agent communication — letting the supervisor use
|
|
258
|
+
// Task would bypass the relay and produce an empty agent trace.
|
|
259
|
+
const defaultDisallowed = ["Task", "TaskOutput"];
|
|
260
|
+
const disallowedTools = supervisorDisallowedTools
|
|
261
|
+
? [...new Set([...defaultDisallowed, ...supervisorDisallowedTools])]
|
|
262
|
+
: defaultDisallowed;
|
|
263
|
+
|
|
168
264
|
const supervisorRunner = createAgentRunner({
|
|
169
265
|
cwd: supervisorCwd,
|
|
170
266
|
query,
|
|
171
267
|
output: new PassThrough(),
|
|
172
268
|
model,
|
|
173
269
|
maxTurns: 10,
|
|
174
|
-
allowedTools:
|
|
270
|
+
allowedTools: supervisorAllowedTools ?? [
|
|
271
|
+
"Bash",
|
|
272
|
+
"Read",
|
|
273
|
+
"Glob",
|
|
274
|
+
"Grep",
|
|
275
|
+
"Write",
|
|
276
|
+
"Edit",
|
|
277
|
+
],
|
|
278
|
+
disallowedTools,
|
|
175
279
|
onLine,
|
|
176
280
|
settingSources: ["project"],
|
|
281
|
+
agentProfile: supervisorProfile,
|
|
282
|
+
systemPrompt: {
|
|
283
|
+
type: "preset",
|
|
284
|
+
preset: "claude_code",
|
|
285
|
+
append: SUPERVISOR_SYSTEM_PROMPT,
|
|
286
|
+
},
|
|
177
287
|
});
|
|
178
288
|
|
|
179
289
|
supervisor = new Supervisor({
|
package/src/tee-writer.js
CHANGED
|
@@ -107,7 +107,6 @@ export class TeeWriter extends Writable {
|
|
|
107
107
|
if (parsed.event) {
|
|
108
108
|
if (parsed.source && parsed.source !== this.lastSource) {
|
|
109
109
|
this.lastSource = parsed.source;
|
|
110
|
-
this.textStream.write(`\n[${parsed.source}]\n`);
|
|
111
110
|
}
|
|
112
111
|
this.collector.addLine(JSON.stringify(parsed.event));
|
|
113
112
|
this.flushTurns();
|
|
@@ -119,15 +118,19 @@ export class TeeWriter extends Writable {
|
|
|
119
118
|
*/
|
|
120
119
|
flushTurns() {
|
|
121
120
|
const turns = this.collector.turns;
|
|
121
|
+
const prefix =
|
|
122
|
+
this.mode === "supervised" && this.lastSource
|
|
123
|
+
? `[${this.lastSource}] `
|
|
124
|
+
: "";
|
|
122
125
|
while (this.turnsEmitted < turns.length) {
|
|
123
126
|
const turn = turns[this.turnsEmitted++];
|
|
124
127
|
if (turn.role === "assistant") {
|
|
125
128
|
for (const block of turn.content) {
|
|
126
129
|
if (block.type === "text") {
|
|
127
|
-
this.textStream.write(block.text
|
|
130
|
+
this.textStream.write(`${prefix}${block.text}\n`);
|
|
128
131
|
} else if (block.type === "tool_use") {
|
|
129
132
|
const input = summarizeInput(block.input);
|
|
130
|
-
this.textStream.write(
|
|
133
|
+
this.textStream.write(`${prefix}> Tool: ${block.name} ${input}\n`);
|
|
131
134
|
}
|
|
132
135
|
}
|
|
133
136
|
}
|
package/src/trace-collector.js
CHANGED
|
@@ -38,6 +38,13 @@ export class TraceCollector {
|
|
|
38
38
|
return;
|
|
39
39
|
}
|
|
40
40
|
|
|
41
|
+
// Unwrap combined supervised trace format {source, turn, event}.
|
|
42
|
+
// The Supervisor emits this wrapper; when replayed through addLine the
|
|
43
|
+
// inner event is the one we need.
|
|
44
|
+
if (event.event && !event.type && typeof event.source === "string") {
|
|
45
|
+
event = event.event;
|
|
46
|
+
}
|
|
47
|
+
|
|
41
48
|
switch (event.type) {
|
|
42
49
|
case "system":
|
|
43
50
|
this.handleSystem(event);
|
package/test/supervisor.test.js
CHANGED
|
@@ -6,8 +6,10 @@ import {
|
|
|
6
6
|
AgentRunner,
|
|
7
7
|
Supervisor,
|
|
8
8
|
createSupervisor,
|
|
9
|
+
SUPERVISOR_SYSTEM_PROMPT,
|
|
10
|
+
AGENT_SYSTEM_PROMPT,
|
|
9
11
|
} from "@forwardimpact/libeval";
|
|
10
|
-
import {
|
|
12
|
+
import { isSuccessful } from "../src/supervisor.js";
|
|
11
13
|
|
|
12
14
|
/**
|
|
13
15
|
* Create a mock AgentRunner that yields pre-scripted responses.
|
|
@@ -61,26 +63,50 @@ function createMockRunner(responses, messages) {
|
|
|
61
63
|
return runner;
|
|
62
64
|
}
|
|
63
65
|
|
|
64
|
-
describe("
|
|
65
|
-
test("detects
|
|
66
|
-
assert.strictEqual(
|
|
66
|
+
describe("isSuccessful", () => {
|
|
67
|
+
test("detects EVALUATION_SUCCESSFUL on its own line", () => {
|
|
68
|
+
assert.strictEqual(isSuccessful("EVALUATION_SUCCESSFUL"), true);
|
|
67
69
|
assert.strictEqual(
|
|
68
|
-
|
|
70
|
+
isSuccessful("Some text\nEVALUATION_SUCCESSFUL\nMore text"),
|
|
69
71
|
true,
|
|
70
72
|
);
|
|
71
|
-
assert.strictEqual(
|
|
73
|
+
assert.strictEqual(isSuccessful("Done.\n\nEVALUATION_SUCCESSFUL"), true);
|
|
72
74
|
});
|
|
73
75
|
|
|
74
|
-
test("
|
|
75
|
-
assert.strictEqual(
|
|
76
|
-
assert.strictEqual(
|
|
77
|
-
assert.strictEqual(
|
|
76
|
+
test("tolerates markdown formatting around the signal", () => {
|
|
77
|
+
assert.strictEqual(isSuccessful("**EVALUATION_SUCCESSFUL**"), true);
|
|
78
|
+
assert.strictEqual(isSuccessful("*EVALUATION_SUCCESSFUL*"), true);
|
|
79
|
+
assert.strictEqual(isSuccessful("__EVALUATION_SUCCESSFUL__"), true);
|
|
80
|
+
assert.strictEqual(isSuccessful("_EVALUATION_SUCCESSFUL_"), true);
|
|
81
|
+
assert.strictEqual(isSuccessful("`EVALUATION_SUCCESSFUL`"), true);
|
|
82
|
+
assert.strictEqual(
|
|
83
|
+
isSuccessful(
|
|
84
|
+
"Good work.\n\n**EVALUATION_SUCCESSFUL**\n\nNow filing issues.",
|
|
85
|
+
),
|
|
86
|
+
true,
|
|
87
|
+
);
|
|
88
|
+
});
|
|
89
|
+
|
|
90
|
+
test("matches EVALUATION_SUCCESSFUL anywhere in text", () => {
|
|
91
|
+
assert.strictEqual(isSuccessful("not EVALUATION_SUCCESSFUL yet"), true);
|
|
92
|
+
assert.strictEqual(
|
|
93
|
+
isSuccessful("The agent is EVALUATION_SUCCESSFUL done"),
|
|
94
|
+
true,
|
|
95
|
+
);
|
|
96
|
+
assert.strictEqual(
|
|
97
|
+
isSuccessful("Great work! EVALUATION_SUCCESSFUL. Now filing issues."),
|
|
98
|
+
true,
|
|
99
|
+
);
|
|
78
100
|
});
|
|
79
101
|
|
|
80
102
|
test("does not match empty or unrelated text", () => {
|
|
81
|
-
assert.strictEqual(
|
|
82
|
-
assert.strictEqual(
|
|
83
|
-
assert.strictEqual(
|
|
103
|
+
assert.strictEqual(isSuccessful(""), false);
|
|
104
|
+
assert.strictEqual(isSuccessful("All done!"), false);
|
|
105
|
+
assert.strictEqual(isSuccessful("DONE"), false);
|
|
106
|
+
});
|
|
107
|
+
|
|
108
|
+
test("does not match old EVALUATION_COMPLETE signal", () => {
|
|
109
|
+
assert.strictEqual(isSuccessful("EVALUATION_COMPLETE"), false);
|
|
84
110
|
});
|
|
85
111
|
});
|
|
86
112
|
|
|
@@ -118,13 +144,35 @@ describe("Supervisor", () => {
|
|
|
118
144
|
);
|
|
119
145
|
});
|
|
120
146
|
|
|
121
|
-
test("completes on
|
|
147
|
+
test("completes on EVALUATION_SUCCESSFUL from supervisor at turn 0", async () => {
|
|
148
|
+
const agentRunner = createMockRunner([]);
|
|
149
|
+
|
|
150
|
+
const supervisorRunner = createMockRunner([
|
|
151
|
+
{ text: "EVALUATION_SUCCESSFUL" },
|
|
152
|
+
]);
|
|
153
|
+
|
|
154
|
+
const output = new PassThrough();
|
|
155
|
+
const supervisor = new Supervisor({
|
|
156
|
+
agentRunner,
|
|
157
|
+
supervisorRunner,
|
|
158
|
+
output,
|
|
159
|
+
maxTurns: 10,
|
|
160
|
+
});
|
|
161
|
+
|
|
162
|
+
const result = await supervisor.run("Install stuff");
|
|
163
|
+
|
|
164
|
+
assert.strictEqual(result.success, true);
|
|
165
|
+
assert.strictEqual(result.turns, 0);
|
|
166
|
+
});
|
|
167
|
+
|
|
168
|
+
test("completes after one agent turn", async () => {
|
|
122
169
|
const agentRunner = createMockRunner([
|
|
123
170
|
{ text: "I installed the packages." },
|
|
124
171
|
]);
|
|
125
172
|
|
|
126
173
|
const supervisorRunner = createMockRunner([
|
|
127
|
-
{ text: "
|
|
174
|
+
{ text: "Welcome! Please install the packages." },
|
|
175
|
+
{ text: "Good work.\n\nEVALUATION_SUCCESSFUL" },
|
|
128
176
|
]);
|
|
129
177
|
|
|
130
178
|
const output = new PassThrough();
|
|
@@ -141,6 +189,67 @@ describe("Supervisor", () => {
|
|
|
141
189
|
assert.strictEqual(result.turns, 1);
|
|
142
190
|
});
|
|
143
191
|
|
|
192
|
+
test("detects EVALUATION_SUCCESSFUL in streamed messages when result text differs", async () => {
|
|
193
|
+
// Simulates the real failure: supervisor writes EVALUATION_SUCCESSFUL in
|
|
194
|
+
// an early message, then continues with follow-up work (e.g. filing issues).
|
|
195
|
+
// The SDK result text reflects only the final message, which does NOT
|
|
196
|
+
// contain the signal.
|
|
197
|
+
const agentRunner = createMockRunner([
|
|
198
|
+
{ text: "I installed the packages." },
|
|
199
|
+
]);
|
|
200
|
+
|
|
201
|
+
// The supervisor's result text is the Summary (no signal), but messages
|
|
202
|
+
// include one with EVALUATION_SUCCESSFUL.
|
|
203
|
+
const supervisorMessages = [
|
|
204
|
+
undefined, // turn 0: use default
|
|
205
|
+
[
|
|
206
|
+
{
|
|
207
|
+
type: "assistant",
|
|
208
|
+
message: {
|
|
209
|
+
content: [
|
|
210
|
+
{
|
|
211
|
+
type: "text",
|
|
212
|
+
text: "Good work.\n\nEVALUATION_SUCCESSFUL\n\nNow filing issues.",
|
|
213
|
+
},
|
|
214
|
+
],
|
|
215
|
+
},
|
|
216
|
+
},
|
|
217
|
+
{
|
|
218
|
+
type: "assistant",
|
|
219
|
+
message: {
|
|
220
|
+
content: [
|
|
221
|
+
{ type: "text", text: "## Summary\n\nAll issues filed." },
|
|
222
|
+
],
|
|
223
|
+
},
|
|
224
|
+
},
|
|
225
|
+
],
|
|
226
|
+
];
|
|
227
|
+
|
|
228
|
+
const supervisorRunner = createMockRunner(
|
|
229
|
+
[
|
|
230
|
+
{ text: "Welcome! Please install the packages." },
|
|
231
|
+
// Result text is the final message — does NOT contain the signal
|
|
232
|
+
{ text: "## Summary\n\nAll issues filed." },
|
|
233
|
+
],
|
|
234
|
+
supervisorMessages,
|
|
235
|
+
);
|
|
236
|
+
|
|
237
|
+
const output = new PassThrough();
|
|
238
|
+
const supervisor = new Supervisor({
|
|
239
|
+
agentRunner,
|
|
240
|
+
supervisorRunner,
|
|
241
|
+
output,
|
|
242
|
+
maxTurns: 10,
|
|
243
|
+
});
|
|
244
|
+
agentRunner.onLine = (line) => supervisor.emitLine(line);
|
|
245
|
+
supervisorRunner.onLine = (line) => supervisor.emitLine(line);
|
|
246
|
+
|
|
247
|
+
const result = await supervisor.run("Install stuff");
|
|
248
|
+
|
|
249
|
+
assert.strictEqual(result.success, true);
|
|
250
|
+
assert.strictEqual(result.turns, 1);
|
|
251
|
+
});
|
|
252
|
+
|
|
144
253
|
test("runs multiple turns before completion", async () => {
|
|
145
254
|
const agentRunner = createMockRunner([
|
|
146
255
|
{ text: "Started working." },
|
|
@@ -149,9 +258,10 @@ describe("Supervisor", () => {
|
|
|
149
258
|
]);
|
|
150
259
|
|
|
151
260
|
const supervisorRunner = createMockRunner([
|
|
261
|
+
{ text: "Here is your task. Do the work." },
|
|
152
262
|
{ text: "Keep going, you need to do more." },
|
|
153
263
|
{ text: "Almost there, continue." },
|
|
154
|
-
{ text: "
|
|
264
|
+
{ text: "EVALUATION_SUCCESSFUL" },
|
|
155
265
|
]);
|
|
156
266
|
|
|
157
267
|
const output = new PassThrough();
|
|
@@ -169,14 +279,14 @@ describe("Supervisor", () => {
|
|
|
169
279
|
});
|
|
170
280
|
|
|
171
281
|
test("enforces maxTurns limit", async () => {
|
|
172
|
-
//
|
|
282
|
+
// Supervisor starts, agent responds each turn, supervisor never says done
|
|
173
283
|
const agentRunner = createMockRunner([
|
|
174
|
-
{ text: "Turn 0" },
|
|
175
284
|
{ text: "Turn 1" },
|
|
176
285
|
{ text: "Turn 2" },
|
|
177
286
|
]);
|
|
178
287
|
|
|
179
288
|
const supervisorRunner = createMockRunner([
|
|
289
|
+
{ text: "Start working." },
|
|
180
290
|
{ text: "Continue." },
|
|
181
291
|
{ text: "Continue." },
|
|
182
292
|
]);
|
|
@@ -196,16 +306,17 @@ describe("Supervisor", () => {
|
|
|
196
306
|
});
|
|
197
307
|
|
|
198
308
|
test("output contains tagged lines with correct source and turn", async () => {
|
|
199
|
-
const agentMessages = [[{ type: "assistant", content: "Working" }]];
|
|
200
309
|
const supervisorMessages = [
|
|
201
|
-
[{ type: "assistant", content: "
|
|
310
|
+
[{ type: "assistant", content: "Go ahead" }],
|
|
311
|
+
[{ type: "assistant", content: "EVALUATION_SUCCESSFUL" }],
|
|
202
312
|
];
|
|
313
|
+
const agentMessages = [[{ type: "assistant", content: "Working" }]];
|
|
203
314
|
|
|
204
|
-
const agentRunner = createMockRunner([{ text: "Working" }], agentMessages);
|
|
205
315
|
const supervisorRunner = createMockRunner(
|
|
206
|
-
[{ text: "
|
|
316
|
+
[{ text: "Go ahead" }, { text: "EVALUATION_SUCCESSFUL" }],
|
|
207
317
|
supervisorMessages,
|
|
208
318
|
);
|
|
319
|
+
const agentRunner = createMockRunner([{ text: "Working" }], agentMessages);
|
|
209
320
|
|
|
210
321
|
const output = new PassThrough();
|
|
211
322
|
const supervisor = new Supervisor({
|
|
@@ -225,19 +336,19 @@ describe("Supervisor", () => {
|
|
|
225
336
|
.split("\n")
|
|
226
337
|
.filter((l) => l.length > 0);
|
|
227
338
|
|
|
228
|
-
// Should have:
|
|
229
|
-
assert.ok(lines.length >=
|
|
230
|
-
|
|
231
|
-
const agentLine = JSON.parse(lines[0]);
|
|
232
|
-
assert.strictEqual(agentLine.source, "agent");
|
|
233
|
-
assert.strictEqual(agentLine.turn, 0);
|
|
234
|
-
assert.ok("event" in agentLine);
|
|
339
|
+
// Should have: supervisor turn 0, agent turn 1, supervisor turn 1, orchestrator summary
|
|
340
|
+
assert.ok(lines.length >= 4);
|
|
235
341
|
|
|
236
|
-
const supervisorLine = JSON.parse(lines[
|
|
342
|
+
const supervisorLine = JSON.parse(lines[0]);
|
|
237
343
|
assert.strictEqual(supervisorLine.source, "supervisor");
|
|
238
|
-
assert.strictEqual(supervisorLine.turn,
|
|
344
|
+
assert.strictEqual(supervisorLine.turn, 0);
|
|
239
345
|
assert.ok("event" in supervisorLine);
|
|
240
346
|
|
|
347
|
+
const agentLine = JSON.parse(lines[1]);
|
|
348
|
+
assert.strictEqual(agentLine.source, "agent");
|
|
349
|
+
assert.strictEqual(agentLine.turn, 1);
|
|
350
|
+
assert.ok("event" in agentLine);
|
|
351
|
+
|
|
241
352
|
const summaryLine = JSON.parse(lines[lines.length - 1]);
|
|
242
353
|
assert.strictEqual(summaryLine.source, "orchestrator");
|
|
243
354
|
assert.strictEqual(summaryLine.type, "summary");
|
|
@@ -250,11 +361,14 @@ describe("Supervisor", () => {
|
|
|
250
361
|
source: "sdk-internal",
|
|
251
362
|
content: "test",
|
|
252
363
|
};
|
|
253
|
-
const agentRunner = createMockRunner([{ text: "Done" }], [[sourceEvent]]);
|
|
254
364
|
const supervisorRunner = createMockRunner(
|
|
255
|
-
[{ text: "
|
|
256
|
-
[
|
|
365
|
+
[{ text: "Go" }, { text: "EVALUATION_SUCCESSFUL" }],
|
|
366
|
+
[
|
|
367
|
+
[{ type: "assistant", content: "Go" }],
|
|
368
|
+
[{ type: "assistant", content: "ok" }],
|
|
369
|
+
],
|
|
257
370
|
);
|
|
371
|
+
const agentRunner = createMockRunner([{ text: "Done" }], [[sourceEvent]]);
|
|
258
372
|
|
|
259
373
|
const output = new PassThrough();
|
|
260
374
|
const supervisor = new Supervisor({
|
|
@@ -274,27 +388,30 @@ describe("Supervisor", () => {
|
|
|
274
388
|
.split("\n")
|
|
275
389
|
.filter((l) => l.length > 0);
|
|
276
390
|
|
|
277
|
-
|
|
391
|
+
// First line is supervisor turn 0, second is agent turn 1
|
|
392
|
+
const tagged = JSON.parse(lines[1]);
|
|
278
393
|
// The original event's `source` field is preserved inside `event`
|
|
279
394
|
assert.strictEqual(tagged.source, "agent");
|
|
280
395
|
assert.strictEqual(tagged.event.source, "sdk-internal");
|
|
281
396
|
});
|
|
282
397
|
|
|
283
|
-
test("emits
|
|
284
|
-
const
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
398
|
+
test("emits supervisor output and summary when supervisor errors on turn 0", async () => {
|
|
399
|
+
const supervisorMessages = [
|
|
400
|
+
[{ type: "assistant", content: "Starting..." }],
|
|
401
|
+
];
|
|
402
|
+
const supervisorRunner = createMockRunner(
|
|
403
|
+
[{ text: "Starting...", success: false }],
|
|
404
|
+
supervisorMessages,
|
|
288
405
|
);
|
|
289
406
|
|
|
290
407
|
// Override run to simulate an error return
|
|
291
|
-
const origRun =
|
|
292
|
-
|
|
293
|
-
const result = await origRun.call(
|
|
408
|
+
const origRun = supervisorRunner.run;
|
|
409
|
+
supervisorRunner.run = async (task) => {
|
|
410
|
+
const result = await origRun.call(supervisorRunner, task);
|
|
294
411
|
return { ...result, error: new Error("Process exited with code 1") };
|
|
295
412
|
};
|
|
296
413
|
|
|
297
|
-
const
|
|
414
|
+
const agentRunner = createMockRunner([]);
|
|
298
415
|
|
|
299
416
|
const output = new PassThrough();
|
|
300
417
|
const supervisor = new Supervisor({
|
|
@@ -311,18 +428,18 @@ describe("Supervisor", () => {
|
|
|
311
428
|
assert.strictEqual(result.success, false);
|
|
312
429
|
assert.strictEqual(result.turns, 0);
|
|
313
430
|
|
|
314
|
-
// Output should still contain the
|
|
431
|
+
// Output should still contain the supervisor's buffered lines + summary
|
|
315
432
|
const data = output.read()?.toString() ?? "";
|
|
316
433
|
const lines = data
|
|
317
434
|
.trim()
|
|
318
435
|
.split("\n")
|
|
319
436
|
.filter((l) => l.length > 0);
|
|
320
437
|
|
|
321
|
-
assert.ok(lines.length >= 2, "Expected at least
|
|
438
|
+
assert.ok(lines.length >= 2, "Expected at least supervisor line + summary");
|
|
322
439
|
|
|
323
|
-
const
|
|
324
|
-
assert.strictEqual(
|
|
325
|
-
assert.strictEqual(
|
|
440
|
+
const supervisorLine = JSON.parse(lines[0]);
|
|
441
|
+
assert.strictEqual(supervisorLine.source, "supervisor");
|
|
442
|
+
assert.strictEqual(supervisorLine.turn, 0);
|
|
326
443
|
|
|
327
444
|
const summaryLine = JSON.parse(lines[lines.length - 1]);
|
|
328
445
|
assert.strictEqual(summaryLine.source, "orchestrator");
|
|
@@ -339,4 +456,99 @@ describe("Supervisor", () => {
|
|
|
339
456
|
});
|
|
340
457
|
assert.ok(supervisor instanceof Supervisor);
|
|
341
458
|
});
|
|
459
|
+
|
|
460
|
+
test("createSupervisor uses default supervisor tools when none specified", () => {
|
|
461
|
+
const supervisor = createSupervisor({
|
|
462
|
+
supervisorCwd: "/tmp/sup",
|
|
463
|
+
agentCwd: "/tmp/agent",
|
|
464
|
+
query: async function* () {},
|
|
465
|
+
output: new PassThrough(),
|
|
466
|
+
});
|
|
467
|
+
assert.deepStrictEqual(supervisor.supervisorRunner.allowedTools, [
|
|
468
|
+
"Bash",
|
|
469
|
+
"Read",
|
|
470
|
+
"Glob",
|
|
471
|
+
"Grep",
|
|
472
|
+
"Write",
|
|
473
|
+
"Edit",
|
|
474
|
+
]);
|
|
475
|
+
});
|
|
476
|
+
|
|
477
|
+
test("createSupervisor passes custom supervisor tools", () => {
|
|
478
|
+
const supervisor = createSupervisor({
|
|
479
|
+
supervisorCwd: "/tmp/sup",
|
|
480
|
+
agentCwd: "/tmp/agent",
|
|
481
|
+
query: async function* () {},
|
|
482
|
+
output: new PassThrough(),
|
|
483
|
+
supervisorAllowedTools: ["Read", "Glob", "Grep"],
|
|
484
|
+
});
|
|
485
|
+
assert.deepStrictEqual(supervisor.supervisorRunner.allowedTools, [
|
|
486
|
+
"Read",
|
|
487
|
+
"Glob",
|
|
488
|
+
"Grep",
|
|
489
|
+
]);
|
|
490
|
+
});
|
|
491
|
+
|
|
492
|
+
test("createSupervisor wires system prompts to both runners", () => {
|
|
493
|
+
const supervisor = createSupervisor({
|
|
494
|
+
supervisorCwd: "/tmp/sup",
|
|
495
|
+
agentCwd: "/tmp/agent",
|
|
496
|
+
query: async function* () {},
|
|
497
|
+
output: new PassThrough(),
|
|
498
|
+
});
|
|
499
|
+
|
|
500
|
+
assert.deepStrictEqual(supervisor.agentRunner.systemPrompt, {
|
|
501
|
+
type: "preset",
|
|
502
|
+
preset: "claude_code",
|
|
503
|
+
append: AGENT_SYSTEM_PROMPT,
|
|
504
|
+
});
|
|
505
|
+
assert.deepStrictEqual(supervisor.supervisorRunner.systemPrompt, {
|
|
506
|
+
type: "preset",
|
|
507
|
+
preset: "claude_code",
|
|
508
|
+
append: SUPERVISOR_SYSTEM_PROMPT,
|
|
509
|
+
});
|
|
510
|
+
});
|
|
511
|
+
|
|
512
|
+
test("createSupervisor blocks Task and TaskOutput on supervisor by default", () => {
|
|
513
|
+
const supervisor = createSupervisor({
|
|
514
|
+
supervisorCwd: "/tmp/sup",
|
|
515
|
+
agentCwd: "/tmp/agent",
|
|
516
|
+
query: async function* () {},
|
|
517
|
+
output: new PassThrough(),
|
|
518
|
+
});
|
|
519
|
+
assert.deepStrictEqual(supervisor.supervisorRunner.disallowedTools, [
|
|
520
|
+
"Task",
|
|
521
|
+
"TaskOutput",
|
|
522
|
+
]);
|
|
523
|
+
// Agent should not have disallowed tools
|
|
524
|
+
assert.deepStrictEqual(supervisor.agentRunner.disallowedTools, []);
|
|
525
|
+
});
|
|
526
|
+
|
|
527
|
+
test("createSupervisor merges custom supervisorDisallowedTools with defaults", () => {
|
|
528
|
+
const supervisor = createSupervisor({
|
|
529
|
+
supervisorCwd: "/tmp/sup",
|
|
530
|
+
agentCwd: "/tmp/agent",
|
|
531
|
+
query: async function* () {},
|
|
532
|
+
output: new PassThrough(),
|
|
533
|
+
supervisorDisallowedTools: ["WebSearch", "Task"],
|
|
534
|
+
});
|
|
535
|
+
const disallowed = supervisor.supervisorRunner.disallowedTools;
|
|
536
|
+
assert.ok(disallowed.includes("Task"));
|
|
537
|
+
assert.ok(disallowed.includes("TaskOutput"));
|
|
538
|
+
assert.ok(disallowed.includes("WebSearch"));
|
|
539
|
+
// No duplicates
|
|
540
|
+
assert.strictEqual(disallowed.length, new Set(disallowed).size);
|
|
541
|
+
});
|
|
542
|
+
|
|
543
|
+
test("system prompt constants are non-empty strings", () => {
|
|
544
|
+
assert.ok(typeof SUPERVISOR_SYSTEM_PROMPT === "string");
|
|
545
|
+
assert.ok(typeof AGENT_SYSTEM_PROMPT === "string");
|
|
546
|
+
assert.ok(SUPERVISOR_SYSTEM_PROMPT.length > 0);
|
|
547
|
+
assert.ok(AGENT_SYSTEM_PROMPT.length > 0);
|
|
548
|
+
});
|
|
549
|
+
|
|
550
|
+
test("SUPERVISOR_SYSTEM_PROMPT explains relay mechanism", () => {
|
|
551
|
+
assert.ok(SUPERVISOR_SYSTEM_PROMPT.includes("relay"));
|
|
552
|
+
assert.ok(SUPERVISOR_SYSTEM_PROMPT.includes("EVALUATION_SUCCESSFUL"));
|
|
553
|
+
});
|
|
342
554
|
});
|
package/test/tee-writer.test.js
CHANGED
|
@@ -187,11 +187,9 @@ describe("TeeWriter", () => {
|
|
|
187
187
|
assert.strictEqual(fileLines.length, 3);
|
|
188
188
|
assert.strictEqual(JSON.parse(fileLines[0]).source, "agent");
|
|
189
189
|
|
|
190
|
-
// Text should show source
|
|
191
|
-
assert.ok(textData.includes("[agent]"));
|
|
192
|
-
assert.ok(textData.includes("
|
|
193
|
-
assert.ok(textData.includes("[supervisor]"));
|
|
194
|
-
assert.ok(textData.includes("Looks good"));
|
|
190
|
+
// Text should show source prefixes on content lines
|
|
191
|
+
assert.ok(textData.includes("[agent] Working on it"));
|
|
192
|
+
assert.ok(textData.includes("[supervisor] Looks good"));
|
|
195
193
|
assert.ok(textData.includes("Evaluation completed after 1 turns"));
|
|
196
194
|
});
|
|
197
195
|
|
|
@@ -254,9 +252,9 @@ describe("TeeWriter", () => {
|
|
|
254
252
|
await writeLines(writer, events);
|
|
255
253
|
|
|
256
254
|
const textData = collect(textStream);
|
|
257
|
-
// [agent]
|
|
258
|
-
|
|
259
|
-
assert.
|
|
255
|
+
// [agent] prefix should appear on each content line
|
|
256
|
+
assert.ok(textData.includes("[agent] Step 1"));
|
|
257
|
+
assert.ok(textData.includes("[agent] Step 2"));
|
|
260
258
|
});
|
|
261
259
|
|
|
262
260
|
test("handles partial lines across chunks", async () => {
|
|
@@ -149,6 +149,102 @@ describe("TraceCollector", () => {
|
|
|
149
149
|
assert.strictEqual(trace.summary.tokenUsage.inputTokens, 5000);
|
|
150
150
|
});
|
|
151
151
|
|
|
152
|
+
test("unwraps combined supervised trace format {source, turn, event}", () => {
|
|
153
|
+
const collector = new TraceCollector();
|
|
154
|
+
|
|
155
|
+
// System init wrapped in supervisor envelope
|
|
156
|
+
collector.addLine(
|
|
157
|
+
JSON.stringify({
|
|
158
|
+
source: "agent",
|
|
159
|
+
turn: 0,
|
|
160
|
+
event: {
|
|
161
|
+
type: "system",
|
|
162
|
+
subtype: "init",
|
|
163
|
+
session_id: "sess-supervised",
|
|
164
|
+
model: "claude-opus-4-6",
|
|
165
|
+
tools: ["Bash"],
|
|
166
|
+
},
|
|
167
|
+
}),
|
|
168
|
+
);
|
|
169
|
+
|
|
170
|
+
// Assistant message wrapped in supervisor envelope
|
|
171
|
+
collector.addLine(
|
|
172
|
+
JSON.stringify({
|
|
173
|
+
source: "agent",
|
|
174
|
+
turn: 1,
|
|
175
|
+
event: {
|
|
176
|
+
type: "assistant",
|
|
177
|
+
message: {
|
|
178
|
+
content: [{ type: "text", text: "I ran the tests." }],
|
|
179
|
+
usage: { input_tokens: 100, output_tokens: 50 },
|
|
180
|
+
},
|
|
181
|
+
},
|
|
182
|
+
}),
|
|
183
|
+
);
|
|
184
|
+
|
|
185
|
+
// Tool result wrapped in supervisor envelope
|
|
186
|
+
collector.addLine(
|
|
187
|
+
JSON.stringify({
|
|
188
|
+
source: "agent",
|
|
189
|
+
turn: 1,
|
|
190
|
+
event: {
|
|
191
|
+
type: "user",
|
|
192
|
+
message: {
|
|
193
|
+
role: "user",
|
|
194
|
+
content: [
|
|
195
|
+
{
|
|
196
|
+
type: "tool_result",
|
|
197
|
+
tool_use_id: "toolu_sup",
|
|
198
|
+
content: "All tests passed",
|
|
199
|
+
},
|
|
200
|
+
],
|
|
201
|
+
},
|
|
202
|
+
},
|
|
203
|
+
}),
|
|
204
|
+
);
|
|
205
|
+
|
|
206
|
+
// Result event wrapped in supervisor envelope
|
|
207
|
+
collector.addLine(
|
|
208
|
+
JSON.stringify({
|
|
209
|
+
source: "supervisor",
|
|
210
|
+
turn: 1,
|
|
211
|
+
event: {
|
|
212
|
+
type: "result",
|
|
213
|
+
subtype: "success",
|
|
214
|
+
total_cost_usd: 0.44,
|
|
215
|
+
duration_ms: 30000,
|
|
216
|
+
num_turns: 2,
|
|
217
|
+
},
|
|
218
|
+
}),
|
|
219
|
+
);
|
|
220
|
+
|
|
221
|
+
const trace = collector.toJSON();
|
|
222
|
+
assert.strictEqual(trace.metadata.sessionId, "sess-supervised");
|
|
223
|
+
assert.strictEqual(trace.turns.length, 2);
|
|
224
|
+
assert.strictEqual(trace.turns[0].role, "assistant");
|
|
225
|
+
assert.strictEqual(trace.turns[0].content[0].text, "I ran the tests.");
|
|
226
|
+
assert.strictEqual(trace.turns[1].role, "tool_result");
|
|
227
|
+
assert.strictEqual(trace.turns[1].content, "All tests passed");
|
|
228
|
+
assert.strictEqual(trace.summary.result, "success");
|
|
229
|
+
assert.strictEqual(trace.summary.totalCostUsd, 0.44);
|
|
230
|
+
});
|
|
231
|
+
|
|
232
|
+
test("skips orchestrator summary lines from supervised traces", () => {
|
|
233
|
+
const collector = new TraceCollector();
|
|
234
|
+
collector.addLine(
|
|
235
|
+
JSON.stringify({
|
|
236
|
+
source: "orchestrator",
|
|
237
|
+
type: "summary",
|
|
238
|
+
success: true,
|
|
239
|
+
turns: 3,
|
|
240
|
+
}),
|
|
241
|
+
);
|
|
242
|
+
|
|
243
|
+
// Orchestrator summaries have no inner event and no recognized type
|
|
244
|
+
// after unwrap — they should be silently skipped.
|
|
245
|
+
assert.strictEqual(collector.toJSON().turns.length, 0);
|
|
246
|
+
});
|
|
247
|
+
|
|
152
248
|
test("skips rate_limit_event and unknown types", () => {
|
|
153
249
|
const collector = new TraceCollector();
|
|
154
250
|
collector.addLine(
|