@forwardimpact/libeval 0.1.2 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/fit-eval.js +13 -5
- package/index.js +6 -1
- package/package.json +4 -3
- package/src/agent-runner.js +19 -1
- package/src/commands/run.js +12 -4
- package/src/commands/supervise.js +20 -4
- package/src/supervisor.js +108 -31
- package/src/tee-writer.js +6 -3
- package/test/supervisor.test.js +200 -49
- package/test/tee-writer.test.js +6 -8
package/bin/fit-eval.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
#!/usr/bin/env
|
|
1
|
+
#!/usr/bin/env node
|
|
2
2
|
|
|
3
3
|
import { runOutputCommand } from "../src/commands/output.js";
|
|
4
4
|
import { runTeeCommand } from "../src/commands/tee.js";
|
|
@@ -25,21 +25,28 @@ Commands:
|
|
|
25
25
|
supervise [options] Run a supervised agent ↔ supervisor relay loop
|
|
26
26
|
|
|
27
27
|
Run options:
|
|
28
|
-
--task=PATH
|
|
28
|
+
--task-file=PATH Path to task file (mutually exclusive with --task-text)
|
|
29
|
+
--task-text=STRING Inline task text (mutually exclusive with --task-file)
|
|
29
30
|
--cwd=DIR Agent working directory (default: .)
|
|
30
31
|
--model=MODEL Claude model to use (default: opus)
|
|
31
32
|
--max-turns=N Maximum agentic turns (default: 50)
|
|
32
33
|
--output=PATH Write NDJSON trace to file (default: stdout)
|
|
33
34
|
--allowed-tools=LIST Comma-separated tools (default: Bash,Read,Glob,Grep,Write,Edit)
|
|
35
|
+
--agent-profile=NAME Agent profile name (passed as --agent to Claude CLI)
|
|
34
36
|
|
|
35
37
|
Supervise options:
|
|
36
|
-
--task=PATH
|
|
38
|
+
--task-file=PATH Path to task file (mutually exclusive with --task-text)
|
|
39
|
+
--task-text=STRING Inline task text (mutually exclusive with --task-file)
|
|
37
40
|
--supervisor-cwd=DIR Supervisor working directory (default: .)
|
|
38
41
|
--agent-cwd=DIR Agent working directory (default: temp directory)
|
|
39
42
|
--model=MODEL Claude model to use (default: opus)
|
|
40
43
|
--max-turns=N Maximum supervisor ↔ agent exchanges (default: 20)
|
|
41
44
|
--output=PATH Write NDJSON trace to file (default: stdout)
|
|
42
45
|
--allowed-tools=LIST Comma-separated tools for agent (default: Bash,Read,Glob,Grep,Write,Edit)
|
|
46
|
+
--supervisor-allowed-tools=LIST
|
|
47
|
+
Comma-separated tools for supervisor (default: Bash,Read,Glob,Grep,Write,Edit)
|
|
48
|
+
--supervisor-profile=NAME Supervisor agent profile name (passed as --agent to Claude CLI)
|
|
49
|
+
--agent-profile=NAME Agent profile name (passed as --agent to Claude CLI)
|
|
43
50
|
|
|
44
51
|
Options:
|
|
45
52
|
--help Show this help message
|
|
@@ -50,8 +57,9 @@ Examples:
|
|
|
50
57
|
fit-eval output --format=json < trace.ndjson
|
|
51
58
|
fit-eval tee < trace.ndjson
|
|
52
59
|
fit-eval tee output.ndjson < trace.ndjson
|
|
53
|
-
fit-eval run --task
|
|
54
|
-
fit-eval
|
|
60
|
+
fit-eval run --task-text="Perform a security audit of the repository." --model=opus
|
|
61
|
+
fit-eval run --task-file=scenarios/guide-setup/task.md --model=opus
|
|
62
|
+
fit-eval supervise --task-file=scenarios/guide-setup/task.md --supervisor-cwd=.
|
|
55
63
|
`.trim();
|
|
56
64
|
|
|
57
65
|
async function main() {
|
package/index.js
CHANGED
|
@@ -1,4 +1,9 @@
|
|
|
1
1
|
export { TraceCollector, createTraceCollector } from "./src/trace-collector.js";
|
|
2
2
|
export { AgentRunner, createAgentRunner } from "./src/agent-runner.js";
|
|
3
|
-
export {
|
|
3
|
+
export {
|
|
4
|
+
Supervisor,
|
|
5
|
+
createSupervisor,
|
|
6
|
+
SUPERVISOR_SYSTEM_PROMPT,
|
|
7
|
+
AGENT_SYSTEM_PROMPT,
|
|
8
|
+
} from "./src/supervisor.js";
|
|
4
9
|
export { TeeWriter, createTeeWriter } from "./src/tee-writer.js";
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@forwardimpact/libeval",
|
|
3
|
-
"version": "0.1.2",
|
|
3
|
+
"version": "0.1.5",
|
|
4
4
|
"description": "Process Claude Code stream-json output into structured traces",
|
|
5
5
|
"license": "Apache-2.0",
|
|
6
6
|
"author": "D. Olsson <hi@senzilla.io>",
|
|
@@ -10,13 +10,14 @@
|
|
|
10
10
|
"fit-eval": "./bin/fit-eval.js"
|
|
11
11
|
},
|
|
12
12
|
"engines": {
|
|
13
|
-
"bun": ">=1.2.0"
|
|
13
|
+
"bun": ">=1.2.0",
|
|
14
|
+
"node": ">=18.0.0"
|
|
14
15
|
},
|
|
15
16
|
"scripts": {
|
|
16
17
|
"test": "bun run node --test test/*.test.js"
|
|
17
18
|
},
|
|
18
19
|
"dependencies": {
|
|
19
|
-
"@anthropic-ai/claude-agent-sdk": "^0.
|
|
20
|
+
"@anthropic-ai/claude-agent-sdk": "^0.2.91"
|
|
20
21
|
},
|
|
21
22
|
"publishConfig": {
|
|
22
23
|
"access": "public"
|
package/src/agent-runner.js
CHANGED
|
@@ -18,6 +18,9 @@ export class AgentRunner {
|
|
|
18
18
|
* @param {string} [deps.permissionMode] - SDK permission mode
|
|
19
19
|
* @param {function} [deps.onLine] - Callback invoked with each NDJSON line as it's produced
|
|
20
20
|
* @param {string[]} [deps.settingSources] - SDK setting sources (e.g. ['project'] to load CLAUDE.md)
|
|
21
|
+
* @param {string} [deps.agentProfile] - Agent profile name to pass as --agent to the Claude CLI
|
|
22
|
+
* @param {string|object} [deps.systemPrompt] - SDK system prompt (string replaces default; {type:'preset', preset:'claude_code', append} appends)
|
|
23
|
+
* @param {string[]} [deps.disallowedTools] - Tools to explicitly remove from the model's context
|
|
21
24
|
*/
|
|
22
25
|
constructor({
|
|
23
26
|
cwd,
|
|
@@ -29,6 +32,9 @@ export class AgentRunner {
|
|
|
29
32
|
permissionMode,
|
|
30
33
|
onLine,
|
|
31
34
|
settingSources,
|
|
35
|
+
agentProfile,
|
|
36
|
+
systemPrompt,
|
|
37
|
+
disallowedTools,
|
|
32
38
|
}) {
|
|
33
39
|
if (!cwd) throw new Error("cwd is required");
|
|
34
40
|
if (!query) throw new Error("query is required");
|
|
@@ -49,6 +55,9 @@ export class AgentRunner {
|
|
|
49
55
|
this.permissionMode = permissionMode ?? "bypassPermissions";
|
|
50
56
|
this.onLine = onLine ?? null;
|
|
51
57
|
this.settingSources = settingSources ?? [];
|
|
58
|
+
this.agentProfile = agentProfile ?? null;
|
|
59
|
+
this.systemPrompt = systemPrompt ?? null;
|
|
60
|
+
this.disallowedTools = disallowedTools ?? [];
|
|
52
61
|
this.sessionId = null;
|
|
53
62
|
this.buffer = [];
|
|
54
63
|
}
|
|
@@ -74,6 +83,11 @@ export class AgentRunner {
|
|
|
74
83
|
permissionMode: this.permissionMode,
|
|
75
84
|
allowDangerouslySkipPermissions: true,
|
|
76
85
|
settingSources: this.settingSources,
|
|
86
|
+
...(this.disallowedTools.length > 0 && {
|
|
87
|
+
disallowedTools: this.disallowedTools,
|
|
88
|
+
}),
|
|
89
|
+
...(this.systemPrompt && { systemPrompt: this.systemPrompt }),
|
|
90
|
+
...(this.agentProfile && { extraArgs: { agent: this.agentProfile } }),
|
|
77
91
|
},
|
|
78
92
|
})) {
|
|
79
93
|
const line = JSON.stringify(message);
|
|
@@ -113,7 +127,11 @@ export class AgentRunner {
|
|
|
113
127
|
try {
|
|
114
128
|
for await (const message of this.query({
|
|
115
129
|
prompt,
|
|
116
|
-
options: {
|
|
130
|
+
options: {
|
|
131
|
+
resume: this.sessionId,
|
|
132
|
+
permissionMode: this.permissionMode,
|
|
133
|
+
allowDangerouslySkipPermissions: true,
|
|
134
|
+
},
|
|
117
135
|
})) {
|
|
118
136
|
const line = JSON.stringify(message);
|
|
119
137
|
this.output.write(line + "\n");
|
package/src/commands/run.js
CHANGED
|
@@ -24,28 +24,35 @@ function parseFlag(args, name) {
|
|
|
24
24
|
* Usage: fit-eval run [options]
|
|
25
25
|
*
|
|
26
26
|
* Options:
|
|
27
|
-
* --task=PATH
|
|
27
|
+
* --task-file=PATH Path to task file (mutually exclusive with --task-text)
|
|
28
|
+
* --task-text=STRING Inline task text (mutually exclusive with --task-file)
|
|
28
29
|
* --cwd=DIR Agent working directory (default: .)
|
|
29
30
|
* --model=MODEL Claude model to use (default: opus)
|
|
30
31
|
* --max-turns=N Maximum agentic turns (default: 50)
|
|
31
32
|
* --output=PATH Write NDJSON trace to file (default: stdout)
|
|
32
33
|
* --allowed-tools=LIST Comma-separated tools (default: Bash,Read,Glob,Grep,Write,Edit)
|
|
34
|
+
* --agent-profile=NAME Agent profile name (passed as --agent to Claude CLI)
|
|
33
35
|
*
|
|
34
36
|
* @param {string[]} args - Command arguments
|
|
35
37
|
*/
|
|
36
38
|
export async function runRunCommand(args) {
|
|
37
|
-
const
|
|
38
|
-
|
|
39
|
+
const taskFile = parseFlag(args, "task-file");
|
|
40
|
+
const taskText = parseFlag(args, "task-text");
|
|
41
|
+
if (taskFile && taskText)
|
|
42
|
+
throw new Error("--task-file and --task-text are mutually exclusive");
|
|
43
|
+
if (!taskFile && !taskText)
|
|
44
|
+
throw new Error("--task-file or --task-text is required");
|
|
39
45
|
|
|
40
46
|
const cwd = resolve(parseFlag(args, "cwd") ?? ".");
|
|
41
47
|
const model = parseFlag(args, "model") ?? "opus";
|
|
42
48
|
const maxTurns = parseInt(parseFlag(args, "max-turns") ?? "50", 10);
|
|
43
49
|
const outputPath = parseFlag(args, "output");
|
|
50
|
+
const agentProfile = parseFlag(args, "agent-profile") ?? undefined;
|
|
44
51
|
const allowedTools = (
|
|
45
52
|
parseFlag(args, "allowed-tools") ?? "Bash,Read,Glob,Grep,Write,Edit"
|
|
46
53
|
).split(",");
|
|
47
54
|
|
|
48
|
-
const taskContent = readFileSync(
|
|
55
|
+
const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
|
|
49
56
|
|
|
50
57
|
// When --output is specified, stream text to stdout while writing NDJSON to file.
|
|
51
58
|
// Otherwise, write NDJSON directly to stdout (backwards-compatible).
|
|
@@ -63,6 +70,7 @@ export async function runRunCommand(args) {
|
|
|
63
70
|
maxTurns,
|
|
64
71
|
allowedTools,
|
|
65
72
|
settingSources: ["project"],
|
|
73
|
+
agentProfile,
|
|
66
74
|
});
|
|
67
75
|
|
|
68
76
|
const result = await runner.run(taskContent);
|
package/src/commands/supervise.js
CHANGED
|
@@ -25,19 +25,26 @@ function parseFlag(args, name) {
|
|
|
25
25
|
* Usage: fit-eval supervise [options]
|
|
26
26
|
*
|
|
27
27
|
* Options:
|
|
28
|
-
* --task=PATH
|
|
28
|
+
* --task-file=PATH Path to task file (mutually exclusive with --task-text)
|
|
29
|
+
* --task-text=STRING Inline task text (mutually exclusive with --task-file)
|
|
29
30
|
* --supervisor-cwd=DIR Supervisor working directory (default: .)
|
|
30
31
|
* --agent-cwd=DIR Agent working directory (default: temp directory)
|
|
31
32
|
* --model=MODEL Claude model to use (default: opus)
|
|
32
33
|
* --max-turns=N Maximum supervisor ↔ agent exchanges (default: 20)
|
|
33
34
|
* --output=PATH Write NDJSON trace to file (default: stdout)
|
|
34
35
|
* --allowed-tools=LIST Comma-separated tools for the agent (default: Bash,Read,Glob,Grep,Write,Edit)
|
|
36
|
+
* --supervisor-profile=NAME Supervisor agent profile name (passed as --agent to Claude CLI)
|
|
37
|
+
* --agent-profile=NAME Agent profile name (passed as --agent to Claude CLI)
|
|
35
38
|
*
|
|
36
39
|
* @param {string[]} args - Command arguments
|
|
37
40
|
*/
|
|
38
41
|
export async function runSuperviseCommand(args) {
|
|
39
|
-
const
|
|
40
|
-
|
|
42
|
+
const taskFile = parseFlag(args, "task-file");
|
|
43
|
+
const taskText = parseFlag(args, "task-text");
|
|
44
|
+
if (taskFile && taskText)
|
|
45
|
+
throw new Error("--task-file and --task-text are mutually exclusive");
|
|
46
|
+
if (!taskFile && !taskText)
|
|
47
|
+
throw new Error("--task-file or --task-text is required");
|
|
41
48
|
|
|
42
49
|
const supervisorCwd = resolve(parseFlag(args, "supervisor-cwd") ?? ".");
|
|
43
50
|
const agentCwd = resolve(
|
|
@@ -47,11 +54,17 @@ export async function runSuperviseCommand(args) {
|
|
|
47
54
|
const model = parseFlag(args, "model") ?? "opus";
|
|
48
55
|
const maxTurns = parseInt(parseFlag(args, "max-turns") ?? "20", 10);
|
|
49
56
|
const outputPath = parseFlag(args, "output");
|
|
57
|
+
const supervisorProfile = parseFlag(args, "supervisor-profile") ?? undefined;
|
|
58
|
+
const agentProfile = parseFlag(args, "agent-profile") ?? undefined;
|
|
50
59
|
const allowedTools = (
|
|
51
60
|
parseFlag(args, "allowed-tools") ?? "Bash,Read,Glob,Grep,Write,Edit"
|
|
52
61
|
).split(",");
|
|
62
|
+
const supervisorAllowedToolsRaw = parseFlag(args, "supervisor-allowed-tools");
|
|
63
|
+
const supervisorAllowedTools = supervisorAllowedToolsRaw
|
|
64
|
+
? supervisorAllowedToolsRaw.split(",")
|
|
65
|
+
: undefined;
|
|
53
66
|
|
|
54
|
-
const taskContent = readFileSync(
|
|
67
|
+
const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
|
|
55
68
|
|
|
56
69
|
// When --output is specified, stream text to stdout while writing NDJSON to file.
|
|
57
70
|
// Otherwise, write NDJSON directly to stdout (backwards-compatible).
|
|
@@ -73,6 +86,9 @@ export async function runSuperviseCommand(args) {
|
|
|
73
86
|
model,
|
|
74
87
|
maxTurns,
|
|
75
88
|
allowedTools,
|
|
89
|
+
supervisorAllowedTools,
|
|
90
|
+
supervisorProfile,
|
|
91
|
+
agentProfile,
|
|
76
92
|
});
|
|
77
93
|
|
|
78
94
|
const result = await supervisor.run(taskContent);
|
package/src/supervisor.js
CHANGED
|
@@ -1,25 +1,38 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* Supervisor — orchestrates a relay loop between an agent and a supervisor,
|
|
3
|
-
* both running as AgentRunner instances. The
|
|
4
|
-
*
|
|
3
|
+
* both running as AgentRunner instances. The supervisor receives the task first,
|
|
4
|
+
* introduces itself, and delegates work to the agent. The loop then alternates:
|
|
5
|
+
* agent → supervisor → agent.
|
|
5
6
|
*
|
|
6
7
|
* Follows OO+DI: constructor injection, factory function, tests bypass factory.
|
|
7
8
|
*/
|
|
8
9
|
|
|
9
10
|
import { PassThrough } from "node:stream";
|
|
10
11
|
import { createAgentRunner } from "./agent-runner.js";
|
|
12
|
+
import { TraceCollector } from "./trace-collector.js";
|
|
11
13
|
|
|
12
14
|
/**
|
|
13
|
-
* Check if the supervisor's response signals evaluation
|
|
14
|
-
*
|
|
15
|
-
*
|
|
15
|
+
* Check if the supervisor's response signals evaluation success.
|
|
16
|
+
* Matches EVALUATION_SUCCESSFUL anywhere in the text, tolerating markdown
|
|
17
|
+
* formatting (e.g. **EVALUATION_SUCCESSFUL**). Uses word boundaries to
|
|
18
|
+
* avoid matching inside longer identifiers.
|
|
16
19
|
* @param {string} text
|
|
17
20
|
* @returns {boolean}
|
|
18
21
|
*/
|
|
19
|
-
export function
|
|
20
|
-
return
|
|
22
|
+
export function isSuccessful(text) {
|
|
23
|
+
return /(?:^|[\s*_~`])EVALUATION_SUCCESSFUL(?:[\s*_~`.,!?]|$)/m.test(text);
|
|
21
24
|
}
|
|
22
25
|
|
|
26
|
+
/** System prompt appended for the supervisor runner in supervise mode. */
|
|
27
|
+
export const SUPERVISOR_SYSTEM_PROMPT =
|
|
28
|
+
"You supervise another AI agent through a relay — your output becomes the agent's next input. " +
|
|
29
|
+
"Guide the agent, answer its questions, and write EVALUATION_SUCCESSFUL when their task is complete.";
|
|
30
|
+
|
|
31
|
+
/** System prompt appended for the agent runner in supervise mode. */
|
|
32
|
+
export const AGENT_SYSTEM_PROMPT =
|
|
33
|
+
"You are being supervised by another AI agent. " +
|
|
34
|
+
"When requirements are ambiguous or you are uncertain, stop and ask a clarifying question before proceeding.";
|
|
35
|
+
|
|
23
36
|
export class Supervisor {
|
|
24
37
|
/**
|
|
25
38
|
* @param {object} deps
|
|
@@ -44,60 +57,88 @@ export class Supervisor {
|
|
|
44
57
|
|
|
45
58
|
/**
|
|
46
59
|
* Run the supervisor ↔ agent relay loop.
|
|
47
|
-
*
|
|
60
|
+
* The supervisor receives the task first, introduces itself, and delegates
|
|
61
|
+
* work to the agent. The loop then alternates: agent → supervisor → agent.
|
|
62
|
+
* @param {string} task - The initial task for the supervisor
|
|
48
63
|
* @returns {Promise<{success: boolean, turns: number}>}
|
|
49
64
|
*/
|
|
50
65
|
async run(task) {
|
|
51
|
-
// Turn 0:
|
|
52
|
-
this.currentSource = "
|
|
66
|
+
// Turn 0: Supervisor receives the task and introduces it to the agent
|
|
67
|
+
this.currentSource = "supervisor";
|
|
53
68
|
this.currentTurn = 0;
|
|
54
|
-
let
|
|
69
|
+
let supervisorResult = await this.supervisorRunner.run(task);
|
|
55
70
|
|
|
56
|
-
if (
|
|
71
|
+
if (supervisorResult.error) {
|
|
57
72
|
this.emitSummary({ success: false, turns: 0 });
|
|
58
73
|
return { success: false, turns: 0 };
|
|
59
74
|
}
|
|
60
75
|
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
76
|
+
// The supervisor's turn is fully complete (all tool calls executed) by the
|
|
77
|
+
// time we check the signal — no work is interrupted.
|
|
78
|
+
if (isSuccessful(supervisorResult.text)) {
|
|
79
|
+
this.emitSummary({ success: true, turns: 0 });
|
|
80
|
+
return { success: true, turns: 0 };
|
|
81
|
+
}
|
|
66
82
|
|
|
67
|
-
|
|
83
|
+
for (let turn = 1; turn <= this.maxTurns; turn++) {
|
|
84
|
+
// Supervisor's output becomes the agent's input
|
|
85
|
+
this.currentSource = "agent";
|
|
68
86
|
this.currentTurn = turn;
|
|
69
|
-
let
|
|
87
|
+
let agentResult;
|
|
70
88
|
if (turn === 1) {
|
|
71
|
-
|
|
89
|
+
agentResult = await this.agentRunner.run(supervisorResult.text);
|
|
72
90
|
} else {
|
|
73
|
-
|
|
91
|
+
agentResult = await this.agentRunner.resume(supervisorResult.text);
|
|
74
92
|
}
|
|
75
93
|
|
|
76
|
-
if (
|
|
94
|
+
if (agentResult.error) {
|
|
77
95
|
this.emitSummary({ success: false, turns: turn });
|
|
78
96
|
return { success: false, turns: turn };
|
|
79
97
|
}
|
|
80
98
|
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
}
|
|
99
|
+
// Build the full agent transcript from buffered NDJSON events so the
|
|
100
|
+
// supervisor sees tool calls and reasoning, not just the SDK result summary.
|
|
101
|
+
const agentTranscript = this.extractTranscript(this.agentRunner);
|
|
85
102
|
|
|
86
|
-
|
|
87
|
-
|
|
103
|
+
const supervisorPrompt =
|
|
104
|
+
`The agent reported:\n\n${agentTranscript}\n\n` +
|
|
105
|
+
`Review the agent's work and decide how to proceed.`;
|
|
106
|
+
|
|
107
|
+
this.currentSource = "supervisor";
|
|
88
108
|
this.currentTurn = turn;
|
|
89
|
-
|
|
109
|
+
supervisorResult = await this.supervisorRunner.resume(supervisorPrompt);
|
|
90
110
|
|
|
91
|
-
if (
|
|
111
|
+
if (supervisorResult.error) {
|
|
92
112
|
this.emitSummary({ success: false, turns: turn });
|
|
93
113
|
return { success: false, turns: turn };
|
|
94
114
|
}
|
|
115
|
+
|
|
116
|
+
// The supervisor's turn is fully complete — check for success signal.
|
|
117
|
+
if (isSuccessful(supervisorResult.text)) {
|
|
118
|
+
this.emitSummary({ success: true, turns: turn });
|
|
119
|
+
return { success: true, turns: turn };
|
|
120
|
+
}
|
|
95
121
|
}
|
|
96
122
|
|
|
97
123
|
this.emitSummary({ success: false, turns: this.maxTurns });
|
|
98
124
|
return { success: false, turns: this.maxTurns };
|
|
99
125
|
}
|
|
100
126
|
|
|
127
|
+
/**
|
|
128
|
+
* Extract a human-readable transcript from an AgentRunner's buffered output.
|
|
129
|
+
* Drains the buffer and replays events through a TraceCollector.
|
|
130
|
+
* @param {import("./agent-runner.js").AgentRunner} runner
|
|
131
|
+
* @returns {string}
|
|
132
|
+
*/
|
|
133
|
+
extractTranscript(runner) {
|
|
134
|
+
const lines = runner.drainOutput();
|
|
135
|
+
const collector = new TraceCollector();
|
|
136
|
+
for (const line of lines) {
|
|
137
|
+
collector.addLine(line);
|
|
138
|
+
}
|
|
139
|
+
return collector.toText() || "[The agent produced no output.]";
|
|
140
|
+
}
|
|
141
|
+
|
|
101
142
|
/**
|
|
102
143
|
* Emit a single NDJSON line tagged with the current source and turn.
|
|
103
144
|
* Called in real-time via the AgentRunner onLine callback.
|
|
@@ -138,6 +179,10 @@ export class Supervisor {
|
|
|
138
179
|
* @param {string} [deps.model] - Claude model identifier
|
|
139
180
|
* @param {number} [deps.maxTurns] - Maximum supervisor ↔ agent exchanges
|
|
140
181
|
* @param {string[]} [deps.allowedTools] - Tools the agent may use
|
|
182
|
+
* @param {string[]} [deps.supervisorAllowedTools] - Tools the supervisor may use (default: Bash, Read, Glob, Grep, Write, Edit)
|
|
183
|
+
* @param {string[]} [deps.supervisorDisallowedTools] - Tools to explicitly block from the supervisor
|
|
184
|
+
* @param {string} [deps.supervisorProfile] - Supervisor agent profile name
|
|
185
|
+
* @param {string} [deps.agentProfile] - Agent profile name
|
|
141
186
|
* @returns {Supervisor}
|
|
142
187
|
*/
|
|
143
188
|
export function createSupervisor({
|
|
@@ -148,6 +193,10 @@ export function createSupervisor({
|
|
|
148
193
|
model,
|
|
149
194
|
maxTurns,
|
|
150
195
|
allowedTools,
|
|
196
|
+
supervisorDisallowedTools,
|
|
197
|
+
supervisorAllowedTools,
|
|
198
|
+
supervisorProfile,
|
|
199
|
+
agentProfile,
|
|
151
200
|
}) {
|
|
152
201
|
// Forward-reference: onLine captures `supervisor` before construction completes.
|
|
153
202
|
// This is safe because onLine is only called during run(), after construction.
|
|
@@ -163,17 +212,45 @@ export function createSupervisor({
|
|
|
163
212
|
allowedTools,
|
|
164
213
|
onLine,
|
|
165
214
|
settingSources: ["project"],
|
|
215
|
+
agentProfile,
|
|
216
|
+
systemPrompt: {
|
|
217
|
+
type: "preset",
|
|
218
|
+
preset: "claude_code",
|
|
219
|
+
append: AGENT_SYSTEM_PROMPT,
|
|
220
|
+
},
|
|
166
221
|
});
|
|
167
222
|
|
|
223
|
+
// Block Task/TaskOutput so the supervisor cannot spawn its own sub-agents.
|
|
224
|
+
// The relay loop handles agent communication — letting the supervisor use
|
|
225
|
+
// Task would bypass the relay and produce an empty agent trace.
|
|
226
|
+
const defaultDisallowed = ["Task", "TaskOutput"];
|
|
227
|
+
const disallowedTools = supervisorDisallowedTools
|
|
228
|
+
? [...new Set([...defaultDisallowed, ...supervisorDisallowedTools])]
|
|
229
|
+
: defaultDisallowed;
|
|
230
|
+
|
|
168
231
|
const supervisorRunner = createAgentRunner({
|
|
169
232
|
cwd: supervisorCwd,
|
|
170
233
|
query,
|
|
171
234
|
output: new PassThrough(),
|
|
172
235
|
model,
|
|
173
236
|
maxTurns: 10,
|
|
174
|
-
allowedTools:
|
|
237
|
+
allowedTools: supervisorAllowedTools ?? [
|
|
238
|
+
"Bash",
|
|
239
|
+
"Read",
|
|
240
|
+
"Glob",
|
|
241
|
+
"Grep",
|
|
242
|
+
"Write",
|
|
243
|
+
"Edit",
|
|
244
|
+
],
|
|
245
|
+
disallowedTools,
|
|
175
246
|
onLine,
|
|
176
247
|
settingSources: ["project"],
|
|
248
|
+
agentProfile: supervisorProfile,
|
|
249
|
+
systemPrompt: {
|
|
250
|
+
type: "preset",
|
|
251
|
+
preset: "claude_code",
|
|
252
|
+
append: SUPERVISOR_SYSTEM_PROMPT,
|
|
253
|
+
},
|
|
177
254
|
});
|
|
178
255
|
|
|
179
256
|
supervisor = new Supervisor({
|
package/src/tee-writer.js
CHANGED
|
@@ -107,7 +107,6 @@ export class TeeWriter extends Writable {
|
|
|
107
107
|
if (parsed.event) {
|
|
108
108
|
if (parsed.source && parsed.source !== this.lastSource) {
|
|
109
109
|
this.lastSource = parsed.source;
|
|
110
|
-
this.textStream.write(`\n[${parsed.source}]\n`);
|
|
111
110
|
}
|
|
112
111
|
this.collector.addLine(JSON.stringify(parsed.event));
|
|
113
112
|
this.flushTurns();
|
|
@@ -119,15 +118,19 @@ export class TeeWriter extends Writable {
|
|
|
119
118
|
*/
|
|
120
119
|
flushTurns() {
|
|
121
120
|
const turns = this.collector.turns;
|
|
121
|
+
const prefix =
|
|
122
|
+
this.mode === "supervised" && this.lastSource
|
|
123
|
+
? `[${this.lastSource}] `
|
|
124
|
+
: "";
|
|
122
125
|
while (this.turnsEmitted < turns.length) {
|
|
123
126
|
const turn = turns[this.turnsEmitted++];
|
|
124
127
|
if (turn.role === "assistant") {
|
|
125
128
|
for (const block of turn.content) {
|
|
126
129
|
if (block.type === "text") {
|
|
127
|
-
this.textStream.write(block.text
|
|
130
|
+
this.textStream.write(`${prefix}${block.text}\n`);
|
|
128
131
|
} else if (block.type === "tool_use") {
|
|
129
132
|
const input = summarizeInput(block.input);
|
|
130
|
-
this.textStream.write(
|
|
133
|
+
this.textStream.write(`${prefix}> Tool: ${block.name} ${input}\n`);
|
|
131
134
|
}
|
|
132
135
|
}
|
|
133
136
|
}
|
package/test/supervisor.test.js
CHANGED
|
@@ -6,8 +6,10 @@ import {
|
|
|
6
6
|
AgentRunner,
|
|
7
7
|
Supervisor,
|
|
8
8
|
createSupervisor,
|
|
9
|
+
SUPERVISOR_SYSTEM_PROMPT,
|
|
10
|
+
AGENT_SYSTEM_PROMPT,
|
|
9
11
|
} from "@forwardimpact/libeval";
|
|
10
|
-
import {
|
|
12
|
+
import { isSuccessful } from "../src/supervisor.js";
|
|
11
13
|
|
|
12
14
|
/**
|
|
13
15
|
* Create a mock AgentRunner that yields pre-scripted responses.
|
|
@@ -61,26 +63,50 @@ function createMockRunner(responses, messages) {
|
|
|
61
63
|
return runner;
|
|
62
64
|
}
|
|
63
65
|
|
|
64
|
-
describe("
|
|
65
|
-
test("detects
|
|
66
|
-
assert.strictEqual(
|
|
66
|
+
describe("isSuccessful", () => {
|
|
67
|
+
test("detects EVALUATION_SUCCESSFUL on its own line", () => {
|
|
68
|
+
assert.strictEqual(isSuccessful("EVALUATION_SUCCESSFUL"), true);
|
|
67
69
|
assert.strictEqual(
|
|
68
|
-
|
|
70
|
+
isSuccessful("Some text\nEVALUATION_SUCCESSFUL\nMore text"),
|
|
69
71
|
true,
|
|
70
72
|
);
|
|
71
|
-
assert.strictEqual(
|
|
73
|
+
assert.strictEqual(isSuccessful("Done.\n\nEVALUATION_SUCCESSFUL"), true);
|
|
72
74
|
});
|
|
73
75
|
|
|
74
|
-
test("
|
|
75
|
-
assert.strictEqual(
|
|
76
|
-
assert.strictEqual(
|
|
77
|
-
assert.strictEqual(
|
|
76
|
+
test("tolerates markdown formatting around the signal", () => {
|
|
77
|
+
assert.strictEqual(isSuccessful("**EVALUATION_SUCCESSFUL**"), true);
|
|
78
|
+
assert.strictEqual(isSuccessful("*EVALUATION_SUCCESSFUL*"), true);
|
|
79
|
+
assert.strictEqual(isSuccessful("__EVALUATION_SUCCESSFUL__"), true);
|
|
80
|
+
assert.strictEqual(isSuccessful("_EVALUATION_SUCCESSFUL_"), true);
|
|
81
|
+
assert.strictEqual(isSuccessful("`EVALUATION_SUCCESSFUL`"), true);
|
|
82
|
+
assert.strictEqual(
|
|
83
|
+
isSuccessful(
|
|
84
|
+
"Good work.\n\n**EVALUATION_SUCCESSFUL**\n\nNow filing issues.",
|
|
85
|
+
),
|
|
86
|
+
true,
|
|
87
|
+
);
|
|
88
|
+
});
|
|
89
|
+
|
|
90
|
+
test("matches EVALUATION_SUCCESSFUL anywhere in text", () => {
|
|
91
|
+
assert.strictEqual(isSuccessful("not EVALUATION_SUCCESSFUL yet"), true);
|
|
92
|
+
assert.strictEqual(
|
|
93
|
+
isSuccessful("The agent is EVALUATION_SUCCESSFUL done"),
|
|
94
|
+
true,
|
|
95
|
+
);
|
|
96
|
+
assert.strictEqual(
|
|
97
|
+
isSuccessful("Great work! EVALUATION_SUCCESSFUL. Now filing issues."),
|
|
98
|
+
true,
|
|
99
|
+
);
|
|
78
100
|
});
|
|
79
101
|
|
|
80
102
|
test("does not match empty or unrelated text", () => {
|
|
81
|
-
assert.strictEqual(
|
|
82
|
-
assert.strictEqual(
|
|
83
|
-
assert.strictEqual(
|
|
103
|
+
assert.strictEqual(isSuccessful(""), false);
|
|
104
|
+
assert.strictEqual(isSuccessful("All done!"), false);
|
|
105
|
+
assert.strictEqual(isSuccessful("DONE"), false);
|
|
106
|
+
});
|
|
107
|
+
|
|
108
|
+
test("does not match old EVALUATION_COMPLETE signal", () => {
|
|
109
|
+
assert.strictEqual(isSuccessful("EVALUATION_COMPLETE"), false);
|
|
84
110
|
});
|
|
85
111
|
});
|
|
86
112
|
|
|
@@ -118,13 +144,35 @@ describe("Supervisor", () => {
|
|
|
118
144
|
);
|
|
119
145
|
});
|
|
120
146
|
|
|
121
|
-
test("completes on
|
|
147
|
+
test("completes on EVALUATION_SUCCESSFUL from supervisor at turn 0", async () => {
|
|
148
|
+
const agentRunner = createMockRunner([]);
|
|
149
|
+
|
|
150
|
+
const supervisorRunner = createMockRunner([
|
|
151
|
+
{ text: "EVALUATION_SUCCESSFUL" },
|
|
152
|
+
]);
|
|
153
|
+
|
|
154
|
+
const output = new PassThrough();
|
|
155
|
+
const supervisor = new Supervisor({
|
|
156
|
+
agentRunner,
|
|
157
|
+
supervisorRunner,
|
|
158
|
+
output,
|
|
159
|
+
maxTurns: 10,
|
|
160
|
+
});
|
|
161
|
+
|
|
162
|
+
const result = await supervisor.run("Install stuff");
|
|
163
|
+
|
|
164
|
+
assert.strictEqual(result.success, true);
|
|
165
|
+
assert.strictEqual(result.turns, 0);
|
|
166
|
+
});
|
|
167
|
+
|
|
168
|
+
test("completes after one agent turn", async () => {
|
|
122
169
|
const agentRunner = createMockRunner([
|
|
123
170
|
{ text: "I installed the packages." },
|
|
124
171
|
]);
|
|
125
172
|
|
|
126
173
|
const supervisorRunner = createMockRunner([
|
|
127
|
-
{ text: "
|
|
174
|
+
{ text: "Welcome! Please install the packages." },
|
|
175
|
+
{ text: "Good work.\n\nEVALUATION_SUCCESSFUL" },
|
|
128
176
|
]);
|
|
129
177
|
|
|
130
178
|
const output = new PassThrough();
|
|
@@ -149,9 +197,10 @@ describe("Supervisor", () => {
|
|
|
149
197
|
]);
|
|
150
198
|
|
|
151
199
|
const supervisorRunner = createMockRunner([
|
|
200
|
+
{ text: "Here is your task. Do the work." },
|
|
152
201
|
{ text: "Keep going, you need to do more." },
|
|
153
202
|
{ text: "Almost there, continue." },
|
|
154
|
-
{ text: "
|
|
203
|
+
{ text: "EVALUATION_SUCCESSFUL" },
|
|
155
204
|
]);
|
|
156
205
|
|
|
157
206
|
const output = new PassThrough();
|
|
@@ -169,14 +218,14 @@ describe("Supervisor", () => {
|
|
|
169
218
|
});
|
|
170
219
|
|
|
171
220
|
test("enforces maxTurns limit", async () => {
|
|
172
|
-
//
|
|
221
|
+
// Supervisor starts, agent responds each turn, supervisor never says done
|
|
173
222
|
const agentRunner = createMockRunner([
|
|
174
|
-
{ text: "Turn 0" },
|
|
175
223
|
{ text: "Turn 1" },
|
|
176
224
|
{ text: "Turn 2" },
|
|
177
225
|
]);
|
|
178
226
|
|
|
179
227
|
const supervisorRunner = createMockRunner([
|
|
228
|
+
{ text: "Start working." },
|
|
180
229
|
{ text: "Continue." },
|
|
181
230
|
{ text: "Continue." },
|
|
182
231
|
]);
|
|
@@ -196,16 +245,17 @@ describe("Supervisor", () => {
|
|
|
196
245
|
});
|
|
197
246
|
|
|
198
247
|
test("output contains tagged lines with correct source and turn", async () => {
|
|
199
|
-
const agentMessages = [[{ type: "assistant", content: "Working" }]];
|
|
200
248
|
const supervisorMessages = [
|
|
201
|
-
[{ type: "assistant", content: "
|
|
249
|
+
[{ type: "assistant", content: "Go ahead" }],
|
|
250
|
+
[{ type: "assistant", content: "EVALUATION_SUCCESSFUL" }],
|
|
202
251
|
];
|
|
252
|
+
const agentMessages = [[{ type: "assistant", content: "Working" }]];
|
|
203
253
|
|
|
204
|
-
const agentRunner = createMockRunner([{ text: "Working" }], agentMessages);
|
|
205
254
|
const supervisorRunner = createMockRunner(
|
|
206
|
-
[{ text: "
|
|
255
|
+
[{ text: "Go ahead" }, { text: "EVALUATION_SUCCESSFUL" }],
|
|
207
256
|
supervisorMessages,
|
|
208
257
|
);
|
|
258
|
+
const agentRunner = createMockRunner([{ text: "Working" }], agentMessages);
|
|
209
259
|
|
|
210
260
|
const output = new PassThrough();
|
|
211
261
|
const supervisor = new Supervisor({
|
|
@@ -225,19 +275,19 @@ describe("Supervisor", () => {
|
|
|
225
275
|
.split("\n")
|
|
226
276
|
.filter((l) => l.length > 0);
|
|
227
277
|
|
|
228
|
-
// Should have:
|
|
229
|
-
assert.ok(lines.length >=
|
|
230
|
-
|
|
231
|
-
const agentLine = JSON.parse(lines[0]);
|
|
232
|
-
assert.strictEqual(agentLine.source, "agent");
|
|
233
|
-
assert.strictEqual(agentLine.turn, 0);
|
|
234
|
-
assert.ok("event" in agentLine);
|
|
278
|
+
// Should have: supervisor turn 0, agent turn 1, supervisor turn 1, orchestrator summary
|
|
279
|
+
assert.ok(lines.length >= 4);
|
|
235
280
|
|
|
236
|
-
const supervisorLine = JSON.parse(lines[
|
|
281
|
+
const supervisorLine = JSON.parse(lines[0]);
|
|
237
282
|
assert.strictEqual(supervisorLine.source, "supervisor");
|
|
238
|
-
assert.strictEqual(supervisorLine.turn,
|
|
283
|
+
assert.strictEqual(supervisorLine.turn, 0);
|
|
239
284
|
assert.ok("event" in supervisorLine);
|
|
240
285
|
|
|
286
|
+
const agentLine = JSON.parse(lines[1]);
|
|
287
|
+
assert.strictEqual(agentLine.source, "agent");
|
|
288
|
+
assert.strictEqual(agentLine.turn, 1);
|
|
289
|
+
assert.ok("event" in agentLine);
|
|
290
|
+
|
|
241
291
|
const summaryLine = JSON.parse(lines[lines.length - 1]);
|
|
242
292
|
assert.strictEqual(summaryLine.source, "orchestrator");
|
|
243
293
|
assert.strictEqual(summaryLine.type, "summary");
|
|
@@ -250,11 +300,14 @@ describe("Supervisor", () => {
|
|
|
250
300
|
source: "sdk-internal",
|
|
251
301
|
content: "test",
|
|
252
302
|
};
|
|
253
|
-
const agentRunner = createMockRunner([{ text: "Done" }], [[sourceEvent]]);
|
|
254
303
|
const supervisorRunner = createMockRunner(
|
|
255
|
-
[{ text: "
|
|
256
|
-
[
|
|
304
|
+
[{ text: "Go" }, { text: "EVALUATION_SUCCESSFUL" }],
|
|
305
|
+
[
|
|
306
|
+
[{ type: "assistant", content: "Go" }],
|
|
307
|
+
[{ type: "assistant", content: "ok" }],
|
|
308
|
+
],
|
|
257
309
|
);
|
|
310
|
+
const agentRunner = createMockRunner([{ text: "Done" }], [[sourceEvent]]);
|
|
258
311
|
|
|
259
312
|
const output = new PassThrough();
|
|
260
313
|
const supervisor = new Supervisor({
|
|
@@ -274,27 +327,30 @@ describe("Supervisor", () => {
|
|
|
274
327
|
.split("\n")
|
|
275
328
|
.filter((l) => l.length > 0);
|
|
276
329
|
|
|
277
|
-
|
|
330
|
+
// First line is supervisor turn 0, second is agent turn 1
|
|
331
|
+
const tagged = JSON.parse(lines[1]);
|
|
278
332
|
// The original event's `source` field is preserved inside `event`
|
|
279
333
|
assert.strictEqual(tagged.source, "agent");
|
|
280
334
|
assert.strictEqual(tagged.event.source, "sdk-internal");
|
|
281
335
|
});
|
|
282
336
|
|
|
283
|
-
test("emits
|
|
284
|
-
const
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
337
|
+
test("emits supervisor output and summary when supervisor errors on turn 0", async () => {
|
|
338
|
+
const supervisorMessages = [
|
|
339
|
+
[{ type: "assistant", content: "Starting..." }],
|
|
340
|
+
];
|
|
341
|
+
const supervisorRunner = createMockRunner(
|
|
342
|
+
[{ text: "Starting...", success: false }],
|
|
343
|
+
supervisorMessages,
|
|
288
344
|
);
|
|
289
345
|
|
|
290
346
|
// Override run to simulate an error return
|
|
291
|
-
const origRun =
|
|
292
|
-
|
|
293
|
-
const result = await origRun.call(
|
|
347
|
+
const origRun = supervisorRunner.run;
|
|
348
|
+
supervisorRunner.run = async (task) => {
|
|
349
|
+
const result = await origRun.call(supervisorRunner, task);
|
|
294
350
|
return { ...result, error: new Error("Process exited with code 1") };
|
|
295
351
|
};
|
|
296
352
|
|
|
297
|
-
const
|
|
353
|
+
const agentRunner = createMockRunner([]);
|
|
298
354
|
|
|
299
355
|
const output = new PassThrough();
|
|
300
356
|
const supervisor = new Supervisor({
|
|
@@ -311,18 +367,18 @@ describe("Supervisor", () => {
|
|
|
311
367
|
assert.strictEqual(result.success, false);
|
|
312
368
|
assert.strictEqual(result.turns, 0);
|
|
313
369
|
|
|
314
|
-
// Output should still contain the
|
|
370
|
+
// Output should still contain the supervisor's buffered lines + summary
|
|
315
371
|
const data = output.read()?.toString() ?? "";
|
|
316
372
|
const lines = data
|
|
317
373
|
.trim()
|
|
318
374
|
.split("\n")
|
|
319
375
|
.filter((l) => l.length > 0);
|
|
320
376
|
|
|
321
|
-
assert.ok(lines.length >= 2, "Expected at least
|
|
377
|
+
assert.ok(lines.length >= 2, "Expected at least supervisor line + summary");
|
|
322
378
|
|
|
323
|
-
const
|
|
324
|
-
assert.strictEqual(
|
|
325
|
-
assert.strictEqual(
|
|
379
|
+
const supervisorLine = JSON.parse(lines[0]);
|
|
380
|
+
assert.strictEqual(supervisorLine.source, "supervisor");
|
|
381
|
+
assert.strictEqual(supervisorLine.turn, 0);
|
|
326
382
|
|
|
327
383
|
const summaryLine = JSON.parse(lines[lines.length - 1]);
|
|
328
384
|
assert.strictEqual(summaryLine.source, "orchestrator");
|
|
@@ -339,4 +395,99 @@ describe("Supervisor", () => {
|
|
|
339
395
|
});
|
|
340
396
|
assert.ok(supervisor instanceof Supervisor);
|
|
341
397
|
});
|
|
398
|
+
|
|
399
|
+
test("createSupervisor uses default supervisor tools when none specified", () => {
|
|
400
|
+
const supervisor = createSupervisor({
|
|
401
|
+
supervisorCwd: "/tmp/sup",
|
|
402
|
+
agentCwd: "/tmp/agent",
|
|
403
|
+
query: async function* () {},
|
|
404
|
+
output: new PassThrough(),
|
|
405
|
+
});
|
|
406
|
+
assert.deepStrictEqual(supervisor.supervisorRunner.allowedTools, [
|
|
407
|
+
"Bash",
|
|
408
|
+
"Read",
|
|
409
|
+
"Glob",
|
|
410
|
+
"Grep",
|
|
411
|
+
"Write",
|
|
412
|
+
"Edit",
|
|
413
|
+
]);
|
|
414
|
+
});
|
|
415
|
+
|
|
416
|
+
test("createSupervisor passes custom supervisor tools", () => {
|
|
417
|
+
const supervisor = createSupervisor({
|
|
418
|
+
supervisorCwd: "/tmp/sup",
|
|
419
|
+
agentCwd: "/tmp/agent",
|
|
420
|
+
query: async function* () {},
|
|
421
|
+
output: new PassThrough(),
|
|
422
|
+
supervisorAllowedTools: ["Read", "Glob", "Grep"],
|
|
423
|
+
});
|
|
424
|
+
assert.deepStrictEqual(supervisor.supervisorRunner.allowedTools, [
|
|
425
|
+
"Read",
|
|
426
|
+
"Glob",
|
|
427
|
+
"Grep",
|
|
428
|
+
]);
|
|
429
|
+
});
|
|
430
|
+
|
|
431
|
+
test("createSupervisor wires system prompts to both runners", () => {
|
|
432
|
+
const supervisor = createSupervisor({
|
|
433
|
+
supervisorCwd: "/tmp/sup",
|
|
434
|
+
agentCwd: "/tmp/agent",
|
|
435
|
+
query: async function* () {},
|
|
436
|
+
output: new PassThrough(),
|
|
437
|
+
});
|
|
438
|
+
|
|
439
|
+
assert.deepStrictEqual(supervisor.agentRunner.systemPrompt, {
|
|
440
|
+
type: "preset",
|
|
441
|
+
preset: "claude_code",
|
|
442
|
+
append: AGENT_SYSTEM_PROMPT,
|
|
443
|
+
});
|
|
444
|
+
assert.deepStrictEqual(supervisor.supervisorRunner.systemPrompt, {
|
|
445
|
+
type: "preset",
|
|
446
|
+
preset: "claude_code",
|
|
447
|
+
append: SUPERVISOR_SYSTEM_PROMPT,
|
|
448
|
+
});
|
|
449
|
+
});
|
|
450
|
+
|
|
451
|
+
test("createSupervisor blocks Task and TaskOutput on supervisor by default", () => {
|
|
452
|
+
const supervisor = createSupervisor({
|
|
453
|
+
supervisorCwd: "/tmp/sup",
|
|
454
|
+
agentCwd: "/tmp/agent",
|
|
455
|
+
query: async function* () {},
|
|
456
|
+
output: new PassThrough(),
|
|
457
|
+
});
|
|
458
|
+
assert.deepStrictEqual(supervisor.supervisorRunner.disallowedTools, [
|
|
459
|
+
"Task",
|
|
460
|
+
"TaskOutput",
|
|
461
|
+
]);
|
|
462
|
+
// Agent should not have disallowed tools
|
|
463
|
+
assert.deepStrictEqual(supervisor.agentRunner.disallowedTools, []);
|
|
464
|
+
});
|
|
465
|
+
|
|
466
|
+
test("createSupervisor merges custom supervisorDisallowedTools with defaults", () => {
|
|
467
|
+
const supervisor = createSupervisor({
|
|
468
|
+
supervisorCwd: "/tmp/sup",
|
|
469
|
+
agentCwd: "/tmp/agent",
|
|
470
|
+
query: async function* () {},
|
|
471
|
+
output: new PassThrough(),
|
|
472
|
+
supervisorDisallowedTools: ["WebSearch", "Task"],
|
|
473
|
+
});
|
|
474
|
+
const disallowed = supervisor.supervisorRunner.disallowedTools;
|
|
475
|
+
assert.ok(disallowed.includes("Task"));
|
|
476
|
+
assert.ok(disallowed.includes("TaskOutput"));
|
|
477
|
+
assert.ok(disallowed.includes("WebSearch"));
|
|
478
|
+
// No duplicates
|
|
479
|
+
assert.strictEqual(disallowed.length, new Set(disallowed).size);
|
|
480
|
+
});
|
|
481
|
+
|
|
482
|
+
test("system prompt constants are non-empty strings", () => {
|
|
483
|
+
assert.ok(typeof SUPERVISOR_SYSTEM_PROMPT === "string");
|
|
484
|
+
assert.ok(typeof AGENT_SYSTEM_PROMPT === "string");
|
|
485
|
+
assert.ok(SUPERVISOR_SYSTEM_PROMPT.length > 0);
|
|
486
|
+
assert.ok(AGENT_SYSTEM_PROMPT.length > 0);
|
|
487
|
+
});
|
|
488
|
+
|
|
489
|
+
test("SUPERVISOR_SYSTEM_PROMPT explains relay mechanism", () => {
|
|
490
|
+
assert.ok(SUPERVISOR_SYSTEM_PROMPT.includes("relay"));
|
|
491
|
+
assert.ok(SUPERVISOR_SYSTEM_PROMPT.includes("EVALUATION_SUCCESSFUL"));
|
|
492
|
+
});
|
|
342
493
|
});
|
package/test/tee-writer.test.js
CHANGED
|
@@ -187,11 +187,9 @@ describe("TeeWriter", () => {
|
|
|
187
187
|
assert.strictEqual(fileLines.length, 3);
|
|
188
188
|
assert.strictEqual(JSON.parse(fileLines[0]).source, "agent");
|
|
189
189
|
|
|
190
|
-
// Text should show source
|
|
191
|
-
assert.ok(textData.includes("[agent]"));
|
|
192
|
-
assert.ok(textData.includes("
|
|
193
|
-
assert.ok(textData.includes("[supervisor]"));
|
|
194
|
-
assert.ok(textData.includes("Looks good"));
|
|
190
|
+
// Text should show source prefixes on content lines
|
|
191
|
+
assert.ok(textData.includes("[agent] Working on it"));
|
|
192
|
+
assert.ok(textData.includes("[supervisor] Looks good"));
|
|
195
193
|
assert.ok(textData.includes("Evaluation completed after 1 turns"));
|
|
196
194
|
});
|
|
197
195
|
|
|
@@ -254,9 +252,9 @@ describe("TeeWriter", () => {
|
|
|
254
252
|
await writeLines(writer, events);
|
|
255
253
|
|
|
256
254
|
const textData = collect(textStream);
|
|
257
|
-
// [agent]
|
|
258
|
-
|
|
259
|
-
assert.
|
|
255
|
+
// [agent] prefix should appear on each content line
|
|
256
|
+
assert.ok(textData.includes("[agent] Step 1"));
|
|
257
|
+
assert.ok(textData.includes("[agent] Step 2"));
|
|
260
258
|
});
|
|
261
259
|
|
|
262
260
|
test("handles partial lines across chunks", async () => {
|