@forwardimpact/libeval 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/fit-eval.js +26 -1
- package/index.js +3 -0
- package/package.json +6 -3
- package/src/agent-runner.js +154 -0
- package/src/commands/run.js +76 -0
- package/src/commands/supervise.js +86 -0
- package/src/commands/tee.js +13 -75
- package/src/supervisor.js +186 -0
- package/src/tee-writer.js +157 -0
- package/test/agent-runner.test.js +317 -0
- package/test/supervisor.test.js +342 -0
- package/test/tee-writer.test.js +326 -0
package/bin/fit-eval.js
CHANGED
|
@@ -1,11 +1,15 @@
|
|
|
1
|
-
#!/usr/bin/env
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
2
|
|
|
3
3
|
import { runOutputCommand } from "../src/commands/output.js";
|
|
4
4
|
import { runTeeCommand } from "../src/commands/tee.js";
|
|
5
|
+
import { runRunCommand } from "../src/commands/run.js";
|
|
6
|
+
import { runSuperviseCommand } from "../src/commands/supervise.js";
|
|
5
7
|
|
|
6
8
|
const COMMANDS = {
|
|
7
9
|
output: runOutputCommand,
|
|
8
10
|
tee: runTeeCommand,
|
|
11
|
+
run: runRunCommand,
|
|
12
|
+
supervise: runSuperviseCommand,
|
|
9
13
|
};
|
|
10
14
|
|
|
11
15
|
const HELP_TEXT = `
|
|
@@ -17,6 +21,25 @@ Usage:
|
|
|
17
21
|
Commands:
|
|
18
22
|
output [--format=json|text] Process trace and output formatted result
|
|
19
23
|
tee [output.ndjson] Stream text to stdout, optionally save raw NDJSON
|
|
24
|
+
run [options] Run a single agent via the Claude Agent SDK
|
|
25
|
+
supervise [options] Run a supervised agent ↔ supervisor relay loop
|
|
26
|
+
|
|
27
|
+
Run options:
|
|
28
|
+
--task=PATH Path to task file (required)
|
|
29
|
+
--cwd=DIR Agent working directory (default: .)
|
|
30
|
+
--model=MODEL Claude model to use (default: opus)
|
|
31
|
+
--max-turns=N Maximum agentic turns (default: 50)
|
|
32
|
+
--output=PATH Write NDJSON trace to file (default: stdout)
|
|
33
|
+
--allowed-tools=LIST Comma-separated tools (default: Bash,Read,Glob,Grep,Write,Edit)
|
|
34
|
+
|
|
35
|
+
Supervise options:
|
|
36
|
+
--task=PATH Path to task file (required)
|
|
37
|
+
--supervisor-cwd=DIR Supervisor working directory (default: .)
|
|
38
|
+
--agent-cwd=DIR Agent working directory (default: temp directory)
|
|
39
|
+
--model=MODEL Claude model to use (default: opus)
|
|
40
|
+
--max-turns=N Maximum supervisor ↔ agent exchanges (default: 20)
|
|
41
|
+
--output=PATH Write NDJSON trace to file (default: stdout)
|
|
42
|
+
--allowed-tools=LIST Comma-separated tools for agent (default: Bash,Read,Glob,Grep,Write,Edit)
|
|
20
43
|
|
|
21
44
|
Options:
|
|
22
45
|
--help Show this help message
|
|
@@ -27,6 +50,8 @@ Examples:
|
|
|
27
50
|
fit-eval output --format=json < trace.ndjson
|
|
28
51
|
fit-eval tee < trace.ndjson
|
|
29
52
|
fit-eval tee output.ndjson < trace.ndjson
|
|
53
|
+
fit-eval run --task=.github/tasks/security-audit.md --model=opus
|
|
54
|
+
fit-eval supervise --task=scenarios/guide-setup/task.md --supervisor-cwd=.
|
|
30
55
|
`.trim();
|
|
31
56
|
|
|
32
57
|
async function main() {
|
package/index.js
CHANGED
|
@@ -1 +1,4 @@
|
|
|
1
1
|
export { TraceCollector, createTraceCollector } from "./src/trace-collector.js";
|
|
2
|
+
export { AgentRunner, createAgentRunner } from "./src/agent-runner.js";
|
|
3
|
+
export { Supervisor, createSupervisor } from "./src/supervisor.js";
|
|
4
|
+
export { TeeWriter, createTeeWriter } from "./src/tee-writer.js";
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@forwardimpact/libeval",
|
|
3
|
-
"version": "0.1.0",
|
|
3
|
+
"version": "0.1.2",
|
|
4
4
|
"description": "Process Claude Code stream-json output into structured traces",
|
|
5
5
|
"license": "Apache-2.0",
|
|
6
6
|
"author": "D. Olsson <hi@senzilla.io>",
|
|
@@ -10,10 +10,13 @@
|
|
|
10
10
|
"fit-eval": "./bin/fit-eval.js"
|
|
11
11
|
},
|
|
12
12
|
"engines": {
|
|
13
|
-
"
|
|
13
|
+
"bun": ">=1.2.0"
|
|
14
14
|
},
|
|
15
15
|
"scripts": {
|
|
16
|
-
"test": "node --test test/*.test.js"
|
|
16
|
+
"test": "bun run node --test test/*.test.js"
|
|
17
|
+
},
|
|
18
|
+
"dependencies": {
|
|
19
|
+
"@anthropic-ai/claude-agent-sdk": "^0.1.0"
|
|
17
20
|
},
|
|
18
21
|
"publishConfig": {
|
|
19
22
|
"access": "public"
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* AgentRunner — runs a single Claude Agent SDK session and emits raw NDJSON
|
|
3
|
+
* events to an output stream. Building block for both `fit-eval run` and
|
|
4
|
+
* `fit-eval supervise`.
|
|
5
|
+
*
|
|
6
|
+
* Follows OO+DI: constructor injection, factory function, tests bypass factory.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
export class AgentRunner {
  /**
   * @param {object} deps
   * @param {string} deps.cwd - Agent working directory
   * @param {function} deps.query - SDK query function (injected for testing)
   * @param {import("stream").Writable} deps.output - Stream to emit NDJSON to
   * @param {string} [deps.model] - Claude model identifier (default: "opus")
   * @param {number} [deps.maxTurns] - Maximum agentic turns (default: 50)
   * @param {string[]} [deps.allowedTools] - Tools the agent may use
   * @param {string} [deps.permissionMode] - SDK permission mode (default: "bypassPermissions")
   * @param {function} [deps.onLine] - Callback invoked with each NDJSON line as it's produced
   * @param {string[]} [deps.settingSources] - SDK setting sources (e.g. ['project'] to load CLAUDE.md)
   * @throws {Error} When cwd, query or output is missing
   */
  constructor({
    cwd,
    query,
    output,
    model,
    maxTurns,
    allowedTools,
    permissionMode,
    onLine,
    settingSources,
  }) {
    if (!cwd) throw new Error("cwd is required");
    if (!query) throw new Error("query is required");
    if (!output) throw new Error("output is required");
    this.cwd = cwd;
    this.query = query;
    this.output = output;
    this.model = model ?? "opus";
    this.maxTurns = maxTurns ?? 50;
    this.allowedTools = allowedTools ?? [
      "Bash",
      "Read",
      "Glob",
      "Grep",
      "Write",
      "Edit",
    ];
    this.permissionMode = permissionMode ?? "bypassPermissions";
    this.onLine = onLine ?? null;
    this.settingSources = settingSources ?? [];
    this.sessionId = null;
    this.buffer = [];
  }

  /**
   * Run a new agent session with the given task.
   * @param {string} task - The task prompt
   * @returns {Promise<{success: boolean, text: string, sessionId: string|null, error: Error|null}>}
   *   `error` holds whatever the query stream threw (null otherwise); it can be
   *   set even when `success` is true if the stream failed after the result.
   */
  async run(task) {
    const { success, text, error } = await this.#stream(
      task,
      this.#sessionOptions(),
    );
    return { success, text, sessionId: this.sessionId, error };
  }

  /**
   * Resume an existing session with a follow-up prompt.
   *
   * The follow-up is issued with the same options as the initial run (cwd,
   * model, tools, permission mode, setting sources) plus `resume`, so the
   * resumed turn executes under the configuration this runner was built with
   * rather than the SDK defaults.
   *
   * @param {string} prompt - The follow-up prompt
   * @returns {Promise<{success: boolean, text: string, error: Error|null}>}
   */
  async resume(prompt) {
    return this.#stream(prompt, {
      ...this.#sessionOptions(),
      resume: this.sessionId,
    });
  }

  /**
   * Drain buffered output lines. Used by Supervisor to tag and re-emit lines.
   * @returns {string[]} Lines buffered since the previous drain
   */
  drainOutput() {
    const lines = this.buffer;
    this.buffer = [];
    return lines;
  }

  /**
   * Build the SDK options shared by run() and resume().
   * @returns {object}
   */
  #sessionOptions() {
    return {
      cwd: this.cwd,
      allowedTools: this.allowedTools,
      maxTurns: this.maxTurns,
      model: this.model,
      permissionMode: this.permissionMode,
      allowDangerouslySkipPermissions: true,
      settingSources: this.settingSources,
    };
  }

  /**
   * Consume one query stream: write every message as an NDJSON line to the
   * output, buffer it, notify onLine, and record the session id and result.
   * @param {string} prompt - Prompt passed to the SDK query function
   * @param {object} options - Options passed to the SDK query function
   * @returns {Promise<{success: boolean, text: string, error: Error|null}>}
   */
  async #stream(prompt, options) {
    let text = "";
    let stopReason = null;
    let error = null;

    try {
      for await (const message of this.query({ prompt, options })) {
        const line = JSON.stringify(message);
        this.output.write(`${line}\n`);
        this.buffer.push(line);
        if (this.onLine) this.onLine(line);

        if (message.type === "system" && message.subtype === "init") {
          // Keep the latest id the SDK reports; an init message without one
          // must not erase the id a later resume() depends on.
          this.sessionId = message.session_id ?? this.sessionId;
        } else if (message.type === "result") {
          text = message.result ?? "";
          stopReason = message.subtype;
        }
      }
    } catch (err) {
      // Captured rather than rethrown so the caller can decide: a result that
      // arrived before the failure is still reported through `success`.
      error = err;
    }

    return { success: stopReason === "success", text, error };
  }
}
|
|
146
|
+
|
|
147
|
+
/**
 * Build an AgentRunner from the given dependencies.
 *
 * Production code goes through this factory; tests construct AgentRunner
 * directly with stubbed dependencies.
 *
 * @param {object} deps - Forwarded unchanged to the AgentRunner constructor
 * @returns {AgentRunner}
 */
export function createAgentRunner(deps) {
  const runner = new AgentRunner(deps);
  return runner;
}
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
import { readFileSync, createWriteStream } from "node:fs";
|
|
2
|
+
import { resolve } from "node:path";
|
|
3
|
+
import { createAgentRunner } from "../agent-runner.js";
|
|
4
|
+
import { createTeeWriter } from "../tee-writer.js";
|
|
5
|
+
|
|
6
|
+
/**
 * Look up a command-line flag given as `--name=value` or `--name value`.
 *
 * Arguments are scanned left to right and the first one matching either form
 * wins. A bare `--name` in the last position has no value and is ignored.
 *
 * @param {string[]} args - Raw command arguments
 * @param {string} name - Flag name without the leading `--`
 * @returns {string|undefined} The flag value, or undefined when absent
 */
function parseFlag(args, name) {
  const bare = `--${name}`;
  const inline = `${bare}=`;

  for (const [index, arg] of args.entries()) {
    if (arg.startsWith(inline)) {
      return arg.slice(inline.length);
    }
    const hasFollower = index + 1 < args.length;
    if (hasFollower && arg === bare) {
      return args[index + 1];
    }
  }

  return undefined;
}
|
|
20
|
+
|
|
21
|
+
/**
 * Run command — execute a single agent via the Claude Agent SDK.
 *
 * Usage: fit-eval run [options]
 *
 * Options:
 *   --task=PATH           Path to task file (required)
 *   --cwd=DIR             Agent working directory (default: .)
 *   --model=MODEL         Claude model to use (default: opus)
 *   --max-turns=N         Maximum agentic turns (default: 50)
 *   --output=PATH         Write NDJSON trace to file (default: stdout)
 *   --allowed-tools=LIST  Comma-separated tools (default: Bash,Read,Glob,Grep,Write,Edit)
 *
 * Terminates the process: exit code 0 when the agent reports success,
 * 1 otherwise.
 *
 * @param {string[]} args - Command arguments
 * @throws {Error} When --task is missing or --max-turns is not an integer
 */
export async function runRunCommand(args) {
  const task = parseFlag(args, "task");
  if (!task) throw new Error("--task is required");

  const cwd = resolve(parseFlag(args, "cwd") ?? ".");
  const model = parseFlag(args, "model") ?? "opus";

  // Reject unparsable values up front instead of forwarding NaN to the SDK.
  const maxTurns = Number.parseInt(parseFlag(args, "max-turns") ?? "50", 10);
  if (Number.isNaN(maxTurns)) {
    throw new Error("--max-turns must be an integer");
  }

  const outputPath = parseFlag(args, "output");

  // Tolerate "Bash, Read" style lists: trim each name and drop empty entries.
  const allowedTools = (
    parseFlag(args, "allowed-tools") ?? "Bash,Read,Glob,Grep,Write,Edit"
  )
    .split(",")
    .map((tool) => tool.trim())
    .filter((tool) => tool !== "");

  const taskContent = readFileSync(task, "utf8");

  // When --output is specified, stream text to stdout while writing NDJSON to file.
  // Otherwise, write NDJSON directly to stdout (backwards-compatible).
  const fileStream = outputPath ? createWriteStream(outputPath) : null;
  const output = fileStream
    ? createTeeWriter({ fileStream, textStream: process.stdout, mode: "raw" })
    : process.stdout;

  const { query } = await import("@anthropic-ai/claude-agent-sdk");
  const runner = createAgentRunner({
    cwd,
    query,
    output,
    model,
    maxTurns,
    allowedTools,
    settingSources: ["project"],
  });

  const result = await runner.run(taskContent);

  // Finish the tee first, then the underlying file, before exiting.
  if (fileStream) {
    await new Promise((r) => output.end(r));
    await new Promise((r) => fileStream.end(r));
  }

  // The runner captures stream errors instead of throwing; report them on
  // stderr so a failing run explains itself without polluting the trace on
  // stdout.
  if (result.error) {
    const reason = result.error?.message ?? String(result.error);
    process.stderr.write(`fit-eval run: ${reason}\n`);
  }

  process.exit(result.success ? 0 : 1);
}
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
import { readFileSync, createWriteStream, mkdtempSync } from "node:fs";
|
|
2
|
+
import { resolve, join } from "node:path";
|
|
3
|
+
import { tmpdir } from "node:os";
|
|
4
|
+
import { createSupervisor } from "../supervisor.js";
|
|
5
|
+
import { createTeeWriter } from "../tee-writer.js";
|
|
6
|
+
|
|
7
|
+
/**
 * Read one flag from the argument list, accepting both the `--key=value`
 * and the `--key value` spelling.
 *
 * The earliest argument that matches either spelling decides the result; a
 * trailing bare `--key` with nothing after it does not count as a match.
 *
 * @param {string[]} args - Raw command arguments
 * @param {string} name - Flag name without the leading `--`
 * @returns {string|undefined} The flag value, or undefined when not supplied
 */
function parseFlag(args, name) {
  const flag = `--${name}`;
  const prefix = `${flag}=`;

  const position = args.findIndex(
    (arg, i) =>
      arg.startsWith(prefix) || (arg === flag && i + 1 < args.length),
  );
  if (position === -1) return undefined;

  const matched = args[position];
  return matched.startsWith(prefix)
    ? matched.slice(prefix.length)
    : args[position + 1];
}
|
|
21
|
+
|
|
22
|
+
/**
 * Supervise command — run two agents in a relay loop via the Claude Agent SDK.
 *
 * Usage: fit-eval supervise [options]
 *
 * Options:
 *   --task=PATH             Path to task file (required)
 *   --supervisor-cwd=DIR    Supervisor working directory (default: .)
 *   --agent-cwd=DIR         Agent working directory (default: temp directory)
 *   --model=MODEL           Claude model to use (default: opus)
 *   --max-turns=N           Maximum supervisor ↔ agent exchanges (default: 20)
 *   --output=PATH           Write NDJSON trace to file (default: stdout)
 *   --allowed-tools=LIST    Comma-separated tools for the agent (default: Bash,Read,Glob,Grep,Write,Edit)
 *
 * Terminates the process: exit code 0 when the supervisor run reports
 * success, 1 otherwise.
 *
 * @param {string[]} args - Command arguments
 * @throws {Error} When --task is missing or --max-turns is not an integer
 */
export async function runSuperviseCommand(args) {
  const task = parseFlag(args, "task");
  if (!task) throw new Error("--task is required");

  const supervisorCwd = resolve(parseFlag(args, "supervisor-cwd") ?? ".");
  // A fresh temp directory is created only when --agent-cwd is not given.
  const agentCwd = resolve(
    parseFlag(args, "agent-cwd") ??
      mkdtempSync(join(tmpdir(), "fit-eval-agent-")),
  );
  const model = parseFlag(args, "model") ?? "opus";

  // Reject unparsable values up front instead of forwarding NaN downstream.
  const maxTurns = Number.parseInt(parseFlag(args, "max-turns") ?? "20", 10);
  if (Number.isNaN(maxTurns)) {
    throw new Error("--max-turns must be an integer");
  }

  const outputPath = parseFlag(args, "output");

  // Tolerate "Bash, Read" style lists: trim each name and drop empty entries.
  const allowedTools = (
    parseFlag(args, "allowed-tools") ?? "Bash,Read,Glob,Grep,Write,Edit"
  )
    .split(",")
    .map((tool) => tool.trim())
    .filter((tool) => tool !== "");

  const taskContent = readFileSync(task, "utf8");

  // When --output is specified, stream text to stdout while writing NDJSON to file.
  // Otherwise, write NDJSON directly to stdout (backwards-compatible).
  const fileStream = outputPath ? createWriteStream(outputPath) : null;
  const output = fileStream
    ? createTeeWriter({
        fileStream,
        textStream: process.stdout,
        mode: "supervised",
      })
    : process.stdout;

  const { query } = await import("@anthropic-ai/claude-agent-sdk");
  const supervisor = createSupervisor({
    supervisorCwd,
    agentCwd,
    query,
    output,
    model,
    maxTurns,
    allowedTools,
  });

  const result = await supervisor.run(taskContent);

  // Finish the tee first, then the underlying file, before exiting.
  if (fileStream) {
    await new Promise((r) => output.end(r));
    await new Promise((r) => fileStream.end(r));
  }

  process.exit(result.success ? 0 : 1);
}
|
package/src/commands/tee.js
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
import { createWriteStream } from "fs";
|
|
2
|
-
import { createTraceCollector } from "@forwardimpact/libeval";
|
|
2
|
+
import { PassThrough } from "node:stream";
|
|
3
|
+
import { pipeline } from "node:stream/promises";
|
|
4
|
+
import { createTeeWriter } from "../tee-writer.js";
|
|
3
5
|
|
|
4
6
|
/**
|
|
5
7
|
* Tee command — stream text output to stdout while optionally saving the raw
|
|
@@ -12,46 +14,18 @@ import { createTraceCollector } from "@forwardimpact/libeval";
|
|
|
12
14
|
export async function runTeeCommand(args) {
|
|
13
15
|
const outputPath = args.find((a) => !a.startsWith("-")) ?? null;
|
|
14
16
|
const fileStream = outputPath ? createWriteStream(outputPath) : null;
|
|
15
|
-
const collector = createTraceCollector();
|
|
16
|
-
const turnsEmitted = { count: 0 };
|
|
17
17
|
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
const line = buffer.slice(0, newlineIdx);
|
|
27
|
-
buffer = buffer.slice(newlineIdx + 1);
|
|
28
|
-
|
|
29
|
-
if (fileStream) {
|
|
30
|
-
fileStream.write(line + "\n");
|
|
31
|
-
}
|
|
32
|
-
|
|
33
|
-
collector.addLine(line);
|
|
34
|
-
flushNewTurns(collector, turnsEmitted);
|
|
35
|
-
}
|
|
36
|
-
}
|
|
18
|
+
// TeeWriter requires a fileStream; when no output file is specified,
|
|
19
|
+
// use a PassThrough as a no-op sink (NDJSON is not saved).
|
|
20
|
+
const sink = fileStream ?? new PassThrough();
|
|
21
|
+
const tee = createTeeWriter({
|
|
22
|
+
fileStream: sink,
|
|
23
|
+
textStream: process.stdout,
|
|
24
|
+
mode: "raw",
|
|
25
|
+
});
|
|
37
26
|
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
if (fileStream) {
|
|
41
|
-
fileStream.write(buffer + "\n");
|
|
42
|
-
}
|
|
43
|
-
collector.addLine(buffer);
|
|
44
|
-
flushNewTurns(collector, turnsEmitted);
|
|
45
|
-
}
|
|
46
|
-
|
|
47
|
-
// Emit the result summary at the end
|
|
48
|
-
if (collector.result) {
|
|
49
|
-
const text = collector.toText();
|
|
50
|
-
const lastNewline = text.lastIndexOf("\n---");
|
|
51
|
-
if (lastNewline !== -1) {
|
|
52
|
-
process.stdout.write(text.slice(lastNewline) + "\n");
|
|
53
|
-
}
|
|
54
|
-
}
|
|
27
|
+
try {
|
|
28
|
+
await pipeline(process.stdin, tee);
|
|
55
29
|
} finally {
|
|
56
30
|
if (fileStream) {
|
|
57
31
|
await new Promise((resolve, reject) => {
|
|
@@ -61,39 +35,3 @@ export async function runTeeCommand(args) {
|
|
|
61
35
|
}
|
|
62
36
|
}
|
|
63
37
|
}
|
|
64
|
-
|
|
65
|
-
/**
|
|
66
|
-
* Write text for any new turns that haven't been emitted yet.
|
|
67
|
-
* @param {import("@forwardimpact/libeval").TraceCollector} collector
|
|
68
|
-
* @param {{ count: number }} turnsEmitted
|
|
69
|
-
*/
|
|
70
|
-
function flushNewTurns(collector, turnsEmitted) {
|
|
71
|
-
const turns = collector.turns;
|
|
72
|
-
while (turnsEmitted.count < turns.length) {
|
|
73
|
-
const turn = turns[turnsEmitted.count];
|
|
74
|
-
turnsEmitted.count++;
|
|
75
|
-
|
|
76
|
-
if (turn.role === "assistant") {
|
|
77
|
-
for (const block of turn.content) {
|
|
78
|
-
if (block.type === "text") {
|
|
79
|
-
process.stdout.write(block.text + "\n");
|
|
80
|
-
} else if (block.type === "tool_use") {
|
|
81
|
-
const inputSummary = summarizeInput(block.input);
|
|
82
|
-
process.stdout.write(`> Tool: ${block.name} ${inputSummary}\n`);
|
|
83
|
-
}
|
|
84
|
-
}
|
|
85
|
-
}
|
|
86
|
-
}
|
|
87
|
-
}
|
|
88
|
-
|
|
89
|
-
/**
|
|
90
|
-
* Summarize tool input for text display, truncated to keep logs readable.
|
|
91
|
-
* @param {object} input - Tool input object
|
|
92
|
-
* @returns {string} Truncated summary
|
|
93
|
-
*/
|
|
94
|
-
function summarizeInput(input) {
|
|
95
|
-
if (!input || typeof input !== "object") return "";
|
|
96
|
-
const json = JSON.stringify(input);
|
|
97
|
-
if (json.length <= 200) return json;
|
|
98
|
-
return json.slice(0, 197) + "...";
|
|
99
|
-
}
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Supervisor — orchestrates a relay loop between an agent and a supervisor,
|
|
3
|
+
* both running as AgentRunner instances. The agent works on a task while the
|
|
4
|
+
* supervisor observes and decides when the evaluation is complete.
|
|
5
|
+
*
|
|
6
|
+
* Follows OO+DI: constructor injection, factory function, tests bypass factory.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import { PassThrough } from "node:stream";
|
|
10
|
+
import { createAgentRunner } from "./agent-runner.js";
|
|
11
|
+
|
|
12
|
+
/**
 * Check if the supervisor's response signals evaluation completion.
 * Uses a structured signal — `EVALUATION_COMPLETE` on its own line —
 * to avoid false positives from natural language.
 * @param {string} text
 * @returns {boolean}
 */
export function isDone(text) {
  return /^EVALUATION_COMPLETE$/m.test(text);
}

export class Supervisor {
  /**
   * @param {object} deps
   * @param {import("./agent-runner.js").AgentRunner} deps.agentRunner - Runs the agent sessions
   * @param {import("./agent-runner.js").AgentRunner} deps.supervisorRunner - Runs the supervisor sessions
   * @param {import("stream").Writable} deps.output - Stream to emit tagged NDJSON to
   * @param {number} [deps.maxTurns] - Maximum supervisor ↔ agent exchanges (default: 20)
   * @throws {Error} When agentRunner, supervisorRunner or output is missing
   */
  constructor({ agentRunner, supervisorRunner, output, maxTurns }) {
    if (!agentRunner) throw new Error("agentRunner is required");
    if (!supervisorRunner) throw new Error("supervisorRunner is required");
    if (!output) throw new Error("output is required");
    this.agentRunner = agentRunner;
    this.supervisorRunner = supervisorRunner;
    this.output = output;
    this.maxTurns = maxTurns ?? 20;
    /** @type {"agent"|"supervisor"} */
    this.currentSource = "agent";
    /** @type {number} */
    this.currentTurn = 0;
  }

  /**
   * Run the supervisor ↔ agent relay loop.
   *
   * The agent starts on the task; each turn the supervisor reads the agent's
   * last result text and either signals completion or replies, and its reply
   * becomes the agent's next prompt. The loop ends on completion, on a fatal
   * runner error, or after maxTurns exchanges. A summary line is emitted in
   * every case.
   *
   * @param {string} task - The initial task for the agent
   * @returns {Promise<{success: boolean, turns: number}>}
   */
  async run(task) {
    // Turn 0: Agent receives the task and starts working
    this.#enter("agent", 0);
    let agentResult = await this.agentRunner.run(task);
    if (this.#isFatal(agentResult)) return this.#finish(false, 0);

    for (let turn = 1; turn <= this.maxTurns; turn++) {
      // Supervisor observes the agent's output
      const supervisorPrompt =
        `The agent reported:\n\n${agentResult.text}\n\n` +
        `Decide: provide guidance, answer a question, or say EVALUATION_COMPLETE on its own line.`;

      this.#enter("supervisor", turn);
      // The first exchange opens the supervisor session; later ones resume it.
      const supervisorResult =
        turn === 1
          ? await this.supervisorRunner.run(supervisorPrompt)
          : await this.supervisorRunner.resume(supervisorPrompt);
      if (this.#isFatal(supervisorResult)) return this.#finish(false, turn);

      if (isDone(supervisorResult.text)) return this.#finish(true, turn);

      // Supervisor's response becomes the agent's next input
      this.#enter("agent", turn);
      agentResult = await this.agentRunner.resume(supervisorResult.text);
      if (this.#isFatal(agentResult)) return this.#finish(false, turn);
    }

    // Turn budget exhausted without the supervisor signalling completion.
    return this.#finish(false, this.maxTurns);
  }

  /**
   * Emit a single NDJSON line tagged with the current source and turn.
   * Called in real-time via the AgentRunner onLine callback.
   * @param {string} line - Raw NDJSON line from the runner
   */
  emitLine(line) {
    const event = JSON.parse(line);
    const tagged = {
      source: this.currentSource,
      turn: this.currentTurn,
      event,
    };
    this.output.write(`${JSON.stringify(tagged)}\n`);
  }

  /**
   * Emit a final orchestrator summary line.
   * @param {{success: boolean, turns: number}} result
   */
  emitSummary(result) {
    const summary = {
      source: "orchestrator",
      type: "summary",
      success: result.success,
      turns: result.turns,
    };
    this.output.write(`${JSON.stringify(summary)}\n`);
  }

  /**
   * Record which runner is active so emitLine tags its lines correctly.
   * @param {"agent"|"supervisor"} source
   * @param {number} turn
   */
  #enter(source, turn) {
    this.currentSource = source;
    this.currentTurn = turn;
  }

  /**
   * Decide whether a runner result must abort the loop. An error is fatal
   * only when the runner did not also report success: a runner can return a
   * successful result together with an error raised afterwards, and that
   * result is still honoured.
   * @param {{success?: boolean, error?: unknown}} result
   * @returns {boolean}
   */
  #isFatal(result) {
    return Boolean(result.error) && !result.success;
  }

  /**
   * Emit the summary line and build the value returned from run().
   * @param {boolean} success
   * @param {number} turns
   * @returns {{success: boolean, turns: number}}
   */
  #finish(success, turns) {
    this.emitSummary({ success, turns });
    return { success, turns };
  }
}
|
|
130
|
+
|
|
131
|
+
/**
 * Factory function — builds the agent and supervisor AgentRunners and the
 * Supervisor that relays between them.
 *
 * Both runners write to their own throwaway PassThrough stream; what reaches
 * `output` is each line forwarded through their onLine callback to
 * Supervisor.emitLine. The agent runner gets the caller's tool list and 50
 * SDK turns; the supervisor runner is limited to Read/Glob/Grep and 10 SDK
 * turns.
 *
 * @param {object} deps
 * @param {string} deps.supervisorCwd - Supervisor working directory
 * @param {string} deps.agentCwd - Agent working directory
 * @param {function} deps.query - SDK query function
 * @param {import("stream").Writable} deps.output - Final output stream
 * @param {string} [deps.model] - Claude model identifier
 * @param {number} [deps.maxTurns] - Maximum supervisor ↔ agent exchanges
 * @param {string[]} [deps.allowedTools] - Tools the agent may use
 * @returns {Supervisor}
 */
export function createSupervisor({
  supervisorCwd,
  agentCwd,
  query,
  output,
  model,
  maxTurns,
  allowedTools,
}) {
  // The runners need a callback into the Supervisor, which cannot exist until
  // the runners do. The holder is filled in below; the callback only fires
  // while a runner is streaming, i.e. after this factory has returned.
  const holder = { supervisor: undefined };
  const forwardLine = (line) => holder.supervisor.emitLine(line);

  const supervisorRunner = createAgentRunner({
    cwd: supervisorCwd,
    query,
    output: new PassThrough(),
    model,
    maxTurns: 10,
    allowedTools: ["Read", "Glob", "Grep"],
    onLine: forwardLine,
    settingSources: ["project"],
  });

  const agentRunner = createAgentRunner({
    cwd: agentCwd,
    query,
    output: new PassThrough(),
    model,
    maxTurns: 50,
    allowedTools,
    onLine: forwardLine,
    settingSources: ["project"],
  });

  holder.supervisor = new Supervisor({
    agentRunner,
    supervisorRunner,
    output,
    maxTurns,
  });
  return holder.supervisor;
}
|