@forwardimpact/libeval 0.1.6 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/fit-eval.js +2 -2
- package/index.js +2 -0
- package/package.json +1 -1
- package/src/agent-runner.js +97 -39
- package/src/commands/run.js +43 -18
- package/src/commands/supervise.js +59 -37
- package/src/supervisor.js +298 -59
- package/test/mock-runner.js +101 -0
- package/test/supervisor-intervention.test.js +359 -0
- package/test/{supervisor.test.js → supervisor-output.test.js} +120 -306
- package/test/supervisor-run.test.js +310 -0
package/bin/fit-eval.js
CHANGED
|
@@ -29,7 +29,7 @@ Run options:
|
|
|
29
29
|
--task-text=STRING Inline task text (mutually exclusive with --task-file)
|
|
30
30
|
--cwd=DIR Agent working directory (default: .)
|
|
31
31
|
--model=MODEL Claude model to use (default: opus)
|
|
32
|
-
--max-turns=N Maximum agentic turns (default: 50)
|
|
32
|
+
--max-turns=N Maximum agentic turns (default: 50, 0 = unlimited)
|
|
33
33
|
--output=PATH Write NDJSON trace to file (default: stdout)
|
|
34
34
|
--allowed-tools=LIST Comma-separated tools (default: Bash,Read,Glob,Grep,Write,Edit)
|
|
35
35
|
--agent-profile=NAME Agent profile name (passed as --agent to Claude CLI)
|
|
@@ -40,7 +40,7 @@ Supervise options:
|
|
|
40
40
|
--supervisor-cwd=DIR Supervisor working directory (default: .)
|
|
41
41
|
--agent-cwd=DIR Agent working directory (default: temp directory)
|
|
42
42
|
--model=MODEL Claude model to use (default: opus)
|
|
43
|
-
--max-turns=N Maximum supervisor ↔ agent exchanges (default: 20)
|
|
43
|
+
--max-turns=N Maximum supervisor ↔ agent exchanges (default: 20, 0 = unlimited)
|
|
44
44
|
--output=PATH Write NDJSON trace to file (default: stdout)
|
|
45
45
|
--allowed-tools=LIST Comma-separated tools for agent (default: Bash,Read,Glob,Grep,Write,Edit)
|
|
46
46
|
--supervisor-allowed-tools=LIST
|
package/index.js
CHANGED
package/package.json
CHANGED
package/src/agent-runner.js
CHANGED
|
@@ -17,6 +17,7 @@ export class AgentRunner {
|
|
|
17
17
|
* @param {string[]} [deps.allowedTools] - Tools the agent may use
|
|
18
18
|
* @param {string} [deps.permissionMode] - SDK permission mode
|
|
19
19
|
* @param {function} [deps.onLine] - Callback invoked with each NDJSON line as it's produced
|
|
20
|
+
* @param {function} [deps.onBatch] - Async callback invoked with a batch of NDJSON lines at flush boundaries (assistant text blocks and result messages). Receives `(lines, { abort })` where calling `abort()` stops the in-flight SDK session via the AbortController. Optional; assignable at runtime so the Supervisor can swap it per turn.
|
|
20
21
|
* @param {string[]} [deps.settingSources] - SDK setting sources (e.g. ['project'] to load CLAUDE.md)
|
|
21
22
|
* @param {string} [deps.agentProfile] - Agent profile name to pass as --agent to the Claude CLI
|
|
22
23
|
* @param {string|object} [deps.systemPrompt] - SDK system prompt (string replaces default; {type:'preset', preset:'claude_code', append} appends)
|
|
@@ -31,6 +32,7 @@ export class AgentRunner {
|
|
|
31
32
|
allowedTools,
|
|
32
33
|
permissionMode,
|
|
33
34
|
onLine,
|
|
35
|
+
onBatch,
|
|
34
36
|
settingSources,
|
|
35
37
|
agentProfile,
|
|
36
38
|
systemPrompt,
|
|
@@ -43,7 +45,7 @@ export class AgentRunner {
|
|
|
43
45
|
this.query = query;
|
|
44
46
|
this.output = output;
|
|
45
47
|
this.model = model ?? "opus";
|
|
46
|
-
this.maxTurns = maxTurns ?? 50;
|
|
48
|
+
this.maxTurns = maxTurns ?? 50; // 0 means unlimited (omit from SDK)
|
|
47
49
|
this.allowedTools = allowedTools ?? [
|
|
48
50
|
"Bash",
|
|
49
51
|
"Read",
|
|
@@ -54,101 +56,140 @@ export class AgentRunner {
|
|
|
54
56
|
];
|
|
55
57
|
this.permissionMode = permissionMode ?? "bypassPermissions";
|
|
56
58
|
this.onLine = onLine ?? null;
|
|
59
|
+
this.onBatch = onBatch ?? null;
|
|
57
60
|
this.settingSources = settingSources ?? [];
|
|
58
61
|
this.agentProfile = agentProfile ?? null;
|
|
59
62
|
this.systemPrompt = systemPrompt ?? null;
|
|
60
63
|
this.disallowedTools = disallowedTools ?? [];
|
|
61
64
|
this.sessionId = null;
|
|
62
65
|
this.buffer = [];
|
|
66
|
+
/** @type {AbortController|null} */
|
|
67
|
+
this.currentAbortController = null;
|
|
63
68
|
}
|
|
64
69
|
|
|
65
70
|
/**
|
|
66
71
|
* Run a new agent session with the given task.
|
|
67
72
|
* @param {string} task - The task prompt
|
|
68
|
-
* @returns {Promise<{success: boolean, text: string, sessionId: string|null}>}
|
|
73
|
+
* @returns {Promise<{success: boolean, text: string, sessionId: string|null, error: Error|null, aborted: boolean}>}
|
|
69
74
|
*/
|
|
70
75
|
async run(task) {
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
let error = null;
|
|
74
|
-
|
|
76
|
+
const abortController = new AbortController();
|
|
77
|
+
this.currentAbortController = abortController;
|
|
75
78
|
try {
|
|
76
|
-
|
|
79
|
+
const iterator = this.query({
|
|
77
80
|
prompt: task,
|
|
78
81
|
options: {
|
|
79
82
|
cwd: this.cwd,
|
|
80
83
|
allowedTools: this.allowedTools,
|
|
81
|
-
maxTurns: this.maxTurns,
|
|
84
|
+
...(this.maxTurns > 0 && { maxTurns: this.maxTurns }),
|
|
82
85
|
model: this.model,
|
|
83
86
|
permissionMode: this.permissionMode,
|
|
84
87
|
allowDangerouslySkipPermissions: true,
|
|
85
88
|
settingSources: this.settingSources,
|
|
89
|
+
abortController,
|
|
86
90
|
...(this.disallowedTools.length > 0 && {
|
|
87
91
|
disallowedTools: this.disallowedTools,
|
|
88
92
|
}),
|
|
89
93
|
...(this.systemPrompt && { systemPrompt: this.systemPrompt }),
|
|
90
94
|
...(this.agentProfile && { extraArgs: { agent: this.agentProfile } }),
|
|
91
95
|
},
|
|
92
|
-
})
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
if (this.onLine) this.onLine(line);
|
|
97
|
-
|
|
98
|
-
if (message.type === "system" && message.subtype === "init") {
|
|
99
|
-
this.sessionId = message.session_id;
|
|
100
|
-
}
|
|
101
|
-
if (message.type === "result") {
|
|
102
|
-
text = message.result ?? "";
|
|
103
|
-
stopReason = message.subtype;
|
|
104
|
-
}
|
|
105
|
-
}
|
|
106
|
-
} catch (err) {
|
|
107
|
-
error = err;
|
|
96
|
+
});
|
|
97
|
+
return await this.#consumeQuery(iterator);
|
|
98
|
+
} finally {
|
|
99
|
+
this.currentAbortController = null;
|
|
108
100
|
}
|
|
109
|
-
|
|
110
|
-
// If the SDK already emitted a successful result, honour it even when the
|
|
111
|
-
// stream throws afterwards (e.g. "Credit balance is too low" during
|
|
112
|
-
// cleanup). Only treat errors as fatal when no result was received yet.
|
|
113
|
-
const success = stopReason === "success";
|
|
114
|
-
return { success, text, sessionId: this.sessionId, error };
|
|
115
101
|
}
|
|
116
102
|
|
|
117
103
|
/**
|
|
118
104
|
* Resume an existing session with a follow-up prompt.
|
|
119
105
|
* @param {string} prompt - The follow-up prompt
|
|
120
|
-
* @returns {Promise<{success: boolean, text: string}>}
|
|
106
|
+
* @returns {Promise<{success: boolean, text: string, sessionId: string|null, error: Error|null, aborted: boolean}>}
|
|
121
107
|
*/
|
|
122
108
|
async resume(prompt) {
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
let error = null;
|
|
126
|
-
|
|
109
|
+
const abortController = new AbortController();
|
|
110
|
+
this.currentAbortController = abortController;
|
|
127
111
|
try {
|
|
128
|
-
|
|
112
|
+
const iterator = this.query({
|
|
129
113
|
prompt,
|
|
130
114
|
options: {
|
|
131
115
|
resume: this.sessionId,
|
|
132
116
|
permissionMode: this.permissionMode,
|
|
133
117
|
allowDangerouslySkipPermissions: true,
|
|
118
|
+
abortController,
|
|
134
119
|
},
|
|
135
|
-
})
|
|
120
|
+
});
|
|
121
|
+
return await this.#consumeQuery(iterator);
|
|
122
|
+
} finally {
|
|
123
|
+
this.currentAbortController = null;
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
/**
|
|
128
|
+
* Shared consumer for both `run()` and `resume()`. Iterates the SDK query
|
|
129
|
+
* iterator, mirroring every line to the output stream / buffer / onLine
|
|
130
|
+
* callback, and — when `onBatch` is set — flushes accumulated lines to it
|
|
131
|
+
* at natural boundaries (assistant messages with text blocks, and the
|
|
132
|
+
* terminal `result` message).
|
|
133
|
+
*
|
|
134
|
+
* INVARIANT: the `await this.onBatch(...)` call below is the ONLY
|
|
135
|
+
* suspension point in this loop. While it is pending, no further lines
|
|
136
|
+
* are pulled from the SDK generator. The Supervisor relies on this — its
|
|
137
|
+
* onBatch callback flips `currentSource` to "supervisor" for the duration
|
|
138
|
+
* of its mid-turn LLM call, and the invariant guarantees no agent line
|
|
139
|
+
* can arrive concurrently and be mis-tagged.
|
|
140
|
+
*
|
|
141
|
+
* If the supervisor calls `abort()` from inside the callback, the next
|
|
142
|
+
* iteration of the for-await loop will throw. We catch the throw, check
|
|
143
|
+
* `currentAbortController.signal.aborted` (avoiding fragility around
|
|
144
|
+
* AbortError vs DOMException shapes), and report `aborted: true` so the
|
|
145
|
+
* caller can distinguish "supervisor asked us to stop" from a real error.
|
|
146
|
+
* @param {AsyncIterable<object>} iterator
|
|
147
|
+
* @returns {Promise<{success: boolean, text: string, sessionId: string|null, error: Error|null, aborted: boolean}>}
|
|
148
|
+
*/
|
|
149
|
+
async #consumeQuery(iterator) {
|
|
150
|
+
let text = "";
|
|
151
|
+
let stopReason = null;
|
|
152
|
+
let error = null;
|
|
153
|
+
let aborted = false;
|
|
154
|
+
const pendingBatch = [];
|
|
155
|
+
|
|
156
|
+
try {
|
|
157
|
+
for await (const message of iterator) {
|
|
136
158
|
const line = JSON.stringify(message);
|
|
137
159
|
this.output.write(line + "\n");
|
|
138
160
|
this.buffer.push(line);
|
|
139
161
|
if (this.onLine) this.onLine(line);
|
|
162
|
+
if (this.onBatch) pendingBatch.push(line);
|
|
140
163
|
|
|
164
|
+
if (message.type === "system" && message.subtype === "init") {
|
|
165
|
+
this.sessionId = message.session_id;
|
|
166
|
+
}
|
|
141
167
|
if (message.type === "result") {
|
|
142
168
|
text = message.result ?? "";
|
|
143
169
|
stopReason = message.subtype;
|
|
144
170
|
}
|
|
171
|
+
|
|
172
|
+
const shouldFlush =
|
|
173
|
+
this.onBatch &&
|
|
174
|
+
(message.type === "result" ||
|
|
175
|
+
(message.type === "assistant" && hasTextBlock(message)));
|
|
176
|
+
if (shouldFlush) {
|
|
177
|
+
const batchLines = pendingBatch.splice(0, pendingBatch.length);
|
|
178
|
+
await this.onBatch(batchLines, {
|
|
179
|
+
abort: () => this.currentAbortController?.abort(),
|
|
180
|
+
});
|
|
181
|
+
}
|
|
145
182
|
}
|
|
146
183
|
} catch (err) {
|
|
147
|
-
|
|
184
|
+
if (this.currentAbortController?.signal.aborted) {
|
|
185
|
+
aborted = true;
|
|
186
|
+
} else {
|
|
187
|
+
error = err;
|
|
188
|
+
}
|
|
148
189
|
}
|
|
149
190
|
|
|
150
191
|
const success = stopReason === "success";
|
|
151
|
-
return { success, text, error };
|
|
192
|
+
return { success, text, sessionId: this.sessionId, error, aborted };
|
|
152
193
|
}
|
|
153
194
|
|
|
154
195
|
/**
|
|
@@ -162,6 +203,23 @@ export class AgentRunner {
|
|
|
162
203
|
}
|
|
163
204
|
}
|
|
164
205
|
|
|
206
|
+
/**
|
|
207
|
+
* Whether an SDK assistant message contains at least one text block.
|
|
208
|
+
* Tool-only assistant messages return false so they accumulate into the
|
|
209
|
+
* pending batch and flush with the next text block (or with the terminal
|
|
210
|
+
* `result` message), keeping supervisor LLM cost bounded.
|
|
211
|
+
* @param {object} message
|
|
212
|
+
* @returns {boolean}
|
|
213
|
+
*/
|
|
214
|
+
function hasTextBlock(message) {
|
|
215
|
+
const content = message.message?.content ?? message.content;
|
|
216
|
+
if (!Array.isArray(content)) return false;
|
|
217
|
+
for (const block of content) {
|
|
218
|
+
if (block.type === "text" && block.text) return true;
|
|
219
|
+
}
|
|
220
|
+
return false;
|
|
221
|
+
}
|
|
222
|
+
|
|
165
223
|
/**
|
|
166
224
|
* Factory function — wires real dependencies.
|
|
167
225
|
* @param {object} deps - Same as AgentRunner constructor
|
package/src/commands/run.js
CHANGED
|
@@ -18,6 +18,38 @@ function parseFlag(args, name) {
|
|
|
18
18
|
return undefined;
|
|
19
19
|
}
|
|
20
20
|
|
|
21
|
+
/**
|
|
22
|
+
* Parse and validate run command options from args.
|
|
23
|
+
* @param {string[]} args
|
|
24
|
+
* @returns {{ taskContent: string, cwd: string, model: string, maxTurns: number, outputPath: string|undefined, agentProfile: string|undefined, allowedTools: string[] }}
|
|
25
|
+
*/
|
|
26
|
+
function parseRunOptions(args) {
|
|
27
|
+
const taskFile = parseFlag(args, "task-file");
|
|
28
|
+
const taskText = parseFlag(args, "task-text");
|
|
29
|
+
if (taskFile && taskText)
|
|
30
|
+
throw new Error("--task-file and --task-text are mutually exclusive");
|
|
31
|
+
if (!taskFile && !taskText)
|
|
32
|
+
throw new Error("--task-file or --task-text is required");
|
|
33
|
+
|
|
34
|
+
const maxTurnsRaw = parseFlag(args, "max-turns") ?? "50";
|
|
35
|
+
const taskAmend = parseFlag(args, "task-amend") ?? undefined;
|
|
36
|
+
let taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
|
|
37
|
+
if (taskAmend) taskContent += `\n\n${taskAmend}`;
|
|
38
|
+
|
|
39
|
+
return {
|
|
40
|
+
taskContent,
|
|
41
|
+
cwd: resolve(parseFlag(args, "cwd") ?? "."),
|
|
42
|
+
model: parseFlag(args, "model") ?? "opus",
|
|
43
|
+
maxTurns: maxTurnsRaw === "0" ? 0 : parseInt(maxTurnsRaw, 10),
|
|
44
|
+
outputPath: parseFlag(args, "output"),
|
|
45
|
+
agentProfile: parseFlag(args, "agent-profile") ?? undefined,
|
|
46
|
+
allowedTools: (
|
|
47
|
+
parseFlag(args, "allowed-tools") ??
|
|
48
|
+
"Bash,Read,Glob,Grep,Write,Edit,Agent,TodoWrite"
|
|
49
|
+
).split(","),
|
|
50
|
+
};
|
|
51
|
+
}
|
|
52
|
+
|
|
21
53
|
/**
|
|
22
54
|
* Run command — execute a single agent via the Claude Agent SDK.
|
|
23
55
|
*
|
|
@@ -28,31 +60,24 @@ function parseFlag(args, name) {
|
|
|
28
60
|
* --task-text=STRING Inline task text (mutually exclusive with --task-file)
|
|
29
61
|
* --cwd=DIR Agent working directory (default: .)
|
|
30
62
|
* --model=MODEL Claude model to use (default: opus)
|
|
31
|
-
* --max-turns=N Maximum agentic turns (default: 50)
|
|
63
|
+
* --max-turns=N Maximum agentic turns (default: 50, 0 = unlimited)
|
|
32
64
|
* --output=PATH Write NDJSON trace to file (default: stdout)
|
|
33
65
|
* --allowed-tools=LIST Comma-separated tools (default: Bash,Read,Glob,Grep,Write,Edit)
|
|
34
66
|
* --agent-profile=NAME Agent profile name (passed as --agent to Claude CLI)
|
|
67
|
+
* --task-amend=TEXT Additional text appended to the task prompt
|
|
35
68
|
*
|
|
36
69
|
* @param {string[]} args - Command arguments
|
|
37
70
|
*/
|
|
38
71
|
export async function runRunCommand(args) {
|
|
39
|
-
const
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
const maxTurns = parseInt(parseFlag(args, "max-turns") ?? "50", 10);
|
|
49
|
-
const outputPath = parseFlag(args, "output");
|
|
50
|
-
const agentProfile = parseFlag(args, "agent-profile") ?? undefined;
|
|
51
|
-
const allowedTools = (
|
|
52
|
-
parseFlag(args, "allowed-tools") ?? "Bash,Read,Glob,Grep,Write,Edit"
|
|
53
|
-
).split(",");
|
|
54
|
-
|
|
55
|
-
const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
|
|
72
|
+
const {
|
|
73
|
+
taskContent,
|
|
74
|
+
cwd,
|
|
75
|
+
model,
|
|
76
|
+
maxTurns,
|
|
77
|
+
outputPath,
|
|
78
|
+
agentProfile,
|
|
79
|
+
allowedTools,
|
|
80
|
+
} = parseRunOptions(args);
|
|
56
81
|
|
|
57
82
|
// When --output is specified, stream text to stdout while writing NDJSON to file.
|
|
58
83
|
// Otherwise, write NDJSON directly to stdout (backwards-compatible).
|
package/src/commands/supervise.js
CHANGED
|
@@ -19,6 +19,50 @@ function parseFlag(args, name) {
|
|
|
19
19
|
return undefined;
|
|
20
20
|
}
|
|
21
21
|
|
|
22
|
+
/**
|
|
23
|
+
* Parse all supervise flags from args into an options object.
|
|
24
|
+
* @param {string[]} args
|
|
25
|
+
* @returns {object}
|
|
26
|
+
*/
|
|
27
|
+
function parseSuperviseOptions(args) {
|
|
28
|
+
const taskFile = parseFlag(args, "task-file");
|
|
29
|
+
const taskText = parseFlag(args, "task-text");
|
|
30
|
+
if (taskFile && taskText)
|
|
31
|
+
throw new Error("--task-file and --task-text are mutually exclusive");
|
|
32
|
+
if (!taskFile && !taskText)
|
|
33
|
+
throw new Error("--task-file or --task-text is required");
|
|
34
|
+
|
|
35
|
+
const supervisorAllowedToolsRaw = parseFlag(args, "supervisor-allowed-tools");
|
|
36
|
+
|
|
37
|
+
const taskAmend = parseFlag(args, "task-amend") ?? undefined;
|
|
38
|
+
let taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
|
|
39
|
+
if (taskAmend) taskContent += `\n\n${taskAmend}`;
|
|
40
|
+
|
|
41
|
+
return {
|
|
42
|
+
taskContent,
|
|
43
|
+
supervisorCwd: resolve(parseFlag(args, "supervisor-cwd") ?? "."),
|
|
44
|
+
agentCwd: resolve(
|
|
45
|
+
parseFlag(args, "agent-cwd") ??
|
|
46
|
+
mkdtempSync(join(tmpdir(), "fit-eval-agent-")),
|
|
47
|
+
),
|
|
48
|
+
model: parseFlag(args, "model") ?? "opus",
|
|
49
|
+
maxTurns: (() => {
|
|
50
|
+
const raw = parseFlag(args, "max-turns") ?? "20";
|
|
51
|
+
return raw === "0" ? 0 : parseInt(raw, 10);
|
|
52
|
+
})(),
|
|
53
|
+
outputPath: parseFlag(args, "output"),
|
|
54
|
+
supervisorProfile: parseFlag(args, "supervisor-profile") ?? undefined,
|
|
55
|
+
agentProfile: parseFlag(args, "agent-profile") ?? undefined,
|
|
56
|
+
allowedTools: (
|
|
57
|
+
parseFlag(args, "allowed-tools") ??
|
|
58
|
+
"Bash,Read,Glob,Grep,Write,Edit,Agent,TodoWrite"
|
|
59
|
+
).split(","),
|
|
60
|
+
supervisorAllowedTools: supervisorAllowedToolsRaw
|
|
61
|
+
? supervisorAllowedToolsRaw.split(",")
|
|
62
|
+
: undefined,
|
|
63
|
+
};
|
|
64
|
+
}
|
|
65
|
+
|
|
22
66
|
/**
|
|
23
67
|
* Supervise command — run two agents in a relay loop via the Claude Agent SDK.
|
|
24
68
|
*
|
|
@@ -30,45 +74,23 @@ function parseFlag(args, name) {
|
|
|
30
74
|
* --supervisor-cwd=DIR Supervisor working directory (default: .)
|
|
31
75
|
* --agent-cwd=DIR Agent working directory (default: temp directory)
|
|
32
76
|
* --model=MODEL Claude model to use (default: opus)
|
|
33
|
-
* --max-turns=N Maximum supervisor ↔ agent exchanges (default: 20)
|
|
77
|
+
* --max-turns=N Maximum supervisor / agent exchanges (default: 20, 0 = unlimited)
|
|
34
78
|
* --output=PATH Write NDJSON trace to file (default: stdout)
|
|
35
79
|
* --allowed-tools=LIST Comma-separated tools for the agent (default: Bash,Read,Glob,Grep,Write,Edit)
|
|
36
80
|
* --supervisor-profile=NAME Supervisor agent profile name (passed as --agent to Claude CLI)
|
|
37
81
|
* --agent-profile=NAME Agent profile name (passed as --agent to Claude CLI)
|
|
82
|
+
* --task-amend=TEXT Additional text appended to the task prompt
|
|
38
83
|
*
|
|
39
84
|
* @param {string[]} args - Command arguments
|
|
40
85
|
*/
|
|
41
86
|
export async function runSuperviseCommand(args) {
|
|
42
|
-
const
|
|
43
|
-
const taskText = parseFlag(args, "task-text");
|
|
44
|
-
if (taskFile && taskText)
|
|
45
|
-
throw new Error("--task-file and --task-text are mutually exclusive");
|
|
46
|
-
if (!taskFile && !taskText)
|
|
47
|
-
throw new Error("--task-file or --task-text is required");
|
|
48
|
-
|
|
49
|
-
const supervisorCwd = resolve(parseFlag(args, "supervisor-cwd") ?? ".");
|
|
50
|
-
const agentCwd = resolve(
|
|
51
|
-
parseFlag(args, "agent-cwd") ??
|
|
52
|
-
mkdtempSync(join(tmpdir(), "fit-eval-agent-")),
|
|
53
|
-
);
|
|
54
|
-
const model = parseFlag(args, "model") ?? "opus";
|
|
55
|
-
const maxTurns = parseInt(parseFlag(args, "max-turns") ?? "20", 10);
|
|
56
|
-
const outputPath = parseFlag(args, "output");
|
|
57
|
-
const supervisorProfile = parseFlag(args, "supervisor-profile") ?? undefined;
|
|
58
|
-
const agentProfile = parseFlag(args, "agent-profile") ?? undefined;
|
|
59
|
-
const allowedTools = (
|
|
60
|
-
parseFlag(args, "allowed-tools") ?? "Bash,Read,Glob,Grep,Write,Edit"
|
|
61
|
-
).split(",");
|
|
62
|
-
const supervisorAllowedToolsRaw = parseFlag(args, "supervisor-allowed-tools");
|
|
63
|
-
const supervisorAllowedTools = supervisorAllowedToolsRaw
|
|
64
|
-
? supervisorAllowedToolsRaw.split(",")
|
|
65
|
-
: undefined;
|
|
66
|
-
|
|
67
|
-
const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
|
|
87
|
+
const opts = parseSuperviseOptions(args);
|
|
68
88
|
|
|
69
89
|
// When --output is specified, stream text to stdout while writing NDJSON to file.
|
|
70
90
|
// Otherwise, write NDJSON directly to stdout (backwards-compatible).
|
|
71
|
-
const fileStream = outputPath
|
|
91
|
+
const fileStream = opts.outputPath
|
|
92
|
+
? createWriteStream(opts.outputPath)
|
|
93
|
+
: null;
|
|
72
94
|
const output = fileStream
|
|
73
95
|
? createTeeWriter({
|
|
74
96
|
fileStream,
|
|
@@ -79,19 +101,19 @@ export async function runSuperviseCommand(args) {
|
|
|
79
101
|
|
|
80
102
|
const { query } = await import("@anthropic-ai/claude-agent-sdk");
|
|
81
103
|
const supervisor = createSupervisor({
|
|
82
|
-
supervisorCwd,
|
|
83
|
-
agentCwd,
|
|
104
|
+
supervisorCwd: opts.supervisorCwd,
|
|
105
|
+
agentCwd: opts.agentCwd,
|
|
84
106
|
query,
|
|
85
107
|
output,
|
|
86
|
-
model,
|
|
87
|
-
maxTurns,
|
|
88
|
-
allowedTools,
|
|
89
|
-
supervisorAllowedTools,
|
|
90
|
-
supervisorProfile,
|
|
91
|
-
agentProfile,
|
|
108
|
+
model: opts.model,
|
|
109
|
+
maxTurns: opts.maxTurns,
|
|
110
|
+
allowedTools: opts.allowedTools,
|
|
111
|
+
supervisorAllowedTools: opts.supervisorAllowedTools,
|
|
112
|
+
supervisorProfile: opts.supervisorProfile,
|
|
113
|
+
agentProfile: opts.agentProfile,
|
|
92
114
|
});
|
|
93
115
|
|
|
94
|
-
const result = await supervisor.run(taskContent);
|
|
116
|
+
const result = await supervisor.run(opts.taskContent);
|
|
95
117
|
|
|
96
118
|
if (fileStream) {
|
|
97
119
|
await new Promise((r) => output.end(r));
|