@forwardimpact/libeval 0.1.6 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/fit-eval.js +2 -2
- package/index.js +2 -0
- package/package.json +1 -1
- package/src/agent-runner.js +178 -43
- package/src/commands/run.js +43 -18
- package/src/commands/supervise.js +59 -37
- package/src/supervisor.js +298 -59
- package/test/agent-runner-batching.test.js +271 -0
- package/test/mock-runner.js +113 -0
- package/test/supervisor-batching.test.js +175 -0
- package/test/supervisor-intervention.test.js +365 -0
- package/test/{supervisor.test.js → supervisor-output.test.js} +121 -306
- package/test/supervisor-run.test.js +310 -0
package/bin/fit-eval.js
CHANGED
|
@@ -29,7 +29,7 @@ Run options:
|
|
|
29
29
|
--task-text=STRING Inline task text (mutually exclusive with --task-file)
|
|
30
30
|
--cwd=DIR Agent working directory (default: .)
|
|
31
31
|
--model=MODEL Claude model to use (default: opus)
|
|
32
|
-
--max-turns=N Maximum agentic turns (default: 50)
|
|
32
|
+
--max-turns=N Maximum agentic turns (default: 50, 0 = unlimited)
|
|
33
33
|
--output=PATH Write NDJSON trace to file (default: stdout)
|
|
34
34
|
--allowed-tools=LIST Comma-separated tools (default: Bash,Read,Glob,Grep,Write,Edit)
|
|
35
35
|
--agent-profile=NAME Agent profile name (passed as --agent to Claude CLI)
|
|
@@ -40,7 +40,7 @@ Supervise options:
|
|
|
40
40
|
--supervisor-cwd=DIR Supervisor working directory (default: .)
|
|
41
41
|
--agent-cwd=DIR Agent working directory (default: temp directory)
|
|
42
42
|
--model=MODEL Claude model to use (default: opus)
|
|
43
|
-
--max-turns=N Maximum supervisor ↔ agent exchanges (default: 20)
|
|
43
|
+
--max-turns=N Maximum supervisor ↔ agent exchanges (default: 20, 0 = unlimited)
|
|
44
44
|
--output=PATH Write NDJSON trace to file (default: stdout)
|
|
45
45
|
--allowed-tools=LIST Comma-separated tools for agent (default: Bash,Read,Glob,Grep,Write,Edit)
|
|
46
46
|
--supervisor-allowed-tools=LIST
|
package/index.js
CHANGED
package/package.json
CHANGED
package/src/agent-runner.js
CHANGED
|
@@ -17,6 +17,8 @@ export class AgentRunner {
|
|
|
17
17
|
* @param {string[]} [deps.allowedTools] - Tools the agent may use
|
|
18
18
|
* @param {string} [deps.permissionMode] - SDK permission mode
|
|
19
19
|
* @param {function} [deps.onLine] - Callback invoked with each NDJSON line as it's produced
|
|
20
|
+
* @param {function} [deps.onBatch] - Async callback invoked with a batch of NDJSON lines at flush boundaries: every `batchSize` assistant text blocks, the terminal `result` message, and — on iterator crash/abort — once more in a final flush carrying any lines that never reached a boundary. Receives `(lines, { abort })` where calling `abort()` stops the in-flight SDK session via the AbortController. Optional; assignable at runtime so the Supervisor can swap it per turn.
|
|
21
|
+
* @param {number} [deps.batchSize] - Assistant text-block messages to accumulate before firing onBatch. Tool-only assistant messages ride along without counting. Default 3: the supervisor reviews the agent every three text turns instead of every turn. The terminal `result` always flushes regardless of count.
|
|
20
22
|
* @param {string[]} [deps.settingSources] - SDK setting sources (e.g. ['project'] to load CLAUDE.md)
|
|
21
23
|
* @param {string} [deps.agentProfile] - Agent profile name to pass as --agent to the Claude CLI
|
|
22
24
|
* @param {string|object} [deps.systemPrompt] - SDK system prompt (string replaces default; {type:'preset', preset:'claude_code', append} appends)
|
|
@@ -31,6 +33,8 @@ export class AgentRunner {
|
|
|
31
33
|
allowedTools,
|
|
32
34
|
permissionMode,
|
|
33
35
|
onLine,
|
|
36
|
+
onBatch,
|
|
37
|
+
batchSize,
|
|
34
38
|
settingSources,
|
|
35
39
|
agentProfile,
|
|
36
40
|
systemPrompt,
|
|
@@ -43,7 +47,7 @@ export class AgentRunner {
|
|
|
43
47
|
this.query = query;
|
|
44
48
|
this.output = output;
|
|
45
49
|
this.model = model ?? "opus";
|
|
46
|
-
this.maxTurns = maxTurns ?? 50;
|
|
50
|
+
this.maxTurns = maxTurns ?? 50; // 0 means unlimited (omit from SDK)
|
|
47
51
|
this.allowedTools = allowedTools ?? [
|
|
48
52
|
"Bash",
|
|
49
53
|
"Read",
|
|
@@ -54,101 +58,214 @@ export class AgentRunner {
|
|
|
54
58
|
];
|
|
55
59
|
this.permissionMode = permissionMode ?? "bypassPermissions";
|
|
56
60
|
this.onLine = onLine ?? null;
|
|
61
|
+
this.onBatch = onBatch ?? null;
|
|
62
|
+
this.batchSize = batchSize ?? 3;
|
|
57
63
|
this.settingSources = settingSources ?? [];
|
|
58
64
|
this.agentProfile = agentProfile ?? null;
|
|
59
65
|
this.systemPrompt = systemPrompt ?? null;
|
|
60
66
|
this.disallowedTools = disallowedTools ?? [];
|
|
61
67
|
this.sessionId = null;
|
|
62
68
|
this.buffer = [];
|
|
69
|
+
/** @type {AbortController|null} */
|
|
70
|
+
this.currentAbortController = null;
|
|
63
71
|
}
|
|
64
72
|
|
|
65
73
|
/**
|
|
66
74
|
* Run a new agent session with the given task.
|
|
67
75
|
* @param {string} task - The task prompt
|
|
68
|
-
* @returns {Promise<{success: boolean, text: string, sessionId: string|null}>}
|
|
76
|
+
* @returns {Promise<{success: boolean, text: string, sessionId: string|null, error: Error|null, aborted: boolean}>}
|
|
69
77
|
*/
|
|
70
78
|
async run(task) {
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
let error = null;
|
|
74
|
-
|
|
79
|
+
const abortController = new AbortController();
|
|
80
|
+
this.currentAbortController = abortController;
|
|
75
81
|
try {
|
|
76
|
-
|
|
82
|
+
const iterator = this.query({
|
|
77
83
|
prompt: task,
|
|
78
84
|
options: {
|
|
79
85
|
cwd: this.cwd,
|
|
80
86
|
allowedTools: this.allowedTools,
|
|
81
|
-
maxTurns: this.maxTurns,
|
|
87
|
+
...(this.maxTurns > 0 && { maxTurns: this.maxTurns }),
|
|
82
88
|
model: this.model,
|
|
83
89
|
permissionMode: this.permissionMode,
|
|
84
90
|
allowDangerouslySkipPermissions: true,
|
|
85
91
|
settingSources: this.settingSources,
|
|
92
|
+
abortController,
|
|
86
93
|
...(this.disallowedTools.length > 0 && {
|
|
87
94
|
disallowedTools: this.disallowedTools,
|
|
88
95
|
}),
|
|
89
96
|
...(this.systemPrompt && { systemPrompt: this.systemPrompt }),
|
|
90
97
|
...(this.agentProfile && { extraArgs: { agent: this.agentProfile } }),
|
|
91
98
|
},
|
|
92
|
-
})
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
if (this.onLine) this.onLine(line);
|
|
97
|
-
|
|
98
|
-
if (message.type === "system" && message.subtype === "init") {
|
|
99
|
-
this.sessionId = message.session_id;
|
|
100
|
-
}
|
|
101
|
-
if (message.type === "result") {
|
|
102
|
-
text = message.result ?? "";
|
|
103
|
-
stopReason = message.subtype;
|
|
104
|
-
}
|
|
105
|
-
}
|
|
106
|
-
} catch (err) {
|
|
107
|
-
error = err;
|
|
99
|
+
});
|
|
100
|
+
return await this.#consumeQuery(iterator);
|
|
101
|
+
} finally {
|
|
102
|
+
this.currentAbortController = null;
|
|
108
103
|
}
|
|
109
|
-
|
|
110
|
-
// If the SDK already emitted a successful result, honour it even when the
|
|
111
|
-
// stream throws afterwards (e.g. "Credit balance is too low" during
|
|
112
|
-
// cleanup). Only treat errors as fatal when no result was received yet.
|
|
113
|
-
const success = stopReason === "success";
|
|
114
|
-
return { success, text, sessionId: this.sessionId, error };
|
|
115
104
|
}
|
|
116
105
|
|
|
117
106
|
/**
|
|
118
107
|
* Resume an existing session with a follow-up prompt.
|
|
119
108
|
* @param {string} prompt - The follow-up prompt
|
|
120
|
-
* @returns {Promise<{success: boolean, text: string}>}
|
|
109
|
+
* @returns {Promise<{success: boolean, text: string, sessionId: string|null, error: Error|null, aborted: boolean}>}
|
|
121
110
|
*/
|
|
122
111
|
async resume(prompt) {
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
let error = null;
|
|
126
|
-
|
|
112
|
+
const abortController = new AbortController();
|
|
113
|
+
this.currentAbortController = abortController;
|
|
127
114
|
try {
|
|
128
|
-
|
|
115
|
+
const iterator = this.query({
|
|
129
116
|
prompt,
|
|
130
117
|
options: {
|
|
131
118
|
resume: this.sessionId,
|
|
132
119
|
permissionMode: this.permissionMode,
|
|
133
120
|
allowDangerouslySkipPermissions: true,
|
|
121
|
+
abortController,
|
|
134
122
|
},
|
|
135
|
-
})
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
123
|
+
});
|
|
124
|
+
return await this.#consumeQuery(iterator);
|
|
125
|
+
} finally {
|
|
126
|
+
this.currentAbortController = null;
|
|
127
|
+
}
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
/**
|
|
131
|
+
* Shared consumer for both `run()` and `resume()`. Iterates the SDK query
|
|
132
|
+
* iterator, mirroring every line to the output stream / buffer / onLine
|
|
133
|
+
* callback, and — when `onBatch` is set — flushes accumulated lines to it
|
|
134
|
+
* at coarse boundaries: every `batchSize` assistant text-block messages,
|
|
135
|
+
* and the terminal `result` message. Tool-only assistant messages still
|
|
136
|
+
* accumulate in the pending batch and ride along in the next flush, so
|
|
137
|
+
* the supervisor always sees the tool calls that led up to each text
|
|
138
|
+
* block. Raising `batchSize` above 1 is the knob that makes the mid-turn
|
|
139
|
+
* supervisor review less chatty — with the default of 3, the supervisor
|
|
140
|
+
* sees the agent in chunks of three text turns instead of every turn.
|
|
141
|
+
*
|
|
142
|
+
* Corollary: a turn that is *entirely* tool_use with no text blocks and
|
|
143
|
+
* then hits `result` produces exactly one flush at `result` regardless
|
|
144
|
+
* of how many tools ran. That is deliberate — the supervisor only needs
|
|
145
|
+
* to weigh in when the agent surfaces something text-like to react to.
|
|
146
|
+
*
|
|
147
|
+
* INVARIANT: the `await this.onBatch(...)` call below is the ONLY
|
|
148
|
+
* suspension point in this loop. While it is pending, no further lines
|
|
149
|
+
* are pulled from the SDK generator. The Supervisor relies on this — its
|
|
150
|
+
* onBatch callback flips `currentSource` to "supervisor" for the duration
|
|
151
|
+
* of its mid-turn LLM call, and the invariant guarantees no agent line
|
|
152
|
+
* can arrive concurrently and be mis-tagged.
|
|
153
|
+
*
|
|
154
|
+
* If the supervisor calls `abort()` from inside the callback, the next
|
|
155
|
+
* iteration of the for-await loop will throw. We catch the throw, check
|
|
156
|
+
* `currentAbortController.signal.aborted` (avoiding fragility around
|
|
157
|
+
* AbortError vs DOMException shapes), and report `aborted: true` so the
|
|
158
|
+
* caller can distinguish "supervisor asked us to stop" from a real error.
|
|
159
|
+
*
|
|
160
|
+
* If the iterator throws before a flush boundary, any lines still in the
|
|
161
|
+
* pending batch would otherwise vanish without the supervisor seeing
|
|
162
|
+
* them. The `finally` block emits a terminal batch so the supervisor can
|
|
163
|
+
* observe the partial state (e.g. note a crash or react to an external
|
|
164
|
+
* abort). A throw from that final flush becomes the returned `error`
|
|
165
|
+
* only if no earlier error was captured — the original failure wins.
|
|
166
|
+
* @param {AsyncIterable<object>} iterator
|
|
167
|
+
* @returns {Promise<{success: boolean, text: string, sessionId: string|null, error: Error|null, aborted: boolean}>}
|
|
168
|
+
*/
|
|
169
|
+
async #consumeQuery(iterator) {
|
|
170
|
+
let text = "";
|
|
171
|
+
let stopReason = null;
|
|
172
|
+
let error = null;
|
|
173
|
+
let aborted = false;
|
|
174
|
+
const state = { pendingBatch: [], assistantTextCount: 0 };
|
|
140
175
|
|
|
176
|
+
try {
|
|
177
|
+
for await (const message of iterator) {
|
|
178
|
+
this.#recordLine(message, state);
|
|
141
179
|
if (message.type === "result") {
|
|
142
180
|
text = message.result ?? "";
|
|
143
181
|
stopReason = message.subtype;
|
|
144
182
|
}
|
|
183
|
+
await this.#maybeFlushBatch(message, state);
|
|
145
184
|
}
|
|
146
185
|
} catch (err) {
|
|
147
|
-
|
|
186
|
+
if (this.currentAbortController?.signal.aborted) {
|
|
187
|
+
aborted = true;
|
|
188
|
+
} else {
|
|
189
|
+
error = err;
|
|
190
|
+
}
|
|
148
191
|
}
|
|
149
192
|
|
|
193
|
+
const flushErr = await this.#terminalFlush(state, { error, aborted });
|
|
194
|
+
if (flushErr && !error) error = flushErr;
|
|
195
|
+
|
|
150
196
|
const success = stopReason === "success";
|
|
151
|
-
return { success, text, error };
|
|
197
|
+
return { success, text, sessionId: this.sessionId, error, aborted };
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
/**
|
|
201
|
+
* Mirror a single SDK message to the output stream, buffer, onLine
|
|
202
|
+
* callback, and (when set) the pending-batch state. Also handles
|
|
203
|
+
* session id capture and text-block counting so `#consumeQuery` can
|
|
204
|
+
* stay within the complexity budget.
|
|
205
|
+
* @param {object} message
|
|
206
|
+
* @param {{pendingBatch: string[], assistantTextCount: number}} state
|
|
207
|
+
*/
|
|
208
|
+
#recordLine(message, state) {
|
|
209
|
+
const line = JSON.stringify(message);
|
|
210
|
+
this.output.write(line + "\n");
|
|
211
|
+
this.buffer.push(line);
|
|
212
|
+
if (this.onLine) this.onLine(line);
|
|
213
|
+
if (this.onBatch) state.pendingBatch.push(line);
|
|
214
|
+
|
|
215
|
+
if (message.type === "system" && message.subtype === "init") {
|
|
216
|
+
this.sessionId = message.session_id;
|
|
217
|
+
}
|
|
218
|
+
if (message.type === "assistant" && hasTextBlock(message)) {
|
|
219
|
+
state.assistantTextCount++;
|
|
220
|
+
}
|
|
221
|
+
}
|
|
222
|
+
|
|
223
|
+
/**
|
|
224
|
+
* Terminal flush — only fires on the abnormal-end paths (iterator
|
|
225
|
+
* threw or was aborted mid-stream). Delivers any pending lines so the
|
|
226
|
+
* supervisor sees the partial state instead of losing the tail of
|
|
227
|
+
* the run. A natural-end iterator that simply ran out of messages
|
|
228
|
+
* without a `result` marker is treated as an incomplete stub (the
|
|
229
|
+
* real SDK always terminates with `result`) and its pending batch is
|
|
230
|
+
* not re-flushed. Returns an error thrown by the flush callback, or
|
|
231
|
+
* `null` if the flush succeeded or did not fire.
|
|
232
|
+
* @param {{pendingBatch: string[], assistantTextCount: number}} state
|
|
233
|
+
* @param {{error: Error|null, aborted: boolean}} outcome
|
|
234
|
+
* @returns {Promise<Error|null>}
|
|
235
|
+
*/
|
|
236
|
+
async #terminalFlush(state, { error, aborted }) {
|
|
237
|
+
const loopEndedAbnormally = Boolean(error || aborted);
|
|
238
|
+
if (!loopEndedAbnormally) return null;
|
|
239
|
+
if (!this.onBatch || state.pendingBatch.length === 0) return null;
|
|
240
|
+
try {
|
|
241
|
+
const batchLines = state.pendingBatch.splice(0);
|
|
242
|
+
await this.onBatch(batchLines, {
|
|
243
|
+
abort: () => this.currentAbortController?.abort(),
|
|
244
|
+
});
|
|
245
|
+
return null;
|
|
246
|
+
} catch (flushErr) {
|
|
247
|
+
return flushErr;
|
|
248
|
+
}
|
|
249
|
+
}
|
|
250
|
+
|
|
251
|
+
/**
|
|
252
|
+
* Flush the pending batch to `onBatch` if either the batchSize threshold
|
|
253
|
+
* has been reached or the current message is the terminal `result`.
|
|
254
|
+
* Extracted so that `#consumeQuery` stays within the project's complexity
|
|
255
|
+
* budget — the flush is one cohesive unit of logic in its own right.
|
|
256
|
+
* @param {object} message
|
|
257
|
+
* @param {{pendingBatch: string[], assistantTextCount: number}} state
|
|
258
|
+
*/
|
|
259
|
+
async #maybeFlushBatch(message, state) {
|
|
260
|
+
if (!this.onBatch) return;
|
|
261
|
+
const shouldFlush =
|
|
262
|
+
message.type === "result" || state.assistantTextCount >= this.batchSize;
|
|
263
|
+
if (!shouldFlush) return;
|
|
264
|
+
state.assistantTextCount = 0;
|
|
265
|
+
const batchLines = state.pendingBatch.splice(0);
|
|
266
|
+
await this.onBatch(batchLines, {
|
|
267
|
+
abort: () => this.currentAbortController?.abort(),
|
|
268
|
+
});
|
|
152
269
|
}
|
|
153
270
|
|
|
154
271
|
/**
|
|
@@ -162,6 +279,24 @@ export class AgentRunner {
|
|
|
162
279
|
}
|
|
163
280
|
}
|
|
164
281
|
|
|
282
|
+
/**
 * Whether an SDK assistant message contains at least one non-empty text block.
 *
 * The content array is read from `message.message.content`, falling back to
 * `message.content`. Only text-block messages count toward the `batchSize`
 * threshold; tool-only assistant messages accumulate in the pending batch
 * without counting. Exported so other modules (e.g. a mock runner) can reuse
 * the same predicate.
 *
 * Malformed input is tolerated rather than thrown on: a null/undefined
 * message, non-array content, and null or non-object entries inside the
 * content array all yield `false` for that message or entry.
 * @param {object} message
 * @returns {boolean}
 */
export function hasTextBlock(message) {
  const content = message?.message?.content ?? message?.content;
  if (!Array.isArray(content)) return false;
  // A text block with an empty `text` does not count.
  return content.some((block) => block?.type === "text" && Boolean(block.text));
}
|
|
299
|
+
|
|
165
300
|
/**
|
|
166
301
|
* Factory function — wires real dependencies.
|
|
167
302
|
* @param {object} deps - Same as AgentRunner constructor
|
package/src/commands/run.js
CHANGED
|
@@ -18,6 +18,38 @@ function parseFlag(args, name) {
|
|
|
18
18
|
return undefined;
|
|
19
19
|
}
|
|
20
20
|
|
|
21
|
+
/**
 * Parse and validate run command options from args.
 *
 * Exactly one of `--task-file` / `--task-text` must be supplied; the text of
 * `--task-amend`, when present, is appended to the task after a blank line.
 * `--max-turns` defaults to 50 and must parse to a non-negative integer,
 * where 0 means unlimited. Invalid values are rejected here because a NaN or
 * negative value would otherwise be treated as "unlimited" downstream.
 * @param {string[]} args
 * @returns {{ taskContent: string, cwd: string, model: string, maxTurns: number, outputPath: string|undefined, agentProfile: string|undefined, allowedTools: string[] }}
 * @throws {Error} If the task flags are missing or conflicting, or if
 *   `--max-turns` is not a non-negative integer.
 */
function parseRunOptions(args) {
  const taskFile = parseFlag(args, "task-file");
  const taskText = parseFlag(args, "task-text");
  if (taskFile && taskText)
    throw new Error("--task-file and --task-text are mutually exclusive");
  if (!taskFile && !taskText)
    throw new Error("--task-file or --task-text is required");

  // Validate before touching the filesystem so bad flags fail fast.
  const maxTurnsRaw = parseFlag(args, "max-turns") ?? "50";
  const maxTurns = parseInt(maxTurnsRaw, 10);
  if (Number.isNaN(maxTurns) || maxTurns < 0) {
    throw new Error(
      `--max-turns must be a non-negative integer (got "${maxTurnsRaw}")`,
    );
  }

  const taskAmend = parseFlag(args, "task-amend") ?? undefined;
  let taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
  if (taskAmend) taskContent += `\n\n${taskAmend}`;

  return {
    taskContent,
    cwd: resolve(parseFlag(args, "cwd") ?? "."),
    model: parseFlag(args, "model") ?? "opus",
    maxTurns,
    outputPath: parseFlag(args, "output"),
    agentProfile: parseFlag(args, "agent-profile") ?? undefined,
    allowedTools: (
      parseFlag(args, "allowed-tools") ??
      "Bash,Read,Glob,Grep,Write,Edit,Agent,TodoWrite"
    ).split(","),
  };
}
|
|
52
|
+
|
|
21
53
|
/**
|
|
22
54
|
* Run command — execute a single agent via the Claude Agent SDK.
|
|
23
55
|
*
|
|
@@ -28,31 +60,24 @@ function parseFlag(args, name) {
|
|
|
28
60
|
* --task-text=STRING Inline task text (mutually exclusive with --task-file)
|
|
29
61
|
* --cwd=DIR Agent working directory (default: .)
|
|
30
62
|
* --model=MODEL Claude model to use (default: opus)
|
|
31
|
-
* --max-turns=N Maximum agentic turns (default: 50)
|
|
63
|
+
* --max-turns=N Maximum agentic turns (default: 50, 0 = unlimited)
|
|
32
64
|
* --output=PATH Write NDJSON trace to file (default: stdout)
|
|
33
65
|
* --allowed-tools=LIST Comma-separated tools (default: Bash,Read,Glob,Grep,Write,Edit,Agent,TodoWrite)
|
|
34
66
|
* --agent-profile=NAME Agent profile name (passed as --agent to Claude CLI)
|
|
67
|
+
* --task-amend=TEXT Additional text appended to the task prompt
|
|
35
68
|
*
|
|
36
69
|
* @param {string[]} args - Command arguments
|
|
37
70
|
*/
|
|
38
71
|
export async function runRunCommand(args) {
|
|
39
|
-
const
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
const maxTurns = parseInt(parseFlag(args, "max-turns") ?? "50", 10);
|
|
49
|
-
const outputPath = parseFlag(args, "output");
|
|
50
|
-
const agentProfile = parseFlag(args, "agent-profile") ?? undefined;
|
|
51
|
-
const allowedTools = (
|
|
52
|
-
parseFlag(args, "allowed-tools") ?? "Bash,Read,Glob,Grep,Write,Edit"
|
|
53
|
-
).split(",");
|
|
54
|
-
|
|
55
|
-
const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
|
|
72
|
+
const {
|
|
73
|
+
taskContent,
|
|
74
|
+
cwd,
|
|
75
|
+
model,
|
|
76
|
+
maxTurns,
|
|
77
|
+
outputPath,
|
|
78
|
+
agentProfile,
|
|
79
|
+
allowedTools,
|
|
80
|
+
} = parseRunOptions(args);
|
|
56
81
|
|
|
57
82
|
// When --output is specified, stream text to stdout while writing NDJSON to file.
|
|
58
83
|
// Otherwise, write NDJSON directly to stdout (backwards-compatible).
|
|
@@ -19,6 +19,50 @@ function parseFlag(args, name) {
|
|
|
19
19
|
return undefined;
|
|
20
20
|
}
|
|
21
21
|
|
|
22
|
+
/**
 * Parse all supervise flags from args into an options object.
 *
 * Exactly one of `--task-file` / `--task-text` must be supplied; the text of
 * `--task-amend`, when present, is appended to the task after a blank line.
 * `--max-turns` defaults to 20 and must parse to a non-negative integer,
 * where 0 means unlimited. When `--agent-cwd` is omitted a fresh temporary
 * directory is created for the agent; all flag validation runs first so an
 * invalid invocation does not leave a stray directory behind.
 * @param {string[]} args
 * @returns {{ taskContent: string, supervisorCwd: string, agentCwd: string, model: string, maxTurns: number, outputPath: string|undefined, supervisorProfile: string|undefined, agentProfile: string|undefined, allowedTools: string[], supervisorAllowedTools: string[]|undefined }}
 * @throws {Error} If the task flags are missing or conflicting, or if
 *   `--max-turns` is not a non-negative integer.
 */
function parseSuperviseOptions(args) {
  const taskFile = parseFlag(args, "task-file");
  const taskText = parseFlag(args, "task-text");
  if (taskFile && taskText)
    throw new Error("--task-file and --task-text are mutually exclusive");
  if (!taskFile && !taskText)
    throw new Error("--task-file or --task-text is required");

  const maxTurnsRaw = parseFlag(args, "max-turns") ?? "20";
  const maxTurns = parseInt(maxTurnsRaw, 10);
  if (Number.isNaN(maxTurns) || maxTurns < 0) {
    throw new Error(
      `--max-turns must be a non-negative integer (got "${maxTurnsRaw}")`,
    );
  }

  const supervisorAllowedToolsRaw = parseFlag(args, "supervisor-allowed-tools");

  const taskAmend = parseFlag(args, "task-amend") ?? undefined;
  let taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
  if (taskAmend) taskContent += `\n\n${taskAmend}`;

  return {
    taskContent,
    supervisorCwd: resolve(parseFlag(args, "supervisor-cwd") ?? "."),
    // mkdtempSync only runs when --agent-cwd was not provided.
    agentCwd: resolve(
      parseFlag(args, "agent-cwd") ??
        mkdtempSync(join(tmpdir(), "fit-eval-agent-")),
    ),
    model: parseFlag(args, "model") ?? "opus",
    maxTurns,
    outputPath: parseFlag(args, "output"),
    supervisorProfile: parseFlag(args, "supervisor-profile") ?? undefined,
    agentProfile: parseFlag(args, "agent-profile") ?? undefined,
    allowedTools: (
      parseFlag(args, "allowed-tools") ??
      "Bash,Read,Glob,Grep,Write,Edit,Agent,TodoWrite"
    ).split(","),
    supervisorAllowedTools: supervisorAllowedToolsRaw
      ? supervisorAllowedToolsRaw.split(",")
      : undefined,
  };
}
|
|
65
|
+
|
|
22
66
|
/**
|
|
23
67
|
* Supervise command — run two agents in a relay loop via the Claude Agent SDK.
|
|
24
68
|
*
|
|
@@ -30,45 +74,23 @@ function parseFlag(args, name) {
|
|
|
30
74
|
* --supervisor-cwd=DIR Supervisor working directory (default: .)
|
|
31
75
|
* --agent-cwd=DIR Agent working directory (default: temp directory)
|
|
32
76
|
* --model=MODEL Claude model to use (default: opus)
|
|
33
|
-
* --max-turns=N Maximum supervisor
|
|
77
|
+
* --max-turns=N Maximum supervisor / agent exchanges (default: 20, 0 = unlimited)
|
|
34
78
|
* --output=PATH Write NDJSON trace to file (default: stdout)
|
|
35
79
|
* --allowed-tools=LIST Comma-separated tools for the agent (default: Bash,Read,Glob,Grep,Write,Edit,Agent,TodoWrite)
|
|
36
80
|
* --supervisor-profile=NAME Supervisor agent profile name (passed as --agent to Claude CLI)
|
|
37
81
|
* --agent-profile=NAME Agent profile name (passed as --agent to Claude CLI)
|
|
82
|
+
* --task-amend=TEXT Additional text appended to the task prompt
|
|
38
83
|
*
|
|
39
84
|
* @param {string[]} args - Command arguments
|
|
40
85
|
*/
|
|
41
86
|
export async function runSuperviseCommand(args) {
|
|
42
|
-
const
|
|
43
|
-
const taskText = parseFlag(args, "task-text");
|
|
44
|
-
if (taskFile && taskText)
|
|
45
|
-
throw new Error("--task-file and --task-text are mutually exclusive");
|
|
46
|
-
if (!taskFile && !taskText)
|
|
47
|
-
throw new Error("--task-file or --task-text is required");
|
|
48
|
-
|
|
49
|
-
const supervisorCwd = resolve(parseFlag(args, "supervisor-cwd") ?? ".");
|
|
50
|
-
const agentCwd = resolve(
|
|
51
|
-
parseFlag(args, "agent-cwd") ??
|
|
52
|
-
mkdtempSync(join(tmpdir(), "fit-eval-agent-")),
|
|
53
|
-
);
|
|
54
|
-
const model = parseFlag(args, "model") ?? "opus";
|
|
55
|
-
const maxTurns = parseInt(parseFlag(args, "max-turns") ?? "20", 10);
|
|
56
|
-
const outputPath = parseFlag(args, "output");
|
|
57
|
-
const supervisorProfile = parseFlag(args, "supervisor-profile") ?? undefined;
|
|
58
|
-
const agentProfile = parseFlag(args, "agent-profile") ?? undefined;
|
|
59
|
-
const allowedTools = (
|
|
60
|
-
parseFlag(args, "allowed-tools") ?? "Bash,Read,Glob,Grep,Write,Edit"
|
|
61
|
-
).split(",");
|
|
62
|
-
const supervisorAllowedToolsRaw = parseFlag(args, "supervisor-allowed-tools");
|
|
63
|
-
const supervisorAllowedTools = supervisorAllowedToolsRaw
|
|
64
|
-
? supervisorAllowedToolsRaw.split(",")
|
|
65
|
-
: undefined;
|
|
66
|
-
|
|
67
|
-
const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
|
|
87
|
+
const opts = parseSuperviseOptions(args);
|
|
68
88
|
|
|
69
89
|
// When --output is specified, stream text to stdout while writing NDJSON to file.
|
|
70
90
|
// Otherwise, write NDJSON directly to stdout (backwards-compatible).
|
|
71
|
-
const fileStream = outputPath
|
|
91
|
+
const fileStream = opts.outputPath
|
|
92
|
+
? createWriteStream(opts.outputPath)
|
|
93
|
+
: null;
|
|
72
94
|
const output = fileStream
|
|
73
95
|
? createTeeWriter({
|
|
74
96
|
fileStream,
|
|
@@ -79,19 +101,19 @@ export async function runSuperviseCommand(args) {
|
|
|
79
101
|
|
|
80
102
|
const { query } = await import("@anthropic-ai/claude-agent-sdk");
|
|
81
103
|
const supervisor = createSupervisor({
|
|
82
|
-
supervisorCwd,
|
|
83
|
-
agentCwd,
|
|
104
|
+
supervisorCwd: opts.supervisorCwd,
|
|
105
|
+
agentCwd: opts.agentCwd,
|
|
84
106
|
query,
|
|
85
107
|
output,
|
|
86
|
-
model,
|
|
87
|
-
maxTurns,
|
|
88
|
-
allowedTools,
|
|
89
|
-
supervisorAllowedTools,
|
|
90
|
-
supervisorProfile,
|
|
91
|
-
agentProfile,
|
|
108
|
+
model: opts.model,
|
|
109
|
+
maxTurns: opts.maxTurns,
|
|
110
|
+
allowedTools: opts.allowedTools,
|
|
111
|
+
supervisorAllowedTools: opts.supervisorAllowedTools,
|
|
112
|
+
supervisorProfile: opts.supervisorProfile,
|
|
113
|
+
agentProfile: opts.agentProfile,
|
|
92
114
|
});
|
|
93
115
|
|
|
94
|
-
const result = await supervisor.run(taskContent);
|
|
116
|
+
const result = await supervisor.run(opts.taskContent);
|
|
95
117
|
|
|
96
118
|
if (fileStream) {
|
|
97
119
|
await new Promise((r) => output.end(r));
|