@forwardimpact/libeval 0.1.5 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/bin/fit-eval.js CHANGED
@@ -29,7 +29,7 @@ Run options:
29
29
  --task-text=STRING Inline task text (mutually exclusive with --task-file)
30
30
  --cwd=DIR Agent working directory (default: .)
31
31
  --model=MODEL Claude model to use (default: opus)
32
- --max-turns=N Maximum agentic turns (default: 50)
32
+ --max-turns=N Maximum agentic turns (default: 50, 0 = unlimited)
33
33
  --output=PATH Write NDJSON trace to file (default: stdout)
34
34
  --allowed-tools=LIST Comma-separated tools (default: Bash,Read,Glob,Grep,Write,Edit)
35
35
  --agent-profile=NAME Agent profile name (passed as --agent to Claude CLI)
@@ -40,7 +40,7 @@ Supervise options:
40
40
  --supervisor-cwd=DIR Supervisor working directory (default: .)
41
41
  --agent-cwd=DIR Agent working directory (default: temp directory)
42
42
  --model=MODEL Claude model to use (default: opus)
43
- --max-turns=N Maximum supervisor ↔ agent exchanges (default: 20)
43
+ --max-turns=N Maximum supervisor ↔ agent exchanges (default: 20, 0 = unlimited)
44
44
  --output=PATH Write NDJSON trace to file (default: stdout)
45
45
  --allowed-tools=LIST Comma-separated tools for agent (default: Bash,Read,Glob,Grep,Write,Edit)
46
46
  --supervisor-allowed-tools=LIST
package/index.js CHANGED
@@ -5,5 +5,7 @@ export {
5
5
  createSupervisor,
6
6
  SUPERVISOR_SYSTEM_PROMPT,
7
7
  AGENT_SYSTEM_PROMPT,
8
+ isComplete,
9
+ isIntervention,
8
10
  } from "./src/supervisor.js";
9
11
  export { TeeWriter, createTeeWriter } from "./src/tee-writer.js";
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@forwardimpact/libeval",
3
- "version": "0.1.5",
3
+ "version": "0.1.8",
4
4
  "description": "Process Claude Code stream-json output into structured traces",
5
5
  "license": "Apache-2.0",
6
6
  "author": "D. Olsson <hi@senzilla.io>",
@@ -17,6 +17,7 @@ export class AgentRunner {
17
17
  * @param {string[]} [deps.allowedTools] - Tools the agent may use
18
18
  * @param {string} [deps.permissionMode] - SDK permission mode
19
19
  * @param {function} [deps.onLine] - Callback invoked with each NDJSON line as it's produced
20
+ * @param {function} [deps.onBatch] - Async callback invoked with a batch of NDJSON lines at flush boundaries (assistant text blocks and result messages). Receives `(lines, { abort })` where calling `abort()` stops the in-flight SDK session via the AbortController. Optional; assignable at runtime so the Supervisor can swap it per turn.
20
21
  * @param {string[]} [deps.settingSources] - SDK setting sources (e.g. ['project'] to load CLAUDE.md)
21
22
  * @param {string} [deps.agentProfile] - Agent profile name to pass as --agent to the Claude CLI
22
23
  * @param {string|object} [deps.systemPrompt] - SDK system prompt (string replaces default; {type:'preset', preset:'claude_code', append} appends)
@@ -31,6 +32,7 @@ export class AgentRunner {
31
32
  allowedTools,
32
33
  permissionMode,
33
34
  onLine,
35
+ onBatch,
34
36
  settingSources,
35
37
  agentProfile,
36
38
  systemPrompt,
@@ -43,7 +45,7 @@ export class AgentRunner {
43
45
  this.query = query;
44
46
  this.output = output;
45
47
  this.model = model ?? "opus";
46
- this.maxTurns = maxTurns ?? 50;
48
+ this.maxTurns = maxTurns ?? 50; // 0 means unlimited (omit from SDK)
47
49
  this.allowedTools = allowedTools ?? [
48
50
  "Bash",
49
51
  "Read",
@@ -54,101 +56,140 @@ export class AgentRunner {
54
56
  ];
55
57
  this.permissionMode = permissionMode ?? "bypassPermissions";
56
58
  this.onLine = onLine ?? null;
59
+ this.onBatch = onBatch ?? null;
57
60
  this.settingSources = settingSources ?? [];
58
61
  this.agentProfile = agentProfile ?? null;
59
62
  this.systemPrompt = systemPrompt ?? null;
60
63
  this.disallowedTools = disallowedTools ?? [];
61
64
  this.sessionId = null;
62
65
  this.buffer = [];
66
+ /** @type {AbortController|null} */
67
+ this.currentAbortController = null;
63
68
  }
64
69
 
65
70
  /**
66
71
  * Run a new agent session with the given task.
67
72
  * @param {string} task - The task prompt
68
- * @returns {Promise<{success: boolean, text: string, sessionId: string|null}>}
73
+ * @returns {Promise<{success: boolean, text: string, sessionId: string|null, error: Error|null, aborted: boolean}>}
69
74
  */
70
75
  async run(task) {
71
- let text = "";
72
- let stopReason = null;
73
- let error = null;
74
-
76
+ const abortController = new AbortController();
77
+ this.currentAbortController = abortController;
75
78
  try {
76
- for await (const message of this.query({
79
+ const iterator = this.query({
77
80
  prompt: task,
78
81
  options: {
79
82
  cwd: this.cwd,
80
83
  allowedTools: this.allowedTools,
81
- maxTurns: this.maxTurns,
84
+ ...(this.maxTurns > 0 && { maxTurns: this.maxTurns }),
82
85
  model: this.model,
83
86
  permissionMode: this.permissionMode,
84
87
  allowDangerouslySkipPermissions: true,
85
88
  settingSources: this.settingSources,
89
+ abortController,
86
90
  ...(this.disallowedTools.length > 0 && {
87
91
  disallowedTools: this.disallowedTools,
88
92
  }),
89
93
  ...(this.systemPrompt && { systemPrompt: this.systemPrompt }),
90
94
  ...(this.agentProfile && { extraArgs: { agent: this.agentProfile } }),
91
95
  },
92
- })) {
93
- const line = JSON.stringify(message);
94
- this.output.write(line + "\n");
95
- this.buffer.push(line);
96
- if (this.onLine) this.onLine(line);
97
-
98
- if (message.type === "system" && message.subtype === "init") {
99
- this.sessionId = message.session_id;
100
- }
101
- if (message.type === "result") {
102
- text = message.result ?? "";
103
- stopReason = message.subtype;
104
- }
105
- }
106
- } catch (err) {
107
- error = err;
96
+ });
97
+ return await this.#consumeQuery(iterator);
98
+ } finally {
99
+ this.currentAbortController = null;
108
100
  }
109
-
110
- // If the SDK already emitted a successful result, honour it even when the
111
- // stream throws afterwards (e.g. "Credit balance is too low" during
112
- // cleanup). Only treat errors as fatal when no result was received yet.
113
- const success = stopReason === "success";
114
- return { success, text, sessionId: this.sessionId, error };
115
101
  }
116
102
 
117
103
  /**
118
104
  * Resume an existing session with a follow-up prompt.
119
105
  * @param {string} prompt - The follow-up prompt
120
- * @returns {Promise<{success: boolean, text: string}>}
106
+ * @returns {Promise<{success: boolean, text: string, sessionId: string|null, error: Error|null, aborted: boolean}>}
121
107
  */
122
108
  async resume(prompt) {
123
- let text = "";
124
- let stopReason = null;
125
- let error = null;
126
-
109
+ const abortController = new AbortController();
110
+ this.currentAbortController = abortController;
127
111
  try {
128
- for await (const message of this.query({
112
+ const iterator = this.query({
129
113
  prompt,
130
114
  options: {
131
115
  resume: this.sessionId,
132
116
  permissionMode: this.permissionMode,
133
117
  allowDangerouslySkipPermissions: true,
118
+ abortController,
134
119
  },
135
- })) {
120
+ });
121
+ return await this.#consumeQuery(iterator);
122
+ } finally {
123
+ this.currentAbortController = null;
124
+ }
125
+ }
126
+
127
+ /**
128
+ * Shared consumer for both `run()` and `resume()`. Iterates the SDK query
129
+ * iterator, mirroring every line to the output stream / buffer / onLine
130
+ * callback, and — when `onBatch` is set — flushes accumulated lines to it
131
+ * at natural boundaries (assistant messages with text blocks, and the
132
+ * terminal `result` message).
133
+ *
134
+ * INVARIANT: the `await this.onBatch(...)` call below is the ONLY
135
+ * suspension point inside the loop body (the for-await itself still awaits each SDK message). While it is pending, no further lines
136
+ * are pulled from the SDK generator. The Supervisor relies on this — its
137
+ * onBatch callback flips `currentSource` to "supervisor" for the duration
138
+ * of its mid-turn LLM call, and the invariant guarantees no agent line
139
+ * can arrive concurrently and be mis-tagged.
140
+ *
141
+ * If the supervisor calls `abort()` from inside the callback, the next
142
+ * iteration of the for-await loop will throw. We catch the throw, check
143
+ * `currentAbortController.signal.aborted` (avoiding fragility around
144
+ * AbortError vs DOMException shapes), and report `aborted: true` so the
145
+ * caller can distinguish "supervisor asked us to stop" from a real error.
146
+ * @param {AsyncIterable<object>} iterator
147
+ * @returns {Promise<{success: boolean, text: string, sessionId: string|null, error: Error|null, aborted: boolean}>}
148
+ */
149
+ async #consumeQuery(iterator) {
150
+ let text = "";
151
+ let stopReason = null;
152
+ let error = null;
153
+ let aborted = false;
154
+ const pendingBatch = [];
155
+
156
+ try {
157
+ for await (const message of iterator) {
136
158
  const line = JSON.stringify(message);
137
159
  this.output.write(line + "\n");
138
160
  this.buffer.push(line);
139
161
  if (this.onLine) this.onLine(line);
162
+ if (this.onBatch) pendingBatch.push(line);
140
163
 
164
+ if (message.type === "system" && message.subtype === "init") {
165
+ this.sessionId = message.session_id;
166
+ }
141
167
  if (message.type === "result") {
142
168
  text = message.result ?? "";
143
169
  stopReason = message.subtype;
144
170
  }
171
+
172
+ const shouldFlush =
173
+ this.onBatch &&
174
+ (message.type === "result" ||
175
+ (message.type === "assistant" && hasTextBlock(message)));
176
+ if (shouldFlush) {
177
+ const batchLines = pendingBatch.splice(0, pendingBatch.length);
178
+ await this.onBatch(batchLines, {
179
+ abort: () => this.currentAbortController?.abort(),
180
+ });
181
+ }
145
182
  }
146
183
  } catch (err) {
147
- error = err;
184
+ if (this.currentAbortController?.signal.aborted) {
185
+ aborted = true;
186
+ } else {
187
+ error = err;
188
+ }
148
189
  }
149
190
 
150
191
  const success = stopReason === "success";
151
- return { success, text, error };
192
+ return { success, text, sessionId: this.sessionId, error, aborted };
152
193
  }
153
194
 
154
195
  /**
@@ -162,6 +203,23 @@ export class AgentRunner {
162
203
  }
163
204
  }
164
205
 
206
+ /**
207
+ * Whether an SDK assistant message contains at least one text block.
208
+ * Tool-only assistant messages return false so they accumulate into the
209
+ * pending batch and flush with the next text block (or with the terminal
210
+ * `result` message), keeping supervisor LLM cost bounded.
211
+ * @param {object} message
212
+ * @returns {boolean}
213
+ */
214
+ function hasTextBlock(message) {
215
+ const content = message.message?.content ?? message.content;
216
+ if (!Array.isArray(content)) return false;
217
+ for (const block of content) {
218
+ if (block.type === "text" && block.text) return true;
219
+ }
220
+ return false;
221
+ }
222
+
165
223
  /**
166
224
  * Factory function — wires real dependencies.
167
225
  * @param {object} deps - Same as AgentRunner constructor
@@ -18,6 +18,38 @@ function parseFlag(args, name) {
18
18
  return undefined;
19
19
  }
20
20
 
21
+ /**
22
+ * Parse and validate run command options from args.
23
+ * @param {string[]} args
24
+ * @returns {{ taskContent: string, cwd: string, model: string, maxTurns: number, outputPath: string|undefined, agentProfile: string|undefined, allowedTools: string[] }}
25
+ */
26
+ function parseRunOptions(args) {
27
+ const taskFile = parseFlag(args, "task-file");
28
+ const taskText = parseFlag(args, "task-text");
29
+ if (taskFile && taskText)
30
+ throw new Error("--task-file and --task-text are mutually exclusive");
31
+ if (!taskFile && !taskText)
32
+ throw new Error("--task-file or --task-text is required");
33
+
34
+ const maxTurnsRaw = parseFlag(args, "max-turns") ?? "50";
35
+ const taskAmend = parseFlag(args, "task-amend") ?? undefined;
36
+ let taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
37
+ if (taskAmend) taskContent += `\n\n${taskAmend}`;
38
+
39
+ return {
40
+ taskContent,
41
+ cwd: resolve(parseFlag(args, "cwd") ?? "."),
42
+ model: parseFlag(args, "model") ?? "opus",
43
+ maxTurns: maxTurnsRaw === "0" ? 0 : parseInt(maxTurnsRaw, 10),
44
+ outputPath: parseFlag(args, "output"),
45
+ agentProfile: parseFlag(args, "agent-profile") ?? undefined,
46
+ allowedTools: (
47
+ parseFlag(args, "allowed-tools") ??
48
+ "Bash,Read,Glob,Grep,Write,Edit,Agent,TodoWrite"
49
+ ).split(","),
50
+ };
51
+ }
52
+
21
53
  /**
22
54
  * Run command — execute a single agent via the Claude Agent SDK.
23
55
  *
@@ -28,31 +60,24 @@ function parseFlag(args, name) {
28
60
  * --task-text=STRING Inline task text (mutually exclusive with --task-file)
29
61
  * --cwd=DIR Agent working directory (default: .)
30
62
  * --model=MODEL Claude model to use (default: opus)
31
- * --max-turns=N Maximum agentic turns (default: 50)
63
+ * --max-turns=N Maximum agentic turns (default: 50, 0 = unlimited)
32
64
  * --output=PATH Write NDJSON trace to file (default: stdout)
33
65
  * --allowed-tools=LIST Comma-separated tools (default: Bash,Read,Glob,Grep,Write,Edit,Agent,TodoWrite)
34
66
  * --agent-profile=NAME Agent profile name (passed as --agent to Claude CLI)
67
+ * --task-amend=TEXT Additional text appended to the task prompt
35
68
  *
36
69
  * @param {string[]} args - Command arguments
37
70
  */
38
71
  export async function runRunCommand(args) {
39
- const taskFile = parseFlag(args, "task-file");
40
- const taskText = parseFlag(args, "task-text");
41
- if (taskFile && taskText)
42
- throw new Error("--task-file and --task-text are mutually exclusive");
43
- if (!taskFile && !taskText)
44
- throw new Error("--task-file or --task-text is required");
45
-
46
- const cwd = resolve(parseFlag(args, "cwd") ?? ".");
47
- const model = parseFlag(args, "model") ?? "opus";
48
- const maxTurns = parseInt(parseFlag(args, "max-turns") ?? "50", 10);
49
- const outputPath = parseFlag(args, "output");
50
- const agentProfile = parseFlag(args, "agent-profile") ?? undefined;
51
- const allowedTools = (
52
- parseFlag(args, "allowed-tools") ?? "Bash,Read,Glob,Grep,Write,Edit"
53
- ).split(",");
54
-
55
- const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
72
+ const {
73
+ taskContent,
74
+ cwd,
75
+ model,
76
+ maxTurns,
77
+ outputPath,
78
+ agentProfile,
79
+ allowedTools,
80
+ } = parseRunOptions(args);
56
81
 
57
82
  // When --output is specified, stream text to stdout while writing NDJSON to file.
58
83
  // Otherwise, write NDJSON directly to stdout (backwards-compatible).
@@ -19,6 +19,50 @@ function parseFlag(args, name) {
19
19
  return undefined;
20
20
  }
21
21
 
22
+ /**
23
+ * Parse all supervise flags from args into an options object.
24
+ * @param {string[]} args
25
+ * @returns {object}
26
+ */
27
+ function parseSuperviseOptions(args) {
28
+ const taskFile = parseFlag(args, "task-file");
29
+ const taskText = parseFlag(args, "task-text");
30
+ if (taskFile && taskText)
31
+ throw new Error("--task-file and --task-text are mutually exclusive");
32
+ if (!taskFile && !taskText)
33
+ throw new Error("--task-file or --task-text is required");
34
+
35
+ const supervisorAllowedToolsRaw = parseFlag(args, "supervisor-allowed-tools");
36
+
37
+ const taskAmend = parseFlag(args, "task-amend") ?? undefined;
38
+ let taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
39
+ if (taskAmend) taskContent += `\n\n${taskAmend}`;
40
+
41
+ return {
42
+ taskContent,
43
+ supervisorCwd: resolve(parseFlag(args, "supervisor-cwd") ?? "."),
44
+ agentCwd: resolve(
45
+ parseFlag(args, "agent-cwd") ??
46
+ mkdtempSync(join(tmpdir(), "fit-eval-agent-")),
47
+ ),
48
+ model: parseFlag(args, "model") ?? "opus",
49
+ maxTurns: (() => {
50
+ const raw = parseFlag(args, "max-turns") ?? "20";
51
+ return raw === "0" ? 0 : parseInt(raw, 10);
52
+ })(),
53
+ outputPath: parseFlag(args, "output"),
54
+ supervisorProfile: parseFlag(args, "supervisor-profile") ?? undefined,
55
+ agentProfile: parseFlag(args, "agent-profile") ?? undefined,
56
+ allowedTools: (
57
+ parseFlag(args, "allowed-tools") ??
58
+ "Bash,Read,Glob,Grep,Write,Edit,Agent,TodoWrite"
59
+ ).split(","),
60
+ supervisorAllowedTools: supervisorAllowedToolsRaw
61
+ ? supervisorAllowedToolsRaw.split(",")
62
+ : undefined,
63
+ };
64
+ }
65
+
22
66
  /**
23
67
  * Supervise command — run two agents in a relay loop via the Claude Agent SDK.
24
68
  *
@@ -30,45 +74,23 @@ function parseFlag(args, name) {
30
74
  * --supervisor-cwd=DIR Supervisor working directory (default: .)
31
75
  * --agent-cwd=DIR Agent working directory (default: temp directory)
32
76
  * --model=MODEL Claude model to use (default: opus)
33
- * --max-turns=N Maximum supervisor agent exchanges (default: 20)
77
+ * --max-turns=N Maximum supervisor / agent exchanges (default: 20, 0 = unlimited)
34
78
  * --output=PATH Write NDJSON trace to file (default: stdout)
35
79
  * --allowed-tools=LIST Comma-separated tools for the agent (default: Bash,Read,Glob,Grep,Write,Edit,Agent,TodoWrite)
36
80
  * --supervisor-profile=NAME Supervisor agent profile name (passed as --agent to Claude CLI)
37
81
  * --agent-profile=NAME Agent profile name (passed as --agent to Claude CLI)
82
+ * --task-amend=TEXT Additional text appended to the task prompt
38
83
  *
39
84
  * @param {string[]} args - Command arguments
40
85
  */
41
86
  export async function runSuperviseCommand(args) {
42
- const taskFile = parseFlag(args, "task-file");
43
- const taskText = parseFlag(args, "task-text");
44
- if (taskFile && taskText)
45
- throw new Error("--task-file and --task-text are mutually exclusive");
46
- if (!taskFile && !taskText)
47
- throw new Error("--task-file or --task-text is required");
48
-
49
- const supervisorCwd = resolve(parseFlag(args, "supervisor-cwd") ?? ".");
50
- const agentCwd = resolve(
51
- parseFlag(args, "agent-cwd") ??
52
- mkdtempSync(join(tmpdir(), "fit-eval-agent-")),
53
- );
54
- const model = parseFlag(args, "model") ?? "opus";
55
- const maxTurns = parseInt(parseFlag(args, "max-turns") ?? "20", 10);
56
- const outputPath = parseFlag(args, "output");
57
- const supervisorProfile = parseFlag(args, "supervisor-profile") ?? undefined;
58
- const agentProfile = parseFlag(args, "agent-profile") ?? undefined;
59
- const allowedTools = (
60
- parseFlag(args, "allowed-tools") ?? "Bash,Read,Glob,Grep,Write,Edit"
61
- ).split(",");
62
- const supervisorAllowedToolsRaw = parseFlag(args, "supervisor-allowed-tools");
63
- const supervisorAllowedTools = supervisorAllowedToolsRaw
64
- ? supervisorAllowedToolsRaw.split(",")
65
- : undefined;
66
-
67
- const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
87
+ const opts = parseSuperviseOptions(args);
68
88
 
69
89
  // When --output is specified, stream text to stdout while writing NDJSON to file.
70
90
  // Otherwise, write NDJSON directly to stdout (backwards-compatible).
71
- const fileStream = outputPath ? createWriteStream(outputPath) : null;
91
+ const fileStream = opts.outputPath
92
+ ? createWriteStream(opts.outputPath)
93
+ : null;
72
94
  const output = fileStream
73
95
  ? createTeeWriter({
74
96
  fileStream,
@@ -79,19 +101,19 @@ export async function runSuperviseCommand(args) {
79
101
 
80
102
  const { query } = await import("@anthropic-ai/claude-agent-sdk");
81
103
  const supervisor = createSupervisor({
82
- supervisorCwd,
83
- agentCwd,
104
+ supervisorCwd: opts.supervisorCwd,
105
+ agentCwd: opts.agentCwd,
84
106
  query,
85
107
  output,
86
- model,
87
- maxTurns,
88
- allowedTools,
89
- supervisorAllowedTools,
90
- supervisorProfile,
91
- agentProfile,
108
+ model: opts.model,
109
+ maxTurns: opts.maxTurns,
110
+ allowedTools: opts.allowedTools,
111
+ supervisorAllowedTools: opts.supervisorAllowedTools,
112
+ supervisorProfile: opts.supervisorProfile,
113
+ agentProfile: opts.agentProfile,
92
114
  });
93
115
 
94
- const result = await supervisor.run(taskContent);
116
+ const result = await supervisor.run(opts.taskContent);
95
117
 
96
118
  if (fileStream) {
97
119
  await new Promise((r) => output.end(r));