@forwardimpact/libeval 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/bin/fit-eval.js CHANGED
@@ -1,11 +1,15 @@
1
- #!/usr/bin/env node
1
+ #!/usr/bin/env bun
2
2
 
3
3
  import { runOutputCommand } from "../src/commands/output.js";
4
4
  import { runTeeCommand } from "../src/commands/tee.js";
5
+ import { runRunCommand } from "../src/commands/run.js";
6
+ import { runSuperviseCommand } from "../src/commands/supervise.js";
5
7
 
6
8
  // Subcommand name → handler function; every handler is imported above.
  const COMMANDS = {
7
9
  output: runOutputCommand,
8
10
  tee: runTeeCommand,
11
+ run: runRunCommand,
12
+ supervise: runSuperviseCommand,
9
13
  };
10
14
 
11
15
  const HELP_TEXT = `
@@ -17,6 +21,25 @@ Usage:
17
21
  Commands:
18
22
  output [--format=json|text] Process trace and output formatted result
19
23
  tee [output.ndjson] Stream text to stdout, optionally save raw NDJSON
24
+ run [options] Run a single agent via the Claude Agent SDK
25
+ supervise [options] Run a supervised agent ↔ supervisor relay loop
26
+
27
+ Run options:
28
+ --task=PATH Path to task file (required)
29
+ --cwd=DIR Agent working directory (default: .)
30
+ --model=MODEL Claude model to use (default: opus)
31
+ --max-turns=N Maximum agentic turns (default: 50)
32
+ --output=PATH Write NDJSON trace to file (default: stdout)
33
+ --allowed-tools=LIST Comma-separated tools (default: Bash,Read,Glob,Grep,Write,Edit)
34
+
35
+ Supervise options:
36
+ --task=PATH Path to task file (required)
37
+ --supervisor-cwd=DIR Supervisor working directory (default: .)
38
+ --agent-cwd=DIR Agent working directory (default: temp directory)
39
+ --model=MODEL Claude model to use (default: opus)
40
+ --max-turns=N Maximum supervisor ↔ agent exchanges (default: 20)
41
+ --output=PATH Write NDJSON trace to file (default: stdout)
42
+ --allowed-tools=LIST Comma-separated tools for agent (default: Bash,Read,Glob,Grep,Write,Edit)
20
43
 
21
44
  Options:
22
45
  --help Show this help message
@@ -27,6 +50,8 @@ Examples:
27
50
  fit-eval output --format=json < trace.ndjson
28
51
  fit-eval tee < trace.ndjson
29
52
  fit-eval tee output.ndjson < trace.ndjson
53
+ fit-eval run --task=.github/tasks/security-audit.md --model=opus
54
+ fit-eval supervise --task=scenarios/guide-setup/task.md --supervisor-cwd=.
30
55
  `.trim();
31
56
 
32
57
  async function main() {
package/index.js CHANGED
@@ -1 +1,4 @@
1
1
  export { TraceCollector, createTraceCollector } from "./src/trace-collector.js";
2
+ export { AgentRunner, createAgentRunner } from "./src/agent-runner.js";
3
+ export { Supervisor, createSupervisor } from "./src/supervisor.js";
4
+ export { TeeWriter, createTeeWriter } from "./src/tee-writer.js";
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@forwardimpact/libeval",
3
- "version": "0.1.0",
3
+ "version": "0.1.1",
4
4
  "description": "Process Claude Code stream-json output into structured traces",
5
5
  "license": "Apache-2.0",
6
6
  "author": "D. Olsson <hi@senzilla.io>",
@@ -10,10 +10,13 @@
10
10
  "fit-eval": "./bin/fit-eval.js"
11
11
  },
12
12
  "engines": {
13
- "node": ">=22.0.0"
13
+ "bun": ">=1.2.0"
14
14
  },
15
15
  "scripts": {
16
- "test": "node --test test/*.test.js"
16
+ "test": "bun run node --test test/*.test.js"
17
+ },
18
+ "dependencies": {
19
+ "@anthropic-ai/claude-agent-sdk": "^0.1.0"
17
20
  },
18
21
  "publishConfig": {
19
22
  "access": "public"
@@ -0,0 +1,142 @@
1
/**
 * AgentRunner — runs a single Claude Agent SDK session and emits raw NDJSON
 * events to an output stream. Building block for both `fit-eval run` and
 * `fit-eval supervise`.
 *
 * Follows OO+DI: constructor injection, factory function, tests bypass factory.
 */

export class AgentRunner {
  /**
   * @param {object} deps
   * @param {string} deps.cwd - Agent working directory
   * @param {function} deps.query - SDK query function (injected for testing)
   * @param {import("stream").Writable} deps.output - Stream to emit NDJSON to
   * @param {string} [deps.model] - Claude model identifier (default: "opus")
   * @param {number} [deps.maxTurns] - Maximum agentic turns (default: 50)
   * @param {string[]} [deps.allowedTools] - Tools the agent may use
   *   (default: Bash, Read, Glob, Grep, Write, Edit)
   * @param {string} [deps.permissionMode] - SDK permission mode
   *   (default: "bypassPermissions")
   * @throws {Error} When `cwd`, `query` or `output` is missing
   */
  constructor({
    cwd,
    query,
    output,
    model,
    maxTurns,
    allowedTools,
    permissionMode,
  }) {
    if (!cwd) throw new Error("cwd is required");
    if (!query) throw new Error("query is required");
    if (!output) throw new Error("output is required");
    this.cwd = cwd;
    this.query = query;
    this.output = output;
    this.model = model ?? "opus";
    this.maxTurns = maxTurns ?? 50;
    this.allowedTools = allowedTools ?? [
      "Bash",
      "Read",
      "Glob",
      "Grep",
      "Write",
      "Edit",
    ];
    this.permissionMode = permissionMode ?? "bypassPermissions";
    // Session id taken from the most recent `system`/`init` message.
    this.sessionId = null;
    // Every emitted NDJSON line is kept here until drainOutput() is called.
    this.buffer = [];
  }

  /**
   * Run a new agent session with the given task.
   *
   * Exceptions thrown while iterating the query are captured and returned
   * in `error` rather than rethrown.
   *
   * @param {string} task - The task prompt
   * @returns {Promise<{success: boolean, text: string, sessionId: string|null, error: Error|null}>}
   */
  async run(task) {
    const { success, text, error } = await streamSession(this, task, {});
    return { success, text, sessionId: this.sessionId, error };
  }

  /**
   * Resume an existing session with a follow-up prompt.
   *
   * The follow-up is issued with the same cwd, model, turn limit, tool list
   * and permission mode as the initial run; only `resume` is added. Sending
   * `resume` alone would run the follow-up with none of this runner's
   * configuration.
   *
   * @param {string} prompt - The follow-up prompt
   * @returns {Promise<{success: boolean, text: string, error: Error|null}>}
   */
  async resume(prompt) {
    return streamSession(this, prompt, { resume: this.sessionId });
  }

  /**
   * Drain buffered output lines. Used by Supervisor to tag and re-emit lines.
   * @returns {string[]} The lines buffered since the previous drain
   */
  drainOutput() {
    const lines = [...this.buffer];
    this.buffer = [];
    return lines;
  }
}

/**
 * Issue one query on behalf of `runner`, forwarding every message to the
 * runner's output stream and buffer, and collect the final result.
 *
 * @param {AgentRunner} runner - Runner whose configuration and sinks are used
 * @param {string} prompt - Prompt passed to the query function
 * @param {object} extraOptions - Options merged over the runner's base options
 * @returns {Promise<{success: boolean, text: string, error: Error|null}>}
 */
async function streamSession(runner, prompt, extraOptions) {
  let text = "";
  let stopReason = null;
  let error = null;

  try {
    for await (const message of runner.query({
      prompt,
      options: {
        cwd: runner.cwd,
        allowedTools: runner.allowedTools,
        maxTurns: runner.maxTurns,
        model: runner.model,
        permissionMode: runner.permissionMode,
        allowDangerouslySkipPermissions: true,
        ...extraOptions,
      },
    })) {
      const line = JSON.stringify(message);
      runner.output.write(line + "\n");
      runner.buffer.push(line);

      // Track the latest announced session id so that a later resume()
      // targets the most recent session.
      if (message.type === "system" && message.subtype === "init") {
        runner.sessionId = message.session_id;
      }
      if (message.type === "result") {
        text = message.result ?? "";
        stopReason = message.subtype;
      }
    }
  } catch (err) {
    // Reported to the caller through the result object instead of thrown.
    error = err;
  }

  return { success: !error && stopReason === "success", text, error };
}
134
+
135
/**
 * Factory for AgentRunner. Forwards the dependency bag to the constructor
 * unchanged.
 * @param {object} deps - Same shape as the AgentRunner constructor argument
 * @returns {AgentRunner}
 */
export function createAgentRunner(deps) {
  const runner = new AgentRunner(deps);
  return runner;
}
@@ -0,0 +1,75 @@
1
+ import { readFileSync, createWriteStream } from "node:fs";
2
+ import { resolve } from "node:path";
3
+ import { createAgentRunner } from "../agent-runner.js";
4
+ import { createTeeWriter } from "../tee-writer.js";
5
+
6
/**
 * Look up a command-line flag given as `--name=value` or `--name value`.
 * The first matching argument wins; a trailing bare `--name` with nothing
 * after it yields no value.
 * @param {string[]} args - Raw argument list
 * @param {string} name - Flag name without the leading `--`
 * @returns {string|undefined} The flag value, or undefined when absent
 */
function parseFlag(args, name) {
  const bare = `--${name}`;
  const withEquals = `${bare}=`;
  for (const [index, arg] of args.entries()) {
    if (arg.startsWith(withEquals)) {
      return arg.slice(withEquals.length);
    }
    const hasNext = index + 1 < args.length;
    if (arg === bare && hasNext) {
      return args[index + 1];
    }
  }
  return undefined;
}
20
+
21
/**
 * Run command — execute a single agent via the Claude Agent SDK.
 *
 * Usage: fit-eval run [options]
 *
 * Options:
 *   --task=PATH            Path to task file (required)
 *   --cwd=DIR              Agent working directory (default: .)
 *   --model=MODEL          Claude model to use (default: opus)
 *   --max-turns=N          Maximum agentic turns (default: 50)
 *   --output=PATH          Write NDJSON trace to file (default: stdout)
 *   --allowed-tools=LIST   Comma-separated tools (default: Bash,Read,Glob,Grep,Write,Edit)
 *
 * Terminates the process with exit code 0 when the run succeeded and 1
 * otherwise.
 *
 * @param {string[]} args - Command arguments
 * @returns {Promise<void>}
 * @throws {Error} When --task is missing or --max-turns is not a positive integer
 */
export async function runRunCommand(args) {
  const task = parseFlag(args, "task");
  if (!task) throw new Error("--task is required");

  // Reject unusable turn limits up front instead of passing NaN to the runner.
  const maxTurns = Number.parseInt(parseFlag(args, "max-turns") ?? "50", 10);
  if (!Number.isInteger(maxTurns) || maxTurns < 1) {
    throw new Error("--max-turns must be a positive integer");
  }

  const cwd = resolve(parseFlag(args, "cwd") ?? ".");
  const model = parseFlag(args, "model") ?? "opus";
  const outputPath = parseFlag(args, "output");
  // Tolerate "Bash, Read" and stray commas: trim entries and drop empty ones.
  const allowedTools = (
    parseFlag(args, "allowed-tools") ?? "Bash,Read,Glob,Grep,Write,Edit"
  )
    .split(",")
    .map((tool) => tool.trim())
    .filter((tool) => tool !== "");

  const taskContent = readFileSync(task, "utf8");

  // When --output is specified, stream text to stdout while writing NDJSON to file.
  // Otherwise, write NDJSON directly to stdout (backwards-compatible).
  const fileStream = outputPath ? createWriteStream(outputPath) : null;
  const output = fileStream
    ? createTeeWriter({ fileStream, textStream: process.stdout, mode: "raw" })
    : process.stdout;

  // Loaded lazily so the SDK is only required once arguments are valid.
  const { query } = await import("@anthropic-ai/claude-agent-sdk");
  const runner = createAgentRunner({
    cwd,
    query,
    output,
    model,
    maxTurns,
    allowedTools,
  });

  const result = await runner.run(taskContent);

  // The runner captures exceptions instead of throwing; report them on
  // stderr so a failing run is diagnosable without polluting the trace.
  if (result.error) {
    console.error(
      `fit-eval run failed: ${result.error?.message ?? result.error}`,
    );
  }

  if (fileStream) {
    await new Promise((r) => output.end(r));
    await new Promise((r) => fileStream.end(r));
  }

  // process.exit() does not wait for pending stdout writes; let them flush
  // first so a piped trace is not truncated.
  if (!process.stdout.writableEnded) {
    await new Promise((done) => process.stdout.write("", done));
  }

  process.exit(result.success ? 0 : 1);
}
@@ -0,0 +1,86 @@
1
+ import { readFileSync, createWriteStream, mkdtempSync } from "node:fs";
2
+ import { resolve, join } from "node:path";
3
+ import { tmpdir } from "node:os";
4
+ import { createSupervisor } from "../supervisor.js";
5
+ import { createTeeWriter } from "../tee-writer.js";
6
+
7
/**
 * Read one option from an argument list. Both spellings are accepted:
 * `--name=value` in a single argument, or `--name` followed by the value.
 * Scanning stops at the first match.
 * @param {string[]} args - Raw argument list
 * @param {string} name - Option name without the leading `--`
 * @returns {string|undefined} The option value, or undefined when not given
 */
function parseFlag(args, name) {
  const flag = `--${name}`;
  const inlinePrefix = `${flag}=`;
  let position = 0;
  while (position < args.length) {
    const current = args[position];
    if (current.startsWith(inlinePrefix)) {
      return current.slice(inlinePrefix.length);
    }
    if (current === flag && position + 1 < args.length) {
      return args[position + 1];
    }
    position += 1;
  }
  return undefined;
}
21
+
22
/**
 * Supervise command — run two agents in a relay loop via the Claude Agent SDK.
 *
 * Usage: fit-eval supervise [options]
 *
 * Options:
 *   --task=PATH              Path to task file (required)
 *   --supervisor-cwd=DIR     Supervisor working directory (default: .)
 *   --agent-cwd=DIR          Agent working directory (default: temp directory)
 *   --model=MODEL            Claude model to use (default: opus)
 *   --max-turns=N            Maximum supervisor ↔ agent exchanges (default: 20)
 *   --output=PATH            Write NDJSON trace to file (default: stdout)
 *   --allowed-tools=LIST     Comma-separated tools for the agent (default: Bash,Read,Glob,Grep,Write,Edit)
 *
 * Terminates the process with exit code 0 when the supervisor reported
 * success and 1 otherwise.
 *
 * @param {string[]} args - Command arguments
 * @returns {Promise<void>}
 * @throws {Error} When --task is missing or --max-turns is not a positive integer
 */
export async function runSuperviseCommand(args) {
  const task = parseFlag(args, "task");
  if (!task) throw new Error("--task is required");

  // Validate before any side effect (temp directory, output file). A NaN
  // limit would make the relay loop run zero times and report failure.
  const maxTurns = Number.parseInt(parseFlag(args, "max-turns") ?? "20", 10);
  if (!Number.isInteger(maxTurns) || maxTurns < 1) {
    throw new Error("--max-turns must be a positive integer");
  }

  const supervisorCwd = resolve(parseFlag(args, "supervisor-cwd") ?? ".");
  // The temp directory is only created when --agent-cwd is not given.
  const agentCwd = resolve(
    parseFlag(args, "agent-cwd") ??
      mkdtempSync(join(tmpdir(), "fit-eval-agent-")),
  );
  const model = parseFlag(args, "model") ?? "opus";
  const outputPath = parseFlag(args, "output");
  // Tolerate "Bash, Read" and stray commas: trim entries and drop empty ones.
  const allowedTools = (
    parseFlag(args, "allowed-tools") ?? "Bash,Read,Glob,Grep,Write,Edit"
  )
    .split(",")
    .map((tool) => tool.trim())
    .filter((tool) => tool !== "");

  const taskContent = readFileSync(task, "utf8");

  // When --output is specified, stream text to stdout while writing NDJSON to file.
  // Otherwise, write NDJSON directly to stdout (backwards-compatible).
  const fileStream = outputPath ? createWriteStream(outputPath) : null;
  const output = fileStream
    ? createTeeWriter({
        fileStream,
        textStream: process.stdout,
        mode: "supervised",
      })
    : process.stdout;

  // Loaded lazily so the SDK is only required once arguments are valid.
  const { query } = await import("@anthropic-ai/claude-agent-sdk");
  const supervisor = createSupervisor({
    supervisorCwd,
    agentCwd,
    query,
    output,
    model,
    maxTurns,
    allowedTools,
  });

  const result = await supervisor.run(taskContent);

  // Surface a failure cause on stderr when the supervisor result carries
  // one; this is a no-op when the result has no `error` field.
  if (result.error) {
    console.error(
      `fit-eval supervise failed: ${result.error?.message ?? result.error}`,
    );
  }

  if (fileStream) {
    await new Promise((r) => output.end(r));
    await new Promise((r) => fileStream.end(r));
  }

  // process.exit() does not wait for pending stdout writes; let them flush
  // first so a piped trace is not truncated.
  if (!process.stdout.writableEnded) {
    await new Promise((done) => process.stdout.write("", done));
  }

  process.exit(result.success ? 0 : 1);
}
@@ -0,0 +1,165 @@
1
+ /**
2
+ * Supervisor — orchestrates a relay loop between an agent and a supervisor,
3
+ * both running as AgentRunner instances. The agent works on a task while the
4
+ * supervisor observes and decides when the evaluation is complete.
5
+ *
6
+ * Follows OO+DI: constructor injection, factory function, tests bypass factory.
7
+ */
8
+
9
+ import { PassThrough } from "node:stream";
10
+ import { createAgentRunner } from "./agent-runner.js";
11
+
12
/**
 * Check if the supervisor's response signals evaluation completion.
 * Uses a structured signal — `EVALUATION_COMPLETE` on its own line —
 * to avoid false positives from natural language.
 *
 * Spaces and tabs around the signal are ignored, so an indented line or one
 * with trailing whitespace still counts; any other text on the same line
 * does not. Non-string input is never a completion signal.
 *
 * @param {string} text - Supervisor response text
 * @returns {boolean} True when some line consists solely of the signal
 */
export function isDone(text) {
  if (typeof text !== "string") return false;
  return /^[ \t]*EVALUATION_COMPLETE[ \t]*$/m.test(text);
}
22
+
23
/**
 * Relay loop between an agent runner and a supervisor runner. Runner output
 * is re-emitted on `output` wrapped with source/turn metadata, followed by
 * one orchestrator summary line.
 */
export class Supervisor {
  /**
   * @param {object} deps
   * @param {import("./agent-runner.js").AgentRunner} deps.agentRunner - Runs the agent sessions
   * @param {import("./agent-runner.js").AgentRunner} deps.supervisorRunner - Runs the supervisor sessions
   * @param {import("stream").Writable} deps.output - Stream to emit tagged NDJSON to
   * @param {number} [deps.maxTurns] - Maximum supervisor ↔ agent exchanges (default: 20)
   * @throws {Error} When `agentRunner`, `supervisorRunner` or `output` is missing
   */
  constructor({ agentRunner, supervisorRunner, output, maxTurns }) {
    if (!agentRunner) throw new Error("agentRunner is required");
    if (!supervisorRunner) throw new Error("supervisorRunner is required");
    if (!output) throw new Error("output is required");
    this.agentRunner = agentRunner;
    this.supervisorRunner = supervisorRunner;
    this.output = output;
    this.maxTurns = maxTurns ?? 20;
  }

  /**
   * Run the supervisor ↔ agent relay loop.
   *
   * The loop ends successfully when the supervisor's text satisfies
   * `isDone`. It ends unsuccessfully when either runner reports an error or
   * when `maxTurns` exchanges pass without a completion signal. When a
   * runner error ended the loop, that error is included in the result and
   * in the summary line so the cause is not lost.
   *
   * @param {string} task - The initial task for the agent
   * @returns {Promise<{success: boolean, turns: number, error?: Error}>}
   */
  async run(task) {
    // Emit the summary line and build the matching return value in one
    // place. `error` is only attached when a runner actually failed.
    const finish = (success, turns, error) => {
      const result = error ? { success, turns, error } : { success, turns };
      this.emitSummary(result);
      return result;
    };

    // Turn 0: Agent receives the task and starts working
    let agentResult = await this.agentRunner.run(task);
    this.emitTagged("agent", 0);
    if (agentResult.error) return finish(false, 0, agentResult.error);

    for (let turn = 1; turn <= this.maxTurns; turn++) {
      // Supervisor observes the agent's output
      const supervisorPrompt =
        `The agent reported:\n\n${agentResult.text}\n\n` +
        `Decide: provide guidance, answer a question, or say EVALUATION_COMPLETE on its own line.`;

      // The first exchange opens the supervisor session; later ones resume it.
      const supervisorResult =
        turn === 1
          ? await this.supervisorRunner.run(supervisorPrompt)
          : await this.supervisorRunner.resume(supervisorPrompt);
      this.emitTagged("supervisor", turn);

      if (supervisorResult.error) {
        return finish(false, turn, supervisorResult.error);
      }
      if (isDone(supervisorResult.text)) return finish(true, turn);

      // Supervisor's response becomes the agent's next input
      agentResult = await this.agentRunner.resume(supervisorResult.text);
      this.emitTagged("agent", turn);
      if (agentResult.error) return finish(false, turn, agentResult.error);
    }

    // Turn budget exhausted without a completion signal.
    return finish(false, this.maxTurns);
  }

  /**
   * Drain a runner's buffered output and re-emit each line tagged with
   * source and turn metadata.
   * @param {"agent"|"supervisor"} source
   * @param {number} turn
   */
  emitTagged(source, turn) {
    const runner =
      source === "agent" ? this.agentRunner : this.supervisorRunner;
    for (const line of runner.drainOutput()) {
      const event = JSON.parse(line);
      const tagged = { source, turn, event };
      this.output.write(JSON.stringify(tagged) + "\n");
    }
  }

  /**
   * Emit a final orchestrator summary line. When `result.error` is set, its
   * message is included as an `error` string field.
   * @param {{success: boolean, turns: number, error?: Error}} result
   */
  emitSummary(result) {
    const summary = {
      source: "orchestrator",
      type: "summary",
      success: result.success,
      turns: result.turns,
    };
    if (result.error) {
      summary.error = result.error?.message ?? String(result.error);
    }
    this.output.write(JSON.stringify(summary) + "\n");
  }
}
124
+
125
/**
 * Factory function — wires both AgentRunners with their respective configs.
 *
 * The agent runner gets the caller's tool list and a fixed limit of 50
 * agentic turns per session; the supervisor runner is restricted to the
 * Read, Glob and Grep tools with a fixed limit of 10. `maxTurns` only bounds
 * the number of supervisor ↔ agent exchanges.
 *
 * @param {object} deps
 * @param {string} deps.supervisorCwd - Supervisor working directory
 * @param {string} deps.agentCwd - Agent working directory
 * @param {function} deps.query - SDK query function
 * @param {import("stream").Writable} deps.output - Final output stream
 * @param {string} [deps.model] - Claude model identifier
 * @param {number} [deps.maxTurns] - Maximum supervisor ↔ agent exchanges
 * @param {string[]} [deps.allowedTools] - Tools the agent may use
 * @returns {Supervisor}
 */
export function createSupervisor({
  supervisorCwd,
  agentCwd,
  query,
  output,
  model,
  maxTurns,
  allowedTools,
}) {
  // Each runner requires an output stream, but the Supervisor re-emits runner
  // lines from drainOutput() instead. resume() puts the throwaway stream in
  // flowing mode so what the runner writes is discarded rather than piling
  // up in a buffer nobody reads.
  const agentRunner = createAgentRunner({
    cwd: agentCwd,
    query,
    output: new PassThrough().resume(),
    model,
    maxTurns: 50,
    allowedTools,
  });

  const supervisorRunner = createAgentRunner({
    cwd: supervisorCwd,
    query,
    output: new PassThrough().resume(),
    model,
    maxTurns: 10,
    allowedTools: ["Read", "Glob", "Grep"],
  });

  return new Supervisor({ agentRunner, supervisorRunner, output, maxTurns });
}