@forwardimpact/libeval 0.1.23 → 0.1.25

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/bin/fit-eval.js CHANGED
@@ -17,65 +17,77 @@ const { version: VERSION } = JSON.parse(
17
17
  const definition = {
18
18
  name: "fit-eval",
19
19
  version: VERSION,
20
- description: "Process Claude Code stream-json output",
20
+ description:
21
+ "Run agents and capture NDJSON traces — for agent evaluations or multi-agent collaboration",
21
22
  commands: [
22
- {
23
- name: "output",
24
- args: "",
25
- description: "Process trace and output formatted result",
26
- },
27
- {
28
- name: "tee",
29
- args: "[output.ndjson]",
30
- description: "Stream text to stdout, optionally save raw NDJSON",
31
- },
32
23
  {
33
24
  name: "run",
34
25
  args: "",
35
- description: "Run a single agent via the Claude Agent SDK",
26
+ description: "Run a single agent autonomously on a defined task",
36
27
  options: {
37
- "task-file": { type: "string", description: "Path to task file" },
38
- "task-text": { type: "string", description: "Inline task text" },
28
+ "task-file": {
29
+ type: "string",
30
+ description: "Path to a markdown task file",
31
+ },
32
+ "task-text": {
33
+ type: "string",
34
+ description: "Inline task text (alternative to --task-file)",
35
+ },
39
36
  "task-amend": {
40
37
  type: "string",
41
- description: "Additional text appended to task",
38
+ description: "Additional text appended to the task",
42
39
  },
43
40
  model: { type: "string", description: "Claude model (default: opus)" },
44
41
  "max-turns": {
45
42
  type: "string",
46
- description: "Max agentic turns (default: 50)",
43
+ description: "Max agentic turns (default: 50, 0 = unlimited)",
44
+ },
45
+ output: {
46
+ type: "string",
47
+ description: "Write the NDJSON trace to a file",
48
+ },
49
+ cwd: { type: "string", description: "Working directory for the agent" },
50
+ "agent-profile": {
51
+ type: "string",
52
+ description: "Agent profile name to load",
47
53
  },
48
- output: { type: "string", description: "Write NDJSON trace to file" },
49
- cwd: { type: "string", description: "Working directory" },
50
- "agent-profile": { type: "string", description: "Agent profile name" },
51
54
  "allowed-tools": {
52
55
  type: "string",
53
- description: "Comma-separated tool list",
56
+ description: "Comma-separated tool allowlist",
54
57
  },
55
58
  },
56
59
  },
57
60
  {
58
61
  name: "supervise",
59
62
  args: "",
60
- description: "Run a supervised agent-supervisor relay loop",
63
+ description:
64
+ "Run a supervisor–agent relay — typical shape for agent-as-judge evaluations",
61
65
  options: {
62
- "task-file": { type: "string", description: "Path to task file" },
63
- "task-text": { type: "string", description: "Inline task text" },
66
+ "task-file": {
67
+ type: "string",
68
+ description: "Path to a markdown task file",
69
+ },
70
+ "task-text": {
71
+ type: "string",
72
+ description: "Inline task text (alternative to --task-file)",
73
+ },
64
74
  "task-amend": {
65
75
  type: "string",
66
- description: "Additional text appended to task",
76
+ description: "Additional text appended to the task",
67
77
  },
68
78
  model: { type: "string", description: "Claude model (default: opus)" },
69
79
  "max-turns": {
70
80
  type: "string",
71
- description: "Max agentic turns (default: 50)",
81
+ description: "Max agentic turns (default: 20, 0 = unlimited)",
82
+ },
83
+ output: {
84
+ type: "string",
85
+ description: "Write the NDJSON trace to a file",
72
86
  },
73
- output: { type: "string", description: "Write NDJSON trace to file" },
74
- cwd: { type: "string", description: "Working directory" },
75
87
  "agent-profile": { type: "string", description: "Agent profile name" },
76
88
  "allowed-tools": {
77
89
  type: "string",
78
- description: "Comma-separated tool list",
90
+ description: "Agent tool allowlist",
79
91
  },
80
92
  "supervisor-cwd": {
81
93
  type: "string",
@@ -84,31 +96,41 @@ const definition = {
84
96
  "agent-cwd": { type: "string", description: "Agent working directory" },
85
97
  "supervisor-profile": {
86
98
  type: "string",
87
- description: "Supervisor profile name",
99
+ description: "Supervisor (judge) profile name",
88
100
  },
89
101
  "supervisor-allowed-tools": {
90
102
  type: "string",
91
- description: "Supervisor tool list",
103
+ description: "Supervisor tool allowlist",
92
104
  },
93
105
  },
94
106
  },
95
107
  {
96
108
  name: "facilitate",
97
109
  args: "",
98
- description: "Run a facilitated multi-agent session",
110
+ description:
111
+ "Run a facilitator with N participants — typical shape for multi-agent collaboration",
99
112
  options: {
100
- "task-file": { type: "string", description: "Path to task file" },
101
- "task-text": { type: "string", description: "Inline task text" },
113
+ "task-file": {
114
+ type: "string",
115
+ description: "Path to a markdown task file",
116
+ },
117
+ "task-text": {
118
+ type: "string",
119
+ description: "Inline task text (alternative to --task-file)",
120
+ },
102
121
  "task-amend": {
103
122
  type: "string",
104
- description: "Additional text appended to task",
123
+ description: "Additional text appended to the task",
105
124
  },
106
125
  model: { type: "string", description: "Claude model (default: opus)" },
107
126
  "max-turns": {
108
127
  type: "string",
109
- description: "Max facilitator LLM turns (default: 20)",
128
+ description: "Max agentic turns (default: 20, 0 = unlimited)",
129
+ },
130
+ output: {
131
+ type: "string",
132
+ description: "Write the NDJSON trace to a file",
110
133
  },
111
- output: { type: "string", description: "Write NDJSON trace to file" },
112
134
  "facilitator-cwd": {
113
135
  type: "string",
114
136
  description: "Facilitator working directory",
@@ -119,14 +141,27 @@ const definition = {
119
141
  },
120
142
  "agent-profiles": {
121
143
  type: "string",
122
- description: "Comma-separated agent profile names",
144
+ description:
145
+ "Comma-separated list of participant profile names (required)",
123
146
  },
124
147
  "agent-cwd": {
125
148
  type: "string",
126
- description: "Agent working directory (default: .)",
149
+ description: "Working directory shared by participants (default: .)",
127
150
  },
128
151
  },
129
152
  },
153
+ {
154
+ name: "output",
155
+ args: "",
156
+ description:
157
+ "Read NDJSON from stdin and emit a structured or readable form",
158
+ },
159
+ {
160
+ name: "tee",
161
+ args: "[output.ndjson]",
162
+ description:
163
+ "Stream readable text to stdout while saving raw NDJSON to a file",
164
+ },
130
165
  ],
131
166
  globalOptions: {
132
167
  format: { type: "string", description: "Output format (json|text)" },
@@ -135,10 +170,36 @@ const definition = {
135
170
  json: { type: "boolean", description: "Output help as JSON" },
136
171
  },
137
172
  examples: [
173
+ "fit-eval run --task-file=task.md --output=trace.ndjson",
174
+ "fit-eval supervise --task-file=task.md --supervisor-profile=judge --agent-profile=coder --output=trace.ndjson",
175
+ 'fit-eval facilitate --task-file=task.md --facilitator-profile=lead --agent-profiles="security-engineer,technical-writer" --output=trace.ndjson',
138
176
  "fit-eval output --format=text < trace.ndjson",
139
- "fit-eval run --task-file=task.md --model=opus",
140
- "fit-eval supervise --task-file=task.md --supervisor-cwd=.",
141
- 'fit-eval facilitate --task-file=task.md --agent-profiles "security-engineer,technical-writer"',
177
+ ],
178
+ documentation: [
179
+ {
180
+ title: "Agent Evaluations",
181
+ url: "https://www.forwardimpact.team/docs/guides/agent-evaluations/index.md",
182
+ description:
183
+ "Author a judge profile, run an eval locally, wire it into CI, and inspect the resulting trace.",
184
+ },
185
+ {
186
+ title: "Agent Collaboration",
187
+ url: "https://www.forwardimpact.team/docs/guides/agent-collaboration/index.md",
188
+ description:
189
+ "Author a facilitator and participant profiles, run a multi-agent session, and read the message flow.",
190
+ },
191
+ {
192
+ title: "Trace Analysis",
193
+ url: "https://www.forwardimpact.team/docs/guides/trace-analysis/index.md",
194
+ description:
195
+ "Read the NDJSON traces produced by `fit-eval` with `fit-trace` — grounded-theory method and worked examples.",
196
+ },
197
+ {
198
+ title: "Agent Teams",
199
+ url: "https://www.forwardimpact.team/docs/guides/agent-teams/index.md",
200
+ description:
201
+ "How to author the agent, supervisor, and facilitator profiles consumed by --agent-profile, --supervisor-profile, --facilitator-profile, and --agent-profiles.",
202
+ },
142
203
  ],
143
204
  };
144
205
 
package/bin/fit-trace.js CHANGED
@@ -23,6 +23,7 @@ import {
23
23
  runInitCommand,
24
24
  runTurnCommand,
25
25
  runFilterCommand,
26
+ runSplitCommand,
26
27
  } from "../src/commands/trace.js";
27
28
 
28
29
  const { version: VERSION } = JSON.parse(
@@ -32,12 +33,14 @@ const { version: VERSION } = JSON.parse(
32
33
  const definition = {
33
34
  name: "fit-trace",
34
35
  version: VERSION,
35
- description: "Download, query, and search agent execution traces",
36
+ description:
37
+ "Download, query, and analyze agent execution traces — read NDJSON output from fit-eval as qualitative research",
36
38
  commands: [
37
39
  {
38
40
  name: "runs",
39
41
  args: "[pattern]",
40
- description: "List recent workflow runs (default pattern: agent)",
42
+ description:
43
+ "List recent GitHub Actions workflow runs (default pattern: agent)",
41
44
  options: {
42
45
  lookback: {
43
46
  type: "string",
@@ -155,7 +158,7 @@ const definition = {
155
158
  {
156
159
  name: "filter",
157
160
  args: "<file>",
158
- description: "Filter turns by structural properties",
161
+ description: "Filter turns by role, tool, or error status",
159
162
  options: {
160
163
  role: {
161
164
  type: "string",
@@ -167,8 +170,23 @@ const definition = {
167
170
  },
168
171
  error: {
169
172
  type: "boolean",
170
- description:
171
- "Error tool_result turns only (flag-only; for non-errors use the API)",
173
+ description: "Error tool_result turns only",
174
+ },
175
+ },
176
+ },
177
+ {
178
+ name: "split",
179
+ args: "<file>",
180
+ description:
181
+ "Split a combined trace into per-source files (one per agent or supervisor)",
182
+ options: {
183
+ mode: {
184
+ type: "string",
185
+ description: "Execution mode: run (no-op), supervise, or facilitate",
186
+ },
187
+ "output-dir": {
188
+ type: "string",
189
+ description: "Output directory (default: same as input)",
172
190
  },
173
191
  },
174
192
  },
@@ -185,16 +203,34 @@ const definition = {
185
203
  examples: [
186
204
  "fit-trace runs --lookback 7d",
187
205
  "fit-trace download 24497273755",
206
+ "fit-trace split structured.json --mode=facilitate",
188
207
  "fit-trace overview structured.json",
189
208
  "fit-trace timeline structured.json",
209
+ "fit-trace stats structured.json",
210
+ "fit-trace tool structured.json Conclude",
190
211
  "fit-trace search structured.json 'error|fail' --context 1",
191
- "fit-trace tool structured.json Bash",
192
- "fit-trace batch structured.json 0 20",
193
- "fit-trace init structured.json",
212
+ "fit-trace filter structured.json --tool Bash --error",
194
213
  "fit-trace turn structured.json 3",
195
- "fit-trace filter structured.json --role system",
196
- "fit-trace filter structured.json --tool Bash --role assistant",
197
- "fit-trace search structured.json 'error' --full",
214
+ ],
215
+ documentation: [
216
+ {
217
+ title: "Trace Analysis",
218
+ url: "https://www.forwardimpact.team/docs/guides/trace-analysis/index.md",
219
+ description:
220
+ "The full method walkthrough with worked examples (an eval that failed, a multi-agent session that stalled).",
221
+ },
222
+ {
223
+ title: "Agent Evaluations",
224
+ url: "https://www.forwardimpact.team/docs/guides/agent-evaluations/index.md",
225
+ description:
226
+ "How `fit-eval supervise` produces the traces this skill analyzes.",
227
+ },
228
+ {
229
+ title: "Agent Collaboration",
230
+ url: "https://www.forwardimpact.team/docs/guides/agent-collaboration/index.md",
231
+ description:
232
+ "How `fit-eval facilitate` produces multi-agent traces; `split` is the bridge into per-source files.",
233
+ },
198
234
  ],
199
235
  };
200
236
 
@@ -219,6 +255,7 @@ const COMMANDS = {
219
255
  init: runInitCommand,
220
256
  turn: runTurnCommand,
221
257
  filter: runFilterCommand,
258
+ split: runSplitCommand,
222
259
  };
223
260
 
224
261
  async function main() {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@forwardimpact/libeval",
3
- "version": "0.1.23",
3
+ "version": "0.1.25",
4
4
  "description": "Process Claude Code stream-json output into structured traces",
5
5
  "license": "Apache-2.0",
6
6
  "author": "D. Olsson <hi@senzilla.io>",
@@ -32,7 +32,7 @@
32
32
  "@forwardimpact/libcli": "^0.1.0",
33
33
  "@forwardimpact/libconfig": "^0.1.0",
34
34
  "@forwardimpact/libtelemetry": "^0.1.22",
35
- "zod": "^4.3.6"
35
+ "zod": "^4.4.1"
36
36
  },
37
37
  "devDependencies": {
38
38
  "@forwardimpact/libharness": "^0.1.14"
@@ -18,7 +18,7 @@ function applyDefaults(deps) {
18
18
  cwd: deps.cwd,
19
19
  query: deps.query,
20
20
  output: deps.output,
21
- model: deps.model ?? "opus",
21
+ model: deps.model ?? "claude-opus-4-7[1m]",
22
22
  maxTurns: deps.maxTurns ?? 50,
23
23
  allowedTools: deps.allowedTools ?? DEFAULT_ALLOWED_TOOLS,
24
24
  onLine: deps.onLine ?? null,
@@ -44,7 +44,7 @@ function parseFacilitateOptions(values) {
44
44
  taskAmend,
45
45
  agentConfigs,
46
46
  facilitatorCwd: resolve(values["facilitator-cwd"] ?? "."),
47
- model: values.model ?? "opus",
47
+ model: values.model ?? "claude-opus-4-7[1m]",
48
48
  maxTurns: maxTurnsRaw === "0" ? 0 : parseInt(maxTurnsRaw, 10),
49
49
  outputPath: values.output,
50
50
  facilitatorProfile: values["facilitator-profile"] ?? undefined,
@@ -27,7 +27,7 @@ function parseRunOptions(values) {
27
27
  taskContent,
28
28
  taskAmend,
29
29
  cwd: resolve(values.cwd ?? "."),
30
- model: values.model ?? "opus",
30
+ model: values.model ?? "claude-opus-4-7[1m]",
31
31
  maxTurns: maxTurnsRaw === "0" ? 0 : parseInt(maxTurnsRaw, 10),
32
32
  outputPath: values.output,
33
33
  agentProfile: values["agent-profile"] ?? undefined,
@@ -29,7 +29,7 @@ function parseSuperviseOptions(values) {
29
29
  agentCwd: resolve(
30
30
  values["agent-cwd"] ?? mkdtempSync(join(tmpdir(), "fit-eval-agent-")),
31
31
  ),
32
- model: values.model ?? "opus",
32
+ model: values.model ?? "claude-opus-4-7[1m]",
33
33
  maxTurns: (() => {
34
34
  const raw = values["max-turns"] ?? "20";
35
35
  return raw === "0" ? 0 : parseInt(raw, 10);
@@ -1,5 +1,5 @@
1
- import { readFileSync, writeFileSync } from "node:fs";
2
- import { join } from "node:path";
1
+ import { readFileSync, writeFileSync, mkdirSync } from "node:fs";
2
+ import { join, dirname } from "node:path";
3
3
  import { createTraceCollector } from "@forwardimpact/libeval";
4
4
  import { createTraceQuery } from "../trace-query.js";
5
5
  import { createTraceGitHub } from "../trace-github.js";
@@ -150,6 +150,113 @@ export async function runFilterCommand(values, args) {
150
150
  writeJSON(loadTrace(args[0]).filter(opts), values);
151
151
  }
152
152
 
153
+ // --- Split command ---
154
+
155
+ /** Valid agent source name pattern: lowercase letter, then lowercase alphanumeric or hyphen */
156
+ const VALID_SOURCE_NAME = /^[a-z][a-z0-9-]*$/;
157
+
/**
 * Split a combined NDJSON trace into per-source files.
 *
 * All arguments are validated before the filesystem is touched, so a
 * mistyped mode fails loudly instead of creating the output directory,
 * reading the trace, and then writing nothing.
 *
 * @param {object} values - Parsed option values; reads `mode` and the
 *   optional `output-dir`
 * @param {string[]} args - [file]
 * @returns {Promise<void>}
 * @throws {Error} When the input file or `--mode` is missing, or when the
 *   mode is not one of `run`, `supervise`, or `facilitate`
 */
export async function runSplitCommand(values, args) {
  const file = args[0];
  if (!file) throw new Error("split: missing input file");

  const mode = values.mode;
  if (!mode) throw new Error("split: --mode is required");

  // Reject unknown modes up front: previously they fell through every
  // branch below and the command exited successfully without output.
  if (mode !== "run" && mode !== "supervise" && mode !== "facilitate") {
    throw new Error(
      `split: unknown mode "${mode}" (expected run, supervise, or facilitate)`,
    );
  }

  if (mode === "run") {
    process.stdout.write(
      "run mode: trace is already in final form, no split needed\n",
    );
    return;
  }

  // An empty --output-dir falls back to the input file's directory.
  const outputDir = values["output-dir"] || dirname(file);
  mkdirSync(outputDir, { recursive: true });

  const buckets = parseBuckets(readFileSync(file, "utf8"));

  if (mode === "supervise") {
    writeBucket(buckets, "agent", outputDir);
    writeBucket(buckets, "supervisor", outputDir);
  } else {
    // Only "facilitate" can reach this branch after the validation above.
    splitFacilitated(buckets, outputDir);
  }
}
189
+
/**
 * Parse NDJSON content into per-source buckets of unwrapped event lines.
 *
 * Skips empty lines, malformed JSON, non-envelope lines (including JSON
 * values that are not objects, such as `null` or a bare number), and
 * orchestrator events. Each kept event is re-serialised with
 * `JSON.stringify`, so buckets hold one compact JSON string per event.
 *
 * @param {string} content - Raw NDJSON file content
 * @returns {Map<string, string[]>} source name -> array of unwrapped JSON lines
 */
function parseBuckets(content) {
  const buckets = new Map();

  for (const raw of content.split("\n")) {
    const trimmed = raw.trim();
    if (!trimmed) continue;

    let envelope;
    try {
      envelope = JSON.parse(trimmed);
    } catch {
      // Malformed lines are skipped by design rather than aborting the split.
      continue;
    }

    // `JSON.parse("null")` yields null; reading `.event` off it would throw,
    // so treat every non-object value as a non-envelope line.
    if (envelope === null || typeof envelope !== "object") continue;
    if (!envelope.event || typeof envelope.source !== "string") continue;
    if (envelope.source === "orchestrator") continue;

    if (!buckets.has(envelope.source)) {
      buckets.set(envelope.source, []);
    }
    buckets.get(envelope.source).push(JSON.stringify(envelope.event));
  }

  return buckets;
}
221
+
/**
 * Emit the facilitated-mode split: one file for the facilitator, one file per
 * participant source, and a combined file holding every participant's lines.
 *
 * Sources other than "facilitator" are treated as participants only when
 * their name matches VALID_SOURCE_NAME; any other source is ignored.
 *
 * NOTE(review): a participant literally named "agent" would use the same
 * trace-agent.ndjson path as the combined file — confirm whether that name
 * can occur.
 *
 * @param {Map<string, string[]>} buckets
 * @param {string} outputDir
 */
function splitFacilitated(buckets, outputDir) {
  writeBucket(buckets, "facilitator", outputDir);

  const participants = [];
  for (const source of buckets.keys()) {
    if (source === "facilitator") continue;
    if (!VALID_SOURCE_NAME.test(source)) continue;
    participants.push(source);
  }

  const merged = [];
  for (const participant of participants) {
    writeBucket(buckets, participant, outputDir);
    for (const line of buckets.get(participant) ?? []) {
      merged.push(line);
    }
  }

  if (merged.length === 0) return;

  const combinedPath = join(outputDir, "trace-agent.ndjson");
  writeFileSync(combinedPath, merged.join("\n") + "\n");
}
246
+
/**
 * Persist one source's lines as trace-{name}.ndjson inside outputDir.
 * Nothing is written when the source has no bucket or its bucket is empty.
 *
 * @param {Map<string, string[]>} buckets
 * @param {string} name
 * @param {string} outputDir
 */
function writeBucket(buckets, name, outputDir) {
  const entries = buckets.get(name);
  const hasEntries = Boolean(entries) && entries.length !== 0;

  if (hasEntries) {
    // Lines are newline-joined and the file ends with a trailing newline.
    writeFileSync(
      join(outputDir, `trace-${name}.ndjson`),
      entries.join("\n") + "\n",
    );
  }
}
259
+
153
260
  // --- Shared helpers ---
154
261
 
155
262
  /**