@forwardimpact/libeval 0.1.50 → 0.1.52

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. package/README.md +11 -8
  2. package/bin/fit-benchmark.js +26 -27
  3. package/bin/fit-eval.js +36 -30
  4. package/bin/fit-trace.js +83 -57
  5. package/package.json +1 -1
  6. package/src/agent-runner.js +20 -12
  7. package/src/benchmark/apm-installer.js +48 -44
  8. package/src/benchmark/env-loader.js +35 -23
  9. package/src/benchmark/invariants.js +128 -0
  10. package/src/benchmark/judge.js +18 -19
  11. package/src/benchmark/npm-installer.js +33 -33
  12. package/src/benchmark/report.js +40 -26
  13. package/src/benchmark/result.js +11 -11
  14. package/src/benchmark/runner.js +90 -46
  15. package/src/benchmark/task-family.js +78 -65
  16. package/src/benchmark/workdir.js +100 -93
  17. package/src/commands/assert.js +30 -22
  18. package/src/commands/benchmark-invariants.js +74 -0
  19. package/src/commands/benchmark-report.js +24 -15
  20. package/src/commands/benchmark-run.js +16 -9
  21. package/src/commands/by-discussion.js +33 -23
  22. package/src/commands/callback.js +20 -11
  23. package/src/commands/discuss.js +31 -13
  24. package/src/commands/facilitate.js +21 -14
  25. package/src/commands/output.js +15 -13
  26. package/src/commands/run.js +28 -14
  27. package/src/commands/supervise.js +29 -19
  28. package/src/commands/task-input.js +10 -5
  29. package/src/commands/tee.js +24 -9
  30. package/src/commands/trace.js +181 -99
  31. package/src/discuss-tools.js +48 -2
  32. package/src/discusser.js +53 -2
  33. package/src/events/github.js +27 -5
  34. package/src/facilitator.js +4 -0
  35. package/src/inbox-poller.js +84 -0
  36. package/src/judge.js +4 -1
  37. package/src/message-bus.js +6 -0
  38. package/src/orchestration-loop.js +14 -4
  39. package/src/orchestration-toolkit.js +14 -0
  40. package/src/profile-prompt.js +22 -9
  41. package/src/redaction.js +31 -9
  42. package/src/reply-emitter.js +47 -0
  43. package/src/supervisor.js +4 -0
  44. package/src/tee-writer.js +4 -2
  45. package/src/trace-collector.js +9 -2
  46. package/src/trace-github.js +47 -27
  47. package/src/benchmark/scorer.js +0 -138
  48. package/src/commands/benchmark-score.js +0 -68
package/README.md CHANGED
@@ -71,11 +71,12 @@ while participants work in parallel — nothing blocks the LLM thread.
71
71
 
72
72
  ### Discuss-mode replies
73
73
 
74
- In discussion mode, Answer calls routed to the lead are captured as
75
- thread replies delivered via the bridge callback. The lead delegates work
76
- via Ask; each agent's Answer becomes a separate reply posted to the
77
- discussion thread. No explicit reply tool is needed on the lead surface —
78
- the message bus intercepts answers and appends them to `ctx.replies[]`.
74
+ In discussion mode, Answer calls routed to the lead are streamed to
75
+ the discussion thread as they are produced — each agent's Answer becomes
76
+ a separate reply posted immediately, not batched at session end. The
77
+ lead and agents can also call `Acknowledge` to post brief messages
78
+ directly to the thread (status updates, human follow-up responses).
79
+ The message bus intercepts answers and appends them to `ctx.replies[]`.
79
80
 
80
81
  `RequestForComment` is a separate coordination tool available on agent
81
82
  roles (facilitated agents and discuss agents). It queues an intent to
@@ -104,8 +105,8 @@ only feeds the summary's `success`/`verdict`.
104
105
  | Fac. agent | ✓ | ✓ | ✓ | ✓ | | `RequestForComment` |
105
106
  | Supervisor | ✓ | ✓ | ✓ | ✓ | ✓ | |
106
107
  | Sup. agent | ✓ | ✓ | ✓ | ✓ | | |
107
- | Discuss lead | ✓ | ✓ | ✓ | ✓ | | `Recess`, `Adjourn` |
108
- | Discuss agt | ✓ | ✓ | ✓ | ✓ | | `RequestForComment` |
108
+ | Discuss lead | ✓ | ✓ | ✓ | ✓ | | `Recess`, `Adjourn`, `Acknowledge` |
109
+ | Discuss agt | ✓ | ✓ | ✓ | ✓ | | `RequestForComment`, `Acknowledge` |
109
110
  | Judge | | | | | ✓ | |
110
111
 
111
112
  Ask's `to` accepts a participant name on multi-participant roles
@@ -169,7 +170,9 @@ downloadable through retention.
169
170
  | `orchestration-toolkit.js` | Shared Ask/Answer/Announce/Conclude/RollCall/RequestForComment handlers + builders. |
170
171
  | `orchestration-loop.js` | Unified lead+participant loop; reminder/violation handling. |
171
172
  | `facilitator.js` / `supervisor.js` / `discusser.js` / `judge.js` | Per-mode class + factory + system prompt. |
172
- | `discuss-tools.js` | Discuss-only `Recess`/`Adjourn`. |
173
+ | `discuss-tools.js` | Discuss-only `Recess`/`Adjourn`/`Acknowledge`. |
174
+ | `reply-emitter.js` | Fire-and-forget POST of reply/ack events to the callback URL. |
175
+ | `inbox-poller.js` | Long-poll the bridge inbox for injected human messages. |
173
176
  | `trace-collector.js` / `trace-query.js` / `trace-github.js` | Trace ingestion / querying / GitHub-attachment helpers. |
174
177
  | `redaction.js` | Env-var allowlist + credential-shape pattern redaction. |
175
178
 
package/bin/fit-benchmark.js CHANGED
@@ -4,10 +4,11 @@ import "@forwardimpact/libpreflight/node22";
4
4
 
5
5
  import { readFileSync, realpathSync } from "node:fs";
6
6
  import { createCli } from "@forwardimpact/libcli";
7
+ import { createDefaultRuntime } from "@forwardimpact/libutil/runtime";
7
8
  import { createLogger } from "@forwardimpact/libtelemetry";
8
9
 
9
10
  import { runBenchmarkRunCommand } from "../src/commands/benchmark-run.js";
10
- import { runBenchmarkScoreCommand } from "../src/commands/benchmark-score.js";
11
+ import { runBenchmarkInvariantsCommand } from "../src/commands/benchmark-invariants.js";
11
12
  import { runBenchmarkReportCommand } from "../src/commands/benchmark-report.js";
12
13
 
13
14
  // `bun build --compile` injects FIT_BENCHMARK_VERSION via --define, eliminating
@@ -26,7 +27,8 @@ export const definition = {
26
27
  commands: [
27
28
  {
28
29
  name: "run",
29
- args: "",
30
+ args: [],
31
+ handler: runBenchmarkRunCommand,
30
32
  description:
31
33
  "Run every task in a family for N runs and emit one result record per (task, runIndex).",
32
34
  options: {
@@ -78,10 +80,11 @@ export const definition = {
78
80
  },
79
81
  },
80
82
  {
81
- name: "score",
82
- args: "",
83
+ name: "invariants",
84
+ args: [],
85
+ handler: runBenchmarkInvariantsCommand,
83
86
  description:
84
- "Score a single task against a post-run workdir without invoking an agent.",
87
+ "Check a single task's invariants against a post-run workdir without invoking an agent.",
85
88
  options: {
86
89
  family: {
87
90
  type: "string",
@@ -94,7 +97,7 @@ export const definition = {
94
97
  workdir: {
95
98
  type: "string",
96
99
  description:
97
- "Post-run directory; <workdir>/cwd/ is the agent CWD scoring runs against",
100
+ "Post-run directory; <workdir>/cwd/ is the agent CWD invariants run against",
98
101
  },
99
102
  output: {
100
103
  type: "string",
@@ -104,7 +107,8 @@ export const definition = {
104
107
  },
105
108
  {
106
109
  name: "report",
107
- args: "",
110
+ args: [],
111
+ handler: runBenchmarkReportCommand,
108
112
  description:
109
113
  "Aggregate result records into pass@k via the OpenAI HumanEval estimator.",
110
114
  options: {
@@ -132,7 +136,7 @@ export const definition = {
132
136
  examples: [
133
137
  "fit-benchmark run --family=./families/coding",
134
138
  "fit-benchmark run --family=./families/coding --runs=10 --agent-model=claude-sonnet-4-6",
135
- "fit-benchmark score --family=./families/coding --task=todo-api --workdir=./benchmark-runs/runs/todo-api/0",
139
+ "fit-benchmark invariants --family=./families/coding --task=todo-api --workdir=./benchmark-runs/runs/todo-api/0",
136
140
  "fit-benchmark report --format=text",
137
141
  "fit-benchmark report --input=./runs/today --k=1,3,5 --format=text",
138
142
  ],
@@ -152,35 +156,30 @@ export const definition = {
152
156
  ],
153
157
  };
154
158
 
155
- const cli = createCli(definition);
156
159
  const logger = createLogger("benchmark");
157
160
 
158
- const COMMANDS = {
159
- run: runBenchmarkRunCommand,
160
- score: runBenchmarkScoreCommand,
161
- report: runBenchmarkReportCommand,
162
- };
163
-
164
161
  async function main() {
165
- const parsed = cli.parse(process.argv.slice(2));
166
- if (!parsed) process.exit(0);
167
-
168
- const { values, positionals } = parsed;
162
+ const runtime = createDefaultRuntime();
163
+ const cli = createCli(definition, { runtime });
164
+ const parsed = cli.parse(runtime.proc.argv.slice(2));
165
+ if (!parsed) return runtime.proc.exit(0);
169
166
 
167
+ const { positionals } = parsed;
170
168
  if (positionals.length === 0) {
171
169
  cli.usageError("no command specified");
172
- process.exit(2);
170
+ return runtime.proc.exit(2);
173
171
  }
174
172
 
175
- const [command, ...args] = positionals;
176
- const handler = COMMANDS[command];
177
-
178
- if (!handler) {
173
+ const command = positionals[0];
174
+ if (!definition.commands.some((c) => c.name === command)) {
179
175
  cli.usageError(`unknown command "${command}"`);
180
- process.exit(2);
176
+ return runtime.proc.exit(2);
181
177
  }
182
178
 
183
- await handler(values, args);
179
+ const result = await cli.dispatch(parsed, { deps: { runtime } });
180
+ const envelope = result ?? { ok: true };
181
+ if (!envelope.ok && envelope.error) cli.error(envelope.error);
182
+ runtime.proc.exit(envelope.ok ? 0 : (envelope.code ?? 1));
184
183
  }
185
184
 
186
185
  // Run main only when invoked as a CLI. Importing for tests (e.g. parity)
@@ -188,7 +187,7 @@ async function main() {
188
187
  if (import.meta.url === `file://${realpathSync(process.argv[1])}`) {
189
188
  main().catch((error) => {
190
189
  logger.exception("main", error);
191
- cli.error(error.message);
190
+ createCli(definition).error(error.message);
192
191
  process.exit(1);
193
192
  });
194
193
  }
package/bin/fit-eval.js CHANGED
@@ -4,6 +4,7 @@ import "@forwardimpact/libpreflight/node22";
4
4
 
5
5
  import { readFileSync } from "node:fs";
6
6
  import { createCli } from "@forwardimpact/libcli";
7
+ import { createDefaultRuntime } from "@forwardimpact/libutil/runtime";
7
8
  import { createLogger } from "@forwardimpact/libtelemetry";
8
9
 
9
10
  import { runOutputCommand } from "../src/commands/output.js";
@@ -65,7 +66,9 @@ const definition = {
65
66
  commands: [
66
67
  {
67
68
  name: "run",
68
- args: "",
69
+ args: [],
70
+ argsUsage: "",
71
+ handler: runRunCommand,
69
72
  description: "Run a single agent autonomously on a defined task",
70
73
  options: {
71
74
  ...TASK_INPUT_OPTIONS,
@@ -100,7 +103,9 @@ const definition = {
100
103
  },
101
104
  {
102
105
  name: "supervise",
103
- args: "",
106
+ args: [],
107
+ argsUsage: "",
108
+ handler: runSuperviseCommand,
104
109
  description:
105
110
  "Run a supervisor–agent relay — typical shape for agent-as-judge evaluations",
106
111
  options: {
@@ -143,7 +148,9 @@ const definition = {
143
148
  },
144
149
  {
145
150
  name: "facilitate",
146
- args: "",
151
+ args: [],
152
+ argsUsage: "",
153
+ handler: runFacilitateCommand,
147
154
  description:
148
155
  "Run a facilitator with N participants — typical shape for multi-agent collaboration",
149
156
  options: {
@@ -178,7 +185,9 @@ const definition = {
178
185
  },
179
186
  {
180
187
  name: "discuss",
181
- args: "",
188
+ args: [],
189
+ argsUsage: "",
190
+ handler: runDiscussCommand,
182
191
  description:
183
192
  "Run an async, suspendable discussion — Chair + N participants + bridge callback",
184
193
  options: {
@@ -217,19 +226,25 @@ const definition = {
217
226
  },
218
227
  {
219
228
  name: "output",
220
- args: "",
229
+ args: [],
230
+ argsUsage: "",
231
+ handler: runOutputCommand,
221
232
  description:
222
233
  "Read NDJSON from stdin and emit a structured or readable form",
223
234
  },
224
235
  {
225
236
  name: "tee",
226
- args: "[output.ndjson]",
237
+ args: ["output"],
238
+ argsUsage: "[output.ndjson]",
239
+ handler: runTeeCommand,
227
240
  description:
228
241
  "Stream readable text to stdout while saving raw NDJSON to a file",
229
242
  },
230
243
  {
231
244
  name: "callback",
232
- args: "",
245
+ args: [],
246
+ argsUsage: "",
247
+ handler: runCallbackCommand,
233
248
  description:
234
249
  "Extract the terminal summary from an NDJSON trace and POST it to a callback URL",
235
250
  options: {
@@ -298,43 +313,34 @@ const definition = {
298
313
  ],
299
314
  };
300
315
 
301
- const cli = createCli(definition);
302
316
  const logger = createLogger("eval");
303
317
 
304
- const COMMANDS = {
305
- output: runOutputCommand,
306
- tee: runTeeCommand,
307
- run: runRunCommand,
308
- supervise: runSuperviseCommand,
309
- facilitate: runFacilitateCommand,
310
- discuss: runDiscussCommand,
311
- callback: runCallbackCommand,
312
- };
313
-
314
318
  async function main() {
315
- const parsed = cli.parse(process.argv.slice(2));
316
- if (!parsed) process.exit(0);
317
-
318
- const { values, positionals } = parsed;
319
+ const runtime = createDefaultRuntime();
320
+ const cli = createCli(definition, { runtime });
321
+ const parsed = cli.parse(runtime.proc.argv.slice(2));
322
+ if (!parsed) return runtime.proc.exit(0);
319
323
 
324
+ const { positionals } = parsed;
320
325
  if (positionals.length === 0) {
321
326
  cli.usageError("no command specified");
322
- process.exit(2);
327
+ return runtime.proc.exit(2);
323
328
  }
324
329
 
325
- const [command, ...args] = positionals;
326
- const handler = COMMANDS[command];
327
-
328
- if (!handler) {
330
+ const command = positionals[0];
331
+ if (!definition.commands.some((c) => c.name === command)) {
329
332
  cli.usageError(`unknown command "${command}"`);
330
- process.exit(2);
333
+ return runtime.proc.exit(2);
331
334
  }
332
335
 
333
- await handler(values, args);
336
+ const result = await cli.dispatch(parsed, { deps: { runtime } });
337
+ const envelope = result ?? { ok: true };
338
+ if (!envelope.ok && envelope.error) cli.error(envelope.error);
339
+ runtime.proc.exit(envelope.ok ? 0 : (envelope.code ?? 1));
334
340
  }
335
341
 
336
342
  main().catch((error) => {
337
343
  logger.exception("main", error);
338
- cli.error(error.message);
344
+ createCli(definition).error(error.message);
339
345
  process.exit(1);
340
346
  });
package/bin/fit-trace.js CHANGED
@@ -4,6 +4,7 @@ import "@forwardimpact/libpreflight/node22";
4
4
 
5
5
  import { readFileSync } from "node:fs";
6
6
  import { createCli } from "@forwardimpact/libcli";
7
+ import { createDefaultRuntime } from "@forwardimpact/libutil/runtime";
7
8
  import { createScriptConfig } from "@forwardimpact/libconfig";
8
9
  import { createLogger } from "@forwardimpact/libtelemetry";
9
10
 
@@ -46,7 +47,9 @@ const definition = {
46
47
  commands: [
47
48
  {
48
49
  name: "runs",
49
- args: "[pattern]",
50
+ args: ["pattern"],
51
+ argsUsage: "[pattern]",
52
+ handler: runRunsCommand,
50
53
  description:
51
54
  "List recent GitHub Actions workflow runs (default pattern: agent)",
52
55
  options: {
@@ -63,7 +66,9 @@ const definition = {
63
66
  },
64
67
  {
65
68
  name: "download",
66
- args: "<run-id>",
69
+ args: ["run-id"],
70
+ argsUsage: "<run-id>",
71
+ handler: runDownloadCommand,
67
72
  description: "Download trace artifact and convert to structured JSON",
68
73
  options: {
69
74
  dir: { type: "string", description: "Output directory" },
@@ -77,32 +82,44 @@ const definition = {
77
82
  },
78
83
  {
79
84
  name: "overview",
80
- args: "<file>",
85
+ args: ["file"],
86
+ argsUsage: "<file>",
87
+ handler: runOverviewCommand,
81
88
  description: "Metadata, summary, turn count, tool frequency",
82
89
  },
83
90
  {
84
91
  name: "count",
85
- args: "<file>",
92
+ args: ["file"],
93
+ argsUsage: "<file>",
94
+ handler: runCountCommand,
86
95
  description: "Number of turns",
87
96
  },
88
97
  {
89
98
  name: "batch",
90
- args: "<file> <from> <to>",
99
+ args: ["file", "from", "to"],
100
+ argsUsage: "<file> <from> <to>",
101
+ handler: runBatchCommand,
91
102
  description: "Turns in range [from, to) (zero-indexed)",
92
103
  },
93
104
  {
94
105
  name: "head",
95
- args: "<file> [N]",
106
+ args: ["file", "n"],
107
+ argsUsage: "<file> [N]",
108
+ handler: runHeadCommand,
96
109
  description: "First N turns (default 10)",
97
110
  },
98
111
  {
99
112
  name: "tail",
100
- args: "<file> [N]",
113
+ args: ["file", "n"],
114
+ argsUsage: "<file> [N]",
115
+ handler: runTailCommand,
101
116
  description: "Last N turns (default 10)",
102
117
  },
103
118
  {
104
119
  name: "search",
105
- args: "<file> <pattern>",
120
+ args: ["file", "pattern"],
121
+ argsUsage: "<file> <pattern>",
122
+ handler: runSearchCommand,
106
123
  description: "Search all content for regex pattern",
107
124
  options: {
108
125
  limit: {
@@ -121,22 +138,30 @@ const definition = {
121
138
  },
122
139
  {
123
140
  name: "tools",
124
- args: "<file>",
141
+ args: ["file"],
142
+ argsUsage: "<file>",
143
+ handler: runToolsCommand,
125
144
  description: "Tool usage frequency (descending)",
126
145
  },
127
146
  {
128
147
  name: "tool",
129
- args: "<file> <name>",
148
+ args: ["file", "name"],
149
+ argsUsage: "<file> <name>",
150
+ handler: runToolCommand,
130
151
  description: "All turns involving a specific tool",
131
152
  },
132
153
  {
133
154
  name: "errors",
134
- args: "<file>",
155
+ args: ["file"],
156
+ argsUsage: "<file>",
157
+ handler: runErrorsCommand,
135
158
  description: "Tool results with isError=true",
136
159
  },
137
160
  {
138
161
  name: "reasoning",
139
- args: "<file>",
162
+ args: ["file"],
163
+ argsUsage: "<file>",
164
+ handler: runReasoningCommand,
140
165
  description: "Agent reasoning text only",
141
166
  options: {
142
167
  from: { type: "string", description: "Start at turn index" },
@@ -145,27 +170,37 @@ const definition = {
145
170
  },
146
171
  {
147
172
  name: "timeline",
148
- args: "<file>",
173
+ args: ["file"],
174
+ argsUsage: "<file>",
175
+ handler: runTimelineCommand,
149
176
  description: "Compact one-line-per-turn overview",
150
177
  },
151
178
  {
152
179
  name: "stats",
153
- args: "<file>",
180
+ args: ["file"],
181
+ argsUsage: "<file>",
182
+ handler: runStatsCommand,
154
183
  description: "Token usage and cost breakdown",
155
184
  },
156
185
  {
157
186
  name: "init",
158
- args: "<file>",
187
+ args: ["file"],
188
+ argsUsage: "<file>",
189
+ handler: runInitCommand,
159
190
  description: "Full system/init event",
160
191
  },
161
192
  {
162
193
  name: "turn",
163
- args: "<file> <index>",
194
+ args: ["file", "index"],
195
+ argsUsage: "<file> <index>",
196
+ handler: runTurnCommand,
164
197
  description: "Single turn by index",
165
198
  },
166
199
  {
167
200
  name: "by-discussion",
168
- args: "<discussion-id> [trace-dir]",
201
+ args: ["discussion-id", "trace-dir"],
202
+ argsUsage: "<discussion-id> [trace-dir]",
203
+ handler: runByDiscussionCommand,
169
204
  description:
170
205
  "List trace files whose meta header carries the given discussion_id, ordered by first-event timestamp",
171
206
  options: {
@@ -177,7 +212,9 @@ const definition = {
177
212
  },
178
213
  {
179
214
  name: "filter",
180
- args: "<file>",
215
+ args: ["file"],
216
+ argsUsage: "<file>",
217
+ handler: runFilterCommand,
181
218
  description: "Filter turns by role, tool, or error status",
182
219
  options: {
183
220
  role: {
@@ -196,7 +233,9 @@ const definition = {
196
233
  },
197
234
  {
198
235
  name: "split",
199
- args: "<file>",
236
+ args: ["file"],
237
+ argsUsage: "<file>",
238
+ handler: runSplitCommand,
200
239
  description:
201
240
  "Split a combined trace into per-source files following the `trace--<case>--<participant>.<role>.ndjson` convention",
202
241
  options: {
@@ -217,9 +256,11 @@ const definition = {
217
256
  },
218
257
  {
219
258
  name: "assert",
220
- args: "<test-name> <file>",
259
+ args: ["test-name", "file"],
260
+ argsUsage: "<test-name> <file>",
261
+ handler: runAssertCommand,
221
262
  description:
222
- "Shell-friendly assertion — outputs structured JSON for scoring hooks",
263
+ "Shell-friendly assertion — outputs structured JSON for invariant hooks",
223
264
  options: {
224
265
  grep: {
225
266
  type: "string",
@@ -299,57 +340,42 @@ const definition = {
299
340
  ],
300
341
  };
301
342
 
302
- const cli = createCli(definition);
303
343
  const logger = createLogger("trace");
304
344
 
305
- const COMMANDS = {
306
- runs: runRunsCommand,
307
- download: runDownloadCommand,
308
- overview: runOverviewCommand,
309
- count: runCountCommand,
310
- batch: runBatchCommand,
311
- head: runHeadCommand,
312
- tail: runTailCommand,
313
- search: runSearchCommand,
314
- tools: runToolsCommand,
315
- tool: runToolCommand,
316
- errors: runErrorsCommand,
317
- reasoning: runReasoningCommand,
318
- timeline: runTimelineCommand,
319
- stats: runStatsCommand,
320
- init: runInitCommand,
321
- turn: runTurnCommand,
322
- filter: runFilterCommand,
323
- split: runSplitCommand,
324
- assert: runAssertCommand,
325
- "by-discussion": runByDiscussionCommand,
326
- };
345
+ // Commands that talk to the GitHub API need a config-backed token resolver;
346
+ // the rest only read local trace files through the runtime.
347
+ const NEEDS_CONFIG = new Set(["runs", "download"]);
327
348
 
328
349
  async function main() {
329
- const parsed = cli.parse(process.argv.slice(2));
330
- if (!parsed) process.exit(0);
331
-
332
- const { values, positionals } = parsed;
350
+ const runtime = createDefaultRuntime();
351
+ const cli = createCli(definition, { runtime });
352
+ const parsed = cli.parse(runtime.proc.argv.slice(2));
353
+ if (!parsed) return runtime.proc.exit(0);
333
354
 
355
+ const { positionals } = parsed;
334
356
  if (positionals.length === 0) {
335
357
  cli.usageError("no command specified");
336
- process.exit(2);
358
+ return runtime.proc.exit(2);
337
359
  }
338
360
 
339
- const [command, ...args] = positionals;
340
- const handler = COMMANDS[command];
341
-
342
- if (!handler) {
361
+ const command = positionals[0];
362
+ if (!definition.commands.some((c) => c.name === command)) {
343
363
  cli.usageError(`unknown command "${command}"`);
344
- process.exit(2);
364
+ return runtime.proc.exit(2);
345
365
  }
346
366
 
347
- const config = await createScriptConfig("eval");
348
- await handler(values, args, { config });
367
+ const config = NEEDS_CONFIG.has(command)
368
+ ? await createScriptConfig("eval")
369
+ : undefined;
370
+
371
+ const result = await cli.dispatch(parsed, { deps: { runtime, config } });
372
+ const envelope = result ?? { ok: true };
373
+ if (!envelope.ok && envelope.error) cli.error(envelope.error);
374
+ runtime.proc.exit(envelope.ok ? 0 : (envelope.code ?? 1));
349
375
  }
350
376
 
351
377
  main().catch((error) => {
352
378
  logger.exception("main", error);
353
- cli.error(error.message);
379
+ createCli(definition).error(error.message);
354
380
  process.exit(1);
355
381
  });
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@forwardimpact/libeval",
3
- "version": "0.1.50",
3
+ "version": "0.1.52",
4
4
  "description": "Agent evaluation framework — prove whether agent changes improved outcomes with reproducible evidence.",
5
5
  "keywords": [
6
6
  "eval",
package/src/agent-runner.js CHANGED
@@ -29,12 +29,16 @@ export class AgentRunner {
29
29
  * @param {string[]} [deps.disallowedTools] - Tools to explicitly remove from the model's context
30
30
  * @param {Record<string, object>} [deps.mcpServers] - MCP server configs to pass to the SDK query
31
31
  * @param {object} deps.redactor
32
+ * @param {import("@forwardimpact/libutil/runtime").Runtime} [deps.runtime] -
33
+ * Ambient collaborators. Only `proc.env` is read (to record Skill
34
+ * invocations into `LIBEVAL_SKILL`); when absent the write is skipped.
32
35
  */
33
36
  constructor(deps) {
34
37
  if (!deps.cwd) throw new Error("cwd is required");
35
38
  if (!deps.query) throw new Error("query is required");
36
39
  if (!deps.output) throw new Error("output is required");
37
40
  if (!deps.redactor) throw new Error("redactor is required");
41
+ this.runtime = deps.runtime ?? null;
38
42
  this.cwd = deps.cwd;
39
43
  this.query = deps.query;
40
44
  this.output = deps.output;
@@ -179,20 +183,24 @@ export class AgentRunner {
179
183
  if (message.type === "system" && message.subtype === "init") {
180
184
  this.sessionId = message.session_id;
181
185
  }
182
- if (message.type === "assistant") trackSkillInvocation(message);
186
+ if (message.type === "assistant") this.#trackSkillInvocation(message);
183
187
  }
184
- }
185
188
 
186
- function trackSkillInvocation(message) {
187
- const content = message.message?.content ?? message.content;
188
- if (!Array.isArray(content)) return;
189
- for (const block of content) {
190
- if (
191
- block.type === "tool_use" &&
192
- block.name === "Skill" &&
193
- block.input?.skill
194
- ) {
195
- process.env.LIBEVAL_SKILL = block.input.skill;
189
+ #trackSkillInvocation(message) {
190
+ const content = message.message?.content ?? message.content;
191
+ if (!Array.isArray(content)) return;
192
+ // Skill metric is recorded into the env map; without a runtime there is
193
+ // no env surface to write to, so the side-effect is simply skipped.
194
+ const env = this.runtime?.proc?.env ?? null;
195
+ if (!env) return;
196
+ for (const block of content) {
197
+ if (
198
+ block.type === "tool_use" &&
199
+ block.name === "Skill" &&
200
+ block.input?.skill
201
+ ) {
202
+ env.LIBEVAL_SKILL = block.input.skill;
203
+ }
196
204
  }
197
205
  }
198
206
  }