@forwardimpact/libeval 0.1.50 → 0.1.51

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. package/README.md +11 -8
  2. package/bin/fit-benchmark.js +26 -27
  3. package/bin/fit-eval.js +49 -30
  4. package/bin/fit-trace.js +83 -57
  5. package/package.json +1 -1
  6. package/src/agent-runner.js +20 -12
  7. package/src/benchmark/env-loader.js +35 -23
  8. package/src/benchmark/{scorer.js → invariants.js} +14 -12
  9. package/src/benchmark/judge.js +5 -8
  10. package/src/benchmark/report.js +15 -15
  11. package/src/benchmark/result.js +11 -11
  12. package/src/benchmark/runner.js +11 -11
  13. package/src/benchmark/task-family.js +6 -4
  14. package/src/benchmark/workdir.js +18 -3
  15. package/src/commands/assert.js +30 -22
  16. package/src/commands/benchmark-invariants.js +74 -0
  17. package/src/commands/benchmark-report.js +23 -15
  18. package/src/commands/benchmark-run.js +15 -8
  19. package/src/commands/by-discussion.js +29 -18
  20. package/src/commands/callback.js +20 -11
  21. package/src/commands/discuss.js +28 -11
  22. package/src/commands/facilitate.js +18 -12
  23. package/src/commands/output.js +11 -12
  24. package/src/commands/run.js +22 -12
  25. package/src/commands/supervise.js +27 -18
  26. package/src/commands/task-input.js +10 -5
  27. package/src/commands/trace.js +174 -97
  28. package/src/discuss-tools.js +48 -2
  29. package/src/discusser.js +49 -2
  30. package/src/events/github.js +27 -5
  31. package/src/inbox-poller.js +84 -0
  32. package/src/judge.js +1 -1
  33. package/src/message-bus.js +6 -0
  34. package/src/orchestration-loop.js +14 -4
  35. package/src/orchestration-toolkit.js +14 -0
  36. package/src/redaction.js +31 -9
  37. package/src/reply-emitter.js +47 -0
  38. package/src/commands/benchmark-score.js +0 -68
package/README.md CHANGED
@@ -71,11 +71,12 @@ while participants work in parallel — nothing blocks the LLM thread.
71
71
 
72
72
  ### Discuss-mode replies
73
73
 
74
- In discussion mode, Answer calls routed to the lead are captured as
75
- thread replies delivered via the bridge callback. The lead delegates work
76
- via Ask; each agent's Answer becomes a separate reply posted to the
77
- discussion thread. No explicit reply tool is needed on the lead surface —
78
- the message bus intercepts answers and appends them to `ctx.replies[]`.
74
+ In discussion mode, Answer calls routed to the lead are streamed to
75
+ the discussion thread as they are produced — each agent's Answer becomes
76
+ a separate reply posted immediately, not batched at session end. The
77
+ lead and agents can also call `Acknowledge` to post brief messages
78
+ directly to the thread (status updates, human follow-up responses).
79
+ The message bus intercepts answers and appends them to `ctx.replies[]`.
79
80
 
80
81
  `RequestForComment` is a separate coordination tool available on agent
81
82
  roles (facilitated agents and discuss agents). It queues an intent to
@@ -104,8 +105,8 @@ only feeds the summary's `success`/`verdict`.
104
105
  | Fac. agent | ✓ | ✓ | ✓ | ✓ | | `RequestForComment` |
105
106
  | Supervisor | ✓ | ✓ | ✓ | ✓ | ✓ | |
106
107
  | Sup. agent | ✓ | ✓ | ✓ | ✓ | | |
107
- | Discuss lead | ✓ | ✓ | ✓ | ✓ | | `Recess`, `Adjourn` |
108
- | Discuss agt | ✓ | ✓ | ✓ | ✓ | | `RequestForComment` |
108
+ | Discuss lead | ✓ | ✓ | ✓ | ✓ | | `Recess`, `Adjourn`, `Acknowledge` |
109
+ | Discuss agt | ✓ | ✓ | ✓ | ✓ | | `RequestForComment`, `Acknowledge` |
109
110
  | Judge | | | | | ✓ | |
110
111
 
111
112
  Ask's `to` accepts a participant name on multi-participant roles
@@ -169,7 +170,9 @@ downloadable through retention.
169
170
  | `orchestration-toolkit.js` | Shared Ask/Answer/Announce/Conclude/RollCall/RequestForComment handlers + builders. |
170
171
  | `orchestration-loop.js` | Unified lead+participant loop; reminder/violation handling. |
171
172
  | `facilitator.js` / `supervisor.js` / `discusser.js` / `judge.js` | Per-mode class + factory + system prompt. |
172
- | `discuss-tools.js` | Discuss-only `Recess`/`Adjourn`. |
173
+ | `discuss-tools.js` | Discuss-only `Recess`/`Adjourn`/`Acknowledge`. |
174
+ | `reply-emitter.js` | Fire-and-forget POST of reply/ack events to the callback URL. |
175
+ | `inbox-poller.js` | Long-poll the bridge inbox for injected human messages. |
173
176
  | `trace-collector.js` / `trace-query.js` / `trace-github.js` | Trace ingestion / querying / GitHub-attachment helpers. |
174
177
  | `redaction.js` | Env-var allowlist + credential-shape pattern redaction. |
175
178
 
@@ -4,10 +4,11 @@ import "@forwardimpact/libpreflight/node22";
4
4
 
5
5
  import { readFileSync, realpathSync } from "node:fs";
6
6
  import { createCli } from "@forwardimpact/libcli";
7
+ import { createDefaultRuntime } from "@forwardimpact/libutil/runtime";
7
8
  import { createLogger } from "@forwardimpact/libtelemetry";
8
9
 
9
10
  import { runBenchmarkRunCommand } from "../src/commands/benchmark-run.js";
10
- import { runBenchmarkScoreCommand } from "../src/commands/benchmark-score.js";
11
+ import { runBenchmarkInvariantsCommand } from "../src/commands/benchmark-invariants.js";
11
12
  import { runBenchmarkReportCommand } from "../src/commands/benchmark-report.js";
12
13
 
13
14
  // `bun build --compile` injects FIT_BENCHMARK_VERSION via --define, eliminating
@@ -26,7 +27,8 @@ export const definition = {
26
27
  commands: [
27
28
  {
28
29
  name: "run",
29
- args: "",
30
+ args: [],
31
+ handler: runBenchmarkRunCommand,
30
32
  description:
31
33
  "Run every task in a family for N runs and emit one result record per (task, runIndex).",
32
34
  options: {
@@ -78,10 +80,11 @@ export const definition = {
78
80
  },
79
81
  },
80
82
  {
81
- name: "score",
82
- args: "",
83
+ name: "invariants",
84
+ args: [],
85
+ handler: runBenchmarkInvariantsCommand,
83
86
  description:
84
- "Score a single task against a post-run workdir without invoking an agent.",
87
+ "Check a single task's invariants against a post-run workdir without invoking an agent.",
85
88
  options: {
86
89
  family: {
87
90
  type: "string",
@@ -94,7 +97,7 @@ export const definition = {
94
97
  workdir: {
95
98
  type: "string",
96
99
  description:
97
- "Post-run directory; <workdir>/cwd/ is the agent CWD scoring runs against",
100
+ "Post-run directory; <workdir>/cwd/ is the agent CWD invariants run against",
98
101
  },
99
102
  output: {
100
103
  type: "string",
@@ -104,7 +107,8 @@ export const definition = {
104
107
  },
105
108
  {
106
109
  name: "report",
107
- args: "",
110
+ args: [],
111
+ handler: runBenchmarkReportCommand,
108
112
  description:
109
113
  "Aggregate result records into pass@k via the OpenAI HumanEval estimator.",
110
114
  options: {
@@ -132,7 +136,7 @@ export const definition = {
132
136
  examples: [
133
137
  "fit-benchmark run --family=./families/coding",
134
138
  "fit-benchmark run --family=./families/coding --runs=10 --agent-model=claude-sonnet-4-6",
135
- "fit-benchmark score --family=./families/coding --task=todo-api --workdir=./benchmark-runs/runs/todo-api/0",
139
+ "fit-benchmark invariants --family=./families/coding --task=todo-api --workdir=./benchmark-runs/runs/todo-api/0",
136
140
  "fit-benchmark report --format=text",
137
141
  "fit-benchmark report --input=./runs/today --k=1,3,5 --format=text",
138
142
  ],
@@ -152,35 +156,30 @@ export const definition = {
152
156
  ],
153
157
  };
154
158
 
155
- const cli = createCli(definition);
156
159
  const logger = createLogger("benchmark");
157
160
 
158
- const COMMANDS = {
159
- run: runBenchmarkRunCommand,
160
- score: runBenchmarkScoreCommand,
161
- report: runBenchmarkReportCommand,
162
- };
163
-
164
161
  async function main() {
165
- const parsed = cli.parse(process.argv.slice(2));
166
- if (!parsed) process.exit(0);
167
-
168
- const { values, positionals } = parsed;
162
+ const runtime = createDefaultRuntime();
163
+ const cli = createCli(definition, { runtime });
164
+ const parsed = cli.parse(runtime.proc.argv.slice(2));
165
+ if (!parsed) return runtime.proc.exit(0);
169
166
 
167
+ const { positionals } = parsed;
170
168
  if (positionals.length === 0) {
171
169
  cli.usageError("no command specified");
172
- process.exit(2);
170
+ return runtime.proc.exit(2);
173
171
  }
174
172
 
175
- const [command, ...args] = positionals;
176
- const handler = COMMANDS[command];
177
-
178
- if (!handler) {
173
+ const command = positionals[0];
174
+ if (!definition.commands.some((c) => c.name === command)) {
179
175
  cli.usageError(`unknown command "${command}"`);
180
- process.exit(2);
176
+ return runtime.proc.exit(2);
181
177
  }
182
178
 
183
- await handler(values, args);
179
+ const result = await cli.dispatch(parsed, { deps: { runtime } });
180
+ const envelope = result ?? { ok: true };
181
+ if (!envelope.ok && envelope.error) cli.error(envelope.error);
182
+ runtime.proc.exit(envelope.ok ? 0 : (envelope.code ?? 1));
184
183
  }
185
184
 
186
185
  // Run main only when invoked as a CLI. Importing for tests (e.g. parity)
@@ -188,7 +187,7 @@ async function main() {
188
187
  if (import.meta.url === `file://${realpathSync(process.argv[1])}`) {
189
188
  main().catch((error) => {
190
189
  logger.exception("main", error);
191
- cli.error(error.message);
190
+ createCli(definition).error(error.message);
192
191
  process.exit(1);
193
192
  });
194
193
  }
package/bin/fit-eval.js CHANGED
@@ -4,6 +4,7 @@ import "@forwardimpact/libpreflight/node22";
4
4
 
5
5
  import { readFileSync } from "node:fs";
6
6
  import { createCli } from "@forwardimpact/libcli";
7
+ import { createDefaultRuntime } from "@forwardimpact/libutil/runtime";
7
8
  import { createLogger } from "@forwardimpact/libtelemetry";
8
9
 
9
10
  import { runOutputCommand } from "../src/commands/output.js";
@@ -14,6 +15,19 @@ import { runFacilitateCommand } from "../src/commands/facilitate.js";
14
15
  import { runDiscussCommand } from "../src/commands/discuss.js";
15
16
  import { runCallbackCommand } from "../src/commands/callback.js";
16
17
 
18
+ // `tee` streams stdin→stdout via Node's `pipeline`, which needs real stream
19
+ // objects the runtime surface does not expose; it keeps the legacy
20
+ // `(values, args)` signature and this adapter bridges it into dispatch.
21
+ async function teeHandler(ctx) {
22
+ const out = ctx.args.output;
23
+ try {
24
+ await runTeeCommand(ctx.options, out ? [out] : []);
25
+ return { ok: true };
26
+ } catch (error) {
27
+ return { ok: false, code: 1, error: error.message };
28
+ }
29
+ }
30
+
17
31
  // `bun build --compile` injects FIT_EVAL_VERSION via --define, eliminating
18
32
  // the readFileSync branch in the compiled binary (which would ENOENT against
19
33
  // the bunfs virtual mount). Source execution falls through to package.json.
@@ -65,7 +79,9 @@ const definition = {
65
79
  commands: [
66
80
  {
67
81
  name: "run",
68
- args: "",
82
+ args: [],
83
+ argsUsage: "",
84
+ handler: runRunCommand,
69
85
  description: "Run a single agent autonomously on a defined task",
70
86
  options: {
71
87
  ...TASK_INPUT_OPTIONS,
@@ -100,7 +116,9 @@ const definition = {
100
116
  },
101
117
  {
102
118
  name: "supervise",
103
- args: "",
119
+ args: [],
120
+ argsUsage: "",
121
+ handler: runSuperviseCommand,
104
122
  description:
105
123
  "Run a supervisor–agent relay — typical shape for agent-as-judge evaluations",
106
124
  options: {
@@ -143,7 +161,9 @@ const definition = {
143
161
  },
144
162
  {
145
163
  name: "facilitate",
146
- args: "",
164
+ args: [],
165
+ argsUsage: "",
166
+ handler: runFacilitateCommand,
147
167
  description:
148
168
  "Run a facilitator with N participants — typical shape for multi-agent collaboration",
149
169
  options: {
@@ -178,7 +198,9 @@ const definition = {
178
198
  },
179
199
  {
180
200
  name: "discuss",
181
- args: "",
201
+ args: [],
202
+ argsUsage: "",
203
+ handler: runDiscussCommand,
182
204
  description:
183
205
  "Run an async, suspendable discussion — Chair + N participants + bridge callback",
184
206
  options: {
@@ -217,19 +239,25 @@ const definition = {
217
239
  },
218
240
  {
219
241
  name: "output",
220
- args: "",
242
+ args: [],
243
+ argsUsage: "",
244
+ handler: runOutputCommand,
221
245
  description:
222
246
  "Read NDJSON from stdin and emit a structured or readable form",
223
247
  },
224
248
  {
225
249
  name: "tee",
226
- args: "[output.ndjson]",
250
+ args: ["output"],
251
+ argsUsage: "[output.ndjson]",
252
+ handler: teeHandler,
227
253
  description:
228
254
  "Stream readable text to stdout while saving raw NDJSON to a file",
229
255
  },
230
256
  {
231
257
  name: "callback",
232
- args: "",
258
+ args: [],
259
+ argsUsage: "",
260
+ handler: runCallbackCommand,
233
261
  description:
234
262
  "Extract the terminal summary from an NDJSON trace and POST it to a callback URL",
235
263
  options: {
@@ -298,43 +326,34 @@ const definition = {
298
326
  ],
299
327
  };
300
328
 
301
- const cli = createCli(definition);
302
329
  const logger = createLogger("eval");
303
330
 
304
- const COMMANDS = {
305
- output: runOutputCommand,
306
- tee: runTeeCommand,
307
- run: runRunCommand,
308
- supervise: runSuperviseCommand,
309
- facilitate: runFacilitateCommand,
310
- discuss: runDiscussCommand,
311
- callback: runCallbackCommand,
312
- };
313
-
314
331
  async function main() {
315
- const parsed = cli.parse(process.argv.slice(2));
316
- if (!parsed) process.exit(0);
317
-
318
- const { values, positionals } = parsed;
332
+ const runtime = createDefaultRuntime();
333
+ const cli = createCli(definition, { runtime });
334
+ const parsed = cli.parse(runtime.proc.argv.slice(2));
335
+ if (!parsed) return runtime.proc.exit(0);
319
336
 
337
+ const { positionals } = parsed;
320
338
  if (positionals.length === 0) {
321
339
  cli.usageError("no command specified");
322
- process.exit(2);
340
+ return runtime.proc.exit(2);
323
341
  }
324
342
 
325
- const [command, ...args] = positionals;
326
- const handler = COMMANDS[command];
327
-
328
- if (!handler) {
343
+ const command = positionals[0];
344
+ if (!definition.commands.some((c) => c.name === command)) {
329
345
  cli.usageError(`unknown command "${command}"`);
330
- process.exit(2);
346
+ return runtime.proc.exit(2);
331
347
  }
332
348
 
333
- await handler(values, args);
349
+ const result = await cli.dispatch(parsed, { deps: { runtime } });
350
+ const envelope = result ?? { ok: true };
351
+ if (!envelope.ok && envelope.error) cli.error(envelope.error);
352
+ runtime.proc.exit(envelope.ok ? 0 : (envelope.code ?? 1));
334
353
  }
335
354
 
336
355
  main().catch((error) => {
337
356
  logger.exception("main", error);
338
- cli.error(error.message);
357
+ createCli(definition).error(error.message);
339
358
  process.exit(1);
340
359
  });
package/bin/fit-trace.js CHANGED
@@ -4,6 +4,7 @@ import "@forwardimpact/libpreflight/node22";
4
4
 
5
5
  import { readFileSync } from "node:fs";
6
6
  import { createCli } from "@forwardimpact/libcli";
7
+ import { createDefaultRuntime } from "@forwardimpact/libutil/runtime";
7
8
  import { createScriptConfig } from "@forwardimpact/libconfig";
8
9
  import { createLogger } from "@forwardimpact/libtelemetry";
9
10
 
@@ -46,7 +47,9 @@ const definition = {
46
47
  commands: [
47
48
  {
48
49
  name: "runs",
49
- args: "[pattern]",
50
+ args: ["pattern"],
51
+ argsUsage: "[pattern]",
52
+ handler: runRunsCommand,
50
53
  description:
51
54
  "List recent GitHub Actions workflow runs (default pattern: agent)",
52
55
  options: {
@@ -63,7 +66,9 @@ const definition = {
63
66
  },
64
67
  {
65
68
  name: "download",
66
- args: "<run-id>",
69
+ args: ["run-id"],
70
+ argsUsage: "<run-id>",
71
+ handler: runDownloadCommand,
67
72
  description: "Download trace artifact and convert to structured JSON",
68
73
  options: {
69
74
  dir: { type: "string", description: "Output directory" },
@@ -77,32 +82,44 @@ const definition = {
77
82
  },
78
83
  {
79
84
  name: "overview",
80
- args: "<file>",
85
+ args: ["file"],
86
+ argsUsage: "<file>",
87
+ handler: runOverviewCommand,
81
88
  description: "Metadata, summary, turn count, tool frequency",
82
89
  },
83
90
  {
84
91
  name: "count",
85
- args: "<file>",
92
+ args: ["file"],
93
+ argsUsage: "<file>",
94
+ handler: runCountCommand,
86
95
  description: "Number of turns",
87
96
  },
88
97
  {
89
98
  name: "batch",
90
- args: "<file> <from> <to>",
99
+ args: ["file", "from", "to"],
100
+ argsUsage: "<file> <from> <to>",
101
+ handler: runBatchCommand,
91
102
  description: "Turns in range [from, to) (zero-indexed)",
92
103
  },
93
104
  {
94
105
  name: "head",
95
- args: "<file> [N]",
106
+ args: ["file", "n"],
107
+ argsUsage: "<file> [N]",
108
+ handler: runHeadCommand,
96
109
  description: "First N turns (default 10)",
97
110
  },
98
111
  {
99
112
  name: "tail",
100
- args: "<file> [N]",
113
+ args: ["file", "n"],
114
+ argsUsage: "<file> [N]",
115
+ handler: runTailCommand,
101
116
  description: "Last N turns (default 10)",
102
117
  },
103
118
  {
104
119
  name: "search",
105
- args: "<file> <pattern>",
120
+ args: ["file", "pattern"],
121
+ argsUsage: "<file> <pattern>",
122
+ handler: runSearchCommand,
106
123
  description: "Search all content for regex pattern",
107
124
  options: {
108
125
  limit: {
@@ -121,22 +138,30 @@ const definition = {
121
138
  },
122
139
  {
123
140
  name: "tools",
124
- args: "<file>",
141
+ args: ["file"],
142
+ argsUsage: "<file>",
143
+ handler: runToolsCommand,
125
144
  description: "Tool usage frequency (descending)",
126
145
  },
127
146
  {
128
147
  name: "tool",
129
- args: "<file> <name>",
148
+ args: ["file", "name"],
149
+ argsUsage: "<file> <name>",
150
+ handler: runToolCommand,
130
151
  description: "All turns involving a specific tool",
131
152
  },
132
153
  {
133
154
  name: "errors",
134
- args: "<file>",
155
+ args: ["file"],
156
+ argsUsage: "<file>",
157
+ handler: runErrorsCommand,
135
158
  description: "Tool results with isError=true",
136
159
  },
137
160
  {
138
161
  name: "reasoning",
139
- args: "<file>",
162
+ args: ["file"],
163
+ argsUsage: "<file>",
164
+ handler: runReasoningCommand,
140
165
  description: "Agent reasoning text only",
141
166
  options: {
142
167
  from: { type: "string", description: "Start at turn index" },
@@ -145,27 +170,37 @@ const definition = {
145
170
  },
146
171
  {
147
172
  name: "timeline",
148
- args: "<file>",
173
+ args: ["file"],
174
+ argsUsage: "<file>",
175
+ handler: runTimelineCommand,
149
176
  description: "Compact one-line-per-turn overview",
150
177
  },
151
178
  {
152
179
  name: "stats",
153
- args: "<file>",
180
+ args: ["file"],
181
+ argsUsage: "<file>",
182
+ handler: runStatsCommand,
154
183
  description: "Token usage and cost breakdown",
155
184
  },
156
185
  {
157
186
  name: "init",
158
- args: "<file>",
187
+ args: ["file"],
188
+ argsUsage: "<file>",
189
+ handler: runInitCommand,
159
190
  description: "Full system/init event",
160
191
  },
161
192
  {
162
193
  name: "turn",
163
- args: "<file> <index>",
194
+ args: ["file", "index"],
195
+ argsUsage: "<file> <index>",
196
+ handler: runTurnCommand,
164
197
  description: "Single turn by index",
165
198
  },
166
199
  {
167
200
  name: "by-discussion",
168
- args: "<discussion-id> [trace-dir]",
201
+ args: ["discussion-id", "trace-dir"],
202
+ argsUsage: "<discussion-id> [trace-dir]",
203
+ handler: runByDiscussionCommand,
169
204
  description:
170
205
  "List trace files whose meta header carries the given discussion_id, ordered by first-event timestamp",
171
206
  options: {
@@ -177,7 +212,9 @@ const definition = {
177
212
  },
178
213
  {
179
214
  name: "filter",
180
- args: "<file>",
215
+ args: ["file"],
216
+ argsUsage: "<file>",
217
+ handler: runFilterCommand,
181
218
  description: "Filter turns by role, tool, or error status",
182
219
  options: {
183
220
  role: {
@@ -196,7 +233,9 @@ const definition = {
196
233
  },
197
234
  {
198
235
  name: "split",
199
- args: "<file>",
236
+ args: ["file"],
237
+ argsUsage: "<file>",
238
+ handler: runSplitCommand,
200
239
  description:
201
240
  "Split a combined trace into per-source files following the `trace--<case>--<participant>.<role>.ndjson` convention",
202
241
  options: {
@@ -217,9 +256,11 @@ const definition = {
217
256
  },
218
257
  {
219
258
  name: "assert",
220
- args: "<test-name> <file>",
259
+ args: ["test-name", "file"],
260
+ argsUsage: "<test-name> <file>",
261
+ handler: runAssertCommand,
221
262
  description:
222
- "Shell-friendly assertion — outputs structured JSON for scoring hooks",
263
+ "Shell-friendly assertion — outputs structured JSON for invariant hooks",
223
264
  options: {
224
265
  grep: {
225
266
  type: "string",
@@ -299,57 +340,42 @@ const definition = {
299
340
  ],
300
341
  };
301
342
 
302
- const cli = createCli(definition);
303
343
  const logger = createLogger("trace");
304
344
 
305
- const COMMANDS = {
306
- runs: runRunsCommand,
307
- download: runDownloadCommand,
308
- overview: runOverviewCommand,
309
- count: runCountCommand,
310
- batch: runBatchCommand,
311
- head: runHeadCommand,
312
- tail: runTailCommand,
313
- search: runSearchCommand,
314
- tools: runToolsCommand,
315
- tool: runToolCommand,
316
- errors: runErrorsCommand,
317
- reasoning: runReasoningCommand,
318
- timeline: runTimelineCommand,
319
- stats: runStatsCommand,
320
- init: runInitCommand,
321
- turn: runTurnCommand,
322
- filter: runFilterCommand,
323
- split: runSplitCommand,
324
- assert: runAssertCommand,
325
- "by-discussion": runByDiscussionCommand,
326
- };
345
+ // Commands that talk to the GitHub API need a config-backed token resolver;
346
+ // the rest only read local trace files through the runtime.
347
+ const NEEDS_CONFIG = new Set(["runs", "download"]);
327
348
 
328
349
  async function main() {
329
- const parsed = cli.parse(process.argv.slice(2));
330
- if (!parsed) process.exit(0);
331
-
332
- const { values, positionals } = parsed;
350
+ const runtime = createDefaultRuntime();
351
+ const cli = createCli(definition, { runtime });
352
+ const parsed = cli.parse(runtime.proc.argv.slice(2));
353
+ if (!parsed) return runtime.proc.exit(0);
333
354
 
355
+ const { positionals } = parsed;
334
356
  if (positionals.length === 0) {
335
357
  cli.usageError("no command specified");
336
- process.exit(2);
358
+ return runtime.proc.exit(2);
337
359
  }
338
360
 
339
- const [command, ...args] = positionals;
340
- const handler = COMMANDS[command];
341
-
342
- if (!handler) {
361
+ const command = positionals[0];
362
+ if (!definition.commands.some((c) => c.name === command)) {
343
363
  cli.usageError(`unknown command "${command}"`);
344
- process.exit(2);
364
+ return runtime.proc.exit(2);
345
365
  }
346
366
 
347
- const config = await createScriptConfig("eval");
348
- await handler(values, args, { config });
367
+ const config = NEEDS_CONFIG.has(command)
368
+ ? await createScriptConfig("eval")
369
+ : undefined;
370
+
371
+ const result = await cli.dispatch(parsed, { deps: { runtime, config } });
372
+ const envelope = result ?? { ok: true };
373
+ if (!envelope.ok && envelope.error) cli.error(envelope.error);
374
+ runtime.proc.exit(envelope.ok ? 0 : (envelope.code ?? 1));
349
375
  }
350
376
 
351
377
  main().catch((error) => {
352
378
  logger.exception("main", error);
353
- cli.error(error.message);
379
+ createCli(definition).error(error.message);
354
380
  process.exit(1);
355
381
  });
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@forwardimpact/libeval",
3
- "version": "0.1.50",
3
+ "version": "0.1.51",
4
4
  "description": "Agent evaluation framework — prove whether agent changes improved outcomes with reproducible evidence.",
5
5
  "keywords": [
6
6
  "eval",
@@ -29,12 +29,16 @@ export class AgentRunner {
29
29
  * @param {string[]} [deps.disallowedTools] - Tools to explicitly remove from the model's context
30
30
  * @param {Record<string, object>} [deps.mcpServers] - MCP server configs to pass to the SDK query
31
31
  * @param {object} deps.redactor
32
+ * @param {import("@forwardimpact/libutil/runtime").Runtime} [deps.runtime] -
33
+ * Ambient collaborators. Only `proc.env` is read (to record Skill
34
+ * invocations into `LIBEVAL_SKILL`); when absent the write is skipped.
32
35
  */
33
36
  constructor(deps) {
34
37
  if (!deps.cwd) throw new Error("cwd is required");
35
38
  if (!deps.query) throw new Error("query is required");
36
39
  if (!deps.output) throw new Error("output is required");
37
40
  if (!deps.redactor) throw new Error("redactor is required");
41
+ this.runtime = deps.runtime ?? null;
38
42
  this.cwd = deps.cwd;
39
43
  this.query = deps.query;
40
44
  this.output = deps.output;
@@ -179,20 +183,24 @@ export class AgentRunner {
179
183
  if (message.type === "system" && message.subtype === "init") {
180
184
  this.sessionId = message.session_id;
181
185
  }
182
- if (message.type === "assistant") trackSkillInvocation(message);
186
+ if (message.type === "assistant") this.#trackSkillInvocation(message);
183
187
  }
184
- }
185
188
 
186
- function trackSkillInvocation(message) {
187
- const content = message.message?.content ?? message.content;
188
- if (!Array.isArray(content)) return;
189
- for (const block of content) {
190
- if (
191
- block.type === "tool_use" &&
192
- block.name === "Skill" &&
193
- block.input?.skill
194
- ) {
195
- process.env.LIBEVAL_SKILL = block.input.skill;
189
+ #trackSkillInvocation(message) {
190
+ const content = message.message?.content ?? message.content;
191
+ if (!Array.isArray(content)) return;
192
+ // Skill metric is recorded into the env map; without a runtime there is
193
+ // no env surface to write to, so the side-effect is simply skipped.
194
+ const env = this.runtime?.proc?.env ?? null;
195
+ if (!env) return;
196
+ for (const block of content) {
197
+ if (
198
+ block.type === "tool_use" &&
199
+ block.name === "Skill" &&
200
+ block.input?.skill
201
+ ) {
202
+ env.LIBEVAL_SKILL = block.input.skill;
203
+ }
196
204
  }
197
205
  }
198
206
  }