@forwardimpact/libeval 0.1.49 → 0.1.51

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. package/README.md +11 -8
  2. package/bin/fit-benchmark.js +26 -27
  3. package/bin/fit-eval.js +76 -78
  4. package/bin/fit-trace.js +83 -57
  5. package/package.json +2 -2
  6. package/src/agent-runner.js +23 -13
  7. package/src/benchmark/env-loader.js +35 -23
  8. package/src/benchmark/{scorer.js → invariants.js} +14 -12
  9. package/src/benchmark/judge.js +5 -8
  10. package/src/benchmark/npm-installer.js +87 -0
  11. package/src/benchmark/report.js +15 -15
  12. package/src/benchmark/result.js +11 -11
  13. package/src/benchmark/runner.js +17 -11
  14. package/src/benchmark/task-family.js +6 -4
  15. package/src/benchmark/workdir.js +23 -3
  16. package/src/commands/assert.js +30 -22
  17. package/src/commands/benchmark-invariants.js +74 -0
  18. package/src/commands/benchmark-report.js +23 -15
  19. package/src/commands/benchmark-run.js +22 -7
  20. package/src/commands/by-discussion.js +29 -18
  21. package/src/commands/callback.js +20 -11
  22. package/src/commands/discuss.js +30 -21
  23. package/src/commands/facilitate.js +20 -21
  24. package/src/commands/output.js +11 -12
  25. package/src/commands/run.js +24 -21
  26. package/src/commands/supervise.js +27 -27
  27. package/src/commands/task-input.js +54 -0
  28. package/src/commands/trace.js +174 -97
  29. package/src/discuss-tools.js +48 -2
  30. package/src/discusser.js +49 -2
  31. package/src/events/github.js +155 -0
  32. package/src/inbox-poller.js +84 -0
  33. package/src/index.js +10 -0
  34. package/src/judge.js +1 -1
  35. package/src/message-bus.js +6 -0
  36. package/src/orchestration-loop.js +19 -5
  37. package/src/orchestration-toolkit.js +14 -0
  38. package/src/redaction.js +31 -9
  39. package/src/reply-emitter.js +47 -0
  40. package/src/commands/benchmark-score.js +0 -68
package/README.md CHANGED
@@ -71,11 +71,12 @@ while participants work in parallel — nothing blocks the LLM thread.
71
71
 
72
72
  ### Discuss-mode replies
73
73
 
74
- In discussion mode, Answer calls routed to the lead are captured as
75
- thread replies delivered via the bridge callback. The lead delegates work
76
- via Ask; each agent's Answer becomes a separate reply posted to the
77
- discussion thread. No explicit reply tool is needed on the lead surface —
78
- the message bus intercepts answers and appends them to `ctx.replies[]`.
74
+ In discussion mode, Answer calls routed to the lead are streamed to
75
+ the discussion thread as they are produced — each agent's Answer becomes
76
+ a separate reply posted immediately, not batched at session end. The
77
+ lead and agents can also call `Acknowledge` to post brief messages
78
+ directly to the thread (status updates, human follow-up responses).
79
+ The message bus intercepts answers and appends them to `ctx.replies[]`.
79
80
 
80
81
  `RequestForComment` is a separate coordination tool available on agent
81
82
  roles (facilitated agents and discuss agents). It queues an intent to
@@ -104,8 +105,8 @@ only feeds the summary's `success`/`verdict`.
104
105
  | Fac. agent | ✓ | ✓ | ✓ | ✓ | | `RequestForComment` |
105
106
  | Supervisor | ✓ | ✓ | ✓ | ✓ | ✓ | |
106
107
  | Sup. agent | ✓ | ✓ | ✓ | ✓ | | |
107
- | Discuss lead | ✓ | ✓ | ✓ | ✓ | | `Recess`, `Adjourn` |
108
- | Discuss agt | ✓ | ✓ | ✓ | ✓ | | `RequestForComment` |
108
+ | Discuss lead | ✓ | ✓ | ✓ | ✓ | | `Recess`, `Adjourn`, `Acknowledge` |
109
+ | Discuss agt | ✓ | ✓ | ✓ | ✓ | | `RequestForComment`, `Acknowledge` |
109
110
  | Judge | | | | | ✓ | |
110
111
 
111
112
  Ask's `to` accepts a participant name on multi-participant roles
@@ -169,7 +170,9 @@ downloadable through retention.
169
170
  | `orchestration-toolkit.js` | Shared Ask/Answer/Announce/Conclude/RollCall/RequestForComment handlers + builders. |
170
171
  | `orchestration-loop.js` | Unified lead+participant loop; reminder/violation handling. |
171
172
  | `facilitator.js` / `supervisor.js` / `discusser.js` / `judge.js` | Per-mode class + factory + system prompt. |
172
- | `discuss-tools.js` | Discuss-only `Recess`/`Adjourn`. |
173
+ | `discuss-tools.js` | Discuss-only `Recess`/`Adjourn`/`Acknowledge`. |
174
+ | `reply-emitter.js` | Fire-and-forget POST of reply/ack events to the callback URL. |
175
+ | `inbox-poller.js` | Long-poll the bridge inbox for injected human messages. |
173
176
  | `trace-collector.js` / `trace-query.js` / `trace-github.js` | Trace ingestion / querying / GitHub-attachment helpers. |
174
177
  | `redaction.js` | Env-var allowlist + credential-shape pattern redaction. |
175
178
 
@@ -4,10 +4,11 @@ import "@forwardimpact/libpreflight/node22";
4
4
 
5
5
  import { readFileSync, realpathSync } from "node:fs";
6
6
  import { createCli } from "@forwardimpact/libcli";
7
+ import { createDefaultRuntime } from "@forwardimpact/libutil/runtime";
7
8
  import { createLogger } from "@forwardimpact/libtelemetry";
8
9
 
9
10
  import { runBenchmarkRunCommand } from "../src/commands/benchmark-run.js";
10
- import { runBenchmarkScoreCommand } from "../src/commands/benchmark-score.js";
11
+ import { runBenchmarkInvariantsCommand } from "../src/commands/benchmark-invariants.js";
11
12
  import { runBenchmarkReportCommand } from "../src/commands/benchmark-report.js";
12
13
 
13
14
  // `bun build --compile` injects FIT_BENCHMARK_VERSION via --define, eliminating
@@ -26,7 +27,8 @@ export const definition = {
26
27
  commands: [
27
28
  {
28
29
  name: "run",
29
- args: "",
30
+ args: [],
31
+ handler: runBenchmarkRunCommand,
30
32
  description:
31
33
  "Run every task in a family for N runs and emit one result record per (task, runIndex).",
32
34
  options: {
@@ -78,10 +80,11 @@ export const definition = {
78
80
  },
79
81
  },
80
82
  {
81
- name: "score",
82
- args: "",
83
+ name: "invariants",
84
+ args: [],
85
+ handler: runBenchmarkInvariantsCommand,
83
86
  description:
84
- "Score a single task against a post-run workdir without invoking an agent.",
87
+ "Check a single task's invariants against a post-run workdir without invoking an agent.",
85
88
  options: {
86
89
  family: {
87
90
  type: "string",
@@ -94,7 +97,7 @@ export const definition = {
94
97
  workdir: {
95
98
  type: "string",
96
99
  description:
97
- "Post-run directory; <workdir>/cwd/ is the agent CWD scoring runs against",
100
+ "Post-run directory; <workdir>/cwd/ is the agent CWD invariants run against",
98
101
  },
99
102
  output: {
100
103
  type: "string",
@@ -104,7 +107,8 @@ export const definition = {
104
107
  },
105
108
  {
106
109
  name: "report",
107
- args: "",
110
+ args: [],
111
+ handler: runBenchmarkReportCommand,
108
112
  description:
109
113
  "Aggregate result records into pass@k via the OpenAI HumanEval estimator.",
110
114
  options: {
@@ -132,7 +136,7 @@ export const definition = {
132
136
  examples: [
133
137
  "fit-benchmark run --family=./families/coding",
134
138
  "fit-benchmark run --family=./families/coding --runs=10 --agent-model=claude-sonnet-4-6",
135
- "fit-benchmark score --family=./families/coding --task=todo-api --workdir=./benchmark-runs/runs/todo-api/0",
139
+ "fit-benchmark invariants --family=./families/coding --task=todo-api --workdir=./benchmark-runs/runs/todo-api/0",
136
140
  "fit-benchmark report --format=text",
137
141
  "fit-benchmark report --input=./runs/today --k=1,3,5 --format=text",
138
142
  ],
@@ -152,35 +156,30 @@ export const definition = {
152
156
  ],
153
157
  };
154
158
 
155
- const cli = createCli(definition);
156
159
  const logger = createLogger("benchmark");
157
160
 
158
- const COMMANDS = {
159
- run: runBenchmarkRunCommand,
160
- score: runBenchmarkScoreCommand,
161
- report: runBenchmarkReportCommand,
162
- };
163
-
164
161
  async function main() {
165
- const parsed = cli.parse(process.argv.slice(2));
166
- if (!parsed) process.exit(0);
167
-
168
- const { values, positionals } = parsed;
162
+ const runtime = createDefaultRuntime();
163
+ const cli = createCli(definition, { runtime });
164
+ const parsed = cli.parse(runtime.proc.argv.slice(2));
165
+ if (!parsed) return runtime.proc.exit(0);
169
166
 
167
+ const { positionals } = parsed;
170
168
  if (positionals.length === 0) {
171
169
  cli.usageError("no command specified");
172
- process.exit(2);
170
+ return runtime.proc.exit(2);
173
171
  }
174
172
 
175
- const [command, ...args] = positionals;
176
- const handler = COMMANDS[command];
177
-
178
- if (!handler) {
173
+ const command = positionals[0];
174
+ if (!definition.commands.some((c) => c.name === command)) {
179
175
  cli.usageError(`unknown command "${command}"`);
180
- process.exit(2);
176
+ return runtime.proc.exit(2);
181
177
  }
182
178
 
183
- await handler(values, args);
179
+ const result = await cli.dispatch(parsed, { deps: { runtime } });
180
+ const envelope = result ?? { ok: true };
181
+ if (!envelope.ok && envelope.error) cli.error(envelope.error);
182
+ runtime.proc.exit(envelope.ok ? 0 : (envelope.code ?? 1));
184
183
  }
185
184
 
186
185
  // Run main only when invoked as a CLI. Importing for tests (e.g. parity)
@@ -188,7 +187,7 @@ async function main() {
188
187
  if (import.meta.url === `file://${realpathSync(process.argv[1])}`) {
189
188
  main().catch((error) => {
190
189
  logger.exception("main", error);
191
- cli.error(error.message);
190
+ createCli(definition).error(error.message);
192
191
  process.exit(1);
193
192
  });
194
193
  }
package/bin/fit-eval.js CHANGED
@@ -4,6 +4,7 @@ import "@forwardimpact/libpreflight/node22";
4
4
 
5
5
  import { readFileSync } from "node:fs";
6
6
  import { createCli } from "@forwardimpact/libcli";
7
+ import { createDefaultRuntime } from "@forwardimpact/libutil/runtime";
7
8
  import { createLogger } from "@forwardimpact/libtelemetry";
8
9
 
9
10
  import { runOutputCommand } from "../src/commands/output.js";
@@ -14,6 +15,19 @@ import { runFacilitateCommand } from "../src/commands/facilitate.js";
14
15
  import { runDiscussCommand } from "../src/commands/discuss.js";
15
16
  import { runCallbackCommand } from "../src/commands/callback.js";
16
17
 
18
+ // `tee` streams stdin→stdout via Node's `pipeline`, which needs real stream
19
+ // objects the runtime surface does not expose; it keeps the legacy
20
+ // `(values, args)` signature and this adapter bridges it into dispatch.
21
+ async function teeHandler(ctx) {
22
+ const out = ctx.args.output;
23
+ try {
24
+ await runTeeCommand(ctx.options, out ? [out] : []);
25
+ return { ok: true };
26
+ } catch (error) {
27
+ return { ok: false, code: 1, error: error.message };
28
+ }
29
+ }
30
+
17
31
  // `bun build --compile` injects FIT_EVAL_VERSION via --define, eliminating
18
32
  // the readFileSync branch in the compiled binary (which would ENOENT against
19
33
  // the bunfs virtual mount). Source execution falls through to package.json.
@@ -34,6 +48,29 @@ const LEAD_OPTIONS = {
34
48
  },
35
49
  };
36
50
 
51
+ // Shared task-input flags: --task-file (path), --task-text (inline), and
52
+ // --task-event (path to native GitHub event JSON composed into a task via
53
+ // libeval/src/events/github.js). Exactly one of the three is required.
54
+ const TASK_INPUT_OPTIONS = {
55
+ "task-file": {
56
+ type: "string",
57
+ description: "Path to a markdown task file",
58
+ },
59
+ "task-text": {
60
+ type: "string",
61
+ description: "Inline task text (alternative to --task-file)",
62
+ },
63
+ "task-event": {
64
+ type: "string",
65
+ description:
66
+ "Path to a native GitHub event payload JSON, composed into the task via libeval/src/events/github.js (reads $GITHUB_EVENT_NAME)",
67
+ },
68
+ "task-amend": {
69
+ type: "string",
70
+ description: "Additional text appended to the task",
71
+ },
72
+ };
73
+
37
74
  const definition = {
38
75
  name: "fit-eval",
39
76
  version: VERSION,
@@ -42,21 +79,12 @@ const definition = {
42
79
  commands: [
43
80
  {
44
81
  name: "run",
45
- args: "",
82
+ args: [],
83
+ argsUsage: "",
84
+ handler: runRunCommand,
46
85
  description: "Run a single agent autonomously on a defined task",
47
86
  options: {
48
- "task-file": {
49
- type: "string",
50
- description: "Path to a markdown task file",
51
- },
52
- "task-text": {
53
- type: "string",
54
- description: "Inline task text (alternative to --task-file)",
55
- },
56
- "task-amend": {
57
- type: "string",
58
- description: "Additional text appended to the task",
59
- },
87
+ ...TASK_INPUT_OPTIONS,
60
88
  "agent-model": {
61
89
  type: "string",
62
90
  description:
@@ -88,22 +116,13 @@ const definition = {
88
116
  },
89
117
  {
90
118
  name: "supervise",
91
- args: "",
119
+ args: [],
120
+ argsUsage: "",
121
+ handler: runSuperviseCommand,
92
122
  description:
93
123
  "Run a supervisor–agent relay — typical shape for agent-as-judge evaluations",
94
124
  options: {
95
- "task-file": {
96
- type: "string",
97
- description: "Path to a markdown task file",
98
- },
99
- "task-text": {
100
- type: "string",
101
- description: "Inline task text (alternative to --task-file)",
102
- },
103
- "task-amend": {
104
- type: "string",
105
- description: "Additional text appended to the task",
106
- },
125
+ ...TASK_INPUT_OPTIONS,
107
126
  "agent-model": {
108
127
  type: "string",
109
128
  description:
@@ -142,22 +161,13 @@ const definition = {
142
161
  },
143
162
  {
144
163
  name: "facilitate",
145
- args: "",
164
+ args: [],
165
+ argsUsage: "",
166
+ handler: runFacilitateCommand,
146
167
  description:
147
168
  "Run a facilitator with N participants — typical shape for multi-agent collaboration",
148
169
  options: {
149
- "task-file": {
150
- type: "string",
151
- description: "Path to a markdown task file",
152
- },
153
- "task-text": {
154
- type: "string",
155
- description: "Inline task text (alternative to --task-file)",
156
- },
157
- "task-amend": {
158
- type: "string",
159
- description: "Additional text appended to the task",
160
- },
170
+ ...TASK_INPUT_OPTIONS,
161
171
  "agent-model": {
162
172
  type: "string",
163
173
  description: "Claude model for agents (default: claude-opus-4-7[1m])",
@@ -188,22 +198,13 @@ const definition = {
188
198
  },
189
199
  {
190
200
  name: "discuss",
191
- args: "",
201
+ args: [],
202
+ argsUsage: "",
203
+ handler: runDiscussCommand,
192
204
  description:
193
205
  "Run an async, suspendable discussion — Chair + N participants + bridge callback",
194
206
  options: {
195
- "task-file": {
196
- type: "string",
197
- description: "Path to a markdown task file",
198
- },
199
- "task-text": {
200
- type: "string",
201
- description: "Inline task text (alternative to --task-file)",
202
- },
203
- "task-amend": {
204
- type: "string",
205
- description: "Additional text appended to the task",
206
- },
207
+ ...TASK_INPUT_OPTIONS,
207
208
  "agent-model": {
208
209
  type: "string",
209
210
  description: "Claude model for agents (default: claude-opus-4-7[1m])",
@@ -238,19 +239,25 @@ const definition = {
238
239
  },
239
240
  {
240
241
  name: "output",
241
- args: "",
242
+ args: [],
243
+ argsUsage: "",
244
+ handler: runOutputCommand,
242
245
  description:
243
246
  "Read NDJSON from stdin and emit a structured or readable form",
244
247
  },
245
248
  {
246
249
  name: "tee",
247
- args: "[output.ndjson]",
250
+ args: ["output"],
251
+ argsUsage: "[output.ndjson]",
252
+ handler: teeHandler,
248
253
  description:
249
254
  "Stream readable text to stdout while saving raw NDJSON to a file",
250
255
  },
251
256
  {
252
257
  name: "callback",
253
- args: "",
258
+ args: [],
259
+ argsUsage: "",
260
+ handler: runCallbackCommand,
254
261
  description:
255
262
  "Extract the terminal summary from an NDJSON trace and POST it to a callback URL",
256
263
  options: {
@@ -319,43 +326,34 @@ const definition = {
319
326
  ],
320
327
  };
321
328
 
322
- const cli = createCli(definition);
323
329
  const logger = createLogger("eval");
324
330
 
325
- const COMMANDS = {
326
- output: runOutputCommand,
327
- tee: runTeeCommand,
328
- run: runRunCommand,
329
- supervise: runSuperviseCommand,
330
- facilitate: runFacilitateCommand,
331
- discuss: runDiscussCommand,
332
- callback: runCallbackCommand,
333
- };
334
-
335
331
  async function main() {
336
- const parsed = cli.parse(process.argv.slice(2));
337
- if (!parsed) process.exit(0);
338
-
339
- const { values, positionals } = parsed;
332
+ const runtime = createDefaultRuntime();
333
+ const cli = createCli(definition, { runtime });
334
+ const parsed = cli.parse(runtime.proc.argv.slice(2));
335
+ if (!parsed) return runtime.proc.exit(0);
340
336
 
337
+ const { positionals } = parsed;
341
338
  if (positionals.length === 0) {
342
339
  cli.usageError("no command specified");
343
- process.exit(2);
340
+ return runtime.proc.exit(2);
344
341
  }
345
342
 
346
- const [command, ...args] = positionals;
347
- const handler = COMMANDS[command];
348
-
349
- if (!handler) {
343
+ const command = positionals[0];
344
+ if (!definition.commands.some((c) => c.name === command)) {
350
345
  cli.usageError(`unknown command "${command}"`);
351
- process.exit(2);
346
+ return runtime.proc.exit(2);
352
347
  }
353
348
 
354
- await handler(values, args);
349
+ const result = await cli.dispatch(parsed, { deps: { runtime } });
350
+ const envelope = result ?? { ok: true };
351
+ if (!envelope.ok && envelope.error) cli.error(envelope.error);
352
+ runtime.proc.exit(envelope.ok ? 0 : (envelope.code ?? 1));
355
353
  }
356
354
 
357
355
  main().catch((error) => {
358
356
  logger.exception("main", error);
359
- cli.error(error.message);
357
+ createCli(definition).error(error.message);
360
358
  process.exit(1);
361
359
  });
package/bin/fit-trace.js CHANGED
@@ -4,6 +4,7 @@ import "@forwardimpact/libpreflight/node22";
4
4
 
5
5
  import { readFileSync } from "node:fs";
6
6
  import { createCli } from "@forwardimpact/libcli";
7
+ import { createDefaultRuntime } from "@forwardimpact/libutil/runtime";
7
8
  import { createScriptConfig } from "@forwardimpact/libconfig";
8
9
  import { createLogger } from "@forwardimpact/libtelemetry";
9
10
 
@@ -46,7 +47,9 @@ const definition = {
46
47
  commands: [
47
48
  {
48
49
  name: "runs",
49
- args: "[pattern]",
50
+ args: ["pattern"],
51
+ argsUsage: "[pattern]",
52
+ handler: runRunsCommand,
50
53
  description:
51
54
  "List recent GitHub Actions workflow runs (default pattern: agent)",
52
55
  options: {
@@ -63,7 +66,9 @@ const definition = {
63
66
  },
64
67
  {
65
68
  name: "download",
66
- args: "<run-id>",
69
+ args: ["run-id"],
70
+ argsUsage: "<run-id>",
71
+ handler: runDownloadCommand,
67
72
  description: "Download trace artifact and convert to structured JSON",
68
73
  options: {
69
74
  dir: { type: "string", description: "Output directory" },
@@ -77,32 +82,44 @@ const definition = {
77
82
  },
78
83
  {
79
84
  name: "overview",
80
- args: "<file>",
85
+ args: ["file"],
86
+ argsUsage: "<file>",
87
+ handler: runOverviewCommand,
81
88
  description: "Metadata, summary, turn count, tool frequency",
82
89
  },
83
90
  {
84
91
  name: "count",
85
- args: "<file>",
92
+ args: ["file"],
93
+ argsUsage: "<file>",
94
+ handler: runCountCommand,
86
95
  description: "Number of turns",
87
96
  },
88
97
  {
89
98
  name: "batch",
90
- args: "<file> <from> <to>",
99
+ args: ["file", "from", "to"],
100
+ argsUsage: "<file> <from> <to>",
101
+ handler: runBatchCommand,
91
102
  description: "Turns in range [from, to) (zero-indexed)",
92
103
  },
93
104
  {
94
105
  name: "head",
95
- args: "<file> [N]",
106
+ args: ["file", "n"],
107
+ argsUsage: "<file> [N]",
108
+ handler: runHeadCommand,
96
109
  description: "First N turns (default 10)",
97
110
  },
98
111
  {
99
112
  name: "tail",
100
- args: "<file> [N]",
113
+ args: ["file", "n"],
114
+ argsUsage: "<file> [N]",
115
+ handler: runTailCommand,
101
116
  description: "Last N turns (default 10)",
102
117
  },
103
118
  {
104
119
  name: "search",
105
- args: "<file> <pattern>",
120
+ args: ["file", "pattern"],
121
+ argsUsage: "<file> <pattern>",
122
+ handler: runSearchCommand,
106
123
  description: "Search all content for regex pattern",
107
124
  options: {
108
125
  limit: {
@@ -121,22 +138,30 @@ const definition = {
121
138
  },
122
139
  {
123
140
  name: "tools",
124
- args: "<file>",
141
+ args: ["file"],
142
+ argsUsage: "<file>",
143
+ handler: runToolsCommand,
125
144
  description: "Tool usage frequency (descending)",
126
145
  },
127
146
  {
128
147
  name: "tool",
129
- args: "<file> <name>",
148
+ args: ["file", "name"],
149
+ argsUsage: "<file> <name>",
150
+ handler: runToolCommand,
130
151
  description: "All turns involving a specific tool",
131
152
  },
132
153
  {
133
154
  name: "errors",
134
- args: "<file>",
155
+ args: ["file"],
156
+ argsUsage: "<file>",
157
+ handler: runErrorsCommand,
135
158
  description: "Tool results with isError=true",
136
159
  },
137
160
  {
138
161
  name: "reasoning",
139
- args: "<file>",
162
+ args: ["file"],
163
+ argsUsage: "<file>",
164
+ handler: runReasoningCommand,
140
165
  description: "Agent reasoning text only",
141
166
  options: {
142
167
  from: { type: "string", description: "Start at turn index" },
@@ -145,27 +170,37 @@ const definition = {
145
170
  },
146
171
  {
147
172
  name: "timeline",
148
- args: "<file>",
173
+ args: ["file"],
174
+ argsUsage: "<file>",
175
+ handler: runTimelineCommand,
149
176
  description: "Compact one-line-per-turn overview",
150
177
  },
151
178
  {
152
179
  name: "stats",
153
- args: "<file>",
180
+ args: ["file"],
181
+ argsUsage: "<file>",
182
+ handler: runStatsCommand,
154
183
  description: "Token usage and cost breakdown",
155
184
  },
156
185
  {
157
186
  name: "init",
158
- args: "<file>",
187
+ args: ["file"],
188
+ argsUsage: "<file>",
189
+ handler: runInitCommand,
159
190
  description: "Full system/init event",
160
191
  },
161
192
  {
162
193
  name: "turn",
163
- args: "<file> <index>",
194
+ args: ["file", "index"],
195
+ argsUsage: "<file> <index>",
196
+ handler: runTurnCommand,
164
197
  description: "Single turn by index",
165
198
  },
166
199
  {
167
200
  name: "by-discussion",
168
- args: "<discussion-id> [trace-dir]",
201
+ args: ["discussion-id", "trace-dir"],
202
+ argsUsage: "<discussion-id> [trace-dir]",
203
+ handler: runByDiscussionCommand,
169
204
  description:
170
205
  "List trace files whose meta header carries the given discussion_id, ordered by first-event timestamp",
171
206
  options: {
@@ -177,7 +212,9 @@ const definition = {
177
212
  },
178
213
  {
179
214
  name: "filter",
180
- args: "<file>",
215
+ args: ["file"],
216
+ argsUsage: "<file>",
217
+ handler: runFilterCommand,
181
218
  description: "Filter turns by role, tool, or error status",
182
219
  options: {
183
220
  role: {
@@ -196,7 +233,9 @@ const definition = {
196
233
  },
197
234
  {
198
235
  name: "split",
199
- args: "<file>",
236
+ args: ["file"],
237
+ argsUsage: "<file>",
238
+ handler: runSplitCommand,
200
239
  description:
201
240
  "Split a combined trace into per-source files following the `trace--<case>--<participant>.<role>.ndjson` convention",
202
241
  options: {
@@ -217,9 +256,11 @@ const definition = {
217
256
  },
218
257
  {
219
258
  name: "assert",
220
- args: "<test-name> <file>",
259
+ args: ["test-name", "file"],
260
+ argsUsage: "<test-name> <file>",
261
+ handler: runAssertCommand,
221
262
  description:
222
- "Shell-friendly assertion — outputs structured JSON for scoring hooks",
263
+ "Shell-friendly assertion — outputs structured JSON for invariant hooks",
223
264
  options: {
224
265
  grep: {
225
266
  type: "string",
@@ -299,57 +340,42 @@ const definition = {
299
340
  ],
300
341
  };
301
342
 
302
- const cli = createCli(definition);
303
343
  const logger = createLogger("trace");
304
344
 
305
- const COMMANDS = {
306
- runs: runRunsCommand,
307
- download: runDownloadCommand,
308
- overview: runOverviewCommand,
309
- count: runCountCommand,
310
- batch: runBatchCommand,
311
- head: runHeadCommand,
312
- tail: runTailCommand,
313
- search: runSearchCommand,
314
- tools: runToolsCommand,
315
- tool: runToolCommand,
316
- errors: runErrorsCommand,
317
- reasoning: runReasoningCommand,
318
- timeline: runTimelineCommand,
319
- stats: runStatsCommand,
320
- init: runInitCommand,
321
- turn: runTurnCommand,
322
- filter: runFilterCommand,
323
- split: runSplitCommand,
324
- assert: runAssertCommand,
325
- "by-discussion": runByDiscussionCommand,
326
- };
345
+ // Commands that talk to the GitHub API need a config-backed token resolver;
346
+ // the rest only read local trace files through the runtime.
347
+ const NEEDS_CONFIG = new Set(["runs", "download"]);
327
348
 
328
349
  async function main() {
329
- const parsed = cli.parse(process.argv.slice(2));
330
- if (!parsed) process.exit(0);
331
-
332
- const { values, positionals } = parsed;
350
+ const runtime = createDefaultRuntime();
351
+ const cli = createCli(definition, { runtime });
352
+ const parsed = cli.parse(runtime.proc.argv.slice(2));
353
+ if (!parsed) return runtime.proc.exit(0);
333
354
 
355
+ const { positionals } = parsed;
334
356
  if (positionals.length === 0) {
335
357
  cli.usageError("no command specified");
336
- process.exit(2);
358
+ return runtime.proc.exit(2);
337
359
  }
338
360
 
339
- const [command, ...args] = positionals;
340
- const handler = COMMANDS[command];
341
-
342
- if (!handler) {
361
+ const command = positionals[0];
362
+ if (!definition.commands.some((c) => c.name === command)) {
343
363
  cli.usageError(`unknown command "${command}"`);
344
- process.exit(2);
364
+ return runtime.proc.exit(2);
345
365
  }
346
366
 
347
- const config = await createScriptConfig("eval");
348
- await handler(values, args, { config });
367
+ const config = NEEDS_CONFIG.has(command)
368
+ ? await createScriptConfig("eval")
369
+ : undefined;
370
+
371
+ const result = await cli.dispatch(parsed, { deps: { runtime, config } });
372
+ const envelope = result ?? { ok: true };
373
+ if (!envelope.ok && envelope.error) cli.error(envelope.error);
374
+ runtime.proc.exit(envelope.ok ? 0 : (envelope.code ?? 1));
349
375
  }
350
376
 
351
377
  main().catch((error) => {
352
378
  logger.exception("main", error);
353
- cli.error(error.message);
379
+ createCli(definition).error(error.message);
354
380
  process.exit(1);
355
381
  });