@forwardimpact/libeval 0.1.49 → 0.1.51
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +11 -8
- package/bin/fit-benchmark.js +26 -27
- package/bin/fit-eval.js +76 -78
- package/bin/fit-trace.js +83 -57
- package/package.json +2 -2
- package/src/agent-runner.js +23 -13
- package/src/benchmark/env-loader.js +35 -23
- package/src/benchmark/{scorer.js → invariants.js} +14 -12
- package/src/benchmark/judge.js +5 -8
- package/src/benchmark/npm-installer.js +87 -0
- package/src/benchmark/report.js +15 -15
- package/src/benchmark/result.js +11 -11
- package/src/benchmark/runner.js +17 -11
- package/src/benchmark/task-family.js +6 -4
- package/src/benchmark/workdir.js +23 -3
- package/src/commands/assert.js +30 -22
- package/src/commands/benchmark-invariants.js +74 -0
- package/src/commands/benchmark-report.js +23 -15
- package/src/commands/benchmark-run.js +22 -7
- package/src/commands/by-discussion.js +29 -18
- package/src/commands/callback.js +20 -11
- package/src/commands/discuss.js +30 -21
- package/src/commands/facilitate.js +20 -21
- package/src/commands/output.js +11 -12
- package/src/commands/run.js +24 -21
- package/src/commands/supervise.js +27 -27
- package/src/commands/task-input.js +54 -0
- package/src/commands/trace.js +174 -97
- package/src/discuss-tools.js +48 -2
- package/src/discusser.js +49 -2
- package/src/events/github.js +155 -0
- package/src/inbox-poller.js +84 -0
- package/src/index.js +10 -0
- package/src/judge.js +1 -1
- package/src/message-bus.js +6 -0
- package/src/orchestration-loop.js +19 -5
- package/src/orchestration-toolkit.js +14 -0
- package/src/redaction.js +31 -9
- package/src/reply-emitter.js +47 -0
- package/src/commands/benchmark-score.js +0 -68
package/README.md
CHANGED
|
@@ -71,11 +71,12 @@ while participants work in parallel — nothing blocks the LLM thread.
|
|
|
71
71
|
|
|
72
72
|
### Discuss-mode replies
|
|
73
73
|
|
|
74
|
-
In discussion mode, Answer calls routed to the lead are
|
|
75
|
-
thread
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
74
|
+
In discussion mode, Answer calls routed to the lead are streamed to
|
|
75
|
+
the discussion thread as they are produced — each agent's Answer becomes
|
|
76
|
+
a separate reply posted immediately, not batched at session end. The
|
|
77
|
+
lead and agents can also call `Acknowledge` to post brief messages
|
|
78
|
+
directly to the thread (status updates, human follow-up responses).
|
|
79
|
+
The message bus intercepts answers and appends them to `ctx.replies[]`.
|
|
79
80
|
|
|
80
81
|
`RequestForComment` is a separate coordination tool available on agent
|
|
81
82
|
roles (facilitated agents and discuss agents). It queues an intent to
|
|
@@ -104,8 +105,8 @@ only feeds the summary's `success`/`verdict`.
|
|
|
104
105
|
| Fac. agent | ✓ | ✓ | ✓ | ✓ | | `RequestForComment` |
|
|
105
106
|
| Supervisor | ✓ | ✓ | ✓ | ✓ | ✓ | |
|
|
106
107
|
| Sup. agent | ✓ | ✓ | ✓ | ✓ | | |
|
|
107
|
-
| Discuss lead | ✓ | ✓ | ✓ | ✓ | | `Recess`, `Adjourn`
|
|
108
|
-
| Discuss agt | ✓ | ✓ | ✓ | ✓ | | `RequestForComment`
|
|
108
|
+
| Discuss lead | ✓ | ✓ | ✓ | ✓ | | `Recess`, `Adjourn`, `Acknowledge` |
|
|
109
|
+
| Discuss agt | ✓ | ✓ | ✓ | ✓ | | `RequestForComment`, `Acknowledge` |
|
|
109
110
|
| Judge | | | | | ✓ | |
|
|
110
111
|
|
|
111
112
|
Ask's `to` accepts a participant name on multi-participant roles
|
|
@@ -169,7 +170,9 @@ downloadable through retention.
|
|
|
169
170
|
| `orchestration-toolkit.js` | Shared Ask/Answer/Announce/Conclude/RollCall/RequestForComment handlers + builders. |
|
|
170
171
|
| `orchestration-loop.js` | Unified lead+participant loop; reminder/violation handling. |
|
|
171
172
|
| `facilitator.js` / `supervisor.js` / `discusser.js` / `judge.js` | Per-mode class + factory + system prompt. |
|
|
172
|
-
| `discuss-tools.js` | Discuss-only `Recess`/`Adjourn`.
|
|
173
|
+
| `discuss-tools.js` | Discuss-only `Recess`/`Adjourn`/`Acknowledge`. |
|
|
174
|
+
| `reply-emitter.js` | Fire-and-forget POST of reply/ack events to the callback URL. |
|
|
175
|
+
| `inbox-poller.js` | Long-poll the bridge inbox for injected human messages. |
|
|
173
176
|
| `trace-collector.js` / `trace-query.js` / `trace-github.js` | Trace ingestion / querying / GitHub-attachment helpers. |
|
|
174
177
|
| `redaction.js` | Env-var allowlist + credential-shape pattern redaction. |
|
|
175
178
|
|
package/bin/fit-benchmark.js
CHANGED
|
@@ -4,10 +4,11 @@ import "@forwardimpact/libpreflight/node22";
|
|
|
4
4
|
|
|
5
5
|
import { readFileSync, realpathSync } from "node:fs";
|
|
6
6
|
import { createCli } from "@forwardimpact/libcli";
|
|
7
|
+
import { createDefaultRuntime } from "@forwardimpact/libutil/runtime";
|
|
7
8
|
import { createLogger } from "@forwardimpact/libtelemetry";
|
|
8
9
|
|
|
9
10
|
import { runBenchmarkRunCommand } from "../src/commands/benchmark-run.js";
|
|
10
|
-
import {
|
|
11
|
+
import { runBenchmarkInvariantsCommand } from "../src/commands/benchmark-invariants.js";
|
|
11
12
|
import { runBenchmarkReportCommand } from "../src/commands/benchmark-report.js";
|
|
12
13
|
|
|
13
14
|
// `bun build --compile` injects FIT_BENCHMARK_VERSION via --define, eliminating
|
|
@@ -26,7 +27,8 @@ export const definition = {
|
|
|
26
27
|
commands: [
|
|
27
28
|
{
|
|
28
29
|
name: "run",
|
|
29
|
-
args:
|
|
30
|
+
args: [],
|
|
31
|
+
handler: runBenchmarkRunCommand,
|
|
30
32
|
description:
|
|
31
33
|
"Run every task in a family for N runs and emit one result record per (task, runIndex).",
|
|
32
34
|
options: {
|
|
@@ -78,10 +80,11 @@ export const definition = {
|
|
|
78
80
|
},
|
|
79
81
|
},
|
|
80
82
|
{
|
|
81
|
-
name: "
|
|
82
|
-
args:
|
|
83
|
+
name: "invariants",
|
|
84
|
+
args: [],
|
|
85
|
+
handler: runBenchmarkInvariantsCommand,
|
|
83
86
|
description:
|
|
84
|
-
"
|
|
87
|
+
"Check a single task's invariants against a post-run workdir without invoking an agent.",
|
|
85
88
|
options: {
|
|
86
89
|
family: {
|
|
87
90
|
type: "string",
|
|
@@ -94,7 +97,7 @@ export const definition = {
|
|
|
94
97
|
workdir: {
|
|
95
98
|
type: "string",
|
|
96
99
|
description:
|
|
97
|
-
"Post-run directory; <workdir>/cwd/ is the agent CWD
|
|
100
|
+
"Post-run directory; <workdir>/cwd/ is the agent CWD invariants run against",
|
|
98
101
|
},
|
|
99
102
|
output: {
|
|
100
103
|
type: "string",
|
|
@@ -104,7 +107,8 @@ export const definition = {
|
|
|
104
107
|
},
|
|
105
108
|
{
|
|
106
109
|
name: "report",
|
|
107
|
-
args:
|
|
110
|
+
args: [],
|
|
111
|
+
handler: runBenchmarkReportCommand,
|
|
108
112
|
description:
|
|
109
113
|
"Aggregate result records into pass@k via the OpenAI HumanEval estimator.",
|
|
110
114
|
options: {
|
|
@@ -132,7 +136,7 @@ export const definition = {
|
|
|
132
136
|
examples: [
|
|
133
137
|
"fit-benchmark run --family=./families/coding",
|
|
134
138
|
"fit-benchmark run --family=./families/coding --runs=10 --agent-model=claude-sonnet-4-6",
|
|
135
|
-
"fit-benchmark
|
|
139
|
+
"fit-benchmark invariants --family=./families/coding --task=todo-api --workdir=./benchmark-runs/runs/todo-api/0",
|
|
136
140
|
"fit-benchmark report --format=text",
|
|
137
141
|
"fit-benchmark report --input=./runs/today --k=1,3,5 --format=text",
|
|
138
142
|
],
|
|
@@ -152,35 +156,30 @@ export const definition = {
|
|
|
152
156
|
],
|
|
153
157
|
};
|
|
154
158
|
|
|
155
|
-
const cli = createCli(definition);
|
|
156
159
|
const logger = createLogger("benchmark");
|
|
157
160
|
|
|
158
|
-
const COMMANDS = {
|
|
159
|
-
run: runBenchmarkRunCommand,
|
|
160
|
-
score: runBenchmarkScoreCommand,
|
|
161
|
-
report: runBenchmarkReportCommand,
|
|
162
|
-
};
|
|
163
|
-
|
|
164
161
|
async function main() {
|
|
165
|
-
const
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
162
|
+
const runtime = createDefaultRuntime();
|
|
163
|
+
const cli = createCli(definition, { runtime });
|
|
164
|
+
const parsed = cli.parse(runtime.proc.argv.slice(2));
|
|
165
|
+
if (!parsed) return runtime.proc.exit(0);
|
|
169
166
|
|
|
167
|
+
const { positionals } = parsed;
|
|
170
168
|
if (positionals.length === 0) {
|
|
171
169
|
cli.usageError("no command specified");
|
|
172
|
-
|
|
170
|
+
return runtime.proc.exit(2);
|
|
173
171
|
}
|
|
174
172
|
|
|
175
|
-
const
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
if (!handler) {
|
|
173
|
+
const command = positionals[0];
|
|
174
|
+
if (!definition.commands.some((c) => c.name === command)) {
|
|
179
175
|
cli.usageError(`unknown command "${command}"`);
|
|
180
|
-
|
|
176
|
+
return runtime.proc.exit(2);
|
|
181
177
|
}
|
|
182
178
|
|
|
183
|
-
await
|
|
179
|
+
const result = await cli.dispatch(parsed, { deps: { runtime } });
|
|
180
|
+
const envelope = result ?? { ok: true };
|
|
181
|
+
if (!envelope.ok && envelope.error) cli.error(envelope.error);
|
|
182
|
+
runtime.proc.exit(envelope.ok ? 0 : (envelope.code ?? 1));
|
|
184
183
|
}
|
|
185
184
|
|
|
186
185
|
// Run main only when invoked as a CLI. Importing for tests (e.g. parity)
|
|
@@ -188,7 +187,7 @@ async function main() {
|
|
|
188
187
|
if (import.meta.url === `file://${realpathSync(process.argv[1])}`) {
|
|
189
188
|
main().catch((error) => {
|
|
190
189
|
logger.exception("main", error);
|
|
191
|
-
|
|
190
|
+
createCli(definition).error(error.message);
|
|
192
191
|
process.exit(1);
|
|
193
192
|
});
|
|
194
193
|
}
|
package/bin/fit-eval.js
CHANGED
|
@@ -4,6 +4,7 @@ import "@forwardimpact/libpreflight/node22";
|
|
|
4
4
|
|
|
5
5
|
import { readFileSync } from "node:fs";
|
|
6
6
|
import { createCli } from "@forwardimpact/libcli";
|
|
7
|
+
import { createDefaultRuntime } from "@forwardimpact/libutil/runtime";
|
|
7
8
|
import { createLogger } from "@forwardimpact/libtelemetry";
|
|
8
9
|
|
|
9
10
|
import { runOutputCommand } from "../src/commands/output.js";
|
|
@@ -14,6 +15,19 @@ import { runFacilitateCommand } from "../src/commands/facilitate.js";
|
|
|
14
15
|
import { runDiscussCommand } from "../src/commands/discuss.js";
|
|
15
16
|
import { runCallbackCommand } from "../src/commands/callback.js";
|
|
16
17
|
|
|
18
|
+
// `tee` streams stdin→stdout via Node's `pipeline`, which needs real stream
|
|
19
|
+
// objects the runtime surface does not expose; it keeps the legacy
|
|
20
|
+
// `(values, args)` signature and this adapter bridges it into dispatch.
|
|
21
|
+
async function teeHandler(ctx) {
|
|
22
|
+
const out = ctx.args.output;
|
|
23
|
+
try {
|
|
24
|
+
await runTeeCommand(ctx.options, out ? [out] : []);
|
|
25
|
+
return { ok: true };
|
|
26
|
+
} catch (error) {
|
|
27
|
+
return { ok: false, code: 1, error: error.message };
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
|
|
17
31
|
// `bun build --compile` injects FIT_EVAL_VERSION via --define, eliminating
|
|
18
32
|
// the readFileSync branch in the compiled binary (which would ENOENT against
|
|
19
33
|
// the bunfs virtual mount). Source execution falls through to package.json.
|
|
@@ -34,6 +48,29 @@ const LEAD_OPTIONS = {
|
|
|
34
48
|
},
|
|
35
49
|
};
|
|
36
50
|
|
|
51
|
+
// Shared task-input flags: --task-file (path), --task-text (inline), and
|
|
52
|
+
// --task-event (path to native GitHub event JSON composed into a task via
|
|
53
|
+
// libeval/src/events/github.js). Exactly one of the three is required.
|
|
54
|
+
const TASK_INPUT_OPTIONS = {
|
|
55
|
+
"task-file": {
|
|
56
|
+
type: "string",
|
|
57
|
+
description: "Path to a markdown task file",
|
|
58
|
+
},
|
|
59
|
+
"task-text": {
|
|
60
|
+
type: "string",
|
|
61
|
+
description: "Inline task text (alternative to --task-file)",
|
|
62
|
+
},
|
|
63
|
+
"task-event": {
|
|
64
|
+
type: "string",
|
|
65
|
+
description:
|
|
66
|
+
"Path to a native GitHub event payload JSON, composed into the task via libeval/src/events/github.js (reads $GITHUB_EVENT_NAME)",
|
|
67
|
+
},
|
|
68
|
+
"task-amend": {
|
|
69
|
+
type: "string",
|
|
70
|
+
description: "Additional text appended to the task",
|
|
71
|
+
},
|
|
72
|
+
};
|
|
73
|
+
|
|
37
74
|
const definition = {
|
|
38
75
|
name: "fit-eval",
|
|
39
76
|
version: VERSION,
|
|
@@ -42,21 +79,12 @@ const definition = {
|
|
|
42
79
|
commands: [
|
|
43
80
|
{
|
|
44
81
|
name: "run",
|
|
45
|
-
args:
|
|
82
|
+
args: [],
|
|
83
|
+
argsUsage: "",
|
|
84
|
+
handler: runRunCommand,
|
|
46
85
|
description: "Run a single agent autonomously on a defined task",
|
|
47
86
|
options: {
|
|
48
|
-
|
|
49
|
-
type: "string",
|
|
50
|
-
description: "Path to a markdown task file",
|
|
51
|
-
},
|
|
52
|
-
"task-text": {
|
|
53
|
-
type: "string",
|
|
54
|
-
description: "Inline task text (alternative to --task-file)",
|
|
55
|
-
},
|
|
56
|
-
"task-amend": {
|
|
57
|
-
type: "string",
|
|
58
|
-
description: "Additional text appended to the task",
|
|
59
|
-
},
|
|
87
|
+
...TASK_INPUT_OPTIONS,
|
|
60
88
|
"agent-model": {
|
|
61
89
|
type: "string",
|
|
62
90
|
description:
|
|
@@ -88,22 +116,13 @@ const definition = {
|
|
|
88
116
|
},
|
|
89
117
|
{
|
|
90
118
|
name: "supervise",
|
|
91
|
-
args:
|
|
119
|
+
args: [],
|
|
120
|
+
argsUsage: "",
|
|
121
|
+
handler: runSuperviseCommand,
|
|
92
122
|
description:
|
|
93
123
|
"Run a supervisor–agent relay — typical shape for agent-as-judge evaluations",
|
|
94
124
|
options: {
|
|
95
|
-
|
|
96
|
-
type: "string",
|
|
97
|
-
description: "Path to a markdown task file",
|
|
98
|
-
},
|
|
99
|
-
"task-text": {
|
|
100
|
-
type: "string",
|
|
101
|
-
description: "Inline task text (alternative to --task-file)",
|
|
102
|
-
},
|
|
103
|
-
"task-amend": {
|
|
104
|
-
type: "string",
|
|
105
|
-
description: "Additional text appended to the task",
|
|
106
|
-
},
|
|
125
|
+
...TASK_INPUT_OPTIONS,
|
|
107
126
|
"agent-model": {
|
|
108
127
|
type: "string",
|
|
109
128
|
description:
|
|
@@ -142,22 +161,13 @@ const definition = {
|
|
|
142
161
|
},
|
|
143
162
|
{
|
|
144
163
|
name: "facilitate",
|
|
145
|
-
args:
|
|
164
|
+
args: [],
|
|
165
|
+
argsUsage: "",
|
|
166
|
+
handler: runFacilitateCommand,
|
|
146
167
|
description:
|
|
147
168
|
"Run a facilitator with N participants — typical shape for multi-agent collaboration",
|
|
148
169
|
options: {
|
|
149
|
-
|
|
150
|
-
type: "string",
|
|
151
|
-
description: "Path to a markdown task file",
|
|
152
|
-
},
|
|
153
|
-
"task-text": {
|
|
154
|
-
type: "string",
|
|
155
|
-
description: "Inline task text (alternative to --task-file)",
|
|
156
|
-
},
|
|
157
|
-
"task-amend": {
|
|
158
|
-
type: "string",
|
|
159
|
-
description: "Additional text appended to the task",
|
|
160
|
-
},
|
|
170
|
+
...TASK_INPUT_OPTIONS,
|
|
161
171
|
"agent-model": {
|
|
162
172
|
type: "string",
|
|
163
173
|
description: "Claude model for agents (default: claude-opus-4-7[1m])",
|
|
@@ -188,22 +198,13 @@ const definition = {
|
|
|
188
198
|
},
|
|
189
199
|
{
|
|
190
200
|
name: "discuss",
|
|
191
|
-
args:
|
|
201
|
+
args: [],
|
|
202
|
+
argsUsage: "",
|
|
203
|
+
handler: runDiscussCommand,
|
|
192
204
|
description:
|
|
193
205
|
"Run an async, suspendable discussion — Chair + N participants + bridge callback",
|
|
194
206
|
options: {
|
|
195
|
-
|
|
196
|
-
type: "string",
|
|
197
|
-
description: "Path to a markdown task file",
|
|
198
|
-
},
|
|
199
|
-
"task-text": {
|
|
200
|
-
type: "string",
|
|
201
|
-
description: "Inline task text (alternative to --task-file)",
|
|
202
|
-
},
|
|
203
|
-
"task-amend": {
|
|
204
|
-
type: "string",
|
|
205
|
-
description: "Additional text appended to the task",
|
|
206
|
-
},
|
|
207
|
+
...TASK_INPUT_OPTIONS,
|
|
207
208
|
"agent-model": {
|
|
208
209
|
type: "string",
|
|
209
210
|
description: "Claude model for agents (default: claude-opus-4-7[1m])",
|
|
@@ -238,19 +239,25 @@ const definition = {
|
|
|
238
239
|
},
|
|
239
240
|
{
|
|
240
241
|
name: "output",
|
|
241
|
-
args:
|
|
242
|
+
args: [],
|
|
243
|
+
argsUsage: "",
|
|
244
|
+
handler: runOutputCommand,
|
|
242
245
|
description:
|
|
243
246
|
"Read NDJSON from stdin and emit a structured or readable form",
|
|
244
247
|
},
|
|
245
248
|
{
|
|
246
249
|
name: "tee",
|
|
247
|
-
args: "
|
|
250
|
+
args: ["output"],
|
|
251
|
+
argsUsage: "[output.ndjson]",
|
|
252
|
+
handler: teeHandler,
|
|
248
253
|
description:
|
|
249
254
|
"Stream readable text to stdout while saving raw NDJSON to a file",
|
|
250
255
|
},
|
|
251
256
|
{
|
|
252
257
|
name: "callback",
|
|
253
|
-
args:
|
|
258
|
+
args: [],
|
|
259
|
+
argsUsage: "",
|
|
260
|
+
handler: runCallbackCommand,
|
|
254
261
|
description:
|
|
255
262
|
"Extract the terminal summary from an NDJSON trace and POST it to a callback URL",
|
|
256
263
|
options: {
|
|
@@ -319,43 +326,34 @@ const definition = {
|
|
|
319
326
|
],
|
|
320
327
|
};
|
|
321
328
|
|
|
322
|
-
const cli = createCli(definition);
|
|
323
329
|
const logger = createLogger("eval");
|
|
324
330
|
|
|
325
|
-
const COMMANDS = {
|
|
326
|
-
output: runOutputCommand,
|
|
327
|
-
tee: runTeeCommand,
|
|
328
|
-
run: runRunCommand,
|
|
329
|
-
supervise: runSuperviseCommand,
|
|
330
|
-
facilitate: runFacilitateCommand,
|
|
331
|
-
discuss: runDiscussCommand,
|
|
332
|
-
callback: runCallbackCommand,
|
|
333
|
-
};
|
|
334
|
-
|
|
335
331
|
async function main() {
|
|
336
|
-
const
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
332
|
+
const runtime = createDefaultRuntime();
|
|
333
|
+
const cli = createCli(definition, { runtime });
|
|
334
|
+
const parsed = cli.parse(runtime.proc.argv.slice(2));
|
|
335
|
+
if (!parsed) return runtime.proc.exit(0);
|
|
340
336
|
|
|
337
|
+
const { positionals } = parsed;
|
|
341
338
|
if (positionals.length === 0) {
|
|
342
339
|
cli.usageError("no command specified");
|
|
343
|
-
|
|
340
|
+
return runtime.proc.exit(2);
|
|
344
341
|
}
|
|
345
342
|
|
|
346
|
-
const
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
if (!handler) {
|
|
343
|
+
const command = positionals[0];
|
|
344
|
+
if (!definition.commands.some((c) => c.name === command)) {
|
|
350
345
|
cli.usageError(`unknown command "${command}"`);
|
|
351
|
-
|
|
346
|
+
return runtime.proc.exit(2);
|
|
352
347
|
}
|
|
353
348
|
|
|
354
|
-
await
|
|
349
|
+
const result = await cli.dispatch(parsed, { deps: { runtime } });
|
|
350
|
+
const envelope = result ?? { ok: true };
|
|
351
|
+
if (!envelope.ok && envelope.error) cli.error(envelope.error);
|
|
352
|
+
runtime.proc.exit(envelope.ok ? 0 : (envelope.code ?? 1));
|
|
355
353
|
}
|
|
356
354
|
|
|
357
355
|
main().catch((error) => {
|
|
358
356
|
logger.exception("main", error);
|
|
359
|
-
|
|
357
|
+
createCli(definition).error(error.message);
|
|
360
358
|
process.exit(1);
|
|
361
359
|
});
|
package/bin/fit-trace.js
CHANGED
|
@@ -4,6 +4,7 @@ import "@forwardimpact/libpreflight/node22";
|
|
|
4
4
|
|
|
5
5
|
import { readFileSync } from "node:fs";
|
|
6
6
|
import { createCli } from "@forwardimpact/libcli";
|
|
7
|
+
import { createDefaultRuntime } from "@forwardimpact/libutil/runtime";
|
|
7
8
|
import { createScriptConfig } from "@forwardimpact/libconfig";
|
|
8
9
|
import { createLogger } from "@forwardimpact/libtelemetry";
|
|
9
10
|
|
|
@@ -46,7 +47,9 @@ const definition = {
|
|
|
46
47
|
commands: [
|
|
47
48
|
{
|
|
48
49
|
name: "runs",
|
|
49
|
-
args: "
|
|
50
|
+
args: ["pattern"],
|
|
51
|
+
argsUsage: "[pattern]",
|
|
52
|
+
handler: runRunsCommand,
|
|
50
53
|
description:
|
|
51
54
|
"List recent GitHub Actions workflow runs (default pattern: agent)",
|
|
52
55
|
options: {
|
|
@@ -63,7 +66,9 @@ const definition = {
|
|
|
63
66
|
},
|
|
64
67
|
{
|
|
65
68
|
name: "download",
|
|
66
|
-
args: "
|
|
69
|
+
args: ["run-id"],
|
|
70
|
+
argsUsage: "<run-id>",
|
|
71
|
+
handler: runDownloadCommand,
|
|
67
72
|
description: "Download trace artifact and convert to structured JSON",
|
|
68
73
|
options: {
|
|
69
74
|
dir: { type: "string", description: "Output directory" },
|
|
@@ -77,32 +82,44 @@ const definition = {
|
|
|
77
82
|
},
|
|
78
83
|
{
|
|
79
84
|
name: "overview",
|
|
80
|
-
args: "
|
|
85
|
+
args: ["file"],
|
|
86
|
+
argsUsage: "<file>",
|
|
87
|
+
handler: runOverviewCommand,
|
|
81
88
|
description: "Metadata, summary, turn count, tool frequency",
|
|
82
89
|
},
|
|
83
90
|
{
|
|
84
91
|
name: "count",
|
|
85
|
-
args: "
|
|
92
|
+
args: ["file"],
|
|
93
|
+
argsUsage: "<file>",
|
|
94
|
+
handler: runCountCommand,
|
|
86
95
|
description: "Number of turns",
|
|
87
96
|
},
|
|
88
97
|
{
|
|
89
98
|
name: "batch",
|
|
90
|
-
args: "
|
|
99
|
+
args: ["file", "from", "to"],
|
|
100
|
+
argsUsage: "<file> <from> <to>",
|
|
101
|
+
handler: runBatchCommand,
|
|
91
102
|
description: "Turns in range [from, to) (zero-indexed)",
|
|
92
103
|
},
|
|
93
104
|
{
|
|
94
105
|
name: "head",
|
|
95
|
-
args: "
|
|
106
|
+
args: ["file", "n"],
|
|
107
|
+
argsUsage: "<file> [N]",
|
|
108
|
+
handler: runHeadCommand,
|
|
96
109
|
description: "First N turns (default 10)",
|
|
97
110
|
},
|
|
98
111
|
{
|
|
99
112
|
name: "tail",
|
|
100
|
-
args: "
|
|
113
|
+
args: ["file", "n"],
|
|
114
|
+
argsUsage: "<file> [N]",
|
|
115
|
+
handler: runTailCommand,
|
|
101
116
|
description: "Last N turns (default 10)",
|
|
102
117
|
},
|
|
103
118
|
{
|
|
104
119
|
name: "search",
|
|
105
|
-
args: "
|
|
120
|
+
args: ["file", "pattern"],
|
|
121
|
+
argsUsage: "<file> <pattern>",
|
|
122
|
+
handler: runSearchCommand,
|
|
106
123
|
description: "Search all content for regex pattern",
|
|
107
124
|
options: {
|
|
108
125
|
limit: {
|
|
@@ -121,22 +138,30 @@ const definition = {
|
|
|
121
138
|
},
|
|
122
139
|
{
|
|
123
140
|
name: "tools",
|
|
124
|
-
args: "
|
|
141
|
+
args: ["file"],
|
|
142
|
+
argsUsage: "<file>",
|
|
143
|
+
handler: runToolsCommand,
|
|
125
144
|
description: "Tool usage frequency (descending)",
|
|
126
145
|
},
|
|
127
146
|
{
|
|
128
147
|
name: "tool",
|
|
129
|
-
args: "
|
|
148
|
+
args: ["file", "name"],
|
|
149
|
+
argsUsage: "<file> <name>",
|
|
150
|
+
handler: runToolCommand,
|
|
130
151
|
description: "All turns involving a specific tool",
|
|
131
152
|
},
|
|
132
153
|
{
|
|
133
154
|
name: "errors",
|
|
134
|
-
args: "
|
|
155
|
+
args: ["file"],
|
|
156
|
+
argsUsage: "<file>",
|
|
157
|
+
handler: runErrorsCommand,
|
|
135
158
|
description: "Tool results with isError=true",
|
|
136
159
|
},
|
|
137
160
|
{
|
|
138
161
|
name: "reasoning",
|
|
139
|
-
args: "
|
|
162
|
+
args: ["file"],
|
|
163
|
+
argsUsage: "<file>",
|
|
164
|
+
handler: runReasoningCommand,
|
|
140
165
|
description: "Agent reasoning text only",
|
|
141
166
|
options: {
|
|
142
167
|
from: { type: "string", description: "Start at turn index" },
|
|
@@ -145,27 +170,37 @@ const definition = {
|
|
|
145
170
|
},
|
|
146
171
|
{
|
|
147
172
|
name: "timeline",
|
|
148
|
-
args: "
|
|
173
|
+
args: ["file"],
|
|
174
|
+
argsUsage: "<file>",
|
|
175
|
+
handler: runTimelineCommand,
|
|
149
176
|
description: "Compact one-line-per-turn overview",
|
|
150
177
|
},
|
|
151
178
|
{
|
|
152
179
|
name: "stats",
|
|
153
|
-
args: "
|
|
180
|
+
args: ["file"],
|
|
181
|
+
argsUsage: "<file>",
|
|
182
|
+
handler: runStatsCommand,
|
|
154
183
|
description: "Token usage and cost breakdown",
|
|
155
184
|
},
|
|
156
185
|
{
|
|
157
186
|
name: "init",
|
|
158
|
-
args: "
|
|
187
|
+
args: ["file"],
|
|
188
|
+
argsUsage: "<file>",
|
|
189
|
+
handler: runInitCommand,
|
|
159
190
|
description: "Full system/init event",
|
|
160
191
|
},
|
|
161
192
|
{
|
|
162
193
|
name: "turn",
|
|
163
|
-
args: "
|
|
194
|
+
args: ["file", "index"],
|
|
195
|
+
argsUsage: "<file> <index>",
|
|
196
|
+
handler: runTurnCommand,
|
|
164
197
|
description: "Single turn by index",
|
|
165
198
|
},
|
|
166
199
|
{
|
|
167
200
|
name: "by-discussion",
|
|
168
|
-
args: "
|
|
201
|
+
args: ["discussion-id", "trace-dir"],
|
|
202
|
+
argsUsage: "<discussion-id> [trace-dir]",
|
|
203
|
+
handler: runByDiscussionCommand,
|
|
169
204
|
description:
|
|
170
205
|
"List trace files whose meta header carries the given discussion_id, ordered by first-event timestamp",
|
|
171
206
|
options: {
|
|
@@ -177,7 +212,9 @@ const definition = {
|
|
|
177
212
|
},
|
|
178
213
|
{
|
|
179
214
|
name: "filter",
|
|
180
|
-
args: "
|
|
215
|
+
args: ["file"],
|
|
216
|
+
argsUsage: "<file>",
|
|
217
|
+
handler: runFilterCommand,
|
|
181
218
|
description: "Filter turns by role, tool, or error status",
|
|
182
219
|
options: {
|
|
183
220
|
role: {
|
|
@@ -196,7 +233,9 @@ const definition = {
|
|
|
196
233
|
},
|
|
197
234
|
{
|
|
198
235
|
name: "split",
|
|
199
|
-
args: "
|
|
236
|
+
args: ["file"],
|
|
237
|
+
argsUsage: "<file>",
|
|
238
|
+
handler: runSplitCommand,
|
|
200
239
|
description:
|
|
201
240
|
"Split a combined trace into per-source files following the `trace--<case>--<participant>.<role>.ndjson` convention",
|
|
202
241
|
options: {
|
|
@@ -217,9 +256,11 @@ const definition = {
|
|
|
217
256
|
},
|
|
218
257
|
{
|
|
219
258
|
name: "assert",
|
|
220
|
-
args: "
|
|
259
|
+
args: ["test-name", "file"],
|
|
260
|
+
argsUsage: "<test-name> <file>",
|
|
261
|
+
handler: runAssertCommand,
|
|
221
262
|
description:
|
|
222
|
-
"Shell-friendly assertion — outputs structured JSON for
|
|
263
|
+
"Shell-friendly assertion — outputs structured JSON for invariant hooks",
|
|
223
264
|
options: {
|
|
224
265
|
grep: {
|
|
225
266
|
type: "string",
|
|
@@ -299,57 +340,42 @@ const definition = {
|
|
|
299
340
|
],
|
|
300
341
|
};
|
|
301
342
|
|
|
302
|
-
const cli = createCli(definition);
|
|
303
343
|
const logger = createLogger("trace");
|
|
304
344
|
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
overview: runOverviewCommand,
|
|
309
|
-
count: runCountCommand,
|
|
310
|
-
batch: runBatchCommand,
|
|
311
|
-
head: runHeadCommand,
|
|
312
|
-
tail: runTailCommand,
|
|
313
|
-
search: runSearchCommand,
|
|
314
|
-
tools: runToolsCommand,
|
|
315
|
-
tool: runToolCommand,
|
|
316
|
-
errors: runErrorsCommand,
|
|
317
|
-
reasoning: runReasoningCommand,
|
|
318
|
-
timeline: runTimelineCommand,
|
|
319
|
-
stats: runStatsCommand,
|
|
320
|
-
init: runInitCommand,
|
|
321
|
-
turn: runTurnCommand,
|
|
322
|
-
filter: runFilterCommand,
|
|
323
|
-
split: runSplitCommand,
|
|
324
|
-
assert: runAssertCommand,
|
|
325
|
-
"by-discussion": runByDiscussionCommand,
|
|
326
|
-
};
|
|
345
|
+
// Commands that talk to the GitHub API need a config-backed token resolver;
|
|
346
|
+
// the rest only read local trace files through the runtime.
|
|
347
|
+
const NEEDS_CONFIG = new Set(["runs", "download"]);
|
|
327
348
|
|
|
328
349
|
async function main() {
|
|
329
|
-
const
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
350
|
+
const runtime = createDefaultRuntime();
|
|
351
|
+
const cli = createCli(definition, { runtime });
|
|
352
|
+
const parsed = cli.parse(runtime.proc.argv.slice(2));
|
|
353
|
+
if (!parsed) return runtime.proc.exit(0);
|
|
333
354
|
|
|
355
|
+
const { positionals } = parsed;
|
|
334
356
|
if (positionals.length === 0) {
|
|
335
357
|
cli.usageError("no command specified");
|
|
336
|
-
|
|
358
|
+
return runtime.proc.exit(2);
|
|
337
359
|
}
|
|
338
360
|
|
|
339
|
-
const
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
if (!handler) {
|
|
361
|
+
const command = positionals[0];
|
|
362
|
+
if (!definition.commands.some((c) => c.name === command)) {
|
|
343
363
|
cli.usageError(`unknown command "${command}"`);
|
|
344
|
-
|
|
364
|
+
return runtime.proc.exit(2);
|
|
345
365
|
}
|
|
346
366
|
|
|
347
|
-
const config =
|
|
348
|
-
|
|
367
|
+
const config = NEEDS_CONFIG.has(command)
|
|
368
|
+
? await createScriptConfig("eval")
|
|
369
|
+
: undefined;
|
|
370
|
+
|
|
371
|
+
const result = await cli.dispatch(parsed, { deps: { runtime, config } });
|
|
372
|
+
const envelope = result ?? { ok: true };
|
|
373
|
+
if (!envelope.ok && envelope.error) cli.error(envelope.error);
|
|
374
|
+
runtime.proc.exit(envelope.ok ? 0 : (envelope.code ?? 1));
|
|
349
375
|
}
|
|
350
376
|
|
|
351
377
|
main().catch((error) => {
|
|
352
378
|
logger.exception("main", error);
|
|
353
|
-
|
|
379
|
+
createCli(definition).error(error.message);
|
|
354
380
|
process.exit(1);
|
|
355
381
|
});
|