@agjs/tsforge 0.1.14 → 0.1.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -1
- package/scripts/analyze-malformed.ts +264 -0
- package/scripts/analyze-runs.ts +279 -0
- package/scripts/benchmark-catalog.ts +387 -0
- package/scripts/browser-check.ts +87 -0
- package/scripts/build-rule-docs.ts +122 -0
- package/scripts/build-rules-md.ts +129 -0
- package/scripts/cli-metrics.ts +203 -0
- package/scripts/coverage-check.ts +33 -0
- package/scripts/edit-benchmark.ts +314 -0
- package/scripts/eval-create.ts +48 -0
- package/scripts/eval-spec.ts +47 -0
- package/scripts/eval-sum.ts +79 -0
- package/scripts/gen-tests.ts +140 -0
- package/scripts/headless-build.ts +292 -0
- package/scripts/interactive-eval.ts +172 -0
- package/scripts/rejudge.ts +135 -0
- package/scripts/run-eval-todo.ts +59 -0
- package/scripts/smoke.ts +18 -0
- package/scripts/stub-check.ts +44 -0
- package/scripts/sweep-report.ts +76 -0
- package/scripts/sweep.ts +389 -0
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@agjs/tsforge",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "0.1.14",
|
|
4
|
+
"version": "0.1.15",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"description": "TypeScript coding harness with a deterministic gate, stack-aware guardrails, and stream-level correction.",
|
|
7
7
|
"repository": {
|
|
@@ -17,6 +17,7 @@
|
|
|
17
17
|
"files": [
|
|
18
18
|
"bin",
|
|
19
19
|
"src",
|
|
20
|
+
"scripts",
|
|
20
21
|
"strict.eslint.config.mjs",
|
|
21
22
|
"strict.web.eslint.config.mjs"
|
|
22
23
|
],
|
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
// Correlate tool-call repair incidents with the per-call thinking mode,
|
|
3
|
+
// across every JSONL event log we have (~/.tsforge/logs + evals/runs). Tracks:
|
|
4
|
+
// - Per-rule repair rates (L0: drop-null, unwrap-autolink; L1: coerce:*; etc.)
|
|
5
|
+
// - L3 re-ask frequency (when repair gave up)
|
|
6
|
+
// - Correlation with thinking mode (hypothesis: thinking-off has higher failure)
|
|
7
|
+
//
|
|
8
|
+
// bun packages/core/scripts/analyze-malformed.ts
|
|
9
|
+
//
|
|
10
|
+
// Old logs predate the per-call `thinking` field — those calls land in the
|
|
11
|
+
// "unknown" bucket; rates firm up as new logs accumulate.
|
|
12
|
+
import { readdirSync, readFileSync, existsSync } from "node:fs";
|
|
13
|
+
import { join } from "node:path";
|
|
14
|
+
import { homedir } from "node:os";
|
|
15
|
+
import { isRecord } from "../src/lib/guards";
|
|
16
|
+
|
|
17
|
+
/** Running tallies for the model calls observed under one thinking mode. */
interface IBucket {
  /** Number of `usage` events counted (one per event). */
  calls: number;
  /** `tool` events whose message contains both "recovered" and "malformed". */
  salvaged: number;
  /** `tool` events whose message contains "malformed tool-call text". */
  malformedNudges: number;
  /** Repair rule name → number of times it fired. */
  repairs: Map<string, number>;
  /** Number of L3 re-ask repair events. */
  reasks: number;
}

/** Thinking mode of a call; "unknown" when the event has no boolean flag. */
type ThinkingMode = "on" | "off" | "unknown";

/** Builds a zeroed bucket that owns its own (unshared) repairs map. */
function emptyBucket(): IBucket {
  return {
    calls: 0,
    salvaged: 0,
    malformedNudges: 0,
    repairs: new Map(),
    reasks: 0,
  };
}

/** One tally bucket per thinking mode, filled in while logs are ingested. */
const buckets: Record<ThinkingMode, IBucket> = {
  on: emptyBucket(),
  off: emptyBucket(),
  unknown: emptyBucket(),
};
|
|
50
|
+
|
|
51
|
+
/**
 * Maps an event's `thinking` field to a thinking mode.
 *
 * @param event - A parsed log event.
 * @returns "on" for strict `true`, "off" for strict `false`, and "unknown"
 *   for anything else (including a missing field).
 */
function modeOf(event: Record<string, unknown>): ThinkingMode {
  switch (event.thinking) {
    case true:
      return "on";
    case false:
      return "off";
    default:
      return "unknown";
  }
}
|
|
58
|
+
|
|
59
|
+
/**
 * The thinking mode of the most recent usage event. Repair events carry no
 * flag of their own (they fire after the call), so they inherit it; tool
 * events without a `thinking` field fall back to it as well. Reset to
 * "unknown" at the start of every log file.
 */
let lastCallMode: ThinkingMode = "unknown";
|
|
62
|
+
|
|
63
|
+
/**
 * Tallies one `tool` event into the per-mode buckets.
 *
 * A message containing both "recovered" and "malformed" counts as a salvage;
 * a message containing "malformed tool-call text" counts as a malformed
 * nudge. One message can count as both. Events without a string `message`
 * are ignored.
 *
 * @param event - A parsed log event whose `kind` is "tool".
 */
function ingestTool(event: Record<string, unknown>): void {
  const text = event.message;

  if (typeof text !== "string") {
    return;
  }

  const isSalvage = text.includes("recovered") && text.includes("malformed");

  if (isSalvage) {
    // Salvage warnings carry their own per-call flag (new logs); without one
    // they inherit the mode of the last usage event.
    const mode = event.thinking === undefined ? lastCallMode : modeOf(event);

    buckets[mode].salvaged += 1;
  }

  // Nudges are always attributed to the mode of the last usage event.
  if (text.includes("malformed tool-call text")) {
    buckets[lastCallMode].malformedNudges += 1;
  }
}
|
|
81
|
+
|
|
82
|
+
/**
 * Tallies one `repair` event into the bucket for its thinking mode.
 *
 * The message is colon-separated, e.g. "tool:L0:drop-null:field",
 * "tool:L1:coerce:files" or "tool:L3-re-ask". A message whose last segment is
 * "L3-re-ask" increments the re-ask counter; any other message increments the
 * count for its rule key. Events without a string `message` are ignored.
 *
 * @param event - A parsed log event whose `kind` is "repair".
 */
function ingestRepair(event: Record<string, unknown>): void {
  const text = event.message;

  if (typeof text !== "string") {
    return;
  }

  // Use the event's own flag when present, else the last usage event's mode.
  const mode = event.thinking === undefined ? lastCallMode : modeOf(event);
  const target = buckets[mode];
  const segments = text.split(":");

  if (segments[segments.length - 1] === "L3-re-ask") {
    target.reasks += 1;

    return;
  }

  // The rule key is everything after the first segment, so
  // "tool:L0:drop-null:field" is counted under "L0:drop-null:field".
  const ruleKey = segments.slice(1).join(":");

  target.repairs.set(ruleKey, (target.repairs.get(ruleKey) ?? 0) + 1);
}
|
|
103
|
+
|
|
104
|
+
/**
 * Parses one JSONL line and routes the event by its `kind`.
 *
 * Lines that are not valid JSON, are rejected by `isRecord`, or have no
 * string `message` are skipped silently. A "usage" event sets the current
 * thinking mode and counts one call; "tool" and "repair" events are handed to
 * their ingest functions; every other kind is ignored.
 *
 * @param line - A single non-empty line from a log file.
 */
function ingestLine(line: string): void {
  let parsed: unknown;

  try {
    parsed = JSON.parse(line);
  } catch {
    // Unparseable lines are skipped on purpose.
    return;
  }

  if (!isRecord(parsed) || typeof parsed.message !== "string") {
    return;
  }

  switch (parsed.kind) {
    case "usage":
      lastCallMode = modeOf(parsed);
      buckets[lastCallMode].calls += 1;
      break;
    case "tool":
      ingestTool(parsed);
      break;
    case "repair":
      ingestRepair(parsed);
      break;
    default:
      break;
  }
}
|
|
134
|
+
|
|
135
|
+
/**
 * Ingests every non-empty line of one JSONL log file.
 *
 * The inherited thinking mode is reset first, so events in this file never
 * inherit a mode from a previously read file.
 *
 * @param path - Path of the log file to read (as UTF-8).
 */
function ingestFile(path: string): void {
  lastCallMode = "unknown";

  const rows = readFileSync(path, "utf8").split("\n");

  for (const row of rows) {
    if (row.length === 0) {
      continue;
    }

    ingestLine(row);
  }
}
|
|
144
|
+
|
|
145
|
+
/**
 * Lists the log files to analyze.
 *
 * Sources, in order:
 *  1. every `*.jsonl` entry in `~/.tsforge/logs`;
 *  2. for each entry of `evals/runs` (relative to the current working
 *     directory), whichever of `events.jsonl`, `run.jsonl`, `log.jsonl` exist.
 *
 * A source directory that does not exist is skipped.
 *
 * @returns Paths of all log files found.
 */
function collectLogs(): string[] {
  const found: string[] = [];
  const homeLogs = join(homedir(), ".tsforge", "logs");

  if (existsSync(homeLogs)) {
    for (const entry of readdirSync(homeLogs)) {
      if (!entry.endsWith(".jsonl")) {
        continue;
      }

      found.push(join(homeLogs, entry));
    }
  }

  const runsRoot = join("evals", "runs");

  if (existsSync(runsRoot)) {
    const logNames = ["events.jsonl", "run.jsonl", "log.jsonl"];

    for (const runDir of readdirSync(runsRoot)) {
      for (const logName of logNames) {
        const logPath = join(runsRoot, runDir, logName);

        if (existsSync(logPath)) {
          found.push(logPath);
        }
      }
    }
  }

  return found;
}
|
|
173
|
+
|
|
174
|
+
// Discover the log files, then fold each one into the buckets.
const logs = collectLogs();

logs.forEach((logPath) => {
  ingestFile(logPath);
});
|
|
179
|
+
|
|
180
|
+
/**
 * Formats a bucket's incident rate: (salvaged + malformed nudges) / calls,
 * as a percentage with two decimals.
 *
 * @param b - The bucket to summarize.
 * @returns The percentage string, or "—" when the bucket has no calls.
 */
function rate(b: IBucket): string {
  if (b.calls === 0) {
    return "—";
  }

  const incidents = b.salvaged + b.malformedNudges;
  const percent = (incidents / b.calls) * 100;

  return `${percent.toFixed(2)}%`;
}
|
|
187
|
+
|
|
188
|
+
/**
 * Formats a bucket's repair rate: total repairs across all rules / calls,
 * as a percentage with two decimals.
 *
 * @param b - The bucket to summarize.
 * @returns The percentage string, or "—" when the bucket has no calls.
 */
function repairRate(b: IBucket): string {
  let repairTotal = 0;

  b.repairs.forEach((count) => {
    repairTotal += count;
  });

  if (b.calls === 0) {
    return "—";
  }

  const percent = (repairTotal / b.calls) * 100;

  return `${percent.toFixed(2)}%`;
}
|
|
200
|
+
|
|
201
|
+
/** Thinking modes in the order their rows are printed. */
const REPORT_MODES = ["on", "off", "unknown"] as const;

/** Writes one chunk of report text to stdout. */
const print = (text: string): void => {
  process.stdout.write(text);
};

print(`scanned ${String(logs.length)} log file(s)\n\n`);

// Salvage & malformed incidents by thinking mode
print("thinking calls salvaged malformed-nudges incident-rate\n");

for (const mode of REPORT_MODES) {
  const b = buckets[mode];

  print(
    `${mode.padEnd(9)} ${String(b.calls).padStart(5)} ${String(b.salvaged).padStart(8)} ${String(b.malformedNudges).padStart(16)} ${rate(b)}\n`
  );
}

print(
  "\n" +
    "incident-rate = (salvaged + malformed-nudges) / model calls. 'unknown' =\n" +
    "logs predating the per-call thinking flag.\n"
);

// Repair ladder statistics
print("\n\nREPAIR LADDER STATISTICS:\n\n");
print("thinking calls repairs reasks repair-rate\n");

for (const mode of REPORT_MODES) {
  const b = buckets[mode];
  let totalRepairs = 0;

  for (const count of b.repairs.values()) {
    totalRepairs += count;
  }

  print(
    `${mode.padEnd(9)} ${String(b.calls).padStart(5)} ${String(totalRepairs).padStart(7)} ${String(b.reasks).padStart(5)} ${repairRate(b)}\n`
  );
}

// Top repair rules across all modes
const allRules = new Map<string, number>();

for (const b of Object.values(buckets)) {
  b.repairs.forEach((count, rule) => {
    allRules.set(rule, (allRules.get(rule) ?? 0) + count);
  });
}

if (allRules.size > 0) {
  print("\n\nTOP REPAIR RULES (across all modes):\n");

  // Highest count first; only the first 20 rules are listed.
  const ranked = [...allRules.entries()].sort(
    (left, right) => right[1] - left[1]
  );

  for (const [rule, count] of ranked.slice(0, 20)) {
    print(` ${String(count).padStart(5)} ${rule}\n`);
  }
}

print(
  "\n" +
    "repair-rate = total repairs / model calls. Re-run as new logs accumulate;\n" +
    "track per-rule rates to identify systemic model failures worth adding L2\n" +
    "safe-defaults for. L3 re-ask rate should trend to near-zero (recoverable\n" +
    "repairs succeed; unrecoverable args are infrequent and addressed in prompting).\n"
);
|
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
// Extract luck-INDEPENDENT mechanism signals from eval run logs, so harness
|
|
2
|
+
// changes can be judged by what they actually did — not by a single noisy
|
|
3
|
+
// turn-count. Reads each run dir's plain-text run.log (+ result.json) and
|
|
4
|
+
// tabulates, then summarizes the spread across runs.
|
|
5
|
+
//
|
|
6
|
+
// Run: bun run packages/core/scripts/analyze-runs.ts money 5
|
|
7
|
+
// (analyze the latest 5 `money-*` run dirs)
|
|
8
|
+
// Or: bun run packages/core/scripts/analyze-runs.ts <dir> <dir> ...
|
|
9
|
+
import { readdir } from "node:fs/promises";
|
|
10
|
+
import { join } from "node:path";
|
|
11
|
+
import { isRecord } from "../src/lib/guards";
|
|
12
|
+
|
|
13
|
+
// Directory the run dirs are resolved against: the `evals` folder three
// levels above this script's own directory (`import.meta.dir`).
const evalsRoot = join(import.meta.dir, "..", "..", "..", "evals");
|
|
14
|
+
|
|
15
|
+
/** Metrics extracted for one eval run (from its run.log and result.json). */
interface IRunMetrics {
  /** Last path segment of the run directory. */
  runId: string;
  /** True when the log has a `spec "…": done` line or a `· turn N: GREEN` line. */
  passed: boolean;
  /** Highest turn number seen in a timing line. */
  turns: number;
  /** The `total` value of the last timing line, in seconds. */
  totalSeconds: number;
  /** Highest per-turn gate error count seen — >1 proves the combined parser
   * surfaced structured, per-error feedback (not one opaque blob). */
  maxErrorsSurfaced: number;
  /** Times the model enumerated source lines by hand to locate an error —
   * should be 0 since gate feedback shows the offending line. */
  handCountingLines: number;
  /** Most file mutations applied in a SINGLE turn — >1 means it fixed several
   * sites at once instead of one-per-turn. */
  maxEditsPerTurn: number;
  /** Number of log lines containing "✎ edit" or "✚ create". */
  totalEdits: number;
  /** Longest single turn (s) — usually one heavy reasoning turn; shows that
   * wall-time variance is the model thinking, not harness churn. */
  slowestTurnSeconds: number;
  /** Char volume of the heaviest turn (reasoning+content). Shows whether a
   * thinking_token_budget binds (drops it) and flags spirals (huge value). */
  maxTurnChars: number;
  /** Tool calls the harness rejected (bad input / scope / match failure) — the
   * open-model tool-calling friction; 0 = clean. Repaired calls excluded. */
  toolRejects: number;
  /** Sum of the numeric `regressions` of every task in result.json (0 if none). */
  regressions: number;
  /** Numeric `quality` from result.json; undefined when absent or not a number. */
  quality: number | undefined;
}
|
|
42
|
+
|
|
43
|
+
// Per-turn timing line. Groups: 1 = turn number, 2/3 = turn duration and its
// unit (s or ms), 4/5 = cumulative duration and its unit.
const TIMING = /⏱ turn (\d+) took ([\d.]+)(s|ms) \(total ([\d.]+)(s|ms)\)/;
// Red-gate line; group 1 is the error count reported for that turn.
const RED = /turn \d+: red \((\d+) error/;
// Line logged when the model is asked for a turn; group 1 is the turn number.
const ASKING = /turn (\d+): asking model/;
// Hand-counting = the model re-typing the file with SEQUENTIAL line numbers
// (`1: …`, `2: …`) to LOCATE an error it can't see — the costly pattern the
// located-feedback fix removes. Deliberately excludes `Line 37:`-style citations
// of feedback-provided lines, which are the model USING the located errors.
const HAND_COUNT = /^\s*\d+:\s+(?:export|const|function|return|if|for|\}|\/\/)/;
|
|
51
|
+
|
|
52
|
+
/** Values parsed from one per-turn timing line. */
interface ITurnTiming {
  /** Turn number from the line. */
  turn: number;
  /** Duration of the turn in seconds, or null if it was not captured. */
  tookSeconds: number | null;
  /** Cumulative duration in seconds, or null if it was not captured. */
  totalSeconds: number | null;
}
|
|
57
|
+
|
|
58
|
+
/**
 * Parse a `turn N took Xs (total Ys)` line into its seconds (ms normalized).
 *
 * @param line - One line of the run log.
 * @returns The turn number and durations, or null when the line is not a
 *   timing line.
 */
function parseTiming(line: string): ITurnTiming | null {
  const match = TIMING.exec(line);
  const turnText = match?.[1];

  if (match === null || turnText === undefined) {
    return null;
  }

  // Converts a captured amount + unit to seconds; null when nothing was captured.
  const toSeconds = (
    amount: string | undefined,
    unit: string | undefined
  ): number | null => {
    if (amount === undefined) {
      return null;
    }

    const value = Number(amount);

    return unit === "ms" ? value / 1000 : value;
  };

  return {
    turn: Number(turnText),
    tookSeconds: toSeconds(match[2], match[3]),
    totalSeconds: toSeconds(match[4], match[5]),
  };
}
|
|
81
|
+
|
|
82
|
+
/**
 * Extracts the log-derived metrics of one run from its plain-text run log.
 *
 * The log is scanned line by line. An "asking model" line starts a new turn
 * and resets the per-turn edit and character counters; a timing line closes
 * the measurement for the current turn.
 *
 * @param runId - Identifier copied into the result.
 * @param log - Full text of the run log.
 * @returns Every metric except `regressions` and `quality`.
 */
function parseLog(
  runId: string,
  log: string
): Omit<IRunMetrics, "regressions" | "quality"> {
  const passed = /spec ".*": done/.test(log) || /· turn \d+: GREEN/.test(log);

  let turns = 0;
  let totalSeconds = 0;
  let slowestTurnSeconds = 0;
  let maxErrorsSurfaced = 0;
  let handCountingLines = 0;
  let toolRejects = 0;
  let totalEdits = 0;
  let maxEditsPerTurn = 0;
  let maxTurnChars = 0;
  // Per-turn accumulators, reset whenever a new turn starts.
  let editsThisTurn = 0;
  let charsThisTurn = 0;

  for (const row of log.split("\n")) {
    if (ASKING.exec(row) !== null) {
      // New turn: bank the previous turn's edit count, then start over.
      maxEditsPerTurn = Math.max(maxEditsPerTurn, editsThisTurn);
      editsThisTurn = 0;
      charsThisTurn = 0;
    }

    // Reasoning+content volume of the turn — how "does a thinking_token_budget
    // bind?" shows up here (and a spiral is a huge maxTurnChars).
    charsThisTurn += row.length;

    const isMutation = row.includes("✎ edit") || row.includes("✚ create");

    if (isMutation) {
      totalEdits += 1;
      editsThisTurn += 1;
    }

    const timing = parseTiming(row);

    if (timing !== null) {
      turns = Math.max(turns, timing.turn);
      maxTurnChars = Math.max(maxTurnChars, charsThisTurn);

      if (timing.tookSeconds !== null) {
        slowestTurnSeconds = Math.max(slowestTurnSeconds, timing.tookSeconds);
      }

      // The most recent timing line supplies the run's cumulative time.
      if (timing.totalSeconds !== null) {
        totalSeconds = timing.totalSeconds;
      }
    }

    const errorCount = RED.exec(row)?.[1];

    if (errorCount !== undefined) {
      maxErrorsSurfaced = Math.max(maxErrorsSurfaced, Number(errorCount));
    }

    if (HAND_COUNT.test(row)) {
      handCountingLines += 1;
    }

    if (row.includes("tool_input_rejected:") || row.includes("tool_rejected:")) {
      toolRejects += 1;
    }
  }

  // Bank the final turn, which no later "asking model" line closes.
  maxEditsPerTurn = Math.max(maxEditsPerTurn, editsThisTurn);

  return {
    runId,
    passed,
    turns,
    totalSeconds,
    maxErrorsSurfaced,
    handCountingLines,
    maxEditsPerTurn,
    totalEdits,
    slowestTurnSeconds,
    maxTurnChars,
    toolRejects,
  };
}
|
|
165
|
+
|
|
166
|
+
/**
 * Reads the result-derived metrics of one run from `<dir>/result.json`.
 *
 * A missing file, a file that is not valid JSON (e.g. truncated by an
 * interrupted run), or a value rejected by `isRecord` all yield the neutral
 * result `{ regressions: 0, quality: undefined }`, so one bad file cannot
 * abort the analysis of the remaining runs.
 *
 * @param dir - The run directory.
 * @returns `regressions`: the sum of the numeric `regressions` of every task
 *   in the `tasks` array; `quality`: the numeric `quality` field, else
 *   undefined.
 */
async function readResult(
  dir: string
): Promise<{ regressions: number; quality: number | undefined }> {
  const file = Bun.file(join(dir, "result.json"));

  if (!(await file.exists())) {
    return { regressions: 0, quality: undefined };
  }

  const raw = await file.text();
  let data: unknown;

  try {
    data = JSON.parse(raw);
  } catch {
    // Malformed result.json: treat it like a missing file rather than crash.
    return { regressions: 0, quality: undefined };
  }

  if (!isRecord(data)) {
    return { regressions: 0, quality: undefined };
  }

  const quality = typeof data.quality === "number" ? data.quality : undefined;
  let regressions = 0;

  if (Array.isArray(data.tasks)) {
    for (const t of data.tasks) {
      if (isRecord(t) && typeof t.regressions === "number") {
        regressions += t.regressions;
      }
    }
  }

  return { regressions, quality };
}
|
|
194
|
+
|
|
195
|
+
/**
 * Resolves the run directories to analyze from the command-line arguments.
 *
 * Two forms are accepted:
 *  - `<seed> <count>`: the last `count` directories under `evalsRoot` whose
 *    name starts with `seed`, after sorting the names. Presumably the names
 *    sort chronologically so that "last" means "latest" — verify against the
 *    run-naming scheme. A count of 0 selects no directories.
 *  - `<dir> <dir> ...`: each argument is used as-is when it starts with "/",
 *    otherwise it is resolved against `evalsRoot`.
 *
 * @returns Paths of the selected run directories.
 */
async function resolveDirs(): Promise<string[]> {
  const args = process.argv.slice(2);

  // `<seed> <count>` form: latest N run dirs whose name starts with the seed.
  if (args.length === 2 && /^\d+$/.test(args[1] ?? "")) {
    const prefix = args[0] ?? "";
    const count = Number(args[1]);
    const all = await readdir(evalsRoot, { withFileTypes: true });
    const dirs = all
      .filter((d) => d.isDirectory() && d.name.startsWith(prefix))
      .map((d) => d.name)
      .sort();

    // slice(-0) is slice(0) and would return EVERY dir, so 0 is special-cased.
    const latest = count === 0 ? [] : dirs.slice(-count);

    return latest.map((name) => join(evalsRoot, name));
  }

  return args.map((a) => (a.startsWith("/") ? a : join(evalsRoot, a)));
}
|
|
213
|
+
|
|
214
|
+
/**
 * Computes the median of a list of numbers without mutating the input.
 *
 * @param values - The numbers to summarize.
 * @returns The middle value for an odd count, the mean of the two middle
 *   values for an even count, and 0 for an empty list.
 */
function median(values: number[]): number {
  if (values.length === 0) {
    return 0;
  }

  const ordered = values.slice().sort((left, right) => left - right);
  const upperIndex = Math.floor(ordered.length / 2);
  const upper = ordered[upperIndex] ?? 0;

  if (ordered.length % 2 !== 0) {
    return upper;
  }

  const lower = ordered[upperIndex - 1] ?? 0;

  return (lower + upper) / 2;
}
|
|
226
|
+
|
|
227
|
+
// Gather the metrics of every selected run dir that has a run.log; dirs
// without one are skipped.
const dirs = await resolveDirs();
const metrics: IRunMetrics[] = [];

for (const dir of dirs) {
  const logFile = Bun.file(join(dir, "run.log"));

  if (await logFile.exists()) {
    // The run id is the last "/"-separated segment of the directory path.
    const segments = dir.split("/");
    const runId = segments[segments.length - 1] ?? dir;
    const fromLog = parseLog(runId, await logFile.text());
    const fromResult = await readResult(dir);

    metrics.push({ ...fromLog, ...fromResult });
  }
}
|
|
243
|
+
|
|
244
|
+
// Per-run table: a banner, a header row, then one row per analyzed run.
process.stdout.write(`\n=== run analysis (${metrics.length} runs) ===\n\n`);
process.stdout.write(
  "pass turns time(s) slowTurn(s) maxTurnChars maxErr handCount toolRej maxEdits/turn edits regress Q\n"
);

for (const m of metrics) {
  const passMark = m.passed ? " ✓ " : " ✗ ";
  const qualityText = m.quality === undefined ? "-" : String(m.quality);

  // Cells are padded to fixed widths and concatenated without a separator.
  const cells = [
    passMark,
    String(m.turns).padStart(5),
    m.totalSeconds.toFixed(0).padStart(8),
    m.slowestTurnSeconds.toFixed(0).padStart(12),
    String(m.maxTurnChars).padStart(13),
    String(m.maxErrorsSurfaced).padStart(7),
    String(m.handCountingLines).padStart(10),
    String(m.toolRejects).padStart(8),
    String(m.maxEditsPerTurn).padStart(14),
    String(m.totalEdits).padStart(7),
    String(m.regressions).padStart(8),
    qualityText.padStart(3),
    ` ${m.runId}`,
  ];

  process.stdout.write(`${cells.join("")}\n`);
}
|
|
268
|
+
|
|
269
|
+
// Spread summary across all analyzed runs.
const turns = metrics.map((m) => m.turns);
const times = metrics.map((m) => m.totalSeconds);
const passRate = metrics.filter((m) => m.passed).length;

if (metrics.length === 0) {
  // Math.min()/Math.max() of an empty list are Infinity/-Infinity, so say
  // plainly that there is nothing to summarize instead of printing those.
  process.stdout.write("\nturns: n/a (no runs found)\n");
  process.stdout.write("time: n/a (no runs found)\n");
} else {
  const minTurns = Math.min(...turns);
  const maxTurns = Math.max(...turns);

  process.stdout.write(
    `\nturns: min ${minTurns} median ${median(turns)} max ${maxTurns} (spread ${maxTurns - minTurns})\n`
  );
  process.stdout.write(
    `time: min ${Math.min(...times).toFixed(0)}s median ${median(times).toFixed(0)}s max ${Math.max(...times).toFixed(0)}s\n`
  );
}

process.stdout.write(`pass: ${passRate}/${metrics.length}\n`);
|