@agjs/tsforge 0.1.14 → 0.1.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@agjs/tsforge",
3
3
  "type": "module",
4
- "version": "0.1.14",
4
+ "version": "0.1.15",
5
5
  "license": "MIT",
6
6
  "description": "TypeScript coding harness with a deterministic gate, stack-aware guardrails, and stream-level correction.",
7
7
  "repository": {
@@ -17,6 +17,7 @@
17
17
  "files": [
18
18
  "bin",
19
19
  "src",
20
+ "scripts",
20
21
  "strict.eslint.config.mjs",
21
22
  "strict.web.eslint.config.mjs"
22
23
  ],
@@ -0,0 +1,264 @@
1
+ #!/usr/bin/env bun
2
+ // Correlate tool-call repair incidents with the per-call thinking mode,
3
+ // across every JSONL event log we have (~/.tsforge/logs + evals/runs). Tracks:
4
+ // - Per-rule repair rates (L0: drop-null, unwrap-autolink; L1: coerce:*; etc.)
5
+ // - L3 re-ask frequency (when repair gave up)
6
+ // - Correlation with thinking mode (hypothesis: thinking-off has higher failure)
7
+ //
8
+ // bun packages/core/scripts/analyze-malformed.ts
9
+ //
10
+ // Old logs predate the per-call `thinking` field — those calls land in the
11
+ // "unknown" bucket; rates firm up as new logs accumulate.
12
+ import { readdirSync, readFileSync, existsSync } from "node:fs";
13
+ import { join } from "node:path";
14
+ import { homedir } from "node:os";
15
+ import { isRecord } from "../src/lib/guards";
16
+
17
/** Running tallies for the model calls attributed to one thinking mode. */
interface IBucket {
  /** Number of `usage` events counted for this mode. */
  calls: number;
  /** Tool messages mentioning both "recovered" and "malformed". */
  salvaged: number;
  /** Tool messages mentioning "malformed tool-call text". */
  malformedNudges: number;
  /** Repair key → number of times it was logged. */
  repairs: Map<string, number>;
  /** Count of repair messages ending in `L3-re-ask`. */
  reasks: number;
}

type ThinkingMode = "on" | "off" | "unknown";

/** Builds a zeroed bucket that owns its own `repairs` map. */
function emptyBucket(): IBucket {
  return {
    calls: 0,
    salvaged: 0,
    malformedNudges: 0,
    repairs: new Map(),
    reasks: 0,
  };
}

/** One accumulator per thinking mode, filled in while the logs are ingested. */
const buckets: Record<ThinkingMode, IBucket> = {
  on: emptyBucket(),
  off: emptyBucket(),
  unknown: emptyBucket(),
};
50
+
51
/**
 * Classifies an event by its `thinking` field: boolean `true` → "on",
 * boolean `false` → "off", anything else (including a missing field) →
 * "unknown".
 */
function modeOf(event: Record<string, unknown>): ThinkingMode {
  switch (event.thinking) {
    case true:
      return "on";
    case false:
      return "off";
    default:
      return "unknown";
  }
}
58
+
59
/** Thinking mode of the latest `usage` event seen. Events that carry no
 * `thinking` field of their own are attributed to this mode. */
let lastCallMode: ThinkingMode = "unknown";

/**
 * Tallies a `tool` event by scanning its message text.
 *
 * A message containing both "recovered" and "malformed" counts as a salvage,
 * attributed to the event's own `thinking` flag when present and to
 * `lastCallMode` otherwise. A message containing "malformed tool-call text"
 * counts as a nudge and is always attributed to `lastCallMode`. Both checks
 * run, so a single message may increment both counters.
 */
function ingestTool(event: Record<string, unknown>): void {
  const text = event.message;

  if (typeof text !== "string") {
    return;
  }

  const isSalvage = text.includes("recovered") && text.includes("malformed");

  if (isSalvage) {
    const mode = event.thinking === undefined ? lastCallMode : modeOf(event);

    buckets[mode].salvaged += 1;
  }

  const isNudge = text.includes("malformed tool-call text");

  if (isNudge) {
    buckets[lastCallMode].malformedNudges += 1;
  }
}
81
+
82
/**
 * Tallies a `repair` event into the bucket for its thinking mode (the event's
 * own `thinking` flag when present, otherwise `lastCallMode`).
 *
 * Messages look like "tool:L0:drop-null:field", "tool:L1:coerce:files" or
 * "tool:L3-re-ask". Events without a string message are ignored.
 */
function ingestRepair(event: Record<string, unknown>): void {
  const text = event.message;

  if (typeof text !== "string") {
    return;
  }

  const target =
    buckets[event.thinking === undefined ? lastCallMode : modeOf(event)];
  const segments = text.split(":");
  const tail = segments[segments.length - 1];

  if (tail === "L3-re-ask") {
    target.reasks += 1;

    return;
  }

  // The key is everything after the leading tool name, so the ladder level and
  // any trailing field stay part of it
  // (e.g. "tool:L0:drop-null:field" → "L0:drop-null:field").
  const key = segments.slice(1).join(":");
  const seen = target.repairs.get(key) ?? 0;

  target.repairs.set(key, seen + 1);
}
103
+
104
/**
 * Parses one JSONL line and routes the event by its `kind`.
 *
 * Lines that are not valid JSON, are not records, or have no string `message`
 * are skipped. A `usage` event updates `lastCallMode` and counts one call;
 * `tool` and `repair` events go to their own ingesters; other kinds are
 * ignored.
 *
 * NOTE(review): the `message` guard runs before the `kind` check, so a `usage`
 * event without a string message would not be counted — confirm usage events
 * always carry one.
 */
function ingestLine(line: string): void {
  let parsed: unknown;

  try {
    parsed = JSON.parse(line);
  } catch {
    return;
  }

  if (!isRecord(parsed) || typeof parsed.message !== "string") {
    return;
  }

  switch (parsed.kind) {
    case "usage":
      lastCallMode = modeOf(parsed);
      buckets[lastCallMode].calls += 1;
      break;
    case "tool":
      ingestTool(parsed);
      break;
    case "repair":
      ingestRepair(parsed);
      break;
    default:
      break;
  }
}
134
+
135
/**
 * Ingests every non-empty line of the log file at `path`.
 *
 * `lastCallMode` is reset to "unknown" first, so a file never inherits the
 * thinking mode left over from the previous file.
 */
function ingestFile(path: string): void {
  lastCallMode = "unknown";

  const contents = readFileSync(path, "utf8");

  contents
    .split("\n")
    .filter((row) => row.length > 0)
    .forEach((row) => {
      ingestLine(row);
    });
}
144
+
145
/**
 * Lists the log files to analyze, in this order:
 *  1. every `*.jsonl` entry directly inside `~/.tsforge/logs`;
 *  2. for each entry of `evals/runs` (relative to the working directory), any
 *     of `events.jsonl`, `run.jsonl`, `log.jsonl` that exists inside it.
 *
 * A root that does not exist contributes nothing.
 */
function collectLogs(): string[] {
  const found: string[] = [];
  const homeLogs = join(homedir(), ".tsforge", "logs");

  if (existsSync(homeLogs)) {
    const jsonl = readdirSync(homeLogs).filter((entry) =>
      entry.endsWith(".jsonl")
    );

    found.push(...jsonl.map((entry) => join(homeLogs, entry)));
  }

  const runsRoot = join("evals", "runs");

  if (existsSync(runsRoot)) {
    const logNames = ["events.jsonl", "run.jsonl", "log.jsonl"];

    for (const runDir of readdirSync(runsRoot)) {
      const present = logNames
        .map((logName) => join(runsRoot, runDir, logName))
        .filter((candidate) => existsSync(candidate));

      found.push(...present);
    }
  }

  return found;
}
173
+
174
// Gather every log file, then fold each one into the per-mode buckets.
const logs = collectLogs();

logs.forEach((logPath) => {
  ingestFile(logPath);
});
179
+
180
/**
 * Incident rate for a bucket: (salvaged + malformed nudges) per call, as a
 * percentage with two decimals. Returns "—" when the bucket has no calls.
 */
function rate(b: IBucket): string {
  const { calls, salvaged, malformedNudges } = b;

  if (calls === 0) {
    return "—";
  }

  const percent = ((salvaged + malformedNudges) / calls) * 100;

  return `${percent.toFixed(2)}%`;
}
187
+
188
/**
 * Repairs per call for a bucket, summed over every repair key, as a
 * percentage with two decimals. Returns "—" when the bucket has no calls.
 */
function repairRate(b: IBucket): string {
  if (b.calls === 0) {
    return "—";
  }

  let repairTotal = 0;

  for (const hits of b.repairs.values()) {
    repairTotal += hits;
  }

  const percent = (repairTotal / b.calls) * 100;

  return `${percent.toFixed(2)}%`;
}
200
+
201
process.stdout.write(`scanned ${String(logs.length)} log file(s)\n\n`);

// Salvage and malformed-nudge incidents, one row per thinking mode.
process.stdout.write(
  "thinking calls salvaged malformed-nudges incident-rate\n"
);

for (const mode of ["on", "off", "unknown"] as const) {
  const bucket = buckets[mode];
  const cells = [
    mode.padEnd(9),
    String(bucket.calls).padStart(5),
    String(bucket.salvaged).padStart(8),
    String(bucket.malformedNudges).padStart(16),
    rate(bucket),
  ];

  process.stdout.write(`${cells.join(" ")}\n`);
}

// Legend for the table above.
process.stdout.write(
  [
    "\n",
    "incident-rate = (salvaged + malformed-nudges) / model calls. 'unknown' =\n",
    "logs predating the per-call thinking flag.\n",
  ].join("")
);
221
+
222
// Repair ladder totals, one row per thinking mode.
process.stdout.write("\n\nREPAIR LADDER STATISTICS:\n\n");
process.stdout.write("thinking calls repairs reasks repair-rate\n");

for (const mode of ["on", "off", "unknown"] as const) {
  const bucket = buckets[mode];
  let repairTotal = 0;

  for (const hits of bucket.repairs.values()) {
    repairTotal += hits;
  }

  const cells = [
    mode.padEnd(9),
    String(bucket.calls).padStart(5),
    String(repairTotal).padStart(7),
    String(bucket.reasks).padStart(5),
    repairRate(bucket),
  ];

  process.stdout.write(`${cells.join(" ")}\n`);
}
237
+
238
// Merge the per-mode repair counts into one tally keyed by repair key.
const allRules = new Map<string, number>();

for (const bucket of Object.values(buckets)) {
  bucket.repairs.forEach((hits, key) => {
    allRules.set(key, (allRules.get(key) ?? 0) + hits);
  });
}

if (allRules.size > 0) {
  process.stdout.write("\n\nTOP REPAIR RULES (across all modes):\n");

  // Highest count first, capped at the 20 most frequent keys.
  const top = [...allRules.entries()]
    .sort((left, right) => right[1] - left[1])
    .slice(0, 20);

  for (const [key, hits] of top) {
    process.stdout.write(` ${String(hits).padStart(5)} ${key}\n`);
  }
}

// Closing legend explaining how to read the repair numbers.
process.stdout.write(
  [
    "\n",
    "repair-rate = total repairs / model calls. Re-run as new logs accumulate;\n",
    "track per-rule rates to identify systemic model failures worth adding L2\n",
    "safe-defaults for. L3 re-ask rate should trend to near-zero (recoverable\n",
    "repairs succeed; unrecoverable args are infrequent and addressed in prompting).\n",
  ].join("")
);
@@ -0,0 +1,279 @@
1
+ // Extract luck-INDEPENDENT mechanism signals from eval run logs, so harness
2
+ // changes can be judged by what they actually did — not by a single noisy
3
+ // turn-count. Reads each run dir's plain-text run.log (+ result.json) and
4
+ // tabulates, then summarizes the spread across runs.
5
+ //
6
+ // Run: bun run packages/core/scripts/analyze-runs.ts money 5
7
+ // (analyze the latest 5 `money-*` run dirs)
8
+ // Or: bun run packages/core/scripts/analyze-runs.ts <dir> <dir> ...
9
+ import { readdir } from "node:fs/promises";
10
+ import { join } from "node:path";
11
+ import { isRecord } from "../src/lib/guards";
12
+
13
// Root that run-dir names are resolved against: `evals`, three directory
// levels above this script's own directory.
const evalsRoot = join(import.meta.dir, "..", "..", "..", "evals");

/** One row of the analysis table: the signals extracted from a single run
 * dir's run.log, plus `regressions` / `quality` read from its result.json. */
interface IRunMetrics {
  /** Last path segment of the run dir. */
  runId: string;
  /** True when the log contains a `spec "…": done` or `· turn N: GREEN` line. */
  passed: boolean;
  /** Highest turn number seen on a timing line. */
  turns: number;
  /** Running total (s) taken from the last timing line that reported one. */
  totalSeconds: number;
  /** Highest per-turn gate error count seen — >1 proves the combined parser
   * surfaced structured, per-error feedback (not one opaque blob). */
  maxErrorsSurfaced: number;
  /** Times the model enumerated source lines by hand to locate an error —
   * should be 0 since gate feedback shows the offending line. */
  handCountingLines: number;
  /** Most file mutations applied in a SINGLE turn — >1 means it fixed several
   * sites at once instead of one-per-turn. */
  maxEditsPerTurn: number;
  /** Total `✎ edit` / `✚ create` lines across the whole log. */
  totalEdits: number;
  /** Longest single turn (s) — usually one heavy reasoning turn; shows that
   * wall-time variance is the model thinking, not harness churn. */
  slowestTurnSeconds: number;
  /** Char volume of the heaviest turn (reasoning+content). Shows whether a
   * thinking_token_budget binds (drops it) and flags spirals (huge value). */
  maxTurnChars: number;
  /** Tool calls the harness rejected (bad input / scope / match failure) — the
   * open-model tool-calling friction; 0 = clean. Repaired calls excluded. */
  toolRejects: number;
  /** Sum of the numeric `regressions` fields over result.json's `tasks`. */
  regressions: number;
  /** Numeric `quality` from result.json, or undefined when absent. */
  quality: number | undefined;
}
42
+
43
// Timing line: captures the turn number, the turn's duration with its unit
// (s or ms), and the running total with its unit.
const TIMING = /⏱ turn (\d+) took ([\d.]+)(s|ms) \(total ([\d.]+)(s|ms)\)/;
// Red-gate line: captures the error count reported for that turn.
const RED = /turn \d+: red \((\d+) error/;
// Start-of-turn marker: captures the turn number being requested.
const ASKING = /turn (\d+): asking model/;
// Hand-counting = the model re-typing the file with SEQUENTIAL line numbers
// (`1: …`, `2: …`) to LOCATE an error it can't see — the costly pattern the
// located-feedback fix removes. Deliberately excludes `Line 37:`-style citations
// of feedback-provided lines, which are the model USING the located errors.
const HAND_COUNT = /^\s*\d+:\s+(?:export|const|function|return|if|for|\}|\/\/)/;
51
+
52
/** Numbers pulled out of one timing line; a duration is null when its capture
 * group is absent. */
interface ITurnTiming {
  turn: number;
  tookSeconds: number | null;
  totalSeconds: number | null;
}

/**
 * Reads a `turn N took X (total Y)` line.
 *
 * @returns the turn number plus both durations in seconds (values logged in
 *   `ms` are divided by 1000), or null when the line is not a timing line.
 */
function parseTiming(line: string): ITurnTiming | null {
  const match = TIMING.exec(line);

  if (match === null || match[1] === undefined) {
    return null;
  }

  // Converts one (value, unit) capture pair to seconds.
  const toSeconds = (
    value: string | undefined,
    unit: string | undefined
  ): number | null => {
    if (value === undefined) {
      return null;
    }

    const amount = Number(value);

    return unit === "ms" ? amount / 1000 : amount;
  };

  return {
    turn: Number(match[1]),
    tookSeconds: toSeconds(match[2], match[3]),
    totalSeconds: toSeconds(match[4], match[5]),
  };
}
81
+
82
/**
 * Scans a run's plain-text log line by line and extracts every metric that
 * can be derived from the log alone.
 *
 * @param runId label stored on the returned metrics
 * @param log full text of run.log
 * @returns all IRunMetrics fields except `regressions` and `quality`
 */
function parseLog(
  runId: string,
  log: string
): Omit<IRunMetrics, "regressions" | "quality"> {
  const rejectedCall = /tool_input_rejected:|tool_rejected:/;
  const stats = {
    turns: 0,
    totalSeconds: 0,
    maxErrorsSurfaced: 0,
    handCountingLines: 0,
    maxEditsPerTurn: 0,
    totalEdits: 0,
    slowestTurnSeconds: 0,
    maxTurnChars: 0,
    toolRejects: 0,
  };
  // Per-turn accumulators, reset whenever a new "asking model" line appears.
  let editsInTurn = 0;
  let charsInTurn = 0;

  for (const row of log.split("\n")) {
    if (ASKING.test(row)) {
      // A new turn starts: bank the previous turn's edit count first.
      stats.maxEditsPerTurn = Math.max(stats.maxEditsPerTurn, editsInTurn);
      editsInTurn = 0;
      charsInTurn = 0;
    }

    // Every line of the turn (the marker line included) counts toward its
    // character volume.
    charsInTurn += row.length;

    const isMutation = row.includes("✎ edit") || row.includes("✚ create");

    if (isMutation) {
      stats.totalEdits += 1;
      editsInTurn += 1;
    }

    const timing = parseTiming(row);

    if (timing !== null) {
      stats.turns = Math.max(stats.turns, timing.turn);
      // The turn's volume is only banked when its timing line shows up.
      stats.maxTurnChars = Math.max(stats.maxTurnChars, charsInTurn);

      if (timing.tookSeconds !== null) {
        stats.slowestTurnSeconds = Math.max(
          stats.slowestTurnSeconds,
          timing.tookSeconds
        );
      }

      if (timing.totalSeconds !== null) {
        // Last timing line wins: it carries the latest running total.
        stats.totalSeconds = timing.totalSeconds;
      }
    }

    const redCount = RED.exec(row)?.[1];

    if (redCount !== undefined) {
      stats.maxErrorsSurfaced = Math.max(
        stats.maxErrorsSurfaced,
        Number(redCount)
      );
    }

    if (HAND_COUNT.test(row)) {
      stats.handCountingLines += 1;
    }

    if (rejectedCall.test(row)) {
      stats.toolRejects += 1;
    }
  }

  // The final turn has no following "asking model" line to bank its edits.
  stats.maxEditsPerTurn = Math.max(stats.maxEditsPerTurn, editsInTurn);

  const passed = /spec ".*": done/.test(log) || /· turn \d+: GREEN/.test(log);

  return { runId, passed, ...stats };
}
165
+
166
/**
 * Reads `regressions` and `quality` from a run dir's result.json.
 *
 * `regressions` is the sum of every numeric `regressions` field found on the
 * entries of the top-level `tasks` array; `quality` is the top-level numeric
 * `quality`, if any.
 *
 * @param dir run directory expected to contain result.json
 * @returns `{ regressions: 0, quality: undefined }` when the file is missing,
 *   is not valid JSON, or does not hold an object
 */
async function readResult(
  dir: string
): Promise<{ regressions: number; quality: number | undefined }> {
  const fallback = { regressions: 0, quality: undefined };
  const file = Bun.file(join(dir, "result.json"));

  if (!(await file.exists())) {
    return fallback;
  }

  let data: unknown;

  try {
    data = JSON.parse(await file.text());
  } catch {
    // A truncated or corrupt result.json must not abort the analysis of the
    // other runs; treat it like a missing file, but say so.
    process.stderr.write(
      `warning: unreadable result.json in ${dir}; using defaults\n`
    );

    return fallback;
  }

  if (!isRecord(data)) {
    return fallback;
  }

  const quality = typeof data.quality === "number" ? data.quality : undefined;
  let regressions = 0;

  if (Array.isArray(data.tasks)) {
    for (const task of data.tasks) {
      // Entries that are not records, or lack a numeric count, add nothing.
      if (isRecord(task) && typeof task.regressions === "number") {
        regressions += task.regressions;
      }
    }
  }

  return { regressions, quality };
}
194
+
195
/**
 * Turns the CLI arguments into the list of run directories to analyze.
 *
 * Two forms are accepted:
 *  - `<seed> <count>` (second argument all digits): the last `count`
 *    directories under `evalsRoot` whose name starts with `seed`, in sorted
 *    name order;
 *  - otherwise every argument is a directory: arguments starting with "/" are
 *    used as given, the rest are resolved against `evalsRoot`.
 *
 * @returns directory paths (empty when no arguments are given or count is 0)
 */
async function resolveDirs(): Promise<string[]> {
  const args = process.argv.slice(2);

  // `<seed> <count>` form: latest N run dirs whose name starts with the seed.
  if (args.length === 2 && /^\d+$/.test(args[1] ?? "")) {
    const prefix = args[0] ?? "";
    const count = Number(args[1]);

    // `slice(-0)` is `slice(0)` and would return EVERY match, so a request
    // for the latest 0 runs is answered explicitly.
    if (count === 0) {
      return [];
    }

    const all = await readdir(evalsRoot, { withFileTypes: true });
    const dirs = all
      .filter((d) => d.isDirectory() && d.name.startsWith(prefix))
      .map((d) => d.name)
      .sort();

    return dirs.slice(-count).map((name) => join(evalsRoot, name));
  }

  return args.map((a) => (a.startsWith("/") ? a : join(evalsRoot, a)));
}
213
+
214
/**
 * Median of a list of numbers; the input array is left untouched.
 *
 * @returns 0 for an empty list, the middle value for an odd-length list, and
 *   the mean of the two middle values for an even-length list
 */
function median(values: number[]): number {
  if (values.length === 0) {
    return 0;
  }

  const ordered = values.slice();

  ordered.sort((left, right) => left - right);

  const upper = Math.floor(ordered.length / 2);

  if (ordered.length % 2 === 1) {
    return ordered[upper] ?? 0;
  }

  const low = ordered[upper - 1] ?? 0;
  const high = ordered[upper] ?? 0;

  return (low + high) / 2;
}
226
+
227
// Build one metrics row per run dir; dirs without a run.log are skipped.
const dirs = await resolveDirs();
const metrics: IRunMetrics[] = [];

for (const dir of dirs) {
  const logFile = Bun.file(join(dir, "run.log"));
  const hasLog = await logFile.exists();

  if (hasLog) {
    // The run id is the last "/"-separated segment of the dir path.
    const runId = dir.split("/").pop() ?? dir;
    const fromLog = parseLog(runId, await logFile.text());
    const fromResult = await readResult(dir);

    metrics.push({ ...fromLog, ...fromResult });
  }
}
243
+
244
// Per-run table: a header, then one fixed-width row per analyzed run.
process.stdout.write(`\n=== run analysis (${metrics.length} runs) ===\n\n`);
process.stdout.write(
  "pass turns time(s) slowTurn(s) maxTurnChars maxErr handCount toolRej maxEdits/turn edits regress Q\n"
);

for (const row of metrics) {
  const mark = row.passed ? " ✓ " : " ✗ ";
  const qualityCell = row.quality === undefined ? "-" : String(row.quality);
  // Cells are padded individually and concatenated with no separator.
  const cells = [
    mark,
    String(row.turns).padStart(5),
    row.totalSeconds.toFixed(0).padStart(8),
    row.slowestTurnSeconds.toFixed(0).padStart(12),
    String(row.maxTurnChars).padStart(13),
    String(row.maxErrorsSurfaced).padStart(7),
    String(row.handCountingLines).padStart(10),
    String(row.toolRejects).padStart(8),
    String(row.maxEditsPerTurn).padStart(14),
    String(row.totalEdits).padStart(7),
    String(row.regressions).padStart(8),
    qualityCell.padStart(3),
    ` ${row.runId}`,
  ];

  process.stdout.write(`${cells.join("")}\n`);
}
268
+
269
// Spread summary across all analyzed runs.
const turns = metrics.map((m) => m.turns);
const times = metrics.map((m) => m.totalSeconds);
const passRate = metrics.filter((m) => m.passed).length;

if (metrics.length === 0) {
  // Math.min() / Math.max() with no arguments are Infinity / -Infinity, which
  // would print a nonsensical "min Infinity … spread -Infinity" line.
  process.stdout.write("\nno runs to summarize (no run.log found)\n");
} else {
  const minTurns = Math.min(...turns);
  const maxTurns = Math.max(...turns);

  process.stdout.write(
    `\nturns: min ${minTurns} median ${median(turns)} max ${maxTurns} (spread ${maxTurns - minTurns})\n`
  );
  process.stdout.write(
    `time: min ${Math.min(...times).toFixed(0)}s median ${median(times).toFixed(0)}s max ${Math.max(...times).toFixed(0)}s\n`
  );
  process.stdout.write(`pass: ${passRate}/${metrics.length}\n`);
}