@agjs/tsforge 0.2.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,6 +2,7 @@ import { join, dirname } from "node:path";
2
2
  import { existsSync } from "node:fs";
3
3
  import { ESLint } from "eslint";
4
4
  import { WEB_TEMPLATES, type WebFramework } from "./web-templates";
5
+ import { isRecord } from "./lib/guards";
5
6
 
6
7
  /**
7
8
  * Build the gate that confirms "done" — and makes tsforge a TypeScript-SPECIALIZED
@@ -106,10 +107,16 @@ const STRICT_TSCONFIG = `{
106
107
  /** Strict overlay for a project that ALREADY has a tsconfig: extend it (so the
107
108
  * project's paths/jsx/module/lib still resolve — a bare strict config would
108
109
  * mis-compile a real app) but FORCE every strictness flag on top, so a loosely-
109
- * configured repo still gets tsforge's strict-TS floor. Written as a sibling
110
- * `tsforge.tsconfig.json` and gated with `tsc -p`. */
111
- const STRICT_TSCONFIG_OVERRIDE = `{
112
- "extends": "./tsconfig.json",
110
+ * configured repo still gets tsforge's strict-TS floor.
111
+ *
112
+ * PERSISTENCE POLICY: written under `.tsforge/` (tsforge's cache namespace), NOT
113
+ * as a sibling in the project root — so the gate never litters the user's repo
114
+ * with a `tsforge.tsconfig.json`. `extends` points one level up to the project's
115
+ * own config, and `include`/`exclude` are re-stated relative to the subdir
116
+ * because `extends` does not inherit them (they default to the config's own
117
+ * directory otherwise — which under `.tsforge/` would compile nothing). */
118
+ const STRICT_TSCONFIG_OVERLAY = `{
119
+ "extends": "../tsconfig.json",
113
120
  "compilerOptions": {
114
121
  "strict": true,
115
122
  "noUncheckedIndexedAccess": true,
@@ -119,10 +126,16 @@ const STRICT_TSCONFIG_OVERRIDE = `{
119
126
  "erasableSyntaxOnly": true,
120
127
  "skipLibCheck": true,
121
128
  "noEmit": true
122
- }
129
+ },
130
+ "include": ["../**/*.ts", "../**/*.tsx"],
131
+ "exclude": ["../node_modules", "../dist", "../build", "../scratch", "../.tsforge"]
123
132
  }
124
133
  `;
125
134
 
135
+ /** The gate overlay's home: tsforge's cache dir + the overlay filename. Both are joined onto the project `cwd` when the overlay (and its scoped `.gitignore`) is written, and are interpolated as `<dir>/<file>` into the `tsc -p` gate command. */
136
+ const GATE_TSCONFIG_DIR = ".tsforge";
137
+ const GATE_TSCONFIG_FILE = "tsconfig.gate.json";
138
+
126
139
  // The web-stack scaffolds (Vite + React full-kit, or Vite vanilla) live in the
127
140
  // registry; this module just lays them down and builds their gate. shadcn/TanStack
128
141
  // boilerplate is held to a web-tailored strict config (no `I`-prefix — React names
@@ -373,7 +386,12 @@ export function buildWebGate(framework: WebFramework): IGate {
373
386
  // HARNESS-authored and app-agnostic: we deliberately do NOT run a model-authored
374
387
  // checks.json — the 27b writes over-strict interaction assertions (exact
375
388
  // placeholders/fill flows) it then can't satisfy and spirals on (iter3/4).
376
- const render = `bun "${BROWSER_CHECK}" dist/index.html --smoke --crawl`;
389
+ // OPT-IN quality oracles (default OFF so existing web runs are unchanged):
390
+ // TSFORGE_A11Y=1 adds axe (serious/critical fail), TSFORGE_SCREENSHOTS=1 writes
391
+ // per-route PNGs. A "frontend"/"strict" profile can set these.
392
+ const a11y = process.env.TSFORGE_A11Y === "1" ? " --a11y" : "";
393
+ const shots = process.env.TSFORGE_SCREENSHOTS === "1" ? " --screenshots" : "";
394
+ const render = `bun "${BROWSER_CHECK}" dist/index.html --smoke --crawl${a11y}${shots}`;
377
395
  // Prettier enforces formatting (the fix step runs `prettier --write` first, so
378
396
  // this passes without the model ever hand-formatting). Respects .prettierignore
379
397
  // (vendored ui/ + lib/ skipped). Runs after lint so a parse error fails there.
@@ -444,6 +462,22 @@ export function buildWebFix(framework: WebFramework): string {
444
462
  return `${lintFix} ; ${format}`;
445
463
  }
446
464
 
465
/**
 * The core (non-web) auto-fix command — the same janitor as buildWebFix, but
 * driven by the bundled strict.eslint.config.mjs. Run BEFORE the gate each cycle
 * so padding-line, prefer-const, curly, etc. are squashed without model turns.
 *
 * The two steps are joined with `;` rather than `&&`, so the Prettier step is
 * still attempted after an `eslint --fix` that exits non-zero.
 *
 * @returns A shell command string: `eslint --fix .` followed by `prettier --write .`.
 */
export function buildCoreFix(): string {
  // The template is already a single line, so it needs no whitespace
  // normalisation. Collapsing `\s+` over the finished string would also rewrite
  // whitespace INSIDE the quoted binary/config paths (a directory containing a
  // tab or a double space) and point the command at a path that does not exist.
  const lintFix = `"${ESLINT_BIN}" --no-config-lookup -c "${STRICT_CONFIG}" --fix .`;
  const format = `"${PRETTIER_BIN}" --write .`;

  return `${lintFix} ; ${format}`;
}
480
+
447
481
  async function ensureFile(
448
482
  cwd: string,
449
483
  name: string,
@@ -468,7 +502,7 @@ export async function buildGate(
468
502
  cwd: string,
469
503
  packs?: readonly string[],
470
504
  ruleOverrides?: Readonly<Record<string, "error" | "warn" | "off">>,
471
- options?: { enableTypeAware?: boolean }
505
+ options?: { enableTypeAware?: boolean; includeTests?: boolean }
472
506
  ): Promise<IGate> {
473
507
  const parts: string[] = [];
474
508
  const labels: string[] = [];
@@ -494,29 +528,95 @@ export async function buildGate(
494
528
  }
495
529
  }
496
530
 
531
+ // Tests run LAST (after the cheap static floor) so a type/lint error fails
532
+ // fast without paying for a test run. Only appended when the project actually
533
+ // has tests to run — a strict-floor-only run, or a project with none, skips it.
534
+ if (options?.includeTests === true) {
535
+ const test = await discoverTestCommand(cwd);
536
+
537
+ if (test !== null) {
538
+ parts.push(test);
539
+ labels.push("tests");
540
+ }
541
+ }
542
+
497
543
  return { command: parts.join(" && "), label: labels.join(" + ") };
498
544
  }
499
545
 
546
/** Matches the stub `test` script that `npm init` writes. Running it always
 * fails, so a script matching this is treated as "no tests". */
const PLACEHOLDER_TEST = /no test specified/i;

/**
 * Resolve the test command the gate should append, or null when there is
 * nothing worth running.
 *
 * Order of preference:
 *  1. a non-empty, non-placeholder package.json `test` script → `bun run test`
 *  2. at least one test file on disk → `bun test`
 *  3. otherwise null, so a project without tests stays at the strict floor.
 */
export async function discoverTestCommand(cwd: string): Promise<string | null> {
  if (await hasRealTestScript(cwd)) {
    return "bun run test";
  }

  const testFilesPresent = await hasTestFiles(cwd);

  return testFilesPresent ? "bun test" : null;
}

/** True when `<cwd>/package.json` exists and declares a usable `test` script.
 * An unreadable/malformed manifest counts as "no script" so the caller falls
 * through to file detection. */
async function hasRealTestScript(cwd: string): Promise<boolean> {
  const manifest = Bun.file(join(cwd, "package.json"));

  if (!(await manifest.exists())) {
    return false;
  }

  try {
    const parsed: unknown = await manifest.json();
    const scripts = isRecord(parsed) ? parsed.scripts : undefined;
    const testScript = isRecord(scripts) ? scripts.test : undefined;

    if (typeof testScript !== "string") {
      return false;
    }

    if (testScript.trim().length === 0) {
      return false;
    }

    return !PLACEHOLDER_TEST.test(testScript);
  } catch {
    // Malformed package.json — report "no script" so file detection runs.
    return false;
  }
}
580
+
581
/**
 * True when the project has at least one *.test.* / *.spec.* file that is not
 * inside a `node_modules` directory — the signal that a bare `bun test` has
 * something to run.
 *
 * @param cwd Directory the glob is scanned from.
 * @returns `true` on the first qualifying match; `false` when the scan finishes
 *   without one.
 */
async function hasTestFiles(cwd: string): Promise<boolean> {
  const glob = new Bun.Glob("**/*.{test,spec}.{ts,tsx,js,jsx}");
  // Accept either separator so the segment split does not depend on the platform.
  const separator = /[\\/]/;

  for await (const path of glob.scan({ cwd, onlyFiles: true })) {
    // Compare whole path segments: a substring check would also discard real
    // tests such as `src/node_modules_shim/a.test.ts` or `node_modules.test.ts`.
    const insideDependency = path.split(separator).includes("node_modules");

    if (!insideDependency) {
      return true;
    }
  }

  return false;
}
594
+
500
595
  /**
501
596
  * The type-aware floor — ALWAYS tsforge-strict (user policy: a repo's own config
502
- * is never trusted to be strict enough). With a project tsconfig, extend it but
503
- * force the strict flags; greenfield, bring the full strict one. null when not a
504
- * TS project. (The strict override / bundled config win over whatever the repo set.)
597
+ * is never trusted to be strict enough). With a project tsconfig, extend it under
598
+ * `.tsforge/` but force the strict flags; greenfield, bring the full strict one.
599
+ * null when not a TS project. (The strict overlay / bundled config win over
600
+ * whatever the repo set.)
505
601
  */
506
602
  async function tscPart(cwd: string): Promise<string | null> {
507
603
  const hasTsconfig = await Bun.file(join(cwd, "tsconfig.json")).exists();
508
604
 
509
605
  if (hasTsconfig) {
606
+ // EPHEMERAL gate artifact: lives in .tsforge/ (Bun.write makes the dir), so
607
+ // we never drop a tsforge.tsconfig.json in the user's project root.
510
608
  await Bun.write(
511
- join(cwd, "tsforge.tsconfig.json"),
512
- STRICT_TSCONFIG_OVERRIDE
609
+ join(cwd, GATE_TSCONFIG_DIR, GATE_TSCONFIG_FILE),
610
+ STRICT_TSCONFIG_OVERLAY
513
611
  );
612
+ await ignoreGateArtifact(cwd);
514
613
 
515
- return `"${TSC_BIN}" --noEmit -p tsforge.tsconfig.json`;
614
+ return `"${TSC_BIN}" --noEmit -p ${GATE_TSCONFIG_DIR}/${GATE_TSCONFIG_FILE}`;
516
615
  }
517
616
 
518
617
  // Greenfield: bring a strict tsconfig so tsc can gate — but only when this is
519
618
  // actually a TS project (has a package.json), so we never litter a random dir.
619
+ // Unlike the overlay, a greenfield tsconfig.json is a DURABLE project file.
520
620
  if (await Bun.file(join(cwd, "package.json")).exists()) {
521
621
  await Bun.write(join(cwd, "tsconfig.json"), STRICT_TSCONFIG);
522
622
 
@@ -526,6 +626,20 @@ async function tscPart(cwd: string): Promise<string | null> {
526
626
  return null;
527
627
  }
528
628
 
629
/**
 * Keep the ephemeral gate overlay out of git without editing the user's root
 * .gitignore: write a scoped `.gitignore` inside the gate directory that lists
 * only the overlay file.
 *
 * The file is written only when it does not exist yet, so a `.gitignore` the
 * user already placed there is left exactly as it is.
 */
async function ignoreGateArtifact(cwd: string): Promise<void> {
  const scopedIgnore = join(cwd, GATE_TSCONFIG_DIR, ".gitignore");
  const alreadyPresent = await Bun.file(scopedIgnore).exists();

  if (!alreadyPresent) {
    await Bun.write(scopedIgnore, `${GATE_TSCONFIG_FILE}\n`);
  }
}
642
+
529
643
  /** The syntactic idiom layer — ALWAYS tsforge's bundled strict eslint config
530
644
  * (user policy). We deliberately do NOT defer to the project's own `lint`
531
645
  * script: that's exactly how a weak repo would dodge the strict-TS floor. The
@@ -1,3 +1,5 @@
1
+ import type { FailureClass } from "./failure-class";
2
+
1
3
  export interface IJudgeInput {
2
4
  goal: string;
3
5
  criteria: string;
@@ -21,6 +23,9 @@ export interface IRunRecord {
21
23
  ms: number;
22
24
  /** LLM-judge quality score (1–5), when available. */
23
25
  quality?: number;
26
+ /** Structured reason a failed run failed (from classifyRun); omitted/`none`
27
+ * for a passing run. The substrate for turning failures into interventions. */
28
+ failureClass?: FailureClass;
24
29
  }
25
30
 
26
31
  /** Aggregated metrics for a variant across its runs. */
@@ -33,4 +38,8 @@ export interface IVariantSummary {
33
38
  avgMs: number;
34
39
  /** Average quality across runs that were scored (0 if none). */
35
40
  avgQuality: number;
41
+ /** Count of failed runs by failure class (e.g. {"type-error": 2}); empty when
42
+ * no run carried a class. Lets a sweep show WHY a variant failed, not just how
43
+ * often. */
44
+ failureClasses: Record<string, number>;
36
45
  }
@@ -0,0 +1,263 @@
1
+ import type { ILoopEvent } from "../loop/loop.types";
2
+ import type { ErrorSet } from "../validate/validate.types";
3
+
4
+ /**
5
+ * Why a run failed — a structured reason, so every failed run maps to a possible
6
+ * harness intervention (the self-improving north-star). Derived purely from the
7
+ * event stream (+ an optional final gate error set), so the same classifier
8
+ * serves the live loop, the eval sweep, and the offline log analyzer.
+ *
+ * NOTE(review): `classifyRun` in this file never returns `timeout` or `unknown`
+ * — no signal is gathered for a timeout, and the final fallback is
+ * `no-progress`. Those two values are declared here but currently unproduced
+ * by the classifier; confirm whether another producer sets them.
9
+ */
10
+ export const FAILURE_CLASS = {
11
+ /** The run reached a green gate — no failure. */
12
+ none: "none",
13
+ /** Model emitted tool calls the parser couldn't read (repair L3 / salvage). */
14
+ toolMalformed: "tool-malformed",
15
+ /** Edits kept missing their target (missing-file / not-found / ambiguous). */
16
+ editReject: "edit-reject",
17
+ /** Hit the turn cap or the gate stalled with no decisive error class. */
18
+ noProgress: "no-progress",
19
+ /** Final gate red dominated by tsc type errors. */
20
+ typeError: "type-error",
21
+ /** Final gate red dominated by ESLint rule violations. */
22
+ lintRule: "lint-rule",
23
+ /** Imported a module that doesn't exist (TS2307 / "Cannot find module"). */
24
+ hallucinatedImport: "hallucinated-import",
25
+ /** Output degenerated into a repetition loop (StreamGuard fired). */
26
+ degeneration: "degeneration",
27
+ /** A per-call/timeout backstop tripped. */
28
+ timeout: "timeout",
29
+ /** A route rendered as an empty/phantom page. */
30
+ routePhantom: "route-phantom",
31
+ /** The built app failed to render / threw in the browser oracle. */
32
+ browserFail: "browser-fail",
33
+ /** The bundler/build step (vite) failed. */
34
+ buildFail: "build-fail",
35
+ /** Failed, but no signal was decisive. */
36
+ unknown: "unknown",
37
+ } as const;
38
+
39
+ export type FailureClass = (typeof FAILURE_CLASS)[keyof typeof FAILURE_CLASS]; // union of the string values above
40
+
41
+ /** Per-signal tallies behind a classification — kept for debugging/telemetry. Filled in by `gatherSignals` from the event stream and the final gate rules. */
42
+ export interface IFailureSignals {
43
+ repairs: number; // number of `repair` events
44
+ salvages: number; // `tool` events whose message matches the tool-malformed pattern
45
+ editRejects: number; // `edit` events whose message matches the reject pattern
46
+ degenerated: boolean; // true when any event message matches the degeneration pattern
47
+ tsErrors: number; // final-gate rules shaped like a TS code, excluding TS2307
48
+ lintErrors: number; // final-gate rules that are not shaped like a TS code
49
+ missingModule: number; // TS2307 occurrences, plus 1 when the run text says "cannot find module"
50
+ browser: number; // 1 when the run text matches the browser-failure pattern, else 0
51
+ build: number; // 1 when the run text matches the build-failure pattern, else 0
52
+ }
53
+
54
+ export interface IFailureSummary { // what classifyRun returns: the class, an optional dominant code, and the raw tallies
55
+ failureClass: FailureClass;
56
+ /** The dominant rule/code for type-error|lint-rule (e.g. "TS18048", "no-as"). Only the gate-error classification sets it; every other class leaves it absent. */
57
+ detail?: string;
58
+ signals: IFailureSignals; // the tallies the class was derived from
59
+ }
60
+
61
+ const TS_CODE = /^TS\d+$/; // a whole rule id of the form "TS" + digits; splits gate rules into tsc codes vs lint rules
62
+ const MISSING_MODULE = /cannot find module/i; // tested against the concatenated run text
63
+ const DEGENERATE = /degenerat/i; // tested against every event's message
64
+ const TOOL_MALFORMED = /salvage|recovered|malformed|re-ask/i; // tested against `tool` event messages
65
+ const REJECTED = /reject/i; // tested against `edit` event messages
66
+ const BROWSER = /blank|did not render|did not mount|page error|uncaught/i; // tested against the concatenated run text
67
+ const ROUTE = /route|phantom|stub/i; // consulted only after BROWSER matched, to pick route-phantom over browser-fail
68
+ const BUILD = /vite|esbuild|build failed|bundl/i; // tested against the concatenated run text
69
+
70
/**
 * Return the value that occurs most often in `values`, or undefined when the
 * list is empty.
 *
 * Ties go to whichever value reached the winning count FIRST while scanning
 * left to right, because a later value only takes over on a strictly higher
 * running count.
 */
function mostCommon(values: readonly string[]): string | undefined {
  const tally = new Map<string, number>();

  const [winner] = values.reduce<[string | undefined, number]>(
    (leader, candidate) => {
      const seen = (tally.get(candidate) ?? 0) + 1;

      tally.set(candidate, seen);

      // Strictly greater: an equal count never displaces the current leader.
      return seen > leader[1] ? [candidate, seen] : leader;
    },
    [undefined, 0]
  );

  return winner;
}
89
+
90
/**
 * The rule ids of the final red gate.
 *
 * When the caller supplies `finalErrors`, its defined `rule` values are used as
 * is. Otherwise the rules come from the LAST `validated` event that failed and
 * carried a `rules` list; with no such event the result is empty. The returned
 * array is always a fresh copy.
 */
function finalRules(
  events: readonly ILoopEvent[],
  finalErrors?: ErrorSet
): string[] {
  if (finalErrors !== undefined) {
    return finalErrors.flatMap((entry) =>
      entry.rule === undefined ? [] : [entry.rule]
    );
  }

  // Walk backwards: the first hit from the end is the last failing gate.
  for (let index = events.length - 1; index >= 0; index -= 1) {
    const candidate = events[index];

    if (
      candidate !== undefined &&
      candidate.kind === "validated" &&
      candidate.passed === false &&
      candidate.rules
    ) {
      return [...candidate.rules];
    }
  }

  return [];
}
110
+
111
/**
 * All free text from a run joined with newlines: each event's `message`
 * (followed by its `output` when present), then the message of every final
 * error. Used for keyword signals that have no dedicated structured field
 * (missing module, browser, build).
 */
function runText(
  events: readonly ILoopEvent[],
  finalErrors?: ErrorSet
): string {
  const fromEvents = events.flatMap((event) =>
    event.output === undefined
      ? [event.message]
      : [event.message, event.output]
  );
  const fromErrors: string[] = [];

  for (const failure of finalErrors ?? []) {
    fromErrors.push(failure.message);
  }

  return [...fromEvents, ...fromErrors].join("\n");
}
133
+
134
/**
 * Compute every tally the classifier reads, from the event stream plus the
 * optional final gate error set.
 *
 * Rule-based counts come from `finalRules`; keyword-based flags come from the
 * concatenated `runText`; the behavioural counts come from event kinds and
 * messages.
 */
function gatherSignals(
  events: readonly ILoopEvent[],
  finalErrors?: ErrorSet
): IFailureSignals {
  // Partition the final gate's rules: TS2307 is counted on its own, every other
  // TS code is a type error, and anything else is a lint rule.
  let tsErrors = 0;
  let lintErrors = 0;
  let unresolvedImports = 0;

  for (const rule of finalRules(events, finalErrors)) {
    if (rule === "TS2307") {
      unresolvedImports += 1;
    } else if (TS_CODE.test(rule)) {
      tsErrors += 1;
    } else {
      lintErrors += 1;
    }
  }

  let repairs = 0;
  let salvages = 0;
  let editRejects = 0;
  let degenerated = false;

  for (const event of events) {
    if (event.kind === "repair") {
      repairs += 1;
    } else if (event.kind === "tool") {
      if (TOOL_MALFORMED.test(event.message)) {
        salvages += 1;
      }
    } else if (event.kind === "edit") {
      if (REJECTED.test(event.message)) {
        editRejects += 1;
      }
    }

    // Checked for every event, whatever its kind.
    if (!degenerated && DEGENERATE.test(event.message)) {
      degenerated = true;
    }
  }

  const text = runText(events, finalErrors);

  return {
    repairs,
    salvages,
    editRejects,
    degenerated,
    tsErrors,
    lintErrors,
    missingModule: unresolvedImports + (MISSING_MODULE.test(text) ? 1 : 0),
    browser: BROWSER.test(text) ? 1 : 0,
    build: BUILD.test(text) ? 1 : 0,
  };
}
160
+
161
/**
 * The run's terminal status: the kind of the LAST `done` or `stuck` event in
 * the stream, or "none" when the stream contains neither.
 */
function finalStatusOf(
  events: readonly ILoopEvent[]
): "done" | "stuck" | "none" {
  // Scan from the end so the first terminal event found is the latest one.
  for (let index = events.length - 1; index >= 0; index -= 1) {
    const kind = events[index]?.kind;

    if (kind === "done" || kind === "stuck") {
      return kind;
    }
  }

  return "none";
}
176
+
177
/**
 * Decide whether the final gate is dominated by type errors or lint errors and
 * attach the most common code of that family as `detail`.
 *
 * Type errors win when there is at least one and they are no fewer than the
 * lint errors; otherwise any lint error yields `lint-rule`. Returns undefined
 * when the gate carried neither.
 */
function classifyGateErrors(
  events: readonly ILoopEvent[],
  finalErrors: ErrorSet | undefined,
  signals: IFailureSignals
): IFailureSummary | undefined {
  const { tsErrors, lintErrors } = signals;
  let failureClass: FailureClass;

  if (tsErrors > 0 && tsErrors >= lintErrors) {
    failureClass = FAILURE_CLASS.typeError;
  } else if (lintErrors > 0) {
    failureClass = FAILURE_CLASS.lintRule;
  } else {
    return undefined;
  }

  // Keep only the rule family that matches the chosen class.
  const wantTsCodes = failureClass === FAILURE_CLASS.typeError;
  const family = finalRules(events, finalErrors).filter(
    (rule) => TS_CODE.test(rule) === wantTsCodes
  );

  return { failureClass, detail: mostCommon(family), signals };
}
203
+
204
/**
 * Behavioural fallback used when no gate-error class dominates. The checks are
 * ordered — degeneration, then malformed tool calls (salvages or repairs), then
 * rejected edits — and the first one that fired wins; with none, the run is
 * `no-progress`.
 */
function classifyBehavior(signals: IFailureSignals): FailureClass {
  const ladder: readonly (readonly [boolean, FailureClass])[] = [
    [signals.degenerated, FAILURE_CLASS.degeneration],
    [signals.salvages > 0 || signals.repairs > 0, FAILURE_CLASS.toolMalformed],
    [signals.editRejects > 0, FAILURE_CLASS.editReject],
  ];
  const firstHit = ladder.find(([fired]) => fired);

  return firstHit === undefined ? FAILURE_CLASS.noProgress : firstHit[1];
}
220
+
221
/**
 * Classify a run from its event stream.
 *
 * Supply the final gate `ErrorSet` when the caller has it (it is treated as
 * authoritative); without it, the rules carried on the last failing `validated`
 * event and keyword signals are used instead.
 *
 * Precedence, first match wins:
 *  1. last terminal event is `done`      → `none`
 *  2. a missing-module signal            → `hallucinated-import`
 *  3. a browser signal                   → `route-phantom` when the run text also
 *                                          matches the route pattern, else `browser-fail`
 *  4. a build signal with a clean gate   → `build-fail`
 *  5. type/lint errors in the final gate → `type-error` / `lint-rule`
 *  6. otherwise                          → the behavioural fallback
 */
export function classifyRun(
  events: readonly ILoopEvent[],
  finalErrors?: ErrorSet
): IFailureSummary {
  const signals = gatherSignals(events, finalErrors);
  const verdict = (failureClass: FailureClass): IFailureSummary => ({
    failureClass,
    signals,
  });

  if (finalStatusOf(events) === "done") {
    return verdict(FAILURE_CLASS.none);
  }

  if (signals.missingModule > 0) {
    return verdict(FAILURE_CLASS.hallucinatedImport);
  }

  if (signals.browser > 0) {
    const mentionsRoute = ROUTE.test(runText(events, finalErrors));

    return verdict(
      mentionsRoute ? FAILURE_CLASS.routePhantom : FAILURE_CLASS.browserFail
    );
  }

  const gateHasNoRules = signals.tsErrors === 0 && signals.lintErrors === 0;

  if (signals.build > 0 && gateHasNoRules) {
    return verdict(FAILURE_CLASS.buildFail);
  }

  return (
    classifyGateErrors(events, finalErrors, signals) ??
    verdict(classifyBehavior(signals))
  );
}
package/src/eval/index.ts CHANGED
@@ -2,6 +2,14 @@ export * from "./eval.types";
2
2
  export { judge } from "./judge";
3
3
  export { summarize } from "./score";
4
4
  export { analyzeEvents, type IRunMetrics } from "./metrics";
5
+ export {
6
+ classifyRun,
7
+ FAILURE_CLASS,
8
+ type FailureClass,
9
+ type IFailureSummary,
10
+ type IFailureSignals,
11
+ } from "./failure-class";
12
+ export { parseEventLog } from "./parse-log";
5
13
  export {
6
14
  buildSweepReport,
7
15
  renderSweepReportMarkdown,
@@ -1,4 +1,5 @@
1
1
  import type { ILoopEvent } from "../loop/loop.types";
2
+ import { classifyRun, type FailureClass } from "./failure-class";
2
3
 
3
4
  /** Behavioral metrics distilled from a run's event stream — the signals the
4
5
  * local-model literature says predict outcomes (tokens-to-solution, repair
@@ -6,6 +7,10 @@ import type { ILoopEvent } from "../loop/loop.types";
6
7
  * the cli-metrics script. */
7
8
  export interface IRunMetrics {
8
9
  finalStatus: "done" | "stuck" | "none";
10
+ /** Structured reason the run failed (`none` when it reached green). The single
11
+ * source of truth for failure classification — the cli-metrics analyzer and
12
+ * the eval sweep both read this rather than re-deriving it. */
13
+ failureClass: FailureClass;
9
14
  /** Model turns (one per `cycle` event). */
10
15
  turns: number;
11
16
  /** Model calls (one per `usage` event). */
@@ -29,6 +34,7 @@ export interface IRunMetrics {
29
34
  function emptyMetrics(): IRunMetrics {
30
35
  return {
31
36
  finalStatus: "none",
37
+ failureClass: "none",
32
38
  turns: 0,
33
39
  modelCalls: 0,
34
40
  tokensOut: 0,
@@ -82,6 +88,7 @@ export function analyzeEvents(events: readonly ILoopEvent[]): IRunMetrics {
82
88
 
83
89
  m.filesCreated = created.size;
84
90
  m.avgTokensPerSecond = tpsCount > 0 ? Math.round(tpsSum / tpsCount) : 0;
91
+ m.failureClass = classifyRun(events).failureClass;
85
92
 
86
93
  return m;
87
94
  }
@@ -0,0 +1,105 @@
1
+ import type { ILoopEvent } from "../loop/loop.types";
2
+ import { isRecord } from "../lib/guards";
3
+
4
+ /** The known event kinds, as a runtime set, so a JSONL line can be validated
5
+ * into a typed ILoopEvent without an `as` cast. Keep in sync with ILoopEvent.
+ * The set is typed `Set<string>` so `has` accepts any string; nothing here makes
+ * the compiler check this list against `ILoopEvent["kind"]`, so the sync is manual. */
6
+ const KNOWN_KINDS = new Set<string>([
7
+ "start",
8
+ "red",
9
+ "cycle",
10
+ "token",
11
+ "message",
12
+ "fix",
13
+ "edit",
14
+ "create",
15
+ "validated",
16
+ "done",
17
+ "stuck",
18
+ "run",
19
+ "tool",
20
+ "repair",
21
+ "timing",
22
+ "usage",
23
+ "ttsr",
24
+ ]);
25
+
26
+ function isKind(value: string): value is ILoopEvent["kind"] { // type guard: narrows by membership in KNOWN_KINDS
27
+ return KNOWN_KINDS.has(value);
28
+ }
29
+
30
/** Pass a string through unchanged (including the empty string); map every
 * other value to undefined. */
function optionalString(value: unknown): string | undefined {
  if (typeof value === "string") {
    return value;
  }

  return undefined;
}
33
+
34
/** For an array input, return a new array holding only its string elements (in
 * order); for anything that is not an array, return undefined. */
function stringArray(value: unknown): string[] | undefined {
  const onlyStrings = (item: unknown): item is string =>
    typeof item === "string";

  return Array.isArray(value) ? value.filter(onlyStrings) : undefined;
}
41
+
42
/**
 * Turn one parsed JSONL record into an ILoopEvent, or null when it is not one
 * (not a record, or its `kind` is missing / not a known kind).
 *
 * Only the fields the failure classifier and metrics consume are copied:
 * `kind`, `task` and `message` (both default to ""), plus `output`, `passed`
 * and `rules` when they are present with the expected type.
 */
function coerceEvent(record: unknown): ILoopEvent | null {
  if (!isRecord(record)) {
    return null;
  }

  const kind = record.kind;

  if (typeof kind !== "string") {
    return null;
  }

  if (!isKind(kind)) {
    return null;
  }

  const coerced: ILoopEvent = {
    kind,
    task: optionalString(record.task) ?? "",
    message: optionalString(record.message) ?? "",
  };

  // Optional fields are attached only when valid, in a fixed order, so an
  // absent field stays absent rather than becoming an explicit undefined.
  const outputText = optionalString(record.output);

  if (outputText !== undefined) {
    coerced.output = outputText;
  }

  if (typeof record.passed === "boolean") {
    coerced.passed = record.passed;
  }

  const ruleList = stringArray(record.rules);

  if (ruleList !== undefined) {
    coerced.rules = ruleList;
  }

  return coerced;
}
78
+
79
/**
 * Parse a `--log` JSONL transcript (one serialized event per line) into a typed
 * event stream, preserving line order.
 *
 * Blank lines, lines that are not valid JSON, and records that do not coerce to
 * an event are all dropped silently.
 */
export function parseEventLog(jsonl: string): ILoopEvent[] {
  return jsonl.split("\n").flatMap((rawLine) => {
    if (rawLine.trim().length === 0) {
      return [];
    }

    let decoded: unknown;

    try {
      decoded = JSON.parse(rawLine);
    } catch {
      // Not JSON — skip this line and keep going.
      return [];
    }

    const event = coerceEvent(decoded);

    return event === null ? [] : [event];
  });
}
@@ -164,5 +164,24 @@ export function renderSweepReportMarkdown(report: ISweepReport): string {
164
164
  ...rows,
165
165
  "",
166
166
  "`*` = significant at p < 0.05 (two-proportion z-test vs baseline).",
167
+ ...failureSection(report),
167
168
  ].join("\n");
168
169
  }
170
+
171
/**
 * Format a variant's failure-class tally, e.g. "type-error×2, no-progress×1".
 *
 * Entries are ordered by count, highest first. Equal counts are ordered by
 * class name, so the rendered line does not depend on the order in which the
 * classes were first inserted into the tally (which follows run order).
 *
 * @param classes Failure class → number of runs that carried it.
 * @returns The comma-separated tally, or "" for an empty record.
 */
function formatFailureClasses(classes: Record<string, number>): string {
  return Object.entries(classes)
    .sort(([nameA, countA], [nameB, countB]) => {
      if (countA !== countB) {
        return countB - countA;
      }

      // Plain code-unit comparison keeps the tie-break locale-independent.
      if (nameA === nameB) {
        return 0;
      }

      return nameA < nameB ? -1 : 1;
    })
    .map(([cls, n]) => `${cls}×${String(n)}`)
    .join(", ");
}
178
+
179
/**
 * The "why failures happened" section: one bullet per variant that has a
 * non-empty failure-class tally, under a "Failure breakdown" heading.
 *
 * Returns no lines at all when no variant has a tally, so a clean sweep stays
 * terse.
 */
function failureSection(report: ISweepReport): string[] {
  const bullets: string[] = [];

  for (const variant of report.variants) {
    if (Object.keys(variant.failureClasses).length === 0) {
      continue;
    }

    bullets.push(
      `- **${variant.label}**: ${formatFailureClasses(variant.failureClasses)}`
    );
  }

  if (bullets.length === 0) {
    return [];
  }

  return ["", "### Failure breakdown", ...bullets];
}