@agjs/tsforge 0.1.19 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. package/package.json +6 -2
  2. package/scripts/browser-check.ts +41 -5
  3. package/scripts/build-rules-md.ts +78 -21
  4. package/scripts/cli-metrics.ts +10 -0
  5. package/scripts/sweep.ts +53 -23
  6. package/scripts/web-sweep.ts +292 -0
  7. package/src/browser/index.ts +3 -0
  8. package/src/browser/oracle.ts +215 -8
  9. package/src/cli.ts +22 -4
  10. package/src/config/index.ts +8 -0
  11. package/src/config/profiles.ts +150 -0
  12. package/src/config/tsforge-config.ts +64 -5
  13. package/src/detect-gate.ts +144 -13
  14. package/src/eval/eval.types.ts +9 -0
  15. package/src/eval/failure-class.ts +263 -0
  16. package/src/eval/index.ts +8 -0
  17. package/src/eval/metrics.ts +7 -0
  18. package/src/eval/parse-log.ts +105 -0
  19. package/src/eval/report.ts +19 -0
  20. package/src/eval/score.ts +10 -0
  21. package/src/loop/feedback/meta-rule-docs.ts +48 -0
  22. package/src/loop/feedback/rule-docs.ts +150 -0
  23. package/src/loop/loop.types.ts +4 -0
  24. package/src/loop/rule-docs.generated.json +131 -1
  25. package/src/loop/ttsr-defaults.ts +175 -4
  26. package/src/loop/turn.ts +3 -0
  27. package/src/meta-rules/registry.ts +32 -0
  28. package/src/meta-rules/rules/ci/no-github-context-in-shell.ts +40 -0
  29. package/src/meta-rules/rules/ci/no-pull-request-target-untrusted-checkout.ts +42 -0
  30. package/src/meta-rules/rules/ci/workflow-permissions-explicit.ts +49 -0
  31. package/src/meta-rules/rules/ci/workflow-permissions-least-privilege.ts +44 -0
  32. package/src/meta-rules/rules/config/next-image-remote-patterns-no-wildcards.ts +77 -0
  33. package/src/meta-rules/rules/config/next-instrumentation-present.ts +66 -0
  34. package/src/meta-rules/rules/config/next-proxy-over-middleware.ts +64 -0
  35. package/src/meta-rules/rules/config/tsconfig-recommended-flags.ts +75 -0
  36. package/src/meta-rules/rules/supply-chain/dependency-overrides-require-comment.ts +61 -0
  37. package/src/meta-rules/rules/supply-chain/fastify-security-plugins.ts +54 -0
  38. package/src/meta-rules/rules/supply-chain/lockfile-required.ts +51 -0
  39. package/src/meta-rules/rules/supply-chain/migrations-must-be-checked-in.ts +49 -0
  40. package/src/meta-rules/rules/supply-chain/no-git-or-tarball-dependencies.ts +70 -0
  41. package/src/meta-rules/rules/supply-chain/package-manager-field-required.ts +31 -0
  42. package/src/meta-rules/rules/supply-chain/production-must-not-use-drizzle-push.ts +75 -0
  43. package/src/meta-rules/rules/supply-chain/single-package-manager.ts +30 -0
  44. package/src/meta-rules/utils/lockfiles.ts +105 -0
  45. package/src/meta-rules/utils/workflow-yaml.ts +86 -0
  46. package/src/rule-packs/authorization/index.ts +26 -0
  47. package/src/rule-packs/authorization/rules/id-param-requires-object-authz.ts +87 -0
  48. package/src/rule-packs/authorization/rules/mutating-route-requires-authz.ts +116 -0
  49. package/src/rule-packs/authorization/rules/server-action-requires-authz.ts +101 -0
  50. package/src/rule-packs/authorization/utils.ts +285 -0
  51. package/src/rule-packs/boundary-utils.ts +13 -0
  52. package/src/rule-packs/code-flow/index.ts +4 -1
  53. package/src/rule-packs/code-flow/rules/no-throw-literal.ts +67 -0
  54. package/src/rule-packs/drizzle/index.ts +7 -0
  55. package/src/rule-packs/drizzle/rules/update-delete-account-scoped-must-filter-scope.ts +106 -0
  56. package/src/rule-packs/drizzle/rules/update-delete-must-have-where.ts +73 -0
  57. package/src/rule-packs/drizzle/utils.ts +133 -1
  58. package/src/rule-packs/fastify/index.ts +38 -0
  59. package/src/rule-packs/fastify/rules/error-handler-must-set-status.ts +78 -0
  60. package/src/rule-packs/fastify/rules/prefer-return-over-reply-send.ts +104 -0
  61. package/src/rule-packs/fastify/rules/require-fp-for-shared-plugins.ts +106 -0
  62. package/src/rule-packs/fastify/rules/require-plugin-name.ts +54 -0
  63. package/src/rule-packs/fastify/rules/require-response-schema.ts +62 -0
  64. package/src/rule-packs/fastify/rules/require-route-schema.ts +104 -0
  65. package/src/rule-packs/fastify/rules/test-inject-must-close-app.ts +44 -0
  66. package/src/rule-packs/fastify/utils/fastifyChain.ts +231 -0
  67. package/src/rule-packs/index.ts +10 -0
  68. package/src/rule-packs/jwt-cookies/index.ts +10 -0
  69. package/src/rule-packs/jwt-cookies/rules/auth-cookie-must-set-maxage-or-expires.ts +132 -0
  70. package/src/rule-packs/jwt-cookies/rules/auth-cookie-must-set-samesite.ts +151 -0
  71. package/src/rule-packs/jwt-cookies/rules/jwt-must-verify-not-decode.ts +124 -0
  72. package/src/rule-packs/module-boundaries/index.ts +3 -0
  73. package/src/rule-packs/module-boundaries/rules/no-react-in-services.ts +111 -0
  74. package/src/rule-packs/nextjs/index.ts +32 -0
  75. package/src/rule-packs/nextjs/rules/await-dynamic-request-apis.ts +65 -0
  76. package/src/rule-packs/nextjs/rules/error-boundary-require-use-client.ts +38 -0
  77. package/src/rule-packs/nextjs/rules/mutation-should-revalidate-cache.ts +152 -0
  78. package/src/rule-packs/nextjs/rules/no-html-img-element.ts +45 -0
  79. package/src/rule-packs/nextjs/rules/no-internal-api-fetch.ts +126 -0
  80. package/src/rule-packs/nextjs/rules/no-secret-props-to-client.ts +118 -0
  81. package/src/rule-packs/nextjs/rules/no-sensitive-next-public-env.ts +72 -0
  82. package/src/rule-packs/nextjs/rules/prefer-lazy-use-state-init.ts +85 -0
  83. package/src/rule-packs/nextjs/rules/server-action-requires-authz-and-validation.ts +178 -0
  84. package/src/rule-packs/nextjs/rules/server-only-modules-import-server-only.ts +87 -0
  85. package/src/rule-packs/nextjs/utils.ts +18 -0
  86. package/src/rule-packs/react-component-architecture/index.ts +18 -0
  87. package/src/rule-packs/react-component-architecture/rules/dangerous-html-requires-sanitize.ts +83 -0
  88. package/src/rule-packs/react-component-architecture/rules/no-anonymous-useEffect.ts +61 -0
  89. package/src/rule-packs/react-component-architecture/rules/no-component-invocation.ts +55 -0
  90. package/src/rule-packs/react-component-architecture/rules/no-derived-state-in-effect.ts +204 -0
  91. package/src/rule-packs/react-component-architecture/rules/no-nested-component.ts +152 -0
  92. package/src/rule-packs/react-component-architecture/rules/no-react-fc.ts +57 -0
  93. package/src/rule-packs/rule-catalog.types.ts +21 -0
  94. package/src/rule-packs/rule-metadata.ts +163 -0
  95. package/src/rule-packs/runtime-boundaries/index.ts +33 -0
  96. package/src/rule-packs/runtime-boundaries/rules/no-prototype-polluting-merge.ts +113 -0
  97. package/src/rule-packs/runtime-boundaries/rules/no-user-controlled-fetch-url.ts +69 -0
  98. package/src/rule-packs/runtime-boundaries/rules/no-user-controlled-redirect.ts +79 -0
  99. package/src/rule-packs/runtime-boundaries/rules/upload-must-set-limits.ts +126 -0
  100. package/src/rule-packs/runtime-boundaries/rules/webhook-must-verify-signature-before-parse.ts +87 -0
  101. package/src/rule-packs/security/index.ts +35 -0
  102. package/src/rule-packs/security/rules/catch-must-handle.ts +126 -0
  103. package/src/rule-packs/security/rules/no-auth-token-in-storage.ts +107 -0
  104. package/src/rule-packs/security/rules/no-child-process-exec.ts +72 -0
  105. package/src/rule-packs/security/rules/no-dynamic-regexp.ts +56 -0
  106. package/src/rule-packs/security/rules/no-inner-html-assignment.ts +42 -0
  107. package/src/rule-packs/security/rules/no-spawn-with-shell.ts +106 -0
  108. package/src/rule-packs/structured-logging/index.ts +6 -0
  109. package/src/rule-packs/structured-logging/rules/caught-error-log-requires-cause.ts +234 -0
  110. package/src/rule-packs/structured-logging/rules/logger-not-console.ts +146 -0
  111. package/src/rule-packs/test-conventions/index.ts +9 -0
  112. package/src/rule-packs/test-conventions/rules/fake-timers-must-be-restored.ts +143 -0
  113. package/src/rule-packs/test-conventions/rules/no-conditional-expect.ts +77 -0
  114. package/src/rule-packs/test-conventions/rules/no-real-network-in-unit-tests.ts +174 -0
  115. package/src/rule-packs/typescript-core/index.ts +30 -0
  116. package/src/rule-packs/typescript-core/rules/exported-functions-require-return-type.ts +74 -0
  117. package/src/rule-packs/typescript-core/rules/fetch-must-check-ok.ts +106 -0
  118. package/src/rule-packs/typescript-core/rules/json-parse-must-validate.ts +97 -0
  119. package/src/rule-packs/typescript-core/rules/no-unsafe-boundary-cast.ts +70 -0
  120. package/src/stack-detection/packs.ts +57 -0
  121. package/strict.type-aware.eslint.config.mjs +33 -0
  122. package/strict.web.eslint.config.mjs +32 -1
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@agjs/tsforge",
3
3
  "type": "module",
4
- "version": "0.1.19",
4
+ "version": "0.2.1",
5
5
  "license": "MIT",
6
6
  "description": "TypeScript coding harness with a deterministic gate, stack-aware guardrails, and stream-level correction.",
7
7
  "repository": {
@@ -19,7 +19,8 @@
19
19
  "src",
20
20
  "scripts",
21
21
  "strict.eslint.config.mjs",
22
- "strict.web.eslint.config.mjs"
22
+ "strict.web.eslint.config.mjs",
23
+ "strict.type-aware.eslint.config.mjs"
23
24
  ],
24
25
  "engines": {
25
26
  "bun": ">=1.3.14"
@@ -34,6 +35,9 @@
34
35
  "@stylistic/eslint-plugin": "^5.10.0",
35
36
  "@typescript-eslint/utils": "8.60.0",
36
37
  "cli-highlight": "2.1.11",
38
+ "eslint-plugin-react": "^7.37.5",
39
+ "eslint-plugin-react-hooks": "^7.1.1",
40
+ "eslint-plugin-jsx-a11y": "^6.10.2",
37
41
  "eslint": "10.4.0",
38
42
  "prettier": "3.8.3",
39
43
  "typescript": "6.0.3",
package/scripts/browser-check.ts CHANGED
@@ -5,27 +5,59 @@
5
5
  //
6
6
  // bun browser-check.ts <htmlFile> # render-only (no errors)
7
7
  // bun browser-check.ts <htmlFile> --smoke # render + generic behaviour smoke
8
+ // bun browser-check.ts <htmlFile> --a11y # + axe accessibility (serious/critical fail)
9
+ // bun browser-check.ts <htmlFile> --screenshots[=dir] # + per-route PNGs (artifact)
10
+ // bun browser-check.ts <htmlFile> --perf # + a basic DOM-size/mount-time budget
8
11
  // bun browser-check.ts <htmlFile> <checks.json> # render + interaction checks
9
12
  // bun browser-check.ts <htmlFile> <selector> [text]
10
13
  import { readdir } from "node:fs/promises";
11
14
  import { dirname, join } from "node:path";
12
- import { renderCheck, parseChecks, type IRenderOptions } from "../src/browser";
15
+ import {
16
+ renderCheck,
17
+ parseChecks,
18
+ type IRenderOptions,
19
+ type IPerfBudget,
20
+ } from "../src/browser";
13
21
  import { crawlableRoutePaths } from "../src/web-routes";
14
22
 
15
23
  const rawArgs = process.argv.slice(2);
16
24
  const smoke = rawArgs.includes("--smoke");
17
25
  const crawl = rawArgs.includes("--crawl");
18
- const [file, arg2, arg3] = rawArgs.filter(
19
- (a) => a !== "--smoke" && a !== "--crawl"
20
- );
26
+ const a11y = rawArgs.includes("--a11y");
27
+ const perf = rawArgs.includes("--perf");
28
+ const screenshotsArg = rawArgs.find((a) => a.startsWith("--screenshots"));
29
+ // Positionals are anything that isn't a recognized `--flag`.
30
+ const [file, arg2, arg3] = rawArgs.filter((a) => !a.startsWith("--"));
21
31
 
22
32
  if (file === undefined) {
23
33
  process.stderr.write(
24
- "usage: browser-check.ts <htmlFile> [--smoke] [--crawl] [checks.json | selector [text]]\n"
34
+ "usage: browser-check.ts <htmlFile> [--smoke] [--crawl] [--a11y] " +
35
+ "[--screenshots[=dir]] [--perf] [checks.json | selector [text]]\n"
25
36
  );
26
37
  process.exit(2);
27
38
  }
28
39
 
40
+ /** A conservative default budget — a tripwire for runaway render trees / slow
41
+ * mounts, not a tuned Lighthouse target. */
42
+ const DEFAULT_PERF_BUDGET: IPerfBudget = {
43
+ maxDomNodes: 5000,
44
+ maxMountMs: 6000,
45
+ };
46
+
47
+ /** The screenshot dir: `--screenshots=<dir>`, else a `screenshots/` folder next
48
+ * to the HTML file. undefined when `--screenshots` wasn't passed. */
49
+ function screenshotDir(): string | undefined {
50
+ if (screenshotsArg === undefined) {
51
+ return undefined;
52
+ }
53
+
54
+ const eq = screenshotsArg.indexOf("=");
55
+
56
+ return eq === -1
57
+ ? join(dirname(file ?? "."), "screenshots")
58
+ : screenshotsArg.slice(eq + 1);
59
+ }
60
+
29
61
  /** With --crawl, enumerate the app's static routes from `<buildDir>/src/routes/`
30
62
  * (the build dir is the parent of dist/) so every page — not just the home —
31
63
  * is render-checked. Dynamic ($param) routes are skipped. */
@@ -66,10 +98,14 @@ async function checksFor(): Promise<Partial<IRenderOptions>> {
66
98
  };
67
99
  }
68
100
 
101
+ const shots = screenshotDir();
69
102
  const result = await renderCheck({
70
103
  file,
71
104
  smoke,
105
+ a11y,
72
106
  routes: await routesFor(),
107
+ ...(perf ? { perfBudget: DEFAULT_PERF_BUDGET } : {}),
108
+ ...(shots !== undefined ? { screenshotDir: shots } : {}),
73
109
  ...(await checksFor()),
74
110
  });
75
111
 
package/scripts/build-rules-md.ts CHANGED
@@ -1,9 +1,10 @@
1
- // Generate RULES.md: a catalog of all rule packs and meta-rules.
2
- // This produces a deterministic, human-readable reference of what gets enforced.
3
- // bun run packages/core/scripts/build-rules-md.ts
1
+ // Generate RULES.md grouped by adoption tier, then pack.
4
2
  import { join } from "node:path";
5
3
  import { RULE_PACKS } from "../src/rule-packs";
6
4
  import { META_RULES } from "../src/meta-rules";
5
+ import { getRuleCatalogEntry } from "../src/rule-packs/rule-metadata";
6
+ import type { RuleTier } from "../src/rule-packs/rule-catalog.types";
7
+ import { PROFILE_DEFINITIONS } from "../src/config/profiles";
7
8
 
8
9
  function getRuleDescription(obj: unknown): string | undefined {
9
10
  const isObject = (val: unknown): val is Record<string, unknown> =>
@@ -30,15 +31,28 @@ function getRuleDescription(obj: unknown): string | undefined {
30
31
  return typeof description === "string" ? description : undefined;
31
32
  }
32
33
 
34
+ const TIER_ORDER: readonly RuleTier[] = [
35
+ "safety",
36
+ "framework",
37
+ "architecture",
38
+ "experimental",
39
+ ];
40
+
33
41
  const out: string[] = [
34
42
  "# Rules and Meta-Rules Catalog",
35
43
  "",
36
- "This document lists all rules enforced by tsforge across rule packs and meta-rules.",
44
+ "Rules are grouped by **adoption tier**. Use `profile` in `tsforge.config.json` to control which tiers are active by default.",
45
+ "",
46
+ "## Profiles",
37
47
  "",
38
48
  ];
39
49
 
40
- // Section: Rule Packs
41
- out.push("## Rule Packs");
50
+ for (const profile of Object.values(PROFILE_DEFINITIONS)) {
51
+ out.push(`- **${profile.id}**: ${profile.description}`);
52
+ }
53
+
54
+ out.push("");
55
+ out.push("## Rule Packs by Tier");
42
56
  out.push("");
43
57
 
44
58
  type PackId = keyof typeof RULE_PACKS;
@@ -47,36 +61,62 @@ function isPackId(id: string): id is PackId {
47
61
  return id in RULE_PACKS;
48
62
  }
49
63
 
50
- const packIds = Object.keys(RULE_PACKS).sort();
64
+ const entriesByTier = new Map<
65
+ RuleTier,
66
+ { packId: string; ruleName: string; severity: string; description: string }[]
67
+ >();
51
68
 
52
- for (const packId of packIds) {
69
+ for (const packId of Object.keys(RULE_PACKS).sort()) {
53
70
  if (!isPackId(packId)) {
54
71
  continue;
55
72
  }
56
73
 
57
74
  const pack = RULE_PACKS[packId];
58
75
 
59
- out.push(`### ${packId}`);
60
- out.push("");
61
- out.push(pack.description);
62
- out.push("");
63
-
64
- const ruleNames = Object.keys(pack.rules).sort();
65
-
66
- for (const ruleName of ruleNames) {
76
+ for (const ruleName of Object.keys(pack.rules).sort()) {
67
77
  const rule = pack.rules[ruleName];
68
78
  const severity = pack.rulesConfig[ruleName] ?? "warn";
69
79
  const description = getRuleDescription(rule) ?? ruleName;
70
- const severityUpper = severity.toUpperCase();
71
- const line = `- **${ruleName}** [${severityUpper}]: ${description}`;
80
+ const tier = getRuleCatalogEntry(ruleName, packId).tier;
81
+ const list = entriesByTier.get(tier) ?? [];
82
+
83
+ list.push({
84
+ packId,
85
+ ruleName,
86
+ severity: severity.toUpperCase(),
87
+ description,
88
+ });
89
+ entriesByTier.set(tier, list);
90
+ }
91
+ }
92
+
93
+ for (const tier of TIER_ORDER) {
94
+ const entries = entriesByTier.get(tier) ?? [];
95
+
96
+ if (entries.length === 0) {
97
+ continue;
98
+ }
72
99
 
73
- out.push(line);
100
+ out.push(`### Tier: ${tier}`);
101
+ out.push("");
102
+
103
+ for (const entry of entries.sort((a, b) => {
104
+ const byPack = a.packId.localeCompare(b.packId);
105
+
106
+ if (byPack !== 0) {
107
+ return byPack;
108
+ }
109
+
110
+ return a.ruleName.localeCompare(b.ruleName);
111
+ })) {
112
+ out.push(
113
+ `- **${entry.packId}/${entry.ruleName}** [${entry.severity}]: ${entry.description}`
114
+ );
74
115
  }
75
116
 
76
117
  out.push("");
77
118
  }
78
119
 
79
- // Section: Meta-Rules
80
120
  out.push("## Meta-Rules");
81
121
  out.push("");
82
122
  out.push(
@@ -103,7 +143,6 @@ for (const rule of META_RULES) {
103
143
  rulesByCategory.set(cat, rules);
104
144
  }
105
145
 
106
- // Render meta-rules by category.
107
146
  for (const category of categoryOrder) {
108
147
  const rules = rulesByCategory.get(category) ?? [];
109
148
 
@@ -123,6 +162,24 @@ for (const category of categoryOrder) {
123
162
  out.push("");
124
163
  }
125
164
 
165
+ out.push("## Out of scope");
166
+ out.push("");
167
+ out.push(
168
+ "The following are intentionally deferred — wrong tool for the syntactic ESLint gate, or require cross-file analysis:"
169
+ );
170
+ out.push("");
171
+ out.push(
172
+ "- GraphQL/WebSocket/OpenAPI contract rules (until OpenAPI dep + parser)"
173
+ );
174
+ out.push(
175
+ "- Container/Kubernetes YAML hardening (future meta-rules when Dockerfile/k8s detected)"
176
+ );
177
+ out.push("- LLM/MCP security packs (opt-in when AI SDK deps detected)");
178
+ out.push("- FSD layer DAG / full authorization taint tracking");
179
+ out.push("- Lighthouse / bundle-analyzer CI gates");
180
+ out.push("- Violation ratcheting / baseline snapshots (Phase 5)");
181
+ out.push("");
182
+
126
183
  const path = join(import.meta.dir, "..", "RULES.md");
127
184
 
128
185
  await Bun.write(path, out.join("\n"));
package/scripts/cli-metrics.ts CHANGED
@@ -10,6 +10,7 @@ import { readdir } from "node:fs/promises";
10
10
  import { homedir } from "node:os";
11
11
  import { join } from "node:path";
12
12
  import { isRecord } from "../src/lib/guards";
13
+ import { classifyRun, parseEventLog } from "../src/eval";
13
14
 
14
15
  function num(value: unknown): number {
15
16
  return typeof value === "number" ? value : 0;
@@ -168,6 +169,9 @@ async function main(): Promise<void> {
168
169
  const text = await Bun.file(path).text();
169
170
  const lines = text.split("\n").filter((l) => l.trim().length > 0);
170
171
  const m = analyze(lines);
172
+ // Single source of truth for WHY a run failed — the same classifier the eval
173
+ // sweep and the reusable analyzeEvents() use, fed the typed event stream.
174
+ const failure = classifyRun(parseEventLog(text));
171
175
  const pct =
172
176
  m.contextWindow > 0
173
177
  ? Math.round((m.peakContext / m.contextWindow) * 100)
@@ -182,6 +186,12 @@ async function main(): Promise<void> {
182
186
  ["model", m.model],
183
187
  ["context window", String(m.contextWindow)],
184
188
  ["final status", m.finalStatus],
189
+ [
190
+ "failure class",
191
+ failure.detail === undefined
192
+ ? failure.failureClass
193
+ : `${failure.failureClass} (${failure.detail})`,
194
+ ],
185
195
  ["turns (repair iterations)", String(m.turns)],
186
196
  ["model calls", String(m.modelCalls)],
187
197
  ["tokens out (→ solution)", String(m.tokensOut)],
package/scripts/sweep.ts CHANGED
@@ -11,7 +11,14 @@ import { runSpec, qualityRepair } from "../src/loop";
11
11
  import { modelAgent } from "../src/agent";
12
12
  import { OpenAICompatibleProvider } from "../src/inference";
13
13
  import { resolveActiveModel, resolveApiKey } from "../src/models-config";
14
- import { summarize, type IRunRecord } from "../src/eval";
14
+ import { providerConfig } from "../src/cli";
15
+ import {
16
+ summarize,
17
+ classifyRun,
18
+ renderSweepReportMarkdown,
19
+ buildSweepReport,
20
+ type IRunRecord,
21
+ } from "../src/eval";
15
22
  import { renderEvent } from "../src/render";
16
23
  import type { ILoopEvent } from "../src/loop";
17
24
 
@@ -109,28 +116,32 @@ const seedFiles = await readdir(seedDir, { recursive: true });
109
116
  // unreachable endpoint and hung with an empty run.log.)
110
117
  const { entry: activeModel } = await resolveActiveModel();
111
118
 
112
- const provider = new OpenAICompatibleProvider({
113
- baseUrl: activeModel.baseUrl,
114
- model: activeModel.model,
115
- apiKey: resolveApiKey(activeModel),
116
- // Thinking tokens count against the limit, so give reasoning + code room.
117
- maxTokens: Number(process.env.TSFORGE_MAX_TOKENS ?? "16384"),
118
- // Opt-in only: a repetition penalty breaks rare temp-0 loops but DEGRADES
119
- // algorithmic code (it made `money` write unsafe/any code that failed the
120
- // strict gate). Default off; enable via env if a target genuinely loops.
121
- repetitionPenalty:
122
- process.env.TSFORGE_REPETITION_PENALTY === undefined
123
- ? undefined
124
- : Number(process.env.TSFORGE_REPETITION_PENALTY),
125
- });
119
+ // Build the wire config the SAME way the CLI does (`providerConfig`), so the
120
+ // sweep inherits the active entry's provider dialect — `reasoning`,
121
+ // `reasoningEffort`, `extraBody`, `extraHeaders`. Hand-rolling the config here
122
+ // dropped those fields, so a DeepSeek sweep sent qwen-only params and hit the
123
+ // 400s the interactive path already handles. maxTokens still defaults to
124
+ // PROVIDER_LIMITS (16384) — thinking tokens count against it, so reasoning +
125
+ // code get room. Repetition penalty stays opt-in via TSFORGE_REPETITION_PENALTY.
126
+ const provider = new OpenAICompatibleProvider(providerConfig(activeModel));
126
127
 
127
128
  // The judge scores quality. Point it at a flagship via TSFORGE_JUDGE_URL/MODEL
128
- // (+ TSFORGE_JUDGE_KEY) to measure the gap; defaults to the active model judging itself.
129
- const judgeProvider = new OpenAICompatibleProvider({
130
- baseUrl: process.env.TSFORGE_JUDGE_URL ?? activeModel.baseUrl,
131
- model: process.env.TSFORGE_JUDGE_MODEL ?? activeModel.model,
132
- apiKey: process.env.TSFORGE_JUDGE_KEY ?? resolveApiKey(activeModel),
133
- });
129
+ // (+ TSFORGE_JUDGE_KEY) to measure the gap. When NOT overridden, the active
130
+ // model judges itself — reuse its full dialect via providerConfig so a
131
+ // self-judge against DeepSeek speaks DeepSeek too. An explicit external judge
132
+ // is a plain generic call (its own endpoint, no inherited reasoning dialect).
133
+ const judgeOverridden =
134
+ process.env.TSFORGE_JUDGE_URL !== undefined ||
135
+ process.env.TSFORGE_JUDGE_MODEL !== undefined;
136
+ const judgeProvider = new OpenAICompatibleProvider(
137
+ judgeOverridden
138
+ ? {
139
+ baseUrl: process.env.TSFORGE_JUDGE_URL ?? activeModel.baseUrl,
140
+ model: process.env.TSFORGE_JUDGE_MODEL ?? activeModel.model,
141
+ apiKey: process.env.TSFORGE_JUDGE_KEY ?? resolveApiKey(activeModel),
142
+ }
143
+ : providerConfig(activeModel)
144
+ );
134
145
 
135
146
  /** Sortable timestamp `YYYYMMDD-HHMMSS` so run dirs sort newest-last by name. */
136
147
  function stamp(): string {
@@ -263,8 +274,12 @@ async function runOne(
263
274
  // Every run gets a full transcript at <runDir>/run.log; stream to the
264
275
  // terminal too when TSFORGE_STREAM=1.
265
276
  const log = Bun.file(join(runDir, "run.log")).writer();
277
+ // Keep the structured events so a failed run can be classified (WHY it
278
+ // failed), not just counted — fed to classifyRun below.
279
+ const runEvents: ILoopEvent[] = [];
266
280
 
267
281
  const onEvent = (e: ILoopEvent): void => {
282
+ runEvents.push(e);
268
283
  void log.write(renderEvent(e, { color: false }));
269
284
  // Flush per event — otherwise Bun's FileSink buffers and `tail -f` shows
270
285
  // nothing until the run ends. The log must be live.
@@ -354,6 +369,9 @@ async function runOne(
354
369
  );
355
370
 
356
371
  const vLabel = variantLabel(variantEnv);
372
+ const failureClass = passed
373
+ ? undefined
374
+ : classifyRun(runEvents).failureClass;
357
375
 
358
376
  records.push({
359
377
  label: `${vLabel} temp=${temp}`,
@@ -361,9 +379,10 @@ async function runOne(
361
379
  cycles,
362
380
  ms,
363
381
  quality,
382
+ ...(failureClass === undefined ? {} : { failureClass }),
364
383
  });
365
384
  process.stdout.write(
366
- ` ${seed} ${vLabel} temp=${temp} #${i + 1}: ${passed ? "done" : "blocked"} (${cycles} cyc, ${edits} edits, ${regressions} regress, ${ms}ms${quality === undefined ? "" : `, Q${quality}/5`}) → ${runId}\n`
385
+ ` ${seed} ${vLabel} temp=${temp} #${i + 1}: ${passed ? "done" : `blocked[${failureClass ?? "unknown"}]`} (${cycles} cyc, ${edits} edits, ${regressions} regress, ${ms}ms${quality === undefined ? "" : `, Q${quality}/5`}) → ${runId}\n`
367
386
  );
368
387
  } finally {
369
388
  restore();
@@ -375,11 +394,22 @@ const summaries = summarize(records);
375
394
  process.stdout.write(`\n=== sweep: ${seed} (${repeats} runs/variant) ===\n`);
376
395
 
377
396
  for (const s of summaries) {
397
+ const failures = Object.entries(s.failureClasses)
398
+ .sort(([, a], [, b]) => b - a)
399
+ .map(([cls, n]) => `${cls}×${String(n)}`)
400
+ .join(", ");
401
+
378
402
  process.stdout.write(
379
- `${s.label.padEnd(10)} pass ${Math.round(s.passRate * 100)}% (${s.passed}/${s.runs}) Q ${s.avgQuality.toFixed(1)}/5 avg ${s.avgCycles.toFixed(1)} cyc ${Math.round(s.avgMs)}ms\n`
403
+ `${s.label.padEnd(10)} pass ${Math.round(s.passRate * 100)}% (${s.passed}/${s.runs}) Q ${s.avgQuality.toFixed(1)}/5 avg ${s.avgCycles.toFixed(1)} cyc ${Math.round(s.avgMs)}ms${failures.length > 0 ? ` [${failures}]` : ""}\n`
380
404
  );
381
405
  }
382
406
 
407
+ // The statistical report (Wilson CI + z-test vs baseline) now also tabulates a
408
+ // per-variant failure-class breakdown — WHY runs failed, not just how often.
409
+ process.stdout.write(
410
+ `\n${renderSweepReportMarkdown(buildSweepReport(records))}\n`
411
+ );
412
+
383
413
  const outPath = join(evalsRoot, "runs", `sweep-${seed}-${stamp()}.json`);
384
414
 
385
415
  await Bun.write(