@agjs/tsforge 0.1.19 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +6 -2
- package/scripts/browser-check.ts +41 -5
- package/scripts/build-rules-md.ts +78 -21
- package/scripts/cli-metrics.ts +10 -0
- package/scripts/sweep.ts +53 -23
- package/scripts/web-sweep.ts +292 -0
- package/src/browser/index.ts +3 -0
- package/src/browser/oracle.ts +215 -8
- package/src/cli.ts +22 -4
- package/src/config/index.ts +8 -0
- package/src/config/profiles.ts +150 -0
- package/src/config/tsforge-config.ts +64 -5
- package/src/detect-gate.ts +144 -13
- package/src/eval/eval.types.ts +9 -0
- package/src/eval/failure-class.ts +263 -0
- package/src/eval/index.ts +8 -0
- package/src/eval/metrics.ts +7 -0
- package/src/eval/parse-log.ts +105 -0
- package/src/eval/report.ts +19 -0
- package/src/eval/score.ts +10 -0
- package/src/loop/feedback/meta-rule-docs.ts +48 -0
- package/src/loop/feedback/rule-docs.ts +150 -0
- package/src/loop/loop.types.ts +4 -0
- package/src/loop/rule-docs.generated.json +131 -1
- package/src/loop/ttsr-defaults.ts +175 -4
- package/src/loop/turn.ts +3 -0
- package/src/meta-rules/registry.ts +32 -0
- package/src/meta-rules/rules/ci/no-github-context-in-shell.ts +40 -0
- package/src/meta-rules/rules/ci/no-pull-request-target-untrusted-checkout.ts +42 -0
- package/src/meta-rules/rules/ci/workflow-permissions-explicit.ts +49 -0
- package/src/meta-rules/rules/ci/workflow-permissions-least-privilege.ts +44 -0
- package/src/meta-rules/rules/config/next-image-remote-patterns-no-wildcards.ts +77 -0
- package/src/meta-rules/rules/config/next-instrumentation-present.ts +66 -0
- package/src/meta-rules/rules/config/next-proxy-over-middleware.ts +64 -0
- package/src/meta-rules/rules/config/tsconfig-recommended-flags.ts +75 -0
- package/src/meta-rules/rules/supply-chain/dependency-overrides-require-comment.ts +61 -0
- package/src/meta-rules/rules/supply-chain/fastify-security-plugins.ts +54 -0
- package/src/meta-rules/rules/supply-chain/lockfile-required.ts +51 -0
- package/src/meta-rules/rules/supply-chain/migrations-must-be-checked-in.ts +49 -0
- package/src/meta-rules/rules/supply-chain/no-git-or-tarball-dependencies.ts +70 -0
- package/src/meta-rules/rules/supply-chain/package-manager-field-required.ts +31 -0
- package/src/meta-rules/rules/supply-chain/production-must-not-use-drizzle-push.ts +75 -0
- package/src/meta-rules/rules/supply-chain/single-package-manager.ts +30 -0
- package/src/meta-rules/utils/lockfiles.ts +105 -0
- package/src/meta-rules/utils/workflow-yaml.ts +86 -0
- package/src/rule-packs/authorization/index.ts +26 -0
- package/src/rule-packs/authorization/rules/id-param-requires-object-authz.ts +87 -0
- package/src/rule-packs/authorization/rules/mutating-route-requires-authz.ts +116 -0
- package/src/rule-packs/authorization/rules/server-action-requires-authz.ts +101 -0
- package/src/rule-packs/authorization/utils.ts +285 -0
- package/src/rule-packs/boundary-utils.ts +13 -0
- package/src/rule-packs/code-flow/index.ts +4 -1
- package/src/rule-packs/code-flow/rules/no-throw-literal.ts +67 -0
- package/src/rule-packs/drizzle/index.ts +7 -0
- package/src/rule-packs/drizzle/rules/update-delete-account-scoped-must-filter-scope.ts +106 -0
- package/src/rule-packs/drizzle/rules/update-delete-must-have-where.ts +73 -0
- package/src/rule-packs/drizzle/utils.ts +133 -1
- package/src/rule-packs/fastify/index.ts +38 -0
- package/src/rule-packs/fastify/rules/error-handler-must-set-status.ts +78 -0
- package/src/rule-packs/fastify/rules/prefer-return-over-reply-send.ts +104 -0
- package/src/rule-packs/fastify/rules/require-fp-for-shared-plugins.ts +106 -0
- package/src/rule-packs/fastify/rules/require-plugin-name.ts +54 -0
- package/src/rule-packs/fastify/rules/require-response-schema.ts +62 -0
- package/src/rule-packs/fastify/rules/require-route-schema.ts +104 -0
- package/src/rule-packs/fastify/rules/test-inject-must-close-app.ts +44 -0
- package/src/rule-packs/fastify/utils/fastifyChain.ts +231 -0
- package/src/rule-packs/index.ts +10 -0
- package/src/rule-packs/jwt-cookies/index.ts +10 -0
- package/src/rule-packs/jwt-cookies/rules/auth-cookie-must-set-maxage-or-expires.ts +132 -0
- package/src/rule-packs/jwt-cookies/rules/auth-cookie-must-set-samesite.ts +151 -0
- package/src/rule-packs/jwt-cookies/rules/jwt-must-verify-not-decode.ts +124 -0
- package/src/rule-packs/module-boundaries/index.ts +3 -0
- package/src/rule-packs/module-boundaries/rules/no-react-in-services.ts +111 -0
- package/src/rule-packs/nextjs/index.ts +32 -0
- package/src/rule-packs/nextjs/rules/await-dynamic-request-apis.ts +65 -0
- package/src/rule-packs/nextjs/rules/error-boundary-require-use-client.ts +38 -0
- package/src/rule-packs/nextjs/rules/mutation-should-revalidate-cache.ts +152 -0
- package/src/rule-packs/nextjs/rules/no-html-img-element.ts +45 -0
- package/src/rule-packs/nextjs/rules/no-internal-api-fetch.ts +126 -0
- package/src/rule-packs/nextjs/rules/no-secret-props-to-client.ts +118 -0
- package/src/rule-packs/nextjs/rules/no-sensitive-next-public-env.ts +72 -0
- package/src/rule-packs/nextjs/rules/prefer-lazy-use-state-init.ts +85 -0
- package/src/rule-packs/nextjs/rules/server-action-requires-authz-and-validation.ts +178 -0
- package/src/rule-packs/nextjs/rules/server-only-modules-import-server-only.ts +87 -0
- package/src/rule-packs/nextjs/utils.ts +18 -0
- package/src/rule-packs/react-component-architecture/index.ts +18 -0
- package/src/rule-packs/react-component-architecture/rules/dangerous-html-requires-sanitize.ts +83 -0
- package/src/rule-packs/react-component-architecture/rules/no-anonymous-useEffect.ts +61 -0
- package/src/rule-packs/react-component-architecture/rules/no-component-invocation.ts +55 -0
- package/src/rule-packs/react-component-architecture/rules/no-derived-state-in-effect.ts +204 -0
- package/src/rule-packs/react-component-architecture/rules/no-nested-component.ts +152 -0
- package/src/rule-packs/react-component-architecture/rules/no-react-fc.ts +57 -0
- package/src/rule-packs/rule-catalog.types.ts +21 -0
- package/src/rule-packs/rule-metadata.ts +163 -0
- package/src/rule-packs/runtime-boundaries/index.ts +33 -0
- package/src/rule-packs/runtime-boundaries/rules/no-prototype-polluting-merge.ts +113 -0
- package/src/rule-packs/runtime-boundaries/rules/no-user-controlled-fetch-url.ts +69 -0
- package/src/rule-packs/runtime-boundaries/rules/no-user-controlled-redirect.ts +79 -0
- package/src/rule-packs/runtime-boundaries/rules/upload-must-set-limits.ts +126 -0
- package/src/rule-packs/runtime-boundaries/rules/webhook-must-verify-signature-before-parse.ts +87 -0
- package/src/rule-packs/security/index.ts +35 -0
- package/src/rule-packs/security/rules/catch-must-handle.ts +126 -0
- package/src/rule-packs/security/rules/no-auth-token-in-storage.ts +107 -0
- package/src/rule-packs/security/rules/no-child-process-exec.ts +72 -0
- package/src/rule-packs/security/rules/no-dynamic-regexp.ts +56 -0
- package/src/rule-packs/security/rules/no-inner-html-assignment.ts +42 -0
- package/src/rule-packs/security/rules/no-spawn-with-shell.ts +106 -0
- package/src/rule-packs/structured-logging/index.ts +6 -0
- package/src/rule-packs/structured-logging/rules/caught-error-log-requires-cause.ts +234 -0
- package/src/rule-packs/structured-logging/rules/logger-not-console.ts +146 -0
- package/src/rule-packs/test-conventions/index.ts +9 -0
- package/src/rule-packs/test-conventions/rules/fake-timers-must-be-restored.ts +143 -0
- package/src/rule-packs/test-conventions/rules/no-conditional-expect.ts +77 -0
- package/src/rule-packs/test-conventions/rules/no-real-network-in-unit-tests.ts +174 -0
- package/src/rule-packs/typescript-core/index.ts +30 -0
- package/src/rule-packs/typescript-core/rules/exported-functions-require-return-type.ts +74 -0
- package/src/rule-packs/typescript-core/rules/fetch-must-check-ok.ts +106 -0
- package/src/rule-packs/typescript-core/rules/json-parse-must-validate.ts +97 -0
- package/src/rule-packs/typescript-core/rules/no-unsafe-boundary-cast.ts +70 -0
- package/src/stack-detection/packs.ts +57 -0
- package/strict.type-aware.eslint.config.mjs +33 -0
- package/strict.web.eslint.config.mjs +32 -1
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@agjs/tsforge",
|
|
3
3
|
"type": "module",
|
|
4
|
-
"version": "0.1.19",
|
|
4
|
+
"version": "0.2.1",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"description": "TypeScript coding harness with a deterministic gate, stack-aware guardrails, and stream-level correction.",
|
|
7
7
|
"repository": {
|
|
@@ -19,7 +19,8 @@
|
|
|
19
19
|
"src",
|
|
20
20
|
"scripts",
|
|
21
21
|
"strict.eslint.config.mjs",
|
|
22
|
-
"strict.web.eslint.config.mjs"
|
|
22
|
+
"strict.web.eslint.config.mjs",
|
|
23
|
+
"strict.type-aware.eslint.config.mjs"
|
|
23
24
|
],
|
|
24
25
|
"engines": {
|
|
25
26
|
"bun": ">=1.3.14"
|
|
@@ -34,6 +35,9 @@
|
|
|
34
35
|
"@stylistic/eslint-plugin": "^5.10.0",
|
|
35
36
|
"@typescript-eslint/utils": "8.60.0",
|
|
36
37
|
"cli-highlight": "2.1.11",
|
|
38
|
+
"eslint-plugin-react": "^7.37.5",
|
|
39
|
+
"eslint-plugin-react-hooks": "^7.1.1",
|
|
40
|
+
"eslint-plugin-jsx-a11y": "^6.10.2",
|
|
37
41
|
"eslint": "10.4.0",
|
|
38
42
|
"prettier": "3.8.3",
|
|
39
43
|
"typescript": "6.0.3",
|
package/scripts/browser-check.ts
CHANGED
|
@@ -5,27 +5,59 @@
|
|
|
5
5
|
//
|
|
6
6
|
// bun browser-check.ts <htmlFile> # render-only (no errors)
|
|
7
7
|
// bun browser-check.ts <htmlFile> --smoke # render + generic behaviour smoke
|
|
8
|
+
// bun browser-check.ts <htmlFile> --a11y # + axe accessibility (serious/critical fail)
|
|
9
|
+
// bun browser-check.ts <htmlFile> --screenshots[=dir] # + per-route PNGs (artifact)
|
|
10
|
+
// bun browser-check.ts <htmlFile> --perf # + a basic DOM-size/mount-time budget
|
|
8
11
|
// bun browser-check.ts <htmlFile> <checks.json> # render + interaction checks
|
|
9
12
|
// bun browser-check.ts <htmlFile> <selector> [text]
|
|
10
13
|
import { readdir } from "node:fs/promises";
|
|
11
14
|
import { dirname, join } from "node:path";
|
|
12
|
-
import {
|
|
15
|
+
import {
|
|
16
|
+
renderCheck,
|
|
17
|
+
parseChecks,
|
|
18
|
+
type IRenderOptions,
|
|
19
|
+
type IPerfBudget,
|
|
20
|
+
} from "../src/browser";
|
|
13
21
|
import { crawlableRoutePaths } from "../src/web-routes";
|
|
14
22
|
|
|
15
23
|
const rawArgs = process.argv.slice(2);
|
|
16
24
|
const smoke = rawArgs.includes("--smoke");
|
|
17
25
|
const crawl = rawArgs.includes("--crawl");
|
|
18
|
-
const
|
|
19
|
-
|
|
20
|
-
);
|
|
26
|
+
const a11y = rawArgs.includes("--a11y");
|
|
27
|
+
const perf = rawArgs.includes("--perf");
|
|
28
|
+
const screenshotsArg = rawArgs.find((a) => a.startsWith("--screenshots"));
|
|
29
|
+
// Positionals are anything that isn't a recognized `--flag`.
|
|
30
|
+
const [file, arg2, arg3] = rawArgs.filter((a) => !a.startsWith("--"));
|
|
21
31
|
|
|
22
32
|
if (file === undefined) {
|
|
23
33
|
process.stderr.write(
|
|
24
|
-
"usage: browser-check.ts <htmlFile> [--smoke] [--crawl] [
|
|
34
|
+
"usage: browser-check.ts <htmlFile> [--smoke] [--crawl] [--a11y] " +
|
|
35
|
+
"[--screenshots[=dir]] [--perf] [checks.json | selector [text]]\n"
|
|
25
36
|
);
|
|
26
37
|
process.exit(2);
|
|
27
38
|
}
|
|
28
39
|
|
|
40
|
+
/** A conservative default budget — a tripwire for runaway render trees / slow
|
|
41
|
+
* mounts, not a tuned Lighthouse target. */
|
|
42
|
+
const DEFAULT_PERF_BUDGET: IPerfBudget = {
|
|
43
|
+
maxDomNodes: 5000,
|
|
44
|
+
maxMountMs: 6000,
|
|
45
|
+
};
|
|
46
|
+
|
|
47
|
+
/** The screenshot dir: `--screenshots=<dir>`, else a `screenshots/` folder next
|
|
48
|
+
* to the HTML file. undefined when `--screenshots` wasn't passed. */
|
|
49
|
+
function screenshotDir(): string | undefined {
|
|
50
|
+
if (screenshotsArg === undefined) {
|
|
51
|
+
return undefined;
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
const eq = screenshotsArg.indexOf("=");
|
|
55
|
+
|
|
56
|
+
return eq === -1
|
|
57
|
+
? join(dirname(file ?? "."), "screenshots")
|
|
58
|
+
: screenshotsArg.slice(eq + 1);
|
|
59
|
+
}
|
|
60
|
+
|
|
29
61
|
/** With --crawl, enumerate the app's static routes from `<buildDir>/src/routes/`
|
|
30
62
|
* (the build dir is the parent of dist/) so every page — not just the home —
|
|
31
63
|
* is render-checked. Dynamic ($param) routes are skipped. */
|
|
@@ -66,10 +98,14 @@ async function checksFor(): Promise<Partial<IRenderOptions>> {
|
|
|
66
98
|
};
|
|
67
99
|
}
|
|
68
100
|
|
|
101
|
+
const shots = screenshotDir();
|
|
69
102
|
const result = await renderCheck({
|
|
70
103
|
file,
|
|
71
104
|
smoke,
|
|
105
|
+
a11y,
|
|
72
106
|
routes: await routesFor(),
|
|
107
|
+
...(perf ? { perfBudget: DEFAULT_PERF_BUDGET } : {}),
|
|
108
|
+
...(shots !== undefined ? { screenshotDir: shots } : {}),
|
|
73
109
|
...(await checksFor()),
|
|
74
110
|
});
|
|
75
111
|
|
package/scripts/build-rules-md.ts
CHANGED
|
@@ -1,9 +1,10 @@
|
|
|
1
|
-
// Generate RULES.md
|
|
2
|
-
// This produces a deterministic, human-readable reference of what gets enforced.
|
|
3
|
-
// bun run packages/core/scripts/build-rules-md.ts
|
|
1
|
+
// Generate RULES.md grouped by adoption tier, then pack.
|
|
4
2
|
import { join } from "node:path";
|
|
5
3
|
import { RULE_PACKS } from "../src/rule-packs";
|
|
6
4
|
import { META_RULES } from "../src/meta-rules";
|
|
5
|
+
import { getRuleCatalogEntry } from "../src/rule-packs/rule-metadata";
|
|
6
|
+
import type { RuleTier } from "../src/rule-packs/rule-catalog.types";
|
|
7
|
+
import { PROFILE_DEFINITIONS } from "../src/config/profiles";
|
|
7
8
|
|
|
8
9
|
function getRuleDescription(obj: unknown): string | undefined {
|
|
9
10
|
const isObject = (val: unknown): val is Record<string, unknown> =>
|
|
@@ -30,15 +31,28 @@ function getRuleDescription(obj: unknown): string | undefined {
|
|
|
30
31
|
return typeof description === "string" ? description : undefined;
|
|
31
32
|
}
|
|
32
33
|
|
|
34
|
+
const TIER_ORDER: readonly RuleTier[] = [
|
|
35
|
+
"safety",
|
|
36
|
+
"framework",
|
|
37
|
+
"architecture",
|
|
38
|
+
"experimental",
|
|
39
|
+
];
|
|
40
|
+
|
|
33
41
|
const out: string[] = [
|
|
34
42
|
"# Rules and Meta-Rules Catalog",
|
|
35
43
|
"",
|
|
36
|
-
"
|
|
44
|
+
"Rules are grouped by **adoption tier**. Use `profile` in `tsforge.config.json` to control which tiers are active by default.",
|
|
45
|
+
"",
|
|
46
|
+
"## Profiles",
|
|
37
47
|
"",
|
|
38
48
|
];
|
|
39
49
|
|
|
40
|
-
|
|
41
|
-
out.push(
|
|
50
|
+
for (const profile of Object.values(PROFILE_DEFINITIONS)) {
|
|
51
|
+
out.push(`- **${profile.id}**: ${profile.description}`);
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
out.push("");
|
|
55
|
+
out.push("## Rule Packs by Tier");
|
|
42
56
|
out.push("");
|
|
43
57
|
|
|
44
58
|
type PackId = keyof typeof RULE_PACKS;
|
|
@@ -47,36 +61,62 @@ function isPackId(id: string): id is PackId {
|
|
|
47
61
|
return id in RULE_PACKS;
|
|
48
62
|
}
|
|
49
63
|
|
|
50
|
-
const
|
|
64
|
+
const entriesByTier = new Map<
|
|
65
|
+
RuleTier,
|
|
66
|
+
{ packId: string; ruleName: string; severity: string; description: string }[]
|
|
67
|
+
>();
|
|
51
68
|
|
|
52
|
-
for (const packId of
|
|
69
|
+
for (const packId of Object.keys(RULE_PACKS).sort()) {
|
|
53
70
|
if (!isPackId(packId)) {
|
|
54
71
|
continue;
|
|
55
72
|
}
|
|
56
73
|
|
|
57
74
|
const pack = RULE_PACKS[packId];
|
|
58
75
|
|
|
59
|
-
|
|
60
|
-
out.push("");
|
|
61
|
-
out.push(pack.description);
|
|
62
|
-
out.push("");
|
|
63
|
-
|
|
64
|
-
const ruleNames = Object.keys(pack.rules).sort();
|
|
65
|
-
|
|
66
|
-
for (const ruleName of ruleNames) {
|
|
76
|
+
for (const ruleName of Object.keys(pack.rules).sort()) {
|
|
67
77
|
const rule = pack.rules[ruleName];
|
|
68
78
|
const severity = pack.rulesConfig[ruleName] ?? "warn";
|
|
69
79
|
const description = getRuleDescription(rule) ?? ruleName;
|
|
70
|
-
const
|
|
71
|
-
const
|
|
80
|
+
const tier = getRuleCatalogEntry(ruleName, packId).tier;
|
|
81
|
+
const list = entriesByTier.get(tier) ?? [];
|
|
82
|
+
|
|
83
|
+
list.push({
|
|
84
|
+
packId,
|
|
85
|
+
ruleName,
|
|
86
|
+
severity: severity.toUpperCase(),
|
|
87
|
+
description,
|
|
88
|
+
});
|
|
89
|
+
entriesByTier.set(tier, list);
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
for (const tier of TIER_ORDER) {
|
|
94
|
+
const entries = entriesByTier.get(tier) ?? [];
|
|
95
|
+
|
|
96
|
+
if (entries.length === 0) {
|
|
97
|
+
continue;
|
|
98
|
+
}
|
|
72
99
|
|
|
73
|
-
|
|
100
|
+
out.push(`### Tier: ${tier}`);
|
|
101
|
+
out.push("");
|
|
102
|
+
|
|
103
|
+
for (const entry of entries.sort((a, b) => {
|
|
104
|
+
const byPack = a.packId.localeCompare(b.packId);
|
|
105
|
+
|
|
106
|
+
if (byPack !== 0) {
|
|
107
|
+
return byPack;
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
return a.ruleName.localeCompare(b.ruleName);
|
|
111
|
+
})) {
|
|
112
|
+
out.push(
|
|
113
|
+
`- **${entry.packId}/${entry.ruleName}** [${entry.severity}]: ${entry.description}`
|
|
114
|
+
);
|
|
74
115
|
}
|
|
75
116
|
|
|
76
117
|
out.push("");
|
|
77
118
|
}
|
|
78
119
|
|
|
79
|
-
// Section: Meta-Rules
|
|
80
120
|
out.push("## Meta-Rules");
|
|
81
121
|
out.push("");
|
|
82
122
|
out.push(
|
|
@@ -103,7 +143,6 @@ for (const rule of META_RULES) {
|
|
|
103
143
|
rulesByCategory.set(cat, rules);
|
|
104
144
|
}
|
|
105
145
|
|
|
106
|
-
// Render meta-rules by category.
|
|
107
146
|
for (const category of categoryOrder) {
|
|
108
147
|
const rules = rulesByCategory.get(category) ?? [];
|
|
109
148
|
|
|
@@ -123,6 +162,24 @@ for (const category of categoryOrder) {
|
|
|
123
162
|
out.push("");
|
|
124
163
|
}
|
|
125
164
|
|
|
165
|
+
out.push("## Out of scope");
|
|
166
|
+
out.push("");
|
|
167
|
+
out.push(
|
|
168
|
+
"The following are intentionally deferred — wrong tool for the syntactic ESLint gate, or require cross-file analysis:"
|
|
169
|
+
);
|
|
170
|
+
out.push("");
|
|
171
|
+
out.push(
|
|
172
|
+
"- GraphQL/WebSocket/OpenAPI contract rules (until OpenAPI dep + parser)"
|
|
173
|
+
);
|
|
174
|
+
out.push(
|
|
175
|
+
"- Container/Kubernetes YAML hardening (future meta-rules when Dockerfile/k8s detected)"
|
|
176
|
+
);
|
|
177
|
+
out.push("- LLM/MCP security packs (opt-in when AI SDK deps detected)");
|
|
178
|
+
out.push("- FSD layer DAG / full authorization taint tracking");
|
|
179
|
+
out.push("- Lighthouse / bundle-analyzer CI gates");
|
|
180
|
+
out.push("- Violation ratcheting / baseline snapshots (Phase 5)");
|
|
181
|
+
out.push("");
|
|
182
|
+
|
|
126
183
|
const path = join(import.meta.dir, "..", "RULES.md");
|
|
127
184
|
|
|
128
185
|
await Bun.write(path, out.join("\n"));
|
package/scripts/cli-metrics.ts
CHANGED
|
@@ -10,6 +10,7 @@ import { readdir } from "node:fs/promises";
|
|
|
10
10
|
import { homedir } from "node:os";
|
|
11
11
|
import { join } from "node:path";
|
|
12
12
|
import { isRecord } from "../src/lib/guards";
|
|
13
|
+
import { classifyRun, parseEventLog } from "../src/eval";
|
|
13
14
|
|
|
14
15
|
function num(value: unknown): number {
|
|
15
16
|
return typeof value === "number" ? value : 0;
|
|
@@ -168,6 +169,9 @@ async function main(): Promise<void> {
|
|
|
168
169
|
const text = await Bun.file(path).text();
|
|
169
170
|
const lines = text.split("\n").filter((l) => l.trim().length > 0);
|
|
170
171
|
const m = analyze(lines);
|
|
172
|
+
// Single source of truth for WHY a run failed — the same classifier the eval
|
|
173
|
+
// sweep and the reusable analyzeEvents() use, fed the typed event stream.
|
|
174
|
+
const failure = classifyRun(parseEventLog(text));
|
|
171
175
|
const pct =
|
|
172
176
|
m.contextWindow > 0
|
|
173
177
|
? Math.round((m.peakContext / m.contextWindow) * 100)
|
|
@@ -182,6 +186,12 @@ async function main(): Promise<void> {
|
|
|
182
186
|
["model", m.model],
|
|
183
187
|
["context window", String(m.contextWindow)],
|
|
184
188
|
["final status", m.finalStatus],
|
|
189
|
+
[
|
|
190
|
+
"failure class",
|
|
191
|
+
failure.detail === undefined
|
|
192
|
+
? failure.failureClass
|
|
193
|
+
: `${failure.failureClass} (${failure.detail})`,
|
|
194
|
+
],
|
|
185
195
|
["turns (repair iterations)", String(m.turns)],
|
|
186
196
|
["model calls", String(m.modelCalls)],
|
|
187
197
|
["tokens out (→ solution)", String(m.tokensOut)],
|
package/scripts/sweep.ts
CHANGED
|
@@ -11,7 +11,14 @@ import { runSpec, qualityRepair } from "../src/loop";
|
|
|
11
11
|
import { modelAgent } from "../src/agent";
|
|
12
12
|
import { OpenAICompatibleProvider } from "../src/inference";
|
|
13
13
|
import { resolveActiveModel, resolveApiKey } from "../src/models-config";
|
|
14
|
-
import {
|
|
14
|
+
import { providerConfig } from "../src/cli";
|
|
15
|
+
import {
|
|
16
|
+
summarize,
|
|
17
|
+
classifyRun,
|
|
18
|
+
renderSweepReportMarkdown,
|
|
19
|
+
buildSweepReport,
|
|
20
|
+
type IRunRecord,
|
|
21
|
+
} from "../src/eval";
|
|
15
22
|
import { renderEvent } from "../src/render";
|
|
16
23
|
import type { ILoopEvent } from "../src/loop";
|
|
17
24
|
|
|
@@ -109,28 +116,32 @@ const seedFiles = await readdir(seedDir, { recursive: true });
|
|
|
109
116
|
// unreachable endpoint and hung with an empty run.log.)
|
|
110
117
|
const { entry: activeModel } = await resolveActiveModel();
|
|
111
118
|
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
// strict gate). Default off; enable via env if a target genuinely loops.
|
|
121
|
-
repetitionPenalty:
|
|
122
|
-
process.env.TSFORGE_REPETITION_PENALTY === undefined
|
|
123
|
-
? undefined
|
|
124
|
-
: Number(process.env.TSFORGE_REPETITION_PENALTY),
|
|
125
|
-
});
|
|
119
|
+
// Build the wire config the SAME way the CLI does (`providerConfig`), so the
|
|
120
|
+
// sweep inherits the active entry's provider dialect — `reasoning`,
|
|
121
|
+
// `reasoningEffort`, `extraBody`, `extraHeaders`. Hand-rolling the config here
|
|
122
|
+
// dropped those fields, so a DeepSeek sweep sent qwen-only params and hit the
|
|
123
|
+
// 400s the interactive path already handles. maxTokens still defaults to
|
|
124
|
+
// PROVIDER_LIMITS (16384) — thinking tokens count against it, so reasoning +
|
|
125
|
+
// code get room. Repetition penalty stays opt-in via TSFORGE_REPETITION_PENALTY.
|
|
126
|
+
const provider = new OpenAICompatibleProvider(providerConfig(activeModel));
|
|
126
127
|
|
|
127
128
|
// The judge scores quality. Point it at a flagship via TSFORGE_JUDGE_URL/MODEL
|
|
128
|
-
// (+ TSFORGE_JUDGE_KEY) to measure the gap
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
129
|
+
// (+ TSFORGE_JUDGE_KEY) to measure the gap. When NOT overridden, the active
|
|
130
|
+
// model judges itself — reuse its full dialect via providerConfig so a
|
|
131
|
+
// self-judge against DeepSeek speaks DeepSeek too. An explicit external judge
|
|
132
|
+
// is a plain generic call (its own endpoint, no inherited reasoning dialect).
|
|
133
|
+
const judgeOverridden =
|
|
134
|
+
process.env.TSFORGE_JUDGE_URL !== undefined ||
|
|
135
|
+
process.env.TSFORGE_JUDGE_MODEL !== undefined;
|
|
136
|
+
const judgeProvider = new OpenAICompatibleProvider(
|
|
137
|
+
judgeOverridden
|
|
138
|
+
? {
|
|
139
|
+
baseUrl: process.env.TSFORGE_JUDGE_URL ?? activeModel.baseUrl,
|
|
140
|
+
model: process.env.TSFORGE_JUDGE_MODEL ?? activeModel.model,
|
|
141
|
+
apiKey: process.env.TSFORGE_JUDGE_KEY ?? resolveApiKey(activeModel),
|
|
142
|
+
}
|
|
143
|
+
: providerConfig(activeModel)
|
|
144
|
+
);
|
|
134
145
|
|
|
135
146
|
/** Sortable timestamp `YYYYMMDD-HHMMSS` so run dirs sort newest-last by name. */
|
|
136
147
|
function stamp(): string {
|
|
@@ -263,8 +274,12 @@ async function runOne(
|
|
|
263
274
|
// Every run gets a full transcript at <runDir>/run.log; stream to the
|
|
264
275
|
// terminal too when TSFORGE_STREAM=1.
|
|
265
276
|
const log = Bun.file(join(runDir, "run.log")).writer();
|
|
277
|
+
// Keep the structured events so a failed run can be classified (WHY it
|
|
278
|
+
// failed), not just counted — fed to classifyRun below.
|
|
279
|
+
const runEvents: ILoopEvent[] = [];
|
|
266
280
|
|
|
267
281
|
const onEvent = (e: ILoopEvent): void => {
|
|
282
|
+
runEvents.push(e);
|
|
268
283
|
void log.write(renderEvent(e, { color: false }));
|
|
269
284
|
// Flush per event — otherwise Bun's FileSink buffers and `tail -f` shows
|
|
270
285
|
// nothing until the run ends. The log must be live.
|
|
@@ -354,6 +369,9 @@ async function runOne(
|
|
|
354
369
|
);
|
|
355
370
|
|
|
356
371
|
const vLabel = variantLabel(variantEnv);
|
|
372
|
+
const failureClass = passed
|
|
373
|
+
? undefined
|
|
374
|
+
: classifyRun(runEvents).failureClass;
|
|
357
375
|
|
|
358
376
|
records.push({
|
|
359
377
|
label: `${vLabel} temp=${temp}`,
|
|
@@ -361,9 +379,10 @@ async function runOne(
|
|
|
361
379
|
cycles,
|
|
362
380
|
ms,
|
|
363
381
|
quality,
|
|
382
|
+
...(failureClass === undefined ? {} : { failureClass }),
|
|
364
383
|
});
|
|
365
384
|
process.stdout.write(
|
|
366
|
-
` ${seed} ${vLabel} temp=${temp} #${i + 1}: ${passed ? "done" :
|
|
385
|
+
` ${seed} ${vLabel} temp=${temp} #${i + 1}: ${passed ? "done" : `blocked[${failureClass ?? "unknown"}]`} (${cycles} cyc, ${edits} edits, ${regressions} regress, ${ms}ms${quality === undefined ? "" : `, Q${quality}/5`}) → ${runId}\n`
|
|
367
386
|
);
|
|
368
387
|
} finally {
|
|
369
388
|
restore();
|
|
@@ -375,11 +394,22 @@ const summaries = summarize(records);
|
|
|
375
394
|
process.stdout.write(`\n=== sweep: ${seed} (${repeats} runs/variant) ===\n`);
|
|
376
395
|
|
|
377
396
|
for (const s of summaries) {
|
|
397
|
+
const failures = Object.entries(s.failureClasses)
|
|
398
|
+
.sort(([, a], [, b]) => b - a)
|
|
399
|
+
.map(([cls, n]) => `${cls}×${String(n)}`)
|
|
400
|
+
.join(", ");
|
|
401
|
+
|
|
378
402
|
process.stdout.write(
|
|
379
|
-
`${s.label.padEnd(10)} pass ${Math.round(s.passRate * 100)}% (${s.passed}/${s.runs}) Q ${s.avgQuality.toFixed(1)}/5 avg ${s.avgCycles.toFixed(1)} cyc ${Math.round(s.avgMs)}ms\n`
|
|
403
|
+
`${s.label.padEnd(10)} pass ${Math.round(s.passRate * 100)}% (${s.passed}/${s.runs}) Q ${s.avgQuality.toFixed(1)}/5 avg ${s.avgCycles.toFixed(1)} cyc ${Math.round(s.avgMs)}ms${failures.length > 0 ? ` [${failures}]` : ""}\n`
|
|
380
404
|
);
|
|
381
405
|
}
|
|
382
406
|
|
|
407
|
+
// The statistical report (Wilson CI + z-test vs baseline) now also tabulates a
|
|
408
|
+
// per-variant failure-class breakdown — WHY runs failed, not just how often.
|
|
409
|
+
process.stdout.write(
|
|
410
|
+
`\n${renderSweepReportMarkdown(buildSweepReport(records))}\n`
|
|
411
|
+
);
|
|
412
|
+
|
|
383
413
|
const outPath = join(evalsRoot, "runs", `sweep-${seed}-${stamp()}.json`);
|
|
384
414
|
|
|
385
415
|
await Bun.write(
|