@agjs/tsforge 0.1.14 → 0.1.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -1
- package/scripts/analyze-malformed.ts +264 -0
- package/scripts/analyze-runs.ts +279 -0
- package/scripts/benchmark-catalog.ts +387 -0
- package/scripts/browser-check.ts +87 -0
- package/scripts/build-rule-docs.ts +122 -0
- package/scripts/build-rules-md.ts +129 -0
- package/scripts/cli-metrics.ts +203 -0
- package/scripts/coverage-check.ts +33 -0
- package/scripts/edit-benchmark.ts +314 -0
- package/scripts/eval-create.ts +48 -0
- package/scripts/eval-spec.ts +47 -0
- package/scripts/eval-sum.ts +79 -0
- package/scripts/gen-tests.ts +140 -0
- package/scripts/headless-build.ts +292 -0
- package/scripts/interactive-eval.ts +172 -0
- package/scripts/rejudge.ts +135 -0
- package/scripts/run-eval-todo.ts +59 -0
- package/scripts/smoke.ts +18 -0
- package/scripts/stub-check.ts +44 -0
- package/scripts/sweep-report.ts +76 -0
- package/scripts/sweep.ts +389 -0
- package/src/cli.ts +39 -1
- package/src/inference/inference.types.ts +20 -0
- package/src/inference/openai-compatible.ts +11 -34
- package/src/inference/request.ts +148 -0
- package/src/models-config.ts +13 -0
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
// Live "create from scratch" eval against the local Qwen3.6. Not in the suite.
|
|
2
|
+
// Run: bun run packages/core/scripts/eval-create.ts
|
|
3
|
+
import { mkdtemp, rm } from "node:fs/promises";
|
|
4
|
+
import { tmpdir } from "node:os";
|
|
5
|
+
import { join } from "node:path";
|
|
6
|
+
import { OpenAICompatibleProvider, PROVIDER_DEFAULTS } from "../src/inference";
|
|
7
|
+
import { runTask } from "../src/loop";
|
|
8
|
+
|
|
9
|
+
// The workdir is seeded with the spec (the test) only; greet.ts is deliberately
// absent so the model has to CREATE it.
const TESTFILE = `import { test, expect } from "bun:test";
import { greet } from "./greet";
test("greets by name", () => {
expect(greet("Sam")).toBe("Hello, Sam");
});
`;

// Endpoint and model come from TSFORGE_* env vars, else the provider defaults.
const baseUrl = process.env.TSFORGE_BASE_URL ?? PROVIDER_DEFAULTS.baseUrl;
const model = process.env.TSFORGE_MODEL ?? PROVIDER_DEFAULTS.model;
const provider = new OpenAICompatibleProvider({ baseUrl, model });

// Throwaway workdir; removed in `finally` whether or not the run succeeds.
const workDir = await mkdtemp(join(tmpdir(), "tsforge-eval-create-"));

try {
  await Bun.write(join(workDir, "greet.test.ts"), TESTFILE);

  // greet.test.ts is declared so the model can read the spec; greet.ts is the
  // file it is expected to produce.
  const outcome = await runTask(
    {
      id: "greet",
      accept: "bun test greet.test.ts",
      files: ["greet.ts", "greet.test.ts"],
    },
    workDir,
    provider
  );

  console.log("result:", JSON.stringify(outcome));

  // Show what the model wrote, or say so if it never created the file.
  const greetFile = Bun.file(join(workDir, "greet.ts"));
  let greetSource = "(not created)";

  if (await greetFile.exists()) {
    greetSource = await greetFile.text();
  }

  console.log("greet.ts:\n" + greetSource);
} finally {
  await rm(workDir, { recursive: true, force: true });
}
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
// Live multi-task eval against the local Qwen3.6. Not part of the test suite.
|
|
2
|
+
// Run: bun run packages/core/scripts/eval-spec.ts
|
|
3
|
+
import { mkdtemp, rm } from "node:fs/promises";
|
|
4
|
+
import { tmpdir } from "node:os";
|
|
5
|
+
import { join } from "node:path";
|
|
6
|
+
import { OpenAICompatibleProvider, PROVIDER_DEFAULTS } from "../src/inference";
|
|
7
|
+
import { runSpec } from "../src/loop";
|
|
8
|
+
import type { ISpec } from "../src/spec";
|
|
9
|
+
|
|
10
|
+
// Seed files for the workdir: two implementations that return the wrong value,
// plus the tests that expose them.
const FILES: Record<string, string> = {
  "add.ts": `export function add(a: number, b: number): number {\n return 0;\n}\n`,
  "add.test.ts": `import { test, expect } from "bun:test";\nimport { add } from "./add";\ntest("adds", () => { expect(add(2, 3)).toBe(5); });\n`,
  "mul.ts": `export function mul(a: number, b: number): number {\n return 0;\n}\n`,
  "mul.test.ts": `import { test, expect } from "bun:test";\nimport { mul } from "./mul";\ntest("muls", () => { expect(mul(2, 3)).toBe(6); });\n`,
};

// One task per helper, each accepted by its own test file; the spec as a whole
// is verified with the full `bun test`.
const spec: ISpec = {
  id: "math",
  title: "Math helpers",
  verify: "bun test",
  tasks: [
    { id: "add", accept: "bun test add.test.ts", files: ["add.ts"] },
    { id: "mul", accept: "bun test mul.test.ts", files: ["mul.ts"] },
  ],
};

// Endpoint and model come from TSFORGE_* env vars, else the provider defaults.
const baseUrl = process.env.TSFORGE_BASE_URL ?? PROVIDER_DEFAULTS.baseUrl;
const model = process.env.TSFORGE_MODEL ?? PROVIDER_DEFAULTS.model;
const provider = new OpenAICompatibleProvider({ baseUrl, model });

// Throwaway workdir; removed in `finally` whether or not the run succeeds.
const workDir = await mkdtemp(join(tmpdir(), "tsforge-eval-spec-"));

try {
  // Written one at a time, in declaration order.
  for (const name of Object.keys(FILES)) {
    await Bun.write(join(workDir, name), FILES[name]);
  }

  const outcome = await runSpec(spec, workDir, provider);

  console.log("spec status:", outcome.status);
  console.log("tasks:", JSON.stringify(outcome.results));

  // Dump both implementation files as the run left them.
  for (const name of ["add.ts", "mul.ts"]) {
    const source = await Bun.file(join(workDir, name)).text();

    console.log(name + ":\n" + source);
  }
} finally {
  await rm(workDir, { recursive: true, force: true });
}
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
// Live eval against the local Qwen3.6. Not part of the test suite.
|
|
2
|
+
// Run: bun run packages/core/scripts/eval-sum.ts
|
|
3
|
+
import { mkdtemp, rm } from "node:fs/promises";
|
|
4
|
+
import { tmpdir } from "node:os";
|
|
5
|
+
import { join } from "node:path";
|
|
6
|
+
import { OpenAICompatibleProvider, PROVIDER_DEFAULTS } from "../src/inference";
|
|
7
|
+
import { runTask } from "../src/loop";
|
|
8
|
+
|
|
9
|
+
// The implementation under repair: compiles, but returns the wrong value.
const BROKEN = `export function sum(a: number, b: number): number {
return 0; // wrong on purpose
}
`;
// The test that exposes the bug.
const TESTFILE = `import { test, expect } from "bun:test";
import { sum } from "./sum";
test("adds", () => { expect(sum(2, 3)).toBe(5); });
`;

// Function-tool schema offered to the model in the direct call: one `edit` tool
// taking a file plus an old/new snippet pair, all three required.
const EDIT_TOOL = {
  type: "function",
  function: {
    name: "edit",
    description: "Replace an exact, unique snippet in a file.",
    parameters: {
      type: "object",
      properties: {
        file: { type: "string" },
        oldString: { type: "string" },
        newString: { type: "string" },
      },
      required: ["file", "oldString", "newString"],
    },
  },
};

// Endpoint and model come from TSFORGE_* env vars, else the provider defaults.
const baseUrl = process.env.TSFORGE_BASE_URL ?? PROVIDER_DEFAULTS.baseUrl;
const model = process.env.TSFORGE_MODEL ?? PROVIDER_DEFAULTS.model;
const provider = new OpenAICompatibleProvider({ baseUrl, model });

// (1) Diagnostic: when handed BOTH the file and the test, does the model emit a
// valid edit tool call?
const systemPrompt =
  "You are a TypeScript engineer. Fix the bug by emitting an `edit` " +
  "tool call. oldString must match the file exactly and uniquely.";
const userPrompt = `File sum.ts:\n${BROKEN}\nTest sum.test.ts:\n${TESTFILE}\nMake the test pass.`;

console.log("=== (1) direct call, model given full context ===");

const direct = await provider.complete(
  [
    { role: "system", content: systemPrompt },
    { role: "user", content: userPrompt },
  ],
  { temperature: 0, tools: [EDIT_TOOL] }
);

console.log("content:", JSON.stringify(direct.content));
console.log("toolCalls:", JSON.stringify(direct.toolCalls, null, 2));

// (2) The blind loop: the current ModelAgent receives NO file content, only errors.
console.log("\n=== (2) full loop via ModelAgent (blind to file contents) ===");

// Throwaway workdir; removed in `finally` whether or not the run succeeds.
const workDir = await mkdtemp(join(tmpdir(), "tsforge-eval-"));

try {
  const sumPath = join(workDir, "sum.ts");

  await Bun.write(sumPath, BROKEN);
  await Bun.write(join(workDir, "sum.test.ts"), TESTFILE);

  const outcome = await runTask(
    { id: "sum", accept: "bun test sum.test.ts", files: ["sum.ts"] },
    workDir,
    provider
  );

  console.log("result:", JSON.stringify(outcome));

  const repaired = await Bun.file(sumPath).text();

  console.log("sum.ts after:\n" + repaired);
} finally {
  await rm(workDir, { recursive: true, force: true });
}
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
// Prove the UNTESTED-SPEC pipeline against the live local model, end to end:
|
|
2
|
+
// spec (criteria only, no tests)
|
|
3
|
+
// → generateTests: model writes a suite + throwing stub, verified RED
|
|
4
|
+
// → implement loop: model drives the stub to GREEN against its own tests
|
|
5
|
+
// If the gate goes green, an untested spec became working, verified code with no
|
|
6
|
+
// human-written tests and no flagship model — purely local.
|
|
7
|
+
//
|
|
8
|
+
// Run: TSFORGE_SEED=money bun run packages/core/scripts/gen-tests.ts
|
|
9
|
+
import { mkdir, readdir } from "node:fs/promises";
|
|
10
|
+
import { join } from "node:path";
|
|
11
|
+
import { parseSpec } from "../src/spec";
|
|
12
|
+
import { generateTests } from "../src/spec/generate-tests";
|
|
13
|
+
import { reviewAndFixSuite } from "../src/spec/review-tests";
|
|
14
|
+
import { runSpec } from "../src/loop";
|
|
15
|
+
import { OpenAICompatibleProvider, PROVIDER_DEFAULTS } from "../src/inference";
|
|
16
|
+
import { renderEvent } from "../src/render";
|
|
17
|
+
import type { ILoopEvent } from "../src/loop";
|
|
18
|
+
|
|
19
|
+
const seed = process.env.TSFORGE_SEED ?? "money";
const evalsRoot = join(import.meta.dir, "..", "..", "..", "evals");
const seedDir = join(evalsRoot, seed);

/**
 * Read an optional numeric override from the environment.
 *
 * @param name - Environment variable to read.
 * @returns The parsed number, or `undefined` when the variable is unset, blank,
 *   or not a finite number. A rejected value is reported on stderr so a typo is
 *   not silently forwarded to the provider as `NaN` (or as `0` for a blank).
 */
function numberFromEnv(name: string): number | undefined {
  const raw = process.env[name];

  if (raw === undefined) {
    return undefined;
  }

  const value = Number(raw);

  // Number("") is 0 and Number("abc") is NaN — neither is a real setting.
  if (raw.trim().length === 0 || !Number.isFinite(value)) {
    process.stderr.write(
      `ignoring ${name}=${JSON.stringify(raw)}: expected a number\n`
    );

    return undefined;
  }

  return value;
}

// The model under test: TSFORGE_* env overrides, else the provider defaults.
const provider = new OpenAICompatibleProvider({
  baseUrl: process.env.TSFORGE_BASE_URL ?? PROVIDER_DEFAULTS.baseUrl,
  model: process.env.TSFORGE_MODEL ?? PROVIDER_DEFAULTS.model,
  apiKey: process.env.TSFORGE_API_KEY,
  repetitionPenalty: numberFromEnv("TSFORGE_REPETITION_PENALTY"),
});

// OFFLINE teacher: vets the generated suite for unsatisfiable / over-strict /
// ambiguous assertions before it becomes the gate. Point it at a flagship via
// TSFORGE_JUDGE_URL/MODEL/KEY; with no override it falls back to the local model
// (so the step still runs, just weaker). Never a runtime dependency.
const judge = new OpenAICompatibleProvider({
  baseUrl:
    process.env.TSFORGE_JUDGE_URL ??
    process.env.TSFORGE_BASE_URL ??
    PROVIDER_DEFAULTS.baseUrl,
  model:
    process.env.TSFORGE_JUDGE_MODEL ??
    process.env.TSFORGE_MODEL ??
    PROVIDER_DEFAULTS.model,
  apiKey: process.env.TSFORGE_JUDGE_KEY ?? process.env.TSFORGE_API_KEY,
});

// The seed's spec lives at evals/<seed>/<seed>.spec.md; only its first task is used.
const specText = await Bun.file(join(seedDir, `${seed}.spec.md`)).text();
const spec = parseSpec(specText);
const task = spec.tasks[0];

if (task === undefined) {
  throw new Error(`spec ${seed} has no tasks`);
}

// The implementation target is the task's first file; the test file is the
// first `*.test.ts` entry among its context files.
const implFile = task.files[0];
const testFile = (task.context ?? []).find((f) => f.endsWith(".test.ts"));

if (implFile === undefined || testFile === undefined) {
  throw new Error(
    `spec ${seed} task needs a files: impl and a *.test.ts context`
  );
}

// Fresh workdir, copied from the seed EXCEPT the hand-written test — the spec
// goes in genuinely untested. (The impl isn't in the seed either; the stub
// generation creates it.) The run id carries a local-time YYYYMMDD-HHMMSS stamp.
const d = new Date();
const p = (n: number): string => String(n).padStart(2, "0");
const stamp = `${d.getFullYear()}${p(d.getMonth() + 1)}${p(d.getDate())}-${p(d.getHours())}${p(d.getMinutes())}${p(d.getSeconds())}`;
const runId = `gentests-${seed}-${stamp}`;
const runDir = join(evalsRoot, runId);

await mkdir(runDir, { recursive: true });

for (const file of await readdir(seedDir)) {
  if (file === testFile) {
    continue;
  }

  await Bun.write(join(runDir, file), Bun.file(join(seedDir, file)));
}

// Stream every loop event to the terminal, colorized.
const onEvent = (e: ILoopEvent): void => {
  process.stdout.write(renderEvent(e, { color: true }));
};

// Phase 1 — generate tests + stub from criteria, verified RED.
process.stdout.write(
  `\n=== phase 1: generate tests for ${seed} → ${testFile} ===\n`
);

const gen = await generateTests(provider, runDir, {
  testFile,
  implFile,
  goal: spec.title,
  criteria: specText,
  maxAttempts: 3,
  onEvent,
});

process.stdout.write(
  `\ntests: ${gen.ok ? "RED & runnable" : "FAILED to produce"} · ${gen.testCount} tests · ${gen.attempts} attempt(s)\n`
);

if (!gen.ok) {
  process.stdout.write(`\nstopping: could not generate a real suite.\n`);
  process.exit(1);
}

// Phase 1.5 — OFFLINE teacher review: catch unsatisfiable / over-strict /
// ambiguous assertions before they become the gate. Corrections are re-verified
// RED (reverted if they break it), so this can only ever hand phase 2 a sound
// suite.
process.stdout.write(`\n=== phase 1.5: review generated tests (offline) ===\n`);

const review = await reviewAndFixSuite(judge, runDir, {
  testFile,
  implFile,
  goal: spec.title,
  criteria: specText,
  onEvent,
});

process.stdout.write(
  `\nreview: ${review.findings.length} finding(s), correction ${review.applied ? "applied" : "not applied"}\n`
);

// Phase 2 — drive the stub to green against the model's own generated tests.
process.stdout.write(
  `\n=== phase 2: implement ${seed} against the generated tests ===\n`
);

const result = await runSpec(spec, runDir, provider, { onEvent });
const implemented = result.status === "done";

process.stdout.write(
  `\nimplement: ${implemented ? "GREEN" : "blocked"}\n`
);
process.stdout.write(`\nrun dir → ${runDir}\n`);

// A blocked implement phase is a failed run: report it through the exit status,
// as the phase-1 failure path above already does, so a driver script can tell
// GREEN from blocked without parsing stdout. `exitCode` (rather than `exit()`)
// lets pending output flush first.
if (!implemented) {
  process.exitCode = 1;
}
|
|
@@ -0,0 +1,292 @@
|
|
|
1
|
+
// Run a from-scratch web build against the LIVE model, NON-interactively — the
|
|
2
|
+
// missing piece for an autonomous improve-the-harness loop. Each run gets its OWN
|
|
3
|
+
// dir under evals/runs/ (node_modules symlinked), runs the staged build with
|
|
4
|
+
// the real web gate, streams progress, and writes a --log-style JSONL so
|
|
5
|
+
// `cli-metrics.ts` can score it (repair turns, tokens, what slipped, salvaged).
|
|
6
|
+
//
|
|
7
|
+
// Run: bun run packages/core/scripts/headless-build.ts "build a kanban board" [react|vanilla] [dir]
|
|
8
|
+
// or: bun run packages/core/scripts/headless-build.ts --app <slug|index> [react|vanilla] [dir]
|
|
9
|
+
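// (--app builds a FIXED benchmark-catalog domain with the full generation spec)
// Add --plan anywhere on the command line to also write the model's build plan
// to <dir>/plan.md before implementing (headless never blocks for approval).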
|
|
10
|
+
// then: bun run packages/core/scripts/cli-metrics.ts
|
|
11
|
+
import {
|
|
12
|
+
appendFileSync,
|
|
13
|
+
existsSync,
|
|
14
|
+
mkdirSync,
|
|
15
|
+
symlinkSync,
|
|
16
|
+
writeFileSync,
|
|
17
|
+
} from "node:fs";
|
|
18
|
+
import { join } from "node:path";
|
|
19
|
+
import {
|
|
20
|
+
buildWebFix,
|
|
21
|
+
buildWebGate,
|
|
22
|
+
buildWebTypeGate,
|
|
23
|
+
buildWebTscCheck,
|
|
24
|
+
installWebDeps,
|
|
25
|
+
makeFileLinter,
|
|
26
|
+
scaffoldWeb,
|
|
27
|
+
webGuidance,
|
|
28
|
+
} from "../src/detect-gate";
|
|
29
|
+
import { OpenAICompatibleProvider, PROVIDER_LIMITS } from "../src/inference";
|
|
30
|
+
import { resolveActiveModel, resolveApiKey } from "../src/models-config";
|
|
31
|
+
import { Session, LOOP_LIMITS, type Reporter } from "../src/loop";
|
|
32
|
+
import { renderEvent } from "../src/render";
|
|
33
|
+
import { logsDir } from "../src/session-store";
|
|
34
|
+
import type { WebFramework } from "../src/web-templates";
|
|
35
|
+
import {
|
|
36
|
+
BENCHMARK_CATALOG,
|
|
37
|
+
buildBenchmarkPrompt,
|
|
38
|
+
findBenchmarkApp,
|
|
39
|
+
} from "./benchmark-catalog";
|
|
40
|
+
|
|
41
|
+
/** What `resolveRequest` extracts from argv: the prompt plus how to read the rest. */
interface IBuildRequest {
  /** Complete task prompt passed to the model. */
  readonly prompt: string;
  /** Short slug used for naming the snapshot: the benchmark slug, or "adhoc". */
  readonly label: string;
  /** Index into argv where the optional [framework] [dir] begin (moves when --app is used). */
  readonly tailStart: number;
  /**
   * Entities declared by a catalog build; the coverage gate requires UI for each.
   * Empty for ad-hoc prompts, which have no enforced entity list.
   */
  readonly entities: ReadonlyArray<string>;
}
|
|
52
|
+
|
|
53
|
+
/**
 * Turn argv into a build request.
 *
 * Two forms are accepted: `--app <selector>`, which looks the selector up in the
 * benchmark catalog, or a free-form prompt as the first argument.
 *
 * @returns The request, or `undefined` when no prompt was given or the selector
 *   matches no catalog entry (in which case the catalog is listed on stderr).
 */
function resolveRequest(): IBuildRequest | undefined {
  const [, , first, second] = process.argv;

  // Free-form prompt: the optional [framework] [dir] follow at argv[3].
  if (first !== "--app") {
    if (first === undefined || first.length === 0) {
      return undefined;
    }

    return { prompt: first, label: "adhoc", tailStart: 3, entities: [] };
  }

  // Catalog build: the selector occupies argv[3], so the tail starts at argv[4].
  const selector = second ?? "";
  const app = findBenchmarkApp(selector);

  if (app !== undefined) {
    return {
      prompt: buildBenchmarkPrompt(app),
      label: app.slug,
      tailStart: 4,
      entities: app.entities,
    };
  }

  // Unknown selector: print the numbered catalog so the caller can pick one.
  const catalogLines: string[] = [];

  BENCHMARK_CATALOG.forEach((a, i) => {
    catalogLines.push(` ${String(i + 1)}. ${a.slug} — ${a.name}`);
  });

  const list = catalogLines.join("\n");

  process.stderr.write(
    `unknown benchmark "${selector}". catalog:\n${list}\n`
  );

  return undefined;
}
|
|
87
|
+
|
|
88
|
+
/**
 * Build a reporter that fans every event out to three sinks, in this order:
 * colorized text on stdout, plain text appended to `agentLog` (which lives in the
 * run dir, so `tail -f <rundir>/agent.log` sits right next to the code), and one
 * JSON line appended to `logFile` for cli-metrics.
 *
 * @param logFile - JSONL file; each line is the event plus a `t` timestamp (Date.now()).
 * @param agentLog - Human-readable log file.
 */
function makeReporter(logFile: string, agentLog: string): Reporter {
  return (event) => {
    const colored = renderEvent(event, { color: true });

    process.stdout.write(colored);

    const plain = renderEvent(event, { color: false });

    appendFileSync(agentLog, plain);

    const record = JSON.stringify({ t: Date.now(), ...event });

    appendFileSync(logFile, `${record}\n`);
  };
}
|
|
98
|
+
|
|
99
|
+
/**
 * Make sure the shared dependency cache for `framework` exists.
 *
 * The cache is a canonical scaffold under `<evalsRoot>/.web-cache-<framework>`
 * whose node_modules is installed ONCE and then symlinked into every run dir, so
 * each build gets a fresh isolated directory without a per-run `bun install`.
 *
 * @param evalsRoot - Directory that holds the cache folders.
 * @param framework - Which web scaffold the cache is for.
 * @returns The cache's node_modules path, for the caller to symlink.
 */
async function ensureDepsCache(
  evalsRoot: string,
  framework: WebFramework
): Promise<string> {
  const cacheRoot = join(evalsRoot, `.web-cache-${framework}`);
  const modulesPath = join(cacheRoot, "node_modules");

  // Already installed by an earlier run — nothing to do.
  if (existsSync(modulesPath)) {
    return modulesPath;
  }

  await scaffoldWeb(cacheRoot, framework);
  await installWebDeps(cacheRoot);

  return modulesPath;
}
|
|
116
|
+
|
|
117
|
+
/**
 * Headless plan mode: design first, persist the plan, then implement.
 *
 * Runs the design phase, writes the model's build plan to `plan.md` in the run
 * dir, and goes straight on to implementation — headless never blocks for
 * approval (interactive plan mode is where the human review happens). plan.md is
 * the reviewable artifact: entities, routes, what "done" means, modeling decisions.
 *
 * @param session - The session to drive.
 * @param prompt - Task prompt for the design phase.
 * @param framework - Selects the type gate handed to the design phase.
 * @param dir - Run directory that receives plan.md.
 * @returns The design result if that phase was interrupted, otherwise the
 *   implementation result.
 */
async function runPlanned(
  session: Session,
  prompt: string,
  framework: WebFramework,
  dir: string
): Promise<Awaited<ReturnType<Session["buildStaged"]>>> {
  const typeGateCommand = buildWebTypeGate(framework).command;
  const designResult = await session.designBuild(prompt, {}, typeGateCommand);

  // An interrupted design phase ends the run here; no plan is written.
  if (designResult.status === "interrupted") {
    return designResult;
  }

  const planText = await session.generatePlan();
  const planFile = join(dir, "plan.md");

  writeFileSync(planFile, `${planText}\n`);
  process.stdout.write(`\n📋 plan → ${planFile}\n`);

  return session.implementBuild("", {});
}
|
|
147
|
+
|
|
148
|
+
/**
 * Entry point: parse argv, prepare an isolated run directory, run the build
 * against the live model, print where everything landed, and exit 0 only when
 * the build finished with status "done" (2 on a usage error, 1 otherwise).
 */
async function main(): Promise<void> {
  // `--plan` (plan mode) may sit anywhere on the command line, so it is removed
  // from argv before any positional-argument logic runs. In plan mode the model's
  // build plan is written to plan.md after the design phase and the build then
  // proceeds (headless never blocks for approval).
  const planMode = process.argv.includes("--plan");

  if (planMode) {
    process.argv = process.argv.filter((arg) => arg !== "--plan");
  }

  const request = resolveRequest();

  if (request === undefined) {
    process.stderr.write(
      'usage: headless-build.ts "<prompt>" [react|vanilla] [dir]\n' +
        " or: headless-build.ts --app <slug|index> [react|vanilla] [dir]\n"
    );
    process.exit(2);
  }

  const { prompt, label, tailStart, entities } = request;

  // Anything other than the literal "vanilla" selects react.
  const frameworkArg = process.argv[tailStart];
  const framework: WebFramework =
    frameworkArg === "vanilla" ? "vanilla" : "react";

  // The model comes from the registry (~/.tsforge/models.json) unless TSFORGE_*
  // env overrides it — so a catalog run can target a cloud flagship by editing
  // the registry's `active` (or setting env), no code change.
  const active = await resolveActiveModel();
  const entry = active.entry;
  const model = entry.model;

  // Context window: the registry entry's value first, then a positive finite
  // TSFORGE_CONTEXT_WINDOW, then 262_144.
  const envWindow = Number(process.env.TSFORGE_CONTEXT_WINDOW);
  const fallbackWindow =
    Number.isFinite(envWindow) && envWindow > 0 ? envWindow : 262_144;
  const contextWindow = entry.contextWindow ?? fallbackWindow;

  // EACH RUN GETS ITS OWN DIRECTORY: evals/runs/<timestamp>-<label>/ — so it is
  // always clear where this build's code is, and prior runs are never clobbered.
  // The trailing arg overrides the location. node_modules is symlinked from a
  // one-time install cache, so a fresh dir doesn't mean a fresh `bun install`.
  const evalsRoot = join(import.meta.dir, "..", "..", "..", "evals");
  const isoNow = new Date().toISOString();
  const stamp = isoNow.replace(/[:T]/g, "-").replace(/\..+$/, "");
  const dirArg = process.argv[tailStart + 1];
  const dir = dirArg ?? join(evalsRoot, "runs", `${stamp}-${label}`);

  mkdirSync(dir, { recursive: true });
  await scaffoldWeb(dir, framework);

  const linkedModules = join(dir, "node_modules");

  if (!existsSync(linkedModules)) {
    const cache = await ensureDepsCache(evalsRoot, framework);

    symlinkSync(cache, linkedModules, "dir");
  }

  // The readable log lives in the run dir; the JSONL goes to the shared logs dir.
  const agentLog = join(dir, "agent.log");
  const logFile = join(logsDir(), `${stamp}-headless.jsonl`);

  mkdirSync(logsDir(), { recursive: true });

  const provider = new OpenAICompatibleProvider({
    baseUrl: entry.baseUrl,
    model: entry.model,
    apiKey: resolveApiKey(entry),
    maxTokens: entry.maxTokens ?? PROVIDER_LIMITS.maxTokens,
  });

  const report = makeReporter(logFile, agentLog);

  process.stdout.write(
    `\n📁 BUILD DIR: ${dir}\n` +
      ` follow it: tail -f ${agentLog}\n` +
      ` ${framework} scaffold ready (deps symlinked)\n\n`
  );
  report({
    kind: "start",
    task: "session",
    message: `model ${model} · context window ${contextWindow}`,
    model,
    contextWindow,
  });

  // For catalog builds, APPEND an entity-coverage check to the gate: the app
  // cannot go green until every declared entity has real UI (not just types) —
  // so the model can't satisfice on a subset (4-of-8 entities greened before).
  const webGateCommand = buildWebGate(framework).command;
  let accept = webGateCommand;

  if (entities.length > 0) {
    const coverageScript = join(import.meta.dir, "coverage-check.ts");
    const entityArgs = entities.map((e) => JSON.stringify(e)).join(" ");

    accept = `${webGateCommand} && bun "${coverageScript}" "${dir}" ${entityArgs}`;
  }

  const session = await Session.create({
    provider,
    cwd: dir,
    files: ["**/*"],
    accept,
    fix: buildWebFix(framework),
    incrementalCheck: buildWebTscCheck(),
    // WRITE-TIME LINT: surface the gate's eslint moat rules (no-as, I-prefix,
    // prefer-template) on each file the instant it's written — tsc can't see
    // them, so without this they pile up unseen until the gate (a run log showed
    // 12 `as` casts accumulating that way). cwd = the run dir so vendored
    // ignores resolve.
    lintFile: makeFileLinter(framework, dir),
    // Offer the themed-UI-primitives tool so the model generates button/card/
    // input/etc. (tested, theme-coherent) instead of re-authoring them every build.
    scaffoldUi: framework === "react",
    guidance: webGuidance(framework),
    contextWindow,
    // ADAPTIVE THINKING (measured ~80% of build time is REPAIR): thinking
    // defaults OFF for fast creation; the Session flips it ON automatically while
    // errors are outstanding (interim/gate RED) so repair CONVERGES instead of
    // oscillating to the turn cap (which thinking-off-everywhere did). Best of
    // both: fast create + convergent repair, and no 5-min pre-write spiral
    // (thinking only on repair).
    enableThinking: false,
    // A from-scratch multi-domain app needs more than the 40 default. The full
    // benchmark spec (8+ entities, 40-60 files) needs more still: pm-platform AND
    // hospital-scheduling hit an 80-turn cap (→ 130). Then the ENTITY-COVERAGE
    // gate (cycle-31) raised the bar again: the model can no longer satisfice on
    // 4 of 8 entities — it must build ALL of them. A fast flagship (deepseek) hit
    // the 130 phase-2 cap while GENUINELY converging on the full 8 (it had built
    // 7 of 8's routes, coverage shrinking 4→1). A complete 8-entity app is more
    // than 130 turns of work → 180 so the now-mandatory full build has room to
    // finish.
    maxTurns: LOOP_LIMITS.webMaxTurns,
    report,
  });

  const result = planMode
    ? await runPlanned(session, prompt, framework, dir)
    : await session.buildStaged(
        prompt,
        {},
        buildWebTypeGate(framework).command
      );

  // The run dir IS the persistent, runnable artifact (per-run, never clobbered):
  // `cd <dir> && bun run dev` (node_modules is symlinked). No separate snapshot.
  process.stdout.write(
    `\n[${result.status} · ${result.turns} turn(s)]\n` +
      `📁 code: ${dir}\n` +
      ` agent log: ${agentLog}\n` +
      ` jsonl: ${logFile}\n` +
      " score it: bun run packages/core/scripts/cli-metrics.ts\n"
  );

  const exitStatus = result.status === "done" ? 0 : 1;

  process.exit(exitStatus);
}

await main();
|