@agjs/tsforge 0.3.0 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/scripts/sweep.ts +117 -67
- package/src/cli.ts +5 -0
- package/src/detect-gate.ts +8 -4
- package/src/eval/eval.types.ts +7 -0
- package/src/eval/index.ts +1 -0
- package/src/eval/loc.ts +56 -0
- package/src/eval/report.ts +3 -3
- package/src/eval/score.ts +5 -0
- package/src/lib/scope/scope.constants.ts +21 -13
- package/src/lib/scope/scope.ts +7 -6
- package/src/loop/session.ts +3 -0
- package/src/loop/tools/file-ops.ts +2 -2
- package/src/loop/tools/tool-context.ts +4 -0
- package/src/loop/turn.ts +4 -0
- package/src/web-templates.ts +7 -1
package/package.json
CHANGED
package/scripts/sweep.ts
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
|
-
// Eval sweep: run
|
|
1
|
+
// Eval sweep: run seed spec(s) N times across temperature + feature flag variants, score, tabulate.
|
|
2
2
|
// Run: TSFORGE_SEED=money TSFORGE_TEMPS=0,0.5 TSFORGE_REPEATS=3 bun run packages/core/scripts/sweep.ts
|
|
3
|
+
// TSFORGE_SEED accepts a comma-separated list (e.g. slugify,debounce,rate-limit) — each seed
|
|
4
|
+
// runs the full variant matrix and gets its own report + saved JSON.
|
|
3
5
|
// A/B feature variants:
|
|
4
6
|
// TSFORGE_FEATURE_VARIANTS=ttsr,hashline (sweep across feature toggles)
|
|
5
7
|
// Each feature is a dimension set to on|off; the variants are the cartesian product of all dimensions (e.g. ttsr=on×hashline=off).
|
|
@@ -15,6 +17,7 @@ import { providerConfig } from "../src/cli";
|
|
|
15
17
|
import {
|
|
16
18
|
summarize,
|
|
17
19
|
classifyRun,
|
|
20
|
+
countTaskLoc,
|
|
18
21
|
renderSweepReportMarkdown,
|
|
19
22
|
buildSweepReport,
|
|
20
23
|
type IRunRecord,
|
|
@@ -22,7 +25,10 @@ import {
|
|
|
22
25
|
import { renderEvent } from "../src/render";
|
|
23
26
|
import type { ILoopEvent } from "../src/loop";
|
|
24
27
|
|
|
25
|
-
const
|
|
28
|
+
const seeds = (process.env.TSFORGE_SEED ?? "todo")
|
|
29
|
+
.split(",")
|
|
30
|
+
.map((s) => s.trim())
|
|
31
|
+
.filter((s) => s.length > 0);
|
|
26
32
|
const temps = (process.env.TSFORGE_TEMPS ?? "0,0.5")
|
|
27
33
|
.split(",")
|
|
28
34
|
.map((t) => Number(t.trim()));
|
|
@@ -100,15 +106,17 @@ function variantLabel(variant: IFeatureVariant): string {
|
|
|
100
106
|
const featureVariants = parseFeatureVariants();
|
|
101
107
|
|
|
102
108
|
const evalsRoot = join(import.meta.dir, "..", "..", "..", "evals");
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
109
|
+
|
|
110
|
+
/** Resolve a seed's directory: prefer a local working seed (evals/<seed>); fall
|
|
111
|
+
* back to the committed corpus (evals/corpus/<seed>) so checked-in seeds run with
|
|
112
|
+
* no manual copy step. */
|
|
113
|
+
async function resolveSeedDir(seed: string): Promise<string> {
|
|
114
|
+
const local = join(evalsRoot, seed);
|
|
115
|
+
|
|
116
|
+
return (await Bun.file(join(local, `${seed}.spec.md`)).exists())
|
|
117
|
+
? local
|
|
118
|
+
: join(evalsRoot, "corpus", seed);
|
|
119
|
+
}
|
|
112
120
|
|
|
113
121
|
// Resolve the model the same way the CLI does: explicit TSFORGE_* env wins, else
|
|
114
122
|
// the active entry from ~/.tsforge/models.json. (Previously this hardcoded the
|
|
@@ -151,36 +159,55 @@ function stamp(): string {
|
|
|
151
159
|
return `${d.getFullYear()}${p(d.getMonth() + 1)}${p(d.getDate())}-${p(d.getHours())}${p(d.getMinutes())}${p(d.getSeconds())}`;
|
|
152
160
|
}
|
|
153
161
|
|
|
154
|
-
const
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
const
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
162
|
+
for (const seed of seeds) {
|
|
163
|
+
const seedDir = await resolveSeedDir(seed);
|
|
164
|
+
// Recursive so nested-directory apps (e.g. a React app under `src/`) copy whole;
|
|
165
|
+
// flat single-dir evals are unaffected (recursive readdir returns the same list).
|
|
166
|
+
const seedFiles = await readdir(seedDir, { recursive: true });
|
|
167
|
+
const records: IRunRecord[] = [];
|
|
168
|
+
|
|
169
|
+
for (const variant of featureVariants) {
|
|
170
|
+
const variantEnv = variantToEnvVars(variant);
|
|
171
|
+
const vLabel = variantLabel(variant);
|
|
172
|
+
|
|
173
|
+
for (const temp of temps) {
|
|
174
|
+
for (let i = 0; i < repeats; i += 1) {
|
|
175
|
+
const runId = `${seed}-${vLabel}-t${temp}-${stamp()}-${i + 1}`;
|
|
176
|
+
const runDir = join(evalsRoot, "runs", runId);
|
|
177
|
+
|
|
178
|
+
// One run's failure (e.g. a request timing out) must not abort the sweep —
|
|
179
|
+
// record it as a blocked run and carry on, so a long batch is resilient.
|
|
180
|
+
try {
|
|
181
|
+
records.push(
|
|
182
|
+
await runOne(
|
|
183
|
+
seed,
|
|
184
|
+
seedDir,
|
|
185
|
+
seedFiles,
|
|
186
|
+
runId,
|
|
187
|
+
runDir,
|
|
188
|
+
temp,
|
|
189
|
+
i,
|
|
190
|
+
variantEnv
|
|
191
|
+
)
|
|
192
|
+
);
|
|
193
|
+
} catch (err) {
|
|
194
|
+
const message = err instanceof Error ? err.message : String(err);
|
|
195
|
+
|
|
196
|
+
records.push({
|
|
197
|
+
label: `${vLabel} temp=${temp}`,
|
|
198
|
+
passed: false,
|
|
199
|
+
cycles: 0,
|
|
200
|
+
ms: 0,
|
|
201
|
+
});
|
|
202
|
+
process.stdout.write(
|
|
203
|
+
` ${seed} ${vLabel} temp=${temp} #${i + 1}: ERRORED (${message}) → ${runId}\n`
|
|
204
|
+
);
|
|
205
|
+
}
|
|
181
206
|
}
|
|
182
207
|
}
|
|
183
208
|
}
|
|
209
|
+
|
|
210
|
+
await reportSeed(seed, records);
|
|
184
211
|
}
|
|
185
212
|
|
|
186
213
|
/** Set env vars for a variant, returning a restore function. */
|
|
@@ -206,7 +233,11 @@ function setVariantEnv(variant: Record<string, string>): () => void {
|
|
|
206
233
|
}
|
|
207
234
|
|
|
208
235
|
/** Copy seed files and prepare the run directory. */
|
|
209
|
-
async function setupRunDir(
|
|
236
|
+
async function setupRunDir(
|
|
237
|
+
dir: string,
|
|
238
|
+
seedDir: string,
|
|
239
|
+
seedFiles: string[]
|
|
240
|
+
): Promise<void> {
|
|
210
241
|
await mkdir(dir, { recursive: true });
|
|
211
242
|
|
|
212
243
|
for (const file of seedFiles) {
|
|
@@ -235,16 +266,19 @@ async function startRed(
|
|
|
235
266
|
}
|
|
236
267
|
|
|
237
268
|
async function runOne(
|
|
269
|
+
seed: string,
|
|
270
|
+
seedDir: string,
|
|
271
|
+
seedFiles: string[],
|
|
238
272
|
runId: string,
|
|
239
273
|
runDir: string,
|
|
240
274
|
temp: number,
|
|
241
275
|
i: number,
|
|
242
276
|
variantEnv: Record<string, string> = {}
|
|
243
|
-
): Promise<
|
|
277
|
+
): Promise<IRunRecord> {
|
|
244
278
|
const restore = setVariantEnv(variantEnv);
|
|
245
279
|
|
|
246
280
|
try {
|
|
247
|
-
await setupRunDir(runDir);
|
|
281
|
+
await setupRunDir(runDir, seedDir, seedFiles);
|
|
248
282
|
|
|
249
283
|
const spec = parseSpec(
|
|
250
284
|
await Bun.file(join(runDir, `${seed}.spec.md`)).text()
|
|
@@ -315,6 +349,16 @@ async function runOne(
|
|
|
315
349
|
const cycles = result.results.reduce((acc, r) => acc + r.cycles, 0);
|
|
316
350
|
const passed = result.status === "done";
|
|
317
351
|
|
|
352
|
+
// LOC is the concision signal the gate can't see — measured post-hoc on the
|
|
353
|
+
// GREEN solution's task files (a failed run has no shipped solution to size).
|
|
354
|
+
let loc: number | undefined;
|
|
355
|
+
|
|
356
|
+
if (passed) {
|
|
357
|
+
const taskFiles = spec.tasks.flatMap((t) => t.files);
|
|
358
|
+
|
|
359
|
+
loc = (await countTaskLoc(runDir, taskFiles)).totalLoc;
|
|
360
|
+
}
|
|
361
|
+
|
|
318
362
|
// Once green, drive QUALITY up: judge → improve-per-critique → re-judge.
|
|
319
363
|
let quality: number | undefined;
|
|
320
364
|
let judgeNotes = "";
|
|
@@ -359,6 +403,7 @@ async function runOne(
|
|
|
359
403
|
cycles,
|
|
360
404
|
ms,
|
|
361
405
|
quality,
|
|
406
|
+
loc,
|
|
362
407
|
judgeNotes,
|
|
363
408
|
tasks: result.results,
|
|
364
409
|
},
|
|
@@ -378,47 +423,52 @@ async function runOne(
|
|
|
378
423
|
? undefined
|
|
379
424
|
: classifyRun(runEvents).failureClass;
|
|
380
425
|
|
|
381
|
-
|
|
426
|
+
process.stdout.write(
|
|
427
|
+
` ${seed} ${vLabel} temp=${temp} #${i + 1}: ${passed ? "done" : `blocked[${failureClass ?? "unknown"}]`} (${cycles} cyc, ${edits} edits, ${regressions} regress, ${ms}ms${quality === undefined ? "" : `, Q${quality}/5`}${loc === undefined ? "" : `, ${String(loc)} loc`}) → ${runId}\n`
|
|
428
|
+
);
|
|
429
|
+
|
|
430
|
+
return {
|
|
382
431
|
label: `${vLabel} temp=${temp}`,
|
|
383
432
|
passed,
|
|
384
433
|
cycles,
|
|
385
434
|
ms,
|
|
386
435
|
quality,
|
|
436
|
+
...(loc === undefined ? {} : { loc }),
|
|
387
437
|
...(failureClass === undefined ? {} : { failureClass }),
|
|
388
|
-
}
|
|
389
|
-
process.stdout.write(
|
|
390
|
-
` ${seed} ${vLabel} temp=${temp} #${i + 1}: ${passed ? "done" : `blocked[${failureClass ?? "unknown"}]`} (${cycles} cyc, ${edits} edits, ${regressions} regress, ${ms}ms${quality === undefined ? "" : `, Q${quality}/5`}) → ${runId}\n`
|
|
391
|
-
);
|
|
438
|
+
};
|
|
392
439
|
} finally {
|
|
393
440
|
restore();
|
|
394
441
|
}
|
|
395
442
|
}
|
|
396
443
|
|
|
397
|
-
|
|
444
|
+
/** Print one seed's per-variant summary + statistical report, and save its JSON. */
|
|
445
|
+
async function reportSeed(seed: string, records: IRunRecord[]): Promise<void> {
|
|
446
|
+
const summaries = summarize(records);
|
|
398
447
|
|
|
399
|
-
process.stdout.write(`\n=== sweep: ${seed} (${repeats} runs/variant) ===\n`);
|
|
448
|
+
process.stdout.write(`\n=== sweep: ${seed} (${repeats} runs/variant) ===\n`);
|
|
400
449
|
|
|
401
|
-
for (const s of summaries) {
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
450
|
+
for (const s of summaries) {
|
|
451
|
+
const failures = Object.entries(s.failureClasses)
|
|
452
|
+
.sort(([, a], [, b]) => b - a)
|
|
453
|
+
.map(([cls, n]) => `${cls}×${String(n)}`)
|
|
454
|
+
.join(", ");
|
|
406
455
|
|
|
456
|
+
process.stdout.write(
|
|
457
|
+
`${s.label.padEnd(10)} pass ${Math.round(s.passRate * 100)}% (${s.passed}/${s.runs}) Q ${s.avgQuality.toFixed(1)}/5 ${s.avgLoc.toFixed(1)} loc avg ${s.avgCycles.toFixed(1)} cyc ${Math.round(s.avgMs)}ms${failures.length > 0 ? ` [${failures}]` : ""}\n`
|
|
458
|
+
);
|
|
459
|
+
}
|
|
460
|
+
|
|
461
|
+
// The statistical report (Wilson CI + z-test vs baseline) now also tabulates a
|
|
462
|
+
// per-variant failure-class breakdown — WHY runs failed, not just how often.
|
|
407
463
|
process.stdout.write(
|
|
408
|
-
|
|
464
|
+
`\n${renderSweepReportMarkdown(buildSweepReport(records))}\n`
|
|
409
465
|
);
|
|
410
|
-
}
|
|
411
|
-
|
|
412
|
-
// The statistical report (Wilson CI + z-test vs baseline) now also tabulates a
|
|
413
|
-
// per-variant failure-class breakdown — WHY runs failed, not just how often.
|
|
414
|
-
process.stdout.write(
|
|
415
|
-
`\n${renderSweepReportMarkdown(buildSweepReport(records))}\n`
|
|
416
|
-
);
|
|
417
466
|
|
|
418
|
-
const outPath = join(evalsRoot, "runs", `sweep-${seed}-${stamp()}.json`);
|
|
467
|
+
const outPath = join(evalsRoot, "runs", `sweep-${seed}-${stamp()}.json`);
|
|
419
468
|
|
|
420
|
-
await Bun.write(
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
);
|
|
424
|
-
process.stdout.write(`\nsaved ${outPath}\n`);
|
|
469
|
+
await Bun.write(
|
|
470
|
+
outPath,
|
|
471
|
+
JSON.stringify({ seed, temps, repeats, records, summaries }, null, 2)
|
|
472
|
+
);
|
|
473
|
+
process.stdout.write(`\nsaved ${outPath}\n`);
|
|
474
|
+
}
|
package/src/cli.ts
CHANGED
|
@@ -896,6 +896,11 @@ async function repl(args: ICliArgs): Promise<number> {
|
|
|
896
896
|
// in the model's list; setSetupWeb() below only wires its callback.
|
|
897
897
|
...(args.web
|
|
898
898
|
? {
|
|
899
|
+
// --web pre-scaffolds the app, so scaffold_web isn't needed — but the
|
|
900
|
+
// build still needs scaffold_ui + scaffold_routes (+ add_dependency),
|
|
901
|
+
// which `scaffoldUi: true` registers. Without this the web guidance
|
|
902
|
+
// tells the model to call tools that aren't in its list and it deadlocks.
|
|
903
|
+
scaffoldUi: true,
|
|
899
904
|
guidance: webGuidance("react"),
|
|
900
905
|
fix: buildWebFix("react"),
|
|
901
906
|
incrementalCheck: buildWebTscCheck(),
|
package/src/detect-gate.ts
CHANGED
|
@@ -385,10 +385,14 @@ async function initMswWorker(cwd: string): Promise<void> {
|
|
|
385
385
|
}
|
|
386
386
|
|
|
387
387
|
try {
|
|
388
|
-
//
|
|
389
|
-
//
|
|
390
|
-
//
|
|
391
|
-
|
|
388
|
+
// `--no-save`, NOT a bare `init`: bare `init` (save flag absent) drops into an
|
|
389
|
+
// interactive @inquirer "save the worker dir to package.json?" prompt, which has
|
|
390
|
+
// no TTY in this headless pipeline and crashes the msw child with ExitPromptError.
|
|
391
|
+
// `--save` would answer it but rewrites package.json un-prettified (fails the
|
|
392
|
+
// gate's format check). `--no-save` copies the worker, skips package.json, and
|
|
393
|
+
// never prompts. The worker lands at the default `/mockServiceWorker.js`, which
|
|
394
|
+
// `worker.start()` finds without any config.
|
|
395
|
+
await Bun.spawn(["bunx", "msw", "init", "public", "--no-save"], {
|
|
392
396
|
cwd,
|
|
393
397
|
stdout: "inherit",
|
|
394
398
|
stderr: "inherit",
|
package/src/eval/eval.types.ts
CHANGED
|
@@ -23,6 +23,10 @@ export interface IRunRecord {
|
|
|
23
23
|
ms: number;
|
|
24
24
|
/** LLM-judge quality score (1–5), when available. */
|
|
25
25
|
quality?: number;
|
|
26
|
+
/** Lines of code in the solution's task files (non-blank, non-comment), measured
|
|
27
|
+
* post-hoc on a green run. The concision signal the gate is blind to; omitted
|
|
28
|
+
* for a failed run (there's no shipped solution to measure). */
|
|
29
|
+
loc?: number;
|
|
26
30
|
/** Structured reason a failed run failed (from classifyRun); omitted/`none`
|
|
27
31
|
* for a passing run. The substrate for turning failures into interventions. */
|
|
28
32
|
failureClass?: FailureClass;
|
|
@@ -38,6 +42,9 @@ export interface IVariantSummary {
|
|
|
38
42
|
avgMs: number;
|
|
39
43
|
/** Average quality across runs that were scored (0 if none). */
|
|
40
44
|
avgQuality: number;
|
|
45
|
+
/** Average LOC across runs that recorded it — i.e. green runs (0 if none). The
|
|
46
|
+
* lower-is-better concision metric, compared per task across variants. */
|
|
47
|
+
avgLoc: number;
|
|
41
48
|
/** Count of failed runs by failure class (e.g. {"type-error": 2}); empty when
|
|
42
49
|
* no run carried a class. Lets a sweep show WHY a variant failed, not just how
|
|
43
50
|
* often. */
|
package/src/eval/index.ts
CHANGED
package/src/eval/loc.ts
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import { join } from "node:path";
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Lines-of-code counter — a cheap structural proxy for solution SIZE, used by the
|
|
5
|
+
* eval sweep to measure concision (the axis the gate is blind to: it checks that
|
|
6
|
+
* code is correct, never that it is lean).
|
|
7
|
+
*
|
|
8
|
+
* Counts non-blank, non-comment lines. This is deliberately a HEURISTIC (the
|
|
9
|
+
* ponytail-benchmark approach), not a parse: block comments are stripped, then
|
|
10
|
+
* blank lines and line-comment-only lines are dropped. A block-comment marker inside a
|
|
11
|
+
* string literal is treated as a comment — acceptable, because LOC is only ever
|
|
12
|
+
* compared between solutions to the SAME task, where that noise is constant.
|
|
13
|
+
*/
|
|
14
|
+
export function countLoc(content: string): number {
|
|
15
|
+
const withoutBlocks = content.replace(/\/\*[\s\S]*?\*\//g, "");
|
|
16
|
+
|
|
17
|
+
return withoutBlocks
|
|
18
|
+
.split("\n")
|
|
19
|
+
.map((line) => line.trim())
|
|
20
|
+
.filter((line) => line.length > 0 && !line.startsWith("//")).length;
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
/** Total + per-file LOC for a task's editable files. */
|
|
24
|
+
export interface ITaskLoc {
|
|
25
|
+
totalLoc: number;
|
|
26
|
+
perFile: Record<string, number>;
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
/**
|
|
30
|
+
* Sum LOC across a task's editable `files` (resolved under `cwd`; glob patterns
|
|
31
|
+
* are expanded, plain filenames match themselves). Run AFTER a green solution
|
|
32
|
+
* exists, so it measures what the model actually shipped. A pattern that matches
|
|
33
|
+
* nothing contributes 0.
|
|
34
|
+
*/
|
|
35
|
+
export async function countTaskLoc(
|
|
36
|
+
cwd: string,
|
|
37
|
+
patterns: readonly string[]
|
|
38
|
+
): Promise<ITaskLoc> {
|
|
39
|
+
const perFile: Record<string, number> = {};
|
|
40
|
+
|
|
41
|
+
for (const pattern of patterns) {
|
|
42
|
+
const glob = new Bun.Glob(pattern);
|
|
43
|
+
|
|
44
|
+
for await (const rel of glob.scan({ cwd, onlyFiles: true })) {
|
|
45
|
+
if (rel in perFile) {
|
|
46
|
+
continue;
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
perFile[rel] = countLoc(await Bun.file(join(cwd, rel)).text());
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
const totalLoc = Object.values(perFile).reduce((acc, n) => acc + n, 0);
|
|
54
|
+
|
|
55
|
+
return { totalLoc, perFile };
|
|
56
|
+
}
|
package/src/eval/report.ts
CHANGED
|
@@ -144,8 +144,8 @@ function baselineCell(report: IVariantReport, baseline: string | null): string {
|
|
|
144
144
|
* (p < 0.05) from the baseline. */
|
|
145
145
|
export function renderSweepReportMarkdown(report: ISweepReport): string {
|
|
146
146
|
const header =
|
|
147
|
-
"| Variant | Runs | Pass | 95% CI | Cycles | Ms | Quality | vs baseline |\n" +
|
|
148
|
-
"| --- | --- | --- | --- | --- | --- | --- | --- |";
|
|
147
|
+
"| Variant | Runs | Pass | 95% CI | Cycles | Ms | Quality | LOC | vs baseline |\n" +
|
|
148
|
+
"| --- | --- | --- | --- | --- | --- | --- | --- | --- |";
|
|
149
149
|
|
|
150
150
|
const rows = report.variants.map((v) => {
|
|
151
151
|
const ci = `${pct(v.passRateCI[0])}–${pct(v.passRateCI[1])}`;
|
|
@@ -153,7 +153,7 @@ export function renderSweepReportMarkdown(report: ISweepReport): string {
|
|
|
153
153
|
return (
|
|
154
154
|
`| ${v.label} | ${String(v.runs)} | ${pct(v.passRate)} | ${ci} | ` +
|
|
155
155
|
`${v.avgCycles.toFixed(1)} | ${String(Math.round(v.avgMs))} | ` +
|
|
156
|
-
`${v.avgQuality.toFixed(1)} | ${baselineCell(v, report.baseline)} |`
|
|
156
|
+
`${v.avgQuality.toFixed(1)} | ${v.avgLoc.toFixed(1)} | ${baselineCell(v, report.baseline)} |`
|
|
157
157
|
);
|
|
158
158
|
});
|
|
159
159
|
|
package/src/eval/score.ts
CHANGED
|
@@ -20,6 +20,7 @@ export function summarize(records: IRunRecord[]): IVariantSummary[] {
|
|
|
20
20
|
const sum = (select: (r: IRunRecord) => number): number =>
|
|
21
21
|
list.reduce((acc, r) => acc + select(r), 0);
|
|
22
22
|
const scored = list.filter((r) => r.quality !== undefined);
|
|
23
|
+
const sized = list.filter((r) => r.loc !== undefined);
|
|
23
24
|
const failureClasses: Record<string, number> = {};
|
|
24
25
|
|
|
25
26
|
for (const r of list) {
|
|
@@ -41,6 +42,10 @@ export function summarize(records: IRunRecord[]): IVariantSummary[] {
|
|
|
41
42
|
scored.length > 0
|
|
42
43
|
? scored.reduce((acc, r) => acc + (r.quality ?? 0), 0) / scored.length
|
|
43
44
|
: 0,
|
|
45
|
+
avgLoc:
|
|
46
|
+
sized.length > 0
|
|
47
|
+
? sized.reduce((acc, r) => acc + (r.loc ?? 0), 0) / sized.length
|
|
48
|
+
: 0,
|
|
44
49
|
failureClasses,
|
|
45
50
|
});
|
|
46
51
|
}
|
package/src/lib/scope/scope.constants.ts
CHANGED
|
@@ -3,20 +3,28 @@
|
|
|
3
3
|
export const SCRATCH_PREFIX = "scratch/";
|
|
4
4
|
|
|
5
5
|
/**
|
|
6
|
-
* VENDORED, harness-authored files the model must NEVER
|
|
7
|
-
* tested
|
|
8
|
-
*
|
|
9
|
-
*
|
|
10
|
-
*
|
|
11
|
-
*
|
|
12
|
-
*
|
|
13
|
-
*
|
|
14
|
-
*
|
|
15
|
-
*
|
|
6
|
+
* VENDORED, harness-authored files the model must NEVER rewrite — the SPECIFIC
|
|
7
|
+
* tested/generated files the web scaffold ships, NOT whole directories. The guard
|
|
8
|
+
* exists for ONE reason: stop the model from "fixing" the generic SDK files
|
|
9
|
+
* (`use-resource`/`api`/`result`/…), whose strict-TS errors are unfixable and —
|
|
10
|
+
* with eslint-disable + `@ts-*` suppressions banned — trap it in a loop. A type
|
|
11
|
+
* error involving one is always a wrong CALL SITE, never the library.
|
|
12
|
+
*
|
|
13
|
+
* Deliberately scoped to exact files so the model stays FREE to do what the
|
|
14
|
+
* guidance tells it: create its own helpers in `src/lib/<name>.ts` and primitives
|
|
15
|
+
* in `src/components/ui/<x>.tsx` (and edit `src/components/ui/button.tsx`). It is
|
|
16
|
+
* also applied ONLY to web-scaffold sessions (via `IToolContext.vendored`), so a
|
|
17
|
+
* normal repo that happens to have a `src/lib/` is never affected.
|
|
18
|
+
* `src/mocks/handlers.ts` is NOT vendored — the model registers its mock resources there.
|
|
16
19
|
*/
|
|
17
|
-
export const
|
|
18
|
-
"src/lib
|
|
19
|
-
"src/
|
|
20
|
+
export const WEB_VENDORED_PATTERNS = [
|
|
21
|
+
"src/lib/utils.ts",
|
|
22
|
+
"src/lib/result.ts",
|
|
23
|
+
"src/lib/object.ts",
|
|
24
|
+
"src/lib/sort.ts",
|
|
25
|
+
"src/lib/api.ts",
|
|
26
|
+
"src/lib/use-resource.ts",
|
|
27
|
+
"src/lib/use-form.ts",
|
|
20
28
|
"src/mocks/db.ts",
|
|
21
29
|
"src/mocks/browser.ts",
|
|
22
30
|
"**/*.gen.ts",
|
package/src/lib/scope/scope.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { resolve, relative } from "node:path";
|
|
2
|
-
import { SCRATCH_PREFIX
|
|
2
|
+
import { SCRATCH_PREFIX } from "./scope.constants";
|
|
3
3
|
|
|
4
4
|
/**
|
|
5
5
|
* Normalize a model-supplied path against the workspace root, fixing the common
|
|
@@ -27,11 +27,12 @@ export function isInScope(file: string, patterns: string[]): boolean {
|
|
|
27
27
|
return patterns.some((pattern) => new Bun.Glob(pattern).match(file));
|
|
28
28
|
}
|
|
29
29
|
|
|
30
|
-
/** True when `file`
|
|
31
|
-
*
|
|
32
|
-
*
|
|
33
|
-
|
|
34
|
-
|
|
30
|
+
/** True when `file` matches one of `patterns` — the VENDORED, harness-authored
|
|
31
|
+
* files the model must not rewrite. `patterns` is supplied per-session
|
|
32
|
+
* (`IToolContext.vendored`), so it is empty (⇒ always false) outside a web
|
|
33
|
+
* scaffold. Expects the workspace-relative form (`normalizeWorkspacePath` first). */
|
|
34
|
+
export function isVendored(file: string, patterns: readonly string[]): boolean {
|
|
35
|
+
return patterns.some((pattern) => new Bun.Glob(pattern).match(file));
|
|
35
36
|
}
|
|
36
37
|
|
|
37
38
|
/** A file the model may write: its editable scope, OR a throwaway scratch file.
|
package/src/loop/session.ts
CHANGED
|
@@ -18,6 +18,7 @@ import {
|
|
|
18
18
|
} from "../agent";
|
|
19
19
|
import { flags } from "../config";
|
|
20
20
|
import { readFiles } from "../lib/fs";
|
|
21
|
+
import { WEB_VENDORED_PATTERNS } from "../lib/scope";
|
|
21
22
|
import { validate, type ErrorParser } from "../validate";
|
|
22
23
|
import { detectStack } from "../stack-detection";
|
|
23
24
|
import {
|
|
@@ -521,9 +522,11 @@ export class Session {
|
|
|
521
522
|
report({ kind: "tool", task: SESSION_ID, message });
|
|
522
523
|
});
|
|
523
524
|
|
|
525
|
+
const isWebScaffold = cfg.scaffoldWeb === true || cfg.scaffoldUi === true;
|
|
524
526
|
const ctx: ILoopCtx = {
|
|
525
527
|
task,
|
|
526
528
|
cwd: cfg.cwd,
|
|
529
|
+
...(isWebScaffold ? { vendored: WEB_VENDORED_PATTERNS } : {}),
|
|
527
530
|
tsService: await buildTsService(cfg.cwd),
|
|
528
531
|
...(cfg.lintFile === undefined ? {} : { lintFile: cfg.lintFile }),
|
|
529
532
|
parse: cfg.parse,
|
package/src/loop/tools/file-ops.ts
CHANGED
|
@@ -206,7 +206,7 @@ export async function doEdit(
|
|
|
206
206
|
|
|
207
207
|
edit.file = normalizeWorkspacePath(ctx.cwd, edit.file);
|
|
208
208
|
|
|
209
|
-
if (isVendored(edit.file)) {
|
|
209
|
+
if (isVendored(edit.file, ctx.vendored ?? [])) {
|
|
210
210
|
return reject(
|
|
211
211
|
ctx,
|
|
212
212
|
"edit:vendored",
|
|
@@ -312,7 +312,7 @@ export async function doCreate(
|
|
|
312
312
|
|
|
313
313
|
create.file = normalizeWorkspacePath(ctx.cwd, create.file);
|
|
314
314
|
|
|
315
|
-
if (isVendored(create.file)) {
|
|
315
|
+
if (isVendored(create.file, ctx.vendored ?? [])) {
|
|
316
316
|
return reject(
|
|
317
317
|
ctx,
|
|
318
318
|
"create:vendored",
|
package/src/loop/tools/tool-context.ts
CHANGED
|
@@ -8,6 +8,10 @@ export interface IToolContext {
|
|
|
8
8
|
cwd: string;
|
|
9
9
|
/** Editable scope — `edit`/`create` outside it are rejected. */
|
|
10
10
|
files: string[];
|
|
11
|
+
/** VENDORED file globs the model must not rewrite (the web scaffold's shipped
|
|
12
|
+
* SDK/generated files). Set only for web-scaffold sessions; absent/empty ⇒ the
|
|
13
|
+
* vendored guard is inert (non-web builds and normal repos are unaffected). */
|
|
14
|
+
vendored?: readonly string[];
|
|
11
15
|
report: Reporter;
|
|
12
16
|
task: string;
|
|
13
17
|
/** In-process TypeScript LanguageService — backs the semantic tools
|
package/src/loop/turn.ts
CHANGED
|
@@ -115,6 +115,9 @@ export interface ILoopCtx {
|
|
|
115
115
|
/** Wired by the interactive CLI: turn this workspace into a web project (the
|
|
116
116
|
* `scaffold_web` tool calls it). Threaded into the tool context. */
|
|
117
117
|
setupWeb?: (framework: string) => Promise<void>;
|
|
118
|
+
/** VENDORED file globs the model must not rewrite (web-scaffold sessions only).
|
|
119
|
+
* Threaded into the tool context; absent ⇒ the vendored guard is inert. */
|
|
120
|
+
vendored?: readonly string[];
|
|
118
121
|
/** PLAN MODE (set via Session.setPlanMode): threaded into the tool context so
|
|
119
122
|
* mutating tools are rejected at dispatch — the model only plans. */
|
|
120
123
|
readOnly?: boolean;
|
|
@@ -462,6 +465,7 @@ export async function runToolCalls(
|
|
|
462
465
|
tsService: ctx.tsService,
|
|
463
466
|
...(ctx.signal === undefined ? {} : { signal: ctx.signal }),
|
|
464
467
|
...(ctx.setupWeb === undefined ? {} : { setupWeb: ctx.setupWeb }),
|
|
468
|
+
...(ctx.vendored === undefined ? {} : { vendored: ctx.vendored }),
|
|
465
469
|
...(ctx.readOnly === undefined ? {} : { readOnly: ctx.readOnly }),
|
|
466
470
|
...(ctx.mcpRegistry === undefined
|
|
467
471
|
? {}
|
package/src/web-templates.ts
CHANGED
|
@@ -317,6 +317,12 @@ export const Route = createRootRoute({
|
|
|
317
317
|
});
|
|
318
318
|
`;
|
|
319
319
|
|
|
320
|
+
// The placeholder home carries `data-tsforge-stub` (the SAME sentinel scaffold_routes
|
|
321
|
+
// stubs use) so the gate's stub-check FAILS until the model replaces it with the real
|
|
322
|
+
// home. Without this, an unbuilt app — just the scaffold + maybe some types — passes
|
|
323
|
+
// the gate (vite builds, this page renders non-blank, no scaffold_routes stubs to
|
|
324
|
+
// catch) and is falsely declared "done". The model removes the marker when it builds
|
|
325
|
+
// the real home.
|
|
320
326
|
const INDEX_ROUTE_TSX = `import { createFileRoute } from "@tanstack/react-router";
|
|
321
327
|
|
|
322
328
|
import { Button } from "@/components/ui/button";
|
|
@@ -327,7 +333,7 @@ export const Route = createFileRoute("/")({
|
|
|
327
333
|
|
|
328
334
|
function Home() {
|
|
329
335
|
return (
|
|
330
|
-
<main className="flex min-h-screen flex-col items-center justify-center gap-6 bg-background text-foreground">
|
|
336
|
+
<main data-tsforge-stub className="flex min-h-screen flex-col items-center justify-center gap-6 bg-background text-foreground">
|
|
331
337
|
<h1 className="text-3xl font-bold">app</h1>
|
|
332
338
|
<Button>Get started</Button>
|
|
333
339
|
</main>
|