@agjs/tsforge 0.1.13 → 0.1.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,389 @@
1
+ // Eval sweep: run a seed spec N times across temperature + feature flag variants, score, tabulate.
2
+ // Run: TSFORGE_SEED=money TSFORGE_TEMPS=0,0.5 TSFORGE_REPEATS=3 bun run packages/core/scripts/sweep.ts
3
+ // A/B feature variants:
4
+ // TSFORGE_FEATURE_VARIANTS=ttsr,hashline (sweep across feature toggles)
5
+ // Each variant is dim=on|off (e.g. ttsr=on×hashline=off) creating a cartesian product.
6
+ import { mkdir, readdir, rm, stat } from "node:fs/promises";
7
+ import { join } from "node:path";
8
+ import { parseSpec } from "../src/spec";
9
+ import { buildGate, prettierWriteCommand } from "../src/detect-gate";
10
+ import { runSpec, qualityRepair } from "../src/loop";
11
+ import { modelAgent } from "../src/agent";
12
+ import { OpenAICompatibleProvider } from "../src/inference";
13
+ import { resolveActiveModel, resolveApiKey } from "../src/models-config";
14
+ import { summarize, type IRunRecord } from "../src/eval";
15
+ import { renderEvent } from "../src/render";
16
+ import type { ILoopEvent } from "../src/loop";
17
+
18
/**
 * Parse one numeric setting, failing fast on anything that is not a finite
 * number. `Number("")` is 0 and `Number("abc")` is NaN, so without this check a
 * typo silently turns into "temperature 0" or "zero repeats".
 *
 * @param name Env var name, used only in the error message.
 * @param raw Raw text to parse (surrounding whitespace is ignored).
 * @returns The parsed finite number.
 * @throws Error when `raw` is blank or not a finite number.
 */
function parseEnvNumber(name: string, raw: string): number {
  const text = raw.trim();
  const value = Number(text);

  if (text.length === 0 || !Number.isFinite(value)) {
    throw new Error(`${name} must be a finite number, got "${raw}"`);
  }

  return value;
}

/**
 * Parse a comma-separated list of numbers. Blank entries (e.g. from a trailing
 * comma) are ignored instead of being read as 0.
 *
 * @param name Env var name, used only in error messages.
 * @param raw Raw comma-separated text.
 * @returns The parsed numbers, in input order.
 * @throws Error when an entry is not a finite number or no entry remains.
 */
function parseEnvNumberList(name: string, raw: string): number[] {
  const values = raw
    .split(",")
    .map((token) => token.trim())
    .filter((token) => token.length > 0)
    .map((token) => parseEnvNumber(name, token));

  if (values.length === 0) {
    throw new Error(`${name} must list at least one number, got "${raw}"`);
  }

  return values;
}

// Name of the seed spec to run (resolved to a directory under evals/ below).
const seed = process.env.TSFORGE_SEED ?? "todo";
// Sampling temperatures to sweep.
const temps = parseEnvNumberList(
  "TSFORGE_TEMPS",
  process.env.TSFORGE_TEMPS ?? "0,0.5"
);
// Runs per (feature variant, temperature) pair.
const repeats = parseEnvNumber(
  "TSFORGE_REPEATS",
  process.env.TSFORGE_REPEATS ?? "3"
);
// Default quiet (batch). Set TSFORGE_STREAM=1 to watch the model live.
const stream = process.env.TSFORGE_STREAM === "1";
// Quality score the repair loop aims for, and how many attempts it may spend.
const qualityTarget = parseEnvNumber(
  "TSFORGE_QUALITY_TARGET",
  process.env.TSFORGE_QUALITY_TARGET ?? "5"
);
const qualityAttempts = parseEnvNumber(
  "TSFORGE_QUALITY_ATTEMPTS",
  process.env.TSFORGE_QUALITY_ATTEMPTS ?? "2"
);
27
+
28
/**
 * One point in the feature sweep: maps a feature dimension name (e.g. `ttsr`)
 * to its state, `"1"` (on) or `"0"` (off).
 */
type IFeatureVariant = Record<string, string>;

/**
 * Expand `TSFORGE_FEATURE_VARIANTS` (a comma-separated list of feature
 * dimensions) into the cartesian product of on/off states.
 *
 * Example: `ttsr,hashline` yields, in this order,
 * [ttsr=off×hashline=off, ttsr=on×hashline=off, ttsr=off×hashline=on,
 * ttsr=on×hashline=on]. Blank entries are ignored, and a dimension listed more
 * than once counts once — otherwise every repeat would double the sweep with
 * duplicate variants.
 *
 * @returns One variant per on/off combination, or a single empty (baseline)
 *   variant when no dimensions are configured.
 */
function parseFeatureVariants(): IFeatureVariant[] {
  const featureDims = [
    ...new Set(
      (process.env.TSFORGE_FEATURE_VARIANTS ?? "")
        .split(",")
        .map((s) => s.trim())
        .filter((s) => s.length > 0)
    ),
  ];

  if (featureDims.length === 0) {
    return [{}]; // No features to sweep → one baseline variant
  }

  // Cartesian product: each dimension has 2 states, so 2^n combinations.
  const variants: IFeatureVariant[] = [];
  const numVariants = 2 ** featureDims.length;

  for (let i = 0; i < numVariants; i += 1) {
    const variant: IFeatureVariant = {};

    featureDims.forEach((dim, d) => {
      // Bit d of the combination index i is the state of dimension d.
      variant[dim] = ((i >> d) & 1) === 1 ? "1" : "0";
    });

    variants.push(variant);
  }

  return variants;
}
65
+
66
/**
 * Translate a feature variant into the TSFORGE_* env vars that toggle each
 * feature. Dimensions without a known env var are left out of the result; any
 * state other than `"1"` is emitted as `"0"`.
 */
function variantToEnvVars(variant: IFeatureVariant): Record<string, string> {
  const envNameByDim = new Map<string, string>([
    ["ttsr", "TSFORGE_TTSR"],
    ["hashline", "TSFORGE_HASHLINE"],
    ["lsp_write_feedback", "TSFORGE_LSP_WRITE_FEEDBACK"],
  ]);
  const envVars: Record<string, string> = {};

  for (const dim of Object.keys(variant)) {
    const envName = envNameByDim.get(dim);

    if (envName === undefined) {
      continue; // unknown dimension, skip
    }

    envVars[envName] = variant[dim] === "1" ? "1" : "0";
  }

  return envVars;
}
83
+
84
/**
 * Human-readable label for a variant, with dimensions sorted by name so the
 * same variant always yields the same text: e.g. "hashline=off,ttsr=on".
 * A variant with no dimensions is labelled "baseline".
 */
function variantLabel(variant: IFeatureVariant): string {
  const dims = Object.keys(variant).sort((a, b) => a.localeCompare(b));

  if (dims.length === 0) {
    return "baseline";
  }

  const parts: string[] = [];

  for (const dim of dims) {
    const onOff = variant[dim] === "1" ? "on" : "off";

    parts.push(`${dim}=${onOff}`);
  }

  return parts.join(",");
}
92
+
93
// Every on/off feature combination this sweep will run.
const featureVariants = parseFeatureVariants();

// evals/ sits three directory levels above this script's directory.
const evalsRoot = join(import.meta.dir, "..", "..", "..", "evals");

// A local working seed (evals/<seed>) wins when it has a spec file; otherwise
// use the committed corpus (evals/corpus/<seed>), so checked-in seeds run
// without a manual copy step.
const localSeedDir = join(evalsRoot, seed);
const hasLocalSeed = await Bun.file(
  join(localSeedDir, `${seed}.spec.md`)
).exists();
const seedDir = hasLocalSeed ? localSeedDir : join(evalsRoot, "corpus", seed);

// Listed recursively so apps with nested directories (e.g. a React app under
// `src/`) are copied whole; a flat single-directory eval lists the same files.
const seedFiles = await readdir(seedDir, { recursive: true });
105
+
106
// Model resolution mirrors the CLI: an explicit TSFORGE_* env setting wins,
// otherwise the active entry from ~/.tsforge/models.json is used. (An earlier
// version hardcoded the localhost default and ignored the registry, so a sweep
// silently dialed an unreachable endpoint and hung with an empty run.log.)
const { entry: activeModel } = await resolveActiveModel();

// Thinking tokens count against the limit, so leave room for reasoning + code.
const maxTokens = Number(process.env.TSFORGE_MAX_TOKENS ?? "16384");

// Opt-in only: a repetition penalty breaks rare temp-0 loops but DEGRADES
// algorithmic code (it made `money` write unsafe/any code that failed the
// strict gate). Off unless TSFORGE_REPETITION_PENALTY is set.
const rawRepetitionPenalty = process.env.TSFORGE_REPETITION_PENALTY;
const repetitionPenalty =
  rawRepetitionPenalty === undefined ? undefined : Number(rawRepetitionPenalty);

const provider = new OpenAICompatibleProvider({
  baseUrl: activeModel.baseUrl,
  model: activeModel.model,
  apiKey: resolveApiKey(activeModel),
  maxTokens,
  repetitionPenalty,
});

// The judge scores quality. Point it at a flagship via TSFORGE_JUDGE_URL/MODEL
// (+ TSFORGE_JUDGE_KEY) to measure the gap; any setting left unset falls back
// to the active model's value, so by default the model judges itself.
const {
  TSFORGE_JUDGE_URL: judgeUrl,
  TSFORGE_JUDGE_MODEL: judgeModel,
  TSFORGE_JUDGE_KEY: judgeKey,
} = process.env;

const judgeProvider = new OpenAICompatibleProvider({
  baseUrl: judgeUrl ?? activeModel.baseUrl,
  model: judgeModel ?? activeModel.model,
  apiKey: judgeKey ?? resolveApiKey(activeModel),
});
134
+
135
/**
 * Current local time as `YYYYMMDD-HHMMSS`. Fields run from most to least
 * significant, so run directories sort newest-last by name.
 */
function stamp(): string {
  const now = new Date();
  const two = (value: number): string => String(value).padStart(2, "0");

  const datePart = [
    String(now.getFullYear()),
    two(now.getMonth() + 1), // getMonth() is zero-based
    two(now.getDate()),
  ].join("");
  const timePart = [
    two(now.getHours()),
    two(now.getMinutes()),
    two(now.getSeconds()),
  ].join("");

  return `${datePart}-${timePart}`;
}
142
+
143
// One record per attempted run, whether it completed or errored.
const records: IRunRecord[] = [];

// Sweep order: feature variant → temperature → repeat.
for (const variant of featureVariants) {
  const vLabel = variantLabel(variant);
  const variantEnv = variantToEnvVars(variant);

  for (const temp of temps) {
    for (let repeatIndex = 0; repeatIndex < repeats; repeatIndex += 1) {
      const runNumber = repeatIndex + 1;
      const runId = `${seed}-${vLabel}-t${temp}-${stamp()}-${runNumber}`;
      const runDir = join(evalsRoot, "runs", runId);

      // A single failing run (e.g. a request that times out) must not take the
      // whole sweep down: log it as a failed record and move on, so a long
      // batch is resilient.
      try {
        await runOne(runId, runDir, temp, repeatIndex, variantEnv);
      } catch (err) {
        let message: string;

        if (err instanceof Error) {
          message = err.message;
        } else {
          message = String(err);
        }

        records.push({
          label: `${vLabel} temp=${temp}`,
          passed: false,
          cycles: 0,
          ms: 0,
        });
        process.stdout.write(
          ` ${seed} ${vLabel} temp=${temp} #${runNumber}: ERRORED (${message}) → ${runId}\n`
        );
      }
    }
  }
}
174
+
175
/**
 * Apply a variant's env vars to `process.env` and return a function that puts
 * the environment back exactly as it was.
 *
 * @param variant Env var name → value to set.
 * @returns A restore function: variables that had a value get it back, and
 *   variables that did not exist before are removed again.
 */
function setVariantEnv(variant: Record<string, string>): () => void {
  const saved = new Map<string, string | undefined>();

  for (const [key, value] of Object.entries(variant)) {
    saved.set(key, process.env[key]);
    process.env[key] = value;
  }

  return () => {
    for (const [key, previous] of saved) {
      if (previous === undefined) {
        // The variable did not exist before we set it, so it must be removed:
        // skipping it would leave the variant's value in place for whatever
        // runs next.
        Reflect.deleteProperty(process.env, key);
      } else {
        process.env[key] = previous;
      }
    }
  };
}
196
+
197
/**
 * Create the run directory and copy every seed file into it at the same
 * relative path. Directory entries from the seed listing are skipped; only
 * non-directory entries are copied.
 */
async function setupRunDir(dir: string): Promise<void> {
  await mkdir(dir, { recursive: true });

  for (const relativePath of seedFiles) {
    const from = join(seedDir, relativePath);
    const info = await stat(from);

    if (!info.isDirectory()) {
      await Bun.write(join(dir, relativePath), Bun.file(from));
    }
  }
}
211
+
212
/**
 * Delete every file listed by the spec's tasks from the run directory, unless
 * the spec's mode is "existing", in which case the copied files are kept.
 * Files that are already absent are ignored (`force: true`).
 */
async function startRed(
  dir: string,
  spec: ReturnType<typeof parseSpec>
): Promise<void> {
  if (spec.mode === "existing") {
    return;
  }

  for (const { files } of spec.tasks) {
    for (const taskFile of files) {
      await rm(join(dir, taskFile), { force: true });
    }
  }
}
225
+
226
/**
 * Execute a single sweep run end to end: apply the variant's env vars, copy the
 * seed into `runDir`, run the spec behind the strict gate, optionally run the
 * quality-repair loop, then write `run.log` + `result.json` and append one
 * entry to `records`.
 *
 * @param runId Unique run identifier (recorded in result.json and the console line).
 * @param runDir Directory the run works in; created if missing.
 * @param temp Sampling temperature for this run.
 * @param i Zero-based repeat index (printed as `#i+1`).
 * @param variantEnv TSFORGE_* env vars describing the feature variant; they are
 *   set for the duration of the run and restored afterwards.
 * @throws Whatever setup or `runSpec` throws — the caller records that as a
 *   failed run. A failing judge does not throw (quality is left unknown).
 */
async function runOne(
  runId: string,
  runDir: string,
  temp: number,
  i: number,
  variantEnv: Record<string, string> = {}
): Promise<void> {
  const restore = setVariantEnv(variantEnv);

  try {
    await setupRunDir(runDir);

    // Read the spec once, before the model touches the run directory, and use
    // the same text for parsing and as the judge's criteria.
    const specText = await Bun.file(join(runDir, `${seed}.spec.md`)).text();
    const spec = parseSpec(specText);

    await startRed(runDir, spec);

    // Apply tsforge's STRICT FLOOR (bundled tsc-strict + eslint) to the eval
    // gate — the SAME gate the interactive CLI builds. Eval mode otherwise
    // trusts the spec's `accept` verbatim, so an error the tests don't execute
    // (an unguarded index access, an `as any`) slipped through as GREEN. Now
    // every task and the whole-spec verify must clear the strict floor BEFORE
    // its functional tests count.
    // prettier --write FIRST (auto-format), then tsc-strict + eslint. The model
    // never hand-formats, but the gate still enforces type-safety + idioms.
    const strictGate = `${prettierWriteCommand()} && ${(await buildGate(runDir)).command}`;
    const gatedSpec = {
      ...spec,
      tasks: spec.tasks.map((t) => ({
        ...t,
        accept: `${strictGate} && ${t.accept}`,
      })),
      verify:
        spec.verify.length > 0 ? `${strictGate} && ${spec.verify}` : strictGate,
    };

    // Cap reasoning per call to trim turn time — A/B the sweet spot via env.
    // Shared by the repair agent and the spec run so both use the same budget.
    const thinkingOptions =
      process.env.TSFORGE_THINKING_BUDGET === undefined
        ? {}
        : { thinkingTokenBudget: Number(process.env.TSFORGE_THINKING_BUDGET) };

    // Every run gets a full transcript at <runDir>/run.log; stream to the
    // terminal too when TSFORGE_STREAM=1.
    const log = Bun.file(join(runDir, "run.log")).writer();

    const onEvent = (e: ILoopEvent): void => {
      void log.write(renderEvent(e, { color: false }));
      // Flush per event — otherwise Bun's FileSink buffers and `tail -f` shows
      // nothing until the run ends. The log must be live.
      void log.flush();

      if (stream) {
        process.stdout.write(renderEvent(e, { color: true }));
      }
    };

    const agent = modelAgent(provider, {
      temperature: temp,
      ...thinkingOptions,
    });

    let result: Awaited<ReturnType<typeof runSpec>>;
    let ms: number;
    let quality: number | undefined;
    let judgeNotes = "";

    // try/finally so the transcript is closed even when the run throws.
    try {
      const started = performance.now();

      result = await runSpec(gatedSpec, runDir, provider, {
        onEvent,
        temperature: temp,
        ...thinkingOptions,
      });
      // Timing covers the implementation run only, not the quality loop.
      ms = Math.round(performance.now() - started);

      // Once green, drive QUALITY up: judge → improve-per-critique → re-judge.
      const firstTask = spec.tasks[0];

      if (result.status === "done" && firstTask !== undefined) {
        // The judge is a MEASUREMENT, not part of the build. If it fails (e.g.
        // the server times out), the implement result still stands — degrade
        // to "quality unknown" rather than erroring out a successful run.
        try {
          const qr = await qualityRepair(
            firstTask,
            runDir,
            agent,
            judgeProvider,
            { goal: spec.title, criteria: specText },
            { target: qualityTarget, maxAttempts: qualityAttempts, onEvent }
          );

          quality = qr.quality;
          judgeNotes = qr.notes;
        } catch (err) {
          judgeNotes = `judge unavailable: ${err instanceof Error ? err.message : String(err)}`;
        }
      }
    } finally {
      await log.end();
    }

    const cycles = result.results.reduce((acc, r) => acc + r.cycles, 0);
    const passed = result.status === "done";

    // Structured per-run artifact for comparison alongside run.log + the code.
    // Include the feature variant so analysis can reconstruct the conditions.
    await Bun.write(
      join(runDir, "result.json"),
      JSON.stringify(
        {
          seed,
          runId,
          temperature: temp,
          features: variantEnv,
          status: result.status,
          cycles,
          ms,
          quality,
          judgeNotes,
          tasks: result.results,
        },
        null,
        2
      )
    );

    const edits = result.results.reduce((a, r) => a + (r.edits ?? 0), 0);
    const regressions = result.results.reduce(
      (a, r) => a + (r.regressions ?? 0),
      0
    );

    // Label by feature name (ttsr=on), not env var name (TSFORGE_TTSR=on), so
    // this record groups with the ones the sweep loop writes for errored runs.
    const vLabel = featureLabelFromEnv(variantEnv);

    records.push({
      label: `${vLabel} temp=${temp}`,
      passed,
      cycles,
      ms,
      quality,
    });
    process.stdout.write(
      ` ${seed} ${vLabel} temp=${temp} #${i + 1}: ${passed ? "done" : "blocked"} (${cycles} cyc, ${edits} edits, ${regressions} regress, ${ms}ms${quality === undefined ? "" : `, Q${quality}/5`}) → ${runId}\n`
    );
  } finally {
    restore();
  }
}

/**
 * Build the variant label from TSFORGE_* env var names by turning each name
 * back into its feature dimension (`TSFORGE_HASHLINE` → `hashline`).
 */
function featureLabelFromEnv(variantEnv: Record<string, string>): string {
  const prefix = "TSFORGE_";
  const features: Record<string, string> = {};

  for (const [name, state] of Object.entries(variantEnv)) {
    const dim = name.startsWith(prefix)
      ? name.slice(prefix.length).toLowerCase()
      : name;

    features[dim] = state;
  }

  return variantLabel(features);
}
372
+
373
// Aggregate the per-run records into one summary row per label.
const summaries = summarize(records);

process.stdout.write(`\n=== sweep: ${seed} (${repeats} runs/variant) ===\n`);

// Pad every label to the longest one (never narrower than 10) so the columns
// line up; variant labels such as "hashline=off,ttsr=on temp=0.5" are far
// longer than a fixed width of 10.
const labelWidth = summaries.reduce(
  (width, s) => Math.max(width, s.label.length),
  10
);

for (const s of summaries) {
  process.stdout.write(
    `${s.label.padEnd(labelWidth)} pass ${Math.round(s.passRate * 100)}% (${s.passed}/${s.runs}) Q ${s.avgQuality.toFixed(1)}/5 avg ${s.avgCycles.toFixed(1)} cyc ${Math.round(s.avgMs)}ms\n`
  );
}

// Persist the whole sweep (inputs, raw records, summaries) next to the run dirs.
const outPath = join(evalsRoot, "runs", `sweep-${seed}-${stamp()}.json`);

await Bun.write(
  outPath,
  JSON.stringify({ seed, temps, repeats, records, summaries }, null, 2)
);
process.stdout.write(`\nsaved ${outPath}\n`);
package/src/cli.ts CHANGED
@@ -442,12 +442,14 @@ function makeSpinner(): {
442
442
  clear: () => void;
443
443
  stop: () => void;
444
444
  setLabel: (label: string) => void;
445
+ onTick: (cb: () => void) => void;
445
446
  } {
446
447
  let timer: ReturnType<typeof setInterval> | null = null;
447
448
  let startedAt = 0;
448
449
  let frame = 0;
449
450
  let drawn = false;
450
451
  let label = "thinking";
452
+ let onTickCb: (() => void) | null = null;
451
453
 
452
454
  const clear = (): void => {
453
455
  if (drawn) {
@@ -464,6 +466,7 @@ function makeSpinner(): {
464
466
  `${ERASE_LINE} ${STYLE.dim}${SPINNER_FRAMES[frame] ?? ""} ${label} · ${secs}s${RESET}`
465
467
  );
466
468
  drawn = true;
469
+ onTickCb?.(); // repaint the pinned status bar with live tok/s / context
467
470
  };
468
471
 
469
472
  return {
@@ -488,6 +491,9 @@ function makeSpinner(): {
488
491
  setLabel: (l: string): void => {
489
492
  label = l;
490
493
  },
494
+ onTick: (cb: () => void): void => {
495
+ onTickCb = cb;
496
+ },
491
497
  };
492
498
  }
493
499
 
@@ -964,6 +970,7 @@ async function repl(args: ICliArgs): Promise<number> {
964
970
  active = new AbortController();
965
971
  const started = performance.now();
966
972
 
973
+ lastStatus = "working"; // reflected live on the bar (● working) during the turn
967
974
  spinner.start();
968
975
 
969
976
  try {
@@ -1239,6 +1246,14 @@ async function repl(args: ICliArgs): Promise<number> {
1239
1246
  // inactive and `prompt()` falls back to the inline status line (pipes, --log).
1240
1247
  const statusBar = new StatusBar(process.stdout, true, true);
1241
1248
 
1249
+ // Repaint the bar on every spinner tick so tok/s and the context meter update
1250
+ // live mid-turn (both read live session state), not just at turn boundaries.
1251
+ spinner.onTick(() => {
1252
+ if (statusBar.active) {
1253
+ statusBar.update(statusInfo());
1254
+ }
1255
+ });
1256
+
1242
1257
  process.stdout.on("resize", () => {
1243
1258
  statusBar.resize(statusInfo());
1244
1259
  });