npm - @agjs/tsforge - Versions diffs - 0.1.14 → 0.1.16 - Mend

@agjs/tsforge 0.1.14 → 0.1.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

package/package.json +2 -1
package/scripts/analyze-malformed.ts +264 -0
package/scripts/analyze-runs.ts +279 -0
package/scripts/benchmark-catalog.ts +387 -0
package/scripts/browser-check.ts +87 -0
package/scripts/build-rule-docs.ts +122 -0
package/scripts/build-rules-md.ts +129 -0
package/scripts/cli-metrics.ts +203 -0
package/scripts/coverage-check.ts +33 -0
package/scripts/edit-benchmark.ts +314 -0
package/scripts/eval-create.ts +48 -0
package/scripts/eval-spec.ts +47 -0
package/scripts/eval-sum.ts +79 -0
package/scripts/gen-tests.ts +140 -0
package/scripts/headless-build.ts +292 -0
package/scripts/interactive-eval.ts +172 -0
package/scripts/rejudge.ts +135 -0
package/scripts/run-eval-todo.ts +59 -0
package/scripts/smoke.ts +18 -0
package/scripts/stub-check.ts +44 -0
package/scripts/sweep-report.ts +76 -0
package/scripts/sweep.ts +389 -0
package/src/cli.ts +39 -1
package/src/inference/inference.types.ts +20 -0
package/src/inference/openai-compatible.ts +11 -34
package/src/inference/request.ts +148 -0
package/src/models-config.ts +13 -0

package/scripts/interactive-eval.ts ADDED Viewed

@@ -0,0 +1,172 @@
+#!/usr/bin/env bun
+// INTERACTIVE-PATH eval: drives Session.send() exactly the way the REPL does —
+// agent-decides scaffold_web, plan mode, the malformed-call retry — the paths
+// headless-build (which pre-scaffolds and calls buildStaged) never exercises.
+// This net exists because a missing `scaffoldWeb: true` config flag killed the
+// agent-decides path for weeks and no eval noticed.
+//
+//   bun packages/core/scripts/interactive-eval.ts                       # default todo-app prompt
+//   bun packages/core/scripts/interactive-eval.ts "build a notes app"   # custom prompt
+//   ... --plan    exercise plan mode (plan → approve → implement)
+//   ... --force   forced-tools arm (tool_choice required + yield_status)
+//
+// Each run gets evals/runs/<timestamp>-interactive[-flags]/ with agent.log,
+// the JSONL event log, and a verdict.json for the analyzer.
+import { appendFileSync, mkdirSync, writeFileSync } from "node:fs";
+import { join, resolve } from "node:path";
+import {
+  Session,
+  PLAN_APPROVED_NOTE,
+  LOOP_LIMITS,
+  type ISendResult,
+} from "../src/loop";
+import { renderEvent, type Reporter } from "../src";
+import {
+  buildGate,
+  buildWebGate,
+  buildWebFix,
+  buildWebTscCheck,
+  scaffoldWeb,
+  installWebDeps,
+  webGuidance,
+} from "../src/detect-gate";
+import { resolveActiveModel } from "../src/models-config";
+import { OpenAICompatibleProvider } from "../src/inference";
+import { providerConfig } from "../src/cli";
+/**
+ * Outcome summary of one interactive eval run. It is filled in while the run
+ * progresses and serialized to `verdict.json` in the run directory.
+ */
+interface IVerdict {
+  // `status` of the final Session.send() result; starts as "unknown".
+  status: string;
+  // `turns` of the final Session.send() result.
+  turns: number;
+  // Set to true once the setup-web callback has run to completion.
+  scaffolded: boolean;
+  // Whether the run was started with `--plan`.
+  planUsed: boolean;
+  // Whether the run was started with `--force`.
+  forceTools: boolean;
+  // Count of "tool" events whose message contains "malformed tool-call text".
+  malformedNudges: number;
+  // Count of "tool" events whose message contains both "recovered" and "malformed".
+  salvaged: number;
+  // Count of "tool" events whose message starts with "tool_rejected:".
+  toolRejections: number;
+}
+/**
+ * Build the reporter used for one eval run.
+ *
+ * For every event the returned callback:
+ *  1. prints the colored rendering to stdout,
+ *  2. appends the plain rendering to `agentLog`,
+ *  3. appends one JSON line (`t` = Date.now(), plus the event fields) to `logFile`,
+ *  4. for "tool" events, bumps the matching counters on `verdict` in place.
+ *
+ * @param logFile  Path of the JSONL event log.
+ * @param agentLog Path of the plain-text agent log.
+ * @param verdict  Verdict object whose counters are mutated as events arrive.
+ * @returns The reporter callback.
+ */
+function makeReporter(
+  logFile: string,
+  agentLog: string,
+  verdict: IVerdict
+): Reporter {
+  return (event) => {
+    const colored = renderEvent(event, { color: true });
+    process.stdout.write(colored);
+    const plain = renderEvent(event, { color: false });
+    appendFileSync(agentLog, plain);
+    const stamped = { t: Date.now(), ...event };
+    appendFileSync(logFile, `${JSON.stringify(stamped)}\n`);
+    // Only tool events carry the fingerprints counted for the verdict
+    // (same markers analyze-malformed reads).
+    if (event.kind !== "tool") {
+      return;
+    }
+    const text = event.message;
+    if (text.includes("malformed tool-call text")) {
+      verdict.malformedNudges += 1;
+    }
+    if (text.startsWith("tool_rejected:")) {
+      verdict.toolRejections += 1;
+    }
+    const isSalvage = ["recovered", "malformed"].every((marker) =>
+      text.includes(marker)
+    );
+    if (isSalvage) {
+      verdict.salvaged += 1;
+    }
+  };
+}
+/**
+ * Run one interactive eval end to end.
+ *
+ * Reads `--plan` / `--force` and an optional prompt (the first argument that
+ * does not start with `--`) from the command line, creates the run directory
+ * `evals/runs/<stamp>-<label>`, drives a Session through the prompt, and
+ * writes `verdict.json` into the run directory.
+ *
+ * @returns Process exit code: 0 only when the final send reported status
+ *          "done" AND the setup-web callback ran; 1 otherwise.
+ */
+async function main(): Promise<number> {
+  const argv = process.argv.slice(2);
+  const plan = argv.includes("--plan");
+  const force = argv.includes("--force");
+  const prompt =
+    argv.find((a) => !a.startsWith("--")) ?? "build a small todo web app";
+  // ISO timestamp made path-friendly: ':' and 'T' become '-', and the
+  // fractional seconds plus trailing 'Z' are dropped.
+  const stamp = new Date()
+    .toISOString()
+    .replace(/[:T]/g, "-")
+    .replace(/\..+$/, "");
+  const label = `interactive${plan ? "-plan" : ""}${force ? "-force" : ""}`;
+  const dir = resolve(join("evals", "runs", `${stamp}-${label}`));
+  mkdirSync(dir, { recursive: true });
+  // Counters start at zero; the reporter mutates this object as events arrive.
+  const verdict: IVerdict = {
+    status: "unknown",
+    turns: 0,
+    scaffolded: false,
+    planUsed: plan,
+    forceTools: force,
+    malformedNudges: 0,
+    salvaged: 0,
+    toolRejections: 0,
+  };
+  const report = makeReporter(
+    join(dir, "events.jsonl"),
+    join(dir, "agent.log"),
+    verdict
+  );
+  const active = await resolveActiveModel();
+  const provider = new OpenAICompatibleProvider(providerConfig(active.entry));
+  const gate = await buildGate(dir);
+  process.stdout.write(
+    `interactive eval → ${dir}\n  model ${active.name} · gate ${gate.label} · ${plan ? "plan-mode" : "direct"}${force ? " · forced-tools" : ""}\n\n`
+  );
+  // The REPL's exact config shape — scaffoldWeb:true is the agent-decides flag.
+  const session = await Session.create({
+    provider,
+    cwd: dir,
+    files: ["**/*"],
+    accept: gate.command,
+    report,
+    scaffoldWeb: true,
+    enableThinking: false,
+    ...(force ? { forceTools: true } : {}),
+  });
+  // The REPL's configureWeb, inlined: scaffold + deps + switch to the web gate.
+  session.setSetupWeb(async (framework) => {
+    // Any framework value other than "vanilla" is treated as "react".
+    const fw = framework === "vanilla" ? "vanilla" : "react";
+    await scaffoldWeb(dir, fw);
+    await installWebDeps(dir);
+    session.setGate(buildWebGate(fw).command);
+    session.setFix(buildWebFix(fw));
+    session.setIncrementalCheck(buildWebTscCheck());
+    session.guide(webGuidance(fw));
+    session.setMaxTurns(LOOP_LIMITS.webMaxTurns);
+    // Recorded last, so it is only true when every step above succeeded.
+    verdict.scaffolded = true;
+  });
+  let result: ISendResult;
+  if (plan) {
+    // Plan arm: one send with plan mode on, then approve and implement.
+    session.setPlanMode(true);
+    result = await session.send(prompt);
+    // Report whether the last message has a "## Plan" heading
+    // (case-insensitive, matched at the start of any line).
+    const planned = session.messages.at(-1)?.content ?? "";
+    process.stdout.write(
+      `\n— plan turn: ${result.status}; ## Plan present: ${String(/^##\s*plan\b/im.test(planned))} —\n`
+    );
+    session.setPlanMode(false);
+    // The verdict reflects this second (implementation) send, not the plan turn.
+    result = await session.send(PLAN_APPROVED_NOTE);
+  } else {
+    result = await session.send(prompt);
+  }
+  verdict.status = result.status;
+  verdict.turns = result.turns;
+  writeFileSync(join(dir, "verdict.json"), JSON.stringify(verdict, null, 2));
+  process.stdout.write(
+    `\nverdict: ${JSON.stringify(verdict)}\n  run dir: ${dir}\n`
+  );
+  return result.status === "done" && verdict.scaffolded ? 0 : 1;
+}
+// Exit with main()'s return value as the process exit code.
+process.exit(await main());

package/scripts/rejudge.ts ADDED Viewed

@@ -0,0 +1,135 @@
+// Re-score EXISTING run outputs with a judge — no re-implementation. Its purpose
+// is to resolve the OKR crux: is a local self-judged Q4 the code's true quality,
+// or just the local model lowballing itself? Point it at a FLAGSHIP judge to find
+// out (offline MEASURE only — never a runtime dependency):
+//
+//   TSFORGE_JUDGE_URL=https://… TSFORGE_JUDGE_MODEL=deepseek-… TSFORGE_JUDGE_KEY=… \
+//     bun run packages/core/scripts/rejudge.ts money 5
+//
+// Without the JUDGE_* env it falls back to the local model (self-judge) and just
+// reproduces the existing scores — so it warns when no flagship judge is set.
+import { readdir } from "node:fs/promises";
+import { join } from "node:path";
+import { parseSpec } from "../src/spec";
+import { judge } from "../src/eval";
+import { OpenAICompatibleProvider, PROVIDER_DEFAULTS } from "../src/inference";
+import { isRecord } from "../src/lib/guards";
+// Directory holding the run folders: `evals`, three levels above this script.
+const evalsRoot = join(import.meta.dir, "..", "..", "..", "evals");
+// Judge-specific settings win; the regular TSFORGE_* settings are the fallback.
+const {
+  TSFORGE_JUDGE_URL: judgeUrl,
+  TSFORGE_JUDGE_MODEL: judgeModel,
+  TSFORGE_JUDGE_KEY: judgeKey,
+  TSFORGE_BASE_URL: localUrl,
+  TSFORGE_MODEL: localModel,
+  TSFORGE_API_KEY: localKey,
+} = process.env;
+// A flagship judge counts as configured when either its URL or its model is set.
+const flagshipSet = judgeUrl !== undefined || judgeModel !== undefined;
+const provider = new OpenAICompatibleProvider({
+  baseUrl: judgeUrl ?? localUrl ?? PROVIDER_DEFAULTS.baseUrl,
+  model: judgeModel ?? localModel ?? PROVIDER_DEFAULTS.model,
+  apiKey: judgeKey ?? localKey,
+});
+/**
+ * Resolve the run directories to re-judge from the command-line arguments.
+ *
+ * Two forms are accepted:
+ *  - `<prefix> <count>`: the last `count` directories (sorted by name) under
+ *    the evals root whose name starts with `prefix`;
+ *  - any other argument list: each argument is a run directory, taken as-is
+ *    when it starts with "/" and otherwise joined onto the evals root.
+ *
+ * @returns Paths of the run directories, in ascending name order for the
+ *          `<prefix> <count>` form and in argument order otherwise.
+ */
+async function resolveDirs(): Promise<string[]> {
+  const args = process.argv.slice(2);
+  const [prefix = "", countArg = ""] = args;
+  if (args.length === 2 && /^\d+$/.test(countArg)) {
+    const count = Number(countArg);
+    if (count === 0) {
+      // slice(-0) is slice(0) and would return EVERY matching run; asking for
+      // zero runs must yield none.
+      return [];
+    }
+    const all = await readdir(evalsRoot, { withFileTypes: true });
+    const dirs = all
+      .filter((d) => d.isDirectory() && d.name.startsWith(prefix))
+      .map((d) => d.name)
+      .sort();
+    return dirs.slice(-count).map((name) => join(evalsRoot, name));
+  }
+  return args.map((a) => (a.startsWith("/") ? a : join(evalsRoot, a)));
+}
+/**
+ * Read the locally recorded (self-judge) score for a run.
+ *
+ * @param dir Run directory expected to contain `result.json`.
+ * @returns The numeric `quality` field of `result.json`, or `undefined` when
+ *          the file is missing, is not a record, or has no numeric `quality`.
+ */
+async function localScore(dir: string): Promise<number | undefined> {
+  const resultFile = Bun.file(join(dir, "result.json"));
+  const present = await resultFile.exists();
+  if (!present) {
+    return undefined;
+  }
+  const parsed: unknown = JSON.parse(await resultFile.text());
+  if (!isRecord(parsed)) {
+    return undefined;
+  }
+  const { quality } = parsed;
+  if (typeof quality === "number") {
+    return quality;
+  }
+  return undefined;
+}
+/**
+ * Find the spec file of a run directory.
+ *
+ * @param dir Run directory to list.
+ * @returns Name of the first entry ending in `.spec.md`, or `undefined` when
+ *          the directory has none.
+ */
+async function specIn(dir: string): Promise<string | undefined> {
+  for (const entry of await readdir(dir)) {
+    if (entry.endsWith(".spec.md")) {
+      return entry;
+    }
+  }
+  return undefined;
+}
+// Re-judge every resolved run and print one table row per run. Runs with no
+// spec file, no task, or none of the task's files on disk are skipped.
+const dirs = await resolveDirs();
+if (!flagshipSet) {
+  process.stdout.write(
+    "⚠ No TSFORGE_JUDGE_URL/MODEL set — judging with the LOCAL model (self-judge). " +
+      "Set a flagship judge to measure true quality.\n"
+  );
+}
+process.stdout.write(
+  `\n=== re-judge (${dirs.length} runs, judge=${flagshipSet ? "flagship" : "LOCAL self-judge"}) ===\n\n`
+);
+process.stdout.write("localQ  judgeOverall  corr  design  read  run\n");
+for (const dir of dirs) {
+  const specName = await specIn(dir);
+  if (specName === undefined) {
+    continue;
+  }
+  // Read the spec once; the same text is both parsed and handed to the judge
+  // as the criteria.
+  const specText = await Bun.file(join(dir, specName)).text();
+  const spec = parseSpec(specText);
+  const task = spec.tasks[0];
+  if (task === undefined) {
+    continue;
+  }
+  // Concatenate the files of the first task that exist, each under a
+  // `// <file>` header.
+  const parts: string[] = [];
+  for (const f of task.files) {
+    const file = Bun.file(join(dir, f));
+    if (await file.exists()) {
+      parts.push(`// ${f}\n${await file.text()}`);
+    }
+  }
+  if (parts.length === 0) {
+    continue;
+  }
+  const score = await judge(provider, {
+    goal: spec.title,
+    criteria: specText,
+    code: parts.join("\n\n"),
+  });
+  const local = await localScore(dir);
+  // Last non-empty path segment, so a trailing "/" on the argument does not
+  // produce an empty run id.
+  const runId =
+    dir
+      .split("/")
+      .filter((segment) => segment !== "")
+      .at(-1) ?? dir;
+  process.stdout.write(
+    [
+      (local === undefined ? "-" : String(local)).padStart(6),
+      String(score.overall).padStart(13),
+      String(score.correctness).padStart(6),
+      String(score.design).padStart(8),
+      String(score.readability).padStart(6),
+      `  ${runId}`,
+    ].join("") + "\n"
+  );
+}

package/scripts/run-eval-todo.ts ADDED Viewed

@@ -0,0 +1,59 @@
+// Drive the live model through the Todo spec into a fresh, uuid'd run folder
+// under /evals (gitignored). Streams to your terminal AND to run.log.
+// Run: bun run packages/core/scripts/run-eval-todo.ts
+import { mkdir } from "node:fs/promises";
+import { join } from "node:path";
+import { parseSpec } from "../src/spec";
+import { runSpec } from "../src/loop";
+import { OpenAICompatibleProvider, PROVIDER_DEFAULTS } from "../src/inference";
+import { renderEvent } from "../src/render";
+const evalsRoot = join(import.meta.dir, "..", "..", "..", "evals");
+const seedDir = join(evalsRoot, "todo");
+// Every run lives in its own folder, kept at evals/<id> depth so the spec's
+// ../../node_modules paths still resolve.
+const runId = `todo-${crypto.randomUUID().slice(0, 8)}`;
+const runDir = join(evalsRoot, runId);
+await mkdir(runDir, { recursive: true });
+// Seed files (spec, tests, constitution) copied into the run folder.
+const seedFiles = [
+  "todo.spec.md",
+  "todo.test.ts",
+  "tsconfig.json",
+  "eslint.config.js",
+];
+for (const name of seedFiles) {
+  const source = Bun.file(join(seedDir, name));
+  await Bun.write(join(runDir, name), source);
+}
+const specText = await Bun.file(join(runDir, "todo.spec.md")).text();
+const spec = parseSpec(specText);
+const baseUrl = process.env.TSFORGE_BASE_URL ?? PROVIDER_DEFAULTS.baseUrl;
+const model = process.env.TSFORGE_MODEL ?? PROVIDER_DEFAULTS.model;
+const provider = new OpenAICompatibleProvider({ baseUrl, model });
+// Tee: colored text goes to the terminal, plain text goes to run.log.
+const log = Bun.file(join(runDir, "run.log")).writer();
+function out(colored: string, plain: string): void {
+  process.stdout.write(colored);
+  void log.write(plain);
+}
+const banner = `run ${runId}\n`;
+out(banner, banner);
+const result = await runSpec(spec, runDir, provider, {
+  onEvent: (e) => {
+    const colored = renderEvent(e, { color: true });
+    const plain = renderEvent(e, { color: false });
+    out(colored, plain);
+  },
+});
+const summary = `\n\nspec "${spec.id}" -> ${result.status}\ntasks: ${JSON.stringify(result.results)}\n`;
+out(summary, summary);
+// Flush and close run.log before pointing the user at the run folder.
+await log.end();
+console.log(`\nFull log + output in: ${runDir}`);

package/scripts/smoke.ts ADDED Viewed

@@ -0,0 +1,18 @@
+// Manual smoke check against a live local model. Not part of the test suite.
+// Run: bun run packages/core/scripts/smoke.ts
+import { OpenAICompatibleProvider, PROVIDER_DEFAULTS } from "../src/inference";
+// Provider settings come from the environment, falling back to the defaults.
+const baseUrl = process.env.TSFORGE_BASE_URL ?? PROVIDER_DEFAULTS.baseUrl;
+const model = process.env.TSFORGE_MODEL ?? PROVIDER_DEFAULTS.model;
+const provider = new OpenAICompatibleProvider({ baseUrl, model });
+// One completion at temperature 0; print the raw content and tool calls.
+const reply = await provider.complete(
+  [{ role: "user", content: "Reply with exactly: pong" }],
+  { temperature: 0 }
+);
+console.log("content:", JSON.stringify(reply.content));
+console.log("toolCalls:", JSON.stringify(reply.toolCalls));

package/scripts/stub-check.ts ADDED Viewed

@@ -0,0 +1,44 @@
+#!/usr/bin/env bun
+// Gate step: fail if any route is STILL an unfilled scaffold_routes stub. The
+// scaffold lays down placeholder route files (marked `data-tsforge-stub`); the
+// model must REPLACE each with the real page. An unfilled stub renders an empty
+// placeholder — which the coverage gate (file exists) and the render smoke (root
+// not blank) both miss — so without this an app of empty routes goes green.
+import { readdir, readFile } from "node:fs/promises";
+import { join } from "node:path";
+// Marker text that identifies an unfilled scaffold route file.
+const MARKER = "data-tsforge-stub";
+// Project root is the first CLI argument, defaulting to the current directory.
+const routesDir = join(process.argv[2] ?? ".", "src", "routes");
+let routeFiles: string[];
+try {
+  const entries = await readdir(routesDir);
+  routeFiles = entries.filter((name) => name.endsWith(".tsx"));
+} catch {
+  // No routes dir (non-web build) → nothing to enforce.
+  process.exit(0);
+}
+// Collect every route file that still contains the stub marker.
+const unfilled: string[] = [];
+for (const name of routeFiles) {
+  const source = await readFile(join(routesDir, name), "utf8");
+  if (source.includes(MARKER)) {
+    unfilled.push(name);
+  }
+}
+if (unfilled.length === 0) {
+  process.stdout.write("stub-check: no unfilled route stubs\n");
+  process.exit(0);
+}
+process.stdout.write(
+  `stub-check: ${String(unfilled.length)} route(s) are still empty scaffold STUBS — ` +
+    `replace each placeholder component with the REAL page (its list/detail/form, ` +
+    `using the SDK + your components). The app is NOT done while these render a ` +
+    `placeholder. Unfilled: ${unfilled.join(", ")}\n`
+);
+process.exit(1);

package/scripts/sweep-report.ts ADDED Viewed

@@ -0,0 +1,76 @@
+// Turn a sweep's `sweep-<seed>-<ts>.json` into a statistical Markdown report:
+// per-variant pass rate with a 95% Wilson interval and, when TSFORGE_BASELINE
+// names a variant, a two-proportion significance test against it.
+//
+// Run:  bun run packages/core/scripts/sweep-report.ts [sweep.json]
+//   (no arg → the newest sweep-*.json under evals/runs)
+//   TSFORGE_BASELINE="ttsr=off,hashline=off temp=0"  # optional baseline label
+import { readdir } from "node:fs/promises";
+import { join } from "node:path";
+import { isRecord } from "../src/lib/guards";
+import {
+  buildSweepReport,
+  renderSweepReportMarkdown,
+  type IRunRecord,
+} from "../src/eval";
+// Directory (relative to the working directory) searched for sweep results.
+const RUNS_DIR = "evals/runs";
+/**
+ * Locate the most recently modified `sweep-*.json` in {@link RUNS_DIR}.
+ *
+ * Files are named `sweep-<seed>-<ts>.json`, so ordering by name alone ranks
+ * the seed before the timestamp and can pick an older sweep when seeds
+ * differ. The choice is therefore made by modification time, with the name
+ * as a tie-break.
+ *
+ * @returns Path of the newest sweep file.
+ * @throws Error when the directory holds no `sweep-*.json` file.
+ */
+async function newestSweep(): Promise<string> {
+  const names = (await readdir(RUNS_DIR)).filter(
+    (n) => n.startsWith("sweep-") && n.endsWith(".json")
+  );
+  let latest: { name: string; modified: number } | undefined;
+  for (const name of names) {
+    const modified = Bun.file(join(RUNS_DIR, name)).lastModified;
+    const isNewer =
+      latest === undefined ||
+      modified > latest.modified ||
+      (modified === latest.modified && name > latest.name);
+    if (isNewer) {
+      latest = { name, modified };
+    }
+  }
+  if (latest === undefined) {
+    throw new Error(`no sweep-*.json found in ${RUNS_DIR}`);
+  }
+  return join(RUNS_DIR, latest.name);
+}
+/**
+ * Extract the well-formed run records from parsed sweep JSON.
+ *
+ * @param value Parsed JSON, expected to be a record with a `records` array.
+ * @returns One entry per element that is a record with a string `label`,
+ *          boolean `passed`, and numeric `cycles` and `ms`; `quality` is
+ *          carried over only when numeric. Anything else is dropped, and an
+ *          unexpected top-level shape yields an empty array.
+ */
+function toRecords(value: unknown): IRunRecord[] {
+  if (!isRecord(value)) {
+    return [];
+  }
+  const { records } = value;
+  if (!Array.isArray(records)) {
+    return [];
+  }
+  const collected: IRunRecord[] = [];
+  for (const candidate of records) {
+    if (!isRecord(candidate)) {
+      continue;
+    }
+    const { label, passed, cycles, ms, quality } = candidate;
+    if (
+      typeof label !== "string" ||
+      typeof passed !== "boolean" ||
+      typeof cycles !== "number" ||
+      typeof ms !== "number"
+    ) {
+      continue;
+    }
+    collected.push(
+      typeof quality === "number"
+        ? { label, passed, cycles, ms, quality }
+        : { label, passed, cycles, ms }
+    );
+  }
+  return collected;
+}
+// Build the Markdown report for one sweep file and write it next to the input.
+const fileArg = process.argv[2];
+const baseline = process.env.TSFORGE_BASELINE;
+// An explicit path wins; otherwise fall back to the newest sweep file.
+const path = fileArg ?? (await newestSweep());
+const parsed: unknown = JSON.parse(await Bun.file(path).text());
+const records = toRecords(parsed);
+if (records.length === 0) {
+  process.stderr.write(`no run records in ${path}\n`);
+  process.exit(1);
+}
+const report = buildSweepReport(records, baseline);
+const markdown = renderSweepReportMarkdown(report);
+// Swap a trailing ".json" for ".report.md". When the input has no ".json"
+// suffix, append instead, so the report can never overwrite the input file.
+const jsonSuffix = /\.json$/;
+const outPath = jsonSuffix.test(path)
+  ? path.replace(jsonSuffix, ".report.md")
+  : `${path}.report.md`;
+await Bun.write(outPath, `${markdown}\n`);
+process.stdout.write(`${markdown}\n\nwrote ${outPath}\n`);