npm - @agjs/tsforge - Versions diffs - 0.1.14 → 0.1.16 - Mend

@agjs/tsforge 0.1.14 → 0.1.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

package/package.json +2 -1
package/scripts/analyze-malformed.ts +264 -0
package/scripts/analyze-runs.ts +279 -0
package/scripts/benchmark-catalog.ts +387 -0
package/scripts/browser-check.ts +87 -0
package/scripts/build-rule-docs.ts +122 -0
package/scripts/build-rules-md.ts +129 -0
package/scripts/cli-metrics.ts +203 -0
package/scripts/coverage-check.ts +33 -0
package/scripts/edit-benchmark.ts +314 -0
package/scripts/eval-create.ts +48 -0
package/scripts/eval-spec.ts +47 -0
package/scripts/eval-sum.ts +79 -0
package/scripts/gen-tests.ts +140 -0
package/scripts/headless-build.ts +292 -0
package/scripts/interactive-eval.ts +172 -0
package/scripts/rejudge.ts +135 -0
package/scripts/run-eval-todo.ts +59 -0
package/scripts/smoke.ts +18 -0
package/scripts/stub-check.ts +44 -0
package/scripts/sweep-report.ts +76 -0
package/scripts/sweep.ts +389 -0
package/src/cli.ts +39 -1
package/src/inference/inference.types.ts +20 -0
package/src/inference/openai-compatible.ts +11 -34
package/src/inference/request.ts +148 -0
package/src/models-config.ts +13 -0

package/scripts/sweep.ts ADDED Viewed

@@ -0,0 +1,389 @@
+// Eval sweep: run a seed spec N times across temperature + feature flag variants, score, tabulate.
+// Run:  TSFORGE_SEED=money TSFORGE_TEMPS=0,0.5 TSFORGE_REPEATS=3 bun run packages/core/scripts/sweep.ts
+// A/B feature variants:
+//   TSFORGE_FEATURE_VARIANTS=ttsr,hashline (sweep across feature toggles)
+//   Each variant is dim=on|off (e.g. ttsr=on×hashline=off) creating a cartesian product.
+import { mkdir, readdir, rm, stat } from "node:fs/promises";
+import { join } from "node:path";
+import { parseSpec } from "../src/spec";
+import { buildGate, prettierWriteCommand } from "../src/detect-gate";
+import { runSpec, qualityRepair } from "../src/loop";
+import { modelAgent } from "../src/agent";
+import { OpenAICompatibleProvider } from "../src/inference";
+import { resolveActiveModel, resolveApiKey } from "../src/models-config";
+import { summarize, type IRunRecord } from "../src/eval";
+import { renderEvent } from "../src/render";
+import type { ILoopEvent } from "../src/loop";
+// Name of the seed spec to run (resolved to a directory under evals/ below).
+const seed = process.env.TSFORGE_SEED ?? "todo";
+// Temperatures to sweep. Blank entries (e.g. a trailing comma) and non-numeric
+// entries are dropped: `Number("")` is 0 and `Number("abc")` is NaN, so without
+// the filters a typo would silently add a temp-0 pass or hand NaN to the model.
+const defaultTemps = "0,0.5";
+const parseTemps = (raw: string): number[] =>
+  raw
+    .split(",")
+    .map((t) => t.trim())
+    .filter((t) => t.length > 0)
+    .map((t) => Number(t))
+    .filter((t) => Number.isFinite(t));
+const requestedTemps = parseTemps(process.env.TSFORGE_TEMPS ?? defaultTemps);
+// If nothing usable was supplied, fall back to the defaults rather than
+// running an empty sweep.
+const temps =
+  requestedTemps.length > 0 ? requestedTemps : parseTemps(defaultTemps);
+// Number of runs per (feature variant, temperature) pair.
+const repeats = Number(process.env.TSFORGE_REPEATS ?? "3");
+// Default quiet (batch). Set TSFORGE_STREAM=1 to watch the model live.
+const stream = process.env.TSFORGE_STREAM === "1";
+// Target score and attempt cap passed to the quality-repair pass after a green run.
+const qualityTarget = Number(process.env.TSFORGE_QUALITY_TARGET ?? "5");
+const qualityAttempts = Number(process.env.TSFORGE_QUALITY_ATTEMPTS ?? "2");
+/** One point in the feature sweep: dimension name → "1" (on) or "0" (off).
+ *  Example: `ttsr,hashline` → generates [ttsr=on×hashline=on, ttsr=on×hashline=off,
+ *  ttsr=off×hashline=on, ttsr=off×hashline=off]. Each dimension toggles via env var. */
+type IFeatureVariant = Record<string, string>;
+/**
+ * Expand TSFORGE_FEATURE_VARIANTS (a comma-separated list of dimension names)
+ * into the cartesian product of on/off settings.
+ *
+ * @returns One variant per combination (2^N for N distinct dimensions), or a
+ *   single empty variant — the baseline — when no dimensions are listed.
+ */
+function parseFeatureVariants(): IFeatureVariant[] {
+  const requested = (process.env.TSFORGE_FEATURE_VARIANTS ?? "")
+    .split(",")
+    .map((s) => s.trim())
+    .filter((s) => s.length > 0);
+  // De-duplicate: a repeated name (e.g. "ttsr,ttsr") would otherwise count as
+  // two bits and emit every distinct setting more than once.
+  const featureDims = Array.from(new Set(requested));
+  if (featureDims.length === 0) {
+    return [{}]; // No features to sweep → one baseline variant
+  }
+  // Cartesian product: each dimension has 2 states (on=1, off=0).
+  const variants: IFeatureVariant[] = [];
+  const numVariants = 2 ** featureDims.length;
+  for (let i = 0; i < numVariants; i += 1) {
+    const variant: IFeatureVariant = {};
+    featureDims.forEach((dim, d) => {
+      // Bit d of i → dimension d state
+      variant[dim] = ((i >> d) & 1) === 1 ? "1" : "0";
+    });
+    variants.push(variant);
+  }
+  return variants;
+}
+/**
+ * Translate a feature variant into the TSFORGE_* env vars that toggle it.
+ *
+ * @param variant Dimension name → "1"/"0" state.
+ * @returns Env var name → "1" or "0". Dimensions with no known env var are
+ *   left out of the result.
+ */
+function variantToEnvVars(variant: IFeatureVariant): Record<string, string> {
+  // Known sweep dimensions and the env var each one drives.
+  const envNameByDim = new Map<string, string>([
+    ["ttsr", "TSFORGE_TTSR"],
+    ["hashline", "TSFORGE_HASHLINE"],
+    ["lsp_write_feedback", "TSFORGE_LSP_WRITE_FEEDBACK"],
+  ]);
+  const result: Record<string, string> = {};
+  for (const dim of Object.keys(variant)) {
+    const envName = envNameByDim.get(dim);
+    if (envName === undefined) {
+      continue; // unknown dimension, skip
+    }
+    // Anything other than "1" is treated as off.
+    result[envName] = variant[dim] === "1" ? "1" : "0";
+  }
+  return result;
+}
+/**
+ * Human-readable label for a variant, e.g. "hashline=off,ttsr=on", or
+ * "baseline" when the variant has no entries. Entries are sorted by name.
+ *
+ * Accepts either key form: bare dimension names ("ttsr") or the env var names
+ * produced by `variantToEnvVars` ("TSFORGE_TTSR"). A `TSFORGE_` key is reduced
+ * to its lower-cased suffix, so both forms produce the same label and runs of
+ * one variant are grouped together whichever form the caller holds.
+ *
+ * @param variant Name → "1"/"0" state; any state other than "1" reads as off.
+ */
+function variantLabel(variant: IFeatureVariant): string {
+  const envPrefix = "TSFORGE_";
+  const parts = Object.entries(variant)
+    .map(([key, state]): [string, string] => [
+      key.startsWith(envPrefix)
+        ? key.slice(envPrefix.length).toLowerCase()
+        : key,
+      state,
+    ])
+    // Sort after normalising so both key forms order identically.
+    .sort(([a], [b]) => a.localeCompare(b))
+    .map(([dim, state]) => `${dim}=${state === "1" ? "on" : "off"}`);
+  return parts.length > 0 ? parts.join(",") : "baseline";
+}
+// Every on/off combination requested for this sweep (one empty entry = baseline).
+const featureVariants = parseFeatureVariants();
+// The evals/ directory, three levels above this script's directory.
+const evalsRoot = join(import.meta.dir, "..", "..", "..", "evals");
+// A local working seed (evals/<seed>) wins when its spec file exists; otherwise
+// use the committed corpus (evals/corpus/<seed>), so checked-in seeds run
+// without a manual copy step.
+const localSeedDir = join(evalsRoot, seed);
+const hasLocalSpec = await Bun.file(
+  join(localSeedDir, `${seed}.spec.md`)
+).exists();
+const seedDir = hasLocalSpec ? localSeedDir : join(evalsRoot, "corpus", seed);
+// Listed recursively so seeds with nested directories (e.g. a React app under
+// `src/`) are copied whole; a flat seed yields the same list either way.
+const seedFiles = await readdir(seedDir, { recursive: true });
+// Model resolution goes through resolveActiveModel(), the same path the CLI
+// uses: explicit TSFORGE_* env wins, else the active entry from
+// ~/.tsforge/models.json. (An earlier version hardcoded the localhost default
+// and ignored the registry, so a sweep silently dialed an unreachable endpoint
+// and hung with an empty run.log.)
+const { entry: activeModel } = await resolveActiveModel();
+// Opt-in only: a repetition penalty breaks rare temp-0 loops but DEGRADES
+// algorithmic code (it made `money` write unsafe/any code that failed the
+// strict gate). Left undefined unless the env var is set.
+const rawRepetitionPenalty = process.env.TSFORGE_REPETITION_PENALTY;
+const provider = new OpenAICompatibleProvider({
+  baseUrl: activeModel.baseUrl,
+  model: activeModel.model,
+  apiKey: resolveApiKey(activeModel),
+  // Thinking tokens count against the limit, so give reasoning + code room.
+  maxTokens: Number(process.env.TSFORGE_MAX_TOKENS ?? "16384"),
+  repetitionPenalty:
+    rawRepetitionPenalty === undefined
+      ? undefined
+      : Number(rawRepetitionPenalty),
+});
+// The judge scores quality. Point it at a flagship via TSFORGE_JUDGE_URL/MODEL
+// (+ TSFORGE_JUDGE_KEY) to measure the gap; with none set, the active model
+// judges itself.
+// NOTE(review): when TSFORGE_JUDGE_URL is set without TSFORGE_JUDGE_KEY, the
+// active model's API key is sent to the judge endpoint — confirm that is intended.
+const {
+  TSFORGE_JUDGE_URL: judgeUrl,
+  TSFORGE_JUDGE_MODEL: judgeModel,
+  TSFORGE_JUDGE_KEY: judgeKey,
+} = process.env;
+const judgeProvider = new OpenAICompatibleProvider({
+  baseUrl: judgeUrl ?? activeModel.baseUrl,
+  model: judgeModel ?? activeModel.model,
+  apiKey: judgeKey ?? resolveApiKey(activeModel),
+});
+/**
+ * Current local time as `YYYYMMDD-HHMMSS`, so run directories named with it
+ * sort newest-last.
+ */
+function stamp(): string {
+  const now = new Date();
+  const two = (n: number): string => n.toString().padStart(2, "0");
+  const datePart = [
+    now.getFullYear().toString(),
+    two(now.getMonth() + 1), // getMonth() is zero-based
+    two(now.getDate()),
+  ].join("");
+  const timePart = [
+    two(now.getHours()),
+    two(now.getMinutes()),
+    two(now.getSeconds()),
+  ].join("");
+  return `${datePart}-${timePart}`;
+}
+// One record per run, in execution order; summarised after the sweep.
+const records: IRunRecord[] = [];
+// Sweep order: feature variant → temperature → repeat, strictly sequential.
+for (const variant of featureVariants) {
+  const variantEnv = variantToEnvVars(variant);
+  const vLabel = variantLabel(variant);
+  for (const temp of temps) {
+    const label = `${vLabel} temp=${temp}`;
+    for (let n = 1; n <= repeats; n += 1) {
+      const runId = `${seed}-${vLabel}-t${temp}-${stamp()}-${n}`;
+      const runDir = join(evalsRoot, "runs", runId);
+      // A single failing run (e.g. a request timing out) must not abort the
+      // batch: log it as a failed record and move on to the next run.
+      try {
+        await runOne(runId, runDir, temp, n - 1, variantEnv);
+      } catch (err) {
+        const message = err instanceof Error ? err.message : String(err);
+        records.push({ label, passed: false, cycles: 0, ms: 0 });
+        process.stdout.write(
+          `  ${seed} ${vLabel} temp=${temp} #${n}: ERRORED (${message}) → ${runId}\n`
+        );
+      }
+    }
+  }
+}
+/**
+ * Apply a variant's env vars to `process.env`.
+ *
+ * @param variant Env var name → value to set.
+ * @returns A function that puts every touched variable back to its prior
+ *   state: the previous value is reassigned, and a variable that was unset
+ *   beforehand is removed again.
+ */
+function setVariantEnv(variant: Record<string, string>): () => void {
+  const saved = new Map<string, string | undefined>();
+  for (const [key, value] of Object.entries(variant)) {
+    saved.set(key, process.env[key]);
+    process.env[key] = value;
+  }
+  return () => {
+    for (const [key, previous] of saved) {
+      if (previous === undefined) {
+        // The variable did not exist before, so it must be deleted. Skipping
+        // it would leave the variant's value set for later runs, and
+        // assigning `undefined` would store the string "undefined".
+        delete process.env[key];
+      } else {
+        process.env[key] = previous;
+      }
+    }
+  };
+}
+/**
+ * Create the run directory and copy every seed file into it.
+ *
+ * @param dir Destination run directory (created recursively if missing).
+ */
+async function setupRunDir(dir: string): Promise<void> {
+  await mkdir(dir, { recursive: true });
+  for (const relPath of seedFiles) {
+    const from = join(seedDir, relPath);
+    const info = await stat(from);
+    // The recursive listing includes directories; only files are copied.
+    if (!info.isDirectory()) {
+      await Bun.write(join(dir, relPath), Bun.file(from));
+    }
+  }
+}
+/**
+ * Delete each task's target files from the run directory, unless the spec's
+ * mode is "existing", in which case nothing is removed.
+ *
+ * @param dir  Run directory the task file paths are resolved against.
+ * @param spec Parsed spec whose `mode` and `tasks[].files` drive the removal.
+ */
+async function startRed(
+  dir: string,
+  spec: ReturnType<typeof parseSpec>
+): Promise<void> {
+  if (spec.mode === "existing") {
+    return;
+  }
+  for (const { files } of spec.tasks) {
+    for (const relPath of files) {
+      // force: a file that is already absent is not an error.
+      await rm(join(dir, relPath), { force: true });
+    }
+  }
+}
+/**
+ * Execute one run of the sweep: prepare the run directory, run the spec under
+ * the strict gate, optionally run the quality-repair pass, then write
+ * `result.json`, append to `records`, and print a one-line outcome.
+ *
+ * The variant's env vars are applied for the duration of the run and restored
+ * afterwards. The `run.log` writer is closed even when the run throws.
+ *
+ * @param runId      Unique id of this run (also recorded in result.json).
+ * @param runDir     Directory the run executes in.
+ * @param temp       Sampling temperature for the agent and the loop.
+ * @param i          Zero-based repeat index (printed one-based).
+ * @param variantEnv Env vars to set for this run; also stored as `features`.
+ */
+async function runOne(
+  runId: string,
+  runDir: string,
+  temp: number,
+  i: number,
+  variantEnv: Record<string, string> = {}
+): Promise<void> {
+  const restore = setVariantEnv(variantEnv);
+  try {
+    await setupRunDir(runDir);
+    const spec = parseSpec(
+      await Bun.file(join(runDir, `${seed}.spec.md`)).text()
+    );
+    await startRed(runDir, spec);
+    // Apply tsforge's STRICT FLOOR (bundled tsc-strict + eslint) to the eval
+    // gate — the SAME gate the interactive CLI builds. Eval mode otherwise
+    // trusts the spec's `accept` verbatim, so an error the tests don't execute
+    // (an unguarded index access, an `as any`) slipped through as GREEN. Now
+    // every task and the whole-spec verify must clear the strict floor BEFORE
+    // its functional tests count.
+    // prettier --write FIRST (auto-format), then tsc-strict + eslint. The model
+    // never hand-formats, but the gate still enforces type-safety + idioms.
+    const strictGate = `${prettierWriteCommand()} && ${(await buildGate(runDir)).command}`;
+    const gatedSpec = {
+      ...spec,
+      tasks: spec.tasks.map((t) => ({
+        ...t,
+        accept: `${strictGate} && ${t.accept}`,
+      })),
+      verify:
+        spec.verify.length > 0 ? `${strictGate} && ${spec.verify}` : strictGate,
+    };
+    // Cap reasoning per call to trim turn time — A/B the sweet spot via env.
+    // Built once and shared by the agent and the loop.
+    const thinkingBudget = process.env.TSFORGE_THINKING_BUDGET;
+    const thinkingOpts =
+      thinkingBudget === undefined
+        ? {}
+        : { thinkingTokenBudget: Number(thinkingBudget) };
+    // Every run gets a full transcript at <runDir>/run.log; stream to the
+    // terminal too when TSFORGE_STREAM=1.
+    const log = Bun.file(join(runDir, "run.log")).writer();
+    try {
+      const onEvent = (e: ILoopEvent): void => {
+        void log.write(renderEvent(e, { color: false }));
+        // Flush per event — otherwise Bun's FileSink buffers and `tail -f` shows
+        // nothing until the run ends. The log must be live.
+        void log.flush();
+        if (stream) {
+          process.stdout.write(renderEvent(e, { color: true }));
+        }
+      };
+      const agent = modelAgent(provider, {
+        temperature: temp,
+        ...thinkingOpts,
+      });
+      const started = performance.now();
+      const result = await runSpec(gatedSpec, runDir, provider, {
+        onEvent,
+        temperature: temp,
+        ...thinkingOpts,
+      });
+      const ms = Math.round(performance.now() - started);
+      const cycles = result.results.reduce((acc, r) => acc + r.cycles, 0);
+      const passed = result.status === "done";
+      // Once green, drive QUALITY up: judge → improve-per-critique → re-judge.
+      let quality: number | undefined;
+      let judgeNotes = "";
+      const firstTask = spec.tasks[0];
+      if (passed && firstTask !== undefined) {
+        const specText = await Bun.file(join(runDir, `${seed}.spec.md`)).text();
+        // The judge is a MEASUREMENT, not part of the build. If it fails (e.g. the
+        // server times out), the implement result still stands — degrade to
+        // "quality unknown" rather than erroring out a successful run.
+        try {
+          const qr = await qualityRepair(
+            firstTask,
+            runDir,
+            agent,
+            judgeProvider,
+            { goal: spec.title, criteria: specText },
+            { target: qualityTarget, maxAttempts: qualityAttempts, onEvent }
+          );
+          quality = qr.quality;
+          judgeNotes = qr.notes;
+        } catch (err) {
+          judgeNotes = `judge unavailable: ${err instanceof Error ? err.message : String(err)}`;
+        }
+      }
+      // Structured per-run artifact for comparison alongside run.log + the code.
+      // Include the feature variant so analysis can reconstruct the conditions.
+      await Bun.write(
+        join(runDir, "result.json"),
+        JSON.stringify(
+          {
+            seed,
+            runId,
+            temperature: temp,
+            features: variantEnv,
+            status: result.status,
+            cycles,
+            ms,
+            quality,
+            judgeNotes,
+            tasks: result.results,
+          },
+          null,
+          2
+        )
+      );
+      const edits = result.results.reduce((a, r) => a + (r.edits ?? 0), 0);
+      const regressions = result.results.reduce(
+        (a, r) => a + (r.regressions ?? 0),
+        0
+      );
+      const vLabel = variantLabel(variantEnv);
+      records.push({
+        label: `${vLabel} temp=${temp}`,
+        passed,
+        cycles,
+        ms,
+        quality,
+      });
+      process.stdout.write(
+        `  ${seed} ${vLabel} temp=${temp} #${i + 1}: ${passed ? "done" : "blocked"} (${cycles} cyc, ${edits} edits, ${regressions} regress, ${ms}ms${quality === undefined ? "" : `, Q${quality}/5`}) → ${runId}\n`
+      );
+    } finally {
+      // Close the transcript on every path: a run that throws (e.g. a request
+      // timeout inside runSpec) would otherwise leave the sink open.
+      await log.end();
+    }
+  } finally {
+    restore();
+  }
+}
+// Aggregate the per-run records, print one row per summary, then save the raw
+// records and the summaries together as JSON under evals/runs/.
+const summaries = summarize(records);
+process.stdout.write(`\n=== sweep: ${seed} (${repeats} runs/variant) ===\n`);
+// Pad to the longest label so the columns line up. The shortest possible
+// label ("baseline temp=0") is already wider than a fixed 10-column pad.
+const labelWidth = summaries.reduce(
+  (width, s) => Math.max(width, s.label.length),
+  10
+);
+for (const s of summaries) {
+  process.stdout.write(
+    `${s.label.padEnd(labelWidth)}  pass ${Math.round(s.passRate * 100)}% (${s.passed}/${s.runs})  Q ${s.avgQuality.toFixed(1)}/5  avg ${s.avgCycles.toFixed(1)} cyc  ${Math.round(s.avgMs)}ms\n`
+  );
+}
+const outPath = join(evalsRoot, "runs", `sweep-${seed}-${stamp()}.json`);
+await Bun.write(
+  outPath,
+  JSON.stringify({ seed, temps, repeats, records, summaries }, null, 2)
+);
+process.stdout.write(`\nsaved ${outPath}\n`);

package/src/cli.ts CHANGED Viewed

@@ -11,6 +11,7 @@ import {
 } from "./loop";
 import {
   PROVIDER_LIMITS,
+  PROVIDER_DEFAULTS,
   OpenAICompatibleProvider,
   type IOpenAICompatibleConfig,
 } from "./inference";
@@ -261,7 +262,12 @@ async function detectContextWindow(
     const entries = data.data.filter(isRecord);
     const match = entries.find((e) => e.id === entry.model) ?? entries[0];
-    const len = match?.max_model_len;
+    // vLLM uses `max_model_len`; other servers expose `context_window` or
+    // `max_position_embeddings` — accept whichever is present.
+    const len =
+      match?.max_model_len ??
+      match?.context_window ??
+      match?.max_position_embeddings;
     return typeof len === "number" && Number.isFinite(len) ? len : undefined;
   } catch {
@@ -328,6 +334,16 @@ export function providerConfig(entry: IModelEntry): IOpenAICompatibleConfig {
     // instead of emitting tool calls (→ no files written). The StreamGuard is
     // the targeted loop protection. Opt in only to experiment.
     ...(repetitionPenalty === undefined ? {} : { repetitionPenalty }),
+    // Provider dialect + escape hatches — passed straight through so any
+    // OpenAI-ish endpoint (DeepSeek, OpenAI o-series, custom gateways) works.
+    ...(entry.reasoning === undefined ? {} : { reasoning: entry.reasoning }),
+    ...(entry.reasoningEffort === undefined
+      ? {}
+      : { reasoningEffort: entry.reasoningEffort }),
+    ...(entry.extraBody === undefined ? {} : { extraBody: entry.extraBody }),
+    ...(entry.extraHeaders === undefined
+      ? {}
+      : { extraHeaders: entry.extraHeaders }),
   };
 }
@@ -335,6 +351,26 @@ function makeProvider(entry: IModelEntry): OpenAICompatibleProvider {
   return new OpenAICompatibleProvider(providerConfig(entry));
 }
+/**
+ * Catch the common footgun: a cloud baseUrl paired with the leftover default
+ * `model`, which then 400s ("model not supported") on that host.
+ *
+ * Prints a warning to stdout when the entry's baseUrl host is not a loopback
+ * address and its model still equals `PROVIDER_DEFAULTS.model`. Does nothing
+ * when the baseUrl cannot be parsed as a URL.
+ *
+ * @param entry Model registry entry to check.
+ */
+function warnDefaultModelOnRemote(entry: IModelEntry): void {
+  let host: string;
+  try {
+    host = new URL(entry.baseUrl).hostname;
+  } catch {
+    return; // unparseable baseUrl: nothing to check
+  }
+  // URL.hostname keeps the brackets on IPv6 literals (`http://[::1]:8000` →
+  // "[::1]"), so the bracketed form must be listed or IPv6 loopback is
+  // misreported as remote.
+  const loopbackHosts = ["localhost", "127.0.0.1", "::1", "[::1]"];
+  const remote = !loopbackHosts.includes(host);
+  if (remote && entry.model === PROVIDER_DEFAULTS.model) {
+    process.stdout.write(
+      `  ⚠ models.json: model is still "${PROVIDER_DEFAULTS.model}" (the default) but baseUrl is ${host} — set the entry's "model" to a name that host supports.\n`
+    );
+  }
+}
 /** Print the model registry with ★ on the active one (the `/model` listing). */
 async function listModels(
   provider: OpenAICompatibleProvider,
@@ -784,6 +820,8 @@ async function repl(args: ICliArgs): Promise<number> {
   const provider = makeProvider(activeModel.entry);
   let activeName = activeModel.name;
+  warnDefaultModelOnRemote(activeModel.entry);
   // Best-effort cleanup of stale sessions on every launch.
   await pruneSessions();

package/src/inference/inference.types.ts CHANGED Viewed

@@ -118,6 +118,26 @@ export interface IOpenAICompatibleConfig {
    * correctness. Omitted (1.0 = off) by default; set it on code-gen providers.
    */
   repetitionPenalty?: number;
+  /**
+   * How this provider wants reasoning/thinking expressed on the wire:
+   *  - `qwen` (default): `chat_template_kwargs.enable_thinking` + `thinking_token_budget` (vLLM).
+   *  - `deepseek`: top-level `thinking: { type }` + `reasoning_effort`; never sends
+   *    `tool_choice: "required"` (DeepSeek's thinking mode rejects it).
+   *  - `openai`: `reasoning_effort`; uses `max_completion_tokens` and omits `temperature` (o-series).
+   *  - `none`: no reasoning fields.
+   */
+  reasoning?: ReasoningStyle;
+  /** Reasoning effort for `deepseek`/`openai` styles (maps to `reasoning_effort`). */
+  reasoningEffort?: "low" | "medium" | "high";
+  /** Arbitrary fields merged into the request body LAST (override anything above) —
+   *  the escape hatch for any provider-specific param. */
+  extraBody?: Record<string, unknown>;
+  /** Arbitrary request headers (e.g. Azure `api-key`, Anthropic `x-api-key`).
+   *  `${VAR}` in values is interpolated from the environment. */
+  extraHeaders?: Record<string, string>;
   /** Injectable for tests; defaults to global fetch. */
   fetch?: typeof fetch;
 }
+/**
+ * Provider reasoning-param dialect: selects how reasoning/thinking settings
+ * are expressed in the request (see the `reasoning` config field).
+ */
+export type ReasoningStyle =
+  | "qwen"
+  | "deepseek"
+  | "openai"
+  | "none";

package/src/inference/openai-compatible.ts CHANGED Viewed

@@ -7,8 +7,13 @@ import type {
 } from "./inference.types";
 import { PROVIDER_LIMITS } from "./inference.constants";
 import { fetchWithRetry } from "./transport";
-import { toWire, parseResponse } from "./wire";
+import { parseResponse } from "./wire";
 import { streamResponse } from "./stream";
+import {
+  buildRequestBody,
+  buildRequestHeaders,
+  chatCompletionsUrl,
+} from "./request";
 export { salvageToolCalls } from "./wire";
@@ -40,38 +45,10 @@ export class OpenAICompatibleProvider implements IProvider {
   ): Promise<IModelResponse> {
     const doFetch = this.cfg.fetch ?? fetch;
     const streaming = opts.onToken !== undefined;
-    const headers: Record<string, string> = {
-      "content-type": "application/json",
-    };
-    if (this.cfg.apiKey !== undefined) {
-      headers.authorization = `Bearer ${this.cfg.apiKey}`;
-    }
-    const body = JSON.stringify({
-      model: this.cfg.model,
-      messages: messages.map(toWire),
-      max_tokens: this.cfg.maxTokens ?? PROVIDER_LIMITS.maxTokens,
-      temperature: opts.temperature,
-      ...(this.cfg.repetitionPenalty === undefined
-        ? {}
-        : { repetition_penalty: this.cfg.repetitionPenalty }),
-      ...(opts.tools === undefined
-        ? {}
-        : { tools: opts.tools, tool_choice: opts.toolChoice ?? "auto" }),
-      ...(opts.enableThinking === undefined
-        ? {}
-        : { chat_template_kwargs: { enable_thinking: opts.enableThinking } }),
-      ...(opts.thinkingTokenBudget === undefined
-        ? {}
-        : { thinking_token_budget: opts.thinkingTokenBudget }),
-      // include_usage → the stream emits a final chunk carrying token `usage`
-      // (otherwise a streamed response reports none). Non-stream replies carry it
-      // by default.
-      ...(streaming
-        ? { stream: true, stream_options: { include_usage: true } }
-        : {}),
-    });
+    const headers = buildRequestHeaders(this.cfg);
+    const body = JSON.stringify(
+      buildRequestBody(this.cfg, messages, opts, streaming)
+    );
     // Retry transient CONNECTION blips (socket close / unable-to-connect) — the
     // connect happens before any stream starts, so retrying is safe for both
@@ -79,7 +56,7 @@ export class OpenAICompatibleProvider implements IProvider {
     // a network hiccup from wrecking an eval run.
     const res = await fetchWithRetry(
       doFetch,
-      `${this.cfg.baseUrl}/chat/completions`,
+      chatCompletionsUrl(this.cfg.baseUrl),
       headers,
       body,
       this.cfg.timeoutMs ?? PROVIDER_LIMITS.requestTimeoutMs,