npm - @agjs/tsforge - Versions diffs - 0.1.10 → 0.1.12 - Mend

@agjs/tsforge 0.1.10 → 0.1.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

package/package.json +1 -1
package/src/cli.ts +17 -0
package/src/eval/index.ts +9 -0
package/src/eval/metrics.ts +87 -0
package/src/eval/report.ts +168 -0
package/src/loop/loop.types.ts +3 -0
package/src/loop/session.ts +64 -2
package/src/render/ansi.ts +4 -0
package/src/render/render.types.ts +3 -0

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "@agjs/tsforge",
   "type": "module",
-  "version": "0.1.10",
+  "version": "0.1.12",
   "license": "MIT",
   "description": "TypeScript coding harness with a deterministic gate, stack-aware guardrails, and stream-level correction.",
   "repository": {

package/src/cli.ts CHANGED Viewed

@@ -629,6 +629,7 @@ const HELP = [
   "  /model [name]    list configured models (★ active), or switch to <name>",
   "  /sessions        list saved sessions (resume one with: tsforge --resume <id>)",
   "  /cost            rough conversation size (messages + ~tokens)",
+  "  /metrics         token totals + generation rate (tok/s) this session",
   "  /exit, /quit     leave the session",
   "",
   "Anything else is sent to the agent. It works with its tools; when it stops,",
@@ -1197,6 +1198,21 @@ async function repl(args: ICliArgs): Promise<number> {
         break;
       }
+      case "metrics": {
+        const m = session.metrics;
+        if (m.calls === 0) {
+          process.stdout.write("  no model calls yet\n");
+        } else {
+          process.stdout.write(
+            `  ${String(m.calls)} call(s) · ${String(m.promptTokens)} in / ${String(m.completionTokens)} out · ` +
+              `${String(m.lastTokensPerSecond)} tok/s last · ${String(m.avgTokensPerSecond)} tok/s avg\n`
+          );
+        }
+        break;
+      }
       default:
         process.stdout.write(`unknown command: ${line} (try /help)\n`);
     }
@@ -1217,6 +1233,7 @@ async function repl(args: ICliArgs): Promise<number> {
         elapsedMs: lastElapsedMs,
         status: lastStatus,
         scope: scopeLabel(session.scope) + (planMode ? " · PLAN" : ""),
+        tokensPerSecond: session.metrics.lastTokensPerSecond,
       })
     );
     process.stdout.write("› ");

package/src/eval/index.ts CHANGED Viewed

@@ -1,3 +1,12 @@
 export * from "./eval.types";
 export { judge } from "./judge";
 export { summarize } from "./score";
+export { analyzeEvents, type IRunMetrics } from "./metrics";
+export {
+  buildSweepReport,
+  renderSweepReportMarkdown,
+  wilsonInterval,
+  twoProportionZ,
+  type ISweepReport,
+  type IVariantReport,
+} from "./report";

package/src/eval/metrics.ts ADDED Viewed

@@ -0,0 +1,87 @@
+import type { ILoopEvent } from "../loop/loop.types";
+/** Behavioral metrics distilled from a run's event stream — the signals the
+ *  local-model literature says predict outcomes (tokens-to-solution, repair
+ *  iterations, peak context) rather than vibes. A reusable, pure counterpart to
+ *  the cli-metrics script.
+ *
+ *  As computed by `analyzeEvents`, `finalStatus` mirrors the last `done` /
+ *  `stuck` event in the stream and is `"none"` when the stream has neither. */
+export interface IRunMetrics {
+  finalStatus: "done" | "stuck" | "none";
+  /** Model turns (one per `cycle` event). */
+  turns: number;
+  /** Model calls (one per `usage` event). */
+  modelCalls: number;
+  /** Total completion tokens generated: the sum of `completionTokens` over all
+   *  `usage` events, counting a missing value as 0. */
+  tokensOut: number;
+  /** Largest prompt-token count seen (the run's context high-water mark). */
+  peakContext: number;
+  /** File mutations (`edit` + `create`). */
+  edits: number;
+  /** Distinct files created: unique non-empty `file` values on `create` events. */
+  filesCreated: number;
+  /** Gate runs (`validated` events). */
+  gateRuns: number;
+  /** Summed turn wall-clock from `timing` events, as a whole number of seconds. */
+  wallClockSeconds: number;
+  /** Mean output rate across calls that reported a positive one (tokens/second),
+   *  rounded to the nearest integer; 0 when no call reported a rate. */
+  avgTokensPerSecond: number;
+}
+/** Produce a fresh metrics record with every counter at zero and the status set
+ *  to `"none"` — the starting state before any event has been folded in. */
+function emptyMetrics(): IRunMetrics {
+  const zeroed: IRunMetrics = {
+    finalStatus: "none",
+    turns: 0,
+    modelCalls: 0,
+    tokensOut: 0,
+    peakContext: 0,
+    edits: 0,
+    filesCreated: 0,
+    gateRuns: 0,
+    wallClockSeconds: 0,
+    avgTokensPerSecond: 0,
+  };
+  return zeroed;
+}
+/**
+ * Reduce a run's event stream to its behavioral metrics. Pure — feed it the
+ * events from a `--log` JSONL or a captured `onEvent` stream.
+ *
+ * @param events - The run's loop events. Kinds other than the ones counted here
+ *   are ignored; order only matters for `finalStatus`, where the last `done` /
+ *   `stuck` event wins.
+ * @returns The aggregated counters; `wallClockSeconds` and `avgTokensPerSecond`
+ *   are rounded to whole numbers.
+ */
+export function analyzeEvents(events: readonly ILoopEvent[]): IRunMetrics {
+  const m = emptyMetrics();
+  const created = new Set<string>();
+  let tpsSum = 0;
+  let tpsCount = 0;
+  // Keep wall-clock in raw milliseconds and round once after the loop. Rounding
+  // each `timing` event on its own turned every turn under 500 ms into 0 s and
+  // let the per-event error pile up over a long run.
+  let wallClockMs = 0;
+  for (const event of events) {
+    if (event.kind === "cycle") {
+      m.turns += 1;
+    } else if (event.kind === "usage") {
+      m.modelCalls += 1;
+      m.tokensOut += event.completionTokens ?? 0;
+      m.peakContext = Math.max(m.peakContext, event.promptTokens ?? 0);
+      // Only calls that measured a positive rate feed the average.
+      if (event.tokensPerSecond !== undefined && event.tokensPerSecond > 0) {
+        tpsSum += event.tokensPerSecond;
+        tpsCount += 1;
+      }
+    } else if (event.kind === "create") {
+      // A create counts as a mutation even when it carries no usable file name.
+      m.edits += 1;
+      if (event.file !== undefined && event.file.length > 0) {
+        created.add(event.file);
+      }
+    } else if (event.kind === "edit") {
+      m.edits += 1;
+    } else if (event.kind === "timing") {
+      wallClockMs += event.ms ?? 0;
+    } else if (event.kind === "validated") {
+      m.gateRuns += 1;
+    } else if (event.kind === "done") {
+      m.finalStatus = "done";
+    } else if (event.kind === "stuck") {
+      m.finalStatus = "stuck";
+    }
+  }
+  m.filesCreated = created.size;
+  m.wallClockSeconds = Math.round(wallClockMs / 1000);
+  m.avgTokensPerSecond = tpsCount > 0 ? Math.round(tpsSum / tpsCount) : 0;
+  return m;
+}

package/src/eval/report.ts ADDED Viewed

@@ -0,0 +1,168 @@
+import type { IRunRecord, IVariantSummary } from "./eval.types";
+import { summarize } from "./score";
+/** Two-sided 95% critical value of the standard normal distribution (its 97.5th
+ *  percentile) — the multiplier for Wilson intervals and the z-test threshold. */
+const Z95 = 1.96;
+/** A variant summary enriched with a confidence interval and, when a baseline is
+ *  given, a significance test of its pass-rate difference from that baseline. */
+export interface IVariantReport extends IVariantSummary {
+  /** 95% Wilson score interval for the pass rate, as [low, high] in [0, 1]. */
+  readonly passRateCI: readonly [number, number];
+  /** Comparison vs the baseline variant (absent for the baseline itself or when
+   *  no baseline was matched). `deltaPassRate` is this variant's pass rate minus
+   *  the baseline's; `z` is the pooled two-proportion z-statistic for that
+   *  difference. */
+  readonly vsBaseline?: {
+    readonly deltaPassRate: number;
+    readonly z: number;
+    /** True when |z| > 1.96 (p < 0.05, two-sided). */
+    readonly significant: boolean;
+  };
+}
+export interface ISweepReport {
+  /** The baseline variant label, or null if none was matched — either because no
+   *  baseline label was requested or because no variant carries it. */
+  readonly baseline: string | null;
+  readonly variants: readonly IVariantReport[]; // one report per summarized variant
+}
+/**
+ * 95% Wilson score interval for `passed` successes out of `n` trials.
+ *
+ * @param passed - Number of successful trials.
+ * @param n - Total number of trials.
+ * @returns `[low, high]`, clamped to [0, 1]; `[0, 0]` when `n` is 0.
+ */
+export function wilsonInterval(passed: number, n: number): [number, number] {
+  if (n === 0) {
+    return [0, 0];
+  }
+  const zSquared = Z95 * Z95;
+  const observedRate = passed / n;
+  const scale = 1 + zSquared / n;
+  const midpoint = observedRate + zSquared / (2 * n);
+  const spread =
+    Z95 *
+    Math.sqrt((observedRate * (1 - observedRate)) / n + zSquared / (4 * n * n));
+  const lower = Math.max(0, (midpoint - spread) / scale);
+  const upper = Math.min(1, (midpoint + spread) / scale);
+  return [lower, upper];
+}
+/**
+ * Pooled two-proportion z-statistic comparing rate1 (x1/n1) to rate2 (x2/n2).
+ *
+ * @param x1 - Successes in the first sample.
+ * @param n1 - Size of the first sample.
+ * @param x2 - Successes in the second sample.
+ * @param n2 - Size of the second sample.
+ * @returns The z-statistic (positive when rate1 exceeds rate2), or 0 when either
+ *   sample is empty or the pooled standard error is 0.
+ */
+export function twoProportionZ(
+  x1: number,
+  n1: number,
+  x2: number,
+  n2: number
+): number {
+  const eitherSampleEmpty = n1 === 0 || n2 === 0;
+  if (eitherSampleEmpty) {
+    return 0;
+  }
+  const pooledRate = (x1 + x2) / (n1 + n2);
+  const variance = pooledRate * (1 - pooledRate) * (1 / n1 + 1 / n2);
+  const standardError = Math.sqrt(variance);
+  const rateGap = x1 / n1 - x2 / n2;
+  return standardError === 0 ? 0 : rateGap / standardError;
+}
+/** Compare `variant` with `baseline`: the pass-rate difference (variant minus
+ *  baseline), the pooled two-proportion z-statistic for it, and whether |z|
+ *  exceeds the 1.96 threshold. */
+function compareToBaseline(
+  variant: IVariantSummary,
+  baseline: IVariantSummary
+): IVariantReport["vsBaseline"] {
+  const zScore = twoProportionZ(
+    variant.passed,
+    variant.runs,
+    baseline.passed,
+    baseline.runs
+  );
+  const deltaPassRate = variant.passRate - baseline.passRate;
+  const significant = Math.abs(zScore) > Z95;
+  return { deltaPassRate, z: zScore, significant };
+}
+/**
+ * Turn raw run records into a statistical report. Every variant gets its pass
+ * rate's 95% Wilson interval; when `baselineLabel` names one of the variants,
+ * every other variant also gets a two-proportion significance test against it.
+ *
+ * @param records - Raw run records; they are copied before being summarized.
+ * @param baselineLabel - Label of the variant to compare the others against.
+ * @returns The per-variant reports plus the matched baseline label, or `null`
+ *   when no label was given or none matched.
+ */
+export function buildSweepReport(
+  records: readonly IRunRecord[],
+  baselineLabel?: string
+): ISweepReport {
+  const summaries = summarize([...records]);
+  let baseline: IVariantSummary | undefined;
+  if (baselineLabel !== undefined) {
+    baseline = summaries.find((candidate) => candidate.label === baselineLabel);
+  }
+  const variants = summaries.map((summary): IVariantReport => {
+    const withInterval = {
+      ...summary,
+      passRateCI: wilsonInterval(summary.passed, summary.runs),
+    };
+    // Only a non-baseline variant, and only when a baseline exists, is compared.
+    if (baseline !== undefined && baseline.label !== summary.label) {
+      return {
+        ...withInterval,
+        vsBaseline: compareToBaseline(summary, baseline),
+      };
+    }
+    return withInterval;
+  });
+  return { baseline: baseline?.label ?? null, variants };
+}
+/** Format a fraction as a whole-number percentage, e.g. 0.256 → "26%". */
+function pct(value: number): string {
+  const wholePercent = Math.round(value * 100);
+  return `${wholePercent}%`;
+}
+/** Text for a variant's "vs baseline" column: "—" when the report has no
+ *  baseline or the variant carries no comparison, "baseline" for the baseline
+ *  row itself, otherwise the signed pass-rate delta with its z-score and a
+ *  trailing " *" when the difference is significant. */
+function baselineCell(report: IVariantReport, baseline: string | null): string {
+  if (baseline !== null && report.label === baseline) {
+    return "baseline";
+  }
+  const comparison = baseline === null ? undefined : report.vsBaseline;
+  if (comparison === undefined) {
+    return "—";
+  }
+  // pct() already prints the minus sign, so only non-negative deltas need "+".
+  let sign = "";
+  if (comparison.deltaPassRate >= 0) {
+    sign = "+";
+  }
+  const delta = pct(comparison.deltaPassRate);
+  const zText = comparison.z.toFixed(2);
+  const mark = comparison.significant ? " *" : "";
+  return `${sign}${delta} (z=${zText})${mark}`;
+}
+/**
+ * Render a sweep report as a Markdown table. `*` marks a significant difference
+ * (p < 0.05) from the baseline.
+ *
+ * @param report - The report to render, typically built by `buildSweepReport`.
+ * @returns The heading, table and footnote joined with newlines, without a
+ *   trailing newline.
+ */
+export function renderSweepReportMarkdown(report: ISweepReport): string {
+  // Variant labels are caller-supplied text. A raw `|` would open a new column
+  // and a line break would end the row, so neutralise both before the label
+  // reaches the table.
+  const labelCell = (label: string): string =>
+    label.replace(/\r?\n/g, " ").replace(/\|/g, "\\|");
+  const header =
+    "| Variant | Runs | Pass | 95% CI | Cycles | Ms | Quality | vs baseline |\n" +
+    "| --- | --- | --- | --- | --- | --- | --- | --- |";
+  const rows = report.variants.map((v) => {
+    const ci = `${pct(v.passRateCI[0])}–${pct(v.passRateCI[1])}`;
+    return (
+      `| ${labelCell(v.label)} | ${String(v.runs)} | ${pct(v.passRate)} | ${ci} | ` +
+      `${v.avgCycles.toFixed(1)} | ${String(Math.round(v.avgMs))} | ` +
+      `${v.avgQuality.toFixed(1)} | ${baselineCell(v, report.baseline)} |`
+    );
+  });
+  return [
+    "## A/B sweep report",
+    "",
+    header,
+    ...rows,
+    "",
+    "`*` = significant at p < 0.05 (two-proportion z-test vs baseline).",
+  ].join("\n");
+}

package/src/loop/loop.types.ts CHANGED Viewed

@@ -51,6 +51,9 @@ export interface ILoopEvent {
   promptTokens?: number;
   completionTokens?: number;
   totalTokens?: number;
+  /** For `usage` events: output generation rate (completion tokens / second),
+   *  measured from the first streamed token to the call's end. */
+  tokensPerSecond?: number;
   /** For `usage` (and salvage-warning `tool`) events: whether THIS model call
    *  ran with thinking enabled — lets the analyzer correlate malformed-tool-call
    *  rate with the thinking mode (see analyze-malformed). */

package/src/loop/session.ts CHANGED Viewed

@@ -114,6 +114,20 @@ export interface ISendResult {
   turns: number;
 }
+/** Cumulative model-call metrics for a session — the basis for `/metrics`. */
+export interface ISessionMetrics {
+  /** Number of model calls made that reported token usage. */
+  readonly calls: number;
+  /** Total prompt (input) tokens billed across all calls. */
+  readonly promptTokens: number;
+  /** Total completion (output) tokens generated across all calls. */
+  readonly completionTokens: number;
+  /** Output generation rate averaged over all calls (tokens/second): total
+   *  completion tokens over total generation time, rounded to the nearest
+   *  integer; 0 while no generation time has been recorded. */
+  readonly avgTokensPerSecond: number;
+  /** Output generation rate of the most recent call (tokens/second), rounded to
+   *  the nearest integer. */
+  readonly lastTokensPerSecond: number;
+}
 export interface ISendOptions {
   /** Caller cancellation (Ctrl-C). */
   signal?: AbortSignal;
@@ -339,6 +353,15 @@ export class Session {
    *  size of the context the model last saw (drives the status gauge and, soon,
    *  auto-compaction). */
   private lastUsage?: ITokenUsage;
+  /** Running totals behind the `metrics` getter. genMs is the summed generation
+   *  time (first-token→end, or call-start→end when no token was streamed) so the
+   *  average rate is tokens/total-gen-seconds. lastTokensPerSecond is kept
+   *  unrounded here; the getter rounds it. */
+  private readonly metricsTotals = {
+    calls: 0,
+    promptTokens: 0,
+    completionTokens: 0,
+    genMs: 0,
+    lastTokensPerSecond: 0,
+  };
   /** Fast check run every few edits while building (e.g. tsc); "" = off. */
   private incrementalCheck: string;
   /** Per-send thinking override, set from ISendOptions for the duration of a
@@ -507,6 +530,31 @@ export class Session {
     return this.lastUsage;
   }
+  /** Snapshot of this session's cumulative model-call metrics: call and token
+   *  totals plus the average and most-recent generation rates, both rounded to
+   *  whole tokens/second. The average is 0 until some generation time exists. */
+  get metrics(): ISessionMetrics {
+    const { calls, promptTokens, completionTokens, genMs, lastTokensPerSecond } =
+      this.metricsTotals;
+    let avgTokensPerSecond = 0;
+    if (genMs > 0) {
+      avgTokensPerSecond = Math.round((completionTokens / genMs) * 1000);
+    }
+    return {
+      calls,
+      promptTokens,
+      completionTokens,
+      avgTokensPerSecond,
+      lastTokensPerSecond: Math.round(lastTokensPerSecond),
+    };
+  }
+  /** Fold one call's usage + generation time into the running metrics totals and
+   *  remember `usage` as the most recent call's usage. `genMs` is the call's
+   *  generation time in milliseconds; a non-positive value records a last-call
+   *  rate of 0. */
+  private recordUsage(usage: ITokenUsage, genMs: number): void {
+    const totals = this.metricsTotals;
+    let callRate = 0;
+    if (genMs > 0) {
+      callRate = (usage.completionTokens / genMs) * 1000;
+    }
+    this.lastUsage = usage;
+    totals.calls += 1;
+    totals.promptTokens += usage.promptTokens;
+    totals.completionTokens += usage.completionTokens;
+    totals.genMs += genMs;
+    totals.lastTokensPerSecond = callRate;
+  }
   /** The real size of the context the model is currently holding — the prompt
    *  tokens of the last call (what auto-compaction watches), 0 before any call. */
   get contextTokens(): number {
@@ -957,6 +1005,8 @@ export class Session {
     const mcpSchemas = this.ctx.mcpRegistry?.toolSchemas() ?? [];
     const offeredTools =
       mcpSchemas.length > 0 ? [...baseTools, ...mcpSchemas] : baseTools;
+    const callStart = performance.now();
+    let firstTokenAt = 0;
     const res = await this.provider.complete(ctx.messages, {
       tools: offeredTools,
       temperature: this.cfg.temperature ?? 0,
@@ -967,6 +1017,12 @@ export class Session {
         : { thinkingTokenBudget: this.cfg.thinkingTokenBudget }),
       ...(signal === undefined ? {} : { signal }),
       onToken: (token, channel) => {
+        // Stamp the first token so tokens/sec measures generation rate (excluding
+        // prompt-processing / time-to-first-token), not total wall time.
+        if (firstTokenAt === 0) {
+          firstTokenAt = performance.now();
+        }
         // Stream EVERYTHING live — thinking, the tool calls being written, and
         // the answer itself (channel `content`), so the user watches the reply
         // arrive instead of staring at a frozen indicator. The renderer formats
@@ -977,17 +1033,23 @@ export class Session {
     });
     if (res.usage !== undefined) {
-      this.lastUsage = res.usage;
+      const ended = performance.now();
+      const genMs = firstTokenAt > 0 ? ended - firstTokenAt : ended - callStart;
+      const tps = genMs > 0 ? (res.usage.completionTokens / genMs) * 1000 : 0;
+      this.recordUsage(res.usage, genMs);
       // Logged (not shown) so the --log analyzer can compute tokens-to-solution.
       // `thinking` records THIS call's mode, so malformed-call rates can be
       // correlated with it (analyze-malformed).
       report({
         kind: "usage",
         task: SESSION_ID,
-        message: `tokens ${res.usage.promptTokens} in / ${res.usage.completionTokens} out`,
+        message: `tokens ${res.usage.promptTokens} in / ${res.usage.completionTokens} out · ${Math.round(tps)} tok/s`,
         promptTokens: res.usage.promptTokens,
         completionTokens: res.usage.completionTokens,
         totalTokens: res.usage.totalTokens,
+        tokensPerSecond: Math.round(tps),
+        ms: Math.round(genMs),
         ...(enableThinking === undefined ? {} : { thinking: enableThinking }),
       });
     }

package/src/render/ansi.ts CHANGED Viewed

@@ -71,6 +71,10 @@ export function renderStatus(
     );
   }
+  if (info.tokensPerSecond !== undefined && info.tokensPerSecond > 0) {
+    bits.push(`${info.tokensPerSecond} tok/s`);
+  }
   bits.push(info.status, info.scope);
   return `${paint(`  ⎯ ${bits.join(" · ")}`, STYLE.dim, color)}\n`;

package/src/render/render.types.ts CHANGED Viewed

@@ -18,4 +18,7 @@ export interface IStatusInfo {
   status: string;
   /** Editable scope label. */
   scope: string;
+  /** Output generation rate of the last model call (tokens/second); omitted or
+   *  0 before the first call. */
+  tokensPerSecond?: number;
 }