@agjs/tsforge 0.1.10 → 0.1.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@agjs/tsforge",
3
3
  "type": "module",
4
- "version": "0.1.10",
4
+ "version": "0.1.12",
5
5
  "license": "MIT",
6
6
  "description": "TypeScript coding harness with a deterministic gate, stack-aware guardrails, and stream-level correction.",
7
7
  "repository": {
package/src/cli.ts CHANGED
@@ -629,6 +629,7 @@ const HELP = [
629
629
  " /model [name] list configured models (★ active), or switch to <name>",
630
630
  " /sessions list saved sessions (resume one with: tsforge --resume <id>)",
631
631
  " /cost rough conversation size (messages + ~tokens)",
632
+ " /metrics token totals + generation rate (tok/s) this session",
632
633
  " /exit, /quit leave the session",
633
634
  "",
634
635
  "Anything else is sent to the agent. It works with its tools; when it stops,",
@@ -1197,6 +1198,21 @@ async function repl(args: ICliArgs): Promise<number> {
1197
1198
  break;
1198
1199
  }
1199
1200
 
1201
+ case "metrics": {
1202
+ const m = session.metrics;
1203
+
1204
+ if (m.calls === 0) {
1205
+ process.stdout.write(" no model calls yet\n");
1206
+ } else {
1207
+ process.stdout.write(
1208
+ ` ${String(m.calls)} call(s) · ${String(m.promptTokens)} in / ${String(m.completionTokens)} out · ` +
1209
+ `${String(m.lastTokensPerSecond)} tok/s last · ${String(m.avgTokensPerSecond)} tok/s avg\n`
1210
+ );
1211
+ }
1212
+
1213
+ break;
1214
+ }
1215
+
1200
1216
  default:
1201
1217
  process.stdout.write(`unknown command: ${line} (try /help)\n`);
1202
1218
  }
@@ -1217,6 +1233,7 @@ async function repl(args: ICliArgs): Promise<number> {
1217
1233
  elapsedMs: lastElapsedMs,
1218
1234
  status: lastStatus,
1219
1235
  scope: scopeLabel(session.scope) + (planMode ? " · PLAN" : ""),
1236
+ tokensPerSecond: session.metrics.lastTokensPerSecond,
1220
1237
  })
1221
1238
  );
1222
1239
  process.stdout.write("› ");
package/src/eval/index.ts CHANGED
@@ -1,3 +1,12 @@
1
1
  export * from "./eval.types";
2
2
  export { judge } from "./judge";
3
3
  export { summarize } from "./score";
4
+ export { analyzeEvents, type IRunMetrics } from "./metrics";
5
+ export {
6
+ buildSweepReport,
7
+ renderSweepReportMarkdown,
8
+ wilsonInterval,
9
+ twoProportionZ,
10
+ type ISweepReport,
11
+ type IVariantReport,
12
+ } from "./report";
@@ -0,0 +1,87 @@
1
+ import type { ILoopEvent } from "../loop/loop.types";
2
+
3
/**
 * Behavioral metrics distilled from a single run's event stream — the signals
 * the local-model literature says predict outcomes (tokens-to-solution, repair
 * iterations, peak context) rather than vibes. A reusable, pure counterpart to
 * the cli-metrics script.
 */
export interface IRunMetrics {
  /** Terminal state of the run; "none" when no `done`/`stuck` event was seen. */
  finalStatus: "done" | "stuck" | "none";
  /** Count of `cycle` events, i.e. model turns. */
  turns: number;
  /** Count of `usage` events, i.e. model calls. */
  modelCalls: number;
  /** Completion tokens summed over every `usage` event. */
  tokensOut: number;
  /** Highest prompt-token count of any call — the context high-water mark. */
  peakContext: number;
  /** Number of file mutations: `edit` plus `create` events. */
  edits: number;
  /** Number of distinct file paths that appeared in `create` events. */
  filesCreated: number;
  /** Count of `validated` events, i.e. gate runs. */
  gateRuns: number;
  /** Turn wall-clock summed from `timing` events, expressed in seconds. */
  wallClockSeconds: number;
  /** Mean tokens/second over the calls that reported a positive rate. */
  avgTokensPerSecond: number;
}
28
+
29
/** Build a zeroed metrics record: every counter at 0 and `finalStatus` "none". */
function emptyMetrics(): IRunMetrics {
  return {
    finalStatus: "none",
    turns: 0,
    modelCalls: 0,
    tokensOut: 0,
    peakContext: 0,
    edits: 0,
    filesCreated: 0,
    gateRuns: 0,
    wallClockSeconds: 0,
    avgTokensPerSecond: 0,
  };
}

/**
 * Reduce a run's event stream to its behavioral metrics. Pure — feed it the
 * events from a `--log` JSONL or a captured `onEvent` stream.
 *
 * @param events Events in emission order. Kinds other than `cycle`, `usage`,
 *   `create`, `edit`, `timing`, `validated`, `done` and `stuck` are ignored.
 * @returns The aggregated metrics. When both `done` and `stuck` occur, the
 *   later event decides `finalStatus`.
 */
export function analyzeEvents(events: readonly ILoopEvent[]): IRunMetrics {
  const m = emptyMetrics();
  const created = new Set<string>();
  let tpsSum = 0;
  let tpsCount = 0;
  // Wall-clock is accumulated in milliseconds and rounded ONCE at the end.
  // Rounding every turn to whole seconds would count each sub-500 ms turn as 0
  // and let the per-turn rounding error pile up across a long run.
  let wallClockMs = 0;

  for (const event of events) {
    if (event.kind === "cycle") {
      m.turns += 1;
    } else if (event.kind === "usage") {
      m.modelCalls += 1;
      m.tokensOut += event.completionTokens ?? 0;
      m.peakContext = Math.max(m.peakContext, event.promptTokens ?? 0);

      // Only calls that reported a positive rate contribute to the mean.
      if (event.tokensPerSecond !== undefined && event.tokensPerSecond > 0) {
        tpsSum += event.tokensPerSecond;
        tpsCount += 1;
      }
    } else if (event.kind === "create") {
      m.edits += 1;

      // The set de-duplicates repeated creates of the same path.
      if (event.file !== undefined && event.file.length > 0) {
        created.add(event.file);
      }
    } else if (event.kind === "edit") {
      m.edits += 1;
    } else if (event.kind === "timing") {
      wallClockMs += event.ms ?? 0;
    } else if (event.kind === "validated") {
      m.gateRuns += 1;
    } else if (event.kind === "done") {
      m.finalStatus = "done";
    } else if (event.kind === "stuck") {
      m.finalStatus = "stuck";
    }
  }

  m.filesCreated = created.size;
  m.wallClockSeconds = Math.round(wallClockMs / 1000);
  m.avgTokensPerSecond = tpsCount > 0 ? Math.round(tpsSum / tpsCount) : 0;

  return m;
}
@@ -0,0 +1,168 @@
1
+ import type { IRunRecord, IVariantSummary } from "./eval.types";
2
+ import { summarize } from "./score";
3
+
4
/** Two-sided 95% critical value of the standard normal distribution. It is the
 * multiplier in the Wilson interval and the significance cut-off for |z|. */
const Z95 = 1.96;
6
+
7
/**
 * One variant's summary plus its statistics: a confidence interval for the pass
 * rate and, if a baseline was matched, how its pass rate differs from that
 * baseline and whether the difference is significant.
 */
export interface IVariantReport extends IVariantSummary {
  /** Pass-rate 95% Wilson score interval as `[low, high]`, both within [0, 1]. */
  readonly passRateCI: readonly [number, number];
  /** Baseline comparison. Missing on the baseline variant itself and whenever
   * no baseline was supplied or matched. */
  readonly vsBaseline?: {
    /** This variant's pass rate minus the baseline's. */
    readonly deltaPassRate: number;
    /** Pooled two-proportion z-statistic for that difference. */
    readonly z: number;
    /** True when |z| > 1.96 (p < 0.05, two-sided). */
    readonly significant: boolean;
  };
}
21
+
22
/** Result of a sweep: every variant's report plus which one served as baseline. */
export interface ISweepReport {
  /** Label of the variant used as baseline; null when none was matched. */
  readonly baseline: string | null;
  /** One report per summarized variant. */
  readonly variants: readonly IVariantReport[];
}
27
+
28
/**
 * 95% Wilson score interval for a binomial proportion.
 *
 * @param passed Number of successes.
 * @param n Number of trials.
 * @returns `[low, high]` clamped to [0, 1]; `[0, 0]` when `n` is 0.
 */
export function wilsonInterval(passed: number, n: number): [number, number] {
  if (n === 0) {
    return [0, 0];
  }

  const proportion = passed / n;
  const zSquared = Z95 * Z95;
  const scale = 1 + zSquared / n;
  const midpoint = proportion + zSquared / (2 * n);
  const halfWidth =
    Z95 *
    Math.sqrt((proportion * (1 - proportion)) / n + zSquared / (4 * n * n));

  const lower = (midpoint - halfWidth) / scale;
  const upper = (midpoint + halfWidth) / scale;

  return [Math.max(0, lower), Math.min(1, upper)];
}
45
+
46
/**
 * Pooled two-proportion z-statistic for the difference between the rates
 * `x1 / n1` and `x2 / n2`.
 *
 * @returns The z-statistic, or 0 when either sample is empty or the pooled
 *   standard error is 0 (no variation to test against).
 */
export function twoProportionZ(
  x1: number,
  n1: number,
  x2: number,
  n2: number
): number {
  const bothSamplesPresent = n1 !== 0 && n2 !== 0;

  if (!bothSamplesPresent) {
    return 0;
  }

  const pooledRate = (x1 + x2) / (n1 + n2);
  const standardError = Math.sqrt(
    pooledRate * (1 - pooledRate) * (1 / n1 + 1 / n2)
  );

  return standardError === 0 ? 0 : (x1 / n1 - x2 / n2) / standardError;
}
66
+
67
/** Compare one variant's pass rate with the baseline's: the raw difference, the
 * pooled two-proportion z-statistic, and whether |z| exceeds the 95% cut-off. */
function compareToBaseline(
  variant: IVariantSummary,
  baseline: IVariantSummary
): IVariantReport["vsBaseline"] {
  const { passed, runs, passRate } = variant;
  const zScore = twoProportionZ(passed, runs, baseline.passed, baseline.runs);
  const deltaPassRate = passRate - baseline.passRate;
  const significant = Math.abs(zScore) > Z95;

  return { deltaPassRate, z: zScore, significant };
}
84
+
85
/**
 * Turn raw run records into a statistical sweep report. Every variant gets its
 * pass rate's 95% Wilson interval; if `baselineLabel` names one of the
 * summarized variants, each other variant also gets a two-proportion
 * significance test against it.
 *
 * @param records Raw run records; a copy is passed to `summarize`.
 * @param baselineLabel Label of the variant to compare against, if any.
 * @returns The per-variant reports and the matched baseline label (null when
 *   no label was given or none matched).
 */
export function buildSweepReport(
  records: readonly IRunRecord[],
  baselineLabel?: string
): ISweepReport {
  const summaries = summarize([...records]);

  let baseline: IVariantSummary | undefined;

  if (baselineLabel !== undefined) {
    baseline = summaries.find((candidate) => candidate.label === baselineLabel);
  }

  const variants: IVariantReport[] = [];

  for (const summary of summaries) {
    const withInterval: IVariantReport = {
      ...summary,
      passRateCI: wilsonInterval(summary.passed, summary.runs),
    };

    // Only non-baseline variants carry a comparison, and only when a baseline
    // was actually matched.
    if (baseline !== undefined && baseline.label !== summary.label) {
      variants.push({
        ...withInterval,
        vsBaseline: compareToBaseline(summary, baseline),
      });
    } else {
      variants.push(withInterval);
    }
  }

  return { baseline: baseline?.label ?? null, variants };
}
117
+
118
/** Format a fraction as a whole-number percentage, e.g. 0.5 → "50%". */
function pct(value: number): string {
  return `${Math.round(value * 100)}%`;
}

/** Text for the "vs baseline" column of one row: "—" when there is nothing to
 * compare, "baseline" for the baseline row itself, otherwise the signed
 * pass-rate delta with its z-statistic and a trailing " *" when significant. */
function baselineCell(report: IVariantReport, baseline: string | null): string {
  if (baseline === null) {
    return "—";
  }

  if (report.label === baseline) {
    return "baseline";
  }

  const v = report.vsBaseline;

  if (v === undefined) {
    return "—";
  }

  // Negative deltas already carry their "-" from the number itself.
  const sign = v.deltaPassRate >= 0 ? "+" : "";
  const mark = v.significant ? " *" : "";

  return `${sign}${pct(v.deltaPassRate)} (z=${v.z.toFixed(2)})${mark}`;
}

/**
 * Render a sweep report as a Markdown table. `*` marks a significant difference
 * (p < 0.05) from the baseline.
 *
 * @param report The sweep report to render.
 * @returns A heading, the table (one row per variant) and a footnote, joined
 *   with newlines.
 */
export function renderSweepReportMarkdown(report: ISweepReport): string {
  const header =
    "| Variant | Runs | Pass | 95% CI | Cycles | Ms | Quality | vs baseline |\n" +
    "| --- | --- | --- | --- | --- | --- | --- | --- |";

  const rows = report.variants.map((v) => {
    const ci = `${pct(v.passRateCI[0])}–${pct(v.passRateCI[1])}`;
    // A raw "|" in a label would be read as a cell separator and shift every
    // following column, so escape it the way Markdown tables expect.
    const label = v.label.replace(/\|/g, "\\|");

    return (
      `| ${label} | ${String(v.runs)} | ${pct(v.passRate)} | ${ci} | ` +
      `${v.avgCycles.toFixed(1)} | ${String(Math.round(v.avgMs))} | ` +
      `${v.avgQuality.toFixed(1)} | ${baselineCell(v, report.baseline)} |`
    );
  });

  return [
    "## A/B sweep report",
    "",
    header,
    ...rows,
    "",
    "`*` = significant at p < 0.05 (two-proportion z-test vs baseline).",
  ].join("\n");
}
@@ -51,6 +51,9 @@ export interface ILoopEvent {
51
51
  promptTokens?: number;
52
52
  completionTokens?: number;
53
53
  totalTokens?: number;
54
+ /** For `usage` events: output generation rate (completion tokens / second),
55
+ * measured from the first streamed token to the call's end. */
56
+ tokensPerSecond?: number;
54
57
  /** For `usage` (and salvage-warning `tool`) events: whether THIS model call
55
58
  * ran with thinking enabled — lets the analyzer correlate malformed-tool-call
56
59
  * rate with the thinking mode (see analyze-malformed). */
@@ -114,6 +114,20 @@ export interface ISendResult {
114
114
  turns: number;
115
115
  }
116
116
 
117
/** Running model-call totals for one session; this is what `/metrics` prints. */
export interface ISessionMetrics {
  /** How many model calls have been made. */
  readonly calls: number;
  /** Prompt (input) tokens summed over every call. */
  readonly promptTokens: number;
  /** Completion (output) tokens summed over every call. */
  readonly completionTokens: number;
  /** Average output generation rate across all calls, in tokens/second. */
  readonly avgTokensPerSecond: number;
  /** Output generation rate of the latest call, in tokens/second. */
  readonly lastTokensPerSecond: number;
}
130
+
117
131
  export interface ISendOptions {
118
132
  /** Caller cancellation (Ctrl-C). */
119
133
  signal?: AbortSignal;
@@ -339,6 +353,15 @@ export class Session {
339
353
  * size of the context the model last saw (drives the status gauge and, soon,
340
354
  * auto-compaction). */
341
355
  private lastUsage?: ITokenUsage;
356
+ /** Running totals behind the `metrics` getter. genMs is the summed generation
357
+ * time (first-token→end) so the average rate is tokens/total-gen-seconds. */
358
+ private readonly metricsTotals = {
359
+ calls: 0,
360
+ promptTokens: 0,
361
+ completionTokens: 0,
362
+ genMs: 0,
363
+ lastTokensPerSecond: 0,
364
+ };
342
365
  /** Fast check run every few edits while building (e.g. tsc); "" = off. */
343
366
  private incrementalCheck: string;
344
367
  /** Per-send thinking override, set from ISendOptions for the duration of a
@@ -507,6 +530,31 @@ export class Session {
507
530
  return this.lastUsage;
508
531
  }
509
532
 
533
/** Snapshot of this session's cumulative model-call metrics. The average rate
 * is total completion tokens over total generation time (0 until any
 * generation time has been recorded); both rates are rounded to whole
 * tokens/second. */
get metrics(): ISessionMetrics {
  const { calls, promptTokens, completionTokens, genMs, lastTokensPerSecond } =
    this.metricsTotals;

  let avgTokensPerSecond = 0;

  if (genMs > 0) {
    avgTokensPerSecond = Math.round((completionTokens / genMs) * 1000);
  }

  return {
    calls,
    promptTokens,
    completionTokens,
    avgTokensPerSecond,
    lastTokensPerSecond: Math.round(lastTokensPerSecond),
  };
}
546
+
547
/** Record one model call: remember its usage as the latest, add its tokens and
 * generation time (`genMs`, milliseconds) to the running totals, and store the
 * call's own rate (0 when `genMs` is not positive). */
private recordUsage(usage: ITokenUsage, genMs: number): void {
  const totals = this.metricsTotals;

  this.lastUsage = usage;
  totals.calls += 1;
  totals.promptTokens += usage.promptTokens;
  totals.completionTokens += usage.completionTokens;
  totals.genMs += genMs;

  if (genMs > 0) {
    totals.lastTokensPerSecond = (usage.completionTokens / genMs) * 1000;
  } else {
    totals.lastTokensPerSecond = 0;
  }
}
557
+
510
558
  /** The real size of the context the model is currently holding — the prompt
511
559
  * tokens of the last call (what auto-compaction watches), 0 before any call. */
512
560
  get contextTokens(): number {
@@ -957,6 +1005,8 @@ export class Session {
957
1005
  const mcpSchemas = this.ctx.mcpRegistry?.toolSchemas() ?? [];
958
1006
  const offeredTools =
959
1007
  mcpSchemas.length > 0 ? [...baseTools, ...mcpSchemas] : baseTools;
1008
+ const callStart = performance.now();
1009
+ let firstTokenAt = 0;
960
1010
  const res = await this.provider.complete(ctx.messages, {
961
1011
  tools: offeredTools,
962
1012
  temperature: this.cfg.temperature ?? 0,
@@ -967,6 +1017,12 @@ export class Session {
967
1017
  : { thinkingTokenBudget: this.cfg.thinkingTokenBudget }),
968
1018
  ...(signal === undefined ? {} : { signal }),
969
1019
  onToken: (token, channel) => {
1020
+ // Stamp the first token so tokens/sec measures generation rate (excluding
1021
+ // prompt-processing / time-to-first-token), not total wall time.
1022
+ if (firstTokenAt === 0) {
1023
+ firstTokenAt = performance.now();
1024
+ }
1025
+
970
1026
  // Stream EVERYTHING live — thinking, the tool calls being written, and
971
1027
  // the answer itself (channel `content`), so the user watches the reply
972
1028
  // arrive instead of staring at a frozen indicator. The renderer formats
@@ -977,17 +1033,23 @@ export class Session {
977
1033
  });
978
1034
 
979
1035
  if (res.usage !== undefined) {
980
- this.lastUsage = res.usage;
1036
+ const ended = performance.now();
1037
+ const genMs = firstTokenAt > 0 ? ended - firstTokenAt : ended - callStart;
1038
+ const tps = genMs > 0 ? (res.usage.completionTokens / genMs) * 1000 : 0;
1039
+
1040
+ this.recordUsage(res.usage, genMs);
981
1041
  // Logged (not shown) so the --log analyzer can compute tokens-to-solution.
982
1042
  // `thinking` records THIS call's mode, so malformed-call rates can be
983
1043
  // correlated with it (analyze-malformed).
984
1044
  report({
985
1045
  kind: "usage",
986
1046
  task: SESSION_ID,
987
- message: `tokens ${res.usage.promptTokens} in / ${res.usage.completionTokens} out`,
1047
+ message: `tokens ${res.usage.promptTokens} in / ${res.usage.completionTokens} out · ${Math.round(tps)} tok/s`,
988
1048
  promptTokens: res.usage.promptTokens,
989
1049
  completionTokens: res.usage.completionTokens,
990
1050
  totalTokens: res.usage.totalTokens,
1051
+ tokensPerSecond: Math.round(tps),
1052
+ ms: Math.round(genMs),
991
1053
  ...(enableThinking === undefined ? {} : { thinking: enableThinking }),
992
1054
  });
993
1055
  }
@@ -71,6 +71,10 @@ export function renderStatus(
71
71
  );
72
72
  }
73
73
 
74
+ if (info.tokensPerSecond !== undefined && info.tokensPerSecond > 0) {
75
+ bits.push(`${info.tokensPerSecond} tok/s`);
76
+ }
77
+
74
78
  bits.push(info.status, info.scope);
75
79
 
76
80
  return `${paint(` ⎯ ${bits.join(" · ")}`, STYLE.dim, color)}\n`;
@@ -18,4 +18,7 @@ export interface IStatusInfo {
18
18
  status: string;
19
19
  /** Editable scope label. */
20
20
  scope: string;
21
+ /** Output generation rate of the last model call (tokens/second); omitted or
22
+ * 0 before the first call. */
23
+ tokensPerSecond?: number;
21
24
  }