@agjs/tsforge 0.1.11 → 0.1.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@agjs/tsforge",
3
3
  "type": "module",
4
- "version": "0.1.11",
4
+ "version": "0.1.12",
5
5
  "license": "MIT",
6
6
  "description": "TypeScript coding harness with a deterministic gate, stack-aware guardrails, and stream-level correction.",
7
7
  "repository": {
package/src/eval/index.ts CHANGED
@@ -1,3 +1,12 @@
1
1
  export * from "./eval.types";
2
2
  export { judge } from "./judge";
3
3
  export { summarize } from "./score";
4
+ export { analyzeEvents, type IRunMetrics } from "./metrics";
5
+ export {
6
+ buildSweepReport,
7
+ renderSweepReportMarkdown,
8
+ wilsonInterval,
9
+ twoProportionZ,
10
+ type ISweepReport,
11
+ type IVariantReport,
12
+ } from "./report";
@@ -0,0 +1,87 @@
1
+ import type { ILoopEvent } from "../loop/loop.types";
2
+
3
/**
 * Behavioral metrics distilled from a single run's event stream: the
 * measurable signals (tokens-to-solution, repair iterations, peak context)
 * that the local-model literature ties to outcomes, as opposed to subjective
 * impressions. A reusable, pure counterpart to the cli-metrics script.
 */
export interface IRunMetrics {
  /** Outcome taken from the last `done` / `stuck` event; `"none"` if neither was seen. */
  finalStatus: "done" | "stuck" | "none";
  /** Number of model turns: one per `cycle` event. */
  turns: number;
  /** Number of model calls: one per `usage` event. */
  modelCalls: number;
  /** Completion tokens generated, summed over all `usage` events. */
  tokensOut: number;
  /** Context high-water mark: the largest prompt-token count on any `usage` event. */
  peakContext: number;
  /** File mutations: `edit` events plus `create` events. */
  edits: number;
  /** Count of distinct files named by `create` events. */
  filesCreated: number;
  /** Gate runs: one per `validated` event. */
  gateRuns: number;
  /** Turn wall-clock time accumulated from `timing` events, in whole seconds. */
  wallClockSeconds: number;
  /** Mean output rate (tokens/second) over the calls that reported a positive rate. */
  avgTokensPerSecond: number;
}
28
+
29
/**
 * Build a fresh, zeroed metrics record. A new object is returned on every
 * call so the caller can mutate it freely while accumulating.
 */
function emptyMetrics(): IRunMetrics {
  const zeroedCounters = {
    turns: 0,
    modelCalls: 0,
    tokensOut: 0,
    peakContext: 0,
    edits: 0,
    filesCreated: 0,
    gateRuns: 0,
    wallClockSeconds: 0,
    avgTokensPerSecond: 0,
  };

  // No terminal event has been observed yet, hence "none".
  return { finalStatus: "none", ...zeroedCounters };
}
43
+
44
/**
 * Reduce a run's event stream to its behavioral metrics. Pure: the input is
 * only read, and a fresh metrics object is returned.
 *
 * Wall-clock time is accumulated in milliseconds and converted to seconds
 * once at the end. Rounding each `timing` event separately would drop every
 * sub-500 ms turn to zero and let the error grow with the number of turns.
 *
 * @param events Events from a `--log` JSONL or a captured `onEvent` stream.
 * @returns Aggregated counters; all zero with `finalStatus: "none"` for an
 *   empty stream.
 */
export function analyzeEvents(events: readonly ILoopEvent[]): IRunMetrics {
  const m = emptyMetrics();
  const created = new Set<string>();
  let tpsSum = 0;
  let tpsCount = 0;
  let wallClockMs = 0;

  for (const event of events) {
    switch (event.kind) {
      case "cycle":
        m.turns += 1;
        break;

      case "usage":
        m.modelCalls += 1;
        m.tokensOut += event.completionTokens ?? 0;
        m.peakContext = Math.max(m.peakContext, event.promptTokens ?? 0);

        // Only calls that reported a positive rate take part in the mean.
        if (event.tokensPerSecond !== undefined && event.tokensPerSecond > 0) {
          tpsSum += event.tokensPerSecond;
          tpsCount += 1;
        }
        break;

      case "create":
        m.edits += 1;

        // The set de-duplicates repeated creates of the same file.
        if (event.file !== undefined && event.file.length > 0) {
          created.add(event.file);
        }
        break;

      case "edit":
        m.edits += 1;
        break;

      case "timing":
        wallClockMs += event.ms ?? 0;
        break;

      case "validated":
        m.gateRuns += 1;
        break;

      case "done":
        // The last terminal event in the stream wins.
        m.finalStatus = "done";
        break;

      case "stuck":
        m.finalStatus = "stuck";
        break;

      default:
        // Other event kinds carry no metric.
        break;
    }
  }

  m.filesCreated = created.size;
  m.wallClockSeconds = Math.round(wallClockMs / 1000);
  m.avgTokensPerSecond = tpsCount > 0 ? Math.round(tpsSum / tpsCount) : 0;

  return m;
}
@@ -0,0 +1,168 @@
1
+ import type { IRunRecord, IVariantSummary } from "./eval.types";
2
+ import { summarize } from "./score";
3
+
4
/**
 * Two-sided 95% critical value of the standard normal distribution (rounded
 * to two decimals). Serves both as the Wilson-interval multiplier and as the
 * significance threshold for the z-test.
 */
const Z95 = 1.96;
6
+
7
/**
 * A variant summary extended with statistical context: a confidence interval
 * for its pass rate and, when a baseline is available, a significance test of
 * how its pass rate differs from that baseline.
 */
export interface IVariantReport extends IVariantSummary {
  /** 95% Wilson score interval for the pass rate: `[low, high]`, both within [0, 1]. */
  readonly passRateCI: readonly [number, number];
  /**
   * Comparison against the baseline variant. Omitted on the baseline's own
   * entry and whenever no baseline was supplied.
   */
  readonly vsBaseline?: {
    /** This variant's pass rate minus the baseline's pass rate. */
    readonly deltaPassRate: number;
    /** Two-proportion z-statistic of this variant against the baseline. */
    readonly z: number;
    /** True when |z| > 1.96 (p < 0.05, two-sided). */
    readonly significant: boolean;
  };
}
21
+
22
/** Result of aggregating a sweep: one enriched report per variant. */
export interface ISweepReport {
  /** Label of the variant used as baseline; `null` when no variant matched. */
  readonly baseline: string | null;
  /** Per-variant reports. */
  readonly variants: readonly IVariantReport[];
}
27
+
28
/**
 * 95% Wilson score interval for `passed` successes out of `n` trials.
 *
 * Degenerate input is handled explicitly instead of leaking NaN:
 * - a non-positive or NaN `n` (no usable trials) or a NaN `passed` yields `[0, 0]`;
 * - `passed` outside `[0, n]` is clamped, so the proportion stays in [0, 1].
 *
 * The mathematically exact endpoints (lower bound 0 when nothing passed,
 * upper bound 1 when everything passed) are returned exactly rather than with
 * floating-point residue.
 *
 * @param passed Number of successes.
 * @param n Number of trials.
 * @returns `[low, high]`, with `0 <= low <= high <= 1`.
 */
export function wilsonInterval(passed: number, n: number): [number, number] {
  // `!(n > 0)` is deliberately not `n <= 0`: it also rejects NaN.
  if (!(n > 0) || Number.isNaN(passed)) {
    return [0, 0];
  }

  const phat = Math.min(1, Math.max(0, passed / n));
  const z2 = Z95 * Z95;
  const denom = 1 + z2 / n;
  const centre = phat + z2 / (2 * n);
  const margin = Z95 * Math.sqrt((phat * (1 - phat)) / n + z2 / (4 * n * n));

  // At phat = 0 the centre and margin cancel exactly in real arithmetic, and
  // at phat = 1 their sum equals the denominator; pin both ends.
  const low = phat === 0 ? 0 : Math.max(0, (centre - margin) / denom);
  const high = phat === 1 ? 1 : Math.min(1, (centre + margin) / denom);

  return [low, high];
}
45
+
46
/**
 * Pooled two-proportion z-statistic for the difference between rate 1
 * (`x1 / n1`) and rate 2 (`x2 / n2`).
 *
 * @returns The z-statistic (positive when rate 1 is higher), or 0 when either
 *   sample is empty or the pooled standard error is zero.
 */
export function twoProportionZ(
  x1: number,
  n1: number,
  x2: number,
  n2: number
): number {
  const bothSampled = n1 !== 0 && n2 !== 0;

  if (!bothSampled) {
    return 0;
  }

  // Standard error under the null hypothesis that both groups share one rate.
  const pooled = (x1 + x2) / (n1 + n2);
  const se = Math.sqrt(pooled * (1 - pooled) * (1 / n1 + 1 / n2));

  // A pooled rate of exactly 0 or 1 leaves no variance to test against.
  return se === 0 ? 0 : (x1 / n1 - x2 / n2) / se;
}
66
+
67
/**
 * Compare one variant's pass rate with the baseline's: the raw difference,
 * the two-proportion z-statistic, and whether |z| exceeds the 95% threshold.
 */
function compareToBaseline(
  variant: IVariantSummary,
  baseline: IVariantSummary
): IVariantReport["vsBaseline"] {
  const { passed: variantPassed, runs: variantRuns } = variant;
  const { passed: baselinePassed, runs: baselineRuns } = baseline;
  const z = twoProportionZ(
    variantPassed,
    variantRuns,
    baselinePassed,
    baselineRuns
  );
  const deltaPassRate = variant.passRate - baseline.passRate;
  const significant = Math.abs(z) > Z95;

  return { deltaPassRate, z, significant };
}
84
+
85
/**
 * Turn raw run records into a statistical report. Every variant gets its pass
 * rate with a 95% Wilson interval; if `baselineLabel` names one of the
 * variants, every other variant is additionally tested against it with a
 * two-proportion z-test.
 *
 * @param records Raw run records; a copy is handed to `summarize`.
 * @param baselineLabel Optional label of the variant to compare against.
 * @returns The report; `baseline` is `null` when no label was given or none
 *   matched.
 */
export function buildSweepReport(
  records: readonly IRunRecord[],
  baselineLabel?: string
): ISweepReport {
  const summaries = summarize([...records]);
  const baseline =
    baselineLabel === undefined
      ? undefined
      : summaries.find((candidate) => candidate.label === baselineLabel);

  const variants: IVariantReport[] = [];

  for (const summary of summaries) {
    const withInterval = {
      ...summary,
      passRateCI: wilsonInterval(summary.passed, summary.runs),
    };

    // Only non-baseline variants receive a comparison, and only when a
    // baseline exists; otherwise the `vsBaseline` key is left off entirely.
    if (baseline !== undefined && baseline.label !== summary.label) {
      variants.push({
        ...withInterval,
        vsBaseline: compareToBaseline(summary, baseline),
      });
    } else {
      variants.push(withInterval);
    }
  }

  return { baseline: baseline?.label ?? null, variants };
}
117
+
118
/** Format a fraction as a whole-number percentage, e.g. `0.5` becomes `"50%"`. */
function pct(value: number): string {
  const wholePercent = Math.round(value * 100);

  return String(wholePercent) + "%";
}
121
+
122
/**
 * Text for the "vs baseline" column of one row: `"baseline"` for the baseline
 * variant itself, `"—"` when there is nothing to compare against, otherwise
 * the signed pass-rate delta with its z-statistic and a trailing ` *` when
 * the difference is significant.
 */
function baselineCell(report: IVariantReport, baseline: string | null): string {
  if (baseline !== null && report.label === baseline) {
    return "baseline";
  }

  const comparison = report.vsBaseline;

  if (baseline === null || comparison === undefined) {
    return "—";
  }

  // Negative deltas already carry their own minus sign from the number.
  const prefix = comparison.deltaPassRate < 0 ? "" : "+";
  const suffix = comparison.significant ? " *" : "";
  const delta = pct(comparison.deltaPassRate);
  const zText = comparison.z.toFixed(2);

  return `${prefix}${delta} (z=${zText})${suffix}`;
}
142
+
143
/**
 * Render a sweep report as a Markdown document: a heading, one table row per
 * variant, and a footnote. A trailing `*` in the last column marks a
 * significant difference (p < 0.05) from the baseline.
 */
export function renderSweepReportMarkdown(report: ISweepReport): string {
  const titleRow =
    "| Variant | Runs | Pass | 95% CI | Cycles | Ms | Quality | vs baseline |";
  const dividerRow = "| --- | --- | --- | --- | --- | --- | --- | --- |";

  const lines: string[] = ["## A/B sweep report", "", titleRow, dividerRow];

  for (const variant of report.variants) {
    const [low, high] = variant.passRateCI;
    const cells = [
      variant.label,
      String(variant.runs),
      pct(variant.passRate),
      `${pct(low)}–${pct(high)}`,
      variant.avgCycles.toFixed(1),
      String(Math.round(variant.avgMs)),
      variant.avgQuality.toFixed(1),
      baselineCell(variant, report.baseline),
    ];

    lines.push(`| ${cells.join(" | ")} |`);
  }

  lines.push(
    "",
    "`*` = significant at p < 0.05 (two-proportion z-test vs baseline)."
  );

  return lines.join("\n");
}