@agjs/tsforge 0.1.11 → 0.1.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/eval/index.ts +9 -0
- package/src/eval/metrics.ts +87 -0
- package/src/eval/report.ts +168 -0
package/package.json
CHANGED
package/src/eval/index.ts
CHANGED
|
@@ -1,3 +1,12 @@
|
|
|
1
1
|
export * from "./eval.types";
|
|
2
2
|
export { judge } from "./judge";
|
|
3
3
|
export { summarize } from "./score";
|
|
4
|
+
export { analyzeEvents, type IRunMetrics } from "./metrics";
|
|
5
|
+
export {
|
|
6
|
+
buildSweepReport,
|
|
7
|
+
renderSweepReportMarkdown,
|
|
8
|
+
wilsonInterval,
|
|
9
|
+
twoProportionZ,
|
|
10
|
+
type ISweepReport,
|
|
11
|
+
type IVariantReport,
|
|
12
|
+
} from "./report";
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
import type { ILoopEvent } from "../loop/loop.types";
|
|
2
|
+
|
|
3
|
+
/**
 * Behavioral metrics distilled from one run's event stream. These are the
 * signals the local-model literature links to outcomes (tokens-to-solution,
 * repair iterations, peak context) rather than subjective impressions. A
 * reusable, pure counterpart to the cli-metrics script.
 */
export interface IRunMetrics {
  /** Terminal status of the run; `none` when the stream held neither a `done`
   * nor a `stuck` event. */
  finalStatus: "done" | "stuck" | "none";
  /** Number of model turns, counted as one per `cycle` event. */
  turns: number;
  /** Number of model calls, counted as one per `usage` event. */
  modelCalls: number;
  /** Completion tokens generated, summed over the whole run. */
  tokensOut: number;
  /** Highest prompt-token count observed: the run's context high-water mark. */
  peakContext: number;
  /** Number of file mutations (`edit` and `create` events together). */
  edits: number;
  /** Number of distinct files created. */
  filesCreated: number;
  /** Number of gate runs, counted as one per `validated` event. */
  gateRuns: number;
  /** Turn wall-clock time summed from `timing` events, expressed in seconds. */
  wallClockSeconds: number;
  /** Mean output rate in tokens/second, over the calls that reported a rate. */
  avgTokensPerSecond: number;
}
|
|
28
|
+
|
|
29
|
+
/** Build a zeroed metrics record: the starting point of the reduction, and the
 * result returned for an empty event stream. */
function emptyMetrics(): IRunMetrics {
  return {
    finalStatus: "none",
    turns: 0,
    modelCalls: 0,
    tokensOut: 0,
    peakContext: 0,
    edits: 0,
    filesCreated: 0,
    gateRuns: 0,
    wallClockSeconds: 0,
    avgTokensPerSecond: 0,
  };
}

/**
 * Reduce a run's event stream to its behavioral metrics. Pure — feed it the
 * events from a `--log` JSONL or a captured `onEvent` stream.
 *
 * Event kinds other than `cycle`, `usage`, `create`, `edit`, `timing`,
 * `validated`, `done` and `stuck` are ignored. When both `done` and `stuck`
 * occur, the one that appears last in the stream determines `finalStatus`.
 *
 * @param events - The run's events, in emission order. Not mutated.
 * @returns The aggregated metrics; all counters are zero for an empty stream.
 */
export function analyzeEvents(events: readonly ILoopEvent[]): IRunMetrics {
  const m = emptyMetrics();
  const created = new Set<string>();
  let tpsSum = 0;
  let tpsCount = 0;
  // Accumulate raw milliseconds and convert once at the end. Rounding each
  // turn to whole seconds first would drop every sub-500 ms turn and let the
  // rounding error grow with the number of turns.
  let wallClockMs = 0;

  for (const event of events) {
    switch (event.kind) {
      case "cycle":
        m.turns += 1;
        break;

      case "usage":
        m.modelCalls += 1;
        m.tokensOut += event.completionTokens ?? 0;
        m.peakContext = Math.max(m.peakContext, event.promptTokens ?? 0);

        // Only calls that actually reported a positive rate enter the mean.
        if (event.tokensPerSecond !== undefined && event.tokensPerSecond > 0) {
          tpsSum += event.tokensPerSecond;
          tpsCount += 1;
        }

        break;

      case "create":
        m.edits += 1;

        // A Set keeps repeated creations of the same path from double counting.
        if (event.file !== undefined && event.file.length > 0) {
          created.add(event.file);
        }

        break;

      case "edit":
        m.edits += 1;
        break;

      case "timing":
        wallClockMs += event.ms ?? 0;
        break;

      case "validated":
        m.gateRuns += 1;
        break;

      case "done":
        m.finalStatus = "done";
        break;

      case "stuck":
        m.finalStatus = "stuck";
        break;

      default:
        // Remaining event kinds carry no metric.
        break;
    }
  }

  m.filesCreated = created.size;
  m.wallClockSeconds = Math.round(wallClockMs / 1000);
  m.avgTokensPerSecond = tpsCount > 0 ? Math.round(tpsSum / tpsCount) : 0;

  return m;
}
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
import type { IRunRecord, IVariantSummary } from "./eval.types";
|
|
2
|
+
import { summarize } from "./score";
|
|
3
|
+
|
|
4
|
+
/**
 * Two-sided 95% standard-normal quantile. Serves both as the multiplier in the
 * Wilson interval and as the |z| cut-off for the significance test.
 */
const Z95 = 1.96;
|
|
6
|
+
|
|
7
|
+
/**
 * A variant summary plus its statistics: a confidence interval for the pass
 * rate and, if a baseline was given, a significance test of the pass-rate
 * difference from that baseline.
 */
export interface IVariantReport extends IVariantSummary {
  /** 95% Wilson score interval for the pass rate: `[low, high]`, each bound
   * within [0, 1]. */
  readonly passRateCI: readonly [number, number];
  /** Comparison against the baseline variant. Omitted on the baseline itself
   * and whenever no baseline was supplied. */
  readonly vsBaseline?: {
    /** This variant's pass rate minus the baseline's pass rate. */
    readonly deltaPassRate: number;
    /** Two-proportion z statistic of this variant against the baseline. */
    readonly z: number;
    /** True when |z| > 1.96 (p < 0.05, two-sided). */
    readonly significant: boolean;
  };
}
|
|
21
|
+
|
|
22
|
+
/** The statistical result of a sweep: one report per variant, plus the label
 * of the variant used as the comparison baseline. */
export interface ISweepReport {
  /** Label of the baseline variant; `null` when no variant was matched. */
  readonly baseline: string | null;
  /** Per-variant reports. */
  readonly variants: readonly IVariantReport[];
}
|
|
27
|
+
|
|
28
|
+
/**
 * Compute the 95% Wilson score interval for `passed` successes in `n` trials.
 *
 * @param passed - Number of successful trials.
 * @param n - Total number of trials.
 * @returns `[low, high]`, clamped to [0, 1]; `[0, 0]` when there are no trials.
 */
export function wilsonInterval(passed: number, n: number): [number, number] {
  // Without trials there is no rate to bound.
  if (n === 0) {
    return [0, 0];
  }

  const zSquared = Z95 * Z95;
  const observed = passed / n;

  // Wilson's formula: a midpoint shifted toward 1/2, a half-width, and a
  // common scale factor dividing both.
  const scale = 1 + zSquared / n;
  const midpoint = observed + zSquared / (2 * n);
  const halfWidth =
    Z95 * Math.sqrt((observed * (1 - observed)) / n + zSquared / (4 * n * n));

  const lower = (midpoint - halfWidth) / scale;
  const upper = (midpoint + halfWidth) / scale;

  // Clamp away floating-point overshoot at the extremes.
  return [Math.max(0, lower), Math.min(1, upper)];
}
|
|
45
|
+
|
|
46
|
+
/**
 * Pooled two-proportion z-statistic comparing rate1 (x1/n1) to rate2 (x2/n2).
 *
 * Returns 0 whenever the statistic is undefined: a sample with no trials, a
 * pooled rate of exactly 0 or 1 (zero standard error), or inconsistent counts
 * (for example more successes than trials, or a negative or NaN trial count)
 * that would otherwise produce NaN.
 *
 * @param x1 - Successes in the first sample.
 * @param n1 - Trials in the first sample.
 * @param x2 - Successes in the second sample.
 * @param n2 - Trials in the second sample.
 * @returns The z statistic; positive when rate1 exceeds rate2.
 */
export function twoProportionZ(
  x1: number,
  n1: number,
  x2: number,
  n2: number
): number {
  // `!(n > 0)` rejects zero, negative and NaN trial counts in one comparison.
  if (!(n1 > 0) || !(n2 > 0)) {
    return 0;
  }

  const pooled = (x1 + x2) / (n1 + n2);
  const se = Math.sqrt(pooled * (1 - pooled) * (1 / n1 + 1 / n2));

  // se is 0 when every run passed or every run failed, and NaN when the
  // pooled rate falls outside [0, 1]; neither yields a usable statistic.
  if (!(se > 0)) {
    return 0;
  }

  return (x1 / n1 - x2 / n2) / se;
}
|
|
66
|
+
|
|
67
|
+
/**
 * Compare one variant's pass rate with the baseline's.
 *
 * @param variant - Summary of the variant under test.
 * @param baseline - Summary of the baseline variant.
 * @returns The pass-rate difference (variant minus baseline), the
 *   two-proportion z statistic, and whether |z| exceeds the 95% threshold.
 */
function compareToBaseline(
  variant: IVariantSummary,
  baseline: IVariantSummary
): IVariantReport["vsBaseline"] {
  const { passed: variantPassed, runs: variantRuns } = variant;
  const { passed: baselinePassed, runs: baselineRuns } = baseline;

  const statistic = twoProportionZ(
    variantPassed,
    variantRuns,
    baselinePassed,
    baselineRuns
  );
  const beyondThreshold = Math.abs(statistic) > Z95;

  return {
    deltaPassRate: variant.passRate - baseline.passRate,
    z: statistic,
    significant: beyondThreshold,
  };
}
|
|
84
|
+
|
|
85
|
+
/**
 * Turn raw run records into a statistical sweep report.
 *
 * Every variant gets its pass rate with a 95% Wilson interval. When
 * `baselineLabel` names one of the variants, every other variant additionally
 * gets a two-proportion significance test against it.
 *
 * @param records - The raw run records; the array is copied before summarizing.
 * @param baselineLabel - Label of the variant to compare the others against.
 * @returns The report; `baseline` is `null` when no variant matched the label.
 */
export function buildSweepReport(
  records: readonly IRunRecord[],
  baselineLabel?: string
): ISweepReport {
  const summaries = summarize([...records]);
  const baseline =
    baselineLabel === undefined
      ? undefined
      : summaries.find((candidate) => candidate.label === baselineLabel);

  const variants: IVariantReport[] = [];

  for (const summary of summaries) {
    const passRateCI = wilsonInterval(summary.passed, summary.runs);

    // Only non-baseline variants carry a comparison, and only when a baseline
    // was actually found.
    if (baseline !== undefined && baseline.label !== summary.label) {
      variants.push({
        ...summary,
        passRateCI,
        vsBaseline: compareToBaseline(summary, baseline),
      });
    } else {
      variants.push({ ...summary, passRateCI });
    }
  }

  return { baseline: baseline?.label ?? null, variants };
}
|
|
117
|
+
|
|
118
|
+
/** Format a fraction as a whole-number percentage, e.g. `0.126` becomes `"13%"`. */
function pct(value: number): string {
  const wholePercent = Math.round(value * 100);

  return String(wholePercent) + "%";
}
|
|
121
|
+
|
|
122
|
+
/**
 * Produce the text of the "vs baseline" table cell for one variant.
 *
 * @param report - The variant being rendered.
 * @param baseline - Label of the baseline variant, or `null` if there is none.
 * @returns `"—"` when there is nothing to compare, `"baseline"` for the
 *   baseline's own row, otherwise the signed pass-rate delta with its z value
 *   and a trailing ` *` when the difference is significant.
 */
function baselineCell(report: IVariantReport, baseline: string | null): string {
  if (baseline === null) {
    return "—";
  }

  if (report.label === baseline) {
    return "baseline";
  }

  const comparison = report.vsBaseline;

  if (comparison === undefined) {
    return "—";
  }

  const { deltaPassRate, z, significant } = comparison;

  // Negative deltas already carry their own minus sign from pct().
  const pieces = [
    deltaPassRate >= 0 ? "+" : "",
    pct(deltaPassRate),
    ` (z=${z.toFixed(2)})`,
    significant ? " *" : "",
  ];

  return pieces.join("");
}
|
|
142
|
+
|
|
143
|
+
/** Make free text safe inside a single Markdown table cell: line breaks would
 * end the row and an unescaped `|` would start a new column. */
function escapeTableCell(text: string): string {
  return text.replace(/\r\n|\r|\n/g, " ").replace(/\|/g, "\\|");
}

/**
 * Render a sweep report as a Markdown table. `*` marks a significant difference
 * (p < 0.05) from the baseline.
 *
 * Variant labels are escaped so that a label containing `|` or a line break
 * cannot break the table layout.
 *
 * @param report - The sweep report to render.
 * @returns A Markdown document: heading, table, and a legend for the `*` mark.
 */
export function renderSweepReportMarkdown(report: ISweepReport): string {
  const header =
    "| Variant | Runs | Pass | 95% CI | Cycles | Ms | Quality | vs baseline |\n" +
    "| --- | --- | --- | --- | --- | --- | --- | --- |";

  const rows = report.variants.map((v) => {
    const ci = `${pct(v.passRateCI[0])}–${pct(v.passRateCI[1])}`;
    const label = escapeTableCell(v.label);

    return (
      `| ${label} | ${String(v.runs)} | ${pct(v.passRate)} | ${ci} | ` +
      `${v.avgCycles.toFixed(1)} | ${String(Math.round(v.avgMs))} | ` +
      `${v.avgQuality.toFixed(1)} | ${baselineCell(v, report.baseline)} |`
    );
  });

  return [
    "## A/B sweep report",
    "",
    header,
    ...rows,
    "",
    "`*` = significant at p < 0.05 (two-proportion z-test vs baseline).",
  ].join("\n");
}
|