@tangle-network/agent-eval 0.21.0 → 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. package/CHANGELOG.md +102 -1
  2. package/README.md +4 -0
  3. package/dist/{chunk-WOK2RTWG.js → chunk-4W4NCYM2.js} +134 -109
  4. package/dist/chunk-4W4NCYM2.js.map +1 -0
  5. package/dist/{chunk-WOPGKVN4.js → chunk-6KQG5HAH.js} +2 -2
  6. package/dist/chunk-6M774GY6.js +53 -0
  7. package/dist/chunk-6M774GY6.js.map +1 -0
  8. package/dist/{chunk-3IX6QTB7.js → chunk-IOXMGMHQ.js} +418 -541
  9. package/dist/chunk-IOXMGMHQ.js.map +1 -0
  10. package/dist/{chunk-3GN6U53I.js → chunk-KAO3Q65R.js} +2 -2
  11. package/dist/chunk-QUKKGHTZ.js +121 -0
  12. package/dist/chunk-QUKKGHTZ.js.map +1 -0
  13. package/dist/{chunk-SNUHRBDL.js → chunk-SQQLHODJ.js} +10 -1
  14. package/dist/{chunk-SNUHRBDL.js.map → chunk-SQQLHODJ.js.map} +1 -1
  15. package/dist/chunk-UAND2LOT.js +738 -0
  16. package/dist/chunk-UAND2LOT.js.map +1 -0
  17. package/dist/{chunk-HRZELXCR.js → chunk-USHQBPMH.js} +283 -7
  18. package/dist/chunk-USHQBPMH.js.map +1 -0
  19. package/dist/cli.js +3 -3
  20. package/dist/index.d.ts +10 -284
  21. package/dist/index.js +39 -19
  22. package/dist/index.js.map +1 -1
  23. package/dist/integrity-K2oVlF57.d.ts +210 -0
  24. package/dist/openapi.json +1 -1
  25. package/dist/optimization-UVDNKaO6.d.ts +574 -0
  26. package/dist/optimization.d.ts +6 -144
  27. package/dist/optimization.js +9 -2
  28. package/dist/reporting-B82RSv9C.d.ts +593 -0
  29. package/dist/reporting.d.ts +2 -2
  30. package/dist/reporting.js +15 -8
  31. package/dist/{multi-shot-optimization-Bvtz294B.d.ts → summary-report-D4p7RlDu.d.ts} +381 -1
  32. package/dist/traces.d.ts +101 -181
  33. package/dist/traces.js +16 -5
  34. package/dist/wire/index.js +3 -3
  35. package/docs/research-report-methodology.md +19 -4
  36. package/docs/wire-protocol.md +1 -1
  37. package/package.json +2 -2
  38. package/dist/chunk-3IX6QTB7.js.map +0 -1
  39. package/dist/chunk-HRZELXCR.js.map +0 -1
  40. package/dist/chunk-KRR4VMH7.js +0 -423
  41. package/dist/chunk-KRR4VMH7.js.map +0 -1
  42. package/dist/chunk-WOK2RTWG.js.map +0 -1
  43. package/dist/reporting-Da2ihlcM.d.ts +0 -672
  44. /package/dist/{chunk-WOPGKVN4.js.map → chunk-6KQG5HAH.js.map} +0 -0
  45. /package/dist/{chunk-3GN6U53I.js.map → chunk-KAO3Q65R.js.map} +0 -0
@@ -1,337 +1,409 @@
1
1
  import {
2
- benjaminiHochberg,
3
- cohensD,
4
- confidenceInterval,
5
- pairedBootstrap,
6
- pairedMde,
7
- wilcoxonSignedRank
8
- } from "./chunk-KRR4VMH7.js";
2
+ canonicalize,
3
+ hashJson
4
+ } from "./chunk-6M774GY6.js";
9
5
 
10
- // src/release-confidence.ts
11
- var DEFAULT_THRESHOLDS = {
12
- requireCorpus: true,
13
- minScenarioCount: 1,
14
- minSearchRuns: 1,
15
- minHoldoutRuns: 1,
16
- requireHoldout: true,
17
- minPassRate: 0.8,
18
- minMeanScore: 0.7,
19
- maxOverfitGap: 0.15,
20
- maxMeanCostUsd: Number.POSITIVE_INFINITY,
21
- maxP95WallMs: Number.POSITIVE_INFINITY,
22
- requireAsiForFailures: true,
23
- failureScoreThreshold: 0.5
24
- };
25
- function releaseTraceEvidenceFromMultiShotTrials(trials) {
26
- return trials.map((trial) => ({
27
- scenarioId: trial.scenarioId,
28
- candidateId: trial.variantId,
29
- split: trial.split === "holdout" ? "holdout" : trial.split === "dev" ? "dev" : "search",
30
- score: trial.score,
31
- ok: trial.ok,
32
- turnCount: Array.isArray(trial.trace?.turns) ? trial.trace.turns.length : void 0,
33
- costUsd: trial.cost,
34
- durationMs: trial.durationMs,
35
- failureMode: trial.error ? "runtime_error" : void 0,
36
- asi: trial.asi,
37
- metadata: trial.metadata
38
- }));
6
+ // src/statistics.ts
7
+ var INVERTED_DIMENSIONS = /* @__PURE__ */ new Set([
8
+ "hallucination",
9
+ "false_confidence",
10
+ "worst_failure"
11
+ ]);
12
+ function normalizeScores(scores) {
13
+ return scores.map((s) => {
14
+ if (INVERTED_DIMENSIONS.has(s.dimension)) {
15
+ return s;
16
+ }
17
+ return s;
18
+ });
39
19
  }
40
- function evaluateReleaseConfidence(input) {
41
- const thresholds = { ...DEFAULT_THRESHOLDS, ...input.thresholds };
42
- const candidateId = input.candidateId ?? null;
43
- const runs = filterCandidate(input.runs ?? [], candidateId, input.baselineId);
44
- const traces = filterTraceCandidate(input.traces ?? [], candidateId, input.baselineId);
45
- const scenarios = input.scenarios ?? [];
46
- const scenarioCount = input.dataset?.scenarioCount ?? scenarios.length;
47
- const splitCounts = input.dataset?.splitCounts ?? countScenarioSplits(scenarios);
48
- const searchScores = scoresFor(runs, "search");
49
- const holdoutScores = scoresFor(runs, "holdout");
50
- const allScores = [...searchScores, ...holdoutScores];
51
- const traceScores = traces.map((t) => t.score).filter(isFiniteNumber);
52
- const scoreUniverse = allScores.length > 0 ? allScores : traceScores;
53
- const searchRuns = runs.filter((r) => r.splitTag === "search").length;
54
- const holdoutRuns = runs.filter((r) => r.splitTag === "holdout").length;
55
- const searchMeanScore = mean(searchScores);
56
- const holdoutMeanScore = mean(holdoutScores);
57
- const metrics = {
58
- scenarioCount,
59
- searchRuns,
60
- holdoutRuns,
61
- passRate: passRate(runs, traces, thresholds.failureScoreThreshold),
62
- meanScore: mean(scoreUniverse),
63
- searchMeanScore,
64
- holdoutMeanScore,
65
- overfitGap: safeDiff(searchMeanScore, holdoutMeanScore),
66
- meanCostUsd: mean([...runs.map((r) => r.costUsd), ...traces.map((t) => t.costUsd).filter(isFiniteNumber)]),
67
- p95WallMs: percentile([...runs.map((r) => r.wallMs), ...traces.map((t) => t.durationMs).filter(isFiniteNumber)], 0.95),
68
- failedRows: failedRows(runs, traces, thresholds.failureScoreThreshold).length,
69
- failuresWithAsi: failedRows(runs, traces, thresholds.failureScoreThreshold).filter((row) => row.hasAsi).length,
70
- singleShotTraces: traces.filter((t) => t.turnCount === 1).length,
71
- multiShotTraces: traces.filter((t) => (t.turnCount ?? 0) > 1).length,
72
- splitCounts,
73
- domainCounts: countDomains(scenarios),
74
- failureModeCounts: countFailureModes(runs, traces, thresholds.failureScoreThreshold),
75
- responsibleSurfaceCounts: countResponsibleSurfaces(traces)
76
- };
77
- const issues = [];
78
- checkCorpus(input, thresholds, metrics, issues);
79
- checkQuality(thresholds, metrics, issues);
80
- checkGeneralization(input.gateDecision ?? null, thresholds, metrics, issues);
81
- checkDiagnostics(thresholds, metrics, issues);
82
- checkEfficiency(thresholds, metrics, issues);
83
- const axes = buildAxes(metrics, thresholds, input.gateDecision ?? null, issues);
84
- const status = issues.some((i) => i.severity === "critical") ? "fail" : issues.length > 0 ? "warn" : "pass";
20
+ function weightedMean(scores) {
21
+ if (scores.length === 0) return 0;
22
+ let totalWeight = 0;
23
+ let weightedSum = 0;
24
+ for (const { score, weight } of scores) {
25
+ const w = weight ?? 1;
26
+ weightedSum += score * w;
27
+ totalWeight += w;
28
+ }
29
+ return totalWeight > 0 ? weightedSum / totalWeight : 0;
30
+ }
31
+ function confidenceInterval(scores, confidence = 0.95) {
32
+ if (scores.length === 0) return { mean: 0, lower: 0, upper: 0 };
33
+ if (scores.length === 1) return { mean: scores[0], lower: scores[0], upper: scores[0] };
34
+ const n = scores.length;
35
+ const mean = scores.reduce((a, b) => a + b, 0) / n;
36
+ const B = 1e3;
37
+ const bootstrapMeans = [];
38
+ for (let i = 0; i < B; i++) {
39
+ let sum = 0;
40
+ for (let j = 0; j < n; j++) {
41
+ sum += scores[Math.floor(Math.random() * n)];
42
+ }
43
+ bootstrapMeans.push(sum / n);
44
+ }
45
+ bootstrapMeans.sort((a, b) => a - b);
46
+ const alpha = 1 - confidence;
47
+ const lowerIdx = Math.floor(alpha / 2 * B);
48
+ const upperIdx = Math.floor((1 - alpha / 2) * B) - 1;
85
49
  return {
86
- target: input.target,
87
- candidateId,
88
- baselineId: input.baselineId ?? null,
89
- status,
90
- promote: status === "pass" && (input.gateDecision ? input.gateDecision.promote : true),
91
- axes,
92
- issues,
93
- metrics,
94
- dataset: input.dataset ?? null,
95
- gateDecision: input.gateDecision ?? null,
96
- summary: renderSummary(input.target, status, metrics, issues)
50
+ mean,
51
+ lower: bootstrapMeans[lowerIdx],
52
+ upper: bootstrapMeans[Math.min(upperIdx, B - 1)]
97
53
  };
98
54
  }
99
- function assertReleaseConfidence(input) {
100
- const scorecard = evaluateReleaseConfidence(input);
101
- if (scorecard.status === "fail") {
102
- throw new Error(scorecard.summary);
103
- }
104
- return scorecard;
105
- }
106
- function filterCandidate(runs, candidateId, baselineId) {
107
- if (candidateId) return runs.filter((r) => r.candidateId === candidateId);
108
- if (baselineId) return runs.filter((r) => r.candidateId !== baselineId);
109
- return [...runs];
110
- }
111
- function filterTraceCandidate(traces, candidateId, baselineId) {
112
- if (candidateId) return traces.filter((t) => t.candidateId === void 0 || t.candidateId === candidateId);
113
- if (baselineId) return traces.filter((t) => t.candidateId === void 0 || t.candidateId !== baselineId);
114
- return [...traces];
115
- }
116
- function checkCorpus(input, thresholds, metrics, issues) {
117
- if (thresholds.requireCorpus && !input.dataset && (input.scenarios?.length ?? 0) === 0) {
118
- issues.push({ axis: "corpus", severity: "critical", code: "missing_corpus", detail: "No Dataset manifest or scenarios supplied." });
119
- }
120
- if (metrics.scenarioCount < thresholds.minScenarioCount) {
121
- issues.push({ axis: "corpus", severity: "critical", code: "few_scenarios", detail: `${metrics.scenarioCount} scenario(s) < min ${thresholds.minScenarioCount}.` });
122
- }
123
- if (thresholds.requireHoldout && metrics.splitCounts.holdout === 0) {
124
- issues.push({ axis: "corpus", severity: "critical", code: "missing_holdout_split", detail: "Corpus has no holdout scenarios." });
125
- }
126
- }
127
- function checkQuality(thresholds, metrics, issues) {
128
- if (metrics.searchRuns < thresholds.minSearchRuns) {
129
- issues.push({ axis: "quality", severity: "critical", code: "few_search_runs", detail: `${metrics.searchRuns} search run(s) < min ${thresholds.minSearchRuns}.` });
130
- }
131
- if (metrics.passRate < thresholds.minPassRate) {
132
- issues.push({ axis: "quality", severity: "critical", code: "low_pass_rate", detail: `passRate ${fmt(metrics.passRate)} < ${fmt(thresholds.minPassRate)}.` });
133
- }
134
- if (metrics.meanScore < thresholds.minMeanScore) {
135
- issues.push({ axis: "quality", severity: "critical", code: "low_mean_score", detail: `meanScore ${fmt(metrics.meanScore)} < ${fmt(thresholds.minMeanScore)}.` });
136
- }
137
- }
138
- function checkGeneralization(gateDecision, thresholds, metrics, issues) {
139
- if (thresholds.requireHoldout && metrics.holdoutRuns < thresholds.minHoldoutRuns) {
140
- issues.push({ axis: "generalization", severity: "critical", code: "few_holdout_runs", detail: `${metrics.holdoutRuns} holdout run(s) < min ${thresholds.minHoldoutRuns}.` });
141
- }
142
- if (Number.isFinite(metrics.overfitGap) && metrics.overfitGap > thresholds.maxOverfitGap) {
143
- issues.push({ axis: "generalization", severity: "critical", code: "overfit_gap", detail: `search-holdout gap ${fmt(metrics.overfitGap)} > ${fmt(thresholds.maxOverfitGap)}.` });
144
- }
145
- if (gateDecision && !gateDecision.promote) {
146
- issues.push({ axis: "generalization", severity: "critical", code: `gate_${gateDecision.rejectionCode ?? "reject"}`, detail: gateDecision.reason });
55
+ function interRaterReliability(judgeScores) {
56
+ if (judgeScores.length < 2) return 1;
57
+ const dimensionMap = /* @__PURE__ */ new Map();
58
+ for (const judgeSet of judgeScores) {
59
+ for (const s of judgeSet) {
60
+ if (!dimensionMap.has(s.dimension)) dimensionMap.set(s.dimension, []);
61
+ const arr = dimensionMap.get(s.dimension);
62
+ if (arr.length === 0 || arr[arr.length - 1].length >= judgeScores.length) {
63
+ arr.push([s.score]);
64
+ } else {
65
+ arr[arr.length - 1].push(s.score);
66
+ }
67
+ }
147
68
  }
148
- }
149
- function checkDiagnostics(thresholds, metrics, issues) {
150
- if (!thresholds.requireAsiForFailures) return;
151
- if (metrics.failedRows > metrics.failuresWithAsi) {
152
- issues.push({
153
- axis: "diagnostics",
154
- severity: "critical",
155
- code: "missing_failure_asi",
156
- detail: `${metrics.failedRows - metrics.failuresWithAsi} failed row(s) have no actionable side information.`
157
- });
69
+ const allValues = [];
70
+ const pairDiffs = [];
71
+ for (const items of dimensionMap.values()) {
72
+ for (const ratings of items) {
73
+ if (ratings.length < 2) continue;
74
+ for (const v of ratings) allValues.push(v);
75
+ for (let i = 0; i < ratings.length; i++) {
76
+ for (let j = i + 1; j < ratings.length; j++) {
77
+ pairDiffs.push((ratings[i] - ratings[j]) ** 2);
78
+ }
79
+ }
80
+ }
158
81
  }
159
- }
160
- function checkEfficiency(thresholds, metrics, issues) {
161
- if (metrics.meanCostUsd > thresholds.maxMeanCostUsd) {
162
- issues.push({ axis: "efficiency", severity: "critical", code: "cost_budget", detail: `meanCostUsd ${fmt(metrics.meanCostUsd)} > ${fmt(thresholds.maxMeanCostUsd)}.` });
82
+ if (pairDiffs.length === 0 || allValues.length < 2) return 1;
83
+ const observedDisagreement = pairDiffs.reduce((a, b) => a + b, 0) / pairDiffs.length;
84
+ let expectedDisagreement = 0;
85
+ let expectedCount = 0;
86
+ for (let i = 0; i < allValues.length; i++) {
87
+ for (let j = i + 1; j < allValues.length; j++) {
88
+ expectedDisagreement += (allValues[i] - allValues[j]) ** 2;
89
+ expectedCount++;
90
+ }
163
91
  }
164
- if (metrics.p95WallMs > thresholds.maxP95WallMs) {
165
- issues.push({ axis: "efficiency", severity: "critical", code: "latency_budget", detail: `p95WallMs ${fmt(metrics.p95WallMs)} > ${fmt(thresholds.maxP95WallMs)}.` });
92
+ expectedDisagreement = expectedCount > 0 ? expectedDisagreement / expectedCount : 0;
93
+ if (expectedDisagreement === 0) return 1;
94
+ return 1 - observedDisagreement / expectedDisagreement;
95
+ }
96
+ function mannWhitneyU(a, b) {
97
+ if (a.length === 0 || b.length === 0) return { u: 0, p: 1 };
98
+ const n1 = a.length;
99
+ const n2 = b.length;
100
+ const combined = [
101
+ ...a.map((v) => ({ v, group: "a" })),
102
+ ...b.map((v) => ({ v, group: "b" }))
103
+ ].sort((x, y) => x.v - y.v);
104
+ const ranks = new Array(combined.length);
105
+ let i = 0;
106
+ while (i < combined.length) {
107
+ let j = i;
108
+ while (j < combined.length && combined[j].v === combined[i].v) j++;
109
+ const avgRank = (i + 1 + j) / 2;
110
+ for (let k = i; k < j; k++) ranks[k] = avgRank;
111
+ i = j;
112
+ }
113
+ let r1 = 0;
114
+ for (let k = 0; k < combined.length; k++) {
115
+ if (combined[k].group === "a") r1 += ranks[k];
116
+ }
117
+ const u1 = r1 - n1 * (n1 + 1) / 2;
118
+ const u2 = n1 * n2 - u1;
119
+ const u = Math.min(u1, u2);
120
+ const mu = n1 * n2 / 2;
121
+ const sigma = Math.sqrt(n1 * n2 * (n1 + n2 + 1) / 12);
122
+ if (sigma === 0) return { u, p: 1 };
123
+ const z = Math.abs(u - mu) / sigma;
124
+ const p = 2 * (1 - normalCdf(z));
125
+ return { u, p };
126
+ }
127
+ function partialCredit(current, target) {
128
+ if (target <= 0) return 1;
129
+ return Math.min(1, Math.max(0, current / target));
130
+ }
131
+ function pairedTTest(before, after) {
132
+ if (before.length !== after.length) {
133
+ throw new Error(`pairedTTest: unequal sample sizes (${before.length} vs ${after.length})`);
166
134
  }
167
- }
168
- function buildAxes(metrics, thresholds, gateDecision, issues) {
169
- return [
170
- axis("corpus", issues, bounded(metrics.scenarioCount / Math.max(1, thresholds.minScenarioCount)), `${metrics.scenarioCount} scenarios; holdout=${metrics.splitCounts.holdout}`),
171
- axis("quality", issues, Math.min(metrics.passRate, metrics.meanScore), `passRate=${fmt(metrics.passRate)} meanScore=${fmt(metrics.meanScore)}`),
172
- axis("generalization", issues, gateDecision && !gateDecision.promote ? 0 : gapScore(metrics.overfitGap, thresholds.maxOverfitGap), `holdoutRuns=${metrics.holdoutRuns} overfitGap=${fmt(metrics.overfitGap)}`),
173
- axis("diagnostics", issues, metrics.failedRows === 0 ? 1 : metrics.failuresWithAsi / metrics.failedRows, `failuresWithAsi=${metrics.failuresWithAsi}/${metrics.failedRows}`),
174
- axis("efficiency", issues, efficiencyScore(metrics, thresholds), `meanCostUsd=${fmt(metrics.meanCostUsd)} p95WallMs=${fmt(metrics.p95WallMs)}`)
135
+ const n = before.length;
136
+ if (n < 2) return { t: 0, df: 0, p: 1 };
137
+ const diffs = before.map((b, i) => after[i] - b);
138
+ const mean = diffs.reduce((a, b) => a + b, 0) / n;
139
+ const variance = diffs.reduce((acc, d) => acc + (d - mean) ** 2, 0) / (n - 1);
140
+ const se = Math.sqrt(variance / n);
141
+ if (se === 0) return { t: mean === 0 ? 0 : Infinity, df: n - 1, p: mean === 0 ? 1 : 0 };
142
+ const t = mean / se;
143
+ const df = n - 1;
144
+ const p = 2 * (1 - studentTCdf(Math.abs(t), df));
145
+ return { t, df, p };
146
+ }
147
+ function wilcoxonSignedRank(before, after) {
148
+ if (before.length !== after.length) {
149
+ throw new Error(`wilcoxonSignedRank: unequal sample sizes (${before.length} vs ${after.length})`);
150
+ }
151
+ const diffs = before.map((b, i2) => after[i2] - b).filter((d) => d !== 0);
152
+ const n = diffs.length;
153
+ if (n < 6) return { w: 0, p: 1 };
154
+ const absRanks = diffs.map((d, i2) => ({ abs: Math.abs(d), sign: Math.sign(d), i: i2 })).sort((a, b) => a.abs - b.abs);
155
+ const ranks = new Array(n);
156
+ let i = 0;
157
+ while (i < n) {
158
+ let j = i;
159
+ while (j < n && absRanks[j].abs === absRanks[i].abs) j++;
160
+ const avg2 = (i + 1 + j) / 2;
161
+ for (let k = i; k < j; k++) ranks[absRanks[k].i] = avg2;
162
+ i = j;
163
+ }
164
+ let wPlus = 0;
165
+ for (let k = 0; k < n; k++) if (diffs[k] > 0) wPlus += ranks[k];
166
+ const mean = n * (n + 1) / 4;
167
+ const variance = n * (n + 1) * (2 * n + 1) / 24;
168
+ const z = (wPlus - mean) / Math.sqrt(variance);
169
+ const p = 2 * (1 - normalCdf(Math.abs(z)));
170
+ return { w: wPlus, p };
171
+ }
172
+ function cohensD(a, b) {
173
+ if (a.length < 2 || b.length < 2) return 0;
174
+ const meanA = a.reduce((x, y) => x + y, 0) / a.length;
175
+ const meanB = b.reduce((x, y) => x + y, 0) / b.length;
176
+ const varA = a.reduce((acc, x) => acc + (x - meanA) ** 2, 0) / (a.length - 1);
177
+ const varB = b.reduce((acc, x) => acc + (x - meanB) ** 2, 0) / (b.length - 1);
178
+ const pooled = Math.sqrt(
179
+ ((a.length - 1) * varA + (b.length - 1) * varB) / (a.length + b.length - 2)
180
+ );
181
+ if (pooled === 0) return 0;
182
+ return (meanB - meanA) / pooled;
183
+ }
184
+ function studentTCdf(t, df) {
185
+ if (df <= 0) return 0.5;
186
+ if (df > 100) return normalCdf(t);
187
+ const x = df / (df + t * t);
188
+ const a = df / 2;
189
+ const b = 0.5;
190
+ const ib = incompleteBeta(x, a, b);
191
+ return t >= 0 ? 1 - 0.5 * ib : 0.5 * ib;
192
+ }
193
+ function incompleteBeta(x, a, b) {
194
+ if (x <= 0) return 0;
195
+ if (x >= 1) return 1;
196
+ const lnBeta = lnGamma(a) + lnGamma(b) - lnGamma(a + b);
197
+ const front = Math.exp(Math.log(x) * a + Math.log(1 - x) * b - lnBeta) / a;
198
+ const maxIter = 200;
199
+ const eps = 3e-7;
200
+ let c = 1;
201
+ let d = 1 - (a + b) * x / (a + 1);
202
+ if (Math.abs(d) < 1e-30) d = 1e-30;
203
+ d = 1 / d;
204
+ let f = d;
205
+ for (let m = 1; m <= maxIter; m++) {
206
+ const m2 = 2 * m;
207
+ let num = m * (b - m) * x / ((a + m2 - 1) * (a + m2));
208
+ d = 1 + num * d;
209
+ if (Math.abs(d) < 1e-30) d = 1e-30;
210
+ c = 1 + num / c;
211
+ if (Math.abs(c) < 1e-30) c = 1e-30;
212
+ d = 1 / d;
213
+ f *= d * c;
214
+ num = -((a + m) * (a + b + m) * x) / ((a + m2) * (a + m2 + 1));
215
+ d = 1 + num * d;
216
+ if (Math.abs(d) < 1e-30) d = 1e-30;
217
+ c = 1 + num / c;
218
+ if (Math.abs(c) < 1e-30) c = 1e-30;
219
+ d = 1 / d;
220
+ const delta = d * c;
221
+ f *= delta;
222
+ if (Math.abs(delta - 1) < eps) break;
223
+ }
224
+ return front * f;
225
+ }
226
+ function lnGamma(z) {
227
+ const g = 7;
228
+ const coefs = [
229
+ 0.9999999999998099,
230
+ 676.5203681218851,
231
+ -1259.1392167224028,
232
+ 771.3234287776531,
233
+ -176.6150291621406,
234
+ 12.507343278686905,
235
+ -0.13857109526572012,
236
+ 9984369578019572e-21,
237
+ 15056327351493116e-23
175
238
  ];
239
+ if (z < 0.5) {
240
+ return Math.log(Math.PI / Math.sin(Math.PI * z)) - lnGamma(1 - z);
241
+ }
242
+ z -= 1;
243
+ let x = coefs[0];
244
+ for (let i = 1; i < g + 2; i++) x += coefs[i] / (z + i);
245
+ const t = z + g + 0.5;
246
+ return 0.5 * Math.log(2 * Math.PI) + (z + 0.5) * Math.log(t) - t + Math.log(x);
247
+ }
248
+ function normalCdf(x) {
249
+ const a1 = 0.254829592;
250
+ const a2 = -0.284496736;
251
+ const a3 = 1.421413741;
252
+ const a4 = -1.453152027;
253
+ const a5 = 1.061405429;
254
+ const p = 0.3275911;
255
+ const sign = x < 0 ? -1 : 1;
256
+ const absX = Math.abs(x);
257
+ const t = 1 / (1 + p * absX);
258
+ const y = 1 - ((((a5 * t + a4) * t + a3) * t + a2) * t + a1) * t * Math.exp(-absX * absX / 2);
259
+ return 0.5 * (1 + sign * y);
176
260
  }
177
- function axis(name, issues, score, detail) {
178
- const own = issues.filter((i) => i.axis === name);
179
- const status = own.some((i) => i.severity === "critical") ? "fail" : own.length > 0 ? "warn" : "pass";
180
- return { name, status, score: bounded(score), detail };
181
- }
182
- function countScenarioSplits(scenarios) {
183
- const counts = { train: 0, dev: 0, test: 0, holdout: 0 };
184
- for (const scenario of scenarios) counts[scenario.split ?? "train"]++;
185
- return counts;
186
- }
187
- function countDomains(scenarios) {
188
- const out = {};
189
- for (const scenario of scenarios) {
190
- const domain = scenario.tags?.domain ?? scenario.tags?.category ?? "uncategorized";
191
- out[domain] = (out[domain] ?? 0) + 1;
192
- }
193
- return out;
261
+
262
+ // src/power-analysis.ts
263
+ function requiredSampleSize(opts) {
264
+ const effect = opts.effect;
265
+ if (!Number.isFinite(effect) || effect <= 0) return Infinity;
266
+ const alpha = opts.alpha ?? 0.05;
267
+ const power = opts.power ?? 0.8;
268
+ const twoSided = opts.twoSided ?? true;
269
+ const zAlpha = zQuantile(twoSided ? 1 - alpha / 2 : 1 - alpha);
270
+ const zBeta = zQuantile(power);
271
+ const n = 2 * Math.pow((zAlpha + zBeta) / effect, 2);
272
+ return Math.ceil(n);
273
+ }
274
+ function pairedMde(opts) {
275
+ if (!Number.isFinite(opts.nPaired) || opts.nPaired <= 0) return Infinity;
276
+ const alpha = opts.alpha ?? 0.05;
277
+ const power = opts.power ?? 0.8;
278
+ const twoSided = opts.twoSided ?? true;
279
+ const zAlpha = zQuantile(twoSided ? 1 - alpha / 2 : 1 - alpha);
280
+ const zBeta = zQuantile(power);
281
+ return (zAlpha + zBeta) / Math.sqrt(opts.nPaired);
282
+ }
283
+ function bonferroni(pValues, alpha = 0.05) {
284
+ const k = pValues.length;
285
+ const adjusted = pValues.map((p) => Math.min(1, p * k));
286
+ const significant = adjusted.map((p) => p < alpha);
287
+ return { adjusted, significant };
288
+ }
289
+ function benjaminiHochberg(pValues, fdr = 0.05) {
290
+ const n = pValues.length;
291
+ if (n === 0) return { qValues: [], significant: [] };
292
+ const indexed = pValues.map((p, i) => ({ p, i })).sort((a, b) => a.p - b.p);
293
+ const q = new Array(n);
294
+ let minRight = 1;
295
+ for (let k = n - 1; k >= 0; k--) {
296
+ const rank = k + 1;
297
+ const raw = indexed[k].p * n / rank;
298
+ const bounded = Math.min(minRight, raw);
299
+ minRight = bounded;
300
+ q[indexed[k].i] = Math.min(1, bounded);
301
+ }
302
+ const significant = q.map((v) => v < fdr);
303
+ return { qValues: q, significant };
304
+ }
305
+ function zQuantile(p) {
306
+ if (p <= 0 || p >= 1) {
307
+ if (p === 0) return -Infinity;
308
+ if (p === 1) return Infinity;
309
+ return NaN;
310
+ }
311
+ const a = [-39.69683028665376, 220.9460984245205, -275.9285104469687, 138.357751867269, -30.66479806614716, 2.506628277459239];
312
+ const b = [-54.47609879822406, 161.5858368580409, -155.6989798598866, 66.80131188771972, -13.28068155288572];
313
+ const c = [-0.007784894002430293, -0.3223964580411365, -2.400758277161838, -2.549732539343734, 4.374664141464968, 2.938163982698783];
314
+ const d = [0.007784695709041462, 0.3224671290700398, 2.445134137142996, 3.754408661907416];
315
+ const pLow = 0.02425;
316
+ const pHigh = 1 - pLow;
317
+ let q;
318
+ let r;
319
+ if (p < pLow) {
320
+ q = Math.sqrt(-2 * Math.log(p));
321
+ return (((((c[0] * q + c[1]) * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5]) / ((((d[0] * q + d[1]) * q + d[2]) * q + d[3]) * q + 1);
322
+ }
323
+ if (p <= pHigh) {
324
+ q = p - 0.5;
325
+ r = q * q;
326
+ return (((((a[0] * r + a[1]) * r + a[2]) * r + a[3]) * r + a[4]) * r + a[5]) * q / (((((b[0] * r + b[1]) * r + b[2]) * r + b[3]) * r + b[4]) * r + 1);
327
+ }
328
+ q = Math.sqrt(-2 * Math.log(1 - p));
329
+ return -(((((c[0] * q + c[1]) * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5]) / ((((d[0] * q + d[1]) * q + d[2]) * q + d[3]) * q + 1);
194
330
  }
195
- function countFailureModes(runs, traces, threshold) {
196
- const out = {};
197
- for (const run of runs) {
198
- const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
199
- if (run.failureMode || score !== void 0 && score < threshold) {
200
- const mode = run.failureMode ?? "low_score";
201
- out[mode] = (out[mode] ?? 0) + 1;
202
- }
331
+
332
+ // src/paired-stats.ts
333
+ function pairedBootstrap(before, after, opts = {}) {
334
+ if (before.length !== after.length) {
335
+ throw new Error(
336
+ `pairedBootstrap: unequal sample sizes (${before.length} vs ${after.length})`
337
+ );
203
338
  }
204
- for (const trace of traces) {
205
- if (trace.failureMode || trace.ok === false || trace.score !== void 0 && trace.score < threshold) {
206
- const mode = trace.failureMode ?? (trace.ok === false ? "not_ok" : "low_score");
207
- out[mode] = (out[mode] ?? 0) + 1;
208
- }
339
+ const confidence = opts.confidence ?? 0.95;
340
+ const resamples = opts.resamples ?? 2e3;
341
+ const statistic = opts.statistic ?? "median";
342
+ if (confidence <= 0 || confidence >= 1) {
343
+ throw new Error(`pairedBootstrap: confidence must be in (0,1), got ${confidence}`);
209
344
  }
210
- return out;
211
- }
212
- function countResponsibleSurfaces(traces) {
213
- const out = {};
214
- for (const trace of traces) {
215
- for (const asi of trace.asi ?? []) {
216
- const surface = asi.responsibleSurface ?? "unknown";
217
- out[surface] = (out[surface] ?? 0) + 1;
218
- }
345
+ const n = before.length;
346
+ const deltas = before.map((b, i) => after[i] - b);
347
+ if (n === 0) {
348
+ return { n: 0, median: 0, mean: 0, low: 0, high: 0, confidence, resamples };
219
349
  }
220
- return out;
221
- }
222
- function failedRows(runs, traces, threshold) {
223
- const out = [];
224
- for (const run of runs) {
225
- const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
226
- if (run.failureMode || score !== void 0 && score < threshold) {
227
- const asiMetric = run.outcome.raw.asi;
228
- out.push({ hasAsi: typeof asiMetric === "number" && asiMetric > 0 });
229
- }
350
+ if (n === 1) {
351
+ const d = deltas[0];
352
+ return { n: 1, median: d, mean: d, low: d, high: d, confidence, resamples };
230
353
  }
231
- for (const trace of traces) {
232
- if (trace.failureMode || trace.ok === false || trace.score !== void 0 && trace.score < threshold) {
233
- out.push({ hasAsi: (trace.asi?.length ?? 0) > 0 });
354
+ const rng = makeRng(opts.seed);
355
+ const samples = new Array(resamples);
356
+ for (let b = 0; b < resamples; b++) {
357
+ let acc = null;
358
+ if (statistic === "mean") {
359
+ let sum = 0;
360
+ for (let k = 0; k < n; k++) {
361
+ sum += deltas[Math.floor(rng() * n)];
362
+ }
363
+ samples[b] = sum / n;
364
+ } else {
365
+ acc = new Array(n);
366
+ for (let k = 0; k < n; k++) {
367
+ acc[k] = deltas[Math.floor(rng() * n)];
368
+ }
369
+ samples[b] = medianInPlace(acc);
234
370
  }
235
371
  }
236
- return out;
237
- }
238
- function passRate(runs, traces, threshold) {
239
- const outcomes = [
240
- ...runs.map((run) => {
241
- const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
242
- return !run.failureMode && score !== void 0 && score >= threshold;
243
- }),
244
- ...traces.map((trace) => trace.ok !== false && (trace.score === void 0 || trace.score >= threshold))
245
- ];
246
- if (outcomes.length === 0) return 0;
247
- return outcomes.filter(Boolean).length / outcomes.length;
248
- }
249
- function scoresFor(runs, split) {
250
- return runs.filter((run) => run.splitTag === split).map((run) => split === "holdout" ? run.outcome.holdoutScore : run.outcome.searchScore).filter(isFiniteNumber);
251
- }
252
- function mean(xs) {
253
- if (xs.length === 0) return Number.NaN;
254
- return xs.reduce((sum, x) => sum + x, 0) / xs.length;
255
- }
256
- function percentile(xs, p) {
257
- if (xs.length === 0) return Number.NaN;
258
- const sorted = [...xs].sort((a, b) => a - b);
259
- return sorted[Math.min(sorted.length - 1, Math.max(0, Math.ceil(p * sorted.length) - 1))];
260
- }
261
- function isFiniteNumber(value) {
262
- return typeof value === "number" && Number.isFinite(value);
263
- }
264
- function safeDiff(a, b) {
265
- if (!Number.isFinite(a) || !Number.isFinite(b)) return Number.NaN;
266
- return a - b;
267
- }
268
- function gapScore(gap, maxGap) {
269
- if (!Number.isFinite(gap)) return 0;
270
- if (maxGap <= 0) return gap <= 0 ? 1 : 0;
271
- return bounded(1 - Math.max(0, gap) / maxGap);
272
- }
273
- function efficiencyScore(metrics, thresholds) {
274
- const cost = Number.isFinite(thresholds.maxMeanCostUsd) && Number.isFinite(metrics.meanCostUsd) ? bounded(thresholds.maxMeanCostUsd / Math.max(metrics.meanCostUsd, 1e-12)) : 1;
275
- const latency = Number.isFinite(thresholds.maxP95WallMs) && Number.isFinite(metrics.p95WallMs) ? bounded(thresholds.maxP95WallMs / Math.max(metrics.p95WallMs, 1e-12)) : 1;
276
- return Math.min(cost, latency);
277
- }
278
- function bounded(x) {
279
- if (!Number.isFinite(x)) return 0;
280
- return Math.max(0, Math.min(1, x));
281
- }
282
- function renderSummary(target, status, metrics, issues) {
283
- const prefix = `release confidence ${status}: ${target}`;
284
- const metricText = `scenarios=${metrics.scenarioCount} searchRuns=${metrics.searchRuns} holdoutRuns=${metrics.holdoutRuns} passRate=${fmt(metrics.passRate)} meanScore=${fmt(metrics.meanScore)}`;
285
- if (issues.length === 0) return `${prefix}; ${metricText}`;
286
- return `${prefix}; ${metricText}; issues=${issues.map((i) => i.code).join(",")}`;
287
- }
288
- function fmt(x) {
289
- if (!Number.isFinite(x)) return String(x);
290
- return x.toFixed(4);
291
- }
292
-
293
- // src/pre-registration.ts
294
- function canonicalize(v) {
295
- if (v === null || typeof v !== "object") return v;
296
- if (Array.isArray(v)) return v.map(canonicalize);
297
- const keys = Object.keys(v).sort();
298
- const out = {};
299
- for (const k of keys) out[k] = canonicalize(v[k]);
300
- return out;
372
+ samples.sort((a, b) => a - b);
373
+ const alpha = 1 - confidence;
374
+ const lowIdx = Math.floor(alpha / 2 * resamples);
375
+ const highIdx = Math.min(resamples - 1, Math.ceil((1 - alpha / 2) * resamples) - 1);
376
+ return {
377
+ n,
378
+ median: medianInPlace([...deltas]),
379
+ mean: deltas.reduce((s, x) => s + x, 0) / n,
380
+ low: samples[lowIdx],
381
+ high: samples[Math.max(highIdx, lowIdx)],
382
+ confidence,
383
+ resamples
384
+ };
301
385
  }
302
- async function hashJson(obj) {
303
- const canonical = canonicalize(obj);
304
- const bytes = new TextEncoder().encode(JSON.stringify(canonical));
305
- const digest = await globalThis.crypto.subtle.digest("SHA-256", bytes);
306
- return Array.from(new Uint8Array(digest)).map((b) => b.toString(16).padStart(2, "0")).join("");
386
+ function pairedWilcoxon(before, after) {
387
+ return wilcoxonSignedRank(before, after);
307
388
  }
308
- async function signManifest(m) {
309
- const hash = await hashJson(m);
310
- return { ...m, contentHash: hash, algo: "sha256-content" };
389
+ function bhAdjust(pValues, fdr = 0.05) {
390
+ return benjaminiHochberg(pValues, fdr);
311
391
  }
312
- async function verifyManifest(m) {
313
- const { contentHash, algo: _algo, ...rest } = m;
314
- void _algo;
315
- const resigned = await signManifest(rest);
316
- return resigned.contentHash === contentHash;
392
+ function medianInPlace(xs) {
393
+ if (xs.length === 0) return 0;
394
+ xs.sort((a, b) => a - b);
395
+ const mid = Math.floor(xs.length / 2);
396
+ return xs.length % 2 === 0 ? (xs[mid - 1] + xs[mid]) / 2 : xs[mid];
317
397
  }
318
- async function evaluateHypothesis(manifest, observed) {
319
- if (!await verifyManifest(manifest)) {
320
- throw new Error("evaluateHypothesis: manifest content hash mismatch (tampered)");
321
- }
322
- const reasons = [];
323
- const directionOk = manifest.direction === "increase" ? observed.effect > 0 : observed.effect < 0;
324
- if (!directionOk) reasons.push("wrong_direction");
325
- if (Math.abs(observed.effect) < manifest.minEffect) reasons.push("effect_too_small");
326
- if (observed.pValue >= manifest.alpha) reasons.push("not_significant");
327
- if (observed.n < manifest.preRegisteredN) reasons.push("undersampled");
328
- return {
329
- manifest,
330
- observedN: observed.n,
331
- observedEffect: observed.effect,
332
- observedPValue: observed.pValue,
333
- confirmed: reasons.length === 0,
334
- rejectionReasons: reasons
398
+ function makeRng(seed) {
399
+ if (seed === void 0) return Math.random;
400
+ let s = seed | 0 || 2654435769;
401
+ return () => {
402
+ s = s + 1831565813 | 0;
403
+ let t = s;
404
+ t = Math.imul(t ^ t >>> 15, t | 1);
405
+ t ^= t + Math.imul(t ^ t >>> 7, t | 61);
406
+ return ((t ^ t >>> 14) >>> 0) / 4294967296;
335
407
  };
336
408
  }
337
409
 
@@ -428,10 +500,10 @@ function renderSummaryTableMarkdown(rows, comparator, split) {
428
500
  lines.push("| Candidate | N | Mean | 95% CI | q (BH) | Cohen's d |");
429
501
  lines.push("|---|---:|---:|---|---:|---:|");
430
502
  for (const r of rows) {
431
- const ci = `[${fmt2(r.ciLow)}, ${fmt2(r.ciHigh)}]`;
503
+ const ci = `[${fmt(r.ciLow)}, ${fmt(r.ciHigh)}]`;
432
504
  const q = Number.isFinite(r.qValue) ? r.qValue.toFixed(4) : "\u2014";
433
505
  const d = Number.isFinite(r.cohensD) ? r.cohensD.toFixed(3) : "\u2014";
434
- lines.push(`| ${r.candidateId} | ${r.n} | ${fmt2(r.mean)} | ${ci} | ${q} | ${d} |`);
506
+ lines.push(`| ${r.candidateId} | ${r.n} | ${fmt(r.mean)} | ${ci} | ${q} | ${d} |`);
435
507
  }
436
508
  return lines.join("\n");
437
509
  }
@@ -595,10 +667,10 @@ function seedRng(seed) {
595
667
  return ((t ^ t >>> 14) >>> 0) / 4294967296;
596
668
  };
597
669
  }
598
- function stdev(xs, mean3) {
670
+ function stdev(xs, mean) {
599
671
  if (xs.length < 2) return 0;
600
672
  let sse = 0;
601
- for (const x of xs) sse += (x - mean3) ** 2;
673
+ for (const x of xs) sse += (x - mean) ** 2;
602
674
  return Math.sqrt(sse / (xs.length - 1));
603
675
  }
604
676
  async function researchReport(runs, opts = {}) {
@@ -780,7 +852,7 @@ function buildMethodology(ctx) {
780
852
  return { assumptions, methods, alternatives, whenNotToApply, citations };
781
853
  }
782
854
  function formatRope(rope) {
783
- return `[${fmt2(rope.low)}, ${fmt2(rope.high)}]`;
855
+ return `[${fmt(rope.low)}, ${fmt(rope.high)}]`;
784
856
  }
785
857
  function classifyCandidate(row, ctx) {
786
858
  if (ctx.comparator && row.candidateId === ctx.comparator) {
@@ -805,30 +877,30 @@ function classifyCandidate(row, ctx) {
805
877
  if (ctx.rope && ci.low >= ctx.rope.low && ci.high <= ctx.rope.high) {
806
878
  return {
807
879
  decision: "equivalent",
808
- reason: `Paired-delta CI [${fmt2(ci.low)}, ${fmt2(ci.high)}] is fully inside ROPE ${formatRope(ctx.rope)}; candidate is practically equivalent to comparator.`
880
+ reason: `Paired-delta CI [${fmt(ci.low)}, ${fmt(ci.high)}] is fully inside ROPE ${formatRope(ctx.rope)}; candidate is practically equivalent to comparator.`
809
881
  };
810
882
  }
811
883
  const significant = Number.isFinite(row.qValue) && row.qValue <= ctx.fdr;
812
884
  const gainPositive = ci.low > 0;
813
885
  const gainNegative = ci.high < 0;
814
886
  if (gainNegative) {
815
- return { decision: "reject", reason: `Paired-delta CI [${fmt2(ci.low)}, ${fmt2(ci.high)}] lies entirely below zero.` };
887
+ return { decision: "reject", reason: `Paired-delta CI [${fmt(ci.low)}, ${fmt(ci.high)}] lies entirely below zero.` };
816
888
  }
817
889
  if (ctx.posterior.n < ctx.minPairs) {
818
890
  return {
819
891
  decision: "needs_more_data",
820
- reason: `Only ${ctx.posterior.n} paired observations; minimum detectable effect at this N is ${fmt2(ctx.posterior.mde)} score units (need \u2265 ${ctx.minPairs} pairs to issue a directional verdict).`
892
+ reason: `Only ${ctx.posterior.n} paired observations; minimum detectable effect at this N is ${fmt(ctx.posterior.mde)} score units (need \u2265 ${ctx.minPairs} pairs to issue a directional verdict).`
821
893
  };
822
894
  }
823
895
  if (significant && gainPositive) {
824
896
  return {
825
897
  decision: "promote",
826
- reason: `BH-adjusted q=${fmt2(row.qValue)} \u2264 ${ctx.fdr} and paired-delta CI [${fmt2(ci.low)}, ${fmt2(ci.high)}] excludes zero; Pr(\u0394>0)=${fmt2(ctx.posterior.prGreaterThanZero)}.`
898
+ reason: `BH-adjusted q=${fmt(row.qValue)} \u2264 ${ctx.fdr} and paired-delta CI [${fmt(ci.low)}, ${fmt(ci.high)}] excludes zero; Pr(\u0394>0)=${fmt(ctx.posterior.prGreaterThanZero)}.`
827
899
  };
828
900
  }
829
901
  return {
830
902
  decision: "hold",
831
- reason: `Pr(\u0394>0)=${fmt2(ctx.posterior.prGreaterThanZero)} but CI [${fmt2(ci.low)}, ${fmt2(ci.high)}] crosses zero; effect not decisive at fdr=${ctx.fdr}.`
903
+ reason: `Pr(\u0394>0)=${fmt(ctx.posterior.prGreaterThanZero)} but CI [${fmt(ci.low)}, ${fmt(ci.high)}] crosses zero; effect not decisive at fdr=${ctx.fdr}.`
832
904
  };
833
905
  }
834
906
  function buildRecommendation(candidates, ctx) {
@@ -843,11 +915,11 @@ function buildRecommendation(candidates, ctx) {
843
915
  if (chosen) {
844
916
  rationale.push(`${chosen.candidateId}: ${chosen.decisionReason}`);
845
917
  if (chosen.gainCi) {
846
- const probSummary = chosen.prGreaterThanZero !== null ? `, Pr(\u0394>0)=${fmt2(chosen.prGreaterThanZero)}` : "";
847
- rationale.push(`Median paired gain CI: [${fmt2(chosen.gainCi.low)}, ${fmt2(chosen.gainCi.high)}]${probSummary}.`);
918
+ const probSummary = chosen.prGreaterThanZero !== null ? `, Pr(\u0394>0)=${fmt(chosen.prGreaterThanZero)}` : "";
919
+ rationale.push(`Median paired gain CI: [${fmt(chosen.gainCi.low)}, ${fmt(chosen.gainCi.high)}]${probSummary}.`);
848
920
  }
849
921
  if (chosen.mde !== null && Number.isFinite(chosen.mde)) {
850
- rationale.push(`MDE at current paired N=${chosen.pairedN}: ${fmt2(chosen.mde)} score units.`);
922
+ rationale.push(`MDE at current paired N=${chosen.pairedN}: ${fmt(chosen.mde)} score units.`);
851
923
  }
852
924
  }
853
925
  if (!ctx.comparator) {
@@ -956,9 +1028,9 @@ function renderResearchMarkdown(report) {
956
1028
  const prGt = c.prGreaterThanZero === null ? "-" : c.prGreaterThanZero.toFixed(3);
957
1029
  const q = Number.isFinite(c.qValue) ? c.qValue.toFixed(4) : "-";
958
1030
  const d = Number.isFinite(c.cohensD) ? c.cohensD.toFixed(3) : "-";
959
- const gain = c.gainCi ? `[${fmt2(c.gainCi.low)}, ${fmt2(c.gainCi.high)}]` : "-";
960
- const mde = c.mde === null || !Number.isFinite(c.mde) ? "-" : fmt2(c.mde);
961
- lines.push(`| ${c.candidateId} | ${c.decision} | ${fmt2(c.mean)} | ${delta} | ${prGt} | ${q} | ${d} | ${c.pairedN} | ${gain} | ${mde} | ${c.onParetoFrontier ? "yes" : "no"} | ${c.gate ?? "-"} |`);
1031
+ const gain = c.gainCi ? `[${fmt(c.gainCi.low)}, ${fmt(c.gainCi.high)}]` : "-";
1032
+ const mde = c.mde === null || !Number.isFinite(c.mde) ? "-" : fmt(c.mde);
1033
+ lines.push(`| ${c.candidateId} | ${c.decision} | ${fmt(c.mean)} | ${delta} | ${prGt} | ${q} | ${d} | ${c.pairedN} | ${gain} | ${mde} | ${c.onParetoFrontier ? "yes" : "no"} | ${c.gate ?? "-"} |`);
962
1034
  }
963
1035
  lines.push("");
964
1036
  lines.push("## Statistical Summary");
@@ -1113,7 +1185,7 @@ function decisionWeight(decision) {
1113
1185
  return 1;
1114
1186
  }
1115
1187
  function signed(x) {
1116
- return `${x >= 0 ? "+" : ""}${fmt2(x)}`;
1188
+ return `${x >= 0 ? "+" : ""}${fmt(x)}`;
1117
1189
  }
1118
1190
  function avg(xs) {
1119
1191
  if (xs.length === 0) return Number.NaN;
@@ -1124,226 +1196,31 @@ function medianOfSorted(sorted) {
1124
1196
  const mid = Math.floor(sorted.length / 2);
1125
1197
  return sorted.length % 2 === 0 ? (sorted[mid - 1] + sorted[mid]) / 2 : sorted[mid];
1126
1198
  }
1127
- function fmt2(x) {
1199
+ function fmt(x) {
1128
1200
  if (!Number.isFinite(x)) return String(x);
1129
1201
  return x.toFixed(4);
1130
1202
  }
1131
1203
 
1132
- // src/release-report.ts
1133
- function renderReleaseReport(scorecard, options = {}) {
1134
- const title = options.title ?? `Release Report: ${scorecard.target}`;
1135
- const lines = [];
1136
- lines.push(`# ${title}`);
1137
- lines.push("");
1138
- lines.push(`Status: **${scorecard.status.toUpperCase()}**`);
1139
- lines.push(`Promote: **${scorecard.promote ? "yes" : "no"}**`);
1140
- if (scorecard.candidateId) lines.push(`Candidate: \`${scorecard.candidateId}\``);
1141
- if (scorecard.baselineId) lines.push(`Baseline: \`${scorecard.baselineId}\``);
1142
- lines.push("");
1143
- lines.push(scorecard.summary);
1144
- lines.push("");
1145
- lines.push("## Metrics");
1146
- lines.push("");
1147
- lines.push("| Metric | Value |");
1148
- lines.push("|---|---:|");
1149
- lines.push(`| Scenarios | ${scorecard.metrics.scenarioCount} |`);
1150
- lines.push(`| Search runs | ${scorecard.metrics.searchRuns} |`);
1151
- lines.push(`| Holdout runs | ${scorecard.metrics.holdoutRuns} |`);
1152
- lines.push(`| Pass rate | ${pct(scorecard.metrics.passRate)} |`);
1153
- lines.push(`| Mean score | ${num(scorecard.metrics.meanScore)} |`);
1154
- lines.push(`| Search mean | ${num(scorecard.metrics.searchMeanScore)} |`);
1155
- lines.push(`| Holdout mean | ${num(scorecard.metrics.holdoutMeanScore)} |`);
1156
- lines.push(`| Overfit gap | ${num(scorecard.metrics.overfitGap)} |`);
1157
- lines.push(`| Mean cost | $${num(scorecard.metrics.meanCostUsd)} |`);
1158
- lines.push(`| p95 wall time | ${Math.round(scorecard.metrics.p95WallMs)} ms |`);
1159
- lines.push("");
1160
- if (scorecard.issues.length > 0) {
1161
- lines.push("## Issues");
1162
- lines.push("");
1163
- for (const issue of scorecard.issues) {
1164
- lines.push(`- **${issue.severity}** \`${issue.code}\` (${issue.axis}): ${issue.detail}`);
1165
- }
1166
- lines.push("");
1167
- }
1168
- const surfaces = entries(scorecard.metrics.responsibleSurfaceCounts);
1169
- if (surfaces.length > 0) {
1170
- lines.push("## Responsible Surfaces");
1171
- lines.push("");
1172
- for (const [surface, count] of surfaces) lines.push(`- ${surface}: ${count}`);
1173
- lines.push("");
1174
- }
1175
- const failures = entries(scorecard.metrics.failureModeCounts);
1176
- if (failures.length > 0) {
1177
- lines.push("## Failure Modes");
1178
- lines.push("");
1179
- for (const [mode, count] of failures) lines.push(`- ${mode}: ${count}`);
1180
- lines.push("");
1181
- }
1182
- if (options.runs && options.runs.length > 0) {
1183
- lines.push("## Run Summary");
1184
- lines.push("");
1185
- lines.push(summaryTable([...options.runs], {
1186
- comparator: options.comparator ?? scorecard.baselineId ?? void 0,
1187
- split: "holdout"
1188
- }).markdown);
1189
- lines.push("");
1190
- }
1191
- if (options.traceAnalystFindings && options.traceAnalystFindings.length > 0) {
1192
- lines.push("## TraceAnalyst Findings");
1193
- lines.push("");
1194
- for (const finding of options.traceAnalystFindings) lines.push(`- ${finding}`);
1195
- lines.push("");
1196
- }
1197
- const nextActions = options.nextActions ?? defaultNextActions(scorecard);
1198
- if (nextActions.length > 0) {
1199
- lines.push("## Next Actions");
1200
- lines.push("");
1201
- for (const action of nextActions) lines.push(`- ${action}`);
1202
- lines.push("");
1203
- }
1204
- return lines.join("\n").trimEnd() + "\n";
1205
- }
1206
- function defaultNextActions(scorecard) {
1207
- if (scorecard.promote) return ["Promote the candidate and keep canaries enabled."];
1208
- return scorecard.issues.filter((issue) => issue.severity === "critical").map((issue) => `Resolve ${issue.code}: ${issue.detail}`);
1209
- }
1210
- function entries(values) {
1211
- return Object.entries(values).filter(([, count]) => count > 0).sort((a, b) => b[1] - a[1] || a[0].localeCompare(b[0]));
1212
- }
1213
- function pct(value) {
1214
- return Number.isFinite(value) ? `${(value * 100).toFixed(1)}%` : "n/a";
1215
- }
1216
- function num(value) {
1217
- return Number.isFinite(value) ? value.toFixed(3) : "n/a";
1218
- }
1219
-
1220
- // src/promotion-gate.ts
1221
- function bootstrapCi(baseline, candidate, options = {}) {
1222
- const alpha = options.alpha ?? 0.05;
1223
- const iterations = options.iterations ?? 1e3;
1224
- const minTotal = options.minTotalSamples ?? 6;
1225
- const rng = mulberry32(options.seed ?? hashSeed(baseline, candidate));
1226
- const baselineMean = mean2(baseline);
1227
- const candidateMean = mean2(candidate);
1228
- const delta = candidateMean - baselineMean;
1229
- if (baseline.length + candidate.length < minTotal || baseline.length === 0 || candidate.length === 0) {
1230
- return {
1231
- baselineMean,
1232
- candidateMean,
1233
- delta,
1234
- ciLower: -Infinity,
1235
- ciUpper: Infinity,
1236
- iterations: 0,
1237
- alpha,
1238
- verdict: "INCONCLUSIVE"
1239
- };
1240
- }
1241
- const deltas = new Array(iterations);
1242
- for (let i = 0; i < iterations; i++) {
1243
- const bResample = resample(baseline, rng);
1244
- const cResample = resample(candidate, rng);
1245
- deltas[i] = mean2(cResample) - mean2(bResample);
1246
- }
1247
- deltas.sort((a, b) => a - b);
1248
- const lowerIdx = Math.floor(alpha / 2 * iterations);
1249
- const upperIdx = Math.floor((1 - alpha / 2) * iterations) - 1;
1250
- const ciLower = deltas[Math.max(0, lowerIdx)];
1251
- const ciUpper = deltas[Math.min(iterations - 1, upperIdx)];
1252
- let verdict;
1253
- if (ciLower > 0) verdict = "ADVANCE";
1254
- else if (ciUpper < 0) verdict = "REVERT";
1255
- else if (delta >= 0) verdict = "KEEP";
1256
- else verdict = "INCONCLUSIVE";
1257
- return {
1258
- baselineMean,
1259
- candidateMean,
1260
- delta,
1261
- ciLower,
1262
- ciUpper,
1263
- iterations,
1264
- alpha,
1265
- verdict
1266
- };
1267
- }
1268
- function mean2(xs) {
1269
- if (xs.length === 0) return 0;
1270
- let s = 0;
1271
- for (const x of xs) s += x;
1272
- return s / xs.length;
1273
- }
1274
- function resample(xs, rng) {
1275
- const out = new Array(xs.length);
1276
- for (let i = 0; i < xs.length; i++) out[i] = xs[Math.floor(rng() * xs.length)];
1277
- return out;
1278
- }
1279
- function mulberry32(seed) {
1280
- let t = seed >>> 0;
1281
- return () => {
1282
- t += 1831565813;
1283
- let r = t;
1284
- r = Math.imul(r ^ r >>> 15, r | 1);
1285
- r ^= r + Math.imul(r ^ r >>> 7, r | 61);
1286
- return ((r ^ r >>> 14) >>> 0) / 4294967296;
1287
- };
1288
- }
1289
- function hashSeed(a, b) {
1290
- let h = 2166136261;
1291
- for (const x of [...a, ...b]) {
1292
- const view = new Float64Array([x]);
1293
- const bytes = new Uint8Array(view.buffer);
1294
- for (const byte of bytes) {
1295
- h ^= byte;
1296
- h = Math.imul(h, 16777619);
1297
- }
1298
- }
1299
- return h >>> 0;
1300
- }
1301
- async function judgeReplayGate(args) {
1302
- const concurrency = args.judgeConcurrency ?? 4;
1303
- const baselineScores = await scoreAll(args.baselineOutputs, args.judge, concurrency);
1304
- const candidateScores = await scoreAll(args.candidateOutputs, args.judge, concurrency);
1305
- const ci = bootstrapCi(baselineScores, candidateScores, {
1306
- ...args.alpha !== void 0 ? { alpha: args.alpha } : {},
1307
- ...args.iterations !== void 0 ? { iterations: args.iterations } : {},
1308
- ...args.seed !== void 0 ? { seed: args.seed } : {}
1309
- });
1310
- return {
1311
- ...ci,
1312
- baselineSamples: baselineScores.length,
1313
- candidateSamples: candidateScores.length
1314
- };
1315
- }
1316
- async function scoreAll(outputs, judge, concurrency) {
1317
- const results = new Array(outputs.length);
1318
- let next = 0;
1319
- async function worker() {
1320
- while (true) {
1321
- const i = next++;
1322
- if (i >= outputs.length) return;
1323
- const v = await judge(outputs[i]);
1324
- results[i] = Number.isFinite(v) ? v : 0;
1325
- }
1326
- }
1327
- await Promise.all(Array.from({ length: Math.max(1, concurrency) }, () => worker()));
1328
- return results;
1329
- }
1330
-
1331
1204
  export {
1332
- releaseTraceEvidenceFromMultiShotTrials,
1333
- evaluateReleaseConfidence,
1334
- assertReleaseConfidence,
1335
- canonicalize,
1336
- hashJson,
1337
- signManifest,
1338
- verifyManifest,
1339
- evaluateHypothesis,
1205
+ normalizeScores,
1206
+ weightedMean,
1207
+ confidenceInterval,
1208
+ interRaterReliability,
1209
+ mannWhitneyU,
1210
+ partialCredit,
1211
+ pairedTTest,
1212
+ wilcoxonSignedRank,
1213
+ cohensD,
1214
+ requiredSampleSize,
1215
+ bonferroni,
1216
+ benjaminiHochberg,
1217
+ pairedBootstrap,
1218
+ pairedWilcoxon,
1219
+ bhAdjust,
1340
1220
  summaryTable,
1341
1221
  paretoChart,
1342
1222
  gainHistogram,
1343
1223
  RESEARCH_REPORT_HARD_PAIR_FLOOR,
1344
- researchReport,
1345
- renderReleaseReport,
1346
- bootstrapCi,
1347
- judgeReplayGate
1224
+ researchReport
1348
1225
  };
1349
- //# sourceMappingURL=chunk-3IX6QTB7.js.map
1226
+ //# sourceMappingURL=chunk-IOXMGMHQ.js.map