@tangle-network/agent-eval 0.21.0 → 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +102 -1
- package/README.md +4 -0
- package/dist/{chunk-WOK2RTWG.js → chunk-4W4NCYM2.js} +134 -109
- package/dist/chunk-4W4NCYM2.js.map +1 -0
- package/dist/{chunk-WOPGKVN4.js → chunk-6KQG5HAH.js} +2 -2
- package/dist/chunk-6M774GY6.js +53 -0
- package/dist/chunk-6M774GY6.js.map +1 -0
- package/dist/{chunk-3IX6QTB7.js → chunk-IOXMGMHQ.js} +418 -541
- package/dist/chunk-IOXMGMHQ.js.map +1 -0
- package/dist/{chunk-3GN6U53I.js → chunk-KAO3Q65R.js} +2 -2
- package/dist/chunk-QUKKGHTZ.js +121 -0
- package/dist/chunk-QUKKGHTZ.js.map +1 -0
- package/dist/{chunk-SNUHRBDL.js → chunk-SQQLHODJ.js} +10 -1
- package/dist/{chunk-SNUHRBDL.js.map → chunk-SQQLHODJ.js.map} +1 -1
- package/dist/chunk-UAND2LOT.js +738 -0
- package/dist/chunk-UAND2LOT.js.map +1 -0
- package/dist/{chunk-HRZELXCR.js → chunk-USHQBPMH.js} +283 -7
- package/dist/chunk-USHQBPMH.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/index.d.ts +10 -284
- package/dist/index.js +39 -19
- package/dist/index.js.map +1 -1
- package/dist/integrity-K2oVlF57.d.ts +210 -0
- package/dist/openapi.json +1 -1
- package/dist/optimization-UVDNKaO6.d.ts +574 -0
- package/dist/optimization.d.ts +6 -144
- package/dist/optimization.js +9 -2
- package/dist/reporting-B82RSv9C.d.ts +593 -0
- package/dist/reporting.d.ts +2 -2
- package/dist/reporting.js +15 -8
- package/dist/{multi-shot-optimization-Bvtz294B.d.ts → summary-report-D4p7RlDu.d.ts} +381 -1
- package/dist/traces.d.ts +101 -181
- package/dist/traces.js +16 -5
- package/dist/wire/index.js +3 -3
- package/docs/research-report-methodology.md +19 -4
- package/docs/wire-protocol.md +1 -1
- package/package.json +2 -2
- package/dist/chunk-3IX6QTB7.js.map +0 -1
- package/dist/chunk-HRZELXCR.js.map +0 -1
- package/dist/chunk-KRR4VMH7.js +0 -423
- package/dist/chunk-KRR4VMH7.js.map +0 -1
- package/dist/chunk-WOK2RTWG.js.map +0 -1
- package/dist/reporting-Da2ihlcM.d.ts +0 -672
- /package/dist/{chunk-WOPGKVN4.js.map → chunk-6KQG5HAH.js.map} +0 -0
- /package/dist/{chunk-3GN6U53I.js.map → chunk-KAO3Q65R.js.map} +0 -0
|
@@ -1,337 +1,409 @@
|
|
|
1
1
|
import {
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
pairedBootstrap,
|
|
6
|
-
pairedMde,
|
|
7
|
-
wilcoxonSignedRank
|
|
8
|
-
} from "./chunk-KRR4VMH7.js";
|
|
2
|
+
canonicalize,
|
|
3
|
+
hashJson
|
|
4
|
+
} from "./chunk-6M774GY6.js";
|
|
9
5
|
|
|
10
|
-
// src/
|
|
11
|
-
var
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
failureScoreThreshold: 0.5
|
|
24
|
-
};
|
|
25
|
-
function releaseTraceEvidenceFromMultiShotTrials(trials) {
|
|
26
|
-
return trials.map((trial) => ({
|
|
27
|
-
scenarioId: trial.scenarioId,
|
|
28
|
-
candidateId: trial.variantId,
|
|
29
|
-
split: trial.split === "holdout" ? "holdout" : trial.split === "dev" ? "dev" : "search",
|
|
30
|
-
score: trial.score,
|
|
31
|
-
ok: trial.ok,
|
|
32
|
-
turnCount: Array.isArray(trial.trace?.turns) ? trial.trace.turns.length : void 0,
|
|
33
|
-
costUsd: trial.cost,
|
|
34
|
-
durationMs: trial.durationMs,
|
|
35
|
-
failureMode: trial.error ? "runtime_error" : void 0,
|
|
36
|
-
asi: trial.asi,
|
|
37
|
-
metadata: trial.metadata
|
|
38
|
-
}));
|
|
6
|
+
// src/statistics.ts
|
|
7
|
+
var INVERTED_DIMENSIONS = /* @__PURE__ */ new Set([
|
|
8
|
+
"hallucination",
|
|
9
|
+
"false_confidence",
|
|
10
|
+
"worst_failure"
|
|
11
|
+
]);
|
|
12
|
+
function normalizeScores(scores) {
|
|
13
|
+
return scores.map((s) => {
|
|
14
|
+
if (INVERTED_DIMENSIONS.has(s.dimension)) {
|
|
15
|
+
return s;
|
|
16
|
+
}
|
|
17
|
+
return s;
|
|
18
|
+
});
|
|
39
19
|
}
|
|
40
|
-
function
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
const
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
const
|
|
55
|
-
const
|
|
56
|
-
const
|
|
57
|
-
const
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
failuresWithAsi: failedRows(runs, traces, thresholds.failureScoreThreshold).filter((row) => row.hasAsi).length,
|
|
70
|
-
singleShotTraces: traces.filter((t) => t.turnCount === 1).length,
|
|
71
|
-
multiShotTraces: traces.filter((t) => (t.turnCount ?? 0) > 1).length,
|
|
72
|
-
splitCounts,
|
|
73
|
-
domainCounts: countDomains(scenarios),
|
|
74
|
-
failureModeCounts: countFailureModes(runs, traces, thresholds.failureScoreThreshold),
|
|
75
|
-
responsibleSurfaceCounts: countResponsibleSurfaces(traces)
|
|
76
|
-
};
|
|
77
|
-
const issues = [];
|
|
78
|
-
checkCorpus(input, thresholds, metrics, issues);
|
|
79
|
-
checkQuality(thresholds, metrics, issues);
|
|
80
|
-
checkGeneralization(input.gateDecision ?? null, thresholds, metrics, issues);
|
|
81
|
-
checkDiagnostics(thresholds, metrics, issues);
|
|
82
|
-
checkEfficiency(thresholds, metrics, issues);
|
|
83
|
-
const axes = buildAxes(metrics, thresholds, input.gateDecision ?? null, issues);
|
|
84
|
-
const status = issues.some((i) => i.severity === "critical") ? "fail" : issues.length > 0 ? "warn" : "pass";
|
|
20
|
+
function weightedMean(scores) {
|
|
21
|
+
if (scores.length === 0) return 0;
|
|
22
|
+
let totalWeight = 0;
|
|
23
|
+
let weightedSum = 0;
|
|
24
|
+
for (const { score, weight } of scores) {
|
|
25
|
+
const w = weight ?? 1;
|
|
26
|
+
weightedSum += score * w;
|
|
27
|
+
totalWeight += w;
|
|
28
|
+
}
|
|
29
|
+
return totalWeight > 0 ? weightedSum / totalWeight : 0;
|
|
30
|
+
}
|
|
31
|
+
function confidenceInterval(scores, confidence = 0.95) {
|
|
32
|
+
if (scores.length === 0) return { mean: 0, lower: 0, upper: 0 };
|
|
33
|
+
if (scores.length === 1) return { mean: scores[0], lower: scores[0], upper: scores[0] };
|
|
34
|
+
const n = scores.length;
|
|
35
|
+
const mean = scores.reduce((a, b) => a + b, 0) / n;
|
|
36
|
+
const B = 1e3;
|
|
37
|
+
const bootstrapMeans = [];
|
|
38
|
+
for (let i = 0; i < B; i++) {
|
|
39
|
+
let sum = 0;
|
|
40
|
+
for (let j = 0; j < n; j++) {
|
|
41
|
+
sum += scores[Math.floor(Math.random() * n)];
|
|
42
|
+
}
|
|
43
|
+
bootstrapMeans.push(sum / n);
|
|
44
|
+
}
|
|
45
|
+
bootstrapMeans.sort((a, b) => a - b);
|
|
46
|
+
const alpha = 1 - confidence;
|
|
47
|
+
const lowerIdx = Math.floor(alpha / 2 * B);
|
|
48
|
+
const upperIdx = Math.floor((1 - alpha / 2) * B) - 1;
|
|
85
49
|
return {
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
status,
|
|
90
|
-
promote: status === "pass" && (input.gateDecision ? input.gateDecision.promote : true),
|
|
91
|
-
axes,
|
|
92
|
-
issues,
|
|
93
|
-
metrics,
|
|
94
|
-
dataset: input.dataset ?? null,
|
|
95
|
-
gateDecision: input.gateDecision ?? null,
|
|
96
|
-
summary: renderSummary(input.target, status, metrics, issues)
|
|
50
|
+
mean,
|
|
51
|
+
lower: bootstrapMeans[lowerIdx],
|
|
52
|
+
upper: bootstrapMeans[Math.min(upperIdx, B - 1)]
|
|
97
53
|
};
|
|
98
54
|
}
|
|
99
|
-
function
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
}
|
|
111
|
-
|
|
112
|
-
if (candidateId) return traces.filter((t) => t.candidateId === void 0 || t.candidateId === candidateId);
|
|
113
|
-
if (baselineId) return traces.filter((t) => t.candidateId === void 0 || t.candidateId !== baselineId);
|
|
114
|
-
return [...traces];
|
|
115
|
-
}
|
|
116
|
-
function checkCorpus(input, thresholds, metrics, issues) {
|
|
117
|
-
if (thresholds.requireCorpus && !input.dataset && (input.scenarios?.length ?? 0) === 0) {
|
|
118
|
-
issues.push({ axis: "corpus", severity: "critical", code: "missing_corpus", detail: "No Dataset manifest or scenarios supplied." });
|
|
119
|
-
}
|
|
120
|
-
if (metrics.scenarioCount < thresholds.minScenarioCount) {
|
|
121
|
-
issues.push({ axis: "corpus", severity: "critical", code: "few_scenarios", detail: `${metrics.scenarioCount} scenario(s) < min ${thresholds.minScenarioCount}.` });
|
|
122
|
-
}
|
|
123
|
-
if (thresholds.requireHoldout && metrics.splitCounts.holdout === 0) {
|
|
124
|
-
issues.push({ axis: "corpus", severity: "critical", code: "missing_holdout_split", detail: "Corpus has no holdout scenarios." });
|
|
125
|
-
}
|
|
126
|
-
}
|
|
127
|
-
function checkQuality(thresholds, metrics, issues) {
|
|
128
|
-
if (metrics.searchRuns < thresholds.minSearchRuns) {
|
|
129
|
-
issues.push({ axis: "quality", severity: "critical", code: "few_search_runs", detail: `${metrics.searchRuns} search run(s) < min ${thresholds.minSearchRuns}.` });
|
|
130
|
-
}
|
|
131
|
-
if (metrics.passRate < thresholds.minPassRate) {
|
|
132
|
-
issues.push({ axis: "quality", severity: "critical", code: "low_pass_rate", detail: `passRate ${fmt(metrics.passRate)} < ${fmt(thresholds.minPassRate)}.` });
|
|
133
|
-
}
|
|
134
|
-
if (metrics.meanScore < thresholds.minMeanScore) {
|
|
135
|
-
issues.push({ axis: "quality", severity: "critical", code: "low_mean_score", detail: `meanScore ${fmt(metrics.meanScore)} < ${fmt(thresholds.minMeanScore)}.` });
|
|
136
|
-
}
|
|
137
|
-
}
|
|
138
|
-
function checkGeneralization(gateDecision, thresholds, metrics, issues) {
|
|
139
|
-
if (thresholds.requireHoldout && metrics.holdoutRuns < thresholds.minHoldoutRuns) {
|
|
140
|
-
issues.push({ axis: "generalization", severity: "critical", code: "few_holdout_runs", detail: `${metrics.holdoutRuns} holdout run(s) < min ${thresholds.minHoldoutRuns}.` });
|
|
141
|
-
}
|
|
142
|
-
if (Number.isFinite(metrics.overfitGap) && metrics.overfitGap > thresholds.maxOverfitGap) {
|
|
143
|
-
issues.push({ axis: "generalization", severity: "critical", code: "overfit_gap", detail: `search-holdout gap ${fmt(metrics.overfitGap)} > ${fmt(thresholds.maxOverfitGap)}.` });
|
|
144
|
-
}
|
|
145
|
-
if (gateDecision && !gateDecision.promote) {
|
|
146
|
-
issues.push({ axis: "generalization", severity: "critical", code: `gate_${gateDecision.rejectionCode ?? "reject"}`, detail: gateDecision.reason });
|
|
55
|
+
function interRaterReliability(judgeScores) {
|
|
56
|
+
if (judgeScores.length < 2) return 1;
|
|
57
|
+
const dimensionMap = /* @__PURE__ */ new Map();
|
|
58
|
+
for (const judgeSet of judgeScores) {
|
|
59
|
+
for (const s of judgeSet) {
|
|
60
|
+
if (!dimensionMap.has(s.dimension)) dimensionMap.set(s.dimension, []);
|
|
61
|
+
const arr = dimensionMap.get(s.dimension);
|
|
62
|
+
if (arr.length === 0 || arr[arr.length - 1].length >= judgeScores.length) {
|
|
63
|
+
arr.push([s.score]);
|
|
64
|
+
} else {
|
|
65
|
+
arr[arr.length - 1].push(s.score);
|
|
66
|
+
}
|
|
67
|
+
}
|
|
147
68
|
}
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
69
|
+
const allValues = [];
|
|
70
|
+
const pairDiffs = [];
|
|
71
|
+
for (const items of dimensionMap.values()) {
|
|
72
|
+
for (const ratings of items) {
|
|
73
|
+
if (ratings.length < 2) continue;
|
|
74
|
+
for (const v of ratings) allValues.push(v);
|
|
75
|
+
for (let i = 0; i < ratings.length; i++) {
|
|
76
|
+
for (let j = i + 1; j < ratings.length; j++) {
|
|
77
|
+
pairDiffs.push((ratings[i] - ratings[j]) ** 2);
|
|
78
|
+
}
|
|
79
|
+
}
|
|
80
|
+
}
|
|
158
81
|
}
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
82
|
+
if (pairDiffs.length === 0 || allValues.length < 2) return 1;
|
|
83
|
+
const observedDisagreement = pairDiffs.reduce((a, b) => a + b, 0) / pairDiffs.length;
|
|
84
|
+
let expectedDisagreement = 0;
|
|
85
|
+
let expectedCount = 0;
|
|
86
|
+
for (let i = 0; i < allValues.length; i++) {
|
|
87
|
+
for (let j = i + 1; j < allValues.length; j++) {
|
|
88
|
+
expectedDisagreement += (allValues[i] - allValues[j]) ** 2;
|
|
89
|
+
expectedCount++;
|
|
90
|
+
}
|
|
163
91
|
}
|
|
164
|
-
|
|
165
|
-
|
|
92
|
+
expectedDisagreement = expectedCount > 0 ? expectedDisagreement / expectedCount : 0;
|
|
93
|
+
if (expectedDisagreement === 0) return 1;
|
|
94
|
+
return 1 - observedDisagreement / expectedDisagreement;
|
|
95
|
+
}
|
|
96
|
+
function mannWhitneyU(a, b) {
|
|
97
|
+
if (a.length === 0 || b.length === 0) return { u: 0, p: 1 };
|
|
98
|
+
const n1 = a.length;
|
|
99
|
+
const n2 = b.length;
|
|
100
|
+
const combined = [
|
|
101
|
+
...a.map((v) => ({ v, group: "a" })),
|
|
102
|
+
...b.map((v) => ({ v, group: "b" }))
|
|
103
|
+
].sort((x, y) => x.v - y.v);
|
|
104
|
+
const ranks = new Array(combined.length);
|
|
105
|
+
let i = 0;
|
|
106
|
+
while (i < combined.length) {
|
|
107
|
+
let j = i;
|
|
108
|
+
while (j < combined.length && combined[j].v === combined[i].v) j++;
|
|
109
|
+
const avgRank = (i + 1 + j) / 2;
|
|
110
|
+
for (let k = i; k < j; k++) ranks[k] = avgRank;
|
|
111
|
+
i = j;
|
|
112
|
+
}
|
|
113
|
+
let r1 = 0;
|
|
114
|
+
for (let k = 0; k < combined.length; k++) {
|
|
115
|
+
if (combined[k].group === "a") r1 += ranks[k];
|
|
116
|
+
}
|
|
117
|
+
const u1 = r1 - n1 * (n1 + 1) / 2;
|
|
118
|
+
const u2 = n1 * n2 - u1;
|
|
119
|
+
const u = Math.min(u1, u2);
|
|
120
|
+
const mu = n1 * n2 / 2;
|
|
121
|
+
const sigma = Math.sqrt(n1 * n2 * (n1 + n2 + 1) / 12);
|
|
122
|
+
if (sigma === 0) return { u, p: 1 };
|
|
123
|
+
const z = Math.abs(u - mu) / sigma;
|
|
124
|
+
const p = 2 * (1 - normalCdf(z));
|
|
125
|
+
return { u, p };
|
|
126
|
+
}
|
|
127
|
+
function partialCredit(current, target) {
|
|
128
|
+
if (target <= 0) return 1;
|
|
129
|
+
return Math.min(1, Math.max(0, current / target));
|
|
130
|
+
}
|
|
131
|
+
function pairedTTest(before, after) {
|
|
132
|
+
if (before.length !== after.length) {
|
|
133
|
+
throw new Error(`pairedTTest: unequal sample sizes (${before.length} vs ${after.length})`);
|
|
166
134
|
}
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
135
|
+
const n = before.length;
|
|
136
|
+
if (n < 2) return { t: 0, df: 0, p: 1 };
|
|
137
|
+
const diffs = before.map((b, i) => after[i] - b);
|
|
138
|
+
const mean = diffs.reduce((a, b) => a + b, 0) / n;
|
|
139
|
+
const variance = diffs.reduce((acc, d) => acc + (d - mean) ** 2, 0) / (n - 1);
|
|
140
|
+
const se = Math.sqrt(variance / n);
|
|
141
|
+
if (se === 0) return { t: mean === 0 ? 0 : Infinity, df: n - 1, p: mean === 0 ? 1 : 0 };
|
|
142
|
+
const t = mean / se;
|
|
143
|
+
const df = n - 1;
|
|
144
|
+
const p = 2 * (1 - studentTCdf(Math.abs(t), df));
|
|
145
|
+
return { t, df, p };
|
|
146
|
+
}
|
|
147
|
+
function wilcoxonSignedRank(before, after) {
|
|
148
|
+
if (before.length !== after.length) {
|
|
149
|
+
throw new Error(`wilcoxonSignedRank: unequal sample sizes (${before.length} vs ${after.length})`);
|
|
150
|
+
}
|
|
151
|
+
const diffs = before.map((b, i2) => after[i2] - b).filter((d) => d !== 0);
|
|
152
|
+
const n = diffs.length;
|
|
153
|
+
if (n < 6) return { w: 0, p: 1 };
|
|
154
|
+
const absRanks = diffs.map((d, i2) => ({ abs: Math.abs(d), sign: Math.sign(d), i: i2 })).sort((a, b) => a.abs - b.abs);
|
|
155
|
+
const ranks = new Array(n);
|
|
156
|
+
let i = 0;
|
|
157
|
+
while (i < n) {
|
|
158
|
+
let j = i;
|
|
159
|
+
while (j < n && absRanks[j].abs === absRanks[i].abs) j++;
|
|
160
|
+
const avg2 = (i + 1 + j) / 2;
|
|
161
|
+
for (let k = i; k < j; k++) ranks[absRanks[k].i] = avg2;
|
|
162
|
+
i = j;
|
|
163
|
+
}
|
|
164
|
+
let wPlus = 0;
|
|
165
|
+
for (let k = 0; k < n; k++) if (diffs[k] > 0) wPlus += ranks[k];
|
|
166
|
+
const mean = n * (n + 1) / 4;
|
|
167
|
+
const variance = n * (n + 1) * (2 * n + 1) / 24;
|
|
168
|
+
const z = (wPlus - mean) / Math.sqrt(variance);
|
|
169
|
+
const p = 2 * (1 - normalCdf(Math.abs(z)));
|
|
170
|
+
return { w: wPlus, p };
|
|
171
|
+
}
|
|
172
|
+
function cohensD(a, b) {
|
|
173
|
+
if (a.length < 2 || b.length < 2) return 0;
|
|
174
|
+
const meanA = a.reduce((x, y) => x + y, 0) / a.length;
|
|
175
|
+
const meanB = b.reduce((x, y) => x + y, 0) / b.length;
|
|
176
|
+
const varA = a.reduce((acc, x) => acc + (x - meanA) ** 2, 0) / (a.length - 1);
|
|
177
|
+
const varB = b.reduce((acc, x) => acc + (x - meanB) ** 2, 0) / (b.length - 1);
|
|
178
|
+
const pooled = Math.sqrt(
|
|
179
|
+
((a.length - 1) * varA + (b.length - 1) * varB) / (a.length + b.length - 2)
|
|
180
|
+
);
|
|
181
|
+
if (pooled === 0) return 0;
|
|
182
|
+
return (meanB - meanA) / pooled;
|
|
183
|
+
}
|
|
184
|
+
function studentTCdf(t, df) {
|
|
185
|
+
if (df <= 0) return 0.5;
|
|
186
|
+
if (df > 100) return normalCdf(t);
|
|
187
|
+
const x = df / (df + t * t);
|
|
188
|
+
const a = df / 2;
|
|
189
|
+
const b = 0.5;
|
|
190
|
+
const ib = incompleteBeta(x, a, b);
|
|
191
|
+
return t >= 0 ? 1 - 0.5 * ib : 0.5 * ib;
|
|
192
|
+
}
|
|
193
|
+
function incompleteBeta(x, a, b) {
|
|
194
|
+
if (x <= 0) return 0;
|
|
195
|
+
if (x >= 1) return 1;
|
|
196
|
+
const lnBeta = lnGamma(a) + lnGamma(b) - lnGamma(a + b);
|
|
197
|
+
const front = Math.exp(Math.log(x) * a + Math.log(1 - x) * b - lnBeta) / a;
|
|
198
|
+
const maxIter = 200;
|
|
199
|
+
const eps = 3e-7;
|
|
200
|
+
let c = 1;
|
|
201
|
+
let d = 1 - (a + b) * x / (a + 1);
|
|
202
|
+
if (Math.abs(d) < 1e-30) d = 1e-30;
|
|
203
|
+
d = 1 / d;
|
|
204
|
+
let f = d;
|
|
205
|
+
for (let m = 1; m <= maxIter; m++) {
|
|
206
|
+
const m2 = 2 * m;
|
|
207
|
+
let num = m * (b - m) * x / ((a + m2 - 1) * (a + m2));
|
|
208
|
+
d = 1 + num * d;
|
|
209
|
+
if (Math.abs(d) < 1e-30) d = 1e-30;
|
|
210
|
+
c = 1 + num / c;
|
|
211
|
+
if (Math.abs(c) < 1e-30) c = 1e-30;
|
|
212
|
+
d = 1 / d;
|
|
213
|
+
f *= d * c;
|
|
214
|
+
num = -((a + m) * (a + b + m) * x) / ((a + m2) * (a + m2 + 1));
|
|
215
|
+
d = 1 + num * d;
|
|
216
|
+
if (Math.abs(d) < 1e-30) d = 1e-30;
|
|
217
|
+
c = 1 + num / c;
|
|
218
|
+
if (Math.abs(c) < 1e-30) c = 1e-30;
|
|
219
|
+
d = 1 / d;
|
|
220
|
+
const delta = d * c;
|
|
221
|
+
f *= delta;
|
|
222
|
+
if (Math.abs(delta - 1) < eps) break;
|
|
223
|
+
}
|
|
224
|
+
return front * f;
|
|
225
|
+
}
|
|
226
|
+
function lnGamma(z) {
|
|
227
|
+
const g = 7;
|
|
228
|
+
const coefs = [
|
|
229
|
+
0.9999999999998099,
|
|
230
|
+
676.5203681218851,
|
|
231
|
+
-1259.1392167224028,
|
|
232
|
+
771.3234287776531,
|
|
233
|
+
-176.6150291621406,
|
|
234
|
+
12.507343278686905,
|
|
235
|
+
-0.13857109526572012,
|
|
236
|
+
9984369578019572e-21,
|
|
237
|
+
15056327351493116e-23
|
|
175
238
|
];
|
|
239
|
+
if (z < 0.5) {
|
|
240
|
+
return Math.log(Math.PI / Math.sin(Math.PI * z)) - lnGamma(1 - z);
|
|
241
|
+
}
|
|
242
|
+
z -= 1;
|
|
243
|
+
let x = coefs[0];
|
|
244
|
+
for (let i = 1; i < g + 2; i++) x += coefs[i] / (z + i);
|
|
245
|
+
const t = z + g + 0.5;
|
|
246
|
+
return 0.5 * Math.log(2 * Math.PI) + (z + 0.5) * Math.log(t) - t + Math.log(x);
|
|
247
|
+
}
|
|
248
|
+
function normalCdf(x) {
|
|
249
|
+
const a1 = 0.254829592;
|
|
250
|
+
const a2 = -0.284496736;
|
|
251
|
+
const a3 = 1.421413741;
|
|
252
|
+
const a4 = -1.453152027;
|
|
253
|
+
const a5 = 1.061405429;
|
|
254
|
+
const p = 0.3275911;
|
|
255
|
+
const sign = x < 0 ? -1 : 1;
|
|
256
|
+
const absX = Math.abs(x);
|
|
257
|
+
const t = 1 / (1 + p * absX);
|
|
258
|
+
const y = 1 - ((((a5 * t + a4) * t + a3) * t + a2) * t + a1) * t * Math.exp(-absX * absX / 2);
|
|
259
|
+
return 0.5 * (1 + sign * y);
|
|
176
260
|
}
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
const
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
261
|
+
|
|
262
|
+
// src/power-analysis.ts
|
|
263
|
+
function requiredSampleSize(opts) {
|
|
264
|
+
const effect = opts.effect;
|
|
265
|
+
if (!Number.isFinite(effect) || effect <= 0) return Infinity;
|
|
266
|
+
const alpha = opts.alpha ?? 0.05;
|
|
267
|
+
const power = opts.power ?? 0.8;
|
|
268
|
+
const twoSided = opts.twoSided ?? true;
|
|
269
|
+
const zAlpha = zQuantile(twoSided ? 1 - alpha / 2 : 1 - alpha);
|
|
270
|
+
const zBeta = zQuantile(power);
|
|
271
|
+
const n = 2 * Math.pow((zAlpha + zBeta) / effect, 2);
|
|
272
|
+
return Math.ceil(n);
|
|
273
|
+
}
|
|
274
|
+
function pairedMde(opts) {
|
|
275
|
+
if (!Number.isFinite(opts.nPaired) || opts.nPaired <= 0) return Infinity;
|
|
276
|
+
const alpha = opts.alpha ?? 0.05;
|
|
277
|
+
const power = opts.power ?? 0.8;
|
|
278
|
+
const twoSided = opts.twoSided ?? true;
|
|
279
|
+
const zAlpha = zQuantile(twoSided ? 1 - alpha / 2 : 1 - alpha);
|
|
280
|
+
const zBeta = zQuantile(power);
|
|
281
|
+
return (zAlpha + zBeta) / Math.sqrt(opts.nPaired);
|
|
282
|
+
}
|
|
283
|
+
function bonferroni(pValues, alpha = 0.05) {
|
|
284
|
+
const k = pValues.length;
|
|
285
|
+
const adjusted = pValues.map((p) => Math.min(1, p * k));
|
|
286
|
+
const significant = adjusted.map((p) => p < alpha);
|
|
287
|
+
return { adjusted, significant };
|
|
288
|
+
}
|
|
289
|
+
function benjaminiHochberg(pValues, fdr = 0.05) {
|
|
290
|
+
const n = pValues.length;
|
|
291
|
+
if (n === 0) return { qValues: [], significant: [] };
|
|
292
|
+
const indexed = pValues.map((p, i) => ({ p, i })).sort((a, b) => a.p - b.p);
|
|
293
|
+
const q = new Array(n);
|
|
294
|
+
let minRight = 1;
|
|
295
|
+
for (let k = n - 1; k >= 0; k--) {
|
|
296
|
+
const rank = k + 1;
|
|
297
|
+
const raw = indexed[k].p * n / rank;
|
|
298
|
+
const bounded = Math.min(minRight, raw);
|
|
299
|
+
minRight = bounded;
|
|
300
|
+
q[indexed[k].i] = Math.min(1, bounded);
|
|
301
|
+
}
|
|
302
|
+
const significant = q.map((v) => v < fdr);
|
|
303
|
+
return { qValues: q, significant };
|
|
304
|
+
}
|
|
305
|
+
function zQuantile(p) {
|
|
306
|
+
if (p <= 0 || p >= 1) {
|
|
307
|
+
if (p === 0) return -Infinity;
|
|
308
|
+
if (p === 1) return Infinity;
|
|
309
|
+
return NaN;
|
|
310
|
+
}
|
|
311
|
+
const a = [-39.69683028665376, 220.9460984245205, -275.9285104469687, 138.357751867269, -30.66479806614716, 2.506628277459239];
|
|
312
|
+
const b = [-54.47609879822406, 161.5858368580409, -155.6989798598866, 66.80131188771972, -13.28068155288572];
|
|
313
|
+
const c = [-0.007784894002430293, -0.3223964580411365, -2.400758277161838, -2.549732539343734, 4.374664141464968, 2.938163982698783];
|
|
314
|
+
const d = [0.007784695709041462, 0.3224671290700398, 2.445134137142996, 3.754408661907416];
|
|
315
|
+
const pLow = 0.02425;
|
|
316
|
+
const pHigh = 1 - pLow;
|
|
317
|
+
let q;
|
|
318
|
+
let r;
|
|
319
|
+
if (p < pLow) {
|
|
320
|
+
q = Math.sqrt(-2 * Math.log(p));
|
|
321
|
+
return (((((c[0] * q + c[1]) * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5]) / ((((d[0] * q + d[1]) * q + d[2]) * q + d[3]) * q + 1);
|
|
322
|
+
}
|
|
323
|
+
if (p <= pHigh) {
|
|
324
|
+
q = p - 0.5;
|
|
325
|
+
r = q * q;
|
|
326
|
+
return (((((a[0] * r + a[1]) * r + a[2]) * r + a[3]) * r + a[4]) * r + a[5]) * q / (((((b[0] * r + b[1]) * r + b[2]) * r + b[3]) * r + b[4]) * r + 1);
|
|
327
|
+
}
|
|
328
|
+
q = Math.sqrt(-2 * Math.log(1 - p));
|
|
329
|
+
return -(((((c[0] * q + c[1]) * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5]) / ((((d[0] * q + d[1]) * q + d[2]) * q + d[3]) * q + 1);
|
|
194
330
|
}
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
}
|
|
331
|
+
|
|
332
|
+
// src/paired-stats.ts
|
|
333
|
+
function pairedBootstrap(before, after, opts = {}) {
|
|
334
|
+
if (before.length !== after.length) {
|
|
335
|
+
throw new Error(
|
|
336
|
+
`pairedBootstrap: unequal sample sizes (${before.length} vs ${after.length})`
|
|
337
|
+
);
|
|
203
338
|
}
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
}
|
|
339
|
+
const confidence = opts.confidence ?? 0.95;
|
|
340
|
+
const resamples = opts.resamples ?? 2e3;
|
|
341
|
+
const statistic = opts.statistic ?? "median";
|
|
342
|
+
if (confidence <= 0 || confidence >= 1) {
|
|
343
|
+
throw new Error(`pairedBootstrap: confidence must be in (0,1), got ${confidence}`);
|
|
209
344
|
}
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
for (const trace of traces) {
|
|
215
|
-
for (const asi of trace.asi ?? []) {
|
|
216
|
-
const surface = asi.responsibleSurface ?? "unknown";
|
|
217
|
-
out[surface] = (out[surface] ?? 0) + 1;
|
|
218
|
-
}
|
|
345
|
+
const n = before.length;
|
|
346
|
+
const deltas = before.map((b, i) => after[i] - b);
|
|
347
|
+
if (n === 0) {
|
|
348
|
+
return { n: 0, median: 0, mean: 0, low: 0, high: 0, confidence, resamples };
|
|
219
349
|
}
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
const out = [];
|
|
224
|
-
for (const run of runs) {
|
|
225
|
-
const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
|
|
226
|
-
if (run.failureMode || score !== void 0 && score < threshold) {
|
|
227
|
-
const asiMetric = run.outcome.raw.asi;
|
|
228
|
-
out.push({ hasAsi: typeof asiMetric === "number" && asiMetric > 0 });
|
|
229
|
-
}
|
|
350
|
+
if (n === 1) {
|
|
351
|
+
const d = deltas[0];
|
|
352
|
+
return { n: 1, median: d, mean: d, low: d, high: d, confidence, resamples };
|
|
230
353
|
}
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
354
|
+
const rng = makeRng(opts.seed);
|
|
355
|
+
const samples = new Array(resamples);
|
|
356
|
+
for (let b = 0; b < resamples; b++) {
|
|
357
|
+
let acc = null;
|
|
358
|
+
if (statistic === "mean") {
|
|
359
|
+
let sum = 0;
|
|
360
|
+
for (let k = 0; k < n; k++) {
|
|
361
|
+
sum += deltas[Math.floor(rng() * n)];
|
|
362
|
+
}
|
|
363
|
+
samples[b] = sum / n;
|
|
364
|
+
} else {
|
|
365
|
+
acc = new Array(n);
|
|
366
|
+
for (let k = 0; k < n; k++) {
|
|
367
|
+
acc[k] = deltas[Math.floor(rng() * n)];
|
|
368
|
+
}
|
|
369
|
+
samples[b] = medianInPlace(acc);
|
|
234
370
|
}
|
|
235
371
|
}
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
const
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
}
|
|
249
|
-
function scoresFor(runs, split) {
|
|
250
|
-
return runs.filter((run) => run.splitTag === split).map((run) => split === "holdout" ? run.outcome.holdoutScore : run.outcome.searchScore).filter(isFiniteNumber);
|
|
251
|
-
}
|
|
252
|
-
function mean(xs) {
|
|
253
|
-
if (xs.length === 0) return Number.NaN;
|
|
254
|
-
return xs.reduce((sum, x) => sum + x, 0) / xs.length;
|
|
255
|
-
}
|
|
256
|
-
function percentile(xs, p) {
|
|
257
|
-
if (xs.length === 0) return Number.NaN;
|
|
258
|
-
const sorted = [...xs].sort((a, b) => a - b);
|
|
259
|
-
return sorted[Math.min(sorted.length - 1, Math.max(0, Math.ceil(p * sorted.length) - 1))];
|
|
260
|
-
}
|
|
261
|
-
function isFiniteNumber(value) {
|
|
262
|
-
return typeof value === "number" && Number.isFinite(value);
|
|
263
|
-
}
|
|
264
|
-
function safeDiff(a, b) {
|
|
265
|
-
if (!Number.isFinite(a) || !Number.isFinite(b)) return Number.NaN;
|
|
266
|
-
return a - b;
|
|
267
|
-
}
|
|
268
|
-
function gapScore(gap, maxGap) {
|
|
269
|
-
if (!Number.isFinite(gap)) return 0;
|
|
270
|
-
if (maxGap <= 0) return gap <= 0 ? 1 : 0;
|
|
271
|
-
return bounded(1 - Math.max(0, gap) / maxGap);
|
|
272
|
-
}
|
|
273
|
-
function efficiencyScore(metrics, thresholds) {
|
|
274
|
-
const cost = Number.isFinite(thresholds.maxMeanCostUsd) && Number.isFinite(metrics.meanCostUsd) ? bounded(thresholds.maxMeanCostUsd / Math.max(metrics.meanCostUsd, 1e-12)) : 1;
|
|
275
|
-
const latency = Number.isFinite(thresholds.maxP95WallMs) && Number.isFinite(metrics.p95WallMs) ? bounded(thresholds.maxP95WallMs / Math.max(metrics.p95WallMs, 1e-12)) : 1;
|
|
276
|
-
return Math.min(cost, latency);
|
|
277
|
-
}
|
|
278
|
-
function bounded(x) {
|
|
279
|
-
if (!Number.isFinite(x)) return 0;
|
|
280
|
-
return Math.max(0, Math.min(1, x));
|
|
281
|
-
}
|
|
282
|
-
function renderSummary(target, status, metrics, issues) {
|
|
283
|
-
const prefix = `release confidence ${status}: ${target}`;
|
|
284
|
-
const metricText = `scenarios=${metrics.scenarioCount} searchRuns=${metrics.searchRuns} holdoutRuns=${metrics.holdoutRuns} passRate=${fmt(metrics.passRate)} meanScore=${fmt(metrics.meanScore)}`;
|
|
285
|
-
if (issues.length === 0) return `${prefix}; ${metricText}`;
|
|
286
|
-
return `${prefix}; ${metricText}; issues=${issues.map((i) => i.code).join(",")}`;
|
|
287
|
-
}
|
|
288
|
-
function fmt(x) {
|
|
289
|
-
if (!Number.isFinite(x)) return String(x);
|
|
290
|
-
return x.toFixed(4);
|
|
291
|
-
}
|
|
292
|
-
|
|
293
|
-
// src/pre-registration.ts
|
|
294
|
-
function canonicalize(v) {
|
|
295
|
-
if (v === null || typeof v !== "object") return v;
|
|
296
|
-
if (Array.isArray(v)) return v.map(canonicalize);
|
|
297
|
-
const keys = Object.keys(v).sort();
|
|
298
|
-
const out = {};
|
|
299
|
-
for (const k of keys) out[k] = canonicalize(v[k]);
|
|
300
|
-
return out;
|
|
372
|
+
samples.sort((a, b) => a - b);
|
|
373
|
+
const alpha = 1 - confidence;
|
|
374
|
+
const lowIdx = Math.floor(alpha / 2 * resamples);
|
|
375
|
+
const highIdx = Math.min(resamples - 1, Math.ceil((1 - alpha / 2) * resamples) - 1);
|
|
376
|
+
return {
|
|
377
|
+
n,
|
|
378
|
+
median: medianInPlace([...deltas]),
|
|
379
|
+
mean: deltas.reduce((s, x) => s + x, 0) / n,
|
|
380
|
+
low: samples[lowIdx],
|
|
381
|
+
high: samples[Math.max(highIdx, lowIdx)],
|
|
382
|
+
confidence,
|
|
383
|
+
resamples
|
|
384
|
+
};
|
|
301
385
|
}
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
const bytes = new TextEncoder().encode(JSON.stringify(canonical));
|
|
305
|
-
const digest = await globalThis.crypto.subtle.digest("SHA-256", bytes);
|
|
306
|
-
return Array.from(new Uint8Array(digest)).map((b) => b.toString(16).padStart(2, "0")).join("");
|
|
386
|
+
function pairedWilcoxon(before, after) {
|
|
387
|
+
return wilcoxonSignedRank(before, after);
|
|
307
388
|
}
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
return { ...m, contentHash: hash, algo: "sha256-content" };
|
|
389
|
+
function bhAdjust(pValues, fdr = 0.05) {
|
|
390
|
+
return benjaminiHochberg(pValues, fdr);
|
|
311
391
|
}
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
const
|
|
316
|
-
return
|
|
392
|
+
function medianInPlace(xs) {
|
|
393
|
+
if (xs.length === 0) return 0;
|
|
394
|
+
xs.sort((a, b) => a - b);
|
|
395
|
+
const mid = Math.floor(xs.length / 2);
|
|
396
|
+
return xs.length % 2 === 0 ? (xs[mid - 1] + xs[mid]) / 2 : xs[mid];
|
|
317
397
|
}
|
|
318
|
-
|
|
319
|
-
if (
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
if (observed.n < manifest.preRegisteredN) reasons.push("undersampled");
|
|
328
|
-
return {
|
|
329
|
-
manifest,
|
|
330
|
-
observedN: observed.n,
|
|
331
|
-
observedEffect: observed.effect,
|
|
332
|
-
observedPValue: observed.pValue,
|
|
333
|
-
confirmed: reasons.length === 0,
|
|
334
|
-
rejectionReasons: reasons
|
|
398
|
+
function makeRng(seed) {
|
|
399
|
+
if (seed === void 0) return Math.random;
|
|
400
|
+
let s = seed | 0 || 2654435769;
|
|
401
|
+
return () => {
|
|
402
|
+
s = s + 1831565813 | 0;
|
|
403
|
+
let t = s;
|
|
404
|
+
t = Math.imul(t ^ t >>> 15, t | 1);
|
|
405
|
+
t ^= t + Math.imul(t ^ t >>> 7, t | 61);
|
|
406
|
+
return ((t ^ t >>> 14) >>> 0) / 4294967296;
|
|
335
407
|
};
|
|
336
408
|
}
|
|
337
409
|
|
|
@@ -428,10 +500,10 @@ function renderSummaryTableMarkdown(rows, comparator, split) {
|
|
|
428
500
|
lines.push("| Candidate | N | Mean | 95% CI | q (BH) | Cohen's d |");
|
|
429
501
|
lines.push("|---|---:|---:|---|---:|---:|");
|
|
430
502
|
for (const r of rows) {
|
|
431
|
-
const ci = `[${
|
|
503
|
+
const ci = `[${fmt(r.ciLow)}, ${fmt(r.ciHigh)}]`;
|
|
432
504
|
const q = Number.isFinite(r.qValue) ? r.qValue.toFixed(4) : "\u2014";
|
|
433
505
|
const d = Number.isFinite(r.cohensD) ? r.cohensD.toFixed(3) : "\u2014";
|
|
434
|
-
lines.push(`| ${r.candidateId} | ${r.n} | ${
|
|
506
|
+
lines.push(`| ${r.candidateId} | ${r.n} | ${fmt(r.mean)} | ${ci} | ${q} | ${d} |`);
|
|
435
507
|
}
|
|
436
508
|
return lines.join("\n");
|
|
437
509
|
}
|
|
@@ -595,10 +667,10 @@ function seedRng(seed) {
|
|
|
595
667
|
return ((t ^ t >>> 14) >>> 0) / 4294967296;
|
|
596
668
|
};
|
|
597
669
|
}
|
|
598
|
-
function stdev(xs,
|
|
670
|
+
function stdev(xs, mean) {
|
|
599
671
|
if (xs.length < 2) return 0;
|
|
600
672
|
let sse = 0;
|
|
601
|
-
for (const x of xs) sse += (x -
|
|
673
|
+
for (const x of xs) sse += (x - mean) ** 2;
|
|
602
674
|
return Math.sqrt(sse / (xs.length - 1));
|
|
603
675
|
}
|
|
604
676
|
async function researchReport(runs, opts = {}) {
|
|
@@ -780,7 +852,7 @@ function buildMethodology(ctx) {
|
|
|
780
852
|
return { assumptions, methods, alternatives, whenNotToApply, citations };
|
|
781
853
|
}
|
|
782
854
|
function formatRope(rope) {
|
|
783
|
-
return `[${
|
|
855
|
+
return `[${fmt(rope.low)}, ${fmt(rope.high)}]`;
|
|
784
856
|
}
|
|
785
857
|
function classifyCandidate(row, ctx) {
|
|
786
858
|
if (ctx.comparator && row.candidateId === ctx.comparator) {
|
|
@@ -805,30 +877,30 @@ function classifyCandidate(row, ctx) {
|
|
|
805
877
|
if (ctx.rope && ci.low >= ctx.rope.low && ci.high <= ctx.rope.high) {
|
|
806
878
|
return {
|
|
807
879
|
decision: "equivalent",
|
|
808
|
-
reason: `Paired-delta CI [${
|
|
880
|
+
reason: `Paired-delta CI [${fmt(ci.low)}, ${fmt(ci.high)}] is fully inside ROPE ${formatRope(ctx.rope)}; candidate is practically equivalent to comparator.`
|
|
809
881
|
};
|
|
810
882
|
}
|
|
811
883
|
const significant = Number.isFinite(row.qValue) && row.qValue <= ctx.fdr;
|
|
812
884
|
const gainPositive = ci.low > 0;
|
|
813
885
|
const gainNegative = ci.high < 0;
|
|
814
886
|
if (gainNegative) {
|
|
815
|
-
return { decision: "reject", reason: `Paired-delta CI [${
|
|
887
|
+
return { decision: "reject", reason: `Paired-delta CI [${fmt(ci.low)}, ${fmt(ci.high)}] lies entirely below zero.` };
|
|
816
888
|
}
|
|
817
889
|
if (ctx.posterior.n < ctx.minPairs) {
|
|
818
890
|
return {
|
|
819
891
|
decision: "needs_more_data",
|
|
820
|
-
reason: `Only ${ctx.posterior.n} paired observations; minimum detectable effect at this N is ${
|
|
892
|
+
reason: `Only ${ctx.posterior.n} paired observations; minimum detectable effect at this N is ${fmt(ctx.posterior.mde)} score units (need \u2265 ${ctx.minPairs} pairs to issue a directional verdict).`
|
|
821
893
|
};
|
|
822
894
|
}
|
|
823
895
|
if (significant && gainPositive) {
|
|
824
896
|
return {
|
|
825
897
|
decision: "promote",
|
|
826
|
-
reason: `BH-adjusted q=${
|
|
898
|
+
reason: `BH-adjusted q=${fmt(row.qValue)} \u2264 ${ctx.fdr} and paired-delta CI [${fmt(ci.low)}, ${fmt(ci.high)}] excludes zero; Pr(\u0394>0)=${fmt(ctx.posterior.prGreaterThanZero)}.`
|
|
827
899
|
};
|
|
828
900
|
}
|
|
829
901
|
return {
|
|
830
902
|
decision: "hold",
|
|
831
|
-
reason: `Pr(\u0394>0)=${
|
|
903
|
+
reason: `Pr(\u0394>0)=${fmt(ctx.posterior.prGreaterThanZero)} but CI [${fmt(ci.low)}, ${fmt(ci.high)}] crosses zero; effect not decisive at fdr=${ctx.fdr}.`
|
|
832
904
|
};
|
|
833
905
|
}
|
|
834
906
|
function buildRecommendation(candidates, ctx) {
|
|
@@ -843,11 +915,11 @@ function buildRecommendation(candidates, ctx) {
|
|
|
843
915
|
if (chosen) {
|
|
844
916
|
rationale.push(`${chosen.candidateId}: ${chosen.decisionReason}`);
|
|
845
917
|
if (chosen.gainCi) {
|
|
846
|
-
const probSummary = chosen.prGreaterThanZero !== null ? `, Pr(\u0394>0)=${
|
|
847
|
-
rationale.push(`Median paired gain CI: [${
|
|
918
|
+
const probSummary = chosen.prGreaterThanZero !== null ? `, Pr(\u0394>0)=${fmt(chosen.prGreaterThanZero)}` : "";
|
|
919
|
+
rationale.push(`Median paired gain CI: [${fmt(chosen.gainCi.low)}, ${fmt(chosen.gainCi.high)}]${probSummary}.`);
|
|
848
920
|
}
|
|
849
921
|
if (chosen.mde !== null && Number.isFinite(chosen.mde)) {
|
|
850
|
-
rationale.push(`MDE at current paired N=${chosen.pairedN}: ${
|
|
922
|
+
rationale.push(`MDE at current paired N=${chosen.pairedN}: ${fmt(chosen.mde)} score units.`);
|
|
851
923
|
}
|
|
852
924
|
}
|
|
853
925
|
if (!ctx.comparator) {
|
|
@@ -956,9 +1028,9 @@ function renderResearchMarkdown(report) {
|
|
|
956
1028
|
const prGt = c.prGreaterThanZero === null ? "-" : c.prGreaterThanZero.toFixed(3);
|
|
957
1029
|
const q = Number.isFinite(c.qValue) ? c.qValue.toFixed(4) : "-";
|
|
958
1030
|
const d = Number.isFinite(c.cohensD) ? c.cohensD.toFixed(3) : "-";
|
|
959
|
-
const gain = c.gainCi ? `[${
|
|
960
|
-
const mde = c.mde === null || !Number.isFinite(c.mde) ? "-" :
|
|
961
|
-
lines.push(`| ${c.candidateId} | ${c.decision} | ${
|
|
1031
|
+
const gain = c.gainCi ? `[${fmt(c.gainCi.low)}, ${fmt(c.gainCi.high)}]` : "-";
|
|
1032
|
+
const mde = c.mde === null || !Number.isFinite(c.mde) ? "-" : fmt(c.mde);
|
|
1033
|
+
lines.push(`| ${c.candidateId} | ${c.decision} | ${fmt(c.mean)} | ${delta} | ${prGt} | ${q} | ${d} | ${c.pairedN} | ${gain} | ${mde} | ${c.onParetoFrontier ? "yes" : "no"} | ${c.gate ?? "-"} |`);
|
|
962
1034
|
}
|
|
963
1035
|
lines.push("");
|
|
964
1036
|
lines.push("## Statistical Summary");
|
|
@@ -1113,7 +1185,7 @@ function decisionWeight(decision) {
|
|
|
1113
1185
|
return 1;
|
|
1114
1186
|
}
|
|
1115
1187
|
function signed(x) {
|
|
1116
|
-
return `${x >= 0 ? "+" : ""}${
|
|
1188
|
+
return `${x >= 0 ? "+" : ""}${fmt(x)}`;
|
|
1117
1189
|
}
|
|
1118
1190
|
function avg(xs) {
|
|
1119
1191
|
if (xs.length === 0) return Number.NaN;
|
|
@@ -1124,226 +1196,31 @@ function medianOfSorted(sorted) {
|
|
|
1124
1196
|
const mid = Math.floor(sorted.length / 2);
|
|
1125
1197
|
return sorted.length % 2 === 0 ? (sorted[mid - 1] + sorted[mid]) / 2 : sorted[mid];
|
|
1126
1198
|
}
|
|
1127
|
-
function
|
|
1199
|
+
function fmt(x) {
|
|
1128
1200
|
if (!Number.isFinite(x)) return String(x);
|
|
1129
1201
|
return x.toFixed(4);
|
|
1130
1202
|
}
|
|
1131
1203
|
|
|
1132
|
-
// src/release-report.ts
|
|
1133
|
-
function renderReleaseReport(scorecard, options = {}) {
|
|
1134
|
-
const title = options.title ?? `Release Report: ${scorecard.target}`;
|
|
1135
|
-
const lines = [];
|
|
1136
|
-
lines.push(`# ${title}`);
|
|
1137
|
-
lines.push("");
|
|
1138
|
-
lines.push(`Status: **${scorecard.status.toUpperCase()}**`);
|
|
1139
|
-
lines.push(`Promote: **${scorecard.promote ? "yes" : "no"}**`);
|
|
1140
|
-
if (scorecard.candidateId) lines.push(`Candidate: \`${scorecard.candidateId}\``);
|
|
1141
|
-
if (scorecard.baselineId) lines.push(`Baseline: \`${scorecard.baselineId}\``);
|
|
1142
|
-
lines.push("");
|
|
1143
|
-
lines.push(scorecard.summary);
|
|
1144
|
-
lines.push("");
|
|
1145
|
-
lines.push("## Metrics");
|
|
1146
|
-
lines.push("");
|
|
1147
|
-
lines.push("| Metric | Value |");
|
|
1148
|
-
lines.push("|---|---:|");
|
|
1149
|
-
lines.push(`| Scenarios | ${scorecard.metrics.scenarioCount} |`);
|
|
1150
|
-
lines.push(`| Search runs | ${scorecard.metrics.searchRuns} |`);
|
|
1151
|
-
lines.push(`| Holdout runs | ${scorecard.metrics.holdoutRuns} |`);
|
|
1152
|
-
lines.push(`| Pass rate | ${pct(scorecard.metrics.passRate)} |`);
|
|
1153
|
-
lines.push(`| Mean score | ${num(scorecard.metrics.meanScore)} |`);
|
|
1154
|
-
lines.push(`| Search mean | ${num(scorecard.metrics.searchMeanScore)} |`);
|
|
1155
|
-
lines.push(`| Holdout mean | ${num(scorecard.metrics.holdoutMeanScore)} |`);
|
|
1156
|
-
lines.push(`| Overfit gap | ${num(scorecard.metrics.overfitGap)} |`);
|
|
1157
|
-
lines.push(`| Mean cost | $${num(scorecard.metrics.meanCostUsd)} |`);
|
|
1158
|
-
lines.push(`| p95 wall time | ${Math.round(scorecard.metrics.p95WallMs)} ms |`);
|
|
1159
|
-
lines.push("");
|
|
1160
|
-
if (scorecard.issues.length > 0) {
|
|
1161
|
-
lines.push("## Issues");
|
|
1162
|
-
lines.push("");
|
|
1163
|
-
for (const issue of scorecard.issues) {
|
|
1164
|
-
lines.push(`- **${issue.severity}** \`${issue.code}\` (${issue.axis}): ${issue.detail}`);
|
|
1165
|
-
}
|
|
1166
|
-
lines.push("");
|
|
1167
|
-
}
|
|
1168
|
-
const surfaces = entries(scorecard.metrics.responsibleSurfaceCounts);
|
|
1169
|
-
if (surfaces.length > 0) {
|
|
1170
|
-
lines.push("## Responsible Surfaces");
|
|
1171
|
-
lines.push("");
|
|
1172
|
-
for (const [surface, count] of surfaces) lines.push(`- ${surface}: ${count}`);
|
|
1173
|
-
lines.push("");
|
|
1174
|
-
}
|
|
1175
|
-
const failures = entries(scorecard.metrics.failureModeCounts);
|
|
1176
|
-
if (failures.length > 0) {
|
|
1177
|
-
lines.push("## Failure Modes");
|
|
1178
|
-
lines.push("");
|
|
1179
|
-
for (const [mode, count] of failures) lines.push(`- ${mode}: ${count}`);
|
|
1180
|
-
lines.push("");
|
|
1181
|
-
}
|
|
1182
|
-
if (options.runs && options.runs.length > 0) {
|
|
1183
|
-
lines.push("## Run Summary");
|
|
1184
|
-
lines.push("");
|
|
1185
|
-
lines.push(summaryTable([...options.runs], {
|
|
1186
|
-
comparator: options.comparator ?? scorecard.baselineId ?? void 0,
|
|
1187
|
-
split: "holdout"
|
|
1188
|
-
}).markdown);
|
|
1189
|
-
lines.push("");
|
|
1190
|
-
}
|
|
1191
|
-
if (options.traceAnalystFindings && options.traceAnalystFindings.length > 0) {
|
|
1192
|
-
lines.push("## TraceAnalyst Findings");
|
|
1193
|
-
lines.push("");
|
|
1194
|
-
for (const finding of options.traceAnalystFindings) lines.push(`- ${finding}`);
|
|
1195
|
-
lines.push("");
|
|
1196
|
-
}
|
|
1197
|
-
const nextActions = options.nextActions ?? defaultNextActions(scorecard);
|
|
1198
|
-
if (nextActions.length > 0) {
|
|
1199
|
-
lines.push("## Next Actions");
|
|
1200
|
-
lines.push("");
|
|
1201
|
-
for (const action of nextActions) lines.push(`- ${action}`);
|
|
1202
|
-
lines.push("");
|
|
1203
|
-
}
|
|
1204
|
-
return lines.join("\n").trimEnd() + "\n";
|
|
1205
|
-
}
|
|
1206
|
-
function defaultNextActions(scorecard) {
|
|
1207
|
-
if (scorecard.promote) return ["Promote the candidate and keep canaries enabled."];
|
|
1208
|
-
return scorecard.issues.filter((issue) => issue.severity === "critical").map((issue) => `Resolve ${issue.code}: ${issue.detail}`);
|
|
1209
|
-
}
|
|
1210
|
-
function entries(values) {
|
|
1211
|
-
return Object.entries(values).filter(([, count]) => count > 0).sort((a, b) => b[1] - a[1] || a[0].localeCompare(b[0]));
|
|
1212
|
-
}
|
|
1213
|
-
function pct(value) {
|
|
1214
|
-
return Number.isFinite(value) ? `${(value * 100).toFixed(1)}%` : "n/a";
|
|
1215
|
-
}
|
|
1216
|
-
function num(value) {
|
|
1217
|
-
return Number.isFinite(value) ? value.toFixed(3) : "n/a";
|
|
1218
|
-
}
|
|
1219
|
-
|
|
1220
|
-
// src/promotion-gate.ts
|
|
1221
|
-
function bootstrapCi(baseline, candidate, options = {}) {
|
|
1222
|
-
const alpha = options.alpha ?? 0.05;
|
|
1223
|
-
const iterations = options.iterations ?? 1e3;
|
|
1224
|
-
const minTotal = options.minTotalSamples ?? 6;
|
|
1225
|
-
const rng = mulberry32(options.seed ?? hashSeed(baseline, candidate));
|
|
1226
|
-
const baselineMean = mean2(baseline);
|
|
1227
|
-
const candidateMean = mean2(candidate);
|
|
1228
|
-
const delta = candidateMean - baselineMean;
|
|
1229
|
-
if (baseline.length + candidate.length < minTotal || baseline.length === 0 || candidate.length === 0) {
|
|
1230
|
-
return {
|
|
1231
|
-
baselineMean,
|
|
1232
|
-
candidateMean,
|
|
1233
|
-
delta,
|
|
1234
|
-
ciLower: -Infinity,
|
|
1235
|
-
ciUpper: Infinity,
|
|
1236
|
-
iterations: 0,
|
|
1237
|
-
alpha,
|
|
1238
|
-
verdict: "INCONCLUSIVE"
|
|
1239
|
-
};
|
|
1240
|
-
}
|
|
1241
|
-
const deltas = new Array(iterations);
|
|
1242
|
-
for (let i = 0; i < iterations; i++) {
|
|
1243
|
-
const bResample = resample(baseline, rng);
|
|
1244
|
-
const cResample = resample(candidate, rng);
|
|
1245
|
-
deltas[i] = mean2(cResample) - mean2(bResample);
|
|
1246
|
-
}
|
|
1247
|
-
deltas.sort((a, b) => a - b);
|
|
1248
|
-
const lowerIdx = Math.floor(alpha / 2 * iterations);
|
|
1249
|
-
const upperIdx = Math.floor((1 - alpha / 2) * iterations) - 1;
|
|
1250
|
-
const ciLower = deltas[Math.max(0, lowerIdx)];
|
|
1251
|
-
const ciUpper = deltas[Math.min(iterations - 1, upperIdx)];
|
|
1252
|
-
let verdict;
|
|
1253
|
-
if (ciLower > 0) verdict = "ADVANCE";
|
|
1254
|
-
else if (ciUpper < 0) verdict = "REVERT";
|
|
1255
|
-
else if (delta >= 0) verdict = "KEEP";
|
|
1256
|
-
else verdict = "INCONCLUSIVE";
|
|
1257
|
-
return {
|
|
1258
|
-
baselineMean,
|
|
1259
|
-
candidateMean,
|
|
1260
|
-
delta,
|
|
1261
|
-
ciLower,
|
|
1262
|
-
ciUpper,
|
|
1263
|
-
iterations,
|
|
1264
|
-
alpha,
|
|
1265
|
-
verdict
|
|
1266
|
-
};
|
|
1267
|
-
}
|
|
1268
|
-
function mean2(xs) {
|
|
1269
|
-
if (xs.length === 0) return 0;
|
|
1270
|
-
let s = 0;
|
|
1271
|
-
for (const x of xs) s += x;
|
|
1272
|
-
return s / xs.length;
|
|
1273
|
-
}
|
|
1274
|
-
function resample(xs, rng) {
|
|
1275
|
-
const out = new Array(xs.length);
|
|
1276
|
-
for (let i = 0; i < xs.length; i++) out[i] = xs[Math.floor(rng() * xs.length)];
|
|
1277
|
-
return out;
|
|
1278
|
-
}
|
|
1279
|
-
function mulberry32(seed) {
|
|
1280
|
-
let t = seed >>> 0;
|
|
1281
|
-
return () => {
|
|
1282
|
-
t += 1831565813;
|
|
1283
|
-
let r = t;
|
|
1284
|
-
r = Math.imul(r ^ r >>> 15, r | 1);
|
|
1285
|
-
r ^= r + Math.imul(r ^ r >>> 7, r | 61);
|
|
1286
|
-
return ((r ^ r >>> 14) >>> 0) / 4294967296;
|
|
1287
|
-
};
|
|
1288
|
-
}
|
|
1289
|
-
function hashSeed(a, b) {
|
|
1290
|
-
let h = 2166136261;
|
|
1291
|
-
for (const x of [...a, ...b]) {
|
|
1292
|
-
const view = new Float64Array([x]);
|
|
1293
|
-
const bytes = new Uint8Array(view.buffer);
|
|
1294
|
-
for (const byte of bytes) {
|
|
1295
|
-
h ^= byte;
|
|
1296
|
-
h = Math.imul(h, 16777619);
|
|
1297
|
-
}
|
|
1298
|
-
}
|
|
1299
|
-
return h >>> 0;
|
|
1300
|
-
}
|
|
1301
|
-
async function judgeReplayGate(args) {
|
|
1302
|
-
const concurrency = args.judgeConcurrency ?? 4;
|
|
1303
|
-
const baselineScores = await scoreAll(args.baselineOutputs, args.judge, concurrency);
|
|
1304
|
-
const candidateScores = await scoreAll(args.candidateOutputs, args.judge, concurrency);
|
|
1305
|
-
const ci = bootstrapCi(baselineScores, candidateScores, {
|
|
1306
|
-
...args.alpha !== void 0 ? { alpha: args.alpha } : {},
|
|
1307
|
-
...args.iterations !== void 0 ? { iterations: args.iterations } : {},
|
|
1308
|
-
...args.seed !== void 0 ? { seed: args.seed } : {}
|
|
1309
|
-
});
|
|
1310
|
-
return {
|
|
1311
|
-
...ci,
|
|
1312
|
-
baselineSamples: baselineScores.length,
|
|
1313
|
-
candidateSamples: candidateScores.length
|
|
1314
|
-
};
|
|
1315
|
-
}
|
|
1316
|
-
async function scoreAll(outputs, judge, concurrency) {
|
|
1317
|
-
const results = new Array(outputs.length);
|
|
1318
|
-
let next = 0;
|
|
1319
|
-
async function worker() {
|
|
1320
|
-
while (true) {
|
|
1321
|
-
const i = next++;
|
|
1322
|
-
if (i >= outputs.length) return;
|
|
1323
|
-
const v = await judge(outputs[i]);
|
|
1324
|
-
results[i] = Number.isFinite(v) ? v : 0;
|
|
1325
|
-
}
|
|
1326
|
-
}
|
|
1327
|
-
await Promise.all(Array.from({ length: Math.max(1, concurrency) }, () => worker()));
|
|
1328
|
-
return results;
|
|
1329
|
-
}
|
|
1330
|
-
|
|
1331
1204
|
export {
|
|
1332
|
-
|
|
1333
|
-
|
|
1334
|
-
|
|
1335
|
-
|
|
1336
|
-
|
|
1337
|
-
|
|
1338
|
-
|
|
1339
|
-
|
|
1205
|
+
normalizeScores,
|
|
1206
|
+
weightedMean,
|
|
1207
|
+
confidenceInterval,
|
|
1208
|
+
interRaterReliability,
|
|
1209
|
+
mannWhitneyU,
|
|
1210
|
+
partialCredit,
|
|
1211
|
+
pairedTTest,
|
|
1212
|
+
wilcoxonSignedRank,
|
|
1213
|
+
cohensD,
|
|
1214
|
+
requiredSampleSize,
|
|
1215
|
+
bonferroni,
|
|
1216
|
+
benjaminiHochberg,
|
|
1217
|
+
pairedBootstrap,
|
|
1218
|
+
pairedWilcoxon,
|
|
1219
|
+
bhAdjust,
|
|
1340
1220
|
summaryTable,
|
|
1341
1221
|
paretoChart,
|
|
1342
1222
|
gainHistogram,
|
|
1343
1223
|
RESEARCH_REPORT_HARD_PAIR_FLOOR,
|
|
1344
|
-
researchReport
|
|
1345
|
-
renderReleaseReport,
|
|
1346
|
-
bootstrapCi,
|
|
1347
|
-
judgeReplayGate
|
|
1224
|
+
researchReport
|
|
1348
1225
|
};
|
|
1349
|
-
//# sourceMappingURL=chunk-
|
|
1226
|
+
//# sourceMappingURL=chunk-IOXMGMHQ.js.map
|