@tangle-network/agent-eval 0.21.0 → 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +102 -1
- package/README.md +4 -0
- package/dist/{chunk-WOK2RTWG.js → chunk-4W4NCYM2.js} +134 -109
- package/dist/chunk-4W4NCYM2.js.map +1 -0
- package/dist/{chunk-WOPGKVN4.js → chunk-6KQG5HAH.js} +2 -2
- package/dist/chunk-6M774GY6.js +53 -0
- package/dist/chunk-6M774GY6.js.map +1 -0
- package/dist/{chunk-3IX6QTB7.js → chunk-IOXMGMHQ.js} +418 -541
- package/dist/chunk-IOXMGMHQ.js.map +1 -0
- package/dist/{chunk-3GN6U53I.js → chunk-KAO3Q65R.js} +2 -2
- package/dist/chunk-QUKKGHTZ.js +121 -0
- package/dist/chunk-QUKKGHTZ.js.map +1 -0
- package/dist/{chunk-SNUHRBDL.js → chunk-SQQLHODJ.js} +10 -1
- package/dist/{chunk-SNUHRBDL.js.map → chunk-SQQLHODJ.js.map} +1 -1
- package/dist/chunk-UAND2LOT.js +738 -0
- package/dist/chunk-UAND2LOT.js.map +1 -0
- package/dist/{chunk-HRZELXCR.js → chunk-USHQBPMH.js} +283 -7
- package/dist/chunk-USHQBPMH.js.map +1 -0
- package/dist/cli.js +3 -3
- package/dist/index.d.ts +10 -284
- package/dist/index.js +39 -19
- package/dist/index.js.map +1 -1
- package/dist/integrity-K2oVlF57.d.ts +210 -0
- package/dist/openapi.json +1 -1
- package/dist/optimization-UVDNKaO6.d.ts +574 -0
- package/dist/optimization.d.ts +6 -144
- package/dist/optimization.js +9 -2
- package/dist/reporting-B82RSv9C.d.ts +593 -0
- package/dist/reporting.d.ts +2 -2
- package/dist/reporting.js +15 -8
- package/dist/{multi-shot-optimization-Bvtz294B.d.ts → summary-report-D4p7RlDu.d.ts} +381 -1
- package/dist/traces.d.ts +101 -181
- package/dist/traces.js +16 -5
- package/dist/wire/index.js +3 -3
- package/docs/research-report-methodology.md +19 -4
- package/docs/wire-protocol.md +1 -1
- package/package.json +2 -2
- package/dist/chunk-3IX6QTB7.js.map +0 -1
- package/dist/chunk-HRZELXCR.js.map +0 -1
- package/dist/chunk-KRR4VMH7.js +0 -423
- package/dist/chunk-KRR4VMH7.js.map +0 -1
- package/dist/chunk-WOK2RTWG.js.map +0 -1
- package/dist/reporting-Da2ihlcM.d.ts +0 -672
- /package/dist/{chunk-WOPGKVN4.js.map → chunk-6KQG5HAH.js.map} +0 -0
- /package/dist/{chunk-3GN6U53I.js.map → chunk-KAO3Q65R.js.map} +0 -0
|
@@ -0,0 +1,738 @@
|
|
|
1
|
+
import {
|
|
2
|
+
summaryTable
|
|
3
|
+
} from "./chunk-IOXMGMHQ.js";
|
|
4
|
+
|
|
5
|
+
// src/release-confidence.ts
|
|
6
|
+
/**
 * Baseline release-gate thresholds. `evaluateReleaseConfidence` spreads
 * caller-supplied `input.thresholds` over this object, so every key here is
 * the value used when the caller does not override it.
 */
var DEFAULT_THRESHOLDS = {
  // Corpus requirements.
  requireCorpus: true,
  minScenarioCount: 1,
  // Minimum number of runs per split.
  minSearchRuns: 1,
  minHoldoutRuns: 1,
  requireHoldout: true,
  // Quality floors.
  minPassRate: 0.8,
  minMeanScore: 0.7,
  // Largest tolerated (search mean - holdout mean).
  maxOverfitGap: 0.15,
  // Budgets are unbounded unless the caller sets them.
  maxMeanCostUsd: Number.POSITIVE_INFINITY,
  maxP95WallMs: Number.POSITIVE_INFINITY,
  // Diagnostics: failed rows must carry ASI; a score below this is a failure.
  requireAsiForFailures: true,
  failureScoreThreshold: 0.5
};
|
|
20
|
+
/**
 * Converts multi-shot trial records into the trace-evidence rows consumed by
 * the release-confidence evaluation.
 *
 * @param {Array<object>} trials - Trial records; the fields read are
 *   scenarioId, variantId, split, score, ok, trace.turns, cost, durationMs,
 *   error, asi and metadata.
 * @returns {Array<object>} One evidence row per trial, in input order.
 */
function releaseTraceEvidenceFromMultiShotTrials(trials) {
  return trials.map((trial) => {
    // Only "holdout" and "dev" pass through; every other split becomes "search".
    let split = "search";
    if (trial.split === "holdout" || trial.split === "dev") {
      split = trial.split;
    }
    const turns = trial.trace?.turns;
    return {
      scenarioId: trial.scenarioId,
      candidateId: trial.variantId,
      split,
      score: trial.score,
      ok: trial.ok,
      turnCount: Array.isArray(turns) ? turns.length : undefined,
      costUsd: trial.cost,
      durationMs: trial.durationMs,
      // Any truthy `error` is reported as a runtime error.
      failureMode: trial.error ? "runtime_error" : undefined,
      asi: trial.asi,
      metadata: trial.metadata
    };
  });
}
|
|
35
|
+
/**
 * Builds a release-confidence scorecard for one candidate.
 *
 * Runs and traces are first narrowed to the candidate (or to "everything but
 * the baseline" when only a baseline id is given). Metrics are then derived,
 * the corpus / quality / generalization / diagnostics / efficiency checks
 * append issues, and the overall status is "fail" when any issue is critical,
 * "warn" when there are only non-critical issues, and "pass" otherwise.
 *
 * @param {object} input - Fields read: target, candidateId, baselineId,
 *   thresholds, runs, traces, scenarios, dataset, gateDecision.
 * @returns {object} Scorecard with target, candidateId, baselineId, status,
 *   promote, axes, issues, metrics, dataset, gateDecision and summary.
 */
function evaluateReleaseConfidence(input) {
  const thresholds = { ...DEFAULT_THRESHOLDS, ...input.thresholds };
  const candidateId = input.candidateId ?? null;
  const gateDecision = input.gateDecision ?? null;
  const runs = filterCandidate(input.runs ?? [], candidateId, input.baselineId);
  const traces = filterTraceCandidate(input.traces ?? [], candidateId, input.baselineId);
  const scenarios = input.scenarios ?? [];
  // A dataset manifest, when present, takes precedence over counting scenarios.
  const scenarioCount = input.dataset?.scenarioCount ?? scenarios.length;
  const splitCounts = input.dataset?.splitCounts ?? countScenarioSplits(scenarios);
  const searchScores = scoresFor(runs, "search");
  const holdoutScores = scoresFor(runs, "holdout");
  const allScores = [...searchScores, ...holdoutScores];
  const traceScores = traces.map((t) => t.score).filter(isFiniteNumber);
  // Trace scores are used for the mean only when no run produced a score.
  const scoreUniverse = allScores.length > 0 ? allScores : traceScores;
  const searchRuns = runs.filter((r) => r.splitTag === "search").length;
  const holdoutRuns = runs.filter((r) => r.splitTag === "holdout").length;
  const searchMeanScore = mean(searchScores);
  const holdoutMeanScore = mean(holdoutScores);
  // Computed once and reused for both failure metrics below.
  const failures = failedRows(runs, traces, thresholds.failureScoreThreshold);
  const metrics = {
    scenarioCount,
    searchRuns,
    holdoutRuns,
    passRate: passRate(runs, traces, thresholds.failureScoreThreshold),
    meanScore: mean(scoreUniverse),
    searchMeanScore,
    holdoutMeanScore,
    overfitGap: safeDiff(searchMeanScore, holdoutMeanScore),
    meanCostUsd: mean([...runs.map((r) => r.costUsd), ...traces.map((t) => t.costUsd).filter(isFiniteNumber)]),
    p95WallMs: percentile([...runs.map((r) => r.wallMs), ...traces.map((t) => t.durationMs).filter(isFiniteNumber)], 0.95),
    failedRows: failures.length,
    failuresWithAsi: failures.filter((row) => row.hasAsi).length,
    singleShotTraces: traces.filter((t) => t.turnCount === 1).length,
    multiShotTraces: traces.filter((t) => (t.turnCount ?? 0) > 1).length,
    splitCounts,
    domainCounts: countDomains(scenarios),
    failureModeCounts: countFailureModes(runs, traces, thresholds.failureScoreThreshold),
    responsibleSurfaceCounts: countResponsibleSurfaces(traces)
  };
  const issues = [];
  checkCorpus(input, thresholds, metrics, issues);
  checkQuality(thresholds, metrics, issues);
  checkGeneralization(gateDecision, thresholds, metrics, issues);
  checkDiagnostics(thresholds, metrics, issues);
  checkEfficiency(thresholds, metrics, issues);
  const axes = buildAxes(metrics, thresholds, gateDecision, issues);
  const status = issues.some((i) => i.severity === "critical") ? "fail" : issues.length > 0 ? "warn" : "pass";
  return {
    target: input.target,
    candidateId,
    baselineId: input.baselineId ?? null,
    status,
    // Without a gate decision, a passing scorecard alone is enough to promote.
    promote: status === "pass" && (gateDecision ? gateDecision.promote : true),
    axes,
    issues,
    metrics,
    dataset: input.dataset ?? null,
    gateDecision,
    summary: renderSummary(input.target, status, metrics, issues)
  };
}
|
|
94
|
+
/**
 * Evaluates release confidence and throws when the scorecard status is "fail".
 *
 * @param {object} input - Passed unchanged to `evaluateReleaseConfidence`.
 * @returns {object} The scorecard, when its status is not "fail".
 * @throws {Error} With the scorecard summary as its message on "fail".
 */
function assertReleaseConfidence(input) {
  const result = evaluateReleaseConfidence(input);
  if (result.status !== "fail") {
    return result;
  }
  throw new Error(result.summary);
}
|
|
101
|
+
/**
 * Narrows runs to the candidate under evaluation.
 *
 * @param {Array<object>} runs - Run records carrying a `candidateId`.
 * @param {?string} candidateId - When truthy, keep only runs with this id.
 * @param {?string} baselineId - Used only when `candidateId` is falsy: drop
 *   runs belonging to this baseline.
 * @returns {Array<object>} A new array; the input is never returned as-is.
 */
function filterCandidate(runs, candidateId, baselineId) {
  if (!candidateId && !baselineId) {
    return [...runs];
  }
  const keep = candidateId
    ? (run) => run.candidateId === candidateId
    : (run) => run.candidateId !== baselineId;
  return runs.filter(keep);
}
|
|
106
|
+
/**
 * Narrows traces to the candidate under evaluation. Traces whose
 * `candidateId` is `undefined` are always kept.
 *
 * @param {Array<object>} traces - Trace evidence rows.
 * @param {?string} candidateId - When truthy, keep traces with this id.
 * @param {?string} baselineId - Used only when `candidateId` is falsy: drop
 *   traces belonging to this baseline.
 * @returns {Array<object>} A new array; the input is never returned as-is.
 */
function filterTraceCandidate(traces, candidateId, baselineId) {
  if (!candidateId && !baselineId) {
    return [...traces];
  }
  return traces.filter((trace) => {
    if (trace.candidateId === undefined) {
      return true;
    }
    return candidateId ? trace.candidateId === candidateId : trace.candidateId !== baselineId;
  });
}
|
|
111
|
+
/**
 * Appends critical "corpus" issues: no corpus supplied, too few scenarios,
 * or no holdout scenarios.
 *
 * @param {object} input - Evaluation input; `dataset` and `scenarios` are read.
 * @param {object} thresholds - Effective thresholds.
 * @param {object} metrics - Reads `scenarioCount` and `splitCounts.holdout`.
 * @param {Array<object>} issues - Mutated in place.
 */
function checkCorpus(input, thresholds, metrics, issues) {
  const report = (code, detail) => {
    issues.push({ axis: "corpus", severity: "critical", code, detail });
  };
  const noCorpusSupplied = !input.dataset && (input.scenarios?.length ?? 0) === 0;
  if (thresholds.requireCorpus && noCorpusSupplied) {
    report("missing_corpus", "No Dataset manifest or scenarios supplied.");
  }
  if (metrics.scenarioCount < thresholds.minScenarioCount) {
    report("few_scenarios", `${metrics.scenarioCount} scenario(s) < min ${thresholds.minScenarioCount}.`);
  }
  if (thresholds.requireHoldout && metrics.splitCounts.holdout === 0) {
    report("missing_holdout_split", "Corpus has no holdout scenarios.");
  }
}
|
|
122
|
+
/**
 * Appends critical "quality" issues when there are too few search runs, or
 * when pass rate or mean score fall below their minimums. A NaN metric never
 * triggers an issue because `NaN < x` is false.
 *
 * @param {object} thresholds - Reads minSearchRuns, minPassRate, minMeanScore.
 * @param {object} metrics - Reads searchRuns, passRate, meanScore.
 * @param {Array<object>} issues - Mutated in place.
 */
function checkQuality(thresholds, metrics, issues) {
  const report = (code, detail) => {
    issues.push({ axis: "quality", severity: "critical", code, detail });
  };
  const { searchRuns, passRate: rate, meanScore: avg } = metrics;
  if (searchRuns < thresholds.minSearchRuns) {
    report("few_search_runs", `${searchRuns} search run(s) < min ${thresholds.minSearchRuns}.`);
  }
  if (rate < thresholds.minPassRate) {
    report("low_pass_rate", `passRate ${fmt(rate)} < ${fmt(thresholds.minPassRate)}.`);
  }
  if (avg < thresholds.minMeanScore) {
    report("low_mean_score", `meanScore ${fmt(avg)} < ${fmt(thresholds.minMeanScore)}.`);
  }
}
|
|
133
|
+
/**
 * Appends critical "generalization" issues: too few holdout runs, a
 * search-vs-holdout gap above the maximum, or a gate decision that refuses
 * promotion.
 *
 * @param {?object} gateDecision - Optional; reads promote, rejectionCode, reason.
 * @param {object} thresholds - Reads requireHoldout, minHoldoutRuns, maxOverfitGap.
 * @param {object} metrics - Reads holdoutRuns and overfitGap.
 * @param {Array<object>} issues - Mutated in place.
 */
function checkGeneralization(gateDecision, thresholds, metrics, issues) {
  const report = (code, detail) => {
    issues.push({ axis: "generalization", severity: "critical", code, detail });
  };
  const holdoutShort = metrics.holdoutRuns < thresholds.minHoldoutRuns;
  if (thresholds.requireHoldout && holdoutShort) {
    report("few_holdout_runs", `${metrics.holdoutRuns} holdout run(s) < min ${thresholds.minHoldoutRuns}.`);
  }
  // A non-finite gap (one of the two means is missing) is not judged here.
  const gap = metrics.overfitGap;
  if (Number.isFinite(gap) && gap > thresholds.maxOverfitGap) {
    report("overfit_gap", `search-holdout gap ${fmt(gap)} > ${fmt(thresholds.maxOverfitGap)}.`);
  }
  if (gateDecision && !gateDecision.promote) {
    report(`gate_${gateDecision.rejectionCode ?? "reject"}`, gateDecision.reason);
  }
}
|
|
144
|
+
/**
 * Appends a critical "diagnostics" issue when some failed rows carry no
 * actionable side information (ASI). Does nothing when
 * `thresholds.requireAsiForFailures` is falsy.
 *
 * @param {object} thresholds - Reads requireAsiForFailures.
 * @param {object} metrics - Reads failedRows and failuresWithAsi.
 * @param {Array<object>} issues - Mutated in place.
 */
function checkDiagnostics(thresholds, metrics, issues) {
  const { failedRows: failed, failuresWithAsi: explained } = metrics;
  if (thresholds.requireAsiForFailures && failed > explained) {
    const detail = `${failed - explained} failed row(s) have no actionable side information.`;
    issues.push({ axis: "diagnostics", severity: "critical", code: "missing_failure_asi", detail });
  }
}
|
|
155
|
+
/**
 * Appends critical "efficiency" issues when mean cost or p95 wall time
 * exceed their budgets. A NaN metric never triggers an issue.
 *
 * @param {object} thresholds - Reads maxMeanCostUsd and maxP95WallMs.
 * @param {object} metrics - Reads meanCostUsd and p95WallMs.
 * @param {Array<object>} issues - Mutated in place.
 */
function checkEfficiency(thresholds, metrics, issues) {
  const overCost = metrics.meanCostUsd > thresholds.maxMeanCostUsd;
  if (overCost) {
    const detail = `meanCostUsd ${fmt(metrics.meanCostUsd)} > ${fmt(thresholds.maxMeanCostUsd)}.`;
    issues.push({ axis: "efficiency", severity: "critical", code: "cost_budget", detail });
  }
  const overLatency = metrics.p95WallMs > thresholds.maxP95WallMs;
  if (overLatency) {
    const detail = `p95WallMs ${fmt(metrics.p95WallMs)} > ${fmt(thresholds.maxP95WallMs)}.`;
    issues.push({ axis: "efficiency", severity: "critical", code: "latency_budget", detail });
  }
}
|
|
163
|
+
/**
 * Produces the five per-axis summaries (corpus, quality, generalization,
 * diagnostics, efficiency), each with a status derived from `issues`, a
 * score clamped to the 0..1 range, and a human-readable detail string.
 *
 * @param {object} metrics - Scorecard metrics.
 * @param {object} thresholds - Effective thresholds.
 * @param {?object} gateDecision - Optional gate decision; reads `promote`.
 * @param {Array<object>} issues - Issues collected by the checks.
 * @returns {Array<object>} Axis entries in a fixed order.
 */
function buildAxes(metrics, thresholds, gateDecision, issues) {
  const corpusScore = bounded(metrics.scenarioCount / Math.max(1, thresholds.minScenarioCount));
  const qualityScore = Math.min(metrics.passRate, metrics.meanScore);
  // A gate rejection zeroes the generalization score regardless of the gap.
  const gateRejected = gateDecision && !gateDecision.promote;
  const generalizationScore = gateRejected ? 0 : gapScore(metrics.overfitGap, thresholds.maxOverfitGap);
  const diagnosticsScore = metrics.failedRows === 0 ? 1 : metrics.failuresWithAsi / metrics.failedRows;
  const efficiency = efficiencyScore(metrics, thresholds);
  return [
    axis("corpus", issues, corpusScore, `${metrics.scenarioCount} scenarios; holdout=${metrics.splitCounts.holdout}`),
    axis("quality", issues, qualityScore, `passRate=${fmt(metrics.passRate)} meanScore=${fmt(metrics.meanScore)}`),
    axis("generalization", issues, generalizationScore, `holdoutRuns=${metrics.holdoutRuns} overfitGap=${fmt(metrics.overfitGap)}`),
    axis("diagnostics", issues, diagnosticsScore, `failuresWithAsi=${metrics.failuresWithAsi}/${metrics.failedRows}`),
    axis("efficiency", issues, efficiency, `meanCostUsd=${fmt(metrics.meanCostUsd)} p95WallMs=${fmt(metrics.p95WallMs)}`)
  ];
}
|
|
172
|
+
/**
 * Builds one axis entry. The status is "fail" if any issue on this axis is
 * critical, "warn" if the axis has only non-critical issues, else "pass".
 *
 * @param {string} name - Axis name, matched against `issue.axis`.
 * @param {Array<object>} issues - All issues collected so far.
 * @param {number} score - Raw score; passed through `bounded`.
 * @param {string} detail - Human-readable detail.
 * @returns {{name: string, status: string, score: number, detail: string}}
 */
function axis(name, issues, score, detail) {
  const mine = issues.filter((issue) => issue.axis === name);
  let status = "pass";
  if (mine.some((issue) => issue.severity === "critical")) {
    status = "fail";
  } else if (mine.length > 0) {
    status = "warn";
  }
  return { name, status, score: bounded(score), detail };
}
|
|
177
|
+
/**
 * Counts scenarios per split. Scenarios without a `split` count as "train".
 *
 * The four standard splits are always present in the result. A split name
 * outside that set is counted under its own key, starting from zero, instead
 * of turning into NaN through `undefined++`.
 *
 * @param {Array<{split?: string}>} scenarios - Scenario records.
 * @returns {Object<string, number>} Counts keyed by split name.
 */
function countScenarioSplits(scenarios) {
  const counts = { train: 0, dev: 0, test: 0, holdout: 0 };
  for (const scenario of scenarios) {
    const split = scenario.split ?? "train";
    counts[split] = (counts[split] ?? 0) + 1;
  }
  return counts;
}
|
|
182
|
+
/**
 * Counts scenarios per domain. The domain is `tags.domain`, falling back to
 * `tags.category`, then to "uncategorized".
 *
 * @param {Array<{tags?: object}>} scenarios - Scenario records.
 * @returns {Object<string, number>} Counts keyed by domain.
 */
function countDomains(scenarios) {
  const tally = {};
  scenarios.forEach((scenario) => {
    const tags = scenario.tags;
    const key = tags?.domain ?? tags?.category ?? "uncategorized";
    tally[key] = (tally[key] ?? 0) + 1;
  });
  return tally;
}
|
|
190
|
+
/**
 * Tallies failure modes over failed runs and traces.
 *
 * A run is failed when it has a `failureMode`, or when its score (holdout
 * score, else search score) is defined and below `threshold`. A trace is
 * failed when it has a `failureMode`, `ok === false`, or a defined score
 * below `threshold`. Rows with no explicit mode are labelled "not_ok"
 * (traces only) or "low_score".
 *
 * @param {Array<object>} runs - Run records.
 * @param {Array<object>} traces - Trace evidence rows.
 * @param {number} threshold - Failure score threshold.
 * @returns {Object<string, number>} Counts keyed by failure mode.
 */
function countFailureModes(runs, traces, threshold) {
  const tally = {};
  const bump = (mode) => {
    tally[mode] = (tally[mode] ?? 0) + 1;
  };
  const tooLow = (score) => score !== undefined && score < threshold;
  for (const run of runs) {
    const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
    if (run.failureMode || tooLow(score)) {
      bump(run.failureMode ?? "low_score");
    }
  }
  for (const trace of traces) {
    const notOk = trace.ok === false;
    if (trace.failureMode || notOk || tooLow(trace.score)) {
      bump(trace.failureMode ?? (notOk ? "not_ok" : "low_score"));
    }
  }
  return tally;
}
|
|
207
|
+
/**
 * Tallies the `responsibleSurface` of every ASI entry across all traces.
 * Entries without one are counted under "unknown".
 *
 * @param {Array<{asi?: Array<object>}>} traces - Trace evidence rows.
 * @returns {Object<string, number>} Counts keyed by responsible surface.
 */
function countResponsibleSurfaces(traces) {
  const tally = {};
  for (const { asi: entries } of traces) {
    for (const entry of entries ?? []) {
      const key = entry.responsibleSurface ?? "unknown";
      tally[key] = (tally[key] ?? 0) + 1;
    }
  }
  return tally;
}
|
|
217
|
+
/**
 * Lists failed runs and traces as `{ hasAsi }` rows, runs first.
 *
 * A run is failed when it has a `failureMode` or a defined score (holdout,
 * else search) below `threshold`; it has ASI when `outcome.raw.asi` is a
 * number greater than zero. A trace is failed when it has a `failureMode`,
 * `ok === false`, or a defined score below `threshold`; it has ASI when its
 * `asi` array is non-empty.
 *
 * @param {Array<object>} runs - Run records.
 * @param {Array<object>} traces - Trace evidence rows.
 * @param {number} threshold - Failure score threshold.
 * @returns {Array<{hasAsi: boolean}>} One row per failed run or trace.
 */
function failedRows(runs, traces, threshold) {
  const tooLow = (score) => score !== undefined && score < threshold;
  const rows = [];
  runs.forEach((run) => {
    const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
    if (!run.failureMode && !tooLow(score)) {
      return;
    }
    const asiValue = run.outcome.raw.asi;
    rows.push({ hasAsi: typeof asiValue === "number" && asiValue > 0 });
  });
  traces.forEach((trace) => {
    if (!trace.failureMode && trace.ok !== false && !tooLow(trace.score)) {
      return;
    }
    rows.push({ hasAsi: (trace.asi?.length ?? 0) > 0 });
  });
  return rows;
}
|
|
233
|
+
/**
 * Fraction of runs and traces that pass, or 0 when there are none.
 *
 * A run passes when it has no `failureMode` and its score (holdout, else
 * search) is defined and at least `threshold`. A trace passes when it has no
 * `failureMode`, `ok` is not `false`, and its score is either undefined or at
 * least `threshold`.
 *
 * The `failureMode` condition on traces matches how `failedRows` and
 * `countFailureModes` classify the same trace, so a trace is no longer
 * counted as both a pass and a failure in one scorecard.
 *
 * @param {Array<object>} runs - Run records.
 * @param {Array<object>} traces - Trace evidence rows.
 * @param {number} threshold - Failure score threshold.
 * @returns {number} Pass fraction between 0 and 1.
 */
function passRate(runs, traces, threshold) {
  const outcomes = [
    ...runs.map((run) => {
      const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
      return !run.failureMode && score !== void 0 && score >= threshold;
    }),
    ...traces.map((trace) => !trace.failureMode && trace.ok !== false && (trace.score === void 0 || trace.score >= threshold))
  ];
  if (outcomes.length === 0) return 0;
  return outcomes.filter(Boolean).length / outcomes.length;
}
|
|
244
|
+
/**
 * Collects the finite scores of runs tagged with `split`. Holdout runs
 * contribute `outcome.holdoutScore`; any other split contributes
 * `outcome.searchScore`.
 *
 * @param {Array<object>} runs - Run records with `splitTag` and `outcome`.
 * @param {string} split - Split tag to select.
 * @returns {Array<number>} Finite scores in run order.
 */
function scoresFor(runs, split) {
  const pickScore = split === "holdout"
    ? (run) => run.outcome.holdoutScore
    : (run) => run.outcome.searchScore;
  const tagged = runs.filter((run) => run.splitTag === split);
  return tagged.map(pickScore).filter(isFiniteNumber);
}
|
|
247
|
+
/**
 * Arithmetic mean of `xs`, or NaN for an empty array.
 *
 * @param {Array<number>} xs - Values to average.
 * @returns {number} The mean, or NaN when `xs` is empty.
 */
function mean(xs) {
  const n = xs.length;
  if (n === 0) {
    return Number.NaN;
  }
  let total = 0;
  for (const x of xs) {
    total += x;
  }
  return total / n;
}
|
|
251
|
+
/**
 * Nearest-rank percentile: the element at 1-based rank `ceil(p * n)` of the
 * ascending-sorted values, clamped to the valid index range.
 *
 * @param {Array<number>} xs - Values; not mutated.
 * @param {number} p - Percentile as a fraction (for example 0.95).
 * @returns {number} The selected element, or NaN when `xs` is empty.
 */
function percentile(xs, p) {
  if (xs.length === 0) {
    return Number.NaN;
  }
  const ordered = Array.from(xs).sort((lhs, rhs) => lhs - rhs);
  const lastIndex = ordered.length - 1;
  const rankIndex = Math.ceil(p * ordered.length) - 1;
  const index = Math.min(lastIndex, Math.max(0, rankIndex));
  return ordered[index];
}
|
|
256
|
+
/**
 * True only for values of type number that are finite (not NaN or ±Infinity).
 *
 * @param {*} value - Any value.
 * @returns {boolean} Whether `value` is a finite number.
 */
function isFiniteNumber(value) {
  // Number.isFinite never coerces: any non-number input yields false.
  return Number.isFinite(value);
}
|
|
259
|
+
/**
 * Returns `a - b` when both operands are finite numbers, otherwise NaN.
 *
 * @param {number} a - Minuend.
 * @param {number} b - Subtrahend.
 * @returns {number} The difference, or NaN.
 */
function safeDiff(a, b) {
  const bothFinite = Number.isFinite(a) && Number.isFinite(b);
  return bothFinite ? a - b : Number.NaN;
}
|
|
263
|
+
/**
 * Scores an overfit gap against its budget: 1 at or below zero gap, falling
 * linearly to 0 at `maxGap`. A non-finite gap scores 0. With a non-positive
 * budget the result is 1 for a gap at or below zero and 0 otherwise.
 *
 * @param {number} gap - Search mean minus holdout mean.
 * @param {number} maxGap - Largest tolerated gap.
 * @returns {number} Score between 0 and 1.
 */
function gapScore(gap, maxGap) {
  if (!Number.isFinite(gap)) {
    return 0;
  }
  if (maxGap <= 0) {
    return gap <= 0 ? 1 : 0;
  }
  const positiveGap = Math.max(0, gap);
  return bounded(1 - positiveGap / maxGap);
}
|
|
268
|
+
/**
 * Efficiency score: the smaller of the cost and latency budget ratios, each
 * clamped to the 0..1 range. A dimension scores 1 when either its budget or
 * its metric is not finite.
 *
 * @param {object} metrics - Reads meanCostUsd and p95WallMs.
 * @param {object} thresholds - Reads maxMeanCostUsd and maxP95WallMs.
 * @returns {number} Score between 0 and 1.
 */
function efficiencyScore(metrics, thresholds) {
  const ratio = (budget, actual) => {
    if (!Number.isFinite(budget) || !Number.isFinite(actual)) {
      return 1;
    }
    // The floor on the denominator avoids dividing by zero.
    return bounded(budget / Math.max(actual, 1e-12));
  };
  const costRatio = ratio(thresholds.maxMeanCostUsd, metrics.meanCostUsd);
  const latencyRatio = ratio(thresholds.maxP95WallMs, metrics.p95WallMs);
  return Math.min(costRatio, latencyRatio);
}
|
|
273
|
+
/**
 * Clamps `x` to the closed interval [0, 1]. Non-finite input (NaN, ±Infinity,
 * or a non-number) yields 0.
 *
 * @param {number} x - Value to clamp.
 * @returns {number} The clamped value.
 */
function bounded(x) {
  if (!Number.isFinite(x)) {
    return 0;
  }
  const cappedAtOne = Math.min(1, x);
  return Math.max(0, cappedAtOne);
}
|
|
277
|
+
/**
 * Renders the one-line scorecard summary: status and target, headline
 * metrics, and a comma-separated list of issue codes when there are any.
 *
 * @param {string} target - Release target label.
 * @param {string} status - Overall status.
 * @param {object} metrics - Reads scenarioCount, searchRuns, holdoutRuns,
 *   passRate and meanScore.
 * @param {Array<{code: string}>} issues - Collected issues.
 * @returns {string} The summary line.
 */
function renderSummary(target, status, metrics, issues) {
  const parts = [
    `release confidence ${status}: ${target}`,
    `scenarios=${metrics.scenarioCount} searchRuns=${metrics.searchRuns} holdoutRuns=${metrics.holdoutRuns} passRate=${fmt(metrics.passRate)} meanScore=${fmt(metrics.meanScore)}`
  ];
  if (issues.length > 0) {
    parts.push(`issues=${issues.map((issue) => issue.code).join(",")}`);
  }
  return parts.join("; ");
}
|
|
283
|
+
/**
 * Formats a number with four decimal places. Non-finite values are rendered
 * through `String()` (for example "NaN" or "Infinity").
 *
 * @param {number} x - Value to format.
 * @returns {string} Formatted text.
 */
function fmt(x) {
  return Number.isFinite(x) ? x.toFixed(4) : String(x);
}
|
|
287
|
+
|
|
288
|
+
// src/meta-eval/rubric-predictive-validity.ts
|
|
289
|
+
/**
 * Measures how well each rubric score on a run correlates with outcome
 * metrics recorded later for the same run.
 *
 * Outcomes are grouped by `runId`. For every (rubric, outcome metric) pair,
 * the finite rubric value of each run is paired with that run's outcome
 * metric, reduced by `input.reduction` ("latest" by default). Pairs with at
 * least `minSamples` points get Pearson and Spearman coefficients, a
 * bootstrap interval, and a verdict based on |Spearman|: >= 0.7
 * "load_bearing", >= 0.4 "informative", otherwise "decorative".
 *
 * @param {object} input - Fields read: runs, outcomes (an object with an
 *   async `list()`), outcomeMetrics, rubrics, minSamples, reduction,
 *   bootstrapResamples, seed.
 * @returns {Promise<object>} `{ pairs, ranked, joinedSamples, skippedRuns,
 *   rubricsWithoutData }`.
 */
async function rubricPredictiveValidity(input) {
  const minSamples = input.minSamples ?? 8;
  const reduction = input.reduction ?? "latest";
  const resamples = input.bootstrapResamples ?? 500;
  const rng = makeRng(input.seed);
  const outcomes = await input.outcomes.list();
  const outcomesByRun = /* @__PURE__ */ new Map();
  for (const o of outcomes) {
    const arr = outcomesByRun.get(o.runId) ?? [];
    arr.push(o);
    outcomesByRun.set(o.runId, arr);
  }
  // Without an explicit rubric list, use every key seen in any run's raw scores.
  const observedRubrics = /* @__PURE__ */ new Set();
  for (const r of input.runs) {
    for (const k of Object.keys(r.outcome.raw)) observedRubrics.add(k);
  }
  const rubrics = input.rubrics ?? [...observedRubrics];
  const buckets = [];
  // rubric -> (outcome -> bucket). Gives constant-time lookup in the join loop
  // below. Only the first bucket per (rubric, outcome) is indexed.
  const bucketIndex = /* @__PURE__ */ new Map();
  for (const r of rubrics) {
    let perOutcome = bucketIndex.get(r);
    if (!perOutcome) {
      perOutcome = /* @__PURE__ */ new Map();
      bucketIndex.set(r, perOutcome);
    }
    for (const o of input.outcomeMetrics) {
      const bucket = { rubric: r, outcome: o, xs: [], ys: [] };
      buckets.push(bucket);
      if (!perOutcome.has(o)) perOutcome.set(o, bucket);
    }
  }
  let joined = 0;
  let skipped = 0;
  for (const run of input.runs) {
    const os = outcomesByRun.get(run.runId);
    if (!os || os.length === 0) {
      skipped++;
      continue;
    }
    let joinedThisRun = false;
    for (const r of rubrics) {
      const x = run.outcome.raw[r];
      if (typeof x !== "number" || !Number.isFinite(x)) continue;
      const perOutcome = bucketIndex.get(r);
      for (const o of input.outcomeMetrics) {
        const values = os.map((row) => row.metrics[o]).filter((v) => typeof v === "number" && Number.isFinite(v));
        if (values.length === 0) continue;
        const y = reduce(values, os, o, reduction);
        if (y === null) continue;
        const bucket = perOutcome.get(o);
        bucket.xs.push(x);
        bucket.ys.push(y);
        joinedThisRun = true;
      }
    }
    if (joinedThisRun) joined++;
  }
  const pairs = [];
  for (const b of buckets) {
    if (b.xs.length < minSamples) continue;
    const pearson = pearsonR(b.xs, b.ys);
    // Spearman is Pearson computed on tie-averaged ranks.
    const spearman = pearsonR(rankWithTies(b.xs), rankWithTies(b.ys));
    const ci = bootstrapCi(b.xs, b.ys, resamples, rng);
    const verdict = Math.abs(spearman) >= 0.7 ? "load_bearing" : Math.abs(spearman) >= 0.4 ? "informative" : "decorative";
    pairs.push({
      rubric: b.rubric,
      outcome: b.outcome,
      n: b.xs.length,
      pearson,
      spearman,
      ci95: ci,
      verdict
    });
  }
  const byRubric = /* @__PURE__ */ new Map();
  for (const p of pairs) {
    const arr = byRubric.get(p.rubric) ?? [];
    arr.push(p);
    byRubric.set(p.rubric, arr);
  }
  // For each rubric keep its strongest outcome, then order rubrics by strength.
  const ranked = [...byRubric.entries()].map(([rubric, ps]) => {
    const best = ps.reduce((a, b) => Math.abs(b.spearman) > Math.abs(a.spearman) ? b : a);
    return {
      rubric,
      bestOutcome: best.outcome,
      spearman: best.spearman,
      pearson: best.pearson,
      n: best.n,
      verdict: best.verdict
    };
  }).sort((a, b) => Math.abs(b.spearman) - Math.abs(a.spearman));
  const rubricsWithoutData = rubrics.filter((r) => !byRubric.has(r));
  return { pairs, ranked, joinedSamples: joined, skippedRuns: skipped, rubricsWithoutData };
}
|
|
374
|
+
/**
 * Collapses one run's outcome observations for `metric` into a single value.
 *
 * - "mean": arithmetic mean of `values`.
 * - "max": largest of `values`.
 * - anything else ("latest"): the metric from the outcome row with the
 *   greatest `capturedAt` among rows whose metric is a finite number.
 *
 * The "latest" branch only considers finite metrics, matching how the caller
 * builds `values`, so it cannot return NaN or ±Infinity.
 *
 * @param {Array<number>} values - Finite metric values for the run.
 * @param {Array<object>} outcomes - Outcome rows with `metrics` and `capturedAt`.
 * @param {string} metric - Metric key to read.
 * @param {string} kind - Reduction kind.
 * @returns {?number} The reduced value, or null when nothing is available.
 */
function reduce(values, outcomes, metric, kind) {
  if (values.length === 0) return null;
  if (kind === "mean") return values.reduce((s, v) => s + v, 0) / values.length;
  if (kind === "max") return Math.max(...values);
  // The filter returns a fresh array, so sorting it does not touch `outcomes`.
  const sorted = outcomes
    .filter((o) => Number.isFinite(o.metrics[metric]))
    .sort((a, b) => b.capturedAt - a.capturedAt);
  return sorted[0]?.metrics[metric] ?? null;
}
|
|
381
|
+
/**
 * Pearson correlation coefficient of two equal-length series.
 *
 * Returns NaN when the lengths differ or there are fewer than two points.
 * When a series has zero variance the result is 1 if both are constant and
 * 0 if only one is.
 *
 * @param {Array<number>} a - First series.
 * @param {Array<number>} b - Second series.
 * @returns {number} Correlation coefficient, or NaN.
 */
function pearsonR(a, b) {
  const n = a.length;
  if (n !== b.length || n < 2) {
    return Number.NaN;
  }
  const average = (xs) => xs.reduce((acc, v) => acc + v, 0) / xs.length;
  const meanA = average(a);
  const meanB = average(b);
  let cross = 0;
  let sqA = 0;
  let sqB = 0;
  a.forEach((value, idx) => {
    const devA = value - meanA;
    const devB = b[idx] - meanB;
    cross += devA * devB;
    sqA += devA * devA;
    sqB += devB * devB;
  });
  if (sqA === 0 || sqB === 0) {
    return sqA === 0 && sqB === 0 ? 1 : 0;
  }
  return cross / Math.sqrt(sqA * sqB);
}
|
|
396
|
+
/**
 * Assigns 1-based ascending ranks to `xs`, giving tied values the average of
 * the ranks they span.
 *
 * @param {Array<number>} xs - Values to rank; not mutated.
 * @returns {Array<number>} Ranks aligned with the input positions.
 */
function rankWithTies(xs) {
  const order = xs
    .map((value, position) => ({ value, position }))
    .sort((lhs, rhs) => lhs.value - rhs.value);
  const ranks = new Array(xs.length);
  let start = 0;
  while (start < order.length) {
    // Extend `end` across the run of values equal to order[start].
    let end = start;
    while (end + 1 < order.length && order[end + 1].value === order[start].value) {
      end += 1;
    }
    // 1-based ranks start+1 .. end+1 average to (start + end + 2) / 2.
    const shared = (start + end + 2) / 2;
    for (let k = start; k <= end; k += 1) {
      ranks[order[k].position] = shared;
    }
    start = end + 1;
  }
  return ranks;
}
|
|
408
|
+
/**
 * Percentile-bootstrap interval for the Pearson correlation of (xs, ys).
 *
 * Each iteration resamples `n` index-aligned pairs with replacement using
 * `rng` and records the Pearson coefficient when it is finite. The bounds
 * are read from the sorted coefficients at the 2.5% and 97.5% positions.
 *
 * @param {Array<number>} xs - First series.
 * @param {Array<number>} ys - Second series, aligned with `xs`.
 * @param {number} iterations - Number of bootstrap resamples.
 * @param {function(): number} rng - Returns a number in [0, 1).
 * @returns {{low: number, high: number}} Interval; both bounds are NaN when
 *   `n < 3` or no resample produced a finite coefficient.
 */
function bootstrapCi(xs, ys, iterations, rng) {
  const undefinedInterval = () => ({ low: Number.NaN, high: Number.NaN });
  const n = xs.length;
  if (n < 3) {
    return undefinedInterval();
  }
  const estimates = [];
  for (let round = 0; round < iterations; round += 1) {
    const drawX = new Array(n);
    const drawY = new Array(n);
    for (let slot = 0; slot < n; slot += 1) {
      const pick = Math.floor(rng() * n);
      drawX[slot] = xs[pick];
      drawY[slot] = ys[pick];
    }
    const coefficient = pearsonR(drawX, drawY);
    if (Number.isFinite(coefficient)) {
      estimates.push(coefficient);
    }
  }
  estimates.sort((lhs, rhs) => lhs - rhs);
  const count = estimates.length;
  if (count === 0) {
    return undefinedInterval();
  }
  const lowIndex = Math.floor(0.025 * count);
  const highIndex = Math.min(count - 1, Math.floor(0.975 * count));
  return { low: estimates[lowIndex], high: estimates[highIndex] };
}
|
|
430
|
+
/**
 * Returns a random-number source. With no seed this is `Math.random` itself;
 * with a seed it is a deterministic 32-bit integer-mixing generator whose
 * output lies in [0, 1).
 *
 * @param {number} [seed] - Optional seed, coerced to an unsigned 32-bit int.
 * @returns {function(): number} Generator function.
 */
function makeRng(seed) {
  if (seed === undefined) {
    return Math.random;
  }
  let state = seed >>> 0;
  return function next() {
    state = (state + 0x6d2b79f5) >>> 0;
    let mixed = state;
    mixed = Math.imul(mixed ^ (mixed >>> 15), mixed | 1);
    mixed = mixed ^ (mixed + Math.imul(mixed ^ (mixed >>> 7), mixed | 61));
    const unsigned = (mixed ^ (mixed >>> 14)) >>> 0;
    // Divide by 2^32 to map the unsigned 32-bit result into [0, 1).
    return unsigned / 2 ** 32;
  };
}
|
|
441
|
+
|
|
442
|
+
// src/sequential.ts
|
|
443
|
+
/**
 * Runs a sequential test over paired score deltas, emitting one step per
 * delta.
 *
 * For each delta (clipped to [-bound, bound]) a bet `lambda` is chosen from
 * the mean and variance of the *previous* deltas, the running e-value is
 * multiplied by `1 + lambda * delta`, and a confidence interval for the mean
 * is computed via `empiricalBernsteinCs`. Each step's decision is:
 * "equivalent" when the interval lies inside the ROPE; "promote_now" or
 * "reject_now" when the e-value reaches `2 / alpha` and the prior mean is
 * positive or negative; "reject_now" when the interval lies wholly below the
 * ROPE; otherwise "continue".
 *
 * @param {Array<number>} deltas - Paired differences.
 * @param {object} [opts] - Optional: bound (default 1), alpha (default
 *   0.05), initialBetShrinkage (default 0.5), rope `{low, high}`.
 * @returns {{steps: Array<object>, finalDecision: string,
 *   decisionFiredAt: ?number, clipped: boolean}}
 * @throws {Error} When bound <= 0, alpha is outside (0, 1), or rope is
 *   not a finite interval with low <= high.
 */
function pairedEvalueSequence(deltas, opts = {}) {
  const bound = opts.bound ?? 1;
  const alpha = opts.alpha ?? 0.05;
  const baseShrink = opts.initialBetShrinkage ?? 0.5;
  const rope = opts.rope ?? null;
  if (bound <= 0) {
    throw new Error("pairedEvalueSequence: bound must be > 0");
  }
  if (alpha <= 0 || alpha >= 1) {
    throw new Error("pairedEvalueSequence: alpha must be in (0,1)");
  }
  if (rope) {
    const wellFormed = Number.isFinite(rope.low) && Number.isFinite(rope.high) && rope.low <= rope.high;
    if (!wellFormed) {
      throw new Error("pairedEvalueSequence: rope must satisfy low \u2264 high");
    }
  }
  const steps = [];
  let clipped = false;
  let evalue = 1;
  let decisionFiredAt = null;
  // Running totals over the deltas processed so far.
  let total = 0;
  let totalSq = 0;
  let seen = 0;
  for (let t = 1; t <= deltas.length; t++) {
    let delta = deltas[t - 1];
    if (delta < -bound || delta > bound) {
      delta = Math.max(-bound, Math.min(bound, delta));
      clipped = true;
    }
    // The bet uses statistics from before this delta.
    const priorMean = seen === 0 ? 0 : total / seen;
    const priorVar = seen === 0 ? bound * bound : Math.max(1e-12, totalSq / seen - priorMean * priorMean);
    // Shrinkage ramps from 0 to its full value over the first 32 observations.
    const shrink = baseShrink * Math.min(1, seen / 32);
    let lambda = priorMean / (priorVar + bound * bound) * shrink;
    const lambdaCap = 0.99 / bound;
    if (lambda > lambdaCap) {
      lambda = lambdaCap;
    }
    if (lambda < -lambdaCap) {
      lambda = -lambdaCap;
    }
    evalue = evalue * (1 + lambda * delta);
    if (!Number.isFinite(evalue) || evalue < 0) {
      evalue = 0;
    }
    total += delta;
    totalSq += delta * delta;
    seen += 1;
    const pValue = Math.min(1, 1 / Math.max(evalue, 1e-300));
    const cs = empiricalBernsteinCs(total, totalSq, seen, bound, alpha);
    const evidenceReached = evalue >= 2 / alpha;
    let decision = "continue";
    if (rope && cs.low >= rope.low && cs.high <= rope.high) {
      decision = "equivalent";
    } else if (evidenceReached && priorMean > 0) {
      decision = "promote_now";
    } else if (evidenceReached && priorMean < 0) {
      decision = "reject_now";
    } else if (rope && cs.high < rope.low) {
      decision = "reject_now";
    }
    if (decision !== "continue" && decisionFiredAt === null) {
      decisionFiredAt = t;
    }
    steps.push({ t, delta, evalue, pValue, csLow: cs.low, csHigh: cs.high, decision });
  }
  const finalDecision = steps.length === 0 ? "continue" : steps[steps.length - 1].decision;
  return { steps, finalDecision, decisionFiredAt, clipped };
}
|
|
492
|
+
/**
 * Runs the sequential test for every candidate's delta series and derives
 * one overall recommendation.
 *
 * Recommendation precedence: the first candidate at "promote_now"; else
 * "continue" if any candidate is still undecided; else the first
 * "equivalent" candidate; else "reject_now".
 *
 * @param {object} input - Fields read: deltaSeries (items with candidateId
 *   and deltas), alpha, bound, rope.
 * @returns {{candidates: Array<object>, recommendation: {decision: string,
 *   candidateId: ?string}}}
 */
function evaluateInterimReleaseConfidence(input) {
  const sequenceOptions = { alpha: input.alpha, bound: input.bound, rope: input.rope };
  const candidates = input.deltaSeries.map((series) => {
    const { steps, finalDecision, decisionFiredAt } = pairedEvalueSequence(series.deltas, { ...sequenceOptions });
    // `finalStep` is undefined for an empty series; the fallbacks below apply.
    const finalStep = steps[steps.length - 1];
    return {
      candidateId: series.candidateId,
      decision: finalDecision,
      decisionFiredAt,
      finalEvalue: finalStep?.evalue ?? 1,
      finalPValue: finalStep?.pValue ?? 1,
      pairs: steps.length,
      csLow: finalStep?.csLow ?? Number.NEGATIVE_INFINITY,
      csHigh: finalStep?.csHigh ?? Number.POSITIVE_INFINITY
    };
  });
  const firstWith = (decision) => candidates.find((candidate) => candidate.decision === decision);
  const recommend = (decision, candidateId) => ({ candidates, recommendation: { decision, candidateId } });
  const promoted = firstWith("promote_now");
  if (promoted) {
    return recommend("promote_now", promoted.candidateId);
  }
  if (firstWith("continue")) {
    return recommend("continue", null);
  }
  const equivalent = firstWith("equivalent");
  if (equivalent) {
    return recommend("equivalent", equivalent.candidateId);
  }
  return recommend("reject_now", null);
}
|
|
519
|
+
/**
 * Confidence interval for the running mean, built from the running sum and
 * sum of squares. The radius combines a variance term and a bound term, both
 * scaled by `psi = log(2 / alpha) + 1.7 * log(log(max(e, n)) + 1)`.
 *
 * @param {number} sum - Sum of the observations.
 * @param {number} sumSq - Sum of squared observations.
 * @param {number} n - Number of observations.
 * @param {number} bound - Absolute bound on each observation.
 * @param {number} alpha - Significance level.
 * @returns {{low: number, high: number}} Interval; `[-bound, bound]` when
 *   `n` is 0.
 */
function empiricalBernsteinCs(sum, sumSq, n, bound, alpha) {
  if (n === 0) {
    return { low: -bound, high: bound };
  }
  const center = sum / n;
  // Clamped at zero so rounding cannot produce a negative variance.
  const spread = Math.max(0, sumSq / n - center * center);
  const psi = Math.log(2 / alpha) + 1.7 * Math.log(Math.log(Math.max(Math.E, n)) + 1);
  const varianceTerm = Math.sqrt(2 * spread * psi / n);
  const boundTerm = 3 * bound * psi / n;
  const radius = varianceTerm + boundTerm;
  return { low: center - radius, high: center + radius };
}
|
|
527
|
+
|
|
528
|
+
// src/release-report.ts
|
|
529
|
+
/**
 * Renders a release scorecard as a Markdown report.
 *
 * Sections, in order: title and status header, summary line, Metrics table,
 * then (each only when non-empty) Issues, Responsible Surfaces, Failure
 * Modes, Run Summary, TraceAnalyst Findings and Next Actions.
 *
 * @param {object} scorecard - Result of `evaluateReleaseConfidence`.
 * @param {object} [options] - Optional: title, runs, comparator,
 *   traceAnalystFindings, nextActions.
 * @returns {string} Markdown text ending with exactly one newline.
 */
function renderReleaseReport(scorecard, options = {}) {
  const { metrics } = scorecard;
  const out = [];
  // Every section has the same shape: heading, blank line, body, blank line.
  const section = (heading, body) => {
    out.push(`## ${heading}`, "", ...body, "");
  };

  out.push(`# ${options.title ?? `Release Report: ${scorecard.target}`}`, "");
  out.push(`Status: **${scorecard.status.toUpperCase()}**`);
  out.push(`Promote: **${scorecard.promote ? "yes" : "no"}**`);
  if (scorecard.candidateId) {
    out.push(`Candidate: \`${scorecard.candidateId}\``);
  }
  if (scorecard.baselineId) {
    out.push(`Baseline: \`${scorecard.baselineId}\``);
  }
  out.push("", scorecard.summary, "");

  section("Metrics", [
    "| Metric | Value |",
    "|---|---:|",
    `| Scenarios | ${metrics.scenarioCount} |`,
    `| Search runs | ${metrics.searchRuns} |`,
    `| Holdout runs | ${metrics.holdoutRuns} |`,
    `| Pass rate | ${pct(metrics.passRate)} |`,
    `| Mean score | ${num(metrics.meanScore)} |`,
    `| Search mean | ${num(metrics.searchMeanScore)} |`,
    `| Holdout mean | ${num(metrics.holdoutMeanScore)} |`,
    `| Overfit gap | ${num(metrics.overfitGap)} |`,
    `| Mean cost | $${num(metrics.meanCostUsd)} |`,
    `| p95 wall time | ${Math.round(metrics.p95WallMs)} ms |`
  ]);

  if (scorecard.issues.length > 0) {
    section("Issues", scorecard.issues.map((issue) => `- **${issue.severity}** \`${issue.code}\` (${issue.axis}): ${issue.detail}`));
  }

  const surfaces = entries(metrics.responsibleSurfaceCounts);
  if (surfaces.length > 0) {
    const bullets = [];
    for (const [surface, count] of surfaces) {
      bullets.push(`- ${surface}: ${count}`);
    }
    section("Responsible Surfaces", bullets);
  }

  const failures = entries(metrics.failureModeCounts);
  if (failures.length > 0) {
    const bullets = [];
    for (const [mode, count] of failures) {
      bullets.push(`- ${mode}: ${count}`);
    }
    section("Failure Modes", bullets);
  }

  if (options.runs && options.runs.length > 0) {
    const table = summaryTable([...options.runs], {
      comparator: options.comparator ?? scorecard.baselineId ?? void 0,
      split: "holdout"
    });
    section("Run Summary", [table.markdown]);
  }

  if (options.traceAnalystFindings && options.traceAnalystFindings.length > 0) {
    const bullets = [];
    for (const finding of options.traceAnalystFindings) {
      bullets.push(`- ${finding}`);
    }
    section("TraceAnalyst Findings", bullets);
  }

  const nextActions = options.nextActions ?? defaultNextActions(scorecard);
  if (nextActions.length > 0) {
    const bullets = [];
    for (const action of nextActions) {
      bullets.push(`- ${action}`);
    }
    section("Next Actions", bullets);
  }

  return out.join("\n").trimEnd() + "\n";
}
|
|
602
|
+
/**
 * Builds the fallback "Next Actions" list for a scorecard.
 *
 * A promotable scorecard yields a single "promote" instruction. Otherwise
 * one "Resolve <code>: <detail>" entry is produced for every issue whose
 * severity is exactly "critical", in the order the issues appear.
 *
 * @param {object} scorecard - Reads `promote` and `issues`.
 * @returns {string[]} Action strings (possibly empty).
 */
function defaultNextActions(scorecard) {
  if (scorecard.promote) {
    return ["Promote the candidate and keep canaries enabled."];
  }
  const actions = [];
  for (const issue of scorecard.issues) {
    if (issue.severity !== "critical") continue;
    actions.push(`Resolve ${issue.code}: ${issue.detail}`);
  }
  return actions;
}
|
|
606
|
+
/**
 * Converts a name -> count map into a sorted list of `[name, count]` pairs.
 *
 * Pairs whose count is not greater than zero are dropped. The remainder is
 * ordered by count descending, with ties broken by `localeCompare` on the
 * name.
 *
 * @param {Object<string, number>} values - Count map.
 * @returns {Array<[string, number]>} Sorted positive entries.
 */
function entries(values) {
  const positive = [];
  for (const pair of Object.entries(values)) {
    if (pair[1] > 0) positive.push(pair);
  }
  positive.sort((left, right) => {
    const byCount = right[1] - left[1];
    // Any falsy difference (equal counts) falls through to the name order.
    if (byCount) return byCount;
    return left[0].localeCompare(right[0]);
  });
  return positive;
}
|
|
609
|
+
/**
 * Formats a fraction as a percentage with one decimal place.
 *
 * @param {number} value - Fraction to format (e.g. 0.5 for 50%).
 * @returns {string} e.g. "50.0%", or "n/a" when `value` is not a finite number.
 */
function pct(value) {
  if (!Number.isFinite(value)) return "n/a";
  const percent = value * 100;
  return `${percent.toFixed(1)}%`;
}
|
|
612
|
+
/**
 * Formats a number with three decimal places.
 *
 * @param {number} value - Number to format.
 * @returns {string} e.g. "0.123", or "n/a" when `value` is not a finite number.
 */
function num(value) {
  if (!Number.isFinite(value)) return "n/a";
  return value.toFixed(3);
}
|
|
615
|
+
|
|
616
|
+
// src/promotion-gate.ts
|
|
617
|
+
/**
 * Percentile-bootstrap confidence interval for the difference in means
 * (candidate minus baseline) between two score samples.
 *
 * Each iteration resamples both groups with replacement (baseline first,
 * then candidate, from one shared seeded generator) and records the
 * difference of the resampled means. The interval bounds are read from the
 * sorted differences at the `alpha / 2` and `1 - alpha / 2` positions.
 *
 * Verdict rules, checked in order:
 *   - "ADVANCE"      when the lower bound is above zero,
 *   - "REVERT"       when the upper bound is below zero,
 *   - "KEEP"         when the observed delta is non-negative,
 *   - "INCONCLUSIVE" otherwise.
 *
 * If either sample is empty, or the combined sample count is below
 * `minTotalSamples`, no resampling is done: the result has infinite bounds,
 * `iterations: 0` and verdict "INCONCLUSIVE".
 *
 * @param {number[]} baseline - Baseline scores.
 * @param {number[]} candidate - Candidate scores.
 * @param {object} [options]
 * @param {number} [options.alpha=0.05] - Two-sided significance level.
 * @param {number} [options.iterations=1000] - Bootstrap iterations.
 * @param {number} [options.minTotalSamples=6] - Minimum combined sample count.
 * @param {number} [options.seed] - RNG seed; defaults to a hash of both samples.
 * @returns {{baselineMean: number, candidateMean: number, delta: number,
 *   ciLower: number, ciUpper: number, iterations: number, alpha: number,
 *   verdict: string}}
 */
function bootstrapCi2(baseline, candidate, options = {}) {
  const alpha = options.alpha ?? 0.05;
  const iterations = options.iterations ?? 1e3;
  const minTotal = options.minTotalSamples ?? 6;
  const rng = mulberry32(options.seed ?? hashSeed(baseline, candidate));

  const baselineMean = mean2(baseline);
  const candidateMean = mean2(candidate);
  const observed = {
    baselineMean,
    candidateMean,
    delta: candidateMean - baselineMean
  };

  const tooFewSamples =
    baseline.length === 0 ||
    candidate.length === 0 ||
    baseline.length + candidate.length < minTotal;
  if (tooFewSamples) {
    return {
      ...observed,
      ciLower: -Infinity,
      ciUpper: Infinity,
      iterations: 0,
      alpha,
      verdict: "INCONCLUSIVE"
    };
  }

  // Bootstrap distribution of the mean difference. The baseline is always
  // resampled before the candidate so a given seed reproduces the same draws.
  const deltas = new Array(iterations);
  let round = 0;
  while (round < iterations) {
    const baselineDraw = resample(baseline, rng);
    const candidateDraw = resample(candidate, rng);
    deltas[round] = mean2(candidateDraw) - mean2(baselineDraw);
    round += 1;
  }
  deltas.sort((x, y) => x - y);

  // Percentile bounds, clamped to valid indices.
  const tail = alpha / 2;
  const lowerIdx = Math.floor(tail * iterations);
  const upperIdx = Math.floor((1 - tail) * iterations) - 1;
  const ciLower = deltas[Math.max(0, lowerIdx)];
  const ciUpper = deltas[Math.min(iterations - 1, upperIdx)];

  const pickVerdict = () => {
    if (ciLower > 0) return "ADVANCE";
    if (ciUpper < 0) return "REVERT";
    if (observed.delta >= 0) return "KEEP";
    return "INCONCLUSIVE";
  };

  return {
    ...observed,
    ciLower,
    ciUpper,
    iterations,
    alpha,
    verdict: pickVerdict()
  };
}
|
|
664
|
+
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} xs - Values to average.
 * @returns {number} The mean, or 0 for an empty list.
 */
function mean2(xs) {
  const count = xs.length;
  if (count === 0) return 0;
  let total = 0;
  for (let i = 0; i < count; i++) total += xs[i];
  return total / count;
}
|
|
670
|
+
/**
 * Draws a bootstrap resample: `xs.length` elements picked from `xs`
 * uniformly with replacement.
 *
 * @param {Array} xs - Source values.
 * @param {() => number} rng - Generator returning values in [0, 1); called
 *   once per drawn element.
 * @returns {Array} New array with the same length as `xs`.
 */
function resample(xs, rng) {
  const size = xs.length;
  const picks = [];
  for (let k = 0; k < size; k += 1) {
    const index = Math.floor(rng() * size);
    picks.push(xs[index]);
  }
  return picks;
}
|
|
675
|
+
/**
 * Creates a seeded mulberry32 pseudo-random number generator.
 *
 * The 32-bit state is wrapped back into unsigned 32-bit range on every step.
 * Without that wrap the state would grow as an ever larger double; once it
 * passed 2^53 (after roughly 4.9 million draws) each addition would round and
 * the low bits consumed by the integer mixing steps below would no longer be
 * exact, silently changing the sequence. Draws before that point are
 * unaffected by the wrap.
 *
 * Not cryptographically secure; intended for reproducible resampling.
 *
 * @param {number} seed - Seed; coerced to an unsigned 32-bit integer.
 * @returns {() => number} Generator returning values in [0, 1).
 */
function mulberry32(seed) {
  let state = seed >>> 0;
  return () => {
    // Weyl-sequence increment (0x6D2B79F5), kept modulo 2^32.
    state = (state + 1831565813) >>> 0;
    let r = state;
    r = Math.imul(r ^ (r >>> 15), r | 1);
    r ^= r + Math.imul(r ^ (r >>> 7), r | 61);
    // Scale the unsigned 32-bit result into [0, 1).
    return ((r ^ (r >>> 14)) >>> 0) / 4294967296;
  };
}
|
|
685
|
+
/**
 * Derives a 32-bit seed from two numeric samples.
 *
 * Every value of `a`, then every value of `b`, is written as a 64-bit float
 * and its eight bytes are folded into a running hash (xor the byte, then
 * multiply by 16777619 with 32-bit wraparound), starting from 2166136261.
 *
 * @param {Iterable<number>} a - First sample.
 * @param {Iterable<number>} b - Second sample.
 * @returns {number} Unsigned 32-bit integer hash.
 */
function hashSeed(a, b) {
  const PRIME = 16777619;
  // One reusable 8-byte scratch buffer exposes each float's raw bytes.
  const scratch = new Float64Array(1);
  const octets = new Uint8Array(scratch.buffer);
  let hash = 2166136261;

  const absorb = (samples) => {
    for (const sample of samples) {
      scratch[0] = sample;
      for (let k = 0; k < octets.length; k++) {
        hash = Math.imul(hash ^ octets[k], PRIME);
      }
    }
  };

  absorb(a);
  absorb(b);
  return hash >>> 0;
}
|
|
697
|
+
/**
 * Scores baseline and candidate outputs with a judge and runs a bootstrap
 * comparison on the resulting scores.
 *
 * The baseline outputs are scored to completion before the candidate outputs
 * are started. `alpha`, `iterations` and `seed` are forwarded to
 * `bootstrapCi2` only when they are not `undefined`, so its own defaults
 * apply otherwise.
 *
 * @param {object} args
 * @param {Array} args.baselineOutputs - Outputs produced by the baseline.
 * @param {Array} args.candidateOutputs - Outputs produced by the candidate.
 * @param {Function} args.judge - Called with one output; its awaited result
 *   is used as that output's score.
 * @param {number} [args.judgeConcurrency=4] - Concurrency passed to `scoreAll`.
 * @param {number} [args.alpha] - Forwarded to `bootstrapCi2`.
 * @param {number} [args.iterations] - Forwarded to `bootstrapCi2`.
 * @param {number} [args.seed] - Forwarded to `bootstrapCi2`.
 * @returns {Promise<object>} The `bootstrapCi2` result plus
 *   `baselineSamples` and `candidateSamples` counts.
 */
async function judgeReplayGate(args) {
  const { judge } = args;
  const workers = args.judgeConcurrency ?? 4;

  const baselineScores = await scoreAll(args.baselineOutputs, judge, workers);
  const candidateScores = await scoreAll(args.candidateOutputs, judge, workers);

  // Copy over only the bootstrap settings the caller actually supplied.
  const ciOptions = {};
  for (const key of ["alpha", "iterations", "seed"]) {
    if (args[key] !== undefined) ciOptions[key] = args[key];
  }

  const ci = bootstrapCi2(baselineScores, candidateScores, ciOptions);
  const sampleCounts = {
    baselineSamples: baselineScores.length,
    candidateSamples: candidateScores.length
  };
  return { ...ci, ...sampleCounts };
}
|
|
712
|
+
/**
 * Scores every output with `judge`, running a bounded number of calls at a
 * time, and returns the scores in input order.
 *
 * The worker count is `concurrency` floored to an integer and clamped to
 * `[1, outputs.length]`; a value that is not a number (NaN, undefined) falls
 * back to 1. Without this clamp a NaN concurrency would start no workers at
 * all and return an array of empty slots, and an infinite concurrency would
 * fail while allocating the worker list.
 *
 * A judge result that is not a finite number is recorded as 0. If any judge
 * call rejects, the returned promise rejects with that error.
 *
 * @param {Array} outputs - Items to score.
 * @param {Function} judge - Called with one output; its awaited result is
 *   used as the score.
 * @param {number} concurrency - Maximum number of judge calls in flight.
 * @returns {Promise<number[]>} Scores aligned with `outputs`.
 */
async function scoreAll(outputs, judge, concurrency) {
  const results = new Array(outputs.length);

  const requested = Math.floor(concurrency);
  const workerCount = Math.min(
    outputs.length,
    Math.max(1, Number.isNaN(requested) ? 1 : requested)
  );

  // Shared cursor: each worker claims the next unscored index. Claiming is
  // synchronous, so no index is handed out twice.
  let next = 0;
  async function worker() {
    while (next < outputs.length) {
      const i = next++;
      const v = await judge(outputs[i]);
      results[i] = Number.isFinite(v) ? v : 0;
    }
  }

  await Promise.all(Array.from({ length: workerCount }, () => worker()));
  return results;
}
|
|
726
|
+
|
|
727
|
+
export {
|
|
728
|
+
releaseTraceEvidenceFromMultiShotTrials,
|
|
729
|
+
evaluateReleaseConfidence,
|
|
730
|
+
assertReleaseConfidence,
|
|
731
|
+
rubricPredictiveValidity,
|
|
732
|
+
pairedEvalueSequence,
|
|
733
|
+
evaluateInterimReleaseConfidence,
|
|
734
|
+
renderReleaseReport,
|
|
735
|
+
bootstrapCi2 as bootstrapCi,
|
|
736
|
+
judgeReplayGate
|
|
737
|
+
};
|
|
738
|
+
//# sourceMappingURL=chunk-UAND2LOT.js.map
|