@tangle-network/agent-eval 0.20.11 → 0.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. package/CHANGELOG.md +76 -0
  2. package/README.md +137 -170
  3. package/dist/benchmarks/index.d.ts +2 -1
  4. package/dist/{chunk-JAOLXRIA.js → chunk-3GN6U53I.js} +205 -4
  5. package/dist/chunk-3GN6U53I.js.map +1 -0
  6. package/dist/chunk-3IX6QTB7.js +1349 -0
  7. package/dist/chunk-3IX6QTB7.js.map +1 -0
  8. package/dist/chunk-5IIQKMD5.js +236 -0
  9. package/dist/chunk-5IIQKMD5.js.map +1 -0
  10. package/dist/chunk-ARZ6BEV6.js +1310 -0
  11. package/dist/chunk-ARZ6BEV6.js.map +1 -0
  12. package/dist/chunk-HRZELXCR.js +1354 -0
  13. package/dist/chunk-HRZELXCR.js.map +1 -0
  14. package/dist/chunk-KRR4VMH7.js +423 -0
  15. package/dist/chunk-KRR4VMH7.js.map +1 -0
  16. package/dist/chunk-SNUHRBDL.js +154 -0
  17. package/dist/chunk-SNUHRBDL.js.map +1 -0
  18. package/dist/chunk-WOK2RTWG.js +1920 -0
  19. package/dist/chunk-WOK2RTWG.js.map +1 -0
  20. package/dist/{chunk-LSR4IAYN.js → chunk-WOPGKVN4.js} +2 -2
  21. package/dist/chunk-YUFXO3TU.js +148 -0
  22. package/dist/chunk-YUFXO3TU.js.map +1 -0
  23. package/dist/cli.js +3 -2
  24. package/dist/cli.js.map +1 -1
  25. package/dist/control-cxwMOAsy.d.ts +259 -0
  26. package/dist/control.d.ts +6 -0
  27. package/dist/control.js +30 -0
  28. package/dist/control.js.map +1 -0
  29. package/dist/dataset-B9qvlm_o.d.ts +112 -0
  30. package/dist/emitter-B2XqDKFU.d.ts +121 -0
  31. package/dist/feedback-trajectory-CB0A32o3.d.ts +346 -0
  32. package/dist/{index-1PZOtZFr.d.ts → index-c5saLbKD.d.ts} +2 -133
  33. package/dist/index.d.ts +178 -2945
  34. package/dist/index.js +1066 -6185
  35. package/dist/index.js.map +1 -1
  36. package/dist/multi-shot-optimization-Bvtz294B.d.ts +598 -0
  37. package/dist/openapi.json +1 -1
  38. package/dist/optimization.d.ts +146 -0
  39. package/dist/optimization.js +60 -0
  40. package/dist/optimization.js.map +1 -0
  41. package/dist/reporting-Da2ihlcM.d.ts +672 -0
  42. package/dist/reporting.d.ts +5 -0
  43. package/dist/reporting.js +36 -0
  44. package/dist/reporting.js.map +1 -0
  45. package/dist/run-record-CX_jcAyr.d.ts +134 -0
  46. package/dist/store-u47QaJ9G.d.ts +297 -0
  47. package/dist/traces.d.ts +914 -0
  48. package/dist/traces.js +120 -0
  49. package/dist/traces.js.map +1 -0
  50. package/dist/wire/index.js +3 -2
  51. package/docs/concepts.md +16 -11
  52. package/docs/feature-guide.md +10 -17
  53. package/docs/integration-launch-gates.md +77 -0
  54. package/docs/product-eval-adoption.md +27 -0
  55. package/docs/research-report-methodology.md +155 -0
  56. package/docs/trace-analysis.md +75 -0
  57. package/package.json +30 -12
  58. package/dist/chunk-JAOLXRIA.js.map +0 -1
  59. package/dist/{chunk-LSR4IAYN.js.map → chunk-WOPGKVN4.js.map} +0 -0
@@ -0,0 +1,1349 @@
1
+ import {
2
+ benjaminiHochberg,
3
+ cohensD,
4
+ confidenceInterval,
5
+ pairedBootstrap,
6
+ pairedMde,
7
+ wilcoxonSignedRank
8
+ } from "./chunk-KRR4VMH7.js";
9
+
10
+ // src/release-confidence.ts
11
/**
 * Default gate thresholds for the release-confidence scorecard.
 * `evaluateReleaseConfidence` spreads caller-supplied `input.thresholds`
 * over this object, so every key here can be overridden per call.
 */
var DEFAULT_THRESHOLDS = {
  // Corpus axis
  requireCorpus: true,
  minScenarioCount: 1,
  // Run-count floors
  minSearchRuns: 1,
  minHoldoutRuns: 1,
  requireHoldout: true,
  // Quality axis
  minPassRate: 0.8,
  minMeanScore: 0.7,
  // Generalization axis: largest tolerated (search mean - holdout mean)
  maxOverfitGap: 0.15,
  // Efficiency axis: unlimited unless the caller sets a budget
  maxMeanCostUsd: Infinity,
  maxP95WallMs: Infinity,
  // Diagnostics axis
  requireAsiForFailures: true,
  // Scores strictly below this value count as failures
  failureScoreThreshold: 0.5
};
25
/**
 * Converts multi-shot trial records into release trace-evidence rows.
 *
 * @param {Array<object>} trials - Trial records; the fields read are
 *   scenarioId, variantId, split, score, ok, trace, cost, durationMs, error,
 *   asi and metadata.
 * @returns {Array<object>} One evidence row per trial, in input order.
 */
function releaseTraceEvidenceFromMultiShotTrials(trials) {
  // Any split other than "holdout" or "dev" is reported as "search".
  const normalizeSplit = (tag) => {
    if (tag === "holdout" || tag === "dev") return tag;
    return "search";
  };
  return trials.map((trial) => {
    const turns = trial.trace?.turns;
    return {
      scenarioId: trial.scenarioId,
      candidateId: trial.variantId,
      split: normalizeSplit(trial.split),
      score: trial.score,
      ok: trial.ok,
      // Only an array of turns yields a count.
      turnCount: Array.isArray(turns) ? turns.length : undefined,
      costUsd: trial.cost,
      durationMs: trial.durationMs,
      // Any truthy error is collapsed into a single failure mode.
      failureMode: trial.error ? "runtime_error" : undefined,
      asi: trial.asi,
      metadata: trial.metadata
    };
  });
}
40
/**
 * Builds the release-confidence scorecard for one candidate.
 *
 * Steps: merge thresholds over DEFAULT_THRESHOLDS, narrow runs/traces to the
 * candidate, derive metrics, run the five axis checks (corpus, quality,
 * generalization, diagnostics, efficiency), then fold the collected issues
 * into a status.
 *
 * @param {object} input - Fields read: target, candidateId, baselineId,
 *   runs, traces, scenarios, dataset, thresholds, gateDecision.
 * @returns {object} Scorecard with target, candidateId, baselineId, status
 *   ("pass" | "warn" | "fail"), promote, axes, issues, metrics, dataset,
 *   gateDecision and summary.
 */
function evaluateReleaseConfidence(input) {
  const thresholds = { ...DEFAULT_THRESHOLDS, ...input.thresholds };
  const candidateId = input.candidateId ?? null;
  const gateDecision = input.gateDecision ?? null;
  const runs = filterCandidate(input.runs ?? [], candidateId, input.baselineId);
  const traces = filterTraceCandidate(input.traces ?? [], candidateId, input.baselineId);
  const scenarios = input.scenarios ?? [];
  // A dataset manifest, when supplied, takes precedence over the scenario list.
  const scenarioCount = input.dataset?.scenarioCount ?? scenarios.length;
  const splitCounts = input.dataset?.splitCounts ?? countScenarioSplits(scenarios);
  const searchScores = scoresFor(runs, "search");
  const holdoutScores = scoresFor(runs, "holdout");
  const allScores = [...searchScores, ...holdoutScores];
  const traceScores = traces.map((t) => t.score).filter(isFiniteNumber);
  // Trace scores are only a fallback when no run produced a finite score.
  const scoreUniverse = allScores.length > 0 ? allScores : traceScores;
  const searchRuns = runs.filter((r) => r.splitTag === "search").length;
  const holdoutRuns = runs.filter((r) => r.splitTag === "holdout").length;
  const searchMeanScore = mean(searchScores);
  const holdoutMeanScore = mean(holdoutScores);
  // Computed once and reused for both failure metrics.
  const failures = failedRows(runs, traces, thresholds.failureScoreThreshold);
  // Run values get the same finite-number filter as trace values, so a single
  // missing costUsd/wallMs cannot turn the metric into NaN and thereby slip
  // past the budget comparison.
  const costs = [...runs.map((r) => r.costUsd), ...traces.map((t) => t.costUsd)].filter(isFiniteNumber);
  const wallTimes = [...runs.map((r) => r.wallMs), ...traces.map((t) => t.durationMs)].filter(isFiniteNumber);
  const metrics = {
    scenarioCount,
    searchRuns,
    holdoutRuns,
    passRate: passRate(runs, traces, thresholds.failureScoreThreshold),
    meanScore: mean(scoreUniverse),
    searchMeanScore,
    holdoutMeanScore,
    overfitGap: safeDiff(searchMeanScore, holdoutMeanScore),
    meanCostUsd: mean(costs),
    p95WallMs: percentile(wallTimes, 0.95),
    failedRows: failures.length,
    failuresWithAsi: failures.filter((row) => row.hasAsi).length,
    singleShotTraces: traces.filter((t) => t.turnCount === 1).length,
    multiShotTraces: traces.filter((t) => (t.turnCount ?? 0) > 1).length,
    splitCounts,
    domainCounts: countDomains(scenarios),
    failureModeCounts: countFailureModes(runs, traces, thresholds.failureScoreThreshold),
    responsibleSurfaceCounts: countResponsibleSurfaces(traces)
  };
  const issues = [];
  checkCorpus(input, thresholds, metrics, issues);
  checkQuality(thresholds, metrics, issues);
  checkGeneralization(gateDecision, thresholds, metrics, issues);
  checkDiagnostics(thresholds, metrics, issues);
  checkEfficiency(thresholds, metrics, issues);
  const axes = buildAxes(metrics, thresholds, gateDecision, issues);
  const status = issues.some((i) => i.severity === "critical") ? "fail" : issues.length > 0 ? "warn" : "pass";
  return {
    target: input.target,
    candidateId,
    baselineId: input.baselineId ?? null,
    status,
    // Promotion needs a clean scorecard and, when a gate decision exists, its approval.
    promote: status === "pass" && (gateDecision ? gateDecision.promote : true),
    axes,
    issues,
    metrics,
    dataset: input.dataset ?? null,
    gateDecision,
    summary: renderSummary(input.target, status, metrics, issues)
  };
}
99
/**
 * Evaluates release confidence and throws when the scorecard fails.
 *
 * @param {object} input - Passed unchanged to evaluateReleaseConfidence.
 * @returns {object} The scorecard when its status is not "fail".
 * @throws {Error} With the scorecard summary as message when status is "fail".
 */
function assertReleaseConfidence(input) {
  const scorecard = evaluateReleaseConfidence(input);
  if (scorecard.status !== "fail") {
    return scorecard;
  }
  throw new Error(scorecard.summary);
}
106
/**
 * Narrows runs to the candidate under evaluation.
 *
 * @param {Array<object>} runs - Run records carrying a candidateId.
 * @param {string|null} candidateId - When truthy, keep only this candidate's runs.
 * @param {string|null|undefined} baselineId - Otherwise, when truthy, drop the baseline's runs.
 * @returns {Array<object>} A new array; a shallow copy of all runs when neither id is given.
 */
function filterCandidate(runs, candidateId, baselineId) {
  let keep = null;
  if (candidateId) {
    keep = (run) => run.candidateId === candidateId;
  } else if (baselineId) {
    keep = (run) => run.candidateId !== baselineId;
  }
  return keep ? runs.filter(keep) : [...runs];
}
111
/**
 * Narrows traces to the candidate under evaluation. Traces with no
 * candidateId are always kept.
 *
 * @param {Array<object>} traces - Trace evidence rows.
 * @param {string|null} candidateId - When truthy, keep traces for this candidate.
 * @param {string|null|undefined} baselineId - Otherwise, when truthy, drop the baseline's traces.
 * @returns {Array<object>} A new array; a shallow copy when neither id is given.
 */
function filterTraceCandidate(traces, candidateId, baselineId) {
  const unattributed = (trace) => trace.candidateId === undefined;
  if (candidateId) {
    return traces.filter((trace) => unattributed(trace) || trace.candidateId === candidateId);
  }
  if (baselineId) {
    return traces.filter((trace) => unattributed(trace) || trace.candidateId !== baselineId);
  }
  return traces.slice();
}
116
/**
 * Corpus-axis checks. Appends critical issues to `issues` when the corpus is
 * missing, too small, or lacks a holdout split.
 *
 * @param {object} input - Evaluation input; `dataset` and `scenarios` are read.
 * @param {object} thresholds - Uses requireCorpus, minScenarioCount, requireHoldout.
 * @param {object} metrics - Uses scenarioCount and splitCounts.
 * @param {Array<object>} issues - Mutated in place.
 * @returns {void}
 */
function checkCorpus(input, thresholds, metrics, issues) {
  if (thresholds.requireCorpus && !input.dataset && (input.scenarios?.length ?? 0) === 0) {
    issues.push({ axis: "corpus", severity: "critical", code: "missing_corpus", detail: "No Dataset manifest or scenarios supplied." });
  }
  if (metrics.scenarioCount < thresholds.minScenarioCount) {
    issues.push({ axis: "corpus", severity: "critical", code: "few_scenarios", detail: `${metrics.scenarioCount} scenario(s) < min ${thresholds.minScenarioCount}.` });
  }
  // A manifest's splitCounts may omit the "holdout" key entirely. Treat an
  // absent count as zero so the required-holdout gate cannot be bypassed.
  const holdoutScenarios = metrics.splitCounts.holdout ?? 0;
  if (thresholds.requireHoldout && holdoutScenarios === 0) {
    issues.push({ axis: "corpus", severity: "critical", code: "missing_holdout_split", detail: "Corpus has no holdout scenarios." });
  }
}
127
/**
 * Quality-axis checks: search-run floor, pass rate and mean score.
 * Every violation is appended to `issues` as a critical issue.
 *
 * @param {object} thresholds - Uses minSearchRuns, minPassRate, minMeanScore.
 * @param {object} metrics - Uses searchRuns, passRate, meanScore.
 * @param {Array<object>} issues - Mutated in place.
 * @returns {void}
 */
function checkQuality(thresholds, metrics, issues) {
  const report = (code, detail) => {
    issues.push({ axis: "quality", severity: "critical", code, detail });
  };
  const { searchRuns, passRate: rate, meanScore } = metrics;
  const { minSearchRuns, minPassRate, minMeanScore } = thresholds;
  if (searchRuns < minSearchRuns) {
    report("few_search_runs", `${searchRuns} search run(s) < min ${minSearchRuns}.`);
  }
  if (rate < minPassRate) {
    report("low_pass_rate", `passRate ${fmt(rate)} < ${fmt(minPassRate)}.`);
  }
  if (meanScore < minMeanScore) {
    report("low_mean_score", `meanScore ${fmt(meanScore)} < ${fmt(minMeanScore)}.`);
  }
}
138
/**
 * Generalization-axis checks: holdout-run floor, overfit gap, and an
 * external gate rejection. Violations are appended as critical issues.
 *
 * @param {object|null} gateDecision - Optional gate verdict; promote,
 *   rejectionCode and reason are read.
 * @param {object} thresholds - Uses requireHoldout, minHoldoutRuns, maxOverfitGap.
 * @param {object} metrics - Uses holdoutRuns and overfitGap.
 * @param {Array<object>} issues - Mutated in place.
 * @returns {void}
 */
function checkGeneralization(gateDecision, thresholds, metrics, issues) {
  const report = (code, detail) => {
    issues.push({ axis: "generalization", severity: "critical", code, detail });
  };
  const { holdoutRuns, overfitGap } = metrics;
  if (thresholds.requireHoldout && holdoutRuns < thresholds.minHoldoutRuns) {
    report("few_holdout_runs", `${holdoutRuns} holdout run(s) < min ${thresholds.minHoldoutRuns}.`);
  }
  // A non-finite gap is skipped here rather than flagged.
  if (Number.isFinite(overfitGap) && overfitGap > thresholds.maxOverfitGap) {
    report("overfit_gap", `search-holdout gap ${fmt(overfitGap)} > ${fmt(thresholds.maxOverfitGap)}.`);
  }
  if (gateDecision && !gateDecision.promote) {
    report(`gate_${gateDecision.rejectionCode ?? "reject"}`, gateDecision.reason);
  }
}
149
/**
 * Diagnostics-axis check: flags failed rows that carry no actionable side
 * information (ASI). Does nothing when requireAsiForFailures is off.
 *
 * @param {object} thresholds - Uses requireAsiForFailures.
 * @param {object} metrics - Uses failedRows and failuresWithAsi.
 * @param {Array<object>} issues - Mutated in place.
 * @returns {void}
 */
function checkDiagnostics(thresholds, metrics, issues) {
  if (!thresholds.requireAsiForFailures) return;
  const { failedRows: failed, failuresWithAsi: diagnosed } = metrics;
  if (!(failed > diagnosed)) return;
  const undiagnosed = failed - diagnosed;
  issues.push({
    axis: "diagnostics",
    severity: "critical",
    code: "missing_failure_asi",
    detail: `${undiagnosed} failed row(s) have no actionable side information.`
  });
}
160
/**
 * Efficiency-axis checks: mean cost and p95 wall time against their budgets.
 * Each exceeded budget is appended as a critical issue.
 *
 * @param {object} thresholds - Uses maxMeanCostUsd and maxP95WallMs.
 * @param {object} metrics - Uses meanCostUsd and p95WallMs.
 * @param {Array<object>} issues - Mutated in place.
 * @returns {void}
 */
function checkEfficiency(thresholds, metrics, issues) {
  const budgets = [
    { code: "cost_budget", label: "meanCostUsd", actual: metrics.meanCostUsd, limit: thresholds.maxMeanCostUsd },
    { code: "latency_budget", label: "p95WallMs", actual: metrics.p95WallMs, limit: thresholds.maxP95WallMs }
  ];
  for (const { code, label, actual, limit } of budgets) {
    if (actual > limit) {
      issues.push({ axis: "efficiency", severity: "critical", code, detail: `${label} ${fmt(actual)} > ${fmt(limit)}.` });
    }
  }
}
168
/**
 * Produces the five scorecard axes in fixed order: corpus, quality,
 * generalization, diagnostics, efficiency.
 *
 * @param {object} metrics - Metrics computed by evaluateReleaseConfidence.
 * @param {object} thresholds - Effective thresholds.
 * @param {object|null} gateDecision - Optional gate verdict; a rejection
 *   forces the generalization score to 0.
 * @param {Array<object>} issues - Issues collected by the check* functions.
 * @returns {Array<object>} Axis records built by `axis`.
 */
function buildAxes(metrics, thresholds, gateDecision, issues) {
  const gateRejected = gateDecision && !gateDecision.promote;
  const diagnosticsScore = metrics.failedRows === 0 ? 1 : metrics.failuresWithAsi / metrics.failedRows;
  const rows = [
    ["corpus", bounded(metrics.scenarioCount / Math.max(1, thresholds.minScenarioCount)), `${metrics.scenarioCount} scenarios; holdout=${metrics.splitCounts.holdout}`],
    ["quality", Math.min(metrics.passRate, metrics.meanScore), `passRate=${fmt(metrics.passRate)} meanScore=${fmt(metrics.meanScore)}`],
    ["generalization", gateRejected ? 0 : gapScore(metrics.overfitGap, thresholds.maxOverfitGap), `holdoutRuns=${metrics.holdoutRuns} overfitGap=${fmt(metrics.overfitGap)}`],
    ["diagnostics", diagnosticsScore, `failuresWithAsi=${metrics.failuresWithAsi}/${metrics.failedRows}`],
    ["efficiency", efficiencyScore(metrics, thresholds), `meanCostUsd=${fmt(metrics.meanCostUsd)} p95WallMs=${fmt(metrics.p95WallMs)}`]
  ];
  return rows.map(([name, score, detail]) => axis(name, issues, score, detail));
}
177
/**
 * Builds one axis record. Status is "fail" if any issue on this axis is
 * critical, "warn" if the axis has only non-critical issues, else "pass".
 *
 * @param {string} name - Axis name matched against `issue.axis`.
 * @param {Array<object>} issues - All collected issues.
 * @param {number} score - Raw score; clamped through `bounded`.
 * @param {string} detail - Human-readable detail string.
 * @returns {{name: string, status: string, score: number, detail: string}}
 */
function axis(name, issues, score, detail) {
  let status = "pass";
  for (const issue of issues) {
    if (issue.axis !== name) continue;
    if (issue.severity === "critical") {
      status = "fail";
      break;
    }
    status = "warn";
  }
  return { name, status, score: bounded(score), detail };
}
182
/**
 * Counts scenarios per split. Scenarios without a split count as "train".
 *
 * The four standard splits are always present in the result. A split tag
 * outside that set gets its own counter starting at 1; incrementing the
 * missing key directly would store NaN.
 *
 * @param {Array<object>} scenarios - Scenario records with an optional `split`.
 * @returns {Object<string, number>} Count per split tag.
 */
function countScenarioSplits(scenarios) {
  const counts = { train: 0, dev: 0, test: 0, holdout: 0 };
  for (const scenario of scenarios) {
    const split = scenario.split ?? "train";
    // Own-property check so inherited names (e.g. "constructor") start at 0.
    const seen = Object.prototype.hasOwnProperty.call(counts, split) ? counts[split] : 0;
    counts[split] = seen + 1;
  }
  return counts;
}
187
/**
 * Counts scenarios per domain. The domain is `tags.domain`, falling back to
 * `tags.category`, then to "uncategorized".
 *
 * @param {Array<object>} scenarios - Scenario records with optional `tags`.
 * @returns {Object<string, number>} Count per domain label.
 */
function countDomains(scenarios) {
  const tally = {};
  scenarios.forEach((scenario) => {
    const tags = scenario.tags;
    const label = tags?.domain ?? tags?.category ?? "uncategorized";
    const previous = tally[label] ?? 0;
    tally[label] = previous + 1;
  });
  return tally;
}
195
/**
 * Tallies failure modes across runs and traces.
 *
 * A run fails when it has a failureMode or its score (holdoutScore, else
 * searchScore) is below `threshold`. A trace fails when it has a
 * failureMode, `ok === false`, or a score below `threshold`. Failures with
 * no explicit mode are labelled "low_score" or, for traces with
 * `ok === false`, "not_ok".
 *
 * @param {Array<object>} runs - Run records.
 * @param {Array<object>} traces - Trace evidence rows.
 * @param {number} threshold - Failure score threshold.
 * @returns {Object<string, number>} Count per failure mode.
 */
function countFailureModes(runs, traces, threshold) {
  const tally = {};
  const bump = (mode) => {
    tally[mode] = (tally[mode] ?? 0) + 1;
  };
  const belowThreshold = (score) => score !== undefined && score < threshold;
  runs.forEach((run) => {
    const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
    if (!run.failureMode && !belowThreshold(score)) return;
    bump(run.failureMode ?? "low_score");
  });
  traces.forEach((trace) => {
    const failed = trace.failureMode || trace.ok === false || belowThreshold(trace.score);
    if (!failed) return;
    bump(trace.failureMode ?? (trace.ok === false ? "not_ok" : "low_score"));
  });
  return tally;
}
212
/**
 * Tallies the `responsibleSurface` of every ASI entry attached to the traces.
 * Entries without a surface are counted under "unknown".
 *
 * @param {Array<object>} traces - Trace evidence rows with an optional `asi` list.
 * @returns {Object<string, number>} Count per responsible surface.
 */
function countResponsibleSurfaces(traces) {
  const tally = {};
  for (const { asi: entries } of traces) {
    for (const entry of entries ?? []) {
      const key = entry.responsibleSurface ?? "unknown";
      const previous = tally[key] ?? 0;
      tally[key] = previous + 1;
    }
  }
  return tally;
}
222
/**
 * Lists failed runs and traces, recording whether each one carries ASI.
 *
 * Failure criteria match countFailureModes. A run has ASI when
 * `outcome.raw.asi` is a number greater than 0; a trace has ASI when its
 * `asi` list is non-empty.
 *
 * @param {Array<object>} runs - Run records.
 * @param {Array<object>} traces - Trace evidence rows.
 * @param {number} threshold - Failure score threshold.
 * @returns {Array<{hasAsi: boolean}>} One row per failure, runs first.
 */
function failedRows(runs, traces, threshold) {
  const belowThreshold = (score) => score !== undefined && score < threshold;
  const rows = [];
  runs.forEach((run) => {
    const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
    if (!run.failureMode && !belowThreshold(score)) return;
    const asiMetric = run.outcome.raw.asi;
    rows.push({ hasAsi: typeof asiMetric === "number" && asiMetric > 0 });
  });
  traces.forEach((trace) => {
    const failed = trace.failureMode || trace.ok === false || belowThreshold(trace.score);
    if (!failed) return;
    rows.push({ hasAsi: (trace.asi?.length ?? 0) > 0 });
  });
  return rows;
}
238
/**
 * Fraction of runs and traces that pass.
 *
 * A run passes when it has no failureMode and a defined score (holdoutScore,
 * else searchScore) at or above `threshold`. A trace passes when `ok` is not
 * false and its score is undefined or at or above `threshold`.
 *
 * @param {Array<object>} runs - Run records.
 * @param {Array<object>} traces - Trace evidence rows.
 * @param {number} threshold - Failure score threshold.
 * @returns {number} Pass fraction in [0, 1]; 0 when there is nothing to count.
 */
function passRate(runs, traces, threshold) {
  const total = runs.length + traces.length;
  if (total === 0) return 0;
  let passed = 0;
  for (const run of runs) {
    const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
    if (!run.failureMode && score !== undefined && score >= threshold) passed += 1;
  }
  for (const trace of traces) {
    const scoreOk = trace.score === undefined || trace.score >= threshold;
    if (trace.ok !== false && scoreOk) passed += 1;
  }
  return passed / total;
}
249
/**
 * Collects the finite scores of all runs tagged with `split`.
 * Reads `outcome.holdoutScore` for the "holdout" split and
 * `outcome.searchScore` for every other split.
 *
 * @param {Array<object>} runs - Run records with splitTag and outcome.
 * @param {string} split - Split tag to select.
 * @returns {number[]} Finite scores, in run order.
 */
function scoresFor(runs, split) {
  const field = split === "holdout" ? "holdoutScore" : "searchScore";
  const scores = [];
  for (const run of runs) {
    if (run.splitTag !== split) continue;
    const score = run.outcome[field];
    if (typeof score === "number" && Number.isFinite(score)) scores.push(score);
  }
  return scores;
}
252
/**
 * Arithmetic mean.
 *
 * @param {number[]} xs - Values to average.
 * @returns {number} The mean, or NaN for an empty list.
 */
function mean(xs) {
  const count = xs.length;
  if (count === 0) return Number.NaN;
  let total = 0;
  for (const x of xs) total += x;
  return total / count;
}
256
/**
 * Nearest-rank percentile of a list, computed on a sorted copy.
 *
 * @param {number[]} xs - Values; the input is not mutated.
 * @param {number} p - Percentile as a fraction, e.g. 0.95.
 * @returns {number} The element at rank ceil(p * n), with the rank clamped
 *   into the list bounds; NaN for an empty list.
 */
function percentile(xs, p) {
  if (xs.length === 0) return Number.NaN;
  const ascending = Array.from(xs).sort((a, b) => a - b);
  const lastIndex = ascending.length - 1;
  const rankIndex = Math.ceil(p * ascending.length) - 1;
  return ascending[Math.min(lastIndex, Math.max(0, rankIndex))];
}
261
/**
 * Tests whether a value is a finite number.
 *
 * @param {*} value - Any value.
 * @returns {boolean} True only for number primitives that are neither NaN nor infinite.
 */
function isFiniteNumber(value) {
  if (typeof value !== "number") return false;
  return Number.isFinite(value);
}
264
/**
 * Subtracts `b` from `a` only when both are finite.
 *
 * @param {number} a - Minuend.
 * @param {number} b - Subtrahend.
 * @returns {number} `a - b`, or NaN when either operand is not finite.
 */
function safeDiff(a, b) {
  const bothFinite = Number.isFinite(a) && Number.isFinite(b);
  return bothFinite ? a - b : Number.NaN;
}
268
/**
 * Scores an overfit gap against its allowed maximum.
 *
 * @param {number} gap - Search mean minus holdout mean.
 * @param {number} maxGap - Largest tolerated gap.
 * @returns {number} 0 for a non-finite gap. When `maxGap <= 0`: 1 if the gap
 *   is not positive, else 0. Otherwise `1 - max(0, gap) / maxGap`, passed
 *   through `bounded`.
 */
function gapScore(gap, maxGap) {
  if (!Number.isFinite(gap)) return 0;
  const zeroTolerance = maxGap <= 0;
  if (zeroTolerance) {
    return gap > 0 ? 0 : 1;
  }
  // A negative gap (holdout beats search) is not penalised.
  const overshoot = Math.max(0, gap);
  return bounded(1 - overshoot / maxGap);
}
273
/**
 * Scores efficiency as the worse of the cost and latency headroom ratios.
 * A dimension whose budget or measurement is not finite contributes 1.
 *
 * @param {object} metrics - Uses meanCostUsd and p95WallMs.
 * @param {object} thresholds - Uses maxMeanCostUsd and maxP95WallMs.
 * @returns {number} Minimum of the two per-dimension scores.
 */
function efficiencyScore(metrics, thresholds) {
  const headroom = (limit, actual) => {
    if (!Number.isFinite(limit) || !Number.isFinite(actual)) return 1;
    // The 1e-12 floor keeps a zero measurement from dividing by zero.
    return bounded(limit / Math.max(actual, 1e-12));
  };
  const cost = headroom(thresholds.maxMeanCostUsd, metrics.meanCostUsd);
  const latency = headroom(thresholds.maxP95WallMs, metrics.p95WallMs);
  return Math.min(cost, latency);
}
278
/**
 * Clamps a value into [0, 1].
 *
 * @param {number} x - Value to clamp.
 * @returns {number} The clamped value; 0 when `x` is not a finite number.
 */
function bounded(x) {
  return Number.isFinite(x) ? Math.min(1, Math.max(0, x)) : 0;
}
282
/**
 * Renders the one-line scorecard summary.
 *
 * @param {string} target - Name of the release target.
 * @param {string} status - Scorecard status.
 * @param {object} metrics - Uses scenarioCount, searchRuns, holdoutRuns, passRate, meanScore.
 * @param {Array<object>} issues - Issues; their codes are appended when any exist.
 * @returns {string} "; "-separated summary line.
 */
function renderSummary(target, status, metrics, issues) {
  const metricText = [
    `scenarios=${metrics.scenarioCount}`,
    `searchRuns=${metrics.searchRuns}`,
    `holdoutRuns=${metrics.holdoutRuns}`,
    `passRate=${fmt(metrics.passRate)}`,
    `meanScore=${fmt(metrics.meanScore)}`
  ].join(" ");
  const segments = [`release confidence ${status}: ${target}`, metricText];
  if (issues.length !== 0) {
    segments.push(`issues=${issues.map((issue) => issue.code).join(",")}`);
  }
  return segments.join("; ");
}
288
/**
 * Formats a number with four decimal places.
 *
 * @param {number} x - Value to format.
 * @returns {string} Fixed-point text, or `String(x)` when `x` is not finite.
 */
function fmt(x) {
  return Number.isFinite(x) ? x.toFixed(4) : String(x);
}
292
+
293
+ // src/pre-registration.ts
294
/**
 * Recursively rebuilds a value with object keys in sorted order, so that
 * JSON serialisation is independent of key insertion order.
 *
 * @param {*} v - Any JSON-like value.
 * @returns {*} Primitives and null unchanged; arrays mapped element-wise;
 *   objects rebuilt from their own enumerable keys in sorted order.
 */
function canonicalize(v) {
  const isContainer = v !== null && typeof v === "object";
  if (!isContainer) return v;
  if (Array.isArray(v)) {
    return v.map((item) => canonicalize(item));
  }
  const sorted = {};
  Object.keys(v).sort().forEach((key) => {
    sorted[key] = canonicalize(v[key]);
  });
  return sorted;
}
302
/**
 * SHA-256 of the canonical JSON form of a value.
 *
 * @param {*} obj - Value to hash; canonicalised before serialisation.
 * @returns {Promise<string>} Lower-case hex digest.
 */
async function hashJson(obj) {
  const payload = JSON.stringify(canonicalize(obj));
  const encoded = new TextEncoder().encode(payload);
  const digest = await globalThis.crypto.subtle.digest("SHA-256", encoded);
  let hex = "";
  for (const byte of new Uint8Array(digest)) {
    hex += byte.toString(16).padStart(2, "0");
  }
  return hex;
}
308
/**
 * Attaches a content hash to a manifest.
 *
 * @param {object} m - Manifest body to sign.
 * @returns {Promise<object>} Copy of `m` with `contentHash` (from hashJson)
 *   and `algo: "sha256-content"` added.
 */
async function signManifest(m) {
  const contentHash = await hashJson(m);
  const signed = { ...m, contentHash, algo: "sha256-content" };
  return signed;
}
312
/**
 * Checks a signed manifest by re-signing its body and comparing hashes.
 *
 * @param {object} m - Signed manifest; `contentHash` and `algo` are stripped
 *   before re-hashing. The `algo` value itself is not checked.
 * @returns {Promise<boolean>} True when the recomputed hash equals `m.contentHash`.
 */
async function verifyManifest(m) {
  const { contentHash: claimed, algo: _ignoredAlgo, ...body } = m;
  const { contentHash: recomputed } = await signManifest(body);
  return recomputed === claimed;
}
318
/**
 * Evaluates observed results against a pre-registered hypothesis manifest.
 *
 * The hypothesis is confirmed only when the effect points in the registered
 * direction, its magnitude reaches `minEffect`, the p-value is below `alpha`,
 * and the sample size reaches `preRegisteredN`.
 *
 * A non-finite p-value or sample size (NaN or missing) is treated as a
 * rejection. Every ordering comparison against NaN is false, so without the
 * explicit checks such an observation would add no rejection reason and
 * would be reported as confirmed.
 *
 * @param {object} manifest - Signed manifest; direction, minEffect, alpha
 *   and preRegisteredN are read.
 * @param {{effect: number, pValue: number, n: number}} observed - Observed statistics.
 * @returns {Promise<object>} Verdict with manifest, observedN, observedEffect,
 *   observedPValue, confirmed and rejectionReasons.
 * @throws {Error} When the manifest fails content-hash verification.
 */
async function evaluateHypothesis(manifest, observed) {
  if (!await verifyManifest(manifest)) {
    throw new Error("evaluateHypothesis: manifest content hash mismatch (tampered)");
  }
  const reasons = [];
  // A NaN effect satisfies neither branch and is rejected as wrong_direction.
  const directionOk = manifest.direction === "increase" ? observed.effect > 0 : observed.effect < 0;
  if (!directionOk) reasons.push("wrong_direction");
  if (Math.abs(observed.effect) < manifest.minEffect) reasons.push("effect_too_small");
  if (!Number.isFinite(observed.pValue) || observed.pValue >= manifest.alpha) reasons.push("not_significant");
  if (!Number.isFinite(observed.n) || observed.n < manifest.preRegisteredN) reasons.push("undersampled");
  return {
    manifest,
    observedN: observed.n,
    observedEffect: observed.effect,
    observedPValue: observed.pValue,
    confirmed: reasons.length === 0,
    rejectionReasons: reasons
  };
}
337
+
338
+ // src/summary-report.ts
339
/**
 * Builds the per-candidate summary table for one split.
 *
 * Each candidate with at least one finite score on the split gets a row with
 * its mean and confidence interval. When a comparator is given, every other
 * candidate also gets Cohen's d against the comparator and, if at least six
 * paired observations exist, a Wilcoxon signed-rank p-value that is then
 * adjusted across candidates with Benjamini-Hochberg.
 *
 * @param {Array<object>} runs - Run records (candidateId, splitTag, outcome,
 *   experimentId, seed are read).
 * @param {object} [opts] - Options.
 * @param {string} [opts.split="holdout"] - Split to summarise.
 * @param {number} [opts.confidence=0.95] - Confidence level for the interval.
 * @param {number} [opts.fdr=0.05] - False-discovery rate for the BH adjustment.
 * @param {string} [opts.comparator] - Candidate id to compare the others against.
 * @returns {{rows: Array<object>, comparator: (string|null), split: string, markdown: string}}
 */
function summaryTable(runs, opts = {}) {
  const split = opts.split ?? "holdout";
  const confidence = opts.confidence ?? 0.95;
  const fdr = opts.fdr ?? 0.05;
  const comparator = opts.comparator ?? null;
  const scoreField = split === "holdout" ? "holdoutScore" : "searchScore";
  // Group runs and their finite scores by candidate.
  const byCandidate = new Map();
  for (const r of runs) {
    if (r.splitTag !== split) continue;
    const v = r.outcome[scoreField];
    if (typeof v !== "number" || !Number.isFinite(v)) continue;
    const bucket = byCandidate.get(r.candidateId) ?? { runs: [], scores: [] };
    bucket.runs.push(r);
    bucket.scores.push(v);
    byCandidate.set(r.candidateId, bucket);
  }
  const candidateIds = [...byCandidate.keys()].sort();
  const compRuns = comparator ? byCandidate.get(comparator) : undefined;
  const tentative = [];
  for (const id of candidateIds) {
    const bucket = byCandidate.get(id);
    const ci = confidenceInterval(bucket.scores, confidence);
    let rawP = Number.NaN;
    let d = Number.NaN;
    if (comparator && compRuns && id !== comparator) {
      const paired = pairScoresByKey(bucket.runs, compRuns.runs, scoreField);
      // With fewer than six pairs the p-value stays NaN.
      if (paired.before.length >= 6) {
        rawP = wilcoxonSignedRank(paired.before, paired.after).p;
      }
      d = cohensD(compRuns.scores, bucket.scores);
    }
    tentative.push({
      candidateId: id,
      n: bucket.scores.length,
      mean: ci.mean,
      ciLow: ci.lower,
      ciHigh: ci.upper,
      // Holds the raw p-value until the BH pass below overwrites it.
      qValue: rawP,
      cohensD: d,
      rawP
    });
  }
  if (comparator) {
    // Adjust only the rows that have a finite raw p-value.
    const idxs = [];
    const ps = [];
    for (let i = 0; i < tentative.length; i++) {
      const r = tentative[i];
      if (r.candidateId === comparator) continue;
      if (!Number.isFinite(r.rawP)) continue;
      idxs.push(i);
      ps.push(r.rawP);
    }
    if (ps.length > 0) {
      const { qValues } = benjaminiHochberg(ps, fdr);
      for (let k = 0; k < idxs.length; k++) {
        tentative[idxs[k]].qValue = qValues[k];
      }
    }
  }
  // rawP is internal bookkeeping and is stripped from the public rows.
  const rows = tentative.map(({ rawP: _rawP, ...rest }) => rest);
  // Forward the confidence level so the table header matches the interval.
  const markdown = renderSummaryTableMarkdown(rows, comparator, split, confidence);
  return { rows, comparator, split, markdown };
}

/**
 * Pairs candidate and baseline scores that share `experimentId` and `seed`.
 *
 * @param {Array<object>} candidate - Candidate runs.
 * @param {Array<object>} baseline - Baseline runs; for a repeated key the
 *   last finite score wins.
 * @param {string} scoreField - Name of the score field on `outcome`.
 * @returns {{before: number[], after: number[]}} Baseline scores (`before`)
 *   and candidate scores (`after`), index-aligned, in candidate order.
 */
function pairScoresByKey(candidate, baseline, scoreField) {
  const keyOf = (run) => `${run.experimentId}::${run.seed}`;
  const isScore = (v) => typeof v === "number" && Number.isFinite(v);
  const baseIdx = new Map();
  for (const r of baseline) {
    const v = r.outcome[scoreField];
    if (isScore(v)) baseIdx.set(keyOf(r), v);
  }
  const before = [];
  const after = [];
  for (const r of candidate) {
    const v = r.outcome[scoreField];
    if (!isScore(v)) continue;
    const b = baseIdx.get(keyOf(r));
    if (b === undefined) continue;
    before.push(b);
    after.push(v);
  }
  return { before, after };
}

/**
 * Renders summary rows as a markdown table.
 *
 * @param {Array<object>} rows - Rows produced by summaryTable.
 * @param {string|null} comparator - Comparator id shown in the title, if any.
 * @param {string} split - Split name shown in the title.
 * @param {number} [confidence=0.95] - Confidence level used for the interval
 *   column header, so the label matches the interval that was computed.
 * @returns {string} Markdown text, lines joined with "\n".
 */
function renderSummaryTableMarkdown(rows, comparator, split, confidence = 0.95) {
  const lines = [];
  const cmpLabel = comparator ? ` (vs ${comparator})` : "";
  // toFixed(2) absorbs float noise such as 0.9 * 100 = 90.00000000000001.
  const ciLabel = `${Number((confidence * 100).toFixed(2))}% CI`;
  lines.push(`Summary Table \u2014 ${split} split${cmpLabel}`);
  lines.push("");
  lines.push(`| Candidate | N | Mean | ${ciLabel} | q (BH) | Cohen's d |`);
  lines.push("|---|---:|---:|---|---:|---:|");
  for (const r of rows) {
    const ci = `[${fmt2(r.ciLow)}, ${fmt2(r.ciHigh)}]`;
    // Non-finite statistics render as an em dash.
    const q = Number.isFinite(r.qValue) ? r.qValue.toFixed(4) : "\u2014";
    const d = Number.isFinite(r.cohensD) ? r.cohensD.toFixed(3) : "\u2014";
    lines.push(`| ${r.candidateId} | ${r.n} | ${fmt2(r.mean)} | ${ci} | ${q} | ${d} |`);
  }
  return lines.join("\n");
}
438
/**
 * Builds cost/quality Pareto chart data for one split.
 *
 * Each candidate with at least one finite score becomes a point holding its
 * averaged cost and quality; a point is on the frontier when no other point
 * dominates it.
 *
 * @param {Array<object>} runs - Run records (candidateId, splitTag, costUsd, outcome are read).
 * @param {object} [opts] - Options.
 * @param {string} [opts.split="holdout"] - Split to chart.
 * @param {Object<string, object>} [opts.gateDecisions] - Gate decisions keyed by candidate id.
 * @returns {{kind: string, split: string, axes: object, points: Array<object>}}
 */
function paretoChart(runs, opts = {}) {
  const split = opts.split ?? "holdout";
  const scoreField = split === "holdout" ? "holdoutScore" : "searchScore";
  const perCandidate = new Map();
  for (const run of runs) {
    if (run.splitTag !== split) continue;
    const quality = run.outcome[scoreField];
    if (!(typeof quality === "number" && Number.isFinite(quality))) continue;
    let bucket = perCandidate.get(run.candidateId);
    if (bucket == null) {
      bucket = { cost: [], quality: [] };
    }
    bucket.cost.push(run.costUsd);
    bucket.quality.push(quality);
    perCandidate.set(run.candidateId, bucket);
  }
  const points = [...perCandidate.entries()].map(([candidateId, bucket]) => {
    const decision = opts.gateDecisions?.[candidateId];
    return {
      candidateId,
      cost: avg(bucket.cost),
      quality: avg(bucket.quality),
      n: bucket.cost.length,
      onFrontier: false,
      gate: decision ? gateLabel(decision) : undefined
    };
  });
  // Frontier membership: no other point dominates this one.
  for (const point of points) {
    point.onFrontier = points.every((other) => other === point || !dominates(other, point));
  }
  return {
    kind: "pareto-cost-quality",
    split,
    axes: { x: "costUsd", y: "score" },
    points
  };
}
472
/**
 * Pareto dominance on (cost, quality): lower cost and higher quality are better.
 *
 * @param {{cost: number, quality: number}} a - Candidate dominator.
 * @param {{cost: number, quality: number}} b - Candidate dominated point.
 * @returns {boolean} True when `a` is no worse on both axes and strictly
 *   better on at least one.
 */
function dominates(a, b) {
  const noWorse = a.cost <= b.cost && a.quality >= b.quality;
  const strictlyBetter = a.cost < b.cost || a.quality > b.quality;
  return noWorse && strictlyBetter;
}
475
/**
 * Maps a gate decision to a chart label.
 *
 * @param {{promote: boolean, rejectionCode: (string|undefined)}} d - Gate decision.
 * @returns {string|null} "promote" for a promoting decision, a "reject_*"
 *   label for the three recognised rejection codes, otherwise null.
 */
function gateLabel(d) {
  if (d.promote) return "promote";
  switch (d.rejectionCode) {
    case "few_runs":
      return "reject_few_runs";
    case "negative_delta":
      return "reject_negative_delta";
    case "overfit_gap":
      return "reject_overfit_gap";
    default:
      return null;
  }
}
482
/**
 * Builds a histogram of paired score gains (candidate minus comparator).
 *
 * Bins are symmetric around zero and span the largest absolute gain. The
 * confidence interval comes from a paired bootstrap of the median.
 *
 * @param {Array<object>} runs - Run records.
 * @param {string} candidateId - Candidate whose gains are measured.
 * @param {string} comparator - Baseline candidate id.
 * @param {object} [opts] - Options.
 * @param {string} [opts.split="holdout"] - Split to analyse.
 * @param {number} [opts.bins=11] - Number of bins; must be at least 1.
 * @param {number} [opts.confidence=0.95] - Bootstrap confidence level.
 * @param {number} [opts.resamples=2000] - Bootstrap resample count.
 * @param {number} [opts.seed] - Seed forwarded to the bootstrap.
 * @returns {object} "gain-distribution" record with n, bins, median and ci;
 *   an empty distribution with zeros when no runs can be paired.
 * @throws {Error} When `bins` is below 1.
 */
function gainHistogram(runs, candidateId, comparator, opts = {}) {
  const split = opts.split ?? "holdout";
  const scoreField = split === "holdout" ? "holdoutScore" : "searchScore";
  const binCount = opts.bins ?? 11;
  if (binCount < 1) {
    throw new Error("gainHistogram: bins must be \u2265 1");
  }
  const runsOf = (id) => runs.filter((run) => run.candidateId === id && run.splitTag === split);
  const { before, after } = pairScoresByKey(runsOf(candidateId), runsOf(comparator), scoreField);
  const n = before.length;
  const identity = { kind: "gain-distribution", candidateId, comparator, split };
  if (n === 0) {
    return { ...identity, n: 0, bins: [], median: 0, ci: { low: 0, high: 0 } };
  }
  const deltas = after.map((score, i) => score - before[i]);
  const ascending = deltas.slice().sort((a, b) => a - b);
  const median = medianOfSorted(ascending);
  const smallest = ascending[0];
  const largest = ascending[ascending.length - 1];
  // Symmetric range; the 1e-6 floor keeps the bin width non-zero when all gains are 0.
  const bound = Math.max(Math.abs(smallest), Math.abs(largest), 1e-6);
  const lo = -bound;
  const width = (bound - lo) / binCount;
  const bins = [];
  for (let i = 0; i < binCount; i++) {
    bins.push({ lo: lo + i * width, hi: lo + (i + 1) * width, count: 0 });
  }
  for (const delta of deltas) {
    let slot = Math.floor((delta - lo) / width);
    // Clamp so the maximum value lands in the last bin.
    if (slot < 0) slot = 0;
    if (slot >= binCount) slot = binCount - 1;
    bins[slot].count += 1;
  }
  const bootstrap = pairedBootstrap(before, after, {
    confidence: opts.confidence ?? 0.95,
    resamples: opts.resamples ?? 2e3,
    statistic: "median",
    seed: opts.seed
  });
  return { ...identity, n, bins, median, ci: { low: bootstrap.low, high: bootstrap.high } };
}
539
// Lower bound on `minPairs` in researchReport: a caller-supplied value below
// this is raised to it.
var RESEARCH_REPORT_HARD_PAIR_FLOOR = 6;
540
/**
 * Summarises the paired gain of a candidate over a comparator.
 *
 * Pairs runs by experiment and seed, then reports mean/median/sd of the
 * deltas, a bootstrap CI of the median, the share of bootstrap means above
 * zero, the share inside the ROPE, and a minimum detectable effect scaled
 * by the delta standard deviation.
 *
 * @param {Array<object>} runs - Run records.
 * @param {string} candidateId - Candidate id.
 * @param {string} comparator - Baseline candidate id.
 * @param {object} opts - Options: split, confidence, seed, rope
 *   ({low, high} or null/undefined for none), mdeAlpha, mdePower.
 * @returns {object|null} Posterior summary, or null when no runs can be paired.
 */
function pairedPosterior(runs, candidateId, comparator, opts) {
  const scoreField = opts.split === "holdout" ? "holdoutScore" : "searchScore";
  const candidate = runs.filter((r) => r.candidateId === candidateId && r.splitTag === opts.split);
  const baseline = runs.filter((r) => r.candidateId === comparator && r.splitTag === opts.split);
  const { before, after } = pairScoresByKey(candidate, baseline, scoreField);
  const n = before.length;
  if (n === 0) return null;
  const deltas = before.map((b, i) => after[i] - b);
  const meanDelta = deltas.reduce((s, x) => s + x, 0) / n;
  const sortedDeltas = [...deltas].sort((a, b) => a - b);
  const medianDelta = medianOfSorted(sortedDeltas);
  const sdDelta = stdev(deltas, meanDelta);
  const ci = pairedBootstrap(before, after, {
    confidence: opts.confidence,
    resamples: 2e3,
    statistic: "median",
    seed: opts.seed
  });
  const meanSamples = bootstrapMeanSamples(deltas, 2e3, opts.seed);
  const sampleCount = meanSamples.length;
  const prGreaterThanZero = sampleCount === 0 ? 0 : meanSamples.filter((s) => s > 0).length / sampleCount;
  // Treat an omitted rope like an explicit null instead of dereferencing
  // undefined and throwing a TypeError.
  const rope = opts.rope ?? null;
  const prInRope = rope === null || sampleCount === 0 ? null : meanSamples.filter((s) => s >= rope.low && s <= rope.high).length / sampleCount;
  const dStandardised = pairedMde({ nPaired: n, alpha: opts.mdeAlpha, power: opts.mdePower });
  // Identical deltas have zero spread, so the detectable effect is reported as 0.
  const mde = sdDelta === 0 ? 0 : dStandardised * sdDelta;
  return {
    n,
    meanDelta,
    medianDelta,
    sdDelta,
    ci: { low: ci.low, high: ci.high },
    prGreaterThanZero,
    prInRope,
    mde
  };
}
574
/**
 * Draws bootstrap resamples of the mean of `deltas`.
 *
 * @param {number[]} deltas - Observed paired differences.
 * @param {number} resamples - Number of bootstrap means to produce.
 * @param {number} [seed] - Seed passed to seedRng.
 * @returns {number[]} Empty for no deltas; `resamples` copies of the single
 *   value for one delta; otherwise `resamples` resampled means.
 */
function bootstrapMeanSamples(deltas, resamples, seed) {
  const n = deltas.length;
  if (n === 0) return [];
  // One observation always resamples to itself; no randomness needed.
  if (n === 1) return new Array(resamples).fill(deltas[0]);
  const rng = seedRng(seed);
  const samples = new Array(resamples);
  let round = 0;
  while (round < resamples) {
    let total = 0;
    let draws = 0;
    while (draws < n) {
      total += deltas[Math.floor(rng() * n)];
      draws += 1;
    }
    samples[round] = total / n;
    round += 1;
  }
  return samples;
}
587
/**
 * Returns a random-number source.
 *
 * @param {number} [seed] - Seed, truncated to an unsigned 32-bit integer.
 * @returns {function(): number} `Math.random` when no seed is given;
 *   otherwise a deterministic generator yielding values in [0, 1).
 */
function seedRng(seed) {
  if (seed === undefined) return Math.random;
  let state = seed >>> 0;
  return function next() {
    // Advance the 32-bit state by a fixed increment (1831565813 = 0x6D2B79F5).
    state = (state + 1831565813) >>> 0;
    const mixed = Math.imul(state ^ (state >>> 15), state | 1);
    const scrambled = mixed ^ (mixed + Math.imul(mixed ^ (mixed >>> 7), mixed | 61));
    // Convert to unsigned and scale by 2^32.
    return ((scrambled ^ (scrambled >>> 14)) >>> 0) / 4294967296;
  };
}
598
/**
 * Sample standard deviation (n - 1 denominator) around a precomputed mean.
 *
 * @param {number[]} xs - Observations.
 * @param {number} mean3 - Mean of `xs`, supplied by the caller.
 * @returns {number} Standard deviation; 0 when fewer than two observations.
 */
function stdev(xs, mean3) {
  const count = xs.length;
  if (count < 2) return 0;
  const sumSquares = xs.reduce((acc, x) => acc + (x - mean3) ** 2, 0);
  return Math.sqrt(sumSquares / (count - 1));
}
604
+ async function researchReport(runs, opts = {}) {
605
+ const split = opts.split ?? "holdout";
606
+ const comparator = opts.comparator ?? null;
607
+ const confidence = opts.confidence ?? 0.95;
608
+ const fdr = opts.fdr ?? 0.05;
609
+ const minPairs = Math.max(opts.minPairs ?? 20, RESEARCH_REPORT_HARD_PAIR_FLOOR);
610
+ const rope = opts.rope ?? null;
611
+ const mdePower = opts.mdePower ?? 0.8;
612
+ const mdeAlpha = opts.mdeAlpha ?? fdr;
613
+ const title = opts.title ?? "Agent Evaluation Research Report";
614
+ const generatedAt = opts.generatedAt ?? (/* @__PURE__ */ new Date()).toISOString();
615
+ const preregistrationHash = opts.preregistrationHash ?? null;
616
+ if (rope && !(Number.isFinite(rope.low) && Number.isFinite(rope.high) && rope.low <= rope.high)) {
617
+ throw new Error(`researchReport: rope must satisfy low \u2264 high with finite bounds, got ${JSON.stringify(rope)}`);
618
+ }
619
+ const summary = summaryTable(runs, {
620
+ comparator: comparator ?? void 0,
621
+ split,
622
+ confidence,
623
+ fdr
624
+ });
625
+ const pareto = paretoChart(runs, { split, gateDecisions: opts.gateDecisions });
626
+ const candidateIds = opts.candidateIds ?? summary.rows.map((r) => r.candidateId).filter((id) => id !== comparator);
627
+ const gains = comparator ? candidateIds.map((id) => gainHistogram(runs, id, comparator, {
628
+ split,
629
+ confidence,
630
+ seed: opts.seed
631
+ })) : [];
632
+ const gainByCandidate = new Map(gains.map((g) => [g.candidateId, g]));
633
+ const paretoByCandidate = new Map(pareto.points.map((p) => [p.candidateId, p]));
634
+ const posteriorByCandidate = /* @__PURE__ */ new Map();
635
+ if (comparator) {
636
+ for (const id of candidateIds) {
637
+ posteriorByCandidate.set(id, pairedPosterior(runs, id, comparator, {
638
+ split,
639
+ confidence,
640
+ seed: opts.seed,
641
+ rope,
642
+ mdePower,
643
+ mdeAlpha
644
+ }));
645
+ }
646
+ }
647
+ const candidates = summary.rows.map((row) => {
648
+ const gain = gainByCandidate.get(row.candidateId);
649
+ const point = paretoByCandidate.get(row.candidateId);
650
+ const posterior = posteriorByCandidate.get(row.candidateId) ?? null;
651
+ const classified = classifyCandidate(row, {
652
+ comparator,
653
+ posterior,
654
+ point,
655
+ fdr,
656
+ minPairs,
657
+ rope
658
+ });
659
+ return {
660
+ candidateId: row.candidateId,
661
+ n: row.n,
662
+ mean: row.mean,
663
+ ciLow: row.ciLow,
664
+ ciHigh: row.ciHigh,
665
+ qValue: row.qValue,
666
+ cohensD: row.cohensD,
667
+ meanDeltaVsComparator: posterior ? posterior.meanDelta : null,
668
+ pairedN: posterior?.n ?? gain?.n ?? 0,
669
+ medianGain: posterior ? posterior.medianDelta : gain ? gain.median : null,
670
+ meanGain: posterior ? posterior.meanDelta : null,
671
+ gainCi: posterior ? posterior.ci : gain ? gain.ci : null,
672
+ prGreaterThanZero: posterior ? posterior.prGreaterThanZero : null,
673
+ prInRope: posterior ? posterior.prInRope : null,
674
+ mde: posterior ? posterior.mde : null,
675
+ onParetoFrontier: point?.onFrontier ?? false,
676
+ gate: point?.gate,
677
+ decision: classified.decision,
678
+ decisionReason: classified.reason
679
+ };
680
+ }).sort((a, b) => {
681
+ const decisionRank = decisionWeight(b.decision) - decisionWeight(a.decision);
682
+ if (decisionRank !== 0) return decisionRank;
683
+ return b.mean - a.mean;
684
+ });
685
+ const recommendation = buildRecommendation(candidates, {
686
+ comparator,
687
+ failureClusters: opts.failureClusters,
688
+ rope,
689
+ minPairs,
690
+ preregistrationHash
691
+ });
692
+ const executiveSummary = buildExecutiveSummary(candidates, recommendation, {
693
+ comparator,
694
+ split,
695
+ failureClusters: opts.failureClusters,
696
+ preregistrationHash
697
+ });
698
+ const methodology = buildMethodology({ split, comparator, fdr, minPairs, rope, confidence, mdePower, mdeAlpha });
699
+ const runFingerprint = await hashJson(canonicalize({
700
+ triples: runs.filter((r) => r.splitTag === split).map((r) => ({ runId: r.runId, candidateId: r.candidateId, splitTag: r.splitTag })).sort((a, b) => a.runId.localeCompare(b.runId)),
701
+ comparator,
702
+ split
703
+ }));
704
+ const markdown = renderResearchMarkdown({
705
+ title,
706
+ generatedAt,
707
+ split,
708
+ comparator,
709
+ rope,
710
+ runFingerprint,
711
+ preregistrationHash,
712
+ executiveSummary,
713
+ recommendation,
714
+ candidates,
715
+ summary,
716
+ pareto,
717
+ gains,
718
+ methodology,
719
+ failureClusters: opts.failureClusters
720
+ });
721
+ const html = renderResearchHtml(markdown, title);
722
+ return {
723
+ kind: "agent-eval-research-report",
724
+ title,
725
+ generatedAt,
726
+ split,
727
+ comparator,
728
+ runFingerprint,
729
+ preregistrationHash,
730
+ rope,
731
+ executiveSummary,
732
+ recommendation,
733
+ candidates,
734
+ summary,
735
+ charts: { pareto, gains },
736
+ methodology,
737
+ failureClusters: opts.failureClusters,
738
+ markdown,
739
+ html
740
+ };
741
+ }
742
/**
 * Build the static "Methodology" section of the research report.
 *
 * Most entries are fixed prose; a few interpolate the thresholds actually
 * used for this run so the rendered report states them explicitly.
 *
 * @param {object} ctx - Analysis settings. Reads `fdr`, `minPairs`,
 *   `confidence`, `rope`, `comparator`, `mdeAlpha` and `mdePower`.
 * @returns {{assumptions: string[], methods: string[], alternatives: string[], whenNotToApply: string[], citations: string[]}}
 */
function buildMethodology(ctx) {
  // Optional assumptions are only listed when the matching setting applies.
  const ropeAssumption = ctx.rope
    ? [`The Region of Practical Equivalence ${formatRope(ctx.rope)} is supplied by the domain owner; equivalent verdicts are only meaningful if that range is treated as the standing definition of "no material difference."`]
    : [];
  // Strict null check: only an explicit `null` comparator adds the caveat.
  const comparatorAssumption = ctx.comparator === null
    ? ["No comparator was configured; this run is descriptive, not causal."]
    : [];

  return {
    assumptions: [
      "Pairs are matched by (experimentId, seed); the candidate and comparator see the same scenarios in the same order.",
      "Paired deltas are exchangeable conditional on the matched scenario \u2014 no mid-run distribution shift.",
      `Decisions are pre-specified at fdr=${ctx.fdr}, minPairs=${ctx.minPairs}, confidence=${ctx.confidence}; deviating from these post-hoc invalidates the false-discovery control.`,
      ...ropeAssumption,
      ...comparatorAssumption
    ],
    methods: [
      "Marginal scores summarised with BH-FDR-adjusted Wilcoxon signed-rank q-values and Cohen's d via summaryTable.",
      "Paired evidence summarised with bootstrap CI on the median delta and Bayesian-bootstrap-style Pr(\u0394>0) and Pr(\u0394\u2208ROPE) on the mean delta.",
      `Minimum detectable effect reported per candidate at \u03B1=${ctx.mdeAlpha} (two-sided), power=${ctx.mdePower}, standardised by the observed paired-delta SD.`,
      "Pareto frontier flagged as a separate axis (cost vs quality); a candidate can be on-frontier without winning the paired test.",
      "Held-out gate decisions, when supplied, override the statistical verdict in the reject direction."
    ],
    alternatives: [
      "Paired t-test rejected: not robust to the heavy-tailed score distributions common in agent benchmarks.",
      "Unpaired Mann\u2013Whitney rejected: matched scenarios make pairing free; unpaired throws away that variance reduction.",
      "Sequential / always-valid inference (e-values, mSPRT) is the right tool for iterative sweeps and is out of scope for this single-look report \u2014 preregister and run once, or wrap this report in an alpha-spending schedule.",
      "Hierarchical Bayesian shrinkage across many candidates is future work; the current ranking uses raw paired statistics."
    ],
    whenNotToApply: [
      `Paired N below ${RESEARCH_REPORT_HARD_PAIR_FLOOR} on any candidate \u2014 the bootstrap CI is degenerate.`,
      "Comparator chosen post-hoc by inspecting the same data; q-values are no longer false-discovery-controlled.",
      "Scenarios not drawn under a stable preregistered protocol; the report can describe the data but cannot anchor a launch decision.",
      "Score distributions with mid-run shift (judge model swap, rubric change, infra outage) \u2014 pair exchangeability is violated."
    ],
    citations: [
      "Benjamini, Y. & Hochberg, Y. (1995). Controlling the false discovery rate: a practical and powerful approach to multiple testing. JRSS B, 57(1), 289\u2013300.",
      "Wilcoxon, F. (1945). Individual comparisons by ranking methods. Biometrics Bulletin, 1(6), 80\u201383.",
      "Efron, B. (1979). Bootstrap methods: another look at the jackknife. Annals of Statistics, 7(1), 1\u201326.",
      "Rubin, D. B. (1981). The Bayesian bootstrap. Annals of Statistics, 9(1), 130\u2013134.",
      "Kruschke, J. K. (2018). Rejecting or accepting parameter values in Bayesian estimation. Advances in Methods and Practices in Psychological Science, 1(2), 270\u2013280. (ROPE.)"
    ]
  };
}
782
/**
 * Format a Region of Practical Equivalence as a bracketed interval.
 *
 * @param {{low: number, high: number}} rope - Interval bounds.
 * @returns {string} `[low, high]` with each bound rendered by `fmt2`.
 */
function formatRope(rope) {
  const lower = fmt2(rope.low);
  const upper = fmt2(rope.high);
  return `[${lower}, ${upper}]`;
}
785
/**
 * Turn one summary row plus its paired evidence into a decision and a
 * human-readable reason.
 *
 * Checks run in this order; the first match wins:
 *   1. the row is the comparator itself            -> "hold"
 *   2. no comparator configured                    -> "hold" if on the Pareto
 *      frontier, otherwise "needs_more_data"
 *   3. a gate is present and is not "promote"      -> "reject"
 *   4. no posterior, or fewer pairs than the hard floor -> "needs_more_data"
 *   5. paired-delta CI fully inside the ROPE       -> "equivalent"
 *   6. paired-delta CI entirely below zero         -> "reject"
 *   7. fewer pairs than `minPairs`                 -> "needs_more_data"
 *   8. q <= fdr and CI entirely above zero         -> "promote"
 *   9. anything else                               -> "hold"
 *
 * @param {object} row - Summary row; reads `candidateId` and `qValue`.
 * @param {object} ctx - Reads `comparator`, `point`, `posterior`, `rope`,
 *   `fdr` and `minPairs`.
 * @returns {{decision: string, reason: string}}
 */
function classifyCandidate(row, ctx) {
  const { comparator, point, posterior, rope, fdr, minPairs } = ctx;

  if (comparator && row.candidateId === comparator) {
    return { decision: "hold", reason: "Comparator baseline." };
  }
  if (!comparator) {
    return {
      decision: point?.onFrontier ? "hold" : "needs_more_data",
      reason: "No comparator configured; report ranks candidates but cannot anchor a promotion call."
    };
  }
  if (point?.gate && point.gate !== "promote") {
    return { decision: "reject", reason: `Held-out gate returned ${point.gate}.` };
  }
  if (!posterior || posterior.n < RESEARCH_REPORT_HARD_PAIR_FLOOR) {
    return {
      decision: "needs_more_data",
      reason: `Only ${posterior?.n ?? 0} paired observations; below hard floor of ${RESEARCH_REPORT_HARD_PAIR_FLOOR} for any paired inference.`
    };
  }

  const ci = posterior.ci;
  const ciText = `[${fmt2(ci.low)}, ${fmt2(ci.high)}]`;

  if (rope && ci.low >= rope.low && ci.high <= rope.high) {
    return {
      decision: "equivalent",
      reason: `Paired-delta CI ${ciText} is fully inside ROPE ${formatRope(rope)}; candidate is practically equivalent to comparator.`
    };
  }

  const significant = Number.isFinite(row.qValue) && row.qValue <= fdr;
  const gainPositive = ci.low > 0;
  const gainNegative = ci.high < 0;

  if (gainNegative) {
    return { decision: "reject", reason: `Paired-delta CI ${ciText} lies entirely below zero.` };
  }
  if (posterior.n < minPairs) {
    return {
      decision: "needs_more_data",
      reason: `Only ${posterior.n} paired observations; minimum detectable effect at this N is ${fmt2(posterior.mde)} score units (need \u2265 ${minPairs} pairs to issue a directional verdict).`
    };
  }

  const prText = `Pr(\u0394>0)=${fmt2(posterior.prGreaterThanZero)}`;
  if (significant && gainPositive) {
    return {
      decision: "promote",
      reason: `BH-adjusted q=${fmt2(row.qValue)} \u2264 ${fdr} and paired-delta CI ${ciText} excludes zero; ${prText}.`
    };
  }

  if (gainPositive) {
    // The CI is above zero, so the only thing blocking promotion is the
    // q-value. Saying the CI "crosses zero" here would misstate the evidence.
    const qText = Number.isFinite(row.qValue)
      ? `BH-adjusted q=${fmt2(row.qValue)} exceeds ${fdr}`
      : "no finite BH-adjusted q-value is available";
    return {
      decision: "hold",
      reason: `${prText} and CI ${ciText} excludes zero, but ${qText}; effect not decisive at fdr=${fdr}.`
    };
  }
  return {
    decision: "hold",
    reason: `${prText} but CI ${ciText} crosses zero; effect not decisive at fdr=${fdr}.`
  };
}
834
/**
 * Derive the report-level recommendation from the classified candidates.
 *
 * The headline candidate is the first "promote", else the first
 * "equivalent", else the first non-comparator candidate in the given order.
 * The headline decision is resolved by precedence: promote, then
 * needs_more_data, then equivalent, then hold, then reject.
 *
 * @param {object[]} candidates - Classified candidate rows, in ranked order.
 * @param {object} ctx - Reads `comparator`, `preregistrationHash`, `rope`,
 *   `minPairs` and `failureClusters`.
 * @returns {{decision: string, candidateId: (string|null), rationale: string[], risks: string[], nextActions: string[]}}
 */
function buildRecommendation(candidates, ctx) {
  const contenders = candidates.filter((c) => c.candidateId !== ctx.comparator);
  const firstWith = (wanted) => contenders.find((c) => c.decision === wanted);
  const allWith = (wanted) => contenders.filter((c) => c.decision === wanted);

  const bestPromote = firstWith("promote");
  const bestEquivalent = firstWith("equivalent");
  const starved = allWith("needs_more_data");
  const chosen = bestPromote ?? bestEquivalent ?? contenders[0] ?? null;

  let decision;
  if (bestPromote) {
    decision = "promote";
  } else if (starved.length > 0) {
    decision = "needs_more_data";
  } else if (bestEquivalent) {
    decision = "equivalent";
  } else if (firstWith("hold") !== undefined) {
    decision = "hold";
  } else {
    decision = "reject";
  }

  const rationale = [];
  const risks = [];
  const nextActions = [];

  if (chosen) {
    rationale.push(`${chosen.candidateId}: ${chosen.decisionReason}`);
    if (chosen.gainCi) {
      const probSummary = chosen.prGreaterThanZero !== null
        ? `, Pr(\u0394>0)=${fmt2(chosen.prGreaterThanZero)}`
        : "";
      rationale.push(`Median paired gain CI: [${fmt2(chosen.gainCi.low)}, ${fmt2(chosen.gainCi.high)}]${probSummary}.`);
    }
    if (chosen.mde !== null && Number.isFinite(chosen.mde)) {
      rationale.push(`MDE at current paired N=${chosen.pairedN}: ${fmt2(chosen.mde)} score units.`);
    }
  }

  // Configuration gaps: each contributes one risk and one follow-up action.
  if (!ctx.comparator) {
    risks.push("No comparator was configured; verdict is descriptive, not causal.");
    nextActions.push("Re-run with a stable comparator candidate for paired inference.");
  }
  if (!ctx.preregistrationHash) {
    risks.push("No preregistration hash supplied; readers cannot verify the analysis was specified before data inspection.");
    nextActions.push("Sign a HypothesisManifest before the next sweep and pass `preregistrationHash` so the report cites it.");
  }
  if (ctx.rope === null && contenders.length > 0) {
    risks.push('No ROPE configured; the report cannot distinguish "equivalent" from "inconclusive".');
    nextActions.push("Define a domain-specific Region of Practical Equivalence and pass it to lock in the equivalence threshold.");
  }

  if (starved.length > 0) {
    // Thinnest = fewest pairs; ties keep the earliest candidate.
    let thinnest = starved[0];
    for (const candidate of starved.slice(1)) {
      if (candidate.pairedN < thinnest.pairedN) thinnest = candidate;
    }
    risks.push(`${starved.length} candidate(s) below soft floor (${ctx.minPairs} pairs); thinnest is ${thinnest.candidateId} with ${thinnest.pairedN}.`);
    nextActions.push(`Collect at least ${ctx.minPairs - thinnest.pairedN} more matched holdout runs for ${thinnest.candidateId}.`);
  }

  const rejectedCount = allWith("reject").length;
  if (rejectedCount > 0) {
    risks.push(`${rejectedCount} candidate(s) failed the paired test or held-out gate; do not ship those variants.`);
  }

  if (ctx.failureClusters && ctx.failureClusters.clusters.length > 0) {
    const top = ctx.failureClusters.clusters[0];
    risks.push(`Top failure cluster: ${top.failureClass} across ${top.runCount} run(s).`);
    nextActions.push("Prioritize the largest failure cluster before broad rollout.");
  }

  // Closing action keyed on the headline decision; "needs_more_data" adds none.
  switch (decision) {
    case "promote":
      nextActions.push("Ship behind the existing promotion gate and monitor canaries.");
      break;
    case "hold":
      nextActions.push("Keep current production candidate while expanding holdout evidence.");
      break;
    case "equivalent":
      nextActions.push("Either keep the comparator (no quality regression) or promote on cost/latency grounds \u2014 equivalence does not justify either; the choice is a product decision, not a stats one.");
      break;
    case "reject":
      nextActions.push("Do not promote this sweep; inspect failures and generate a revised candidate.");
      break;
    default:
      break;
  }

  return {
    decision,
    candidateId: chosen?.candidateId ?? null,
    rationale,
    risks,
    nextActions
  };
}
897
/**
 * Produce the bullet lines of the report's executive summary.
 *
 * @param {object[]} candidates - Classified candidate rows.
 * @param {{decision: string, candidateId: (string|null)}} recommendation
 * @param {object} ctx - Reads `comparator`, `split`, `failureClusters` and
 *   `preregistrationHash`.
 * @returns {string[]} One sentence per bullet.
 */
function buildExecutiveSummary(candidates, recommendation, ctx) {
  const contenders = candidates.filter((c) => c.candidateId !== ctx.comparator);
  const tally = (wanted) => contenders.reduce((n, c) => (c.decision === wanted ? n + 1 : n), 0);

  const againstText = ctx.comparator ? ` against ${ctx.comparator}` : "";
  const chosenText = recommendation.candidateId ? ` ${recommendation.candidateId}` : "";

  const summary = [
    `Evaluated ${contenders.length} candidate(s) on the ${ctx.split} split${againstText}.`,
    `Recommendation: ${recommendation.decision}${chosenText}.`,
    `Decision mix: ${tally("promote")} promote, ${tally("equivalent")} equivalent, ${tally("hold")} hold, ${tally("reject")} reject, ${tally("needs_more_data")} need more data.`
  ];

  const frontierIds = [];
  for (const c of contenders) {
    if (c.onParetoFrontier) frontierIds.push(c.candidateId);
  }
  if (frontierIds.length > 0) {
    summary.push(`Pareto-frontier candidates: ${frontierIds.join(", ")}.`);
  }

  const clusters = ctx.failureClusters;
  if (clusters) {
    summary.push(`Failure clustering found ${clusters.totalFailures}/${clusters.totalRuns} failed runs across ${clusters.clusters.length} reportable cluster(s).`);
  }

  if (ctx.preregistrationHash) {
    // Only the first 12 characters of the hash are shown.
    summary.push(`Preregistered analysis: ${ctx.preregistrationHash.slice(0, 12)}\u2026`);
  } else {
    summary.push("Analysis is post-hoc \u2014 no preregistration hash supplied.");
  }
  return summary;
}
916
/**
 * Render the assembled research report as a Markdown document.
 *
 * Sections appear in a fixed order: header metadata, executive summary,
 * recommendation (rationale / risks / next actions), candidate decision
 * table, statistical summary, methodology, chart specs (as a fenced JSON
 * block) and, when present, up to ten failure clusters.
 *
 * @param {object} report - Report fields; reads `title`, `generatedAt`,
 *   `split`, `comparator`, `rope`, `runFingerprint`, `preregistrationHash`,
 *   `executiveSummary`, `recommendation`, `candidates`, `summary.markdown`,
 *   `methodology`, `pareto`, `gains` and `failureClusters`.
 * @returns {string} Markdown joined with "\n" (no trailing newline).
 */
function renderResearchMarkdown(report) {
  const { recommendation, methodology } = report;
  const bullets = (items) => items.map((item) => `- ${item}`);
  // A section is: heading, blank line, body lines, blank line.
  const section = (heading, body) => [heading, "", ...body, ""];
  const dashUnlessFinite = (value, digits) => (Number.isFinite(value) ? value.toFixed(digits) : "-");

  const candidateRows = report.candidates.map((c) => {
    const cells = [
      c.candidateId,
      c.decision,
      fmt2(c.mean),
      c.meanDeltaVsComparator === null ? "-" : signed(c.meanDeltaVsComparator),
      c.prGreaterThanZero === null ? "-" : c.prGreaterThanZero.toFixed(3),
      dashUnlessFinite(c.qValue, 4),
      dashUnlessFinite(c.cohensD, 3),
      c.pairedN,
      c.gainCi ? `[${fmt2(c.gainCi.low)}, ${fmt2(c.gainCi.high)}]` : "-",
      c.mde === null || !Number.isFinite(c.mde) ? "-" : fmt2(c.mde),
      c.onParetoFrontier ? "yes" : "no",
      c.gate ?? "-"
    ];
    return `| ${cells.join(" | ")} |`;
  });

  const risks = recommendation.risks.length
    ? recommendation.risks
    : ["No material report-level risks detected."];

  const out = [
    `# ${report.title}`,
    "",
    `**Generated:** ${report.generatedAt}`,
    `**Primary split:** ${report.split}`,
    `**Comparator:** ${report.comparator ?? "not configured"}`,
    `**ROPE:** ${report.rope ? formatRope(report.rope) : "not configured"}`,
    `**Run fingerprint:** \`${report.runFingerprint}\``,
    `**Preregistration:** ${report.preregistrationHash ? `\`${report.preregistrationHash}\`` : "none"}`,
    "",
    ...section("## Executive Summary", bullets(report.executiveSummary)),
    ...section("## Recommendation", [
      `**Decision:** ${recommendation.decision}`,
      `**Candidate:** ${recommendation.candidateId ?? "N/A"}`
    ]),
    ...section("### Rationale", bullets(recommendation.rationale)),
    ...section("### Risks", bullets(risks)),
    ...section("### Next Actions", bullets(recommendation.nextActions)),
    ...section("## Candidate Decision Table", [
      "| Candidate | Decision | Mean | \u0394\u0304 | Pr(\u0394>0) | q | d | Paired N | Median Gain CI | MDE | Pareto | Gate |",
      "|---|---|---:|---:|---:|---:|---:|---:|---|---:|---|---|",
      ...candidateRows
    ]),
    ...section("## Statistical Summary", [report.summary.markdown]),
    "## Methodology",
    "",
    ...section("### Assumptions", bullets(methodology.assumptions)),
    ...section("### Methods", bullets(methodology.methods)),
    ...section("### Alternatives Considered", bullets(methodology.alternatives)),
    ...section("### When NOT To Apply", bullets(methodology.whenNotToApply)),
    ...section("### Citations", bullets(methodology.citations)),
    ...section("## Chart Specs", [
      "The report carries JSON chart specs for Pareto cost/quality and paired gain histograms."
    ]),
    "```json",
    JSON.stringify({ pareto: report.pareto, gains: report.gains }, null, 2),
    "```"
  ];

  if (report.failureClusters) {
    out.push(
      "",
      "## Failure Clusters",
      "",
      "| Failure Class | Runs | Scenarios | Tool | Example |",
      "|---|---:|---:|---|---|"
    );
    // Only the first ten clusters are listed.
    for (const cluster of report.failureClusters.clusters.slice(0, 10)) {
      const example = escapePipes(cluster.exampleError ?? cluster.exampleRunId);
      out.push(`| ${cluster.failureClass} | ${cluster.runCount} | ${cluster.scenarioIds.length} | ${cluster.toolName ?? "-"} | ${example} |`);
    }
  }
  return out.join("\n");
}
1009
/**
 * Wrap the report Markdown in a self-contained HTML page.
 *
 * The Markdown is converted with `markdownToHtml`, the title is HTML-escaped,
 * and a small inline stylesheet is embedded in the document head.
 *
 * @param {string} markdown - Report Markdown.
 * @param {string} title - Page title (escaped before insertion).
 * @returns {string} Complete HTML document, lines joined with "\n".
 */
function renderResearchHtml(markdown, title) {
  const content = markdownToHtml(markdown);
  const safeTitle = escapeHtml(title);
  const styleRules = [
    'body{font-family:Inter,ui-sans-serif,system-ui,-apple-system,BlinkMacSystemFont,"Segoe UI",sans-serif;margin:0;color:#172026;background:#f7f8f8;}',
    "main{max-width:1080px;margin:0 auto;padding:40px 24px 64px;background:#fff;min-height:100vh;}",
    "h1{font-size:34px;line-height:1.15;margin:0 0 20px;}h2{margin-top:34px;border-top:1px solid #d9dfdf;padding-top:22px;}h3{margin-top:22px;}",
    "p,li{line-height:1.55;}table{border-collapse:collapse;width:100%;margin:16px 0;font-size:14px;}th,td{border:1px solid #d9dfdf;padding:8px;text-align:left;}th{background:#eef2f2;}",
    "code,pre{font-family:ui-monospace,SFMono-Regular,Menlo,monospace;}pre{overflow:auto;background:#111827;color:#f9fafb;padding:16px;border-radius:6px;}"
  ];
  const headLines = [
    "<head>",
    '<meta charset="utf-8">',
    '<meta name="viewport" content="width=device-width, initial-scale=1">',
    `<title>${safeTitle}</title>`,
    "<style>",
    ...styleRules,
    "</style>",
    "</head>"
  ];
  const documentLines = [
    "<!doctype html>",
    '<html lang="en">',
    ...headLines,
    "<body><main>",
    content,
    "</main></body></html>"
  ];
  return documentLines.join("\n");
}
1031
/**
 * Convert the restricted Markdown dialect emitted by the report renderer
 * into HTML.
 *
 * Supported constructs, matched on line prefixes:
 *   - fenced code blocks (lines starting with three backticks)
 *   - pipe tables (consecutive lines starting with "|")
 *   - unordered lists (lines starting with "- ")
 *   - headings "# ", "## ", "### "
 *   - blank lines (kept as empty output lines) and plain paragraphs
 *
 * Code block content is HTML-escaped verbatim; other text goes through
 * `inlineMarkdown`. A code fence left open at the end of the input is still
 * emitted, so its content is not silently dropped.
 *
 * @param {string} markdown - Markdown source.
 * @returns {string} HTML fragment, lines joined with "\n".
 */
function markdownToHtml(markdown) {
  const html = [];
  let inList = false;
  let inCode = false;
  let code = [];
  let table = [];

  const flushList = () => {
    if (!inList) return;
    html.push("</ul>");
    inList = false;
  };
  const flushTable = () => {
    if (table.length === 0) return;
    html.push(renderMarkdownTable(table));
    table = [];
  };
  const flushCode = () => {
    html.push(`<pre><code>${escapeHtml(code.join("\n"))}</code></pre>`);
    code = [];
    inCode = false;
  };

  for (const line of markdown.split("\n")) {
    if (line.startsWith("```")) {
      if (inCode) {
        flushCode();
      } else {
        // Opening fence: close any open list or table first.
        flushList();
        flushTable();
        inCode = true;
      }
      continue;
    }
    if (inCode) {
      code.push(line);
      continue;
    }
    if (line.startsWith("|")) {
      // Table rows are buffered and rendered together once the table ends.
      flushList();
      table.push(line);
      continue;
    }
    flushTable();
    if (line.startsWith("- ")) {
      if (!inList) {
        html.push("<ul>");
        inList = true;
      }
      html.push(`<li>${inlineMarkdown(line.slice(2))}</li>`);
      continue;
    }
    flushList();
    if (line.startsWith("# ")) html.push(`<h1>${inlineMarkdown(line.slice(2))}</h1>`);
    else if (line.startsWith("## ")) html.push(`<h2>${inlineMarkdown(line.slice(3))}</h2>`);
    else if (line.startsWith("### ")) html.push(`<h3>${inlineMarkdown(line.slice(4))}</h3>`);
    else if (line.trim() === "") html.push("");
    else html.push(`<p>${inlineMarkdown(line)}</p>`);
  }

  // An unterminated fence would otherwise lose all buffered code lines.
  if (inCode) flushCode();
  flushList();
  flushTable();
  return html.join("\n");
}
1091
/**
 * Render buffered Markdown pipe-table lines as an HTML table.
 *
 * Separator rows (only dashes, colons, whitespace and pipes) are dropped.
 * The first remaining row becomes the header; the rest become body rows.
 * Cells are split on unescaped pipes only, and a `\|` escape inside a cell
 * is turned back into a literal `|`, so text passed through `escapePipes`
 * stays in a single column. An escaped backslash directly before a pipe
 * (`\\|`) is not treated specially.
 *
 * @param {string[]} lines - Raw table lines, each starting with "|".
 * @returns {string} `<table>` markup, or "" when no content rows remain.
 */
function renderMarkdownTable(lines) {
  const toCells = (line) => {
    let inner = line.trim();
    if (inner.startsWith("|")) inner = inner.slice(1);
    // Drop the closing pipe only when it is a real delimiter, not "\|".
    if (/(?<!\\)\|$/.test(inner)) inner = inner.slice(0, -1);
    return inner
      .split(/(?<!\\)\|/)
      .map((cell) => inlineMarkdown(cell.trim().replace(/\\\|/g, "|")));
  };

  const rows = lines
    .filter((line) => !/^\|[-:\s|]+\|$/.test(line))
    .map(toCells);
  if (rows.length === 0) return "";

  const [head, ...body] = rows;
  const th = head.map((cell) => `<th>${cell}</th>`).join("");
  const trs = body
    .map((row) => `<tr>${row.map((cell) => `<td>${cell}</td>`).join("")}</tr>`)
    .join("\n");
  return `<table><thead><tr>${th}</tr></thead><tbody>${trs}</tbody></table>`;
}
1099
/**
 * Render inline Markdown for one text fragment: the text is HTML-escaped
 * first, then every `**bold**` span becomes `<strong>bold</strong>`.
 *
 * @param {string} s - Raw text.
 * @returns {string} HTML-safe text with bold spans converted.
 */
function inlineMarkdown(s) {
  const escaped = escapeHtml(s);
  return escaped.replace(/\*\*([^*]+)\*\*/g, (_whole, inner) => `<strong>${inner}</strong>`);
}
1102
/**
 * Escape the characters `&`, `<`, `>` and `"` for inclusion in HTML.
 *
 * @param {string} s - Raw text.
 * @returns {string} Text with those four characters replaced by entities.
 */
function escapeHtml(s) {
  return s.replace(/[&<>"]/g, (ch) => {
    switch (ch) {
      case "&":
        return "&amp;";
      case "<":
        return "&lt;";
      case ">":
        return "&gt;";
      default:
        return "&quot;";
    }
  });
}
1105
/**
 * Backslash-escape every `|` so the text can sit inside a Markdown table cell.
 *
 * @param {string} s - Cell text.
 * @returns {string} Text with each `|` replaced by `\|`.
 */
function escapePipes(s) {
  return s.split("|").join("\\|");
}
1108
/**
 * Map a decision label to its sort weight (higher sorts first).
 *
 * @param {string} decision - Decision label.
 * @returns {number} 5 promote, 4 equivalent, 3 hold, 2 needs_more_data,
 *   1 for anything else.
 */
function decisionWeight(decision) {
  switch (decision) {
    case "promote":
      return 5;
    case "equivalent":
      return 4;
    case "hold":
      return 3;
    case "needs_more_data":
      return 2;
    default:
      return 1;
  }
}
1115
/**
 * Format a number via `fmt2` with an explicit "+" for values >= 0.
 *
 * @param {number} x - Value to format.
 * @returns {string} Signed text; negative values keep the "-" from `fmt2`.
 */
function signed(x) {
  const prefix = x >= 0 ? "+" : "";
  const magnitude = fmt2(x);
  return `${prefix}${magnitude}`;
}
1118
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} xs - Values to average.
 * @returns {number} The mean, or NaN for an empty list.
 */
function avg(xs) {
  const count = xs.length;
  if (count === 0) return Number.NaN;
  let total = 0;
  for (const value of xs) total += value;
  return total / count;
}
1122
/**
 * Median of an array that is already sorted ascending.
 *
 * @param {number[]} sorted - Pre-sorted values (not re-sorted here).
 * @returns {number} The middle element, the mean of the two middle elements
 *   for even lengths, or 0 for an empty array.
 */
function medianOfSorted(sorted) {
  const size = sorted.length;
  if (size === 0) return 0;
  const upper = Math.floor(size / 2);
  if (size % 2 !== 0) return sorted[upper];
  return (sorted[upper - 1] + sorted[upper]) / 2;
}
1127
/**
 * Format a number with four decimal places. Anything that is not a finite
 * number is passed through `String()` unchanged.
 *
 * @param {*} x - Value to format.
 * @returns {string} Fixed-point text, or the stringified input.
 */
function fmt2(x) {
  return Number.isFinite(x) ? x.toFixed(4) : String(x);
}
1131
+
1132
+ // src/release-report.ts
1133
/**
 * Render a release scorecard as a Markdown report.
 *
 * Sections: title and status, summary text, a metrics table, then optional
 * sections for issues, responsible surfaces, failure modes, a holdout run
 * summary (when `options.runs` is non-empty), TraceAnalyst findings and
 * next actions. Non-finite metric values are shown as "n/a" throughout the
 * metrics table, including mean cost and p95 wall time.
 *
 * @param {object} scorecard - Reads `target`, `status`, `promote`,
 *   `candidateId`, `baselineId`, `summary`, `metrics` and `issues`.
 * @param {object} [options] - Optional `title`, `runs`, `comparator`,
 *   `traceAnalystFindings` and `nextActions` (defaults to
 *   `defaultNextActions(scorecard)`).
 * @returns {string} Markdown ending in exactly one newline.
 */
function renderReleaseReport(scorecard, options = {}) {
  const title = options.title ?? `Release Report: ${scorecard.target}`;
  const metrics = scorecard.metrics;
  // Keep the unit out of the cell when there is no number to attach it to.
  const meanCost = Number.isFinite(metrics.meanCostUsd) ? `$${num(metrics.meanCostUsd)}` : "n/a";
  const p95Wall = Number.isFinite(metrics.p95WallMs) ? `${Math.round(metrics.p95WallMs)} ms` : "n/a";

  const lines = [];
  lines.push(`# ${title}`);
  lines.push("");
  lines.push(`Status: **${scorecard.status.toUpperCase()}**`);
  lines.push(`Promote: **${scorecard.promote ? "yes" : "no"}**`);
  if (scorecard.candidateId) lines.push(`Candidate: \`${scorecard.candidateId}\``);
  if (scorecard.baselineId) lines.push(`Baseline: \`${scorecard.baselineId}\``);
  lines.push("");
  lines.push(scorecard.summary);
  lines.push("");
  lines.push("## Metrics");
  lines.push("");
  lines.push("| Metric | Value |");
  lines.push("|---|---:|");
  lines.push(`| Scenarios | ${metrics.scenarioCount} |`);
  lines.push(`| Search runs | ${metrics.searchRuns} |`);
  lines.push(`| Holdout runs | ${metrics.holdoutRuns} |`);
  lines.push(`| Pass rate | ${pct(metrics.passRate)} |`);
  lines.push(`| Mean score | ${num(metrics.meanScore)} |`);
  lines.push(`| Search mean | ${num(metrics.searchMeanScore)} |`);
  lines.push(`| Holdout mean | ${num(metrics.holdoutMeanScore)} |`);
  lines.push(`| Overfit gap | ${num(metrics.overfitGap)} |`);
  lines.push(`| Mean cost | ${meanCost} |`);
  lines.push(`| p95 wall time | ${p95Wall} |`);
  lines.push("");

  if (scorecard.issues.length > 0) {
    lines.push("## Issues");
    lines.push("");
    for (const issue of scorecard.issues) {
      lines.push(`- **${issue.severity}** \`${issue.code}\` (${issue.axis}): ${issue.detail}`);
    }
    lines.push("");
  }

  const surfaces = entries(metrics.responsibleSurfaceCounts);
  if (surfaces.length > 0) {
    lines.push("## Responsible Surfaces");
    lines.push("");
    for (const [surface, count] of surfaces) lines.push(`- ${surface}: ${count}`);
    lines.push("");
  }

  const failures = entries(metrics.failureModeCounts);
  if (failures.length > 0) {
    lines.push("## Failure Modes");
    lines.push("");
    for (const [mode, count] of failures) lines.push(`- ${mode}: ${count}`);
    lines.push("");
  }

  if (options.runs && options.runs.length > 0) {
    lines.push("## Run Summary");
    lines.push("");
    // summaryTable receives a copy of the runs array, restricted to holdout.
    lines.push(summaryTable([...options.runs], {
      comparator: options.comparator ?? scorecard.baselineId ?? void 0,
      split: "holdout"
    }).markdown);
    lines.push("");
  }

  if (options.traceAnalystFindings && options.traceAnalystFindings.length > 0) {
    lines.push("## TraceAnalyst Findings");
    lines.push("");
    for (const finding of options.traceAnalystFindings) lines.push(`- ${finding}`);
    lines.push("");
  }

  const nextActions = options.nextActions ?? defaultNextActions(scorecard);
  if (nextActions.length > 0) {
    lines.push("## Next Actions");
    lines.push("");
    for (const action of nextActions) lines.push(`- ${action}`);
    lines.push("");
  }

  // Normalise trailing whitespace to a single newline.
  return lines.join("\n").trimEnd() + "\n";
}
1206
/**
 * Default "Next Actions" for a scorecard: a single promote instruction when
 * the scorecard says promote, otherwise one "Resolve ..." line per issue
 * whose severity is "critical".
 *
 * @param {object} scorecard - Reads `promote` and `issues`.
 * @returns {string[]} Action sentences (possibly empty).
 */
function defaultNextActions(scorecard) {
  if (scorecard.promote) {
    return ["Promote the candidate and keep canaries enabled."];
  }
  const actions = [];
  for (const issue of scorecard.issues) {
    if (issue.severity === "critical") {
      actions.push(`Resolve ${issue.code}: ${issue.detail}`);
    }
  }
  return actions;
}
1210
/**
 * List the positive-count entries of a name -> count record, ordered by
 * count descending and then by name (locale compare) ascending.
 *
 * @param {Object<string, number>} values - Counts keyed by name.
 * @returns {Array<[string, number]>} Sorted `[name, count]` pairs.
 */
function entries(values) {
  const positive = [];
  for (const pair of Object.entries(values)) {
    if (pair[1] > 0) positive.push(pair);
  }
  positive.sort(([nameA, countA], [nameB, countB]) => {
    const byCount = countB - countA;
    return byCount || nameA.localeCompare(nameB);
  });
  return positive;
}
1213
/**
 * Format a fraction as a percentage with one decimal place.
 *
 * @param {number} value - Fraction (for example 0.5 for 50%).
 * @returns {string} Such as "50.0%", or "n/a" when the value is not finite.
 */
function pct(value) {
  if (!Number.isFinite(value)) return "n/a";
  const percent = value * 100;
  return `${percent.toFixed(1)}%`;
}
1216
/**
 * Format a number with three decimal places.
 *
 * @param {number} value - Value to format.
 * @returns {string} Fixed-point text, or "n/a" when the value is not finite.
 */
function num(value) {
  if (!Number.isFinite(value)) return "n/a";
  return value.toFixed(3);
}
1219
+
1220
+ // src/promotion-gate.ts
1221
/**
 * Percentile-bootstrap confidence interval for the difference in means
 * (candidate minus baseline), with a verdict derived from the interval.
 *
 * Each iteration resamples baseline and candidate independently (baseline
 * first, then candidate, from the same RNG) and records the difference of
 * the resampled means. The sorted differences give the interval bounds.
 *
 * Verdicts: "ADVANCE" when the lower bound is above zero, "REVERT" when the
 * upper bound is below zero, otherwise "KEEP" if the observed delta is >= 0
 * and "INCONCLUSIVE" if not. When either sample is empty, or the combined
 * size is below `minTotalSamples`, no resampling happens and the result is
 * "INCONCLUSIVE" with an infinite interval and `iterations: 0`.
 *
 * @param {number[]} baseline - Baseline scores.
 * @param {number[]} candidate - Candidate scores.
 * @param {object} [options] - `alpha` (default 0.05), `iterations` (default
 *   1000), `minTotalSamples` (default 6) and `seed` (defaults to a hash of
 *   both samples, making the result deterministic for given inputs).
 * @returns {{baselineMean: number, candidateMean: number, delta: number, ciLower: number, ciUpper: number, iterations: number, alpha: number, verdict: string}}
 */
function bootstrapCi(baseline, candidate, options = {}) {
  const alpha = options.alpha ?? 0.05;
  const iterations = options.iterations ?? 1e3;
  const minTotal = options.minTotalSamples ?? 6;
  const rng = mulberry32(options.seed ?? hashSeed(baseline, candidate));

  const baselineMean = mean2(baseline);
  const candidateMean = mean2(candidate);
  const delta = candidateMean - baselineMean;

  const tooFew = baseline.length + candidate.length < minTotal;
  const oneSideEmpty = baseline.length === 0 || candidate.length === 0;
  if (tooFew || oneSideEmpty) {
    return {
      baselineMean,
      candidateMean,
      delta,
      ciLower: -Infinity,
      ciUpper: Infinity,
      iterations: 0,
      alpha,
      verdict: "INCONCLUSIVE"
    };
  }

  const deltas = new Array(iterations);
  let round = 0;
  while (round < iterations) {
    // Draw order matters for reproducibility: baseline first, then candidate.
    const baselineDraw = mean2(resample(baseline, rng));
    const candidateDraw = mean2(resample(candidate, rng));
    deltas[round] = candidateDraw - baselineDraw;
    round += 1;
  }
  deltas.sort((x, y) => x - y);

  // Percentile indices, clamped to the valid range of the sorted array.
  const tail = alpha / 2;
  const lowerIdx = Math.max(0, Math.floor(tail * iterations));
  const upperIdx = Math.min(iterations - 1, Math.floor((1 - tail) * iterations) - 1);
  const ciLower = deltas[lowerIdx];
  const ciUpper = deltas[upperIdx];

  const verdict = ciLower > 0
    ? "ADVANCE"
    : ciUpper < 0
      ? "REVERT"
      : delta >= 0
        ? "KEEP"
        : "INCONCLUSIVE";

  return {
    baselineMean,
    candidateMean,
    delta,
    ciLower,
    ciUpper,
    iterations,
    alpha,
    verdict
  };
}
1268
/**
 * Arithmetic mean that returns 0 (not NaN) for an empty list.
 *
 * @param {number[]} xs - Values to average.
 * @returns {number} The mean, or 0 when `xs` is empty.
 */
function mean2(xs) {
  const count = xs.length;
  if (count === 0) return 0;
  const total = xs.reduce((running, value) => running + value, 0);
  return total / count;
}
1274
/**
 * Draw a bootstrap resample: `xs.length` picks with replacement.
 *
 * @param {number[]} xs - Source values.
 * @param {function(): number} rng - Returns a number in [0, 1); called
 *   once per drawn element, in order.
 * @returns {number[]} New array of the same length as `xs`.
 */
function resample(xs, rng) {
  const size = xs.length;
  return Array.from({ length: size }, () => {
    const pick = Math.floor(rng() * size);
    return xs[pick];
  });
}
1279
/**
 * Create a seeded mulberry32 pseudo-random generator.
 *
 * The internal state is wrapped to an unsigned 32-bit integer on every step.
 * Left unwrapped, the state grows as a double and passes 2^53 after roughly
 * 4.9 million draws, at which point adding the odd increment loses its low
 * bits and the output quality degrades. Output is identical to the unwrapped
 * form for every draw before that point.
 *
 * @param {number} seed - Seed; coerced to an unsigned 32-bit integer.
 * @returns {function(): number} Generator returning values in [0, 1).
 */
function mulberry32(seed) {
  let state = seed >>> 0;
  return () => {
    // Keep the state inside 32 bits so the arithmetic stays exact.
    state = (state + 1831565813) >>> 0;
    let r = state;
    r = Math.imul(r ^ (r >>> 15), r | 1);
    r ^= r + Math.imul(r ^ (r >>> 7), r | 61);
    return ((r ^ (r >>> 14)) >>> 0) / 4294967296;
  };
}
1289
/**
 * Derive a deterministic 32-bit seed from two numeric samples.
 *
 * Every value of `a`, then every value of `b`, is written as a 64-bit float
 * and its eight bytes (in platform byte order) are folded into an
 * FNV-1a-style hash: XOR the byte in, then multiply by 16777619 using 32-bit
 * integer multiplication.
 *
 * @param {Iterable<number>} a - First sample.
 * @param {Iterable<number>} b - Second sample.
 * @returns {number} Unsigned 32-bit hash.
 */
function hashSeed(a, b) {
  // One reusable scratch cell; its bytes are viewed through `octets`.
  const scratch = new Float64Array(1);
  const octets = new Uint8Array(scratch.buffer);
  let hash = 2166136261;

  const absorb = (values) => {
    for (const value of values) {
      scratch[0] = value;
      for (let i = 0; i < octets.length; i++) {
        hash = Math.imul(hash ^ octets[i], 16777619);
      }
    }
  };

  absorb(a);
  absorb(b);
  return hash >>> 0;
}
1301
/**
 * Score baseline and candidate outputs with a judge, then bootstrap a
 * confidence interval on the score difference.
 *
 * The baseline outputs are scored to completion before the candidate
 * outputs. Only the `alpha`, `iterations` and `seed` values that are
 * actually defined are forwarded to `bootstrapCi`.
 *
 * @param {object} args - `baselineOutputs`, `candidateOutputs`, `judge`,
 *   optional `judgeConcurrency` (default 4), `alpha`, `iterations`, `seed`.
 * @returns {Promise<object>} The `bootstrapCi` result plus `baselineSamples`
 *   and `candidateSamples` counts.
 */
async function judgeReplayGate(args) {
  const parallelism = args.judgeConcurrency ?? 4;
  const baselineScores = await scoreAll(args.baselineOutputs, args.judge, parallelism);
  const candidateScores = await scoreAll(args.candidateOutputs, args.judge, parallelism);

  // Forward only the options the caller set, so bootstrapCi defaults apply.
  const ciOptions = {};
  for (const key of ["alpha", "iterations", "seed"]) {
    if (args[key] !== void 0) ciOptions[key] = args[key];
  }

  const interval = bootstrapCi(baselineScores, candidateScores, ciOptions);
  return {
    ...interval,
    baselineSamples: baselineScores.length,
    candidateSamples: candidateScores.length
  };
}
1316
/**
 * Score every output with `judge`, running at most `concurrency` judge
 * calls at a time, and return the scores in input order.
 *
 * Worker count is clamped to the range 1..outputs.length. A missing, NaN or
 * sub-1 `concurrency` therefore falls back to a single worker instead of
 * scoring nothing, and a value above the number of outputs (including
 * Infinity) does not spawn idle workers. Judge results that are not finite
 * numbers are recorded as 0. A rejected judge call rejects the returned
 * promise.
 *
 * @param {Array} outputs - Items to score.
 * @param {function(*): (number|Promise<number>)} judge - Scoring function.
 * @param {number} concurrency - Desired number of parallel judge calls.
 * @returns {Promise<number[]>} One score per output, same order.
 */
async function scoreAll(outputs, judge, concurrency) {
  const results = new Array(outputs.length);
  const parsed = Number(concurrency);
  const requested = parsed >= 1 ? Math.floor(parsed) : 1; // NaN and <1 -> 1
  const workerCount = Math.min(outputs.length, requested);

  // Shared cursor: each worker claims the next unclaimed index.
  let next = 0;
  const worker = async () => {
    while (next < outputs.length) {
      const i = next++;
      const score = await judge(outputs[i]);
      results[i] = Number.isFinite(score) ? score : 0;
    }
  };

  await Promise.all(Array.from({ length: workerCount }, () => worker()));
  return results;
}
1330
+
1331
+ export {
1332
+ releaseTraceEvidenceFromMultiShotTrials,
1333
+ evaluateReleaseConfidence,
1334
+ assertReleaseConfidence,
1335
+ canonicalize,
1336
+ hashJson,
1337
+ signManifest,
1338
+ verifyManifest,
1339
+ evaluateHypothesis,
1340
+ summaryTable,
1341
+ paretoChart,
1342
+ gainHistogram,
1343
+ RESEARCH_REPORT_HARD_PAIR_FLOOR,
1344
+ researchReport,
1345
+ renderReleaseReport,
1346
+ bootstrapCi,
1347
+ judgeReplayGate
1348
+ };
1349
+ //# sourceMappingURL=chunk-3IX6QTB7.js.map