@tangle-network/agent-eval 0.20.12 → 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. package/CHANGELOG.md +177 -0
  2. package/README.md +43 -1
  3. package/dist/{chunk-KWUAAIHR.js → chunk-4W4NCYM2.js} +182 -1
  4. package/dist/chunk-4W4NCYM2.js.map +1 -0
  5. package/dist/{chunk-PKCVBYTQ.js → chunk-5IIQKMD5.js} +38 -2
  6. package/dist/chunk-5IIQKMD5.js.map +1 -0
  7. package/dist/{chunk-HNJLMAJ2.js → chunk-6KQG5HAH.js} +2 -2
  8. package/dist/chunk-6M774GY6.js +53 -0
  9. package/dist/chunk-6M774GY6.js.map +1 -0
  10. package/dist/{chunk-MCMV7DUL.js → chunk-ARZ6BEV6.js} +2 -2
  11. package/dist/chunk-IOXMGMHQ.js +1226 -0
  12. package/dist/chunk-IOXMGMHQ.js.map +1 -0
  13. package/dist/{chunk-75MCTH7P.js → chunk-KAO3Q65R.js} +198 -3
  14. package/dist/chunk-KAO3Q65R.js.map +1 -0
  15. package/dist/chunk-QUKKGHTZ.js +121 -0
  16. package/dist/chunk-QUKKGHTZ.js.map +1 -0
  17. package/dist/chunk-SQQLHODJ.js +163 -0
  18. package/dist/chunk-SQQLHODJ.js.map +1 -0
  19. package/dist/{chunk-IKFVX537.js → chunk-UAND2LOT.js} +232 -211
  20. package/dist/chunk-UAND2LOT.js.map +1 -0
  21. package/dist/{chunk-HKYRWNHV.js → chunk-USHQBPMH.js} +283 -7
  22. package/dist/chunk-USHQBPMH.js.map +1 -0
  23. package/dist/cli.js +3 -2
  24. package/dist/cli.js.map +1 -1
  25. package/dist/{control-C8NKbF3w.d.ts → control-cxwMOAsy.d.ts} +3 -2
  26. package/dist/control.d.ts +4 -3
  27. package/dist/control.js +2 -2
  28. package/dist/emitter-B2XqDKFU.d.ts +121 -0
  29. package/dist/{feedback-trajectory-BGQ_ANCN.d.ts → feedback-trajectory-CB0A32o3.d.ts} +2 -1
  30. package/dist/index.d.ts +16 -302
  31. package/dist/index.js +70 -62
  32. package/dist/index.js.map +1 -1
  33. package/dist/integrity-K2oVlF57.d.ts +210 -0
  34. package/dist/openapi.json +1 -1
  35. package/dist/optimization-UVDNKaO6.d.ts +574 -0
  36. package/dist/optimization.d.ts +7 -144
  37. package/dist/optimization.js +9 -2
  38. package/dist/reporting-B82RSv9C.d.ts +593 -0
  39. package/dist/reporting.d.ts +5 -426
  40. package/dist/reporting.js +17 -6
  41. package/dist/{emitter-BYO2nSDA.d.ts → store-u47QaJ9G.d.ts} +1 -91
  42. package/dist/{multi-shot-optimization-Bvtz294B.d.ts → summary-report-D4p7RlDu.d.ts} +381 -1
  43. package/dist/traces.d.ts +179 -3
  44. package/dist/traces.js +35 -4
  45. package/dist/wire/index.js +3 -2
  46. package/docs/research-report-methodology.md +170 -0
  47. package/docs/wire-protocol.md +1 -1
  48. package/package.json +11 -13
  49. package/dist/chunk-75MCTH7P.js.map +0 -1
  50. package/dist/chunk-HKYRWNHV.js.map +0 -1
  51. package/dist/chunk-IKFVX537.js.map +0 -1
  52. package/dist/chunk-KWUAAIHR.js.map +0 -1
  53. package/dist/chunk-ODFINDLQ.js +0 -413
  54. package/dist/chunk-ODFINDLQ.js.map +0 -1
  55. package/dist/chunk-PKCVBYTQ.js.map +0 -1
  56. /package/dist/{chunk-HNJLMAJ2.js.map → chunk-6KQG5HAH.js.map} +0 -0
  57. /package/dist/{chunk-MCMV7DUL.js.map → chunk-ARZ6BEV6.js.map} +0 -0
@@ -0,0 +1,1226 @@
1
+ import {
2
+ canonicalize,
3
+ hashJson
4
+ } from "./chunk-6M774GY6.js";
5
+
6
+ // src/statistics.ts
7
/**
 * Dimension names treated as "inverted" by the scoring code.
 * In this chunk the set is only consulted by `normalizeScores`.
 */
const INVERTED_DIMENSIONS = /* @__PURE__ */ new Set([
  "hallucination",
  "false_confidence",
  "worst_failure",
]);

/**
 * Returns a fresh array containing the same score entries, in the same order.
 *
 * Each entry is looked up in `INVERTED_DIMENSIONS`, but both outcomes hand the
 * entry back untouched, so the result is a shallow copy of `scores`.
 *
 * @param {Array<{dimension: string}>} scores - score entries to pass through
 * @returns {Array} new array holding the identical entry objects
 */
function normalizeScores(scores) {
  const passThrough = (entry) =>
    // Membership does not change the output; the lookup is kept so the
    // property access on each entry still happens.
    (INVERTED_DIMENSIONS.has(entry.dimension) ? entry : entry);
  return scores.map((entry) => passThrough(entry));
}
20
/**
 * Weighted arithmetic mean of `{ score, weight }` entries.
 *
 * @param {Array<{score: number, weight?: number}>} scores - entries to average;
 *   a null/undefined weight counts as 1
 * @returns {number} sum(score * weight) / sum(weight), or 0 when the list is
 *   empty or the total weight is not positive
 */
function weightedMean(scores) {
  if (scores.length === 0) return 0;
  let numerator = 0;
  let denominator = 0;
  for (const entry of scores) {
    const { score, weight } = entry;
    const effectiveWeight = weight ?? 1;
    numerator += score * effectiveWeight;
    denominator += effectiveWeight;
  }
  if (denominator > 0) {
    return numerator / denominator;
  }
  return 0;
}
31
/**
 * Percentile bootstrap interval for the mean of `scores`.
 *
 * Draws 1000 resamples (with replacement, via `Math.random`) of the same size
 * as the input, sorts the resampled means and reads the two tail positions.
 *
 * @param {number[]} scores - observed values
 * @param {number} [confidence=0.95] - coverage of the interval
 * @returns {{mean: number, lower: number, upper: number}} sample mean plus the
 *   bootstrap bounds; all zeros for empty input, and the single value repeated
 *   when only one observation is given
 */
function confidenceInterval(scores, confidence = 0.95) {
  const n = scores.length;
  if (n === 0) return { mean: 0, lower: 0, upper: 0 };
  if (n === 1) {
    const only = scores[0];
    return { mean: only, lower: only, upper: only };
  }

  const RESAMPLES = 1e3;
  const mean = scores.reduce((acc, value) => acc + value, 0) / n;

  const resampledMeans = [];
  while (resampledMeans.length < RESAMPLES) {
    let total = 0;
    for (let draw = 0; draw < n; draw++) {
      total += scores[Math.floor(Math.random() * n)];
    }
    resampledMeans.push(total / n);
  }
  resampledMeans.sort((x, y) => x - y);

  // Half of the excluded probability mass sits in each tail.
  const tail = (1 - confidence) / 2;
  const lowerIdx = Math.floor(tail * RESAMPLES);
  const upperIdx = Math.min(Math.floor((1 - tail) * RESAMPLES) - 1, RESAMPLES - 1);
  return {
    mean,
    lower: resampledMeans[lowerIdx],
    upper: resampledMeans[upperIdx],
  };
}
55
/**
 * Agreement coefficient across judges: 1 - observed / expected disagreement,
 * where disagreement is the mean squared difference between rating pairs.
 *
 * Scores are bucketed per dimension into rating groups. A new group is opened
 * for a dimension whenever its latest group already holds as many ratings as
 * there are judges. "Observed" uses pairs inside each group; "expected" uses
 * every pair across the pooled ratings of all groups with at least two values.
 *
 * @param {Array<Array<{dimension: string, score: number}>>} judgeScores - one
 *   list of scores per judge
 * @returns {number} 1 when there are fewer than two judges, nothing to compare,
 *   or no spread in the pooled ratings; otherwise 1 - observed / expected
 */
function interRaterReliability(judgeScores) {
  const judgeCount = judgeScores.length;
  if (judgeCount < 2) return 1;

  const groupsByDimension = /* @__PURE__ */ new Map();
  for (const judgeSet of judgeScores) {
    for (const entry of judgeSet) {
      let groups = groupsByDimension.get(entry.dimension);
      if (groups === undefined) {
        groups = [];
        groupsByDimension.set(entry.dimension, groups);
      }
      const latest = groups[groups.length - 1];
      if (latest === undefined || latest.length >= judgeCount) {
        groups.push([entry.score]);
      } else {
        latest.push(entry.score);
      }
    }
  }

  const pooled = [];
  let withinSum = 0;
  let withinPairs = 0;
  for (const groups of groupsByDimension.values()) {
    for (const ratings of groups) {
      if (ratings.length < 2) continue;
      for (const rating of ratings) pooled.push(rating);
      for (let left = 0; left < ratings.length; left++) {
        for (let right = left + 1; right < ratings.length; right++) {
          withinSum += (ratings[left] - ratings[right]) ** 2;
          withinPairs++;
        }
      }
    }
  }
  if (withinPairs === 0 || pooled.length < 2) return 1;
  const observed = withinSum / withinPairs;

  let acrossSum = 0;
  let acrossPairs = 0;
  for (let left = 0; left < pooled.length; left++) {
    for (let right = left + 1; right < pooled.length; right++) {
      acrossSum += (pooled[left] - pooled[right]) ** 2;
      acrossPairs++;
    }
  }
  const expected = acrossPairs > 0 ? acrossSum / acrossPairs : 0;
  if (expected === 0) return 1;
  return 1 - observed / expected;
}
96
/**
 * Mann-Whitney U test with a two-sided p-value from the normal approximation.
 *
 * Ties receive the average of the ranks they span. The standard deviation used
 * for the z-score is the untied formula sqrt(n1 * n2 * (n1 + n2 + 1) / 12), and
 * no continuity correction is applied.
 *
 * @param {number[]} a - first sample
 * @param {number[]} b - second sample
 * @returns {{u: number, p: number}} the smaller of the two U statistics and its
 *   p-value; `{ u: 0, p: 1 }` when either sample is empty; `{ u: NaN, p: NaN }`
 *   when either sample contains NaN
 */
function mannWhitneyU(a, b) {
  if (a.length === 0 || b.length === 0) return { u: 0, p: 1 };

  // NaN never equals itself, so a tie scan that starts by comparing an element
  // with itself would never advance past it. Report "undefined" results instead.
  const containsNaN = (xs) => xs.some((v) => Number.isNaN(v));
  if (containsNaN(a) || containsNaN(b)) return { u: NaN, p: NaN };

  const n1 = a.length;
  const n2 = b.length;
  const combined = [
    ...a.map((v) => ({ v, group: "a" })),
    ...b.map((v) => ({ v, group: "b" })),
  ].sort((x, y) => x.v - y.v);

  // Assign mid-ranks: every run of equal values shares the mean of its ranks.
  const ranks = new Array(combined.length);
  let i = 0;
  while (i < combined.length) {
    // A run always contains its first element, so start scanning after it;
    // this guarantees forward progress on every iteration.
    let j = i + 1;
    while (j < combined.length && combined[j].v === combined[i].v) j++;
    const avgRank = (i + 1 + j) / 2;
    for (let k = i; k < j; k++) ranks[k] = avgRank;
    i = j;
  }

  let r1 = 0;
  for (let k = 0; k < combined.length; k++) {
    if (combined[k].group === "a") r1 += ranks[k];
  }
  const u1 = r1 - (n1 * (n1 + 1)) / 2;
  const u2 = n1 * n2 - u1;
  const u = Math.min(u1, u2);

  const mu = (n1 * n2) / 2;
  const sigma = Math.sqrt((n1 * n2 * (n1 + n2 + 1)) / 12);
  if (sigma === 0) return { u, p: 1 };
  const z = Math.abs(u - mu) / sigma;
  const p = 2 * (1 - normalCdf(z));
  return { u, p };
}
127
/**
 * Fraction of `target` reached by `current`, clamped to the range [0, 1].
 *
 * @param {number} current - achieved amount
 * @param {number} target - amount required for full credit
 * @returns {number} 1 when `target` is zero or negative, otherwise
 *   `current / target` limited to [0, 1]
 */
function partialCredit(current, target) {
  if (target <= 0) return 1;
  const ratio = current / target;
  const atLeastZero = Math.max(0, ratio);
  return Math.min(1, atLeastZero);
}
131
/**
 * Paired t-test on the per-pair differences `after[i] - before[i]`.
 *
 * @param {number[]} before - baseline measurements
 * @param {number[]} after - follow-up measurements, aligned with `before`
 * @returns {{t: number, df: number, p: number}} t statistic, degrees of freedom
 *   (n - 1) and two-sided p-value. Fewer than two pairs yields
 *   `{ t: 0, df: 0, p: 1 }`. When every difference is identical the standard
 *   error is zero: the result is `t = 0, p = 1` for a zero difference and
 *   `t = ±Infinity, p = 0` otherwise, with the sign following the difference.
 * @throws {Error} when the two arrays differ in length
 */
function pairedTTest(before, after) {
  if (before.length !== after.length) {
    throw new Error(`pairedTTest: unequal sample sizes (${before.length} vs ${after.length})`);
  }
  const n = before.length;
  if (n < 2) return { t: 0, df: 0, p: 1 };

  const diffs = before.map((b, i) => after[i] - b);
  const mean = diffs.reduce((a, b) => a + b, 0) / n;
  // Unbiased sample variance of the differences.
  const variance = diffs.reduce((acc, d) => acc + (d - mean) ** 2, 0) / (n - 1);
  const se = Math.sqrt(variance / n);
  const df = n - 1;

  if (se === 0) {
    if (mean === 0) return { t: 0, df, p: 1 };
    // A constant non-zero shift is infinitely significant, but its direction
    // must still be visible: a uniform decrease is -Infinity, not +Infinity.
    return { t: mean > 0 ? Infinity : -Infinity, df, p: 0 };
  }

  const t = mean / se;
  const p = 2 * (1 - studentTCdf(Math.abs(t), df));
  return { t, df, p };
}
147
/**
 * Wilcoxon signed-rank test with a two-sided p-value from the normal
 * approximation.
 *
 * Zero differences are dropped. Absolute differences are ranked with tied
 * values sharing their average rank; the statistic is the sum of the ranks
 * belonging to positive differences. The variance uses the untied formula
 * n(n + 1)(2n + 1) / 24.
 *
 * @param {number[]} before - baseline measurements
 * @param {number[]} after - follow-up measurements, aligned with `before`
 * @returns {{w: number, p: number}} positive-rank sum and p-value;
 *   `{ w: 0, p: 1 }` when fewer than six non-zero differences remain;
 *   `{ w: NaN, p: NaN }` when six or more remain and any of them is NaN
 * @throws {Error} when the two arrays differ in length
 */
function wilcoxonSignedRank(before, after) {
  if (before.length !== after.length) {
    throw new Error(`wilcoxonSignedRank: unequal sample sizes (${before.length} vs ${after.length})`);
  }
  const diffs = before.map((b, idx) => after[idx] - b).filter((d) => d !== 0);
  const n = diffs.length;
  if (n < 6) return { w: 0, p: 1 };

  // NaN never equals itself, so a tie scan that starts by comparing an element
  // with itself would never advance past it. Report "undefined" results instead.
  if (diffs.some((d) => Number.isNaN(d))) return { w: NaN, p: NaN };

  const absRanks = diffs
    .map((d, idx) => ({ abs: Math.abs(d), sign: Math.sign(d), i: idx }))
    .sort((a, b) => a.abs - b.abs);

  // ranks[] is indexed by position in `diffs`, not by sorted position.
  const ranks = new Array(n);
  let i = 0;
  while (i < n) {
    // A run always contains its first element, so start scanning after it;
    // this guarantees forward progress on every iteration.
    let j = i + 1;
    while (j < n && absRanks[j].abs === absRanks[i].abs) j++;
    const avgRank = (i + 1 + j) / 2;
    for (let k = i; k < j; k++) ranks[absRanks[k].i] = avgRank;
    i = j;
  }

  let wPlus = 0;
  for (let k = 0; k < n; k++) {
    if (diffs[k] > 0) wPlus += ranks[k];
  }
  const mean = (n * (n + 1)) / 4;
  const variance = (n * (n + 1) * (2 * n + 1)) / 24;
  const z = (wPlus - mean) / Math.sqrt(variance);
  const p = 2 * (1 - normalCdf(Math.abs(z)));
  return { w: wPlus, p };
}
172
/**
 * Cohen's d effect size of `b` relative to `a`, using the pooled sample
 * standard deviation (each variance computed with an n - 1 denominator).
 *
 * @param {number[]} a - reference sample
 * @param {number[]} b - comparison sample
 * @returns {number} (mean(b) - mean(a)) / pooledSd; 0 when either sample has
 *   fewer than two values or the pooled deviation is zero
 */
function cohensD(a, b) {
  if (a.length < 2 || b.length < 2) return 0;

  const summarise = (xs) => {
    const mean = xs.reduce((x, y) => x + y, 0) / xs.length;
    const variance = xs.reduce((acc, x) => acc + (x - mean) ** 2, 0) / (xs.length - 1);
    return { mean, variance };
  };
  const first = summarise(a);
  const second = summarise(b);

  const weightedVariance = (a.length - 1) * first.variance + (b.length - 1) * second.variance;
  const pooled = Math.sqrt(weightedVariance / (a.length + b.length - 2));
  if (pooled === 0) return 0;
  return (second.mean - first.mean) / pooled;
}
184
/**
 * Cumulative distribution function of Student's t distribution.
 *
 * Evaluated through the regularized incomplete beta function at
 * x = df / (df + t^2) with shape parameters (df / 2, 1 / 2).
 *
 * @param {number} t - t statistic
 * @param {number} df - degrees of freedom
 * @returns {number} P(T <= t); 0.5 when `df` is not positive, and the normal
 *   CDF of `t` when `df` exceeds 100
 */
function studentTCdf(t, df) {
  if (df <= 0) return 0.5;
  if (df > 100) return normalCdf(t);
  const halfTail = 0.5 * incompleteBeta(df / (df + t * t), df / 2, 0.5);
  if (t >= 0) {
    return 1 - halfTail;
  }
  return halfTail;
}
193
/**
 * Regularized incomplete beta function I_x(a, b), evaluated with a continued
 * fraction (modified Lentz iteration, at most 200 steps, relative tolerance
 * 3e-7).
 *
 * The continued fraction converges quickly only for x < (a + 1) / (a + b + 2).
 * Beyond that point the identity I_x(a, b) = 1 - I_{1-x}(b, a) is used so the
 * fraction is always evaluated on its fast-converging side.
 *
 * @param {number} x - upper integration limit
 * @param {number} a - first shape parameter
 * @param {number} b - second shape parameter
 * @returns {number} I_x(a, b); 0 for x <= 0 and 1 for x >= 1
 */
function incompleteBeta(x, a, b) {
  if (x <= 0) return 0;
  if (x >= 1) return 1;

  const TINY = 1e-30;
  const maxIter = 200;
  const eps = 3e-7;
  // Keeps a Lentz denominator away from zero.
  const guard = (value) => (Math.abs(value) < TINY ? TINY : value);

  // Direct continued-fraction evaluation of I_xx(aa, bb).
  const direct = (xx, aa, bb) => {
    const lnBeta = lnGamma(aa) + lnGamma(bb) - lnGamma(aa + bb);
    const front = Math.exp(Math.log(xx) * aa + Math.log(1 - xx) * bb - lnBeta) / aa;
    let c = 1;
    let d = 1 / guard(1 - ((aa + bb) * xx) / (aa + 1));
    let f = d;
    for (let m = 1; m <= maxIter; m++) {
      const m2 = 2 * m;
      // Even-indexed term of the fraction.
      let num = (m * (bb - m) * xx) / ((aa + m2 - 1) * (aa + m2));
      d = guard(1 + num * d);
      c = guard(1 + num / c);
      d = 1 / d;
      f *= d * c;
      // Odd-indexed term of the fraction.
      num = -((aa + m) * (aa + bb + m) * xx) / ((aa + m2) * (aa + m2 + 1));
      d = guard(1 + num * d);
      c = guard(1 + num / c);
      d = 1 / d;
      const delta = d * c;
      f *= delta;
      if (Math.abs(delta - 1) < eps) break;
    }
    return front * f;
  };

  // Not recursive on purpose: a single swap is enough and cannot ping-pong
  // when x sits right on the threshold.
  if (x > (a + 1) / (a + b + 2)) {
    return 1 - direct(1 - x, b, a);
  }
  return direct(x, a, b);
}
226
/**
 * Natural logarithm of the gamma function, computed with a nine-coefficient
 * Lanczos series (g = 7). Arguments below 0.5 are mapped through the
 * reflection formula onto 1 - z.
 *
 * @param {number} z - argument
 * @returns {number} ln(Gamma(z))
 */
function lnGamma(z) {
  if (z < 0.5) {
    return Math.log(Math.PI / Math.sin(Math.PI * z)) - lnGamma(1 - z);
  }

  const G = 7;
  const LANCZOS = [
    0.9999999999998099,
    676.5203681218851,
    -1259.1392167224028,
    771.3234287776531,
    -176.6150291621406,
    12.507343278686905,
    -0.13857109526572012,
    9.984369578019572e-6,
    1.5056327351493116e-7,
  ];

  const shifted = z - 1;
  let series = LANCZOS[0];
  for (let i = 1; i < G + 2; i++) {
    series += LANCZOS[i] / (shifted + i);
  }
  const base = shifted + G + 0.5;
  const halfLogTwoPi = 0.5 * Math.log(2 * Math.PI);
  return halfLogTwoPi + (shifted + 0.5) * Math.log(base) - base + Math.log(series);
}
248
/**
 * Standard normal cumulative distribution function.
 *
 * Uses Phi(x) = (1 + erf(x / sqrt(2))) / 2 with the Abramowitz & Stegun
 * 7.1.26 polynomial approximation of erf. That approximation is defined in
 * terms of erf's own argument, so BOTH the rational term t = 1 / (1 + p * z)
 * and the exponential exp(-z^2) must use z = |x| / sqrt(2). Scaling only the
 * exponent overstates the CDF in the tails and understates p-values.
 *
 * @param {number} x - z-score
 * @returns {number} P(Z <= x) for a standard normal Z
 */
function normalCdf(x) {
  const a1 = 0.254829592;
  const a2 = -0.284496736;
  const a3 = 1.421413741;
  const a4 = -1.453152027;
  const a5 = 1.061405429;
  const p = 0.3275911;

  const sign = x < 0 ? -1 : 1;
  // erf is evaluated at |x| / sqrt(2); the sign is restored afterwards
  // because erf is an odd function.
  const z = Math.abs(x) / Math.SQRT2;
  const t = 1 / (1 + p * z);
  const poly = ((((a5 * t + a4) * t + a3) * t + a2) * t + a1) * t;
  const erfAbs = 1 - poly * Math.exp(-z * z);
  return 0.5 * (1 + sign * erfAbs);
}
261
+
262
+ // src/power-analysis.ts
263
/**
 * Sample size from the normal-approximation formula
 * n = 2 * ((z_alpha + z_power) / effect)^2, rounded up.
 *
 * @param {{effect: number, alpha?: number, power?: number, twoSided?: boolean}} opts
 *   `effect` is the effect size in the formula above; `alpha` defaults to
 *   0.05, `power` to 0.8 and `twoSided` to true
 * @returns {number} required n, or Infinity when `effect` is not a finite
 *   positive number
 */
function requiredSampleSize(opts) {
  const { effect } = opts;
  if (!Number.isFinite(effect) || effect <= 0) return Infinity;

  const alpha = opts.alpha ?? 0.05;
  const power = opts.power ?? 0.8;
  const twoSided = opts.twoSided ?? true;

  // A two-sided test splits alpha across both tails.
  const upperTail = twoSided ? 1 - alpha / 2 : 1 - alpha;
  const zAlpha = zQuantile(upperTail);
  const zBeta = zQuantile(power);
  const ratio = (zAlpha + zBeta) / effect;
  return Math.ceil(2 * ratio ** 2);
}
274
/**
 * Minimum detectable effect for a paired design, from the
 * normal-approximation formula (z_alpha + z_power) / sqrt(nPaired).
 *
 * @param {{nPaired: number, alpha?: number, power?: number, twoSided?: boolean}} opts
 *   `nPaired` is the number of pairs; `alpha` defaults to 0.05, `power` to 0.8
 *   and `twoSided` to true
 * @returns {number} detectable effect, or Infinity when `nPaired` is not a
 *   finite positive number
 */
function pairedMde(opts) {
  const { nPaired } = opts;
  if (!Number.isFinite(nPaired) || nPaired <= 0) return Infinity;

  const alpha = opts.alpha ?? 0.05;
  const power = opts.power ?? 0.8;
  const twoSided = opts.twoSided ?? true;

  // A two-sided test splits alpha across both tails.
  const upperTail = twoSided ? 1 - alpha / 2 : 1 - alpha;
  const zAlpha = zQuantile(upperTail);
  const zBeta = zQuantile(power);
  return (zAlpha + zBeta) / Math.sqrt(nPaired);
}
283
/**
 * Bonferroni correction: every p-value is multiplied by the number of tests
 * and capped at 1.
 *
 * @param {number[]} pValues - raw p-values
 * @param {number} [alpha=0.05] - significance threshold for adjusted values
 * @returns {{adjusted: number[], significant: boolean[]}} adjusted p-values
 *   and, per test, whether the adjusted value is strictly below `alpha`
 */
function bonferroni(pValues, alpha = 0.05) {
  const testCount = pValues.length;
  const capAtOne = (value) => Math.min(1, value);
  const adjusted = pValues.map((p) => capAtOne(p * testCount));
  return {
    adjusted,
    significant: adjusted.map((p) => p < alpha),
  };
}
289
/**
 * Benjamini-Hochberg adjustment of a set of p-values.
 *
 * Walking from the largest p-value down, each q-value is p * n / rank, forced
 * to be no larger than the q-value of the next-larger p and capped at 1.
 *
 * @param {number[]} pValues - raw p-values
 * @param {number} [fdr=0.05] - false-discovery-rate threshold
 * @returns {{qValues: number[], significant: boolean[]}} q-values in the input
 *   order and, per test, whether its q-value is strictly below `fdr`
 */
function benjaminiHochberg(pValues, fdr = 0.05) {
  const n = pValues.length;
  if (n === 0) return { qValues: [], significant: [] };

  const ascending = pValues
    .map((p, i) => ({ p, i }))
    .sort((left, right) => left.p - right.p);

  const qValues = new Array(n);
  let runningMin = 1;
  for (let rank = n; rank >= 1; rank--) {
    const { p, i } = ascending[rank - 1];
    // Enforce monotonicity: never exceed the q-value of a larger p.
    runningMin = Math.min(runningMin, (p * n) / rank);
    qValues[i] = Math.min(1, runningMin);
  }

  return {
    qValues,
    significant: qValues.map((q) => q < fdr),
  };
}
305
/**
 * Inverse of the standard normal CDF, computed with piecewise rational
 * approximations: one for the central region and one shared by both tails
 * (breakpoints at p = 0.02425 and p = 1 - 0.02425).
 *
 * @param {number} p - probability
 * @returns {number} z such that P(Z <= z) is approximately p; -Infinity for
 *   p = 0, Infinity for p = 1, NaN for any other value outside (0, 1)
 */
function zQuantile(p) {
  if (p <= 0 || p >= 1) {
    if (p === 0) return -Infinity;
    if (p === 1) return Infinity;
    return NaN;
  }

  const CENTRAL_NUM = [-39.69683028665376, 220.9460984245205, -275.9285104469687, 138.357751867269, -30.66479806614716, 2.506628277459239];
  const CENTRAL_DEN = [-54.47609879822406, 161.5858368580409, -155.6989798598866, 66.80131188771972, -13.28068155288572, 1];
  const TAIL_NUM = [-0.007784894002430293, -0.3223964580411365, -2.400758277161838, -2.549732539343734, 4.374664141464968, 2.938163982698783];
  const TAIL_DEN = [0.007784695709041462, 0.3224671290700398, 2.445134137142996, 3.754408661907416, 1];
  const P_LOW = 0.02425;
  const P_HIGH = 1 - P_LOW;

  // Horner evaluation, highest-order coefficient first.
  const horner = (coeffs, x) => coeffs.reduce((acc, coeff) => acc * x + coeff);
  const tail = (q) => horner(TAIL_NUM, q) / horner(TAIL_DEN, q);

  if (p < P_LOW) {
    return tail(Math.sqrt(-2 * Math.log(p)));
  }
  if (p <= P_HIGH) {
    const q = p - 0.5;
    const r = q * q;
    return (horner(CENTRAL_NUM, r) * q) / horner(CENTRAL_DEN, r);
  }
  // The upper tail mirrors the lower one.
  return -tail(Math.sqrt(-2 * Math.log(1 - p)));
}
331
+
332
+ // src/paired-stats.ts
333
/**
 * Percentile bootstrap interval for the paired differences
 * `after[i] - before[i]`.
 *
 * @param {number[]} before - baseline measurements
 * @param {number[]} after - follow-up measurements, aligned with `before`
 * @param {{confidence?: number, resamples?: number, statistic?: string, seed?: number}} [opts]
 *   `confidence` defaults to 0.95, `resamples` to 2000 and `statistic` to
 *   "median"; "mean" resamples the mean, any other value resamples the median.
 *   `seed` is forwarded to `makeRng`.
 * @returns {{n: number, median: number, mean: number, low: number, high: number, confidence: number, resamples: number}}
 *   observed median and mean of the differences plus the bootstrap bounds of
 *   the chosen statistic. With no pairs everything is 0; with a single pair
 *   every statistic equals that one difference.
 * @throws {Error} when the arrays differ in length or `confidence` is outside
 *   the open interval (0, 1)
 */
function pairedBootstrap(before, after, opts = {}) {
  if (before.length !== after.length) {
    throw new Error(`pairedBootstrap: unequal sample sizes (${before.length} vs ${after.length})`);
  }
  const confidence = opts.confidence ?? 0.95;
  const resamples = opts.resamples ?? 2e3;
  const statistic = opts.statistic ?? "median";
  if (confidence <= 0 || confidence >= 1) {
    throw new Error(`pairedBootstrap: confidence must be in (0,1), got ${confidence}`);
  }

  const n = before.length;
  const deltas = before.map((b, i) => after[i] - b);
  if (n === 0) {
    return { n: 0, median: 0, mean: 0, low: 0, high: 0, confidence, resamples };
  }
  if (n === 1) {
    const only = deltas[0];
    return { n: 1, median: only, mean: only, low: only, high: only, confidence, resamples };
  }

  const rng = makeRng(opts.seed);
  const drawDelta = () => deltas[Math.floor(rng() * n)];
  const resampleMean = () => {
    let total = 0;
    for (let k = 0; k < n; k++) total += drawDelta();
    return total / n;
  };
  const resampleMedian = () => {
    const picked = new Array(n);
    for (let k = 0; k < n; k++) picked[k] = drawDelta();
    return medianInPlace(picked);
  };
  const resampleOnce = statistic === "mean" ? resampleMean : resampleMedian;

  const samples = new Array(resamples);
  for (let b = 0; b < resamples; b++) {
    samples[b] = resampleOnce();
  }
  samples.sort((x, y) => x - y);

  const alpha = 1 - confidence;
  const lowIdx = Math.floor(alpha / 2 * resamples);
  const highIdx = Math.min(resamples - 1, Math.ceil((1 - alpha / 2) * resamples) - 1);
  return {
    n,
    // Copy first: medianInPlace sorts its argument.
    median: medianInPlace([...deltas]),
    mean: deltas.reduce((sum, x) => sum + x, 0) / n,
    low: samples[lowIdx],
    // Never report an upper bound that sits below the lower one.
    high: samples[Math.max(highIdx, lowIdx)],
    confidence,
    resamples,
  };
}
386
/**
 * Alias for `wilcoxonSignedRank`: forwards both arguments and returns its
 * result unchanged.
 *
 * @param {number[]} before - baseline measurements
 * @param {number[]} after - follow-up measurements
 * @returns {*} whatever `wilcoxonSignedRank(before, after)` returns
 */
function pairedWilcoxon(before, after) {
  const outcome = wilcoxonSignedRank(before, after);
  return outcome;
}
389
/**
 * Alias for `benjaminiHochberg`: forwards both arguments and returns its
 * result unchanged.
 *
 * @param {number[]} pValues - raw p-values
 * @param {number} [fdr=0.05] - false-discovery-rate threshold
 * @returns {*} whatever `benjaminiHochberg(pValues, fdr)` returns
 */
function bhAdjust(pValues, fdr = 0.05) {
  const adjusted = benjaminiHochberg(pValues, fdr);
  return adjusted;
}
392
/**
 * Median of a numeric array. The array is sorted ascending as a side effect,
 * so callers that need the original order must pass a copy.
 *
 * @param {number[]} xs - values; mutated (sorted) by this call
 * @returns {number} the middle value, the mean of the two middle values for an
 *   even count, or 0 for an empty array
 */
function medianInPlace(xs) {
  const count = xs.length;
  if (count === 0) return 0;
  xs.sort((left, right) => left - right);
  const upperMiddle = Math.floor(count / 2);
  if (count % 2 === 0) {
    return (xs[upperMiddle - 1] + xs[upperMiddle]) / 2;
  }
  return xs[upperMiddle];
}
398
/**
 * Builds the random source used by the bootstrap routines.
 *
 * @param {number} [seed] - when omitted, `Math.random` itself is returned.
 *   Otherwise the seed is truncated to a 32-bit integer; a truncated value of
 *   0 is replaced by 2654435769.
 * @returns {() => number} generator yielding values in [0, 1); seeded
 *   generators produce the same sequence for the same seed
 */
function makeRng(seed) {
  if (seed === undefined) return Math.random;

  const truncated = seed | 0;
  let state = truncated || 2654435769;

  return function nextValue() {
    // Advance the 32-bit state by a fixed odd increment, then scramble it.
    state = (state + 1831565813) | 0;
    const stirred = Math.imul(state ^ (state >>> 15), state | 1);
    const mixed = stirred ^ (stirred + Math.imul(stirred ^ (stirred >>> 7), stirred | 61));
    return ((mixed ^ (mixed >>> 14)) >>> 0) / 4294967296;
  };
}
409
+
410
+ // src/summary-report.ts
411
/**
 * Builds the per-candidate summary table for one split.
 *
 * Runs are kept only when their `splitTag` matches the split and the split's
 * score field holds a finite number. Each candidate gets a bootstrap interval
 * of its scores. When a comparator is named and present, every other candidate
 * additionally gets a Wilcoxon p-value (only with at least six matched pairs)
 * and Cohen's d against the comparator; the finite p-values are then replaced
 * by Benjamini-Hochberg q-values.
 *
 * @param {Array<object>} runs - run records; reads `splitTag`, `candidateId`
 *   and `outcome[scoreField]` here
 * @param {{split?: string, confidence?: number, fdr?: number, comparator?: string}} [opts]
 *   `split` defaults to "holdout" (any other value reads `searchScore`),
 *   `confidence` to 0.95, `fdr` to 0.05, `comparator` to null
 * @returns {{rows: Array<object>, comparator: (string|null), split: string, markdown: string}}
 *   rows sorted by candidate id, plus the rendered markdown table
 */
function summaryTable(runs, opts = {}) {
  const split = opts.split ?? "holdout";
  const confidence = opts.confidence ?? 0.95;
  const fdr = opts.fdr ?? 0.05;
  const comparator = opts.comparator ?? null;
  const scoreField = split === "holdout" ? "holdoutScore" : "searchScore";

  // candidateId -> { runs, scores } for the usable runs of this split.
  const byCandidate = /* @__PURE__ */ new Map();
  for (const run of runs) {
    if (run.splitTag !== split) continue;
    const score = run.outcome[scoreField];
    if (typeof score !== "number" || !Number.isFinite(score)) continue;
    let bucket = byCandidate.get(run.candidateId);
    if (bucket == null) {
      bucket = { runs: [], scores: [] };
      byCandidate.set(run.candidateId, bucket);
    }
    bucket.runs.push(run);
    bucket.scores.push(score);
  }

  const baseline = comparator ? byCandidate.get(comparator) : undefined;
  const draftRows = [...byCandidate.keys()].sort().map((id) => {
    const bucket = byCandidate.get(id);
    const ci = confidenceInterval(bucket.scores, confidence);
    let rawP = Number.NaN;
    let effect = Number.NaN;
    if (comparator && baseline && id !== comparator) {
      const paired = pairScoresByKey(bucket.runs, baseline.runs, scoreField);
      if (paired.before.length >= 6) {
        rawP = wilcoxonSignedRank(paired.before, paired.after).p;
      }
      effect = cohensD(baseline.scores, bucket.scores);
    }
    return {
      candidateId: id,
      n: bucket.scores.length,
      mean: ci.mean,
      ciLow: ci.lower,
      ciHigh: ci.upper,
      // Holds the raw p-value until the correction below replaces it.
      qValue: rawP,
      cohensD: effect,
      rawP,
    };
  });

  if (comparator) {
    const testable = draftRows.filter(
      (row) => row.candidateId !== comparator && Number.isFinite(row.rawP),
    );
    if (testable.length > 0) {
      const { qValues } = benjaminiHochberg(testable.map((row) => row.rawP), fdr);
      testable.forEach((row, k) => {
        row.qValue = qValues[k];
      });
    }
  }

  // rawP is scratch data and is not part of the published row.
  const rows = draftRows.map(({ rawP: _rawP, ...rest }) => rest);
  const markdown = renderSummaryTableMarkdown(rows, comparator, split);
  return { rows, comparator, split, markdown };
}
474
/**
 * Matches candidate runs to baseline runs that share the same
 * `experimentId` and `seed`.
 *
 * Only finite numeric scores take part. If several baseline runs share a key,
 * the last one wins. Candidate runs without a baseline match are dropped.
 *
 * @param {Array<object>} candidate - candidate runs
 * @param {Array<object>} baseline - baseline runs
 * @param {string} scoreField - property of `outcome` holding the score
 * @returns {{before: number[], after: number[]}} aligned baseline (`before`)
 *   and candidate (`after`) scores, in candidate order
 */
function pairScoresByKey(candidate, baseline, scoreField) {
  const keyOf = (run) => `${run.experimentId}::${run.seed}`;
  const isUsable = (value) => typeof value === "number" && Number.isFinite(value);

  const baselineByKey = /* @__PURE__ */ new Map();
  for (const run of baseline) {
    const score = run.outcome[scoreField];
    if (isUsable(score)) {
      baselineByKey.set(keyOf(run), score);
    }
  }

  const before = [];
  const after = [];
  for (const run of candidate) {
    const score = run.outcome[scoreField];
    if (!isUsable(score)) continue;
    const matched = baselineByKey.get(keyOf(run));
    if (matched === undefined) continue;
    before.push(matched);
    after.push(score);
  }
  return { before, after };
}
495
/**
 * Renders summary rows as a markdown table preceded by a title line.
 *
 * Numbers for the mean and interval go through `fmt`; q-values use four and
 * Cohen's d three decimals, with an em dash for non-finite values.
 *
 * @param {Array<object>} rows - rows with `candidateId`, `n`, `mean`, `ciLow`,
 *   `ciHigh`, `qValue` and `cohensD`
 * @param {(string|null)} comparator - comparator id shown in the title, if any
 * @param {string} split - split name shown in the title
 * @returns {string} newline-joined markdown
 */
function renderSummaryTableMarkdown(rows, comparator, split) {
  const fixedOrDash = (value, digits) => (Number.isFinite(value) ? value.toFixed(digits) : "\u2014");
  const versus = comparator ? ` (vs ${comparator})` : "";

  const lines = [
    `Summary Table \u2014 ${split} split${versus}`,
    "",
    "| Candidate | N | Mean | 95% CI | q (BH) | Cohen's d |",
    "|---|---:|---:|---|---:|---:|",
  ];
  for (const row of rows) {
    const interval = `[${fmt(row.ciLow)}, ${fmt(row.ciHigh)}]`;
    const q = fixedOrDash(row.qValue, 4);
    const d = fixedOrDash(row.cohensD, 3);
    lines.push(`| ${row.candidateId} | ${row.n} | ${fmt(row.mean)} | ${interval} | ${q} | ${d} |`);
  }
  return lines.join("\n");
}
510
/**
 * Builds the data for a cost-versus-quality Pareto chart.
 *
 * Runs are kept when their `splitTag` matches the split and the split's score
 * is a finite number. Each candidate becomes one point holding the average of
 * its `costUsd` values and of its scores. A point is on the frontier when no
 * other point dominates it.
 *
 * @param {Array<object>} runs - run records
 * @param {{split?: string, gateDecisions?: Object<string, object>}} [opts]
 *   `split` defaults to "holdout"; `gateDecisions` maps candidate ids to gate
 *   decisions that are turned into labels via `gateLabel`
 * @returns {{kind: string, split: string, axes: {x: string, y: string}, points: Array<object>}}
 *   chart description with one point per candidate
 */
function paretoChart(runs, opts = {}) {
  const split = opts.split ?? "holdout";
  const scoreField = split === "holdout" ? "holdoutScore" : "searchScore";

  const buckets = /* @__PURE__ */ new Map();
  for (const run of runs) {
    if (run.splitTag !== split) continue;
    const score = run.outcome[scoreField];
    if (typeof score !== "number" || !Number.isFinite(score)) continue;
    let bucket = buckets.get(run.candidateId);
    if (bucket == null) {
      bucket = { cost: [], quality: [] };
      buckets.set(run.candidateId, bucket);
    }
    bucket.cost.push(run.costUsd);
    bucket.quality.push(score);
  }

  // A candidate without a (truthy) decision gets no gate label at all.
  const gateFor = (candidateId) =>
    (opts.gateDecisions?.[candidateId] ? gateLabel(opts.gateDecisions[candidateId]) : undefined);

  const points = [];
  buckets.forEach((bucket, candidateId) => {
    points.push({
      candidateId,
      cost: avg(bucket.cost),
      quality: avg(bucket.quality),
      n: bucket.cost.length,
      onFrontier: false,
      gate: gateFor(candidateId),
    });
  });

  points.forEach((point) => {
    point.onFrontier = points.every((other) => other === point || !dominates(other, point));
  });

  return {
    kind: "pareto-cost-quality",
    split,
    axes: { x: "costUsd", y: "score" },
    points,
  };
}
544
/**
 * Pareto dominance on (cost, quality): `a` dominates `b` when it is no more
 * expensive and no worse in quality, and strictly better on at least one of
 * the two.
 *
 * @param {{cost: number, quality: number}} a - candidate dominator
 * @param {{cost: number, quality: number}} b - candidate being compared
 * @returns {boolean} true when `a` dominates `b`
 */
function dominates(a, b) {
  const noWorse = a.cost <= b.cost && a.quality >= b.quality;
  if (!noWorse) return false;
  return a.cost < b.cost || a.quality > b.quality;
}
547
/**
 * Maps a gate decision to its chart label.
 *
 * @param {{promote?: boolean, rejectionCode?: string}} d - gate decision
 * @returns {(string|null)} "promote" for a truthy `promote`; otherwise the
 *   label for a recognised `rejectionCode`, or null for any other code
 */
function gateLabel(d) {
  if (d.promote) return "promote";
  switch (d.rejectionCode) {
    case "few_runs":
      return "reject_few_runs";
    case "negative_delta":
      return "reject_negative_delta";
    case "overfit_gap":
      return "reject_overfit_gap";
    default:
      return null;
  }
}
554
/**
 * Histogram of paired gains (candidate score minus comparator score) for one
 * split, together with the median gain and its bootstrap interval.
 *
 * Bins are equal-width and symmetric around zero: they span
 * [-bound, +bound], where bound is the largest absolute gain (at least 1e-6).
 * Values landing outside the index range are clamped into the edge bins.
 *
 * @param {Array<object>} runs - run records
 * @param {string} candidateId - candidate whose gains are charted
 * @param {string} comparator - baseline candidate id
 * @param {{split?: string, bins?: number, confidence?: number, resamples?: number, seed?: number}} [opts]
 *   `split` defaults to "holdout", `bins` to 11, `confidence` to 0.95 and
 *   `resamples` to 2000; `seed` is forwarded to `pairedBootstrap`
 * @returns {object} chart description with `n`, `bins`, `median` and `ci`;
 *   with no matched pairs the bins are empty and the numbers are 0
 * @throws {Error} when `bins` is below 1
 */
function gainHistogram(runs, candidateId, comparator, opts = {}) {
  const split = opts.split ?? "holdout";
  const scoreField = split === "holdout" ? "holdoutScore" : "searchScore";
  const binCount = opts.bins ?? 11;
  if (binCount < 1) throw new Error("gainHistogram: bins must be \u2265 1");

  const runsOf = (id) => runs.filter((r) => r.candidateId === id && r.splitTag === split);
  const candidateRuns = runsOf(candidateId);
  const baselineRuns = runsOf(comparator);
  const { before, after } = pairScoresByKey(candidateRuns, baselineRuns, scoreField);

  const n = before.length;
  const header = { kind: "gain-distribution", candidateId, comparator, split };
  if (n === 0) {
    return { ...header, n: 0, bins: [], median: 0, ci: { low: 0, high: 0 } };
  }

  const deltas = before.map((b, i) => after[i] - b);
  const sortedDeltas = [...deltas].sort((x, y) => x - y);
  const median = medianOfSorted(sortedDeltas);

  const smallest = sortedDeltas[0];
  const largest = sortedDeltas[sortedDeltas.length - 1];
  // The floor keeps the bin width non-zero when every gain is 0.
  const bound = Math.max(Math.abs(smallest), Math.abs(largest), 1e-6);
  const lo = -bound;
  const hi = bound;
  const width = (hi - lo) / binCount;

  const bins = [];
  for (let i = 0; i < binCount; i++) {
    bins.push({ lo: lo + i * width, hi: lo + (i + 1) * width, count: 0 });
  }
  for (const delta of deltas) {
    const rawIdx = Math.floor((delta - lo) / width);
    // The maximum gain lands exactly on the upper edge; clamp it into the last bin.
    const idx = Math.min(Math.max(rawIdx, 0), binCount - 1);
    bins[idx].count += 1;
  }

  const ci = pairedBootstrap(before, after, {
    confidence: opts.confidence ?? 0.95,
    resamples: opts.resamples ?? 2e3,
    statistic: "median",
    seed: opts.seed,
  });
  return { ...header, n, bins, median, ci: { low: ci.low, high: ci.high } };
}
611
/**
 * Smallest value `researchReport` will accept for its `minPairs` threshold:
 * a lower caller-supplied value is raised to this number.
 */
const RESEARCH_REPORT_HARD_PAIR_FLOOR = 6;
612
/**
 * Summarises the paired gains of a candidate over a comparator for one split.
 *
 * Besides the mean, median and standard deviation of the gains it reports a
 * bootstrap interval of the median, the share of 2000 bootstrap means that are
 * above zero, the share that fall inside the optional ROPE, and a minimum
 * detectable effect scaled by the standard deviation of the gains.
 *
 * @param {Array<object>} runs - run records
 * @param {string} candidateId - candidate being assessed
 * @param {string} comparator - baseline candidate id
 * @param {{split: string, confidence: number, seed?: number, rope: ({low: number, high: number}|null), mdeAlpha: number, mdePower: number}} opts
 *   all fields are read as given; `rope` must be null to skip the ROPE share
 * @returns {(object|null)} the summary, or null when no pairs could be matched
 */
function pairedPosterior(runs, candidateId, comparator, opts) {
  const scoreField = opts.split === "holdout" ? "holdoutScore" : "searchScore";
  const runsOf = (id) => runs.filter((r) => r.candidateId === id && r.splitTag === opts.split);
  const candidateRuns = runsOf(candidateId);
  const baselineRuns = runsOf(comparator);
  const { before, after } = pairScoresByKey(candidateRuns, baselineRuns, scoreField);

  const n = before.length;
  if (n === 0) return null;

  const deltas = before.map((b, i) => after[i] - b);
  const meanDelta = deltas.reduce((sum, x) => sum + x, 0) / n;
  const medianDelta = medianOfSorted([...deltas].sort((x, y) => x - y));
  const sdDelta = stdev(deltas, meanDelta);

  const ci = pairedBootstrap(before, after, {
    confidence: opts.confidence,
    resamples: 2e3,
    statistic: "median",
    seed: opts.seed,
  });

  const meanSamples = bootstrapMeanSamples(deltas, 2e3, opts.seed);
  const hasSamples = meanSamples.length !== 0;
  const shareWhere = (predicate) => meanSamples.filter(predicate).length / meanSamples.length;

  const prGreaterThanZero = hasSamples ? shareWhere((s) => s > 0) : 0;
  let prInRope = null;
  if (opts.rope !== null && hasSamples) {
    prInRope = shareWhere((s) => s >= opts.rope.low && s <= opts.rope.high);
  }

  const dStandardised = pairedMde({ nPaired: n, alpha: opts.mdeAlpha, power: opts.mdePower });
  // With no spread in the gains the detectable effect is reported as 0.
  const mde = sdDelta === 0 ? 0 : dStandardised * sdDelta;

  return {
    n,
    meanDelta,
    medianDelta,
    sdDelta,
    ci: { low: ci.low, high: ci.high },
    prGreaterThanZero,
    prInRope,
    mde,
  };
}
646
/**
 * Bootstrap distribution of the mean: each sample is the mean of
 * `deltas.length` values drawn with replacement from `deltas`.
 *
 * @param {number[]} deltas - observed values
 * @param {number} resamples - number of bootstrap means to produce
 * @param {number} [seed] - forwarded to `seedRng`
 * @returns {number[]} the bootstrap means in draw order (unsorted); empty for
 *   empty input, and the single value repeated when there is one observation
 */
function bootstrapMeanSamples(deltas, resamples, seed) {
  const n = deltas.length;
  if (n === 0) return [];
  if (n === 1) return new Array(resamples).fill(deltas[0]);

  const rng = seedRng(seed);
  const resampledMean = () => {
    let total = 0;
    for (let draw = 0; draw < n; draw++) {
      total += deltas[Math.floor(rng() * n)];
    }
    return total / n;
  };

  const samples = new Array(resamples);
  for (let b = 0; b < resamples; b++) {
    samples[b] = resampledMean();
  }
  return samples;
}
659
/**
 * Builds the random source used for bootstrap means.
 *
 * @param {number} [seed] - when omitted, `Math.random` itself is returned.
 *   Otherwise the seed is converted to an unsigned 32-bit integer and used as
 *   the initial state as-is (a zero seed is not remapped).
 * @returns {() => number} generator yielding values in [0, 1); seeded
 *   generators produce the same sequence for the same seed
 */
function seedRng(seed) {
  if (seed === undefined) return Math.random;

  let state = seed >>> 0;
  const nextValue = () => {
    // Advance the 32-bit state by a fixed odd increment, then scramble it.
    state = (state + 1831565813) >>> 0;
    const stirred = Math.imul(state ^ (state >>> 15), state | 1);
    const mixed = stirred ^ (stirred + Math.imul(stirred ^ (stirred >>> 7), stirred | 61));
    return ((mixed ^ (mixed >>> 14)) >>> 0) / 4294967296;
  };
  return nextValue;
}
670
/**
 * Sample standard deviation (n - 1 denominator) around a supplied mean.
 *
 * @param {number[]} xs - values
 * @param {number} mean - centre to measure deviations from
 * @returns {number} the standard deviation, or 0 with fewer than two values
 */
function stdev(xs, mean) {
  const count = xs.length;
  if (count < 2) return 0;
  let squaredError = 0;
  for (const value of xs) {
    const deviation = value - mean;
    squaredError += deviation ** 2;
  }
  return Math.sqrt(squaredError / (count - 1));
}
676
/**
 * Assembles the full research report for a set of runs: summary table, Pareto
 * chart, per-candidate gain histograms and paired summaries, a decision per
 * candidate, recommendation, executive summary, methodology, a fingerprint of
 * the runs in the chosen split, and markdown/HTML renderings.
 *
 * Gain histograms and paired summaries are only produced when a comparator is
 * given. Candidates are ordered by decision weight (descending), then by mean
 * score (descending).
 *
 * @param {Array<object>} runs - run records
 * @param {object} [opts] - report options. Defaults: `split` "holdout",
 *   `comparator` null, `confidence` 0.95, `fdr` 0.05, `minPairs` 20 (never
 *   below `RESEARCH_REPORT_HARD_PAIR_FLOOR`), `rope` null, `mdePower` 0.8,
 *   `mdeAlpha` = `fdr`, `title` "Agent Evaluation Research Report",
 *   `generatedAt` = current time as an ISO string, `preregistrationHash` null.
 *   `candidateIds`, `gateDecisions`, `failureClusters` and `seed` are passed
 *   through to the helpers that use them.
 * @returns {Promise<object>} the report object, including `markdown` and `html`
 * @throws {Error} (as a rejection) when `rope` is given but its bounds are not
 *   finite or `low` exceeds `high`
 */
async function researchReport(runs, opts = {}) {
  const split = opts.split ?? "holdout";
  const comparator = opts.comparator ?? null;
  const confidence = opts.confidence ?? 0.95;
  const fdr = opts.fdr ?? 0.05;
  const minPairs = Math.max(opts.minPairs ?? 20, RESEARCH_REPORT_HARD_PAIR_FLOOR);
  const rope = opts.rope ?? null;
  const mdePower = opts.mdePower ?? 0.8;
  const mdeAlpha = opts.mdeAlpha ?? fdr;
  const title = opts.title ?? "Agent Evaluation Research Report";
  const generatedAt = opts.generatedAt ?? (/* @__PURE__ */ new Date()).toISOString();
  const preregistrationHash = opts.preregistrationHash ?? null;

  if (rope) {
    const wellFormed = Number.isFinite(rope.low) && Number.isFinite(rope.high) && rope.low <= rope.high;
    if (!wellFormed) {
      throw new Error(`researchReport: rope must satisfy low \u2264 high with finite bounds, got ${JSON.stringify(rope)}`);
    }
  }

  const summary = summaryTable(runs, {
    comparator: comparator ?? undefined,
    split,
    confidence,
    fdr,
  });
  const pareto = paretoChart(runs, { split, gateDecisions: opts.gateDecisions });

  // Default to every summarised candidate except the comparator itself.
  const candidateIds = opts.candidateIds
    ?? summary.rows.map((row) => row.candidateId).filter((id) => id !== comparator);

  let gains = [];
  if (comparator) {
    gains = candidateIds.map((id) => gainHistogram(runs, id, comparator, {
      split,
      confidence,
      seed: opts.seed,
    }));
  }

  const gainByCandidate = new Map(gains.map((gain) => [gain.candidateId, gain]));
  const paretoByCandidate = new Map(pareto.points.map((point) => [point.candidateId, point]));
  const posteriorByCandidate = /* @__PURE__ */ new Map();
  if (comparator) {
    for (const id of candidateIds) {
      const posterior = pairedPosterior(runs, id, comparator, {
        split,
        confidence,
        seed: opts.seed,
        rope,
        mdePower,
        mdeAlpha,
      });
      posteriorByCandidate.set(id, posterior);
    }
  }

  const describeCandidate = (row) => {
    const gain = gainByCandidate.get(row.candidateId);
    const point = paretoByCandidate.get(row.candidateId);
    const posterior = posteriorByCandidate.get(row.candidateId) ?? null;
    const classified = classifyCandidate(row, {
      comparator,
      posterior,
      point,
      fdr,
      minPairs,
      rope,
    });
    return {
      candidateId: row.candidateId,
      n: row.n,
      mean: row.mean,
      ciLow: row.ciLow,
      ciHigh: row.ciHigh,
      qValue: row.qValue,
      cohensD: row.cohensD,
      meanDeltaVsComparator: posterior ? posterior.meanDelta : null,
      pairedN: posterior?.n ?? gain?.n ?? 0,
      // The paired summary wins; the histogram is the fallback.
      medianGain: posterior ? posterior.medianDelta : gain ? gain.median : null,
      meanGain: posterior ? posterior.meanDelta : null,
      gainCi: posterior ? posterior.ci : gain ? gain.ci : null,
      prGreaterThanZero: posterior ? posterior.prGreaterThanZero : null,
      prInRope: posterior ? posterior.prInRope : null,
      mde: posterior ? posterior.mde : null,
      onParetoFrontier: point?.onFrontier ?? false,
      gate: point?.gate,
      decision: classified.decision,
      decisionReason: classified.reason,
    };
  };
  const byDecisionThenMean = (a, b) => {
    const decisionRank = decisionWeight(b.decision) - decisionWeight(a.decision);
    if (decisionRank !== 0) return decisionRank;
    return b.mean - a.mean;
  };
  const candidates = summary.rows.map((row) => describeCandidate(row)).sort(byDecisionThenMean);

  const recommendation = buildRecommendation(candidates, {
    comparator,
    failureClusters: opts.failureClusters,
    rope,
    minPairs,
    preregistrationHash,
  });
  const executiveSummary = buildExecutiveSummary(candidates, recommendation, {
    comparator,
    split,
    failureClusters: opts.failureClusters,
    preregistrationHash,
  });
  const methodology = buildMethodology({ split, comparator, fdr, minPairs, rope, confidence, mdePower, mdeAlpha });

  // Fingerprint the identity of the runs in this split, sorted by run id so
  // the hash does not depend on input order.
  const triples = runs
    .filter((run) => run.splitTag === split)
    .map((run) => ({ runId: run.runId, candidateId: run.candidateId, splitTag: run.splitTag }))
    .sort((a, b) => a.runId.localeCompare(b.runId));
  const runFingerprint = await hashJson(canonicalize({ triples, comparator, split }));

  const markdown = renderResearchMarkdown({
    title,
    generatedAt,
    split,
    comparator,
    rope,
    runFingerprint,
    preregistrationHash,
    executiveSummary,
    recommendation,
    candidates,
    summary,
    pareto,
    gains,
    methodology,
    failureClusters: opts.failureClusters,
  });
  const html = renderResearchHtml(markdown, title);

  return {
    kind: "agent-eval-research-report",
    title,
    generatedAt,
    split,
    comparator,
    runFingerprint,
    preregistrationHash,
    rope,
    executiveSummary,
    recommendation,
    candidates,
    summary,
    charts: { pareto, gains },
    methodology,
    failureClusters: opts.failureClusters,
    markdown,
    html,
  };
}
814
/**
 * Assemble the static "Methodology" section of the research report.
 *
 * @param {object} ctx - Analysis settings; reads `fdr`, `minPairs`, `confidence`,
 *   `rope`, `comparator`, `mdeAlpha` and `mdePower`.
 * @returns {{assumptions: string[], methods: string[], alternatives: string[],
 *   whenNotToApply: string[], citations: string[]}} Five lists of prose bullets.
 */
function buildMethodology(ctx) {
  const { fdr, minPairs, confidence, rope, comparator, mdeAlpha, mdePower } = ctx;

  // Optional caveats are only listed when they actually apply to this run.
  const ropeCaveat = rope
    ? [`The Region of Practical Equivalence ${formatRope(rope)} is supplied by the domain owner; equivalent verdicts are only meaningful if that range is treated as the standing definition of "no material difference."`]
    : [];
  const comparatorCaveat = comparator === null
    ? ["No comparator was configured; this run is descriptive, not causal."]
    : [];

  return {
    assumptions: [
      "Pairs are matched by (experimentId, seed); the candidate and comparator see the same scenarios in the same order.",
      "Paired deltas are exchangeable conditional on the matched scenario \u2014 no mid-run distribution shift.",
      `Decisions are pre-specified at fdr=${fdr}, minPairs=${minPairs}, confidence=${confidence}; deviating from these post-hoc invalidates the false-discovery control.`,
      ...ropeCaveat,
      ...comparatorCaveat
    ],
    methods: [
      "Marginal scores summarised with BH-FDR-adjusted Wilcoxon signed-rank q-values and Cohen's d via summaryTable.",
      "Paired evidence summarised with bootstrap CI on the median delta and Bayesian-bootstrap-style Pr(\u0394>0) and Pr(\u0394\u2208ROPE) on the mean delta.",
      `Minimum detectable effect reported per candidate at \u03B1=${mdeAlpha} (two-sided), power=${mdePower}, standardised by the observed paired-delta SD.`,
      "Pareto frontier flagged as a separate axis (cost vs quality); a candidate can be on-frontier without winning the paired test.",
      "Held-out gate decisions, when supplied, override the statistical verdict in the reject direction."
    ],
    alternatives: [
      "Paired t-test rejected: not robust to the heavy-tailed score distributions common in agent benchmarks.",
      "Unpaired Mann\u2013Whitney rejected: matched scenarios make pairing free; unpaired throws away that variance reduction.",
      "Sequential / always-valid inference (e-values, mSPRT) is the right tool for iterative sweeps and is out of scope for this single-look report \u2014 preregister and run once, or wrap this report in an alpha-spending schedule.",
      "Hierarchical Bayesian shrinkage across many candidates is future work; the current ranking uses raw paired statistics."
    ],
    whenNotToApply: [
      `Paired N below ${RESEARCH_REPORT_HARD_PAIR_FLOOR} on any candidate \u2014 the bootstrap CI is degenerate.`,
      "Comparator chosen post-hoc by inspecting the same data; q-values are no longer false-discovery-controlled.",
      "Scenarios not drawn under a stable preregistered protocol; the report can describe the data but cannot anchor a launch decision.",
      "Score distributions with mid-run shift (judge model swap, rubric change, infra outage) \u2014 pair exchangeability is violated."
    ],
    citations: [
      "Benjamini, Y. & Hochberg, Y. (1995). Controlling the false discovery rate: a practical and powerful approach to multiple testing. JRSS B, 57(1), 289\u2013300.",
      "Wilcoxon, F. (1945). Individual comparisons by ranking methods. Biometrics Bulletin, 1(6), 80\u201383.",
      "Efron, B. (1979). Bootstrap methods: another look at the jackknife. Annals of Statistics, 7(1), 1\u201326.",
      "Rubin, D. B. (1981). The Bayesian bootstrap. Annals of Statistics, 9(1), 130\u2013134.",
      "Kruschke, J. K. (2018). Rejecting or accepting parameter values in Bayesian estimation. Advances in Methods and Practices in Psychological Science, 1(2), 270\u2013280. (ROPE.)"
    ]
  };
}
854
/**
 * Render a ROPE interval as "[low, high]" using the shared number formatter.
 *
 * @param {{low: number, high: number}} rope - Interval bounds.
 * @returns {string} Bracketed, comma-separated bounds.
 */
function formatRope(rope) {
  const bounds = [rope.low, rope.high].map((edge) => fmt(edge));
  return `[${bounds.join(", ")}]`;
}
857
/**
 * Assign a decision label and a human-readable reason to one summary row.
 *
 * Checks run in a fixed order and the first match wins:
 *   1. the comparator itself          -> "hold"
 *   2. no comparator configured       -> "hold" (on frontier) / "needs_more_data"
 *   3. held-out gate not "promote"    -> "reject"
 *   4. paired N below the hard floor  -> "needs_more_data"
 *   5. CI fully inside the ROPE       -> "equivalent"
 *   6. CI entirely below zero         -> "reject"
 *   7. paired N below `minPairs`      -> "needs_more_data"
 *   8. q <= fdr and CI above zero     -> "promote"
 *   9. everything else                -> "hold"
 *
 * @param {object} row - Summary row; reads `candidateId` and `qValue`.
 * @param {object} ctx - Reads `comparator`, `posterior` (`n`, `ci`, `mde`,
 *   `prGreaterThanZero`), `point` (`onFrontier`, `gate`), `fdr`, `minPairs`, `rope`.
 * @returns {{decision: string, reason: string}} Verdict plus its justification.
 */
function classifyCandidate(row, ctx) {
  if (ctx.comparator && row.candidateId === ctx.comparator) {
    return { decision: "hold", reason: "Comparator baseline." };
  }
  if (!ctx.comparator) {
    return {
      decision: ctx.point?.onFrontier ? "hold" : "needs_more_data",
      reason: "No comparator configured; report ranks candidates but cannot anchor a promotion call."
    };
  }
  // Any gate verdict other than "promote" vetoes the candidate outright.
  if (ctx.point?.gate && ctx.point.gate !== "promote") {
    return { decision: "reject", reason: `Held-out gate returned ${ctx.point.gate}.` };
  }
  if (!ctx.posterior || ctx.posterior.n < RESEARCH_REPORT_HARD_PAIR_FLOOR) {
    return {
      decision: "needs_more_data",
      reason: `Only ${ctx.posterior?.n ?? 0} paired observations; below hard floor of ${RESEARCH_REPORT_HARD_PAIR_FLOOR} for any paired inference.`
    };
  }
  const ci = ctx.posterior.ci;
  const ciText = `[${fmt(ci.low)}, ${fmt(ci.high)}]`;
  if (ctx.rope && ci.low >= ctx.rope.low && ci.high <= ctx.rope.high) {
    return {
      decision: "equivalent",
      reason: `Paired-delta CI ${ciText} is fully inside ROPE ${formatRope(ctx.rope)}; candidate is practically equivalent to comparator.`
    };
  }
  const significant = Number.isFinite(row.qValue) && row.qValue <= ctx.fdr;
  const gainPositive = ci.low > 0;
  const gainNegative = ci.high < 0;
  if (gainNegative) {
    return { decision: "reject", reason: `Paired-delta CI ${ciText} lies entirely below zero.` };
  }
  if (ctx.posterior.n < ctx.minPairs) {
    return {
      decision: "needs_more_data",
      reason: `Only ${ctx.posterior.n} paired observations; minimum detectable effect at this N is ${fmt(ctx.posterior.mde)} score units (need \u2265 ${ctx.minPairs} pairs to issue a directional verdict).`
    };
  }
  if (significant && gainPositive) {
    return {
      decision: "promote",
      reason: `BH-adjusted q=${fmt(row.qValue)} \u2264 ${ctx.fdr} and paired-delta CI ${ciText} excludes zero; Pr(\u0394>0)=${fmt(ctx.posterior.prGreaterThanZero)}.`
    };
  }
  if (gainPositive) {
    // The CI clears zero but the q-value misses the FDR threshold; saying the
    // CI "crosses zero" here would contradict the numbers printed beside it.
    return {
      decision: "hold",
      reason: `Paired-delta CI ${ciText} excludes zero (Pr(\u0394>0)=${fmt(ctx.posterior.prGreaterThanZero)}) but BH-adjusted q=${fmt(row.qValue)} exceeds fdr=${ctx.fdr}; effect not decisive.`
    };
  }
  return {
    decision: "hold",
    reason: `Pr(\u0394>0)=${fmt(ctx.posterior.prGreaterThanZero)} but CI ${ciText} crosses zero; effect not decisive at fdr=${ctx.fdr}.`
  };
}
906
/**
 * Derive the report-level recommendation from the classified candidates.
 *
 * The headline candidate is the first "promote", else the first "equivalent",
 * else the first non-comparator entry in the order given. The headline
 * decision is "promote" if any candidate was promoted; otherwise the first
 * that applies of "needs_more_data", "equivalent", "hold", and finally "reject".
 *
 * @param {object[]} candidates - Classified candidates; reads `candidateId`,
 *   `decision`, `decisionReason`, `gainCi`, `prGreaterThanZero`, `mde`, `pairedN`.
 * @param {object} ctx - Reads `comparator`, `failureClusters`, `rope`,
 *   `minPairs`, `preregistrationHash`.
 * @returns {{decision: string, candidateId: (string|null), rationale: string[],
 *   risks: string[], nextActions: string[]}} Recommendation record.
 */
function buildRecommendation(candidates, ctx) {
  const nonComparator = candidates.filter((c) => c.candidateId !== ctx.comparator);
  const bestPromote = nonComparator.find((c) => c.decision === "promote");
  const bestEquivalent = nonComparator.find((c) => c.decision === "equivalent");
  const chosen = bestPromote ?? bestEquivalent ?? nonComparator[0] ?? null;
  const decision = bestPromote ? "promote" : nonComparator.some((c) => c.decision === "needs_more_data") ? "needs_more_data" : bestEquivalent ? "equivalent" : nonComparator.some((c) => c.decision === "hold") ? "hold" : "reject";
  const rationale = [];
  const risks = [];
  const nextActions = [];
  if (chosen) {
    rationale.push(`${chosen.candidateId}: ${chosen.decisionReason}`);
    if (chosen.gainCi) {
      const probSummary = chosen.prGreaterThanZero !== null ? `, Pr(\u0394>0)=${fmt(chosen.prGreaterThanZero)}` : "";
      rationale.push(`Median paired gain CI: [${fmt(chosen.gainCi.low)}, ${fmt(chosen.gainCi.high)}]${probSummary}.`);
    }
    if (chosen.mde !== null && Number.isFinite(chosen.mde)) {
      rationale.push(`MDE at current paired N=${chosen.pairedN}: ${fmt(chosen.mde)} score units.`);
    }
  }
  if (!ctx.comparator) {
    risks.push("No comparator was configured; verdict is descriptive, not causal.");
    nextActions.push("Re-run with a stable comparator candidate for paired inference.");
  }
  if (!ctx.preregistrationHash) {
    risks.push("No preregistration hash supplied; readers cannot verify the analysis was specified before data inspection.");
    nextActions.push("Sign a HypothesisManifest before the next sweep and pass `preregistrationHash` so the report cites it.");
  }
  // Truthiness (not `=== null`) so an omitted/undefined ROPE is flagged too,
  // matching how the other report builders test for a configured ROPE.
  if (!ctx.rope && nonComparator.length > 0) {
    risks.push('No ROPE configured; the report cannot distinguish "equivalent" from "inconclusive".');
    nextActions.push("Define a domain-specific Region of Practical Equivalence and pass it to lock in the equivalence threshold.");
  }
  const inconclusive = nonComparator.filter((c) => c.decision === "needs_more_data");
  if (inconclusive.length > 0) {
    const worst = inconclusive.reduce((a, b) => b.pairedN < a.pairedN ? b : a);
    // A candidate can be inconclusive while already holding >= minPairs pairs
    // (e.g. no comparator configured); never ask for zero or negative runs.
    const shortfall = Math.max(1, ctx.minPairs - worst.pairedN);
    risks.push(`${inconclusive.length} candidate(s) below soft floor (${ctx.minPairs} pairs); thinnest is ${worst.candidateId} with ${worst.pairedN}.`);
    nextActions.push(`Collect at least ${shortfall} more matched holdout runs for ${worst.candidateId}.`);
  }
  const rejected = nonComparator.filter((c) => c.decision === "reject");
  if (rejected.length > 0) {
    risks.push(`${rejected.length} candidate(s) failed the paired test or held-out gate; do not ship those variants.`);
  }
  if (ctx.failureClusters && ctx.failureClusters.clusters.length > 0) {
    const top = ctx.failureClusters.clusters[0];
    risks.push(`Top failure cluster: ${top.failureClass} across ${top.runCount} run(s).`);
    nextActions.push("Prioritize the largest failure cluster before broad rollout.");
  }
  if (decision === "promote") {
    nextActions.push("Ship behind the existing promotion gate and monitor canaries.");
  } else if (decision === "hold") {
    nextActions.push("Keep current production candidate while expanding holdout evidence.");
  } else if (decision === "equivalent") {
    nextActions.push("Either keep the comparator (no quality regression) or promote on cost/latency grounds \u2014 equivalence does not justify either; the choice is a product decision, not a stats one.");
  } else if (decision === "reject") {
    nextActions.push("Do not promote this sweep; inspect failures and generate a revised candidate.");
  }
  return {
    decision,
    candidateId: chosen?.candidateId ?? null,
    rationale,
    risks,
    nextActions
  };
}
969
/**
 * Produce the bullet lines for the report's "Executive Summary" section.
 *
 * @param {object[]} candidates - Classified candidates; reads `candidateId`,
 *   `decision` and `onParetoFrontier`.
 * @param {object} recommendation - Reads `decision` and `candidateId`.
 * @param {object} ctx - Reads `comparator`, `split`, `failureClusters`
 *   and `preregistrationHash`.
 * @returns {string[]} Summary sentences in display order.
 */
function buildExecutiveSummary(candidates, recommendation, ctx) {
  const { comparator, split, failureClusters, preregistrationHash } = ctx;
  const challengers = candidates.filter((entry) => entry.candidateId !== comparator);

  // Single pass over the challengers to tally how many landed on each verdict.
  const tally = new Map();
  for (const entry of challengers) {
    tally.set(entry.decision, (tally.get(entry.decision) ?? 0) + 1);
  }
  const countOf = (verdict) => tally.get(verdict) ?? 0;

  const versus = comparator ? ` against ${comparator}` : "";
  const target = recommendation.candidateId ? ` ${recommendation.candidateId}` : "";
  const summary = [
    `Evaluated ${challengers.length} candidate(s) on the ${split} split${versus}.`,
    `Recommendation: ${recommendation.decision}${target}.`,
    `Decision mix: ${countOf("promote")} promote, ${countOf("equivalent")} equivalent, ${countOf("hold")} hold, ${countOf("reject")} reject, ${countOf("needs_more_data")} need more data.`
  ];

  const frontierIds = [];
  for (const entry of challengers) {
    if (entry.onParetoFrontier) frontierIds.push(entry.candidateId);
  }
  if (frontierIds.length > 0) {
    summary.push(`Pareto-frontier candidates: ${frontierIds.join(", ")}.`);
  }

  if (failureClusters) {
    summary.push(`Failure clustering found ${failureClusters.totalFailures}/${failureClusters.totalRuns} failed runs across ${failureClusters.clusters.length} reportable cluster(s).`);
  }

  if (preregistrationHash) {
    // Only the first 12 characters of the hash are shown in the summary line.
    summary.push(`Preregistered analysis: ${preregistrationHash.slice(0, 12)}\u2026`);
  } else {
    summary.push("Analysis is post-hoc \u2014 no preregistration hash supplied.");
  }
  return summary;
}
988
/**
 * Render the full research report as a Markdown document.
 *
 * Values interpolated into table cells are coerced to strings, have line
 * breaks collapsed to a single space and have pipes escaped, so a multi-line
 * error message or an id containing "|" cannot break the table layout. A
 * missing (null/undefined) cell value renders as "-".
 *
 * @param {object} report - Reads `title`, `generatedAt`, `split`, `comparator`,
 *   `rope`, `runFingerprint`, `preregistrationHash`, `executiveSummary`,
 *   `recommendation`, `candidates`, `summary.markdown`, `methodology`,
 *   `pareto`, `gains` and the optional `failureClusters`.
 * @returns {string} Markdown text with lines joined by "\n".
 */
function renderResearchMarkdown(report) {
  const lines = [];
  // Sanitise one table cell: stringify, flatten newlines, escape pipes.
  const tableCell = (value) => escapePipes(String(value ?? "-").replace(/\s*\r?\n\s*/g, " "));
  // Emit a heading, a blank line, one bullet per item and a trailing blank line.
  const bulletSection = (heading, items) => {
    lines.push(heading);
    lines.push("");
    for (const item of items) lines.push(`- ${item}`);
    lines.push("");
  };

  lines.push(`# ${report.title}`);
  lines.push("");
  lines.push(`**Generated:** ${report.generatedAt}`);
  lines.push(`**Primary split:** ${report.split}`);
  lines.push(`**Comparator:** ${report.comparator ?? "not configured"}`);
  lines.push(`**ROPE:** ${report.rope ? formatRope(report.rope) : "not configured"}`);
  lines.push(`**Run fingerprint:** \`${report.runFingerprint}\``);
  lines.push(`**Preregistration:** ${report.preregistrationHash ? `\`${report.preregistrationHash}\`` : "none"}`);
  lines.push("");
  bulletSection("## Executive Summary", report.executiveSummary);

  lines.push("## Recommendation");
  lines.push("");
  lines.push(`**Decision:** ${report.recommendation.decision}`);
  lines.push(`**Candidate:** ${report.recommendation.candidateId ?? "N/A"}`);
  lines.push("");
  bulletSection("### Rationale", report.recommendation.rationale);
  bulletSection(
    "### Risks",
    report.recommendation.risks.length ? report.recommendation.risks : ["No material report-level risks detected."]
  );
  bulletSection("### Next Actions", report.recommendation.nextActions);

  lines.push("## Candidate Decision Table");
  lines.push("");
  lines.push("| Candidate | Decision | Mean | \u0394\u0304 | Pr(\u0394>0) | q | d | Paired N | Median Gain CI | MDE | Pareto | Gate |");
  lines.push("|---|---|---:|---:|---:|---:|---:|---:|---|---:|---|---|");
  for (const c of report.candidates) {
    const delta = c.meanDeltaVsComparator === null ? "-" : signed(c.meanDeltaVsComparator);
    const prGt = c.prGreaterThanZero === null ? "-" : c.prGreaterThanZero.toFixed(3);
    const q = Number.isFinite(c.qValue) ? c.qValue.toFixed(4) : "-";
    const d = Number.isFinite(c.cohensD) ? c.cohensD.toFixed(3) : "-";
    const gain = c.gainCi ? `[${fmt(c.gainCi.low)}, ${fmt(c.gainCi.high)}]` : "-";
    const mde = c.mde === null || !Number.isFinite(c.mde) ? "-" : fmt(c.mde);
    lines.push(`| ${tableCell(c.candidateId)} | ${c.decision} | ${fmt(c.mean)} | ${delta} | ${prGt} | ${q} | ${d} | ${c.pairedN} | ${gain} | ${mde} | ${c.onParetoFrontier ? "yes" : "no"} | ${tableCell(c.gate)} |`);
  }
  lines.push("");
  lines.push("## Statistical Summary");
  lines.push("");
  lines.push(report.summary.markdown);
  lines.push("");
  lines.push("## Methodology");
  lines.push("");
  bulletSection("### Assumptions", report.methodology.assumptions);
  bulletSection("### Methods", report.methodology.methods);
  bulletSection("### Alternatives Considered", report.methodology.alternatives);
  bulletSection("### When NOT To Apply", report.methodology.whenNotToApply);
  bulletSection("### Citations", report.methodology.citations);

  lines.push("## Chart Specs");
  lines.push("");
  lines.push("The report carries JSON chart specs for Pareto cost/quality and paired gain histograms.");
  lines.push("");
  lines.push("```json");
  lines.push(JSON.stringify({ pareto: report.pareto, gains: report.gains }, null, 2));
  lines.push("```");
  if (report.failureClusters) {
    lines.push("");
    lines.push("## Failure Clusters");
    lines.push("");
    lines.push("| Failure Class | Runs | Scenarios | Tool | Example |");
    lines.push("|---|---:|---:|---|---|");
    // Only the first ten clusters are listed.
    for (const c of report.failureClusters.clusters.slice(0, 10)) {
      lines.push(`| ${tableCell(c.failureClass)} | ${c.runCount} | ${c.scenarioIds.length} | ${tableCell(c.toolName)} | ${tableCell(c.exampleError ?? c.exampleRunId)} |`);
    }
  }
  return lines.join("\n");
}
1081
/**
 * Wrap the Markdown report in a standalone HTML page with inline styles.
 *
 * @param {string} markdown - Report Markdown; converted via `markdownToHtml`.
 * @param {string} title - Page title; HTML-escaped before insertion.
 * @returns {string} Complete HTML document, one element per line.
 */
function renderResearchHtml(markdown, title) {
  const content = markdownToHtml(markdown);
  const stylesheet = [
    'body{font-family:Inter,ui-sans-serif,system-ui,-apple-system,BlinkMacSystemFont,"Segoe UI",sans-serif;margin:0;color:#172026;background:#f7f8f8;}',
    "main{max-width:1080px;margin:0 auto;padding:40px 24px 64px;background:#fff;min-height:100vh;}",
    "h1{font-size:34px;line-height:1.15;margin:0 0 20px;}h2{margin-top:34px;border-top:1px solid #d9dfdf;padding-top:22px;}h3{margin-top:22px;}",
    "p,li{line-height:1.55;}table{border-collapse:collapse;width:100%;margin:16px 0;font-size:14px;}th,td{border:1px solid #d9dfdf;padding:8px;text-align:left;}th{background:#eef2f2;}",
    "code,pre{font-family:ui-monospace,SFMono-Regular,Menlo,monospace;}pre{overflow:auto;background:#111827;color:#f9fafb;padding:16px;border-radius:6px;}"
  ];
  const headSection = [
    "<head>",
    '<meta charset="utf-8">',
    '<meta name="viewport" content="width=device-width, initial-scale=1">',
    `<title>${escapeHtml(title)}</title>`,
    "<style>",
    ...stylesheet,
    "</style>",
    "</head>"
  ];
  const page = [
    "<!doctype html>",
    '<html lang="en">',
    ...headSection,
    "<body><main>",
    content,
    "</main></body></html>"
  ];
  return page.join("\n");
}
1103
/**
 * Convert the subset of Markdown used by the research report into HTML.
 *
 * Handled line by line: fenced code blocks (```), pipe tables (lines starting
 * with "|"), "- " bullet lists, "#"/"##"/"###" headings, blank lines and plain
 * paragraphs. Anything else Markdown offers is emitted as paragraph text.
 *
 * A code fence that is never closed is still emitted at the end of the
 * document instead of having its buffered lines silently dropped.
 *
 * @param {string} markdown - Markdown source, lines separated by "\n".
 * @returns {string} HTML fragment, elements joined by "\n".
 */
function markdownToHtml(markdown) {
  const lines = markdown.split("\n");
  const html = [];
  let inList = false;
  let inCode = false;
  let code = [];
  let table = [];
  // Close the currently open <ul>, if any.
  const flushList = () => {
    if (inList) {
      html.push("</ul>");
      inList = false;
    }
  };
  // Render and reset the buffered table rows, if any.
  const flushTable = () => {
    if (table.length === 0) return;
    html.push(renderMarkdownTable(table));
    table = [];
  };
  // Emit the buffered code lines as one <pre><code> block and leave code mode.
  const flushCode = () => {
    html.push(`<pre><code>${escapeHtml(code.join("\n"))}</code></pre>`);
    code = [];
    inCode = false;
  };
  for (const line of lines) {
    if (line.startsWith("```")) {
      if (inCode) {
        flushCode();
      } else {
        flushList();
        flushTable();
        inCode = true;
      }
      continue;
    }
    if (inCode) {
      // Inside a fence every line is kept verbatim; no Markdown is interpreted.
      code.push(line);
      continue;
    }
    if (line.startsWith("|")) {
      flushList();
      table.push(line);
      continue;
    }
    flushTable();
    if (line.startsWith("- ")) {
      if (!inList) {
        html.push("<ul>");
        inList = true;
      }
      html.push(`<li>${inlineMarkdown(line.slice(2))}</li>`);
      continue;
    }
    flushList();
    if (line.startsWith("# ")) html.push(`<h1>${inlineMarkdown(line.slice(2))}</h1>`);
    else if (line.startsWith("## ")) html.push(`<h2>${inlineMarkdown(line.slice(3))}</h2>`);
    else if (line.startsWith("### ")) html.push(`<h3>${inlineMarkdown(line.slice(4))}</h3>`);
    else if (line.trim() === "") html.push("");
    else html.push(`<p>${inlineMarkdown(line)}</p>`);
  }
  // Input ended inside an unterminated fence: keep the content rather than lose it.
  if (inCode) flushCode();
  flushList();
  flushTable();
  return html.join("\n");
}
1163
/**
 * Render buffered Markdown pipe-table lines as an HTML <table>.
 *
 * The first line is the header row. Only the line directly beneath it is
 * treated as the alignment separator, so a data row whose cells are all "-"
 * is kept. Cells are split on unescaped pipes only, and "\|" inside a cell is
 * turned back into a literal "|". A row without a trailing pipe keeps its
 * last character.
 *
 * @param {string[]} lines - Consecutive table lines, each starting with "|".
 * @returns {string} HTML table markup, or "" when there are no rows.
 */
function renderMarkdownTable(lines) {
  const separatorPattern = /^\|[-:\s|]+\|$/;
  const splitRow = (line) => {
    let text = line.trim();
    if (text.startsWith("|")) text = text.slice(1);
    // Strip the closing pipe, but not when it is an escaped cell character.
    if (text.endsWith("|") && !text.endsWith("\\|")) text = text.slice(0, -1);
    return text
      .split(/(?<!\\)\|/)
      .map((cell) => inlineMarkdown(cell.trim().replace(/\\\|/g, "|")));
  };
  const rows = lines
    .filter((line, index) => !(index === 1 && separatorPattern.test(line.trim())))
    .map(splitRow);
  if (rows.length === 0) return "";
  const [head, ...body] = rows;
  const th = head.map((cell) => `<th>${cell}</th>`).join("");
  const trs = body.map((row) => `<tr>${row.map((cell) => `<td>${cell}</td>`).join("")}</tr>`).join("\n");
  return `<table><thead><tr>${th}</tr></thead><tbody>${trs}</tbody></table>`;
}
1171
/**
 * HTML-escape a line of text, then apply the inline Markdown the report uses:
 * `code` spans become <code> and **bold** becomes <strong>.
 *
 * Escaping runs first so that only the tags added here reach the output.
 *
 * @param {string} s - Raw inline Markdown text.
 * @returns {string} HTML-safe markup.
 */
function inlineMarkdown(s) {
  return escapeHtml(s)
    .replace(/`([^`]+)`/g, "<code>$1</code>")
    .replace(/\*\*([^*]+)\*\*/g, "<strong>$1</strong>");
}
1174
/**
 * Escape the characters &, <, > and " for safe inclusion in HTML.
 *
 * @param {string} s - Raw text.
 * @returns {string} Text with those four characters replaced by entities.
 */
function escapeHtml(s) {
  const entities = { "&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;" };
  // One pass over the string; each special character maps to its entity.
  return s.replace(/[&<>"]/g, (ch) => entities[ch]);
}
1177
/**
 * Backslash-escape every "|" so the text can sit inside a Markdown table cell.
 *
 * @param {string} s - Raw cell text.
 * @returns {string} Text with each "|" replaced by "\|".
 */
function escapePipes(s) {
  return s.split("|").join("\\|");
}
1180
/**
 * Map a decision label to its numeric sort weight (higher sorts first).
 *
 * @param {string} decision - Decision label.
 * @returns {number} 5 for "promote" down to 2 for "needs_more_data";
 *   1 for any other value.
 */
function decisionWeight(decision) {
  switch (decision) {
    case "promote":
      return 5;
    case "equivalent":
      return 4;
    case "hold":
      return 3;
    case "needs_more_data":
      return 2;
    default:
      return 1;
  }
}
1187
/**
 * Format a number with an explicit leading "+" when it is zero or positive.
 *
 * @param {number} x - Value to format.
 * @returns {string} The `fmt` rendering of `x`, prefixed with "+" when `x >= 0`.
 */
function signed(x) {
  const prefix = x >= 0 ? "+" : "";
  return prefix + fmt(x);
}
1190
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} xs - Values to average.
 * @returns {number} The mean, or NaN for an empty list.
 */
function avg(xs) {
  const count = xs.length;
  if (count === 0) return Number.NaN;
  let total = 0;
  for (const value of xs) {
    total += value;
  }
  return total / count;
}
1194
/**
 * Median of an array that the caller has already sorted ascending.
 *
 * @param {number[]} sorted - Pre-sorted values; this function does not sort.
 * @returns {number} The middle value, the mean of the two middle values for
 *   an even-length array, or 0 for an empty array.
 */
function medianOfSorted(sorted) {
  const count = sorted.length;
  if (count === 0) return 0;
  const upper = Math.floor(count / 2);
  if (count % 2 === 1) {
    return sorted[upper];
  }
  return (sorted[upper - 1] + sorted[upper]) / 2;
}
1199
/**
 * Format a number to four decimal places for display.
 *
 * @param {number} x - Value to format.
 * @returns {string} `x.toFixed(4)` for finite numbers; otherwise `String(x)`
 *   (e.g. "NaN", "Infinity", "null").
 */
function fmt(x) {
  return Number.isFinite(x) ? x.toFixed(4) : String(x);
}
1203
+
1204
+ export {
1205
+ normalizeScores,
1206
+ weightedMean,
1207
+ confidenceInterval,
1208
+ interRaterReliability,
1209
+ mannWhitneyU,
1210
+ partialCredit,
1211
+ pairedTTest,
1212
+ wilcoxonSignedRank,
1213
+ cohensD,
1214
+ requiredSampleSize,
1215
+ bonferroni,
1216
+ benjaminiHochberg,
1217
+ pairedBootstrap,
1218
+ pairedWilcoxon,
1219
+ bhAdjust,
1220
+ summaryTable,
1221
+ paretoChart,
1222
+ gainHistogram,
1223
+ RESEARCH_REPORT_HARD_PAIR_FLOOR,
1224
+ researchReport
1225
+ };
1226
+ //# sourceMappingURL=chunk-IOXMGMHQ.js.map