@tangle-network/agent-eval 0.23.1 → 0.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148)
  1. package/CHANGELOG.md +145 -0
  2. package/README.md +212 -79
  3. package/dist/baseline-4R5deP0N.d.ts +108 -0
  4. package/dist/benchmarks/index.d.ts +3 -2
  5. package/dist/benchmarks/index.js +1 -1
  6. package/dist/builder-eval/index.d.ts +249 -0
  7. package/dist/builder-eval/index.js +391 -0
  8. package/dist/builder-eval/index.js.map +1 -0
  9. package/dist/{chunk-IOXMGMHQ.js → chunk-2A5XJB43.js} +142 -318
  10. package/dist/chunk-2A5XJB43.js.map +1 -0
  11. package/dist/chunk-47X6LRCE.js +76 -0
  12. package/dist/chunk-47X6LRCE.js.map +1 -0
  13. package/dist/{chunk-6M774GY6.js → chunk-4F5DQN55.js} +1 -1
  14. package/dist/chunk-4F5DQN55.js.map +1 -0
  15. package/dist/{chunk-KAO3Q65R.js → chunk-4S4BM3QQ.js} +15 -13
  16. package/dist/chunk-4S4BM3QQ.js.map +1 -0
  17. package/dist/chunk-5BKGXME7.js +65 -0
  18. package/dist/chunk-5BKGXME7.js.map +1 -0
  19. package/dist/{chunk-6KQG5HAH.js → chunk-5LBB5B3Z.js} +376 -72
  20. package/dist/chunk-5LBB5B3Z.js.map +1 -0
  21. package/dist/{chunk-42I2QC2L.js → chunk-6QDKWHLS.js} +18 -14
  22. package/dist/chunk-6QDKWHLS.js.map +1 -0
  23. package/dist/{chunk-VQQSPGSM.js → chunk-EDUKQ5AM.js} +247 -189
  24. package/dist/chunk-EDUKQ5AM.js.map +1 -0
  25. package/dist/chunk-I4MBDTY5.js +272 -0
  26. package/dist/chunk-I4MBDTY5.js.map +1 -0
  27. package/dist/chunk-JLZQWFV3.js +618 -0
  28. package/dist/chunk-JLZQWFV3.js.map +1 -0
  29. package/dist/chunk-K2TPS5LB.js +569 -0
  30. package/dist/chunk-K2TPS5LB.js.map +1 -0
  31. package/dist/chunk-KKHDIONI.js +414 -0
  32. package/dist/chunk-KKHDIONI.js.map +1 -0
  33. package/dist/chunk-KMPRBJK4.js +74 -0
  34. package/dist/chunk-KMPRBJK4.js.map +1 -0
  35. package/dist/{chunk-QUKKGHTZ.js → chunk-KTGTIOFD.js} +6 -3
  36. package/dist/chunk-KTGTIOFD.js.map +1 -0
  37. package/dist/chunk-LSH4MMOZ.js +838 -0
  38. package/dist/chunk-LSH4MMOZ.js.map +1 -0
  39. package/dist/chunk-NG236HPC.js +57 -0
  40. package/dist/chunk-NG236HPC.js.map +1 -0
  41. package/dist/{chunk-QBW3YBTR.js → chunk-NLMNWKVM.js} +14 -6
  42. package/dist/chunk-NLMNWKVM.js.map +1 -0
  43. package/dist/chunk-NU65VQ7M.js +99 -0
  44. package/dist/chunk-NU65VQ7M.js.map +1 -0
  45. package/dist/chunk-OWLAAMME.js +250 -0
  46. package/dist/chunk-OWLAAMME.js.map +1 -0
  47. package/dist/{chunk-SQQLHODJ.js → chunk-PC4UYEBM.js} +7 -4
  48. package/dist/chunk-PC4UYEBM.js.map +1 -0
  49. package/dist/{chunk-7EAUOUQS.js → chunk-RAF443UI.js} +213 -115
  50. package/dist/chunk-RAF443UI.js.map +1 -0
  51. package/dist/chunk-RZTMDUO7.js +49 -0
  52. package/dist/chunk-RZTMDUO7.js.map +1 -0
  53. package/dist/{chunk-EXGR4XEM.js → chunk-SESZDQPX.js} +23 -19
  54. package/dist/chunk-SESZDQPX.js.map +1 -0
  55. package/dist/{chunk-5IIQKMD5.js → chunk-TVVP3ZZQ.js} +14 -4
  56. package/dist/chunk-TVVP3ZZQ.js.map +1 -0
  57. package/dist/chunk-WWYCWKUM.js +196 -0
  58. package/dist/chunk-WWYCWKUM.js.map +1 -0
  59. package/dist/{chunk-AXHNWLIX.js → chunk-YRZ4M5GS.js} +2 -90
  60. package/dist/chunk-YRZ4M5GS.js.map +1 -0
  61. package/dist/chunk-ZN274SWR.js +613 -0
  62. package/dist/chunk-ZN274SWR.js.map +1 -0
  63. package/dist/cli.js +10 -6
  64. package/dist/cli.js.map +1 -1
  65. package/dist/{control-DvkH87qJ.d.ts → control-CBShYYA6.d.ts} +32 -33
  66. package/dist/control-runtime-BuJHoLg0.d.ts +180 -0
  67. package/dist/control.d.ts +8 -6
  68. package/dist/control.js +10 -7
  69. package/dist/{dataset-B9qvlm_o.d.ts → dataset-CiK_3LDr.d.ts} +5 -2
  70. package/dist/{emitter-B2XqDKFU.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
  71. package/dist/errors-BZ9sTdz7.d.ts +70 -0
  72. package/dist/failure-cluster-C2EGSDiT.d.ts +76 -0
  73. package/dist/feedback-trajectory-DfFdrraJ.d.ts +169 -0
  74. package/dist/governance/index.d.ts +5 -0
  75. package/dist/governance/index.js +18 -0
  76. package/dist/governance/index.js.map +1 -0
  77. package/dist/{index-DDTlbHEK.d.ts → index--fVrWDiR.d.ts} +1 -1
  78. package/dist/index-Oj9fAPPN.d.ts +270 -0
  79. package/dist/index.d.ts +2018 -3003
  80. package/dist/index.js +7443 -9102
  81. package/dist/index.js.map +1 -1
  82. package/dist/{integrity-Cr5YodSY.d.ts → integrity-DK2EBVZC.d.ts} +4 -3
  83. package/dist/knowledge/index.d.ts +102 -0
  84. package/dist/knowledge/index.js +18 -0
  85. package/dist/knowledge/index.js.map +1 -0
  86. package/dist/meta-eval/index.d.ts +99 -0
  87. package/dist/meta-eval/index.js +324 -0
  88. package/dist/meta-eval/index.js.map +1 -0
  89. package/dist/multi-layer-verifier-LkP3LVKj.d.ts +141 -0
  90. package/dist/openapi.json +491 -1
  91. package/dist/optimization.d.ts +11 -8
  92. package/dist/optimization.js +11 -9
  93. package/dist/outcome-store-D6KWmYvj.d.ts +63 -0
  94. package/dist/pipelines/index.d.ts +172 -0
  95. package/dist/pipelines/index.js +345 -0
  96. package/dist/pipelines/index.js.map +1 -0
  97. package/dist/prm/index.d.ts +99 -0
  98. package/dist/prm/index.js +222 -0
  99. package/dist/prm/index.js.map +1 -0
  100. package/dist/query-DODUYdPg.d.ts +30 -0
  101. package/dist/release-report-BNgMdqPF.d.ts +292 -0
  102. package/dist/replay-BL96gCEP.d.ts +226 -0
  103. package/dist/reporting.d.ts +10 -295
  104. package/dist/reporting.js +10 -6
  105. package/dist/{eval-campaign-Ds5QljIh.d.ts → researcher-BPT8x_NT.d.ts} +148 -146
  106. package/dist/rl.d.ts +1762 -8
  107. package/dist/rl.js +2035 -58
  108. package/dist/rl.js.map +1 -1
  109. package/dist/rubric-D5tjHNJQ.d.ts +72 -0
  110. package/dist/rubric-predictive-validity-C0uDYwG6.d.ts +105 -0
  111. package/dist/{run-record-DNiOMBrZ.d.ts → run-record-CqzahIbx.d.ts} +4 -1
  112. package/dist/sequential-Dgz1n51-.d.ts +139 -0
  113. package/dist/{store-u47QaJ9G.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
  114. package/dist/{summary-report-Ce1r4EYo.d.ts → summary-report-C7VPYEj2.d.ts} +3 -76
  115. package/dist/telemetry/file.js +4 -1
  116. package/dist/telemetry/file.js.map +1 -1
  117. package/dist/telemetry/index.js +57 -57
  118. package/dist/telemetry/index.js.map +1 -1
  119. package/dist/test-graded-scenario-B2kWEdh9.d.ts +146 -0
  120. package/dist/traces.d.ts +142 -387
  121. package/dist/traces.js +1302 -40
  122. package/dist/traces.js.map +1 -1
  123. package/dist/trajectory-CnoBo-JY.d.ts +32 -0
  124. package/dist/wire/index.d.ts +369 -25
  125. package/dist/wire/index.js +22 -3
  126. package/package.json +44 -18
  127. package/dist/chunk-42I2QC2L.js.map +0 -1
  128. package/dist/chunk-5IIQKMD5.js.map +0 -1
  129. package/dist/chunk-6KQG5HAH.js.map +0 -1
  130. package/dist/chunk-6M774GY6.js.map +0 -1
  131. package/dist/chunk-7EAUOUQS.js.map +0 -1
  132. package/dist/chunk-AXHNWLIX.js.map +0 -1
  133. package/dist/chunk-EXGR4XEM.js.map +0 -1
  134. package/dist/chunk-IOXMGMHQ.js.map +0 -1
  135. package/dist/chunk-KAO3Q65R.js.map +0 -1
  136. package/dist/chunk-LZKIOBG2.js +0 -2026
  137. package/dist/chunk-LZKIOBG2.js.map +0 -1
  138. package/dist/chunk-QBW3YBTR.js.map +0 -1
  139. package/dist/chunk-QUKKGHTZ.js.map +0 -1
  140. package/dist/chunk-SQQLHODJ.js.map +0 -1
  141. package/dist/chunk-V5QSWN7L.js +0 -1310
  142. package/dist/chunk-V5QSWN7L.js.map +0 -1
  143. package/dist/chunk-VQQSPGSM.js.map +0 -1
  144. package/dist/chunk-XPHOZPOM.js +0 -1947
  145. package/dist/chunk-XPHOZPOM.js.map +0 -1
  146. package/dist/feedback-trajectory-c43WGtTX.d.ts +0 -346
  147. package/dist/index-ekBXweiQ.d.ts +0 -1894
  148. package/dist/sequential-DgU2mFsE.d.ts +0 -304
@@ -1,263 +1,12 @@
1
+ import {
2
+ cohensD,
3
+ confidenceInterval,
4
+ wilcoxonSignedRank
5
+ } from "./chunk-I4MBDTY5.js";
1
6
  import {
2
7
  canonicalize,
3
8
  hashJson
4
- } from "./chunk-6M774GY6.js";
5
-
6
- // src/statistics.ts
7
- var INVERTED_DIMENSIONS = /* @__PURE__ */ new Set([
8
- "hallucination",
9
- "false_confidence",
10
- "worst_failure"
11
- ]);
12
- function normalizeScores(scores) {
13
- return scores.map((s) => {
14
- if (INVERTED_DIMENSIONS.has(s.dimension)) {
15
- return s;
16
- }
17
- return s;
18
- });
19
- }
20
- function weightedMean(scores) {
21
- if (scores.length === 0) return 0;
22
- let totalWeight = 0;
23
- let weightedSum = 0;
24
- for (const { score, weight } of scores) {
25
- const w = weight ?? 1;
26
- weightedSum += score * w;
27
- totalWeight += w;
28
- }
29
- return totalWeight > 0 ? weightedSum / totalWeight : 0;
30
- }
31
- function confidenceInterval(scores, confidence = 0.95) {
32
- if (scores.length === 0) return { mean: 0, lower: 0, upper: 0 };
33
- if (scores.length === 1) return { mean: scores[0], lower: scores[0], upper: scores[0] };
34
- const n = scores.length;
35
- const mean = scores.reduce((a, b) => a + b, 0) / n;
36
- const B = 1e3;
37
- const bootstrapMeans = [];
38
- for (let i = 0; i < B; i++) {
39
- let sum = 0;
40
- for (let j = 0; j < n; j++) {
41
- sum += scores[Math.floor(Math.random() * n)];
42
- }
43
- bootstrapMeans.push(sum / n);
44
- }
45
- bootstrapMeans.sort((a, b) => a - b);
46
- const alpha = 1 - confidence;
47
- const lowerIdx = Math.floor(alpha / 2 * B);
48
- const upperIdx = Math.floor((1 - alpha / 2) * B) - 1;
49
- return {
50
- mean,
51
- lower: bootstrapMeans[lowerIdx],
52
- upper: bootstrapMeans[Math.min(upperIdx, B - 1)]
53
- };
54
- }
55
- function interRaterReliability(judgeScores) {
56
- if (judgeScores.length < 2) return 1;
57
- const dimensionMap = /* @__PURE__ */ new Map();
58
- for (const judgeSet of judgeScores) {
59
- for (const s of judgeSet) {
60
- if (!dimensionMap.has(s.dimension)) dimensionMap.set(s.dimension, []);
61
- const arr = dimensionMap.get(s.dimension);
62
- if (arr.length === 0 || arr[arr.length - 1].length >= judgeScores.length) {
63
- arr.push([s.score]);
64
- } else {
65
- arr[arr.length - 1].push(s.score);
66
- }
67
- }
68
- }
69
- const allValues = [];
70
- const pairDiffs = [];
71
- for (const items of dimensionMap.values()) {
72
- for (const ratings of items) {
73
- if (ratings.length < 2) continue;
74
- for (const v of ratings) allValues.push(v);
75
- for (let i = 0; i < ratings.length; i++) {
76
- for (let j = i + 1; j < ratings.length; j++) {
77
- pairDiffs.push((ratings[i] - ratings[j]) ** 2);
78
- }
79
- }
80
- }
81
- }
82
- if (pairDiffs.length === 0 || allValues.length < 2) return 1;
83
- const observedDisagreement = pairDiffs.reduce((a, b) => a + b, 0) / pairDiffs.length;
84
- let expectedDisagreement = 0;
85
- let expectedCount = 0;
86
- for (let i = 0; i < allValues.length; i++) {
87
- for (let j = i + 1; j < allValues.length; j++) {
88
- expectedDisagreement += (allValues[i] - allValues[j]) ** 2;
89
- expectedCount++;
90
- }
91
- }
92
- expectedDisagreement = expectedCount > 0 ? expectedDisagreement / expectedCount : 0;
93
- if (expectedDisagreement === 0) return 1;
94
- return 1 - observedDisagreement / expectedDisagreement;
95
- }
96
- function mannWhitneyU(a, b) {
97
- if (a.length === 0 || b.length === 0) return { u: 0, p: 1 };
98
- const n1 = a.length;
99
- const n2 = b.length;
100
- const combined = [
101
- ...a.map((v) => ({ v, group: "a" })),
102
- ...b.map((v) => ({ v, group: "b" }))
103
- ].sort((x, y) => x.v - y.v);
104
- const ranks = new Array(combined.length);
105
- let i = 0;
106
- while (i < combined.length) {
107
- let j = i;
108
- while (j < combined.length && combined[j].v === combined[i].v) j++;
109
- const avgRank = (i + 1 + j) / 2;
110
- for (let k = i; k < j; k++) ranks[k] = avgRank;
111
- i = j;
112
- }
113
- let r1 = 0;
114
- for (let k = 0; k < combined.length; k++) {
115
- if (combined[k].group === "a") r1 += ranks[k];
116
- }
117
- const u1 = r1 - n1 * (n1 + 1) / 2;
118
- const u2 = n1 * n2 - u1;
119
- const u = Math.min(u1, u2);
120
- const mu = n1 * n2 / 2;
121
- const sigma = Math.sqrt(n1 * n2 * (n1 + n2 + 1) / 12);
122
- if (sigma === 0) return { u, p: 1 };
123
- const z = Math.abs(u - mu) / sigma;
124
- const p = 2 * (1 - normalCdf(z));
125
- return { u, p };
126
- }
127
- function partialCredit(current, target) {
128
- if (target <= 0) return 1;
129
- return Math.min(1, Math.max(0, current / target));
130
- }
131
- function pairedTTest(before, after) {
132
- if (before.length !== after.length) {
133
- throw new Error(`pairedTTest: unequal sample sizes (${before.length} vs ${after.length})`);
134
- }
135
- const n = before.length;
136
- if (n < 2) return { t: 0, df: 0, p: 1 };
137
- const diffs = before.map((b, i) => after[i] - b);
138
- const mean = diffs.reduce((a, b) => a + b, 0) / n;
139
- const variance = diffs.reduce((acc, d) => acc + (d - mean) ** 2, 0) / (n - 1);
140
- const se = Math.sqrt(variance / n);
141
- if (se === 0) return { t: mean === 0 ? 0 : Infinity, df: n - 1, p: mean === 0 ? 1 : 0 };
142
- const t = mean / se;
143
- const df = n - 1;
144
- const p = 2 * (1 - studentTCdf(Math.abs(t), df));
145
- return { t, df, p };
146
- }
147
- function wilcoxonSignedRank(before, after) {
148
- if (before.length !== after.length) {
149
- throw new Error(`wilcoxonSignedRank: unequal sample sizes (${before.length} vs ${after.length})`);
150
- }
151
- const diffs = before.map((b, i2) => after[i2] - b).filter((d) => d !== 0);
152
- const n = diffs.length;
153
- if (n < 6) return { w: 0, p: 1 };
154
- const absRanks = diffs.map((d, i2) => ({ abs: Math.abs(d), sign: Math.sign(d), i: i2 })).sort((a, b) => a.abs - b.abs);
155
- const ranks = new Array(n);
156
- let i = 0;
157
- while (i < n) {
158
- let j = i;
159
- while (j < n && absRanks[j].abs === absRanks[i].abs) j++;
160
- const avg2 = (i + 1 + j) / 2;
161
- for (let k = i; k < j; k++) ranks[absRanks[k].i] = avg2;
162
- i = j;
163
- }
164
- let wPlus = 0;
165
- for (let k = 0; k < n; k++) if (diffs[k] > 0) wPlus += ranks[k];
166
- const mean = n * (n + 1) / 4;
167
- const variance = n * (n + 1) * (2 * n + 1) / 24;
168
- const z = (wPlus - mean) / Math.sqrt(variance);
169
- const p = 2 * (1 - normalCdf(Math.abs(z)));
170
- return { w: wPlus, p };
171
- }
172
- function cohensD(a, b) {
173
- if (a.length < 2 || b.length < 2) return 0;
174
- const meanA = a.reduce((x, y) => x + y, 0) / a.length;
175
- const meanB = b.reduce((x, y) => x + y, 0) / b.length;
176
- const varA = a.reduce((acc, x) => acc + (x - meanA) ** 2, 0) / (a.length - 1);
177
- const varB = b.reduce((acc, x) => acc + (x - meanB) ** 2, 0) / (b.length - 1);
178
- const pooled = Math.sqrt(
179
- ((a.length - 1) * varA + (b.length - 1) * varB) / (a.length + b.length - 2)
180
- );
181
- if (pooled === 0) return 0;
182
- return (meanB - meanA) / pooled;
183
- }
184
- function studentTCdf(t, df) {
185
- if (df <= 0) return 0.5;
186
- if (df > 100) return normalCdf(t);
187
- const x = df / (df + t * t);
188
- const a = df / 2;
189
- const b = 0.5;
190
- const ib = incompleteBeta(x, a, b);
191
- return t >= 0 ? 1 - 0.5 * ib : 0.5 * ib;
192
- }
193
- function incompleteBeta(x, a, b) {
194
- if (x <= 0) return 0;
195
- if (x >= 1) return 1;
196
- const lnBeta = lnGamma(a) + lnGamma(b) - lnGamma(a + b);
197
- const front = Math.exp(Math.log(x) * a + Math.log(1 - x) * b - lnBeta) / a;
198
- const maxIter = 200;
199
- const eps = 3e-7;
200
- let c = 1;
201
- let d = 1 - (a + b) * x / (a + 1);
202
- if (Math.abs(d) < 1e-30) d = 1e-30;
203
- d = 1 / d;
204
- let f = d;
205
- for (let m = 1; m <= maxIter; m++) {
206
- const m2 = 2 * m;
207
- let num = m * (b - m) * x / ((a + m2 - 1) * (a + m2));
208
- d = 1 + num * d;
209
- if (Math.abs(d) < 1e-30) d = 1e-30;
210
- c = 1 + num / c;
211
- if (Math.abs(c) < 1e-30) c = 1e-30;
212
- d = 1 / d;
213
- f *= d * c;
214
- num = -((a + m) * (a + b + m) * x) / ((a + m2) * (a + m2 + 1));
215
- d = 1 + num * d;
216
- if (Math.abs(d) < 1e-30) d = 1e-30;
217
- c = 1 + num / c;
218
- if (Math.abs(c) < 1e-30) c = 1e-30;
219
- d = 1 / d;
220
- const delta = d * c;
221
- f *= delta;
222
- if (Math.abs(delta - 1) < eps) break;
223
- }
224
- return front * f;
225
- }
226
- function lnGamma(z) {
227
- const g = 7;
228
- const coefs = [
229
- 0.9999999999998099,
230
- 676.5203681218851,
231
- -1259.1392167224028,
232
- 771.3234287776531,
233
- -176.6150291621406,
234
- 12.507343278686905,
235
- -0.13857109526572012,
236
- 9984369578019572e-21,
237
- 15056327351493116e-23
238
- ];
239
- if (z < 0.5) {
240
- return Math.log(Math.PI / Math.sin(Math.PI * z)) - lnGamma(1 - z);
241
- }
242
- z -= 1;
243
- let x = coefs[0];
244
- for (let i = 1; i < g + 2; i++) x += coefs[i] / (z + i);
245
- const t = z + g + 0.5;
246
- return 0.5 * Math.log(2 * Math.PI) + (z + 0.5) * Math.log(t) - t + Math.log(x);
247
- }
248
- function normalCdf(x) {
249
- const a1 = 0.254829592;
250
- const a2 = -0.284496736;
251
- const a3 = 1.421413741;
252
- const a4 = -1.453152027;
253
- const a5 = 1.061405429;
254
- const p = 0.3275911;
255
- const sign = x < 0 ? -1 : 1;
256
- const absX = Math.abs(x);
257
- const t = 1 / (1 + p * absX);
258
- const y = 1 - ((((a5 * t + a4) * t + a3) * t + a2) * t + a1) * t * Math.exp(-absX * absX / 2);
259
- return 0.5 * (1 + sign * y);
260
- }
9
+ } from "./chunk-4F5DQN55.js";
261
10
 
262
11
  // src/power-analysis.ts
263
12
  function requiredSampleSize(opts) {
@@ -268,7 +17,7 @@ function requiredSampleSize(opts) {
268
17
  const twoSided = opts.twoSided ?? true;
269
18
  const zAlpha = zQuantile(twoSided ? 1 - alpha / 2 : 1 - alpha);
270
19
  const zBeta = zQuantile(power);
271
- const n = 2 * Math.pow((zAlpha + zBeta) / effect, 2);
20
+ const n = 2 * ((zAlpha + zBeta) / effect) ** 2;
272
21
  return Math.ceil(n);
273
22
  }
274
23
  function pairedMde(opts) {
@@ -294,10 +43,11 @@ function benjaminiHochberg(pValues, fdr = 0.05) {
294
43
  let minRight = 1;
295
44
  for (let k = n - 1; k >= 0; k--) {
296
45
  const rank = k + 1;
297
- const raw = indexed[k].p * n / rank;
46
+ const entry = indexed[k];
47
+ const raw = entry.p * n / rank;
298
48
  const bounded = Math.min(minRight, raw);
299
49
  minRight = bounded;
300
- q[indexed[k].i] = Math.min(1, bounded);
50
+ q[entry.i] = Math.min(1, bounded);
301
51
  }
302
52
  const significant = q.map((v) => v < fdr);
303
53
  return { qValues: q, significant };
@@ -308,9 +58,29 @@ function zQuantile(p) {
308
58
  if (p === 1) return Infinity;
309
59
  return NaN;
310
60
  }
311
- const a = [-39.69683028665376, 220.9460984245205, -275.9285104469687, 138.357751867269, -30.66479806614716, 2.506628277459239];
312
- const b = [-54.47609879822406, 161.5858368580409, -155.6989798598866, 66.80131188771972, -13.28068155288572];
313
- const c = [-0.007784894002430293, -0.3223964580411365, -2.400758277161838, -2.549732539343734, 4.374664141464968, 2.938163982698783];
61
+ const a = [
62
+ -39.69683028665376,
63
+ 220.9460984245205,
64
+ -275.9285104469687,
65
+ 138.357751867269,
66
+ -30.66479806614716,
67
+ 2.506628277459239
68
+ ];
69
+ const b = [
70
+ -54.47609879822406,
71
+ 161.5858368580409,
72
+ -155.6989798598866,
73
+ 66.80131188771972,
74
+ -13.28068155288572
75
+ ];
76
+ const c = [
77
+ -0.007784894002430293,
78
+ -0.3223964580411365,
79
+ -2.400758277161838,
80
+ -2.549732539343734,
81
+ 4.374664141464968,
82
+ 2.938163982698783
83
+ ];
314
84
  const d = [0.007784695709041462, 0.3224671290700398, 2.445134137142996, 3.754408661907416];
315
85
  const pLow = 0.02425;
316
86
  const pHigh = 1 - pLow;
@@ -332,9 +102,7 @@ function zQuantile(p) {
332
102
  // src/paired-stats.ts
333
103
  function pairedBootstrap(before, after, opts = {}) {
334
104
  if (before.length !== after.length) {
335
- throw new Error(
336
- `pairedBootstrap: unequal sample sizes (${before.length} vs ${after.length})`
337
- );
105
+ throw new Error(`pairedBootstrap: unequal sample sizes (${before.length} vs ${after.length})`);
338
106
  }
339
107
  const confidence = opts.confidence ?? 0.95;
340
108
  const resamples = opts.resamples ?? 2e3;
@@ -686,7 +454,9 @@ async function researchReport(runs, opts = {}) {
686
454
  const generatedAt = opts.generatedAt ?? (/* @__PURE__ */ new Date()).toISOString();
687
455
  const preregistrationHash = opts.preregistrationHash ?? null;
688
456
  if (rope && !(Number.isFinite(rope.low) && Number.isFinite(rope.high) && rope.low <= rope.high)) {
689
- throw new Error(`researchReport: rope must satisfy low \u2264 high with finite bounds, got ${JSON.stringify(rope)}`);
457
+ throw new Error(
458
+ `researchReport: rope must satisfy low \u2264 high with finite bounds, got ${JSON.stringify(rope)}`
459
+ );
690
460
  }
691
461
  const summary = summaryTable(runs, {
692
462
  comparator: comparator ?? void 0,
@@ -696,24 +466,29 @@ async function researchReport(runs, opts = {}) {
696
466
  });
697
467
  const pareto = paretoChart(runs, { split, gateDecisions: opts.gateDecisions });
698
468
  const candidateIds = opts.candidateIds ?? summary.rows.map((r) => r.candidateId).filter((id) => id !== comparator);
699
- const gains = comparator ? candidateIds.map((id) => gainHistogram(runs, id, comparator, {
700
- split,
701
- confidence,
702
- seed: opts.seed
703
- })) : [];
469
+ const gains = comparator ? candidateIds.map(
470
+ (id) => gainHistogram(runs, id, comparator, {
471
+ split,
472
+ confidence,
473
+ seed: opts.seed
474
+ })
475
+ ) : [];
704
476
  const gainByCandidate = new Map(gains.map((g) => [g.candidateId, g]));
705
477
  const paretoByCandidate = new Map(pareto.points.map((p) => [p.candidateId, p]));
706
478
  const posteriorByCandidate = /* @__PURE__ */ new Map();
707
479
  if (comparator) {
708
480
  for (const id of candidateIds) {
709
- posteriorByCandidate.set(id, pairedPosterior(runs, id, comparator, {
710
- split,
711
- confidence,
712
- seed: opts.seed,
713
- rope,
714
- mdePower,
715
- mdeAlpha
716
- }));
481
+ posteriorByCandidate.set(
482
+ id,
483
+ pairedPosterior(runs, id, comparator, {
484
+ split,
485
+ confidence,
486
+ seed: opts.seed,
487
+ rope,
488
+ mdePower,
489
+ mdeAlpha
490
+ })
491
+ );
717
492
  }
718
493
  }
719
494
  const candidates = summary.rows.map((row) => {
@@ -767,12 +542,23 @@ async function researchReport(runs, opts = {}) {
767
542
  failureClusters: opts.failureClusters,
768
543
  preregistrationHash
769
544
  });
770
- const methodology = buildMethodology({ split, comparator, fdr, minPairs, rope, confidence, mdePower, mdeAlpha });
771
- const runFingerprint = await hashJson(canonicalize({
772
- triples: runs.filter((r) => r.splitTag === split).map((r) => ({ runId: r.runId, candidateId: r.candidateId, splitTag: r.splitTag })).sort((a, b) => a.runId.localeCompare(b.runId)),
545
+ const methodology = buildMethodology({
546
+ split,
773
547
  comparator,
774
- split
775
- }));
548
+ fdr,
549
+ minPairs,
550
+ rope,
551
+ confidence,
552
+ mdePower,
553
+ mdeAlpha
554
+ });
555
+ const runFingerprint = await hashJson(
556
+ canonicalize({
557
+ triples: runs.filter((r) => r.splitTag === split).map((r) => ({ runId: r.runId, candidateId: r.candidateId, splitTag: r.splitTag })).sort((a, b) => a.runId.localeCompare(b.runId)),
558
+ comparator,
559
+ split
560
+ })
561
+ );
776
562
  const markdown = renderResearchMarkdown({
777
563
  title,
778
564
  generatedAt,
@@ -818,7 +604,9 @@ function buildMethodology(ctx) {
818
604
  `Decisions are pre-specified at fdr=${ctx.fdr}, minPairs=${ctx.minPairs}, confidence=${ctx.confidence}; deviating from these post-hoc invalidates the false-discovery control.`
819
605
  ];
820
606
  if (ctx.rope) {
821
- assumptions.push(`The Region of Practical Equivalence ${formatRope(ctx.rope)} is supplied by the domain owner; equivalent verdicts are only meaningful if that range is treated as the standing definition of "no material difference."`);
607
+ assumptions.push(
608
+ `The Region of Practical Equivalence ${formatRope(ctx.rope)} is supplied by the domain owner; equivalent verdicts are only meaningful if that range is treated as the standing definition of "no material difference."`
609
+ );
822
610
  }
823
611
  if (ctx.comparator === null) {
824
612
  assumptions.push("No comparator was configured; this run is descriptive, not causal.");
@@ -884,7 +672,10 @@ function classifyCandidate(row, ctx) {
884
672
  const gainPositive = ci.low > 0;
885
673
  const gainNegative = ci.high < 0;
886
674
  if (gainNegative) {
887
- return { decision: "reject", reason: `Paired-delta CI [${fmt(ci.low)}, ${fmt(ci.high)}] lies entirely below zero.` };
675
+ return {
676
+ decision: "reject",
677
+ reason: `Paired-delta CI [${fmt(ci.low)}, ${fmt(ci.high)}] lies entirely below zero.`
678
+ };
888
679
  }
889
680
  if (ctx.posterior.n < ctx.minPairs) {
890
681
  return {
@@ -916,7 +707,9 @@ function buildRecommendation(candidates, ctx) {
916
707
  rationale.push(`${chosen.candidateId}: ${chosen.decisionReason}`);
917
708
  if (chosen.gainCi) {
918
709
  const probSummary = chosen.prGreaterThanZero !== null ? `, Pr(\u0394>0)=${fmt(chosen.prGreaterThanZero)}` : "";
919
- rationale.push(`Median paired gain CI: [${fmt(chosen.gainCi.low)}, ${fmt(chosen.gainCi.high)}]${probSummary}.`);
710
+ rationale.push(
711
+ `Median paired gain CI: [${fmt(chosen.gainCi.low)}, ${fmt(chosen.gainCi.high)}]${probSummary}.`
712
+ );
920
713
  }
921
714
  if (chosen.mde !== null && Number.isFinite(chosen.mde)) {
922
715
  rationale.push(`MDE at current paired N=${chosen.pairedN}: ${fmt(chosen.mde)} score units.`);
@@ -927,22 +720,36 @@ function buildRecommendation(candidates, ctx) {
927
720
  nextActions.push("Re-run with a stable comparator candidate for paired inference.");
928
721
  }
929
722
  if (!ctx.preregistrationHash) {
930
- risks.push("No preregistration hash supplied; readers cannot verify the analysis was specified before data inspection.");
931
- nextActions.push("Sign a HypothesisManifest before the next sweep and pass `preregistrationHash` so the report cites it.");
723
+ risks.push(
724
+ "No preregistration hash supplied; readers cannot verify the analysis was specified before data inspection."
725
+ );
726
+ nextActions.push(
727
+ "Sign a HypothesisManifest before the next sweep and pass `preregistrationHash` so the report cites it."
728
+ );
932
729
  }
933
730
  if (ctx.rope === null && nonComparator.length > 0) {
934
- risks.push('No ROPE configured; the report cannot distinguish "equivalent" from "inconclusive".');
935
- nextActions.push("Define a domain-specific Region of Practical Equivalence and pass it to lock in the equivalence threshold.");
731
+ risks.push(
732
+ 'No ROPE configured; the report cannot distinguish "equivalent" from "inconclusive".'
733
+ );
734
+ nextActions.push(
735
+ "Define a domain-specific Region of Practical Equivalence and pass it to lock in the equivalence threshold."
736
+ );
936
737
  }
937
738
  const inconclusive = nonComparator.filter((c) => c.decision === "needs_more_data");
938
739
  if (inconclusive.length > 0) {
939
740
  const worst = inconclusive.reduce((a, b) => b.pairedN < a.pairedN ? b : a);
940
- risks.push(`${inconclusive.length} candidate(s) below soft floor (${ctx.minPairs} pairs); thinnest is ${worst.candidateId} with ${worst.pairedN}.`);
941
- nextActions.push(`Collect at least ${ctx.minPairs - worst.pairedN} more matched holdout runs for ${worst.candidateId}.`);
741
+ risks.push(
742
+ `${inconclusive.length} candidate(s) below soft floor (${ctx.minPairs} pairs); thinnest is ${worst.candidateId} with ${worst.pairedN}.`
743
+ );
744
+ nextActions.push(
745
+ `Collect at least ${ctx.minPairs - worst.pairedN} more matched holdout runs for ${worst.candidateId}.`
746
+ );
942
747
  }
943
748
  const rejected = nonComparator.filter((c) => c.decision === "reject");
944
749
  if (rejected.length > 0) {
945
- risks.push(`${rejected.length} candidate(s) failed the paired test or held-out gate; do not ship those variants.`);
750
+ risks.push(
751
+ `${rejected.length} candidate(s) failed the paired test or held-out gate; do not ship those variants.`
752
+ );
946
753
  }
947
754
  if (ctx.failureClusters && ctx.failureClusters.clusters.length > 0) {
948
755
  const top = ctx.failureClusters.clusters[0];
@@ -954,9 +761,13 @@ function buildRecommendation(candidates, ctx) {
954
761
  } else if (decision === "hold") {
955
762
  nextActions.push("Keep current production candidate while expanding holdout evidence.");
956
763
  } else if (decision === "equivalent") {
957
- nextActions.push("Either keep the comparator (no quality regression) or promote on cost/latency grounds \u2014 equivalence does not justify either; the choice is a product decision, not a stats one.");
764
+ nextActions.push(
765
+ "Either keep the comparator (no quality regression) or promote on cost/latency grounds \u2014 equivalence does not justify either; the choice is a product decision, not a stats one."
766
+ );
958
767
  } else if (decision === "reject") {
959
- nextActions.push("Do not promote this sweep; inspect failures and generate a revised candidate.");
768
+ nextActions.push(
769
+ "Do not promote this sweep; inspect failures and generate a revised candidate."
770
+ );
960
771
  }
961
772
  return {
962
773
  decision,
@@ -969,20 +780,30 @@ function buildRecommendation(candidates, ctx) {
969
780
  function buildExecutiveSummary(candidates, recommendation, ctx) {
970
781
  const lines = [];
971
782
  const nonComparator = candidates.filter((c) => c.candidateId !== ctx.comparator);
972
- lines.push(`Evaluated ${nonComparator.length} candidate(s) on the ${ctx.split} split${ctx.comparator ? ` against ${ctx.comparator}` : ""}.`);
973
- lines.push(`Recommendation: ${recommendation.decision}${recommendation.candidateId ? ` ${recommendation.candidateId}` : ""}.`);
783
+ lines.push(
784
+ `Evaluated ${nonComparator.length} candidate(s) on the ${ctx.split} split${ctx.comparator ? ` against ${ctx.comparator}` : ""}.`
785
+ );
786
+ lines.push(
787
+ `Recommendation: ${recommendation.decision}${recommendation.candidateId ? ` ${recommendation.candidateId}` : ""}.`
788
+ );
974
789
  const promoted = nonComparator.filter((c) => c.decision === "promote").length;
975
790
  const held = nonComparator.filter((c) => c.decision === "hold").length;
976
791
  const equivalent = nonComparator.filter((c) => c.decision === "equivalent").length;
977
792
  const rejected = nonComparator.filter((c) => c.decision === "reject").length;
978
793
  const more = nonComparator.filter((c) => c.decision === "needs_more_data").length;
979
- lines.push(`Decision mix: ${promoted} promote, ${equivalent} equivalent, ${held} hold, ${rejected} reject, ${more} need more data.`);
794
+ lines.push(
795
+ `Decision mix: ${promoted} promote, ${equivalent} equivalent, ${held} hold, ${rejected} reject, ${more} need more data.`
796
+ );
980
797
  const frontier = nonComparator.filter((c) => c.onParetoFrontier).map((c) => c.candidateId);
981
798
  if (frontier.length > 0) lines.push(`Pareto-frontier candidates: ${frontier.join(", ")}.`);
982
799
  if (ctx.failureClusters) {
983
- lines.push(`Failure clustering found ${ctx.failureClusters.totalFailures}/${ctx.failureClusters.totalRuns} failed runs across ${ctx.failureClusters.clusters.length} reportable cluster(s).`);
800
+ lines.push(
801
+ `Failure clustering found ${ctx.failureClusters.totalFailures}/${ctx.failureClusters.totalRuns} failed runs across ${ctx.failureClusters.clusters.length} reportable cluster(s).`
802
+ );
984
803
  }
985
- lines.push(ctx.preregistrationHash ? `Preregistered analysis: ${ctx.preregistrationHash.slice(0, 12)}\u2026` : "Analysis is post-hoc \u2014 no preregistration hash supplied.");
804
+ lines.push(
805
+ ctx.preregistrationHash ? `Preregistered analysis: ${ctx.preregistrationHash.slice(0, 12)}\u2026` : "Analysis is post-hoc \u2014 no preregistration hash supplied."
806
+ );
986
807
  return lines;
987
808
  }
988
809
  function renderResearchMarkdown(report) {
@@ -994,7 +815,9 @@ function renderResearchMarkdown(report) {
994
815
  lines.push(`**Comparator:** ${report.comparator ?? "not configured"}`);
995
816
  lines.push(`**ROPE:** ${report.rope ? formatRope(report.rope) : "not configured"}`);
996
817
  lines.push(`**Run fingerprint:** \`${report.runFingerprint}\``);
997
- lines.push(`**Preregistration:** ${report.preregistrationHash ? `\`${report.preregistrationHash}\`` : "none"}`);
818
+ lines.push(
819
+ `**Preregistration:** ${report.preregistrationHash ? `\`${report.preregistrationHash}\`` : "none"}`
820
+ );
998
821
  lines.push("");
999
822
  lines.push("## Executive Summary");
1000
823
  lines.push("");
@@ -1021,7 +844,9 @@ function renderResearchMarkdown(report) {
1021
844
  lines.push("");
1022
845
  lines.push("## Candidate Decision Table");
1023
846
  lines.push("");
1024
- lines.push("| Candidate | Decision | Mean | \u0394\u0304 | Pr(\u0394>0) | q | d | Paired N | Median Gain CI | MDE | Pareto | Gate |");
847
+ lines.push(
848
+ "| Candidate | Decision | Mean | \u0394\u0304 | Pr(\u0394>0) | q | d | Paired N | Median Gain CI | MDE | Pareto | Gate |"
849
+ );
1025
850
  lines.push("|---|---|---:|---:|---:|---:|---:|---:|---|---:|---|---|");
1026
851
  for (const c of report.candidates) {
1027
852
  const delta = c.meanDeltaVsComparator === null ? "-" : signed(c.meanDeltaVsComparator);
@@ -1030,7 +855,9 @@ function renderResearchMarkdown(report) {
1030
855
  const d = Number.isFinite(c.cohensD) ? c.cohensD.toFixed(3) : "-";
1031
856
  const gain = c.gainCi ? `[${fmt(c.gainCi.low)}, ${fmt(c.gainCi.high)}]` : "-";
1032
857
  const mde = c.mde === null || !Number.isFinite(c.mde) ? "-" : fmt(c.mde);
1033
- lines.push(`| ${c.candidateId} | ${c.decision} | ${fmt(c.mean)} | ${delta} | ${prGt} | ${q} | ${d} | ${c.pairedN} | ${gain} | ${mde} | ${c.onParetoFrontier ? "yes" : "no"} | ${c.gate ?? "-"} |`);
858
+ lines.push(
859
+ `| ${c.candidateId} | ${c.decision} | ${fmt(c.mean)} | ${delta} | ${prGt} | ${q} | ${d} | ${c.pairedN} | ${gain} | ${mde} | ${c.onParetoFrontier ? "yes" : "no"} | ${c.gate ?? "-"} |`
860
+ );
1034
861
  }
1035
862
  lines.push("");
1036
863
  lines.push("## Statistical Summary");
@@ -1061,7 +888,9 @@ function renderResearchMarkdown(report) {
1061
888
  lines.push("");
1062
889
  lines.push("## Chart Specs");
1063
890
  lines.push("");
1064
- lines.push("The report carries JSON chart specs for Pareto cost/quality and paired gain histograms.");
891
+ lines.push(
892
+ "The report carries JSON chart specs for Pareto cost/quality and paired gain histograms."
893
+ );
1065
894
  lines.push("");
1066
895
  lines.push("```json");
1067
896
  lines.push(JSON.stringify({ pareto: report.pareto, gains: report.gains }, null, 2));
@@ -1073,7 +902,9 @@ function renderResearchMarkdown(report) {
1073
902
  lines.push("| Failure Class | Runs | Scenarios | Tool | Example |");
1074
903
  lines.push("|---|---:|---:|---|---|");
1075
904
  for (const c of report.failureClusters.clusters.slice(0, 10)) {
1076
- lines.push(`| ${c.failureClass} | ${c.runCount} | ${c.scenarioIds.length} | ${c.toolName ?? "-"} | ${escapePipes(c.exampleError ?? c.exampleRunId)} |`);
905
+ lines.push(
906
+ `| ${c.failureClass} | ${c.runCount} | ${c.scenarioIds.length} | ${c.toolName ?? "-"} | ${escapePipes(c.exampleError ?? c.exampleRunId)} |`
907
+ );
1077
908
  }
1078
909
  }
1079
910
  return lines.join("\n");
@@ -1161,7 +992,9 @@ function markdownToHtml(markdown) {
1161
992
  return html.join("\n");
1162
993
  }
1163
994
  function renderMarkdownTable(lines) {
1164
- const rows = lines.filter((line) => !/^\|[-:\s|]+\|$/.test(line)).map((line) => line.slice(1, -1).split("|").map((cell) => inlineMarkdown(cell.trim())));
995
+ const rows = lines.filter((line) => !/^\|[-:\s|]+\|$/.test(line)).map(
996
+ (line) => line.slice(1, -1).split("|").map((cell) => inlineMarkdown(cell.trim()))
997
+ );
1165
998
  if (rows.length === 0) return "";
1166
999
  const [head, ...body] = rows;
1167
1000
  const th = head.map((cell) => `<th>${cell}</th>`).join("");
@@ -1202,15 +1035,6 @@ function fmt(x) {
1202
1035
  }
1203
1036
 
1204
1037
  export {
1205
- normalizeScores,
1206
- weightedMean,
1207
- confidenceInterval,
1208
- interRaterReliability,
1209
- mannWhitneyU,
1210
- partialCredit,
1211
- pairedTTest,
1212
- wilcoxonSignedRank,
1213
- cohensD,
1214
1038
  requiredSampleSize,
1215
1039
  bonferroni,
1216
1040
  benjaminiHochberg,
@@ -1223,4 +1047,4 @@ export {
1223
1047
  RESEARCH_REPORT_HARD_PAIR_FLOOR,
1224
1048
  researchReport
1225
1049
  };
1226
- //# sourceMappingURL=chunk-IOXMGMHQ.js.map
1050
+ //# sourceMappingURL=chunk-2A5XJB43.js.map