@tangle-network/agent-eval 0.27.0 → 0.27.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. package/CHANGELOG.md +72 -0
  2. package/README.md +4 -5
  3. package/dist/builder-eval/index.js +1 -1
  4. package/dist/{chunk-WWYCWKUM.js → chunk-3CKU6VGU.js} +2 -2
  5. package/dist/{chunk-K2TPS5LB.js → chunk-4U4BKCXK.js} +2 -2
  6. package/dist/chunk-4U4BKCXK.js.map +1 -0
  7. package/dist/{chunk-2A5XJB43.js → chunk-5AKPEK5L.js} +3 -3
  8. package/dist/chunk-5AKPEK5L.js.map +1 -0
  9. package/dist/{chunk-RAF443UI.js → chunk-DBIGN5MJ.js} +2 -2
  10. package/dist/{chunk-JLZQWFV3.js → chunk-K33INZHH.js} +2 -2
  11. package/dist/chunk-K33INZHH.js.map +1 -0
  12. package/dist/{chunk-NU65VQ7M.js → chunk-MAZ26DC7.js} +1 -1
  13. package/dist/chunk-MAZ26DC7.js.map +1 -0
  14. package/dist/{chunk-LSH4MMOZ.js → chunk-NCRFYPS3.js} +1 -1
  15. package/dist/chunk-NCRFYPS3.js.map +1 -0
  16. package/dist/{chunk-ZN274SWR.js → chunk-PALJO75S.js} +2 -2
  17. package/dist/{chunk-OWLAAMME.js → chunk-QHF6EQKK.js} +3 -2
  18. package/dist/chunk-QHF6EQKK.js.map +1 -0
  19. package/dist/chunk-R5UQJNKC.js +722 -0
  20. package/dist/chunk-R5UQJNKC.js.map +1 -0
  21. package/dist/{chunk-SESZDQPX.js → chunk-RUI6SIHY.js} +3 -3
  22. package/dist/chunk-RUI6SIHY.js.map +1 -0
  23. package/dist/{chunk-WHZMVFUV.js → chunk-SZSBQUIJ.js} +2 -2
  24. package/dist/chunk-SZSBQUIJ.js.map +1 -0
  25. package/dist/{chunk-4F5DQN55.js → chunk-VSMTAMNK.js} +1 -1
  26. package/dist/chunk-VSMTAMNK.js.map +1 -0
  27. package/dist/{chunk-5LBB5B3Z.js → chunk-XFZCM5Z3.js} +1 -1
  28. package/dist/chunk-XFZCM5Z3.js.map +1 -0
  29. package/dist/cli.js +1 -1
  30. package/dist/{control-CBShYYA6.d.ts → control-BT4qnXiS.d.ts} +2 -2
  31. package/dist/{control-runtime-BuJHoLg0.d.ts → control-runtime-BZ_lVLYW.d.ts} +1 -0
  32. package/dist/control.d.ts +3 -3
  33. package/dist/control.js +2 -2
  34. package/dist/{failure-cluster-C2EGSDiT.d.ts → failure-cluster-Cw65_5FY.d.ts} +1 -2
  35. package/dist/{feedback-trajectory-DfFdrraJ.d.ts → feedback-trajectory-D1aGKusy.d.ts} +1 -1
  36. package/dist/governance/index.d.ts +1 -1
  37. package/dist/{index-D3iBCjdF.d.ts → index-BhLlu-qO.d.ts} +1 -1
  38. package/dist/index.d.ts +157 -167
  39. package/dist/index.js +25 -335
  40. package/dist/index.js.map +1 -1
  41. package/dist/knowledge/index.d.ts +1 -1
  42. package/dist/knowledge/index.js +2 -2
  43. package/dist/{multi-layer-verifier-LkP3LVKj.d.ts → multi-layer-verifier-U-c8ge1k.d.ts} +1 -1
  44. package/dist/openapi.json +1 -1
  45. package/dist/optimization.d.ts +5 -5
  46. package/dist/optimization.js +5 -5
  47. package/dist/pipelines/index.d.ts +1 -1
  48. package/dist/pipelines/index.js +2 -2
  49. package/dist/{release-report-wfUySN5F.d.ts → release-report-CCQqnK46.d.ts} +1 -1
  50. package/dist/{replay-BL96gCEP.d.ts → replay-D7z0J43-.d.ts} +4 -5
  51. package/dist/reporting.d.ts +4 -4
  52. package/dist/reporting.js +5 -5
  53. package/dist/{researcher-bGkI7vCl.d.ts → researcher-G81CWc0q.d.ts} +9 -10
  54. package/dist/rl.d.ts +26 -44
  55. package/dist/rl.js +5 -5
  56. package/dist/rl.js.map +1 -1
  57. package/dist/{sequential-Dgz1n51-.d.ts → sequential-5iSVfzl2.d.ts} +2 -2
  58. package/dist/{summary-report-DZVXOCK_.d.ts → summary-report-Dl4akLKX.d.ts} +5 -5
  59. package/dist/traces.d.ts +1 -1
  60. package/dist/traces.js +2 -2
  61. package/dist/wire/index.d.ts +2 -2
  62. package/dist/wire/index.js +1 -1
  63. package/docs/research-report-methodology.md +4 -4
  64. package/docs/three-package-architecture.md +12 -24
  65. package/package.json +1 -1
  66. package/dist/chunk-2A5XJB43.js.map +0 -1
  67. package/dist/chunk-4F5DQN55.js.map +0 -1
  68. package/dist/chunk-5LBB5B3Z.js.map +0 -1
  69. package/dist/chunk-I4MBDTY5.js +0 -272
  70. package/dist/chunk-I4MBDTY5.js.map +0 -1
  71. package/dist/chunk-JLZQWFV3.js.map +0 -1
  72. package/dist/chunk-K2TPS5LB.js.map +0 -1
  73. package/dist/chunk-LSH4MMOZ.js.map +0 -1
  74. package/dist/chunk-NU65VQ7M.js.map +0 -1
  75. package/dist/chunk-OWLAAMME.js.map +0 -1
  76. package/dist/chunk-SESZDQPX.js.map +0 -1
  77. package/dist/chunk-WHZMVFUV.js.map +0 -1
  78. /package/dist/{chunk-WWYCWKUM.js.map → chunk-3CKU6VGU.js.map} +0 -0
  79. /package/dist/{chunk-RAF443UI.js.map → chunk-DBIGN5MJ.js.map} +0 -0
  80. /package/dist/{chunk-ZN274SWR.js.map → chunk-PALJO75S.js.map} +0 -0
@@ -0,0 +1,722 @@
1
+ import {
2
+ ValidationError
3
+ } from "./chunk-NG236HPC.js";
4
+
5
+ // src/judge-calibration.ts
6
/**
 * Compare a candidate judge's scores against human golden labels.
 *
 * Items are paired by `itemId`; only golden items that received a finite
 * candidate score take part in the statistics.
 *
 * @param {Iterable<{itemId: *, humanScore: number}>} golden - Human reference scores.
 * @param {Iterable<{itemId: *, score: number}>} candidate - Judge scores to calibrate.
 * @returns {{n: number, pearson: number, kappa: number, mae: number, worstItems: Array<{itemId: *, judge: number, human: number, delta: number}>}}
 *   `n` is the number of paired items. With fewer than two pairs every
 *   statistic is NaN and `worstItems` is empty; otherwise `worstItems` holds
 *   up to five pairs with the largest absolute judge/human gap.
 */
function calibrateJudge(golden, candidate) {
  const byItem = new Map();
  for (const label of golden) {
    byItem.set(label.itemId, { h: label.humanScore, j: NaN });
  }
  for (const verdict of candidate) {
    const slot = byItem.get(verdict.itemId);
    if (slot) {
      slot.j = verdict.score;
    }
  }

  // Keep only items the judge actually scored, in golden insertion order.
  const paired = [];
  for (const [itemId, { h, j }] of byItem) {
    if (Number.isFinite(j)) {
      paired.push({ itemId, judge: j, human: h, delta: Math.abs(j - h) });
    }
  }

  const n = paired.length;
  if (n < 2) {
    return { n, pearson: NaN, kappa: NaN, mae: NaN, worstItems: [] };
  }

  const humans = paired.map((row) => row.human);
  const judges = paired.map((row) => row.judge);
  const toNearestInt = (value) => Math.round(value);
  const absTotal = paired.reduce((acc, row) => acc + row.delta, 0);

  return {
    n,
    pearson: pearsonR(humans, judges),
    // The discrete kappa works on integer categories, hence the rounding.
    kappa: weightedKappa(humans.map(toNearestInt), judges.map(toNearestInt)),
    mae: absTotal / n,
    worstItems: [...paired].sort((x, y) => y.delta - x.delta).slice(0, 5),
  };
}
27
/**
 * Measure how much a judge's score shifts with presentation order.
 *
 * For every item scored both with `positionOfAInput === "first"` and
 * `"second"`, the difference (first − second) is averaged.
 *
 * @param {Iterable<{itemId: *, positionOfAInput: string, score: number}>} scores
 * @returns {{avgDelta: number, n: number}} Mean difference and the number of
 *   items that had both positions; `{avgDelta: 0, n: 0}` when none did.
 */
function positionalBias(scores) {
  const slots = new Map();
  for (const entry of scores) {
    const slot = slots.get(entry.itemId) ?? {};
    switch (entry.positionOfAInput) {
      case "first":
        slot.first = entry.score;
        break;
      case "second":
        slot.second = entry.score;
        break;
      default:
        // Other positions are ignored but the item is still registered.
        break;
    }
    slots.set(entry.itemId, slot);
  }

  let total = 0;
  let count = 0;
  for (const slot of slots.values()) {
    if (slot.first === undefined || slot.second === undefined) continue;
    total += slot.first - slot.second;
    count += 1;
  }

  if (count === 0) {
    return { avgDelta: 0, n: 0 };
  }
  return { avgDelta: total / count, n: count };
}
42
/**
 * Correlate output length with judge score.
 *
 * @param {Array<{outputLen: number, score: number}>} samples
 * @returns {{pearson: number, n: number}} Pearson correlation between
 *   `outputLen` and `score`; NaN when fewer than three samples are supplied.
 */
function verbosityBias(samples) {
  const n = samples.length;
  if (n < 3) {
    return { pearson: NaN, n };
  }
  const lengths = samples.map((sample) => sample.outputLen);
  const ratings = samples.map((sample) => sample.score);
  const pearson = pearsonR(lengths, ratings);
  return { pearson, n };
}
53
/**
 * Compare mean scores given to in-family versus out-of-family outputs.
 *
 * @param {Array<{inFamily: *, score: number}>} samples - `inFamily` is read
 *   for truthiness.
 * @returns {{inFamilyMean: number, outOfFamilyMean: number, deltaMean: number, n: number}}
 *   All fields are 0 when either group is empty; otherwise `n` is the total
 *   number of samples and `deltaMean` is in-family minus out-of-family.
 */
function selfPreference(samples) {
  let inTotal = 0;
  let inCount = 0;
  let outTotal = 0;
  let outCount = 0;
  samples.forEach((sample) => {
    if (sample.inFamily) {
      inTotal += sample.score;
      inCount += 1;
    } else {
      outTotal += sample.score;
      outCount += 1;
    }
  });

  if (inCount === 0 || outCount === 0) {
    return { inFamilyMean: 0, outOfFamilyMean: 0, deltaMean: 0, n: 0 };
  }

  const inFamilyMean = inTotal / inCount;
  const outOfFamilyMean = outTotal / outCount;
  return {
    inFamilyMean,
    outOfFamilyMean,
    deltaMean: inFamilyMean - outOfFamilyMean,
    n: samples.length,
  };
}
67
/**
 * Pearson product-moment correlation of two equally long numeric arrays.
 *
 * @param {number[]} a
 * @param {number[]} b
 * @returns {number} NaN when the lengths differ or fewer than two points are
 *   given. When a series has zero variance the result is 1 if both are
 *   constant and 0 if only one is.
 */
function pearsonR(a, b) {
  const size = a.length;
  if (size !== b.length || size < 2) return NaN;

  const meanOf = (xs) => xs.reduce((acc, value) => acc + value, 0) / xs.length;
  const meanA = meanOf(a);
  const meanB = meanOf(b);

  let covariance = 0;
  let spreadA = 0;
  let spreadB = 0;
  for (let idx = 0; idx < size; idx += 1) {
    const devA = a[idx] - meanA;
    const devB = b[idx] - meanB;
    covariance += devA * devB;
    spreadA += devA * devA;
    spreadB += devB * devB;
  }

  const aIsFlat = spreadA === 0;
  const bIsFlat = spreadB === 0;
  if (aIsFlat || bIsFlat) {
    return aIsFlat && bIsFlat ? 1 : 0;
  }
  return covariance / Math.sqrt(spreadA * spreadB);
}
82
/**
 * Quadratic-weighted kappa between two lists of category labels.
 *
 * Categories span the contiguous range from the smallest to the largest value
 * seen in either list; values are used as array offsets, so callers are
 * expected to pass integers.
 *
 * @param {number[]} a - Ratings from the first rater.
 * @param {number[]} b - Ratings from the second rater (same length as `a`).
 * @returns {number} NaN for empty or unequal-length input; 1 when only one
 *   category occurs or the expected disagreement is zero.
 */
function weightedKappa(a, b) {
  const total = a.length;
  if (total !== b.length || total === 0) return NaN;

  const lowest = Math.min(...a, ...b);
  const highest = Math.max(...a, ...b);
  const categories = highest - lowest + 1;
  if (categories < 2) return 1;

  const zeros = () => new Array(categories).fill(0);
  const observed = Array.from({ length: categories }, zeros);
  const rowTotals = zeros();
  const colTotals = zeros();
  for (let idx = 0; idx < total; idx += 1) {
    const r = a[idx] - lowest;
    const c = b[idx] - lowest;
    const cells = observed[r];
    cells[c] = (cells[c] ?? 0) + 1;
    rowTotals[r] += 1;
    colTotals[c] += 1;
  }

  let observedDisagreement = 0;
  let expectedDisagreement = 0;
  for (let r = 0; r < categories; r += 1) {
    for (let c = 0; c < categories; c += 1) {
      // Quadratic penalty, normalised by the widest possible category gap.
      const penalty = (r - c) ** 2 / (categories - 1) ** 2;
      const expectedCount = rowTotals[r] * colTotals[c] / total;
      observedDisagreement += penalty * observed[r][c];
      expectedDisagreement += penalty * expectedCount;
    }
  }

  return expectedDisagreement === 0 ? 1 : 1 - observedDisagreement / expectedDisagreement;
}
112
/**
 * Agreement statistics for an items × raters matrix of continuous scores.
 *
 * Rows with fewer than two entries or any non-finite value are dropped, as are
 * rows whose length differs from the first surviving row.
 *
 * @param {number[][]} scores - One row per item, one column per rater.
 * @param {object} [opts]
 * @param {number} [opts.bootstrap=1000] - Number of bootstrap resamples; no
 *   confidence intervals are computed when this is not greater than 0.
 * @param {string} [opts.weights="quadratic"] - Weighting scheme forwarded to
 *   `continuousWeightedKappa`.
 * @param {number} [opts.seed=12648430] - Seed for the `mulberry32` generator.
 * @param {number} [opts.ciLevel=0.95] - Confidence level of the intervals.
 * @returns {{weightedKappa: number, icc: number, pearson: number, spearman: number, ci: {icc: number[], weightedKappa: number[]}, n: number, raters: number}}
 *   Every statistic is NaN when fewer than two usable rows or raters remain.
 */
function continuousAgreement(scores, opts = {}) {
  const resamples = opts.bootstrap ?? 1e3;
  const scheme = opts.weights ?? "quadratic";
  const seed = opts.seed ?? 12648430;
  const level = opts.ciLevel ?? 0.95;

  const finiteRows = scores.filter(
    (row) => row.length >= 2 && row.every((value) => Number.isFinite(value))
  );
  const raters = finiteRows[0]?.length ?? 0;
  const clean = finiteRows.filter((row) => row.length === raters);
  const rowCount = clean.length;

  const report = (weightedKappa, icc, pearson, spearman, iccCi, kappaCi) => ({
    weightedKappa,
    icc,
    pearson,
    spearman,
    ci: { icc: iccCi, weightedKappa: kappaCi },
    n: rowCount,
    raters,
  });

  if (rowCount < 2 || raters < 2) {
    return report(NaN, NaN, NaN, NaN, [NaN, NaN], [NaN, NaN]);
  }

  const kappa = continuousWeightedKappa(clean, scheme);
  const icc = icc21(clean);
  const pearson = avgPairwise(clean, pearsonR);
  const spearman = avgPairwise(clean, spearmanR);

  let iccCi = [NaN, NaN];
  let kappaCi = [NaN, NaN];
  if (resamples > 0) {
    const nextRandom = mulberry32(seed);
    const ascending = (x, y) => x - y;
    const iccDraws = [];
    const kappaDraws = [];
    for (let draw = 0; draw < resamples; draw += 1) {
      // Resample items (rows) with replacement.
      const resample = Array.from(
        { length: rowCount },
        () => clean[Math.floor(nextRandom() * rowCount)]
      );
      const iccDraw = icc21(resample);
      const kappaDraw = continuousWeightedKappa(resample, scheme);
      if (Number.isFinite(iccDraw)) iccDraws.push(iccDraw);
      if (Number.isFinite(kappaDraw)) kappaDraws.push(kappaDraw);
    }

    const [lowerQ, upperQ] = percentileBounds(level);
    if (iccDraws.length > 0) {
      iccDraws.sort(ascending);
      iccCi = [quantile(iccDraws, lowerQ), quantile(iccDraws, upperQ)];
    }
    if (kappaDraws.length > 0) {
      kappaDraws.sort(ascending);
      kappaCi = [quantile(kappaDraws, lowerQ), quantile(kappaDraws, upperQ)];
    }
  }

  return report(kappa, icc, pearson, spearman, iccCi, kappaCi);
}
174
/**
 * `calibrateJudge` extended with continuous agreement statistics.
 *
 * Human and judge scores are paired by `itemId` (finite judge scores only)
 * and passed to `continuousAgreement` as a two-rater matrix.
 *
 * @param {Iterable<{itemId: *, humanScore: number}>} golden
 * @param {Iterable<{itemId: *, score: number}>} candidate
 * @param {object} [opts] - Forwarded unchanged to `continuousAgreement`.
 * @returns {object} The `calibrateJudge` result plus `weightedKappaContinuous`,
 *   `icc`, `spearman` and `ci`.
 */
function calibrateJudgeContinuous(golden, candidate, opts = {}) {
  const summary = calibrateJudge(golden, candidate);

  const byItem = new Map();
  for (const label of golden) {
    byItem.set(label.itemId, { h: label.humanScore, j: NaN });
  }
  for (const verdict of candidate) {
    const slot = byItem.get(verdict.itemId);
    if (slot) {
      slot.j = verdict.score;
    }
  }

  const pairs = [];
  byItem.forEach(({ h, j }) => {
    if (Number.isFinite(j)) {
      pairs.push([h, j]);
    }
  });

  const { weightedKappa, icc, spearman, ci } = continuousAgreement(pairs, opts);
  return {
    ...summary,
    weightedKappaContinuous: weightedKappa,
    icc,
    spearman,
    ci,
  };
}
195
/**
 * Weighted kappa for continuous ratings, averaged over every pair of raters.
 *
 * For each rater pair, observed disagreement is the mean penalty between
 * matched ratings; expected disagreement is the mean penalty over all
 * cross-combinations of the two raters' ratings.
 *
 * @param {number[][]} rows - One row per item, one column per rater.
 * @param {string} scheme - `"linear"` uses |x − y|; anything else uses (x − y)².
 * @returns {number} NaN when there are no rows or fewer than two raters. A
 *   pair with zero expected disagreement contributes 1 if its observed
 *   disagreement is also zero, else 0.
 */
function continuousWeightedKappa(rows, scheme) {
  if (rows.length === 0) return NaN;
  const raterCount = rows[0].length;
  if (raterCount < 2) return NaN;

  const penalty = scheme === "linear"
    ? (x, y) => Math.abs(x - y)
    : (x, y) => (x - y) ** 2;
  const column = (idx) => rows.map((row) => row[idx]);

  let kappaTotal = 0;
  let pairCount = 0;
  for (let first = 0; first < raterCount; first += 1) {
    for (let second = first + 1; second < raterCount; second += 1) {
      const left = column(first);
      const right = column(second);
      const size = left.length;

      let observed = 0;
      for (let i = 0; i < size; i += 1) {
        observed += penalty(left[i], right[i]);
      }
      observed /= size;

      let expected = 0;
      for (let i = 0; i < size; i += 1) {
        for (let j = 0; j < size; j += 1) {
          expected += penalty(left[i], right[j]);
        }
      }
      expected /= size * size;

      if (expected !== 0) {
        kappaTotal += 1 - observed / expected;
      } else if (observed === 0) {
        kappaTotal += 1;
      } else {
        kappaTotal += 0;
      }
      pairCount += 1;
    }
  }
  return pairCount === 0 ? NaN : kappaTotal / pairCount;
}
225
/**
 * Intraclass correlation from a two-way ANOVA decomposition, using the
 * formula (MSR − MSE) / (MSR + (k − 1)·MSE + k·(MSC − MSE) / n).
 *
 * @param {number[][]} rows - n items (rows) by k raters (columns).
 * @returns {number} NaN when there are fewer than two rows or two columns.
 *   When the denominator is zero the result is 1 if both MSR and MSE are
 *   zero, otherwise 0.
 */
function icc21(rows) {
  const subjects = rows.length;
  if (subjects < 2) return NaN;
  const raters = rows[0].length;
  if (raters < 2) return NaN;

  const subjectMeans = rows.map(
    (row) => row.reduce((acc, value) => acc + value, 0) / raters
  );
  const raterMeans = new Array(raters).fill(0);
  for (let col = 0; col < raters; col += 1) {
    let columnTotal = 0;
    for (let r = 0; r < subjects; r += 1) {
      columnTotal += rows[r][col];
    }
    raterMeans[col] = columnTotal / subjects;
  }

  let grandMean = 0;
  for (let r = 0; r < subjects; r += 1) {
    grandMean += subjectMeans[r];
  }
  grandMean /= subjects;

  // Sums of squares: between subjects, between raters, and total.
  let ssSubjects = 0;
  for (let r = 0; r < subjects; r += 1) {
    ssSubjects += (subjectMeans[r] - grandMean) ** 2;
  }
  ssSubjects *= raters;

  let ssRaters = 0;
  for (let col = 0; col < raters; col += 1) {
    ssRaters += (raterMeans[col] - grandMean) ** 2;
  }
  ssRaters *= subjects;

  let ssTotal = 0;
  for (let r = 0; r < subjects; r += 1) {
    for (let col = 0; col < raters; col += 1) {
      ssTotal += (rows[r][col] - grandMean) ** 2;
    }
  }
  const ssError = ssTotal - ssSubjects - ssRaters;

  const dfError = (subjects - 1) * (raters - 1);
  const msSubjects = ssSubjects / (subjects - 1);
  const msRaters = ssRaters / (raters - 1);
  const msError = dfError > 0 ? ssError / dfError : 0;

  const denominator =
    msSubjects + (raters - 1) * msError + raters * (msRaters - msError) / subjects;
  if (denominator === 0) {
    return msSubjects === 0 && msError === 0 ? 1 : 0;
  }
  return (msSubjects - msError) / denominator;
}
263
/**
 * Average a pairwise statistic over every pair of rater columns.
 *
 * @param {number[][]} rows - One row per item, one column per rater.
 * @param {(a: number[], b: number[]) => number} fn - Statistic applied to two columns.
 * @returns {number} Mean of the finite results; NaN when there are fewer than
 *   two columns or no pair yields a finite value.
 */
function avgPairwise(rows, fn) {
  const columns = rows[0]?.length ?? 0;
  if (columns < 2) return NaN;

  const columnAt = (idx) => rows.map((row) => row[idx]);
  let total = 0;
  let used = 0;
  for (let left = 0; left < columns; left += 1) {
    for (let right = left + 1; right < columns; right += 1) {
      const value = fn(columnAt(left), columnAt(right));
      if (!Number.isFinite(value)) continue;
      total += value;
      used += 1;
    }
  }
  return used === 0 ? NaN : total / used;
}
281
/**
 * Spearman rank correlation: Pearson correlation of tie-averaged ranks.
 *
 * @param {number[]} a
 * @param {number[]} b
 * @returns {number} NaN when the lengths differ or fewer than two points are given.
 */
function spearmanR(a, b) {
  const sameLength = a.length === b.length;
  const tooShort = a.length < 2;
  if (!sameLength || tooShort) {
    return NaN;
  }
  const ranksA = rankWithTies(a);
  const ranksB = rankWithTies(b);
  return pearsonR(ranksA, ranksB);
}
285
/**
 * Assign 1-based ascending ranks, giving tied values the average of the
 * ranks they occupy.
 *
 * @param {number[]} xs
 * @returns {number[]} Ranks aligned with the positions of `xs`.
 */
function rankWithTies(xs) {
  const count = xs.length;
  const order = xs
    .map((value, index) => ({ value, index }))
    .sort((p, q) => p.value - q.value);
  const ranks = new Array(count).fill(0);

  for (let start = 0; start < count; ) {
    // Extend `end` over the run of values equal to the one at `start`.
    let end = start;
    while (end + 1 < count && order[end + 1].value === order[start].value) {
      end += 1;
    }
    const shared = (start + end) / 2 + 1;
    for (let pos = start; pos <= end; pos += 1) {
      ranks[order[pos].index] = shared;
    }
    start = end + 1;
  }
  return ranks;
}
300
/**
 * Create a seeded pseudo-random generator with 32 bits of state.
 *
 * @param {number} seed - Coerced to an unsigned 32-bit integer.
 * @returns {() => number} Function returning the next value in [0, 1); the
 *   same seed always yields the same sequence.
 */
function mulberry32(seed) {
  let state = seed >>> 0;
  return function next() {
    state = (state + 0x6d2b79f5) >>> 0;
    const mixed = Math.imul(state ^ (state >>> 15), state | 1);
    const scrambled = mixed ^ (mixed + Math.imul(mixed ^ (mixed >>> 7), mixed | 61));
    // Reinterpret as unsigned and scale by 2^32.
    return ((scrambled ^ (scrambled >>> 14)) >>> 0) / 4294967296;
  };
}
310
/**
 * Convert a two-sided confidence level into lower/upper quantile positions.
 *
 * @param {number} ciLevel - Confidence level, e.g. 0.95.
 * @returns {[number, number]} `[tail, 1 - tail]` with `tail = (1 - ciLevel) / 2`.
 */
function percentileBounds(ciLevel) {
  const halfAlpha = (1 - ciLevel) / 2;
  const lower = halfAlpha;
  const upper = 1 - halfAlpha;
  return [lower, upper];
}
314
/**
 * Linearly interpolated quantile of an ascending-sorted array.
 *
 * @param {number[]} sorted - Values already sorted ascending.
 * @param {number} q - Quantile position; 0 maps to the first element and 1 to the last.
 * @returns {number} NaN for an empty array.
 */
function quantile(sorted, q) {
  const size = sorted.length;
  switch (size) {
    case 0:
      return NaN;
    case 1:
      return sorted[0];
    default:
      break;
  }

  const position = q * (size - 1);
  const below = Math.floor(position);
  const above = Math.ceil(position);
  if (below === above) {
    return sorted[below];
  }
  const weight = position - below;
  return sorted[below] * (1 - weight) + sorted[above] * weight;
}
324
+
325
+ // src/statistics.ts
326
// Dimension names that normalizeScores checks for.
var INVERTED_DIMENSIONS = /* @__PURE__ */ new Set([
  "hallucination",
  "false_confidence",
  "worst_failure",
]);

/**
 * Return a new array containing the same score objects.
 *
 * Each entry's `dimension` is looked up in `INVERTED_DIMENSIONS`, but both
 * outcomes currently return the entry untouched, so this is a shallow copy.
 *
 * @param {Array<{dimension: string}>} scores
 * @returns {Array<{dimension: string}>} New array holding the original objects.
 */
function normalizeScores(scores) {
  const passThrough = (entry) => {
    const inverted = INVERTED_DIMENSIONS.has(entry.dimension);
    return inverted ? entry : entry;
  };
  return scores.map(passThrough);
}
335
/**
 * Weighted arithmetic mean of scores.
 *
 * @param {Array<{score: number, weight?: number}>} scores - A missing
 *   (null/undefined) weight counts as 1.
 * @returns {number} 0 for an empty list or when the total weight is not positive.
 */
function weightedMean(scores) {
  if (scores.length === 0) return 0;

  const totals = { weight: 0, weighted: 0 };
  for (const entry of scores) {
    const weight = entry.weight ?? 1;
    totals.weighted += entry.score * weight;
    totals.weight += weight;
  }

  if (totals.weight > 0) {
    return totals.weighted / totals.weight;
  }
  return 0;
}
346
/**
 * Percentile-bootstrap confidence interval for the mean of `scores`.
 *
 * Draws 1000 resamples with `Math.random`, so the bounds vary between calls
 * unless every score is identical.
 *
 * @param {number[]} scores
 * @param {number} [confidence=0.95] - Two-sided confidence level.
 * @returns {{mean: number, lower: number, upper: number}} All zeros for empty
 *   input; all three equal the single value for one-element input.
 */
function confidenceInterval(scores, confidence = 0.95) {
  const n = scores.length;
  if (n === 0) {
    return { mean: 0, lower: 0, upper: 0 };
  }
  if (n === 1) {
    return { mean: scores[0], lower: scores[0], upper: scores[0] };
  }

  const mean = scores.reduce((acc, value) => acc + value, 0) / n;

  const RESAMPLES = 1e3;
  const resampledMeans = [];
  while (resampledMeans.length < RESAMPLES) {
    let total = 0;
    for (let pick = 0; pick < n; pick += 1) {
      total += scores[Math.floor(Math.random() * n)];
    }
    resampledMeans.push(total / n);
  }
  resampledMeans.sort((x, y) => x - y);

  const alpha = 1 - confidence;
  const lowerIdx = Math.floor(alpha / 2 * RESAMPLES);
  const upperIdx = Math.floor((1 - alpha / 2) * RESAMPLES) - 1;
  return {
    mean,
    lower: resampledMeans[lowerIdx],
    upper: resampledMeans[Math.min(upperIdx, RESAMPLES - 1)],
  };
}
370
/**
 * Agreement coefficient across judges: 1 − observed / expected squared
 * disagreement.
 *
 * Scores are grouped per dimension in arrival order: a score joins the most
 * recent group for its dimension until that group holds as many ratings as
 * there are judges, then a new group is started. Observed disagreement is the
 * mean squared difference within groups; expected disagreement is the mean
 * squared difference over all pairs of grouped values.
 *
 * @param {Array<Array<{dimension: string, score: number}>>} judgeScores - One
 *   score list per judge.
 * @returns {number} 1 when fewer than two judges are given, when no group has
 *   at least two ratings, or when the expected disagreement is zero.
 */
function interRaterReliability(judgeScores) {
  const judgeCount = judgeScores.length;
  if (judgeCount < 2) return 1;

  const groupsByDimension = new Map();
  for (const judgeSet of judgeScores) {
    for (const { dimension, score } of judgeSet) {
      let groups = groupsByDimension.get(dimension);
      if (groups === undefined) {
        groups = [];
        groupsByDimension.set(dimension, groups);
      }
      const latest = groups[groups.length - 1];
      if (latest !== undefined && latest.length < judgeScores.length) {
        latest.push(score);
      } else {
        groups.push([score]);
      }
    }
  }

  const pooled = [];
  const withinGroup = [];
  for (const groups of groupsByDimension.values()) {
    for (const ratings of groups) {
      if (ratings.length < 2) continue;
      pooled.push(...ratings);
      ratings.forEach((left, idx) => {
        for (let other = idx + 1; other < ratings.length; other += 1) {
          withinGroup.push((left - ratings[other]) ** 2);
        }
      });
    }
  }
  if (withinGroup.length === 0 || pooled.length < 2) return 1;

  const observed = withinGroup.reduce((acc, d) => acc + d, 0) / withinGroup.length;

  let expectedTotal = 0;
  let expectedPairs = 0;
  for (let i = 0; i < pooled.length; i += 1) {
    for (let j = i + 1; j < pooled.length; j += 1) {
      expectedTotal += (pooled[i] - pooled[j]) ** 2;
      expectedPairs += 1;
    }
  }
  const expected = expectedPairs > 0 ? expectedTotal / expectedPairs : 0;

  return expected === 0 ? 1 : 1 - observed / expected;
}
411
/**
 * Mann–Whitney U test with a normal approximation for the two-sided p-value.
 *
 * Ties receive averaged ranks; the variance uses no tie correction.
 *
 * @param {number[]} a - First sample.
 * @param {number[]} b - Second sample.
 * @returns {{u: number, p: number}} `u` is the smaller of the two U statistics.
 *   Returns `{u: 0, p: 1}` when either sample is empty and `{u: NaN, p: NaN}`
 *   when any value is NaN, since NaN cannot be ranked.
 */
function mannWhitneyU(a, b) {
  if (a.length === 0 || b.length === 0) return { u: 0, p: 1 };

  // A NaN never equals itself, which previously stalled the tie-grouping loop
  // below forever. Report the result as not computable instead.
  const containsNaN = (xs) => xs.some((v) => Number.isNaN(v));
  if (containsNaN(a) || containsNaN(b)) return { u: NaN, p: NaN };

  const n1 = a.length;
  const n2 = b.length;
  const combined = [
    ...a.map((v) => ({ v, group: "a" })),
    ...b.map((v) => ({ v, group: "b" }))
  ].sort((x, y) => x.v - y.v);

  // Assign 1-based ranks, averaging over runs of equal values.
  const ranks = new Array(combined.length);
  let i = 0;
  while (i < combined.length) {
    // Start at i + 1 so every pass consumes at least one element.
    let j = i + 1;
    while (j < combined.length && combined[j].v === combined[i].v) j++;
    const avgRank = (i + 1 + j) / 2;
    for (let k = i; k < j; k++) ranks[k] = avgRank;
    i = j;
  }

  let r1 = 0;
  for (let k = 0; k < combined.length; k++) {
    if (combined[k].group === "a") r1 += ranks[k];
  }

  const u1 = r1 - n1 * (n1 + 1) / 2;
  const u2 = n1 * n2 - u1;
  const u = Math.min(u1, u2);
  const mu = n1 * n2 / 2;
  const sigma = Math.sqrt(n1 * n2 * (n1 + n2 + 1) / 12);
  if (sigma === 0) return { u, p: 1 };
  const z = Math.abs(u - mu) / sigma;
  const p = 2 * (1 - normalCdf(z));
  return { u, p };
}
442
/**
 * Fraction of a target that has been reached, clamped to [0, 1].
 *
 * @param {number} current - Achieved amount.
 * @param {number} target - Goal amount.
 * @returns {number} 1 when `target` is zero or negative; otherwise
 *   `current / target` clamped between 0 and 1.
 */
function partialCredit(current, target) {
  if (target <= 0) {
    return 1;
  }
  const ratio = current / target;
  const atLeastZero = Math.max(0, ratio);
  return Math.min(1, atLeastZero);
}
446
/**
 * Two-sided paired t-test on the differences `after[i] − before[i]`.
 *
 * @param {number[]} before
 * @param {number[]} after - Must have the same length as `before`.
 * @returns {{t: number, df: number, p: number}} `{t: 0, df: 0, p: 1}` for fewer
 *   than two pairs. When all differences are identical the standard error is
 *   zero: the result is t = 0, p = 1 if that difference is zero, otherwise
 *   t = Infinity, p = 0.
 * @throws {ValidationError} When the two arrays differ in length.
 */
function pairedTTest(before, after) {
  if (before.length !== after.length) {
    throw new ValidationError(
      `pairedTTest: unequal sample sizes (${before.length} vs ${after.length})`
    );
  }

  const n = before.length;
  if (n < 2) {
    return { t: 0, df: 0, p: 1 };
  }

  const df = n - 1;
  const deltas = before.map((value, idx) => after[idx] - value);
  const mean = deltas.reduce((acc, d) => acc + d, 0) / n;
  const variance = deltas.reduce((acc, d) => acc + (d - mean) ** 2, 0) / df;
  const standardError = Math.sqrt(variance / n);

  if (standardError === 0) {
    return mean === 0 ? { t: 0, df, p: 1 } : { t: Infinity, df, p: 0 };
  }

  const t = mean / standardError;
  const upperTail = 1 - studentTCdf(Math.abs(t), df);
  return { t, df, p: 2 * upperTail };
}
464
/**
 * Wilcoxon signed-rank test with a normal approximation for the two-sided
 * p-value.
 *
 * Zero differences are discarded; tied absolute differences receive averaged
 * ranks.
 *
 * @param {number[]} before
 * @param {number[]} after - Must have the same length as `before`.
 * @returns {{w: number, p: number}} `w` is the sum of ranks of the positive
 *   differences. Returns `{w: 0, p: 1}` when fewer than six non-zero
 *   differences remain, and `{w: NaN, p: NaN}` when six or more remain and any
 *   of them is NaN, since NaN cannot be ranked.
 * @throws {ValidationError} When the two arrays differ in length.
 */
function wilcoxonSignedRank(before, after) {
  if (before.length !== after.length) {
    throw new ValidationError(
      `wilcoxonSignedRank: unequal sample sizes (${before.length} vs ${after.length})`
    );
  }
  const diffs = before.map((b, idx) => after[idx] - b).filter((d) => d !== 0);
  const n = diffs.length;
  if (n < 6) return { w: 0, p: 1 };

  // A NaN never equals itself, which previously stalled the tie-grouping loop
  // below forever. Report the result as not computable instead.
  if (diffs.some((d) => Number.isNaN(d))) return { w: NaN, p: NaN };

  const byMagnitude = diffs
    .map((d, idx) => ({ abs: Math.abs(d), i: idx }))
    .sort((a, b) => a.abs - b.abs);

  // ranks[k] is the 1-based (tie-averaged) rank of |diffs[k]|.
  const ranks = new Array(n);
  let i = 0;
  while (i < n) {
    // Start at i + 1 so every pass consumes at least one element.
    let j = i + 1;
    while (j < n && byMagnitude[j].abs === byMagnitude[i].abs) j++;
    const avg = (i + 1 + j) / 2;
    for (let k = i; k < j; k++) ranks[byMagnitude[k].i] = avg;
    i = j;
  }

  let wPlus = 0;
  for (let k = 0; k < n; k++) if (diffs[k] > 0) wPlus += ranks[k];

  const mean = n * (n + 1) / 4;
  const variance = n * (n + 1) * (2 * n + 1) / 24;
  const z = (wPlus - mean) / Math.sqrt(variance);
  const p = 2 * (1 - normalCdf(Math.abs(z)));
  return { w: wPlus, p };
}
491
/**
 * Cohen's d effect size (mean of `b` minus mean of `a`) using the pooled
 * sample standard deviation.
 *
 * @param {number[]} a
 * @param {number[]} b
 * @returns {number} 0 when either sample has fewer than two values or the
 *   pooled standard deviation is zero.
 */
function cohensD(a, b) {
  if (a.length < 2 || b.length < 2) return 0;

  const meanOf = (xs) => xs.reduce((acc, value) => acc + value, 0) / xs.length;
  const sampleVariance = (xs, center) =>
    xs.reduce((acc, value) => acc + (value - center) ** 2, 0) / (xs.length - 1);

  const meanA = meanOf(a);
  const meanB = meanOf(b);
  const varianceA = sampleVariance(a, meanA);
  const varianceB = sampleVariance(b, meanB);

  const pooledVariance =
    ((a.length - 1) * varianceA + (b.length - 1) * varianceB) / (a.length + b.length - 2);
  const pooledSd = Math.sqrt(pooledVariance);
  return pooledSd === 0 ? 0 : (meanB - meanA) / pooledSd;
}
503
/**
 * Per-dimension inter-rater agreement over a corpus of flat score records.
 *
 * For each dimension, the items rated by every target judge form an
 * items × judges matrix that is passed to `continuousAgreement`.
 *
 * @param {Array<{itemId: *, judgeName: *, dimension: *, score: number}>} records
 * @param {object} [opts] - Also forwarded to `continuousAgreement`.
 * @param {Iterable} [opts.dimensions] - Dimensions to analyse; defaults to all
 *   seen, sorted.
 * @param {Iterable} [opts.judges] - Judges to include, in column order;
 *   defaults to all seen, sorted.
 * @returns {{perDimension: object[], overallIcc: number, overallWeightedKappa: number, dimensions: *, judgeIds: Array}}
 *   The overall values are means of the finite per-dimension values (NaN if none).
 * @throws {ValidationError} On empty input, a non-finite score, a duplicate
 *   (item, judge, dimension) record, a requested dimension or judge with no
 *   records, fewer than two judges, a judge with no scores in a dimension, or
 *   fewer than two items shared by all judges in a dimension.
 */
function corpusInterRaterAgreement(records, opts = {}) {
  if (records.length === 0) {
    throw new ValidationError("corpusInterRaterAgreement: no score records supplied");
  }

  // grid: dimension -> judge -> item -> score
  const judgesSeen = new Set();
  const dimsSeen = new Set();
  const grid = new Map();
  for (const { itemId, judgeName, dimension, score } of records) {
    if (!Number.isFinite(score)) {
      throw new ValidationError(
        `corpusInterRaterAgreement: non-finite score for (item=${itemId}, judge=${judgeName}, dim=${dimension})`
      );
    }
    judgesSeen.add(judgeName);
    dimsSeen.add(dimension);

    let judgesForDim = grid.get(dimension);
    if (judgesForDim == null) {
      judgesForDim = new Map();
      grid.set(dimension, judgesForDim);
    }
    let itemsForJudge = judgesForDim.get(judgeName);
    if (itemsForJudge == null) {
      itemsForJudge = new Map();
      judgesForDim.set(judgeName, itemsForJudge);
    }
    if (itemsForJudge.has(itemId)) {
      throw new ValidationError(
        `corpusInterRaterAgreement: duplicate record for (item=${itemId}, judge=${judgeName}, dim=${dimension})`
      );
    }
    itemsForJudge.set(itemId, score);
  }

  const targetDims = opts.dimensions ?? [...dimsSeen].sort();
  for (const requested of targetDims) {
    if (dimsSeen.has(requested)) continue;
    throw new ValidationError(
      `corpusInterRaterAgreement: dimension '${requested}' was requested but no records carry it`
    );
  }

  const targetJudges = opts.judges ? [...opts.judges] : [...judgesSeen].sort();
  for (const requested of targetJudges) {
    if (judgesSeen.has(requested)) continue;
    throw new ValidationError(
      `corpusInterRaterAgreement: judge '${requested}' was requested but produced no records`
    );
  }
  if (targetJudges.length < 2) {
    throw new ValidationError(
      `corpusInterRaterAgreement: need \u22652 judges, got ${targetJudges.length}`
    );
  }

  const [anchorJudge, ...otherJudges] = targetJudges;
  const perDimension = [];
  const iccs = [];
  const kappas = [];
  for (const dim of targetDims) {
    const byJudge = grid.get(dim);

    const judgeItemCounts = {};
    for (const judge of targetJudges) {
      judgeItemCounts[judge] = byJudge.get(judge)?.size ?? 0;
    }
    const emptyJudges = targetJudges.filter((judge) => judgeItemCounts[judge] === 0);
    if (emptyJudges.length > 0) {
      throw new ValidationError(
        `corpusInterRaterAgreement: dimension '${dim}' has no scores from judge(s) ${emptyJudges.join(", ")} (counts: ${JSON.stringify(judgeItemCounts)})`
      );
    }

    // Items rated by every target judge in this dimension.
    const sortedItems = [...byJudge.get(anchorJudge).keys()]
      .filter((id) => otherJudges.every((judge) => byJudge.get(judge).has(id)))
      .sort();
    if (sortedItems.length < 2) {
      throw new ValidationError(
        `corpusInterRaterAgreement: dimension '${dim}' has ${sortedItems.length} item(s) rated by all ${targetJudges.length} judges (need \u22652)`
      );
    }

    const matrix = sortedItems.map(
      (itemId) => targetJudges.map((judge) => byJudge.get(judge).get(itemId))
    );
    const agreement = continuousAgreement(matrix, opts);
    perDimension.push({
      ...agreement,
      dimension: dim,
      itemIds: sortedItems,
      judgeIds: [...targetJudges],
    });
    if (Number.isFinite(agreement.icc)) iccs.push(agreement.icc);
    if (Number.isFinite(agreement.weightedKappa)) kappas.push(agreement.weightedKappa);
  }

  const average = (values) =>
    values.length === 0 ? Number.NaN : values.reduce((acc, v) => acc + v, 0) / values.length;
  return {
    perDimension,
    overallIcc: average(iccs),
    overallWeightedKappa: average(kappas),
    dimensions: targetDims,
    judgeIds: targetJudges,
  };
}
604
/**
 * Adapter that flattens per-item judge scores into records and delegates to
 * `corpusInterRaterAgreement`.
 *
 * @param {Iterable<{itemId: *, scores: Iterable<{judgeName: *, dimension: *, score: number}>}>} itemsScores
 * @param {object} [opts] - Forwarded to `corpusInterRaterAgreement`.
 * @returns {object} The result of `corpusInterRaterAgreement`.
 * @throws {ValidationError} When the same `itemId` appears more than once.
 */
function corpusInterRaterAgreementFromJudgeScores(itemsScores, opts = {}) {
  const flattened = [];
  const knownItems = new Set();
  for (const entry of itemsScores) {
    const { itemId, scores } = entry;
    if (knownItems.has(itemId)) {
      throw new ValidationError(
        `corpusInterRaterAgreementFromJudgeScores: duplicate itemId '${itemId}'`
      );
    }
    knownItems.add(itemId);
    for (const { judgeName, dimension, score } of scores) {
      flattened.push({ itemId, judgeName, dimension, score });
    }
  }
  return corpusInterRaterAgreement(flattened, opts);
}
625
/**
 * Cumulative distribution function of Student's t distribution.
 *
 * @param {number} t - Test statistic.
 * @param {number} df - Degrees of freedom.
 * @returns {number} 0.5 when `df` is not positive; the standard normal CDF
 *   when `df` exceeds 100; otherwise a value derived from the regularized
 *   incomplete beta function.
 */
function studentTCdf(t, df) {
  if (df <= 0) {
    return 0.5;
  }
  if (df > 100) {
    return normalCdf(t);
  }
  const halfTail = 0.5 * incompleteBeta(df / (df + t * t), df / 2, 0.5);
  if (t >= 0) {
    return 1 - halfTail;
  }
  return halfTail;
}
634
/**
 * Regularized incomplete beta function I_x(a, b).
 *
 * The continued fraction converges quickly only for x ≤ (a + 1) / (a + b + 2).
 * Above that threshold the symmetry I_x(a, b) = 1 − I_{1−x}(b, a) is used, so
 * values of x close to 1 (small |t| in `studentTCdf`) are no longer cut off by
 * the iteration limit before converging.
 *
 * @param {number} x - Upper integration limit.
 * @param {number} a - First shape parameter (positive).
 * @param {number} b - Second shape parameter (positive).
 * @returns {number} 0 when x ≤ 0, 1 when x ≥ 1, otherwise I_x(a, b).
 */
function incompleteBeta(x, a, b) {
  if (x <= 0) return 0;
  if (x >= 1) return 1;
  if (x > (a + 1) / (a + b + 2)) {
    return 1 - incompleteBetaFraction(1 - x, b, a);
  }
  return incompleteBetaFraction(x, a, b);
}

/**
 * Evaluate I_x(a, b) for 0 < x < 1 as a prefactor times a continued fraction
 * (modified Lentz iteration). Accurate when x ≤ (a + 1) / (a + b + 2).
 *
 * @param {number} x
 * @param {number} a
 * @param {number} b
 * @returns {number}
 */
function incompleteBetaFraction(x, a, b) {
  const lnBeta = lnGamma(a) + lnGamma(b) - lnGamma(a + b);
  const front = Math.exp(Math.log(x) * a + Math.log(1 - x) * b - lnBeta) / a;
  const maxIter = 200;
  const eps = 3e-7;
  // Floor for intermediate values so the iteration never divides by zero.
  const tiny = 1e-30;
  let c = 1;
  let d = 1 - (a + b) * x / (a + 1);
  if (Math.abs(d) < tiny) d = tiny;
  d = 1 / d;
  let f = d;
  for (let m = 1; m <= maxIter; m++) {
    const m2 = 2 * m;
    // Even-numbered coefficient of the continued fraction.
    let num = m * (b - m) * x / ((a + m2 - 1) * (a + m2));
    d = 1 + num * d;
    if (Math.abs(d) < tiny) d = tiny;
    c = 1 + num / c;
    if (Math.abs(c) < tiny) c = tiny;
    d = 1 / d;
    f *= d * c;
    // Odd-numbered coefficient.
    num = -((a + m) * (a + b + m) * x) / ((a + m2) * (a + m2 + 1));
    d = 1 + num * d;
    if (Math.abs(d) < tiny) d = tiny;
    c = 1 + num / c;
    if (Math.abs(c) < tiny) c = tiny;
    d = 1 / d;
    const delta = d * c;
    f *= delta;
    // Stop once the latest factor no longer changes the product.
    if (Math.abs(delta - 1) < eps) break;
  }
  return front * f;
}

/**
 * Natural logarithm of the gamma function (Lanczos-style series with g = 7
 * and nine coefficients).
 *
 * @param {number} z
 * @returns {number} ln Γ(z); arguments below 0.5 go through the reflection
 *   formula.
 */
function lnGamma(z) {
  const g = 7;
  const coefs = [
    0.9999999999998099,
    676.5203681218851,
    -1259.1392167224028,
    771.3234287776531,
    -176.6150291621406,
    12.507343278686905,
    -0.13857109526572012,
    9984369578019572e-21,
    15056327351493116e-23
  ];
  if (z < 0.5) {
    return Math.log(Math.PI / Math.sin(Math.PI * z)) - lnGamma(1 - z);
  }
  z -= 1;
  let x = coefs[0];
  for (let i = 1; i < g + 2; i++) x += coefs[i] / (z + i);
  const t = z + g + 0.5;
  return 0.5 * Math.log(2 * Math.PI) + (z + 0.5) * Math.log(t) - t + Math.log(x);
}
689
/**
 * Standard normal cumulative distribution function Φ(x).
 *
 * Uses Φ(x) = ½·(1 + erf(x / √2)) with the five-term Abramowitz–Stegun
 * polynomial approximation of erf (formula 7.1.26).
 *
 * @param {number} x
 * @returns {number} Probability that a standard normal variable is ≤ x.
 */
function normalCdf(x) {
  const a1 = 0.254829592;
  const a2 = -0.284496736;
  const a3 = 1.421413741;
  const a4 = -1.453152027;
  const a5 = 1.061405429;
  const p = 0.3275911;
  const sign = x < 0 ? -1 : 1;
  // The erf approximation takes z = |x| / sqrt(2). The polynomial variable t
  // must be built from z as well; building it from |x| overstates the upper
  // tail (e.g. Φ(1.96) came out near 0.981 instead of 0.975).
  const z = Math.abs(x) / Math.SQRT2;
  const t = 1 / (1 + p * z);
  const erf = 1 - ((((a5 * t + a4) * t + a3) * t + a2) * t + a1) * t * Math.exp(-z * z);
  return 0.5 * (1 + sign * erf);
}
702
+
703
+ export {
704
+ calibrateJudge,
705
+ positionalBias,
706
+ verbosityBias,
707
+ selfPreference,
708
+ continuousAgreement,
709
+ calibrateJudgeContinuous,
710
+ normalizeScores,
711
+ weightedMean,
712
+ confidenceInterval,
713
+ interRaterReliability,
714
+ mannWhitneyU,
715
+ partialCredit,
716
+ pairedTTest,
717
+ wilcoxonSignedRank,
718
+ cohensD,
719
+ corpusInterRaterAgreement,
720
+ corpusInterRaterAgreementFromJudgeScores
721
+ };
722
+ //# sourceMappingURL=chunk-R5UQJNKC.js.map