@tangle-network/agent-eval 0.27.0 → 0.27.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. package/CHANGELOG.md +72 -0
  2. package/README.md +4 -5
  3. package/dist/builder-eval/index.js +1 -1
  4. package/dist/{chunk-WWYCWKUM.js → chunk-3CKU6VGU.js} +2 -2
  5. package/dist/{chunk-K2TPS5LB.js → chunk-4U4BKCXK.js} +2 -2
  6. package/dist/chunk-4U4BKCXK.js.map +1 -0
  7. package/dist/{chunk-2A5XJB43.js → chunk-5AKPEK5L.js} +3 -3
  8. package/dist/chunk-5AKPEK5L.js.map +1 -0
  9. package/dist/{chunk-RAF443UI.js → chunk-DBIGN5MJ.js} +2 -2
  10. package/dist/{chunk-JLZQWFV3.js → chunk-K33INZHH.js} +2 -2
  11. package/dist/chunk-K33INZHH.js.map +1 -0
  12. package/dist/{chunk-NU65VQ7M.js → chunk-MAZ26DC7.js} +1 -1
  13. package/dist/chunk-MAZ26DC7.js.map +1 -0
  14. package/dist/{chunk-LSH4MMOZ.js → chunk-NCRFYPS3.js} +1 -1
  15. package/dist/chunk-NCRFYPS3.js.map +1 -0
  16. package/dist/{chunk-ZN274SWR.js → chunk-PALJO75S.js} +2 -2
  17. package/dist/{chunk-OWLAAMME.js → chunk-QHF6EQKK.js} +3 -2
  18. package/dist/chunk-QHF6EQKK.js.map +1 -0
  19. package/dist/chunk-R5UQJNKC.js +722 -0
  20. package/dist/chunk-R5UQJNKC.js.map +1 -0
  21. package/dist/{chunk-SESZDQPX.js → chunk-RUI6SIHY.js} +3 -3
  22. package/dist/chunk-RUI6SIHY.js.map +1 -0
  23. package/dist/{chunk-WHZMVFUV.js → chunk-SZSBQUIJ.js} +2 -2
  24. package/dist/chunk-SZSBQUIJ.js.map +1 -0
  25. package/dist/{chunk-4F5DQN55.js → chunk-VSMTAMNK.js} +1 -1
  26. package/dist/chunk-VSMTAMNK.js.map +1 -0
  27. package/dist/{chunk-5LBB5B3Z.js → chunk-XFZCM5Z3.js} +1 -1
  28. package/dist/chunk-XFZCM5Z3.js.map +1 -0
  29. package/dist/cli.js +1 -1
  30. package/dist/{control-CBShYYA6.d.ts → control-BT4qnXiS.d.ts} +2 -2
  31. package/dist/{control-runtime-BuJHoLg0.d.ts → control-runtime-BZ_lVLYW.d.ts} +1 -0
  32. package/dist/control.d.ts +3 -3
  33. package/dist/control.js +2 -2
  34. package/dist/{failure-cluster-C2EGSDiT.d.ts → failure-cluster-Cw65_5FY.d.ts} +1 -2
  35. package/dist/{feedback-trajectory-DfFdrraJ.d.ts → feedback-trajectory-D1aGKusy.d.ts} +1 -1
  36. package/dist/governance/index.d.ts +1 -1
  37. package/dist/{index-D3iBCjdF.d.ts → index-BhLlu-qO.d.ts} +1 -1
  38. package/dist/index.d.ts +157 -167
  39. package/dist/index.js +25 -335
  40. package/dist/index.js.map +1 -1
  41. package/dist/knowledge/index.d.ts +1 -1
  42. package/dist/knowledge/index.js +2 -2
  43. package/dist/{multi-layer-verifier-LkP3LVKj.d.ts → multi-layer-verifier-U-c8ge1k.d.ts} +1 -1
  44. package/dist/openapi.json +1 -1
  45. package/dist/optimization.d.ts +5 -5
  46. package/dist/optimization.js +5 -5
  47. package/dist/pipelines/index.d.ts +1 -1
  48. package/dist/pipelines/index.js +2 -2
  49. package/dist/{release-report-wfUySN5F.d.ts → release-report-CCQqnK46.d.ts} +1 -1
  50. package/dist/{replay-BL96gCEP.d.ts → replay-D7z0J43-.d.ts} +4 -5
  51. package/dist/reporting.d.ts +4 -4
  52. package/dist/reporting.js +5 -5
  53. package/dist/{researcher-bGkI7vCl.d.ts → researcher-G81CWc0q.d.ts} +9 -10
  54. package/dist/rl.d.ts +26 -44
  55. package/dist/rl.js +5 -5
  56. package/dist/rl.js.map +1 -1
  57. package/dist/{sequential-Dgz1n51-.d.ts → sequential-5iSVfzl2.d.ts} +2 -2
  58. package/dist/{summary-report-DZVXOCK_.d.ts → summary-report-Dl4akLKX.d.ts} +5 -5
  59. package/dist/traces.d.ts +1 -1
  60. package/dist/traces.js +2 -2
  61. package/dist/wire/index.d.ts +2 -2
  62. package/dist/wire/index.js +1 -1
  63. package/docs/research-report-methodology.md +4 -4
  64. package/docs/three-package-architecture.md +12 -24
  65. package/package.json +1 -1
  66. package/dist/chunk-2A5XJB43.js.map +0 -1
  67. package/dist/chunk-4F5DQN55.js.map +0 -1
  68. package/dist/chunk-5LBB5B3Z.js.map +0 -1
  69. package/dist/chunk-I4MBDTY5.js +0 -272
  70. package/dist/chunk-I4MBDTY5.js.map +0 -1
  71. package/dist/chunk-JLZQWFV3.js.map +0 -1
  72. package/dist/chunk-K2TPS5LB.js.map +0 -1
  73. package/dist/chunk-LSH4MMOZ.js.map +0 -1
  74. package/dist/chunk-NU65VQ7M.js.map +0 -1
  75. package/dist/chunk-OWLAAMME.js.map +0 -1
  76. package/dist/chunk-SESZDQPX.js.map +0 -1
  77. package/dist/chunk-WHZMVFUV.js.map +0 -1
  78. /package/dist/{chunk-WWYCWKUM.js.map → chunk-3CKU6VGU.js.map} +0 -0
  79. /package/dist/{chunk-RAF443UI.js.map → chunk-DBIGN5MJ.js.map} +0 -0
  80. /package/dist/{chunk-ZN274SWR.js.map → chunk-PALJO75S.js.map} +0 -0
package/dist/index.js CHANGED
@@ -11,7 +11,7 @@ import {
11
11
  failureClusterView,
12
12
  iqr,
13
13
  welchsTTest
14
- } from "./chunk-JLZQWFV3.js";
14
+ } from "./chunk-K33INZHH.js";
15
15
  import {
16
16
  exportTrainingData,
17
17
  toNdjson
@@ -28,7 +28,7 @@ import {
28
28
  pytestTestParser,
29
29
  runTestGradedScenario,
30
30
  vitestTestParser
31
- } from "./chunk-OWLAAMME.js";
31
+ } from "./chunk-QHF6EQKK.js";
32
32
  import {
33
33
  classifyEuAiRisk,
34
34
  euAiActReport,
@@ -43,7 +43,7 @@ import {
43
43
  knowledgeReadinessTracePayload,
44
44
  scoreKnowledgeReadiness,
45
45
  userQuestionsForKnowledgeGaps
46
- } from "./chunk-WWYCWKUM.js";
46
+ } from "./chunk-3CKU6VGU.js";
47
47
  import {
48
48
  controlFailureClassFromVerification,
49
49
  controlRunToRunRecord,
@@ -54,7 +54,7 @@ import {
54
54
  runProposeReview,
55
55
  runProposeReviewAsControlLoop,
56
56
  scoreFromEvals
57
- } from "./chunk-ZN274SWR.js";
57
+ } from "./chunk-PALJO75S.js";
58
58
  import {
59
59
  allCriticalPassed,
60
60
  objectiveEval,
@@ -62,7 +62,7 @@ import {
62
62
  stopOnNoProgress,
63
63
  stopOnRepeatedAction,
64
64
  subjectiveEval
65
- } from "./chunk-LSH4MMOZ.js";
65
+ } from "./chunk-NCRFYPS3.js";
66
66
  import {
67
67
  CallbackResearcher,
68
68
  DEFAULT_MUTATION_PRIMITIVES,
@@ -96,7 +96,7 @@ import {
96
96
  summarizePreferenceMemory,
97
97
  trialTraceFromMultiShotTrial,
98
98
  withAssignedFeedbackSplit
99
- } from "./chunk-WHZMVFUV.js";
99
+ } from "./chunk-SZSBQUIJ.js";
100
100
  import {
101
101
  RunRecordValidationError,
102
102
  isRunRecord,
@@ -111,10 +111,10 @@ import {
111
111
  judgeReplayGate,
112
112
  releaseTraceEvidenceFromMultiShotTrials,
113
113
  renderReleaseReport
114
- } from "./chunk-RAF443UI.js";
114
+ } from "./chunk-DBIGN5MJ.js";
115
115
  import {
116
116
  runEvalCampaign
117
- } from "./chunk-SESZDQPX.js";
117
+ } from "./chunk-RUI6SIHY.js";
118
118
  import {
119
119
  LlmCallError,
120
120
  LlmClient,
@@ -128,7 +128,7 @@ import {
128
128
  import {
129
129
  evaluateInterimReleaseConfidence,
130
130
  pairedEvalueSequence
131
- } from "./chunk-NU65VQ7M.js";
131
+ } from "./chunk-MAZ26DC7.js";
132
132
  import {
133
133
  RESEARCH_REPORT_HARD_PAIR_FLOOR,
134
134
  benjaminiHochberg,
@@ -141,18 +141,26 @@ import {
141
141
  requiredSampleSize,
142
142
  researchReport,
143
143
  summaryTable
144
- } from "./chunk-2A5XJB43.js";
144
+ } from "./chunk-5AKPEK5L.js";
145
145
  import {
146
+ calibrateJudge,
147
+ calibrateJudgeContinuous,
146
148
  cohensD,
147
149
  confidenceInterval,
150
+ continuousAgreement,
151
+ corpusInterRaterAgreement,
152
+ corpusInterRaterAgreementFromJudgeScores,
148
153
  interRaterReliability,
149
154
  mannWhitneyU,
150
155
  normalizeScores,
151
156
  pairedTTest,
152
157
  partialCredit,
158
+ positionalBias,
159
+ selfPreference,
160
+ verbosityBias,
153
161
  weightedMean,
154
162
  wilcoxonSignedRank
155
- } from "./chunk-I4MBDTY5.js";
163
+ } from "./chunk-R5UQJNKC.js";
156
164
  import {
157
165
  DEFAULT_REDACTION_RULES,
158
166
  FileSystemTraceStore,
@@ -166,7 +174,7 @@ import {
166
174
  iterateRawCalls,
167
175
  redactString,
168
176
  redactValue
169
- } from "./chunk-K2TPS5LB.js";
177
+ } from "./chunk-4U4BKCXK.js";
170
178
  import {
171
179
  aggregateLlm,
172
180
  argHash,
@@ -208,7 +216,7 @@ import {
208
216
  hashJson,
209
217
  signManifest,
210
218
  verifyManifest
211
- } from "./chunk-4F5DQN55.js";
219
+ } from "./chunk-VSMTAMNK.js";
212
220
  import {
213
221
  AgentEvalError,
214
222
  CaptureIntegrityError,
@@ -4956,326 +4964,6 @@ function seededShuffle(items, seed) {
4956
4964
  return out;
4957
4965
  }
4958
4966
 
4959
- // src/judge-calibration.ts
4960
- function calibrateJudge(golden, candidate) {
4961
- const map = /* @__PURE__ */ new Map();
4962
- for (const g of golden) map.set(g.itemId, { h: g.humanScore, j: NaN });
4963
- for (const c of candidate) {
4964
- const entry = map.get(c.itemId);
4965
- if (entry) entry.j = c.score;
4966
- }
4967
- const common = [...map.values()].filter((v) => Number.isFinite(v.j));
4968
- const n = common.length;
4969
- if (n < 2) {
4970
- return { n, pearson: NaN, kappa: NaN, mae: NaN, worstItems: [] };
4971
- }
4972
- const humans = common.map((c) => c.h);
4973
- const judges = common.map((c) => c.j);
4974
- const pearson = pearsonR(humans, judges);
4975
- const kappa = weightedKappa(humans.map(Math.round), judges.map(Math.round));
4976
- const absDiffs = common.map((c) => Math.abs(c.j - c.h));
4977
- const mae = absDiffs.reduce((a, b) => a + b, 0) / n;
4978
- const worst2 = [...map.entries()].filter(([, v]) => Number.isFinite(v.j)).map(([itemId, v]) => ({ itemId, judge: v.j, human: v.h, delta: Math.abs(v.j - v.h) })).sort((a, b) => b.delta - a.delta).slice(0, 5);
4979
- return { n, pearson, kappa, mae, worstItems: worst2 };
4980
- }
4981
- function positionalBias(scores) {
4982
- const pairs = /* @__PURE__ */ new Map();
4983
- for (const s of scores) {
4984
- const slot = pairs.get(s.itemId) ?? {};
4985
- if (s.positionOfAInput === "first") slot.first = s.score;
4986
- else if (s.positionOfAInput === "second") slot.second = s.score;
4987
- pairs.set(s.itemId, slot);
4988
- }
4989
- const deltas = [];
4990
- for (const { first, second } of pairs.values()) {
4991
- if (first !== void 0 && second !== void 0) deltas.push(first - second);
4992
- }
4993
- if (deltas.length === 0) return { avgDelta: 0, n: 0 };
4994
- return { avgDelta: deltas.reduce((a, b) => a + b, 0) / deltas.length, n: deltas.length };
4995
- }
4996
- function verbosityBias(samples) {
4997
- const n = samples.length;
4998
- if (n < 3) return { pearson: NaN, n };
4999
- return {
5000
- pearson: pearsonR(
5001
- samples.map((s) => s.outputLen),
5002
- samples.map((s) => s.score)
5003
- ),
5004
- n
5005
- };
5006
- }
5007
- function selfPreference(samples) {
5008
- const inF = samples.filter((s) => s.inFamily).map((s) => s.score);
5009
- const outF = samples.filter((s) => !s.inFamily).map((s) => s.score);
5010
- if (inF.length === 0 || outF.length === 0)
5011
- return { inFamilyMean: 0, outOfFamilyMean: 0, deltaMean: 0, n: 0 };
5012
- const inMean = inF.reduce((a, b) => a + b, 0) / inF.length;
5013
- const outMean = outF.reduce((a, b) => a + b, 0) / outF.length;
5014
- return {
5015
- inFamilyMean: inMean,
5016
- outOfFamilyMean: outMean,
5017
- deltaMean: inMean - outMean,
5018
- n: samples.length
5019
- };
5020
- }
5021
- function pearsonR(a, b) {
5022
- if (a.length !== b.length || a.length < 2) return NaN;
5023
- const mA = a.reduce((s, v) => s + v, 0) / a.length;
5024
- const mB = b.reduce((s, v) => s + v, 0) / b.length;
5025
- let num = 0, dA = 0, dB = 0;
5026
- for (let i = 0; i < a.length; i++) {
5027
- const da = a[i] - mA;
5028
- const db = b[i] - mB;
5029
- num += da * db;
5030
- dA += da * da;
5031
- dB += db * db;
5032
- }
5033
- if (dA === 0 || dB === 0) return dA === 0 && dB === 0 ? 1 : 0;
5034
- return num / Math.sqrt(dA * dB);
5035
- }
5036
- function weightedKappa(a, b) {
5037
- if (a.length !== b.length || a.length === 0) return NaN;
5038
- const min = Math.min(...a, ...b);
5039
- const max = Math.max(...a, ...b);
5040
- const K = max - min + 1;
5041
- if (K < 2) return 1;
5042
- const observed = Array.from({ length: K }, () => new Array(K).fill(0));
5043
- const rowMarg = new Array(K).fill(0);
5044
- const colMarg = new Array(K).fill(0);
5045
- for (let i = 0; i < a.length; i++) {
5046
- const ai = a[i] - min;
5047
- const bi = b[i] - min;
5048
- const row = observed[ai];
5049
- row[bi] = (row[bi] ?? 0) + 1;
5050
- rowMarg[ai]++;
5051
- colMarg[bi]++;
5052
- }
5053
- let num = 0;
5054
- let den = 0;
5055
- for (let i = 0; i < K; i++) {
5056
- for (let j = 0; j < K; j++) {
5057
- const w = (i - j) ** 2 / (K - 1) ** 2;
5058
- const expected = rowMarg[i] * colMarg[j] / a.length;
5059
- num += w * observed[i][j];
5060
- den += w * expected;
5061
- }
5062
- }
5063
- if (den === 0) return 1;
5064
- return 1 - num / den;
5065
- }
5066
- function continuousAgreement(scores, opts = {}) {
5067
- const bootstrap = opts.bootstrap ?? 1e3;
5068
- const weights = opts.weights ?? "quadratic";
5069
- const seed = opts.seed ?? 12648430;
5070
- const ciLevel = opts.ciLevel ?? 0.95;
5071
- const matrix = scores.filter((row) => row.length >= 2 && row.every((v) => Number.isFinite(v)));
5072
- const raters = matrix[0]?.length ?? 0;
5073
- const clean = matrix.filter((row) => row.length === raters);
5074
- const nClean = clean.length;
5075
- if (nClean < 2 || raters < 2) {
5076
- return {
5077
- weightedKappa: NaN,
5078
- icc: NaN,
5079
- pearson: NaN,
5080
- spearman: NaN,
5081
- ci: { icc: [NaN, NaN], weightedKappa: [NaN, NaN] },
5082
- n: nClean,
5083
- raters
5084
- };
5085
- }
5086
- const kappa = continuousWeightedKappa(clean, weights);
5087
- const icc = icc21(clean);
5088
- const pearson = avgPairwise(clean, pearsonR);
5089
- const spearman = avgPairwise(clean, spearmanR);
5090
- const ciIcc = [NaN, NaN];
5091
- const ciKappa = [NaN, NaN];
5092
- if (bootstrap > 0) {
5093
- const rng = mulberry32(seed);
5094
- const iccs = [];
5095
- const kappas = [];
5096
- for (let b = 0; b < bootstrap; b++) {
5097
- const sample = new Array(nClean);
5098
- for (let i = 0; i < nClean; i++) {
5099
- sample[i] = clean[Math.floor(rng() * nClean)];
5100
- }
5101
- const iccB = icc21(sample);
5102
- const kB = continuousWeightedKappa(sample, weights);
5103
- if (Number.isFinite(iccB)) iccs.push(iccB);
5104
- if (Number.isFinite(kB)) kappas.push(kB);
5105
- }
5106
- const [lo, hi] = percentileBounds(ciLevel);
5107
- if (iccs.length > 0) {
5108
- iccs.sort((a, b) => a - b);
5109
- ciIcc[0] = quantile(iccs, lo);
5110
- ciIcc[1] = quantile(iccs, hi);
5111
- }
5112
- if (kappas.length > 0) {
5113
- kappas.sort((a, b) => a - b);
5114
- ciKappa[0] = quantile(kappas, lo);
5115
- ciKappa[1] = quantile(kappas, hi);
5116
- }
5117
- }
5118
- return {
5119
- weightedKappa: kappa,
5120
- icc,
5121
- pearson,
5122
- spearman,
5123
- ci: { icc: ciIcc, weightedKappa: ciKappa },
5124
- n: nClean,
5125
- raters
5126
- };
5127
- }
5128
- function calibrateJudgeContinuous(golden, candidate, opts = {}) {
5129
- const base = calibrateJudge(golden, candidate);
5130
- const map = /* @__PURE__ */ new Map();
5131
- for (const g of golden) map.set(g.itemId, { h: g.humanScore, j: NaN });
5132
- for (const c of candidate) {
5133
- const entry = map.get(c.itemId);
5134
- if (entry) entry.j = c.score;
5135
- }
5136
- const rows = [];
5137
- for (const v of map.values()) {
5138
- if (Number.isFinite(v.j)) rows.push([v.h, v.j]);
5139
- }
5140
- const agreement = continuousAgreement(rows, opts);
5141
- return {
5142
- ...base,
5143
- weightedKappaContinuous: agreement.weightedKappa,
5144
- icc: agreement.icc,
5145
- spearman: agreement.spearman,
5146
- ci: agreement.ci
5147
- };
5148
- }
5149
- function continuousWeightedKappa(rows, scheme) {
5150
- if (rows.length === 0) return NaN;
5151
- const raters = rows[0].length;
5152
- if (raters < 2) return NaN;
5153
- const wFn = scheme === "linear" ? (x, y) => Math.abs(x - y) : (x, y) => (x - y) ** 2;
5154
- let sum2 = 0;
5155
- let pairs = 0;
5156
- for (let r1 = 0; r1 < raters; r1++) {
5157
- for (let r2 = r1 + 1; r2 < raters; r2++) {
5158
- const a = rows.map((row) => row[r1]);
5159
- const b = rows.map((row) => row[r2]);
5160
- const n = a.length;
5161
- let obs = 0;
5162
- for (let i = 0; i < n; i++) obs += wFn(a[i], b[i]);
5163
- obs /= n;
5164
- let exp = 0;
5165
- for (let i = 0; i < n; i++) {
5166
- for (let j = 0; j < n; j++) exp += wFn(a[i], b[j]);
5167
- }
5168
- exp /= n * n;
5169
- if (exp === 0) {
5170
- sum2 += obs === 0 ? 1 : 0;
5171
- } else {
5172
- sum2 += 1 - obs / exp;
5173
- }
5174
- pairs++;
5175
- }
5176
- }
5177
- return pairs === 0 ? NaN : sum2 / pairs;
5178
- }
5179
- function icc21(rows) {
5180
- const n = rows.length;
5181
- if (n < 2) return NaN;
5182
- const k = rows[0].length;
5183
- if (k < 2) return NaN;
5184
- const rowMeans = rows.map((row) => row.reduce((s, v) => s + v, 0) / k);
5185
- const colMeans = new Array(k).fill(0);
5186
- for (let j = 0; j < k; j++) {
5187
- let s = 0;
5188
- for (let i = 0; i < n; i++) s += rows[i][j];
5189
- colMeans[j] = s / n;
5190
- }
5191
- let grand = 0;
5192
- for (let i = 0; i < n; i++) grand += rowMeans[i];
5193
- grand /= n;
5194
- let ssR = 0;
5195
- for (let i = 0; i < n; i++) ssR += (rowMeans[i] - grand) ** 2;
5196
- ssR *= k;
5197
- let ssC = 0;
5198
- for (let j = 0; j < k; j++) ssC += (colMeans[j] - grand) ** 2;
5199
- ssC *= n;
5200
- let ssT = 0;
5201
- for (let i = 0; i < n; i++) {
5202
- for (let j = 0; j < k; j++) ssT += (rows[i][j] - grand) ** 2;
5203
- }
5204
- const ssE = ssT - ssR - ssC;
5205
- const dfR = n - 1;
5206
- const dfC = k - 1;
5207
- const dfE = (n - 1) * (k - 1);
5208
- const msR = ssR / dfR;
5209
- const msC = ssC / dfC;
5210
- const msE = dfE > 0 ? ssE / dfE : 0;
5211
- const denom = msR + (k - 1) * msE + k * (msC - msE) / n;
5212
- if (denom === 0) {
5213
- return msR === 0 && msE === 0 ? 1 : 0;
5214
- }
5215
- return (msR - msE) / denom;
5216
- }
5217
- function avgPairwise(rows, fn) {
5218
- const k = rows[0]?.length ?? 0;
5219
- if (k < 2) return NaN;
5220
- let sum2 = 0;
5221
- let pairs = 0;
5222
- for (let i = 0; i < k; i++) {
5223
- for (let j = i + 1; j < k; j++) {
5224
- const a = rows.map((row) => row[i]);
5225
- const b = rows.map((row) => row[j]);
5226
- const r = fn(a, b);
5227
- if (Number.isFinite(r)) {
5228
- sum2 += r;
5229
- pairs++;
5230
- }
5231
- }
5232
- }
5233
- return pairs === 0 ? NaN : sum2 / pairs;
5234
- }
5235
- function spearmanR(a, b) {
5236
- if (a.length !== b.length || a.length < 2) return NaN;
5237
- return pearsonR(rankWithTies(a), rankWithTies(b));
5238
- }
5239
- function rankWithTies(xs) {
5240
- const n = xs.length;
5241
- const indexed = xs.map((v, i2) => ({ v, i: i2 }));
5242
- indexed.sort((x, y) => x.v - y.v);
5243
- const ranks = new Array(n).fill(0);
5244
- let i = 0;
5245
- while (i < n) {
5246
- let j = i;
5247
- while (j + 1 < n && indexed[j + 1].v === indexed[i].v) j++;
5248
- const avg = (i + j) / 2 + 1;
5249
- for (let k = i; k <= j; k++) ranks[indexed[k].i] = avg;
5250
- i = j + 1;
5251
- }
5252
- return ranks;
5253
- }
5254
- function mulberry32(seed) {
5255
- let a = seed >>> 0;
5256
- return () => {
5257
- a = a + 1831565813 >>> 0;
5258
- let t = a;
5259
- t = Math.imul(t ^ t >>> 15, t | 1);
5260
- t ^= t + Math.imul(t ^ t >>> 7, t | 61);
5261
- return ((t ^ t >>> 14) >>> 0) / 4294967296;
5262
- };
5263
- }
5264
- function percentileBounds(ciLevel) {
5265
- const tail = (1 - ciLevel) / 2;
5266
- return [tail, 1 - tail];
5267
- }
5268
- function quantile(sorted, q) {
5269
- if (sorted.length === 0) return NaN;
5270
- if (sorted.length === 1) return sorted[0];
5271
- const pos = q * (sorted.length - 1);
5272
- const lo = Math.floor(pos);
5273
- const hi = Math.ceil(pos);
5274
- if (lo === hi) return sorted[lo];
5275
- const frac = pos - lo;
5276
- return sorted[lo] * (1 - frac) + sorted[hi] * frac;
5277
- }
5278
-
5279
4967
  // src/observability.ts
5280
4968
  async function toLangfuseEnvelope(store, runId) {
5281
4969
  const run = await store.getRun(runId);
@@ -6077,7 +5765,7 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
6077
5765
  runCountByScenario.set(r.scenarioId, (runCountByScenario.get(r.scenarioId) ?? 0) + 1);
6078
5766
  }
6079
5767
  const runCounts = [...runCountByScenario.values()];
6080
- const p25 = runCounts.length > 0 ? quantile2(runCounts, 0.25) : 0;
5768
+ const p25 = runCounts.length > 0 ? quantile(runCounts, 0.25) : 0;
6081
5769
  for (const s of scenarios) {
6082
5770
  const count = runCountByScenario.get(s.id) ?? 0;
6083
5771
  if (count <= p25 && count < 3) {
@@ -6131,7 +5819,7 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
6131
5819
  }
6132
5820
  return targets.sort((a, b) => b.priority - a.priority).slice(0, topK);
6133
5821
  }
6134
- function quantile2(xs, p) {
5822
+ function quantile(xs, p) {
6135
5823
  const sorted = [...xs].sort((a, b) => a - b);
6136
5824
  const idx = p * (sorted.length - 1);
6137
5825
  const lo = Math.floor(idx);
@@ -9446,6 +9134,8 @@ export {
9446
9134
  controlFailureClassFromVerification,
9447
9135
  controlRunToFeedbackTrajectory,
9448
9136
  controlRunToRunRecord,
9137
+ corpusInterRaterAgreement,
9138
+ corpusInterRaterAgreementFromJudgeScores,
9449
9139
  createAntiSlopJudge,
9450
9140
  createCompositeMutator,
9451
9141
  createCustomJudge,