@tangle-network/agent-eval 0.27.0 → 0.27.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +72 -0
- package/README.md +4 -5
- package/dist/builder-eval/index.js +1 -1
- package/dist/{chunk-WWYCWKUM.js → chunk-3CKU6VGU.js} +2 -2
- package/dist/{chunk-K2TPS5LB.js → chunk-4U4BKCXK.js} +2 -2
- package/dist/chunk-4U4BKCXK.js.map +1 -0
- package/dist/{chunk-2A5XJB43.js → chunk-5AKPEK5L.js} +3 -3
- package/dist/chunk-5AKPEK5L.js.map +1 -0
- package/dist/{chunk-RAF443UI.js → chunk-DBIGN5MJ.js} +2 -2
- package/dist/{chunk-JLZQWFV3.js → chunk-K33INZHH.js} +2 -2
- package/dist/chunk-K33INZHH.js.map +1 -0
- package/dist/{chunk-NU65VQ7M.js → chunk-MAZ26DC7.js} +1 -1
- package/dist/chunk-MAZ26DC7.js.map +1 -0
- package/dist/{chunk-LSH4MMOZ.js → chunk-NCRFYPS3.js} +1 -1
- package/dist/chunk-NCRFYPS3.js.map +1 -0
- package/dist/{chunk-ZN274SWR.js → chunk-PALJO75S.js} +2 -2
- package/dist/{chunk-OWLAAMME.js → chunk-QHF6EQKK.js} +3 -2
- package/dist/chunk-QHF6EQKK.js.map +1 -0
- package/dist/chunk-R5UQJNKC.js +722 -0
- package/dist/chunk-R5UQJNKC.js.map +1 -0
- package/dist/{chunk-SESZDQPX.js → chunk-RUI6SIHY.js} +3 -3
- package/dist/chunk-RUI6SIHY.js.map +1 -0
- package/dist/{chunk-WHZMVFUV.js → chunk-SZSBQUIJ.js} +2 -2
- package/dist/chunk-SZSBQUIJ.js.map +1 -0
- package/dist/{chunk-4F5DQN55.js → chunk-VSMTAMNK.js} +1 -1
- package/dist/chunk-VSMTAMNK.js.map +1 -0
- package/dist/{chunk-5LBB5B3Z.js → chunk-XFZCM5Z3.js} +1 -1
- package/dist/chunk-XFZCM5Z3.js.map +1 -0
- package/dist/cli.js +1 -1
- package/dist/{control-CBShYYA6.d.ts → control-BT4qnXiS.d.ts} +2 -2
- package/dist/{control-runtime-BuJHoLg0.d.ts → control-runtime-BZ_lVLYW.d.ts} +1 -0
- package/dist/control.d.ts +3 -3
- package/dist/control.js +2 -2
- package/dist/{failure-cluster-C2EGSDiT.d.ts → failure-cluster-Cw65_5FY.d.ts} +1 -2
- package/dist/{feedback-trajectory-DfFdrraJ.d.ts → feedback-trajectory-D1aGKusy.d.ts} +1 -1
- package/dist/governance/index.d.ts +1 -1
- package/dist/{index-D3iBCjdF.d.ts → index-BhLlu-qO.d.ts} +1 -1
- package/dist/index.d.ts +157 -167
- package/dist/index.js +25 -335
- package/dist/index.js.map +1 -1
- package/dist/knowledge/index.d.ts +1 -1
- package/dist/knowledge/index.js +2 -2
- package/dist/{multi-layer-verifier-LkP3LVKj.d.ts → multi-layer-verifier-U-c8ge1k.d.ts} +1 -1
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +5 -5
- package/dist/optimization.js +5 -5
- package/dist/pipelines/index.d.ts +1 -1
- package/dist/pipelines/index.js +2 -2
- package/dist/{release-report-wfUySN5F.d.ts → release-report-CCQqnK46.d.ts} +1 -1
- package/dist/{replay-BL96gCEP.d.ts → replay-D7z0J43-.d.ts} +4 -5
- package/dist/reporting.d.ts +4 -4
- package/dist/reporting.js +5 -5
- package/dist/{researcher-bGkI7vCl.d.ts → researcher-G81CWc0q.d.ts} +9 -10
- package/dist/rl.d.ts +26 -44
- package/dist/rl.js +5 -5
- package/dist/rl.js.map +1 -1
- package/dist/{sequential-Dgz1n51-.d.ts → sequential-5iSVfzl2.d.ts} +2 -2
- package/dist/{summary-report-DZVXOCK_.d.ts → summary-report-Dl4akLKX.d.ts} +5 -5
- package/dist/traces.d.ts +1 -1
- package/dist/traces.js +2 -2
- package/dist/wire/index.d.ts +2 -2
- package/dist/wire/index.js +1 -1
- package/docs/research-report-methodology.md +4 -4
- package/docs/three-package-architecture.md +12 -24
- package/package.json +1 -1
- package/dist/chunk-2A5XJB43.js.map +0 -1
- package/dist/chunk-4F5DQN55.js.map +0 -1
- package/dist/chunk-5LBB5B3Z.js.map +0 -1
- package/dist/chunk-I4MBDTY5.js +0 -272
- package/dist/chunk-I4MBDTY5.js.map +0 -1
- package/dist/chunk-JLZQWFV3.js.map +0 -1
- package/dist/chunk-K2TPS5LB.js.map +0 -1
- package/dist/chunk-LSH4MMOZ.js.map +0 -1
- package/dist/chunk-NU65VQ7M.js.map +0 -1
- package/dist/chunk-OWLAAMME.js.map +0 -1
- package/dist/chunk-SESZDQPX.js.map +0 -1
- package/dist/chunk-WHZMVFUV.js.map +0 -1
- /package/dist/{chunk-WWYCWKUM.js.map → chunk-3CKU6VGU.js.map} +0 -0
- /package/dist/{chunk-RAF443UI.js.map → chunk-DBIGN5MJ.js.map} +0 -0
- /package/dist/{chunk-ZN274SWR.js.map → chunk-PALJO75S.js.map} +0 -0
package/dist/index.js
CHANGED
|
@@ -11,7 +11,7 @@ import {
|
|
|
11
11
|
failureClusterView,
|
|
12
12
|
iqr,
|
|
13
13
|
welchsTTest
|
|
14
|
-
} from "./chunk-JLZQWFV3.js";
|
|
14
|
+
} from "./chunk-K33INZHH.js";
|
|
15
15
|
import {
|
|
16
16
|
exportTrainingData,
|
|
17
17
|
toNdjson
|
|
@@ -28,7 +28,7 @@ import {
|
|
|
28
28
|
pytestTestParser,
|
|
29
29
|
runTestGradedScenario,
|
|
30
30
|
vitestTestParser
|
|
31
|
-
} from "./chunk-OWLAAMME.js";
|
|
31
|
+
} from "./chunk-QHF6EQKK.js";
|
|
32
32
|
import {
|
|
33
33
|
classifyEuAiRisk,
|
|
34
34
|
euAiActReport,
|
|
@@ -43,7 +43,7 @@ import {
|
|
|
43
43
|
knowledgeReadinessTracePayload,
|
|
44
44
|
scoreKnowledgeReadiness,
|
|
45
45
|
userQuestionsForKnowledgeGaps
|
|
46
|
-
} from "./chunk-WWYCWKUM.js";
|
|
46
|
+
} from "./chunk-3CKU6VGU.js";
|
|
47
47
|
import {
|
|
48
48
|
controlFailureClassFromVerification,
|
|
49
49
|
controlRunToRunRecord,
|
|
@@ -54,7 +54,7 @@ import {
|
|
|
54
54
|
runProposeReview,
|
|
55
55
|
runProposeReviewAsControlLoop,
|
|
56
56
|
scoreFromEvals
|
|
57
|
-
} from "./chunk-ZN274SWR.js";
|
|
57
|
+
} from "./chunk-PALJO75S.js";
|
|
58
58
|
import {
|
|
59
59
|
allCriticalPassed,
|
|
60
60
|
objectiveEval,
|
|
@@ -62,7 +62,7 @@ import {
|
|
|
62
62
|
stopOnNoProgress,
|
|
63
63
|
stopOnRepeatedAction,
|
|
64
64
|
subjectiveEval
|
|
65
|
-
} from "./chunk-LSH4MMOZ.js";
|
|
65
|
+
} from "./chunk-NCRFYPS3.js";
|
|
66
66
|
import {
|
|
67
67
|
CallbackResearcher,
|
|
68
68
|
DEFAULT_MUTATION_PRIMITIVES,
|
|
@@ -96,7 +96,7 @@ import {
|
|
|
96
96
|
summarizePreferenceMemory,
|
|
97
97
|
trialTraceFromMultiShotTrial,
|
|
98
98
|
withAssignedFeedbackSplit
|
|
99
|
-
} from "./chunk-WHZMVFUV.js";
|
|
99
|
+
} from "./chunk-SZSBQUIJ.js";
|
|
100
100
|
import {
|
|
101
101
|
RunRecordValidationError,
|
|
102
102
|
isRunRecord,
|
|
@@ -111,10 +111,10 @@ import {
|
|
|
111
111
|
judgeReplayGate,
|
|
112
112
|
releaseTraceEvidenceFromMultiShotTrials,
|
|
113
113
|
renderReleaseReport
|
|
114
|
-
} from "./chunk-RAF443UI.js";
|
|
114
|
+
} from "./chunk-DBIGN5MJ.js";
|
|
115
115
|
import {
|
|
116
116
|
runEvalCampaign
|
|
117
|
-
} from "./chunk-SESZDQPX.js";
|
|
117
|
+
} from "./chunk-RUI6SIHY.js";
|
|
118
118
|
import {
|
|
119
119
|
LlmCallError,
|
|
120
120
|
LlmClient,
|
|
@@ -128,7 +128,7 @@ import {
|
|
|
128
128
|
import {
|
|
129
129
|
evaluateInterimReleaseConfidence,
|
|
130
130
|
pairedEvalueSequence
|
|
131
|
-
} from "./chunk-NU65VQ7M.js";
|
|
131
|
+
} from "./chunk-MAZ26DC7.js";
|
|
132
132
|
import {
|
|
133
133
|
RESEARCH_REPORT_HARD_PAIR_FLOOR,
|
|
134
134
|
benjaminiHochberg,
|
|
@@ -141,18 +141,26 @@ import {
|
|
|
141
141
|
requiredSampleSize,
|
|
142
142
|
researchReport,
|
|
143
143
|
summaryTable
|
|
144
|
-
} from "./chunk-2A5XJB43.js";
|
|
144
|
+
} from "./chunk-5AKPEK5L.js";
|
|
145
145
|
import {
|
|
146
|
+
calibrateJudge,
|
|
147
|
+
calibrateJudgeContinuous,
|
|
146
148
|
cohensD,
|
|
147
149
|
confidenceInterval,
|
|
150
|
+
continuousAgreement,
|
|
151
|
+
corpusInterRaterAgreement,
|
|
152
|
+
corpusInterRaterAgreementFromJudgeScores,
|
|
148
153
|
interRaterReliability,
|
|
149
154
|
mannWhitneyU,
|
|
150
155
|
normalizeScores,
|
|
151
156
|
pairedTTest,
|
|
152
157
|
partialCredit,
|
|
158
|
+
positionalBias,
|
|
159
|
+
selfPreference,
|
|
160
|
+
verbosityBias,
|
|
153
161
|
weightedMean,
|
|
154
162
|
wilcoxonSignedRank
|
|
155
|
-
} from "./chunk-I4MBDTY5.js";
|
|
163
|
+
} from "./chunk-R5UQJNKC.js";
|
|
156
164
|
import {
|
|
157
165
|
DEFAULT_REDACTION_RULES,
|
|
158
166
|
FileSystemTraceStore,
|
|
@@ -166,7 +174,7 @@ import {
|
|
|
166
174
|
iterateRawCalls,
|
|
167
175
|
redactString,
|
|
168
176
|
redactValue
|
|
169
|
-
} from "./chunk-K2TPS5LB.js";
|
|
177
|
+
} from "./chunk-4U4BKCXK.js";
|
|
170
178
|
import {
|
|
171
179
|
aggregateLlm,
|
|
172
180
|
argHash,
|
|
@@ -208,7 +216,7 @@ import {
|
|
|
208
216
|
hashJson,
|
|
209
217
|
signManifest,
|
|
210
218
|
verifyManifest
|
|
211
|
-
} from "./chunk-4F5DQN55.js";
|
|
219
|
+
} from "./chunk-VSMTAMNK.js";
|
|
212
220
|
import {
|
|
213
221
|
AgentEvalError,
|
|
214
222
|
CaptureIntegrityError,
|
|
@@ -4956,326 +4964,6 @@ function seededShuffle(items, seed) {
|
|
|
4956
4964
|
return out;
|
|
4957
4965
|
}
|
|
4958
4966
|
|
|
4959
|
-
// src/judge-calibration.ts
|
|
4960
|
-
function calibrateJudge(golden, candidate) {
|
|
4961
|
-
const map = /* @__PURE__ */ new Map();
|
|
4962
|
-
for (const g of golden) map.set(g.itemId, { h: g.humanScore, j: NaN });
|
|
4963
|
-
for (const c of candidate) {
|
|
4964
|
-
const entry = map.get(c.itemId);
|
|
4965
|
-
if (entry) entry.j = c.score;
|
|
4966
|
-
}
|
|
4967
|
-
const common = [...map.values()].filter((v) => Number.isFinite(v.j));
|
|
4968
|
-
const n = common.length;
|
|
4969
|
-
if (n < 2) {
|
|
4970
|
-
return { n, pearson: NaN, kappa: NaN, mae: NaN, worstItems: [] };
|
|
4971
|
-
}
|
|
4972
|
-
const humans = common.map((c) => c.h);
|
|
4973
|
-
const judges = common.map((c) => c.j);
|
|
4974
|
-
const pearson = pearsonR(humans, judges);
|
|
4975
|
-
const kappa = weightedKappa(humans.map(Math.round), judges.map(Math.round));
|
|
4976
|
-
const absDiffs = common.map((c) => Math.abs(c.j - c.h));
|
|
4977
|
-
const mae = absDiffs.reduce((a, b) => a + b, 0) / n;
|
|
4978
|
-
const worst2 = [...map.entries()].filter(([, v]) => Number.isFinite(v.j)).map(([itemId, v]) => ({ itemId, judge: v.j, human: v.h, delta: Math.abs(v.j - v.h) })).sort((a, b) => b.delta - a.delta).slice(0, 5);
|
|
4979
|
-
return { n, pearson, kappa, mae, worstItems: worst2 };
|
|
4980
|
-
}
|
|
4981
|
-
function positionalBias(scores) {
|
|
4982
|
-
const pairs = /* @__PURE__ */ new Map();
|
|
4983
|
-
for (const s of scores) {
|
|
4984
|
-
const slot = pairs.get(s.itemId) ?? {};
|
|
4985
|
-
if (s.positionOfAInput === "first") slot.first = s.score;
|
|
4986
|
-
else if (s.positionOfAInput === "second") slot.second = s.score;
|
|
4987
|
-
pairs.set(s.itemId, slot);
|
|
4988
|
-
}
|
|
4989
|
-
const deltas = [];
|
|
4990
|
-
for (const { first, second } of pairs.values()) {
|
|
4991
|
-
if (first !== void 0 && second !== void 0) deltas.push(first - second);
|
|
4992
|
-
}
|
|
4993
|
-
if (deltas.length === 0) return { avgDelta: 0, n: 0 };
|
|
4994
|
-
return { avgDelta: deltas.reduce((a, b) => a + b, 0) / deltas.length, n: deltas.length };
|
|
4995
|
-
}
|
|
4996
|
-
function verbosityBias(samples) {
|
|
4997
|
-
const n = samples.length;
|
|
4998
|
-
if (n < 3) return { pearson: NaN, n };
|
|
4999
|
-
return {
|
|
5000
|
-
pearson: pearsonR(
|
|
5001
|
-
samples.map((s) => s.outputLen),
|
|
5002
|
-
samples.map((s) => s.score)
|
|
5003
|
-
),
|
|
5004
|
-
n
|
|
5005
|
-
};
|
|
5006
|
-
}
|
|
5007
|
-
function selfPreference(samples) {
|
|
5008
|
-
const inF = samples.filter((s) => s.inFamily).map((s) => s.score);
|
|
5009
|
-
const outF = samples.filter((s) => !s.inFamily).map((s) => s.score);
|
|
5010
|
-
if (inF.length === 0 || outF.length === 0)
|
|
5011
|
-
return { inFamilyMean: 0, outOfFamilyMean: 0, deltaMean: 0, n: 0 };
|
|
5012
|
-
const inMean = inF.reduce((a, b) => a + b, 0) / inF.length;
|
|
5013
|
-
const outMean = outF.reduce((a, b) => a + b, 0) / outF.length;
|
|
5014
|
-
return {
|
|
5015
|
-
inFamilyMean: inMean,
|
|
5016
|
-
outOfFamilyMean: outMean,
|
|
5017
|
-
deltaMean: inMean - outMean,
|
|
5018
|
-
n: samples.length
|
|
5019
|
-
};
|
|
5020
|
-
}
|
|
5021
|
-
function pearsonR(a, b) {
|
|
5022
|
-
if (a.length !== b.length || a.length < 2) return NaN;
|
|
5023
|
-
const mA = a.reduce((s, v) => s + v, 0) / a.length;
|
|
5024
|
-
const mB = b.reduce((s, v) => s + v, 0) / b.length;
|
|
5025
|
-
let num = 0, dA = 0, dB = 0;
|
|
5026
|
-
for (let i = 0; i < a.length; i++) {
|
|
5027
|
-
const da = a[i] - mA;
|
|
5028
|
-
const db = b[i] - mB;
|
|
5029
|
-
num += da * db;
|
|
5030
|
-
dA += da * da;
|
|
5031
|
-
dB += db * db;
|
|
5032
|
-
}
|
|
5033
|
-
if (dA === 0 || dB === 0) return dA === 0 && dB === 0 ? 1 : 0;
|
|
5034
|
-
return num / Math.sqrt(dA * dB);
|
|
5035
|
-
}
|
|
5036
|
-
function weightedKappa(a, b) {
|
|
5037
|
-
if (a.length !== b.length || a.length === 0) return NaN;
|
|
5038
|
-
const min = Math.min(...a, ...b);
|
|
5039
|
-
const max = Math.max(...a, ...b);
|
|
5040
|
-
const K = max - min + 1;
|
|
5041
|
-
if (K < 2) return 1;
|
|
5042
|
-
const observed = Array.from({ length: K }, () => new Array(K).fill(0));
|
|
5043
|
-
const rowMarg = new Array(K).fill(0);
|
|
5044
|
-
const colMarg = new Array(K).fill(0);
|
|
5045
|
-
for (let i = 0; i < a.length; i++) {
|
|
5046
|
-
const ai = a[i] - min;
|
|
5047
|
-
const bi = b[i] - min;
|
|
5048
|
-
const row = observed[ai];
|
|
5049
|
-
row[bi] = (row[bi] ?? 0) + 1;
|
|
5050
|
-
rowMarg[ai]++;
|
|
5051
|
-
colMarg[bi]++;
|
|
5052
|
-
}
|
|
5053
|
-
let num = 0;
|
|
5054
|
-
let den = 0;
|
|
5055
|
-
for (let i = 0; i < K; i++) {
|
|
5056
|
-
for (let j = 0; j < K; j++) {
|
|
5057
|
-
const w = (i - j) ** 2 / (K - 1) ** 2;
|
|
5058
|
-
const expected = rowMarg[i] * colMarg[j] / a.length;
|
|
5059
|
-
num += w * observed[i][j];
|
|
5060
|
-
den += w * expected;
|
|
5061
|
-
}
|
|
5062
|
-
}
|
|
5063
|
-
if (den === 0) return 1;
|
|
5064
|
-
return 1 - num / den;
|
|
5065
|
-
}
|
|
5066
|
-
function continuousAgreement(scores, opts = {}) {
|
|
5067
|
-
const bootstrap = opts.bootstrap ?? 1e3;
|
|
5068
|
-
const weights = opts.weights ?? "quadratic";
|
|
5069
|
-
const seed = opts.seed ?? 12648430;
|
|
5070
|
-
const ciLevel = opts.ciLevel ?? 0.95;
|
|
5071
|
-
const matrix = scores.filter((row) => row.length >= 2 && row.every((v) => Number.isFinite(v)));
|
|
5072
|
-
const raters = matrix[0]?.length ?? 0;
|
|
5073
|
-
const clean = matrix.filter((row) => row.length === raters);
|
|
5074
|
-
const nClean = clean.length;
|
|
5075
|
-
if (nClean < 2 || raters < 2) {
|
|
5076
|
-
return {
|
|
5077
|
-
weightedKappa: NaN,
|
|
5078
|
-
icc: NaN,
|
|
5079
|
-
pearson: NaN,
|
|
5080
|
-
spearman: NaN,
|
|
5081
|
-
ci: { icc: [NaN, NaN], weightedKappa: [NaN, NaN] },
|
|
5082
|
-
n: nClean,
|
|
5083
|
-
raters
|
|
5084
|
-
};
|
|
5085
|
-
}
|
|
5086
|
-
const kappa = continuousWeightedKappa(clean, weights);
|
|
5087
|
-
const icc = icc21(clean);
|
|
5088
|
-
const pearson = avgPairwise(clean, pearsonR);
|
|
5089
|
-
const spearman = avgPairwise(clean, spearmanR);
|
|
5090
|
-
const ciIcc = [NaN, NaN];
|
|
5091
|
-
const ciKappa = [NaN, NaN];
|
|
5092
|
-
if (bootstrap > 0) {
|
|
5093
|
-
const rng = mulberry32(seed);
|
|
5094
|
-
const iccs = [];
|
|
5095
|
-
const kappas = [];
|
|
5096
|
-
for (let b = 0; b < bootstrap; b++) {
|
|
5097
|
-
const sample = new Array(nClean);
|
|
5098
|
-
for (let i = 0; i < nClean; i++) {
|
|
5099
|
-
sample[i] = clean[Math.floor(rng() * nClean)];
|
|
5100
|
-
}
|
|
5101
|
-
const iccB = icc21(sample);
|
|
5102
|
-
const kB = continuousWeightedKappa(sample, weights);
|
|
5103
|
-
if (Number.isFinite(iccB)) iccs.push(iccB);
|
|
5104
|
-
if (Number.isFinite(kB)) kappas.push(kB);
|
|
5105
|
-
}
|
|
5106
|
-
const [lo, hi] = percentileBounds(ciLevel);
|
|
5107
|
-
if (iccs.length > 0) {
|
|
5108
|
-
iccs.sort((a, b) => a - b);
|
|
5109
|
-
ciIcc[0] = quantile(iccs, lo);
|
|
5110
|
-
ciIcc[1] = quantile(iccs, hi);
|
|
5111
|
-
}
|
|
5112
|
-
if (kappas.length > 0) {
|
|
5113
|
-
kappas.sort((a, b) => a - b);
|
|
5114
|
-
ciKappa[0] = quantile(kappas, lo);
|
|
5115
|
-
ciKappa[1] = quantile(kappas, hi);
|
|
5116
|
-
}
|
|
5117
|
-
}
|
|
5118
|
-
return {
|
|
5119
|
-
weightedKappa: kappa,
|
|
5120
|
-
icc,
|
|
5121
|
-
pearson,
|
|
5122
|
-
spearman,
|
|
5123
|
-
ci: { icc: ciIcc, weightedKappa: ciKappa },
|
|
5124
|
-
n: nClean,
|
|
5125
|
-
raters
|
|
5126
|
-
};
|
|
5127
|
-
}
|
|
5128
|
-
function calibrateJudgeContinuous(golden, candidate, opts = {}) {
|
|
5129
|
-
const base = calibrateJudge(golden, candidate);
|
|
5130
|
-
const map = /* @__PURE__ */ new Map();
|
|
5131
|
-
for (const g of golden) map.set(g.itemId, { h: g.humanScore, j: NaN });
|
|
5132
|
-
for (const c of candidate) {
|
|
5133
|
-
const entry = map.get(c.itemId);
|
|
5134
|
-
if (entry) entry.j = c.score;
|
|
5135
|
-
}
|
|
5136
|
-
const rows = [];
|
|
5137
|
-
for (const v of map.values()) {
|
|
5138
|
-
if (Number.isFinite(v.j)) rows.push([v.h, v.j]);
|
|
5139
|
-
}
|
|
5140
|
-
const agreement = continuousAgreement(rows, opts);
|
|
5141
|
-
return {
|
|
5142
|
-
...base,
|
|
5143
|
-
weightedKappaContinuous: agreement.weightedKappa,
|
|
5144
|
-
icc: agreement.icc,
|
|
5145
|
-
spearman: agreement.spearman,
|
|
5146
|
-
ci: agreement.ci
|
|
5147
|
-
};
|
|
5148
|
-
}
|
|
5149
|
-
function continuousWeightedKappa(rows, scheme) {
|
|
5150
|
-
if (rows.length === 0) return NaN;
|
|
5151
|
-
const raters = rows[0].length;
|
|
5152
|
-
if (raters < 2) return NaN;
|
|
5153
|
-
const wFn = scheme === "linear" ? (x, y) => Math.abs(x - y) : (x, y) => (x - y) ** 2;
|
|
5154
|
-
let sum2 = 0;
|
|
5155
|
-
let pairs = 0;
|
|
5156
|
-
for (let r1 = 0; r1 < raters; r1++) {
|
|
5157
|
-
for (let r2 = r1 + 1; r2 < raters; r2++) {
|
|
5158
|
-
const a = rows.map((row) => row[r1]);
|
|
5159
|
-
const b = rows.map((row) => row[r2]);
|
|
5160
|
-
const n = a.length;
|
|
5161
|
-
let obs = 0;
|
|
5162
|
-
for (let i = 0; i < n; i++) obs += wFn(a[i], b[i]);
|
|
5163
|
-
obs /= n;
|
|
5164
|
-
let exp = 0;
|
|
5165
|
-
for (let i = 0; i < n; i++) {
|
|
5166
|
-
for (let j = 0; j < n; j++) exp += wFn(a[i], b[j]);
|
|
5167
|
-
}
|
|
5168
|
-
exp /= n * n;
|
|
5169
|
-
if (exp === 0) {
|
|
5170
|
-
sum2 += obs === 0 ? 1 : 0;
|
|
5171
|
-
} else {
|
|
5172
|
-
sum2 += 1 - obs / exp;
|
|
5173
|
-
}
|
|
5174
|
-
pairs++;
|
|
5175
|
-
}
|
|
5176
|
-
}
|
|
5177
|
-
return pairs === 0 ? NaN : sum2 / pairs;
|
|
5178
|
-
}
|
|
5179
|
-
function icc21(rows) {
|
|
5180
|
-
const n = rows.length;
|
|
5181
|
-
if (n < 2) return NaN;
|
|
5182
|
-
const k = rows[0].length;
|
|
5183
|
-
if (k < 2) return NaN;
|
|
5184
|
-
const rowMeans = rows.map((row) => row.reduce((s, v) => s + v, 0) / k);
|
|
5185
|
-
const colMeans = new Array(k).fill(0);
|
|
5186
|
-
for (let j = 0; j < k; j++) {
|
|
5187
|
-
let s = 0;
|
|
5188
|
-
for (let i = 0; i < n; i++) s += rows[i][j];
|
|
5189
|
-
colMeans[j] = s / n;
|
|
5190
|
-
}
|
|
5191
|
-
let grand = 0;
|
|
5192
|
-
for (let i = 0; i < n; i++) grand += rowMeans[i];
|
|
5193
|
-
grand /= n;
|
|
5194
|
-
let ssR = 0;
|
|
5195
|
-
for (let i = 0; i < n; i++) ssR += (rowMeans[i] - grand) ** 2;
|
|
5196
|
-
ssR *= k;
|
|
5197
|
-
let ssC = 0;
|
|
5198
|
-
for (let j = 0; j < k; j++) ssC += (colMeans[j] - grand) ** 2;
|
|
5199
|
-
ssC *= n;
|
|
5200
|
-
let ssT = 0;
|
|
5201
|
-
for (let i = 0; i < n; i++) {
|
|
5202
|
-
for (let j = 0; j < k; j++) ssT += (rows[i][j] - grand) ** 2;
|
|
5203
|
-
}
|
|
5204
|
-
const ssE = ssT - ssR - ssC;
|
|
5205
|
-
const dfR = n - 1;
|
|
5206
|
-
const dfC = k - 1;
|
|
5207
|
-
const dfE = (n - 1) * (k - 1);
|
|
5208
|
-
const msR = ssR / dfR;
|
|
5209
|
-
const msC = ssC / dfC;
|
|
5210
|
-
const msE = dfE > 0 ? ssE / dfE : 0;
|
|
5211
|
-
const denom = msR + (k - 1) * msE + k * (msC - msE) / n;
|
|
5212
|
-
if (denom === 0) {
|
|
5213
|
-
return msR === 0 && msE === 0 ? 1 : 0;
|
|
5214
|
-
}
|
|
5215
|
-
return (msR - msE) / denom;
|
|
5216
|
-
}
|
|
5217
|
-
function avgPairwise(rows, fn) {
|
|
5218
|
-
const k = rows[0]?.length ?? 0;
|
|
5219
|
-
if (k < 2) return NaN;
|
|
5220
|
-
let sum2 = 0;
|
|
5221
|
-
let pairs = 0;
|
|
5222
|
-
for (let i = 0; i < k; i++) {
|
|
5223
|
-
for (let j = i + 1; j < k; j++) {
|
|
5224
|
-
const a = rows.map((row) => row[i]);
|
|
5225
|
-
const b = rows.map((row) => row[j]);
|
|
5226
|
-
const r = fn(a, b);
|
|
5227
|
-
if (Number.isFinite(r)) {
|
|
5228
|
-
sum2 += r;
|
|
5229
|
-
pairs++;
|
|
5230
|
-
}
|
|
5231
|
-
}
|
|
5232
|
-
}
|
|
5233
|
-
return pairs === 0 ? NaN : sum2 / pairs;
|
|
5234
|
-
}
|
|
5235
|
-
function spearmanR(a, b) {
|
|
5236
|
-
if (a.length !== b.length || a.length < 2) return NaN;
|
|
5237
|
-
return pearsonR(rankWithTies(a), rankWithTies(b));
|
|
5238
|
-
}
|
|
5239
|
-
function rankWithTies(xs) {
|
|
5240
|
-
const n = xs.length;
|
|
5241
|
-
const indexed = xs.map((v, i2) => ({ v, i: i2 }));
|
|
5242
|
-
indexed.sort((x, y) => x.v - y.v);
|
|
5243
|
-
const ranks = new Array(n).fill(0);
|
|
5244
|
-
let i = 0;
|
|
5245
|
-
while (i < n) {
|
|
5246
|
-
let j = i;
|
|
5247
|
-
while (j + 1 < n && indexed[j + 1].v === indexed[i].v) j++;
|
|
5248
|
-
const avg = (i + j) / 2 + 1;
|
|
5249
|
-
for (let k = i; k <= j; k++) ranks[indexed[k].i] = avg;
|
|
5250
|
-
i = j + 1;
|
|
5251
|
-
}
|
|
5252
|
-
return ranks;
|
|
5253
|
-
}
|
|
5254
|
-
function mulberry32(seed) {
|
|
5255
|
-
let a = seed >>> 0;
|
|
5256
|
-
return () => {
|
|
5257
|
-
a = a + 1831565813 >>> 0;
|
|
5258
|
-
let t = a;
|
|
5259
|
-
t = Math.imul(t ^ t >>> 15, t | 1);
|
|
5260
|
-
t ^= t + Math.imul(t ^ t >>> 7, t | 61);
|
|
5261
|
-
return ((t ^ t >>> 14) >>> 0) / 4294967296;
|
|
5262
|
-
};
|
|
5263
|
-
}
|
|
5264
|
-
function percentileBounds(ciLevel) {
|
|
5265
|
-
const tail = (1 - ciLevel) / 2;
|
|
5266
|
-
return [tail, 1 - tail];
|
|
5267
|
-
}
|
|
5268
|
-
function quantile(sorted, q) {
|
|
5269
|
-
if (sorted.length === 0) return NaN;
|
|
5270
|
-
if (sorted.length === 1) return sorted[0];
|
|
5271
|
-
const pos = q * (sorted.length - 1);
|
|
5272
|
-
const lo = Math.floor(pos);
|
|
5273
|
-
const hi = Math.ceil(pos);
|
|
5274
|
-
if (lo === hi) return sorted[lo];
|
|
5275
|
-
const frac = pos - lo;
|
|
5276
|
-
return sorted[lo] * (1 - frac) + sorted[hi] * frac;
|
|
5277
|
-
}
|
|
5278
|
-
|
|
5279
4967
|
// src/observability.ts
|
|
5280
4968
|
async function toLangfuseEnvelope(store, runId) {
|
|
5281
4969
|
const run = await store.getRun(runId);
|
|
@@ -6077,7 +5765,7 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
|
|
|
6077
5765
|
runCountByScenario.set(r.scenarioId, (runCountByScenario.get(r.scenarioId) ?? 0) + 1);
|
|
6078
5766
|
}
|
|
6079
5767
|
const runCounts = [...runCountByScenario.values()];
|
|
6080
|
-
const p25 = runCounts.length > 0 ?
|
|
5768
|
+
const p25 = runCounts.length > 0 ? quantile(runCounts, 0.25) : 0;
|
|
6081
5769
|
for (const s of scenarios) {
|
|
6082
5770
|
const count = runCountByScenario.get(s.id) ?? 0;
|
|
6083
5771
|
if (count <= p25 && count < 3) {
|
|
@@ -6131,7 +5819,7 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
|
|
|
6131
5819
|
}
|
|
6132
5820
|
return targets.sort((a, b) => b.priority - a.priority).slice(0, topK);
|
|
6133
5821
|
}
|
|
6134
|
-
function
|
|
5822
|
+
function quantile(xs, p) {
|
|
6135
5823
|
const sorted = [...xs].sort((a, b) => a - b);
|
|
6136
5824
|
const idx = p * (sorted.length - 1);
|
|
6137
5825
|
const lo = Math.floor(idx);
|
|
@@ -9446,6 +9134,8 @@ export {
|
|
|
9446
9134
|
controlFailureClassFromVerification,
|
|
9447
9135
|
controlRunToFeedbackTrajectory,
|
|
9448
9136
|
controlRunToRunRecord,
|
|
9137
|
+
corpusInterRaterAgreement,
|
|
9138
|
+
corpusInterRaterAgreementFromJudgeScores,
|
|
9449
9139
|
createAntiSlopJudge,
|
|
9450
9140
|
createCompositeMutator,
|
|
9451
9141
|
createCustomJudge,
|