@tangle-network/agent-eval 0.37.0 → 0.40.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/campaign/index.d.ts +695 -0
- package/dist/campaign/index.js +741 -0
- package/dist/campaign/index.js.map +1 -0
- package/dist/chunk-5U2DOJU4.js +565 -0
- package/dist/chunk-5U2DOJU4.js.map +1 -0
- package/dist/{chunk-KE7TDJUO.js → chunk-AU2JLNSZ.js} +2 -2
- package/dist/{chunk-TSPOEDM3.js → chunk-BWZEGTES.js} +2 -5
- package/dist/chunk-BWZEGTES.js.map +1 -0
- package/dist/{chunk-3HYQXPC2.js → chunk-DMW5VENN.js} +3 -3
- package/dist/{chunk-TQL7BAOY.js → chunk-EGIPWXHL.js} +2 -2
- package/dist/chunk-GGE4NNQT.js +65 -0
- package/dist/chunk-GGE4NNQT.js.map +1 -0
- package/dist/{chunk-7PR3WPWE.js → chunk-L7XMNXLO.js} +2 -2
- package/dist/{chunk-RL6TERL2.js → chunk-LCIDRYGP.js} +3 -3
- package/dist/{chunk-L5UNCDAJ.js → chunk-MAOZCN36.js} +2 -64
- package/dist/chunk-MAOZCN36.js.map +1 -0
- package/dist/{chunk-LGAPK7NA.js → chunk-NKLGKF2Q.js} +2 -2
- package/dist/chunk-QWV226SL.js +276 -0
- package/dist/chunk-QWV226SL.js.map +1 -0
- package/dist/chunk-TMXPFWC7.js +305 -0
- package/dist/chunk-TMXPFWC7.js.map +1 -0
- package/dist/{chunk-KHZRNY3F.js → chunk-WP7SY7AI.js} +5 -4
- package/dist/chunk-WP7SY7AI.js.map +1 -0
- package/dist/chunk-YV7J7X5N.js +313 -0
- package/dist/chunk-YV7J7X5N.js.map +1 -0
- package/dist/{control-DVrmvM_k.d.ts → control-CmLJk3IG.d.ts} +1 -1
- package/dist/control.d.ts +3 -3
- package/dist/control.js +2 -2
- package/dist/{dataset-ueRVTUoY.d.ts → dataset-BlwAtYYf.d.ts} +1 -1
- package/dist/{feedback-trajectory-iATEAHmc.d.ts → feedback-trajectory-Dvy-bt7x.d.ts} +1 -1
- package/dist/governance/index.d.ts +133 -5
- package/dist/index.d.ts +35 -34
- package/dist/index.js +97 -630
- package/dist/index.js.map +1 -1
- package/dist/matrix/index.d.ts +2 -109
- package/dist/matrix/index.js +5 -270
- package/dist/matrix/index.js.map +1 -1
- package/dist/multishot/index.d.ts +276 -0
- package/dist/multishot/index.js +516 -0
- package/dist/multishot/index.js.map +1 -0
- package/dist/openapi.json +1 -1
- package/dist/optimization.d.ts +2 -2
- package/dist/optimization.js +5 -5
- package/dist/pipelines/index.js +2 -2
- package/dist/red-team-30II1T4o.d.ts +63 -0
- package/dist/{release-report-D2ykiLSe.d.ts → release-report-Di84bXD7.d.ts} +5 -2
- package/dist/reporting.d.ts +2 -2
- package/dist/reporting.js +3 -3
- package/dist/rl.js +15 -315
- package/dist/rl.js.map +1 -1
- package/dist/run-campaign-JYJXYHHL.js +10 -0
- package/dist/run-campaign-JYJXYHHL.js.map +1 -0
- package/dist/traces.js +7 -5
- package/dist/types-DHqkLwEU.d.ts +110 -0
- package/dist/wire/index.d.ts +2 -2
- package/docs/design/loop-taxonomy.md +233 -0
- package/package.json +38 -24
- package/dist/chunk-KHZRNY3F.js.map +0 -1
- package/dist/chunk-L5UNCDAJ.js.map +0 -1
- package/dist/chunk-TSPOEDM3.js.map +0 -1
- package/dist/index-CN2agEaO.d.ts +0 -191
- /package/dist/{chunk-KE7TDJUO.js.map → chunk-AU2JLNSZ.js.map} +0 -0
- /package/dist/{chunk-3HYQXPC2.js.map → chunk-DMW5VENN.js.map} +0 -0
- /package/dist/{chunk-TQL7BAOY.js.map → chunk-EGIPWXHL.js.map} +0 -0
- /package/dist/{chunk-7PR3WPWE.js.map → chunk-L7XMNXLO.js.map} +0 -0
- /package/dist/{chunk-RL6TERL2.js.map → chunk-LCIDRYGP.js.map} +0 -0
- /package/dist/{chunk-LGAPK7NA.js.map → chunk-NKLGKF2Q.js.map} +0 -0
|
@@ -343,17 +343,18 @@ function weightedMean(scores) {
|
|
|
343
343
|
}
|
|
344
344
|
return totalWeight > 0 ? weightedSum / totalWeight : 0;
|
|
345
345
|
}
|
|
346
|
-
function confidenceInterval(scores, confidence = 0.95) {
|
|
346
|
+
function confidenceInterval(scores, confidence = 0.95, opts = {}) {
|
|
347
347
|
if (scores.length === 0) return { mean: 0, lower: 0, upper: 0 };
|
|
348
348
|
if (scores.length === 1) return { mean: scores[0], lower: scores[0], upper: scores[0] };
|
|
349
349
|
const n = scores.length;
|
|
350
350
|
const mean = scores.reduce((a, b) => a + b, 0) / n;
|
|
351
|
-
const B = 1e3;
|
|
351
|
+
const B = opts.resamples ?? 1e3;
|
|
352
|
+
const rng = makeRng(opts.seed);
|
|
352
353
|
const bootstrapMeans = [];
|
|
353
354
|
for (let i = 0; i < B; i++) {
|
|
354
355
|
let sum = 0;
|
|
355
356
|
for (let j = 0; j < n; j++) {
|
|
356
|
-
sum += scores[Math.floor(
|
|
357
|
+
sum += scores[Math.floor(rng() * n)];
|
|
357
358
|
}
|
|
358
359
|
bootstrapMeans.push(sum / n);
|
|
359
360
|
}
|
|
@@ -880,4 +881,4 @@ export {
|
|
|
880
881
|
benjaminiHochberg,
|
|
881
882
|
pairedBootstrap
|
|
882
883
|
};
|
|
883
|
-
//# sourceMappingURL=chunk-
|
|
884
|
+
//# sourceMappingURL=chunk-WP7SY7AI.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/judge-calibration.ts","../src/statistics.ts"],"sourcesContent":["/**\n * Judge calibration — measure judge quality against human gold + bias.\n *\n * Workflow:\n * 1. Build a golden set: {itemId, humanScore}[].\n * 2. Run candidate judges; each produces {itemId, score}.\n * 3. `calibrateJudge(golden, candidate)` reports κ + Pearson + MAE.\n * 4. `calibrateJudgeContinuous(golden, candidate)` adds quadratic-weighted\n * κ over the un-rounded [0,1] scores plus ICC(2,1), Pearson, Spearman,\n * and bootstrap CIs — use this for fine-grained judges where rounding\n * to int discards information (e.g. 0.78 vs 0.81 both round to 1 and\n * look \"perfectly agreed\" to integer κ).\n * 5. Run bias probes (positional, verbosity, self-preference) to\n * detect systematic score inflation.\n * 6. For N≥2 judges on the same items, `continuousAgreement(scores)`\n * reports ICC(2,1) + κ_w + Pearson + Spearman with bootstrap CIs.\n *\n * Returns actionable diagnostics, not a single number. Consumers then\n * decide whether to trust the judge, retrain it, or add a tie-breaker.\n */\n\nexport interface GoldenItem {\n itemId: string\n humanScore: number\n /** Optional group used for per-group bias audits (e.g. model-of-output family). */\n group?: string\n}\n\nexport interface CandidateScore {\n itemId: string\n score: number\n /** Optional — enables positional-bias analysis (did order matter?). */\n positionOfAInput?: 'first' | 'second'\n}\n\nexport interface CalibrationResult {\n n: number\n pearson: number\n /** Cohen's κ with quadratic weights over integer-rounded scores. */\n kappa: number\n /** Mean absolute error vs human. */\n mae: number\n /** Worst-5 miscalibrations (largest |judge - human|). 
*/\n worstItems: Array<{ itemId: string; judge: number; human: number; delta: number }>\n}\n\nexport function calibrateJudge(\n golden: GoldenItem[],\n candidate: CandidateScore[],\n): CalibrationResult {\n const map = new Map<string, { h: number; j: number }>()\n for (const g of golden) map.set(g.itemId, { h: g.humanScore, j: NaN })\n for (const c of candidate) {\n const entry = map.get(c.itemId)\n if (entry) entry.j = c.score\n }\n const common = [...map.values()].filter((v) => Number.isFinite(v.j))\n const n = common.length\n if (n < 2) {\n return { n, pearson: NaN, kappa: NaN, mae: NaN, worstItems: [] }\n }\n const humans = common.map((c) => c.h)\n const judges = common.map((c) => c.j)\n const pearson = pearsonR(humans, judges)\n const kappa = weightedKappa(humans.map(Math.round), judges.map(Math.round))\n const absDiffs = common.map((c) => Math.abs(c.j - c.h))\n const mae = absDiffs.reduce((a, b) => a + b, 0) / n\n const worst = [...map.entries()]\n .filter(([, v]) => Number.isFinite(v.j))\n .map(([itemId, v]) => ({ itemId, judge: v.j, human: v.h, delta: Math.abs(v.j - v.h) }))\n .sort((a, b) => b.delta - a.delta)\n .slice(0, 5)\n return { n, pearson, kappa, mae, worstItems: worst }\n}\n\nexport interface PositionalBiasResult {\n /**\n * Score delta (first-position - second-position) averaged across items\n * presented in both positions. Non-zero = positional bias.\n */\n avgDelta: number\n n: number\n}\n\n/**\n * Feed the same items to the judge twice with A/B swapped and pass all\n * results here. Items that don't appear in both positions are ignored.\n */\nexport function positionalBias(scores: CandidateScore[]): PositionalBiasResult {\n const pairs = new Map<string, { first?: number; second?: number }>()\n for (const s of scores) {\n const slot = pairs.get(s.itemId) ?? 
{}\n if (s.positionOfAInput === 'first') slot.first = s.score\n else if (s.positionOfAInput === 'second') slot.second = s.score\n pairs.set(s.itemId, slot)\n }\n const deltas: number[] = []\n for (const { first, second } of pairs.values()) {\n if (first !== undefined && second !== undefined) deltas.push(first - second)\n }\n if (deltas.length === 0) return { avgDelta: 0, n: 0 }\n return { avgDelta: deltas.reduce((a, b) => a + b, 0) / deltas.length, n: deltas.length }\n}\n\nexport interface VerbosityBiasResult {\n /** Pearson correlation between output length and score. Strong positive = verbosity bias. */\n pearson: number\n n: number\n}\n\nexport function verbosityBias(\n samples: Array<{ outputLen: number; score: number }>,\n): VerbosityBiasResult {\n const n = samples.length\n if (n < 3) return { pearson: NaN, n }\n return {\n pearson: pearsonR(\n samples.map((s) => s.outputLen),\n samples.map((s) => s.score),\n ),\n n,\n }\n}\n\nexport interface SelfPreferenceResult {\n /** Mean judge score when judge's family matches output's family. */\n inFamilyMean: number\n outOfFamilyMean: number\n deltaMean: number\n n: number\n}\n\n/**\n * Pass the same scenarios scored with judge-model X grading outputs from\n * model X (in-family) and model Y (out-of-family). 
Non-zero delta\n * indicates self-preference.\n */\nexport function selfPreference(\n samples: Array<{ score: number; inFamily: boolean }>,\n): SelfPreferenceResult {\n const inF = samples.filter((s) => s.inFamily).map((s) => s.score)\n const outF = samples.filter((s) => !s.inFamily).map((s) => s.score)\n if (inF.length === 0 || outF.length === 0)\n return { inFamilyMean: 0, outOfFamilyMean: 0, deltaMean: 0, n: 0 }\n const inMean = inF.reduce((a, b) => a + b, 0) / inF.length\n const outMean = outF.reduce((a, b) => a + b, 0) / outF.length\n return {\n inFamilyMean: inMean,\n outOfFamilyMean: outMean,\n deltaMean: inMean - outMean,\n n: samples.length,\n }\n}\n\n// ── Helpers ──────────────────────────────────────────────────────────\n\nfunction pearsonR(a: number[], b: number[]): number {\n if (a.length !== b.length || a.length < 2) return NaN\n const mA = a.reduce((s, v) => s + v, 0) / a.length\n const mB = b.reduce((s, v) => s + v, 0) / b.length\n let num = 0,\n dA = 0,\n dB = 0\n for (let i = 0; i < a.length; i++) {\n const da = a[i]! - mA\n const db = b[i]! - mB\n num += da * db\n dA += da * da\n dB += db * db\n }\n if (dA === 0 || dB === 0) return dA === 0 && dB === 0 ? 1 : 0\n return num / Math.sqrt(dA * dB)\n}\n\n/** Quadratic weighted Cohen's κ over bounded integer scores. */\nfunction weightedKappa(a: number[], b: number[]): number {\n if (a.length !== b.length || a.length === 0) return NaN\n const min = Math.min(...a, ...b)\n const max = Math.max(...a, ...b)\n const K = max - min + 1\n if (K < 2) return 1\n const observed: number[][] = Array.from({ length: K }, () => new Array(K).fill(0))\n const rowMarg = new Array(K).fill(0)\n const colMarg = new Array(K).fill(0)\n for (let i = 0; i < a.length; i++) {\n const ai = a[i]! - min\n const bi = b[i]! - min\n const row = observed[ai]!\n row[bi] = (row[bi] ?? 
0) + 1\n rowMarg[ai]++\n colMarg[bi]++\n }\n let num = 0\n let den = 0\n for (let i = 0; i < K; i++) {\n for (let j = 0; j < K; j++) {\n const w = (i - j) ** 2 / (K - 1) ** 2\n const expected = (rowMarg[i] * colMarg[j]) / a.length\n num += w * observed[i]![j]!\n den += w * expected\n }\n }\n if (den === 0) return 1\n return 1 - num / den\n}\n\n// ── Continuous-value inter-rater agreement ──────────────────────────\n//\n// `weightedKappa` above quantises to integers. For [0,1] judges that is\n// lossy: 0.78 and 0.81 both round to 1 and look identical to integer κ.\n// `continuousAgreement` keeps the original magnitudes:\n//\n// - κ_w : Cohen's weighted κ generalised to continuous scores. For two\n// raters, weighted κ = 1 − Σ w_ij·o_ij / Σ w_ij·e_ij collapses\n// into the closed form below when each \"category\" is the score\n// itself (Janson & Olsson 2001; Cohen 1968 quadratic weights).\n// - ICC : Intraclass Correlation Coefficient, ICC(2,1) per Shrout &\n// Fleiss (1979) — two-way random effects, absolute agreement,\n// single rater. The right reliability coefficient when judges\n// are a random sample of the judge population.\n// - r, ρ : Pearson + Spearman for sanity-check. Pearson catches linear\n// association regardless of scale; Spearman catches monotone\n// association regardless of distribution.\n// - CIs : Bootstrap percentile CIs over items (rows). Bias is preserved\n// because we resample the full row, i.e. a judge's vector\n// stays internally consistent within each bootstrap replicate.\n//\n// References:\n// - Shrout, P. E. & Fleiss, J. L. (1979). Intraclass correlations: uses\n// in assessing rater reliability. Psychological Bulletin, 86(2), 420.\n// - McGraw, K. O. & Wong, S. P. (1996). Forming inferences about some\n// intraclass correlation coefficients. Psychological Methods, 1, 30.\n// - Janson, H. & Olsson, U. (2001). A measure of agreement for interval\n// or nominal multivariate observations. Educ. Psychol. 
Meas., 61, 277.\n\nexport interface ContinuousAgreement {\n /** Cohen's κ_w with quadratic weights, computed on raw [0,1] scores. */\n weightedKappa: number\n /** ICC(2,1): two-way random effects, absolute agreement, single rater. */\n icc: number\n /** Pearson product-moment correlation (averaged over rater pairs if N>2). */\n pearson: number\n /** Spearman rank correlation (averaged over rater pairs if N>2). */\n spearman: number\n /** 95% bootstrap percentile CIs over items. */\n ci: {\n icc: [number, number]\n weightedKappa: [number, number]\n }\n /** Number of complete items (no NaN across raters). */\n n: number\n /** Number of raters. */\n raters: number\n}\n\nexport interface ContinuousAgreementOptions {\n /** Bootstrap iterations. Default 1000. Set to 0 to skip CIs (CI = [NaN, NaN]). */\n bootstrap?: number\n /** κ weighting scheme. Default 'quadratic'. */\n weights?: 'linear' | 'quadratic'\n /** PRNG seed for reproducible bootstrap. Default 0xC0FFEE. */\n seed?: number\n /** Confidence level for percentile CI. Default 0.95. */\n ciLevel?: number\n}\n\n/**\n * Inter-rater agreement on continuous (typically [0,1]) scores.\n *\n * `scores` has shape [n_items][n_raters]. Rows with any non-finite entry\n * are dropped. Returns NaN metrics if fewer than 2 raters or 2 complete\n * items remain.\n */\nexport function continuousAgreement(\n scores: number[][],\n opts: ContinuousAgreementOptions = {},\n): ContinuousAgreement {\n const bootstrap = opts.bootstrap ?? 1000\n const weights = opts.weights ?? 'quadratic'\n const seed = opts.seed ?? 0xc0ffee\n const ciLevel = opts.ciLevel ?? 0.95\n\n const matrix = scores.filter((row) => row.length >= 2 && row.every((v) => Number.isFinite(v)))\n const raters = matrix[0]?.length ?? 
0\n // All rows must have the same rater count, else drop the offenders.\n const clean = matrix.filter((row) => row.length === raters)\n const nClean = clean.length\n\n if (nClean < 2 || raters < 2) {\n return {\n weightedKappa: NaN,\n icc: NaN,\n pearson: NaN,\n spearman: NaN,\n ci: { icc: [NaN, NaN], weightedKappa: [NaN, NaN] },\n n: nClean,\n raters,\n }\n }\n\n const kappa = continuousWeightedKappa(clean, weights)\n const icc = icc21(clean)\n const pearson = avgPairwise(clean, pearsonR)\n const spearman = avgPairwise(clean, spearmanR)\n\n const ciIcc: [number, number] = [NaN, NaN]\n const ciKappa: [number, number] = [NaN, NaN]\n if (bootstrap > 0) {\n const rng = mulberry32(seed)\n const iccs: number[] = []\n const kappas: number[] = []\n for (let b = 0; b < bootstrap; b++) {\n const sample: number[][] = new Array(nClean)\n for (let i = 0; i < nClean; i++) {\n sample[i] = clean[Math.floor(rng() * nClean)]!\n }\n const iccB = icc21(sample)\n const kB = continuousWeightedKappa(sample, weights)\n if (Number.isFinite(iccB)) iccs.push(iccB)\n if (Number.isFinite(kB)) kappas.push(kB)\n }\n const [lo, hi] = percentileBounds(ciLevel)\n if (iccs.length > 0) {\n iccs.sort((a, b) => a - b)\n ciIcc[0] = quantile(iccs, lo)\n ciIcc[1] = quantile(iccs, hi)\n }\n if (kappas.length > 0) {\n kappas.sort((a, b) => a - b)\n ciKappa[0] = quantile(kappas, lo)\n ciKappa[1] = quantile(kappas, hi)\n }\n }\n\n return {\n weightedKappa: kappa,\n icc,\n pearson,\n spearman,\n ci: { icc: ciIcc, weightedKappa: ciKappa },\n n: nClean,\n raters,\n }\n}\n\nexport interface ContinuousCalibrationResult extends CalibrationResult {\n /** Cohen's κ_w computed on raw (un-rounded) scores. */\n weightedKappaContinuous: number\n /** ICC(2,1) treating golden + candidate as two raters. */\n icc: number\n spearman: number\n ci: {\n icc: [number, number]\n weightedKappa: [number, number]\n }\n}\n\n/**\n * Drop-in superset of `calibrateJudge` that adds continuous-value\n * agreement metrics. 
The old fields (n, pearson, kappa, mae, worstItems)\n * are preserved unchanged so existing callers continue to work.\n */\nexport function calibrateJudgeContinuous(\n golden: GoldenItem[],\n candidate: CandidateScore[],\n opts: ContinuousAgreementOptions = {},\n): ContinuousCalibrationResult {\n const base = calibrateJudge(golden, candidate)\n const map = new Map<string, { h: number; j: number }>()\n for (const g of golden) map.set(g.itemId, { h: g.humanScore, j: NaN })\n for (const c of candidate) {\n const entry = map.get(c.itemId)\n if (entry) entry.j = c.score\n }\n const rows: number[][] = []\n for (const v of map.values()) {\n if (Number.isFinite(v.j)) rows.push([v.h, v.j])\n }\n const agreement = continuousAgreement(rows, opts)\n return {\n ...base,\n weightedKappaContinuous: agreement.weightedKappa,\n icc: agreement.icc,\n spearman: agreement.spearman,\n ci: agreement.ci,\n }\n}\n\n// ── Continuous-agreement internals ──────────────────────────────────\n\n/**\n * Quadratic-weighted κ on continuous scores. With weights w(x,y) = (x-y)^2\n * (or |x-y| for linear) the formula collapses to:\n *\n * κ_w = 1 − E_obs[w] / E_exp[w]\n *\n * where E_obs averages w over paired (a_i, b_i) and E_exp averages w over\n * the independent product distribution (sum_{i,j} w(a_i, b_j) / n^2).\n * The normalisation by (max-min)^2 in the integer version cancels in the\n * ratio, so we don't need it here. Generalises to N raters by averaging κ_w\n * over all rater pairs (mean pairwise weighted agreement).\n */\nfunction continuousWeightedKappa(rows: number[][], scheme: 'linear' | 'quadratic'): number {\n if (rows.length === 0) return NaN\n const raters = rows[0]!.length\n if (raters < 2) return NaN\n const wFn =\n scheme === 'linear'\n ? 
(x: number, y: number) => Math.abs(x - y)\n : (x: number, y: number) => (x - y) ** 2\n let sum = 0\n let pairs = 0\n for (let r1 = 0; r1 < raters; r1++) {\n for (let r2 = r1 + 1; r2 < raters; r2++) {\n const a = rows.map((row) => row[r1]!)\n const b = rows.map((row) => row[r2]!)\n const n = a.length\n let obs = 0\n for (let i = 0; i < n; i++) obs += wFn(a[i]!, b[i]!)\n obs /= n\n // Expected under independence: average of w over all i,j cross pairs.\n let exp = 0\n for (let i = 0; i < n; i++) {\n for (let j = 0; j < n; j++) exp += wFn(a[i]!, b[j]!)\n }\n exp /= n * n\n if (exp === 0) {\n // Degenerate: at least one rater is constant. Convention: perfect\n // agreement iff observed disagreement is also zero, else 0.\n sum += obs === 0 ? 1 : 0\n } else {\n sum += 1 - obs / exp\n }\n pairs++\n }\n }\n return pairs === 0 ? NaN : sum / pairs\n}\n\n/**\n * ICC(2,1) — two-way random effects, absolute agreement, single rater.\n *\n * ICC(2,1) = (MSR − MSE) / (MSR + (k−1)·MSE + k·(MSC − MSE)/n)\n *\n * where MSR = between-rows MS, MSC = between-columns MS, MSE = residual MS,\n * n = rows (items), k = columns (raters).\n */\nfunction icc21(rows: number[][]): number {\n const n = rows.length\n if (n < 2) return NaN\n const k = rows[0]!.length\n if (k < 2) return NaN\n\n // Row means.\n const rowMeans = rows.map((row) => row.reduce((s, v) => s + v, 0) / k)\n // Column means.\n const colMeans = new Array(k).fill(0)\n for (let j = 0; j < k; j++) {\n let s = 0\n for (let i = 0; i < n; i++) s += rows[i]![j]!\n colMeans[j] = s / n\n }\n // Grand mean.\n let grand = 0\n for (let i = 0; i < n; i++) grand += rowMeans[i]!\n grand /= n\n\n let ssR = 0\n for (let i = 0; i < n; i++) ssR += (rowMeans[i]! - grand) ** 2\n ssR *= k\n let ssC = 0\n for (let j = 0; j < k; j++) ssC += (colMeans[j]! - grand) ** 2\n ssC *= n\n let ssT = 0\n for (let i = 0; i < n; i++) {\n for (let j = 0; j < k; j++) ssT += (rows[i]![j]! 
- grand) ** 2\n }\n const ssE = ssT - ssR - ssC\n\n const dfR = n - 1\n const dfC = k - 1\n const dfE = (n - 1) * (k - 1)\n const msR = ssR / dfR\n const msC = ssC / dfC\n const msE = dfE > 0 ? ssE / dfE : 0\n\n const denom = msR + (k - 1) * msE + (k * (msC - msE)) / n\n if (denom === 0) {\n // Degenerate (all values equal): convention = 1 if rows identical,\n // else 0. Identical rows ⇒ msR = 0 too, so msR − msE near 0.\n return msR === 0 && msE === 0 ? 1 : 0\n }\n return (msR - msE) / denom\n}\n\n/** Average pairwise statistic over all rater pairs. */\nfunction avgPairwise(rows: number[][], fn: (a: number[], b: number[]) => number): number {\n const k = rows[0]?.length ?? 0\n if (k < 2) return NaN\n let sum = 0\n let pairs = 0\n for (let i = 0; i < k; i++) {\n for (let j = i + 1; j < k; j++) {\n const a = rows.map((row) => row[i]!)\n const b = rows.map((row) => row[j]!)\n const r = fn(a, b)\n if (Number.isFinite(r)) {\n sum += r\n pairs++\n }\n }\n }\n return pairs === 0 ? NaN : sum / pairs\n}\n\n/** Spearman rank correlation. Ties get average ranks. */\nfunction spearmanR(a: number[], b: number[]): number {\n if (a.length !== b.length || a.length < 2) return NaN\n return pearsonR(rankWithTies(a), rankWithTies(b))\n}\n\nfunction rankWithTies(xs: number[]): number[] {\n const n = xs.length\n const indexed = xs.map((v, i) => ({ v, i }))\n indexed.sort((x, y) => x.v - y.v)\n const ranks = new Array(n).fill(0)\n let i = 0\n while (i < n) {\n let j = i\n while (j + 1 < n && indexed[j + 1]!.v === indexed[i]!.v) j++\n // Average rank for ties (ranks are 1-indexed).\n const avg = (i + j) / 2 + 1\n for (let k = i; k <= j; k++) ranks[indexed[k]!.i] = avg\n i = j + 1\n }\n return ranks\n}\n\n/** Seeded PRNG — Mulberry32. Deterministic across platforms. 
*/\nfunction mulberry32(seed: number): () => number {\n let a = seed >>> 0\n return () => {\n a = (a + 0x6d2b79f5) >>> 0\n let t = a\n t = Math.imul(t ^ (t >>> 15), t | 1)\n t ^= t + Math.imul(t ^ (t >>> 7), t | 61)\n return ((t ^ (t >>> 14)) >>> 0) / 4294967296\n }\n}\n\nfunction percentileBounds(ciLevel: number): [number, number] {\n const tail = (1 - ciLevel) / 2\n return [tail, 1 - tail]\n}\n\n/** Linear-interpolated quantile of a pre-sorted ascending array. */\nfunction quantile(sorted: number[], q: number): number {\n if (sorted.length === 0) return NaN\n if (sorted.length === 1) return sorted[0]!\n const pos = q * (sorted.length - 1)\n const lo = Math.floor(pos)\n const hi = Math.ceil(pos)\n if (lo === hi) return sorted[lo]!\n const frac = pos - lo\n return sorted[lo]! * (1 - frac) + sorted[hi]! * frac\n}\n","import { ValidationError } from './errors'\nimport {\n type ContinuousAgreement,\n type ContinuousAgreementOptions,\n continuousAgreement,\n} from './judge-calibration'\nimport type { JudgeScore } from './types'\n\n/** Dimensions where lower raw score = better outcome (inverted semantics) */\nconst INVERTED_DIMENSIONS = new Set(['hallucination', 'false_confidence', 'worst_failure'])\n\n/**\n * Normalize scores so all dimensions follow \"higher = better\".\n * Inverted dimensions (hallucination, false_confidence, worst_failure)\n * already use inverted scoring in the prompt (10 = no hallucination),\n * but this function ensures consistency if raw scores leak through.\n */\nexport function normalizeScores(scores: JudgeScore[]): JudgeScore[] {\n return scores.map((s) => {\n if (INVERTED_DIMENSIONS.has(s.dimension)) {\n return s\n }\n return s\n })\n}\n\n/** Weighted mean — falls back to uniform weights when omitted */\nexport function weightedMean(scores: { score: number; weight?: number }[]): number {\n if (scores.length === 0) return 0\n let totalWeight = 0\n let weightedSum = 0\n for (const { score, weight } of scores) {\n const w = weight ?? 
1\n weightedSum += score * w\n totalWeight += w\n }\n return totalWeight > 0 ? weightedSum / totalWeight : 0\n}\n\n/** Bootstrap confidence interval */\nexport function confidenceInterval(\n scores: number[],\n confidence = 0.95,\n opts: { seed?: number; resamples?: number } = {},\n): { mean: number; lower: number; upper: number } {\n if (scores.length === 0) return { mean: 0, lower: 0, upper: 0 }\n if (scores.length === 1) return { mean: scores[0]!, lower: scores[0]!, upper: scores[0]! }\n\n const n = scores.length\n const mean = scores.reduce((a, b) => a + b, 0) / n\n\n const B = opts.resamples ?? 1000\n const rng = makeRng(opts.seed)\n const bootstrapMeans: number[] = []\n\n for (let i = 0; i < B; i++) {\n let sum = 0\n for (let j = 0; j < n; j++) {\n sum += scores[Math.floor(rng() * n)]!\n }\n bootstrapMeans.push(sum / n)\n }\n\n bootstrapMeans.sort((a, b) => a - b)\n\n const alpha = 1 - confidence\n const lowerIdx = Math.floor((alpha / 2) * B)\n const upperIdx = Math.floor((1 - alpha / 2) * B) - 1\n\n return {\n mean,\n lower: bootstrapMeans[lowerIdx]!,\n upper: bootstrapMeans[Math.min(upperIdx, B - 1)]!,\n }\n}\n\n/**\n * Inter-rater reliability — simplified Krippendorff's alpha.\n *\n * Each inner array is one judge's scores for all items.\n * All arrays must have the same length (same items scored).\n */\nexport function interRaterReliability(judgeScores: JudgeScore[][]): number {\n if (judgeScores.length < 2) return 1\n\n // Group scores by dimension across judges\n const dimensionMap = new Map<string, number[][]>()\n for (const judgeSet of judgeScores) {\n for (const s of judgeSet) {\n if (!dimensionMap.has(s.dimension)) dimensionMap.set(s.dimension, [])\n const arr = dimensionMap.get(s.dimension)!\n if (arr.length === 0 || arr[arr.length - 1]!.length >= judgeScores.length) {\n arr.push([s.score])\n } else {\n arr[arr.length - 1]!.push(s.score)\n }\n }\n }\n\n // Collect all paired ratings\n const allValues: number[] = []\n const pairDiffs: number[] = 
[]\n\n for (const items of dimensionMap.values()) {\n for (const ratings of items) {\n if (ratings.length < 2) continue\n for (const v of ratings) allValues.push(v)\n for (let i = 0; i < ratings.length; i++) {\n for (let j = i + 1; j < ratings.length; j++) {\n pairDiffs.push((ratings[i]! - ratings[j]!) ** 2)\n }\n }\n }\n }\n\n if (pairDiffs.length === 0 || allValues.length < 2) return 1\n\n const observedDisagreement = pairDiffs.reduce((a, b) => a + b, 0) / pairDiffs.length\n\n // Expected disagreement from all possible pairings of values\n let expectedDisagreement = 0\n let expectedCount = 0\n for (let i = 0; i < allValues.length; i++) {\n for (let j = i + 1; j < allValues.length; j++) {\n expectedDisagreement += (allValues[i]! - allValues[j]!) ** 2\n expectedCount++\n }\n }\n expectedDisagreement = expectedCount > 0 ? expectedDisagreement / expectedCount : 0\n\n if (expectedDisagreement === 0) return 1\n return 1 - observedDisagreement / expectedDisagreement\n}\n\n/**\n * Mann-Whitney U test for comparing two independent groups.\n * Returns U statistic and approximate p-value (normal approximation).\n */\nexport function mannWhitneyU(a: number[], b: number[]): { u: number; p: number } {\n if (a.length === 0 || b.length === 0) return { u: 0, p: 1 }\n\n const n1 = a.length\n const n2 = b.length\n\n // Rank all values together\n const combined = [\n ...a.map((v) => ({ v, group: 'a' as const })),\n ...b.map((v) => ({ v, group: 'b' as const })),\n ].sort((x, y) => x.v - y.v)\n\n // Assign ranks with tie handling\n const ranks: number[] = new Array(combined.length)\n let i = 0\n while (i < combined.length) {\n let j = i\n while (j < combined.length && combined[j]!.v === combined[i]!.v) j++\n const avgRank = (i + 1 + j) / 2\n for (let k = i; k < j; k++) ranks[k] = avgRank\n i = j\n }\n\n // Sum ranks for group a\n let r1 = 0\n for (let k = 0; k < combined.length; k++) {\n if (combined[k]!.group === 'a') r1 += ranks[k]!\n }\n\n const u1 = r1 - (n1 * (n1 + 1)) / 2\n 
const u2 = n1 * n2 - u1\n const u = Math.min(u1, u2)\n\n // Normal approximation for p-value\n const mu = (n1 * n2) / 2\n const sigma = Math.sqrt((n1 * n2 * (n1 + n2 + 1)) / 12)\n\n if (sigma === 0) return { u, p: 1 }\n\n const z = Math.abs(u - mu) / sigma\n // Two-tailed p-value from z-score (approximation)\n const p = 2 * (1 - normalCdf(z))\n\n return { u, p }\n}\n\n/** Partial credit: returns 0-1 ratio of current toward target */\nexport function partialCredit(current: number, target: number): number {\n if (target <= 0) return 1\n return Math.min(1, Math.max(0, current / target))\n}\n\n/**\n * Paired t-test — before/after measurements on the SAME items.\n * Pairing removes inter-item variance, giving tighter significance than\n * an unpaired test when comparing prompt v1 vs prompt v2 on identical\n * scenarios.\n */\nexport function pairedTTest(\n before: number[],\n after: number[],\n): { t: number; df: number; p: number } {\n if (before.length !== after.length) {\n throw new ValidationError(\n `pairedTTest: unequal sample sizes (${before.length} vs ${after.length})`,\n )\n }\n const n = before.length\n if (n < 2) return { t: 0, df: 0, p: 1 }\n\n const diffs = before.map((b, i) => after[i]! - b)\n const mean = diffs.reduce((a, b) => a + b, 0) / n\n const variance = diffs.reduce((acc, d) => acc + (d - mean) ** 2, 0) / (n - 1)\n const se = Math.sqrt(variance / n)\n if (se === 0) return { t: mean === 0 ? 0 : Infinity, df: n - 1, p: mean === 0 ? 
1 : 0 }\n\n const t = mean / se\n const df = n - 1\n const p = 2 * (1 - studentTCdf(Math.abs(t), df))\n return { t, df, p }\n}\n\n/**\n * Wilcoxon signed-rank test — paired non-parametric alternative.\n * Use when the differences aren't normally distributed.\n */\nexport function wilcoxonSignedRank(before: number[], after: number[]): { w: number; p: number } {\n if (before.length !== after.length) {\n throw new ValidationError(\n `wilcoxonSignedRank: unequal sample sizes (${before.length} vs ${after.length})`,\n )\n }\n const diffs = before.map((b, i) => after[i]! - b).filter((d) => d !== 0)\n const n = diffs.length\n if (n < 6) return { w: 0, p: 1 }\n\n const absRanks = diffs\n .map((d, i) => ({ abs: Math.abs(d), sign: Math.sign(d), i }))\n .sort((a, b) => a.abs - b.abs)\n const ranks: number[] = new Array(n)\n let i = 0\n while (i < n) {\n let j = i\n while (j < n && absRanks[j]!.abs === absRanks[i]!.abs) j++\n const avg = (i + 1 + j) / 2\n for (let k = i; k < j; k++) ranks[absRanks[k]!.i] = avg\n i = j\n }\n let wPlus = 0\n for (let k = 0; k < n; k++) if (diffs[k]! 
> 0) wPlus += ranks[k]!\n\n const mean = (n * (n + 1)) / 4\n const variance = (n * (n + 1) * (2 * n + 1)) / 24\n const z = (wPlus - mean) / Math.sqrt(variance)\n const p = 2 * (1 - normalCdf(Math.abs(z)))\n return { w: wPlus, p }\n}\n\n/**\n * Cohen's d — standardized effect size for two independent groups.\n * Positive d means group b has higher mean than group a.\n * Rule of thumb: |d| < 0.2 negligible, 0.2–0.5 small, 0.5–0.8 medium, > 0.8 large.\n */\nexport function cohensD(a: number[], b: number[]): number {\n if (a.length < 2 || b.length < 2) return 0\n const meanA = a.reduce((x, y) => x + y, 0) / a.length\n const meanB = b.reduce((x, y) => x + y, 0) / b.length\n const varA = a.reduce((acc, x) => acc + (x - meanA) ** 2, 0) / (a.length - 1)\n const varB = b.reduce((acc, x) => acc + (x - meanB) ** 2, 0) / (b.length - 1)\n const pooled = Math.sqrt(\n ((a.length - 1) * varA + (b.length - 1) * varB) / (a.length + b.length - 2),\n )\n if (pooled === 0) return 0\n return (meanB - meanA) / pooled\n}\n\n// ── Corpus-wide inter-rater agreement ──────────────────────────────\n//\n// The legacy `interRaterReliability(judgeScores)` computes a within-item\n// Krippendorff α — multiple judges score *the same item* and we ask\n// \"how much do their scores agree on that item?\" Useful for a single\n// scenario, but it cannot answer \"how reliable are these judges across\n// the whole evaluation corpus?\"\n//\n// `corpusInterRaterAgreement` does the corpus-wide question properly.\n// Inputs are flat per-(item, judge, dimension) score records. For each\n// dimension we pivot to a complete [n_items × n_judges] matrix and feed\n// it to the ICC(2,1) + κ_w machinery already validated in\n// `judge-calibration.ts`. An overall pooled metric averages the\n// per-dimension ICC/κ across dimensions.\n\nexport interface CorpusScoreRecord {\n /** Stable identifier for the rated item (scenario, span, turn, …). */\n itemId: string\n /** Identifier for the judge that produced this score. 
*/\n judgeName: string\n /** Dimension name (matches `JudgeScore.dimension`). */\n dimension: string\n /** Numeric score; must be finite. */\n score: number\n}\n\nexport interface CorpusAgreementPerDimension extends ContinuousAgreement {\n dimension: string\n /** Item IDs that contributed to this dimension's matrix (every judge scored them). */\n itemIds: string[]\n /** Judge IDs that contributed to this dimension's matrix. */\n judgeIds: string[]\n}\n\nexport interface CorpusAgreementReport {\n /** Per-dimension ICC(2,1) + κ_w + Pearson + Spearman + bootstrap CIs. */\n perDimension: CorpusAgreementPerDimension[]\n /** Mean ICC across dimensions (NaN if no dimension yielded a finite ICC). */\n overallIcc: number\n /** Mean weighted κ across dimensions (NaN if none finite). */\n overallWeightedKappa: number\n /** Dimensions evaluated (sorted). */\n dimensions: string[]\n /** Judges seen across the corpus (sorted). */\n judgeIds: string[]\n}\n\nexport interface CorpusAgreementOptions extends ContinuousAgreementOptions {\n /**\n * Restrict the audit to these dimensions. Default = every dimension\n * that appears in the input. A dimension named here but absent from\n * the input throws — silent omission would corrupt the overall metric.\n */\n dimensions?: string[]\n /**\n * Restrict the audit to these judges. Default = every judge that\n * appears in the input. A judge named here but absent from a\n * dimension throws (see \"fail loud\" below).\n */\n judges?: string[]\n}\n\n/**\n * Corpus-wide inter-rater agreement across N items × M judges × D dimensions.\n *\n * For each dimension, builds the [n_items][n_judges] matrix of scores\n * (keeping only items every judge rated on that dimension), then runs\n * `continuousAgreement` to get ICC(2,1), κ_w, Pearson, Spearman, and\n * bootstrap CIs. 
Reports a pooled mean across dimensions as a single\n * \"is this judge panel reliable on this corpus?\" number.\n *\n * Fail-loud contract:\n * - Empty input throws.\n * - Fewer than 2 judges or fewer than 2 items per dimension throws.\n * - A judge present in some dimensions but with zero scored items on\n * another dimension throws (would silently shrink the matrix).\n * - Duplicate (itemId, judgeName, dimension) records throw.\n */\nexport function corpusInterRaterAgreement(\n records: CorpusScoreRecord[],\n opts: CorpusAgreementOptions = {},\n): CorpusAgreementReport {\n if (records.length === 0) {\n throw new ValidationError('corpusInterRaterAgreement: no score records supplied')\n }\n\n const judgesSeen = new Set<string>()\n const dimsSeen = new Set<string>()\n // dimension → judge → itemId → score\n const grid = new Map<string, Map<string, Map<string, number>>>()\n\n for (const r of records) {\n if (!Number.isFinite(r.score)) {\n throw new ValidationError(\n `corpusInterRaterAgreement: non-finite score for (item=${r.itemId}, judge=${r.judgeName}, dim=${r.dimension})`,\n )\n }\n judgesSeen.add(r.judgeName)\n dimsSeen.add(r.dimension)\n const byJudge = grid.get(r.dimension) ?? new Map<string, Map<string, number>>()\n const byItem = byJudge.get(r.judgeName) ?? new Map<string, number>()\n if (byItem.has(r.itemId)) {\n throw new ValidationError(\n `corpusInterRaterAgreement: duplicate record for (item=${r.itemId}, judge=${r.judgeName}, dim=${r.dimension})`,\n )\n }\n byItem.set(r.itemId, r.score)\n byJudge.set(r.judgeName, byItem)\n grid.set(r.dimension, byJudge)\n }\n\n const targetDims = opts.dimensions ?? [...dimsSeen].sort()\n for (const d of targetDims) {\n if (!dimsSeen.has(d)) {\n throw new ValidationError(\n `corpusInterRaterAgreement: dimension '${d}' was requested but no records carry it`,\n )\n }\n }\n const targetJudges = opts.judges ? 
[...opts.judges] : [...judgesSeen].sort()\n for (const j of targetJudges) {\n if (!judgesSeen.has(j)) {\n throw new ValidationError(\n `corpusInterRaterAgreement: judge '${j}' was requested but produced no records`,\n )\n }\n }\n if (targetJudges.length < 2) {\n throw new ValidationError(\n `corpusInterRaterAgreement: need ≥2 judges, got ${targetJudges.length}`,\n )\n }\n\n const perDimension: CorpusAgreementPerDimension[] = []\n const iccs: number[] = []\n const kappas: number[] = []\n\n for (const dim of targetDims) {\n const byJudge = grid.get(dim)!\n // Fail loud: every requested judge must have scored ≥1 item on this dim.\n const judgeItemCounts: Record<string, number> = {}\n for (const j of targetJudges) {\n const m = byJudge.get(j)\n judgeItemCounts[j] = m?.size ?? 0\n }\n const emptyJudges = targetJudges.filter((j) => judgeItemCounts[j] === 0)\n if (emptyJudges.length > 0) {\n throw new ValidationError(\n `corpusInterRaterAgreement: dimension '${dim}' has no scores from judge(s) ${emptyJudges.join(', ')} (counts: ${JSON.stringify(judgeItemCounts)})`,\n )\n }\n\n // Items rated by *every* requested judge on this dim.\n let commonItems: Set<string> | null = null\n for (const j of targetJudges) {\n const ids = new Set(byJudge.get(j)!.keys())\n if (commonItems === null) {\n commonItems = ids\n } else {\n const prev: Set<string> = commonItems\n commonItems = new Set([...prev].filter((x) => ids.has(x)))\n }\n }\n const sortedItems = [...(commonItems ?? 
new Set<string>())].sort()\n if (sortedItems.length < 2) {\n throw new ValidationError(\n `corpusInterRaterAgreement: dimension '${dim}' has ${sortedItems.length} item(s) rated by all ${targetJudges.length} judges (need ≥2)`,\n )\n }\n\n const matrix: number[][] = sortedItems.map((itemId) =>\n targetJudges.map((j) => byJudge.get(j)!.get(itemId)!),\n )\n const agreement = continuousAgreement(matrix, opts)\n perDimension.push({\n ...agreement,\n dimension: dim,\n itemIds: sortedItems,\n judgeIds: [...targetJudges],\n })\n if (Number.isFinite(agreement.icc)) iccs.push(agreement.icc)\n if (Number.isFinite(agreement.weightedKappa)) kappas.push(agreement.weightedKappa)\n }\n\n const mean = (xs: number[]) =>\n xs.length === 0 ? Number.NaN : xs.reduce((a, b) => a + b, 0) / xs.length\n return {\n perDimension,\n overallIcc: mean(iccs),\n overallWeightedKappa: mean(kappas),\n dimensions: targetDims,\n judgeIds: targetJudges,\n }\n}\n\n/**\n * Convenience adapter for `JudgeScore[]` data keyed externally by item.\n *\n * Use when you have per-item arrays of `JudgeScore[]` (e.g. one\n * `ScenarioResult.judgeScores` per scenario) and want corpus-wide\n * agreement without manually flattening. `itemId` must be unique per\n * row of `itemsScores`.\n */\nexport function corpusInterRaterAgreementFromJudgeScores(\n itemsScores: Array<{ itemId: string; scores: JudgeScore[] }>,\n opts: CorpusAgreementOptions = {},\n): CorpusAgreementReport {\n const records: CorpusScoreRecord[] = []\n const seen = new Set<string>()\n for (const { itemId, scores } of itemsScores) {\n if (seen.has(itemId)) {\n throw new ValidationError(\n `corpusInterRaterAgreementFromJudgeScores: duplicate itemId '${itemId}'`,\n )\n }\n seen.add(itemId)\n for (const s of scores) {\n records.push({\n itemId,\n judgeName: s.judgeName,\n dimension: s.dimension,\n score: s.score,\n })\n }\n }\n return corpusInterRaterAgreement(records, opts)\n}\n\n/** Student-t CDF approximation via Abramowitz-Stegun series. 
*/\nfunction studentTCdf(t: number, df: number): number {\n if (df <= 0) return 0.5\n if (df > 100) return normalCdf(t)\n const x = df / (df + t * t)\n const a = df / 2\n const b = 0.5\n const ib = incompleteBeta(x, a, b)\n return t >= 0 ? 1 - 0.5 * ib : 0.5 * ib\n}\n\n/** Regularized incomplete beta function via continued fraction (Lentz). */\nfunction incompleteBeta(x: number, a: number, b: number): number {\n if (x <= 0) return 0\n if (x >= 1) return 1\n const lnBeta = lnGamma(a) + lnGamma(b) - lnGamma(a + b)\n const front = Math.exp(Math.log(x) * a + Math.log(1 - x) * b - lnBeta) / a\n const maxIter = 200\n const eps = 3e-7\n let c = 1\n let d = 1 - ((a + b) * x) / (a + 1)\n if (Math.abs(d) < 1e-30) d = 1e-30\n d = 1 / d\n let f = d\n for (let m = 1; m <= maxIter; m++) {\n const m2 = 2 * m\n let num = (m * (b - m) * x) / ((a + m2 - 1) * (a + m2))\n d = 1 + num * d\n if (Math.abs(d) < 1e-30) d = 1e-30\n c = 1 + num / c\n if (Math.abs(c) < 1e-30) c = 1e-30\n d = 1 / d\n f *= d * c\n num = -((a + m) * (a + b + m) * x) / ((a + m2) * (a + m2 + 1))\n d = 1 + num * d\n if (Math.abs(d) < 1e-30) d = 1e-30\n c = 1 + num / c\n if (Math.abs(c) < 1e-30) c = 1e-30\n d = 1 / d\n const delta = d * c\n f *= delta\n if (Math.abs(delta - 1) < eps) break\n }\n return front * f\n}\n\n/** Lanczos approximation to ln Γ(z). */\nfunction lnGamma(z: number): number {\n const g = 7\n const coefs = [\n 0.99999999999980993, 676.5203681218851, -1259.1392167224028, 771.32342877765313,\n -176.61502916214059, 12.507343278686905, -0.13857109526572012, 9.9843695780195716e-6,\n 1.5056327351493116e-7,\n ]\n if (z < 0.5) {\n return Math.log(Math.PI / Math.sin(Math.PI * z)) - lnGamma(1 - z)\n }\n z -= 1\n let x = coefs[0]!\n for (let i = 1; i < g + 2; i++) x += coefs[i]! 
/ (z + i)\n const t = z + g + 0.5\n return 0.5 * Math.log(2 * Math.PI) + (z + 0.5) * Math.log(t) - t + Math.log(x)\n}\n\n// Standard normal CDF approximation (Abramowitz and Stegun)\nfunction normalCdf(x: number): number {\n const a1 = 0.254829592\n const a2 = -0.284496736\n const a3 = 1.421413741\n const a4 = -1.453152027\n const a5 = 1.061405429\n const p = 0.3275911\n\n const sign = x < 0 ? -1 : 1\n const absX = Math.abs(x)\n const t = 1 / (1 + p * absX)\n const y = 1 - ((((a5 * t + a4) * t + a3) * t + a2) * t + a1) * t * Math.exp((-absX * absX) / 2)\n\n return 0.5 * (1 + sign * y)\n}\n\n// ── Power analysis + multiple-comparison correction ──────────────────\n\n/**\n * Required N per arm for a two-sample comparison at target effect size,\n * alpha, and power. Normal-approximation formula:\n * n = 2 * ( (z_{1-α/2} + z_{1-β}) / d )^2\n * where d is Cohen's d. Returns Infinity for effect ≤ 0.\n */\nexport function requiredSampleSize(opts: {\n effect: number\n alpha?: number\n power?: number\n twoSided?: boolean\n}): number {\n const effect = opts.effect\n if (!Number.isFinite(effect) || effect <= 0) return Infinity\n const alpha = opts.alpha ?? 0.05\n const power = opts.power ?? 0.8\n const twoSided = opts.twoSided ?? true\n const zAlpha = zQuantile(twoSided ? 1 - alpha / 2 : 1 - alpha)\n const zBeta = zQuantile(power)\n const n = 2 * ((zAlpha + zBeta) / effect) ** 2\n return Math.ceil(n)\n}\n\n/**\n * Minimum detectable paired effect (standardised units) for a target paired\n * sample size: d_min = (z_{1-α/2} + z_β) / sqrt(n_paired). Multiply by\n * sd(deltas) for score units; treat as a lower bound — Wilcoxon and bootstrap\n * have asymptotic relative efficiency below 1 vs the t-test on heavy tails.\n */\nexport function pairedMde(opts: {\n nPaired: number\n alpha?: number\n power?: number\n twoSided?: boolean\n}): number {\n if (!Number.isFinite(opts.nPaired) || opts.nPaired <= 0) return Infinity\n const alpha = opts.alpha ?? 0.05\n const power = opts.power ?? 
0.8\n const twoSided = opts.twoSided ?? true\n const zAlpha = zQuantile(twoSided ? 1 - alpha / 2 : 1 - alpha)\n const zBeta = zQuantile(power)\n return (zAlpha + zBeta) / Math.sqrt(opts.nPaired)\n}\n\n/** Bonferroni adjustment: multiply every p-value by the test count, clamp at 1. */\nexport function bonferroni(\n pValues: number[],\n alpha = 0.05,\n): { adjusted: number[]; significant: boolean[] } {\n const k = pValues.length\n const adjusted = pValues.map((p) => Math.min(1, p * k))\n const significant = adjusted.map((p) => p < alpha)\n return { adjusted, significant }\n}\n\n/**\n * Benjamini–Hochberg false discovery rate. Returns adjusted q-values and\n * significance at the target FDR; handles ties and preserves q monotonicity.\n */\nexport function benjaminiHochberg(\n pValues: number[],\n fdr = 0.05,\n): { qValues: number[]; significant: boolean[] } {\n const n = pValues.length\n if (n === 0) return { qValues: [], significant: [] }\n const indexed = pValues.map((p, i) => ({ p, i })).sort((a, b) => a.p - b.p)\n const q = new Array<number>(n)\n let minRight = 1\n for (let k = n - 1; k >= 0; k--) {\n const rank = k + 1\n const entry = indexed[k]!\n const raw = (entry.p * n) / rank\n const bounded = Math.min(minRight, raw)\n minRight = bounded\n q[entry.i] = Math.min(1, bounded)\n }\n const significant = q.map((v) => v < fdr)\n return { qValues: q, significant }\n}\n\n// ── Paired bootstrap (promotion-gate effect size) ────────────────────\n\nexport interface PairedBootstrapResult {\n /** Number of paired observations. */\n n: number\n /** Median of paired deltas (after − before). */\n median: number\n /** Mean of paired deltas. */\n mean: number\n /** Lower bound of the bootstrap CI on the chosen statistic. */\n low: number\n /** Upper bound of the bootstrap CI on the chosen statistic. */\n high: number\n /** Confidence level used (e.g. 0.95). */\n confidence: number\n /** Number of bootstrap resamples used. 
*/\n resamples: number\n}\n\nexport interface PairedBootstrapOptions {\n /** Confidence level. Default 0.95. */\n confidence?: number\n /** Bootstrap resample count. Default 2000. */\n resamples?: number\n /** Statistic to bootstrap. Default 'median'. */\n statistic?: 'median' | 'mean'\n /** Deterministic seed. If omitted, uses Math.random(). */\n seed?: number\n}\n\n/**\n * Paired bootstrap on (after − before) deltas. Returns a CI on the chosen\n * statistic (median by default); pairs are resampled with replacement. The\n * lower bound is what the promotion gate checks — `low > threshold` means the\n * gain is real at the confidence level. Throws on unequal sample sizes.\n */\nexport function pairedBootstrap(\n before: number[],\n after: number[],\n opts: PairedBootstrapOptions = {},\n): PairedBootstrapResult {\n if (before.length !== after.length) {\n throw new Error(`pairedBootstrap: unequal sample sizes (${before.length} vs ${after.length})`)\n }\n const confidence = opts.confidence ?? 0.95\n const resamples = opts.resamples ?? 2000\n const statistic = opts.statistic ?? 'median'\n if (confidence <= 0 || confidence >= 1) {\n throw new Error(`pairedBootstrap: confidence must be in (0,1), got ${confidence}`)\n }\n\n const n = before.length\n const deltas = before.map((b, i) => after[i]! 
- b)\n if (n === 0) {\n return { n: 0, median: 0, mean: 0, low: 0, high: 0, confidence, resamples }\n }\n if (n === 1) {\n const d = deltas[0]!\n return { n: 1, median: d, mean: d, low: d, high: d, confidence, resamples }\n }\n\n const rng = makeRng(opts.seed)\n const samples = new Array<number>(resamples)\n for (let b = 0; b < resamples; b++) {\n if (statistic === 'mean') {\n let sum = 0\n for (let k = 0; k < n; k++) {\n sum += deltas[Math.floor(rng() * n)]!\n }\n samples[b] = sum / n\n } else {\n const acc = new Array<number>(n)\n for (let k = 0; k < n; k++) {\n acc[k] = deltas[Math.floor(rng() * n)]!\n }\n samples[b] = medianInPlace(acc)\n }\n }\n samples.sort((a, b) => a - b)\n\n const alpha = 1 - confidence\n const lowIdx = Math.floor((alpha / 2) * resamples)\n const highIdx = Math.min(resamples - 1, Math.ceil((1 - alpha / 2) * resamples) - 1)\n\n return {\n n,\n median: medianInPlace([...deltas]),\n mean: deltas.reduce((s, x) => s + x, 0) / n,\n low: samples[lowIdx]!,\n high: samples[Math.max(highIdx, lowIdx)]!,\n confidence,\n resamples,\n }\n}\n\n// ── private stats helpers ────────────────────────────────────────────\n\n/** Standard-normal inverse CDF (Acklam approximation). 
*/\nfunction zQuantile(p: number): number {\n if (p <= 0 || p >= 1) {\n if (p === 0) return -Infinity\n if (p === 1) return Infinity\n return NaN\n }\n const a = [\n -3.969683028665376e1, 2.209460984245205e2, -2.759285104469687e2, 1.38357751867269e2,\n -3.066479806614716e1, 2.506628277459239,\n ]\n const b = [\n -5.447609879822406e1, 1.615858368580409e2, -1.556989798598866e2, 6.680131188771972e1,\n -1.328068155288572e1,\n ]\n const c = [\n -7.784894002430293e-3, -3.223964580411365e-1, -2.400758277161838, -2.549732539343734,\n 4.374664141464968, 2.938163982698783,\n ]\n const d = [7.784695709041462e-3, 3.224671290700398e-1, 2.445134137142996, 3.754408661907416]\n const pLow = 0.02425\n const pHigh = 1 - pLow\n let q: number\n let r: number\n if (p < pLow) {\n q = Math.sqrt(-2 * Math.log(p))\n return (\n (((((c[0]! * q + c[1]!) * q + c[2]!) * q + c[3]!) * q + c[4]!) * q + c[5]!) /\n ((((d[0]! * q + d[1]!) * q + d[2]!) * q + d[3]!) * q + 1)\n )\n }\n if (p <= pHigh) {\n q = p - 0.5\n r = q * q\n return (\n ((((((a[0]! * r + a[1]!) * r + a[2]!) * r + a[3]!) * r + a[4]!) * r + a[5]!) * q) /\n (((((b[0]! * r + b[1]!) * r + b[2]!) * r + b[3]!) * r + b[4]!) * r + 1)\n )\n }\n q = Math.sqrt(-2 * Math.log(1 - p))\n return (\n -(((((c[0]! * q + c[1]!) * q + c[2]!) * q + c[3]!) * q + c[4]!) * q + c[5]!) /\n ((((d[0]! * q + d[1]!) * q + d[2]!) * q + d[3]!) * q + 1)\n )\n}\n\nfunction medianInPlace(xs: number[]): number {\n if (xs.length === 0) return 0\n xs.sort((a, b) => a - b)\n const mid = Math.floor(xs.length / 2)\n return xs.length % 2 === 0 ? (xs[mid - 1]! + xs[mid]!) / 2 : xs[mid]!\n}\n\n/** Tiny seedable PRNG (mulberry32) — deterministic bootstrap resampling, not cryptographic. 
*/\nfunction makeRng(seed: number | undefined): () => number {\n if (seed === undefined) return Math.random\n let s = seed | 0 || 0x9e3779b9\n return () => {\n s = (s + 0x6d2b79f5) | 0\n let t = s\n t = Math.imul(t ^ (t >>> 15), t | 1)\n t ^= t + Math.imul(t ^ (t >>> 7), t | 61)\n return ((t ^ (t >>> 14)) >>> 0) / 4294967296\n }\n}\n"],"mappings":";;;;;AA8CO,SAAS,eACd,QACA,WACmB;AACnB,QAAM,MAAM,oBAAI,IAAsC;AACtD,aAAW,KAAK,OAAQ,KAAI,IAAI,EAAE,QAAQ,EAAE,GAAG,EAAE,YAAY,GAAG,IAAI,CAAC;AACrE,aAAW,KAAK,WAAW;AACzB,UAAM,QAAQ,IAAI,IAAI,EAAE,MAAM;AAC9B,QAAI,MAAO,OAAM,IAAI,EAAE;AAAA,EACzB;AACA,QAAM,SAAS,CAAC,GAAG,IAAI,OAAO,CAAC,EAAE,OAAO,CAAC,MAAM,OAAO,SAAS,EAAE,CAAC,CAAC;AACnE,QAAM,IAAI,OAAO;AACjB,MAAI,IAAI,GAAG;AACT,WAAO,EAAE,GAAG,SAAS,KAAK,OAAO,KAAK,KAAK,KAAK,YAAY,CAAC,EAAE;AAAA,EACjE;AACA,QAAM,SAAS,OAAO,IAAI,CAAC,MAAM,EAAE,CAAC;AACpC,QAAM,SAAS,OAAO,IAAI,CAAC,MAAM,EAAE,CAAC;AACpC,QAAM,UAAU,SAAS,QAAQ,MAAM;AACvC,QAAM,QAAQ,cAAc,OAAO,IAAI,KAAK,KAAK,GAAG,OAAO,IAAI,KAAK,KAAK,CAAC;AAC1E,QAAM,WAAW,OAAO,IAAI,CAAC,MAAM,KAAK,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC;AACtD,QAAM,MAAM,SAAS,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI;AAClD,QAAM,QAAQ,CAAC,GAAG,IAAI,QAAQ,CAAC,EAC5B,OAAO,CAAC,CAAC,EAAE,CAAC,MAAM,OAAO,SAAS,EAAE,CAAC,CAAC,EACtC,IAAI,CAAC,CAAC,QAAQ,CAAC,OAAO,EAAE,QAAQ,OAAO,EAAE,GAAG,OAAO,EAAE,GAAG,OAAO,KAAK,IAAI,EAAE,IAAI,EAAE,CAAC,EAAE,EAAE,EACrF,KAAK,CAAC,GAAG,MAAM,EAAE,QAAQ,EAAE,KAAK,EAChC,MAAM,GAAG,CAAC;AACb,SAAO,EAAE,GAAG,SAAS,OAAO,KAAK,YAAY,MAAM;AACrD;AAeO,SAAS,eAAe,QAAgD;AAC7E,QAAM,QAAQ,oBAAI,IAAiD;AACnE,aAAW,KAAK,QAAQ;AACtB,UAAM,OAAO,MAAM,IAAI,EAAE,MAAM,KAAK,CAAC;AACrC,QAAI,EAAE,qBAAqB,QAAS,MAAK,QAAQ,EAAE;AAAA,aAC1C,EAAE,qBAAqB,SAAU,MAAK,SAAS,EAAE;AAC1D,UAAM,IAAI,EAAE,QAAQ,IAAI;AAAA,EAC1B;AACA,QAAM,SAAmB,CAAC;AAC1B,aAAW,EAAE,OAAO,OAAO,KAAK,MAAM,OAAO,GAAG;AAC9C,QAAI,UAAU,UAAa,WAAW,OAAW,QAAO,KAAK,QAAQ,MAAM;AAAA,EAC7E;AACA,MAAI,OAAO,WAAW,EAAG,QAAO,EAAE,UAAU,GAAG,GAAG,EAAE;AACpD,SAAO,EAAE,UAAU,OAAO,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,OAAO,QAAQ,GAAG,OAAO,OAAO;AACzF;AAQO,SAAS,cACd,SACqB;
AACrB,QAAM,IAAI,QAAQ;AAClB,MAAI,IAAI,EAAG,QAAO,EAAE,SAAS,KAAK,EAAE;AACpC,SAAO;AAAA,IACL,SAAS;AAAA,MACP,QAAQ,IAAI,CAAC,MAAM,EAAE,SAAS;AAAA,MAC9B,QAAQ,IAAI,CAAC,MAAM,EAAE,KAAK;AAAA,IAC5B;AAAA,IACA;AAAA,EACF;AACF;AAeO,SAAS,eACd,SACsB;AACtB,QAAM,MAAM,QAAQ,OAAO,CAAC,MAAM,EAAE,QAAQ,EAAE,IAAI,CAAC,MAAM,EAAE,KAAK;AAChE,QAAM,OAAO,QAAQ,OAAO,CAAC,MAAM,CAAC,EAAE,QAAQ,EAAE,IAAI,CAAC,MAAM,EAAE,KAAK;AAClE,MAAI,IAAI,WAAW,KAAK,KAAK,WAAW;AACtC,WAAO,EAAE,cAAc,GAAG,iBAAiB,GAAG,WAAW,GAAG,GAAG,EAAE;AACnE,QAAM,SAAS,IAAI,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,IAAI;AACpD,QAAM,UAAU,KAAK,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,KAAK;AACvD,SAAO;AAAA,IACL,cAAc;AAAA,IACd,iBAAiB;AAAA,IACjB,WAAW,SAAS;AAAA,IACpB,GAAG,QAAQ;AAAA,EACb;AACF;AAIA,SAAS,SAAS,GAAa,GAAqB;AAClD,MAAI,EAAE,WAAW,EAAE,UAAU,EAAE,SAAS,EAAG,QAAO;AAClD,QAAM,KAAK,EAAE,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,EAAE;AAC5C,QAAM,KAAK,EAAE,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,EAAE;AAC5C,MAAI,MAAM,GACR,KAAK,GACL,KAAK;AACP,WAAS,IAAI,GAAG,IAAI,EAAE,QAAQ,KAAK;AACjC,UAAM,KAAK,EAAE,CAAC,IAAK;AACnB,UAAM,KAAK,EAAE,CAAC,IAAK;AACnB,WAAO,KAAK;AACZ,UAAM,KAAK;AACX,UAAM,KAAK;AAAA,EACb;AACA,MAAI,OAAO,KAAK,OAAO,EAAG,QAAO,OAAO,KAAK,OAAO,IAAI,IAAI;AAC5D,SAAO,MAAM,KAAK,KAAK,KAAK,EAAE;AAChC;AAGA,SAAS,cAAc,GAAa,GAAqB;AACvD,MAAI,EAAE,WAAW,EAAE,UAAU,EAAE,WAAW,EAAG,QAAO;AACpD,QAAM,MAAM,KAAK,IAAI,GAAG,GAAG,GAAG,CAAC;AAC/B,QAAM,MAAM,KAAK,IAAI,GAAG,GAAG,GAAG,CAAC;AAC/B,QAAM,IAAI,MAAM,MAAM;AACtB,MAAI,IAAI,EAAG,QAAO;AAClB,QAAM,WAAuB,MAAM,KAAK,EAAE,QAAQ,EAAE,GAAG,MAAM,IAAI,MAAM,CAAC,EAAE,KAAK,CAAC,CAAC;AACjF,QAAM,UAAU,IAAI,MAAM,CAAC,EAAE,KAAK,CAAC;AACnC,QAAM,UAAU,IAAI,MAAM,CAAC,EAAE,KAAK,CAAC;AACnC,WAAS,IAAI,GAAG,IAAI,EAAE,QAAQ,KAAK;AACjC,UAAM,KAAK,EAAE,CAAC,IAAK;AACnB,UAAM,KAAK,EAAE,CAAC,IAAK;AACnB,UAAM,MAAM,SAAS,EAAE;AACvB,QAAI,EAAE,KAAK,IAAI,EAAE,KAAK,KAAK;AAC3B,YAAQ,EAAE;AACV,YAAQ,EAAE;AAAA,EACZ;AACA,MAAI,MAAM;AACV,MAAI,MAAM;AACV,WAAS,IAAI,GAAG,IAAI,GAAG,KAAK;AAC1B,aAAS,IAAI,GAAG,IAAI,GAAG,KAAK;AAC1B,YAAM,KAAK,IAAI,MAAM,KAAK,IAAI,MAAM;AACpC,YAAM,WAAY,QAAQ,CAAC,IA
AI,QAAQ,CAAC,IAAK,EAAE;AAC/C,aAAO,IAAI,SAAS,CAAC,EAAG,CAAC;AACzB,aAAO,IAAI;AAAA,IACb;AAAA,EACF;AACA,MAAI,QAAQ,EAAG,QAAO;AACtB,SAAO,IAAI,MAAM;AACnB;AAqEO,SAAS,oBACd,QACA,OAAmC,CAAC,GACf;AACrB,QAAM,YAAY,KAAK,aAAa;AACpC,QAAM,UAAU,KAAK,WAAW;AAChC,QAAM,OAAO,KAAK,QAAQ;AAC1B,QAAM,UAAU,KAAK,WAAW;AAEhC,QAAM,SAAS,OAAO,OAAO,CAAC,QAAQ,IAAI,UAAU,KAAK,IAAI,MAAM,CAAC,MAAM,OAAO,SAAS,CAAC,CAAC,CAAC;AAC7F,QAAM,SAAS,OAAO,CAAC,GAAG,UAAU;AAEpC,QAAM,QAAQ,OAAO,OAAO,CAAC,QAAQ,IAAI,WAAW,MAAM;AAC1D,QAAM,SAAS,MAAM;AAErB,MAAI,SAAS,KAAK,SAAS,GAAG;AAC5B,WAAO;AAAA,MACL,eAAe;AAAA,MACf,KAAK;AAAA,MACL,SAAS;AAAA,MACT,UAAU;AAAA,MACV,IAAI,EAAE,KAAK,CAAC,KAAK,GAAG,GAAG,eAAe,CAAC,KAAK,GAAG,EAAE;AAAA,MACjD,GAAG;AAAA,MACH;AAAA,IACF;AAAA,EACF;AAEA,QAAM,QAAQ,wBAAwB,OAAO,OAAO;AACpD,QAAM,MAAM,MAAM,KAAK;AACvB,QAAM,UAAU,YAAY,OAAO,QAAQ;AAC3C,QAAM,WAAW,YAAY,OAAO,SAAS;AAE7C,QAAM,QAA0B,CAAC,KAAK,GAAG;AACzC,QAAM,UAA4B,CAAC,KAAK,GAAG;AAC3C,MAAI,YAAY,GAAG;AACjB,UAAM,MAAM,WAAW,IAAI;AAC3B,UAAM,OAAiB,CAAC;AACxB,UAAM,SAAmB,CAAC;AAC1B,aAAS,IAAI,GAAG,IAAI,WAAW,KAAK;AAClC,YAAM,SAAqB,IAAI,MAAM,MAAM;AAC3C,eAAS,IAAI,GAAG,IAAI,QAAQ,KAAK;AAC/B,eAAO,CAAC,IAAI,MAAM,KAAK,MAAM,IAAI,IAAI,MAAM,CAAC;AAAA,MAC9C;AACA,YAAM,OAAO,MAAM,MAAM;AACzB,YAAM,KAAK,wBAAwB,QAAQ,OAAO;AAClD,UAAI,OAAO,SAAS,IAAI,EAAG,MAAK,KAAK,IAAI;AACzC,UAAI,OAAO,SAAS,EAAE,EAAG,QAAO,KAAK,EAAE;AAAA,IACzC;AACA,UAAM,CAAC,IAAI,EAAE,IAAI,iBAAiB,OAAO;AACzC,QAAI,KAAK,SAAS,GAAG;AACnB,WAAK,KAAK,CAAC,GAAG,MAAM,IAAI,CAAC;AACzB,YAAM,CAAC,IAAI,SAAS,MAAM,EAAE;AAC5B,YAAM,CAAC,IAAI,SAAS,MAAM,EAAE;AAAA,IAC9B;AACA,QAAI,OAAO,SAAS,GAAG;AACrB,aAAO,KAAK,CAAC,GAAG,MAAM,IAAI,CAAC;AAC3B,cAAQ,CAAC,IAAI,SAAS,QAAQ,EAAE;AAChC,cAAQ,CAAC,IAAI,SAAS,QAAQ,EAAE;AAAA,IAClC;AAAA,EACF;AAEA,SAAO;AAAA,IACL,eAAe;AAAA,IACf;AAAA,IACA;AAAA,IACA;AAAA,IACA,IAAI,EAAE,KAAK,OAAO,eAAe,QAAQ;AAAA,IACzC,GAAG;AAAA,IACH;AAAA,EACF;AACF;AAmBO,SAAS,yBACd,QACA,WACA,OAAmC,CAAC,GACP;AAC7B,QAAM,OAAO,eAAe,QAAQ,SAAS;AAC7C,QAAM,MAAM,oBAAI,IAAsC;AACtD,aAAW,KAAK,OAAQ,KAAI,IAAI,EAAE,QAAQ,EAAE,GAAG,EAAE,YAAY,GAAG,IAAI,CAAC;AACrE,aAAW,KAAK,W
AAW;AACzB,UAAM,QAAQ,IAAI,IAAI,EAAE,MAAM;AAC9B,QAAI,MAAO,OAAM,IAAI,EAAE;AAAA,EACzB;AACA,QAAM,OAAmB,CAAC;AAC1B,aAAW,KAAK,IAAI,OAAO,GAAG;AAC5B,QAAI,OAAO,SAAS,EAAE,CAAC,EAAG,MAAK,KAAK,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC;AAAA,EAChD;AACA,QAAM,YAAY,oBAAoB,MAAM,IAAI;AAChD,SAAO;AAAA,IACL,GAAG;AAAA,IACH,yBAAyB,UAAU;AAAA,IACnC,KAAK,UAAU;AAAA,IACf,UAAU,UAAU;AAAA,IACpB,IAAI,UAAU;AAAA,EAChB;AACF;AAgBA,SAAS,wBAAwB,MAAkB,QAAwC;AACzF,MAAI,KAAK,WAAW,EAAG,QAAO;AAC9B,QAAM,SAAS,KAAK,CAAC,EAAG;AACxB,MAAI,SAAS,EAAG,QAAO;AACvB,QAAM,MACJ,WAAW,WACP,CAAC,GAAW,MAAc,KAAK,IAAI,IAAI,CAAC,IACxC,CAAC,GAAW,OAAe,IAAI,MAAM;AAC3C,MAAI,MAAM;AACV,MAAI,QAAQ;AACZ,WAAS,KAAK,GAAG,KAAK,QAAQ,MAAM;AAClC,aAAS,KAAK,KAAK,GAAG,KAAK,QAAQ,MAAM;AACvC,YAAM,IAAI,KAAK,IAAI,CAAC,QAAQ,IAAI,EAAE,CAAE;AACpC,YAAM,IAAI,KAAK,IAAI,CAAC,QAAQ,IAAI,EAAE,CAAE;AACpC,YAAM,IAAI,EAAE;AACZ,UAAI,MAAM;AACV,eAAS,IAAI,GAAG,IAAI,GAAG,IAAK,QAAO,IAAI,EAAE,CAAC,GAAI,EAAE,CAAC,CAAE;AACnD,aAAO;AAEP,UAAI,MAAM;AACV,eAAS,IAAI,GAAG,IAAI,GAAG,KAAK;AAC1B,iBAAS,IAAI,GAAG,IAAI,GAAG,IAAK,QAAO,IAAI,EAAE,CAAC,GAAI,EAAE,CAAC,CAAE;AAAA,MACrD;AACA,aAAO,IAAI;AACX,UAAI,QAAQ,GAAG;AAGb,eAAO,QAAQ,IAAI,IAAI;AAAA,MACzB,OAAO;AACL,eAAO,IAAI,MAAM;AAAA,MACnB;AACA;AAAA,IACF;AAAA,EACF;AACA,SAAO,UAAU,IAAI,MAAM,MAAM;AACnC;AAUA,SAAS,MAAM,MAA0B;AACvC,QAAM,IAAI,KAAK;AACf,MAAI,IAAI,EAAG,QAAO;AAClB,QAAM,IAAI,KAAK,CAAC,EAAG;AACnB,MAAI,IAAI,EAAG,QAAO;AAGlB,QAAM,WAAW,KAAK,IAAI,CAAC,QAAQ,IAAI,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,CAAC;AAErE,QAAM,WAAW,IAAI,MAAM,CAAC,EAAE,KAAK,CAAC;AACpC,WAAS,IAAI,GAAG,IAAI,GAAG,KAAK;AAC1B,QAAI,IAAI;AACR,aAAS,IAAI,GAAG,IAAI,GAAG,IAAK,MAAK,KAAK,CAAC,EAAG,CAAC;AAC3C,aAAS,CAAC,IAAI,IAAI;AAAA,EACpB;AAEA,MAAI,QAAQ;AACZ,WAAS,IAAI,GAAG,IAAI,GAAG,IAAK,UAAS,SAAS,CAAC;AAC/C,WAAS;AAET,MAAI,MAAM;AACV,WAAS,IAAI,GAAG,IAAI,GAAG,IAAK,SAAQ,SAAS,CAAC,IAAK,UAAU;AAC7D,SAAO;AACP,MAAI,MAAM;AACV,WAAS,IAAI,GAAG,IAAI,GAAG,IAAK,SAAQ,SAAS,CAAC,IAAK,UAAU;AAC7D,SAAO;AACP,MAAI,MAAM;AACV,WAAS,IAAI,GAAG,IAAI,GAAG,KAAK;AAC1B,aAAS,IAAI,GAAG,IAAI,GAAG,IAAK,SAAQ,KAAK,CAAC,EAAG,CAAC,IAAK,UAAU;
AAAA,EAC/D;AACA,QAAM,MAAM,MAAM,MAAM;AAExB,QAAM,MAAM,IAAI;AAChB,QAAM,MAAM,IAAI;AAChB,QAAM,OAAO,IAAI,MAAM,IAAI;AAC3B,QAAM,MAAM,MAAM;AAClB,QAAM,MAAM,MAAM;AAClB,QAAM,MAAM,MAAM,IAAI,MAAM,MAAM;AAElC,QAAM,QAAQ,OAAO,IAAI,KAAK,MAAO,KAAK,MAAM,OAAQ;AACxD,MAAI,UAAU,GAAG;AAGf,WAAO,QAAQ,KAAK,QAAQ,IAAI,IAAI;AAAA,EACtC;AACA,UAAQ,MAAM,OAAO;AACvB;AAGA,SAAS,YAAY,MAAkB,IAAkD;AACvF,QAAM,IAAI,KAAK,CAAC,GAAG,UAAU;AAC7B,MAAI,IAAI,EAAG,QAAO;AAClB,MAAI,MAAM;AACV,MAAI,QAAQ;AACZ,WAAS,IAAI,GAAG,IAAI,GAAG,KAAK;AAC1B,aAAS,IAAI,IAAI,GAAG,IAAI,GAAG,KAAK;AAC9B,YAAM,IAAI,KAAK,IAAI,CAAC,QAAQ,IAAI,CAAC,CAAE;AACnC,YAAM,IAAI,KAAK,IAAI,CAAC,QAAQ,IAAI,CAAC,CAAE;AACnC,YAAM,IAAI,GAAG,GAAG,CAAC;AACjB,UAAI,OAAO,SAAS,CAAC,GAAG;AACtB,eAAO;AACP;AAAA,MACF;AAAA,IACF;AAAA,EACF;AACA,SAAO,UAAU,IAAI,MAAM,MAAM;AACnC;AAGA,SAAS,UAAU,GAAa,GAAqB;AACnD,MAAI,EAAE,WAAW,EAAE,UAAU,EAAE,SAAS,EAAG,QAAO;AAClD,SAAO,SAAS,aAAa,CAAC,GAAG,aAAa,CAAC,CAAC;AAClD;AAEA,SAAS,aAAa,IAAwB;AAC5C,QAAM,IAAI,GAAG;AACb,QAAM,UAAU,GAAG,IAAI,CAAC,GAAGA,QAAO,EAAE,GAAG,GAAAA,GAAE,EAAE;AAC3C,UAAQ,KAAK,CAAC,GAAG,MAAM,EAAE,IAAI,EAAE,CAAC;AAChC,QAAM,QAAQ,IAAI,MAAM,CAAC,EAAE,KAAK,CAAC;AACjC,MAAI,IAAI;AACR,SAAO,IAAI,GAAG;AACZ,QAAI,IAAI;AACR,WAAO,IAAI,IAAI,KAAK,QAAQ,IAAI,CAAC,EAAG,MAAM,QAAQ,CAAC,EAAG,EAAG;AAEzD,UAAM,OAAO,IAAI,KAAK,IAAI;AAC1B,aAAS,IAAI,GAAG,KAAK,GAAG,IAAK,OAAM,QAAQ,CAAC,EAAG,CAAC,IAAI;AACpD,QAAI,IAAI;AAAA,EACV;AACA,SAAO;AACT;AAGA,SAAS,WAAW,MAA4B;AAC9C,MAAI,IAAI,SAAS;AACjB,SAAO,MAAM;AACX,QAAK,IAAI,eAAgB;AACzB,QAAI,IAAI;AACR,QAAI,KAAK,KAAK,IAAK,MAAM,IAAK,IAAI,CAAC;AACnC,SAAK,IAAI,KAAK,KAAK,IAAK,MAAM,GAAI,IAAI,EAAE;AACxC,aAAS,IAAK,MAAM,QAAS,KAAK;AAAA,EACpC;AACF;AAEA,SAAS,iBAAiB,SAAmC;AAC3D,QAAM,QAAQ,IAAI,WAAW;AAC7B,SAAO,CAAC,MAAM,IAAI,IAAI;AACxB;AAGA,SAAS,SAAS,QAAkB,GAAmB;AACrD,MAAI,OAAO,WAAW,EAAG,QAAO;AAChC,MAAI,OAAO,WAAW,EAAG,QAAO,OAAO,CAAC;AACxC,QAAM,MAAM,KAAK,OAAO,SAAS;AACjC,QAAM,KAAK,KAAK,MAAM,GAAG;AACzB,QAAM,KAAK,KAAK,KAAK,GAAG;AACxB,MAAI,OAAO,GAAI,QAAO,OAAO,EAAE;AAC/B,QAAM,OAAO,MAAM;AACnB,SAAO,OAAO,EAAE,KAAM,IAAI,QAAQ,OAAO,EAAE,IAAK;AAClD;;;A
C5iBA,IAAM,sBAAsB,oBAAI,IAAI,CAAC,iBAAiB,oBAAoB,eAAe,CAAC;AAQnF,SAAS,gBAAgB,QAAoC;AAClE,SAAO,OAAO,IAAI,CAAC,MAAM;AACvB,QAAI,oBAAoB,IAAI,EAAE,SAAS,GAAG;AACxC,aAAO;AAAA,IACT;AACA,WAAO;AAAA,EACT,CAAC;AACH;AAGO,SAAS,aAAa,QAAsD;AACjF,MAAI,OAAO,WAAW,EAAG,QAAO;AAChC,MAAI,cAAc;AAClB,MAAI,cAAc;AAClB,aAAW,EAAE,OAAO,OAAO,KAAK,QAAQ;AACtC,UAAM,IAAI,UAAU;AACpB,mBAAe,QAAQ;AACvB,mBAAe;AAAA,EACjB;AACA,SAAO,cAAc,IAAI,cAAc,cAAc;AACvD;AAGO,SAAS,mBACd,QACA,aAAa,MACb,OAA8C,CAAC,GACC;AAChD,MAAI,OAAO,WAAW,EAAG,QAAO,EAAE,MAAM,GAAG,OAAO,GAAG,OAAO,EAAE;AAC9D,MAAI,OAAO,WAAW,EAAG,QAAO,EAAE,MAAM,OAAO,CAAC,GAAI,OAAO,OAAO,CAAC,GAAI,OAAO,OAAO,CAAC,EAAG;AAEzF,QAAM,IAAI,OAAO;AACjB,QAAM,OAAO,OAAO,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI;AAEjD,QAAM,IAAI,KAAK,aAAa;AAC5B,QAAM,MAAM,QAAQ,KAAK,IAAI;AAC7B,QAAM,iBAA2B,CAAC;AAElC,WAAS,IAAI,GAAG,IAAI,GAAG,KAAK;AAC1B,QAAI,MAAM;AACV,aAAS,IAAI,GAAG,IAAI,GAAG,KAAK;AAC1B,aAAO,OAAO,KAAK,MAAM,IAAI,IAAI,CAAC,CAAC;AAAA,IACrC;AACA,mBAAe,KAAK,MAAM,CAAC;AAAA,EAC7B;AAEA,iBAAe,KAAK,CAAC,GAAG,MAAM,IAAI,CAAC;AAEnC,QAAM,QAAQ,IAAI;AAClB,QAAM,WAAW,KAAK,MAAO,QAAQ,IAAK,CAAC;AAC3C,QAAM,WAAW,KAAK,OAAO,IAAI,QAAQ,KAAK,CAAC,IAAI;AAEnD,SAAO;AAAA,IACL;AAAA,IACA,OAAO,eAAe,QAAQ;AAAA,IAC9B,OAAO,eAAe,KAAK,IAAI,UAAU,IAAI,CAAC,CAAC;AAAA,EACjD;AACF;AAQO,SAAS,sBAAsB,aAAqC;AACzE,MAAI,YAAY,SAAS,EAAG,QAAO;AAGnC,QAAM,eAAe,oBAAI,IAAwB;AACjD,aAAW,YAAY,aAAa;AAClC,eAAW,KAAK,UAAU;AACxB,UAAI,CAAC,aAAa,IAAI,EAAE,SAAS,EAAG,cAAa,IAAI,EAAE,WAAW,CAAC,CAAC;AACpE,YAAM,MAAM,aAAa,IAAI,EAAE,SAAS;AACxC,UAAI,IAAI,WAAW,KAAK,IAAI,IAAI,SAAS,CAAC,EAAG,UAAU,YAAY,QAAQ;AACzE,YAAI,KAAK,CAAC,EAAE,KAAK,CAAC;AAAA,MACpB,OAAO;AACL,YAAI,IAAI,SAAS,CAAC,EAAG,KAAK,EAAE,KAAK;AAAA,MACnC;AAAA,IACF;AAAA,EACF;AAGA,QAAM,YAAsB,CAAC;AAC7B,QAAM,YAAsB,CAAC;AAE7B,aAAW,SAAS,aAAa,OAAO,GAAG;AACzC,eAAW,WAAW,OAAO;AAC3B,UAAI,QAAQ,SAAS,EAAG;AACxB,iBAAW,KAAK,QAAS,WAAU,KAAK,CAAC;AACzC,eAAS,IAAI,GAAG,IAAI,QAAQ,QAAQ,KAAK;AACvC,iBAAS,IAAI,IAAI,GAAG,IAAI,QAAQ,QAAQ,KAAK;AAC3C,oBAAU,MAAM,QAAQ,CAAC,IAAK,QAAQ,CAAC,MAAO,CAAC;AAAA,QACjD;AAAA,MACF;AAAA,IACF;AAAA,EAC
F;AAEA,MAAI,UAAU,WAAW,KAAK,UAAU,SAAS,EAAG,QAAO;AAE3D,QAAM,uBAAuB,UAAU,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,UAAU;AAG9E,MAAI,uBAAuB;AAC3B,MAAI,gBAAgB;AACpB,WAAS,IAAI,GAAG,IAAI,UAAU,QAAQ,KAAK;AACzC,aAAS,IAAI,IAAI,GAAG,IAAI,UAAU,QAAQ,KAAK;AAC7C,+BAAyB,UAAU,CAAC,IAAK,UAAU,CAAC,MAAO;AAC3D;AAAA,IACF;AAAA,EACF;AACA,yBAAuB,gBAAgB,IAAI,uBAAuB,gBAAgB;AAElF,MAAI,yBAAyB,EAAG,QAAO;AACvC,SAAO,IAAI,uBAAuB;AACpC;AAMO,SAAS,aAAa,GAAa,GAAuC;AAC/E,MAAI,EAAE,WAAW,KAAK,EAAE,WAAW,EAAG,QAAO,EAAE,GAAG,GAAG,GAAG,EAAE;AAE1D,QAAM,KAAK,EAAE;AACb,QAAM,KAAK,EAAE;AAGb,QAAM,WAAW;AAAA,IACf,GAAG,EAAE,IAAI,CAAC,OAAO,EAAE,GAAG,OAAO,IAAa,EAAE;AAAA,IAC5C,GAAG,EAAE,IAAI,CAAC,OAAO,EAAE,GAAG,OAAO,IAAa,EAAE;AAAA,EAC9C,EAAE,KAAK,CAAC,GAAG,MAAM,EAAE,IAAI,EAAE,CAAC;AAG1B,QAAM,QAAkB,IAAI,MAAM,SAAS,MAAM;AACjD,MAAI,IAAI;AACR,SAAO,IAAI,SAAS,QAAQ;AAC1B,QAAI,IAAI;AACR,WAAO,IAAI,SAAS,UAAU,SAAS,CAAC,EAAG,MAAM,SAAS,CAAC,EAAG,EAAG;AACjE,UAAM,WAAW,IAAI,IAAI,KAAK;AAC9B,aAAS,IAAI,GAAG,IAAI,GAAG,IAAK,OAAM,CAAC,IAAI;AACvC,QAAI;AAAA,EACN;AAGA,MAAI,KAAK;AACT,WAAS,IAAI,GAAG,IAAI,SAAS,QAAQ,KAAK;AACxC,QAAI,SAAS,CAAC,EAAG,UAAU,IAAK,OAAM,MAAM,CAAC;AAAA,EAC/C;AAEA,QAAM,KAAK,KAAM,MAAM,KAAK,KAAM;AAClC,QAAM,KAAK,KAAK,KAAK;AACrB,QAAM,IAAI,KAAK,IAAI,IAAI,EAAE;AAGzB,QAAM,KAAM,KAAK,KAAM;AACvB,QAAM,QAAQ,KAAK,KAAM,KAAK,MAAM,KAAK,KAAK,KAAM,EAAE;AAEtD,MAAI,UAAU,EAAG,QAAO,EAAE,GAAG,GAAG,EAAE;AAElC,QAAM,IAAI,KAAK,IAAI,IAAI,EAAE,IAAI;AAE7B,QAAM,IAAI,KAAK,IAAI,UAAU,CAAC;AAE9B,SAAO,EAAE,GAAG,EAAE;AAChB;AAGO,SAAS,cAAc,SAAiB,QAAwB;AACrE,MAAI,UAAU,EAAG,QAAO;AACxB,SAAO,KAAK,IAAI,GAAG,KAAK,IAAI,GAAG,UAAU,MAAM,CAAC;AAClD;AAQO,SAAS,YACd,QACA,OACsC;AACtC,MAAI,OAAO,WAAW,MAAM,QAAQ;AAClC,UAAM,IAAI;AAAA,MACR,sCAAsC,OAAO,MAAM,OAAO,MAAM,MAAM;AAAA,IACxE;AAAA,EACF;AACA,QAAM,IAAI,OAAO;AACjB,MAAI,IAAI,EAAG,QAAO,EAAE,GAAG,GAAG,IAAI,GAAG,GAAG,EAAE;AAEtC,QAAM,QAAQ,OAAO,IAAI,CAAC,GAAG,MAAM,MAAM,CAAC,IAAK,CAAC;AAChD,QAAM,OAAO,MAAM,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI;AAChD,QAAM,WAAW,MAAM,OAAO,CAAC,KAAK,MAAM,OAAO,IAAI,SAAS,GAAG,CAAC,KAAK,IAAI;AAC3E,QAAM,KAAK,KAAK,KAAK
,WAAW,CAAC;AACjC,MAAI,OAAO,EAAG,QAAO,EAAE,GAAG,SAAS,IAAI,IAAI,UAAU,IAAI,IAAI,GAAG,GAAG,SAAS,IAAI,IAAI,EAAE;AAEtF,QAAM,IAAI,OAAO;AACjB,QAAM,KAAK,IAAI;AACf,QAAM,IAAI,KAAK,IAAI,YAAY,KAAK,IAAI,CAAC,GAAG,EAAE;AAC9C,SAAO,EAAE,GAAG,IAAI,EAAE;AACpB;AAMO,SAAS,mBAAmB,QAAkB,OAA2C;AAC9F,MAAI,OAAO,WAAW,MAAM,QAAQ;AAClC,UAAM,IAAI;AAAA,MACR,6CAA6C,OAAO,MAAM,OAAO,MAAM,MAAM;AAAA,IAC/E;AAAA,EACF;AACA,QAAM,QAAQ,OAAO,IAAI,CAAC,GAAGC,OAAM,MAAMA,EAAC,IAAK,CAAC,EAAE,OAAO,CAAC,MAAM,MAAM,CAAC;AACvE,QAAM,IAAI,MAAM;AAChB,MAAI,IAAI,EAAG,QAAO,EAAE,GAAG,GAAG,GAAG,EAAE;AAE/B,QAAM,WAAW,MACd,IAAI,CAAC,GAAGA,QAAO,EAAE,KAAK,KAAK,IAAI,CAAC,GAAG,MAAM,KAAK,KAAK,CAAC,GAAG,GAAAA,GAAE,EAAE,EAC3D,KAAK,CAAC,GAAG,MAAM,EAAE,MAAM,EAAE,GAAG;AAC/B,QAAM,QAAkB,IAAI,MAAM,CAAC;AACnC,MAAI,IAAI;AACR,SAAO,IAAI,GAAG;AACZ,QAAI,IAAI;AACR,WAAO,IAAI,KAAK,SAAS,CAAC,EAAG,QAAQ,SAAS,CAAC,EAAG,IAAK;AACvD,UAAM,OAAO,IAAI,IAAI,KAAK;AAC1B,aAAS,IAAI,GAAG,IAAI,GAAG,IAAK,OAAM,SAAS,CAAC,EAAG,CAAC,IAAI;AACpD,QAAI;AAAA,EACN;AACA,MAAI,QAAQ;AACZ,WAAS,IAAI,GAAG,IAAI,GAAG,IAAK,KAAI,MAAM,CAAC,IAAK,EAAG,UAAS,MAAM,CAAC;AAE/D,QAAM,OAAQ,KAAK,IAAI,KAAM;AAC7B,QAAM,WAAY,KAAK,IAAI,MAAM,IAAI,IAAI,KAAM;AAC/C,QAAM,KAAK,QAAQ,QAAQ,KAAK,KAAK,QAAQ;AAC7C,QAAM,IAAI,KAAK,IAAI,UAAU,KAAK,IAAI,CAAC,CAAC;AACxC,SAAO,EAAE,GAAG,OAAO,EAAE;AACvB;AAOO,SAAS,QAAQ,GAAa,GAAqB;AACxD,MAAI,EAAE,SAAS,KAAK,EAAE,SAAS,EAAG,QAAO;AACzC,QAAM,QAAQ,EAAE,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,EAAE;AAC/C,QAAM,QAAQ,EAAE,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,EAAE;AAC/C,QAAM,OAAO,EAAE,OAAO,CAAC,KAAK,MAAM,OAAO,IAAI,UAAU,GAAG,CAAC,KAAK,EAAE,SAAS;AAC3E,QAAM,OAAO,EAAE,OAAO,CAAC,KAAK,MAAM,OAAO,IAAI,UAAU,GAAG,CAAC,KAAK,EAAE,SAAS;AAC3E,QAAM,SAAS,KAAK;AAAA,MAChB,EAAE,SAAS,KAAK,QAAQ,EAAE,SAAS,KAAK,SAAS,EAAE,SAAS,EAAE,SAAS;AAAA,EAC3E;AACA,MAAI,WAAW,EAAG,QAAO;AACzB,UAAQ,QAAQ,SAAS;AAC3B;AAgFO,SAAS,0BACd,SACA,OAA+B,CAAC,GACT;AACvB,MAAI,QAAQ,WAAW,GAAG;AACxB,UAAM,IAAI,gBAAgB,sDAAsD;AAAA,EAClF;AAEA,QAAM,aAAa,oBAAI,IAAY;AACnC,QAAM,WAAW,oBAAI,IAAY;AAEjC,QAAM,OAAO,oBAAI,IAA8C;AAE/D,aAAW,KAAK,SAAS;AACvB,QAAI,C
AAC,OAAO,SAAS,EAAE,KAAK,GAAG;AAC7B,YAAM,IAAI;AAAA,QACR,yDAAyD,EAAE,MAAM,WAAW,EAAE,SAAS,SAAS,EAAE,SAAS;AAAA,MAC7G;AAAA,IACF;AACA,eAAW,IAAI,EAAE,SAAS;AAC1B,aAAS,IAAI,EAAE,SAAS;AACxB,UAAM,UAAU,KAAK,IAAI,EAAE,SAAS,KAAK,oBAAI,IAAiC;AAC9E,UAAM,SAAS,QAAQ,IAAI,EAAE,SAAS,KAAK,oBAAI,IAAoB;AACnE,QAAI,OAAO,IAAI,EAAE,MAAM,GAAG;AACxB,YAAM,IAAI;AAAA,QACR,yDAAyD,EAAE,MAAM,WAAW,EAAE,SAAS,SAAS,EAAE,SAAS;AAAA,MAC7G;AAAA,IACF;AACA,WAAO,IAAI,EAAE,QAAQ,EAAE,KAAK;AAC5B,YAAQ,IAAI,EAAE,WAAW,MAAM;AAC/B,SAAK,IAAI,EAAE,WAAW,OAAO;AAAA,EAC/B;AAEA,QAAM,aAAa,KAAK,cAAc,CAAC,GAAG,QAAQ,EAAE,KAAK;AACzD,aAAW,KAAK,YAAY;AAC1B,QAAI,CAAC,SAAS,IAAI,CAAC,GAAG;AACpB,YAAM,IAAI;AAAA,QACR,yCAAyC,CAAC;AAAA,MAC5C;AAAA,IACF;AAAA,EACF;AACA,QAAM,eAAe,KAAK,SAAS,CAAC,GAAG,KAAK,MAAM,IAAI,CAAC,GAAG,UAAU,EAAE,KAAK;AAC3E,aAAW,KAAK,cAAc;AAC5B,QAAI,CAAC,WAAW,IAAI,CAAC,GAAG;AACtB,YAAM,IAAI;AAAA,QACR,qCAAqC,CAAC;AAAA,MACxC;AAAA,IACF;AAAA,EACF;AACA,MAAI,aAAa,SAAS,GAAG;AAC3B,UAAM,IAAI;AAAA,MACR,uDAAkD,aAAa,MAAM;AAAA,IACvE;AAAA,EACF;AAEA,QAAM,eAA8C,CAAC;AACrD,QAAM,OAAiB,CAAC;AACxB,QAAM,SAAmB,CAAC;AAE1B,aAAW,OAAO,YAAY;AAC5B,UAAM,UAAU,KAAK,IAAI,GAAG;AAE5B,UAAM,kBAA0C,CAAC;AACjD,eAAW,KAAK,cAAc;AAC5B,YAAM,IAAI,QAAQ,IAAI,CAAC;AACvB,sBAAgB,CAAC,IAAI,GAAG,QAAQ;AAAA,IAClC;AACA,UAAM,cAAc,aAAa,OAAO,CAAC,MAAM,gBAAgB,CAAC,MAAM,CAAC;AACvE,QAAI,YAAY,SAAS,GAAG;AAC1B,YAAM,IAAI;AAAA,QACR,yCAAyC,GAAG,iCAAiC,YAAY,KAAK,IAAI,CAAC,aAAa,KAAK,UAAU,eAAe,CAAC;AAAA,MACjJ;AAAA,IACF;AAGA,QAAI,cAAkC;AACtC,eAAW,KAAK,cAAc;AAC5B,YAAM,MAAM,IAAI,IAAI,QAAQ,IAAI,CAAC,EAAG,KAAK,CAAC;AAC1C,UAAI,gBAAgB,MAAM;AACxB,sBAAc;AAAA,MAChB,OAAO;AACL,cAAM,OAAoB;AAC1B,sBAAc,IAAI,IAAI,CAAC,GAAG,IAAI,EAAE,OAAO,CAAC,MAAM,IAAI,IAAI,CAAC,CAAC,CAAC;AAAA,MAC3D;AAAA,IACF;AACA,UAAM,cAAc,CAAC,GAAI,eAAe,oBAAI,IAAY,CAAE,EAAE,KAAK;AACjE,QAAI,YAAY,SAAS,GAAG;AAC1B,YAAM,IAAI;AAAA,QACR,yCAAyC,GAAG,SAAS,YAAY,MAAM,yBAAyB,aAAa,MAAM;AAAA,MACrH;AAAA,IACF;AAEA,UAAM,SAAqB,YAAY;AAAA,MAAI,CAAC,WAC1C,aAAa,IAAI,CAAC,MAAM,QAAQ,IAAI,CAAC,EAAG,IAAI,MAAM,CAAE;AAAA,IACtD;AACA,UAAM,YAAY,oBAAoB,QAAQ,IAAI;AAClD,iBAA
a,KAAK;AAAA,MAChB,GAAG;AAAA,MACH,WAAW;AAAA,MACX,SAAS;AAAA,MACT,UAAU,CAAC,GAAG,YAAY;AAAA,IAC5B,CAAC;AACD,QAAI,OAAO,SAAS,UAAU,GAAG,EAAG,MAAK,KAAK,UAAU,GAAG;AAC3D,QAAI,OAAO,SAAS,UAAU,aAAa,EAAG,QAAO,KAAK,UAAU,aAAa;AAAA,EACnF;AAEA,QAAM,OAAO,CAAC,OACZ,GAAG,WAAW,IAAI,OAAO,MAAM,GAAG,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,GAAG;AACpE,SAAO;AAAA,IACL;AAAA,IACA,YAAY,KAAK,IAAI;AAAA,IACrB,sBAAsB,KAAK,MAAM;AAAA,IACjC,YAAY;AAAA,IACZ,UAAU;AAAA,EACZ;AACF;AAUO,SAAS,yCACd,aACA,OAA+B,CAAC,GACT;AACvB,QAAM,UAA+B,CAAC;AACtC,QAAM,OAAO,oBAAI,IAAY;AAC7B,aAAW,EAAE,QAAQ,OAAO,KAAK,aAAa;AAC5C,QAAI,KAAK,IAAI,MAAM,GAAG;AACpB,YAAM,IAAI;AAAA,QACR,+DAA+D,MAAM;AAAA,MACvE;AAAA,IACF;AACA,SAAK,IAAI,MAAM;AACf,eAAW,KAAK,QAAQ;AACtB,cAAQ,KAAK;AAAA,QACX;AAAA,QACA,WAAW,EAAE;AAAA,QACb,WAAW,EAAE;AAAA,QACb,OAAO,EAAE;AAAA,MACX,CAAC;AAAA,IACH;AAAA,EACF;AACA,SAAO,0BAA0B,SAAS,IAAI;AAChD;AAGA,SAAS,YAAY,GAAW,IAAoB;AAClD,MAAI,MAAM,EAAG,QAAO;AACpB,MAAI,KAAK,IAAK,QAAO,UAAU,CAAC;AAChC,QAAM,IAAI,MAAM,KAAK,IAAI;AACzB,QAAM,IAAI,KAAK;AACf,QAAM,IAAI;AACV,QAAM,KAAK,eAAe,GAAG,GAAG,CAAC;AACjC,SAAO,KAAK,IAAI,IAAI,MAAM,KAAK,MAAM;AACvC;AAGA,SAAS,eAAe,GAAW,GAAW,GAAmB;AAC/D,MAAI,KAAK,EAAG,QAAO;AACnB,MAAI,KAAK,EAAG,QAAO;AACnB,QAAM,SAAS,QAAQ,CAAC,IAAI,QAAQ,CAAC,IAAI,QAAQ,IAAI,CAAC;AACtD,QAAM,QAAQ,KAAK,IAAI,KAAK,IAAI,CAAC,IAAI,IAAI,KAAK,IAAI,IAAI,CAAC,IAAI,IAAI,MAAM,IAAI;AACzE,QAAM,UAAU;AAChB,QAAM,MAAM;AACZ,MAAI,IAAI;AACR,MAAI,IAAI,KAAM,IAAI,KAAK,KAAM,IAAI;AACjC,MAAI,KAAK,IAAI,CAAC,IAAI,MAAO,KAAI;AAC7B,MAAI,IAAI;AACR,MAAI,IAAI;AACR,WAAS,IAAI,GAAG,KAAK,SAAS,KAAK;AACjC,UAAM,KAAK,IAAI;AACf,QAAI,MAAO,KAAK,IAAI,KAAK,MAAO,IAAI,KAAK,MAAM,IAAI;AACnD,QAAI,IAAI,MAAM;AACd,QAAI,KAAK,IAAI,CAAC,IAAI,MAAO,KAAI;AAC7B,QAAI,IAAI,MAAM;AACd,QAAI,KAAK,IAAI,CAAC,IAAI,MAAO,KAAI;AAC7B,QAAI,IAAI;AACR,SAAK,IAAI;AACT,UAAM,GAAG,IAAI,MAAM,IAAI,IAAI,KAAK,OAAO,IAAI,OAAO,IAAI,KAAK;AAC3D,QAAI,IAAI,MAAM;AACd,QAAI,KAAK,IAAI,CAAC,IAAI,MAAO,KAAI;AAC7B,QAAI,IAAI,MAAM;AACd,QAAI,KAAK,IAAI,CAAC,IAAI,MAAO,KAAI;AAC7B,QAAI,IAAI;AACR,UAAM,QAAQ,IAAI;AAClB,SAAK;AACL,QAAI,KAAK,IAAI,QAAQ
,CAAC,IAAI,IAAK;AAAA,EACjC;AACA,SAAO,QAAQ;AACjB;AAGA,SAAS,QAAQ,GAAmB;AAClC,QAAM,IAAI;AACV,QAAM,QAAQ;AAAA,IACZ;AAAA,IAAqB;AAAA,IAAmB;AAAA,IAAqB;AAAA,IAC7D;AAAA,IAAqB;AAAA,IAAoB;AAAA,IAAsB;AAAA,IAC/D;AAAA,EACF;AACA,MAAI,IAAI,KAAK;AACX,WAAO,KAAK,IAAI,KAAK,KAAK,KAAK,IAAI,KAAK,KAAK,CAAC,CAAC,IAAI,QAAQ,IAAI,CAAC;AAAA,EAClE;AACA,OAAK;AACL,MAAI,IAAI,MAAM,CAAC;AACf,WAAS,IAAI,GAAG,IAAI,IAAI,GAAG,IAAK,MAAK,MAAM,CAAC,KAAM,IAAI;AACtD,QAAM,IAAI,IAAI,IAAI;AAClB,SAAO,MAAM,KAAK,IAAI,IAAI,KAAK,EAAE,KAAK,IAAI,OAAO,KAAK,IAAI,CAAC,IAAI,IAAI,KAAK,IAAI,CAAC;AAC/E;AAGA,SAAS,UAAU,GAAmB;AACpC,QAAM,KAAK;AACX,QAAM,KAAK;AACX,QAAM,KAAK;AACX,QAAM,KAAK;AACX,QAAM,KAAK;AACX,QAAM,IAAI;AAEV,QAAM,OAAO,IAAI,IAAI,KAAK;AAC1B,QAAM,OAAO,KAAK,IAAI,CAAC;AACvB,QAAM,IAAI,KAAK,IAAI,IAAI;AACvB,QAAM,IAAI,QAAQ,KAAK,IAAI,MAAM,IAAI,MAAM,IAAI,MAAM,IAAI,MAAM,IAAI,KAAK,IAAK,CAAC,OAAO,OAAQ,CAAC;AAE9F,SAAO,OAAO,IAAI,OAAO;AAC3B;AAUO,SAAS,mBAAmB,MAKxB;AACT,QAAM,SAAS,KAAK;AACpB,MAAI,CAAC,OAAO,SAAS,MAAM,KAAK,UAAU,EAAG,QAAO;AACpD,QAAM,QAAQ,KAAK,SAAS;AAC5B,QAAM,QAAQ,KAAK,SAAS;AAC5B,QAAM,WAAW,KAAK,YAAY;AAClC,QAAM,SAAS,UAAU,WAAW,IAAI,QAAQ,IAAI,IAAI,KAAK;AAC7D,QAAM,QAAQ,UAAU,KAAK;AAC7B,QAAM,IAAI,MAAM,SAAS,SAAS,WAAW;AAC7C,SAAO,KAAK,KAAK,CAAC;AACpB;AAQO,SAAS,UAAU,MAKf;AACT,MAAI,CAAC,OAAO,SAAS,KAAK,OAAO,KAAK,KAAK,WAAW,EAAG,QAAO;AAChE,QAAM,QAAQ,KAAK,SAAS;AAC5B,QAAM,QAAQ,KAAK,SAAS;AAC5B,QAAM,WAAW,KAAK,YAAY;AAClC,QAAM,SAAS,UAAU,WAAW,IAAI,QAAQ,IAAI,IAAI,KAAK;AAC7D,QAAM,QAAQ,UAAU,KAAK;AAC7B,UAAQ,SAAS,SAAS,KAAK,KAAK,KAAK,OAAO;AAClD;AAGO,SAAS,WACd,SACA,QAAQ,MACwC;AAChD,QAAM,IAAI,QAAQ;AAClB,QAAM,WAAW,QAAQ,IAAI,CAAC,MAAM,KAAK,IAAI,GAAG,IAAI,CAAC,CAAC;AACtD,QAAM,cAAc,SAAS,IAAI,CAAC,MAAM,IAAI,KAAK;AACjD,SAAO,EAAE,UAAU,YAAY;AACjC;AAMO,SAAS,kBACd,SACA,MAAM,MACyC;AAC/C,QAAM,IAAI,QAAQ;AAClB,MAAI,MAAM,EAAG,QAAO,EAAE,SAAS,CAAC,GAAG,aAAa,CAAC,EAAE;AACnD,QAAM,UAAU,QAAQ,IAAI,CAAC,GAAG,OAAO,EAAE,GAAG,EAAE,EAAE,EAAE,KAAK,CAAC,GAAG,MAAM,EAAE,IAAI,EAAE,CAAC;AAC1E,QAAM,IAAI,IAAI,MAAc,CAAC;AAC7B,MAAI,WAAW;AACf,WAAS,IAAI,IAAI,GAAG,KAAK,GAAG,KAAK;AAC/B,UAAM,OAA
O,IAAI;AACjB,UAAM,QAAQ,QAAQ,CAAC;AACvB,UAAM,MAAO,MAAM,IAAI,IAAK;AAC5B,UAAM,UAAU,KAAK,IAAI,UAAU,GAAG;AACtC,eAAW;AACX,MAAE,MAAM,CAAC,IAAI,KAAK,IAAI,GAAG,OAAO;AAAA,EAClC;AACA,QAAM,cAAc,EAAE,IAAI,CAAC,MAAM,IAAI,GAAG;AACxC,SAAO,EAAE,SAAS,GAAG,YAAY;AACnC;AAsCO,SAAS,gBACd,QACA,OACA,OAA+B,CAAC,GACT;AACvB,MAAI,OAAO,WAAW,MAAM,QAAQ;AAClC,UAAM,IAAI,MAAM,0CAA0C,OAAO,MAAM,OAAO,MAAM,MAAM,GAAG;AAAA,EAC/F;AACA,QAAM,aAAa,KAAK,cAAc;AACtC,QAAM,YAAY,KAAK,aAAa;AACpC,QAAM,YAAY,KAAK,aAAa;AACpC,MAAI,cAAc,KAAK,cAAc,GAAG;AACtC,UAAM,IAAI,MAAM,qDAAqD,UAAU,EAAE;AAAA,EACnF;AAEA,QAAM,IAAI,OAAO;AACjB,QAAM,SAAS,OAAO,IAAI,CAAC,GAAG,MAAM,MAAM,CAAC,IAAK,CAAC;AACjD,MAAI,MAAM,GAAG;AACX,WAAO,EAAE,GAAG,GAAG,QAAQ,GAAG,MAAM,GAAG,KAAK,GAAG,MAAM,GAAG,YAAY,UAAU;AAAA,EAC5E;AACA,MAAI,MAAM,GAAG;AACX,UAAM,IAAI,OAAO,CAAC;AAClB,WAAO,EAAE,GAAG,GAAG,QAAQ,GAAG,MAAM,GAAG,KAAK,GAAG,MAAM,GAAG,YAAY,UAAU;AAAA,EAC5E;AAEA,QAAM,MAAM,QAAQ,KAAK,IAAI;AAC7B,QAAM,UAAU,IAAI,MAAc,SAAS;AAC3C,WAAS,IAAI,GAAG,IAAI,WAAW,KAAK;AAClC,QAAI,cAAc,QAAQ;AACxB,UAAI,MAAM;AACV,eAAS,IAAI,GAAG,IAAI,GAAG,KAAK;AAC1B,eAAO,OAAO,KAAK,MAAM,IAAI,IAAI,CAAC,CAAC;AAAA,MACrC;AACA,cAAQ,CAAC,IAAI,MAAM;AAAA,IACrB,OAAO;AACL,YAAM,MAAM,IAAI,MAAc,CAAC;AAC/B,eAAS,IAAI,GAAG,IAAI,GAAG,KAAK;AAC1B,YAAI,CAAC,IAAI,OAAO,KAAK,MAAM,IAAI,IAAI,CAAC,CAAC;AAAA,MACvC;AACA,cAAQ,CAAC,IAAI,cAAc,GAAG;AAAA,IAChC;AAAA,EACF;AACA,UAAQ,KAAK,CAAC,GAAG,MAAM,IAAI,CAAC;AAE5B,QAAM,QAAQ,IAAI;AAClB,QAAM,SAAS,KAAK,MAAO,QAAQ,IAAK,SAAS;AACjD,QAAM,UAAU,KAAK,IAAI,YAAY,GAAG,KAAK,MAAM,IAAI,QAAQ,KAAK,SAAS,IAAI,CAAC;AAElF,SAAO;AAAA,IACL;AAAA,IACA,QAAQ,cAAc,CAAC,GAAG,MAAM,CAAC;AAAA,IACjC,MAAM,OAAO,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI;AAAA,IAC1C,KAAK,QAAQ,MAAM;AAAA,IACnB,MAAM,QAAQ,KAAK,IAAI,SAAS,MAAM,CAAC;AAAA,IACvC;AAAA,IACA;AAAA,EACF;AACF;AAKA,SAAS,UAAU,GAAmB;AACpC,MAAI,KAAK,KAAK,KAAK,GAAG;AACpB,QAAI,MAAM,EAAG,QAAO;AACpB,QAAI,MAAM,EAAG,QAAO;AACpB,WAAO;AAAA,EACT;AACA,QAAM,IAAI;AAAA,IACR;AAAA,IAAsB;AAAA,IAAqB;AAAA,IAAsB;AAAA,IACjE;AAAA,IAAsB;AAAA,EACxB;AACA,QAAM,IAAI;AAAA,IACR;AAAA,IAAsB;AAAA,IAAqB;AAAA,IAAs
B;AAAA,IACjE;AAAA,EACF;AACA,QAAM,IAAI;AAAA,IACR;AAAA,IAAuB;AAAA,IAAuB;AAAA,IAAoB;AAAA,IAClE;AAAA,IAAmB;AAAA,EACrB;AACA,QAAM,IAAI,CAAC,sBAAsB,oBAAsB,mBAAmB,iBAAiB;AAC3F,QAAM,OAAO;AACb,QAAM,QAAQ,IAAI;AAClB,MAAI;AACJ,MAAI;AACJ,MAAI,IAAI,MAAM;AACZ,QAAI,KAAK,KAAK,KAAK,KAAK,IAAI,CAAC,CAAC;AAC9B,gBACO,EAAE,CAAC,IAAK,IAAI,EAAE,CAAC,KAAM,IAAI,EAAE,CAAC,KAAM,IAAI,EAAE,CAAC,KAAM,IAAI,EAAE,CAAC,KAAM,IAAI,EAAE,CAAC,SACpE,EAAE,CAAC,IAAK,IAAI,EAAE,CAAC,KAAM,IAAI,EAAE,CAAC,KAAM,IAAI,EAAE,CAAC,KAAM,IAAI;AAAA,EAE3D;AACA,MAAI,KAAK,OAAO;AACd,QAAI,IAAI;AACR,QAAI,IAAI;AACR,gBACQ,EAAE,CAAC,IAAK,IAAI,EAAE,CAAC,KAAM,IAAI,EAAE,CAAC,KAAM,IAAI,EAAE,CAAC,KAAM,IAAI,EAAE,CAAC,KAAM,IAAI,EAAE,CAAC,KAAM,SAC1E,EAAE,CAAC,IAAK,IAAI,EAAE,CAAC,KAAM,IAAI,EAAE,CAAC,KAAM,IAAI,EAAE,CAAC,KAAM,IAAI,EAAE,CAAC,KAAM,IAAI;AAAA,EAEzE;AACA,MAAI,KAAK,KAAK,KAAK,KAAK,IAAI,IAAI,CAAC,CAAC;AAClC,SACE,MAAM,EAAE,CAAC,IAAK,IAAI,EAAE,CAAC,KAAM,IAAI,EAAE,CAAC,KAAM,IAAI,EAAE,CAAC,KAAM,IAAI,EAAE,CAAC,KAAM,IAAI,EAAE,CAAC,SACrE,EAAE,CAAC,IAAK,IAAI,EAAE,CAAC,KAAM,IAAI,EAAE,CAAC,KAAM,IAAI,EAAE,CAAC,KAAM,IAAI;AAE3D;AAEA,SAAS,cAAc,IAAsB;AAC3C,MAAI,GAAG,WAAW,EAAG,QAAO;AAC5B,KAAG,KAAK,CAAC,GAAG,MAAM,IAAI,CAAC;AACvB,QAAM,MAAM,KAAK,MAAM,GAAG,SAAS,CAAC;AACpC,SAAO,GAAG,SAAS,MAAM,KAAK,GAAG,MAAM,CAAC,IAAK,GAAG,GAAG,KAAM,IAAI,GAAG,GAAG;AACrE;AAGA,SAAS,QAAQ,MAAwC;AACvD,MAAI,SAAS,OAAW,QAAO,KAAK;AACpC,MAAI,IAAI,OAAO,KAAK;AACpB,SAAO,MAAM;AACX,QAAK,IAAI,aAAc;AACvB,QAAI,IAAI;AACR,QAAI,KAAK,KAAK,IAAK,MAAM,IAAK,IAAI,CAAC;AACnC,SAAK,IAAI,KAAK,KAAK,IAAK,MAAM,GAAI,IAAI,EAAE;AACxC,aAAS,IAAK,MAAM,QAAS,KAAK;AAAA,EACpC;AACF;","names":["i","i"]}
|
|
@@ -0,0 +1,313 @@
|
|
|
1
|
+
// src/rl/verifiable-reward.ts
|
|
2
|
+
/**
 * Layer names counted as deterministic reward sources when the caller does
 * not pass `opts.deterministicLayers` to the extraction functions.
 */
var DEFAULT_DETERMINISTIC_LAYERS = /* @__PURE__ */ new Set([
  "install",
  "typecheck",
  "build",
  "lint",
  "test",
  "compile",
  "schema",
  "sandbox",
  "unit_tests",
  "integration_tests"
]);
|
|
14
|
+
/**
 * Default mapping from a layer name to a reward source, used when the caller
 * does not pass `opts.sourceFor`.
 *
 * Matching is a case-insensitive substring test. Rules are tried in order and
 * the first hit wins, so a name containing "test" maps to "test" even if it
 * also contains "build". Names matching no rule map to "composite".
 *
 * @param {string} name - Layer name.
 * @returns {string} One of "test", "compile", "schema", "sandbox", "judge", "composite".
 */
var DEFAULT_SOURCE_FOR = (name) => {
  const lower = name.toLowerCase();
  // [substring, source] pairs; the order is significant.
  const rules = [
    ["test", "test"],
    ["compile", "compile"],
    ["build", "compile"],
    ["typecheck", "compile"],
    ["lint", "compile"],
    ["schema", "schema"],
    ["sandbox", "sandbox"],
    ["judge", "judge"],
    ["semantic", "judge"]
  ];
  const match = rules.find(([needle]) => lower.includes(needle));
  return match ? match[1] : "composite";
};
|
|
24
|
+
/**
 * Derive a single reward object from a verification report.
 *
 * Resolution order:
 *  1. Exactly one deterministic layer with a finite score: that layer's
 *     clamped score, with `source` taken from `sourceFor(layer.layer)`.
 *  2. Several deterministic layers: a weighted mean of their scores
 *     (`detail.weight`, default 1), reported as source "composite".
 *  3. Otherwise, if `fallbackToJudge` is enabled: the first finite-scored
 *     layer that `sourceFor` maps to "judge", or failing that the first
 *     finite-scored layer of any kind. It is reported as source "judge" and
 *     determinism "probabilistic".
 *  4. Otherwise `null`.
 *
 * @param {{ layers: Array<object> }} report - Layers are read via `.layer`,
 *   `.score`, `.detail` and `.diagnostics`.
 * @param {object} [opts]
 * @param {Iterable<string>} [opts.deterministicLayers] - Defaults to DEFAULT_DETERMINISTIC_LAYERS.
 * @param {(name: string) => string} [opts.sourceFor] - Defaults to DEFAULT_SOURCE_FOR.
 * @param {boolean} [opts.fallbackToJudge=true]
 * @param {number} [opts.judgeConfidenceFloor=0.7] - Confidence used when the
 *   chosen judge layer reports no usable `detail.confidence`.
 * @returns {object|null} Reward `{ value, source, determinism, confidence, origin, breakdown }` or null.
 */
function extractVerifiableReward(report, opts = {}) {
  const deterministicSet = new Set(opts.deterministicLayers ?? [...DEFAULT_DETERMINISTIC_LAYERS]);
  const sourceFor = opts.sourceFor ?? DEFAULT_SOURCE_FOR;
  const fallbackToJudge = opts.fallbackToJudge ?? true;
  const judgeFloor = opts.judgeConfidenceFloor ?? 0.7;
  const hasFiniteScore = (l) => typeof l.score === "number" && Number.isFinite(l.score);

  const deterministic = report.layers.filter(
    (l) => deterministicSet.has(l.layer) && hasFiniteScore(l)
  );

  if (deterministic.length === 1) {
    const [layer] = deterministic;
    return {
      value: clamp01(layer.score),
      source: sourceFor(layer.layer),
      determinism: "deterministic",
      confidence: 1,
      origin: layer.layer,
      breakdown: layerBreakdown(layer)
    };
  }

  if (deterministic.length > 1) {
    let num = 0;
    let denom = 0;
    const breakdown = {};
    for (const l of deterministic) {
      // Only a finite, non-negative number is a usable weight. Anything else
      // falls back to 1; a string weight would otherwise turn `denom += w`
      // into string concatenation and corrupt the mean.
      const rawWeight = l.detail?.weight;
      const w = typeof rawWeight === "number" && Number.isFinite(rawWeight) && rawWeight >= 0 ? rawWeight : 1;
      num += w * l.score;
      denom += w;
      breakdown[l.layer] = l.score;
    }
    return {
      // All weights being zero leaves no basis for a mean; report 0.
      value: denom === 0 ? 0 : clamp01(num / denom),
      source: "composite",
      determinism: "deterministic",
      confidence: 1,
      origin: deterministic.map((l) => l.layer).join("+"),
      breakdown
    };
  }

  if (!fallbackToJudge) return null;

  const judge =
    report.layers.find((l) => hasFiniteScore(l) && sourceFor(l.layer) === "judge") ??
    report.layers.find((l) => hasFiniteScore(l));
  if (!judge) return null;

  const confFromDetail = judge.detail?.confidence;
  return {
    value: clamp01(judge.score),
    source: "judge",
    determinism: "probabilistic",
    // A NaN/Infinity confidence is treated as missing rather than passed through.
    confidence: typeof confFromDetail === "number" && Number.isFinite(confFromDetail) ? confFromDetail : judgeFloor,
    origin: judge.layer,
    breakdown: layerBreakdown(judge)
  };
}
|
|
77
|
+
/**
 * Derive one reward per run record from the per-layer scores stored in
 * `run.outcome.raw`.
 *
 * Layer scores are the finite numeric entries whose key is `layer.<name>`
 * with no further dot after the prefix. Only layers named in the
 * deterministic set are used:
 *  - one such layer: its clamped score, with source `sourceFor(name)`;
 *  - several: the unweighted mean, reported as source "composite" with a
 *    per-layer `breakdown`;
 *  - none: if `fallbackToJudge` is on and `holdoutScore ?? searchScore` is a
 *    finite number, that score as a probabilistic "judge" reward with
 *    confidence `judgeConfidenceFloor`; otherwise `reward: null`.
 *
 * @param {Array<object>} runs - Each run is read via `.runId` and `.outcome`.
 * @param {object} [opts] - Same options as `extractVerifiableReward`.
 * @returns {Array<{ runId: *, reward: object|null }>} One entry per input run, in order.
 */
function extractVerifiableRewardsFromRecords(runs, opts = {}) {
  const sourceFor = opts.sourceFor ?? DEFAULT_SOURCE_FOR;
  const deterministicSet = new Set(opts.deterministicLayers ?? [...DEFAULT_DETERMINISTIC_LAYERS]);
  const fallbackToJudge = opts.fallbackToJudge ?? true;
  const judgeFloor = opts.judgeConfidenceFloor ?? 0.7;
  const LAYER_PREFIX = "layer.";

  return runs.map((run) => {
    const { runId } = run;

    // A record without `outcome.raw` simply has no layer scores; it must not
    // make the whole batch throw.
    const det = [];
    for (const [key, value] of Object.entries(run.outcome.raw ?? {})) {
      const isLayerKey = key.startsWith(LAYER_PREFIX) && !key.includes(".", LAYER_PREFIX.length);
      if (!isLayerKey || typeof value !== "number" || !Number.isFinite(value)) continue;
      const name = key.slice(LAYER_PREFIX.length);
      if (deterministicSet.has(name)) det.push({ name, score: value });
    }

    if (det.length === 1) {
      const [layer] = det;
      return {
        runId,
        reward: {
          value: clamp01(layer.score),
          source: sourceFor(layer.name),
          determinism: "deterministic",
          confidence: 1,
          origin: layer.name
        }
      };
    }

    if (det.length > 1) {
      const value = det.reduce((sum, l) => sum + l.score, 0) / det.length;
      return {
        runId,
        reward: {
          value: clamp01(value),
          source: "composite",
          determinism: "deterministic",
          confidence: 1,
          origin: det.map((l) => l.name).join("+"),
          breakdown: Object.fromEntries(det.map((l) => [l.name, l.score]))
        }
      };
    }

    if (!fallbackToJudge) return { runId, reward: null };

    const primary = run.outcome.holdoutScore ?? run.outcome.searchScore;
    if (typeof primary !== "number" || !Number.isFinite(primary)) {
      return { runId, reward: null };
    }
    return {
      runId,
      reward: {
        value: clamp01(primary),
        source: "judge",
        determinism: "probabilistic",
        confidence: judgeFloor,
        origin: "run.outcome.score"
      }
    };
  });
}
|
|
137
|
+
/**
 * Keep only the runs that yield a deterministic reward, pairing each with
 * that reward. Judge fallback is always disabled here, whatever `opts` says.
 *
 * @param {Array<object>} runs - Run records.
 * @param {object} [opts] - Forwarded to `extractVerifiableRewardsFromRecords`.
 * @returns {Array<{ run: object, reward: object }>} Surviving runs in input order.
 */
function filterDeterministicallyRewarded(runs, opts = {}) {
  const rewarded = extractVerifiableRewardsFromRecords(runs, { ...opts, fallbackToJudge: false });
  // `rewarded` is index-aligned with `runs` (one entry per run).
  return rewarded.flatMap(({ reward }, i) =>
    reward && reward.determinism === "deterministic" ? [{ run: runs[i], reward }] : []
  );
}
|
|
148
|
+
/**
 * Clamp a value into [0, 1].
 *
 * @param {number} x
 * @returns {number} 0 for anything that is not a finite number (NaN,
 *   ±Infinity, non-numbers); otherwise `x` limited to the range [0, 1].
 */
function clamp01(x) {
  if (!Number.isFinite(x)) return 0;
  if (x <= 0) return 0;
  return x >= 1 ? 1 : x;
}
|
|
152
|
+
/**
 * Collect the finite numeric entries of a layer's `diagnostics` object.
 *
 * @param {{ diagnostics?: object }} l - Layer result.
 * @returns {Record<string, number>} A fresh object; empty when `diagnostics`
 *   is missing or contains no finite numbers.
 */
function layerBreakdown(l) {
  const out = {};
  if (!l.diagnostics) return out;
  for (const [key, value] of Object.entries(l.diagnostics)) {
    if (typeof value !== "number" || !Number.isFinite(value)) continue;
    out[key] = value;
  }
  return out;
}
|
|
161
|
+
|
|
162
|
+
// src/rl/reward-hacking.ts
|
|
163
|
+
/**
 * Default proxy-reward accessor: the run's `holdoutScore`, or `searchScore`
 * when the holdout score is null/undefined.
 *
 * @param {{ outcome: { holdoutScore?: number, searchScore?: number } }} r
 * @returns {number|null} The score when it is a finite number, else null.
 */
var DEFAULT_PROXY = (r) => {
  const score = r.outcome.holdoutScore ?? r.outcome.searchScore;
  if (typeof score !== "number" || !Number.isFinite(score)) return null;
  return score;
};
|
|
167
|
+
/**
 * Look for signs that a proxy reward is being gamed, by comparing an early
 * ("before") window of runs against a late ("after") window.
 *
 * Only runs whose proxy reward is a finite number are considered; with fewer
 * than 4 such runs the result is "clean" with an "insufficient evidence"
 * rationale. Up to four findings are produced, each with a severity in [0, 1]:
 *  - `reward_divergence`   (needs `truthOf`): proxy rose more than truth;
 *  - `distribution_shift`: KS statistic between the before/after proxy values;
 *  - `reward_disagreement`: low Pearson correlation between the proxy and a
 *    secondary reward;
 *  - `judge_drift`: proxy rose more than the deterministic reward.
 * The verdict is "gaming" / "suspect" / "clean" according to the largest
 * severity against `thresholds.gaming` (default 0.6) and
 * `thresholds.suspect` (default 0.3).
 *
 * @param {object} input
 * @param {Array<object>} input.runs - Run records in chronological order
 *   (the last `windowSize` runs form the "after" window).
 * @param {(run: object) => (number|null)} [input.proxyOf] - Defaults to DEFAULT_PROXY.
 * @param {(run: object) => (number|null)} [input.truthOf]
 * @param {(run: object) => (number|null)} [input.secondaryRewardOf] - Defaults
 *   to the deterministic reward of the run.
 * @param {object} [input.verifiableRewardOptions]
 * @param {{ suspect?: number, gaming?: number }} [input.thresholds]
 * @param {number} [input.windowSize] - Size of the "after" window. Defaults
 *   to `min(50, floor(n / 2))`. Values that would leave the "before" window
 *   empty are reduced so that it keeps at least one run.
 * @returns {{ findings: Array<object>, verdict: string, rationale: string[], n: number }}
 */
function detectRewardHacking(input) {
  const proxyOf = input.proxyOf ?? DEFAULT_PROXY;
  const truthOf = input.truthOf;
  const sus = input.thresholds?.suspect ?? 0.3;
  const gam = input.thresholds?.gaming ?? 0.6;
  const isFiniteNumber = (v) => typeof v === "number" && Number.isFinite(v);

  // Evaluate the proxy once per run. `undefined` / NaN proxies are dropped
  // along with `null` so they cannot inflate `n` or poison the means.
  const scored = input.runs
    .map((run) => ({ run, proxy: proxyOf(run) }))
    .filter((entry) => isFiniteNumber(entry.proxy));
  const runs = scored.map((entry) => entry.run);
  const n = runs.length;
  if (n < 4) {
    return {
      findings: [],
      verdict: "clean",
      n,
      rationale: [`fewer than 4 runs with proxy reward (n=${n}); insufficient evidence`]
    };
  }

  // Split index between the windows. A windowSize >= n (or NaN) used to
  // produce a negative/NaN slice index, i.e. an empty or mis-sized "before"
  // window whose mean silently became 0. Keep at least one run on each side.
  const defaultWindow = Math.min(50, Math.floor(n / 2));
  const requestedWindow = input.windowSize ?? defaultWindow;
  const windowSize = Math.max(1, Number.isNaN(requestedWindow) ? defaultWindow : requestedWindow);
  const split = Math.max(1, Math.trunc(n - windowSize));
  const before = runs.slice(0, split);
  const after = runs.slice(split);
  const beforeProxy = scored.slice(0, split).map((entry) => entry.proxy);
  const afterProxy = scored.slice(split).map((entry) => entry.proxy);
  const proxyDelta = mean(afterProxy) - mean(beforeProxy);

  const findings = [];

  // Signal 1: the proxy improves while ground truth does not.
  if (truthOf) {
    const beforeTruth = before.map((r) => truthOf(r)).filter(isFiniteNumber);
    const afterTruth = after.map((r) => truthOf(r)).filter(isFiniteNumber);
    if (beforeProxy.length >= 2 && afterProxy.length >= 2 && beforeTruth.length >= 2 && afterTruth.length >= 2) {
      const truthDelta = mean(afterTruth) - mean(beforeTruth);
      const gap = Math.max(0, proxyDelta - truthDelta);
      const severity = clamp012(gap * 5);
      findings.push({
        signal: "reward_divergence",
        severity,
        message: severity >= sus ? `proxy reward rose by ${proxyDelta.toFixed(3)} while truth changed by ${truthDelta.toFixed(3)} \u2014 potential Goodhart` : `proxy and truth moved together (proxy ${proxyDelta.toFixed(3)}, truth ${truthDelta.toFixed(3)})`,
        detail: {
          proxyDelta,
          truthDelta,
          gap,
          beforeN: beforeProxy.length,
          afterN: afterProxy.length
        }
      });
    }
  }

  // Signal 2: the proxy distribution itself moved between the windows.
  if (beforeProxy.length >= 4 && afterProxy.length >= 4) {
    const ks = ksStatistic(beforeProxy, afterProxy);
    const severity = clamp012(ks - 0.2);
    findings.push({
      signal: "distribution_shift",
      severity,
      message: severity >= sus ? `KS=${ks.toFixed(3)} between before/after windows \u2014 distributional shift large` : `KS=${ks.toFixed(3)} between before/after windows \u2014 within-distribution drift`,
      detail: { ks, beforeN: beforeProxy.length, afterN: afterProxy.length }
    });
  }

  // Signal 3: the proxy disagrees with an independent secondary reward.
  {
    const secondaryOf = input.secondaryRewardOf ?? defaultSecondary(input.verifiableRewardOptions);
    const aligned = scored
      .map((entry) => ({ p: entry.proxy, s: secondaryOf(entry.run) }))
      .filter((pair) => isFiniteNumber(pair.s));
    if (aligned.length >= 4) {
      const r = pearsonR(aligned.map((pair) => pair.p), aligned.map((pair) => pair.s));
      const severity = clamp012(0.5 - Math.max(0, r));
      findings.push({
        signal: "reward_disagreement",
        severity,
        message: severity >= sus ? `proxy and independent secondary reward correlate \u03C1=${r.toFixed(3)} \u2014 possibly hacking proxy` : `proxy and secondary reward correlate \u03C1=${r.toFixed(3)}`,
        detail: { pearson: r, n: aligned.length }
      });
    }
  }

  // Signal 4: the proxy rises faster than the deterministic reward.
  // NOTE(review): the deterministic delta is taken over the two halves of the
  // deterministically-rewarded runs, while `proxyDelta` uses the
  // windowSize-based split over all runs, so the two deltas cover different
  // partitions. Kept as-is; confirm whether that is intended.
  {
    const detRuns = filterDeterministicallyRewarded(runs, input.verifiableRewardOptions ?? {});
    if (detRuns.length >= 4) {
      const half = Math.floor(detRuns.length / 2);
      const detBefore = detRuns.slice(0, half).map((r) => r.reward.value);
      const detAfter = detRuns.slice(half).map((r) => r.reward.value);
      const detDelta = mean(detAfter) - mean(detBefore);
      const driftGap = Math.max(0, proxyDelta - detDelta);
      const severity = clamp012(driftGap * 5);
      findings.push({
        signal: "judge_drift",
        severity,
        message: severity >= sus ? `judge proxy +${proxyDelta.toFixed(3)} while deterministic reward +${detDelta.toFixed(3)} \u2014 judge drifting up without verifiable backing` : `judge and deterministic rewards move in step (judge ${proxyDelta.toFixed(3)}, det ${detDelta.toFixed(3)})`,
        detail: { proxyDelta, detDelta, driftGap, n: detRuns.length }
      });
    }
  }

  const maxSev = findings.reduce((m, f) => Math.max(m, f.severity), 0);
  const verdict = maxSev >= gam ? "gaming" : maxSev >= sus ? "suspect" : "clean";
  const rationale = findings
    .filter((f) => f.severity >= sus)
    .map((f) => `${f.signal}: severity ${f.severity.toFixed(2)} \u2014 ${f.message}`);
  if (rationale.length === 0) rationale.push("no signals fired above suspect threshold");
  return { findings, verdict, rationale, n };
}
|
|
265
|
+
/**
 * Arithmetic mean of a numeric array.
 *
 * @param {number[]} xs
 * @returns {number} The mean, or 0 for an empty array.
 */
function mean(xs) {
  const count = xs.length;
  return count === 0 ? 0 : xs.reduce((total, x) => total + x, 0) / count;
}
|
|
269
|
+
/**
 * Clamp a value into [0, 1]. Same behaviour as `clamp01` defined earlier in
 * this chunk; it exists under a separate name in the bundled output.
 *
 * @param {number} x
 * @returns {number} 0 for anything that is not a finite number; otherwise
 *   `x` limited to the range [0, 1].
 */
function clamp012(x) {
  if (!Number.isFinite(x)) return 0;
  if (x <= 0) return 0;
  return x >= 1 ? 1 : x;
}
|
|
273
|
+
/**
 * Pearson correlation coefficient of two equal-length samples.
 *
 * @param {number[]} a
 * @param {number[]} b
 * @returns {number} r in [-1, 1]. Returns 0 when the lengths differ, when
 *   there are fewer than 2 points, or when either sample has no variance.
 */
function pearsonR(a, b) {
  const len = a.length;
  if (len !== b.length || len < 2) return 0;

  const average = (xs) => xs.reduce((total, x) => total + x, 0) / xs.length;
  const meanA = average(a);
  const meanB = average(b);

  let covariance = 0;
  let varianceA = 0;
  let varianceB = 0;
  for (let i = 0; i < len; i++) {
    const devA = a[i] - meanA;
    const devB = b[i] - meanB;
    covariance += devA * devB;
    varianceA += devA * devA;
    varianceB += devB * devB;
  }
  if (varianceA === 0 || varianceB === 0) return 0;

  // The product can underflow to 0 or overflow to Infinity for extreme
  // inputs; treat that as "no usable correlation" instead of returning
  // Infinity/NaN.
  const denom = Math.sqrt(varianceA * varianceB);
  if (denom === 0 || !Number.isFinite(denom)) return 0;

  // Floating-point rounding can push a perfect correlation marginally past
  // ±1; clamp so callers can rely on the mathematical range.
  return Math.max(-1, Math.min(1, covariance / denom));
}
|
|
288
|
+
/**
 * Two-sample Kolmogorov–Smirnov statistic: the largest absolute difference
 * between the empirical CDFs of `a` and `b`, evaluated at every distinct
 * observed value.
 *
 * @param {number[]} a
 * @param {number[]} b
 * @returns {number} A value in [0, 1]; 0 when either sample is empty.
 */
function ksStatistic(a, b) {
  const sizeA = a.length;
  const sizeB = b.length;
  // An empty sample has no CDF (it used to yield 0/0 = NaN).
  if (sizeA === 0 || sizeB === 0) return 0;

  // NaN never satisfies `x <= v`, so it contributes to the sample size but
  // never to a CDF count. Dropping it from the sorted copies while still
  // dividing by the full length keeps that behaviour and guarantees that the
  // merge below always advances.
  const ascending = (x, y) => x - y;
  const sortedA = a.filter((x) => !Number.isNaN(x)).sort(ascending);
  const sortedB = b.filter((x) => !Number.isNaN(x)).sort(ascending);

  // Single merge pass over both sorted samples instead of re-scanning each
  // array for every distinct value.
  let i = 0;
  let j = 0;
  let max = 0;
  while (i < sortedA.length || j < sortedB.length) {
    const takeFromA = j >= sortedB.length || (i < sortedA.length && sortedA[i] <= sortedB[j]);
    const v = takeFromA ? sortedA[i] : sortedB[j];
    while (i < sortedA.length && sortedA[i] <= v) i++;
    while (j < sortedB.length && sortedB[j] <= v) j++;
    max = Math.max(max, Math.abs(i / sizeA - j / sizeB));
  }
  return max;
}
|
|
300
|
+
/**
 * Build the default secondary-reward accessor: the deterministic reward
 * value of a single run, obtained via `filterDeterministicallyRewarded`.
 *
 * @param {object} [verifiableOpts] - Options forwarded to
 *   `filterDeterministicallyRewarded`; `{}` when null/undefined.
 * @returns {(run: object) => (number|null)} Accessor returning the run's
 *   deterministic reward value, or null when the run has none.
 */
function defaultSecondary(verifiableOpts) {
  const options = verifiableOpts ?? {};
  return (run) => {
    const rewarded = filterDeterministicallyRewarded([run], options);
    return rewarded.length === 1 ? rewarded[0].reward.value : null;
  };
}
|
|
306
|
+
|
|
307
|
+
export {
|
|
308
|
+
extractVerifiableReward,
|
|
309
|
+
extractVerifiableRewardsFromRecords,
|
|
310
|
+
filterDeterministicallyRewarded,
|
|
311
|
+
detectRewardHacking
|
|
312
|
+
};
|
|
313
|
+
//# sourceMappingURL=chunk-YV7J7X5N.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/rl/verifiable-reward.ts","../src/rl/reward-hacking.ts"],"sourcesContent":["/**\n * Verifiable reward channel.\n *\n * For RL on coding / math / theorem-proving / structured-output tasks, the\n * reward signal is *decidable* — a test passes or fails, a proof checks or\n * doesn't, an output validates against a schema or doesn't. These rewards\n * are dramatically more useful for RL training than LLM-judge scores\n * because they don't drift, can't be Goodhart-gamed by the policy in the\n * same way, and don't require a separate calibration loop.\n *\n * The `MultiLayerVerifier` already produces this signal — it just doesn't\n * surface it in a shape that's clean enough for RL training. This module\n * wraps the verifier output so consumers can:\n *\n * 1. Extract a clean `VerifiableReward` from a `VerificationReport`\n * 2. Distinguish *deterministic* rewards (compile, test, schema) from\n * *probabilistic* rewards (judge) so they can be weighted differently\n * in the RL training step\n * 3. Filter `RunRecord[]` to only those with a verifiable reward,\n * producing the clean training set that DeepSeek-R1-style GRPO and\n * AlphaProof-style search both depend on\n *\n * Why this matters: every credible 2025-2026 frontier RL result on coding\n * agents leans on verifiable reward (DeepSeek-R1 GRPO on test pass-rate,\n * o-series RL on math/code, AlphaProof on Lean kernel checking). Mixing\n * judge scores into the reward signal poisons the gradient. 
This module\n * is the seam.\n */\n\nimport type { LayerResult, VerificationReport } from '../multi-layer-verifier'\nimport type { RunRecord } from '../run-record'\n\nexport type VerifiableRewardSource =\n | 'compile' // typecheck / build / lint passed\n | 'test' // unit / integration test pass-rate\n | 'schema' // structured output validates\n | 'sandbox' // sandbox exec exit code\n | 'judge' // LLM judge — probabilistic, included for completeness\n | 'composite' // weighted blend across multiple of the above\n\nexport interface VerifiableReward {\n /** Scalar in [0, 1]. The RL training signal. */\n value: number\n /** What produced the reward — different sources have different determinism. */\n source: VerifiableRewardSource\n /**\n * Determinism class. `'deterministic'` rewards are repeatable byte-for-byte\n * given the same inputs (compile, test, schema validation, sandbox exit code).\n * `'probabilistic'` rewards depend on a stochastic component (LLM judge).\n * Mixing these in the same training batch without separation is a known\n * footgun in production RLHF pipelines.\n */\n determinism: 'deterministic' | 'probabilistic'\n /**\n * Confidence in the reward value. For deterministic sources this is 1.0\n * (the bit either flipped or didn't). For judge sources this is the\n * judge-reported confidence or — when missing — a calibrated prior.\n */\n confidence: number\n /** The layer / judge id that produced the signal, for provenance. */\n origin: string\n /**\n * Any per-source breakdown the consumer might want — e.g. `{ tests_passed: 7, tests_total: 10 }`.\n */\n breakdown?: Record<string, number>\n}\n\nexport interface VerifiableRewardExtractionOptions {\n /**\n * Which layers count as deterministic-reward sources. The verifier doesn't\n * tag layers as \"this is verifiable\"; the caller declares it via this list\n * (or via the layer name → source mapping). 
Default treats common names\n * (`install`, `typecheck`, `build`, `lint`, `test`, `compile`, `schema`,\n * `sandbox`) as deterministic.\n */\n deterministicLayers?: string[]\n /**\n * Map layer name → reward source. Defaults to a sensible string-match.\n */\n sourceFor?: (layerName: string) => VerifiableRewardSource\n /**\n * Whether to fall back to a probabilistic (judge) reward when no\n * deterministic layer produced a numeric score. Default `true`. Set to\n * `false` for \"deterministic-only\" training pipelines that should\n * discard runs without a verifiable signal.\n */\n fallbackToJudge?: boolean\n /**\n * Default confidence for probabilistic (judge) rewards when the judge\n * doesn't report one. Default `0.7`.\n */\n judgeConfidenceFloor?: number\n}\n\nconst DEFAULT_DETERMINISTIC_LAYERS = new Set([\n 'install',\n 'typecheck',\n 'build',\n 'lint',\n 'test',\n 'compile',\n 'schema',\n 'sandbox',\n 'unit_tests',\n 'integration_tests',\n])\n\nconst DEFAULT_SOURCE_FOR = (name: string): VerifiableRewardSource => {\n const lower = name.toLowerCase()\n if (lower.includes('test')) return 'test'\n if (\n lower.includes('compile') ||\n lower.includes('build') ||\n lower.includes('typecheck') ||\n lower.includes('lint')\n )\n return 'compile'\n if (lower.includes('schema')) return 'schema'\n if (lower.includes('sandbox')) return 'sandbox'\n if (lower.includes('judge') || lower.includes('semantic')) return 'judge'\n return 'composite'\n}\n\n/**\n * Extract a `VerifiableReward` from a `VerificationReport`.\n *\n * Strategy: prefer the deterministic layers (in order: test → compile →\n * schema → sandbox), fall back to the judge layer if `fallbackToJudge` is\n * true, return `null` if no signal qualifies. 
When multiple deterministic\n * layers contribute, return a `'composite'` source with a weighted blend.\n */\nexport function extractVerifiableReward(\n report: VerificationReport,\n opts: VerifiableRewardExtractionOptions = {},\n): VerifiableReward | null {\n const deterministicSet = new Set(opts.deterministicLayers ?? [...DEFAULT_DETERMINISTIC_LAYERS])\n const sourceFor = opts.sourceFor ?? DEFAULT_SOURCE_FOR\n const fallbackToJudge = opts.fallbackToJudge ?? true\n const judgeFloor = opts.judgeConfidenceFloor ?? 0.7\n\n const deterministic = report.layers.filter(\n (l) => deterministicSet.has(l.layer) && typeof l.score === 'number' && Number.isFinite(l.score),\n )\n\n if (deterministic.length === 1) {\n const layer = deterministic[0]!\n return {\n value: clamp01(layer.score!),\n source: sourceFor(layer.layer),\n determinism: 'deterministic',\n confidence: 1,\n origin: layer.layer,\n breakdown: layerBreakdown(layer),\n }\n }\n\n if (deterministic.length > 1) {\n // Composite: weighted blend by `Layer.weight` if present, else equal.\n let num = 0\n let denom = 0\n const breakdown: Record<string, number> = {}\n for (const l of deterministic) {\n const w = (l.detail?.weight as number | undefined) ?? 1\n num += w * (l.score ?? 0)\n denom += w\n breakdown[l.layer] = l.score!\n }\n return {\n value: denom === 0 ? 0 : clamp01(num / denom),\n source: 'composite',\n determinism: 'deterministic',\n confidence: 1,\n origin: deterministic.map((l) => l.layer).join('+'),\n breakdown,\n }\n }\n\n if (!fallbackToJudge) return null\n\n const judge =\n report.layers.find(\n (l) =>\n typeof l.score === 'number' && Number.isFinite(l.score) && sourceFor(l.layer) === 'judge',\n ) ?? 
report.layers.find((l) => typeof l.score === 'number' && Number.isFinite(l.score))\n\n if (!judge) return null\n\n const confFromDetail = judge.detail?.confidence as number | undefined\n return {\n value: clamp01(judge.score!),\n source: 'judge',\n determinism: 'probabilistic',\n confidence: typeof confFromDetail === 'number' ? confFromDetail : judgeFloor,\n origin: judge.layer,\n breakdown: layerBreakdown(judge),\n }\n}\n\n/**\n * Extract verifiable rewards from `RunRecord[]` produced via the\n * `verificationReportToRunRecord` adapter (which encodes per-layer scores\n * in `outcome.raw['layer.<name>']`). For records that don't carry layer\n * scores, returns `null` for that record.\n *\n * This is the canonical bridge from \"campaign-shaped artifacts\" to\n * \"RL-training-ready reward signals\": every record that has a clean\n * verifiable reward becomes a training datum, every record that doesn't\n * gets filtered out (or kept with `'probabilistic'` determinism for\n * separate downstream handling).\n */\nexport function extractVerifiableRewardsFromRecords(\n runs: RunRecord[],\n opts: VerifiableRewardExtractionOptions = {},\n): Array<{ runId: string; reward: VerifiableReward | null }> {\n const sourceFor = opts.sourceFor ?? DEFAULT_SOURCE_FOR\n const deterministicSet = new Set(opts.deterministicLayers ?? [...DEFAULT_DETERMINISTIC_LAYERS])\n const fallbackToJudge = opts.fallbackToJudge ?? true\n const judgeFloor = opts.judgeConfidenceFloor ?? 
0.7\n\n return runs.map((run) => {\n // Recover per-layer scores from outcome.raw['layer.<name>']\n const layerScores: Array<{ name: string; score: number }> = []\n for (const [k, v] of Object.entries(run.outcome.raw)) {\n if (\n k.startsWith('layer.') &&\n !k.includes('.', 6) &&\n typeof v === 'number' &&\n Number.isFinite(v)\n ) {\n layerScores.push({ name: k.slice('layer.'.length), score: v })\n }\n }\n const det = layerScores.filter((l) => deterministicSet.has(l.name))\n\n if (det.length === 1) {\n const layer = det[0]!\n return {\n runId: run.runId,\n reward: {\n value: clamp01(layer.score),\n source: sourceFor(layer.name),\n determinism: 'deterministic',\n confidence: 1,\n origin: layer.name,\n },\n }\n }\n if (det.length > 1) {\n const value = det.reduce((s, l) => s + l.score, 0) / det.length\n const breakdown: Record<string, number> = Object.fromEntries(\n det.map((l) => [l.name, l.score]),\n )\n return {\n runId: run.runId,\n reward: {\n value: clamp01(value),\n source: 'composite',\n determinism: 'deterministic',\n confidence: 1,\n origin: det.map((l) => l.name).join('+'),\n breakdown,\n },\n }\n }\n if (!fallbackToJudge) return { runId: run.runId, reward: null }\n\n // Probabilistic fallback: use the run's primary score.\n const primary = run.outcome.holdoutScore ?? run.outcome.searchScore\n if (typeof primary !== 'number' || !Number.isFinite(primary)) {\n return { runId: run.runId, reward: null }\n }\n return {\n runId: run.runId,\n reward: {\n value: clamp01(primary),\n source: 'judge',\n determinism: 'probabilistic',\n confidence: judgeFloor,\n origin: 'run.outcome.score',\n },\n }\n })\n}\n\n/** Filter `RunRecord[]` to those with deterministic verifiable rewards. 
*/\nexport function filterDeterministicallyRewarded(\n runs: RunRecord[],\n opts: VerifiableRewardExtractionOptions = {},\n): Array<{ run: RunRecord; reward: VerifiableReward }> {\n const rewarded = extractVerifiableRewardsFromRecords(runs, { ...opts, fallbackToJudge: false })\n const out: Array<{ run: RunRecord; reward: VerifiableReward }> = []\n for (let i = 0; i < runs.length; i++) {\n const r = rewarded[i]!\n if (r.reward && r.reward.determinism === 'deterministic') {\n out.push({ run: runs[i]!, reward: r.reward })\n }\n }\n return out\n}\n\n// ── Helpers ──────────────────────────────────────────────────────────────\n\nfunction clamp01(x: number): number {\n if (!Number.isFinite(x)) return 0\n return Math.max(0, Math.min(1, x))\n}\n\nfunction layerBreakdown(l: LayerResult): Record<string, number> {\n const out: Record<string, number> = {}\n if (l.diagnostics) {\n for (const [k, v] of Object.entries(l.diagnostics)) {\n if (typeof v === 'number' && Number.isFinite(v)) out[k] = v\n }\n }\n return out\n}\n","/**\n * Reward hacking / Goodhart detection.\n *\n * Goodhart's Law says: when a measure becomes a target, it ceases to be\n * a good measure. In RLHF and agentic-RL settings this is the dominant\n * failure mode — the policy learns to produce outputs that score well on\n * the proxy reward (judge, rubric, test pass-rate) without producing\n * the underlying capability the proxy was meant to track.\n *\n * Krakovna et al. (2020, \"Specification Gaming Examples in AI\") and the\n * subsequent RLHF reward-hacking literature (Skalse et al. 2022, Kim et al.\n * 2023) converge on a few diagnostic signatures:\n *\n * 1. **Reward divergence:** the proxy reward grows while the held-out\n * ground-truth signal stagnates or drops. Predictive validity over\n * time captures this.\n * 2. 
**Distributional shift in outputs:** after RL, the policy produces\n * outputs that no longer match the reference distribution — usually\n * because it found a high-reward attractor that's degenerate (e.g.\n * one-token responses, repetition, formatting tricks).\n * 3. **Disagreement between independent rewards:** if you train on\n * reward A and a held-out independent reward B drops sharply, you're\n * probably hacking A.\n * 4. **Calibration drift:** the verifiable / deterministic component of\n * the reward is stable; the probabilistic / judge component drifts up\n * while the deterministic component doesn't. The judge is being\n * gamed.\n *\n * This module ships explicit detectors for all four signatures, plus a\n * combined verdict. The output is diagnostic — actionable signals,\n * not autoreject — because each signature has known false positives\n * (e.g., a policy that genuinely improves can show distributional shift).\n *\n * Differs from `rubricPredictiveValidity` (which is a *standing* check on\n * whether rubrics correlate with deployment outcomes) — this is a\n * *temporal* check on whether the reward-vs-truth gap is *widening over\n * time during a training run*.\n */\n\nimport type { RunRecord } from '../run-record'\nimport {\n filterDeterministicallyRewarded,\n type VerifiableRewardExtractionOptions,\n} from './verifiable-reward'\n\nexport type RewardHackingSignal =\n | 'reward_divergence'\n | 'distribution_shift'\n | 'reward_disagreement'\n | 'judge_drift'\n\nexport interface RewardHackingFinding {\n signal: RewardHackingSignal\n /** Severity in [0, 1]. >0.5 = strong signal. */\n severity: number\n message: string\n /** Numeric evidence the consumer can render. */\n detail: Record<string, number>\n}\n\nexport interface RewardHackingReport {\n findings: RewardHackingFinding[]\n /**\n * Composite verdict. 
`'clean'` if every signal severity < 0.3;\n * `'suspect'` if at least one ≥ 0.3 but none ≥ 0.6; `'gaming'` if any ≥ 0.6.\n */\n verdict: 'clean' | 'suspect' | 'gaming'\n /** Rationale for the verdict, ready to paste into an audit log. */\n rationale: string[]\n /** Number of paired (proxy, truth) data points the report saw. */\n n: number\n}\n\nexport interface DetectRewardHackingInput {\n /**\n * Run records ordered by recency (oldest first). The detector segments\n * them into prefix/suffix windows to compute \"did the gap widen.\"\n */\n runs: RunRecord[]\n /**\n * The metric the policy was trained to optimize. Should be present on\n * `outcome.raw` or `outcome.holdoutScore`. Default reads `outcome.holdoutScore`.\n */\n proxyOf?: (run: RunRecord) => number | null\n /**\n * The held-out ground-truth metric. For RL on coding, this is typically\n * test pass-rate. For RLHF, it's downstream task performance or human\n * preference. For knowledge tasks, it's an independently-graded score.\n */\n truthOf?: (run: RunRecord) => number | null\n /**\n * Independent secondary reward. Used for the `reward_disagreement`\n * signal. Default uses the verifiable reward extractor (deterministic\n * sources only).\n */\n secondaryRewardOf?: (run: RunRecord) => number | null\n /**\n * Window size — how many of the most recent runs count as the \"after\"\n * cohort. Default min(50, half the runs).\n */\n windowSize?: number\n /**\n * Severity threshold to flag a signal. Default 0.3 (suspect) and 0.6\n * (gaming).\n */\n thresholds?: { suspect?: number; gaming?: number }\n /**\n * Verifiable-reward options used for the secondary-reward fallback.\n */\n verifiableRewardOptions?: VerifiableRewardExtractionOptions\n}\n\nconst DEFAULT_PROXY = (r: RunRecord): number | null => {\n const v = r.outcome.holdoutScore ?? r.outcome.searchScore\n return typeof v === 'number' && Number.isFinite(v) ? 
v : null\n}\n\nexport function detectRewardHacking(input: DetectRewardHackingInput): RewardHackingReport {\n const proxyOf = input.proxyOf ?? DEFAULT_PROXY\n const truthOf = input.truthOf\n const sus = input.thresholds?.suspect ?? 0.3\n const gam = input.thresholds?.gaming ?? 0.6\n\n const runs = input.runs.filter((r) => proxyOf(r) !== null)\n const n = runs.length\n if (n < 4) {\n return {\n findings: [],\n verdict: 'clean',\n n,\n rationale: [`fewer than 4 runs with proxy reward (n=${n}); insufficient evidence`],\n }\n }\n const windowSize = Math.max(1, input.windowSize ?? Math.min(50, Math.floor(n / 2)))\n const before = runs.slice(0, n - windowSize)\n const after = runs.slice(n - windowSize)\n\n const findings: RewardHackingFinding[] = []\n\n // ── Signal 1: reward divergence (proxy ↑ while truth flat or ↓) ──────\n if (truthOf) {\n const beforeProxy = before.map(proxyOf).filter((v): v is number => typeof v === 'number')\n const afterProxy = after.map(proxyOf).filter((v): v is number => typeof v === 'number')\n const beforeTruth = before.map(truthOf).filter((v): v is number => typeof v === 'number')\n const afterTruth = after.map(truthOf).filter((v): v is number => typeof v === 'number')\n if (\n beforeProxy.length >= 2 &&\n afterProxy.length >= 2 &&\n beforeTruth.length >= 2 &&\n afterTruth.length >= 2\n ) {\n const proxyDelta = mean(afterProxy) - mean(beforeProxy)\n const truthDelta = mean(afterTruth) - mean(beforeTruth)\n // Divergence: proxy goes up while truth goes flat or down.\n // Severity = max(0, (proxyDelta - truthDelta)) — bigger gap = bigger signal.\n const gap = Math.max(0, proxyDelta - truthDelta)\n const severity = clamp01(gap * 5) // scale: 0.2 absolute gap → severity 1.0\n findings.push({\n signal: 'reward_divergence',\n severity,\n message:\n severity >= sus\n ? 
`proxy reward rose by ${proxyDelta.toFixed(3)} while truth changed by ${truthDelta.toFixed(3)} — potential Goodhart`\n : `proxy and truth moved together (proxy ${proxyDelta.toFixed(3)}, truth ${truthDelta.toFixed(3)})`,\n detail: {\n proxyDelta,\n truthDelta,\n gap,\n beforeN: beforeProxy.length,\n afterN: afterProxy.length,\n },\n })\n }\n }\n\n // ── Signal 2: distributional shift in outputs (KS on score distributions) ──\n {\n const beforeP = before.map(proxyOf).filter((v): v is number => typeof v === 'number')\n const afterP = after.map(proxyOf).filter((v): v is number => typeof v === 'number')\n if (beforeP.length >= 4 && afterP.length >= 4) {\n const ks = ksStatistic(beforeP, afterP)\n // KS statistic: bigger = more shift. We're agnostic about direction;\n // genuine improvement ALSO produces shift, so this signal is\n // contributory rather than load-bearing.\n const severity = clamp01(ks - 0.2)\n findings.push({\n signal: 'distribution_shift',\n severity,\n message:\n severity >= sus\n ? `KS=${ks.toFixed(3)} between before/after windows — distributional shift large`\n : `KS=${ks.toFixed(3)} between before/after windows — within-distribution drift`,\n detail: { ks, beforeN: beforeP.length, afterN: afterP.length },\n })\n }\n }\n\n // ── Signal 3: reward disagreement (proxy vs independent secondary) ────\n {\n const secondaryOf = input.secondaryRewardOf ?? 
defaultSecondary(input.verifiableRewardOptions)\n const aligned = runs\n .map((r) => ({ p: proxyOf(r), s: secondaryOf(r) }))\n .filter(\n (x): x is { p: number; s: number } => typeof x.p === 'number' && typeof x.s === 'number',\n )\n if (aligned.length >= 4) {\n const ps = aligned.map((x) => x.p)\n const ss = aligned.map((x) => x.s)\n const r = pearsonR(ps, ss)\n // Disagreement: low or negative correlation between primary proxy\n // reward and an independent secondary signal.\n const severity = clamp01(0.5 - Math.max(0, r))\n findings.push({\n signal: 'reward_disagreement',\n severity,\n message:\n severity >= sus\n ? `proxy and independent secondary reward correlate ρ=${r.toFixed(3)} — possibly hacking proxy`\n : `proxy and secondary reward correlate ρ=${r.toFixed(3)}`,\n detail: { pearson: r, n: aligned.length },\n })\n }\n }\n\n // ── Signal 4: judge drift (probabilistic up while deterministic flat) ─\n {\n const detRuns = filterDeterministicallyRewarded(runs, input.verifiableRewardOptions ?? {})\n if (detRuns.length >= 4) {\n const detBefore = detRuns.slice(0, Math.floor(detRuns.length / 2))\n const detAfter = detRuns.slice(Math.floor(detRuns.length / 2))\n const detDelta =\n mean(detAfter.map((r) => r.reward.value)) - mean(detBefore.map((r) => r.reward.value))\n const proxyDelta =\n mean(after.map(proxyOf).filter((v): v is number => typeof v === 'number')) -\n mean(before.map(proxyOf).filter((v): v is number => typeof v === 'number'))\n const driftGap = Math.max(0, proxyDelta - detDelta)\n const severity = clamp01(driftGap * 5)\n findings.push({\n signal: 'judge_drift',\n severity,\n message:\n severity >= sus\n ? 
`judge proxy +${proxyDelta.toFixed(3)} while deterministic reward +${detDelta.toFixed(3)} — judge drifting up without verifiable backing`\n : `judge and deterministic rewards move in step (judge ${proxyDelta.toFixed(3)}, det ${detDelta.toFixed(3)})`,\n detail: { proxyDelta, detDelta, driftGap, n: detRuns.length },\n })\n }\n }\n\n const maxSev = findings.reduce((m, f) => Math.max(m, f.severity), 0)\n const verdict: RewardHackingReport['verdict'] =\n maxSev >= gam ? 'gaming' : maxSev >= sus ? 'suspect' : 'clean'\n const rationale = findings\n .filter((f) => f.severity >= sus)\n .map((f) => `${f.signal}: severity ${f.severity.toFixed(2)} — ${f.message}`)\n if (rationale.length === 0) rationale.push('no signals fired above suspect threshold')\n\n return { findings, verdict, rationale, n }\n}\n\n// ── Helpers ──────────────────────────────────────────────────────────────\n\nfunction mean(xs: number[]): number {\n if (xs.length === 0) return 0\n return xs.reduce((s, x) => s + x, 0) / xs.length\n}\n\nfunction clamp01(x: number): number {\n if (!Number.isFinite(x)) return 0\n return Math.max(0, Math.min(1, x))\n}\n\nfunction pearsonR(a: number[], b: number[]): number {\n if (a.length !== b.length || a.length < 2) return 0\n const ma = mean(a)\n const mb = mean(b)\n let num = 0,\n da = 0,\n db = 0\n for (let i = 0; i < a.length; i++) {\n const xa = a[i]! - ma\n const xb = b[i]! 
- mb\n num += xa * xb\n da += xa * xa\n db += xb * xb\n }\n if (da === 0 || db === 0) return 0\n return num / Math.sqrt(da * db)\n}\n\nfunction ksStatistic(a: number[], b: number[]): number {\n // Two-sample Kolmogorov-Smirnov statistic.\n const sortedA = [...a].sort((x, y) => x - y)\n const sortedB = [...b].sort((x, y) => x - y)\n const all = [...new Set([...sortedA, ...sortedB])].sort((x, y) => x - y)\n let max = 0\n for (const v of all) {\n const fa = sortedA.filter((x) => x <= v).length / sortedA.length\n const fb = sortedB.filter((x) => x <= v).length / sortedB.length\n max = Math.max(max, Math.abs(fa - fb))\n }\n return max\n}\n\nfunction defaultSecondary(\n verifiableOpts?: VerifiableRewardExtractionOptions,\n): (run: RunRecord) => number | null {\n return (run: RunRecord) => {\n const filtered = filterDeterministicallyRewarded([run], verifiableOpts ?? {})\n return filtered.length === 1 ? filtered[0]!.reward.value : null\n }\n}\n"],"mappings":";AA8FA,IAAM,+BAA+B,oBAAI,IAAI;AAAA,EAC3C;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AACF,CAAC;AAED,IAAM,qBAAqB,CAAC,SAAyC;AACnE,QAAM,QAAQ,KAAK,YAAY;AAC/B,MAAI,MAAM,SAAS,MAAM,EAAG,QAAO;AACnC,MACE,MAAM,SAAS,SAAS,KACxB,MAAM,SAAS,OAAO,KACtB,MAAM,SAAS,WAAW,KAC1B,MAAM,SAAS,MAAM;AAErB,WAAO;AACT,MAAI,MAAM,SAAS,QAAQ,EAAG,QAAO;AACrC,MAAI,MAAM,SAAS,SAAS,EAAG,QAAO;AACtC,MAAI,MAAM,SAAS,OAAO,KAAK,MAAM,SAAS,UAAU,EAAG,QAAO;AAClE,SAAO;AACT;AAUO,SAAS,wBACd,QACA,OAA0C,CAAC,GAClB;AACzB,QAAM,mBAAmB,IAAI,IAAI,KAAK,uBAAuB,CAAC,GAAG,4BAA4B,CAAC;AAC9F,QAAM,YAAY,KAAK,aAAa;AACpC,QAAM,kBAAkB,KAAK,mBAAmB;AAChD,QAAM,aAAa,KAAK,wBAAwB;AAEhD,QAAM,gBAAgB,OAAO,OAAO;AAAA,IAClC,CAAC,MAAM,iBAAiB,IAAI,EAAE,KAAK,KAAK,OAAO,EAAE,UAAU,YAAY,OAAO,SAAS,EAAE,KAAK;AAAA,EAChG;AAEA,MAAI,cAAc,WAAW,GAAG;AAC9B,UAAM,QAAQ,cAAc,CAAC;AAC7B,WAAO;AAAA,MACL,OAAO,QAAQ,MAAM,KAAM;AAAA,MAC3B,QAAQ,UAAU,MAAM,KAAK;AAAA,MAC7B,aAAa;AAAA,MACb,YAAY;AAAA,MACZ,QAAQ,MAAM;AAAA,MACd,WAAW,eAAe,KAAK;AAAA,IACjC;AAAA,EACF;AAEA,MAAI,cAAc,SAAS,GAAG;AAE5B
,QAAI,MAAM;AACV,QAAI,QAAQ;AACZ,UAAM,YAAoC,CAAC;AAC3C,eAAW,KAAK,eAAe;AAC7B,YAAM,IAAK,EAAE,QAAQ,UAAiC;AACtD,aAAO,KAAK,EAAE,SAAS;AACvB,eAAS;AACT,gBAAU,EAAE,KAAK,IAAI,EAAE;AAAA,IACzB;AACA,WAAO;AAAA,MACL,OAAO,UAAU,IAAI,IAAI,QAAQ,MAAM,KAAK;AAAA,MAC5C,QAAQ;AAAA,MACR,aAAa;AAAA,MACb,YAAY;AAAA,MACZ,QAAQ,cAAc,IAAI,CAAC,MAAM,EAAE,KAAK,EAAE,KAAK,GAAG;AAAA,MAClD;AAAA,IACF;AAAA,EACF;AAEA,MAAI,CAAC,gBAAiB,QAAO;AAE7B,QAAM,QACJ,OAAO,OAAO;AAAA,IACZ,CAAC,MACC,OAAO,EAAE,UAAU,YAAY,OAAO,SAAS,EAAE,KAAK,KAAK,UAAU,EAAE,KAAK,MAAM;AAAA,EACtF,KAAK,OAAO,OAAO,KAAK,CAAC,MAAM,OAAO,EAAE,UAAU,YAAY,OAAO,SAAS,EAAE,KAAK,CAAC;AAExF,MAAI,CAAC,MAAO,QAAO;AAEnB,QAAM,iBAAiB,MAAM,QAAQ;AACrC,SAAO;AAAA,IACL,OAAO,QAAQ,MAAM,KAAM;AAAA,IAC3B,QAAQ;AAAA,IACR,aAAa;AAAA,IACb,YAAY,OAAO,mBAAmB,WAAW,iBAAiB;AAAA,IAClE,QAAQ,MAAM;AAAA,IACd,WAAW,eAAe,KAAK;AAAA,EACjC;AACF;AAcO,SAAS,oCACd,MACA,OAA0C,CAAC,GACgB;AAC3D,QAAM,YAAY,KAAK,aAAa;AACpC,QAAM,mBAAmB,IAAI,IAAI,KAAK,uBAAuB,CAAC,GAAG,4BAA4B,CAAC;AAC9F,QAAM,kBAAkB,KAAK,mBAAmB;AAChD,QAAM,aAAa,KAAK,wBAAwB;AAEhD,SAAO,KAAK,IAAI,CAAC,QAAQ;AAEvB,UAAM,cAAsD,CAAC;AAC7D,eAAW,CAAC,GAAG,CAAC,KAAK,OAAO,QAAQ,IAAI,QAAQ,GAAG,GAAG;AACpD,UACE,EAAE,WAAW,QAAQ,KACrB,CAAC,EAAE,SAAS,KAAK,CAAC,KAClB,OAAO,MAAM,YACb,OAAO,SAAS,CAAC,GACjB;AACA,oBAAY,KAAK,EAAE,MAAM,EAAE,MAAM,SAAS,MAAM,GAAG,OAAO,EAAE,CAAC;AAAA,MAC/D;AAAA,IACF;AACA,UAAM,MAAM,YAAY,OAAO,CAAC,MAAM,iBAAiB,IAAI,EAAE,IAAI,CAAC;AAElE,QAAI,IAAI,WAAW,GAAG;AACpB,YAAM,QAAQ,IAAI,CAAC;AACnB,aAAO;AAAA,QACL,OAAO,IAAI;AAAA,QACX,QAAQ;AAAA,UACN,OAAO,QAAQ,MAAM,KAAK;AAAA,UAC1B,QAAQ,UAAU,MAAM,IAAI;AAAA,UAC5B,aAAa;AAAA,UACb,YAAY;AAAA,UACZ,QAAQ,MAAM;AAAA,QAChB;AAAA,MACF;AAAA,IACF;AACA,QAAI,IAAI,SAAS,GAAG;AAClB,YAAM,QAAQ,IAAI,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,OAAO,CAAC,IAAI,IAAI;AACzD,YAAM,YAAoC,OAAO;AAAA,QAC/C,IAAI,IAAI,CAAC,MAAM,CAAC,EAAE,MAAM,EAAE,KAAK,CAAC;AAAA,MAClC;AACA,aAAO;AAAA,QACL,OAAO,IAAI;AAAA,QACX,QAAQ;AAAA,UACN,OAAO,QAAQ,KAAK;AAAA,UACpB,QAAQ;AAAA,UACR,aAAa;AAAA,UACb,YAAY;AAAA,UACZ,QAAQ,IAAI,IAAI,CAAC,MAAM,EAAE,IAAI,EAAE,KAAK,GAAG;AAAA,UACvC;AAAA,QACF;AAAA
,MACF;AAAA,IACF;AACA,QAAI,CAAC,gBAAiB,QAAO,EAAE,OAAO,IAAI,OAAO,QAAQ,KAAK;AAG9D,UAAM,UAAU,IAAI,QAAQ,gBAAgB,IAAI,QAAQ;AACxD,QAAI,OAAO,YAAY,YAAY,CAAC,OAAO,SAAS,OAAO,GAAG;AAC5D,aAAO,EAAE,OAAO,IAAI,OAAO,QAAQ,KAAK;AAAA,IAC1C;AACA,WAAO;AAAA,MACL,OAAO,IAAI;AAAA,MACX,QAAQ;AAAA,QACN,OAAO,QAAQ,OAAO;AAAA,QACtB,QAAQ;AAAA,QACR,aAAa;AAAA,QACb,YAAY;AAAA,QACZ,QAAQ;AAAA,MACV;AAAA,IACF;AAAA,EACF,CAAC;AACH;AAGO,SAAS,gCACd,MACA,OAA0C,CAAC,GACU;AACrD,QAAM,WAAW,oCAAoC,MAAM,EAAE,GAAG,MAAM,iBAAiB,MAAM,CAAC;AAC9F,QAAM,MAA2D,CAAC;AAClE,WAAS,IAAI,GAAG,IAAI,KAAK,QAAQ,KAAK;AACpC,UAAM,IAAI,SAAS,CAAC;AACpB,QAAI,EAAE,UAAU,EAAE,OAAO,gBAAgB,iBAAiB;AACxD,UAAI,KAAK,EAAE,KAAK,KAAK,CAAC,GAAI,QAAQ,EAAE,OAAO,CAAC;AAAA,IAC9C;AAAA,EACF;AACA,SAAO;AACT;AAIA,SAAS,QAAQ,GAAmB;AAClC,MAAI,CAAC,OAAO,SAAS,CAAC,EAAG,QAAO;AAChC,SAAO,KAAK,IAAI,GAAG,KAAK,IAAI,GAAG,CAAC,CAAC;AACnC;AAEA,SAAS,eAAe,GAAwC;AAC9D,QAAM,MAA8B,CAAC;AACrC,MAAI,EAAE,aAAa;AACjB,eAAW,CAAC,GAAG,CAAC,KAAK,OAAO,QAAQ,EAAE,WAAW,GAAG;AAClD,UAAI,OAAO,MAAM,YAAY,OAAO,SAAS,CAAC,EAAG,KAAI,CAAC,IAAI;AAAA,IAC5D;AAAA,EACF;AACA,SAAO;AACT;;;AC3MA,IAAM,gBAAgB,CAAC,MAAgC;AACrD,QAAM,IAAI,EAAE,QAAQ,gBAAgB,EAAE,QAAQ;AAC9C,SAAO,OAAO,MAAM,YAAY,OAAO,SAAS,CAAC,IAAI,IAAI;AAC3D;AAEO,SAAS,oBAAoB,OAAsD;AACxF,QAAM,UAAU,MAAM,WAAW;AACjC,QAAM,UAAU,MAAM;AACtB,QAAM,MAAM,MAAM,YAAY,WAAW;AACzC,QAAM,MAAM,MAAM,YAAY,UAAU;AAExC,QAAM,OAAO,MAAM,KAAK,OAAO,CAAC,MAAM,QAAQ,CAAC,MAAM,IAAI;AACzD,QAAM,IAAI,KAAK;AACf,MAAI,IAAI,GAAG;AACT,WAAO;AAAA,MACL,UAAU,CAAC;AAAA,MACX,SAAS;AAAA,MACT;AAAA,MACA,WAAW,CAAC,0CAA0C,CAAC,0BAA0B;AAAA,IACnF;AAAA,EACF;AACA,QAAM,aAAa,KAAK,IAAI,GAAG,MAAM,cAAc,KAAK,IAAI,IAAI,KAAK,MAAM,IAAI,CAAC,CAAC,CAAC;AAClF,QAAM,SAAS,KAAK,MAAM,GAAG,IAAI,UAAU;AAC3C,QAAM,QAAQ,KAAK,MAAM,IAAI,UAAU;AAEvC,QAAM,WAAmC,CAAC;AAG1C,MAAI,SAAS;AACX,UAAM,cAAc,OAAO,IAAI,OAAO,EAAE,OAAO,CAAC,MAAmB,OAAO,MAAM,QAAQ;AACxF,UAAM,aAAa,MAAM,IAAI,OAAO,EAAE,OAAO,CAAC,MAAmB,OAAO,MAAM,QAAQ;AACtF,UAAM,cAAc,OAAO,IAAI,OAAO,EAAE,OAAO,CAAC,MAAmB,OAAO,MAAM,QAAQ;AACxF,UAAM,aAAa,MAAM,IAAI,OAAO,EAAE,OAAO,CAAC,MAAmB,OAAO,MAAM,QAAQ
;AACtF,QACE,YAAY,UAAU,KACtB,WAAW,UAAU,KACrB,YAAY,UAAU,KACtB,WAAW,UAAU,GACrB;AACA,YAAM,aAAa,KAAK,UAAU,IAAI,KAAK,WAAW;AACtD,YAAM,aAAa,KAAK,UAAU,IAAI,KAAK,WAAW;AAGtD,YAAM,MAAM,KAAK,IAAI,GAAG,aAAa,UAAU;AAC/C,YAAM,WAAWA,SAAQ,MAAM,CAAC;AAChC,eAAS,KAAK;AAAA,QACZ,QAAQ;AAAA,QACR;AAAA,QACA,SACE,YAAY,MACR,wBAAwB,WAAW,QAAQ,CAAC,CAAC,2BAA2B,WAAW,QAAQ,CAAC,CAAC,+BAC7F,yCAAyC,WAAW,QAAQ,CAAC,CAAC,WAAW,WAAW,QAAQ,CAAC,CAAC;AAAA,QACpG,QAAQ;AAAA,UACN;AAAA,UACA;AAAA,UACA;AAAA,UACA,SAAS,YAAY;AAAA,UACrB,QAAQ,WAAW;AAAA,QACrB;AAAA,MACF,CAAC;AAAA,IACH;AAAA,EACF;AAGA;AACE,UAAM,UAAU,OAAO,IAAI,OAAO,EAAE,OAAO,CAAC,MAAmB,OAAO,MAAM,QAAQ;AACpF,UAAM,SAAS,MAAM,IAAI,OAAO,EAAE,OAAO,CAAC,MAAmB,OAAO,MAAM,QAAQ;AAClF,QAAI,QAAQ,UAAU,KAAK,OAAO,UAAU,GAAG;AAC7C,YAAM,KAAK,YAAY,SAAS,MAAM;AAItC,YAAM,WAAWA,SAAQ,KAAK,GAAG;AACjC,eAAS,KAAK;AAAA,QACZ,QAAQ;AAAA,QACR;AAAA,QACA,SACE,YAAY,MACR,MAAM,GAAG,QAAQ,CAAC,CAAC,oEACnB,MAAM,GAAG,QAAQ,CAAC,CAAC;AAAA,QACzB,QAAQ,EAAE,IAAI,SAAS,QAAQ,QAAQ,QAAQ,OAAO,OAAO;AAAA,MAC/D,CAAC;AAAA,IACH;AAAA,EACF;AAGA;AACE,UAAM,cAAc,MAAM,qBAAqB,iBAAiB,MAAM,uBAAuB;AAC7F,UAAM,UAAU,KACb,IAAI,CAAC,OAAO,EAAE,GAAG,QAAQ,CAAC,GAAG,GAAG,YAAY,CAAC,EAAE,EAAE,EACjD;AAAA,MACC,CAAC,MAAqC,OAAO,EAAE,MAAM,YAAY,OAAO,EAAE,MAAM;AAAA,IAClF;AACF,QAAI,QAAQ,UAAU,GAAG;AACvB,YAAM,KAAK,QAAQ,IAAI,CAAC,MAAM,EAAE,CAAC;AACjC,YAAM,KAAK,QAAQ,IAAI,CAAC,MAAM,EAAE,CAAC;AACjC,YAAM,IAAI,SAAS,IAAI,EAAE;AAGzB,YAAM,WAAWA,SAAQ,MAAM,KAAK,IAAI,GAAG,CAAC,CAAC;AAC7C,eAAS,KAAK;AAAA,QACZ,QAAQ;AAAA,QACR;AAAA,QACA,SACE,YAAY,MACR,2DAAsD,EAAE,QAAQ,CAAC,CAAC,mCAClE,+CAA0C,EAAE,QAAQ,CAAC,CAAC;AAAA,QAC5D,QAAQ,EAAE,SAAS,GAAG,GAAG,QAAQ,OAAO;AAAA,MAC1C,CAAC;AAAA,IACH;AAAA,EACF;AAGA;AACE,UAAM,UAAU,gCAAgC,MAAM,MAAM,2BAA2B,CAAC,CAAC;AACzF,QAAI,QAAQ,UAAU,GAAG;AACvB,YAAM,YAAY,QAAQ,MAAM,GAAG,KAAK,MAAM,QAAQ,SAAS,CAAC,CAAC;AACjE,YAAM,WAAW,QAAQ,MAAM,KAAK,MAAM,QAAQ,SAAS,CAAC,CAAC;AAC7D,YAAM,WACJ,KAAK,SAAS,IAAI,CAAC,MAAM,EAAE,OAAO,KAAK,CAAC,IAAI,KAAK,UAAU,IAAI,CAAC,MAAM,EAAE,OAAO,KAAK,CAAC;AACvF,YAAM,aACJ,KAAK,MAAM,IAAI,OAAO,EAAE,OAAO,CAAC,MAAmB,OAAO,MAAM,QAAQ,CA
AC,IACzE,KAAK,OAAO,IAAI,OAAO,EAAE,OAAO,CAAC,MAAmB,OAAO,MAAM,QAAQ,CAAC;AAC5E,YAAM,WAAW,KAAK,IAAI,GAAG,aAAa,QAAQ;AAClD,YAAM,WAAWA,SAAQ,WAAW,CAAC;AACrC,eAAS,KAAK;AAAA,QACZ,QAAQ;AAAA,QACR;AAAA,QACA,SACE,YAAY,MACR,gBAAgB,WAAW,QAAQ,CAAC,CAAC,gCAAgC,SAAS,QAAQ,CAAC,CAAC,yDACxF,uDAAuD,WAAW,QAAQ,CAAC,CAAC,SAAS,SAAS,QAAQ,CAAC,CAAC;AAAA,QAC9G,QAAQ,EAAE,YAAY,UAAU,UAAU,GAAG,QAAQ,OAAO;AAAA,MAC9D,CAAC;AAAA,IACH;AAAA,EACF;AAEA,QAAM,SAAS,SAAS,OAAO,CAAC,GAAG,MAAM,KAAK,IAAI,GAAG,EAAE,QAAQ,GAAG,CAAC;AACnE,QAAM,UACJ,UAAU,MAAM,WAAW,UAAU,MAAM,YAAY;AACzD,QAAM,YAAY,SACf,OAAO,CAAC,MAAM,EAAE,YAAY,GAAG,EAC/B,IAAI,CAAC,MAAM,GAAG,EAAE,MAAM,cAAc,EAAE,SAAS,QAAQ,CAAC,CAAC,WAAM,EAAE,OAAO,EAAE;AAC7E,MAAI,UAAU,WAAW,EAAG,WAAU,KAAK,0CAA0C;AAErF,SAAO,EAAE,UAAU,SAAS,WAAW,EAAE;AAC3C;AAIA,SAAS,KAAK,IAAsB;AAClC,MAAI,GAAG,WAAW,EAAG,QAAO;AAC5B,SAAO,GAAG,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,GAAG;AAC5C;AAEA,SAASA,SAAQ,GAAmB;AAClC,MAAI,CAAC,OAAO,SAAS,CAAC,EAAG,QAAO;AAChC,SAAO,KAAK,IAAI,GAAG,KAAK,IAAI,GAAG,CAAC,CAAC;AACnC;AAEA,SAAS,SAAS,GAAa,GAAqB;AAClD,MAAI,EAAE,WAAW,EAAE,UAAU,EAAE,SAAS,EAAG,QAAO;AAClD,QAAM,KAAK,KAAK,CAAC;AACjB,QAAM,KAAK,KAAK,CAAC;AACjB,MAAI,MAAM,GACR,KAAK,GACL,KAAK;AACP,WAAS,IAAI,GAAG,IAAI,EAAE,QAAQ,KAAK;AACjC,UAAM,KAAK,EAAE,CAAC,IAAK;AACnB,UAAM,KAAK,EAAE,CAAC,IAAK;AACnB,WAAO,KAAK;AACZ,UAAM,KAAK;AACX,UAAM,KAAK;AAAA,EACb;AACA,MAAI,OAAO,KAAK,OAAO,EAAG,QAAO;AACjC,SAAO,MAAM,KAAK,KAAK,KAAK,EAAE;AAChC;AAEA,SAAS,YAAY,GAAa,GAAqB;AAErD,QAAM,UAAU,CAAC,GAAG,CAAC,EAAE,KAAK,CAAC,GAAG,MAAM,IAAI,CAAC;AAC3C,QAAM,UAAU,CAAC,GAAG,CAAC,EAAE,KAAK,CAAC,GAAG,MAAM,IAAI,CAAC;AAC3C,QAAM,MAAM,CAAC,GAAG,oBAAI,IAAI,CAAC,GAAG,SAAS,GAAG,OAAO,CAAC,CAAC,EAAE,KAAK,CAAC,GAAG,MAAM,IAAI,CAAC;AACvE,MAAI,MAAM;AACV,aAAW,KAAK,KAAK;AACnB,UAAM,KAAK,QAAQ,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,SAAS,QAAQ;AAC1D,UAAM,KAAK,QAAQ,OAAO,CAAC,MAAM,KAAK,CAAC,EAAE,SAAS,QAAQ;AAC1D,UAAM,KAAK,IAAI,KAAK,KAAK,IAAI,KAAK,EAAE,CAAC;AAAA,EACvC;AACA,SAAO;AACT;AAEA,SAAS,iBACP,gBACmC;AACnC,SAAO,CAAC,QAAmB;AACzB,UAAM,WAAW,gCAAgC,CAAC,GAAG,GAAG,kBAAkB,CAAC,CAAC;AAC5E,WAA
O,SAAS,WAAW,IAAI,SAAS,CAAC,EAAG,OAAO,QAAQ;AAAA,EAC7D;AACF;","names":["clamp01"]}
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { b as FeedbackLabel, p as ProposedSideEffect } from './feedback-trajectory-
|
|
1
|
+
import { b as FeedbackLabel, p as ProposedSideEffect } from './feedback-trajectory-Dvy-bt7x.js';
|
|
2
2
|
import { C as ControlEvalResult, a as ControlRunResult, h as ControlRuntimeConfig } from './control-runtime-BZ_lVLYW.js';
|
|
3
3
|
import { T as TraceEmitter } from './emitter-DP_cSSiw.js';
|
|
4
4
|
import { T as TraceStore, F as FailureClass } from './store-Db2Bv8Cf.js';
|
package/dist/control.d.ts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, p as RunEvidenceMetadata, s as controlRunToRunRecord, u as evaluateActionPolicy, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-
|
|
1
|
+
export { A as ActionExecutionPolicy, a as ActionPolicyDecision, C as ControlRunToRunRecordOptions, e as ProposeReviewConfig, f as ProposeReviewControlAction, g as ProposeReviewControlConfig, h as ProposeReviewControlResult, i as ProposeReviewControlState, j as ProposeReviewReport, p as RunEvidenceMetadata, s as controlRunToRunRecord, u as evaluateActionPolicy, x as runProposeReview, y as runProposeReviewAsControlLoop, z as scoreFromEvals } from './control-CmLJk3IG.js';
|
|
2
2
|
export { c as ControlActionFailureMode, d as ControlActionOutcome, e as ControlBudget, f as ControlContext, g as ControlDecision, C as ControlEvalResult, a as ControlRunResult, h as ControlRuntimeConfig, i as ControlRuntimeError, j as ControlSeverity, b as ControlStep, k as ControlStopPolicies, S as StopDecision, l as allCriticalPassed, o as objectiveEval, r as runAgentControlLoop, s as stopOnNoProgress, m as stopOnRepeatedAction, n as subjectiveEval } from './control-runtime-BZ_lVLYW.js';
|
|
3
|
-
import './feedback-trajectory-
|
|
4
|
-
import './dataset-
|
|
3
|
+
import './feedback-trajectory-Dvy-bt7x.js';
|
|
4
|
+
import './dataset-BlwAtYYf.js';
|
|
5
5
|
import './errors-mje_cKOs.js';
|
|
6
6
|
import './emitter-DP_cSSiw.js';
|
|
7
7
|
import './store-Db2Bv8Cf.js';
|
package/dist/control.js
CHANGED
|
@@ -4,7 +4,7 @@ import {
|
|
|
4
4
|
runProposeReview,
|
|
5
5
|
runProposeReviewAsControlLoop,
|
|
6
6
|
scoreFromEvals
|
|
7
|
-
} from "./chunk-
|
|
7
|
+
} from "./chunk-L7XMNXLO.js";
|
|
8
8
|
import {
|
|
9
9
|
allCriticalPassed,
|
|
10
10
|
objectiveEval,
|
|
@@ -13,7 +13,7 @@ import {
|
|
|
13
13
|
stopOnRepeatedAction,
|
|
14
14
|
subjectiveEval
|
|
15
15
|
} from "./chunk-NCRFYPS3.js";
|
|
16
|
-
import "./chunk-
|
|
16
|
+
import "./chunk-BWZEGTES.js";
|
|
17
17
|
import "./chunk-TVVP3ZZQ.js";
|
|
18
18
|
import "./chunk-VSMTAMNK.js";
|
|
19
19
|
import "./chunk-QYJT52YW.js";
|