npm - @tangle-network/agent-eval - Versions diffs - 0.21.0 → 0.22.0 - Mend

@tangle-network/agent-eval 0.21.0 → 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

package/CHANGELOG.md +102 -1
package/README.md +4 -0
package/dist/{chunk-WOK2RTWG.js → chunk-4W4NCYM2.js} +134 -109
package/dist/chunk-4W4NCYM2.js.map +1 -0
package/dist/{chunk-WOPGKVN4.js → chunk-6KQG5HAH.js} +2 -2
package/dist/chunk-6M774GY6.js +53 -0
package/dist/chunk-6M774GY6.js.map +1 -0
package/dist/{chunk-3IX6QTB7.js → chunk-IOXMGMHQ.js} +418 -541
package/dist/chunk-IOXMGMHQ.js.map +1 -0
package/dist/{chunk-3GN6U53I.js → chunk-KAO3Q65R.js} +2 -2
package/dist/chunk-QUKKGHTZ.js +121 -0
package/dist/chunk-QUKKGHTZ.js.map +1 -0
package/dist/{chunk-SNUHRBDL.js → chunk-SQQLHODJ.js} +10 -1
package/dist/{chunk-SNUHRBDL.js.map → chunk-SQQLHODJ.js.map} +1 -1
package/dist/chunk-UAND2LOT.js +738 -0
package/dist/chunk-UAND2LOT.js.map +1 -0
package/dist/{chunk-HRZELXCR.js → chunk-USHQBPMH.js} +283 -7
package/dist/chunk-USHQBPMH.js.map +1 -0
package/dist/cli.js +3 -3
package/dist/index.d.ts +10 -284
package/dist/index.js +39 -19
package/dist/index.js.map +1 -1
package/dist/integrity-K2oVlF57.d.ts +210 -0
package/dist/openapi.json +1 -1
package/dist/optimization-UVDNKaO6.d.ts +574 -0
package/dist/optimization.d.ts +6 -144
package/dist/optimization.js +9 -2
package/dist/reporting-B82RSv9C.d.ts +593 -0
package/dist/reporting.d.ts +2 -2
package/dist/reporting.js +15 -8
package/dist/{multi-shot-optimization-Bvtz294B.d.ts → summary-report-D4p7RlDu.d.ts} +381 -1
package/dist/traces.d.ts +101 -181
package/dist/traces.js +16 -5
package/dist/wire/index.js +3 -3
package/docs/research-report-methodology.md +19 -4
package/docs/wire-protocol.md +1 -1
package/package.json +2 -2
package/dist/chunk-3IX6QTB7.js.map +0 -1
package/dist/chunk-HRZELXCR.js.map +0 -1
package/dist/chunk-KRR4VMH7.js +0 -423
package/dist/chunk-KRR4VMH7.js.map +0 -1
package/dist/chunk-WOK2RTWG.js.map +0 -1
package/dist/reporting-Da2ihlcM.d.ts +0 -672
/package/dist/{chunk-WOPGKVN4.js.map → chunk-6KQG5HAH.js.map} +0 -0
/package/dist/{chunk-3GN6U53I.js.map → chunk-KAO3Q65R.js.map} +0 -0

package/dist/{chunk-3IX6QTB7.js → chunk-IOXMGMHQ.js} RENAMED Viewed

@@ -1,337 +1,409 @@
 import {
-  benjaminiHochberg,
-  cohensD,
-  confidenceInterval,
-  pairedBootstrap,
-  pairedMde,
-  wilcoxonSignedRank
-} from "./chunk-KRR4VMH7.js";
+  canonicalize,
+  hashJson
+} from "./chunk-6M774GY6.js";
-// src/release-confidence.ts
-var DEFAULT_THRESHOLDS = {
-  requireCorpus: true,
-  minScenarioCount: 1,
-  minSearchRuns: 1,
-  minHoldoutRuns: 1,
-  requireHoldout: true,
-  minPassRate: 0.8,
-  minMeanScore: 0.7,
-  maxOverfitGap: 0.15,
-  maxMeanCostUsd: Number.POSITIVE_INFINITY,
-  maxP95WallMs: Number.POSITIVE_INFINITY,
-  requireAsiForFailures: true,
-  failureScoreThreshold: 0.5
-};
-function releaseTraceEvidenceFromMultiShotTrials(trials) {
-  return trials.map((trial) => ({
-    scenarioId: trial.scenarioId,
-    candidateId: trial.variantId,
-    split: trial.split === "holdout" ? "holdout" : trial.split === "dev" ? "dev" : "search",
-    score: trial.score,
-    ok: trial.ok,
-    turnCount: Array.isArray(trial.trace?.turns) ? trial.trace.turns.length : void 0,
-    costUsd: trial.cost,
-    durationMs: trial.durationMs,
-    failureMode: trial.error ? "runtime_error" : void 0,
-    asi: trial.asi,
-    metadata: trial.metadata
-  }));
+// src/statistics.ts
+var INVERTED_DIMENSIONS = /* @__PURE__ */ new Set([
+  "hallucination",
+  "false_confidence",
+  "worst_failure"
+]);
+function normalizeScores(scores) {
+  return scores.map((s) => {
+    if (INVERTED_DIMENSIONS.has(s.dimension)) {
+      return s;
+    }
+    return s;
+  });
 }
-function evaluateReleaseConfidence(input) {
-  const thresholds = { ...DEFAULT_THRESHOLDS, ...input.thresholds };
-  const candidateId = input.candidateId ?? null;
-  const runs = filterCandidate(input.runs ?? [], candidateId, input.baselineId);
-  const traces = filterTraceCandidate(input.traces ?? [], candidateId, input.baselineId);
-  const scenarios = input.scenarios ?? [];
-  const scenarioCount = input.dataset?.scenarioCount ?? scenarios.length;
-  const splitCounts = input.dataset?.splitCounts ?? countScenarioSplits(scenarios);
-  const searchScores = scoresFor(runs, "search");
-  const holdoutScores = scoresFor(runs, "holdout");
-  const allScores = [...searchScores, ...holdoutScores];
-  const traceScores = traces.map((t) => t.score).filter(isFiniteNumber);
-  const scoreUniverse = allScores.length > 0 ? allScores : traceScores;
-  const searchRuns = runs.filter((r) => r.splitTag === "search").length;
-  const holdoutRuns = runs.filter((r) => r.splitTag === "holdout").length;
-  const searchMeanScore = mean(searchScores);
-  const holdoutMeanScore = mean(holdoutScores);
-  const metrics = {
-    scenarioCount,
-    searchRuns,
-    holdoutRuns,
-    passRate: passRate(runs, traces, thresholds.failureScoreThreshold),
-    meanScore: mean(scoreUniverse),
-    searchMeanScore,
-    holdoutMeanScore,
-    overfitGap: safeDiff(searchMeanScore, holdoutMeanScore),
-    meanCostUsd: mean([...runs.map((r) => r.costUsd), ...traces.map((t) => t.costUsd).filter(isFiniteNumber)]),
-    p95WallMs: percentile([...runs.map((r) => r.wallMs), ...traces.map((t) => t.durationMs).filter(isFiniteNumber)], 0.95),
-    failedRows: failedRows(runs, traces, thresholds.failureScoreThreshold).length,
-    failuresWithAsi: failedRows(runs, traces, thresholds.failureScoreThreshold).filter((row) => row.hasAsi).length,
-    singleShotTraces: traces.filter((t) => t.turnCount === 1).length,
-    multiShotTraces: traces.filter((t) => (t.turnCount ?? 0) > 1).length,
-    splitCounts,
-    domainCounts: countDomains(scenarios),
-    failureModeCounts: countFailureModes(runs, traces, thresholds.failureScoreThreshold),
-    responsibleSurfaceCounts: countResponsibleSurfaces(traces)
-  };
-  const issues = [];
-  checkCorpus(input, thresholds, metrics, issues);
-  checkQuality(thresholds, metrics, issues);
-  checkGeneralization(input.gateDecision ?? null, thresholds, metrics, issues);
-  checkDiagnostics(thresholds, metrics, issues);
-  checkEfficiency(thresholds, metrics, issues);
-  const axes = buildAxes(metrics, thresholds, input.gateDecision ?? null, issues);
-  const status = issues.some((i) => i.severity === "critical") ? "fail" : issues.length > 0 ? "warn" : "pass";
+function weightedMean(scores) {
+  if (scores.length === 0) return 0;
+  let totalWeight = 0;
+  let weightedSum = 0;
+  for (const { score, weight } of scores) {
+    const w = weight ?? 1;
+    weightedSum += score * w;
+    totalWeight += w;
+  }
+  return totalWeight > 0 ? weightedSum / totalWeight : 0;
+}
+function confidenceInterval(scores, confidence = 0.95) {
+  if (scores.length === 0) return { mean: 0, lower: 0, upper: 0 };
+  if (scores.length === 1) return { mean: scores[0], lower: scores[0], upper: scores[0] };
+  const n = scores.length;
+  const mean = scores.reduce((a, b) => a + b, 0) / n;
+  const B = 1e3;
+  const bootstrapMeans = [];
+  for (let i = 0; i < B; i++) {
+    let sum = 0;
+    for (let j = 0; j < n; j++) {
+      sum += scores[Math.floor(Math.random() * n)];
+    }
+    bootstrapMeans.push(sum / n);
+  }
+  bootstrapMeans.sort((a, b) => a - b);
+  const alpha = 1 - confidence;
+  const lowerIdx = Math.floor(alpha / 2 * B);
+  const upperIdx = Math.floor((1 - alpha / 2) * B) - 1;
   return {
-    target: input.target,
-    candidateId,
-    baselineId: input.baselineId ?? null,
-    status,
-    promote: status === "pass" && (input.gateDecision ? input.gateDecision.promote : true),
-    axes,
-    issues,
-    metrics,
-    dataset: input.dataset ?? null,
-    gateDecision: input.gateDecision ?? null,
-    summary: renderSummary(input.target, status, metrics, issues)
+    mean,
+    lower: bootstrapMeans[lowerIdx],
+    upper: bootstrapMeans[Math.min(upperIdx, B - 1)]
   };
 }
-function assertReleaseConfidence(input) {
-  const scorecard = evaluateReleaseConfidence(input);
-  if (scorecard.status === "fail") {
-    throw new Error(scorecard.summary);
-  }
-  return scorecard;
-}
-function filterCandidate(runs, candidateId, baselineId) {
-  if (candidateId) return runs.filter((r) => r.candidateId === candidateId);
-  if (baselineId) return runs.filter((r) => r.candidateId !== baselineId);
-  return [...runs];
-}
-function filterTraceCandidate(traces, candidateId, baselineId) {
-  if (candidateId) return traces.filter((t) => t.candidateId === void 0 || t.candidateId === candidateId);
-  if (baselineId) return traces.filter((t) => t.candidateId === void 0 || t.candidateId !== baselineId);
-  return [...traces];
-}
-function checkCorpus(input, thresholds, metrics, issues) {
-  if (thresholds.requireCorpus && !input.dataset && (input.scenarios?.length ?? 0) === 0) {
-    issues.push({ axis: "corpus", severity: "critical", code: "missing_corpus", detail: "No Dataset manifest or scenarios supplied." });
-  }
-  if (metrics.scenarioCount < thresholds.minScenarioCount) {
-    issues.push({ axis: "corpus", severity: "critical", code: "few_scenarios", detail: `${metrics.scenarioCount} scenario(s) < min ${thresholds.minScenarioCount}.` });
-  }
-  if (thresholds.requireHoldout && metrics.splitCounts.holdout === 0) {
-    issues.push({ axis: "corpus", severity: "critical", code: "missing_holdout_split", detail: "Corpus has no holdout scenarios." });
-  }
-}
-function checkQuality(thresholds, metrics, issues) {
-  if (metrics.searchRuns < thresholds.minSearchRuns) {
-    issues.push({ axis: "quality", severity: "critical", code: "few_search_runs", detail: `${metrics.searchRuns} search run(s) < min ${thresholds.minSearchRuns}.` });
-  }
-  if (metrics.passRate < thresholds.minPassRate) {
-    issues.push({ axis: "quality", severity: "critical", code: "low_pass_rate", detail: `passRate ${fmt(metrics.passRate)} < ${fmt(thresholds.minPassRate)}.` });
-  }
-  if (metrics.meanScore < thresholds.minMeanScore) {
-    issues.push({ axis: "quality", severity: "critical", code: "low_mean_score", detail: `meanScore ${fmt(metrics.meanScore)} < ${fmt(thresholds.minMeanScore)}.` });
-  }
-}
-function checkGeneralization(gateDecision, thresholds, metrics, issues) {
-  if (thresholds.requireHoldout && metrics.holdoutRuns < thresholds.minHoldoutRuns) {
-    issues.push({ axis: "generalization", severity: "critical", code: "few_holdout_runs", detail: `${metrics.holdoutRuns} holdout run(s) < min ${thresholds.minHoldoutRuns}.` });
-  }
-  if (Number.isFinite(metrics.overfitGap) && metrics.overfitGap > thresholds.maxOverfitGap) {
-    issues.push({ axis: "generalization", severity: "critical", code: "overfit_gap", detail: `search-holdout gap ${fmt(metrics.overfitGap)} > ${fmt(thresholds.maxOverfitGap)}.` });
-  }
-  if (gateDecision && !gateDecision.promote) {
-    issues.push({ axis: "generalization", severity: "critical", code: `gate_${gateDecision.rejectionCode ?? "reject"}`, detail: gateDecision.reason });
+function interRaterReliability(judgeScores) {
+  if (judgeScores.length < 2) return 1;
+  const dimensionMap = /* @__PURE__ */ new Map();
+  for (const judgeSet of judgeScores) {
+    for (const s of judgeSet) {
+      if (!dimensionMap.has(s.dimension)) dimensionMap.set(s.dimension, []);
+      const arr = dimensionMap.get(s.dimension);
+      if (arr.length === 0 || arr[arr.length - 1].length >= judgeScores.length) {
+        arr.push([s.score]);
+      } else {
+        arr[arr.length - 1].push(s.score);
+      }
+    }
   }
-}
-function checkDiagnostics(thresholds, metrics, issues) {
-  if (!thresholds.requireAsiForFailures) return;
-  if (metrics.failedRows > metrics.failuresWithAsi) {
-    issues.push({
-      axis: "diagnostics",
-      severity: "critical",
-      code: "missing_failure_asi",
-      detail: `${metrics.failedRows - metrics.failuresWithAsi} failed row(s) have no actionable side information.`
-    });
+  const allValues = [];
+  const pairDiffs = [];
+  for (const items of dimensionMap.values()) {
+    for (const ratings of items) {
+      if (ratings.length < 2) continue;
+      for (const v of ratings) allValues.push(v);
+      for (let i = 0; i < ratings.length; i++) {
+        for (let j = i + 1; j < ratings.length; j++) {
+          pairDiffs.push((ratings[i] - ratings[j]) ** 2);
+        }
+      }
+    }
   }
-}
-function checkEfficiency(thresholds, metrics, issues) {
-  if (metrics.meanCostUsd > thresholds.maxMeanCostUsd) {
-    issues.push({ axis: "efficiency", severity: "critical", code: "cost_budget", detail: `meanCostUsd ${fmt(metrics.meanCostUsd)} > ${fmt(thresholds.maxMeanCostUsd)}.` });
+  if (pairDiffs.length === 0 || allValues.length < 2) return 1;
+  const observedDisagreement = pairDiffs.reduce((a, b) => a + b, 0) / pairDiffs.length;
+  let expectedDisagreement = 0;
+  let expectedCount = 0;
+  for (let i = 0; i < allValues.length; i++) {
+    for (let j = i + 1; j < allValues.length; j++) {
+      expectedDisagreement += (allValues[i] - allValues[j]) ** 2;
+      expectedCount++;
+    }
   }
-  if (metrics.p95WallMs > thresholds.maxP95WallMs) {
-    issues.push({ axis: "efficiency", severity: "critical", code: "latency_budget", detail: `p95WallMs ${fmt(metrics.p95WallMs)} > ${fmt(thresholds.maxP95WallMs)}.` });
+  expectedDisagreement = expectedCount > 0 ? expectedDisagreement / expectedCount : 0;
+  if (expectedDisagreement === 0) return 1;
+  return 1 - observedDisagreement / expectedDisagreement;
+}
+function mannWhitneyU(a, b) {
+  if (a.length === 0 || b.length === 0) return { u: 0, p: 1 };
+  const n1 = a.length;
+  const n2 = b.length;
+  const combined = [
+    ...a.map((v) => ({ v, group: "a" })),
+    ...b.map((v) => ({ v, group: "b" }))
+  ].sort((x, y) => x.v - y.v);
+  const ranks = new Array(combined.length);
+  let i = 0;
+  while (i < combined.length) {
+    let j = i;
+    while (j < combined.length && combined[j].v === combined[i].v) j++;
+    const avgRank = (i + 1 + j) / 2;
+    for (let k = i; k < j; k++) ranks[k] = avgRank;
+    i = j;
+  }
+  let r1 = 0;
+  for (let k = 0; k < combined.length; k++) {
+    if (combined[k].group === "a") r1 += ranks[k];
+  }
+  const u1 = r1 - n1 * (n1 + 1) / 2;
+  const u2 = n1 * n2 - u1;
+  const u = Math.min(u1, u2);
+  const mu = n1 * n2 / 2;
+  const sigma = Math.sqrt(n1 * n2 * (n1 + n2 + 1) / 12);
+  if (sigma === 0) return { u, p: 1 };
+  const z = Math.abs(u - mu) / sigma;
+  const p = 2 * (1 - normalCdf(z));
+  return { u, p };
+}
+function partialCredit(current, target) {
+  if (target <= 0) return 1;
+  return Math.min(1, Math.max(0, current / target));
+}
+function pairedTTest(before, after) {
+  if (before.length !== after.length) {
+    throw new Error(`pairedTTest: unequal sample sizes (${before.length} vs ${after.length})`);
   }
-}
-function buildAxes(metrics, thresholds, gateDecision, issues) {
-  return [
-    axis("corpus", issues, bounded(metrics.scenarioCount / Math.max(1, thresholds.minScenarioCount)), `${metrics.scenarioCount} scenarios; holdout=${metrics.splitCounts.holdout}`),
-    axis("quality", issues, Math.min(metrics.passRate, metrics.meanScore), `passRate=${fmt(metrics.passRate)} meanScore=${fmt(metrics.meanScore)}`),
-    axis("generalization", issues, gateDecision && !gateDecision.promote ? 0 : gapScore(metrics.overfitGap, thresholds.maxOverfitGap), `holdoutRuns=${metrics.holdoutRuns} overfitGap=${fmt(metrics.overfitGap)}`),
-    axis("diagnostics", issues, metrics.failedRows === 0 ? 1 : metrics.failuresWithAsi / metrics.failedRows, `failuresWithAsi=${metrics.failuresWithAsi}/${metrics.failedRows}`),
-    axis("efficiency", issues, efficiencyScore(metrics, thresholds), `meanCostUsd=${fmt(metrics.meanCostUsd)} p95WallMs=${fmt(metrics.p95WallMs)}`)
+  const n = before.length;
+  if (n < 2) return { t: 0, df: 0, p: 1 };
+  const diffs = before.map((b, i) => after[i] - b);
+  const mean = diffs.reduce((a, b) => a + b, 0) / n;
+  const variance = diffs.reduce((acc, d) => acc + (d - mean) ** 2, 0) / (n - 1);
+  const se = Math.sqrt(variance / n);
+  if (se === 0) return { t: mean === 0 ? 0 : Infinity, df: n - 1, p: mean === 0 ? 1 : 0 };
+  const t = mean / se;
+  const df = n - 1;
+  const p = 2 * (1 - studentTCdf(Math.abs(t), df));
+  return { t, df, p };
+}
+function wilcoxonSignedRank(before, after) {
+  if (before.length !== after.length) {
+    throw new Error(`wilcoxonSignedRank: unequal sample sizes (${before.length} vs ${after.length})`);
+  }
+  const diffs = before.map((b, i2) => after[i2] - b).filter((d) => d !== 0);
+  const n = diffs.length;
+  if (n < 6) return { w: 0, p: 1 };
+  const absRanks = diffs.map((d, i2) => ({ abs: Math.abs(d), sign: Math.sign(d), i: i2 })).sort((a, b) => a.abs - b.abs);
+  const ranks = new Array(n);
+  let i = 0;
+  while (i < n) {
+    let j = i;
+    while (j < n && absRanks[j].abs === absRanks[i].abs) j++;
+    const avg2 = (i + 1 + j) / 2;
+    for (let k = i; k < j; k++) ranks[absRanks[k].i] = avg2;
+    i = j;
+  }
+  let wPlus = 0;
+  for (let k = 0; k < n; k++) if (diffs[k] > 0) wPlus += ranks[k];
+  const mean = n * (n + 1) / 4;
+  const variance = n * (n + 1) * (2 * n + 1) / 24;
+  const z = (wPlus - mean) / Math.sqrt(variance);
+  const p = 2 * (1 - normalCdf(Math.abs(z)));
+  return { w: wPlus, p };
+}
+function cohensD(a, b) {
+  if (a.length < 2 || b.length < 2) return 0;
+  const meanA = a.reduce((x, y) => x + y, 0) / a.length;
+  const meanB = b.reduce((x, y) => x + y, 0) / b.length;
+  const varA = a.reduce((acc, x) => acc + (x - meanA) ** 2, 0) / (a.length - 1);
+  const varB = b.reduce((acc, x) => acc + (x - meanB) ** 2, 0) / (b.length - 1);
+  const pooled = Math.sqrt(
+    ((a.length - 1) * varA + (b.length - 1) * varB) / (a.length + b.length - 2)
+  );
+  if (pooled === 0) return 0;
+  return (meanB - meanA) / pooled;
+}
+function studentTCdf(t, df) {
+  if (df <= 0) return 0.5;
+  if (df > 100) return normalCdf(t);
+  const x = df / (df + t * t);
+  const a = df / 2;
+  const b = 0.5;
+  const ib = incompleteBeta(x, a, b);
+  return t >= 0 ? 1 - 0.5 * ib : 0.5 * ib;
+}
+function incompleteBeta(x, a, b) {
+  if (x <= 0) return 0;
+  if (x >= 1) return 1;
+  const lnBeta = lnGamma(a) + lnGamma(b) - lnGamma(a + b);
+  const front = Math.exp(Math.log(x) * a + Math.log(1 - x) * b - lnBeta) / a;
+  const maxIter = 200;
+  const eps = 3e-7;
+  let c = 1;
+  let d = 1 - (a + b) * x / (a + 1);
+  if (Math.abs(d) < 1e-30) d = 1e-30;
+  d = 1 / d;
+  let f = d;
+  for (let m = 1; m <= maxIter; m++) {
+    const m2 = 2 * m;
+    let num = m * (b - m) * x / ((a + m2 - 1) * (a + m2));
+    d = 1 + num * d;
+    if (Math.abs(d) < 1e-30) d = 1e-30;
+    c = 1 + num / c;
+    if (Math.abs(c) < 1e-30) c = 1e-30;
+    d = 1 / d;
+    f *= d * c;
+    num = -((a + m) * (a + b + m) * x) / ((a + m2) * (a + m2 + 1));
+    d = 1 + num * d;
+    if (Math.abs(d) < 1e-30) d = 1e-30;
+    c = 1 + num / c;
+    if (Math.abs(c) < 1e-30) c = 1e-30;
+    d = 1 / d;
+    const delta = d * c;
+    f *= delta;
+    if (Math.abs(delta - 1) < eps) break;
+  }
+  return front * f;
+}
+function lnGamma(z) {
+  const g = 7;
+  const coefs = [
+    0.9999999999998099,
+    676.5203681218851,
+    -1259.1392167224028,
+    771.3234287776531,
+    -176.6150291621406,
+    12.507343278686905,
+    -0.13857109526572012,
+    9984369578019572e-21,
+    15056327351493116e-23
   ];
+  if (z < 0.5) {
+    return Math.log(Math.PI / Math.sin(Math.PI * z)) - lnGamma(1 - z);
+  }
+  z -= 1;
+  let x = coefs[0];
+  for (let i = 1; i < g + 2; i++) x += coefs[i] / (z + i);
+  const t = z + g + 0.5;
+  return 0.5 * Math.log(2 * Math.PI) + (z + 0.5) * Math.log(t) - t + Math.log(x);
+}
+function normalCdf(x) {
+  const a1 = 0.254829592;
+  const a2 = -0.284496736;
+  const a3 = 1.421413741;
+  const a4 = -1.453152027;
+  const a5 = 1.061405429;
+  const p = 0.3275911;
+  const sign = x < 0 ? -1 : 1;
+  const absX = Math.abs(x);
+  const t = 1 / (1 + p * absX);
+  const y = 1 - ((((a5 * t + a4) * t + a3) * t + a2) * t + a1) * t * Math.exp(-absX * absX / 2);
+  return 0.5 * (1 + sign * y);
 }
-function axis(name, issues, score, detail) {
-  const own = issues.filter((i) => i.axis === name);
-  const status = own.some((i) => i.severity === "critical") ? "fail" : own.length > 0 ? "warn" : "pass";
-  return { name, status, score: bounded(score), detail };
-}
-function countScenarioSplits(scenarios) {
-  const counts = { train: 0, dev: 0, test: 0, holdout: 0 };
-  for (const scenario of scenarios) counts[scenario.split ?? "train"]++;
-  return counts;
-}
-function countDomains(scenarios) {
-  const out = {};
-  for (const scenario of scenarios) {
-    const domain = scenario.tags?.domain ?? scenario.tags?.category ?? "uncategorized";
-    out[domain] = (out[domain] ?? 0) + 1;
-  }
-  return out;
+// src/power-analysis.ts
+function requiredSampleSize(opts) {
+  const effect = opts.effect;
+  if (!Number.isFinite(effect) || effect <= 0) return Infinity;
+  const alpha = opts.alpha ?? 0.05;
+  const power = opts.power ?? 0.8;
+  const twoSided = opts.twoSided ?? true;
+  const zAlpha = zQuantile(twoSided ? 1 - alpha / 2 : 1 - alpha);
+  const zBeta = zQuantile(power);
+  const n = 2 * Math.pow((zAlpha + zBeta) / effect, 2);
+  return Math.ceil(n);
+}
+function pairedMde(opts) {
+  if (!Number.isFinite(opts.nPaired) || opts.nPaired <= 0) return Infinity;
+  const alpha = opts.alpha ?? 0.05;
+  const power = opts.power ?? 0.8;
+  const twoSided = opts.twoSided ?? true;
+  const zAlpha = zQuantile(twoSided ? 1 - alpha / 2 : 1 - alpha);
+  const zBeta = zQuantile(power);
+  return (zAlpha + zBeta) / Math.sqrt(opts.nPaired);
+}
+function bonferroni(pValues, alpha = 0.05) {
+  const k = pValues.length;
+  const adjusted = pValues.map((p) => Math.min(1, p * k));
+  const significant = adjusted.map((p) => p < alpha);
+  return { adjusted, significant };
+}
+function benjaminiHochberg(pValues, fdr = 0.05) {
+  const n = pValues.length;
+  if (n === 0) return { qValues: [], significant: [] };
+  const indexed = pValues.map((p, i) => ({ p, i })).sort((a, b) => a.p - b.p);
+  const q = new Array(n);
+  let minRight = 1;
+  for (let k = n - 1; k >= 0; k--) {
+    const rank = k + 1;
+    const raw = indexed[k].p * n / rank;
+    const bounded = Math.min(minRight, raw);
+    minRight = bounded;
+    q[indexed[k].i] = Math.min(1, bounded);
+  }
+  const significant = q.map((v) => v < fdr);
+  return { qValues: q, significant };
+}
+function zQuantile(p) {
+  if (p <= 0 || p >= 1) {
+    if (p === 0) return -Infinity;
+    if (p === 1) return Infinity;
+    return NaN;
+  }
+  const a = [-39.69683028665376, 220.9460984245205, -275.9285104469687, 138.357751867269, -30.66479806614716, 2.506628277459239];
+  const b = [-54.47609879822406, 161.5858368580409, -155.6989798598866, 66.80131188771972, -13.28068155288572];
+  const c = [-0.007784894002430293, -0.3223964580411365, -2.400758277161838, -2.549732539343734, 4.374664141464968, 2.938163982698783];
+  const d = [0.007784695709041462, 0.3224671290700398, 2.445134137142996, 3.754408661907416];
+  const pLow = 0.02425;
+  const pHigh = 1 - pLow;
+  let q;
+  let r;
+  if (p < pLow) {
+    q = Math.sqrt(-2 * Math.log(p));
+    return (((((c[0] * q + c[1]) * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5]) / ((((d[0] * q + d[1]) * q + d[2]) * q + d[3]) * q + 1);
+  }
+  if (p <= pHigh) {
+    q = p - 0.5;
+    r = q * q;
+    return (((((a[0] * r + a[1]) * r + a[2]) * r + a[3]) * r + a[4]) * r + a[5]) * q / (((((b[0] * r + b[1]) * r + b[2]) * r + b[3]) * r + b[4]) * r + 1);
+  }
+  q = Math.sqrt(-2 * Math.log(1 - p));
+  return -(((((c[0] * q + c[1]) * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5]) / ((((d[0] * q + d[1]) * q + d[2]) * q + d[3]) * q + 1);
 }
-function countFailureModes(runs, traces, threshold) {
-  const out = {};
-  for (const run of runs) {
-    const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
-    if (run.failureMode || score !== void 0 && score < threshold) {
-      const mode = run.failureMode ?? "low_score";
-      out[mode] = (out[mode] ?? 0) + 1;
-    }
+// src/paired-stats.ts
+function pairedBootstrap(before, after, opts = {}) {
+  if (before.length !== after.length) {
+    throw new Error(
+      `pairedBootstrap: unequal sample sizes (${before.length} vs ${after.length})`
+    );
   }
-  for (const trace of traces) {
-    if (trace.failureMode || trace.ok === false || trace.score !== void 0 && trace.score < threshold) {
-      const mode = trace.failureMode ?? (trace.ok === false ? "not_ok" : "low_score");
-      out[mode] = (out[mode] ?? 0) + 1;
-    }
+  const confidence = opts.confidence ?? 0.95;
+  const resamples = opts.resamples ?? 2e3;
+  const statistic = opts.statistic ?? "median";
+  if (confidence <= 0 || confidence >= 1) {
+    throw new Error(`pairedBootstrap: confidence must be in (0,1), got ${confidence}`);
   }
-  return out;
-}
-function countResponsibleSurfaces(traces) {
-  const out = {};
-  for (const trace of traces) {
-    for (const asi of trace.asi ?? []) {
-      const surface = asi.responsibleSurface ?? "unknown";
-      out[surface] = (out[surface] ?? 0) + 1;
-    }
+  const n = before.length;
+  const deltas = before.map((b, i) => after[i] - b);
+  if (n === 0) {
+    return { n: 0, median: 0, mean: 0, low: 0, high: 0, confidence, resamples };
   }
-  return out;
-}
-function failedRows(runs, traces, threshold) {
-  const out = [];
-  for (const run of runs) {
-    const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
-    if (run.failureMode || score !== void 0 && score < threshold) {
-      const asiMetric = run.outcome.raw.asi;
-      out.push({ hasAsi: typeof asiMetric === "number" && asiMetric > 0 });
-    }
+  if (n === 1) {
+    const d = deltas[0];
+    return { n: 1, median: d, mean: d, low: d, high: d, confidence, resamples };
   }
-  for (const trace of traces) {
-    if (trace.failureMode || trace.ok === false || trace.score !== void 0 && trace.score < threshold) {
-      out.push({ hasAsi: (trace.asi?.length ?? 0) > 0 });
+  const rng = makeRng(opts.seed);
+  const samples = new Array(resamples);
+  for (let b = 0; b < resamples; b++) {
+    let acc = null;
+    if (statistic === "mean") {
+      let sum = 0;
+      for (let k = 0; k < n; k++) {
+        sum += deltas[Math.floor(rng() * n)];
+      }
+      samples[b] = sum / n;
+    } else {
+      acc = new Array(n);
+      for (let k = 0; k < n; k++) {
+        acc[k] = deltas[Math.floor(rng() * n)];
+      }
+      samples[b] = medianInPlace(acc);
     }
   }
-  return out;
-}
-function passRate(runs, traces, threshold) {
-  const outcomes = [
-    ...runs.map((run) => {
-      const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
-      return !run.failureMode && score !== void 0 && score >= threshold;
-    }),
-    ...traces.map((trace) => trace.ok !== false && (trace.score === void 0 || trace.score >= threshold))
-  ];
-  if (outcomes.length === 0) return 0;
-  return outcomes.filter(Boolean).length / outcomes.length;
-}
-function scoresFor(runs, split) {
-  return runs.filter((run) => run.splitTag === split).map((run) => split === "holdout" ? run.outcome.holdoutScore : run.outcome.searchScore).filter(isFiniteNumber);
-}
-function mean(xs) {
-  if (xs.length === 0) return Number.NaN;
-  return xs.reduce((sum, x) => sum + x, 0) / xs.length;
-}
-function percentile(xs, p) {
-  if (xs.length === 0) return Number.NaN;
-  const sorted = [...xs].sort((a, b) => a - b);
-  return sorted[Math.min(sorted.length - 1, Math.max(0, Math.ceil(p * sorted.length) - 1))];
-}
-function isFiniteNumber(value) {
-  return typeof value === "number" && Number.isFinite(value);
-}
-function safeDiff(a, b) {
-  if (!Number.isFinite(a) || !Number.isFinite(b)) return Number.NaN;
-  return a - b;
-}
-function gapScore(gap, maxGap) {
-  if (!Number.isFinite(gap)) return 0;
-  if (maxGap <= 0) return gap <= 0 ? 1 : 0;
-  return bounded(1 - Math.max(0, gap) / maxGap);
-}
-function efficiencyScore(metrics, thresholds) {
-  const cost = Number.isFinite(thresholds.maxMeanCostUsd) && Number.isFinite(metrics.meanCostUsd) ? bounded(thresholds.maxMeanCostUsd / Math.max(metrics.meanCostUsd, 1e-12)) : 1;
-  const latency = Number.isFinite(thresholds.maxP95WallMs) && Number.isFinite(metrics.p95WallMs) ? bounded(thresholds.maxP95WallMs / Math.max(metrics.p95WallMs, 1e-12)) : 1;
-  return Math.min(cost, latency);
-}
-function bounded(x) {
-  if (!Number.isFinite(x)) return 0;
-  return Math.max(0, Math.min(1, x));
-}
-function renderSummary(target, status, metrics, issues) {
-  const prefix = `release confidence ${status}: ${target}`;
-  const metricText = `scenarios=${metrics.scenarioCount} searchRuns=${metrics.searchRuns} holdoutRuns=${metrics.holdoutRuns} passRate=${fmt(metrics.passRate)} meanScore=${fmt(metrics.meanScore)}`;
-  if (issues.length === 0) return `${prefix}; ${metricText}`;
-  return `${prefix}; ${metricText}; issues=${issues.map((i) => i.code).join(",")}`;
-}
-function fmt(x) {
-  if (!Number.isFinite(x)) return String(x);
-  return x.toFixed(4);
-}
-// src/pre-registration.ts
-function canonicalize(v) {
-  if (v === null || typeof v !== "object") return v;
-  if (Array.isArray(v)) return v.map(canonicalize);
-  const keys = Object.keys(v).sort();
-  const out = {};
-  for (const k of keys) out[k] = canonicalize(v[k]);
-  return out;
+  samples.sort((a, b) => a - b);
+  const alpha = 1 - confidence;
+  const lowIdx = Math.floor(alpha / 2 * resamples);
+  const highIdx = Math.min(resamples - 1, Math.ceil((1 - alpha / 2) * resamples) - 1);
+  return {
+    n,
+    median: medianInPlace([...deltas]),
+    mean: deltas.reduce((s, x) => s + x, 0) / n,
+    low: samples[lowIdx],
+    high: samples[Math.max(highIdx, lowIdx)],
+    confidence,
+    resamples
+  };
 }
-async function hashJson(obj) {
-  const canonical = canonicalize(obj);
-  const bytes = new TextEncoder().encode(JSON.stringify(canonical));
-  const digest = await globalThis.crypto.subtle.digest("SHA-256", bytes);
-  return Array.from(new Uint8Array(digest)).map((b) => b.toString(16).padStart(2, "0")).join("");
+function pairedWilcoxon(before, after) {
+  return wilcoxonSignedRank(before, after);
 }
-async function signManifest(m) {
-  const hash = await hashJson(m);
-  return { ...m, contentHash: hash, algo: "sha256-content" };
+function bhAdjust(pValues, fdr = 0.05) {
+  return benjaminiHochberg(pValues, fdr);
 }
-async function verifyManifest(m) {
-  const { contentHash, algo: _algo, ...rest } = m;
-  void _algo;
-  const resigned = await signManifest(rest);
-  return resigned.contentHash === contentHash;
+function medianInPlace(xs) {
+  if (xs.length === 0) return 0;
+  xs.sort((a, b) => a - b);
+  const mid = Math.floor(xs.length / 2);
+  return xs.length % 2 === 0 ? (xs[mid - 1] + xs[mid]) / 2 : xs[mid];
 }
-async function evaluateHypothesis(manifest, observed) {
-  if (!await verifyManifest(manifest)) {
-    throw new Error("evaluateHypothesis: manifest content hash mismatch (tampered)");
-  }
-  const reasons = [];
-  const directionOk = manifest.direction === "increase" ? observed.effect > 0 : observed.effect < 0;
-  if (!directionOk) reasons.push("wrong_direction");
-  if (Math.abs(observed.effect) < manifest.minEffect) reasons.push("effect_too_small");
-  if (observed.pValue >= manifest.alpha) reasons.push("not_significant");
-  if (observed.n < manifest.preRegisteredN) reasons.push("undersampled");
-  return {
-    manifest,
-    observedN: observed.n,
-    observedEffect: observed.effect,
-    observedPValue: observed.pValue,
-    confirmed: reasons.length === 0,
-    rejectionReasons: reasons
+function makeRng(seed) {
+  if (seed === void 0) return Math.random;
+  let s = seed | 0 || 2654435769;
+  return () => {
+    s = s + 1831565813 | 0;
+    let t = s;
+    t = Math.imul(t ^ t >>> 15, t | 1);
+    t ^= t + Math.imul(t ^ t >>> 7, t | 61);
+    return ((t ^ t >>> 14) >>> 0) / 4294967296;
   };
 }
@@ -428,10 +500,10 @@ function renderSummaryTableMarkdown(rows, comparator, split) {
   lines.push("| Candidate | N | Mean | 95% CI | q (BH) | Cohen's d |");
   lines.push("|---|---:|---:|---|---:|---:|");
   for (const r of rows) {
-    const ci = `[${fmt2(r.ciLow)}, ${fmt2(r.ciHigh)}]`;
+    const ci = `[${fmt(r.ciLow)}, ${fmt(r.ciHigh)}]`;
     const q = Number.isFinite(r.qValue) ? r.qValue.toFixed(4) : "\u2014";
     const d = Number.isFinite(r.cohensD) ? r.cohensD.toFixed(3) : "\u2014";
-    lines.push(`| ${r.candidateId} | ${r.n} | ${fmt2(r.mean)} | ${ci} | ${q} | ${d} |`);
+    lines.push(`| ${r.candidateId} | ${r.n} | ${fmt(r.mean)} | ${ci} | ${q} | ${d} |`);
   }
   return lines.join("\n");
 }
@@ -595,10 +667,10 @@ function seedRng(seed) {
     return ((t ^ t >>> 14) >>> 0) / 4294967296;
   };
 }
-function stdev(xs, mean3) {
+function stdev(xs, mean) {
   if (xs.length < 2) return 0;
   let sse = 0;
-  for (const x of xs) sse += (x - mean3) ** 2;
+  for (const x of xs) sse += (x - mean) ** 2;
   return Math.sqrt(sse / (xs.length - 1));
 }
 async function researchReport(runs, opts = {}) {
@@ -780,7 +852,7 @@ function buildMethodology(ctx) {
   return { assumptions, methods, alternatives, whenNotToApply, citations };
 }
 function formatRope(rope) {
-  return `[${fmt2(rope.low)}, ${fmt2(rope.high)}]`;
+  return `[${fmt(rope.low)}, ${fmt(rope.high)}]`;
 }
 function classifyCandidate(row, ctx) {
   if (ctx.comparator && row.candidateId === ctx.comparator) {
@@ -805,30 +877,30 @@ function classifyCandidate(row, ctx) {
   if (ctx.rope && ci.low >= ctx.rope.low && ci.high <= ctx.rope.high) {
     return {
       decision: "equivalent",
-      reason: `Paired-delta CI [${fmt2(ci.low)}, ${fmt2(ci.high)}] is fully inside ROPE ${formatRope(ctx.rope)}; candidate is practically equivalent to comparator.`
+      reason: `Paired-delta CI [${fmt(ci.low)}, ${fmt(ci.high)}] is fully inside ROPE ${formatRope(ctx.rope)}; candidate is practically equivalent to comparator.`
     };
   }
   const significant = Number.isFinite(row.qValue) && row.qValue <= ctx.fdr;
   const gainPositive = ci.low > 0;
   const gainNegative = ci.high < 0;
   if (gainNegative) {
-    return { decision: "reject", reason: `Paired-delta CI [${fmt2(ci.low)}, ${fmt2(ci.high)}] lies entirely below zero.` };
+    return { decision: "reject", reason: `Paired-delta CI [${fmt(ci.low)}, ${fmt(ci.high)}] lies entirely below zero.` };
   }
   if (ctx.posterior.n < ctx.minPairs) {
     return {
       decision: "needs_more_data",
-      reason: `Only ${ctx.posterior.n} paired observations; minimum detectable effect at this N is ${fmt2(ctx.posterior.mde)} score units (need \u2265 ${ctx.minPairs} pairs to issue a directional verdict).`
+      reason: `Only ${ctx.posterior.n} paired observations; minimum detectable effect at this N is ${fmt(ctx.posterior.mde)} score units (need \u2265 ${ctx.minPairs} pairs to issue a directional verdict).`
     };
   }
   if (significant && gainPositive) {
     return {
       decision: "promote",
-      reason: `BH-adjusted q=${fmt2(row.qValue)} \u2264 ${ctx.fdr} and paired-delta CI [${fmt2(ci.low)}, ${fmt2(ci.high)}] excludes zero; Pr(\u0394>0)=${fmt2(ctx.posterior.prGreaterThanZero)}.`
+      reason: `BH-adjusted q=${fmt(row.qValue)} \u2264 ${ctx.fdr} and paired-delta CI [${fmt(ci.low)}, ${fmt(ci.high)}] excludes zero; Pr(\u0394>0)=${fmt(ctx.posterior.prGreaterThanZero)}.`
     };
   }
   return {
     decision: "hold",
-    reason: `Pr(\u0394>0)=${fmt2(ctx.posterior.prGreaterThanZero)} but CI [${fmt2(ci.low)}, ${fmt2(ci.high)}] crosses zero; effect not decisive at fdr=${ctx.fdr}.`
+    reason: `Pr(\u0394>0)=${fmt(ctx.posterior.prGreaterThanZero)} but CI [${fmt(ci.low)}, ${fmt(ci.high)}] crosses zero; effect not decisive at fdr=${ctx.fdr}.`
   };
 }
 function buildRecommendation(candidates, ctx) {
@@ -843,11 +915,11 @@ function buildRecommendation(candidates, ctx) {
   if (chosen) {
     rationale.push(`${chosen.candidateId}: ${chosen.decisionReason}`);
     if (chosen.gainCi) {
-      const probSummary = chosen.prGreaterThanZero !== null ? `, Pr(\u0394>0)=${fmt2(chosen.prGreaterThanZero)}` : "";
-      rationale.push(`Median paired gain CI: [${fmt2(chosen.gainCi.low)}, ${fmt2(chosen.gainCi.high)}]${probSummary}.`);
+      const probSummary = chosen.prGreaterThanZero !== null ? `, Pr(\u0394>0)=${fmt(chosen.prGreaterThanZero)}` : "";
+      rationale.push(`Median paired gain CI: [${fmt(chosen.gainCi.low)}, ${fmt(chosen.gainCi.high)}]${probSummary}.`);
     }
     if (chosen.mde !== null && Number.isFinite(chosen.mde)) {
-      rationale.push(`MDE at current paired N=${chosen.pairedN}: ${fmt2(chosen.mde)} score units.`);
+      rationale.push(`MDE at current paired N=${chosen.pairedN}: ${fmt(chosen.mde)} score units.`);
     }
   }
   if (!ctx.comparator) {
@@ -956,9 +1028,9 @@ function renderResearchMarkdown(report) {
     const prGt = c.prGreaterThanZero === null ? "-" : c.prGreaterThanZero.toFixed(3);
     const q = Number.isFinite(c.qValue) ? c.qValue.toFixed(4) : "-";
     const d = Number.isFinite(c.cohensD) ? c.cohensD.toFixed(3) : "-";
-    const gain = c.gainCi ? `[${fmt2(c.gainCi.low)}, ${fmt2(c.gainCi.high)}]` : "-";
-    const mde = c.mde === null || !Number.isFinite(c.mde) ? "-" : fmt2(c.mde);
-    lines.push(`| ${c.candidateId} | ${c.decision} | ${fmt2(c.mean)} | ${delta} | ${prGt} | ${q} | ${d} | ${c.pairedN} | ${gain} | ${mde} | ${c.onParetoFrontier ? "yes" : "no"} | ${c.gate ?? "-"} |`);
+    const gain = c.gainCi ? `[${fmt(c.gainCi.low)}, ${fmt(c.gainCi.high)}]` : "-";
+    const mde = c.mde === null || !Number.isFinite(c.mde) ? "-" : fmt(c.mde);
+    lines.push(`| ${c.candidateId} | ${c.decision} | ${fmt(c.mean)} | ${delta} | ${prGt} | ${q} | ${d} | ${c.pairedN} | ${gain} | ${mde} | ${c.onParetoFrontier ? "yes" : "no"} | ${c.gate ?? "-"} |`);
   }
   lines.push("");
   lines.push("## Statistical Summary");
@@ -1113,7 +1185,7 @@ function decisionWeight(decision) {
   return 1;
 }
 function signed(x) {
-  return `${x >= 0 ? "+" : ""}${fmt2(x)}`;
+  return `${x >= 0 ? "+" : ""}${fmt(x)}`;
 }
 function avg(xs) {
   if (xs.length === 0) return Number.NaN;
@@ -1124,226 +1196,31 @@ function medianOfSorted(sorted) {
   const mid = Math.floor(sorted.length / 2);
   return sorted.length % 2 === 0 ? (sorted[mid - 1] + sorted[mid]) / 2 : sorted[mid];
 }
-function fmt2(x) {
+function fmt(x) {
   if (!Number.isFinite(x)) return String(x);
   return x.toFixed(4);
 }
-// src/release-report.ts
-function renderReleaseReport(scorecard, options = {}) {
-  const title = options.title ?? `Release Report: ${scorecard.target}`;
-  const lines = [];
-  lines.push(`# ${title}`);
-  lines.push("");
-  lines.push(`Status: **${scorecard.status.toUpperCase()}**`);
-  lines.push(`Promote: **${scorecard.promote ? "yes" : "no"}**`);
-  if (scorecard.candidateId) lines.push(`Candidate: \`${scorecard.candidateId}\``);
-  if (scorecard.baselineId) lines.push(`Baseline: \`${scorecard.baselineId}\``);
-  lines.push("");
-  lines.push(scorecard.summary);
-  lines.push("");
-  lines.push("## Metrics");
-  lines.push("");
-  lines.push("| Metric | Value |");
-  lines.push("|---|---:|");
-  lines.push(`| Scenarios | ${scorecard.metrics.scenarioCount} |`);
-  lines.push(`| Search runs | ${scorecard.metrics.searchRuns} |`);
-  lines.push(`| Holdout runs | ${scorecard.metrics.holdoutRuns} |`);
-  lines.push(`| Pass rate | ${pct(scorecard.metrics.passRate)} |`);
-  lines.push(`| Mean score | ${num(scorecard.metrics.meanScore)} |`);
-  lines.push(`| Search mean | ${num(scorecard.metrics.searchMeanScore)} |`);
-  lines.push(`| Holdout mean | ${num(scorecard.metrics.holdoutMeanScore)} |`);
-  lines.push(`| Overfit gap | ${num(scorecard.metrics.overfitGap)} |`);
-  lines.push(`| Mean cost | $${num(scorecard.metrics.meanCostUsd)} |`);
-  lines.push(`| p95 wall time | ${Math.round(scorecard.metrics.p95WallMs)} ms |`);
-  lines.push("");
-  if (scorecard.issues.length > 0) {
-    lines.push("## Issues");
-    lines.push("");
-    for (const issue of scorecard.issues) {
-      lines.push(`- **${issue.severity}** \`${issue.code}\` (${issue.axis}): ${issue.detail}`);
-    }
-    lines.push("");
-  }
-  const surfaces = entries(scorecard.metrics.responsibleSurfaceCounts);
-  if (surfaces.length > 0) {
-    lines.push("## Responsible Surfaces");
-    lines.push("");
-    for (const [surface, count] of surfaces) lines.push(`- ${surface}: ${count}`);
-    lines.push("");
-  }
-  const failures = entries(scorecard.metrics.failureModeCounts);
-  if (failures.length > 0) {
-    lines.push("## Failure Modes");
-    lines.push("");
-    for (const [mode, count] of failures) lines.push(`- ${mode}: ${count}`);
-    lines.push("");
-  }
-  if (options.runs && options.runs.length > 0) {
-    lines.push("## Run Summary");
-    lines.push("");
-    lines.push(summaryTable([...options.runs], {
-      comparator: options.comparator ?? scorecard.baselineId ?? void 0,
-      split: "holdout"
-    }).markdown);
-    lines.push("");
-  }
-  if (options.traceAnalystFindings && options.traceAnalystFindings.length > 0) {
-    lines.push("## TraceAnalyst Findings");
-    lines.push("");
-    for (const finding of options.traceAnalystFindings) lines.push(`- ${finding}`);
-    lines.push("");
-  }
-  const nextActions = options.nextActions ?? defaultNextActions(scorecard);
-  if (nextActions.length > 0) {
-    lines.push("## Next Actions");
-    lines.push("");
-    for (const action of nextActions) lines.push(`- ${action}`);
-    lines.push("");
-  }
-  return lines.join("\n").trimEnd() + "\n";
-}
-function defaultNextActions(scorecard) {
-  if (scorecard.promote) return ["Promote the candidate and keep canaries enabled."];
-  return scorecard.issues.filter((issue) => issue.severity === "critical").map((issue) => `Resolve ${issue.code}: ${issue.detail}`);
-}
-function entries(values) {
-  return Object.entries(values).filter(([, count]) => count > 0).sort((a, b) => b[1] - a[1] || a[0].localeCompare(b[0]));
-}
-function pct(value) {
-  return Number.isFinite(value) ? `${(value * 100).toFixed(1)}%` : "n/a";
-}
-function num(value) {
-  return Number.isFinite(value) ? value.toFixed(3) : "n/a";
-}
-// src/promotion-gate.ts
-function bootstrapCi(baseline, candidate, options = {}) {
-  const alpha = options.alpha ?? 0.05;
-  const iterations = options.iterations ?? 1e3;
-  const minTotal = options.minTotalSamples ?? 6;
-  const rng = mulberry32(options.seed ?? hashSeed(baseline, candidate));
-  const baselineMean = mean2(baseline);
-  const candidateMean = mean2(candidate);
-  const delta = candidateMean - baselineMean;
-  if (baseline.length + candidate.length < minTotal || baseline.length === 0 || candidate.length === 0) {
-    return {
-      baselineMean,
-      candidateMean,
-      delta,
-      ciLower: -Infinity,
-      ciUpper: Infinity,
-      iterations: 0,
-      alpha,
-      verdict: "INCONCLUSIVE"
-    };
-  }
-  const deltas = new Array(iterations);
-  for (let i = 0; i < iterations; i++) {
-    const bResample = resample(baseline, rng);
-    const cResample = resample(candidate, rng);
-    deltas[i] = mean2(cResample) - mean2(bResample);
-  }
-  deltas.sort((a, b) => a - b);
-  const lowerIdx = Math.floor(alpha / 2 * iterations);
-  const upperIdx = Math.floor((1 - alpha / 2) * iterations) - 1;
-  const ciLower = deltas[Math.max(0, lowerIdx)];
-  const ciUpper = deltas[Math.min(iterations - 1, upperIdx)];
-  let verdict;
-  if (ciLower > 0) verdict = "ADVANCE";
-  else if (ciUpper < 0) verdict = "REVERT";
-  else if (delta >= 0) verdict = "KEEP";
-  else verdict = "INCONCLUSIVE";
-  return {
-    baselineMean,
-    candidateMean,
-    delta,
-    ciLower,
-    ciUpper,
-    iterations,
-    alpha,
-    verdict
-  };
-}
-function mean2(xs) {
-  if (xs.length === 0) return 0;
-  let s = 0;
-  for (const x of xs) s += x;
-  return s / xs.length;
-}
-function resample(xs, rng) {
-  const out = new Array(xs.length);
-  for (let i = 0; i < xs.length; i++) out[i] = xs[Math.floor(rng() * xs.length)];
-  return out;
-}
-function mulberry32(seed) {
-  let t = seed >>> 0;
-  return () => {
-    t += 1831565813;
-    let r = t;
-    r = Math.imul(r ^ r >>> 15, r | 1);
-    r ^= r + Math.imul(r ^ r >>> 7, r | 61);
-    return ((r ^ r >>> 14) >>> 0) / 4294967296;
-  };
-}
-function hashSeed(a, b) {
-  let h = 2166136261;
-  for (const x of [...a, ...b]) {
-    const view = new Float64Array([x]);
-    const bytes = new Uint8Array(view.buffer);
-    for (const byte of bytes) {
-      h ^= byte;
-      h = Math.imul(h, 16777619);
-    }
-  }
-  return h >>> 0;
-}
-async function judgeReplayGate(args) {
-  const concurrency = args.judgeConcurrency ?? 4;
-  const baselineScores = await scoreAll(args.baselineOutputs, args.judge, concurrency);
-  const candidateScores = await scoreAll(args.candidateOutputs, args.judge, concurrency);
-  const ci = bootstrapCi(baselineScores, candidateScores, {
-    ...args.alpha !== void 0 ? { alpha: args.alpha } : {},
-    ...args.iterations !== void 0 ? { iterations: args.iterations } : {},
-    ...args.seed !== void 0 ? { seed: args.seed } : {}
-  });
-  return {
-    ...ci,
-    baselineSamples: baselineScores.length,
-    candidateSamples: candidateScores.length
-  };
-}
-async function scoreAll(outputs, judge, concurrency) {
-  const results = new Array(outputs.length);
-  let next = 0;
-  async function worker() {
-    while (true) {
-      const i = next++;
-      if (i >= outputs.length) return;
-      const v = await judge(outputs[i]);
-      results[i] = Number.isFinite(v) ? v : 0;
-    }
-  }
-  await Promise.all(Array.from({ length: Math.max(1, concurrency) }, () => worker()));
-  return results;
-}
 export {
-  releaseTraceEvidenceFromMultiShotTrials,
-  evaluateReleaseConfidence,
-  assertReleaseConfidence,
-  canonicalize,
-  hashJson,
-  signManifest,
-  verifyManifest,
-  evaluateHypothesis,
+  normalizeScores,
+  weightedMean,
+  confidenceInterval,
+  interRaterReliability,
+  mannWhitneyU,
+  partialCredit,
+  pairedTTest,
+  wilcoxonSignedRank,
+  cohensD,
+  requiredSampleSize,
+  bonferroni,
+  benjaminiHochberg,
+  pairedBootstrap,
+  pairedWilcoxon,
+  bhAdjust,
   summaryTable,
   paretoChart,
   gainHistogram,
   RESEARCH_REPORT_HARD_PAIR_FLOOR,
-  researchReport,
-  renderReleaseReport,
-  bootstrapCi,
-  judgeReplayGate
+  researchReport
 };
-//# sourceMappingURL=chunk-3IX6QTB7.js.map
+//# sourceMappingURL=chunk-IOXMGMHQ.js.map