npm - @tangle-network/agent-eval - Versions diffs - 0.19.0 → 0.19.1 - Mend

@tangle-network/agent-eval 0.19.0 → 0.19.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

package/dist/index.js CHANGED Viewed

@@ -417,7 +417,7 @@ function confidenceInterval(scores2, confidence = 0.95) {
   if (scores2.length === 0) return { mean: 0, lower: 0, upper: 0 };
   if (scores2.length === 1) return { mean: scores2[0], lower: scores2[0], upper: scores2[0] };
   const n = scores2.length;
-  const mean9 = scores2.reduce((a, b) => a + b, 0) / n;
+  const mean10 = scores2.reduce((a, b) => a + b, 0) / n;
   const B = 1e3;
   const bootstrapMeans = [];
   for (let i = 0; i < B; i++) {
@@ -432,7 +432,7 @@ function confidenceInterval(scores2, confidence = 0.95) {
   const lowerIdx = Math.floor(alpha / 2 * B);
   const upperIdx = Math.floor((1 - alpha / 2) * B) - 1;
   return {
-    mean: mean9,
+    mean: mean10,
     lower: bootstrapMeans[lowerIdx],
     upper: bootstrapMeans[Math.min(upperIdx, B - 1)]
   };
@@ -520,11 +520,11 @@ function pairedTTest(before, after) {
   const n = before.length;
   if (n < 2) return { t: 0, df: 0, p: 1 };
   const diffs = before.map((b, i) => after[i] - b);
-  const mean9 = diffs.reduce((a, b) => a + b, 0) / n;
-  const variance2 = diffs.reduce((acc, d) => acc + (d - mean9) ** 2, 0) / (n - 1);
+  const mean10 = diffs.reduce((a, b) => a + b, 0) / n;
+  const variance2 = diffs.reduce((acc, d) => acc + (d - mean10) ** 2, 0) / (n - 1);
   const se = Math.sqrt(variance2 / n);
-  if (se === 0) return { t: mean9 === 0 ? 0 : Infinity, df: n - 1, p: mean9 === 0 ? 1 : 0 };
-  const t = mean9 / se;
+  if (se === 0) return { t: mean10 === 0 ? 0 : Infinity, df: n - 1, p: mean10 === 0 ? 1 : 0 };
+  const t = mean10 / se;
   const df = n - 1;
   const p = 2 * (1 - studentTCdf(Math.abs(t), df));
   return { t, df, p };
@@ -548,9 +548,9 @@ function wilcoxonSignedRank(before, after) {
   }
   let wPlus = 0;
   for (let k = 0; k < n; k++) if (diffs[k] > 0) wPlus += ranks3[k];
-  const mean9 = n * (n + 1) / 4;
+  const mean10 = n * (n + 1) / 4;
   const variance2 = n * (n + 1) * (2 * n + 1) / 24;
-  const z = (wPlus - mean9) / Math.sqrt(variance2);
+  const z = (wPlus - mean10) / Math.sqrt(variance2);
   const p = 2 * (1 - normalCdf(Math.abs(z)));
   return { w: wPlus, p };
 }
@@ -6457,10 +6457,10 @@ function analyzeSeries(values, options = {}) {
     return { state: "insufficient-data", windowMean: 0, windowCv: 0, tailRun: 0, stable: false };
   }
   const tail = values.slice(-window);
-  const mean9 = tail.reduce((a, b) => a + b, 0) / tail.length;
-  const variance2 = tail.reduce((acc, v) => acc + (v - mean9) ** 2, 0) / tail.length;
+  const mean10 = tail.reduce((a, b) => a + b, 0) / tail.length;
+  const variance2 = tail.reduce((acc, v) => acc + (v - mean10) ** 2, 0) / tail.length;
   const stdDev = Math.sqrt(variance2);
-  const refMean = Math.abs(mean9) > 1e-9 ? Math.abs(mean9) : 1;
+  const refMean = Math.abs(mean10) > 1e-9 ? Math.abs(mean10) : 1;
   const cv = stdDev / refMean;
   const stable = tail.length >= window && cv <= stableCv;
   let tailRun = 0;
@@ -6481,7 +6481,7 @@ function analyzeSeries(values, options = {}) {
   } else {
     state = "noisy";
   }
-  return { state, windowMean: mean9, windowCv: cv, tailRun, stable };
+  return { state, windowMean: mean10, windowCv: cv, tailRun, stable };
 }
 // src/state-continuity.ts
@@ -6938,9 +6938,9 @@ function benjaminiHochberg(pValues, fdr = 0.05) {
   for (let k = n - 1; k >= 0; k--) {
     const rank = k + 1;
     const raw = indexed[k].p * n / rank;
-    const bounded = Math.min(minRight, raw);
-    minRight = bounded;
-    q[indexed[k].i] = Math.min(1, bounded);
+    const bounded2 = Math.min(minRight, raw);
+    minRight = bounded2;
+    q[indexed[k].i] = Math.min(1, bounded2);
   }
   const significant = q.map((v) => v < fdr);
   return { qValues: q, significant };
@@ -7470,12 +7470,12 @@ async function paraphraseRobustness(prompt, mutators, scoreFn, options = {}) {
     variantScores.push({ mutator: id, score, mutated });
     all.push(score);
   }
-  const mean9 = all.reduce((a, b) => a + b, 0) / all.length;
-  const variance2 = all.reduce((a, v) => a + (v - mean9) ** 2, 0) / all.length;
+  const mean10 = all.reduce((a, b) => a + b, 0) / all.length;
+  const variance2 = all.reduce((a, v) => a + (v - mean10) ** 2, 0) / all.length;
   const stdDev = Math.sqrt(variance2);
-  const ref = Math.abs(mean9) > 1e-9 ? Math.abs(mean9) : 1;
+  const ref = Math.abs(mean10) > 1e-9 ? Math.abs(mean10) : 1;
   const robustness = Math.max(0, 1 - stdDev / ref);
-  return { originalScore, variantScores, meanScore: mean9, stdDev, robustness };
+  return { originalScore, variantScores, meanScore: mean10, stdDev, robustness };
 }
 var lowercaseMutator = (p) => p.toLowerCase();
 var sentenceReorderMutator = (p, seed) => {
@@ -8396,8 +8396,8 @@ async function prmBestOfN(store, grader, runIds) {
   if (runIds.length === 0) throw new Error("prmBestOfN: at least 1 candidate required");
   const graded = await Promise.all(runIds.map((id) => grader.grade(store, id)));
   const ranked = [...graded].sort((a, b) => b.aggregateScore - a.aggregateScore);
-  const mean9 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
-  const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean9) ** 2, 0) / graded.length;
+  const mean10 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
+  const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean10) ** 2, 0) / graded.length;
   return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
 }
 async function prmEnsembleBestOfN(store, graders, runIds) {
@@ -8419,8 +8419,8 @@ async function prmEnsembleBestOfN(store, graders, runIds) {
   const ranked = [...byRun.values()].sort(
     (a, b) => (bordaScores.get(b.runId) ?? 0) - (bordaScores.get(a.runId) ?? 0)
   );
-  const mean9 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
-  const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean9) ** 2, 0) / ranked.length;
+  const mean10 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
+  const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean10) ** 2, 0) / ranked.length;
   return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
 }
@@ -8950,8 +8950,8 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
     const sRuns = runs.filter((r) => r.scenarioId === s.id);
     const scores2 = sRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
     if (scores2.length < 3) continue;
-    const mean9 = scores2.reduce((a, b) => a + b, 0) / scores2.length;
-    const variance2 = scores2.reduce((a, b) => a + (b - mean9) ** 2, 0) / scores2.length;
+    const mean10 = scores2.reduce((a, b) => a + b, 0) / scores2.length;
+    const variance2 = scores2.reduce((a, b) => a + (b - mean10) ** 2, 0) / scores2.length;
     if (variance2 > varianceThreshold) {
       targets.push({
         reason: "high-variance",
@@ -12813,6 +12813,289 @@ function traceExcerpt(trace) {
   return void 0;
 }
+// src/release-confidence.ts
+var DEFAULT_THRESHOLDS = { // defaults; evaluateReleaseConfidence spreads caller-supplied thresholds over these
+  requireCorpus: true, // when true, supplying neither a dataset manifest nor scenarios is a critical issue
+  minScenarioCount: 1, // fewer scenarios than this is a critical corpus issue
+  minSearchRuns: 1, // fewer "search" runs than this is a critical quality issue
+  minHoldoutRuns: 1, // holdout-run floor; only enforced while requireHoldout is true
+  requireHoldout: true, // also makes a holdout split count of 0 a critical corpus issue
+  minPassRate: 0.8, // a pass rate below this is a critical quality issue
+  minMeanScore: 0.7, // a mean score below this is a critical quality issue
+  maxOverfitGap: 0.15, // largest tolerated (search mean score - holdout mean score)
+  maxMeanCostUsd: Number.POSITIVE_INFINITY, // infinite ceiling: the cost check cannot trigger by default
+  maxP95WallMs: Number.POSITIVE_INFINITY, // infinite ceiling: the latency check cannot trigger by default
+  requireAsiForFailures: true, // when true, failed rows without actionable side information (ASI) are critical
+  failureScoreThreshold: 0.5 // scores strictly below this count as failures
+};
+/**
+ * Converts multi-shot trial records into release trace-evidence rows.
+ *
+ * @param trials Trial records to convert; one row is produced per trial.
+ * @returns Rows whose `split` is "holdout", "dev" or (for anything else) "search",
+ *   whose `turnCount` is set only when `trace.turns` is an array, and whose
+ *   `failureMode` is "runtime_error" exactly when the trial carries a truthy `error`.
+ */
+function releaseTraceEvidenceFromMultiShotTrials(trials) {
+  const toEvidence = (trial) => {
+    const rawSplit = trial.split;
+    // Any split label other than holdout/dev is reported as "search".
+    const split = rawSplit === "holdout" || rawSplit === "dev" ? rawSplit : "search";
+    const turns = trial.trace?.turns;
+    return {
+      scenarioId: trial.scenarioId,
+      candidateId: trial.variantId,
+      split,
+      score: trial.score,
+      ok: trial.ok,
+      turnCount: Array.isArray(turns) ? turns.length : void 0,
+      costUsd: trial.cost,
+      durationMs: trial.durationMs,
+      failureMode: trial.error ? "runtime_error" : void 0,
+      asi: trial.asi,
+      metadata: trial.metadata
+    };
+  };
+  return trials.map((trial) => toEvidence(trial));
+}
+/**
+ * Builds a release-confidence scorecard for one candidate.
+ *
+ * Runs and traces are first narrowed to the candidate (or, when only a baseline id
+ * is given, to everything that is not the baseline). Aggregate metrics are then
+ * computed, the corpus / quality / generalization / diagnostics / efficiency checks
+ * append issues, and the overall status is "fail" when any issue is critical,
+ * "warn" when any issue exists, otherwise "pass".
+ *
+ * @param input Evaluation input. Reads `thresholds`, `candidateId`, `baselineId`,
+ *   `runs`, `traces`, `scenarios`, `dataset`, `gateDecision` and `target`.
+ * @returns Scorecard with `status`, `promote`, `axes`, `issues`, `metrics` and `summary`.
+ *   `promote` is true only when status is "pass" and the gate decision, if one was
+ *   supplied, also promotes.
+ */
+function evaluateReleaseConfidence(input) {
+  const thresholds = { ...DEFAULT_THRESHOLDS, ...input.thresholds };
+  const candidateId = input.candidateId ?? null;
+  const gateDecision = input.gateDecision ?? null;
+  const runs = filterCandidate(input.runs ?? [], candidateId, input.baselineId);
+  const traces = filterTraceCandidate(input.traces ?? [], candidateId, input.baselineId);
+  const scenarios = input.scenarios ?? [];
+  const searchScores = scoresFor(runs, "search");
+  const holdoutScores = scoresFor(runs, "holdout");
+  const runScores = [...searchScores, ...holdoutScores];
+  // Trace scores are only a fallback for when no run produced a usable score.
+  const scoreUniverse = runScores.length > 0 ? runScores : traces.map((t) => t.score).filter(isFiniteNumber);
+  const searchMeanScore = mean8(searchScores);
+  const holdoutMeanScore = mean8(holdoutScores);
+  // Classify failures once; both failure metrics below are derived from the same rows.
+  const failures = failedRows(runs, traces, thresholds.failureScoreThreshold);
+  // Drop missing or non-finite samples from runs as well as traces: a single gap would
+  // otherwise turn the aggregate into NaN, and NaN never exceeds a budget, so the
+  // efficiency checks would pass silently.
+  const costs = [...runs.map((r) => r.costUsd), ...traces.map((t) => t.costUsd)].filter(isFiniteNumber);
+  const wallTimes = [...runs.map((r) => r.wallMs), ...traces.map((t) => t.durationMs)].filter(isFiniteNumber);
+  const metrics = {
+    scenarioCount: input.dataset?.scenarioCount ?? scenarios.length,
+    searchRuns: runs.filter((r) => r.splitTag === "search").length,
+    holdoutRuns: runs.filter((r) => r.splitTag === "holdout").length,
+    passRate: passRate(runs, traces, thresholds.failureScoreThreshold),
+    meanScore: mean8(scoreUniverse),
+    searchMeanScore,
+    holdoutMeanScore,
+    overfitGap: safeDiff2(searchMeanScore, holdoutMeanScore),
+    meanCostUsd: mean8(costs),
+    p95WallMs: percentile(wallTimes, 0.95),
+    failedRows: failures.length,
+    failuresWithAsi: failures.filter((row) => row.hasAsi).length,
+    singleShotTraces: traces.filter((t) => t.turnCount === 1).length,
+    multiShotTraces: traces.filter((t) => (t.turnCount ?? 0) > 1).length,
+    splitCounts: input.dataset?.splitCounts ?? countScenarioSplits(scenarios),
+    domainCounts: countDomains(scenarios),
+    failureModeCounts: countFailureModes(runs, traces, thresholds.failureScoreThreshold),
+    responsibleSurfaceCounts: countResponsibleSurfaces(traces)
+  };
+  const issues = [];
+  checkCorpus(input, thresholds, metrics, issues);
+  checkQuality(thresholds, metrics, issues);
+  checkGeneralization(gateDecision, thresholds, metrics, issues);
+  checkDiagnostics(thresholds, metrics, issues);
+  checkEfficiency(thresholds, metrics, issues);
+  const axes = buildAxes(metrics, thresholds, gateDecision, issues);
+  const status = issues.some((i) => i.severity === "critical") ? "fail" : issues.length > 0 ? "warn" : "pass";
+  return {
+    target: input.target,
+    candidateId,
+    baselineId: input.baselineId ?? null,
+    status,
+    promote: status === "pass" && (gateDecision ? gateDecision.promote : true),
+    axes,
+    issues,
+    metrics,
+    dataset: input.dataset ?? null,
+    gateDecision,
+    summary: renderSummary(input.target, status, metrics, issues)
+  };
+}
+/**
+ * Evaluates release confidence and throws when the scorecard status is "fail".
+ *
+ * @param input Same input object that evaluateReleaseConfidence accepts.
+ * @returns The scorecard when its status is anything other than "fail".
+ * @throws Error whose message is the scorecard summary when the status is "fail".
+ */
+function assertReleaseConfidence(input) {
+  const result = evaluateReleaseConfidence(input);
+  if (result.status !== "fail") return result;
+  throw new Error(result.summary);
+}
+/**
+ * Narrows runs to the candidate under evaluation.
+ *
+ * @param runs Run records carrying a `candidateId`.
+ * @param candidateId When truthy, only runs with exactly this id are kept.
+ * @param baselineId Consulted only when `candidateId` is falsy; runs with this id are dropped.
+ * @returns A new array; a shallow copy of `runs` when neither id is truthy.
+ */
+function filterCandidate(runs, candidateId, baselineId) {
+  let keep = null;
+  if (candidateId) {
+    keep = (run) => run.candidateId === candidateId;
+  } else if (baselineId) {
+    keep = (run) => run.candidateId !== baselineId;
+  }
+  return keep === null ? [...runs] : runs.filter((run) => keep(run));
+}
+/**
+ * Narrows traces to the candidate under evaluation. Traces whose `candidateId`
+ * is undefined are always kept.
+ *
+ * @param traces Trace rows, optionally tagged with a `candidateId`.
+ * @param candidateId When truthy, tagged traces must match this id.
+ * @param baselineId Consulted only when `candidateId` is falsy; tagged traces with this id are dropped.
+ * @returns A new array; a shallow copy of `traces` when neither id is truthy.
+ */
+function filterTraceCandidate(traces, candidateId, baselineId) {
+  const isUntagged = (trace) => trace.candidateId === void 0;
+  if (candidateId) {
+    return traces.filter((trace) => isUntagged(trace) || trace.candidateId === candidateId);
+  }
+  if (baselineId) {
+    return traces.filter((trace) => isUntagged(trace) || trace.candidateId !== baselineId);
+  }
+  return Array.from(traces);
+}
+/**
+ * Appends critical "corpus" issues: no dataset or scenarios supplied, too few
+ * scenarios, or no holdout split while a holdout is required.
+ *
+ * @param input Evaluation input; reads `dataset` and `scenarios`.
+ * @param thresholds Reads `requireCorpus`, `minScenarioCount` and `requireHoldout`.
+ * @param metrics Reads `scenarioCount` and `splitCounts.holdout`.
+ * @param issues Issue list; mutated in place.
+ */
+function checkCorpus(input, thresholds, metrics, issues) {
+  const report = (code, detail) => {
+    issues.push({ axis: "corpus", severity: "critical", code, detail });
+  };
+  const suppliedScenarios = input.scenarios?.length ?? 0;
+  const corpusMissing = !input.dataset && suppliedScenarios === 0;
+  if (thresholds.requireCorpus && corpusMissing) {
+    report("missing_corpus", "No Dataset manifest or scenarios supplied.");
+  }
+  if (metrics.scenarioCount < thresholds.minScenarioCount) {
+    report("few_scenarios", `${metrics.scenarioCount} scenario(s) < min ${thresholds.minScenarioCount}.`);
+  }
+  if (thresholds.requireHoldout && metrics.splitCounts.holdout === 0) {
+    report("missing_holdout_split", "Corpus has no holdout scenarios.");
+  }
+}
+/**
+ * Appends critical "quality" issues when search runs, pass rate or mean score
+ * fall below their minimum thresholds.
+ *
+ * @param thresholds Reads `minSearchRuns`, `minPassRate` and `minMeanScore`.
+ * @param metrics Reads `searchRuns`, `passRate` and `meanScore`.
+ * @param issues Issue list; mutated in place.
+ */
+function checkQuality(thresholds, metrics, issues) {
+  const report = (code, detail) => {
+    issues.push({ axis: "quality", severity: "critical", code, detail });
+  };
+  if (metrics.searchRuns < thresholds.minSearchRuns) {
+    report("few_search_runs", `${metrics.searchRuns} search run(s) < min ${thresholds.minSearchRuns}.`);
+  }
+  if (metrics.passRate < thresholds.minPassRate) {
+    report("low_pass_rate", `passRate ${fmt3(metrics.passRate)} < ${fmt3(thresholds.minPassRate)}.`);
+  }
+  if (metrics.meanScore < thresholds.minMeanScore) {
+    report("low_mean_score", `meanScore ${fmt3(metrics.meanScore)} < ${fmt3(thresholds.minMeanScore)}.`);
+  }
+}
+/**
+ * Appends critical "generalization" issues: too few holdout runs (only while a
+ * holdout is required), a finite search-holdout gap above the maximum, or a gate
+ * decision that declines to promote.
+ *
+ * @param gateDecision Gate decision or null; reads `promote`, `rejectionCode` and `reason`.
+ * @param thresholds Reads `requireHoldout`, `minHoldoutRuns` and `maxOverfitGap`.
+ * @param metrics Reads `holdoutRuns` and `overfitGap`.
+ * @param issues Issue list; mutated in place.
+ */
+function checkGeneralization(gateDecision, thresholds, metrics, issues) {
+  const report = (code, detail) => {
+    issues.push({ axis: "generalization", severity: "critical", code, detail });
+  };
+  const holdoutShort = metrics.holdoutRuns < thresholds.minHoldoutRuns;
+  if (thresholds.requireHoldout && holdoutShort) {
+    report("few_holdout_runs", `${metrics.holdoutRuns} holdout run(s) < min ${thresholds.minHoldoutRuns}.`);
+  }
+  // A non-finite gap means one side had no scores; that is not treated as overfitting here.
+  const gapKnown = Number.isFinite(metrics.overfitGap);
+  if (gapKnown && metrics.overfitGap > thresholds.maxOverfitGap) {
+    report("overfit_gap", `search-holdout gap ${fmt3(metrics.overfitGap)} > ${fmt3(thresholds.maxOverfitGap)}.`);
+  }
+  if (gateDecision && !gateDecision.promote) {
+    report(`gate_${gateDecision.rejectionCode ?? "reject"}`, gateDecision.reason);
+  }
+}
+/**
+ * Appends a critical "diagnostics" issue when some failed rows carry no
+ * actionable side information. Does nothing when `requireAsiForFailures` is off.
+ *
+ * @param thresholds Reads `requireAsiForFailures`.
+ * @param metrics Reads `failedRows` and `failuresWithAsi`.
+ * @param issues Issue list; mutated in place.
+ */
+function checkDiagnostics(thresholds, metrics, issues) {
+  const enforced = Boolean(thresholds.requireAsiForFailures);
+  if (enforced && metrics.failedRows > metrics.failuresWithAsi) {
+    const undiagnosed = metrics.failedRows - metrics.failuresWithAsi;
+    const detail = `${undiagnosed} failed row(s) have no actionable side information.`;
+    issues.push({ axis: "diagnostics", severity: "critical", code: "missing_failure_asi", detail });
+  }
+}
+/**
+ * Appends critical "efficiency" issues when mean cost or p95 wall time exceed
+ * their budgets.
+ *
+ * @param thresholds Reads `maxMeanCostUsd` and `maxP95WallMs`.
+ * @param metrics Reads `meanCostUsd` and `p95WallMs`.
+ * @param issues Issue list; mutated in place.
+ */
+function checkEfficiency(thresholds, metrics, issues) {
+  const report = (code, detail) => {
+    issues.push({ axis: "efficiency", severity: "critical", code, detail });
+  };
+  const overCost = metrics.meanCostUsd > thresholds.maxMeanCostUsd;
+  if (overCost) {
+    report("cost_budget", `meanCostUsd ${fmt3(metrics.meanCostUsd)} > ${fmt3(thresholds.maxMeanCostUsd)}.`);
+  }
+  const overLatency = metrics.p95WallMs > thresholds.maxP95WallMs;
+  if (overLatency) {
+    report("latency_budget", `p95WallMs ${fmt3(metrics.p95WallMs)} > ${fmt3(thresholds.maxP95WallMs)}.`);
+  }
+}
+/**
+ * Assembles the five scorecard axes in the order corpus, quality,
+ * generalization, diagnostics, efficiency.
+ *
+ * @param metrics Aggregate metrics used for each axis score and detail text.
+ * @param thresholds Reads `minScenarioCount` and `maxOverfitGap`, and is passed on to efficiencyScore.
+ * @param gateDecision Gate decision or null; a non-promoting decision zeroes the generalization score.
+ * @param issues Issues raised so far; each axis derives its status from its own issues.
+ * @returns Array of axis records.
+ */
+function buildAxes(metrics, thresholds, gateDecision, issues) {
+  const corpusScore = bounded(metrics.scenarioCount / Math.max(1, thresholds.minScenarioCount));
+  const qualityScore = Math.min(metrics.passRate, metrics.meanScore);
+  const gateRejected = gateDecision && !gateDecision.promote;
+  const generalizationScore = gateRejected ? 0 : gapScore(metrics.overfitGap, thresholds.maxOverfitGap);
+  // With no failed rows there is nothing to diagnose, so the axis scores full marks.
+  const diagnosticsScore = metrics.failedRows === 0 ? 1 : metrics.failuresWithAsi / metrics.failedRows;
+  const efficiency = efficiencyScore(metrics, thresholds);
+  const axes = [];
+  axes.push(axis("corpus", issues, corpusScore, `${metrics.scenarioCount} scenarios; holdout=${metrics.splitCounts.holdout}`));
+  axes.push(axis("quality", issues, qualityScore, `passRate=${fmt3(metrics.passRate)} meanScore=${fmt3(metrics.meanScore)}`));
+  axes.push(axis("generalization", issues, generalizationScore, `holdoutRuns=${metrics.holdoutRuns} overfitGap=${fmt3(metrics.overfitGap)}`));
+  axes.push(axis("diagnostics", issues, diagnosticsScore, `failuresWithAsi=${metrics.failuresWithAsi}/${metrics.failedRows}`));
+  axes.push(axis("efficiency", issues, efficiency, `meanCostUsd=${fmt3(metrics.meanCostUsd)} p95WallMs=${fmt3(metrics.p95WallMs)}`));
+  return axes;
+}
+/**
+ * Builds one axis record.
+ *
+ * @param name Axis name; only issues whose `axis` equals it are considered.
+ * @param issues All issues raised so far.
+ * @param score Raw score; clamped to [0, 1] via bounded().
+ * @param detail Human-readable detail string, stored as given.
+ * @returns `{ name, status, score, detail }` where status is "fail" if any own issue
+ *   is critical, "warn" if the axis has any issue at all, otherwise "pass".
+ */
+function axis(name, issues, score, detail) {
+  let status = "pass";
+  for (const issue of issues) {
+    if (issue.axis !== name) continue;
+    if (issue.severity === "critical") {
+      status = "fail";
+      break;
+    }
+    status = "warn";
+  }
+  return { name, status, score: bounded(score), detail };
+}
+/**
+ * Tallies scenarios per split label.
+ *
+ * @param scenarios Scenario records; a nullish `split` is counted as "train".
+ * @returns Counts keyed by split label. "train", "dev", "test" and "holdout" are
+ *   always present (possibly 0); any other label encountered gets its own key.
+ */
+function countScenarioSplits(scenarios) {
+  const counts = { train: 0, dev: 0, test: 0, holdout: 0 };
+  for (const scenario of scenarios) {
+    const split = scenario.split ?? "train";
+    // Seed labels outside the four defaults at 0 first: incrementing a missing key
+    // evaluates `undefined + 1` and would store NaN instead of a count.
+    const seen = Object.prototype.hasOwnProperty.call(counts, split) ? counts[split] : 0;
+    counts[split] = seen + 1;
+  }
+  return counts;
+}
+/**
+ * Tallies scenarios per domain label.
+ *
+ * @param scenarios Scenario records; the label is `tags.domain`, else `tags.category`,
+ *   else "uncategorized".
+ * @returns Plain object mapping each label to its scenario count.
+ */
+function countDomains(scenarios) {
+  const tally = {};
+  for (const { tags } of scenarios) {
+    const label = tags?.domain ?? tags?.category ?? "uncategorized";
+    const seen = tally[label] ?? 0;
+    tally[label] = seen + 1;
+  }
+  return tally;
+}
+/**
+ * Tallies failure modes across failed runs and traces.
+ *
+ * A run fails when it has a truthy `failureMode` or a defined score below the
+ * threshold (holdout score preferred over search score). A trace fails when it
+ * has a truthy `failureMode`, `ok === false`, or a defined score below the threshold.
+ *
+ * @param runs Run records.
+ * @param traces Trace rows.
+ * @param threshold Scores strictly below this count as failures.
+ * @returns Counts keyed by failure mode; failures without an explicit mode are
+ *   filed under "not_ok" (traces with `ok === false`) or "low_score".
+ */
+function countFailureModes(runs, traces, threshold) {
+  const tally = {};
+  const bump = (mode) => {
+    tally[mode] = (tally[mode] ?? 0) + 1;
+  };
+  const isLow = (score) => score !== void 0 && score < threshold;
+  for (const run of runs) {
+    const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
+    if (!(run.failureMode || isLow(score))) continue;
+    bump(run.failureMode ?? "low_score");
+  }
+  for (const trace of traces) {
+    const notOk = trace.ok === false;
+    if (!(trace.failureMode || notOk || isLow(trace.score))) continue;
+    bump(trace.failureMode ?? (notOk ? "not_ok" : "low_score"));
+  }
+  return tally;
+}
+/**
+ * Tallies ASI entries per responsible surface across all traces.
+ *
+ * @param traces Trace rows; a missing `asi` list contributes nothing.
+ * @returns Counts keyed by `responsibleSurface`, with "unknown" for entries that lack one.
+ */
+function countResponsibleSurfaces(traces) {
+  const tally = {};
+  for (const { asi } of traces) {
+    const entries = asi ?? [];
+    for (const entry of entries) {
+      const surface = entry.responsibleSurface ?? "unknown";
+      tally[surface] = (tally[surface] ?? 0) + 1;
+    }
+  }
+  return tally;
+}
+/**
+ * Lists failed runs and traces, noting whether each carries actionable side
+ * information (ASI).
+ *
+ * @param runs Run records; a failed run has ASI when `outcome.raw.asi` is a number above 0.
+ * @param traces Trace rows; a failed trace has ASI when its `asi` list is non-empty.
+ * @param threshold Scores strictly below this count as failures.
+ * @returns One `{ hasAsi }` row per failure, runs first and then traces.
+ */
+function failedRows(runs, traces, threshold) {
+  const isLow = (score) => score !== void 0 && score < threshold;
+  const rows = [];
+  for (const run of runs) {
+    const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
+    if (!(run.failureMode || isLow(score))) continue;
+    const asiMetric = run.outcome.raw.asi;
+    const hasAsi = typeof asiMetric === "number" && asiMetric > 0;
+    rows.push({ hasAsi });
+  }
+  for (const trace of traces) {
+    if (!(trace.failureMode || trace.ok === false || isLow(trace.score))) continue;
+    const asiCount = trace.asi?.length ?? 0;
+    rows.push({ hasAsi: asiCount > 0 });
+  }
+  return rows;
+}
+/**
+ * Fraction of runs and traces that passed.
+ *
+ * A run passes when it has no failure mode and a defined score (holdout score
+ * preferred over search score) at or above the threshold. A trace passes when it
+ * has no failure mode, is not marked `ok === false`, and is either unscored or
+ * scored at or above the threshold.
+ *
+ * @param runs Run records.
+ * @param traces Trace rows.
+ * @param threshold Minimum passing score.
+ * @returns Pass fraction in [0, 1]; 0 when there are no runs and no traces.
+ */
+function passRate(runs, traces, threshold) {
+  const runPassed = (run) => {
+    const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
+    return !run.failureMode && score !== void 0 && score >= threshold;
+  };
+  // A trace with a failure mode is counted as failed by failedRows() and
+  // countFailureModes(); it must not also count as a pass here.
+  const tracePassed = (trace) => !trace.failureMode && trace.ok !== false && (trace.score === void 0 || trace.score >= threshold);
+  const total = runs.length + traces.length;
+  if (total === 0) return 0;
+  const passed = runs.filter((run) => runPassed(run)).length + traces.filter((trace) => tracePassed(trace)).length;
+  return passed / total;
+}
+/**
+ * Collects the finite scores of all runs tagged with the given split.
+ *
+ * @param runs Run records with `splitTag` and `outcome`.
+ * @param split Split tag to select; "holdout" reads `outcome.holdoutScore`, any
+ *   other value reads `outcome.searchScore`.
+ * @returns Finite numeric scores in run order.
+ */
+function scoresFor(runs, split) {
+  const readScore = split === "holdout" ? (run) => run.outcome.holdoutScore : (run) => run.outcome.searchScore;
+  const scores = [];
+  for (const run of runs) {
+    if (run.splitTag !== split) continue;
+    const score = readScore(run);
+    if (isFiniteNumber(score)) scores.push(score);
+  }
+  return scores;
+}
+/**
+ * Arithmetic mean of an array.
+ *
+ * @param xs Values to average.
+ * @returns The mean, or NaN for an empty array.
+ */
+function mean8(xs) {
+  const count = xs.length;
+  if (count === 0) return Number.NaN;
+  let total = 0;
+  xs.forEach((x) => {
+    total += x;
+  });
+  return total / count;
+}
+/**
+ * Percentile of an array: the value at position ceil(p * n) - 1 of the
+ * ascending sort, with the position clamped into the array.
+ *
+ * @param xs Values; the input array is not mutated.
+ * @param p Percentile as a fraction, e.g. 0.95.
+ * @returns The selected value, or NaN for an empty array.
+ */
+function percentile(xs, p) {
+  const count = xs.length;
+  if (count === 0) return Number.NaN;
+  const ascending = Array.from(xs).sort((left, right) => left - right);
+  const rank = Math.ceil(p * count) - 1;
+  const index = Math.min(count - 1, Math.max(0, rank));
+  return ascending[index];
+}
+/**
+ * Reports whether a value is a finite number.
+ *
+ * @param value Any value.
+ * @returns True only for number primitives that are neither NaN nor infinite.
+ */
+function isFiniteNumber(value) {
+  // Number.isFinite does not coerce, so every non-number value already yields false.
+  return Number.isFinite(value);
+}
+/**
+ * Difference `a - b` that is only defined when both operands are finite.
+ *
+ * @param a Minuend.
+ * @param b Subtrahend.
+ * @returns `a - b`, or NaN when either operand is not a finite number.
+ */
+function safeDiff2(a, b) {
+  const bothFinite = Number.isFinite(a) && Number.isFinite(b);
+  return bothFinite ? a - b : Number.NaN;
+}
+/**
+ * Scores a search-holdout gap against the maximum tolerated gap.
+ *
+ * @param gap Observed gap; a non-finite gap scores 0.
+ * @param maxGap Maximum tolerated gap.
+ * @returns A score in [0, 1]: 1 for a gap of 0 or less, falling linearly to 0 at
+ *   `maxGap`. When `maxGap` is 0 or negative the score is 1 for a gap of 0 or
+ *   less and 0 otherwise.
+ */
+function gapScore(gap, maxGap) {
+  if (!Number.isFinite(gap)) return 0;
+  if (maxGap <= 0) return gap > 0 ? 0 : 1;
+  const excess = Math.max(0, gap);
+  return bounded(1 - excess / maxGap);
+}
+/**
+ * Scores efficiency as the worse of the cost and latency budget ratios.
+ *
+ * @param metrics Reads `meanCostUsd` and `p95WallMs`.
+ * @param thresholds Reads `maxMeanCostUsd` and `maxP95WallMs`.
+ * @returns A score in [0, 1]; a dimension whose budget or measurement is not
+ *   finite contributes 1.
+ */
+function efficiencyScore(metrics, thresholds) {
+  const headroom = (budget, actual) => {
+    if (!Number.isFinite(budget) || !Number.isFinite(actual)) return 1;
+    // Floor the denominator so a zero measurement cannot divide by zero.
+    return bounded(budget / Math.max(actual, 1e-12));
+  };
+  const costScore = headroom(thresholds.maxMeanCostUsd, metrics.meanCostUsd);
+  const latencyScore = headroom(thresholds.maxP95WallMs, metrics.p95WallMs);
+  return Math.min(costScore, latencyScore);
+}
+/**
+ * Clamps a value into [0, 1].
+ *
+ * @param x Value to clamp.
+ * @returns `x` limited to [0, 1]; 0 when `x` is not a finite number.
+ */
+function bounded(x) {
+  if (!Number.isFinite(x)) return 0;
+  // `<=` also maps negative zero to plain 0.
+  if (x <= 0) return 0;
+  if (x >= 1) return 1;
+  return x;
+}
+/**
+ * Renders the one-line scorecard summary.
+ *
+ * @param target Release target label.
+ * @param status Overall status string.
+ * @param metrics Reads `scenarioCount`, `searchRuns`, `holdoutRuns`, `passRate` and `meanScore`.
+ * @param issues Raised issues; their codes are appended when any exist.
+ * @returns "; "-separated summary: headline, metrics, and (if any) the issue codes.
+ */
+function renderSummary(target, status, metrics, issues) {
+  const parts = [
+    `release confidence ${status}: ${target}`,
+    `scenarios=${metrics.scenarioCount} searchRuns=${metrics.searchRuns} holdoutRuns=${metrics.holdoutRuns} passRate=${fmt3(metrics.passRate)} meanScore=${fmt3(metrics.meanScore)}`
+  ];
+  if (issues.length > 0) {
+    const codes = issues.map((issue) => issue.code).join(",");
+    parts.push(`issues=${codes}`);
+  }
+  return parts.join("; ");
+}
+/**
+ * Formats a number for issue and summary text.
+ *
+ * @param x Value to format.
+ * @returns `x` with four decimal places, or `String(x)` when `x` is not a finite number.
+ */
+function fmt3(x) {
+  return Number.isFinite(x) ? x.toFixed(4) : String(x);
+}
 // src/jsonl-trial-cache.ts
 import { appendFileSync as appendFileSync4, existsSync as existsSync6, mkdirSync as mkdirSync4, readFileSync as readFileSync5 } from "fs";
 import { dirname as dirname4 } from "path";
@@ -13458,9 +13741,9 @@ function passOrthogonality(input) {
       sims.push(cosineSimilarity(vectors[i], vectors[j]));
     }
   }
-  const mean9 = sims.length === 0 ? 0 : sims.reduce((a, b) => a + b, 0) / sims.length;
+  const mean10 = sims.length === 0 ? 0 : sims.reduce((a, b) => a + b, 0) / sims.length;
   return {
-    orthogonality: Math.max(0, Math.min(1, 1 - mean9)),
+    orthogonality: Math.max(0, Math.min(1, 1 - mean10)),
     passCount: passes.length,
     similarities: sims
   };
@@ -13506,8 +13789,8 @@ function bootstrapCi(baseline, candidate, options = {}) {
   const iterations = options.iterations ?? 1e3;
   const minTotal = options.minTotalSamples ?? 6;
   const rng = mulberry32(options.seed ?? hashSeed(baseline, candidate));
-  const baselineMean = mean8(baseline);
-  const candidateMean = mean8(candidate);
+  const baselineMean = mean9(baseline);
+  const candidateMean = mean9(candidate);
   const delta = candidateMean - baselineMean;
   if (baseline.length + candidate.length < minTotal || baseline.length === 0 || candidate.length === 0) {
     return {
@@ -13525,7 +13808,7 @@ function bootstrapCi(baseline, candidate, options = {}) {
   for (let i = 0; i < iterations; i++) {
     const bResample = resample(baseline, rng);
     const cResample = resample(candidate, rng);
-    deltas[i] = mean8(cResample) - mean8(bResample);
+    deltas[i] = mean9(cResample) - mean9(bResample);
   }
   deltas.sort((a, b) => a - b);
   const lowerIdx = Math.floor(alpha / 2 * iterations);
@@ -13548,7 +13831,7 @@ function bootstrapCi(baseline, candidate, options = {}) {
     verdict
   };
 }
-function mean8(xs) {
+function mean9(xs) {
   if (xs.length === 0) return 0;
   let s = 0;
   for (const x of xs) s += x;
@@ -13872,6 +14155,7 @@ export {
   analyzeAntiSlop,
   analyzeSeries,
   argHash,
+  assertReleaseConfidence,
   assignFeedbackSplit,
   attributeCounterfactuals,
   deterministicSplit as benchmarkDeterministicSplit,
@@ -13942,6 +14226,7 @@ export {
   evaluateContract,
   evaluateHypothesis,
   evaluateOracles,
+  evaluateReleaseConfidence,
   executeScenario,
   expectAgent,
   exportRewardModel,
@@ -14041,6 +14326,7 @@ export {
   regexMatch,
   regexMatches,
   regressionView,
+  releaseTraceEvidenceFromMultiShotTrials,
   renderMarkdown,
   renderMarkdownReport,
   renderPlaybookMarkdown,