npm - @tangle-network/agent-eval - Versions diffs - 0.18.0 → 0.19.1 - Mend

@tangle-network/agent-eval 0.18.0 → 0.19.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

package/dist/index.js CHANGED

@@ -417,7 +417,7 @@ function confidenceInterval(scores2, confidence = 0.95) {
   if (scores2.length === 0) return { mean: 0, lower: 0, upper: 0 };
   if (scores2.length === 1) return { mean: scores2[0], lower: scores2[0], upper: scores2[0] };
   const n = scores2.length;
-  const mean9 = scores2.reduce((a, b) => a + b, 0) / n;
+  const mean10 = scores2.reduce((a, b) => a + b, 0) / n;
   const B = 1e3;
   const bootstrapMeans = [];
   for (let i = 0; i < B; i++) {
@@ -432,7 +432,7 @@ function confidenceInterval(scores2, confidence = 0.95) {
   const lowerIdx = Math.floor(alpha / 2 * B);
   const upperIdx = Math.floor((1 - alpha / 2) * B) - 1;
   return {
-    mean: mean9,
+    mean: mean10,
     lower: bootstrapMeans[lowerIdx],
     upper: bootstrapMeans[Math.min(upperIdx, B - 1)]
   };
@@ -520,11 +520,11 @@ function pairedTTest(before, after) {
   const n = before.length;
   if (n < 2) return { t: 0, df: 0, p: 1 };
   const diffs = before.map((b, i) => after[i] - b);
-  const mean9 = diffs.reduce((a, b) => a + b, 0) / n;
-  const variance2 = diffs.reduce((acc, d) => acc + (d - mean9) ** 2, 0) / (n - 1);
+  const mean10 = diffs.reduce((a, b) => a + b, 0) / n;
+  const variance2 = diffs.reduce((acc, d) => acc + (d - mean10) ** 2, 0) / (n - 1);
   const se = Math.sqrt(variance2 / n);
-  if (se === 0) return { t: mean9 === 0 ? 0 : Infinity, df: n - 1, p: mean9 === 0 ? 1 : 0 };
-  const t = mean9 / se;
+  if (se === 0) return { t: mean10 === 0 ? 0 : Infinity, df: n - 1, p: mean10 === 0 ? 1 : 0 };
+  const t = mean10 / se;
   const df = n - 1;
   const p = 2 * (1 - studentTCdf(Math.abs(t), df));
   return { t, df, p };
@@ -548,9 +548,9 @@ function wilcoxonSignedRank(before, after) {
   }
   let wPlus = 0;
   for (let k = 0; k < n; k++) if (diffs[k] > 0) wPlus += ranks3[k];
-  const mean9 = n * (n + 1) / 4;
+  const mean10 = n * (n + 1) / 4;
   const variance2 = n * (n + 1) * (2 * n + 1) / 24;
-  const z = (wPlus - mean9) / Math.sqrt(variance2);
+  const z = (wPlus - mean10) / Math.sqrt(variance2);
   const p = 2 * (1 - normalCdf(Math.abs(z)));
   return { w: wPlus, p };
 }
@@ -3473,174 +3473,6 @@ function rowToRun(row) {
   };
 }
-// src/power-analysis.ts
-function requiredSampleSize(opts) {
-  const effect = opts.effect;
-  if (!Number.isFinite(effect) || effect <= 0) return Infinity;
-  const alpha = opts.alpha ?? 0.05;
-  const power = opts.power ?? 0.8;
-  const twoSided = opts.twoSided ?? true;
-  const zAlpha = zQuantile(twoSided ? 1 - alpha / 2 : 1 - alpha);
-  const zBeta = zQuantile(power);
-  const n = 2 * Math.pow((zAlpha + zBeta) / effect, 2);
-  return Math.ceil(n);
-}
-function bonferroni(pValues, alpha = 0.05) {
-  const k = pValues.length;
-  const adjusted = pValues.map((p) => Math.min(1, p * k));
-  const significant = adjusted.map((p) => p < alpha);
-  return { adjusted, significant };
-}
-function benjaminiHochberg(pValues, fdr = 0.05) {
-  const n = pValues.length;
-  if (n === 0) return { qValues: [], significant: [] };
-  const indexed = pValues.map((p, i) => ({ p, i })).sort((a, b) => a.p - b.p);
-  const q = new Array(n);
-  let minRight = 1;
-  for (let k = n - 1; k >= 0; k--) {
-    const rank = k + 1;
-    const raw = indexed[k].p * n / rank;
-    const bounded = Math.min(minRight, raw);
-    minRight = bounded;
-    q[indexed[k].i] = Math.min(1, bounded);
-  }
-  const significant = q.map((v) => v < fdr);
-  return { qValues: q, significant };
-}
-function zQuantile(p) {
-  if (p <= 0 || p >= 1) {
-    if (p === 0) return -Infinity;
-    if (p === 1) return Infinity;
-    return NaN;
-  }
-  const a = [-39.69683028665376, 220.9460984245205, -275.9285104469687, 138.357751867269, -30.66479806614716, 2.506628277459239];
-  const b = [-54.47609879822406, 161.5858368580409, -155.6989798598866, 66.80131188771972, -13.28068155288572];
-  const c = [-0.007784894002430293, -0.3223964580411365, -2.400758277161838, -2.549732539343734, 4.374664141464968, 2.938163982698783];
-  const d = [0.007784695709041462, 0.3224671290700398, 2.445134137142996, 3.754408661907416];
-  const pLow = 0.02425;
-  const pHigh = 1 - pLow;
-  let q;
-  let r;
-  if (p < pLow) {
-    q = Math.sqrt(-2 * Math.log(p));
-    return (((((c[0] * q + c[1]) * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5]) / ((((d[0] * q + d[1]) * q + d[2]) * q + d[3]) * q + 1);
-  }
-  if (p <= pHigh) {
-    q = p - 0.5;
-    r = q * q;
-    return (((((a[0] * r + a[1]) * r + a[2]) * r + a[3]) * r + a[4]) * r + a[5]) * q / (((((b[0] * r + b[1]) * r + b[2]) * r + b[3]) * r + b[4]) * r + 1);
-  }
-  q = Math.sqrt(-2 * Math.log(1 - p));
-  return -(((((c[0] * q + c[1]) * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5]) / ((((d[0] * q + d[1]) * q + d[2]) * q + d[3]) * q + 1);
-}
-// src/prompt-optimizer.ts
-var PromptOptimizer = class {
-  async run(config) {
-    const trials = config.trialsPerScenario ?? 3;
-    const alpha = config.significanceLevel ?? 0.05;
-    if (config.variants.length < 2) {
-      throw new Error("PromptOptimizer requires at least 2 variants");
-    }
-    if (config.scenarioIds.length === 0) {
-      throw new Error("PromptOptimizer requires at least 1 scenario");
-    }
-    const rawScores = /* @__PURE__ */ new Map();
-    for (const variant of config.variants) {
-      const scenarioMap = /* @__PURE__ */ new Map();
-      rawScores.set(variant.id, scenarioMap);
-      for (const scenarioId of config.scenarioIds) {
-        const samples = [];
-        for (let t = 0; t < trials; t++) {
-          const score = await config.scoreVariant({
-            variant,
-            scenarioId,
-            trialIndex: t
-          });
-          if (!Number.isFinite(score)) {
-            throw new Error(`scoreVariant returned non-finite: variant=${variant.id} scenario=${scenarioId} trial=${t}`);
-          }
-          samples.push(score);
-        }
-        scenarioMap.set(scenarioId, samples);
-        config.onScenarioComplete?.({
-          variantId: variant.id,
-          scenarioId,
-          scores: samples
-        });
-      }
-    }
-    const scores2 = config.variants.map((variant) => {
-      const scenarioMap = rawScores.get(variant.id);
-      const allSamples = [];
-      const perScenario = {};
-      for (const scenarioId of config.scenarioIds) {
-        const samples = scenarioMap.get(scenarioId) ?? [];
-        allSamples.push(...samples);
-        perScenario[scenarioId] = {
-          mean: samples.length ? samples.reduce((a, b) => a + b, 0) / samples.length : 0,
-          n: samples.length,
-          samples
-        };
-      }
-      const ci = confidenceInterval(allSamples, 0.95);
-      return {
-        variantId: variant.id,
-        mean: ci.mean,
-        ci95: { lower: ci.lower, upper: ci.upper },
-        n: allSamples.length,
-        perScenario
-      };
-    });
-    const rawPairs = [];
-    for (let i = 0; i < scores2.length; i++) {
-      for (let j = i + 1; j < scores2.length; j++) {
-        const a = scores2[i];
-        const b = scores2[j];
-        const { p } = mannWhitneyU(flatSamples(a), flatSamples(b));
-        rawPairs.push({ a, b, p });
-      }
-    }
-    const { qValues } = benjaminiHochberg(rawPairs.map((r) => r.p), alpha);
-    const pairwise2 = rawPairs.map((r, idx) => ({
-      variantA: r.a.variantId,
-      variantB: r.b.variantId,
-      pValue: r.p,
-      qValue: qValues[idx],
-      significant: qValues[idx] < alpha,
-      meanDelta: r.b.mean - r.a.mean
-    }));
-    const sorted = scores2.slice().sort((x, y) => y.mean - x.mean);
-    const winner = sorted[0];
-    const second = sorted[1];
-    const winnerComparisons = pairwise2.filter(
-      (c) => c.variantA === winner.variantId || c.variantB === winner.variantId
-    );
-    const significantOverAll = winnerComparisons.every((c) => c.significant);
-    const ciLowerBoundExceedsSecondMean = winner.ci95.lower > second.mean;
-    return {
-      winner: {
-        variantId: winner.variantId,
-        significant: significantOverAll,
-        ciLowerBoundExceedsSecondMean
-      },
-      scores: scores2,
-      pairwise: pairwise2,
-      config: {
-        trialsPerScenario: trials,
-        significanceLevel: alpha,
-        variants: config.variants.map((v) => v.id),
-        scenarios: config.scenarioIds
-      }
-    };
-  }
-};
-function flatSamples(score) {
-  const out = [];
-  for (const s of Object.values(score.perScenario)) out.push(...s.samples);
-  return out;
-}
 // src/steering.ts
 function mergeSteeringBundle(base, delta) {
   return {
@@ -3831,46 +3663,6 @@ function canonicalInstruction(value) {
   return normalized.length === 0 ? normalized : normalized[0].toUpperCase() + normalized.slice(1);
 }
-// src/optimization-loop.ts
-var OptimizationLoop = class {
-  optimizer;
-  constructor(optimizer = new PromptOptimizer()) {
-    this.optimizer = optimizer;
-  }
-  async run(config) {
-    const byId = new Map(config.variants.map((variant) => [variant.id, variant]));
-    const result = await this.optimizer.run({
-      variants: config.variants.map((variant) => ({
-        id: variant.id,
-        prompt: renderSteeringText(variant),
-        metadata: { bundle: variant }
-      })),
-      scenarioIds: config.examples.map((example) => example.scenarioId),
-      trialsPerScenario: config.trialsPerScenario,
-      scoreVariant: async ({ variant, scenarioId, trialIndex }) => {
-        const bundle = byId.get(variant.id);
-        if (!bundle) throw new Error(`unknown steering bundle ${variant.id}`);
-        const example = config.examples.find((item) => item.scenarioId === scenarioId);
-        if (!example) throw new Error(`unknown optimization example ${scenarioId}`);
-        const score = await config.evaluate({ variant: bundle, example, trialIndex });
-        return aggregateRunScore(score, config.scoreWeights);
-      }
-    });
-    return {
-      winner: byId.get(result.winner.variantId),
-      significant: result.winner.significant,
-      reports: result.scores.map((score) => ({
-        variantId: score.variantId,
-        bundle: byId.get(score.variantId),
-        mean: score.mean,
-        ci95: score.ci95,
-        scenarioScores: score.perScenario
-      })),
-      pairwise: result.pairwise
-    };
-  }
-};
 // src/steering-optimizer.ts
 var PairwiseSteeringOptimizer = class {
   optimize(rows, config = {}) {
@@ -6665,10 +6457,10 @@ function analyzeSeries(values, options = {}) {
     return { state: "insufficient-data", windowMean: 0, windowCv: 0, tailRun: 0, stable: false };
   }
   const tail = values.slice(-window);
-  const mean9 = tail.reduce((a, b) => a + b, 0) / tail.length;
-  const variance2 = tail.reduce((acc, v) => acc + (v - mean9) ** 2, 0) / tail.length;
+  const mean10 = tail.reduce((a, b) => a + b, 0) / tail.length;
+  const variance2 = tail.reduce((acc, v) => acc + (v - mean10) ** 2, 0) / tail.length;
   const stdDev = Math.sqrt(variance2);
-  const refMean = Math.abs(mean9) > 1e-9 ? Math.abs(mean9) : 1;
+  const refMean = Math.abs(mean10) > 1e-9 ? Math.abs(mean10) : 1;
   const cv = stdDev / refMean;
   const stable = tail.length >= window && cv <= stableCv;
   let tailRun = 0;
@@ -6689,7 +6481,7 @@ function analyzeSeries(values, options = {}) {
   } else {
     state = "noisy";
   }
-  return { state, windowMean: mean9, windowCv: cv, tailRun, stable };
+  return { state, windowMean: mean10, windowCv: cv, tailRun, stable };
 }
 // src/state-continuity.ts
@@ -7119,6 +6911,67 @@ function excerpt3(source, needle) {
   return (start > 0 ? "\u2026" : "") + source.slice(start, end) + (end < source.length ? "\u2026" : "");
 }
+// src/power-analysis.ts
+function requiredSampleSize(opts) {
+  const effect = opts.effect;
+  if (!Number.isFinite(effect) || effect <= 0) return Infinity;
+  const alpha = opts.alpha ?? 0.05;
+  const power = opts.power ?? 0.8;
+  const twoSided = opts.twoSided ?? true;
+  const zAlpha = zQuantile(twoSided ? 1 - alpha / 2 : 1 - alpha);
+  const zBeta = zQuantile(power);
+  const n = 2 * Math.pow((zAlpha + zBeta) / effect, 2);
+  return Math.ceil(n);
+}
+function bonferroni(pValues, alpha = 0.05) {
+  const k = pValues.length;
+  const adjusted = pValues.map((p) => Math.min(1, p * k));
+  const significant = adjusted.map((p) => p < alpha);
+  return { adjusted, significant };
+}
+function benjaminiHochberg(pValues, fdr = 0.05) {
+  const n = pValues.length;
+  if (n === 0) return { qValues: [], significant: [] };
+  const indexed = pValues.map((p, i) => ({ p, i })).sort((a, b) => a.p - b.p);
+  const q = new Array(n);
+  let minRight = 1;
+  for (let k = n - 1; k >= 0; k--) {
+    const rank = k + 1;
+    const raw = indexed[k].p * n / rank;
+    const bounded2 = Math.min(minRight, raw);
+    minRight = bounded2;
+    q[indexed[k].i] = Math.min(1, bounded2);
+  }
+  const significant = q.map((v) => v < fdr);
+  return { qValues: q, significant };
+}
+function zQuantile(p) {
+  if (p <= 0 || p >= 1) {
+    if (p === 0) return -Infinity;
+    if (p === 1) return Infinity;
+    return NaN;
+  }
+  const a = [-39.69683028665376, 220.9460984245205, -275.9285104469687, 138.357751867269, -30.66479806614716, 2.506628277459239];
+  const b = [-54.47609879822406, 161.5858368580409, -155.6989798598866, 66.80131188771972, -13.28068155288572];
+  const c = [-0.007784894002430293, -0.3223964580411365, -2.400758277161838, -2.549732539343734, 4.374664141464968, 2.938163982698783];
+  const d = [0.007784695709041462, 0.3224671290700398, 2.445134137142996, 3.754408661907416];
+  const pLow = 0.02425;
+  const pHigh = 1 - pLow;
+  let q;
+  let r;
+  if (p < pLow) {
+    q = Math.sqrt(-2 * Math.log(p));
+    return (((((c[0] * q + c[1]) * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5]) / ((((d[0] * q + d[1]) * q + d[2]) * q + d[3]) * q + 1);
+  }
+  if (p <= pHigh) {
+    q = p - 0.5;
+    r = q * q;
+    return (((((a[0] * r + a[1]) * r + a[2]) * r + a[3]) * r + a[4]) * r + a[5]) * q / (((((b[0] * r + b[1]) * r + b[2]) * r + b[3]) * r + b[4]) * r + 1);
+  }
+  q = Math.sqrt(-2 * Math.log(1 - p));
+  return -(((((c[0] * q + c[1]) * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5]) / ((((d[0] * q + d[1]) * q + d[2]) * q + d[3]) * q + 1);
+}
 // src/behavior-dsl.ts
 var BehaviorAssertion = class {
   constructor(store, runId) {
@@ -7617,12 +7470,12 @@ async function paraphraseRobustness(prompt, mutators, scoreFn, options = {}) {
     variantScores.push({ mutator: id, score, mutated });
     all.push(score);
   }
-  const mean9 = all.reduce((a, b) => a + b, 0) / all.length;
-  const variance2 = all.reduce((a, v) => a + (v - mean9) ** 2, 0) / all.length;
+  const mean10 = all.reduce((a, b) => a + b, 0) / all.length;
+  const variance2 = all.reduce((a, v) => a + (v - mean10) ** 2, 0) / all.length;
   const stdDev = Math.sqrt(variance2);
-  const ref = Math.abs(mean9) > 1e-9 ? Math.abs(mean9) : 1;
+  const ref = Math.abs(mean10) > 1e-9 ? Math.abs(mean10) : 1;
   const robustness = Math.max(0, 1 - stdDev / ref);
-  return { originalScore, variantScores, meanScore: mean9, stdDev, robustness };
+  return { originalScore, variantScores, meanScore: mean10, stdDev, robustness };
 }
 var lowercaseMutator = (p) => p.toLowerCase();
 var sentenceReorderMutator = (p, seed) => {
@@ -8543,8 +8396,8 @@ async function prmBestOfN(store, grader, runIds) {
   if (runIds.length === 0) throw new Error("prmBestOfN: at least 1 candidate required");
   const graded = await Promise.all(runIds.map((id) => grader.grade(store, id)));
   const ranked = [...graded].sort((a, b) => b.aggregateScore - a.aggregateScore);
-  const mean9 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
-  const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean9) ** 2, 0) / graded.length;
+  const mean10 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
+  const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean10) ** 2, 0) / graded.length;
   return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
 }
 async function prmEnsembleBestOfN(store, graders, runIds) {
@@ -8566,8 +8419,8 @@ async function prmEnsembleBestOfN(store, graders, runIds) {
   const ranked = [...byRun.values()].sort(
     (a, b) => (bordaScores.get(b.runId) ?? 0) - (bordaScores.get(a.runId) ?? 0)
   );
-  const mean9 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
-  const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean9) ** 2, 0) / ranked.length;
+  const mean10 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
+  const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean10) ** 2, 0) / ranked.length;
   return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
 }
@@ -9097,8 +8950,8 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
     const sRuns = runs.filter((r) => r.scenarioId === s.id);
     const scores2 = sRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
     if (scores2.length < 3) continue;
-    const mean9 = scores2.reduce((a, b) => a + b, 0) / scores2.length;
-    const variance2 = scores2.reduce((a, b) => a + (b - mean9) ** 2, 0) / scores2.length;
+    const mean10 = scores2.reduce((a, b) => a + b, 0) / scores2.length;
+    const variance2 = scores2.reduce((a, b) => a + (b - mean10) ** 2, 0) / scores2.length;
     if (variance2 > varianceThreshold) {
       targets.push({
         reason: "high-variance",
@@ -12960,6 +12813,289 @@ function traceExcerpt(trace) {
   return void 0;
 }
+// src/release-confidence.ts
+var DEFAULT_THRESHOLDS = {
+  requireCorpus: true,
+  minScenarioCount: 1,
+  minSearchRuns: 1,
+  minHoldoutRuns: 1,
+  requireHoldout: true,
+  minPassRate: 0.8,
+  minMeanScore: 0.7,
+  maxOverfitGap: 0.15,
+  maxMeanCostUsd: Number.POSITIVE_INFINITY,
+  maxP95WallMs: Number.POSITIVE_INFINITY,
+  requireAsiForFailures: true,
+  failureScoreThreshold: 0.5
+};
+function releaseTraceEvidenceFromMultiShotTrials(trials) {
+  return trials.map((trial) => ({
+    scenarioId: trial.scenarioId,
+    candidateId: trial.variantId,
+    split: trial.split === "holdout" ? "holdout" : trial.split === "dev" ? "dev" : "search",
+    score: trial.score,
+    ok: trial.ok,
+    turnCount: Array.isArray(trial.trace?.turns) ? trial.trace.turns.length : void 0,
+    costUsd: trial.cost,
+    durationMs: trial.durationMs,
+    failureMode: trial.error ? "runtime_error" : void 0,
+    asi: trial.asi,
+    metadata: trial.metadata
+  }));
+}
+function evaluateReleaseConfidence(input) {
+  const thresholds = { ...DEFAULT_THRESHOLDS, ...input.thresholds };
+  const candidateId = input.candidateId ?? null;
+  const runs = filterCandidate(input.runs ?? [], candidateId, input.baselineId);
+  const traces = filterTraceCandidate(input.traces ?? [], candidateId, input.baselineId);
+  const scenarios = input.scenarios ?? [];
+  const scenarioCount = input.dataset?.scenarioCount ?? scenarios.length;
+  const splitCounts = input.dataset?.splitCounts ?? countScenarioSplits(scenarios);
+  const searchScores = scoresFor(runs, "search");
+  const holdoutScores = scoresFor(runs, "holdout");
+  const allScores = [...searchScores, ...holdoutScores];
+  const traceScores = traces.map((t) => t.score).filter(isFiniteNumber);
+  const scoreUniverse = allScores.length > 0 ? allScores : traceScores;
+  const searchRuns = runs.filter((r) => r.splitTag === "search").length;
+  const holdoutRuns = runs.filter((r) => r.splitTag === "holdout").length;
+  const searchMeanScore = mean8(searchScores);
+  const holdoutMeanScore = mean8(holdoutScores);
+  const metrics = {
+    scenarioCount,
+    searchRuns,
+    holdoutRuns,
+    passRate: passRate(runs, traces, thresholds.failureScoreThreshold),
+    meanScore: mean8(scoreUniverse),
+    searchMeanScore,
+    holdoutMeanScore,
+    overfitGap: safeDiff2(searchMeanScore, holdoutMeanScore),
+    meanCostUsd: mean8([...runs.map((r) => r.costUsd), ...traces.map((t) => t.costUsd).filter(isFiniteNumber)]),
+    p95WallMs: percentile([...runs.map((r) => r.wallMs), ...traces.map((t) => t.durationMs).filter(isFiniteNumber)], 0.95),
+    failedRows: failedRows(runs, traces, thresholds.failureScoreThreshold).length,
+    failuresWithAsi: failedRows(runs, traces, thresholds.failureScoreThreshold).filter((row) => row.hasAsi).length,
+    singleShotTraces: traces.filter((t) => t.turnCount === 1).length,
+    multiShotTraces: traces.filter((t) => (t.turnCount ?? 0) > 1).length,
+    splitCounts,
+    domainCounts: countDomains(scenarios),
+    failureModeCounts: countFailureModes(runs, traces, thresholds.failureScoreThreshold),
+    responsibleSurfaceCounts: countResponsibleSurfaces(traces)
+  };
+  const issues = [];
+  checkCorpus(input, thresholds, metrics, issues);
+  checkQuality(thresholds, metrics, issues);
+  checkGeneralization(input.gateDecision ?? null, thresholds, metrics, issues);
+  checkDiagnostics(thresholds, metrics, issues);
+  checkEfficiency(thresholds, metrics, issues);
+  const axes = buildAxes(metrics, thresholds, input.gateDecision ?? null, issues);
+  const status = issues.some((i) => i.severity === "critical") ? "fail" : issues.length > 0 ? "warn" : "pass";
+  return {
+    target: input.target,
+    candidateId,
+    baselineId: input.baselineId ?? null,
+    status,
+    promote: status === "pass" && (input.gateDecision ? input.gateDecision.promote : true),
+    axes,
+    issues,
+    metrics,
+    dataset: input.dataset ?? null,
+    gateDecision: input.gateDecision ?? null,
+    summary: renderSummary(input.target, status, metrics, issues)
+  };
+}
+function assertReleaseConfidence(input) {
+  const scorecard = evaluateReleaseConfidence(input);
+  if (scorecard.status === "fail") {
+    throw new Error(scorecard.summary);
+  }
+  return scorecard;
+}
+function filterCandidate(runs, candidateId, baselineId) {
+  if (candidateId) return runs.filter((r) => r.candidateId === candidateId);
+  if (baselineId) return runs.filter((r) => r.candidateId !== baselineId);
+  return [...runs];
+}
+function filterTraceCandidate(traces, candidateId, baselineId) {
+  if (candidateId) return traces.filter((t) => t.candidateId === void 0 || t.candidateId === candidateId);
+  if (baselineId) return traces.filter((t) => t.candidateId === void 0 || t.candidateId !== baselineId);
+  return [...traces];
+}
+function checkCorpus(input, thresholds, metrics, issues) {
+  if (thresholds.requireCorpus && !input.dataset && (input.scenarios?.length ?? 0) === 0) {
+    issues.push({ axis: "corpus", severity: "critical", code: "missing_corpus", detail: "No Dataset manifest or scenarios supplied." });
+  }
+  if (metrics.scenarioCount < thresholds.minScenarioCount) {
+    issues.push({ axis: "corpus", severity: "critical", code: "few_scenarios", detail: `${metrics.scenarioCount} scenario(s) < min ${thresholds.minScenarioCount}.` });
+  }
+  if (thresholds.requireHoldout && metrics.splitCounts.holdout === 0) {
+    issues.push({ axis: "corpus", severity: "critical", code: "missing_holdout_split", detail: "Corpus has no holdout scenarios." });
+  }
+}
+function checkQuality(thresholds, metrics, issues) {
+  if (metrics.searchRuns < thresholds.minSearchRuns) {
+    issues.push({ axis: "quality", severity: "critical", code: "few_search_runs", detail: `${metrics.searchRuns} search run(s) < min ${thresholds.minSearchRuns}.` });
+  }
+  if (metrics.passRate < thresholds.minPassRate) {
+    issues.push({ axis: "quality", severity: "critical", code: "low_pass_rate", detail: `passRate ${fmt3(metrics.passRate)} < ${fmt3(thresholds.minPassRate)}.` });
+  }
+  if (metrics.meanScore < thresholds.minMeanScore) {
+    issues.push({ axis: "quality", severity: "critical", code: "low_mean_score", detail: `meanScore ${fmt3(metrics.meanScore)} < ${fmt3(thresholds.minMeanScore)}.` });
+  }
+}
+function checkGeneralization(gateDecision, thresholds, metrics, issues) {
+  if (thresholds.requireHoldout && metrics.holdoutRuns < thresholds.minHoldoutRuns) {
+    issues.push({ axis: "generalization", severity: "critical", code: "few_holdout_runs", detail: `${metrics.holdoutRuns} holdout run(s) < min ${thresholds.minHoldoutRuns}.` });
+  }
+  if (Number.isFinite(metrics.overfitGap) && metrics.overfitGap > thresholds.maxOverfitGap) {
+    issues.push({ axis: "generalization", severity: "critical", code: "overfit_gap", detail: `search-holdout gap ${fmt3(metrics.overfitGap)} > ${fmt3(thresholds.maxOverfitGap)}.` });
+  }
+  if (gateDecision && !gateDecision.promote) {
+    issues.push({ axis: "generalization", severity: "critical", code: `gate_${gateDecision.rejectionCode ?? "reject"}`, detail: gateDecision.reason });
+  }
+}
+function checkDiagnostics(thresholds, metrics, issues) {
+  if (!thresholds.requireAsiForFailures) return;
+  if (metrics.failedRows > metrics.failuresWithAsi) {
+    issues.push({
+      axis: "diagnostics",
+      severity: "critical",
+      code: "missing_failure_asi",
+      detail: `${metrics.failedRows - metrics.failuresWithAsi} failed row(s) have no actionable side information.`
+    });
+  }
+}
+function checkEfficiency(thresholds, metrics, issues) {
+  if (metrics.meanCostUsd > thresholds.maxMeanCostUsd) {
+    issues.push({ axis: "efficiency", severity: "critical", code: "cost_budget", detail: `meanCostUsd ${fmt3(metrics.meanCostUsd)} > ${fmt3(thresholds.maxMeanCostUsd)}.` });
+  }
+  if (metrics.p95WallMs > thresholds.maxP95WallMs) {
+    issues.push({ axis: "efficiency", severity: "critical", code: "latency_budget", detail: `p95WallMs ${fmt3(metrics.p95WallMs)} > ${fmt3(thresholds.maxP95WallMs)}.` });
+  }
+}
+function buildAxes(metrics, thresholds, gateDecision, issues) {
+  return [
+    axis("corpus", issues, bounded(metrics.scenarioCount / Math.max(1, thresholds.minScenarioCount)), `${metrics.scenarioCount} scenarios; holdout=${metrics.splitCounts.holdout}`),
+    axis("quality", issues, Math.min(metrics.passRate, metrics.meanScore), `passRate=${fmt3(metrics.passRate)} meanScore=${fmt3(metrics.meanScore)}`),
+    axis("generalization", issues, gateDecision && !gateDecision.promote ? 0 : gapScore(metrics.overfitGap, thresholds.maxOverfitGap), `holdoutRuns=${metrics.holdoutRuns} overfitGap=${fmt3(metrics.overfitGap)}`),
+    axis("diagnostics", issues, metrics.failedRows === 0 ? 1 : metrics.failuresWithAsi / metrics.failedRows, `failuresWithAsi=${metrics.failuresWithAsi}/${metrics.failedRows}`),
+    axis("efficiency", issues, efficiencyScore(metrics, thresholds), `meanCostUsd=${fmt3(metrics.meanCostUsd)} p95WallMs=${fmt3(metrics.p95WallMs)}`)
+  ];
+}
+function axis(name, issues, score, detail) {
+  const own = issues.filter((i) => i.axis === name);
+  const status = own.some((i) => i.severity === "critical") ? "fail" : own.length > 0 ? "warn" : "pass";
+  return { name, status, score: bounded(score), detail };
+}
+function countScenarioSplits(scenarios) {
+  const counts = { train: 0, dev: 0, test: 0, holdout: 0 };
+  for (const scenario of scenarios) counts[scenario.split ?? "train"]++;
+  return counts;
+}
+function countDomains(scenarios) {
+  const out = {};
+  for (const scenario of scenarios) {
+    const domain = scenario.tags?.domain ?? scenario.tags?.category ?? "uncategorized";
+    out[domain] = (out[domain] ?? 0) + 1;
+  }
+  return out;
+}
+function countFailureModes(runs, traces, threshold) {
+  const out = {};
+  for (const run of runs) {
+    const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
+    if (run.failureMode || score !== void 0 && score < threshold) {
+      const mode = run.failureMode ?? "low_score";
+      out[mode] = (out[mode] ?? 0) + 1;
+    }
+  }
+  for (const trace of traces) {
+    if (trace.failureMode || trace.ok === false || trace.score !== void 0 && trace.score < threshold) {
+      const mode = trace.failureMode ?? (trace.ok === false ? "not_ok" : "low_score");
+      out[mode] = (out[mode] ?? 0) + 1;
+    }
+  }
+  return out;
+}
+function countResponsibleSurfaces(traces) {
+  const out = {};
+  for (const trace of traces) {
+    for (const asi of trace.asi ?? []) {
+      const surface = asi.responsibleSurface ?? "unknown";
+      out[surface] = (out[surface] ?? 0) + 1;
+    }
+  }
+  return out;
+}
+function failedRows(runs, traces, threshold) {
+  const out = [];
+  for (const run of runs) {
+    const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
+    if (run.failureMode || score !== void 0 && score < threshold) {
+      const asiMetric = run.outcome.raw.asi;
+      out.push({ hasAsi: typeof asiMetric === "number" && asiMetric > 0 });
+    }
+  }
+  for (const trace of traces) {
+    if (trace.failureMode || trace.ok === false || trace.score !== void 0 && trace.score < threshold) {
+      out.push({ hasAsi: (trace.asi?.length ?? 0) > 0 });
+    }
+  }
+  return out;
+}
+function passRate(runs, traces, threshold) {
+  const outcomes = [
+    ...runs.map((run) => {
+      const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
+      return !run.failureMode && score !== void 0 && score >= threshold;
+    }),
+    ...traces.map((trace) => trace.ok !== false && (trace.score === void 0 || trace.score >= threshold))
+  ];
+  if (outcomes.length === 0) return 0;
+  return outcomes.filter(Boolean).length / outcomes.length;
+}
+function scoresFor(runs, split) {
+  return runs.filter((run) => run.splitTag === split).map((run) => split === "holdout" ? run.outcome.holdoutScore : run.outcome.searchScore).filter(isFiniteNumber);
+}
+function mean8(xs) {
+  if (xs.length === 0) return Number.NaN;
+  return xs.reduce((sum2, x) => sum2 + x, 0) / xs.length;
+}
+function percentile(xs, p) {
+  if (xs.length === 0) return Number.NaN;
+  const sorted = [...xs].sort((a, b) => a - b);
+  return sorted[Math.min(sorted.length - 1, Math.max(0, Math.ceil(p * sorted.length) - 1))];
+}
+function isFiniteNumber(value) {
+  return typeof value === "number" && Number.isFinite(value);
+}
+function safeDiff2(a, b) {
+  if (!Number.isFinite(a) || !Number.isFinite(b)) return Number.NaN;
+  return a - b;
+}
+function gapScore(gap, maxGap) {
+  if (!Number.isFinite(gap)) return 0;
+  if (maxGap <= 0) return gap <= 0 ? 1 : 0;
+  return bounded(1 - Math.max(0, gap) / maxGap);
+}
+function efficiencyScore(metrics, thresholds) {
+  const cost = Number.isFinite(thresholds.maxMeanCostUsd) && Number.isFinite(metrics.meanCostUsd) ? bounded(thresholds.maxMeanCostUsd / Math.max(metrics.meanCostUsd, 1e-12)) : 1;
+  const latency = Number.isFinite(thresholds.maxP95WallMs) && Number.isFinite(metrics.p95WallMs) ? bounded(thresholds.maxP95WallMs / Math.max(metrics.p95WallMs, 1e-12)) : 1;
+  return Math.min(cost, latency);
+}
+function bounded(x) {
+  if (!Number.isFinite(x)) return 0;
+  return Math.max(0, Math.min(1, x));
+}
+function renderSummary(target, status, metrics, issues) {
+  const prefix = `release confidence ${status}: ${target}`;
+  const metricText = `scenarios=${metrics.scenarioCount} searchRuns=${metrics.searchRuns} holdoutRuns=${metrics.holdoutRuns} passRate=${fmt3(metrics.passRate)} meanScore=${fmt3(metrics.meanScore)}`;
+  if (issues.length === 0) return `${prefix}; ${metricText}`;
+  return `${prefix}; ${metricText}; issues=${issues.map((i) => i.code).join(",")}`;
+}
+function fmt3(x) {
+  if (!Number.isFinite(x)) return String(x);
+  return x.toFixed(4);
+}
 // src/jsonl-trial-cache.ts
 import { appendFileSync as appendFileSync4, existsSync as existsSync6, mkdirSync as mkdirSync4, readFileSync as readFileSync5 } from "fs";
 import { dirname as dirname4 } from "path";
@@ -13605,9 +13741,9 @@ function passOrthogonality(input) {
       sims.push(cosineSimilarity(vectors[i], vectors[j]));
     }
   }
-  const mean9 = sims.length === 0 ? 0 : sims.reduce((a, b) => a + b, 0) / sims.length;
+  const mean10 = sims.length === 0 ? 0 : sims.reduce((a, b) => a + b, 0) / sims.length;
   return {
-    orthogonality: Math.max(0, Math.min(1, 1 - mean9)),
+    orthogonality: Math.max(0, Math.min(1, 1 - mean10)),
     passCount: passes.length,
     similarities: sims
   };
@@ -13653,8 +13789,8 @@ function bootstrapCi(baseline, candidate, options = {}) {
   const iterations = options.iterations ?? 1e3;
   const minTotal = options.minTotalSamples ?? 6;
   const rng = mulberry32(options.seed ?? hashSeed(baseline, candidate));
-  const baselineMean = mean8(baseline);
-  const candidateMean = mean8(candidate);
+  const baselineMean = mean9(baseline);
+  const candidateMean = mean9(candidate);
   const delta = candidateMean - baselineMean;
   if (baseline.length + candidate.length < minTotal || baseline.length === 0 || candidate.length === 0) {
     return {
@@ -13672,7 +13808,7 @@ function bootstrapCi(baseline, candidate, options = {}) {
   for (let i = 0; i < iterations; i++) {
     const bResample = resample(baseline, rng);
     const cResample = resample(candidate, rng);
-    deltas[i] = mean8(cResample) - mean8(bResample);
+    deltas[i] = mean9(cResample) - mean9(bResample);
   }
   deltas.sort((a, b) => a - b);
   const lowerIdx = Math.floor(alpha / 2 * iterations);
@@ -13695,7 +13831,7 @@ function bootstrapCi(baseline, candidate, options = {}) {
     verdict
   };
 }
-function mean8(xs) {
+function mean9(xs) {
   if (xs.length === 0) return 0;
   let s = 0;
   for (const x of xs) s += x;
@@ -13995,12 +14131,10 @@ export {
   Mutex,
   NoopResearcher,
   OTEL_AGENT_EVAL_SCOPE,
-  OptimizationLoop,
   PairwiseSteeringOptimizer,
   PrmGrader,
   ProductClient,
   ProjectRegistry,
-  PromptOptimizer,
   PromptRegistry,
   REDACTION_VERSION,
   RunCritic,
@@ -14021,6 +14155,7 @@ export {
   analyzeAntiSlop,
   analyzeSeries,
   argHash,
+  assertReleaseConfidence,
   assignFeedbackSplit,
   attributeCounterfactuals,
   deterministicSplit as benchmarkDeterministicSplit,
@@ -14091,6 +14226,7 @@ export {
   evaluateContract,
   evaluateHypothesis,
   evaluateOracles,
+  evaluateReleaseConfidence,
   executeScenario,
   expectAgent,
   exportRewardModel,
@@ -14190,6 +14326,7 @@ export {
   regexMatch,
   regexMatches,
   regressionView,
+  releaseTraceEvidenceFromMultiShotTrials,
   renderMarkdown,
   renderMarkdownReport,
   renderPlaybookMarkdown,