npm - @tangle-network/agent-eval - Versions diffs - 0.49.0 → 0.50.0 - Mend

@tangle-network/agent-eval 0.49.0 → 0.50.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48) hide show

package/dist/adapters/http.d.ts +1 -1
package/dist/adapters/langchain.d.ts +1 -1
package/dist/adapters/otel.d.ts +8 -2
package/dist/campaign/index.d.ts +3 -3
package/dist/{chunk-PD3MH6WU.js → chunk-5KSDYBYH.js} +2 -2
package/dist/{chunk-MNL6LXGQ.js → chunk-EGIPWXHL.js} +2 -98
package/dist/chunk-EGIPWXHL.js.map +1 -0
package/dist/{chunk-OYI6RZJK.js → chunk-FQK2CCIM.js} +1 -1
package/dist/chunk-FQK2CCIM.js.map +1 -0
package/dist/chunk-MAZ26DC7.js +99 -0
package/dist/chunk-MAZ26DC7.js.map +1 -0
package/dist/chunk-SHTXZ4O2.js +113 -0
package/dist/chunk-SHTXZ4O2.js.map +1 -0
package/dist/{chunk-KQ26DYTQ.js → chunk-UBQGWD3O.js} +2 -2
package/dist/contract/index.d.ts +206 -9
package/dist/contract/index.js +751 -3
package/dist/contract/index.js.map +1 -1
package/dist/governance/index.d.ts +1 -1
package/dist/hosted/index.d.ts +8 -192
package/dist/hosted/index.js +1 -1
package/dist/index-BRxz6qov.d.ts +409 -0
package/dist/index.d.ts +18 -462
package/dist/index.js +14 -106
package/dist/index.js.map +1 -1
package/dist/meta-eval/index.d.ts +3 -3
package/dist/openapi.json +1 -1
package/dist/{outcome-store-BxJ3DQKJ.d.ts → outcome-store-D6KWmYvj.d.ts} +1 -1
package/dist/registry-8KAs18kY.d.ts +457 -0
package/dist/{release-report-DBB8lB1P.d.ts → release-report-DSu0DWy8.d.ts} +3 -296
package/dist/reporting.d.ts +6 -4
package/dist/reporting.js +6 -4
package/dist/{researcher-CHMO56K0.d.ts → researcher-LZD0qHEa.d.ts} +1 -1
package/dist/rl.d.ts +9 -8
package/dist/rl.js +3 -2
package/dist/rl.js.map +1 -1
package/dist/{rubric-predictive-validity-CJ08tGwq.d.ts → rubric-predictive-validity-ByZEC3BX.d.ts} +1 -1
package/dist/{run-improvement-loop-B-L8GgpW.d.ts → run-improvement-loop-BPMjNKMJ.d.ts} +2 -2
package/dist/sequential-5iSVfzl2.d.ts +139 -0
package/dist/store-CJbzDxZ2.d.ts +220 -0
package/dist/{sequential-CbFH___X.d.ts → summary-report-B7gNRX-r.d.ts} +1 -139
package/dist/traces.d.ts +3 -220
package/dist/{types-8u72Gc76.d.ts → types-Dbj5gu8n.d.ts} +1 -1
package/dist/types-DhqpAi_z.d.ts +296 -0
package/package.json +1 -1
package/dist/chunk-MNL6LXGQ.js.map +0 -1
package/dist/chunk-OYI6RZJK.js.map +0 -1
/package/dist/{chunk-PD3MH6WU.js.map → chunk-5KSDYBYH.js.map} +0 -0
/package/dist/{chunk-KQ26DYTQ.js.map → chunk-UBQGWD3O.js.map} +0 -0

package/dist/contract/index.js CHANGED Viewed

@@ -14,20 +14,538 @@ import {
 } from "../chunk-J3EIOI3O.js";
 import {
   createHostedClient
-} from "../chunk-OYI6RZJK.js";
+} from "../chunk-FQK2CCIM.js";
+import {
+  checkCanaries
+} from "../chunk-SHTXZ4O2.js";
 import "../chunk-N4SBKEPJ.js";
 import "../chunk-YV7J7X5N.js";
 import {
   FileSystemOutcomeStore,
   InMemoryOutcomeStore
 } from "../chunk-3RF76KTD.js";
-import "../chunk-WP7SY7AI.js";
+import {
+  paretoChart
+} from "../chunk-EGIPWXHL.js";
+import {
+  cohensD,
+  pairedBootstrap,
+  pairedMde,
+  pairedTTest,
+  requiredSampleSize
+} from "../chunk-WP7SY7AI.js";
 import "../chunk-GGE4NNQT.js";
+import "../chunk-47X6LRCE.js";
+import "../chunk-5BKGXME7.js";
+import "../chunk-VSMTAMNK.js";
 import "../chunk-VXNVVBZO.js";
 import "../chunk-PC4UYEBM.js";
 import "../chunk-QYJT52YW.js";
 import "../chunk-NSBPE2FW.js";
+// src/contract/analyze-runs.ts
+/**
+ * Produces the aggregate insight report for a batch of run records.
+ *
+ * Resolves which score split to read, summarises the composite, per-dimension
+ * and cost distributions, and attaches the optional sections (inter-rater
+ * agreement, failure clusters, contamination, outcome correlation) only when
+ * the matching option is supplied.
+ *
+ * @param opts - `runs` plus optional `histogramBins` (default 12),
+ *   `decisionThreshold` (default 0.02), `split` (default "auto"),
+ *   `raterScores`, `baselineCandidateId`, `candidateCandidateId`, `analyst`,
+ *   `canaryScenarios` and `outcomeSignal`.
+ * @returns The report object; optional sections are `undefined` when skipped.
+ */
+async function analyzeRuns(opts) {
+  const { runs } = opts;
+  const binCount = opts.histogramBins ?? 12;
+  const decisionThreshold = opts.decisionThreshold ?? 0.02;
+  const activeSplit = resolveSplit(runs, opts.split ?? "auto");
+  const compositeScores = runs.map((run) => compositeOf(run, activeSplit)).filter(Number.isFinite);
+  const composite = distributionOf(compositeScores, binCount);
+  const perDimension = computePerDimension(runs, binCount);
+  const finiteCosts = runs.map((run) => run.costUsd).filter(Number.isFinite);
+  const costQuality = {
+    cost: distributionOf(finiteCosts, binCount),
+    pareto: paretoChart(runs, { split: activeSplit })
+  };
+  const judges = computeJudgeInsights(runs);
+  let interRater;
+  if (opts.raterScores) {
+    interRater = computeInterRater(opts.raterScores);
+  }
+  const lift = computeLift(runs, opts.baselineCandidateId, opts.candidateCandidateId, activeSplit);
+  let failureClusters;
+  if (opts.analyst) {
+    failureClusters = await computeFailureClusters(runs, opts.analyst, activeSplit);
+  }
+  let contamination;
+  if (opts.canaryScenarios) {
+    contamination = computeContamination(runs, opts.canaryScenarios);
+  }
+  let outcomeCorrelation;
+  if (opts.outcomeSignal) {
+    outcomeCorrelation = computeOutcomeCorrelation(runs, opts.outcomeSignal, activeSplit);
+  }
+  const release = buildReleaseScorecard(composite, lift, contamination);
+  const recommendations = buildRecommendations({
+    composite,
+    judges,
+    interRater,
+    lift,
+    failureClusters,
+    contamination,
+    outcomeCorrelation,
+    threshold: decisionThreshold
+  });
+  return {
+    n: runs.length,
+    composite,
+    perDimension,
+    costQuality,
+    judges,
+    interRater,
+    lift,
+    failureClusters,
+    contamination,
+    outcomeCorrelation,
+    release,
+    recommendations
+  };
+}
+/**
+ * Resolves the split preference to a concrete split name.
+ *
+ * Any preference other than "auto" is returned unchanged. "auto" picks
+ * "holdout" when at least one run has a finite `outcome.holdoutScore`,
+ * otherwise "search".
+ */
+function resolveSplit(runs, pref) {
+  if (pref === "auto") {
+    const anyHoldout = runs.some(({ outcome }) => Number.isFinite(outcome.holdoutScore));
+    return anyHoldout ? "holdout" : "search";
+  }
+  return pref;
+}
+/**
+ * Returns the run's score for the requested split, falling back to the other
+ * split's score when the requested one is not a finite number. Yields NaN when
+ * neither score is finite.
+ */
+function compositeOf(run, split) {
+  const { holdoutScore, searchScore } = run.outcome;
+  const [preferred, fallback] = split === "holdout" ? [holdoutScore, searchScore] : [searchScore, holdoutScore];
+  if (Number.isFinite(preferred)) return preferred;
+  if (Number.isFinite(fallback)) return fallback;
+  return Number.NaN;
+}
+/**
+ * Summarises a list of numbers: count, mean, median, 95th percentile,
+ * population standard deviation (divides by n), min, max and a histogram.
+ * An empty list produces an all-zero summary with an empty histogram.
+ */
+function distributionOf(values, bins) {
+  const n = values.length;
+  if (n === 0) {
+    return { n: 0, mean: 0, p50: 0, p95: 0, stddev: 0, min: 0, max: 0, histogram: [] };
+  }
+  // Work on a sorted copy so the caller's array is left untouched.
+  const ordered = [...values].sort((x, y) => x - y);
+  let total = 0;
+  for (const value of ordered) total += value;
+  const average = total / n;
+  let squaredDeviation = 0;
+  for (const value of ordered) squaredDeviation += (value - average) ** 2;
+  return {
+    n,
+    mean: average,
+    p50: percentile(ordered, 0.5),
+    p95: percentile(ordered, 0.95),
+    stddev: Math.sqrt(squaredDeviation / n),
+    min: ordered[0],
+    max: ordered[n - 1],
+    histogram: histogram(ordered, bins)
+  };
+}
+/**
+ * Linearly interpolated quantile of an ascending-sorted array.
+ *
+ * @param sorted - values in ascending order
+ * @param q - quantile in [0, 1]
+ * @returns 0 for an empty array, the sole element for a single-element array,
+ *   otherwise the value at fractional position `(length - 1) * q`.
+ */
+function percentile(sorted, q) {
+  const count = sorted.length;
+  if (count === 0) return 0;
+  if (count === 1) return sorted[0];
+  const position = (count - 1) * q;
+  const below = Math.floor(position);
+  const above = Math.ceil(position);
+  if (below === above) return sorted[below];
+  const weight = position - below;
+  return sorted[below] * (1 - weight) + sorted[above] * weight;
+}
+/**
+ * Buckets an ascending-sorted array into equal-width bins spanning [min, max].
+ *
+ * @param sorted - values in ascending order
+ * @param bins - requested bin count; fractional values are floored
+ * @returns `{ lo, hi, count }` per bin. Returns `[]` when there are no values
+ *   or the bin count is not a finite number >= 1, and a single bin holding
+ *   every value when all values are equal.
+ */
+function histogram(sorted, bins) {
+  // Normalise the bin count first: a fractional count used to produce a
+  // fractional bucket index and a NaN count slipped past the `< 1` guard,
+  // both of which crashed on `out[idx].count++`.
+  const binCount = Math.floor(bins);
+  if (sorted.length === 0 || !Number.isFinite(binCount) || binCount < 1) return [];
+  const min = sorted[0];
+  const max = sorted[sorted.length - 1];
+  if (min === max) return [{ lo: min, hi: max, count: sorted.length }];
+  const width = (max - min) / binCount;
+  const out = [];
+  for (let i = 0; i < binCount; i++) {
+    const lo = min + i * width;
+    // Pin the last upper edge to max so rounding cannot leave a gap.
+    const hi = i === binCount - 1 ? max : lo + width;
+    out.push({ lo, hi, count: 0 });
+  }
+  for (const v of sorted) {
+    // The maximum value lands exactly on the upper edge; clamp it into the last bin.
+    const idx = Math.min(binCount - 1, Math.floor((v - min) / width));
+    out[idx].count++;
+  }
+  return out;
+}
+/**
+ * Collects every finite `outcome.judgeScores.perDimMean` value per dimension
+ * across the runs and summarises each dimension with `distributionOf`.
+ * Runs without judge scores are ignored.
+ */
+function computePerDimension(runs, bins) {
+  const valuesByDim = new Map();
+  for (const run of runs) {
+    const judgeScores = run.outcome.judgeScores;
+    if (!judgeScores) continue;
+    const dimMeans = judgeScores.perDimMean ?? {};
+    for (const dim of Object.keys(dimMeans)) {
+      const value = dimMeans[dim];
+      if (!Number.isFinite(value)) continue;
+      if (!valuesByDim.has(dim)) valuesByDim.set(dim, []);
+      valuesByDim.get(dim).push(value);
+    }
+  }
+  const summary = {};
+  valuesByDim.forEach((values, dim) => {
+    summary[dim] = distributionOf(values, bins);
+  });
+  return summary;
+}
+/**
+ * Per-judge summary: for every run, a judge's finite dimension scores are
+ * averaged into one value; the result reports how many runs each judge scored
+ * (`n`) and the mean of those per-run averages (`meanScore`).
+ */
+function computeJudgeInsights(runs) {
+  const runMeansByJudge = new Map();
+  for (const run of runs) {
+    const perJudge = run.outcome.judgeScores?.perJudge;
+    if (!perJudge) continue;
+    for (const judgeId of Object.keys(perJudge)) {
+      const finite = Object.values(perJudge[judgeId]).filter(Number.isFinite);
+      if (finite.length === 0) continue;
+      let total = 0;
+      for (const value of finite) total += value;
+      if (!runMeansByJudge.has(judgeId)) runMeansByJudge.set(judgeId, []);
+      runMeansByJudge.get(judgeId).push(total / finite.length);
+    }
+  }
+  const insights = {};
+  runMeansByJudge.forEach((runMeans, judgeId) => {
+    let total = 0;
+    for (const value of runMeans) total += value;
+    insights[judgeId] = { n: runMeans.length, meanScore: total / runMeans.length };
+  });
+  return insights;
+}
+/**
+ * Measures agreement between raters on the runs that every rater scored.
+ *
+ * Note that `kappa` is the mean of the pairwise Pearson correlations in
+ * `perPair`, not a chance-corrected kappa statistic.
+ *
+ * @param ratings - rows of `{ runId, rater, score }`; rows whose score is not
+ *   a finite number are ignored entirely
+ * @returns `undefined` when fewer than two raters have a usable rating or no
+ *   run was rated by all of them; otherwise the rater count, the number of
+ *   jointly rated runs, `kappa`, `perPair` (keyed `"a::b"`) and up to 20
+ *   `disagreementCases` ordered by widest score range first.
+ */
+function computeInterRater(ratings) {
+  const byRun = /* @__PURE__ */ new Map();
+  // Only raters with at least one finite score count. Previously a rater whose
+  // scores were all invalid stayed in the rater set, so no run could ever be
+  // "jointly rated" and the whole section silently vanished.
+  const raters = /* @__PURE__ */ new Set();
+  for (const r of ratings) {
+    if (!Number.isFinite(r.score)) continue;
+    raters.add(r.rater);
+    const list = byRun.get(r.runId) ?? [];
+    list.push({ rater: r.rater, score: r.score });
+    byRun.set(r.runId, list);
+  }
+  if (raters.size < 2) return void 0;
+  const jointlyRated = [];
+  const scoresByRun = /* @__PURE__ */ new Map();
+  for (const [runId, ratersForRun] of byRun) {
+    // One lookup table per run; the first rating a rater gave for a run wins.
+    const scoreByRater = /* @__PURE__ */ new Map();
+    for (const { rater, score } of ratersForRun) {
+      if (!scoreByRater.has(rater)) scoreByRater.set(rater, score);
+    }
+    if (scoreByRater.size === raters.size) {
+      jointlyRated.push(runId);
+      scoresByRun.set(runId, scoreByRater);
+    }
+  }
+  if (jointlyRated.length === 0) return void 0;
+  const raterList = [...raters].sort();
+  const perPair = {};
+  for (let i = 0; i < raterList.length; i++) {
+    for (let j = i + 1; j < raterList.length; j++) {
+      const a = raterList[i];
+      const b = raterList[j];
+      const aScores = [];
+      const bScores = [];
+      for (const runId of jointlyRated) {
+        const scoreByRater = scoresByRun.get(runId);
+        aScores.push(scoreByRater.get(a));
+        bScores.push(scoreByRater.get(b));
+      }
+      perPair[`${a}::${b}`] = pearson(aScores, bScores);
+    }
+  }
+  const pairKappas = Object.values(perPair);
+  const kappa = pairKappas.length === 0 ? 0 : pairKappas.reduce((s, v) => s + v, 0) / pairKappas.length;
+  const disagreementCases = jointlyRated.map((runId) => {
+    const ratersForRun = byRun.get(runId);
+    const scores = ratersForRun.map((r) => r.score);
+    const range = Math.max(...scores) - Math.min(...scores);
+    return { runId, ratings: ratersForRun, range };
+  }).sort((a, b) => b.range - a.range).slice(0, 20);
+  return {
+    raters: raters.size,
+    jointlyRated: jointlyRated.length,
+    kappa,
+    perPair,
+    disagreementCases
+  };
+}
+/**
+ * Pearson correlation coefficient of two equal-length numeric arrays.
+ * Returns 0 when the arrays are empty, differ in length, or either one has
+ * zero variance.
+ */
+function pearson(a, b) {
+  const n = a.length;
+  if (n === 0 || n !== b.length) return 0;
+  let sumA = 0;
+  for (const value of a) sumA += value;
+  let sumB = 0;
+  for (const value of b) sumB += value;
+  const meanA = sumA / n;
+  const meanB = sumB / n;
+  let cross = 0;
+  let squaresA = 0;
+  let squaresB = 0;
+  a.forEach((valueA, i) => {
+    const devA = valueA - meanA;
+    const devB = b[i] - meanB;
+    cross += devA * devB;
+    squaresA += devA * devA;
+    squaresB += devB * devB;
+  });
+  const scale = Math.sqrt(squaresA * squaresB);
+  if (scale === 0) return 0;
+  return cross / scale;
+}
+/**
+ * Estimates the paired lift of a candidate over a baseline.
+ *
+ * When either id is missing and the runs contain exactly two candidate ids,
+ * the id with the lower mean composite is treated as the baseline. Runs are
+ * paired by `pairingKey`; if that yields no usable pair, runs are paired by
+ * position instead.
+ *
+ * @param runs - run records
+ * @param baselineId - candidate id of the baseline (optional)
+ * @param candidateId - candidate id of the candidate (optional)
+ * @param split - which score split `compositeOf` should read
+ * @returns `undefined` when no baseline/candidate pair can be formed,
+ *   otherwise means, delta, bootstrap CI, p-value, pair count, Cohen's d,
+ *   MDE and required sample size.
+ */
+function computeLift(runs, baselineId, candidateId, split) {
+  let bId = baselineId;
+  let cId = candidateId;
+  if (!bId || !cId) {
+    const ids = [...new Set(runs.map((r) => r.candidateId))];
+    if (ids.length !== 2) return void 0;
+    const [idA, idB] = ids;
+    // Average finite composites only: a single unscored run used to turn the
+    // mean into NaN, which made the `<=` comparison below pick arbitrarily.
+    const finiteMean = (id) => mean(
+      runs.filter((r) => r.candidateId === id).map((r) => compositeOf(r, split)).filter(Number.isFinite)
+    );
+    const meanA = finiteMean(idA);
+    const meanB = finiteMean(idB);
+    bId = meanA <= meanB ? idA : idB;
+    cId = meanA <= meanB ? idB : idA;
+  }
+  const baseline = runs.filter((r) => r.candidateId === bId);
+  const candidate = runs.filter((r) => r.candidateId === cId);
+  if (baseline.length === 0 || candidate.length === 0) return void 0;
+  // Queue baseline runs per pairing key and hand each one out at most once.
+  // A plain key -> run map kept only the last run per key, so when keys repeat
+  // (e.g. every run carries the same experimentId and seed) all candidate runs
+  // were compared against one single baseline run.
+  const baselineByKey = /* @__PURE__ */ new Map();
+  for (const run of baseline) {
+    const key = pairingKey(run);
+    const slot = baselineByKey.get(key);
+    if (slot) slot.runs.push(run);
+    else baselineByKey.set(key, { runs: [run], next: 0 });
+  }
+  const pairedBaseline = [];
+  const pairedCandidate = [];
+  let usedKeyPairing = false;
+  for (const cand of candidate) {
+    const slot = baselineByKey.get(pairingKey(cand));
+    if (!slot || slot.next >= slot.runs.length) continue;
+    const b = slot.runs[slot.next++];
+    const bC = compositeOf(b, split);
+    const cC = compositeOf(cand, split);
+    if (Number.isFinite(bC) && Number.isFinite(cC)) {
+      pairedBaseline.push(bC);
+      pairedCandidate.push(cC);
+      usedKeyPairing = true;
+    }
+  }
+  if (!usedKeyPairing) {
+    // No key produced a usable pair: fall back to pairing by position.
+    const n = Math.min(baseline.length, candidate.length);
+    for (let i = 0; i < n; i++) {
+      const bC = compositeOf(baseline[i], split);
+      const cC = compositeOf(candidate[i], split);
+      if (Number.isFinite(bC) && Number.isFinite(cC)) {
+        pairedBaseline.push(bC);
+        pairedCandidate.push(cC);
+      }
+    }
+  }
+  if (pairedBaseline.length === 0) return void 0;
+  const baselineMean = mean(pairedBaseline);
+  const candidateMean = mean(pairedCandidate);
+  const delta = candidateMean - baselineMean;
+  const bootstrap = pairedBootstrap(pairedBaseline, pairedCandidate, {
+    confidence: 0.95,
+    resamples: 2e3,
+    statistic: "mean"
+  });
+  const tTest = pairedTTest(pairedBaseline, pairedCandidate);
+  const d = cohensD(pairedBaseline, pairedCandidate);
+  const mde = pairedMde({ nPaired: pairedBaseline.length, power: 0.8, alpha: 0.05 });
+  const requiredN = requiredSampleSize({
+    // Floor the effect so a zero delta never reaches the sample-size formula.
+    effect: Math.max(Math.abs(delta), 1e-6),
+    power: 0.8,
+    alpha: 0.05
+  });
+  return {
+    baselineMean,
+    candidateMean,
+    delta,
+    ci95: [bootstrap.low, bootstrap.high],
+    pValue: tTest.p,
+    n: pairedBaseline.length,
+    cohensD: d,
+    mde,
+    requiredN
+  };
+}
+/** Key used to match a baseline run with a candidate run: `experimentId::seed`. */
+function pairingKey(r) {
+  const { experimentId, seed } = r;
+  return "".concat(experimentId, "::", seed);
+}
+/** Arithmetic mean of a numeric array; an empty array yields 0. */
+function mean(arr) {
+  if (arr.length === 0) return 0;
+  let total = 0;
+  for (const value of arr) total += value;
+  return total / arr.length;
+}
+/**
+ * Groups failed runs into clusters using the analyst's findings.
+ *
+ * A run counts as failed when its composite is below 0.5 or it carries a
+ * `failureMode`. Each failed run is sent to `analyst.run`; every finding files
+ * the run under `finding.area`, else `finding.analyst_id`, else
+ * "unclassified". Runs whose analysis throws are filed under "analyst-error"
+ * rather than aborting the whole report.
+ *
+ * @returns `{ clusters, totalFailures }` with clusters sorted by share,
+ *   largest first. `share` is the fraction of failed runs in the cluster; a
+ *   run may belong to several clusters, so shares can sum to more than 1.
+ *   `exemplars` holds at most 5 run ids per cluster.
+ */
+async function computeFailureClusters(runs, analyst, split) {
+  const MAX_EXEMPLARS = 5;
+  const failed = runs.filter((r) => compositeOf(r, split) < 0.5 || r.failureMode !== void 0);
+  if (failed.length === 0) return { clusters: [], totalFailures: 0 };
+  const clusters = /* @__PURE__ */ new Map();
+  for (const run of failed) {
+    // Count each run once per cluster even when several findings share a key.
+    const keysForRun = /* @__PURE__ */ new Set();
+    const fileUnder = (key) => {
+      if (keysForRun.has(key)) return;
+      keysForRun.add(key);
+      let cluster = clusters.get(key);
+      if (!cluster) {
+        cluster = { count: 0, exemplars: [] };
+        clusters.set(key, cluster);
+      }
+      // Track the true member count separately from the capped exemplar list;
+      // deriving share from the exemplars limited it to 5 / totalFailures.
+      cluster.count++;
+      if (cluster.exemplars.length < MAX_EXEMPLARS) cluster.exemplars.push(run.runId);
+    };
+    try {
+      const result = await analyst.run(run.runId, {
+        kind: "run-record",
+        run
+      });
+      for (const finding of result.findings) {
+        fileUnder(finding.area || finding.analyst_id || "unclassified");
+      }
+    } catch {
+      // Best effort: keep the failure visible instead of losing the report.
+      fileUnder("analyst-error");
+    }
+  }
+  const clusterList = [...clusters.entries()].map(([id, c]) => ({
+    id,
+    name: id,
+    share: c.count / failed.length,
+    exemplars: c.exemplars
+  }));
+  clusterList.sort((a, b) => b.share - a.share);
+  return { clusters: clusterList, totalFailures: failed.length };
+}
+/**
+ * Checks each run's textual output against the supplied canaries.
+ *
+ * Runs without a string output are skipped. Every hit reported by
+ * `checkCanaries` becomes one entry in `details`; `leaks` is the total number
+ * of hits and `holdoutAuditPassed` is true only when there are none.
+ */
+function computeContamination(runs, canaries) {
+  const details = [];
+  for (const run of runs) {
+    const text = stringifyOutput(run);
+    if (!text) continue;
+    for (const hit of checkCanaries(text, canaries)) {
+      details.push({ runId: run.runId, canary: hit.canary, matched: hit.evidence });
+    }
+  }
+  const leaks = details.length;
+  return { leaks, holdoutAuditPassed: leaks === 0, details };
+}
+/**
+ * Returns the run's textual output from `metadata.output`, or failing that
+ * `metadata.text`; `undefined` when neither is a string.
+ */
+function stringifyOutput(run) {
+  const { metadata } = run;
+  for (const field of ["output", "text"]) {
+    const candidate = metadata?.[field];
+    if (typeof candidate === "string") return candidate;
+  }
+  return void 0;
+}
+/**
+ * Relates composite scores to a downstream outcome metric.
+ *
+ * Uses only runs that have both a finite composite and a finite entry in
+ * `outcome.valueByRunId`. Returns `undefined` with fewer than three such runs;
+ * otherwise the Pearson and Spearman correlations plus a least-squares line
+ * (`intercept`, `slope`, `r2`) predicting the outcome from the score.
+ */
+function computeOutcomeCorrelation(runs, outcome, split) {
+  const scores = [];
+  const outcomes = [];
+  for (const run of runs) {
+    const observed = outcome.valueByRunId[run.runId];
+    // A missing entry is undefined, which is not finite either.
+    if (!Number.isFinite(observed)) continue;
+    const score = compositeOf(run, split);
+    if (!Number.isFinite(score)) continue;
+    scores.push(score);
+    outcomes.push(observed);
+  }
+  const n = scores.length;
+  if (n < 3) return void 0;
+  const pearsonR = pearson(scores, outcomes);
+  const spearmanRho = spearman(scores, outcomes);
+  const meanScore = mean(scores);
+  const meanOutcome = mean(outcomes);
+  let covariance = 0;
+  let scoreSpread = 0;
+  scores.forEach((score, i) => {
+    covariance += (score - meanScore) * (outcomes[i] - meanOutcome);
+    scoreSpread += (score - meanScore) ** 2;
+  });
+  const slope = scoreSpread === 0 ? 0 : covariance / scoreSpread;
+  const intercept = meanOutcome - slope * meanScore;
+  let ssTot = 0;
+  let ssRes = 0;
+  outcomes.forEach((observed, i) => {
+    ssTot += (observed - meanOutcome) ** 2;
+    ssRes += (observed - (intercept + slope * scores[i])) ** 2;
+  });
+  const r2 = ssTot === 0 ? 0 : 1 - ssRes / ssTot;
+  return {
+    metric: outcome.metric,
+    n,
+    pearson: pearsonR,
+    spearman: spearmanRho,
+    rewardModel: { intercept, slope, r2 }
+  };
+}
+/**
+ * Spearman rank correlation: the Pearson correlation of the two arrays'
+ * ranks. Returns 0 for empty or unequal-length input.
+ */
+function spearman(a, b) {
+  const comparable = a.length === b.length && a.length !== 0;
+  return comparable ? pearson(rank(a), rank(b)) : 0;
+}
+/**
+ * Assigns 1-based ranks to the values of `arr`, in the original element
+ * order. Equal values share the average of the ranks they span.
+ */
+function rank(arr) {
+  // Sort positions rather than values so ranks can be written back in place.
+  const order = arr.map((_, position) => position).sort((p, q) => arr[p] - arr[q]);
+  const ranks = new Array(arr.length).fill(0);
+  let start = 0;
+  while (start < order.length) {
+    let end = start;
+    while (end + 1 < order.length && arr[order[end + 1]] === arr[order[start]]) end++;
+    // Positions start..end are 0-based, hence the +2 to average 1-based ranks.
+    const shared = (start + end + 2) / 2;
+    for (let k = start; k <= end; k++) ranks[order[k]] = shared;
+    start = end + 1;
+  }
+  return ranks;
+}
+/**
+ * Builds the three-axis release scorecard (quality lift, contamination,
+ * composite distribution).
+ *
+ * - quality-lift: pass when no lift is available or the CI lower bound is
+ *   above 0; warn when only the delta is positive; fail otherwise.
+ * - contamination: fail as soon as one canary leak was recorded.
+ * - composite-distribution: pass at mean >= 0.5, warn at mean >= 0.3.
+ *
+ * The overall status is the worst status of any axis.
+ */
+function buildReleaseScorecard(composite, lift, contamination) {
+  let liftStatus;
+  if (lift === void 0 || lift.ci95[0] > 0) {
+    liftStatus = "pass";
+  } else if (lift.delta > 0) {
+    liftStatus = "warn";
+  } else {
+    liftStatus = "fail";
+  }
+  let liftDetail = "no baseline/candidate pair available";
+  if (lift) {
+    liftDetail = `delta=${lift.delta.toFixed(3)}, CI95=[${lift.ci95[0].toFixed(3)}, ${lift.ci95[1].toFixed(3)}], n=${lift.n}`;
+  }
+  const leakFree = contamination === void 0 || contamination.leaks === 0;
+  let compositeStatus = "fail";
+  if (composite.mean >= 0.5) {
+    compositeStatus = "pass";
+  } else if (composite.mean >= 0.3) {
+    compositeStatus = "warn";
+  }
+  const axes = [
+    { name: "quality-lift", status: liftStatus, detail: liftDetail },
+    {
+      name: "contamination",
+      status: leakFree ? "pass" : "fail",
+      detail: contamination ? `${contamination.leaks} canary leak(s)` : "no canaries supplied"
+    },
+    {
+      name: "composite-distribution",
+      status: compositeStatus,
+      detail: `mean=${composite.mean.toFixed(3)}, p50=${composite.p50.toFixed(3)}, p95=${composite.p95.toFixed(3)} over n=${composite.n}`
+    }
+  ];
+  const statuses = axes.map((axis) => axis.status);
+  let status = "pass";
+  if (statuses.includes("fail")) {
+    status = "fail";
+  } else if (statuses.includes("warn")) {
+    status = "warn";
+  }
+  return { status, axes, issues: [] };
+}
+/**
+ * Turns the computed report sections into an ordered list of recommendations.
+ *
+ * Order of emission: lift verdict (ship / expand-corpus / hold), canary
+ * leaks, low inter-rater agreement (< 0.5), the largest failure cluster, and
+ * weak score-to-outcome coupling (|Spearman| < 0.3). Sections missing from
+ * `ctx` are skipped.
+ */
+function buildRecommendations(ctx) {
+  const { lift, contamination, interRater, failureClusters, outcomeCorrelation, threshold } = ctx;
+  const recommendations = [];
+  if (lift) {
+    const [lower, upper] = lift.ci95;
+    if (lower > threshold) {
+      // The whole interval clears the decision threshold.
+      recommendations.push({
+        priority: "critical",
+        kind: "ship",
+        title: `Ship \u2014 lift ${lift.delta.toFixed(3)} (95% CI ${lower.toFixed(3)}..${upper.toFixed(3)})`,
+        detail: `Holdout lift exceeds threshold ${threshold} with 95% bootstrap confidence (n=${lift.n}, p=${lift.pValue.toFixed(4)}, d=${lift.cohensD.toFixed(2)}).`,
+        evidencePath: "lift"
+      });
+    } else if (lower <= threshold && upper > threshold) {
+      // The interval straddles the threshold.
+      recommendations.push({
+        priority: "high",
+        kind: "expand-corpus",
+        title: `Inconclusive \u2014 need ~${lift.requiredN} paired runs (have ${lift.n}) at current effect size`,
+        detail: `CI straddles threshold. Current MDE at 80% power is ${lift.mde.toFixed(3)}; observed delta is ${lift.delta.toFixed(3)}.`,
+        evidencePath: "lift"
+      });
+    } else {
+      recommendations.push({
+        priority: "critical",
+        kind: "hold",
+        title: `Hold \u2014 lift CI lower bound ${lower.toFixed(3)} is at or below threshold ${threshold}`,
+        detail: `Bootstrap CI provides no statistical evidence the candidate is better. Consider tightening the mutation or expanding the holdout.`,
+        evidencePath: "lift"
+      });
+    }
+  }
+  if (contamination && contamination.leaks > 0) {
+    const plural = contamination.leaks === 1 ? "" : "s";
+    recommendations.push({
+      priority: "critical",
+      kind: "fix",
+      title: `${contamination.leaks} canary leak${plural} detected`,
+      detail: `Holdout integrity is compromised. The lift number is unreliable until you investigate.`,
+      evidencePath: "contamination"
+    });
+  }
+  if (interRater && interRater.kappa < 0.5) {
+    recommendations.push({
+      priority: "high",
+      kind: "recalibrate",
+      title: `Inter-rater agreement \u03BA=${interRater.kappa.toFixed(2)} is below 0.5`,
+      detail: `Raters disagree on what 'good' looks like. Top disagreement cases listed in interRater.disagreementCases \u2014 consider a triage meeting or refining the rubric.`,
+      evidencePath: "interRater"
+    });
+  }
+  if (failureClusters && failureClusters.clusters.length > 0) {
+    const [largest] = failureClusters.clusters;
+    recommendations.push({
+      priority: "high",
+      kind: "investigate",
+      title: `Top failure cluster: ${largest.name} (${(largest.share * 100).toFixed(0)}% of failures)`,
+      detail: `${failureClusters.totalFailures} runs failed. The largest cluster groups ${largest.exemplars.length} exemplars under '${largest.name}'.`,
+      evidencePath: "failureClusters.clusters[0]"
+    });
+  }
+  if (outcomeCorrelation && Math.abs(outcomeCorrelation.spearman) < 0.3) {
+    const { metric } = outcomeCorrelation;
+    recommendations.push({
+      priority: "medium",
+      kind: "recalibrate",
+      title: `Judge scores decoupled from ${metric} (Spearman \u03C1=${outcomeCorrelation.spearman.toFixed(2)})`,
+      detail: `Your judges score what they were trained to score, but it isn't predicting downstream ${metric}. Consider retraining the judge against ${metric} as the gold signal.`,
+      evidencePath: "outcomeCorrelation"
+    });
+  }
+  return recommendations;
+}
 // src/contract/self-improve.ts
 function splitTrainHoldout(scenarios, fraction) {
   function hash(s) {
@@ -139,6 +657,14 @@ async function selfImprove(opts) {
     (sum, gen) => sum + gen.surfaces.reduce((s, sf) => s + sf.campaign.aggregates.totalCostUsd, 0),
     0
   );
+  const insight = await analyzeRuns({
+    runs: [
+      ...cellsToRunRecords(result.baselineCampaign.cells, "baseline", runDir),
+      ...cellsToRunRecords(result.winnerOnHoldout.cells, "winner", runDir)
+    ],
+    baselineCandidateId: "baseline",
+    candidateCandidateId: "winner"
+  });
   const summary = {
     baseline,
     winner: {
@@ -150,6 +676,7 @@ async function selfImprove(opts) {
     generationsExplored: result.generations.length,
     durationMs: Date.now() - startedAt,
     totalCostUsd: totalCost,
+    insight,
     raw: result
   };
   if (opts.hostedTenant) {
@@ -212,7 +739,8 @@ async function shipEvalRunToHosted(tenant, opts, summary, raw, runDir) {
     gateDecision: summary.gateDecision,
     holdoutLift: summary.lift,
     totalCostUsd: summary.totalCostUsd,
-    totalDurationMs: summary.durationMs
+    totalDurationMs: summary.durationMs,
+    insightReport: summary.insight
   };
   await client.ingestEvalRun(event);
 }
@@ -228,12 +756,232 @@ function hashString(s) {
   }
   return h.toString(16).padStart(8, "0");
 }
+/**
+ * Converts campaign cells into run records that `analyzeRuns` can consume.
+ *
+ * Per cell, judge dimension scores are copied into `perJudge`, averaged per
+ * dimension over the finite values, and the finite judge composites are
+ * averaged into the record's `holdoutScore` (0 when no judge supplied one).
+ *
+ * @param cells - campaign cells
+ * @param candidateId - candidate id stamped on every record
+ * @param runId - used as `experimentId` and as the record id prefix
+ */
+function cellsToRunRecords(cells, candidateId, runId) {
+  const toRecord = (cell) => {
+    const perJudge = {};
+    const dimTotals = {};
+    const finiteComposites = [];
+    for (const judgeId of Object.keys(cell.judgeScores)) {
+      const score = cell.judgeScores[judgeId];
+      perJudge[judgeId] = { ...score.dimensions };
+      for (const dim of Object.keys(score.dimensions)) {
+        const value = score.dimensions[dim];
+        if (!Number.isFinite(value)) continue;
+        const totals = dimTotals[dim] ?? { sum: 0, n: 0 };
+        totals.sum += value;
+        totals.n += 1;
+        dimTotals[dim] = totals;
+      }
+      if (Number.isFinite(score.composite)) finiteComposites.push(score.composite);
+    }
+    const perDimMean = {};
+    for (const dim of Object.keys(dimTotals)) {
+      const { sum, n } = dimTotals[dim];
+      perDimMean[dim] = n === 0 ? 0 : sum / n;
+    }
+    let compositeTotal = 0;
+    for (const value of finiteComposites) compositeTotal += value;
+    const composite = finiteComposites.length === 0 ? 0 : compositeTotal / finiteComposites.length;
+    // analyzeRuns pairs runs on (experimentId, seed), whereas cells are
+    // identified by (scenarioId, rep): fold the first six characters of the
+    // scenario hash into an unsigned 32-bit number and offset it by the rep.
+    let scenarioFold = 0;
+    for (const ch of hashString(cell.scenarioId).slice(0, 6).split("")) {
+      scenarioFold = (scenarioFold * 31 + ch.charCodeAt(0)) >>> 0;
+    }
+    const record = {
+      runId: `${runId}::${candidateId}::${cell.cellId}`,
+      experimentId: runId,
+      candidateId,
+      seed: cell.rep * 1e6 + scenarioFold,
+      model: "campaign-cell",
+      promptHash: "sha256:cell",
+      configHash: "sha256:cell",
+      commitSha: "cell",
+      wallMs: cell.durationMs,
+      costUsd: cell.costUsd,
+      tokenUsage: { input: 0, output: 0 },
+      outcome: {
+        holdoutScore: composite,
+        raw: {},
+        judgeScores: { perJudge, perDimMean, composite }
+      },
+      splitTag: "holdout"
+    };
+    if (cell.error) record.failureMode = cell.error;
+    return record;
+  };
+  return cells.map((cell) => toRecord(cell));
+}
+// src/contract/intake/feedback-table.ts
+/**
+ * Converts a table of human ratings into run records plus per-rater scores.
+ *
+ * Ratings are normalised first: booleans become 1/0, numbers are mapped onto
+ * [0, 1] when `scale` has distinct `min`/`max`, and anything that is not a
+ * finite number is dropped. Each run's score is the mean of its normalised
+ * ratings; runs with no usable rating are omitted.
+ *
+ * @param opts - `ratings` rows (`runId`, `rater`, `rating`), optional `meta`
+ *   rows keyed by `runId`, optional `scale` (`min`, `max`) and
+ *   `emitRaterScores` (default true)
+ * @returns `{ runs, raterScores }`
+ */
+function fromFeedbackTable(opts) {
+  const { ratings, meta = [], scale, emitRaterScores = true } = opts;
+  const metaByRun = new Map();
+  for (const entry of meta) metaByRun.set(entry.runId, entry);
+  const toUnitScore = (rating) => {
+    if (typeof rating === "boolean") return rating ? 1 : 0;
+    if (!Number.isFinite(rating)) return Number.NaN;
+    // Without a usable scale the rating is taken as already normalised.
+    if (!scale || scale.max === scale.min) return rating;
+    return (rating - scale.min) / (scale.max - scale.min);
+  };
+  const rowsByRun = new Map();
+  for (const row of ratings) {
+    if (!rowsByRun.has(row.runId)) rowsByRun.set(row.runId, []);
+    rowsByRun.get(row.runId).push(row);
+  }
+  const runs = [];
+  const raterScores = [];
+  rowsByRun.forEach((rows, runId) => {
+    const scored = [];
+    for (const row of rows) {
+      const score = toUnitScore(row.rating);
+      if (Number.isFinite(score)) scored.push({ rater: row.rater, score });
+    }
+    if (scored.length === 0) return;
+    let total = 0;
+    for (const entry of scored) total += entry.score;
+    const meanScore = total / scored.length;
+    const runMeta = metaByRun.get(runId) ?? { runId };
+    const perJudge = Object.fromEntries(scored.map((entry) => [entry.rater, { rating: entry.score }]));
+    const raw = Object.fromEntries(scored.map((entry) => [`rater:${entry.rater}`, entry.score]));
+    runs.push({
+      runId,
+      experimentId: runMeta.experimentId ?? "feedback-corpus",
+      candidateId: runMeta.candidateId ?? runId,
+      seed: 0,
+      model: runMeta.model ?? "unknown@unknown",
+      promptHash: runMeta.promptHash ?? "sha256:unknown",
+      configHash: runMeta.configHash ?? "sha256:unknown",
+      commitSha: runMeta.commitSha ?? "unknown",
+      wallMs: runMeta.wallMs ?? 0,
+      costUsd: runMeta.costUsd ?? 0,
+      tokenUsage: { input: 0, output: 0 },
+      outcome: {
+        // The feedback score is stored as the holdout score, which is what
+        // the holdout split reads.
+        holdoutScore: meanScore,
+        raw,
+        judgeScores: {
+          perJudge,
+          perDimMean: { rating: meanScore },
+          composite: meanScore
+        }
+      },
+      splitTag: runMeta.splitTag ?? "holdout"
+    });
+    if (emitRaterScores) {
+      for (const entry of scored) raterScores.push({ runId, rater: entry.rater, score: entry.score });
+    }
+  });
+  return { runs, raterScores };
+}
+// src/contract/intake/otel-spans.ts
+// Span-attribute keys consulted by fromOtelSpans. readAttrNumber and
+// readAttrString walk a group's spans in order and, within each span, try the
+// keys in the order listed here; the first usable value wins.
+var SCORE_KEYS = ["tangle.score", "eval.score", "score"];
+var MODEL_KEYS = ["tangle.model", "gen_ai.request.model", "llm.model", "model"];
+var COST_KEYS = ["tangle.cost.usd", "gen_ai.usage.cost_usd", "cost.usd", "cost"];
+var INPUT_TOKEN_KEYS = ["gen_ai.usage.input_tokens", "tangle.tokens.in", "tokens.in"];
+var OUTPUT_TOKEN_KEYS = ["gen_ai.usage.output_tokens", "tangle.tokens.out", "tokens.out"];
+var PROMPT_HASH_KEYS = ["tangle.prompt_hash", "prompt.hash"];
+var CONFIG_HASH_KEYS = ["tangle.config_hash", "config.hash"];
+/**
+ * Converts OpenTelemetry spans into run records, one record per span group.
+ *
+ * Model, cost, token counts, hashes and score are read from span attributes
+ * using the key lists above; missing values fall back to neutral defaults.
+ * The score is stored as `searchScore` when `defaultSplit` is "search" and as
+ * `holdoutScore` otherwise. A span whose status code is "ERROR" marks the run
+ * as failed, using that span's name as the failure mode.
+ *
+ * @param opts - `spans`, optional `defaultSplit` (default "holdout") and
+ *   `experimentId` (default "otel-corpus")
+ * @returns the run records, in group order
+ */
+function fromOtelSpans(opts) {
+  const { spans, defaultSplit = "holdout", experimentId = "otel-corpus" } = opts;
+  const scoreField = defaultSplit === "search" ? "searchScore" : "holdoutScore";
+  const runs = [];
+  groupSpans(spans).forEach((members, groupKey) => {
+    const root = findRoot(members);
+    if (!root) return;
+    const elapsedNanos = root.endTimeUnixNano - root.startTimeUnixNano;
+    const score = readAttrNumber(members, SCORE_KEYS);
+    const outcome = { [scoreField]: score, raw: collectNumericAttrs(members) };
+    if (score !== void 0) {
+      outcome.judgeScores = {
+        perJudge: { "otel-derived": { score } },
+        perDimMean: { score },
+        composite: score
+      };
+    }
+    const record = {
+      runId: groupKey,
+      experimentId,
+      candidateId: root.attributes["tangle.candidateId"] ?? "otel-default",
+      seed: 0,
+      model: readAttrString(members, MODEL_KEYS) ?? "unknown@unknown",
+      promptHash: readAttrString(members, PROMPT_HASH_KEYS) ?? "sha256:unknown",
+      configHash: readAttrString(members, CONFIG_HASH_KEYS) ?? "sha256:unknown",
+      commitSha: root.attributes["tangle.commit_sha"] ?? "unknown",
+      // Nanoseconds to milliseconds, never negative.
+      wallMs: Math.max(0, elapsedNanos / 1e6),
+      costUsd: readAttrNumber(members, COST_KEYS) ?? 0,
+      tokenUsage: {
+        input: readAttrNumber(members, INPUT_TOKEN_KEYS) ?? 0,
+        output: readAttrNumber(members, OUTPUT_TOKEN_KEYS) ?? 0
+      },
+      outcome,
+      splitTag: defaultSplit
+    };
+    const errorSpan = members.find((span) => span.status?.code === "ERROR");
+    if (errorSpan) record.failureMode = errorSpan.name;
+    runs.push(record);
+  });
+  return runs;
+}
+/**
+ * Groups spans into runs.
+ *
+ * The group key is the span's top-level `tangle.runId` field, else the
+ * `tangle.runId` span attribute, else the span's `traceId`. The attribute
+ * fallback matches how the other `tangle.*` values in this file are read,
+ * namely from `span.attributes`.
+ *
+ * @returns a Map from group key to that group's spans, in first-seen order
+ */
+function groupSpans(spans) {
+  const groups = /* @__PURE__ */ new Map();
+  for (const span of spans) {
+    const key = span["tangle.runId"] ?? span.attributes?.["tangle.runId"] ?? span.traceId;
+    const members = groups.get(key) ?? [];
+    members.push(span);
+    groups.set(key, members);
+  }
+  return groups;
+}
+/**
+ * Picks the root span of a group: the first span without a `parentSpanId`,
+ * or the group's first span when every span has a parent.
+ */
+function findRoot(group) {
+  for (const span of group) {
+    if (!span.parentSpanId) return span;
+  }
+  return group[0];
+}
+/**
+ * Returns the first non-empty string attribute found under any of `keys`,
+ * scanning spans in order and, within a span, keys in order. `undefined`
+ * when nothing matches.
+ */
+function readAttrString(spans, keys) {
+  const isUsable = (value) => typeof value === "string" && value.length > 0;
+  for (const { attributes } of spans) {
+    const hit = keys.map((key) => attributes[key]).find(isUsable);
+    if (hit !== void 0) return hit;
+  }
+  return void 0;
+}
+/**
+ * Returns the first finite numeric attribute found under any of `keys`,
+ * scanning spans in order and, within a span, keys in order. Numeric strings
+ * are parsed; blank strings are ignored.
+ *
+ * @returns the number, or `undefined` when no usable value exists
+ */
+function readAttrNumber(spans, keys) {
+  for (const span of spans) {
+    for (const key of keys) {
+      const v = span.attributes[key];
+      if (typeof v === "number" && Number.isFinite(v)) return v;
+      // Number("") and Number("   ") are 0, so a blank attribute must be
+      // skipped rather than reported as a genuine zero.
+      if (typeof v === "string" && v.trim() !== "") {
+        const parsed = Number(v);
+        if (Number.isFinite(parsed)) return parsed;
+      }
+    }
+  }
+  return void 0;
+}
+/**
+ * Merges every finite numeric attribute of the given spans into one object.
+ * When several spans define the same key, the last span wins.
+ */
+function collectNumericAttrs(spans) {
+  const numeric = {};
+  for (const { attributes } of spans) {
+    for (const name of Object.keys(attributes)) {
+      const value = attributes[name];
+      if (typeof value !== "number" || !Number.isFinite(value)) continue;
+      numeric[name] = value;
+    }
+  }
+  return numeric;
+}
 export {
   FileSystemOutcomeStore,
   InMemoryOutcomeStore,
+  analyzeRuns,
   composeGate,
   defaultProductionGate,
   evolutionaryDriver,
+  fromFeedbackTable,
+  fromOtelSpans,
   fsCampaignStorage,
   gepaDriver,
   heldOutGate,