npm - @tangle-network/agent-eval - Versions diffs - 0.5.0 → 0.6.0 - Mend

@tangle-network/agent-eval 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/dist/index.js CHANGED Viewed

@@ -410,7 +410,7 @@ function confidenceInterval(scores, confidence = 0.95) {
   if (scores.length === 0) return { mean: 0, lower: 0, upper: 0 };
   if (scores.length === 1) return { mean: scores[0], lower: scores[0], upper: scores[0] };
   const n = scores.length;
-  const mean2 = scores.reduce((a, b) => a + b, 0) / n;
+  const mean3 = scores.reduce((a, b) => a + b, 0) / n;
   const B = 1e3;
   const bootstrapMeans = [];
   for (let i = 0; i < B; i++) {
@@ -425,7 +425,7 @@ function confidenceInterval(scores, confidence = 0.95) {
   const lowerIdx = Math.floor(alpha / 2 * B);
   const upperIdx = Math.floor((1 - alpha / 2) * B) - 1;
   return {
-    mean: mean2,
+    mean: mean3,
     lower: bootstrapMeans[lowerIdx],
     upper: bootstrapMeans[Math.min(upperIdx, B - 1)]
   };
@@ -479,18 +479,18 @@ function mannWhitneyU(a, b) {
     ...a.map((v) => ({ v, group: "a" })),
     ...b.map((v) => ({ v, group: "b" }))
   ].sort((x, y) => x.v - y.v);
-  const ranks2 = new Array(combined.length);
+  const ranks3 = new Array(combined.length);
   let i = 0;
   while (i < combined.length) {
     let j = i;
     while (j < combined.length && combined[j].v === combined[i].v) j++;
     const avgRank = (i + 1 + j) / 2;
-    for (let k = i; k < j; k++) ranks2[k] = avgRank;
+    for (let k = i; k < j; k++) ranks3[k] = avgRank;
     i = j;
   }
   let r1 = 0;
   for (let k = 0; k < combined.length; k++) {
-    if (combined[k].group === "a") r1 += ranks2[k];
+    if (combined[k].group === "a") r1 += ranks3[k];
   }
   const u1 = r1 - n1 * (n1 + 1) / 2;
   const u2 = n1 * n2 - u1;
@@ -513,11 +513,11 @@ function pairedTTest(before, after) {
   const n = before.length;
   if (n < 2) return { t: 0, df: 0, p: 1 };
   const diffs = before.map((b, i) => after[i] - b);
-  const mean2 = diffs.reduce((a, b) => a + b, 0) / n;
-  const variance2 = diffs.reduce((acc, d) => acc + (d - mean2) ** 2, 0) / (n - 1);
+  const mean3 = diffs.reduce((a, b) => a + b, 0) / n;
+  const variance2 = diffs.reduce((acc, d) => acc + (d - mean3) ** 2, 0) / (n - 1);
   const se = Math.sqrt(variance2 / n);
-  if (se === 0) return { t: mean2 === 0 ? 0 : Infinity, df: n - 1, p: mean2 === 0 ? 1 : 0 };
-  const t = mean2 / se;
+  if (se === 0) return { t: mean3 === 0 ? 0 : Infinity, df: n - 1, p: mean3 === 0 ? 1 : 0 };
+  const t = mean3 / se;
   const df = n - 1;
   const p = 2 * (1 - studentTCdf(Math.abs(t), df));
   return { t, df, p };
@@ -530,20 +530,20 @@ function wilcoxonSignedRank(before, after) {
   const n = diffs.length;
   if (n < 6) return { w: 0, p: 1 };
   const absRanks = diffs.map((d, i2) => ({ abs: Math.abs(d), sign: Math.sign(d), i: i2 })).sort((a, b) => a.abs - b.abs);
-  const ranks2 = new Array(n);
+  const ranks3 = new Array(n);
   let i = 0;
   while (i < n) {
     let j = i;
     while (j < n && absRanks[j].abs === absRanks[i].abs) j++;
     const avg = (i + 1 + j) / 2;
-    for (let k = i; k < j; k++) ranks2[absRanks[k].i] = avg;
+    for (let k = i; k < j; k++) ranks3[absRanks[k].i] = avg;
     i = j;
   }
   let wPlus = 0;
-  for (let k = 0; k < n; k++) if (diffs[k] > 0) wPlus += ranks2[k];
-  const mean2 = n * (n + 1) / 4;
+  for (let k = 0; k < n; k++) if (diffs[k] > 0) wPlus += ranks3[k];
+  const mean3 = n * (n + 1) / 4;
   const variance2 = n * (n + 1) * (2 * n + 1) / 24;
-  const z = (wPlus - mean2) / Math.sqrt(variance2);
+  const z = (wPlus - mean3) / Math.sqrt(variance2);
   const p = 2 * (1 - normalCdf(Math.abs(z)));
   return { w: wPlus, p };
 }
@@ -1531,24 +1531,24 @@ function analyzeAntiSlop(outputs, config) {
       }
     }
     for (const re of config.hedgingPatterns) {
-      const matches = output.match(new RegExp(re, re.flags.includes("g") ? re.flags : re.flags + "g"));
-      if (matches) {
-        counts.hedging += matches.length;
+      const matches2 = output.match(new RegExp(re, re.flags.includes("g") ? re.flags : re.flags + "g"));
+      if (matches2) {
+        counts.hedging += matches2.length;
         issues.push({
           category: "hedging",
-          detail: `${matches.length}x ${re.source}`,
-          example: matches[0]
+          detail: `${matches2.length}x ${re.source}`,
+          example: matches2[0]
         });
       }
     }
     for (const re of config.apologyPatterns) {
-      const matches = output.match(new RegExp(re, re.flags.includes("g") ? re.flags : re.flags + "g"));
-      if (matches) {
-        counts.apology += matches.length;
+      const matches2 = output.match(new RegExp(re, re.flags.includes("g") ? re.flags : re.flags + "g"));
+      if (matches2) {
+        counts.apology += matches2.length;
         issues.push({
           category: "apology",
-          detail: `${matches.length}x ${re.source}`,
-          example: matches[0]
+          detail: `${matches2.length}x ${re.source}`,
+          example: matches2[0]
         });
       }
     }
@@ -4076,10 +4076,10 @@ function analyzeSeries(values, options = {}) {
     return { state: "insufficient-data", windowMean: 0, windowCv: 0, tailRun: 0, stable: false };
   }
   const tail = values.slice(-window);
-  const mean2 = tail.reduce((a, b) => a + b, 0) / tail.length;
-  const variance2 = tail.reduce((acc, v) => acc + (v - mean2) ** 2, 0) / tail.length;
+  const mean3 = tail.reduce((a, b) => a + b, 0) / tail.length;
+  const variance2 = tail.reduce((acc, v) => acc + (v - mean3) ** 2, 0) / tail.length;
   const stdDev = Math.sqrt(variance2);
-  const refMean = Math.abs(mean2) > 1e-9 ? Math.abs(mean2) : 1;
+  const refMean = Math.abs(mean3) > 1e-9 ? Math.abs(mean3) : 1;
   const cv = stdDev / refMean;
   const stable = tail.length >= window && cv <= stableCv;
   let tailRun = 0;
@@ -4100,7 +4100,7 @@ function analyzeSeries(values, options = {}) {
   } else {
     state = "noisy";
   }
-  return { state, windowMean: mean2, windowCv: cv, tailRun, stable };
+  return { state, windowMean: mean3, windowCv: cv, tailRun, stable };
 }
 // src/state-continuity.ts
@@ -5028,12 +5028,12 @@ async function paraphraseRobustness(prompt, mutators, scoreFn, options = {}) {
     variantScores.push({ mutator: id, score, mutated });
     all.push(score);
   }
-  const mean2 = all.reduce((a, b) => a + b, 0) / all.length;
-  const variance2 = all.reduce((a, v) => a + (v - mean2) ** 2, 0) / all.length;
+  const mean3 = all.reduce((a, b) => a + b, 0) / all.length;
+  const variance2 = all.reduce((a, v) => a + (v - mean3) ** 2, 0) / all.length;
   const stdDev = Math.sqrt(variance2);
-  const ref = Math.abs(mean2) > 1e-9 ? Math.abs(mean2) : 1;
+  const ref = Math.abs(mean3) > 1e-9 ? Math.abs(mean3) : 1;
   const robustness = Math.max(0, 1 - stdDev / ref);
-  return { originalScore, variantScores, meanScore: mean2, stdDev, robustness };
+  return { originalScore, variantScores, meanScore: mean3, stdDev, robustness };
 }
 var lowercaseMutator = (p) => p.toLowerCase();
 var sentenceReorderMutator = (p, seed) => {
@@ -5445,6 +5445,1546 @@ var ProjectRegistry = class {
     return out;
   }
 };
+// src/meta-eval/outcome-store.ts
/**
 * Outcome store that keeps every record in a private in-process array.
 *
 * Records are shallow-copied on the way in and on the way out, so callers
 * cannot mutate stored top-level fields through the objects they hold.
 */
var InMemoryOutcomeStore = class {
  items = [];

  /** Store a shallow copy of `outcome`. */
  async append(outcome) {
    const snapshot = { ...outcome };
    this.items.push(snapshot);
  }

  /** Return shallow copies of every stored outcome whose `runId` equals `runId`. */
  async forRun(runId) {
    const found = [];
    for (const item of this.items) {
      if (item.runId === runId) found.push({ ...item });
    }
    return found;
  }

  /** Return shallow copies of every stored outcome accepted by `matches(item, filter)`. */
  async list(filter = {}) {
    const found = [];
    for (const item of this.items) {
      if (matches(item, filter)) found.push({ ...item });
    }
    return found;
  }
};
/**
 * Outcome store persisted as newline-delimited JSON inside `options.dir`.
 *
 * New records are appended to `outcomes.ndjson`; once that file reaches
 * `options.maxBytes` (default 32 MiB) it is renamed to
 * `outcomes.<timestamp>.ndjson` before the next append. Reads go through an
 * in-memory copy that is built on first use and kept in sync by `append`.
 */
var FileSystemOutcomeStore = class {
  dir;
  maxBytes;
  memo;
  loaded = false;

  /**
   * @param {{ dir: string, maxBytes?: number }} options
   */
  constructor(options) {
    this.dir = options.dir;
    this.maxBytes = options.maxBytes ?? 32 * 1024 * 1024;
  }

  /** Create the storage directory (and any missing parents). */
  async ensureDir() {
    const fs = await import("fs/promises");
    await fs.mkdir(this.dir, { recursive: true });
  }

  /** Append one outcome to the active file, rotating it first when it is full. */
  async append(outcome) {
    await this.ensureDir();
    const fs = await import("fs/promises");
    const path = await import("path");
    const active = path.join(this.dir, "outcomes.ndjson");
    try {
      const stat = await fs.stat(active);
      if (stat.size >= this.maxBytes) {
        await fs.rename(active, path.join(this.dir, `outcomes.${Date.now()}.ndjson`));
      }
    } catch {
      // Rotation is best-effort: the active file usually does not exist yet on
      // the first append, and a failed rename must not block the write below.
    }
    await fs.appendFile(active, JSON.stringify(outcome) + "\n", "utf8");
    if (this.memo) await this.memo.append(outcome);
  }

  /**
   * Build (once) and return the in-memory copy of everything on disk.
   *
   * Files are read in sorted name order so the result does not depend on
   * directory enumeration order. An unreadable file or a malformed line is
   * skipped on its own instead of discarding every record that follows it.
   */
  async load() {
    if (this.loaded && this.memo) return this.memo;
    const fs = await import("fs/promises");
    const path = await import("path");
    const memo = new InMemoryOutcomeStore();
    let entries = [];
    try {
      entries = await fs.readdir(this.dir);
    } catch {
      // A directory that cannot be listed (typically: not created yet) is
      // treated as an empty store.
    }
    const files = entries.filter((file) => file.endsWith(".ndjson")).sort();
    for (const file of files) {
      let content;
      try {
        content = await fs.readFile(path.join(this.dir, file), "utf8");
      } catch {
        // The file may have been rotated or removed since readdir; skip it.
        continue;
      }
      for (const line of content.split("\n")) {
        if (!line.trim()) continue;
        let record;
        try {
          record = JSON.parse(line);
        } catch {
          // e.g. a partially written trailing line; keep the remaining records.
          continue;
        }
        await memo.append(record);
      }
    }
    this.memo = memo;
    this.loaded = true;
    return memo;
  }

  /** Outcomes recorded for `runId`. */
  async forRun(runId) {
    return (await this.load()).forRun(runId);
  }

  /** Outcomes accepted by `filter` (delegated to the in-memory copy). */
  async list(filter) {
    return (await this.load()).list(filter);
  }
};
/**
 * Decide whether outcome `o` satisfies filter `f`.
 *
 * Every criterion present on `f` must hold: `runIds` membership, the
 * inclusive `since`/`until` bounds on `capturedAt`, an exact `source`, and
 * an exact `labels[label.key] === label.value`. An empty filter accepts all.
 */
function matches(o, f) {
  const rejected =
    (f.runIds && !f.runIds.includes(o.runId)) ||
    (f.since !== void 0 && o.capturedAt < f.since) ||
    (f.until !== void 0 && o.capturedAt > f.until) ||
    (f.source && o.source !== f.source) ||
    (f.label && o.labels?.[f.label.key] !== f.label.value);
  return !rejected;
}
+// src/meta-eval/correlation-study.ts
/**
 * Correlate eval metrics with recorded outcome metrics across runs.
 *
 * For every run that has at least one outcome captured within
 * `options.maxCaptureLagMs` of `run.startedAt`, each eval metric is extracted
 * (via `em.extract`, or the default extractor for `em.id`) and paired with
 * each outcome metric reduced per `options.reduction` ("latest" by default).
 * Pairs with fewer than 3 samples are omitted from the result.
 *
 * @param traceStore          must provide `listRuns()`; also passed to extractors
 * @param outcomeStore        must provide `list(filter)`
 * @param evalMetrics         `{ id, extract? }` descriptors
 * @param outcomeMetricNames  keys looked up in `outcome.metrics`
 * @param options             `outcomeFilter`, `reduction`, `maxCaptureLagMs`, `bootstrapIterations`
 * @returns `{ pairs, joinedSamples, skippedRuns }`
 */
async function correlationStudy(traceStore, outcomeStore, evalMetrics, outcomeMetricNames, options = {}) {
  const runs = await traceStore.listRuns();
  const outcomes = await outcomeStore.list(options.outcomeFilter);
  const outcomesByRun = new Map();
  for (const o of outcomes) {
    const arr = outcomesByRun.get(o.runId) ?? [];
    arr.push(o);
    outcomesByRun.set(o.runId, arr);
  }
  const reduction = options.reduction ?? "latest";
  const maxLag = options.maxCaptureLagMs ?? Infinity;
  const pairs = [];
  // evalMetric id -> (outcomeMetric -> pair); avoids a linear scan of `pairs`
  // for every sample. The first pair registered for a key wins.
  const pairLookup = new Map();
  for (const em of evalMetrics) {
    for (const om of outcomeMetricNames) {
      const pair = { evalMetric: em.id, outcomeMetric: om, xs: [], ys: [] };
      pairs.push(pair);
      let byOutcome = pairLookup.get(em.id);
      if (!byOutcome) {
        byOutcome = new Map();
        pairLookup.set(em.id, byOutcome);
      }
      if (!byOutcome.has(om)) byOutcome.set(om, pair);
    }
  }
  let joined = 0;
  let skipped = 0;
  for (const run of runs) {
    const os = outcomesByRun.get(run.runId);
    if (!os || os.length === 0) {
      skipped++;
      continue;
    }
    const eligible = os.filter((o) => o.capturedAt - run.startedAt <= maxLag);
    if (eligible.length === 0) {
      skipped++;
      continue;
    }
    for (const em of evalMetrics) {
      const extract = em.extract ?? defaultExtract3(em.id);
      const x = await extract(run, traceStore);
      if (x === null || !Number.isFinite(x)) continue;
      for (const om of outcomeMetricNames) {
        const values = eligible.map((o) => o.metrics[om]).filter((v) => typeof v === "number" && Number.isFinite(v));
        if (values.length === 0) continue;
        // Pass the metric name so "latest" reads this metric, not another one.
        const y = reduce(values, reduction, eligible, om);
        if (y === null) continue;
        const pair = pairLookup.get(em.id).get(om);
        pair.xs.push(x);
        pair.ys.push(y);
      }
    }
    joined++;
  }
  const results = pairs.filter((p) => p.xs.length >= 3).map((p) => {
    const pearson2 = pearsonR3(p.xs, p.ys);
    const spearman = pearsonR3(ranks2(p.xs), ranks2(p.ys));
    const pearsonCi95 = bootstrapPearsonCi(p.xs, p.ys, options.bootstrapIterations ?? 500);
    const verdict = Math.abs(pearson2) >= 0.7 ? "strong" : Math.abs(pearson2) >= 0.4 ? "moderate" : "weak";
    return { evalMetric: p.evalMetric, outcomeMetric: p.outcomeMetric, n: p.xs.length, pearson: pearson2, spearman, pearsonCi95, verdict };
  });
  return { pairs: results, joinedSamples: joined, skippedRuns: skipped };
}
/**
 * Collapse the outcome values of one run into a single number.
 *
 * @param values    finite values of the metric, one per outcome that has it
 * @param kind      "mean", "max", or anything else for "latest"
 * @param outcomes  the outcomes `values` were taken from
 * @param metric    optional metric name; when given, "latest" returns that
 *                  metric's value from the most recently captured outcome
 *                  carrying a finite value for it
 * @returns the reduced value, or null when nothing can be reduced
 */
function reduce(values, kind, outcomes, metric) {
  if (values.length === 0) return null;
  if (kind === "mean") return values.reduce((a, b) => a + b, 0) / values.length;
  if (kind === "max") return Math.max(...values);
  if (metric !== void 0) {
    let latestValue = null;
    let latestAt = -Infinity;
    for (const o of outcomes) {
      const candidate = o.metrics[metric];
      if (typeof candidate !== "number" || !Number.isFinite(candidate)) continue;
      // Strict comparison keeps the earliest-listed outcome on timestamp ties.
      if (latestValue === null || o.capturedAt > latestAt) {
        latestValue = candidate;
        latestAt = o.capturedAt;
      }
    }
    return latestValue;
  }
  // Legacy three-argument form: without the metric name the values cannot be
  // re-aligned with their outcomes, so the original first-key heuristic is
  // kept unchanged for such callers.
  const latest = [...outcomes].sort((a, b) => b.capturedAt - a.capturedAt)[0];
  const v = latest?.metrics[Object.keys(latest.metrics)[0]];
  const paired = outcomes.map((o) => ({ at: o.capturedAt, v: values.find((x) => o.metrics[Object.keys(o.metrics)[0]] === x) })).filter((p) => p.v !== void 0);
  if (paired.length === 0) return v ?? null;
  return paired.sort((a, b) => b.at - a.at)[0].v ?? null;
}
/**
 * Pearson correlation coefficient of two equally long numeric arrays.
 *
 * @returns NaN when the lengths differ or there are fewer than 2 points;
 *          1 when both series are constant; 0 when exactly one is constant;
 *          otherwise covariance / sqrt(varA * varB).
 */
function pearsonR3(a, b) {
  const count = a.length;
  if (count !== b.length || count < 2) return NaN;
  let totalA = 0;
  for (let i = 0; i < count; i++) totalA += a[i];
  let totalB = 0;
  for (let i = 0; i < count; i++) totalB += b[i];
  const meanA = totalA / count;
  const meanB = totalB / count;
  let cross = 0;
  let squaresA = 0;
  let squaresB = 0;
  for (let i = 0; i < count; i++) {
    const devA = a[i] - meanA;
    const devB = b[i] - meanB;
    cross += devA * devB;
    squaresA += devA * devA;
    squaresB += devB * devB;
  }
  const flatA = squaresA === 0;
  const flatB = squaresB === 0;
  if (flatA && flatB) return 1;
  if (flatA || flatB) return 0;
  return cross / Math.sqrt(squaresA * squaresB);
}
/**
 * 1-based ranks of `xs`, with tied values sharing the average of the ranks
 * they span. The result is aligned with the input order.
 */
function ranks2(xs) {
  const order = xs.map((value, index) => ({ value, index }));
  order.sort((p, q) => p.value - q.value);
  const out = new Array(xs.length);
  let start = 0;
  while (start < order.length) {
    // Extend `end` over the run of values equal to the one at `start`.
    let end = start;
    while (end + 1 < order.length && order[end + 1].value === order[start].value) end += 1;
    const shared = (start + end + 2) / 2;
    for (let k = start; k <= end; k++) out[order[k].index] = shared;
    start = end + 1;
  }
  return out;
}
/**
 * Percentile bootstrap interval (2.5% / 97.5%) for the Pearson correlation
 * of paired samples `xs` / `ys`.
 *
 * Each of the `iterations` rounds resamples pairs with replacement using
 * `Math.random`; rounds whose correlation is not finite are discarded.
 *
 * @returns `{ lower, upper }`; both NaN when there are fewer than 3 pairs
 *          or no round produced a finite correlation
 */
function bootstrapPearsonCi(xs, ys, iterations) {
  const size = xs.length;
  if (size < 3) return { lower: NaN, upper: NaN };
  const estimates = [];
  let round = 0;
  while (round < iterations) {
    const sampleX = new Array(size);
    const sampleY = new Array(size);
    for (let slot = 0; slot < size; slot++) {
      const pick = Math.floor(Math.random() * size);
      sampleX[slot] = xs[pick];
      sampleY[slot] = ys[pick];
    }
    const estimate = pearsonR3(sampleX, sampleY);
    if (Number.isFinite(estimate)) estimates.push(estimate);
    round += 1;
  }
  if (estimates.length === 0) return { lower: NaN, upper: NaN };
  estimates.sort((p, q) => p - q);
  const lowerAt = Math.floor(0.025 * estimates.length);
  const upperAt = Math.min(estimates.length - 1, Math.floor(0.975 * estimates.length));
  return { lower: estimates[lowerAt], upper: estimates[upperAt] };
}
/**
 * Build the default extractor for a well-known eval metric id.
 *
 * Supported ids: "score" / "overallScore" (run.outcome.score), "pass"
 * (1 when run.outcome.pass is exactly true, else 0), "durationMs"
 * (endedAt - startedAt), "costUsd" and "inputTokens" (aggregated over the
 * run's LLM spans). Any other id yields an extractor that returns null.
 *
 * @param metric  eval metric id
 * @returns async `(run, store) => number | null`
 */
function defaultExtract3(metric) {
  return async (run, store) => {
    switch (metric) {
      case "score":
      case "overallScore":
        return run.outcome?.score ?? null;
      case "pass":
        return run.outcome?.pass === true ? 1 : 0;
      case "durationMs":
        // Compare against null/undefined rather than truthiness so that a
        // timestamp of 0 still counts as present.
        return run.endedAt != null && run.startedAt != null ? run.endedAt - run.startedAt : null;
      case "costUsd": {
        const llm = await llmSpans(store, run.runId);
        return aggregateLlm(llm).costUsd;
      }
      case "inputTokens": {
        const llm = await llmSpans(store, run.runId);
        return aggregateLlm(llm).inputTokens;
      }
      default:
        return null;
    }
  };
}
+// src/meta-eval/calibration.ts
/**
 * Bin (eval metric, outcome metric) pairs and measure how far the mean
 * outcome in each bin is from the mean eval value.
 *
 * One pair is produced per run that has outcomes, a finite extracted eval
 * value, and a finite `outcomeMetric` on its most recently captured outcome.
 *
 * @param options  `bins` (default 10), `binning` ("equal-width" default, or
 *                 "equal-frequency"), `range: { lo, hi }` for equal-width
 * @returns null when fewer than 2 pairs exist or the equal-width bin width
 *          is 0; otherwise `{ evalMetric, outcomeMetric, n, bins, ece, maxGap }`
 */
async function calibrationCurve(traceStore, outcomeStore, evalMetric, outcomeMetric, options = {}) {
  const runs = await traceStore.listRuns();
  const outcomes = await outcomeStore.list();
  const outcomesByRun = new Map();
  for (const outcome of outcomes) {
    const bucket = outcomesByRun.get(outcome.runId) ?? [];
    bucket.push(outcome);
    outcomesByRun.set(outcome.runId, bucket);
  }

  const extract = evalMetric.extract ?? defaultExtract4(evalMetric.id);
  const pairs = [];
  for (const run of runs) {
    const runOutcomes = outcomesByRun.get(run.runId);
    if (!runOutcomes?.length) continue;
    const x = await extract(run, traceStore);
    if (x === null || !Number.isFinite(x)) continue;
    const newestFirst = [...runOutcomes].sort((a, b) => b.capturedAt - a.capturedAt);
    const y = newestFirst[0].metrics[outcomeMetric];
    if (typeof y !== "number" || !Number.isFinite(y)) continue;
    pairs.push({ x, y });
  }
  if (pairs.length < 2) return null;

  const numBins = options.bins ?? 10;
  const binning = options.binning ?? "equal-width";
  const xs = pairs.map((p) => p.x);
  const lo = options.range?.lo ?? Math.min(...xs);
  const hi = options.range?.hi ?? Math.max(...xs);
  const bins = [];
  if (binning === "equal-frequency") {
    const ordered = [...pairs].sort((a, b) => a.x - b.x);
    const perBin = Math.max(1, Math.floor(ordered.length / numBins));
    let offset = 0;
    while (offset < ordered.length) {
      const members = ordered.slice(offset, offset + perBin);
      if (members.length > 0) bins.push(toBin(members));
      offset += perBin;
    }
  } else {
    const width = (hi - lo) / numBins;
    if (width === 0) return null;
    for (let b = 0; b < numBins; b++) {
      const binLo = lo + b * width;
      // The last bin's upper edge is nudged so that x === hi is included.
      const isLast = b === numBins - 1;
      const binHi = isLast ? hi + 1e-9 : lo + (b + 1) * width;
      const members = pairs.filter((p) => p.x >= binLo && p.x < binHi);
      if (members.length > 0) bins.push(toBin(members, binLo, binHi));
    }
  }

  let total = 0;
  for (const bin of bins) total += bin.n;
  let ece = 0;
  let maxGap = 0;
  for (const bin of bins) {
    ece += bin.n / total * bin.gap;
    maxGap = Math.max(maxGap, bin.gap);
  }
  return { evalMetric: evalMetric.id, outcomeMetric, n: pairs.length, bins, ece, maxGap };
}
/**
 * Summarise one bin of `{ x, y }` pairs.
 *
 * @param chunk  pairs belonging to the bin
 * @param lower  bin lower edge; defaults to the smallest x in the chunk
 * @param upper  bin upper edge; defaults to the largest x in the chunk
 * @returns `{ lower, upper, n, evalMean, outcomeMean, gap }` where `gap` is
 *          the absolute difference between the two means
 */
function toBin(chunk, lower, upper) {
  const evalValues = chunk.map((pair) => pair.x);
  const outcomeValues = chunk.map((pair) => pair.y);
  const average = (values) => values.reduce((sum, value) => sum + value, 0) / values.length;
  const evalMean = average(evalValues);
  const outcomeMean = average(outcomeValues);
  const gap = Math.abs(outcomeMean - evalMean);
  return {
    lower: lower ?? Math.min(...evalValues),
    upper: upper ?? Math.max(...evalValues),
    n: chunk.length,
    evalMean,
    outcomeMean,
    gap
  };
}
/** Arithmetic mean of `xs`; NaN for an empty array (0 / 0). */
function mean2(xs) {
  let total = 0;
  for (let i = 0; i < xs.length; i++) total += xs[i];
  return total / xs.length;
}
/**
 * Default eval-value extractor: the run's outcome score when one is set.
 * Without a score, metric "pass" falls back to 1/0 from `run.outcome.pass`
 * and every other metric yields null.
 */
function defaultExtract4(metric) {
  return async (run) => {
    const score = run.outcome?.score;
    if (score !== null && score !== void 0) return score;
    if (metric !== "pass") return null;
    return run.outcome?.pass === true ? 1 : 0;
  };
}
+// src/prm/rubric.ts
/**
 * Process-reward grader: applies a set of rubrics to every step of a run.
 */
var PrmGrader = class {
  rubrics;

  /**
   * @param rubrics  non-empty list of rubrics (`id`, `grade(ctx)`, optional
   *                 `kinds` and `weight`)
   * @throws Error when `rubrics` is empty
   */
  constructor(rubrics) {
    if (rubrics.length === 0) throw new Error("PrmGrader: at least 1 rubric required");
    this.rubrics = rubrics;
  }

  /**
   * Grade every eligible span in a run. Each (rubric, span) verdict is also
   * recorded through a TraceEmitter as a judge verdict with judge id
   * `prm:<rubricId>` and dimension "step_quality".
   *
   * A rubric is skipped for a step when it declares `kinds` that do not
   * include the step's span kind, or when it returns null.
   *
   * @returns `{ runId, steps, aggregateScore, gradedCount, ungradedCount }`;
   *          `aggregateScore` is the weight-averaged score (0 when the total
   *          weight is 0) and `ungradedCount` counts steps no rubric graded
   */
  async grade(store, runId) {
    const trajectory = await buildTrajectory(store, runId);
    const emitter = new TraceEmitter(store, { runId });
    const verdicts = [];
    let ungradedCount = 0;
    for (let index = 0; index < trajectory.steps.length; index++) {
      const step = trajectory.steps[index];
      const ctx = {
        trajectory,
        step,
        prior: trajectory.steps.slice(0, index),
        next: trajectory.steps.slice(index + 1)
      };
      const verdictsBefore = verdicts.length;
      for (const rubric of this.rubrics) {
        const applies = !rubric.kinds || rubric.kinds.includes(step.span.kind);
        if (!applies) continue;
        const verdict = await rubric.grade(ctx);
        if (verdict === null) continue;
        const judgeId = `prm:${rubric.id}`;
        verdicts.push({
          spanId: step.span.spanId,
          rubricId: rubric.id,
          score: verdict.score,
          weight: rubric.weight ?? 1,
          rationale: verdict.rationale,
          evidence: verdict.evidence
        });
        await emitter.recordJudge({
          judgeId,
          targetSpanId: step.span.spanId,
          dimension: "step_quality",
          score: verdict.score,
          rationale: verdict.rationale,
          evidence: verdict.evidence,
          name: judgeId
        });
      }
      if (verdicts.length === verdictsBefore) ungradedCount++;
    }
    let totalWeight = 0;
    for (const entry of verdicts) totalWeight += entry.weight;
    let weightedSum = 0;
    for (const entry of verdicts) weightedSum += entry.score * entry.weight;
    const aggregateScore = totalWeight === 0 ? 0 : weightedSum / totalWeight;
    return { runId, steps: verdicts, aggregateScore, gradedCount: verdicts.length, ungradedCount };
  }
};
/** True when the verdict's `judgeId` carries the "prm:" prefix. */
function isPrmVerdict(verdict) {
  const { judgeId } = verdict;
  return judgeId.startsWith("prm:");
}
+// src/prm/builtin-rubrics.ts
/**
 * Rubric for "llm" spans that scores the output's character length.
 *
 * @param args  `minChars` (default 20), `maxChars` (default 8000),
 *              `weight` (default 0.5)
 * @returns rubric whose `grade` gives 0 for empty output, `len / min` below
 *          the minimum, `1 - (len - max) / max` (floored at 0) above the
 *          maximum, and 1 otherwise
 */
function outputLengthRubric(args = {}) {
  const lowerBound = args.minChars ?? 20;
  const upperBound = args.maxChars ?? 8e3;
  return {
    id: "output-length",
    kinds: ["llm"],
    weight: args.weight ?? 0.5,
    async grade({ step }) {
      const text = step.span.output ?? "";
      const len = text.length;
      if (len === 0) {
        return { score: 0, rationale: "empty output" };
      }
      if (len < lowerBound) {
        const score = Math.max(0, len / lowerBound);
        return { score, rationale: `below min (${len} < ${lowerBound})` };
      }
      if (len > upperBound) {
        const score = Math.max(0, 1 - (len - upperBound) / upperBound);
        return { score, rationale: `above max (${len} > ${upperBound})` };
      }
      return { score: 1, rationale: `${len} chars in bounds` };
    }
  };
}
/**
 * Rubric for "tool" spans that scores whether the call produced a usable
 * result.
 *
 * @param args  `weight` (default 1)
 * @returns rubric whose `grade` gives 0 for status "error", 0.3 for a
 *          null/undefined result, 0.5 when the result's text form is shorter
 *          than 4 characters, and 1 otherwise
 */
function toolSuccessRubric(args = {}) {
  return {
    id: "tool-success",
    kinds: ["tool"],
    weight: args.weight ?? 1,
    async grade({ step }) {
      const tool = step.span;
      if (tool.status === "error") return { score: 0, rationale: `error: ${tool.error ?? "unknown"}` };
      const r = tool.result;
      if (r === null || r === void 0) return { score: 0.3, rationale: "empty result" };
      let asText;
      if (typeof r === "string") {
        asText = r;
      } else {
        // JSON.stringify throws on cycles/BigInt and returns undefined for
        // functions and symbols; fall back to String() so grading never
        // crashes on an unusual result value.
        try {
          asText = JSON.stringify(r);
        } catch {
          asText = void 0;
        }
        asText ??= String(r);
      }
      if (asText.length < 4) return { score: 0.5, rationale: "tiny result" };
      return { score: 1, rationale: `${tool.toolName} ok` };
    }
  };
}
/**
 * Rubric for "tool" spans that penalises repeating an earlier call.
 *
 * A prior step counts as a duplicate when it is a tool span with the same
 * `toolName` and the same key-order-independent serialisation of `args`.
 *
 * @param args  `weight` (default 0.5)
 * @returns rubric whose `grade` gives 1 for a novel call and
 *          `max(0, 1 - 0.5 * duplicates)` otherwise
 */
function toolNonRedundantRubric(args = {}) {
  return {
    id: "tool-non-redundant",
    kinds: ["tool"],
    weight: args.weight ?? 0.5,
    async grade({ step, prior }) {
      const current = step.span;
      let duplicates = 0;
      for (const earlier of prior) {
        const candidate = earlier.span;
        if (candidate.kind !== "tool") continue;
        if (candidate.toolName !== current.toolName) continue;
        if (stableStringify2(candidate.args) === stableStringify2(current.args)) duplicates += 1;
      }
      if (duplicates === 0) return { score: 1, rationale: "novel call" };
      return { score: Math.max(0, 1 - duplicates * 0.5), rationale: `${duplicates} duplicate(s)` };
    }
  };
}
/**
 * Rubric for "llm" spans that scores 0 when the output matches a refusal
 * marker and 1 otherwise.
 *
 * @param args  `weight` (default 1) and `markers` (array of RegExp; defaults
 *              to two patterns for "I can't / cannot / won't / will not" and
 *              "(as an) AI ... can't / cannot")
 */
function nonRefusalRubric(args = {}) {
  const weight = args.weight ?? 1;
  const markers = args.markers ?? [
    /\bi\s+(?:can(?:not|'t)|won't|will\s+not)\b/i,
    /\b(?:as\s+an?\s+)?ai\b.*?\b(?:can't|cannot)\b/i
  ];
  return {
    id: "non-refusal",
    kinds: ["llm"],
    weight,
    async grade({ step }) {
      const llm = step.span;
      const out = llm.output ?? "";
      // String#search always matches from index 0 and restores lastIndex, so a
      // caller-supplied /g or /y marker gives the same answer on every call
      // (RegExp#test would resume from the previous match position).
      const refused = markers.some((re) => out.search(re) !== -1);
      return refused ? { score: 0, rationale: "refusal marker present" } : { score: 1, rationale: "no refusal" };
    }
  };
}
/**
 * Rubric for "llm" spans that checks whether the output names the tool that
 * is called next.
 *
 * @param args  `weight` (default 0.5)
 * @returns rubric whose `grade` yields null when no later step is a tool
 *          span, 1 when the lower-cased output contains the lower-cased name
 *          of the first later tool, and 0.5 otherwise
 */
function toolIntentAlignmentRubric(args = {}) {
  return {
    id: "tool-intent-alignment",
    kinds: ["llm"],
    weight: args.weight ?? 0.5,
    async grade({ step, next }) {
      const upcoming = next.find((candidate) => candidate.span.kind === "tool");
      if (!upcoming) return null;
      const { toolName } = upcoming.span;
      const text = (step.span.output ?? "").toLowerCase();
      if (text.includes(toolName.toLowerCase())) {
        return { score: 1, rationale: `mentioned "${toolName}" before calling it` };
      }
      return { score: 0.5, rationale: `called "${toolName}" without announcing it` };
    }
  };
}
/**
 * JSON-like serialisation with object keys emitted in sorted order, so two
 * objects that differ only in key order produce the same string.
 * Primitives and null go through JSON.stringify unchanged.
 */
function stableStringify2(value) {
  const isContainer = value !== null && typeof value === "object";
  if (!isContainer) return JSON.stringify(value);
  if (Array.isArray(value)) {
    const items = value.map(stableStringify2);
    return `[${items.join(",")}]`;
  }
  const members = [];
  for (const key of Object.keys(value).sort()) {
    members.push(`${JSON.stringify(key)}:${stableStringify2(value[key])}`);
  }
  return `{${members.join(",")}}`;
}
+// src/prm/training-export.ts
/**
 * Turn graded runs into flat training samples, one per graded step.
 *
 * Each sample carries the step's rubric score plus context: up to
 * `options.contextWindow` (default 5) immediately preceding spans rendered
 * as turns, and the graded step's kind and text. Graded steps whose span is
 * not found in the rebuilt trajectory are skipped.
 *
 * @param store    trace store passed to `buildTrajectory`
 * @param graded   results carrying `runId` and `steps`
 * @param options  `contextWindow`
 * @returns array of `{ runId, spanId, rubricId, score, context, rationale, evidence }`
 */
async function exportTrainingData(store, graded, options = {}) {
  const contextSize = options.contextWindow ?? 5;
  const out = [];
  for (const g of graded) {
    const trajectory = await buildTrajectory(store, g.runId);
    const steps = trajectory.steps;
    // One pass builds spanId -> position, replacing an indexOf scan per
    // graded step. As with a Map built from entries, the last step carrying
    // a given spanId wins.
    const positionBySpanId = new Map();
    steps.forEach((s, position) => positionBySpanId.set(s.span.spanId, position));
    for (const gs of g.steps) {
      const idx = positionBySpanId.get(gs.spanId);
      if (idx === void 0) continue;
      const node = steps[idx];
      if (!node) continue;
      const priorSpans = steps.slice(Math.max(0, idx - contextSize), idx).map((s) => s.span);
      out.push({
        runId: g.runId,
        spanId: gs.spanId,
        rubricId: gs.rubricId,
        score: gs.score,
        context: {
          priorTurns: priorSpans.map(spanToTurn).filter((t) => t !== null),
          step: { kind: node.span.kind, text: spanToText(node.span) }
        },
        rationale: gs.rationale,
        evidence: gs.evidence
      });
    }
  }
  return out;
}
/**
 * Serialise samples as newline-delimited JSON: one JSON document per line,
 * with a trailing newline after the last one.
 */
function toNdjson(samples) {
  const lines = [];
  for (const sample of samples) lines.push(JSON.stringify(sample));
  return `${lines.join("\n")}\n`;
}
/**
 * Render a span as a chat-style turn.
 *
 * LLM spans become an "assistant" turn holding the output or, when the
 * output is null/undefined, the span's messages joined as "role: content"
 * lines. Tool spans become a "tool" turn of the form "name(args) → result".
 * Any other span yields null.
 */
function spanToTurn(span) {
  if (isLlmSpan(span)) {
    const content = span.output ?? span.messages.map((message) => `${message.role}: ${message.content}`).join("\n");
    return { role: "assistant", content };
  }
  if (!isToolSpan(span)) return null;
  const renderedArgs = safeStringify(span.args);
  const renderedResult = safeStringify(span.result);
  return {
    role: "tool",
    content: `${span.toolName}(${renderedArgs}) \u2192 ${renderedResult}`
  };
}
/**
 * Plain-text rendering of a span: the output for LLM spans (empty string
 * when absent), "name(args) → result" for tool spans, and the span's name
 * for anything else.
 */
function spanToText(span) {
  if (isLlmSpan(span)) {
    return span.output ?? "";
  }
  if (isToolSpan(span)) {
    const renderedArgs = safeStringify(span.args);
    const renderedResult = safeStringify(span.result);
    return `${span.toolName}(${renderedArgs}) \u2192 ${renderedResult}`;
  }
  return span.name;
}
/**
 * Convert any value to a string without throwing on values JSON cannot
 * represent.
 *
 * @param v  any value
 * @returns "" for null/undefined, the value itself for strings, its JSON
 *          form when JSON.stringify produces one, and String(v) otherwise
 */
function safeStringify(v) {
  if (v === null || v === void 0) return "";
  if (typeof v === "string") return v;
  try {
    // JSON.stringify returns undefined (not a string) for functions and
    // symbols; fall back so the result is always a string.
    return JSON.stringify(v) ?? String(v);
  } catch {
    // Thrown for cyclic structures and BigInt values.
    return String(v);
  }
}
+// src/prm/inference.ts
/**
 * Grade every candidate run with one grader and pick the best.
 *
 * @param store   trace store handed to `grader.grade`
 * @param grader  object with `grade(store, runId)` resolving to a result
 *                that has `aggregateScore`
 * @param runIds  candidate run ids (at least one)
 * @returns `{ winner, ranked, stdDev }`: results sorted by descending
 *          aggregate score, the first of them, and the population standard
 *          deviation of the scores
 * @throws Error when `runIds` is empty
 */
async function prmBestOfN(store, grader, runIds) {
  if (runIds.length === 0) throw new Error("prmBestOfN: at least 1 candidate required");
  const results = await Promise.all(runIds.map((runId) => grader.grade(store, runId)));
  const ranked = results.slice().sort((left, right) => right.aggregateScore - left.aggregateScore);
  let total = 0;
  for (const result of results) total += result.aggregateScore;
  const average = total / results.length;
  let squaredDeviation = 0;
  for (const result of results) squaredDeviation += (result.aggregateScore - average) ** 2;
  const variance = squaredDeviation / results.length;
  return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance) };
}
/**
 * Rank candidate runs by Borda count across several graders.
 *
 * Every grader ranks all candidates by descending aggregate score; a run
 * earns `N - rank` points per grader (N = number of candidates, rank
 * 0-based) and the final order is by descending total points.
 *
 * The returned result objects, and therefore `stdDev`, come from the FIRST
 * grader only; the other graders influence ordering alone.
 *
 * @param store    trace store handed to each `grade` call
 * @param graders  objects with `grade(store, runId)` (at least one)
 * @param runIds   candidate run ids (at least one)
 * @returns `{ winner, ranked, stdDev }`
 * @throws Error when `graders` or `runIds` is empty
 */
async function prmEnsembleBestOfN(store, graders, runIds) {
  if (graders.length === 0) throw new Error("prmEnsembleBestOfN: at least 1 grader");
  // Without candidates there is no winner and the mean/stdDev would be NaN;
  // fail loudly, as prmBestOfN does.
  if (runIds.length === 0) throw new Error("prmEnsembleBestOfN: at least 1 candidate required");
  const perGrader = await Promise.all(
    graders.map(async (g) => {
      const graded = await Promise.all(runIds.map((id) => g.grade(store, id)));
      return graded.sort((a, b) => b.aggregateScore - a.aggregateScore);
    })
  );
  const bordaScores = new Map();
  for (const ranking of perGrader) {
    ranking.forEach((g, rank) => {
      bordaScores.set(g.runId, (bordaScores.get(g.runId) ?? 0) + (ranking.length - rank));
    });
  }
  const canonical = perGrader[0];
  const byRun = new Map(canonical.map((g) => [g.runId, g]));
  const ranked = [...byRun.values()].sort(
    (a, b) => (bordaScores.get(b.runId) ?? 0) - (bordaScores.get(a.runId) ?? 0)
  );
  const mean3 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
  const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean3) ** 2, 0) / ranked.length;
  return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
}
+// src/bisector.ts
/**
 * Generic bisection between a known-good and a known-bad state.
 *
 * Both endpoints are evaluated first. If the good state fails or the bad
 * state passes, the inputs are inconsistent and that endpoint is returned
 * with `inputInconsistent: true`. Otherwise `options.halfway(good, bad)` is
 * evaluated repeatedly, moving `good` forward on a pass and `bad` backward
 * on a failure, until `halfway` returns null or one of its arguments
 * (converged), or `options.maxIterations` (default 40) rounds have run.
 *
 * @param options  `good`, `bad`, `runEval(state)` resolving to an object
 *                 with `pass`, `halfway(good, bad)`, optional `equals`
 *                 (default `Object.is`) and `maxIterations`
 * @returns `{ culprit, path, converged, inputInconsistent }` where `path`
 *          lists every evaluated state with its verdict, in evaluation order
 */
async function bisect(options) {
  const sameState = options.equals ?? ((a, b) => Object.is(a, b));
  const budget = options.maxIterations ?? 40;
  const path = [];
  const evaluate = async (state) => {
    const verdict = await options.runEval(state);
    path.push({ state, ...verdict });
    return verdict;
  };

  const goodVerdict = await evaluate(options.good);
  const badVerdict = await evaluate(options.bad);
  if (!goodVerdict.pass) {
    return { culprit: options.good, path, converged: false, inputInconsistent: true };
  }
  if (badVerdict.pass) {
    return { culprit: options.bad, path, converged: false, inputInconsistent: true };
  }

  let lastPassing = options.good;
  let firstFailing = options.bad;
  let round = 0;
  while (round < budget) {
    const candidate = options.halfway(lastPassing, firstFailing);
    const exhausted = candidate === null || sameState(candidate, lastPassing) || sameState(candidate, firstFailing);
    if (exhausted) {
      return { culprit: firstFailing, path, converged: true, inputInconsistent: false };
    }
    const verdict = await evaluate(candidate);
    if (verdict.pass) {
      lastPassing = candidate;
    } else {
      firstFailing = candidate;
    }
    round += 1;
  }
  return { culprit: firstFailing, path, converged: false, inputInconsistent: false };
}
/**
 * Bisect over an ordered list of commit SHAs.
 *
 * @param options  `commits` (ordered list), `good` and `bad` (SHAs present
 *                 in the list, good strictly before bad), `runEval`, and
 *                 optional `maxIterations`
 * @returns the result of `bisect`, using the commit midway between the
 *          current good and bad positions as the halfway state
 * @throws Error when either SHA is missing from `commits` or good does not
 *         precede bad
 */
async function commitBisect(options) {
  const { commits, good, bad } = options;
  const goodAt = commits.indexOf(good);
  const badAt = commits.indexOf(bad);
  const missing = goodAt < 0 || badAt < 0;
  if (missing) {
    throw new Error(`commitBisect: good or bad SHA not in commit list (good=${good}, bad=${bad})`);
  }
  if (goodAt >= badAt) {
    throw new Error("commitBisect: good must precede bad in the commit list");
  }
  const midpoint = (passing, failing) => {
    const from = commits.indexOf(passing);
    const to = commits.indexOf(failing);
    // Adjacent commits: nothing left between them.
    if (to - from <= 1) return null;
    return commits[Math.floor((from + to) / 2)];
  };
  return bisect({
    good,
    bad,
    runEval: options.runEval,
    maxIterations: options.maxIterations,
    halfway: midpoint
  });
}
/**
 * Bisect between a good and a bad prompt at paragraph granularity.
 *
 * Both prompts are split into paragraphs (blank-line separated unless
 * `options.paragraphSplitter` is given) and must have the same count, at
 * least 2. Intermediate prompts are described by a mask string with one
 * character per paragraph ("0" = take the good paragraph, "1" = take the
 * bad one) and are joined with a blank line before being evaluated.
 *
 * @param options  `good`, `bad`, `runEval(prompt)`, optional
 *                 `paragraphSplitter` and `maxIterations` (default
 *                 paragraph count + 5)
 * @returns `{ culprit, path, converged, inputInconsistent, offendingParagraphIndex }`
 *          with states materialised as prompt text; the index is the first
 *          paragraph where the culprit differs from the last passing state
 *          in the path (undefined when there is none)
 * @throws Error on a paragraph count mismatch or fewer than 2 paragraphs
 */
async function promptBisect(options) {
  const split = options.paragraphSplitter ?? ((p) => p.split(/\n\s*\n/));
  const goodParas = split(options.good);
  const badParas = split(options.bad);
  if (goodParas.length !== badParas.length) {
    throw new Error(`promptBisect: paragraph count mismatch (${goodParas.length} vs ${badParas.length})`);
  }
  if (goodParas.length < 2) {
    throw new Error("promptBisect: need at least 2 paragraphs to bisect");
  }
  const n = goodParas.length;
  const render = (mask) => {
    const chosen = mask.split("").map((bit, at) => (bit === "1" ? badParas[at] : goodParas[at]));
    return chosen.join("\n\n");
  };
  const midMask = (g, b) => {
    const differing = [];
    for (let at = 0; at < g.length; at++) {
      if (g[at] !== b[at]) differing.push(at);
    }
    // Zero or one differing paragraph: there is no state strictly between.
    if (differing.length <= 1) return null;
    const chars = g.split("");
    const toFlip = differing.slice(0, Math.ceil(differing.length / 2));
    for (const at of toFlip) chars[at] = b[at];
    return chars.join("");
  };

  const result = await bisect({
    good: "0".repeat(n),
    bad: "1".repeat(n),
    runEval: (mask) => options.runEval(render(mask)),
    maxIterations: options.maxIterations ?? n + 5,
    halfway: midMask,
    equals: (a, b) => a === b
  });

  const culprit = result.culprit;
  let lastGood;
  for (let at = result.path.length - 1; at >= 0; at--) {
    if (result.path[at].pass) {
      lastGood = result.path[at];
      break;
    }
  }
  let offendingParagraphIndex;
  if (lastGood) {
    for (let at = 0; at < n; at++) {
      if (lastGood.state[at] !== culprit[at]) {
        offendingParagraphIndex = at;
        break;
      }
    }
  }
  const materializedPath = result.path.map((entry) => ({
    state: render(entry.state),
    score: entry.score,
    pass: entry.pass
  }));
  return {
    culprit: render(culprit),
    path: materializedPath,
    converged: result.converged,
    inputInconsistent: result.inputInconsistent,
    offendingParagraphIndex
  };
}
+// src/counterfactual.ts
/**
 * Re-run a recorded run with one step mutated and report the score change.
 *
 * A new run is started (layer "meta", `parentRunId` = the original run,
 * tagged as counterfactual) and `runner.executeFrom` is given the original
 * trajectory, the steps before `mutation.at`, the mutation and the mutated
 * step, together with the emitter for the new run.
 *
 * @param store          trace store (`getRun`, plus whatever the trajectory
 *                       builder and emitter need)
 * @param originalRunId  id of the run to mutate
 * @param mutation       `{ kind, at, ... }`; `at` indexes the trajectory steps
 * @param runner         object with `executeFrom(context, emitter)`
 * @returns `{ counterfactualRunId, originalRunId, mutation, delta }` where
 *          `delta.deltaScore` is counterfactual minus original, or null when
 *          either run has no score
 * @throws Error when the run is not found or `mutation.at` is out of range
 */
async function runCounterfactual(store, originalRunId, mutation, runner) {
  const originalRun = await store.getRun(originalRunId);
  if (!originalRun) throw new Error(`counterfactual: run ${originalRunId} not found`);
  const trajectory = await buildTrajectory(store, originalRunId);
  if (mutation.at < 0 || mutation.at >= trajectory.steps.length) {
    throw new Error(`counterfactual: mutation.at=${mutation.at} out of range [0, ${trajectory.steps.length})`);
  }
  const targetStep = trajectory.steps[mutation.at];
  const mutatedStep = applyMutation(targetStep, mutation);
  const cfEmitter = new TraceEmitter(store);
  await cfEmitter.startRun({
    scenarioId: originalRun.scenarioId,
    variantId: originalRun.variantId ? `${originalRun.variantId}+cf:${mutation.kind}@${mutation.at}` : `cf:${mutation.kind}@${mutation.at}`,
    projectId: originalRun.projectId,
    parentRunId: originalRunId,
    layer: "meta",
    tags: { counterfactual: "true", mutationKind: mutation.kind, mutationAt: String(mutation.at) }
  });
  await runner.executeFrom(
    {
      originalRunId,
      originalTrajectory: trajectory,
      prefix: trajectory.steps.slice(0, mutation.at),
      mutation,
      mutatedStep
    },
    cfEmitter
  );
  const counterfactual = await store.getRun(cfEmitter.runId);
  const originalScore = originalRun.outcome?.score ?? null;
  const counterfactualScore = counterfactual?.outcome?.score ?? null;
  // Derive the delta from the normalised scores so that a null score yields
  // a null delta instead of arithmetic on null (which coerces to 0).
  const deltaScore = originalScore !== null && counterfactualScore !== null ? counterfactualScore - originalScore : null;
  const delta = {
    originalOutcomeScore: originalScore,
    counterfactualOutcomeScore: counterfactualScore,
    deltaScore
  };
  return { counterfactualRunId: cfEmitter.runId, originalRunId, mutation, delta };
}
+/**
+ * Returns a copy of `step` with the mutation applied, or `step` itself when
+ * the mutation kind does not apply to the step's span kind.
+ *
+ * - "swap-model" (llm spans): replaces `span.model` with `mutation.newModel`.
+ * - "swap-tool-result" (tool spans): replaces `span.result` with `mutation.newResult`.
+ * - "inject-system-message" (llm spans): prepends a system message to `span.messages`.
+ * - "custom": delegates entirely to `mutation.apply(step)`.
+ *
+ * The input step is never modified; changed steps are built with spreads.
+ */
+function applyMutation(step, mutation) {
+  switch (mutation.kind) {
+    case "swap-model":
+      if (step.span.kind === "llm") {
+        return { ...step, span: { ...step.span, model: mutation.newModel } };
+      }
+      break;
+    case "swap-tool-result":
+      if (step.span.kind === "tool") {
+        return { ...step, span: { ...step.span, result: mutation.newResult } };
+      }
+      break;
+    case "inject-system-message":
+      if (step.span.kind === "llm") {
+        const injected = { role: "system", content: mutation.content };
+        return {
+          ...step,
+          span: { ...step.span, messages: [injected, ...step.span.messages] }
+        };
+      }
+      break;
+    case "custom":
+      return mutation.apply(step);
+  }
+  // Unknown kind, or a kind that does not match this span: leave the step as is.
+  return step;
+}
+/**
+ * Summarises counterfactual results per mutation kind.
+ *
+ * Results are grouped by `mutation.kind`; only numeric `delta.deltaScore`
+ * values are counted, and kinds with no numeric delta are omitted.
+ *
+ * @param results Iterable of objects with `mutation.kind` and `delta.deltaScore`.
+ * @returns Array of `{ mutationKind, n, meanAbsDelta, meanSignedDelta }`,
+ *   sorted by `meanAbsDelta` descending.
+ */
+function attributeCounterfactuals(results) {
+  const byKind = new Map();
+  for (const result of results) {
+    const kind = result.mutation.kind;
+    if (!byKind.has(kind)) byKind.set(kind, []);
+    byKind.get(kind).push(result);
+  }
+  const summaries = [];
+  byKind.forEach((items, kind) => {
+    const numericDeltas = [];
+    for (const item of items) {
+      const d = item.delta.deltaScore;
+      if (typeof d === "number") numericDeltas.push(d);
+    }
+    if (numericDeltas.length === 0) return;
+    let absTotal = 0;
+    let signedTotal = 0;
+    for (const d of numericDeltas) {
+      absTotal += Math.abs(d);
+      signedTotal += d;
+    }
+    summaries.push({
+      mutationKind: kind,
+      n: numericDeltas.length,
+      meanAbsDelta: absTotal / numericDeltas.length,
+      meanSignedDelta: signedTotal / numericDeltas.length
+    });
+  });
+  summaries.sort((x, y) => y.meanAbsDelta - x.meanAbsDelta);
+  return summaries;
+}
+// src/cross-trace-diff.ts
+/**
+ * Aligns the trajectories of two runs step by step and attributes score
+ * differences to each aligned operation.
+ *
+ * @param store Trace store; used for `spans`, `getRun`, and passed to `buildTrajectory`.
+ * @param runA Id of the first run.
+ * @param runB Id of the second run.
+ * @param options Optional `stepEquals(a, b)` override for the default step comparison.
+ * @returns `{ runA, runB, alignment, attributions, totalScoreDelta, prmDeltaSum }`.
+ *   `totalScoreDelta` is B's outcome score minus A's, or null when either is
+ *   undefined. `prmDeltaSum` adds up each attribution's `prmDelta`, counting
+ *   null/undefined as 0.
+ */
+async function crossTraceDiff(store, runA, runB, options = {}) {
+  const trajectories = await Promise.all([buildTrajectory(store, runA), buildTrajectory(store, runB)]);
+  const stepEquals = options.stepEquals ?? defaultStepEquals2;
+  const alignment = align(trajectories[0].steps, trajectories[1].steps, stepEquals);
+  const judgeSpansFor = (runId) => store.spans({ runId, kind: "judge" }).then((spans) => spans.filter(isJudgeSpan));
+  const [judgesA, judgesB] = await Promise.all([judgeSpansFor(runA), judgeSpansFor(runB)]);
+  const prmA = indexPrmByTarget(judgesA);
+  const prmB = indexPrmByTarget(judgesB);
+  const attributions = alignment.map((op) => attributeStep(op, prmA, prmB));
+  let prmDeltaSum = 0;
+  for (const attribution of attributions) {
+    prmDeltaSum += attribution.prmDelta ?? 0;
+  }
+  const [recordA, recordB] = await Promise.all([store.getRun(runA), store.getRun(runB)]);
+  const scoreA = recordA?.outcome?.score;
+  const scoreB = recordB?.outcome?.score;
+  const totalScoreDelta = scoreA !== void 0 && scoreB !== void 0 ? scoreB - scoreA : null;
+  return { runA, runB, alignment, attributions, totalScoreDelta, prmDeltaSum };
+}
+/**
+ * Aligns two step sequences using a longest-common-subsequence table and
+ * returns the edit script in forward order.
+ *
+ * Backtracking rules, applied from the end of both sequences:
+ * - steps that satisfy `eq` become a "match";
+ * - when dropping either step leaves the same LCS length (a tie), two steps
+ *   with the same `span.kind` become a "replace", otherwise A's step is deleted;
+ * - otherwise the step whose removal keeps the longer LCS is deleted (from A)
+ *   or inserted (from B).
+ *
+ * @param a Steps of run A (each with `span.kind`).
+ * @param b Steps of run B.
+ * @param eq Predicate `(stepA, stepB) => boolean` deciding whether two steps match.
+ * @returns Array of `{ op: "match" | "replace", a, b }`, `{ op: "delete", a }`
+ *   or `{ op: "insert", b }`.
+ */
+function align(a, b, eq) {
+  // lcs[i][j] holds the LCS length of a[0..i) and b[0..j).
+  const lcs = Array.from({ length: a.length + 1 }, () => new Array(b.length + 1).fill(0));
+  for (let row = 1; row <= a.length; row++) {
+    for (let col = 1; col <= b.length; col++) {
+      lcs[row][col] = eq(a[row - 1], b[col - 1]) ? lcs[row - 1][col - 1] + 1 : Math.max(lcs[row - 1][col], lcs[row][col - 1]);
+    }
+  }
+  const ops = [];
+  let i = a.length;
+  let j = b.length;
+  while (i > 0 || j > 0) {
+    const bothRemain = i > 0 && j > 0;
+    if (bothRemain && eq(a[i - 1], b[j - 1])) {
+      ops.push({ op: "match", a: a[i - 1], b: b[j - 1] });
+      i--;
+      j--;
+      continue;
+    }
+    if (bothRemain && lcs[i - 1][j] === lcs[i][j - 1]) {
+      if (a[i - 1].span.kind === b[j - 1].span.kind) {
+        ops.push({ op: "replace", a: a[i - 1], b: b[j - 1] });
+        i--;
+        j--;
+      } else {
+        // On a tie the previous code re-tested `>=`, which is always true here,
+        // so its trailing "insert" branch could never run; deletion is the only outcome.
+        ops.push({ op: "delete", a: a[i - 1] });
+        i--;
+      }
+      continue;
+    }
+    if (i > 0 && (j === 0 || lcs[i - 1][j] >= lcs[i][j - 1])) {
+      ops.push({ op: "delete", a: a[i - 1] });
+      i--;
+    } else {
+      ops.push({ op: "insert", b: b[j - 1] });
+      j--;
+    }
+  }
+  // Ops were collected back-to-front.
+  return ops.reverse();
+}
+/**
+ * Default step comparison for trajectory alignment.
+ *
+ * Steps of different span kinds never match. Tool spans match on `toolName`,
+ * llm spans on `model`, and every other kind on `name`.
+ */
+function defaultStepEquals2(a, b) {
+  const left = a.span;
+  const right = b.span;
+  if (left.kind !== right.kind) return false;
+  switch (left.kind) {
+    case "tool":
+      return left.toolName === right.toolName;
+    case "llm":
+      return left.model === right.model;
+    default:
+      return left.name === right.name;
+  }
+}
+/**
+ * Builds a Map from `targetSpanId` to the sum of the `score` values of all
+ * judge spans that point at that target.
+ */
+function indexPrmByTarget(judges) {
+  const totals = new Map();
+  for (const { targetSpanId, score } of judges) {
+    const runningTotal = totals.get(targetSpanId) ?? 0;
+    totals.set(targetSpanId, runningTotal + score);
+  }
+  return totals;
+}
+/**
+ * Returns `endedAt - startedAt` for a span, or null when either timestamp is
+ * absent.
+ *
+ * A timestamp counts as absent only when it is null, undefined or NaN. The
+ * earlier truthiness test also discarded a genuine timestamp of 0 (for example
+ * a relative or fake clock starting at zero) and reported no latency for it.
+ */
+function spanLatency(s) {
+  const isAbsent = (t) => t == null || Number.isNaN(t);
+  if (isAbsent(s.endedAt) || isAbsent(s.startedAt)) return null;
+  return s.endedAt - s.startedAt;
+}
+/**
+ * Total token count (input + output, each defaulting to 0 when nullish) for an
+ * llm span; null for any other span kind.
+ */
+function spanTokens(s) {
+  return s.kind === "llm" ? (s.inputTokens ?? 0) + (s.outputTokens ?? 0) : null;
+}
+/**
+ * Turns one alignment operation into an attribution record.
+ *
+ * - "match": `prmDelta` is B's PRM total minus A's, or null unless both spans
+ *   have an entry; latency and token deltas are filled in when both sides
+ *   report a value.
+ * - "replace": `prmDelta` is B minus A with missing totals counted as 0.
+ * - "insert": `prmDelta` is B's total (0 when missing).
+ * - anything else is handled as a deletion: `prmDelta` is minus A's total.
+ *
+ * Latency and token deltas are null for every non-match operation.
+ *
+ * @param op Alignment operation with `op` and the `a` / `b` steps it refers to.
+ * @param prmA Map from span id to PRM total for run A.
+ * @param prmB Map from span id to PRM total for run B.
+ * @returns `{ op, prmDelta, latencyDeltaMs, tokenDelta, note }`.
+ */
+function attributeStep(op, prmA, prmB) {
+  switch (op.op) {
+    case "match": {
+      const scoreA = prmA.get(op.a.span.spanId);
+      const scoreB = prmB.get(op.b.span.spanId);
+      const prmDelta = scoreA !== void 0 && scoreB !== void 0 ? scoreB - scoreA : null;
+      const latencyA = spanLatency(op.a.span);
+      const latencyB = spanLatency(op.b.span);
+      const tokensA = spanTokens(op.a.span);
+      const tokensB = spanTokens(op.b.span);
+      const haveLatency = latencyA !== null && latencyB !== null;
+      const haveTokens = tokensA !== null && tokensB !== null;
+      return {
+        op,
+        prmDelta,
+        latencyDeltaMs: haveLatency ? latencyB - latencyA : null,
+        tokenDelta: haveTokens ? tokensB - tokensA : null,
+        note: prmDelta === null ? "matched step, no PRM coverage" : "matched step, PRM delta recorded"
+      };
+    }
+    case "replace": {
+      const scoreA = prmA.get(op.a.span.spanId) ?? 0;
+      const scoreB = prmB.get(op.b.span.spanId) ?? 0;
+      return {
+        op,
+        prmDelta: scoreB - scoreA,
+        latencyDeltaMs: null,
+        tokenDelta: null,
+        note: `replaced ${op.a.span.kind}/${op.a.span.name} \u2192 ${op.b.span.kind}/${op.b.span.name}`
+      };
+    }
+    case "insert": {
+      const scoreB = prmB.get(op.b.span.spanId) ?? 0;
+      return {
+        op,
+        prmDelta: scoreB,
+        latencyDeltaMs: null,
+        tokenDelta: null,
+        note: `inserted step in B (${op.b.span.kind}/${op.b.span.name})`
+      };
+    }
+    default: {
+      const scoreA = prmA.get(op.a.span.spanId) ?? 0;
+      return {
+        op,
+        prmDelta: -scoreA,
+        latencyDeltaMs: null,
+        tokenDelta: null,
+        note: `deleted step from A (${op.a.span.kind}/${op.a.span.name})`
+      };
+    }
+  }
+}
+// src/pre-registration.ts
+/**
+ * Returns a copy of the manifest with `contentHash` set to the hex SHA-256 of
+ * its canonical JSON form (object keys sorted by `canonicalize2`).
+ *
+ * Any `contentHash` already present on the input is ignored when hashing.
+ * Previously a stale hash was fed into the digest, so re-signing a signed
+ * manifest produced a hash that `verifyManifest` (which strips `contentHash`
+ * before recomputing) could never reproduce.
+ *
+ * @param m Manifest object; not modified.
+ * @returns Promise of `{ ...manifestWithoutHash, contentHash }`.
+ */
+async function signManifest(m) {
+  const unsigned = { ...m };
+  delete unsigned.contentHash;
+  const canonical = canonicalize2(unsigned);
+  const bytes = new TextEncoder().encode(JSON.stringify(canonical));
+  const digest = await globalThis.crypto.subtle.digest("SHA-256", bytes);
+  const hash = Array.from(new Uint8Array(digest), (byte) => byte.toString(16).padStart(2, "0")).join("");
+  return { ...unsigned, contentHash: hash };
+}
+/**
+ * Checks a signed manifest: strips `contentHash`, re-signs the remainder with
+ * `signManifest`, and resolves to true only when the recomputed hash is
+ * strictly equal to the stored one.
+ */
+async function verifyManifest(m) {
+  const claimedHash = m.contentHash;
+  const unsigned = { ...m };
+  delete unsigned.contentHash;
+  const { contentHash: recomputedHash } = await signManifest(unsigned);
+  return recomputedHash === claimedHash;
+}
+/**
+ * Judges an observed result against a pre-registered hypothesis manifest.
+ *
+ * The manifest must first pass `verifyManifest`. The result is then rejected
+ * for each of the following that applies, in this order:
+ * - "wrong_direction": effect is not > 0 for direction "increase", or not < 0
+ *   for any other direction value;
+ * - "effect_too_small": |effect| is below `manifest.minEffect`;
+ * - "not_significant": `pValue` is not below `manifest.alpha`;
+ * - "undersampled": `n` is below `manifest.preRegisteredN`.
+ *
+ * @param manifest Signed manifest with `direction`, `minEffect`, `alpha`, `preRegisteredN`.
+ * @param observed `{ effect, pValue, n }`.
+ * @returns `{ manifest, observedN, observedEffect, observedPValue, confirmed, rejectionReasons }`.
+ * @throws {Error} When the manifest's content hash does not verify.
+ */
+async function evaluateHypothesis(manifest, observed) {
+  const intact = await verifyManifest(manifest);
+  if (!intact) {
+    throw new Error("evaluateHypothesis: manifest content hash mismatch (tampered)");
+  }
+  const movedAsPredicted = manifest.direction === "increase" ? observed.effect > 0 : observed.effect < 0;
+  const checks = [
+    ["wrong_direction", !movedAsPredicted],
+    ["effect_too_small", Math.abs(observed.effect) < manifest.minEffect],
+    ["not_significant", observed.pValue >= manifest.alpha],
+    ["undersampled", observed.n < manifest.preRegisteredN]
+  ];
+  const rejectionReasons = checks.filter(([, failed]) => failed).map(([reason]) => reason);
+  return {
+    manifest,
+    observedN: observed.n,
+    observedEffect: observed.effect,
+    observedPValue: observed.pValue,
+    confirmed: rejectionReasons.length === 0,
+    rejectionReasons
+  };
+}
+/**
+ * Produces a structurally equal value whose plain-object keys are inserted in
+ * sorted order at every depth, so `JSON.stringify` output does not depend on
+ * the original key order. Arrays keep their element order; primitives and
+ * null are returned unchanged.
+ */
+function canonicalize2(v) {
+  const isContainer = v !== null && typeof v === "object";
+  if (!isContainer) return v;
+  if (Array.isArray(v)) return v.map((item) => canonicalize2(item));
+  const ordered = {};
+  Object.keys(v).sort().forEach((key) => {
+    ordered[key] = canonicalize2(v[key]);
+  });
+  return ordered;
+}
+// src/self-play.ts
+/**
+ * Runs proposer/scorer self-play rounds and collects the candidates that
+ * discriminate between targets into a Dataset.
+ *
+ * Per round, each proposed candidate is scored against all targets and is
+ * rejected when the scorer returns fewer than two results, when its best
+ * score is below `minAbsoluteFloor`, or when the max-min spread of its scores
+ * is below `minSpread`. Survivors are ranked by spread (largest first),
+ * capped at `maxSurvivors`, added to the dataset with split "test", and
+ * passed to the proposer as `priorSurvivors` for the next round.
+ *
+ * Each survivor is ranked by the spread measured for that exact candidate.
+ * The earlier comparator looked the spread up by `candidate.id` with a linear
+ * scan on every comparison, which was quadratic and, when two candidates
+ * shared an id, ranked both by the first one's spread.
+ *
+ * @param proposer Object with `propose(round, priorSurvivors)` returning candidates (`id`, `payload`, optional `tags`).
+ * @param scorer Object with `scoreCandidate(candidate, targets)` returning objects with a numeric `score`.
+ * @param targets At least two targets to compare.
+ * @param options Optional `minSpread` (0.1), `minAbsoluteFloor` (0.1), `maxSurvivors` (50), `rounds` (1).
+ * @returns `{ rounds, dataset }` where each round records `proposed`, `survived`, `rejected` and `scoredBreakdown`.
+ * @throws {Error} When fewer than two targets are supplied.
+ */
+async function runSelfPlay(proposer, scorer, targets, options = {}) {
+  if (targets.length < 2) throw new Error("runSelfPlay: at least 2 targets required (need a difference to measure)");
+  const minSpread = options.minSpread ?? 0.1;
+  const floor = options.minAbsoluteFloor ?? 0.1;
+  const maxSurvivors = options.maxSurvivors ?? 50;
+  const totalRounds = options.rounds ?? 1;
+  const allRounds = [];
+  let priorSurvivors = [];
+  const datasetScenarios = [];
+  for (let r = 0; r < totalRounds; r++) {
+    const proposed = await proposer.propose(r, priorSurvivors);
+    const scored = [];
+    const rejected = [];
+    // Survivors carry their own spread so ranking needs no lookup by id.
+    const ranked = [];
+    for (const candidate of proposed) {
+      const scores = await scorer.scoreCandidate(candidate, targets);
+      if (scores.length < 2) {
+        rejected.push({ candidate, reason: "scorer returned <2 results" });
+        continue;
+      }
+      const values = scores.map((s) => s.score);
+      const maxScore = Math.max(...values);
+      const spread = maxScore - Math.min(...values);
+      scored.push({ candidate, scores, spread });
+      if (maxScore < floor) {
+        rejected.push({ candidate, reason: `every target below floor (max=${maxScore.toFixed(3)} < ${floor})` });
+        continue;
+      }
+      if (spread < minSpread) {
+        rejected.push({ candidate, reason: `spread below threshold (${spread.toFixed(3)} < ${minSpread})` });
+        continue;
+      }
+      ranked.push({ candidate, spread });
+    }
+    ranked.sort((x, y) => y.spread - x.spread);
+    const capped = ranked.slice(0, maxSurvivors).map((entry) => entry.candidate);
+    for (const s of capped) {
+      datasetScenarios.push({
+        id: s.id,
+        payload: s.payload,
+        split: "test",
+        tags: { ...s.tags, evolutionRound: String(r), origin: "self-play" }
+      });
+    }
+    allRounds.push({ round: r, proposed, survived: capped, rejected, scoredBreakdown: scored });
+    priorSurvivors = capped;
+  }
+  const survivorTotal = allRounds.reduce((total, round) => total + round.survived.length, 0);
+  const dataset = new Dataset({
+    name: "self-play-survivors",
+    provenance: {
+      version: "1.0.0",
+      createdAt: (/* @__PURE__ */ new Date()).toISOString(),
+      contributor: "self-play",
+      description: `Evolved across ${totalRounds} round(s), ${survivorTotal} survivors`
+    },
+    scenarios: datasetScenarios
+  });
+  return { rounds: allRounds, dataset };
+}
+// src/causal-attribution.ts
+/**
+ * Splits the variance of cell scores into per-factor main effects, pairwise
+ * interactions and a residual, each expressed as a share of total variance.
+ *
+ * Between-group variance is weighted by group size (the sum of
+ * n_g * (mean_g - grandMean)^2 divided by the number of cells). For balanced
+ * designs this equals the plain average over group means that was used
+ * before; for unbalanced designs the unweighted form could report a single
+ * factor as explaining several times the total variance.
+ *
+ * A pair's interaction share is its between-group variance minus the two
+ * main-effect variances, floored at 0. `residualShare` is what remains of 1
+ * after main effects and interactions, also floored at 0, so `sharesSum` can
+ * exceed 1 when factors are confounded.
+ *
+ * @param cells At least four `{ levels, score }` cells; the factor names are taken from the first cell's `levels`.
+ * @returns `{ totalVariance, mainEffects, interactions, residualShare, sharesSum }`.
+ *   `mainEffects[i].range` is the spread between the highest and lowest level mean.
+ * @throws {Error} With fewer than four cells or fewer than two factors.
+ */
+function causalAttribution(cells) {
+  if (cells.length < 4) throw new Error("causalAttribution: need \u2265 4 cells to estimate effects");
+  const factors = Object.keys(cells[0].levels);
+  if (factors.length < 2) throw new Error("causalAttribution: need \u2265 2 factors");
+  const cellCount = cells.length;
+  const grandMean = cells.reduce((sum, c) => sum + c.score, 0) / cellCount;
+  const totalVariance = cells.reduce((acc, c) => acc + (c.score - grandMean) ** 2, 0) / cellCount;
+  if (totalVariance === 0) {
+    // Nothing varies: attribute everything to the residual.
+    return { totalVariance: 0, mainEffects: factors.map((f) => ({ factor: f, shareOfVariance: 0, range: 0 })), interactions: [], residualShare: 1, sharesSum: 1 };
+  }
+  const groupMean = (members) => members.reduce((sum, c) => sum + c.score, 0) / members.length;
+  const betweenGroupVariance = (groups) => {
+    let weightedSquares = 0;
+    for (const members of groups.values()) {
+      weightedSquares += members.length * (groupMean(members) - grandMean) ** 2;
+    }
+    return weightedSquares / cellCount;
+  };
+  const mainEffects = factors.map((f) => {
+    const byLevel = groupBy2(cells, (c) => c.levels[f]);
+    const means = [...byLevel.values()].map((members) => groupMean(members));
+    return {
+      factor: f,
+      shareOfVariance: betweenGroupVariance(byLevel) / totalVariance,
+      range: Math.max(...means) - Math.min(...means)
+    };
+  });
+  const interactions = [];
+  for (let i = 0; i < factors.length; i++) {
+    for (let j = i + 1; j < factors.length; j++) {
+      const byPair = groupBy2(cells, (c) => `${c.levels[factors[i]]}|${c.levels[factors[j]]}`);
+      const pairVariance = betweenGroupVariance(byPair);
+      const mainI = mainEffects[i].shareOfVariance * totalVariance;
+      const mainJ = mainEffects[j].shareOfVariance * totalVariance;
+      const interactionVariance = Math.max(0, pairVariance - mainI - mainJ);
+      interactions.push({
+        factors: [factors[i], factors[j]],
+        shareOfVariance: interactionVariance / totalVariance
+      });
+    }
+  }
+  const mainSum = mainEffects.reduce((a, m) => a + m.shareOfVariance, 0);
+  const interactionSum = interactions.reduce((a, m) => a + m.shareOfVariance, 0);
+  const residualShare = Math.max(0, 1 - mainSum - interactionSum);
+  const sharesSum = mainSum + interactionSum + residualShare;
+  return { totalVariance, mainEffects, interactions, residualShare, sharesSum };
+}
+/**
+ * Groups `items` into a Map keyed by `key(item)`. Groups appear in the order
+ * their key is first seen, and items keep their input order within a group.
+ */
+function groupBy2(items, key) {
+  const buckets = new Map();
+  for (const item of items) {
+    const bucketKey = key(item);
+    const bucket = buckets.get(bucketKey);
+    if (bucket === undefined || bucket === null) {
+      buckets.set(bucketKey, [item]);
+    } else {
+      bucket.push(item);
+    }
+  }
+  return buckets;
+}
+// src/active-learning.ts
+/**
+ * Suggests where new evaluation scenarios would be most useful.
+ *
+ * Four kinds of target are produced, then sorted by priority (highest first)
+ * and cut to `topK`:
+ * - "difficulty-gap": a difficulty band with fewer than `minPerBand` scenarios;
+ * - "undersampled": a scenario with fewer than 3 runs that is also at or below
+ *   the 25th percentile of per-scenario run counts;
+ * - "high-variance": a scenario with at least 3 numeric outcome scores whose
+ *   population variance exceeds `varianceThreshold`;
+ * - "failure-cluster": a failure class (other than "success"/"unknown")
+ *   reported by `classifyFailure` for at least 3 non-passing runs.
+ *
+ * Runs are grouped by scenario once, instead of being rescanned for every
+ * scenario, and the independent `spans` / `events` reads for a run are issued
+ * together.
+ *
+ * @param dataset Object with `all()` returning scenarios (`id`, `difficulty`).
+ * @param traceStore Object with `listRuns()`, `spans({ runId })`, `events({ runId })`.
+ * @param options Optional `minPerBand` (5), `varianceThreshold` (0.05), `topK` (10).
+ * @returns Array of `{ reason, description, neighbors, direction, priority }`.
+ */
+async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
+  const minPerBand = options.minPerBand ?? 5;
+  const varianceThreshold = options.varianceThreshold ?? 0.05;
+  const topK = options.topK ?? 10;
+  const scenarios = dataset.all();
+  const targets = [];
+  const BANDS = ["easy", "medium", "hard", "extreme"];
+  for (const band of BANDS) {
+    const inBand = scenarios.filter((s) => s.difficulty === band);
+    const count = inBand.length;
+    if (count < minPerBand) {
+      targets.push({
+        reason: "difficulty-gap",
+        description: `difficulty="${band}" has ${count} scenario(s) \u2014 below minimum ${minPerBand}`,
+        neighbors: inBand.slice(0, 3),
+        direction: `create more "${band}" scenarios; reuse domain but shift complexity`,
+        priority: Math.max(0, 1 - count / minPerBand)
+      });
+    }
+  }
+  const runs = await traceStore.listRuns();
+  // Group once so the per-scenario passes below are lookups, not rescans.
+  const runsByScenario = /* @__PURE__ */ new Map();
+  for (const run of runs) {
+    const group = runsByScenario.get(run.scenarioId);
+    if (group) group.push(run);
+    else runsByScenario.set(run.scenarioId, [run]);
+  }
+  // Only scenarios that have at least one run contribute to the percentile.
+  const runCounts = [...runsByScenario.values()].map((group) => group.length);
+  const p25 = runCounts.length > 0 ? quantile(runCounts, 0.25) : 0;
+  for (const s of scenarios) {
+    const count = runsByScenario.get(s.id)?.length ?? 0;
+    if (count <= p25 && count < 3) {
+      targets.push({
+        reason: "undersampled",
+        description: `scenario "${s.id}" has only ${count} run(s)`,
+        neighbors: [s],
+        direction: `create near-duplicates of "${s.id}" to stabilize its mean`,
+        priority: Math.max(0, 1 - count / 3) * 0.7
+      });
+    }
+  }
+  for (const s of scenarios) {
+    const scenarioRuns = runsByScenario.get(s.id) ?? [];
+    const scores = scenarioRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
+    if (scores.length < 3) continue;
+    const mean = scores.reduce((a, b) => a + b, 0) / scores.length;
+    const variance = scores.reduce((a, b) => a + (b - mean) ** 2, 0) / scores.length;
+    if (variance > varianceThreshold) {
+      targets.push({
+        reason: "high-variance",
+        description: `scenario "${s.id}" has unstable scoring (variance ${variance.toFixed(3)})`,
+        neighbors: [s],
+        direction: `disambiguate the scenario description \u2014 current wording admits too many valid interpretations`,
+        priority: Math.min(1, variance * 5)
+      });
+    }
+  }
+  const failureByClass = /* @__PURE__ */ new Map();
+  for (const run of runs) {
+    if (run.outcome?.pass === true) continue;
+    const [spans, events] = await Promise.all([
+      traceStore.spans({ runId: run.runId }),
+      traceStore.events({ runId: run.runId })
+    ]);
+    const { failureClass } = classifyFailure({ run, spans, events });
+    if (failureClass === "success" || failureClass === "unknown") continue;
+    const cluster = failureByClass.get(failureClass) ?? [];
+    cluster.push(run);
+    failureByClass.set(failureClass, cluster);
+  }
+  for (const [cls, clusterRuns] of failureByClass) {
+    if (clusterRuns.length < 3) continue;
+    const affectedIds = new Set(clusterRuns.map((r) => r.scenarioId));
+    const neighbors = scenarios.filter((s) => affectedIds.has(s.id)).slice(0, 3);
+    targets.push({
+      reason: "failure-cluster",
+      description: `failure class "${cls}" observed ${clusterRuns.length}\xD7 across ${affectedIds.size} scenario(s)`,
+      neighbors,
+      direction: `create scenarios that exercise "${cls}" recovery \u2014 currently a systematic weakness`,
+      priority: Math.min(1, clusterRuns.length / 10)
+    });
+  }
+  return targets.sort((a, b) => b.priority - a.priority).slice(0, topK);
+}
+/**
+ * Linear-interpolation quantile: sorts a copy of `xs` ascending and
+ * interpolates between the two values around position `p * (length - 1)`.
+ * The input is not modified.
+ */
+function quantile(xs, p) {
+  const ascending = [...xs];
+  ascending.sort((x, y) => x - y);
+  const position = p * (ascending.length - 1);
+  const below = Math.floor(position);
+  const above = Math.ceil(position);
+  const fraction = position - below;
+  const step = ascending[above] - ascending[below];
+  return ascending[below] + step * fraction;
+}
+// src/reward-model-export.ts
+/**
+ * Grades the given runs, converts the graded traces to training samples and
+ * packages them as an NDJSON export with summary metadata.
+ *
+ * @param store Trace store passed to `grader.grade` and `exportTrainingData`.
+ * @param grader Object with `grade(store, runId)`; all runs are graded concurrently.
+ * @param runIds Ids of the runs to export.
+ * @returns `{ version: "1.0", metadata, trainingNdjson }`, where
+ *   `metadata.rubrics` lists the distinct sample `rubricId`s and
+ *   `metadata.meanReward` is the mean sample score (0 when there are no samples).
+ */
+async function exportRewardModel(store, grader, runIds) {
+  const gradedTraces = await Promise.all(runIds.map((runId) => grader.grade(store, runId)));
+  const samples = await exportTrainingData(store, gradedTraces);
+  const rubricIds = new Set();
+  for (const sample of samples) rubricIds.add(sample.rubricId);
+  let meanReward = 0;
+  if (samples.length > 0) {
+    let scoreTotal = 0;
+    for (const sample of samples) scoreTotal += sample.score;
+    meanReward = scoreTotal / samples.length;
+  }
+  const metadata = {
+    nTraces: gradedTraces.length,
+    nSamples: samples.length,
+    rubrics: [...rubricIds],
+    exportedAt: (/* @__PURE__ */ new Date()).toISOString(),
+    meanReward
+  };
+  return { version: "1.0", metadata, trainingNdjson: toNdjson(samples) };
+}
+/**
+ * Wraps a grader as a scorer object.
+ *
+ * `score(trajectory, store)` grades the run identified by `trajectory.runId`
+ * and resolves to the graded result's `aggregateScore`. The attached metadata
+ * is fixed: rubrics `["grader-backed"]`, `deterministic: true`.
+ */
+function loadScorerFromGrader(grader) {
+  const score = async (trajectory, store) => {
+    const { aggregateScore } = await grader.grade(store, trajectory.runId);
+    return aggregateScore;
+  };
+  const metadata = {
+    rubrics: ["grader-backed"],
+    deterministic: true
+  };
+  return { score, metadata };
+}
+/**
+ * Scores every run in `runIds` with `scorer`, all runs concurrently.
+ *
+ * For each run the trajectory and the run record are loaded together, then
+ * `scorer.score(trajectory, store)` is awaited.
+ *
+ * @returns Promise of `{ runId, score, outcomeScore }` per run, in input
+ *   order; `outcomeScore` is the recorded outcome score or null when absent.
+ */
+async function replayScorerOverCorpus(store, scorer, runIds) {
+  const replayOne = async (runId) => {
+    const [trajectory, run] = await Promise.all([buildTrajectory(store, runId), store.getRun(runId)]);
+    const score = await scorer.score(trajectory, store);
+    return { runId, score, outcomeScore: run?.outcome?.score ?? null };
+  };
+  return Promise.all(runIds.map((runId) => replayOne(runId)));
+}
+// src/governance/types.ts
+/**
+ * Renders a governance report as Markdown.
+ *
+ * Layout: title, context bullet list (organization, period, owner, generation
+ * time), a summary section listing every severity with a non-zero count, then
+ * one section per finding with its summary and, when present, evidence and
+ * remediation.
+ *
+ * A report without `context.owner` renders "- Owner: (not recorded)" instead
+ * of throwing; the report builders in this file emit findings for exactly
+ * that situation, so such reports must stay renderable.
+ *
+ * @param report Report with `framework`, `context`, `summary` (`overall`,
+ *   `findings`, `byeverity`), `findings` and `generatedAt`.
+ * @returns The Markdown document as a single string joined with "\n".
+ */
+function renderMarkdown(report) {
+  const sevEmoji = {
+    info: "\u2139\uFE0E",
+    low: "\xB7",
+    medium: "!",
+    high: "!!",
+    critical: "\u203C"
+  };
+  const owner = report.context.owner;
+  const ownerLine = owner ? `- Owner: ${owner.role} ${owner.name} <${owner.email}>` : "- Owner: (not recorded)";
+  const lines = [];
+  lines.push(`# ${report.framework} report \u2014 ${report.context.systemName}`);
+  lines.push("");
+  lines.push(`- Organization: **${report.context.organization}**`);
+  lines.push(`- Period: ${report.context.periodStart} \u2192 ${report.context.periodEnd}`);
+  lines.push(ownerLine);
+  lines.push(`- Generated: ${report.generatedAt}`);
+  lines.push("");
+  lines.push(`## Summary \u2014 ${report.summary.overall}`);
+  lines.push("");
+  lines.push(`${report.summary.findings} finding(s).`);
+  // "byeverity" is the property name the summary object actually carries.
+  for (const [sev, n] of Object.entries(report.summary.byeverity)) {
+    if (n > 0) lines.push(`- ${sevEmoji[sev]} ${sev}: ${n}`);
+  }
+  lines.push("");
+  lines.push("## Findings");
+  lines.push("");
+  for (const f of report.findings) {
+    lines.push(`### ${sevEmoji[f.severity]} ${f.id} \u2014 ${f.control}`);
+    lines.push("");
+    lines.push(f.summary);
+    if (f.evidence) {
+      lines.push("");
+      lines.push(`**Evidence:** ${f.evidence}`);
+    }
+    if (f.remediation) {
+      lines.push("");
+      lines.push(`**Remediation:** ${f.remediation}`);
+    }
+    lines.push("");
+  }
+  return lines.join("\n");
+}
+/**
+ * Counts findings per severity and derives the overall verdict:
+ * "non-compliant" when any critical or high finding exists,
+ * "compliant-with-findings" when only medium/low findings exist, and
+ * "compliant" otherwise (info findings alone do not change the verdict).
+ *
+ * @returns `{ findings, byeverity, overall }`; `findings` is the total count.
+ */
+function summarize(findings) {
+  const byeverity = { info: 0, low: 0, medium: 0, high: 0, critical: 0 };
+  for (const finding of findings) {
+    byeverity[finding.severity] += 1;
+  }
+  let overall = "compliant";
+  if (byeverity.critical + byeverity.high > 0) {
+    overall = "non-compliant";
+  } else if (byeverity.medium + byeverity.low > 0) {
+    overall = "compliant-with-findings";
+  }
+  return { findings: findings.length, byeverity, overall };
+}
+// src/governance/nist-ai-rmf.ts
+/**
+ * Builds a NIST AI RMF report from a governance context.
+ *
+ * Controls checked, in order:
+ * - GOVERN-1.1: an owner email is recorded;
+ * - GOVERN-1.3: at least one dataset manifest exists, each with a content hash
+ *   of at least 16 characters;
+ * - MEASURE-2.6: a red-team report is attached and its overall pass rate is at
+ *   least 0.8;
+ * - MEASURE-2.1: the trace store lists at least one run in the period;
+ * - MEASURE-2.11: judge calibration exists and no judge has a finite Pearson
+ *   value below 0.6;
+ * - MANAGE-1.1: an outcome store is attached and lists outcomes for the period.
+ *
+ * The payload additionally records, per dataset, whether its hash is exactly
+ * 64 lowercase hex characters.
+ *
+ * @param ctx Governance context (`owner`, `datasets`, `redTeam`, `traceStore`,
+ *   `judgeCalibration`, `outcomeStore`, `periodStart`, `periodEnd`, `organization`, `systemName`).
+ * @returns Report object with `framework`, `version`, `context`, `summary`, `findings`, `payload`, `generatedAt`.
+ */
+async function nistAiRmfReport(ctx) {
+  const findings = [];
+  const flag = (finding) => {
+    findings.push(finding);
+  };
+  // Each store query gets its own freshly parsed window object.
+  const reportingWindow = () => ({ since: Date.parse(ctx.periodStart), until: Date.parse(ctx.periodEnd) });
+  if (!ctx.owner?.email) {
+    flag({
+      id: "G-1.1",
+      severity: "high",
+      control: "NIST-AI-RMF:GOVERN-1.1",
+      summary: "No responsible owner recorded for the AI system.",
+      remediation: "Assign an accountable individual + email in GovernanceContext.owner."
+    });
+  }
+  if (ctx.datasets.length === 0) {
+    flag({
+      id: "G-1.3",
+      severity: "high",
+      control: "NIST-AI-RMF:GOVERN-1.3",
+      summary: "No versioned datasets recorded for the evaluation period.",
+      remediation: "Register each dataset with a Dataset manifest (content hash + provenance)."
+    });
+  }
+  for (const manifest of ctx.datasets) {
+    const hashLooksWeak = !manifest.contentHash || manifest.contentHash.length < 16;
+    if (!hashLooksWeak) continue;
+    flag({
+      id: "G-1.3-hash",
+      severity: "medium",
+      control: "NIST-AI-RMF:GOVERN-1.3",
+      summary: `Dataset "${manifest.name}" has weak or missing content hash.`,
+      evidence: `contentHash="${manifest.contentHash}"`,
+      remediation: "Call dataset.manifest() to compute SHA-256; commit the manifest alongside releases."
+    });
+  }
+  if (!ctx.redTeam) {
+    flag({
+      id: "M-2.6",
+      severity: "high",
+      control: "NIST-AI-RMF:MEASURE-2.6",
+      summary: "No red-team evaluation attached to the report period.",
+      remediation: "Run redTeamDataset() against the system and attach the RedTeamReport to context.redTeam."
+    });
+  } else if (ctx.redTeam.overallPassRate < 0.8) {
+    flag({
+      id: "M-2.6-rate",
+      severity: "high",
+      control: "NIST-AI-RMF:MEASURE-2.6",
+      summary: `Red-team pass rate ${(ctx.redTeam.overallPassRate * 100).toFixed(1)}% below 80% threshold.`,
+      evidence: JSON.stringify(ctx.redTeam.passRateByCategory),
+      remediation: "Harden the failing categories; rerun the battery."
+    });
+  }
+  const runs = await ctx.traceStore.listRuns(reportingWindow());
+  if (runs.length === 0) {
+    flag({
+      id: "M-2.1",
+      severity: "critical",
+      control: "NIST-AI-RMF:MEASURE-2.1",
+      summary: "No eval runs recorded for the reporting period.",
+      remediation: "Emit traces for every deployment-relevant evaluation."
+    });
+  }
+  const calibrationMissing = !ctx.judgeCalibration || ctx.judgeCalibration.length === 0;
+  if (calibrationMissing) {
+    flag({
+      id: "M-2.11",
+      severity: "medium",
+      control: "NIST-AI-RMF:MEASURE-2.11",
+      summary: "No judge-vs-human calibration recorded.",
+      remediation: "Build a human golden set; run calibrateJudge() before trusting LLM judge scores."
+    });
+  } else {
+    // Non-finite Pearson values are skipped rather than counted as weak.
+    const weakJudges = ctx.judgeCalibration.filter((c) => Number.isFinite(c.pearson) && c.pearson < 0.6);
+    if (weakJudges.length > 0) {
+      flag({
+        id: "M-2.11-weak",
+        severity: "medium",
+        control: "NIST-AI-RMF:MEASURE-2.11",
+        summary: `${weakJudges.length} judge(s) show weak agreement with humans (Pearson < 0.6).`,
+        remediation: "Retrain or replace the underperforming judges."
+      });
+    }
+  }
+  if (!ctx.outcomeStore) {
+    flag({
+      id: "MN-1.1",
+      severity: "medium",
+      control: "NIST-AI-RMF:MANAGE-1.1",
+      summary: "No deployment outcomes captured \u2014 meta-eval correlation cannot be computed.",
+      remediation: "Attach an OutcomeStore and ingest production outcome metrics."
+    });
+  } else {
+    const outcomes = await ctx.outcomeStore.list(reportingWindow());
+    if (outcomes.length === 0) {
+      flag({
+        id: "MN-1.1-empty",
+        severity: "medium",
+        control: "NIST-AI-RMF:MANAGE-1.1",
+        summary: "OutcomeStore present but no outcomes captured for the period."
+      });
+    }
+  }
+  const datasetHashChecks = [];
+  for (const manifest of ctx.datasets) {
+    datasetHashChecks.push({ name: manifest.name, ok: /^[0-9a-f]{64}$/.test(manifest.contentHash) });
+  }
+  const payload = {
+    controlsEvaluated: [
+      "GOVERN-1.1",
+      "GOVERN-1.3",
+      "MEASURE-2.1",
+      "MEASURE-2.6",
+      "MEASURE-2.11",
+      "MANAGE-1.1"
+    ],
+    runCount: runs.length,
+    redTeamPassRate: ctx.redTeam?.overallPassRate ?? null,
+    datasetHashChecks
+  };
+  const context = {
+    organization: ctx.organization,
+    systemName: ctx.systemName,
+    periodStart: ctx.periodStart,
+    periodEnd: ctx.periodEnd,
+    owner: ctx.owner
+  };
+  return {
+    framework: "NIST-AI-RMF",
+    version: "1.0.0",
+    context,
+    summary: summarize(findings),
+    findings,
+    payload,
+    generatedAt: (/* @__PURE__ */ new Date()).toISOString()
+  };
+}
+// src/governance/soc2.ts
+/**
+ * Builds a SOC 2 (Common Criteria CC7.x) report from a governance context.
+ *
+ * Checks, in order:
+ * - CC7.1: failure rate (runs whose outcome `pass` is exactly false) above 20%,
+ *   and whether any runs exist for the period at all;
+ * - CC7.2: aborted runs, flagged when there are at least 3 and they exceed 5%
+ *   of all runs;
+ * - CC7.3: any "policy_violation" or "error" events in the period;
+ * - CC7.4: whether runs carry a `codeSha` and a `promptSha`.
+ *
+ * @param ctx Governance context (`traceStore`, `periodStart`, `periodEnd`,
+ *   `organization`, `systemName`, `owner`).
+ * @returns Report object with `framework`, `version`, `context`, `summary`, `findings`, `payload`, `generatedAt`.
+ */
+async function soc2Report(ctx) {
+  const findings = [];
+  const since = Date.parse(ctx.periodStart);
+  const until = Date.parse(ctx.periodEnd);
+  const runs = await ctx.traceStore.listRuns({ since, until });
+  let failureRate = null;
+  if (runs.length > 0) {
+    const failedRuns = runs.filter((run) => run.outcome?.pass === false);
+    failureRate = failedRuns.length / runs.length;
+  }
+  if (failureRate !== null && failureRate > 0.2) {
+    findings.push({
+      id: "CC7.1-fail-rate",
+      severity: "medium",
+      control: "SOC2:CC7.1",
+      summary: `System failure rate ${(failureRate * 100).toFixed(1)}% over the period exceeds 20%.`,
+      remediation: "Investigate failure clusters (failureClusterView) + prioritize remediation."
+    });
+  }
+  if (runs.length === 0) {
+    findings.push({
+      id: "CC7.1-coverage",
+      severity: "high",
+      control: "SOC2:CC7.1",
+      summary: "No telemetry runs recorded for the period \u2014 monitoring regime is incomplete."
+    });
+  }
+  const aborted = runs.filter((run) => run.status === "aborted");
+  const abortsAreNotable = aborted.length > runs.length * 0.05 && aborted.length >= 3;
+  if (abortsAreNotable) {
+    findings.push({
+      id: "CC7.2-abort",
+      severity: "medium",
+      control: "SOC2:CC7.2",
+      summary: `${aborted.length} run(s) aborted \u2014 investigate pattern.`,
+      remediation: "Use the bisector + failureClusterView to localize the trigger."
+    });
+  }
+  const violationEvents = await ctx.traceStore.events({ kind: "policy_violation", since, until });
+  const errorEvents = await ctx.traceStore.events({ kind: "error", since, until });
+  const incidentEventCount = violationEvents.length + errorEvents.length;
+  if (incidentEventCount > 0) {
+    findings.push({
+      id: "CC7.3-resolution",
+      severity: "low",
+      control: "SOC2:CC7.3",
+      summary: `${incidentEventCount} incident-class event(s) recorded; resolution tracking is informal.`,
+      remediation: 'Emit a resolution event (kind="log" with payload.resolves=<eventId>) per remediated incident.'
+    });
+  }
+  // Distinct truthy values of one run field.
+  const distinctValues = (pick) => new Set(runs.map(pick).filter(Boolean));
+  const modelFingerprints = distinctValues((run) => run.modelFingerprint);
+  const promptHashes = distinctValues((run) => run.promptSha);
+  const codeShas = distinctValues((run) => run.codeSha);
+  if (codeShas.size === 0) {
+    findings.push({
+      id: "CC7.4-code",
+      severity: "high",
+      control: "SOC2:CC7.4",
+      summary: "No codeSha recorded on runs \u2014 cannot attribute scores to a specific release.",
+      remediation: "Populate Run.codeSha with the git SHA of the system at run time."
+    });
+  }
+  if (promptHashes.size === 0) {
+    findings.push({
+      id: "CC7.4-prompt",
+      severity: "medium",
+      control: "SOC2:CC7.4",
+      summary: "No promptSha recorded \u2014 prompt changes are untracked."
+    });
+  }
+  const payload = {
+    controls: ["CC7.1", "CC7.2", "CC7.3", "CC7.4"],
+    runCount: runs.length,
+    failureRate,
+    abortedCount: aborted.length,
+    incidentEventCount,
+    distinctReleases: {
+      codeShas: codeShas.size,
+      promptHashes: promptHashes.size,
+      modelFingerprints: modelFingerprints.size
+    }
+  };
+  const context = {
+    organization: ctx.organization,
+    systemName: ctx.systemName,
+    periodStart: ctx.periodStart,
+    periodEnd: ctx.periodEnd,
+    owner: ctx.owner
+  };
+  return {
+    framework: "SOC2",
+    version: "2017-Common-Criteria",
+    context,
+    summary: summarize(findings),
+    findings,
+    payload,
+    generatedAt: (/* @__PURE__ */ new Date()).toISOString()
+  };
+}
+// src/governance/eu-ai-act.ts
+/**
+ * Maps use-case signals to a risk class, most severe first:
+ * "unacceptable" (biometricPublic, socialScoring or subliminal),
+ * "high" (annexIII), "limited" (chatbot or generatesSyntheticMedia),
+ * otherwise "minimal".
+ */
+function classifyEuAiRisk(signals) {
+  const prohibited = signals.biometricPublic || signals.socialScoring || signals.subliminal;
+  if (prohibited) return "unacceptable";
+  if (signals.annexIII) return "high";
+  const transparencyOnly = signals.chatbot || signals.generatesSyntheticMedia;
+  return transparencyOnly ? "limited" : "minimal";
+}
+/**
+ * Builds an EU AI Act report for a system, driven by its risk class.
+ *
+ * - "unacceptable": one critical Article 5 finding.
+ * - "high": checks Articles 9 (red-team evidence), 10 (datasets), 11 (eval
+ *   runs in the period), 13 (transparency note for chatbots / synthetic
+ *   media), 14 (owner email) and 15 (outcome store).
+ * - "limited": an informational Article 52 transparency finding.
+ * - "minimal": no findings.
+ *
+ * `payload.articlesReviewed` lists "5" for an unacceptable system. It used to
+ * say "none" there, contradicting the Article 5 finding in the same report.
+ *
+ * @param ctx Governance context (`redTeam`, `datasets`, `traceStore`, `owner`,
+ *   `outcomeStore`, `periodStart`, `periodEnd`, `organization`, `systemName`).
+ *   The trace store is only queried for high-risk systems.
+ * @param signals Use-case signals consumed by `classifyEuAiRisk`.
+ * @returns Report object with `framework`, `version`, `context`, `summary`, `findings`, `payload`, `generatedAt`.
+ */
+async function euAiActReport(ctx, signals) {
+  const riskClass = classifyEuAiRisk(signals);
+  const findings = [];
+  if (riskClass === "unacceptable") {
+    findings.push({
+      id: "EU-ART-5",
+      severity: "critical",
+      control: "EU-AI-ACT:Article-5",
+      summary: "Use case matches a prohibited practice under Article 5.",
+      remediation: "Discontinue or substantially redesign the use case."
+    });
+  }
+  if (riskClass === "high") {
+    if (!ctx.redTeam) {
+      findings.push({
+        id: "EU-ART-9",
+        severity: "high",
+        control: "EU-AI-ACT:Article-9",
+        summary: "High-risk system lacks documented adversarial-testing evidence (Art. 9 risk mgmt).",
+        remediation: "Run redTeamDataset() + attach the report."
+      });
+    }
+    if (ctx.datasets.length === 0) {
+      findings.push({
+        id: "EU-ART-10",
+        severity: "high",
+        control: "EU-AI-ACT:Article-10",
+        summary: "No training/eval datasets recorded with provenance (Art. 10)."
+      });
+    }
+    const runs = await ctx.traceStore.listRuns({
+      since: Date.parse(ctx.periodStart),
+      until: Date.parse(ctx.periodEnd)
+    });
+    if (runs.length === 0) {
+      findings.push({
+        id: "EU-ART-11",
+        severity: "high",
+        control: "EU-AI-ACT:Article-11",
+        summary: "No eval runs recorded (Art. 11 technical documentation)."
+      });
+    }
+    if (signals.chatbot || signals.generatesSyntheticMedia) {
+      findings.push({
+        id: "EU-ART-13",
+        severity: "info",
+        control: "EU-AI-ACT:Article-13",
+        summary: "Chatbot/synthetic-media transparency obligations apply; verify user-facing disclosures."
+      });
+    }
+    if (!ctx.owner?.email) {
+      findings.push({
+        id: "EU-ART-14",
+        severity: "high",
+        control: "EU-AI-ACT:Article-14",
+        summary: "No designated human overseer (Art. 14).",
+        remediation: "Populate GovernanceContext.owner with the responsible individual."
+      });
+    }
+    if (!ctx.outcomeStore) {
+      findings.push({
+        id: "EU-ART-15",
+        severity: "medium",
+        control: "EU-AI-ACT:Article-15",
+        summary: "No post-deployment outcome measurement; accuracy + robustness are un-attested.",
+        remediation: "Attach an OutcomeStore + run correlationStudy() over the reporting period."
+      });
+    }
+  }
+  if (riskClass === "limited") {
+    findings.push({
+      id: "EU-ART-52",
+      severity: "info",
+      control: "EU-AI-ACT:Article-52",
+      summary: "Transparency obligations apply: disclose AI nature + synthetic content labeling.",
+      remediation: "Ensure user-facing surfaces label AI-generated content."
+    });
+  }
+  // Articles actually examined for each risk class; "minimal" reviews none.
+  const articlesByRiskClass = {
+    unacceptable: ["5"],
+    high: ["5", "9", "10", "11", "13", "14", "15"],
+    limited: ["52"]
+  };
+  const payload = {
+    riskClass,
+    signals,
+    articlesReviewed: articlesByRiskClass[riskClass] ?? ["none"]
+  };
+  return {
+    framework: "EU-AI-ACT",
+    version: "Regulation-2024-1689",
+    context: {
+      organization: ctx.organization,
+      systemName: ctx.systemName,
+      periodStart: ctx.periodStart,
+      periodEnd: ctx.periodEnd,
+      owner: ctx.owner
+    },
+    summary: summarize(findings),
+    findings,
+    payload,
+    generatedAt: (/* @__PURE__ */ new Date()).toISOString()
+  };
+}
 export {
   AgentDriver,
   BenchmarkRunner,
@@ -5463,15 +7003,18 @@ export {
   DualAgentBench,
   ExperimentTracker,
   FAILURE_CLASSES,
+  FileSystemOutcomeStore,
   FileSystemTraceStore,
   HoldoutAuditor,
   HoldoutLockedError,
   InMemoryExperimentStore,
+  InMemoryOutcomeStore,
   InMemoryTraceStore,
   InMemoryWorkspaceInspector,
   MODEL_PRICING,
   MetricsCollector,
   OTEL_AGENT_EVAL_SCOPE,
+  PrmGrader,
   ProductClient,
   ProjectRegistry,
   PromptOptimizer,
@@ -5488,20 +7031,26 @@ export {
   analyzeAntiSlop,
   analyzeSeries,
   argHash,
+  attributeCounterfactuals,
   benjaminiHochberg,
+  bisect,
   bonferroni,
   budgetBreachView,
   buildTrajectory,
   byteLengthRange,
   calibrateJudge,
+  calibrationCurve,
   canaryLeakView,
+  causalAttribution,
   checkCanaries,
   checkSlos,
+  classifyEuAiRisk,
   classifyFailure,
   codeExecutionJudge,
   cohensD,
   coherenceJudge,
   collectionPreserved,
+  commitBisect,
   compareToBaseline,
   composeParsers,
   composeValidators,
@@ -5509,18 +7058,24 @@ export {
   confidenceInterval,
   containsAll,
   correlateLayers,
+  correlationStudy,
   createAntiSlopJudge,
   createCustomJudge,
   createDomainExpertJudge,
+  crossTraceDiff,
   defaultJudges,
   dominates,
   estimateCost,
   estimateTokens,
+  euAiActReport,
   evaluateContract,
+  evaluateHypothesis,
   evaluateOracles,
   executeScenario,
   expectAgent,
+  exportRewardModel,
   exportRunAsOtlp,
+  exportTrainingData,
   failureClusterView,
   fileContains,
   fileExists,
@@ -5534,6 +7089,7 @@ export {
   iqr,
   isJudgeSpan,
   isLlmSpan,
+  isPrmVerdict,
   isRetrievalSpan,
   isSandboxSpan,
   isToolSpan,
@@ -5545,10 +7101,14 @@ export {
   keyPreserved,
   llmSpanFromProvider,
   llmSpans,
+  loadScorerFromGrader,
   lowercaseMutator,
   mannWhitneyU,
+  nistAiRmfReport,
+  nonRefusalRubric,
   normalizeScores,
   notBlocked,
+  outputLengthRubric,
   pairedTTest,
   paraphraseRobustness,
   paretoFrontier,
@@ -5557,6 +7117,10 @@ export {
   politenessPrefixMutator,
   positionalBias,
   printDriverSummary,
+  prmBestOfN,
+  prmEnsembleBestOfN,
+  promptBisect,
+  proposeSynthesisTargets,
   pytestTestParser,
   redTeamDataset,
   redTeamReport,
@@ -5565,16 +7129,20 @@ export {
   regexMatch,
   regexMatches,
   regressionView,
+  renderMarkdown,
   renderMarkdownReport,
+  replayScorerOverCorpus,
   replayTraceThroughJudge,
   requiredSampleSize,
   resumeBuilderSession,
   rowCount,
   rowWhere,
   runAssertions,
+  runCounterfactual,
   runE2EWorkflow,
   runExpectations,
   runFailureClass,
+  runSelfPlay,
   runTestGradedScenario,
   runsForScenario,
   scoreAllProjects,
@@ -5583,17 +7151,25 @@ export {
   scoreRedTeamOutput,
   selfPreference,
   sentenceReorderMutator,
+  signManifest,
+  soc2Report,
   statusAdvanced,
   stuckLoopView,
+  summarize,
   textInSnapshot,
   toLangfuseEnvelope,
+  toNdjson,
   toPrometheusText,
+  toolIntentAlignmentRubric,
   toolNamesForRun,
+  toolNonRedundantRubric,
   toolSpans,
+  toolSuccessRubric,
   toolWasteView,
   typoMutator,
   urlContains,
   verbosityBias,
+  verifyManifest,
   visualDiff,
   vitestTestParser,
   weightedMean,