@tangle-network/agent-eval 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -410,7 +410,7 @@ function confidenceInterval(scores, confidence = 0.95) {
410
410
  if (scores.length === 0) return { mean: 0, lower: 0, upper: 0 };
411
411
  if (scores.length === 1) return { mean: scores[0], lower: scores[0], upper: scores[0] };
412
412
  const n = scores.length;
413
- const mean2 = scores.reduce((a, b) => a + b, 0) / n;
413
+ const mean3 = scores.reduce((a, b) => a + b, 0) / n;
414
414
  const B = 1e3;
415
415
  const bootstrapMeans = [];
416
416
  for (let i = 0; i < B; i++) {
@@ -425,7 +425,7 @@ function confidenceInterval(scores, confidence = 0.95) {
425
425
  const lowerIdx = Math.floor(alpha / 2 * B);
426
426
  const upperIdx = Math.floor((1 - alpha / 2) * B) - 1;
427
427
  return {
428
- mean: mean2,
428
+ mean: mean3,
429
429
  lower: bootstrapMeans[lowerIdx],
430
430
  upper: bootstrapMeans[Math.min(upperIdx, B - 1)]
431
431
  };
@@ -479,18 +479,18 @@ function mannWhitneyU(a, b) {
479
479
  ...a.map((v) => ({ v, group: "a" })),
480
480
  ...b.map((v) => ({ v, group: "b" }))
481
481
  ].sort((x, y) => x.v - y.v);
482
- const ranks2 = new Array(combined.length);
482
+ const ranks3 = new Array(combined.length);
483
483
  let i = 0;
484
484
  while (i < combined.length) {
485
485
  let j = i;
486
486
  while (j < combined.length && combined[j].v === combined[i].v) j++;
487
487
  const avgRank = (i + 1 + j) / 2;
488
- for (let k = i; k < j; k++) ranks2[k] = avgRank;
488
+ for (let k = i; k < j; k++) ranks3[k] = avgRank;
489
489
  i = j;
490
490
  }
491
491
  let r1 = 0;
492
492
  for (let k = 0; k < combined.length; k++) {
493
- if (combined[k].group === "a") r1 += ranks2[k];
493
+ if (combined[k].group === "a") r1 += ranks3[k];
494
494
  }
495
495
  const u1 = r1 - n1 * (n1 + 1) / 2;
496
496
  const u2 = n1 * n2 - u1;
@@ -513,11 +513,11 @@ function pairedTTest(before, after) {
513
513
  const n = before.length;
514
514
  if (n < 2) return { t: 0, df: 0, p: 1 };
515
515
  const diffs = before.map((b, i) => after[i] - b);
516
- const mean2 = diffs.reduce((a, b) => a + b, 0) / n;
517
- const variance2 = diffs.reduce((acc, d) => acc + (d - mean2) ** 2, 0) / (n - 1);
516
+ const mean3 = diffs.reduce((a, b) => a + b, 0) / n;
517
+ const variance2 = diffs.reduce((acc, d) => acc + (d - mean3) ** 2, 0) / (n - 1);
518
518
  const se = Math.sqrt(variance2 / n);
519
- if (se === 0) return { t: mean2 === 0 ? 0 : Infinity, df: n - 1, p: mean2 === 0 ? 1 : 0 };
520
- const t = mean2 / se;
519
+ if (se === 0) return { t: mean3 === 0 ? 0 : Infinity, df: n - 1, p: mean3 === 0 ? 1 : 0 };
520
+ const t = mean3 / se;
521
521
  const df = n - 1;
522
522
  const p = 2 * (1 - studentTCdf(Math.abs(t), df));
523
523
  return { t, df, p };
@@ -530,20 +530,20 @@ function wilcoxonSignedRank(before, after) {
530
530
  const n = diffs.length;
531
531
  if (n < 6) return { w: 0, p: 1 };
532
532
  const absRanks = diffs.map((d, i2) => ({ abs: Math.abs(d), sign: Math.sign(d), i: i2 })).sort((a, b) => a.abs - b.abs);
533
- const ranks2 = new Array(n);
533
+ const ranks3 = new Array(n);
534
534
  let i = 0;
535
535
  while (i < n) {
536
536
  let j = i;
537
537
  while (j < n && absRanks[j].abs === absRanks[i].abs) j++;
538
538
  const avg = (i + 1 + j) / 2;
539
- for (let k = i; k < j; k++) ranks2[absRanks[k].i] = avg;
539
+ for (let k = i; k < j; k++) ranks3[absRanks[k].i] = avg;
540
540
  i = j;
541
541
  }
542
542
  let wPlus = 0;
543
- for (let k = 0; k < n; k++) if (diffs[k] > 0) wPlus += ranks2[k];
544
- const mean2 = n * (n + 1) / 4;
543
+ for (let k = 0; k < n; k++) if (diffs[k] > 0) wPlus += ranks3[k];
544
+ const mean3 = n * (n + 1) / 4;
545
545
  const variance2 = n * (n + 1) * (2 * n + 1) / 24;
546
- const z = (wPlus - mean2) / Math.sqrt(variance2);
546
+ const z = (wPlus - mean3) / Math.sqrt(variance2);
547
547
  const p = 2 * (1 - normalCdf(Math.abs(z)));
548
548
  return { w: wPlus, p };
549
549
  }
@@ -1531,24 +1531,24 @@ function analyzeAntiSlop(outputs, config) {
1531
1531
  }
1532
1532
  }
1533
1533
  for (const re of config.hedgingPatterns) {
1534
- const matches = output.match(new RegExp(re, re.flags.includes("g") ? re.flags : re.flags + "g"));
1535
- if (matches) {
1536
- counts.hedging += matches.length;
1534
+ const matches2 = output.match(new RegExp(re, re.flags.includes("g") ? re.flags : re.flags + "g"));
1535
+ if (matches2) {
1536
+ counts.hedging += matches2.length;
1537
1537
  issues.push({
1538
1538
  category: "hedging",
1539
- detail: `${matches.length}x ${re.source}`,
1540
- example: matches[0]
1539
+ detail: `${matches2.length}x ${re.source}`,
1540
+ example: matches2[0]
1541
1541
  });
1542
1542
  }
1543
1543
  }
1544
1544
  for (const re of config.apologyPatterns) {
1545
- const matches = output.match(new RegExp(re, re.flags.includes("g") ? re.flags : re.flags + "g"));
1546
- if (matches) {
1547
- counts.apology += matches.length;
1545
+ const matches2 = output.match(new RegExp(re, re.flags.includes("g") ? re.flags : re.flags + "g"));
1546
+ if (matches2) {
1547
+ counts.apology += matches2.length;
1548
1548
  issues.push({
1549
1549
  category: "apology",
1550
- detail: `${matches.length}x ${re.source}`,
1551
- example: matches[0]
1550
+ detail: `${matches2.length}x ${re.source}`,
1551
+ example: matches2[0]
1552
1552
  });
1553
1553
  }
1554
1554
  }
@@ -4076,10 +4076,10 @@ function analyzeSeries(values, options = {}) {
4076
4076
  return { state: "insufficient-data", windowMean: 0, windowCv: 0, tailRun: 0, stable: false };
4077
4077
  }
4078
4078
  const tail = values.slice(-window);
4079
- const mean2 = tail.reduce((a, b) => a + b, 0) / tail.length;
4080
- const variance2 = tail.reduce((acc, v) => acc + (v - mean2) ** 2, 0) / tail.length;
4079
+ const mean3 = tail.reduce((a, b) => a + b, 0) / tail.length;
4080
+ const variance2 = tail.reduce((acc, v) => acc + (v - mean3) ** 2, 0) / tail.length;
4081
4081
  const stdDev = Math.sqrt(variance2);
4082
- const refMean = Math.abs(mean2) > 1e-9 ? Math.abs(mean2) : 1;
4082
+ const refMean = Math.abs(mean3) > 1e-9 ? Math.abs(mean3) : 1;
4083
4083
  const cv = stdDev / refMean;
4084
4084
  const stable = tail.length >= window && cv <= stableCv;
4085
4085
  let tailRun = 0;
@@ -4100,7 +4100,7 @@ function analyzeSeries(values, options = {}) {
4100
4100
  } else {
4101
4101
  state = "noisy";
4102
4102
  }
4103
- return { state, windowMean: mean2, windowCv: cv, tailRun, stable };
4103
+ return { state, windowMean: mean3, windowCv: cv, tailRun, stable };
4104
4104
  }
4105
4105
 
4106
4106
  // src/state-continuity.ts
@@ -5028,12 +5028,12 @@ async function paraphraseRobustness(prompt, mutators, scoreFn, options = {}) {
5028
5028
  variantScores.push({ mutator: id, score, mutated });
5029
5029
  all.push(score);
5030
5030
  }
5031
- const mean2 = all.reduce((a, b) => a + b, 0) / all.length;
5032
- const variance2 = all.reduce((a, v) => a + (v - mean2) ** 2, 0) / all.length;
5031
+ const mean3 = all.reduce((a, b) => a + b, 0) / all.length;
5032
+ const variance2 = all.reduce((a, v) => a + (v - mean3) ** 2, 0) / all.length;
5033
5033
  const stdDev = Math.sqrt(variance2);
5034
- const ref = Math.abs(mean2) > 1e-9 ? Math.abs(mean2) : 1;
5034
+ const ref = Math.abs(mean3) > 1e-9 ? Math.abs(mean3) : 1;
5035
5035
  const robustness = Math.max(0, 1 - stdDev / ref);
5036
- return { originalScore, variantScores, meanScore: mean2, stdDev, robustness };
5036
+ return { originalScore, variantScores, meanScore: mean3, stdDev, robustness };
5037
5037
  }
5038
5038
  var lowercaseMutator = (p) => p.toLowerCase();
5039
5039
  var sentenceReorderMutator = (p, seed) => {
@@ -5445,6 +5445,1546 @@ var ProjectRegistry = class {
5445
5445
  return out;
5446
5446
  }
5447
5447
  };
5448
+
5449
+ // src/meta-eval/outcome-store.ts
5450
/**
 * Outcome store that keeps every record in a plain in-process array.
 *
 * Records are shallow-copied when stored and again when returned, so callers
 * cannot change a stored record's top-level fields through the references
 * they hold.
 */
var InMemoryOutcomeStore = class {
  items = [];

  /** Store a shallow copy of `outcome`. */
  async append(outcome) {
    const snapshot = { ...outcome };
    this.items.push(snapshot);
  }

  /** Return shallow copies of every stored outcome whose `runId` equals `runId`. */
  async forRun(runId) {
    const hits = [];
    for (const item of this.items) {
      if (item.runId === runId) hits.push({ ...item });
    }
    return hits;
  }

  /** Return shallow copies of every stored outcome accepted by `matches(outcome, filter)`. */
  async list(filter = {}) {
    const hits = [];
    for (const item of this.items) {
      if (matches(item, filter)) hits.push({ ...item });
    }
    return hits;
  }
};
5462
/**
 * Outcome store persisted as newline-delimited JSON files inside `options.dir`.
 *
 * New records are appended to `outcomes.ndjson`. Once that file reaches
 * `maxBytes` (default 32 MiB) it is renamed to `outcomes.<timestamp>.ndjson`
 * before the next append. Reads go through an in-memory copy that is built
 * lazily on first use and kept current by later `append` calls.
 */
var FileSystemOutcomeStore = class {
  dir;
  maxBytes;
  memo;
  loaded = false;

  /**
   * @param {{ dir: string, maxBytes?: number }} options - target directory and
   *   optional rotation threshold in bytes.
   */
  constructor(options) {
    this.dir = options.dir;
    this.maxBytes = options.maxBytes ?? 32 * 1024 * 1024;
  }

  /** Create the storage directory (and any missing parents). */
  async ensureDir() {
    const fs = await import("fs/promises");
    await fs.mkdir(this.dir, { recursive: true });
  }

  /**
   * Append one outcome as a JSON line, rotating the active file first when it
   * has reached `maxBytes`.
   */
  async append(outcome) {
    await this.ensureDir();
    const fs = await import("fs/promises");
    const path = await import("path");
    const active = path.join(this.dir, "outcomes.ndjson");
    try {
      const stat = await fs.stat(active);
      if (stat.size >= this.maxBytes) {
        await fs.rename(active, path.join(this.dir, `outcomes.${Date.now()}.ndjson`));
      }
    } catch {
      // Rotation is deliberately best-effort: the active file may simply not
      // exist yet, and a failed rename must not prevent the append below.
    }
    await fs.appendFile(active, JSON.stringify(outcome) + "\n", "utf8");
    if (this.memo) await this.memo.append(outcome);
  }

  /**
   * Build (once) and return the in-memory view of every `*.ndjson` file.
   *
   * Loading stays best-effort, but failures are isolated: an unreadable file
   * or a malformed line is skipped on its own instead of silently discarding
   * everything that would have been read after it. Files are visited in
   * sorted name order so the result does not depend on directory listing
   * order; rotated files (`outcomes.<digits>.ndjson`) sort before the active
   * `outcomes.ndjson`.
   */
  async load() {
    if (this.loaded && this.memo) return this.memo;
    const fs = await import("fs/promises");
    const path = await import("path");
    const memo = new InMemoryOutcomeStore();
    let entries = [];
    try {
      entries = await fs.readdir(this.dir);
    } catch {
      // Missing or unreadable directory: treated as an empty store.
    }
    const files = entries.filter((name) => name.endsWith(".ndjson")).sort();
    for (const file of files) {
      let content;
      try {
        content = await fs.readFile(path.join(this.dir, file), "utf8");
      } catch {
        continue; // e.g. the file was rotated away between readdir and read
      }
      for (const line of content.split("\n")) {
        if (!line.trim()) continue;
        let record;
        try {
          record = JSON.parse(line);
        } catch {
          continue; // torn or corrupt line: skip it, keep the rest
        }
        await memo.append(record);
      }
    }
    this.memo = memo;
    this.loaded = true;
    return memo;
  }

  /** Outcomes recorded for `runId`. */
  async forRun(runId) {
    return (await this.load()).forRun(runId);
  }

  /** Outcomes accepted by `filter` (delegates to the in-memory store). */
  async list(filter) {
    return (await this.load()).list(filter);
  }
};
5518
/**
 * Decide whether outcome `o` satisfies filter `f`.
 *
 * Every criterion present on the filter must hold: `runIds` (membership),
 * `since` / `until` (inclusive bounds on `capturedAt`), `source` (equality)
 * and `label` (`o.labels[label.key]` must equal `label.value`). An empty
 * filter accepts everything.
 */
function matches(o, f) {
  const { runIds, since, until, source, label } = f;
  const runAllowed = !runIds || runIds.includes(o.runId);
  const notTooEarly = since === void 0 || !(o.capturedAt < since);
  const notTooLate = until === void 0 || !(o.capturedAt > until);
  const sourceAgrees = !source || o.source === source;
  const labelAgrees = !label || o.labels?.[label.key] === label.value;
  return runAllowed && notTooEarly && notTooLate && sourceAgrees && labelAgrees;
}
5526
+
5527
+ // src/meta-eval/correlation-study.ts
5528
/**
 * Value of `metric` on the most recently captured outcome that carries a
 * finite number for it, or `null` when no outcome does. On equal timestamps
 * the earlier list entry wins.
 */
function latestOutcomeValue(outcomes, metric) {
  let best = null;
  for (const o of outcomes) {
    const v = o.metrics[metric];
    if (typeof v !== "number" || !Number.isFinite(v)) continue;
    if (best === null || o.capturedAt > best.capturedAt) {
      best = { capturedAt: o.capturedAt, value: v };
    }
  }
  return best === null ? null : best.value;
}

/**
 * Correlate eval metrics against downstream outcome metrics, run by run.
 *
 * For every run that has at least one eligible outcome (captured no later
 * than `options.maxCaptureLagMs` after `run.startedAt`), each eval metric is
 * extracted as x and each outcome metric is reduced to a single y. Pairs with
 * at least 3 samples are reported with Pearson r, Spearman rho (Pearson on
 * ranks), a bootstrap CI and a strong/moderate/weak verdict.
 *
 * The "latest" reduction (the default, and the fallback for any reduction
 * other than "mean"/"max") takes the most recent outcome that actually
 * reports the requested metric. Previously it was resolved through the first
 * key of each outcome's `metrics` object, which produced a value of the wrong
 * metric whenever outcomes carried more than one.
 *
 * @param traceStore store exposing `listRuns()`
 * @param outcomeStore store exposing `list(filter)`
 * @param evalMetrics entries of shape `{ id, extract? }`
 * @param outcomeMetricNames names looked up in `outcome.metrics`
 * @param options `outcomeFilter`, `reduction`, `maxCaptureLagMs`, `bootstrapIterations`
 * @returns `{ pairs, joinedSamples, skippedRuns }`
 */
async function correlationStudy(traceStore, outcomeStore, evalMetrics, outcomeMetricNames, options = {}) {
  const runs = await traceStore.listRuns();
  const outcomes = await outcomeStore.list(options.outcomeFilter);
  const outcomesByRun = /* @__PURE__ */ new Map();
  for (const o of outcomes) {
    const arr = outcomesByRun.get(o.runId) ?? [];
    arr.push(o);
    outcomesByRun.set(o.runId, arr);
  }
  const reduction = options.reduction ?? "latest";
  const maxLag = options.maxCaptureLagMs ?? Infinity;
  const pairs = [];
  // evalMetric id -> outcome metric -> pair; avoids a linear scan per sample.
  const pairLookup = /* @__PURE__ */ new Map();
  for (const em of evalMetrics) {
    let byOutcome = pairLookup.get(em.id);
    if (!byOutcome) {
      byOutcome = /* @__PURE__ */ new Map();
      pairLookup.set(em.id, byOutcome);
    }
    for (const om of outcomeMetricNames) {
      const pair = { evalMetric: em.id, outcomeMetric: om, xs: [], ys: [] };
      pairs.push(pair);
      // With duplicate ids/names the first pair collects the samples.
      if (!byOutcome.has(om)) byOutcome.set(om, pair);
    }
  }
  let joined = 0;
  let skipped = 0;
  for (const run of runs) {
    const os = outcomesByRun.get(run.runId);
    if (!os || os.length === 0) {
      skipped++;
      continue;
    }
    const eligible = os.filter((o) => o.capturedAt - run.startedAt <= maxLag);
    if (eligible.length === 0) {
      skipped++;
      continue;
    }
    for (const em of evalMetrics) {
      const extract = em.extract ?? defaultExtract3(em.id);
      const x = await extract(run, traceStore);
      if (x === null || !Number.isFinite(x)) continue;
      for (const om of outcomeMetricNames) {
        const values = eligible.map((o) => o.metrics[om]).filter((v) => typeof v === "number" && Number.isFinite(v));
        if (values.length === 0) continue;
        const y = reduction === "mean" || reduction === "max" ? reduce(values, reduction, eligible) : latestOutcomeValue(eligible, om);
        if (y === null) continue;
        const pair = pairLookup.get(em.id).get(om);
        pair.xs.push(x);
        pair.ys.push(y);
      }
    }
    joined++;
  }
  const results = pairs.filter((p) => p.xs.length >= 3).map((p) => {
    const pearson2 = pearsonR3(p.xs, p.ys);
    const spearman = pearsonR3(ranks2(p.xs), ranks2(p.ys));
    const pearsonCi95 = bootstrapPearsonCi(p.xs, p.ys, options.bootstrapIterations ?? 500);
    const verdict = Math.abs(pearson2) >= 0.7 ? "strong" : Math.abs(pearson2) >= 0.4 ? "moderate" : "weak";
    return { evalMetric: p.evalMetric, outcomeMetric: p.outcomeMetric, n: p.xs.length, pearson: pearson2, spearman, pearsonCi95, verdict };
  });
  return { pairs: results, joinedSamples: joined, skippedRuns: skipped };
}
5583
/**
 * Collapse one run's outcome values into a single number.
 *
 * @param values finite values of the metric being reduced
 * @param kind "mean", "max", or anything else for "latest"
 * @param outcomes the outcome records the values came from
 * @param metric optional name of the metric being reduced. When given, the
 *   "latest" reduction returns that metric's value from the most recently
 *   captured outcome that reports a finite number for it. When omitted, the
 *   legacy lookup is used, which keys off the first entry of each outcome's
 *   `metrics` object and is therefore only reliable for single-metric outcomes.
 * @returns the reduced value, or `null` when nothing can be reduced
 */
function reduce(values, kind, outcomes, metric) {
  if (values.length === 0) return null;
  if (kind === "mean") return values.reduce((a, b) => a + b, 0) / values.length;
  if (kind === "max") return Math.max(...values);
  if (metric !== void 0) {
    let best = null;
    for (const o of outcomes) {
      const candidate = o.metrics?.[metric];
      if (typeof candidate !== "number" || !Number.isFinite(candidate)) continue;
      // Strict ">" keeps the earlier entry on equal timestamps.
      if (best === null || o.capturedAt > best.at) best = { at: o.capturedAt, v: candidate };
    }
    return best === null ? null : best.v;
  }
  // Legacy path, kept unchanged for callers that do not pass `metric`.
  const latest = [...outcomes].sort((a, b) => b.capturedAt - a.capturedAt)[0];
  const v = latest?.metrics[Object.keys(latest.metrics)[0]];
  const paired = outcomes.map((o) => ({ at: o.capturedAt, v: values.find((x) => o.metrics[Object.keys(o.metrics)[0]] === x) })).filter((p) => p.v !== void 0);
  if (paired.length === 0) return v ?? null;
  return paired.sort((a, b) => b.at - a.at)[0].v ?? null;
}
5593
/**
 * Pearson correlation coefficient of two equally long numeric arrays.
 *
 * Returns NaN when the lengths differ or fewer than 2 points are given.
 * When a series has zero variance the coefficient is undefined; this
 * implementation returns 1 if both series are constant and 0 if only one is.
 */
function pearsonR3(a, b) {
  const count = a.length;
  if (b.length !== count || count < 2) return NaN;
  const average = (series) => series.reduce((acc, v) => acc + v, 0) / series.length;
  const meanA = average(a);
  const meanB = average(b);
  let cross = 0;
  let squaresA = 0;
  let squaresB = 0;
  let idx = 0;
  while (idx < count) {
    const devA = a[idx] - meanA;
    const devB = b[idx] - meanB;
    cross += devA * devB;
    squaresA += devA * devA;
    squaresB += devB * devB;
    idx += 1;
  }
  const flatA = squaresA === 0;
  const flatB = squaresB === 0;
  if (flatA || flatB) return flatA && flatB ? 1 : 0;
  return cross / Math.sqrt(squaresA * squaresB);
}
5607
/**
 * Convert values to 1-based ranks, in the original element order.
 * Tied values all receive the average of the rank positions they occupy.
 */
function ranks2(xs) {
  const ordered = xs.map((v, i) => ({ v, i })).sort((x, y) => x.v - y.v);
  const result = new Array(xs.length);
  let start = 0;
  while (start < ordered.length) {
    // Extend [start, end] across the run of values equal to ordered[start].
    let end = start;
    while (end + 1 < ordered.length && ordered[end + 1].v === ordered[start].v) end += 1;
    const sharedRank = (start + end + 2) / 2;
    for (let pos = start; pos <= end; pos += 1) {
      result[ordered[pos].i] = sharedRank;
    }
    start = end + 1;
  }
  return result;
}
5619
/**
 * Percentile-bootstrap 95% interval for the Pearson correlation of (xs, ys).
 *
 * Draws `iterations` resamples of the paired data with replacement (using
 * `Math.random`), keeps the finite correlations, and reads the 2.5% and 97.5%
 * positions of the sorted results. Returns NaN bounds when there are fewer
 * than 3 points or no finite resample correlation was produced.
 */
function bootstrapPearsonCi(xs, ys, iterations) {
  const size = xs.length;
  if (size < 3) return { lower: NaN, upper: NaN };
  const correlations = [];
  for (let round = 0; round < iterations; round += 1) {
    // One random draw per position; the same index is used for x and y so
    // the pairing is preserved.
    const picks = Array.from({ length: size }, () => Math.floor(Math.random() * size));
    const sampleX = picks.map((p) => xs[p]);
    const sampleY = picks.map((p) => ys[p]);
    const r = pearsonR3(sampleX, sampleY);
    if (Number.isFinite(r)) correlations.push(r);
  }
  correlations.sort((a, b) => a - b);
  const kept = correlations.length;
  if (kept === 0) return { lower: NaN, upper: NaN };
  const lowerPos = Math.floor(0.025 * kept);
  const upperPos = Math.min(kept - 1, Math.floor(0.975 * kept));
  return { lower: correlations[lowerPos], upper: correlations[upperPos] };
}
5638
/**
 * Build the default extractor for a well-known eval metric id.
 *
 * The returned async function takes `(run, store)` and yields:
 *  - "score" / "overallScore": `run.outcome.score`, or null when absent
 *  - "pass": 1 when `run.outcome.pass` is exactly true, otherwise 0
 *  - "durationMs": `endedAt - startedAt` when both are truthy, else null
 *  - "costUsd" / "inputTokens": the matching field of `aggregateLlm` over the
 *    run's LLM spans
 *  - any other id: null
 */
function defaultExtract3(metric) {
  return async (run, store) => {
    if (metric === "score" || metric === "overallScore") {
      return run.outcome?.score ?? null;
    }
    if (metric === "pass") {
      return run.outcome?.pass === true ? 1 : 0;
    }
    if (metric === "durationMs") {
      if (run.endedAt && run.startedAt) return run.endedAt - run.startedAt;
      return null;
    }
    if (metric === "costUsd" || metric === "inputTokens") {
      const spans = await llmSpans(store, run.runId);
      const totals = aggregateLlm(spans);
      return metric === "costUsd" ? totals.costUsd : totals.inputTokens;
    }
    return null;
  };
}
5661
+
5662
+ // src/meta-eval/calibration.ts
5663
/**
 * Build a calibration curve of an eval metric against an outcome metric.
 *
 * Each run contributes one (x, y) pair: x from the eval metric extractor, y
 * from `outcomeMetric` on the run's most recently captured outcome. Pairs are
 * grouped into bins and each bin reports its eval mean, outcome mean and the
 * absolute gap between them.
 *
 * Binning:
 *  - "equal-width" (default): `bins` intervals of equal width over
 *    `[range.lo, range.hi]` (defaulting to the observed min/max of x); empty
 *    bins are omitted and the last bin includes its upper edge.
 *  - "equal-frequency": the x-sorted pairs are split into at most `bins`
 *    contiguous groups whose sizes differ by at most one. (The previous
 *    chunking by `floor(n / bins)` produced more bins than requested whenever
 *    `n` was not a multiple of `bins`.)
 *
 * @param options `bins` (default 10, clamped to an integer >= 1), `binning`, `range`
 * @returns `{ evalMetric, outcomeMetric, n, bins, ece, maxGap }`, or `null`
 *   when fewer than 2 pairs exist or the equal-width range has zero width
 */
async function calibrationCurve(traceStore, outcomeStore, evalMetric, outcomeMetric, options = {}) {
  const runs = await traceStore.listRuns();
  const outcomes = await outcomeStore.list();
  const byRun = /* @__PURE__ */ new Map();
  for (const o of outcomes) {
    const arr = byRun.get(o.runId) ?? [];
    arr.push(o);
    byRun.set(o.runId, arr);
  }
  const extract = evalMetric.extract ?? defaultExtract4(evalMetric.id);
  const pairs = [];
  for (const run of runs) {
    const os = byRun.get(run.runId);
    if (!os?.length) continue;
    const x = await extract(run, traceStore);
    if (x === null || !Number.isFinite(x)) continue;
    const latest = [...os].sort((a, b) => b.capturedAt - a.capturedAt)[0];
    const y = latest.metrics[outcomeMetric];
    if (typeof y !== "number" || !Number.isFinite(y)) continue;
    pairs.push({ x, y });
  }
  if (pairs.length < 2) return null;
  const requestedBins = Math.floor(options.bins ?? 10);
  const numBins = Number.isFinite(requestedBins) && requestedBins >= 1 ? requestedBins : 1;
  const binning = options.binning ?? "equal-width";
  // Folded min/max instead of Math.min(...xs): spreading a very large array
  // into call arguments can exceed the engine's argument limit.
  let observedLo = Infinity;
  let observedHi = -Infinity;
  for (const p of pairs) {
    if (p.x < observedLo) observedLo = p.x;
    if (p.x > observedHi) observedHi = p.x;
  }
  const lo = options.range?.lo ?? observedLo;
  const hi = options.range?.hi ?? observedHi;
  const bins = [];
  if (binning === "equal-frequency") {
    const sorted = [...pairs].sort((a, b) => a.x - b.x);
    const groups = Math.min(numBins, sorted.length);
    for (let i = 0; i < groups; i++) {
      // Proportional boundaries: never more than `groups` chunks, never empty.
      const start = Math.floor(i * sorted.length / groups);
      const end = Math.floor((i + 1) * sorted.length / groups);
      bins.push(toBin(sorted.slice(start, end)));
    }
  } else {
    const width = (hi - lo) / numBins;
    if (width === 0) return null;
    for (let i = 0; i < numBins; i++) {
      const binLo = lo + i * width;
      // Nudge the last upper edge so x === hi lands in the final bin.
      const binHi = i === numBins - 1 ? hi + 1e-9 : lo + (i + 1) * width;
      const chunk = pairs.filter((p) => p.x >= binLo && p.x < binHi);
      if (chunk.length === 0) continue;
      bins.push(toBin(chunk, binLo, binHi));
    }
  }
  const total = bins.reduce((a, b) => a + b.n, 0);
  // Expected calibration error: bin gaps weighted by bin population.
  const ece = bins.reduce((a, b) => a + b.n / total * b.gap, 0);
  const maxGap = bins.reduce((a, b) => Math.max(a, b.gap), 0);
  return { evalMetric: evalMetric.id, outcomeMetric, n: pairs.length, bins, ece, maxGap };
}
5715
/**
 * Summarise one calibration bin.
 *
 * @param chunk the `{ x, y }` pairs that fell into the bin
 * @param lower optional lower edge; defaults to the smallest x in the chunk
 * @param upper optional upper edge; defaults to the largest x in the chunk
 * @returns bin edges, sample count, mean of x, mean of y, and the absolute
 *   difference between the two means
 */
function toBin(chunk, lower, upper) {
  const evalValues = [];
  const outcomeValues = [];
  for (const point of chunk) {
    evalValues.push(point.x);
    outcomeValues.push(point.y);
  }
  const evalMean = mean2(evalValues);
  const outcomeMean = mean2(outcomeValues);
  const lowerEdge = lower ?? Math.min(...evalValues);
  const upperEdge = upper ?? Math.max(...evalValues);
  const gap = Math.abs(outcomeMean - evalMean);
  return { lower: lowerEdge, upper: upperEdge, n: chunk.length, evalMean, outcomeMean, gap };
}
5729
/** Arithmetic mean of `xs`; an empty array yields NaN (0 / 0). */
function mean2(xs) {
  let total = 0;
  xs.forEach((value) => {
    total += value;
  });
  return total / xs.length;
}
5732
/**
 * Default extractor used by the calibration curve when an eval metric does
 * not supply its own `extract`.
 *
 * For metric "pass" the result is 1 when `run.outcome.pass` is exactly true
 * and 0 otherwise. For every other metric id it is `run.outcome.score`, or
 * null when the run has no score.
 *
 * Previously the score was consulted first for every metric, so "pass"
 * returned the raw score whenever one existed and the pass indicator was only
 * reached for runs without a score.
 */
function defaultExtract4(metric) {
  return async (run) => {
    if (metric === "pass") return run.outcome?.pass === true ? 1 : 0;
    return run.outcome?.score ?? null;
  };
}
5735
+
5736
+ // src/prm/rubric.ts
5737
/**
 * Process-reward-model grader: applies a set of rubrics to the steps of a run.
 */
var PrmGrader = class {
  /**
   * @param rubrics non-empty list of rubrics (`{ id, kinds?, weight?, grade }`)
   * @throws {Error} when `rubrics` is empty
   */
  constructor(rubrics) {
    this.rubrics = rubrics;
    if (rubrics.length === 0) throw new Error("PrmGrader: at least 1 rubric required");
  }
  rubrics;

  /**
   * Grade every eligible step of a run.
   *
   * A rubric is applied to a step when it declares no `kinds` or its `kinds`
   * include the step's span kind; a rubric may still abstain by returning
   * `null`. Each verdict is recorded in the result and also emitted through
   * `TraceEmitter.recordJudge` under judge id `prm:<rubricId>`, so that
   * judge-oriented tooling can pick it up.
   *
   * @returns `{ runId, steps, aggregateScore, gradedCount, ungradedCount }`
   *   where `aggregateScore` is the weight-averaged score (0 when nothing was
   *   graded) and `ungradedCount` counts steps no rubric produced a verdict for
   */
  async grade(store, runId) {
    const trajectory = await buildTrajectory(store, runId);
    const emitter = new TraceEmitter(store, { runId });
    const steps = [];
    let ungraded = 0;
    for (const [position, step] of trajectory.steps.entries()) {
      const ctx = {
        trajectory,
        step,
        prior: trajectory.steps.slice(0, position),
        next: trajectory.steps.slice(position + 1)
      };
      const gradedBefore = steps.length;
      for (const rubric of this.rubrics) {
        const applies = !rubric.kinds || rubric.kinds.includes(step.span.kind);
        if (!applies) continue;
        const verdict = await rubric.grade(ctx);
        if (verdict === null) continue;
        const { score, rationale, evidence } = verdict;
        const judgeName = `prm:${rubric.id}`;
        steps.push({
          spanId: step.span.spanId,
          rubricId: rubric.id,
          score,
          weight: rubric.weight ?? 1,
          rationale,
          evidence
        });
        await emitter.recordJudge({
          judgeId: judgeName,
          targetSpanId: step.span.spanId,
          dimension: "step_quality",
          score,
          rationale,
          evidence,
          name: judgeName
        });
      }
      // No new entries means every rubric skipped or abstained on this step.
      if (steps.length === gradedBefore) ungraded += 1;
    }
    let totalWeight = 0;
    let weightedSum = 0;
    for (const graded of steps) {
      totalWeight += graded.weight;
      weightedSum += graded.score * graded.weight;
    }
    const aggregateScore = totalWeight === 0 ? 0 : weightedSum / totalWeight;
    return { runId, steps, aggregateScore, gradedCount: steps.length, ungradedCount: ungraded };
  }
};
5793
/** True when the verdict's `judgeId` carries the "prm:" prefix. */
function isPrmVerdict(verdict) {
  const { judgeId } = verdict;
  return judgeId.startsWith("prm:");
}
5796
+
5797
+ // src/prm/builtin-rubrics.ts
5798
/**
 * Rubric for "llm" steps that scores output length.
 *
 * Empty output scores 0. Output shorter than `minChars` (default 20) scores
 * proportionally to its length; output longer than `maxChars` (default 8000)
 * loses score linearly with the overshoot, floored at 0; anything in between
 * scores 1. Default weight is 0.5.
 */
function outputLengthRubric(args = {}) {
  const shortest = args.minChars ?? 20;
  const longest = args.maxChars ?? 8e3;
  const grade = async ({ step }) => {
    const text = step.span.output ?? "";
    const size = text.length;
    if (size === 0) {
      return { score: 0, rationale: "empty output" };
    }
    if (size < shortest) {
      return { score: Math.max(0, size / shortest), rationale: `below min (${size} < ${shortest})` };
    }
    if (size > longest) {
      const overshoot = (size - longest) / longest;
      return { score: Math.max(0, 1 - overshoot), rationale: `above max (${size} > ${longest})` };
    }
    return { score: 1, rationale: `${size} chars in bounds` };
  };
  return {
    id: "output-length",
    kinds: ["llm"],
    weight: args.weight ?? 0.5,
    grade
  };
}
5815
/**
 * Rubric for "tool" steps that scores whether the call produced something.
 *
 * Scores: 0 for a span with status "error", 0.3 for a null/undefined result,
 * 0.5 when the result (as text) is shorter than 4 characters, 1 otherwise.
 * Default weight is 1.
 */
function toolSuccessRubric(args = {}) {
  const grade = async ({ step }) => {
    const call = step.span;
    if (call.status === "error") {
      return { score: 0, rationale: `error: ${call.error ?? "unknown"}` };
    }
    const payload = call.result;
    if (payload === null || payload === void 0) {
      return { score: 0.3, rationale: "empty result" };
    }
    let rendered;
    if (typeof payload === "string") {
      rendered = payload;
    } else {
      rendered = JSON.stringify(payload);
    }
    if (rendered.length < 4) {
      return { score: 0.5, rationale: "tiny result" };
    }
    return { score: 1, rationale: `${call.toolName} ok` };
  };
  return {
    id: "tool-success",
    kinds: ["tool"],
    weight: args.weight ?? 1,
    grade
  };
}
5831
/**
 * Rubric for "tool" steps that penalises repeating an earlier call.
 *
 * A prior step counts as a duplicate when it is a tool span with the same
 * `toolName` and the same arguments (compared via `stableStringify2`). The
 * score is 1 for a novel call and drops by 0.5 per duplicate, floored at 0.
 * Default weight is 0.5.
 */
function toolNonRedundantRubric(args = {}) {
  const weight = args.weight ?? 0.5;
  const grade = async ({ step, prior }) => {
    const current = step.span;
    let duplicates = 0;
    for (const earlier of prior) {
      if (earlier.span.kind !== "tool") continue;
      const candidate = earlier.span;
      const sameCall = candidate.toolName === current.toolName && stableStringify2(candidate.args) === stableStringify2(current.args);
      if (sameCall) duplicates += 1;
    }
    if (duplicates === 0) {
      return { score: 1, rationale: "novel call" };
    }
    return { score: Math.max(0, 1 - duplicates * 0.5), rationale: `${duplicates} duplicate(s)` };
  };
  return {
    id: "tool-non-redundant",
    kinds: ["tool"],
    weight,
    grade
  };
}
5849
/**
 * Rubric for "llm" steps that scores 0 when the output contains a refusal
 * marker and 1 otherwise.
 *
 * @param args.weight rubric weight (default 1)
 * @param args.markers regular expressions that identify a refusal; defaults
 *   to two case-insensitive patterns for "I can't / cannot / won't / will not"
 *   and "(as an) AI ... can't / cannot"
 */
function nonRefusalRubric(args = {}) {
  const weight = args.weight ?? 1;
  const markers = args.markers ?? [
    /\bi\s+(?:can(?:not|'t)|won't|will\s+not)\b/i,
    /\b(?:as\s+an?\s+)?ai\b.*?\b(?:can't|cannot)\b/i
  ];
  return {
    id: "non-refusal",
    kinds: ["llm"],
    weight,
    async grade({ step }) {
      const llm = step.span;
      const out = llm.output ?? "";
      const refused = markers.some((re) => {
        // A marker with the `g` or `y` flag keeps its match position in
        // `lastIndex` between `test` calls; without a reset, grading the same
        // text twice would alternate between "refused" and "not refused".
        re.lastIndex = 0;
        return re.test(out);
      });
      return refused ? { score: 0, rationale: "refusal marker present" } : { score: 1, rationale: "no refusal" };
    }
  };
}
5867
/**
 * Rubric for "llm" steps that checks whether the model named the tool it is
 * about to use.
 *
 * Looks at the first tool span among the following steps. Abstains (returns
 * null) when there is none; scores 1 when the lower-cased output contains the
 * lower-cased tool name, 0.5 otherwise. Default weight is 0.5.
 */
function toolIntentAlignmentRubric(args = {}) {
  const grade = async ({ step, next }) => {
    const upcoming = next.find((candidate) => candidate.span.kind === "tool");
    if (!upcoming) return null;
    const toolName = upcoming.span.toolName;
    const spoken = (step.span.output ?? "").toLowerCase();
    if (spoken.includes(toolName.toLowerCase())) {
      return { score: 1, rationale: `mentioned "${toolName}" before calling it` };
    }
    return { score: 0.5, rationale: `called "${toolName}" without announcing it` };
  };
  return {
    id: "tool-intent-alignment",
    kinds: ["llm"],
    weight: args.weight ?? 0.5,
    grade
  };
}
5883
/**
 * JSON-like serialisation with object keys emitted in sorted order, so two
 * structurally equal values produce the same string regardless of key
 * insertion order. Primitives and null go through `JSON.stringify`.
 */
function stableStringify2(value) {
  const isContainer = value !== null && typeof value === "object";
  if (!isContainer) {
    return JSON.stringify(value);
  }
  if (Array.isArray(value)) {
    const items = value.map((item) => stableStringify2(item));
    return `[${items.join(",")}]`;
  }
  const members = [];
  for (const key of Object.keys(value).sort()) {
    members.push(`${JSON.stringify(key)}:${stableStringify2(value[key])}`);
  }
  return `{${members.join(",")}}`;
}
5889
+
5890
+ // src/prm/training-export.ts
5891
/**
 * Turn graded runs into flat training samples.
 *
 * For every graded step whose span is found in the run's trajectory, emits
 * one sample holding the score, the rubric id, up to `contextWindow`
 * (default 5) preceding steps rendered as turns (steps that render to null
 * are dropped), and the graded step's own kind and text.
 *
 * Step positions are resolved through a spanId -> index map built once per
 * run, rather than an `indexOf` scan per graded step.
 *
 * @param store trace store passed through to `buildTrajectory`
 * @param graded results shaped like `{ runId, steps: [{ spanId, rubricId, score, rationale, evidence }] }`
 * @param options `contextWindow`: number of preceding steps to include
 * @returns array of training samples
 */
async function exportTrainingData(store, graded, options = {}) {
  const contextWindow = options.contextWindow ?? 5;
  const out = [];
  for (const g of graded) {
    const trajectory = await buildTrajectory(store, g.runId);
    const indexBySpanId = /* @__PURE__ */ new Map();
    // Later steps overwrite earlier ones on a repeated spanId, matching a
    // Map built from [spanId, step] entries.
    trajectory.steps.forEach((s, i) => indexBySpanId.set(s.span.spanId, i));
    for (const gs of g.steps) {
      const idx = indexBySpanId.get(gs.spanId);
      if (idx === void 0) continue;
      const node = trajectory.steps[idx];
      const priorSpans = trajectory.steps.slice(Math.max(0, idx - contextWindow), idx).map((s) => s.span);
      out.push({
        runId: g.runId,
        spanId: gs.spanId,
        rubricId: gs.rubricId,
        score: gs.score,
        context: {
          priorTurns: priorSpans.map(spanToTurn).filter((t) => t !== null),
          step: { kind: node.span.kind, text: spanToText(node.span) }
        },
        rationale: gs.rationale,
        evidence: gs.evidence
      });
    }
  }
  return out;
}
5918
/**
 * Serialise samples as newline-delimited JSON: one JSON document per line,
 * each line terminated by "\n".
 *
 * An empty input yields an empty string. Previously it yielded a lone "\n",
 * i.e. a blank record line.
 */
function toNdjson(samples) {
  if (samples.length === 0) return "";
  return samples.map((s) => JSON.stringify(s)).join("\n") + "\n";
}
5921
/**
 * Render a span as a conversation turn for training context.
 *
 * LLM spans become an "assistant" turn holding the output, or, when the
 * output is null/undefined, the span's messages joined as "role: content"
 * lines. Tool spans become a "tool" turn of the form "name(args) → result".
 * Any other span yields null.
 */
function spanToTurn(span) {
  if (isLlmSpan(span)) {
    const transcript = () => span.messages.map((m) => `${m.role}: ${m.content}`).join("\n");
    const content = span.output ?? transcript();
    return { role: "assistant", content };
  }
  if (!isToolSpan(span)) {
    return null;
  }
  const renderedArgs = safeStringify(span.args);
  const renderedResult = safeStringify(span.result);
  return {
    role: "tool",
    content: `${span.toolName}(${renderedArgs}) \u2192 ${renderedResult}`
  };
}
5934
/**
 * Plain-text rendering of a span: the output of an LLM span (empty string
 * when absent), "name(args) → result" for a tool span, and the span's `name`
 * for anything else.
 */
function spanToText(span) {
  if (isLlmSpan(span)) {
    return span.output ?? "";
  }
  if (isToolSpan(span)) {
    const renderedArgs = safeStringify(span.args);
    const renderedResult = safeStringify(span.result);
    return `${span.toolName}(${renderedArgs}) \u2192 ${renderedResult}`;
  }
  return span.name;
}
5939
/**
 * Convert any value to a string without throwing on unserialisable input.
 *
 * null/undefined become "", strings pass through, everything else is
 * JSON-encoded. When JSON encoding throws (cycles, BigInt) or yields
 * `undefined` (functions, symbols), `String(v)` is used instead, so the
 * result is always a string. Previously the `undefined` case leaked through
 * as a non-string return value.
 */
function safeStringify(v) {
  if (v === null || v === void 0) return "";
  if (typeof v === "string") return v;
  try {
    return JSON.stringify(v) ?? String(v);
  } catch {
    return String(v);
  }
}
5948
+
5949
+ // src/prm/inference.ts
5950
/**
 * Grade every candidate run with one grader and pick the best.
 *
 * @returns `{ winner, ranked, stdDev }`: candidates sorted by descending
 *   `aggregateScore`, the top one, and the population standard deviation of
 *   the scores
 * @throws {Error} when `runIds` is empty
 */
async function prmBestOfN(store, grader, runIds) {
  if (runIds.length === 0) throw new Error("prmBestOfN: at least 1 candidate required");
  const pending = runIds.map((id) => grader.grade(store, id));
  const graded = await Promise.all(pending);
  const ranked = graded.slice().sort((a, b) => b.aggregateScore - a.aggregateScore);
  let scoreSum = 0;
  for (const g of graded) scoreSum += g.aggregateScore;
  const average = scoreSum / graded.length;
  let squaredDeviation = 0;
  for (const g of graded) squaredDeviation += (g.aggregateScore - average) ** 2;
  const stdDev = Math.sqrt(squaredDeviation / graded.length);
  return { winner: ranked[0], ranked, stdDev };
}
5958
/**
 * Rank candidate runs by Borda count across several graders.
 *
 * Each grader ranks all candidates by its own `aggregateScore`; a candidate
 * earns `N - rank` points per grader (N candidates, rank 0 = best). The final
 * order is by total points. The graded objects returned in `ranked` / `winner`
 * and the reported `stdDev` come from the first grader's results.
 *
 * @throws {Error} when `graders` is empty
 * @throws {Error} when `runIds` is empty. Without this guard the function
 *   returned `winner: undefined` and `stdDev: NaN`; `prmBestOfN` already
 *   rejects an empty candidate list.
 */
async function prmEnsembleBestOfN(store, graders, runIds) {
  if (graders.length === 0) throw new Error("prmEnsembleBestOfN: at least 1 grader");
  if (runIds.length === 0) throw new Error("prmEnsembleBestOfN: at least 1 candidate required");
  const perGrader = await Promise.all(
    graders.map(async (g) => {
      const graded = await Promise.all(runIds.map((id) => g.grade(store, id)));
      return graded.sort((a, b) => b.aggregateScore - a.aggregateScore);
    })
  );
  const bordaScores = /* @__PURE__ */ new Map();
  for (const ranking of perGrader) {
    ranking.forEach((g, rank) => {
      bordaScores.set(g.runId, (bordaScores.get(g.runId) ?? 0) + (ranking.length - rank));
    });
  }
  // The first grader's results are the ones handed back to the caller.
  const canonical = perGrader[0];
  const byRun = new Map(canonical.map((g) => [g.runId, g]));
  const ranked = [...byRun.values()].sort(
    (a, b) => (bordaScores.get(b.runId) ?? 0) - (bordaScores.get(a.runId) ?? 0)
  );
  const mean3 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
  const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean3) ** 2, 0) / ranked.length;
  return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
}
5981
+
5982
+ // src/bisector.ts
5983
/**
 * Generic bisection between a known-good and a known-bad state.
 *
 * Both endpoints are evaluated first; if the "good" state fails or the "bad"
 * state passes, the search is abandoned with `inputInconsistent: true`.
 * Otherwise `options.halfway(good, bad)` is asked for a midpoint repeatedly:
 * a passing midpoint becomes the new good bound, a failing one the new bad
 * bound. The search converges when `halfway` returns null or a state equal to
 * either bound, and gives up unconverged after `maxIterations` (default 40)
 * midpoints.
 *
 * @param options `{ good, bad, runEval, halfway, equals?, maxIterations? }`;
 *   `equals` defaults to `Object.is`
 * @returns `{ culprit, path, converged, inputInconsistent }`, where `path`
 *   lists every evaluated state together with its verdict fields
 */
async function bisect(options) {
  const sameState = options.equals ?? ((a, b) => Object.is(a, b));
  const iterationCap = options.maxIterations ?? 40;
  const path = [];
  const evaluate = async (state) => {
    const verdict = await options.runEval(state);
    path.push({ state, ...verdict });
    return verdict;
  };
  const finish = (culprit, converged, inputInconsistent) => ({ culprit, path, converged, inputInconsistent });

  const goodVerdict = await evaluate(options.good);
  const badVerdict = await evaluate(options.bad);
  if (!goodVerdict.pass) return finish(options.good, false, true);
  if (badVerdict.pass) return finish(options.bad, false, true);

  let lastPassing = options.good;
  let firstFailing = options.bad;
  for (let round = 0; round < iterationCap; round += 1) {
    const candidate = options.halfway(lastPassing, firstFailing);
    const exhausted = candidate === null || sameState(candidate, lastPassing) || sameState(candidate, firstFailing);
    if (exhausted) return finish(firstFailing, true, false);
    const verdict = await evaluate(candidate);
    if (verdict.pass) {
      lastPassing = candidate;
    } else {
      firstFailing = candidate;
    }
  }
  return finish(firstFailing, false, false);
}
6011
/**
 * Bisect over an ordered commit list.
 *
 * Validates that both SHAs are in `options.commits` and that `good` comes
 * before `bad`, then delegates to `bisect` with a midpoint function that
 * picks the commit halfway between the two bounds (null once they are
 * adjacent).
 *
 * @throws {Error} when either SHA is missing from the list
 * @throws {Error} when `good` does not precede `bad`
 */
async function commitBisect(options) {
  const { commits } = options;
  const positionOf = (sha) => commits.indexOf(sha);
  const goodIdx = positionOf(options.good);
  const badIdx = positionOf(options.bad);
  const bothPresent = goodIdx >= 0 && badIdx >= 0;
  if (!bothPresent) {
    throw new Error(`commitBisect: good or bad SHA not in commit list (good=${options.good}, bad=${options.bad})`);
  }
  if (!(goodIdx < badIdx)) {
    throw new Error("commitBisect: good must precede bad in the commit list");
  }
  const midpointCommit = (g, b) => {
    const from = positionOf(g);
    const to = positionOf(b);
    const adjacent = to - from <= 1;
    return adjacent ? null : commits[Math.floor((from + to) / 2)];
  };
  return bisect({
    good: options.good,
    bad: options.bad,
    runEval: options.runEval,
    maxIterations: options.maxIterations,
    halfway: midpointCommit
  });
}
6034
/**
 * Find which paragraph of a prompt change causes an eval to fail.
 *
 * Both prompts are split into paragraphs (by blank lines unless
 * `options.paragraphSplitter` is given) and must have the same paragraph
 * count, at least 2. A candidate prompt is described by a mask string with
 * one character per paragraph: "0" takes the good paragraph, "1" the bad one.
 * `bisect` searches from the all-"0" mask to the all-"1" mask; each midpoint
 * switches the first half (rounded up) of the still-differing positions to
 * the bad side.
 *
 * @param options `{ good, bad, runEval, paragraphSplitter?, maxIterations? }`;
 *   `maxIterations` defaults to paragraph count + 5
 * @returns the `bisect` result with states rendered back to prompt text
 *   (path entries carry `state`, `score`, `pass`), plus
 *   `offendingParagraphIndex`: the first position where the last passing mask
 *   differs from the culprit mask (undefined when there is none)
 * @throws {Error} on a paragraph-count mismatch or fewer than 2 paragraphs
 */
async function promptBisect(options) {
  const splitParagraphs = options.paragraphSplitter ?? ((p) => p.split(/\n\s*\n/));
  const joinParagraphs = (paragraphs) => paragraphs.join("\n\n");
  const goodParas = splitParagraphs(options.good);
  const badParas = splitParagraphs(options.bad);
  if (goodParas.length !== badParas.length) {
    throw new Error(`promptBisect: paragraph count mismatch (${goodParas.length} vs ${badParas.length})`);
  }
  if (goodParas.length < 2) {
    throw new Error("promptBisect: need at least 2 paragraphs to bisect");
  }
  const n = goodParas.length;

  // Mask -> prompt text: "1" selects the bad paragraph at that position.
  const render = (mask) =>
    joinParagraphs(mask.split("").map((bit, pos) => (bit === "1" ? badParas[pos] : goodParas[pos])));

  const midpointMask = (g, b) => {
    const differing = [];
    for (let pos = 0; pos < g.length; pos += 1) {
      if (g[pos] !== b[pos]) differing.push(pos);
    }
    // Zero or one differing position: nothing left to split.
    if (differing.length <= 1) return null;
    const chars = g.split("");
    for (const pos of differing.slice(0, Math.ceil(differing.length / 2))) {
      chars[pos] = b[pos];
    }
    return chars.join("");
  };

  const result = await bisect({
    good: "0".repeat(n),
    bad: "1".repeat(n),
    runEval: (mask) => options.runEval(render(mask)),
    maxIterations: options.maxIterations ?? n + 5,
    halfway: midpointMask,
    equals: (a, b) => a === b
  });

  const culprit = result.culprit;
  const passing = result.path.filter((s) => s.pass);
  const lastGood = passing[passing.length - 1];
  let offendingParagraphIndex;
  if (lastGood) {
    let pos = 0;
    while (pos < n && lastGood.state[pos] === culprit[pos]) pos += 1;
    if (pos < n) offendingParagraphIndex = pos;
  }

  const materializedPath = result.path.map(({ state, score, pass }) => ({
    state: render(state),
    score,
    pass
  }));
  return {
    culprit: render(culprit),
    path: materializedPath,
    converged: result.converged,
    inputInconsistent: result.inputInconsistent,
    offendingParagraphIndex
  };
}
6097
+
6098
+ // src/counterfactual.ts
6099
/**
 * Re-executes a recorded run with one step mutated and reports the score delta.
 *
 * Loads the original run and its trajectory, applies `mutation` to the step at
 * `mutation.at`, opens a new run via a `TraceEmitter` (tagged as a
 * counterfactual and parented to the original), and hands the unmodified
 * prefix plus the mutated step to `runner.executeFrom`.
 *
 * @param {object} store - Trace store; `getRun` is called here and the store
 *   is also passed to `buildTrajectory` and `TraceEmitter`.
 * @param {string} originalRunId - Id of the run to branch from.
 * @param {object} mutation - Must carry `kind` and a step index `at`.
 * @param {object} runner - Must expose `executeFrom(context, emitter)`.
 * @returns {Promise<object>} `{ counterfactualRunId, originalRunId, mutation, delta }`.
 *   `delta.deltaScore` is counterfactual minus original, or `null` unless
 *   both outcome scores are numbers.
 * @throws {Error} If the run is not found or `mutation.at` is out of range.
 */
async function runCounterfactual(store, originalRunId, mutation, runner) {
  const originalRun = await store.getRun(originalRunId);
  if (!originalRun) throw new Error(`counterfactual: run ${originalRunId} not found`);
  const trajectory = await buildTrajectory(store, originalRunId);
  if (mutation.at < 0 || mutation.at >= trajectory.steps.length) {
    throw new Error(`counterfactual: mutation.at=${mutation.at} out of range [0, ${trajectory.steps.length})`);
  }
  const targetStep = trajectory.steps[mutation.at];
  const mutatedStep = applyMutation(targetStep, mutation);
  const cfEmitter = new TraceEmitter(store);
  await cfEmitter.startRun({
    scenarioId: originalRun.scenarioId,
    variantId: originalRun.variantId ? `${originalRun.variantId}+cf:${mutation.kind}@${mutation.at}` : `cf:${mutation.kind}@${mutation.at}`,
    projectId: originalRun.projectId,
    parentRunId: originalRunId,
    layer: "meta",
    tags: { counterfactual: "true", mutationKind: mutation.kind, mutationAt: String(mutation.at) }
  });
  await runner.executeFrom(
    {
      originalRunId,
      originalTrajectory: trajectory,
      // Steps strictly before the mutation point are passed through unchanged.
      prefix: trajectory.steps.slice(0, mutation.at),
      mutation,
      mutatedStep
    },
    cfEmitter
  );
  const counterfactual = await store.getRun(cfEmitter.runId);
  const originalScore = originalRun.outcome?.score;
  const counterfactualScore = counterfactual?.outcome?.score;
  // Require real numbers on both sides: a `null` score would otherwise be
  // coerced to 0 by the subtraction and produce a fabricated delta.
  const comparable = typeof originalScore === "number" && typeof counterfactualScore === "number";
  const delta = {
    originalOutcomeScore: originalScore ?? null,
    counterfactualOutcomeScore: counterfactualScore ?? null,
    deltaScore: comparable ? counterfactualScore - originalScore : null
  };
  return { counterfactualRunId: cfEmitter.runId, originalRunId, mutation, delta };
}
6135
/**
 * Returns a copy of `step` with `mutation` applied to its span.
 *
 * A built-in mutation only takes effect when the span kind matches
 * ("swap-model" and "inject-system-message" need an "llm" span,
 * "swap-tool-result" needs a "tool" span). A "custom" mutation delegates to
 * `mutation.apply(step)`. Anything else returns the step untouched.
 *
 * @param {object} step - Trajectory step carrying a `span`.
 * @param {object} mutation - Mutation descriptor with a `kind`.
 * @returns {object} The mutated step (a shallow copy for built-in kinds) or
 *   the original `step`.
 */
function applyMutation(step, mutation) {
  switch (mutation.kind) {
    case "swap-model":
      if (step.span.kind === "llm") {
        return { ...step, span: { ...step.span, model: mutation.newModel } };
      }
      break;
    case "swap-tool-result":
      if (step.span.kind === "tool") {
        return { ...step, span: { ...step.span, result: mutation.newResult } };
      }
      break;
    case "inject-system-message":
      if (step.span.kind === "llm") {
        const injected = { role: "system", content: mutation.content };
        // The injected message goes in front of the existing ones.
        return { ...step, span: { ...step.span, messages: [injected, ...step.span.messages] } };
      }
      break;
    case "custom":
      return mutation.apply(step);
    default:
      break;
  }
  return step;
}
6157
/**
 * Summarizes counterfactual results per mutation kind.
 *
 * For each kind, only results whose `delta.deltaScore` is a number are
 * counted. Kinds with no numeric delta are left out.
 *
 * @param {Array<object>} results - Items shaped like the return value of
 *   `runCounterfactual` (`mutation.kind` and `delta.deltaScore` are read).
 * @returns {Array<object>} `{ mutationKind, n, meanAbsDelta, meanSignedDelta }`
 *   entries, sorted by `meanAbsDelta` descending.
 */
function attributeCounterfactuals(results) {
  // Register every kind on first sight so insertion order follows the input.
  const deltasByKind = /* @__PURE__ */ new Map();
  for (const result of results) {
    const kind = result.mutation.kind;
    if (!deltasByKind.has(kind)) deltasByKind.set(kind, []);
    const value = result.delta.deltaScore;
    if (typeof value === "number") deltasByKind.get(kind).push(value);
  }

  const summaries = [];
  deltasByKind.forEach((deltas, kind) => {
    if (deltas.length === 0) return;
    let absTotal = 0;
    let signedTotal = 0;
    for (const d of deltas) {
      absTotal += Math.abs(d);
      signedTotal += d;
    }
    summaries.push({
      mutationKind: kind,
      n: deltas.length,
      meanAbsDelta: absTotal / deltas.length,
      meanSignedDelta: signedTotal / deltas.length
    });
  });
  summaries.sort((left, right) => right.meanAbsDelta - left.meanAbsDelta);
  return summaries;
}
6174
+
6175
+ // src/cross-trace-diff.ts
6176
/**
 * Aligns the trajectories of two runs and attributes per-step differences.
 *
 * @param {object} store - Trace store; `spans` and `getRun` are called here,
 *   and the store is also passed to `buildTrajectory`.
 * @param {string} runA - Id of the baseline run.
 * @param {string} runB - Id of the comparison run.
 * @param {object} [options]
 * @param {Function} [options.stepEquals] - Step equality predicate; defaults
 *   to `defaultStepEquals2`.
 * @returns {Promise<object>} `{ runA, runB, alignment, attributions,
 *   totalScoreDelta, prmDeltaSum }`. `totalScoreDelta` is B minus A, or
 *   `null` unless both outcome scores are numbers. `prmDeltaSum` sums the
 *   per-step `prmDelta` values, counting missing ones as 0.
 */
async function crossTraceDiff(store, runA, runB, options = {}) {
  const [a, b] = await Promise.all([buildTrajectory(store, runA), buildTrajectory(store, runB)]);
  const eq = options.stepEquals ?? defaultStepEquals2;
  const alignment = align(a.steps, b.steps, eq);
  const [judgesA, judgesB] = await Promise.all([
    store.spans({ runId: runA, kind: "judge" }).then((s) => s.filter(isJudgeSpan)),
    store.spans({ runId: runB, kind: "judge" }).then((s) => s.filter(isJudgeSpan))
  ]);
  const prmByTargetA = indexPrmByTarget(judgesA);
  const prmByTargetB = indexPrmByTarget(judgesB);
  const attributions = alignment.map((ao) => attributeStep(ao, prmByTargetA, prmByTargetB));
  const prmDeltaSum = attributions.reduce((acc, at) => acc + (at.prmDelta ?? 0), 0);
  const [runRecA, runRecB] = await Promise.all([store.getRun(runA), store.getRun(runB)]);
  const scoreA = runRecA?.outcome?.score;
  const scoreB = runRecB?.outcome?.score;
  // Require real numbers on both sides: a `null` score would otherwise be
  // coerced to 0 by the subtraction and produce a fabricated delta.
  const totalScoreDelta = typeof scoreA === "number" && typeof scoreB === "number" ? scoreB - scoreA : null;
  return { runA, runB, alignment, attributions, totalScoreDelta, prmDeltaSum };
}
6192
/**
 * Aligns two step sequences using a longest-common-subsequence table.
 *
 * The table is filled with the standard LCS recurrence over `eq`, then walked
 * back from the bottom-right corner to emit edit operations:
 *  - "match"   when the two current steps are equal under `eq`;
 *  - "replace" when they are unequal but share a `span.kind`, and pairing
 *    them does not reduce the number of matches still reachable;
 *  - "delete"  (step only in `a`) or "insert" (step only in `b`) otherwise,
 *    following the larger neighbouring table cell, with ties going to delete.
 *
 * @param {Array<object>} a - Steps of the first trajectory (each has `span.kind`).
 * @param {Array<object>} b - Steps of the second trajectory.
 * @param {Function} eq - `(stepA, stepB) => boolean` equality predicate.
 * @returns {Array<object>} Operations in forward order: `{ op, a?, b? }`.
 */
function align(a, b, eq) {
  const dp = Array.from({ length: a.length + 1 }, () => new Array(b.length + 1).fill(0));
  for (let row = 1; row <= a.length; row++) {
    for (let col = 1; col <= b.length; col++) {
      if (eq(a[row - 1], b[col - 1])) dp[row][col] = dp[row - 1][col - 1] + 1;
      else dp[row][col] = Math.max(dp[row - 1][col], dp[row][col - 1]);
    }
  }
  const ops = [];
  let i = a.length;
  let j = b.length;
  while (i > 0 || j > 0) {
    if (i > 0 && j > 0) {
      const stepA = a[i - 1];
      const stepB = b[j - 1];
      if (eq(stepA, stepB)) {
        ops.push({ op: "match", a: stepA, b: stepB });
        i--;
        j--;
        continue;
      }
      // A diagonal move on unequal steps is only safe when the diagonal cell
      // holds the same LCS length as the current one. Otherwise pairing these
      // two steps would throw away a match that delete/insert still reaches.
      // That condition also implies dp[i-1][j] === dp[i][j-1].
      const diagonalKeepsMatches = dp[i - 1][j - 1] === dp[i][j];
      if (diagonalKeepsMatches && stepA.span.kind === stepB.span.kind) {
        ops.push({ op: "replace", a: stepA, b: stepB });
        i--;
        j--;
        continue;
      }
    }
    if (i > 0 && (j === 0 || dp[i - 1][j] >= dp[i][j - 1])) {
      ops.push({ op: "delete", a: a[i - 1] });
      i--;
    } else {
      ops.push({ op: "insert", b: b[j - 1] });
      j--;
    }
  }
  // Operations were collected back-to-front.
  return ops.reverse();
}
6230
/**
 * Default step equality used when aligning trajectories.
 *
 * Steps of different span kinds are never equal. Tool spans compare by
 * `toolName`, LLM spans by `model`, and every other kind by `name`.
 *
 * @param {object} a - First step (reads `a.span`).
 * @param {object} b - Second step (reads `b.span`).
 * @returns {boolean} Whether the two steps count as the same step.
 */
function defaultStepEquals2(a, b) {
  const left = a.span;
  const right = b.span;
  if (left.kind !== right.kind) return false;
  switch (left.kind) {
    case "tool":
      return left.toolName === right.toolName;
    case "llm":
      return left.model === right.model;
    default:
      return left.name === right.name;
  }
}
6236
/**
 * Sums judge scores per judged span.
 *
 * @param {Array<object>} judges - Judge spans; `targetSpanId` and `score` are read.
 * @returns {Map} Map from `targetSpanId` to the sum of all scores recorded for it.
 */
function indexPrmByTarget(judges) {
  const totals = /* @__PURE__ */ new Map();
  judges.forEach((judge) => {
    const soFar = totals.get(judge.targetSpanId) ?? 0;
    totals.set(judge.targetSpanId, soFar + judge.score);
  });
  return totals;
}
6244
/**
 * Returns the duration of a span, or `null` when either endpoint is missing.
 *
 * A timestamp of exactly `0` is a valid endpoint. Other falsy values
 * (`undefined`, `null`, `NaN`, `""`) count as missing.
 *
 * @param {object} s - Span; `startedAt` and `endedAt` are read.
 * @returns {number|null} `endedAt - startedAt`, or `null`.
 */
function spanLatency(s) {
  // A bare truthiness check would reject 0, which is a legitimate timestamp
  // (for example when times are recorded relative to the start of the run).
  const isRecorded = (t) => t === 0 || Boolean(t);
  if (!isRecorded(s.endedAt) || !isRecorded(s.startedAt)) return null;
  return s.endedAt - s.startedAt;
}
6247
/**
 * Returns the total token count of an LLM span.
 *
 * @param {object} s - Span; `kind`, `inputTokens` and `outputTokens` are read.
 * @returns {number|null} Input plus output tokens (a missing count is taken
 *   as 0), or `null` when the span is not of kind "llm".
 */
function spanTokens(s) {
  const { kind, inputTokens, outputTokens } = s;
  return kind === "llm" ? (inputTokens ?? 0) + (outputTokens ?? 0) : null;
}
6251
/**
 * Builds the attribution record for one alignment operation.
 *
 * - "match": `prmDelta` is B minus A, but only when both sides have a PRM
 *   entry (otherwise `null`). Latency and token deltas are filled in when both
 *   sides provide them.
 * - "replace": `prmDelta` is B minus A, with a missing entry counted as 0.
 * - "insert": `prmDelta` is B's PRM value (0 when missing).
 * - anything else (i.e. "delete"): `prmDelta` is A's PRM value negated.
 * Latency and token deltas are `null` for every non-match operation.
 *
 * @param {object} op - Alignment operation `{ op, a?, b? }`.
 * @param {Map} prmA - PRM score per span id for run A.
 * @param {Map} prmB - PRM score per span id for run B.
 * @returns {object} `{ op, prmDelta, latencyDeltaMs, tokenDelta, note }`.
 */
function attributeStep(op, prmA, prmB) {
  const describe = (span) => `${span.kind}/${span.name}`;
  switch (op.op) {
    case "match": {
      const scoreA = prmA.get(op.a.span.spanId);
      const scoreB = prmB.get(op.b.span.spanId);
      const prmDelta = scoreA !== void 0 && scoreB !== void 0 ? scoreB - scoreA : null;
      const latencyA = spanLatency(op.a.span);
      const latencyB = spanLatency(op.b.span);
      const tokensA = spanTokens(op.a.span);
      const tokensB = spanTokens(op.b.span);
      return {
        op,
        prmDelta,
        latencyDeltaMs: latencyA !== null && latencyB !== null ? latencyB - latencyA : null,
        tokenDelta: tokensA !== null && tokensB !== null ? tokensB - tokensA : null,
        note: prmDelta === null ? "matched step, no PRM coverage" : "matched step, PRM delta recorded"
      };
    }
    case "replace": {
      const scoreA = prmA.get(op.a.span.spanId) ?? 0;
      const scoreB = prmB.get(op.b.span.spanId) ?? 0;
      return {
        op,
        prmDelta: scoreB - scoreA,
        latencyDeltaMs: null,
        tokenDelta: null,
        note: `replaced ${describe(op.a.span)} \u2192 ${describe(op.b.span)}`
      };
    }
    case "insert": {
      const scoreB = prmB.get(op.b.span.spanId) ?? 0;
      return {
        op,
        prmDelta: scoreB,
        latencyDeltaMs: null,
        tokenDelta: null,
        note: `inserted step in B (${describe(op.b.span)})`
      };
    }
    default: {
      // Remaining case: the step exists only in A.
      const scoreA = prmA.get(op.a.span.spanId) ?? 0;
      return {
        op,
        prmDelta: -scoreA,
        latencyDeltaMs: null,
        tokenDelta: null,
        note: `deleted step from A (${describe(op.a.span)})`
      };
    }
  }
}
6298
+
6299
+ // src/pre-registration.ts
6300
/**
 * Returns a copy of the manifest with a `contentHash` attached.
 *
 * The hash is the lowercase hex SHA-256 of the JSON serialization of the
 * manifest after it has been passed through `canonicalize2`.
 *
 * @param {object} m - Manifest to sign.
 * @returns {Promise<object>} `{ ...m, contentHash }`.
 */
async function signManifest(m) {
  const serialized = JSON.stringify(canonicalize2(m));
  const digest = await globalThis.crypto.subtle.digest("SHA-256", new TextEncoder().encode(serialized));
  let hex = "";
  for (const byte of new Uint8Array(digest)) {
    // Two hex digits per byte, zero-padded.
    hex += byte.toString(16).padStart(2, "0");
  }
  return { ...m, contentHash: hex };
}
6307
/**
 * Checks that a signed manifest's `contentHash` matches its content.
 *
 * The hash is stripped, the remainder is re-signed with `signManifest`, and
 * the two hashes are compared.
 *
 * @param {object} m - Manifest carrying a `contentHash`.
 * @returns {Promise<boolean>} `true` when the recomputed hash equals the stored one.
 */
async function verifyManifest(m) {
  const { contentHash: claimed, ...unsigned } = m;
  const { contentHash: recomputed } = await signManifest(unsigned);
  return recomputed === claimed;
}
6312
/**
 * Evaluates observed results against a pre-registered hypothesis manifest.
 *
 * The manifest is verified first. The observation is then checked against
 * four criteria, each contributing a rejection reason when it fails:
 * "wrong_direction", "effect_too_small", "not_significant", "undersampled".
 *
 * @param {object} manifest - Signed manifest; `direction`, `minEffect`,
 *   `alpha` and `preRegisteredN` are read.
 * @param {object} observed - `{ effect, pValue, n }`.
 * @returns {Promise<object>} `{ manifest, observedN, observedEffect,
 *   observedPValue, confirmed, rejectionReasons }`; `confirmed` is true only
 *   when there are no rejection reasons.
 * @throws {Error} If the manifest fails hash verification.
 */
async function evaluateHypothesis(manifest, observed) {
  const intact = await verifyManifest(manifest);
  if (!intact) {
    throw new Error("evaluateHypothesis: manifest content hash mismatch (tampered)");
  }
  const { effect, pValue, n } = observed;
  // Any direction other than "increase" is read as expecting a decrease.
  const movedAsPredicted = manifest.direction === "increase" ? effect > 0 : effect < 0;
  const checks = [
    [!movedAsPredicted, "wrong_direction"],
    [Math.abs(effect) < manifest.minEffect, "effect_too_small"],
    [pValue >= manifest.alpha, "not_significant"],
    [n < manifest.preRegisteredN, "undersampled"]
  ];
  const rejectionReasons = checks.filter(([failed]) => failed).map(([, reason]) => reason);
  return {
    manifest,
    observedN: n,
    observedEffect: effect,
    observedPValue: pValue,
    confirmed: rejectionReasons.length === 0,
    rejectionReasons
  };
}
6331
/**
 * Recursively rebuilds a value with object keys in sorted order.
 *
 * Primitives and `null` are returned as-is. Arrays keep their element order,
 * with each element canonicalized. Plain objects are rebuilt with keys in
 * default `Array.prototype.sort` order.
 *
 * @param {*} v - Value to canonicalize.
 * @returns {*} Canonical copy of `v`.
 */
function canonicalize2(v) {
  const isContainer = v !== null && typeof v === "object";
  if (!isContainer) return v;
  if (Array.isArray(v)) return v.map((element) => canonicalize2(element));
  return Object.keys(v)
    .sort()
    .reduce((rebuilt, key) => {
      rebuilt[key] = canonicalize2(v[key]);
      return rebuilt;
    }, {});
}
6339
+
6340
+ // src/self-play.ts
6341
/**
 * Runs proposer/scorer self-play rounds and collects discriminating scenarios.
 *
 * Each round asks `proposer.propose(round, priorSurvivors)` for candidates and
 * scores every candidate against all `targets`. A candidate is rejected when
 * the scorer returns fewer than 2 results, when its best score is below the
 * absolute floor, or when the spread (max minus min score) is below
 * `minSpread`. Survivors are ordered by spread, descending, capped at
 * `maxSurvivors`, recorded as dataset scenarios, and passed to the next round.
 *
 * @param {object} proposer - Exposes `propose(round, priorSurvivors)`.
 * @param {object} scorer - Exposes `scoreCandidate(candidate, targets)`, which
 *   resolves to an array of `{ score }` entries.
 * @param {Array} targets - At least two targets to score against.
 * @param {object} [options]
 * @param {number} [options.minSpread=0.1]
 * @param {number} [options.minAbsoluteFloor=0.1]
 * @param {number} [options.maxSurvivors=50]
 * @param {number} [options.rounds=1]
 * @returns {Promise<object>} `{ rounds, dataset }`: per-round breakdowns and a
 *   `Dataset` named "self-play-survivors" holding every capped survivor.
 * @throws {Error} If fewer than 2 targets are supplied.
 */
async function runSelfPlay(proposer, scorer, targets, options = {}) {
  if (targets.length < 2) throw new Error("runSelfPlay: at least 2 targets required (need a difference to measure)");
  const minSpread = options.minSpread ?? 0.1;
  const floor = options.minAbsoluteFloor ?? 0.1;
  const maxSurvivors = options.maxSurvivors ?? 50;
  const totalRounds = options.rounds ?? 1;
  const allRounds = [];
  let priorSurvivors = [];
  const datasetScenarios = [];
  for (let r = 0; r < totalRounds; r++) {
    const proposed = await proposer.propose(r, priorSurvivors);
    const scored = [];
    const rejected = [];
    const surviving = [];
    // Spread per candidate id, recorded once so the sort comparator below does
    // not rescan `scored` on every comparison. The first occurrence of an id wins.
    const spreadById = /* @__PURE__ */ new Map();
    for (const candidate of proposed) {
      const scores = await scorer.scoreCandidate(candidate, targets);
      if (scores.length < 2) {
        rejected.push({ candidate, reason: "scorer returned <2 results" });
        continue;
      }
      const values = scores.map((s) => s.score);
      const spread = Math.max(...values) - Math.min(...values);
      const maxScore = Math.max(...values);
      // Scored candidates are recorded even if the checks below reject them.
      scored.push({ candidate, scores, spread });
      if (!spreadById.has(candidate.id)) spreadById.set(candidate.id, spread);
      if (maxScore < floor) {
        rejected.push({ candidate, reason: `every target below floor (max=${maxScore.toFixed(3)} < ${floor})` });
        continue;
      }
      if (spread < minSpread) {
        rejected.push({ candidate, reason: `spread below threshold (${spread.toFixed(3)} < ${minSpread})` });
        continue;
      }
      surviving.push(candidate);
    }
    // Widest spread first: these separate the targets best.
    surviving.sort((a, b) => (spreadById.get(b.id) ?? 0) - (spreadById.get(a.id) ?? 0));
    const capped = surviving.slice(0, maxSurvivors);
    for (const s of capped) {
      datasetScenarios.push({
        id: s.id,
        payload: s.payload,
        split: "test",
        tags: { ...s.tags, evolutionRound: String(r), origin: "self-play" }
      });
    }
    allRounds.push({ round: r, proposed, survived: capped, rejected, scoredBreakdown: scored });
    priorSurvivors = capped;
  }
  const dataset = new Dataset({
    name: "self-play-survivors",
    provenance: {
      version: "1.0.0",
      createdAt: (/* @__PURE__ */ new Date()).toISOString(),
      contributor: "self-play",
      description: `Evolved across ${totalRounds} round(s), ${allRounds.reduce((a, r) => a + r.survived.length, 0)} survivors`
    },
    scenarios: datasetScenarios
  });
  return { rounds: allRounds, dataset };
}
6404
+
6405
+ // src/causal-attribution.ts
6406
/**
 * Splits score variance across factors and pairwise interactions.
 *
 * Factor names come from the keys of the first cell's `levels`. For every
 * factor, cells are grouped by level and the variance of the group means
 * around the grand mean, divided by the total variance, is reported as that
 * factor's share. For every factor pair the same is done on level
 * combinations, and the two main-effect variances are subtracted (floored at
 * 0). The residual is whatever share is left, floored at 0.
 *
 * @param {Array<object>} cells - `{ levels: Record<string, *>, score: number }` entries.
 * @returns {object} `{ totalVariance, mainEffects, interactions,
 *   residualShare, sharesSum }`. When all scores are identical, every share
 *   is 0 and the residual is 1.
 * @throws {Error} With fewer than 4 cells or fewer than 2 factors.
 */
function causalAttribution(cells) {
  if (cells.length < 4) throw new Error("causalAttribution: need \u2265 4 cells to estimate effects");
  const factors = Object.keys(cells[0].levels);
  if (factors.length < 2) throw new Error("causalAttribution: need \u2265 2 factors");

  const average = (xs) => xs.reduce((sum, x) => sum + x, 0) / xs.length;
  const allScores = cells.map((cell) => cell.score);
  const grandMean = average(allScores);
  // Population variance of `xs` around the grand mean (not around their own mean).
  const varianceAroundGrandMean = (xs) => xs.reduce((acc, x) => acc + (x - grandMean) ** 2, 0) / xs.length;
  // Mean score of every group produced by `keyOf`, in group insertion order.
  const groupMeans = (keyOf) => {
    const means = [];
    for (const group of groupBy2(cells, keyOf).values()) {
      means.push(average(group.map((cell) => cell.score)));
    }
    return means;
  };

  const totalVariance = varianceAroundGrandMean(allScores);
  if (totalVariance === 0) {
    return { totalVariance: 0, mainEffects: factors.map((f) => ({ factor: f, shareOfVariance: 0, range: 0 })), interactions: [], residualShare: 1, sharesSum: 1 };
  }

  const mainEffects = factors.map((factor) => {
    const means = groupMeans((cell) => cell.levels[factor]);
    return {
      factor,
      shareOfVariance: varianceAroundGrandMean(means) / totalVariance,
      range: Math.max(...means) - Math.min(...means)
    };
  });

  const interactions = [];
  factors.forEach((first, i) => {
    for (let j = i + 1; j < factors.length; j++) {
      const second = factors[j];
      const pairMeans = groupMeans((cell) => `${cell.levels[first]}|${cell.levels[second]}`);
      const pairVariance = varianceAroundGrandMean(pairMeans);
      const mainI = mainEffects[i].shareOfVariance * totalVariance;
      const mainJ = mainEffects[j].shareOfVariance * totalVariance;
      // Whatever the pair explains beyond the two main effects; never negative.
      const interactionVariance = Math.max(0, pairVariance - mainI - mainJ);
      interactions.push({
        factors: [first, second],
        shareOfVariance: interactionVariance / totalVariance
      });
    }
  });

  const shareTotal = (entries) => entries.reduce((sum, entry) => sum + entry.shareOfVariance, 0);
  const mainSum = shareTotal(mainEffects);
  const interactionSum = shareTotal(interactions);
  const residualShare = Math.max(0, 1 - mainSum - interactionSum);
  const sharesSum = mainSum + interactionSum + residualShare;
  return { totalVariance, mainEffects, interactions, residualShare, sharesSum };
}
6453
/**
 * Groups items into a Map keyed by the result of `key(item)`.
 *
 * @param {Iterable} items - Items to group.
 * @param {Function} key - Computes the group key for an item.
 * @returns {Map} Map from key to the array of items with that key. Keys are
 *   in first-seen order and items keep input order within a group.
 */
function groupBy2(items, key) {
  const buckets = /* @__PURE__ */ new Map();
  for (const entry of items) {
    const bucketKey = key(entry);
    if (buckets.has(bucketKey)) {
      buckets.get(bucketKey).push(entry);
    } else {
      buckets.set(bucketKey, [entry]);
    }
  }
  return buckets;
}
6463
+
6464
+ // src/active-learning.ts
6465
/**
 * Proposes where new scenarios should be synthesized, ranked by priority.
 *
 * Four heuristics contribute targets, in this order:
 *  1. "difficulty-gap"  - a difficulty band ("easy", "medium", "hard",
 *     "extreme") holds fewer than `minPerBand` scenarios.
 *  2. "undersampled"    - a scenario has fewer than 3 runs and no more than
 *     the 25th percentile of per-scenario run counts.
 *  3. "high-variance"   - a scenario has at least 3 numeric outcome scores
 *     whose population variance exceeds `varianceThreshold`.
 *  4. "failure-cluster" - a failure class (per `classifyFailure`) covers at
 *     least 3 non-passing runs.
 *
 * @param {object} dataset - Exposes `all()` returning the scenario list.
 * @param {object} traceStore - Exposes `listRuns()`, `spans({ runId })` and
 *   `events({ runId })`.
 * @param {object} [options]
 * @param {number} [options.minPerBand=5]
 * @param {number} [options.varianceThreshold=0.05]
 * @param {number} [options.topK=10]
 * @returns {Promise<Array<object>>} Up to `topK` targets shaped
 *   `{ reason, description, neighbors, direction, priority }`, highest
 *   priority first.
 */
async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
  const minPerBand = options.minPerBand ?? 5;
  const varianceThreshold = options.varianceThreshold ?? 0.05;
  const topK = options.topK ?? 10;
  const scenarios = dataset.all();
  const targets = [];

  // 1. Difficulty bands that are thinly populated.
  const BANDS = ["easy", "medium", "hard", "extreme"];
  for (const band of BANDS) {
    const inBand = scenarios.filter((s) => s.difficulty === band);
    const count = inBand.length;
    if (count < minPerBand) {
      targets.push({
        reason: "difficulty-gap",
        description: `difficulty="${band}" has ${count} scenario(s) \u2014 below minimum ${minPerBand}`,
        neighbors: inBand.slice(0, 3),
        direction: `create more "${band}" scenarios; reuse domain but shift complexity`,
        priority: Math.max(0, 1 - count / minPerBand)
      });
    }
  }

  // Index runs by scenario in one pass, rather than re-filtering the full run
  // list for every scenario.
  const runs = await traceStore.listRuns();
  const runCountByScenario = /* @__PURE__ */ new Map();
  const scoresByScenario = /* @__PURE__ */ new Map();
  for (const r of runs) {
    runCountByScenario.set(r.scenarioId, (runCountByScenario.get(r.scenarioId) ?? 0) + 1);
    const score = r.outcome?.score;
    if (typeof score === "number") {
      const bucket = scoresByScenario.get(r.scenarioId);
      if (bucket) bucket.push(score);
      else scoresByScenario.set(r.scenarioId, [score]);
    }
  }

  // 2. Scenarios with very few runs.
  const runCounts = [...runCountByScenario.values()];
  const p25 = runCounts.length > 0 ? quantile(runCounts, 0.25) : 0;
  for (const s of scenarios) {
    const count = runCountByScenario.get(s.id) ?? 0;
    if (count <= p25 && count < 3) {
      targets.push({
        reason: "undersampled",
        description: `scenario "${s.id}" has only ${count} run(s)`,
        neighbors: [s],
        direction: `create near-duplicates of "${s.id}" to stabilize its mean`,
        priority: Math.max(0, 1 - count / 3) * 0.7
      });
    }
  }

  // 3. Scenarios whose scores are unstable.
  for (const s of scenarios) {
    const scores = scoresByScenario.get(s.id) ?? [];
    if (scores.length < 3) continue;
    const mean = scores.reduce((a, b) => a + b, 0) / scores.length;
    const variance = scores.reduce((a, b) => a + (b - mean) ** 2, 0) / scores.length;
    if (variance > varianceThreshold) {
      targets.push({
        reason: "high-variance",
        description: `scenario "${s.id}" has unstable scoring (variance ${variance.toFixed(3)})`,
        neighbors: [s],
        direction: `disambiguate the scenario description \u2014 current wording admits too many valid interpretations`,
        priority: Math.min(1, variance * 5)
      });
    }
  }

  // 4. Recurring failure classes. Runs are handled one at a time so classes
  // keep first-seen order; the two fetches per run are independent of each
  // other and are issued together.
  const failureByClass = /* @__PURE__ */ new Map();
  for (const run of runs) {
    if (run.outcome?.pass === true) continue;
    const [spans, events] = await Promise.all([
      traceStore.spans({ runId: run.runId }),
      traceStore.events({ runId: run.runId })
    ]);
    const { failureClass } = classifyFailure({ run, spans, events });
    if (failureClass === "success" || failureClass === "unknown") continue;
    const arr = failureByClass.get(failureClass) ?? [];
    arr.push(run);
    failureByClass.set(failureClass, arr);
  }
  for (const [cls, failedRuns] of failureByClass) {
    if (failedRuns.length < 3) continue;
    const affectedScenarios = new Set(failedRuns.map((r) => r.scenarioId));
    const neighbors = scenarios.filter((s) => affectedScenarios.has(s.id)).slice(0, 3);
    targets.push({
      reason: "failure-cluster",
      description: `failure class "${cls}" observed ${failedRuns.length}\xD7 across ${affectedScenarios.size} scenario(s)`,
      neighbors,
      direction: `create scenarios that exercise "${cls}" recovery \u2014 currently a systematic weakness`,
      priority: Math.min(1, failedRuns.length / 10)
    });
  }

  return targets.sort((a, b) => b.priority - a.priority).slice(0, topK);
}
6545
/**
 * Returns the `p`-quantile of `xs` using linear interpolation between the
 * two nearest ranks of the sorted values.
 *
 * @param {number[]} xs - Values; the input array is not modified.
 * @param {number} p - Quantile, where 0 maps to the minimum and 1 to the maximum.
 * @returns {number} Interpolated quantile value.
 */
function quantile(xs, p) {
  const ascending = xs.slice().sort((left, right) => left - right);
  const position = p * (ascending.length - 1);
  const lowerRank = Math.floor(position);
  const below = ascending[lowerRank];
  const above = ascending[Math.ceil(position)];
  return below + (above - below) * (position - lowerRank);
}
6552
+
6553
+ // src/reward-model-export.ts
6554
/**
 * Grades the given runs and packages the result as a reward-model export.
 *
 * @param {object} store - Trace store handed to the grader and to `exportTrainingData`.
 * @param {object} grader - Exposes `grade(store, runId)`.
 * @param {Array} runIds - Runs to grade; all are graded concurrently.
 * @returns {Promise<object>} `{ version, metadata, trainingNdjson }`, where
 *   `metadata` holds trace and sample counts, the distinct rubric ids, an
 *   export timestamp and the mean sample score (0 when there are no samples).
 */
async function exportRewardModel(store, grader, runIds) {
  const gradedTraces = await Promise.all(runIds.map((runId) => grader.grade(store, runId)));
  const samples = await exportTrainingData(store, gradedTraces);

  const distinctRubrics = /* @__PURE__ */ new Set();
  let scoreTotal = 0;
  for (const sample of samples) {
    distinctRubrics.add(sample.rubricId);
    scoreTotal += sample.score;
  }
  const meanReward = samples.length > 0 ? scoreTotal / samples.length : 0;

  const metadata = {
    nTraces: gradedTraces.length,
    nSamples: samples.length,
    rubrics: [...distinctRubrics],
    exportedAt: (/* @__PURE__ */ new Date()).toISOString(),
    meanReward
  };
  return { version: "1.0", metadata, trainingNdjson: toNdjson(samples) };
}
6571
/**
 * Wraps a grader as a scorer object.
 *
 * @param {object} grader - Exposes `grade(store, runId)` resolving to an
 *   object with an `aggregateScore`.
 * @returns {object} `{ score, metadata }`. `score(trajectory, store)` grades
 *   the trajectory's run and resolves to its `aggregateScore`.
 */
function loadScorerFromGrader(grader) {
  const score = async (trajectory, store) => {
    const { aggregateScore } = await grader.grade(store, trajectory.runId);
    return aggregateScore;
  };
  const metadata = {
    rubrics: ["grader-backed"],
    deterministic: true
  };
  return { score, metadata };
}
6583
/**
 * Applies a scorer to every run in a corpus.
 *
 * All runs are processed concurrently. For each one the trajectory and the
 * run record are loaded together, then the scorer is invoked.
 *
 * @param {object} store - Trace store; `getRun` is called here and the store
 *   is also passed to `buildTrajectory` and `scorer.score`.
 * @param {object} scorer - Exposes `score(trajectory, store)`.
 * @param {Array} runIds - Runs to score.
 * @returns {Promise<Array<object>>} `{ runId, score, outcomeScore }` per run,
 *   in input order; `outcomeScore` is `null` when the run has none recorded.
 */
async function replayScorerOverCorpus(store, scorer, runIds) {
  const scoreOne = async (runId) => {
    const [trajectory, run] = await Promise.all([buildTrajectory(store, runId), store.getRun(runId)]);
    const score = await scorer.score(trajectory, store);
    return { runId, score, outcomeScore: run?.outcome?.score ?? null };
  };
  return Promise.all(runIds.map((runId) => scoreOne(runId)));
}
6595
+
6596
+ // src/governance/types.ts
6597
/**
 * Renders a governance report as Markdown.
 *
 * The output has a title, a context list (organization, period, owner,
 * generation time), a summary section listing every severity with a non-zero
 * count, and one subsection per finding with optional evidence and
 * remediation paragraphs.
 *
 * @param {object} report - Report with `framework`, `context`, `generatedAt`,
 *   `summary` (`overall`, `findings`, `byeverity`) and `findings`.
 * @returns {string} Markdown text with lines joined by "\n".
 */
function renderMarkdown(report) {
  const sevEmoji = {
    info: "\u2139\uFE0E",
    low: "\xB7",
    medium: "!",
    high: "!!",
    critical: "\u203C"
  };
  const { context, summary } = report;

  const headerLines = [
    `# ${report.framework} report \u2014 ${context.systemName}`,
    "",
    `- Organization: **${context.organization}**`,
    `- Period: ${context.periodStart} \u2192 ${context.periodEnd}`,
    `- Owner: ${context.owner.role} ${context.owner.name} <${context.owner.email}>`,
    `- Generated: ${report.generatedAt}`,
    "",
    `## Summary \u2014 ${summary.overall}`,
    "",
    `${summary.findings} finding(s).`
  ];

  // Only severities that actually occurred are listed.
  const severityLines = Object.entries(summary.byeverity)
    .filter(([, count]) => count > 0)
    .map(([sev, count]) => `- ${sevEmoji[sev]} ${sev}: ${count}`);

  const findingLines = report.findings.flatMap((finding) => {
    const section = [`### ${sevEmoji[finding.severity]} ${finding.id} \u2014 ${finding.control}`, "", finding.summary];
    if (finding.evidence) section.push("", "**Evidence:** " + finding.evidence);
    if (finding.remediation) section.push("", "**Remediation:** " + finding.remediation);
    section.push("");
    return section;
  });

  return [...headerLines, ...severityLines, "", "## Findings", "", ...findingLines].join("\n");
}
6638
/**
 * Tallies findings by severity and derives an overall compliance verdict.
 *
 * The verdict is "non-compliant" when any critical or high finding exists,
 * "compliant-with-findings" when any medium or low finding exists, and
 * "compliant" otherwise (info findings alone do not change it).
 *
 * @param {Array<object>} findings - Findings; `severity` is read from each.
 * @returns {object} `{ findings, byeverity, overall }`, where `findings` is
 *   the total count and `byeverity` the per-severity tally.
 */
function summarize(findings) {
  const byeverity = {
    info: 0,
    low: 0,
    medium: 0,
    high: 0,
    critical: 0
  };
  findings.forEach((finding) => {
    byeverity[finding.severity] += 1;
  });

  let overall = "compliant";
  if (byeverity.critical + byeverity.high > 0) {
    overall = "non-compliant";
  } else if (byeverity.medium + byeverity.low > 0) {
    overall = "compliant-with-findings";
  }
  return { findings: findings.length, byeverity, overall };
}
6650
+
6651
+ // src/governance/nist-ai-rmf.ts
6652
/**
 * Builds a NIST AI RMF governance report from a governance context.
 *
 * Checks run in this order, each adding findings when it fails:
 *  - GOVERN-1.1:   an owner email is recorded;
 *  - GOVERN-1.3:   at least one dataset is registered, and each has a content
 *                  hash of 16 or more characters;
 *  - MEASURE-2.6:  a red-team report is attached and its overall pass rate is
 *                  at least 0.8;
 *  - MEASURE-2.1:  the trace store has runs within the reporting period;
 *  - MEASURE-2.11: judge calibration exists and no judge has a finite Pearson
 *                  correlation below 0.6;
 *  - MANAGE-1.1:   an outcome store is attached and has outcomes in the period.
 *
 * @param {object} ctx - Governance context. Reads `owner`, `datasets`,
 *   `redTeam`, `traceStore`, `judgeCalibration`, `outcomeStore`,
 *   `periodStart`, `periodEnd`, `organization` and `systemName`.
 * @returns {Promise<object>} `{ framework, version, context, summary,
 *   findings, payload, generatedAt }`.
 */
async function nistAiRmfReport(ctx) {
  const findings = [];
  const record = (finding) => {
    findings.push(finding);
  };
  // A fresh filter object per store call, built from the period bounds.
  const periodFilter = () => ({ since: Date.parse(ctx.periodStart), until: Date.parse(ctx.periodEnd) });

  // GOVERN-1.1: accountable owner.
  if (!ctx.owner?.email) {
    record({
      id: "G-1.1",
      severity: "high",
      control: "NIST-AI-RMF:GOVERN-1.1",
      summary: "No responsible owner recorded for the AI system.",
      remediation: "Assign an accountable individual + email in GovernanceContext.owner."
    });
  }

  // GOVERN-1.3: dataset registration and hash strength.
  if (ctx.datasets.length === 0) {
    record({
      id: "G-1.3",
      severity: "high",
      control: "NIST-AI-RMF:GOVERN-1.3",
      summary: "No versioned datasets recorded for the evaluation period.",
      remediation: "Register each dataset with a Dataset manifest (content hash + provenance)."
    });
  } else {
    ctx.datasets
      .filter((manifest) => !manifest.contentHash || manifest.contentHash.length < 16)
      .forEach((manifest) => {
        record({
          id: "G-1.3-hash",
          severity: "medium",
          control: "NIST-AI-RMF:GOVERN-1.3",
          summary: `Dataset "${manifest.name}" has weak or missing content hash.`,
          evidence: `contentHash="${manifest.contentHash}"`,
          remediation: "Call dataset.manifest() to compute SHA-256; commit the manifest alongside releases."
        });
      });
  }

  // MEASURE-2.6: red-team coverage and pass rate.
  if (!ctx.redTeam) {
    record({
      id: "M-2.6",
      severity: "high",
      control: "NIST-AI-RMF:MEASURE-2.6",
      summary: "No red-team evaluation attached to the report period.",
      remediation: "Run redTeamDataset() against the system and attach the RedTeamReport to context.redTeam."
    });
  } else if (ctx.redTeam.overallPassRate < 0.8) {
    record({
      id: "M-2.6-rate",
      severity: "high",
      control: "NIST-AI-RMF:MEASURE-2.6",
      summary: `Red-team pass rate ${(ctx.redTeam.overallPassRate * 100).toFixed(1)}% below 80% threshold.`,
      evidence: JSON.stringify(ctx.redTeam.passRateByCategory),
      remediation: "Harden the failing categories; rerun the battery."
    });
  }

  // MEASURE-2.1: eval runs exist within the period.
  const runs = await ctx.traceStore.listRuns(periodFilter());
  if (runs.length === 0) {
    record({
      id: "M-2.1",
      severity: "critical",
      control: "NIST-AI-RMF:MEASURE-2.1",
      summary: "No eval runs recorded for the reporting period.",
      remediation: "Emit traces for every deployment-relevant evaluation."
    });
  }

  // MEASURE-2.11: judge calibration against humans.
  const calibration = ctx.judgeCalibration;
  if (!calibration || calibration.length === 0) {
    record({
      id: "M-2.11",
      severity: "medium",
      control: "NIST-AI-RMF:MEASURE-2.11",
      summary: "No judge-vs-human calibration recorded.",
      remediation: "Build a human golden set; run calibrateJudge() before trusting LLM judge scores."
    });
  } else {
    // Non-finite correlations are not counted as weak.
    const weak = calibration.filter((c) => Number.isFinite(c.pearson) && c.pearson < 0.6);
    if (weak.length > 0) {
      record({
        id: "M-2.11-weak",
        severity: "medium",
        control: "NIST-AI-RMF:MEASURE-2.11",
        summary: `${weak.length} judge(s) show weak agreement with humans (Pearson < 0.6).`,
        remediation: "Retrain or replace the underperforming judges."
      });
    }
  }

  // MANAGE-1.1: deployment outcomes captured.
  if (!ctx.outcomeStore) {
    record({
      id: "MN-1.1",
      severity: "medium",
      control: "NIST-AI-RMF:MANAGE-1.1",
      summary: "No deployment outcomes captured \u2014 meta-eval correlation cannot be computed.",
      remediation: "Attach an OutcomeStore and ingest production outcome metrics."
    });
  } else {
    const outcomes = await ctx.outcomeStore.list(periodFilter());
    if (outcomes.length === 0) {
      record({
        id: "MN-1.1-empty",
        severity: "medium",
        control: "NIST-AI-RMF:MANAGE-1.1",
        summary: "OutcomeStore present but no outcomes captured for the period."
      });
    }
  }

  // Stricter check reported in the payload: a full 64-char lowercase hex hash.
  const datasetHashChecks = [];
  for (const manifest of ctx.datasets) {
    datasetHashChecks.push({ name: manifest.name, ok: /^[0-9a-f]{64}$/.test(manifest.contentHash) });
  }

  const payload = {
    controlsEvaluated: [
      "GOVERN-1.1",
      "GOVERN-1.3",
      "MEASURE-2.1",
      "MEASURE-2.6",
      "MEASURE-2.11",
      "MANAGE-1.1"
    ],
    runCount: runs.length,
    redTeamPassRate: ctx.redTeam?.overallPassRate ?? null,
    datasetHashChecks
  };
  const context = {
    organization: ctx.organization,
    systemName: ctx.systemName,
    periodStart: ctx.periodStart,
    periodEnd: ctx.periodEnd,
    owner: ctx.owner
  };
  return {
    framework: "NIST-AI-RMF",
    version: "1.0.0",
    context,
    summary: summarize(findings),
    findings,
    payload,
    generatedAt: (/* @__PURE__ */ new Date()).toISOString()
  };
}
6785
+
6786
+ // src/governance/soc2.ts
6787
/**
 * Build a SOC 2 governance report (controls CC7.1 - CC7.4) for the reporting
 * period described by `ctx`.
 *
 * The trace store is queried for runs and for `policy_violation` / `error`
 * events between `ctx.periodStart` and `ctx.periodEnd`. Findings are raised when:
 *   - more than 20% of runs have `outcome.pass === false` (CC7.1, medium);
 *   - no runs exist at all for the period (CC7.1, high);
 *   - at least 3 runs, and more than 5% of all runs, have status "aborted" (CC7.2, medium);
 *   - any policy-violation or error event was recorded (CC7.3, low);
 *   - no run carries a `codeSha` (CC7.4, high);
 *   - no run carries a `promptSha` (CC7.4, medium).
 *
 * @param {object} ctx - Governance context. Reads `periodStart`, `periodEnd`
 *   (parsed with `Date.parse`), `traceStore` (`listRuns`, `events`),
 *   `organization`, `systemName` and `owner`.
 * @returns {Promise<object>} Report with `framework`, `version`, `context`,
 *   `summary`, `findings`, `payload` and an ISO `generatedAt` timestamp.
 */
async function soc2Report(ctx) {
  const findings = [];
  const start = Date.parse(ctx.periodStart);
  const end = Date.parse(ctx.periodEnd);

  // The three queries only depend on the period window, so they are issued
  // together instead of being awaited one after another.
  const [runs, incidentEvents, errorEvents] = await Promise.all([
    ctx.traceStore.listRuns({ since: start, until: end }),
    ctx.traceStore.events({ kind: "policy_violation", since: start, until: end }),
    ctx.traceStore.events({ kind: "error", since: start, until: end })
  ]);

  // CC7.1: failure rate is null (not 0) when there is nothing to measure, so
  // an empty period is reported as a coverage gap rather than a clean result.
  const failureRate = runs.length > 0 ? runs.filter((r) => r.outcome?.pass === false).length / runs.length : null;
  if (failureRate !== null && failureRate > 0.2) {
    findings.push({
      id: "CC7.1-fail-rate",
      severity: "medium",
      control: "SOC2:CC7.1",
      summary: `System failure rate ${(failureRate * 100).toFixed(1)}% over the period exceeds 20%.`,
      remediation: "Investigate failure clusters (failureClusterView) + prioritize remediation."
    });
  }
  if (runs.length === 0) {
    findings.push({
      id: "CC7.1-coverage",
      severity: "high",
      control: "SOC2:CC7.1",
      summary: "No telemetry runs recorded for the period \u2014 monitoring regime is incomplete."
    });
  }

  // CC7.2: require both a relative (>5%) and an absolute (>=3) threshold so a
  // single abort in a small sample does not raise a finding.
  const aborted = runs.filter((r) => r.status === "aborted");
  if (aborted.length > runs.length * 0.05 && aborted.length >= 3) {
    findings.push({
      id: "CC7.2-abort",
      severity: "medium",
      control: "SOC2:CC7.2",
      summary: `${aborted.length} run(s) aborted \u2014 investigate pattern.`,
      remediation: "Use the bisector + failureClusterView to localize the trigger."
    });
  }

  // CC7.3: any incident-class event is flagged.
  const totalIncidents = incidentEvents.length + errorEvents.length;
  if (totalIncidents > 0) {
    findings.push({
      id: "CC7.3-resolution",
      severity: "low",
      control: "SOC2:CC7.3",
      summary: `${totalIncidents} incident-class event(s) recorded; resolution tracking is informal.`,
      remediation: 'Emit a resolution event (kind="log" with payload.resolves=<eventId>) per remediated incident.'
    });
  }

  // CC7.4: distinct release identifiers seen on runs; falsy values are dropped.
  const modelFingerprints = new Set(runs.map((r) => r.modelFingerprint).filter(Boolean));
  const promptHashes = new Set(runs.map((r) => r.promptSha).filter(Boolean));
  const codeSha = new Set(runs.map((r) => r.codeSha).filter(Boolean));
  if (codeSha.size === 0) {
    findings.push({
      id: "CC7.4-code",
      severity: "high",
      control: "SOC2:CC7.4",
      summary: "No codeSha recorded on runs \u2014 cannot attribute scores to a specific release.",
      remediation: "Populate Run.codeSha with the git SHA of the system at run time."
    });
  }
  if (promptHashes.size === 0) {
    findings.push({
      id: "CC7.4-prompt",
      severity: "medium",
      control: "SOC2:CC7.4",
      summary: "No promptSha recorded \u2014 prompt changes are untracked."
    });
  }

  const payload = {
    controls: ["CC7.1", "CC7.2", "CC7.3", "CC7.4"],
    runCount: runs.length,
    failureRate,
    abortedCount: aborted.length,
    incidentEventCount: totalIncidents,
    distinctReleases: {
      codeShas: codeSha.size,
      promptHashes: promptHashes.size,
      modelFingerprints: modelFingerprints.size
    }
  };
  return {
    framework: "SOC2",
    version: "2017-Common-Criteria",
    context: {
      organization: ctx.organization,
      systemName: ctx.systemName,
      periodStart: ctx.periodStart,
      periodEnd: ctx.periodEnd,
      owner: ctx.owner
    },
    summary: summarize(findings),
    findings,
    payload,
    generatedAt: (/* @__PURE__ */ new Date()).toISOString()
  };
}
6880
+
6881
+ // src/governance/eu-ai-act.ts
6882
/**
 * Map a set of use-case signals onto an EU AI Act risk tier.
 *
 * Precedence, highest first:
 *   - "unacceptable" when `biometricPublic`, `socialScoring` or `subliminal` is truthy;
 *   - "high" when `annexIII` is truthy;
 *   - "limited" when `chatbot` or `generatesSyntheticMedia` is truthy;
 *   - "minimal" otherwise.
 *
 * @param {object} signals - Flags describing the use case.
 * @returns {"unacceptable"|"high"|"limited"|"minimal"} The matching risk tier.
 */
function classifyEuAiRisk(signals) {
  const isProhibited = signals.biometricPublic || signals.socialScoring || signals.subliminal;
  if (isProhibited) {
    return "unacceptable";
  }
  if (signals.annexIII) {
    return "high";
  }
  const hasTransparencyDuty = signals.chatbot || signals.generatesSyntheticMedia;
  return hasTransparencyDuty ? "limited" : "minimal";
}
6888
/**
 * Build an EU AI Act governance report for the system described by `ctx`.
 *
 * The risk tier comes from `classifyEuAiRisk(signals)`:
 *   - "unacceptable": a critical Article 5 finding is raised;
 *   - "high": Articles 9, 10, 11, 13, 14 and 15 are checked against `ctx`
 *     (red-team evidence, datasets, recorded runs, transparency, owner e-mail,
 *     outcome store);
 *   - "limited": an informational transparency finding is raised;
 *   - "minimal": no findings.
 *
 * The trace store is only queried for the "high" tier.
 *
 * @param {object} ctx - Governance context. Reads `redTeam`, `datasets`,
 *   `traceStore`, `periodStart`, `periodEnd`, `owner`, `outcomeStore`,
 *   `organization` and `systemName`.
 * @param {object} signals - Use-case flags passed to `classifyEuAiRisk`; also
 *   echoed back in `payload.signals`.
 * @returns {Promise<object>} Report with `framework`, `version`, `context`,
 *   `summary`, `findings`, `payload` and an ISO `generatedAt` timestamp.
 */
async function euAiActReport(ctx, signals) {
  const riskClass = classifyEuAiRisk(signals);
  const findings = [];
  if (riskClass === "unacceptable") {
    findings.push({
      id: "EU-ART-5",
      severity: "critical",
      control: "EU-AI-ACT:Article-5",
      summary: "Use case matches a prohibited practice under Article 5.",
      remediation: "Discontinue or substantially redesign the use case."
    });
  }
  if (riskClass === "high") {
    if (!ctx.redTeam) {
      findings.push({
        id: "EU-ART-9",
        severity: "high",
        control: "EU-AI-ACT:Article-9",
        summary: "High-risk system lacks documented adversarial-testing evidence (Art. 9 risk mgmt).",
        remediation: "Run redTeamDataset() + attach the report."
      });
    }
    // A context with no `datasets` list at all is reported the same way as an
    // empty list instead of failing with a TypeError.
    if (!ctx.datasets || ctx.datasets.length === 0) {
      findings.push({
        id: "EU-ART-10",
        severity: "high",
        control: "EU-AI-ACT:Article-10",
        summary: "No training/eval datasets recorded with provenance (Art. 10)."
      });
    }
    const runs = await ctx.traceStore.listRuns({
      since: Date.parse(ctx.periodStart),
      until: Date.parse(ctx.periodEnd)
    });
    if (runs.length === 0) {
      findings.push({
        id: "EU-ART-11",
        severity: "high",
        control: "EU-AI-ACT:Article-11",
        summary: "No eval runs recorded (Art. 11 technical documentation)."
      });
    }
    if (signals.chatbot || signals.generatesSyntheticMedia) {
      findings.push({
        id: "EU-ART-13",
        severity: "info",
        control: "EU-AI-ACT:Article-13",
        summary: "Chatbot/synthetic-media transparency obligations apply; verify user-facing disclosures."
      });
    }
    if (!ctx.owner?.email) {
      findings.push({
        id: "EU-ART-14",
        severity: "high",
        control: "EU-AI-ACT:Article-14",
        summary: "No designated human overseer (Art. 14).",
        remediation: "Populate GovernanceContext.owner with the responsible individual."
      });
    }
    if (!ctx.outcomeStore) {
      findings.push({
        id: "EU-ART-15",
        severity: "medium",
        control: "EU-AI-ACT:Article-15",
        summary: "No post-deployment outcome measurement; accuracy + robustness are un-attested.",
        remediation: "Attach an OutcomeStore + run correlationStudy() over the reporting period."
      });
    }
  }
  if (riskClass === "limited") {
    // NOTE(review): the adopted text of Regulation 2024/1689 appears to place
    // these transparency obligations in Article 50 (52 was the draft numbering).
    // Identifiers are left unchanged for compatibility; confirm before renaming.
    findings.push({
      id: "EU-ART-52",
      severity: "info",
      control: "EU-AI-ACT:Article-52",
      summary: "Transparency obligations apply: disclose AI nature + synthetic content labeling.",
      remediation: "Ensure user-facing surfaces label AI-generated content."
    });
  }
  // Articles examined per tier. The "unacceptable" tier is decided by
  // Article 5, so it is listed there rather than reported as "none".
  const articlesByRisk = {
    unacceptable: ["5"],
    high: ["5", "9", "10", "11", "13", "14", "15"],
    limited: ["52"]
  };
  const payload = {
    riskClass,
    signals,
    articlesReviewed: articlesByRisk[riskClass] ?? ["none"]
  };
  return {
    framework: "EU-AI-ACT",
    version: "Regulation-2024-1689",
    context: {
      organization: ctx.organization,
      systemName: ctx.systemName,
      periodStart: ctx.periodStart,
      periodEnd: ctx.periodEnd,
      owner: ctx.owner
    },
    summary: summarize(findings),
    findings,
    payload,
    generatedAt: (/* @__PURE__ */ new Date()).toISOString()
  };
}
5448
6988
  export {
5449
6989
  AgentDriver,
5450
6990
  BenchmarkRunner,
@@ -5463,15 +7003,18 @@ export {
5463
7003
  DualAgentBench,
5464
7004
  ExperimentTracker,
5465
7005
  FAILURE_CLASSES,
7006
+ FileSystemOutcomeStore,
5466
7007
  FileSystemTraceStore,
5467
7008
  HoldoutAuditor,
5468
7009
  HoldoutLockedError,
5469
7010
  InMemoryExperimentStore,
7011
+ InMemoryOutcomeStore,
5470
7012
  InMemoryTraceStore,
5471
7013
  InMemoryWorkspaceInspector,
5472
7014
  MODEL_PRICING,
5473
7015
  MetricsCollector,
5474
7016
  OTEL_AGENT_EVAL_SCOPE,
7017
+ PrmGrader,
5475
7018
  ProductClient,
5476
7019
  ProjectRegistry,
5477
7020
  PromptOptimizer,
@@ -5488,20 +7031,26 @@ export {
5488
7031
  analyzeAntiSlop,
5489
7032
  analyzeSeries,
5490
7033
  argHash,
7034
+ attributeCounterfactuals,
5491
7035
  benjaminiHochberg,
7036
+ bisect,
5492
7037
  bonferroni,
5493
7038
  budgetBreachView,
5494
7039
  buildTrajectory,
5495
7040
  byteLengthRange,
5496
7041
  calibrateJudge,
7042
+ calibrationCurve,
5497
7043
  canaryLeakView,
7044
+ causalAttribution,
5498
7045
  checkCanaries,
5499
7046
  checkSlos,
7047
+ classifyEuAiRisk,
5500
7048
  classifyFailure,
5501
7049
  codeExecutionJudge,
5502
7050
  cohensD,
5503
7051
  coherenceJudge,
5504
7052
  collectionPreserved,
7053
+ commitBisect,
5505
7054
  compareToBaseline,
5506
7055
  composeParsers,
5507
7056
  composeValidators,
@@ -5509,18 +7058,24 @@ export {
5509
7058
  confidenceInterval,
5510
7059
  containsAll,
5511
7060
  correlateLayers,
7061
+ correlationStudy,
5512
7062
  createAntiSlopJudge,
5513
7063
  createCustomJudge,
5514
7064
  createDomainExpertJudge,
7065
+ crossTraceDiff,
5515
7066
  defaultJudges,
5516
7067
  dominates,
5517
7068
  estimateCost,
5518
7069
  estimateTokens,
7070
+ euAiActReport,
5519
7071
  evaluateContract,
7072
+ evaluateHypothesis,
5520
7073
  evaluateOracles,
5521
7074
  executeScenario,
5522
7075
  expectAgent,
7076
+ exportRewardModel,
5523
7077
  exportRunAsOtlp,
7078
+ exportTrainingData,
5524
7079
  failureClusterView,
5525
7080
  fileContains,
5526
7081
  fileExists,
@@ -5534,6 +7089,7 @@ export {
5534
7089
  iqr,
5535
7090
  isJudgeSpan,
5536
7091
  isLlmSpan,
7092
+ isPrmVerdict,
5537
7093
  isRetrievalSpan,
5538
7094
  isSandboxSpan,
5539
7095
  isToolSpan,
@@ -5545,10 +7101,14 @@ export {
5545
7101
  keyPreserved,
5546
7102
  llmSpanFromProvider,
5547
7103
  llmSpans,
7104
+ loadScorerFromGrader,
5548
7105
  lowercaseMutator,
5549
7106
  mannWhitneyU,
7107
+ nistAiRmfReport,
7108
+ nonRefusalRubric,
5550
7109
  normalizeScores,
5551
7110
  notBlocked,
7111
+ outputLengthRubric,
5552
7112
  pairedTTest,
5553
7113
  paraphraseRobustness,
5554
7114
  paretoFrontier,
@@ -5557,6 +7117,10 @@ export {
5557
7117
  politenessPrefixMutator,
5558
7118
  positionalBias,
5559
7119
  printDriverSummary,
7120
+ prmBestOfN,
7121
+ prmEnsembleBestOfN,
7122
+ promptBisect,
7123
+ proposeSynthesisTargets,
5560
7124
  pytestTestParser,
5561
7125
  redTeamDataset,
5562
7126
  redTeamReport,
@@ -5565,16 +7129,20 @@ export {
5565
7129
  regexMatch,
5566
7130
  regexMatches,
5567
7131
  regressionView,
7132
+ renderMarkdown,
5568
7133
  renderMarkdownReport,
7134
+ replayScorerOverCorpus,
5569
7135
  replayTraceThroughJudge,
5570
7136
  requiredSampleSize,
5571
7137
  resumeBuilderSession,
5572
7138
  rowCount,
5573
7139
  rowWhere,
5574
7140
  runAssertions,
7141
+ runCounterfactual,
5575
7142
  runE2EWorkflow,
5576
7143
  runExpectations,
5577
7144
  runFailureClass,
7145
+ runSelfPlay,
5578
7146
  runTestGradedScenario,
5579
7147
  runsForScenario,
5580
7148
  scoreAllProjects,
@@ -5583,17 +7151,25 @@ export {
5583
7151
  scoreRedTeamOutput,
5584
7152
  selfPreference,
5585
7153
  sentenceReorderMutator,
7154
+ signManifest,
7155
+ soc2Report,
5586
7156
  statusAdvanced,
5587
7157
  stuckLoopView,
7158
+ summarize,
5588
7159
  textInSnapshot,
5589
7160
  toLangfuseEnvelope,
7161
+ toNdjson,
5590
7162
  toPrometheusText,
7163
+ toolIntentAlignmentRubric,
5591
7164
  toolNamesForRun,
7165
+ toolNonRedundantRubric,
5592
7166
  toolSpans,
7167
+ toolSuccessRubric,
5593
7168
  toolWasteView,
5594
7169
  typoMutator,
5595
7170
  urlContains,
5596
7171
  verbosityBias,
7172
+ verifyManifest,
5597
7173
  visualDiff,
5598
7174
  vitestTestParser,
5599
7175
  weightedMean,