@tangle-network/agent-eval 0.18.0 → 0.19.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -417,7 +417,7 @@ function confidenceInterval(scores2, confidence = 0.95) {
417
417
  if (scores2.length === 0) return { mean: 0, lower: 0, upper: 0 };
418
418
  if (scores2.length === 1) return { mean: scores2[0], lower: scores2[0], upper: scores2[0] };
419
419
  const n = scores2.length;
420
- const mean9 = scores2.reduce((a, b) => a + b, 0) / n;
420
+ const mean10 = scores2.reduce((a, b) => a + b, 0) / n;
421
421
  const B = 1e3;
422
422
  const bootstrapMeans = [];
423
423
  for (let i = 0; i < B; i++) {
@@ -432,7 +432,7 @@ function confidenceInterval(scores2, confidence = 0.95) {
432
432
  const lowerIdx = Math.floor(alpha / 2 * B);
433
433
  const upperIdx = Math.floor((1 - alpha / 2) * B) - 1;
434
434
  return {
435
- mean: mean9,
435
+ mean: mean10,
436
436
  lower: bootstrapMeans[lowerIdx],
437
437
  upper: bootstrapMeans[Math.min(upperIdx, B - 1)]
438
438
  };
@@ -520,11 +520,11 @@ function pairedTTest(before, after) {
520
520
  const n = before.length;
521
521
  if (n < 2) return { t: 0, df: 0, p: 1 };
522
522
  const diffs = before.map((b, i) => after[i] - b);
523
- const mean9 = diffs.reduce((a, b) => a + b, 0) / n;
524
- const variance2 = diffs.reduce((acc, d) => acc + (d - mean9) ** 2, 0) / (n - 1);
523
+ const mean10 = diffs.reduce((a, b) => a + b, 0) / n;
524
+ const variance2 = diffs.reduce((acc, d) => acc + (d - mean10) ** 2, 0) / (n - 1);
525
525
  const se = Math.sqrt(variance2 / n);
526
- if (se === 0) return { t: mean9 === 0 ? 0 : Infinity, df: n - 1, p: mean9 === 0 ? 1 : 0 };
527
- const t = mean9 / se;
526
+ if (se === 0) return { t: mean10 === 0 ? 0 : Infinity, df: n - 1, p: mean10 === 0 ? 1 : 0 };
527
+ const t = mean10 / se;
528
528
  const df = n - 1;
529
529
  const p = 2 * (1 - studentTCdf(Math.abs(t), df));
530
530
  return { t, df, p };
@@ -548,9 +548,9 @@ function wilcoxonSignedRank(before, after) {
548
548
  }
549
549
  let wPlus = 0;
550
550
  for (let k = 0; k < n; k++) if (diffs[k] > 0) wPlus += ranks3[k];
551
- const mean9 = n * (n + 1) / 4;
551
+ const mean10 = n * (n + 1) / 4;
552
552
  const variance2 = n * (n + 1) * (2 * n + 1) / 24;
553
- const z = (wPlus - mean9) / Math.sqrt(variance2);
553
+ const z = (wPlus - mean10) / Math.sqrt(variance2);
554
554
  const p = 2 * (1 - normalCdf(Math.abs(z)));
555
555
  return { w: wPlus, p };
556
556
  }
@@ -3473,174 +3473,6 @@ function rowToRun(row) {
3473
3473
  };
3474
3474
  }
3475
3475
 
3476
- // src/power-analysis.ts
3477
- function requiredSampleSize(opts) {
3478
- const effect = opts.effect;
3479
- if (!Number.isFinite(effect) || effect <= 0) return Infinity;
3480
- const alpha = opts.alpha ?? 0.05;
3481
- const power = opts.power ?? 0.8;
3482
- const twoSided = opts.twoSided ?? true;
3483
- const zAlpha = zQuantile(twoSided ? 1 - alpha / 2 : 1 - alpha);
3484
- const zBeta = zQuantile(power);
3485
- const n = 2 * Math.pow((zAlpha + zBeta) / effect, 2);
3486
- return Math.ceil(n);
3487
- }
3488
- function bonferroni(pValues, alpha = 0.05) {
3489
- const k = pValues.length;
3490
- const adjusted = pValues.map((p) => Math.min(1, p * k));
3491
- const significant = adjusted.map((p) => p < alpha);
3492
- return { adjusted, significant };
3493
- }
3494
- function benjaminiHochberg(pValues, fdr = 0.05) {
3495
- const n = pValues.length;
3496
- if (n === 0) return { qValues: [], significant: [] };
3497
- const indexed = pValues.map((p, i) => ({ p, i })).sort((a, b) => a.p - b.p);
3498
- const q = new Array(n);
3499
- let minRight = 1;
3500
- for (let k = n - 1; k >= 0; k--) {
3501
- const rank = k + 1;
3502
- const raw = indexed[k].p * n / rank;
3503
- const bounded = Math.min(minRight, raw);
3504
- minRight = bounded;
3505
- q[indexed[k].i] = Math.min(1, bounded);
3506
- }
3507
- const significant = q.map((v) => v < fdr);
3508
- return { qValues: q, significant };
3509
- }
3510
- function zQuantile(p) {
3511
- if (p <= 0 || p >= 1) {
3512
- if (p === 0) return -Infinity;
3513
- if (p === 1) return Infinity;
3514
- return NaN;
3515
- }
3516
- const a = [-39.69683028665376, 220.9460984245205, -275.9285104469687, 138.357751867269, -30.66479806614716, 2.506628277459239];
3517
- const b = [-54.47609879822406, 161.5858368580409, -155.6989798598866, 66.80131188771972, -13.28068155288572];
3518
- const c = [-0.007784894002430293, -0.3223964580411365, -2.400758277161838, -2.549732539343734, 4.374664141464968, 2.938163982698783];
3519
- const d = [0.007784695709041462, 0.3224671290700398, 2.445134137142996, 3.754408661907416];
3520
- const pLow = 0.02425;
3521
- const pHigh = 1 - pLow;
3522
- let q;
3523
- let r;
3524
- if (p < pLow) {
3525
- q = Math.sqrt(-2 * Math.log(p));
3526
- return (((((c[0] * q + c[1]) * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5]) / ((((d[0] * q + d[1]) * q + d[2]) * q + d[3]) * q + 1);
3527
- }
3528
- if (p <= pHigh) {
3529
- q = p - 0.5;
3530
- r = q * q;
3531
- return (((((a[0] * r + a[1]) * r + a[2]) * r + a[3]) * r + a[4]) * r + a[5]) * q / (((((b[0] * r + b[1]) * r + b[2]) * r + b[3]) * r + b[4]) * r + 1);
3532
- }
3533
- q = Math.sqrt(-2 * Math.log(1 - p));
3534
- return -(((((c[0] * q + c[1]) * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5]) / ((((d[0] * q + d[1]) * q + d[2]) * q + d[3]) * q + 1);
3535
- }
3536
-
3537
- // src/prompt-optimizer.ts
3538
- var PromptOptimizer = class {
3539
- async run(config) {
3540
- const trials = config.trialsPerScenario ?? 3;
3541
- const alpha = config.significanceLevel ?? 0.05;
3542
- if (config.variants.length < 2) {
3543
- throw new Error("PromptOptimizer requires at least 2 variants");
3544
- }
3545
- if (config.scenarioIds.length === 0) {
3546
- throw new Error("PromptOptimizer requires at least 1 scenario");
3547
- }
3548
- const rawScores = /* @__PURE__ */ new Map();
3549
- for (const variant of config.variants) {
3550
- const scenarioMap = /* @__PURE__ */ new Map();
3551
- rawScores.set(variant.id, scenarioMap);
3552
- for (const scenarioId of config.scenarioIds) {
3553
- const samples = [];
3554
- for (let t = 0; t < trials; t++) {
3555
- const score = await config.scoreVariant({
3556
- variant,
3557
- scenarioId,
3558
- trialIndex: t
3559
- });
3560
- if (!Number.isFinite(score)) {
3561
- throw new Error(`scoreVariant returned non-finite: variant=${variant.id} scenario=${scenarioId} trial=${t}`);
3562
- }
3563
- samples.push(score);
3564
- }
3565
- scenarioMap.set(scenarioId, samples);
3566
- config.onScenarioComplete?.({
3567
- variantId: variant.id,
3568
- scenarioId,
3569
- scores: samples
3570
- });
3571
- }
3572
- }
3573
- const scores2 = config.variants.map((variant) => {
3574
- const scenarioMap = rawScores.get(variant.id);
3575
- const allSamples = [];
3576
- const perScenario = {};
3577
- for (const scenarioId of config.scenarioIds) {
3578
- const samples = scenarioMap.get(scenarioId) ?? [];
3579
- allSamples.push(...samples);
3580
- perScenario[scenarioId] = {
3581
- mean: samples.length ? samples.reduce((a, b) => a + b, 0) / samples.length : 0,
3582
- n: samples.length,
3583
- samples
3584
- };
3585
- }
3586
- const ci = confidenceInterval(allSamples, 0.95);
3587
- return {
3588
- variantId: variant.id,
3589
- mean: ci.mean,
3590
- ci95: { lower: ci.lower, upper: ci.upper },
3591
- n: allSamples.length,
3592
- perScenario
3593
- };
3594
- });
3595
- const rawPairs = [];
3596
- for (let i = 0; i < scores2.length; i++) {
3597
- for (let j = i + 1; j < scores2.length; j++) {
3598
- const a = scores2[i];
3599
- const b = scores2[j];
3600
- const { p } = mannWhitneyU(flatSamples(a), flatSamples(b));
3601
- rawPairs.push({ a, b, p });
3602
- }
3603
- }
3604
- const { qValues } = benjaminiHochberg(rawPairs.map((r) => r.p), alpha);
3605
- const pairwise2 = rawPairs.map((r, idx) => ({
3606
- variantA: r.a.variantId,
3607
- variantB: r.b.variantId,
3608
- pValue: r.p,
3609
- qValue: qValues[idx],
3610
- significant: qValues[idx] < alpha,
3611
- meanDelta: r.b.mean - r.a.mean
3612
- }));
3613
- const sorted = scores2.slice().sort((x, y) => y.mean - x.mean);
3614
- const winner = sorted[0];
3615
- const second = sorted[1];
3616
- const winnerComparisons = pairwise2.filter(
3617
- (c) => c.variantA === winner.variantId || c.variantB === winner.variantId
3618
- );
3619
- const significantOverAll = winnerComparisons.every((c) => c.significant);
3620
- const ciLowerBoundExceedsSecondMean = winner.ci95.lower > second.mean;
3621
- return {
3622
- winner: {
3623
- variantId: winner.variantId,
3624
- significant: significantOverAll,
3625
- ciLowerBoundExceedsSecondMean
3626
- },
3627
- scores: scores2,
3628
- pairwise: pairwise2,
3629
- config: {
3630
- trialsPerScenario: trials,
3631
- significanceLevel: alpha,
3632
- variants: config.variants.map((v) => v.id),
3633
- scenarios: config.scenarioIds
3634
- }
3635
- };
3636
- }
3637
- };
3638
- function flatSamples(score) {
3639
- const out = [];
3640
- for (const s of Object.values(score.perScenario)) out.push(...s.samples);
3641
- return out;
3642
- }
3643
-
3644
3476
  // src/steering.ts
3645
3477
  function mergeSteeringBundle(base, delta) {
3646
3478
  return {
@@ -3831,46 +3663,6 @@ function canonicalInstruction(value) {
3831
3663
  return normalized.length === 0 ? normalized : normalized[0].toUpperCase() + normalized.slice(1);
3832
3664
  }
3833
3665
 
3834
- // src/optimization-loop.ts
3835
- var OptimizationLoop = class {
3836
- optimizer;
3837
- constructor(optimizer = new PromptOptimizer()) {
3838
- this.optimizer = optimizer;
3839
- }
3840
- async run(config) {
3841
- const byId = new Map(config.variants.map((variant) => [variant.id, variant]));
3842
- const result = await this.optimizer.run({
3843
- variants: config.variants.map((variant) => ({
3844
- id: variant.id,
3845
- prompt: renderSteeringText(variant),
3846
- metadata: { bundle: variant }
3847
- })),
3848
- scenarioIds: config.examples.map((example) => example.scenarioId),
3849
- trialsPerScenario: config.trialsPerScenario,
3850
- scoreVariant: async ({ variant, scenarioId, trialIndex }) => {
3851
- const bundle = byId.get(variant.id);
3852
- if (!bundle) throw new Error(`unknown steering bundle ${variant.id}`);
3853
- const example = config.examples.find((item) => item.scenarioId === scenarioId);
3854
- if (!example) throw new Error(`unknown optimization example ${scenarioId}`);
3855
- const score = await config.evaluate({ variant: bundle, example, trialIndex });
3856
- return aggregateRunScore(score, config.scoreWeights);
3857
- }
3858
- });
3859
- return {
3860
- winner: byId.get(result.winner.variantId),
3861
- significant: result.winner.significant,
3862
- reports: result.scores.map((score) => ({
3863
- variantId: score.variantId,
3864
- bundle: byId.get(score.variantId),
3865
- mean: score.mean,
3866
- ci95: score.ci95,
3867
- scenarioScores: score.perScenario
3868
- })),
3869
- pairwise: result.pairwise
3870
- };
3871
- }
3872
- };
3873
-
3874
3666
  // src/steering-optimizer.ts
3875
3667
  var PairwiseSteeringOptimizer = class {
3876
3668
  optimize(rows, config = {}) {
@@ -6665,10 +6457,10 @@ function analyzeSeries(values, options = {}) {
6665
6457
  return { state: "insufficient-data", windowMean: 0, windowCv: 0, tailRun: 0, stable: false };
6666
6458
  }
6667
6459
  const tail = values.slice(-window);
6668
- const mean9 = tail.reduce((a, b) => a + b, 0) / tail.length;
6669
- const variance2 = tail.reduce((acc, v) => acc + (v - mean9) ** 2, 0) / tail.length;
6460
+ const mean10 = tail.reduce((a, b) => a + b, 0) / tail.length;
6461
+ const variance2 = tail.reduce((acc, v) => acc + (v - mean10) ** 2, 0) / tail.length;
6670
6462
  const stdDev = Math.sqrt(variance2);
6671
- const refMean = Math.abs(mean9) > 1e-9 ? Math.abs(mean9) : 1;
6463
+ const refMean = Math.abs(mean10) > 1e-9 ? Math.abs(mean10) : 1;
6672
6464
  const cv = stdDev / refMean;
6673
6465
  const stable = tail.length >= window && cv <= stableCv;
6674
6466
  let tailRun = 0;
@@ -6689,7 +6481,7 @@ function analyzeSeries(values, options = {}) {
6689
6481
  } else {
6690
6482
  state = "noisy";
6691
6483
  }
6692
- return { state, windowMean: mean9, windowCv: cv, tailRun, stable };
6484
+ return { state, windowMean: mean10, windowCv: cv, tailRun, stable };
6693
6485
  }
6694
6486
 
6695
6487
  // src/state-continuity.ts
@@ -7119,6 +6911,67 @@ function excerpt3(source, needle) {
7119
6911
  return (start > 0 ? "\u2026" : "") + source.slice(start, end) + (end < source.length ? "\u2026" : "");
7120
6912
  }
7121
6913
 
6914
+ // src/power-analysis.ts
6915
/**
 * Sample size from the normal approximation
 * `n = 2 * ((zAlpha + zBeta) / effect)^2`, rounded up to an integer.
 *
 * NOTE(review): the formula matches the usual per-group size for a two-sample
 * comparison with a standardized effect — confirm against callers.
 *
 * @param {object} opts
 * @param {number} opts.effect - Effect size; non-finite or non-positive values yield `Infinity`.
 * @param {number} [opts.alpha=0.05] - Significance level.
 * @param {number} [opts.power=0.8] - Target power.
 * @param {boolean} [opts.twoSided=true] - When true, `alpha` is halved before taking the quantile.
 * @returns {number} Rounded-up sample size, or `Infinity` when `effect` is unusable.
 */
function requiredSampleSize(opts) {
  const { effect } = opts;
  const effectIsUsable = Number.isFinite(effect) && effect > 0;
  if (!effectIsUsable) {
    return Infinity;
  }
  const significance = opts.alpha ?? 0.05;
  const targetPower = opts.power ?? 0.8;
  const isTwoSided = opts.twoSided ?? true;
  const criticalProbability = isTwoSided ? 1 - significance / 2 : 1 - significance;
  const criticalZ = zQuantile(criticalProbability);
  const powerZ = zQuantile(targetPower);
  const rawSize = 2 * Math.pow((criticalZ + powerZ) / effect, 2);
  return Math.ceil(rawSize);
}
6926
/**
 * Bonferroni correction: multiply every p-value by the number of tests and
 * cap the result at 1.
 *
 * @param {number[]} pValues - Raw p-values.
 * @param {number} [alpha=0.05] - An adjusted p-value strictly below this is significant.
 * @returns {{adjusted: number[], significant: boolean[]}} Arrays aligned with `pValues`.
 */
function bonferroni(pValues, alpha = 0.05) {
  const testCount = pValues.length;
  const adjusted = pValues.map((rawP) => {
    const scaled = rawP * testCount;
    return Math.min(1, scaled);
  });
  const significant = adjusted.map((adjustedP) => adjustedP < alpha);
  return { adjusted, significant };
}
6932
/**
 * Benjamini-Hochberg adjustment.
 *
 * P-values are ranked ascending; each q-value is `p * n / rank`, made
 * monotone by carrying the running minimum down from the largest rank, and
 * capped at 1.
 *
 * @param {number[]} pValues - Raw p-values.
 * @param {number} [fdr=0.05] - A q-value strictly below this is significant.
 * @returns {{qValues: number[], significant: boolean[]}} Arrays in the input order.
 */
function benjaminiHochberg(pValues, fdr = 0.05) {
  const total = pValues.length;
  if (total === 0) {
    return { qValues: [], significant: [] };
  }
  // Remember each p-value's original position so results map back after sorting.
  const ascending = pValues
    .map((p, i) => ({ p, i }))
    .sort((left, right) => left.p - right.p);
  const qValues = new Array(total);
  let runningMin = 1;
  for (let rank = total; rank >= 1; rank--) {
    const entry = ascending[rank - 1];
    const scaled = entry.p * total / rank;
    runningMin = Math.min(runningMin, scaled);
    qValues[entry.i] = Math.min(1, runningMin);
  }
  const significant = qValues.map((qValue) => qValue < fdr);
  return { qValues, significant };
}
6948
/**
 * Inverse of the standard normal CDF, computed with piecewise rational
 * approximations: one for the central region and one shared by both tails.
 *
 * NOTE(review): the coefficients look like Peter Acklam's inverse-normal
 * algorithm — confirm the provenance before changing any of them.
 *
 * @param {number} p - Probability.
 * @returns {number} `-Infinity` for 0, `Infinity` for 1, `NaN` outside [0, 1]
 *   (and for NaN input), otherwise the approximate quantile.
 */
function zQuantile(p) {
  if (p === 0) return -Infinity;
  if (p === 1) return Infinity;
  if (p < 0 || p > 1) return NaN;

  // Horner evaluation, highest-order coefficient first.
  const horner = (coefficients, x) => {
    let acc = coefficients[0];
    for (let idx = 1; idx < coefficients.length; idx++) {
      acc = acc * x + coefficients[idx];
    }
    return acc;
  };

  const centralNum = [-39.69683028665376, 220.9460984245205, -275.9285104469687, 138.357751867269, -30.66479806614716, 2.506628277459239];
  const centralDen = [-54.47609879822406, 161.5858368580409, -155.6989798598866, 66.80131188771972, -13.28068155288572, 1];
  const tailNum = [-0.007784894002430293, -0.3223964580411365, -2.400758277161838, -2.549732539343734, 4.374664141464968, 2.938163982698783];
  const tailDen = [0.007784695709041462, 0.3224671290700398, 2.445134137142996, 3.754408661907416, 1];
  const lowerBreak = 0.02425;
  const upperBreak = 1 - lowerBreak;

  if (p < lowerBreak) {
    const t = Math.sqrt(-2 * Math.log(p));
    return horner(tailNum, t) / horner(tailDen, t);
  }
  if (p <= upperBreak) {
    const centered = p - 0.5;
    const squared = centered * centered;
    return horner(centralNum, squared) * centered / horner(centralDen, squared);
  }
  // Upper tail mirrors the lower tail with the sign flipped.
  const t = Math.sqrt(-2 * Math.log(1 - p));
  return -horner(tailNum, t) / horner(tailDen, t);
}
6974
+
7122
6975
  // src/behavior-dsl.ts
7123
6976
  var BehaviorAssertion = class {
7124
6977
  constructor(store, runId) {
@@ -7617,12 +7470,12 @@ async function paraphraseRobustness(prompt, mutators, scoreFn, options = {}) {
7617
7470
  variantScores.push({ mutator: id, score, mutated });
7618
7471
  all.push(score);
7619
7472
  }
7620
- const mean9 = all.reduce((a, b) => a + b, 0) / all.length;
7621
- const variance2 = all.reduce((a, v) => a + (v - mean9) ** 2, 0) / all.length;
7473
+ const mean10 = all.reduce((a, b) => a + b, 0) / all.length;
7474
+ const variance2 = all.reduce((a, v) => a + (v - mean10) ** 2, 0) / all.length;
7622
7475
  const stdDev = Math.sqrt(variance2);
7623
- const ref = Math.abs(mean9) > 1e-9 ? Math.abs(mean9) : 1;
7476
+ const ref = Math.abs(mean10) > 1e-9 ? Math.abs(mean10) : 1;
7624
7477
  const robustness = Math.max(0, 1 - stdDev / ref);
7625
- return { originalScore, variantScores, meanScore: mean9, stdDev, robustness };
7478
+ return { originalScore, variantScores, meanScore: mean10, stdDev, robustness };
7626
7479
  }
7627
7480
  var lowercaseMutator = (p) => p.toLowerCase();
7628
7481
  var sentenceReorderMutator = (p, seed) => {
@@ -8543,8 +8396,8 @@ async function prmBestOfN(store, grader, runIds) {
8543
8396
  if (runIds.length === 0) throw new Error("prmBestOfN: at least 1 candidate required");
8544
8397
  const graded = await Promise.all(runIds.map((id) => grader.grade(store, id)));
8545
8398
  const ranked = [...graded].sort((a, b) => b.aggregateScore - a.aggregateScore);
8546
- const mean9 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
8547
- const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean9) ** 2, 0) / graded.length;
8399
+ const mean10 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
8400
+ const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean10) ** 2, 0) / graded.length;
8548
8401
  return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
8549
8402
  }
8550
8403
  async function prmEnsembleBestOfN(store, graders, runIds) {
@@ -8566,8 +8419,8 @@ async function prmEnsembleBestOfN(store, graders, runIds) {
8566
8419
  const ranked = [...byRun.values()].sort(
8567
8420
  (a, b) => (bordaScores.get(b.runId) ?? 0) - (bordaScores.get(a.runId) ?? 0)
8568
8421
  );
8569
- const mean9 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
8570
- const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean9) ** 2, 0) / ranked.length;
8422
+ const mean10 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
8423
+ const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean10) ** 2, 0) / ranked.length;
8571
8424
  return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
8572
8425
  }
8573
8426
 
@@ -9097,8 +8950,8 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
9097
8950
  const sRuns = runs.filter((r) => r.scenarioId === s.id);
9098
8951
  const scores2 = sRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
9099
8952
  if (scores2.length < 3) continue;
9100
- const mean9 = scores2.reduce((a, b) => a + b, 0) / scores2.length;
9101
- const variance2 = scores2.reduce((a, b) => a + (b - mean9) ** 2, 0) / scores2.length;
8953
+ const mean10 = scores2.reduce((a, b) => a + b, 0) / scores2.length;
8954
+ const variance2 = scores2.reduce((a, b) => a + (b - mean10) ** 2, 0) / scores2.length;
9102
8955
  if (variance2 > varianceThreshold) {
9103
8956
  targets.push({
9104
8957
  reason: "high-variance",
@@ -12960,6 +12813,289 @@ function traceExcerpt(trace) {
12960
12813
  return void 0;
12961
12814
  }
12962
12815
 
12816
+ // src/release-confidence.ts
12817
// Default release-gate thresholds. evaluateReleaseConfidence spreads the
// caller's `input.thresholds` over this object, so any key can be overridden.
var DEFAULT_THRESHOLDS = {
  // checkCorpus: fail when neither a dataset manifest nor scenarios are supplied.
  requireCorpus: true,
  // checkCorpus: minimum scenario count.
  minScenarioCount: 1,
  // checkQuality: minimum number of runs tagged "search".
  minSearchRuns: 1,
  // checkGeneralization: minimum number of runs tagged "holdout" (only when requireHoldout).
  minHoldoutRuns: 1,
  // checkCorpus / checkGeneralization: demand holdout scenarios and holdout runs.
  requireHoldout: true,
  // checkQuality: pass rate and mean score below these are critical issues.
  minPassRate: 0.8,
  minMeanScore: 0.7,
  // checkGeneralization: largest tolerated (search mean - holdout mean).
  maxOverfitGap: 0.15,
  // checkEfficiency: infinite budgets mean the cost and latency checks never fire by default.
  maxMeanCostUsd: Infinity,
  maxP95WallMs: Infinity,
  // checkDiagnostics: every failed row must carry actionable side information.
  requireAsiForFailures: true,
  // Scores strictly below this count as failures in passRate / failedRows / countFailureModes.
  failureScoreThreshold: 0.5
};
12831
/**
 * Convert multi-shot trial records into release trace-evidence rows.
 *
 * Field mapping: `variantId` -> `candidateId`, `cost` -> `costUsd`; any split
 * other than "holdout" or "dev" becomes "search"; `turnCount` is the length of
 * `trace.turns` when that is an array; a truthy `error` becomes the failure
 * mode "runtime_error".
 *
 * @param {object[]} trials - Trial records.
 * @returns {object[]} One evidence row per trial, in the same order.
 */
function releaseTraceEvidenceFromMultiShotTrials(trials) {
  const toEvidenceSplit = (split) => {
    if (split === "holdout") return "holdout";
    if (split === "dev") return "dev";
    return "search";
  };
  const countTurns = (trace) => {
    const turns = trace?.turns;
    return Array.isArray(turns) ? turns.length : void 0;
  };
  return trials.map((trial) => {
    const evidence = {
      scenarioId: trial.scenarioId,
      candidateId: trial.variantId,
      split: toEvidenceSplit(trial.split),
      score: trial.score,
      ok: trial.ok,
      turnCount: countTurns(trial.trace),
      costUsd: trial.cost,
      durationMs: trial.durationMs,
      failureMode: trial.error ? "runtime_error" : void 0,
      asi: trial.asi,
      metadata: trial.metadata
    };
    return evidence;
  });
}
12846
/**
 * Build a release-confidence scorecard for one candidate.
 *
 * Steps: merge `input.thresholds` over DEFAULT_THRESHOLDS, narrow runs and
 * traces to the candidate, derive metrics, run the five axis checks (each
 * appends to `issues`), then derive per-axis summaries and the overall status.
 *
 * Status is "fail" when any issue is critical, "warn" when there is any other
 * issue, otherwise "pass". `promote` is true only when the status is "pass"
 * and the gate decision, if one was supplied, also promotes.
 *
 * @param {object} input - Reads `target`, `candidateId`, `baselineId`, `runs`,
 *   `traces`, `scenarios`, `dataset`, `gateDecision` and `thresholds`.
 * @returns {object} Scorecard with `target`, `candidateId`, `baselineId`,
 *   `status`, `promote`, `axes`, `issues`, `metrics`, `dataset`,
 *   `gateDecision` and `summary`.
 */
function evaluateReleaseConfidence(input) {
  const thresholds = { ...DEFAULT_THRESHOLDS, ...input.thresholds };
  const candidateId = input.candidateId ?? null;
  const gateDecision = input.gateDecision ?? null;
  const runs = filterCandidate(input.runs ?? [], candidateId, input.baselineId);
  const traces = filterTraceCandidate(input.traces ?? [], candidateId, input.baselineId);
  const scenarios = input.scenarios ?? [];
  // A dataset manifest, when present, takes precedence over counting scenarios.
  const scenarioCount = input.dataset?.scenarioCount ?? scenarios.length;
  const splitCounts = input.dataset?.splitCounts ?? countScenarioSplits(scenarios);
  const searchScores = scoresFor(runs, "search");
  const holdoutScores = scoresFor(runs, "holdout");
  const allScores = [...searchScores, ...holdoutScores];
  const traceScores = traces.map((t) => t.score).filter(isFiniteNumber);
  // Trace scores are only a fallback for when no run produced a finite score.
  const scoreUniverse = allScores.length > 0 ? allScores : traceScores;
  const searchRuns = runs.filter((r) => r.splitTag === "search").length;
  const holdoutRuns = runs.filter((r) => r.splitTag === "holdout").length;
  const searchMeanScore = mean8(searchScores);
  const holdoutMeanScore = mean8(holdoutScores);
  // Computed once and reused for both failure metrics below.
  const failures = failedRows(runs, traces, thresholds.failureScoreThreshold);
  const metrics = {
    scenarioCount,
    searchRuns,
    holdoutRuns,
    passRate: passRate(runs, traces, thresholds.failureScoreThreshold),
    meanScore: mean8(scoreUniverse),
    searchMeanScore,
    holdoutMeanScore,
    overfitGap: safeDiff2(searchMeanScore, holdoutMeanScore),
    meanCostUsd: mean8([...runs.map((r) => r.costUsd), ...traces.map((t) => t.costUsd).filter(isFiniteNumber)]),
    p95WallMs: percentile([...runs.map((r) => r.wallMs), ...traces.map((t) => t.durationMs).filter(isFiniteNumber)], 0.95),
    failedRows: failures.length,
    failuresWithAsi: failures.filter((row) => row.hasAsi).length,
    singleShotTraces: traces.filter((t) => t.turnCount === 1).length,
    multiShotTraces: traces.filter((t) => (t.turnCount ?? 0) > 1).length,
    splitCounts,
    domainCounts: countDomains(scenarios),
    failureModeCounts: countFailureModes(runs, traces, thresholds.failureScoreThreshold),
    responsibleSurfaceCounts: countResponsibleSurfaces(traces)
  };
  const issues = [];
  checkCorpus(input, thresholds, metrics, issues);
  checkQuality(thresholds, metrics, issues);
  checkGeneralization(gateDecision, thresholds, metrics, issues);
  checkDiagnostics(thresholds, metrics, issues);
  checkEfficiency(thresholds, metrics, issues);
  const axes = buildAxes(metrics, thresholds, gateDecision, issues);
  const status = issues.some((i) => i.severity === "critical") ? "fail" : issues.length > 0 ? "warn" : "pass";
  return {
    target: input.target,
    candidateId,
    baselineId: input.baselineId ?? null,
    status,
    promote: status === "pass" && (gateDecision ? gateDecision.promote : true),
    axes,
    issues,
    metrics,
    dataset: input.dataset ?? null,
    gateDecision,
    summary: renderSummary(input.target, status, metrics, issues)
  };
}
12905
/**
 * Evaluate release confidence and throw when the scorecard status is "fail".
 *
 * @param {object} input - Passed straight to evaluateReleaseConfidence.
 * @returns {object} The scorecard, when its status is not "fail".
 * @throws {Error} With the scorecard summary as its message on "fail".
 */
function assertReleaseConfidence(input) {
  const scorecard = evaluateReleaseConfidence(input);
  if (scorecard.status !== "fail") {
    return scorecard;
  }
  throw new Error(scorecard.summary);
}
12912
/**
 * Narrow runs to the candidate under review.
 *
 * With a truthy `candidateId`, keep only that candidate's runs. Otherwise,
 * with a truthy `baselineId`, keep every run that is not the baseline's.
 * With neither, return a shallow copy of all runs.
 *
 * @param {object[]} runs
 * @param {string|null} candidateId
 * @param {string} [baselineId]
 * @returns {object[]} A new array; `runs` is not modified.
 */
function filterCandidate(runs, candidateId, baselineId) {
  let keep = null;
  if (candidateId) {
    keep = (run) => run.candidateId === candidateId;
  } else if (baselineId) {
    keep = (run) => run.candidateId !== baselineId;
  }
  return keep ? runs.filter(keep) : [...runs];
}
12917
/**
 * Narrow traces to the candidate under review. Traces whose `candidateId` is
 * undefined are kept under either filter.
 *
 * With a truthy `candidateId`, keep matching traces. Otherwise, with a truthy
 * `baselineId`, drop the baseline's traces. With neither, return a shallow
 * copy of all traces.
 *
 * @param {object[]} traces
 * @param {string|null} candidateId
 * @param {string} [baselineId]
 * @returns {object[]} A new array; `traces` is not modified.
 */
function filterTraceCandidate(traces, candidateId, baselineId) {
  const isUnattributed = (trace) => trace.candidateId === void 0;
  if (candidateId) {
    return traces.filter((trace) => isUnattributed(trace) || trace.candidateId === candidateId);
  }
  if (baselineId) {
    return traces.filter((trace) => isUnattributed(trace) || trace.candidateId !== baselineId);
  }
  return Array.from(traces);
}
12922
/**
 * Corpus-axis checks. Appends a critical issue to `issues` for each of:
 * no dataset and no scenarios (when `requireCorpus`), too few scenarios, and
 * a zero holdout split count (when `requireHoldout`).
 *
 * @param {object} input - Reads `dataset` and `scenarios`.
 * @param {object} thresholds - Reads `requireCorpus`, `minScenarioCount`, `requireHoldout`.
 * @param {object} metrics - Reads `scenarioCount` and `splitCounts.holdout`.
 * @param {object[]} issues - Mutated in place.
 * @returns {void}
 */
function checkCorpus(input, thresholds, metrics, issues) {
  const report = (code, detail) => {
    issues.push({ axis: "corpus", severity: "critical", code, detail });
  };
  const suppliedScenarios = input.scenarios?.length ?? 0;
  if (thresholds.requireCorpus && !input.dataset && suppliedScenarios === 0) {
    report("missing_corpus", "No Dataset manifest or scenarios supplied.");
  }
  if (metrics.scenarioCount < thresholds.minScenarioCount) {
    report("few_scenarios", `${metrics.scenarioCount} scenario(s) < min ${thresholds.minScenarioCount}.`);
  }
  if (thresholds.requireHoldout && metrics.splitCounts.holdout === 0) {
    report("missing_holdout_split", "Corpus has no holdout scenarios.");
  }
}
12933
/**
 * Quality-axis checks. Appends a critical issue for too few search runs, a
 * pass rate below `minPassRate`, and a mean score below `minMeanScore`.
 * A NaN metric compares false and therefore raises nothing.
 *
 * @param {object} thresholds - Reads `minSearchRuns`, `minPassRate`, `minMeanScore`.
 * @param {object} metrics - Reads `searchRuns`, `passRate`, `meanScore`.
 * @param {object[]} issues - Mutated in place.
 * @returns {void}
 */
function checkQuality(thresholds, metrics, issues) {
  const report = (code, detail) => {
    issues.push({ axis: "quality", severity: "critical", code, detail });
  };
  const { searchRuns, passRate: observedPassRate, meanScore } = metrics;
  if (searchRuns < thresholds.minSearchRuns) {
    report("few_search_runs", `${searchRuns} search run(s) < min ${thresholds.minSearchRuns}.`);
  }
  if (observedPassRate < thresholds.minPassRate) {
    report("low_pass_rate", `passRate ${fmt3(observedPassRate)} < ${fmt3(thresholds.minPassRate)}.`);
  }
  if (meanScore < thresholds.minMeanScore) {
    report("low_mean_score", `meanScore ${fmt3(meanScore)} < ${fmt3(thresholds.minMeanScore)}.`);
  }
}
12944
/**
 * Generalization-axis checks. Appends a critical issue for too few holdout
 * runs (when `requireHoldout`), a finite overfit gap above `maxOverfitGap`,
 * and a gate decision that declines to promote.
 *
 * @param {object|null} gateDecision - Reads `promote`, `rejectionCode`, `reason`.
 * @param {object} thresholds - Reads `requireHoldout`, `minHoldoutRuns`, `maxOverfitGap`.
 * @param {object} metrics - Reads `holdoutRuns`, `overfitGap`.
 * @param {object[]} issues - Mutated in place.
 * @returns {void}
 */
function checkGeneralization(gateDecision, thresholds, metrics, issues) {
  const report = (code, detail) => {
    issues.push({ axis: "generalization", severity: "critical", code, detail });
  };
  const { holdoutRuns, overfitGap } = metrics;
  if (thresholds.requireHoldout && holdoutRuns < thresholds.minHoldoutRuns) {
    report("few_holdout_runs", `${holdoutRuns} holdout run(s) < min ${thresholds.minHoldoutRuns}.`);
  }
  // A non-finite gap (a split had no scores) is not treated as overfitting here.
  if (Number.isFinite(overfitGap) && overfitGap > thresholds.maxOverfitGap) {
    report("overfit_gap", `search-holdout gap ${fmt3(overfitGap)} > ${fmt3(thresholds.maxOverfitGap)}.`);
  }
  if (gateDecision && !gateDecision.promote) {
    report(`gate_${gateDecision.rejectionCode ?? "reject"}`, gateDecision.reason);
  }
}
12955
/**
 * Diagnostics-axis check. When `requireAsiForFailures` is set and more rows
 * failed than carry side information, appends one critical issue reporting
 * how many failed rows lack it.
 *
 * @param {object} thresholds - Reads `requireAsiForFailures`.
 * @param {object} metrics - Reads `failedRows`, `failuresWithAsi`.
 * @param {object[]} issues - Mutated in place.
 * @returns {void}
 */
function checkDiagnostics(thresholds, metrics, issues) {
  const { failedRows: failedCount, failuresWithAsi: diagnosedCount } = metrics;
  if (thresholds.requireAsiForFailures && failedCount > diagnosedCount) {
    const undiagnosed = failedCount - diagnosedCount;
    issues.push({
      axis: "diagnostics",
      severity: "critical",
      code: "missing_failure_asi",
      detail: `${undiagnosed} failed row(s) have no actionable side information.`
    });
  }
}
12966
/**
 * Efficiency-axis checks. Appends a critical issue when mean cost exceeds
 * `maxMeanCostUsd` and when p95 wall time exceeds `maxP95WallMs`. A NaN
 * metric or an infinite budget compares false and raises nothing.
 *
 * @param {object} thresholds - Reads `maxMeanCostUsd`, `maxP95WallMs`.
 * @param {object} metrics - Reads `meanCostUsd`, `p95WallMs`.
 * @param {object[]} issues - Mutated in place.
 * @returns {void}
 */
function checkEfficiency(thresholds, metrics, issues) {
  const report = (code, detail) => {
    issues.push({ axis: "efficiency", severity: "critical", code, detail });
  };
  const { meanCostUsd, p95WallMs } = metrics;
  if (meanCostUsd > thresholds.maxMeanCostUsd) {
    report("cost_budget", `meanCostUsd ${fmt3(meanCostUsd)} > ${fmt3(thresholds.maxMeanCostUsd)}.`);
  }
  if (p95WallMs > thresholds.maxP95WallMs) {
    report("latency_budget", `p95WallMs ${fmt3(p95WallMs)} > ${fmt3(thresholds.maxP95WallMs)}.`);
  }
}
12974
/**
 * Produce the five per-axis summaries (corpus, quality, generalization,
 * diagnostics, efficiency), in that order. Each carries a raw score that
 * `axis` clamps into [0, 1] and a human-readable detail string.
 *
 * @param {object} metrics - Derived metrics.
 * @param {object} thresholds - Effective thresholds.
 * @param {object|null} gateDecision - A rejecting gate forces the generalization score to 0.
 * @param {object[]} issues - Issues collected so far; determine each axis status.
 * @returns {object[]} Five `{ name, status, score, detail }` entries.
 */
function buildAxes(metrics, thresholds, gateDecision, issues) {
  const corpusScore = bounded(metrics.scenarioCount / Math.max(1, thresholds.minScenarioCount));
  // Quality is limited by the weaker of pass rate and mean score.
  const qualityScore = Math.min(metrics.passRate, metrics.meanScore);
  const gateRejected = gateDecision && !gateDecision.promote;
  const generalizationScore = gateRejected ? 0 : gapScore(metrics.overfitGap, thresholds.maxOverfitGap);
  const diagnosticsScore = metrics.failedRows === 0 ? 1 : metrics.failuresWithAsi / metrics.failedRows;
  const efficiency = efficiencyScore(metrics, thresholds);
  return [
    axis("corpus", issues, corpusScore, `${metrics.scenarioCount} scenarios; holdout=${metrics.splitCounts.holdout}`),
    axis("quality", issues, qualityScore, `passRate=${fmt3(metrics.passRate)} meanScore=${fmt3(metrics.meanScore)}`),
    axis("generalization", issues, generalizationScore, `holdoutRuns=${metrics.holdoutRuns} overfitGap=${fmt3(metrics.overfitGap)}`),
    axis("diagnostics", issues, diagnosticsScore, `failuresWithAsi=${metrics.failuresWithAsi}/${metrics.failedRows}`),
    axis("efficiency", issues, efficiency, `meanCostUsd=${fmt3(metrics.meanCostUsd)} p95WallMs=${fmt3(metrics.p95WallMs)}`)
  ];
}
12983
/**
 * Summarize one axis: "fail" when any of its issues is critical, "warn" when
 * it has any other issue, "pass" otherwise. The score is clamped via `bounded`.
 *
 * @param {string} name - Axis name, matched against `issue.axis`.
 * @param {object[]} issues - All issues collected so far.
 * @param {number} score - Raw axis score.
 * @param {string} detail - Human-readable detail string.
 * @returns {{name: string, status: string, score: number, detail: string}}
 */
function axis(name, issues, score, detail) {
  const belongsHere = (issue) => issue.axis === name;
  let status = "pass";
  if (issues.some((issue) => belongsHere(issue) && issue.severity === "critical")) {
    status = "fail";
  } else if (issues.some(belongsHere)) {
    status = "warn";
  }
  return { name, status, score: bounded(score), detail };
}
12988
/**
 * Count scenarios per split. A scenario with no `split` counts as "train".
 *
 * The four standard splits are always present in the result. A split name
 * outside that set is counted under its own key starting from zero; a bare
 * `counts[split]++` on such a name would store NaN.
 *
 * @param {object[]} scenarios - Each may carry a `split` string.
 * @returns {object} Counts keyed by split name, always including `train`,
 *   `dev`, `test` and `holdout`.
 */
function countScenarioSplits(scenarios) {
  const counts = { train: 0, dev: 0, test: 0, holdout: 0 };
  for (const scenario of scenarios) {
    const split = scenario.split ?? "train";
    // Own-property check so a split named like an Object.prototype member still starts at 0.
    const seen = Object.prototype.hasOwnProperty.call(counts, split) ? counts[split] : 0;
    counts[split] = seen + 1;
  }
  return counts;
}
12993
/**
 * Count scenarios per domain. The domain is `tags.domain`, falling back to
 * `tags.category`, then to "uncategorized".
 *
 * @param {object[]} scenarios - Each may carry a `tags` object.
 * @returns {object} Counts keyed by domain.
 */
function countDomains(scenarios) {
  return scenarios.reduce((tally, scenario) => {
    const tags = scenario.tags;
    const domain = tags?.domain ?? tags?.category ?? "uncategorized";
    tally[domain] = (tally[domain] ?? 0) + 1;
    return tally;
  }, {});
}
13001
/**
 * Tally failure modes across runs and traces.
 *
 * A run fails when it has a truthy `failureMode` or when its score (holdout
 * score, else search score) is defined and below `threshold`; it is filed
 * under its failure mode, or "low_score" when it has none. A trace fails when
 * it has a truthy `failureMode`, `ok === false`, or a defined score below
 * `threshold`; it is filed under its failure mode, else "not_ok", else
 * "low_score".
 *
 * @param {object[]} runs
 * @param {object[]} traces
 * @param {number} threshold - Scores strictly below this are failures.
 * @returns {object} Counts keyed by failure mode.
 */
function countFailureModes(runs, traces, threshold) {
  const tally = {};
  const record = (mode) => {
    tally[mode] = (tally[mode] ?? 0) + 1;
  };
  const belowThreshold = (score) => score !== void 0 && score < threshold;
  for (const run of runs) {
    const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
    if (run.failureMode || belowThreshold(score)) {
      record(run.failureMode ?? "low_score");
    }
  }
  for (const trace of traces) {
    const notOk = trace.ok === false;
    if (trace.failureMode || notOk || belowThreshold(trace.score)) {
      record(trace.failureMode ?? (notOk ? "not_ok" : "low_score"));
    }
  }
  return tally;
}
13018
/**
 * Tally the `responsibleSurface` of every side-information entry across all
 * traces. Entries without one are counted under "unknown"; traces without an
 * `asi` list contribute nothing.
 *
 * @param {object[]} traces - Each may carry an `asi` array.
 * @returns {object} Counts keyed by surface.
 */
function countResponsibleSurfaces(traces) {
  const tally = {};
  const entries = traces.flatMap((trace) => trace.asi ?? []);
  for (const entry of entries) {
    const surface = entry.responsibleSurface ?? "unknown";
    tally[surface] = (tally[surface] ?? 0) + 1;
  }
  return tally;
}
13028
/**
 * List one `{ hasAsi }` record per failed run, followed by one per failed
 * trace.
 *
 * A run fails when it has a truthy `failureMode` or when its score (holdout
 * score, else search score) is defined and below `threshold`; it has side
 * information when `outcome.raw.asi` is a number greater than 0. A trace
 * fails on a truthy `failureMode`, `ok === false`, or a defined score below
 * `threshold`; it has side information when its `asi` list is non-empty.
 *
 * @param {object[]} runs
 * @param {object[]} traces
 * @param {number} threshold - Scores strictly below this are failures.
 * @returns {{hasAsi: boolean}[]}
 */
function failedRows(runs, traces, threshold) {
  const belowThreshold = (score) => score !== void 0 && score < threshold;
  const runFailed = (run) => {
    const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
    return Boolean(run.failureMode || belowThreshold(score));
  };
  const traceFailed = (trace) =>
    Boolean(trace.failureMode || trace.ok === false || belowThreshold(trace.score));
  const rows = [];
  for (const run of runs) {
    if (!runFailed(run)) continue;
    // `raw` is only read for failed runs.
    const asiMetric = run.outcome.raw.asi;
    rows.push({ hasAsi: typeof asiMetric === "number" && asiMetric > 0 });
  }
  for (const trace of traces) {
    if (!traceFailed(trace)) continue;
    rows.push({ hasAsi: (trace.asi?.length ?? 0) > 0 });
  }
  return rows;
}
13044
/**
 * Fraction of runs and traces that pass.
 *
 * A run passes when it has no truthy `failureMode` and its score (holdout
 * score, else search score) is defined and at least `threshold`. A trace
 * passes unless `ok === false` or it has a defined score below `threshold`.
 *
 * @param {object[]} runs
 * @param {object[]} traces
 * @param {number} threshold
 * @returns {number} Passing fraction, or 0 when there are no runs or traces.
 */
function passRate(runs, traces, threshold) {
  const runPasses = (run) => {
    const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
    return !run.failureMode && score !== void 0 && score >= threshold;
  };
  const tracePasses = (trace) =>
    trace.ok !== false && (trace.score === void 0 || trace.score >= threshold);
  const verdicts = runs.map(runPasses).concat(traces.map(tracePasses));
  const total = verdicts.length;
  if (total === 0) {
    return 0;
  }
  let passed = 0;
  verdicts.forEach((verdict) => {
    if (verdict) passed += 1;
  });
  return passed / total;
}
13055
/**
 * Collect the finite scores of the runs tagged with `split`. For "holdout"
 * the run's `outcome.holdoutScore` is read; for any other split,
 * `outcome.searchScore`.
 *
 * @param {object[]} runs - Each has `splitTag` and `outcome`.
 * @param {string} split - Split tag to select.
 * @returns {number[]} Finite scores, in run order.
 */
function scoresFor(runs, split) {
  const readScore = split === "holdout"
    ? (run) => run.outcome.holdoutScore
    : (run) => run.outcome.searchScore;
  const collected = [];
  runs.forEach((run) => {
    if (run.splitTag !== split) return;
    const score = readScore(run);
    if (isFiniteNumber(score)) collected.push(score);
  });
  return collected;
}
13058
/**
 * Arithmetic mean, summed left to right.
 *
 * @param {number[]} xs
 * @returns {number} The mean, or NaN for an empty array.
 */
function mean8(xs) {
  const count = xs.length;
  if (count === 0) {
    return Number.NaN;
  }
  let total = 0;
  xs.forEach((x) => {
    total += x;
  });
  return total / count;
}
13062
/**
 * Percentile by rank on an ascending sorted copy: picks the element at index
 * `ceil(p * n) - 1`, clamped into the valid index range.
 *
 * @param {number[]} xs - Values; not modified.
 * @param {number} p - Fraction, e.g. 0.95.
 * @returns {number} The selected element, or NaN for an empty array.
 */
function percentile(xs, p) {
  const count = xs.length;
  if (count === 0) {
    return Number.NaN;
  }
  const ascending = Array.from(xs).sort((a, b) => a - b);
  const rankIndex = Math.ceil(p * count) - 1;
  const clampedIndex = Math.min(count - 1, Math.max(0, rankIndex));
  return ascending[clampedIndex];
}
13067
/**
 * True only for primitive numbers that are neither NaN nor infinite.
 * `Number.isFinite` does not coerce, so strings, null, undefined and Number
 * objects all yield false.
 *
 * @param {unknown} value
 * @returns {boolean}
 */
function isFiniteNumber(value) {
  return Number.isFinite(value);
}
13070
/**
 * `a - b` when both operands are finite numbers, otherwise NaN.
 *
 * @param {number} a
 * @param {number} b
 * @returns {number}
 */
function safeDiff2(a, b) {
  const bothFinite = Number.isFinite(a) && Number.isFinite(b);
  return bothFinite ? a - b : Number.NaN;
}
13074
/**
 * Score an overfit gap against its tolerance.
 *
 * A non-finite gap scores 0. With a tolerance of zero or less, only a gap of
 * zero or less scores 1. Otherwise the score is `1 - max(0, gap) / maxGap`,
 * clamped via `bounded`.
 *
 * @param {number} gap - Observed gap.
 * @param {number} maxGap - Tolerated gap.
 * @returns {number} Score in [0, 1].
 */
function gapScore(gap, maxGap) {
  if (!Number.isFinite(gap)) {
    return 0;
  }
  const noTolerance = maxGap <= 0;
  if (noTolerance) {
    return gap <= 0 ? 1 : 0;
  }
  const positiveGap = Math.max(0, gap);
  return bounded(1 - positiveGap / maxGap);
}
13079
/**
 * Efficiency score: the smaller of the cost and latency headroom scores.
 *
 * Each headroom score is `budget / actual`, clamped via `bounded`, with the
 * divisor floored at 1e-12. It is 1 whenever the budget or the measurement is
 * not finite.
 *
 * @param {object} metrics - Reads `meanCostUsd`, `p95WallMs`.
 * @param {object} thresholds - Reads `maxMeanCostUsd`, `maxP95WallMs`.
 * @returns {number} Score in [0, 1].
 */
function efficiencyScore(metrics, thresholds) {
  const headroom = (budget, actual) => {
    if (!Number.isFinite(budget) || !Number.isFinite(actual)) {
      return 1;
    }
    return bounded(budget / Math.max(actual, 1e-12));
  };
  const costScore = headroom(thresholds.maxMeanCostUsd, metrics.meanCostUsd);
  const latencyScore = headroom(thresholds.maxP95WallMs, metrics.p95WallMs);
  return Math.min(costScore, latencyScore);
}
13084
/**
 * Clamp a value into [0, 1]. Anything that is not a finite number maps to 0.
 *
 * @param {number} x
 * @returns {number}
 */
function bounded(x) {
  if (!Number.isFinite(x)) {
    return 0;
  }
  // `<=` so that -0 normalizes to +0.
  if (x <= 0) {
    return 0;
  }
  return x >= 1 ? 1 : x;
}
13088
/**
 * One-line scorecard summary: a status/target prefix, the headline metrics
 * and, when there are issues, their codes joined by commas.
 *
 * @param {string} target - Release target label.
 * @param {string} status - Overall status.
 * @param {object} metrics - Reads `scenarioCount`, `searchRuns`, `holdoutRuns`, `passRate`, `meanScore`.
 * @param {object[]} issues - Each contributes its `code`.
 * @returns {string}
 */
function renderSummary(target, status, metrics, issues) {
  const parts = [
    `release confidence ${status}: ${target}`,
    `scenarios=${metrics.scenarioCount} searchRuns=${metrics.searchRuns} holdoutRuns=${metrics.holdoutRuns} passRate=${fmt3(metrics.passRate)} meanScore=${fmt3(metrics.meanScore)}`
  ];
  if (issues.length !== 0) {
    const codes = issues.map((issue) => issue.code).join(",");
    parts.push(`issues=${codes}`);
  }
  return parts.join("; ");
}
13094
/**
 * Format a metric for messages: finite numbers with four decimal places,
 * anything else via `String()`.
 *
 * @param {number} x
 * @returns {string}
 */
function fmt3(x) {
  return Number.isFinite(x) ? x.toFixed(4) : String(x);
}
13098
+
12963
13099
  // src/jsonl-trial-cache.ts
12964
13100
  import { appendFileSync as appendFileSync4, existsSync as existsSync6, mkdirSync as mkdirSync4, readFileSync as readFileSync5 } from "fs";
12965
13101
  import { dirname as dirname4 } from "path";
@@ -13605,9 +13741,9 @@ function passOrthogonality(input) {
13605
13741
  sims.push(cosineSimilarity(vectors[i], vectors[j]));
13606
13742
  }
13607
13743
  }
13608
- const mean9 = sims.length === 0 ? 0 : sims.reduce((a, b) => a + b, 0) / sims.length;
13744
+ const mean10 = sims.length === 0 ? 0 : sims.reduce((a, b) => a + b, 0) / sims.length;
13609
13745
  return {
13610
- orthogonality: Math.max(0, Math.min(1, 1 - mean9)),
13746
+ orthogonality: Math.max(0, Math.min(1, 1 - mean10)),
13611
13747
  passCount: passes.length,
13612
13748
  similarities: sims
13613
13749
  };
@@ -13653,8 +13789,8 @@ function bootstrapCi(baseline, candidate, options = {}) {
13653
13789
  const iterations = options.iterations ?? 1e3;
13654
13790
  const minTotal = options.minTotalSamples ?? 6;
13655
13791
  const rng = mulberry32(options.seed ?? hashSeed(baseline, candidate));
13656
- const baselineMean = mean8(baseline);
13657
- const candidateMean = mean8(candidate);
13792
+ const baselineMean = mean9(baseline);
13793
+ const candidateMean = mean9(candidate);
13658
13794
  const delta = candidateMean - baselineMean;
13659
13795
  if (baseline.length + candidate.length < minTotal || baseline.length === 0 || candidate.length === 0) {
13660
13796
  return {
@@ -13672,7 +13808,7 @@ function bootstrapCi(baseline, candidate, options = {}) {
13672
13808
  for (let i = 0; i < iterations; i++) {
13673
13809
  const bResample = resample(baseline, rng);
13674
13810
  const cResample = resample(candidate, rng);
13675
- deltas[i] = mean8(cResample) - mean8(bResample);
13811
+ deltas[i] = mean9(cResample) - mean9(bResample);
13676
13812
  }
13677
13813
  deltas.sort((a, b) => a - b);
13678
13814
  const lowerIdx = Math.floor(alpha / 2 * iterations);
@@ -13695,7 +13831,7 @@ function bootstrapCi(baseline, candidate, options = {}) {
13695
13831
  verdict
13696
13832
  };
13697
13833
  }
13698
- function mean8(xs) {
13834
+ function mean9(xs) {
13699
13835
  if (xs.length === 0) return 0;
13700
13836
  let s = 0;
13701
13837
  for (const x of xs) s += x;
@@ -13995,12 +14131,10 @@ export {
13995
14131
  Mutex,
13996
14132
  NoopResearcher,
13997
14133
  OTEL_AGENT_EVAL_SCOPE,
13998
- OptimizationLoop,
13999
14134
  PairwiseSteeringOptimizer,
14000
14135
  PrmGrader,
14001
14136
  ProductClient,
14002
14137
  ProjectRegistry,
14003
- PromptOptimizer,
14004
14138
  PromptRegistry,
14005
14139
  REDACTION_VERSION,
14006
14140
  RunCritic,
@@ -14021,6 +14155,7 @@ export {
14021
14155
  analyzeAntiSlop,
14022
14156
  analyzeSeries,
14023
14157
  argHash,
14158
+ assertReleaseConfidence,
14024
14159
  assignFeedbackSplit,
14025
14160
  attributeCounterfactuals,
14026
14161
  deterministicSplit as benchmarkDeterministicSplit,
@@ -14091,6 +14226,7 @@ export {
14091
14226
  evaluateContract,
14092
14227
  evaluateHypothesis,
14093
14228
  evaluateOracles,
14229
+ evaluateReleaseConfidence,
14094
14230
  executeScenario,
14095
14231
  expectAgent,
14096
14232
  exportRewardModel,
@@ -14190,6 +14326,7 @@ export {
14190
14326
  regexMatch,
14191
14327
  regexMatches,
14192
14328
  regressionView,
14329
+ releaseTraceEvidenceFromMultiShotTrials,
14193
14330
  renderMarkdown,
14194
14331
  renderMarkdownReport,
14195
14332
  renderPlaybookMarkdown,