@tangle-network/agent-eval 0.18.0 → 0.19.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +39 -1
- package/dist/index.d.ts +192 -220
- package/dist/index.js +375 -238
- package/dist/index.js.map +1 -1
- package/docs/feature-guide.md +2 -2
- package/docs/wire-protocol.md +1 -1
- package/package.json +12 -10
package/dist/index.js
CHANGED
|
@@ -417,7 +417,7 @@ function confidenceInterval(scores2, confidence = 0.95) {
|
|
|
417
417
|
if (scores2.length === 0) return { mean: 0, lower: 0, upper: 0 };
|
|
418
418
|
if (scores2.length === 1) return { mean: scores2[0], lower: scores2[0], upper: scores2[0] };
|
|
419
419
|
const n = scores2.length;
|
|
420
|
-
const
|
|
420
|
+
const mean10 = scores2.reduce((a, b) => a + b, 0) / n;
|
|
421
421
|
const B = 1e3;
|
|
422
422
|
const bootstrapMeans = [];
|
|
423
423
|
for (let i = 0; i < B; i++) {
|
|
@@ -432,7 +432,7 @@ function confidenceInterval(scores2, confidence = 0.95) {
|
|
|
432
432
|
const lowerIdx = Math.floor(alpha / 2 * B);
|
|
433
433
|
const upperIdx = Math.floor((1 - alpha / 2) * B) - 1;
|
|
434
434
|
return {
|
|
435
|
-
mean:
|
|
435
|
+
mean: mean10,
|
|
436
436
|
lower: bootstrapMeans[lowerIdx],
|
|
437
437
|
upper: bootstrapMeans[Math.min(upperIdx, B - 1)]
|
|
438
438
|
};
|
|
@@ -520,11 +520,11 @@ function pairedTTest(before, after) {
|
|
|
520
520
|
const n = before.length;
|
|
521
521
|
if (n < 2) return { t: 0, df: 0, p: 1 };
|
|
522
522
|
const diffs = before.map((b, i) => after[i] - b);
|
|
523
|
-
const
|
|
524
|
-
const variance2 = diffs.reduce((acc, d) => acc + (d -
|
|
523
|
+
const mean10 = diffs.reduce((a, b) => a + b, 0) / n;
|
|
524
|
+
const variance2 = diffs.reduce((acc, d) => acc + (d - mean10) ** 2, 0) / (n - 1);
|
|
525
525
|
const se = Math.sqrt(variance2 / n);
|
|
526
|
-
if (se === 0) return { t:
|
|
527
|
-
const t =
|
|
526
|
+
if (se === 0) return { t: mean10 === 0 ? 0 : Infinity, df: n - 1, p: mean10 === 0 ? 1 : 0 };
|
|
527
|
+
const t = mean10 / se;
|
|
528
528
|
const df = n - 1;
|
|
529
529
|
const p = 2 * (1 - studentTCdf(Math.abs(t), df));
|
|
530
530
|
return { t, df, p };
|
|
@@ -548,9 +548,9 @@ function wilcoxonSignedRank(before, after) {
|
|
|
548
548
|
}
|
|
549
549
|
let wPlus = 0;
|
|
550
550
|
for (let k = 0; k < n; k++) if (diffs[k] > 0) wPlus += ranks3[k];
|
|
551
|
-
const
|
|
551
|
+
const mean10 = n * (n + 1) / 4;
|
|
552
552
|
const variance2 = n * (n + 1) * (2 * n + 1) / 24;
|
|
553
|
-
const z = (wPlus -
|
|
553
|
+
const z = (wPlus - mean10) / Math.sqrt(variance2);
|
|
554
554
|
const p = 2 * (1 - normalCdf(Math.abs(z)));
|
|
555
555
|
return { w: wPlus, p };
|
|
556
556
|
}
|
|
@@ -3473,174 +3473,6 @@ function rowToRun(row) {
|
|
|
3473
3473
|
};
|
|
3474
3474
|
}
|
|
3475
3475
|
|
|
3476
|
-
// src/power-analysis.ts
|
|
3477
|
-
function requiredSampleSize(opts) {
|
|
3478
|
-
const effect = opts.effect;
|
|
3479
|
-
if (!Number.isFinite(effect) || effect <= 0) return Infinity;
|
|
3480
|
-
const alpha = opts.alpha ?? 0.05;
|
|
3481
|
-
const power = opts.power ?? 0.8;
|
|
3482
|
-
const twoSided = opts.twoSided ?? true;
|
|
3483
|
-
const zAlpha = zQuantile(twoSided ? 1 - alpha / 2 : 1 - alpha);
|
|
3484
|
-
const zBeta = zQuantile(power);
|
|
3485
|
-
const n = 2 * Math.pow((zAlpha + zBeta) / effect, 2);
|
|
3486
|
-
return Math.ceil(n);
|
|
3487
|
-
}
|
|
3488
|
-
function bonferroni(pValues, alpha = 0.05) {
|
|
3489
|
-
const k = pValues.length;
|
|
3490
|
-
const adjusted = pValues.map((p) => Math.min(1, p * k));
|
|
3491
|
-
const significant = adjusted.map((p) => p < alpha);
|
|
3492
|
-
return { adjusted, significant };
|
|
3493
|
-
}
|
|
3494
|
-
function benjaminiHochberg(pValues, fdr = 0.05) {
|
|
3495
|
-
const n = pValues.length;
|
|
3496
|
-
if (n === 0) return { qValues: [], significant: [] };
|
|
3497
|
-
const indexed = pValues.map((p, i) => ({ p, i })).sort((a, b) => a.p - b.p);
|
|
3498
|
-
const q = new Array(n);
|
|
3499
|
-
let minRight = 1;
|
|
3500
|
-
for (let k = n - 1; k >= 0; k--) {
|
|
3501
|
-
const rank = k + 1;
|
|
3502
|
-
const raw = indexed[k].p * n / rank;
|
|
3503
|
-
const bounded = Math.min(minRight, raw);
|
|
3504
|
-
minRight = bounded;
|
|
3505
|
-
q[indexed[k].i] = Math.min(1, bounded);
|
|
3506
|
-
}
|
|
3507
|
-
const significant = q.map((v) => v < fdr);
|
|
3508
|
-
return { qValues: q, significant };
|
|
3509
|
-
}
|
|
3510
|
-
function zQuantile(p) {
|
|
3511
|
-
if (p <= 0 || p >= 1) {
|
|
3512
|
-
if (p === 0) return -Infinity;
|
|
3513
|
-
if (p === 1) return Infinity;
|
|
3514
|
-
return NaN;
|
|
3515
|
-
}
|
|
3516
|
-
const a = [-39.69683028665376, 220.9460984245205, -275.9285104469687, 138.357751867269, -30.66479806614716, 2.506628277459239];
|
|
3517
|
-
const b = [-54.47609879822406, 161.5858368580409, -155.6989798598866, 66.80131188771972, -13.28068155288572];
|
|
3518
|
-
const c = [-0.007784894002430293, -0.3223964580411365, -2.400758277161838, -2.549732539343734, 4.374664141464968, 2.938163982698783];
|
|
3519
|
-
const d = [0.007784695709041462, 0.3224671290700398, 2.445134137142996, 3.754408661907416];
|
|
3520
|
-
const pLow = 0.02425;
|
|
3521
|
-
const pHigh = 1 - pLow;
|
|
3522
|
-
let q;
|
|
3523
|
-
let r;
|
|
3524
|
-
if (p < pLow) {
|
|
3525
|
-
q = Math.sqrt(-2 * Math.log(p));
|
|
3526
|
-
return (((((c[0] * q + c[1]) * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5]) / ((((d[0] * q + d[1]) * q + d[2]) * q + d[3]) * q + 1);
|
|
3527
|
-
}
|
|
3528
|
-
if (p <= pHigh) {
|
|
3529
|
-
q = p - 0.5;
|
|
3530
|
-
r = q * q;
|
|
3531
|
-
return (((((a[0] * r + a[1]) * r + a[2]) * r + a[3]) * r + a[4]) * r + a[5]) * q / (((((b[0] * r + b[1]) * r + b[2]) * r + b[3]) * r + b[4]) * r + 1);
|
|
3532
|
-
}
|
|
3533
|
-
q = Math.sqrt(-2 * Math.log(1 - p));
|
|
3534
|
-
return -(((((c[0] * q + c[1]) * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5]) / ((((d[0] * q + d[1]) * q + d[2]) * q + d[3]) * q + 1);
|
|
3535
|
-
}
|
|
3536
|
-
|
|
3537
|
-
// src/prompt-optimizer.ts
|
|
3538
|
-
var PromptOptimizer = class {
|
|
3539
|
-
async run(config) {
|
|
3540
|
-
const trials = config.trialsPerScenario ?? 3;
|
|
3541
|
-
const alpha = config.significanceLevel ?? 0.05;
|
|
3542
|
-
if (config.variants.length < 2) {
|
|
3543
|
-
throw new Error("PromptOptimizer requires at least 2 variants");
|
|
3544
|
-
}
|
|
3545
|
-
if (config.scenarioIds.length === 0) {
|
|
3546
|
-
throw new Error("PromptOptimizer requires at least 1 scenario");
|
|
3547
|
-
}
|
|
3548
|
-
const rawScores = /* @__PURE__ */ new Map();
|
|
3549
|
-
for (const variant of config.variants) {
|
|
3550
|
-
const scenarioMap = /* @__PURE__ */ new Map();
|
|
3551
|
-
rawScores.set(variant.id, scenarioMap);
|
|
3552
|
-
for (const scenarioId of config.scenarioIds) {
|
|
3553
|
-
const samples = [];
|
|
3554
|
-
for (let t = 0; t < trials; t++) {
|
|
3555
|
-
const score = await config.scoreVariant({
|
|
3556
|
-
variant,
|
|
3557
|
-
scenarioId,
|
|
3558
|
-
trialIndex: t
|
|
3559
|
-
});
|
|
3560
|
-
if (!Number.isFinite(score)) {
|
|
3561
|
-
throw new Error(`scoreVariant returned non-finite: variant=${variant.id} scenario=${scenarioId} trial=${t}`);
|
|
3562
|
-
}
|
|
3563
|
-
samples.push(score);
|
|
3564
|
-
}
|
|
3565
|
-
scenarioMap.set(scenarioId, samples);
|
|
3566
|
-
config.onScenarioComplete?.({
|
|
3567
|
-
variantId: variant.id,
|
|
3568
|
-
scenarioId,
|
|
3569
|
-
scores: samples
|
|
3570
|
-
});
|
|
3571
|
-
}
|
|
3572
|
-
}
|
|
3573
|
-
const scores2 = config.variants.map((variant) => {
|
|
3574
|
-
const scenarioMap = rawScores.get(variant.id);
|
|
3575
|
-
const allSamples = [];
|
|
3576
|
-
const perScenario = {};
|
|
3577
|
-
for (const scenarioId of config.scenarioIds) {
|
|
3578
|
-
const samples = scenarioMap.get(scenarioId) ?? [];
|
|
3579
|
-
allSamples.push(...samples);
|
|
3580
|
-
perScenario[scenarioId] = {
|
|
3581
|
-
mean: samples.length ? samples.reduce((a, b) => a + b, 0) / samples.length : 0,
|
|
3582
|
-
n: samples.length,
|
|
3583
|
-
samples
|
|
3584
|
-
};
|
|
3585
|
-
}
|
|
3586
|
-
const ci = confidenceInterval(allSamples, 0.95);
|
|
3587
|
-
return {
|
|
3588
|
-
variantId: variant.id,
|
|
3589
|
-
mean: ci.mean,
|
|
3590
|
-
ci95: { lower: ci.lower, upper: ci.upper },
|
|
3591
|
-
n: allSamples.length,
|
|
3592
|
-
perScenario
|
|
3593
|
-
};
|
|
3594
|
-
});
|
|
3595
|
-
const rawPairs = [];
|
|
3596
|
-
for (let i = 0; i < scores2.length; i++) {
|
|
3597
|
-
for (let j = i + 1; j < scores2.length; j++) {
|
|
3598
|
-
const a = scores2[i];
|
|
3599
|
-
const b = scores2[j];
|
|
3600
|
-
const { p } = mannWhitneyU(flatSamples(a), flatSamples(b));
|
|
3601
|
-
rawPairs.push({ a, b, p });
|
|
3602
|
-
}
|
|
3603
|
-
}
|
|
3604
|
-
const { qValues } = benjaminiHochberg(rawPairs.map((r) => r.p), alpha);
|
|
3605
|
-
const pairwise2 = rawPairs.map((r, idx) => ({
|
|
3606
|
-
variantA: r.a.variantId,
|
|
3607
|
-
variantB: r.b.variantId,
|
|
3608
|
-
pValue: r.p,
|
|
3609
|
-
qValue: qValues[idx],
|
|
3610
|
-
significant: qValues[idx] < alpha,
|
|
3611
|
-
meanDelta: r.b.mean - r.a.mean
|
|
3612
|
-
}));
|
|
3613
|
-
const sorted = scores2.slice().sort((x, y) => y.mean - x.mean);
|
|
3614
|
-
const winner = sorted[0];
|
|
3615
|
-
const second = sorted[1];
|
|
3616
|
-
const winnerComparisons = pairwise2.filter(
|
|
3617
|
-
(c) => c.variantA === winner.variantId || c.variantB === winner.variantId
|
|
3618
|
-
);
|
|
3619
|
-
const significantOverAll = winnerComparisons.every((c) => c.significant);
|
|
3620
|
-
const ciLowerBoundExceedsSecondMean = winner.ci95.lower > second.mean;
|
|
3621
|
-
return {
|
|
3622
|
-
winner: {
|
|
3623
|
-
variantId: winner.variantId,
|
|
3624
|
-
significant: significantOverAll,
|
|
3625
|
-
ciLowerBoundExceedsSecondMean
|
|
3626
|
-
},
|
|
3627
|
-
scores: scores2,
|
|
3628
|
-
pairwise: pairwise2,
|
|
3629
|
-
config: {
|
|
3630
|
-
trialsPerScenario: trials,
|
|
3631
|
-
significanceLevel: alpha,
|
|
3632
|
-
variants: config.variants.map((v) => v.id),
|
|
3633
|
-
scenarios: config.scenarioIds
|
|
3634
|
-
}
|
|
3635
|
-
};
|
|
3636
|
-
}
|
|
3637
|
-
};
|
|
3638
|
-
function flatSamples(score) {
|
|
3639
|
-
const out = [];
|
|
3640
|
-
for (const s of Object.values(score.perScenario)) out.push(...s.samples);
|
|
3641
|
-
return out;
|
|
3642
|
-
}
|
|
3643
|
-
|
|
3644
3476
|
// src/steering.ts
|
|
3645
3477
|
function mergeSteeringBundle(base, delta) {
|
|
3646
3478
|
return {
|
|
@@ -3831,46 +3663,6 @@ function canonicalInstruction(value) {
|
|
|
3831
3663
|
return normalized.length === 0 ? normalized : normalized[0].toUpperCase() + normalized.slice(1);
|
|
3832
3664
|
}
|
|
3833
3665
|
|
|
3834
|
-
// src/optimization-loop.ts
|
|
3835
|
-
var OptimizationLoop = class {
|
|
3836
|
-
optimizer;
|
|
3837
|
-
constructor(optimizer = new PromptOptimizer()) {
|
|
3838
|
-
this.optimizer = optimizer;
|
|
3839
|
-
}
|
|
3840
|
-
async run(config) {
|
|
3841
|
-
const byId = new Map(config.variants.map((variant) => [variant.id, variant]));
|
|
3842
|
-
const result = await this.optimizer.run({
|
|
3843
|
-
variants: config.variants.map((variant) => ({
|
|
3844
|
-
id: variant.id,
|
|
3845
|
-
prompt: renderSteeringText(variant),
|
|
3846
|
-
metadata: { bundle: variant }
|
|
3847
|
-
})),
|
|
3848
|
-
scenarioIds: config.examples.map((example) => example.scenarioId),
|
|
3849
|
-
trialsPerScenario: config.trialsPerScenario,
|
|
3850
|
-
scoreVariant: async ({ variant, scenarioId, trialIndex }) => {
|
|
3851
|
-
const bundle = byId.get(variant.id);
|
|
3852
|
-
if (!bundle) throw new Error(`unknown steering bundle ${variant.id}`);
|
|
3853
|
-
const example = config.examples.find((item) => item.scenarioId === scenarioId);
|
|
3854
|
-
if (!example) throw new Error(`unknown optimization example ${scenarioId}`);
|
|
3855
|
-
const score = await config.evaluate({ variant: bundle, example, trialIndex });
|
|
3856
|
-
return aggregateRunScore(score, config.scoreWeights);
|
|
3857
|
-
}
|
|
3858
|
-
});
|
|
3859
|
-
return {
|
|
3860
|
-
winner: byId.get(result.winner.variantId),
|
|
3861
|
-
significant: result.winner.significant,
|
|
3862
|
-
reports: result.scores.map((score) => ({
|
|
3863
|
-
variantId: score.variantId,
|
|
3864
|
-
bundle: byId.get(score.variantId),
|
|
3865
|
-
mean: score.mean,
|
|
3866
|
-
ci95: score.ci95,
|
|
3867
|
-
scenarioScores: score.perScenario
|
|
3868
|
-
})),
|
|
3869
|
-
pairwise: result.pairwise
|
|
3870
|
-
};
|
|
3871
|
-
}
|
|
3872
|
-
};
|
|
3873
|
-
|
|
3874
3666
|
// src/steering-optimizer.ts
|
|
3875
3667
|
var PairwiseSteeringOptimizer = class {
|
|
3876
3668
|
optimize(rows, config = {}) {
|
|
@@ -6665,10 +6457,10 @@ function analyzeSeries(values, options = {}) {
|
|
|
6665
6457
|
return { state: "insufficient-data", windowMean: 0, windowCv: 0, tailRun: 0, stable: false };
|
|
6666
6458
|
}
|
|
6667
6459
|
const tail = values.slice(-window);
|
|
6668
|
-
const
|
|
6669
|
-
const variance2 = tail.reduce((acc, v) => acc + (v -
|
|
6460
|
+
const mean10 = tail.reduce((a, b) => a + b, 0) / tail.length;
|
|
6461
|
+
const variance2 = tail.reduce((acc, v) => acc + (v - mean10) ** 2, 0) / tail.length;
|
|
6670
6462
|
const stdDev = Math.sqrt(variance2);
|
|
6671
|
-
const refMean = Math.abs(
|
|
6463
|
+
const refMean = Math.abs(mean10) > 1e-9 ? Math.abs(mean10) : 1;
|
|
6672
6464
|
const cv = stdDev / refMean;
|
|
6673
6465
|
const stable = tail.length >= window && cv <= stableCv;
|
|
6674
6466
|
let tailRun = 0;
|
|
@@ -6689,7 +6481,7 @@ function analyzeSeries(values, options = {}) {
|
|
|
6689
6481
|
} else {
|
|
6690
6482
|
state = "noisy";
|
|
6691
6483
|
}
|
|
6692
|
-
return { state, windowMean:
|
|
6484
|
+
return { state, windowMean: mean10, windowCv: cv, tailRun, stable };
|
|
6693
6485
|
}
|
|
6694
6486
|
|
|
6695
6487
|
// src/state-continuity.ts
|
|
@@ -7119,6 +6911,67 @@ function excerpt3(source, needle) {
|
|
|
7119
6911
|
return (start > 0 ? "\u2026" : "") + source.slice(start, end) + (end < source.length ? "\u2026" : "");
|
|
7120
6912
|
}
|
|
7121
6913
|
|
|
6914
|
+
// src/power-analysis.ts
|
|
6915
|
+
/**
 * Estimate the sample size needed to detect an effect of size `opts.effect`.
 *
 * Evaluates ceil(2 * ((zAlpha + zBeta) / effect)^2), where both z values come
 * from zQuantile.
 * NOTE(review): the formula looks like the per-group size for a two-sample
 * comparison on a standardized effect — confirm against callers.
 *
 * @param {object} opts
 * @param {number} opts.effect - Effect size; non-finite or non-positive values return Infinity.
 * @param {number} [opts.alpha] - Significance level, 0.05 when null/undefined.
 * @param {number} [opts.power] - Target power, 0.8 when null/undefined.
 * @param {boolean} [opts.twoSided] - Split alpha over both tails; true when null/undefined.
 * @returns {number} Rounded-up sample size, or Infinity for an unusable effect.
 */
function requiredSampleSize(opts) {
  const { effect } = opts;
  const usableEffect = Number.isFinite(effect) && effect > 0;
  if (!usableEffect) {
    return Infinity;
  }
  const significance = opts.alpha ?? 0.05;
  const targetPower = opts.power ?? 0.8;
  const bothTails = opts.twoSided ?? true;
  // A two-sided test puts only half of alpha in the upper tail.
  const upperTailMass = bothTails ? significance / 2 : significance;
  const criticalZ = zQuantile(1 - upperTailMass);
  const powerZ = zQuantile(targetPower);
  const ratio = (criticalZ + powerZ) / effect;
  return Math.ceil(2 * Math.pow(ratio, 2));
}
|
|
6926
|
+
/**
 * Bonferroni correction for multiple comparisons.
 *
 * Each p-value is multiplied by the number of tests and capped at 1; a test is
 * flagged significant when its adjusted value is strictly below `alpha`.
 *
 * @param {number[]} pValues - Raw p-values, one per test.
 * @param {number} [alpha=0.05] - Significance cut-off applied to the adjusted values.
 * @returns {{adjusted: number[], significant: boolean[]}} Results in input order.
 */
function bonferroni(pValues, alpha = 0.05) {
  const testCount = pValues.length;
  const inflate = (p) => Math.min(1, p * testCount);
  const passes = (adjustedP) => adjustedP < alpha;
  const adjusted = pValues.map((p) => inflate(p));
  return {
    adjusted,
    significant: adjusted.map((adjustedP) => passes(adjustedP))
  };
}
|
|
6932
|
+
/**
 * Benjamini-Hochberg adjustment: converts p-values into q-values.
 *
 * The p-values are ranked ascending; walking from the largest rank down, each
 * q-value is the running minimum of p * n / rank, capped at 1, and is written
 * back at the p-value's original position.
 *
 * @param {number[]} pValues - Raw p-values, one per test.
 * @param {number} [fdr=0.05] - A test is significant when its q-value is strictly below this.
 * @returns {{qValues: number[], significant: boolean[]}} Results in input order.
 */
function benjaminiHochberg(pValues, fdr = 0.05) {
  const total = pValues.length;
  if (total === 0) {
    return { qValues: [], significant: [] };
  }
  const ascending = pValues
    .map((p, i) => ({ p, i }))
    .sort((left, right) => left.p - right.p);
  const qValues = new Array(total);
  // Smallest scaled p-value seen so far among the higher ranks; keeps q monotone.
  let runningMin = 1;
  let rank = total;
  while (rank >= 1) {
    const entry = ascending[rank - 1];
    runningMin = Math.min(runningMin, entry.p * total / rank);
    qValues[entry.i] = Math.min(1, runningMin);
    rank -= 1;
  }
  return { qValues, significant: qValues.map((q) => q < fdr) };
}
|
|
6948
|
+
/**
 * Inverse CDF (quantile function) of the standard normal distribution.
 *
 * Uses a piecewise rational approximation: one rational function for the
 * central region and another, in sqrt(-2 ln p), for each tail.
 * NOTE(review): the coefficients appear to be Acklam's published constants —
 * confirm before quoting an error bound.
 *
 * @param {number} p - Probability.
 * @returns {number} -Infinity for p === 0, Infinity for p === 1, NaN for any
 *   other value outside (0, 1), otherwise the approximate z-score.
 */
function zQuantile(p) {
  const outsideOpenUnit = p <= 0 || p >= 1;
  if (outsideOpenUnit) {
    switch (p) {
      case 0:
        return -Infinity;
      case 1:
        return Infinity;
      default:
        return NaN;
    }
  }
  // Coefficients are listed highest power first; each denominator ends with the constant 1.
  const centralNum = [-39.69683028665376, 220.9460984245205, -275.9285104469687, 138.357751867269, -30.66479806614716, 2.506628277459239];
  const centralDen = [-54.47609879822406, 161.5858368580409, -155.6989798598866, 66.80131188771972, -13.28068155288572, 1];
  const tailNum = [-0.007784894002430293, -0.3223964580411365, -2.400758277161838, -2.549732539343734, 4.374664141464968, 2.938163982698783];
  const tailDen = [0.007784695709041462, 0.3224671290700398, 2.445134137142996, 3.754408661907416, 1];
  const lowerCut = 0.02425;
  const upperCut = 1 - lowerCut;
  // Horner evaluation: (((c0 * x + c1) * x + c2) ...).
  const horner = (coeffs, x) => coeffs.reduce((acc, coeff) => acc * x + coeff);
  if (p < lowerCut) {
    const t = Math.sqrt(-2 * Math.log(p));
    return horner(tailNum, t) / horner(tailDen, t);
  }
  if (p <= upperCut) {
    const centered = p - 0.5;
    const squared = centered * centered;
    return horner(centralNum, squared) * centered / horner(centralDen, squared);
  }
  const t = Math.sqrt(-2 * Math.log(1 - p));
  return -horner(tailNum, t) / horner(tailDen, t);
}
|
|
6974
|
+
|
|
7122
6975
|
// src/behavior-dsl.ts
|
|
7123
6976
|
var BehaviorAssertion = class {
|
|
7124
6977
|
constructor(store, runId) {
|
|
@@ -7617,12 +7470,12 @@ async function paraphraseRobustness(prompt, mutators, scoreFn, options = {}) {
|
|
|
7617
7470
|
variantScores.push({ mutator: id, score, mutated });
|
|
7618
7471
|
all.push(score);
|
|
7619
7472
|
}
|
|
7620
|
-
const
|
|
7621
|
-
const variance2 = all.reduce((a, v) => a + (v -
|
|
7473
|
+
const mean10 = all.reduce((a, b) => a + b, 0) / all.length;
|
|
7474
|
+
const variance2 = all.reduce((a, v) => a + (v - mean10) ** 2, 0) / all.length;
|
|
7622
7475
|
const stdDev = Math.sqrt(variance2);
|
|
7623
|
-
const ref = Math.abs(
|
|
7476
|
+
const ref = Math.abs(mean10) > 1e-9 ? Math.abs(mean10) : 1;
|
|
7624
7477
|
const robustness = Math.max(0, 1 - stdDev / ref);
|
|
7625
|
-
return { originalScore, variantScores, meanScore:
|
|
7478
|
+
return { originalScore, variantScores, meanScore: mean10, stdDev, robustness };
|
|
7626
7479
|
}
|
|
7627
7480
|
var lowercaseMutator = (p) => p.toLowerCase();
|
|
7628
7481
|
var sentenceReorderMutator = (p, seed) => {
|
|
@@ -8543,8 +8396,8 @@ async function prmBestOfN(store, grader, runIds) {
|
|
|
8543
8396
|
if (runIds.length === 0) throw new Error("prmBestOfN: at least 1 candidate required");
|
|
8544
8397
|
const graded = await Promise.all(runIds.map((id) => grader.grade(store, id)));
|
|
8545
8398
|
const ranked = [...graded].sort((a, b) => b.aggregateScore - a.aggregateScore);
|
|
8546
|
-
const
|
|
8547
|
-
const variance2 = graded.reduce((a, g) => a + (g.aggregateScore -
|
|
8399
|
+
const mean10 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
|
|
8400
|
+
const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean10) ** 2, 0) / graded.length;
|
|
8548
8401
|
return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
|
|
8549
8402
|
}
|
|
8550
8403
|
async function prmEnsembleBestOfN(store, graders, runIds) {
|
|
@@ -8566,8 +8419,8 @@ async function prmEnsembleBestOfN(store, graders, runIds) {
|
|
|
8566
8419
|
const ranked = [...byRun.values()].sort(
|
|
8567
8420
|
(a, b) => (bordaScores.get(b.runId) ?? 0) - (bordaScores.get(a.runId) ?? 0)
|
|
8568
8421
|
);
|
|
8569
|
-
const
|
|
8570
|
-
const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore -
|
|
8422
|
+
const mean10 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
|
|
8423
|
+
const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean10) ** 2, 0) / ranked.length;
|
|
8571
8424
|
return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
|
|
8572
8425
|
}
|
|
8573
8426
|
|
|
@@ -9097,8 +8950,8 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
|
|
|
9097
8950
|
const sRuns = runs.filter((r) => r.scenarioId === s.id);
|
|
9098
8951
|
const scores2 = sRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
|
|
9099
8952
|
if (scores2.length < 3) continue;
|
|
9100
|
-
const
|
|
9101
|
-
const variance2 = scores2.reduce((a, b) => a + (b -
|
|
8953
|
+
const mean10 = scores2.reduce((a, b) => a + b, 0) / scores2.length;
|
|
8954
|
+
const variance2 = scores2.reduce((a, b) => a + (b - mean10) ** 2, 0) / scores2.length;
|
|
9102
8955
|
if (variance2 > varianceThreshold) {
|
|
9103
8956
|
targets.push({
|
|
9104
8957
|
reason: "high-variance",
|
|
@@ -12960,6 +12813,289 @@ function traceExcerpt(trace) {
|
|
|
12960
12813
|
return void 0;
|
|
12961
12814
|
}
|
|
12962
12815
|
|
|
12816
|
+
// src/release-confidence.ts
|
|
12817
|
+
// Baseline gate settings. evaluateReleaseConfidence spreads the caller's
// `input.thresholds` over this object, so any key may be overridden per call.
var DEFAULT_THRESHOLDS = {
  // Corpus: a dataset manifest or scenario list must exist and be large enough.
  requireCorpus: true,
  minScenarioCount: 1,

  // Run volume per split; holdout evidence is mandatory by default.
  minSearchRuns: 1,
  minHoldoutRuns: 1,
  requireHoldout: true,

  // Quality floors and the largest tolerated search-vs-holdout gap.
  minPassRate: 0.8,
  minMeanScore: 0.7,
  maxOverfitGap: 0.15,

  // Efficiency budgets; infinite means no budget is enforced.
  maxMeanCostUsd: Infinity,
  maxP95WallMs: Infinity,

  // Diagnostics: failed rows must carry ASI; scores below the threshold count as failures.
  requireAsiForFailures: true,
  failureScoreThreshold: 0.5
};
|
|
12831
|
+
/**
 * Convert multi-shot trial records into release trace-evidence rows.
 *
 * @param {object[]} trials - Trial records; the fields read are scenarioId,
 *   variantId, split, score, ok, trace.turns, cost, durationMs, error, asi and metadata.
 * @returns {object[]} One evidence row per trial, in the same order.
 */
function releaseTraceEvidenceFromMultiShotTrials(trials) {
  // Only "holdout" and "dev" survive as-is; every other split is reported as "search".
  const normalizeSplit = (split) => {
    if (split === "holdout" || split === "dev") {
      return split;
    }
    return "search";
  };
  return trials.map((trial) => {
    const turns = trial.trace?.turns;
    const evidence = {
      scenarioId: trial.scenarioId,
      candidateId: trial.variantId,
      split: normalizeSplit(trial.split),
      score: trial.score,
      ok: trial.ok,
      // The turn count is only known when the trace carries a real array of turns.
      turnCount: Array.isArray(turns) ? turns.length : void 0,
      costUsd: trial.cost,
      durationMs: trial.durationMs,
      // Any truthy error is reported under the single "runtime_error" mode.
      failureMode: trial.error ? "runtime_error" : void 0,
      asi: trial.asi,
      metadata: trial.metadata
    };
    return evidence;
  });
}
|
|
12846
|
+
/**
 * Build a release-confidence scorecard for a candidate.
 *
 * Merges caller thresholds over DEFAULT_THRESHOLDS, narrows runs and traces to
 * the candidate, derives aggregate metrics, runs the corpus / quality /
 * generalization / diagnostics / efficiency checks, and summarises the result.
 *
 * @param {object} input - Fields read here: thresholds, candidateId, baselineId,
 *   runs, traces, scenarios, dataset, gateDecision, target.
 * @returns {object} Scorecard with target, candidateId, baselineId, status
 *   ("fail" when any issue is critical, "warn" when only non-critical issues
 *   exist, otherwise "pass"), promote, axes, issues, metrics, dataset,
 *   gateDecision and summary.
 */
function evaluateReleaseConfidence(input) {
  const thresholds = { ...DEFAULT_THRESHOLDS, ...input.thresholds };
  const candidateId = input.candidateId ?? null;
  const gateDecision = input.gateDecision ?? null;
  const runs = filterCandidate(input.runs ?? [], candidateId, input.baselineId);
  const traces = filterTraceCandidate(input.traces ?? [], candidateId, input.baselineId);
  const scenarios = input.scenarios ?? [];
  // A dataset manifest, when supplied, takes precedence over counting the scenarios.
  const scenarioCount = input.dataset?.scenarioCount ?? scenarios.length;
  const splitCounts = input.dataset?.splitCounts ?? countScenarioSplits(scenarios);
  const searchScores = scoresFor(runs, "search");
  const holdoutScores = scoresFor(runs, "holdout");
  const allScores = [...searchScores, ...holdoutScores];
  const traceScores = traces.map((t) => t.score).filter(isFiniteNumber);
  // Trace scores are only a fallback for when no run produced a score.
  const scoreUniverse = allScores.length > 0 ? allScores : traceScores;
  const searchRuns = runs.filter((r) => r.splitTag === "search").length;
  const holdoutRuns = runs.filter((r) => r.splitTag === "holdout").length;
  const searchMeanScore = mean8(searchScores);
  const holdoutMeanScore = mean8(holdoutScores);
  // Computed once and reused for both failure metrics.
  // NOTE(review): assumes failedRows has no side effects — its body is not visible here.
  const failed = failedRows(runs, traces, thresholds.failureScoreThreshold);
  const metrics = {
    scenarioCount,
    searchRuns,
    holdoutRuns,
    passRate: passRate(runs, traces, thresholds.failureScoreThreshold),
    meanScore: mean8(scoreUniverse),
    searchMeanScore,
    holdoutMeanScore,
    overfitGap: safeDiff2(searchMeanScore, holdoutMeanScore),
    meanCostUsd: mean8([...runs.map((r) => r.costUsd), ...traces.map((t) => t.costUsd).filter(isFiniteNumber)]),
    p95WallMs: percentile([...runs.map((r) => r.wallMs), ...traces.map((t) => t.durationMs).filter(isFiniteNumber)], 0.95),
    failedRows: failed.length,
    failuresWithAsi: failed.filter((row) => row.hasAsi).length,
    singleShotTraces: traces.filter((t) => t.turnCount === 1).length,
    multiShotTraces: traces.filter((t) => (t.turnCount ?? 0) > 1).length,
    splitCounts,
    domainCounts: countDomains(scenarios),
    failureModeCounts: countFailureModes(runs, traces, thresholds.failureScoreThreshold),
    responsibleSurfaceCounts: countResponsibleSurfaces(traces)
  };
  const issues = [];
  checkCorpus(input, thresholds, metrics, issues);
  checkQuality(thresholds, metrics, issues);
  checkGeneralization(gateDecision, thresholds, metrics, issues);
  checkDiagnostics(thresholds, metrics, issues);
  checkEfficiency(thresholds, metrics, issues);
  const axes = buildAxes(metrics, thresholds, gateDecision, issues);
  const status = issues.some((i) => i.severity === "critical") ? "fail" : issues.length > 0 ? "warn" : "pass";
  return {
    target: input.target,
    candidateId,
    baselineId: input.baselineId ?? null,
    status,
    // Promotion needs a clean pass and, when a gate decision exists, its approval too.
    promote: status === "pass" && (gateDecision ? gateDecision.promote : true),
    axes,
    issues,
    metrics,
    dataset: input.dataset ?? null,
    gateDecision,
    summary: renderSummary(input.target, status, metrics, issues)
  };
}
|
|
12905
|
+
/**
 * Evaluate release confidence and throw when the scorecard fails.
 *
 * @param {object} input - Passed unchanged to evaluateReleaseConfidence.
 * @returns {object} The scorecard when its status is anything other than "fail".
 * @throws {Error} With the scorecard summary as its message when status is "fail".
 */
function assertReleaseConfidence(input) {
  const card = evaluateReleaseConfidence(input);
  if (card.status !== "fail") {
    return card;
  }
  throw new Error(card.summary);
}
|
|
12912
|
+
/**
 * Narrow runs to the candidate under evaluation.
 *
 * A truthy `candidateId` keeps only that candidate's runs. Otherwise a truthy
 * `baselineId` keeps everything except the baseline's runs. With neither, a
 * shallow copy of `runs` is returned.
 *
 * @param {object[]} runs
 * @param {?string} candidateId
 * @param {?string} baselineId
 * @returns {object[]} A new array; `runs` is never returned as-is.
 */
function filterCandidate(runs, candidateId, baselineId) {
  if (!candidateId && !baselineId) {
    return [...runs];
  }
  const keep = candidateId
    ? (run) => run.candidateId === candidateId
    : (run) => run.candidateId !== baselineId;
  return runs.filter((run) => keep(run));
}
|
|
12917
|
+
/**
 * Narrow traces to the candidate under evaluation.
 *
 * Traces whose `candidateId` is undefined are always kept. A truthy
 * `candidateId` additionally keeps that candidate's traces; otherwise a truthy
 * `baselineId` keeps every trace that does not belong to the baseline. With
 * neither, a shallow copy of `traces` is returned.
 *
 * @param {object[]} traces
 * @param {?string} candidateId
 * @param {?string} baselineId
 * @returns {object[]} A new array.
 */
function filterTraceCandidate(traces, candidateId, baselineId) {
  const untagged = (trace) => trace.candidateId === void 0;
  if (candidateId) {
    return traces.filter((trace) => untagged(trace) || trace.candidateId === candidateId);
  }
  if (baselineId) {
    return traces.filter((trace) => untagged(trace) || trace.candidateId !== baselineId);
  }
  return [...traces];
}
|
|
12922
|
+
/**
 * Append corpus-axis issues to `issues`.
 *
 * Checks, in order: a dataset manifest or scenarios were supplied (when
 * `requireCorpus`), the scenario count reaches `minScenarioCount`, and the
 * corpus has holdout scenarios (when `requireHoldout`).
 *
 * @param {object} input - Evaluation input; `dataset` and `scenarios` are read.
 * @param {object} thresholds - Effective thresholds.
 * @param {object} metrics - Needs `scenarioCount` and `splitCounts`.
 * @param {object[]} issues - Mutated in place; every issue pushed here is critical.
 * @returns {void}
 */
function checkCorpus(input, thresholds, metrics, issues) {
  if (thresholds.requireCorpus && !input.dataset && (input.scenarios?.length ?? 0) === 0) {
    issues.push({ axis: "corpus", severity: "critical", code: "missing_corpus", detail: "No Dataset manifest or scenarios supplied." });
  }
  if (metrics.scenarioCount < thresholds.minScenarioCount) {
    issues.push({ axis: "corpus", severity: "critical", code: "few_scenarios", detail: `${metrics.scenarioCount} scenario(s) < min ${thresholds.minScenarioCount}.` });
  }
  // A manifest may omit the holdout key entirely; that also means there are no
  // holdout scenarios, and `undefined === 0` would otherwise let it pass.
  const holdoutCount = metrics.splitCounts.holdout ?? 0;
  if (thresholds.requireHoldout && holdoutCount === 0) {
    issues.push({ axis: "corpus", severity: "critical", code: "missing_holdout_split", detail: "Corpus has no holdout scenarios." });
  }
}
|
|
12933
|
+
/**
 * Append quality-axis issues to `issues`.
 *
 * Flags too few search runs, a pass rate below `minPassRate`, and a mean score
 * below `minMeanScore`, in that order.
 *
 * @param {object} thresholds - Needs minSearchRuns, minPassRate, minMeanScore.
 * @param {object} metrics - Needs searchRuns, passRate, meanScore.
 * @param {object[]} issues - Mutated in place; every issue pushed here is critical.
 * @returns {void}
 */
function checkQuality(thresholds, metrics, issues) {
  const report = (code, detail) => {
    issues.push({ axis: "quality", severity: "critical", code, detail });
  };
  const { searchRuns, passRate: observedPassRate, meanScore } = metrics;
  if (searchRuns < thresholds.minSearchRuns) {
    report("few_search_runs", `${searchRuns} search run(s) < min ${thresholds.minSearchRuns}.`);
  }
  if (observedPassRate < thresholds.minPassRate) {
    report("low_pass_rate", `passRate ${fmt3(observedPassRate)} < ${fmt3(thresholds.minPassRate)}.`);
  }
  if (meanScore < thresholds.minMeanScore) {
    report("low_mean_score", `meanScore ${fmt3(meanScore)} < ${fmt3(thresholds.minMeanScore)}.`);
  }
}
|
|
12944
|
+
/**
 * Append generalization-axis issues to `issues`.
 *
 * Flags too few holdout runs (only when `requireHoldout`), a finite
 * search-vs-holdout gap above `maxOverfitGap`, and a gate decision that
 * declined promotion.
 *
 * @param {?object} gateDecision - Optional; `promote`, `rejectionCode` and `reason` are read.
 * @param {object} thresholds - Needs requireHoldout, minHoldoutRuns, maxOverfitGap.
 * @param {object} metrics - Needs holdoutRuns and overfitGap.
 * @param {object[]} issues - Mutated in place; every issue pushed here is critical.
 * @returns {void}
 */
function checkGeneralization(gateDecision, thresholds, metrics, issues) {
  const report = (code, detail) => {
    issues.push({ axis: "generalization", severity: "critical", code, detail });
  };
  const { holdoutRuns, overfitGap } = metrics;
  const holdoutShort = thresholds.requireHoldout && holdoutRuns < thresholds.minHoldoutRuns;
  if (holdoutShort) {
    report("few_holdout_runs", `${holdoutRuns} holdout run(s) < min ${thresholds.minHoldoutRuns}.`);
  }
  // A non-finite gap is skipped rather than compared.
  const gapTooWide = Number.isFinite(overfitGap) && overfitGap > thresholds.maxOverfitGap;
  if (gapTooWide) {
    report("overfit_gap", `search-holdout gap ${fmt3(overfitGap)} > ${fmt3(thresholds.maxOverfitGap)}.`);
  }
  if (gateDecision && !gateDecision.promote) {
    report(`gate_${gateDecision.rejectionCode ?? "reject"}`, gateDecision.reason);
  }
}
|
|
12955
|
+
/**
 * Append a diagnostics-axis issue when some failed rows carry no ASI.
 *
 * Does nothing unless `thresholds.requireAsiForFailures` is truthy.
 *
 * @param {object} thresholds - Needs requireAsiForFailures.
 * @param {object} metrics - Needs failedRows and failuresWithAsi.
 * @param {object[]} issues - Mutated in place.
 * @returns {void}
 */
function checkDiagnostics(thresholds, metrics, issues) {
  const { failedRows: failedCount, failuresWithAsi } = metrics;
  const enforced = Boolean(thresholds.requireAsiForFailures);
  if (enforced && failedCount > failuresWithAsi) {
    const undiagnosed = failedCount - failuresWithAsi;
    issues.push({
      axis: "diagnostics",
      severity: "critical",
      code: "missing_failure_asi",
      detail: `${undiagnosed} failed row(s) have no actionable side information.`
    });
  }
}
|
|
12966
|
+
/**
 * Append efficiency-axis issues when cost or latency exceed their budgets.
 *
 * @param {object} thresholds - Needs maxMeanCostUsd and maxP95WallMs.
 * @param {object} metrics - Needs meanCostUsd and p95WallMs.
 * @param {object[]} issues - Mutated in place; every issue pushed here is critical.
 * @returns {void}
 */
function checkEfficiency(thresholds, metrics, issues) {
  // Each budget pairs a metric with the threshold that caps it; cost is checked first.
  const budgets = [
    { code: "cost_budget", metric: "meanCostUsd", limit: "maxMeanCostUsd" },
    { code: "latency_budget", metric: "p95WallMs", limit: "maxP95WallMs" }
  ];
  for (const { code, metric, limit } of budgets) {
    const observed = metrics[metric];
    const ceiling = thresholds[limit];
    if (observed > ceiling) {
      issues.push({ axis: "efficiency", severity: "critical", code, detail: `${metric} ${fmt3(observed)} > ${fmt3(ceiling)}.` });
    }
  }
}
|
|
12974
|
+
/**
 * Assemble the five scorecard axes: corpus, quality, generalization,
 * diagnostics and efficiency, in that order.
 *
 * @param {object} metrics - Aggregate metrics for the candidate.
 * @param {object} thresholds - Effective thresholds.
 * @param {?object} gateDecision - Optional gate decision; a declined promotion zeroes the generalization score.
 * @param {object[]} issues - Issues already collected; each axis derives its status from its own issues.
 * @returns {object[]} Axis records produced by `axis`.
 */
function buildAxes(metrics, thresholds, gateDecision, issues) {
  const axes = [];

  const corpusScore = bounded(metrics.scenarioCount / Math.max(1, thresholds.minScenarioCount));
  axes.push(axis("corpus", issues, corpusScore, `${metrics.scenarioCount} scenarios; holdout=${metrics.splitCounts.holdout}`));

  // Quality is only as good as the weaker of pass rate and mean score.
  const qualityScore = Math.min(metrics.passRate, metrics.meanScore);
  axes.push(axis("quality", issues, qualityScore, `passRate=${fmt3(metrics.passRate)} meanScore=${fmt3(metrics.meanScore)}`));

  const gateRejected = gateDecision && !gateDecision.promote;
  const generalizationScore = gateRejected ? 0 : gapScore(metrics.overfitGap, thresholds.maxOverfitGap);
  axes.push(axis("generalization", issues, generalizationScore, `holdoutRuns=${metrics.holdoutRuns} overfitGap=${fmt3(metrics.overfitGap)}`));

  // With no failures there is nothing left to diagnose, so the axis scores 1.
  const diagnosticsScore = metrics.failedRows === 0 ? 1 : metrics.failuresWithAsi / metrics.failedRows;
  axes.push(axis("diagnostics", issues, diagnosticsScore, `failuresWithAsi=${metrics.failuresWithAsi}/${metrics.failedRows}`));

  const costLatencyScore = efficiencyScore(metrics, thresholds);
  axes.push(axis("efficiency", issues, costLatencyScore, `meanCostUsd=${fmt3(metrics.meanCostUsd)} p95WallMs=${fmt3(metrics.p95WallMs)}`));

  return axes;
}
|
|
12983
|
+
/**
 * Build one scorecard axis record.
 *
 * Status is "fail" when any issue on this axis is critical, "warn" when the
 * axis has only non-critical issues, and "pass" when it has none.
 *
 * @param {string} name - Axis name, matched against `issue.axis`.
 * @param {object[]} issues - All collected issues.
 * @param {number} score - Raw score; passed through `bounded` before being stored.
 * @param {string} detail - Human-readable detail string.
 * @returns {{name: string, status: string, score: number, detail: string}}
 */
function axis(name, issues, score, detail) {
  let status = "pass";
  for (const issue of issues) {
    if (issue.axis !== name) {
      continue;
    }
    if (issue.severity === "critical") {
      // One critical issue decides the axis; no need to look further.
      status = "fail";
      break;
    }
    status = "warn";
  }
  return { name, status, score: bounded(score), detail };
}
|
|
12988
|
+
/**
 * Counts scenarios per split.
 *
 * The four known splits (train, dev, test, holdout) are always present in the
 * result, starting at 0. A scenario whose `split` is null or undefined is
 * counted as "train".
 *
 * @param {Iterable<{split?: string}>} scenarios - Scenarios to tally.
 * @returns {Record<string, number>} Count per split label.
 */
function countScenarioSplits(scenarios) {
  const counts = { train: 0, dev: 0, test: 0, holdout: 0 };
  for (const scenario of scenarios) {
    const split = scenario.split ?? "train";
    // An unrecognised label must start from 0: incrementing a missing key
    // (or an inherited Object.prototype member) would produce NaN.
    const seen = Object.prototype.hasOwnProperty.call(counts, split) ? counts[split] : 0;
    counts[split] = seen + 1;
  }
  return counts;
}
|
|
12993
|
+
/**
 * Counts scenarios per domain.
 *
 * The domain is `tags.domain`, falling back to `tags.category`, and finally
 * to the literal "uncategorized".
 *
 * @param {Iterable<{tags?: {domain?: string, category?: string}}>} scenarios - Scenarios to tally.
 * @returns {Record<string, number>} Count per domain label.
 */
function countDomains(scenarios) {
  const out = {};
  for (const scenario of scenarios) {
    const domain = scenario.tags?.domain ?? scenario.tags?.category ?? "uncategorized";
    // Tag values are free-form strings, so check own properties only: a label
    // such as "constructor" would otherwise read the inherited
    // Object.prototype member instead of starting from 0.
    const seen = Object.prototype.hasOwnProperty.call(out, domain) ? out[domain] : 0;
    out[domain] = seen + 1;
  }
  return out;
}
|
|
13001
|
+
/**
 * Tallies failure modes across runs and traces.
 *
 * A run is a failure when it has a truthy `failureMode`, or when its score
 * (`outcome.holdoutScore`, falling back to `outcome.searchScore`) is defined
 * and below `threshold`. A trace is a failure when it has a truthy
 * `failureMode`, when `ok === false`, or when its `score` is defined and below
 * `threshold`. Failures with no explicit mode are filed under "low_score", or
 * "not_ok" for traces with `ok === false`.
 *
 * @param {Iterable<object>} runs - Run records.
 * @param {Iterable<object>} traces - Trace records.
 * @param {number} threshold - Minimum passing score.
 * @returns {Record<string, number>} Count per failure mode.
 */
function countFailureModes(runs, traces, threshold) {
  const tally = {};
  const bump = (mode) => {
    tally[mode] = (tally[mode] ?? 0) + 1;
  };
  const isLow = (score) => score !== void 0 && score < threshold;

  for (const run of runs) {
    const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
    if (!run.failureMode && !isLow(score)) continue;
    bump(run.failureMode ?? "low_score");
  }

  for (const trace of traces) {
    const notOk = trace.ok === false;
    if (!trace.failureMode && !notOk && !isLow(trace.score)) continue;
    bump(trace.failureMode ?? (notOk ? "not_ok" : "low_score"));
  }

  return tally;
}
|
|
13018
|
+
/**
 * Tallies the `responsibleSurface` of every ASI entry attached to the traces.
 *
 * Traces whose `asi` is null or undefined contribute nothing. Entries with no
 * `responsibleSurface` are filed under "unknown".
 *
 * @param {Iterable<{asi?: Iterable<{responsibleSurface?: string}>}>} traces - Trace records.
 * @returns {Record<string, number>} Count per responsible surface.
 */
function countResponsibleSurfaces(traces) {
  const tally = {};
  for (const { asi } of traces) {
    if (asi == null) continue;
    for (const entry of asi) {
      const surface = entry.responsibleSurface ?? "unknown";
      const seen = tally[surface] ?? 0;
      tally[surface] = seen + 1;
    }
  }
  return tally;
}
|
|
13028
|
+
/**
 * Lists every failed run and trace, recording whether each one carries
 * actionable side information (ASI).
 *
 * A run is failed when it has a truthy `failureMode`, or when its score
 * (`outcome.holdoutScore`, falling back to `outcome.searchScore`) is defined
 * and below `threshold`. A trace is failed when it has a truthy `failureMode`,
 * when `ok === false`, or when its `score` is defined and below `threshold`.
 *
 * @param {Iterable<object>} runs - Run records; ASI is read from the numeric `outcome.raw.asi`.
 * @param {Iterable<object>} traces - Trace records; ASI is read from the `asi` array.
 * @param {number} threshold - Minimum passing score.
 * @returns {Array<{hasAsi: boolean}>} One entry per failed row, runs first.
 */
function failedRows(runs, traces, threshold) {
  const out = [];
  for (const run of runs) {
    const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
    if (run.failureMode || score !== void 0 && score < threshold) {
      // A failed run may carry no raw metrics at all; treat that as "no ASI"
      // instead of throwing while building the report.
      const asiMetric = run.outcome.raw?.asi;
      out.push({ hasAsi: typeof asiMetric === "number" && asiMetric > 0 });
    }
  }
  for (const trace of traces) {
    if (trace.failureMode || trace.ok === false || trace.score !== void 0 && trace.score < threshold) {
      out.push({ hasAsi: (trace.asi?.length ?? 0) > 0 });
    }
  }
  return out;
}
|
|
13044
|
+
/**
 * Fraction of runs and traces that pass, in [0, 1].
 *
 * A run passes when it has no `failureMode` and its score
 * (`outcome.holdoutScore`, falling back to `outcome.searchScore`) is defined
 * and at least `threshold`; a run with no score never passes. A trace passes
 * when it has no `failureMode`, `ok` is not `false`, and its `score` is either
 * absent or at least `threshold`.
 *
 * @param {Array<object>} runs - Run records.
 * @param {Array<object>} traces - Trace records.
 * @param {number} threshold - Minimum passing score.
 * @returns {number} Pass fraction, or 0 when there are no runs or traces.
 */
function passRate(runs, traces, threshold) {
  const runPasses = (run) => {
    const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
    return !run.failureMode && score !== void 0 && score >= threshold;
  };
  // A trace with an explicit failureMode is counted as a failed row by
  // failedRows/countFailureModes, so it must not count as a pass here.
  const tracePasses = (trace) => !trace.failureMode && trace.ok !== false && (trace.score === void 0 || trace.score >= threshold);

  const outcomes = [
    ...runs.map((run) => runPasses(run)),
    ...traces.map((trace) => tracePasses(trace))
  ];
  if (outcomes.length === 0) return 0;
  return outcomes.filter(Boolean).length / outcomes.length;
}
|
|
13055
|
+
/**
 * Collects the finite scores of the runs tagged with the given split.
 *
 * Reads `outcome.holdoutScore` when `split` is "holdout" and
 * `outcome.searchScore` for any other split. Values that fail
 * `isFiniteNumber` are dropped.
 *
 * @param {Array<object>} runs - Run records, matched on `splitTag`.
 * @param {string} split - Split label to select.
 * @returns {number[]} Finite scores, in run order.
 */
function scoresFor(runs, split) {
  const scoreKey = split === "holdout" ? "holdoutScore" : "searchScore";
  return runs.reduce((scores, run) => {
    if (run.splitTag !== split) return scores;
    const value = run.outcome[scoreKey];
    if (isFiniteNumber(value)) scores.push(value);
    return scores;
  }, []);
}
|
|
13058
|
+
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} xs - Values to average.
 * @returns {number} The mean, or NaN for an empty list.
 */
function mean8(xs) {
  const count = xs.length;
  if (count === 0) return Number.NaN;
  let total = 0;
  xs.forEach((x) => {
    total += x;
  });
  return total / count;
}
|
|
13062
|
+
/**
 * Returns the element at rank `ceil(p * n) - 1` of the ascending-sorted
 * values, with the rank clamped to the valid index range. The input is not
 * modified.
 *
 * @param {number[]} xs - Values to rank.
 * @param {number} p - Fraction to select (for example 0.95).
 * @returns {number} The selected value, or NaN for an empty list.
 */
function percentile(xs, p) {
  const count = xs.length;
  if (count === 0) return Number.NaN;
  // Sort a copy so the caller's array keeps its order.
  const ascending = Array.from(xs).sort((lo, hi) => lo - hi);
  const rank = Math.ceil(p * count) - 1;
  const index = Math.min(count - 1, Math.max(0, rank));
  return ascending[index];
}
|
|
13067
|
+
/**
 * Tells whether `value` is a number that is neither NaN nor ±Infinity.
 *
 * @param {unknown} value - Value to test.
 * @returns {boolean} True only for finite values of type number.
 */
function isFiniteNumber(value) {
  // Number.isFinite never coerces: it returns false for any non-number.
  return Number.isFinite(value);
}
|
|
13070
|
+
/**
 * Subtracts `b` from `a`, yielding NaN unless both operands are finite.
 *
 * @param {number} a - Minuend.
 * @param {number} b - Subtrahend.
 * @returns {number} `a - b`, or NaN when either operand is not finite.
 */
function safeDiff2(a, b) {
  const bothFinite = Number.isFinite(a) && Number.isFinite(b);
  return bothFinite ? a - b : Number.NaN;
}
|
|
13074
|
+
/**
 * Scores a gap against the largest gap that is tolerated.
 *
 * A non-finite gap scores 0. When `maxGap` is zero or negative the score is
 * all-or-nothing: 1 if the gap is not positive, otherwise 0. In every other
 * case the score is `1 - max(0, gap) / maxGap`, passed through `bounded`.
 *
 * @param {number} gap - Observed gap.
 * @param {number} maxGap - Largest tolerated gap.
 * @returns {number} Score derived from the gap.
 */
function gapScore(gap, maxGap) {
  if (!Number.isFinite(gap)) return 0;
  // Negated `<=` on purpose: a NaN maxGap must take the proportional branch.
  const hasTolerance = !(maxGap <= 0);
  if (hasTolerance) {
    const excess = gap > 0 ? gap : 0;
    return bounded(1 - excess / maxGap);
  }
  return gap > 0 ? 0 : 1;
}
|
|
13079
|
+
/**
 * Scores cost and latency against their budgets and returns the lower score.
 *
 * Each component is `budget / actual` passed through `bounded`, with the
 * actual value floored at 1e-12 to avoid dividing by zero. A component whose
 * budget or actual value is not finite scores 1.
 *
 * @param {object} metrics - Reads `meanCostUsd` and `p95WallMs`.
 * @param {object} thresholds - Reads `maxMeanCostUsd` and `maxP95WallMs`.
 * @returns {number} The smaller of the cost and latency scores.
 */
function efficiencyScore(metrics, thresholds) {
  const headroom = (budget, actual) => {
    if (!Number.isFinite(budget) || !Number.isFinite(actual)) return 1;
    return bounded(budget / Math.max(actual, 1e-12));
  };
  const cost = headroom(thresholds.maxMeanCostUsd, metrics.meanCostUsd);
  const latency = headroom(thresholds.maxP95WallMs, metrics.p95WallMs);
  return Math.min(cost, latency);
}
|
|
13084
|
+
/**
 * Clamps a value to the closed interval [0, 1].
 *
 * @param {number} x - Value to clamp.
 * @returns {number} `x` clamped to [0, 1]; 0 when `x` is not a finite number
 *   (including ±Infinity).
 */
function bounded(x) {
  // `<= 0` also folds -0 into +0, matching Math.max(0, -0).
  if (!Number.isFinite(x) || x <= 0) return 0;
  return x >= 1 ? 1 : x;
}
|
|
13088
|
+
/**
 * Renders the one-line release-confidence summary.
 *
 * The line holds the status and target, then the headline metrics, then a
 * comma-separated list of issue codes when there are any; the parts are
 * joined with "; ".
 *
 * @param {string} target - Name of the thing being released.
 * @param {string} status - Overall status label.
 * @param {object} metrics - Reads `scenarioCount`, `searchRuns`, `holdoutRuns`, `passRate`, `meanScore`.
 * @param {Array<{code: string}>} issues - Issues whose codes are listed.
 * @returns {string} The summary line.
 */
function renderSummary(target, status, metrics, issues) {
  const parts = [
    `release confidence ${status}: ${target}`,
    `scenarios=${metrics.scenarioCount} searchRuns=${metrics.searchRuns} holdoutRuns=${metrics.holdoutRuns} passRate=${fmt3(metrics.passRate)} meanScore=${fmt3(metrics.meanScore)}`
  ];
  if (issues.length !== 0) {
    const codes = issues.map((issue) => issue.code);
    parts.push(`issues=${codes.join(",")}`);
  }
  return parts.join("; ");
}
|
|
13094
|
+
/**
 * Formats a number for report text.
 *
 * @param {number} x - Value to format.
 * @returns {string} `x` with four fixed decimals, or `String(x)` when `x` is
 *   not a finite number (for example "NaN" or "Infinity").
 */
function fmt3(x) {
  return Number.isFinite(x) ? x.toFixed(4) : String(x);
}
|
|
13098
|
+
|
|
12963
13099
|
// src/jsonl-trial-cache.ts
|
|
12964
13100
|
import { appendFileSync as appendFileSync4, existsSync as existsSync6, mkdirSync as mkdirSync4, readFileSync as readFileSync5 } from "fs";
|
|
12965
13101
|
import { dirname as dirname4 } from "path";
|
|
@@ -13605,9 +13741,9 @@ function passOrthogonality(input) {
|
|
|
13605
13741
|
sims.push(cosineSimilarity(vectors[i], vectors[j]));
|
|
13606
13742
|
}
|
|
13607
13743
|
}
|
|
13608
|
-
const
|
|
13744
|
+
const mean10 = sims.length === 0 ? 0 : sims.reduce((a, b) => a + b, 0) / sims.length;
|
|
13609
13745
|
return {
|
|
13610
|
-
orthogonality: Math.max(0, Math.min(1, 1 -
|
|
13746
|
+
orthogonality: Math.max(0, Math.min(1, 1 - mean10)),
|
|
13611
13747
|
passCount: passes.length,
|
|
13612
13748
|
similarities: sims
|
|
13613
13749
|
};
|
|
@@ -13653,8 +13789,8 @@ function bootstrapCi(baseline, candidate, options = {}) {
|
|
|
13653
13789
|
const iterations = options.iterations ?? 1e3;
|
|
13654
13790
|
const minTotal = options.minTotalSamples ?? 6;
|
|
13655
13791
|
const rng = mulberry32(options.seed ?? hashSeed(baseline, candidate));
|
|
13656
|
-
const baselineMean =
|
|
13657
|
-
const candidateMean =
|
|
13792
|
+
const baselineMean = mean9(baseline);
|
|
13793
|
+
const candidateMean = mean9(candidate);
|
|
13658
13794
|
const delta = candidateMean - baselineMean;
|
|
13659
13795
|
if (baseline.length + candidate.length < minTotal || baseline.length === 0 || candidate.length === 0) {
|
|
13660
13796
|
return {
|
|
@@ -13672,7 +13808,7 @@ function bootstrapCi(baseline, candidate, options = {}) {
|
|
|
13672
13808
|
for (let i = 0; i < iterations; i++) {
|
|
13673
13809
|
const bResample = resample(baseline, rng);
|
|
13674
13810
|
const cResample = resample(candidate, rng);
|
|
13675
|
-
deltas[i] =
|
|
13811
|
+
deltas[i] = mean9(cResample) - mean9(bResample);
|
|
13676
13812
|
}
|
|
13677
13813
|
deltas.sort((a, b) => a - b);
|
|
13678
13814
|
const lowerIdx = Math.floor(alpha / 2 * iterations);
|
|
@@ -13695,7 +13831,7 @@ function bootstrapCi(baseline, candidate, options = {}) {
|
|
|
13695
13831
|
verdict
|
|
13696
13832
|
};
|
|
13697
13833
|
}
|
|
13698
|
-
function
|
|
13834
|
+
function mean9(xs) {
|
|
13699
13835
|
if (xs.length === 0) return 0;
|
|
13700
13836
|
let s = 0;
|
|
13701
13837
|
for (const x of xs) s += x;
|
|
@@ -13995,12 +14131,10 @@ export {
|
|
|
13995
14131
|
Mutex,
|
|
13996
14132
|
NoopResearcher,
|
|
13997
14133
|
OTEL_AGENT_EVAL_SCOPE,
|
|
13998
|
-
OptimizationLoop,
|
|
13999
14134
|
PairwiseSteeringOptimizer,
|
|
14000
14135
|
PrmGrader,
|
|
14001
14136
|
ProductClient,
|
|
14002
14137
|
ProjectRegistry,
|
|
14003
|
-
PromptOptimizer,
|
|
14004
14138
|
PromptRegistry,
|
|
14005
14139
|
REDACTION_VERSION,
|
|
14006
14140
|
RunCritic,
|
|
@@ -14021,6 +14155,7 @@ export {
|
|
|
14021
14155
|
analyzeAntiSlop,
|
|
14022
14156
|
analyzeSeries,
|
|
14023
14157
|
argHash,
|
|
14158
|
+
assertReleaseConfidence,
|
|
14024
14159
|
assignFeedbackSplit,
|
|
14025
14160
|
attributeCounterfactuals,
|
|
14026
14161
|
deterministicSplit as benchmarkDeterministicSplit,
|
|
@@ -14091,6 +14226,7 @@ export {
|
|
|
14091
14226
|
evaluateContract,
|
|
14092
14227
|
evaluateHypothesis,
|
|
14093
14228
|
evaluateOracles,
|
|
14229
|
+
evaluateReleaseConfidence,
|
|
14094
14230
|
executeScenario,
|
|
14095
14231
|
expectAgent,
|
|
14096
14232
|
exportRewardModel,
|
|
@@ -14190,6 +14326,7 @@ export {
|
|
|
14190
14326
|
regexMatch,
|
|
14191
14327
|
regexMatches,
|
|
14192
14328
|
regressionView,
|
|
14329
|
+
releaseTraceEvidenceFromMultiShotTrials,
|
|
14193
14330
|
renderMarkdown,
|
|
14194
14331
|
renderMarkdownReport,
|
|
14195
14332
|
renderPlaybookMarkdown,
|