@tangle-network/agent-eval 0.18.0 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/dist/index.d.ts +84 -220
- package/dist/index.js +61 -210
- package/dist/index.js.map +1 -1
- package/docs/feature-guide.md +2 -2
- package/docs/wire-protocol.md +1 -1
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -3473,174 +3473,6 @@ function rowToRun(row) {
|
|
|
3473
3473
|
};
|
|
3474
3474
|
}
|
|
3475
3475
|
|
|
3476
|
-
// src/power-analysis.ts
|
|
3477
|
-
/**
 * Estimate the number of samples needed to detect an effect of a given size.
 *
 * Computes `ceil(2 * ((zAlpha + zBeta) / effect)^2)`, where `zAlpha` and
 * `zBeta` come from `zQuantile`.
 * NOTE(review): this has the shape of the usual per-group normal-approximation
 * formula for a two-sample comparison with a standardized effect size;
 * confirm the intended units of `effect` against the TypeScript source.
 *
 * @param {object} opts
 * @param {number} opts.effect - Effect size; must be finite and > 0.
 * @param {number} [opts.alpha=0.05] - Significance level.
 * @param {number} [opts.power=0.8] - Desired power.
 * @param {boolean} [opts.twoSided=true] - Halve alpha across two tails when true.
 * @returns {number} Rounded-up sample size, or `Infinity` when `effect` is
 *   not a finite positive number.
 */
function requiredSampleSize(opts) {
  const { effect } = opts;
  const effectIsUsable = Number.isFinite(effect) && effect > 0;
  if (!effectIsUsable) {
    // No finite sample can detect a zero, negative, or undefined effect.
    return Infinity;
  }

  const alpha = opts.alpha ?? 0.05;
  const power = opts.power ?? 0.8;
  const twoSided = opts.twoSided ?? true;

  // A two-sided test splits alpha across both tails.
  const tailProbability = twoSided ? alpha / 2 : alpha;
  const zAlpha = zQuantile(1 - tailProbability);
  const zBeta = zQuantile(power);

  return Math.ceil(2 * ((zAlpha + zBeta) / effect) ** 2);
}
|
|
3488
|
-
/**
 * Bonferroni correction for multiple comparisons.
 *
 * Each p-value is multiplied by the number of p-values supplied and capped
 * at 1. An adjusted value counts as significant when it is strictly below
 * `alpha`.
 *
 * @param {number[]} pValues - Raw p-values, one per comparison.
 * @param {number} [alpha=0.05] - Threshold applied to the adjusted values.
 * @returns {{ adjusted: number[], significant: boolean[] }} Arrays aligned
 *   with `pValues`.
 */
function bonferroni(pValues, alpha = 0.05) {
  const comparisons = pValues.length;
  const adjusted = [];
  const significant = [];
  for (const p of pValues) {
    const scaled = Math.min(1, p * comparisons);
    adjusted.push(scaled);
    significant.push(scaled < alpha);
  }
  return { adjusted, significant };
}
|
|
3494
|
-
/**
 * Benjamini–Hochberg step-up adjustment.
 *
 * Sorts the p-values ascending, scales the p-value at rank `r` by `m / r`
 * (with `m` the number of usable p-values), and walks from the largest rank
 * down keeping a running minimum so the q-values are monotone in the
 * p-values. A q-value counts as significant when it is strictly below `fdr`.
 *
 * NaN p-values are left out of both the ranking and the count `m`, and are
 * reported as `NaN` / not significant. Without this a single NaN makes the
 * sort comparator inconsistent and turns the running minimum into NaN, which
 * wipes out the q-values of every smaller p-value.
 *
 * @param {number[]} pValues - Raw p-values, one per comparison.
 * @param {number} [fdr=0.05] - Threshold applied to the q-values.
 * @returns {{ qValues: number[], significant: boolean[] }} Arrays aligned
 *   with `pValues`.
 */
function benjaminiHochberg(pValues, fdr = 0.05) {
  const total = pValues.length;
  if (total === 0) return { qValues: [], significant: [] };

  // Rank only the usable entries; remember each one's original position.
  const ranked = [];
  pValues.forEach((p, i) => {
    if (!Number.isNaN(p)) ranked.push({ p, i });
  });
  ranked.sort((a, b) => a.p - b.p);

  const m = ranked.length;
  const qValues = new Array(total).fill(NaN);

  // Starting at 1 caps every q-value at 1; the running minimum then enforces
  // monotonicity as we step down from the largest p-value.
  let minRight = 1;
  for (let k = m - 1; k >= 0; k--) {
    const rank = k + 1;
    const raw = ranked[k].p * m / rank;
    minRight = Math.min(minRight, raw);
    qValues[ranked[k].i] = minRight;
  }

  // NaN compares false, so unusable entries are never flagged significant.
  const significant = qValues.map((q) => q < fdr);
  return { qValues, significant };
}
|
|
3510
|
-
/**
 * Approximate quantile function for a probability `p`, using piecewise
 * rational approximations: one for the central region and one shared by the
 * two tails.
 * NOTE(review): judging by its name and callers this is the standard normal
 * inverse CDF, and the coefficients look like those of Peter Acklam's
 * algorithm; confirm against the TypeScript source.
 *
 * @param {number} p - Probability.
 * @returns {number} `-Infinity` for `p === 0`, `Infinity` for `p === 1`,
 *   `NaN` for any other value outside (0, 1), otherwise the approximated
 *   quantile.
 */
function zQuantile(p) {
  const outsideOpenUnit = p <= 0 || p >= 1;
  if (outsideOpenUnit) {
    return p === 0 ? -Infinity : p === 1 ? Infinity : NaN;
  }

  // Coefficients are listed highest degree first; each denominator ends in
  // the constant term 1.
  const centralNum = [-39.69683028665376, 220.9460984245205, -275.9285104469687, 138.357751867269, -30.66479806614716, 2.506628277459239];
  const centralDen = [-54.47609879822406, 161.5858368580409, -155.6989798598866, 66.80131188771972, -13.28068155288572, 1];
  const tailNum = [-0.007784894002430293, -0.3223964580411365, -2.400758277161838, -2.549732539343734, 4.374664141464968, 2.938163982698783];
  const tailDen = [0.007784695709041462, 0.3224671290700398, 2.445134137142996, 3.754408661907416, 1];

  // Horner evaluation, so the floating-point operation order is fixed.
  const horner = (coeffs, x) => {
    let acc = coeffs[0];
    for (let i = 1; i < coeffs.length; i++) acc = acc * x + coeffs[i];
    return acc;
  };

  const lowerCut = 0.02425;
  const upperCut = 1 - lowerCut;

  if (p < lowerCut) {
    // Lower tail.
    const t = Math.sqrt(-2 * Math.log(p));
    return horner(tailNum, t) / horner(tailDen, t);
  }
  if (p <= upperCut) {
    // Central region.
    const centered = p - 0.5;
    const squared = centered * centered;
    return horner(centralNum, squared) * centered / horner(centralDen, squared);
  }
  // Upper tail: mirror of the lower tail.
  const t = Math.sqrt(-2 * Math.log(1 - p));
  return -horner(tailNum, t) / horner(tailDen, t);
}
|
|
3536
|
-
|
|
3537
|
-
// src/prompt-optimizer.ts
|
|
3538
|
-
var PromptOptimizer = class {
  /**
   * Score every variant on every scenario, then compare the variants.
   *
   * Steps, in order:
   *  1. Call `config.scoreVariant` sequentially, `trialsPerScenario` times per
   *     (variant, scenario) pair, and call `config.onScenarioComplete` (when
   *     provided) after each pair finishes.
   *  2. Pool each variant's samples and summarise them with
   *     `confidenceInterval(samples, 0.95)`.
   *  3. Compare every unordered pair of variants with `mannWhitneyU` and adjust
   *     the resulting p-values with `benjaminiHochberg`.
   *  4. Name the variant with the highest mean as the winner.
   *
   * @param {object} config
   * @param {Array<{id: string}>} config.variants - At least two variants.
   * @param {string[]} config.scenarioIds - At least one scenario id.
   * @param {number} [config.trialsPerScenario=3]
   * @param {number} [config.significanceLevel=0.05]
   * @param {Function} config.scoreVariant - Async scorer; must resolve to a
   *   finite number.
   * @param {Function} [config.onScenarioComplete] - Progress callback.
   * @returns {Promise<object>} `{ winner, scores, pairwise, config }`.
   * @throws {Error} When fewer than two variants or no scenarios are given, or
   *   when `scoreVariant` yields a non-finite value.
   */
  async run(config) {
    const trialCount = config.trialsPerScenario ?? 3;
    const significanceLevel = config.significanceLevel ?? 0.05;
    if (config.variants.length < 2) {
      throw new Error("PromptOptimizer requires at least 2 variants");
    }
    if (config.scenarioIds.length === 0) {
      throw new Error("PromptOptimizer requires at least 1 scenario");
    }

    // Pass 1: collect raw samples, keyed by variant id and then scenario id.
    const samplesByVariant = new Map();
    for (const variant of config.variants) {
      const byScenario = new Map();
      samplesByVariant.set(variant.id, byScenario);
      for (const scenarioId of config.scenarioIds) {
        const collected = [];
        let trialIndex = 0;
        while (trialIndex < trialCount) {
          const value = await config.scoreVariant({ variant, scenarioId, trialIndex });
          if (!Number.isFinite(value)) {
            throw new Error(`scoreVariant returned non-finite: variant=${variant.id} scenario=${scenarioId} trial=${trialIndex}`);
          }
          collected.push(value);
          trialIndex += 1;
        }
        byScenario.set(scenarioId, collected);
        config.onScenarioComplete?.({
          variantId: variant.id,
          scenarioId,
          scores: collected
        });
      }
    }

    // Pass 2: per-variant summary (pooled mean and interval, per-scenario means).
    const variantScores = config.variants.map(({ id }) => {
      const byScenario = samplesByVariant.get(id);
      const pooled = [];
      const perScenario = {};
      for (const scenarioId of config.scenarioIds) {
        const samples = byScenario.get(scenarioId) ?? [];
        pooled.push(...samples);
        let mean = 0;
        if (samples.length) {
          let total = 0;
          for (const sample of samples) total += sample;
          mean = total / samples.length;
        }
        perScenario[scenarioId] = { mean, n: samples.length, samples };
      }
      const { mean, lower, upper } = confidenceInterval(pooled, 0.95);
      return {
        variantId: id,
        mean,
        ci95: { lower, upper },
        n: pooled.length,
        perScenario
      };
    });

    // Pass 3: every unordered pair (i < j), in the order the variants were given.
    const comparisons = [];
    variantScores.forEach((a, i) => {
      for (const b of variantScores.slice(i + 1)) {
        const { p } = mannWhitneyU(flatSamples(a), flatSamples(b));
        comparisons.push({ a, b, p });
      }
    });
    const { qValues } = benjaminiHochberg(comparisons.map((pair) => pair.p), significanceLevel);
    const pairwise = comparisons.map(({ a, b, p }, idx) => {
      const qValue = qValues[idx];
      return {
        variantA: a.variantId,
        variantB: b.variantId,
        pValue: p,
        qValue,
        significant: qValue < significanceLevel,
        meanDelta: b.mean - a.mean
      };
    });

    // Pass 4: highest mean wins; it is "significant" only if every comparison
    // involving it is significant.
    const [winner, second] = [...variantScores].sort((x, y) => y.mean - x.mean);
    const winnerId = winner.variantId;
    const significant = pairwise
      .filter((c) => c.variantA === winnerId || c.variantB === winnerId)
      .every((c) => c.significant);
    const ciLowerBoundExceedsSecondMean = winner.ci95.lower > second.mean;

    return {
      winner: {
        variantId: winnerId,
        significant,
        ciLowerBoundExceedsSecondMean
      },
      scores: variantScores,
      pairwise,
      config: {
        trialsPerScenario: trialCount,
        significanceLevel,
        variants: config.variants.map((v) => v.id),
        scenarios: config.scenarioIds
      }
    };
  }
};
|
|
3638
|
-
/**
 * Concatenate the `samples` arrays of every entry in `score.perScenario`
 * into one flat array, in `Object.values` order.
 *
 * @param {{ perScenario: Object<string, { samples: number[] }> }} score
 * @returns {number[]} A new array holding all samples.
 */
function flatSamples(score) {
  return Object.values(score.perScenario).flatMap(({ samples }) => samples);
}
|
|
3643
|
-
|
|
3644
3476
|
// src/steering.ts
|
|
3645
3477
|
function mergeSteeringBundle(base, delta) {
|
|
3646
3478
|
return {
|
|
@@ -3831,46 +3663,6 @@ function canonicalInstruction(value) {
|
|
|
3831
3663
|
return normalized.length === 0 ? normalized : normalized[0].toUpperCase() + normalized.slice(1);
|
|
3832
3664
|
}
|
|
3833
3665
|
|
|
3834
|
-
// src/optimization-loop.ts
|
|
3835
|
-
var OptimizationLoop = class {
  optimizer;

  /**
   * @param {{ run: Function }} [optimizer] - Object exposing an async
   *   `run(config)`; defaults to a fresh `PromptOptimizer`.
   */
  constructor(optimizer = new PromptOptimizer()) {
    this.optimizer = optimizer;
  }

  /**
   * Run the wrapped optimizer over steering bundles.
   *
   * Each bundle in `config.variants` is passed to the optimizer as a variant
   * whose prompt is `renderSteeringText(bundle)`, and each example's
   * `scenarioId` becomes a scenario. Scores come from `config.evaluate`,
   * reduced to a number by `aggregateRunScore(score, config.scoreWeights)`.
   *
   * @param {object} config
   * @param {Array<{id: string}>} config.variants - Steering bundles.
   * @param {Array<{scenarioId: string}>} config.examples - Examples to score on.
   * @param {number} [config.trialsPerScenario] - Forwarded to the optimizer.
   * @param {Function} config.evaluate - Async `({ variant, example, trialIndex })`.
   * @param {*} [config.scoreWeights] - Forwarded to `aggregateRunScore`.
   * @returns {Promise<object>} `{ winner, significant, reports, pairwise }`.
   * @throws {Error} From the scoring callback when the optimizer asks for a
   *   variant id or scenario id that is not in `config`.
   */
  async run(config) {
    const byId = new Map(config.variants.map((variant) => [variant.id, variant]));

    // Index examples once instead of scanning the list on every scoring call
    // (variants x scenarios x trials). The first example for a scenario id
    // wins, which is what a linear first-match search would return.
    const exampleByScenario = new Map();
    for (const example of config.examples) {
      if (!exampleByScenario.has(example.scenarioId)) {
        exampleByScenario.set(example.scenarioId, example);
      }
    }

    const result = await this.optimizer.run({
      variants: config.variants.map((variant) => ({
        id: variant.id,
        prompt: renderSteeringText(variant),
        metadata: { bundle: variant }
      })),
      scenarioIds: config.examples.map((example) => example.scenarioId),
      trialsPerScenario: config.trialsPerScenario,
      scoreVariant: async ({ variant, scenarioId, trialIndex }) => {
        const bundle = byId.get(variant.id);
        if (!bundle) throw new Error(`unknown steering bundle ${variant.id}`);
        const example = exampleByScenario.get(scenarioId);
        if (!example) throw new Error(`unknown optimization example ${scenarioId}`);
        const score = await config.evaluate({ variant: bundle, example, trialIndex });
        return aggregateRunScore(score, config.scoreWeights);
      }
    });

    // Map the optimizer's ids back to the original bundle objects.
    return {
      winner: byId.get(result.winner.variantId),
      significant: result.winner.significant,
      reports: result.scores.map((score) => ({
        variantId: score.variantId,
        bundle: byId.get(score.variantId),
        mean: score.mean,
        ci95: score.ci95,
        scenarioScores: score.perScenario
      })),
      pairwise: result.pairwise
    };
  }
};
|
|
3873
|
-
|
|
3874
3666
|
// src/steering-optimizer.ts
|
|
3875
3667
|
var PairwiseSteeringOptimizer = class {
|
|
3876
3668
|
optimize(rows, config = {}) {
|
|
@@ -7119,6 +6911,67 @@ function excerpt3(source, needle) {
|
|
|
7119
6911
|
return (start > 0 ? "\u2026" : "") + source.slice(start, end) + (end < source.length ? "\u2026" : "");
|
|
7120
6912
|
}
|
|
7121
6913
|
|
|
6914
|
+
// src/power-analysis.ts
|
|
6915
|
+
/**
 * Estimate the number of samples needed to detect an effect of a given size.
 *
 * Computes `ceil(2 * ((zAlpha + zBeta) / effect)^2)`, where `zAlpha` and
 * `zBeta` come from `zQuantile`.
 * NOTE(review): this has the shape of the usual per-group normal-approximation
 * formula for a two-sample comparison with a standardized effect size;
 * confirm the intended units of `effect` against the TypeScript source.
 *
 * @param {object} opts
 * @param {number} opts.effect - Effect size; must be finite and > 0.
 * @param {number} [opts.alpha=0.05] - Significance level.
 * @param {number} [opts.power=0.8] - Desired power.
 * @param {boolean} [opts.twoSided=true] - Halve alpha across two tails when true.
 * @returns {number} Rounded-up sample size, or `Infinity` when `effect` is
 *   not a finite positive number.
 */
function requiredSampleSize(opts) {
  const { effect } = opts;
  const effectIsUsable = Number.isFinite(effect) && effect > 0;
  if (!effectIsUsable) {
    // No finite sample can detect a zero, negative, or undefined effect.
    return Infinity;
  }

  const alpha = opts.alpha ?? 0.05;
  const power = opts.power ?? 0.8;
  const twoSided = opts.twoSided ?? true;

  // A two-sided test splits alpha across both tails.
  const tailProbability = twoSided ? alpha / 2 : alpha;
  const zAlpha = zQuantile(1 - tailProbability);
  const zBeta = zQuantile(power);

  return Math.ceil(2 * ((zAlpha + zBeta) / effect) ** 2);
}
|
|
6926
|
+
/**
 * Bonferroni correction for multiple comparisons.
 *
 * Each p-value is multiplied by the number of p-values supplied and capped
 * at 1. An adjusted value counts as significant when it is strictly below
 * `alpha`.
 *
 * @param {number[]} pValues - Raw p-values, one per comparison.
 * @param {number} [alpha=0.05] - Threshold applied to the adjusted values.
 * @returns {{ adjusted: number[], significant: boolean[] }} Arrays aligned
 *   with `pValues`.
 */
function bonferroni(pValues, alpha = 0.05) {
  const comparisons = pValues.length;
  const adjusted = [];
  const significant = [];
  for (const p of pValues) {
    const scaled = Math.min(1, p * comparisons);
    adjusted.push(scaled);
    significant.push(scaled < alpha);
  }
  return { adjusted, significant };
}
|
|
6932
|
+
/**
 * Benjamini–Hochberg step-up adjustment.
 *
 * Sorts the p-values ascending, scales the p-value at rank `r` by `m / r`
 * (with `m` the number of usable p-values), and walks from the largest rank
 * down keeping a running minimum so the q-values are monotone in the
 * p-values. A q-value counts as significant when it is strictly below `fdr`.
 *
 * NaN p-values are left out of both the ranking and the count `m`, and are
 * reported as `NaN` / not significant. Without this a single NaN makes the
 * sort comparator inconsistent and turns the running minimum into NaN, which
 * wipes out the q-values of every smaller p-value.
 *
 * @param {number[]} pValues - Raw p-values, one per comparison.
 * @param {number} [fdr=0.05] - Threshold applied to the q-values.
 * @returns {{ qValues: number[], significant: boolean[] }} Arrays aligned
 *   with `pValues`.
 */
function benjaminiHochberg(pValues, fdr = 0.05) {
  const total = pValues.length;
  if (total === 0) return { qValues: [], significant: [] };

  // Rank only the usable entries; remember each one's original position.
  const ranked = [];
  pValues.forEach((p, i) => {
    if (!Number.isNaN(p)) ranked.push({ p, i });
  });
  ranked.sort((a, b) => a.p - b.p);

  const m = ranked.length;
  const qValues = new Array(total).fill(NaN);

  // Starting at 1 caps every q-value at 1; the running minimum then enforces
  // monotonicity as we step down from the largest p-value.
  let minRight = 1;
  for (let k = m - 1; k >= 0; k--) {
    const rank = k + 1;
    const raw = ranked[k].p * m / rank;
    minRight = Math.min(minRight, raw);
    qValues[ranked[k].i] = minRight;
  }

  // NaN compares false, so unusable entries are never flagged significant.
  const significant = qValues.map((q) => q < fdr);
  return { qValues, significant };
}
|
|
6948
|
+
/**
 * Approximate quantile function for a probability `p`, using piecewise
 * rational approximations: one for the central region and one shared by the
 * two tails.
 * NOTE(review): judging by its name and callers this is the standard normal
 * inverse CDF, and the coefficients look like those of Peter Acklam's
 * algorithm; confirm against the TypeScript source.
 *
 * @param {number} p - Probability.
 * @returns {number} `-Infinity` for `p === 0`, `Infinity` for `p === 1`,
 *   `NaN` for any other value outside (0, 1), otherwise the approximated
 *   quantile.
 */
function zQuantile(p) {
  const outsideOpenUnit = p <= 0 || p >= 1;
  if (outsideOpenUnit) {
    return p === 0 ? -Infinity : p === 1 ? Infinity : NaN;
  }

  // Coefficients are listed highest degree first; each denominator ends in
  // the constant term 1.
  const centralNum = [-39.69683028665376, 220.9460984245205, -275.9285104469687, 138.357751867269, -30.66479806614716, 2.506628277459239];
  const centralDen = [-54.47609879822406, 161.5858368580409, -155.6989798598866, 66.80131188771972, -13.28068155288572, 1];
  const tailNum = [-0.007784894002430293, -0.3223964580411365, -2.400758277161838, -2.549732539343734, 4.374664141464968, 2.938163982698783];
  const tailDen = [0.007784695709041462, 0.3224671290700398, 2.445134137142996, 3.754408661907416, 1];

  // Horner evaluation, so the floating-point operation order is fixed.
  const horner = (coeffs, x) => {
    let acc = coeffs[0];
    for (let i = 1; i < coeffs.length; i++) acc = acc * x + coeffs[i];
    return acc;
  };

  const lowerCut = 0.02425;
  const upperCut = 1 - lowerCut;

  if (p < lowerCut) {
    // Lower tail.
    const t = Math.sqrt(-2 * Math.log(p));
    return horner(tailNum, t) / horner(tailDen, t);
  }
  if (p <= upperCut) {
    // Central region.
    const centered = p - 0.5;
    const squared = centered * centered;
    return horner(centralNum, squared) * centered / horner(centralDen, squared);
  }
  // Upper tail: mirror of the lower tail.
  const t = Math.sqrt(-2 * Math.log(1 - p));
  return -horner(tailNum, t) / horner(tailDen, t);
}
|
|
6974
|
+
|
|
7122
6975
|
// src/behavior-dsl.ts
|
|
7123
6976
|
var BehaviorAssertion = class {
|
|
7124
6977
|
constructor(store, runId) {
|
|
@@ -13995,12 +13848,10 @@ export {
|
|
|
13995
13848
|
Mutex,
|
|
13996
13849
|
NoopResearcher,
|
|
13997
13850
|
OTEL_AGENT_EVAL_SCOPE,
|
|
13998
|
-
OptimizationLoop,
|
|
13999
13851
|
PairwiseSteeringOptimizer,
|
|
14000
13852
|
PrmGrader,
|
|
14001
13853
|
ProductClient,
|
|
14002
13854
|
ProjectRegistry,
|
|
14003
|
-
PromptOptimizer,
|
|
14004
13855
|
PromptRegistry,
|
|
14005
13856
|
REDACTION_VERSION,
|
|
14006
13857
|
RunCritic,
|