@tangle-network/agent-eval 0.17.3 → 0.19.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -1
- package/dist/index.d.ts +303 -279
- package/dist/index.js +332 -210
- package/dist/index.js.map +1 -1
- package/docs/concepts.md +155 -0
- package/docs/control-runtime.md +351 -0
- package/docs/feature-guide.md +213 -0
- package/docs/feedback-trajectories.md +193 -0
- package/docs/multi-shot-optimization.md +122 -0
- package/docs/wire-protocol.md +199 -0
- package/package.json +21 -14
package/dist/index.js
CHANGED
|
@@ -3473,174 +3473,6 @@ function rowToRun(row) {
|
|
|
3473
3473
|
};
|
|
3474
3474
|
}
|
|
3475
3475
|
|
|
3476
|
-
// src/power-analysis.ts
|
|
3477
|
-
/**
 * Estimate the sample size needed to detect an effect of the given size.
 *
 * Computes `ceil(2 * ((zAlpha + zBeta) / effect)^2)`, where `zAlpha` and
 * `zBeta` are standard-normal quantiles obtained from `zQuantile`.
 * NOTE(review): the leading factor of 2 suggests a per-group size for a
 * two-sample comparison on a standardized effect — confirm against callers.
 *
 * @param {object} opts
 * @param {number} opts.effect - Effect size; must be finite and > 0.
 * @param {number} [opts.alpha=0.05] - Significance level.
 * @param {number} [opts.power=0.8] - Desired power.
 * @param {boolean} [opts.twoSided=true] - Halve alpha per tail when true.
 * @returns {number} Rounded-up sample size, or `Infinity` when `effect` is
 *   not a finite positive number.
 */
function requiredSampleSize(opts) {
  const { effect } = opts;
  // No finite sample can detect a zero, negative, or non-numeric effect.
  if (!(Number.isFinite(effect) && effect > 0)) {
    return Infinity;
  }

  const alpha = opts.alpha ?? 0.05;
  const power = opts.power ?? 0.8;
  const twoSided = opts.twoSided ?? true;

  const upperTail = twoSided ? 1 - alpha / 2 : 1 - alpha;
  const zAlpha = zQuantile(upperTail);
  const zBeta = zQuantile(power);

  const ratio = (zAlpha + zBeta) / effect;
  return Math.ceil(2 * Math.pow(ratio, 2));
}
|
|
3488
|
-
/**
 * Bonferroni multiple-comparison correction.
 *
 * Each p-value is multiplied by the number of p-values supplied and capped
 * at 1; an adjusted value is flagged significant when strictly below `alpha`.
 *
 * @param {number[]} pValues - Raw p-values.
 * @param {number} [alpha=0.05] - Significance threshold for adjusted values.
 * @returns {{ adjusted: number[], significant: boolean[] }} Results in the
 *   same order as `pValues`.
 */
function bonferroni(pValues, alpha = 0.05) {
  const comparisons = pValues.length;
  const scale = (pValue) => Math.min(1, pValue * comparisons);
  const adjusted = pValues.map(scale);
  return {
    adjusted,
    significant: adjusted.map((value) => value < alpha),
  };
}
|
|
3494
|
-
/**
 * Benjamini-Hochberg step-up adjustment.
 *
 * P-values are ranked ascending; walking from the largest rank down, each
 * q-value is `p * n / rank`, forced to be no larger than the q-value of the
 * next-higher rank and capped at 1. Results are written back in the original
 * input order.
 *
 * @param {number[]} pValues - Raw p-values.
 * @param {number} [fdr=0.05] - Threshold; a q-value strictly below it is
 *   flagged significant.
 * @returns {{ qValues: number[], significant: boolean[] }}
 */
function benjaminiHochberg(pValues, fdr = 0.05) {
  const total = pValues.length;
  if (total === 0) {
    return { qValues: [], significant: [] };
  }

  // Remember each value's original slot so results can be un-sorted.
  const ascending = pValues
    .map((p, i) => ({ p, i }))
    .sort((left, right) => left.p - right.p);

  const qValues = new Array(total);
  let runningMin = 1;
  for (let rank = total; rank >= 1; rank--) {
    const { p, i } = ascending[rank - 1];
    // Carry the minimum downward so q-values never increase as rank drops.
    runningMin = Math.min(runningMin, p * total / rank);
    qValues[i] = Math.min(1, runningMin);
  }

  return {
    qValues,
    significant: qValues.map((value) => value < fdr),
  };
}
|
|
3510
|
-
/**
 * Approximate the standard-normal quantile (inverse CDF) at probability `p`.
 *
 * Uses one rational approximation for the central region and another for the
 * two tails, switching at p = 0.02425 and p = 1 - 0.02425.
 * NOTE(review): the coefficients appear to be those of Peter Acklam's
 * inverse-normal algorithm — confirm before citing an error bound.
 *
 * @param {number} p - Probability.
 * @returns {number} The quantile; `-Infinity` for 0, `Infinity` for 1, and
 *   `NaN` for anything else outside the open interval (0, 1).
 */
function zQuantile(p) {
  if (p <= 0 || p >= 1) {
    return p === 0 ? -Infinity : p === 1 ? Infinity : NaN;
  }

  const centralNum = [-39.69683028665376, 220.9460984245205, -275.9285104469687, 138.357751867269, -30.66479806614716, 2.506628277459239];
  const centralDen = [-54.47609879822406, 161.5858368580409, -155.6989798598866, 66.80131188771972, -13.28068155288572, 1];
  const tailNum = [-0.007784894002430293, -0.3223964580411365, -2.400758277161838, -2.549732539343734, 4.374664141464968, 2.938163982698783];
  const tailDen = [0.007784695709041462, 0.3224671290700398, 2.445134137142996, 3.754408661907416, 1];

  // Horner evaluation, highest-order coefficient first; the denominators
  // carry their trailing constant 1 as the last coefficient.
  const horner = (coefficients, x) => {
    let acc = coefficients[0];
    for (let k = 1; k < coefficients.length; k++) {
      acc = acc * x + coefficients[k];
    }
    return acc;
  };

  const lowerBreak = 0.02425;
  const upperBreak = 1 - lowerBreak;

  if (p < lowerBreak) {
    const t = Math.sqrt(-2 * Math.log(p));
    return horner(tailNum, t) / horner(tailDen, t);
  }

  if (p <= upperBreak) {
    const centered = p - 0.5;
    const squared = centered * centered;
    return horner(centralNum, squared) * centered / horner(centralDen, squared);
  }

  // Upper tail mirrors the lower tail with the sign flipped.
  const t = Math.sqrt(-2 * Math.log(1 - p));
  return -horner(tailNum, t) / horner(tailDen, t);
}
|
|
3536
|
-
|
|
3537
|
-
// src/prompt-optimizer.ts
|
|
3538
|
-
/**
 * Compares prompt variants by scoring each one on every scenario for a fixed
 * number of trials, then ranking them with pairwise significance tests.
 */
var PromptOptimizer = class {
  /**
   * Run the comparison.
   *
   * Scoring is sequential: variants in order, then scenarios, then trials.
   * `config.onScenarioComplete` (optional) is called after each
   * variant/scenario pair finishes.
   *
   * @param {object} config
   * @param {Array<{id: string}>} config.variants - At least two variants.
   * @param {string[]} config.scenarioIds - At least one scenario id.
   * @param {Function} config.scoreVariant - Async scorer called with
   *   `{ variant, scenarioId, trialIndex }`; must resolve to a finite number.
   * @param {number} [config.trialsPerScenario=3]
   * @param {number} [config.significanceLevel=0.05]
   * @returns {Promise<object>} `{ winner, scores, pairwise, config }`.
   * @throws {Error} On fewer than 2 variants, no scenarios, or a non-finite
   *   score.
   */
  async run(config) {
    const trials = config.trialsPerScenario ?? 3;
    const alpha = config.significanceLevel ?? 0.05;
    if (config.variants.length < 2) {
      throw new Error("PromptOptimizer requires at least 2 variants");
    }
    if (config.scenarioIds.length === 0) {
      throw new Error("PromptOptimizer requires at least 1 scenario");
    }

    // Phase 1: collect raw samples, variant id -> scenario id -> scores.
    const samplesByVariant = /* @__PURE__ */ new Map();
    for (const variant of config.variants) {
      const byScenario = /* @__PURE__ */ new Map();
      samplesByVariant.set(variant.id, byScenario);
      for (const scenarioId of config.scenarioIds) {
        const collected = [];
        let trialIndex = 0;
        while (trialIndex < trials) {
          const score = await config.scoreVariant({ variant, scenarioId, trialIndex });
          if (!Number.isFinite(score)) {
            throw new Error(`scoreVariant returned non-finite: variant=${variant.id} scenario=${scenarioId} trial=${trialIndex}`);
          }
          collected.push(score);
          trialIndex += 1;
        }
        byScenario.set(scenarioId, collected);
        config.onScenarioComplete?.({
          variantId: variant.id,
          scenarioId,
          scores: collected
        });
      }
    }

    // Phase 2: per-variant summary over the pooled samples.
    const summarize = (variant) => {
      const byScenario = samplesByVariant.get(variant.id);
      const pooled = [];
      const perScenario = {};
      for (const scenarioId of config.scenarioIds) {
        const samples = byScenario.get(scenarioId) ?? [];
        pooled.push(...samples);
        let mean = 0;
        if (samples.length) {
          mean = samples.reduce((a, b) => a + b, 0) / samples.length;
        }
        perScenario[scenarioId] = { mean, n: samples.length, samples };
      }
      const ci = confidenceInterval(pooled, 0.95);
      return {
        variantId: variant.id,
        mean: ci.mean,
        ci95: { lower: ci.lower, upper: ci.upper },
        n: pooled.length,
        perScenario
      };
    };
    const summaries = config.variants.map((variant) => summarize(variant));

    // Phase 3: every unordered pair, tested in (i, j > i) order.
    const rawPairs = summaries.flatMap(
      (a, i) => summaries.slice(i + 1).map((b) => {
        const { p } = mannWhitneyU(flatSamples(a), flatSamples(b));
        return { a, b, p };
      })
    );
    const { qValues } = benjaminiHochberg(rawPairs.map((pair) => pair.p), alpha);
    const comparisons = rawPairs.map((pair, idx) => ({
      variantA: pair.a.variantId,
      variantB: pair.b.variantId,
      pValue: pair.p,
      qValue: qValues[idx],
      significant: qValues[idx] < alpha,
      meanDelta: pair.b.mean - pair.a.mean
    }));

    // Phase 4: the winner is the highest pooled mean.
    const [winner, second] = [...summaries].sort((x, y) => y.mean - x.mean);
    const involvesWinner = (c) => c.variantA === winner.variantId || c.variantB === winner.variantId;
    const significantOverAll = comparisons.filter(involvesWinner).every((c) => c.significant);
    const ciLowerBoundExceedsSecondMean = winner.ci95.lower > second.mean;

    return {
      winner: {
        variantId: winner.variantId,
        significant: significantOverAll,
        ciLowerBoundExceedsSecondMean
      },
      scores: summaries,
      pairwise: comparisons,
      config: {
        trialsPerScenario: trials,
        significanceLevel: alpha,
        variants: config.variants.map((v) => v.id),
        scenarios: config.scenarioIds
      }
    };
  }
};
|
|
3638
|
-
/**
 * Pool every scenario's samples of one variant score into a single array.
 *
 * @param {{ perScenario: Object<string, { samples: number[] }> }} score
 * @returns {number[]} Samples concatenated in `Object.values` order.
 */
function flatSamples(score) {
  const pooled = [];
  Object.values(score.perScenario).forEach((entry) => {
    pooled.push(...entry.samples);
  });
  return pooled;
}
|
|
3643
|
-
|
|
3644
3476
|
// src/steering.ts
|
|
3645
3477
|
function mergeSteeringBundle(base, delta) {
|
|
3646
3478
|
return {
|
|
@@ -3831,46 +3663,6 @@ function canonicalInstruction(value) {
|
|
|
3831
3663
|
return normalized.length === 0 ? normalized : normalized[0].toUpperCase() + normalized.slice(1);
|
|
3832
3664
|
}
|
|
3833
3665
|
|
|
3834
|
-
// src/optimization-loop.ts
|
|
3835
|
-
/**
 * Adapts steering bundles to a prompt optimizer: each bundle is rendered to
 * prompt text, scored through `config.evaluate`, and the optimizer's result
 * is mapped back onto the original bundle objects.
 */
var OptimizationLoop = class {
  optimizer;

  /**
   * @param {{ run: Function }} [optimizer] - Defaults to a new
   *   `PromptOptimizer`.
   */
  constructor(optimizer = new PromptOptimizer()) {
    this.optimizer = optimizer;
  }

  /**
   * @param {object} config
   * @param {Array<{id: string}>} config.variants - Steering bundles.
   * @param {Array<{scenarioId: string}>} config.examples - One per scenario.
   * @param {Function} config.evaluate - Async; called with
   *   `{ variant, example, trialIndex }`.
   * @param {number} [config.trialsPerScenario]
   * @param {*} [config.scoreWeights] - Forwarded to `aggregateRunScore`.
   * @returns {Promise<object>} `{ winner, significant, reports, pairwise }`.
   */
  async run(config) {
    const bundlesById = new Map(config.variants.map((bundle) => [bundle.id, bundle]));

    const toPromptVariant = (bundle) => ({
      id: bundle.id,
      prompt: renderSteeringText(bundle),
      metadata: { bundle }
    });

    // Resolve the optimizer's lightweight variant/scenario ids back to the
    // caller's bundle and example before delegating to config.evaluate.
    const scoreVariant = async ({ variant, scenarioId, trialIndex }) => {
      const bundle = bundlesById.get(variant.id);
      if (!bundle) {
        throw new Error(`unknown steering bundle ${variant.id}`);
      }
      const example = config.examples.find((item) => item.scenarioId === scenarioId);
      if (!example) {
        throw new Error(`unknown optimization example ${scenarioId}`);
      }
      const score = await config.evaluate({ variant: bundle, example, trialIndex });
      return aggregateRunScore(score, config.scoreWeights);
    };

    const result = await this.optimizer.run({
      variants: config.variants.map((bundle) => toPromptVariant(bundle)),
      scenarioIds: config.examples.map((example) => example.scenarioId),
      trialsPerScenario: config.trialsPerScenario,
      scoreVariant
    });

    const reports = result.scores.map((score) => ({
      variantId: score.variantId,
      bundle: bundlesById.get(score.variantId),
      mean: score.mean,
      ci95: score.ci95,
      scenarioScores: score.perScenario
    }));

    return {
      winner: bundlesById.get(result.winner.variantId),
      significant: result.winner.significant,
      reports,
      pairwise: result.pairwise
    };
  }
};
|
|
3873
|
-
|
|
3874
3666
|
// src/steering-optimizer.ts
|
|
3875
3667
|
var PairwiseSteeringOptimizer = class {
|
|
3876
3668
|
optimize(rows, config = {}) {
|
|
@@ -7119,6 +6911,67 @@ function excerpt3(source, needle) {
|
|
|
7119
6911
|
return (start > 0 ? "\u2026" : "") + source.slice(start, end) + (end < source.length ? "\u2026" : "");
|
|
7120
6912
|
}
|
|
7121
6913
|
|
|
6914
|
+
// src/power-analysis.ts
|
|
6915
|
+
/**
 * Estimate the sample size needed to detect an effect of the given size.
 *
 * Computes `ceil(2 * ((zAlpha + zBeta) / effect)^2)`, where `zAlpha` and
 * `zBeta` are standard-normal quantiles obtained from `zQuantile`.
 * NOTE(review): the leading factor of 2 suggests a per-group size for a
 * two-sample comparison on a standardized effect — confirm against callers.
 *
 * @param {object} opts
 * @param {number} opts.effect - Effect size; must be finite and > 0.
 * @param {number} [opts.alpha=0.05] - Significance level.
 * @param {number} [opts.power=0.8] - Desired power.
 * @param {boolean} [opts.twoSided=true] - Halve alpha per tail when true.
 * @returns {number} Rounded-up sample size, or `Infinity` when `effect` is
 *   not a finite positive number.
 */
function requiredSampleSize(opts) {
  const { effect } = opts;
  // No finite sample can detect a zero, negative, or non-numeric effect.
  if (!(Number.isFinite(effect) && effect > 0)) {
    return Infinity;
  }

  const alpha = opts.alpha ?? 0.05;
  const power = opts.power ?? 0.8;
  const twoSided = opts.twoSided ?? true;

  const upperTail = twoSided ? 1 - alpha / 2 : 1 - alpha;
  const zAlpha = zQuantile(upperTail);
  const zBeta = zQuantile(power);

  const ratio = (zAlpha + zBeta) / effect;
  return Math.ceil(2 * Math.pow(ratio, 2));
}
|
|
6926
|
+
/**
 * Bonferroni multiple-comparison correction.
 *
 * Each p-value is multiplied by the number of p-values supplied and capped
 * at 1; an adjusted value is flagged significant when strictly below `alpha`.
 *
 * @param {number[]} pValues - Raw p-values.
 * @param {number} [alpha=0.05] - Significance threshold for adjusted values.
 * @returns {{ adjusted: number[], significant: boolean[] }} Results in the
 *   same order as `pValues`.
 */
function bonferroni(pValues, alpha = 0.05) {
  const comparisons = pValues.length;
  const scale = (pValue) => Math.min(1, pValue * comparisons);
  const adjusted = pValues.map(scale);
  return {
    adjusted,
    significant: adjusted.map((value) => value < alpha),
  };
}
|
|
6932
|
+
/**
 * Benjamini-Hochberg step-up adjustment.
 *
 * P-values are ranked ascending; walking from the largest rank down, each
 * q-value is `p * n / rank`, forced to be no larger than the q-value of the
 * next-higher rank and capped at 1. Results are written back in the original
 * input order.
 *
 * @param {number[]} pValues - Raw p-values.
 * @param {number} [fdr=0.05] - Threshold; a q-value strictly below it is
 *   flagged significant.
 * @returns {{ qValues: number[], significant: boolean[] }}
 */
function benjaminiHochberg(pValues, fdr = 0.05) {
  const total = pValues.length;
  if (total === 0) {
    return { qValues: [], significant: [] };
  }

  // Remember each value's original slot so results can be un-sorted.
  const ascending = pValues
    .map((p, i) => ({ p, i }))
    .sort((left, right) => left.p - right.p);

  const qValues = new Array(total);
  let runningMin = 1;
  for (let rank = total; rank >= 1; rank--) {
    const { p, i } = ascending[rank - 1];
    // Carry the minimum downward so q-values never increase as rank drops.
    runningMin = Math.min(runningMin, p * total / rank);
    qValues[i] = Math.min(1, runningMin);
  }

  return {
    qValues,
    significant: qValues.map((value) => value < fdr),
  };
}
|
|
6948
|
+
/**
 * Approximate the standard-normal quantile (inverse CDF) at probability `p`.
 *
 * Uses one rational approximation for the central region and another for the
 * two tails, switching at p = 0.02425 and p = 1 - 0.02425.
 * NOTE(review): the coefficients appear to be those of Peter Acklam's
 * inverse-normal algorithm — confirm before citing an error bound.
 *
 * @param {number} p - Probability.
 * @returns {number} The quantile; `-Infinity` for 0, `Infinity` for 1, and
 *   `NaN` for anything else outside the open interval (0, 1).
 */
function zQuantile(p) {
  if (p <= 0 || p >= 1) {
    return p === 0 ? -Infinity : p === 1 ? Infinity : NaN;
  }

  const centralNum = [-39.69683028665376, 220.9460984245205, -275.9285104469687, 138.357751867269, -30.66479806614716, 2.506628277459239];
  const centralDen = [-54.47609879822406, 161.5858368580409, -155.6989798598866, 66.80131188771972, -13.28068155288572, 1];
  const tailNum = [-0.007784894002430293, -0.3223964580411365, -2.400758277161838, -2.549732539343734, 4.374664141464968, 2.938163982698783];
  const tailDen = [0.007784695709041462, 0.3224671290700398, 2.445134137142996, 3.754408661907416, 1];

  // Horner evaluation, highest-order coefficient first; the denominators
  // carry their trailing constant 1 as the last coefficient.
  const horner = (coefficients, x) => {
    let acc = coefficients[0];
    for (let k = 1; k < coefficients.length; k++) {
      acc = acc * x + coefficients[k];
    }
    return acc;
  };

  const lowerBreak = 0.02425;
  const upperBreak = 1 - lowerBreak;

  if (p < lowerBreak) {
    const t = Math.sqrt(-2 * Math.log(p));
    return horner(tailNum, t) / horner(tailDen, t);
  }

  if (p <= upperBreak) {
    const centered = p - 0.5;
    const squared = centered * centered;
    return horner(centralNum, squared) * centered / horner(centralDen, squared);
  }

  // Upper tail mirrors the lower tail with the sign flipped.
  const t = Math.sqrt(-2 * Math.log(1 - p));
  return -horner(tailNum, t) / horner(tailDen, t);
}
|
|
6974
|
+
|
|
7122
6975
|
// src/behavior-dsl.ts
|
|
7123
6976
|
var BehaviorAssertion = class {
|
|
7124
6977
|
constructor(store, runId) {
|
|
@@ -12692,6 +12545,274 @@ function samePopulation(a, b) {
|
|
|
12692
12545
|
return b.every((id) => setA.has(id));
|
|
12693
12546
|
}
|
|
12694
12547
|
|
|
12548
|
+
// src/multi-shot-optimization.ts
|
|
12549
|
+
/**
 * Run a prompt-evolution search and, when a gate is configured, decide
 * whether the search winner may replace the baseline.
 *
 * The baseline is the first seed variant. The gate is only evaluated when the
 * search's best variant differs from the baseline; if the gate declines
 * promotion, the baseline and its aggregate are reported as promoted instead.
 *
 * @param {object} config - Validated by `validateConfig` before any work.
 * @returns {Promise<object>} `{ evolution, searchBestVariant,
 *   searchBestAggregate, promotedVariant, promotedAggregate, gate }`, where
 *   `gate` is `null` when no gate evaluation ran.
 * @throws {Error} When `config` fails validation.
 */
async function runMultiShotOptimization(config) {
  validateConfig(config);

  const evolution = await runPromptEvolution({
    runId: config.runId,
    target: config.target,
    seedVariants: config.seedVariants,
    scenarioIds: config.searchScenarioIds,
    reps: config.reps,
    generations: config.generations,
    populationSize: config.populationSize,
    scoreConcurrency: config.scoreConcurrency ?? 1,
    scoreAdapter: {
      // Search-phase trials are always tagged with the "search" split.
      score: (args) => scoreOne(config, args.variant, args.scenarioId, args.rep, "search")
    },
    mutateAdapter: {
      mutate: (args) => config.mutateAdapter.mutate({
        ...args,
        topTrials: args.topTrials,
        bottomTrials: args.bottomTrials
      })
    },
    objectives: config.objectives ?? defaultMultiShotObjectives(),
    scalarWeights: config.scalarWeights,
    earlyStopOnNoImprovement: config.earlyStopOnNoImprovement,
    cache: config.cache,
    onProgress: config.onProgress
  });

  const baseline = config.seedVariants[0];
  const outcome = {
    evolution,
    searchBestVariant: evolution.bestVariant,
    searchBestAggregate: evolution.bestAggregate,
    promotedVariant: evolution.bestVariant,
    promotedAggregate: evolution.bestAggregate,
    gate: null
  };

  const needsGate = config.gate && evolution.bestVariant.id !== baseline.id;
  if (needsGate) {
    outcome.gate = await evaluateMultiShotGate(config, baseline, evolution.bestVariant);
    if (!outcome.gate.decision.promote) {
      // Gate rejected the candidate: fall back to the baseline.
      outcome.promotedVariant = baseline;
      outcome.promotedAggregate = aggregateFor(evolution, baseline.id);
    }
  }
  return outcome;
}
|
|
12597
|
+
/**
 * Default objectives for multi-shot optimization: maximize the aggregate's
 * `meanScore` and minimize its `meanCost`.
 *
 * @returns {Array<{name: string, direction: string, value: Function}>} A
 *   fresh two-element array, score first.
 */
function defaultMultiShotObjectives() {
  const maximizeScore = {
    name: "score",
    direction: "maximize",
    value: (aggregate) => aggregate.meanScore
  };
  const minimizeCost = {
    name: "cost",
    direction: "minimize",
    value: (aggregate) => aggregate.meanCost
  };
  return [maximizeScore, minimizeCost];
}
|
|
12603
|
+
/**
 * Convert a multi-shot trial into a trial-trace record.
 *
 * Each `asi` entry becomes an expectation: its id falls back to
 * `asi-<index>` and `matched` defaults to `false`. `emitted` falls back to an
 * excerpt of the trial's trace when the trial has none.
 *
 * @param {object} trial - Trial with `variantId`, `scenarioId`, `rep`,
 *   `score`, and optional `asi`, `emitted`, `trace`, `metrics`.
 * @returns {object} `{ id, score, inputName, expectations, emitted, metrics }`.
 */
function trialTraceFromMultiShotTrial(trial) {
  const findings = trial.asi ?? [];
  const expectations = findings.map((item, index) => ({
    id: item.expectationId ?? `asi-${index}`,
    phrase: item.message,
    matched: item.matched ?? false
  }));
  const emitted = trial.emitted ?? traceExcerpt(trial.trace);
  return {
    id: `${trial.variantId}/${trial.scenarioId}/r${trial.rep}`,
    score: trial.score,
    inputName: trial.scenarioId,
    expectations,
    emitted,
    metrics: trial.metrics
  };
}
|
|
12617
|
+
/**
 * Re-score the baseline and the candidate on the gate's scenarios and ask a
 * `HeldOutGate` for a promotion decision.
 *
 * Scenarios are processed search split first, then holdout. For every
 * scenario and repetition the baseline is scored before the candidate, one
 * at a time, and both trials are converted to validated run records.
 *
 * @param {object} config - Optimization config; `config.gate` must be set.
 * @param {object} baseline - Baseline variant.
 * @param {object} candidate - Candidate variant.
 * @returns {Promise<{decision: object, candidateRuns: object[], baselineRuns: object[]}>}
 */
async function evaluateMultiShotGate(config, baseline, candidate) {
  const gateConfig = config.gate;
  const reps = gateConfig.reps ?? config.reps;
  const splits = [
    { split: "search", scenarioIds: gateConfig.searchScenarioIds ?? config.searchScenarioIds },
    { split: "holdout", scenarioIds: gateConfig.holdoutScenarioIds }
  ];

  const baselineRuns = [];
  const candidateRuns = [];
  for (const { split, scenarioIds } of splits) {
    for (const scenarioId of scenarioIds) {
      for (let rep = 0; rep < reps; rep++) {
        const seed = seedFor(config, scenarioId, rep);
        const baseTrial = await scoreOne(config, baseline, scenarioId, rep, split);
        const candTrial = await scoreOne(config, candidate, scenarioId, rep, split);
        baselineRuns.push(toValidatedRecord(config, baseline, scenarioId, rep, split, seed, baseTrial));
        candidateRuns.push(toValidatedRecord(config, candidate, scenarioId, rep, split, seed, candTrial));
      }
    }
  }

  const decision = new HeldOutGate(gateConfig.gate).evaluate(candidateRuns, baselineRuns);
  return { decision, candidateRuns, baselineRuns };
}
|
|
12644
|
+
/**
 * Run and score one variant on one scenario repetition.
 *
 * Any error thrown by the runner or the scorer is converted into a failed
 * trial (`ok: false`, score 0, cost 0, a single critical `asi` entry) rather
 * than propagated, so one bad trial does not abort the caller.
 *
 * @param {object} config - Supplies `runner`, `scorer` and `target`.
 * @param {{id: string}} variant - Variant under test.
 * @param {string} scenarioId
 * @param {number} rep - Repetition index.
 * @param {string} split - Split label recorded on the trial.
 * @returns {Promise<object>} The trial record.
 */
async function scoreOne(config, variant, scenarioId, rep, split) {
  const seed = seedFor(config, scenarioId, rep);
  const input = { variant, scenarioId, rep, split, seed };

  try {
    const run = await config.runner.run(input);
    const scored = await config.scorer.score({ ...input, run });
    const asi = scored.asi ?? [];
    // ASI-derived counters are spread last, so they win on key collisions.
    const metrics = {
      ...numericMetrics(scored.metrics),
      ...asiMetrics(asi)
    };
    return {
      variantId: variant.id,
      scenarioId,
      rep,
      ok: scored.ok ?? true,
      score: clamp013(scored.score),
      // Scorer-reported cost and duration take precedence over the runner's.
      cost: scored.costUsd ?? run.costUsd ?? 0,
      durationMs: scored.durationMs ?? run.durationMs ?? 0,
      metrics,
      split,
      seed,
      trace: run.trace,
      asi,
      emitted: scored.emitted ?? traceExcerpt(run.trace),
      metadata: scored.metadata
    };
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    return {
      variantId: variant.id,
      scenarioId,
      rep,
      ok: false,
      score: 0,
      cost: 0,
      durationMs: 0,
      metrics: { error: 1 },
      error: message,
      split,
      seed,
      asi: [{
        severity: "critical",
        message,
        responsibleSurface: config.target
      }],
      emitted: ""
    };
  }
}
|
|
12692
|
+
/**
 * Build a run record through the gate's `toRunRecord` callback and pass it
 * through `validateRunRecord`.
 *
 * @returns {object} Whatever `validateRunRecord` returns for the record.
 */
function toValidatedRecord(config, variant, scenarioId, rep, split, seed, trial) {
  const context = { variant, scenarioId, rep, split, seed, trial };
  return validateRunRecord(config.gate.toRunRecord(context));
}
|
|
12696
|
+
/**
 * Validate a multi-shot optimization config, throwing on the first problem.
 *
 * Checks, in order: non-blank `runId` and `target`; non-empty `seedVariants`
 * and `searchScenarioIds`; positive-integer `reps`, `generations`,
 * `populationSize` (and `scoreConcurrency` when given); `populationSize` at
 * least the seed count; unique seed ids and search scenario ids. With a gate:
 * non-empty, unique holdout ids that do not appear in `searchScenarioIds`,
 * and `gate.gate.baselineKey` equal to the first seed variant's id.
 *
 * @param {object} config
 * @returns {void}
 * @throws {Error} Message prefixed with `runMultiShotOptimization:`.
 */
function validateConfig(config) {
  if (!config.runId.trim()) {
    throw new Error("runMultiShotOptimization: runId must not be empty");
  }
  if (!config.target.trim()) {
    throw new Error("runMultiShotOptimization: target must not be empty");
  }
  if (config.seedVariants.length === 0) {
    throw new Error("runMultiShotOptimization: seedVariants must not be empty");
  }
  if (config.searchScenarioIds.length === 0) {
    throw new Error("runMultiShotOptimization: searchScenarioIds must not be empty");
  }

  for (const field of ["reps", "generations", "populationSize"]) {
    requirePositiveInteger(config[field], field);
  }
  if (config.scoreConcurrency !== void 0) {
    requirePositiveInteger(config.scoreConcurrency, "scoreConcurrency");
  }
  if (config.populationSize < config.seedVariants.length) {
    throw new Error("runMultiShotOptimization: populationSize must be >= seedVariants.length");
  }
  assertUnique(config.seedVariants.map((v) => v.id), "seedVariants.id");
  assertUnique(config.searchScenarioIds, "searchScenarioIds");

  const gate = config.gate;
  if (!gate) {
    return;
  }

  if (gate.holdoutScenarioIds.length === 0) {
    throw new Error("runMultiShotOptimization: gate.holdoutScenarioIds must not be empty");
  }
  if (gate.reps !== void 0) {
    requirePositiveInteger(gate.reps, "gate.reps");
  }
  assertUnique(gate.holdoutScenarioIds, "gate.holdoutScenarioIds");
  if (gate.searchScenarioIds) {
    assertUnique(gate.searchScenarioIds, "gate.searchScenarioIds");
  }

  // A holdout scenario must not also be one the search ran on.
  const searchIds = new Set(config.searchScenarioIds);
  for (const id of gate.holdoutScenarioIds) {
    if (searchIds.has(id)) {
      throw new Error(`runMultiShotOptimization: holdout scenario "${id}" also appears in searchScenarioIds`);
    }
  }

  const baselineId = config.seedVariants[0].id;
  if (gate.gate.baselineKey !== baselineId) {
    throw new Error(
      `runMultiShotOptimization: gate.gate.baselineKey must match first seed variant id "${baselineId}"`
    );
  }
}
|
|
12735
|
+
/**
 * Assert that a config value is an integer greater than zero.
 *
 * @param {*} value - Value to check.
 * @param {string} name - Field name used in the error message.
 * @returns {void}
 * @throws {Error} When `value` is not a positive integer.
 */
function requirePositiveInteger(value, name) {
  const isPositiveInteger = Number.isInteger(value) && value > 0;
  if (isPositiveInteger) {
    return;
  }
  throw new Error(`runMultiShotOptimization: ${name} must be a positive integer`);
}
|
|
12740
|
+
/**
 * Assert that every value is a non-blank string and that none repeats.
 *
 * @param {Iterable<string>} values - Values to check, in order.
 * @param {string} name - Field name used in error messages.
 * @returns {void}
 * @throws {Error} On the first blank or duplicate value.
 */
function assertUnique(values, name) {
  const alreadySeen = /* @__PURE__ */ new Set();
  for (const candidate of values) {
    const isBlank = !candidate.trim();
    if (isBlank) {
      throw new Error(`runMultiShotOptimization: ${name} must not contain empty values`);
    }
    if (alreadySeen.has(candidate)) {
      throw new Error(`runMultiShotOptimization: duplicate ${name} "${candidate}"`);
    }
    alreadySeen.add(candidate);
  }
}
|
|
12748
|
+
/**
 * Look up the most recent aggregate recorded for a variant.
 *
 * Generations are searched newest-to-oldest, so the final generation still
 * wins whenever it contains the variant. Earlier generations are used as a
 * fallback: this function is called for the baseline after a gate rejection,
 * and inspecting only the final generation would throw — discarding the whole
 * finished run — if the baseline is absent from that generation's aggregates.
 * NOTE(review): whether the evolution can actually drop the baseline from the
 * final generation depends on `runPromptEvolution`, which is not visible
 * here — confirm.
 *
 * @param {{ generations?: Array<{ aggregates?: Array<{ variantId: string }> }> }} evolution
 *   Evolution result holding per-generation aggregates.
 * @param {string} variantId - Variant to look up.
 * @returns {object} The newest aggregate whose `variantId` matches.
 * @throws {Error} When no generation has an aggregate for the variant.
 */
function aggregateFor(evolution, variantId) {
  const generations = evolution.generations ?? [];
  for (let index = generations.length - 1; index >= 0; index--) {
    const match = generations[index]?.aggregates?.find((a) => a.variantId === variantId);
    if (match) {
      return match;
    }
  }
  throw new Error(`runMultiShotOptimization: missing aggregate for variant "${variantId}"`);
}
|
|
12756
|
+
/**
 * Derive the seed for a scenario repetition.
 *
 * The seed is `config.seedBase` (default 0) plus a hash of the scenario id
 * immediately followed by the repetition number, reduced modulo
 * `Number.MAX_SAFE_INTEGER`. The variant is not an input, so every variant
 * gets the same seed for a given scenario and repetition.
 * NOTE(review): the id and rep are joined without a separator, so distinct
 * pairs such as ("a1", 2) and ("a", 12) hash the same string.
 *
 * @param {{ seedBase?: number }} config
 * @param {string} scenarioId
 * @param {number} rep - Repetition index.
 * @returns {number}
 */
function seedFor(config, scenarioId, rep) {
  const offset = config.seedBase ?? 0;
  const hashed = stableHash2(`${scenarioId}${rep}`);
  return (offset + hashed) % Number.MAX_SAFE_INTEGER;
}
|
|
12760
|
+
/**
 * Hash a string to an unsigned 32-bit integer.
 *
 * FNV-1a style: start from 2166136261, then for each UTF-16 code unit XOR it
 * in and multiply by 16777619 with 32-bit wraparound.
 *
 * @param {string} input
 * @returns {number} Integer in [0, 2^32).
 */
function stableHash2(input) {
  let hash = 2166136261;
  let position = 0;
  while (position < input.length) {
    hash ^= input.charCodeAt(position);
    // Math.imul keeps the multiplication in 32-bit integer arithmetic.
    hash = Math.imul(hash, 16777619);
    position += 1;
  }
  // Reinterpret the possibly-negative int32 as unsigned.
  return hash >>> 0;
}
|
|
12768
|
+
/**
 * Clamp a score into [0, 1].
 *
 * @param {*} n - Candidate score.
 * @returns {number} 0 when `n` is not a finite number, otherwise `n` limited
 *   to the range 0..1.
 */
function clamp013(n) {
  return Number.isFinite(n) ? Math.min(1, Math.max(0, n)) : 0;
}
|
|
12772
|
+
/**
 * Keep only the entries of a metrics object whose value is a finite number.
 *
 * @param {Object<string, *>|null|undefined} metrics
 * @returns {Object<string, number>} New object; empty when `metrics` is
 *   null or undefined.
 */
function numericMetrics(metrics) {
  const kept = {};
  const source = metrics ?? {};
  Object.keys(source).forEach((key) => {
    const value = source[key];
    if (Number.isFinite(value)) {
      kept[key] = value;
    }
  });
  return kept;
}
|
|
12779
|
+
/**
 * Summarize ASI findings as numeric counters.
 *
 * `asi` is the total number of findings. Per-severity (`asi.<severity>`) and
 * per-surface (`surface.<segment>`) counters are tallied over the first 1000
 * findings only.
 *
 * @param {Array<{severity?: string, responsibleSurface?: string}>} asi
 * @returns {Object<string, number>}
 */
function asiMetrics(asi) {
  const counters = { asi: asi.length };
  const bump = (key) => {
    counters[key] = (counters[key] ?? 0) + 1;
  };
  for (const finding of asi.slice(0, 1e3)) {
    bump(`asi.${normalizeSeverity(finding.severity)}`);
    if (finding.responsibleSurface) {
      bump(`surface.${metricKeySegment(finding.responsibleSurface)}`);
    }
  }
  return counters;
}
|
|
12791
|
+
/**
 * Map a severity value onto the recognized set.
 *
 * @param {*} severity
 * @returns {string} `severity` itself when it is exactly "info", "warning",
 *   "error" or "critical"; otherwise "error".
 */
function normalizeSeverity(severity) {
  switch (severity) {
    case "info":
    case "warning":
    case "error":
    case "critical":
      return severity;
    default:
      return "error";
  }
}
|
|
12797
|
+
/**
 * Turn free text into a metric-key segment.
 *
 * Trims the text, replaces each run of characters other than ASCII letters,
 * digits, `.`, `_` and `-` with one underscore, and keeps the first 80
 * characters.
 *
 * @param {string} raw
 * @returns {string} The segment, or "unknown" when nothing is left.
 */
function metricKeySegment(raw) {
  const sanitized = raw.trim().replace(/[^a-zA-Z0-9._-]+/g, "_");
  const clipped = sanitized.slice(0, 80);
  return clipped || "unknown";
}
|
|
12800
|
+
/**
 * Extract a short text excerpt from a trace.
 *
 * Preference order: a string `output` (even when empty), then a truthy
 * `transcript`, then the JSON of the first 20 `turns` cut to 2000 characters
 * with a note about how many turns were left out.
 *
 * @param {{output?: *, transcript?: *, turns?: Array}|null|undefined} trace
 * @returns {string|undefined} The excerpt; `undefined` when there is no trace
 *   or none of the fields apply; a fixed placeholder when the turns cannot be
 *   serialized.
 */
function traceExcerpt(trace) {
  if (!trace) {
    return void 0;
  }
  if (typeof trace.output === "string") {
    return trace.output;
  }
  if (trace.transcript) {
    return trace.transcript;
  }
  if (!trace.turns) {
    return void 0;
  }

  try {
    const clipped = trace.turns.slice(0, 20);
    const omitted = trace.turns.length - clipped.length;
    const suffix = omitted > 0 ? ` ... ${omitted} more turn(s)` : "";
    const serialized = JSON.stringify(clipped).slice(0, 2e3);
    return `${serialized}${suffix}`;
  } catch {
    // JSON.stringify throws on cycles and BigInt; report a placeholder instead.
    return "[unserializable trace turns]";
  }
}
|
|
12815
|
+
|
|
12695
12816
|
// src/jsonl-trial-cache.ts
|
|
12696
12817
|
import { appendFileSync as appendFileSync4, existsSync as existsSync6, mkdirSync as mkdirSync4, readFileSync as readFileSync5 } from "fs";
|
|
12697
12818
|
import { dirname as dirname4 } from "path";
|
|
@@ -13727,12 +13848,10 @@ export {
|
|
|
13727
13848
|
Mutex,
|
|
13728
13849
|
NoopResearcher,
|
|
13729
13850
|
OTEL_AGENT_EVAL_SCOPE,
|
|
13730
|
-
OptimizationLoop,
|
|
13731
13851
|
PairwiseSteeringOptimizer,
|
|
13732
13852
|
PrmGrader,
|
|
13733
13853
|
ProductClient,
|
|
13734
13854
|
ProjectRegistry,
|
|
13735
|
-
PromptOptimizer,
|
|
13736
13855
|
PromptRegistry,
|
|
13737
13856
|
REDACTION_VERSION,
|
|
13738
13857
|
RunCritic,
|
|
@@ -13811,6 +13930,7 @@ export {
|
|
|
13811
13930
|
decideReferenceReplayPromotion,
|
|
13812
13931
|
decideReferenceReplayRunPromotion,
|
|
13813
13932
|
defaultJudges,
|
|
13933
|
+
defaultMultiShotObjectives,
|
|
13814
13934
|
defaultReferenceReplayMatcher,
|
|
13815
13935
|
deployGateLayer,
|
|
13816
13936
|
distillPlaybook,
|
|
@@ -13948,6 +14068,7 @@ export {
|
|
|
13948
14068
|
runJudgeFleet,
|
|
13949
14069
|
runKeywordCoverageJudge,
|
|
13950
14070
|
runKeywordCoverageJudgeUrl,
|
|
14071
|
+
runMultiShotOptimization,
|
|
13951
14072
|
runPromptEvolution,
|
|
13952
14073
|
runProposeReview,
|
|
13953
14074
|
runProposeReviewAsControlLoop,
|
|
@@ -13991,6 +14112,7 @@ export {
|
|
|
13991
14112
|
toolSpans,
|
|
13992
14113
|
toolSuccessRubric,
|
|
13993
14114
|
toolWasteView,
|
|
14115
|
+
trialTraceFromMultiShotTrial,
|
|
13994
14116
|
typoMutator,
|
|
13995
14117
|
urlContains,
|
|
13996
14118
|
validateRunRecord,
|