@tangle-network/agent-eval 0.11.0 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +498 -4
- package/dist/index.js +786 -38
- package/dist/index.js.map +1 -1
- package/dist/sink-fetch-C0B8ximv.d.ts +101 -0
- package/dist/telemetry/file.d.ts +19 -0
- package/dist/telemetry/file.js +40 -0
- package/dist/telemetry/file.js.map +1 -0
- package/dist/telemetry/index.d.ts +38 -0
- package/dist/telemetry/index.js +128 -0
- package/dist/telemetry/index.js.map +1 -0
- package/package.json +18 -9
package/dist/index.js
CHANGED
|
@@ -410,7 +410,7 @@ function confidenceInterval(scores, confidence = 0.95) {
|
|
|
410
410
|
if (scores.length === 0) return { mean: 0, lower: 0, upper: 0 };
|
|
411
411
|
if (scores.length === 1) return { mean: scores[0], lower: scores[0], upper: scores[0] };
|
|
412
412
|
const n = scores.length;
|
|
413
|
-
const
|
|
413
|
+
const mean7 = scores.reduce((a, b) => a + b, 0) / n;
|
|
414
414
|
const B = 1e3;
|
|
415
415
|
const bootstrapMeans = [];
|
|
416
416
|
for (let i = 0; i < B; i++) {
|
|
@@ -425,7 +425,7 @@ function confidenceInterval(scores, confidence = 0.95) {
|
|
|
425
425
|
const lowerIdx = Math.floor(alpha / 2 * B);
|
|
426
426
|
const upperIdx = Math.floor((1 - alpha / 2) * B) - 1;
|
|
427
427
|
return {
|
|
428
|
-
mean:
|
|
428
|
+
mean: mean7,
|
|
429
429
|
lower: bootstrapMeans[lowerIdx],
|
|
430
430
|
upper: bootstrapMeans[Math.min(upperIdx, B - 1)]
|
|
431
431
|
};
|
|
@@ -513,11 +513,11 @@ function pairedTTest(before, after) {
|
|
|
513
513
|
const n = before.length;
|
|
514
514
|
if (n < 2) return { t: 0, df: 0, p: 1 };
|
|
515
515
|
const diffs = before.map((b, i) => after[i] - b);
|
|
516
|
-
const
|
|
517
|
-
const variance2 = diffs.reduce((acc, d) => acc + (d -
|
|
516
|
+
const mean7 = diffs.reduce((a, b) => a + b, 0) / n;
|
|
517
|
+
const variance2 = diffs.reduce((acc, d) => acc + (d - mean7) ** 2, 0) / (n - 1);
|
|
518
518
|
const se = Math.sqrt(variance2 / n);
|
|
519
|
-
if (se === 0) return { t:
|
|
520
|
-
const t =
|
|
519
|
+
if (se === 0) return { t: mean7 === 0 ? 0 : Infinity, df: n - 1, p: mean7 === 0 ? 1 : 0 };
|
|
520
|
+
const t = mean7 / se;
|
|
521
521
|
const df = n - 1;
|
|
522
522
|
const p = 2 * (1 - studentTCdf(Math.abs(t), df));
|
|
523
523
|
return { t, df, p };
|
|
@@ -541,9 +541,9 @@ function wilcoxonSignedRank(before, after) {
|
|
|
541
541
|
}
|
|
542
542
|
let wPlus = 0;
|
|
543
543
|
for (let k = 0; k < n; k++) if (diffs[k] > 0) wPlus += ranks3[k];
|
|
544
|
-
const
|
|
544
|
+
const mean7 = n * (n + 1) / 4;
|
|
545
545
|
const variance2 = n * (n + 1) * (2 * n + 1) / 24;
|
|
546
|
-
const z = (wPlus -
|
|
546
|
+
const z = (wPlus - mean7) / Math.sqrt(variance2);
|
|
547
547
|
const p = 2 * (1 - normalCdf(Math.abs(z)));
|
|
548
548
|
return { w: wPlus, p };
|
|
549
549
|
}
|
|
@@ -2486,6 +2486,56 @@ function paretoFrontier(candidates, objectives) {
|
|
|
2486
2486
|
}));
|
|
2487
2487
|
return { frontier, dominated, dominanceMap };
|
|
2488
2488
|
}
|
|
2489
|
+
/**
 * Collapse several objectives into one weighted score per candidate.
 *
 * Every objective is min-max normalised over the finite values observed in
 * `candidates`, flipped when its direction is not "maximize", and then
 * combined using weights divided by the total weight.
 *
 * @param {Array<*>} candidates - Items to score.
 * @param {Array<{name: string, direction: string, value: function(*): number}>} objectives
 *   Objectives; `value(candidate)` yields the raw metric.
 * @param {{weights?: Object<string, number>}} [options]
 *   Optional per-objective weights keyed by objective name; a missing entry counts as 1.
 * @returns {Array<{candidate: *, score: number}>} One entry per candidate, in input order.
 */
function scalarScore(candidates, objectives, options = {}) {
  if (candidates.length === 0) return [];
  const weights = options.weights ?? {};
  const weightOf = (obj) => weights[obj.name] ?? 1;
  const totalWeight = objectives.reduce((sum, obj) => sum + weightOf(obj), 0);
  const ranges = objectives.map((obj) => {
    // Plain loop instead of Math.min(...values): spreading a very large
    // array into a call can exceed the engine's argument limit.
    let min = Infinity;
    let max = -Infinity;
    for (const candidate of candidates) {
      const v = obj.value(candidate);
      if (!Number.isFinite(v)) continue;
      if (v < min) min = v;
      if (v > max) max = v;
    }
    // No finite value was seen for this objective.
    if (min === Infinity) return { min: 0, max: 1 };
    // Widen a degenerate range so the normalisation never divides by zero.
    return { min, max: max === min ? min + 1 : max };
  });
  return candidates.map((candidate) => {
    let score = 0;
    objectives.forEach((obj, i) => {
      const v = obj.value(candidate);
      // Non-finite metrics contribute nothing to the score.
      if (!Number.isFinite(v)) return;
      const { min, max } = ranges[i];
      const normalised = (v - min) / (max - min);
      const directional = obj.direction === "maximize" ? normalised : 1 - normalised;
      // A zero total weight would otherwise yield 0/0 = NaN for every candidate.
      const weight = totalWeight === 0 ? 0 : weightOf(obj) / totalWeight;
      score += directional * weight;
    });
    return { candidate, score };
  });
}
|
|
2514
|
+
/**
 * Compute a crowding distance for each candidate across all objectives.
 *
 * For every objective the candidates are sorted by value; the two extremes
 * get an infinite distance and each interior candidate accumulates the gap
 * between its neighbours divided by the objective's value range.
 *
 * @param {Array<*>} candidates - Candidates to measure.
 * @param {Array<{value: function(*): number}>} objectives - Objectives supplying `value(candidate)`.
 * @returns {Array<{candidate: *, distance: number}>} One entry per candidate, in input order.
 */
function crowdingDistance(candidates, objectives) {
  // Without this guard an empty list would call obj.value(undefined) below.
  if (candidates.length === 0) return [];
  const distances = new Map(candidates.map((c) => [c, 0]));
  for (const obj of objectives) {
    const sorted = [...candidates].sort((a, b) => obj.value(a) - obj.value(b));
    const first = sorted[0];
    const last = sorted[sorted.length - 1];
    const min = obj.value(first);
    const max = obj.value(last);
    // Fall back to 1 when all values coincide, to avoid dividing by zero.
    const range = max - min || 1;
    distances.set(first, Infinity);
    distances.set(last, Infinity);
    for (let i = 1; i < sorted.length - 1; i++) {
      const current = distances.get(sorted[i]);
      // Already marked as a boundary point by this or an earlier objective.
      if (current === Infinity) continue;
      const gap = obj.value(sorted[i + 1]) - obj.value(sorted[i - 1]);
      distances.set(sorted[i], current + gap / range);
    }
  }
  return candidates.map((c) => ({ candidate: c, distance: distances.get(c) ?? 0 }));
}
|
|
2533
|
+
/**
 * Return the Pareto frontier of `candidates`, ordered by descending crowding distance.
 *
 * @param {Array<*>} candidates - Candidates to filter.
 * @param {Array<Object>} objectives - Objectives handed to `paretoFrontier` and `crowdingDistance`.
 * @returns {Array<{candidate: *, distance: number}>} Frontier members with their distances; empty when the frontier is empty.
 */
function paretoFrontierWithCrowding(candidates, objectives) {
  const nonDominated = paretoFrontier(candidates, objectives).frontier;
  if (nonDominated.length === 0) {
    return [];
  }
  const withDistance = crowdingDistance(nonDominated, objectives);
  withDistance.sort((left, right) => right.distance - left.distance);
  return withDistance;
}
|
|
2489
2539
|
|
|
2490
2540
|
// src/harness-optimizer.ts
|
|
2491
2541
|
var DEFAULT_HARNESS_OBJECTIVES = [
|
|
@@ -5095,10 +5145,10 @@ function analyzeSeries(values, options = {}) {
|
|
|
5095
5145
|
return { state: "insufficient-data", windowMean: 0, windowCv: 0, tailRun: 0, stable: false };
|
|
5096
5146
|
}
|
|
5097
5147
|
const tail = values.slice(-window);
|
|
5098
|
-
const
|
|
5099
|
-
const variance2 = tail.reduce((acc, v) => acc + (v -
|
|
5148
|
+
const mean7 = tail.reduce((a, b) => a + b, 0) / tail.length;
|
|
5149
|
+
const variance2 = tail.reduce((acc, v) => acc + (v - mean7) ** 2, 0) / tail.length;
|
|
5100
5150
|
const stdDev = Math.sqrt(variance2);
|
|
5101
|
-
const refMean = Math.abs(
|
|
5151
|
+
const refMean = Math.abs(mean7) > 1e-9 ? Math.abs(mean7) : 1;
|
|
5102
5152
|
const cv = stdDev / refMean;
|
|
5103
5153
|
const stable = tail.length >= window && cv <= stableCv;
|
|
5104
5154
|
let tailRun = 0;
|
|
@@ -5119,7 +5169,7 @@ function analyzeSeries(values, options = {}) {
|
|
|
5119
5169
|
} else {
|
|
5120
5170
|
state = "noisy";
|
|
5121
5171
|
}
|
|
5122
|
-
return { state, windowMean:
|
|
5172
|
+
return { state, windowMean: mean7, windowCv: cv, tailRun, stable };
|
|
5123
5173
|
}
|
|
5124
5174
|
|
|
5125
5175
|
// src/state-continuity.ts
|
|
@@ -6047,12 +6097,12 @@ async function paraphraseRobustness(prompt, mutators, scoreFn, options = {}) {
|
|
|
6047
6097
|
variantScores.push({ mutator: id, score, mutated });
|
|
6048
6098
|
all.push(score);
|
|
6049
6099
|
}
|
|
6050
|
-
const
|
|
6051
|
-
const variance2 = all.reduce((a, v) => a + (v -
|
|
6100
|
+
const mean7 = all.reduce((a, b) => a + b, 0) / all.length;
|
|
6101
|
+
const variance2 = all.reduce((a, v) => a + (v - mean7) ** 2, 0) / all.length;
|
|
6052
6102
|
const stdDev = Math.sqrt(variance2);
|
|
6053
|
-
const ref = Math.abs(
|
|
6103
|
+
const ref = Math.abs(mean7) > 1e-9 ? Math.abs(mean7) : 1;
|
|
6054
6104
|
const robustness = Math.max(0, 1 - stdDev / ref);
|
|
6055
|
-
return { originalScore, variantScores, meanScore:
|
|
6105
|
+
return { originalScore, variantScores, meanScore: mean7, stdDev, robustness };
|
|
6056
6106
|
}
|
|
6057
6107
|
var lowercaseMutator = (p) => p.toLowerCase();
|
|
6058
6108
|
var sentenceReorderMutator = (p, seed) => {
|
|
@@ -6973,8 +7023,8 @@ async function prmBestOfN(store, grader, runIds) {
|
|
|
6973
7023
|
if (runIds.length === 0) throw new Error("prmBestOfN: at least 1 candidate required");
|
|
6974
7024
|
const graded = await Promise.all(runIds.map((id) => grader.grade(store, id)));
|
|
6975
7025
|
const ranked = [...graded].sort((a, b) => b.aggregateScore - a.aggregateScore);
|
|
6976
|
-
const
|
|
6977
|
-
const variance2 = graded.reduce((a, g) => a + (g.aggregateScore -
|
|
7026
|
+
const mean7 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
|
|
7027
|
+
const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean7) ** 2, 0) / graded.length;
|
|
6978
7028
|
return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
|
|
6979
7029
|
}
|
|
6980
7030
|
async function prmEnsembleBestOfN(store, graders, runIds) {
|
|
@@ -6996,8 +7046,8 @@ async function prmEnsembleBestOfN(store, graders, runIds) {
|
|
|
6996
7046
|
const ranked = [...byRun.values()].sort(
|
|
6997
7047
|
(a, b) => (bordaScores.get(b.runId) ?? 0) - (bordaScores.get(a.runId) ?? 0)
|
|
6998
7048
|
);
|
|
6999
|
-
const
|
|
7000
|
-
const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore -
|
|
7049
|
+
const mean7 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
|
|
7050
|
+
const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean7) ** 2, 0) / ranked.length;
|
|
7001
7051
|
return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
|
|
7002
7052
|
}
|
|
7003
7053
|
|
|
@@ -7527,8 +7577,8 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
|
|
|
7527
7577
|
const sRuns = runs.filter((r) => r.scenarioId === s.id);
|
|
7528
7578
|
const scores = sRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
|
|
7529
7579
|
if (scores.length < 3) continue;
|
|
7530
|
-
const
|
|
7531
|
-
const variance2 = scores.reduce((a, b) => a + (b -
|
|
7580
|
+
const mean7 = scores.reduce((a, b) => a + b, 0) / scores.length;
|
|
7581
|
+
const variance2 = scores.reduce((a, b) => a + (b - mean7) ** 2, 0) / scores.length;
|
|
7532
7582
|
if (variance2 > varianceThreshold) {
|
|
7533
7583
|
targets.push({
|
|
7534
7584
|
reason: "high-variance",
|
|
@@ -9491,6 +9541,7 @@ async function runReferenceReplay(cases, options) {
|
|
|
9491
9541
|
const scoreOptions2 = {
|
|
9492
9542
|
matcher: options.matcher,
|
|
9493
9543
|
matchThreshold: options.matchThreshold,
|
|
9544
|
+
matchStrategy: options.matchStrategy,
|
|
9494
9545
|
includeHoldout: true
|
|
9495
9546
|
};
|
|
9496
9547
|
const scenarioScore = scoreReferenceReplay([scenario], scoreOptions2).scenarios[0];
|
|
@@ -9510,6 +9561,7 @@ async function runReferenceReplay(cases, options) {
|
|
|
9510
9561
|
const scoreOptions = {
|
|
9511
9562
|
matcher: options.matcher,
|
|
9512
9563
|
matchThreshold: options.matchThreshold,
|
|
9564
|
+
matchStrategy: options.matchStrategy,
|
|
9513
9565
|
includeHoldout: true
|
|
9514
9566
|
};
|
|
9515
9567
|
const run = {
|
|
@@ -9560,12 +9612,13 @@ function jsonlReferenceReplayStore(path) {
|
|
|
9560
9612
|
function scoreReferenceReplay(scenarios, options = {}) {
|
|
9561
9613
|
const matcher = options.matcher ?? defaultReferenceReplayMatcher;
|
|
9562
9614
|
const threshold = options.matchThreshold ?? DEFAULT_MATCH_THRESHOLD;
|
|
9615
|
+
const matchStrategy = options.matchStrategy ?? "reference-order";
|
|
9563
9616
|
const allowedSplits = new Set(options.splits ?? ALL_SPLITS);
|
|
9564
9617
|
const scores = scenarios.filter((scenario) => {
|
|
9565
9618
|
const split = scenario.split ?? "train";
|
|
9566
9619
|
if (split === "holdout" && !options.includeHoldout) return false;
|
|
9567
9620
|
return allowedSplits.has(split);
|
|
9568
|
-
}).map((scenario) => scoreScenario(scenario, matcher, threshold));
|
|
9621
|
+
}).map((scenario) => scoreScenario(scenario, matcher, threshold, matchStrategy));
|
|
9569
9622
|
return {
|
|
9570
9623
|
scenarios: scores,
|
|
9571
9624
|
aggregate: aggregateScenarioScores(scores),
|
|
@@ -9664,18 +9717,18 @@ function defaultReferenceReplayMatcher(reference, candidate) {
|
|
|
9664
9717
|
const score = clamp012(textScore * 0.85 + tagScore + severityScore);
|
|
9665
9718
|
return { score, reason: `token=${textScore.toFixed(2)} tags=${tagScore.toFixed(2)} severity=${severityScore.toFixed(2)}` };
|
|
9666
9719
|
}
|
|
9667
|
-
function scoreScenario(scenario, matcher, threshold) {
|
|
9720
|
+
/**
 * Score one scenario with the requested matching strategy.
 *
 * "global-greedy" selects the global greedy matcher; any other value uses
 * the reference-order matcher.
 *
 * @param {Object} scenario - Scenario holding references and candidates.
 * @param {Function} matcher - Pair matcher forwarded to the chosen strategy.
 * @param {number} threshold - Minimum score for a pair to count as a match.
 * @param {string} matchStrategy - Strategy name.
 * @returns {Object} The scenario score produced by the chosen strategy.
 */
function scoreScenario(scenario, matcher, threshold, matchStrategy) {
  if (matchStrategy === "global-greedy") {
    return scoreScenarioGlobalGreedy(scenario, matcher, threshold);
  }
  return scoreScenarioReferenceOrder(scenario, matcher, threshold);
}
|
|
9723
|
+
function scoreScenarioReferenceOrder(scenario, matcher, threshold) {
|
|
9668
9724
|
const candidatesLeft = scenario.candidates.map((candidate, index) => ({ candidate, index }));
|
|
9669
9725
|
const matches2 = [];
|
|
9670
9726
|
for (const reference of scenario.references) {
|
|
9671
9727
|
let best = null;
|
|
9672
9728
|
for (const item of candidatesLeft) {
|
|
9673
|
-
const result = matcher
|
|
9674
|
-
if (!Number.isFinite(result.score)) {
|
|
9675
|
-
throw new Error(`reference replay matcher returned non-finite score for ${scenario.id}:${reference.id}:${item.candidate.id}`);
|
|
9676
|
-
}
|
|
9729
|
+
const result = scorePair(scenario, matcher, reference, item.candidate);
|
|
9677
9730
|
if (!best || result.score > best.score) {
|
|
9678
|
-
best = { ...item,
|
|
9731
|
+
best = { ...item, ...result };
|
|
9679
9732
|
}
|
|
9680
9733
|
}
|
|
9681
9734
|
const weight = reference.weight ?? 1;
|
|
@@ -9703,12 +9756,72 @@ function scoreScenario(scenario, matcher, threshold) {
|
|
|
9703
9756
|
});
|
|
9704
9757
|
}
|
|
9705
9758
|
}
|
|
9759
|
+
return buildScenarioScore(scenario, matches2, candidatesLeft.length);
|
|
9760
|
+
}
|
|
9761
|
+
/**
 * Score a scenario with a globally greedy one-to-one assignment.
 *
 * Every reference/candidate pair is scored, pairs are visited from the
 * highest score down (ties broken by reference index, then candidate index),
 * and a pair is accepted when it meets `threshold` and neither side is
 * already taken. Unmatched references report their best-scoring pair.
 *
 * @param {Object} scenario - Scenario with `id`, `references` and `candidates`.
 * @param {Function} matcher - Pair matcher passed to `scorePair`.
 * @param {number} threshold - Minimum score for a pair to be accepted.
 * @returns {Object} Result of `buildScenarioScore` for the computed matches.
 */
function scoreScenarioGlobalGreedy(scenario, matcher, threshold) {
  const pairs = [];
  for (let referenceIndex = 0; referenceIndex < scenario.references.length; referenceIndex++) {
    const reference = scenario.references[referenceIndex];
    for (let candidateIndex = 0; candidateIndex < scenario.candidates.length; candidateIndex++) {
      const candidate = scenario.candidates[candidateIndex];
      const scored = scorePair(scenario, matcher, reference, candidate);
      pairs.push({ referenceIndex, candidateIndex, reference, candidate, ...scored });
    }
  }

  // Best score first; equal scores keep a deterministic index order.
  pairs.sort((a, b) => {
    const byScore = b.score - a.score;
    if (byScore) return byScore;
    const byReference = a.referenceIndex - b.referenceIndex;
    if (byReference) return byReference;
    return a.candidateIndex - b.candidateIndex;
  });

  const chosenByReference = new Map();
  const usedCandidates = new Set();
  for (const pair of pairs) {
    // Pairs are sorted, so everything after this one is below threshold too.
    if (pair.score < threshold) break;
    const taken = chosenByReference.has(pair.referenceIndex) || usedCandidates.has(pair.candidateIndex);
    if (taken) continue;
    chosenByReference.set(pair.referenceIndex, pair);
    usedCandidates.add(pair.candidateIndex);
  }

  const matches2 = scenario.references.map((reference, referenceIndex) => {
    const winner = chosenByReference.get(referenceIndex);
    // For an unmatched reference, the first pair in sorted order is its best-scoring one.
    const fallback = winner ? undefined : pairs.find((pair) => pair.referenceIndex === referenceIndex);
    return {
      scenarioId: scenario.id,
      referenceId: reference.id,
      candidateId: winner ? winner.candidate.id : fallback?.candidate.id ?? null,
      score: winner ? winner.score : fallback?.score ?? 0,
      matched: Boolean(winner),
      weight: reference.weight ?? 1,
      reason: winner ? winner.reason : fallback?.reason ?? "no candidates"
    };
  });

  const unusedCandidates = scenario.candidates.length - usedCandidates.size;
  return buildScenarioScore(scenario, matches2, unusedCandidates);
}
|
|
9812
|
+
/**
 * Run the matcher on one reference/candidate pair and normalise its result.
 *
 * @param {Object} scenario - Scenario passed to the matcher as third argument; its `id` appears in the error message.
 * @param {Function} matcher - Called as `matcher(reference, candidate, scenario)`; must return `{ score, reason? }`.
 * @param {Object} reference - Reference item.
 * @param {Object} candidate - Candidate item.
 * @returns {{score: number, reason: string}} Score passed through `clamp012`; reason defaults to "".
 * @throws {Error} When the matcher returns a non-finite score.
 */
function scorePair(scenario, matcher, reference, candidate) {
  const outcome = matcher(reference, candidate, scenario);
  if (Number.isFinite(outcome.score)) {
    return { score: clamp012(outcome.score), reason: outcome.reason ?? "" };
  }
  throw new Error(`reference replay matcher returned non-finite score for ${scenario.id}:${reference.id}:${candidate.id}`);
}
|
|
9819
|
+
function buildScenarioScore(scenario, matches2, falsePositives) {
|
|
9706
9820
|
const matched = matches2.filter((match) => match.matched).length;
|
|
9707
9821
|
const total = scenario.references.length;
|
|
9708
|
-
const falsePositives = candidatesLeft.length;
|
|
9709
9822
|
const matchedWeight = matches2.filter((match) => match.matched).reduce((sum2, match) => sum2 + match.weight, 0);
|
|
9710
9823
|
const totalWeight = matches2.reduce((sum2, match) => sum2 + match.weight, 0);
|
|
9711
|
-
const
|
|
9824
|
+
const precision2 = ratio(matched, matched + falsePositives);
|
|
9712
9825
|
const recall = ratio(matched, total);
|
|
9713
9826
|
return {
|
|
9714
9827
|
scenarioId: scenario.id,
|
|
@@ -9718,9 +9831,9 @@ function scoreScenario(scenario, matcher, threshold) {
|
|
|
9718
9831
|
falsePositives,
|
|
9719
9832
|
matchedWeight,
|
|
9720
9833
|
totalWeight,
|
|
9721
|
-
precision,
|
|
9834
|
+
precision: precision2,
|
|
9722
9835
|
recall,
|
|
9723
|
-
f1: f1(
|
|
9836
|
+
f1: f1(precision2, recall),
|
|
9724
9837
|
matches: matches2
|
|
9725
9838
|
};
|
|
9726
9839
|
}
|
|
@@ -9738,7 +9851,7 @@ function aggregateScenarioScores(scores) {
|
|
|
9738
9851
|
const falsePositives = sum(scores.map((score) => score.falsePositives));
|
|
9739
9852
|
const matchedWeight = sum(scores.map((score) => score.matchedWeight));
|
|
9740
9853
|
const totalWeight = sum(scores.map((score) => score.totalWeight));
|
|
9741
|
-
const
|
|
9854
|
+
const precision2 = ratio(matched, matched + falsePositives);
|
|
9742
9855
|
const recall = ratio(matched, total);
|
|
9743
9856
|
return {
|
|
9744
9857
|
matched,
|
|
@@ -9746,9 +9859,9 @@ function aggregateScenarioScores(scores) {
|
|
|
9746
9859
|
falsePositives,
|
|
9747
9860
|
matchedWeight,
|
|
9748
9861
|
totalWeight,
|
|
9749
|
-
precision,
|
|
9862
|
+
precision: precision2,
|
|
9750
9863
|
recall,
|
|
9751
|
-
f1: f1(
|
|
9864
|
+
f1: f1(precision2, recall),
|
|
9752
9865
|
weightedRecall: ratio(matchedWeight, totalWeight)
|
|
9753
9866
|
};
|
|
9754
9867
|
}
|
|
@@ -9768,8 +9881,8 @@ function emptyAggregate() {
|
|
|
9768
9881
|
function hasSplit(score, split) {
|
|
9769
9882
|
return score.bySplit[split] !== void 0;
|
|
9770
9883
|
}
|
|
9771
|
-
function f1(
|
|
9772
|
-
return
|
|
9884
|
+
/**
 * Harmonic mean of precision and recall.
 *
 * @param {number} precision2 - Precision value.
 * @param {number} recall - Recall value.
 * @returns {number} F1 score, or 0 when precision and recall sum to zero.
 */
function f1(precision2, recall) {
  if (precision2 + recall === 0) {
    return 0;
  }
  return 2 * precision2 * recall / (precision2 + recall);
}
|
|
9774
9887
|
function ratio(numerator, denominator) {
|
|
9775
9888
|
return denominator > 0 ? numerator / denominator : 0;
|
|
@@ -9854,6 +9967,624 @@ var STOP_WORDS = /* @__PURE__ */ new Set([
|
|
|
9854
9967
|
"where",
|
|
9855
9968
|
"which"
|
|
9856
9969
|
]);
|
|
9970
|
+
|
|
9971
|
+
// src/reference-replay-steering.ts
|
|
9972
|
+
/**
 * Flatten reference-replay runs into one steering row per (run, case).
 *
 * @param {Iterable<Object>} runs - Runs, each with `id`, optional `variantId` and `metadata`, and iterable `cases`.
 * @param {{bundleForRun?: Function, scoreForCase?: Function}} [options]
 *   `bundleForRun(run)` overrides the default `{ id, metadata }` bundle;
 *   `scoreForCase(caseRun, run)` overrides the default score conversion.
 * @returns {Array<Object>} Rows with `variantId`, `scenarioId`, `bundle`, `score` and `metadata`.
 */
function referenceReplayRunsToSteeringRows(runs, options = {}) {
  const rows = [];
  const toRow = (run, variantId, bundle, caseRun) => {
    const score =
      options.scoreForCase?.(caseRun, run) ??
      referenceReplayScenarioToRunScore(caseRun.score, caseRun.durationMs);
    const caseScore = caseRun.score;
    return {
      variantId,
      scenarioId: caseRun.caseId,
      bundle,
      score,
      metadata: {
        runId: run.id,
        split: caseRun.split,
        task: caseRun.metadata?.task ?? caseRun.metadata?.repo ?? caseRun.caseId,
        referenceCount: caseRun.references.length,
        candidateCount: caseRun.candidates.length,
        matched: caseScore.matched,
        total: caseScore.total,
        falsePositives: caseScore.falsePositives,
        precision: caseScore.precision,
        recall: caseScore.recall,
        f1: caseScore.f1,
        error: caseRun.error,
        // Spread last, so case metadata overrides any same-named key above.
        ...caseRun.metadata ?? {}
      }
    };
  };
  for (const run of runs) {
    const variantId = run.variantId ?? run.id;
    const bundle = options.bundleForRun?.(run) ?? { id: variantId, metadata: run.metadata };
    for (const caseRun of run.cases) {
      rows.push(toRow(run, variantId, bundle, caseRun));
    }
  }
  return rows;
}
|
|
10006
|
+
/**
 * Convert a reference-replay scenario score into a run-score record.
 *
 * F1 feeds `success` and `finalGate`, recall feeds `goalProgress`, and
 * precision feeds `repoGroundedness`, `toolUseQuality` and (inverted)
 * `driftPenalty`.
 *
 * @param {{f1: number, recall: number, precision: number, total: number, matched: number}} scenarioScore
 * @param {number} [durationMs=0] - Duration in milliseconds, reported as non-negative seconds.
 * @returns {Object} Run-score record including two human-readable `notes`.
 */
function referenceReplayScenarioToRunScore(scenarioScore, durationMs = 0) {
  const { f1: success, recall, precision: precision2, total, matched } = scenarioScore;
  const hasReferences = total > 0;
  const nothingMatched = hasReferences && matched === 0;
  const summary = `reference-replay matched ${matched}/${total}`;
  const detail = `precision=${precision2.toFixed(3)} recall=${recall.toFixed(3)} f1=${success.toFixed(3)}`;
  return {
    success,
    goalProgress: recall,
    repoGroundedness: precision2,
    driftPenalty: 1 - precision2,
    toolUseQuality: precision2,
    patchQuality: 0,
    testReality: hasReferences ? 1 : 0,
    finalGate: success,
    reviewerBlockers: nothingMatched ? 1 : 0,
    costUsd: 0,
    wallSeconds: Math.max(0, durationMs / 1e3),
    notes: [summary, detail]
  };
}
|
|
10029
|
+
|
|
10030
|
+
// src/prompt-evolution.ts
|
|
10031
|
+
/**
 * Trial cache backed by an in-process Map.
 */
var InMemoryTrialCache = class {
  constructor() {
    // Backing storage, keyed by the caller-supplied cache key.
    this.store = new Map();
  }

  /** Look up a cached value; yields undefined when the key is absent. */
  get(key) {
    const cached = this.store.get(key);
    return cached;
  }

  /** Store (or overwrite) the value for a key. */
  set(key, value) {
    this.store.set(key, value);
  }

  /** Number of cached entries. */
  size() {
    const { size } = this.store;
    return size;
  }

  /** Remove every cached entry. */
  clear() {
    this.store.clear();
  }
};
|
|
10046
|
+
/**
 * Run a generational prompt-evolution loop.
 *
 * Each generation scores the population, aggregates the trials, computes the
 * Pareto front and a scalar ranking, records a report, and then either stops
 * (last generation, or no change in winner and front versus the previous
 * generation) or builds the next population.
 *
 * @param {Object} config - Run configuration. Reads `seedVariants`, `generations`,
 *   `scenarioIds`, `objectives`, `scalarWeights`, `runId`, `target`,
 *   `earlyStopOnNoImprovement` and the optional `onProgress` callback; the rest
 *   is consumed by the helpers it is passed to.
 * @returns {Promise<Object>} `{ runId, target, generations, bestVariant, bestAggregate }`.
 */
async function runPromptEvolution(config) {
  const history = [];
  let population = [...config.seedVariants];
  let bestVariant = population[0];
  let bestAggregate = null;

  for (let generation = 0; generation < config.generations; generation++) {
    config.onProgress?.({ type: "generation-start", generation, populationSize: population.length });

    const trials = await scorePopulation(population, config, generation);
    const aggregates = aggregateTrials(population, config.scenarioIds, trials);
    const front = paretoFrontierWithCrowding(aggregates, config.objectives);
    const paretoFrontIds = front.map((entry) => entry.candidate.variantId);

    const ranking = scalarScore(aggregates, config.objectives, { weights: config.scalarWeights });
    ranking.sort((a, b) => b.score - a.score);
    const winnerId = ranking[0]?.candidate.variantId ?? aggregates[0]?.variantId ?? population[0].id;

    const report = {
      runId: config.runId,
      target: config.target,
      generation,
      variants: population,
      aggregates,
      paretoFrontIds,
      winnerId,
      trials
    };
    history.push(report);
    config.onProgress?.({ type: "generation-complete", report });

    // Track the latest scalar winner as the running best.
    const winnerAggregate = aggregates.find((aggregate) => aggregate.variantId === winnerId);
    if (winnerAggregate) {
      const winner = population.find((variant) => variant.id === winnerId);
      if (winner) bestVariant = winner;
      bestAggregate = winnerAggregate;
    }

    // Early stop is on unless explicitly disabled with `false`.
    if (config.earlyStopOnNoImprovement !== false && history.length >= 2) {
      const previous = history[history.length - 2];
      const uniqueFrontIds = [...new Set(paretoFrontIds)];
      const unchanged = previous.winnerId === winnerId && samePopulation(previous.paretoFrontIds, uniqueFrontIds);
      if (unchanged) {
        config.onProgress?.({ type: "converged", generation, reason: "no improvement vs previous generation" });
        break;
      }
    }

    // The final generation has no successor population to build.
    if (generation === config.generations - 1) break;
    population = await nextPopulation(population, aggregates, trials, front, config, generation + 1);
  }

  return {
    runId: config.runId,
    target: config.target,
    generations: history,
    bestVariant,
    bestAggregate: bestAggregate ?? aggregateTrials(population, config.scenarioIds, []).find((aggregate) => aggregate.variantId === bestVariant.id)
  };
}
|
|
10097
|
+
/**
 * Score every (variant, scenario, repetition) combination of a population.
 *
 * A trial found in `config.cache` is reused; otherwise
 * `config.scoreAdapter.score` is called and its result written to the cache.
 * A "trial-complete" progress event is emitted for every trial either way.
 *
 * @param {Iterable<Object>} population - Variants to score; each has an `id`.
 * @param {Object} config - Reads `scenarioIds`, `reps`, `cache`, `scoreAdapter`, `onProgress`, `scoreConcurrency`.
 * @param {number} generation - Generation index reported in progress events.
 * @returns {Promise<Array<Object>>} Trial results as returned by `runWithConcurrency`.
 */
async function scorePopulation(population, config, generation) {
  const announce = (variant, scenarioId, rep, trial, cached) => {
    config.onProgress?.({
      type: "trial-complete",
      generation,
      variantId: variant.id,
      scenarioId,
      rep,
      ok: trial.ok,
      score: trial.score,
      cached
    });
  };

  const runTrial = async (variant, scenarioId, rep) => {
    const cacheKey = `${variant.id}|${scenarioId}|${rep}`;
    const hit = config.cache?.get(cacheKey);
    if (hit) {
      announce(variant, scenarioId, rep, hit, true);
      return hit;
    }
    const fresh = await config.scoreAdapter.score({ variant, scenarioId, rep });
    config.cache?.set(cacheKey, fresh);
    announce(variant, scenarioId, rep, fresh, false);
    return fresh;
  };

  const jobs = [];
  for (const variant of population) {
    for (const scenarioId of config.scenarioIds) {
      for (let rep = 0; rep < config.reps; rep++) {
        jobs.push(() => runTrial(variant, scenarioId, rep));
      }
    }
  }
  return runWithConcurrency(jobs, config.scoreConcurrency);
}
|
|
10137
|
+
/**
 * Run async job thunks with a bounded number of concurrent workers.
 *
 * @param {Array<function(): Promise<*>>} jobs - Zero-argument functions returning a promise.
 * @param {number} [concurrency] - Maximum jobs in flight. A missing or
 *   non-numeric value is treated as 1, values below 1 are raised to 1, and
 *   fractional values are rounded down.
 * @returns {Promise<Array<*>>} Results in the same order as `jobs`.
 */
async function runWithConcurrency(jobs, concurrency) {
  const results = new Array(jobs.length);
  // Math.max(1, undefined) is NaN, which used to create zero workers, so no
  // job ever ran. Normalise the limit explicitly instead.
  const requested = Number(concurrency);
  const limit = Number.isNaN(requested) ? 1 : Math.max(1, Math.floor(requested));
  // More workers than jobs would only spin up and exit; this also keeps an
  // infinite limit from requesting an unbounded worker array.
  const workerCount = Math.min(limit, jobs.length);
  let next = 0;
  async function worker() {
    while (true) {
      // Claiming the index is synchronous, so two workers never take the same job.
      const i = next++;
      if (i >= jobs.length) return;
      results[i] = await jobs[i]();
    }
  }
  await Promise.all(Array.from({ length: workerCount }, () => worker()));
  return results;
}
|
|
10151
|
+
/**
 * Roll trial results up into per-scenario and per-variant aggregates.
 *
 * Means are taken over successful (`ok`) trials only; `okRate` is the share
 * of successful trials among all trials of a scenario (0 when there are none).
 * Variant-level figures are means of the per-scenario figures.
 *
 * @param {Array<Object>} population - Variants; each has an `id`.
 * @param {Array<string>} scenarioIds - Scenario identifiers to report on.
 * @param {Array<Object>} trials - Trial records with `variantId`, `scenarioId`, `ok`, `score` and optional `cost`, `durationMs`, `metrics`.
 * @returns {Array<Object>} One aggregate per variant, in population order.
 */
function aggregateTrials(population, scenarioIds, trials) {
  const average = (items, pick) => mean5(items.map((item) => pick(item)));

  const summariseScenario = (variantId, scenarioId, variantTrials) => {
    const scenarioTrials = variantTrials.filter((trial) => trial.scenarioId === scenarioId);
    const okTrials = scenarioTrials.filter((trial) => trial.ok);
    const metrics = aggregateMetrics(okTrials.map((trial) => trial.metrics ?? {}));
    const attempted = scenarioTrials.length;
    return {
      variantId,
      scenarioId,
      meanScore: average(okTrials, (trial) => trial.score),
      meanCost: average(okTrials, (trial) => trial.cost ?? 0),
      meanDurationMs: average(okTrials, (trial) => trial.durationMs ?? 0),
      okRate: attempted === 0 ? 0 : okTrials.length / attempted,
      trials: attempted,
      metrics
    };
  };

  return population.map((variant) => {
    const variantTrials = trials.filter((trial) => trial.variantId === variant.id);
    const scenarios = scenarioIds.map((scenarioId) => summariseScenario(variant.id, scenarioId, variantTrials));
    return {
      variantId: variant.id,
      meanScore: average(scenarios, (scenario) => scenario.meanScore),
      meanCost: average(scenarios, (scenario) => scenario.meanCost),
      meanDurationMs: average(scenarios, (scenario) => scenario.meanDurationMs),
      okRate: average(scenarios, (scenario) => scenario.okRate),
      scenarios,
      metrics: aggregateMetrics(scenarios.map((scenario) => scenario.metrics))
    };
  });
}
|
|
10180
|
+
/**
 * Average each metric key across a list of metric records.
 *
 * Only finite numeric values are counted; a key with no finite value in any
 * row is omitted from the result.
 *
 * @param {Iterable<Object<string, *>>} rows - Metric records.
 * @returns {Object<string, number>} Mean per key, in first-seen key order.
 */
function aggregateMetrics(rows) {
  // key -> running { sum, count } of the finite values seen for that key
  const totals = new Map();
  for (const row of rows) {
    for (const [key, value] of Object.entries(row)) {
      if (!Number.isFinite(value)) continue;
      let running = totals.get(key);
      if (running === undefined) {
        running = { sum: 0, count: 0 };
        totals.set(key, running);
      }
      running.sum += value;
      running.count += 1;
    }
  }
  const out = {};
  totals.forEach((running, key) => {
    out[key] = running.sum / running.count;
  });
  return out;
}
|
|
10194
|
+
/**
 * Arithmetic mean of a numeric array.
 *
 * @param {Array<number>} xs - Values to average.
 * @returns {number} The mean, or 0 for an empty array.
 */
function mean5(xs) {
  const count = xs.length;
  if (count === 0) {
    return 0;
  }
  let total = 0;
  xs.forEach((x) => {
    total += x;
  });
  return total / count;
}
|
|
10198
|
+
/**
 * Build the next generation: Pareto-front survivors plus mutated children.
 *
 * The variant with the highest scalar score is the parent. The mutate adapter
 * is asked for enough children to refill the population up to
 * `config.populationSize`, and its output is truncated to that count.
 *
 * @param {Array<Object>} current - Current population.
 * @param {Array<Object>} aggregates - Per-variant aggregates for `current`.
 * @param {Array<Object>} trials - Trial records for the current generation.
 * @param {Array<{candidate: Object}>} front - Pareto-front entries; `candidate.variantId` marks survivors.
 * @param {Object} config - Reads `objectives`, `scalarWeights`, `populationSize`, `mutateAdapter`.
 * @param {number} nextGeneration - Generation index stamped on each child.
 * @returns {Promise<Array<Object>>} Survivors followed by the new children.
 */
async function nextPopulation(current, aggregates, trials, front, config, nextGeneration) {
  const frontIds = new Set(front.map((entry) => entry.candidate.variantId));
  const survivors = current.filter((variant) => frontIds.has(variant.id));

  const ranking = scalarScore(aggregates, config.objectives, { weights: config.scalarWeights });
  ranking.sort((a, b) => b.score - a.score);
  const parentId = ranking[0]?.candidate.variantId ?? current[0].id;
  const parent = current.find((variant) => variant.id === parentId) ?? current[0];
  const parentAggregate = aggregates.find((aggregate) => aggregate.variantId === parent.id) ?? aggregates[0];
  const topTrials = topKTrialsByScore(trials, parent.id, 3);
  const bottomTrials = bottomKTrialsByScore(trials, parent.id, 3);

  const childCount = Math.max(0, config.populationSize - survivors.length);
  // Written as a negated `>` so a NaN count also skips mutation.
  if (!(childCount > 0)) {
    return [...survivors];
  }

  const proposed = await config.mutateAdapter.mutate({
    parent,
    parentAggregate,
    topTrials,
    bottomTrials,
    childCount,
    generation: nextGeneration
  });
  // Cap the adapter's output and stamp lineage onto each child.
  const children = proposed
    .slice(0, childCount)
    .map((child) => ({ ...child, generation: nextGeneration, parentId: parent.id }));
  return [...survivors, ...children];
}
|
|
10222
|
+
/**
 * Pick the `k` highest-scoring successful trials of one variant.
 *
 * @param {Array<Object>} trials - Trial records with `variantId`, `ok`, `score`.
 * @param {string} variantId - Variant to select trials for.
 * @param {number} k - Maximum number of trials to return.
 * @returns {Array<Object>} Up to `k` trials, best score first; the input array is left untouched.
 */
function topKTrialsByScore(trials, variantId, k) {
  const eligible = trials.filter((trial) => trial.variantId === variantId && trial.ok);
  eligible.sort((left, right) => right.score - left.score);
  return eligible.slice(0, k);
}
|
|
10225
|
+
/**
 * Pick the `k` lowest-scoring successful trials of one variant.
 *
 * @param {Array<Object>} trials - Trial records with `variantId`, `ok`, `score`.
 * @param {string} variantId - Variant to select trials for.
 * @param {number} k - Maximum number of trials to return.
 * @returns {Array<Object>} Up to `k` trials, worst score first; the input array is left untouched.
 */
function bottomKTrialsByScore(trials, variantId, k) {
  const eligible = trials.filter((trial) => trial.variantId === variantId && trial.ok);
  eligible.sort((left, right) => left.score - right.score);
  return eligible.slice(0, k);
}
|
|
10228
|
+
/**
 * Check whether two id lists describe the same population, ignoring order.
 *
 * @param {Array<*>} a - First list of ids.
 * @param {Array<*>} b - Second list of ids.
 * @returns {boolean} True when both lists have the same length and the same set of distinct ids.
 */
function samePopulation(a, b) {
  if (a.length !== b.length) return false;
  const setA = new Set(a);
  const setB = new Set(b);
  // Comparing distinct counts makes the check symmetric: otherwise
  // ["a","b"] and ["a","a"] would be reported as the same population.
  if (setA.size !== setB.size) return false;
  return b.every((id) => setA.has(id));
}
|
|
10233
|
+
|
|
10234
|
+
// src/golden-matcher.ts
|
|
10235
|
+
/**
 * Check which golden expectations are satisfied by any candidate.
 *
 * Candidate text is extracted with `options.text` (or the default extractor)
 * and lower-cased before matching.
 *
 * @param {Array<Object>} goldens - Golden expectations, each checked with `goldenMatched`.
 * @param {Array<*>} candidates - Candidate outputs to search.
 * @param {{text?: function(*): string}} [options] - Optional text extractor.
 * @returns {{matches: Array<boolean>, hits: number, total: number}} Per-golden results and counts.
 */
function matchGoldens(goldens, candidates, options = {}) {
  const toText = options.text ?? defaultExtract5;
  const haystacks = candidates.map((candidate) => toText(candidate).toLowerCase());
  const matches2 = goldens.map((golden) => goldenMatched(golden, haystacks));
  const hits = matches2.reduce((count, matched) => (matched ? count + 1 : count), 0);
  return { matches: matches2, hits, total: goldens.length };
}
|
|
10245
|
+
/**
 * Default candidate-to-text conversion.
 *
 * - strings are returned unchanged;
 * - non-null objects (arrays included) yield their own string-valued
 *   properties joined by a single space, other property values are ignored;
 * - anything else is stringified, with `null`/`undefined` becoming `""`.
 *
 * @param {*} candidate - Value to render.
 * @returns {string} Text used for matching.
 */
function defaultExtract5(candidate) {
  if (typeof candidate === "string") return candidate;
  const isObject = candidate !== null && typeof candidate === "object";
  if (!isObject) return String(candidate ?? "");
  return Object.values(candidate)
    .filter((value) => typeof value === "string")
    .join(" ");
}
|
|
10256
|
+
/**
 * Decides whether one golden is satisfied by any of the haystack texts.
 *
 * Literal phrases in `golden.any` are tried first (lower-cased and trimmed;
 * empty phrases are ignored), then the patterns in `golden.anyRegex`, which
 * are compiled case-insensitively. A pattern that fails to compile is skipped
 * rather than reported.
 *
 * @param {{ any: Iterable<string>, anyRegex?: Iterable<string> }} golden - Golden definition.
 * @param {string[]} haystacks - Candidate texts; phrases are compared in lower case,
 *   so callers are expected to pass lower-cased text.
 * @returns {boolean} `true` as soon as one phrase or pattern matches one haystack.
 */
function goldenMatched(golden, haystacks) {
  const compile = (source) => {
    try {
      return new RegExp(source, "i");
    } catch {
      // Best effort: an invalid pattern simply cannot match.
      return null;
    }
  };
  for (const phrase of golden.any) {
    const needle = phrase.toLowerCase().trim();
    if (needle && haystacks.some((text) => text.includes(needle))) return true;
  }
  for (const source of golden.anyRegex ?? []) {
    const matcher = compile(source);
    if (matcher !== null && haystacks.some((text) => matcher.test(text))) return true;
  }
  return false;
}
|
|
10273
|
+
/**
 * Default severity-to-weight table (the default `weights` argument of
 * `weightedRecall`): a critical golden counts three times, a major one twice,
 * and a minor one once.
 */
var DEFAULT_SEVERITY_WEIGHTS = { critical: 3, major: 2, minor: 1 };
|
|
10278
|
+
/**
 * Severity-weighted recall: matched weight divided by total weight.
 *
 * A golden's weight is `weights[golden.severity]`, or 1 when the table has no
 * entry for that severity.
 *
 * @param {Array<{ severity: string }>} goldens - Goldens that were looked for.
 * @param {{ matches: boolean[] }} result - Match flags aligned with `goldens` by index.
 * @param {Record<string, number>} [weights] - Severity weight table.
 * @returns {number} Recall in weight units; 1 when there are no goldens or
 *   when the total weight is 0.
 */
function weightedRecall(goldens, result, weights = DEFAULT_SEVERITY_WEIGHTS) {
  if (goldens.length === 0) return 1;
  const weightOf = (golden) => weights[golden.severity] ?? 1;
  let total = 0;
  for (const golden of goldens) total += weightOf(golden);
  // Nothing carries weight, so there is nothing that could have been missed.
  if (total === 0) return 1;
  let hit = 0;
  for (let i = 0; i < goldens.length; i++) {
    if (result.matches[i]) hit += weightOf(goldens[i]);
  }
  return hit / total;
}
|
|
10288
|
+
/**
 * Fraction of candidates that match at least one golden.
 *
 * A candidate matches when its lower-cased text contains any golden phrase or
 * satisfies any golden regex (compiled case-insensitively). Phrases are
 * lower-cased and trimmed, and empty ones are dropped — the same normalization
 * `goldenMatched` applies — so a whitespace-only phrase can no longer match
 * every text that happens to contain that whitespace.
 *
 * @param {Array<{ any: string[], anyRegex?: string[] }>} goldens - Golden definitions.
 * @param {Array<*>} candidates - Items produced by the system under test.
 * @param {{ text?: (candidate: *) => string }} [options] - Optional text
 *   extractor; `defaultExtract5` is used when omitted.
 * @returns {number} Matched candidates divided by all candidates; 1 when there
 *   are no candidates.
 */
function precision(goldens, candidates, options = {}) {
  if (candidates.length === 0) return 1;
  const extract = options.text ?? defaultExtract5;
  // Only "does any golden match" matters, so phrases and patterns can be pooled
  // across goldens and prepared once instead of once per candidate.
  const needles = [];
  const patterns = [];
  for (const golden of goldens) {
    for (const phrase of golden.any) {
      const needle = phrase.toLowerCase().trim();
      if (needle) needles.push(needle);
    }
    for (const source of golden.anyRegex ?? []) {
      try {
        patterns.push(new RegExp(source, "i"));
      } catch {
        // Best effort, as in goldenMatched: an invalid pattern simply cannot match.
      }
    }
  }
  let matched = 0;
  for (const cand of candidates) {
    const haystack = extract(cand).toLowerCase();
    const matchedAny = needles.some((needle) => haystack.includes(needle)) || patterns.some((re) => re.test(haystack));
    if (matchedAny) matched++;
  }
  return matched / candidates.length;
}
|
|
10307
|
+
|
|
10308
|
+
// src/orthogonality.ts
|
|
10309
|
+
/**
 * Measures how different the findings of several passes are from each other.
 *
 * Every pass is reduced to a bag of words (`bagOfWords`), all unordered pairs
 * are compared with `cosineSimilarity`, and the result is one minus the mean
 * pairwise similarity, clamped to [0, 1].
 *
 * @param {{ passes: Array<{ findings: Iterable<*> }>, text?: (item: *) => string, minTokenLength?: number }} input
 *   Passes to compare, an optional renderer (defaults to `defaultRender`) and
 *   an optional minimum token length (defaults to 4).
 * @returns {{ orthogonality: number, passCount: number, similarities: number[] }}
 *   With fewer than two passes there is nothing to compare: orthogonality is 1
 *   and `similarities` is empty.
 */
function passOrthogonality(input) {
  const { passes } = input;
  const passCount = passes.length;
  if (passCount < 2) {
    return { orthogonality: 1, passCount, similarities: [] };
  }
  const render = input.text ?? defaultRender;
  const minTokenLength = input.minTokenLength ?? 4;
  const bags = passes.map((pass) => bagOfWords(pass.findings, render, minTokenLength));
  // Pair order is (0,1), (0,2), ..., (1,2), ...
  const similarities = [];
  bags.forEach((bag, index) => {
    for (const later of bags.slice(index + 1)) {
      similarities.push(cosineSimilarity(bag, later));
    }
  });
  let total = 0;
  for (const value of similarities) total += value;
  const meanSimilarity = similarities.length === 0 ? 0 : total / similarities.length;
  return {
    orthogonality: Math.max(0, Math.min(1, 1 - meanSimilarity)),
    passCount,
    similarities
  };
}
|
|
10330
|
+
/**
 * Default finding-to-text conversion.
 *
 * Strings pass through untouched; for a non-null object the string-valued
 * properties are joined with single spaces (non-string values are skipped);
 * every other value is stringified, `null` and `undefined` giving `""`.
 *
 * @param {*} item - Finding to render.
 * @returns {string} Text representation of the finding.
 */
function defaultRender(item) {
  if (typeof item === "string") return item;
  if (item === null || typeof item !== "object") return String(item ?? "");
  const pieces = [];
  for (const value of Object.values(item)) {
    if (typeof value === "string") pieces.push(value);
  }
  return pieces.join(" ");
}
|
|
10341
|
+
/**
 * Builds a token-frequency map from a collection of items.
 *
 * Each item is rendered to text and lower-cased, then split on every run of
 * characters that is not a letter, combining mark or number. Splitting on
 * Unicode classes (instead of ASCII `a-z0-9` only) keeps words such as "café"
 * or "größe" intact rather than cutting them at every non-ASCII letter; for
 * pure ASCII text the tokens are the same as with the ASCII-only split.
 *
 * @param {Iterable<*>} items - Items to tokenize.
 * @param {(item: *) => string} render - Converts an item to text.
 * @param {number} minLen - Tokens shorter than this (in UTF-16 code units) are dropped.
 * @returns {Map<string, number>} Token to occurrence count.
 */
function bagOfWords(items, render, minLen) {
  const bag = /* @__PURE__ */ new Map();
  for (const item of items) {
    const text = render(item).toLowerCase();
    for (const tok of text.split(/[^\p{L}\p{M}\p{N}]+/u)) {
      if (tok.length >= minLen) bag.set(tok, (bag.get(tok) ?? 0) + 1);
    }
  }
  return bag;
}
|
|
10351
|
+
/**
 * Cosine similarity between two sparse count vectors.
 *
 * @param {Map<string, number>} a - First vector (key to count).
 * @param {Map<string, number>} b - Second vector (key to count).
 * @returns {number} Dot product divided by the product of the Euclidean norms;
 *   0 when either vector has zero magnitude.
 */
function cosineSimilarity(a, b) {
  const squaredNorm = (vector) => {
    let sum = 0;
    for (const [, count] of vector) sum += count * count;
    return sum;
  };
  const aSquared = squaredNorm(a);
  const bSquared = squaredNorm(b);
  if (aSquared === 0 || bSquared === 0) return 0;
  let dot = 0;
  for (const [key, count] of a) {
    const other = b.get(key);
    // Keys absent from `b` (or with a zero count) contribute nothing.
    if (other) dot += count * other;
  }
  return dot / (Math.sqrt(aSquared) * Math.sqrt(bSquared));
}
|
|
10364
|
+
|
|
10365
|
+
// src/promotion-gate.ts
|
|
10366
|
+
/**
 * Percentile-bootstrap confidence interval for the difference of means
 * (`candidate` minus `baseline`), with a promotion verdict.
 *
 * Both samples are resampled with replacement `iterations` times; the sorted
 * resampled deltas give the `alpha / 2` and `1 - alpha / 2` percentile bounds.
 *
 * Verdict:
 * - "ADVANCE" when the whole interval is above 0;
 * - "REVERT" when the whole interval is below 0;
 * - "KEEP" when the interval spans 0 and the observed delta is >= 0;
 * - "INCONCLUSIVE" otherwise, and whenever no interval can be computed (either
 *   sample empty, fewer than `minTotalSamples` scores in total, or an
 *   iteration count below 1). In that case the bounds are -Infinity/Infinity
 *   and the reported `iterations` is 0.
 *
 * @param {number[]} baseline - Scores of the current variant.
 * @param {number[]} candidate - Scores of the proposed variant.
 * @param {{ alpha?: number, iterations?: number, minTotalSamples?: number, seed?: number }} [options]
 *   `alpha` defaults to 0.05, `iterations` to 1000, `minTotalSamples` to 6;
 *   without `seed` the generator is seeded from the data via `hashSeed`, so
 *   identical inputs give identical intervals.
 * @returns {{ baselineMean: number, candidateMean: number, delta: number, ciLower: number, ciUpper: number, iterations: number, alpha: number, verdict: string }}
 */
function bootstrapCi(baseline, candidate, options = {}) {
  const alpha = options.alpha ?? 0.05;
  const iterations = options.iterations ?? 1e3;
  const minTotal = options.minTotalSamples ?? 6;
  const baselineMean = mean6(baseline);
  const candidateMean = mean6(candidate);
  const delta = candidateMean - baselineMean;
  const underSampled = baseline.length + candidate.length < minTotal || baseline.length === 0 || candidate.length === 0;
  // `!(iterations >= 1)` also catches NaN: without at least one resample there is no interval.
  if (underSampled || !(iterations >= 1)) {
    return {
      baselineMean,
      candidateMean,
      delta,
      ciLower: -Infinity,
      ciUpper: Infinity,
      iterations: 0,
      alpha,
      verdict: "INCONCLUSIVE"
    };
  }
  // Seeding is deferred until it is known that resampling will actually happen.
  const rng = mulberry32(options.seed ?? hashSeed(baseline, candidate));
  const deltas = new Array(iterations);
  for (let i = 0; i < iterations; i++) {
    const bResample = resample(baseline, rng);
    const cResample = resample(candidate, rng);
    deltas[i] = mean6(cResample) - mean6(bResample);
  }
  deltas.sort((a, b) => a - b);
  // Clamp both percentile indexes into [0, iterations - 1]. With very few
  // iterations the upper index would otherwise be -1, and with alpha outside
  // (0, 1] either index could leave the array, yielding `undefined` bounds.
  const lastIdx = iterations - 1;
  const clampIdx = (idx) => Math.min(lastIdx, Math.max(0, idx));
  const ciLower = deltas[clampIdx(Math.floor(alpha / 2 * iterations))];
  const ciUpper = deltas[clampIdx(Math.floor((1 - alpha / 2) * iterations) - 1)];
  let verdict;
  if (ciLower > 0) verdict = "ADVANCE";
  else if (ciUpper < 0) verdict = "REVERT";
  else if (delta >= 0) verdict = "KEEP";
  else verdict = "INCONCLUSIVE";
  return {
    baselineMean,
    candidateMean,
    delta,
    ciLower,
    ciUpper,
    iterations,
    alpha,
    verdict
  };
}
|
|
10413
|
+
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {ArrayLike<number>} xs - Values to average.
 * @returns {number} The mean, or 0 for an empty list.
 */
function mean6(xs) {
  const count = xs.length;
  if (count === 0) return 0;
  let total = 0;
  for (let i = 0; i < count; i++) total += xs[i];
  return total / count;
}
|
|
10419
|
+
/**
 * Draws a bootstrap resample: `xs.length` picks from `xs`, with replacement.
 *
 * @param {Array<*>} xs - Population to draw from.
 * @param {() => number} rng - Generator returning values in [0, 1); it is
 *   called exactly once per drawn element, in order.
 * @returns {Array<*>} New array of the same length as `xs`.
 */
function resample(xs, rng) {
  const size = xs.length;
  return Array.from({ length: size }, () => xs[Math.floor(rng() * size)]);
}
|
|
10424
|
+
/**
 * Creates a small deterministic pseudo-random generator (mulberry32 mixing
 * steps) from a 32-bit seed.
 *
 * The internal counter is wrapped to an unsigned 32-bit value on every step.
 * All later operations (`^`, `>>>`, `|`, `Math.imul`) only look at the low
 * 32 bits, so wrapping does not change any output; it only stops the counter
 * from growing past 2^53 (after roughly 4.9 million draws), where a
 * floating-point counter would start losing its low bits.
 *
 * @param {number} seed - Seed; coerced to an unsigned 32-bit integer.
 * @returns {() => number} Generator returning values in [0, 1).
 */
function mulberry32(seed) {
  let state = seed >>> 0;
  return () => {
    // 0x6D2B79F5 === 1831565813, the per-step increment.
    state = (state + 0x6D2B79F5) >>> 0;
    let r = state;
    r = Math.imul(r ^ (r >>> 15), r | 1);
    r ^= r + Math.imul(r ^ (r >>> 7), r | 61);
    return ((r ^ (r >>> 14)) >>> 0) / 4294967296;
  };
}
|
|
10434
|
+
/**
 * Derives a 32-bit seed from two numeric samples.
 *
 * The 8 bytes of each value's float64 representation are folded into a 32-bit
 * FNV-1a hash (offset basis 2166136261, prime 16777619), first over `a`, then
 * over `b`.
 *
 * @param {Iterable<number>} a - First sample.
 * @param {Iterable<number>} b - Second sample.
 * @returns {number} Unsigned 32-bit hash; 2166136261 when both samples are empty.
 */
function hashSeed(a, b) {
  const FNV_PRIME = 16777619;
  // One reusable 8-byte scratch cell exposes each number's raw bytes.
  const cell = new Float64Array(1);
  const octets = new Uint8Array(cell.buffer);
  let hash = 2166136261;
  for (const series of [a, b]) {
    for (const value of series) {
      cell[0] = value;
      for (let i = 0; i < octets.length; i++) {
        hash = Math.imul(hash ^ octets[i], FNV_PRIME);
      }
    }
  }
  return hash >>> 0;
}
|
|
10446
|
+
/**
 * Scores baseline and candidate outputs with the same judge and gates the
 * candidate on a bootstrap confidence interval of the score difference.
 *
 * Baseline outputs are scored first, then candidate outputs, each with up to
 * `judgeConcurrency` (default 4) judge calls in flight. `alpha`, `iterations`
 * and `seed` are forwarded to `bootstrapCi` only when they are defined.
 *
 * @param {{ baselineOutputs: Array<*>, candidateOutputs: Array<*>, judge: (output: *) => (number | Promise<number>), judgeConcurrency?: number, alpha?: number, iterations?: number, seed?: number }} args
 * @returns {Promise<object>} The `bootstrapCi` result extended with
 *   `baselineSamples` and `candidateSamples` (the number of scores on each side).
 */
async function judgeReplayGate(args) {
  const concurrency = args.judgeConcurrency ?? 4;
  const baselineScores = await scoreAll(args.baselineOutputs, args.judge, concurrency);
  const candidateScores = await scoreAll(args.candidateOutputs, args.judge, concurrency);
  // Copy only the options the caller actually set, so bootstrapCi's defaults apply otherwise.
  const ciOptions = {};
  for (const key of ["alpha", "iterations", "seed"]) {
    if (args[key] !== void 0) ciOptions[key] = args[key];
  }
  const ci = bootstrapCi(baselineScores, candidateScores, ciOptions);
  return {
    ...ci,
    baselineSamples: baselineScores.length,
    candidateSamples: candidateScores.length
  };
}
|
|
10461
|
+
/**
 * Scores every output with `judge`, running a bounded number of calls at once.
 *
 * Workers pull the next unscored index from a shared counter, so results keep
 * the order of `outputs` regardless of completion order. A judge result that
 * is not a finite number is recorded as 0.
 *
 * The worker count is sanitized: a missing or NaN `concurrency` falls back to
 * 1 (it used to start no worker at all and return an array of holes),
 * fractional values are floored, values below 1 are raised to 1, and the count
 * never exceeds `outputs.length` — which also makes `Infinity` mean "one
 * worker per output" instead of throwing a RangeError.
 *
 * @param {Array<*>} outputs - Items to score.
 * @param {(output: *) => (number | Promise<number>)} judge - Scoring function.
 * @param {number} concurrency - Maximum number of judge calls in flight.
 * @returns {Promise<number[]>} Scores aligned with `outputs`. Rejects if any judge call rejects.
 */
async function scoreAll(outputs, judge, concurrency) {
  const results = new Array(outputs.length);
  let next = 0;
  const worker = async () => {
    while (next < outputs.length) {
      // Claim the index synchronously, before the first await, so no two workers share one.
      const i = next++;
      const v = await judge(outputs[i]);
      results[i] = Number.isFinite(v) ? v : 0;
    }
  };
  const parsed = Math.floor(Number(concurrency));
  const requested = Number.isNaN(parsed) ? 1 : parsed;
  const workerCount = Math.min(outputs.length, Math.max(1, requested));
  await Promise.all(Array.from({ length: workerCount }, () => worker()));
  return results;
}
|
|
10475
|
+
|
|
10476
|
+
// src/reflective-mutation.ts
|
|
10477
|
+
/**
 * Mutation primitives listed under "Allowed mutation primitives" by
 * `buildReflectionPrompt` when the context supplies none of its own.
 */
var DEFAULT_MUTATION_PRIMITIVES = [
  'Strengthen an imperative ("should" \u2192 "must")',
  "Add a concrete example pulled from a missed-golden phrase",
  "Remove a redundant rule that did not improve recall",
  'Add a counterfactual ("if X is missing, the score is capped at Y")',
  "Reorder sections so the highest-impact rule is first",
  "Replace abstract language with a domain-specific noun the trial misses"
];
|
|
10485
|
+
/**
 * Renders the markdown prompt that asks a model to propose mutations of one
 * prompt component.
 *
 * Sections, in order: target header and instructions, the current variant as
 * JSON, failures (bottom trials, with missed expectations and up to 600
 * characters of emitted output each), successes (top trials), the allowed
 * mutation primitives, and the expected JSON output schema. The failure and
 * success sections are omitted when their trial list is empty.
 *
 * @param {object} ctx - Reflection context. Reads `target`, `parentPayload`,
 *   `childCount`, `topTrials`, `bottomTrials` and the optional
 *   `mutationPrimitives` (defaults to `DEFAULT_MUTATION_PRIMITIVES`). Trials
 *   provide `id`, `score`, and optionally `inputName`, `expectations`
 *   (entries with `id`, `phrase`, `matched`) and `emitted`.
 * @returns {string} The prompt, lines joined with "\n".
 */
function buildReflectionPrompt(ctx) {
  const primitives = ctx.mutationPrimitives ?? DEFAULT_MUTATION_PRIMITIVES;
  const { target, topTrials, bottomTrials, childCount } = ctx;
  const plural = childCount === 1 ? "" : "s";
  const inputSuffix = (trial) => (trial.inputName ? ` (${trial.inputName})` : "");
  const lines = [
    `# Mutation target: ${target}`,
    "",
    `You are tuning the prompt component named \`${target}\`. The current variant is shown below; you have ${topTrials.length} top trials and ${bottomTrials.length} bottom trials as evidence. Propose ${childCount} mutation${plural} that fix specific weaknesses visible in the bottom trials. Avoid blank rephrasings.`,
    "",
    "## Current variant",
    "```json",
    JSON.stringify(ctx.parentPayload, null, 2),
    "```",
    ""
  ];

  if (bottomTrials.length > 0) {
    lines.push("## Failures (bottom trials) \u2014 what went wrong", "");
    for (const trial of bottomTrials) {
      lines.push(`### Trial \`${trial.id}\` \u2014 score ${trial.score.toFixed(2)}${inputSuffix(trial)}`);
      const missed = (trial.expectations ?? []).filter((expectation) => !expectation.matched);
      if (missed.length > 0) {
        lines.push("", "**Missed expectations:**");
        for (const miss of missed) {
          lines.push(`- \`${miss.id}\`: should match phrase \`${quote(miss.phrase)}\``);
        }
      }
      if (trial.emitted) {
        lines.push("", "**What the agent emitted:**", "```", truncate3(trial.emitted, 600), "```");
      }
      lines.push("");
    }
  }

  if (topTrials.length > 0) {
    lines.push("## Successes (top trials) \u2014 what to preserve", "");
    for (const trial of topTrials) {
      lines.push(`- \`${trial.id}\`: score ${trial.score.toFixed(2)}${inputSuffix(trial)}`);
    }
    lines.push("");
  }

  lines.push("## Allowed mutation primitives", "");
  for (const primitive of primitives) lines.push(`- ${primitive}`);

  // Placeholder object that shows the model the expected response shape.
  const schemaExample = {
    proposals: [
      {
        label: "<short label, \u2264 40 chars>",
        rationale: "<which failure this targets and which primitive you used>",
        payload: "<full payload of the new variant \u2014 same shape as the current variant>"
      }
    ]
  };
  lines.push(
    "",
    "## Output schema",
    "",
    "Respond with a JSON object \u2014 no prose, no markdown fences:",
    "```json",
    JSON.stringify(schemaExample, null, 2),
    "```"
  );
  return lines.join("\n");
}
|
|
10552
|
+
/**
 * Shortens a string to at most `max` UTF-16 code units and marks the cut.
 *
 * If the cut would fall between the two halves of a surrogate pair, it is
 * moved one unit earlier so the result never ends in a lone surrogate (which
 * would show up as a broken character in the prompt).
 *
 * @param {string} s - Text to shorten.
 * @param {number} max - Maximum number of code units kept from `s`.
 * @returns {string} `s` unchanged when it fits, otherwise the kept prefix
 *   followed by "… [truncated]".
 */
function truncate3(s, max) {
  if (s.length <= max) return s;
  let cut = max;
  const before = s.charCodeAt(cut - 1);
  const after = s.charCodeAt(cut);
  const splitsPair = before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
  if (splitsPair) cut -= 1;
  return s.slice(0, cut) + "\u2026 [truncated]";
}
|
|
10556
|
+
/**
 * Prefixes every backtick in `s` with a backslash.
 *
 * @param {string} s - Text to escape.
 * @returns {string} Copy of `s` with each "`" replaced by "\`".
 */
function quote(s) {
  return s.split("`").join("\\`");
}
|
|
10559
|
+
/**
 * Extracts mutation proposals from a model's reply to the reflection prompt.
 *
 * The reply may be wrapped in a markdown code fence and may carry text around
 * the JSON: everything from the first "{" to the last "}" is parsed. Parsing
 * is lenient — any malformed or unexpected input yields an empty list instead
 * of throwing.
 *
 * @param {string} raw - Raw model reply.
 * @param {number} [maxProposals] - Upper bound on returned proposals; omit
 *   for no limit. A bound of 0 (or less) returns no proposals.
 * @returns {Array<{ label: string, rationale: string, payload: * }>} Entries of
 *   the reply's `proposals` array that are objects with a `payload` key.
 *   `label` defaults to "mutation" and `rationale` to "" when they are not
 *   strings.
 */
function parseReflectionResponse(raw, maxProposals) {
  if (typeof raw !== "string") return [];
  let text = raw.trim();
  if (text.startsWith("```")) text = text.replace(/^```(?:json)?\n?/, "").replace(/\n?```$/, "");
  const start = text.indexOf("{");
  const end = text.lastIndexOf("}");
  if (start < 0 || end <= start) return [];
  let parsed;
  try {
    parsed = JSON.parse(text.slice(start, end + 1));
  } catch {
    // Not valid JSON: treated as "no proposals", like every other malformed reply.
    return [];
  }
  if (!parsed || typeof parsed !== "object") return [];
  const proposalsRaw = parsed.proposals;
  if (!Array.isArray(proposalsRaw)) return [];
  const out = [];
  for (const p of proposalsRaw) {
    // The cap is checked before adding, so a cap of 0 really yields nothing.
    if (maxProposals !== void 0 && out.length >= maxProposals) break;
    if (!p || typeof p !== "object") continue;
    if (!("payload" in p)) continue;
    out.push({
      label: typeof p.label === "string" ? p.label : "mutation",
      rationale: typeof p.rationale === "string" ? p.rationale : "",
      payload: p.payload
    });
  }
  return out;
}
|
|
9857
10588
|
export {
|
|
9858
10589
|
AgentDriver,
|
|
9859
10590
|
AxGepaSteeringOptimizer,
|
|
@@ -9868,10 +10599,12 @@ export {
|
|
|
9868
10599
|
DEFAULT_RULES as DEFAULT_FAILURE_RULES,
|
|
9869
10600
|
DEFAULT_FINDERS,
|
|
9870
10601
|
DEFAULT_HARNESS_OBJECTIVES,
|
|
10602
|
+
DEFAULT_MUTATION_PRIMITIVES,
|
|
9871
10603
|
DEFAULT_MUTATORS,
|
|
9872
10604
|
DEFAULT_REDACTION_RULES,
|
|
9873
10605
|
DEFAULT_RED_TEAM_CORPUS,
|
|
9874
10606
|
DEFAULT_RUN_SCORE_WEIGHTS,
|
|
10607
|
+
DEFAULT_SEVERITY_WEIGHTS,
|
|
9875
10608
|
Dataset,
|
|
9876
10609
|
DockerSandboxDriver,
|
|
9877
10610
|
DualAgentBench,
|
|
@@ -9886,6 +10619,7 @@ export {
|
|
|
9886
10619
|
InMemoryExperimentStore,
|
|
9887
10620
|
InMemoryOutcomeStore,
|
|
9888
10621
|
InMemoryTraceStore,
|
|
10622
|
+
InMemoryTrialCache,
|
|
9889
10623
|
InMemoryWorkspaceInspector,
|
|
9890
10624
|
JudgeRunner,
|
|
9891
10625
|
LlmCallError,
|
|
@@ -9921,7 +10655,9 @@ export {
|
|
|
9921
10655
|
benjaminiHochberg,
|
|
9922
10656
|
bisect,
|
|
9923
10657
|
bonferroni,
|
|
10658
|
+
bootstrapCi,
|
|
9924
10659
|
budgetBreachView,
|
|
10660
|
+
buildReflectionPrompt,
|
|
9925
10661
|
buildReviewerPrompt,
|
|
9926
10662
|
buildTrajectory,
|
|
9927
10663
|
byteLengthRange,
|
|
@@ -9959,6 +10695,7 @@ export {
|
|
|
9959
10695
|
createLlmReviewer,
|
|
9960
10696
|
createSemanticConceptJudge,
|
|
9961
10697
|
crossTraceDiff,
|
|
10698
|
+
crowdingDistance,
|
|
9962
10699
|
decideReferenceReplayPromotion,
|
|
9963
10700
|
decideReferenceReplayRunPromotion,
|
|
9964
10701
|
defaultJudges,
|
|
@@ -9992,6 +10729,7 @@ export {
|
|
|
9992
10729
|
formatBenchmarkReport,
|
|
9993
10730
|
formatDriverReport,
|
|
9994
10731
|
formatFindings,
|
|
10732
|
+
precision as goldenPrecision,
|
|
9995
10733
|
gradeSemanticStatus,
|
|
9996
10734
|
groupBy,
|
|
9997
10735
|
hashContent,
|
|
@@ -10013,6 +10751,7 @@ export {
|
|
|
10013
10751
|
jsonlReferenceReplayStore,
|
|
10014
10752
|
jsonlReviewStore,
|
|
10015
10753
|
judgeAgreementView,
|
|
10754
|
+
judgeReplayGate,
|
|
10016
10755
|
judgeSpans,
|
|
10017
10756
|
keyPreserved,
|
|
10018
10757
|
linterJudge,
|
|
@@ -10022,6 +10761,7 @@ export {
|
|
|
10022
10761
|
localCommandRunner,
|
|
10023
10762
|
lowercaseMutator,
|
|
10024
10763
|
mannWhitneyU,
|
|
10764
|
+
matchGoldens,
|
|
10025
10765
|
mergeLayerResults,
|
|
10026
10766
|
mergeSteeringBundle,
|
|
10027
10767
|
multiToolchainLayer,
|
|
@@ -10033,7 +10773,10 @@ export {
|
|
|
10033
10773
|
pairedTTest,
|
|
10034
10774
|
paraphraseRobustness,
|
|
10035
10775
|
paretoFrontier,
|
|
10776
|
+
paretoFrontierWithCrowding,
|
|
10777
|
+
parseReflectionResponse,
|
|
10036
10778
|
partialCredit,
|
|
10779
|
+
passOrthogonality,
|
|
10037
10780
|
pixelDeltaRatio,
|
|
10038
10781
|
politenessPrefixMutator,
|
|
10039
10782
|
positionalBias,
|
|
@@ -10048,6 +10791,8 @@ export {
|
|
|
10048
10791
|
redTeamReport,
|
|
10049
10792
|
redactString,
|
|
10050
10793
|
redactValue,
|
|
10794
|
+
referenceReplayRunsToSteeringRows,
|
|
10795
|
+
referenceReplayScenarioToRunScore,
|
|
10051
10796
|
regexMatch,
|
|
10052
10797
|
regexMatches,
|
|
10053
10798
|
regressionView,
|
|
@@ -10071,12 +10816,14 @@ export {
|
|
|
10071
10816
|
runJudgeFleet,
|
|
10072
10817
|
runKeywordCoverageJudge,
|
|
10073
10818
|
runKeywordCoverageJudgeUrl,
|
|
10819
|
+
runPromptEvolution,
|
|
10074
10820
|
runProposeReview,
|
|
10075
10821
|
runReferenceReplay,
|
|
10076
10822
|
runSelfPlay,
|
|
10077
10823
|
runSemanticConceptJudge,
|
|
10078
10824
|
runTestGradedScenario,
|
|
10079
10825
|
runsForScenario,
|
|
10826
|
+
scalarScore,
|
|
10080
10827
|
scanForMuffledGates,
|
|
10081
10828
|
scoreAllProjects,
|
|
10082
10829
|
scoreContinuity,
|
|
@@ -10113,6 +10860,7 @@ export {
|
|
|
10113
10860
|
viteDeployRunner,
|
|
10114
10861
|
vitestTestParser,
|
|
10115
10862
|
weightedMean,
|
|
10863
|
+
weightedRecall,
|
|
10116
10864
|
welchsTTest,
|
|
10117
10865
|
whitespaceCollapseMutator,
|
|
10118
10866
|
wilcoxonSignedRank
|