@tangle-network/agent-eval 0.11.1 → 0.12.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +489 -4
- package/dist/index.js +659 -35
- package/dist/index.js.map +1 -1
- package/dist/sink-fetch-C0B8ximv.d.ts +101 -0
- package/dist/telemetry/file.d.ts +19 -0
- package/dist/telemetry/file.js +40 -0
- package/dist/telemetry/file.js.map +1 -0
- package/dist/telemetry/index.d.ts +38 -0
- package/dist/telemetry/index.js +128 -0
- package/dist/telemetry/index.js.map +1 -0
- package/package.json +19 -10
package/dist/index.js
CHANGED
|
@@ -410,7 +410,7 @@ function confidenceInterval(scores, confidence = 0.95) {
|
|
|
410
410
|
if (scores.length === 0) return { mean: 0, lower: 0, upper: 0 };
|
|
411
411
|
if (scores.length === 1) return { mean: scores[0], lower: scores[0], upper: scores[0] };
|
|
412
412
|
const n = scores.length;
|
|
413
|
-
const
|
|
413
|
+
const mean7 = scores.reduce((a, b) => a + b, 0) / n;
|
|
414
414
|
const B = 1e3;
|
|
415
415
|
const bootstrapMeans = [];
|
|
416
416
|
for (let i = 0; i < B; i++) {
|
|
@@ -425,7 +425,7 @@ function confidenceInterval(scores, confidence = 0.95) {
|
|
|
425
425
|
const lowerIdx = Math.floor(alpha / 2 * B);
|
|
426
426
|
const upperIdx = Math.floor((1 - alpha / 2) * B) - 1;
|
|
427
427
|
return {
|
|
428
|
-
mean:
|
|
428
|
+
mean: mean7,
|
|
429
429
|
lower: bootstrapMeans[lowerIdx],
|
|
430
430
|
upper: bootstrapMeans[Math.min(upperIdx, B - 1)]
|
|
431
431
|
};
|
|
@@ -513,11 +513,11 @@ function pairedTTest(before, after) {
|
|
|
513
513
|
const n = before.length;
|
|
514
514
|
if (n < 2) return { t: 0, df: 0, p: 1 };
|
|
515
515
|
const diffs = before.map((b, i) => after[i] - b);
|
|
516
|
-
const
|
|
517
|
-
const variance2 = diffs.reduce((acc, d) => acc + (d -
|
|
516
|
+
const mean7 = diffs.reduce((a, b) => a + b, 0) / n;
|
|
517
|
+
const variance2 = diffs.reduce((acc, d) => acc + (d - mean7) ** 2, 0) / (n - 1);
|
|
518
518
|
const se = Math.sqrt(variance2 / n);
|
|
519
|
-
if (se === 0) return { t:
|
|
520
|
-
const t =
|
|
519
|
+
if (se === 0) return { t: mean7 === 0 ? 0 : Infinity, df: n - 1, p: mean7 === 0 ? 1 : 0 };
|
|
520
|
+
const t = mean7 / se;
|
|
521
521
|
const df = n - 1;
|
|
522
522
|
const p = 2 * (1 - studentTCdf(Math.abs(t), df));
|
|
523
523
|
return { t, df, p };
|
|
@@ -541,9 +541,9 @@ function wilcoxonSignedRank(before, after) {
|
|
|
541
541
|
}
|
|
542
542
|
let wPlus = 0;
|
|
543
543
|
for (let k = 0; k < n; k++) if (diffs[k] > 0) wPlus += ranks3[k];
|
|
544
|
-
const
|
|
544
|
+
const mean7 = n * (n + 1) / 4;
|
|
545
545
|
const variance2 = n * (n + 1) * (2 * n + 1) / 24;
|
|
546
|
-
const z = (wPlus -
|
|
546
|
+
const z = (wPlus - mean7) / Math.sqrt(variance2);
|
|
547
547
|
const p = 2 * (1 - normalCdf(Math.abs(z)));
|
|
548
548
|
return { w: wPlus, p };
|
|
549
549
|
}
|
|
@@ -2486,6 +2486,56 @@ function paretoFrontier(candidates, objectives) {
|
|
|
2486
2486
|
}));
|
|
2487
2487
|
return { frontier, dominated, dominanceMap };
|
|
2488
2488
|
}
|
|
2489
|
+
/**
 * Collapse several objectives into a single weighted score per candidate.
 *
 * Each objective is min-max normalised over the finite values observed across
 * `candidates`, flipped for every direction other than "maximize" so that
 * higher is always better, and combined with weights normalised to sum to 1.
 * Non-finite objective values contribute nothing to a candidate's score.
 *
 * @param {Array} candidates - Items to score.
 * @param {Array<{name: string, direction: string, value: Function}>} objectives
 *   Objectives; `value(candidate)` is expected to return a number.
 * @param {{weights?: Object}} [options] - `weights` maps objective name to a
 *   weight; objectives without an entry weigh 1.
 * @returns {Array<{candidate: *, score: number}>} One entry per candidate, in input order.
 */
function scalarScore(candidates, objectives, options = {}) {
  if (candidates.length === 0) return [];
  const weights = options.weights ?? {};
  const weightOf = (obj) => weights[obj.name] ?? 1;
  const totalWeight = objectives.reduce((sum, obj) => sum + weightOf(obj), 0);
  // Evaluate every objective exactly once per candidate and reuse the values
  // for both the range scan and the scoring pass.
  const columns = objectives.map((obj) => candidates.map((c) => obj.value(c)));
  const ranges = columns.map((column) => {
    // Manual scan instead of Math.min(...values): spreading a very large
    // array into call arguments can exceed the engine's argument limit.
    let min = Infinity;
    let max = -Infinity;
    for (const v of column) {
      if (!Number.isFinite(v)) continue;
      if (v < min) min = v;
      if (v > max) max = v;
    }
    if (min > max) return { min: 0, max: 1 }; // no finite value observed
    // A degenerate range is widened by 1 so the division below stays defined.
    return { min, max: max === min ? min + 1 : max };
  });
  return candidates.map((candidate, row) => {
    let score = 0;
    objectives.forEach((obj, col) => {
      const v = columns[col][row];
      if (!Number.isFinite(v)) return;
      const { min, max } = ranges[col];
      const normalised = (v - min) / (max - min);
      const directional = obj.direction === "maximize" ? normalised : 1 - normalised;
      // When all weights are zero there is nothing to rank by: score 0, not NaN.
      const weight = totalWeight === 0 ? 0 : weightOf(obj) / totalWeight;
      score += directional * weight;
    });
    return { candidate, score };
  });
}
|
|
2514
|
+
/**
 * Compute a crowding distance for each candidate.
 *
 * For every objective the candidates are sorted by value; the two extremes
 * receive an infinite distance and each interior candidate accumulates the
 * gap between its neighbours divided by the objective's value range.
 *
 * @param {Array} candidates - Items to measure (used as Map keys, so identity matters).
 * @param {Array<{value: Function}>} objectives - `value(candidate)` is expected to return a number.
 * @returns {Array<{candidate: *, distance: number}>} One entry per candidate, in input order.
 */
function crowdingDistance(candidates, objectives) {
  // Without this guard the code below would call obj.value(undefined).
  if (candidates.length === 0) return [];
  const distances = new Map(candidates.map((c) => [c, 0]));
  for (const obj of objectives) {
    // Evaluate the objective once per candidate rather than on every comparison.
    const sorted = candidates
      .map((candidate) => ({ candidate, value: obj.value(candidate) }))
      .sort((a, b) => a.value - b.value);
    const first = sorted[0];
    const last = sorted[sorted.length - 1];
    // A zero range falls back to 1 so the division below stays defined.
    const range = last.value - first.value || 1;
    distances.set(first.candidate, Infinity);
    distances.set(last.candidate, Infinity);
    for (let i = 1; i < sorted.length - 1; i++) {
      const current = distances.get(sorted[i].candidate);
      // Once a candidate is a boundary for any objective it stays infinite.
      if (current === Infinity) continue;
      const gap = sorted[i + 1].value - sorted[i - 1].value;
      distances.set(sorted[i].candidate, current + gap / range);
    }
  }
  return candidates.map((c) => ({ candidate: c, distance: distances.get(c) ?? 0 }));
}
|
|
2533
|
+
/**
 * Return the Pareto frontier of `candidates`, ordered by crowding distance
 * from largest to smallest.
 *
 * @param {Array} candidates - Items to rank.
 * @param {Array} objectives - Objectives forwarded to `paretoFrontier` and `crowdingDistance`.
 * @returns {Array<{candidate: *, distance: number}>} Frontier entries, or `[]` when the frontier is empty.
 */
function paretoFrontierWithCrowding(candidates, objectives) {
  const nonDominated = paretoFrontier(candidates, objectives).frontier;
  if (nonDominated.length === 0) {
    return [];
  }
  const byDistanceDescending = (left, right) => right.distance - left.distance;
  return crowdingDistance(nonDominated, objectives).sort(byDistanceDescending);
}
|
|
2489
2539
|
|
|
2490
2540
|
// src/harness-optimizer.ts
|
|
2491
2541
|
var DEFAULT_HARNESS_OBJECTIVES = [
|
|
@@ -5095,10 +5145,10 @@ function analyzeSeries(values, options = {}) {
|
|
|
5095
5145
|
return { state: "insufficient-data", windowMean: 0, windowCv: 0, tailRun: 0, stable: false };
|
|
5096
5146
|
}
|
|
5097
5147
|
const tail = values.slice(-window);
|
|
5098
|
-
const
|
|
5099
|
-
const variance2 = tail.reduce((acc, v) => acc + (v -
|
|
5148
|
+
const mean7 = tail.reduce((a, b) => a + b, 0) / tail.length;
|
|
5149
|
+
const variance2 = tail.reduce((acc, v) => acc + (v - mean7) ** 2, 0) / tail.length;
|
|
5100
5150
|
const stdDev = Math.sqrt(variance2);
|
|
5101
|
-
const refMean = Math.abs(
|
|
5151
|
+
const refMean = Math.abs(mean7) > 1e-9 ? Math.abs(mean7) : 1;
|
|
5102
5152
|
const cv = stdDev / refMean;
|
|
5103
5153
|
const stable = tail.length >= window && cv <= stableCv;
|
|
5104
5154
|
let tailRun = 0;
|
|
@@ -5119,7 +5169,7 @@ function analyzeSeries(values, options = {}) {
|
|
|
5119
5169
|
} else {
|
|
5120
5170
|
state = "noisy";
|
|
5121
5171
|
}
|
|
5122
|
-
return { state, windowMean:
|
|
5172
|
+
return { state, windowMean: mean7, windowCv: cv, tailRun, stable };
|
|
5123
5173
|
}
|
|
5124
5174
|
|
|
5125
5175
|
// src/state-continuity.ts
|
|
@@ -6047,12 +6097,12 @@ async function paraphraseRobustness(prompt, mutators, scoreFn, options = {}) {
|
|
|
6047
6097
|
variantScores.push({ mutator: id, score, mutated });
|
|
6048
6098
|
all.push(score);
|
|
6049
6099
|
}
|
|
6050
|
-
const
|
|
6051
|
-
const variance2 = all.reduce((a, v) => a + (v -
|
|
6100
|
+
const mean7 = all.reduce((a, b) => a + b, 0) / all.length;
|
|
6101
|
+
const variance2 = all.reduce((a, v) => a + (v - mean7) ** 2, 0) / all.length;
|
|
6052
6102
|
const stdDev = Math.sqrt(variance2);
|
|
6053
|
-
const ref = Math.abs(
|
|
6103
|
+
const ref = Math.abs(mean7) > 1e-9 ? Math.abs(mean7) : 1;
|
|
6054
6104
|
const robustness = Math.max(0, 1 - stdDev / ref);
|
|
6055
|
-
return { originalScore, variantScores, meanScore:
|
|
6105
|
+
return { originalScore, variantScores, meanScore: mean7, stdDev, robustness };
|
|
6056
6106
|
}
|
|
6057
6107
|
var lowercaseMutator = (p) => p.toLowerCase();
|
|
6058
6108
|
var sentenceReorderMutator = (p, seed) => {
|
|
@@ -6973,8 +7023,8 @@ async function prmBestOfN(store, grader, runIds) {
|
|
|
6973
7023
|
if (runIds.length === 0) throw new Error("prmBestOfN: at least 1 candidate required");
|
|
6974
7024
|
const graded = await Promise.all(runIds.map((id) => grader.grade(store, id)));
|
|
6975
7025
|
const ranked = [...graded].sort((a, b) => b.aggregateScore - a.aggregateScore);
|
|
6976
|
-
const
|
|
6977
|
-
const variance2 = graded.reduce((a, g) => a + (g.aggregateScore -
|
|
7026
|
+
const mean7 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
|
|
7027
|
+
const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean7) ** 2, 0) / graded.length;
|
|
6978
7028
|
return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
|
|
6979
7029
|
}
|
|
6980
7030
|
async function prmEnsembleBestOfN(store, graders, runIds) {
|
|
@@ -6996,8 +7046,8 @@ async function prmEnsembleBestOfN(store, graders, runIds) {
|
|
|
6996
7046
|
const ranked = [...byRun.values()].sort(
|
|
6997
7047
|
(a, b) => (bordaScores.get(b.runId) ?? 0) - (bordaScores.get(a.runId) ?? 0)
|
|
6998
7048
|
);
|
|
6999
|
-
const
|
|
7000
|
-
const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore -
|
|
7049
|
+
const mean7 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
|
|
7050
|
+
const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean7) ** 2, 0) / ranked.length;
|
|
7001
7051
|
return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
|
|
7002
7052
|
}
|
|
7003
7053
|
|
|
@@ -7527,8 +7577,8 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
|
|
|
7527
7577
|
const sRuns = runs.filter((r) => r.scenarioId === s.id);
|
|
7528
7578
|
const scores = sRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
|
|
7529
7579
|
if (scores.length < 3) continue;
|
|
7530
|
-
const
|
|
7531
|
-
const variance2 = scores.reduce((a, b) => a + (b -
|
|
7580
|
+
const mean7 = scores.reduce((a, b) => a + b, 0) / scores.length;
|
|
7581
|
+
const variance2 = scores.reduce((a, b) => a + (b - mean7) ** 2, 0) / scores.length;
|
|
7532
7582
|
if (variance2 > varianceThreshold) {
|
|
7533
7583
|
targets.push({
|
|
7534
7584
|
reason: "high-variance",
|
|
@@ -9771,7 +9821,7 @@ function buildScenarioScore(scenario, matches2, falsePositives) {
|
|
|
9771
9821
|
const total = scenario.references.length;
|
|
9772
9822
|
const matchedWeight = matches2.filter((match) => match.matched).reduce((sum2, match) => sum2 + match.weight, 0);
|
|
9773
9823
|
const totalWeight = matches2.reduce((sum2, match) => sum2 + match.weight, 0);
|
|
9774
|
-
const
|
|
9824
|
+
const precision2 = ratio(matched, matched + falsePositives);
|
|
9775
9825
|
const recall = ratio(matched, total);
|
|
9776
9826
|
return {
|
|
9777
9827
|
scenarioId: scenario.id,
|
|
@@ -9781,9 +9831,9 @@ function buildScenarioScore(scenario, matches2, falsePositives) {
|
|
|
9781
9831
|
falsePositives,
|
|
9782
9832
|
matchedWeight,
|
|
9783
9833
|
totalWeight,
|
|
9784
|
-
precision,
|
|
9834
|
+
precision: precision2,
|
|
9785
9835
|
recall,
|
|
9786
|
-
f1: f1(
|
|
9836
|
+
f1: f1(precision2, recall),
|
|
9787
9837
|
matches: matches2
|
|
9788
9838
|
};
|
|
9789
9839
|
}
|
|
@@ -9801,7 +9851,7 @@ function aggregateScenarioScores(scores) {
|
|
|
9801
9851
|
const falsePositives = sum(scores.map((score) => score.falsePositives));
|
|
9802
9852
|
const matchedWeight = sum(scores.map((score) => score.matchedWeight));
|
|
9803
9853
|
const totalWeight = sum(scores.map((score) => score.totalWeight));
|
|
9804
|
-
const
|
|
9854
|
+
const precision2 = ratio(matched, matched + falsePositives);
|
|
9805
9855
|
const recall = ratio(matched, total);
|
|
9806
9856
|
return {
|
|
9807
9857
|
matched,
|
|
@@ -9809,9 +9859,9 @@ function aggregateScenarioScores(scores) {
|
|
|
9809
9859
|
falsePositives,
|
|
9810
9860
|
matchedWeight,
|
|
9811
9861
|
totalWeight,
|
|
9812
|
-
precision,
|
|
9862
|
+
precision: precision2,
|
|
9813
9863
|
recall,
|
|
9814
|
-
f1: f1(
|
|
9864
|
+
f1: f1(precision2, recall),
|
|
9815
9865
|
weightedRecall: ratio(matchedWeight, totalWeight)
|
|
9816
9866
|
};
|
|
9817
9867
|
}
|
|
@@ -9831,8 +9881,8 @@ function emptyAggregate() {
|
|
|
9831
9881
|
function hasSplit(score, split) {
|
|
9832
9882
|
return score.bySplit[split] !== void 0;
|
|
9833
9883
|
}
|
|
9834
|
-
function f1(
|
|
9835
|
-
return
|
|
9884
|
+
/**
 * Harmonic mean of precision and recall.
 *
 * @param {number} precision2 - Precision value.
 * @param {number} recall - Recall value.
 * @returns {number} The F1 score, or 0 when both inputs sum to zero.
 */
function f1(precision2, recall) {
  if (precision2 + recall === 0) {
    return 0;
  }
  return 2 * precision2 * recall / (precision2 + recall);
}
|
|
9837
9887
|
function ratio(numerator, denominator) {
|
|
9838
9888
|
return denominator > 0 ? numerator / denominator : 0;
|
|
@@ -9956,14 +10006,14 @@ function referenceReplayRunsToSteeringRows(runs, options = {}) {
|
|
|
9956
10006
|
function referenceReplayScenarioToRunScore(scenarioScore, durationMs = 0) {
|
|
9957
10007
|
const success = scenarioScore.f1;
|
|
9958
10008
|
const recall = scenarioScore.recall;
|
|
9959
|
-
const
|
|
10009
|
+
const precision2 = scenarioScore.precision;
|
|
9960
10010
|
const failed = scenarioScore.total > 0 && scenarioScore.matched === 0;
|
|
9961
10011
|
return {
|
|
9962
10012
|
success,
|
|
9963
10013
|
goalProgress: recall,
|
|
9964
|
-
repoGroundedness:
|
|
9965
|
-
driftPenalty: 1 -
|
|
9966
|
-
toolUseQuality:
|
|
10014
|
+
repoGroundedness: precision2,
|
|
10015
|
+
driftPenalty: 1 - precision2,
|
|
10016
|
+
toolUseQuality: precision2,
|
|
9967
10017
|
patchQuality: 0,
|
|
9968
10018
|
testReality: scenarioScore.total > 0 ? 1 : 0,
|
|
9969
10019
|
finalGate: success,
|
|
@@ -9972,10 +10022,569 @@ function referenceReplayScenarioToRunScore(scenarioScore, durationMs = 0) {
|
|
|
9972
10022
|
wallSeconds: Math.max(0, durationMs / 1e3),
|
|
9973
10023
|
notes: [
|
|
9974
10024
|
`reference-replay matched ${scenarioScore.matched}/${scenarioScore.total}`,
|
|
9975
|
-
`precision=${
|
|
10025
|
+
`precision=${precision2.toFixed(3)} recall=${recall.toFixed(3)} f1=${success.toFixed(3)}`
|
|
9976
10026
|
]
|
|
9977
10027
|
};
|
|
9978
10028
|
}
|
|
10029
|
+
|
|
10030
|
+
// src/prompt-evolution.ts
|
|
10031
|
+
/**
 * Trial cache backed by an in-process Map; entries live only as long as the
 * instance does.
 */
var InMemoryTrialCache = class {
  /** Backing map from cache key to the stored value. */
  store = /* @__PURE__ */ new Map();

  /** Look up `key`; yields `undefined` when nothing is stored under it. */
  get(key) {
    const entry = this.store.get(key);
    return entry;
  }

  /** Store `value` under `key`, replacing any previous entry. */
  set(key, value) {
    this.store.set(key, value);
  }

  /** Number of entries currently held. */
  size() {
    const { size } = this.store;
    return size;
  }

  /** Drop every entry. */
  clear() {
    this.store.clear();
  }
};
|
|
10046
|
+
/**
 * Run the generational prompt-evolution loop.
 *
 * Each generation scores the population, aggregates the trials, computes the
 * Pareto front and a scalar ranking, records a report, and (unless this was
 * the last generation or the run converged) asks `nextPopulation` for the
 * following population.
 *
 * Reads from `config`: `seedVariants`, `generations`, `scenarioIds`,
 * `objectives`, `scalarWeights`, `runId`, `target`, `onProgress`,
 * `earlyStopOnNoImprovement`, plus whatever the helpers it calls read.
 *
 * @param {object} config - Evolution configuration.
 * @returns {Promise<{runId: *, target: *, generations: Array, bestVariant: *, bestAggregate: *}>}
 */
async function runPromptEvolution(config) {
  const emit = (event) => config.onProgress?.(event);
  const generations = [];
  let population = [...config.seedVariants];
  let bestVariant = population[0];
  let bestAggregate = null;

  for (let generation = 0; generation < config.generations; generation++) {
    emit({ type: "generation-start", generation, populationSize: population.length });

    const trials = await scorePopulation(population, config, generation);
    const aggregates = aggregateTrials(population, config.scenarioIds, trials);
    const front = paretoFrontierWithCrowding(aggregates, config.objectives);
    const paretoFrontIds = front.map((entry) => entry.candidate.variantId);

    const scored = scalarScore(aggregates, config.objectives, { weights: config.scalarWeights });
    scored.sort((left, right) => right.score - left.score);
    // Winner: best scalar score, else the first aggregate, else the first variant.
    const winnerId = scored[0]?.candidate.variantId ?? aggregates[0]?.variantId ?? population[0].id;

    const report = {
      runId: config.runId,
      target: config.target,
      generation,
      variants: population,
      aggregates,
      paretoFrontIds,
      winnerId,
      trials
    };
    generations.push(report);
    emit({ type: "generation-complete", report });

    // The latest generation's winner always replaces the running "best".
    const winnerAgg = aggregates.find((agg) => agg.variantId === winnerId);
    if (winnerAgg) {
      const winner = population.find((variant) => variant.id === winnerId);
      if (winner) bestVariant = winner;
      bestAggregate = winnerAgg;
    }

    const earlyStopEnabled = config.earlyStopOnNoImprovement !== false;
    if (earlyStopEnabled && generations.length >= 2) {
      const previous = generations[generations.length - 2];
      const uniqueFrontIds = [...new Set(paretoFrontIds)];
      // Converged when both the winner and the Pareto front are unchanged.
      if (previous.winnerId === winnerId && samePopulation(previous.paretoFrontIds, uniqueFrontIds)) {
        emit({ type: "converged", generation, reason: "no improvement vs previous generation" });
        break;
      }
    }

    // Do not breed a population that would never be scored.
    if (generation === config.generations - 1) break;
    population = await nextPopulation(population, aggregates, trials, front, config, generation + 1);
  }

  const fallbackAggregate = () =>
    aggregateTrials(population, config.scenarioIds, []).find((agg) => agg.variantId === bestVariant.id);
  return {
    runId: config.runId,
    target: config.target,
    generations,
    bestVariant,
    bestAggregate: bestAggregate ?? fallbackAggregate()
  };
}
|
|
10097
|
+
/**
 * Score every (variant, scenario, rep) combination of a population.
 *
 * A truthy entry in `config.cache` under the key `variantId|scenarioId|rep`
 * is reused; otherwise `config.scoreAdapter.score` is called and its result
 * is stored in the cache. A "trial-complete" progress event is emitted for
 * every trial, cached or not.
 *
 * @param {Array} population - Variants to score; each needs an `id`.
 * @param {object} config - Reads `scenarioIds`, `reps`, `cache`, `scoreAdapter`, `onProgress`, `scoreConcurrency`.
 * @param {number} generation - Generation index reported in progress events.
 * @returns {Promise<Array>} Trial results ordered variant → scenario → rep.
 */
async function scorePopulation(population, config, generation) {
  const reportTrial = (variant, scenarioId, rep, outcome, cached) =>
    config.onProgress?.({
      type: "trial-complete",
      generation,
      variantId: variant.id,
      scenarioId,
      rep,
      ok: outcome.ok,
      score: outcome.score,
      cached
    });

  const runTrial = async (variant, scenarioId, rep) => {
    const cacheKey = `${variant.id}|${scenarioId}|${rep}`;
    const hit = config.cache?.get(cacheKey);
    if (hit) {
      reportTrial(variant, scenarioId, rep, hit, true);
      return hit;
    }
    const fresh = await config.scoreAdapter.score({ variant, scenarioId, rep });
    config.cache?.set(cacheKey, fresh);
    reportTrial(variant, scenarioId, rep, fresh, false);
    return fresh;
  };

  const jobs = [];
  for (const variant of population) {
    for (const scenarioId of config.scenarioIds) {
      for (let rep = 0; rep < config.reps; rep++) {
        jobs.push(() => runTrial(variant, scenarioId, rep));
      }
    }
  }
  return runWithConcurrency(jobs, config.scoreConcurrency);
}
|
|
10137
|
+
/**
 * Run async job thunks with a bounded number in flight at once.
 *
 * @param {Array<Function>} jobs - Zero-argument functions returning a value or promise.
 * @param {number} [concurrency] - Maximum jobs in flight. Missing, NaN or
 *   values below 1 fall back to 1; `Infinity` means "all at once"; fractional
 *   values are rounded down.
 * @returns {Promise<Array>} Results in the same order as `jobs`. Rejects with
 *   the first job failure.
 */
async function runWithConcurrency(jobs, concurrency) {
  const results = new Array(jobs.length);
  // Math.max(1, undefined) is NaN, which would start zero workers and return
  // an unfilled array, so the requested value is normalised first.
  const requested = Number(concurrency);
  let wanted = 1;
  if (requested === Infinity) wanted = jobs.length;
  else if (Number.isFinite(requested)) wanted = Math.floor(requested);
  // Never start more workers than there are jobs.
  const limit = Math.min(jobs.length, Math.max(1, wanted));
  let next = 0;
  async function worker() {
    while (true) {
      // Claiming the index is synchronous, so workers never take the same job.
      const i = next++;
      if (i >= jobs.length) return;
      results[i] = await jobs[i]();
    }
  }
  await Promise.all(Array.from({ length: limit }, () => worker()));
  return results;
}
|
|
10151
|
+
/**
 * Roll trial results up into per-scenario and per-variant aggregates.
 *
 * Score, cost, duration and metrics are averaged over successful (`ok`)
 * trials only; `okRate` is the share of successful trials among all trials of
 * a scenario. Variant-level figures are means of the scenario-level figures.
 *
 * @param {Array} population - Variants; each needs an `id`.
 * @param {Array} scenarioIds - Scenario identifiers to report on.
 * @param {Array} trials - Trial results carrying `variantId`, `scenarioId`, `ok`, `score` and optionally `cost`, `durationMs`, `metrics`.
 * @returns {Array<object>} One aggregate per variant, in population order.
 */
function aggregateTrials(population, scenarioIds, trials) {
  const summariseScenario = (variantId, scenarioId, variantTrials) => {
    const attempted = variantTrials.filter((trial) => trial.scenarioId === scenarioId);
    const succeeded = attempted.filter((trial) => trial.ok);
    const metrics = aggregateMetrics(succeeded.map((trial) => trial.metrics ?? {}));
    return {
      variantId,
      scenarioId,
      meanScore: mean5(succeeded.map((trial) => trial.score)),
      meanCost: mean5(succeeded.map((trial) => trial.cost ?? 0)),
      meanDurationMs: mean5(succeeded.map((trial) => trial.durationMs ?? 0)),
      okRate: attempted.length === 0 ? 0 : succeeded.length / attempted.length,
      trials: attempted.length,
      metrics
    };
  };

  return population.map((variant) => {
    const variantTrials = trials.filter((trial) => trial.variantId === variant.id);
    const scenarios = scenarioIds.map((scenarioId) => summariseScenario(variant.id, scenarioId, variantTrials));
    const averageOf = (field) => mean5(scenarios.map((scenario) => scenario[field]));
    return {
      variantId: variant.id,
      meanScore: averageOf("meanScore"),
      meanCost: averageOf("meanCost"),
      meanDurationMs: averageOf("meanDurationMs"),
      okRate: averageOf("okRate"),
      scenarios,
      metrics: aggregateMetrics(scenarios.map((scenario) => scenario.metrics))
    };
  });
}
|
|
10180
|
+
/**
 * Average each metric key across a list of metric records.
 *
 * Non-finite values are ignored; a key appears in the output only if at
 * least one record carried a finite value for it.
 *
 * @param {Array<object>} rows - Metric records (key → value).
 * @returns {object} Key → arithmetic mean of the finite values seen for that key.
 */
function aggregateMetrics(rows) {
  const samplesByKey = /* @__PURE__ */ new Map();
  for (const row of rows) {
    for (const [key, value] of Object.entries(row)) {
      if (!Number.isFinite(value)) continue;
      if (!samplesByKey.has(key)) samplesByKey.set(key, []);
      samplesByKey.get(key).push(value);
    }
  }
  const out = {};
  samplesByKey.forEach((samples, key) => {
    // Every bucket holds at least one sample, so the division is safe.
    out[key] = samples.reduce((a, b) => a + b, 0) / samples.length;
  });
  return out;
}
|
|
10194
|
+
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {Array<number>} xs - Values to average.
 * @returns {number} The mean, or 0 for an empty list.
 */
function mean5(xs) {
  return xs.length === 0 ? 0 : xs.reduce((total, x) => total + x, 0) / xs.length;
}
|
|
10198
|
+
/**
 * Build the population for the next generation.
 *
 * Variants on the Pareto front survive. The variant with the best scalar
 * score is used as the parent, and `config.mutateAdapter.mutate` is asked for
 * enough children to reach `config.populationSize`.
 *
 * @param {Array} current - Current population; each variant needs an `id`.
 * @param {Array} aggregates - Per-variant aggregates for the current generation.
 * @param {Array} trials - Raw trial results for the current generation.
 * @param {Array} front - Pareto-front entries (`{ candidate: { variantId } }`).
 * @param {object} config - Reads `objectives`, `scalarWeights`, `populationSize`, `mutateAdapter`.
 * @param {number} nextGeneration - Generation index stamped on the children.
 * @returns {Promise<Array>} Survivors followed by the new children.
 */
async function nextPopulation(current, aggregates, trials, front, config, nextGeneration) {
  const survivorIds = new Set(front.map((entry) => entry.candidate.variantId));
  const survivors = current.filter((variant) => survivorIds.has(variant.id));

  const ranked = scalarScore(aggregates, config.objectives, { weights: config.scalarWeights });
  ranked.sort((left, right) => right.score - left.score);

  const parentId = ranked[0]?.candidate.variantId ?? current[0].id;
  const parent = current.find((variant) => variant.id === parentId) ?? current[0];
  const parentAggregate = aggregates.find((agg) => agg.variantId === parent.id) ?? aggregates[0];
  const topTrials = topKTrialsByScore(trials, parent.id, 3);
  const bottomTrials = bottomKTrialsByScore(trials, parent.id, 3);

  const childCount = Math.max(0, config.populationSize - survivors.length);
  // Written as a negated ">" so a NaN count also skips mutation.
  if (!(childCount > 0)) {
    return [...survivors];
  }

  const proposed = await config.mutateAdapter.mutate({
    parent,
    parentAggregate,
    topTrials,
    bottomTrials,
    childCount,
    generation: nextGeneration
  });
  // Trim surplus children and stamp each with its lineage.
  const children = proposed
    .slice(0, childCount)
    .map((child) => ({ ...child, generation: nextGeneration, parentId: parent.id }));
  return [...survivors, ...children];
}
|
|
10222
|
+
/**
 * The `k` highest-scoring successful trials of one variant.
 *
 * @param {Array} trials - Trial results carrying `variantId`, `ok`, `score`.
 * @param {*} variantId - Variant whose trials are wanted.
 * @param {number} k - Maximum number of trials to return.
 * @returns {Array} A new array sorted by score, highest first.
 */
function topKTrialsByScore(trials, variantId, k) {
  const eligible = trials.filter((trial) => trial.variantId === variantId && trial.ok);
  eligible.sort((left, right) => right.score - left.score);
  return eligible.slice(0, k);
}
|
|
10225
|
+
/**
 * The `k` lowest-scoring successful trials of one variant.
 *
 * @param {Array} trials - Trial results carrying `variantId`, `ok`, `score`.
 * @param {*} variantId - Variant whose trials are wanted.
 * @param {number} k - Maximum number of trials to return.
 * @returns {Array} A new array sorted by score, lowest first.
 */
function bottomKTrialsByScore(trials, variantId, k) {
  const eligible = trials.filter((trial) => trial.variantId === variantId && trial.ok);
  eligible.sort((left, right) => left.score - right.score);
  return eligible.slice(0, k);
}
|
|
10228
|
+
/**
 * Whether two id lists hold exactly the same ids, ignoring order.
 *
 * Ids are compared as a multiset, so a list with duplicates only equals
 * another list carrying the same duplicates. (A plain "same length and every
 * id of `b` is in `a`" check would wrongly accept `[a, b, c]` vs `[a, a, b]`.)
 *
 * @param {Array} a - First id list.
 * @param {Array} b - Second id list.
 * @returns {boolean} True when both lists contain the same ids the same number of times.
 */
function samePopulation(a, b) {
  if (a.length !== b.length) return false;
  const remaining = /* @__PURE__ */ new Map();
  for (const id of a) remaining.set(id, (remaining.get(id) ?? 0) + 1);
  for (const id of b) {
    const left = remaining.get(id);
    // Missing (undefined) or already used up (0): `b` has an id `a` cannot cover.
    if (!left) return false;
    remaining.set(id, left - 1);
  }
  return true;
}
|
|
10233
|
+
|
|
10234
|
+
// src/golden-matcher.ts
|
|
10235
|
+
/**
 * Check which golden expectations are satisfied by a set of candidates.
 *
 * Every candidate is turned into lower-cased text (via `options.text` or the
 * default extractor) and each golden is tested against all of those texts.
 *
 * @param {Array} goldens - Golden expectations, forwarded to `goldenMatched`.
 * @param {Array} candidates - Candidate items to search in.
 * @param {{text?: Function}} [options] - Optional extractor returning a string per candidate.
 * @returns {{matches: Array<boolean>, hits: number, total: number}} Per-golden flags, number matched, number of goldens.
 */
function matchGoldens(goldens, candidates, options = {}) {
  const toText = options.text ?? defaultExtract5;
  const haystacks = candidates.map((candidate) => toText(candidate).toLowerCase());
  const matches2 = goldens.map((golden) => goldenMatched(golden, haystacks));
  let hits = 0;
  for (const matched of matches2) {
    if (matched) hits += 1;
  }
  return { matches: matches2, hits, total: goldens.length };
}
|
|
10245
|
+
/**
 * Default text extractor for golden matching.
 *
 * Strings are returned unchanged; for objects the string-valued own
 * enumerable properties are joined with single spaces; anything else is
 * stringified, with `null`/`undefined` becoming the empty string.
 *
 * @param {*} candidate - Item to render as text.
 * @returns {string} Text used for matching.
 */
function defaultExtract5(candidate) {
  if (typeof candidate === "string") return candidate;
  if (!candidate || typeof candidate !== "object") {
    return String(candidate ?? "");
  }
  return Object.values(candidate)
    .filter((value) => typeof value === "string")
    .join(" ");
}
|
|
10256
|
+
/**
 * Decide whether a single golden expectation is met by any haystack.
 *
 * A golden matches when one of its `any` phrases (lower-cased and trimmed,
 * blank phrases ignored) occurs as a substring of some haystack, or when one
 * of its `anyRegex` patterns (compiled case-insensitively) matches some
 * haystack. Patterns that fail to compile are skipped.
 *
 * @param {{any: Iterable<string>, anyRegex?: Iterable<string>}} golden - Expectation to test.
 * @param {Array<string>} haystacks - Candidate texts; callers pass them lower-cased.
 * @returns {boolean} True on the first phrase or pattern that matches.
 */
function goldenMatched(golden, haystacks) {
  const anyHaystack = (predicate) => haystacks.some(predicate);

  for (const phrase of golden.any) {
    const needle = phrase.toLowerCase().trim();
    if (needle && anyHaystack((text) => text.includes(needle))) {
      return true;
    }
  }

  const compile = (source) => {
    try {
      return new RegExp(source, "i");
    } catch {
      // An invalid pattern is ignored rather than failing the whole match.
      return null;
    }
  };
  for (const pattern of golden.anyRegex ?? []) {
    const re = compile(pattern);
    if (re !== null && anyHaystack((text) => re.test(text))) {
      return true;
    }
  }
  return false;
}
|
|
10273
|
+
/**
 * Default weight per golden severity, used by `weightedRecall` when the
 * caller supplies no weights of its own.
 */
var DEFAULT_SEVERITY_WEIGHTS = { critical: 3, major: 2, minor: 1 };
|
|
10278
|
+
/**
 * Severity-weighted recall over a set of goldens.
 *
 * Each golden weighs `weights[golden.severity]` (1 when the severity has no
 * entry). The result is the matched weight divided by the total weight.
 *
 * @param {Array<{severity: string}>} goldens - Golden expectations.
 * @param {{matches: Array<boolean>}} result - Match flags aligned with `goldens` by index.
 * @param {object} [weights] - Severity → weight map.
 * @returns {number} Weighted recall; 1 when there are no goldens or the total weight is 0.
 */
function weightedRecall(goldens, result, weights = DEFAULT_SEVERITY_WEIGHTS) {
  if (goldens.length === 0) return 1;
  const weightOf = (golden) => weights[golden.severity] ?? 1;

  let total = 0;
  goldens.forEach((golden) => {
    total += weightOf(golden);
  });
  if (total === 0) return 1;

  let hit = 0;
  goldens.forEach((golden, index) => {
    hit += result.matches[index] ? weightOf(golden) : 0;
  });
  return hit / total;
}
|
|
10288
|
+
/**
 * Fraction of candidates that match at least one golden.
 *
 * Phrases are lower-cased and trimmed and blank phrases are ignored, the same
 * normalisation `goldenMatched` applies, so recall and precision agree on
 * what counts as a match. Regex patterns are compiled case-insensitively,
 * once up front; patterns that fail to compile are skipped.
 *
 * @param {Array<{any: Iterable<string>, anyRegex?: Iterable<string>}>} goldens - Golden expectations.
 * @param {Array} candidates - Candidate items.
 * @param {{text?: Function}} [options] - Optional extractor returning a string per candidate.
 * @returns {number} Matched candidates divided by all candidates; 1 when there are no candidates.
 */
function precision(goldens, candidates, options = {}) {
  if (candidates.length === 0) return 1;
  const extract = options.text ?? defaultExtract5;

  // Normalise all matchers once instead of once per candidate.
  const needles = [];
  const patterns = [];
  for (const golden of goldens) {
    for (const phrase of golden.any) {
      const needle = phrase.toLowerCase().trim();
      if (needle) needles.push(needle);
    }
    for (const source of golden.anyRegex ?? []) {
      try {
        patterns.push(new RegExp(source, "i"));
      } catch {
        // Deliberate best-effort: an invalid pattern simply never matches.
      }
    }
  }

  let matched = 0;
  for (const cand of candidates) {
    const haystack = extract(cand).toLowerCase();
    const matchedAny =
      needles.some((needle) => haystack.includes(needle)) ||
      patterns.some((re) => re.test(haystack));
    if (matchedAny) matched++;
  }
  return matched / candidates.length;
}
|
|
10307
|
+
|
|
10308
|
+
// src/orthogonality.ts
|
|
10309
|
+
/**
 * Measure how different the findings of several passes are from each other.
 *
 * Each pass's findings are turned into a bag of words, every pair of passes
 * is compared with cosine similarity, and the result is one minus the mean
 * pairwise similarity, clamped to [0, 1].
 *
 * @param {{passes: Array<{findings: Iterable}>, text?: Function, minTokenLength?: number}} input
 *   `text` renders a finding to a string; `minTokenLength` defaults to 4.
 * @returns {{orthogonality: number, passCount: number, similarities: Array<number>}}
 *   With fewer than two passes: orthogonality 1 and no similarities.
 */
function passOrthogonality(input) {
  const { passes } = input;
  if (passes.length < 2) {
    return { orthogonality: 1, passCount: passes.length, similarities: [] };
  }
  const render = input.text ?? defaultRender;
  const minLen = input.minTokenLength ?? 4;
  const vectors = passes.map((pass) => bagOfWords(pass.findings, render, minLen));

  // Every unordered pair (i, j) with i < j, in row-major order.
  const similarities = [];
  vectors.forEach((left, i) => {
    for (let j = i + 1; j < vectors.length; j++) {
      similarities.push(cosineSimilarity(left, vectors[j]));
    }
  });

  let meanSimilarity = 0;
  if (similarities.length !== 0) {
    meanSimilarity = similarities.reduce((a, b) => a + b, 0) / similarities.length;
  }
  return {
    orthogonality: Math.max(0, Math.min(1, 1 - meanSimilarity)),
    passCount: passes.length,
    similarities
  };
}
|
|
10330
|
+
/**
 * Default renderer turning a finding into text.
 *
 * Strings are returned unchanged; for objects the string-valued own
 * enumerable properties are joined with single spaces; anything else is
 * stringified, with `null`/`undefined` becoming the empty string.
 *
 * @param {*} item - Finding to render.
 * @returns {string} Text representation.
 */
function defaultRender(item) {
  if (typeof item === "string") return item;
  const isObject = Boolean(item) && typeof item === "object";
  if (!isObject) {
    return String(item ?? "");
  }
  return Object.values(item)
    .filter((value) => typeof value === "string")
    .join(" ");
}
|
|
10341
|
+
/**
 * Build a token → occurrence-count map from a list of items.
 *
 * Each item is rendered to text, lower-cased and split on runs of characters
 * other than `a-z` and `0-9`; tokens shorter than `minLen` are dropped.
 *
 * @param {Iterable} items - Items to tokenise.
 * @param {Function} render - Turns an item into a string.
 * @param {number} minLen - Minimum token length to keep.
 * @returns {Map<string, number>} Token counts.
 */
function bagOfWords(items, render, minLen) {
  const counts = /* @__PURE__ */ new Map();
  for (const item of items) {
    const tokens = render(item).toLowerCase().split(/[^a-z0-9]+/);
    for (const token of tokens) {
      // Negated ">=" keeps the original comparison semantics exactly.
      if (!(token.length >= minLen)) continue;
      const seen = counts.get(token) ?? 0;
      counts.set(token, seen + 1);
    }
  }
  return counts;
}
|
|
10351
|
+
/**
 * Cosine similarity between two sparse count vectors stored as Maps.
 *
 * @param {Map<string, number>} a - First vector (key → count).
 * @param {Map<string, number>} b - Second vector (key → count).
 * @returns {number} Dot product divided by the product of the magnitudes; 0 when either vector has zero magnitude.
 */
function cosineSimilarity(a, b) {
  const squaredMagnitude = (vector) => {
    let total = 0;
    for (const [, count] of vector) total += count * count;
    return total;
  };
  const aMag = squaredMagnitude(a);
  const bMag = squaredMagnitude(b);
  if (aMag === 0 || bMag === 0) return 0;

  let dot = 0;
  for (const [key, count] of a) {
    const other = b.get(key);
    if (other) dot += count * other;
  }
  return dot / (Math.sqrt(aMag) * Math.sqrt(bMag));
}
|
|
10364
|
+
|
|
10365
|
+
// src/promotion-gate.ts
|
|
10366
|
+
/**
 * Percentile-bootstrap confidence interval for the difference in means
 * (candidate minus baseline), with a promotion verdict.
 *
 * Verdicts: "ADVANCE" when the whole interval is above 0, "REVERT" when it is
 * wholly below 0, "KEEP" when it straddles 0 and the observed delta is
 * non-negative, otherwise "INCONCLUSIVE". With too few samples, or fewer than
 * one iteration, no resampling happens and the result is "INCONCLUSIVE" with
 * an unbounded interval and `iterations: 0`.
 *
 * @param {Array<number>} baseline - Baseline scores.
 * @param {Array<number>} candidate - Candidate scores.
 * @param {{alpha?: number, iterations?: number, minTotalSamples?: number, seed?: number}} [options]
 *   Defaults: alpha 0.05, iterations 1000, minTotalSamples 6; the seed is
 *   derived from the samples when not given, so results are reproducible.
 * @returns {{baselineMean: number, candidateMean: number, delta: number, ciLower: number, ciUpper: number, iterations: number, alpha: number, verdict: string}}
 */
function bootstrapCi(baseline, candidate, options = {}) {
  const alpha = options.alpha ?? 0.05;
  const iterations = options.iterations ?? 1e3;
  const minTotal = options.minTotalSamples ?? 6;
  const baselineMean = mean6(baseline);
  const candidateMean = mean6(candidate);
  const delta = candidateMean - baselineMean;

  const tooFewSamples =
    baseline.length + candidate.length < minTotal || baseline.length === 0 || candidate.length === 0;
  // `!(iterations >= 1)` also catches NaN; without resamples there is no
  // interval to read, so report it as unbounded rather than undefined.
  if (tooFewSamples || !(iterations >= 1)) {
    return {
      baselineMean,
      candidateMean,
      delta,
      ciLower: -Infinity,
      ciUpper: Infinity,
      iterations: 0,
      alpha,
      verdict: "INCONCLUSIVE"
    };
  }

  // The seed is only needed once resampling is certain to happen.
  const rng = mulberry32(options.seed ?? hashSeed(baseline, candidate));
  const deltas = new Array(iterations);
  for (let i = 0; i < iterations; i++) {
    const bResample = resample(baseline, rng);
    const cResample = resample(candidate, rng);
    deltas[i] = mean6(cResample) - mean6(bResample);
  }
  deltas.sort((a, b) => a - b);

  // Clamp both percentile indices into [0, iterations - 1]; with very few
  // iterations or an extreme alpha the raw indices fall outside the array.
  const clampIndex = (index) => Math.min(iterations - 1, Math.max(0, index));
  const lowerIdx = Math.floor(alpha / 2 * iterations);
  const upperIdx = Math.floor((1 - alpha / 2) * iterations) - 1;
  const ciLower = deltas[clampIndex(lowerIdx)];
  const ciUpper = deltas[clampIndex(upperIdx)];

  let verdict;
  if (ciLower > 0) verdict = "ADVANCE";
  else if (ciUpper < 0) verdict = "REVERT";
  else if (delta >= 0) verdict = "KEEP";
  else verdict = "INCONCLUSIVE";
  return {
    baselineMean,
    candidateMean,
    delta,
    ciLower,
    ciUpper,
    iterations,
    alpha,
    verdict
  };
}
|
|
10413
|
+
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {Array<number>} xs - Values to average.
 * @returns {number} The mean, or 0 for an empty list.
 */
function mean6(xs) {
  const count = xs.length;
  if (count === 0) return 0;
  let total = 0;
  for (let index = 0; index < count; index++) {
    total += xs[index];
  }
  return total / count;
}
|
|
10419
|
+
/**
 * Draw a bootstrap resample: `xs.length` picks from `xs`, with replacement.
 *
 * @param {Array} xs - Source sample.
 * @param {Function} rng - Returns a number in [0, 1) on each call.
 * @returns {Array} New array of the same length as `xs`.
 */
function resample(xs, rng) {
  const size = xs.length;
  return Array.from({ length: size }, () => xs[Math.floor(rng() * size)]);
}
|
|
10424
|
+
/**
 * Create a seeded pseudo-random generator (Mulberry32).
 *
 * The state is wrapped to an unsigned 32-bit integer after every step. Every
 * operation on the state already works modulo 2^32, so outputs are unchanged
 * for short runs; but an unwrapped accumulator grows past 2^53 after roughly
 * 4.9 million draws, loses its low bits to floating-point rounding, and the
 * stream then diverges from the 32-bit recurrence.
 *
 * @param {number} seed - Seed; coerced to an unsigned 32-bit integer.
 * @returns {function(): number} Generator returning values in [0, 1).
 */
function mulberry32(seed) {
  let state = seed >>> 0;
  return () => {
    state = (state + 1831565813) >>> 0;
    let r = state;
    r = Math.imul(r ^ r >>> 15, r | 1);
    r ^= r + Math.imul(r ^ r >>> 7, r | 61);
    return ((r ^ r >>> 14) >>> 0) / 4294967296;
  };
}
|
|
10434
|
+
/**
 * Derive a 32-bit seed from two numeric sequences.
 *
 * Every value of `a`, then every value of `b`, is written as a float64 and
 * its eight bytes are folded into a 32-bit FNV-1a hash (xor the byte, then
 * multiply by the FNV prime).
 *
 * @param {Iterable<number>} a - First sequence.
 * @param {Iterable<number>} b - Second sequence, hashed after `a`.
 * @returns {number} Unsigned 32-bit hash.
 */
function hashSeed(a, b) {
  // One scratch cell, viewed both as a float64 and as its raw bytes.
  const cell = new Float64Array(1);
  const octets = new Uint8Array(cell.buffer);
  let acc = 2166136261;
  const absorb = (values) => {
    for (const value of values) {
      cell[0] = value;
      for (let k = 0; k < octets.length; k++) {
        acc = Math.imul(acc ^ octets[k], 16777619);
      }
    }
  };
  absorb(a);
  absorb(b);
  return acc >>> 0;
}
|
|
10446
|
+
/**
 * Score baseline and candidate outputs with a judge, then compare the two
 * score sets with `bootstrapCi`.
 *
 * @param {object} args
 * @param {Array} args.baselineOutputs - Outputs scored first.
 * @param {Array} args.candidateOutputs - Outputs scored second.
 * @param {Function} args.judge - Scoring function applied to each output.
 * @param {number} [args.judgeConcurrency] - Worker count per scoring pass; 4 when nullish.
 * @param {number} [args.alpha] - Forwarded to `bootstrapCi` only when defined.
 * @param {number} [args.iterations] - Forwarded to `bootstrapCi` only when defined.
 * @param {number} [args.seed] - Forwarded to `bootstrapCi` only when defined.
 * @returns {Promise<object>} The `bootstrapCi` result plus `baselineSamples`
 *   and `candidateSamples` counts.
 */
async function judgeReplayGate(args) {
  const workers = args.judgeConcurrency ?? 4;
  // The two passes run one after the other, each with `workers` judges in flight.
  const baselineScores = await scoreAll(args.baselineOutputs, args.judge, workers);
  const candidateScores = await scoreAll(args.candidateOutputs, args.judge, workers);
  // Only pass through options the caller actually set, so bootstrapCi can
  // apply its own handling for the missing ones.
  const ciOptions = {};
  if (args.alpha !== void 0) ciOptions.alpha = args.alpha;
  if (args.iterations !== void 0) ciOptions.iterations = args.iterations;
  if (args.seed !== void 0) ciOptions.seed = args.seed;
  const ci = bootstrapCi(baselineScores, candidateScores, ciOptions);
  return {
    ...ci,
    baselineSamples: baselineScores.length,
    candidateSamples: candidateScores.length
  };
}
|
|
10461
|
+
/**
 * Score every output with `judge`, running a bounded number of calls at once.
 *
 * @param {Array} outputs - Items to score.
 * @param {Function} judge - Called with one output; may return a promise.
 *   Results that are not finite numbers are recorded as 0.
 * @param {number} concurrency - Requested number of parallel workers.
 *   Clamped to the range [1, outputs.length]; NaN/undefined falls back to 1.
 * @returns {Promise<number[]>} Scores in the same order as `outputs`.
 */
async function scoreAll(outputs, judge, concurrency) {
  const total = outputs.length;
  const results = new Array(total);
  // Math.max(1, NaN) is NaN, which would start zero workers and hand back an
  // array of holes; Infinity would make Array.from throw. Sanitise first and
  // never start more workers than there are items.
  const requested = Number(concurrency);
  const poolSize = Math.min(
    total,
    Number.isNaN(requested) ? 1 : Math.max(1, Math.floor(requested))
  );
  let next = 0;
  async function worker() {
    // Each worker claims the next unscored index until none remain.
    while (next < total) {
      const i = next++;
      const v = await judge(outputs[i]);
      results[i] = Number.isFinite(v) ? v : 0;
    }
  }
  await Promise.all(Array.from({ length: poolSize }, () => worker()));
  return results;
}
|
|
10475
|
+
|
|
10476
|
+
// src/reflective-mutation.ts
|
|
10477
|
+
/**
 * Mutation primitives listed in the reflection prompt when the caller's
 * context does not supply its own `mutationPrimitives`.
 */
var DEFAULT_MUTATION_PRIMITIVES = [
  "Strengthen an imperative (\"should\" \u2192 \"must\")",
  "Add a concrete example pulled from a missed-golden phrase",
  "Remove a redundant rule that did not improve recall",
  "Add a counterfactual (\"if X is missing, the score is capped at Y\")",
  "Reorder sections so the highest-impact rule is first",
  "Replace abstract language with a domain-specific noun the trial misses"
];
|
|
10485
|
+
/**
 * Render the markdown prompt that asks a model to propose mutations of a
 * prompt component.
 *
 * The prompt contains, in order: a header naming `ctx.target`, the current
 * variant as JSON, a section per bottom trial (missed expectations and the
 * emitted text cut to 600 characters), a bullet per top trial, the allowed
 * mutation primitives, and the expected JSON response shape.
 *
 * @param {object} ctx
 * @param {string} ctx.target - Name of the component being tuned.
 * @param {*} ctx.parentPayload - Current variant; serialised with JSON.stringify.
 * @param {Array} ctx.topTrials - Trials with `id`, `score`, optional `inputName`.
 * @param {Array} ctx.bottomTrials - As above, plus optional `expectations`
 *   (entries with `id`, `phrase`, `matched`) and optional `emitted` text.
 * @param {number} ctx.childCount - Number of mutations requested.
 * @param {string[]} [ctx.mutationPrimitives] - Overrides DEFAULT_MUTATION_PRIMITIVES.
 * @returns {string} Prompt lines joined with "\n".
 */
function buildReflectionPrompt(ctx) {
  const allowedPrimitives = ctx.mutationPrimitives ?? DEFAULT_MUTATION_PRIMITIVES;
  // " (name)" when the trial carries an input name, otherwise nothing.
  const inputSuffix = (trial) => trial.inputName ? ` (${trial.inputName})` : "";
  const lines = [];

  lines.push(
    `# Mutation target: ${ctx.target}`,
    "",
    `You are tuning the prompt component named \`${ctx.target}\`. The current variant is shown below; you have ${ctx.topTrials.length} top trials and ${ctx.bottomTrials.length} bottom trials as evidence. Propose ${ctx.childCount} mutation${ctx.childCount === 1 ? "" : "s"} that fix specific weaknesses visible in the bottom trials. Avoid blank rephrasings.`,
    "",
    "## Current variant",
    "```json",
    JSON.stringify(ctx.parentPayload, null, 2),
    "```",
    ""
  );

  if (ctx.bottomTrials.length > 0) {
    lines.push("## Failures (bottom trials) \u2014 what went wrong", "");
    for (const failure of ctx.bottomTrials) {
      lines.push(`### Trial \`${failure.id}\` \u2014 score ${failure.score.toFixed(2)}${inputSuffix(failure)}`);
      const unmet = (failure.expectations ?? []).filter((expectation) => !expectation.matched);
      if (unmet.length > 0) {
        lines.push("", "**Missed expectations:**");
        for (const miss of unmet) {
          lines.push(`- \`${miss.id}\`: should match phrase \`${quote(miss.phrase)}\``);
        }
      }
      if (failure.emitted) {
        lines.push(
          "",
          "**What the agent emitted:**",
          "```",
          truncate3(failure.emitted, 600),
          "```"
        );
      }
      lines.push("");
    }
  }

  if (ctx.topTrials.length > 0) {
    lines.push("## Successes (top trials) \u2014 what to preserve", "");
    for (const success of ctx.topTrials) {
      lines.push(`- \`${success.id}\`: score ${success.score.toFixed(2)}${inputSuffix(success)}`);
    }
    lines.push("");
  }

  lines.push("## Allowed mutation primitives", "");
  for (const primitive of allowedPrimitives) {
    lines.push(`- ${primitive}`);
  }

  // Placeholder object showing the response shape the model must produce.
  const responseShape = {
    proposals: [
      {
        label: "<short label, \u2264 40 chars>",
        rationale: "<which failure this targets and which primitive you used>",
        payload: "<full payload of the new variant \u2014 same shape as the current variant>"
      }
    ]
  };
  lines.push(
    "",
    "## Output schema",
    "",
    "Respond with a JSON object \u2014 no prose, no markdown fences:",
    "```json",
    JSON.stringify(responseShape, null, 2),
    "```"
  );

  return lines.join("\n");
}
|
|
10552
|
+
/**
 * Shorten a string to at most `max` UTF-16 code units and mark the cut.
 *
 * @param {string} s - Text to shorten.
 * @param {number} max - Maximum number of code units to keep.
 * @returns {string} `s` unchanged when it already fits; otherwise the kept
 *   prefix followed by "… [truncated]".
 */
function truncate3(s, max) {
  if (s.length <= max) return s;
  let cut = max;
  if (cut > 0) {
    // If the cut falls between the two halves of a surrogate pair, drop the
    // leading half too so the result never ends in a lone surrogate.
    const before = s.charCodeAt(cut - 1);
    const after = s.charCodeAt(cut);
    const splitsPair = before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
    if (splitsPair) cut -= 1;
  }
  return s.slice(0, cut) + "\u2026 [truncated]";
}
|
|
10556
|
+
/**
 * Escape backticks so the text can sit inside a markdown inline-code span.
 *
 * @param {string} s - Text to escape.
 * @returns {string} `s` with every backtick preceded by a backslash.
 */
function quote(s) {
  const pieces = s.split("`");
  return pieces.join("\\`");
}
|
|
10559
|
+
/**
 * Extract mutation proposals from a model's raw text response.
 *
 * Tolerates a surrounding markdown code fence and stray prose: the text
 * between the first "{" and the last "}" is parsed as JSON and its
 * `proposals` array is read. Any malformed input yields an empty list
 * rather than an exception.
 *
 * @param {string} raw - Raw response text.
 * @param {number} [maxProposals] - Upper bound on returned proposals;
 *   unlimited when omitted. 0 returns no proposals.
 * @returns {Array<{label: string, rationale: string, payload: *}>} Entries
 *   that are objects with a `payload` key. A non-string `label` becomes
 *   "mutation" and a non-string `rationale` becomes "".
 */
function parseReflectionResponse(raw, maxProposals) {
  // Non-string input is treated like any other unparseable response.
  if (typeof raw !== "string") return [];
  let text = raw.trim();
  if (text.startsWith("```")) text = text.replace(/^```(?:json)?\n?/, "").replace(/\n?```$/, "");
  const start = text.indexOf("{");
  const end = text.lastIndexOf("}");
  if (start < 0 || end <= start) return [];
  let parsed;
  try {
    parsed = JSON.parse(text.slice(start, end + 1));
  } catch {
    return [];
  }
  if (!parsed || typeof parsed !== "object") return [];
  const proposalsRaw = parsed.proposals;
  if (!Array.isArray(proposalsRaw)) return [];
  const limit = maxProposals ?? Infinity;
  const out = [];
  for (const p of proposalsRaw) {
    // Check the limit before adding, so a limit of 0 really returns nothing.
    if (out.length >= limit) break;
    if (!p || typeof p !== "object") continue;
    if (!("payload" in p)) continue;
    out.push({
      label: typeof p.label === "string" ? p.label : "mutation",
      rationale: typeof p.rationale === "string" ? p.rationale : "",
      payload: p.payload
    });
  }
  return out;
}
|
|
9979
10588
|
export {
|
|
9980
10589
|
AgentDriver,
|
|
9981
10590
|
AxGepaSteeringOptimizer,
|
|
@@ -9990,10 +10599,12 @@ export {
|
|
|
9990
10599
|
DEFAULT_RULES as DEFAULT_FAILURE_RULES,
|
|
9991
10600
|
DEFAULT_FINDERS,
|
|
9992
10601
|
DEFAULT_HARNESS_OBJECTIVES,
|
|
10602
|
+
DEFAULT_MUTATION_PRIMITIVES,
|
|
9993
10603
|
DEFAULT_MUTATORS,
|
|
9994
10604
|
DEFAULT_REDACTION_RULES,
|
|
9995
10605
|
DEFAULT_RED_TEAM_CORPUS,
|
|
9996
10606
|
DEFAULT_RUN_SCORE_WEIGHTS,
|
|
10607
|
+
DEFAULT_SEVERITY_WEIGHTS,
|
|
9997
10608
|
Dataset,
|
|
9998
10609
|
DockerSandboxDriver,
|
|
9999
10610
|
DualAgentBench,
|
|
@@ -10008,6 +10619,7 @@ export {
|
|
|
10008
10619
|
InMemoryExperimentStore,
|
|
10009
10620
|
InMemoryOutcomeStore,
|
|
10010
10621
|
InMemoryTraceStore,
|
|
10622
|
+
InMemoryTrialCache,
|
|
10011
10623
|
InMemoryWorkspaceInspector,
|
|
10012
10624
|
JudgeRunner,
|
|
10013
10625
|
LlmCallError,
|
|
@@ -10043,7 +10655,9 @@ export {
|
|
|
10043
10655
|
benjaminiHochberg,
|
|
10044
10656
|
bisect,
|
|
10045
10657
|
bonferroni,
|
|
10658
|
+
bootstrapCi,
|
|
10046
10659
|
budgetBreachView,
|
|
10660
|
+
buildReflectionPrompt,
|
|
10047
10661
|
buildReviewerPrompt,
|
|
10048
10662
|
buildTrajectory,
|
|
10049
10663
|
byteLengthRange,
|
|
@@ -10081,6 +10695,7 @@ export {
|
|
|
10081
10695
|
createLlmReviewer,
|
|
10082
10696
|
createSemanticConceptJudge,
|
|
10083
10697
|
crossTraceDiff,
|
|
10698
|
+
crowdingDistance,
|
|
10084
10699
|
decideReferenceReplayPromotion,
|
|
10085
10700
|
decideReferenceReplayRunPromotion,
|
|
10086
10701
|
defaultJudges,
|
|
@@ -10114,6 +10729,7 @@ export {
|
|
|
10114
10729
|
formatBenchmarkReport,
|
|
10115
10730
|
formatDriverReport,
|
|
10116
10731
|
formatFindings,
|
|
10732
|
+
precision as goldenPrecision,
|
|
10117
10733
|
gradeSemanticStatus,
|
|
10118
10734
|
groupBy,
|
|
10119
10735
|
hashContent,
|
|
@@ -10135,6 +10751,7 @@ export {
|
|
|
10135
10751
|
jsonlReferenceReplayStore,
|
|
10136
10752
|
jsonlReviewStore,
|
|
10137
10753
|
judgeAgreementView,
|
|
10754
|
+
judgeReplayGate,
|
|
10138
10755
|
judgeSpans,
|
|
10139
10756
|
keyPreserved,
|
|
10140
10757
|
linterJudge,
|
|
@@ -10144,6 +10761,7 @@ export {
|
|
|
10144
10761
|
localCommandRunner,
|
|
10145
10762
|
lowercaseMutator,
|
|
10146
10763
|
mannWhitneyU,
|
|
10764
|
+
matchGoldens,
|
|
10147
10765
|
mergeLayerResults,
|
|
10148
10766
|
mergeSteeringBundle,
|
|
10149
10767
|
multiToolchainLayer,
|
|
@@ -10155,7 +10773,10 @@ export {
|
|
|
10155
10773
|
pairedTTest,
|
|
10156
10774
|
paraphraseRobustness,
|
|
10157
10775
|
paretoFrontier,
|
|
10776
|
+
paretoFrontierWithCrowding,
|
|
10777
|
+
parseReflectionResponse,
|
|
10158
10778
|
partialCredit,
|
|
10779
|
+
passOrthogonality,
|
|
10159
10780
|
pixelDeltaRatio,
|
|
10160
10781
|
politenessPrefixMutator,
|
|
10161
10782
|
positionalBias,
|
|
@@ -10195,12 +10816,14 @@ export {
|
|
|
10195
10816
|
runJudgeFleet,
|
|
10196
10817
|
runKeywordCoverageJudge,
|
|
10197
10818
|
runKeywordCoverageJudgeUrl,
|
|
10819
|
+
runPromptEvolution,
|
|
10198
10820
|
runProposeReview,
|
|
10199
10821
|
runReferenceReplay,
|
|
10200
10822
|
runSelfPlay,
|
|
10201
10823
|
runSemanticConceptJudge,
|
|
10202
10824
|
runTestGradedScenario,
|
|
10203
10825
|
runsForScenario,
|
|
10826
|
+
scalarScore,
|
|
10204
10827
|
scanForMuffledGates,
|
|
10205
10828
|
scoreAllProjects,
|
|
10206
10829
|
scoreContinuity,
|
|
@@ -10237,6 +10860,7 @@ export {
|
|
|
10237
10860
|
viteDeployRunner,
|
|
10238
10861
|
vitestTestParser,
|
|
10239
10862
|
weightedMean,
|
|
10863
|
+
weightedRecall,
|
|
10240
10864
|
welchsTTest,
|
|
10241
10865
|
whitespaceCollapseMutator,
|
|
10242
10866
|
wilcoxonSignedRank
|