@tangle-network/agent-eval 0.11.1 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -410,7 +410,7 @@ function confidenceInterval(scores, confidence = 0.95) {
410
410
  if (scores.length === 0) return { mean: 0, lower: 0, upper: 0 };
411
411
  if (scores.length === 1) return { mean: scores[0], lower: scores[0], upper: scores[0] };
412
412
  const n = scores.length;
413
- const mean5 = scores.reduce((a, b) => a + b, 0) / n;
413
+ const mean7 = scores.reduce((a, b) => a + b, 0) / n;
414
414
  const B = 1e3;
415
415
  const bootstrapMeans = [];
416
416
  for (let i = 0; i < B; i++) {
@@ -425,7 +425,7 @@ function confidenceInterval(scores, confidence = 0.95) {
425
425
  const lowerIdx = Math.floor(alpha / 2 * B);
426
426
  const upperIdx = Math.floor((1 - alpha / 2) * B) - 1;
427
427
  return {
428
- mean: mean5,
428
+ mean: mean7,
429
429
  lower: bootstrapMeans[lowerIdx],
430
430
  upper: bootstrapMeans[Math.min(upperIdx, B - 1)]
431
431
  };
@@ -513,11 +513,11 @@ function pairedTTest(before, after) {
513
513
  const n = before.length;
514
514
  if (n < 2) return { t: 0, df: 0, p: 1 };
515
515
  const diffs = before.map((b, i) => after[i] - b);
516
- const mean5 = diffs.reduce((a, b) => a + b, 0) / n;
517
- const variance2 = diffs.reduce((acc, d) => acc + (d - mean5) ** 2, 0) / (n - 1);
516
+ const mean7 = diffs.reduce((a, b) => a + b, 0) / n;
517
+ const variance2 = diffs.reduce((acc, d) => acc + (d - mean7) ** 2, 0) / (n - 1);
518
518
  const se = Math.sqrt(variance2 / n);
519
- if (se === 0) return { t: mean5 === 0 ? 0 : Infinity, df: n - 1, p: mean5 === 0 ? 1 : 0 };
520
- const t = mean5 / se;
519
+ if (se === 0) return { t: mean7 === 0 ? 0 : Infinity, df: n - 1, p: mean7 === 0 ? 1 : 0 };
520
+ const t = mean7 / se;
521
521
  const df = n - 1;
522
522
  const p = 2 * (1 - studentTCdf(Math.abs(t), df));
523
523
  return { t, df, p };
@@ -541,9 +541,9 @@ function wilcoxonSignedRank(before, after) {
541
541
  }
542
542
  let wPlus = 0;
543
543
  for (let k = 0; k < n; k++) if (diffs[k] > 0) wPlus += ranks3[k];
544
- const mean5 = n * (n + 1) / 4;
544
+ const mean7 = n * (n + 1) / 4;
545
545
  const variance2 = n * (n + 1) * (2 * n + 1) / 24;
546
- const z = (wPlus - mean5) / Math.sqrt(variance2);
546
+ const z = (wPlus - mean7) / Math.sqrt(variance2);
547
547
  const p = 2 * (1 - normalCdf(Math.abs(z)));
548
548
  return { w: wPlus, p };
549
549
  }
@@ -2486,6 +2486,56 @@ function paretoFrontier(candidates, objectives) {
2486
2486
  }));
2487
2487
  return { frontier, dominated, dominanceMap };
2488
2488
  }
2489
/**
 * Collapse multi-objective candidates into a single weighted score each.
 *
 * Every objective is min-max normalised over the finite values observed
 * across `candidates`, flipped unless its direction is "maximize", and then
 * combined using `options.weights` (keyed by objective name, default 1)
 * divided by the sum of all weights. Non-finite objective values contribute
 * nothing to a candidate's score.
 *
 * @param {Array} candidates - items to score.
 * @param {Array<{name: string, direction: string, value: Function}>} objectives
 * @param {{weights?: Object<string, number>}} [options]
 * @returns {Array<{candidate: *, score: number}>} one entry per candidate, in input order.
 */
function scalarScore(candidates, objectives, options = {}) {
  if (candidates.length === 0) return [];
  const weights = options.weights ?? {};
  const weightOf = (obj) => weights[obj.name] ?? 1;
  const totalWeight = objectives.reduce((sum, obj) => sum + weightOf(obj), 0);
  const ranges = objectives.map((obj) => {
    // Track min/max in a loop: spreading a very large array into Math.min /
    // Math.max can exceed the engine's argument limit.
    let min = Infinity;
    let max = -Infinity;
    for (const candidate of candidates) {
      const v = obj.value(candidate);
      if (!Number.isFinite(v)) continue;
      if (v < min) min = v;
      if (v > max) max = v;
    }
    if (min === Infinity) return { min: 0, max: 1 };
    // A degenerate range is widened so the normalisation never divides by 0.
    return { min, max: max === min ? min + 1 : max };
  });
  return candidates.map((candidate) => {
    let score = 0;
    objectives.forEach((obj, i) => {
      const v = obj.value(candidate);
      if (!Number.isFinite(v)) return;
      const { min, max } = ranges[i];
      const normalised = (v - min) / (max - min);
      const directional = obj.direction === "maximize" ? normalised : 1 - normalised;
      // If the weights sum to zero there is no meaningful share to assign;
      // use 0 rather than letting 0/0 turn every score into NaN.
      const weight = totalWeight === 0 ? 0 : weightOf(obj) / totalWeight;
      score += directional * weight;
    });
    return { candidate, score };
  });
}
2514
/**
 * Compute a crowding distance for each candidate.
 *
 * For every objective the candidates are sorted by value; the two extremes
 * receive an infinite distance and each interior candidate accumulates the
 * gap between its neighbours divided by the objective's value range.
 *
 * @param {Array} candidates - items to measure (used as Map keys by identity).
 * @param {Array<{value: Function}>} objectives
 * @returns {Array<{candidate: *, distance: number}>} one entry per candidate, in input order.
 */
function crowdingDistance(candidates, objectives) {
  // Nothing to rank; also avoids calling obj.value(undefined) below.
  if (candidates.length === 0) return [];
  const distances = new Map(candidates.map((c) => [c, 0]));
  for (const obj of objectives) {
    // Evaluate each objective once per candidate instead of on every comparison.
    const sorted = candidates
      .map((candidate) => ({ candidate, value: obj.value(candidate) }))
      .sort((a, b) => a.value - b.value);
    const last = sorted.length - 1;
    const range = sorted[last].value - sorted[0].value || 1;
    distances.set(sorted[0].candidate, Infinity);
    distances.set(sorted[last].candidate, Infinity);
    for (let i = 1; i < last; i++) {
      const current = distances.get(sorted[i].candidate);
      // Already a boundary point for some objective: stays infinite.
      if (current === Infinity) continue;
      const gap = sorted[i + 1].value - sorted[i - 1].value;
      distances.set(sorted[i].candidate, current + gap / range);
    }
  }
  return candidates.map((c) => ({ candidate: c, distance: distances.get(c) ?? 0 }));
}
2533
/**
 * Return the Pareto frontier of `candidates`, each entry annotated with its
 * crowding distance and ordered from largest distance to smallest.
 */
function paretoFrontierWithCrowding(candidates, objectives) {
  const frontier = paretoFrontier(candidates, objectives).frontier;
  if (frontier.length === 0) return [];
  const byDistanceDesc = (left, right) => right.distance - left.distance;
  return crowdingDistance(frontier, objectives).sort(byDistanceDesc);
}
2489
2539
 
2490
2540
  // src/harness-optimizer.ts
2491
2541
  var DEFAULT_HARNESS_OBJECTIVES = [
@@ -5095,10 +5145,10 @@ function analyzeSeries(values, options = {}) {
5095
5145
  return { state: "insufficient-data", windowMean: 0, windowCv: 0, tailRun: 0, stable: false };
5096
5146
  }
5097
5147
  const tail = values.slice(-window);
5098
- const mean5 = tail.reduce((a, b) => a + b, 0) / tail.length;
5099
- const variance2 = tail.reduce((acc, v) => acc + (v - mean5) ** 2, 0) / tail.length;
5148
+ const mean7 = tail.reduce((a, b) => a + b, 0) / tail.length;
5149
+ const variance2 = tail.reduce((acc, v) => acc + (v - mean7) ** 2, 0) / tail.length;
5100
5150
  const stdDev = Math.sqrt(variance2);
5101
- const refMean = Math.abs(mean5) > 1e-9 ? Math.abs(mean5) : 1;
5151
+ const refMean = Math.abs(mean7) > 1e-9 ? Math.abs(mean7) : 1;
5102
5152
  const cv = stdDev / refMean;
5103
5153
  const stable = tail.length >= window && cv <= stableCv;
5104
5154
  let tailRun = 0;
@@ -5119,7 +5169,7 @@ function analyzeSeries(values, options = {}) {
5119
5169
  } else {
5120
5170
  state = "noisy";
5121
5171
  }
5122
- return { state, windowMean: mean5, windowCv: cv, tailRun, stable };
5172
+ return { state, windowMean: mean7, windowCv: cv, tailRun, stable };
5123
5173
  }
5124
5174
 
5125
5175
  // src/state-continuity.ts
@@ -6047,12 +6097,12 @@ async function paraphraseRobustness(prompt, mutators, scoreFn, options = {}) {
6047
6097
  variantScores.push({ mutator: id, score, mutated });
6048
6098
  all.push(score);
6049
6099
  }
6050
- const mean5 = all.reduce((a, b) => a + b, 0) / all.length;
6051
- const variance2 = all.reduce((a, v) => a + (v - mean5) ** 2, 0) / all.length;
6100
+ const mean7 = all.reduce((a, b) => a + b, 0) / all.length;
6101
+ const variance2 = all.reduce((a, v) => a + (v - mean7) ** 2, 0) / all.length;
6052
6102
  const stdDev = Math.sqrt(variance2);
6053
- const ref = Math.abs(mean5) > 1e-9 ? Math.abs(mean5) : 1;
6103
+ const ref = Math.abs(mean7) > 1e-9 ? Math.abs(mean7) : 1;
6054
6104
  const robustness = Math.max(0, 1 - stdDev / ref);
6055
- return { originalScore, variantScores, meanScore: mean5, stdDev, robustness };
6105
+ return { originalScore, variantScores, meanScore: mean7, stdDev, robustness };
6056
6106
  }
6057
6107
  var lowercaseMutator = (p) => p.toLowerCase();
6058
6108
  var sentenceReorderMutator = (p, seed) => {
@@ -6973,8 +7023,8 @@ async function prmBestOfN(store, grader, runIds) {
6973
7023
  if (runIds.length === 0) throw new Error("prmBestOfN: at least 1 candidate required");
6974
7024
  const graded = await Promise.all(runIds.map((id) => grader.grade(store, id)));
6975
7025
  const ranked = [...graded].sort((a, b) => b.aggregateScore - a.aggregateScore);
6976
- const mean5 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
6977
- const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean5) ** 2, 0) / graded.length;
7026
+ const mean7 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
7027
+ const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean7) ** 2, 0) / graded.length;
6978
7028
  return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
6979
7029
  }
6980
7030
  async function prmEnsembleBestOfN(store, graders, runIds) {
@@ -6996,8 +7046,8 @@ async function prmEnsembleBestOfN(store, graders, runIds) {
6996
7046
  const ranked = [...byRun.values()].sort(
6997
7047
  (a, b) => (bordaScores.get(b.runId) ?? 0) - (bordaScores.get(a.runId) ?? 0)
6998
7048
  );
6999
- const mean5 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
7000
- const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean5) ** 2, 0) / ranked.length;
7049
+ const mean7 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
7050
+ const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean7) ** 2, 0) / ranked.length;
7001
7051
  return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
7002
7052
  }
7003
7053
 
@@ -7527,8 +7577,8 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
7527
7577
  const sRuns = runs.filter((r) => r.scenarioId === s.id);
7528
7578
  const scores = sRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
7529
7579
  if (scores.length < 3) continue;
7530
- const mean5 = scores.reduce((a, b) => a + b, 0) / scores.length;
7531
- const variance2 = scores.reduce((a, b) => a + (b - mean5) ** 2, 0) / scores.length;
7580
+ const mean7 = scores.reduce((a, b) => a + b, 0) / scores.length;
7581
+ const variance2 = scores.reduce((a, b) => a + (b - mean7) ** 2, 0) / scores.length;
7532
7582
  if (variance2 > varianceThreshold) {
7533
7583
  targets.push({
7534
7584
  reason: "high-variance",
@@ -9771,7 +9821,7 @@ function buildScenarioScore(scenario, matches2, falsePositives) {
9771
9821
  const total = scenario.references.length;
9772
9822
  const matchedWeight = matches2.filter((match) => match.matched).reduce((sum2, match) => sum2 + match.weight, 0);
9773
9823
  const totalWeight = matches2.reduce((sum2, match) => sum2 + match.weight, 0);
9774
- const precision = ratio(matched, matched + falsePositives);
9824
+ const precision2 = ratio(matched, matched + falsePositives);
9775
9825
  const recall = ratio(matched, total);
9776
9826
  return {
9777
9827
  scenarioId: scenario.id,
@@ -9781,9 +9831,9 @@ function buildScenarioScore(scenario, matches2, falsePositives) {
9781
9831
  falsePositives,
9782
9832
  matchedWeight,
9783
9833
  totalWeight,
9784
- precision,
9834
+ precision: precision2,
9785
9835
  recall,
9786
- f1: f1(precision, recall),
9836
+ f1: f1(precision2, recall),
9787
9837
  matches: matches2
9788
9838
  };
9789
9839
  }
@@ -9801,7 +9851,7 @@ function aggregateScenarioScores(scores) {
9801
9851
  const falsePositives = sum(scores.map((score) => score.falsePositives));
9802
9852
  const matchedWeight = sum(scores.map((score) => score.matchedWeight));
9803
9853
  const totalWeight = sum(scores.map((score) => score.totalWeight));
9804
- const precision = ratio(matched, matched + falsePositives);
9854
+ const precision2 = ratio(matched, matched + falsePositives);
9805
9855
  const recall = ratio(matched, total);
9806
9856
  return {
9807
9857
  matched,
@@ -9809,9 +9859,9 @@ function aggregateScenarioScores(scores) {
9809
9859
  falsePositives,
9810
9860
  matchedWeight,
9811
9861
  totalWeight,
9812
- precision,
9862
+ precision: precision2,
9813
9863
  recall,
9814
- f1: f1(precision, recall),
9864
+ f1: f1(precision2, recall),
9815
9865
  weightedRecall: ratio(matchedWeight, totalWeight)
9816
9866
  };
9817
9867
  }
@@ -9831,8 +9881,8 @@ function emptyAggregate() {
9831
9881
  function hasSplit(score, split) {
9832
9882
  return score.bySplit[split] !== void 0;
9833
9883
  }
9834
- function f1(precision, recall) {
9835
- return precision + recall === 0 ? 0 : 2 * precision * recall / (precision + recall);
9884
+ function f1(precision2, recall) {
9885
+ return precision2 + recall === 0 ? 0 : 2 * precision2 * recall / (precision2 + recall);
9836
9886
  }
9837
9887
  function ratio(numerator, denominator) {
9838
9888
  return denominator > 0 ? numerator / denominator : 0;
@@ -9956,14 +10006,14 @@ function referenceReplayRunsToSteeringRows(runs, options = {}) {
9956
10006
  function referenceReplayScenarioToRunScore(scenarioScore, durationMs = 0) {
9957
10007
  const success = scenarioScore.f1;
9958
10008
  const recall = scenarioScore.recall;
9959
- const precision = scenarioScore.precision;
10009
+ const precision2 = scenarioScore.precision;
9960
10010
  const failed = scenarioScore.total > 0 && scenarioScore.matched === 0;
9961
10011
  return {
9962
10012
  success,
9963
10013
  goalProgress: recall,
9964
- repoGroundedness: precision,
9965
- driftPenalty: 1 - precision,
9966
- toolUseQuality: precision,
10014
+ repoGroundedness: precision2,
10015
+ driftPenalty: 1 - precision2,
10016
+ toolUseQuality: precision2,
9967
10017
  patchQuality: 0,
9968
10018
  testReality: scenarioScore.total > 0 ? 1 : 0,
9969
10019
  finalGate: success,
@@ -9972,10 +10022,569 @@ function referenceReplayScenarioToRunScore(scenarioScore, durationMs = 0) {
9972
10022
  wallSeconds: Math.max(0, durationMs / 1e3),
9973
10023
  notes: [
9974
10024
  `reference-replay matched ${scenarioScore.matched}/${scenarioScore.total}`,
9975
- `precision=${precision.toFixed(3)} recall=${recall.toFixed(3)} f1=${success.toFixed(3)}`
10025
+ `precision=${precision2.toFixed(3)} recall=${recall.toFixed(3)} f1=${success.toFixed(3)}`
9976
10026
  ]
9977
10027
  };
9978
10028
  }
10029
+
10030
+ // src/prompt-evolution.ts
10031
/** Map-backed trial cache exposing get / set / size / clear. */
var InMemoryTrialCache = class {
  constructor() {
    // Backing storage: cache key -> cached trial value.
    this.store = /* @__PURE__ */ new Map();
  }
  /** Look up a cached value; yields undefined when the key is absent. */
  get(key) {
    const { store } = this;
    return store.get(key);
  }
  /** Store (or overwrite) the value for a key. */
  set(key, value) {
    const { store } = this;
    store.set(key, value);
  }
  /** Number of cached entries. */
  size() {
    const { store } = this;
    return store.size;
  }
  /** Drop every cached entry. */
  clear() {
    const { store } = this;
    store.clear();
  }
};
10046
/**
 * Run the generational prompt-evolution loop.
 *
 * Each generation scores the population, aggregates the trials, derives the
 * Pareto front and a scalar ranking, records a report, and (unless it is the
 * last generation or the run converged) asks `nextPopulation` for the next
 * population. Progress events are emitted through `config.onProgress` when
 * it is provided.
 *
 * @param {object} config - run configuration (seed variants, objectives, adapters, ...).
 * @returns {Promise<{runId, target, generations: Array, bestVariant, bestAggregate}>}
 */
async function runPromptEvolution(config) {
  const byScoreDesc = (left, right) => right.score - left.score;
  const history = [];
  let population = [...config.seedVariants];
  let bestVariant = population[0];
  let bestAggregate = null;
  for (let generation = 0; generation < config.generations; generation++) {
    config.onProgress?.({ type: "generation-start", generation, populationSize: population.length });
    const trials = await scorePopulation(population, config, generation);
    const aggregates = aggregateTrials(population, config.scenarioIds, trials);
    const front = paretoFrontierWithCrowding(aggregates, config.objectives);
    const paretoFrontIds = front.map((entry) => entry.candidate.variantId);
    const ranking = scalarScore(aggregates, config.objectives, { weights: config.scalarWeights });
    ranking.sort(byScoreDesc);
    // Winner: top scalar score, falling back to the first aggregate / variant.
    const winnerId = ranking[0]?.candidate.variantId ?? aggregates[0]?.variantId ?? population[0].id;
    const report = {
      runId: config.runId,
      target: config.target,
      generation,
      variants: population,
      aggregates,
      paretoFrontIds,
      winnerId,
      trials
    };
    history.push(report);
    config.onProgress?.({ type: "generation-complete", report });
    const winnerAggregate = aggregates.find((agg) => agg.variantId === winnerId);
    if (winnerAggregate) {
      bestVariant = population.find((variant) => variant.id === winnerId) ?? bestVariant;
      bestAggregate = winnerAggregate;
    }
    const mayStopEarly = config.earlyStopOnNoImprovement !== false && history.length >= 2;
    if (mayStopEarly) {
      const previous = history[history.length - 2];
      // Converged when both the winner and the (de-duplicated) front ids repeat.
      const unchanged = previous.winnerId === winnerId && samePopulation(previous.paretoFrontIds, [...new Set(paretoFrontIds)]);
      if (unchanged) {
        config.onProgress?.({ type: "converged", generation, reason: "no improvement vs previous generation" });
        break;
      }
    }
    // No need to breed a population that will never be scored.
    if (generation === config.generations - 1) break;
    population = await nextPopulation(population, aggregates, trials, front, config, generation + 1);
  }
  // Only reached for bestAggregate when no generation produced a winner aggregate.
  const emptyAggregate = () => aggregateTrials(population, config.scenarioIds, []).find((agg) => agg.variantId === bestVariant.id);
  return {
    runId: config.runId,
    target: config.target,
    generations: history,
    bestVariant,
    bestAggregate: bestAggregate ?? emptyAggregate()
  };
}
10097
/**
 * Score every (variant, scenario, rep) combination of a population.
 *
 * Results are read from `config.cache` when present there, otherwise they
 * are produced by `config.scoreAdapter.score` and written back to the cache.
 * A "trial-complete" progress event is emitted for each trial. Jobs are
 * executed through `runWithConcurrency` with `config.scoreConcurrency`.
 */
async function scorePopulation(population, config, generation) {
  // Emit the per-trial progress event (no-op without an onProgress callback).
  const announce = (variant, scenarioId, rep, outcome, cached) => config.onProgress?.({
    type: "trial-complete",
    generation,
    variantId: variant.id,
    scenarioId,
    rep,
    ok: outcome.ok,
    score: outcome.score,
    cached
  });
  const runTrial = async (variant, scenarioId, rep) => {
    const cacheKey = `${variant.id}|${scenarioId}|${rep}`;
    const hit = config.cache?.get(cacheKey);
    if (hit) {
      announce(variant, scenarioId, rep, hit, true);
      return hit;
    }
    const fresh = await config.scoreAdapter.score({ variant, scenarioId, rep });
    config.cache?.set(cacheKey, fresh);
    announce(variant, scenarioId, rep, fresh, false);
    return fresh;
  };
  const jobs = [];
  for (const variant of population) {
    for (const scenarioId of config.scenarioIds) {
      for (let rep = 0; rep < config.reps; rep++) {
        jobs.push(() => runTrial(variant, scenarioId, rep));
      }
    }
  }
  return runWithConcurrency(jobs, config.scoreConcurrency);
}
10137
/**
 * Run async job thunks with a bounded number of concurrent workers.
 *
 * @param {Array<() => Promise<*>>} jobs - thunks to invoke.
 * @param {number} [concurrency] - maximum jobs in flight. Missing, NaN or
 *   values below 1 fall back to 1; Infinity means "one worker per job".
 * @returns {Promise<Array>} results positioned to match `jobs`.
 */
async function runWithConcurrency(jobs, concurrency) {
  const results = new Array(jobs.length);
  if (jobs.length === 0) return results;
  // Math.max(1, undefined) is NaN, which would create zero workers and
  // silently run nothing, so normalise the limit explicitly.
  let requested = 1;
  if (concurrency === Infinity) requested = jobs.length;
  else if (Number.isFinite(concurrency)) requested = Math.floor(concurrency);
  const limit = Math.min(jobs.length, Math.max(1, requested));
  let next = 0;
  async function worker() {
    while (true) {
      // Claiming the index is synchronous, so workers never take the same job.
      const i = next++;
      if (i >= jobs.length) return;
      results[i] = await jobs[i]();
    }
  }
  await Promise.all(Array.from({ length: limit }, () => worker()));
  return results;
}
10151
/**
 * Summarise raw trials into one aggregate per variant.
 *
 * For every variant a per-scenario summary is built (means over the trials
 * flagged `ok`, the ok-rate, and the trial count); the variant-level figures
 * are the means of those per-scenario summaries.
 */
function aggregateTrials(population, scenarioIds, trials) {
  // Summary of one variant's trials for a single scenario.
  const summariseScenario = (variantId, scenarioId, variantTrials) => {
    const scenarioTrials = variantTrials.filter((trial) => trial.scenarioId === scenarioId);
    const okTrials = scenarioTrials.filter((trial) => trial.ok);
    const metrics = aggregateMetrics(okTrials.map((trial) => trial.metrics ?? {}));
    const attempted = scenarioTrials.length;
    return {
      variantId,
      scenarioId,
      meanScore: mean5(okTrials.map((trial) => trial.score)),
      meanCost: mean5(okTrials.map((trial) => trial.cost ?? 0)),
      meanDurationMs: mean5(okTrials.map((trial) => trial.durationMs ?? 0)),
      okRate: attempted === 0 ? 0 : okTrials.length / attempted,
      trials: attempted,
      metrics
    };
  };
  return population.map((variant) => {
    const ownTrials = trials.filter((trial) => trial.variantId === variant.id);
    const scenarios = scenarioIds.map((sid) => summariseScenario(variant.id, sid, ownTrials));
    const averageOf = (pick) => mean5(scenarios.map(pick));
    return {
      variantId: variant.id,
      meanScore: averageOf((s) => s.meanScore),
      meanCost: averageOf((s) => s.meanCost),
      meanDurationMs: averageOf((s) => s.meanDurationMs),
      okRate: averageOf((s) => s.okRate),
      scenarios,
      metrics: aggregateMetrics(scenarios.map((s) => s.metrics))
    };
  });
}
10180
/**
 * Average each metric key across a list of metric rows.
 * Values that are not finite numbers are ignored; keys with no finite
 * value never appear in the result.
 */
function aggregateMetrics(rows) {
  const totals = /* @__PURE__ */ new Map();
  for (const row of rows) {
    for (const [key, value] of Object.entries(row)) {
      if (!Number.isFinite(value)) continue;
      const running = totals.get(key);
      if (running) {
        running.sum += value;
        running.count += 1;
      } else {
        totals.set(key, { sum: 0 + value, count: 1 });
      }
    }
  }
  const out = {};
  for (const [key, { sum, count }] of totals) out[key] = sum / count;
  return out;
}
10194
/** Arithmetic mean of `xs`; an empty list yields 0. */
function mean5(xs) {
  const count = xs.length;
  const total = xs.reduce((sum, x) => sum + x, 0);
  return count === 0 ? 0 : total / count;
}
10198
/**
 * Build the next generation's population.
 *
 * Variants whose ids appear on the Pareto front survive. The variant with
 * the best scalar score is used as the parent, and `config.mutateAdapter`
 * is asked for enough children to reach `config.populationSize`. Children
 * are stamped with the next generation number and the parent's id.
 */
async function nextPopulation(current, aggregates, trials, front, config, nextGeneration) {
  const frontIds = new Set(front.map((entry) => entry.candidate.variantId));
  const survivors = current.filter((variant) => frontIds.has(variant.id));
  const ranking = scalarScore(aggregates, config.objectives, { weights: config.scalarWeights });
  ranking.sort((left, right) => right.score - left.score);
  const parentId = ranking[0]?.candidate.variantId ?? current[0].id;
  const parent = current.find((variant) => variant.id === parentId) ?? current[0];
  const parentAggregate = aggregates.find((agg) => agg.variantId === parent.id) ?? aggregates[0];
  const topTrials = topKTrialsByScore(trials, parent.id, 3);
  const bottomTrials = bottomKTrialsByScore(trials, parent.id, 3);
  const childCount = Math.max(0, config.populationSize - survivors.length);
  if (childCount <= 0) return [...survivors];
  const proposed = await config.mutateAdapter.mutate({
    parent,
    parentAggregate,
    topTrials,
    bottomTrials,
    childCount,
    generation: nextGeneration
  });
  // Never accept more children than requested.
  const children = proposed.slice(0, childCount).map((child) => ({
    ...child,
    generation: nextGeneration,
    parentId: parent.id
  }));
  return [...survivors, ...children];
}
10222
/** The `k` highest-scoring `ok` trials of one variant, best first. */
function topKTrialsByScore(trials, variantId, k) {
  const eligible = trials.filter((trial) => trial.variantId === variantId && trial.ok);
  eligible.sort((left, right) => right.score - left.score);
  return eligible.slice(0, k);
}
10225
/** The `k` lowest-scoring `ok` trials of one variant, worst first. */
function bottomKTrialsByScore(trials, variantId, k) {
  const eligible = trials.filter((trial) => trial.variantId === variantId && trial.ok);
  eligible.sort((left, right) => left.score - right.score);
  return eligible.slice(0, k);
}
10228
/**
 * Whether two id lists contain exactly the same ids, ignoring order.
 *
 * Occurrences are counted, so a list with a repeated id is only equal to
 * another list that repeats the same id the same number of times.
 *
 * @param {Array} a
 * @param {Array} b
 * @returns {boolean}
 */
function samePopulation(a, b) {
  if (a.length !== b.length) return false;
  const remaining = /* @__PURE__ */ new Map();
  for (const id of a) remaining.set(id, (remaining.get(id) ?? 0) + 1);
  for (const id of b) {
    const left = remaining.get(id);
    // Either never present in `a` or already used up by earlier entries of `b`.
    if (!left) return false;
    remaining.set(id, left - 1);
  }
  return true;
}
10233
+
10234
+ // src/golden-matcher.ts
10235
/**
 * Check each golden against the lower-cased text of all candidates.
 *
 * @param {Array} goldens - expectations; each is tested with `goldenMatched`.
 * @param {Array} candidates - items whose text is extracted via `options.text`
 *   (or the default extractor when not supplied).
 * @param {{text?: Function}} [options]
 * @returns {{matches: boolean[], hits: number, total: number}}
 */
function matchGoldens(goldens, candidates, options = {}) {
  const toText = options.text ?? defaultExtract5;
  const haystacks = candidates.map((candidate) => toText(candidate).toLowerCase());
  const matches = goldens.map((golden) => goldenMatched(golden, haystacks));
  let hits = 0;
  for (const matched of matches) {
    if (matched) hits += 1;
  }
  return { matches, hits, total: goldens.length };
}
10245
/**
 * Default text extractor: strings pass through, objects yield their string
 * property values joined by single spaces, anything else is stringified
 * (null / undefined become the empty string).
 */
function defaultExtract5(candidate) {
  if (typeof candidate === "string") return candidate;
  const isObject = Boolean(candidate) && typeof candidate === "object";
  if (!isObject) return String(candidate ?? "");
  return Object.values(candidate).filter((value) => typeof value === "string").join(" ");
}
10256
/**
 * Whether a golden is satisfied by at least one haystack.
 *
 * A golden matches when any of its `any` phrases (lower-cased and trimmed;
 * blank phrases are ignored) is a substring of a haystack, or any of its
 * `anyRegex` patterns (compiled case-insensitively) matches a haystack.
 * Both lists are optional, so a golden may specify only one of them.
 *
 * @param {{any?: string[], anyRegex?: string[]}} golden
 * @param {string[]} haystacks - candidate texts the phrases are searched in.
 * @returns {boolean}
 */
function goldenMatched(golden, haystacks) {
  for (const phrase of golden.any ?? []) {
    const needle = phrase.toLowerCase().trim();
    if (!needle) continue;
    if (haystacks.some((h) => h.includes(needle))) return true;
  }
  for (const pattern of golden.anyRegex ?? []) {
    let re;
    try {
      re = new RegExp(pattern, "i");
    } catch {
      // Deliberate best-effort: a pattern that fails to compile never matches.
      continue;
    }
    if (haystacks.some((h) => re.test(h))) return true;
  }
  return false;
}
10273
// Default weight applied per golden severity level.
var DEFAULT_SEVERITY_WEIGHTS = {
  critical: 3,
  major: 2,
  minor: 1
};
/**
 * Severity-weighted recall: matched weight divided by total weight.
 *
 * A golden's weight is looked up by its `severity` (1 when the severity has
 * no entry). Returns 1 when there are no goldens or the total weight is 0.
 * `result.matches[i]` says whether golden `i` was matched.
 */
function weightedRecall(goldens, result, weights = DEFAULT_SEVERITY_WEIGHTS) {
  if (goldens.length === 0) return 1;
  const weightFor = (golden) => weights[golden.severity] ?? 1;
  let total = 0;
  for (const golden of goldens) total += weightFor(golden);
  if (total === 0) return 1;
  let hit = 0;
  goldens.forEach((golden, index) => {
    hit += result.matches[index] ? weightFor(golden) : 0;
  });
  return hit / total;
}
10288
/**
 * Fraction of candidates whose text matches at least one golden.
 *
 * Matching follows the same rules as golden matching elsewhere in this
 * module: phrases are lower-cased and trimmed, blank phrases are ignored,
 * regex patterns are compiled case-insensitively, and patterns that fail to
 * compile never match. Returns 1 when there are no candidates.
 *
 * @param {Array<{any?: string[], anyRegex?: string[]}>} goldens
 * @param {Array} candidates
 * @param {{text?: Function}} [options] - `text` extracts a candidate's text.
 * @returns {number} value in [0, 1].
 */
function precision(goldens, candidates, options = {}) {
  if (candidates.length === 0) return 1;
  const extract = options.text ?? defaultExtract5;
  // Normalise phrases and compile patterns once, not once per candidate.
  const needles = [];
  const patterns = [];
  for (const golden of goldens) {
    for (const phrase of golden.any ?? []) {
      const needle = phrase.toLowerCase().trim();
      if (needle) needles.push(needle);
    }
    for (const pattern of golden.anyRegex ?? []) {
      try {
        patterns.push(new RegExp(pattern, "i"));
      } catch {
        // Deliberate best-effort: an invalid pattern simply never matches.
      }
    }
  }
  let matched = 0;
  for (const cand of candidates) {
    const haystack = extract(cand).toLowerCase();
    const matchedAny = needles.some((needle) => haystack.includes(needle)) || patterns.some((re) => re.test(haystack));
    if (matchedAny) matched++;
  }
  return matched / candidates.length;
}
10307
+
10308
+ // src/orthogonality.ts
10309
/**
 * Measure how different the findings of several passes are.
 *
 * Each pass's findings are rendered to text and turned into a bag of words;
 * orthogonality is 1 minus the mean pairwise cosine similarity, clamped to
 * [0, 1]. With fewer than two passes the result is 1 with no similarities.
 *
 * @param {{passes: Array, text?: Function, minTokenLength?: number}} input
 * @returns {{orthogonality: number, passCount: number, similarities: number[]}}
 */
function passOrthogonality(input) {
  const { passes } = input;
  const passCount = passes.length;
  if (passCount < 2) {
    return { orthogonality: 1, passCount, similarities: [] };
  }
  const render = input.text ?? defaultRender;
  const minLen = input.minTokenLength ?? 4;
  const vectors = passes.map((pass) => bagOfWords(pass.findings, render, minLen));
  // Every unordered pair (i, j) with i < j, in row-major order.
  const similarities = vectors.flatMap(
    (left, i) => vectors.slice(i + 1).map((right) => cosineSimilarity(left, right))
  );
  const total = similarities.reduce((sum, sim) => sum + sim, 0);
  const meanSimilarity = similarities.length === 0 ? 0 : total / similarities.length;
  const clamped = Math.max(0, Math.min(1, 1 - meanSimilarity));
  return { orthogonality: clamped, passCount, similarities };
}
10330
/**
 * Default finding renderer: strings are returned unchanged, objects become
 * their string-valued properties joined by spaces, and every other value is
 * stringified (null / undefined become "").
 */
function defaultRender(item) {
  switch (typeof item) {
    case "string":
      return item;
    case "object": {
      if (!item) break;
      const pieces = Object.values(item).filter((value) => typeof value === "string");
      return pieces.join(" ");
    }
    default:
      break;
  }
  return String(item ?? "");
}
10341
/**
 * Count token occurrences across rendered items.
 * Text is lower-cased and split on runs of characters outside a-z / 0-9;
 * tokens shorter than `minLen` are dropped.
 *
 * @returns {Map<string, number>} token -> occurrence count.
 */
function bagOfWords(items, render, minLen) {
  const counts = /* @__PURE__ */ new Map();
  for (const item of items) {
    const tokens = render(item).toLowerCase().split(/[^a-z0-9]+/);
    for (const token of tokens) {
      if (token.length < minLen) continue;
      const seen = counts.get(token) ?? 0;
      counts.set(token, seen + 1);
    }
  }
  return counts;
}
10351
/**
 * Cosine similarity of two token-count maps.
 * Returns 0 when either map has zero magnitude.
 */
function cosineSimilarity(a, b) {
  const squaredNorm = (vector) => {
    let total = 0;
    for (const [, count] of vector) total += count * count;
    return total;
  };
  const aMag = squaredNorm(a);
  const bMag = squaredNorm(b);
  if (aMag === 0 || bMag === 0) return 0;
  let dot = 0;
  for (const [token, count] of a) {
    const other = b.get(token);
    if (other) dot += count * other;
  }
  return dot / (Math.sqrt(aMag) * Math.sqrt(bMag));
}
10364
+
10365
+ // src/promotion-gate.ts
10366
/**
 * Bootstrap a confidence interval for mean(candidate) - mean(baseline) and
 * turn it into a promotion verdict.
 *
 * Verdicts: "ADVANCE" when the whole interval is above 0, "REVERT" when it
 * is entirely below 0, "KEEP" when the interval straddles 0 but the observed
 * delta is non-negative, otherwise "INCONCLUSIVE". When there are too few
 * samples (or no usable iteration count) the interval is unbounded, the
 * reported `iterations` is 0 and the verdict is "INCONCLUSIVE".
 *
 * @param {number[]} baseline
 * @param {number[]} candidate
 * @param {{alpha?: number, iterations?: number, minTotalSamples?: number, seed?: number}} [options]
 *   Defaults: alpha 0.05, iterations 1000, minTotalSamples 6, seed derived
 *   from the sample values so repeated calls give the same interval.
 * @returns {{baselineMean, candidateMean, delta, ciLower, ciUpper, iterations, alpha, verdict}}
 */
function bootstrapCi(baseline, candidate, options = {}) {
  const alpha = options.alpha ?? 0.05;
  const requestedIterations = options.iterations ?? 1e3;
  // A fractional count would make `new Array(iterations)` throw and a count
  // below 1 would leave the interval bounds undefined; normalise both.
  const iterations = Number.isFinite(requestedIterations) ? Math.floor(requestedIterations) : 0;
  const minTotal = options.minTotalSamples ?? 6;
  const baselineMean = mean6(baseline);
  const candidateMean = mean6(candidate);
  const delta = candidateMean - baselineMean;
  const tooFewSamples = baseline.length + candidate.length < minTotal || baseline.length === 0 || candidate.length === 0;
  if (tooFewSamples || iterations < 1) {
    return {
      baselineMean,
      candidateMean,
      delta,
      ciLower: -Infinity,
      ciUpper: Infinity,
      iterations: 0,
      alpha,
      verdict: "INCONCLUSIVE"
    };
  }
  const rng = mulberry32(options.seed ?? hashSeed(baseline, candidate));
  const deltas = new Array(iterations);
  for (let i = 0; i < iterations; i++) {
    // Baseline is resampled before candidate; the draw order fixes the
    // sequence consumed from the seeded generator.
    const bResample = resample(baseline, rng);
    const cResample = resample(candidate, rng);
    deltas[i] = mean6(cResample) - mean6(bResample);
  }
  deltas.sort((a, b) => a - b);
  // Percentile bounds, clamped into the valid index range.
  const lowerIdx = Math.floor(alpha / 2 * iterations);
  const upperIdx = Math.floor((1 - alpha / 2) * iterations) - 1;
  const ciLower = deltas[Math.max(0, lowerIdx)];
  const ciUpper = deltas[Math.max(0, Math.min(iterations - 1, upperIdx))];
  let verdict;
  if (ciLower > 0) verdict = "ADVANCE";
  else if (ciUpper < 0) verdict = "REVERT";
  else if (delta >= 0) verdict = "KEEP";
  else verdict = "INCONCLUSIVE";
  return {
    baselineMean,
    candidateMean,
    delta,
    ciLower,
    ciUpper,
    iterations,
    alpha,
    verdict
  };
}
10413
/** Arithmetic mean of `xs`, or 0 for an empty list. */
function mean6(xs) {
  const count = xs.length;
  if (count === 0) return 0;
  let total = 0;
  for (let i = 0; i < count; i++) total += xs[i];
  return total / count;
}
10419
/**
 * Draw `xs.length` values from `xs` with replacement, calling `rng()`
 * (expected to return a number in [0, 1)) once per draw.
 */
function resample(xs, rng) {
  const size = xs.length;
  const drawOne = () => xs[Math.floor(rng() * size)];
  return Array.from({ length: size }, drawOne);
}
10424
/**
 * Create a seeded mulberry32 pseudo-random generator.
 *
 * @param {number} seed - coerced to an unsigned 32-bit integer.
 * @returns {() => number} generator yielding values in [0, 1).
 */
function mulberry32(seed) {
  let t = seed >>> 0;
  return () => {
    // Keep the state wrapped to 32 bits. Letting it grow as a double passes
    // 2^53 after roughly 4.9 million draws, after which the low bits that
    // feed the mixer are rounded away.
    t = (t + 1831565813) >>> 0;
    let r = t;
    r = Math.imul(r ^ (r >>> 15), r | 1);
    r ^= r + Math.imul(r ^ (r >>> 7), r | 61);
    return ((r ^ (r >>> 14)) >>> 0) / 4294967296;
  };
}
10434
/**
 * Derive an unsigned 32-bit seed from two numeric sample lists by folding
 * the bytes of each value's Float64 representation (all of `a`, then all of
 * `b`) into an XOR-then-multiply hash.
 */
function hashSeed(a, b) {
  const scratch = new Float64Array(1);
  const bytes = new Uint8Array(scratch.buffer);
  let h = 2166136261;
  const absorb = (values) => {
    for (const x of values) {
      scratch[0] = x;
      for (let i = 0; i < bytes.length; i++) {
        h ^= bytes[i];
        h = Math.imul(h, 16777619);
      }
    }
  };
  absorb(a);
  absorb(b);
  return h >>> 0;
}
10446
/**
 * Judge baseline and candidate outputs, then bootstrap a verdict on the
 * score difference.
 *
 * Baseline outputs are scored first, then candidate outputs, each with up to
 * `args.judgeConcurrency` (default 4) judge calls in flight. Only the
 * `alpha`, `iterations` and `seed` arguments that are actually defined are
 * forwarded to `bootstrapCi`.
 *
 * @returns {Promise<object>} the bootstrap result plus the two sample counts.
 */
async function judgeReplayGate(args) {
  const concurrency = args.judgeConcurrency ?? 4;
  const baselineScores = await scoreAll(args.baselineOutputs, args.judge, concurrency);
  const candidateScores = await scoreAll(args.candidateOutputs, args.judge, concurrency);
  const ciOptions = {};
  for (const key of ["alpha", "iterations", "seed"]) {
    if (args[key] !== void 0) ciOptions[key] = args[key];
  }
  const ci = bootstrapCi(baselineScores, candidateScores, ciOptions);
  return {
    ...ci,
    baselineSamples: baselineScores.length,
    candidateSamples: candidateScores.length
  };
}
10461
/**
 * Score every output with `judge`, running a bounded number of judge calls
 * at once. A judge result that is not a finite number is recorded as 0.
 *
 * @param {Array} outputs - items to judge.
 * @param {Function} judge - returns (or resolves to) a numeric score.
 * @param {number} concurrency - maximum judge calls in flight. NaN, missing
 *   or values below 1 fall back to 1; Infinity means one worker per output.
 * @returns {Promise<number[]>} scores positioned to match `outputs`.
 */
async function scoreAll(outputs, judge, concurrency) {
  const results = new Array(outputs.length);
  if (outputs.length === 0) return results;
  // Math.max(1, NaN) is NaN, which would start zero workers and leave every
  // slot empty, so normalise the worker count explicitly.
  let requested = 1;
  if (concurrency === Infinity) requested = outputs.length;
  else if (Number.isFinite(concurrency)) requested = Math.floor(concurrency);
  const workers = Math.min(outputs.length, Math.max(1, requested));
  let next = 0;
  async function worker() {
    while (true) {
      const i = next++;
      if (i >= outputs.length) return;
      const v = await judge(outputs[i]);
      results[i] = Number.isFinite(v) ? v : 0;
    }
  }
  await Promise.all(Array.from({ length: workers }, () => worker()));
  return results;
}
10475
+
10476
+ // src/reflective-mutation.ts
10477
// Mutation primitives listed in the prompt when the caller supplies none.
var DEFAULT_MUTATION_PRIMITIVES = [
  'Strengthen an imperative ("should" \u2192 "must")',
  "Add a concrete example pulled from a missed-golden phrase",
  "Remove a redundant rule that did not improve recall",
  'Add a counterfactual ("if X is missing, the score is capped at Y")',
  "Reorder sections so the highest-impact rule is first",
  "Replace abstract language with a domain-specific noun the trial misses"
];
/**
 * Render the markdown prompt that asks a model to propose prompt mutations.
 *
 * Sections, in order: target header and instructions, the current variant as
 * JSON, failures (bottom trials, with missed expectations and truncated
 * emitted text when present), successes (top trials), the allowed mutation
 * primitives, and the expected output schema.
 *
 * @param {object} ctx - reads target, parentPayload, topTrials, bottomTrials,
 *   childCount and the optional mutationPrimitives override.
 * @returns {string} prompt lines joined with "\n".
 */
function buildReflectionPrompt(ctx) {
  const primitives = ctx.mutationPrimitives ?? DEFAULT_MUTATION_PRIMITIVES;
  // Optional " (inputName)" suffix appended to a trial heading.
  const inputSuffix = (trial) => (trial.inputName ? ` (${trial.inputName})` : "");
  const lines = [
    `# Mutation target: ${ctx.target}`,
    "",
    `You are tuning the prompt component named \`${ctx.target}\`. The current variant is shown below; you have ${ctx.topTrials.length} top trials and ${ctx.bottomTrials.length} bottom trials as evidence. Propose ${ctx.childCount} mutation${ctx.childCount === 1 ? "" : "s"} that fix specific weaknesses visible in the bottom trials. Avoid blank rephrasings.`,
    "",
    "## Current variant",
    "```json",
    JSON.stringify(ctx.parentPayload, null, 2),
    "```",
    ""
  ];
  if (ctx.bottomTrials.length > 0) {
    lines.push("## Failures (bottom trials) \u2014 what went wrong", "");
    for (const trial of ctx.bottomTrials) {
      lines.push(`### Trial \`${trial.id}\` \u2014 score ${trial.score.toFixed(2)}${inputSuffix(trial)}`);
      const missed = (trial.expectations ?? []).filter((expectation) => !expectation.matched);
      if (missed.length > 0) {
        lines.push("", "**Missed expectations:**");
        for (const miss of missed) {
          lines.push(`- \`${miss.id}\`: should match phrase \`${quote(miss.phrase)}\``);
        }
      }
      if (trial.emitted) {
        lines.push("", "**What the agent emitted:**", "```", truncate3(trial.emitted, 600), "```");
      }
      lines.push("");
    }
  }
  if (ctx.topTrials.length > 0) {
    lines.push("## Successes (top trials) \u2014 what to preserve", "");
    for (const trial of ctx.topTrials) {
      lines.push(`- \`${trial.id}\`: score ${trial.score.toFixed(2)}${inputSuffix(trial)}`);
    }
    lines.push("");
  }
  lines.push("## Allowed mutation primitives", "");
  for (const primitive of primitives) lines.push(`- ${primitive}`);
  lines.push("", "## Output schema", "");
  lines.push("Respond with a JSON object \u2014 no prose, no markdown fences:");
  const schemaExample = {
    proposals: [
      {
        label: "<short label, \u2264 40 chars>",
        rationale: "<which failure this targets and which primitive you used>",
        payload: "<full payload of the new variant \u2014 same shape as the current variant>"
      }
    ]
  };
  lines.push("```json", JSON.stringify(schemaExample, null, 2), "```");
  return lines.join("\n");
}
10552
/**
 * Shorten `s` to at most `max` UTF-16 code units and append a truncation
 * marker. Strings that already fit are returned unchanged.
 *
 * @param {string} s
 * @param {number} max - maximum number of code units kept from `s`.
 * @returns {string}
 */
function truncate3(s, max) {
  if (s.length <= max) return s;
  let cut = max;
  // If the last kept unit is a high surrogate, its low half would be cut
  // off and leave an unpaired surrogate in the output; drop it as well.
  const lastKept = s.charCodeAt(cut - 1);
  if (lastKept >= 0xd800 && lastKept <= 0xdbff) cut -= 1;
  return s.slice(0, cut) + "\u2026 [truncated]";
}
10556
/** Backslash-escape every backtick so the text can sit inside inline code. */
function quote(s) {
  const pieces = s.split("`");
  return pieces.join("\\`");
}
10559
/**
 * Leniently parse a model's reflection response into mutation proposals.
 *
 * Strips an optional surrounding markdown code fence, parses the text
 * between the first "{" and the last "}" as JSON, and reads its `proposals`
 * array. Entries that are not objects or lack a `payload` key are skipped;
 * a missing label defaults to "mutation" and a missing rationale to "".
 * Any input that cannot be parsed yields an empty list.
 *
 * @param {string} raw - raw model output.
 * @param {number} [maxProposals] - upper bound on returned proposals;
 *   omitted means no limit, 0 (or less) means none.
 * @returns {Array<{label: string, rationale: string, payload: *}>}
 */
function parseReflectionResponse(raw, maxProposals) {
  // Stay lenient like every other failure path here: bad input => no proposals.
  if (typeof raw !== "string") return [];
  let text = raw.trim();
  if (text.startsWith("```")) text = text.replace(/^```(?:json)?\n?/, "").replace(/\n?```$/, "");
  const start = text.indexOf("{");
  const end = text.lastIndexOf("}");
  if (start < 0 || end <= start) return [];
  let parsed;
  try {
    parsed = JSON.parse(text.slice(start, end + 1));
  } catch {
    return [];
  }
  if (!parsed || typeof parsed !== "object") return [];
  const proposalsRaw = parsed.proposals;
  if (!Array.isArray(proposalsRaw)) return [];
  const out = [];
  for (const p of proposalsRaw) {
    // Check the cap before adding so that a limit of 0 returns nothing.
    if (maxProposals !== void 0 && out.length >= maxProposals) break;
    if (!p || typeof p !== "object") continue;
    if (!("payload" in p)) continue;
    out.push({
      label: typeof p.label === "string" ? p.label : "mutation",
      rationale: typeof p.rationale === "string" ? p.rationale : "",
      payload: p.payload
    });
  }
  return out;
}
9979
10588
  export {
9980
10589
  AgentDriver,
9981
10590
  AxGepaSteeringOptimizer,
@@ -9990,10 +10599,12 @@ export {
9990
10599
  DEFAULT_RULES as DEFAULT_FAILURE_RULES,
9991
10600
  DEFAULT_FINDERS,
9992
10601
  DEFAULT_HARNESS_OBJECTIVES,
10602
+ DEFAULT_MUTATION_PRIMITIVES,
9993
10603
  DEFAULT_MUTATORS,
9994
10604
  DEFAULT_REDACTION_RULES,
9995
10605
  DEFAULT_RED_TEAM_CORPUS,
9996
10606
  DEFAULT_RUN_SCORE_WEIGHTS,
10607
+ DEFAULT_SEVERITY_WEIGHTS,
9997
10608
  Dataset,
9998
10609
  DockerSandboxDriver,
9999
10610
  DualAgentBench,
@@ -10008,6 +10619,7 @@ export {
10008
10619
  InMemoryExperimentStore,
10009
10620
  InMemoryOutcomeStore,
10010
10621
  InMemoryTraceStore,
10622
+ InMemoryTrialCache,
10011
10623
  InMemoryWorkspaceInspector,
10012
10624
  JudgeRunner,
10013
10625
  LlmCallError,
@@ -10043,7 +10655,9 @@ export {
10043
10655
  benjaminiHochberg,
10044
10656
  bisect,
10045
10657
  bonferroni,
10658
+ bootstrapCi,
10046
10659
  budgetBreachView,
10660
+ buildReflectionPrompt,
10047
10661
  buildReviewerPrompt,
10048
10662
  buildTrajectory,
10049
10663
  byteLengthRange,
@@ -10081,6 +10695,7 @@ export {
10081
10695
  createLlmReviewer,
10082
10696
  createSemanticConceptJudge,
10083
10697
  crossTraceDiff,
10698
+ crowdingDistance,
10084
10699
  decideReferenceReplayPromotion,
10085
10700
  decideReferenceReplayRunPromotion,
10086
10701
  defaultJudges,
@@ -10114,6 +10729,7 @@ export {
10114
10729
  formatBenchmarkReport,
10115
10730
  formatDriverReport,
10116
10731
  formatFindings,
10732
+ precision as goldenPrecision,
10117
10733
  gradeSemanticStatus,
10118
10734
  groupBy,
10119
10735
  hashContent,
@@ -10135,6 +10751,7 @@ export {
10135
10751
  jsonlReferenceReplayStore,
10136
10752
  jsonlReviewStore,
10137
10753
  judgeAgreementView,
10754
+ judgeReplayGate,
10138
10755
  judgeSpans,
10139
10756
  keyPreserved,
10140
10757
  linterJudge,
@@ -10144,6 +10761,7 @@ export {
10144
10761
  localCommandRunner,
10145
10762
  lowercaseMutator,
10146
10763
  mannWhitneyU,
10764
+ matchGoldens,
10147
10765
  mergeLayerResults,
10148
10766
  mergeSteeringBundle,
10149
10767
  multiToolchainLayer,
@@ -10155,7 +10773,10 @@ export {
10155
10773
  pairedTTest,
10156
10774
  paraphraseRobustness,
10157
10775
  paretoFrontier,
10776
+ paretoFrontierWithCrowding,
10777
+ parseReflectionResponse,
10158
10778
  partialCredit,
10779
+ passOrthogonality,
10159
10780
  pixelDeltaRatio,
10160
10781
  politenessPrefixMutator,
10161
10782
  positionalBias,
@@ -10195,12 +10816,14 @@ export {
10195
10816
  runJudgeFleet,
10196
10817
  runKeywordCoverageJudge,
10197
10818
  runKeywordCoverageJudgeUrl,
10819
+ runPromptEvolution,
10198
10820
  runProposeReview,
10199
10821
  runReferenceReplay,
10200
10822
  runSelfPlay,
10201
10823
  runSemanticConceptJudge,
10202
10824
  runTestGradedScenario,
10203
10825
  runsForScenario,
10826
+ scalarScore,
10204
10827
  scanForMuffledGates,
10205
10828
  scoreAllProjects,
10206
10829
  scoreContinuity,
@@ -10237,6 +10860,7 @@ export {
10237
10860
  viteDeployRunner,
10238
10861
  vitestTestParser,
10239
10862
  weightedMean,
10863
+ weightedRecall,
10240
10864
  welchsTTest,
10241
10865
  whitespaceCollapseMutator,
10242
10866
  wilcoxonSignedRank