@tangle-network/agent-eval 0.11.0 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -410,7 +410,7 @@ function confidenceInterval(scores, confidence = 0.95) {
410
410
  if (scores.length === 0) return { mean: 0, lower: 0, upper: 0 };
411
411
  if (scores.length === 1) return { mean: scores[0], lower: scores[0], upper: scores[0] };
412
412
  const n = scores.length;
413
- const mean5 = scores.reduce((a, b) => a + b, 0) / n;
413
+ const mean7 = scores.reduce((a, b) => a + b, 0) / n;
414
414
  const B = 1e3;
415
415
  const bootstrapMeans = [];
416
416
  for (let i = 0; i < B; i++) {
@@ -425,7 +425,7 @@ function confidenceInterval(scores, confidence = 0.95) {
425
425
  const lowerIdx = Math.floor(alpha / 2 * B);
426
426
  const upperIdx = Math.floor((1 - alpha / 2) * B) - 1;
427
427
  return {
428
- mean: mean5,
428
+ mean: mean7,
429
429
  lower: bootstrapMeans[lowerIdx],
430
430
  upper: bootstrapMeans[Math.min(upperIdx, B - 1)]
431
431
  };
@@ -513,11 +513,11 @@ function pairedTTest(before, after) {
513
513
  const n = before.length;
514
514
  if (n < 2) return { t: 0, df: 0, p: 1 };
515
515
  const diffs = before.map((b, i) => after[i] - b);
516
- const mean5 = diffs.reduce((a, b) => a + b, 0) / n;
517
- const variance2 = diffs.reduce((acc, d) => acc + (d - mean5) ** 2, 0) / (n - 1);
516
+ const mean7 = diffs.reduce((a, b) => a + b, 0) / n;
517
+ const variance2 = diffs.reduce((acc, d) => acc + (d - mean7) ** 2, 0) / (n - 1);
518
518
  const se = Math.sqrt(variance2 / n);
519
- if (se === 0) return { t: mean5 === 0 ? 0 : Infinity, df: n - 1, p: mean5 === 0 ? 1 : 0 };
520
- const t = mean5 / se;
519
+ if (se === 0) return { t: mean7 === 0 ? 0 : Infinity, df: n - 1, p: mean7 === 0 ? 1 : 0 };
520
+ const t = mean7 / se;
521
521
  const df = n - 1;
522
522
  const p = 2 * (1 - studentTCdf(Math.abs(t), df));
523
523
  return { t, df, p };
@@ -541,9 +541,9 @@ function wilcoxonSignedRank(before, after) {
541
541
  }
542
542
  let wPlus = 0;
543
543
  for (let k = 0; k < n; k++) if (diffs[k] > 0) wPlus += ranks3[k];
544
- const mean5 = n * (n + 1) / 4;
544
+ const mean7 = n * (n + 1) / 4;
545
545
  const variance2 = n * (n + 1) * (2 * n + 1) / 24;
546
- const z = (wPlus - mean5) / Math.sqrt(variance2);
546
+ const z = (wPlus - mean7) / Math.sqrt(variance2);
547
547
  const p = 2 * (1 - normalCdf(Math.abs(z)));
548
548
  return { w: wPlus, p };
549
549
  }
@@ -2486,6 +2486,56 @@ function paretoFrontier(candidates, objectives) {
2486
2486
  }));
2487
2487
  return { frontier, dominated, dominanceMap };
2488
2488
  }
2489
+ function scalarScore(candidates, objectives, options = {}) {
2490
+ if (candidates.length === 0) return [];
2491
+ const weights = options.weights ?? {};
2492
+ const totalWeight = objectives.reduce((s, o) => s + (weights[o.name] ?? 1), 0);
2493
+ const ranges = objectives.map((obj) => {
2494
+ const values = candidates.map((c) => obj.value(c)).filter((v) => Number.isFinite(v));
2495
+ if (values.length === 0) return { min: 0, max: 1 };
2496
+ const min = Math.min(...values);
2497
+ const max = Math.max(...values);
2498
+ return { min, max: max === min ? min + 1 : max };
2499
+ });
2500
+ return candidates.map((c) => {
2501
+ let score = 0;
2502
+ objectives.forEach((obj, i) => {
2503
+ const v = obj.value(c);
2504
+ if (!Number.isFinite(v)) return;
2505
+ const { min, max } = ranges[i];
2506
+ const normalised = (v - min) / (max - min);
2507
+ const directional = obj.direction === "maximize" ? normalised : 1 - normalised;
2508
+ const weight = (weights[obj.name] ?? 1) / totalWeight;
2509
+ score += directional * weight;
2510
+ });
2511
+ return { candidate: c, score };
2512
+ });
2513
+ }
2514
+ function crowdingDistance(candidates, objectives) {
2515
+ const distances = new Map(candidates.map((c) => [c, 0]));
2516
+ for (const obj of objectives) {
2517
+ const sorted = [...candidates].sort((a, b) => obj.value(a) - obj.value(b));
2518
+ const min = obj.value(sorted[0]);
2519
+ const max = obj.value(sorted[sorted.length - 1]);
2520
+ const range = max - min || 1;
2521
+ distances.set(sorted[0], Infinity);
2522
+ distances.set(sorted[sorted.length - 1], Infinity);
2523
+ for (let i = 1; i < sorted.length - 1; i++) {
2524
+ const prev = obj.value(sorted[i - 1]);
2525
+ const next = obj.value(sorted[i + 1]);
2526
+ const current = distances.get(sorted[i]);
2527
+ if (current === Infinity) continue;
2528
+ distances.set(sorted[i], current + (next - prev) / range);
2529
+ }
2530
+ }
2531
+ return candidates.map((c) => ({ candidate: c, distance: distances.get(c) ?? 0 }));
2532
+ }
2533
+ function paretoFrontierWithCrowding(candidates, objectives) {
2534
+ const { frontier } = paretoFrontier(candidates, objectives);
2535
+ if (frontier.length === 0) return [];
2536
+ const distances = crowdingDistance(frontier, objectives);
2537
+ return distances.sort((a, b) => b.distance - a.distance);
2538
+ }
2489
2539
 
2490
2540
  // src/harness-optimizer.ts
2491
2541
  var DEFAULT_HARNESS_OBJECTIVES = [
@@ -5095,10 +5145,10 @@ function analyzeSeries(values, options = {}) {
5095
5145
  return { state: "insufficient-data", windowMean: 0, windowCv: 0, tailRun: 0, stable: false };
5096
5146
  }
5097
5147
  const tail = values.slice(-window);
5098
- const mean5 = tail.reduce((a, b) => a + b, 0) / tail.length;
5099
- const variance2 = tail.reduce((acc, v) => acc + (v - mean5) ** 2, 0) / tail.length;
5148
+ const mean7 = tail.reduce((a, b) => a + b, 0) / tail.length;
5149
+ const variance2 = tail.reduce((acc, v) => acc + (v - mean7) ** 2, 0) / tail.length;
5100
5150
  const stdDev = Math.sqrt(variance2);
5101
- const refMean = Math.abs(mean5) > 1e-9 ? Math.abs(mean5) : 1;
5151
+ const refMean = Math.abs(mean7) > 1e-9 ? Math.abs(mean7) : 1;
5102
5152
  const cv = stdDev / refMean;
5103
5153
  const stable = tail.length >= window && cv <= stableCv;
5104
5154
  let tailRun = 0;
@@ -5119,7 +5169,7 @@ function analyzeSeries(values, options = {}) {
5119
5169
  } else {
5120
5170
  state = "noisy";
5121
5171
  }
5122
- return { state, windowMean: mean5, windowCv: cv, tailRun, stable };
5172
+ return { state, windowMean: mean7, windowCv: cv, tailRun, stable };
5123
5173
  }
5124
5174
 
5125
5175
  // src/state-continuity.ts
@@ -6047,12 +6097,12 @@ async function paraphraseRobustness(prompt, mutators, scoreFn, options = {}) {
6047
6097
  variantScores.push({ mutator: id, score, mutated });
6048
6098
  all.push(score);
6049
6099
  }
6050
- const mean5 = all.reduce((a, b) => a + b, 0) / all.length;
6051
- const variance2 = all.reduce((a, v) => a + (v - mean5) ** 2, 0) / all.length;
6100
+ const mean7 = all.reduce((a, b) => a + b, 0) / all.length;
6101
+ const variance2 = all.reduce((a, v) => a + (v - mean7) ** 2, 0) / all.length;
6052
6102
  const stdDev = Math.sqrt(variance2);
6053
- const ref = Math.abs(mean5) > 1e-9 ? Math.abs(mean5) : 1;
6103
+ const ref = Math.abs(mean7) > 1e-9 ? Math.abs(mean7) : 1;
6054
6104
  const robustness = Math.max(0, 1 - stdDev / ref);
6055
- return { originalScore, variantScores, meanScore: mean5, stdDev, robustness };
6105
+ return { originalScore, variantScores, meanScore: mean7, stdDev, robustness };
6056
6106
  }
6057
6107
  var lowercaseMutator = (p) => p.toLowerCase();
6058
6108
  var sentenceReorderMutator = (p, seed) => {
@@ -6973,8 +7023,8 @@ async function prmBestOfN(store, grader, runIds) {
6973
7023
  if (runIds.length === 0) throw new Error("prmBestOfN: at least 1 candidate required");
6974
7024
  const graded = await Promise.all(runIds.map((id) => grader.grade(store, id)));
6975
7025
  const ranked = [...graded].sort((a, b) => b.aggregateScore - a.aggregateScore);
6976
- const mean5 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
6977
- const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean5) ** 2, 0) / graded.length;
7026
+ const mean7 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
7027
+ const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean7) ** 2, 0) / graded.length;
6978
7028
  return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
6979
7029
  }
6980
7030
  async function prmEnsembleBestOfN(store, graders, runIds) {
@@ -6996,8 +7046,8 @@ async function prmEnsembleBestOfN(store, graders, runIds) {
6996
7046
  const ranked = [...byRun.values()].sort(
6997
7047
  (a, b) => (bordaScores.get(b.runId) ?? 0) - (bordaScores.get(a.runId) ?? 0)
6998
7048
  );
6999
- const mean5 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
7000
- const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean5) ** 2, 0) / ranked.length;
7049
+ const mean7 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
7050
+ const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean7) ** 2, 0) / ranked.length;
7001
7051
  return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
7002
7052
  }
7003
7053
 
@@ -7527,8 +7577,8 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
7527
7577
  const sRuns = runs.filter((r) => r.scenarioId === s.id);
7528
7578
  const scores = sRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
7529
7579
  if (scores.length < 3) continue;
7530
- const mean5 = scores.reduce((a, b) => a + b, 0) / scores.length;
7531
- const variance2 = scores.reduce((a, b) => a + (b - mean5) ** 2, 0) / scores.length;
7580
+ const mean7 = scores.reduce((a, b) => a + b, 0) / scores.length;
7581
+ const variance2 = scores.reduce((a, b) => a + (b - mean7) ** 2, 0) / scores.length;
7532
7582
  if (variance2 > varianceThreshold) {
7533
7583
  targets.push({
7534
7584
  reason: "high-variance",
@@ -9491,6 +9541,7 @@ async function runReferenceReplay(cases, options) {
9491
9541
  const scoreOptions2 = {
9492
9542
  matcher: options.matcher,
9493
9543
  matchThreshold: options.matchThreshold,
9544
+ matchStrategy: options.matchStrategy,
9494
9545
  includeHoldout: true
9495
9546
  };
9496
9547
  const scenarioScore = scoreReferenceReplay([scenario], scoreOptions2).scenarios[0];
@@ -9510,6 +9561,7 @@ async function runReferenceReplay(cases, options) {
9510
9561
  const scoreOptions = {
9511
9562
  matcher: options.matcher,
9512
9563
  matchThreshold: options.matchThreshold,
9564
+ matchStrategy: options.matchStrategy,
9513
9565
  includeHoldout: true
9514
9566
  };
9515
9567
  const run = {
@@ -9560,12 +9612,13 @@ function jsonlReferenceReplayStore(path) {
9560
9612
  function scoreReferenceReplay(scenarios, options = {}) {
9561
9613
  const matcher = options.matcher ?? defaultReferenceReplayMatcher;
9562
9614
  const threshold = options.matchThreshold ?? DEFAULT_MATCH_THRESHOLD;
9615
+ const matchStrategy = options.matchStrategy ?? "reference-order";
9563
9616
  const allowedSplits = new Set(options.splits ?? ALL_SPLITS);
9564
9617
  const scores = scenarios.filter((scenario) => {
9565
9618
  const split = scenario.split ?? "train";
9566
9619
  if (split === "holdout" && !options.includeHoldout) return false;
9567
9620
  return allowedSplits.has(split);
9568
- }).map((scenario) => scoreScenario(scenario, matcher, threshold));
9621
+ }).map((scenario) => scoreScenario(scenario, matcher, threshold, matchStrategy));
9569
9622
  return {
9570
9623
  scenarios: scores,
9571
9624
  aggregate: aggregateScenarioScores(scores),
@@ -9664,18 +9717,18 @@ function defaultReferenceReplayMatcher(reference, candidate) {
9664
9717
  const score = clamp012(textScore * 0.85 + tagScore + severityScore);
9665
9718
  return { score, reason: `token=${textScore.toFixed(2)} tags=${tagScore.toFixed(2)} severity=${severityScore.toFixed(2)}` };
9666
9719
  }
9667
- function scoreScenario(scenario, matcher, threshold) {
9720
+ function scoreScenario(scenario, matcher, threshold, matchStrategy) {
9721
+ return matchStrategy === "global-greedy" ? scoreScenarioGlobalGreedy(scenario, matcher, threshold) : scoreScenarioReferenceOrder(scenario, matcher, threshold);
9722
+ }
9723
+ function scoreScenarioReferenceOrder(scenario, matcher, threshold) {
9668
9724
  const candidatesLeft = scenario.candidates.map((candidate, index) => ({ candidate, index }));
9669
9725
  const matches2 = [];
9670
9726
  for (const reference of scenario.references) {
9671
9727
  let best = null;
9672
9728
  for (const item of candidatesLeft) {
9673
- const result = matcher(reference, item.candidate, scenario);
9674
- if (!Number.isFinite(result.score)) {
9675
- throw new Error(`reference replay matcher returned non-finite score for ${scenario.id}:${reference.id}:${item.candidate.id}`);
9676
- }
9729
+ const result = scorePair(scenario, matcher, reference, item.candidate);
9677
9730
  if (!best || result.score > best.score) {
9678
- best = { ...item, score: clamp012(result.score), reason: result.reason ?? "" };
9731
+ best = { ...item, ...result };
9679
9732
  }
9680
9733
  }
9681
9734
  const weight = reference.weight ?? 1;
@@ -9703,12 +9756,72 @@ function scoreScenario(scenario, matcher, threshold) {
9703
9756
  });
9704
9757
  }
9705
9758
  }
9759
+ return buildScenarioScore(scenario, matches2, candidatesLeft.length);
9760
+ }
9761
+ function scoreScenarioGlobalGreedy(scenario, matcher, threshold) {
9762
+ const pairs = [];
9763
+ for (const [referenceIndex, reference] of scenario.references.entries()) {
9764
+ for (const [candidateIndex, candidate] of scenario.candidates.entries()) {
9765
+ pairs.push({
9766
+ referenceIndex,
9767
+ candidateIndex,
9768
+ reference,
9769
+ candidate,
9770
+ ...scorePair(scenario, matcher, reference, candidate)
9771
+ });
9772
+ }
9773
+ }
9774
+ pairs.sort(
9775
+ (a, b) => b.score - a.score || a.referenceIndex - b.referenceIndex || a.candidateIndex - b.candidateIndex
9776
+ );
9777
+ const selectedByReference = /* @__PURE__ */ new Map();
9778
+ const selectedCandidates = /* @__PURE__ */ new Set();
9779
+ for (const pair of pairs) {
9780
+ if (pair.score < threshold) break;
9781
+ if (selectedByReference.has(pair.referenceIndex) || selectedCandidates.has(pair.candidateIndex)) continue;
9782
+ selectedByReference.set(pair.referenceIndex, pair);
9783
+ selectedCandidates.add(pair.candidateIndex);
9784
+ }
9785
+ const matches2 = scenario.references.map((reference, referenceIndex) => {
9786
+ const weight = reference.weight ?? 1;
9787
+ const selected = selectedByReference.get(referenceIndex);
9788
+ if (selected) {
9789
+ return {
9790
+ scenarioId: scenario.id,
9791
+ referenceId: reference.id,
9792
+ candidateId: selected.candidate.id,
9793
+ score: selected.score,
9794
+ matched: true,
9795
+ weight,
9796
+ reason: selected.reason
9797
+ };
9798
+ }
9799
+ const bestRejected = pairs.find((pair) => pair.referenceIndex === referenceIndex);
9800
+ return {
9801
+ scenarioId: scenario.id,
9802
+ referenceId: reference.id,
9803
+ candidateId: bestRejected?.candidate.id ?? null,
9804
+ score: bestRejected?.score ?? 0,
9805
+ matched: false,
9806
+ weight,
9807
+ reason: bestRejected?.reason ?? "no candidates"
9808
+ };
9809
+ });
9810
+ return buildScenarioScore(scenario, matches2, scenario.candidates.length - selectedCandidates.size);
9811
+ }
9812
+ function scorePair(scenario, matcher, reference, candidate) {
9813
+ const result = matcher(reference, candidate, scenario);
9814
+ if (!Number.isFinite(result.score)) {
9815
+ throw new Error(`reference replay matcher returned non-finite score for ${scenario.id}:${reference.id}:${candidate.id}`);
9816
+ }
9817
+ return { score: clamp012(result.score), reason: result.reason ?? "" };
9818
+ }
9819
+ function buildScenarioScore(scenario, matches2, falsePositives) {
9706
9820
  const matched = matches2.filter((match) => match.matched).length;
9707
9821
  const total = scenario.references.length;
9708
- const falsePositives = candidatesLeft.length;
9709
9822
  const matchedWeight = matches2.filter((match) => match.matched).reduce((sum2, match) => sum2 + match.weight, 0);
9710
9823
  const totalWeight = matches2.reduce((sum2, match) => sum2 + match.weight, 0);
9711
- const precision = ratio(matched, matched + falsePositives);
9824
+ const precision2 = ratio(matched, matched + falsePositives);
9712
9825
  const recall = ratio(matched, total);
9713
9826
  return {
9714
9827
  scenarioId: scenario.id,
@@ -9718,9 +9831,9 @@ function scoreScenario(scenario, matcher, threshold) {
9718
9831
  falsePositives,
9719
9832
  matchedWeight,
9720
9833
  totalWeight,
9721
- precision,
9834
+ precision: precision2,
9722
9835
  recall,
9723
- f1: f1(precision, recall),
9836
+ f1: f1(precision2, recall),
9724
9837
  matches: matches2
9725
9838
  };
9726
9839
  }
@@ -9738,7 +9851,7 @@ function aggregateScenarioScores(scores) {
9738
9851
  const falsePositives = sum(scores.map((score) => score.falsePositives));
9739
9852
  const matchedWeight = sum(scores.map((score) => score.matchedWeight));
9740
9853
  const totalWeight = sum(scores.map((score) => score.totalWeight));
9741
- const precision = ratio(matched, matched + falsePositives);
9854
+ const precision2 = ratio(matched, matched + falsePositives);
9742
9855
  const recall = ratio(matched, total);
9743
9856
  return {
9744
9857
  matched,
@@ -9746,9 +9859,9 @@ function aggregateScenarioScores(scores) {
9746
9859
  falsePositives,
9747
9860
  matchedWeight,
9748
9861
  totalWeight,
9749
- precision,
9862
+ precision: precision2,
9750
9863
  recall,
9751
- f1: f1(precision, recall),
9864
+ f1: f1(precision2, recall),
9752
9865
  weightedRecall: ratio(matchedWeight, totalWeight)
9753
9866
  };
9754
9867
  }
@@ -9768,8 +9881,8 @@ function emptyAggregate() {
9768
9881
  function hasSplit(score, split) {
9769
9882
  return score.bySplit[split] !== void 0;
9770
9883
  }
9771
- function f1(precision, recall) {
9772
- return precision + recall === 0 ? 0 : 2 * precision * recall / (precision + recall);
9884
+ function f1(precision2, recall) {
9885
+ return precision2 + recall === 0 ? 0 : 2 * precision2 * recall / (precision2 + recall);
9773
9886
  }
9774
9887
  function ratio(numerator, denominator) {
9775
9888
  return denominator > 0 ? numerator / denominator : 0;
@@ -9854,6 +9967,624 @@ var STOP_WORDS = /* @__PURE__ */ new Set([
9854
9967
  "where",
9855
9968
  "which"
9856
9969
  ]);
9970
+
9971
+ // src/reference-replay-steering.ts
9972
+ function referenceReplayRunsToSteeringRows(runs, options = {}) {
9973
+ const rows = [];
9974
+ for (const run of runs) {
9975
+ const variantId = run.variantId ?? run.id;
9976
+ const bundle = options.bundleForRun?.(run) ?? {
9977
+ id: variantId,
9978
+ metadata: run.metadata
9979
+ };
9980
+ for (const caseRun of run.cases) {
9981
+ rows.push({
9982
+ variantId,
9983
+ scenarioId: caseRun.caseId,
9984
+ bundle,
9985
+ score: options.scoreForCase?.(caseRun, run) ?? referenceReplayScenarioToRunScore(caseRun.score, caseRun.durationMs),
9986
+ metadata: {
9987
+ runId: run.id,
9988
+ split: caseRun.split,
9989
+ task: caseRun.metadata?.task ?? caseRun.metadata?.repo ?? caseRun.caseId,
9990
+ referenceCount: caseRun.references.length,
9991
+ candidateCount: caseRun.candidates.length,
9992
+ matched: caseRun.score.matched,
9993
+ total: caseRun.score.total,
9994
+ falsePositives: caseRun.score.falsePositives,
9995
+ precision: caseRun.score.precision,
9996
+ recall: caseRun.score.recall,
9997
+ f1: caseRun.score.f1,
9998
+ error: caseRun.error,
9999
+ ...caseRun.metadata ?? {}
10000
+ }
10001
+ });
10002
+ }
10003
+ }
10004
+ return rows;
10005
+ }
10006
+ function referenceReplayScenarioToRunScore(scenarioScore, durationMs = 0) {
10007
+ const success = scenarioScore.f1;
10008
+ const recall = scenarioScore.recall;
10009
+ const precision2 = scenarioScore.precision;
10010
+ const failed = scenarioScore.total > 0 && scenarioScore.matched === 0;
10011
+ return {
10012
+ success,
10013
+ goalProgress: recall,
10014
+ repoGroundedness: precision2,
10015
+ driftPenalty: 1 - precision2,
10016
+ toolUseQuality: precision2,
10017
+ patchQuality: 0,
10018
+ testReality: scenarioScore.total > 0 ? 1 : 0,
10019
+ finalGate: success,
10020
+ reviewerBlockers: failed ? 1 : 0,
10021
+ costUsd: 0,
10022
+ wallSeconds: Math.max(0, durationMs / 1e3),
10023
+ notes: [
10024
+ `reference-replay matched ${scenarioScore.matched}/${scenarioScore.total}`,
10025
+ `precision=${precision2.toFixed(3)} recall=${recall.toFixed(3)} f1=${success.toFixed(3)}`
10026
+ ]
10027
+ };
10028
+ }
10029
+
10030
+ // src/prompt-evolution.ts
10031
+ var InMemoryTrialCache = class {
10032
+ store = /* @__PURE__ */ new Map();
10033
+ get(key) {
10034
+ return this.store.get(key);
10035
+ }
10036
+ set(key, value) {
10037
+ this.store.set(key, value);
10038
+ }
10039
+ size() {
10040
+ return this.store.size;
10041
+ }
10042
+ clear() {
10043
+ this.store.clear();
10044
+ }
10045
+ };
10046
+ async function runPromptEvolution(config) {
10047
+ const generations = [];
10048
+ let population = [...config.seedVariants];
10049
+ let bestVariant = population[0];
10050
+ let bestAggregate = null;
10051
+ for (let generation = 0; generation < config.generations; generation++) {
10052
+ config.onProgress?.({ type: "generation-start", generation, populationSize: population.length });
10053
+ const trials = await scorePopulation(population, config, generation);
10054
+ const aggregates = aggregateTrials(population, config.scenarioIds, trials);
10055
+ const front = paretoFrontierWithCrowding(aggregates, config.objectives);
10056
+ const frontIds = new Set(front.map((c) => c.candidate.variantId));
10057
+ const scored = scalarScore(aggregates, config.objectives, { weights: config.scalarWeights });
10058
+ scored.sort((a, b) => b.score - a.score);
10059
+ const winnerId = scored[0]?.candidate.variantId ?? aggregates[0]?.variantId ?? population[0].id;
10060
+ const report = {
10061
+ runId: config.runId,
10062
+ target: config.target,
10063
+ generation,
10064
+ variants: population,
10065
+ aggregates,
10066
+ paretoFrontIds: front.map((c) => c.candidate.variantId),
10067
+ winnerId,
10068
+ trials
10069
+ };
10070
+ generations.push(report);
10071
+ config.onProgress?.({ type: "generation-complete", report });
10072
+ const winnerAgg = aggregates.find((a) => a.variantId === winnerId);
10073
+ if (winnerAgg) {
10074
+ const winner = population.find((v) => v.id === winnerId);
10075
+ if (winner) bestVariant = winner;
10076
+ bestAggregate = winnerAgg;
10077
+ }
10078
+ if (config.earlyStopOnNoImprovement !== false && generations.length >= 2) {
10079
+ const prev = generations[generations.length - 2];
10080
+ const noChange = prev.winnerId === winnerId && samePopulation(prev.paretoFrontIds, [...frontIds]);
10081
+ if (noChange) {
10082
+ config.onProgress?.({ type: "converged", generation, reason: "no improvement vs previous generation" });
10083
+ break;
10084
+ }
10085
+ }
10086
+ if (generation === config.generations - 1) break;
10087
+ population = await nextPopulation(population, aggregates, trials, front, config, generation + 1);
10088
+ }
10089
+ return {
10090
+ runId: config.runId,
10091
+ target: config.target,
10092
+ generations,
10093
+ bestVariant,
10094
+ bestAggregate: bestAggregate ?? aggregateTrials(population, config.scenarioIds, []).find((a) => a.variantId === bestVariant.id)
10095
+ };
10096
+ }
10097
+ async function scorePopulation(population, config, generation) {
10098
+ const jobs = [];
10099
+ for (const variant of population) {
10100
+ for (const scenarioId of config.scenarioIds) {
10101
+ for (let rep = 0; rep < config.reps; rep++) {
10102
+ jobs.push(async () => {
10103
+ const cacheKey = `${variant.id}|${scenarioId}|${rep}`;
10104
+ const cached = config.cache?.get(cacheKey);
10105
+ if (cached) {
10106
+ config.onProgress?.({
10107
+ type: "trial-complete",
10108
+ generation,
10109
+ variantId: variant.id,
10110
+ scenarioId,
10111
+ rep,
10112
+ ok: cached.ok,
10113
+ score: cached.score,
10114
+ cached: true
10115
+ });
10116
+ return cached;
10117
+ }
10118
+ const result = await config.scoreAdapter.score({ variant, scenarioId, rep });
10119
+ config.cache?.set(cacheKey, result);
10120
+ config.onProgress?.({
10121
+ type: "trial-complete",
10122
+ generation,
10123
+ variantId: variant.id,
10124
+ scenarioId,
10125
+ rep,
10126
+ ok: result.ok,
10127
+ score: result.score,
10128
+ cached: false
10129
+ });
10130
+ return result;
10131
+ });
10132
+ }
10133
+ }
10134
+ }
10135
+ return runWithConcurrency(jobs, config.scoreConcurrency);
10136
+ }
10137
+ async function runWithConcurrency(jobs, concurrency) {
10138
+ const results = new Array(jobs.length);
10139
+ const limit = Math.max(1, concurrency);
10140
+ let next = 0;
10141
+ async function worker() {
10142
+ while (true) {
10143
+ const i = next++;
10144
+ if (i >= jobs.length) return;
10145
+ results[i] = await jobs[i]();
10146
+ }
10147
+ }
10148
+ await Promise.all(Array.from({ length: limit }, () => worker()));
10149
+ return results;
10150
+ }
10151
+ function aggregateTrials(population, scenarioIds, trials) {
10152
+ return population.map((variant) => {
10153
+ const variantTrials = trials.filter((t) => t.variantId === variant.id);
10154
+ const scenarios = scenarioIds.map((sid) => {
10155
+ const scenarioTrials = variantTrials.filter((t) => t.scenarioId === sid);
10156
+ const okTrials = scenarioTrials.filter((t) => t.ok);
10157
+ const metrics = aggregateMetrics(okTrials.map((t) => t.metrics ?? {}));
10158
+ return {
10159
+ variantId: variant.id,
10160
+ scenarioId: sid,
10161
+ meanScore: mean5(okTrials.map((t) => t.score)),
10162
+ meanCost: mean5(okTrials.map((t) => t.cost ?? 0)),
10163
+ meanDurationMs: mean5(okTrials.map((t) => t.durationMs ?? 0)),
10164
+ okRate: scenarioTrials.length === 0 ? 0 : okTrials.length / scenarioTrials.length,
10165
+ trials: scenarioTrials.length,
10166
+ metrics
10167
+ };
10168
+ });
10169
+ return {
10170
+ variantId: variant.id,
10171
+ meanScore: mean5(scenarios.map((s) => s.meanScore)),
10172
+ meanCost: mean5(scenarios.map((s) => s.meanCost)),
10173
+ meanDurationMs: mean5(scenarios.map((s) => s.meanDurationMs)),
10174
+ okRate: mean5(scenarios.map((s) => s.okRate)),
10175
+ scenarios,
10176
+ metrics: aggregateMetrics(scenarios.map((s) => s.metrics))
10177
+ };
10178
+ });
10179
+ }
10180
+ function aggregateMetrics(rows) {
10181
+ const buckets = /* @__PURE__ */ new Map();
10182
+ for (const row of rows) {
10183
+ for (const [k, v] of Object.entries(row)) {
10184
+ if (!Number.isFinite(v)) continue;
10185
+ const list = buckets.get(k) ?? [];
10186
+ list.push(v);
10187
+ buckets.set(k, list);
10188
+ }
10189
+ }
10190
+ const out = {};
10191
+ for (const [k, list] of buckets) out[k] = mean5(list);
10192
+ return out;
10193
+ }
10194
+ function mean5(xs) {
10195
+ if (xs.length === 0) return 0;
10196
+ return xs.reduce((a, b) => a + b, 0) / xs.length;
10197
+ }
10198
+ async function nextPopulation(current, aggregates, trials, front, config, nextGeneration) {
10199
+ const survivorIds = new Set(front.map((c) => c.candidate.variantId));
10200
+ const survivors = current.filter((v) => survivorIds.has(v.id));
10201
+ const ranked = scalarScore(aggregates, config.objectives, { weights: config.scalarWeights }).sort((a, b) => b.score - a.score);
10202
+ const parentId = ranked[0]?.candidate.variantId ?? current[0].id;
10203
+ const parent = current.find((v) => v.id === parentId) ?? current[0];
10204
+ const parentAggregate = aggregates.find((a) => a.variantId === parent.id) ?? aggregates[0];
10205
+ const topTrials = topKTrialsByScore(trials, parent.id, 3);
10206
+ const bottomTrials = bottomKTrialsByScore(trials, parent.id, 3);
10207
+ const childCount = Math.max(0, config.populationSize - survivors.length);
10208
+ let children = [];
10209
+ if (childCount > 0) {
10210
+ children = await config.mutateAdapter.mutate({
10211
+ parent,
10212
+ parentAggregate,
10213
+ topTrials,
10214
+ bottomTrials,
10215
+ childCount,
10216
+ generation: nextGeneration
10217
+ });
10218
+ children = children.slice(0, childCount).map((c) => ({ ...c, generation: nextGeneration, parentId: parent.id }));
10219
+ }
10220
+ return [...survivors, ...children];
10221
+ }
10222
+ function topKTrialsByScore(trials, variantId, k) {
10223
+ return trials.filter((t) => t.variantId === variantId && t.ok).sort((a, b) => b.score - a.score).slice(0, k);
10224
+ }
10225
+ function bottomKTrialsByScore(trials, variantId, k) {
10226
+ return trials.filter((t) => t.variantId === variantId && t.ok).sort((a, b) => a.score - b.score).slice(0, k);
10227
+ }
10228
+ function samePopulation(a, b) {
10229
+ if (a.length !== b.length) return false;
10230
+ const setA = new Set(a);
10231
+ return b.every((id) => setA.has(id));
10232
+ }
10233
+
10234
+ // src/golden-matcher.ts
10235
+ function matchGoldens(goldens, candidates, options = {}) {
10236
+ const extract = options.text ?? defaultExtract5;
10237
+ const haystacks = candidates.map((c) => extract(c).toLowerCase());
10238
+ const matches2 = goldens.map((golden) => goldenMatched(golden, haystacks));
10239
+ return {
10240
+ matches: matches2,
10241
+ hits: matches2.filter(Boolean).length,
10242
+ total: goldens.length
10243
+ };
10244
+ }
10245
+ function defaultExtract5(candidate) {
10246
+ if (typeof candidate === "string") return candidate;
10247
+ if (candidate && typeof candidate === "object") {
10248
+ const parts = [];
10249
+ for (const v of Object.values(candidate)) {
10250
+ if (typeof v === "string") parts.push(v);
10251
+ }
10252
+ return parts.join(" ");
10253
+ }
10254
+ return String(candidate ?? "");
10255
+ }
10256
+ function goldenMatched(golden, haystacks) {
10257
+ for (const phrase of golden.any) {
10258
+ const needle = phrase.toLowerCase().trim();
10259
+ if (!needle) continue;
10260
+ if (haystacks.some((h) => h.includes(needle))) return true;
10261
+ }
10262
+ for (const pattern of golden.anyRegex ?? []) {
10263
+ let re;
10264
+ try {
10265
+ re = new RegExp(pattern, "i");
10266
+ } catch {
10267
+ continue;
10268
+ }
10269
+ if (haystacks.some((h) => re.test(h))) return true;
10270
+ }
10271
+ return false;
10272
+ }
10273
+ var DEFAULT_SEVERITY_WEIGHTS = {
10274
+ critical: 3,
10275
+ major: 2,
10276
+ minor: 1
10277
+ };
10278
+ function weightedRecall(goldens, result, weights = DEFAULT_SEVERITY_WEIGHTS) {
10279
+ if (goldens.length === 0) return 1;
10280
+ const total = goldens.reduce((s, g) => s + (weights[g.severity] ?? 1), 0);
10281
+ if (total === 0) return 1;
10282
+ const hit = goldens.reduce(
10283
+ (s, g, i) => s + (result.matches[i] ? weights[g.severity] ?? 1 : 0),
10284
+ 0
10285
+ );
10286
+ return hit / total;
10287
+ }
10288
+ function precision(goldens, candidates, options = {}) {
10289
+ if (candidates.length === 0) return 1;
10290
+ const extract = options.text ?? defaultExtract5;
10291
+ let matched = 0;
10292
+ for (const cand of candidates) {
10293
+ const haystack = extract(cand).toLowerCase();
10294
+ const matchedAny = goldens.some(
10295
+ (g) => g.any.some((phrase) => phrase.length > 0 && haystack.includes(phrase.toLowerCase())) || (g.anyRegex ?? []).some((pat) => {
10296
+ try {
10297
+ return new RegExp(pat, "i").test(haystack);
10298
+ } catch {
10299
+ return false;
10300
+ }
10301
+ })
10302
+ );
10303
+ if (matchedAny) matched++;
10304
+ }
10305
+ return matched / candidates.length;
10306
+ }
10307
+
10308
+ // src/orthogonality.ts
10309
/**
 * Measures how little the findings of different passes overlap lexically.
 *
 * Each pass's findings become a bag-of-words vector; the result is one minus
 * the mean pairwise cosine similarity, clamped to [0, 1].
 *
 * @param {{passes: Array<{findings: Iterable<unknown>}>, text?: Function, minTokenLength?: number}} input
 *   `text` renders a finding to a string (defaults to `defaultRender`);
 *   `minTokenLength` is the shortest token kept (defaults to 4).
 * @returns {{orthogonality: number, passCount: number, similarities: number[]}}
 *   `similarities` lists pair scores in (0,1), (0,2), ..., (1,2), ... order.
 *   With fewer than two passes the orthogonality is 1 and the list is empty.
 */
function passOrthogonality(input) {
  const { passes } = input;
  const passCount = passes.length;
  if (passCount < 2) {
    return { orthogonality: 1, passCount, similarities: [] };
  }

  const render = input.text ?? defaultRender;
  const minLen = input.minTokenLength ?? 4;
  const bags = passes.map((pass) => bagOfWords(pass.findings, render, minLen));

  // Every unordered pair exactly once, earlier index first.
  const similarities = bags.flatMap((bag, i) =>
    bags.slice(i + 1).map((other) => cosineSimilarity(bag, other))
  );

  let total = 0;
  for (const sim of similarities) total += sim;
  const meanSimilarity = similarities.length === 0 ? 0 : total / similarities.length;

  const orthogonality = Math.max(0, Math.min(1, 1 - meanSimilarity));
  return { orthogonality, passCount, similarities };
}
10330
/**
 * Default finding-to-text renderer.
 *
 * Strings pass through unchanged; objects are rendered as their string-valued
 * property values joined by single spaces; anything else is stringified, with
 * null and undefined becoming the empty string.
 *
 * @param {unknown} item - Finding to render.
 * @returns {string} Text used for tokenisation.
 */
function defaultRender(item) {
  if (typeof item === "string") return item;
  if (!item || typeof item !== "object") return String(item ?? "");
  return Object.values(item)
    .filter((value) => typeof value === "string")
    .join(" ");
}
10341
/**
 * Builds a term-frequency map over the rendered text of `items`.
 *
 * Text is lower-cased and split on runs of characters outside `a-z0-9`;
 * tokens shorter than `minLen` are dropped.
 *
 * @param {Iterable<unknown>} items - Findings to tokenise.
 * @param {(item: unknown) => string} render - Converts an item to text.
 * @param {number} minLen - Minimum token length to keep.
 * @returns {Map<string, number>} Token -> occurrence count.
 */
function bagOfWords(items, render, minLen) {
  const counts = new Map();
  for (const item of items) {
    const words = render(item).toLowerCase().split(/[^a-z0-9]+/);
    for (const word of words) {
      if (word.length >= minLen) {
        counts.set(word, (counts.get(word) ?? 0) + 1);
      }
    }
  }
  return counts;
}
10351
/**
 * Cosine similarity between two sparse term-count vectors.
 *
 * @param {Map<string, number>} a - First vector.
 * @param {Map<string, number>} b - Second vector.
 * @returns {number} dot(a, b) / (|a| * |b|), or 0 when either vector has zero magnitude.
 */
function cosineSimilarity(a, b) {
  const squaredNorm = (vector) => {
    let sum = 0;
    for (const [, count] of vector) sum += count * count;
    return sum;
  };

  const normA = squaredNorm(a);
  const normB = squaredNorm(b);
  // A zero vector has no direction; report no similarity instead of dividing by zero.
  if (normA === 0 || normB === 0) return 0;

  let dot = 0;
  for (const [term, count] of a) {
    const other = b.get(term);
    if (other) dot += count * other;
  }
  return dot / (Math.sqrt(normA) * Math.sqrt(normB));
}
10364
+
10365
+ // src/promotion-gate.ts
10366
/**
 * Percentile-bootstrap confidence interval for the difference in means
 * (candidate - baseline), plus a promotion verdict.
 *
 * Verdicts:
 * - "ADVANCE"      - the whole interval lies above 0.
 * - "REVERT"       - the whole interval lies below 0.
 * - "KEEP"         - the interval straddles 0 and the observed delta is >= 0.
 * - "INCONCLUSIVE" - the interval straddles 0 with a negative delta, or there
 *                    are too few samples to resample at all.
 *
 * @param {number[]} baseline - Scores for the current variant.
 * @param {number[]} candidate - Scores for the proposed variant.
 * @param {{alpha?: number, iterations?: number, minTotalSamples?: number, seed?: number}} [options]
 *   `alpha` (default 0.05) sets the two-sided interval width, `iterations`
 *   (default 1000) the number of resamples, `minTotalSamples` (default 6) the
 *   combined sample floor, and `seed` the PRNG seed (derived from the data via
 *   `hashSeed` when omitted).
 * @returns {{baselineMean: number, candidateMean: number, delta: number,
 *   ciLower: number, ciUpper: number, iterations: number, alpha: number, verdict: string}}
 *   When samples are insufficient, the interval is (-Infinity, Infinity) and
 *   `iterations` is reported as 0.
 */
function bootstrapCi(baseline, candidate, options = {}) {
  const alpha = options.alpha ?? 0.05;
  const iterations = options.iterations ?? 1e3;
  const minTotal = options.minTotalSamples ?? 6;
  const baselineMean = mean6(baseline);
  const candidateMean = mean6(candidate);
  const delta = candidateMean - baselineMean;

  const tooFewSamples =
    baseline.length + candidate.length < minTotal ||
    baseline.length === 0 ||
    candidate.length === 0;
  if (tooFewSamples) {
    return {
      baselineMean,
      candidateMean,
      delta,
      ciLower: -Infinity,
      ciUpper: Infinity,
      iterations: 0,
      alpha,
      verdict: "INCONCLUSIVE"
    };
  }

  // Seed only once resampling is certain to happen; hashing the data is
  // wasted work on the early-return path above.
  const rng = mulberry32(options.seed ?? hashSeed(baseline, candidate));
  const deltas = new Array(iterations);
  for (let i = 0; i < iterations; i++) {
    const bResample = resample(baseline, rng);
    const cResample = resample(candidate, rng);
    deltas[i] = mean6(cResample) - mean6(bResample);
  }
  deltas.sort((a, b) => a - b);

  // Clamp both percentile indices into the array. Without the lower clamp on
  // the upper index, a tiny `iterations` (e.g. 1) yields index -1 and an
  // undefined upper bound.
  const lastIdx = iterations - 1;
  const clampIdx = (idx) => Math.max(0, Math.min(lastIdx, idx));
  const ciLower = deltas[clampIdx(Math.floor(alpha / 2 * iterations))];
  const ciUpper = deltas[clampIdx(Math.floor((1 - alpha / 2) * iterations) - 1)];

  let verdict;
  if (ciLower > 0) verdict = "ADVANCE";
  else if (ciUpper < 0) verdict = "REVERT";
  else if (delta >= 0) verdict = "KEEP";
  else verdict = "INCONCLUSIVE";

  return {
    baselineMean,
    candidateMean,
    delta,
    ciLower,
    ciUpper,
    iterations,
    alpha,
    verdict
  };
}
10413
/**
 * Arithmetic mean of a numeric list.
 *
 * @param {ArrayLike<number>} xs - Values to average.
 * @returns {number} The mean, or 0 for an empty list.
 */
function mean6(xs) {
  const count = xs.length;
  if (count === 0) return 0;
  let total = 0;
  let index = 0;
  while (index < count) {
    total += xs[index];
    index += 1;
  }
  return total / count;
}
10419
/**
 * Draws `xs.length` elements from `xs` uniformly at random, with replacement.
 *
 * @param {ArrayLike<number>} xs - Population to sample from.
 * @param {() => number} rng - Returns a number in [0, 1); called once per draw.
 * @returns {number[]} A new array of the same length as `xs`.
 */
function resample(xs, rng) {
  const size = xs.length;
  return Array.from({ length: size }, () => xs[Math.floor(rng() * size)]);
}
10424
/**
 * Creates a deterministic mulberry32 pseudo-random generator.
 *
 * @param {number} seed - Seed; coerced to an unsigned 32-bit integer.
 * @returns {() => number} Generator yielding values in [0, 1).
 */
function mulberry32(seed) {
  let state = seed >>> 0;
  return () => {
    // Keep the state inside 32 bits. Left to grow as a plain double it passes
    // 2^53 after roughly 4.9 million draws, after which adding the odd
    // increment is rounded and the stream no longer follows the algorithm.
    // While the unwrapped sum is still exact the output is bit-for-bit the
    // same, because every later operation truncates to 32 bits anyway.
    state = (state + 0x6d2b79f5) >>> 0;
    let r = state;
    r = Math.imul(r ^ (r >>> 15), r | 1);
    r ^= r + Math.imul(r ^ (r >>> 7), r | 61);
    return ((r ^ (r >>> 14)) >>> 0) / 4294967296;
  };
}
10434
/**
 * Derives a 32-bit seed from two numeric samples by running an FNV-1a style
 * hash over the raw float64 bytes of every value in `a` followed by `b`.
 *
 * @param {Iterable<number>} a - First sample.
 * @param {Iterable<number>} b - Second sample.
 * @returns {number} Unsigned 32-bit hash.
 */
function hashSeed(a, b) {
  const FNV_PRIME = 16777619;
  // One scratch slot viewed as bytes; writing a value exposes its 8 raw bytes.
  const scratch = new Float64Array(1);
  const octets = new Uint8Array(scratch.buffer);

  let hash = 2166136261;
  for (const sample of [...a, ...b]) {
    scratch[0] = sample;
    for (let k = 0; k < octets.length; k++) {
      hash = Math.imul(hash ^ octets[k], FNV_PRIME);
    }
  }
  return hash >>> 0;
}
10446
/**
 * Scores baseline and candidate outputs with the same judge, then runs the
 * bootstrap promotion gate on the two score lists.
 *
 * The baseline batch is scored to completion before the candidate batch starts.
 *
 * @param {{baselineOutputs: unknown[], candidateOutputs: unknown[], judge: Function,
 *   judgeConcurrency?: number, alpha?: number, iterations?: number, seed?: number}} args
 *   `judgeConcurrency` defaults to 4; `alpha`, `iterations` and `seed` are
 *   forwarded to `bootstrapCi` only when they are not `undefined`.
 * @returns {Promise<object>} The `bootstrapCi` result extended with
 *   `baselineSamples` and `candidateSamples` counts.
 */
async function judgeReplayGate(args) {
  const workers = args.judgeConcurrency ?? 4;
  const baselineScores = await scoreAll(args.baselineOutputs, args.judge, workers);
  const candidateScores = await scoreAll(args.candidateOutputs, args.judge, workers);

  // Forward only explicitly provided knobs so bootstrapCi's own defaults apply otherwise.
  const ciOptions = {};
  for (const key of ["alpha", "iterations", "seed"]) {
    if (args[key] !== void 0) ciOptions[key] = args[key];
  }

  const ci = bootstrapCi(baselineScores, candidateScores, ciOptions);
  return {
    ...ci,
    baselineSamples: baselineScores.length,
    candidateSamples: candidateScores.length
  };
}
10461
/**
 * Scores every output with `judge`, running at most `concurrency` judge calls
 * at a time, and returns the scores in input order.
 *
 * Non-finite judge results (NaN, Infinity, non-numbers) are recorded as 0.
 * If a judge call rejects, the returned promise rejects with that error.
 *
 * @param {unknown[]} outputs - Items to score.
 * @param {(output: unknown) => number | Promise<number>} judge - Scoring function.
 * @param {number} concurrency - Desired number of parallel workers. Values
 *   below 1 or not a number fall back to a single worker; values above
 *   `outputs.length` (including Infinity) are capped at `outputs.length`.
 * @returns {Promise<number[]>} One finite score per output.
 */
async function scoreAll(outputs, judge, concurrency) {
  const results = new Array(outputs.length);

  // Clamp the worker count to [1, outputs.length]. An unclamped NaN would
  // spawn no workers (leaving every slot unscored), and Infinity would ask
  // Array.from for an impossibly long array.
  const requested = Number(concurrency);
  const wanted = Number.isNaN(requested) ? 1 : Math.floor(requested);
  const workerCount = Math.min(outputs.length, Math.max(1, wanted));

  let next = 0;
  async function worker() {
    while (true) {
      // Claiming the index is synchronous, so no two workers share an item.
      const i = next++;
      if (i >= outputs.length) return;
      const v = await judge(outputs[i]);
      results[i] = Number.isFinite(v) ? v : 0;
    }
  }

  await Promise.all(Array.from({ length: workerCount }, () => worker()));
  return results;
}
10475
+
10476
+ // src/reflective-mutation.ts
10477
/**
 * Edit operations listed under "Allowed mutation primitives" in the reflection
 * prompt when the caller does not supply its own list.
 */
var DEFAULT_MUTATION_PRIMITIVES = [
  // Wording-level edits.
  'Strengthen an imperative ("should" \u2192 "must")',
  "Add a concrete example pulled from a missed-golden phrase",
  "Remove a redundant rule that did not improve recall",
  'Add a counterfactual ("if X is missing, the score is capped at Y")',
  // Structure- and vocabulary-level edits.
  "Reorder sections so the highest-impact rule is first",
  "Replace abstract language with a domain-specific noun the trial misses"
];
10485
/**
 * Renders the markdown prompt that asks a model to propose mutations of one
 * prompt component, using the best and worst trials as evidence.
 *
 * Sections, in order: mutation target and instructions, the current variant as
 * JSON, failures (only when there are bottom trials), successes (only when
 * there are top trials), the allowed mutation primitives, and the JSON output
 * schema.
 *
 * @param {object} ctx
 * @param {string} ctx.target - Name of the component being tuned.
 * @param {unknown} ctx.parentPayload - Current variant, embedded via JSON.stringify.
 * @param {Array<object>} ctx.topTrials - Trials to preserve; `id`, `score`, optional `inputName`.
 * @param {Array<object>} ctx.bottomTrials - Failing trials; additionally optional
 *   `expectations` (`id`, `phrase`, `matched`) and `emitted` text, which is cut to 600 characters.
 * @param {number} ctx.childCount - Number of mutations to request.
 * @param {Iterable<string>} [ctx.mutationPrimitives] - Overrides DEFAULT_MUTATION_PRIMITIVES.
 * @returns {string} Prompt lines joined with "\n".
 */
function buildReflectionPrompt(ctx) {
  const primitives = ctx.mutationPrimitives ?? DEFAULT_MUTATION_PRIMITIVES;
  const lines = [];
  const add = (...chunk) => {
    lines.push(...chunk);
  };
  // "score 0.50" plus " (input name)" when the trial has one.
  const scoreLabel = (trial) =>
    `score ${trial.score.toFixed(2)}${trial.inputName ? ` (${trial.inputName})` : ""}`;

  add(
    `# Mutation target: ${ctx.target}`,
    "",
    `You are tuning the prompt component named \`${ctx.target}\`. The current variant is shown below; you have ${ctx.topTrials.length} top trials and ${ctx.bottomTrials.length} bottom trials as evidence. Propose ${ctx.childCount} mutation${ctx.childCount === 1 ? "" : "s"} that fix specific weaknesses visible in the bottom trials. Avoid blank rephrasings.`,
    "",
    "## Current variant",
    "```json",
    JSON.stringify(ctx.parentPayload, null, 2),
    "```",
    ""
  );

  if (ctx.bottomTrials.length > 0) {
    add("## Failures (bottom trials) \u2014 what went wrong", "");
    for (const trial of ctx.bottomTrials) {
      add(`### Trial \`${trial.id}\` \u2014 ${scoreLabel(trial)}`);

      const missed = (trial.expectations ?? []).filter((e) => !e.matched);
      if (missed.length > 0) {
        add(
          "",
          "**Missed expectations:**",
          ...missed.map((m) => `- \`${m.id}\`: should match phrase \`${quote(m.phrase)}\``)
        );
      }

      if (trial.emitted) {
        add("", "**What the agent emitted:**", "```", truncate3(trial.emitted, 600), "```");
      }
      add("");
    }
  }

  if (ctx.topTrials.length > 0) {
    add("## Successes (top trials) \u2014 what to preserve", "");
    for (const trial of ctx.topTrials) {
      add(`- \`${trial.id}\`: ${scoreLabel(trial)}`);
    }
    add("");
  }

  add("## Allowed mutation primitives", "");
  for (const p of primitives) add(`- ${p}`);
  add("");

  // Placeholder object showing the response shape the model must produce.
  const responseShape = {
    proposals: [
      {
        label: "<short label, \u2264 40 chars>",
        rationale: "<which failure this targets and which primitive you used>",
        payload: "<full payload of the new variant \u2014 same shape as the current variant>"
      }
    ]
  };
  add(
    "## Output schema",
    "",
    "Respond with a JSON object \u2014 no prose, no markdown fences:",
    "```json",
    JSON.stringify(responseShape, null, 2),
    "```"
  );

  return lines.join("\n");
}
10552
/**
 * Cuts `s` down to at most `max` characters, appending a truncation marker
 * when anything was removed.
 *
 * @param {string} s - Text to shorten.
 * @param {number} max - Maximum number of characters kept from `s`.
 * @returns {string} `s` unchanged when it already fits, otherwise its first
 *   `max` characters followed by "\u2026 [truncated]".
 */
function truncate3(s, max) {
  return s.length <= max ? s : `${s.slice(0, max)}\u2026 [truncated]`;
}
10556
/**
 * Escapes backticks so the text can sit inside a markdown inline-code span.
 *
 * @param {string} s - Raw text.
 * @returns {string} `s` with every backtick prefixed by a backslash.
 */
function quote(s) {
  return s.split("`").join("\\`");
}
10559
/**
 * Extracts mutation proposals from a model's reply to the reflection prompt.
 *
 * Parsing is deliberately forgiving: a surrounding markdown code fence is
 * stripped, text outside the outermost `{ ... }` is ignored, and any input
 * that cannot be interpreted yields an empty list instead of an error.
 *
 * @param {string} raw - Raw model response.
 * @param {number} [maxProposals] - Upper bound on returned proposals; no limit
 *   when omitted. A limit of 0 (or less) returns no proposals.
 * @returns {Array<{label: string, rationale: string, payload: unknown}>}
 *   Entries that are objects with a `payload` key, in response order. A
 *   non-string `label` becomes "mutation"; a non-string `rationale` becomes "".
 */
function parseReflectionResponse(raw, maxProposals) {
  // Anything that is not text is treated like any other malformed response.
  if (typeof raw !== "string") return [];

  let text = raw.trim();
  if (text.startsWith("```")) text = text.replace(/^```(?:json)?\n?/, "").replace(/\n?```$/, "");

  // Tolerate prose before or after the JSON object.
  const start = text.indexOf("{");
  const end = text.lastIndexOf("}");
  if (start < 0 || end <= start) return [];

  let parsed;
  try {
    parsed = JSON.parse(text.slice(start, end + 1));
  } catch {
    return [];
  }
  if (!parsed || typeof parsed !== "object") return [];

  const proposalsRaw = parsed.proposals;
  if (!Array.isArray(proposalsRaw)) return [];

  const out = [];
  for (const p of proposalsRaw) {
    // Enforce the cap before adding, so a cap of 0 really yields nothing.
    if (maxProposals !== void 0 && out.length >= maxProposals) break;
    if (!p || typeof p !== "object") continue;
    if (!("payload" in p)) continue;
    out.push({
      label: typeof p.label === "string" ? p.label : "mutation",
      rationale: typeof p.rationale === "string" ? p.rationale : "",
      payload: p.payload
    });
  }
  return out;
}
9857
10588
  export {
9858
10589
  AgentDriver,
9859
10590
  AxGepaSteeringOptimizer,
@@ -9868,10 +10599,12 @@ export {
9868
10599
  DEFAULT_RULES as DEFAULT_FAILURE_RULES,
9869
10600
  DEFAULT_FINDERS,
9870
10601
  DEFAULT_HARNESS_OBJECTIVES,
10602
+ DEFAULT_MUTATION_PRIMITIVES,
9871
10603
  DEFAULT_MUTATORS,
9872
10604
  DEFAULT_REDACTION_RULES,
9873
10605
  DEFAULT_RED_TEAM_CORPUS,
9874
10606
  DEFAULT_RUN_SCORE_WEIGHTS,
10607
+ DEFAULT_SEVERITY_WEIGHTS,
9875
10608
  Dataset,
9876
10609
  DockerSandboxDriver,
9877
10610
  DualAgentBench,
@@ -9886,6 +10619,7 @@ export {
9886
10619
  InMemoryExperimentStore,
9887
10620
  InMemoryOutcomeStore,
9888
10621
  InMemoryTraceStore,
10622
+ InMemoryTrialCache,
9889
10623
  InMemoryWorkspaceInspector,
9890
10624
  JudgeRunner,
9891
10625
  LlmCallError,
@@ -9921,7 +10655,9 @@ export {
9921
10655
  benjaminiHochberg,
9922
10656
  bisect,
9923
10657
  bonferroni,
10658
+ bootstrapCi,
9924
10659
  budgetBreachView,
10660
+ buildReflectionPrompt,
9925
10661
  buildReviewerPrompt,
9926
10662
  buildTrajectory,
9927
10663
  byteLengthRange,
@@ -9959,6 +10695,7 @@ export {
9959
10695
  createLlmReviewer,
9960
10696
  createSemanticConceptJudge,
9961
10697
  crossTraceDiff,
10698
+ crowdingDistance,
9962
10699
  decideReferenceReplayPromotion,
9963
10700
  decideReferenceReplayRunPromotion,
9964
10701
  defaultJudges,
@@ -9992,6 +10729,7 @@ export {
9992
10729
  formatBenchmarkReport,
9993
10730
  formatDriverReport,
9994
10731
  formatFindings,
10732
+ precision as goldenPrecision,
9995
10733
  gradeSemanticStatus,
9996
10734
  groupBy,
9997
10735
  hashContent,
@@ -10013,6 +10751,7 @@ export {
10013
10751
  jsonlReferenceReplayStore,
10014
10752
  jsonlReviewStore,
10015
10753
  judgeAgreementView,
10754
+ judgeReplayGate,
10016
10755
  judgeSpans,
10017
10756
  keyPreserved,
10018
10757
  linterJudge,
@@ -10022,6 +10761,7 @@ export {
10022
10761
  localCommandRunner,
10023
10762
  lowercaseMutator,
10024
10763
  mannWhitneyU,
10764
+ matchGoldens,
10025
10765
  mergeLayerResults,
10026
10766
  mergeSteeringBundle,
10027
10767
  multiToolchainLayer,
@@ -10033,7 +10773,10 @@ export {
10033
10773
  pairedTTest,
10034
10774
  paraphraseRobustness,
10035
10775
  paretoFrontier,
10776
+ paretoFrontierWithCrowding,
10777
+ parseReflectionResponse,
10036
10778
  partialCredit,
10779
+ passOrthogonality,
10037
10780
  pixelDeltaRatio,
10038
10781
  politenessPrefixMutator,
10039
10782
  positionalBias,
@@ -10048,6 +10791,8 @@ export {
10048
10791
  redTeamReport,
10049
10792
  redactString,
10050
10793
  redactValue,
10794
+ referenceReplayRunsToSteeringRows,
10795
+ referenceReplayScenarioToRunScore,
10051
10796
  regexMatch,
10052
10797
  regexMatches,
10053
10798
  regressionView,
@@ -10071,12 +10816,14 @@ export {
10071
10816
  runJudgeFleet,
10072
10817
  runKeywordCoverageJudge,
10073
10818
  runKeywordCoverageJudgeUrl,
10819
+ runPromptEvolution,
10074
10820
  runProposeReview,
10075
10821
  runReferenceReplay,
10076
10822
  runSelfPlay,
10077
10823
  runSemanticConceptJudge,
10078
10824
  runTestGradedScenario,
10079
10825
  runsForScenario,
10826
+ scalarScore,
10080
10827
  scanForMuffledGates,
10081
10828
  scoreAllProjects,
10082
10829
  scoreContinuity,
@@ -10113,6 +10860,7 @@ export {
10113
10860
  viteDeployRunner,
10114
10861
  vitestTestParser,
10115
10862
  weightedMean,
10863
+ weightedRecall,
10116
10864
  welchsTTest,
10117
10865
  whitespaceCollapseMutator,
10118
10866
  wilcoxonSignedRank