npm - @tangle-network/agent-eval - Versions diffs - 0.11.0 → 0.12.0 - Mend

@tangle-network/agent-eval 0.11.0 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11)

package/dist/index.d.ts +498 -4
package/dist/index.js +786 -38
package/dist/index.js.map +1 -1
package/dist/sink-fetch-C0B8ximv.d.ts +101 -0
package/dist/telemetry/file.d.ts +19 -0
package/dist/telemetry/file.js +40 -0
package/dist/telemetry/file.js.map +1 -0
package/dist/telemetry/index.d.ts +38 -0
package/dist/telemetry/index.js +128 -0
package/dist/telemetry/index.js.map +1 -0
package/package.json +18 -9

package/dist/index.js (changed)

@@ -410,7 +410,7 @@ function confidenceInterval(scores, confidence = 0.95) {
   if (scores.length === 0) return { mean: 0, lower: 0, upper: 0 };
   if (scores.length === 1) return { mean: scores[0], lower: scores[0], upper: scores[0] };
   const n = scores.length;
-  const mean5 = scores.reduce((a, b) => a + b, 0) / n;
+  const mean7 = scores.reduce((a, b) => a + b, 0) / n;
   const B = 1e3;
   const bootstrapMeans = [];
   for (let i = 0; i < B; i++) {
@@ -425,7 +425,7 @@ function confidenceInterval(scores, confidence = 0.95) {
   const lowerIdx = Math.floor(alpha / 2 * B);
   const upperIdx = Math.floor((1 - alpha / 2) * B) - 1;
   return {
-    mean: mean5,
+    mean: mean7,
     lower: bootstrapMeans[lowerIdx],
     upper: bootstrapMeans[Math.min(upperIdx, B - 1)]
   };
@@ -513,11 +513,11 @@ function pairedTTest(before, after) {
   const n = before.length;
   if (n < 2) return { t: 0, df: 0, p: 1 };
   const diffs = before.map((b, i) => after[i] - b);
-  const mean5 = diffs.reduce((a, b) => a + b, 0) / n;
-  const variance2 = diffs.reduce((acc, d) => acc + (d - mean5) ** 2, 0) / (n - 1);
+  const mean7 = diffs.reduce((a, b) => a + b, 0) / n;
+  const variance2 = diffs.reduce((acc, d) => acc + (d - mean7) ** 2, 0) / (n - 1);
   const se = Math.sqrt(variance2 / n);
-  if (se === 0) return { t: mean5 === 0 ? 0 : Infinity, df: n - 1, p: mean5 === 0 ? 1 : 0 };
-  const t = mean5 / se;
+  if (se === 0) return { t: mean7 === 0 ? 0 : Infinity, df: n - 1, p: mean7 === 0 ? 1 : 0 };
+  const t = mean7 / se;
   const df = n - 1;
   const p = 2 * (1 - studentTCdf(Math.abs(t), df));
   return { t, df, p };
@@ -541,9 +541,9 @@ function wilcoxonSignedRank(before, after) {
   }
   let wPlus = 0;
   for (let k = 0; k < n; k++) if (diffs[k] > 0) wPlus += ranks3[k];
-  const mean5 = n * (n + 1) / 4;
+  const mean7 = n * (n + 1) / 4;
   const variance2 = n * (n + 1) * (2 * n + 1) / 24;
-  const z = (wPlus - mean5) / Math.sqrt(variance2);
+  const z = (wPlus - mean7) / Math.sqrt(variance2);
   const p = 2 * (1 - normalCdf(Math.abs(z)));
   return { w: wPlus, p };
 }
@@ -2486,6 +2486,56 @@ function paretoFrontier(candidates, objectives) {
   }));
   return { frontier, dominated, dominanceMap };
 }
+/**
+ * Collapse several objectives into a single weighted score per candidate.
+ *
+ * Each objective is min-max normalised over the finite values observed across
+ * `candidates`, inverted unless its `direction` is "maximize", and combined
+ * using `options.weights[objective.name]` (default 1) divided by the sum of
+ * all objective weights. Non-finite objective values contribute nothing.
+ *
+ * @param {Array} candidates Items to score.
+ * @param {Array} objectives Objects exposing `name`, `direction` and `value(candidate)`.
+ * @param {{weights?: Object}} [options] Optional weights keyed by objective name.
+ * @returns {Array<{candidate: *, score: number}>} One entry per candidate, in input order.
+ */
+function scalarScore(candidates, objectives, options = {}) {
+  if (candidates.length === 0) return [];
+  const weights = options.weights ?? {};
+  const totalWeight = objectives.reduce((s, o) => s + (weights[o.name] ?? 1), 0);
+  // Evaluate every objective once per candidate; reused for both ranges and scores.
+  const columns = objectives.map((obj) => candidates.map((c) => obj.value(c)));
+  const ranges = columns.map((column) => {
+    // Plain loop rather than Math.min(...values): spreading a very large
+    // population can exceed the engine's argument limit.
+    let min = Infinity;
+    let max = -Infinity;
+    for (const v of column) {
+      if (!Number.isFinite(v)) continue;
+      if (v < min) min = v;
+      if (v > max) max = v;
+    }
+    // No finite value at all: fall back to the unit range.
+    if (min === Infinity) return { min: 0, max: 1 };
+    // A flat objective gets a unit-width range so the division below is safe.
+    return { min, max: max === min ? min + 1 : max };
+  });
+  return candidates.map((c, row) => {
+    let score = 0;
+    objectives.forEach((obj, i) => {
+      const v = columns[i][row];
+      if (!Number.isFinite(v)) return;
+      const { min, max } = ranges[i];
+      const normalised = (v - min) / (max - min);
+      const directional = obj.direction === "maximize" ? normalised : 1 - normalised;
+      // A zero weight sum would turn every score into NaN (0 / 0); contribute nothing instead.
+      const weight = totalWeight === 0 ? 0 : (weights[obj.name] ?? 1) / totalWeight;
+      score += directional * weight;
+    });
+    return { candidate: c, score };
+  });
+}
+/**
+ * Compute a per-candidate crowding distance across all objectives.
+ *
+ * For each objective the candidates are sorted by value; the two extremes get
+ * an infinite distance, and every interior candidate accumulates the gap
+ * between its neighbours divided by the objective's value range.
+ *
+ * @param {Array} candidates Items to measure (used as Map keys, so compared by identity).
+ * @param {Array} objectives Objects exposing `value(candidate)`.
+ * @returns {Array<{candidate: *, distance: number}>} One entry per candidate, in input order.
+ */
+function crowdingDistance(candidates, objectives) {
+  // Nothing to rank; also avoids evaluating obj.value(undefined) on an empty sort result.
+  if (candidates.length === 0) return [];
+  const distances = new Map(candidates.map((c) => [c, 0]));
+  for (const obj of objectives) {
+    const sorted = [...candidates].sort((a, b) => obj.value(a) - obj.value(b));
+    const min = obj.value(sorted[0]);
+    const max = obj.value(sorted[sorted.length - 1]);
+    // A flat objective would give a zero range; use 1 so the division stays finite.
+    const range = max - min || 1;
+    distances.set(sorted[0], Infinity);
+    distances.set(sorted[sorted.length - 1], Infinity);
+    for (let i = 1; i < sorted.length - 1; i++) {
+      const prev = obj.value(sorted[i - 1]);
+      const next = obj.value(sorted[i + 1]);
+      const current = distances.get(sorted[i]);
+      // Already a boundary point for some other objective: keep it infinite.
+      if (current === Infinity) continue;
+      distances.set(sorted[i], current + (next - prev) / range);
+    }
+  }
+  return candidates.map((c) => ({ candidate: c, distance: distances.get(c) ?? 0 }));
+}
+/**
+ * Return the Pareto frontier of `candidates`, ordered by descending crowding distance.
+ *
+ * @param {Array} candidates Items to rank.
+ * @param {Array} objectives Objectives passed through to `paretoFrontier` and `crowdingDistance`.
+ * @returns {Array<{candidate: *, distance: number}>} Frontier members, most isolated first.
+ */
+function paretoFrontierWithCrowding(candidates, objectives) {
+  const nonDominated = paretoFrontier(candidates, objectives).frontier;
+  if (nonDominated.length === 0) return [];
+  const byDistanceDescending = (left, right) => right.distance - left.distance;
+  return crowdingDistance(nonDominated, objectives).sort(byDistanceDescending);
+}
 // src/harness-optimizer.ts
 var DEFAULT_HARNESS_OBJECTIVES = [
@@ -5095,10 +5145,10 @@ function analyzeSeries(values, options = {}) {
     return { state: "insufficient-data", windowMean: 0, windowCv: 0, tailRun: 0, stable: false };
   }
   const tail = values.slice(-window);
-  const mean5 = tail.reduce((a, b) => a + b, 0) / tail.length;
-  const variance2 = tail.reduce((acc, v) => acc + (v - mean5) ** 2, 0) / tail.length;
+  const mean7 = tail.reduce((a, b) => a + b, 0) / tail.length;
+  const variance2 = tail.reduce((acc, v) => acc + (v - mean7) ** 2, 0) / tail.length;
   const stdDev = Math.sqrt(variance2);
-  const refMean = Math.abs(mean5) > 1e-9 ? Math.abs(mean5) : 1;
+  const refMean = Math.abs(mean7) > 1e-9 ? Math.abs(mean7) : 1;
   const cv = stdDev / refMean;
   const stable = tail.length >= window && cv <= stableCv;
   let tailRun = 0;
@@ -5119,7 +5169,7 @@ function analyzeSeries(values, options = {}) {
   } else {
     state = "noisy";
   }
-  return { state, windowMean: mean5, windowCv: cv, tailRun, stable };
+  return { state, windowMean: mean7, windowCv: cv, tailRun, stable };
 }
 // src/state-continuity.ts
@@ -6047,12 +6097,12 @@ async function paraphraseRobustness(prompt, mutators, scoreFn, options = {}) {
     variantScores.push({ mutator: id, score, mutated });
     all.push(score);
   }
-  const mean5 = all.reduce((a, b) => a + b, 0) / all.length;
-  const variance2 = all.reduce((a, v) => a + (v - mean5) ** 2, 0) / all.length;
+  const mean7 = all.reduce((a, b) => a + b, 0) / all.length;
+  const variance2 = all.reduce((a, v) => a + (v - mean7) ** 2, 0) / all.length;
   const stdDev = Math.sqrt(variance2);
-  const ref = Math.abs(mean5) > 1e-9 ? Math.abs(mean5) : 1;
+  const ref = Math.abs(mean7) > 1e-9 ? Math.abs(mean7) : 1;
   const robustness = Math.max(0, 1 - stdDev / ref);
-  return { originalScore, variantScores, meanScore: mean5, stdDev, robustness };
+  return { originalScore, variantScores, meanScore: mean7, stdDev, robustness };
 }
 var lowercaseMutator = (p) => p.toLowerCase();
 var sentenceReorderMutator = (p, seed) => {
@@ -6973,8 +7023,8 @@ async function prmBestOfN(store, grader, runIds) {
   if (runIds.length === 0) throw new Error("prmBestOfN: at least 1 candidate required");
   const graded = await Promise.all(runIds.map((id) => grader.grade(store, id)));
   const ranked = [...graded].sort((a, b) => b.aggregateScore - a.aggregateScore);
-  const mean5 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
-  const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean5) ** 2, 0) / graded.length;
+  const mean7 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
+  const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean7) ** 2, 0) / graded.length;
   return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
 }
 async function prmEnsembleBestOfN(store, graders, runIds) {
@@ -6996,8 +7046,8 @@ async function prmEnsembleBestOfN(store, graders, runIds) {
   const ranked = [...byRun.values()].sort(
     (a, b) => (bordaScores.get(b.runId) ?? 0) - (bordaScores.get(a.runId) ?? 0)
   );
-  const mean5 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
-  const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean5) ** 2, 0) / ranked.length;
+  const mean7 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
+  const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean7) ** 2, 0) / ranked.length;
   return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
 }
@@ -7527,8 +7577,8 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
     const sRuns = runs.filter((r) => r.scenarioId === s.id);
     const scores = sRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
     if (scores.length < 3) continue;
-    const mean5 = scores.reduce((a, b) => a + b, 0) / scores.length;
-    const variance2 = scores.reduce((a, b) => a + (b - mean5) ** 2, 0) / scores.length;
+    const mean7 = scores.reduce((a, b) => a + b, 0) / scores.length;
+    const variance2 = scores.reduce((a, b) => a + (b - mean7) ** 2, 0) / scores.length;
     if (variance2 > varianceThreshold) {
       targets.push({
         reason: "high-variance",
@@ -9491,6 +9541,7 @@ async function runReferenceReplay(cases, options) {
     const scoreOptions2 = {
       matcher: options.matcher,
       matchThreshold: options.matchThreshold,
+      matchStrategy: options.matchStrategy,
       includeHoldout: true
     };
     const scenarioScore = scoreReferenceReplay([scenario], scoreOptions2).scenarios[0];
@@ -9510,6 +9561,7 @@ async function runReferenceReplay(cases, options) {
   const scoreOptions = {
     matcher: options.matcher,
     matchThreshold: options.matchThreshold,
+    matchStrategy: options.matchStrategy,
     includeHoldout: true
   };
   const run = {
@@ -9560,12 +9612,13 @@ function jsonlReferenceReplayStore(path) {
 function scoreReferenceReplay(scenarios, options = {}) {
   const matcher = options.matcher ?? defaultReferenceReplayMatcher;
   const threshold = options.matchThreshold ?? DEFAULT_MATCH_THRESHOLD;
+  const matchStrategy = options.matchStrategy ?? "reference-order";
   const allowedSplits = new Set(options.splits ?? ALL_SPLITS);
   const scores = scenarios.filter((scenario) => {
     const split = scenario.split ?? "train";
     if (split === "holdout" && !options.includeHoldout) return false;
     return allowedSplits.has(split);
-  }).map((scenario) => scoreScenario(scenario, matcher, threshold));
+  }).map((scenario) => scoreScenario(scenario, matcher, threshold, matchStrategy));
   return {
     scenarios: scores,
     aggregate: aggregateScenarioScores(scores),
@@ -9664,18 +9717,18 @@ function defaultReferenceReplayMatcher(reference, candidate) {
   const score = clamp012(textScore * 0.85 + tagScore + severityScore);
   return { score, reason: `token=${textScore.toFixed(2)} tags=${tagScore.toFixed(2)} severity=${severityScore.toFixed(2)}` };
 }
-function scoreScenario(scenario, matcher, threshold) {
+function scoreScenario(scenario, matcher, threshold, matchStrategy) {
+  return matchStrategy === "global-greedy" ? scoreScenarioGlobalGreedy(scenario, matcher, threshold) : scoreScenarioReferenceOrder(scenario, matcher, threshold);
+}
+function scoreScenarioReferenceOrder(scenario, matcher, threshold) {
   const candidatesLeft = scenario.candidates.map((candidate, index) => ({ candidate, index }));
   const matches2 = [];
   for (const reference of scenario.references) {
     let best = null;
     for (const item of candidatesLeft) {
-      const result = matcher(reference, item.candidate, scenario);
-      if (!Number.isFinite(result.score)) {
-        throw new Error(`reference replay matcher returned non-finite score for ${scenario.id}:${reference.id}:${item.candidate.id}`);
-      }
+      const result = scorePair(scenario, matcher, reference, item.candidate);
       if (!best || result.score > best.score) {
-        best = { ...item, score: clamp012(result.score), reason: result.reason ?? "" };
+        best = { ...item, ...result };
       }
     }
     const weight = reference.weight ?? 1;
@@ -9703,12 +9756,72 @@ function scoreScenario(scenario, matcher, threshold) {
       });
     }
   }
+  return buildScenarioScore(scenario, matches2, candidatesLeft.length);
+}
+/**
+ * Score one scenario by greedily assigning references to candidates in
+ * descending order of pair score, each reference and candidate used at most once.
+ *
+ * @param scenario Scenario with `id`, `references` and `candidates`.
+ * @param matcher Pair matcher forwarded to `scorePair`.
+ * @param threshold Minimum pair score for a pair to count as a match.
+ * @returns The value built by `buildScenarioScore` from the per-reference matches.
+ */
+function scoreScenarioGlobalGreedy(scenario, matcher, threshold) {
+  // Score every reference x candidate combination up front.
+  const pairs = [];
+  for (const [referenceIndex, reference] of scenario.references.entries()) {
+    for (const [candidateIndex, candidate] of scenario.candidates.entries()) {
+      pairs.push({
+        referenceIndex,
+        candidateIndex,
+        reference,
+        candidate,
+        ...scorePair(scenario, matcher, reference, candidate)
+      });
+    }
+  }
+  // Highest score first; ties resolved by reference index, then candidate index.
+  pairs.sort(
+    (a, b) => b.score - a.score || a.referenceIndex - b.referenceIndex || a.candidateIndex - b.candidateIndex
+  );
+  const selectedByReference = /* @__PURE__ */ new Map();
+  const selectedCandidates = /* @__PURE__ */ new Set();
+  for (const pair of pairs) {
+    // Pairs are sorted, so every remaining pair is below the threshold too.
+    if (pair.score < threshold) break;
+    // Skip pairs whose reference or candidate has already been assigned.
+    if (selectedByReference.has(pair.referenceIndex) || selectedCandidates.has(pair.candidateIndex)) continue;
+    selectedByReference.set(pair.referenceIndex, pair);
+    selectedCandidates.add(pair.candidateIndex);
+  }
+  // Emit one match record per reference, in reference order.
+  const matches2 = scenario.references.map((reference, referenceIndex) => {
+    const weight = reference.weight ?? 1;
+    const selected = selectedByReference.get(referenceIndex);
+    if (selected) {
+      return {
+        scenarioId: scenario.id,
+        referenceId: reference.id,
+        candidateId: selected.candidate.id,
+        score: selected.score,
+        matched: true,
+        weight,
+        reason: selected.reason
+      };
+    }
+    // Unmatched: report this reference's highest-scoring pair (first in sorted
+    // order). That candidate may be below the threshold or assigned elsewhere.
+    const bestRejected = pairs.find((pair) => pair.referenceIndex === referenceIndex);
+    return {
+      scenarioId: scenario.id,
+      referenceId: reference.id,
+      candidateId: bestRejected?.candidate.id ?? null,
+      score: bestRejected?.score ?? 0,
+      matched: false,
+      weight,
+      reason: bestRejected?.reason ?? "no candidates"
+    };
+  });
+  // Candidates that were never assigned are passed on as the false-positive count.
+  return buildScenarioScore(scenario, matches2, scenario.candidates.length - selectedCandidates.size);
+}
+/**
+ * Run `matcher` on one reference/candidate pair and normalise its result.
+ *
+ * @returns {{score: number, reason: string}} Score passed through `clamp012`; reason defaults to "".
+ * @throws {Error} When the matcher's score is not a finite number.
+ */
+function scorePair(scenario, matcher, reference, candidate) {
+  const { score: rawScore, reason } = matcher(reference, candidate, scenario);
+  if (Number.isFinite(rawScore)) {
+    return { score: clamp012(rawScore), reason: reason ?? "" };
+  }
+  throw new Error(`reference replay matcher returned non-finite score for ${scenario.id}:${reference.id}:${candidate.id}`);
+}
+function buildScenarioScore(scenario, matches2, falsePositives) {
   const matched = matches2.filter((match) => match.matched).length;
   const total = scenario.references.length;
-  const falsePositives = candidatesLeft.length;
   const matchedWeight = matches2.filter((match) => match.matched).reduce((sum2, match) => sum2 + match.weight, 0);
   const totalWeight = matches2.reduce((sum2, match) => sum2 + match.weight, 0);
-  const precision = ratio(matched, matched + falsePositives);
+  const precision2 = ratio(matched, matched + falsePositives);
   const recall = ratio(matched, total);
   return {
     scenarioId: scenario.id,
@@ -9718,9 +9831,9 @@ function scoreScenario(scenario, matcher, threshold) {
     falsePositives,
     matchedWeight,
     totalWeight,
-    precision,
+    precision: precision2,
     recall,
-    f1: f1(precision, recall),
+    f1: f1(precision2, recall),
     matches: matches2
   };
 }
@@ -9738,7 +9851,7 @@ function aggregateScenarioScores(scores) {
   const falsePositives = sum(scores.map((score) => score.falsePositives));
   const matchedWeight = sum(scores.map((score) => score.matchedWeight));
   const totalWeight = sum(scores.map((score) => score.totalWeight));
-  const precision = ratio(matched, matched + falsePositives);
+  const precision2 = ratio(matched, matched + falsePositives);
   const recall = ratio(matched, total);
   return {
     matched,
@@ -9746,9 +9859,9 @@ function aggregateScenarioScores(scores) {
     falsePositives,
     matchedWeight,
     totalWeight,
-    precision,
+    precision: precision2,
     recall,
-    f1: f1(precision, recall),
+    f1: f1(precision2, recall),
     weightedRecall: ratio(matchedWeight, totalWeight)
   };
 }
@@ -9768,8 +9881,8 @@ function emptyAggregate() {
 function hasSplit(score, split) {
   return score.bySplit[split] !== void 0;
 }
-function f1(precision, recall) {
-  return precision + recall === 0 ? 0 : 2 * precision * recall / (precision + recall);
+function f1(precision2, recall) {
+  return precision2 + recall === 0 ? 0 : 2 * precision2 * recall / (precision2 + recall);
 }
 function ratio(numerator, denominator) {
   return denominator > 0 ? numerator / denominator : 0;
@@ -9854,6 +9967,624 @@ var STOP_WORDS = /* @__PURE__ */ new Set([
   "where",
   "which"
 ]);
+// src/reference-replay-steering.ts
+/**
+ * Flatten reference-replay runs into one steering row per (run, case).
+ *
+ * @param {Array} runs Runs exposing `id`, optional `variantId`, `metadata` and `cases`.
+ * @param {{bundleForRun?: Function, scoreForCase?: Function}} [options]
+ *   `bundleForRun(run)` overrides the default `{ id, metadata }` bundle;
+ *   `scoreForCase(caseRun, run)` overrides the default score conversion.
+ * @returns {Array} Rows with `variantId`, `scenarioId`, `bundle`, `score` and `metadata`.
+ */
+function referenceReplayRunsToSteeringRows(runs, options = {}) {
+  // Case metadata is spread last, so its keys override the computed fields.
+  const describeCase = (run, caseRun) => ({
+    runId: run.id,
+    split: caseRun.split,
+    task: caseRun.metadata?.task ?? caseRun.metadata?.repo ?? caseRun.caseId,
+    referenceCount: caseRun.references.length,
+    candidateCount: caseRun.candidates.length,
+    matched: caseRun.score.matched,
+    total: caseRun.score.total,
+    falsePositives: caseRun.score.falsePositives,
+    precision: caseRun.score.precision,
+    recall: caseRun.score.recall,
+    f1: caseRun.score.f1,
+    error: caseRun.error,
+    ...caseRun.metadata ?? {}
+  });
+  const rows = [];
+  for (const run of runs) {
+    const variantId = run.variantId ?? run.id;
+    const defaultBundle = () => ({ id: variantId, metadata: run.metadata });
+    const bundle = options.bundleForRun?.(run) ?? defaultBundle();
+    for (const caseRun of run.cases) {
+      const score = options.scoreForCase?.(caseRun, run) ?? referenceReplayScenarioToRunScore(caseRun.score, caseRun.durationMs);
+      rows.push({
+        variantId,
+        scenarioId: caseRun.caseId,
+        bundle,
+        score,
+        metadata: describeCase(run, caseRun)
+      });
+    }
+  }
+  return rows;
+}
+/**
+ * Convert a reference-replay scenario score into the run-score record used for steering.
+ *
+ * F1 feeds `success` and `finalGate`, recall feeds `goalProgress`, and precision
+ * feeds `repoGroundedness`, `toolUseQuality` and (inverted) `driftPenalty`.
+ *
+ * @param scenarioScore Object with `f1`, `recall`, `precision`, `matched` and `total`.
+ * @param {number} [durationMs=0] Case duration; reported as non-negative seconds.
+ * @returns {Object} The run-score record.
+ */
+function referenceReplayScenarioToRunScore(scenarioScore, durationMs = 0) {
+  const { f1: f1Score, recall, precision: precisionScore, matched, total } = scenarioScore;
+  const hasReferences = total > 0;
+  // A scenario that had references but matched none counts as a blocker.
+  const nothingMatched = hasReferences && matched === 0;
+  const summary = `reference-replay matched ${matched}/${total}`;
+  const breakdown = `precision=${precisionScore.toFixed(3)} recall=${recall.toFixed(3)} f1=${f1Score.toFixed(3)}`;
+  return {
+    success: f1Score,
+    goalProgress: recall,
+    repoGroundedness: precisionScore,
+    driftPenalty: 1 - precisionScore,
+    toolUseQuality: precisionScore,
+    patchQuality: 0,
+    testReality: hasReferences ? 1 : 0,
+    finalGate: f1Score,
+    reviewerBlockers: nothingMatched ? 1 : 0,
+    costUsd: 0,
+    wallSeconds: Math.max(0, durationMs / 1e3),
+    notes: [summary, breakdown]
+  };
+}
+// src/prompt-evolution.ts
+/** Trial cache backed by a Map held on the instance. */
+var InMemoryTrialCache = class {
+  /** Backing Map from cache key to stored value. */
+  store = new Map();
+  /** Look up `key`; yields `undefined` when nothing is stored under it. */
+  get(key) {
+    const { store } = this;
+    return store.get(key);
+  }
+  /** Store `value` under `key`, replacing any earlier entry. */
+  set(key, value) {
+    const { store } = this;
+    store.set(key, value);
+  }
+  /** Number of entries currently stored. */
+  size() {
+    const { store } = this;
+    return store.size;
+  }
+  /** Remove every entry. */
+  clear() {
+    const { store } = this;
+    store.clear();
+  }
+};
+/**
+ * Run the generational prompt-evolution loop.
+ *
+ * Each generation scores the population, aggregates the trials per variant,
+ * computes the crowding-sorted Pareto front, picks a winner by scalar score,
+ * records a report, and (unless it is the last generation or the run has
+ * converged) builds the next population.
+ *
+ * @param config Run configuration. Fields read here: `seedVariants`,
+ *   `generations`, `scenarioIds`, `objectives`, `scalarWeights`, `runId`,
+ *   `target`, `earlyStopOnNoImprovement` and the optional `onProgress` callback.
+ * @returns Object with `runId`, `target`, the per-generation reports,
+ *   `bestVariant` and `bestAggregate`.
+ */
+async function runPromptEvolution(config) {
+  const generations = [];
+  let population = [...config.seedVariants];
+  let bestVariant = population[0];
+  let bestAggregate = null;
+  for (let generation = 0; generation < config.generations; generation++) {
+    config.onProgress?.({ type: "generation-start", generation, populationSize: population.length });
+    const trials = await scorePopulation(population, config, generation);
+    const aggregates = aggregateTrials(population, config.scenarioIds, trials);
+    const front = paretoFrontierWithCrowding(aggregates, config.objectives);
+    const frontIds = new Set(front.map((c) => c.candidate.variantId));
+    const scored = scalarScore(aggregates, config.objectives, { weights: config.scalarWeights });
+    scored.sort((a, b) => b.score - a.score);
+    // Winner is the top scalar score, falling back to the first aggregate, then the first variant.
+    const winnerId = scored[0]?.candidate.variantId ?? aggregates[0]?.variantId ?? population[0].id;
+    const report = {
+      runId: config.runId,
+      target: config.target,
+      generation,
+      variants: population,
+      aggregates,
+      paretoFrontIds: front.map((c) => c.candidate.variantId),
+      winnerId,
+      trials
+    };
+    generations.push(report);
+    config.onProgress?.({ type: "generation-complete", report });
+    // bestVariant / bestAggregate follow the most recent generation's winner;
+    // they are overwritten each generation rather than compared across generations.
+    const winnerAgg = aggregates.find((a) => a.variantId === winnerId);
+    if (winnerAgg) {
+      const winner = population.find((v) => v.id === winnerId);
+      if (winner) bestVariant = winner;
+      bestAggregate = winnerAgg;
+    }
+    // Early stop (on unless explicitly set to false): same winner and same
+    // Pareto-front ids as the previous generation.
+    if (config.earlyStopOnNoImprovement !== false && generations.length >= 2) {
+      const prev = generations[generations.length - 2];
+      const noChange = prev.winnerId === winnerId && samePopulation(prev.paretoFrontIds, [...frontIds]);
+      if (noChange) {
+        config.onProgress?.({ type: "converged", generation, reason: "no improvement vs previous generation" });
+        break;
+      }
+    }
+    // No need to breed a population that would never be scored.
+    if (generation === config.generations - 1) break;
+    population = await nextPopulation(population, aggregates, trials, front, config, generation + 1);
+  }
+  return {
+    runId: config.runId,
+    target: config.target,
+    generations,
+    bestVariant,
+    // No generation produced a winner aggregate: aggregate with no trials and take bestVariant's entry.
+    bestAggregate: bestAggregate ?? aggregateTrials(population, config.scenarioIds, []).find((a) => a.variantId === bestVariant.id)
+  };
+}
+/**
+ * Score every (variant, scenario, repetition) combination of a population.
+ *
+ * Results found in `config.cache` under `variantId|scenarioId|rep` are reused;
+ * otherwise `config.scoreAdapter.score` is called and its result cached. Each
+ * finished trial is announced through `config.onProgress` as "trial-complete".
+ *
+ * @param {Array} population Variants to score.
+ * @param config Run configuration (`scenarioIds`, `reps`, `scoreAdapter`,
+ *   `scoreConcurrency`, optional `cache` and `onProgress`).
+ * @param {number} generation Generation index echoed in progress events.
+ * @returns {Promise<Array>} Trial results ordered variant, then scenario, then rep.
+ */
+async function scorePopulation(population, config, generation) {
+  const announce = (variantId, scenarioId, rep, trial, cached) => config.onProgress?.({
+    type: "trial-complete",
+    generation,
+    variantId,
+    scenarioId,
+    rep,
+    ok: trial.ok,
+    score: trial.score,
+    cached
+  });
+  const makeJob = (variant, scenarioId, rep) => async () => {
+    const cacheKey = `${variant.id}|${scenarioId}|${rep}`;
+    const hit = config.cache?.get(cacheKey);
+    if (hit) {
+      announce(variant.id, scenarioId, rep, hit, true);
+      return hit;
+    }
+    const fresh = await config.scoreAdapter.score({ variant, scenarioId, rep });
+    config.cache?.set(cacheKey, fresh);
+    announce(variant.id, scenarioId, rep, fresh, false);
+    return fresh;
+  };
+  const jobs = [];
+  for (const variant of population) {
+    for (const scenarioId of config.scenarioIds) {
+      for (let rep = 0; rep < config.reps; rep++) {
+        jobs.push(makeJob(variant, scenarioId, rep));
+      }
+    }
+  }
+  return runWithConcurrency(jobs, config.scoreConcurrency);
+}
+/**
+ * Run async job thunks with at most `concurrency` in flight at once.
+ *
+ * @param {Array<Function>} jobs Zero-argument functions returning promises.
+ * @param {number} concurrency Desired parallelism. Missing or NaN values fall
+ *   back to 1, values below 1 are raised to 1, and the limit never exceeds the
+ *   number of jobs (so `Infinity` means "all at once").
+ * @returns {Promise<Array>} Job results, in the same order as `jobs`.
+ */
+async function runWithConcurrency(jobs, concurrency) {
+  const results = new Array(jobs.length);
+  // Math.max(1, undefined) is NaN, which would start zero workers and leave
+  // `results` empty, so normalise the limit explicitly.
+  const requested = Number(concurrency);
+  const bounded = Number.isNaN(requested) ? 1 : Math.max(1, Math.floor(requested));
+  const limit = Math.min(jobs.length, bounded);
+  let next = 0;
+  async function worker() {
+    while (true) {
+      // Claiming an index is synchronous, so two workers never take the same job.
+      const i = next++;
+      if (i >= jobs.length) return;
+      results[i] = await jobs[i]();
+    }
+  }
+  await Promise.all(Array.from({ length: limit }, () => worker()));
+  return results;
+}
+/**
+ * Summarise raw trials into one aggregate per variant, with a per-scenario breakdown.
+ *
+ * Scenario means (score, cost, duration, metrics) use only trials with a truthy
+ * `ok`; `okRate` is the share of ok trials, 0 when the scenario had no trials.
+ * Variant-level figures are the means of the per-scenario figures.
+ *
+ * @param {Array} population Variants (each with an `id`).
+ * @param {Array} scenarioIds Scenario ids to report, in output order.
+ * @param {Array} trials Trial records carrying `variantId`, `scenarioId`, `ok`, `score`, ...
+ * @returns {Array} One aggregate per variant, in population order.
+ */
+function aggregateTrials(population, scenarioIds, trials) {
+  const summariseScenario = (variantId, scenarioId, variantTrials) => {
+    const attempts = variantTrials.filter((t) => t.scenarioId === scenarioId);
+    const succeeded = attempts.filter((t) => t.ok);
+    const metrics = aggregateMetrics(succeeded.map((t) => t.metrics ?? {}));
+    const okRate = attempts.length === 0 ? 0 : succeeded.length / attempts.length;
+    return {
+      variantId,
+      scenarioId,
+      meanScore: mean5(succeeded.map((t) => t.score)),
+      meanCost: mean5(succeeded.map((t) => t.cost ?? 0)),
+      meanDurationMs: mean5(succeeded.map((t) => t.durationMs ?? 0)),
+      okRate,
+      trials: attempts.length,
+      metrics
+    };
+  };
+  const pluckMean = (rows, field) => mean5(rows.map((row) => row[field]));
+  return population.map(({ id: variantId }) => {
+    const variantTrials = trials.filter((t) => t.variantId === variantId);
+    const scenarios = scenarioIds.map((sid) => summariseScenario(variantId, sid, variantTrials));
+    return {
+      variantId,
+      meanScore: pluckMean(scenarios, "meanScore"),
+      meanCost: pluckMean(scenarios, "meanCost"),
+      meanDurationMs: pluckMean(scenarios, "meanDurationMs"),
+      okRate: pluckMean(scenarios, "okRate"),
+      scenarios,
+      metrics: aggregateMetrics(scenarios.map((s) => s.metrics))
+    };
+  });
+}
+/**
+ * Average each metric key across a list of metric records.
+ *
+ * Values that are not finite numbers are ignored, so a key appears in the
+ * result only if at least one record holds a finite value for it.
+ *
+ * @param {Array<Object>} rows Metric records.
+ * @returns {Object} Mean of the finite values per key.
+ */
+function aggregateMetrics(rows) {
+  const samplesByKey = /* @__PURE__ */ new Map();
+  for (const row of rows) {
+    for (const [key, value] of Object.entries(row)) {
+      if (!Number.isFinite(value)) continue;
+      if (!samplesByKey.has(key)) samplesByKey.set(key, []);
+      samplesByKey.get(key).push(value);
+    }
+  }
+  const averages = {};
+  samplesByKey.forEach((samples, key) => {
+    // Every bucket holds at least one sample, so the division is always defined.
+    averages[key] = samples.reduce((sum, x) => sum + x, 0) / samples.length;
+  });
+  return averages;
+}
+/** Arithmetic mean of `xs`; an empty list yields 0. */
+function mean5(xs) {
+  const count = xs.length;
+  return count === 0 ? 0 : xs.reduce((total, x) => total + x, 0) / count;
+}
+/**
+ * Build the next generation: Pareto-front survivors plus mutated children.
+ *
+ * The variant with the highest scalar score becomes the parent; the mutate
+ * adapter is asked for enough children to bring the survivors up to
+ * `config.populationSize`, and surplus children are discarded.
+ *
+ * @param {Array} current Current population.
+ * @param {Array} aggregates Per-variant aggregates for `current`.
+ * @param {Array} trials Raw trials of the current generation.
+ * @param {Array} front Pareto-front entries (`{ candidate }` with a `variantId`).
+ * @param config Run configuration (`objectives`, `scalarWeights`, `populationSize`, `mutateAdapter`).
+ * @param {number} nextGeneration Generation number stamped on the children.
+ * @returns {Promise<Array>} Survivors followed by the new children.
+ */
+async function nextPopulation(current, aggregates, trials, front, config, nextGeneration) {
+  const frontIds = new Set(front.map((entry) => entry.candidate.variantId));
+  const survivors = current.filter((variant) => frontIds.has(variant.id));
+  const [best] = scalarScore(aggregates, config.objectives, { weights: config.scalarWeights }).sort((x, y) => y.score - x.score);
+  const parentId = best?.candidate.variantId ?? current[0].id;
+  const parent = current.find((variant) => variant.id === parentId) ?? current[0];
+  const parentAggregate = aggregates.find((agg) => agg.variantId === parent.id) ?? aggregates[0];
+  const topTrials = topKTrialsByScore(trials, parent.id, 3);
+  const bottomTrials = bottomKTrialsByScore(trials, parent.id, 3);
+  const childCount = Math.max(0, config.populationSize - survivors.length);
+  // Survivors already fill the population: no mutation call is made.
+  if (childCount <= 0) return [...survivors];
+  const proposed = await config.mutateAdapter.mutate({
+    parent,
+    parentAggregate,
+    topTrials,
+    bottomTrials,
+    childCount,
+    generation: nextGeneration
+  });
+  const children = proposed.slice(0, childCount).map((child) => ({ ...child, generation: nextGeneration, parentId: parent.id }));
+  return [...survivors, ...children];
+}
+/** The `k` highest-scoring ok trials of `variantId`, best first. */
+function topKTrialsByScore(trials, variantId, k) {
+  const eligible = trials.filter((trial) => trial.variantId === variantId && trial.ok);
+  eligible.sort((left, right) => right.score - left.score);
+  return eligible.slice(0, k);
+}
+/** The `k` lowest-scoring ok trials of `variantId`, worst first. */
+function bottomKTrialsByScore(trials, variantId, k) {
+  const eligible = trials.filter((trial) => trial.variantId === variantId && trial.ok);
+  eligible.sort((left, right) => left.score - right.score);
+  return eligible.slice(0, k);
+}
+/** True when `a` and `b` have equal length and every id in `b` also occurs in `a`. */
+function samePopulation(a, b) {
+  if (a.length !== b.length) {
+    return false;
+  }
+  const known = new Set(a);
+  for (const id of b) {
+    if (!known.has(id)) return false;
+  }
+  return true;
+}
+// src/golden-matcher.ts
+/**
+ * Check which goldens are found in at least one candidate.
+ *
+ * @param {Array} goldens Golden descriptors, forwarded to `goldenMatched`.
+ * @param {Array} candidates Candidate items; rendered to text and lower-cased.
+ * @param {{text?: Function}} [options] `text(candidate)` overrides the default text extraction.
+ * @returns {{matches: boolean[], hits: number, total: number}} Per-golden flags plus counts.
+ */
+function matchGoldens(goldens, candidates, options = {}) {
+  const toText = options.text ?? defaultExtract5;
+  const haystacks = candidates.map((candidate) => toText(candidate).toLowerCase());
+  const matches2 = goldens.map((golden) => goldenMatched(golden, haystacks));
+  const hits = matches2.filter(Boolean).length;
+  return { matches: matches2, hits, total: goldens.length };
+}
+/**
+ * Default text extraction for a candidate: strings pass through, objects
+ * yield their own string-valued properties joined by spaces, and anything else is
+ * stringified (null and undefined become "").
+ */
+function defaultExtract5(candidate) {
+  if (typeof candidate === "string") return candidate;
+  const isObject = Boolean(candidate) && typeof candidate === "object";
+  if (!isObject) return String(candidate ?? "");
+  return Object.values(candidate).filter((value) => typeof value === "string").join(" ");
+}
+/**
+ * Decide whether a golden is present in any of the (already lower-cased) haystacks.
+ *
+ * Phrases in `golden.any` are lower-cased and trimmed, then matched as
+ * substrings; empty phrases are ignored. If no phrase hits, each pattern in
+ * `golden.anyRegex` is tried case-insensitively; patterns that fail to compile
+ * are skipped.
+ *
+ * @returns {boolean} True on the first phrase or pattern that matches.
+ */
+function goldenMatched(golden, haystacks) {
+  const phraseHit = golden.any.some((phrase) => {
+    const needle = phrase.toLowerCase().trim();
+    return needle !== "" && haystacks.some((text) => text.includes(needle));
+  });
+  if (phraseHit) return true;
+  return (golden.anyRegex ?? []).some((pattern) => {
+    let compiled;
+    try {
+      compiled = new RegExp(pattern, "i");
+    } catch {
+      // Deliberately tolerant: an invalid pattern simply cannot match.
+      return false;
+    }
+    return haystacks.some((text) => compiled.test(text));
+  });
+}
+// Default weight per golden severity; used by weightedRecall when the caller
+// passes no weights. Severities missing from the table are weighted 1 there.
+var DEFAULT_SEVERITY_WEIGHTS = {
+  critical: 3,
+  major: 2,
+  minor: 1
+};
+/**
+ * Severity-weighted recall: the weight of matched goldens over the weight of all goldens.
+ *
+ * @param {Array} goldens Goldens carrying a `severity`.
+ * @param {{matches: boolean[]}} result Match flags aligned with `goldens` by index.
+ * @param {Object} [weights] Weight per severity; unknown severities weigh 1.
+ * @returns {number} Recall in terms of weight; 1 when there are no goldens or no weight at all.
+ */
+function weightedRecall(goldens, result, weights = DEFAULT_SEVERITY_WEIGHTS) {
+  if (goldens.length === 0) return 1;
+  const weightOf = (golden) => weights[golden.severity] ?? 1;
+  let total = 0;
+  for (const golden of goldens) total += weightOf(golden);
+  if (total === 0) return 1;
+  let hit = 0;
+  goldens.forEach((golden, index) => {
+    hit += result.matches[index] ? weightOf(golden) : 0;
+  });
+  return hit / total;
+}
+/**
+ * Fraction of candidates that match at least one golden.
+ *
+ * Phrases are normalised as in `goldenMatched` (lower-cased, trimmed, empty
+ * ones ignored), so a whitespace-only phrase no longer matches every candidate
+ * that contains a space. Patterns are compiled once, case-insensitively;
+ * patterns that fail to compile are skipped.
+ *
+ * @param {Array} goldens Goldens with `any` phrases and optional `anyRegex` patterns.
+ * @param {Array} candidates Candidate items.
+ * @param {{text?: Function}} [options] `text(candidate)` overrides the default text extraction.
+ * @returns {number} Matched candidates over all candidates; 1 when there are no candidates.
+ */
+function precision(goldens, candidates, options = {}) {
+  if (candidates.length === 0) return 1;
+  const extract = options.text ?? defaultExtract5;
+  // Normalise every golden once instead of once per candidate.
+  const needles = [];
+  const patterns = [];
+  for (const golden of goldens) {
+    for (const phrase of golden.any) {
+      const needle = phrase.toLowerCase().trim();
+      if (needle) needles.push(needle);
+    }
+    for (const source of golden.anyRegex ?? []) {
+      try {
+        patterns.push(new RegExp(source, "i"));
+      } catch {
+        // Deliberately tolerant: an invalid pattern simply cannot match.
+      }
+    }
+  }
+  let matched = 0;
+  for (const cand of candidates) {
+    const haystack = extract(cand).toLowerCase();
+    const matchedAny = needles.some((needle) => haystack.includes(needle)) || patterns.some((re) => re.test(haystack));
+    if (matchedAny) matched++;
+  }
+  return matched / candidates.length;
+}
+// src/orthogonality.ts
+/**
+ * Measure how different the findings of several passes are from each other.
+ *
+ * Each pass is turned into a bag of words and every unordered pair of passes
+ * is compared by cosine similarity; orthogonality is one minus the mean
+ * similarity, clamped to [0, 1]. Fewer than two passes count as fully orthogonal.
+ *
+ * @param input Object with `passes` (each holding `findings`), optional
+ *   `text` renderer and optional `minTokenLength` (default 4).
+ * @returns {{orthogonality: number, passCount: number, similarities: number[]}}
+ */
+function passOrthogonality(input) {
+  const { passes } = input;
+  const passCount = passes.length;
+  if (passCount < 2) {
+    return { orthogonality: 1, passCount, similarities: [] };
+  }
+  const render = input.text ?? defaultRender;
+  const minLen = input.minTokenLength ?? 4;
+  const vectors = passes.map((pass) => bagOfWords(pass.findings, render, minLen));
+  const similarities = [];
+  vectors.forEach((left, i) => {
+    for (let j = i + 1; j < vectors.length; j++) {
+      similarities.push(cosineSimilarity(left, vectors[j]));
+    }
+  });
+  let meanSimilarity = 0;
+  if (similarities.length !== 0) {
+    meanSimilarity = similarities.reduce((total, s) => total + s, 0) / similarities.length;
+  }
+  return {
+    orthogonality: Math.max(0, Math.min(1, 1 - meanSimilarity)),
+    passCount,
+    similarities
+  };
+}
+/**
+ * Default text rendering for a finding: strings pass through, objects yield
+ * their own string-valued properties joined by spaces, and anything else is
+ * stringified (null and undefined become "").
+ */
+function defaultRender(item) {
+  if (typeof item === "string") return item;
+  const isObject = Boolean(item) && typeof item === "object";
+  if (!isObject) return String(item ?? "");
+  return Object.values(item).filter((value) => typeof value === "string").join(" ");
+}
+/**
+ * Count token occurrences across the rendered, lower-cased text of `items`.
+ *
+ * Text is split on runs of characters other than a-z and 0-9; tokens shorter
+ * than `minLen` are dropped.
+ *
+ * @param {Array} items Items to render.
+ * @param {Function} render Turns an item into a string.
+ * @param {number} minLen Minimum token length to keep.
+ * @returns {Map<string, number>} Token to occurrence count.
+ */
+function bagOfWords(items, render, minLen) {
+  const counts = /* @__PURE__ */ new Map();
+  const tokenise = (text) => text.split(/[^a-z0-9]+/).filter((word) => word.length >= minLen);
+  for (const item of items) {
+    tokenise(render(item).toLowerCase()).forEach((token) => {
+      const seen = counts.get(token) ?? 0;
+      counts.set(token, seen + 1);
+    });
+  }
+  return counts;
+}
+/**
+ * Cosine similarity between two sparse vectors stored as Maps of key to weight.
+ *
+ * @returns {number} 0 when either vector has zero magnitude.
+ */
+function cosineSimilarity(a, b) {
+  const squaredNorm = (vector) => {
+    let total = 0;
+    for (const weight of vector.values()) total += weight * weight;
+    return total;
+  };
+  const aMag = squaredNorm(a);
+  const bMag = squaredNorm(b);
+  if (aMag === 0 || bMag === 0) return 0;
+  let dot = 0;
+  a.forEach((weight, key) => {
+    const other = b.get(key);
+    // Keys missing from b (or stored with a zero weight) add nothing.
+    if (other) dot += weight * other;
+  });
+  return dot / (Math.sqrt(aMag) * Math.sqrt(bMag));
+}
+// src/promotion-gate.ts
+/**
+ * Bootstrap a confidence interval for mean(candidate) - mean(baseline) and
+ * turn it into a promotion verdict.
+ *
+ * Verdicts: "ADVANCE" when the whole interval is above 0, "REVERT" when it is
+ * entirely below 0, "KEEP" when it straddles 0 with a non-negative observed
+ * delta, otherwise "INCONCLUSIVE". With too few samples, an empty side, or
+ * fewer than one iteration, no resampling is done and the result is
+ * "INCONCLUSIVE" with an infinite interval and `iterations: 0`.
+ *
+ * @param {number[]} baseline Baseline scores.
+ * @param {number[]} candidate Candidate scores.
+ * @param {{alpha?: number, iterations?: number, minTotalSamples?: number, seed?: number}} [options]
+ *   Defaults: alpha 0.05, 1000 iterations, 6 total samples; the seed defaults
+ *   to a hash of both samples. A fractional `iterations` is rounded down.
+ * @returns {Object} Means, delta, interval bounds, iterations used, alpha and verdict.
+ */
+function bootstrapCi(baseline, candidate, options = {}) {
+  const alpha = options.alpha ?? 0.05;
+  // new Array(n) throws on a fractional or negative n, so round down first.
+  const iterations = Math.floor(options.iterations ?? 1e3);
+  const minTotal = options.minTotalSamples ?? 6;
+  const baselineMean = mean6(baseline);
+  const candidateMean = mean6(candidate);
+  const delta = candidateMean - baselineMean;
+  const tooFewSamples = baseline.length + candidate.length < minTotal || baseline.length === 0 || candidate.length === 0;
+  // `!(iterations >= 1)` also catches NaN.
+  if (tooFewSamples || !(iterations >= 1)) {
+    return {
+      baselineMean,
+      candidateMean,
+      delta,
+      ciLower: -Infinity,
+      ciUpper: Infinity,
+      iterations: 0,
+      alpha,
+      verdict: "INCONCLUSIVE"
+    };
+  }
+  // Seed only once resampling is actually going to happen.
+  const rng = mulberry32(options.seed ?? hashSeed(baseline, candidate));
+  const deltas = new Array(iterations);
+  for (let i = 0; i < iterations; i++) {
+    const bResample = resample(baseline, rng);
+    const cResample = resample(candidate, rng);
+    deltas[i] = mean6(cResample) - mean6(bResample);
+  }
+  deltas.sort((a, b) => a - b);
+  // Clamp both percentile indices into range: with very few iterations the
+  // upper index would otherwise be -1 and the bound would come back undefined.
+  const clampIndex = (index) => Math.min(iterations - 1, Math.max(0, index));
+  const lowerIdx = clampIndex(Math.floor(alpha / 2 * iterations));
+  const upperIdx = clampIndex(Math.floor((1 - alpha / 2) * iterations) - 1);
+  const ciLower = deltas[lowerIdx];
+  const ciUpper = deltas[upperIdx];
+  let verdict;
+  if (ciLower > 0) verdict = "ADVANCE";
+  else if (ciUpper < 0) verdict = "REVERT";
+  else if (delta >= 0) verdict = "KEEP";
+  else verdict = "INCONCLUSIVE";
+  return {
+    baselineMean,
+    candidateMean,
+    delta,
+    ciLower,
+    ciUpper,
+    iterations,
+    alpha,
+    verdict
+  };
+}
/**
 * Arithmetic mean of a numeric array.
 *
 * @param {number[]} xs - Values to average.
 * @returns {number} The mean, or 0 for an empty array.
 */
function mean6(xs) {
  const count = xs.length;
  if (count === 0) return 0;
  let total = 0;
  for (let i = 0; i < count; i++) total += xs[i];
  return total / count;
}
/**
 * Draw a same-sized sample from `xs` with replacement.
 *
 * `rng` is called exactly once per output slot, in index order.
 *
 * @param {Array<*>} xs - Source values.
 * @param {() => number} rng - Generator returning numbers in [0, 1).
 * @returns {Array<*>} New array of length `xs.length`.
 */
function resample(xs, rng) {
  const size = xs.length;
  return Array.from({ length: size }, () => xs[Math.floor(rng() * size)]);
}
/**
 * Create a small seeded pseudo-random generator.
 *
 * The internal counter is wrapped to an unsigned 32-bit integer on every
 * step. Letting it grow without bound would push it past 2^53 after a few
 * million draws, where adding the odd increment is no longer exact. Wrapping
 * gives the same outputs as the unbounded counter for as long as that one
 * stays exact, because the mixing steps only use its low 32 bits.
 *
 * @param {number} seed - Seed; coerced to an unsigned 32-bit integer.
 * @returns {() => number} Function returning a number in [0, 1) per call.
 */
function mulberry32(seed) {
  let state = seed >>> 0;
  return () => {
    state = (state + 1831565813) >>> 0;
    let r = state;
    r = Math.imul(r ^ (r >>> 15), r | 1);
    r ^= r + Math.imul(r ^ (r >>> 7), r | 61);
    // Scale the unsigned 32-bit result into [0, 1).
    return ((r ^ (r >>> 14)) >>> 0) / 4294967296;
  };
}
/**
 * Derive a 32-bit seed from two numeric samples.
 *
 * Every value of `a`, then every value of `b`, is written as a Float64 and
 * its 8 bytes (in platform byte order) are folded into an FNV-1a style hash:
 * xor the byte in, then multiply by 16777619.
 *
 * @param {Iterable<number>} a - First sample.
 * @param {Iterable<number>} b - Second sample.
 * @returns {number} Unsigned 32-bit hash; 2166136261 when both are empty.
 */
function hashSeed(a, b) {
  const scratch = new Float64Array(1);
  const scratchBytes = new Uint8Array(scratch.buffer);
  let hash = 2166136261;

  const absorb = (values) => {
    for (const value of values) {
      scratch[0] = value;
      for (let i = 0; i < scratchBytes.length; i++) {
        hash = Math.imul(hash ^ scratchBytes[i], 16777619);
      }
    }
  };

  absorb(a);
  absorb(b);
  return hash >>> 0;
}
/**
 * Score baseline and candidate outputs with a judge, then compare the two
 * score sets with `bootstrapCi`.
 *
 * Baseline outputs are fully scored before candidate scoring starts. Only
 * the `alpha`, `iterations` and `seed` fields that are actually defined on
 * `args` are forwarded, so `bootstrapCi` applies its own defaults otherwise.
 *
 * @param {object} args
 * @param {Array<*>} args.baselineOutputs - Outputs of the baseline variant.
 * @param {Array<*>} args.candidateOutputs - Outputs of the candidate variant.
 * @param {(output: *) => (number|Promise<number>)} args.judge - Scoring function.
 * @param {number} [args.judgeConcurrency=4] - Parallel judge calls per batch.
 * @param {number} [args.alpha]
 * @param {number} [args.iterations]
 * @param {number} [args.seed]
 * @returns {Promise<object>} The `bootstrapCi` result extended with
 *   `baselineSamples` and `candidateSamples` counts.
 */
async function judgeReplayGate(args) {
  const workers = args.judgeConcurrency ?? 4;
  const baselineScores = await scoreAll(args.baselineOutputs, args.judge, workers);
  const candidateScores = await scoreAll(args.candidateOutputs, args.judge, workers);

  const ciOptions = {};
  for (const key of ["alpha", "iterations", "seed"]) {
    const value = args[key];
    if (value !== undefined) ciOptions[key] = value;
  }

  const ci = bootstrapCi(baselineScores, candidateScores, ciOptions);
  const sampleCounts = {
    baselineSamples: baselineScores.length,
    candidateSamples: candidateScores.length
  };
  return { ...ci, ...sampleCounts };
}
/**
 * Score every output with `judge`, running a bounded number of calls in
 * parallel, and return the scores in input order.
 *
 * A judge result that is not a finite number is recorded as 0.
 *
 * The worker count is normalised before use: fractional values are floored,
 * anything that is not a number >= 1 (including `NaN`/`undefined`) becomes 1,
 * and the count never exceeds `outputs.length`. Previously a `NaN`
 * concurrency started no workers at all and returned an unfilled array,
 * while `Infinity` threw while allocating the worker list.
 *
 * @param {Array<*>} outputs - Items to score.
 * @param {(output: *) => (number|Promise<number>)} judge - Scoring function.
 * @param {number} concurrency - Maximum number of concurrent judge calls.
 * @returns {Promise<number[]>} Scores aligned with `outputs`.
 */
async function scoreAll(outputs, judge, concurrency) {
  const total = outputs.length;
  const results = new Array(total);

  const requested = Math.floor(Number(concurrency));
  const usable = Number.isNaN(requested) || requested < 1 ? 1 : requested;
  const workerCount = Math.min(total, usable);

  let next = 0;
  async function worker() {
    // Claiming an index is synchronous, so workers never take the same item.
    while (next < total) {
      const i = next++;
      const v = await judge(outputs[i]);
      results[i] = Number.isFinite(v) ? v : 0;
    }
  }

  await Promise.all(Array.from({ length: workerCount }, () => worker()));
  return results;
}
+// src/reflective-mutation.ts
// Built-in list of mutation strategies. buildReflectionPrompt lists these
// under "Allowed mutation primitives" when the caller's context does not
// provide its own `mutationPrimitives`.
var DEFAULT_MUTATION_PRIMITIVES = [
  "Strengthen an imperative (\"should\" \u2192 \"must\")",
  "Add a concrete example pulled from a missed-golden phrase",
  "Remove a redundant rule that did not improve recall",
  "Add a counterfactual (\"if X is missing, the score is capped at Y\")",
  "Reorder sections so the highest-impact rule is first",
  "Replace abstract language with a domain-specific noun the trial misses"
];
/**
 * Assemble the Markdown prompt that asks a model to propose mutations of a
 * prompt component.
 *
 * Sections, in order: title, instructions, the current variant as JSON,
 * failures (only if there are bottom trials), successes (only if there are
 * top trials), the allowed mutation primitives, and the expected JSON output
 * schema. Lines are joined with "\n".
 *
 * @param {object} ctx
 * @param {string} ctx.target - Name of the component being tuned.
 * @param {*} ctx.parentPayload - Current variant; serialised with JSON.stringify.
 * @param {number} ctx.childCount - Number of mutations to request.
 * @param {Array<object>} ctx.topTrials - Trials with `id`, `score`, optional `inputName`.
 * @param {Array<object>} ctx.bottomTrials - As above, plus optional
 *   `expectations` (`id`, `phrase`, `matched`) and `emitted` text.
 * @param {string[]} [ctx.mutationPrimitives] - Overrides DEFAULT_MUTATION_PRIMITIVES.
 * @returns {string} The prompt text.
 */
function buildReflectionPrompt(ctx) {
  const primitives = ctx.mutationPrimitives ?? DEFAULT_MUTATION_PRIMITIVES;
  const inputSuffix = (trial) => (trial.inputName ? ` (${trial.inputName})` : "");

  const lines = [
    `# Mutation target: ${ctx.target}`,
    "",
    `You are tuning the prompt component named \`${ctx.target}\`. The current variant is shown below; you have ${ctx.topTrials.length} top trials and ${ctx.bottomTrials.length} bottom trials as evidence. Propose ${ctx.childCount} mutation${ctx.childCount === 1 ? "" : "s"} that fix specific weaknesses visible in the bottom trials. Avoid blank rephrasings.`,
    "",
    "## Current variant",
    "```json",
    JSON.stringify(ctx.parentPayload, null, 2),
    "```",
    ""
  ];

  if (ctx.bottomTrials.length > 0) {
    lines.push("## Failures (bottom trials) \u2014 what went wrong", "");
    for (const trial of ctx.bottomTrials) {
      lines.push(`### Trial \`${trial.id}\` \u2014 score ${trial.score.toFixed(2)}${inputSuffix(trial)}`);

      const missed = (trial.expectations ?? []).filter((e) => !e.matched);
      if (missed.length > 0) {
        lines.push("", "**Missed expectations:**");
        for (const m of missed) {
          lines.push(`- \`${m.id}\`: should match phrase \`${quote(m.phrase)}\``);
        }
      }

      if (trial.emitted) {
        // Emitted text is capped at 600 characters to bound the prompt size.
        lines.push("", "**What the agent emitted:**", "```", truncate3(trial.emitted, 600), "```");
      }

      lines.push("");
    }
  }

  if (ctx.topTrials.length > 0) {
    lines.push("## Successes (top trials) \u2014 what to preserve", "");
    for (const trial of ctx.topTrials) {
      lines.push(`- \`${trial.id}\`: score ${trial.score.toFixed(2)}${inputSuffix(trial)}`);
    }
    lines.push("");
  }

  lines.push("## Allowed mutation primitives", "");
  for (const p of primitives) lines.push(`- ${p}`);
  lines.push("");

  const schemaExample = {
    proposals: [
      {
        label: "<short label, \u2264 40 chars>",
        rationale: "<which failure this targets and which primitive you used>",
        payload: "<full payload of the new variant \u2014 same shape as the current variant>"
      }
    ]
  };
  lines.push(
    "## Output schema",
    "",
    "Respond with a JSON object \u2014 no prose, no markdown fences:",
    "```json",
    JSON.stringify(schemaExample, null, 2),
    "```"
  );

  return lines.join("\n");
}
/**
 * Shorten `s` to at most `max` UTF-16 code units and append a truncation
 * marker. Strings that already fit are returned unchanged.
 *
 * If the cut would fall between the two halves of a surrogate pair, the cut
 * moves back one unit so the result never ends in a lone high surrogate.
 *
 * @param {string} s - Text to shorten.
 * @param {number} max - Maximum number of code units kept from `s`.
 * @returns {string} `s`, or its prefix followed by "… [truncated]".
 */
function truncate3(s, max) {
  if (s.length <= max) return s;
  let cut = max;
  const before = s.charCodeAt(cut - 1);
  const after = s.charCodeAt(cut);
  const splitsPair =
    before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
  if (splitsPair) cut -= 1;
  return s.slice(0, cut) + "\u2026 [truncated]";
}
/**
 * Backslash-escape every backtick in `s`.
 *
 * @param {string} s - Text to escape.
 * @returns {string} `s` with each "`" replaced by "\`".
 */
function quote(s) {
  return s.split("`").join("\\`");
}
/**
 * Extract mutation proposals from a model's reply.
 *
 * The parser is best-effort and never throws: it strips an optional
 * surrounding Markdown code fence, parses the text between the first "{" and
 * the last "}" as JSON, and reads its `proposals` array. Entries that are not
 * objects or have no `payload` key are skipped. A missing or non-string
 * `label` becomes "mutation"; a missing or non-string `rationale` becomes "".
 * Any malformed input, including a non-string `raw`, yields [].
 *
 * @param {string} raw - Raw model output.
 * @param {number} [maxProposals] - Upper bound on returned proposals.
 *   `undefined`/`null` means no limit; 0 or a negative value returns [].
 * @returns {Array<{label: string, rationale: string, payload: *}>}
 */
function parseReflectionResponse(raw, maxProposals) {
  if (typeof raw !== "string") return [];

  let text = raw.trim();
  if (text.startsWith("```")) text = text.replace(/^```(?:json)?\n?/, "").replace(/\n?```$/, "");

  // Tolerate prose around the JSON by slicing to the outermost braces.
  const start = text.indexOf("{");
  const end = text.lastIndexOf("}");
  if (start < 0 || end <= start) return [];

  let parsed;
  try {
    parsed = JSON.parse(text.slice(start, end + 1));
  } catch {
    return [];
  }
  if (!parsed || typeof parsed !== "object") return [];

  const proposalsRaw = parsed.proposals;
  if (!Array.isArray(proposalsRaw)) return [];

  const out = [];
  for (const p of proposalsRaw) {
    // Check the cap before adding, so a cap of 0 returns nothing.
    if (maxProposals != null && out.length >= maxProposals) break;
    if (!p || typeof p !== "object") continue;
    if (!("payload" in p)) continue;
    out.push({
      label: typeof p.label === "string" ? p.label : "mutation",
      rationale: typeof p.rationale === "string" ? p.rationale : "",
      payload: p.payload
    });
  }
  return out;
}
 export {
   AgentDriver,
   AxGepaSteeringOptimizer,
@@ -9868,10 +10599,12 @@ export {
   DEFAULT_RULES as DEFAULT_FAILURE_RULES,
   DEFAULT_FINDERS,
   DEFAULT_HARNESS_OBJECTIVES,
+  DEFAULT_MUTATION_PRIMITIVES,
   DEFAULT_MUTATORS,
   DEFAULT_REDACTION_RULES,
   DEFAULT_RED_TEAM_CORPUS,
   DEFAULT_RUN_SCORE_WEIGHTS,
+  DEFAULT_SEVERITY_WEIGHTS,
   Dataset,
   DockerSandboxDriver,
   DualAgentBench,
@@ -9886,6 +10619,7 @@ export {
   InMemoryExperimentStore,
   InMemoryOutcomeStore,
   InMemoryTraceStore,
+  InMemoryTrialCache,
   InMemoryWorkspaceInspector,
   JudgeRunner,
   LlmCallError,
@@ -9921,7 +10655,9 @@ export {
   benjaminiHochberg,
   bisect,
   bonferroni,
+  bootstrapCi,
   budgetBreachView,
+  buildReflectionPrompt,
   buildReviewerPrompt,
   buildTrajectory,
   byteLengthRange,
@@ -9959,6 +10695,7 @@ export {
   createLlmReviewer,
   createSemanticConceptJudge,
   crossTraceDiff,
+  crowdingDistance,
   decideReferenceReplayPromotion,
   decideReferenceReplayRunPromotion,
   defaultJudges,
@@ -9992,6 +10729,7 @@ export {
   formatBenchmarkReport,
   formatDriverReport,
   formatFindings,
+  precision as goldenPrecision,
   gradeSemanticStatus,
   groupBy,
   hashContent,
@@ -10013,6 +10751,7 @@ export {
   jsonlReferenceReplayStore,
   jsonlReviewStore,
   judgeAgreementView,
+  judgeReplayGate,
   judgeSpans,
   keyPreserved,
   linterJudge,
@@ -10022,6 +10761,7 @@ export {
   localCommandRunner,
   lowercaseMutator,
   mannWhitneyU,
+  matchGoldens,
   mergeLayerResults,
   mergeSteeringBundle,
   multiToolchainLayer,
@@ -10033,7 +10773,10 @@ export {
   pairedTTest,
   paraphraseRobustness,
   paretoFrontier,
+  paretoFrontierWithCrowding,
+  parseReflectionResponse,
   partialCredit,
+  passOrthogonality,
   pixelDeltaRatio,
   politenessPrefixMutator,
   positionalBias,
@@ -10048,6 +10791,8 @@ export {
   redTeamReport,
   redactString,
   redactValue,
+  referenceReplayRunsToSteeringRows,
+  referenceReplayScenarioToRunScore,
   regexMatch,
   regexMatches,
   regressionView,
@@ -10071,12 +10816,14 @@ export {
   runJudgeFleet,
   runKeywordCoverageJudge,
   runKeywordCoverageJudgeUrl,
+  runPromptEvolution,
   runProposeReview,
   runReferenceReplay,
   runSelfPlay,
   runSemanticConceptJudge,
   runTestGradedScenario,
   runsForScenario,
+  scalarScore,
   scanForMuffledGates,
   scoreAllProjects,
   scoreContinuity,
@@ -10113,6 +10860,7 @@ export {
   viteDeployRunner,
   vitestTestParser,
   weightedMean,
+  weightedRecall,
   welchsTTest,
   whitespaceCollapseMutator,
   wilcoxonSignedRank