npm - @tangle-network/agent-eval - Versions diffs - 0.11.1 → 0.13.0 - Mend

@tangle-network/agent-eval 0.11.1 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

package/README.md +96 -11
package/dist/chunk-ITN4YOZY.js +215 -0
package/dist/chunk-ITN4YOZY.js.map +1 -0
package/dist/chunk-OZPRSK4A.js +594 -0
package/dist/chunk-OZPRSK4A.js.map +1 -0
package/dist/cli.d.ts +1 -0
package/dist/cli.js +104 -0
package/dist/cli.js.map +1 -0
package/dist/index.d.ts +597 -4
package/dist/index.js +908 -241
package/dist/index.js.map +1 -1
package/dist/sink-fetch-C0B8ximv.d.ts +101 -0
package/dist/telemetry/file.d.ts +19 -0
package/dist/telemetry/file.js +40 -0
package/dist/telemetry/file.js.map +1 -0
package/dist/telemetry/index.d.ts +38 -0
package/dist/telemetry/index.js +128 -0
package/dist/telemetry/index.js.map +1 -0
package/dist/wire/index.d.ts +211 -0
package/dist/wire/index.js +56 -0
package/dist/wire/index.js.map +1 -0
package/package.json +27 -3

package/dist/index.js CHANGED Viewed

@@ -1,3 +1,12 @@
+import {
+  LlmCallError,
+  LlmClient,
+  callLlm,
+  callLlmJson,
+  probeLlm,
+  stripFencedJson
+} from "./chunk-ITN4YOZY.js";
 // src/client.ts
 var ProductClient = class {
   baseUrl;
@@ -410,7 +419,7 @@ function confidenceInterval(scores, confidence = 0.95) {
   if (scores.length === 0) return { mean: 0, lower: 0, upper: 0 };
   if (scores.length === 1) return { mean: scores[0], lower: scores[0], upper: scores[0] };
   const n = scores.length;
-  const mean5 = scores.reduce((a, b) => a + b, 0) / n;
+  const mean7 = scores.reduce((a, b) => a + b, 0) / n;
   const B = 1e3;
   const bootstrapMeans = [];
   for (let i = 0; i < B; i++) {
@@ -425,7 +434,7 @@ function confidenceInterval(scores, confidence = 0.95) {
   const lowerIdx = Math.floor(alpha / 2 * B);
   const upperIdx = Math.floor((1 - alpha / 2) * B) - 1;
   return {
-    mean: mean5,
+    mean: mean7,
     lower: bootstrapMeans[lowerIdx],
     upper: bootstrapMeans[Math.min(upperIdx, B - 1)]
   };
@@ -513,11 +522,11 @@ function pairedTTest(before, after) {
   const n = before.length;
   if (n < 2) return { t: 0, df: 0, p: 1 };
   const diffs = before.map((b, i) => after[i] - b);
-  const mean5 = diffs.reduce((a, b) => a + b, 0) / n;
-  const variance2 = diffs.reduce((acc, d) => acc + (d - mean5) ** 2, 0) / (n - 1);
+  const mean7 = diffs.reduce((a, b) => a + b, 0) / n;
+  const variance2 = diffs.reduce((acc, d) => acc + (d - mean7) ** 2, 0) / (n - 1);
   const se = Math.sqrt(variance2 / n);
-  if (se === 0) return { t: mean5 === 0 ? 0 : Infinity, df: n - 1, p: mean5 === 0 ? 1 : 0 };
-  const t = mean5 / se;
+  if (se === 0) return { t: mean7 === 0 ? 0 : Infinity, df: n - 1, p: mean7 === 0 ? 1 : 0 };
+  const t = mean7 / se;
   const df = n - 1;
   const p = 2 * (1 - studentTCdf(Math.abs(t), df));
   return { t, df, p };
@@ -541,9 +550,9 @@ function wilcoxonSignedRank(before, after) {
   }
   let wPlus = 0;
   for (let k = 0; k < n; k++) if (diffs[k] > 0) wPlus += ranks3[k];
-  const mean5 = n * (n + 1) / 4;
+  const mean7 = n * (n + 1) / 4;
   const variance2 = n * (n + 1) * (2 * n + 1) / 24;
-  const z = (wPlus - mean5) / Math.sqrt(variance2);
+  const z = (wPlus - mean7) / Math.sqrt(variance2);
   const p = 2 * (1 - normalCdf(Math.abs(z)));
   return { w: wPlus, p };
 }
@@ -1926,6 +1935,244 @@ function rand(bytes) {
   return Array.from(arr).map((b) => b.toString(16).padStart(2, "0")).join("");
 }
+// src/experiment-tracker-fs.ts
+var FileSystemExperimentStore = class { // Experiment/run store that appends records to NDJSON files under `dir` and answers reads from an in-memory index.
+  dir; // directory holding the *.ndjson files
+  maxBytes; // size (bytes) at which append() rolls the active file over
+  index; // InMemoryExperimentStore populated by load()
+  loaded = false; // true once load() has run to completion
+  constructor(options) { // options: { dir, maxBytes? }
+    this.dir = options.dir;
+    this.maxBytes = options.maxBytes ?? 32 * 1024 * 1024; // default threshold: 32 MiB
+  }
+  async saveExperiment(exp) { // NOTE(review): index is updated before the disk append, so a failed append leaves memory ahead of disk
+    const idx = await this.load();
+    await idx.saveExperiment(exp);
+    await this.append("experiments", exp);
+  }
+  async getExperiment(id) { // delegates to the in-memory index
+    const idx = await this.load();
+    return idx.getExperiment(id);
+  }
+  async listExperiments() { // delegates to the in-memory index
+    const idx = await this.load();
+    return idx.listExperiments();
+  }
+  async saveRun(run) { // same ordering as saveExperiment: index first, then append to runs.ndjson
+    const idx = await this.load();
+    await idx.saveRun(run);
+    await this.append("runs", run);
+  }
+  async getRun(id) { // delegates to the in-memory index
+    const idx = await this.load();
+    return idx.getRun(id);
+  }
+  async listRuns(experimentId) { // delegates to the in-memory index
+    const idx = await this.load();
+    return idx.listRuns(experimentId);
+  }
+  async ensureDir() { // creates `dir` (and parents) if missing
+    const fs = await import("fs/promises");
+    await fs.mkdir(this.dir, { recursive: true });
+  }
+  async append(name, record) { // appends `record` as one JSON line to `<dir>/<name>.ndjson`, rolling the file first when it is too large
+    await this.ensureDir();
+    const fs = await import("fs/promises");
+    const path = await import("path");
+    const active = path.join(this.dir, `${name}.ndjson`);
+    try {
+      const stat = await fs.stat(active);
+      if (stat.size >= this.maxBytes) { // roll-over: rename the full file to `<name>.<epoch-ms>.ndjson`
+        const rolled = path.join(this.dir, `${name}.${Date.now()}.ndjson`);
+        await fs.rename(active, rolled);
+      }
+    } catch { // presumably meant for "active file does not exist yet"; NOTE(review): a failed rename is silently ignored here as well
+    }
+    await fs.appendFile(active, JSON.stringify(record) + "\n", "utf8");
+  }
+  async load() { // builds the in-memory index once by replaying every *.ndjson file in `dir`; later calls return the cached index
+    if (this.loaded && this.index) return this.index;
+    const fs = await import("fs/promises");
+    const path = await import("path");
+    const store = new InMemoryExperimentStore();
+    try {
+      const entries = await fs.readdir(this.dir);
+      const sorted = entries.filter((f) => f.endsWith(".ndjson")).sort((a, b) => a.localeCompare(b)); // NOTE(review): replay order of rolled vs. active files depends on this comparison — confirm later records win
+      for (const file of sorted) {
+        const full = path.join(this.dir, file);
+        const content = await fs.readFile(full, "utf8");
+        const base = file.split(".")[0]; // text before the first "." selects the record kind ("experiments" or "runs")
+        for (const line of content.split("\n")) {
+          if (!line.trim()) continue; // skip blank lines
+          let record;
+          try {
+            record = JSON.parse(line);
+          } catch { // unparsable lines are skipped rather than aborting the load
+            continue;
+          }
+          if (base === "experiments") {
+            await store.saveExperiment(record);
+          } else if (base === "runs") {
+            await store.saveRun(record);
+          } // files with any other base name are read but ignored
+        }
+      }
+    } catch { // NOTE(review): any readdir/readFile/save error stops the replay silently; the partially filled store is still marked loaded below
+    }
+    this.index = store;
+    this.loaded = true;
+    return store;
+  }
+};
+// src/experiment-tracker-d1.ts
+var SCHEMA_VERSION = 1; // written to the meta table under key 'schema_version' by ensureSchema()
+var D1ExperimentStore = class { // Experiment/run store backed by a database handle exposing exec()/prepare().bind().run()/first()/all()
+  db;
+  experimentsTable;
+  runsTable;
+  metaTable;
+  schemaReady = false; // set after ensureSchema() succeeds so later calls return immediately
+  constructor(options) { // options: { db, tablePrefix? }
+    this.db = options.db;
+    const prefix = options.tablePrefix ?? "agent_eval_"; // NOTE(review): prefix is interpolated into SQL text unescaped — must come from trusted configuration
+    this.experimentsTable = `${prefix}experiments`;
+    this.runsTable = `${prefix}runs`;
+    this.metaTable = `${prefix}meta`;
+  }
+  /**
+   * Idempotent schema setup. Safe to call before every operation; the second
+   * call short-circuits via `schemaReady`. Most consumers will call it once
+   * during Worker bootstrap.
+   */
+  async ensureSchema() {
+    if (this.schemaReady) return;
+    const ddl = `
+      CREATE TABLE IF NOT EXISTS ${this.experimentsTable} (
+        id TEXT PRIMARY KEY,
+        name TEXT NOT NULL,
+        created_at TEXT NOT NULL,
+        metadata_json TEXT
+      );
+      CREATE TABLE IF NOT EXISTS ${this.runsTable} (
+        id TEXT PRIMARY KEY,
+        experiment_id TEXT NOT NULL,
+        name TEXT,
+        status TEXT NOT NULL,
+        started_at TEXT NOT NULL,
+        completed_at TEXT,
+        config_json TEXT NOT NULL,
+        report_json TEXT,
+        error TEXT
+      );
+      CREATE INDEX IF NOT EXISTS idx_${this.runsTable}_experiment ON ${this.runsTable}(experiment_id);
+      CREATE INDEX IF NOT EXISTS idx_${this.runsTable}_started ON ${this.runsTable}(started_at);
+      CREATE TABLE IF NOT EXISTS ${this.metaTable} (
+        key TEXT PRIMARY KEY,
+        value TEXT NOT NULL
+      );
+      INSERT OR REPLACE INTO ${this.metaTable}(key, value) VALUES ('schema_version', '${SCHEMA_VERSION}');
+    `;
+    await this.db.exec(ddl.trim().replace(/\s+/g, " ")); // whole DDL collapsed onto one line before exec; presumably because exec() treats newlines as statement boundaries — TODO confirm
+    this.schemaReady = true; // only reached when exec() did not throw
+  }
+  async saveExperiment(exp) { // upsert keyed on id; metadata is stored as JSON text or NULL
+    await this.ensureSchema();
+    await this.db.prepare(
+      `INSERT INTO ${this.experimentsTable}(id, name, created_at, metadata_json)
+         VALUES (?1, ?2, ?3, ?4)
+         ON CONFLICT(id) DO UPDATE SET
+           name = excluded.name,
+           created_at = excluded.created_at,
+           metadata_json = excluded.metadata_json`
+    ).bind(exp.id, exp.name, exp.createdAt, exp.metadata ? JSON.stringify(exp.metadata) : null).run();
+  }
+  async getExperiment(id) { // resolves to the mapped experiment, or null when no row matches
+    await this.ensureSchema();
+    const row = await this.db.prepare(
+      `SELECT id, name, created_at, metadata_json
+         FROM ${this.experimentsTable}
+         WHERE id = ?1`
+    ).bind(id).first();
+    return row ? rowToExperiment(row) : null;
+  }
+  async listExperiments() { // all experiments, ordered by created_at descending
+    await this.ensureSchema();
+    const { results } = await this.db.prepare(
+      `SELECT id, name, created_at, metadata_json
+         FROM ${this.experimentsTable}
+         ORDER BY created_at DESC`
+    ).all();
+    return results.map(rowToExperiment);
+  }
+  async saveRun(run) { // upsert keyed on id; optional fields are bound as NULL when nullish
+    await this.ensureSchema();
+    await this.db.prepare(
+      `INSERT INTO ${this.runsTable}(id, experiment_id, name, status, started_at, completed_at, config_json, report_json, error)
+         VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9)
+         ON CONFLICT(id) DO UPDATE SET
+           experiment_id = excluded.experiment_id,
+           name = excluded.name,
+           status = excluded.status,
+           started_at = excluded.started_at,
+           completed_at = excluded.completed_at,
+           config_json = excluded.config_json,
+           report_json = excluded.report_json,
+           error = excluded.error`
+    ).bind(
+      run.id,
+      run.experimentId,
+      run.name ?? null,
+      run.status,
+      run.startedAt,
+      run.completedAt ?? null,
+      JSON.stringify(run.config), // config is always serialized (column is NOT NULL)
+      run.report ? JSON.stringify(run.report) : null,
+      run.error ?? null
+    ).run();
+  }
+  async getRun(id) { // resolves to the mapped run, or null when no row matches
+    await this.ensureSchema();
+    const row = await this.db.prepare(
+      `SELECT id, experiment_id, name, status, started_at, completed_at, config_json, report_json, error
+         FROM ${this.runsTable}
+         WHERE id = ?1`
+    ).bind(id).first();
+    return row ? rowToRun(row) : null;
+  }
+  async listRuns(experimentId) { // runs of one experiment, ordered by started_at descending
+    await this.ensureSchema();
+    const { results } = await this.db.prepare(
+      `SELECT id, experiment_id, name, status, started_at, completed_at, config_json, report_json, error
+         FROM ${this.runsTable}
+         WHERE experiment_id = ?1
+         ORDER BY started_at DESC`
+    ).bind(experimentId).all();
+    return results.map(rowToRun);
+  }
+};
+function rowToExperiment(row) { // Maps a snake_case experiments row to the camelCase experiment object.
+  return {
+    id: row.id,
+    name: row.name,
+    createdAt: row.created_at,
+    ...row.metadata_json ? { metadata: JSON.parse(row.metadata_json) } : {} // `metadata` key is omitted when the column is empty; JSON.parse throws on malformed text
+  };
+}
+function rowToRun(row) { // Maps a snake_case runs row to the camelCase run object; optional keys are omitted when their column is falsy.
+  return {
+    id: row.id,
+    experimentId: row.experiment_id,
+    ...row.name ? { name: row.name } : {}, // NOTE(review): truthiness check also drops an empty-string name, which saveRun stores as "" (it only nulls nullish values)
+    status: row.status,
+    startedAt: row.started_at,
+    ...row.completed_at ? { completedAt: row.completed_at } : {},
+    config: JSON.parse(row.config_json), // always parsed; throws on malformed JSON
+    ...row.report_json ? { report: JSON.parse(row.report_json) } : {},
+    ...row.error ? { error: row.error } : {} // NOTE(review): same truthiness caveat as `name` — an empty-string error is dropped
+  };
+}
 // src/power-analysis.ts
 function requiredSampleSize(opts) {
   const effect = opts.effect;
@@ -2486,6 +2733,56 @@ function paretoFrontier(candidates, objectives) {
   }));
   return { frontier, dominated, dominanceMap };
 }
+function scalarScore(candidates, objectives, options = {}) { // Returns [{ candidate, score }] in input order; score is the weighted sum of min-max-normalised objective values.
+  if (candidates.length === 0) return [];
+  const weights = options.weights ?? {}; // per-objective weights keyed by objective name; missing entries count as 1
+  const totalWeight = objectives.reduce((s, o) => s + (weights[o.name] ?? 1), 0); // NOTE(review): if the weights sum to 0 the division below yields NaN/Infinity
+  const ranges = objectives.map((obj) => { // per-objective { min, max } over the finite values only
+    const values = candidates.map((c) => obj.value(c)).filter((v) => Number.isFinite(v));
+    if (values.length === 0) return { min: 0, max: 1 }; // no finite value: placeholder range (never used, since non-finite values are skipped below)
+    const min = Math.min(...values); // NOTE(review): spreading a very large array can exceed the engine's argument limit
+    const max = Math.max(...values);
+    return { min, max: max === min ? min + 1 : max }; // widen a degenerate range so the normalisation never divides by zero
+  });
+  return candidates.map((c) => {
+    let score = 0;
+    objectives.forEach((obj, i) => {
+      const v = obj.value(c);
+      if (!Number.isFinite(v)) return; // a non-finite objective value contributes nothing to the score
+      const { min, max } = ranges[i];
+      const normalised = (v - min) / (max - min);
+      const directional = obj.direction === "maximize" ? normalised : 1 - normalised; // any direction other than "maximize" is treated as minimise
+      const weight = (weights[obj.name] ?? 1) / totalWeight;
+      score += directional * weight;
+    });
+    return { candidate: c, score };
+  });
+}
+function crowdingDistance(candidates, objectives) { // Returns [{ candidate, distance }] in input order; distance accumulates, per objective, the range-normalised gap between each candidate's sorted neighbours.
+  const distances = new Map(candidates.map((c) => [c, 0])); // keyed by candidate identity, so duplicate references share one entry
+  for (const obj of objectives) {
+    const sorted = [...candidates].sort((a, b) => obj.value(a) - obj.value(b)); // ascending by this objective; input array is not mutated
+    const min = obj.value(sorted[0]); // NOTE(review): with empty `candidates`, sorted[0] is undefined and is passed to obj.value
+    const max = obj.value(sorted[sorted.length - 1]);
+    const range = max - min || 1; // fall back to 1 when all values are equal to avoid dividing by zero
+    distances.set(sorted[0], Infinity); // both extremes of every objective get infinite distance
+    distances.set(sorted[sorted.length - 1], Infinity);
+    for (let i = 1; i < sorted.length - 1; i++) {
+      const prev = obj.value(sorted[i - 1]);
+      const next = obj.value(sorted[i + 1]);
+      const current = distances.get(sorted[i]);
+      if (current === Infinity) continue; // already an extreme for some objective; stays infinite
+      distances.set(sorted[i], current + (next - prev) / range);
+    }
+  }
+  return candidates.map((c) => ({ candidate: c, distance: distances.get(c) ?? 0 }));
+}
+function paretoFrontierWithCrowding(candidates, objectives) { // Returns the Pareto-frontier members as [{ candidate, distance }], sorted by crowding distance descending.
+  const { frontier } = paretoFrontier(candidates, objectives);
+  if (frontier.length === 0) return []; // guard: crowdingDistance is never called with an empty list
+  const distances = crowdingDistance(frontier, objectives);
+  return distances.sort((a, b) => b.distance - a.distance); // in-place sort of the array crowdingDistance just created
+}
 // src/harness-optimizer.ts
 var DEFAULT_HARNESS_OBJECTIVES = [
@@ -5095,10 +5392,10 @@ function analyzeSeries(values, options = {}) {
     return { state: "insufficient-data", windowMean: 0, windowCv: 0, tailRun: 0, stable: false };
   }
   const tail = values.slice(-window);
-  const mean5 = tail.reduce((a, b) => a + b, 0) / tail.length;
-  const variance2 = tail.reduce((acc, v) => acc + (v - mean5) ** 2, 0) / tail.length;
+  const mean7 = tail.reduce((a, b) => a + b, 0) / tail.length;
+  const variance2 = tail.reduce((acc, v) => acc + (v - mean7) ** 2, 0) / tail.length;
   const stdDev = Math.sqrt(variance2);
-  const refMean = Math.abs(mean5) > 1e-9 ? Math.abs(mean5) : 1;
+  const refMean = Math.abs(mean7) > 1e-9 ? Math.abs(mean7) : 1;
   const cv = stdDev / refMean;
   const stable = tail.length >= window && cv <= stableCv;
   let tailRun = 0;
@@ -5119,7 +5416,7 @@ function analyzeSeries(values, options = {}) {
   } else {
     state = "noisy";
   }
-  return { state, windowMean: mean5, windowCv: cv, tailRun, stable };
+  return { state, windowMean: mean7, windowCv: cv, tailRun, stable };
 }
 // src/state-continuity.ts
@@ -6047,12 +6344,12 @@ async function paraphraseRobustness(prompt, mutators, scoreFn, options = {}) {
     variantScores.push({ mutator: id, score, mutated });
     all.push(score);
   }
-  const mean5 = all.reduce((a, b) => a + b, 0) / all.length;
-  const variance2 = all.reduce((a, v) => a + (v - mean5) ** 2, 0) / all.length;
+  const mean7 = all.reduce((a, b) => a + b, 0) / all.length;
+  const variance2 = all.reduce((a, v) => a + (v - mean7) ** 2, 0) / all.length;
   const stdDev = Math.sqrt(variance2);
-  const ref = Math.abs(mean5) > 1e-9 ? Math.abs(mean5) : 1;
+  const ref = Math.abs(mean7) > 1e-9 ? Math.abs(mean7) : 1;
   const robustness = Math.max(0, 1 - stdDev / ref);
-  return { originalScore, variantScores, meanScore: mean5, stdDev, robustness };
+  return { originalScore, variantScores, meanScore: mean7, stdDev, robustness };
 }
 var lowercaseMutator = (p) => p.toLowerCase();
 var sentenceReorderMutator = (p, seed) => {
@@ -6973,8 +7270,8 @@ async function prmBestOfN(store, grader, runIds) {
   if (runIds.length === 0) throw new Error("prmBestOfN: at least 1 candidate required");
   const graded = await Promise.all(runIds.map((id) => grader.grade(store, id)));
   const ranked = [...graded].sort((a, b) => b.aggregateScore - a.aggregateScore);
-  const mean5 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
-  const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean5) ** 2, 0) / graded.length;
+  const mean7 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
+  const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean7) ** 2, 0) / graded.length;
   return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
 }
 async function prmEnsembleBestOfN(store, graders, runIds) {
@@ -6996,8 +7293,8 @@ async function prmEnsembleBestOfN(store, graders, runIds) {
   const ranked = [...byRun.values()].sort(
     (a, b) => (bordaScores.get(b.runId) ?? 0) - (bordaScores.get(a.runId) ?? 0)
   );
-  const mean5 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
-  const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean5) ** 2, 0) / ranked.length;
+  const mean7 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
+  const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean7) ** 2, 0) / ranked.length;
   return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
 }
@@ -7527,8 +7824,8 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
     const sRuns = runs.filter((r) => r.scenarioId === s.id);
     const scores = sRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
     if (scores.length < 3) continue;
-    const mean5 = scores.reduce((a, b) => a + b, 0) / scores.length;
-    const variance2 = scores.reduce((a, b) => a + (b - mean5) ** 2, 0) / scores.length;
+    const mean7 = scores.reduce((a, b) => a + b, 0) / scores.length;
+    const variance2 = scores.reduce((a, b) => a + (b - mean7) ** 2, 0) / scores.length;
     if (variance2 > varianceThreshold) {
       targets.push({
         reason: "high-variance",
@@ -8008,212 +8305,6 @@ async function euAiActReport(ctx, signals) {
   };
 }
-// src/llm-client.ts
-var LlmCallError = class extends Error {
-  constructor(message, status, body, model) {
-    super(message);
-    this.status = status;
-    this.body = body;
-    this.model = model;
-    this.name = "LlmCallError";
-  }
-  status;
-  body;
-  model;
-};
-var DEFAULT_BASE_URL = "https://router.tangle.tools/v1";
-var DEFAULT_TIMEOUT_MS = 6e4;
-var DEFAULT_MAX_RETRIES = 3;
-var RETRYABLE_STATUS = /* @__PURE__ */ new Set([429, 502, 503, 504]);
-function isRetryableError(err) {
-  if (err instanceof LlmCallError) return RETRYABLE_STATUS.has(err.status);
-  if (err instanceof Error) {
-    return err.name === "AbortError" || err.name === "TimeoutError" || /fetch failed|ECONNRESET|ETIMEDOUT|EAI_AGAIN/i.test(err.message);
-  }
-  return false;
-}
-function parseRetryAfter(headers) {
-  const h = headers.get("retry-after");
-  if (!h) return null;
-  const asNumber = Number(h);
-  if (Number.isFinite(asNumber) && asNumber > 0) return asNumber * 1e3;
-  const asDate = Date.parse(h);
-  if (Number.isFinite(asDate)) return Math.max(0, asDate - Date.now());
-  return null;
-}
-function backoffMs(attempt) {
-  return Math.min(500 * Math.pow(2, attempt), 16e3);
-}
-function buildHeaders(opts) {
-  const headers = {
-    "Content-Type": "application/json",
-    Accept: "application/json"
-  };
-  if (opts.authHeader) {
-    headers[opts.authHeader.name] = opts.authHeader.value;
-  } else if (opts.bearer || opts.apiKey) {
-    headers.Authorization = `Bearer ${opts.bearer ?? opts.apiKey}`;
-  }
-  return headers;
-}
-function isSchemaRejection(status, body) {
-  if (status !== 400) return false;
-  const lower = body.toLowerCase();
-  return lower.includes("response_format") || lower.includes("json_schema") || lower.includes("is unavailable") || lower.includes("not supported");
-}
-function buildBody(req, forceJsonObject) {
-  const body = {
-    model: req.model,
-    messages: req.messages,
-    temperature: req.temperature ?? 0
-  };
-  if (req.maxTokens != null) body.max_tokens = req.maxTokens;
-  if (req.jsonSchema && !forceJsonObject) {
-    body.response_format = {
-      type: "json_schema",
-      json_schema: { name: req.jsonSchema.name, schema: req.jsonSchema.schema, strict: true }
-    };
-  } else if (req.jsonMode || req.jsonSchema) {
-    body.response_format = { type: "json_object" };
-  }
-  return body;
-}
-async function sleep(ms) {
-  return new Promise((resolve) => setTimeout(resolve, ms));
-}
-function stripFencedJson(raw) {
-  const trimmed = raw.trim();
-  const m = trimmed.match(/^```(?:json)?\s*\n?([\s\S]*?)\n?```\s*$/);
-  return m ? m[1].trim() : trimmed;
-}
-async function callLlm(req, opts = {}) {
-  const baseUrl = (opts.baseUrl ?? DEFAULT_BASE_URL).replace(/\/+$/, "");
-  const url = `${baseUrl}/chat/completions`;
-  const timeoutMs = req.timeoutMs ?? opts.defaultTimeoutMs ?? DEFAULT_TIMEOUT_MS;
-  const maxRetries = opts.maxRetries ?? DEFAULT_MAX_RETRIES;
-  const fetchFn = opts.fetch ?? globalThis.fetch;
-  const headers = buildHeaders(opts);
-  let lastErr;
-  for (let attempt = 0; attempt < maxRetries; attempt++) {
-    const controller = new AbortController();
-    const timeoutHandle = setTimeout(() => controller.abort(), timeoutMs);
-    const started = Date.now();
-    try {
-      const res = await fetchFn(url, {
-        method: "POST",
-        headers,
-        body: JSON.stringify(buildBody(req, false)),
-        signal: controller.signal
-      });
-      clearTimeout(timeoutHandle);
-      if (!res.ok) {
-        const body = await res.text();
-        const err = new LlmCallError(
-          `LLM call ${res.status}: ${body.slice(0, 300)}`,
-          res.status,
-          body,
-          req.model
-        );
-        if (RETRYABLE_STATUS.has(res.status) && attempt < maxRetries - 1) {
-          lastErr = err;
-          const retryAfter = parseRetryAfter(res.headers);
-          await sleep(retryAfter ?? backoffMs(attempt));
-          continue;
-        }
-        throw err;
-      }
-      const json = await res.json();
-      const choice = json.choices?.[0];
-      const usageRaw = json.usage ?? {};
-      const costFromProxy = json._response_cost ?? json.cost_usd;
-      return {
-        content: choice?.message?.content ?? "",
-        usage: {
-          promptTokens: Number(usageRaw.prompt_tokens ?? 0),
-          completionTokens: Number(usageRaw.completion_tokens ?? 0),
-          totalTokens: Number(usageRaw.total_tokens ?? 0),
-          cachedPromptTokens: usageRaw.prompt_tokens_details && typeof usageRaw.prompt_tokens_details === "object" ? Number(
-            usageRaw.prompt_tokens_details.cached_tokens ?? 0
-          ) : void 0
-        },
-        costUsd: typeof costFromProxy === "number" ? costFromProxy : null,
-        model: json.model ?? req.model,
-        durationMs: Date.now() - started,
-        raw: json
-      };
-    } catch (err) {
-      clearTimeout(timeoutHandle);
-      lastErr = err;
-      if (attempt < maxRetries - 1 && isRetryableError(err)) {
-        await sleep(backoffMs(attempt));
-        continue;
-      }
-      throw err;
-    }
-  }
-  throw lastErr instanceof Error ? lastErr : new Error(String(lastErr));
-}
-async function callLlmJson(req, opts = {}) {
-  try {
-    const result = await callLlm({ ...req, jsonMode: req.jsonMode ?? !req.jsonSchema }, opts);
-    const value = parseJsonSafely(result.content, result.model);
-    return { value, result };
-  } catch (err) {
-    if (err instanceof LlmCallError && isSchemaRejection(err.status, err.body) && req.jsonSchema) {
-      const degradedReq = { ...req, jsonMode: true, jsonSchema: void 0 };
-      const result = await callLlm(degradedReq, opts);
-      const value = parseJsonSafely(result.content, result.model);
-      return { value, result };
-    }
-    throw err;
-  }
-}
-function parseJsonSafely(content, model) {
-  const stripped = stripFencedJson(content);
-  try {
-    return JSON.parse(stripped);
-  } catch (err) {
-    throw new Error(
-      `LLM returned non-JSON content (model=${model}): ${err instanceof Error ? err.message : String(err)}
---- raw content ---
-${content.slice(0, 800)}`
-    );
-  }
-}
-async function probeLlm(model, opts = {}) {
-  const start = Date.now();
-  try {
-    await callLlm(
-      {
-        model,
-        messages: [{ role: "user", content: "ping" }],
-        maxTokens: 64,
-        timeoutMs: opts.timeoutMs ?? 3e4
-      },
-      opts
-    );
-    return { ok: true, latencyMs: Date.now() - start, error: null };
-  } catch (err) {
-    return {
-      ok: false,
-      latencyMs: Date.now() - start,
-      error: err instanceof Error ? err.message : String(err)
-    };
-  }
-}
-var LlmClient = class {
-  constructor(opts = {}) {
-    this.opts = opts;
-  }
-  opts;
-  call(req, per) {
-    return callLlm(req, { ...this.opts, ...per });
-  }
-  callJson(req, per) {
-    return callLlmJson(req, { ...this.opts, ...per });
-  }
-};
 // src/multi-layer-verifier.ts
 function gradeSemanticStatus(input) {
   if (!input.available) return "error";
@@ -9771,7 +9862,7 @@ function buildScenarioScore(scenario, matches2, falsePositives) {
   const total = scenario.references.length;
   const matchedWeight = matches2.filter((match) => match.matched).reduce((sum2, match) => sum2 + match.weight, 0);
   const totalWeight = matches2.reduce((sum2, match) => sum2 + match.weight, 0);
-  const precision = ratio(matched, matched + falsePositives);
+  const precision2 = ratio(matched, matched + falsePositives);
   const recall = ratio(matched, total);
   return {
     scenarioId: scenario.id,
@@ -9781,9 +9872,9 @@ function buildScenarioScore(scenario, matches2, falsePositives) {
     falsePositives,
     matchedWeight,
     totalWeight,
-    precision,
+    precision: precision2,
     recall,
-    f1: f1(precision, recall),
+    f1: f1(precision2, recall),
     matches: matches2
   };
 }
@@ -9801,7 +9892,7 @@ function aggregateScenarioScores(scores) {
   const falsePositives = sum(scores.map((score) => score.falsePositives));
   const matchedWeight = sum(scores.map((score) => score.matchedWeight));
   const totalWeight = sum(scores.map((score) => score.totalWeight));
-  const precision = ratio(matched, matched + falsePositives);
+  const precision2 = ratio(matched, matched + falsePositives);
   const recall = ratio(matched, total);
   return {
     matched,
@@ -9809,9 +9900,9 @@ function aggregateScenarioScores(scores) {
     falsePositives,
     matchedWeight,
     totalWeight,
-    precision,
+    precision: precision2,
     recall,
-    f1: f1(precision, recall),
+    f1: f1(precision2, recall),
     weightedRecall: ratio(matchedWeight, totalWeight)
   };
 }
@@ -9831,8 +9922,8 @@ function emptyAggregate() {
 function hasSplit(score, split) {
   return score.bySplit[split] !== void 0;
 }
-function f1(precision, recall) {
-  return precision + recall === 0 ? 0 : 2 * precision * recall / (precision + recall);
+function f1(precision2, recall) {
+  return precision2 + recall === 0 ? 0 : 2 * precision2 * recall / (precision2 + recall);
 }
 function ratio(numerator, denominator) {
   return denominator > 0 ? numerator / denominator : 0;
@@ -9956,14 +10047,14 @@ function referenceReplayRunsToSteeringRows(runs, options = {}) {
 function referenceReplayScenarioToRunScore(scenarioScore, durationMs = 0) {
   const success = scenarioScore.f1;
   const recall = scenarioScore.recall;
-  const precision = scenarioScore.precision;
+  const precision2 = scenarioScore.precision;
   const failed = scenarioScore.total > 0 && scenarioScore.matched === 0;
   return {
     success,
     goalProgress: recall,
-    repoGroundedness: precision,
-    driftPenalty: 1 - precision,
-    toolUseQuality: precision,
+    repoGroundedness: precision2,
+    driftPenalty: 1 - precision2,
+    toolUseQuality: precision2,
     patchQuality: 0,
     testReality: scenarioScore.total > 0 ? 1 : 0,
     finalGate: success,
@@ -9972,10 +10063,569 @@ function referenceReplayScenarioToRunScore(scenarioScore, durationMs = 0) {
     wallSeconds: Math.max(0, durationMs / 1e3),
     notes: [
       `reference-replay matched ${scenarioScore.matched}/${scenarioScore.total}`,
-      `precision=${precision.toFixed(3)} recall=${recall.toFixed(3)} f1=${success.toFixed(3)}`
+      `precision=${precision2.toFixed(3)} recall=${recall.toFixed(3)} f1=${success.toFixed(3)}`
     ]
   };
 }
+// src/prompt-evolution.ts
+/**
+ * Process-local trial cache backed by a `Map` held in the public `store` field.
+ * Exposes the `get`/`set` pair used by the scoring loop plus `size` and `clear`.
+ */
+var InMemoryTrialCache = class {
+  constructor() {
+    this.store = new Map();
+  }
+  /** Returns the value stored under `key`, or `undefined` when the key is absent. */
+  get(key) {
+    const { store } = this;
+    return store.get(key);
+  }
+  /** Stores `value` under `key`, replacing any previous entry. Returns nothing. */
+  set(key, value) {
+    const { store } = this;
+    store.set(key, value);
+  }
+  /** Number of entries currently held. */
+  size() {
+    const { store } = this;
+    return store.size;
+  }
+  /** Removes every entry. */
+  clear() {
+    const { store } = this;
+    store.clear();
+  }
+};
+/**
+ * Runs the generational prompt-evolution loop: score the population, aggregate the
+ * trials per variant, compute a Pareto front and a scalar-score winner, record a
+ * report, then build the next population.
+ *
+ * Fields of `config` read directly here: `seedVariants`, `generations`, `scenarioIds`,
+ * `objectives`, `scalarWeights`, `runId`, `target`, `earlyStopOnNoImprovement` and the
+ * optional `onProgress` callback. `config` is also passed whole to `scorePopulation`
+ * and `nextPopulation`.
+ *
+ * @param {object} config - Evolution settings and adapters.
+ * @returns {Promise<object>} `{ runId, target, generations, bestVariant, bestAggregate }`.
+ */
+async function runPromptEvolution(config) {
+  const generations = [];
+  // Copy so later reassignment of `population` never touches the caller's array.
+  let population = [...config.seedVariants];
+  // `undefined` when `seedVariants` is empty.
+  let bestVariant = population[0];
+  let bestAggregate = null;
+  for (let generation = 0; generation < config.generations; generation++) {
+    config.onProgress?.({ type: "generation-start", generation, populationSize: population.length });
+    const trials = await scorePopulation(population, config, generation);
+    const aggregates = aggregateTrials(population, config.scenarioIds, trials);
+    const front = paretoFrontierWithCrowding(aggregates, config.objectives);
+    const frontIds = new Set(front.map((c) => c.candidate.variantId));
+    const scored = scalarScore(aggregates, config.objectives, { weights: config.scalarWeights });
+    // Highest scalar score first.
+    scored.sort((a, b) => b.score - a.score);
+    // Winner falls back to the first aggregate, then to the first population member.
+    const winnerId = scored[0]?.candidate.variantId ?? aggregates[0]?.variantId ?? population[0].id;
+    const report = {
+      runId: config.runId,
+      target: config.target,
+      generation,
+      variants: population,
+      aggregates,
+      paretoFrontIds: front.map((c) => c.candidate.variantId),
+      winnerId,
+      trials
+    };
+    generations.push(report);
+    config.onProgress?.({ type: "generation-complete", report });
+    const winnerAgg = aggregates.find((a) => a.variantId === winnerId);
+    if (winnerAgg) {
+      // The "best" pair is overwritten with this generation's winner; it is not
+      // compared against the winners of earlier generations.
+      const winner = population.find((v) => v.id === winnerId);
+      if (winner) bestVariant = winner;
+      bestAggregate = winnerAgg;
+    }
+    // Early stop is on unless the flag is exactly `false`; it triggers when both the
+    // winner id and the set of Pareto-front ids match the previous generation.
+    if (config.earlyStopOnNoImprovement !== false && generations.length >= 2) {
+      const prev = generations[generations.length - 2];
+      const noChange = prev.winnerId === winnerId && samePopulation(prev.paretoFrontIds, [...frontIds]);
+      if (noChange) {
+        config.onProgress?.({ type: "converged", generation, reason: "no improvement vs previous generation" });
+        break;
+      }
+    }
+    // No breeding after the final generation.
+    if (generation === config.generations - 1) break;
+    population = await nextPopulation(population, aggregates, trials, front, config, generation + 1);
+  }
+  return {
+    runId: config.runId,
+    target: config.target,
+    generations,
+    bestVariant,
+    // When no generation set `bestAggregate` (for example the loop body never ran),
+    // fall back to an aggregate computed over zero trials; `find` may yield `undefined`.
+    bestAggregate: bestAggregate ?? aggregateTrials(population, config.scenarioIds, []).find((a) => a.variantId === bestVariant.id)
+  };
+}
+/**
+ * Scores every (variant, scenario, rep) combination of `population`.
+ *
+ * Each trial is looked up in `config.cache` (when present) under the key
+ * `variantId|scenarioId|rep`; a truthy hit is returned without calling the score
+ * adapter. Every trial emits one "trial-complete" progress event. The trials are run
+ * through `runWithConcurrency` with `config.scoreConcurrency`.
+ *
+ * @param {Array<object>} population - Variants to score; each needs an `id`.
+ * @param {object} config - Supplies `scenarioIds`, `reps`, `scoreAdapter`, and optional `cache` / `onProgress`.
+ * @param {number} generation - Generation index, echoed in progress events.
+ * @returns {Promise<Array<object>>} Trial results in job order (variant, then scenario, then rep).
+ */
+async function scorePopulation(population, config, generation) {
+  // Emits the progress event; `fromCache` tells listeners whether the adapter was skipped.
+  const reportTrial = (variant, scenarioId, rep, trial, fromCache) => {
+    config.onProgress?.({
+      type: "trial-complete",
+      generation,
+      variantId: variant.id,
+      scenarioId,
+      rep,
+      ok: trial.ok,
+      score: trial.score,
+      cached: fromCache
+    });
+  };
+  const runTrial = async (variant, scenarioId, rep) => {
+    const cacheKey = `${variant.id}|${scenarioId}|${rep}`;
+    const remembered = config.cache?.get(cacheKey);
+    if (remembered) {
+      reportTrial(variant, scenarioId, rep, remembered, true);
+      return remembered;
+    }
+    const fresh = await config.scoreAdapter.score({ variant, scenarioId, rep });
+    config.cache?.set(cacheKey, fresh);
+    reportTrial(variant, scenarioId, rep, fresh, false);
+    return fresh;
+  };
+  const jobs = [];
+  for (const variant of population) {
+    for (const scenarioId of config.scenarioIds) {
+      for (let rep = 0; rep < config.reps; rep += 1) {
+        jobs.push(() => runTrial(variant, scenarioId, rep));
+      }
+    }
+  }
+  return runWithConcurrency(jobs, config.scoreConcurrency);
+}
+/**
+ * Runs the async thunks in `jobs` with at most `concurrency` in flight and returns
+ * their results in the same order as `jobs`.
+ *
+ * The worker count is normalised before use: a missing or non-numeric value falls
+ * back to one worker, `Infinity` means one worker per job, and the count never
+ * exceeds `jobs.length`. Previously `undefined`/`NaN` started zero workers (the
+ * function resolved with an array of holes) and `Infinity` tried to allocate an
+ * astronomically large worker list.
+ *
+ * @param {Array<() => Promise<unknown>>} jobs - Thunks to execute.
+ * @param {number} [concurrency] - Maximum number of jobs in flight; minimum 1.
+ * @returns {Promise<Array<unknown>>} Results indexed like `jobs`. Rejects on the first failing job.
+ */
+async function runWithConcurrency(jobs, concurrency) {
+  const results = new Array(jobs.length);
+  const numeric = Number(concurrency);
+  let requested = 1;
+  if (numeric === Infinity) requested = jobs.length;
+  else if (Number.isFinite(numeric)) requested = Math.floor(numeric);
+  const workerCount = Math.min(jobs.length, Math.max(1, requested));
+  // Shared cursor: each worker claims the next unclaimed index synchronously.
+  let next = 0;
+  async function worker() {
+    while (next < jobs.length) {
+      const i = next++;
+      results[i] = await jobs[i]();
+    }
+  }
+  await Promise.all(Array.from({ length: workerCount }, () => worker()));
+  return results;
+}
+/**
+ * Rolls raw trials up into one aggregate per variant, with a per-scenario breakdown.
+ *
+ * Score, cost, duration and metrics are averaged over successful (`ok`) trials only;
+ * `okRate` is the share of successful trials among all trials of that scenario
+ * (0 when there are none). Variant-level numbers are means of the scenario-level ones.
+ *
+ * @param {Array<object>} population - Variants; each needs an `id`.
+ * @param {Array<string>} scenarioIds - Scenarios to report, in output order.
+ * @param {Array<object>} trials - Trial results carrying `variantId`, `scenarioId`, `ok`, `score`, ...
+ * @returns {Array<object>} One aggregate per variant, in population order.
+ */
+function aggregateTrials(population, scenarioIds, trials) {
+  const summarizeScenario = (variantId, scenarioId, variantTrials) => {
+    const attempted = variantTrials.filter((trial) => trial.scenarioId === scenarioId);
+    const succeeded = attempted.filter((trial) => trial.ok);
+    const metrics = aggregateMetrics(succeeded.map((trial) => trial.metrics ?? {}));
+    return {
+      variantId,
+      scenarioId,
+      meanScore: mean5(succeeded.map((trial) => trial.score)),
+      meanCost: mean5(succeeded.map((trial) => trial.cost ?? 0)),
+      meanDurationMs: mean5(succeeded.map((trial) => trial.durationMs ?? 0)),
+      okRate: attempted.length === 0 ? 0 : succeeded.length / attempted.length,
+      trials: attempted.length,
+      metrics
+    };
+  };
+  return population.map((variant) => {
+    const variantId = variant.id;
+    const ownTrials = trials.filter((trial) => trial.variantId === variantId);
+    const scenarios = scenarioIds.map((scenarioId) => summarizeScenario(variantId, scenarioId, ownTrials));
+    const averageOf = (field) => mean5(scenarios.map((scenario) => scenario[field]));
+    return {
+      variantId,
+      meanScore: averageOf("meanScore"),
+      meanCost: averageOf("meanCost"),
+      meanDurationMs: averageOf("meanDurationMs"),
+      okRate: averageOf("okRate"),
+      scenarios,
+      metrics: aggregateMetrics(scenarios.map((scenario) => scenario.metrics))
+    };
+  });
+}
+/**
+ * Averages each named metric across `rows`.
+ *
+ * Values that are not finite numbers are ignored, so a metric only appears in the
+ * result when at least one row supplied a finite value for it. Keys keep the order
+ * in which they were first seen.
+ *
+ * @param {Array<object>} rows - Metric records (name -> value).
+ * @returns {object} Mean value per metric name.
+ */
+function aggregateMetrics(rows) {
+  const samplesByName = new Map();
+  for (const row of rows) {
+    for (const [name, value] of Object.entries(row)) {
+      if (Number.isFinite(value)) {
+        if (!samplesByName.has(name)) samplesByName.set(name, []);
+        samplesByName.get(name).push(value);
+      }
+    }
+  }
+  const averages = {};
+  samplesByName.forEach((samples, name) => {
+    averages[name] = mean5(samples);
+  });
+  return averages;
+}
+/**
+ * Arithmetic mean of `xs`; an empty list yields 0 rather than NaN.
+ *
+ * @param {Array<number>} xs - Values to average.
+ * @returns {number} The mean, or 0 for an empty list.
+ */
+function mean5(xs) {
+  const count = xs.length;
+  if (count === 0) return 0;
+  let total = 0;
+  xs.forEach((x) => {
+    total += x;
+  });
+  return total / count;
+}
+/**
+ * Builds the population for the next generation: Pareto-front members survive and
+ * the remaining slots (up to `config.populationSize`) are filled with mutations of
+ * the variant that has the highest scalar score.
+ *
+ * @param {Array<object>} current - Variants of the generation just scored.
+ * @param {Array<object>} aggregates - Per-variant aggregates for `current`.
+ * @param {Array<object>} trials - Raw trials of the generation just scored.
+ * @param {Array<object>} front - Pareto-front entries (`entry.candidate.variantId`).
+ * @param {object} config - Supplies `objectives`, `scalarWeights`, `populationSize`, `mutateAdapter`.
+ * @param {number} nextGeneration - Index stamped onto the produced children.
+ * @returns {Promise<Array<object>>} Survivors followed by the new children.
+ */
+async function nextPopulation(current, aggregates, trials, front, config, nextGeneration) {
+  const frontIds = new Set(front.map((entry) => entry.candidate.variantId));
+  const survivors = current.filter((variant) => frontIds.has(variant.id));
+  const ranking = scalarScore(aggregates, config.objectives, { weights: config.scalarWeights });
+  ranking.sort((a, b) => b.score - a.score);
+  // Parent is the top-ranked variant, falling back to the first current variant.
+  const parentId = ranking[0]?.candidate.variantId ?? current[0].id;
+  const parent = current.find((variant) => variant.id === parentId) ?? current[0];
+  const parentAggregate = aggregates.find((aggregate) => aggregate.variantId === parent.id) ?? aggregates[0];
+  const topTrials = topKTrialsByScore(trials, parent.id, 3);
+  const bottomTrials = bottomKTrialsByScore(trials, parent.id, 3);
+  const childCount = Math.max(0, config.populationSize - survivors.length);
+  // Written as a negated `>` so a NaN slot count also skips mutation.
+  if (!(childCount > 0)) return [...survivors];
+  const proposed = await config.mutateAdapter.mutate({
+    parent,
+    parentAggregate,
+    topTrials,
+    bottomTrials,
+    childCount,
+    generation: nextGeneration
+  });
+  // Drop surplus proposals and stamp lineage onto each child.
+  const children = proposed.slice(0, childCount).map((child) => ({ ...child, generation: nextGeneration, parentId: parent.id }));
+  return [...survivors, ...children];
+}
+/**
+ * Returns up to `k` successful trials of `variantId`, highest score first.
+ * The input array is left untouched.
+ */
+function topKTrialsByScore(trials, variantId, k) {
+  const eligible = trials.filter((trial) => trial.variantId === variantId && trial.ok);
+  eligible.sort((left, right) => right.score - left.score);
+  return eligible.slice(0, k);
+}
+/**
+ * Returns up to `k` successful trials of `variantId`, lowest score first.
+ * The input array is left untouched.
+ */
+function bottomKTrialsByScore(trials, variantId, k) {
+  const eligible = trials.filter((trial) => trial.variantId === variantId && trial.ok);
+  eligible.sort((left, right) => left.score - right.score);
+  return eligible.slice(0, k);
+}
+/**
+ * Tells whether two id lists have the same length and contain the same set of ids,
+ * regardless of order.
+ *
+ * The distinct-id counts are compared as well as membership, so a list with a
+ * repeated id (e.g. `["a", "a"]`) is no longer reported equal to `["a", "b"]`.
+ *
+ * @param {Array<string>} a - First id list.
+ * @param {Array<string>} b - Second id list.
+ * @returns {boolean} `true` when both lists describe the same ids.
+ */
+function samePopulation(a, b) {
+  if (a.length !== b.length) return false;
+  const setA = new Set(a);
+  const setB = new Set(b);
+  // Equal distinct counts plus b ⊆ a implies the two sets are equal.
+  return setA.size === setB.size && b.every((id) => setA.has(id));
+}
+// src/golden-matcher.ts
+/**
+ * Checks each golden against the lower-cased text of all candidates.
+ *
+ * @param {Array<object>} goldens - Expectations with `any` phrases and optional `anyRegex` patterns.
+ * @param {Array<unknown>} candidates - Items to search; turned into text by `options.text` or the default extractor.
+ * @param {object} [options] - `text(candidate)` overrides the text extraction.
+ * @returns {{matches: boolean[], hits: number, total: number}} Per-golden flags, the hit count and the golden count.
+ */
+function matchGoldens(goldens, candidates, options = {}) {
+  const toText = options.text ?? defaultExtract5;
+  const haystacks = candidates.map((candidate) => toText(candidate).toLowerCase());
+  const matches = goldens.map((golden) => goldenMatched(golden, haystacks));
+  const hits = matches.reduce((count, wasHit) => (wasHit ? count + 1 : count), 0);
+  return { matches, hits, total: goldens.length };
+}
+/**
+ * Default candidate-to-text conversion: strings pass through, objects contribute
+ * their string-valued own properties joined by a space, anything else is stringified
+ * (`null`/`undefined` become the empty string).
+ */
+function defaultExtract5(candidate) {
+  if (typeof candidate === "string") return candidate;
+  if (!candidate || typeof candidate !== "object") return String(candidate ?? "");
+  return Object.values(candidate).filter((value) => typeof value === "string").join(" ");
+}
+/**
+ * Tells whether a golden is satisfied by any haystack.
+ *
+ * Phrases in `golden.any` are lower-cased and trimmed, then matched as substrings;
+ * blank phrases are ignored. If no phrase matches, each pattern in `golden.anyRegex`
+ * is tried case-insensitively; patterns that fail to compile are skipped.
+ *
+ * @param {{any: string[], anyRegex?: string[]}} golden - The expectation.
+ * @param {string[]} haystacks - Already lower-cased candidate texts.
+ * @returns {boolean} `true` on the first match.
+ */
+function goldenMatched(golden, haystacks) {
+  const phraseHit = golden.any.some((phrase) => {
+    const needle = phrase.toLowerCase().trim();
+    return needle !== "" && haystacks.some((text) => text.includes(needle));
+  });
+  if (phraseHit) return true;
+  return (golden.anyRegex ?? []).some((pattern) => {
+    let compiled;
+    try {
+      compiled = new RegExp(pattern, "i");
+    } catch {
+      // Best effort: an invalid pattern simply cannot match.
+      return false;
+    }
+    return haystacks.some((text) => compiled.test(text));
+  });
+}
+// Default weight per golden severity, used by `weightedRecall` when the caller
+// passes no weights. A severity missing from the table counts with weight 1 there.
+var DEFAULT_SEVERITY_WEIGHTS = {
+  critical: 3,
+  major: 2,
+  minor: 1
+};
+/**
+ * Severity-weighted recall: the weight of matched goldens divided by the weight of
+ * all goldens.
+ *
+ * Returns 1 when there are no goldens or when the total weight is 0. A severity not
+ * present in `weights` counts with weight 1.
+ *
+ * @param {Array<{severity: string}>} goldens - The expectations.
+ * @param {{matches: boolean[]}} result - Match flags, index-aligned with `goldens`.
+ * @param {object} [weights] - Weight per severity.
+ * @returns {number} Weighted recall.
+ */
+function weightedRecall(goldens, result, weights = DEFAULT_SEVERITY_WEIGHTS) {
+  if (goldens.length === 0) return 1;
+  let total = 0;
+  goldens.forEach((golden) => {
+    total += weights[golden.severity] ?? 1;
+  });
+  if (total === 0) return 1;
+  let hit = 0;
+  goldens.forEach((golden, index) => {
+    hit += result.matches[index] ? weights[golden.severity] ?? 1 : 0;
+  });
+  return hit / total;
+}
+/**
+ * Share of candidates whose text matches at least one golden.
+ *
+ * Phrases are normalised the same way `goldenMatched` does it (lower-cased, trimmed,
+ * blank phrases skipped), so a whitespace-only phrase no longer matches every
+ * candidate that contains a space. Regex patterns are compiled once up front instead
+ * of once per candidate; patterns that fail to compile are skipped.
+ *
+ * @param {Array<{any: string[], anyRegex?: string[]}>} goldens - The expectations.
+ * @param {Array<unknown>} candidates - Items to check.
+ * @param {object} [options] - `text(candidate)` overrides the text extraction.
+ * @returns {number} Matched candidates divided by all candidates; 1 when there are no candidates.
+ */
+function precision(goldens, candidates, options = {}) {
+  if (candidates.length === 0) return 1;
+  const extract = options.text ?? defaultExtract5;
+  // A candidate counts when ANY golden matches, so the goldens can be flattened.
+  const needles = [];
+  const patterns = [];
+  for (const golden of goldens) {
+    for (const phrase of golden.any) {
+      const needle = phrase.toLowerCase().trim();
+      if (needle) needles.push(needle);
+    }
+    for (const source of golden.anyRegex ?? []) {
+      try {
+        patterns.push(new RegExp(source, "i"));
+      } catch {
+        // Best effort: an invalid pattern simply cannot match.
+      }
+    }
+  }
+  let matched = 0;
+  for (const cand of candidates) {
+    const haystack = extract(cand).toLowerCase();
+    const matchedAny = needles.some((needle) => haystack.includes(needle)) || patterns.some((re) => re.test(haystack));
+    if (matchedAny) matched++;
+  }
+  return matched / candidates.length;
+}
+// src/orthogonality.ts
+/**
+ * Measures how different the findings of several passes are from each other.
+ *
+ * Each pass becomes a bag-of-words vector; the result is one minus the mean pairwise
+ * cosine similarity, clamped to [0, 1]. With fewer than two passes the orthogonality
+ * is reported as 1 and no similarities are computed.
+ *
+ * @param {object} input - `passes` (each with `findings`), optional `text` renderer and `minTokenLength` (default 4).
+ * @returns {{orthogonality: number, passCount: number, similarities: number[]}} The score and its pairwise inputs.
+ */
+function passOrthogonality(input) {
+  const { passes } = input;
+  const passCount = passes.length;
+  if (passCount < 2) {
+    return { orthogonality: 1, passCount, similarities: [] };
+  }
+  const render = input.text ?? defaultRender;
+  const minLen = input.minTokenLength ?? 4;
+  const vectors = passes.map((pass) => bagOfWords(pass.findings, render, minLen));
+  // Every unordered pair (i < j), in row-major order.
+  const similarities = vectors.flatMap(
+    (left, i) => vectors.slice(i + 1).map((right) => cosineSimilarity(left, right))
+  );
+  let meanSimilarity = 0;
+  if (similarities.length !== 0) {
+    meanSimilarity = similarities.reduce((a, b) => a + b, 0) / similarities.length;
+  }
+  const orthogonality = Math.max(0, Math.min(1, 1 - meanSimilarity));
+  return { orthogonality, passCount, similarities };
+}
+/**
+ * Default finding-to-text conversion: strings pass through, objects contribute their
+ * string-valued own properties joined by a space, anything else is stringified
+ * (`null`/`undefined` become the empty string).
+ */
+function defaultRender(item) {
+  switch (typeof item) {
+    case "string":
+      return item;
+    case "object":
+      if (item) {
+        return Object.values(item).filter((value) => typeof value === "string").join(" ");
+      }
+      return String(item ?? "");
+    default:
+      return String(item ?? "");
+  }
+}
+/**
+ * Counts word occurrences across the rendered, lower-cased text of `items`.
+ *
+ * Text is split on runs of characters other than `a-z` and `0-9`; tokens shorter
+ * than `minLen` are dropped.
+ *
+ * @param {Iterable<unknown>} items - Things to render.
+ * @param {(item: unknown) => string} render - Turns an item into text.
+ * @param {number} minLen - Minimum token length to keep.
+ * @returns {Map<string, number>} Token -> occurrence count, in first-seen order.
+ */
+function bagOfWords(items, render, minLen) {
+  const counts = new Map();
+  for (const item of items) {
+    const words = render(item).toLowerCase().split(/[^a-z0-9]+/);
+    for (const word of words) {
+      if (word.length >= minLen) {
+        counts.set(word, (counts.get(word) ?? 0) + 1);
+      }
+    }
+  }
+  return counts;
+}
+/**
+ * Cosine similarity of two sparse count vectors stored as Maps (term -> count).
+ * Returns 0 when either vector has zero magnitude.
+ */
+function cosineSimilarity(a, b) {
+  const squaredNorm = (vector) => {
+    let acc = 0;
+    for (const [, count] of vector) acc += count * count;
+    return acc;
+  };
+  const aSquared = squaredNorm(a);
+  const bSquared = squaredNorm(b);
+  let dot = 0;
+  for (const [term, count] of a) {
+    const other = b.get(term);
+    if (other) dot += count * other;
+  }
+  const degenerate = aSquared === 0 || bSquared === 0;
+  return degenerate ? 0 : dot / (Math.sqrt(aSquared) * Math.sqrt(bSquared));
+}
+// src/promotion-gate.ts
+/**
+ * Percentile-bootstrap confidence interval for the difference of means
+ * (candidate minus baseline), with a promotion verdict.
+ *
+ * Verdicts: "ADVANCE" when the whole interval is above 0, "REVERT" when it is below 0,
+ * "KEEP" when the interval straddles 0 but the observed delta is non-negative,
+ * otherwise "INCONCLUSIVE". When either sample is empty, the combined sample count is
+ * below `minTotalSamples`, or fewer than one resampling round is requested, the result
+ * is "INCONCLUSIVE" with an unbounded interval and `iterations: 0`.
+ *
+ * Changes from the previous version: percentile indices are clamped into the valid
+ * range (tiny `iterations` used to give an `undefined` upper bound), a fractional
+ * `iterations` is floored instead of throwing, and the seed is only hashed when
+ * resampling actually happens.
+ *
+ * @param {number[]} baseline - Scores of the current variant.
+ * @param {number[]} candidate - Scores of the proposed variant.
+ * @param {object} [options] - `alpha` (0.05), `iterations` (1000), `minTotalSamples` (6), `seed`.
+ * @returns {object} `{ baselineMean, candidateMean, delta, ciLower, ciUpper, iterations, alpha, verdict }`.
+ */
+function bootstrapCi(baseline, candidate, options = {}) {
+  const alpha = options.alpha ?? 0.05;
+  const iterations = Math.floor(options.iterations ?? 1e3);
+  const minTotal = options.minTotalSamples ?? 6;
+  const baselineMean = mean6(baseline);
+  const candidateMean = mean6(candidate);
+  const delta = candidateMean - baselineMean;
+  const tooFewSamples = baseline.length + candidate.length < minTotal || baseline.length === 0 || candidate.length === 0;
+  // `!(x >= 1)` also rejects NaN.
+  if (tooFewSamples || !(iterations >= 1)) {
+    return {
+      baselineMean,
+      candidateMean,
+      delta,
+      ciLower: -Infinity,
+      ciUpper: Infinity,
+      iterations: 0,
+      alpha,
+      verdict: "INCONCLUSIVE"
+    };
+  }
+  // Seed defaults to a hash of the data so repeated calls are reproducible.
+  const rng = mulberry32(options.seed ?? hashSeed(baseline, candidate));
+  const deltas = new Array(iterations);
+  for (let i = 0; i < iterations; i++) {
+    const bResample = resample(baseline, rng);
+    const cResample = resample(candidate, rng);
+    deltas[i] = mean6(cResample) - mean6(bResample);
+  }
+  deltas.sort((a, b) => a - b);
+  const clampIndex = (index) => Math.min(iterations - 1, Math.max(0, index));
+  const ciLower = deltas[clampIndex(Math.floor(alpha / 2 * iterations))];
+  const ciUpper = deltas[clampIndex(Math.floor((1 - alpha / 2) * iterations) - 1)];
+  let verdict;
+  if (ciLower > 0) verdict = "ADVANCE";
+  else if (ciUpper < 0) verdict = "REVERT";
+  else if (delta >= 0) verdict = "KEEP";
+  else verdict = "INCONCLUSIVE";
+  return {
+    baselineMean,
+    candidateMean,
+    delta,
+    ciLower,
+    ciUpper,
+    iterations,
+    alpha,
+    verdict
+  };
+}
+/**
+ * Arithmetic mean of `xs`; an empty list yields 0 rather than NaN.
+ */
+function mean6(xs) {
+  const count = xs.length;
+  if (count === 0) return 0;
+  let total = 0;
+  let index = 0;
+  while (index < count) {
+    total += xs[index];
+    index += 1;
+  }
+  return total / count;
+}
+/**
+ * Draws `xs.length` values from `xs` with replacement, calling `rng()` once per draw.
+ * `rng` is expected to return numbers in [0, 1).
+ */
+function resample(xs, rng) {
+  const size = xs.length;
+  return Array.from({ length: size }, () => xs[Math.floor(rng() * size)]);
+}
+/**
+ * Creates a small seeded pseudo-random generator returning numbers in [0, 1).
+ *
+ * The state is kept as an unsigned 32-bit integer. The previous version let the
+ * state grow as an unbounded double, which starts losing low bits once it passes
+ * 2^53 (after a few million draws). All mixing steps only ever looked at the low
+ * 32 bits, so the output sequence is unchanged up to that point.
+ *
+ * @param {number} seed - Any number; coerced to an unsigned 32-bit integer.
+ * @returns {() => number} Generator function.
+ */
+function mulberry32(seed) {
+  let t = seed >>> 0;
+  return () => {
+    // Wrap to 32 bits so the addition stays exact forever.
+    t = (t + 1831565813) >>> 0;
+    let r = t;
+    r = Math.imul(r ^ r >>> 15, r | 1);
+    r ^= r + Math.imul(r ^ r >>> 7, r | 61);
+    return ((r ^ r >>> 14) >>> 0) / 4294967296;
+  };
+}
+/**
+ * Derives an unsigned 32-bit seed from the values of `a` followed by `b`.
+ *
+ * Every value is written as a 64-bit float and its bytes are folded into the hash
+ * with an xor-then-multiply step; the starting value is 2166136261 and the
+ * multiplier 16777619.
+ *
+ * @param {Iterable<number>} a - First sample.
+ * @param {Iterable<number>} b - Second sample.
+ * @returns {number} Unsigned 32-bit hash.
+ */
+function hashSeed(a, b) {
+  // One reusable 8-byte cell viewed both as a float and as raw bytes.
+  const cell = new Float64Array(1);
+  const cellBytes = new Uint8Array(cell.buffer);
+  let hash = 2166136261;
+  for (const value of [...a, ...b]) {
+    cell[0] = value;
+    for (let i = 0; i < cellBytes.length; i++) {
+      hash ^= cellBytes[i];
+      hash = Math.imul(hash, 16777619);
+    }
+  }
+  return hash >>> 0;
+}
+/**
+ * Scores baseline and candidate outputs with `args.judge` and runs the bootstrap
+ * gate on the two score lists.
+ *
+ * The baseline outputs are scored to completion before the candidate outputs start.
+ * `alpha`, `iterations` and `seed` are forwarded to `bootstrapCi` only when set.
+ *
+ * @param {object} args - `baselineOutputs`, `candidateOutputs`, `judge`, optional `judgeConcurrency` (default 4), `alpha`, `iterations`, `seed`.
+ * @returns {Promise<object>} The `bootstrapCi` result plus `baselineSamples` and `candidateSamples`.
+ */
+async function judgeReplayGate(args) {
+  const workers = args.judgeConcurrency ?? 4;
+  const baselineScores = await scoreAll(args.baselineOutputs, args.judge, workers);
+  const candidateScores = await scoreAll(args.candidateOutputs, args.judge, workers);
+  const overrides = {};
+  if (args.alpha !== void 0) overrides.alpha = args.alpha;
+  if (args.iterations !== void 0) overrides.iterations = args.iterations;
+  if (args.seed !== void 0) overrides.seed = args.seed;
+  const ci = bootstrapCi(baselineScores, candidateScores, overrides);
+  return {
+    ...ci,
+    baselineSamples: baselineScores.length,
+    candidateSamples: candidateScores.length
+  };
+}
+/**
+ * Applies `judge` to every output with at most `concurrency` calls in flight and
+ * returns the scores in input order. A non-finite judge result is recorded as 0.
+ *
+ * The worker count is normalised before use: a missing or non-numeric value falls
+ * back to one worker, `Infinity` means one worker per output, and the count never
+ * exceeds `outputs.length`. Previously `NaN` started zero workers (an array of holes
+ * was returned) and `Infinity` tried to allocate an astronomically large worker list.
+ *
+ * @param {Array<unknown>} outputs - Items to judge.
+ * @param {(output: unknown) => number | Promise<number>} judge - Scoring function.
+ * @param {number} concurrency - Maximum number of judge calls in flight; minimum 1.
+ * @returns {Promise<number[]>} One finite score per output.
+ */
+async function scoreAll(outputs, judge, concurrency) {
+  const results = new Array(outputs.length);
+  const numeric = Number(concurrency);
+  let requested = 1;
+  if (numeric === Infinity) requested = outputs.length;
+  else if (Number.isFinite(numeric)) requested = Math.floor(numeric);
+  const workerCount = Math.min(outputs.length, Math.max(1, requested));
+  // Shared cursor: each worker claims the next unclaimed index synchronously.
+  let next = 0;
+  async function worker() {
+    while (next < outputs.length) {
+      const i = next++;
+      const v = await judge(outputs[i]);
+      results[i] = Number.isFinite(v) ? v : 0;
+    }
+  }
+  await Promise.all(Array.from({ length: workerCount }, () => worker()));
+  return results;
+}
+// src/reflective-mutation.ts
+// Default list of mutation instructions. `buildReflectionPrompt` renders these as
+// bullets under "Allowed mutation primitives" when `ctx.mutationPrimitives` is not set.
+var DEFAULT_MUTATION_PRIMITIVES = [
+  'Strengthen an imperative ("should" \u2192 "must")',
+  "Add a concrete example pulled from a missed-golden phrase",
+  "Remove a redundant rule that did not improve recall",
+  'Add a counterfactual ("if X is missing, the score is capped at Y")',
+  "Reorder sections so the highest-impact rule is first",
+  "Replace abstract language with a domain-specific noun the trial misses"
+];
+/**
+ * Builds the markdown prompt that asks a model to propose mutations of a prompt
+ * component, using the parent's best and worst trials as evidence.
+ *
+ * Sections, in order: mutation target and instructions, the current variant as JSON,
+ * failures (bottom trials, omitted when there are none), successes (top trials,
+ * omitted when there are none), the allowed mutation primitives, and the expected
+ * output schema.
+ *
+ * @param {object} ctx - `target`, `parentPayload`, `topTrials`, `bottomTrials`, `childCount`, optional `mutationPrimitives`.
+ * @returns {string} The prompt, lines joined with "\n".
+ */
+function buildReflectionPrompt(ctx) {
+  const primitives = ctx.mutationPrimitives ?? DEFAULT_MUTATION_PRIMITIVES;
+  const sections = [];
+  sections.push(`# Mutation target: ${ctx.target}`);
+  sections.push("");
+  sections.push(`You are tuning the prompt component named \`${ctx.target}\`. The current variant is shown below; you have ${ctx.topTrials.length} top trials and ${ctx.bottomTrials.length} bottom trials as evidence. Propose ${ctx.childCount} mutation${ctx.childCount === 1 ? "" : "s"} that fix specific weaknesses visible in the bottom trials. Avoid blank rephrasings.`);
+  sections.push("");
+  sections.push("## Current variant");
+  sections.push("```json");
+  sections.push(JSON.stringify(ctx.parentPayload, null, 2));
+  sections.push("```");
+  sections.push("");
+  if (ctx.bottomTrials.length > 0) {
+    sections.push("## Failures (bottom trials) \u2014 what went wrong");
+    sections.push("");
+    for (const trial of ctx.bottomTrials) {
+      sections.push(`### Trial \`${trial.id}\` \u2014 score ${trial.score.toFixed(2)}${trial.inputName ? ` (${trial.inputName})` : ""}`);
+      // Only expectations that did not match are listed.
+      const missed = (trial.expectations ?? []).filter((e) => !e.matched);
+      if (missed.length > 0) {
+        sections.push("");
+        sections.push("**Missed expectations:**");
+        for (const m of missed) {
+          // `quote` escapes backticks so the phrase cannot end the inline code span.
+          sections.push(`- \`${m.id}\`: should match phrase \`${quote(m.phrase)}\``);
+        }
+      }
+      if (trial.emitted) {
+        sections.push("");
+        sections.push("**What the agent emitted:**");
+        sections.push("```");
+        // Emitted text is capped at 600 characters to keep the prompt bounded.
+        sections.push(truncate3(trial.emitted, 600));
+        sections.push("```");
+      }
+      sections.push("");
+    }
+  }
+  if (ctx.topTrials.length > 0) {
+    sections.push("## Successes (top trials) \u2014 what to preserve");
+    sections.push("");
+    for (const trial of ctx.topTrials) {
+      sections.push(`- \`${trial.id}\`: score ${trial.score.toFixed(2)}${trial.inputName ? ` (${trial.inputName})` : ""}`);
+    }
+    sections.push("");
+  }
+  sections.push("## Allowed mutation primitives");
+  sections.push("");
+  for (const p of primitives) sections.push(`- ${p}`);
+  sections.push("");
+  sections.push("## Output schema");
+  sections.push("");
+  // NOTE(review): the instruction says "no markdown fences" while the example that
+  // follows is itself fenced; confirm this is the intended wording.
+  sections.push("Respond with a JSON object \u2014 no prose, no markdown fences:");
+  sections.push("```json");
+  sections.push(JSON.stringify(
+    {
+      proposals: [
+        {
+          label: "<short label, \u2264 40 chars>",
+          rationale: "<which failure this targets and which primitive you used>",
+          payload: "<full payload of the new variant \u2014 same shape as the current variant>"
+        }
+      ]
+    },
+    null,
+    2
+  ));
+  sections.push("```");
+  return sections.join("\n");
+}
+/**
+ * Shortens `s` to at most `max` UTF-16 code units and appends a truncation marker.
+ * Strings that already fit are returned unchanged.
+ *
+ * If the cut would fall between the two halves of a surrogate pair, the dangling
+ * high surrogate is dropped as well so the result never contains a lone surrogate.
+ *
+ * @param {string} s - Text to shorten.
+ * @param {number} max - Maximum number of code units kept from `s`.
+ * @returns {string} `s`, or its prefix followed by "… [truncated]".
+ */
+function truncate3(s, max) {
+  if (s.length <= max) return s;
+  let end = max;
+  const lastKept = s.charCodeAt(end - 1);
+  // 0xD800-0xDBFF is the high-surrogate range; its partner would be cut off.
+  if (lastKept >= 0xd800 && lastKept <= 0xdbff) end -= 1;
+  return `${s.slice(0, end)}\u2026 [truncated]`;
+}
+/**
+ * Escapes every backtick in `s` with a backslash so the text can sit inside a
+ * markdown inline-code span.
+ */
+function quote(s) {
+  const pieces = s.split("`");
+  return pieces.join("\\`");
+}
+/**
+ * Extracts mutation proposals from a model response.
+ *
+ * A leading/trailing markdown fence is stripped, then the text between the first
+ * "{" and the last "}" is parsed as JSON. Entries of `proposals` that are not objects
+ * or lack a `payload` key are skipped; a missing `label` becomes "mutation" and a
+ * missing `rationale` becomes "". Any parse or shape problem yields an empty list.
+ *
+ * The limit is now checked before a proposal is added, so `maxProposals` of 0 (or a
+ * negative value) returns an empty list instead of one proposal.
+ *
+ * @param {string} raw - Raw model output.
+ * @param {number} [maxProposals] - Upper bound on returned proposals; unlimited when omitted.
+ * @returns {Array<{label: string, rationale: string, payload: unknown}>} Parsed proposals.
+ */
+function parseReflectionResponse(raw, maxProposals) {
+  let text = raw.trim();
+  if (text.startsWith("```")) text = text.replace(/^```(?:json)?\n?/, "").replace(/\n?```$/, "");
+  // Tolerate prose around the object by slicing from the first "{" to the last "}".
+  const start = text.indexOf("{");
+  const end = text.lastIndexOf("}");
+  if (start < 0 || end <= start) return [];
+  let parsed;
+  try {
+    parsed = JSON.parse(text.slice(start, end + 1));
+  } catch {
+    return [];
+  }
+  if (!parsed || typeof parsed !== "object") return [];
+  const proposalsRaw = parsed.proposals;
+  if (!Array.isArray(proposalsRaw)) return [];
+  const out = [];
+  for (const p of proposalsRaw) {
+    if (maxProposals !== void 0 && out.length >= maxProposals) break;
+    if (!p || typeof p !== "object") continue;
+    if (!("payload" in p)) continue;
+    out.push({
+      label: typeof p.label === "string" ? p.label : "mutation",
+      rationale: typeof p.rationale === "string" ? p.rationale : "",
+      payload: p.payload
+    });
+  }
+  return out;
+}
 export {
   AgentDriver,
   AxGepaSteeringOptimizer,
@@ -9985,21 +10635,25 @@ export {
   BuilderSession,
   ConvergenceTracker,
   CostTracker,
+  D1ExperimentStore,
   DEFAULT_AGENT_SLOS,
   DEFAULT_COMPLEXITY_WEIGHTS,
   DEFAULT_RULES as DEFAULT_FAILURE_RULES,
   DEFAULT_FINDERS,
   DEFAULT_HARNESS_OBJECTIVES,
+  DEFAULT_MUTATION_PRIMITIVES,
   DEFAULT_MUTATORS,
   DEFAULT_REDACTION_RULES,
   DEFAULT_RED_TEAM_CORPUS,
   DEFAULT_RUN_SCORE_WEIGHTS,
+  DEFAULT_SEVERITY_WEIGHTS,
   Dataset,
   DockerSandboxDriver,
   DualAgentBench,
   ERROR_COUNT_PATTERNS,
   ExperimentTracker,
   FAILURE_CLASSES,
+  FileSystemExperimentStore,
   FileSystemOutcomeStore,
   FileSystemTraceStore,
   HoldoutAuditor,
@@ -10008,6 +10662,7 @@ export {
   InMemoryExperimentStore,
   InMemoryOutcomeStore,
   InMemoryTraceStore,
+  InMemoryTrialCache,
   InMemoryWorkspaceInspector,
   JudgeRunner,
   LlmCallError,
@@ -10043,7 +10698,9 @@ export {
   benjaminiHochberg,
   bisect,
   bonferroni,
+  bootstrapCi,
   budgetBreachView,
+  buildReflectionPrompt,
   buildReviewerPrompt,
   buildTrajectory,
   byteLengthRange,
@@ -10081,6 +10738,7 @@ export {
   createLlmReviewer,
   createSemanticConceptJudge,
   crossTraceDiff,
+  crowdingDistance,
   decideReferenceReplayPromotion,
   decideReferenceReplayRunPromotion,
   defaultJudges,
@@ -10114,6 +10772,7 @@ export {
   formatBenchmarkReport,
   formatDriverReport,
   formatFindings,
+  precision as goldenPrecision,
   gradeSemanticStatus,
   groupBy,
   hashContent,
@@ -10135,6 +10794,7 @@ export {
   jsonlReferenceReplayStore,
   jsonlReviewStore,
   judgeAgreementView,
+  judgeReplayGate,
   judgeSpans,
   keyPreserved,
   linterJudge,
@@ -10144,6 +10804,7 @@ export {
   localCommandRunner,
   lowercaseMutator,
   mannWhitneyU,
+  matchGoldens,
   mergeLayerResults,
   mergeSteeringBundle,
   multiToolchainLayer,
@@ -10155,7 +10816,10 @@ export {
   pairedTTest,
   paraphraseRobustness,
   paretoFrontier,
+  paretoFrontierWithCrowding,
+  parseReflectionResponse,
   partialCredit,
+  passOrthogonality,
   pixelDeltaRatio,
   politenessPrefixMutator,
   positionalBias,
@@ -10195,12 +10859,14 @@ export {
   runJudgeFleet,
   runKeywordCoverageJudge,
   runKeywordCoverageJudgeUrl,
+  runPromptEvolution,
   runProposeReview,
   runReferenceReplay,
   runSelfPlay,
   runSemanticConceptJudge,
   runTestGradedScenario,
   runsForScenario,
+  scalarScore,
   scanForMuffledGates,
   scoreAllProjects,
   scoreContinuity,
@@ -10237,6 +10903,7 @@ export {
   viteDeployRunner,
   vitestTestParser,
   weightedMean,
+  weightedRecall,
   welchsTTest,
   whitespaceCollapseMutator,
   wilcoxonSignedRank