npm - @tangle-network/agent-eval - Versions diffs - 0.61.0 → 0.63.0 - Mend

@tangle-network/agent-eval 0.61.0 → 0.63.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47)

package/CHANGELOG.md +48 -8
package/dist/adapters/http.d.ts +4 -1
package/dist/adapters/langchain.d.ts +4 -1
package/dist/adapters/otel.d.ts +4 -4
package/dist/{agent-profile-9J9hxdm2.d.ts → agent-profile-DzcPHR1Z.d.ts} +1 -1
package/dist/benchmarks/index.d.ts +2 -2
package/dist/campaign/index.d.ts +388 -11
package/dist/campaign/index.js +597 -12
package/dist/campaign/index.js.map +1 -1
package/dist/{chunk-GMXHLSLL.js → chunk-4ODZXQV2.js} +81 -98
package/dist/chunk-4ODZXQV2.js.map +1 -0
package/dist/{chunk-OLULBECP.js → chunk-7TPYV2ER.js} +27 -1
package/dist/chunk-7TPYV2ER.js.map +1 -0
package/dist/chunk-E22YUOAL.js +111 -0
package/dist/chunk-E22YUOAL.js.map +1 -0
package/dist/{chunk-SUGME4OT.js → chunk-Z7ZU7IYZ.js} +209 -85
package/dist/chunk-Z7ZU7IYZ.js.map +1 -0
package/dist/contract/index.d.ts +9 -9
package/dist/contract/index.js +4 -3
package/dist/contract/index.js.map +1 -1
package/dist/{control-Bf8owbuG.d.ts → control-DxvZeV5X.d.ts} +1 -1
package/dist/control.d.ts +2 -2
package/dist/hosted/index.d.ts +4 -4
package/dist/{index-Bvk35ils.d.ts → index-DsnOpCO6.d.ts} +1 -1
package/dist/{index-D9dwa00f.d.ts → index-GISRh500.d.ts} +2 -2
package/dist/index.d.ts +98 -14
package/dist/index.js +331 -128
package/dist/index.js.map +1 -1
package/dist/meta-eval/index.d.ts +2 -2
package/dist/multishot/index.js.map +1 -1
package/dist/openapi.json +1 -1
package/dist/{provenance-D0WeCXt1.d.ts → provenance-cUnovpWV.d.ts} +42 -11
package/dist/{registry-qmbYT3Eo.d.ts → registry-DPly4_hZ.d.ts} +1 -1
package/dist/{release-report-DszkgvJ3.d.ts → release-report-DGoeObZT.d.ts} +2 -2
package/dist/reporting.d.ts +4 -4
package/dist/{researcher-BaVsy0sW.d.ts → researcher-WJvIpX3L.d.ts} +2 -2
package/dist/rl.d.ts +6 -6
package/dist/{rubric-predictive-validity-DgBHWsh7.d.ts → rubric-predictive-validity-D_4BSXGV.d.ts} +1 -1
package/dist/{run-campaign-HXPJAUZ3.js → run-campaign-5J3ED2UJ.js} +3 -2
package/dist/{run-record-DgUVo5pw.d.ts → run-record-BgTFzO2r.d.ts} +1 -1
package/dist/{summary-report-BQvXpvaR.d.ts → summary-report-ByiOUrHj.d.ts} +1 -1
package/dist/{types-Beb6KPqZ.d.ts → types-c2R2kfmv.d.ts} +45 -12
package/package.json +1 -1
package/dist/chunk-GMXHLSLL.js.map +0 -1
package/dist/chunk-OLULBECP.js.map +0 -1
package/dist/chunk-SUGME4OT.js.map +0 -1
package/dist/{run-campaign-HXPJAUZ3.js.map → run-campaign-5J3ED2UJ.js.map} +0 -0

package/dist/campaign/index.js CHANGED Viewed

@@ -1,5 +1,7 @@
 import {
   buildLoopProvenanceRecord,
+  campaignBreakdown,
+  campaignMeanComposite,
   composeGate,
   countSentenceEdits,
   defaultProductionGate,
@@ -20,33 +22,271 @@ import {
   runOptimization,
   surfaceContentHash,
   surfaceHash
-} from "../chunk-SUGME4OT.js";
+} from "../chunk-Z7ZU7IYZ.js";
 import {
   fsCampaignStorage,
   inMemoryCampaignStorage,
   runCampaign
-} from "../chunk-OLULBECP.js";
+} from "../chunk-7TPYV2ER.js";
 import {
   agentProfileHash
 } from "../chunk-PQV2TKC3.js";
+import "../chunk-4ODZXQV2.js";
 import {
   assertRealBackend,
   summarizeBackendIntegrity
-} from "../chunk-GMXHLSLL.js";
+} from "../chunk-E22YUOAL.js";
 import "../chunk-YV7J7X5N.js";
 import {
   validateRunRecord
 } from "../chunk-F3SRAAZO.js";
-import "../chunk-ITBRCT73.js";
+import {
+  pairedBootstrap
+} from "../chunk-ITBRCT73.js";
 import "../chunk-GGE4NNQT.js";
 import "../chunk-VSMTAMNK.js";
-import "../chunk-IHDHUN2X.js";
+import {
+  callLlm
+} from "../chunk-IHDHUN2X.js";
 import "../chunk-PC4UYEBM.js";
 import {
   AgentEvalError
 } from "../chunk-3BFEG2F6.js";
 import "../chunk-PZ5AY32C.js";
+// src/campaign/skill-patch.ts
+/**
+ * Apply a bundle of anchored, line-level edits to a skill document.
+ *
+ * Ops run in order against the evolving document, so a later op sees the
+ * result of earlier ones. An anchor matches the FIRST line that contains it
+ * as a substring. Ops that cannot be applied are collected in `rejected`
+ * (with a human-readable reason) instead of throwing.
+ *
+ * @param {string} surface - Document text; split and re-joined on newlines.
+ * @param {{ops: Array<object>}} patch - Ops of kind "add" ({after?, text}),
+ *   "delete" ({anchor}) or "replace" ({anchor, text}).
+ * @returns {{surface: string, applied: number, rejected: Array<{op: object, reason: string}>}}
+ *   The edited text, the number of ops applied, and the ops that were skipped.
+ */
+function applySkillPatch(surface, patch) {
+  let lines = surface.split("\n");
+  let applied = 0;
+  const rejected = [];
+  const findLine = (anchor) => lines.findIndex((l) => l.includes(anchor));
+  // "" is a substring of every line, so an empty anchor would silently hit
+  // line 0, and a non-string anchor would be coerced by includes(). Both are
+  // rejected rather than applied to an arbitrary line.
+  const isUsableAnchor = (anchor) => typeof anchor === "string" && anchor !== "";
+  for (const op of patch.ops) {
+    if (op.op === "add") {
+      if (typeof op.text !== "string" || op.text.trim() === "") {
+        rejected.push({ op, reason: "empty add text" });
+        continue;
+      }
+      const insert = op.text.split("\n");
+      // No anchor (or an empty one) means "append to the end of the document".
+      if (op.after === void 0 || op.after === "") {
+        lines = [...lines, ...insert];
+        applied++;
+        continue;
+      }
+      if (typeof op.after !== "string") {
+        rejected.push({ op, reason: "add anchor is not a string" });
+        continue;
+      }
+      const idx = findLine(op.after);
+      if (idx === -1) {
+        rejected.push({ op, reason: `add anchor not found: ${truncate(op.after)}` });
+        continue;
+      }
+      // Insert directly below the anchored line.
+      lines = [...lines.slice(0, idx + 1), ...insert, ...lines.slice(idx + 1)];
+      applied++;
+    } else if (op.op === "delete") {
+      if (!isUsableAnchor(op.anchor)) {
+        rejected.push({ op, reason: "delete anchor missing or empty" });
+        continue;
+      }
+      const idx = findLine(op.anchor);
+      if (idx === -1) {
+        rejected.push({ op, reason: `delete anchor not found: ${truncate(op.anchor)}` });
+        continue;
+      }
+      lines = [...lines.slice(0, idx), ...lines.slice(idx + 1)];
+      applied++;
+    } else if (op.op === "replace") {
+      if (!isUsableAnchor(op.anchor)) {
+        rejected.push({ op, reason: "replace anchor missing or empty" });
+        continue;
+      }
+      const idx = findLine(op.anchor);
+      if (idx === -1) {
+        rejected.push({ op, reason: `replace anchor not found: ${truncate(op.anchor)}` });
+        continue;
+      }
+      if (typeof op.text !== "string") {
+        rejected.push({ op, reason: "replace text missing" });
+        continue;
+      }
+      // The anchored line is swapped for one or more replacement lines.
+      lines = [...lines.slice(0, idx), ...op.text.split("\n"), ...lines.slice(idx + 1)];
+      applied++;
+    } else {
+      // An unrecognised kind must not fall through and act as a replace.
+      rejected.push({ op, reason: `unknown op: ${String(op.op)}` });
+    }
+  }
+  return { surface: lines.join("\n"), applied, rejected };
+}
+/** Count the ops bundled in a patch; each op counts as one edit. */
+function patchEditCount(patch) {
+  const { ops } = patch;
+  return ops.length;
+}
+/**
+ * Shorten `s` to at most `max` UTF-16 code units, appending an ellipsis when
+ * anything was cut. Strings already within the limit are returned unchanged.
+ */
+function truncate(s, max = 48) {
+  if (s.length > max) {
+    const head = s.slice(0, max);
+    return `${head}\u2026`;
+  }
+  return s;
+}
+// src/campaign/drivers/skill-opt.ts
+// System prompt for patch proposals. It restricts the model to a JSON object
+// {"patches":[{label, rationale, ops}]} whose ops are "add" / "delete" /
+// "replace" edits anchored on verbatim substrings of existing lines.
+var SKILLOPT_SYSTEM = 'You are a SkillOpt optimizer. You improve ONE skill document by proposing BOUNDED, anchored edits \u2014 never a full rewrite. Output ONLY a JSON object of shape {"patches":[{"label":string,"rationale":string,"ops":[op,...]}]} where each op is one of: {"op":"add","after":<exact substring of an existing line, or omit to append>,"text":<new line(s)>}, {"op":"delete","anchor":<exact substring of the line to remove>}, {"op":"replace","anchor":<exact substring of the line to replace>,"text":<replacement line(s)>}. Anchors MUST be verbatim substrings of lines that exist in the document. No prose outside JSON.';
+/**
+ * Create the "skill-opt" driver: it asks an LLM for small anchored patches to
+ * a text skill document and turns the usable ones into candidate surfaces.
+ *
+ * @param {object} opts - Reads `llm`, `model`, `target`, plus the optional
+ *   `evidenceK` (default 3), `editBudget` (default 3), `temperature`
+ *   (default 0.6) and `maxTokens` (default 4000).
+ * @returns {{kind: string, proposePatches: Function, propose: Function}}
+ *   `proposePatches(args)` resolves to parsed patches; `propose(ctx)` resolves
+ *   to up to `ctx.populationSize` distinct `{surface, label, rationale}` items.
+ */
+function skillOptDriver(opts) {
+  const evidenceK = opts.evidenceK ?? 3;
+  const defaultBudget = opts.editBudget ?? 3;
+  // One LLM round-trip: render the prompt, call the model in JSON mode, parse
+  // the reply. NOTE(review): `args.signal` is supplied by callers but is not
+  // read here, so cancellation does not reach the request from this function.
+  const proposePatches = async (args) => {
+    const { surface, evidence, editBudget, rejectedBuffer, metaNote, count } = args;
+    const userPrompt = buildPatchPrompt({
+      target: opts.target,
+      surface,
+      evidence,
+      editBudget,
+      rejectedBuffer,
+      metaNote,
+      count
+    });
+    const request = {
+      model: opts.model,
+      messages: [
+        { role: "system", content: SKILLOPT_SYSTEM },
+        { role: "user", content: userPrompt }
+      ],
+      jsonMode: true,
+      temperature: opts.temperature ?? 0.6,
+      maxTokens: opts.maxTokens ?? 4e3
+    };
+    const reply = await callLlm(request, opts.llm);
+    return parseSkillPatchResponse(reply.content, count, editBudget);
+  };
+  return {
+    kind: "skill-opt",
+    proposePatches,
+    async propose(ctx) {
+      // Patches are line edits on text, so a non-string surface is refused.
+      if (typeof ctx.currentSurface !== "string") {
+        throw new Error(
+          "skillOptDriver: surface must be a string skill document (got a CodeSurface). SkillOpt patches text."
+        );
+      }
+      const base = ctx.currentSurface;
+      const evidence = evidenceFromHistory(ctx, evidenceK);
+      const patches = await proposePatches({
+        surface: base,
+        evidence,
+        editBudget: defaultBudget,
+        rejectedBuffer: [],
+        count: ctx.populationSize,
+        signal: ctx.signal
+      });
+      const candidates = [];
+      const seen = new Set();
+      for (const patch of patches) {
+        const result = applySkillPatch(base, patch);
+        // Keep only patches that changed the text and yielded a new variant.
+        const isNew = result.applied !== 0 && result.surface !== base && !seen.has(result.surface);
+        if (!isNew) continue;
+        seen.add(result.surface);
+        candidates.push({ surface: result.surface, label: patch.label, rationale: patch.rationale });
+        if (candidates.length >= ctx.populationSize) break;
+      }
+      return candidates;
+    }
+  };
+}
+/**
+ * Derive proposal evidence from the most recent history entry: take its
+ * highest-composite candidate and report that candidate's `k` lowest-scoring
+ * scenarios and `k` lowest-scoring dimensions. Returns empty lists when there
+ * is no history or the last entry has no candidates.
+ */
+function evidenceFromHistory(ctx, k) {
+  const noEvidence = () => ({ weakScenarios: [], weakDimensions: [] });
+  const latest = ctx.history.at(-1);
+  if (!latest || latest.candidates.length === 0) {
+    return noEvidence();
+  }
+  const ranked = Array.from(latest.candidates);
+  ranked.sort((a, b) => b.composite - a.composite);
+  const [best] = ranked;
+  if (!best) {
+    return noEvidence();
+  }
+  const scenariosAscending = Array.from(best.scenarios);
+  scenariosAscending.sort((a, b) => a.composite - b.composite);
+  const dimensionsAscending = Object.entries(best.dimensions);
+  dimensionsAscending.sort(([, x], [, y]) => x - y);
+  return {
+    weakScenarios: scenariosAscending.slice(0, k),
+    weakDimensions: dimensionsAscending.slice(0, k).map(([dimension, score]) => ({ dimension, score }))
+  };
+}
+/**
+ * Render the user prompt for one patch-proposal request.
+ *
+ * The fixed part names the target, embeds the current document in a fenced
+ * block and states the patch count and per-patch op budget. The sections for
+ * weak scenarios, weak dimensions, previously rejected edits and the strategy
+ * note are appended only when they have content.
+ *
+ * @param {object} args - `target`, `surface`, `count`, `editBudget`,
+ *   `evidence` ({weakScenarios, weakDimensions}), `rejectedBuffer`, `metaNote`.
+ * @returns {string} Prompt lines joined with newlines.
+ */
+function buildPatchPrompt(args) {
+  const { weakScenarios, weakDimensions } = args.evidence;
+  const prompt = [];
+  prompt.push(`Skill document governs: ${args.target}.`);
+  prompt.push("", "Current skill document:", "```", args.surface, "```", "");
+  prompt.push(`Propose ${args.count} candidate patch(es). Each patch is a SMALL bundle of`);
+  prompt.push(`at most ${args.editBudget} op(s). Anchors must be verbatim substrings of`);
+  prompt.push("existing lines. Prefer adding a specific missing rule or sharpening a vague");
+  prompt.push("one over deleting; never rewrite the whole document.");
+  if (weakScenarios.length > 0) {
+    prompt.push("", "Weakest scenarios (patch to fix these):");
+    for (const s of weakScenarios) {
+      prompt.push(`- ${s.scenarioId} (${s.composite.toFixed(2)})`);
+    }
+  }
+  if (weakDimensions.length > 0) {
+    prompt.push("", "Weakest dimensions (what to improve):");
+    for (const d of weakDimensions) {
+      prompt.push(`- ${d.dimension} (${d.score.toFixed(2)})`);
+    }
+  }
+  if (args.rejectedBuffer.length > 0) {
+    prompt.push("", "Already tried and REJECTED (do not repeat or restate these edits):");
+    for (const e of args.rejectedBuffer) {
+      prompt.push(`- ${e.label}: ${e.rationale} \u2014 ${e.reason}`);
+    }
+  }
+  if (args.metaNote) {
+    prompt.push("", `Strategy note from prior epochs: ${args.metaNote}`);
+  }
+  return prompt.join("\n");
+}
+/** Error raised when a model reply cannot be read as a patch JSON object. */
+var SkillPatchParseError = class extends Error {
+  // Own instance property, so the error reports its specific type by name.
+  name = "SkillPatchParseError";
+  constructor(message) {
+    super(message);
+  }
+};
+/**
+ * Parse a model reply into a list of patches.
+ *
+ * An optional Markdown code fence is stripped, then the text between the
+ * first "{" and the last "}" is parsed as JSON. Entries of `patches` that are
+ * not objects, or that have no valid ops, are dropped. Each kept patch is
+ * capped at `editBudget` ops and collection stops at `maxPatches` patches.
+ *
+ * @param {string} raw - Raw model output.
+ * @param {number} maxPatches - Stop once this many patches are collected.
+ * @param {number} editBudget - Maximum ops kept per patch.
+ * @returns {Array<{label: string, rationale: string, ops: Array<object>}>}
+ * @throws {SkillPatchParseError} When no JSON object is found or it fails to parse.
+ */
+function parseSkillPatchResponse(raw, maxPatches, editBudget) {
+  let body = raw.trim();
+  if (body.startsWith("```")) {
+    body = body.replace(/^```(?:json)?\n?/, "").replace(/\n?```$/, "");
+  }
+  const open = body.indexOf("{");
+  const close = body.lastIndexOf("}");
+  if (open < 0 || close <= open) {
+    throw new SkillPatchParseError(
+      `parseSkillPatchResponse: response was not valid JSON (no object found): ${snippet(raw)}`
+    );
+  }
+  let parsed;
+  try {
+    parsed = JSON.parse(body.slice(open, close + 1));
+  } catch (err) {
+    const detail = err instanceof Error ? err.message : String(err);
+    throw new SkillPatchParseError(
+      `parseSkillPatchResponse: response was not valid JSON (${detail}): ${snippet(raw)}`
+    );
+  }
+  const entries = Array.isArray(parsed.patches) ? parsed.patches : [];
+  const patches = [];
+  for (const entry of entries) {
+    if (entry === null || typeof entry !== "object") continue;
+    let ops = [];
+    if (Array.isArray(entry.ops)) {
+      ops = entry.ops.map(normalizeOp).filter(isOp);
+    }
+    // A patch whose ops were all invalid carries no edit and is dropped.
+    if (ops.length === 0) continue;
+    const label = typeof entry.label === "string" ? entry.label : "patch";
+    const rationale = typeof entry.rationale === "string" ? entry.rationale : "";
+    patches.push({ label, rationale, ops: ops.slice(0, editBudget) });
+    if (patches.length >= maxPatches) break;
+  }
+  return patches;
+}
+/**
+ * Validate one raw op from a model reply and copy it into a clean object that
+ * carries only the known fields. Returns null for anything that is not a
+ * well-formed "add", "delete" or "replace" op.
+ */
+function normalizeOp(raw) {
+  if (raw === null || typeof raw !== "object") return null;
+  const isText = (value) => typeof value === "string";
+  switch (raw.op) {
+    case "add": {
+      if (!isText(raw.text)) return null;
+      // `after` is optional; a non-string value is ignored rather than rejected.
+      return isText(raw.after)
+        ? { op: "add", text: raw.text, after: raw.after }
+        : { op: "add", text: raw.text };
+    }
+    case "delete":
+      return isText(raw.anchor) ? { op: "delete", anchor: raw.anchor } : null;
+    case "replace":
+      return isText(raw.anchor) && isText(raw.text)
+        ? { op: "replace", anchor: raw.anchor, text: raw.text }
+        : null;
+    default:
+      return null;
+  }
+}
+/** Filter predicate: true for every value except null. */
+function isOp(op) {
+  const rejected = op === null;
+  return !rejected;
+}
+/**
+ * One-line preview of `s` for error messages: trim it, collapse every run of
+ * whitespace to a single space, and cut to `max` UTF-16 code units with a
+ * trailing ellipsis when it is longer.
+ */
+function snippet(s, max = 120) {
+  const collapsed = s.trim().replace(/\s+/g, " ");
+  if (collapsed.length > max) {
+    return `${collapsed.slice(0, max)}\u2026`;
+  }
+  return collapsed;
+}
 // src/campaign/labeled-store/fs-adapter.ts
 import { createHash } from "crypto";
 import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
@@ -258,6 +498,339 @@ function appendLine(path, line) {
   }
 }
+// src/campaign/presets/run-skill-opt.ts
+/**
+ * Run the SkillOpt loop: repeatedly ask the driver for small patches to a
+ * skill document and keep a patch only when it raises the held-out score.
+ *
+ * Each epoch requests `patchesPerEpoch` patches and applies them to the
+ * current document one at a time. The first candidate whose held-out score
+ * exceeds `current + minImprovement` is accepted; the rest of that epoch's
+ * patches are not scored. Train scenarios only supply proposal evidence;
+ * acceptance is decided on the held-out scenarios alone.
+ *
+ * @param {object} opts - Reads `trainScenarios`, `holdoutScenarios`, `judges`,
+ *   `driver` (must provide `proposePatches`), `baselineSurface`, `maxEpochs`,
+ *   `runDir`, `dispatchWithSurface`, and the optional `patchesPerEpoch` (2),
+ *   `editBudget` (3), `minImprovement` (0), `patience` (defaults to
+ *   `maxEpochs`), `budgetAnneal` (true), `rejectedBufferSize` (12),
+ *   `slowMetaEvery` (2), `evidenceK` (3) and `signal`.
+ * @returns {Promise<object>} `winnerSurface`, baseline and winner held-out
+ *   composites, `lift`, `acceptedEdits`, `rejectedEdits`, `epochsRun`,
+ *   per-epoch `history` and `totalCostUsd`.
+ * @throws {Error} When a scenario list is empty, no judge is given, the train
+ *   and held-out sets share an id, or `minImprovement` is negative.
+ */
+async function runSkillOpt(opts) {
+  if (opts.trainScenarios.length === 0) throw new Error("runSkillOpt: trainScenarios is empty");
+  if (opts.holdoutScenarios.length === 0) throw new Error("runSkillOpt: holdoutScenarios is empty");
+  if (!opts.judges || opts.judges.length === 0) {
+    throw new Error(
+      "runSkillOpt: at least one judge is required \u2014 scoring (and therefore acceptance) is meaningless without one, and would report a silent zero lift."
+    );
+  }
+  const holdoutIds = new Set(opts.holdoutScenarios.map((s) => s.id));
+  const overlap = opts.trainScenarios.filter((s) => holdoutIds.has(s.id)).map((s) => s.id);
+  if (overlap.length > 0) {
+    throw new Error(
+      `runSkillOpt: trainScenarios and holdoutScenarios must be disjoint (overlap: [${overlap.join(
+        ", "
+      )}]) \u2014 a shared scenario leaks the held-out acceptance axis into the proposal evidence.`
+    );
+  }
+  const patchesPerEpoch = opts.patchesPerEpoch ?? 2;
+  const initialBudget = opts.editBudget ?? 3;
+  const minImprovement = opts.minImprovement ?? 0;
+  if (minImprovement < 0) {
+    throw new Error(
+      "runSkillOpt: minImprovement must be >= 0 \u2014 a negative threshold would accept held-out regressions, breaking the monotonic-lift contract."
+    );
+  }
+  const patience = opts.patience ?? opts.maxEpochs;
+  const budgetAnneal = opts.budgetAnneal ?? true;
+  const rejectedBufferSize = opts.rejectedBufferSize ?? 12;
+  const slowMetaEvery = opts.slowMetaEvery ?? 2;
+  // Every scoring campaign, train or held-out, adds to this running total.
+  let totalCostUsd = 0;
+  const scoreHoldout = async (surface, tag) => {
+    const campaign = await runScoringCampaign(opts, opts.holdoutScenarios, surface, tag);
+    totalCostUsd += campaign.aggregates.totalCostUsd;
+    return campaignMeanComposite(campaign);
+  };
+  const evidenceK = opts.evidenceK ?? 3;
+  const trainEvidence = async (surface, tag) => {
+    const campaign = await runScoringCampaign(opts, opts.trainScenarios, surface, tag);
+    totalCostUsd += campaign.aggregates.totalCostUsd;
+    return toEvidence(campaign, evidenceK);
+  };
+  let current = opts.baselineSurface;
+  let currentEvidence = await trainEvidence(current, "baseline-train");
+  const baselineHoldout = await scoreHoldout(current, "baseline-holdout");
+  let currentHoldout = baselineHoldout;
+  // `buffer` is the bounded window of rejections shown to the driver;
+  // `rejectedAll` is the complete log returned to the caller.
+  const buffer = [];
+  const acceptedEdits = [];
+  const rejectedAll = [];
+  const history = [];
+  let budget = initialBudget;
+  let sinceAccept = 0;
+  let metaNote;
+  let epochsRun = 0;
+  for (let epoch = 0; epoch < opts.maxEpochs; epoch++) {
+    epochsRun++;
+    const patches = await opts.driver.proposePatches({
+      surface: current,
+      evidence: currentEvidence,
+      editBudget: budget,
+      rejectedBuffer: buffer,
+      metaNote,
+      count: patchesPerEpoch,
+      signal: opts.signal ?? new AbortController().signal
+    });
+    let accepted = null;
+    const rejectedThisEpoch = [];
+    for (let i = 0; i < patches.length; i++) {
+      const patch = patches[i];
+      const { surface: candidate, applied } = applySkillPatch(current, patch);
+      // Skip scoring a patch that left the text unchanged.
+      if (applied === 0 || candidate === current) {
+        rejectedThisEpoch.push({
+          label: patch.label,
+          rationale: patch.rationale,
+          reason: "no-op (unanchored or zero-change)"
+        });
+        continue;
+      }
+      const candidateHoldout = await scoreHoldout(candidate, `epoch-${epoch}-cand-${i}-holdout`);
+      if (candidateHoldout > currentHoldout + minImprovement) {
+        accepted = {
+          epoch,
+          label: patch.label,
+          rationale: patch.rationale,
+          holdoutDelta: candidateHoldout - currentHoldout
+        };
+        current = candidate;
+        currentHoldout = candidateHoldout;
+        // The document changed, so the proposal evidence is refreshed.
+        currentEvidence = await trainEvidence(current, `epoch-${epoch}-train`);
+        break;
+      }
+      // State the bar that was actually missed. With minImprovement > 0 a
+      // candidate can beat the current score and still be rejected, so
+      // "held-out <= current" alone would be a false statement fed back to
+      // the proposer. With minImprovement === 0 the text is unchanged.
+      const bar = minImprovement > 0 ? `current ${currentHoldout.toFixed(3)} + minImprovement ${minImprovement}` : `current ${currentHoldout.toFixed(3)}`;
+      rejectedThisEpoch.push({
+        label: patch.label,
+        rationale: patch.rationale,
+        reason: `held-out ${candidateHoldout.toFixed(3)} \u2264 ${bar}`
+      });
+    }
+    if (accepted) {
+      acceptedEdits.push(accepted);
+      sinceAccept = 0;
+    } else {
+      sinceAccept++;
+      // After two or more epochs without an accept, shrink the per-patch op
+      // budget by one per epoch, never below 1.
+      if (budgetAnneal && sinceAccept >= 2 && budget > 1) budget--;
+    }
+    for (const r of rejectedThisEpoch) {
+      buffer.push(r);
+      rejectedAll.push(r);
+    }
+    // Drop the oldest rejections once the window is full.
+    while (buffer.length > rejectedBufferSize) buffer.shift();
+    if (slowMetaEvery > 0 && (epoch + 1) % slowMetaEvery === 0) {
+      metaNote = buildMetaNote(acceptedEdits, buffer);
+    }
+    history.push({
+      epoch,
+      editBudget: budget,
+      proposed: patches.length,
+      accepted,
+      rejected: rejectedThisEpoch,
+      holdoutComposite: currentHoldout
+    });
+    if (sinceAccept >= patience) break;
+  }
+  return {
+    winnerSurface: current,
+    baselineHoldoutComposite: baselineHoldout,
+    winnerHoldoutComposite: currentHoldout,
+    lift: currentHoldout - baselineHoldout,
+    acceptedEdits,
+    rejectedEdits: rejectedAll,
+    epochsRun,
+    history,
+    totalCostUsd
+  };
+}
+/**
+ * Run one campaign over `scenarios` with every dispatch bound to `surface`.
+ * All other options are passed through from `opts`; output goes to the
+ * `tag` sub-directory of `opts.runDir`.
+ */
+function runScoringCampaign(opts, scenarios, surface, tag) {
+  const dispatch = (scenario, ctx) => opts.dispatchWithSurface(surface, scenario, ctx);
+  const runDir = `${opts.runDir}/${tag}`;
+  return runCampaign({ ...opts, scenarios, dispatch, runDir });
+}
+/**
+ * Reduce a campaign to proposal evidence: its `k` lowest-composite scenarios
+ * and its `k` lowest-scoring dimensions, each in ascending order.
+ */
+function toEvidence(campaign, k) {
+  const breakdown = campaignBreakdown(campaign);
+  const scenariosAscending = Array.from(breakdown.scenarios);
+  scenariosAscending.sort((a, b) => a.composite - b.composite);
+  const dimensionsAscending = Object.entries(breakdown.dimensions);
+  dimensionsAscending.sort(([, x], [, y]) => x - y);
+  return {
+    weakScenarios: scenariosAscending.slice(0, k),
+    weakDimensions: dimensionsAscending.slice(0, k).map(([dimension, score]) => ({ dimension, score }))
+  };
+}
+/**
+ * Compose the strategy note handed to the proposer: which accepted edits
+ * helped (with their held-out deltas), up to five distinct rejected labels to
+ * avoid, and a closing reminder to keep edits small and anchored.
+ */
+function buildMetaNote(accepted, rejected) {
+  const sentences = [];
+  if (accepted.length > 0) {
+    const wins = accepted.map((a) => `"${a.label}" (+${a.holdoutDelta.toFixed(3)})`).join("; ");
+    sentences.push(`Edits that improved held-out so far: ${wins}. Build on these.`);
+  }
+  if (rejected.length > 0) {
+    // De-duplicate labels in first-seen order before taking the first five.
+    const distinct = new Set();
+    for (const r of rejected) distinct.add(r.label);
+    const labels = Array.from(distinct).slice(0, 5);
+    sentences.push(`Dead ends to avoid: ${labels.join(", ")}. Try a different anchor or rule.`);
+  }
+  sentences.push("Keep edits small and anchored to existing lines.");
+  return sentences.join(" ");
+}
+// src/campaign/presets/compare-drivers.ts
+/**
+ * Run several optimizers and compare their winners on one held-out set.
+ *
+ * The baseline surface and each driver's winner are scored per held-out
+ * scenario. Each winner gets a paired-bootstrap lift against the baseline;
+ * winners are ranked by lift (ties broken by lower cost) and every other
+ * entry is then compared against the top-ranked one.
+ *
+ * @param {object} opts - Reads `drivers` (each `{name, optimize()}`),
+ *   `holdoutScenarios`, `baselineSurface`, `dispatchWithSurface`, `runDir`,
+ *   and the optional `seed` (42), `resamples` (2000), `confidence` (0.95).
+ *   The whole object is also spread into each scoring campaign.
+ * @returns {Promise<{scores: object[], best: object, pairwise: object[], holdoutScenarioIds: string[]}>}
+ * @throws {Error} When `drivers` or `holdoutScenarios` is empty, or when a
+ *   scored surface is missing a composite for any held-out scenario.
+ */
+async function compareDrivers(opts) {
+  if (opts.drivers.length === 0) throw new Error("compareDrivers: no drivers to compare");
+  const seed = opts.seed ?? 42;
+  const resamples = opts.resamples ?? 2e3;
+  const confidence = opts.confidence ?? 0.95;
+  const scoreOnHoldout = async (surface, tag) => {
+    const campaign = await runCampaign({
+      ...opts,
+      scenarios: opts.holdoutScenarios,
+      dispatch: (scenario, ctx) => opts.dispatchWithSurface(surface, scenario, ctx),
+      runDir: `${opts.runDir}/${tag}`
+    });
+    const byScenario = {};
+    for (const { scenarioId, composite } of campaignBreakdown(campaign).scenarios) {
+      byScenario[scenarioId] = composite;
+    }
+    return byScenario;
+  };
+  // Sorted, de-duplicated ids fix the order of every score array, so that
+  // index i refers to the same scenario in all of them.
+  const scenarioIds = [...new Set(opts.holdoutScenarios.map((s) => s.id))].sort();
+  if (scenarioIds.length === 0) throw new Error("compareDrivers: holdoutScenarios is empty");
+  const align = (byScenario, label) => {
+    const missing = scenarioIds.filter((id) => !(id in byScenario));
+    if (missing.length > 0) {
+      throw new Error(
+        `compareDrivers: ${label} produced no held-out score for scenario(s) [${missing.join(
+          ", "
+        )}] \u2014 a cell errored or its judges returned nothing. Refusing to fabricate a 0 (it would corrupt the lift comparison). Fix the dispatch/judge or drop the scenario.`
+      );
+    }
+    return scenarioIds.map((id) => byScenario[id]);
+  };
+  const baselineArr = align(
+    await scoreOnHoldout(opts.baselineSurface, "compare-baseline"),
+    "baseline"
+  );
+  // Drivers run one after another. NOTE(review): two names that slug to the
+  // same string share a run directory tag; confirm names are distinct.
+  const winners = [];
+  for (const d of opts.drivers) {
+    const out = await d.optimize();
+    const byScenario = await scoreOnHoldout(out.winnerSurface, `compare-${slug(d.name)}`);
+    winners.push({
+      name: d.name,
+      winnerSurface: out.winnerSurface,
+      costUsd: out.costUsd,
+      durationMs: out.durationMs,
+      arr: align(byScenario, `driver "${d.name}"`)
+    });
+  }
+  // Each score array travels with its score. Looking arrays up by name after
+  // sorting would compare the wrong data when two entries share a name.
+  const ranked = winners.map((w) => {
+    const boot = pairedBootstrap(baselineArr, w.arr, {
+      seed,
+      resamples,
+      confidence,
+      statistic: "mean"
+    });
+    const score = {
+      name: w.name,
+      baselineComposite: mean(baselineArr),
+      winnerComposite: mean(w.arr),
+      lift: boot.mean,
+      liftCi: { low: boot.low, high: boot.high },
+      costUsd: w.costUsd,
+      winnerSurface: w.winnerSurface,
+      rank: 0
+    };
+    if (w.durationMs !== void 0) score.durationMs = w.durationMs;
+    return { score, arr: w.arr };
+  });
+  // Highest lift first; equal lift is broken by the cheaper run.
+  ranked.sort((a, b) => b.score.lift - a.score.lift || a.score.costUsd - b.score.costUsd);
+  ranked.forEach((r, i) => {
+    r.score.rank = i + 1;
+  });
+  const scores = ranked.map((r) => r.score);
+  const best = scores[0];
+  const bestArr = ranked[0].arr;
+  const pairwise = ranked.slice(1).map(({ score: other, arr: otherArr }) => {
+    const boot = pairedBootstrap(otherArr, bestArr, {
+      seed,
+      resamples,
+      confidence,
+      statistic: "mean"
+    });
+    // An interval entirely above zero favors the best entry, one entirely
+    // below zero favors the other; anything spanning zero is a tie.
+    const favored = boot.low > 0 ? best.name : boot.high < 0 ? other.name : "tie";
+    return {
+      a: best.name,
+      b: other.name,
+      deltaMean: boot.mean,
+      low: boot.low,
+      high: boot.high,
+      favored
+    };
+  });
+  return { scores, best, pairwise, holdoutScenarioIds: scenarioIds };
+}
+/** Arithmetic mean of `xs`; an empty list yields 0 rather than NaN. */
+function mean(xs) {
+  if (xs.length === 0) {
+    return 0;
+  }
+  const total = xs.reduce((sum, x) => sum + x, 0);
+  return total / xs.length;
+}
+/**
+ * Turn a name into a path-friendly token: each run of characters outside
+ * a-z, A-Z and 0-9 becomes a single "-", then the result is lower-cased.
+ * Leading and trailing dashes are kept.
+ */
+function slug(name) {
+  const dashed = name.replace(/[^a-z0-9]+/gi, "-");
+  return dashed.toLowerCase();
+}
+/** Comparison entry for a GEPA loop with `combineParents` off; named "gepa-reflection" unless overridden. */
+function gepaReflectionEntry(config, name = "gepa-reflection") {
+  const combineParents = false;
+  return gepaEntry(config, combineParents, name);
+}
+/** Comparison entry for a GEPA loop with `combineParents` on; named "gepa-pareto" unless overridden. */
+function gepaParetoEntry(config, name = "gepa-pareto") {
+  const combineParents = true;
+  return gepaEntry(config, combineParents, name);
+}
+/**
+ * Build a named comparison entry whose `optimize()` runs a GEPA improvement
+ * loop and reports the winning surface, the summed campaign cost and the
+ * wall-clock duration.
+ *
+ * @param {object} config - Reads `llm`, `model`, `target`, `trainScenarios`,
+ *   `holdoutScenarios`, `baselineSurface`, `dispatchWithSurface`, `judges`,
+ *   `runDir`, and the optional `mutationPrimitives`, `populationSize` (2),
+ *   `maxGenerations` (3) and `seed`.
+ * @param {boolean} combineParents - Forwarded to the GEPA driver.
+ * @param {string} name - Entry name; its slug also names the run directory.
+ * @returns {{name: string, optimize: Function}}
+ */
+function gepaEntry(config, combineParents, name) {
+  // Cost of one generation is the sum over all of its candidate campaigns.
+  const generationCost = (generation) =>
+    generation.surfaces.reduce((s, sf) => s + sf.campaign.aggregates.totalCostUsd, 0);
+  const optimize = async () => {
+    const started = Date.now();
+    const driverOptions = {
+      llm: config.llm,
+      model: config.model,
+      target: config.target,
+      combineParents,
+      ...config.mutationPrimitives ? { mutationPrimitives: config.mutationPrimitives } : {}
+    };
+    const driver = gepaDriver(driverOptions);
+    const gate = defaultProductionGate({
+      holdoutScenarios: config.holdoutScenarios,
+      deltaThreshold: 0
+    });
+    const result = await runImprovementLoop({
+      scenarios: config.trainScenarios,
+      holdoutScenarios: config.holdoutScenarios,
+      baselineSurface: config.baselineSurface,
+      dispatchWithSurface: config.dispatchWithSurface,
+      judges: config.judges,
+      driver,
+      populationSize: config.populationSize ?? 2,
+      maxGenerations: config.maxGenerations ?? 3,
+      gate,
+      autoOnPromote: "none",
+      runDir: `${config.runDir}/${slug(name)}-loop`,
+      ...config.seed !== void 0 ? { seed: config.seed } : {}
+    });
+    const generationsCost = result.generations.reduce((sum, g) => sum + generationCost(g), 0);
+    const costUsd = result.baselineCampaign.aggregates.totalCostUsd + generationsCost;
+    return { winnerSurface: result.winnerSurface, costUsd, durationMs: Date.now() - started };
+  };
+  return { name, optimize };
+}
+/**
+ * Build a named comparison entry whose `optimize()` runs the SkillOpt loop
+ * and reports the winning surface, its total cost and the wall-clock time.
+ *
+ * @param {object} config - Reads `llm`, `model`, `target`, `baselineSurface`,
+ *   `dispatchWithSurface`, `judges`, `trainScenarios`, `holdoutScenarios`,
+ *   `runDir`, and the optional `maxEpochs` (6) and `seed`.
+ * @param {string} [name="skill-opt"] - Entry name; its slug also names the
+ *   run directory.
+ * @returns {{name: string, optimize: Function}}
+ */
+function skillOptEntry(config, name = "skill-opt") {
+  const optimize = async () => {
+    const started = Date.now();
+    const { llm, model, target } = config;
+    const driver = skillOptDriver({ llm, model, target });
+    const loopOptions = {
+      baselineSurface: config.baselineSurface,
+      dispatchWithSurface: config.dispatchWithSurface,
+      judges: config.judges,
+      driver,
+      trainScenarios: config.trainScenarios,
+      holdoutScenarios: config.holdoutScenarios,
+      maxEpochs: config.maxEpochs ?? 6,
+      runDir: `${config.runDir}/${slug(name)}-loop`,
+      ...config.seed !== void 0 ? { seed: config.seed } : {}
+    };
+    const result = await runSkillOpt(loopOptions);
+    const durationMs = Date.now() - started;
+    return { winnerSurface: result.winnerSurface, costUsd: result.totalCostUsd, durationMs };
+  };
+  return { name, optimize };
+}
 // src/campaign/presets/run-profile-matrix.ts
 import { createHash as createHash2 } from "crypto";
 import { join as join2 } from "path";
@@ -272,12 +845,12 @@ function sanitize(id) {
 function sha(input) {
   return createHash2("sha256").update(JSON.stringify(input)).digest("hex");
 }
-function mean(xs) {
+function mean2(xs) {
   return xs.length === 0 ? 0 : xs.reduce((a, b) => a + b, 0) / xs.length;
 }
 function cellComposite(cell) {
   const composites = Object.values(cell.judgeScores).map((s) => s.composite);
-  return composites.length === 0 ? 0 : mean(composites);
+  return composites.length === 0 ? 0 : mean2(composites);
 }
 function buildRunRecord(args) {
   const { cell, profile, profileHash, configHash, experimentId, splitTag, commitSha, matrixId } = args;
@@ -295,7 +868,7 @@ function buildRunRecord(args) {
     if (js.notes) notes.push(`${judgeName}: ${js.notes}`);
   }
   const perDimMean = {};
-  for (const [dim, values] of Object.entries(dimAccum)) perDimMean[dim] = mean(values);
+  for (const [dim, values] of Object.entries(dimAccum)) perDimMean[dim] = mean2(values);
   const outcome = splitTag === "holdout" ? { holdoutScore: composite, raw } : { searchScore: composite, raw };
   if (Object.keys(perJudge).length > 0) {
     outcome.judgeScores = {
@@ -406,7 +979,7 @@ async function runProfileMatrix(opts) {
       profileHash,
       model: profile.model,
       records: profileRecords.length,
-      meanComposite: mean(profileRecords.map(compositeOf)),
+      meanComposite: mean2(profileRecords.map(compositeOf)),
       totalCostUsd: profileRecords.reduce((a, r) => a + r.costUsd, 0),
       integrity: summarizeBackendIntegrity(profileRecords)
     };
@@ -436,7 +1009,7 @@ function rollup(records, keyOf) {
     groups.set(key, arr);
   }
   const out = {};
-  for (const [key, xs] of groups) out[key] = { meanComposite: mean(xs), n: xs.length };
+  for (const [key, xs] of groups) out[key] = { meanComposite: mean2(xs), n: xs.length };
   return out;
 }
 function rollupByPersona(records, scenarios, personaOf) {
@@ -465,7 +1038,7 @@ function defaultGit(args, cwd) {
     throw new WorktreeAdapterError(`git ${args.join(" ")} failed: ${stderr || String(err)}`, err);
   }
 }
-function slug(label) {
+function slug2(label) {
   return label.toLowerCase().replace(/[^a-z0-9]+/g, "-").replace(/^-+|-+$/g, "").slice(0, 48) || "candidate";
 }
 function gitWorktreeAdapter(opts) {
@@ -474,7 +1047,7 @@ function gitWorktreeAdapter(opts) {
   const branchPrefix = opts.branchPrefix ?? "improve";
   return {
     async create({ baseRef, label }) {
-      const id = `${slug(label)}-${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 6)}`;
+      const id = `${slug2(label)}-${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 6)}`;
       const branch = `${branchPrefix}/${id}`;
       const path = join3(worktreeDir, id);
       git(["worktree", "add", "-b", branch, path, baseRef], opts.repoRoot);
@@ -508,8 +1081,13 @@ export {
   FsLabeledScenarioStore,
   LabeledScenarioStoreError,
   ProfileMatrixError,
+  SkillPatchParseError,
   WorktreeAdapterError,
+  applySkillPatch,
   buildLoopProvenanceRecord,
+  campaignBreakdown,
+  campaignMeanComposite,
+  compareDrivers,
   composeGate,
   countSentenceEdits,
   defaultProductionGate,
@@ -519,6 +1097,8 @@ export {
   extractH2Sections,
   fsCampaignStorage,
   gepaDriver,
+  gepaParetoEntry,
+  gepaReflectionEntry,
   gitWorktreeAdapter,
   heldOutGate,
   inMemoryCampaignStorage,
@@ -526,6 +1106,8 @@ export {
   labelTrustRank,
   loopProvenanceSpans,
   openAutoPr,
+  parseSkillPatchResponse,
+  patchEditCount,
   provenanceRecordPath,
   provenanceSpansPath,
   resolveWorktreePath,
@@ -534,6 +1116,9 @@ export {
   runImprovementLoop,
   runOptimization,
   runProfileMatrix,
+  runSkillOpt,
+  skillOptDriver,
+  skillOptEntry,
   surfaceContentHash,
   surfaceHash
 };