npm - @tangle-network/agent-eval - Versions diffs - 0.51.0 → 0.52.0 - Mend

@tangle-network/agent-eval 0.51.0 → 0.52.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

package/CHANGELOG.md +31 -1
package/dist/campaign/index.d.ts +7 -66
package/dist/campaign/index.js +5 -122
package/dist/campaign/index.js.map +1 -1
package/dist/{chunk-XAP6DJZE.js → chunk-YXD7GWJI.js} +35 -2
package/dist/chunk-YXD7GWJI.js.map +1 -0
package/dist/contract/index.d.ts +2 -2
package/dist/contract/index.js +1 -1
package/dist/openapi.json +1 -1
package/dist/{run-improvement-loop-BPMjNKMJ.d.ts → run-improvement-loop-Cc7oZlRP.d.ts} +48 -15
package/docs/specs/driver-honest-spec.md +251 -0
package/docs/specs/hermes-self-improvement-audit.md +93 -0
package/docs/specs/profile-versioning.md +291 -0
package/package.json +1 -1
package/dist/chunk-XAP6DJZE.js.map +0 -1

package/dist/chunk-YXD7GWJI.js.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"sources":["../src/campaign/auto-pr.ts","../src/campaign/drivers/evolutionary.ts","../src/campaign/drivers/gepa.ts","../src/campaign/gates/compose.ts","../src/campaign/gates/default-production-gate.ts","../src/campaign/gates/heldout-gate.ts","../src/campaign/presets/run-eval.ts","../src/campaign/presets/run-optimization.ts","../src/campaign/presets/run-improvement-loop.ts"],"sourcesContent":["/**\n * @experimental\n *\n * `openAutoPr` — thin shell-out helper for the `runImprovementLoop` preset's\n * `autoOnPromote: 'pr'` mode. Substitutes for the per-product PR-opening\n * code consumers duplicated 4 times. The PR body includes the campaign's\n * manifest hash, gate verdict, and scorecard summary so reviewers can see\n * exactly what was promoted + why.\n *\n * NOT a deploy mechanism — this only OPENS a PR. The human reviews + merges.\n * The Shape B (`autoOnPromote: 'config'`) live-runtime-mutation path is\n * deferred to Pass B with the full shadow / canary / rollback stack.\n */\n\nimport { execSync } from 'node:child_process'\nimport { writeFileSync } from 'node:fs'\nimport { tmpdir } from 'node:os'\nimport { join } from 'node:path'\nimport type { CampaignResult, GateResult, Scenario } from './types'\n\nexport interface OpenAutoPrOptions<TArtifact, TScenario extends Scenario> {\n /** Campaign result to attach to the PR. */\n result: CampaignResult<TArtifact, TScenario>\n /** Gate verdict explaining the promotion. Substrate refuses to open a PR\n * when `gate.decision !== 'ship'` — fails loud. */\n gate: GateResult\n /** Promoted surface diff — typically the new system prompt addendum or\n * full profile diff. Substrate writes it as the PR body. */\n promotedDiff: string\n /** GH owner/repo target (e.g., `tangle-network/gtm-agent`). */\n ghOwner: string\n ghRepo: string\n /** Branch name for the PR. Default `auto/<manifestHash[:12]>`. */\n branch?: string\n /** PR title. Default includes manifest hash. 
*/\n title?: string\n /** Whether to actually open the PR or just dry-run. Default reads\n * `GH_AUTO_PR_TOKEN` env — present = open, absent = dry-run. */\n dryRun?: boolean\n /** Test seam — substitute `gh pr create` invocation. */\n ghExec?: (args: string[]) => { stdout: string; stderr: string; status: number }\n}\n\nexport interface OpenAutoPrResult {\n opened: boolean\n prUrl?: string\n dryRun: boolean\n reason: string\n}\n\nexport function openAutoPr<TArtifact, TScenario extends Scenario>(\n options: OpenAutoPrOptions<TArtifact, TScenario>,\n): OpenAutoPrResult {\n if (options.gate.decision !== 'ship') {\n return {\n opened: false,\n dryRun: false,\n reason: `gate verdict was \"${options.gate.decision}\" — refusing to open PR`,\n }\n }\n\n const dryRun = options.dryRun ?? !process.env.GH_AUTO_PR_TOKEN\n const branch = options.branch ?? `auto/${options.result.manifestHash.slice(0, 12)}`\n const title =\n options.title ?? `auto: campaign ${options.result.manifestHash.slice(0, 8)} promoted by gate`\n\n const body = renderPrBody(options.result, options.gate, options.promotedDiff)\n const bodyPath = join(tmpdir(), `auto-pr-body-${Date.now()}.md`)\n writeFileSync(bodyPath, body)\n\n if (dryRun) {\n return {\n opened: false,\n dryRun: true,\n reason: `dry-run (GH_AUTO_PR_TOKEN not set). Would create PR on ${options.ghOwner}/${options.ghRepo} branch ${branch}. Body at ${bodyPath}.`,\n }\n }\n\n const ghExec = options.ghExec ?? 
defaultGhExec\n const result = ghExec([\n 'pr',\n 'create',\n '--repo',\n `${options.ghOwner}/${options.ghRepo}`,\n '--head',\n branch,\n '--title',\n title,\n '--body-file',\n bodyPath,\n ])\n if (result.status !== 0) {\n return {\n opened: false,\n dryRun: false,\n reason: `gh pr create failed (exit ${result.status}): ${result.stderr.slice(0, 400)}`,\n }\n }\n const prUrl = result.stdout.trim()\n return { opened: true, prUrl, dryRun: false, reason: 'PR opened' }\n}\n\nfunction renderPrBody<TArtifact, TScenario extends Scenario>(\n result: CampaignResult<TArtifact, TScenario>,\n gate: GateResult,\n diff: string,\n): string {\n const lines: string[] = []\n lines.push(`## Automated promotion by \\`runImprovementLoop\\``)\n lines.push('')\n lines.push(`**Manifest**: \\`${result.manifestHash}\\``)\n lines.push(`**Seed**: ${result.seed}`)\n lines.push(`**Duration**: ${Math.round(result.durationMs / 1000)}s`)\n lines.push(\n `**Cells**: executed ${result.aggregates.cellsExecuted}, cached ${result.aggregates.cellsCached}, skipped ${result.aggregates.cellsSkipped}, failed ${result.aggregates.cellsFailed}`,\n )\n lines.push(`**Total spend**: $${result.aggregates.totalCostUsd.toFixed(2)}`)\n lines.push('')\n lines.push(`### Gate verdict: \\`${gate.decision}\\``)\n lines.push('')\n for (const reason of gate.reasons) lines.push(`- ${reason}`)\n if (gate.delta !== undefined) lines.push(`- delta: ${gate.delta.toFixed(3)}`)\n lines.push('')\n lines.push('### Contributing gates')\n lines.push('')\n lines.push('| gate | passed | detail |')\n lines.push('|---|---|---|')\n for (const c of gate.contributingGates) {\n const detail =\n typeof c.detail === 'object'\n ? JSON.stringify(c.detail).slice(0, 80)\n : String(c.detail).slice(0, 80)\n lines.push(`| ${c.name} | ${c.passed ? 
'✓' : '✗'} | ${detail} |`)\n }\n lines.push('')\n lines.push('### Promoted surface')\n lines.push('')\n lines.push('```diff')\n lines.push(diff.slice(0, 8000))\n lines.push('```')\n lines.push('')\n lines.push('### By-judge aggregates')\n lines.push('')\n lines.push('| judge | mean | ci95 | n |')\n lines.push('|---|---|---|---|')\n for (const [name, agg] of Object.entries(result.aggregates.byJudge)) {\n lines.push(\n `| ${name} | ${agg.mean.toFixed(3)} | [${agg.ci95[0].toFixed(3)}, ${agg.ci95[1].toFixed(3)}] | ${agg.n} |`,\n )\n }\n return lines.join('\\n')\n}\n\nfunction defaultGhExec(args: string[]): { stdout: string; stderr: string; status: number } {\n try {\n const stdout = execSync(`gh ${args.map(quoteArg).join(' ')}`, {\n env: { ...process.env, GH_TOKEN: process.env.GH_AUTO_PR_TOKEN ?? process.env.GH_TOKEN ?? '' },\n stdio: ['ignore', 'pipe', 'pipe'],\n }).toString('utf8')\n return { stdout, stderr: '', status: 0 }\n } catch (err) {\n const e = err as { status?: number; stderr?: Buffer; stdout?: Buffer }\n return {\n stdout: e.stdout?.toString('utf8') ?? '',\n stderr: e.stderr?.toString('utf8') ?? '',\n status: e.status ?? 1,\n }\n }\n}\n\nfunction quoteArg(arg: string): string {\n if (/^[a-zA-Z0-9_/\\-:.@]+$/.test(arg)) return arg\n return `\"${arg.replace(/\"/g, '\\\\\"')}\"`\n}\n","/**\n * @experimental\n *\n * `evolutionaryDriver` — adapts a stateless `Mutator` (population mutation:\n * GEPA / AxGEPA / reflective-mutation) into an `ImprovementDriver`. This is\n * the evolutionary strategy: each generation, mutate the current best surface\n * into N candidates, measure, select. No generation memory beyond the current\n * surface; the loop body handles ranking + promotion.\n *\n * The reflective alternative is agent-runtime's `improvementDriver` with a\n * `reflectiveGenerator` / `agenticGenerator`: it reasons over the report +\n * trace findings to propose targeted edits rather than blind mutations. 
Both\n * conform to `ImprovementDriver`; the improvement loop is identical regardless\n * of which drives it.\n */\n\nimport type { ImprovementDriver, Mutator } from '../types'\n\nexport interface EvolutionaryDriverOptions<TFindings = unknown> {\n mutator: Mutator<TFindings>\n /** External findings fed to the mutator each generation. Default: []. */\n findings?: TFindings[]\n}\n\nexport function evolutionaryDriver<TFindings = unknown>(\n opts: EvolutionaryDriverOptions<TFindings>,\n): ImprovementDriver<TFindings> {\n return {\n kind: `evolutionary:${opts.mutator.kind}`,\n async propose({ currentSurface, findings, populationSize, signal }) {\n return opts.mutator.mutate({\n findings: findings.length > 0 ? findings : (opts.findings ?? []),\n currentSurface,\n populationSize,\n signal,\n })\n },\n }\n}\n","/**\n * @experimental\n *\n * `gepaDriver` — a reflective `ImprovementDriver` for prompt-tier surfaces.\n * Each generation it reflects on the prior best candidate's per-scenario\n * scores + weakest dimensions, asks an LLM to propose targeted rewrites of\n * the current surface, and returns them as the next population.\n *\n * Honest scope vs the GEPA paper (Agrawal et al., arXiv:2507.19457):\n * this driver implements the *reflection* primitive — it does NOT implement\n * GEPA's Pareto frontier of candidates, multi-objective non-dominated\n * tracking, or the combine-complementary-lessons step. We use \"best by\n * composite\" as the parent each generation; the paper retains a Pareto set\n * and combines lessons across non-dominated candidates. Tracked as #101 in\n * the substrate roadmap. See `docs/specs/driver-honest-spec.md`.\n *\n * Optional `constraints` move structured-doc guards into the driver\n * (preserve H2 section headings, cap sentence-level edits) — useful when\n * the surface IS a structured procedure like a SKILL.md / runbook /\n * judge rubric. 
When `constraints` is omitted, behavior is unchanged.\n *\n * The driver is surface-agnostic — any string surface in any consumer opts\n * in by selecting it. Reuses the generic reflection primitive\n * (`buildReflectionPrompt` / `parseReflectionResponse`) and the router\n * client; no dependency on the legacy `runMultiShotOptimization` /\n * `prompt-evolution` orchestration.\n *\n * Earns its keep where there is real per-instance signal (which the\n * dimensional + per-scenario evidence + the `LabeledScenarioStore` flywheel\n * now provide). For thin-signal surfaces it degrades to plain reflection.\n * On generation 0 (no history) it reflects on the current surface against\n * the mutation primitives alone.\n */\n\nimport { callLlm, type LlmClientOptions } from '../../llm-client'\nimport {\n buildReflectionPrompt,\n parseReflectionResponse,\n type TrialTrace,\n} from '../../reflective-mutation'\nimport type { ImprovementDriver, MutableSurface, ProposeContext } from '../types'\n\nconst REFLECTION_SYSTEM =\n 'You are an expert prompt engineer. Output ONLY a JSON object of shape ' +\n '{\"proposals\":[{\"label\":string,\"rationale\":string,\"payload\":string}]} where ' +\n 'each `payload` is the FULL improved surface text. No prose outside the JSON.'\n\nexport interface GepaDriverConstraints {\n /** H2 section headings that MUST appear unchanged in every candidate.\n * When set, the driver auto-detects current H2s if this is empty AND\n * rejects any candidate that drops or renames a preserved heading.\n * Use when the surface is a structured doc (SKILL.md, runbook,\n * sectioned system prompt, judge rubric). */\n preserveSections?: string[]\n /** Maximum sentence-level edits per candidate vs the parent surface.\n * Rejection threshold = maxSentenceEdits × 2 (counts adds + removes).\n * Inspired by SkillOpt's edit-budget as a \"textual learning rate.\"\n * Cap prevents an LLM rewrite from overwriting useful prior rules. 
*/\n maxSentenceEdits?: number\n}\n\nexport interface GepaDriverOptions {\n /** Router transport (apiKey/baseUrl). */\n llm: LlmClientOptions\n /** Model that performs the reflection. */\n model: string\n /** What is being optimized — appears in the reflection prompt for orientation. */\n target: string\n /** Surface-specific mutation levers offered to the model. */\n mutationPrimitives?: string[]\n /** Top/bottom scenarios surfaced as evidence each generation. Default 3. */\n evidenceK?: number\n /** Reflection sampling temperature. Default 0.7. */\n temperature?: number\n /** Reflection max tokens. Default 6000. */\n maxTokens?: number\n /** Structured-doc constraints. Candidates violating any are rejected\n * post-parse and dropped from the returned population. */\n constraints?: GepaDriverConstraints\n}\n\nexport function gepaDriver(opts: GepaDriverOptions): ImprovementDriver {\n const evidenceK = opts.evidenceK ?? 3\n return {\n kind: 'gepa',\n async propose(ctx: ProposeContext): Promise<MutableSurface[]> {\n const parent =\n typeof ctx.currentSurface === 'string'\n ? ctx.currentSurface\n : JSON.stringify(ctx.currentSurface)\n const { top, bottom, target } = buildEvidence(ctx, evidenceK, opts.target)\n\n const userPrompt = buildReflectionPrompt({\n target,\n parentPayload: parent,\n topTrials: top,\n bottomTrials: bottom,\n childCount: ctx.populationSize,\n mutationPrimitives: opts.mutationPrimitives,\n })\n\n const result = await callLlm(\n {\n model: opts.model,\n messages: [\n { role: 'system', content: REFLECTION_SYSTEM },\n { role: 'user', content: userPrompt },\n ],\n jsonMode: true,\n temperature: opts.temperature ?? 0.7,\n maxTokens: opts.maxTokens ?? 6000,\n },\n opts.llm,\n )\n\n const proposals = parseReflectionResponse(result.content, ctx.populationSize)\n const out: MutableSurface[] = []\n const constraints = opts.constraints\n const preserveSections =\n constraints?.preserveSections !== undefined\n ? constraints.preserveSections.length === 0\n ? 
extractH2Sections(parent)\n : constraints.preserveSections\n : null\n const maxEdits = constraints?.maxSentenceEdits\n for (const proposal of proposals) {\n const text = typeof proposal.payload === 'string' ? proposal.payload.trim() : ''\n if (!text || text === parent || out.includes(text)) continue\n if (preserveSections && !validatePreservedSections(text, preserveSections)) continue\n if (maxEdits !== undefined && countSentenceEdits(parent, text) > maxEdits * 2) continue\n out.push(text)\n }\n return out\n },\n }\n}\n\n/** Extract H2 headings (`## Foo`) from a markdown surface. Exported for\n * consumers building custom mutators that share the same invariant. */\nexport function extractH2Sections(text: string): string[] {\n const out: string[] = []\n for (const line of text.split('\\n')) {\n const match = /^##\\s+(.+?)\\s*$/.exec(line)\n if (match) out.push(match[1]!)\n }\n return out\n}\n\n/** Sentence-level edit distance — count distinct add/remove ops between\n * two surfaces via a normalised line-by-line set diff. Treats trivial\n * whitespace as identical. Exported for tests + consumer-side validators. 
*/\nexport function countSentenceEdits(baseline: string, candidate: string): number {\n const norm = (s: string) =>\n s\n .split(/(?<=[.!?])\\s+|\\n/g)\n .map((p) => p.trim())\n .filter((p) => p.length > 0)\n const a = new Set(norm(baseline))\n const b = new Set(norm(candidate))\n let edits = 0\n for (const s of a) if (!b.has(s)) edits++\n for (const s of b) if (!a.has(s)) edits++\n return edits\n}\n\nfunction validatePreservedSections(candidate: string, required: readonly string[]): boolean {\n if (required.length === 0) return true\n const have = new Set(extractH2Sections(candidate))\n for (const section of required) {\n if (!have.has(section)) return false\n }\n return true\n}\n\n/** Turn the prior generation's best candidate into reflective evidence:\n * top/bottom scenarios by composite + a weakest-dimensions note on the target.\n * Empty on generation 0 — the model reflects on the surface alone. */\nfunction buildEvidence(\n ctx: ProposeContext,\n evidenceK: number,\n baseTarget: string,\n): { top: TrialTrace[]; bottom: TrialTrace[]; target: string } {\n const last = ctx.history.at(-1)\n if (!last || last.candidates.length === 0) {\n return { top: [], bottom: [], target: baseTarget }\n }\n const best = [...last.candidates].sort((a, b) => b.composite - a.composite)[0]\n if (!best) return { top: [], bottom: [], target: baseTarget }\n\n const byScore = [...best.scenarios].sort((a, b) => b.composite - a.composite)\n const toTrace = (s: { scenarioId: string; composite: number }): TrialTrace => ({\n id: s.scenarioId,\n score: s.composite,\n })\n const top = byScore.slice(0, evidenceK).map(toTrace)\n const bottom = byScore.slice(-evidenceK).reverse().map(toTrace)\n\n const weakest = Object.entries(best.dimensions)\n .sort((a, b) => a[1] - b[1])\n .slice(0, 3)\n .map(([dim, value]) => `${dim} (${value.toFixed(2)})`)\n const target =\n weakest.length > 0 ? 
`${baseTarget} — weakest dimensions: ${weakest.join(', ')}` : baseTarget\n\n return { top, bottom, target }\n}\n","/**\n * @experimental\n *\n * Compose multiple `Gate` implementations — every gate must pass for the\n * composite to ship. Closes the alignment reviewer's \"default-only\n * heldOutGate + costGate would happily promote a reward-hacked prompt\"\n * concern by making safety gates first-class composable defaults.\n */\n\nimport type { Gate, GateContext, GateDecision, GateResult, Scenario } from '../types'\n\n/** Compose gates — all must `ship` for the composite to `ship`. First\n * non-ship verdict short-circuits the composite verdict, but ALL gates run\n * (so the result records every gate's reason — useful for diagnostics). */\nexport function composeGate<TArtifact = unknown, TScenario extends Scenario = Scenario>(\n ...gates: Array<Gate<TArtifact, TScenario>>\n): Gate<TArtifact, TScenario> {\n if (gates.length === 0) {\n throw new Error('composeGate requires at least one gate')\n }\n return {\n name: `composed(${gates.map((g) => g.name).join(',')})`,\n async decide(ctx: GateContext<TArtifact, TScenario>): Promise<GateResult> {\n const results: Array<{ gate: Gate<TArtifact, TScenario>; res: GateResult }> = []\n for (const gate of gates) {\n const res = await gate.decide(ctx)\n results.push({ gate, res })\n }\n\n // Substrate-wide verdict policy:\n // - all 'ship' → 'ship'\n // - any 'arch_ceiling' → 'arch_ceiling' (architectural ceiling beats other holds)\n // - any 'model_ceiling' → 'model_ceiling'\n // - any 'hold' → 'hold'\n // - else 'need_more_work'\n const decisions = results.map((r) => r.res.decision)\n const overall: GateDecision = decisions.every((d) => d === 'ship')\n ? 'ship'\n : decisions.includes('arch_ceiling')\n ? 'arch_ceiling'\n : decisions.includes('model_ceiling')\n ? 'model_ceiling'\n : decisions.includes('hold')\n ? 'hold'\n : 'need_more_work'\n\n const contributing = results.flatMap((r) =>\n r.res.contributingGates.length > 0\n ? 
r.res.contributingGates\n : [{ name: r.gate.name, passed: r.res.decision === 'ship', detail: r.res }],\n )\n\n const reasons = results.flatMap((r) =>\n r.res.reasons.map((reason) => `[${r.gate.name}] ${reason}`),\n )\n\n return {\n decision: overall,\n reasons,\n contributingGates: contributing,\n delta: results[0]?.res.delta,\n }\n },\n }\n}\n","/**\n * @experimental\n *\n * `defaultProductionGate` — composes the substrate's existing safety\n * primitives (red-team / reward-hacking / canary / heldout) into a single\n * Gate.decide shape. Closes the alignment + Anthropic-SI reviewers' \"safety\n * primitives are off the critical path\" blocker.\n *\n * The composition is opinionated — when consumers wire `runImprovementLoop`,\n * THIS gate is the default. Consumers can still pass a custom gate to\n * override; the recommended pattern is to compose THIS gate with whatever\n * extra domain-specific gates they need (`composeGate(defaultProductionGate(...), customGate)`).\n */\n\nimport type { CanaryReport } from '../../canary'\nimport { runCanaries } from '../../canary'\nimport type { RedTeamCase } from '../../red-team'\nimport { scoreRedTeamOutput } from '../../red-team'\nimport type { RewardHackingReport } from '../../rl/reward-hacking'\nimport { detectRewardHacking } from '../../rl/reward-hacking'\nimport type { RunRecord } from '../../run-record'\nimport type { Gate, GateContext, GateResult, Scenario } from '../types'\n\nexport interface DefaultProductionGateOptions {\n /** Required: scenarios held out from training; substrate compares\n * candidate-on-holdout vs baseline-on-holdout. */\n holdoutScenarios: Scenario[]\n /** Minimum mean-composite improvement required to ship. Default 0.5. */\n deltaThreshold?: number\n /** Total $ budget for ALL cells in this campaign — including baseline + candidate.\n * Composite verdict refuses to ship when spend exceeded budget. */\n budgetUsd?: number\n /** Red-team cases to probe candidate outputs against. 
When omitted the\n * substrate uses `DEFAULT_RED_TEAM_CORPUS`. Provide a domain-specific\n * battery for tighter coverage. */\n redTeamBattery?: RedTeamCase[]\n /** Run records (oldest-first) needed for the reward-hacking detector.\n * Substrate populates from prior production-loop generations. */\n recentRuns?: RunRecord[]\n /** When true, the gate refuses to ship if the reward-hacking detector\n * fires at the `gaming` severity. Default true. */\n blockOnRewardHackingGaming?: boolean\n}\n\nexport function defaultProductionGate<TArtifact, TScenario extends Scenario>(\n options: DefaultProductionGateOptions,\n): Gate<TArtifact, TScenario> {\n const deltaThreshold = options.deltaThreshold ?? 0.5\n const blockOnGaming = options.blockOnRewardHackingGaming ?? true\n\n return {\n name: 'defaultProductionGate',\n async decide(ctx: GateContext<TArtifact, TScenario>): Promise<GateResult> {\n const reasons: string[] = []\n const contributing: Array<{ name: string; passed: boolean; detail: unknown }> = []\n\n // ── (1) heldout composite delta ─────────────────────────────────\n // Baseline scores come from their OWN map; sharing `judgeScores` would\n // compare the candidate against itself (delta 0).\n const baselineComposite = meanComposite(\n ctx.baselineArtifacts,\n ctx.baselineJudgeScores ?? 
ctx.judgeScores,\n options.holdoutScenarios,\n )\n const candidateComposite = meanComposite(\n ctx.candidateArtifacts,\n ctx.judgeScores,\n options.holdoutScenarios,\n )\n const delta = candidateComposite - baselineComposite\n const heldoutPass = delta >= deltaThreshold\n contributing.push({\n name: 'heldout-delta',\n passed: heldoutPass,\n detail: { baselineComposite, candidateComposite, delta, deltaThreshold },\n })\n if (!heldoutPass) {\n reasons.push(`heldout delta ${delta.toFixed(3)} < threshold ${deltaThreshold}`)\n }\n\n // ── (2) budget gate ─────────────────────────────────────────────\n const budgetPass =\n options.budgetUsd === undefined ||\n ctx.cost.candidate + ctx.cost.baseline <= options.budgetUsd\n contributing.push({\n name: 'budget',\n passed: budgetPass,\n detail: {\n candidateUsd: ctx.cost.candidate,\n baselineUsd: ctx.cost.baseline,\n budgetUsd: options.budgetUsd,\n },\n })\n if (!budgetPass) {\n reasons.push(\n `spend ${(ctx.cost.candidate + ctx.cost.baseline).toFixed(2)} > budget ${options.budgetUsd}`,\n )\n }\n\n // ── (3) red-team probe on candidate ─────────────────────────────\n const redTeamFindings = options.redTeamBattery\n ? probeRedTeam(ctx.candidateArtifacts, options.redTeamBattery)\n : { passed: true, findings: [] }\n contributing.push({\n name: 'red-team',\n passed: redTeamFindings.passed,\n detail: {\n failures: redTeamFindings.findings.length,\n sample: redTeamFindings.findings.slice(0, 3),\n },\n })\n if (!redTeamFindings.passed) {\n reasons.push(`red-team probe failed (${redTeamFindings.findings.length} findings)`)\n }\n\n // ── (4) reward-hacking detector on the run-history window ───────\n let rewardHackingReport: RewardHackingReport | null = null\n if (options.recentRuns && options.recentRuns.length >= 10) {\n rewardHackingReport = detectRewardHacking({ runs: options.recentRuns })\n }\n // reward-hacking severity is numeric (0..1). \"gaming\" threshold per\n // detectRewardHacking defaults = 0.6. 
Block when ANY finding is at\n // gaming threshold OR the report verdict is 'gaming'.\n const gamingThreshold = 0.6\n const gamingFindings = (rewardHackingReport?.findings ?? []).filter(\n (f) => f.severity >= gamingThreshold,\n )\n const rewardHackingPass =\n !rewardHackingReport ||\n !blockOnGaming ||\n (gamingFindings.length === 0 && rewardHackingReport.verdict !== 'gaming')\n contributing.push({\n name: 'reward-hacking',\n passed: rewardHackingPass,\n detail: { report: rewardHackingReport, gamingFindingCount: gamingFindings.length },\n })\n if (!rewardHackingPass) {\n reasons.push(\n `reward-hacking detector flagged ${gamingFindings.length} gaming-severity findings (verdict=${rewardHackingReport!.verdict})`,\n )\n }\n\n // ── (5) canary check on runs ────────────────────────────────────\n let canaryReport: CanaryReport | null = null\n if (options.recentRuns && options.recentRuns.length >= 10) {\n canaryReport = runCanaries(options.recentRuns, {})\n }\n // CanarySeverity is 'info' | 'warn' | 'error' — block on 'error'.\n const errorAlerts = (canaryReport?.alerts ?? []).filter((a) => a.severity === 'error')\n const canaryPass = errorAlerts.length === 0\n contributing.push({\n name: 'canary',\n passed: canaryPass,\n detail: { totalAlerts: canaryReport?.alerts.length ?? 0, errorAlerts: errorAlerts.length },\n })\n if (!canaryPass) {\n reasons.push(`canary error alerts: ${errorAlerts.length}`)\n }\n\n // ── Verdict ─────────────────────────────────────────────────────\n const allPassed = contributing.every((c) => c.passed)\n const decision = allPassed ? 'ship' : 'hold'\n\n return {\n decision,\n reasons: reasons.length > 0 ? 
reasons : ['all gates passed'],\n contributingGates: contributing,\n delta,\n }\n },\n }\n}\n\nfunction meanComposite<TArtifact, TScenario extends Scenario>(\n artifacts: Map<string, TArtifact> | undefined,\n judgeScoresByCell: Map<string, Record<string, { composite: number }>>,\n scenarios: TScenario[],\n): number {\n if (!artifacts || artifacts.size === 0) return 0\n const scenarioIds = new Set(scenarios.map((s) => s.id))\n const composites: number[] = []\n for (const [cellId, scores] of judgeScoresByCell) {\n const scenarioId = cellId.split(':')[0] ?? ''\n if (!scenarioIds.has(scenarioId)) continue\n const cellComposites = Object.values(scores).map((s) => s.composite)\n if (cellComposites.length === 0) continue\n composites.push(cellComposites.reduce((a, b) => a + b, 0) / cellComposites.length)\n }\n if (composites.length === 0) return 0\n return composites.reduce((a, b) => a + b, 0) / composites.length\n}\n\nfunction probeRedTeam<TArtifact>(\n artifacts: Map<string, TArtifact>,\n battery: RedTeamCase[],\n): { passed: boolean; findings: Array<{ scenarioId: string; reason: string }> } {\n const findings: Array<{ scenarioId: string; reason: string }> = []\n for (const [_cellId, artifact] of artifacts) {\n const text = extractText(artifact)\n if (text === undefined) continue\n for (const rtCase of battery) {\n const finding = scoreRedTeamOutput(text, [], rtCase)\n if (!finding.passed) {\n findings.push({ scenarioId: rtCase.id, reason: finding.reason ?? 
'red-team probe failed' })\n }\n }\n }\n return { passed: findings.length === 0, findings }\n}\n\nfunction extractText(artifact: unknown): string | undefined {\n if (typeof artifact === 'string') return artifact\n if (artifact && typeof artifact === 'object') {\n const rec = artifact as Record<string, unknown>\n if (typeof rec.text === 'string') return rec.text\n if (typeof rec.output === 'string') return rec.output\n if (typeof rec.content === 'string') return rec.content\n }\n return undefined\n}\n","/**\n * @experimental\n *\n * Thin Gate adapter — exposes delta-threshold-on-holdout as a composable\n * `Gate`. Use when you want held-out as one of N composed gates instead of\n * the full `defaultProductionGate` stack.\n */\n\nimport type { Gate, GateContext, GateResult, Scenario } from '../types'\n\nexport interface HeldOutGateOptions<TScenario extends Scenario = Scenario> {\n scenarios: TScenario[]\n deltaThreshold?: number\n}\n\nexport function heldOutGate<TArtifact, TScenario extends Scenario>(\n options: HeldOutGateOptions<TScenario>,\n): Gate<TArtifact, TScenario> {\n const deltaThreshold = options.deltaThreshold ?? 0.5\n return {\n name: 'heldOutGate',\n async decide(ctx: GateContext<TArtifact, TScenario>): Promise<GateResult> {\n const scenarioIds = new Set(options.scenarios.map((s) => s.id))\n // Baseline scores live in their OWN map — falling back to `judgeScores`\n // would compare the candidate against itself (delta 0).\n const baseline = meanForScenarios(ctx.baselineJudgeScores ?? ctx.judgeScores, scenarioIds)\n const candidate = meanForScenarios(ctx.judgeScores, scenarioIds)\n const delta = candidate - baseline\n const passed = delta >= deltaThreshold\n return {\n decision: passed ? 'ship' : 'hold',\n reasons: passed\n ? 
[`held-out delta ${delta.toFixed(3)} ≥ ${deltaThreshold}`]\n : [`held-out delta ${delta.toFixed(3)} < ${deltaThreshold}`],\n contributingGates: [\n { name: 'heldOutGate', passed, detail: { baseline, candidate, delta, deltaThreshold } },\n ],\n delta,\n }\n },\n }\n}\n\nfunction meanForScenarios(\n judgeScoresByCell: Map<string, Record<string, { composite: number }>>,\n scenarioIds: Set<string>,\n): number {\n const composites: number[] = []\n for (const [cellId, scores] of judgeScoresByCell) {\n const scenarioId = cellId.split(':')[0] ?? ''\n if (!scenarioIds.has(scenarioId)) continue\n const vals = Object.values(scores).map((s) => s.composite)\n if (vals.length > 0) composites.push(vals.reduce((a, b) => a + b, 0) / vals.length)\n }\n return composites.length === 0 ? 0 : composites.reduce((a, b) => a + b, 0) / composites.length\n}\n","/**\n * @experimental\n *\n * `runEval` — the simplest preset over `runCampaign`. No optimizer, no\n * gate, no auto-PR. Just: run scenarios through dispatch, score with\n * judges, return CampaignResult.\n *\n * The 80% case for consumers who want a scorecard, not an improvement loop.\n */\n\nimport { type RunCampaignOptions, runCampaign } from '../run-campaign'\nimport type { CampaignResult, Scenario } from '../types'\n\nexport interface RunEvalOptions<TScenario extends Scenario, TArtifact>\n extends Omit<RunCampaignOptions<TScenario, TArtifact>, 'runDir'> {\n runDir: string\n}\n\nexport async function runEval<TScenario extends Scenario, TArtifact>(\n opts: RunEvalOptions<TScenario, TArtifact>,\n): Promise<CampaignResult<TArtifact, TScenario>> {\n return runCampaign(opts)\n}\n","/**\n * @experimental\n *\n * `runOptimization` — the improvement loop body. Runs N generations: the\n * `ImprovementDriver` proposes K candidate surfaces per generation, each\n * candidate runs a campaign (the measurement), top-scoring promote to the\n * next generation. 
Driver-agnostic — the same loop runs an evolutionary\n * population mutator (`evolutionaryDriver`) or agent-runtime's\n * `improvementDriver` (reflective / agentic generators); they differ only in\n * how `propose()` picks candidates.\n *\n * This is `runLoop`'s shape (plan → measure → decide) specialized to surface\n * improvement: `driver.propose` = plan, `runCampaign` = the measurement (which\n * runs the worker behind `dispatch`), the mean-composite ranking = the\n * validator, `driver.decide` = the stop check.\n *\n * The gated-promotion shell (`runImprovementLoop`) wraps this with a holdout\n * re-score + release gate + optional PR.\n */\n\nimport { createHash } from 'node:crypto'\nimport { type RunCampaignOptions, runCampaign } from '../run-campaign'\nimport type {\n CampaignResult,\n GenerationRecord,\n ImprovementDriver,\n MutableSurface,\n Scenario,\n} from '../types'\n\nexport interface RunOptimizationOptions<TScenario extends Scenario, TArtifact>\n extends Omit<RunCampaignOptions<TScenario, TArtifact>, 'dispatch'> {\n /** Initial mutable surface (typically system prompt or addendum). */\n baselineSurface: MutableSurface\n /** Dispatcher that takes the CURRENT surface + scenario → artifact. */\n dispatchWithSurface: (\n surface: MutableSurface,\n scenario: TScenario,\n ctx: Parameters<RunCampaignOptions<TScenario, TArtifact>['dispatch']>[1],\n ) => Promise<TArtifact>\n /** The improvement strategy. Wrap a population `Mutator` via\n * `evolutionaryDriver({ mutator })`, or pass agent-runtime's\n * `improvementDriver` (reflective / agentic generators). */\n driver: ImprovementDriver\n populationSize: number\n maxGenerations: number\n /** How many top-scoring candidates carry to the next generation. Default 2. */\n promoteTopK?: number\n /** DEPTH knob forwarded to the driver's `propose()` — max iterations the\n * agentic generator may take per candidate. 
*/\n maxImprovementShots?: number\n /** Phase-2 research report forwarded to `propose()` (analyst findings +\n * diff). Opaque here; the driver types it. */\n report?: unknown\n}\n\nexport interface RunOptimizationResult<TArtifact, TScenario extends Scenario> {\n generations: Array<{\n record: GenerationRecord\n surfaces: Array<{\n surfaceHash: string\n surface: MutableSurface\n campaign: CampaignResult<TArtifact, TScenario>\n }>\n }>\n winnerSurface: MutableSurface\n winnerSurfaceHash: string\n baselineCampaign: CampaignResult<TArtifact, TScenario>\n}\n\nexport async function runOptimization<TScenario extends Scenario, TArtifact>(\n opts: RunOptimizationOptions<TScenario, TArtifact>,\n): Promise<RunOptimizationResult<TArtifact, TScenario>> {\n const promoteTopK = opts.promoteTopK ?? 2\n\n // Baseline run\n const baselineCampaign = await runCampaign<TScenario, TArtifact>({\n ...opts,\n dispatch: (scenario, ctx) => opts.dispatchWithSurface(opts.baselineSurface, scenario, ctx),\n runDir: `${opts.runDir}/baseline`,\n })\n\n const generations: RunOptimizationResult<TArtifact, TScenario>['generations'] = []\n const history: GenerationRecord[] = []\n let currentSurfaces: MutableSurface[] = [opts.baselineSurface]\n let winnerSurface = opts.baselineSurface\n let winnerSurfaceHash = surfaceHash(opts.baselineSurface)\n let winnerComposite = meanComposite(baselineCampaign)\n\n for (let gen = 0; gen < opts.maxGenerations; gen++) {\n // Decide: the driver may stop early based on accumulated history.\n if (opts.driver.decide?.({ history }).stop) break\n\n // Plan: the driver proposes N candidates from the current best surface,\n // the accumulated generation history, and any external findings.\n const candidates = await opts.driver.propose({\n currentSurface: currentSurfaces[0] ?? 
opts.baselineSurface,\n history,\n findings: [],\n populationSize: opts.populationSize,\n generation: gen,\n signal: new AbortController().signal,\n report: opts.report,\n dataset: opts.labeledStore && opts.labeledStore !== 'off' ? opts.labeledStore : undefined,\n maxImprovementShots: opts.maxImprovementShots,\n })\n\n // Run each candidate as its own campaign.\n const surfaceResults: Array<{\n surfaceHash: string\n surface: MutableSurface\n campaign: CampaignResult<TArtifact, TScenario>\n composite: number\n }> = []\n for (let i = 0; i < candidates.length; i++) {\n const surface = candidates[i] as MutableSurface\n const hash = surfaceHash(surface)\n const campaign = await runCampaign<TScenario, TArtifact>({\n ...opts,\n dispatch: (scenario, ctx) => opts.dispatchWithSurface(surface, scenario, ctx),\n runDir: `${opts.runDir}/gen-${gen}/candidate-${i}`,\n })\n const composite = meanComposite(campaign)\n surfaceResults.push({ surfaceHash: hash, surface, campaign, composite })\n }\n\n // Rank, promote top-K.\n surfaceResults.sort((a, b) => b.composite - a.composite)\n const promoted = surfaceResults.slice(0, promoteTopK)\n currentSurfaces = promoted.map((p) => p.surface)\n const top = surfaceResults[0]\n if (top && top.composite > winnerComposite) {\n winnerSurface = top.surface\n winnerSurfaceHash = top.surfaceHash\n winnerComposite = top.composite\n }\n\n const record: GenerationRecord = {\n generationIndex: gen,\n candidates: surfaceResults.map((s) => {\n const breakdown = candidateBreakdown(s.campaign)\n return {\n surfaceHash: s.surfaceHash,\n composite: s.composite,\n ci95: [s.composite, s.composite] as [number, number],\n dimensions: breakdown.dimensions,\n scenarios: breakdown.scenarios,\n }\n }),\n promoted: promoted.map((p) => p.surfaceHash),\n }\n history.push(record)\n generations.push({\n record,\n surfaces: surfaceResults.map((s) => ({\n surfaceHash: s.surfaceHash,\n surface: s.surface,\n campaign: s.campaign,\n })),\n })\n }\n\n return {\n generations,\n 
winnerSurface,\n winnerSurfaceHash,\n baselineCampaign,\n }\n}\n\nexport function surfaceHash(surface: MutableSurface): string {\n // Prompt/tool surfaces (string) hash by content; code surfaces hash by the\n // worktree + base ref pair (the content lives in git, not in the string).\n const material =\n typeof surface === 'string'\n ? surface\n : JSON.stringify({\n kind: surface.kind,\n worktreeRef: surface.worktreeRef,\n baseRef: surface.baseRef ?? null,\n })\n return createHash('sha256').update(material).digest('hex').slice(0, 16)\n}\n\nfunction meanComposite<TArtifact, TScenario extends Scenario>(\n campaign: CampaignResult<TArtifact, TScenario>,\n): number {\n const composites: number[] = []\n for (const cell of campaign.cells) {\n const cellComposites = Object.values(cell.judgeScores).map((s) => s.composite)\n if (cellComposites.length > 0) {\n composites.push(cellComposites.reduce((a, b) => a + b, 0) / cellComposites.length)\n }\n }\n return composites.length === 0 ? 0 : composites.reduce((a, b) => a + b, 0) / composites.length\n}\n\n/** Per-candidate evidence a reflective driver grounds its next proposal on:\n * mean score per judge dimension + per-scenario composite. */\nfunction candidateBreakdown<TArtifact, TScenario extends Scenario>(\n campaign: CampaignResult<TArtifact, TScenario>,\n): {\n dimensions: Record<string, number>\n scenarios: Array<{ scenarioId: string; composite: number }>\n} {\n const dimSums: Record<string, number> = {}\n const dimCounts: Record<string, number> = {}\n const byScenario = new Map<string, number[]>()\n for (const cell of campaign.cells) {\n const judgeScores = Object.values(cell.judgeScores)\n if (judgeScores.length === 0) continue\n const cellComposite = judgeScores.reduce((a, s) => a + s.composite, 0) / judgeScores.length\n const arr = byScenario.get(cell.scenarioId) ?? 
[]\n arr.push(cellComposite)\n byScenario.set(cell.scenarioId, arr)\n for (const score of judgeScores) {\n for (const [key, value] of Object.entries(score.dimensions)) {\n dimSums[key] = (dimSums[key] ?? 0) + value\n dimCounts[key] = (dimCounts[key] ?? 0) + 1\n }\n }\n }\n const dimensions: Record<string, number> = {}\n for (const key of Object.keys(dimSums)) {\n const count = dimCounts[key] ?? 0\n dimensions[key] = count > 0 ? (dimSums[key] ?? 0) / count : 0\n }\n const scenarios = [...byScenario.entries()].map(([scenarioId, comps]) => ({\n scenarioId,\n composite: comps.reduce((a, b) => a + b, 0) / comps.length,\n }))\n return { dimensions, scenarios }\n}\n","/**\n * @experimental\n *\n * `runImprovementLoop` — the gated-promotion shell around the improvement\n * loop body (`runOptimization`). Drives candidate surfaces via the\n * `ImprovementDriver`, re-scores the winner against the baseline on a\n * holdout set, runs the release gate, and optionally opens a PR.\n *\n * Role vocabulary (see docs/design/loop-taxonomy.md):\n * - DRIVER = the `ImprovementDriver` (evolutionary GEPA mutator OR\n * reflective analyst). Proposes candidate SURFACES — the\n * worker's system prompt / tool config — NOT conversation\n * turns.\n * - MEASUREMENT= `runCampaign`. Scores one surface by running the worker\n * (via `dispatch`) over scenarios and judging the output.\n * - WORKER = the agent harness in the sandbox, invoked behind the\n * topology-opaque `dispatch` seam — never referenced here.\n *\n * Distinct from `runLoop` in `@tangle-network/agent-runtime`, which is the\n * INNER conversation loop (driver↔workers in a sandbox). 
`runImprovementLoop`\n * is the OUTER loop: it improves the surface that those workers run.\n *\n * Hard-refuses unsafe configurations:\n * - `tracing: 'off'` when a driver is wired (improvement is unattributable)\n * - `autoOnPromote: 'config'` — DEFERRED to Pass B; v0.40 only ships\n * `'pr'` and `'none'`.\n */\n\nimport { openAutoPr } from '../auto-pr'\nimport type { CampaignResult, Gate, MutableSurface, Scenario } from '../types'\nimport type { RunOptimizationOptions, RunOptimizationResult } from './run-optimization'\nimport { runOptimization } from './run-optimization'\n\nexport interface RunImprovementLoopOptions<TScenario extends Scenario, TArtifact>\n extends RunOptimizationOptions<TScenario, TArtifact> {\n /** Holdout scenarios kept OUT of the training optimization pool — used\n * ONLY to score baseline vs winner for the gate. */\n holdoutScenarios: TScenario[]\n /** Promotion gate. Substrate strongly recommends `defaultProductionGate`\n * for production wiring (composes red-team / reward-hacking / canary /\n * heldout). */\n gate: Gate<TArtifact, TScenario>\n /** What to do when the gate ships:\n * - `'pr'`: open a PR via `openAutoPr`\n * - `'none'`: just report — caller decides what to do with the winner\n * v0.40 does NOT support `'config'` (live-runtime self-mutation) —\n * deferred to Pass B behind safety stack. */\n autoOnPromote: 'pr' | 'none'\n /** GH owner / repo for the auto-PR. Required when autoOnPromote === 'pr'. */\n ghOwner?: string\n ghRepo?: string\n /** Optional render override — substrate writes a diff-shaped surface; pass\n * a function to format the promoted surface differently. 
*/\n renderPromotedDiff?: (winnerSurface: MutableSurface, baselineSurface: MutableSurface) => string\n}\n\nexport interface RunImprovementLoopResult<TArtifact, TScenario extends Scenario>\n extends RunOptimizationResult<TArtifact, TScenario> {\n baselineOnHoldout: CampaignResult<TArtifact, TScenario>\n winnerOnHoldout: CampaignResult<TArtifact, TScenario>\n gateResult: Awaited<ReturnType<Gate<TArtifact, TScenario>['decide']>>\n prResult?: ReturnType<typeof openAutoPr>\n}\n\nexport async function runImprovementLoop<TScenario extends Scenario, TArtifact>(\n opts: RunImprovementLoopOptions<TScenario, TArtifact>,\n): Promise<RunImprovementLoopResult<TArtifact, TScenario>> {\n // ── Safety pre-flight ─────────────────────────────────────────────\n // biome-ignore lint/suspicious/noExplicitAny: Pass A reserved field for Pass B Shape B\n if ((opts as any).autoOnPromote === 'config') {\n throw new Error(\n \"runImprovementLoop: autoOnPromote='config' is deferred to Pass B (requires shadow deploy + rollback + ensemble judges). Use 'pr' or 'none' in v0.40.\",\n )\n }\n // Refuse tracing=off whenever a driver is wired. An improvement loop\n // without traces is unattributable — its candidate surfaces cannot be\n // cited back to the spans that motivated them, and the dataset flywheel\n // (LabeledScenarioStore) that GEPA optimizes against goes unfed.\n if (opts.tracing === 'off' && opts.driver) {\n throw new Error(\n \"runImprovementLoop: tracing='off' is forbidden when a driver is wired. 
The improvement loop without traces is unattributable; candidate surfaces cannot be cited back to spans and the optimization dataset goes unfed.\",\n )\n }\n if (opts.autoOnPromote === 'pr' && (!opts.ghOwner || !opts.ghRepo)) {\n throw new Error(\"runImprovementLoop: autoOnPromote='pr' requires ghOwner + ghRepo.\")\n }\n\n // ── (1) optimization loop produces a winner ────────────────────────\n const optimization = await runOptimization(opts)\n\n // ── (2) baseline + winner re-scored on the holdout set ─────────────\n const { runCampaign } = await import('../run-campaign')\n\n const baselineOnHoldout = await runCampaign<TScenario, TArtifact>({\n ...opts,\n scenarios: opts.holdoutScenarios,\n dispatch: (scenario, ctx) => opts.dispatchWithSurface(opts.baselineSurface, scenario, ctx),\n runDir: `${opts.runDir}/holdout-baseline`,\n })\n\n const winnerOnHoldout = await runCampaign<TScenario, TArtifact>({\n ...opts,\n scenarios: opts.holdoutScenarios,\n dispatch: (scenario, ctx) =>\n opts.dispatchWithSurface(optimization.winnerSurface, scenario, ctx),\n runDir: `${opts.runDir}/holdout-winner`,\n })\n\n // ── (3) gate verdict ───────────────────────────────────────────────\n // Candidate + baseline share cellIds (same holdout scenarios), so their\n // judge scores MUST stay in separate maps — merging them collapses the\n // holdout delta to zero and the gate can never ship a real improvement.\n type ScoreMap = Map<\n string,\n Record<string, { composite: number; dimensions: Record<string, number>; notes: string }>\n >\n const candidateArtifacts = new Map<string, TArtifact>()\n const baselineArtifacts = new Map<string, TArtifact>()\n const judgeScores: ScoreMap = new Map()\n const baselineJudgeScores: ScoreMap = new Map()\n for (const cell of winnerOnHoldout.cells) {\n candidateArtifacts.set(cell.cellId, cell.artifact)\n judgeScores.set(cell.cellId, cell.judgeScores)\n }\n for (const cell of baselineOnHoldout.cells) {\n baselineArtifacts.set(cell.cellId, cell.artifact)\n 
baselineJudgeScores.set(cell.cellId, cell.judgeScores)\n }\n\n const gateResult = await opts.gate.decide({\n candidateArtifacts,\n baselineArtifacts,\n judgeScores,\n baselineJudgeScores,\n scenarios: opts.holdoutScenarios,\n cost: {\n candidate: winnerOnHoldout.aggregates.totalCostUsd,\n baseline: baselineOnHoldout.aggregates.totalCostUsd,\n },\n signal: new AbortController().signal,\n })\n\n // ── (4) auto-PR when gate ships ────────────────────────────────────\n let prResult: ReturnType<typeof openAutoPr> | undefined\n if (opts.autoOnPromote === 'pr' && gateResult.decision === 'ship') {\n const render = opts.renderPromotedDiff ?? defaultRenderDiff\n const promotedDiff = render(optimization.winnerSurface, opts.baselineSurface)\n prResult = openAutoPr({\n result: winnerOnHoldout,\n gate: gateResult,\n promotedDiff,\n ghOwner: opts.ghOwner!,\n ghRepo: opts.ghRepo!,\n })\n }\n\n return {\n ...optimization,\n baselineOnHoldout,\n winnerOnHoldout,\n gateResult,\n prResult,\n }\n}\n\nfunction defaultRenderDiff(winnerSurface: MutableSurface, baselineSurface: MutableSurface): string {\n // Code surfaces aren't text-diffable here — the diff lives in git. Render\n // the worktree/base refs + summary so the PR body points at the change.\n if (typeof winnerSurface !== 'string' || typeof baselineSurface !== 'string') {\n const fmt = (s: MutableSurface): string =>\n typeof s === 'string'\n ? '(prompt surface)'\n : `worktree=${s.worktreeRef}${s.baseRef ? ` base=${s.baseRef}` : ''}${s.summary ? 
`\\n${s.summary}` : ''}`\n return `--- baseline\\n${fmt(baselineSurface)}\\n+++ winner\\n${fmt(winnerSurface)}`\n }\n const lines: string[] = []\n lines.push('--- baseline')\n lines.push('+++ winner')\n for (const l of baselineSurface.split('\\n')) lines.push(`- ${l}`)\n for (const l of winnerSurface.split('\\n')) lines.push(`+ ${l}`)\n return lines.join('\\n')\n}\n"],"mappings":";;;;;;;;;;;;;;;;;AAcA,SAAS,gBAAgB;AACzB,SAAS,qBAAqB;AAC9B,SAAS,cAAc;AACvB,SAAS,YAAY;AAiCd,SAAS,WACd,SACkB;AAClB,MAAI,QAAQ,KAAK,aAAa,QAAQ;AACpC,WAAO;AAAA,MACL,QAAQ;AAAA,MACR,QAAQ;AAAA,MACR,QAAQ,qBAAqB,QAAQ,KAAK,QAAQ;AAAA,IACpD;AAAA,EACF;AAEA,QAAM,SAAS,QAAQ,UAAU,CAAC,QAAQ,IAAI;AAC9C,QAAM,SAAS,QAAQ,UAAU,QAAQ,QAAQ,OAAO,aAAa,MAAM,GAAG,EAAE,CAAC;AACjF,QAAM,QACJ,QAAQ,SAAS,kBAAkB,QAAQ,OAAO,aAAa,MAAM,GAAG,CAAC,CAAC;AAE5E,QAAM,OAAO,aAAa,QAAQ,QAAQ,QAAQ,MAAM,QAAQ,YAAY;AAC5E,QAAM,WAAW,KAAK,OAAO,GAAG,gBAAgB,KAAK,IAAI,CAAC,KAAK;AAC/D,gBAAc,UAAU,IAAI;AAE5B,MAAI,QAAQ;AACV,WAAO;AAAA,MACL,QAAQ;AAAA,MACR,QAAQ;AAAA,MACR,QAAQ,0DAA0D,QAAQ,OAAO,IAAI,QAAQ,MAAM,WAAW,MAAM,aAAa,QAAQ;AAAA,IAC3I;AAAA,EACF;AAEA,QAAM,SAAS,QAAQ,UAAU;AACjC,QAAM,SAAS,OAAO;AAAA,IACpB;AAAA,IACA;AAAA,IACA;AAAA,IACA,GAAG,QAAQ,OAAO,IAAI,QAAQ,MAAM;AAAA,IACpC;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF,CAAC;AACD,MAAI,OAAO,WAAW,GAAG;AACvB,WAAO;AAAA,MACL,QAAQ;AAAA,MACR,QAAQ;AAAA,MACR,QAAQ,6BAA6B,OAAO,MAAM,MAAM,OAAO,OAAO,MAAM,GAAG,GAAG,CAAC;AAAA,IACrF;AAAA,EACF;AACA,QAAM,QAAQ,OAAO,OAAO,KAAK;AACjC,SAAO,EAAE,QAAQ,MAAM,OAAO,QAAQ,OAAO,QAAQ,YAAY;AACnE;AAEA,SAAS,aACP,QACA,MACA,MACQ;AACR,QAAM,QAAkB,CAAC;AACzB,QAAM,KAAK,kDAAkD;AAC7D,QAAM,KAAK,EAAE;AACb,QAAM,KAAK,mBAAmB,OAAO,YAAY,IAAI;AACrD,QAAM,KAAK,aAAa,OAAO,IAAI,EAAE;AACrC,QAAM,KAAK,iBAAiB,KAAK,MAAM,OAAO,aAAa,GAAI,CAAC,GAAG;AACnE,QAAM;AAAA,IACJ,uBAAuB,OAAO,WAAW,aAAa,YAAY,OAAO,WAAW,WAAW,aAAa,OAAO,WAAW,YAAY,YAAY,OAAO,WAAW,WAAW;AAAA,EACrL;AACA,QAAM,KAAK,qBAAqB,OAAO,WAAW,aAAa,QAAQ,CAAC,CAAC,EAAE;AAC3E,QAAM,KAAK,EAAE;AACb,QAAM,KAAK,uBAAuB,KAAK,QAAQ,IAAI;AACnD,QAAM,KAAK,EAAE;AACb,aAAW,UAAU,KAAK,QA
AS,OAAM,KAAK,KAAK,MAAM,EAAE;AAC3D,MAAI,KAAK,UAAU,OAAW,OAAM,KAAK,YAAY,KAAK,MAAM,QAAQ,CAAC,CAAC,EAAE;AAC5E,QAAM,KAAK,EAAE;AACb,QAAM,KAAK,wBAAwB;AACnC,QAAM,KAAK,EAAE;AACb,QAAM,KAAK,4BAA4B;AACvC,QAAM,KAAK,eAAe;AAC1B,aAAW,KAAK,KAAK,mBAAmB;AACtC,UAAM,SACJ,OAAO,EAAE,WAAW,WAChB,KAAK,UAAU,EAAE,MAAM,EAAE,MAAM,GAAG,EAAE,IACpC,OAAO,EAAE,MAAM,EAAE,MAAM,GAAG,EAAE;AAClC,UAAM,KAAK,KAAK,EAAE,IAAI,MAAM,EAAE,SAAS,WAAM,QAAG,MAAM,MAAM,IAAI;AAAA,EAClE;AACA,QAAM,KAAK,EAAE;AACb,QAAM,KAAK,sBAAsB;AACjC,QAAM,KAAK,EAAE;AACb,QAAM,KAAK,SAAS;AACpB,QAAM,KAAK,KAAK,MAAM,GAAG,GAAI,CAAC;AAC9B,QAAM,KAAK,KAAK;AAChB,QAAM,KAAK,EAAE;AACb,QAAM,KAAK,yBAAyB;AACpC,QAAM,KAAK,EAAE;AACb,QAAM,KAAK,6BAA6B;AACxC,QAAM,KAAK,mBAAmB;AAC9B,aAAW,CAAC,MAAM,GAAG,KAAK,OAAO,QAAQ,OAAO,WAAW,OAAO,GAAG;AACnE,UAAM;AAAA,MACJ,KAAK,IAAI,MAAM,IAAI,KAAK,QAAQ,CAAC,CAAC,OAAO,IAAI,KAAK,CAAC,EAAE,QAAQ,CAAC,CAAC,KAAK,IAAI,KAAK,CAAC,EAAE,QAAQ,CAAC,CAAC,OAAO,IAAI,CAAC;AAAA,IACxG;AAAA,EACF;AACA,SAAO,MAAM,KAAK,IAAI;AACxB;AAEA,SAAS,cAAc,MAAoE;AACzF,MAAI;AACF,UAAM,SAAS,SAAS,MAAM,KAAK,IAAI,QAAQ,EAAE,KAAK,GAAG,CAAC,IAAI;AAAA,MAC5D,KAAK,EAAE,GAAG,QAAQ,KAAK,UAAU,QAAQ,IAAI,oBAAoB,QAAQ,IAAI,YAAY,GAAG;AAAA,MAC5F,OAAO,CAAC,UAAU,QAAQ,MAAM;AAAA,IAClC,CAAC,EAAE,SAAS,MAAM;AAClB,WAAO,EAAE,QAAQ,QAAQ,IAAI,QAAQ,EAAE;AAAA,EACzC,SAAS,KAAK;AACZ,UAAM,IAAI;AACV,WAAO;AAAA,MACL,QAAQ,EAAE,QAAQ,SAAS,MAAM,KAAK;AAAA,MACtC,QAAQ,EAAE,QAAQ,SAAS,MAAM,KAAK;AAAA,MACtC,QAAQ,EAAE,UAAU;AAAA,IACtB;AAAA,EACF;AACF;AAEA,SAAS,SAAS,KAAqB;AACrC,MAAI,wBAAwB,KAAK,GAAG,EAAG,QAAO;AAC9C,SAAO,IAAI,IAAI,QAAQ,MAAM,KAAK,CAAC;AACrC;;;ACrJO,SAAS,mBACd,MAC8B;AAC9B,SAAO;AAAA,IACL,MAAM,gBAAgB,KAAK,QAAQ,IAAI;AAAA,IACvC,MAAM,QAAQ,EAAE,gBAAgB,UAAU,gBAAgB,OAAO,GAAG;AAClE,aAAO,KAAK,QAAQ,OAAO;AAAA,QACzB,UAAU,SAAS,SAAS,IAAI,WAAY,KAAK,YAAY,CAAC;AAAA,QAC9D;AAAA,QACA;AAAA,QACA;AAAA,MACF,CAAC;AAAA,IACH;AAAA,EACF;AACF;;;ACIA,IAAM,oBACJ;AAsCK,SAAS,WAAW,MAA4C;AACrE,QAAM,YAAY,KAAK,aAAa;AACpC,SAAO;AAAA,IACL,MAAM;AAAA,IACN,MAAM,QAAQ,KAAgD;AAC5D,YAAM,SACJ,OAAO,IAAI,mBAAmB,WAC1B,IAAI,iBACJ,KAAK,UAAU,IAAI,cAAc;AACvC,YAAM,
EAAE,KAAK,QAAQ,OAAO,IAAI,cAAc,KAAK,WAAW,KAAK,MAAM;AAEzE,YAAM,aAAa,sBAAsB;AAAA,QACvC;AAAA,QACA,eAAe;AAAA,QACf,WAAW;AAAA,QACX,cAAc;AAAA,QACd,YAAY,IAAI;AAAA,QAChB,oBAAoB,KAAK;AAAA,MAC3B,CAAC;AAED,YAAM,SAAS,MAAM;AAAA,QACnB;AAAA,UACE,OAAO,KAAK;AAAA,UACZ,UAAU;AAAA,YACR,EAAE,MAAM,UAAU,SAAS,kBAAkB;AAAA,YAC7C,EAAE,MAAM,QAAQ,SAAS,WAAW;AAAA,UACtC;AAAA,UACA,UAAU;AAAA,UACV,aAAa,KAAK,eAAe;AAAA,UACjC,WAAW,KAAK,aAAa;AAAA,QAC/B;AAAA,QACA,KAAK;AAAA,MACP;AAEA,YAAM,YAAY,wBAAwB,OAAO,SAAS,IAAI,cAAc;AAC5E,YAAM,MAAwB,CAAC;AAC/B,YAAM,cAAc,KAAK;AACzB,YAAM,mBACJ,aAAa,qBAAqB,SAC9B,YAAY,iBAAiB,WAAW,IACtC,kBAAkB,MAAM,IACxB,YAAY,mBACd;AACN,YAAM,WAAW,aAAa;AAC9B,iBAAW,YAAY,WAAW;AAChC,cAAM,OAAO,OAAO,SAAS,YAAY,WAAW,SAAS,QAAQ,KAAK,IAAI;AAC9E,YAAI,CAAC,QAAQ,SAAS,UAAU,IAAI,SAAS,IAAI,EAAG;AACpD,YAAI,oBAAoB,CAAC,0BAA0B,MAAM,gBAAgB,EAAG;AAC5E,YAAI,aAAa,UAAa,mBAAmB,QAAQ,IAAI,IAAI,WAAW,EAAG;AAC/E,YAAI,KAAK,IAAI;AAAA,MACf;AACA,aAAO;AAAA,IACT;AAAA,EACF;AACF;AAIO,SAAS,kBAAkB,MAAwB;AACxD,QAAM,MAAgB,CAAC;AACvB,aAAW,QAAQ,KAAK,MAAM,IAAI,GAAG;AACnC,UAAM,QAAQ,kBAAkB,KAAK,IAAI;AACzC,QAAI,MAAO,KAAI,KAAK,MAAM,CAAC,CAAE;AAAA,EAC/B;AACA,SAAO;AACT;AAKO,SAAS,mBAAmB,UAAkB,WAA2B;AAC9E,QAAM,OAAO,CAAC,MACZ,EACG,MAAM,mBAAmB,EACzB,IAAI,CAAC,MAAM,EAAE,KAAK,CAAC,EACnB,OAAO,CAAC,MAAM,EAAE,SAAS,CAAC;AAC/B,QAAM,IAAI,IAAI,IAAI,KAAK,QAAQ,CAAC;AAChC,QAAM,IAAI,IAAI,IAAI,KAAK,SAAS,CAAC;AACjC,MAAI,QAAQ;AACZ,aAAW,KAAK,EAAG,KAAI,CAAC,EAAE,IAAI,CAAC,EAAG;AAClC,aAAW,KAAK,EAAG,KAAI,CAAC,EAAE,IAAI,CAAC,EAAG;AAClC,SAAO;AACT;AAEA,SAAS,0BAA0B,WAAmB,UAAsC;AAC1F,MAAI,SAAS,WAAW,EAAG,QAAO;AAClC,QAAM,OAAO,IAAI,IAAI,kBAAkB,SAAS,CAAC;AACjD,aAAW,WAAW,UAAU;AAC9B,QAAI,CAAC,KAAK,IAAI,OAAO,EAAG,QAAO;AAAA,EACjC;AACA,SAAO;AACT;AAKA,SAAS,cACP,KACA,WACA,YAC6D;AAC7D,QAAM,OAAO,IAAI,QAAQ,GAAG,EAAE;AAC9B,MAAI,CAAC,QAAQ,KAAK,WAAW,WAAW,GAAG;AACzC,WAAO,EAAE,KAAK,CAAC,GAAG,QAAQ,CAAC,GAAG,QAAQ,WAAW;AAAA,EACnD;AACA,QAAM,OAAO,CAAC,GAAG,KAAK,UAAU,EAAE,KAAK,CAAC,GAAG,MAAM,EAAE,YAAY,EAAE,SAAS,EAAE,CAAC;AAC7E,MAAI,CAAC,KAAM,QAAO,EAAE,KAAK,CAAC,GAAG,QAAQ,CAAC,GAAG,QAAQ,WAAW;AAE5D,
QAAM,UAAU,CAAC,GAAG,KAAK,SAAS,EAAE,KAAK,CAAC,GAAG,MAAM,EAAE,YAAY,EAAE,SAAS;AAC5E,QAAM,UAAU,CAAC,OAA8D;AAAA,IAC7E,IAAI,EAAE;AAAA,IACN,OAAO,EAAE;AAAA,EACX;AACA,QAAM,MAAM,QAAQ,MAAM,GAAG,SAAS,EAAE,IAAI,OAAO;AACnD,QAAM,SAAS,QAAQ,MAAM,CAAC,SAAS,EAAE,QAAQ,EAAE,IAAI,OAAO;AAE9D,QAAM,UAAU,OAAO,QAAQ,KAAK,UAAU,EAC3C,KAAK,CAAC,GAAG,MAAM,EAAE,CAAC,IAAI,EAAE,CAAC,CAAC,EAC1B,MAAM,GAAG,CAAC,EACV,IAAI,CAAC,CAAC,KAAK,KAAK,MAAM,GAAG,GAAG,KAAK,MAAM,QAAQ,CAAC,CAAC,GAAG;AACvD,QAAM,SACJ,QAAQ,SAAS,IAAI,GAAG,UAAU,+BAA0B,QAAQ,KAAK,IAAI,CAAC,KAAK;AAErF,SAAO,EAAE,KAAK,QAAQ,OAAO;AAC/B;;;AC/LO,SAAS,eACX,OACyB;AAC5B,MAAI,MAAM,WAAW,GAAG;AACtB,UAAM,IAAI,MAAM,wCAAwC;AAAA,EAC1D;AACA,SAAO;AAAA,IACL,MAAM,YAAY,MAAM,IAAI,CAAC,MAAM,EAAE,IAAI,EAAE,KAAK,GAAG,CAAC;AAAA,IACpD,MAAM,OAAO,KAA6D;AACxE,YAAM,UAAwE,CAAC;AAC/E,iBAAW,QAAQ,OAAO;AACxB,cAAM,MAAM,MAAM,KAAK,OAAO,GAAG;AACjC,gBAAQ,KAAK,EAAE,MAAM,IAAI,CAAC;AAAA,MAC5B;AAQA,YAAM,YAAY,QAAQ,IAAI,CAAC,MAAM,EAAE,IAAI,QAAQ;AACnD,YAAM,UAAwB,UAAU,MAAM,CAAC,MAAM,MAAM,MAAM,IAC7D,SACA,UAAU,SAAS,cAAc,IAC/B,iBACA,UAAU,SAAS,eAAe,IAChC,kBACA,UAAU,SAAS,MAAM,IACvB,SACA;AAEV,YAAM,eAAe,QAAQ;AAAA,QAAQ,CAAC,MACpC,EAAE,IAAI,kBAAkB,SAAS,IAC7B,EAAE,IAAI,oBACN,CAAC,EAAE,MAAM,EAAE,KAAK,MAAM,QAAQ,EAAE,IAAI,aAAa,QAAQ,QAAQ,EAAE,IAAI,CAAC;AAAA,MAC9E;AAEA,YAAM,UAAU,QAAQ;AAAA,QAAQ,CAAC,MAC/B,EAAE,IAAI,QAAQ,IAAI,CAAC,WAAW,IAAI,EAAE,KAAK,IAAI,KAAK,MAAM,EAAE;AAAA,MAC5D;AAEA,aAAO;AAAA,QACL,UAAU;AAAA,QACV;AAAA,QACA,mBAAmB;AAAA,QACnB,OAAO,QAAQ,CAAC,GAAG,IAAI;AAAA,MACzB;AAAA,IACF;AAAA,EACF;AACF;;;ACpBO,SAAS,sBACd,SAC4B;AAC5B,QAAM,iBAAiB,QAAQ,kBAAkB;AACjD,QAAM,gBAAgB,QAAQ,8BAA8B;AAE5D,SAAO;AAAA,IACL,MAAM;AAAA,IACN,MAAM,OAAO,KAA6D;AACxE,YAAM,UAAoB,CAAC;AAC3B,YAAM,eAA0E,CAAC;AAKjF,YAAM,oBAAoB;AAAA,QACxB,IAAI;AAAA,QACJ,IAAI,uBAAuB,IAAI;AAAA,QAC/B,QAAQ;AAAA,MACV;AACA,YAAM,qBAAqB;AAAA,QACzB,IAAI;AAAA,QACJ,IAAI;AAAA,QACJ,QAAQ;AAAA,MACV;AACA,YAAM,QAAQ,qBAAqB;AACnC,YAAM,cAAc,SAAS;AAC7B,mBAAa,KAAK;AAAA,QAChB,MAAM;AAAA,QACN,QAAQ;AAAA,QACR,QAAQ,EAAE,mBAAmB,oBAAoB,OAAO,eAAe;AAAA,MACzE,CAAC;AACD,UAAI,CAAC,aAAa;AA
ChB,gBAAQ,KAAK,iBAAiB,MAAM,QAAQ,CAAC,CAAC,gBAAgB,cAAc,EAAE;AAAA,MAChF;AAGA,YAAM,aACJ,QAAQ,cAAc,UACtB,IAAI,KAAK,YAAY,IAAI,KAAK,YAAY,QAAQ;AACpD,mBAAa,KAAK;AAAA,QAChB,MAAM;AAAA,QACN,QAAQ;AAAA,QACR,QAAQ;AAAA,UACN,cAAc,IAAI,KAAK;AAAA,UACvB,aAAa,IAAI,KAAK;AAAA,UACtB,WAAW,QAAQ;AAAA,QACrB;AAAA,MACF,CAAC;AACD,UAAI,CAAC,YAAY;AACf,gBAAQ;AAAA,UACN,UAAU,IAAI,KAAK,YAAY,IAAI,KAAK,UAAU,QAAQ,CAAC,CAAC,aAAa,QAAQ,SAAS;AAAA,QAC5F;AAAA,MACF;AAGA,YAAM,kBAAkB,QAAQ,iBAC5B,aAAa,IAAI,oBAAoB,QAAQ,cAAc,IAC3D,EAAE,QAAQ,MAAM,UAAU,CAAC,EAAE;AACjC,mBAAa,KAAK;AAAA,QAChB,MAAM;AAAA,QACN,QAAQ,gBAAgB;AAAA,QACxB,QAAQ;AAAA,UACN,UAAU,gBAAgB,SAAS;AAAA,UACnC,QAAQ,gBAAgB,SAAS,MAAM,GAAG,CAAC;AAAA,QAC7C;AAAA,MACF,CAAC;AACD,UAAI,CAAC,gBAAgB,QAAQ;AAC3B,gBAAQ,KAAK,0BAA0B,gBAAgB,SAAS,MAAM,YAAY;AAAA,MACpF;AAGA,UAAI,sBAAkD;AACtD,UAAI,QAAQ,cAAc,QAAQ,WAAW,UAAU,IAAI;AACzD,8BAAsB,oBAAoB,EAAE,MAAM,QAAQ,WAAW,CAAC;AAAA,MACxE;AAIA,YAAM,kBAAkB;AACxB,YAAM,kBAAkB,qBAAqB,YAAY,CAAC,GAAG;AAAA,QAC3D,CAAC,MAAM,EAAE,YAAY;AAAA,MACvB;AACA,YAAM,oBACJ,CAAC,uBACD,CAAC,iBACA,eAAe,WAAW,KAAK,oBAAoB,YAAY;AAClE,mBAAa,KAAK;AAAA,QAChB,MAAM;AAAA,QACN,QAAQ;AAAA,QACR,QAAQ,EAAE,QAAQ,qBAAqB,oBAAoB,eAAe,OAAO;AAAA,MACnF,CAAC;AACD,UAAI,CAAC,mBAAmB;AACtB,gBAAQ;AAAA,UACN,mCAAmC,eAAe,MAAM,sCAAsC,oBAAqB,OAAO;AAAA,QAC5H;AAAA,MACF;AAGA,UAAI,eAAoC;AACxC,UAAI,QAAQ,cAAc,QAAQ,WAAW,UAAU,IAAI;AACzD,uBAAe,YAAY,QAAQ,YAAY,CAAC,CAAC;AAAA,MACnD;AAEA,YAAM,eAAe,cAAc,UAAU,CAAC,GAAG,OAAO,CAAC,MAAM,EAAE,aAAa,OAAO;AACrF,YAAM,aAAa,YAAY,WAAW;AAC1C,mBAAa,KAAK;AAAA,QAChB,MAAM;AAAA,QACN,QAAQ;AAAA,QACR,QAAQ,EAAE,aAAa,cAAc,OAAO,UAAU,GAAG,aAAa,YAAY,OAAO;AAAA,MAC3F,CAAC;AACD,UAAI,CAAC,YAAY;AACf,gBAAQ,KAAK,wBAAwB,YAAY,MAAM,EAAE;AAAA,MAC3D;AAGA,YAAM,YAAY,aAAa,MAAM,CAAC,MAAM,EAAE,MAAM;AACpD,YAAM,WAAW,YAAY,SAAS;AAEtC,aAAO;AAAA,QACL;AAAA,QACA,SAAS,QAAQ,SAAS,IAAI,UAAU,CAAC,kBAAkB;AAAA,QAC3D,mBAAmB;AAAA,QACnB;AAAA,MACF;AAAA,IACF;AAAA,EACF;AACF;AAEA,SAAS,cACP,WACA,mBACA,WACQ;AACR,MAAI,CAAC,aAAa,UAAU,SAAS,EAAG,QAAO;AAC/C,QAAM,cAAc,IAAI,IAAI,UAAU,IAAI,CAAC,MAAM,EAAE,EAAE,CAAC;AACtD,QAAM,aAAuB,
CAAC;AAC9B,aAAW,CAAC,QAAQ,MAAM,KAAK,mBAAmB;AAChD,UAAM,aAAa,OAAO,MAAM,GAAG,EAAE,CAAC,KAAK;AAC3C,QAAI,CAAC,YAAY,IAAI,UAAU,EAAG;AAClC,UAAM,iBAAiB,OAAO,OAAO,MAAM,EAAE,IAAI,CAAC,MAAM,EAAE,SAAS;AACnE,QAAI,eAAe,WAAW,EAAG;AACjC,eAAW,KAAK,eAAe,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,eAAe,MAAM;AAAA,EACnF;AACA,MAAI,WAAW,WAAW,EAAG,QAAO;AACpC,SAAO,WAAW,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,WAAW;AAC5D;AAEA,SAAS,aACP,WACA,SAC8E;AAC9E,QAAM,WAA0D,CAAC;AACjE,aAAW,CAAC,SAAS,QAAQ,KAAK,WAAW;AAC3C,UAAM,OAAO,YAAY,QAAQ;AACjC,QAAI,SAAS,OAAW;AACxB,eAAW,UAAU,SAAS;AAC5B,YAAM,UAAU,mBAAmB,MAAM,CAAC,GAAG,MAAM;AACnD,UAAI,CAAC,QAAQ,QAAQ;AACnB,iBAAS,KAAK,EAAE,YAAY,OAAO,IAAI,QAAQ,QAAQ,UAAU,wBAAwB,CAAC;AAAA,MAC5F;AAAA,IACF;AAAA,EACF;AACA,SAAO,EAAE,QAAQ,SAAS,WAAW,GAAG,SAAS;AACnD;AAEA,SAAS,YAAY,UAAuC;AAC1D,MAAI,OAAO,aAAa,SAAU,QAAO;AACzC,MAAI,YAAY,OAAO,aAAa,UAAU;AAC5C,UAAM,MAAM;AACZ,QAAI,OAAO,IAAI,SAAS,SAAU,QAAO,IAAI;AAC7C,QAAI,OAAO,IAAI,WAAW,SAAU,QAAO,IAAI;AAC/C,QAAI,OAAO,IAAI,YAAY,SAAU,QAAO,IAAI;AAAA,EAClD;AACA,SAAO;AACT;;;AC5MO,SAAS,YACd,SAC4B;AAC5B,QAAM,iBAAiB,QAAQ,kBAAkB;AACjD,SAAO;AAAA,IACL,MAAM;AAAA,IACN,MAAM,OAAO,KAA6D;AACxE,YAAM,cAAc,IAAI,IAAI,QAAQ,UAAU,IAAI,CAAC,MAAM,EAAE,EAAE,CAAC;AAG9D,YAAM,WAAW,iBAAiB,IAAI,uBAAuB,IAAI,aAAa,WAAW;AACzF,YAAM,YAAY,iBAAiB,IAAI,aAAa,WAAW;AAC/D,YAAM,QAAQ,YAAY;AAC1B,YAAM,SAAS,SAAS;AACxB,aAAO;AAAA,QACL,UAAU,SAAS,SAAS;AAAA,QAC5B,SAAS,SACL,CAAC,kBAAkB,MAAM,QAAQ,CAAC,CAAC,WAAM,cAAc,EAAE,IACzD,CAAC,kBAAkB,MAAM,QAAQ,CAAC,CAAC,MAAM,cAAc,EAAE;AAAA,QAC7D,mBAAmB;AAAA,UACjB,EAAE,MAAM,eAAe,QAAQ,QAAQ,EAAE,UAAU,WAAW,OAAO,eAAe,EAAE;AAAA,QACxF;AAAA,QACA;AAAA,MACF;AAAA,IACF;AAAA,EACF;AACF;AAEA,SAAS,iBACP,mBACA,aACQ;AACR,QAAM,aAAuB,CAAC;AAC9B,aAAW,CAAC,QAAQ,MAAM,KAAK,mBAAmB;AAChD,UAAM,aAAa,OAAO,MAAM,GAAG,EAAE,CAAC,KAAK;AAC3C,QAAI,CAAC,YAAY,IAAI,UAAU,EAAG;AAClC,UAAM,OAAO,OAAO,OAAO,MAAM,EAAE,IAAI,CAAC,MAAM,EAAE,SAAS;AACzD,QAAI,KAAK,SAAS,EAAG,YAAW,KAAK,KAAK,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,KAAK,MAAM;AAAA,EACpF;AACA,SAAO,WAAW,WAAW,IAAI,IAAI,WAAW,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAA
C,IAAI,WAAW;AAC1F;;;ACrCA,eAAsB,QACpB,MAC+C;AAC/C,SAAO,YAAY,IAAI;AACzB;;;ACFA,SAAS,kBAAkB;AAkD3B,eAAsB,gBACpB,MACsD;AACtD,QAAM,cAAc,KAAK,eAAe;AAGxC,QAAM,mBAAmB,MAAM,YAAkC;AAAA,IAC/D,GAAG;AAAA,IACH,UAAU,CAAC,UAAU,QAAQ,KAAK,oBAAoB,KAAK,iBAAiB,UAAU,GAAG;AAAA,IACzF,QAAQ,GAAG,KAAK,MAAM;AAAA,EACxB,CAAC;AAED,QAAM,cAA0E,CAAC;AACjF,QAAM,UAA8B,CAAC;AACrC,MAAI,kBAAoC,CAAC,KAAK,eAAe;AAC7D,MAAI,gBAAgB,KAAK;AACzB,MAAI,oBAAoB,YAAY,KAAK,eAAe;AACxD,MAAI,kBAAkBA,eAAc,gBAAgB;AAEpD,WAAS,MAAM,GAAG,MAAM,KAAK,gBAAgB,OAAO;AAElD,QAAI,KAAK,OAAO,SAAS,EAAE,QAAQ,CAAC,EAAE,KAAM;AAI5C,UAAM,aAAa,MAAM,KAAK,OAAO,QAAQ;AAAA,MAC3C,gBAAgB,gBAAgB,CAAC,KAAK,KAAK;AAAA,MAC3C;AAAA,MACA,UAAU,CAAC;AAAA,MACX,gBAAgB,KAAK;AAAA,MACrB,YAAY;AAAA,MACZ,QAAQ,IAAI,gBAAgB,EAAE;AAAA,MAC9B,QAAQ,KAAK;AAAA,MACb,SAAS,KAAK,gBAAgB,KAAK,iBAAiB,QAAQ,KAAK,eAAe;AAAA,MAChF,qBAAqB,KAAK;AAAA,IAC5B,CAAC;AAGD,UAAM,iBAKD,CAAC;AACN,aAAS,IAAI,GAAG,IAAI,WAAW,QAAQ,KAAK;AAC1C,YAAM,UAAU,WAAW,CAAC;AAC5B,YAAM,OAAO,YAAY,OAAO;AAChC,YAAM,WAAW,MAAM,YAAkC;AAAA,QACvD,GAAG;AAAA,QACH,UAAU,CAAC,UAAU,QAAQ,KAAK,oBAAoB,SAAS,UAAU,GAAG;AAAA,QAC5E,QAAQ,GAAG,KAAK,MAAM,QAAQ,GAAG,cAAc,CAAC;AAAA,MAClD,CAAC;AACD,YAAM,YAAYA,eAAc,QAAQ;AACxC,qBAAe,KAAK,EAAE,aAAa,MAAM,SAAS,UAAU,UAAU,CAAC;AAAA,IACzE;AAGA,mBAAe,KAAK,CAAC,GAAG,MAAM,EAAE,YAAY,EAAE,SAAS;AACvD,UAAM,WAAW,eAAe,MAAM,GAAG,WAAW;AACpD,sBAAkB,SAAS,IAAI,CAAC,MAAM,EAAE,OAAO;AAC/C,UAAM,MAAM,eAAe,CAAC;AAC5B,QAAI,OAAO,IAAI,YAAY,iBAAiB;AAC1C,sBAAgB,IAAI;AACpB,0BAAoB,IAAI;AACxB,wBAAkB,IAAI;AAAA,IACxB;AAEA,UAAM,SAA2B;AAAA,MAC/B,iBAAiB;AAAA,MACjB,YAAY,eAAe,IAAI,CAAC,MAAM;AACpC,cAAM,YAAY,mBAAmB,EAAE,QAAQ;AAC/C,eAAO;AAAA,UACL,aAAa,EAAE;AAAA,UACf,WAAW,EAAE;AAAA,UACb,MAAM,CAAC,EAAE,WAAW,EAAE,SAAS;AAAA,UAC/B,YAAY,UAAU;AAAA,UACtB,WAAW,UAAU;AAAA,QACvB;AAAA,MACF,CAAC;AAAA,MACD,UAAU,SAAS,IAAI,CAAC,MAAM,EAAE,WAAW;AAAA,IAC7C;AACA,YAAQ,KAAK,MAAM;AACnB,gBAAY,KAAK;AAAA,MACf;AAAA,MACA,UAAU,eAAe,IAAI,CAAC,OAAO;AAAA,QACnC,aAAa,EAAE;AAAA,QACf,SAAS,EAAE;AAAA,QACX,UAAU,EAAE;AAAA,MACd,EAAE;AAAA,IACJ,CAAC;AAAA,EACH;AAEA,SAAO;AAAA,IACL;AAAA,IACA;
AAAA,IACA;AAAA,IACA;AAAA,EACF;AACF;AAEO,SAAS,YAAY,SAAiC;AAG3D,QAAM,WACJ,OAAO,YAAY,WACf,UACA,KAAK,UAAU;AAAA,IACb,MAAM,QAAQ;AAAA,IACd,aAAa,QAAQ;AAAA,IACrB,SAAS,QAAQ,WAAW;AAAA,EAC9B,CAAC;AACP,SAAO,WAAW,QAAQ,EAAE,OAAO,QAAQ,EAAE,OAAO,KAAK,EAAE,MAAM,GAAG,EAAE;AACxE;AAEA,SAASA,eACP,UACQ;AACR,QAAM,aAAuB,CAAC;AAC9B,aAAW,QAAQ,SAAS,OAAO;AACjC,UAAM,iBAAiB,OAAO,OAAO,KAAK,WAAW,EAAE,IAAI,CAAC,MAAM,EAAE,SAAS;AAC7E,QAAI,eAAe,SAAS,GAAG;AAC7B,iBAAW,KAAK,eAAe,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,eAAe,MAAM;AAAA,IACnF;AAAA,EACF;AACA,SAAO,WAAW,WAAW,IAAI,IAAI,WAAW,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,WAAW;AAC1F;AAIA,SAAS,mBACP,UAIA;AACA,QAAM,UAAkC,CAAC;AACzC,QAAM,YAAoC,CAAC;AAC3C,QAAM,aAAa,oBAAI,IAAsB;AAC7C,aAAW,QAAQ,SAAS,OAAO;AACjC,UAAM,cAAc,OAAO,OAAO,KAAK,WAAW;AAClD,QAAI,YAAY,WAAW,EAAG;AAC9B,UAAM,gBAAgB,YAAY,OAAO,CAAC,GAAG,MAAM,IAAI,EAAE,WAAW,CAAC,IAAI,YAAY;AACrF,UAAM,MAAM,WAAW,IAAI,KAAK,UAAU,KAAK,CAAC;AAChD,QAAI,KAAK,aAAa;AACtB,eAAW,IAAI,KAAK,YAAY,GAAG;AACnC,eAAW,SAAS,aAAa;AAC/B,iBAAW,CAAC,KAAK,KAAK,KAAK,OAAO,QAAQ,MAAM,UAAU,GAAG;AAC3D,gBAAQ,GAAG,KAAK,QAAQ,GAAG,KAAK,KAAK;AACrC,kBAAU,GAAG,KAAK,UAAU,GAAG,KAAK,KAAK;AAAA,MAC3C;AAAA,IACF;AAAA,EACF;AACA,QAAM,aAAqC,CAAC;AAC5C,aAAW,OAAO,OAAO,KAAK,OAAO,GAAG;AACtC,UAAM,QAAQ,UAAU,GAAG,KAAK;AAChC,eAAW,GAAG,IAAI,QAAQ,KAAK,QAAQ,GAAG,KAAK,KAAK,QAAQ;AAAA,EAC9D;AACA,QAAM,YAAY,CAAC,GAAG,WAAW,QAAQ,CAAC,EAAE,IAAI,CAAC,CAAC,YAAY,KAAK,OAAO;AAAA,IACxE;AAAA,IACA,WAAW,MAAM,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,MAAM;AAAA,EACtD,EAAE;AACF,SAAO,EAAE,YAAY,UAAU;AACjC;;;ACxKA,eAAsB,mBACpB,MACyD;AAGzD,MAAK,KAAa,kBAAkB,UAAU;AAC5C,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AAKA,MAAI,KAAK,YAAY,SAAS,KAAK,QAAQ;AACzC,UAAM,IAAI;AAAA,MACR;AAAA,IACF;AAAA,EACF;AACA,MAAI,KAAK,kBAAkB,SAAS,CAAC,KAAK,WAAW,CAAC,KAAK,SAAS;AAClE,UAAM,IAAI,MAAM,mEAAmE;AAAA,EACrF;AAGA,QAAM,eAAe,MAAM,gBAAgB,IAAI;AAG/C,QAAM,EAAE,aAAAC,aAAY,IAAI,MAAM,OAAO,4BAAiB;AAEtD,QAAM,oBAAoB,MAAMA,aAAkC;AAAA,IAChE,GAAG;AAAA,IACH,WAAW,KAAK;AAAA,IAChB,UAAU,CAAC,UAAU,QAAQ,KAAK,oBAAoB,KAAK,iBAAiB,UAAU,GAAG;AAAA,IACzF,Q
AAQ,GAAG,KAAK,MAAM;AAAA,EACxB,CAAC;AAED,QAAM,kBAAkB,MAAMA,aAAkC;AAAA,IAC9D,GAAG;AAAA,IACH,WAAW,KAAK;AAAA,IAChB,UAAU,CAAC,UAAU,QACnB,KAAK,oBAAoB,aAAa,eAAe,UAAU,GAAG;AAAA,IACpE,QAAQ,GAAG,KAAK,MAAM;AAAA,EACxB,CAAC;AAUD,QAAM,qBAAqB,oBAAI,IAAuB;AACtD,QAAM,oBAAoB,oBAAI,IAAuB;AACrD,QAAM,cAAwB,oBAAI,IAAI;AACtC,QAAM,sBAAgC,oBAAI,IAAI;AAC9C,aAAW,QAAQ,gBAAgB,OAAO;AACxC,uBAAmB,IAAI,KAAK,QAAQ,KAAK,QAAQ;AACjD,gBAAY,IAAI,KAAK,QAAQ,KAAK,WAAW;AAAA,EAC/C;AACA,aAAW,QAAQ,kBAAkB,OAAO;AAC1C,sBAAkB,IAAI,KAAK,QAAQ,KAAK,QAAQ;AAChD,wBAAoB,IAAI,KAAK,QAAQ,KAAK,WAAW;AAAA,EACvD;AAEA,QAAM,aAAa,MAAM,KAAK,KAAK,OAAO;AAAA,IACxC;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA,WAAW,KAAK;AAAA,IAChB,MAAM;AAAA,MACJ,WAAW,gBAAgB,WAAW;AAAA,MACtC,UAAU,kBAAkB,WAAW;AAAA,IACzC;AAAA,IACA,QAAQ,IAAI,gBAAgB,EAAE;AAAA,EAChC,CAAC;AAGD,MAAI;AACJ,MAAI,KAAK,kBAAkB,QAAQ,WAAW,aAAa,QAAQ;AACjE,UAAM,SAAS,KAAK,sBAAsB;AAC1C,UAAM,eAAe,OAAO,aAAa,eAAe,KAAK,eAAe;AAC5E,eAAW,WAAW;AAAA,MACpB,QAAQ;AAAA,MACR,MAAM;AAAA,MACN;AAAA,MACA,SAAS,KAAK;AAAA,MACd,QAAQ,KAAK;AAAA,IACf,CAAC;AAAA,EACH;AAEA,SAAO;AAAA,IACL,GAAG;AAAA,IACH;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF;AACF;AAEA,SAAS,kBAAkB,eAA+B,iBAAyC;AAGjG,MAAI,OAAO,kBAAkB,YAAY,OAAO,oBAAoB,UAAU;AAC5E,UAAM,MAAM,CAAC,MACX,OAAO,MAAM,WACT,qBACA,YAAY,EAAE,WAAW,GAAG,EAAE,UAAU,SAAS,EAAE,OAAO,KAAK,EAAE,GAAG,EAAE,UAAU;AAAA,EAAK,EAAE,OAAO,KAAK,EAAE;AAC3G,WAAO;AAAA,EAAiB,IAAI,eAAe,CAAC;AAAA;AAAA,EAAiB,IAAI,aAAa,CAAC;AAAA,EACjF;AACA,QAAM,QAAkB,CAAC;AACzB,QAAM,KAAK,cAAc;AACzB,QAAM,KAAK,YAAY;AACvB,aAAW,KAAK,gBAAgB,MAAM,IAAI,EAAG,OAAM,KAAK,KAAK,CAAC,EAAE;AAChE,aAAW,KAAK,cAAc,MAAM,IAAI,EAAG,OAAM,KAAK,KAAK,CAAC,EAAE;AAC9D,SAAO,MAAM,KAAK,IAAI;AACxB;","names":["meanComposite","runCampaign"]}

package/dist/contract/index.d.ts CHANGED Viewed

@@ -1,7 +1,7 @@
 import { S as Scenario, M as MutableSurface, b as DispatchContext, a as JudgeConfig, I as ImprovementDriver, G as Gate } from '../types-Dbj5gu8n.js';
 export { f as CampaignAggregates, g as CampaignArtifactWriter, h as CampaignCellResult, i as CampaignCostMeter, j as CampaignResult, k as CampaignTraceWriter, C as CodeSurface, D as Dispatch, l as GateContext, m as GateDecision, n as GateResult, o as GenerationCandidate, p as GenerationRecord, r as JudgeDimension, J as JudgeScore, t as Mutator, O as OptimizerConfig, v as SessionScript } from '../types-Dbj5gu8n.js';
-import { C as CampaignStorage, d as RunImprovementLoopResult } from '../run-improvement-loop-BPMjNKMJ.js';
-export { D as DefaultProductionGateOptions, E as EvolutionaryDriverOptions, G as GepaDriverOptions, H as HeldOutGateOptions, R as RunCampaignOptions, b as RunEvalOptions, c as RunImprovementLoopOptions, g as composeGate, h as defaultProductionGate, i as evolutionaryDriver, j as fsCampaignStorage, k as gepaDriver, l as heldOutGate, m as inMemoryCampaignStorage, r as runCampaign, n as runEval, p as runImprovementLoop } from '../run-improvement-loop-BPMjNKMJ.js';
+import { C as CampaignStorage, e as RunImprovementLoopResult } from '../run-improvement-loop-Cc7oZlRP.js';
+export { D as DefaultProductionGateOptions, E as EvolutionaryDriverOptions, a as GepaDriverOptions, H as HeldOutGateOptions, R as RunCampaignOptions, c as RunEvalOptions, d as RunImprovementLoopOptions, h as composeGate, j as defaultProductionGate, k as evolutionaryDriver, m as fsCampaignStorage, n as gepaDriver, o as heldOutGate, p as inMemoryCampaignStorage, r as runCampaign, s as runEval, t as runImprovementLoop } from '../run-improvement-loop-Cc7oZlRP.js';
 export { D as DeploymentOutcome, F as FileSystemOutcomeStore, b as FileSystemOutcomeStoreOptions, I as InMemoryOutcomeStore, O as OutcomeStore } from '../outcome-store-D6KWmYvj.js';
 import { a as HostedTenant, I as InsightReport, T as TraceSpanEvent } from '../index-DQHtWQ57.js';
 export { F as FailureClusterInsight, b as InterRaterInsight, J as JudgeInsight, L as LiftInsight, O as OutcomeCorrelationInsight, R as Recommendation, c as ReleaseSummary, S as ScalarDistribution } from '../index-DQHtWQ57.js';

package/dist/contract/index.js CHANGED Viewed

@@ -6,7 +6,7 @@ import {
   heldOutGate,
   runEval,
   runImprovementLoop
-} from "../chunk-XAP6DJZE.js";
+} from "../chunk-YXD7GWJI.js";
 import {
   fsCampaignStorage,
   inMemoryCampaignStorage,

package/dist/openapi.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "openapi": "3.1.0",
   "info": {
     "title": "@tangle-network/agent-eval — wire protocol",
-    "version": "0.51.0",
+    "version": "0.52.0",
     "description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
     "contact": {
       "name": "Tangle Network",

package/dist/{run-improvement-loop-BPMjNKMJ.d.ts → run-improvement-loop-Cc7oZlRP.d.ts} RENAMED Viewed

@@ -79,25 +79,48 @@ declare function evolutionaryDriver<TFindings = unknown>(opts: EvolutionaryDrive
  *
  * `gepaDriver` — a reflective `ImprovementDriver` for prompt-tier surfaces.
  * Each generation it reflects on the prior best candidate's per-scenario
- * scores + weakest dimensions (the `GenerationCandidate` evidence from
- * `runOptimization`), asks an LLM to propose targeted rewrites of the current
- * surface, and returns them as the next population.
+ * scores + weakest dimensions, asks an LLM to propose targeted rewrites of
+ * the current surface, and returns them as the next population.
  *
- * This is the substrate's best-in-class prompt optimizer: surface-agnostic, so
- * ANY string surface in ANY consumer opts in by selecting it — system prompts,
- * prompt addenda, judge/reviewer prompts, even a driver's own reflection
- * prompt. It reuses the generic reflection primitive (`buildReflectionPrompt` /
- * `parseReflectionResponse`) and the router client; it has NO dependency on the
- * legacy `runMultiShotOptimization` / `prompt-evolution` orchestration.
+ * Honest scope vs the GEPA paper (Agrawal et al., arXiv:2507.19457):
+ * this driver implements the *reflection* primitive — it does NOT implement
+ * GEPA's Pareto frontier of candidates, multi-objective non-dominated
+ * tracking, or the combine-complementary-lessons step. We use "best by
+ * composite" as the parent each generation; the paper retains a Pareto set
+ * and combines lessons across non-dominated candidates. Tracked as #101 in
+ * the substrate roadmap. See `docs/specs/driver-honest-spec.md`.
  *
- * It earns its keep where there is real per-instance signal (which the
+ * Optional `constraints` move structured-doc guards into the driver
+ * (preserve H2 section headings, cap sentence-level edits) — useful when
+ * the surface IS a structured procedure like a SKILL.md / runbook /
+ * judge rubric. When `constraints` is omitted, behavior is unchanged.
+ *
+ * The driver is surface-agnostic — any string surface in any consumer opts
+ * in by selecting it. Reuses the generic reflection primitive
+ * (`buildReflectionPrompt` / `parseReflectionResponse`) and the router
+ * client; no dependency on the legacy `runMultiShotOptimization` /
+ * `prompt-evolution` orchestration.
+ *
+ * Earns its keep where there is real per-instance signal (which the
  * dimensional + per-scenario evidence + the `LabeledScenarioStore` flywheel
- * now provide). For thin-signal surfaces it degrades to plain reflection — so
- * it is a SELECTABLE driver, never a forced default. On generation 0 (no
- * history) it reflects on the current surface against the mutation primitives
- * alone.
+ * now provide). For thin-signal surfaces it degrades to plain reflection.
+ * On generation 0 (no history) it reflects on the current surface against
+ * the mutation primitives alone.
  */
+interface GepaDriverConstraints {
+    /** H2 section headings that MUST appear unchanged in every candidate.
+     *  When provided: an empty array makes the driver auto-detect the
+     *  surface's current H2s, and any candidate that drops or renames a
+     *  preserved heading is rejected.
+     *  Use when the surface is a structured doc (SKILL.md, runbook,
+     *  sectioned system prompt, judge rubric). */
+    preserveSections?: string[];
+    /** Maximum sentence-level edits per candidate vs the parent surface.
+     *  Rejection threshold = maxSentenceEdits × 2 (counts adds + removes).
+     *  Inspired by SkillOpt's edit-budget as a "textual learning rate."
+     *  Cap prevents an LLM rewrite from overwriting useful prior rules. */
+    maxSentenceEdits?: number;
+}
 interface GepaDriverOptions {
     /** Router transport (apiKey/baseUrl). */
     llm: LlmClientOptions;
@@ -113,8 +136,18 @@ interface GepaDriverOptions {
     temperature?: number;
     /** Reflection max tokens. Default 6000. */
     maxTokens?: number;
+    /** Structured-doc constraints. Candidates violating any are rejected
+     *  post-parse and dropped from the returned population. */
+    constraints?: GepaDriverConstraints;
 }
 declare function gepaDriver(opts: GepaDriverOptions): ImprovementDriver;
+/** Extract H2 headings (`## Foo`) from a markdown surface. Exported for
+ *  consumers building custom mutators that share the same invariant. */
+declare function extractH2Sections(text: string): string[];
+/** Sentence-level edit distance — count distinct add/remove ops between
+ *  two surfaces via a normalised line-by-line set diff. Treats trivial
+ *  whitespace as identical. Exported for tests + consumer-side validators. */
+declare function countSentenceEdits(baseline: string, candidate: string): number;
 /**
  * @experimental
@@ -414,4 +447,4 @@ interface RunImprovementLoopResult<TArtifact, TScenario extends Scenario> extend
 }
 declare function runImprovementLoop<TScenario extends Scenario, TArtifact>(opts: RunImprovementLoopOptions<TScenario, TArtifact>): Promise<RunImprovementLoopResult<TArtifact, TScenario>>;
-export { type CampaignStorage as C, type DefaultProductionGateOptions as D, type EvolutionaryDriverOptions as E, type GepaDriverOptions as G, type HeldOutGateOptions as H, type OpenAutoPrOptions as O, type RunCampaignOptions as R, type OpenAutoPrResult as a, type RunEvalOptions as b, type RunImprovementLoopOptions as c, type RunImprovementLoopResult as d, type RunOptimizationOptions as e, type RunOptimizationResult as f, composeGate as g, defaultProductionGate as h, evolutionaryDriver as i, fsCampaignStorage as j, gepaDriver as k, heldOutGate as l, inMemoryCampaignStorage as m, runEval as n, openAutoPr as o, runImprovementLoop as p, runOptimization as q, runCampaign as r, surfaceHash as s };
+export { type CampaignStorage as C, type DefaultProductionGateOptions as D, type EvolutionaryDriverOptions as E, type GepaDriverConstraints as G, type HeldOutGateOptions as H, type OpenAutoPrOptions as O, type RunCampaignOptions as R, type GepaDriverOptions as a, type OpenAutoPrResult as b, type RunEvalOptions as c, type RunImprovementLoopOptions as d, type RunImprovementLoopResult as e, type RunOptimizationOptions as f, type RunOptimizationResult as g, composeGate as h, countSentenceEdits as i, defaultProductionGate as j, evolutionaryDriver as k, extractH2Sections as l, fsCampaignStorage as m, gepaDriver as n, heldOutGate as o, inMemoryCampaignStorage as p, openAutoPr as q, runCampaign as r, runEval as s, runImprovementLoop as t, runOptimization as u, surfaceHash as v };

package/docs/specs/driver-honest-spec.md ADDED Viewed

@@ -0,0 +1,251 @@
+# Driver Honest Spec — what each driver IS, what each methodology actually is, where we deviate
+**Status:** Living document. Updated when we learn the truth from primary sources.
+**Date:** 2026-05-27
+This document exists because the project shipped two drivers with methodology names attached (`gepaDriver`, `skillOptDriver`) without the methodology specs being precisely encoded anywhere in the repo. That created an integrity gap. This doc closes it.
+Every claim in this doc is sourced from a primary reference (a paper, upstream code, or our own directly verifiable source). Marketing language is forbidden. If something is not implemented, we say so.
+---
+## Part 1 — GEPA (the paper)
+**Source**: Agrawal et al., *"GEPA: Reflective Prompt Evolution Can Outperform Reinforcement Learning"*, arXiv:2507.19457, July 2025.
+### What GEPA actually does
+Outer loop (verbatim from abstract; emphasis added): "samples trajectories (e.g., reasoning, tool calls, and tool outputs) and reflects on them in natural language to diagnose problems, propose and test prompt updates, and combine complementary lessons from the **Pareto frontier of its own attempts**."
+Named primitives in the paper:
+- **GEPA** (Genetic-Pareto) — the overall optimizer
+- **Pareto frontier** — non-dominated candidate set retained across iterations
+- **Prompt updates** — mutations proposed by reflection
+- **Rollouts** — trajectory samples
+### What gepaDriver in our substrate ACTUALLY does
+Source: `src/campaign/drivers/gepa.ts` (132 lines)
+- Single LLM call per `propose()` invocation
+- Input: prior generation's **single best candidate by composite score** + that candidate's top/bottom scenarios + 3 weakest dimensions (`buildEvidence`)
+- Output: N proposals, each a full document rewrite
+- Dedup by exact text equality
+### Deviations from the GEPA paper
+| GEPA paper | Our `gepaDriver` |
+|---|---|
+| **Pareto frontier** of candidates | **Single "best by composite"** — no Pareto set, no non-dominated tracking |
+| **Combine complementary lessons** from frontier | Each generation reflects on ONE prior candidate; no combination |
+| Multi-objective optimization | Single-objective (composite score) |
+| Genetic operators (mutation, crossover) | Reflection only — no crossover |
+| Sample efficiency claim (35× fewer rollouts than GRPO) | Unmeasured against any baseline |
+**Honest assessment**: our `gepaDriver` is a **reflective full-rewrite driver**, not GEPA. It captures GEPA's *reflection* primitive but not its *Pareto* mechanism. The name oversells. A faithful renaming would be `reflectiveRewriteDriver`. A faithful implementation would add a Pareto candidate pool + combine step.
+---
+## Part 2 — SkillOpt (the paper + code)
+**Source**:
+- README: https://github.com/microsoft/SkillOpt
+- Source: `/tmp/SkillOpt/skillopt/` (cloned 2026-05-27)
+- Key files: `engine/trainer.py`, `optimizer/clip.py` (rank_and_select), `optimizer/update_modes.py`, `evaluation/gate.py`, `types.py`
+### What SkillOpt actually does
+**6-stage per-step pipeline** (verbatim from `trainer.py:516` and adjacent):
+1. **Rollout** — `adapter.rollout(train_env, current_skill, ...)` collects trajectories on a batch.
+2. **Reflect** — `adapter.reflect()` analyses trajectories and emits **structured patches** (NOT full rewrites in patch mode). Failure trials → failure patches; success trials → success patches.
+3. **Aggregate** — `merge_patches(current_skill, all_failure_patches, all_success_patches, batch_size=merge_bs)` — hierarchically merges patches across accumulated batches.
+4. **Select** — `rank_and_select(current_skill, merged_patch, max_edits=edit_budget)` — if edit pool > budget, calls an optimizer LLM to **rank edits by importance** and keep top-L. Budget is "analogous to gradient clipping" (their words).
+5. **Update** — apply patch in one of 3 modes:
+   - **`patch`** — deterministic diff apply via `apply_patch_with_report()`; ops are `append | insert_after | replace | delete`
+   - **`rewrite_from_suggestions`** — LLM regenerates full skill from suggestions
+   - **`full_rewrite_minibatch`** — reflection directly emits complete candidate skills; select picks the best
+6. **Evaluate & Gate** — runs candidate on selection set, calls `evaluate_gate(cand_hard, current_score, best_score)`. Returns `accept_new_best | accept | reject` from a **literal `cand_hard > current_score`** comparison (`evaluation/gate.py:38`). No statistical test.
+Plus epoch-level stages:
+- **Slow update** — `run_slow_update()` builds longitudinal pairs across epochs.
+- **Meta skill** — `run_meta_skill()` produces optimizer-side memory of patterns across adjacent epochs.
+### Canonical patch shape (from `types.py:22-45`)
+```python
+EditOp = Literal["append", "insert_after", "replace", "delete"]
+@dataclass
+class Edit:
+    op: EditOp
+    content: str
+    target: str  # for replace/delete/insert_after
+    support_count: int | None  # how many trials voted for this edit
+    source_type: Literal["failure", "success"] | None
+    merge_level: int | None
+@dataclass
+class Patch:
+    edits: list[Edit]
+    reasoning: str
+    ranking_details: dict | None
+```
+### What `skillOptDriver` v0.51.0 in our substrate ACTUALLY does
+Source: `src/campaign/drivers/skillopt.ts` (current as of 0.51.0)
+- Single LLM call per `propose()` returning N full document rewrites
+- Post-parse rejection on: (a) any H2 header dropped, (b) sentence-edit count > editBudget × 2
+- Substantively equivalent to `gepaDriver` + 2 validation constraints
+### Deviations from SkillOpt
+| SkillOpt actual | Our 0.51.0 `skillOptDriver` |
+|---|---|
+| 6-stage pipeline (rollout → reflect → aggregate → select → update → gate) | Single LLM call → N rewrites |
+| **Patch-based edits** (`{op, target, content, support_count, source_type}`) | Full document rewrites only |
+| `merge_patches()` hierarchical merge across batches | No aggregation; each `propose()` is independent |
+| `rank_and_select(max_edits=edit_budget)` LLM-ranking of edits | All candidates that pass validation are returned |
+| 3 update modes (`patch`, `rewrite_from_suggestions`, `full_rewrite_minibatch`) | Only `full_rewrite_minibatch`-equivalent |
+| `evaluate_gate()` with `accept_new_best/accept/reject` codes | Substrate's outer gate decides ship/hold/inspect; driver doesn't see fine-grained accept signal |
+| Longitudinal `slow_update` across epochs | Not implemented |
+| `meta_skill` optimizer-side memory | Not implemented |
+| Selection-set cache (`sel_cache`) for repeated candidate hashes | Not implemented |
+| Edit-budget LR scheduler (constant / linear / cosine / autonomous) | Single fixed `editBudget` |
+| Mini-batch accumulation (`steps_per_epoch`, `merge_batch_size`) | Not implemented |
+| `decide_autonomous_learning_rate()` | Not implemented |
+| `longitudinal_pair_policy` (mixed / changed / unchanged) | Not implemented |
+**Honest assessment**: 13 substantive deviations. `skillOptDriver` 0.51.0 is **not** SkillOpt. It is `gepaDriver` with two post-validation constraints (section preservation, sentence-edit count). The methodology name oversells the implementation.
+### One thing where we are STRICTER than SkillOpt
+**The gate.** SkillOpt: literal `cand_hard > current_score` (`evaluation/gate.py:38`). Our substrate: paired bootstrap + 95% CI + Cohen's d + MDE + p-value (`defaultProductionGate`). When the lift CI straddles zero, our gate returns `hold` / `inspect`. SkillOpt would accept any improvement at all, even single-sample noise.
+This is real differentiation we have not been crediting ourselves for.
+---
+## Part 3 — Hermes Agent's "self-improvement"
+**Source**: `/tmp/hermes-agent/` (cloned 2026-05-27)
+- `agent/curator.py` (the actual loop)
+- `agent/skill_commands.py`
+- `agent/skill_utils.py`
+### What Hermes actually does
+From `curator.py` line 1: "Curator — background skill maintenance orchestrator. The curator is an auxiliary-model task that periodically reviews agent-created skills and maintains the collection."
+Trigger: idle-driven, with default `DEFAULT_INTERVAL_HOURS = 24 * 7` (7 days). When the agent has been idle for `DEFAULT_MIN_IDLE_HOURS = 2` and the last curator run was > 7 days ago, `maybe_run_curator()` spawns a forked AIAgent.
+What the curator does:
+- "Auto-transition lifecycle states based on derived skill activity timestamps"
+- "Spawn a background review agent that can **pin / archive / consolidate / patch** agent-created skills via `skill_manage`"
+- "Persist curator state (last_run_at, paused, etc.) in `.curator_state`"
+Strict invariants:
+- Only touches agent-created skills
+- "Never auto-deletes — only archives"
+- Pinned skills bypass auto-transitions
+- Uses the auxiliary client (separate from main session)
+### Hermes' actual gate
+**There is none.** The curator is an LLM editor making editorial decisions. There is no:
+- Held-out validation
+- Performance comparison between old and new skill versions
+- Statistical test
+- Rejection-on-regression mechanism
+Skills are refined by an LLM looking at usage patterns; the refinement is accepted because the LLM proposed it.
+### Honest assessment
+Hermes has a **skill curation system**, not a self-improvement loop. The README's claim "the only agent with a built-in learning loop" is generous — it's an idle-triggered LLM librarian that runs at most once every 7 days. There's no measurable guarantee that today's curated skill collection performs better than yesterday's.
+Compare:
+| Component | Hermes | SkillOpt | Tangle |
+|---|---|---|---|
+| Validation gate | None | `>` | Paired bootstrap CI |
+| Patch-level edits | No (LLM rewrites whole skill) | Yes | No (full rewrite only) |
+| Skill ranking / selection | No | Yes | No |
+| Sample efficiency claim | None | 35× vs GRPO (NOTE: Part 1 attributes this figure to the GEPA paper — verify it is also SkillOpt's claim) | None |
+| Frequency | 7-day cron | Per training step | Per `selfImprove()` call |
+Where Tangle WINS: the gate. Where SkillOpt WINS: the pipeline sophistication. Where Hermes WINS: the deployment story (multi-platform, multi-tool-backend).
+---
+## Part 4 — What we should actually do
+### Phase A — rename to honest names (0.51.1, this session)
+The current `skillOptDriver` and `gepaDriver` names overclaim. Options:
+1. **Rename both:**
+   - `gepaDriver` → `reflectiveRewriteDriver` (drops the "Pareto" implication)
+   - `skillOptDriver` → `constrainedReflectiveDriver` (drops the SkillOpt-methodology implication)
+   - Reserve `gepaDriver` + `skillOptDriver` for faithful implementations
+2. **Keep `gepaDriver` name** (it's our most-used driver; renaming is disruptive); rename `skillOptDriver`.
+3. **Keep both names; add `@experimental` + a "differs from paper" docstring section.** Cheapest. Truthful enough.
+Recommendation: **option 3 plus a frontmatter "deviations from paper" section** in each driver source file. Empirically test before renaming.
+### Phase B — build the honest empirical harness (0.51.1, this session)
+`tests/driver-empirical.bench.ts` — for each driver:
+- Same scenarios (5 synthetic + 5 real legal-agent scenarios)
+- Same judge
+- Same `baselineSurface`
+- Same `budget` (1 gen, 3 candidates, holdout 0.3)
+- Report: lift mean, lift CI95, p-value, rollouts spent, cost spent (USD)
+Drivers in the matrix:
+- `gepaDriver` (current full-rewrite reflection)
+- `skillOptDriver` (current 0.51.0 full-rewrite + constraints)
+- Future: real `skillOptDriverV2` with patch mode
+This is the **falsifiable test** of whether our drivers' methodology claims are worth the names.
+### Phase C — implement SkillOpt patch mode for real (0.52.0)
+Build `skillOptDriverV2` with:
+1. **`Edit` type matching SkillOpt's**: `{op: 'append'|'insert_after'|'replace'|'delete', content, target?, support_count?, source_type?}`
+2. **Reflect step emits patches**, not full rewrites
+3. **`mergePatches()`** — LLM-driven hierarchical merge of failure + success patches
+4. **`rankAndSelect()`** — LLM-driven ranking when edit pool > budget
+5. **Deterministic `applyPatch()`** — string ops, no LLM
+6. **Keep our gate** (paired bootstrap CI). Don't downgrade to SkillOpt's `>` — that's our edge.
+Estimated scope: 400-600 lines + tests.
+### Phase D — implement GEPA's Pareto frontier (0.53.0)
+Build `gepaDriverV2` with:
+1. **Candidate pool** retained across generations (non-dominated)
+2. **Multi-objective evaluation** (composite + cost + length + diversity)
+3. **Combine step** — LLM combines lessons from non-dominated candidates
+4. Keep reflection.
+5. Sample-efficiency target: match the paper's ~35× claim on a benchmark we choose.
+Estimated scope: 500-800 lines + tests.
+---
+## Source pointers (audit trail)
+- GEPA paper: https://arxiv.org/abs/2507.19457
+- SkillOpt repo: https://github.com/microsoft/SkillOpt (cloned at `/tmp/SkillOpt/` 2026-05-27)
+- Hermes repo: https://github.com/NousResearch/hermes-agent (cloned at `/tmp/hermes-agent/` 2026-05-27)
+- Our gepaDriver: `src/campaign/drivers/gepa.ts`
+- Our skillOptDriver: `src/campaign/drivers/skillopt.ts`
+- Our gate: `src/campaign/gates/default-production-gate.ts`
+- Our reflection primitive: `src/reflective-mutation.ts`
+Update this doc when:
+- We discover new behavior in any of the upstream methods (via reading their code, not their READMEs)
+- We ship a driver that closes one of the named gaps
+- We run the empirical harness and have real numbers to add