npm - @tangle-network/agent-eval - Versions diffs - 0.38.0 → 0.40.2 - Mend

@tangle-network/agent-eval 0.38.0 → 0.40.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (62) hide show

package/dist/campaign/index.d.ts +775 -0
package/dist/campaign/index.js +807 -0
package/dist/campaign/index.js.map +1 -0
package/dist/chunk-5U2DOJU4.js +565 -0
package/dist/chunk-5U2DOJU4.js.map +1 -0
package/dist/{chunk-KE7TDJUO.js → chunk-AU2JLNSZ.js} +2 -2
package/dist/{chunk-TSPOEDM3.js → chunk-BWZEGTES.js} +2 -5
package/dist/chunk-BWZEGTES.js.map +1 -0
package/dist/{chunk-3HYQXPC2.js → chunk-DMW5VENN.js} +3 -3
package/dist/{chunk-TQL7BAOY.js → chunk-EGIPWXHL.js} +2 -2
package/dist/chunk-GGE4NNQT.js +65 -0
package/dist/chunk-GGE4NNQT.js.map +1 -0
package/dist/{chunk-7PR3WPWE.js → chunk-L7XMNXLO.js} +2 -2
package/dist/{chunk-RL6TERL2.js → chunk-LCIDRYGP.js} +3 -3
package/dist/{chunk-L5UNCDAJ.js → chunk-MAOZCN36.js} +2 -64
package/dist/chunk-MAOZCN36.js.map +1 -0
package/dist/{chunk-LGAPK7NA.js → chunk-NKLGKF2Q.js} +2 -2
package/dist/chunk-TMXPFWC7.js +305 -0
package/dist/chunk-TMXPFWC7.js.map +1 -0
package/dist/{chunk-KHZRNY3F.js → chunk-WP7SY7AI.js} +5 -4
package/dist/chunk-WP7SY7AI.js.map +1 -0
package/dist/chunk-YV7J7X5N.js +313 -0
package/dist/chunk-YV7J7X5N.js.map +1 -0
package/dist/{control-DVrmvM_k.d.ts → control-CmLJk3IG.d.ts} +1 -1
package/dist/control.d.ts +3 -3
package/dist/control.js +2 -2
package/dist/{dataset-ueRVTUoY.d.ts → dataset-BlwAtYYf.d.ts} +1 -1
package/dist/{feedback-trajectory-iATEAHmc.d.ts → feedback-trajectory-Dvy-bt7x.d.ts} +1 -1
package/dist/governance/index.d.ts +133 -5
package/dist/index.d.ts +35 -34
package/dist/index.js +97 -630
package/dist/index.js.map +1 -1
package/dist/multishot/index.d.ts +21 -21
package/dist/multishot/index.js +64 -15
package/dist/multishot/index.js.map +1 -1
package/dist/openapi.json +1 -1
package/dist/optimization.d.ts +2 -2
package/dist/optimization.js +5 -5
package/dist/pipelines/index.js +2 -2
package/dist/red-team-30II1T4o.d.ts +63 -0
package/dist/{release-report-D2ykiLSe.d.ts → release-report-Di84bXD7.d.ts} +5 -2
package/dist/reporting.d.ts +2 -2
package/dist/reporting.js +3 -3
package/dist/rl.js +15 -315
package/dist/rl.js.map +1 -1
package/dist/run-campaign-JYJXYHHL.js +10 -0
package/dist/run-campaign-JYJXYHHL.js.map +1 -0
package/dist/traces.js +7 -5
package/dist/wire/index.d.ts +2 -2
package/docs/design/loop-taxonomy.md +233 -0
package/docs/design/self-improvement-engine.md +130 -0
package/package.json +33 -24
package/dist/chunk-KHZRNY3F.js.map +0 -1
package/dist/chunk-L5UNCDAJ.js.map +0 -1
package/dist/chunk-TSPOEDM3.js.map +0 -1
package/dist/index-CN2agEaO.d.ts +0 -191
/package/dist/{chunk-KE7TDJUO.js.map → chunk-AU2JLNSZ.js.map} +0 -0
/package/dist/{chunk-3HYQXPC2.js.map → chunk-DMW5VENN.js.map} +0 -0
/package/dist/{chunk-TQL7BAOY.js.map → chunk-EGIPWXHL.js.map} +0 -0
/package/dist/{chunk-7PR3WPWE.js.map → chunk-L7XMNXLO.js.map} +0 -0
/package/dist/{chunk-RL6TERL2.js.map → chunk-LCIDRYGP.js.map} +0 -0
/package/dist/{chunk-LGAPK7NA.js.map → chunk-NKLGKF2Q.js.map} +0 -0

package/dist/campaign/index.d.ts ADDED Viewed

@@ -0,0 +1,775 @@
+import { RunRecord } from '@tangle-network/agent-runtime';
+import { R as RedTeamCase } from '../red-team-30II1T4o.js';
+import '../dataset-BlwAtYYf.js';
+import '../errors-mje_cKOs.js';
+import '../store-Db2Bv8Cf.js';
+/**
+ * @experimental
+ *
+ * Pass A substrate types — `runCampaign` is the one primitive every
+ * eval flow composes from. Three contracts in this file:
+ *
+ *   - `Scenario`            input set
+ *   - `DispatchFn`          how to run one scenario → artifact
+ *   - `CampaignResult`      defined output schema (the contract downstream tools depend on)
+ *
+ * Three more lifted from earlier substrate work (re-exported):
+ *
+ *   - `JudgeConfig`         pluggable dimensional scorer (0.38)
+ *   - `Mutator`             optimization-loop surface mutator
+ *   - `Gate`                promotion gate (`HeldOutGate` and friends adapt to this)
+ *
+ * No new architecture vs 0.38 — Pass A formalizes the shapes so consumers
+ * can build dashboards / CI gates / regression diffs against a stable schema.
+ */
+/** @experimental Stable identifier + kind tag for any scenario. Consumers
+ *  extend with their per-domain payload (persona, task, requirement, ...). */
+interface Scenario {
+    id: string;
+    kind: string;
+    tags?: string[];
+}
+/** @experimental Context handed to every dispatch invocation. Scoped — every
+ *  trace/span carries the cellId, every artifact write lands under the cell's
+ *  artifact root, the cost meter accumulates per cell. */
+interface DispatchContext {
+    cellId: string;
+    rep: number;
+    generation?: number;
+    seed: number;
+    signal: AbortSignal;
+    trace: CampaignTraceWriter;
+    artifacts: CampaignArtifactWriter;
+    cost: CampaignCostMeter;
+    /** Populated when this run is part of a multi-cycle improvement loop. */
+    cycleId?: string;
+    /** Populated when the substrate resumed from a prior cache hit. */
+    resumedFrom?: string;
+}
+/** @experimental One function: scenario + ctx → artifact. Dispatcher chooses
+ *  whether to call `runMultishot`, `runLoop`, raw `streamPrompt`, anything. */
+type DispatchFn<TScenario extends Scenario, TArtifact> = (scenario: TScenario, ctx: DispatchContext) => Promise<TArtifact>;
+/** @experimental One session within a multi-session journey. Dispatch is
+ *  invoked once per session, in order; state from the prior session's
+ *  artifact is described as exposed via `ctx.priorSessionArtifact`.
+ *  NOTE(review): `DispatchContext` as declared in this file has no
+ *  `priorSessionArtifact` field — confirm where that state is surfaced. */
+interface SessionScript<TScenario, TArtifact> {
+    id: string;
+    intent: string;
+    maxTurns?: number;
+    /** When true, knowledge accumulated this session persists to the next session. */
+    affectsKnowledge?: boolean;
+    /** Optional per-session persona evolution — called after the session
+     *  resolves, receiving the artifact, the session index, and the scenario.
+     *  Returns the scenario (persona shape) used by the NEXT session. */
+    evolveAfterSession?: (artifact: TArtifact, sessionIndex: number, scenario: TScenario) => TScenario;
+}
+interface JudgeDimension {
+    /** JSON field name + score key. */
+    key: string;
+    /** Description shown in the judge's user prompt. */
+    description: string;
+}
+/** @experimental Pluggable dimensional scorer. Consumers register one per
+ *  scoring concern. `appliesTo` lets a judge run only on scenarios that match
+ *  (e.g., legal_citation judge only on legal scenarios). */
+interface JudgeConfig<TArtifact, TScenario extends Scenario = Scenario> {
+    name: string;
+    model?: string;
+    dimensions: JudgeDimension[];
+    systemPrompt: string;
+    buildPrompt: (input: {
+        artifact: TArtifact;
+        scenario: TScenario;
+    }) => string;
+    appliesTo?: (scenario: TScenario) => boolean;
+    apiKey?: string;
+    baseUrl?: string;
+}
+interface JudgeScore {
+    dimensions: Record<string, number>;
+    composite: number;
+    notes: string;
+}
+/** @experimental A tier-4 code surface — a candidate change to the agent's
+ *  IMPLEMENTATION, not its prompt. Produced by autoresearch (reads codebase +
+ *  trace findings → opens a worktree). Measured by checking out `worktreeRef`
+ *  and running the worker against the changed code. See the improvement-tier
+ *  table in `docs/design/loop-taxonomy.md`. */
+interface CodeSurface {
+    kind: 'code';
+    /** Worktree path or git ref holding the candidate code change. The
+     *  consumer's `dispatchWithSurface` checks this out before running. */
+    worktreeRef: string;
+    /** Base ref the change is measured against. Default: the repo's main. */
+    baseRef?: string;
+    /** Human summary of what changed — rendered into the auto-PR body. */
+    summary?: string;
+}
+/** @experimental The mutable surface a driver proposes. Tiers (see
+ *  `docs/design/loop-taxonomy.md`):
+ *   - `string`      — tiers 1-2: system-prompt addendum / serialized tool
+ *                     config. Cheap, reversible, text-diffable.
+ *   - `CodeSurface` — tier 4: an implementation change behind a worktree ref.
+ *  Tier 3 (knowledge) is owned by agent-knowledge and rides its own adapter,
+ *  not this type. */
+type MutableSurface = string | CodeSurface;
+/** @experimental Stateless surface mutation — given findings + current
+ *  surface, return N candidate surfaces. Pure transform, no generation
+ *  awareness. Reflective-mutation, `runMultiShotOptimization`, `AxGEPA`
+ *  conform. Wrapped by `evolutionaryDriver` to become an `ImprovementDriver`. */
+interface Mutator<TFindings = unknown> {
+    kind: string;
+    mutate(args: {
+        findings: TFindings[];
+        currentSurface: MutableSurface;
+        populationSize: number;
+        signal: AbortSignal;
+    }): Promise<MutableSurface[]>;
+}
+/** @experimental Everything a driver's `propose()` may read to plan the next
+ *  batch of candidates. The first six fields are always present; the rest are
+ *  optional context the loop supplies when available, so cheap drivers
+ *  (`evolutionaryDriver`) can ignore them while a code-tier `autoresearchDriver`
+ *  consumes the research report + dataset to drive a sandbox runLoop.
+ *  See `docs/design/self-improvement-engine.md`. */
+interface ProposeContext<TFindings = unknown> {
+    currentSurface: MutableSurface;
+    history: GenerationRecord[];
+    findings: TFindings[];
+    /** BREADTH: how many candidate surfaces to return this generation. */
+    populationSize: number;
+    generation: number;
+    signal: AbortSignal;
+    /** The Phase-2 research report (analyst findings + diff), produced AFTER the
+     *  trace analysts run. Opaque to the substrate — the driver that consumes it
+     *  types it. See the phase diagram in self-improvement-engine.md. */
+    report?: unknown;
+    /** Handle to all captured data — the driver samples traces / artifacts /
+     *  rewards here to ground its proposals. */
+    dataset?: LabeledScenarioStore;
+    /** DEPTH: max runLoop iterations the generating agent may take per candidate
+     *  (autoresearchDriver). 1 = single-shot; >1 = it may iterate on its own
+     *  change before handing it back to be measured. */
+    maxImprovementShots?: number;
+}
+/** @experimental A surface-improvement strategy — the DRIVER of the
+ *  improvement loop. Given the current best surface, the history of what's
+ *  been tried + scored, and any external findings, propose the next batch of
+ *  candidate surfaces to measure. Optionally decide to stop early.
+ *
+ *  The evolutionary mutator (`evolutionaryDriver`) and a reflective analyst
+ *  (`analystDriver`, consumer-wired from `@tangle-network/agent-runtime`'s
+ *  `runAnalystLoop`) are two drivers of the SAME loop — not two loops. The
+ *  loop body (`runOptimization`) and the gated promotion shell
+ *  (`runImprovementLoop`) are driver-agnostic. */
+interface ImprovementDriver<TFindings = unknown> {
+    kind: string;
+    /** Plan: propose N candidate surfaces for the next generation. */
+    propose(ctx: ProposeContext<TFindings>): Promise<MutableSurface[]>;
+    /** Decide: stop early when the driver judges the search converged or
+     *  exhausted. Default (omitted) runs all `maxGenerations`. */
+    decide?(args: {
+        history: GenerationRecord[];
+    }): {
+        stop: boolean;
+        reason?: string;
+    };
+}
+interface OptimizerConfig {
+    driver: ImprovementDriver;
+    populationSize: number;
+    maxGenerations: number;
+    surfaceExtractor: (profile: unknown) => MutableSurface;
+}
+/** @experimental Five-valued verdict taxonomy (MOSS-paper alignment). */
+type GateDecision = 'ship' | 'hold' | 'need_more_work' | 'model_ceiling' | 'arch_ceiling';
+interface GateContext<TArtifact, TScenario extends Scenario> {
+    candidateArtifacts: Map<string, TArtifact>;
+    baselineArtifacts?: Map<string, TArtifact>;
+    judgeScores: Map<string, Record<string, JudgeScore>>;
+    scenarios: TScenario[];
+    cost: {
+        candidate: number;
+        baseline: number;
+    };
+    signal: AbortSignal;
+}
+interface GateResult {
+    decision: GateDecision;
+    reasons: string[];
+    contributingGates: Array<{
+        name: string;
+        passed: boolean;
+        detail: unknown;
+    }>;
+    delta?: number;
+}
+/** @experimental Composable promotion gate. */
+interface Gate<TArtifact = unknown, TScenario extends Scenario = Scenario> {
+    name: string;
+    decide(ctx: GateContext<TArtifact, TScenario>): Promise<GateResult>;
+}
+/** @experimental Scoped trace writer handed to each dispatch — every span
+ *  auto-tagged with the cellId so traces filter cleanly. */
+interface CampaignTraceWriter {
+    span(name: string, attributes?: Record<string, unknown>): TraceSpan;
+    flush(): Promise<void>;
+}
+interface TraceSpan {
+    end(attributes?: Record<string, unknown>): void;
+    setAttribute(key: string, value: unknown): void;
+}
+/** @experimental Scoped artifact writer — `write(path, content)` lands under
+ *  `<runDir>/<cellId>/<path>`. */
+interface CampaignArtifactWriter {
+    write(path: string, content: string | Uint8Array): Promise<string>;
+    writeJson(path: string, value: unknown): Promise<string>;
+}
+/** @experimental Cell-scoped cost meter. Substrate auto-tracks LLM costs
+ *  via the cost-ledger backend hooks; consumers can record additional
+ *  spend (sandbox time, tool costs) via `observe`. */
+interface CampaignCostMeter {
+    observe(amountUsd: number, source: string): void;
+    current(): number;
+}
+/** @experimental Source tag — required on every store write. Used by the
+ *  default training-source filter (production-trace samples are NOT used as
+ *  training scenarios unless explicitly opted in). */
+type LabeledScenarioSource = 'production-trace' | 'eval-run' | 'manual' | 'red-team' | 'synthetic';
+type RedactionStatus = 'raw' | 'redacted-pii' | 'redacted-secrets' | 'fully-redacted';
+/** @experimental Required-provenance write. The store rejects writes that
+ *  lack provenance — a default-on flywheel without provenance is the
+ *  data-poisoning vector flagged in the alignment review. */
+interface LabeledScenarioWrite<TScenario extends Scenario = Scenario, TArtifact = unknown> {
+    scenario: TScenario;
+    artifact: TArtifact;
+    judgeScores: Record<string, JudgeScore>;
+    source: LabeledScenarioSource;
+    sourceVersionHash: string;
+    capturedAt: string;
+    redactionStatus: RedactionStatus;
+    /** Optional per-source rate-limit bucket key (e.g., the tenant id). */
+    rateLimitBucket?: string;
+}
+interface LabeledScenarioRecord<TScenario extends Scenario = Scenario, TArtifact = unknown> extends LabeledScenarioWrite<TScenario, TArtifact> {
+    /** Stable hash of (scenario.id, source, capturedAt, sourceVersionHash). */
+    recordHash: string;
+    /** Substrate-assigned split — train if captured before the campaign's
+     *  `temporalCutoff`, test if after. Explicit override allowed via filter. */
+    split: 'train' | 'test';
+}
+interface LabeledScenarioSampleArgs {
+    count: number;
+    /** REQUIRED — substrate refuses to sample without an explicit split. */
+    split: 'train' | 'test';
+    /** REQUIRED — only records captured before this timestamp are returned.
+     *  Enforces temporal split discipline (test scenarios captured AFTER train
+     *  cannot enter the training pool). */
+    capturedBefore: string;
+    filter?: {
+        kind?: string;
+        source?: LabeledScenarioSource | LabeledScenarioSource[];
+        minComposite?: number;
+        maxComposite?: number;
+    };
+}
+interface LabeledScenarioStore {
+    observe(write: LabeledScenarioWrite): Promise<void>;
+    sample(args: LabeledScenarioSampleArgs): Promise<LabeledScenarioRecord[]>;
+    size(): Promise<{
+        train: number;
+        test: number;
+        bySource: Record<string, number>;
+    }>;
+}
+interface CampaignCellResult<TArtifact> {
+    cellId: string;
+    scenarioId: string;
+    rep: number;
+    generation?: number;
+    artifact: TArtifact;
+    judgeScores: Record<string, JudgeScore>;
+    costUsd: number;
+    durationMs: number;
+    seed: number;
+    cached: boolean;
+    error?: string;
+}
+interface JudgeAggregate {
+    mean: number;
+    stdev: number;
+    ci95: [number, number];
+    n: number;
+}
+interface ScenarioAggregate {
+    meanComposite: number;
+    ci95: [number, number];
+    n: number;
+}
+interface GenerationRecord {
+    generationIndex: number;
+    candidates: Array<{
+        surfaceHash: string;
+        composite: number;
+        ci95: [number, number];
+    }>;
+    promoted: string[];
+}
+interface CampaignAggregates {
+    byJudge: Record<string, JudgeAggregate>;
+    byScenario: Record<string, ScenarioAggregate>;
+    totalCostUsd: number;
+    cellsExecuted: number;
+    cellsSkipped: number;
+    cellsCached: number;
+    cellsFailed: number;
+}
+interface CampaignResult<TArtifact = unknown, TScenario extends Scenario = Scenario> {
+    /** sha256(scenarios, judges, dispatch source ref, optimizer config, seed). Stable identity for reruns. */
+    manifestHash: string;
+    seed: number;
+    startedAt: string;
+    endedAt: string;
+    durationMs: number;
+    cells: Array<CampaignCellResult<TArtifact>>;
+    aggregates: CampaignAggregates;
+    optimization?: {
+        generations: GenerationRecord[];
+        winnerSurfaceHash?: string;
+    };
+    gate?: GateResult;
+    prUrl?: string;
+    runDir: string;
+    artifactsByPath: Record<string, string>;
+    /** Substrate strips the input scenarios to id+kind for the result manifest;
+     *  consumers needing full payload look it up via the original input. The
+     *  type parameter `TScenario` is propagated for downstream consumers that
+     *  want narrowed types when extending `CampaignResult`. */
+    scenarios: Array<Pick<TScenario, 'id' | 'kind'>>;
+}
+/**
+ * @experimental
+ *
+ * `openAutoPr` — thin shell-out helper for the `runImprovementLoop` preset's
+ * `autoOnPromote: 'pr'` mode. Substitutes for the per-product PR-opening
+ * code consumers duplicated 4 times. The PR body includes the campaign's
+ * manifest hash, gate verdict, and scorecard summary so reviewers can see
+ * exactly what was promoted + why.
+ *
+ * NOT a deploy mechanism — this only OPENS a PR. The human reviews + merges.
+ * The Shape B (`autoOnPromote: 'config'`) live-runtime-mutation path is
+ * deferred to Pass B with the full shadow / canary / rollback stack.
+ */
+interface OpenAutoPrOptions<TArtifact, TScenario extends Scenario> {
+    /** Campaign result to attach to the PR. */
+    result: CampaignResult<TArtifact, TScenario>;
+    /** Gate verdict explaining the promotion. Substrate refuses to open a PR
+     *  when `gate.decision !== 'ship'` — fails loud. */
+    gate: GateResult;
+    /** Promoted surface diff — typically the new system prompt addendum or
+     *  full profile diff. Substrate writes it into the PR body.
+     *  NOTE(review): the `openAutoPr` overview says the body also carries the
+     *  manifest hash, gate verdict, and scorecard summary — confirm the diff
+     *  is one section of the body rather than the whole body. */
+    promotedDiff: string;
+    /** GH target, split across `ghOwner` and `ghRepo` (e.g., `tangle-network`
+     *  and `gtm-agent` for `tangle-network/gtm-agent`). */
+    ghOwner: string;
+    ghRepo: string;
+    /** Branch name for the PR. Default `auto/<manifestHash[:12]>`. */
+    branch?: string;
+    /** PR title. Default includes manifest hash. */
+    title?: string;
+    /** When true, only dry-run instead of actually opening the PR. When
+     *  omitted, the default is read from the `GH_AUTO_PR_TOKEN` env —
+     *  present = open, absent = dry-run. */
+    dryRun?: boolean;
+    /** Test seam — substitute `gh pr create` invocation. Synchronous: takes
+     *  the argument list and returns `stdout`, `stderr`, and a numeric `status`. */
+    ghExec?: (args: string[]) => {
+        stdout: string;
+        stderr: string;
+        status: number;
+    };
+}
+interface OpenAutoPrResult {
+    opened: boolean;
+    prUrl?: string;
+    dryRun: boolean;
+    reason: string;
+}
+declare function openAutoPr<TArtifact, TScenario extends Scenario>(options: OpenAutoPrOptions<TArtifact, TScenario>): OpenAutoPrResult;
+/**
+ * @experimental
+ *
+ * `evolutionaryDriver` — adapts a stateless `Mutator` (population mutation:
+ * GEPA / AxGEPA / reflective-mutation) into an `ImprovementDriver`. This is
+ * the evolutionary strategy: each generation, mutate the current best surface
+ * into N candidates, measure, select. No generation memory beyond the current
+ * surface; the loop body handles ranking + promotion.
+ *
+ * The reflective alternative — `analystDriver` — is consumer-wired from
+ * `@tangle-network/agent-runtime`'s `runAnalystLoop`: it reasons over the
+ * full generation history + trace findings to propose targeted edits rather
+ * than blind mutations. Both conform to `ImprovementDriver`; the improvement
+ * loop is identical regardless of which drives it.
+ */
+interface EvolutionaryDriverOptions<TFindings = unknown> {
+    mutator: Mutator<TFindings>;
+    /** External findings fed to the mutator each generation. Default: []. */
+    findings?: TFindings[];
+}
+declare function evolutionaryDriver<TFindings = unknown>(opts: EvolutionaryDriverOptions<TFindings>): ImprovementDriver<TFindings>;
+/**
+ * @experimental
+ *
+ * Compose multiple `Gate` implementations — every gate must pass for the
+ * composite to ship. Closes the alignment reviewer's "default-only
+ * heldOutGate + costGate would happily promote a reward-hacked prompt"
+ * concern by making safety gates first-class composable defaults.
+ */
+/** Compose gates — all must `ship` for the composite to `ship`. The first
+ *  non-ship verdict determines the composite verdict, but ALL gates still run
+ *  (so the result records every gate's reason — useful for diagnostics). */
+declare function composeGate<TArtifact = unknown, TScenario extends Scenario = Scenario>(...gates: Array<Gate<TArtifact, TScenario>>): Gate<TArtifact, TScenario>;
+/**
+ * @experimental
+ *
+ * `defaultProductionGate` — composes the substrate's existing safety
+ * primitives (red-team / reward-hacking / canary / heldout) into a single
+ * Gate.decide shape. Closes the alignment + Anthropic-SI reviewers' "safety
+ * primitives are off the critical path" blocker.
+ *
+ * The composition is opinionated — when consumers wire `runImprovementLoop`,
+ * THIS gate is the default. Consumers can still pass a custom gate to
+ * override; the recommended pattern is to compose THIS gate with whatever
+ * extra domain-specific gates they need (`composeGate(defaultProductionGate(...), customGate)`).
+ */
+interface DefaultProductionGateOptions {
+    /** Required: scenarios held out from training; substrate compares
+     *  candidate-on-holdout vs baseline-on-holdout. */
+    holdoutScenarios: Scenario[];
+    /** Minimum mean-composite improvement required to ship. Default 0.5. */
+    deltaThreshold?: number;
+    /** Total $ budget for ALL cells in this campaign — including baseline + candidate.
+     *  Composite verdict refuses to ship when spend exceeds the budget. */
+    budgetUsd?: number;
+    /** Red-team cases to probe candidate outputs against. When omitted the
+     *  substrate uses `DEFAULT_RED_TEAM_CORPUS`. Provide a domain-specific
+     *  battery for tighter coverage. */
+    redTeamBattery?: RedTeamCase[];
+    /** Run records (oldest-first) needed for the reward-hacking detector.
+     *  Substrate populates from prior production-loop generations. */
+    recentRuns?: RunRecord[];
+    /** When true, the gate refuses to ship if the reward-hacking detector
+     *  fires at the `gaming` severity. Default true. */
+    blockOnRewardHackingGaming?: boolean;
+}
+declare function defaultProductionGate<TArtifact, TScenario extends Scenario>(options: DefaultProductionGateOptions): Gate<TArtifact, TScenario>;
+/**
+ * @experimental
+ *
+ * Thin Gate adapter — exposes delta-threshold-on-holdout as a composable
+ * `Gate`. Use when you want held-out as one of N composed gates instead of
+ * the full `defaultProductionGate` stack.
+ */
+interface HeldOutGateOptions<TScenario extends Scenario = Scenario> {
+    scenarios: TScenario[];
+    deltaThreshold?: number;
+}
+declare function heldOutGate<TArtifact, TScenario extends Scenario>(options: HeldOutGateOptions<TScenario>): Gate<TArtifact, TScenario>;
+/**
+ * @experimental
+ *
+ * Filesystem `LabeledScenarioStore` adapter. The default capture sink for
+ * traces + eval artifacts. Production deployments typically swap for a
+ * Turso/SQLite adapter (same interface).
+ *
+ * Records land as one JSONL file per source under `<root>/<source>.jsonl`.
+ * Each line is a `LabeledScenarioRecord`. Append-only — no in-place edits.
+ *
+ * Safety properties enforced at write-time:
+ *
+ *   - **Provenance required**: writes without `source`, `sourceVersionHash`,
+ *     `capturedAt`, `redactionStatus` are rejected. Closes the alignment
+ *     reviewer's data-poisoning gap.
+ *   - **Per-source rate limits**: optional `rateLimitBucket` + `maxWritesPerMinutePerBucket`
+ *     stop a single tenant/source from flooding the store.
+ *
+ * Safety properties enforced at sample-time:
+ *
+ *   - **Required split + capturedBefore**: substrate refuses to sample without
+ *     an explicit `split` ('train' | 'test') AND a temporal cutoff. Eliminates
+ *     accidental train/test contamination.
+ *   - **Default training-source filter**: when the store is sampled with
+ *     `split: 'train'`, production-trace records are EXCLUDED unless the
+ *     caller passes `filter.source: 'production-trace'` explicitly. Closes
+ *     the contamination-by-default gap flagged by the senior eval engineer.
+ */
+interface FsLabeledScenarioStoreOptions {
+    /** Root directory for JSONL files. Created if missing. */
+    root: string;
+    /** Per-bucket rate limit, in writes per minute (presumably keyed by each
+     *  write's `rateLimitBucket` — confirm against the implementation). When
+     *  set, writes exceeding the cap are rejected with a typed error.
+     *  Default: no limit. */
+    maxWritesPerMinutePerBucket?: number;
+    /** Test seam — override `Date.now()` for deterministic tests. */
+    now?: () => number;
+}
+declare class LabeledScenarioStoreError extends Error {
+    readonly code: string;
+    constructor(code: string, message: string);
+}
+declare class FsLabeledScenarioStore implements LabeledScenarioStore {
+    private readonly options;
+    private readonly now;
+    private readonly rateLimits;
+    constructor(options: FsLabeledScenarioStoreOptions);
+    observe(write: LabeledScenarioWrite): Promise<void>;
+    sample(args: LabeledScenarioSampleArgs): Promise<LabeledScenarioRecord[]>;
+    size(): Promise<{
+        train: number;
+        test: number;
+        bySource: Record<string, number>;
+    }>;
+    private assertProvenance;
+    private assertRateLimit;
+    private toRecord;
+    private pathForSource;
+}
+/**
+ * @experimental
+ *
+ * `runCampaign` — Pass A substrate primitive. ONE function that orchestrates
+ * scenarios → dispatch → artifacts → judges → aggregates, with full
+ * reproducibility (seed + manifest hash), cell-level resumability, bootstrap
+ * CIs, and the `LabeledScenarioStore` capture flywheel.
+ *
+ * Improvement loops (optimizer / gate / autoOnPromote) ride on top of this
+ * primitive but live in `presets/run-improvement-loop.ts`. This file keeps
+ * the core orchestrator minimal — Phase 1 of the Pass A track.
+ */
+interface RunCampaignOptions<TScenario extends Scenario, TArtifact> {
+    scenarios: TScenario[];
+    dispatch: DispatchFn<TScenario, TArtifact>;
+    judges?: JudgeConfig<TArtifact, TScenario>[];
+    /** Seed used for reproducibility. Optional; defaults to 42. */
+    seed?: number;
+    /** Per-scenario replicates for CI bands. Default 1; raise to 5+ for
+     *  bootstrap-tight intervals on critical eval. */
+    reps?: number;
+    /** When true (default), completed cells are cached by
+     *  (manifestHash, scenarioId, rep, generation). Re-runs skip cached cells. */
+    resumable?: boolean;
+    /** Optional store — when present, every artifact + judge score is captured
+     *  with the configured `captureSource`. Capture is default ON; pass `'off'`
+     *  to disable. */
+    labeledStore?: LabeledScenarioStore | 'off';
+    captureSource?: 'production-trace' | 'eval-run' | 'manual' | 'red-team' | 'synthetic';
+    captureSourceVersionHash?: string;
+    /** Wall-clock cost cap across all cells. Cells beyond ceiling are skipped. */
+    costCeiling?: number;
+    /** Max concurrent cells. Default 2. */
+    maxConcurrency?: number;
+    /** Required: where artifacts + traces land. */
+    runDir: string;
+    /** Tracing posture. Default is the substrate's `FileSystemTraceStore` rooted
+     *  at `<runDir>/traces/`. `'off'` disables capture entirely — substrate
+     *  refuses this when the caller wires `autoOnPromote !== 'none'`. */
+    tracing?: 'on' | 'off';
+    /** Test seam — override the wall clock for deterministic tests. */
+    now?: () => Date;
+    /** Test seam — override per-cell trace writer factory. */
+    buildTraceWriter?: (cellId: string, dir: string) => CampaignTraceWriter;
+}
+declare function runCampaign<TScenario extends Scenario, TArtifact>(opts: RunCampaignOptions<TScenario, TArtifact>): Promise<CampaignResult<TArtifact, TScenario>>;
+/**
+ * @experimental
+ *
+ * `runEval` — the simplest preset over `runCampaign`. No optimizer, no
+ * gate, no auto-PR. Just: run scenarios through dispatch, score with
+ * judges, return CampaignResult.
+ *
+ * The 80% case for consumers who want a scorecard, not an improvement loop.
+ *
+ * NOTE(review): `RunEvalOptions` omits `runDir` from `RunCampaignOptions` and
+ * then re-declares it as a required `string`, which is the same type it has
+ * there — so the two option shapes are structurally identical as declared.
+ */
+interface RunEvalOptions<TScenario extends Scenario, TArtifact> extends Omit<RunCampaignOptions<TScenario, TArtifact>, 'runDir'> {
+    runDir: string;
+}
+declare function runEval<TScenario extends Scenario, TArtifact>(opts: RunEvalOptions<TScenario, TArtifact>): Promise<CampaignResult<TArtifact, TScenario>>;
+/**
+ * @experimental
+ *
+ * `runOptimization` — the improvement loop body. Runs N generations: the
+ * `ImprovementDriver` proposes K candidate surfaces per generation, each
+ * candidate runs a campaign (the measurement), top-scoring promote to the
+ * next generation. Driver-agnostic — the same loop runs an evolutionary
+ * population mutator (`evolutionaryDriver`) or a reflective analyst
+ * (`analystDriver`); they differ only in how `propose()` picks candidates.
+ *
+ * This is `runLoop`'s shape (plan → measure → decide) specialized to surface
+ * improvement: `driver.propose` = plan, `runCampaign` = the measurement (which
+ * runs the worker behind `dispatch`), the mean-composite ranking = the
+ * validator, `driver.decide` = the stop check.
+ *
+ * The gated-promotion shell (`runImprovementLoop`) wraps this with a holdout
+ * re-score + release gate + optional PR.
+ */
+interface RunOptimizationOptions<TScenario extends Scenario, TArtifact> extends Omit<RunCampaignOptions<TScenario, TArtifact>, 'dispatch'> {
+    /** Initial mutable surface (typically system prompt or addendum). */
+    baselineSurface: MutableSurface;
+    /** Dispatcher that takes the CURRENT surface + scenario → artifact. */
+    dispatchWithSurface: (surface: MutableSurface, scenario: TScenario, ctx: Parameters<RunCampaignOptions<TScenario, TArtifact>['dispatch']>[1]) => Promise<TArtifact>;
+    /** The improvement strategy. Wrap a population `Mutator` via
+     *  `evolutionaryDriver({ mutator })`, or pass a reflective `analystDriver`. */
+    driver: ImprovementDriver;
+    populationSize: number;
+    maxGenerations: number;
+    /** How many top-scoring candidates carry to the next generation. Default 2. */
+    promoteTopK?: number;
+    /** DEPTH knob forwarded to the driver's `propose()` — max runLoop iterations
+     *  the generating agent may take per candidate (autoresearchDriver). */
+    maxImprovementShots?: number;
+    /** Phase-2 research report forwarded to `propose()` (analyst findings +
+     *  diff). Opaque here; the driver types it. */
+    report?: unknown;
+}
+interface RunOptimizationResult<TArtifact, TScenario extends Scenario> { // value the `runOptimization` promise resolves to
+    generations: Array<{ // presumably one entry per generation run, in order — TODO confirm
+        record: GenerationRecord; // the `GenerationRecord` for this entry
+        surfaces: Array<{ // presumably the candidate surfaces measured in this generation — TODO confirm
+            surfaceHash: string; // hash string for `surface`; presumably produced by `surfaceHash()` — verify
+            surface: MutableSurface;
+            campaign: CampaignResult<TArtifact, TScenario>; // campaign result recorded alongside this surface
+        }>;
+    }>;
+    winnerSurface: MutableSurface; // NOTE(review): confirm whether this is the baseline when no candidate beats it
+    winnerSurfaceHash: string; // hash string for `winnerSurface`; presumably `surfaceHash(winnerSurface)` — verify
+    baselineCampaign: CampaignResult<TArtifact, TScenario>; // presumably the campaign run on `baselineSurface` — TODO confirm
+}
+declare function runOptimization<TScenario extends Scenario, TArtifact>(opts: RunOptimizationOptions<TScenario, TArtifact>): Promise<RunOptimizationResult<TArtifact, TScenario>>; // async entry point: takes `RunOptimizationOptions`, resolves to `RunOptimizationResult` (note the swapped type-argument order between the two)
+declare function surfaceHash(surface: MutableSurface): string; // synchronous; maps a surface to a string — presumably a deterministic content hash, TODO confirm algorithm and stability
+/**
+ * @experimental
+ *
+ * `runImprovementLoop` — the gated-promotion shell around the improvement
+ * loop body (`runOptimization`). Drives candidate surfaces via the
+ * `ImprovementDriver`, re-scores the winner against the baseline on a
+ * holdout set, runs the release gate, and optionally opens a PR.
+ *
+ * Role vocabulary (see docs/design/loop-taxonomy.md):
+ *   - DRIVER     = the `ImprovementDriver` (evolutionary GEPA mutator OR
+ *                  reflective analyst). Proposes candidate SURFACES — the
+ *                  worker's system prompt / tool config — NOT conversation
+ *                  turns.
+ *   - MEASUREMENT= `runCampaign`. Scores one surface by running the worker
+ *                  (via `dispatch`) over scenarios and judging the output.
+ *   - WORKER     = the agent harness in the sandbox, invoked behind the
+ *                  topology-opaque `dispatch` seam — never referenced here.
+ *
+ * Distinct from `runLoop` in `@tangle-network/agent-runtime`, which is the
+ * INNER conversation loop (driver↔workers in a sandbox). `runImprovementLoop`
+ * is the OUTER loop: it improves the surface that those workers run.
+ *
+ * Hard-refuses unsafe configurations:
+ *   - `tracing: 'off'` when a driver is wired (improvement is unattributable)
+ *   - `autoOnPromote: 'config'` — DEFERRED to Pass B; v0.40 only ships
+ *     `'pr'` and `'none'`.
+ *
+ * NOTE(review): the `autoOnPromote` type below already excludes `'config'`,
+ * so that refusal presumably only guards untyped (plain JS / cast) callers —
+ * confirm against the implementation.
+ *
+ * The options interface extends `RunOptimizationOptions`, so every
+ * optimization option is accepted here as well.
+ */
+interface RunImprovementLoopOptions<TScenario extends Scenario, TArtifact> extends RunOptimizationOptions<TScenario, TArtifact> {
+    /** Holdout scenarios kept OUT of the training optimization pool — used
+     *  ONLY to score baseline vs winner for the gate. */
+    holdoutScenarios: TScenario[];
+    /** Promotion gate. Substrate strongly recommends `defaultProductionGate`
+     *  for production wiring (composes red-team / reward-hacking / canary /
+     *  heldout). */
+    gate: Gate<TArtifact, TScenario>;
+    /** What to do when the gate ships:
+     *   - `'pr'`: open a PR via `openAutoPr`
+     *   - `'none'`: just report — caller decides what to do with the winner
+     *  v0.40 does NOT support `'config'` (live-runtime self-mutation) —
+     *  deferred to Pass B behind safety stack. */
+    autoOnPromote: 'pr' | 'none';
+    /** GH owner / repo for the auto-PR. Required when autoOnPromote === 'pr'.
+     *  Both fields are typed optional, so that requirement is not enforced by
+     *  the type system. */
+    ghOwner?: string;
+    ghRepo?: string; // repo half of the owner/repo pair described on `ghOwner`
+    /** Optional render override — substrate writes a diff-shaped surface; pass
+     *  a function to format the promoted surface differently. */
+    renderPromotedDiff?: (winnerSurface: MutableSurface, baselineSurface: MutableSurface) => string;
+}
+interface RunImprovementLoopResult<TArtifact, TScenario extends Scenario> extends RunOptimizationResult<TArtifact, TScenario> { // optimization result plus holdout scores, gate outcome and optional PR result
+    baselineOnHoldout: CampaignResult<TArtifact, TScenario>; // presumably the baseline surface scored on `holdoutScenarios` — TODO confirm
+    winnerOnHoldout: CampaignResult<TArtifact, TScenario>; // presumably the winner surface scored on `holdoutScenarios` — TODO confirm
+    gateResult: Awaited<ReturnType<Gate<TArtifact, TScenario>['decide']>>; // awaited return value of the gate's `decide`
+    prResult?: ReturnType<typeof openAutoPr>; // NOTE(review): not wrapped in `Awaited<>` unlike `gateResult`; if `openAutoPr` is async this is typed as a Promise — confirm
+}
+declare function runImprovementLoop<TScenario extends Scenario, TArtifact>(opts: RunImprovementLoopOptions<TScenario, TArtifact>): Promise<RunImprovementLoopResult<TArtifact, TScenario>>; // async entry point of the gated-promotion shell; behaviour is described in the doc comment preceding `RunImprovementLoopOptions`
+/**
+ * @experimental
+ *
+ * VCS-pluggable worktree adapter. One improvement = one worktree, PR-like
+ * (multiple commits allowed). A code-tier driver's `propose()` creates a
+ * worktree, an agent commits the change into it, and `finalize()` returns a
+ * `CodeSurface{ worktreeRef }` the measurement checks out to run the worker
+ * against the changed code. On promotion the worktree becomes the PR branch.
+ *
+ * The interface is VCS-agnostic so a future `jj` ([jj-vcs](https://github.com/jj-vcs/jj))
+ * adapter can slot in without touching driver code. Only the git adapter
+ * ships today. See `docs/design/self-improvement-engine.md`.
+ *
+ * `Worktree` itself is the plain handle (`path`, `branch`, `baseRef`) that
+ * `WorktreeAdapter.create` resolves to and that `finalize` / `discard`
+ * accept.
+ */
+interface Worktree {
+    /** Absolute path to the checked-out worktree directory. */
+    path: string;
+    /** The branch the worktree is on (becomes the PR branch on promotion). */
+    branch: string;
+    /** The ref the worktree was forked from. */
+    baseRef: string;
+}
+interface WorktreeAdapter { // lifecycle contract: create → finalize (keep) or discard (drop); all three methods are async
+    /** Create an isolated worktree on a fresh branch off `baseRef`.
+     *  `label` is presumably folded into the branch / directory name — TODO
+     *  confirm against the adapter implementation. */
+    create(opts: {
+        baseRef: string;
+        label: string;
+    }): Promise<Worktree>;
+    /** Commit any pending changes in the worktree, then return a CodeSurface
+     *  pointing at it. The agent has already written its change into
+     *  `worktree.path` by the time this is called.
+     *  `summary` is presumably used as the commit message — TODO confirm. */
+    finalize(worktree: Worktree, summary: string): Promise<CodeSurface>;
+    /** Remove the worktree (and its branch) — called for losing candidates. */
+    discard(worktree: Worktree): Promise<void>;
+}
+declare class WorktreeAdapterError extends Error { // error type for worktree adapter failures; presumably thrown by `gitWorktreeAdapter` operations — TODO confirm
+    readonly cause?: unknown | undefined; // optional underlying error/value supplied at construction
+    constructor(message: string, cause?: unknown | undefined); // `cause` may be omitted
+}
+interface GitWorktreeAdapterOptions { // configuration accepted by `gitWorktreeAdapter`
+    /** Repo root the worktrees fork from. */
+    repoRoot: string;
+    /** Directory worktrees are created under. Default: `<repoRoot>/.worktrees`. */
+    worktreeDir?: string;
+    /** Branch-name prefix. Default: `improve`. */
+    branchPrefix?: string;
+    /** Test seam — defaults to a real `git` runner.
+     *  Called with the git argument vector and a working directory; returns a
+     *  string synchronously (presumably the command's stdout — TODO confirm). */
+    git?: (args: string[], cwd: string) => string;
+}
+declare function gitWorktreeAdapter(opts: GitWorktreeAdapterOptions): WorktreeAdapter; // synchronous factory: returns the git-backed `WorktreeAdapter` configured by `opts`
+/** Resolve a `CodeSurface`'s worktreeRef to a directory the measurement can
+ *  run the worker in. A path ref is returned as-is; anything else is treated
+ *  as a ref under the adapter's worktree dir.
+ *  `worktreeDir` is optional; the fallback used when it is omitted is not
+ *  visible in this declaration — NOTE(review): confirm it matches the
+ *  adapter's `<repoRoot>/.worktrees` default. Returns synchronously. */
+declare function resolveWorktreePath(surface: CodeSurface, worktreeDir?: string): string;
+export { type CampaignAggregates, type CampaignArtifactWriter, type CampaignCellResult, type CampaignCostMeter, type CampaignResult, type CampaignTraceWriter, type CodeSurface, type DefaultProductionGateOptions, type DispatchContext, type DispatchFn, type EvolutionaryDriverOptions, FsLabeledScenarioStore, type FsLabeledScenarioStoreOptions, type Gate, type GateContext, type GateDecision, type GateResult, type GenerationRecord, type GitWorktreeAdapterOptions, type HeldOutGateOptions, type ImprovementDriver, type JudgeAggregate, type JudgeConfig, type JudgeDimension, type JudgeScore, type LabeledScenarioRecord, type LabeledScenarioSampleArgs, type LabeledScenarioSource, type LabeledScenarioStore, LabeledScenarioStoreError, type LabeledScenarioWrite, type MutableSurface, type Mutator, type OpenAutoPrOptions, type OpenAutoPrResult, type OptimizerConfig, type ProposeContext, type RedactionStatus, type RunCampaignOptions, type RunEvalOptions, type RunImprovementLoopOptions, type RunImprovementLoopResult, type RunOptimizationOptions, type RunOptimizationResult, type Scenario, type ScenarioAggregate, type SessionScript, type TraceSpan, type Worktree, type WorktreeAdapter, WorktreeAdapterError, composeGate, defaultProductionGate, evolutionaryDriver, gitWorktreeAdapter, heldOutGate, openAutoPr, resolveWorktreePath, runCampaign, runEval, runImprovementLoop, runOptimization, surfaceHash };