npm - @tangle-network/agent-runtime - Versions diffs - 0.47.0 → 0.48.0 - Mend

@tangle-network/agent-runtime 0.47.0 → 0.48.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22)

package/dist/agent.js +1 -1
package/dist/{chunk-MGFEUYOH.js → chunk-656G2XCL.js} +3 -3
package/dist/{chunk-72JQCHOZ.js → chunk-IW2LMLK6.js} +1634 -191
package/dist/chunk-IW2LMLK6.js.map +1 -0
package/dist/{chunk-5YDS7BLC.js → chunk-LX66I3SC.js} +2 -2
package/dist/{chunk-T4OQQEE3.js → chunk-TJS7S3HJ.js} +2 -2
package/dist/index.js +3 -3
package/dist/loop-runner-bin.js +3 -3
package/dist/loops.d.ts +2 -2
package/dist/loops.js +47 -1
package/dist/mcp/bin.js +3 -3
package/dist/mcp/index.d.ts +1 -1
package/dist/mcp/index.js +3 -3
package/dist/runtime.d.ts +825 -64
package/dist/runtime.js +47 -1
package/dist/{types-Cbx3dNK5.d.ts → types-BpDfCPUp.d.ts} +1 -1
package/dist/workflow.js +1 -1
package/package.json +2 -2
package/dist/chunk-72JQCHOZ.js.map +0 -1
package/dist/{chunk-MGFEUYOH.js.map → chunk-656G2XCL.js.map} +0 -0
package/dist/{chunk-5YDS7BLC.js.map → chunk-LX66I3SC.js.map} +0 -0
package/dist/{chunk-T4OQQEE3.js.map → chunk-TJS7S3HJ.js.map} +0 -0

package/dist/runtime.d.ts CHANGED

@@ -1,12 +1,12 @@
-import { AgentProfile as AgentProfile$1, BackendType, CreateSandboxOptions, SandboxInstance, SandboxEvent } from '@tangle-network/sandbox';
+import { AgentProfile, BackendType, CreateSandboxOptions, SandboxInstance, SandboxEvent } from '@tangle-network/sandbox';
 export { AgentProfile, CreateSandboxOptions, SandboxEvent, SandboxInstance } from '@tangle-network/sandbox';
-import { R as ResultBlobStore, a as SpawnJournal, N as NodeId, b as SpawnEvent, T as TreeView, c as Settled, E as ExecutorFactory, d as AgentSpec, e as ExecutorRegistry, B as Budget, A as Agent, f as RootHandle, g as SupervisedResult, h as Spend, S as Scope, U as UsageEvent, i as Supervisor } from './types-Cbx3dNK5.js';
-export { j as Executor, k as ExecutorContext, l as ExecutorResult, H as Handle, m as NodeSnapshot, n as NodeStatus, o as Restart, p as RootSignal, q as Runtime, r as SpawnOpts, s as SupervisorOpts, W as WidenGate } from './types-Cbx3dNK5.js';
+import { R as ResultBlobStore, a as SpawnJournal, N as NodeId, b as SpawnEvent, T as TreeView, c as Settled, d as AgentSpec, E as ExecutorRegistry, B as Budget, A as Agent, e as RootHandle, f as SupervisedResult, g as Spend, S as Scope, h as ExecutorFactory, U as UsageEvent, i as Supervisor } from './types-BpDfCPUp.js';
+export { j as Executor, k as ExecutorContext, l as ExecutorResult, H as Handle, m as NodeSnapshot, n as NodeStatus, o as Restart, p as RootSignal, q as Runtime, r as SpawnOpts, s as SupervisorOpts, W as WidenGate } from './types-BpDfCPUp.js';
+import { ChatClient, AnalystFinding, DefaultVerdict, AgentProfile as AgentProfile$1 } from '@tangle-network/agent-eval';
+export { DefaultVerdict } from '@tangle-network/agent-eval';
 export { A as AnalyzeInput, a as CompletionAnalyst, b as CompletionEvidence, c as CompletionPolicy, d as CompletionVerdict, C as CreateDriverOptions, D as DriverDecision, P as PlannerContext, e as TopologyMove, T as TopologyPlanner, f as completionAuthorizes, g as createDriver, h as deterministicCompletion, r as renderAnalyses, s as sentinelCompletion, i as stopSentinel } from './driver-DYU2sgHr.js';
 import { S as SandboxClient, b as LoopResult, c as LoopTokenUsage, R as RuntimeStreamEvent, A as AgentRunSpec, E as ExecCtx, I as Iteration } from './types-nBMuollC.js';
 export { D as Driver, C as LoopDecisionPayload, F as LoopEndedPayload, G as LoopIterationDispatchPayload, H as LoopIterationEndedPayload, J as LoopIterationStartedPayload, a as LoopLineageOptions, M as LoopPlanDescription, N as LoopPlanPayload, f as LoopSandboxPlacement, P as LoopStartedPayload, Q as LoopTeardownFailedPayload, e as LoopTraceEmitter, T as LoopTraceEvent, L as LoopWinner, O as OutputAdapter, U as ValidationCtx, V as Validator } from './types-nBMuollC.js';
-import { AgentProfile, AnalystFinding, DefaultVerdict, ChatClient } from '@tangle-network/agent-eval';
-export { DefaultVerdict } from '@tangle-network/agent-eval';
 import { Scenario, ProfileDispatchFn } from '@tangle-network/agent-eval/campaign';
 import { R as RunLoopOptions } from './run-loop-DvD4aGiE.js';
 export { c as createSandboxForSpec, d as defaultSelectWinner, r as runLoop } from './run-loop-DvD4aGiE.js';
@@ -114,66 +114,66 @@ declare function replaySpawnTree(journal: SpawnJournal, blobs: ResultBlobStore,
 declare function materializeTreeView(events: SpawnEvent[]): TreeView;
 /**
- * Adapt an `ExecutorFactory` into a `SandboxClient` for `runLoop`. The factory is
- * instantiated fresh per `streamPrompt` (mirrors the per-spawn executor lifecycle):
- * run once on the prompt, emit the terminal result event, tear down.
- */
-declare function inlineSandboxClient(factory: ExecutorFactory<unknown>): SandboxClient;
-/**
- * `loopDispatch` — turn `runLoop` into an agent-eval campaign dispatch.
- *
- * Without this adapter a consumer wiring `runLoop` into `runProfileMatrix` /
- * `runCampaign` has to, by hand, every time: (a) build an `ExecCtx` with a
- * sandbox client, (b) adapt the campaign `DispatchContext.trace` into a
- * `LoopTraceEmitter` (or lose all loop trace correlation), and (c) remember to
- * forward the loop's cost + tokens via `ctx.cost` (forgetting it yields a
- * `{0,0}` cell the backend-integrity guard reads as a stub). Three foot-guns,
- * the third silent. The fleet's products skipped (c) and fell back to a
- * `workerRecords[]` side-channel — the exact anti-pattern the substrate exists
- * to kill.
+ * auditIntent — the route-rigor analyst: is this trajectory even going the RIGHT WAY?
  *
- * `loopDispatch` collapses all three into one typed call:
+ * `observe()` critiques execution quality ("what's unfinished"). This audits ALIGNMENT —
+ * a different failure class the score can't see until it's too late: an agent can be
+ * executing flawlessly down the wrong route. The auditor reads the trajectory and
+ * compares three intents:
  *
- *   const dispatch = loopDispatch({
- *     sandboxClient,
- *     toLoopOptions: (scenario, profile) => ({ driver, agentRun, output, validator, task }),
- *   })
- *   await runProfileMatrix({ profiles, scenarios, dispatch, judges, commitSha })
+ *   declared  — what the task says to do (the prompt / acceptance criteria)
+ *   revealed  — what the agent is ACTUALLY optimizing, inferred from its action pattern
+ *               (the inverse-inference move: actions reveal objectives)
+ *   user      — what the principal actually wants (the contract, when it differs from
+ *               the literal task text), plus where the user's own trajectory is heading
  *
- * Usage is reported automatically; trace events are forwarded automatically;
- * the ctx is built automatically. The seam becomes impossible to mis-wire.
+ * and returns a verdict (aligned / drifting / diverged) with evidence and ONE
+ * recommended intervention. FIREWALLED like every analyst: input is the trajectory and
+ * the intents — never the verifier or its data (zero check-leakage, so route auditing
+ * is always Goodhart-safe to run online).
  *
- * Typed structurally against the campaign `DispatchContext` (imported type-only
- * from `@tangle-network/agent-eval/campaign`) — a downward dependency, never an
- * inversion.
+ * Where it runs: between shots (steer the next one), as a watchdog over the lifecycle
+ * stream (abort-and-refund a diverged rollout — the budget pool makes early abort
+ * strictly valuable), or post-hoc over a whole BenchmarkReport (the meta-intent pass:
+ * is the LOOP optimizing the right thing — degenerate submissions, check-gaming shapes,
+ * objective drift across tasks).
  */
-/** runLoop options minus the `ctx` (loopDispatch builds the ctx). */
-type LoopOptionsForDispatch<Task, Output, Decision> = Omit<RunLoopOptions<Task, Output, Decision>, 'ctx'>;
-interface LoopDispatchOptions<Task, Output, Decision, TScenario extends Scenario, TArtifact> {
-    /** Sandbox client used for every cell's `runLoop`. Supplied once. */
-    sandboxClient: SandboxClient;
-    /** Build the per-cell runLoop options from the scenario (+ profile, when
-     *  used with `runProfileMatrix`). */
-    toLoopOptions: (scenario: TScenario, profile: AgentProfile) => LoopOptionsForDispatch<Task, Output, Decision>;
-    /** Map the finished loop to the artifact the judges score. Default:
-     *  `result.winner?.output`. A loop with no winner yields `undefined` (judges
-     *  skip the cell) — but the loop's token usage is STILL reported, so the
-     *  integrity guard sees real activity. */
-    toArtifact?: (result: LoopResult<Task, Output, Decision>) => TArtifact;
-    /** Forward `loop.*` trace events into the campaign's scoped trace so loop
-     *  spans correlate with the cell. Default true. */
-    forwardTrace?: boolean;
-    /** Cost-meter source label for the loop's spend. Default `'loop'`. */
-    costSource?: string;
+interface AuditIntentInput {
+    /** The declared intent: the task text / acceptance criteria the agent was given. */
+    declaredIntent: string;
+    /** The trajectory so far — tool calls + results + assistant turns (any event shapes). */
+    trace: ReadonlyArray<unknown>;
+    /** The principal's actual intent when it differs from the literal task (the contract). */
+    userIntent?: string;
+    /** The loop-level purpose (meta-intent): what the WHOLE run is for — lets the auditor
+     *  flag locally-sensible work that serves the wrong larger objective. */
+    metaIntent?: string;
+    runId?: string;
 }
-/**
- * Adapter for `runProfileMatrix` (profile is an axis). Returns a
- * `ProfileDispatchFn` that runs `runLoop` per (profile, scenario) cell and
- * reports usage automatically.
- */
-declare function loopDispatch<Task, Output, Decision, TScenario extends Scenario, TArtifact>(opts: LoopDispatchOptions<Task, Output, Decision, TScenario, TArtifact>): ProfileDispatchFn<TScenario, TArtifact>;
+interface AuditIntentOptions {
+    chat: ChatClient;
+    model?: string;
+    /** Override the auditor instruction (optimizable like any analyst prompt). */
+    auditorInstruction?: string;
+    /** Cap trace lines fed to the auditor. Default 80. */
+    maxTraceLines?: number;
+    signal?: AbortSignal;
+}
+interface IntentAudit {
+    /** What the agent's actions reveal it is actually optimizing — one sentence. */
+    revealedIntent: string;
+    verdict: 'aligned' | 'drifting' | 'diverged';
+    /** Trajectory-grounded evidence for the verdict (specific calls/patterns). */
+    evidence: string;
+    /** The single recommended intervention. */
+    recommendation: 'continue' | 'steer' | 'abort';
+    /** When recommendation is 'steer': the corrective instruction to inject. */
+    steer?: string;
+    confidence: number;
+}
+declare const defaultAuditorInstruction: string;
+declare function auditIntent(input: AuditIntentInput, opts: AuditIntentOptions): Promise<IntentAudit>;
 /**
  * @experimental
@@ -329,7 +329,7 @@ interface ShapeContext<D = unknown> {
     spawnChild(name: string, spec: AgentSpec): Agent<unknown, Outcome<D>>;
     /** Derive a child `AgentSpec` from the persona's root spec with an overridden profile —
      *  the seam a shape uses to give a worker a narrower role/prompt than the root persona. */
-    childSpec(profile: AgentProfile$1, harness?: BackendType | null): AgentSpec;
+    childSpec(profile: AgentProfile, harness?: BackendType | null): AgentSpec;
 }
 /**
  * A reusable act-body factory. Given the persona's content + seams (`ShapeContext`), it
@@ -790,7 +790,7 @@ interface RenderCorpusToInstructionsOptions {
     readonly corpus: Corpus;
     readonly filter: CorpusFilter;
     /** The profile to project the facts into. The result is a fresh profile — the input is unchanged. */
-    readonly profile: AgentProfile$1;
+    readonly profile: AgentProfile;
     /** Where the rendered facts land: appended to `prompt.instructions[]` (default) or folded into
      *  the single-blob `resources.instructions` string. */
     readonly target?: 'prompt' | 'resources';
@@ -799,7 +799,7 @@ interface RenderCorpusToInstructionsOptions {
 }
 /** `renderCorpusToInstructions(opts)` — the flywheel read-back projection. Async (queries the
  *  durable corpus); returns a fresh `AgentProfile` with the accreted facts merged in. */
-type RenderCorpusToInstructions = (opts: RenderCorpusToInstructionsOptions) => Promise<AgentProfile$1>;
+type RenderCorpusToInstructions = (opts: RenderCorpusToInstructionsOptions) => Promise<AgentProfile>;
 /**
  * One node in the reconstructed trajectory tree — a driver OR a leaf, with its OWN spend and the
  * spend ROLLED UP over its subtree. Reconstructed from the `SpawnJournal` (structure + per-node
@@ -941,7 +941,15 @@ interface ObserveOptions {
     signal?: AbortSignal;
     /** Cap the trace lines fed to the observer (keeps the call cheap). Default 80. */
     maxTraceLines?: number;
-}
+    /** Override the analyst's system instruction — the prompt that turns a trace into
+     *  findings + recommended_actions. The analyst IS the steerer, so this is the knob a
+     *  prompt optimizer (GEPA) tunes. Omitted ⇒ the default observer instruction. The
+     *  firewall (trace-only, never the verdict) is structural (input has no score), so a
+     *  custom instruction cannot break it. */
+    analystInstruction?: string;
+}
+/** The default observer instruction — exported so an optimizer can seed its population. */
+declare const defaultAnalystInstruction: string;
 interface Observation {
     findings: AnalystFinding[];
     /** Facts persisted to the corpus (empty when no corpus was supplied). */
@@ -954,6 +962,466 @@ declare function observe(input: ObserveInput, opts: ObserveOptions): Promise<Obs
  *  steer; the operator block is the advice. */
 declare function renderReport(findings: ReadonlyArray<AnalystFinding>): string;
+/**
+ * harvestCorpus — production traces → corpus, the G2 bridge (the playbook's step 6).
+ * The flywheel's write side, batched: run the firewalled `observe()` analyst over a
+ * stream of completed runs (yesterday's production traces, a benchmark's rollouts, a
+ * fleet's day) and accrete the trace-derived facts into the durable corpus.
+ *
+ * Store-agnostic by design: the caller maps its trace store's rows (a
+ * `ProductionTraceSink` ndjson, OTLP spans, RunRecords) to `ObserveInput` — task text,
+ * final output, the event trace, terminal outcome. The analyst reads BEHAVIOR only
+ * (the firewall is structural: the input carries no judge verdict), and corpus appends
+ * are idempotent on (claim + tags), so re-harvesting the same window is safe.
+ *
+ * The nightly product job is then three lines:
+ *   const runs = mapSinkRowsToObserveInputs(await readSink(yesterday))
+ *   const report = await harvestCorpus({ runs, chat, corpus, tags: ['gtm-agent'] })
+ *   log(report)   // runsObserved / findings / learned / failures
+ *
+ * NOTE on the read side: harvesting is safe and cheap; *injecting* facts back into runs
+ * is the measured danger zone — naive unconditional priming tested NEGATIVE (−11.6pp,
+ * context pollution; docs/research/layer-across-run.md). Gate any priming design on its
+ * own A/B; the corpus's first consumers are operators and optimizers, not prompts.
+ */
+interface HarvestCorpusOptions {
+    /** The completed runs to analyze — map your store's rows to `ObserveInput`. */
+    runs: AsyncIterable<ObserveInput> | Iterable<ObserveInput>;
+    /** The model-call seam (agent-eval `createChatClient`). */
+    chat: ChatClient;
+    model?: string;
+    /** The durable corpus the facts accrete into. */
+    corpus: Corpus;
+    /** Tags written onto learned facts (the product/domain key the read side queries by). */
+    tags?: ReadonlyArray<string>;
+    /** Override the analyst instruction (the GEPA-tunable knob). */
+    analystInstruction?: string;
+    /** Runs analyzed in parallel. Default 4. */
+    concurrency?: number;
+    /** Hard cap on runs consumed from the stream (a cost guard for unbounded stores). */
+    maxRuns?: number;
+    signal?: AbortSignal;
+}
+interface HarvestFailure {
+    runId: string;
+    error: string;
+}
+interface HarvestReport {
+    runsObserved: number;
+    /** Total findings the analyst produced (including ones already known). */
+    findings: number;
+    /** NEW facts actually appended (idempotent dedup excludes re-learned ones). */
+    learned: number;
+    /** Per-run analysis failures — reported, never silently dropped. */
+    failures: HarvestFailure[];
+}
+declare function harvestCorpus(opts: HarvestCorpusOptions): Promise<HarvestReport>;
+/**
+ * Adapt an `ExecutorFactory` into a `SandboxClient` for `runLoop`. The factory is
+ * instantiated fresh per `streamPrompt` (mirrors the per-spawn executor lifecycle):
+ * run once on the prompt, emit the terminal result event, tear down.
+ */
+declare function inlineSandboxClient(factory: ExecutorFactory<unknown>): SandboxClient;
+/**
+ * `loopDispatch` — turn `runLoop` into an agent-eval campaign dispatch.
+ *
+ * Without this adapter a consumer wiring `runLoop` into `runProfileMatrix` /
+ * `runCampaign` has to, by hand, every time: (a) build an `ExecCtx` with a
+ * sandbox client, (b) adapt the campaign `DispatchContext.trace` into a
+ * `LoopTraceEmitter` (or lose all loop trace correlation), and (c) remember to
+ * forward the loop's cost + tokens via `ctx.cost` (forgetting it yields a
+ * `{0,0}` cell the backend-integrity guard reads as a stub). Three foot-guns,
+ * the third silent. The fleet's products skipped (c) and fell back to a
+ * `workerRecords[]` side-channel — the exact anti-pattern the substrate exists
+ * to kill.
+ *
+ * `loopDispatch` collapses all three into one typed call:
+ *
+ *   const dispatch = loopDispatch({
+ *     sandboxClient,
+ *     toLoopOptions: (scenario, profile) => ({ driver, agentRun, output, validator, task }),
+ *   })
+ *   await runProfileMatrix({ profiles, scenarios, dispatch, judges, commitSha })
+ *
+ * Usage is reported automatically; trace events are forwarded automatically;
+ * the ctx is built automatically. The seam becomes impossible to mis-wire.
+ *
+ * Typed structurally against the campaign `DispatchContext` (imported type-only
+ * from `@tangle-network/agent-eval/campaign`) — a downward dependency, never an
+ * inversion.
+ */
+/** runLoop options minus the `ctx` (loopDispatch builds the ctx). */
+type LoopOptionsForDispatch<Task, Output, Decision> = Omit<RunLoopOptions<Task, Output, Decision>, 'ctx'>;
+interface LoopDispatchOptions<Task, Output, Decision, TScenario extends Scenario, TArtifact> {
+    /** Sandbox client used for every cell's `runLoop`. Supplied once. */
+    sandboxClient: SandboxClient;
+    /** Build the per-cell runLoop options from the scenario (+ profile, when
+     *  used with `runProfileMatrix`). */
+    toLoopOptions: (scenario: TScenario, profile: AgentProfile$1) => LoopOptionsForDispatch<Task, Output, Decision>;
+    /** Map the finished loop to the artifact the judges score. Default:
+     *  `result.winner?.output`. A loop with no winner yields `undefined` (judges
+     *  skip the cell) — but the loop's token usage is STILL reported, so the
+     *  integrity guard sees real activity. */
+    toArtifact?: (result: LoopResult<Task, Output, Decision>) => TArtifact;
+    /** Forward `loop.*` trace events into the campaign's scoped trace so loop
+     *  spans correlate with the cell. Default true. */
+    forwardTrace?: boolean;
+    /** Cost-meter source label for the loop's spend. Default `'loop'`. */
+    costSource?: string;
+}
+/**
+ * Adapter for `runProfileMatrix` (profile is an axis). Returns a
+ * `ProfileDispatchFn` that runs `runLoop` per (profile, scenario) cell and
+ * reports usage automatically.
+ */
+declare function loopDispatch<Task, Output, Decision, TScenario extends Scenario, TArtifact>(opts: LoopDispatchOptions<Task, Output, Decision, TScenario, TArtifact>): ProfileDispatchFn<TScenario, TArtifact>;
+/**
+ * The general agentic primitive — sequential (depth) and parallel (breadth) over a shared,
+ * checkable artifact, driven through the keystone Supervisor as one recursive `Agent.act`.
+ *
+ * The domain lives behind ONE seam — `AgenticSurface` (open an artifact, list tools, call a tool,
+ * score the artifact, close it). EnterpriseOps implements it (seed a gym DB, MCP tools, SQL
+ * verifier); Commit0/AppWorld/terminal-bench implement it the same way (a repo workspace, shell
+ * tools, the test suite). The drivers below are domain-blind: they run over any surface.
+ *
+ * Two shapes, the agent's POMDP rollout as the unit:
+ *  - DEPTH   one persistent artifact carried across shots. Each shot the agent works the tool loop;
+ *            between shots a trace-analyst (selector≠judge: reads the trajectory, never the score)
+ *            steers the resumed session toward what's unfinished. shot n stands on shot n-1's
+ *            artifact state + history. This is continuation — long-horizon, same artifact.
+ *  - BREADTH K independent artifacts, each a fresh rollout, the deployable verifier picks the best.
+ *
+ * Both are an `Agent` whose `act` spawns leaf shots through `scope.spawn` and reacts via
+ * `scope.next()` — so the conserved budget pool meters them (equal-k by construction), the journal
+ * records the tree, and the same primitive nests. `runAgentic` runs the chosen driver through
+ * `createSupervisor().run`. The leaf (one shot over a handle) is resolved per-spawn from a
+ * surface-closed registry — the open `Executor` seam, not bespoke per-benchmark glue.
+ */
+interface AgenticTask {
+    readonly id: string;
+    readonly systemPrompt: string;
+    readonly userPrompt: string;
+    /** Opaque domain payload the surface reads (EOPS: servers/verifiers/tools). Drivers never read it. */
+    readonly meta?: Record<string, unknown>;
+}
+interface ArtifactHandle {
+    readonly id: string;
+    readonly surface: string;
+    /** Opaque per-artifact context the surface stashes (EOPS: the seeded gym server + db id). */
+    readonly ctx?: unknown;
+}
+interface AgenticTool {
+    readonly type: 'function';
+    readonly function: {
+        name: string;
+        description?: string;
+        parameters: Record<string, unknown>;
+    };
+}
+interface SurfaceScore {
+    passes: number;
+    total: number;
+    /** Checks excluded as malformed (data defect, not the agent). `total === 0` ⇒ unscoreable. */
+    errored: number;
+}
+/** A stateful, checkable environment an agent operates over with tools. Open behind one interface. */
+interface AgenticSurface {
+    readonly name: string;
+    open(task: AgenticTask): Promise<ArtifactHandle>;
+    tools(task: AgenticTask, handle: ArtifactHandle): Promise<AgenticTool[]>;
+    call(handle: ArtifactHandle, name: string, args: Record<string, unknown>): Promise<string>;
+    score(task: AgenticTask, handle: ArtifactHandle): Promise<SurfaceScore>;
+    close(handle: ArtifactHandle): Promise<void>;
+}
+interface AgenticOptions {
+    routerBaseUrl: string;
+    routerKey: string;
+    model: string;
+    temperature?: number;
+    /** Turns the agent may take within ONE shot before the driver intervenes. */
+    innerTurns?: number;
+    /** The depth STEERER's analyst instruction (observe()'s system prompt). The knob a
+     *  prompt optimizer (GEPA) tunes — the analyst IS the steerer. Omitted ⇒ the default. */
+    analystInstruction?: string;
+    /** The critic's model — lets the analyst be a stronger (or cheaper) model than the
+     *  worker. Omitted ⇒ the worker's `model`. */
+    analystModel?: string;
+    /** Across-run learning: when set, the analyst's observe() pass appends trace-derived
+     *  facts here (the flywheel write side). Priming (the read side) is the caller's move —
+     *  query the corpus and fold facts into the task's systemPrompt before runAgentic. */
+    corpus?: Corpus;
+    /** Tags written onto learned facts (and used by the caller's priming query). */
+    corpusTags?: string[];
+}
+type Msg = Record<string, unknown>;
+interface ShotResult {
+    messages: Msg[];
+    score: number;
+    passes: number;
+    total: number;
+    completions: number;
+    toolErrors: number;
+}
+interface AgenticRunResult {
+    /** The strategy name (built-in 'depth'/'breadth' or a custom strategy's name). */
+    mode: string;
+    score: number;
+    resolved: boolean;
+    completions: number;
+    /** DEPTH: score after each shot — the progress-over-rounds curve. BREADTH: best-so-far per rollout. */
+    progression: number[];
+    shots: number;
+    /** The cost vector, stamped by `runAgentic` from the Supervisor's conserved pool: real
+     *  router tokens, priced usd (0 when the model is unpriced — never fabricated), wall ms. */
+    usd: number;
+    ms: number;
+    tokens: {
+        input: number;
+        output: number;
+    };
+}
+/** DEPTH: one persistent artifact, carried across analyst-steered shots. */
+declare function depthDriver(surface: AgenticSurface, task: AgenticTask, opts: AgenticOptions, cfg: {
+    maxShots: number;
+}): Agent<unknown, Outcome<unknown>>;
+/** BREADTH: K independent rollouts (each own artifact), verifier picks the best. */
+declare function breadthDriver(_surface: AgenticSurface, task: AgenticTask, opts: AgenticOptions, cfg: {
+    width: number;
+}): Agent<unknown, Outcome<unknown>>;
+/**
+ * A Strategy is HOW you spend the compute budget to beat the Environment's check — it
+ * builds the driver `Agent` the Supervisor runs. This is the OPEN extension point: a dev
+ * authors their own by implementing `driver()` to return an Agent whose `act()` spawns
+ * shots/analysts via `scope.spawn` / `scope.next` / `scope.send`. The two built-ins are
+ * the reference implementations to copy:
+ *   sample — K INDEPENDENT attempts, keep the best-verifying (best-of-N / resample).
+ *   refine — attempt → observe() reads the trace → steer the next → repeat (iterate).
+ * (A multi-agent "team" is just a Strategy whose driver spawns several different agents.)
+ */
+interface Strategy {
+    readonly name: string;
+    driver(surface: AgenticSurface, task: AgenticTask, opts: AgenticOptions, budget: number): Agent<unknown, Outcome<unknown>>;
+}
+declare const sample: Strategy;
+declare const refine: Strategy;
+/** A role for one shot — multi-agent loops (researcher + engineer, a panel of k
+ *  researchers) give each shot its own system prompt and optionally its own model. */
+interface ShotPersona {
+    /** Replaces the task's systemPrompt for a FRESH shot; on a carried conversation it is
+     *  injected as a hand-off message (the transcript's earlier roles stay intact). */
+    systemPrompt?: string;
+    /** Per-shot model override (e.g. a stronger model for the engineer shot). */
+    model?: string;
+}
+interface ShotSpec {
+    /** present ⇒ continue this artifact (depth); absent ⇒ the shot opens a fresh one (sample/restart). */
+    handle?: ArtifactHandle;
+    messages?: Msg[];
+    steer?: string;
+    persona?: ShotPersona;
+}
+interface StrategyResult {
+    score: number;
+    resolved: boolean;
+    completions: number;
+    progression: number[];
+    shots: number;
+}
+/** Artifact lifecycle a strategy may manage itself — open/close ONLY. Raw `call`/`score`
+ *  are withheld: scores reach the body solely through `shot()`'s ShotResult (the
+ *  harness-verified channel), so a body cannot peek the check or fabricate around it. */
+interface StrategyArtifacts {
+    readonly name: string;
+    open(task: AgenticTask): Promise<ArtifactHandle>;
+    close(handle: ArtifactHandle): Promise<void>;
+}
+/** What a strategy body composes with: the artifact lifecycle, the budget, and the two steps. */
+interface StrategyCtx {
+    /** Open/close artifacts the body manages itself (e.g. one persistent handle for depth). */
+    readonly surface: StrategyArtifacts;
+    readonly task: AgenticTask;
+    readonly opts: AgenticOptions;
+    readonly budget: number;
+    readonly scope: Scope<Outcome<unknown>>;
+    /** Run ONE worker shot; its harness-scored result, or null if it went down. */
+    shot(spec?: ShotSpec): Promise<ShotResult | null>;
+    /** The firewalled critic reads the trajectory → a steer string, or null on COMPLETE/down. */
+    critique(messages: Msg[]): Promise<string | null>;
+}
+/** Author a Strategy from the composable steps — the open, compact way. */
+declare function defineStrategy(name: string, run: (ctx: StrategyCtx) => Promise<StrategyResult>): Strategy;
+/** A NEW strategy, authored from the steps (~20 lines): refine, but when a steered shot
+ *  fails to improve the score it ABANDONS that line and restarts fresh (branch-when-stuck)
+ *  — the widen/MCTS idea the depth-stuck failure motivated. Scored keep-best (the best
+ *  checkpoint across all lines), the deployable metric. This is the "experts build BETTER
+ *  optimizations" path: a new technique, compact, with zero Supervisor ceremony. */
+declare const adaptiveRefine: Strategy;
+/** The explore-then-exploit MIX: spend ⌈budget/2⌉ on independent samples (kept open),
+ *  then refine the best-verifying line with the remaining budget. Sample's basin escape +
+ *  refine's accumulation — the third built-in, authored from the public steps. */
+declare const sampleThenRefine: Strategy;
+interface RunAgenticOptions extends AgenticOptions {
+    surface: AgenticSurface;
+    task: AgenticTask;
+    /** Lifecycle observability — every spawn/settle (shots, analysts) streams here live.
+     *  The seam online watchdogs/route-auditors subscribe to. */
+    hooks?: RuntimeHooks;
+    /** A Strategy (the open way) — author/pass your own. Overrides `mode` when present. */
+    strategy?: Strategy;
+    /** Built-in shorthand: 'depth'→refine, 'breadth'→sample. Default 'depth'. */
+    mode?: 'depth' | 'breadth';
+    /** budget: refine→max shots; sample→rollout width. */
+    budget: number;
+    rootBudget?: Budget;
+}
+/** Run a Strategy through the keystone Supervisor — `Agent.act` over a conserved-budget Scope. */
+declare function runAgentic(opts: RunAgenticOptions): Promise<AgenticRunResult>;
+/**
+ * runBenchmark — the packaged optimization suite. Define a domain by implementing an
+ * `Environment` (open / tools / call / score / close); get the optimization strategies
+ * compared, scored by your own deployable check, with a paired-bootstrap report — free.
+ *
+ * The mental model: you have a TASK + a deployable CHECK + a compute BUDGET. A strategy
+ * is how you spend the budget to beat the check. Two built-ins:
+ *
+ *   sample  — N independent attempts, keep the best-verifying one.   (best-of-N / resample)
+ *   refine  — attempt → a critic reads the trace → steer the next → repeat. (iterate-with-feedback)
+ *
+ * Both run at equal budget through the Supervisor's conserved pool; the headline is the
+ * paired lift of refine over sample. Author your own strategy with `defineStrategy`.
+ */
+/** A checkable task domain — implement these 5 hooks and the suite does the rest. The
+ *  same seam as `AgenticSurface`; `Environment` is the RL/gym-standard name for it. */
+type Environment = AgenticSurface;
+interface BenchmarkConfig {
+    /** The task domain (5 hooks). */
+    environment: Environment;
+    /** The tasks to score across. */
+    tasks: AgenticTask[];
+    /** The worker: model + router + (optional) the critic's instruction (the steerer knob). */
+    worker: AgenticOptions;
+    /** Which strategies to compare. Pass the built-ins (`refine`, `sample`) or your own.
+     *  Default: [sample, refine]. */
+    strategies?: Strategy[];
+    /** Shots (refine) / width (sample) — the equal compute budget per strategy. Default 3. */
+    budget?: number;
+    /** Tasks scored in parallel. Default 3. */
+    concurrency?: number;
+    /** Progress hook — fires as each task settles (the live-monitoring seam: append to a
+     *  progress file, render a tree, stream to a dashboard). `row` is the row for the task
+     *  that settled; `done` counts settled tasks; `total` is presumably the number of
+     *  tasks in the run — confirm against the implementation. */
+    onTask?: (row: BenchmarkTaskRow, done: number, total: number) => void;
+    /** Lifecycle observability — every spawn/settle of every cell's shots/analysts streams
+     *  here live (the watchdog/route-auditor seam, passed through to `runAgentic`). */
+    hooks?: RuntimeHooks;
+}
+interface BenchmarkLift {
+    /** Mean of paired deltas (refine − sample). */
+    mean: number;
+    low: number; // presumably the lower bound of the bootstrap interval on the lift — confirm
+    high: number; // presumably the upper bound of that same interval — confirm
+    n: number; // presumably the number of paired tasks behind the estimate — confirm
+}
+/** One strategy's outcome on one task — the per-task cell an optimizer consumes. */
+interface BenchmarkCell {
+    score: number; // this strategy's score on this task (presumably the verifier's 0..1 scale — confirm)
+    resolved: boolean; // presumably true when the task was fully resolved — confirm
+    /** The progress curve (refine: score per shot; sample: best-so-far per rollout). */
+    progression: number[];
+    usd: number; // cost recorded for this cell; the name suggests US dollars
+    ms: number; // the name suggests milliseconds — confirm exactly what is timed
+    tokens: { // token counts for this cell, split by direction
+        input: number;
+        output: number;
+    };
+}
+interface BenchmarkTaskRow {
+    taskId: string; // identifies the task this row reports on
+    /** Per-strategy cells; absent when the task errored before completing all strategies.
+     *  Keys are presumably strategy names, matching `BenchmarkReport.perStrategy` — confirm. */
+    cells?: Record<string, BenchmarkCell>;
+    /** Why the task was excluded (infra/setup failure) — never silently dropped. */
+    error?: string;
+}
+interface BenchmarkStrategySummary {
+    /** Mean verifier score (0..1). */
+    score: number;
+    /** Fraction of tasks fully resolved. */
+    resolved: number;
+    /** Mean cost vector per task — this field is its dollar component; `ms` is the
+     *  time component. */
+    usd: number;
+    ms: number; // time component of the mean per-task cost vector (milliseconds, per the name)
+}
+interface BenchmarkReport {
+    n: number; // presumably the number of tasks that entered the stats — confirm
+    excluded: number; // presumably the count of tasks left out of the stats (the errored rows in `perTask`) — confirm
+    /** Per-strategy means (keyed by strategy.name). */
+    perStrategy: Record<string, BenchmarkStrategySummary>;
+    /** The full per-task × per-strategy table — the LOSSES an optimizer (GEPA, a
+     *  strategy-author, an operator) consumes. Includes errored tasks with the reason. */
+    perTask: BenchmarkTaskRow[];
+    /** The non-dominated strategies on (score ↑, $/task ↓) — collapse-last, per the canon:
+     *  a strategy that ties on score at half the cost WINS and a scalar would hide it.
+     *  Entries are strategy names. */
+    pareto: string[];
+    /** The headline when both `refine` and `sample` ran: paired-bootstrap lift of refine over sample.
+     *  Optional — absent otherwise. */
+    refineVsSample?: BenchmarkLift;
+}
+/** Run the requested strategies over the tasks, scored by the Environment's own check.
+ *  Resilient: a task whose rollouts fail (transient infra) is excluded from the stats but
+ *  reported in `perTask` with the error — never silently dropped.
+ *
+ *  @param cfg - The environment, tasks, worker and optional strategies/budget/hooks.
+ *  @returns A promise resolving to the `BenchmarkReport` for the run.
+ */
+declare function runBenchmark(cfg: BenchmarkConfig): Promise<BenchmarkReport>;
+/** Pretty-print a report — the "free optimization" verdict, with the cost vector.
+ *
+ *  @param report - A report as produced by `runBenchmark`. Nothing is returned; the
+ *  output destination is not visible from this declaration.
+ */
+declare function printBenchmarkReport(report: BenchmarkReport): void;
+/**
+ * createMcpEnvironment — wrap any MCP server as an `Environment` (the product-adoption
+ * primitive: a product's agent tools are usually already an MCP surface, so the domain
+ * only writes the lifecycle hooks — open a scoped artifact, score it with a deployable
+ * check, close it — and the tool plumbing is derived from the server).
+ *
+ * What the helper owns (the generic 80%, hardened on the EnterpriseOps gym):
+ *   - JSON-RPC `tools/list` → `AgenticTool[]`, with schemas coerced to the
+ *     OpenAI-tool-valid shape (top-level oneOf/anyOf/allOf/enum/not are rejected by
+ *     tool-calling providers; nested combinators are fine).
+ *   - JSON-RPC `tools/call` → the tool's text content (errors surfaced as `ERROR: …`
+ *     strings — a bad call is the agent's outcome, not an infra fault).
+ *   - SSE response parsing (streamable-HTTP MCP servers answer with `data:` lines).
+ *   - Bounded retry with backoff on thrown fetches (transient network ≠ task failure).
+ *
+ * What the domain supplies: `open` (create/seed the per-task artifact and return its
+ * MCP endpoint — url + headers carry the per-artifact scoping, e.g. a database id
+ * header), `score` (the deployable check), and optional `close`/`selectTools`.
+ */
+/** Where a handle's MCP server lives; headers carry per-artifact scoping. */
+interface McpEndpoint {
+    url: string; // address of the MCP server for this handle
+    headers?: Record<string, string>; // optional; carries the per-artifact scoping (e.g. a database id header)
+}
+interface McpEnvironmentOptions {
+    name: string; // name for the environment — presumably surfaced on the resulting `Environment`; confirm
+    /** Create/seed the per-task artifact; return its handle + the MCP endpoint scoped to it. */
+    open(task: AgenticTask): Promise<{
+        handle: ArtifactHandle;
+        endpoint: McpEndpoint;
+    }>;
+    /** The deployable check over the artifact's current state. */
+    score(task: AgenticTask, handle: ArtifactHandle): Promise<SurfaceScore>;
+    /** Teardown (delete the seeded artifact). Optional — omit for stateless servers. */
+    close?(handle: ArtifactHandle): Promise<void>;
+    /** Restrict/order the server's tools per task (e.g. the task's selected_tools). Default: all. */
+    selectTools?(task: AgenticTask, all: AgenticTool[]): AgenticTool[];
+    /** Cap on a tool result's text fed back to the worker. Default 1500 chars. */
+    maxResultChars?: number;
+}
+declare function createMcpEnvironment(opts: McpEnvironmentOptions): Environment; // wraps an MCP server as an `Environment`; `opts` supplies the lifecycle hooks (open/score and optional close/selectTools)
 /**
  * @experimental
  *
@@ -1175,7 +1643,7 @@ declare class FileCorpus implements Corpus {
  * An empty query result returns a fresh COPY of the profile with no instruction change (a valid
  * "nothing learned yet" read, not an error).
  */
-declare function renderCorpusToInstructions(opts: RenderCorpusToInstructionsOptions): Promise<AgentProfile$1>;
+declare function renderCorpusToInstructions(opts: RenderCorpusToInstructionsOptions): Promise<AgentProfile>;
 /**
  * @experimental
@@ -1282,6 +1750,39 @@ declare function trajectoryReport(journal: SpawnJournal, blobs: ResultBlobStore,
  */
 declare function equalKOnCost(arms: ReadonlyArray<EqualKArm>, options?: EqualKOnCostOptions): EqualKVerdict;
+interface PromotionGateOptions {
+    /** The HOLDOUT report — must carry per-task cells for both strategy names. */
+    report: BenchmarkReport;
+    /** The incumbent champion's strategy name. */
+    incumbent: string;
+    /** The challenger's strategy name. */
+    candidate: string;
+    /** The CI lower bound on the paired lift must EXCEED this (score scale). Default 0. */
+    deltaThreshold?: number;
+    /** Minimum paired tasks before significance can be claimed. Default 6 — below that
+     *  the bootstrap CI is too wide to separate a real lift from the per-task noise. */
+    minPairedTasks?: number;
+    /** Bootstrap statistic over the paired deltas. Default 'mean'. */
+    statistic?: 'mean' | 'median';
+    /** Fixed by the substrate by default — the same report always yields the same verdict. */
+    seed?: number;
+    resamples?: number; // presumably the number of bootstrap resamples; the default is not visible here — confirm
+}
+interface PromotionVerdict {
+    promoted: boolean; // the gate's decision for the candidate
+    reason: 'identical-champion' | 'few-tasks' | 'no-margin' | 'significant'; // why the gate decided as it did; the exact trigger for each value is not visible in this declaration
+    /** Paired tasks that carried both strategies' cells. */
+    n: number;
+    /** Paired (candidate − incumbent) lift across the holdout tasks. */
+    lift: {
+        mean: number;
+        median: number;
+        low: number; // presumably the lower bound of the bootstrap CI on the lift — confirm
+        high: number; // presumably the upper bound of that CI — confirm
+    };
+}
+declare function promotionGate(opts: PromotionGateOptions): PromotionVerdict; // synchronous: judges `opts.candidate` against `opts.incumbent` using the holdout report in `opts`
 /**
  * Bridge a finished `runLoop` into an agent-eval campaign / profile-matrix
  * dispatch.
@@ -1711,6 +2212,190 @@ interface OpenSandboxRunOptions {
  */
 declare function openSandboxRun<Out>(client: SandboxClient, options: OpenSandboxRunOptions, deliverable: Deliverable<Out>): Promise<SandboxRun<Out>>;
+/**
+ * authorStrategy — the agent-authored layer as a package primitive (software-3.0): an
+ * LLM reads a benchmark's per-task LOSSES + the defineStrategy contract and writes a NEW
+ * optimization strategy as code; the caller gates it like any human-built candidate
+ * (runBenchmark + a frozen holdout).
+ *
+ * Structurally safe by construction: the authored body composes shot()/critique() and
+ * spends through the Supervisor's conserved pool — it can be wrong, but it cannot
+ * Goodhart the check (it never sees the verifiers) and it cannot win by overspending.
+ *
+ * The authored module is written to `outDir` and dynamically imported — run under a
+ * TS-capable loader (tsx) since models often emit type annotations.
+ */
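+/** The default contract text shown to a strategy author (the default for
+ *  `AuthorStrategyOptions.contract`): the compressed consumable a skill carries —
+ *  everything an author needs to emit a loop. */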
+declare const strategyAuthorContract = "\nYou author an OPTIMIZATION STRATEGY for an agentic loop system. A strategy decides how to\nspend a compute budget to beat a task's deployable check. You compose exactly two steps:\n\n  shot(spec?: { handle?, messages?, steer?, persona? }): Promise<ShotResult | null>\n    Runs ONE worker attempt (a bounded tool loop) over an artifact.\n    - omit handle  => the shot opens its OWN fresh artifact and closes it after (a sample).\n    - pass handle  => the shot CONTINUES that artifact (state accumulates across shots).\n    - messages     => the carried conversation (pass the previous ShotResult.messages to continue).\n    - steer        => a corrective instruction injected before the shot.\n    - persona      => { systemPrompt?, model? } \u2014 give THIS shot its own role and/or model\n      (multi-agent strategies: a researcher shot then an engineer shot, a panel of k\n      personas over one budget). On a fresh shot the systemPrompt replaces the task's; on\n      a carried conversation it arrives as a hand-off message. Same conserved budget.\n    ShotResult = { messages, score (0..1 on the task's check), passes, total, completions, toolErrors }\n    Returns null if the attempt failed infra-wise.\n\n  critique(messages): Promise<string | null>\n    A firewalled trace-analyst reads the attempt's trajectory and returns ONE corrective\n    instruction (or null when it judges the work complete). Costs ~1 completion.\n\n  surface.open(task) / surface.close(handle)\n    Open a persistent artifact you manage yourself (remember to close in a finally).\n\nRules:\n- Stay within ~budget total shots; every shot/critique spends from a conserved pool.\n- For a FRESH attempt OMIT `messages` entirely (never pass `[]` \u2014 an empty array is a\n  fresh conversation too, but be explicit). 
To CONTINUE, pass the previous\n  ShotResult.messages unchanged.\n- Return { score, resolved, completions, progression, shots } \u2014 score = the BEST checkpoint\n  you reached (keep-best, never final-state), progression = score after each shot.\n- The module must be EXACTLY this shape (no other imports, no commentary outside code):\n\nimport { defineStrategy } from '@tangle-network/agent-runtime/loops'\nexport default defineStrategy('your-strategy-name', async ({ surface, task, budget, shot, critique }) => {\n  // your composition\n})\n";
+interface AuthorStrategyOptions {
+    /** The model-call seam (agent-eval `createChatClient`). */
+    chat: ChatClient;
+    model?: string; // the primary author model; what is used when omitted is not visible here
+    /** A NAMED fallback author tried once when the primary call fails or returns no code
+     *  block (thinking models time out at the edge on long authoring prompts, or return
+     *  empty content without `maxTokens`). Opt-in — absent means the primary's failure
+     *  propagates. */
+    fallbackModel?: string;
+    /** The contract text shown to the author. Default `strategyAuthorContract`. The
+     *  meta-optimization coordinate: a GEPA/skill loop can evolve this text and gate each
+     *  variant on the same frozen holdout as any strategy. */
+    contract?: string;
+    /** The environment the losses came from (orientation only — never the verifiers). */
+    environmentName: string;
+    /** The per-task losses table (e.g. JSON.stringify(report.perTask)) — the gradient. */
+    lossesJson: string;
+    /** The budget the strategy must respect (shots/width). */
+    budget: number;
+    /** Where the authored module file is written (created if missing). */
+    outDir: string;
+    temperature?: number; // presumably the sampling temperature for the authoring call — confirm
+    /** Completion cap — required by thinking-model authors that stream reasoning first. */
+    maxTokens?: number;
+    signal?: AbortSignal; // presumably cancels the authoring call when aborted — confirm
+}
+/** Static CONTRACT lint over an authored strategy module — the module-boundary
+ *  enforcement of the harness's two measurement invariants:
+ *    - author blindness: the only import allowed is the loops surface. A body that could
+ *      reach the filesystem, network, or process could read or mutate verifier/artifact
+ *      state outside the brokered shots, and the harness-verified score would stop
+ *      meaning "what the shots achieved".
+ *    - conserved dose: no out-of-band compute (fetch/require/eval) — every unit a
+ *      strategy spends is metered by the Supervisor's pool, which is what makes
+ *      equal-budget comparisons between strategies valid.
+ *  A lint, not a sandbox: its job is keeping the benchmark numbers interpretable.
+ *
+ *  @param code - Source text of the authored module to lint.
+ *  Returns nothing; presumably throws when the module violates the contract (the
+ *  `assert` naming suggests so) — confirm against the implementation. */
+declare function assertStrategyContract(code: string): void;
+interface AuthoredStrategy {
+    strategy: Strategy; // the loaded strategy
+    file: string; // presumably the path of the module file written under `outDir` — confirm
+    code: string; // presumably the authored module's source text — confirm
+}
+/** Author + load a strategy from losses. Throws when the author emits no loadable module;
+ *  with `fallbackModel` set, the named fallback gets one attempt first.
+ *
+ *  @param opts - The chat seam, losses, budget, output directory and optional model settings.
+ *  @returns A promise resolving to the loaded strategy together with its `file` and `code`.
+ */
+declare function authorStrategy(opts: AuthorStrategyOptions): Promise<AuthoredStrategy>;
+/**
+ * runStrategyEvolution — the multi-generation strategy search: per generation the system
+ * authors a POPULATION of candidate strategies from the current tournament's losses,
+ * plays them against the incumbent at equal budget, and advances a champion; one final
+ * promotion decision runs on a NEVER-BEFORE-USED holdout slice through `promotionGate`.
+ *
+ * Measurement invariants (the reasons this design is shaped the way it is):
+ *  - The author sees TRAIN losses only. The holdout slice is drawn fresh (disjoint task
+ *    offsets) after all authoring is done — one promotion decision, one untouched slice,
+ *    so adaptive reuse of evaluation data never enters the verdict.
+ *  - Every tournament runs at the same per-strategy budget through the conserved pool;
+ *    candidates cannot win by overspending.
+ *  - Champion selection within the search is a SEARCH policy (configurable, default
+ *    cost-aware: ties on score go to the cheapest strategy — a scalar hides a strategy
+ *    that ties at half the cost). The promotion verdict never comes from search
+ *    selection; it comes from the gate on the fresh slice.
+ *  - Every authored artifact's description length (gzip bits) is recorded, so the
+ *    artifact-complexity-vs-holdout-gap relation is analyzable from any run's report.
+ *
+ * Lineage fields (`parent`, `generation`) are recorded on every archive node so a
+ * descendant-productivity parent-selection policy can be added without changing the
+ * report schema; the v1 search authors from the latest tournament's losses.
+ */
+interface EvolutionAuthor {
+    /** The model-call seam (agent-eval `createChatClient`). */
+    chat: ChatClient;
+    model?: string; // same name as `AuthorStrategyOptions.model` — presumably forwarded to `authorStrategy`; confirm
+    fallbackModel?: string; // same name as `AuthorStrategyOptions.fallbackModel` — presumably forwarded; confirm
+    temperature?: number; // same name as `AuthorStrategyOptions.temperature` — presumably forwarded; confirm
+    maxTokens?: number; // same name as `AuthorStrategyOptions.maxTokens` — presumably forwarded; confirm
+}
+type ChampionPolicy = 'score' | 'costAware'; // 'score': best mean score; 'costAware': cheapest among scores within epsilon of the best
+interface StrategyEvolutionConfig {
+    environment: Environment; // the task domain the tournaments run in
+    /** Task supply by DISJOINT slice: `(offset, n)` must return n tasks unique to that
+     *  offset range. Train draws [0, trainN); the holdout draws [trainN + holdoutOffset,
+     *  …) — tasks the search never touched. */
+    tasks: (offset: number, n: number) => Promise<AgenticTask[]>;
+    trainN: number; // size of the train slice, drawn from offset 0
+    holdoutN: number; // presumably the number of tasks drawn for the holdout slice — confirm
+    /** Extra offset past the train slice for the holdout draw (rotate across runs). */
+    holdoutOffset?: number;
+    worker: AgenticOptions; // worker options — presumably passed to each tournament's benchmark; confirm
+    author: EvolutionAuthor; // model settings for the strategy author
+    /** Rollouts (sample) / shots (refine) per strategy per task. Default 3. */
+    budget?: number;
+    concurrency?: number; // presumably tasks scored in parallel, as in `BenchmarkConfig.concurrency` — confirm
+    /** Author→tournament rounds after gen0. Default 2. */
+    generations?: number;
+    /** Authored candidates per generation. Default 2. */
+    populationSize?: number;
+    /** The gen0 field. Default [sample, refine, sampleThenRefine]. */
+    baselines?: Strategy[];
+    /** Search-side champion selection. Default 'costAware'. */
+    champion?: ChampionPolicy;
+    /** Score band treated as a tie under 'costAware'. Default 0.01. */
+    championEpsilon?: number;
+    /** Where authored modules are written. */
+    outDir: string;
+    /** Promotion-gate evidence floor (paired holdout tasks). */
+    minPairedTasks?: number;
+    onTask?: (phase: string, row: BenchmarkTaskRow, done: number, total: number) => void; // progress hook; `phase` presumably labels which tournament the row belongs to — confirm
+    hooks?: RuntimeHooks; // presumably passed through to the underlying runs, as in `BenchmarkConfig.hooks` — confirm
+}
+interface ChampionPick {
+    name: string; // the picked strategy's name
+    score: number; // presumably that strategy's mean score in the tournament report — confirm
+    usd: number; // presumably that strategy's mean $ per task in the same report — confirm
+}
+interface EvolutionCandidate {
+    name: string; // the candidate's name
+    file?: string; // presumably the path of the authored module file — confirm
+    gzipBits?: number; // the authored artifact's description length, in gzip bits
+    codeChars?: number; // presumably the character count of the authored code — confirm
+    /** Present when this author attempt failed (recorded, never silent). */
+    error?: string;
+}
+interface EvolutionGeneration {
+    generation: number; // generation index; whether numbering starts at 0 or 1 is not visible here
+    candidates: EvolutionCandidate[]; // the author attempts of this generation, failed ones included
+    report: BenchmarkReport; // presumably this generation's tournament report — confirm
+    champion: ChampionPick; // presumably the champion selected from `report` — confirm
+}
+interface EvolutionArchiveNode {
+    name: string; // the strategy's name
+    source: 'baseline' | 'authored'; // whether the node is a supplied baseline or an authored candidate
+    generation: number; // lineage field: the generation this node belongs to
+    /** The champion whose tournament losses this candidate was authored from. */
+    parent?: string;
+    gzipBits?: number; // description length of the authored artifact, in gzip bits
+    file?: string; // presumably the path of the authored module file — confirm
+    /** Latest measured tournament result — 0 until the node's first tournament settles
+     *  (an authored node is created before its generation's benchmark runs). */
+    score: number;
+    usd: number; // presumably the cost counterpart of `score` from the same tournament — confirm
+}
+interface EvolutionReport {
+    gen0: BenchmarkReport; // presumably the tournament report for the gen0 (baseline) field — confirm
+    gen0Champion: ChampionPick; // presumably the champion selected from `gen0` — confirm
+    generations: EvolutionGeneration[]; // one entry per author→tournament round after gen0
+    archive: EvolutionArchiveNode[]; // archive nodes carrying the lineage fields (`parent`, `generation`)
+    finalChampion: ChampionPick; // presumably the search's last champion, i.e. the one taken to the holdout — confirm
+    holdout: BenchmarkReport; // presumably the report measured on the fresh holdout slice — confirm
+    verdict: PromotionVerdict; // the promotion decision measured on the holdout
+    /** SEARCH TELEMETRY, not evidence: each entry is that generation's own train-slice
+     *  re-measurement, so cross-generation deltas mix true drift with run-to-run variance
+     *  (entries are unpaired across generations). The only evidence-grade comparison in
+     *  this report is `verdict` — both finalists measured fresh, paired, on the holdout. */
+    trajectory: Array<{
+        generation: number;
+        champion: string;
+        score: number;
+        usd: number;
+    }>;
+}
+/** Search-side champion selection over a tournament report. 'score' takes the best mean
+ *  score (ties → field order). 'costAware' treats scores within `epsilon` of the best as
+ *  tied and takes the cheapest — the (score, $) Pareto rule collapsed to one pick.
+ *
+ *  @param report - The tournament report to pick from.
+ *  @param fieldOrder - Strategy names in field order; breaks ties under 'score'.
+ *  @param policy - Which selection rule to apply.
+ *  @param epsilon - Score band treated as a tie under 'costAware'.
+ *  @returns The picked champion.
+ */
+declare function selectChampion(report: BenchmarkReport, fieldOrder: string[], policy: ChampionPolicy, epsilon: number): ChampionPick;
+declare function runStrategyEvolution(cfg: StrategyEvolutionConfig): Promise<EvolutionReport>; // the multi-generation strategy search; resolves to the full `EvolutionReport`, holdout verdict included
 /**
  * @experimental
  *
@@ -1869,6 +2554,34 @@ interface BridgeSeam {
     agentProfile?: Record<string, unknown>;
     timeoutMs?: number;
 }
+/** An OpenAI-shape function tool the model may call. */
+interface ToolSpec {
+    type: 'function';
+    function: {
+        name: string;
+        description?: string;
+        parameters: unknown; // presumably a JSON Schema object, as in OpenAI function tools — typed `unknown`, so confirm
+    };
+}
+/**
+ * Router seam WITH tool use — the tool-using router backend. Same direct
+ * OpenAI-compatible endpoint as `RouterSeam`, but each turn passes `tools`; when
+ * the model emits tool_calls they run via `executeToolCall` ON THIS HOST and the
+ * results fold back as `tool` messages, repeating until the model answers without
+ * a tool or `maxTurns` is hit. A real agentic loop, OFF-BOX — no sandbox, so it
+ * is unaffected by a box's egress allowlist. One turn = one completion = the
+ * equal-compute unit. `executeToolCall` receives the task so per-task tool
+ * surfaces (e.g. a gym keyed by task) can dispatch correctly.
+ */
+interface RouterToolsSeam {
+    routerBaseUrl: string; // base URL of the OpenAI-compatible endpoint
+    routerKey: string; // presumably the credential sent to that endpoint — confirm
+    model?: string; // model to request; what is used when omitted is not visible here
+    tools: ReadonlyArray<ToolSpec>; // the tools passed to the model on each turn
+    executeToolCall: (name: string, args: Record<string, unknown>, task: unknown) => Promise<string>; // runs one emitted tool call on this host; its result folds back as a `tool` message
+    /** Max inference turns (default 4). */
+    maxTurns?: number;
+}
 /**
  * The single built-in executor entrypoint. The backend is DATA — the cost dial a
  * profile, an experiment config, or a replay journal can name — not an import
@@ -1879,6 +2592,8 @@ interface BridgeSeam {
 type ExecutorConfig = ({
     backend: 'router';
 } & RouterSeam) | ({
+    backend: 'router-tools';
+} & RouterToolsSeam) | ({
     backend: 'bridge';
 } & BridgeSeam) | ({
     backend: 'cli';
@@ -2016,6 +2731,47 @@ declare function createSupervisor<Task, Out>(): Supervisor<Task, Out>;
  */
 declare function createRootHandle<Out>(): RootHandle<Out>;
+/**
+ * createVerifierEnvironment — ANY checkable task as an `Environment`, no tool surface
+ * required. The generalization piece: EOPS/commit0-style domains have tools that mutate
+ * an external artifact, but math problems, legal drafts, creative briefs, GTM copy, and
+ * QA tasks have a different shape — the artifact IS the worker's answer, and the domain
+ * is defined by one function: the deployable check over that answer.
+ *
+ *   const gsm8k = createVerifierEnvironment({
+ *     name: 'gsm8k',
+ *     check: (task, answer) => ({
+ *       passes: extractFinalNumber(answer) === task.meta?.answer ? 1 : 0,
+ *       total: 1,
+ *       errored: 0,
+ *     }),
+ *   })
+ *   await runBenchmark({ environment: gsm8k, tasks, worker })   // sample vs refine on math
+ *
+ * The worker gets one built-in tool — `submit_answer` — plus any read-only domain tools
+ * the caller adds (a calculator, a retrieval call, a style guide lookup). Every
+ * submission is kept; `score()` checks the BEST submission (keep-best is the measured
+ * law: workers reach correct answers then revise past them). The refine strategy's
+ * critic reads the submission trajectory like any other trace, so iterate-with-feedback
+ * works unchanged on answer domains.
+ *
+ * The check can be graded (passes/total expresses partial credit — rubric points,
+ * sub-answers, unit-test counts), and MUST be deployable (computable without an oracle
+ * at serve time): exact/numeric match, schema validation, a compiled rubric — not a
+ * peek at held-out labels the production system wouldn't have.
+ */
+interface VerifierEnvironmentOptions {
+    name: string; // name for the environment — presumably surfaced on the resulting `Environment`; confirm
+    /** The deployable check over a submitted answer. Graded via passes/total. May be
+     *  synchronous or return a promise. */
+    check(task: AgenticTask, answer: string): Promise<SurfaceScore> | SurfaceScore;
+    /** Extra domain tools (read-only helpers: calculator, retrieval, style lookup). */
+    extraTools?: AgenticTool[];
+    /** Executes the extra tools. Required when `extraTools` is set (the type does not
+     *  enforce this pairing). May be synchronous or return a promise. */
+    callExtra?(task: AgenticTask, name: string, args: Record<string, unknown>): Promise<string> | string;
+}
+declare function createVerifierEnvironment(opts: VerifierEnvironmentOptions): Environment; // builds an `Environment` whose artifact is the worker's submitted answer, scored by `opts.check`
 /** Command runner seam. Host code can use `localShell`; sandbox code can wrap `box.exec`. */
 type Shell = (args: ReadonlyArray<string>, cwd?: string) => Promise<{
     stdout: string;
@@ -2043,5 +2799,10 @@ interface GitWorkspaceOptions {
     readonly noHooks?: boolean;
 }
 declare function gitWorkspace(opts: GitWorkspaceOptions): Workspace;
+/** A jj-backed `Workspace` (Jujutsu, colocated with git for the durable remote).
+ *  Same port, same `Shell` — a drop-in for `gitWorkspace`. jj suits agent loops:
+ *  no staging area, and a first-class operation log (native resume/undo). Live use
+ *  requires `jj` on the `Shell`'s host.
+ *
+ *  @param opts - The same `GitWorkspaceOptions` shape that `gitWorkspace` accepts.
+ *  @returns A `Workspace`.
+ */
+declare function jjWorkspace(opts: GitWorkspaceOptions): Workspace;
-export { Agent, AgentRunSpec, AgentSpec, type AssertTraceDerivedFindings, type BridgeSeam, Budget, type BudgetPool, type BudgetReadout, type CheckpointCapableBox, type CliSeam, type CombinatorShape, type Corpus, type CorpusFilter, type CorpusRecord, type CreateScopeAnalystOptions, type CriuCapableClient, type DefinePersona, type DefinePersonaInput, type Deliverable, type EqualKArm, type EqualKOnCost, type EqualKOnCostOptions, type EqualKVerdict, ExecCtx, type ExecutorConfig, ExecutorFactory, ExecutorRegistry, type Fanout, type FanoutOptions, type FanoutSynthesis, FileCorpus, FileResultBlobStore, FileSpawnJournal, type FlatWidenGate, type ForkCapableBox, type GitWorkspaceOptions, InMemoryCorpus, InMemoryResultBlobStore, InMemorySpawnJournal, Iteration, type LoopDispatchOptions, type LoopOptionsForDispatch, LoopResult, type LoopShape, LoopTokenUsage, type LoopUntil, type LoopUntilSpec, type LoopUntilState, NodeId, type Observation, type ObserveInput, type ObserveOptions, type OpenSandboxRunOptions, type Outcome, type Panel, type PanelJudge, type PanelSpec, type PanelVerdict, type Persona, type PersonaContext, type PersonaExecutors, type Pipeline, type PipelineStage, type RenderCorpusToInstructions, type RenderCorpusToInstructionsOptions, type ReservationTicket, ResultBlobStore, RootHandle, type RouterSeam, RunLoopOptions, type RunPersonified, type RunPersonifiedOptions, type SandboxCapabilities, SandboxClient, type SandboxLineage, type SandboxLineageHandle, type SandboxRun, type SandboxSeam, Scope, type ScopeAnalyst, type ScopeAnalyzeInput, type ScopeWidenGate, type SessionCapableBox, Settled, type ShapeBudget, type ShapeContext, type ShapeRegistry, type Shell, SpawnEvent, SpawnJournal, Spend, type SteerContext, SupervisedResult, Supervisor, type TrajectoryNode, type TrajectoryReport, type TrajectoryReportFn, type TrajectoryReportOptions, TreeView, type TurnResult, UsageEvent, type UsageSink, type Verify, type VerifySpec, type Widen, type WidenDecision, type 
WidenLineage, type WidenSpec, type Workspace, type WorkspaceCommit, acquireSandbox, assertTraceDerivedFindings, buildSteerContext, builtinShapes, contentAddress, createBudgetPool, createExecutor, createExecutorRegistry, createRootHandle, createSandboxLineage, createScope, createScopeAnalyst, createShapeRegistry, createSupervisor, definePersona, equalKOnCost, extractLlmCallEvent, fanout, flatWidenGate, gitWorkspace, inlineSandboxClient, localShell, loopDispatch, loopUntil, mapSandboxEvent, materializeTreeView, observe, openSandboxRun, panel, pipeline, probeSandboxCapabilities, registerShape, renderCorpusToInstructions, renderReport, replaySpawnTree, reportLoopUsage, runPersonified, settledToIteration, spendFromUsageEvents, trajectoryReport, verify, widen };
+export { Agent, AgentRunSpec, AgentSpec, type AgenticOptions, type AgenticRunResult, type AgenticSurface, type AgenticTask, type AgenticTool, type ArtifactHandle, type AssertTraceDerivedFindings, type AuditIntentInput, type AuditIntentOptions, type AuthorStrategyOptions, type AuthoredStrategy, type BenchmarkCell, type BenchmarkConfig, type BenchmarkLift, type BenchmarkReport, type BenchmarkStrategySummary, type BenchmarkTaskRow, type BridgeSeam, Budget, type BudgetPool, type BudgetReadout, type ChampionPick, type ChampionPolicy, type CheckpointCapableBox, type CliSeam, type CombinatorShape, type Corpus, type CorpusFilter, type CorpusRecord, type CreateScopeAnalystOptions, type CriuCapableClient, type DefinePersona, type DefinePersonaInput, type Deliverable, type Environment, type EqualKArm, type EqualKOnCost, type EqualKOnCostOptions, type EqualKVerdict, type EvolutionArchiveNode, type EvolutionAuthor, type EvolutionCandidate, type EvolutionGeneration, type EvolutionReport, ExecCtx, type ExecutorConfig, ExecutorFactory, ExecutorRegistry, type Fanout, type FanoutOptions, type FanoutSynthesis, FileCorpus, FileResultBlobStore, FileSpawnJournal, type FlatWidenGate, type ForkCapableBox, type GitWorkspaceOptions, type HarvestCorpusOptions, type HarvestFailure, type HarvestReport, InMemoryCorpus, InMemoryResultBlobStore, InMemorySpawnJournal, type IntentAudit, Iteration, type LoopDispatchOptions, type LoopOptionsForDispatch, LoopResult, type LoopShape, LoopTokenUsage, type LoopUntil, type LoopUntilSpec, type LoopUntilState, type McpEndpoint, type McpEnvironmentOptions, NodeId, type Observation, type ObserveInput, type ObserveOptions, type OpenSandboxRunOptions, type Outcome, type Panel, type PanelJudge, type PanelSpec, type PanelVerdict, type Persona, type PersonaContext, type PersonaExecutors, type Pipeline, type PipelineStage, type PromotionGateOptions, type PromotionVerdict, type RenderCorpusToInstructions, type RenderCorpusToInstructionsOptions, type 
ReservationTicket, ResultBlobStore, RootHandle, type RouterSeam, type RouterToolsSeam, type RunAgenticOptions, RunLoopOptions, type RunPersonified, type RunPersonifiedOptions, type SandboxCapabilities, SandboxClient, type SandboxLineage, type SandboxLineageHandle, type SandboxRun, type SandboxSeam, Scope, type ScopeAnalyst, type ScopeAnalyzeInput, type ScopeWidenGate, type SessionCapableBox, Settled, type ShapeBudget, type ShapeContext, type ShapeRegistry, type Shell, type ShotPersona, type ShotSpec, SpawnEvent, SpawnJournal, Spend, type SteerContext, type Strategy, type StrategyCtx, type StrategyEvolutionConfig, type StrategyResult, SupervisedResult, Supervisor, type SurfaceScore, type ToolSpec, type TrajectoryNode, type TrajectoryReport, type TrajectoryReportFn, type TrajectoryReportOptions, TreeView, type TurnResult, UsageEvent, type UsageSink, type VerifierEnvironmentOptions, type Verify, type VerifySpec, type Widen, type WidenDecision, type WidenLineage, type WidenSpec, type Workspace, type WorkspaceCommit, acquireSandbox, adaptiveRefine, assertStrategyContract, assertTraceDerivedFindings, auditIntent, authorStrategy, breadthDriver, buildSteerContext, builtinShapes, contentAddress, createBudgetPool, createExecutor, createExecutorRegistry, createMcpEnvironment, createRootHandle, createSandboxLineage, createScope, createScopeAnalyst, createShapeRegistry, createSupervisor, createVerifierEnvironment, defaultAnalystInstruction, defaultAuditorInstruction, definePersona, defineStrategy, depthDriver, equalKOnCost, extractLlmCallEvent, fanout, flatWidenGate, gitWorkspace, harvestCorpus, inlineSandboxClient, jjWorkspace, localShell, loopDispatch, loopUntil, mapSandboxEvent, materializeTreeView, observe, openSandboxRun, panel, pipeline, printBenchmarkReport, probeSandboxCapabilities, promotionGate, refine, registerShape, renderCorpusToInstructions, renderReport, replaySpawnTree, reportLoopUsage, runAgentic, runBenchmark, runPersonified, runStrategyEvolution, sample, 
sampleThenRefine, selectChampion, settledToIteration, spendFromUsageEvents, strategyAuthorContract, trajectoryReport, verify, widen };