@tangle-network/agent-runtime 0.47.0 → 0.49.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. package/README.md +79 -15
  2. package/dist/agent.js +1 -1
  3. package/dist/chunk-GHX7XOJ2.js +433 -0
  4. package/dist/chunk-GHX7XOJ2.js.map +1 -0
  5. package/dist/{chunk-T4OQQEE3.js → chunk-IQS4HI3F.js} +14 -5
  6. package/dist/chunk-IQS4HI3F.js.map +1 -0
  7. package/dist/{chunk-72JQCHOZ.js → chunk-PXUTIMGJ.js} +2318 -237
  8. package/dist/chunk-PXUTIMGJ.js.map +1 -0
  9. package/dist/{chunk-MGFEUYOH.js → chunk-U2VEWKKK.js} +3 -3
  10. package/dist/{chunk-JNPK46YH.js → chunk-VIEDXELL.js} +408 -6
  11. package/dist/chunk-VIEDXELL.js.map +1 -0
  12. package/dist/{chunk-VR4JIC5H.js → chunk-XTEZ3YJ4.js} +2 -2
  13. package/dist/index.d.ts +29 -4
  14. package/dist/index.js +109 -21
  15. package/dist/index.js.map +1 -1
  16. package/dist/kb-gate-CsXpNRk7.d.ts +1145 -0
  17. package/dist/{loop-runner-bin-DEm4roYF.d.ts → loop-runner-bin-Cgn0A-NW.d.ts} +1 -1
  18. package/dist/loop-runner-bin.d.ts +2 -2
  19. package/dist/loop-runner-bin.js +3 -3
  20. package/dist/loops.d.ts +3 -3
  21. package/dist/loops.js +57 -1
  22. package/dist/mcp/bin.js +187 -24
  23. package/dist/mcp/bin.js.map +1 -1
  24. package/dist/mcp/index.d.ts +28 -125
  25. package/dist/mcp/index.js +28 -6
  26. package/dist/mcp/index.js.map +1 -1
  27. package/dist/platform.js +2 -2
  28. package/dist/platform.js.map +1 -1
  29. package/dist/runtime.d.ts +1100 -62
  30. package/dist/runtime.js +57 -1
  31. package/dist/{types-Cbx3dNK5.d.ts → types-BpDfCPUp.d.ts} +1 -1
  32. package/dist/workflow.js +1 -1
  33. package/package.json +7 -6
  34. package/dist/chunk-5YDS7BLC.js +0 -218
  35. package/dist/chunk-5YDS7BLC.js.map +0 -1
  36. package/dist/chunk-72JQCHOZ.js.map +0 -1
  37. package/dist/chunk-JNPK46YH.js.map +0 -1
  38. package/dist/chunk-T4OQQEE3.js.map +0 -1
  39. package/dist/kb-gate-51BlLlVM.d.ts +0 -529
  40. /package/dist/{chunk-MGFEUYOH.js.map → chunk-U2VEWKKK.js.map} +0 -0
  41. /package/dist/{chunk-VR4JIC5H.js.map → chunk-XTEZ3YJ4.js.map} +0 -0
package/dist/runtime.d.ts CHANGED
@@ -1,16 +1,16 @@
1
- import { AgentProfile as AgentProfile$1, BackendType, CreateSandboxOptions, SandboxInstance, SandboxEvent } from '@tangle-network/sandbox';
1
+ import { AgentProfile, BackendType, CreateSandboxOptions, SandboxInstance, SandboxEvent } from '@tangle-network/sandbox';
2
2
  export { AgentProfile, CreateSandboxOptions, SandboxEvent, SandboxInstance } from '@tangle-network/sandbox';
3
- import { R as ResultBlobStore, a as SpawnJournal, N as NodeId, b as SpawnEvent, T as TreeView, c as Settled, E as ExecutorFactory, d as AgentSpec, e as ExecutorRegistry, B as Budget, A as Agent, f as RootHandle, g as SupervisedResult, h as Spend, S as Scope, U as UsageEvent, i as Supervisor } from './types-Cbx3dNK5.js';
4
- export { j as Executor, k as ExecutorContext, l as ExecutorResult, H as Handle, m as NodeSnapshot, n as NodeStatus, o as Restart, p as RootSignal, q as Runtime, r as SpawnOpts, s as SupervisorOpts, W as WidenGate } from './types-Cbx3dNK5.js';
3
+ import { R as ResultBlobStore, a as SpawnJournal, N as NodeId, b as SpawnEvent, T as TreeView, c as Settled, d as AgentSpec, E as ExecutorRegistry, B as Budget, A as Agent, e as RootHandle, f as SupervisedResult, g as Spend, S as Scope, h as ExecutorFactory, U as UsageEvent, i as Supervisor } from './types-BpDfCPUp.js';
4
+ export { j as Executor, k as ExecutorContext, l as ExecutorResult, H as Handle, m as NodeSnapshot, n as NodeStatus, o as Restart, p as RootSignal, q as Runtime, r as SpawnOpts, s as SupervisorOpts, W as WidenGate } from './types-BpDfCPUp.js';
5
+ import { R as RuntimeHooks } from './runtime-hooks-C7JwKb9E.js';
6
+ import { ChatClient, AnalystFinding, DefaultVerdict, AgentProfile as AgentProfile$1 } from '@tangle-network/agent-eval';
7
+ export { DefaultVerdict } from '@tangle-network/agent-eval';
5
8
  export { A as AnalyzeInput, a as CompletionAnalyst, b as CompletionEvidence, c as CompletionPolicy, d as CompletionVerdict, C as CreateDriverOptions, D as DriverDecision, P as PlannerContext, e as TopologyMove, T as TopologyPlanner, f as completionAuthorizes, g as createDriver, h as deterministicCompletion, r as renderAnalyses, s as sentinelCompletion, i as stopSentinel } from './driver-DYU2sgHr.js';
6
9
  import { S as SandboxClient, b as LoopResult, c as LoopTokenUsage, R as RuntimeStreamEvent, A as AgentRunSpec, E as ExecCtx, I as Iteration } from './types-nBMuollC.js';
7
10
  export { D as Driver, C as LoopDecisionPayload, F as LoopEndedPayload, G as LoopIterationDispatchPayload, H as LoopIterationEndedPayload, J as LoopIterationStartedPayload, a as LoopLineageOptions, M as LoopPlanDescription, N as LoopPlanPayload, f as LoopSandboxPlacement, P as LoopStartedPayload, Q as LoopTeardownFailedPayload, e as LoopTraceEmitter, T as LoopTraceEvent, L as LoopWinner, O as OutputAdapter, U as ValidationCtx, V as Validator } from './types-nBMuollC.js';
8
- import { AgentProfile, AnalystFinding, DefaultVerdict, ChatClient } from '@tangle-network/agent-eval';
9
- export { DefaultVerdict } from '@tangle-network/agent-eval';
10
11
  import { Scenario, ProfileDispatchFn } from '@tangle-network/agent-eval/campaign';
11
12
  import { R as RunLoopOptions } from './run-loop-DvD4aGiE.js';
12
13
  export { c as createSandboxForSpec, d as defaultSelectWinner, r as runLoop } from './run-loop-DvD4aGiE.js';
13
- import { R as RuntimeHooks } from './runtime-hooks-C7JwKb9E.js';
14
14
 
15
15
  /**
16
16
  * @experimental
@@ -114,66 +114,200 @@ declare function replaySpawnTree(journal: SpawnJournal, blobs: ResultBlobStore,
114
114
  declare function materializeTreeView(events: SpawnEvent[]): TreeView;
115
115
 
116
116
  /**
117
- * Adapt an `ExecutorFactory` into a `SandboxClient` for `runLoop`. The factory is
118
- * instantiated fresh per `streamPrompt` (mirrors the per-spawn executor lifecycle):
119
- * run once on the prompt, emit the terminal result event, tear down.
117
+ * createWaterfallCollector — 100% trajectory observability from the lifecycle stream:
118
+ * every spawn/settle (shots, analysts, nested agents) becomes one timed, billed span.
119
+ * The sum of spans IS the run's cost story — what each step cost in dollars, tokens,
120
+ * and wall-clock, rendered as a text waterfall or exported as structured rows for any
121
+ * chart. Attach the collector's `hooks` to `runAgentic`/`runBenchmark`; spans accumulate
122
+ * across every task the hooks observe.
120
123
  */
121
- declare function inlineSandboxClient(factory: ExecutorFactory<unknown>): SandboxClient;
124
+
125
+ interface WaterfallSpan {
126
+ id: string;
127
+ /** The spawn label (`shot:0`, `analyst:1`, a nested agent's label) — the row name. */
128
+ label: string;
129
+ runId: string;
130
+ parentId?: string;
131
+ startMs: number;
132
+ endMs?: number;
133
+ status: 'running' | 'done' | 'down';
134
+ usd: number;
135
+ tokens: {
136
+ input: number;
137
+ output: number;
138
+ };
139
+ score?: number;
140
+ }
141
+ interface WaterfallReport {
142
+ spans: WaterfallSpan[];
143
+ /** Wall-clock of the observed window (first spawn → last settle). */
144
+ totalMs: number;
145
+ totalUsd: number;
146
+ totalTokens: {
147
+ input: number;
148
+ output: number;
149
+ };
150
+ /** Rollup by label prefix (the part before ':') — shots vs analysts vs anything else. */
151
+ byKind: Record<string, {
152
+ count: number;
153
+ ms: number;
154
+ usd: number;
155
+ tokens: {
156
+ input: number;
157
+ output: number;
158
+ };
159
+ }>;
160
+ }
161
+ interface WaterfallCollector {
162
+ /** Attach these to RunAgenticOptions.hooks / BenchmarkConfig.hooks. */
163
+ hooks: RuntimeHooks;
164
+ report(): WaterfallReport;
165
+ /** The text waterfall — one row per span, bars scaled to the observed window. */
166
+ render(opts?: {
167
+ width?: number;
168
+ maxRows?: number;
169
+ }): string;
170
+ reset(): void;
171
+ }
172
+ declare function createWaterfallCollector(): WaterfallCollector;
122
173
 
123
174
  /**
124
- * `loopDispatch` — turn `runLoop` into an agent-eval campaign dispatch.
175
+ * anytimeReport — time-to-satisfactory-output metrics, derived entirely from the
176
+ * waterfall's spans (no new instrumentation): per task, the best-so-far score after each
177
+ * shot with its elapsed wall-clock and cumulative spend; per strategy, the standard
178
+ * anytime-optimization metrics:
179
+ *
180
+ * TTT time-to-target — elapsed ms until best-so-far ≥ the target (per task; median
181
+ * over tasks that reached it)
182
+ * STT shots-to-target — attempts until best-so-far ≥ target
183
+ * ERT expected running time (the COCO benchmarking convention): TOTAL time spent
184
+ * across all tasks — including failures' full budgets — divided by the number of
185
+ * tasks that reached the target. The honest "how long per success, all-in".
186
+ * AUC the anytime curve's area (mean best-so-far score across the budget, per shot
187
+ * index) — higher = climbs earlier.
188
+ *
189
+ * The "satisfactory" bar follows the COCO/BBOB convention: a SET of satisficing targets
190
+ * (e.g. [0.5, 0.8, 1.0] on the normalized check score), each measured independently —
191
+ * runtime-to-target per (task, target) pair — optionally overridden per task
192
+ * (`targetFor`) when satisfaction is task-specific. Spans come from
193
+ * `createWaterfallCollector().report()`; tasks are grouped by the supervisor runId
194
+ * (`agentic:<strategy>:<taskId>`); shot spans are `shot:N` labels.
195
+ */
196
+
197
+ interface AnytimeTaskCurve {
198
+ taskId: string;
199
+ strategy: string;
200
+ /** Best-so-far after each settled shot: elapsed ms from the task's first spawn,
201
+ * cumulative usd, and the running max score. */
202
+ points: Array<{
203
+ elapsedMs: number;
204
+ cumUsd: number;
205
+ best: number;
206
+ }>;
207
+ /** Per satisficing target (keyed by the target value as a string): the first point
208
+ * where best ≥ target, or null when never reached within budget. */
209
+ hits: Record<string, {
210
+ ms: number;
211
+ shots: number;
212
+ usd: number;
213
+ } | null>;
214
+ }
215
+ interface AnytimeStrategySummary {
216
+ strategy: string;
217
+ /** The satisficing target this row summarizes. */
218
+ target: number;
219
+ tasks: number;
220
+ reachedTarget: number;
221
+ /** Median time-to-target over the tasks that reached it (null when none did). */
222
+ medianTttMs: number | null;
223
+ medianShotsToTarget: number | null;
224
+ /** COCO ERT: Σ all task wall-time (incl. failures) / #successes. Null when 0 succeed. */
225
+ ertMs: number | null;
226
+ /** Same construction over dollars: Σ all spend / #successes. */
227
+ erUsd: number | null;
228
+ /** Mean best-so-far score by shot index (the anytime curve, averaged over tasks). */
229
+ curveByShot: number[];
230
+ /** Area under the per-shot anytime curve, normalized to [0,1]. */
231
+ auc: number;
232
+ }
233
+ interface AnytimeReport {
234
+ targets: number[];
235
+ perTask: AnytimeTaskCurve[];
236
+ /** One summary per (strategy, target) pair — the COCO-style multi-target view. */
237
+ perStrategy: AnytimeStrategySummary[];
238
+ }
239
+ /** Derive anytime metrics from waterfall spans. `targets` are the satisficing score
240
+ * bars (default [1] = fully resolved; COCO-style multi-target: [0.5, 0.8, 1]);
241
+ * `targetFor` overrides the bar per task (task-specific satisfaction) — when set, the
242
+ * per-task bar replaces every entry of `targets` for that task. */
243
+ declare function anytimeReport(spans: WaterfallSpan[], opts?: {
244
+ targets?: number[];
245
+ targetFor?: (taskId: string) => number;
246
+ }): AnytimeReport;
247
+ /** One row per (strategy, satisficing target): the shareable time-to-satisfactory table. */
248
+ declare function renderAnytimeTable(report: AnytimeReport): string;
249
+
250
+ /**
251
+ * auditIntent — the route-rigor analyst: is this trajectory even going the RIGHT WAY?
125
252
  *
126
- * Without this adapter a consumer wiring `runLoop` into `runProfileMatrix` /
127
- * `runCampaign` has to, by hand, every time: (a) build an `ExecCtx` with a
128
- * sandbox client, (b) adapt the campaign `DispatchContext.trace` into a
129
- * `LoopTraceEmitter` (or lose all loop trace correlation), and (c) remember to
130
- * forward the loop's cost + tokens via `ctx.cost` (forgetting it yields a
131
- * `{0,0}` cell the backend-integrity guard reads as a stub). Three foot-guns,
132
- * the third silent. The fleet's products skipped (c) and fell back to a
133
- * `workerRecords[]` side-channel — the exact anti-pattern the substrate exists
134
- * to kill.
253
+ * `observe()` critiques execution quality ("what's unfinished"). This audits ALIGNMENT —
254
+ * a different failure class the score can't see until it's too late: an agent can be
255
+ * executing flawlessly down the wrong route. The auditor reads the trajectory and
256
+ * compares three intents:
135
257
  *
136
- * `loopDispatch` collapses all three into one typed call:
258
+ * declared — what the task says to do (the prompt / acceptance criteria)
259
+ * revealed — what the agent is ACTUALLY optimizing, inferred from its action pattern
260
+ * (the inverse-inference move: actions reveal objectives)
261
+ * user — what the principal actually wants (the contract, when it differs from
262
+ * the literal task text), plus where the user's own trajectory is heading
137
263
  *
138
- * const dispatch = loopDispatch({
139
- * sandboxClient,
140
- * toLoopOptions: (scenario, profile) => ({ driver, agentRun, output, validator, task }),
141
- * })
142
- * await runProfileMatrix({ profiles, scenarios, dispatch, judges, commitSha })
264
+ * and returns a verdict (aligned / drifting / diverged) with evidence and ONE
265
+ * recommended intervention. FIREWALLED like every analyst: input is the trajectory and
266
+ * the intents — never the verifier or its data (zero check-leakage, so route auditing
267
+ * is always Goodhart-safe to run online).
143
268
  *
144
- * Usage is reported automatically; trace events are forwarded automatically;
145
- * the ctx is built automatically. The seam becomes impossible to mis-wire.
146
- *
147
- * Typed structurally against the campaign `DispatchContext` (imported type-only
148
- * from `@tangle-network/agent-eval/campaign`) — a downward dependency, never an
149
- * inversion.
269
+ * Where it runs: between shots (steer the next one), as a watchdog over the lifecycle
270
+ * stream (abort-and-refund a diverged rollout — the budget pool makes early abort
271
+ * strictly valuable), or post-hoc over a whole BenchmarkReport (the meta-intent pass:
272
+ * is the LOOP optimizing the right thing — degenerate submissions, check-gaming shapes,
273
+ * objective drift across tasks).
150
274
  */
151
275
 
152
- /** runLoop options minus the `ctx` (loopDispatch builds the ctx). */
153
- type LoopOptionsForDispatch<Task, Output, Decision> = Omit<RunLoopOptions<Task, Output, Decision>, 'ctx'>;
154
- interface LoopDispatchOptions<Task, Output, Decision, TScenario extends Scenario, TArtifact> {
155
- /** Sandbox client used for every cell's `runLoop`. Supplied once. */
156
- sandboxClient: SandboxClient;
157
- /** Build the per-cell runLoop options from the scenario (+ profile, when
158
- * used with `runProfileMatrix`). */
159
- toLoopOptions: (scenario: TScenario, profile: AgentProfile) => LoopOptionsForDispatch<Task, Output, Decision>;
160
- /** Map the finished loop to the artifact the judges score. Default:
161
- * `result.winner?.output`. A loop with no winner yields `undefined` (judges
162
- * skip the cell) — but the loop's token usage is STILL reported, so the
163
- * integrity guard sees real activity. */
164
- toArtifact?: (result: LoopResult<Task, Output, Decision>) => TArtifact;
165
- /** Forward `loop.*` trace events into the campaign's scoped trace so loop
166
- * spans correlate with the cell. Default true. */
167
- forwardTrace?: boolean;
168
- /** Cost-meter source label for the loop's spend. Default `'loop'`. */
169
- costSource?: string;
276
+ interface AuditIntentInput {
277
+ /** The declared intent: the task text / acceptance criteria the agent was given. */
278
+ declaredIntent: string;
279
+ /** The trajectory so far — tool calls + results + assistant turns (any event shapes). */
280
+ trace: ReadonlyArray<unknown>;
281
+ /** The principal's actual intent when it differs from the literal task (the contract). */
282
+ userIntent?: string;
283
+ /** The loop-level purpose (meta-intent): what the WHOLE run is for — lets the auditor
284
+ * flag locally-sensible work that serves the wrong larger objective. */
285
+ metaIntent?: string;
286
+ runId?: string;
170
287
  }
171
- /**
172
- * Adapter for `runProfileMatrix` (profile is an axis). Returns a
173
- * `ProfileDispatchFn` that runs `runLoop` per (profile, scenario) cell and
174
- * reports usage automatically.
175
- */
176
- declare function loopDispatch<Task, Output, Decision, TScenario extends Scenario, TArtifact>(opts: LoopDispatchOptions<Task, Output, Decision, TScenario, TArtifact>): ProfileDispatchFn<TScenario, TArtifact>;
288
+ interface AuditIntentOptions {
289
+ chat: ChatClient;
290
+ model?: string;
291
+ /** Override the auditor instruction (optimizable like any analyst prompt). */
292
+ auditorInstruction?: string;
293
+ /** Cap trace lines fed to the auditor. Default 80. */
294
+ maxTraceLines?: number;
295
+ signal?: AbortSignal;
296
+ }
297
+ interface IntentAudit {
298
+ /** What the agent's actions reveal it is actually optimizing — one sentence. */
299
+ revealedIntent: string;
300
+ verdict: 'aligned' | 'drifting' | 'diverged';
301
+ /** Trajectory-grounded evidence for the verdict (specific calls/patterns). */
302
+ evidence: string;
303
+ /** The single recommended intervention. */
304
+ recommendation: 'continue' | 'steer' | 'abort';
305
+ /** When recommendation is 'steer': the corrective instruction to inject. */
306
+ steer?: string;
307
+ confidence: number;
308
+ }
309
+ declare const defaultAuditorInstruction: string;
310
+ declare function auditIntent(input: AuditIntentInput, opts: AuditIntentOptions): Promise<IntentAudit>;
177
311
 
178
312
  /**
179
313
  * @experimental
@@ -329,7 +463,7 @@ interface ShapeContext<D = unknown> {
329
463
  spawnChild(name: string, spec: AgentSpec): Agent<unknown, Outcome<D>>;
330
464
  /** Derive a child `AgentSpec` from the persona's root spec with an overridden profile —
331
465
  * the seam a shape uses to give a worker a narrower role/prompt than the root persona. */
332
- childSpec(profile: AgentProfile$1, harness?: BackendType | null): AgentSpec;
466
+ childSpec(profile: AgentProfile, harness?: BackendType | null): AgentSpec;
333
467
  }
334
468
  /**
335
469
  * A reusable act-body factory. Given the persona's content + seams (`ShapeContext`), it
@@ -790,7 +924,7 @@ interface RenderCorpusToInstructionsOptions {
790
924
  readonly corpus: Corpus;
791
925
  readonly filter: CorpusFilter;
792
926
  /** The profile to project the facts into. The result is a fresh profile — the input is unchanged. */
793
- readonly profile: AgentProfile$1;
927
+ readonly profile: AgentProfile;
794
928
  /** Where the rendered facts land: appended to `prompt.instructions[]` (default) or folded into
795
929
  * the single-blob `resources.instructions` string. */
796
930
  readonly target?: 'prompt' | 'resources';
@@ -799,7 +933,7 @@ interface RenderCorpusToInstructionsOptions {
799
933
  }
800
934
  /** `renderCorpusToInstructions(opts)` — the flywheel read-back projection. Async (queries the
801
935
  * durable corpus); returns a fresh `AgentProfile` with the accreted facts merged in. */
802
- type RenderCorpusToInstructions = (opts: RenderCorpusToInstructionsOptions) => Promise<AgentProfile$1>;
936
+ type RenderCorpusToInstructions = (opts: RenderCorpusToInstructionsOptions) => Promise<AgentProfile>;
803
937
  /**
804
938
  * One node in the reconstructed trajectory tree — a driver OR a leaf, with its OWN spend and the
805
939
  * spend ROLLED UP over its subtree. Reconstructed from the `SpawnJournal` (structure + per-node
@@ -941,7 +1075,15 @@ interface ObserveOptions {
941
1075
  signal?: AbortSignal;
942
1076
  /** Cap the trace lines fed to the observer (keeps the call cheap). Default 80. */
943
1077
  maxTraceLines?: number;
944
- }
1078
+ /** Override the analyst's system instruction — the prompt that turns a trace into
1079
+ * findings + recommended_actions. The analyst IS the steerer, so this is the knob a
1080
+ * prompt optimizer (GEPA) tunes. Omitted ⇒ the default observer instruction. The
1081
+ * firewall (trace-only, never the verdict) is structural (input has no score), so a
1082
+ * custom instruction cannot break it. */
1083
+ analystInstruction?: string;
1084
+ }
1085
+ /** The default observer instruction — exported so an optimizer can seed its population. */
1086
+ declare const defaultAnalystInstruction: string;
945
1087
  interface Observation {
946
1088
  findings: AnalystFinding[];
947
1089
  /** Facts persisted to the corpus (empty when no corpus was supplied). */
@@ -954,6 +1096,488 @@ declare function observe(input: ObserveInput, opts: ObserveOptions): Promise<Obs
954
1096
  * steer; the operator block is the advice. */
955
1097
  declare function renderReport(findings: ReadonlyArray<AnalystFinding>): string;
956
1098
 
1099
+ /**
1100
+ * harvestCorpus — production traces → corpus, the G2 bridge (the playbook's step 6).
1101
+ * The flywheel's write side, batched: run the firewalled `observe()` analyst over a
1102
+ * stream of completed runs (yesterday's production traces, a benchmark's rollouts, a
1103
+ * fleet's day) and accrete the trace-derived facts into the durable corpus.
1104
+ *
1105
+ * Store-agnostic by design: the caller maps its trace store's rows (a
1106
+ * `ProductionTraceSink` ndjson, OTLP spans, RunRecords) to `ObserveInput` — task text,
1107
+ * final output, the event trace, terminal outcome. The analyst reads BEHAVIOR only
1108
+ * (the firewall is structural: the input carries no judge verdict), and corpus appends
1109
+ * are idempotent on (claim + tags), so re-harvesting the same window is safe.
1110
+ *
1111
+ * The nightly product job is then three lines:
1112
+ * const runs = mapSinkRowsToObserveInputs(await readSink(yesterday))
1113
+ * const report = await harvestCorpus({ runs, chat, corpus, tags: ['gtm-agent'] })
1114
+ * log(report) // runsObserved / findings / learned / failures
1115
+ *
1116
+ * NOTE on the read side: harvesting is safe and cheap; *injecting* facts back into runs
1117
+ * is the measured danger zone — naive unconditional priming tested NEGATIVE (−11.6pp,
1118
+ * context pollution; docs/research/layer-across-run.md). Gate any priming design on its
1119
+ * own A/B; the corpus's first consumers are operators and optimizers, not prompts.
1120
+ */
1121
+
1122
+ interface HarvestCorpusOptions {
1123
+ /** The completed runs to analyze — map your store's rows to `ObserveInput`. */
1124
+ runs: AsyncIterable<ObserveInput> | Iterable<ObserveInput>;
1125
+ /** The model-call seam (agent-eval `createChatClient`). */
1126
+ chat: ChatClient;
1127
+ model?: string;
1128
+ /** The durable corpus the facts accrete into. */
1129
+ corpus: Corpus;
1130
+ /** Tags written onto learned facts (the product/domain key the read side queries by). */
1131
+ tags?: ReadonlyArray<string>;
1132
+ /** Override the analyst instruction (the GEPA-tunable knob). */
1133
+ analystInstruction?: string;
1134
+ /** Runs analyzed in parallel. Default 4. */
1135
+ concurrency?: number;
1136
+ /** Hard cap on runs consumed from the stream (a cost guard for unbounded stores). */
1137
+ maxRuns?: number;
1138
+ signal?: AbortSignal;
1139
+ }
1140
+ interface HarvestFailure {
1141
+ runId: string;
1142
+ error: string;
1143
+ }
1144
+ interface HarvestReport {
1145
+ runsObserved: number;
1146
+ /** Total findings the analyst produced (including ones already known). */
1147
+ findings: number;
1148
+ /** NEW facts actually appended (idempotent dedup excludes re-learned ones). */
1149
+ learned: number;
1150
+ /** Per-run analysis failures — reported, never silently dropped. */
1151
+ failures: HarvestFailure[];
1152
+ }
1153
+ declare function harvestCorpus(opts: HarvestCorpusOptions): Promise<HarvestReport>;
1154
+
1155
+ /**
1156
+ * Adapt an `ExecutorFactory` into a `SandboxClient` for `runLoop`. The factory is
1157
+ * instantiated fresh per `streamPrompt` (mirrors the per-spawn executor lifecycle):
1158
+ * run once on the prompt, emit the terminal result event, tear down.
1159
+ */
1160
+ declare function inlineSandboxClient(factory: ExecutorFactory<unknown>): SandboxClient;
1161
+
1162
+ /**
1163
+ * `loopDispatch` — turn `runLoop` into an agent-eval campaign dispatch.
1164
+ *
1165
+ * Without this adapter a consumer wiring `runLoop` into `runProfileMatrix` /
1166
+ * `runCampaign` has to, by hand, every time: (a) build an `ExecCtx` with a
1167
+ * sandbox client, (b) adapt the campaign `DispatchContext.trace` into a
1168
+ * `LoopTraceEmitter` (or lose all loop trace correlation), and (c) remember to
1169
+ * forward the loop's cost + tokens via `ctx.cost` (forgetting it yields a
1170
+ * `{0,0}` cell the backend-integrity guard reads as a stub). Three foot-guns,
1171
+ * the third silent. The fleet's products skipped (c) and fell back to a
1172
+ * `workerRecords[]` side-channel — the exact anti-pattern the substrate exists
1173
+ * to kill.
1174
+ *
1175
+ * `loopDispatch` collapses all three into one typed call:
1176
+ *
1177
+ * const dispatch = loopDispatch({
1178
+ * sandboxClient,
1179
+ * toLoopOptions: (scenario, profile) => ({ driver, agentRun, output, validator, task }),
1180
+ * })
1181
+ * await runProfileMatrix({ profiles, scenarios, dispatch, judges, commitSha })
1182
+ *
1183
+ * Usage is reported automatically; trace events are forwarded automatically;
1184
+ * the ctx is built automatically. The seam becomes impossible to mis-wire.
1185
+ *
1186
+ * Typed structurally against the campaign `DispatchContext` (imported type-only
1187
+ * from `@tangle-network/agent-eval/campaign`) — a downward dependency, never an
1188
+ * inversion.
1189
+ */
1190
+
1191
+ /** runLoop options minus the `ctx` (loopDispatch builds the ctx). */
1192
+ type LoopOptionsForDispatch<Task, Output, Decision> = Omit<RunLoopOptions<Task, Output, Decision>, 'ctx'>;
1193
+ interface LoopDispatchOptions<Task, Output, Decision, TScenario extends Scenario, TArtifact> {
1194
+ /** Sandbox client used for every cell's `runLoop`. Supplied once. */
1195
+ sandboxClient: SandboxClient;
1196
+ /** Build the per-cell runLoop options from the scenario (+ profile, when
1197
+ * used with `runProfileMatrix`). */
1198
+ toLoopOptions: (scenario: TScenario, profile: AgentProfile$1) => LoopOptionsForDispatch<Task, Output, Decision>;
1199
+ /** Map the finished loop to the artifact the judges score. Default:
1200
+ * `result.winner?.output`. A loop with no winner yields `undefined` (judges
1201
+ * skip the cell) — but the loop's token usage is STILL reported, so the
1202
+ * integrity guard sees real activity. */
1203
+ toArtifact?: (result: LoopResult<Task, Output, Decision>) => TArtifact;
1204
+ /** Forward `loop.*` trace events into the campaign's scoped trace so loop
1205
+ * spans correlate with the cell. Default true. */
1206
+ forwardTrace?: boolean;
1207
+ /** Cost-meter source label for the loop's spend. Default `'loop'`. */
1208
+ costSource?: string;
1209
+ }
1210
+ /**
1211
+ * Adapter for `runProfileMatrix` (profile is an axis). Returns a
1212
+ * `ProfileDispatchFn` that runs `runLoop` per (profile, scenario) cell and
1213
+ * reports usage automatically.
1214
+ */
1215
+ declare function loopDispatch<Task, Output, Decision, TScenario extends Scenario, TArtifact>(opts: LoopDispatchOptions<Task, Output, Decision, TScenario, TArtifact>): ProfileDispatchFn<TScenario, TArtifact>;
1216
+
1217
+ /**
1218
+ * The general agentic primitive — sequential (depth) and parallel (breadth) over a shared,
1219
+ * checkable artifact, driven through the keystone Supervisor as one recursive `Agent.act`.
1220
+ *
1221
+ * The domain lives behind ONE seam — `AgenticSurface` (open an artifact, list tools, call a tool,
1222
+ * score the artifact, close it). EnterpriseOps implements it (seed a gym DB, MCP tools, SQL
1223
+ * verifier); Commit0/AppWorld/terminal-bench implement it the same way (a repo workspace, shell
1224
+ * tools, the test suite). The drivers below are domain-blind: they run over any surface.
1225
+ *
1226
+ * Two shapes, the agent's POMDP rollout as the unit:
1227
+ * - DEPTH one persistent artifact carried across shots. Each shot the agent works the tool loop;
1228
+ * between shots a trace-analyst (selector≠judge: reads the trajectory, never the score)
1229
+ * steers the resumed session toward what's unfinished. shot n stands on shot n-1's
1230
+ * artifact state + history. This is continuation — long-horizon, same artifact.
1231
+ * - BREADTH K independent artifacts, each a fresh rollout, the deployable verifier picks the best.
1232
+ *
1233
+ * Both are an `Agent` whose `act` spawns leaf shots through `scope.spawn` and reacts via
1234
+ * `scope.next()` — so the conserved budget pool meters them (equal-k by construction), the journal
1235
+ * records the tree, and the same primitive nests. `runAgentic` runs the chosen driver through
1236
+ * `createSupervisor().run`. The leaf (one shot over a handle) is resolved per-spawn from a
1237
+ * surface-closed registry — the open `Executor` seam, not bespoke per-benchmark glue.
1238
+ */
1239
+
1240
+ interface AgenticTask {
1241
+ readonly id: string;
1242
+ readonly systemPrompt: string;
1243
+ readonly userPrompt: string;
1244
+ /** Opaque domain payload the surface reads (EOPS: servers/verifiers/tools). Drivers never read it. */
1245
+ readonly meta?: Record<string, unknown>;
1246
+ }
1247
+ interface ArtifactHandle {
1248
+ readonly id: string;
1249
+ readonly surface: string;
1250
+ /** Opaque per-artifact context the surface stashes (EOPS: the seeded gym server + db id). */
1251
+ readonly ctx?: unknown;
1252
+ }
1253
+ interface AgenticTool {
1254
+ readonly type: 'function';
1255
+ readonly function: {
1256
+ name: string;
1257
+ description?: string;
1258
+ parameters: Record<string, unknown>;
1259
+ };
1260
+ }
1261
+ interface SurfaceScore {
1262
+ passes: number;
1263
+ total: number;
1264
+ /** Checks excluded as malformed (data defect, not the agent). `total === 0` ⇒ unscoreable. */
1265
+ errored: number;
1266
+ }
1267
+ /** A stateful, checkable environment an agent operates over with tools. Open behind one interface. */
1268
+ interface AgenticSurface {
1269
+ readonly name: string;
1270
+ open(task: AgenticTask): Promise<ArtifactHandle>;
1271
+ tools(task: AgenticTask, handle: ArtifactHandle): Promise<AgenticTool[]>;
1272
+ call(handle: ArtifactHandle, name: string, args: Record<string, unknown>): Promise<string>;
1273
+ score(task: AgenticTask, handle: ArtifactHandle): Promise<SurfaceScore>;
1274
+ close(handle: ArtifactHandle): Promise<void>;
1275
+ }
1276
+ interface AgenticOptions {
1277
+ routerBaseUrl: string;
1278
+ routerKey: string;
1279
+ model: string;
1280
+ temperature?: number;
1281
+ /** Completion cap per worker turn — REQUIRED for thinking models (they burn unbounded
1282
+ * budgets on reasoning and return empty content without it). Omitted ⇒ provider default. */
1283
+ maxTokens?: number;
1284
+ /** Turns the agent may take within ONE shot before the driver intervenes. */
1285
+ innerTurns?: number;
1286
+ /** The depth STEERER's analyst instruction (observe()'s system prompt). The knob a
1287
+ * prompt optimizer (GEPA) tunes — the analyst IS the steerer. Omitted ⇒ the default. */
1288
+ analystInstruction?: string;
1289
+ /** The critic's model — lets the analyst be a stronger (or cheaper) model than the
1290
+ * worker. Omitted ⇒ the worker's `model`. */
1291
+ analystModel?: string;
1292
+ /** Across-run learning: when set, the analyst's observe() pass appends trace-derived
1293
+ * facts here (the flywheel write side). Priming (the read side) is the caller's move —
1294
+ * query the corpus and fold facts into the task's systemPrompt before runAgentic. */
1295
+ corpus?: Corpus;
1296
+ /** Tags written onto learned facts (and used by the caller's priming query). */
1297
+ corpusTags?: string[];
1298
+ }
1299
+ type Msg = Record<string, unknown>;
1300
+ interface ShotResult {
1301
+ messages: Msg[];
1302
+ score: number;
1303
+ passes: number;
1304
+ total: number;
1305
+ completions: number;
1306
+ toolErrors: number;
1307
+ }
1308
+ interface AgenticRunResult {
1309
+ /** The strategy name (built-in 'depth'/'breadth' or a custom strategy's name). */
1310
+ mode: string;
1311
+ score: number;
1312
+ resolved: boolean;
1313
+ completions: number;
1314
+ /** DEPTH: score after each shot — the progress-over-rounds curve. BREADTH: best-so-far per rollout. */
1315
+ progression: number[];
1316
+ shots: number;
1317
+ /** The cost vector, stamped by `runAgentic` from the Supervisor's conserved pool: real
1318
+ * router tokens, priced usd (0 when the model is unpriced — never fabricated), wall ms. */
1319
+ usd: number;
1320
+ ms: number;
1321
+ tokens: {
1322
+ input: number;
1323
+ output: number;
1324
+ };
1325
+ }
1326
+ /** DEPTH: one persistent artifact, carried across analyst-steered shots. */
1327
+ declare function depthDriver(surface: AgenticSurface, task: AgenticTask, opts: AgenticOptions, cfg: {
1328
+ maxShots: number;
1329
+ }): Agent<unknown, Outcome<unknown>>;
1330
+ /** BREADTH: K independent rollouts (each own artifact), verifier picks the best. */
1331
+ declare function breadthDriver(_surface: AgenticSurface, task: AgenticTask, opts: AgenticOptions, cfg: {
1332
+ width: number;
1333
+ }): Agent<unknown, Outcome<unknown>>;
1334
+ /**
1335
+ * A Strategy is HOW you spend the compute budget to beat the Environment's check — it
1336
+ * builds the driver `Agent` the Supervisor runs. This is the OPEN extension point: a dev
1337
+ * authors their own by implementing `driver()` to return an Agent whose `act()` spawns
1338
+ * shots/analysts via `scope.spawn` / `scope.next` / `scope.send`. The two built-ins are
1339
+ * the reference implementations to copy:
1340
+ * sample — K INDEPENDENT attempts, keep the best-verifying (best-of-N / resample).
1341
+ * refine — attempt → observe() reads the trace → steer the next → repeat (iterate).
1342
+ * (A multi-agent "team" is just a Strategy whose driver spawns several different agents.)
1343
+ */
1344
+ interface Strategy {
1345
+ readonly name: string;
1346
+ driver(surface: AgenticSurface, task: AgenticTask, opts: AgenticOptions, budget: number): Agent<unknown, Outcome<unknown>>;
1347
+ }
1348
+ declare const sample: Strategy;
1349
+ declare const refine: Strategy;
1350
+ /** A role for one shot — multi-agent loops (researcher + engineer, a panel of k
1351
+ * researchers) give each shot its own system prompt and optionally its own model. */
1352
+ interface ShotPersona {
1353
+ /** Replaces the task's systemPrompt for a FRESH shot; on a carried conversation it is
1354
+ * injected as a hand-off message (the transcript's earlier roles stay intact). */
1355
+ systemPrompt?: string;
1356
+ /** Per-shot model override (e.g. a stronger model for the engineer shot). */
1357
+ model?: string;
1358
+ }
1359
+ interface ShotSpec {
1360
+ /** present ⇒ continue this artifact (depth); absent ⇒ the shot opens a fresh one (sample/restart). */
1361
+ handle?: ArtifactHandle;
1362
+ messages?: Msg[];
1363
+ steer?: string;
1364
+ persona?: ShotPersona;
1365
+ /** Restrict THIS shot to a subset of the domain's tools (by name) — focus a shot on
1366
+ * the relevant capabilities. Restriction-only; unknown names throw. Omitted ⇒ all. */
1367
+ tools?: string[];
1368
+ }
1369
+ interface StrategyResult {
1370
+ score: number;
1371
+ resolved: boolean;
1372
+ completions: number;
1373
+ progression: number[];
1374
+ shots: number;
1375
+ }
1376
+ /** Artifact lifecycle a strategy may manage itself — open/close ONLY. Raw `call`/`score`
1377
+ * are withheld: scores reach the body solely through `shot()`'s ShotResult (the
1378
+ * harness-verified channel), so a body cannot peek the check or fabricate around it. */
1379
+ interface StrategyArtifacts {
1380
+ readonly name: string;
1381
+ open(task: AgenticTask): Promise<ArtifactHandle>;
1382
+ close(handle: ArtifactHandle): Promise<void>;
1383
+ }
1384
+ /** What a strategy body composes with: the artifact lifecycle, the budget, and the two steps. */
1385
+ interface StrategyCtx {
1386
+ /** Open/close artifacts the body manages itself (e.g. one persistent handle for depth). */
1387
+ readonly surface: StrategyArtifacts;
1388
+ readonly task: AgenticTask;
1389
+ readonly opts: AgenticOptions;
1390
+ readonly budget: number;
1391
+ readonly scope: Scope<Outcome<unknown>>;
1392
+ /** Run ONE worker shot; its harness-scored result, or null if it went down. */
1393
+ shot(spec?: ShotSpec): Promise<ShotResult | null>;
1394
+ /** The firewalled critic reads the trajectory → a steer string, or null on COMPLETE/down. */
1395
+ critique(messages: Msg[]): Promise<string | null>;
1396
+ /** The RAW analyst channel: the firewalled critic answers `instruction` over the
1397
+ * trajectory verbatim — no findings extraction, so verdict-shaped formats
1398
+ * (CONTINUE/STOP decisions, calibrated predictions) survive. Same firewall:
1399
+ * trajectory in, never scores. Null when the analyst went down. */
1400
+ consult(messages: Msg[], instruction: string): Promise<string | null>;
1401
+ /** The tools THIS artifact's task actually offers (names + descriptions only — never
1402
+ * the implementations). Tool sets vary per task on heterogeneous domains; a strategy
1403
+ * that restricts shots MUST select from this list, never from hardcoded names. */
1404
+ listTools(handle: ArtifactHandle): Promise<Array<{
1405
+ name: string;
1406
+ description?: string;
1407
+ }>>;
1408
+ }
1409
+ /** Author a Strategy from the composable steps — the open, compact way. */
1410
+ declare function defineStrategy(name: string, run: (ctx: StrategyCtx) => Promise<StrategyResult>): Strategy;
1411
+ /** A NEW strategy, authored from the steps (~20 lines): refine, but when a steered shot
1412
+ * fails to improve the score it ABANDONS that line and restarts fresh (branch-when-stuck)
1413
+ * — the widen/MCTS idea the depth-stuck failure motivated. Scored keep-best (the best
1414
+ * checkpoint across all lines), the deployable metric. This is the "experts build BETTER
1415
+ * optimizations" path: a new technique, compact, with zero Supervisor ceremony. */
1416
+ declare const adaptiveRefine: Strategy;
1417
+ /** The explore-then-exploit MIX: spend ⌈budget/2⌉ on independent samples (kept open),
1418
+ * then refine the best-verifying line with the remaining budget. Sample's basin escape +
1419
+ * refine's accumulation — the third built-in, authored from the public steps. */
1420
+ declare const sampleThenRefine: Strategy;
1421
+ interface RunAgenticOptions extends AgenticOptions {
1422
+ surface: AgenticSurface;
1423
+ task: AgenticTask;
1424
+ /** Lifecycle observability — every spawn/settle (shots, analysts) streams here live.
1425
+ * The seam online watchdogs/route-auditors subscribe to. */
1426
+ hooks?: RuntimeHooks;
1427
+ /** A Strategy (the open way) — author/pass your own. Overrides `mode` when present. */
1428
+ strategy?: Strategy;
1429
+ /** Built-in shorthand: 'depth'→refine, 'breadth'→sample. Default 'depth'. */
1430
+ mode?: 'depth' | 'breadth';
1431
+ /** budget: refine→max shots; sample→rollout width. */
1432
+ budget: number;
1433
+ rootBudget?: Budget;
1434
+ }
1435
+ /** Run a Strategy through the keystone Supervisor — `Agent.act` over a conserved-budget Scope. */
1436
+ declare function runAgentic(opts: RunAgenticOptions): Promise<AgenticRunResult>;
1437
+
1438
+ /**
1439
+ * runBenchmark — the packaged optimization suite. Define a domain by implementing an
1440
+ * `Environment` (open / tools / call / score / close); get the optimization strategies
1441
+ * compared, scored by your own deployable check, with a paired-bootstrap report — free.
1442
+ *
1443
+ * The mental model: you have a TASK + a deployable CHECK + a compute BUDGET. A strategy
1444
+ * is how you spend the budget to beat the check. Two built-ins:
1445
+ *
1446
+ * sample — N independent attempts, keep the best-verifying one. (best-of-N / resample)
1447
+ * refine — attempt → a critic reads the trace → steer the next → repeat. (iterate-with-feedback)
1448
+ *
1449
+ * Both run at equal budget through the Supervisor's conserved pool; the headline is the
1450
+ * paired lift of refine over sample. Author your own strategy with `defineStrategy`.
1451
+ */
1452
+
1453
+ /** A checkable task domain — implement these 5 hooks and the suite does the rest. The
1454
+ * same seam as `AgenticSurface`; `Environment` is the RL/gym-standard name for it. */
1455
+ type Environment = AgenticSurface;
1456
+ interface BenchmarkConfig {
1457
+ /** The task domain (5 hooks). */
1458
+ environment: Environment;
1459
+ /** The tasks to score across. */
1460
+ tasks: AgenticTask[];
1461
+ /** The worker: model + router + (optional) the critic's instruction (the steerer knob). */
1462
+ worker: AgenticOptions;
1463
+ /** Which strategies to compare. Pass the built-ins (`refine`, `sample`) or your own.
1464
+ * Default: [sample, refine]. */
1465
+ strategies?: Strategy[];
1466
+ /** Shots (refine) / width (sample) — the equal compute budget per strategy. Default 3. */
1467
+ budget?: number;
1468
+ /** Tasks scored in parallel. Default 3. */
1469
+ concurrency?: number;
1470
+ /** Progress hook — fires as each task settles (the live-monitoring seam: append to a
1471
+ * progress file, render a tree, stream to a dashboard). `done` counts settled tasks. */
1472
+ onTask?: (row: BenchmarkTaskRow, done: number, total: number) => void;
1473
+ /** Lifecycle observability — every spawn/settle of every cell's shots/analysts streams
1474
+ * here live (the watchdog/route-auditor seam, passed through to `runAgentic`). */
1475
+ hooks?: RuntimeHooks;
1476
+ }
1477
+ interface BenchmarkLift {
1478
+ /** Mean of paired deltas (refine − sample). */
1479
+ mean: number;
1480
+ low: number;
1481
+ high: number;
1482
+ n: number;
1483
+ }
1484
+ /** One strategy's outcome on one task — the per-task cell an optimizer consumes. */
1485
+ interface BenchmarkCell {
1486
+ score: number;
1487
+ resolved: boolean;
1488
+ /** The progress curve (refine: score per shot; sample: best-so-far per rollout). */
1489
+ progression: number[];
1490
+ usd: number;
1491
+ ms: number;
1492
+ tokens: {
1493
+ input: number;
1494
+ output: number;
1495
+ };
1496
+ }
1497
+ interface BenchmarkTaskRow {
1498
+ taskId: string;
1499
+ /** Per-strategy cells; absent when the task errored before completing all strategies. */
1500
+ cells?: Record<string, BenchmarkCell>;
1501
+ /** Per-strategy failures on this task: the strategy competed, threw, and scored an
1502
+ * honest zero — it loses, it does not poison the row. The message is kept so a later
1503
+ * generation's author can see WHY a candidate died. */
1504
+ errors?: Record<string, string>;
1505
+ /** Why the task was excluded (infra/setup failure) — never silently dropped. */
1506
+ error?: string;
1507
+ }
1508
+ interface BenchmarkStrategySummary {
1509
+ /** Mean verifier score (0..1). */
1510
+ score: number;
1511
+ /** Fraction of tasks fully resolved. */
1512
+ resolved: number;
1513
+ /** Mean cost vector per task. */
1514
+ usd: number;
1515
+ ms: number;
1516
+ }
1517
+ interface BenchmarkReport {
1518
+ n: number;
1519
+ excluded: number;
1520
+ /** Per-strategy means (keyed by strategy.name). */
1521
+ perStrategy: Record<string, BenchmarkStrategySummary>;
1522
+ /** The full per-task × per-strategy table — the LOSSES an optimizer (GEPA, a
1523
+ * strategy-author, an operator) consumes. Includes errored tasks with the reason. */
1524
+ perTask: BenchmarkTaskRow[];
1525
+ /** The non-dominated strategies on (score ↑, $/task ↓) — collapse-last, per the canon:
1526
+ * a strategy that ties on score at half the cost WINS and a scalar would hide it. */
1527
+ pareto: string[];
1528
+ /** The headline when both `refine` and `sample` ran: paired-bootstrap lift of refine over sample. */
1529
+ refineVsSample?: BenchmarkLift;
1530
+ }
1531
+ /** Run the requested strategies over the tasks, scored by the Environment's own check.
1532
+ * Resilient: a task whose rollouts fail (transient infra) is excluded from the stats but
1533
+ * reported in `perTask` with the error — never silently dropped. */
1534
+ declare function runBenchmark(cfg: BenchmarkConfig): Promise<BenchmarkReport>;
1535
+ /** Pretty-print a report — the "free optimization" verdict, with the cost vector. */
1536
+ declare function printBenchmarkReport(report: BenchmarkReport): void;
1537
+
1538
+ /**
1539
+ * createMcpEnvironment — wrap any MCP server as an `Environment` (the product-adoption
1540
+ * primitive: a product's agent tools are usually already an MCP surface, so the domain
1541
+ * only writes the lifecycle hooks — open a scoped artifact, score it with a deployable
1542
+ * check, close it — and the tool plumbing is derived from the server).
1543
+ *
1544
+ * What the helper owns (the generic 80%, hardened on the EnterpriseOps gym):
1545
+ * - JSON-RPC `tools/list` → `AgenticTool[]`, with schemas coerced to the
1546
+ * OpenAI-tool-valid shape (top-level oneOf/anyOf/allOf/enum/not are rejected by
1547
+ * tool-calling providers; nested combinators are fine).
1548
+ * - JSON-RPC `tools/call` → the tool's text content (errors surfaced as `ERROR: …`
1549
+ * strings — a bad call is the agent's outcome, not an infra fault).
1550
+ * - SSE response parsing (streamable-HTTP MCP servers answer with `data:` lines).
1551
+ * - Bounded retry with backoff on thrown fetches (transient network ≠ task failure).
1552
+ *
1553
+ * What the domain supplies: `open` (create/seed the per-task artifact and return its
1554
+ * MCP endpoint — url + headers carry the per-artifact scoping, e.g. a database id
1555
+ * header), `score` (the deployable check), and optional `close`/`selectTools`.
1556
+ */
1557
+
1558
+ /** Where a handle's MCP server lives; headers carry per-artifact scoping. */
1559
+ interface McpEndpoint {
1560
+ url: string;
1561
+ headers?: Record<string, string>;
1562
+ }
1563
+ interface McpEnvironmentOptions {
1564
+ name: string;
1565
+ /** Create/seed the per-task artifact; return its handle + the MCP endpoint scoped to it. */
1566
+ open(task: AgenticTask): Promise<{
1567
+ handle: ArtifactHandle;
1568
+ endpoint: McpEndpoint;
1569
+ }>;
1570
+ /** The deployable check over the artifact's current state. */
1571
+ score(task: AgenticTask, handle: ArtifactHandle): Promise<SurfaceScore>;
1572
+ /** Teardown (delete the seeded artifact). Optional — omit for stateless servers. */
1573
+ close?(handle: ArtifactHandle): Promise<void>;
1574
+ /** Restrict/order the server's tools per task (e.g. the task's selected_tools). Default: all. */
1575
+ selectTools?(task: AgenticTask, all: AgenticTool[]): AgenticTool[];
1576
+ /** Cap on a tool result's text fed back to the worker. Default 1500 chars. */
1577
+ maxResultChars?: number;
1578
+ }
1579
+ declare function createMcpEnvironment(opts: McpEnvironmentOptions): Environment;
1580
+
957
1581
  /**
958
1582
  * @experimental
959
1583
  *
@@ -1175,7 +1799,7 @@ declare class FileCorpus implements Corpus {
1175
1799
  * An empty query result returns a fresh COPY of the profile with no instruction change (a valid
1176
1800
  * "nothing learned yet" read, not an error).
1177
1801
  */
1178
- declare function renderCorpusToInstructions(opts: RenderCorpusToInstructionsOptions): Promise<AgentProfile$1>;
1802
+ declare function renderCorpusToInstructions(opts: RenderCorpusToInstructionsOptions): Promise<AgentProfile>;
1179
1803
 
1180
1804
  /**
1181
1805
  * @experimental
@@ -1282,6 +1906,64 @@ declare function trajectoryReport(journal: SpawnJournal, blobs: ResultBlobStore,
1282
1906
  */
1283
1907
  declare function equalKOnCost(arms: ReadonlyArray<EqualKArm>, options?: EqualKOnCostOptions): EqualKVerdict;
1284
1908
 
1909
+ interface PromotionGateOptions {
1910
+ /** The HOLDOUT report — must carry per-task cells for both strategy names. */
1911
+ report: BenchmarkReport;
1912
+ /** The incumbent champion's strategy name. */
1913
+ incumbent: string;
1914
+ /** The challenger's strategy name. */
1915
+ candidate: string;
1916
+ /** 'superiority' (default): the candidate must score significantly BETTER.
1917
+ * 'non-inferiority': the candidate must prove its score is not worse than the
1918
+ * incumbent by more than `scoreTolerance` AND its cost savings are significant —
1919
+ * the gate for "same quality, cheaper" claims. */
1920
+ mode?: 'superiority' | 'non-inferiority';
1921
+ /** non-inferiority: the score CI lower bound must clear −scoreTolerance. Default 0.05. */
1922
+ scoreTolerance?: number;
1923
+ /** The CI lower bound on the paired lift must EXCEED this (score scale). Default 0. */
1924
+ deltaThreshold?: number;
1925
+ /** Minimum paired tasks before significance can be claimed. Default 6 — below that
1926
+ * the bootstrap CI is too wide to separate a real lift from the per-task noise. */
1927
+ minPairedTasks?: number;
1928
+ /** Bootstrap statistic over the paired deltas. Default 'mean'. */
1929
+ statistic?: 'mean' | 'median';
1930
+ /** Fixed by the substrate by default — the same report always yields the same verdict. */
1931
+ seed?: number;
1932
+ resamples?: number;
1933
+ }
1934
+ interface PromotionVerdict {
1935
+ promoted: boolean;
1936
+ reason: 'identical-champion' | 'few-tasks' | 'no-margin' | 'significant' | 'non-inferior-and-cheaper' | 'non-inferiority-unproven' | 'not-cheaper';
1937
+ mode: 'superiority' | 'non-inferiority';
1938
+ /** Paired tasks that carried both strategies' cells. */
1939
+ n: number;
1940
+ /** Paired (candidate − incumbent) lift across the holdout tasks. */
1941
+ lift: {
1942
+ mean: number;
1943
+ median: number;
1944
+ low: number;
1945
+ high: number;
1946
+ };
1947
+ /** non-inferiority mode: paired (incumbent − candidate) cost SAVINGS per task (usd) —
1948
+ * positive means the candidate is cheaper; significant iff the CI low clears zero. */
1949
+ costSavings?: {
1950
+ mean: number;
1951
+ median: number;
1952
+ low: number;
1953
+ high: number;
1954
+ };
1955
+ /** Paired (candidate − incumbent) wall-clock per task (ms) — negative = the candidate
1956
+ * is FASTER. Informational in every mode (never gates); the latency answer to "what
1957
+ * does this win actually cost the user?". */
1958
+ latency?: {
1959
+ mean: number;
1960
+ median: number;
1961
+ low: number;
1962
+ high: number;
1963
+ };
1964
+ }
1965
+ declare function promotionGate(opts: PromotionGateOptions): PromotionVerdict;
1966
+
1285
1967
  /**
1286
1968
  * Bridge a finished `runLoop` into an agent-eval campaign / profile-matrix
1287
1969
  * dispatch.
@@ -1711,6 +2393,284 @@ interface OpenSandboxRunOptions {
1711
2393
  */
1712
2394
  declare function openSandboxRun<Out>(client: SandboxClient, options: OpenSandboxRunOptions, deliverable: Deliverable<Out>): Promise<SandboxRun<Out>>;
1713
2395
 
2396
+ /**
2397
+ * authorStrategy — the agent-authored layer as a package primitive (software-3.0): an
2398
+ * LLM reads a benchmark's per-task LOSSES + the defineStrategy contract and writes a NEW
2399
+ * optimization strategy as code; the caller gates it like any human-built candidate
2400
+ * (runBenchmark + a frozen holdout).
2401
+ *
2402
+ * Structurally safe by construction: the authored body composes shot()/critique() and
2403
+ * spends through the Supervisor's conserved pool — it can be wrong, but it cannot
2404
+ * Goodhart the check (it never sees the verifiers) and it cannot win by overspending.
2405
+ *
2406
+ * The authored module is written to `outDir` and dynamically imported — run under a
2407
+ * TS-capable loader (tsx) since models often emit type annotations.
2408
+ */
2409
+
2410
+ /** The compressed consumable a skill carries: everything an author needs to emit a loop. */
2411
+ declare const strategyAuthorContract = "\nYou author an OPTIMIZATION STRATEGY for an agentic loop system. A strategy decides how to\nspend a compute budget to beat a task's deployable check. You compose exactly two steps:\n\n shot(spec?: { handle?, messages?, steer?, persona?, tools? }): Promise<ShotResult | null>\n Runs ONE worker attempt (a bounded tool loop) over an artifact.\n - omit handle => the shot opens its OWN fresh artifact and closes it after (a sample).\n - pass handle => the shot CONTINUES that artifact (state accumulates across shots).\n - messages => the carried conversation (pass the previous ShotResult.messages to continue).\n - steer => a corrective instruction injected before the shot.\n - persona => { systemPrompt?, model? } \u2014 give THIS shot its own role and/or model\n (multi-agent strategies: a researcher shot then an engineer shot, a panel of k\n personas over one budget). On a fresh shot the systemPrompt replaces the task's; on\n a carried conversation it arrives as a hand-off message. Same conserved budget.\n - tools => string[] \u2014 restrict THIS shot to a subset of the task's tools by\n name (focus an explore shot on read-only tools, an execute shot on write tools).\n Restriction-only; unknown names make the shot fail. ALWAYS select from\n await listTools(handle) \u2014 never hardcode. Omitted => the shot sees every tool.\n ShotResult = { messages, score (0..1 on the task's check), passes, total, completions, toolErrors }\n Returns null if the attempt failed infra-wise.\n\n critique(messages): Promise<string | null>\n A firewalled trace-analyst reads the attempt's trajectory and returns ONE corrective\n instruction (or null when it judges the work complete). 
Costs ~1 completion.\n\n consult(messages, instruction): Promise<string | null>\n The RAW analyst channel: the same firewalled critic answers YOUR instruction over the\n trajectory verbatim (no reformatting) \u2014 use it when you need a specific reply format\n (a decision, a prediction). Costs ~1 completion.\n\n surface.open(task) / surface.close(handle)\n Open a persistent artifact you manage yourself (remember to close in a finally).\n close is idempotent \u2014 closing an already-closed handle is a safe no-op.\n\n listTools(handle): Promise<Array<{ name, description? }>>\n The tools THIS task actually offers. TOOL SETS VARY PER TASK \u2014 if you restrict a\n shot with `tools`, you MUST pick names from await listTools(handle); hardcoding\n names from an example kills your shots on every task whose tools differ.\n\nRules:\n- ALWAYS await every shot/critique/surface call \u2014 a floating promise that rejects\n crashes the whole benchmark run.\n- Stay within ~budget total shots; every shot/critique spends from a conserved pool.\n- For a FRESH attempt OMIT `messages` entirely (never pass `[]` \u2014 an empty array is a\n fresh conversation too, but be explicit). To CONTINUE, pass the previous\n ShotResult.messages unchanged.\n- Return { score, resolved, completions, progression, shots } \u2014 score = the BEST checkpoint\n you reached (keep-best, never final-state), progression = score after each shot.\n- The module must be EXACTLY this shape (no other imports, no commentary outside code):\n\nimport { defineStrategy } from '@tangle-network/agent-runtime/loops'\nexport default defineStrategy('your-strategy-name', async ({ surface, task, budget, shot, critique, listTools }) => {\n // your composition (listTools comes from the destructured context \u2014 it is NOT a global)\n})\n";
2412
+ interface AuthorStrategyOptions {
2413
+ /** The model-call seam (agent-eval `createChatClient`). */
2414
+ chat: ChatClient;
2415
+ model?: string;
2416
+ /** A NAMED fallback author tried once when the primary call fails or returns no code
2417
+ * block (thinking models time out at the edge on long authoring prompts, or return
2418
+ * empty content without `maxTokens`). Opt-in — absent means the primary's failure
2419
+ * propagates. */
2420
+ fallbackModel?: string;
2421
+ /** The contract text shown to the author. Default `strategyAuthorContract`. The
2422
+ * meta-optimization coordinate: a GEPA/skill loop can evolve this text and gate each
2423
+ * variant on the same frozen holdout as any strategy. */
2424
+ contract?: string;
2425
+ /** The environment the losses came from (orientation only — never the verifiers). */
2426
+ environmentName: string;
2427
+ /** The per-task losses table (e.g. JSON.stringify(report.perTask)) — the gradient. */
2428
+ lossesJson: string;
2429
+ /** The budget the strategy must respect (shots/width). */
2430
+ budget: number;
2431
+ /** Where the authored module file is written (created if missing). */
2432
+ outDir: string;
2433
+ temperature?: number;
2434
+ /** Completion cap — required by thinking-model authors that stream reasoning first. */
2435
+ maxTokens?: number;
2436
+ signal?: AbortSignal;
2437
+ }
2438
+ /** Static CONTRACT lint over an authored strategy module — the module-boundary
2439
+ * enforcement of the harness's two measurement invariants:
2440
+ * - author blindness: the only import allowed is the loops surface. A body that could
2441
+ * reach the filesystem, network, or process could read or mutate verifier/artifact
2442
+ * state outside the brokered shots, and the harness-verified score would stop
2443
+ * meaning "what the shots achieved".
2444
+ * - conserved dose: no out-of-band compute (fetch/require/eval) — every unit a
2445
+ * strategy spends is metered by the Supervisor's pool, which is what makes
2446
+ * equal-budget comparisons between strategies valid.
2447
+ * A lint, not a sandbox: its job is keeping the benchmark numbers interpretable. */
2448
+ declare function assertStrategyContract(code: string): void;
2449
+ interface AuthoredStrategy {
2450
+ strategy: Strategy;
2451
+ file: string;
2452
+ code: string;
2453
+ }
2454
+ /** Author + load a strategy from losses. Throws when the author emits no loadable module;
2455
+ * with `fallbackModel` set, the named fallback gets one attempt first. */
2456
+ declare function authorStrategy(opts: AuthorStrategyOptions): Promise<AuthoredStrategy>;
2457
+
2458
+ /**
2459
+ * runStrategyEvolution — the multi-generation strategy search: per generation the system
2460
+ * authors a POPULATION of candidate strategies from the current tournament's losses,
2461
+ * plays them against the incumbent at equal budget, and advances a champion; one final
2462
+ * promotion decision runs on a NEVER-BEFORE-USED holdout slice through `promotionGate`.
2463
+ *
2464
+ * Measurement invariants (the reasons this design is shaped the way it is):
2465
+ * - The author sees TRAIN losses only. The holdout slice is drawn fresh (disjoint task
2466
+ * offsets) after all authoring is done — one promotion decision, one untouched slice,
2467
+ * so adaptive reuse of evaluation data never enters the verdict.
2468
+ * - Every tournament runs at the same per-strategy budget through the conserved pool;
2469
+ * candidates cannot win by overspending.
2470
+ * - Champion selection within the search is a SEARCH policy (configurable, default
2471
+ * cost-aware: ties on score go to the cheapest strategy — a scalar hides a strategy
2472
+ * that ties at half the cost). The promotion verdict never comes from search
2473
+ * selection; it comes from the gate on the fresh slice.
2474
+ * - Every authored artifact's description length (gzip bits) is recorded, so the
2475
+ * artifact-complexity-vs-holdout-gap relation is analyzable from any run's report.
2476
+ *
2477
+ * Lineage fields (`parent`, `generation`) are recorded on every archive node so a
2478
+ * descendant-productivity parent-selection policy can be added without changing the
2479
+ * report schema; the v1 search authors from the latest tournament's losses.
2480
+ */
2481
+
2482
+ interface EvolutionAuthor {
2483
+ /** The model-call seam (agent-eval `createChatClient`). */
2484
+ chat: ChatClient;
2485
+ model?: string;
2486
+ fallbackModel?: string;
2487
+ temperature?: number;
2488
+ maxTokens?: number;
2489
+ }
2490
+ type ChampionPolicy = 'score' | 'costAware';
2491
+ interface StrategyEvolutionConfig {
2492
+ environment: Environment;
2493
+ /** Task supply by DISJOINT slice: `(offset, n)` must return n tasks unique to that
2494
+ * offset range. Train draws [0, trainN); the holdout draws [trainN + holdoutOffset,
2495
+ * …) — tasks the search never touched. */
2496
+ tasks: (offset: number, n: number) => Promise<AgenticTask[]>;
2497
+ trainN: number;
2498
+ holdoutN: number;
2499
+ /** Extra offset past the train slice for the holdout draw (rotate across runs). */
2500
+ holdoutOffset?: number;
2501
+ worker: AgenticOptions;
2502
+ author: EvolutionAuthor;
2503
+ /** Rollouts (sample) / shots (refine) per strategy per task. Default 3. */
2504
+ budget?: number;
2505
+ concurrency?: number;
2506
+ /** Author→tournament rounds after gen0. Default 2. */
2507
+ generations?: number;
2508
+ /** Authored candidates per generation. Default 2. */
2509
+ populationSize?: number;
2510
+ /** The gen0 field. Default [sample, refine, sampleThenRefine]. */
2511
+ baselines?: Strategy[];
2512
+ /** What "better" means for PROMOTION. 'score' (default): the candidate must beat the
2513
+ * incumbent's score (superiority gate). 'cost': the candidate must prove score
2514
+ * NON-INFERIORITY (not worse by more than `scoreTolerance`) plus significant cost
2515
+ * savings — the "same quality, cheaper" objective. The author is told the objective
2516
+ * and sees per-task spend either way. */
2517
+ objective?: 'score' | 'cost';
2518
+ /** Cost objective: the score CI lower bound must clear −scoreTolerance. Default 0.05. */
2519
+ scoreTolerance?: number;
2520
+ /** Search-side champion selection. Default 'costAware'. */
2521
+ champion?: ChampionPolicy;
2522
+ /** Score band treated as a tie under 'costAware'. Default 0.01. */
2523
+ championEpsilon?: number;
2524
+ /** Where authored modules are written. */
2525
+ outDir: string;
2526
+ /** Promotion-gate evidence floor (paired holdout tasks). */
2527
+ minPairedTasks?: number;
2528
+ /** BAND-AWARE scoring — concentrate the measurement where lift is possible.
2529
+ * Holdout: draw `holdoutPoolN` candidate tasks and run `baselines[0]` once at the run
2530
+ * budget as an INDEPENDENT reference screen; keep tasks scoring ≤ `maxRefScore`
2531
+ * (headroom exists) and take the first `holdoutN`. Band membership is decided before
2532
+ * either finalist touches a task and both finalists then face the SAME tasks — the
2533
+ * estimand becomes "paired lift on headroom tasks", pre-registered by this config.
2534
+ * Train: champion selection ignores zero-spread tasks (every field strategy scored
2535
+ * identically — zero selection information, pure noise dilution). */
2536
+ band?: {
2537
+ holdoutPoolN: number;
2538
+ /** Keep holdout tasks where the reference scores ≤ this. Default 0.99 — drop only
2539
+ * tasks the reference already solves fully (no headroom, a candidate can only tie). */
2540
+ maxRefScore?: number;
2541
+ };
2542
+ /** What the author learns from a tournament. 'exact' (default) = scores + progressions
2543
+ * per task; 'binary' = pass/fail only — the leakage-bounded channel (one bit per cell
2544
+ * per generation reaches the author from the evaluation data). */
2545
+ lossesDetail?: 'exact' | 'binary';
2546
+ /** Reproducer certification (arXiv:2606.11045): when the final champion is AUTHORED,
2547
+ * compress it to a short natural-language summary, have a fresh author re-implement
2548
+ * from the summary alone (no losses, no code), and score the reproduction on the same
2549
+ * holdout. A reproduction gap is an overfitting signal (the cited paper's detector: 100%
2550
+ * sensitivity / 91% specificity in the ML-agent setting) — recorded on the report,
2551
+ * never gate-blocking in v1. */
2552
+ reproducerCheck?: {
2553
+ /** Word budget for the strategy summary. Default 64. */
2554
+ summaryMaxWords?: number;
2555
+ /** Reproduction counts as faithful when reproducedHoldoutScore ≥ championHoldoutScore − tolerance.
2556
+ * Default 0.05. */
2557
+ tolerance?: number;
2558
+ };
2559
+ /** Endurance: write the run state after every completed phase; with `resume`, a
2560
+ * restart skips completed phases (authored modules re-imported from their files).
2561
+ * Worst case after a mid-run death is re-paying ONE phase, never the run. */
2562
+ checkpoint?: {
2563
+ path: string;
2564
+ resume?: boolean;
2565
+ };
2566
+ /** Called before each benchmark phase (gen0, gen1…, band-screen, holdout, reproduce).
2567
+ * The seam for environment recycling — no artifacts span phases, so a runner may
2568
+ * recreate a wedge-prone environment container here. */
2569
+ onPhase?: (phase: string) => Promise<void>;
2570
+ onTask?: (phase: string, row: BenchmarkTaskRow, done: number, total: number) => void;
2571
+ hooks?: RuntimeHooks;
2572
+ }
2573
+ interface ChampionPick {
2574
+ name: string;
2575
+ score: number;
2576
+ usd: number;
2577
+ }
2578
+ interface EvolutionCandidate {
2579
+ name: string;
2580
+ file?: string;
2581
+ gzipBits?: number;
2582
+ codeChars?: number;
2583
+ /** Present when this author attempt failed (recorded, never silent). */
2584
+ error?: string;
2585
+ }
2586
+ interface EvolutionGeneration {
2587
+ generation: number;
2588
+ candidates: EvolutionCandidate[];
2589
+ report: BenchmarkReport;
2590
+ champion: ChampionPick;
2591
+ }
2592
+ interface EvolutionArchiveNode {
2593
+ name: string;
2594
+ source: 'baseline' | 'authored';
2595
+ generation: number;
2596
+ /** The champion whose tournament losses this candidate was authored from. */
2597
+ parent?: string;
2598
+ gzipBits?: number;
2599
+ file?: string;
2600
+ /** Latest measured tournament result — 0 until the node's first tournament settles
2601
+ * (an authored node is created before its generation's benchmark runs). */
2602
+ score: number;
2603
+ usd: number;
2604
+ }
2605
+ interface ReproductionCheck {
2606
+ /** The compressed strategy description the reproducer implemented from. */
2607
+ summary: string;
2608
+ reproducedName: string;
2609
+ file?: string;
2610
+ championHoldoutScore: number;
2611
+ reproducedHoldoutScore: number;
2612
+ /** championHoldoutScore − reproducedHoldoutScore (positive = the reproduction fell short). */
2613
+ gap: number;
2614
+ /** True when reproducedHoldoutScore ≥ championHoldoutScore − tolerance. A failed reproduction is an
2615
+ * overfitting signal: the champion's win did not fit through the summary. */
2616
+ reproducible: boolean;
2617
+ /** Infra failure during reproduction (distinct from a semantic reproduction failure). */
2618
+ error?: string;
2619
+ }
2620
+ interface EvolutionBandInfo {
2621
+ /** Number of tasks screened by the reference on the holdout pool. */
2622
+ screened: number;
2623
+ /** Number of tasks kept (reference score ≤ maxRefScore) before truncating to holdoutN. */
2624
+ inBand: number;
2625
+ /** Reference scores per screened task (the screening record). */
2626
+ refScores: Array<{
2627
+ taskId: string;
2628
+ score: number;
2629
+ }>;
2630
+ }
2631
+ interface EvolutionReport {
2632
+ gen0: BenchmarkReport;
2633
+ gen0Champion: ChampionPick;
2634
+ generations: EvolutionGeneration[];
2635
+ archive: EvolutionArchiveNode[];
2636
+ finalChampion: ChampionPick;
2637
+ holdout: BenchmarkReport;
2638
+ verdict: PromotionVerdict;
2639
+ /** Present when band screening ran — the verdict's estimand is then "paired lift on
2640
+ * headroom tasks" (band membership fixed by the reference screen, pre-registered). */
2641
+ band?: EvolutionBandInfo;
2642
+ /** Present when reproducerCheck ran (final champion was authored). */
2643
+ reproduction?: ReproductionCheck;
2644
+ /** SEARCH TELEMETRY, not evidence: each entry is that generation's own train-slice
2645
+ * re-measurement, so cross-generation deltas mix true drift with run-to-run variance
2646
+ * (entries are unpaired across generations). The only evidence-grade comparison in
2647
+ * this report is `verdict` — both finalists measured fresh, paired, on the holdout. */
2648
+ trajectory: Array<{
2649
+ generation: number;
2650
+ champion: string;
2651
+ score: number;
2652
+ usd: number;
2653
+ }>;
2654
+ }
2655
+ /** Strategy means recomputed over the DISCRIMINATING tasks only — tasks where the field
2656
+ * strategies did not all score identically. Zero-spread tasks (everyone 1.0, everyone
2657
+ * 0.0, everyone tied) carry no selection information; averaging over them dilutes real
2658
+ * differences toward zero. Search-side denoising only — the gate never uses this. */
2659
+ declare function discriminatingMeans(report: BenchmarkReport, fieldOrder: string[]): Record<string, {
2660
+ score: number;
2661
+ usd: number;
2662
+ }> | null;
2663
+ /** The champion pick over a means table. 'score' takes the best mean score (ties →
2664
+ * field order). 'costAware' treats scores within `epsilon` of the best as tied and
2665
+ * takes the cheapest — the (score, $) Pareto rule collapsed to one pick. */
2666
+ declare function pickChampion(means: Record<string, {
2667
+ score: number;
2668
+ usd: number;
2669
+ }>, fieldOrder: string[], policy: ChampionPolicy, epsilon: number): ChampionPick;
2670
+ /** Search-side champion selection over a tournament report. */
2671
+ declare function selectChampion(report: BenchmarkReport, fieldOrder: string[], policy: ChampionPolicy, epsilon: number): ChampionPick;
2672
+ declare function runStrategyEvolution(cfg: StrategyEvolutionConfig): Promise<EvolutionReport>;
2673
+
1714
2674
  /**
1715
2675
  * @experimental
1716
2676
  *
@@ -1869,6 +2829,36 @@ interface BridgeSeam {
1869
2829
  agentProfile?: Record<string, unknown>;
1870
2830
  timeoutMs?: number;
1871
2831
  }
2832
+ /** An OpenAI-shape function tool the model may call. */
2833
+ interface ToolSpec {
2834
+ type: 'function';
2835
+ function: {
2836
+ name: string;
2837
+ description?: string;
2838
+ parameters: unknown;
2839
+ };
2840
+ }
2841
+ /**
2842
+ * Router seam WITH tool use — the tool-using router backend. Same direct
2843
+ * OpenAI-compatible endpoint as `RouterSeam`, but each turn passes `tools`; when
2844
+ * the model emits tool_calls they run via `executeToolCall` ON THIS HOST and the
2845
+ * results fold back as `tool` messages, repeating until the model answers without
2846
+ * a tool or `maxTurns` is hit. A real agentic loop, OFF-BOX — no sandbox, so it
2847
+ * is unaffected by a box's egress allowlist. One turn = one completion = the
2848
+ * equal-compute unit. `executeToolCall` receives the task so per-task tool
2849
+ * surfaces (e.g. a gym keyed by task) can dispatch correctly.
2850
+ */
2851
+ interface RouterToolsSeam {
2852
+ routerBaseUrl: string;
2853
+ routerKey: string;
2854
+ model?: string;
2855
+ tools: ReadonlyArray<ToolSpec>;
2856
+ executeToolCall: (name: string, args: Record<string, unknown>, task: unknown) => Promise<string>;
2857
+ /** Max inference turns. Default 200 (runaway backstop — set far above any
2858
+ * legitimate workflow). For tighter per-workflow limits use a cost budget
2859
+ * or wall-clock deadline at the call site. */
2860
+ maxTurns?: number;
2861
+ }
1872
2862
  /**
1873
2863
  * The single built-in executor entrypoint. The backend is DATA — the cost dial a
1874
2864
  * profile, an experiment config, or a replay journal can name — not an import
@@ -1879,6 +2869,8 @@ interface BridgeSeam {
1879
2869
  type ExecutorConfig = ({
1880
2870
  backend: 'router';
1881
2871
  } & RouterSeam) | ({
2872
+ backend: 'router-tools';
2873
+ } & RouterToolsSeam) | ({
1882
2874
  backend: 'bridge';
1883
2875
  } & BridgeSeam) | ({
1884
2876
  backend: 'cli';
@@ -2016,6 +3008,47 @@ declare function createSupervisor<Task, Out>(): Supervisor<Task, Out>;
2016
3008
  */
2017
3009
  declare function createRootHandle<Out>(): RootHandle<Out>;
2018
3010
 
3011
+ /**
3012
+ * createVerifierEnvironment — ANY checkable task as an `Environment`, no tool surface
3013
+ * required. The generalization piece: EOPS/commit0-style domains have tools that mutate
3014
+ * an external artifact, but math problems, legal drafts, creative briefs, GTM copy, and
3015
+ * QA tasks have a different shape — the artifact IS the worker's answer, and the domain
3016
+ * is defined by one function: the deployable check over that answer.
3017
+ *
3018
+ * const gsm8k = createVerifierEnvironment({
3019
+ * name: 'gsm8k',
3020
+ * check: (task, answer) => ({
3021
+ * passes: extractFinalNumber(answer) === task.meta?.answer ? 1 : 0,
3022
+ * total: 1,
3023
+ * errored: 0,
3024
+ * }),
3025
+ * })
3026
+ * await runBenchmark({ environment: gsm8k, tasks, worker }) // sample vs refine on math
3027
+ *
3028
+ * The worker gets one built-in tool — `submit_answer` — plus any read-only domain tools
3029
+ * the caller adds (a calculator, a retrieval call, a style guide lookup). Every
3030
+ * submission is kept; `score()` checks the BEST submission (keep-best is the measured
3031
+ * law: workers reach correct answers then revise past them). The refine strategy's
3032
+ * critic reads the submission trajectory like any other trace, so iterate-with-feedback
3033
+ * works unchanged on answer domains.
3034
+ *
3035
+ * The check can be graded (passes/total expresses partial credit — rubric points,
3036
+ * sub-answers, unit-test counts), and MUST be deployable (computable without an oracle
3037
+ * at serve time): exact/numeric match, schema validation, a compiled rubric — not a
3038
+ * peek at held-out labels the production system wouldn't have.
3039
+ */
3040
+
3041
+ interface VerifierEnvironmentOptions {
3042
+ name: string;
3043
+ /** The deployable check over a submitted answer. Graded via passes/total. */
3044
+ check(task: AgenticTask, answer: string): Promise<SurfaceScore> | SurfaceScore;
3045
+ /** Extra domain tools (read-only helpers: calculator, retrieval, style lookup). */
3046
+ extraTools?: AgenticTool[];
3047
+ /** Executes the extra tools. Required when `extraTools` is set. */
3048
+ callExtra?(task: AgenticTask, name: string, args: Record<string, unknown>): Promise<string> | string;
3049
+ }
3050
+ declare function createVerifierEnvironment(opts: VerifierEnvironmentOptions): Environment;
3051
+
2019
3052
  /** Command runner seam. Host code can use `localShell`; sandbox code can wrap `box.exec`. */
2020
3053
  type Shell = (args: ReadonlyArray<string>, cwd?: string) => Promise<{
2021
3054
  stdout: string;
@@ -2043,5 +3076,10 @@ interface GitWorkspaceOptions {
2043
3076
  readonly noHooks?: boolean;
2044
3077
  }
2045
3078
  declare function gitWorkspace(opts: GitWorkspaceOptions): Workspace;
3079
+ /** A jj-backed `Workspace` (Jujutsu, colocated with git for the durable remote).
3080
+ * Same port, same `Shell` — a drop-in for `gitWorkspace`. jj suits agent loops:
3081
+ * no staging area, and a first-class operation log (native resume/undo). Live use
3082
+ * requires `jj` on the `Shell`'s host. */
3083
+ declare function jjWorkspace(opts: GitWorkspaceOptions): Workspace;
2046
3084
 
2047
- export { Agent, AgentRunSpec, AgentSpec, type AssertTraceDerivedFindings, type BridgeSeam, Budget, type BudgetPool, type BudgetReadout, type CheckpointCapableBox, type CliSeam, type CombinatorShape, type Corpus, type CorpusFilter, type CorpusRecord, type CreateScopeAnalystOptions, type CriuCapableClient, type DefinePersona, type DefinePersonaInput, type Deliverable, type EqualKArm, type EqualKOnCost, type EqualKOnCostOptions, type EqualKVerdict, ExecCtx, type ExecutorConfig, ExecutorFactory, ExecutorRegistry, type Fanout, type FanoutOptions, type FanoutSynthesis, FileCorpus, FileResultBlobStore, FileSpawnJournal, type FlatWidenGate, type ForkCapableBox, type GitWorkspaceOptions, InMemoryCorpus, InMemoryResultBlobStore, InMemorySpawnJournal, Iteration, type LoopDispatchOptions, type LoopOptionsForDispatch, LoopResult, type LoopShape, LoopTokenUsage, type LoopUntil, type LoopUntilSpec, type LoopUntilState, NodeId, type Observation, type ObserveInput, type ObserveOptions, type OpenSandboxRunOptions, type Outcome, type Panel, type PanelJudge, type PanelSpec, type PanelVerdict, type Persona, type PersonaContext, type PersonaExecutors, type Pipeline, type PipelineStage, type RenderCorpusToInstructions, type RenderCorpusToInstructionsOptions, type ReservationTicket, ResultBlobStore, RootHandle, type RouterSeam, RunLoopOptions, type RunPersonified, type RunPersonifiedOptions, type SandboxCapabilities, SandboxClient, type SandboxLineage, type SandboxLineageHandle, type SandboxRun, type SandboxSeam, Scope, type ScopeAnalyst, type ScopeAnalyzeInput, type ScopeWidenGate, type SessionCapableBox, Settled, type ShapeBudget, type ShapeContext, type ShapeRegistry, type Shell, SpawnEvent, SpawnJournal, Spend, type SteerContext, SupervisedResult, Supervisor, type TrajectoryNode, type TrajectoryReport, type TrajectoryReportFn, type TrajectoryReportOptions, TreeView, type TurnResult, UsageEvent, type UsageSink, type Verify, type VerifySpec, type Widen, type WidenDecision, type 
WidenLineage, type WidenSpec, type Workspace, type WorkspaceCommit, acquireSandbox, assertTraceDerivedFindings, buildSteerContext, builtinShapes, contentAddress, createBudgetPool, createExecutor, createExecutorRegistry, createRootHandle, createSandboxLineage, createScope, createScopeAnalyst, createShapeRegistry, createSupervisor, definePersona, equalKOnCost, extractLlmCallEvent, fanout, flatWidenGate, gitWorkspace, inlineSandboxClient, localShell, loopDispatch, loopUntil, mapSandboxEvent, materializeTreeView, observe, openSandboxRun, panel, pipeline, probeSandboxCapabilities, registerShape, renderCorpusToInstructions, renderReport, replaySpawnTree, reportLoopUsage, runPersonified, settledToIteration, spendFromUsageEvents, trajectoryReport, verify, widen };
3085
+ export { Agent, AgentRunSpec, AgentSpec, type AgenticOptions, type AgenticRunResult, type AgenticSurface, type AgenticTask, type AgenticTool, type AnytimeReport, type AnytimeStrategySummary, type AnytimeTaskCurve, type ArtifactHandle, type AssertTraceDerivedFindings, type AuditIntentInput, type AuditIntentOptions, type AuthorStrategyOptions, type AuthoredStrategy, type BenchmarkCell, type BenchmarkConfig, type BenchmarkLift, type BenchmarkReport, type BenchmarkStrategySummary, type BenchmarkTaskRow, type BridgeSeam, Budget, type BudgetPool, type BudgetReadout, type ChampionPick, type ChampionPolicy, type CheckpointCapableBox, type CliSeam, type CombinatorShape, type Corpus, type CorpusFilter, type CorpusRecord, type CreateScopeAnalystOptions, type CriuCapableClient, type DefinePersona, type DefinePersonaInput, type Deliverable, type Environment, type EqualKArm, type EqualKOnCost, type EqualKOnCostOptions, type EqualKVerdict, type EvolutionArchiveNode, type EvolutionAuthor, type EvolutionBandInfo, type EvolutionCandidate, type EvolutionGeneration, type EvolutionReport, ExecCtx, type ExecutorConfig, ExecutorFactory, ExecutorRegistry, type Fanout, type FanoutOptions, type FanoutSynthesis, FileCorpus, FileResultBlobStore, FileSpawnJournal, type FlatWidenGate, type ForkCapableBox, type GitWorkspaceOptions, type HarvestCorpusOptions, type HarvestFailure, type HarvestReport, InMemoryCorpus, InMemoryResultBlobStore, InMemorySpawnJournal, type IntentAudit, Iteration, type LoopDispatchOptions, type LoopOptionsForDispatch, LoopResult, type LoopShape, LoopTokenUsage, type LoopUntil, type LoopUntilSpec, type LoopUntilState, type McpEndpoint, type McpEnvironmentOptions, NodeId, type Observation, type ObserveInput, type ObserveOptions, type OpenSandboxRunOptions, type Outcome, type Panel, type PanelJudge, type PanelSpec, type PanelVerdict, type Persona, type PersonaContext, type PersonaExecutors, type Pipeline, type PipelineStage, type PromotionGateOptions, type 
PromotionVerdict, type RenderCorpusToInstructions, type RenderCorpusToInstructionsOptions, type ReservationTicket, ResultBlobStore, RootHandle, type RouterSeam, type RouterToolsSeam, type RunAgenticOptions, RunLoopOptions, type RunPersonified, type RunPersonifiedOptions, type SandboxCapabilities, SandboxClient, type SandboxLineage, type SandboxLineageHandle, type SandboxRun, type SandboxSeam, Scope, type ScopeAnalyst, type ScopeAnalyzeInput, type ScopeWidenGate, type SessionCapableBox, Settled, type ShapeBudget, type ShapeContext, type ShapeRegistry, type Shell, type ShotPersona, type ShotSpec, SpawnEvent, SpawnJournal, Spend, type SteerContext, type Strategy, type StrategyCtx, type StrategyEvolutionConfig, type StrategyResult, SupervisedResult, Supervisor, type SurfaceScore, type ToolSpec, type TrajectoryNode, type TrajectoryReport, type TrajectoryReportFn, type TrajectoryReportOptions, TreeView, type TurnResult, UsageEvent, type UsageSink, type VerifierEnvironmentOptions, type Verify, type VerifySpec, type WaterfallCollector, type WaterfallReport, type WaterfallSpan, type Widen, type WidenDecision, type WidenLineage, type WidenSpec, type Workspace, type WorkspaceCommit, acquireSandbox, adaptiveRefine, anytimeReport, assertStrategyContract, assertTraceDerivedFindings, auditIntent, authorStrategy, breadthDriver, buildSteerContext, builtinShapes, contentAddress, createBudgetPool, createExecutor, createExecutorRegistry, createMcpEnvironment, createRootHandle, createSandboxLineage, createScope, createScopeAnalyst, createShapeRegistry, createSupervisor, createVerifierEnvironment, createWaterfallCollector, defaultAnalystInstruction, defaultAuditorInstruction, definePersona, defineStrategy, depthDriver, discriminatingMeans, equalKOnCost, extractLlmCallEvent, fanout, flatWidenGate, gitWorkspace, harvestCorpus, inlineSandboxClient, jjWorkspace, localShell, loopDispatch, loopUntil, mapSandboxEvent, materializeTreeView, observe, openSandboxRun, panel, pickChampion, pipeline, 
printBenchmarkReport, probeSandboxCapabilities, promotionGate, refine, registerShape, renderAnytimeTable, renderCorpusToInstructions, renderReport, replaySpawnTree, reportLoopUsage, runAgentic, runBenchmark, runPersonified, runStrategyEvolution, sample, sampleThenRefine, selectChampion, settledToIteration, spendFromUsageEvents, strategyAuthorContract, trajectoryReport, verify, widen };