@tangle-network/agent-runtime 0.46.0 → 0.48.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. package/dist/agent.d.ts +1 -1
  2. package/dist/agent.js +1 -1
  3. package/dist/analyst-loop.d.ts +1 -1
  4. package/dist/{chunk-GN75RGM6.js → chunk-656G2XCL.js} +3 -3
  5. package/dist/{chunk-65FQLI4V.js → chunk-IW2LMLK6.js} +1714 -42
  6. package/dist/chunk-IW2LMLK6.js.map +1 -0
  7. package/dist/{chunk-I42NHLKX.js → chunk-LX66I3SC.js} +11 -6
  8. package/dist/chunk-LX66I3SC.js.map +1 -0
  9. package/dist/{chunk-KPN7OQ64.js → chunk-TJS7S3HJ.js} +2 -2
  10. package/dist/{chunk-KPN7OQ64.js.map → chunk-TJS7S3HJ.js.map} +1 -1
  11. package/dist/{coder-DCWFQpmJ.d.ts → coder-CVZNGbyg.d.ts} +1 -1
  12. package/dist/{driver-C-mtBo7h.d.ts → driver-DYU2sgHr.d.ts} +1 -1
  13. package/dist/index.d.ts +7 -7
  14. package/dist/index.js +3 -3
  15. package/dist/{kb-gate-2Gwpz_27.d.ts → kb-gate-51BlLlVM.d.ts} +8 -2
  16. package/dist/{loop-runner-bin-D-K6bRp3.d.ts → loop-runner-bin-DEm4roYF.d.ts} +4 -4
  17. package/dist/loop-runner-bin.d.ts +5 -5
  18. package/dist/loop-runner-bin.js +3 -3
  19. package/dist/loops.d.ts +5 -5
  20. package/dist/loops.js +55 -1
  21. package/dist/mcp/bin.js +3 -3
  22. package/dist/mcp/index.d.ts +71 -70
  23. package/dist/mcp/index.js +199 -27
  24. package/dist/mcp/index.js.map +1 -1
  25. package/dist/{otel-export-nurzFwuJ.d.ts → otel-export-EzfsVUhh.d.ts} +1 -1
  26. package/dist/profiles.d.ts +2 -2
  27. package/dist/{run-loop-CU2Y00Si.d.ts → run-loop-DvD4aGiE.d.ts} +1 -1
  28. package/dist/runtime.d.ts +915 -71
  29. package/dist/runtime.js +55 -1
  30. package/dist/{types-BfoeiQRZ.d.ts → types-BpDfCPUp.d.ts} +5 -5
  31. package/dist/{types-DnYoHvvZ.d.ts → types-nBMuollC.d.ts} +17 -0
  32. package/dist/workflow.d.ts +2 -2
  33. package/dist/workflow.js +1 -1
  34. package/package.json +25 -14
  35. package/skills/loop-writer/SKILL.md +163 -0
  36. package/dist/chunk-65FQLI4V.js.map +0 -1
  37. package/dist/chunk-I42NHLKX.js.map +0 -1
  38. package/dist/{chunk-GN75RGM6.js.map → chunk-656G2XCL.js.map} +0 -0
package/dist/runtime.d.ts CHANGED
@@ -1,15 +1,15 @@
1
- import { AgentProfile as AgentProfile$1, BackendType, CreateSandboxOptions, SandboxInstance, SandboxEvent } from '@tangle-network/sandbox';
1
+ import { AgentProfile, BackendType, CreateSandboxOptions, SandboxInstance, SandboxEvent } from '@tangle-network/sandbox';
2
2
  export { AgentProfile, CreateSandboxOptions, SandboxEvent, SandboxInstance } from '@tangle-network/sandbox';
3
- import { R as ResultBlobStore, a as SpawnJournal, N as NodeId, b as SpawnEvent, T as TreeView, c as Settled, E as ExecutorFactory, d as AgentSpec, e as ExecutorRegistry, B as Budget, A as Agent, f as RootHandle, g as SupervisedResult, h as Spend, S as Scope, U as UsageEvent, i as Supervisor } from './types-BfoeiQRZ.js';
4
- export { j as Executor, k as ExecutorContext, l as ExecutorResult, H as Handle, m as NodeSnapshot, n as NodeStatus, o as Restart, p as RootSignal, q as Runtime, r as SpawnOpts, s as SupervisorOpts, W as WidenGate } from './types-BfoeiQRZ.js';
5
- export { A as AnalyzeInput, a as CompletionAnalyst, b as CompletionEvidence, c as CompletionPolicy, d as CompletionVerdict, C as CreateDriverOptions, D as DriverDecision, P as PlannerContext, e as TopologyMove, T as TopologyPlanner, f as completionAuthorizes, g as createDriver, h as deterministicCompletion, r as renderAnalyses, s as sentinelCompletion, i as stopSentinel } from './driver-C-mtBo7h.js';
6
- import { S as SandboxClient, b as LoopResult, c as LoopTokenUsage, R as RuntimeStreamEvent, A as AgentRunSpec, E as ExecCtx, I as Iteration } from './types-DnYoHvvZ.js';
7
- export { D as Driver, C as LoopDecisionPayload, F as LoopEndedPayload, G as LoopIterationDispatchPayload, H as LoopIterationEndedPayload, J as LoopIterationStartedPayload, a as LoopLineageOptions, M as LoopPlanDescription, N as LoopPlanPayload, f as LoopSandboxPlacement, P as LoopStartedPayload, Q as LoopTeardownFailedPayload, e as LoopTraceEmitter, T as LoopTraceEvent, L as LoopWinner, O as OutputAdapter, U as ValidationCtx, V as Validator } from './types-DnYoHvvZ.js';
8
- import { AgentProfile, AnalystFinding, DefaultVerdict } from '@tangle-network/agent-eval';
3
+ import { R as ResultBlobStore, a as SpawnJournal, N as NodeId, b as SpawnEvent, T as TreeView, c as Settled, d as AgentSpec, E as ExecutorRegistry, B as Budget, A as Agent, e as RootHandle, f as SupervisedResult, g as Spend, S as Scope, h as ExecutorFactory, U as UsageEvent, i as Supervisor } from './types-BpDfCPUp.js';
4
+ export { j as Executor, k as ExecutorContext, l as ExecutorResult, H as Handle, m as NodeSnapshot, n as NodeStatus, o as Restart, p as RootSignal, q as Runtime, r as SpawnOpts, s as SupervisorOpts, W as WidenGate } from './types-BpDfCPUp.js';
5
+ import { ChatClient, AnalystFinding, DefaultVerdict, AgentProfile as AgentProfile$1 } from '@tangle-network/agent-eval';
9
6
  export { DefaultVerdict } from '@tangle-network/agent-eval';
7
+ export { A as AnalyzeInput, a as CompletionAnalyst, b as CompletionEvidence, c as CompletionPolicy, d as CompletionVerdict, C as CreateDriverOptions, D as DriverDecision, P as PlannerContext, e as TopologyMove, T as TopologyPlanner, f as completionAuthorizes, g as createDriver, h as deterministicCompletion, r as renderAnalyses, s as sentinelCompletion, i as stopSentinel } from './driver-DYU2sgHr.js';
8
+ import { S as SandboxClient, b as LoopResult, c as LoopTokenUsage, R as RuntimeStreamEvent, A as AgentRunSpec, E as ExecCtx, I as Iteration } from './types-nBMuollC.js';
9
+ export { D as Driver, C as LoopDecisionPayload, F as LoopEndedPayload, G as LoopIterationDispatchPayload, H as LoopIterationEndedPayload, J as LoopIterationStartedPayload, a as LoopLineageOptions, M as LoopPlanDescription, N as LoopPlanPayload, f as LoopSandboxPlacement, P as LoopStartedPayload, Q as LoopTeardownFailedPayload, e as LoopTraceEmitter, T as LoopTraceEvent, L as LoopWinner, O as OutputAdapter, U as ValidationCtx, V as Validator } from './types-nBMuollC.js';
10
10
  import { Scenario, ProfileDispatchFn } from '@tangle-network/agent-eval/campaign';
11
- import { R as RunLoopOptions } from './run-loop-CU2Y00Si.js';
12
- export { c as createSandboxForSpec, d as defaultSelectWinner, r as runLoop } from './run-loop-CU2Y00Si.js';
11
+ import { R as RunLoopOptions } from './run-loop-DvD4aGiE.js';
12
+ export { c as createSandboxForSpec, d as defaultSelectWinner, r as runLoop } from './run-loop-DvD4aGiE.js';
13
13
  import { R as RuntimeHooks } from './runtime-hooks-C7JwKb9E.js';
14
14
 
15
15
  /**
@@ -114,66 +114,66 @@ declare function replaySpawnTree(journal: SpawnJournal, blobs: ResultBlobStore,
114
114
  declare function materializeTreeView(events: SpawnEvent[]): TreeView;
115
115
 
116
116
  /**
117
- * Adapt an `ExecutorFactory` into a `SandboxClient` for `runLoop`. The factory is
118
- * instantiated fresh per `streamPrompt` (mirrors the per-spawn executor lifecycle):
119
- * run once on the prompt, emit the terminal result event, tear down.
120
- */
121
- declare function inlineSandboxClient(factory: ExecutorFactory<unknown>): SandboxClient;
122
-
123
- /**
124
- * `loopDispatch` — turn `runLoop` into an agent-eval campaign dispatch.
125
- *
126
- * Without this adapter a consumer wiring `runLoop` into `runProfileMatrix` /
127
- * `runCampaign` has to, by hand, every time: (a) build an `ExecCtx` with a
128
- * sandbox client, (b) adapt the campaign `DispatchContext.trace` into a
129
- * `LoopTraceEmitter` (or lose all loop trace correlation), and (c) remember to
130
- * forward the loop's cost + tokens via `ctx.cost` (forgetting it yields a
131
- * `{0,0}` cell the backend-integrity guard reads as a stub). Three foot-guns,
132
- * the third silent. The fleet's products skipped (c) and fell back to a
133
- * `workerRecords[]` side-channel — the exact anti-pattern the substrate exists
134
- * to kill.
117
+ * auditIntent — the route-rigor analyst: is this trajectory even going the RIGHT WAY?
135
118
  *
136
- * `loopDispatch` collapses all three into one typed call:
119
+ * `observe()` critiques execution quality ("what's unfinished"). This audits ALIGNMENT —
120
+ * a different failure class the score can't see until it's too late: an agent can be
121
+ * executing flawlessly down the wrong route. The auditor reads the trajectory and
122
+ * compares three intents:
137
123
  *
138
- * const dispatch = loopDispatch({
139
- * sandboxClient,
140
- * toLoopOptions: (scenario, profile) => ({ driver, agentRun, output, validator, task }),
141
- * })
142
- * await runProfileMatrix({ profiles, scenarios, dispatch, judges, commitSha })
124
+ * declared — what the task says to do (the prompt / acceptance criteria)
125
+ * revealed — what the agent is ACTUALLY optimizing, inferred from its action pattern
126
+ * (the inverse-inference move: actions reveal objectives)
127
+ * user — what the principal actually wants (the contract, when it differs from
128
+ * the literal task text), plus where the user's own trajectory is heading
143
129
  *
144
- * Usage is reported automatically; trace events are forwarded automatically;
145
- * the ctx is built automatically. The seam becomes impossible to mis-wire.
130
+ * and returns a verdict (aligned / drifting / diverged) with evidence and ONE
131
+ * recommended intervention. FIREWALLED like every analyst: input is the trajectory and
132
+ * the intents — never the verifier or its data (zero check-leakage, so route auditing
133
+ * is always Goodhart-safe to run online).
146
134
  *
147
- * Typed structurally against the campaign `DispatchContext` (imported type-only
148
- * from `@tangle-network/agent-eval/campaign`) — a downward dependency, never an
149
- * inversion.
135
+ * Where it runs: between shots (steer the next one), as a watchdog over the lifecycle
136
+ * stream (abort-and-refund a diverged rollout — the budget pool makes early abort
137
+ * strictly valuable), or post-hoc over a whole BenchmarkReport (the meta-intent pass:
138
+ * is the LOOP optimizing the right thing — degenerate submissions, check-gaming shapes,
139
+ * objective drift across tasks).
150
140
  */
151
141
 
152
- /** runLoop options minus the `ctx` (loopDispatch builds the ctx). */
153
- type LoopOptionsForDispatch<Task, Output, Decision> = Omit<RunLoopOptions<Task, Output, Decision>, 'ctx'>;
154
- interface LoopDispatchOptions<Task, Output, Decision, TScenario extends Scenario, TArtifact> {
155
- /** Sandbox client used for every cell's `runLoop`. Supplied once. */
156
- sandboxClient: SandboxClient;
157
- /** Build the per-cell runLoop options from the scenario (+ profile, when
158
- * used with `runProfileMatrix`). */
159
- toLoopOptions: (scenario: TScenario, profile: AgentProfile) => LoopOptionsForDispatch<Task, Output, Decision>;
160
- /** Map the finished loop to the artifact the judges score. Default:
161
- * `result.winner?.output`. A loop with no winner yields `undefined` (judges
162
- * skip the cell) — but the loop's token usage is STILL reported, so the
163
- * integrity guard sees real activity. */
164
- toArtifact?: (result: LoopResult<Task, Output, Decision>) => TArtifact;
165
- /** Forward `loop.*` trace events into the campaign's scoped trace so loop
166
- * spans correlate with the cell. Default true. */
167
- forwardTrace?: boolean;
168
- /** Cost-meter source label for the loop's spend. Default `'loop'`. */
169
- costSource?: string;
142
+ interface AuditIntentInput {
143
+ /** The declared intent: the task text / acceptance criteria the agent was given. */
144
+ declaredIntent: string;
145
+ /** The trajectory so far — tool calls + results + assistant turns (any event shapes). */
146
+ trace: ReadonlyArray<unknown>;
147
+ /** The principal's actual intent when it differs from the literal task (the contract). */
148
+ userIntent?: string;
149
+ /** The loop-level purpose (meta-intent): what the WHOLE run is for — lets the auditor
150
+ * flag locally-sensible work that serves the wrong larger objective. */
151
+ metaIntent?: string;
152
+ runId?: string;
170
153
  }
171
- /**
172
- * Adapter for `runProfileMatrix` (profile is an axis). Returns a
173
- * `ProfileDispatchFn` that runs `runLoop` per (profile, scenario) cell and
174
- * reports usage automatically.
175
- */
176
- declare function loopDispatch<Task, Output, Decision, TScenario extends Scenario, TArtifact>(opts: LoopDispatchOptions<Task, Output, Decision, TScenario, TArtifact>): ProfileDispatchFn<TScenario, TArtifact>;
154
+ interface AuditIntentOptions {
155
+ chat: ChatClient;
156
+ model?: string;
157
+ /** Override the auditor instruction (optimizable like any analyst prompt). */
158
+ auditorInstruction?: string;
159
+ /** Cap trace lines fed to the auditor. Default 80. */
160
+ maxTraceLines?: number;
161
+ signal?: AbortSignal;
162
+ }
163
+ interface IntentAudit {
164
+ /** What the agent's actions reveal it is actually optimizing — one sentence. */
165
+ revealedIntent: string;
166
+ verdict: 'aligned' | 'drifting' | 'diverged';
167
+ /** Trajectory-grounded evidence for the verdict (specific calls/patterns). */
168
+ evidence: string;
169
+ /** The single recommended intervention. */
170
+ recommendation: 'continue' | 'steer' | 'abort';
171
+ /** When recommendation is 'steer': the corrective instruction to inject. */
172
+ steer?: string;
173
+ confidence: number;
174
+ }
175
+ declare const defaultAuditorInstruction: string;
176
+ declare function auditIntent(input: AuditIntentInput, opts: AuditIntentOptions): Promise<IntentAudit>;
177
177
 
178
178
  /**
179
179
  * @experimental
@@ -329,7 +329,7 @@ interface ShapeContext<D = unknown> {
329
329
  spawnChild(name: string, spec: AgentSpec): Agent<unknown, Outcome<D>>;
330
330
  /** Derive a child `AgentSpec` from the persona's root spec with an overridden profile —
331
331
  * the seam a shape uses to give a worker a narrower role/prompt than the root persona. */
332
- childSpec(profile: AgentProfile$1, harness?: BackendType | null): AgentSpec;
332
+ childSpec(profile: AgentProfile, harness?: BackendType | null): AgentSpec;
333
333
  }
334
334
  /**
335
335
  * A reusable act-body factory. Given the persona's content + seams (`ShapeContext`), it
@@ -790,7 +790,7 @@ interface RenderCorpusToInstructionsOptions {
790
790
  readonly corpus: Corpus;
791
791
  readonly filter: CorpusFilter;
792
792
  /** The profile to project the facts into. The result is a fresh profile — the input is unchanged. */
793
- readonly profile: AgentProfile$1;
793
+ readonly profile: AgentProfile;
794
794
  /** Where the rendered facts land: appended to `prompt.instructions[]` (default) or folded into
795
795
  * the single-blob `resources.instructions` string. */
796
796
  readonly target?: 'prompt' | 'resources';
@@ -799,7 +799,7 @@ interface RenderCorpusToInstructionsOptions {
799
799
  }
800
800
  /** `renderCorpusToInstructions(opts)` — the flywheel read-back projection. Async (queries the
801
801
  * durable corpus); returns a fresh `AgentProfile` with the accreted facts merged in. */
802
- type RenderCorpusToInstructions = (opts: RenderCorpusToInstructionsOptions) => Promise<AgentProfile$1>;
802
+ type RenderCorpusToInstructions = (opts: RenderCorpusToInstructionsOptions) => Promise<AgentProfile>;
803
803
  /**
804
804
  * One node in the reconstructed trajectory tree — a driver OR a leaf, with its OWN spend and the
805
805
  * spend ROLLED UP over its subtree. Reconstructed from the `SpawnJournal` (structure + per-node
@@ -899,6 +899,529 @@ interface EqualKOnCostOptions {
899
899
  /** `equalKOnCost(arms, opts)` — the cross-arm equal-compute check on conserved cost. */
900
900
  type EqualKOnCost = (arms: ReadonlyArray<EqualKArm>, options?: EqualKOnCostOptions) => EqualKVerdict;
901
901
 
902
+ /**
903
+ * The third-person observer — the connective tissue that closes the loop.
904
+ *
905
+ * A driver spawns a worker; the worker can't see itself. `observe` reads the
906
+ * worker's TRACE (what it actually did — every tool call, cost, failure) and
907
+ * produces two streams:
908
+ * - `findings` / `report` — fed back DOWN (a steer for the next attempt) and
909
+ * OUT (the operator-facing "what I noticed + what to change").
910
+ * - `learned` — durable facts written to the cross-run `Corpus` so the NEXT
911
+ * run starts smarter (the continuous half of "continuous self-improvement").
912
+ *
913
+ * Findings are TRACE-derived, never JUDGE-derived (`derived_from_judge:false`):
914
+ * the observer reads behavior, never the acceptance verdict — the selector≠judge
915
+ * firewall (docs/learning-flywheel.md). The observer is harness-agnostic: it
916
+ * reads a trace + an output, so it watches opencode, codex, hermes, or a BYO
917
+ * agent identically.
918
+ */
919
+
920
+ interface ObserveInput {
921
+ /** What the worker was asked to do. */
922
+ task: string;
923
+ /** What it produced (its final answer / artifact summary). */
924
+ output: string;
925
+ /** The worker's trace — any event array (sandbox events, tool-call records). */
926
+ trace: ReadonlyArray<unknown>;
927
+ /** Terminal status only (passed/failed/unknown) — NOT a judge score; the
928
+ * observer never reads the verdict, it reads behavior. */
929
+ outcome?: 'passed' | 'failed' | 'unknown';
930
+ /** Provenance back to the run. */
931
+ runId?: string;
932
+ }
933
+ interface ObserveOptions {
934
+ /** The model-call seam (agent-eval `createChatClient`: router / cli-bridge / …). */
935
+ chat: ChatClient;
936
+ model?: string;
937
+ /** When set, learned facts are appended (idempotent) for the next run to read. */
938
+ corpus?: Corpus;
939
+ /** Tags written onto learned facts + used by the next run's corpus query. */
940
+ tags?: ReadonlyArray<string>;
941
+ signal?: AbortSignal;
942
+ /** Cap the trace lines fed to the observer (keeps the call cheap). Default 80. */
943
+ maxTraceLines?: number;
944
+ /** Override the analyst's system instruction — the prompt that turns a trace into
945
+ * findings + recommended_actions. The analyst IS the steerer, so this is the knob a
946
+ * prompt optimizer (GEPA) tunes. Omitted ⇒ the default observer instruction. The
947
+ * firewall (trace-only, never the verdict) is structural (input has no score), so a
948
+ * custom instruction cannot break it. */
949
+ analystInstruction?: string;
950
+ }
951
+ /** The default observer instruction — exported so an optimizer can seed its population. */
952
+ declare const defaultAnalystInstruction: string;
953
+ interface Observation {
954
+ findings: AnalystFinding[];
955
+ /** Facts persisted to the corpus (empty when no corpus was supplied). */
956
+ learned: CorpusRecord[];
957
+ /** Operator-facing markdown: what the observer noticed + what to change. */
958
+ report: string;
959
+ }
960
+ declare function observe(input: ObserveInput, opts: ObserveOptions): Promise<Observation>;
961
+ /** Operator-facing report, split by who should act. The agent block is the
962
+ * steer; the operator block is the advice. */
963
+ declare function renderReport(findings: ReadonlyArray<AnalystFinding>): string;
964
+
965
+ /**
966
+ * harvestCorpus — production traces → corpus, the G2 bridge (the playbook's step 6).
967
+ * The flywheel's write side, batched: run the firewalled `observe()` analyst over a
968
+ * stream of completed runs (yesterday's production traces, a benchmark's rollouts, a
969
+ * fleet's day) and accrete the trace-derived facts into the durable corpus.
970
+ *
971
+ * Store-agnostic by design: the caller maps its trace store's rows (a
972
+ * `ProductionTraceSink` ndjson, OTLP spans, RunRecords) to `ObserveInput` — task text,
973
+ * final output, the event trace, terminal outcome. The analyst reads BEHAVIOR only
974
+ * (the firewall is structural: the input carries no judge verdict), and corpus appends
975
+ * are idempotent on (claim + tags), so re-harvesting the same window is safe.
976
+ *
977
+ * The nightly product job is then three lines:
978
+ * const runs = mapSinkRowsToObserveInputs(await readSink(yesterday))
979
+ * const report = await harvestCorpus({ runs, chat, corpus, tags: ['gtm-agent'] })
980
+ * log(report) // runsObserved / findings / learned / failures
981
+ *
982
+ * NOTE on the read side: harvesting is safe and cheap; *injecting* facts back into runs
983
+ * is the measured danger zone — naive unconditional priming tested NEGATIVE (−11.6pp,
984
+ * context pollution; docs/research/layer-across-run.md). Gate any priming design on its
985
+ * own A/B; the corpus's first consumers are operators and optimizers, not prompts.
986
+ */
987
+
988
+ interface HarvestCorpusOptions {
989
+ /** The completed runs to analyze — map your store's rows to `ObserveInput`. */
990
+ runs: AsyncIterable<ObserveInput> | Iterable<ObserveInput>;
991
+ /** The model-call seam (agent-eval `createChatClient`). */
992
+ chat: ChatClient;
993
+ model?: string;
994
+ /** The durable corpus the facts accrete into. */
995
+ corpus: Corpus;
996
+ /** Tags written onto learned facts (the product/domain key the read side queries by). */
997
+ tags?: ReadonlyArray<string>;
998
+ /** Override the analyst instruction (the GEPA-tunable knob). */
999
+ analystInstruction?: string;
1000
+ /** Runs analyzed in parallel. Default 4. */
1001
+ concurrency?: number;
1002
+ /** Hard cap on runs consumed from the stream (a cost guard for unbounded stores). */
1003
+ maxRuns?: number;
1004
+ signal?: AbortSignal;
1005
+ }
1006
+ interface HarvestFailure {
1007
+ runId: string;
1008
+ error: string;
1009
+ }
1010
+ interface HarvestReport {
1011
+ runsObserved: number;
1012
+ /** Total findings the analyst produced (including ones already known). */
1013
+ findings: number;
1014
+ /** NEW facts actually appended (idempotent dedup excludes re-learned ones). */
1015
+ learned: number;
1016
+ /** Per-run analysis failures — reported, never silently dropped. */
1017
+ failures: HarvestFailure[];
1018
+ }
1019
+ declare function harvestCorpus(opts: HarvestCorpusOptions): Promise<HarvestReport>;
1020
+
1021
+ /**
1022
+ * Adapt an `ExecutorFactory` into a `SandboxClient` for `runLoop`. The factory is
1023
+ * instantiated fresh per `streamPrompt` (mirrors the per-spawn executor lifecycle):
1024
+ * run once on the prompt, emit the terminal result event, tear down.
1025
+ */
1026
+ declare function inlineSandboxClient(factory: ExecutorFactory<unknown>): SandboxClient;
1027
+
1028
+ /**
1029
+ * `loopDispatch` — turn `runLoop` into an agent-eval campaign dispatch.
1030
+ *
1031
+ * Without this adapter a consumer wiring `runLoop` into `runProfileMatrix` /
1032
+ * `runCampaign` has to, by hand, every time: (a) build an `ExecCtx` with a
1033
+ * sandbox client, (b) adapt the campaign `DispatchContext.trace` into a
1034
+ * `LoopTraceEmitter` (or lose all loop trace correlation), and (c) remember to
1035
+ * forward the loop's cost + tokens via `ctx.cost` (forgetting it yields a
1036
+ * `{0,0}` cell the backend-integrity guard reads as a stub). Three foot-guns,
1037
+ * the third silent. The fleet's products skipped (c) and fell back to a
1038
+ * `workerRecords[]` side-channel — the exact anti-pattern the substrate exists
1039
+ * to kill.
1040
+ *
1041
+ * `loopDispatch` collapses all three into one typed call:
1042
+ *
1043
+ * const dispatch = loopDispatch({
1044
+ * sandboxClient,
1045
+ * toLoopOptions: (scenario, profile) => ({ driver, agentRun, output, validator, task }),
1046
+ * })
1047
+ * await runProfileMatrix({ profiles, scenarios, dispatch, judges, commitSha })
1048
+ *
1049
+ * Usage is reported automatically; trace events are forwarded automatically;
1050
+ * the ctx is built automatically. The seam becomes impossible to mis-wire.
1051
+ *
1052
+ * Typed structurally against the campaign `DispatchContext` (imported type-only
1053
+ * from `@tangle-network/agent-eval/campaign`) — a downward dependency, never an
1054
+ * inversion.
1055
+ */
1056
+
1057
+ /** runLoop options minus the `ctx` (loopDispatch builds the ctx). */
1058
+ type LoopOptionsForDispatch<Task, Output, Decision> = Omit<RunLoopOptions<Task, Output, Decision>, 'ctx'>;
1059
+ interface LoopDispatchOptions<Task, Output, Decision, TScenario extends Scenario, TArtifact> {
1060
+ /** Sandbox client used for every cell's `runLoop`. Supplied once. */
1061
+ sandboxClient: SandboxClient;
1062
+ /** Build the per-cell runLoop options from the scenario (+ profile, when
1063
+ * used with `runProfileMatrix`). */
1064
+ toLoopOptions: (scenario: TScenario, profile: AgentProfile$1) => LoopOptionsForDispatch<Task, Output, Decision>;
1065
+ /** Map the finished loop to the artifact the judges score. Default:
1066
+ * `result.winner?.output`. A loop with no winner yields `undefined` (judges
1067
+ * skip the cell) — but the loop's token usage is STILL reported, so the
1068
+ * integrity guard sees real activity. */
1069
+ toArtifact?: (result: LoopResult<Task, Output, Decision>) => TArtifact;
1070
+ /** Forward `loop.*` trace events into the campaign's scoped trace so loop
1071
+ * spans correlate with the cell. Default true. */
1072
+ forwardTrace?: boolean;
1073
+ /** Cost-meter source label for the loop's spend. Default `'loop'`. */
1074
+ costSource?: string;
1075
+ }
1076
+ /**
1077
+ * Adapter for `runProfileMatrix` (profile is an axis). Returns a
1078
+ * `ProfileDispatchFn` that runs `runLoop` per (profile, scenario) cell and
1079
+ * reports usage automatically.
1080
+ */
1081
+ declare function loopDispatch<Task, Output, Decision, TScenario extends Scenario, TArtifact>(opts: LoopDispatchOptions<Task, Output, Decision, TScenario, TArtifact>): ProfileDispatchFn<TScenario, TArtifact>;
1082
+
1083
+ /**
1084
+ * The general agentic primitive — sequential (depth) and parallel (breadth) over a shared,
1085
+ * checkable artifact, driven through the keystone Supervisor as one recursive `Agent.act`.
1086
+ *
1087
+ * The domain lives behind ONE seam — `AgenticSurface` (open an artifact, list tools, call a tool,
1088
+ * score the artifact, close it). EnterpriseOps implements it (seed a gym DB, MCP tools, SQL
1089
+ * verifier); Commit0/AppWorld/terminal-bench implement it the same way (a repo workspace, shell
1090
+ * tools, the test suite). The drivers below are domain-blind: they run over any surface.
1091
+ *
1092
+ * Two shapes, the agent's POMDP rollout as the unit:
1093
+ * - DEPTH one persistent artifact carried across shots. Each shot the agent works the tool loop;
1094
+ * between shots a trace-analyst (selector≠judge: reads the trajectory, never the score)
1095
+ * steers the resumed session toward what's unfinished. shot n stands on shot n-1's
1096
+ * artifact state + history. This is continuation — long-horizon, same artifact.
1097
+ * - BREADTH K independent artifacts, each a fresh rollout, the deployable verifier picks the best.
1098
+ *
1099
+ * Both are an `Agent` whose `act` spawns leaf shots through `scope.spawn` and reacts via
1100
+ * `scope.next()` — so the conserved budget pool meters them (equal-k by construction), the journal
1101
+ * records the tree, and the same primitive nests. `runAgentic` runs the chosen driver through
1102
+ * `createSupervisor().run`. The leaf (one shot over a handle) is resolved per-spawn from a
1103
+ * surface-closed registry — the open `Executor` seam, not bespoke per-benchmark glue.
1104
+ */
1105
+
1106
+ interface AgenticTask {
1107
+ readonly id: string;
1108
+ readonly systemPrompt: string;
1109
+ readonly userPrompt: string;
1110
+ /** Opaque domain payload the surface reads (EOPS: servers/verifiers/tools). Drivers never read it. */
1111
+ readonly meta?: Record<string, unknown>;
1112
+ }
1113
+ interface ArtifactHandle {
1114
+ readonly id: string;
1115
+ readonly surface: string;
1116
+ /** Opaque per-artifact context the surface stashes (EOPS: the seeded gym server + db id). */
1117
+ readonly ctx?: unknown;
1118
+ }
1119
+ interface AgenticTool {
1120
+ readonly type: 'function';
1121
+ readonly function: {
1122
+ name: string;
1123
+ description?: string;
1124
+ parameters: Record<string, unknown>;
1125
+ };
1126
+ }
1127
+ interface SurfaceScore {
1128
+ passes: number;
1129
+ total: number;
1130
+ /** Checks excluded as malformed (data defect, not the agent). `total === 0` ⇒ unscoreable. */
1131
+ errored: number;
1132
+ }
1133
+ /** A stateful, checkable environment an agent operates over with tools. Open behind one interface. */
1134
+ interface AgenticSurface {
1135
+ readonly name: string;
1136
+ open(task: AgenticTask): Promise<ArtifactHandle>;
1137
+ tools(task: AgenticTask, handle: ArtifactHandle): Promise<AgenticTool[]>;
1138
+ call(handle: ArtifactHandle, name: string, args: Record<string, unknown>): Promise<string>;
1139
+ score(task: AgenticTask, handle: ArtifactHandle): Promise<SurfaceScore>;
1140
+ close(handle: ArtifactHandle): Promise<void>;
1141
+ }
1142
+ interface AgenticOptions {
1143
+ routerBaseUrl: string;
1144
+ routerKey: string;
1145
+ model: string;
1146
+ temperature?: number;
1147
+ /** Turns the agent may take within ONE shot before the driver intervenes. */
1148
+ innerTurns?: number;
1149
+ /** The depth STEERER's analyst instruction (observe()'s system prompt). The knob a
1150
+ * prompt optimizer (GEPA) tunes — the analyst IS the steerer. Omitted ⇒ the default. */
1151
+ analystInstruction?: string;
1152
+ /** The critic's model — lets the analyst be a stronger (or cheaper) model than the
1153
+ * worker. Omitted ⇒ the worker's `model`. */
1154
+ analystModel?: string;
1155
+ /** Across-run learning: when set, the analyst's observe() pass appends trace-derived
1156
+ * facts here (the flywheel write side). Priming (the read side) is the caller's move —
1157
+ * query the corpus and fold facts into the task's systemPrompt before runAgentic. */
1158
+ corpus?: Corpus;
1159
+ /** Tags written onto learned facts (and used by the caller's priming query). */
1160
+ corpusTags?: string[];
1161
+ }
1162
+ type Msg = Record<string, unknown>;
1163
+ interface ShotResult {
1164
+ messages: Msg[];
1165
+ score: number;
1166
+ passes: number;
1167
+ total: number;
1168
+ completions: number;
1169
+ toolErrors: number;
1170
+ }
1171
+ interface AgenticRunResult {
1172
+ /** The strategy name (built-in 'depth'/'breadth' or a custom strategy's name). */
1173
+ mode: string;
1174
+ score: number;
1175
+ resolved: boolean;
1176
+ completions: number;
1177
+ /** DEPTH: score after each shot — the progress-over-rounds curve. BREADTH: best-so-far per rollout. */
1178
+ progression: number[];
1179
+ shots: number;
1180
+ /** The cost vector, stamped by `runAgentic` from the Supervisor's conserved pool: real
1181
+ * router tokens, priced usd (0 when the model is unpriced — never fabricated), wall ms. */
1182
+ usd: number;
1183
+ ms: number;
1184
+ tokens: {
1185
+ input: number;
1186
+ output: number;
1187
+ };
1188
+ }
1189
+ /** DEPTH: one persistent artifact, carried across analyst-steered shots. */
1190
+ declare function depthDriver(surface: AgenticSurface, task: AgenticTask, opts: AgenticOptions, cfg: {
1191
+ maxShots: number;
1192
+ }): Agent<unknown, Outcome<unknown>>;
1193
+ /** BREADTH: K independent rollouts (each own artifact), verifier picks the best. */
1194
+ declare function breadthDriver(_surface: AgenticSurface, task: AgenticTask, opts: AgenticOptions, cfg: {
1195
+ width: number;
1196
+ }): Agent<unknown, Outcome<unknown>>;
1197
+ /**
1198
+ * A Strategy is HOW you spend the compute budget to beat the Environment's check — it
1199
+ * builds the driver `Agent` the Supervisor runs. This is the OPEN extension point: a dev
1200
+ * authors their own by implementing `driver()` to return an Agent whose `act()` spawns
1201
+ * shots/analysts via `scope.spawn` / `scope.next` / `scope.send`. The two built-ins are
1202
+ * the reference implementations to copy:
1203
+ * sample — K INDEPENDENT attempts, keep the best-verifying (best-of-N / resample).
1204
+ * refine — attempt → observe() reads the trace → steer the next → repeat (iterate).
1205
+ * (A multi-agent "team" is just a Strategy whose driver spawns several different agents.)
1206
+ */
1207
+ interface Strategy {
1208
+ readonly name: string;
1209
+ driver(surface: AgenticSurface, task: AgenticTask, opts: AgenticOptions, budget: number): Agent<unknown, Outcome<unknown>>;
1210
+ }
1211
+ declare const sample: Strategy;
1212
+ declare const refine: Strategy;
1213
+ /** A role for one shot — multi-agent loops (researcher + engineer, a panel of k
1214
+ * researchers) give each shot its own system prompt and optionally its own model. */
1215
+ interface ShotPersona {
1216
+ /** Replaces the task's systemPrompt for a FRESH shot; on a carried conversation it is
1217
+ * injected as a hand-off message (the transcript's earlier roles stay intact). */
1218
+ systemPrompt?: string;
1219
+ /** Per-shot model override (e.g. a stronger model for the engineer shot). */
1220
+ model?: string;
1221
+ }
1222
+ interface ShotSpec {
1223
+ /** present ⇒ continue this artifact (depth); absent ⇒ the shot opens a fresh one (sample/restart). */
1224
+ handle?: ArtifactHandle;
1225
+ messages?: Msg[];
1226
+ steer?: string;
1227
+ persona?: ShotPersona;
1228
+ }
1229
+ interface StrategyResult {
1230
+ score: number;
1231
+ resolved: boolean;
1232
+ completions: number;
1233
+ progression: number[];
1234
+ shots: number;
1235
+ }
1236
+ /** Artifact lifecycle a strategy may manage itself — open/close ONLY. Raw `call`/`score`
1237
+ * are withheld: scores reach the body solely through `shot()`'s ShotResult (the
1238
+ * harness-verified channel), so a body cannot peek the check or fabricate around it. */
1239
+ interface StrategyArtifacts {
1240
+ readonly name: string;
1241
+ open(task: AgenticTask): Promise<ArtifactHandle>;
1242
+ close(handle: ArtifactHandle): Promise<void>;
1243
+ }
1244
+ /** What a strategy body composes with: the artifact lifecycle, the budget, and the two steps. */
1245
+ interface StrategyCtx {
1246
+ /** Open/close artifacts the body manages itself (e.g. one persistent handle for depth). */
1247
+ readonly surface: StrategyArtifacts;
1248
+ readonly task: AgenticTask;
1249
+ readonly opts: AgenticOptions;
1250
+ readonly budget: number;
1251
+ readonly scope: Scope<Outcome<unknown>>;
1252
+ /** Run ONE worker shot; resolves to its harness-scored result, or null if the shot went down. */
1253
+ shot(spec?: ShotSpec): Promise<ShotResult | null>;
1254
+ /** The firewalled critic reads the trajectory and resolves to a steer string, or null when it judges the work COMPLETE or the critic went down. */
1255
+ critique(messages: Msg[]): Promise<string | null>;
1256
+ }
1257
+ /** Author a Strategy from the composable steps — the open, compact way. */
1258
+ declare function defineStrategy(name: string, run: (ctx: StrategyCtx) => Promise<StrategyResult>): Strategy;
1259
+ /** A NEW strategy, authored from the steps (~20 lines): refine, but when a steered shot
1260
+ * fails to improve the score it ABANDONS that line and restarts fresh (branch-when-stuck)
1261
+ * — the widen/MCTS idea motivated by the depth-stuck failure. Scored keep-best (the best
1262
+ * checkpoint across all lines), the deployable metric. This is the "experts build BETTER
1263
+ * optimizations" path: a new technique, compact, with zero Supervisor ceremony. */
1264
+ declare const adaptiveRefine: Strategy;
1265
+ /** The explore-then-exploit MIX: spend ⌈budget/2⌉ on independent samples (kept open),
1266
+ * then refine the best-verifying line with the remaining budget. Sample's basin escape +
1267
+ * refine's accumulation — the third built-in, authored from the public steps. */
1268
+ declare const sampleThenRefine: Strategy;
1269
+ interface RunAgenticOptions extends AgenticOptions {
1270
+ surface: AgenticSurface;
1271
+ task: AgenticTask;
1272
+ /** Lifecycle observability — every spawn/settle (shots, analysts) streams here live.
1273
+ * The seam online watchdogs/route-auditors subscribe to. */
1274
+ hooks?: RuntimeHooks;
1275
+ /** A Strategy (the open way) — author/pass your own. Overrides `mode` when present. */
1276
+ strategy?: Strategy;
1277
+ /** Built-in shorthand: 'depth'→refine, 'breadth'→sample. Default 'depth'. */
1278
+ mode?: 'depth' | 'breadth';
1279
+ /** budget: refine→max shots; sample→rollout width. */
1280
+ budget: number;
1281
+ rootBudget?: Budget;
1282
+ }
1283
+ /** Run a Strategy through the keystone Supervisor — `Agent.act` over a conserved-budget Scope. */
1284
+ declare function runAgentic(opts: RunAgenticOptions): Promise<AgenticRunResult>;
1285
+
1286
+ /**
1287
+ * runBenchmark — the packaged optimization suite. Define a domain by implementing an
1288
+ * `Environment` (open / tools / call / score / close); get the optimization strategies
1289
+ * compared, scored by your own deployable check, with a paired-bootstrap report — free.
1290
+ *
1291
+ * The mental model: you have a TASK + a deployable CHECK + a compute BUDGET. A strategy
1292
+ * is how you spend the budget to beat the check. Two built-ins:
1293
+ *
1294
+ * sample — N independent attempts, keep the best-verifying one. (best-of-N / resample)
1295
+ * refine — attempt → a critic reads the trace → steer the next → repeat. (iterate-with-feedback)
1296
+ *
1297
+ * Both run at equal budget through the Supervisor's conserved pool; the headline is the
1298
+ * paired lift of refine over sample. Author your own strategy with `defineStrategy`.
1299
+ */
1300
+
1301
+ /** A checkable task domain — implement these 5 hooks and the suite does the rest. The
1302
+ * same seam as `AgenticSurface`; `Environment` is the RL/gym-standard name for it. */
1303
+ type Environment = AgenticSurface;
1304
+ interface BenchmarkConfig {
1305
+ /** The task domain (5 hooks). */
1306
+ environment: Environment;
1307
+ /** The tasks to score across. */
1308
+ tasks: AgenticTask[];
1309
+ /** The worker: model + router + (optional) the critic's instruction (the steerer knob). */
1310
+ worker: AgenticOptions;
1311
+ /** Which strategies to compare. Pass the built-ins (`refine`, `sample`) or your own.
1312
+ * Default: [sample, refine]. */
1313
+ strategies?: Strategy[];
1314
+ /** Shots (refine) / width (sample) — the equal compute budget per strategy. Default 3. */
1315
+ budget?: number;
1316
+ /** Tasks scored in parallel. Default 3. */
1317
+ concurrency?: number;
1318
+ /** Progress hook — fires as each task settles (the live-monitoring seam: append to a
1319
+ * progress file, render a tree, stream to a dashboard). `done` counts settled tasks. */
1320
+ onTask?: (row: BenchmarkTaskRow, done: number, total: number) => void;
1321
+ /** Lifecycle observability — every spawn/settle of every cell's shots/analysts streams
1322
+ * here live (the watchdog/route-auditor seam, passed through to `runAgentic`). */
1323
+ hooks?: RuntimeHooks;
1324
+ }
1325
+ interface BenchmarkLift {
1326
+ /** Mean of paired deltas (refine − sample). */
1327
+ mean: number;
1328
+ low: number;
1329
+ high: number;
1330
+ n: number;
1331
+ }
1332
+ /** One strategy's outcome on one task — the per-task cell an optimizer consumes. */
1333
+ interface BenchmarkCell {
1334
+ score: number;
1335
+ resolved: boolean;
1336
+ /** The progress curve (refine: score per shot; sample: best-so-far per rollout). */
1337
+ progression: number[];
1338
+ usd: number;
1339
+ ms: number;
1340
+ tokens: {
1341
+ input: number;
1342
+ output: number;
1343
+ };
1344
+ }
1345
+ interface BenchmarkTaskRow {
1346
+ taskId: string;
1347
+ /** Per-strategy cells; absent when the task errored before completing all strategies. */
1348
+ cells?: Record<string, BenchmarkCell>;
1349
+ /** Why the task was excluded (infra/setup failure) — never silently dropped. */
1350
+ error?: string;
1351
+ }
1352
+ interface BenchmarkStrategySummary {
1353
+ /** Mean verifier score (0..1). */
1354
+ score: number;
1355
+ /** Fraction of tasks fully resolved. */
1356
+ resolved: number;
1357
+ /** Mean cost vector per task. */
1358
+ usd: number;
1359
+ ms: number;
1360
+ }
1361
+ interface BenchmarkReport {
1362
+ n: number;
1363
+ excluded: number;
1364
+ /** Per-strategy means (keyed by strategy.name). */
1365
+ perStrategy: Record<string, BenchmarkStrategySummary>;
1366
+ /** The full per-task × per-strategy table — the LOSSES an optimizer (GEPA, a
1367
+ * strategy-author, an operator) consumes. Includes errored tasks with the reason. */
1368
+ perTask: BenchmarkTaskRow[];
1369
+ /** The non-dominated strategies on (score ↑, $/task ↓) — collapse-last, per the canon:
1370
+ * a strategy that ties on score at half the cost WINS and a scalar would hide it. */
1371
+ pareto: string[];
1372
+ /** The headline when both `refine` and `sample` ran: paired-bootstrap lift of refine over sample. */
1373
+ refineVsSample?: BenchmarkLift;
1374
+ }
1375
+ /** Run the requested strategies over the tasks, scored by the Environment's own check.
1376
+ * Resilient: a task whose rollouts fail (transient infra) is excluded from the stats but
1377
+ * reported in `perTask` with the error — never silently dropped. */
1378
+ declare function runBenchmark(cfg: BenchmarkConfig): Promise<BenchmarkReport>;
1379
+ /** Pretty-print a report — the "free optimization" verdict, with the cost vector. */
1380
+ declare function printBenchmarkReport(report: BenchmarkReport): void;
1381
+
1382
+ /**
1383
+ * createMcpEnvironment — wrap any MCP server as an `Environment` (the product-adoption
1384
+ * primitive: a product's agent tools are usually already an MCP surface, so the domain
1385
+ * only writes the lifecycle hooks — open a scoped artifact, score it with a deployable
1386
+ * check, close it — and the tool plumbing is derived from the server).
1387
+ *
1388
+ * What the helper owns (the generic 80%, hardened on the EnterpriseOps gym):
1389
+ * - JSON-RPC `tools/list` → `AgenticTool[]`, with schemas coerced to the
1390
+ * OpenAI-tool-valid shape (top-level oneOf/anyOf/allOf/enum/not are rejected by
1391
+ * tool-calling providers; nested combinators are fine).
1392
+ * - JSON-RPC `tools/call` → the tool's text content (errors surfaced as `ERROR: …`
1393
+ * strings — a bad call is the agent's outcome, not an infra fault).
1394
+ * - SSE response parsing (streamable-HTTP MCP servers answer with `data:` lines).
1395
+ * - Bounded retry with backoff on thrown fetches (transient network ≠ task failure).
1396
+ *
1397
+ * What the domain supplies: `open` (create/seed the per-task artifact and return its
1398
+ * MCP endpoint — url + headers carry the per-artifact scoping, e.g. a database id
1399
+ * header), `score` (the deployable check), and optional `close`/`selectTools`.
1400
+ */
1401
+
1402
+ /** Where a handle's MCP server lives; headers carry per-artifact scoping. */
1403
+ interface McpEndpoint {
1404
+ url: string;
1405
+ headers?: Record<string, string>;
1406
+ }
1407
+ interface McpEnvironmentOptions {
1408
+ name: string;
1409
+ /** Create/seed the per-task artifact; return its handle + the MCP endpoint scoped to it. */
1410
+ open(task: AgenticTask): Promise<{
1411
+ handle: ArtifactHandle;
1412
+ endpoint: McpEndpoint;
1413
+ }>;
1414
+ /** The deployable check over the artifact's current state. */
1415
+ score(task: AgenticTask, handle: ArtifactHandle): Promise<SurfaceScore>;
1416
+ /** Teardown (delete the seeded artifact). Optional — omit for stateless servers. */
1417
+ close?(handle: ArtifactHandle): Promise<void>;
1418
+ /** Restrict/order the server's tools per task (e.g. the task's selected_tools). Default: all. */
1419
+ selectTools?(task: AgenticTask, all: AgenticTool[]): AgenticTool[];
1420
+ /** Cap on a tool result's text fed back to the worker. Default 1500 chars. */
1421
+ maxResultChars?: number;
1422
+ }
1423
+ declare function createMcpEnvironment(opts: McpEnvironmentOptions): Environment;
1424
+
902
1425
  /**
903
1426
  * @experimental
904
1427
  *
@@ -1120,7 +1643,7 @@ declare class FileCorpus implements Corpus {
1120
1643
  * An empty query result returns a fresh COPY of the profile with no instruction change (a valid
1121
1644
  * "nothing learned yet" read, not an error).
1122
1645
  */
1123
- declare function renderCorpusToInstructions(opts: RenderCorpusToInstructionsOptions): Promise<AgentProfile$1>;
1646
+ declare function renderCorpusToInstructions(opts: RenderCorpusToInstructionsOptions): Promise<AgentProfile>;
1124
1647
 
1125
1648
  /**
1126
1649
  * @experimental
@@ -1227,6 +1750,39 @@ declare function trajectoryReport(journal: SpawnJournal, blobs: ResultBlobStore,
1227
1750
  */
1228
1751
  declare function equalKOnCost(arms: ReadonlyArray<EqualKArm>, options?: EqualKOnCostOptions): EqualKVerdict;
1229
1752
 
1753
+ interface PromotionGateOptions {
1754
+ /** The HOLDOUT report — must carry per-task cells for both strategy names. */
1755
+ report: BenchmarkReport;
1756
+ /** The incumbent champion's strategy name. */
1757
+ incumbent: string;
1758
+ /** The challenger's strategy name. */
1759
+ candidate: string;
1760
+ /** The CI lower bound on the paired lift must EXCEED this (score scale). Default 0. */
1761
+ deltaThreshold?: number;
1762
+ /** Minimum paired tasks before significance can be claimed. Default 6 — below that
1763
+ * the bootstrap CI is too wide to separate a real lift from the per-task noise. */
1764
+ minPairedTasks?: number;
1765
+ /** Bootstrap statistic over the paired deltas. Default 'mean'. */
1766
+ statistic?: 'mean' | 'median';
1767
+ /** Bootstrap seed. Fixed by the substrate by default — the same report always yields the same verdict. */
1768
+ seed?: number;
1769
+ resamples?: number;
1770
+ }
1771
+ interface PromotionVerdict {
1772
+ promoted: boolean;
1773
+ reason: 'identical-champion' | 'few-tasks' | 'no-margin' | 'significant';
1774
+ /** Paired tasks that carried both strategies' cells. */
1775
+ n: number;
1776
+ /** Paired (candidate − incumbent) lift across the holdout tasks. */
1777
+ lift: {
1778
+ mean: number;
1779
+ median: number;
1780
+ low: number;
1781
+ high: number;
1782
+ };
1783
+ }
1784
+ declare function promotionGate(opts: PromotionGateOptions): PromotionVerdict;
1785
+
1230
1786
  /**
1231
1787
  * Bridge a finished `runLoop` into an agent-eval campaign / profile-matrix
1232
1788
  * dispatch.
@@ -1656,20 +2212,204 @@ interface OpenSandboxRunOptions {
1656
2212
  */
1657
2213
  declare function openSandboxRun<Out>(client: SandboxClient, options: OpenSandboxRunOptions, deliverable: Deliverable<Out>): Promise<SandboxRun<Out>>;
1658
2214
 
2215
+ /**
2216
+ * authorStrategy — the agent-authored layer as a package primitive (software-3.0): an
2217
+ * LLM reads a benchmark's per-task LOSSES + the defineStrategy contract and writes a NEW
2218
+ * optimization strategy as code; the caller gates it like any human-built candidate
2219
+ * (runBenchmark + a frozen holdout).
2220
+ *
2221
+ * Structurally safe by construction: the authored body composes shot()/critique() and
2222
+ * spends through the Supervisor's conserved pool — it can be wrong, but it cannot
2223
+ * Goodhart the check (it never sees the verifiers) and it cannot win by overspending.
2224
+ *
2225
+ * The authored module is written to `outDir` and dynamically imported — run under a
2226
+ * TS-capable loader (tsx) since models often emit type annotations.
2227
+ */
2228
+
2229
+ /** The compressed consumable a skill carries: everything an author needs to emit a loop. */
2230
+ declare const strategyAuthorContract = "\nYou author an OPTIMIZATION STRATEGY for an agentic loop system. A strategy decides how to\nspend a compute budget to beat a task's deployable check. You compose exactly two steps:\n\n shot(spec?: { handle?, messages?, steer?, persona? }): Promise<ShotResult | null>\n Runs ONE worker attempt (a bounded tool loop) over an artifact.\n - omit handle => the shot opens its OWN fresh artifact and closes it after (a sample).\n - pass handle => the shot CONTINUES that artifact (state accumulates across shots).\n - messages => the carried conversation (pass the previous ShotResult.messages to continue).\n - steer => a corrective instruction injected before the shot.\n - persona => { systemPrompt?, model? } \u2014 give THIS shot its own role and/or model\n (multi-agent strategies: a researcher shot then an engineer shot, a panel of k\n personas over one budget). On a fresh shot the systemPrompt replaces the task's; on\n a carried conversation it arrives as a hand-off message. Same conserved budget.\n ShotResult = { messages, score (0..1 on the task's check), passes, total, completions, toolErrors }\n Returns null if the attempt failed infra-wise.\n\n critique(messages): Promise<string | null>\n A firewalled trace-analyst reads the attempt's trajectory and returns ONE corrective\n instruction (or null when it judges the work complete). Costs ~1 completion.\n\n surface.open(task) / surface.close(handle)\n Open a persistent artifact you manage yourself (remember to close in a finally).\n\nRules:\n- Stay within ~budget total shots; every shot/critique spends from a conserved pool.\n- For a FRESH attempt OMIT `messages` entirely (never pass `[]` \u2014 an empty array is a\n fresh conversation too, but be explicit). 
To CONTINUE, pass the previous\n ShotResult.messages unchanged.\n- Return { score, resolved, completions, progression, shots } \u2014 score = the BEST checkpoint\n you reached (keep-best, never final-state), progression = score after each shot.\n- The module must be EXACTLY this shape (no other imports, no commentary outside code):\n\nimport { defineStrategy } from '@tangle-network/agent-runtime/loops'\nexport default defineStrategy('your-strategy-name', async ({ surface, task, budget, shot, critique }) => {\n // your composition\n})\n";
2231
+ interface AuthorStrategyOptions {
2232
+ /** The model-call seam (agent-eval `createChatClient`). */
2233
+ chat: ChatClient;
2234
+ model?: string;
2235
+ /** A NAMED fallback author tried once when the primary call fails or returns no code
2236
+ * block (thinking models time out at the edge on long authoring prompts, or return
2237
+ * empty content without `maxTokens`). Opt-in — absent means the primary's failure
2238
+ * propagates. */
2239
+ fallbackModel?: string;
2240
+ /** The contract text shown to the author. Default `strategyAuthorContract`. The
2241
+ * meta-optimization coordinate: a GEPA/skill loop can evolve this text and gate each
2242
+ * variant on the same frozen holdout as any strategy. */
2243
+ contract?: string;
2244
+ /** The environment the losses came from (orientation only — never the verifiers). */
2245
+ environmentName: string;
2246
+ /** The per-task losses table (e.g. JSON.stringify(report.perTask)) — the gradient. */
2247
+ lossesJson: string;
2248
+ /** The budget the strategy must respect (shots/width). */
2249
+ budget: number;
2250
+ /** Where the authored module file is written (created if missing). */
2251
+ outDir: string;
2252
+ temperature?: number;
2253
+ /** Completion cap — required by thinking-model authors that stream reasoning first. */
2254
+ maxTokens?: number;
2255
+ signal?: AbortSignal;
2256
+ }
2257
+ /** Static CONTRACT lint over an authored strategy module — the module-boundary
2258
+ * enforcement of the harness's two measurement invariants:
2259
+ * - author blindness: the only import allowed is the loops surface. A body that could
2260
+ * reach the filesystem, network, or process could read or mutate verifier/artifact
2261
+ * state outside the brokered shots, and the harness-verified score would stop
2262
+ * meaning "what the shots achieved".
2263
+ * - conserved dose: no out-of-band compute (fetch/require/eval) — every unit a
2264
+ * strategy spends is metered by the Supervisor's pool, which is what makes
2265
+ * equal-budget comparisons between strategies valid.
2266
+ * A lint, not a sandbox: its job is keeping the benchmark numbers interpretable. */
2267
+ declare function assertStrategyContract(code: string): void;
2268
+ interface AuthoredStrategy {
2269
+ strategy: Strategy;
2270
+ file: string;
2271
+ code: string;
2272
+ }
2273
+ /** Author + load a strategy from losses. Throws when the author emits no loadable module;
2274
+ * with `fallbackModel` set, the named fallback gets one attempt before the failure is thrown. */
2275
+ declare function authorStrategy(opts: AuthorStrategyOptions): Promise<AuthoredStrategy>;
2276
+
2277
+ /**
2278
+ * runStrategyEvolution — the multi-generation strategy search: per generation the system
2279
+ * authors a POPULATION of candidate strategies from the current tournament's losses,
2280
+ * plays them against the incumbent at equal budget, and advances a champion; one final
2281
+ * promotion decision runs on a NEVER-BEFORE-USED holdout slice through `promotionGate`.
2282
+ *
2283
+ * Measurement invariants (the reasons this design is shaped the way it is):
2284
+ * - The author sees TRAIN losses only. The holdout slice is drawn fresh (disjoint task
2285
+ * offsets) after all authoring is done — one promotion decision, one untouched slice,
2286
+ * so adaptive reuse of evaluation data never enters the verdict.
2287
+ * - Every tournament runs at the same per-strategy budget through the conserved pool;
2288
+ * candidates cannot win by overspending.
2289
+ * - Champion selection within the search is a SEARCH policy (configurable, default
2290
+ * cost-aware: ties on score go to the cheapest strategy — a scalar hides a strategy
2291
+ * that ties at half the cost). The promotion verdict never comes from search
2292
+ * selection; it comes from the gate on the fresh slice.
2293
+ * - Every authored artifact's description length (gzip bits) is recorded, so the
2294
+ * artifact-complexity-vs-holdout-gap relation is analyzable from any run's report.
2295
+ *
2296
+ * Lineage fields (`parent`, `generation`) are recorded on every archive node so a
2297
+ * descendant-productivity parent-selection policy can be added without changing the
2298
+ * report schema; the v1 search authors from the latest tournament's losses.
2299
+ */
2300
+
2301
+ interface EvolutionAuthor {
2302
+ /** The model-call seam (agent-eval `createChatClient`). */
2303
+ chat: ChatClient;
2304
+ model?: string;
2305
+ fallbackModel?: string;
2306
+ temperature?: number;
2307
+ maxTokens?: number;
2308
+ }
2309
+ type ChampionPolicy = 'score' | 'costAware';
2310
+ interface StrategyEvolutionConfig {
2311
+ environment: Environment;
2312
+ /** Task supply by DISJOINT slice: `(offset, n)` must return n tasks unique to that
2313
+ * offset range. Train draws [0, trainN); the holdout draws [trainN + holdoutOffset,
2314
+ * …) — tasks the search never touched. */
2315
+ tasks: (offset: number, n: number) => Promise<AgenticTask[]>;
2316
+ trainN: number;
2317
+ holdoutN: number;
2318
+ /** Extra offset past the train slice for the holdout draw (rotate across runs). */
2319
+ holdoutOffset?: number;
2320
+ worker: AgenticOptions;
2321
+ author: EvolutionAuthor;
2322
+ /** Rollouts (sample) / shots (refine) per strategy per task. Default 3. */
2323
+ budget?: number;
2324
+ concurrency?: number;
2325
+ /** Author→tournament rounds after gen0. Default 2. */
2326
+ generations?: number;
2327
+ /** Authored candidates per generation. Default 2. */
2328
+ populationSize?: number;
2329
+ /** The gen0 field. Default [sample, refine, sampleThenRefine]. */
2330
+ baselines?: Strategy[];
2331
+ /** Search-side champion selection. Default 'costAware'. */
2332
+ champion?: ChampionPolicy;
2333
+ /** Score band treated as a tie under 'costAware'. Default 0.01. */
2334
+ championEpsilon?: number;
2335
+ /** Where authored modules are written. */
2336
+ outDir: string;
2337
+ /** Promotion-gate evidence floor (paired holdout tasks). */
2338
+ minPairedTasks?: number;
2339
+ onTask?: (phase: string, row: BenchmarkTaskRow, done: number, total: number) => void;
2340
+ hooks?: RuntimeHooks;
2341
+ }
2342
+ interface ChampionPick {
2343
+ name: string;
2344
+ score: number;
2345
+ usd: number;
2346
+ }
2347
+ interface EvolutionCandidate {
2348
+ name: string;
2349
+ file?: string;
2350
+ gzipBits?: number;
2351
+ codeChars?: number;
2352
+ /** Present when this author attempt failed (recorded, never silent). */
2353
+ error?: string;
2354
+ }
2355
+ interface EvolutionGeneration {
2356
+ generation: number;
2357
+ candidates: EvolutionCandidate[];
2358
+ report: BenchmarkReport;
2359
+ champion: ChampionPick;
2360
+ }
2361
+ interface EvolutionArchiveNode {
2362
+ name: string;
2363
+ source: 'baseline' | 'authored';
2364
+ generation: number;
2365
+ /** The champion whose tournament losses this candidate was authored from. */
2366
+ parent?: string;
2367
+ gzipBits?: number;
2368
+ file?: string;
2369
+ /** Latest measured tournament result — 0 until the node's first tournament settles
2370
+ * (an authored node is created before its generation's benchmark runs). */
2371
+ score: number;
2372
+ usd: number;
2373
+ }
2374
+ interface EvolutionReport {
2375
+ gen0: BenchmarkReport;
2376
+ gen0Champion: ChampionPick;
2377
+ generations: EvolutionGeneration[];
2378
+ archive: EvolutionArchiveNode[];
2379
+ finalChampion: ChampionPick;
2380
+ holdout: BenchmarkReport;
2381
+ verdict: PromotionVerdict;
2382
+ /** SEARCH TELEMETRY, not evidence: each entry is that generation's own train-slice
2383
+ * re-measurement, so cross-generation deltas mix true drift with run-to-run variance
2384
+ * (entries are unpaired across generations). The only evidence-grade comparison in
2385
+ * this report is `verdict` — both finalists measured fresh, paired, on the holdout. */
2386
+ trajectory: Array<{
2387
+ generation: number;
2388
+ champion: string;
2389
+ score: number;
2390
+ usd: number;
2391
+ }>;
2392
+ }
2393
+ /** Search-side champion selection over a tournament report. 'score' takes the best mean
2394
+ * score (ties → field order). 'costAware' treats scores within `epsilon` of the best as
2395
+ * tied and takes the cheapest — the (score, $) Pareto rule collapsed to one pick. */
2396
+ declare function selectChampion(report: BenchmarkReport, fieldOrder: string[], policy: ChampionPolicy, epsilon: number): ChampionPick;
2397
+ declare function runStrategyEvolution(cfg: StrategyEvolutionConfig): Promise<EvolutionReport>;
2398
+
1659
2399
  /**
1660
2400
  * @experimental
1661
2401
  *
1662
2402
  * The conserved budget reservation pool — the invariant the whole instrument
1663
2403
  * rests on (critique M5/B3). One root `Budget` becomes a conserved pool of three
1664
- * quantities (tokens, usd, iterations) plus an absolute deadline. Children RESERVE
1665
- * atomically at spawn and RECONCILE at settle:
2404
+ * quantities (tokens, usd, iterations) plus an absolute deadline. Children reserve
2405
+ * atomically at spawn and reconcile at settle:
1666
2406
  *
1667
2407
  * total ≡ free + reserved + committed (invariant, always)
1668
2408
  *
1669
- * `reserve` moves a child's whole ceiling from `free` → `reserved` and FAILS CLOSED
2409
+ * `reserve` moves a child's whole ceiling from `free` → `reserved` and fails closed
1670
2410
  * when `free` can't cover it (never read-then-spawn overcommit, so `Σk(treatment) ≡
1671
2411
  * Σk(blind)` by construction). `reconcile` releases the reservation, commits ACTUAL
1672
- * spend, and refunds the unspent remainder to `free`. Tokens and usd are SEPARATE
2412
+ * spend, and refunds the unspent remainder to `free`. Tokens and usd are separate
1673
2413
  * channels (`LoopTokenUsage` has no `usd`); iterations are conserved alongside them.
1674
2414
  *
1675
2415
  * Pure and deterministic: `now()` is injected, there is no I/O, and no wall-clock or
@@ -1814,6 +2554,34 @@ interface BridgeSeam {
1814
2554
  agentProfile?: Record<string, unknown>;
1815
2555
  timeoutMs?: number;
1816
2556
  }
2557
+ /** An OpenAI-shape function tool the model may call. */
2558
+ interface ToolSpec {
2559
+ type: 'function';
2560
+ function: {
2561
+ name: string;
2562
+ description?: string;
2563
+ parameters: unknown;
2564
+ };
2565
+ }
2566
+ /**
2567
+ * Router seam WITH tool use — the tool-using router backend. Same direct
2568
+ * OpenAI-compatible endpoint as `RouterSeam`, but each turn passes `tools`; when
2569
+ * the model emits tool_calls they run via `executeToolCall` ON THIS HOST and the
2570
+ * results fold back as `tool` messages, repeating until the model answers without
2571
+ * a tool call or `maxTurns` is hit. A real agentic loop, OFF-BOX — no sandbox, so it
2572
+ * is unaffected by a box's egress allowlist. One turn = one completion = the
2573
+ * equal-compute unit. `executeToolCall` receives the task so per-task tool
2574
+ * surfaces (e.g. a gym keyed by task) can dispatch correctly.
2575
+ */
2576
+ interface RouterToolsSeam {
2577
+ routerBaseUrl: string;
2578
+ routerKey: string;
2579
+ model?: string;
2580
+ tools: ReadonlyArray<ToolSpec>;
2581
+ executeToolCall: (name: string, args: Record<string, unknown>, task: unknown) => Promise<string>;
2582
+ /** Max inference turns (default 4). */
2583
+ maxTurns?: number;
2584
+ }
1817
2585
  /**
1818
2586
  * The single built-in executor entrypoint. The backend is DATA — the cost dial a
1819
2587
  * profile, an experiment config, or a replay journal can name — not an import
@@ -1824,6 +2592,8 @@ interface BridgeSeam {
1824
2592
  type ExecutorConfig = ({
1825
2593
  backend: 'router';
1826
2594
  } & RouterSeam) | ({
2595
+ backend: 'router-tools';
2596
+ } & RouterToolsSeam) | ({
1827
2597
  backend: 'bridge';
1828
2598
  } & BridgeSeam) | ({
1829
2599
  backend: 'cli';
@@ -1961,4 +2731,78 @@ declare function createSupervisor<Task, Out>(): Supervisor<Task, Out>;
1961
2731
  */
1962
2732
  declare function createRootHandle<Out>(): RootHandle<Out>;
1963
2733
 
1964
- export { Agent, AgentRunSpec, AgentSpec, type AssertTraceDerivedFindings, type BridgeSeam, Budget, type BudgetPool, type BudgetReadout, type CheckpointCapableBox, type CliSeam, type CombinatorShape, type Corpus, type CorpusFilter, type CorpusRecord, type CreateScopeAnalystOptions, type CriuCapableClient, type DefinePersona, type DefinePersonaInput, type Deliverable, type EqualKArm, type EqualKOnCost, type EqualKOnCostOptions, type EqualKVerdict, ExecCtx, type ExecutorConfig, ExecutorFactory, ExecutorRegistry, type Fanout, type FanoutOptions, type FanoutSynthesis, FileCorpus, FileResultBlobStore, FileSpawnJournal, type FlatWidenGate, type ForkCapableBox, InMemoryCorpus, InMemoryResultBlobStore, InMemorySpawnJournal, Iteration, type LoopDispatchOptions, type LoopOptionsForDispatch, LoopResult, type LoopShape, LoopTokenUsage, type LoopUntil, type LoopUntilSpec, type LoopUntilState, NodeId, type OpenSandboxRunOptions, type Outcome, type Panel, type PanelJudge, type PanelSpec, type PanelVerdict, type Persona, type PersonaContext, type PersonaExecutors, type Pipeline, type PipelineStage, type RenderCorpusToInstructions, type RenderCorpusToInstructionsOptions, type ReservationTicket, ResultBlobStore, RootHandle, type RouterSeam, RunLoopOptions, type RunPersonified, type RunPersonifiedOptions, type SandboxCapabilities, SandboxClient, type SandboxLineage, type SandboxLineageHandle, type SandboxRun, type SandboxSeam, Scope, type ScopeAnalyst, type ScopeAnalyzeInput, type ScopeWidenGate, type SessionCapableBox, Settled, type ShapeBudget, type ShapeContext, type ShapeRegistry, SpawnEvent, SpawnJournal, Spend, type SteerContext, SupervisedResult, Supervisor, type TrajectoryNode, type TrajectoryReport, type TrajectoryReportFn, type TrajectoryReportOptions, TreeView, type TurnResult, UsageEvent, type UsageSink, type Verify, type VerifySpec, type Widen, type WidenDecision, type WidenLineage, type WidenSpec, acquireSandbox, assertTraceDerivedFindings, buildSteerContext, 
builtinShapes, contentAddress, createBudgetPool, createExecutor, createExecutorRegistry, createRootHandle, createSandboxLineage, createScope, createScopeAnalyst, createShapeRegistry, createSupervisor, definePersona, equalKOnCost, extractLlmCallEvent, fanout, flatWidenGate, inlineSandboxClient, loopDispatch, loopUntil, mapSandboxEvent, materializeTreeView, openSandboxRun, panel, pipeline, probeSandboxCapabilities, registerShape, renderCorpusToInstructions, replaySpawnTree, reportLoopUsage, runPersonified, settledToIteration, spendFromUsageEvents, trajectoryReport, verify, widen };
2734
+ /**
2735
+ * createVerifierEnvironment — ANY checkable task as an `Environment`, no tool surface
2736
+ * required. The generalization piece: EOPS/commit0-style domains have tools that mutate
2737
+ * an external artifact, but math problems, legal drafts, creative briefs, GTM copy, and
2738
+ * QA tasks have a different shape — the artifact IS the worker's answer, and the domain
2739
+ * is defined by one function: the deployable check over that answer.
2740
+ *
2741
+ * const gsm8k = createVerifierEnvironment({
2742
+ * name: 'gsm8k',
2743
+ * check: (task, answer) => ({
2744
+ * passes: extractFinalNumber(answer) === task.meta?.answer ? 1 : 0,
2745
+ * total: 1,
2746
+ * errored: 0,
2747
+ * }),
2748
+ * })
2749
+ * await runBenchmark({ environment: gsm8k, tasks, worker }) // sample vs refine on math
2750
+ *
2751
+ * The worker gets one built-in tool — `submit_answer` — plus any read-only domain tools
2752
+ * the caller adds (a calculator, a retrieval call, a style guide lookup). Every
2753
+ * submission is kept; `score()` checks the BEST submission (keep-best is the measured
2754
+ * law: workers reach correct answers then revise past them). The refine strategy's
2755
+ * critic reads the submission trajectory like any other trace, so iterate-with-feedback
2756
+ * works unchanged on answer domains.
2757
+ *
2758
+ * The check can be graded (passes/total expresses partial credit — rubric points,
2759
+ * sub-answers, unit-test counts), and MUST be deployable (computable without an oracle
2760
+ * at serve time): exact/numeric match, schema validation, a compiled rubric — not a
2761
+ * peek at held-out labels the production system wouldn't have.
2762
+ */
2763
+
2764
+ interface VerifierEnvironmentOptions { // Options object accepted by createVerifierEnvironment
2765
+ name: string; // required
2766
+ /** The deployable check over a submitted answer. Graded via passes/total. */
2767
+ check(task: AgenticTask, answer: string): Promise<SurfaceScore> | SurfaceScore; // may return the score directly or as a Promise
2768
+ /** Extra domain tools (read-only helpers: calculator, retrieval, style lookup). */
2769
+ extraTools?: AgenticTool[];
2770
+ /** Executes the extra tools. Required when `extraTools` is set. */
2771
+ callExtra?(task: AgenticTask, name: string, args: Record<string, unknown>): Promise<string> | string; // NOTE(review): optional in the type, so the pairing with extraTools is not enforced at compile time; confirm it is checked at runtime
2772
+ }
2773
+ declare function createVerifierEnvironment(opts: VerifierEnvironmentOptions): Environment; // Takes one VerifierEnvironmentOptions object and returns an Environment synchronously (the return type is not a Promise)
2774
+
2775
+ /** Command runner seam. Host code can use `localShell`; sandbox code can wrap `box.exec`. */
2776
+ type Shell = (args: ReadonlyArray<string>, cwd?: string) => Promise<{ // args is a read-only string array; cwd is optional
2777
+ stdout: string;
2778
+ stderr: string;
2779
+ code: number; // presumably the exit code of the command (TODO confirm)
2780
+ }>;
2781
+ type WorkspaceCommit = { // Value that Workspace.commit resolves to: a two-arm union discriminated on ok
2782
+ readonly ok: true;
2783
+ readonly rev: string; // exists only on the ok: true arm; presumably the id of the new revision (TODO confirm)
2784
+ } | {
2785
+ readonly ok: false;
2786
+ readonly conflict: string; // exists only on the ok: false arm; presumably a description of the conflict (TODO confirm)
2787
+ };
2788
+ interface Workspace { // The type returned by both gitWorkspace and jjWorkspace
2789
+ readonly ref: string; // read-only; presumably the ref supplied in GitWorkspaceOptions (TODO confirm)
2790
+ materialize(dir: string): Promise<void>; // async, resolves with no value; presumably populates dir with the workspace contents (TODO confirm)
2791
+ commit(dir: string, message: string): Promise<WorkspaceCommit>; // async; the result reports either ok: true with a rev or ok: false with a conflict
2792
+ head(): Promise<string>; // async; presumably the current head revision (TODO confirm)
2793
+ }
2794
+ declare function localShell(): Shell; // No-argument factory that returns a Shell
2795
+ interface GitWorkspaceOptions { // Options accepted by gitWorkspace and, with the same shape, by jjWorkspace
2796
+ readonly ref: string; // the only required field
2797
+ readonly shell?: Shell; // optional; the runner used when omitted is not visible in this declaration
2798
+ readonly branch?: string; // optional; the default is not visible in this declaration
2799
+ readonly noHooks?: boolean; // optional; presumably disables VCS hooks when true (TODO confirm)
2800
+ }
2801
+ declare function gitWorkspace(opts: GitWorkspaceOptions): Workspace; // Takes GitWorkspaceOptions and returns a Workspace synchronously (the return type is not a Promise)
2802
+ /** A jj-backed `Workspace` (Jujutsu, colocated with git for the durable remote).
2803
+ * Same port, same `Shell` — a drop-in for `gitWorkspace`. jj suits agent loops:
2804
+ * no staging area, and a first-class operation log (native resume/undo). Live use
2805
+ * requires `jj` on the `Shell`'s host. */
2806
+ declare function jjWorkspace(opts: GitWorkspaceOptions): Workspace; // Reuses GitWorkspaceOptions (no jj-specific options type is declared) and returns the same Workspace type as gitWorkspace
2807
+
2808
+ export { Agent, AgentRunSpec, AgentSpec, type AgenticOptions, type AgenticRunResult, type AgenticSurface, type AgenticTask, type AgenticTool, type ArtifactHandle, type AssertTraceDerivedFindings, type AuditIntentInput, type AuditIntentOptions, type AuthorStrategyOptions, type AuthoredStrategy, type BenchmarkCell, type BenchmarkConfig, type BenchmarkLift, type BenchmarkReport, type BenchmarkStrategySummary, type BenchmarkTaskRow, type BridgeSeam, Budget, type BudgetPool, type BudgetReadout, type ChampionPick, type ChampionPolicy, type CheckpointCapableBox, type CliSeam, type CombinatorShape, type Corpus, type CorpusFilter, type CorpusRecord, type CreateScopeAnalystOptions, type CriuCapableClient, type DefinePersona, type DefinePersonaInput, type Deliverable, type Environment, type EqualKArm, type EqualKOnCost, type EqualKOnCostOptions, type EqualKVerdict, type EvolutionArchiveNode, type EvolutionAuthor, type EvolutionCandidate, type EvolutionGeneration, type EvolutionReport, ExecCtx, type ExecutorConfig, ExecutorFactory, ExecutorRegistry, type Fanout, type FanoutOptions, type FanoutSynthesis, FileCorpus, FileResultBlobStore, FileSpawnJournal, type FlatWidenGate, type ForkCapableBox, type GitWorkspaceOptions, type HarvestCorpusOptions, type HarvestFailure, type HarvestReport, InMemoryCorpus, InMemoryResultBlobStore, InMemorySpawnJournal, type IntentAudit, Iteration, type LoopDispatchOptions, type LoopOptionsForDispatch, LoopResult, type LoopShape, LoopTokenUsage, type LoopUntil, type LoopUntilSpec, type LoopUntilState, type McpEndpoint, type McpEnvironmentOptions, NodeId, type Observation, type ObserveInput, type ObserveOptions, type OpenSandboxRunOptions, type Outcome, type Panel, type PanelJudge, type PanelSpec, type PanelVerdict, type Persona, type PersonaContext, type PersonaExecutors, type Pipeline, type PipelineStage, type PromotionGateOptions, type PromotionVerdict, type RenderCorpusToInstructions, type RenderCorpusToInstructionsOptions, type 
ReservationTicket, ResultBlobStore, RootHandle, type RouterSeam, type RouterToolsSeam, type RunAgenticOptions, RunLoopOptions, type RunPersonified, type RunPersonifiedOptions, type SandboxCapabilities, SandboxClient, type SandboxLineage, type SandboxLineageHandle, type SandboxRun, type SandboxSeam, Scope, type ScopeAnalyst, type ScopeAnalyzeInput, type ScopeWidenGate, type SessionCapableBox, Settled, type ShapeBudget, type ShapeContext, type ShapeRegistry, type Shell, type ShotPersona, type ShotSpec, SpawnEvent, SpawnJournal, Spend, type SteerContext, type Strategy, type StrategyCtx, type StrategyEvolutionConfig, type StrategyResult, SupervisedResult, Supervisor, type SurfaceScore, type ToolSpec, type TrajectoryNode, type TrajectoryReport, type TrajectoryReportFn, type TrajectoryReportOptions, TreeView, type TurnResult, UsageEvent, type UsageSink, type VerifierEnvironmentOptions, type Verify, type VerifySpec, type Widen, type WidenDecision, type WidenLineage, type WidenSpec, type Workspace, type WorkspaceCommit, acquireSandbox, adaptiveRefine, assertStrategyContract, assertTraceDerivedFindings, auditIntent, authorStrategy, breadthDriver, buildSteerContext, builtinShapes, contentAddress, createBudgetPool, createExecutor, createExecutorRegistry, createMcpEnvironment, createRootHandle, createSandboxLineage, createScope, createScopeAnalyst, createShapeRegistry, createSupervisor, createVerifierEnvironment, defaultAnalystInstruction, defaultAuditorInstruction, definePersona, defineStrategy, depthDriver, equalKOnCost, extractLlmCallEvent, fanout, flatWidenGate, gitWorkspace, harvestCorpus, inlineSandboxClient, jjWorkspace, localShell, loopDispatch, loopUntil, mapSandboxEvent, materializeTreeView, observe, openSandboxRun, panel, pipeline, printBenchmarkReport, probeSandboxCapabilities, promotionGate, refine, registerShape, renderCorpusToInstructions, renderReport, replaySpawnTree, reportLoopUsage, runAgentic, runBenchmark, runPersonified, runStrategyEvolution, sample, 
sampleThenRefine, selectChampion, settledToIteration, spendFromUsageEvents, strategyAuthorContract, trajectoryReport, verify, widen };