@tangle-network/agent-runtime 0.47.0 → 0.49.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +79 -15
- package/dist/agent.js +1 -1
- package/dist/chunk-GHX7XOJ2.js +433 -0
- package/dist/chunk-GHX7XOJ2.js.map +1 -0
- package/dist/{chunk-T4OQQEE3.js → chunk-IQS4HI3F.js} +14 -5
- package/dist/chunk-IQS4HI3F.js.map +1 -0
- package/dist/{chunk-72JQCHOZ.js → chunk-PXUTIMGJ.js} +2318 -237
- package/dist/chunk-PXUTIMGJ.js.map +1 -0
- package/dist/{chunk-MGFEUYOH.js → chunk-U2VEWKKK.js} +3 -3
- package/dist/{chunk-JNPK46YH.js → chunk-VIEDXELL.js} +408 -6
- package/dist/chunk-VIEDXELL.js.map +1 -0
- package/dist/{chunk-VR4JIC5H.js → chunk-XTEZ3YJ4.js} +2 -2
- package/dist/index.d.ts +29 -4
- package/dist/index.js +109 -21
- package/dist/index.js.map +1 -1
- package/dist/kb-gate-CsXpNRk7.d.ts +1145 -0
- package/dist/{loop-runner-bin-DEm4roYF.d.ts → loop-runner-bin-Cgn0A-NW.d.ts} +1 -1
- package/dist/loop-runner-bin.d.ts +2 -2
- package/dist/loop-runner-bin.js +3 -3
- package/dist/loops.d.ts +3 -3
- package/dist/loops.js +57 -1
- package/dist/mcp/bin.js +187 -24
- package/dist/mcp/bin.js.map +1 -1
- package/dist/mcp/index.d.ts +28 -125
- package/dist/mcp/index.js +28 -6
- package/dist/mcp/index.js.map +1 -1
- package/dist/platform.js +2 -2
- package/dist/platform.js.map +1 -1
- package/dist/runtime.d.ts +1100 -62
- package/dist/runtime.js +57 -1
- package/dist/{types-Cbx3dNK5.d.ts → types-BpDfCPUp.d.ts} +1 -1
- package/dist/workflow.js +1 -1
- package/package.json +7 -6
- package/dist/chunk-5YDS7BLC.js +0 -218
- package/dist/chunk-5YDS7BLC.js.map +0 -1
- package/dist/chunk-72JQCHOZ.js.map +0 -1
- package/dist/chunk-JNPK46YH.js.map +0 -1
- package/dist/chunk-T4OQQEE3.js.map +0 -1
- package/dist/kb-gate-51BlLlVM.d.ts +0 -529
- /package/dist/{chunk-MGFEUYOH.js.map → chunk-U2VEWKKK.js.map} +0 -0
- /package/dist/{chunk-VR4JIC5H.js.map → chunk-XTEZ3YJ4.js.map} +0 -0
package/dist/runtime.d.ts
CHANGED
|
@@ -1,16 +1,16 @@
|
|
|
1
|
-
import { AgentProfile
|
|
1
|
+
import { AgentProfile, BackendType, CreateSandboxOptions, SandboxInstance, SandboxEvent } from '@tangle-network/sandbox';
|
|
2
2
|
export { AgentProfile, CreateSandboxOptions, SandboxEvent, SandboxInstance } from '@tangle-network/sandbox';
|
|
3
|
-
import { R as ResultBlobStore, a as SpawnJournal, N as NodeId, b as SpawnEvent, T as TreeView, c as Settled,
|
|
4
|
-
export { j as Executor, k as ExecutorContext, l as ExecutorResult, H as Handle, m as NodeSnapshot, n as NodeStatus, o as Restart, p as RootSignal, q as Runtime, r as SpawnOpts, s as SupervisorOpts, W as WidenGate } from './types-
|
|
3
|
+
import { R as ResultBlobStore, a as SpawnJournal, N as NodeId, b as SpawnEvent, T as TreeView, c as Settled, d as AgentSpec, E as ExecutorRegistry, B as Budget, A as Agent, e as RootHandle, f as SupervisedResult, g as Spend, S as Scope, h as ExecutorFactory, U as UsageEvent, i as Supervisor } from './types-BpDfCPUp.js';
|
|
4
|
+
export { j as Executor, k as ExecutorContext, l as ExecutorResult, H as Handle, m as NodeSnapshot, n as NodeStatus, o as Restart, p as RootSignal, q as Runtime, r as SpawnOpts, s as SupervisorOpts, W as WidenGate } from './types-BpDfCPUp.js';
|
|
5
|
+
import { R as RuntimeHooks } from './runtime-hooks-C7JwKb9E.js';
|
|
6
|
+
import { ChatClient, AnalystFinding, DefaultVerdict, AgentProfile as AgentProfile$1 } from '@tangle-network/agent-eval';
|
|
7
|
+
export { DefaultVerdict } from '@tangle-network/agent-eval';
|
|
5
8
|
export { A as AnalyzeInput, a as CompletionAnalyst, b as CompletionEvidence, c as CompletionPolicy, d as CompletionVerdict, C as CreateDriverOptions, D as DriverDecision, P as PlannerContext, e as TopologyMove, T as TopologyPlanner, f as completionAuthorizes, g as createDriver, h as deterministicCompletion, r as renderAnalyses, s as sentinelCompletion, i as stopSentinel } from './driver-DYU2sgHr.js';
|
|
6
9
|
import { S as SandboxClient, b as LoopResult, c as LoopTokenUsage, R as RuntimeStreamEvent, A as AgentRunSpec, E as ExecCtx, I as Iteration } from './types-nBMuollC.js';
|
|
7
10
|
export { D as Driver, C as LoopDecisionPayload, F as LoopEndedPayload, G as LoopIterationDispatchPayload, H as LoopIterationEndedPayload, J as LoopIterationStartedPayload, a as LoopLineageOptions, M as LoopPlanDescription, N as LoopPlanPayload, f as LoopSandboxPlacement, P as LoopStartedPayload, Q as LoopTeardownFailedPayload, e as LoopTraceEmitter, T as LoopTraceEvent, L as LoopWinner, O as OutputAdapter, U as ValidationCtx, V as Validator } from './types-nBMuollC.js';
|
|
8
|
-
import { AgentProfile, AnalystFinding, DefaultVerdict, ChatClient } from '@tangle-network/agent-eval';
|
|
9
|
-
export { DefaultVerdict } from '@tangle-network/agent-eval';
|
|
10
11
|
import { Scenario, ProfileDispatchFn } from '@tangle-network/agent-eval/campaign';
|
|
11
12
|
import { R as RunLoopOptions } from './run-loop-DvD4aGiE.js';
|
|
12
13
|
export { c as createSandboxForSpec, d as defaultSelectWinner, r as runLoop } from './run-loop-DvD4aGiE.js';
|
|
13
|
-
import { R as RuntimeHooks } from './runtime-hooks-C7JwKb9E.js';
|
|
14
14
|
|
|
15
15
|
/**
|
|
16
16
|
* @experimental
|
|
@@ -114,66 +114,200 @@ declare function replaySpawnTree(journal: SpawnJournal, blobs: ResultBlobStore,
|
|
|
114
114
|
declare function materializeTreeView(events: SpawnEvent[]): TreeView;
|
|
115
115
|
|
|
116
116
|
/**
|
|
117
|
-
*
|
|
118
|
-
*
|
|
119
|
-
*
|
|
117
|
+
* createWaterfallCollector — 100% trajectory observability from the lifecycle stream:
|
|
118
|
+
* every spawn/settle (shots, analysts, nested agents) becomes one timed, billed span.
|
|
119
|
+
* The sum of spans IS the run's cost story — what each step cost in dollars, tokens,
|
|
120
|
+
* and wall-clock, rendered as a text waterfall or exported as structured rows for any
|
|
121
|
+
* chart. Attach the collector's `hooks` to `runAgentic`/`runBenchmark`; spans accumulate
|
|
122
|
+
* across every task the hooks observe.
|
|
120
123
|
*/
|
|
121
|
-
|
|
124
|
+
|
|
125
|
+
interface WaterfallSpan {
|
|
126
|
+
id: string;
|
|
127
|
+
/** The spawn label (`shot:0`, `analyst:1`, a nested agent's label) — the row name. */
|
|
128
|
+
label: string;
|
|
129
|
+
runId: string;
|
|
130
|
+
parentId?: string;
|
|
131
|
+
startMs: number;
|
|
132
|
+
endMs?: number;
|
|
133
|
+
status: 'running' | 'done' | 'down';
|
|
134
|
+
usd: number;
|
|
135
|
+
tokens: {
|
|
136
|
+
input: number;
|
|
137
|
+
output: number;
|
|
138
|
+
};
|
|
139
|
+
score?: number;
|
|
140
|
+
}
|
|
141
|
+
interface WaterfallReport {
|
|
142
|
+
spans: WaterfallSpan[];
|
|
143
|
+
/** Wall-clock of the observed window (first spawn → last settle). */
|
|
144
|
+
totalMs: number;
|
|
145
|
+
totalUsd: number;
|
|
146
|
+
totalTokens: {
|
|
147
|
+
input: number;
|
|
148
|
+
output: number;
|
|
149
|
+
};
|
|
150
|
+
/** Rollup by label prefix (the part before ':') — shots vs analysts vs anything else. */
|
|
151
|
+
byKind: Record<string, {
|
|
152
|
+
count: number;
|
|
153
|
+
ms: number;
|
|
154
|
+
usd: number;
|
|
155
|
+
tokens: {
|
|
156
|
+
input: number;
|
|
157
|
+
output: number;
|
|
158
|
+
};
|
|
159
|
+
}>;
|
|
160
|
+
}
|
|
161
|
+
interface WaterfallCollector {
|
|
162
|
+
/** Attach these to RunAgenticOptions.hooks / BenchmarkConfig.hooks. */
|
|
163
|
+
hooks: RuntimeHooks;
|
|
164
|
+
report(): WaterfallReport;
|
|
165
|
+
/** The text waterfall — one row per span, bars scaled to the observed window. */
|
|
166
|
+
render(opts?: {
|
|
167
|
+
width?: number;
|
|
168
|
+
maxRows?: number;
|
|
169
|
+
}): string;
|
|
170
|
+
reset(): void;
|
|
171
|
+
}
|
|
172
|
+
declare function createWaterfallCollector(): WaterfallCollector;
|
|
122
173
|
|
|
123
174
|
/**
|
|
124
|
-
*
|
|
175
|
+
* anytimeReport — time-to-satisfactory-output metrics, derived entirely from the
|
|
176
|
+
* waterfall's spans (no new instrumentation): per task, the best-so-far score after each
|
|
177
|
+
* shot with its elapsed wall-clock and cumulative spend; per strategy, the standard
|
|
178
|
+
* anytime-optimization metrics:
|
|
179
|
+
*
|
|
180
|
+
* TTT time-to-target — elapsed ms until best-so-far ≥ the target (per task; median
|
|
181
|
+
* over tasks that reached it)
|
|
182
|
+
* STT shots-to-target — attempts until best-so-far ≥ target
|
|
183
|
+
* ERT expected running time (the COCO benchmarking convention): TOTAL time spent
|
|
184
|
+
* across all tasks — including failures' full budgets — divided by the number of
|
|
185
|
+
* tasks that reached the target. The honest "how long per success, all-in".
|
|
186
|
+
* AUC the anytime curve's area (mean best-so-far score across the budget, per shot
|
|
187
|
+
* index) — higher = climbs earlier.
|
|
188
|
+
*
|
|
189
|
+
* The "satisfactory" bar follows the COCO/BBOB convention: a SET of satisficing targets
|
|
190
|
+
* (e.g. [0.5, 0.8, 1.0] on the normalized check score), each measured independently —
|
|
191
|
+
* runtime-to-target per (task, target) pair — optionally overridden per task
|
|
192
|
+
* (`targetFor`) when satisfaction is task-specific. Spans come from
|
|
193
|
+
* `createWaterfallCollector().report()`; tasks are grouped by the supervisor runId
|
|
194
|
+
* (`agentic:<strategy>:<taskId>`); shot spans are `shot:N` labels.
|
|
195
|
+
*/
|
|
196
|
+
|
|
197
|
+
interface AnytimeTaskCurve {
|
|
198
|
+
taskId: string;
|
|
199
|
+
strategy: string;
|
|
200
|
+
/** Best-so-far after each settled shot: elapsed ms from the task's first spawn,
|
|
201
|
+
* cumulative usd, and the running max score. */
|
|
202
|
+
points: Array<{
|
|
203
|
+
elapsedMs: number;
|
|
204
|
+
cumUsd: number;
|
|
205
|
+
best: number;
|
|
206
|
+
}>;
|
|
207
|
+
/** Per satisficing target (keyed by the target value as a string): the first point
|
|
208
|
+
* where best ≥ target, or null when never reached within budget. */
|
|
209
|
+
hits: Record<string, {
|
|
210
|
+
ms: number;
|
|
211
|
+
shots: number;
|
|
212
|
+
usd: number;
|
|
213
|
+
} | null>;
|
|
214
|
+
}
|
|
215
|
+
interface AnytimeStrategySummary {
|
|
216
|
+
strategy: string;
|
|
217
|
+
/** The satisficing target this row summarizes. */
|
|
218
|
+
target: number;
|
|
219
|
+
tasks: number;
|
|
220
|
+
reachedTarget: number;
|
|
221
|
+
/** Median time-to-target over the tasks that reached it (null when none did). */
|
|
222
|
+
medianTttMs: number | null;
|
|
223
|
+
medianShotsToTarget: number | null;
|
|
224
|
+
/** COCO ERT: Σ all task wall-time (incl. failures) / #successes. Null when 0 succeed. */
|
|
225
|
+
ertMs: number | null;
|
|
226
|
+
/** Same construction over dollars: Σ all spend / #successes. */
|
|
227
|
+
erUsd: number | null;
|
|
228
|
+
/** Mean best-so-far score by shot index (the anytime curve, averaged over tasks). */
|
|
229
|
+
curveByShot: number[];
|
|
230
|
+
/** Area under the per-shot anytime curve, normalized to [0,1]. */
|
|
231
|
+
auc: number;
|
|
232
|
+
}
|
|
233
|
+
interface AnytimeReport {
|
|
234
|
+
targets: number[];
|
|
235
|
+
perTask: AnytimeTaskCurve[];
|
|
236
|
+
/** One summary per (strategy, target) pair — the COCO-style multi-target view. */
|
|
237
|
+
perStrategy: AnytimeStrategySummary[];
|
|
238
|
+
}
|
|
239
|
+
/** Derive anytime metrics from waterfall spans. `targets` are the satisficing score
|
|
240
|
+
* bars (default [1] = fully resolved; COCO-style multi-target: [0.5, 0.8, 1]);
|
|
241
|
+
* `targetFor` overrides the bar per task (task-specific satisfaction) — when set, the
|
|
242
|
+
* per-task bar replaces every entry of `targets` for that task. */
|
|
243
|
+
declare function anytimeReport(spans: WaterfallSpan[], opts?: {
|
|
244
|
+
targets?: number[];
|
|
245
|
+
targetFor?: (taskId: string) => number;
|
|
246
|
+
}): AnytimeReport;
|
|
247
|
+
/** One row per (strategy, satisficing target): the shareable time-to-satisfactory table. */
|
|
248
|
+
declare function renderAnytimeTable(report: AnytimeReport): string;
|
|
249
|
+
|
|
250
|
+
/**
|
|
251
|
+
* auditIntent — the route-rigor analyst: is this trajectory even going the RIGHT WAY?
|
|
125
252
|
*
|
|
126
|
-
*
|
|
127
|
-
*
|
|
128
|
-
*
|
|
129
|
-
*
|
|
130
|
-
* forward the loop's cost + tokens via `ctx.cost` (forgetting it yields a
|
|
131
|
-
* `{0,0}` cell the backend-integrity guard reads as a stub). Three foot-guns,
|
|
132
|
-
* the third silent. The fleet's products skipped (c) and fell back to a
|
|
133
|
-
* `workerRecords[]` side-channel — the exact anti-pattern the substrate exists
|
|
134
|
-
* to kill.
|
|
253
|
+
* `observe()` critiques execution quality ("what's unfinished"). This audits ALIGNMENT —
|
|
254
|
+
* a different failure class the score can't see until it's too late: an agent can be
|
|
255
|
+
* executing flawlessly down the wrong route. The auditor reads the trajectory and
|
|
256
|
+
* compares three intents:
|
|
135
257
|
*
|
|
136
|
-
*
|
|
258
|
+
* declared — what the task says to do (the prompt / acceptance criteria)
|
|
259
|
+
* revealed — what the agent is ACTUALLY optimizing, inferred from its action pattern
|
|
260
|
+
* (the inverse-inference move: actions reveal objectives)
|
|
261
|
+
* user — what the principal actually wants (the contract, when it differs from
|
|
262
|
+
* the literal task text), plus where the user's own trajectory is heading
|
|
137
263
|
*
|
|
138
|
-
*
|
|
139
|
-
*
|
|
140
|
-
*
|
|
141
|
-
*
|
|
142
|
-
* await runProfileMatrix({ profiles, scenarios, dispatch, judges, commitSha })
|
|
264
|
+
* and returns a verdict (aligned / drifting / diverged) with evidence and ONE
|
|
265
|
+
* recommended intervention. FIREWALLED like every analyst: input is the trajectory and
|
|
266
|
+
* the intents — never the verifier or its data (zero check-leakage, so route auditing
|
|
267
|
+
* is always Goodhart-safe to run online).
|
|
143
268
|
*
|
|
144
|
-
*
|
|
145
|
-
*
|
|
146
|
-
*
|
|
147
|
-
*
|
|
148
|
-
*
|
|
149
|
-
* inversion.
|
|
269
|
+
* Where it runs: between shots (steer the next one), as a watchdog over the lifecycle
|
|
270
|
+
* stream (abort-and-refund a diverged rollout — the budget pool makes early abort
|
|
271
|
+
* strictly valuable), or post-hoc over a whole BenchmarkReport (the meta-intent pass:
|
|
272
|
+
* is the LOOP optimizing the right thing — degenerate submissions, check-gaming shapes,
|
|
273
|
+
* objective drift across tasks).
|
|
150
274
|
*/
|
|
151
275
|
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
/**
|
|
156
|
-
|
|
157
|
-
/**
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
* integrity guard sees real activity. */
|
|
164
|
-
toArtifact?: (result: LoopResult<Task, Output, Decision>) => TArtifact;
|
|
165
|
-
/** Forward `loop.*` trace events into the campaign's scoped trace so loop
|
|
166
|
-
* spans correlate with the cell. Default true. */
|
|
167
|
-
forwardTrace?: boolean;
|
|
168
|
-
/** Cost-meter source label for the loop's spend. Default `'loop'`. */
|
|
169
|
-
costSource?: string;
|
|
276
|
+
interface AuditIntentInput {
|
|
277
|
+
/** The declared intent: the task text / acceptance criteria the agent was given. */
|
|
278
|
+
declaredIntent: string;
|
|
279
|
+
/** The trajectory so far — tool calls + results + assistant turns (any event shapes). */
|
|
280
|
+
trace: ReadonlyArray<unknown>;
|
|
281
|
+
/** The principal's actual intent when it differs from the literal task (the contract). */
|
|
282
|
+
userIntent?: string;
|
|
283
|
+
/** The loop-level purpose (meta-intent): what the WHOLE run is for — lets the auditor
|
|
284
|
+
* flag locally-sensible work that serves the wrong larger objective. */
|
|
285
|
+
metaIntent?: string;
|
|
286
|
+
runId?: string;
|
|
170
287
|
}
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
288
|
+
interface AuditIntentOptions {
|
|
289
|
+
chat: ChatClient;
|
|
290
|
+
model?: string;
|
|
291
|
+
/** Override the auditor instruction (optimizable like any analyst prompt). */
|
|
292
|
+
auditorInstruction?: string;
|
|
293
|
+
/** Cap trace lines fed to the auditor. Default 80. */
|
|
294
|
+
maxTraceLines?: number;
|
|
295
|
+
signal?: AbortSignal;
|
|
296
|
+
}
|
|
297
|
+
interface IntentAudit {
|
|
298
|
+
/** What the agent's actions reveal it is actually optimizing — one sentence. */
|
|
299
|
+
revealedIntent: string;
|
|
300
|
+
verdict: 'aligned' | 'drifting' | 'diverged';
|
|
301
|
+
/** Trajectory-grounded evidence for the verdict (specific calls/patterns). */
|
|
302
|
+
evidence: string;
|
|
303
|
+
/** The single recommended intervention. */
|
|
304
|
+
recommendation: 'continue' | 'steer' | 'abort';
|
|
305
|
+
/** When recommendation is 'steer': the corrective instruction to inject. */
|
|
306
|
+
steer?: string;
|
|
307
|
+
confidence: number;
|
|
308
|
+
}
|
|
309
|
+
declare const defaultAuditorInstruction: string;
|
|
310
|
+
declare function auditIntent(input: AuditIntentInput, opts: AuditIntentOptions): Promise<IntentAudit>;
|
|
177
311
|
|
|
178
312
|
/**
|
|
179
313
|
* @experimental
|
|
@@ -329,7 +463,7 @@ interface ShapeContext<D = unknown> {
|
|
|
329
463
|
spawnChild(name: string, spec: AgentSpec): Agent<unknown, Outcome<D>>;
|
|
330
464
|
/** Derive a child `AgentSpec` from the persona's root spec with an overridden profile —
|
|
331
465
|
* the seam a shape uses to give a worker a narrower role/prompt than the root persona. */
|
|
332
|
-
childSpec(profile: AgentProfile
|
|
466
|
+
childSpec(profile: AgentProfile, harness?: BackendType | null): AgentSpec;
|
|
333
467
|
}
|
|
334
468
|
/**
|
|
335
469
|
* A reusable act-body factory. Given the persona's content + seams (`ShapeContext`), it
|
|
@@ -790,7 +924,7 @@ interface RenderCorpusToInstructionsOptions {
|
|
|
790
924
|
readonly corpus: Corpus;
|
|
791
925
|
readonly filter: CorpusFilter;
|
|
792
926
|
/** The profile to project the facts into. The result is a fresh profile — the input is unchanged. */
|
|
793
|
-
readonly profile: AgentProfile
|
|
927
|
+
readonly profile: AgentProfile;
|
|
794
928
|
/** Where the rendered facts land: appended to `prompt.instructions[]` (default) or folded into
|
|
795
929
|
* the single-blob `resources.instructions` string. */
|
|
796
930
|
readonly target?: 'prompt' | 'resources';
|
|
@@ -799,7 +933,7 @@ interface RenderCorpusToInstructionsOptions {
|
|
|
799
933
|
}
|
|
800
934
|
/** `renderCorpusToInstructions(opts)` — the flywheel read-back projection. Async (queries the
|
|
801
935
|
* durable corpus); returns a fresh `AgentProfile` with the accreted facts merged in. */
|
|
802
|
-
type RenderCorpusToInstructions = (opts: RenderCorpusToInstructionsOptions) => Promise<AgentProfile
|
|
936
|
+
type RenderCorpusToInstructions = (opts: RenderCorpusToInstructionsOptions) => Promise<AgentProfile>;
|
|
803
937
|
/**
|
|
804
938
|
* One node in the reconstructed trajectory tree — a driver OR a leaf, with its OWN spend and the
|
|
805
939
|
* spend ROLLED UP over its subtree. Reconstructed from the `SpawnJournal` (structure + per-node
|
|
@@ -941,7 +1075,15 @@ interface ObserveOptions {
|
|
|
941
1075
|
signal?: AbortSignal;
|
|
942
1076
|
/** Cap the trace lines fed to the observer (keeps the call cheap). Default 80. */
|
|
943
1077
|
maxTraceLines?: number;
|
|
944
|
-
|
|
1078
|
+
/** Override the analyst's system instruction — the prompt that turns a trace into
|
|
1079
|
+
* findings + recommended_actions. The analyst IS the steerer, so this is the knob a
|
|
1080
|
+
* prompt optimizer (GEPA) tunes. Omitted ⇒ the default observer instruction. The
|
|
1081
|
+
* firewall (trace-only, never the verdict) is structural (input has no score), so a
|
|
1082
|
+
* custom instruction cannot break it. */
|
|
1083
|
+
analystInstruction?: string;
|
|
1084
|
+
}
|
|
1085
|
+
/** The default observer instruction — exported so an optimizer can seed its population. */
|
|
1086
|
+
declare const defaultAnalystInstruction: string;
|
|
945
1087
|
interface Observation {
|
|
946
1088
|
findings: AnalystFinding[];
|
|
947
1089
|
/** Facts persisted to the corpus (empty when no corpus was supplied). */
|
|
@@ -954,6 +1096,488 @@ declare function observe(input: ObserveInput, opts: ObserveOptions): Promise<Obs
|
|
|
954
1096
|
* steer; the operator block is the advice. */
|
|
955
1097
|
declare function renderReport(findings: ReadonlyArray<AnalystFinding>): string;
|
|
956
1098
|
|
|
1099
|
+
/**
|
|
1100
|
+
* harvestCorpus — production traces → corpus, the G2 bridge (the playbook's step 6).
|
|
1101
|
+
* The flywheel's write side, batched: run the firewalled `observe()` analyst over a
|
|
1102
|
+
* stream of completed runs (yesterday's production traces, a benchmark's rollouts, a
|
|
1103
|
+
* fleet's day) and accrete the trace-derived facts into the durable corpus.
|
|
1104
|
+
*
|
|
1105
|
+
* Store-agnostic by design: the caller maps its trace store's rows (a
|
|
1106
|
+
* `ProductionTraceSink` ndjson, OTLP spans, RunRecords) to `ObserveInput` — task text,
|
|
1107
|
+
* final output, the event trace, terminal outcome. The analyst reads BEHAVIOR only
|
|
1108
|
+
* (the firewall is structural: the input carries no judge verdict), and corpus appends
|
|
1109
|
+
* are idempotent on (claim + tags), so re-harvesting the same window is safe.
|
|
1110
|
+
*
|
|
1111
|
+
* The nightly product job is then three lines:
|
|
1112
|
+
* const runs = mapSinkRowsToObserveInputs(await readSink(yesterday))
|
|
1113
|
+
* const report = await harvestCorpus({ runs, chat, corpus, tags: ['gtm-agent'] })
|
|
1114
|
+
* log(report) // runsObserved / findings / learned / failures
|
|
1115
|
+
*
|
|
1116
|
+
* NOTE on the read side: harvesting is safe and cheap; *injecting* facts back into runs
|
|
1117
|
+
* is the measured danger zone — naive unconditional priming tested NEGATIVE (−11.6pp,
|
|
1118
|
+
* context pollution; docs/research/layer-across-run.md). Gate any priming design on its
|
|
1119
|
+
* own A/B; the corpus's first consumers are operators and optimizers, not prompts.
|
|
1120
|
+
*/
|
|
1121
|
+
|
|
1122
|
+
interface HarvestCorpusOptions {
|
|
1123
|
+
/** The completed runs to analyze — map your store's rows to `ObserveInput`. */
|
|
1124
|
+
runs: AsyncIterable<ObserveInput> | Iterable<ObserveInput>;
|
|
1125
|
+
/** The model-call seam (agent-eval `createChatClient`). */
|
|
1126
|
+
chat: ChatClient;
|
|
1127
|
+
model?: string;
|
|
1128
|
+
/** The durable corpus the facts accrete into. */
|
|
1129
|
+
corpus: Corpus;
|
|
1130
|
+
/** Tags written onto learned facts (the product/domain key the read side queries by). */
|
|
1131
|
+
tags?: ReadonlyArray<string>;
|
|
1132
|
+
/** Override the analyst instruction (the GEPA-tunable knob). */
|
|
1133
|
+
analystInstruction?: string;
|
|
1134
|
+
/** Runs analyzed in parallel. Default 4. */
|
|
1135
|
+
concurrency?: number;
|
|
1136
|
+
/** Hard cap on runs consumed from the stream (a cost guard for unbounded stores). */
|
|
1137
|
+
maxRuns?: number;
|
|
1138
|
+
signal?: AbortSignal;
|
|
1139
|
+
}
|
|
1140
|
+
interface HarvestFailure {
|
|
1141
|
+
runId: string;
|
|
1142
|
+
error: string;
|
|
1143
|
+
}
|
|
1144
|
+
interface HarvestReport {
|
|
1145
|
+
runsObserved: number;
|
|
1146
|
+
/** Total findings the analyst produced (including ones already known). */
|
|
1147
|
+
findings: number;
|
|
1148
|
+
/** NEW facts actually appended (idempotent dedup excludes re-learned ones). */
|
|
1149
|
+
learned: number;
|
|
1150
|
+
/** Per-run analysis failures — reported, never silently dropped. */
|
|
1151
|
+
failures: HarvestFailure[];
|
|
1152
|
+
}
|
|
1153
|
+
declare function harvestCorpus(opts: HarvestCorpusOptions): Promise<HarvestReport>;
|
|
1154
|
+
|
|
1155
|
+
/**
|
|
1156
|
+
* Adapt an `ExecutorFactory` into a `SandboxClient` for `runLoop`. The factory is
|
|
1157
|
+
* instantiated fresh per `streamPrompt` (mirrors the per-spawn executor lifecycle):
|
|
1158
|
+
* run once on the prompt, emit the terminal result event, tear down.
|
|
1159
|
+
*/
|
|
1160
|
+
declare function inlineSandboxClient(factory: ExecutorFactory<unknown>): SandboxClient;
|
|
1161
|
+
|
|
1162
|
+
/**
|
|
1163
|
+
* `loopDispatch` — turn `runLoop` into an agent-eval campaign dispatch.
|
|
1164
|
+
*
|
|
1165
|
+
* Without this adapter a consumer wiring `runLoop` into `runProfileMatrix` /
|
|
1166
|
+
* `runCampaign` has to, by hand, every time: (a) build an `ExecCtx` with a
|
|
1167
|
+
* sandbox client, (b) adapt the campaign `DispatchContext.trace` into a
|
|
1168
|
+
* `LoopTraceEmitter` (or lose all loop trace correlation), and (c) remember to
|
|
1169
|
+
* forward the loop's cost + tokens via `ctx.cost` (forgetting it yields a
|
|
1170
|
+
* `{0,0}` cell the backend-integrity guard reads as a stub). Three foot-guns,
|
|
1171
|
+
* the third silent. The fleet's products skipped (c) and fell back to a
|
|
1172
|
+
* `workerRecords[]` side-channel — the exact anti-pattern the substrate exists
|
|
1173
|
+
* to kill.
|
|
1174
|
+
*
|
|
1175
|
+
* `loopDispatch` collapses all three into one typed call:
|
|
1176
|
+
*
|
|
1177
|
+
* const dispatch = loopDispatch({
|
|
1178
|
+
* sandboxClient,
|
|
1179
|
+
* toLoopOptions: (scenario, profile) => ({ driver, agentRun, output, validator, task }),
|
|
1180
|
+
* })
|
|
1181
|
+
* await runProfileMatrix({ profiles, scenarios, dispatch, judges, commitSha })
|
|
1182
|
+
*
|
|
1183
|
+
* Usage is reported automatically; trace events are forwarded automatically;
|
|
1184
|
+
* the ctx is built automatically. The seam becomes impossible to mis-wire.
|
|
1185
|
+
*
|
|
1186
|
+
* Typed structurally against the campaign `DispatchContext` (imported type-only
|
|
1187
|
+
* from `@tangle-network/agent-eval/campaign`) — a downward dependency, never an
|
|
1188
|
+
* inversion.
|
|
1189
|
+
*/
|
|
1190
|
+
|
|
1191
|
+
/** runLoop options minus the `ctx` (loopDispatch builds the ctx). */
|
|
1192
|
+
type LoopOptionsForDispatch<Task, Output, Decision> = Omit<RunLoopOptions<Task, Output, Decision>, 'ctx'>;
|
|
1193
|
+
interface LoopDispatchOptions<Task, Output, Decision, TScenario extends Scenario, TArtifact> {
|
|
1194
|
+
/** Sandbox client used for every cell's `runLoop`. Supplied once. */
|
|
1195
|
+
sandboxClient: SandboxClient;
|
|
1196
|
+
/** Build the per-cell runLoop options from the scenario (+ profile, when
|
|
1197
|
+
* used with `runProfileMatrix`). */
|
|
1198
|
+
toLoopOptions: (scenario: TScenario, profile: AgentProfile$1) => LoopOptionsForDispatch<Task, Output, Decision>;
|
|
1199
|
+
/** Map the finished loop to the artifact the judges score. Default:
|
|
1200
|
+
* `result.winner?.output`. A loop with no winner yields `undefined` (judges
|
|
1201
|
+
* skip the cell) — but the loop's token usage is STILL reported, so the
|
|
1202
|
+
* integrity guard sees real activity. */
|
|
1203
|
+
toArtifact?: (result: LoopResult<Task, Output, Decision>) => TArtifact;
|
|
1204
|
+
/** Forward `loop.*` trace events into the campaign's scoped trace so loop
|
|
1205
|
+
* spans correlate with the cell. Default true. */
|
|
1206
|
+
forwardTrace?: boolean;
|
|
1207
|
+
/** Cost-meter source label for the loop's spend. Default `'loop'`. */
|
|
1208
|
+
costSource?: string;
|
|
1209
|
+
}
|
|
1210
|
+
/**
|
|
1211
|
+
* Adapter for `runProfileMatrix` (profile is an axis). Returns a
|
|
1212
|
+
* `ProfileDispatchFn` that runs `runLoop` per (profile, scenario) cell and
|
|
1213
|
+
* reports usage automatically.
|
|
1214
|
+
*/
|
|
1215
|
+
declare function loopDispatch<Task, Output, Decision, TScenario extends Scenario, TArtifact>(opts: LoopDispatchOptions<Task, Output, Decision, TScenario, TArtifact>): ProfileDispatchFn<TScenario, TArtifact>;
|
|
1216
|
+
|
|
1217
|
+
/**
|
|
1218
|
+
* The general agentic primitive — sequential (depth) and parallel (breadth) over a shared,
|
|
1219
|
+
* checkable artifact, driven through the keystone Supervisor as one recursive `Agent.act`.
|
|
1220
|
+
*
|
|
1221
|
+
* The domain lives behind ONE seam — `AgenticSurface` (open an artifact, list tools, call a tool,
|
|
1222
|
+
* score the artifact, close it). EnterpriseOps implements it (seed a gym DB, MCP tools, SQL
|
|
1223
|
+
* verifier); Commit0/AppWorld/terminal-bench implement it the same way (a repo workspace, shell
|
|
1224
|
+
* tools, the test suite). The drivers below are domain-blind: they run over any surface.
|
|
1225
|
+
*
|
|
1226
|
+
* Two shapes, the agent's POMDP rollout as the unit:
|
|
1227
|
+
* - DEPTH one persistent artifact carried across shots. Each shot the agent works the tool loop;
|
|
1228
|
+
* between shots a trace-analyst (selector≠judge: reads the trajectory, never the score)
|
|
1229
|
+
* steers the resumed session toward what's unfinished. shot n stands on shot n-1's
|
|
1230
|
+
* artifact state + history. This is continuation — long-horizon, same artifact.
|
|
1231
|
+
* - BREADTH K independent artifacts, each a fresh rollout, the deployable verifier picks the best.
|
|
1232
|
+
*
|
|
1233
|
+
* Both are an `Agent` whose `act` spawns leaf shots through `scope.spawn` and reacts via
|
|
1234
|
+
* `scope.next()` — so the conserved budget pool meters them (equal-k by construction), the journal
|
|
1235
|
+
* records the tree, and the same primitive nests. `runAgentic` runs the chosen driver through
|
|
1236
|
+
* `createSupervisor().run`. The leaf (one shot over a handle) is resolved per-spawn from a
|
|
1237
|
+
* surface-closed registry — the open `Executor` seam, not bespoke per-benchmark glue.
|
|
1238
|
+
*/
|
|
1239
|
+
|
|
1240
|
+
interface AgenticTask {
|
|
1241
|
+
readonly id: string;
|
|
1242
|
+
readonly systemPrompt: string;
|
|
1243
|
+
readonly userPrompt: string;
|
|
1244
|
+
/** Opaque domain payload the surface reads (EOPS: servers/verifiers/tools). Drivers never read it. */
|
|
1245
|
+
readonly meta?: Record<string, unknown>;
|
|
1246
|
+
}
|
|
1247
|
+
interface ArtifactHandle {
|
|
1248
|
+
readonly id: string;
|
|
1249
|
+
readonly surface: string;
|
|
1250
|
+
/** Opaque per-artifact context the surface stashes (EOPS: the seeded gym server + db id). */
|
|
1251
|
+
readonly ctx?: unknown;
|
|
1252
|
+
}
|
|
1253
|
+
interface AgenticTool {
|
|
1254
|
+
readonly type: 'function';
|
|
1255
|
+
readonly function: {
|
|
1256
|
+
name: string;
|
|
1257
|
+
description?: string;
|
|
1258
|
+
parameters: Record<string, unknown>;
|
|
1259
|
+
};
|
|
1260
|
+
}
|
|
1261
|
+
interface SurfaceScore {
|
|
1262
|
+
passes: number;
|
|
1263
|
+
total: number;
|
|
1264
|
+
/** Checks excluded as malformed (data defect, not the agent). `total === 0` ⇒ unscoreable. */
|
|
1265
|
+
errored: number;
|
|
1266
|
+
}
|
|
1267
|
+
/** A stateful, checkable environment an agent operates over with tools. Open behind one interface. */
|
|
1268
|
+
interface AgenticSurface {
|
|
1269
|
+
readonly name: string;
|
|
1270
|
+
open(task: AgenticTask): Promise<ArtifactHandle>;
|
|
1271
|
+
tools(task: AgenticTask, handle: ArtifactHandle): Promise<AgenticTool[]>;
|
|
1272
|
+
call(handle: ArtifactHandle, name: string, args: Record<string, unknown>): Promise<string>;
|
|
1273
|
+
score(task: AgenticTask, handle: ArtifactHandle): Promise<SurfaceScore>;
|
|
1274
|
+
close(handle: ArtifactHandle): Promise<void>;
|
|
1275
|
+
}
|
|
1276
|
+
interface AgenticOptions {
|
|
1277
|
+
routerBaseUrl: string;
|
|
1278
|
+
routerKey: string;
|
|
1279
|
+
model: string;
|
|
1280
|
+
temperature?: number;
|
|
1281
|
+
/** Completion cap per worker turn — REQUIRED for thinking models (they burn unbounded
|
|
1282
|
+
* budgets on reasoning and return empty content without it). Omitted ⇒ provider default. */
|
|
1283
|
+
maxTokens?: number;
|
|
1284
|
+
/** Turns the agent may take within ONE shot before the driver intervenes. */
|
|
1285
|
+
innerTurns?: number;
|
|
1286
|
+
/** The depth STEERER's analyst instruction (observe()'s system prompt). The knob a
|
|
1287
|
+
* prompt optimizer (GEPA) tunes — the analyst IS the steerer. Omitted ⇒ the default. */
|
|
1288
|
+
analystInstruction?: string;
|
|
1289
|
+
/** The critic's model — lets the analyst be a stronger (or cheaper) model than the
|
|
1290
|
+
* worker. Omitted ⇒ the worker's `model`. */
|
|
1291
|
+
analystModel?: string;
|
|
1292
|
+
/** Across-run learning: when set, the analyst's observe() pass appends trace-derived
|
|
1293
|
+
* facts here (the flywheel write side). Priming (the read side) is the caller's move —
|
|
1294
|
+
* query the corpus and fold facts into the task's systemPrompt before runAgentic. */
|
|
1295
|
+
corpus?: Corpus;
|
|
1296
|
+
/** Tags written onto learned facts (and used by the caller's priming query). */
|
|
1297
|
+
corpusTags?: string[];
|
|
1298
|
+
}
|
|
1299
|
+
type Msg = Record<string, unknown>;
|
|
1300
|
+
interface ShotResult {
|
|
1301
|
+
messages: Msg[];
|
|
1302
|
+
score: number;
|
|
1303
|
+
passes: number;
|
|
1304
|
+
total: number;
|
|
1305
|
+
completions: number;
|
|
1306
|
+
toolErrors: number;
|
|
1307
|
+
}
|
|
1308
|
+
interface AgenticRunResult {
|
|
1309
|
+
/** The strategy name (built-in 'depth'/'breadth' or a custom strategy's name). */
|
|
1310
|
+
mode: string;
|
|
1311
|
+
score: number;
|
|
1312
|
+
resolved: boolean;
|
|
1313
|
+
completions: number;
|
|
1314
|
+
/** DEPTH: score after each shot — the progress-over-rounds curve. BREADTH: best-so-far per rollout. */
|
|
1315
|
+
progression: number[];
|
|
1316
|
+
shots: number;
|
|
1317
|
+
/** The cost vector, stamped by `runAgentic` from the Supervisor's conserved pool: real
|
|
1318
|
+
* router tokens, priced usd (0 when the model is unpriced — never fabricated), wall ms. */
|
|
1319
|
+
usd: number;
|
|
1320
|
+
ms: number;
|
|
1321
|
+
tokens: {
|
|
1322
|
+
input: number;
|
|
1323
|
+
output: number;
|
|
1324
|
+
};
|
|
1325
|
+
}
|
|
1326
|
+
/** DEPTH: one persistent artifact, carried across analyst-steered shots. */
|
|
1327
|
+
declare function depthDriver(surface: AgenticSurface, task: AgenticTask, opts: AgenticOptions, cfg: {
|
|
1328
|
+
maxShots: number;
|
|
1329
|
+
}): Agent<unknown, Outcome<unknown>>;
|
|
1330
|
+
/** BREADTH: K independent rollouts (each own artifact), verifier picks the best. */
|
|
1331
|
+
declare function breadthDriver(_surface: AgenticSurface, task: AgenticTask, opts: AgenticOptions, cfg: {
|
|
1332
|
+
width: number;
|
|
1333
|
+
}): Agent<unknown, Outcome<unknown>>;
|
|
1334
|
+
/**
|
|
1335
|
+
* A Strategy is HOW you spend the compute budget to beat the Environment's check — it
|
|
1336
|
+
* builds the driver `Agent` the Supervisor runs. This is the OPEN extension point: a dev
|
|
1337
|
+
* authors their own by implementing `driver()` to return an Agent whose `act()` spawns
|
|
1338
|
+
* shots/analysts via `scope.spawn` / `scope.next` / `scope.send`. The two built-ins are
|
|
1339
|
+
* the reference implementations to copy:
|
|
1340
|
+
* sample — K INDEPENDENT attempts, keep the best-verifying (best-of-N / resample).
|
|
1341
|
+
* refine — attempt → observe() reads the trace → steer the next → repeat (iterate).
|
|
1342
|
+
* (A multi-agent "team" is just a Strategy whose driver spawns several different agents.)
|
|
1343
|
+
*/
|
|
1344
|
+
interface Strategy {
|
|
1345
|
+
readonly name: string;
|
|
1346
|
+
driver(surface: AgenticSurface, task: AgenticTask, opts: AgenticOptions, budget: number): Agent<unknown, Outcome<unknown>>;
|
|
1347
|
+
}
|
|
1348
|
+
declare const sample: Strategy;
|
|
1349
|
+
declare const refine: Strategy;
|
|
1350
|
+
/** A role for one shot — multi-agent loops (researcher + engineer, a panel of k
|
|
1351
|
+
* researchers) give each shot its own system prompt and optionally its own model. */
|
|
1352
|
+
interface ShotPersona {
|
|
1353
|
+
/** Replaces the task's systemPrompt for a FRESH shot; on a carried conversation it is
|
|
1354
|
+
* injected as a hand-off message (the transcript's earlier roles stay intact). */
|
|
1355
|
+
systemPrompt?: string;
|
|
1356
|
+
/** Per-shot model override (e.g. a stronger model for the engineer shot). */
|
|
1357
|
+
model?: string;
|
|
1358
|
+
}
|
|
1359
|
+
interface ShotSpec {
|
|
1360
|
+
/** present ⇒ continue this artifact (depth); absent ⇒ the shot opens a fresh one (sample/restart). */
|
|
1361
|
+
handle?: ArtifactHandle;
|
|
1362
|
+
messages?: Msg[];
|
|
1363
|
+
steer?: string;
|
|
1364
|
+
persona?: ShotPersona;
|
|
1365
|
+
/** Restrict THIS shot to a subset of the domain's tools (by name) — focus a shot on
|
|
1366
|
+
* the relevant capabilities. Restriction-only; unknown names throw. Omitted ⇒ all. */
|
|
1367
|
+
tools?: string[];
|
|
1368
|
+
}
|
|
1369
|
+
interface StrategyResult {
|
|
1370
|
+
score: number;
|
|
1371
|
+
resolved: boolean;
|
|
1372
|
+
completions: number;
|
|
1373
|
+
progression: number[];
|
|
1374
|
+
shots: number;
|
|
1375
|
+
}
|
|
1376
|
+
/** Artifact lifecycle a strategy may manage itself — open/close ONLY. Raw `call`/`score`
|
|
1377
|
+
* are withheld: scores reach the body solely through `shot()`'s ShotResult (the
|
|
1378
|
+
* harness-verified channel), so a body cannot peek the check or fabricate around it. */
|
|
1379
|
+
interface StrategyArtifacts {
|
|
1380
|
+
readonly name: string;
|
|
1381
|
+
open(task: AgenticTask): Promise<ArtifactHandle>;
|
|
1382
|
+
close(handle: ArtifactHandle): Promise<void>;
|
|
1383
|
+
}
|
|
1384
|
+
/** What a strategy body composes with: the artifact lifecycle, the budget, and the two steps. */
|
|
1385
|
+
interface StrategyCtx {
|
|
1386
|
+
/** Open/close artifacts the body manages itself (e.g. one persistent handle for depth). */
|
|
1387
|
+
readonly surface: StrategyArtifacts;
|
|
1388
|
+
readonly task: AgenticTask;
|
|
1389
|
+
readonly opts: AgenticOptions;
|
|
1390
|
+
readonly budget: number;
|
|
1391
|
+
readonly scope: Scope<Outcome<unknown>>;
|
|
1392
|
+
/** Run ONE worker shot; its harness-scored result, or null if it went down. */
|
|
1393
|
+
shot(spec?: ShotSpec): Promise<ShotResult | null>;
|
|
1394
|
+
/** The firewalled critic reads the trajectory → a steer string, or null on COMPLETE/down. */
|
|
1395
|
+
critique(messages: Msg[]): Promise<string | null>;
|
|
1396
|
+
/** The RAW analyst channel: the firewalled critic answers `instruction` over the
|
|
1397
|
+
* trajectory verbatim — no findings extraction, so verdict-shaped formats
|
|
1398
|
+
* (CONTINUE/STOP decisions, calibrated predictions) survive. Same firewall:
|
|
1399
|
+
* trajectory in, never scores. Null when the analyst went down. */
|
|
1400
|
+
consult(messages: Msg[], instruction: string): Promise<string | null>;
|
|
1401
|
+
/** The tools THIS artifact's task actually offers (names + descriptions only — never
|
|
1402
|
+
* the implementations). Tool sets vary per task on heterogeneous domains; a strategy
|
|
1403
|
+
* that restricts shots MUST select from this list, never from hardcoded names. */
|
|
1404
|
+
listTools(handle: ArtifactHandle): Promise<Array<{
|
|
1405
|
+
name: string;
|
|
1406
|
+
description?: string;
|
|
1407
|
+
}>>;
|
|
1408
|
+
}
|
|
1409
|
+
/** Author a Strategy from the composable steps — the open, compact way. */
|
|
1410
|
+
declare function defineStrategy(name: string, run: (ctx: StrategyCtx) => Promise<StrategyResult>): Strategy;
|
|
1411
|
+
/** A NEW strategy, authored from the steps (~20 lines): refine, but when a steered shot
|
|
1412
|
+
* fails to improve the score it ABANDONS that line and restarts fresh (branch-when-stuck)
|
|
1413
|
+
* — the widen/MCTS idea the depth-stuck failure motivated. Scored keep-best (the best
|
|
1414
|
+
* checkpoint across all lines), the deployable metric. This is the "experts build BETTER
|
|
1415
|
+
* optimizations" path: a new technique, compact, with zero Supervisor ceremony. */
|
|
1416
|
+
declare const adaptiveRefine: Strategy;
|
|
1417
|
+
/** The explore-then-exploit MIX: spend ⌈budget/2⌉ on independent samples (kept open),
|
|
1418
|
+
* then refine the best-verifying line with the remaining budget. Sample's basin escape +
|
|
1419
|
+
* refine's accumulation — the third built-in, authored from the public steps. */
|
|
1420
|
+
declare const sampleThenRefine: Strategy;
|
|
1421
|
+
interface RunAgenticOptions extends AgenticOptions {
|
|
1422
|
+
surface: AgenticSurface;
|
|
1423
|
+
task: AgenticTask;
|
|
1424
|
+
/** Lifecycle observability — every spawn/settle (shots, analysts) streams here live.
|
|
1425
|
+
* The seam online watchdogs/route-auditors subscribe to. */
|
|
1426
|
+
hooks?: RuntimeHooks;
|
|
1427
|
+
/** A Strategy (the open way) — author/pass your own. Overrides `mode` when present. */
|
|
1428
|
+
strategy?: Strategy;
|
|
1429
|
+
/** Built-in shorthand: 'depth'→refine, 'breadth'→sample. Default 'depth'. */
|
|
1430
|
+
mode?: 'depth' | 'breadth';
|
|
1431
|
+
/** budget: refine→max shots; sample→rollout width. */
|
|
1432
|
+
budget: number;
|
|
1433
|
+
rootBudget?: Budget;
|
|
1434
|
+
}
|
|
1435
|
+
/** Run a Strategy through the keystone Supervisor — `Agent.act` over a conserved-budget Scope. */
|
|
1436
|
+
declare function runAgentic(opts: RunAgenticOptions): Promise<AgenticRunResult>;
|
|
1437
|
+
|
|
1438
|
+
/**
|
|
1439
|
+
* runBenchmark — the packaged optimization suite. Define a domain by implementing an
|
|
1440
|
+
* `Environment` (open / tools / call / score / close); get the optimization strategies
|
|
1441
|
+
* compared, scored by your own deployable check, with a paired-bootstrap report — free.
|
|
1442
|
+
*
|
|
1443
|
+
* The mental model: you have a TASK + a deployable CHECK + a compute BUDGET. A strategy
|
|
1444
|
+
* is how you spend the budget to beat the check. Two built-ins:
|
|
1445
|
+
*
|
|
1446
|
+
* sample — N independent attempts, keep the best-verifying one. (best-of-N / resample)
|
|
1447
|
+
* refine — attempt → a critic reads the trace → steer the next → repeat. (iterate-with-feedback)
|
|
1448
|
+
*
|
|
1449
|
+
* Both run at equal budget through the Supervisor's conserved pool; the headline is the
|
|
1450
|
+
* paired lift of refine over sample. Author your own strategy with `defineStrategy`.
|
|
1451
|
+
*/
|
|
1452
|
+
|
|
1453
|
+
/** A checkable task domain — implement these 5 hooks and the suite does the rest. The
|
|
1454
|
+
* same seam as `AgenticSurface`; `Environment` is the RL/gym-standard name for it. */
|
|
1455
|
+
type Environment = AgenticSurface;
|
|
1456
|
+
interface BenchmarkConfig {
|
|
1457
|
+
/** The task domain (5 hooks). */
|
|
1458
|
+
environment: Environment;
|
|
1459
|
+
/** The tasks to score across. */
|
|
1460
|
+
tasks: AgenticTask[];
|
|
1461
|
+
/** The worker: model + router + (optional) the critic's instruction (the steerer knob). */
|
|
1462
|
+
worker: AgenticOptions;
|
|
1463
|
+
/** Which strategies to compare. Pass the built-ins (`refine`, `sample`) or your own.
|
|
1464
|
+
* Default: [sample, refine]. */
|
|
1465
|
+
strategies?: Strategy[];
|
|
1466
|
+
/** Shots (refine) / width (sample) — the equal compute budget per strategy. Default 3. */
|
|
1467
|
+
budget?: number;
|
|
1468
|
+
/** Tasks scored in parallel. Default 3. */
|
|
1469
|
+
concurrency?: number;
|
|
1470
|
+
/** Progress hook — fires as each task settles (the live-monitoring seam: append to a
|
|
1471
|
+
* progress file, render a tree, stream to a dashboard). `done` counts settled tasks. */
|
|
1472
|
+
onTask?: (row: BenchmarkTaskRow, done: number, total: number) => void;
|
|
1473
|
+
/** Lifecycle observability — every spawn/settle of every cell's shots/analysts streams
|
|
1474
|
+
* here live (the watchdog/route-auditor seam, passed through to `runAgentic`). */
|
|
1475
|
+
hooks?: RuntimeHooks;
|
|
1476
|
+
}
|
|
1477
|
+
interface BenchmarkLift {
|
|
1478
|
+
/** Mean of paired deltas (refine − sample). */
|
|
1479
|
+
mean: number;
|
|
1480
|
+
low: number;
|
|
1481
|
+
high: number;
|
|
1482
|
+
n: number;
|
|
1483
|
+
}
|
|
1484
|
+
/** One strategy's outcome on one task — the per-task cell an optimizer consumes. */
|
|
1485
|
+
interface BenchmarkCell {
|
|
1486
|
+
score: number;
|
|
1487
|
+
resolved: boolean;
|
|
1488
|
+
/** The progress curve (refine: score per shot; sample: best-so-far per rollout). */
|
|
1489
|
+
progression: number[];
|
|
1490
|
+
usd: number;
|
|
1491
|
+
ms: number;
|
|
1492
|
+
tokens: {
|
|
1493
|
+
input: number;
|
|
1494
|
+
output: number;
|
|
1495
|
+
};
|
|
1496
|
+
}
|
|
1497
|
+
interface BenchmarkTaskRow {
|
|
1498
|
+
taskId: string;
|
|
1499
|
+
/** Per-strategy cells; absent when the task errored before completing all strategies. */
|
|
1500
|
+
cells?: Record<string, BenchmarkCell>;
|
|
1501
|
+
/** Per-strategy failures on this task: the strategy competed, threw, and scored an
|
|
1502
|
+
* honest zero — it loses, it does not poison the row. The message is kept so a later
|
|
1503
|
+
* generation's author can see WHY a candidate died. */
|
|
1504
|
+
errors?: Record<string, string>;
|
|
1505
|
+
/** Why the task was excluded (infra/setup failure) — never silently dropped. */
|
|
1506
|
+
error?: string;
|
|
1507
|
+
}
|
|
1508
|
+
interface BenchmarkStrategySummary {
|
|
1509
|
+
/** Mean verifier score (0..1). */
|
|
1510
|
+
score: number;
|
|
1511
|
+
/** Fraction of tasks fully resolved. */
|
|
1512
|
+
resolved: number;
|
|
1513
|
+
/** Mean cost vector per task. */
|
|
1514
|
+
usd: number;
|
|
1515
|
+
ms: number;
|
|
1516
|
+
}
|
|
1517
|
+
interface BenchmarkReport {
|
|
1518
|
+
n: number;
|
|
1519
|
+
excluded: number;
|
|
1520
|
+
/** Per-strategy means (keyed by strategy.name). */
|
|
1521
|
+
perStrategy: Record<string, BenchmarkStrategySummary>;
|
|
1522
|
+
/** The full per-task × per-strategy table — the LOSSES an optimizer (GEPA, a
|
|
1523
|
+
* strategy-author, an operator) consumes. Includes errored tasks with the reason. */
|
|
1524
|
+
perTask: BenchmarkTaskRow[];
|
|
1525
|
+
/** The non-dominated strategies on (score ↑, $/task ↓) — collapse-last, per the canon:
|
|
1526
|
+
* a strategy that ties on score at half the cost WINS and a scalar would hide it. */
|
|
1527
|
+
pareto: string[];
|
|
1528
|
+
/** The headline when both `refine` and `sample` ran: paired-bootstrap lift of refine over sample. */
|
|
1529
|
+
refineVsSample?: BenchmarkLift;
|
|
1530
|
+
}
|
|
1531
|
+
/** Run the requested strategies over the tasks, scored by the Environment's own check.
|
|
1532
|
+
* Resilient: a task whose rollouts fail (transient infra) is excluded from the stats but
|
|
1533
|
+
* reported in `perTask` with the error — never silently dropped. */
|
|
1534
|
+
declare function runBenchmark(cfg: BenchmarkConfig): Promise<BenchmarkReport>;
|
|
1535
|
+
/** Pretty-print a report — the "free optimization" verdict, with the cost vector. */
|
|
1536
|
+
declare function printBenchmarkReport(report: BenchmarkReport): void;
|
|
1537
|
+
|
|
1538
|
+
/**
|
|
1539
|
+
* createMcpEnvironment — wrap any MCP server as an `Environment` (the product-adoption
|
|
1540
|
+
* primitive: a product's agent tools are usually already an MCP surface, so the domain
|
|
1541
|
+
* only writes the lifecycle hooks — open a scoped artifact, score it with a deployable
|
|
1542
|
+
* check, close it — and the tool plumbing is derived from the server).
|
|
1543
|
+
*
|
|
1544
|
+
* What the helper owns (the generic 80%, hardened on the EnterpriseOps gym):
|
|
1545
|
+
* - JSON-RPC `tools/list` → `AgenticTool[]`, with schemas coerced to the
|
|
1546
|
+
* OpenAI-tool-valid shape (top-level oneOf/anyOf/allOf/enum/not are rejected by
|
|
1547
|
+
* tool-calling providers; nested combinators are fine).
|
|
1548
|
+
* - JSON-RPC `tools/call` → the tool's text content (errors surfaced as `ERROR: …`
|
|
1549
|
+
* strings — a bad call is the agent's outcome, not an infra fault).
|
|
1550
|
+
* - SSE response parsing (streamable-HTTP MCP servers answer with `data:` lines).
|
|
1551
|
+
* - Bounded retry with backoff on thrown fetches (transient network ≠ task failure).
|
|
1552
|
+
*
|
|
1553
|
+
* What the domain supplies: `open` (create/seed the per-task artifact and return its
|
|
1554
|
+
* MCP endpoint — url + headers carry the per-artifact scoping, e.g. a database id
|
|
1555
|
+
* header), `score` (the deployable check), and optional `close`/`selectTools`.
|
|
1556
|
+
*/
|
|
1557
|
+
|
|
1558
|
+
/** Where a handle's MCP server lives; headers carry per-artifact scoping. */
|
|
1559
|
+
interface McpEndpoint {
|
|
1560
|
+
url: string;
|
|
1561
|
+
headers?: Record<string, string>;
|
|
1562
|
+
}
|
|
1563
|
+
interface McpEnvironmentOptions {
|
|
1564
|
+
name: string;
|
|
1565
|
+
/** Create/seed the per-task artifact; return its handle + the MCP endpoint scoped to it. */
|
|
1566
|
+
open(task: AgenticTask): Promise<{
|
|
1567
|
+
handle: ArtifactHandle;
|
|
1568
|
+
endpoint: McpEndpoint;
|
|
1569
|
+
}>;
|
|
1570
|
+
/** The deployable check over the artifact's current state. */
|
|
1571
|
+
score(task: AgenticTask, handle: ArtifactHandle): Promise<SurfaceScore>;
|
|
1572
|
+
/** Teardown (delete the seeded artifact). Optional — omit for stateless servers. */
|
|
1573
|
+
close?(handle: ArtifactHandle): Promise<void>;
|
|
1574
|
+
/** Restrict/order the server's tools per task (e.g. the task's selected_tools). Default: all. */
|
|
1575
|
+
selectTools?(task: AgenticTask, all: AgenticTool[]): AgenticTool[];
|
|
1576
|
+
/** Cap on a tool result's text fed back to the worker. Default 1500 chars. */
|
|
1577
|
+
maxResultChars?: number;
|
|
1578
|
+
}
|
|
1579
|
+
declare function createMcpEnvironment(opts: McpEnvironmentOptions): Environment;
|
|
1580
|
+
|
|
957
1581
|
/**
|
|
958
1582
|
* @experimental
|
|
959
1583
|
*
|
|
@@ -1175,7 +1799,7 @@ declare class FileCorpus implements Corpus {
|
|
|
1175
1799
|
* An empty query result returns a fresh COPY of the profile with no instruction change (a valid
|
|
1176
1800
|
* "nothing learned yet" read, not an error).
|
|
1177
1801
|
*/
|
|
1178
|
-
declare function renderCorpusToInstructions(opts: RenderCorpusToInstructionsOptions): Promise<AgentProfile
|
|
1802
|
+
declare function renderCorpusToInstructions(opts: RenderCorpusToInstructionsOptions): Promise<AgentProfile>;
|
|
1179
1803
|
|
|
1180
1804
|
/**
|
|
1181
1805
|
* @experimental
|
|
@@ -1282,6 +1906,64 @@ declare function trajectoryReport(journal: SpawnJournal, blobs: ResultBlobStore,
|
|
|
1282
1906
|
*/
|
|
1283
1907
|
declare function equalKOnCost(arms: ReadonlyArray<EqualKArm>, options?: EqualKOnCostOptions): EqualKVerdict;
|
|
1284
1908
|
|
|
1909
|
+
interface PromotionGateOptions {
|
|
1910
|
+
/** The HOLDOUT report — must carry per-task cells for both strategy names. */
|
|
1911
|
+
report: BenchmarkReport;
|
|
1912
|
+
/** The incumbent champion's strategy name. */
|
|
1913
|
+
incumbent: string;
|
|
1914
|
+
/** The challenger's strategy name. */
|
|
1915
|
+
candidate: string;
|
|
1916
|
+
/** 'superiority' (default): the candidate must score significantly BETTER.
|
|
1917
|
+
* 'non-inferiority': the candidate must prove its score is not worse than the
|
|
1918
|
+
* incumbent by more than `scoreTolerance` AND its cost savings are significant —
|
|
1919
|
+
* the gate for "same quality, cheaper" claims. */
|
|
1920
|
+
mode?: 'superiority' | 'non-inferiority';
|
|
1921
|
+
/** non-inferiority: the score CI lower bound must clear −scoreTolerance. Default 0.05. */
|
|
1922
|
+
scoreTolerance?: number;
|
|
1923
|
+
/** The CI lower bound on the paired lift must EXCEED this (score scale). Default 0. */
|
|
1924
|
+
deltaThreshold?: number;
|
|
1925
|
+
/** Minimum paired tasks before significance can be claimed. Default 6 — below that
|
|
1926
|
+
* the bootstrap CI is too wide to separate a real lift from the per-task noise. */
|
|
1927
|
+
minPairedTasks?: number;
|
|
1928
|
+
/** Bootstrap statistic over the paired deltas. Default 'mean'. */
|
|
1929
|
+
statistic?: 'mean' | 'median';
|
|
1930
|
+
/** Fixed by the substrate by default — the same report always yields the same verdict. */
|
|
1931
|
+
seed?: number;
|
|
1932
|
+
resamples?: number;
|
|
1933
|
+
}
|
|
1934
|
+
interface PromotionVerdict {
|
|
1935
|
+
promoted: boolean;
|
|
1936
|
+
reason: 'identical-champion' | 'few-tasks' | 'no-margin' | 'significant' | 'non-inferior-and-cheaper' | 'non-inferiority-unproven' | 'not-cheaper';
|
|
1937
|
+
mode: 'superiority' | 'non-inferiority';
|
|
1938
|
+
/** Paired tasks that carried both strategies' cells. */
|
|
1939
|
+
n: number;
|
|
1940
|
+
/** Paired (candidate − incumbent) lift across the holdout tasks. */
|
|
1941
|
+
lift: {
|
|
1942
|
+
mean: number;
|
|
1943
|
+
median: number;
|
|
1944
|
+
low: number;
|
|
1945
|
+
high: number;
|
|
1946
|
+
};
|
|
1947
|
+
/** non-inferiority mode: paired (incumbent − candidate) cost SAVINGS per task (usd) —
|
|
1948
|
+
* positive means the candidate is cheaper; significant iff the CI low clears zero. */
|
|
1949
|
+
costSavings?: {
|
|
1950
|
+
mean: number;
|
|
1951
|
+
median: number;
|
|
1952
|
+
low: number;
|
|
1953
|
+
high: number;
|
|
1954
|
+
};
|
|
1955
|
+
/** Paired (candidate − incumbent) wall-clock per task (ms) — negative = the candidate
|
|
1956
|
+
* is FASTER. Informational in every mode (never gates); the latency answer to "what
|
|
1957
|
+
* does this win actually cost the user?". */
|
|
1958
|
+
latency?: {
|
|
1959
|
+
mean: number;
|
|
1960
|
+
median: number;
|
|
1961
|
+
low: number;
|
|
1962
|
+
high: number;
|
|
1963
|
+
};
|
|
1964
|
+
}
|
|
1965
|
+
declare function promotionGate(opts: PromotionGateOptions): PromotionVerdict;
|
|
1966
|
+
|
|
1285
1967
|
/**
|
|
1286
1968
|
* Bridge a finished `runLoop` into an agent-eval campaign / profile-matrix
|
|
1287
1969
|
* dispatch.
|
|
@@ -1711,6 +2393,284 @@ interface OpenSandboxRunOptions {
|
|
|
1711
2393
|
*/
|
|
1712
2394
|
declare function openSandboxRun<Out>(client: SandboxClient, options: OpenSandboxRunOptions, deliverable: Deliverable<Out>): Promise<SandboxRun<Out>>;
|
|
1713
2395
|
|
|
2396
|
+
/**
|
|
2397
|
+
* authorStrategy — the agent-authored layer as a package primitive (software-3.0): an
|
|
2398
|
+
* LLM reads a benchmark's per-task LOSSES + the defineStrategy contract and writes a NEW
|
|
2399
|
+
* optimization strategy as code; the caller gates it like any human-built candidate
|
|
2400
|
+
* (runBenchmark + a frozen holdout).
|
|
2401
|
+
*
|
|
2402
|
+
* Structurally safe by construction: the authored body composes shot()/critique() and
|
|
2403
|
+
* spends through the Supervisor's conserved pool — it can be wrong, but it cannot
|
|
2404
|
+
* Goodhart the check (it never sees the verifiers) and it cannot win by overspending.
|
|
2405
|
+
*
|
|
2406
|
+
* The authored module is written to `outDir` and dynamically imported — run under a
|
|
2407
|
+
* TS-capable loader (tsx) since models often emit type annotations.
|
|
2408
|
+
*/
|
|
2409
|
+
|
|
2410
|
+
/** The compressed consumable a skill carries: everything an author needs to emit a loop. */
|
|
2411
|
+
declare const strategyAuthorContract = "\nYou author an OPTIMIZATION STRATEGY for an agentic loop system. A strategy decides how to\nspend a compute budget to beat a task's deployable check. You compose exactly two steps:\n\n shot(spec?: { handle?, messages?, steer?, persona?, tools? }): Promise<ShotResult | null>\n Runs ONE worker attempt (a bounded tool loop) over an artifact.\n - omit handle => the shot opens its OWN fresh artifact and closes it after (a sample).\n - pass handle => the shot CONTINUES that artifact (state accumulates across shots).\n - messages => the carried conversation (pass the previous ShotResult.messages to continue).\n - steer => a corrective instruction injected before the shot.\n - persona => { systemPrompt?, model? } \u2014 give THIS shot its own role and/or model\n (multi-agent strategies: a researcher shot then an engineer shot, a panel of k\n personas over one budget). On a fresh shot the systemPrompt replaces the task's; on\n a carried conversation it arrives as a hand-off message. Same conserved budget.\n - tools => string[] \u2014 restrict THIS shot to a subset of the task's tools by\n name (focus an explore shot on read-only tools, an execute shot on write tools).\n Restriction-only; unknown names make the shot fail. ALWAYS select from\n await listTools(handle) \u2014 never hardcode. Omitted => the shot sees every tool.\n ShotResult = { messages, score (0..1 on the task's check), passes, total, completions, toolErrors }\n Returns null if the attempt failed infra-wise.\n\n critique(messages): Promise<string | null>\n A firewalled trace-analyst reads the attempt's trajectory and returns ONE corrective\n instruction (or null when it judges the work complete). 
Costs ~1 completion.\n\n consult(messages, instruction): Promise<string | null>\n The RAW analyst channel: the same firewalled critic answers YOUR instruction over the\n trajectory verbatim (no reformatting) \u2014 use it when you need a specific reply format\n (a decision, a prediction). Costs ~1 completion.\n\n surface.open(task) / surface.close(handle)\n Open a persistent artifact you manage yourself (remember to close in a finally).\n close is idempotent \u2014 closing an already-closed handle is a safe no-op.\n\n listTools(handle): Promise<Array<{ name, description? }>>\n The tools THIS task actually offers. TOOL SETS VARY PER TASK \u2014 if you restrict a\n shot with `tools`, you MUST pick names from await listTools(handle); hardcoding\n names from an example kills your shots on every task whose tools differ.\n\nRules:\n- ALWAYS await every shot/critique/surface call \u2014 a floating promise that rejects\n crashes the whole benchmark run.\n- Stay within ~budget total shots; every shot/critique spends from a conserved pool.\n- For a FRESH attempt OMIT `messages` entirely (never pass `[]` \u2014 an empty array is a\n fresh conversation too, but be explicit). To CONTINUE, pass the previous\n ShotResult.messages unchanged.\n- Return { score, resolved, completions, progression, shots } \u2014 score = the BEST checkpoint\n you reached (keep-best, never final-state), progression = score after each shot.\n- The module must be EXACTLY this shape (no other imports, no commentary outside code):\n\nimport { defineStrategy } from '@tangle-network/agent-runtime/loops'\nexport default defineStrategy('your-strategy-name', async ({ surface, task, budget, shot, critique, listTools }) => {\n // your composition (listTools comes from the destructured context \u2014 it is NOT a global)\n})\n";
|
|
2412
|
+
interface AuthorStrategyOptions {
|
|
2413
|
+
/** The model-call seam (agent-eval `createChatClient`). */
|
|
2414
|
+
chat: ChatClient;
|
|
2415
|
+
model?: string;
|
|
2416
|
+
/** A NAMED fallback author tried once when the primary call fails or returns no code
|
|
2417
|
+
* block (thinking models time out at the edge on long authoring prompts, or return
|
|
2418
|
+
* empty content without `maxTokens`). Opt-in — absent means the primary's failure
|
|
2419
|
+
* propagates. */
|
|
2420
|
+
fallbackModel?: string;
|
|
2421
|
+
/** The contract text shown to the author. Default `strategyAuthorContract`. The
|
|
2422
|
+
* meta-optimization coordinate: a GEPA/skill loop can evolve this text and gate each
|
|
2423
|
+
* variant on the same frozen holdout as any strategy. */
|
|
2424
|
+
contract?: string;
|
|
2425
|
+
/** The environment the losses came from (orientation only — never the verifiers). */
|
|
2426
|
+
environmentName: string;
|
|
2427
|
+
/** The per-task losses table (e.g. JSON.stringify(report.perTask)) — the gradient. */
|
|
2428
|
+
lossesJson: string;
|
|
2429
|
+
/** The budget the strategy must respect (shots/width). */
|
|
2430
|
+
budget: number;
|
|
2431
|
+
/** Where the authored module file is written (created if missing). */
|
|
2432
|
+
outDir: string;
|
|
2433
|
+
temperature?: number;
|
|
2434
|
+
/** Completion cap — required by thinking-model authors that stream reasoning first. */
|
|
2435
|
+
maxTokens?: number;
|
|
2436
|
+
signal?: AbortSignal;
|
|
2437
|
+
}
|
|
2438
|
+
/** Static CONTRACT lint over an authored strategy module — the module-boundary
|
|
2439
|
+
* enforcement of the harness's two measurement invariants:
|
|
2440
|
+
* - author blindness: the only import allowed is the loops surface. A body that could
|
|
2441
|
+
* reach the filesystem, network, or process could read or mutate verifier/artifact
|
|
2442
|
+
* state outside the brokered shots, and the harness-verified score would stop
|
|
2443
|
+
* meaning "what the shots achieved".
|
|
2444
|
+
* - conserved dose: no out-of-band compute (fetch/require/eval) — every unit a
|
|
2445
|
+
* strategy spends is metered by the Supervisor's pool, which is what makes
|
|
2446
|
+
* equal-budget comparisons between strategies valid.
|
|
2447
|
+
* A lint, not a sandbox: its job is keeping the benchmark numbers interpretable. */
|
|
2448
|
+
declare function assertStrategyContract(code: string): void;
|
|
2449
|
+
interface AuthoredStrategy {
|
|
2450
|
+
strategy: Strategy;
|
|
2451
|
+
file: string;
|
|
2452
|
+
code: string;
|
|
2453
|
+
}
|
|
2454
|
+
/** Author + load a strategy from losses. Throws when the author emits no loadable module;
|
|
2455
|
+
* with `fallbackModel` set, the named fallback gets one attempt first. */
|
|
2456
|
+
declare function authorStrategy(opts: AuthorStrategyOptions): Promise<AuthoredStrategy>;
|
|
2457
|
+
|
|
2458
|
+
/**
|
|
2459
|
+
* runStrategyEvolution — the multi-generation strategy search: per generation the system
|
|
2460
|
+
* authors a POPULATION of candidate strategies from the current tournament's losses,
|
|
2461
|
+
* plays them against the incumbent at equal budget, and advances a champion; one final
|
|
2462
|
+
* promotion decision runs on a NEVER-BEFORE-USED holdout slice through `promotionGate`.
|
|
2463
|
+
*
|
|
2464
|
+
* Measurement invariants (the reasons this design is shaped the way it is):
|
|
2465
|
+
* - The author sees TRAIN losses only. The holdout slice is drawn fresh (disjoint task
|
|
2466
|
+
* offsets) after all authoring is done — one promotion decision, one untouched slice,
|
|
2467
|
+
* so adaptive reuse of evaluation data never enters the verdict.
|
|
2468
|
+
* - Every tournament runs at the same per-strategy budget through the conserved pool;
|
|
2469
|
+
* candidates cannot win by overspending.
|
|
2470
|
+
* - Champion selection within the search is a SEARCH policy (configurable, default
|
|
2471
|
+
* cost-aware: ties on score go to the cheapest strategy — a scalar hides a strategy
|
|
2472
|
+
* that ties at half the cost). The promotion verdict never comes from search
|
|
2473
|
+
* selection; it comes from the gate on the fresh slice.
|
|
2474
|
+
* - Every authored artifact's description length (gzip bits) is recorded, so the
|
|
2475
|
+
* artifact-complexity-vs-holdout-gap relation is analyzable from any run's report.
|
|
2476
|
+
*
|
|
2477
|
+
* Lineage fields (`parent`, `generation`) are recorded on every archive node so a
|
|
2478
|
+
* descendant-productivity parent-selection policy can be added without changing the
|
|
2479
|
+
* report schema; the v1 search authors from the latest tournament's losses.
|
|
2480
|
+
*/
|
|
2481
|
+
|
|
2482
|
+
interface EvolutionAuthor {
|
|
2483
|
+
/** The model-call seam (agent-eval `createChatClient`). */
|
|
2484
|
+
chat: ChatClient;
|
|
2485
|
+
model?: string;
|
|
2486
|
+
fallbackModel?: string;
|
|
2487
|
+
temperature?: number;
|
|
2488
|
+
maxTokens?: number;
|
|
2489
|
+
}
|
|
2490
|
+
type ChampionPolicy = 'score' | 'costAware';
|
|
2491
|
+
interface StrategyEvolutionConfig {
|
|
2492
|
+
environment: Environment;
|
|
2493
|
+
/** Task supply by DISJOINT slice: `(offset, n)` must return n tasks unique to that
|
|
2494
|
+
* offset range. Train draws [0, trainN); the holdout draws [trainN + holdoutOffset,
|
|
2495
|
+
* …) — tasks the search never touched. */
|
|
2496
|
+
tasks: (offset: number, n: number) => Promise<AgenticTask[]>;
|
|
2497
|
+
trainN: number;
|
|
2498
|
+
holdoutN: number;
|
|
2499
|
+
/** Extra offset past the train slice for the holdout draw (rotate across runs). */
|
|
2500
|
+
holdoutOffset?: number;
|
|
2501
|
+
worker: AgenticOptions;
|
|
2502
|
+
author: EvolutionAuthor;
|
|
2503
|
+
/** Rollouts (sample) / shots (refine) per strategy per task. Default 3. */
|
|
2504
|
+
budget?: number;
|
|
2505
|
+
concurrency?: number;
|
|
2506
|
+
/** Author→tournament rounds after gen0. Default 2. */
|
|
2507
|
+
generations?: number;
|
|
2508
|
+
/** Authored candidates per generation. Default 2. */
|
|
2509
|
+
populationSize?: number;
|
|
2510
|
+
/** The gen0 field. Default [sample, refine, sampleThenRefine]. */
|
|
2511
|
+
baselines?: Strategy[];
|
|
2512
|
+
/** What "better" means for PROMOTION. 'score' (default): the candidate must beat the
|
|
2513
|
+
* incumbent's score (superiority gate). 'cost': the candidate must prove score
|
|
2514
|
+
* NON-INFERIORITY (not worse by more than `scoreTolerance`) plus significant cost
|
|
2515
|
+
* savings — the "same quality, cheaper" objective. The author is told the objective
|
|
2516
|
+
* and sees per-task spend either way. */
|
|
2517
|
+
objective?: 'score' | 'cost';
|
|
2518
|
+
/** Cost objective: the non-inferiority margin — the lower bound of the score-difference CI must clear −scoreTolerance. Default 0.05. */
|
|
2519
|
+
scoreTolerance?: number;
|
|
2520
|
+
/** Search-side champion selection. Default 'costAware'. */
|
|
2521
|
+
champion?: ChampionPolicy;
|
|
2522
|
+
/** Score band treated as a tie under 'costAware'. Default 0.01. */
|
|
2523
|
+
championEpsilon?: number;
|
|
2524
|
+
/** Where authored modules are written. */
|
|
2525
|
+
outDir: string;
|
|
2526
|
+
/** Promotion-gate evidence floor (paired holdout tasks). */
|
|
2527
|
+
minPairedTasks?: number;
|
|
2528
|
+
/** BAND-AWARE scoring — concentrate the measurement where lift is possible.
|
|
2529
|
+
* Holdout: draw `holdoutPoolN` candidate tasks and run `baselines[0]` once at the run
|
|
2530
|
+
* budget as an INDEPENDENT reference screen; keep tasks scoring ≤ `maxRefScore`
|
|
2531
|
+
* (headroom exists) and take the first `holdoutN`. Band membership is decided before
|
|
2532
|
+
* either finalist touches a task and both finalists then face the SAME tasks — the
|
|
2533
|
+
* estimand becomes "paired lift on headroom tasks", pre-registered by this config.
|
|
2534
|
+
* Train: champion selection ignores zero-spread tasks (every field strategy scored
|
|
2535
|
+
* identically — zero selection information, pure noise dilution). */
|
|
2536
|
+
band?: {
|
|
2537
|
+
holdoutPoolN: number;
|
|
2538
|
+
/** Keep holdout tasks where the reference scores ≤ this. Default 0.99 — drop only
|
|
2539
|
+
* tasks the reference already solves fully (no headroom, a candidate can only tie). */
|
|
2540
|
+
maxRefScore?: number;
|
|
2541
|
+
};
|
|
2542
|
+
/** What the author learns from a tournament. 'exact' (default) = scores + progressions
|
|
2543
|
+
* per task; 'binary' = pass/fail only — the leakage-bounded channel (one bit per cell
|
|
2544
|
+
* per generation reaches the author from the evaluation data). */
|
|
2545
|
+
lossesDetail?: 'exact' | 'binary';
|
|
2546
|
+
/** Reproducer certification (arXiv:2606.11045): when the final champion is AUTHORED,
|
|
2547
|
+
* compress it to a short natural-language summary, have a fresh author re-implement
|
|
2548
|
+
* from the summary alone (no losses, no code), and score the reproduction on the same
|
|
2549
|
+
* holdout. A reproduction gap is an overfitting signal (their detector: 100%
|
|
2550
|
+
* sensitivity / 91% specificity in the ML-agent setting) — recorded on the report,
|
|
2551
|
+
* never gate-blocking in v1. */
|
|
2552
|
+
reproducerCheck?: {
|
|
2553
|
+
/** Word budget for the strategy summary. Default 64. */
|
|
2554
|
+
summaryMaxWords?: number;
|
|
2555
|
+
/** Reproduction counts as faithful when its holdout score ≥ the champion's holdout score − tolerance.
|
|
2556
|
+
* Default 0.05. */
|
|
2557
|
+
tolerance?: number;
|
|
2558
|
+
};
|
|
2559
|
+
/** Endurance: write the run state after every completed phase; with `resume`, a
|
|
2560
|
+
* restart skips completed phases (authored modules re-imported from their files).
|
|
2561
|
+
* Worst case after a mid-run death is re-paying ONE phase, never the run. */
|
|
2562
|
+
checkpoint?: {
|
|
2563
|
+
path: string;
|
|
2564
|
+
resume?: boolean;
|
|
2565
|
+
};
|
|
2566
|
+
/** Called before each benchmark phase (gen0, gen1…, band-screen, holdout, reproduce).
|
|
2567
|
+
* The seam for environment recycling — no artifacts span phases, so a runner may
|
|
2568
|
+
* recreate a wedge-prone environment container here. */
|
|
2569
|
+
onPhase?: (phase: string) => Promise<void>;
|
|
2570
|
+
onTask?: (phase: string, row: BenchmarkTaskRow, done: number, total: number) => void;
|
|
2571
|
+
hooks?: RuntimeHooks;
|
|
2572
|
+
}
|
|
2573
|
+
interface ChampionPick {
|
|
2574
|
+
name: string;
|
|
2575
|
+
score: number;
|
|
2576
|
+
usd: number;
|
|
2577
|
+
}
|
|
2578
|
+
interface EvolutionCandidate {
|
|
2579
|
+
name: string;
|
|
2580
|
+
file?: string;
|
|
2581
|
+
gzipBits?: number;
|
|
2582
|
+
codeChars?: number;
|
|
2583
|
+
/** Present when this author attempt failed (recorded, never silent). */
|
|
2584
|
+
error?: string;
|
|
2585
|
+
}
|
|
2586
|
+
interface EvolutionGeneration {
|
|
2587
|
+
generation: number;
|
|
2588
|
+
candidates: EvolutionCandidate[];
|
|
2589
|
+
report: BenchmarkReport;
|
|
2590
|
+
champion: ChampionPick;
|
|
2591
|
+
}
|
|
2592
|
+
interface EvolutionArchiveNode {
|
|
2593
|
+
name: string;
|
|
2594
|
+
source: 'baseline' | 'authored';
|
|
2595
|
+
generation: number;
|
|
2596
|
+
/** The champion whose tournament losses this candidate was authored from. */
|
|
2597
|
+
parent?: string;
|
|
2598
|
+
gzipBits?: number;
|
|
2599
|
+
file?: string;
|
|
2600
|
+
/** Latest measured tournament result — 0 until the node's first tournament settles
|
|
2601
|
+
* (an authored node is created before its generation's benchmark runs). */
|
|
2602
|
+
score: number;
|
|
2603
|
+
usd: number;
|
|
2604
|
+
}
|
|
2605
|
+
interface ReproductionCheck {
|
|
2606
|
+
/** The compressed strategy description the reproducer implemented from. */
|
|
2607
|
+
summary: string;
|
|
2608
|
+
reproducedName: string;
|
|
2609
|
+
file?: string;
|
|
2610
|
+
championHoldoutScore: number;
|
|
2611
|
+
reproducedHoldoutScore: number;
|
|
2612
|
+
/** champion − reproduced (positive = the reproduction fell short). */
|
|
2613
|
+
gap: number;
|
|
2614
|
+
/** reproducedHoldoutScore ≥ championHoldoutScore − tolerance. A failed reproduction is an
|
|
2615
|
+
* overfitting signal: the champion's win did not fit through the summary. */
|
|
2616
|
+
reproducible: boolean;
|
|
2617
|
+
/** Infra failure during reproduction (distinct from a semantic reproduction failure). */
|
|
2618
|
+
error?: string;
|
|
2619
|
+
}
|
|
2620
|
+
interface EvolutionBandInfo {
|
|
2621
|
+
/** Tasks screened by the reference on the holdout pool. */
|
|
2622
|
+
screened: number;
|
|
2623
|
+
/** Tasks kept (reference score ≤ maxRefScore) before truncating to holdoutN. */
|
|
2624
|
+
inBand: number;
|
|
2625
|
+
/** Reference scores per screened task (the screening record). */
|
|
2626
|
+
refScores: Array<{
|
|
2627
|
+
taskId: string;
|
|
2628
|
+
score: number;
|
|
2629
|
+
}>;
|
|
2630
|
+
}
|
|
2631
|
+
interface EvolutionReport {
|
|
2632
|
+
gen0: BenchmarkReport;
|
|
2633
|
+
gen0Champion: ChampionPick;
|
|
2634
|
+
generations: EvolutionGeneration[];
|
|
2635
|
+
archive: EvolutionArchiveNode[];
|
|
2636
|
+
finalChampion: ChampionPick;
|
|
2637
|
+
holdout: BenchmarkReport;
|
|
2638
|
+
verdict: PromotionVerdict;
|
|
2639
|
+
/** Present when band screening ran — the verdict's estimand is then "paired lift on
|
|
2640
|
+
* headroom tasks" (band membership fixed by the reference screen, pre-registered). */
|
|
2641
|
+
band?: EvolutionBandInfo;
|
|
2642
|
+
/** Present when reproducerCheck ran (final champion was authored). */
|
|
2643
|
+
reproduction?: ReproductionCheck;
|
|
2644
|
+
/** SEARCH TELEMETRY, not evidence: each entry is that generation's own train-slice
|
|
2645
|
+
* re-measurement, so cross-generation deltas mix true drift with run-to-run variance
|
|
2646
|
+
* (entries are unpaired across generations). The only evidence-grade comparison in
|
|
2647
|
+
* this report is `verdict` — both finalists measured fresh, paired, on the holdout. */
|
|
2648
|
+
trajectory: Array<{
|
|
2649
|
+
generation: number;
|
|
2650
|
+
champion: string;
|
|
2651
|
+
score: number;
|
|
2652
|
+
usd: number;
|
|
2653
|
+
}>;
|
|
2654
|
+
}
|
|
2655
|
+
/** Strategy means recomputed over the DISCRIMINATING tasks only — tasks where the field
|
|
2656
|
+
* strategies did not all score identically. Zero-spread tasks (everyone 1.0, everyone
|
|
2657
|
+
* 0.0, everyone tied) carry no selection information; averaging over them dilutes real
|
|
2658
|
+
* differences toward zero. Search-side denoising only — the gate never uses this. */
|
|
2659
|
+
declare function discriminatingMeans(report: BenchmarkReport, fieldOrder: string[]): Record<string, {
|
|
2660
|
+
score: number;
|
|
2661
|
+
usd: number;
|
|
2662
|
+
}> | null;
|
|
2663
|
+
/** The champion pick over a means table. 'score' takes the best mean score (ties →
|
|
2664
|
+
* field order). 'costAware' treats scores within `epsilon` of the best as tied and
|
|
2665
|
+
* takes the cheapest — the (score, $) Pareto rule collapsed to one pick. */
|
|
2666
|
+
declare function pickChampion(means: Record<string, {
|
|
2667
|
+
score: number;
|
|
2668
|
+
usd: number;
|
|
2669
|
+
}>, fieldOrder: string[], policy: ChampionPolicy, epsilon: number): ChampionPick;
|
|
2670
|
+
/** Search-side champion selection over a tournament report. */
|
|
2671
|
+
declare function selectChampion(report: BenchmarkReport, fieldOrder: string[], policy: ChampionPolicy, epsilon: number): ChampionPick;
|
|
2672
|
+
declare function runStrategyEvolution(cfg: StrategyEvolutionConfig): Promise<EvolutionReport>;
|
|
2673
|
+
|
|
1714
2674
|
/**
|
|
1715
2675
|
* @experimental
|
|
1716
2676
|
*
|
|
@@ -1869,6 +2829,36 @@ interface BridgeSeam {
|
|
|
1869
2829
|
agentProfile?: Record<string, unknown>;
|
|
1870
2830
|
timeoutMs?: number;
|
|
1871
2831
|
}
|
|
2832
|
+
/** An OpenAI-shape function tool the model may call. */
|
|
2833
|
+
interface ToolSpec {
|
|
2834
|
+
type: 'function';
|
|
2835
|
+
function: {
|
|
2836
|
+
name: string;
|
|
2837
|
+
description?: string;
|
|
2838
|
+
parameters: unknown;
|
|
2839
|
+
};
|
|
2840
|
+
}
|
|
2841
|
+
/**
|
|
2842
|
+
* Router seam WITH tool use — the tool-using router backend. Same direct
|
|
2843
|
+
* OpenAI-compatible endpoint as `RouterSeam`, but each turn passes `tools`; when
|
|
2844
|
+
* the model emits tool_calls they run via `executeToolCall` ON THIS HOST and the
|
|
2845
|
+
* results fold back as `tool` messages, repeating until the model answers without
|
|
2846
|
+
* a tool or `maxTurns` is hit. A real agentic loop, OFF-BOX — no sandbox, so it
|
|
2847
|
+
* is unaffected by a box's egress allowlist. One turn = one completion = the
|
|
2848
|
+
* equal-compute unit. `executeToolCall` receives the task so per-task tool
|
|
2849
|
+
* surfaces (e.g. a gym keyed by task) can dispatch correctly.
|
|
2850
|
+
*/
|
|
2851
|
+
interface RouterToolsSeam {
|
|
2852
|
+
routerBaseUrl: string;
|
|
2853
|
+
routerKey: string;
|
|
2854
|
+
model?: string;
|
|
2855
|
+
tools: ReadonlyArray<ToolSpec>;
|
|
2856
|
+
executeToolCall: (name: string, args: Record<string, unknown>, task: unknown) => Promise<string>;
|
|
2857
|
+
/** Max inference turns. Default 200 (runaway backstop — set far above any
|
|
2858
|
+
* legitimate workflow). For tighter per-workflow limits use a cost budget
|
|
2859
|
+
* or wall-clock deadline at the call site. */
|
|
2860
|
+
maxTurns?: number;
|
|
2861
|
+
}
|
|
1872
2862
|
/**
|
|
1873
2863
|
* The single built-in executor entrypoint. The backend is DATA — the cost dial a
|
|
1874
2864
|
* profile, an experiment config, or a replay journal can name — not an import
|
|
@@ -1879,6 +2869,8 @@ interface BridgeSeam {
|
|
|
1879
2869
|
type ExecutorConfig = ({
|
|
1880
2870
|
backend: 'router';
|
|
1881
2871
|
} & RouterSeam) | ({
|
|
2872
|
+
backend: 'router-tools';
|
|
2873
|
+
} & RouterToolsSeam) | ({
|
|
1882
2874
|
backend: 'bridge';
|
|
1883
2875
|
} & BridgeSeam) | ({
|
|
1884
2876
|
backend: 'cli';
|
|
@@ -2016,6 +3008,47 @@ declare function createSupervisor<Task, Out>(): Supervisor<Task, Out>;
|
|
|
2016
3008
|
*/
|
|
2017
3009
|
declare function createRootHandle<Out>(): RootHandle<Out>;
|
|
2018
3010
|
|
|
3011
|
+
/**
|
|
3012
|
+
* createVerifierEnvironment — ANY checkable task as an `Environment`, no tool surface
|
|
3013
|
+
* required. The generalization piece: EOPS/commit0-style domains have tools that mutate
|
|
3014
|
+
* an external artifact, but math problems, legal drafts, creative briefs, GTM copy, and
|
|
3015
|
+
* QA tasks have a different shape — the artifact IS the worker's answer, and the domain
|
|
3016
|
+
* is defined by one function: the deployable check over that answer.
|
|
3017
|
+
*
|
|
3018
|
+
* const gsm8k = createVerifierEnvironment({
|
|
3019
|
+
* name: 'gsm8k',
|
|
3020
|
+
* check: (task, answer) => ({
|
|
3021
|
+
* passes: extractFinalNumber(answer) === task.meta?.answer ? 1 : 0,
|
|
3022
|
+
* total: 1,
|
|
3023
|
+
* errored: 0,
|
|
3024
|
+
* }),
|
|
3025
|
+
* })
|
|
3026
|
+
* await runBenchmark({ environment: gsm8k, tasks, worker }) // sample vs refine on math
|
|
3027
|
+
*
|
|
3028
|
+
* The worker gets one built-in tool — `submit_answer` — plus any read-only domain tools
|
|
3029
|
+
* the caller adds (a calculator, a retrieval call, a style guide lookup). Every
|
|
3030
|
+
* submission is kept; `score()` checks the BEST submission (keep-best is the measured
|
|
3031
|
+
* law: workers reach correct answers then revise past them). The refine strategy's
|
|
3032
|
+
* critic reads the submission trajectory like any other trace, so iterate-with-feedback
|
|
3033
|
+
* works unchanged on answer domains.
|
|
3034
|
+
*
|
|
3035
|
+
* The check can be graded (passes/total expresses partial credit — rubric points,
|
|
3036
|
+
* sub-answers, unit-test counts), and MUST be deployable (computable without an oracle
|
|
3037
|
+
* at serve time): exact/numeric match, schema validation, a compiled rubric — not a
|
|
3038
|
+
* peek at held-out labels the production system wouldn't have.
|
|
3039
|
+
*/
|
|
3040
|
+
|
|
3041
|
+
interface VerifierEnvironmentOptions {
|
|
3042
|
+
name: string;
|
|
3043
|
+
/** The deployable check over a submitted answer. Graded via passes/total. */
|
|
3044
|
+
check(task: AgenticTask, answer: string): Promise<SurfaceScore> | SurfaceScore;
|
|
3045
|
+
/** Extra domain tools (read-only helpers: calculator, retrieval, style lookup). */
|
|
3046
|
+
extraTools?: AgenticTool[];
|
|
3047
|
+
/** Executes the extra tools. Required when `extraTools` is set. */
|
|
3048
|
+
callExtra?(task: AgenticTask, name: string, args: Record<string, unknown>): Promise<string> | string;
|
|
3049
|
+
}
|
|
3050
|
+
declare function createVerifierEnvironment(opts: VerifierEnvironmentOptions): Environment;
|
|
3051
|
+
|
|
2019
3052
|
/** Command runner seam. Host code can use `localShell`; sandbox code can wrap `box.exec`. */
|
|
2020
3053
|
type Shell = (args: ReadonlyArray<string>, cwd?: string) => Promise<{
|
|
2021
3054
|
stdout: string;
|
|
@@ -2043,5 +3076,10 @@ interface GitWorkspaceOptions {
|
|
|
2043
3076
|
readonly noHooks?: boolean;
|
|
2044
3077
|
}
|
|
2045
3078
|
declare function gitWorkspace(opts: GitWorkspaceOptions): Workspace;
|
|
3079
|
+
/** A jj-backed `Workspace` (Jujutsu, colocated with git for the durable remote).
|
|
3080
|
+
* Same port, same `Shell` — a drop-in for `gitWorkspace`. jj suits agent loops:
|
|
3081
|
+
* no staging area, and a first-class operation log (native resume/undo). Live use
|
|
3082
|
+
* requires `jj` on the `Shell`'s host. */
|
|
3083
|
+
declare function jjWorkspace(opts: GitWorkspaceOptions): Workspace;
|
|
2046
3084
|
|
|
2047
|
-
export { Agent, AgentRunSpec, AgentSpec, type AssertTraceDerivedFindings, type BridgeSeam, Budget, type BudgetPool, type BudgetReadout, type CheckpointCapableBox, type CliSeam, type CombinatorShape, type Corpus, type CorpusFilter, type CorpusRecord, type CreateScopeAnalystOptions, type CriuCapableClient, type DefinePersona, type DefinePersonaInput, type Deliverable, type EqualKArm, type EqualKOnCost, type EqualKOnCostOptions, type EqualKVerdict, ExecCtx, type ExecutorConfig, ExecutorFactory, ExecutorRegistry, type Fanout, type FanoutOptions, type FanoutSynthesis, FileCorpus, FileResultBlobStore, FileSpawnJournal, type FlatWidenGate, type ForkCapableBox, type GitWorkspaceOptions, InMemoryCorpus, InMemoryResultBlobStore, InMemorySpawnJournal, Iteration, type LoopDispatchOptions, type LoopOptionsForDispatch, LoopResult, type LoopShape, LoopTokenUsage, type LoopUntil, type LoopUntilSpec, type LoopUntilState, NodeId, type Observation, type ObserveInput, type ObserveOptions, type OpenSandboxRunOptions, type Outcome, type Panel, type PanelJudge, type PanelSpec, type PanelVerdict, type Persona, type PersonaContext, type PersonaExecutors, type Pipeline, type PipelineStage, type RenderCorpusToInstructions, type RenderCorpusToInstructionsOptions, type ReservationTicket, ResultBlobStore, RootHandle, type RouterSeam, RunLoopOptions, type RunPersonified, type RunPersonifiedOptions, type SandboxCapabilities, SandboxClient, type SandboxLineage, type SandboxLineageHandle, type SandboxRun, type SandboxSeam, Scope, type ScopeAnalyst, type ScopeAnalyzeInput, type ScopeWidenGate, type SessionCapableBox, Settled, type ShapeBudget, type ShapeContext, type ShapeRegistry, type Shell, SpawnEvent, SpawnJournal, Spend, type SteerContext, SupervisedResult, Supervisor, type TrajectoryNode, type TrajectoryReport, type TrajectoryReportFn, type TrajectoryReportOptions, TreeView, type TurnResult, UsageEvent, type UsageSink, type Verify, type VerifySpec, type Widen, type WidenDecision, type 
WidenLineage, type WidenSpec, type Workspace, type WorkspaceCommit, acquireSandbox, assertTraceDerivedFindings, buildSteerContext, builtinShapes, contentAddress, createBudgetPool, createExecutor, createExecutorRegistry, createRootHandle, createSandboxLineage, createScope, createScopeAnalyst, createShapeRegistry, createSupervisor, definePersona, equalKOnCost, extractLlmCallEvent, fanout, flatWidenGate, gitWorkspace, inlineSandboxClient, localShell, loopDispatch, loopUntil, mapSandboxEvent, materializeTreeView, observe, openSandboxRun, panel, pipeline, probeSandboxCapabilities, registerShape, renderCorpusToInstructions, renderReport, replaySpawnTree, reportLoopUsage, runPersonified, settledToIteration, spendFromUsageEvents, trajectoryReport, verify, widen };
|
|
3085
|
+
export { Agent, AgentRunSpec, AgentSpec, type AgenticOptions, type AgenticRunResult, type AgenticSurface, type AgenticTask, type AgenticTool, type AnytimeReport, type AnytimeStrategySummary, type AnytimeTaskCurve, type ArtifactHandle, type AssertTraceDerivedFindings, type AuditIntentInput, type AuditIntentOptions, type AuthorStrategyOptions, type AuthoredStrategy, type BenchmarkCell, type BenchmarkConfig, type BenchmarkLift, type BenchmarkReport, type BenchmarkStrategySummary, type BenchmarkTaskRow, type BridgeSeam, Budget, type BudgetPool, type BudgetReadout, type ChampionPick, type ChampionPolicy, type CheckpointCapableBox, type CliSeam, type CombinatorShape, type Corpus, type CorpusFilter, type CorpusRecord, type CreateScopeAnalystOptions, type CriuCapableClient, type DefinePersona, type DefinePersonaInput, type Deliverable, type Environment, type EqualKArm, type EqualKOnCost, type EqualKOnCostOptions, type EqualKVerdict, type EvolutionArchiveNode, type EvolutionAuthor, type EvolutionBandInfo, type EvolutionCandidate, type EvolutionGeneration, type EvolutionReport, ExecCtx, type ExecutorConfig, ExecutorFactory, ExecutorRegistry, type Fanout, type FanoutOptions, type FanoutSynthesis, FileCorpus, FileResultBlobStore, FileSpawnJournal, type FlatWidenGate, type ForkCapableBox, type GitWorkspaceOptions, type HarvestCorpusOptions, type HarvestFailure, type HarvestReport, InMemoryCorpus, InMemoryResultBlobStore, InMemorySpawnJournal, type IntentAudit, Iteration, type LoopDispatchOptions, type LoopOptionsForDispatch, LoopResult, type LoopShape, LoopTokenUsage, type LoopUntil, type LoopUntilSpec, type LoopUntilState, type McpEndpoint, type McpEnvironmentOptions, NodeId, type Observation, type ObserveInput, type ObserveOptions, type OpenSandboxRunOptions, type Outcome, type Panel, type PanelJudge, type PanelSpec, type PanelVerdict, type Persona, type PersonaContext, type PersonaExecutors, type Pipeline, type PipelineStage, type PromotionGateOptions, type PromotionVerdict, 
type RenderCorpusToInstructions, type RenderCorpusToInstructionsOptions, type ReservationTicket, ResultBlobStore, RootHandle, type RouterSeam, type RouterToolsSeam, type RunAgenticOptions, RunLoopOptions, type RunPersonified, type RunPersonifiedOptions, type SandboxCapabilities, SandboxClient, type SandboxLineage, type SandboxLineageHandle, type SandboxRun, type SandboxSeam, Scope, type ScopeAnalyst, type ScopeAnalyzeInput, type ScopeWidenGate, type SessionCapableBox, Settled, type ShapeBudget, type ShapeContext, type ShapeRegistry, type Shell, type ShotPersona, type ShotSpec, SpawnEvent, SpawnJournal, Spend, type SteerContext, type Strategy, type StrategyCtx, type StrategyEvolutionConfig, type StrategyResult, SupervisedResult, Supervisor, type SurfaceScore, type ToolSpec, type TrajectoryNode, type TrajectoryReport, type TrajectoryReportFn, type TrajectoryReportOptions, TreeView, type TurnResult, UsageEvent, type UsageSink, type VerifierEnvironmentOptions, type Verify, type VerifySpec, type WaterfallCollector, type WaterfallReport, type WaterfallSpan, type Widen, type WidenDecision, type WidenLineage, type WidenSpec, type Workspace, type WorkspaceCommit, acquireSandbox, adaptiveRefine, anytimeReport, assertStrategyContract, assertTraceDerivedFindings, auditIntent, authorStrategy, breadthDriver, buildSteerContext, builtinShapes, contentAddress, createBudgetPool, createExecutor, createExecutorRegistry, createMcpEnvironment, createRootHandle, createSandboxLineage, createScope, createScopeAnalyst, createShapeRegistry, createSupervisor, createVerifierEnvironment, createWaterfallCollector, defaultAnalystInstruction, defaultAuditorInstruction, definePersona, defineStrategy, depthDriver, discriminatingMeans, equalKOnCost, extractLlmCallEvent, fanout, flatWidenGate, gitWorkspace, harvestCorpus, inlineSandboxClient, jjWorkspace, localShell, loopDispatch, loopUntil, mapSandboxEvent, materializeTreeView, observe, openSandboxRun, panel, pickChampion, pipeline, 
printBenchmarkReport, probeSandboxCapabilities, promotionGate, refine, registerShape, renderAnytimeTable, renderCorpusToInstructions, renderReport, replaySpawnTree, reportLoopUsage, runAgentic, runBenchmark, runPersonified, runStrategyEvolution, sample, sampleThenRefine, selectChampion, settledToIteration, spendFromUsageEvents, strategyAuthorContract, trajectoryReport, verify, widen };
|