@tangle-network/agent-runtime 0.48.0 → 0.50.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +79 -15
- package/dist/agent.d.ts +1 -1
- package/dist/agent.js +1 -1
- package/dist/analyst-loop.d.ts +1 -1
- package/dist/{chunk-656G2XCL.js → chunk-BKAIVNFA.js} +3 -3
- package/dist/{chunk-IW2LMLK6.js → chunk-CM2IK7VS.js} +913 -152
- package/dist/chunk-CM2IK7VS.js.map +1 -0
- package/dist/{chunk-VR4JIC5H.js → chunk-ML4IXGTV.js} +2 -2
- package/dist/{chunk-TJS7S3HJ.js → chunk-NDM5VXZW.js} +19 -8
- package/dist/chunk-NDM5VXZW.js.map +1 -0
- package/dist/chunk-OM3YNZIW.js +978 -0
- package/dist/chunk-OM3YNZIW.js.map +1 -0
- package/dist/{chunk-JNPK46YH.js → chunk-RHW75JW5.js} +498 -350
- package/dist/chunk-RHW75JW5.js.map +1 -0
- package/dist/{coder-CVZNGbyg.d.ts → coder-_YCf3BAK.d.ts} +2 -2
- package/dist/{driver-DYU2sgHr.d.ts → driver-DLI1io57.d.ts} +1 -1
- package/dist/index.d.ts +34 -9
- package/dist/index.js +117 -27
- package/dist/index.js.map +1 -1
- package/dist/kb-gate-CHAyt4aI.d.ts +1571 -0
- package/dist/{loop-runner-bin-DEm4roYF.d.ts → loop-runner-bin-DFUNgpeK.d.ts} +4 -4
- package/dist/loop-runner-bin.d.ts +5 -5
- package/dist/loop-runner-bin.js +3 -3
- package/dist/loops.d.ts +6 -6
- package/dist/loops.js +17 -1
- package/dist/mcp/bin.js +206 -29
- package/dist/mcp/bin.js.map +1 -1
- package/dist/mcp/index.d.ts +41 -177
- package/dist/mcp/index.js +40 -6
- package/dist/mcp/index.js.map +1 -1
- package/dist/openai-tools-D4HLDWgw.d.ts +45 -0
- package/dist/platform.js +2 -2
- package/dist/platform.js.map +1 -1
- package/dist/profiles.d.ts +2 -2
- package/dist/{run-loop-DvD4aGiE.d.ts → run-loop-BIineL1T.d.ts} +1 -1
- package/dist/runtime.d.ts +403 -24
- package/dist/runtime.js +17 -1
- package/dist/{types-BpDfCPUp.d.ts → types-5MGt5KTY.d.ts} +1 -1
- package/dist/{types-nBMuollC.d.ts → types-BEQsBhOE.d.ts} +1 -1
- package/dist/workflow.d.ts +2 -2
- package/dist/workflow.js +1 -1
- package/package.json +6 -5
- package/dist/chunk-IW2LMLK6.js.map +0 -1
- package/dist/chunk-JNPK46YH.js.map +0 -1
- package/dist/chunk-LX66I3SC.js +0 -218
- package/dist/chunk-LX66I3SC.js.map +0 -1
- package/dist/chunk-TJS7S3HJ.js.map +0 -1
- package/dist/kb-gate-51BlLlVM.d.ts +0 -529
- package/dist/otel-export-EzfsVUhh.d.ts +0 -191
- /package/dist/{chunk-656G2XCL.js.map → chunk-BKAIVNFA.js.map} +0 -0
- /package/dist/{chunk-VR4JIC5H.js.map → chunk-ML4IXGTV.js.map} +0 -0
package/dist/runtime.d.ts
CHANGED
|
@@ -1,16 +1,16 @@
|
|
|
1
1
|
import { AgentProfile, BackendType, CreateSandboxOptions, SandboxInstance, SandboxEvent } from '@tangle-network/sandbox';
|
|
2
2
|
export { AgentProfile, CreateSandboxOptions, SandboxEvent, SandboxInstance } from '@tangle-network/sandbox';
|
|
3
|
-
import { R as ResultBlobStore, a as SpawnJournal, N as NodeId, b as SpawnEvent, T as TreeView, c as Settled, d as AgentSpec, E as ExecutorRegistry, B as Budget, A as Agent, e as RootHandle, f as SupervisedResult, g as Spend, S as Scope, h as ExecutorFactory, U as UsageEvent, i as Supervisor } from './types-BpDfCPUp.js';
|
|
4
|
-
export { j as Executor, k as ExecutorContext, l as ExecutorResult, H as Handle, m as NodeSnapshot, n as NodeStatus, o as Restart, p as RootSignal, q as Runtime, r as SpawnOpts, s as SupervisorOpts, W as WidenGate } from './types-BpDfCPUp.js';
|
|
3
|
+
import { R as ResultBlobStore, a as SpawnJournal, N as NodeId, b as SpawnEvent, T as TreeView, c as Settled, d as AgentSpec, E as ExecutorRegistry, B as Budget, A as Agent, e as RootHandle, f as SupervisedResult, g as Spend, S as Scope, h as ExecutorFactory, U as UsageEvent, i as Supervisor } from './types-5MGt5KTY.js';
|
|
4
|
+
export { j as Executor, k as ExecutorContext, l as ExecutorResult, H as Handle, m as NodeSnapshot, n as NodeStatus, o as Restart, p as RootSignal, q as Runtime, r as SpawnOpts, s as SupervisorOpts, W as WidenGate } from './types-5MGt5KTY.js';
|
|
5
|
+
import { R as RuntimeHooks } from './runtime-hooks-C7JwKb9E.js';
|
|
5
6
|
import { ChatClient, AnalystFinding, DefaultVerdict, AgentProfile as AgentProfile$1 } from '@tangle-network/agent-eval';
|
|
6
7
|
export { DefaultVerdict } from '@tangle-network/agent-eval';
|
|
7
|
-
export { A as AnalyzeInput, a as CompletionAnalyst, b as CompletionEvidence, c as CompletionPolicy, d as CompletionVerdict, C as CreateDriverOptions, D as DriverDecision, P as PlannerContext, e as TopologyMove, T as TopologyPlanner, f as completionAuthorizes, g as createDriver, h as deterministicCompletion, r as renderAnalyses, s as sentinelCompletion, i as stopSentinel } from './driver-DYU2sgHr.js';
|
|
8
|
-
import { S as SandboxClient, b as LoopResult,
|
|
9
|
-
export { D as Driver,
|
|
8
|
+
export { A as AnalyzeInput, a as CompletionAnalyst, b as CompletionEvidence, c as CompletionPolicy, d as CompletionVerdict, C as CreateDriverOptions, D as DriverDecision, P as PlannerContext, e as TopologyMove, T as TopologyPlanner, f as completionAuthorizes, g as createDriver, h as deterministicCompletion, r as renderAnalyses, s as sentinelCompletion, i as stopSentinel } from './driver-DLI1io57.js';
|
|
9
|
+
import { S as SandboxClient, b as LoopResult, d as LoopTokenUsage, R as RuntimeStreamEvent, A as AgentRunSpec, E as ExecCtx, I as Iteration } from './types-BEQsBhOE.js';
|
|
10
|
+
export { D as Driver, F as LoopDecisionPayload, G as LoopEndedPayload, H as LoopIterationDispatchPayload, J as LoopIterationEndedPayload, M as LoopIterationStartedPayload, a as LoopLineageOptions, N as LoopPlanDescription, P as LoopPlanPayload, e as LoopSandboxPlacement, Q as LoopStartedPayload, T as LoopTeardownFailedPayload, f as LoopTraceEmitter, g as LoopTraceEvent, L as LoopWinner, O as OutputAdapter, U as ValidationCtx, V as Validator } from './types-BEQsBhOE.js';
|
|
10
11
|
import { Scenario, ProfileDispatchFn } from '@tangle-network/agent-eval/campaign';
|
|
11
|
-
import { R as RunLoopOptions } from './run-loop-DvD4aGiE.js';
|
|
12
|
-
export { c as createSandboxForSpec, d as defaultSelectWinner, r as runLoop } from './run-loop-DvD4aGiE.js';
|
|
13
|
-
import { R as RuntimeHooks } from './runtime-hooks-C7JwKb9E.js';
|
|
12
|
+
import { R as RunLoopOptions } from './run-loop-BIineL1T.js';
|
|
13
|
+
export { c as createSandboxForSpec, d as defaultSelectWinner, r as runLoop } from './run-loop-BIineL1T.js';
|
|
14
14
|
|
|
15
15
|
/**
|
|
16
16
|
* @experimental
|
|
@@ -113,6 +113,140 @@ declare function replaySpawnTree(journal: SpawnJournal, blobs: ResultBlobStore,
|
|
|
113
113
|
*/
|
|
114
114
|
declare function materializeTreeView(events: SpawnEvent[]): TreeView;
|
|
115
115
|
|
|
116
|
+
/**
|
|
117
|
+
* createWaterfallCollector — 100% trajectory observability from the lifecycle stream:
|
|
118
|
+
* every spawn/settle (shots, analysts, nested agents) becomes one timed, billed span.
|
|
119
|
+
* The sum of spans IS the run's cost story — what each step cost in dollars, tokens,
|
|
120
|
+
* and wall-clock, rendered as a text waterfall or exported as structured rows for any
|
|
121
|
+
* chart. Attach the collector's `hooks` to `runAgentic`/`runBenchmark`; spans accumulate
|
|
122
|
+
* across every task the hooks observe.
|
|
123
|
+
*/
|
|
124
|
+
|
|
125
|
+
interface WaterfallSpan {
|
|
126
|
+
id: string;
|
|
127
|
+
/** The spawn label (`shot:0`, `analyst:1`, a nested agent's label) — the row name. */
|
|
128
|
+
label: string;
|
|
129
|
+
runId: string;
|
|
130
|
+
parentId?: string;
|
|
131
|
+
startMs: number;
|
|
132
|
+
endMs?: number;
|
|
133
|
+
status: 'running' | 'done' | 'down';
|
|
134
|
+
usd: number;
|
|
135
|
+
tokens: {
|
|
136
|
+
input: number;
|
|
137
|
+
output: number;
|
|
138
|
+
};
|
|
139
|
+
score?: number;
|
|
140
|
+
}
|
|
141
|
+
interface WaterfallReport {
|
|
142
|
+
spans: WaterfallSpan[];
|
|
143
|
+
/** Wall-clock of the observed window (first spawn → last settle). */
|
|
144
|
+
totalMs: number;
|
|
145
|
+
totalUsd: number;
|
|
146
|
+
totalTokens: {
|
|
147
|
+
input: number;
|
|
148
|
+
output: number;
|
|
149
|
+
};
|
|
150
|
+
/** Rollup by label prefix (the part before ':') — shots vs analysts vs anything else. */
|
|
151
|
+
byKind: Record<string, {
|
|
152
|
+
count: number;
|
|
153
|
+
ms: number;
|
|
154
|
+
usd: number;
|
|
155
|
+
tokens: {
|
|
156
|
+
input: number;
|
|
157
|
+
output: number;
|
|
158
|
+
};
|
|
159
|
+
}>;
|
|
160
|
+
}
|
|
161
|
+
interface WaterfallCollector {
|
|
162
|
+
/** Attach these to RunAgenticOptions.hooks / BenchmarkConfig.hooks. */
|
|
163
|
+
hooks: RuntimeHooks;
|
|
164
|
+
report(): WaterfallReport;
|
|
165
|
+
/** The text waterfall — one row per span, bars scaled to the observed window. */
|
|
166
|
+
render(opts?: {
|
|
167
|
+
width?: number;
|
|
168
|
+
maxRows?: number;
|
|
169
|
+
}): string;
|
|
170
|
+
reset(): void;
|
|
171
|
+
}
|
|
172
|
+
declare function createWaterfallCollector(): WaterfallCollector;
|
|
173
|
+
|
|
174
|
+
/**
|
|
175
|
+
* anytimeReport — time-to-satisfactory-output metrics, derived entirely from the
|
|
176
|
+
* waterfall's spans (no new instrumentation): per task, the best-so-far score after each
|
|
177
|
+
* shot with its elapsed wall-clock and cumulative spend; per strategy, the standard
|
|
178
|
+
* anytime-optimization metrics:
|
|
179
|
+
*
|
|
180
|
+
* TTT time-to-target — elapsed ms until best-so-far ≥ the target (per task; median
|
|
181
|
+
* over tasks that reached it)
|
|
182
|
+
* STT shots-to-target — attempts until best-so-far ≥ target
|
|
183
|
+
* ERT expected running time (the COCO benchmarking convention): TOTAL time spent
|
|
184
|
+
* across all tasks — including failures' full budgets — divided by the number of
|
|
185
|
+
* tasks that reached the target. The honest "how long per success, all-in".
|
|
186
|
+
* AUC the anytime curve's area (mean best-so-far score across the budget, per shot
|
|
187
|
+
* index) — higher = climbs earlier.
|
|
188
|
+
*
|
|
189
|
+
* The "satisfactory" bar follows the COCO/BBOB convention: a SET of satisficing targets
|
|
190
|
+
* (e.g. [0.5, 0.8, 1.0] on the normalized check score), each measured independently —
|
|
191
|
+
* runtime-to-target per (task, target) pair — optionally overridden per task
|
|
192
|
+
* (`targetFor`) when satisfaction is task-specific. Spans come from
|
|
193
|
+
* `createWaterfallCollector().report()`; tasks are grouped by the supervisor runId
|
|
194
|
+
* (`agentic:<strategy>:<taskId>`); shot spans are `shot:N` labels.
|
|
195
|
+
*/
|
|
196
|
+
|
|
197
|
+
interface AnytimeTaskCurve {
|
|
198
|
+
taskId: string;
|
|
199
|
+
strategy: string;
|
|
200
|
+
/** Best-so-far after each settled shot: elapsed ms from the task's first spawn,
|
|
201
|
+
* cumulative usd, and the running max score. */
|
|
202
|
+
points: Array<{
|
|
203
|
+
elapsedMs: number;
|
|
204
|
+
cumUsd: number;
|
|
205
|
+
best: number;
|
|
206
|
+
}>;
|
|
207
|
+
/** Per satisficing target (keyed by the target value as a string): the first point
|
|
208
|
+
* where best ≥ target, or null when never reached within budget. */
|
|
209
|
+
hits: Record<string, {
|
|
210
|
+
ms: number;
|
|
211
|
+
shots: number;
|
|
212
|
+
usd: number;
|
|
213
|
+
} | null>;
|
|
214
|
+
}
|
|
215
|
+
interface AnytimeStrategySummary {
|
|
216
|
+
strategy: string;
|
|
217
|
+
/** The satisficing target this row summarizes. */
|
|
218
|
+
target: number;
|
|
219
|
+
tasks: number;
|
|
220
|
+
reachedTarget: number;
|
|
221
|
+
/** Median time-to-target over the tasks that reached it (null when none did). */
|
|
222
|
+
medianTttMs: number | null;
|
|
223
|
+
medianShotsToTarget: number | null;
|
|
224
|
+
/** COCO ERT: Σ all task wall-time (incl. failures) / #successes. Null when 0 succeed. */
|
|
225
|
+
ertMs: number | null;
|
|
226
|
+
/** Same construction over dollars: Σ all spend / #successes. */
|
|
227
|
+
erUsd: number | null;
|
|
228
|
+
/** Mean best-so-far score by shot index (the anytime curve, averaged over tasks). */
|
|
229
|
+
curveByShot: number[];
|
|
230
|
+
/** Area under the per-shot anytime curve, normalized to [0,1]. */
|
|
231
|
+
auc: number;
|
|
232
|
+
}
|
|
233
|
+
interface AnytimeReport {
|
|
234
|
+
targets: number[];
|
|
235
|
+
perTask: AnytimeTaskCurve[];
|
|
236
|
+
/** One summary per (strategy, target) pair — the COCO-style multi-target view. */
|
|
237
|
+
perStrategy: AnytimeStrategySummary[];
|
|
238
|
+
}
|
|
239
|
+
/** Derive anytime metrics from waterfall spans. `targets` are the satisficing score
|
|
240
|
+
* bars (default [1] = fully resolved; COCO-style multi-target: [0.5, 0.8, 1]);
|
|
241
|
+
* `targetFor` overrides the bar per task (task-specific satisfaction) — when set, the
|
|
242
|
+
* per-task bar replaces every entry of `targets` for that task. */
|
|
243
|
+
declare function anytimeReport(spans: WaterfallSpan[], opts?: {
|
|
244
|
+
targets?: number[];
|
|
245
|
+
targetFor?: (taskId: string) => number;
|
|
246
|
+
}): AnytimeReport;
|
|
247
|
+
/** One row per (strategy, satisficing target): the shareable time-to-satisfactory table. */
|
|
248
|
+
declare function renderAnytimeTable(report: AnytimeReport): string;
|
|
249
|
+
|
|
116
250
|
/**
|
|
117
251
|
* auditIntent — the route-rigor analyst: is this trajectory even going the RIGHT WAY?
|
|
118
252
|
*
|
|
@@ -1144,6 +1278,9 @@ interface AgenticOptions {
|
|
|
1144
1278
|
routerKey: string;
|
|
1145
1279
|
model: string;
|
|
1146
1280
|
temperature?: number;
|
|
1281
|
+
/** Completion cap per worker turn — REQUIRED for thinking models (they burn unbounded
|
|
1282
|
+
* budgets on reasoning and return empty content without it). Omitted ⇒ provider default. */
|
|
1283
|
+
maxTokens?: number;
|
|
1147
1284
|
/** Turns the agent may take within ONE shot before the driver intervenes. */
|
|
1148
1285
|
innerTurns?: number;
|
|
1149
1286
|
/** The depth STEERER's analyst instruction (observe()'s system prompt). The knob a
|
|
@@ -1225,6 +1362,9 @@ interface ShotSpec {
|
|
|
1225
1362
|
messages?: Msg[];
|
|
1226
1363
|
steer?: string;
|
|
1227
1364
|
persona?: ShotPersona;
|
|
1365
|
+
/** Restrict THIS shot to a subset of the domain's tools (by name) — focus a shot on
|
|
1366
|
+
* the relevant capabilities. Restriction-only; unknown names throw. Omitted ⇒ all. */
|
|
1367
|
+
tools?: string[];
|
|
1228
1368
|
}
|
|
1229
1369
|
interface StrategyResult {
|
|
1230
1370
|
score: number;
|
|
@@ -1253,6 +1393,18 @@ interface StrategyCtx {
|
|
|
1253
1393
|
shot(spec?: ShotSpec): Promise<ShotResult | null>;
|
|
1254
1394
|
/** The firewalled critic reads the trajectory → a steer string, or null on COMPLETE/down. */
|
|
1255
1395
|
critique(messages: Msg[]): Promise<string | null>;
|
|
1396
|
+
/** The RAW analyst channel: the firewalled critic answers `instruction` over the
|
|
1397
|
+
* trajectory verbatim — no findings extraction, so verdict-shaped formats
|
|
1398
|
+
* (CONTINUE/STOP decisions, calibrated predictions) survive. Same firewall:
|
|
1399
|
+
* trajectory in, never scores. Null when the analyst went down. */
|
|
1400
|
+
consult(messages: Msg[], instruction: string): Promise<string | null>;
|
|
1401
|
+
/** The tools THIS artifact's task actually offers (names + descriptions only — never
|
|
1402
|
+
* the implementations). Tool sets vary per task on heterogeneous domains; a strategy
|
|
1403
|
+
* that restricts shots MUST select from this list, never from hardcoded names. */
|
|
1404
|
+
listTools(handle: ArtifactHandle): Promise<Array<{
|
|
1405
|
+
name: string;
|
|
1406
|
+
description?: string;
|
|
1407
|
+
}>>;
|
|
1256
1408
|
}
|
|
1257
1409
|
/** Author a Strategy from the composable steps — the open, compact way. */
|
|
1258
1410
|
declare function defineStrategy(name: string, run: (ctx: StrategyCtx) => Promise<StrategyResult>): Strategy;
|
|
@@ -1346,6 +1498,10 @@ interface BenchmarkTaskRow {
|
|
|
1346
1498
|
taskId: string;
|
|
1347
1499
|
/** Per-strategy cells; absent when the task errored before completing all strategies. */
|
|
1348
1500
|
cells?: Record<string, BenchmarkCell>;
|
|
1501
|
+
/** Per-strategy failures on this task: the strategy competed, threw, and scored an
|
|
1502
|
+
* honest zero — it loses, it does not poison the row. The message is kept so a later
|
|
1503
|
+
* generation's author can see WHY a candidate died. */
|
|
1504
|
+
errors?: Record<string, string>;
|
|
1349
1505
|
/** Why the task was excluded (infra/setup failure) — never silently dropped. */
|
|
1350
1506
|
error?: string;
|
|
1351
1507
|
}
|
|
@@ -1757,6 +1913,13 @@ interface PromotionGateOptions {
|
|
|
1757
1913
|
incumbent: string;
|
|
1758
1914
|
/** The challenger's strategy name. */
|
|
1759
1915
|
candidate: string;
|
|
1916
|
+
/** 'superiority' (default): the candidate must score significantly BETTER.
|
|
1917
|
+
* 'non-inferiority': the candidate must prove its score is not worse than the
|
|
1918
|
+
* incumbent by more than `scoreTolerance` AND its cost savings are significant —
|
|
1919
|
+
* the gate for "same quality, cheaper" claims. */
|
|
1920
|
+
mode?: 'superiority' | 'non-inferiority';
|
|
1921
|
+
/** non-inferiority: the score CI lower bound must clear −scoreTolerance. Default 0.05. */
|
|
1922
|
+
scoreTolerance?: number;
|
|
1760
1923
|
/** The CI lower bound on the paired lift must EXCEED this (score scale). Default 0. */
|
|
1761
1924
|
deltaThreshold?: number;
|
|
1762
1925
|
/** Minimum paired tasks before significance can be claimed. Default 6 — below that
|
|
@@ -1770,7 +1933,8 @@ interface PromotionGateOptions {
|
|
|
1770
1933
|
}
|
|
1771
1934
|
interface PromotionVerdict {
|
|
1772
1935
|
promoted: boolean;
|
|
1773
|
-
reason: 'identical-champion' | 'few-tasks' | 'no-margin' | 'significant';
|
|
1936
|
+
reason: 'identical-champion' | 'few-tasks' | 'no-margin' | 'significant' | 'non-inferior-and-cheaper' | 'non-inferiority-unproven' | 'not-cheaper';
|
|
1937
|
+
mode: 'superiority' | 'non-inferiority';
|
|
1774
1938
|
/** Paired tasks that carried both strategies' cells. */
|
|
1775
1939
|
n: number;
|
|
1776
1940
|
/** Paired (candidate − incumbent) lift across the holdout tasks. */
|
|
@@ -1780,6 +1944,23 @@ interface PromotionVerdict {
|
|
|
1780
1944
|
low: number;
|
|
1781
1945
|
high: number;
|
|
1782
1946
|
};
|
|
1947
|
+
/** non-inferiority mode: paired (incumbent − candidate) cost SAVINGS per task (usd) —
|
|
1948
|
+
* positive means the candidate is cheaper; significant iff the CI low clears zero. */
|
|
1949
|
+
costSavings?: {
|
|
1950
|
+
mean: number;
|
|
1951
|
+
median: number;
|
|
1952
|
+
low: number;
|
|
1953
|
+
high: number;
|
|
1954
|
+
};
|
|
1955
|
+
/** Paired (candidate − incumbent) wall-clock per task (ms) — negative = the candidate
|
|
1956
|
+
* is FASTER. Informational in every mode (never gates); the latency answer to "what
|
|
1957
|
+
* does this win actually cost the user?". */
|
|
1958
|
+
latency?: {
|
|
1959
|
+
mean: number;
|
|
1960
|
+
median: number;
|
|
1961
|
+
low: number;
|
|
1962
|
+
high: number;
|
|
1963
|
+
};
|
|
1783
1964
|
}
|
|
1784
1965
|
declare function promotionGate(opts: PromotionGateOptions): PromotionVerdict;
|
|
1785
1966
|
|
|
@@ -1815,6 +1996,117 @@ interface UsageSink {
|
|
|
1815
1996
|
*/
|
|
1816
1997
|
declare function reportLoopUsage<Task, Output, Decision>(cost: UsageSink, result: Pick<LoopResult<Task, Output, Decision>, 'costUsd' | 'tokenUsage'>, source?: string): void;
|
|
1817
1998
|
|
|
1999
|
+
/**
|
|
2000
|
+
* The one router chat client: direct OpenAI-compatible completions through the
|
|
2001
|
+
* Tangle router — the cheapest dial, no sandbox. Three layers: `routerChatWithUsage`
|
|
2002
|
+
* (chat-only), `routerChatWithTools` (one completion with function tools), and
|
|
2003
|
+
* `routerToolLoop` (the off-box agentic loop over tool-calling). Shared by the
|
|
2004
|
+
* built-in executors and the bench/lab harnesses.
|
|
2005
|
+
*
|
|
2006
|
+
* Reports REAL token usage so the backend-integrity guard sees a real backend.
|
|
2007
|
+
* Returns `undefined` usage when the provider omitted it — never a fabricated 0
|
|
2008
|
+
* (a phantom 0 reads as a free call downstream, which the gate would act on).
|
|
2009
|
+
*/
|
|
2010
|
+
interface RouterConfig {
|
|
2011
|
+
routerBaseUrl: string;
|
|
2012
|
+
routerKey: string;
|
|
2013
|
+
model: string;
|
|
2014
|
+
}
|
|
2015
|
+
interface RouterChatResult {
|
|
2016
|
+
content: string;
|
|
2017
|
+
/** REAL usage, or undefined when the provider reported none. */
|
|
2018
|
+
usage?: {
|
|
2019
|
+
input: number;
|
|
2020
|
+
output: number;
|
|
2021
|
+
};
|
|
2022
|
+
/** Derived from usage via `estimateCost` when the model is priced; else undefined. */
|
|
2023
|
+
costUsd?: number;
|
|
2024
|
+
}
|
|
2025
|
+
declare function routerChatWithUsage(cfg: RouterConfig, messages: Array<{
|
|
2026
|
+
role: string;
|
|
2027
|
+
content: string;
|
|
2028
|
+
}>, opts?: {
|
|
2029
|
+
temperature?: number;
|
|
2030
|
+
signal?: AbortSignal;
|
|
2031
|
+
maxTokens?: number;
|
|
2032
|
+
}): Promise<RouterChatResult>;
|
|
2033
|
+
/** A tool-call the model emitted (provider-neutral; mirrors the runtime's ToolCallRequest). */
|
|
2034
|
+
interface RouterToolCall {
|
|
2035
|
+
id: string;
|
|
2036
|
+
name: string;
|
|
2037
|
+
/** Raw JSON arguments string as emitted by the model. */
|
|
2038
|
+
arguments: string;
|
|
2039
|
+
}
|
|
2040
|
+
interface RouterChatToolsResult {
|
|
2041
|
+
content: string | null;
|
|
2042
|
+
toolCalls: RouterToolCall[];
|
|
2043
|
+
usage?: {
|
|
2044
|
+
input: number;
|
|
2045
|
+
output: number;
|
|
2046
|
+
};
|
|
2047
|
+
costUsd?: number;
|
|
2048
|
+
}
|
|
2049
|
+
/**
|
|
2050
|
+
* A router completion WITH tool-calling — the operator driver's LLM seam. Passes OpenAI-shape
|
|
2051
|
+
* `messages` (system/user/assistant-with-tool_calls/tool roles) + function `tools`, and returns the
|
|
2052
|
+
* assistant text plus the tool calls the model wants run. Same fail-loud + real-usage discipline as
|
|
2053
|
+
* `routerChatWithUsage`. `tool_choice: 'auto'` lets the model decide; the driver loops on the result.
|
|
2054
|
+
*/
|
|
2055
|
+
declare function routerChatWithTools(cfg: RouterConfig, messages: ReadonlyArray<Record<string, unknown>>, tools: ReadonlyArray<{
|
|
2056
|
+
type: 'function';
|
|
2057
|
+
function: {
|
|
2058
|
+
name: string;
|
|
2059
|
+
description?: string;
|
|
2060
|
+
parameters: unknown;
|
|
2061
|
+
};
|
|
2062
|
+
}>, opts?: {
|
|
2063
|
+
temperature?: number;
|
|
2064
|
+
signal?: AbortSignal;
|
|
2065
|
+
toolChoice?: 'auto' | 'required' | 'none';
|
|
2066
|
+
}): Promise<RouterChatToolsResult>;
|
|
2067
|
+
interface ToolSpec {
|
|
2068
|
+
type: 'function';
|
|
2069
|
+
function: {
|
|
2070
|
+
name: string;
|
|
2071
|
+
description?: string;
|
|
2072
|
+
parameters: unknown;
|
|
2073
|
+
};
|
|
2074
|
+
}
|
|
2075
|
+
interface RouterToolLoopResult {
|
|
2076
|
+
/** The model's final assistant text (the turn where it stopped calling tools, or the budget turn). */
|
|
2077
|
+
final: string;
|
|
2078
|
+
/** Inference turns spent (≤ maxTurns) — the equal-budget unit vs random@k. */
|
|
2079
|
+
turns: number;
|
|
2080
|
+
toolCalls: number;
|
|
2081
|
+
/** The behavior trace: each tool call + its result, in order. What a trace-analyst
|
|
2082
|
+
* steerer reads (behavior, never the verdict) to diagnose + redirect the next shot. */
|
|
2083
|
+
toolTrace: Array<{
|
|
2084
|
+
name: string;
|
|
2085
|
+
args: string;
|
|
2086
|
+
result: string;
|
|
2087
|
+
}>;
|
|
2088
|
+
usage: {
|
|
2089
|
+
input: number;
|
|
2090
|
+
output: number;
|
|
2091
|
+
};
|
|
2092
|
+
}
|
|
2093
|
+
/**
|
|
2094
|
+
* The tool-using router backend: a real agentic loop OVER the Tangle router (which
|
|
2095
|
+
* supports tool-calling), off-box — no sandbox. Each turn is one router completion
|
|
2096
|
+
* with `tools`; if the model emits tool_calls, `execute` runs them on the host and
|
|
2097
|
+
* their results are folded back as `tool` messages; the loop repeats until the
|
|
2098
|
+
* model answers without a tool call or the turn budget is hit. One turn = one
|
|
2099
|
+
* inference call, so `maxTurns` is the equal-compute unit against random@k.
|
|
2100
|
+
*
|
|
2101
|
+
* This is the depth substrate for agentic gates (the worker ACTS, observes the real
|
|
2102
|
+
* result, and continues) that the chat-only `routerChatWithUsage` cannot express.
|
|
2103
|
+
*/
|
|
2104
|
+
declare function routerToolLoop(cfg: RouterConfig, system: string, user: string, tools: ReadonlyArray<ToolSpec>, execute: (name: string, args: Record<string, unknown>) => Promise<string>, opts?: {
|
|
2105
|
+
maxTurns?: number;
|
|
2106
|
+
temperature?: number;
|
|
2107
|
+
signal?: AbortSignal;
|
|
2108
|
+
}): Promise<RouterToolLoopResult>;
|
|
2109
|
+
|
|
1818
2110
|
/**
|
|
1819
2111
|
* @experimental
|
|
1820
2112
|
*
|
|
@@ -2227,7 +2519,7 @@ declare function openSandboxRun<Out>(client: SandboxClient, options: OpenSandbox
|
|
|
2227
2519
|
*/
|
|
2228
2520
|
|
|
2229
2521
|
/** The compressed consumable a skill carries: everything an author needs to emit a loop. */
|
|
2230
|
-
declare const strategyAuthorContract = "\nYou author an OPTIMIZATION STRATEGY for an agentic loop system. A strategy decides how to\nspend a compute budget to beat a task's deployable check. You compose exactly two steps:\n\n shot(spec?: { handle?, messages?, steer?, persona? }): Promise<ShotResult | null>\n Runs ONE worker attempt (a bounded tool loop) over an artifact.\n - omit handle => the shot opens its OWN fresh artifact and closes it after (a sample).\n - pass handle => the shot CONTINUES that artifact (state accumulates across shots).\n - messages => the carried conversation (pass the previous ShotResult.messages to continue).\n - steer => a corrective instruction injected before the shot.\n - persona => { systemPrompt?, model? } \u2014 give THIS shot its own role and/or model\n (multi-agent strategies: a researcher shot then an engineer shot, a panel of k\n personas over one budget). On a fresh shot the systemPrompt replaces the task's; on\n a carried conversation it arrives as a hand-off message. Same conserved budget.\n ShotResult = { messages, score (0..1 on the task's check), passes, total, completions, toolErrors }\n Returns null if the attempt failed infra-wise.\n\n critique(messages): Promise<string | null>\n A firewalled trace-analyst reads the attempt's trajectory and returns ONE corrective\n instruction (or null when it judges the work complete). Costs ~1 completion.\n\n surface.open(task) / surface.close(handle)\n Open a persistent artifact you manage yourself (remember to close in a finally).\n\nRules:\n- Stay within ~budget total shots; every shot/critique spends from a conserved pool.\n- For a FRESH attempt OMIT `messages` entirely (never pass `[]` \u2014 an empty array is a\n fresh conversation too, but be explicit). 
To CONTINUE, pass the previous\n ShotResult.messages unchanged.\n- Return { score, resolved, completions, progression, shots } \u2014 score = the BEST checkpoint\n you reached (keep-best, never final-state), progression = score after each shot.\n- The module must be EXACTLY this shape (no other imports, no commentary outside code):\n\nimport { defineStrategy } from '@tangle-network/agent-runtime/loops'\nexport default defineStrategy('your-strategy-name', async ({ surface, task, budget, shot, critique }) => {\n // your composition\n})\n";
|
|
2522
|
+
declare const strategyAuthorContract = "\nYou author an OPTIMIZATION STRATEGY for an agentic loop system. A strategy decides how to\nspend a compute budget to beat a task's deployable check. You compose exactly two steps:\n\n shot(spec?: { handle?, messages?, steer?, persona?, tools? }): Promise<ShotResult | null>\n Runs ONE worker attempt (a bounded tool loop) over an artifact.\n - omit handle => the shot opens its OWN fresh artifact and closes it after (a sample).\n - pass handle => the shot CONTINUES that artifact (state accumulates across shots).\n - messages => the carried conversation (pass the previous ShotResult.messages to continue).\n - steer => a corrective instruction injected before the shot.\n - persona => { systemPrompt?, model? } \u2014 give THIS shot its own role and/or model\n (multi-agent strategies: a researcher shot then an engineer shot, a panel of k\n personas over one budget). On a fresh shot the systemPrompt replaces the task's; on\n a carried conversation it arrives as a hand-off message. Same conserved budget.\n - tools => string[] \u2014 restrict THIS shot to a subset of the task's tools by\n name (focus an explore shot on read-only tools, an execute shot on write tools).\n Restriction-only; unknown names make the shot fail. ALWAYS select from\n await listTools(handle) \u2014 never hardcode. Omitted => the shot sees every tool.\n ShotResult = { messages, score (0..1 on the task's check), passes, total, completions, toolErrors }\n Returns null if the attempt failed infra-wise.\n\n critique(messages): Promise<string | null>\n A firewalled trace-analyst reads the attempt's trajectory and returns ONE corrective\n instruction (or null when it judges the work complete). 
Costs ~1 completion.\n\n consult(messages, instruction): Promise<string | null>\n The RAW analyst channel: the same firewalled critic answers YOUR instruction over the\n trajectory verbatim (no reformatting) \u2014 use it when you need a specific reply format\n (a decision, a prediction). Costs ~1 completion.\n\n surface.open(task) / surface.close(handle)\n Open a persistent artifact you manage yourself (remember to close in a finally).\n close is idempotent \u2014 closing an already-closed handle is a safe no-op.\n\n listTools(handle): Promise<Array<{ name, description? }>>\n The tools THIS task actually offers. TOOL SETS VARY PER TASK \u2014 if you restrict a\n shot with `tools`, you MUST pick names from await listTools(handle); hardcoding\n names from an example kills your shots on every task whose tools differ.\n\nRules:\n- ALWAYS await every shot/critique/surface call \u2014 a floating promise that rejects\n crashes the whole benchmark run.\n- Stay within ~budget total shots; every shot/critique spends from a conserved pool.\n- For a FRESH attempt OMIT `messages` entirely (never pass `[]` \u2014 an empty array is a\n fresh conversation too, but be explicit). To CONTINUE, pass the previous\n ShotResult.messages unchanged.\n- Return { score, resolved, completions, progression, shots } \u2014 score = the BEST checkpoint\n you reached (keep-best, never final-state), progression = score after each shot.\n- The module must be EXACTLY this shape (no other imports, no commentary outside code):\n\nimport { defineStrategy } from '@tangle-network/agent-runtime/loops'\nexport default defineStrategy('your-strategy-name', async ({ surface, task, budget, shot, critique, listTools }) => {\n // your composition (listTools comes from the destructured context \u2014 it is NOT a global)\n})\n";
|
|
2231
2523
|
interface AuthorStrategyOptions {
|
|
2232
2524
|
/** The model-call seam (agent-eval `createChatClient`). */
|
|
2233
2525
|
chat: ChatClient;
|
|
@@ -2328,6 +2620,14 @@ interface StrategyEvolutionConfig {
|
|
|
2328
2620
|
populationSize?: number;
|
|
2329
2621
|
/** The gen0 field. Default [sample, refine, sampleThenRefine]. */
|
|
2330
2622
|
baselines?: Strategy[];
|
|
2623
|
+
/** What "better" means for PROMOTION. 'score' (default): the candidate must beat the
|
|
2624
|
+
* incumbent's score (superiority gate). 'cost': the candidate must prove score
|
|
2625
|
+
* NON-INFERIORITY (not worse by more than `scoreTolerance`) plus significant cost
|
|
2626
|
+
* savings — the "same quality, cheaper" objective. The author is told the objective
|
|
2627
|
+
* and sees per-task spend either way. */
|
|
2628
|
+
objective?: 'score' | 'cost';
|
|
2629
|
+
/** Cost objective: the score CI lower bound must clear −scoreTolerance. Default 0.05. */
|
|
2630
|
+
scoreTolerance?: number;
|
|
2331
2631
|
/** Search-side champion selection. Default 'costAware'. */
|
|
2332
2632
|
champion?: ChampionPolicy;
|
|
2333
2633
|
/** Score band treated as a tie under 'costAware'. Default 0.01. */
|
|
@@ -2336,6 +2636,48 @@ interface StrategyEvolutionConfig {
|
|
|
2336
2636
|
outDir: string;
|
|
2337
2637
|
/** Promotion-gate evidence floor (paired holdout tasks). */
|
|
2338
2638
|
minPairedTasks?: number;
|
|
2639
|
+
/** BAND-AWARE scoring — concentrate the measurement where lift is possible.
|
|
2640
|
+
* Holdout: draw `holdoutPoolN` candidate tasks and run `baselines[0]` once at the run
|
|
2641
|
+
* budget as an INDEPENDENT reference screen; keep tasks scoring ≤ `maxRefScore`
|
|
2642
|
+
* (headroom exists) and take the first `holdoutN`. Band membership is decided before
|
|
2643
|
+
* either finalist touches a task and both finalists then face the SAME tasks — the
|
|
2644
|
+
* estimand becomes "paired lift on headroom tasks", pre-registered by this config.
|
|
2645
|
+
* Train: champion selection ignores zero-spread tasks (every field strategy scored
|
|
2646
|
+
* identically — zero selection information, pure noise dilution). */
|
|
2647
|
+
band?: {
|
|
2648
|
+
holdoutPoolN: number;
|
|
2649
|
+
/** Keep holdout tasks where the reference scores ≤ this. Default 0.99 — drop only
|
|
2650
|
+
* tasks the reference already solves fully (no headroom, a candidate can only tie). */
|
|
2651
|
+
maxRefScore?: number;
|
|
2652
|
+
};
|
|
2653
|
+
/** What the author learns from a tournament. 'exact' (default) = scores + progressions
|
|
2654
|
+
* per task; 'binary' = pass/fail only — the leakage-bounded channel (one bit per cell
|
|
2655
|
+
* per generation reaches the author from the evaluation data). */
|
|
2656
|
+
lossesDetail?: 'exact' | 'binary';
|
|
2657
|
+
/** Reproducer certification (arXiv:2606.11045): when the final champion is AUTHORED,
|
|
2658
|
+
* compress it to a short natural-language summary, have a fresh author re-implement
|
|
2659
|
+
* from the summary alone (no losses, no code), and score the reproduction on the same
|
|
2660
|
+
* holdout. A reproduction gap is an overfitting signal (their detector: 100%
|
|
2661
|
+
* sensitivity / 91% specificity in the ML-agent setting) — recorded on the report,
|
|
2662
|
+
* never gate-blocking in v1. */
|
|
2663
|
+
reproducerCheck?: {
|
|
2664
|
+
/** Word budget for the strategy summary. Default 64. */
|
|
2665
|
+
summaryMaxWords?: number;
|
|
2666
|
+
/** Reproduction counts as faithful when reproducedScore ≥ championScore − tolerance.
|
|
2667
|
+
* Default 0.05. */
|
|
2668
|
+
tolerance?: number;
|
|
2669
|
+
};
|
|
2670
|
+
/** Endurance: write the run state after every completed phase; with `resume`, a
|
|
2671
|
+
* restart skips completed phases (authored modules re-imported from their files).
|
|
2672
|
+
* Worst case after a mid-run death is re-paying ONE phase, never the run. */
|
|
2673
|
+
checkpoint?: {
|
|
2674
|
+
path: string;
|
|
2675
|
+
resume?: boolean;
|
|
2676
|
+
};
|
|
2677
|
+
/** Called before each benchmark phase (gen0, gen1…, band-screen, holdout, reproduce).
|
|
2678
|
+
* The seam for environment recycling — no artifacts span phases, so a runner may
|
|
2679
|
+
* recreate a wedge-prone environment container here. */
|
|
2680
|
+
onPhase?: (phase: string) => Promise<void>;
|
|
2339
2681
|
onTask?: (phase: string, row: BenchmarkTaskRow, done: number, total: number) => void;
|
|
2340
2682
|
hooks?: RuntimeHooks;
|
|
2341
2683
|
}
|
|
@@ -2371,6 +2713,32 @@ interface EvolutionArchiveNode {
|
|
|
2371
2713
|
score: number;
|
|
2372
2714
|
usd: number;
|
|
2373
2715
|
}
|
|
2716
|
+
interface ReproductionCheck {
|
|
2717
|
+
/** The compressed strategy description the reproducer implemented from. */
|
|
2718
|
+
summary: string;
|
|
2719
|
+
reproducedName: string;
|
|
2720
|
+
file?: string;
|
|
2721
|
+
championHoldoutScore: number;
|
|
2722
|
+
reproducedHoldoutScore: number;
|
|
2723
|
+
/** champion − reproduced (positive = the reproduction fell short). */
|
|
2724
|
+
gap: number;
|
|
2725
|
+
/** reproducedScore ≥ championScore − tolerance. A failed reproduction is an
|
|
2726
|
+
* overfitting signal: the champion's win did not fit through the summary. */
|
|
2727
|
+
reproducible: boolean;
|
|
2728
|
+
/** Infra failure during reproduction (distinct from a semantic reproduction failure). */
|
|
2729
|
+
error?: string;
|
|
2730
|
+
}
|
|
2731
|
+
interface EvolutionBandInfo {
|
|
2732
|
+
/** Tasks screened by the reference on the holdout pool. */
|
|
2733
|
+
screened: number;
|
|
2734
|
+
/** Tasks kept (reference score ≤ maxRefScore) before truncating to holdoutN. */
|
|
2735
|
+
inBand: number;
|
|
2736
|
+
/** Reference scores per screened task (the screening record). */
|
|
2737
|
+
refScores: Array<{
|
|
2738
|
+
taskId: string;
|
|
2739
|
+
score: number;
|
|
2740
|
+
}>;
|
|
2741
|
+
}
|
|
2374
2742
|
interface EvolutionReport {
|
|
2375
2743
|
gen0: BenchmarkReport;
|
|
2376
2744
|
gen0Champion: ChampionPick;
|
|
@@ -2379,6 +2747,11 @@ interface EvolutionReport {
|
|
|
2379
2747
|
finalChampion: ChampionPick;
|
|
2380
2748
|
holdout: BenchmarkReport;
|
|
2381
2749
|
verdict: PromotionVerdict;
|
|
2750
|
+
/** Present when band screening ran — the verdict's estimand is then "paired lift on
|
|
2751
|
+
* headroom tasks" (band membership fixed by the reference screen, pre-registered). */
|
|
2752
|
+
band?: EvolutionBandInfo;
|
|
2753
|
+
/** Present when reproducerCheck ran (final champion was authored). */
|
|
2754
|
+
reproduction?: ReproductionCheck;
|
|
2382
2755
|
/** SEARCH TELEMETRY, not evidence: each entry is that generation's own train-slice
|
|
2383
2756
|
* re-measurement, so cross-generation deltas mix true drift with run-to-run variance
|
|
2384
2757
|
* (entries are unpaired across generations). The only evidence-grade comparison in
|
|
@@ -2390,9 +2763,22 @@ interface EvolutionReport {
|
|
|
2390
2763
|
usd: number;
|
|
2391
2764
|
}>;
|
|
2392
2765
|
}
|
|
2393
|
-
/**
|
|
2394
|
-
*
|
|
2395
|
-
*
|
|
2766
|
+
/** Strategy means recomputed over the DISCRIMINATING tasks only — tasks where the field
|
|
2767
|
+
* strategies did not all score identically. Zero-spread tasks (everyone 1.0, everyone
|
|
2768
|
+
* 0.0, everyone tied) carry no selection information; averaging over them dilutes real
|
|
2769
|
+
* differences toward zero. Search-side denoising only — the gate never uses this. */
|
|
2770
|
+
declare function discriminatingMeans(report: BenchmarkReport, fieldOrder: string[]): Record<string, {
|
|
2771
|
+
score: number;
|
|
2772
|
+
usd: number;
|
|
2773
|
+
}> | null;
|
|
2774
|
+
/** The champion pick over a means table. 'score' takes the best mean score (ties →
|
|
2775
|
+
* field order). 'costAware' treats scores within `epsilon` of the best as tied and
|
|
2776
|
+
* takes the cheapest — the (score, $) Pareto rule collapsed to one pick. */
|
|
2777
|
+
declare function pickChampion(means: Record<string, {
|
|
2778
|
+
score: number;
|
|
2779
|
+
usd: number;
|
|
2780
|
+
}>, fieldOrder: string[], policy: ChampionPolicy, epsilon: number): ChampionPick;
|
|
2781
|
+
/** Search-side champion selection over a tournament report. */
|
|
2396
2782
|
declare function selectChampion(report: BenchmarkReport, fieldOrder: string[], policy: ChampionPolicy, epsilon: number): ChampionPick;
|
|
2397
2783
|
declare function runStrategyEvolution(cfg: StrategyEvolutionConfig): Promise<EvolutionReport>;
|
|
2398
2784
|
|
|
@@ -2554,15 +2940,6 @@ interface BridgeSeam {
|
|
|
2554
2940
|
agentProfile?: Record<string, unknown>;
|
|
2555
2941
|
timeoutMs?: number;
|
|
2556
2942
|
}
|
|
2557
|
-
/** An OpenAI-shape function tool the model may call. */
|
|
2558
|
-
interface ToolSpec {
|
|
2559
|
-
type: 'function';
|
|
2560
|
-
function: {
|
|
2561
|
-
name: string;
|
|
2562
|
-
description?: string;
|
|
2563
|
-
parameters: unknown;
|
|
2564
|
-
};
|
|
2565
|
-
}
|
|
2566
2943
|
/**
|
|
2567
2944
|
* Router seam WITH tool use — the tool-using router backend. Same direct
|
|
2568
2945
|
* OpenAI-compatible endpoint as `RouterSeam`, but each turn passes `tools`; when
|
|
@@ -2579,7 +2956,9 @@ interface RouterToolsSeam {
|
|
|
2579
2956
|
model?: string;
|
|
2580
2957
|
tools: ReadonlyArray<ToolSpec>;
|
|
2581
2958
|
executeToolCall: (name: string, args: Record<string, unknown>, task: unknown) => Promise<string>;
|
|
2582
|
-
/** Max inference turns (
|
|
2959
|
+
/** Max inference turns. Default 200 (runaway backstop — set far above any
|
|
2960
|
+
* legitimate workflow). For tighter per-workflow limits use a cost budget
|
|
2961
|
+
* or wall-clock deadline at the call site. */
|
|
2583
2962
|
maxTurns?: number;
|
|
2584
2963
|
}
|
|
2585
2964
|
/**
|
|
@@ -2805,4 +3184,4 @@ declare function gitWorkspace(opts: GitWorkspaceOptions): Workspace;
|
|
|
2805
3184
|
* requires `jj` on the `Shell`'s host. */
|
|
2806
3185
|
declare function jjWorkspace(opts: GitWorkspaceOptions): Workspace;
|
|
2807
3186
|
|
|
2808
|
-
export { Agent, AgentRunSpec, AgentSpec, type AgenticOptions, type AgenticRunResult, type AgenticSurface, type AgenticTask, type AgenticTool, type ArtifactHandle, type AssertTraceDerivedFindings, type AuditIntentInput, type AuditIntentOptions, type AuthorStrategyOptions, type AuthoredStrategy, type BenchmarkCell, type BenchmarkConfig, type BenchmarkLift, type BenchmarkReport, type BenchmarkStrategySummary, type BenchmarkTaskRow, type BridgeSeam, Budget, type BudgetPool, type BudgetReadout, type ChampionPick, type ChampionPolicy, type CheckpointCapableBox, type CliSeam, type CombinatorShape, type Corpus, type CorpusFilter, type CorpusRecord, type CreateScopeAnalystOptions, type CriuCapableClient, type DefinePersona, type DefinePersonaInput, type Deliverable, type Environment, type EqualKArm, type EqualKOnCost, type EqualKOnCostOptions, type EqualKVerdict, type EvolutionArchiveNode, type EvolutionAuthor, type EvolutionCandidate, type EvolutionGeneration, type EvolutionReport, ExecCtx, type ExecutorConfig, ExecutorFactory, ExecutorRegistry, type Fanout, type FanoutOptions, type FanoutSynthesis, FileCorpus, FileResultBlobStore, FileSpawnJournal, type FlatWidenGate, type ForkCapableBox, type GitWorkspaceOptions, type HarvestCorpusOptions, type HarvestFailure, type HarvestReport, InMemoryCorpus, InMemoryResultBlobStore, InMemorySpawnJournal, type IntentAudit, Iteration, type LoopDispatchOptions, type LoopOptionsForDispatch, LoopResult, type LoopShape, LoopTokenUsage, type LoopUntil, type LoopUntilSpec, type LoopUntilState, type McpEndpoint, type McpEnvironmentOptions, NodeId, type Observation, type ObserveInput, type ObserveOptions, type OpenSandboxRunOptions, type Outcome, type Panel, type PanelJudge, type PanelSpec, type PanelVerdict, type Persona, type PersonaContext, type PersonaExecutors, type Pipeline, type PipelineStage, type PromotionGateOptions, type PromotionVerdict, type RenderCorpusToInstructions, type RenderCorpusToInstructionsOptions, type 
ReservationTicket, ResultBlobStore, RootHandle, type RouterSeam, type RouterToolsSeam, type RunAgenticOptions, RunLoopOptions, type RunPersonified, type RunPersonifiedOptions, type SandboxCapabilities, SandboxClient, type SandboxLineage, type SandboxLineageHandle, type SandboxRun, type SandboxSeam, Scope, type ScopeAnalyst, type ScopeAnalyzeInput, type ScopeWidenGate, type SessionCapableBox, Settled, type ShapeBudget, type ShapeContext, type ShapeRegistry, type Shell, type ShotPersona, type ShotSpec, SpawnEvent, SpawnJournal, Spend, type SteerContext, type Strategy, type StrategyCtx, type StrategyEvolutionConfig, type StrategyResult, SupervisedResult, Supervisor, type SurfaceScore, type ToolSpec, type TrajectoryNode, type TrajectoryReport, type TrajectoryReportFn, type TrajectoryReportOptions, TreeView, type TurnResult, UsageEvent, type UsageSink, type VerifierEnvironmentOptions, type Verify, type VerifySpec, type Widen, type WidenDecision, type WidenLineage, type WidenSpec, type Workspace, type WorkspaceCommit, acquireSandbox, adaptiveRefine, assertStrategyContract, assertTraceDerivedFindings, auditIntent, authorStrategy, breadthDriver, buildSteerContext, builtinShapes, contentAddress, createBudgetPool, createExecutor, createExecutorRegistry, createMcpEnvironment, createRootHandle, createSandboxLineage, createScope, createScopeAnalyst, createShapeRegistry, createSupervisor, createVerifierEnvironment, defaultAnalystInstruction, defaultAuditorInstruction, definePersona, defineStrategy, depthDriver, equalKOnCost, extractLlmCallEvent, fanout, flatWidenGate, gitWorkspace, harvestCorpus, inlineSandboxClient, jjWorkspace, localShell, loopDispatch, loopUntil, mapSandboxEvent, materializeTreeView, observe, openSandboxRun, panel, pipeline, printBenchmarkReport, probeSandboxCapabilities, promotionGate, refine, registerShape, renderCorpusToInstructions, renderReport, replaySpawnTree, reportLoopUsage, runAgentic, runBenchmark, runPersonified, runStrategyEvolution, sample, 
sampleThenRefine, selectChampion, settledToIteration, spendFromUsageEvents, strategyAuthorContract, trajectoryReport, verify, widen };
|
|
3187
|
+
export { Agent, AgentRunSpec, AgentSpec, type AgenticOptions, type AgenticRunResult, type AgenticSurface, type AgenticTask, type AgenticTool, type AnytimeReport, type AnytimeStrategySummary, type AnytimeTaskCurve, type ArtifactHandle, type AssertTraceDerivedFindings, type AuditIntentInput, type AuditIntentOptions, type AuthorStrategyOptions, type AuthoredStrategy, type BenchmarkCell, type BenchmarkConfig, type BenchmarkLift, type BenchmarkReport, type BenchmarkStrategySummary, type BenchmarkTaskRow, type BridgeSeam, Budget, type BudgetPool, type BudgetReadout, type ChampionPick, type ChampionPolicy, type CheckpointCapableBox, type CliSeam, type CombinatorShape, type Corpus, type CorpusFilter, type CorpusRecord, type CreateScopeAnalystOptions, type CriuCapableClient, type DefinePersona, type DefinePersonaInput, type Deliverable, type Environment, type EqualKArm, type EqualKOnCost, type EqualKOnCostOptions, type EqualKVerdict, type EvolutionArchiveNode, type EvolutionAuthor, type EvolutionBandInfo, type EvolutionCandidate, type EvolutionGeneration, type EvolutionReport, ExecCtx, type ExecutorConfig, ExecutorFactory, ExecutorRegistry, type Fanout, type FanoutOptions, type FanoutSynthesis, FileCorpus, FileResultBlobStore, FileSpawnJournal, type FlatWidenGate, type ForkCapableBox, type GitWorkspaceOptions, type HarvestCorpusOptions, type HarvestFailure, type HarvestReport, InMemoryCorpus, InMemoryResultBlobStore, InMemorySpawnJournal, type IntentAudit, Iteration, type LoopDispatchOptions, type LoopOptionsForDispatch, LoopResult, type LoopShape, LoopTokenUsage, type LoopUntil, type LoopUntilSpec, type LoopUntilState, type McpEndpoint, type McpEnvironmentOptions, NodeId, type Observation, type ObserveInput, type ObserveOptions, type OpenSandboxRunOptions, type Outcome, type Panel, type PanelJudge, type PanelSpec, type PanelVerdict, type Persona, type PersonaContext, type PersonaExecutors, type Pipeline, type PipelineStage, type PromotionGateOptions, type PromotionVerdict, 
type RenderCorpusToInstructions, type RenderCorpusToInstructionsOptions, type ReservationTicket, ResultBlobStore, RootHandle, type RouterChatResult, type RouterChatToolsResult, type RouterConfig, type RouterSeam, type RouterToolCall, type RouterToolLoopResult, type RouterToolsSeam, type RunAgenticOptions, RunLoopOptions, type RunPersonified, type RunPersonifiedOptions, type SandboxCapabilities, SandboxClient, type SandboxLineage, type SandboxLineageHandle, type SandboxRun, type SandboxSeam, Scope, type ScopeAnalyst, type ScopeAnalyzeInput, type ScopeWidenGate, type SessionCapableBox, Settled, type ShapeBudget, type ShapeContext, type ShapeRegistry, type Shell, type ShotPersona, type ShotSpec, SpawnEvent, SpawnJournal, Spend, type SteerContext, type Strategy, type StrategyCtx, type StrategyEvolutionConfig, type StrategyResult, SupervisedResult, Supervisor, type SurfaceScore, type ToolSpec, type TrajectoryNode, type TrajectoryReport, type TrajectoryReportFn, type TrajectoryReportOptions, TreeView, type TurnResult, UsageEvent, type UsageSink, type VerifierEnvironmentOptions, type Verify, type VerifySpec, type WaterfallCollector, type WaterfallReport, type WaterfallSpan, type Widen, type WidenDecision, type WidenLineage, type WidenSpec, type Workspace, type WorkspaceCommit, acquireSandbox, adaptiveRefine, anytimeReport, assertStrategyContract, assertTraceDerivedFindings, auditIntent, authorStrategy, breadthDriver, buildSteerContext, builtinShapes, contentAddress, createBudgetPool, createExecutor, createExecutorRegistry, createMcpEnvironment, createRootHandle, createSandboxLineage, createScope, createScopeAnalyst, createShapeRegistry, createSupervisor, createVerifierEnvironment, createWaterfallCollector, defaultAnalystInstruction, defaultAuditorInstruction, definePersona, defineStrategy, depthDriver, discriminatingMeans, equalKOnCost, extractLlmCallEvent, fanout, flatWidenGate, gitWorkspace, harvestCorpus, inlineSandboxClient, jjWorkspace, localShell, loopDispatch, 
loopUntil, mapSandboxEvent, materializeTreeView, observe, openSandboxRun, panel, pickChampion, pipeline, printBenchmarkReport, probeSandboxCapabilities, promotionGate, refine, registerShape, renderAnytimeTable, renderCorpusToInstructions, renderReport, replaySpawnTree, reportLoopUsage, routerChatWithTools, routerChatWithUsage, routerToolLoop, runAgentic, runBenchmark, runPersonified, runStrategyEvolution, sample, sampleThenRefine, selectChampion, settledToIteration, spendFromUsageEvents, strategyAuthorContract, trajectoryReport, verify, widen };
|