@tangle-network/agent-runtime 0.48.0 → 0.49.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/README.md +79 -15
  2. package/dist/agent.js +1 -1
  3. package/dist/chunk-GHX7XOJ2.js +433 -0
  4. package/dist/chunk-GHX7XOJ2.js.map +1 -0
  5. package/dist/{chunk-TJS7S3HJ.js → chunk-IQS4HI3F.js} +14 -5
  6. package/dist/chunk-IQS4HI3F.js.map +1 -0
  7. package/dist/{chunk-IW2LMLK6.js → chunk-PXUTIMGJ.js} +767 -129
  8. package/dist/chunk-PXUTIMGJ.js.map +1 -0
  9. package/dist/{chunk-656G2XCL.js → chunk-U2VEWKKK.js} +3 -3
  10. package/dist/{chunk-JNPK46YH.js → chunk-VIEDXELL.js} +408 -6
  11. package/dist/chunk-VIEDXELL.js.map +1 -0
  12. package/dist/{chunk-VR4JIC5H.js → chunk-XTEZ3YJ4.js} +2 -2
  13. package/dist/index.d.ts +29 -4
  14. package/dist/index.js +109 -21
  15. package/dist/index.js.map +1 -1
  16. package/dist/kb-gate-CsXpNRk7.d.ts +1145 -0
  17. package/dist/{loop-runner-bin-DEm4roYF.d.ts → loop-runner-bin-Cgn0A-NW.d.ts} +1 -1
  18. package/dist/loop-runner-bin.d.ts +2 -2
  19. package/dist/loop-runner-bin.js +3 -3
  20. package/dist/loops.d.ts +2 -2
  21. package/dist/loops.js +11 -1
  22. package/dist/mcp/bin.js +187 -24
  23. package/dist/mcp/bin.js.map +1 -1
  24. package/dist/mcp/index.d.ts +27 -124
  25. package/dist/mcp/index.js +28 -6
  26. package/dist/mcp/index.js.map +1 -1
  27. package/dist/platform.js +2 -2
  28. package/dist/platform.js.map +1 -1
  29. package/dist/runtime.d.ts +285 -8
  30. package/dist/runtime.js +11 -1
  31. package/dist/workflow.js +1 -1
  32. package/package.json +6 -5
  33. package/dist/chunk-IW2LMLK6.js.map +0 -1
  34. package/dist/chunk-JNPK46YH.js.map +0 -1
  35. package/dist/chunk-LX66I3SC.js +0 -218
  36. package/dist/chunk-LX66I3SC.js.map +0 -1
  37. package/dist/chunk-TJS7S3HJ.js.map +0 -1
  38. package/dist/kb-gate-51BlLlVM.d.ts +0 -529
  39. /package/dist/{chunk-656G2XCL.js.map → chunk-U2VEWKKK.js.map} +0 -0
  40. /package/dist/{chunk-VR4JIC5H.js.map → chunk-XTEZ3YJ4.js.map} +0 -0
package/dist/runtime.d.ts CHANGED
@@ -2,6 +2,7 @@ import { AgentProfile, BackendType, CreateSandboxOptions, SandboxInstance, Sandb
2
2
  export { AgentProfile, CreateSandboxOptions, SandboxEvent, SandboxInstance } from '@tangle-network/sandbox';
3
3
  import { R as ResultBlobStore, a as SpawnJournal, N as NodeId, b as SpawnEvent, T as TreeView, c as Settled, d as AgentSpec, E as ExecutorRegistry, B as Budget, A as Agent, e as RootHandle, f as SupervisedResult, g as Spend, S as Scope, h as ExecutorFactory, U as UsageEvent, i as Supervisor } from './types-BpDfCPUp.js';
4
4
  export { j as Executor, k as ExecutorContext, l as ExecutorResult, H as Handle, m as NodeSnapshot, n as NodeStatus, o as Restart, p as RootSignal, q as Runtime, r as SpawnOpts, s as SupervisorOpts, W as WidenGate } from './types-BpDfCPUp.js';
5
+ import { R as RuntimeHooks } from './runtime-hooks-C7JwKb9E.js';
5
6
  import { ChatClient, AnalystFinding, DefaultVerdict, AgentProfile as AgentProfile$1 } from '@tangle-network/agent-eval';
6
7
  export { DefaultVerdict } from '@tangle-network/agent-eval';
7
8
  export { A as AnalyzeInput, a as CompletionAnalyst, b as CompletionEvidence, c as CompletionPolicy, d as CompletionVerdict, C as CreateDriverOptions, D as DriverDecision, P as PlannerContext, e as TopologyMove, T as TopologyPlanner, f as completionAuthorizes, g as createDriver, h as deterministicCompletion, r as renderAnalyses, s as sentinelCompletion, i as stopSentinel } from './driver-DYU2sgHr.js';
@@ -10,7 +11,6 @@ export { D as Driver, C as LoopDecisionPayload, F as LoopEndedPayload, G as Loop
10
11
  import { Scenario, ProfileDispatchFn } from '@tangle-network/agent-eval/campaign';
11
12
  import { R as RunLoopOptions } from './run-loop-DvD4aGiE.js';
12
13
  export { c as createSandboxForSpec, d as defaultSelectWinner, r as runLoop } from './run-loop-DvD4aGiE.js';
13
- import { R as RuntimeHooks } from './runtime-hooks-C7JwKb9E.js';
14
14
 
15
15
  /**
16
16
  * @experimental
@@ -113,6 +113,140 @@ declare function replaySpawnTree(journal: SpawnJournal, blobs: ResultBlobStore,
113
113
  */
114
114
  declare function materializeTreeView(events: SpawnEvent[]): TreeView;
115
115
 
116
+ /**
117
+ * createWaterfallCollector — 100% trajectory observability from the lifecycle stream:
118
+ * every spawn/settle (shots, analysts, nested agents) becomes one timed, billed span.
119
+ * The sum of spans IS the run's cost story — what each step cost in dollars, tokens,
120
+ * and wall-clock, rendered as a text waterfall or exported as structured rows for any
121
+ * chart. Attach the collector's `hooks` to `runAgentic`/`runBenchmark`; spans accumulate
122
+ * across every task the hooks observe.
123
+ */
124
+
125
+ interface WaterfallSpan {
126
+ id: string;
127
+ /** The spawn label (`shot:0`, `analyst:1`, a nested agent's label) — the row name. */
128
+ label: string;
129
+ runId: string;
130
+ parentId?: string;
131
+ startMs: number;
132
+ endMs?: number;
133
+ status: 'running' | 'done' | 'down';
134
+ usd: number;
135
+ tokens: {
136
+ input: number;
137
+ output: number;
138
+ };
139
+ score?: number;
140
+ }
141
+ interface WaterfallReport {
142
+ spans: WaterfallSpan[];
143
+ /** Wall-clock of the observed window (first spawn → last settle). */
144
+ totalMs: number;
145
+ totalUsd: number;
146
+ totalTokens: {
147
+ input: number;
148
+ output: number;
149
+ };
150
+ /** Rollup by label prefix (the part before ':') — shots vs analysts vs anything else. */
151
+ byKind: Record<string, {
152
+ count: number;
153
+ ms: number;
154
+ usd: number;
155
+ tokens: {
156
+ input: number;
157
+ output: number;
158
+ };
159
+ }>;
160
+ }
161
+ interface WaterfallCollector {
162
+ /** Attach these to RunAgenticOptions.hooks / BenchmarkConfig.hooks. */
163
+ hooks: RuntimeHooks;
164
+ report(): WaterfallReport;
165
+ /** The text waterfall — one row per span, bars scaled to the observed window. */
166
+ render(opts?: {
167
+ width?: number;
168
+ maxRows?: number;
169
+ }): string;
170
+ reset(): void;
171
+ }
172
+ declare function createWaterfallCollector(): WaterfallCollector;
173
+
174
+ /**
175
+ * anytimeReport — time-to-satisfactory-output metrics, derived entirely from the
176
+ * waterfall's spans (no new instrumentation): per task, the best-so-far score after each
177
+ * shot with its elapsed wall-clock and cumulative spend; per strategy, the standard
178
+ * anytime-optimization metrics:
179
+ *
180
+ * TTT time-to-target — elapsed ms until best-so-far ≥ the target (per task; median
181
+ * over tasks that reached it)
182
+ * STT shots-to-target — attempts until best-so-far ≥ target
183
+ * ERT expected running time (the COCO benchmarking convention): TOTAL time spent
184
+ * across all tasks — including failures' full budgets — divided by the number of
185
+ * tasks that reached the target. The honest "how long per success, all-in".
186
+ * AUC the anytime curve's area (mean best-so-far score across the budget, per shot
187
+ * index) — higher = climbs earlier.
188
+ *
189
+ * The "satisfactory" bar follows the COCO/BBOB convention: a SET of satisficing targets
190
+ * (e.g. [0.5, 0.8, 1.0] on the normalized check score), each measured independently —
191
+ * runtime-to-target per (task, target) pair — optionally overridden per task
192
+ * (`targetFor`) when satisfaction is task-specific. Spans come from
193
+ * `createWaterfallCollector().report()`; tasks are grouped by the supervisor runId
194
+ * (`agentic:<strategy>:<taskId>`); shot spans are `shot:N` labels.
195
+ */
196
+
197
+ interface AnytimeTaskCurve {
198
+ taskId: string;
199
+ strategy: string;
200
+ /** Best-so-far after each settled shot: elapsed ms from the task's first spawn,
201
+ * cumulative usd, and the running max score. */
202
+ points: Array<{
203
+ elapsedMs: number;
204
+ cumUsd: number;
205
+ best: number;
206
+ }>;
207
+ /** Per satisficing target (keyed by the target value as a string): the first point
208
+ * where best ≥ target, or null when never reached within budget. */
209
+ hits: Record<string, {
210
+ ms: number;
211
+ shots: number;
212
+ usd: number;
213
+ } | null>;
214
+ }
215
+ interface AnytimeStrategySummary {
216
+ strategy: string;
217
+ /** The satisficing target this row summarizes. */
218
+ target: number;
219
+ tasks: number;
220
+ reachedTarget: number;
221
+ /** Median time-to-target over the tasks that reached it (null when none did). */
222
+ medianTttMs: number | null;
223
+ medianShotsToTarget: number | null;
224
+ /** COCO ERT: Σ all task wall-time (incl. failures) / #successes. Null when 0 succeed. */
225
+ ertMs: number | null;
226
+ /** Same construction over dollars: Σ all spend / #successes. */
227
+ erUsd: number | null;
228
+ /** Mean best-so-far score by shot index (the anytime curve, averaged over tasks). */
229
+ curveByShot: number[];
230
+ /** Area under the per-shot anytime curve, normalized to [0,1]. */
231
+ auc: number;
232
+ }
233
+ interface AnytimeReport {
234
+ targets: number[];
235
+ perTask: AnytimeTaskCurve[];
236
+ /** One summary per (strategy, target) pair — the COCO-style multi-target view. */
237
+ perStrategy: AnytimeStrategySummary[];
238
+ }
239
+ /** Derive anytime metrics from waterfall spans. `targets` are the satisficing score
240
+ * bars (default [1] = fully resolved; COCO-style multi-target: [0.5, 0.8, 1]);
241
+ * `targetFor` overrides the bar per task (task-specific satisfaction) — when set, the
242
+ * per-task bar replaces every entry of `targets` for that task. */
243
+ declare function anytimeReport(spans: WaterfallSpan[], opts?: {
244
+ targets?: number[];
245
+ targetFor?: (taskId: string) => number;
246
+ }): AnytimeReport;
247
+ /** One row per (strategy, satisficing target): the shareable time-to-satisfactory table. */
248
+ declare function renderAnytimeTable(report: AnytimeReport): string;
249
+
116
250
  /**
117
251
  * auditIntent — the route-rigor analyst: is this trajectory even going the RIGHT WAY?
118
252
  *
@@ -1144,6 +1278,9 @@ interface AgenticOptions {
1144
1278
  routerKey: string;
1145
1279
  model: string;
1146
1280
  temperature?: number;
1281
+ /** Completion cap per worker turn — REQUIRED for thinking models (they burn unbounded
1282
+ * budgets on reasoning and return empty content without it). Omitted ⇒ provider default. */
1283
+ maxTokens?: number;
1147
1284
  /** Turns the agent may take within ONE shot before the driver intervenes. */
1148
1285
  innerTurns?: number;
1149
1286
  /** The depth STEERER's analyst instruction (observe()'s system prompt). The knob a
@@ -1225,6 +1362,9 @@ interface ShotSpec {
1225
1362
  messages?: Msg[];
1226
1363
  steer?: string;
1227
1364
  persona?: ShotPersona;
1365
+ /** Restrict THIS shot to a subset of the domain's tools (by name) — focus a shot on
1366
+ * the relevant capabilities. Restriction-only; unknown names throw. Omitted ⇒ all. */
1367
+ tools?: string[];
1228
1368
  }
1229
1369
  interface StrategyResult {
1230
1370
  score: number;
@@ -1253,6 +1393,18 @@ interface StrategyCtx {
1253
1393
  shot(spec?: ShotSpec): Promise<ShotResult | null>;
1254
1394
  /** The firewalled critic reads the trajectory → a steer string, or null on COMPLETE/down. */
1255
1395
  critique(messages: Msg[]): Promise<string | null>;
1396
+ /** The RAW analyst channel: the firewalled critic answers `instruction` over the
1397
+ * trajectory verbatim — no findings extraction, so verdict-shaped formats
1398
+ * (CONTINUE/STOP decisions, calibrated predictions) survive. Same firewall:
1399
+ * trajectory in, never scores. Null when the analyst went down. */
1400
+ consult(messages: Msg[], instruction: string): Promise<string | null>;
1401
+ /** The tools THIS artifact's task actually offers (names + descriptions only — never
1402
+ * the implementations). Tool sets vary per task on heterogeneous domains; a strategy
1403
+ * that restricts shots MUST select from this list, never from hardcoded names. */
1404
+ listTools(handle: ArtifactHandle): Promise<Array<{
1405
+ name: string;
1406
+ description?: string;
1407
+ }>>;
1256
1408
  }
1257
1409
  /** Author a Strategy from the composable steps — the open, compact way. */
1258
1410
  declare function defineStrategy(name: string, run: (ctx: StrategyCtx) => Promise<StrategyResult>): Strategy;
@@ -1346,6 +1498,10 @@ interface BenchmarkTaskRow {
1346
1498
  taskId: string;
1347
1499
  /** Per-strategy cells; absent when the task errored before completing all strategies. */
1348
1500
  cells?: Record<string, BenchmarkCell>;
1501
+ /** Per-strategy failures on this task: the strategy competed, threw, and scored an
1502
+ * honest zero — it loses, it does not poison the row. The message is kept so a later
1503
+ * generation's author can see WHY a candidate died. */
1504
+ errors?: Record<string, string>;
1349
1505
  /** Why the task was excluded (infra/setup failure) — never silently dropped. */
1350
1506
  error?: string;
1351
1507
  }
@@ -1757,6 +1913,13 @@ interface PromotionGateOptions {
1757
1913
  incumbent: string;
1758
1914
  /** The challenger's strategy name. */
1759
1915
  candidate: string;
1916
+ /** 'superiority' (default): the candidate must score significantly BETTER.
1917
+ * 'non-inferiority': the candidate must prove its score is not worse than the
1918
+ * incumbent by more than `scoreTolerance` AND its cost savings are significant —
1919
+ * the gate for "same quality, cheaper" claims. */
1920
+ mode?: 'superiority' | 'non-inferiority';
1921
+ /** non-inferiority: the score CI lower bound must clear −scoreTolerance. Default 0.05. */
1922
+ scoreTolerance?: number;
1760
1923
  /** The CI lower bound on the paired lift must EXCEED this (score scale). Default 0. */
1761
1924
  deltaThreshold?: number;
1762
1925
  /** Minimum paired tasks before significance can be claimed. Default 6 — below that
@@ -1770,7 +1933,8 @@ interface PromotionGateOptions {
1770
1933
  }
1771
1934
  interface PromotionVerdict {
1772
1935
  promoted: boolean;
1773
- reason: 'identical-champion' | 'few-tasks' | 'no-margin' | 'significant';
1936
+ reason: 'identical-champion' | 'few-tasks' | 'no-margin' | 'significant' | 'non-inferior-and-cheaper' | 'non-inferiority-unproven' | 'not-cheaper';
1937
+ mode: 'superiority' | 'non-inferiority';
1774
1938
  /** Paired tasks that carried both strategies' cells. */
1775
1939
  n: number;
1776
1940
  /** Paired (candidate − incumbent) lift across the holdout tasks. */
@@ -1780,6 +1944,23 @@ interface PromotionVerdict {
1780
1944
  low: number;
1781
1945
  high: number;
1782
1946
  };
1947
+ /** non-inferiority mode: paired (incumbent − candidate) cost SAVINGS per task (usd) —
1948
+ * positive means the candidate is cheaper; significant iff the CI low clears zero. */
1949
+ costSavings?: {
1950
+ mean: number;
1951
+ median: number;
1952
+ low: number;
1953
+ high: number;
1954
+ };
1955
+ /** Paired (candidate − incumbent) wall-clock per task (ms) — negative = the candidate
1956
+ * is FASTER. Informational in every mode (never gates); the latency answer to "what
1957
+ * does this win actually cost the user?". */
1958
+ latency?: {
1959
+ mean: number;
1960
+ median: number;
1961
+ low: number;
1962
+ high: number;
1963
+ };
1783
1964
  }
1784
1965
  declare function promotionGate(opts: PromotionGateOptions): PromotionVerdict;
1785
1966
 
@@ -2227,7 +2408,7 @@ declare function openSandboxRun<Out>(client: SandboxClient, options: OpenSandbox
2227
2408
  */
2228
2409
 
2229
2410
  /** The compressed consumable a skill carries: everything an author needs to emit a loop. */
2230
- declare const strategyAuthorContract = "\nYou author an OPTIMIZATION STRATEGY for an agentic loop system. A strategy decides how to\nspend a compute budget to beat a task's deployable check. You compose exactly two steps:\n\n shot(spec?: { handle?, messages?, steer?, persona? }): Promise<ShotResult | null>\n Runs ONE worker attempt (a bounded tool loop) over an artifact.\n - omit handle => the shot opens its OWN fresh artifact and closes it after (a sample).\n - pass handle => the shot CONTINUES that artifact (state accumulates across shots).\n - messages => the carried conversation (pass the previous ShotResult.messages to continue).\n - steer => a corrective instruction injected before the shot.\n - persona => { systemPrompt?, model? } \u2014 give THIS shot its own role and/or model\n (multi-agent strategies: a researcher shot then an engineer shot, a panel of k\n personas over one budget). On a fresh shot the systemPrompt replaces the task's; on\n a carried conversation it arrives as a hand-off message. Same conserved budget.\n ShotResult = { messages, score (0..1 on the task's check), passes, total, completions, toolErrors }\n Returns null if the attempt failed infra-wise.\n\n critique(messages): Promise<string | null>\n A firewalled trace-analyst reads the attempt's trajectory and returns ONE corrective\n instruction (or null when it judges the work complete). Costs ~1 completion.\n\n surface.open(task) / surface.close(handle)\n Open a persistent artifact you manage yourself (remember to close in a finally).\n\nRules:\n- Stay within ~budget total shots; every shot/critique spends from a conserved pool.\n- For a FRESH attempt OMIT `messages` entirely (never pass `[]` \u2014 an empty array is a\n fresh conversation too, but be explicit). 
To CONTINUE, pass the previous\n ShotResult.messages unchanged.\n- Return { score, resolved, completions, progression, shots } \u2014 score = the BEST checkpoint\n you reached (keep-best, never final-state), progression = score after each shot.\n- The module must be EXACTLY this shape (no other imports, no commentary outside code):\n\nimport { defineStrategy } from '@tangle-network/agent-runtime/loops'\nexport default defineStrategy('your-strategy-name', async ({ surface, task, budget, shot, critique }) => {\n // your composition\n})\n";
2411
+ declare const strategyAuthorContract = "\nYou author an OPTIMIZATION STRATEGY for an agentic loop system. A strategy decides how to\nspend a compute budget to beat a task's deployable check. You compose exactly two steps:\n\n shot(spec?: { handle?, messages?, steer?, persona?, tools? }): Promise<ShotResult | null>\n Runs ONE worker attempt (a bounded tool loop) over an artifact.\n - omit handle => the shot opens its OWN fresh artifact and closes it after (a sample).\n - pass handle => the shot CONTINUES that artifact (state accumulates across shots).\n - messages => the carried conversation (pass the previous ShotResult.messages to continue).\n - steer => a corrective instruction injected before the shot.\n - persona => { systemPrompt?, model? } \u2014 give THIS shot its own role and/or model\n (multi-agent strategies: a researcher shot then an engineer shot, a panel of k\n personas over one budget). On a fresh shot the systemPrompt replaces the task's; on\n a carried conversation it arrives as a hand-off message. Same conserved budget.\n - tools => string[] \u2014 restrict THIS shot to a subset of the task's tools by\n name (focus an explore shot on read-only tools, an execute shot on write tools).\n Restriction-only; unknown names make the shot fail. ALWAYS select from\n await listTools(handle) \u2014 never hardcode. Omitted => the shot sees every tool.\n ShotResult = { messages, score (0..1 on the task's check), passes, total, completions, toolErrors }\n Returns null if the attempt failed infra-wise.\n\n critique(messages): Promise<string | null>\n A firewalled trace-analyst reads the attempt's trajectory and returns ONE corrective\n instruction (or null when it judges the work complete). 
Costs ~1 completion.\n\n consult(messages, instruction): Promise<string | null>\n The RAW analyst channel: the same firewalled critic answers YOUR instruction over the\n trajectory verbatim (no reformatting) \u2014 use it when you need a specific reply format\n (a decision, a prediction). Costs ~1 completion.\n\n surface.open(task) / surface.close(handle)\n Open a persistent artifact you manage yourself (remember to close in a finally).\n close is idempotent \u2014 closing an already-closed handle is a safe no-op.\n\n listTools(handle): Promise<Array<{ name, description? }>>\n The tools THIS task actually offers. TOOL SETS VARY PER TASK \u2014 if you restrict a\n shot with `tools`, you MUST pick names from await listTools(handle); hardcoding\n names from an example kills your shots on every task whose tools differ.\n\nRules:\n- ALWAYS await every shot/critique/surface call \u2014 a floating promise that rejects\n crashes the whole benchmark run.\n- Stay within ~budget total shots; every shot/critique spends from a conserved pool.\n- For a FRESH attempt OMIT `messages` entirely (never pass `[]` \u2014 an empty array is a\n fresh conversation too, but be explicit). To CONTINUE, pass the previous\n ShotResult.messages unchanged.\n- Return { score, resolved, completions, progression, shots } \u2014 score = the BEST checkpoint\n you reached (keep-best, never final-state), progression = score after each shot.\n- The module must be EXACTLY this shape (no other imports, no commentary outside code):\n\nimport { defineStrategy } from '@tangle-network/agent-runtime/loops'\nexport default defineStrategy('your-strategy-name', async ({ surface, task, budget, shot, critique, listTools }) => {\n // your composition (listTools comes from the destructured context \u2014 it is NOT a global)\n})\n";
2231
2412
  interface AuthorStrategyOptions {
2232
2413
  /** The model-call seam (agent-eval `createChatClient`). */
2233
2414
  chat: ChatClient;
@@ -2328,6 +2509,14 @@ interface StrategyEvolutionConfig {
2328
2509
  populationSize?: number;
2329
2510
  /** The gen0 field. Default [sample, refine, sampleThenRefine]. */
2330
2511
  baselines?: Strategy[];
2512
+ /** What "better" means for PROMOTION. 'score' (default): the candidate must beat the
2513
+ * incumbent's score (superiority gate). 'cost': the candidate must prove score
2514
+ * NON-INFERIORITY (not worse by more than `scoreTolerance`) plus significant cost
2515
+ * savings — the "same quality, cheaper" objective. The author is told the objective
2516
+ * and sees per-task spend either way. */
2517
+ objective?: 'score' | 'cost';
2518
+ /** Cost objective: the score CI lower bound must clear −scoreTolerance. Default 0.05. */
2519
+ scoreTolerance?: number;
2331
2520
  /** Search-side champion selection. Default 'costAware'. */
2332
2521
  champion?: ChampionPolicy;
2333
2522
  /** Score band treated as a tie under 'costAware'. Default 0.01. */
@@ -2336,6 +2525,48 @@ interface StrategyEvolutionConfig {
2336
2525
  outDir: string;
2337
2526
  /** Promotion-gate evidence floor (paired holdout tasks). */
2338
2527
  minPairedTasks?: number;
2528
+ /** BAND-AWARE scoring — concentrate the measurement where lift is possible.
2529
+ * Holdout: draw `holdoutPoolN` candidate tasks and run `baselines[0]` once at the run
2530
+ * budget as an INDEPENDENT reference screen; keep tasks scoring ≤ `maxRefScore`
2531
+ * (headroom exists) and take the first `holdoutN`. Band membership is decided before
2532
+ * either finalist touches a task and both finalists then face the SAME tasks — the
2533
+ * estimand becomes "paired lift on headroom tasks", pre-registered by this config.
2534
+ * Train: champion selection ignores zero-spread tasks (every field strategy scored
2535
+ * identically — zero selection information, pure noise dilution). */
2536
+ band?: {
2537
+ holdoutPoolN: number;
2538
+ /** Keep holdout tasks where the reference scores ≤ this. Default 0.99 — drop only
2539
+ * tasks the reference already solves fully (no headroom, a candidate can only tie). */
2540
+ maxRefScore?: number;
2541
+ };
2542
+ /** What the author learns from a tournament. 'exact' (default) = scores + progressions
2543
+ * per task; 'binary' = pass/fail only — the leakage-bounded channel (one bit per cell
2544
+ * per generation reaches the author from the evaluation data). */
2545
+ lossesDetail?: 'exact' | 'binary';
2546
+ /** Reproducer certification (arXiv:2606.11045): when the final champion is AUTHORED,
2547
+ * compress it to a short natural-language summary, have a fresh author re-implement
2548
+ * from the summary alone (no losses, no code), and score the reproduction on the same
2549
+ * holdout. A reproduction gap is an overfitting signal (their detector: 100%
2550
+ * sensitivity / 91% specificity in the ML-agent setting) — recorded on the report,
2551
+ * never gate-blocking in v1. */
2552
+ reproducerCheck?: {
2553
+ /** Word budget for the strategy summary. Default 64. */
2554
+ summaryMaxWords?: number;
2555
+ /** Reproduction counts as faithful when reproducedScore ≥ championScore − tolerance.
2556
+ * Default 0.05. */
2557
+ tolerance?: number;
2558
+ };
2559
+ /** Endurance: write the run state after every completed phase; with `resume`, a
2560
+ * restart skips completed phases (authored modules re-imported from their files).
2561
+ * Worst case after a mid-run death is re-paying ONE phase, never the run. */
2562
+ checkpoint?: {
2563
+ path: string;
2564
+ resume?: boolean;
2565
+ };
2566
+ /** Called before each benchmark phase (gen0, gen1…, band-screen, holdout, reproduce).
2567
+ * The seam for environment recycling — no artifacts span phases, so a runner may
2568
+ * recreate a wedge-prone environment container here. */
2569
+ onPhase?: (phase: string) => Promise<void>;
2339
2570
  onTask?: (phase: string, row: BenchmarkTaskRow, done: number, total: number) => void;
2340
2571
  hooks?: RuntimeHooks;
2341
2572
  }
@@ -2371,6 +2602,32 @@ interface EvolutionArchiveNode {
2371
2602
  score: number;
2372
2603
  usd: number;
2373
2604
  }
2605
+ interface ReproductionCheck {
2606
+ /** The compressed strategy description the reproducer implemented from. */
2607
+ summary: string;
2608
+ reproducedName: string;
2609
+ file?: string;
2610
+ championHoldoutScore: number;
2611
+ reproducedHoldoutScore: number;
2612
+ /** champion − reproduced (positive = the reproduction fell short). */
2613
+ gap: number;
2614
+ /** reproducedScore ≥ championScore − tolerance. A failed reproduction is an
2615
+ * overfitting signal: the champion's win did not fit through the summary. */
2616
+ reproducible: boolean;
2617
+ /** Infra failure during reproduction (distinct from a semantic reproduction failure). */
2618
+ error?: string;
2619
+ }
2620
+ interface EvolutionBandInfo {
2621
+ /** Tasks screened by the reference on the holdout pool. */
2622
+ screened: number;
2623
+ /** Tasks kept (reference score ≤ maxRefScore) before truncating to holdoutN. */
2624
+ inBand: number;
2625
+ /** Reference scores per screened task (the screening record). */
2626
+ refScores: Array<{
2627
+ taskId: string;
2628
+ score: number;
2629
+ }>;
2630
+ }
2374
2631
  interface EvolutionReport {
2375
2632
  gen0: BenchmarkReport;
2376
2633
  gen0Champion: ChampionPick;
@@ -2379,6 +2636,11 @@ interface EvolutionReport {
2379
2636
  finalChampion: ChampionPick;
2380
2637
  holdout: BenchmarkReport;
2381
2638
  verdict: PromotionVerdict;
2639
+ /** Present when band screening ran — the verdict's estimand is then "paired lift on
2640
+ * headroom tasks" (band membership fixed by the reference screen, pre-registered). */
2641
+ band?: EvolutionBandInfo;
2642
+ /** Present when reproducerCheck ran (final champion was authored). */
2643
+ reproduction?: ReproductionCheck;
2382
2644
  /** SEARCH TELEMETRY, not evidence: each entry is that generation's own train-slice
2383
2645
  * re-measurement, so cross-generation deltas mix true drift with run-to-run variance
2384
2646
  * (entries are unpaired across generations). The only evidence-grade comparison in
@@ -2390,9 +2652,22 @@ interface EvolutionReport {
2390
2652
  usd: number;
2391
2653
  }>;
2392
2654
  }
2393
- /** Search-side champion selection over a tournament report. 'score' takes the best mean
2394
- * score (ties → field order). 'costAware' treats scores within `epsilon` of the best as
2395
- * tied and takes the cheapest — the (score, $) Pareto rule collapsed to one pick. */
2655
+ /** Strategy means recomputed over the DISCRIMINATING tasks only — tasks where the field
2656
+ * strategies did not all score identically. Zero-spread tasks (everyone 1.0, everyone
2657
+ * 0.0, everyone tied) carry no selection information; averaging over them dilutes real
2658
+ * differences toward zero. Search-side denoising only — the gate never uses this. */
2659
+ declare function discriminatingMeans(report: BenchmarkReport, fieldOrder: string[]): Record<string, {
2660
+ score: number;
2661
+ usd: number;
2662
+ }> | null;
2663
+ /** The champion pick over a means table. 'score' takes the best mean score (ties →
2664
+ * field order). 'costAware' treats scores within `epsilon` of the best as tied and
2665
+ * takes the cheapest — the (score, $) Pareto rule collapsed to one pick. */
2666
+ declare function pickChampion(means: Record<string, {
2667
+ score: number;
2668
+ usd: number;
2669
+ }>, fieldOrder: string[], policy: ChampionPolicy, epsilon: number): ChampionPick;
2670
+ /** Search-side champion selection over a tournament report. */
2396
2671
  declare function selectChampion(report: BenchmarkReport, fieldOrder: string[], policy: ChampionPolicy, epsilon: number): ChampionPick;
2397
2672
  declare function runStrategyEvolution(cfg: StrategyEvolutionConfig): Promise<EvolutionReport>;
2398
2673
 
@@ -2579,7 +2854,9 @@ interface RouterToolsSeam {
2579
2854
  model?: string;
2580
2855
  tools: ReadonlyArray<ToolSpec>;
2581
2856
  executeToolCall: (name: string, args: Record<string, unknown>, task: unknown) => Promise<string>;
2582
- /** Max inference turns (default 4). */
2857
+ /** Max inference turns. Default 200 (runaway backstop — set far above any
2858
+ * legitimate workflow). For tighter per-workflow limits use a cost budget
2859
+ * or wall-clock deadline at the call site. */
2583
2860
  maxTurns?: number;
2584
2861
  }
2585
2862
  /**
@@ -2805,4 +3082,4 @@ declare function gitWorkspace(opts: GitWorkspaceOptions): Workspace;
2805
3082
  * requires `jj` on the `Shell`'s host. */
2806
3083
  declare function jjWorkspace(opts: GitWorkspaceOptions): Workspace;
2807
3084
 
2808
- export { Agent, AgentRunSpec, AgentSpec, type AgenticOptions, type AgenticRunResult, type AgenticSurface, type AgenticTask, type AgenticTool, type ArtifactHandle, type AssertTraceDerivedFindings, type AuditIntentInput, type AuditIntentOptions, type AuthorStrategyOptions, type AuthoredStrategy, type BenchmarkCell, type BenchmarkConfig, type BenchmarkLift, type BenchmarkReport, type BenchmarkStrategySummary, type BenchmarkTaskRow, type BridgeSeam, Budget, type BudgetPool, type BudgetReadout, type ChampionPick, type ChampionPolicy, type CheckpointCapableBox, type CliSeam, type CombinatorShape, type Corpus, type CorpusFilter, type CorpusRecord, type CreateScopeAnalystOptions, type CriuCapableClient, type DefinePersona, type DefinePersonaInput, type Deliverable, type Environment, type EqualKArm, type EqualKOnCost, type EqualKOnCostOptions, type EqualKVerdict, type EvolutionArchiveNode, type EvolutionAuthor, type EvolutionCandidate, type EvolutionGeneration, type EvolutionReport, ExecCtx, type ExecutorConfig, ExecutorFactory, ExecutorRegistry, type Fanout, type FanoutOptions, type FanoutSynthesis, FileCorpus, FileResultBlobStore, FileSpawnJournal, type FlatWidenGate, type ForkCapableBox, type GitWorkspaceOptions, type HarvestCorpusOptions, type HarvestFailure, type HarvestReport, InMemoryCorpus, InMemoryResultBlobStore, InMemorySpawnJournal, type IntentAudit, Iteration, type LoopDispatchOptions, type LoopOptionsForDispatch, LoopResult, type LoopShape, LoopTokenUsage, type LoopUntil, type LoopUntilSpec, type LoopUntilState, type McpEndpoint, type McpEnvironmentOptions, NodeId, type Observation, type ObserveInput, type ObserveOptions, type OpenSandboxRunOptions, type Outcome, type Panel, type PanelJudge, type PanelSpec, type PanelVerdict, type Persona, type PersonaContext, type PersonaExecutors, type Pipeline, type PipelineStage, type PromotionGateOptions, type PromotionVerdict, type RenderCorpusToInstructions, type RenderCorpusToInstructionsOptions, type 
ReservationTicket, ResultBlobStore, RootHandle, type RouterSeam, type RouterToolsSeam, type RunAgenticOptions, RunLoopOptions, type RunPersonified, type RunPersonifiedOptions, type SandboxCapabilities, SandboxClient, type SandboxLineage, type SandboxLineageHandle, type SandboxRun, type SandboxSeam, Scope, type ScopeAnalyst, type ScopeAnalyzeInput, type ScopeWidenGate, type SessionCapableBox, Settled, type ShapeBudget, type ShapeContext, type ShapeRegistry, type Shell, type ShotPersona, type ShotSpec, SpawnEvent, SpawnJournal, Spend, type SteerContext, type Strategy, type StrategyCtx, type StrategyEvolutionConfig, type StrategyResult, SupervisedResult, Supervisor, type SurfaceScore, type ToolSpec, type TrajectoryNode, type TrajectoryReport, type TrajectoryReportFn, type TrajectoryReportOptions, TreeView, type TurnResult, UsageEvent, type UsageSink, type VerifierEnvironmentOptions, type Verify, type VerifySpec, type Widen, type WidenDecision, type WidenLineage, type WidenSpec, type Workspace, type WorkspaceCommit, acquireSandbox, adaptiveRefine, assertStrategyContract, assertTraceDerivedFindings, auditIntent, authorStrategy, breadthDriver, buildSteerContext, builtinShapes, contentAddress, createBudgetPool, createExecutor, createExecutorRegistry, createMcpEnvironment, createRootHandle, createSandboxLineage, createScope, createScopeAnalyst, createShapeRegistry, createSupervisor, createVerifierEnvironment, defaultAnalystInstruction, defaultAuditorInstruction, definePersona, defineStrategy, depthDriver, equalKOnCost, extractLlmCallEvent, fanout, flatWidenGate, gitWorkspace, harvestCorpus, inlineSandboxClient, jjWorkspace, localShell, loopDispatch, loopUntil, mapSandboxEvent, materializeTreeView, observe, openSandboxRun, panel, pipeline, printBenchmarkReport, probeSandboxCapabilities, promotionGate, refine, registerShape, renderCorpusToInstructions, renderReport, replaySpawnTree, reportLoopUsage, runAgentic, runBenchmark, runPersonified, runStrategyEvolution, sample, 
sampleThenRefine, selectChampion, settledToIteration, spendFromUsageEvents, strategyAuthorContract, trajectoryReport, verify, widen };
3085
+ export { Agent, AgentRunSpec, AgentSpec, type AgenticOptions, type AgenticRunResult, type AgenticSurface, type AgenticTask, type AgenticTool, type AnytimeReport, type AnytimeStrategySummary, type AnytimeTaskCurve, type ArtifactHandle, type AssertTraceDerivedFindings, type AuditIntentInput, type AuditIntentOptions, type AuthorStrategyOptions, type AuthoredStrategy, type BenchmarkCell, type BenchmarkConfig, type BenchmarkLift, type BenchmarkReport, type BenchmarkStrategySummary, type BenchmarkTaskRow, type BridgeSeam, Budget, type BudgetPool, type BudgetReadout, type ChampionPick, type ChampionPolicy, type CheckpointCapableBox, type CliSeam, type CombinatorShape, type Corpus, type CorpusFilter, type CorpusRecord, type CreateScopeAnalystOptions, type CriuCapableClient, type DefinePersona, type DefinePersonaInput, type Deliverable, type Environment, type EqualKArm, type EqualKOnCost, type EqualKOnCostOptions, type EqualKVerdict, type EvolutionArchiveNode, type EvolutionAuthor, type EvolutionBandInfo, type EvolutionCandidate, type EvolutionGeneration, type EvolutionReport, ExecCtx, type ExecutorConfig, ExecutorFactory, ExecutorRegistry, type Fanout, type FanoutOptions, type FanoutSynthesis, FileCorpus, FileResultBlobStore, FileSpawnJournal, type FlatWidenGate, type ForkCapableBox, type GitWorkspaceOptions, type HarvestCorpusOptions, type HarvestFailure, type HarvestReport, InMemoryCorpus, InMemoryResultBlobStore, InMemorySpawnJournal, type IntentAudit, Iteration, type LoopDispatchOptions, type LoopOptionsForDispatch, LoopResult, type LoopShape, LoopTokenUsage, type LoopUntil, type LoopUntilSpec, type LoopUntilState, type McpEndpoint, type McpEnvironmentOptions, NodeId, type Observation, type ObserveInput, type ObserveOptions, type OpenSandboxRunOptions, type Outcome, type Panel, type PanelJudge, type PanelSpec, type PanelVerdict, type Persona, type PersonaContext, type PersonaExecutors, type Pipeline, type PipelineStage, type PromotionGateOptions, type 
PromotionVerdict, type RenderCorpusToInstructions, type RenderCorpusToInstructionsOptions, type ReservationTicket, ResultBlobStore, RootHandle, type RouterSeam, type RouterToolsSeam, type RunAgenticOptions, RunLoopOptions, type RunPersonified, type RunPersonifiedOptions, type SandboxCapabilities, SandboxClient, type SandboxLineage, type SandboxLineageHandle, type SandboxRun, type SandboxSeam, Scope, type ScopeAnalyst, type ScopeAnalyzeInput, type ScopeWidenGate, type SessionCapableBox, Settled, type ShapeBudget, type ShapeContext, type ShapeRegistry, type Shell, type ShotPersona, type ShotSpec, SpawnEvent, SpawnJournal, Spend, type SteerContext, type Strategy, type StrategyCtx, type StrategyEvolutionConfig, type StrategyResult, SupervisedResult, Supervisor, type SurfaceScore, type ToolSpec, type TrajectoryNode, type TrajectoryReport, type TrajectoryReportFn, type TrajectoryReportOptions, TreeView, type TurnResult, UsageEvent, type UsageSink, type VerifierEnvironmentOptions, type Verify, type VerifySpec, type WaterfallCollector, type WaterfallReport, type WaterfallSpan, type Widen, type WidenDecision, type WidenLineage, type WidenSpec, type Workspace, type WorkspaceCommit, acquireSandbox, adaptiveRefine, anytimeReport, assertStrategyContract, assertTraceDerivedFindings, auditIntent, authorStrategy, breadthDriver, buildSteerContext, builtinShapes, contentAddress, createBudgetPool, createExecutor, createExecutorRegistry, createMcpEnvironment, createRootHandle, createSandboxLineage, createScope, createScopeAnalyst, createShapeRegistry, createSupervisor, createVerifierEnvironment, createWaterfallCollector, defaultAnalystInstruction, defaultAuditorInstruction, definePersona, defineStrategy, depthDriver, discriminatingMeans, equalKOnCost, extractLlmCallEvent, fanout, flatWidenGate, gitWorkspace, harvestCorpus, inlineSandboxClient, jjWorkspace, localShell, loopDispatch, loopUntil, mapSandboxEvent, materializeTreeView, observe, openSandboxRun, panel, pickChampion, pipeline, 
printBenchmarkReport, probeSandboxCapabilities, promotionGate, refine, registerShape, renderAnytimeTable, renderCorpusToInstructions, renderReport, replaySpawnTree, reportLoopUsage, runAgentic, runBenchmark, runPersonified, runStrategyEvolution, sample, sampleThenRefine, selectChampion, settledToIteration, spendFromUsageEvents, strategyAuthorContract, trajectoryReport, verify, widen };
package/dist/runtime.js CHANGED
@@ -7,6 +7,7 @@ import {
7
7
  InMemorySpawnJournal,
8
8
  acquireSandbox,
9
9
  adaptiveRefine,
10
+ anytimeReport,
10
11
  assertStrategyContract,
11
12
  assertTraceDerivedFindings,
12
13
  auditIntent,
@@ -29,6 +30,7 @@ import {
29
30
  createShapeRegistry,
30
31
  createSupervisor,
31
32
  createVerifierEnvironment,
33
+ createWaterfallCollector,
32
34
  defaultAnalystInstruction,
33
35
  defaultAuditorInstruction,
34
36
  defaultSelectWinner,
@@ -36,6 +38,7 @@ import {
36
38
  defineStrategy,
37
39
  depthDriver,
38
40
  deterministicCompletion,
41
+ discriminatingMeans,
39
42
  equalKOnCost,
40
43
  fanout,
41
44
  flatWidenGate,
@@ -50,6 +53,7 @@ import {
50
53
  observe,
51
54
  openSandboxRun,
52
55
  panel,
56
+ pickChampion,
53
57
  pipeline,
54
58
  printBenchmarkReport,
55
59
  probeSandboxCapabilities,
@@ -57,6 +61,7 @@ import {
57
61
  refine,
58
62
  registerShape,
59
63
  renderAnalyses,
64
+ renderAnytimeTable,
60
65
  renderCorpusToInstructions,
61
66
  renderReport,
62
67
  replaySpawnTree,
@@ -77,7 +82,7 @@ import {
77
82
  trajectoryReport,
78
83
  verify,
79
84
  widen
80
- } from "./chunk-IW2LMLK6.js";
85
+ } from "./chunk-PXUTIMGJ.js";
81
86
  import {
82
87
  extractLlmCallEvent,
83
88
  mapSandboxEvent
@@ -92,6 +97,7 @@ export {
92
97
  InMemorySpawnJournal,
93
98
  acquireSandbox,
94
99
  adaptiveRefine,
100
+ anytimeReport,
95
101
  assertStrategyContract,
96
102
  assertTraceDerivedFindings,
97
103
  auditIntent,
@@ -114,6 +120,7 @@ export {
114
120
  createShapeRegistry,
115
121
  createSupervisor,
116
122
  createVerifierEnvironment,
123
+ createWaterfallCollector,
117
124
  defaultAnalystInstruction,
118
125
  defaultAuditorInstruction,
119
126
  defaultSelectWinner,
@@ -121,6 +128,7 @@ export {
121
128
  defineStrategy,
122
129
  depthDriver,
123
130
  deterministicCompletion,
131
+ discriminatingMeans,
124
132
  equalKOnCost,
125
133
  extractLlmCallEvent,
126
134
  fanout,
@@ -137,6 +145,7 @@ export {
137
145
  observe,
138
146
  openSandboxRun,
139
147
  panel,
148
+ pickChampion,
140
149
  pipeline,
141
150
  printBenchmarkReport,
142
151
  probeSandboxCapabilities,
@@ -144,6 +153,7 @@ export {
144
153
  refine,
145
154
  registerShape,
146
155
  renderAnalyses,
156
+ renderAnytimeTable,
147
157
  renderCorpusToInstructions,
148
158
  renderReport,
149
159
  replaySpawnTree,
package/dist/workflow.js CHANGED
@@ -2,7 +2,7 @@ import {
2
2
  createSandboxForSpec,
3
3
  describeSandboxPlacement,
4
4
  runLoop
5
- } from "./chunk-IW2LMLK6.js";
5
+ } from "./chunk-PXUTIMGJ.js";
6
6
  import {
7
7
  ValidationError,
8
8
  extractLlmCallEvent
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tangle-network/agent-runtime",
3
- "version": "0.48.0",
3
+ "version": "0.49.0",
4
4
  "description": "Shared task-lifecycle skeleton for agents: a recursive loop kernel for chat turns, one-shot tasks, and multi-attempt loops, with trace capture and eval-gated self-improvement. Domain behavior lives in adapters; scoring and ship-gates in @tangle-network/agent-eval.",
5
5
  "homepage": "https://github.com/tangle-network/agent-runtime#readme",
6
6
  "repository": {
@@ -95,14 +95,14 @@
95
95
  "test:watch": "vitest",
96
96
  "lint": "biome check src tests examples",
97
97
  "lint:fix": "biome check --write src tests examples",
98
- "typecheck": "tsc --noEmit",
98
+ "typecheck": "tsc --noEmit && pnpm run typecheck:examples",
99
99
  "typecheck:examples": "tsc --noEmit -p tsconfig.examples.json",
100
100
  "verify:package": "node scripts/verify-package-exports.mjs"
101
101
  },
102
102
  "devDependencies": {
103
103
  "@biomejs/biome": "^2.4.0",
104
104
  "@tangle-network/agent-eval": "^0.89.0",
105
- "@tangle-network/sandbox": "^0.4.0",
105
+ "@tangle-network/sandbox": "^0.6.0",
106
106
  "@types/node": "^25.6.0",
107
107
  "playwright": "^1.40.0",
108
108
  "tsup": "^8.0.0",
@@ -112,7 +112,8 @@
112
112
  "pnpm": {
113
113
  "minimumReleaseAge": 4320,
114
114
  "minimumReleaseAgeExclude": [
115
- "@tangle-network/agent-eval"
115
+ "@tangle-network/agent-eval",
116
+ "@tangle-network/sandbox"
116
117
  ],
117
118
  "onlyBuiltDependencies": [
118
119
  "esbuild"
@@ -126,7 +127,7 @@
126
127
  "peerDependencies": {
127
128
  "@tangle-network/agent-eval": ">=0.83.0 <1.0.0",
128
129
  "@tangle-network/agent-knowledge": ">=1.3.0 <2.0.0",
129
- "@tangle-network/sandbox": ">=0.1.2 <0.5.0",
130
+ "@tangle-network/sandbox": ">=0.1.2 <0.7.0",
130
131
  "playwright": "^1.40.0"
131
132
  },
132
133
  "peerDependenciesMeta": {