npm - @tangle-network/agent-runtime - Versions diffs - 0.48.0 → 0.49.0 - Mend

@tangle-network/agent-runtime 0.48.0 → 0.49.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40)

package/README.md +79 -15
package/dist/agent.js +1 -1
package/dist/chunk-GHX7XOJ2.js +433 -0
package/dist/chunk-GHX7XOJ2.js.map +1 -0
package/dist/{chunk-TJS7S3HJ.js → chunk-IQS4HI3F.js} +14 -5
package/dist/chunk-IQS4HI3F.js.map +1 -0
package/dist/{chunk-IW2LMLK6.js → chunk-PXUTIMGJ.js} +767 -129
package/dist/chunk-PXUTIMGJ.js.map +1 -0
package/dist/{chunk-656G2XCL.js → chunk-U2VEWKKK.js} +3 -3
package/dist/{chunk-JNPK46YH.js → chunk-VIEDXELL.js} +408 -6
package/dist/chunk-VIEDXELL.js.map +1 -0
package/dist/{chunk-VR4JIC5H.js → chunk-XTEZ3YJ4.js} +2 -2
package/dist/index.d.ts +29 -4
package/dist/index.js +109 -21
package/dist/index.js.map +1 -1
package/dist/kb-gate-CsXpNRk7.d.ts +1145 -0
package/dist/{loop-runner-bin-DEm4roYF.d.ts → loop-runner-bin-Cgn0A-NW.d.ts} +1 -1
package/dist/loop-runner-bin.d.ts +2 -2
package/dist/loop-runner-bin.js +3 -3
package/dist/loops.d.ts +2 -2
package/dist/loops.js +11 -1
package/dist/mcp/bin.js +187 -24
package/dist/mcp/bin.js.map +1 -1
package/dist/mcp/index.d.ts +27 -124
package/dist/mcp/index.js +28 -6
package/dist/mcp/index.js.map +1 -1
package/dist/platform.js +2 -2
package/dist/platform.js.map +1 -1
package/dist/runtime.d.ts +285 -8
package/dist/runtime.js +11 -1
package/dist/workflow.js +1 -1
package/package.json +6 -5
package/dist/chunk-IW2LMLK6.js.map +0 -1
package/dist/chunk-JNPK46YH.js.map +0 -1
package/dist/chunk-LX66I3SC.js +0 -218
package/dist/chunk-LX66I3SC.js.map +0 -1
package/dist/chunk-TJS7S3HJ.js.map +0 -1
package/dist/kb-gate-51BlLlVM.d.ts +0 -529
package/dist/{chunk-656G2XCL.js.map → chunk-U2VEWKKK.js.map} +0 -0
package/dist/{chunk-VR4JIC5H.js.map → chunk-XTEZ3YJ4.js.map} +0 -0

package/dist/runtime.d.ts CHANGED

@@ -2,6 +2,7 @@ import { AgentProfile, BackendType, CreateSandboxOptions, SandboxInstance, Sandb
 export { AgentProfile, CreateSandboxOptions, SandboxEvent, SandboxInstance } from '@tangle-network/sandbox';
 import { R as ResultBlobStore, a as SpawnJournal, N as NodeId, b as SpawnEvent, T as TreeView, c as Settled, d as AgentSpec, E as ExecutorRegistry, B as Budget, A as Agent, e as RootHandle, f as SupervisedResult, g as Spend, S as Scope, h as ExecutorFactory, U as UsageEvent, i as Supervisor } from './types-BpDfCPUp.js';
 export { j as Executor, k as ExecutorContext, l as ExecutorResult, H as Handle, m as NodeSnapshot, n as NodeStatus, o as Restart, p as RootSignal, q as Runtime, r as SpawnOpts, s as SupervisorOpts, W as WidenGate } from './types-BpDfCPUp.js';
+import { R as RuntimeHooks } from './runtime-hooks-C7JwKb9E.js';
 import { ChatClient, AnalystFinding, DefaultVerdict, AgentProfile as AgentProfile$1 } from '@tangle-network/agent-eval';
 export { DefaultVerdict } from '@tangle-network/agent-eval';
 export { A as AnalyzeInput, a as CompletionAnalyst, b as CompletionEvidence, c as CompletionPolicy, d as CompletionVerdict, C as CreateDriverOptions, D as DriverDecision, P as PlannerContext, e as TopologyMove, T as TopologyPlanner, f as completionAuthorizes, g as createDriver, h as deterministicCompletion, r as renderAnalyses, s as sentinelCompletion, i as stopSentinel } from './driver-DYU2sgHr.js';
@@ -10,7 +11,6 @@ export { D as Driver, C as LoopDecisionPayload, F as LoopEndedPayload, G as Loop
 import { Scenario, ProfileDispatchFn } from '@tangle-network/agent-eval/campaign';
 import { R as RunLoopOptions } from './run-loop-DvD4aGiE.js';
 export { c as createSandboxForSpec, d as defaultSelectWinner, r as runLoop } from './run-loop-DvD4aGiE.js';
-import { R as RuntimeHooks } from './runtime-hooks-C7JwKb9E.js';
 /**
  * @experimental
@@ -113,6 +113,140 @@ declare function replaySpawnTree(journal: SpawnJournal, blobs: ResultBlobStore,
  */
 declare function materializeTreeView(events: SpawnEvent[]): TreeView;
+/**
+ * createWaterfallCollector — 100% trajectory observability from the lifecycle stream:
+ * every spawn/settle (shots, analysts, nested agents) becomes one timed, billed span.
+ * The sum of spans IS the run's cost story — what each step cost in dollars, tokens,
+ * and wall-clock, rendered as a text waterfall or exported as structured rows for any
+ * chart. Attach the collector's `hooks` to `runAgentic`/`runBenchmark`; spans accumulate
+ * across every task the hooks observe.
+ */
+interface WaterfallSpan {
+    id: string;
+    /** The spawn label (`shot:0`, `analyst:1`, a nested agent's label) — the row name. */
+    label: string;
+    runId: string;
+    parentId?: string;
+    startMs: number;
+    endMs?: number;
+    status: 'running' | 'done' | 'down';
+    usd: number;
+    tokens: {
+        input: number;
+        output: number;
+    };
+    score?: number;
+}
+interface WaterfallReport {
+    spans: WaterfallSpan[];
+    /** Wall-clock of the observed window (first spawn → last settle). */
+    totalMs: number;
+    totalUsd: number;
+    totalTokens: {
+        input: number;
+        output: number;
+    };
+    /** Rollup by label prefix (the part before ':') — shots vs analysts vs anything else. */
+    byKind: Record<string, {
+        count: number;
+        ms: number;
+        usd: number;
+        tokens: {
+            input: number;
+            output: number;
+        };
+    }>;
+}
+interface WaterfallCollector {
+    /** Attach these to RunAgenticOptions.hooks / BenchmarkConfig.hooks. */
+    hooks: RuntimeHooks;
+    report(): WaterfallReport;
+    /** The text waterfall — one row per span, bars scaled to the observed window. */
+    render(opts?: {
+        width?: number;
+        maxRows?: number;
+    }): string;
+    reset(): void;
+}
+declare function createWaterfallCollector(): WaterfallCollector;
+/**
+ * anytimeReport — time-to-satisfactory-output metrics, derived entirely from the
+ * waterfall's spans (no new instrumentation): per task, the best-so-far score after each
+ * shot with its elapsed wall-clock and cumulative spend; per strategy, the standard
+ * anytime-optimization metrics:
+ *
+ *   TTT  time-to-target — elapsed ms until best-so-far ≥ the target (per task; median
+ *        over tasks that reached it)
+ *   STT  shots-to-target — attempts until best-so-far ≥ target
+ *   ERT  expected running time (the COCO benchmarking convention): TOTAL time spent
+ *        across all tasks — including failures' full budgets — divided by the number of
+ *        tasks that reached the target. The honest "how long per success, all-in".
+ *   AUC  the anytime curve's area (mean best-so-far score across the budget, per shot
+ *        index) — higher = climbs earlier.
+ *
+ * The "satisfactory" bar follows the COCO/BBOB convention: a SET of satisficing targets
+ * (e.g. [0.5, 0.8, 1.0] on the normalized check score), each measured independently —
+ * runtime-to-target per (task, target) pair — optionally overridden per task
+ * (`targetFor`) when satisfaction is task-specific. Spans come from
+ * `createWaterfallCollector().report()`; tasks are grouped by the supervisor runId
+ * (`agentic:<strategy>:<taskId>`); shot spans are `shot:N` labels.
+ */
+interface AnytimeTaskCurve {
+    taskId: string;
+    strategy: string;
+    /** Best-so-far after each settled shot: elapsed ms from the task's first spawn,
+     *  cumulative usd, and the running max score. */
+    points: Array<{
+        elapsedMs: number;
+        cumUsd: number;
+        best: number;
+    }>;
+    /** Per satisficing target (keyed by the target value as a string): the first point
+     *  where best ≥ target, or null when never reached within budget. */
+    hits: Record<string, {
+        ms: number;
+        shots: number;
+        usd: number;
+    } | null>;
+}
+interface AnytimeStrategySummary {
+    strategy: string;
+    /** The satisficing target this row summarizes. */
+    target: number;
+    tasks: number;
+    reachedTarget: number;
+    /** Median time-to-target over the tasks that reached it (null when none did). */
+    medianTttMs: number | null;
+    medianShotsToTarget: number | null;
+    /** COCO ERT: Σ all task wall-time (incl. failures) / #successes. Null when 0 succeed. */
+    ertMs: number | null;
+    /** Same construction over dollars: Σ all spend / #successes. */
+    erUsd: number | null;
+    /** Mean best-so-far score by shot index (the anytime curve, averaged over tasks). */
+    curveByShot: number[];
+    /** Area under the per-shot anytime curve, normalized to [0,1]. */
+    auc: number;
+}
+interface AnytimeReport {
+    targets: number[];
+    perTask: AnytimeTaskCurve[];
+    /** One summary per (strategy, target) pair — the COCO-style multi-target view. */
+    perStrategy: AnytimeStrategySummary[];
+}
+/** Derive anytime metrics from waterfall spans. `targets` are the satisficing score
+ *  bars (default [1] = fully resolved; COCO-style multi-target: [0.5, 0.8, 1]);
+ *  `targetFor` overrides the bar per task (task-specific satisfaction) — when set, the
+ *  per-task bar replaces every entry of `targets` for that task. */
+declare function anytimeReport(spans: WaterfallSpan[], opts?: {
+    targets?: number[];
+    targetFor?: (taskId: string) => number;
+}): AnytimeReport;
+/** One row per (strategy, satisficing target): the shareable time-to-satisfactory table. */
+declare function renderAnytimeTable(report: AnytimeReport): string;
 /**
  * auditIntent — the route-rigor analyst: is this trajectory even going the RIGHT WAY?
  *
@@ -1144,6 +1278,9 @@ interface AgenticOptions {
     routerKey: string;
     model: string;
     temperature?: number;
+    /** Completion cap per worker turn — REQUIRED for thinking models (they burn unbounded
+     *  budgets on reasoning and return empty content without it). Omitted ⇒ provider default. */
+    maxTokens?: number;
     /** Turns the agent may take within ONE shot before the driver intervenes. */
     innerTurns?: number;
     /** The depth STEERER's analyst instruction (observe()'s system prompt). The knob a
@@ -1225,6 +1362,9 @@ interface ShotSpec {
     messages?: Msg[];
     steer?: string;
     persona?: ShotPersona;
+    /** Restrict THIS shot to a subset of the domain's tools (by name) — focus a shot on
+     *  the relevant capabilities. Restriction-only; unknown names throw. Omitted ⇒ all. */
+    tools?: string[];
 }
 interface StrategyResult {
     score: number;
@@ -1253,6 +1393,18 @@ interface StrategyCtx {
     shot(spec?: ShotSpec): Promise<ShotResult | null>;
     /** The firewalled critic reads the trajectory → a steer string, or null on COMPLETE/down. */
     critique(messages: Msg[]): Promise<string | null>;
+    /** The RAW analyst channel: the firewalled critic answers `instruction` over the
+     *  trajectory verbatim — no findings extraction, so verdict-shaped formats
+     *  (CONTINUE/STOP decisions, calibrated predictions) survive. Same firewall:
+     *  trajectory in, never scores. Null when the analyst went down. */
+    consult(messages: Msg[], instruction: string): Promise<string | null>;
+    /** The tools THIS artifact's task actually offers (names + descriptions only — never
+     *  the implementations). Tool sets vary per task on heterogeneous domains; a strategy
+     *  that restricts shots MUST select from this list, never from hardcoded names. */
+    listTools(handle: ArtifactHandle): Promise<Array<{
+        name: string;
+        description?: string;
+    }>>;
 }
 /** Author a Strategy from the composable steps — the open, compact way. */
 declare function defineStrategy(name: string, run: (ctx: StrategyCtx) => Promise<StrategyResult>): Strategy;
@@ -1346,6 +1498,10 @@ interface BenchmarkTaskRow {
     taskId: string;
     /** Per-strategy cells; absent when the task errored before completing all strategies. */
     cells?: Record<string, BenchmarkCell>;
+    /** Per-strategy failures on this task: the strategy competed, threw, and scored an
+     *  honest zero — it loses, it does not poison the row. The message is kept so a later
+     *  generation's author can see WHY a candidate died. */
+    errors?: Record<string, string>;
     /** Why the task was excluded (infra/setup failure) — never silently dropped. */
     error?: string;
 }
@@ -1757,6 +1913,13 @@ interface PromotionGateOptions {
     incumbent: string;
     /** The challenger's strategy name. */
     candidate: string;
+    /** 'superiority' (default): the candidate must score significantly BETTER.
+     *  'non-inferiority': the candidate must prove its score is not worse than the
+     *  incumbent by more than `scoreTolerance` AND its cost savings are significant —
+     *  the gate for "same quality, cheaper" claims. */
+    mode?: 'superiority' | 'non-inferiority';
+    /** non-inferiority: the score CI lower bound must clear −scoreTolerance. Default 0.05. */
+    scoreTolerance?: number;
     /** The CI lower bound on the paired lift must EXCEED this (score scale). Default 0. */
     deltaThreshold?: number;
     /** Minimum paired tasks before significance can be claimed. Default 6 — below that
@@ -1770,7 +1933,8 @@ interface PromotionGateOptions {
 }
 interface PromotionVerdict {
     promoted: boolean;
-    reason: 'identical-champion' | 'few-tasks' | 'no-margin' | 'significant';
+    reason: 'identical-champion' | 'few-tasks' | 'no-margin' | 'significant' | 'non-inferior-and-cheaper' | 'non-inferiority-unproven' | 'not-cheaper';
+    mode: 'superiority' | 'non-inferiority';
     /** Paired tasks that carried both strategies' cells. */
     n: number;
     /** Paired (candidate − incumbent) lift across the holdout tasks. */
@@ -1780,6 +1944,23 @@ interface PromotionVerdict {
         low: number;
         high: number;
     };
+    /** non-inferiority mode: paired (incumbent − candidate) cost SAVINGS per task (usd) —
+     *  positive means the candidate is cheaper; significant iff the CI low clears zero. */
+    costSavings?: {
+        mean: number;
+        median: number;
+        low: number;
+        high: number;
+    };
+    /** Paired (candidate − incumbent) wall-clock per task (ms) — negative = the candidate
+     *  is FASTER. Informational in every mode (never gates); the latency answer to "what
+     *  does this win actually cost the user?". */
+    latency?: {
+        mean: number;
+        median: number;
+        low: number;
+        high: number;
+    };
 }
 declare function promotionGate(opts: PromotionGateOptions): PromotionVerdict;
@@ -2227,7 +2408,7 @@ declare function openSandboxRun<Out>(client: SandboxClient, options: OpenSandbox
  */
 /** The compressed consumable a skill carries: everything an author needs to emit a loop. */
-declare const strategyAuthorContract = "\nYou author an OPTIMIZATION STRATEGY for an agentic loop system. A strategy decides how to\nspend a compute budget to beat a task's deployable check. You compose exactly two steps:\n\n  shot(spec?: { handle?, messages?, steer?, persona? }): Promise<ShotResult | null>\n    Runs ONE worker attempt (a bounded tool loop) over an artifact.\n    - omit handle  => the shot opens its OWN fresh artifact and closes it after (a sample).\n    - pass handle  => the shot CONTINUES that artifact (state accumulates across shots).\n    - messages     => the carried conversation (pass the previous ShotResult.messages to continue).\n    - steer        => a corrective instruction injected before the shot.\n    - persona      => { systemPrompt?, model? } \u2014 give THIS shot its own role and/or model\n      (multi-agent strategies: a researcher shot then an engineer shot, a panel of k\n      personas over one budget). On a fresh shot the systemPrompt replaces the task's; on\n      a carried conversation it arrives as a hand-off message. Same conserved budget.\n    ShotResult = { messages, score (0..1 on the task's check), passes, total, completions, toolErrors }\n    Returns null if the attempt failed infra-wise.\n\n  critique(messages): Promise<string | null>\n    A firewalled trace-analyst reads the attempt's trajectory and returns ONE corrective\n    instruction (or null when it judges the work complete). Costs ~1 completion.\n\n  surface.open(task) / surface.close(handle)\n    Open a persistent artifact you manage yourself (remember to close in a finally).\n\nRules:\n- Stay within ~budget total shots; every shot/critique spends from a conserved pool.\n- For a FRESH attempt OMIT `messages` entirely (never pass `[]` \u2014 an empty array is a\n  fresh conversation too, but be explicit). 
To CONTINUE, pass the previous\n  ShotResult.messages unchanged.\n- Return { score, resolved, completions, progression, shots } \u2014 score = the BEST checkpoint\n  you reached (keep-best, never final-state), progression = score after each shot.\n- The module must be EXACTLY this shape (no other imports, no commentary outside code):\n\nimport { defineStrategy } from '@tangle-network/agent-runtime/loops'\nexport default defineStrategy('your-strategy-name', async ({ surface, task, budget, shot, critique }) => {\n  // your composition\n})\n";
+declare const strategyAuthorContract = "\nYou author an OPTIMIZATION STRATEGY for an agentic loop system. A strategy decides how to\nspend a compute budget to beat a task's deployable check. You compose exactly two steps:\n\n  shot(spec?: { handle?, messages?, steer?, persona?, tools? }): Promise<ShotResult | null>\n    Runs ONE worker attempt (a bounded tool loop) over an artifact.\n    - omit handle  => the shot opens its OWN fresh artifact and closes it after (a sample).\n    - pass handle  => the shot CONTINUES that artifact (state accumulates across shots).\n    - messages     => the carried conversation (pass the previous ShotResult.messages to continue).\n    - steer        => a corrective instruction injected before the shot.\n    - persona      => { systemPrompt?, model? } \u2014 give THIS shot its own role and/or model\n      (multi-agent strategies: a researcher shot then an engineer shot, a panel of k\n      personas over one budget). On a fresh shot the systemPrompt replaces the task's; on\n      a carried conversation it arrives as a hand-off message. Same conserved budget.\n    - tools        => string[] \u2014 restrict THIS shot to a subset of the task's tools by\n      name (focus an explore shot on read-only tools, an execute shot on write tools).\n      Restriction-only; unknown names make the shot fail. ALWAYS select from\n      await listTools(handle) \u2014 never hardcode. Omitted => the shot sees every tool.\n    ShotResult = { messages, score (0..1 on the task's check), passes, total, completions, toolErrors }\n    Returns null if the attempt failed infra-wise.\n\n  critique(messages): Promise<string | null>\n    A firewalled trace-analyst reads the attempt's trajectory and returns ONE corrective\n    instruction (or null when it judges the work complete). 
Costs ~1 completion.\n\n  consult(messages, instruction): Promise<string | null>\n    The RAW analyst channel: the same firewalled critic answers YOUR instruction over the\n    trajectory verbatim (no reformatting) \u2014 use it when you need a specific reply format\n    (a decision, a prediction). Costs ~1 completion.\n\n  surface.open(task) / surface.close(handle)\n    Open a persistent artifact you manage yourself (remember to close in a finally).\n    close is idempotent \u2014 closing an already-closed handle is a safe no-op.\n\n  listTools(handle): Promise<Array<{ name, description? }>>\n    The tools THIS task actually offers. TOOL SETS VARY PER TASK \u2014 if you restrict a\n    shot with `tools`, you MUST pick names from await listTools(handle); hardcoding\n    names from an example kills your shots on every task whose tools differ.\n\nRules:\n- ALWAYS await every shot/critique/surface call \u2014 a floating promise that rejects\n  crashes the whole benchmark run.\n- Stay within ~budget total shots; every shot/critique spends from a conserved pool.\n- For a FRESH attempt OMIT `messages` entirely (never pass `[]` \u2014 an empty array is a\n  fresh conversation too, but be explicit). To CONTINUE, pass the previous\n  ShotResult.messages unchanged.\n- Return { score, resolved, completions, progression, shots } \u2014 score = the BEST checkpoint\n  you reached (keep-best, never final-state), progression = score after each shot.\n- The module must be EXACTLY this shape (no other imports, no commentary outside code):\n\nimport { defineStrategy } from '@tangle-network/agent-runtime/loops'\nexport default defineStrategy('your-strategy-name', async ({ surface, task, budget, shot, critique, listTools }) => {\n  // your composition (listTools comes from the destructured context \u2014 it is NOT a global)\n})\n";
 interface AuthorStrategyOptions {
     /** The model-call seam (agent-eval `createChatClient`). */
     chat: ChatClient;
@@ -2328,6 +2509,14 @@ interface StrategyEvolutionConfig {
     populationSize?: number;
     /** The gen0 field. Default [sample, refine, sampleThenRefine]. */
     baselines?: Strategy[];
+    /** What "better" means for PROMOTION. 'score' (default): the candidate must beat the
+     *  incumbent's score (superiority gate). 'cost': the candidate must prove score
+     *  NON-INFERIORITY (not worse by more than `scoreTolerance`) plus significant cost
+     *  savings — the "same quality, cheaper" objective. The author is told the objective
+     *  and sees per-task spend either way. */
+    objective?: 'score' | 'cost';
+    /** Cost objective: the score CI lower bound must clear −scoreTolerance. Default 0.05. */
+    scoreTolerance?: number;
     /** Search-side champion selection. Default 'costAware'. */
     champion?: ChampionPolicy;
     /** Score band treated as a tie under 'costAware'. Default 0.01. */
@@ -2336,6 +2525,48 @@ interface StrategyEvolutionConfig {
     outDir: string;
     /** Promotion-gate evidence floor (paired holdout tasks). */
     minPairedTasks?: number;
+    /** BAND-AWARE scoring — concentrate the measurement where lift is possible.
+     *  Holdout: draw `holdoutPoolN` candidate tasks and run `baselines[0]` once at the run
+     *  budget as an INDEPENDENT reference screen; keep tasks scoring ≤ `maxRefScore`
+     *  (headroom exists) and take the first `holdoutN`. Band membership is decided before
+     *  either finalist touches a task and both finalists then face the SAME tasks — the
+     *  estimand becomes "paired lift on headroom tasks", pre-registered by this config.
+     *  Train: champion selection ignores zero-spread tasks (every field strategy scored
+     *  identically — zero selection information, pure noise dilution). */
+    band?: {
+        holdoutPoolN: number;
+        /** Keep holdout tasks where the reference scores ≤ this. Default 0.99 — drop only
+         *  tasks the reference already solves fully (no headroom, a candidate can only tie). */
+        maxRefScore?: number;
+    };
+    /** What the author learns from a tournament. 'exact' (default) = scores + progressions
+     *  per task; 'binary' = pass/fail only — the leakage-bounded channel (one bit per cell
+     *  per generation reaches the author from the evaluation data). */
+    lossesDetail?: 'exact' | 'binary';
+    /** Reproducer certification (arXiv:2606.11045): when the final champion is AUTHORED,
+     *  compress it to a short natural-language summary, have a fresh author re-implement
+     *  from the summary alone (no losses, no code), and score the reproduction on the same
+     *  holdout. A reproduction gap is an overfitting signal (their detector: 100%
+     *  sensitivity / 91% specificity in the ML-agent setting) — recorded on the report,
+     *  never gate-blocking in v1. */
+    reproducerCheck?: {
+        /** Word budget for the strategy summary. Default 64. */
+        summaryMaxWords?: number;
+        /** Reproduction counts as faithful when reproducedScore ≥ championScore − tolerance.
+         *  Default 0.05. */
+        tolerance?: number;
+    };
+    /** Endurance: write the run state after every completed phase; with `resume`, a
+     *  restart skips completed phases (authored modules re-imported from their files).
+     *  Worst case after a mid-run death is re-paying ONE phase, never the run. */
+    checkpoint?: {
+        path: string;
+        resume?: boolean;
+    };
+    /** Called before each benchmark phase (gen0, gen1…, band-screen, holdout, reproduce).
+     *  The seam for environment recycling — no artifacts span phases, so a runner may
+     *  recreate a wedge-prone environment container here. */
+    onPhase?: (phase: string) => Promise<void>;
     onTask?: (phase: string, row: BenchmarkTaskRow, done: number, total: number) => void;
     hooks?: RuntimeHooks;
 }
@@ -2371,6 +2602,32 @@ interface EvolutionArchiveNode {
     score: number;
     usd: number;
 }
+interface ReproductionCheck {
+    /** The compressed strategy description the reproducer implemented from. */
+    summary: string;
+    reproducedName: string;
+    file?: string;
+    championHoldoutScore: number;
+    reproducedHoldoutScore: number;
+    /** champion − reproduced (positive = the reproduction fell short). */
+    gap: number;
+    /** reproducedScore ≥ championScore − tolerance. A failed reproduction is an
+     *  overfitting signal: the champion's win did not fit through the summary. */
+    reproducible: boolean;
+    /** Infra failure during reproduction (distinct from a semantic reproduction failure). */
+    error?: string;
+}
+interface EvolutionBandInfo {
+    /** Tasks screened by the reference on the holdout pool. */
+    screened: number;
+    /** Tasks kept (reference score ≤ maxRefScore) before truncating to holdoutN. */
+    inBand: number;
+    /** Reference scores per screened task (the screening record). */
+    refScores: Array<{
+        taskId: string;
+        score: number;
+    }>;
+}
 interface EvolutionReport {
     gen0: BenchmarkReport;
     gen0Champion: ChampionPick;
@@ -2379,6 +2636,11 @@ interface EvolutionReport {
     finalChampion: ChampionPick;
     holdout: BenchmarkReport;
     verdict: PromotionVerdict;
+    /** Present when band screening ran — the verdict's estimand is then "paired lift on
+     *  headroom tasks" (band membership fixed by the reference screen, pre-registered). */
+    band?: EvolutionBandInfo;
+    /** Present when reproducerCheck ran (final champion was authored). */
+    reproduction?: ReproductionCheck;
     /** SEARCH TELEMETRY, not evidence: each entry is that generation's own train-slice
      *  re-measurement, so cross-generation deltas mix true drift with run-to-run variance
      *  (entries are unpaired across generations). The only evidence-grade comparison in
@@ -2390,9 +2652,22 @@ interface EvolutionReport {
         usd: number;
     }>;
 }
-/** Search-side champion selection over a tournament report. 'score' takes the best mean
- *  score (ties → field order). 'costAware' treats scores within `epsilon` of the best as
- *  tied and takes the cheapest — the (score, $) Pareto rule collapsed to one pick. */
+/** Strategy means recomputed over the DISCRIMINATING tasks only — tasks where the field
+ *  strategies did not all score identically. Zero-spread tasks (everyone 1.0, everyone
+ *  0.0, everyone tied) carry no selection information; averaging over them dilutes real
+ *  differences toward zero. Search-side denoising only — the gate never uses this. */
+declare function discriminatingMeans(report: BenchmarkReport, fieldOrder: string[]): Record<string, {
+    score: number;
+    usd: number;
+}> | null;
+/** The champion pick over a means table. 'score' takes the best mean score (ties →
+ *  field order). 'costAware' treats scores within `epsilon` of the best as tied and
+ *  takes the cheapest — the (score, $) Pareto rule collapsed to one pick. */
+declare function pickChampion(means: Record<string, {
+    score: number;
+    usd: number;
+}>, fieldOrder: string[], policy: ChampionPolicy, epsilon: number): ChampionPick;
+/** Search-side champion selection over a tournament report. */
 declare function selectChampion(report: BenchmarkReport, fieldOrder: string[], policy: ChampionPolicy, epsilon: number): ChampionPick;
 declare function runStrategyEvolution(cfg: StrategyEvolutionConfig): Promise<EvolutionReport>;
@@ -2579,7 +2854,9 @@ interface RouterToolsSeam {
     model?: string;
     tools: ReadonlyArray<ToolSpec>;
     executeToolCall: (name: string, args: Record<string, unknown>, task: unknown) => Promise<string>;
-    /** Max inference turns (default 4). */
+    /** Max inference turns. Default 200 (runaway backstop — set far above any
+     *  legitimate workflow). For tighter per-workflow limits use a cost budget
+     *  or wall-clock deadline at the call site. */
     maxTurns?: number;
 }
 /**
@@ -2805,4 +3082,4 @@ declare function gitWorkspace(opts: GitWorkspaceOptions): Workspace;
  *  requires `jj` on the `Shell`'s host. */
 declare function jjWorkspace(opts: GitWorkspaceOptions): Workspace;
-export { Agent, AgentRunSpec, AgentSpec, type AgenticOptions, type AgenticRunResult, type AgenticSurface, type AgenticTask, type AgenticTool, type ArtifactHandle, type AssertTraceDerivedFindings, type AuditIntentInput, type AuditIntentOptions, type AuthorStrategyOptions, type AuthoredStrategy, type BenchmarkCell, type BenchmarkConfig, type BenchmarkLift, type BenchmarkReport, type BenchmarkStrategySummary, type BenchmarkTaskRow, type BridgeSeam, Budget, type BudgetPool, type BudgetReadout, type ChampionPick, type ChampionPolicy, type CheckpointCapableBox, type CliSeam, type CombinatorShape, type Corpus, type CorpusFilter, type CorpusRecord, type CreateScopeAnalystOptions, type CriuCapableClient, type DefinePersona, type DefinePersonaInput, type Deliverable, type Environment, type EqualKArm, type EqualKOnCost, type EqualKOnCostOptions, type EqualKVerdict, type EvolutionArchiveNode, type EvolutionAuthor, type EvolutionCandidate, type EvolutionGeneration, type EvolutionReport, ExecCtx, type ExecutorConfig, ExecutorFactory, ExecutorRegistry, type Fanout, type FanoutOptions, type FanoutSynthesis, FileCorpus, FileResultBlobStore, FileSpawnJournal, type FlatWidenGate, type ForkCapableBox, type GitWorkspaceOptions, type HarvestCorpusOptions, type HarvestFailure, type HarvestReport, InMemoryCorpus, InMemoryResultBlobStore, InMemorySpawnJournal, type IntentAudit, Iteration, type LoopDispatchOptions, type LoopOptionsForDispatch, LoopResult, type LoopShape, LoopTokenUsage, type LoopUntil, type LoopUntilSpec, type LoopUntilState, type McpEndpoint, type McpEnvironmentOptions, NodeId, type Observation, type ObserveInput, type ObserveOptions, type OpenSandboxRunOptions, type Outcome, type Panel, type PanelJudge, type PanelSpec, type PanelVerdict, type Persona, type PersonaContext, type PersonaExecutors, type Pipeline, type PipelineStage, type PromotionGateOptions, type PromotionVerdict, type RenderCorpusToInstructions, type RenderCorpusToInstructionsOptions, type 
ReservationTicket, ResultBlobStore, RootHandle, type RouterSeam, type RouterToolsSeam, type RunAgenticOptions, RunLoopOptions, type RunPersonified, type RunPersonifiedOptions, type SandboxCapabilities, SandboxClient, type SandboxLineage, type SandboxLineageHandle, type SandboxRun, type SandboxSeam, Scope, type ScopeAnalyst, type ScopeAnalyzeInput, type ScopeWidenGate, type SessionCapableBox, Settled, type ShapeBudget, type ShapeContext, type ShapeRegistry, type Shell, type ShotPersona, type ShotSpec, SpawnEvent, SpawnJournal, Spend, type SteerContext, type Strategy, type StrategyCtx, type StrategyEvolutionConfig, type StrategyResult, SupervisedResult, Supervisor, type SurfaceScore, type ToolSpec, type TrajectoryNode, type TrajectoryReport, type TrajectoryReportFn, type TrajectoryReportOptions, TreeView, type TurnResult, UsageEvent, type UsageSink, type VerifierEnvironmentOptions, type Verify, type VerifySpec, type Widen, type WidenDecision, type WidenLineage, type WidenSpec, type Workspace, type WorkspaceCommit, acquireSandbox, adaptiveRefine, assertStrategyContract, assertTraceDerivedFindings, auditIntent, authorStrategy, breadthDriver, buildSteerContext, builtinShapes, contentAddress, createBudgetPool, createExecutor, createExecutorRegistry, createMcpEnvironment, createRootHandle, createSandboxLineage, createScope, createScopeAnalyst, createShapeRegistry, createSupervisor, createVerifierEnvironment, defaultAnalystInstruction, defaultAuditorInstruction, definePersona, defineStrategy, depthDriver, equalKOnCost, extractLlmCallEvent, fanout, flatWidenGate, gitWorkspace, harvestCorpus, inlineSandboxClient, jjWorkspace, localShell, loopDispatch, loopUntil, mapSandboxEvent, materializeTreeView, observe, openSandboxRun, panel, pipeline, printBenchmarkReport, probeSandboxCapabilities, promotionGate, refine, registerShape, renderCorpusToInstructions, renderReport, replaySpawnTree, reportLoopUsage, runAgentic, runBenchmark, runPersonified, runStrategyEvolution, sample, 
sampleThenRefine, selectChampion, settledToIteration, spendFromUsageEvents, strategyAuthorContract, trajectoryReport, verify, widen };
+export { Agent, AgentRunSpec, AgentSpec, type AgenticOptions, type AgenticRunResult, type AgenticSurface, type AgenticTask, type AgenticTool, type AnytimeReport, type AnytimeStrategySummary, type AnytimeTaskCurve, type ArtifactHandle, type AssertTraceDerivedFindings, type AuditIntentInput, type AuditIntentOptions, type AuthorStrategyOptions, type AuthoredStrategy, type BenchmarkCell, type BenchmarkConfig, type BenchmarkLift, type BenchmarkReport, type BenchmarkStrategySummary, type BenchmarkTaskRow, type BridgeSeam, Budget, type BudgetPool, type BudgetReadout, type ChampionPick, type ChampionPolicy, type CheckpointCapableBox, type CliSeam, type CombinatorShape, type Corpus, type CorpusFilter, type CorpusRecord, type CreateScopeAnalystOptions, type CriuCapableClient, type DefinePersona, type DefinePersonaInput, type Deliverable, type Environment, type EqualKArm, type EqualKOnCost, type EqualKOnCostOptions, type EqualKVerdict, type EvolutionArchiveNode, type EvolutionAuthor, type EvolutionBandInfo, type EvolutionCandidate, type EvolutionGeneration, type EvolutionReport, ExecCtx, type ExecutorConfig, ExecutorFactory, ExecutorRegistry, type Fanout, type FanoutOptions, type FanoutSynthesis, FileCorpus, FileResultBlobStore, FileSpawnJournal, type FlatWidenGate, type ForkCapableBox, type GitWorkspaceOptions, type HarvestCorpusOptions, type HarvestFailure, type HarvestReport, InMemoryCorpus, InMemoryResultBlobStore, InMemorySpawnJournal, type IntentAudit, Iteration, type LoopDispatchOptions, type LoopOptionsForDispatch, LoopResult, type LoopShape, LoopTokenUsage, type LoopUntil, type LoopUntilSpec, type LoopUntilState, type McpEndpoint, type McpEnvironmentOptions, NodeId, type Observation, type ObserveInput, type ObserveOptions, type OpenSandboxRunOptions, type Outcome, type Panel, type PanelJudge, type PanelSpec, type PanelVerdict, type Persona, type PersonaContext, type PersonaExecutors, type Pipeline, type PipelineStage, type PromotionGateOptions, type 
PromotionVerdict, type RenderCorpusToInstructions, type RenderCorpusToInstructionsOptions, type ReservationTicket, ResultBlobStore, RootHandle, type RouterSeam, type RouterToolsSeam, type RunAgenticOptions, RunLoopOptions, type RunPersonified, type RunPersonifiedOptions, type SandboxCapabilities, SandboxClient, type SandboxLineage, type SandboxLineageHandle, type SandboxRun, type SandboxSeam, Scope, type ScopeAnalyst, type ScopeAnalyzeInput, type ScopeWidenGate, type SessionCapableBox, Settled, type ShapeBudget, type ShapeContext, type ShapeRegistry, type Shell, type ShotPersona, type ShotSpec, SpawnEvent, SpawnJournal, Spend, type SteerContext, type Strategy, type StrategyCtx, type StrategyEvolutionConfig, type StrategyResult, SupervisedResult, Supervisor, type SurfaceScore, type ToolSpec, type TrajectoryNode, type TrajectoryReport, type TrajectoryReportFn, type TrajectoryReportOptions, TreeView, type TurnResult, UsageEvent, type UsageSink, type VerifierEnvironmentOptions, type Verify, type VerifySpec, type WaterfallCollector, type WaterfallReport, type WaterfallSpan, type Widen, type WidenDecision, type WidenLineage, type WidenSpec, type Workspace, type WorkspaceCommit, acquireSandbox, adaptiveRefine, anytimeReport, assertStrategyContract, assertTraceDerivedFindings, auditIntent, authorStrategy, breadthDriver, buildSteerContext, builtinShapes, contentAddress, createBudgetPool, createExecutor, createExecutorRegistry, createMcpEnvironment, createRootHandle, createSandboxLineage, createScope, createScopeAnalyst, createShapeRegistry, createSupervisor, createVerifierEnvironment, createWaterfallCollector, defaultAnalystInstruction, defaultAuditorInstruction, definePersona, defineStrategy, depthDriver, discriminatingMeans, equalKOnCost, extractLlmCallEvent, fanout, flatWidenGate, gitWorkspace, harvestCorpus, inlineSandboxClient, jjWorkspace, localShell, loopDispatch, loopUntil, mapSandboxEvent, materializeTreeView, observe, openSandboxRun, panel, pickChampion, pipeline, 
printBenchmarkReport, probeSandboxCapabilities, promotionGate, refine, registerShape, renderAnytimeTable, renderCorpusToInstructions, renderReport, replaySpawnTree, reportLoopUsage, runAgentic, runBenchmark, runPersonified, runStrategyEvolution, sample, sampleThenRefine, selectChampion, settledToIteration, spendFromUsageEvents, strategyAuthorContract, trajectoryReport, verify, widen };

package/dist/runtime.js CHANGED Viewed

@@ -7,6 +7,7 @@ import {
   InMemorySpawnJournal,
   acquireSandbox,
   adaptiveRefine,
+  anytimeReport,
   assertStrategyContract,
   assertTraceDerivedFindings,
   auditIntent,
@@ -29,6 +30,7 @@ import {
   createShapeRegistry,
   createSupervisor,
   createVerifierEnvironment,
+  createWaterfallCollector,
   defaultAnalystInstruction,
   defaultAuditorInstruction,
   defaultSelectWinner,
@@ -36,6 +38,7 @@ import {
   defineStrategy,
   depthDriver,
   deterministicCompletion,
+  discriminatingMeans,
   equalKOnCost,
   fanout,
   flatWidenGate,
@@ -50,6 +53,7 @@ import {
   observe,
   openSandboxRun,
   panel,
+  pickChampion,
   pipeline,
   printBenchmarkReport,
   probeSandboxCapabilities,
@@ -57,6 +61,7 @@ import {
   refine,
   registerShape,
   renderAnalyses,
+  renderAnytimeTable,
   renderCorpusToInstructions,
   renderReport,
   replaySpawnTree,
@@ -77,7 +82,7 @@ import {
   trajectoryReport,
   verify,
   widen
-} from "./chunk-IW2LMLK6.js";
+} from "./chunk-PXUTIMGJ.js";
 import {
   extractLlmCallEvent,
   mapSandboxEvent
@@ -92,6 +97,7 @@ export {
   InMemorySpawnJournal,
   acquireSandbox,
   adaptiveRefine,
+  anytimeReport,
   assertStrategyContract,
   assertTraceDerivedFindings,
   auditIntent,
@@ -114,6 +120,7 @@ export {
   createShapeRegistry,
   createSupervisor,
   createVerifierEnvironment,
+  createWaterfallCollector,
   defaultAnalystInstruction,
   defaultAuditorInstruction,
   defaultSelectWinner,
@@ -121,6 +128,7 @@ export {
   defineStrategy,
   depthDriver,
   deterministicCompletion,
+  discriminatingMeans,
   equalKOnCost,
   extractLlmCallEvent,
   fanout,
@@ -137,6 +145,7 @@ export {
   observe,
   openSandboxRun,
   panel,
+  pickChampion,
   pipeline,
   printBenchmarkReport,
   probeSandboxCapabilities,
@@ -144,6 +153,7 @@ export {
   refine,
   registerShape,
   renderAnalyses,
+  renderAnytimeTable,
   renderCorpusToInstructions,
   renderReport,
   replaySpawnTree,

package/dist/workflow.js CHANGED Viewed

@@ -2,7 +2,7 @@ import {
   createSandboxForSpec,
   describeSandboxPlacement,
   runLoop
-} from "./chunk-IW2LMLK6.js";
+} from "./chunk-PXUTIMGJ.js";
 import {
   ValidationError,
   extractLlmCallEvent

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@tangle-network/agent-runtime",
-  "version": "0.48.0",
+  "version": "0.49.0",
   "description": "Shared task-lifecycle skeleton for agents: a recursive loop kernel for chat turns, one-shot tasks, and multi-attempt loops, with trace capture and eval-gated self-improvement. Domain behavior lives in adapters; scoring and ship-gates in @tangle-network/agent-eval.",
   "homepage": "https://github.com/tangle-network/agent-runtime#readme",
   "repository": {
@@ -95,14 +95,14 @@
     "test:watch": "vitest",
     "lint": "biome check src tests examples",
     "lint:fix": "biome check --write src tests examples",
-    "typecheck": "tsc --noEmit",
+    "typecheck": "tsc --noEmit && pnpm run typecheck:examples",
     "typecheck:examples": "tsc --noEmit -p tsconfig.examples.json",
     "verify:package": "node scripts/verify-package-exports.mjs"
   },
   "devDependencies": {
     "@biomejs/biome": "^2.4.0",
     "@tangle-network/agent-eval": "^0.89.0",
-    "@tangle-network/sandbox": "^0.4.0",
+    "@tangle-network/sandbox": "^0.6.0",
     "@types/node": "^25.6.0",
     "playwright": "^1.40.0",
     "tsup": "^8.0.0",
@@ -112,7 +112,8 @@
   "pnpm": {
     "minimumReleaseAge": 4320,
     "minimumReleaseAgeExclude": [
-      "@tangle-network/agent-eval"
+      "@tangle-network/agent-eval",
+      "@tangle-network/sandbox"
     ],
     "onlyBuiltDependencies": [
       "esbuild"
@@ -126,7 +127,7 @@
   "peerDependencies": {
     "@tangle-network/agent-eval": ">=0.83.0 <1.0.0",
     "@tangle-network/agent-knowledge": ">=1.3.0 <2.0.0",
-    "@tangle-network/sandbox": ">=0.1.2 <0.5.0",
+    "@tangle-network/sandbox": ">=0.1.2 <0.7.0",
     "playwright": "^1.40.0"
   },
   "peerDependenciesMeta": {