npm - @tangle-network/agent-runtime - Versions diffs - 0.48.0 → 0.50.0 - Mend

@tangle-network/agent-runtime 0.48.0 → 0.50.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (51) hide show

package/README.md +79 -15
package/dist/agent.d.ts +1 -1
package/dist/agent.js +1 -1
package/dist/analyst-loop.d.ts +1 -1
package/dist/{chunk-656G2XCL.js → chunk-BKAIVNFA.js} +3 -3
package/dist/{chunk-IW2LMLK6.js → chunk-CM2IK7VS.js} +913 -152
package/dist/chunk-CM2IK7VS.js.map +1 -0
package/dist/{chunk-VR4JIC5H.js → chunk-ML4IXGTV.js} +2 -2
package/dist/{chunk-TJS7S3HJ.js → chunk-NDM5VXZW.js} +19 -8
package/dist/chunk-NDM5VXZW.js.map +1 -0
package/dist/chunk-OM3YNZIW.js +978 -0
package/dist/chunk-OM3YNZIW.js.map +1 -0
package/dist/{chunk-JNPK46YH.js → chunk-RHW75JW5.js} +498 -350
package/dist/chunk-RHW75JW5.js.map +1 -0
package/dist/{coder-CVZNGbyg.d.ts → coder-_YCf3BAK.d.ts} +2 -2
package/dist/{driver-DYU2sgHr.d.ts → driver-DLI1io57.d.ts} +1 -1
package/dist/index.d.ts +34 -9
package/dist/index.js +117 -27
package/dist/index.js.map +1 -1
package/dist/kb-gate-CHAyt4aI.d.ts +1571 -0
package/dist/{loop-runner-bin-DEm4roYF.d.ts → loop-runner-bin-DFUNgpeK.d.ts} +4 -4
package/dist/loop-runner-bin.d.ts +5 -5
package/dist/loop-runner-bin.js +3 -3
package/dist/loops.d.ts +6 -6
package/dist/loops.js +17 -1
package/dist/mcp/bin.js +206 -29
package/dist/mcp/bin.js.map +1 -1
package/dist/mcp/index.d.ts +41 -177
package/dist/mcp/index.js +40 -6
package/dist/mcp/index.js.map +1 -1
package/dist/openai-tools-D4HLDWgw.d.ts +45 -0
package/dist/platform.js +2 -2
package/dist/platform.js.map +1 -1
package/dist/profiles.d.ts +2 -2
package/dist/{run-loop-DvD4aGiE.d.ts → run-loop-BIineL1T.d.ts} +1 -1
package/dist/runtime.d.ts +403 -24
package/dist/runtime.js +17 -1
package/dist/{types-BpDfCPUp.d.ts → types-5MGt5KTY.d.ts} +1 -1
package/dist/{types-nBMuollC.d.ts → types-BEQsBhOE.d.ts} +1 -1
package/dist/workflow.d.ts +2 -2
package/dist/workflow.js +1 -1
package/package.json +6 -5
package/dist/chunk-IW2LMLK6.js.map +0 -1
package/dist/chunk-JNPK46YH.js.map +0 -1
package/dist/chunk-LX66I3SC.js +0 -218
package/dist/chunk-LX66I3SC.js.map +0 -1
package/dist/chunk-TJS7S3HJ.js.map +0 -1
package/dist/kb-gate-51BlLlVM.d.ts +0 -529
package/dist/otel-export-EzfsVUhh.d.ts +0 -191
/package/dist/{chunk-656G2XCL.js.map → chunk-BKAIVNFA.js.map} +0 -0
/package/dist/{chunk-VR4JIC5H.js.map → chunk-ML4IXGTV.js.map} +0 -0

package/dist/{chunk-IW2LMLK6.js → chunk-CM2IK7VS.js} RENAMED Viewed

@@ -426,6 +426,103 @@ function isNoEntError(err) {
   return typeof err === "object" && err !== null && "code" in err && err.code === "ENOENT";
 }
+// src/runtime/anytime.ts
+var median = (xs) => {
+  if (xs.length === 0) return null;
+  const s = [...xs].sort((a, b) => a - b);
+  const mid = Math.floor(s.length / 2);
+  return s.length % 2 === 1 ? s[mid] : (s[mid - 1] + s[mid]) / 2;
+};
+function anytimeReport(spans, opts) {
+  const targets = opts?.targets ?? [1];
+  const byRun = /* @__PURE__ */ new Map();
+  for (const s of spans) {
+    if (!s.label.startsWith("shot:")) continue;
+    const list = byRun.get(s.runId) ?? [];
+    list.push(s);
+    byRun.set(s.runId, list);
+  }
+  const perTask = [];
+  for (const [runId, shots] of byRun) {
+    const m = runId.match(/^agentic:(.+):(.+)$/);
+    const strategy = m?.[1] ?? runId;
+    const taskId = m?.[2] ?? runId;
+    const ordered = [...shots].sort((a, b) => (a.endMs ?? a.startMs) - (b.endMs ?? b.startMs));
+    const t0 = Math.min(...ordered.map((s) => s.startMs));
+    const taskTargets = opts?.targetFor ? [opts.targetFor(taskId)] : targets;
+    let best = 0;
+    let cumUsd = 0;
+    const points = [];
+    const hits = {};
+    for (const t of taskTargets) hits[String(t)] = null;
+    for (const s of ordered) {
+      cumUsd += s.usd;
+      if (typeof s.score === "number" && s.score > best) best = s.score;
+      const elapsedMs = (s.endMs ?? s.startMs) - t0;
+      points.push({ elapsedMs, cumUsd, best });
+      for (const t of taskTargets) {
+        if (hits[String(t)] === null && best >= t) {
+          hits[String(t)] = { ms: elapsedMs, shots: points.length, usd: cumUsd };
+        }
+      }
+    }
+    perTask.push({ taskId, strategy, points, hits });
+  }
+  const byStrategy = /* @__PURE__ */ new Map();
+  for (const t of perTask) {
+    const list = byStrategy.get(t.strategy) ?? [];
+    list.push(t);
+    byStrategy.set(t.strategy, list);
+  }
+  const perStrategy = [];
+  for (const [strategy, tasks] of byStrategy) {
+    const totalMs = tasks.reduce((s, t) => s + (t.points[t.points.length - 1]?.elapsedMs ?? 0), 0);
+    const totalUsd = tasks.reduce((s, t) => s + (t.points[t.points.length - 1]?.cumUsd ?? 0), 0);
+    const maxShots = Math.max(0, ...tasks.map((t) => t.points.length));
+    const curveByShot = [];
+    for (let i = 0; i < maxShots; i += 1) {
+      const vals = tasks.map(
+        (t) => t.points[Math.min(i, t.points.length - 1)].best
+      );
+      curveByShot.push(vals.reduce((s, v) => s + v, 0) / vals.length);
+    }
+    const auc = curveByShot.length > 0 ? curveByShot.reduce((s, v) => s + v, 0) / curveByShot.length : 0;
+    const summaryTargets = opts?.targetFor ? [Number.NaN] : targets;
+    for (const t of summaryTargets) {
+      const key = (taskCurve) => opts?.targetFor ? Object.values(taskCurve.hits)[0] ?? null : taskCurve.hits[String(t)] ?? null;
+      const reached = tasks.filter((x) => key(x) !== null);
+      perStrategy.push({
+        strategy,
+        target: t,
+        tasks: tasks.length,
+        reachedTarget: reached.length,
+        medianTttMs: median(reached.map((x) => key(x).ms)),
+        medianShotsToTarget: median(reached.map((x) => key(x).shots)),
+        ertMs: reached.length > 0 ? totalMs / reached.length : null,
+        erUsd: reached.length > 0 ? totalUsd / reached.length : null,
+        curveByShot,
+        auc
+      });
+    }
+  }
+  perStrategy.sort((a, b) => a.strategy.localeCompare(b.strategy) || a.target - b.target);
+  return { targets, perTask, perStrategy };
+}
+function renderAnytimeTable(report) {
+  const lines = [
+    `anytime metrics \xB7 satisficing targets [${report.targets.join(", ")}] \xB7 ERT = \u03A3 all wall-time / #successes (COCO)`,
+    "strategy            \u2265tgt   reach   med-TTT   med-shots   ERT(all-in)   $/success   AUC   curve"
+  ];
+  for (const s of report.perStrategy) {
+    const curve = s.curveByShot.map((v) => "\u2581\u2582\u2583\u2584\u2585\u2586\u2587\u2588"[Math.min(7, Math.floor(v * 8))]).join("");
+    const tgt = Number.isNaN(s.target) ? "task" : s.target.toFixed(2);
+    lines.push(
+      `${s.strategy.padEnd(19)} ${tgt.padStart(4)} ${String(s.reachedTarget).padStart(4)}/${String(s.tasks).padEnd(3)} ${s.medianTttMs === null ? "      \u2014" : `${(s.medianTttMs / 1e3).toFixed(1).padStart(6)}s`}   ${s.medianShotsToTarget === null ? "    \u2014" : String(s.medianShotsToTarget).padStart(5)}   ${s.ertMs === null ? "         \u2014" : `${(s.ertMs / 1e3).toFixed(1).padStart(9)}s`}   ${s.erUsd === null ? "       \u2014" : `$${s.erUsd.toFixed(4)}`}   ${s.auc.toFixed(2)}   ${curve}`
+    );
+  }
+  return lines.join("\n");
+}
 // src/runtime/audit-intent.ts
 var defaultAuditorInstruction = "You audit whether an AI agent is on the RIGHT ROUTE \u2014 not whether it works hard, but whether its actions serve the stated intents. Infer the REVEALED intent from the action pattern (what the trajectory is actually optimizing). Compare against the declared task intent, the user intent when given, and the meta-intent when given. Flawless execution down the wrong route is DIVERGED. Busy-work that neither advances nor harms is DRIFTING. Judge only from the trajectory \u2014 be specific about which actions ground your verdict. Recommend abort only when continuing cannot serve the intent.";
 function summarize(trace, maxLines) {
@@ -2346,20 +2443,20 @@ async function finalizeSettlement(child, settlement, seq, args, now) {
 }
 async function runChild(live, executor, childAbort, task, opts, pool2, ticket, blobs) {
   let reconciled = false;
-  const reconcileOnce = (spend2) => {
+  const reconcileOnce = (spend) => {
     if (reconciled) return;
     reconciled = true;
-    pool2.reconcile(ticket, clampSpend(spend2, opts.budget));
+    pool2.reconcile(ticket, clampSpend(spend, opts.budget));
   };
   try {
     live.status = "running";
     const ran = executor.execute(task, childAbort.signal);
     let artifact;
     if (isAsyncIterable2(ran)) {
-      const spend2 = await foldStream(ran);
-      live.spent = spend2;
+      const spend = await foldStream(ran);
+      live.spent = spend;
       artifact = executor.resultArtifact();
-      reconcileOnce(spend2);
+      reconcileOnce(spend);
     } else {
       const terminal = await ran;
       live.spent = terminal.spent;
@@ -2448,21 +2545,21 @@ async function foldStream(stream) {
   }
   return { iterations, tokens, usd, ms: 0 };
 }
-function clampSpend(spend2, budget) {
-  const totalTokens2 = spend2.tokens.input + spend2.tokens.output;
+function clampSpend(spend, budget) {
+  const totalTokens2 = spend.tokens.input + spend.tokens.output;
   const tokensOk = totalTokens2 <= budget.maxTokens;
-  const itersOk = spend2.iterations <= budget.maxIterations;
-  const usdOk = budget.maxUsd === void 0 || spend2.usd <= budget.maxUsd;
-  if (tokensOk && itersOk && usdOk) return spend2;
+  const itersOk = spend.iterations <= budget.maxIterations;
+  const usdOk = budget.maxUsd === void 0 || spend.usd <= budget.maxUsd;
+  if (tokensOk && itersOk && usdOk) return spend;
   const ratio = !tokensOk && totalTokens2 > 0 ? budget.maxTokens / totalTokens2 : 1;
   return {
-    iterations: Math.min(spend2.iterations, budget.maxIterations),
+    iterations: Math.min(spend.iterations, budget.maxIterations),
     tokens: ratio < 1 ? {
-      input: Math.floor(spend2.tokens.input * ratio),
-      output: Math.floor(spend2.tokens.output * ratio)
-    } : spend2.tokens,
-    usd: budget.maxUsd === void 0 ? spend2.usd : Math.min(spend2.usd, budget.maxUsd),
-    ms: spend2.ms
+      input: Math.floor(spend.tokens.input * ratio),
+      output: Math.floor(spend.tokens.output * ratio)
+    } : spend.tokens,
+    usd: budget.maxUsd === void 0 ? spend.usd : Math.min(spend.usd, budget.maxUsd),
+    ms: spend.ms
   };
 }
 async function teardownSafe(executor, grace) {
@@ -3012,7 +3109,138 @@ function isNoEntError2(err) {
 // src/runtime/supervise/runtime.ts
 import { spawn } from "child_process";
+import { estimateCost as estimateCost2, isModelPriced as isModelPriced2 } from "@tangle-network/agent-eval";
+// src/runtime/router-client.ts
 import { estimateCost, isModelPriced } from "@tangle-network/agent-eval";
+async function routerChatWithUsage(cfg, messages, opts) {
+  const url = `${cfg.routerBaseUrl.replace(/\/$/, "")}/chat/completions`;
+  const headers = { "content-type": "application/json", authorization: `Bearer ${cfg.routerKey}` };
+  let temperature = opts?.temperature ?? 0.2;
+  let lastErr = "";
+  for (let attempt = 0; attempt < 5; attempt += 1) {
+    const res = await fetch(url, {
+      method: "POST",
+      headers,
+      // max_tokens default is generous: THINKING models (kimi-k2.6) spend the budget on
+      // reasoning_content first — a small router default yields EMPTY content.
+      body: JSON.stringify({
+        model: cfg.model,
+        messages,
+        temperature,
+        max_tokens: opts?.maxTokens ?? 8192
+      }),
+      ...opts?.signal ? { signal: opts.signal } : {}
+    });
+    if (res.ok) return parseChatResult(await res.json(), cfg.model);
+    const status = res.status;
+    const text = (await res.text()).slice(0, 200);
+    lastErr = `router ${status}: ${text}`;
+    if (status === 400 && /temperature/i.test(text) && temperature !== 1) {
+      temperature = 1;
+      continue;
+    }
+    if (![408, 425, 429, 500, 502, 503, 504, 520, 522, 524].includes(status))
+      throw new Error(lastErr);
+    if (attempt < 4) await new Promise((r) => setTimeout(r, 800 * 2 ** attempt));
+  }
+  throw new Error(`${lastErr} (exhausted retries)`);
+}
+function parseChatResult(json, model) {
+  const data = json;
+  const u = data.usage;
+  const usage = u && typeof u.prompt_tokens === "number" && typeof u.completion_tokens === "number" ? { input: u.prompt_tokens, output: u.completion_tokens } : void 0;
+  const costUsd = usage && isModelPriced(model) ? estimateCost(usage.input, usage.output, model) : void 0;
+  return {
+    content: data.choices?.[0]?.message?.content ?? "",
+    ...usage ? { usage } : {},
+    ...costUsd !== void 0 ? { costUsd } : {}
+  };
+}
+async function routerChatWithTools(cfg, messages, tools, opts) {
+  const res = await fetch(`${cfg.routerBaseUrl.replace(/\/$/, "")}/chat/completions`, {
+    method: "POST",
+    headers: { "content-type": "application/json", authorization: `Bearer ${cfg.routerKey}` },
+    body: JSON.stringify({
+      model: cfg.model,
+      messages,
+      tools,
+      tool_choice: opts?.toolChoice ?? "auto",
+      temperature: opts?.temperature ?? 0.3
+    }),
+    ...opts?.signal ? { signal: opts.signal } : {}
+  });
+  if (!res.ok) throw new Error(`router ${res.status}: ${(await res.text()).slice(0, 200)}`);
+  const data = await res.json();
+  const msg = data.choices?.[0]?.message;
+  const toolCalls = (msg?.tool_calls ?? []).map((tc, i) => ({
+    id: tc.id ?? `call_${i}`,
+    name: tc.function?.name ?? "",
+    arguments: tc.function?.arguments ?? "{}"
+  }));
+  const u = data.usage;
+  const usage = u && typeof u.prompt_tokens === "number" && typeof u.completion_tokens === "number" ? { input: u.prompt_tokens, output: u.completion_tokens } : void 0;
+  const costUsd = usage && isModelPriced(cfg.model) ? estimateCost(usage.input, usage.output, cfg.model) : void 0;
+  return {
+    content: msg?.content ?? null,
+    toolCalls,
+    ...usage ? { usage } : {},
+    ...costUsd !== void 0 ? { costUsd } : {}
+  };
+}
+async function routerToolLoop(cfg, system, user, tools, execute, opts) {
+  const maxTurns = opts?.maxTurns ?? 4;
+  const messages = [
+    { role: "system", content: system },
+    { role: "user", content: user }
+  ];
+  let toolCalls = 0;
+  let lastText = "";
+  const usage = { input: 0, output: 0 };
+  const toolTrace = [];
+  for (let turn = 1; turn <= maxTurns; turn += 1) {
+    const r = await routerChatWithTools(cfg, messages, tools, {
+      ...opts?.temperature !== void 0 ? { temperature: opts.temperature } : {},
+      ...opts?.signal ? { signal: opts.signal } : {}
+    });
+    if (r.usage) {
+      usage.input += r.usage.input;
+      usage.output += r.usage.output;
+    }
+    if (r.content) lastText = r.content;
+    if (r.toolCalls.length === 0)
+      return { final: lastText, turns: turn, toolCalls, toolTrace, usage };
+    messages.push({
+      role: "assistant",
+      content: r.content ?? "",
+      tool_calls: r.toolCalls.map((tc) => ({
+        id: tc.id,
+        type: "function",
+        function: { name: tc.name, arguments: tc.arguments }
+      }))
+    });
+    for (const tc of r.toolCalls) {
+      toolCalls += 1;
+      let args = {};
+      try {
+        args = JSON.parse(tc.arguments);
+      } catch {
+        messages.push({
+          role: "tool",
+          tool_call_id: tc.id,
+          content: `error: arguments were not valid JSON: ${tc.arguments.slice(0, 200)}`
+        });
+        continue;
+      }
+      const out = await execute(tc.name, args);
+      messages.push({ role: "tool", tool_call_id: tc.id, content: out });
+      toolTrace.push({ name: tc.name, args: tc.arguments, result: out });
+    }
+  }
+  return { final: lastText, turns: maxTurns, toolCalls, toolTrace, usage };
+}
+// src/runtime/supervise/runtime.ts
 var routerSeamKey = "router";
 var sandboxSeamKey = "sandbox";
 var cliSeamKey = "cli";
@@ -3058,30 +3286,19 @@ var routerInlineExecutor = (spec, ctx) => {
       const messages = taskToMessages(task, spec);
       const started = Date.now();
       const linked = linkSignals(signal, controller.signal);
-      const res = await fetch(`${seam.routerBaseUrl.replace(/\/$/, "")}/chat/completions`, {
-        method: "POST",
-        headers: { "content-type": "application/json", authorization: `Bearer ${seam.routerKey}` },
-        body: JSON.stringify({ model, messages, temperature: 0.2 }),
-        ...linked ? { signal: linked } : {}
-      });
-      if (!res.ok) {
-        throw new ValidationError(
-          `routerInlineExecutor: router ${res.status}: ${(await res.text()).slice(0, 200)}`
-        );
-      }
-      const data = await res.json();
-      const u = data.usage;
-      const usage = u && typeof u.prompt_tokens === "number" && typeof u.completion_tokens === "number" ? { input: u.prompt_tokens, output: u.completion_tokens } : void 0;
-      const usd = usage && isModelPriced(model) ? estimateCost(usage.input, usage.output, model) : 0;
-      const content = data.choices?.[0]?.message?.content ?? "";
+      const r = await routerChatWithUsage(
+        { routerBaseUrl: seam.routerBaseUrl, routerKey: seam.routerKey, model },
+        messages,
+        linked ? { signal: linked } : {}
+      );
       const spent = {
         iterations: 1,
-        tokens: usage ? { input: usage.input, output: usage.output } : zeroTokenUsage(),
-        usd,
+        tokens: r.usage ? { input: r.usage.input, output: r.usage.output } : zeroTokenUsage(),
+        usd: r.costUsd ?? 0,
         ms: Date.now() - started
       };
-      const out = { content };
-      artifact = { outRef: contentRef("router", { model, content }), out, spent };
+      const out = { content: r.content };
+      artifact = { outRef: contentRef("router", { model, content: r.content }), out, spent };
       return artifact;
     },
     teardown(_grace) {
@@ -3110,7 +3327,7 @@ var routerToolsInlineExecutor = (spec, ctx) => {
       "routerToolsInlineExecutor: RouterToolsSeam.routerBaseUrl + routerKey required"
     );
   }
-  const maxTurns = seam.maxTurns ?? 4;
+  const maxTurns = seam.maxTurns ?? 200;
   const controller = new AbortController();
   const abortIfSignalled = () => {
     if (ctx.signal.aborted) controller.abort();
@@ -3188,7 +3405,7 @@ var routerToolsInlineExecutor = (spec, ctx) => {
           messages.push({ role: "tool", tool_call_id: id, content: result });
         }
       }
-      const usd = isModelPriced(model) ? estimateCost(tokens.input, tokens.output, model) : 0;
+      const usd = isModelPriced2(model) ? estimateCost2(tokens.input, tokens.output, model) : 0;
       const spent = { iterations: turns, tokens, usd, ms: Date.now() - started };
       const out = { content: lastText };
       artifact = { outRef: contentRef("router-tools", { model, content: lastText }), out, spent };
@@ -4228,12 +4445,12 @@ function countStatuses(reported) {
 function zeroSpend4() {
   return { iterations: 0, tokens: zeroTokenUsage(), usd: 0, ms: 0 };
 }
-function cloneSpend(spend2) {
+function cloneSpend(spend) {
   return {
-    iterations: spend2.iterations,
-    tokens: { input: spend2.tokens.input, output: spend2.tokens.output },
-    usd: spend2.usd,
-    ms: spend2.ms
+    iterations: spend.iterations,
+    tokens: { input: spend.tokens.input, output: spend.tokens.output },
+    usd: spend.usd,
+    ms: spend.ms
   };
 }
 function addSpend(acc, delta) {
@@ -4249,13 +4466,13 @@ function spreadOf(values) {
 function fractionalSpread(values) {
   const spread = spreadOf(values);
   if (spread === 0) return 0;
-  const median = medianOf(values);
-  if (median === 0) {
+  const median2 = medianOf(values);
+  if (median2 === 0) {
     throw new Error(
       "equalKOnCost: arms have a non-zero cost spread on a zero-median channel; cannot express it as a fraction"
     );
   }
-  return spread / median;
+  return spread / median2;
 }
 function medianOf(values) {
   if (values.length === 0) {
@@ -4287,28 +4504,34 @@ function requireNode2(nodes, id, root) {
   return node;
 }
 function requireSpend(rolled, id, root) {
-  const spend2 = rolled.get(id);
-  if (!spend2) {
+  const spend = rolled.get(id);
+  if (!spend) {
     throw new Error(
       `trajectoryReport: node '${id}' was never rolled up in tree '${root}' (unreachable from root)`
     );
   }
-  return spend2;
+  return spend;
 }
 // src/runtime/promotion-gate.ts
 import { heldoutSignificance } from "@tangle-network/agent-eval/campaign";
 function promotionGate(opts) {
+  const mode = opts.mode ?? "superiority";
   if (opts.candidate === opts.incumbent) {
     return {
       promoted: false,
       reason: "identical-champion",
+      mode,
       n: 0,
       lift: { mean: 0, median: 0, low: 0, high: 0 }
     };
   }
   const before = [];
   const after = [];
+  const incUsd = [];
+  const candUsd = [];
+  const incMs = [];
+  const candMs = [];
   const cellIds = [];
   for (const row of opts.report.perTask) {
     const inc = row.cells?.[opts.incumbent];
@@ -4316,6 +4539,10 @@ function promotionGate(opts) {
     if (!inc || !cand) continue;
     before.push(inc.score);
     after.push(cand.score);
+    incUsd.push(inc.usd);
+    candUsd.push(cand.usd);
+    incMs.push(inc.ms);
+    candMs.push(cand.ms);
     cellIds.push(row.taskId);
   }
   if (before.length === 0) {
@@ -4339,15 +4566,91 @@ function promotionGate(opts) {
     low: sig.bootstrap.low,
     high: sig.bootstrap.high
   };
-  if (sig.fewRuns) return { promoted: false, reason: "few-tasks", n: sig.n, lift };
-  return sig.significant ? { promoted: true, reason: "significant", n: sig.n, lift } : { promoted: false, reason: "no-margin", n: sig.n, lift };
+  const latSig = heldoutSignificance(
+    { before: incMs, after: candMs, cellIds },
+    {
+      deltaThreshold: 0,
+      minProductiveRuns: 1,
+      statistic: opts.statistic ?? "mean",
+      ...opts.seed !== void 0 ? { seed: opts.seed } : {},
+      ...opts.resamples !== void 0 ? { resamples: opts.resamples } : {}
+    }
+  );
+  const latency = {
+    mean: latSig.bootstrap.mean,
+    median: latSig.bootstrap.median,
+    low: latSig.bootstrap.low,
+    high: latSig.bootstrap.high
+  };
+  if (mode === "superiority") {
+    if (sig.fewRuns) return { promoted: false, reason: "few-tasks", mode, n: sig.n, lift, latency };
+    return sig.significant ? { promoted: true, reason: "significant", mode, n: sig.n, lift, latency } : { promoted: false, reason: "no-margin", mode, n: sig.n, lift, latency };
+  }
+  const tolerance = opts.scoreTolerance ?? 0.05;
+  const scoreSig = heldoutSignificance(
+    { before, after, cellIds },
+    {
+      deltaThreshold: -tolerance,
+      minProductiveRuns: opts.minPairedTasks ?? 6,
+      statistic: opts.statistic ?? "mean",
+      ...opts.seed !== void 0 ? { seed: opts.seed } : {},
+      ...opts.resamples !== void 0 ? { resamples: opts.resamples } : {}
+    }
+  );
+  const costSig = heldoutSignificance(
+    { before: candUsd, after: incUsd, cellIds },
+    {
+      deltaThreshold: 0,
+      minProductiveRuns: opts.minPairedTasks ?? 6,
+      statistic: opts.statistic ?? "mean",
+      ...opts.seed !== void 0 ? { seed: opts.seed } : {},
+      ...opts.resamples !== void 0 ? { resamples: opts.resamples } : {}
+    }
+  );
+  const costSavings = {
+    mean: costSig.bootstrap.mean,
+    median: costSig.bootstrap.median,
+    low: costSig.bootstrap.low,
+    high: costSig.bootstrap.high
+  };
+  if (scoreSig.fewRuns)
+    return { promoted: false, reason: "few-tasks", mode, n: scoreSig.n, lift, costSavings, latency };
+  if (!scoreSig.significant)
+    return {
+      promoted: false,
+      reason: "non-inferiority-unproven",
+      mode,
+      n: scoreSig.n,
+      lift,
+      costSavings,
+      latency
+    };
+  if (!costSig.significant)
+    return {
+      promoted: false,
+      reason: "not-cheaper",
+      mode,
+      n: scoreSig.n,
+      lift,
+      costSavings,
+      latency
+    };
+  return {
+    promoted: true,
+    reason: "non-inferior-and-cheaper",
+    mode,
+    n: scoreSig.n,
+    lift,
+    costSavings,
+    latency
+  };
 }
 // src/runtime/run-benchmark.ts
 import { pairedBootstrap, paretoFrontier } from "@tangle-network/agent-eval";
 // src/runtime/strategy.ts
-import { createChatClient, estimateCost as estimateCost2, isModelPriced as isModelPriced2 } from "@tangle-network/agent-eval";
+import { createChatClient, estimateCost as estimateCost3, isModelPriced as isModelPriced3 } from "@tangle-network/agent-eval";
 var taskNudge = "Use the available tools to bring the artifact to the required final state. Address EVERY distinct change the request implies. After each tool result, check what remains and continue. Re-read the values you set to confirm they took. Reply DONE only once every required change is made and verified.";
 async function runShot(surface, _task, handle, tools, messages, opts, modelOverride) {
   const innerTurns = opts.innerTurns ?? 4;
@@ -4364,7 +4667,8 @@ async function runShot(surface, _task, handle, tools, messages, opts, modelOverr
         messages,
         tools,
         tool_choice: "auto",
-        temperature: opts.temperature ?? 0.7
+        temperature: opts.temperature ?? 0.7,
+        ...opts.maxTokens ? { max_tokens: opts.maxTokens } : {}
       })
     });
     if (!res.ok) throw new Error(`router ${res.status}: ${(await res.text()).slice(0, 200)}`);
@@ -4403,12 +4707,15 @@ async function runShot(surface, _task, handle, tools, messages, opts, modelOverr
   }
   return { messages, completions, toolCalls, toolErrors, tokens };
 }
-async function analyze(task, messages, opts) {
-  const trajectory = messages.filter((m) => m.role === "assistant" || m.role === "tool").map((m) => {
+function compactTrajectory(messages) {
+  return messages.filter((m) => m.role === "assistant" || m.role === "tool").map((m) => {
     if (m.role === "tool") return `RESULT ${String(m.content).slice(0, 280)}`;
     const calls = m.tool_calls?.map((c) => `${c.function.name}(${c.function.arguments})`).join(", ");
     return calls ? `CALL ${calls}` : `SAY ${String(m.content).slice(0, 200)}`;
   }).join("\n").slice(0, 7e3);
+}
+async function consultAnalyst(task, messages, instruction, opts) {
+  const trajectory = compactTrajectory(messages);
   const analystModel = opts.analystModel ?? opts.model;
   const chat = createChatClient({
     transport: "router",
@@ -4416,6 +4723,52 @@ async function analyze(task, messages, opts) {
     baseUrl: opts.routerBaseUrl,
     defaultModel: analystModel
   });
+  const res = await chat.chat({
+    model: analystModel,
+    temperature: 0.2,
+    maxTokens: 1024,
+    messages: [
+      { role: "system", content: instruction },
+      {
+        role: "user",
+        content: `TASK: ${task.userPrompt.slice(0, 1500)}
+TRAJECTORY:
+${trajectory}`
+      }
+    ]
+  });
+  const usage = res.usage;
+  return {
+    steer: res.content.trim(),
+    tokens: {
+      input: usage?.promptTokens ?? usage?.prompt_tokens ?? 0,
+      output: usage?.completionTokens ?? usage?.completion_tokens ?? 0
+    }
+  };
+}
+async function analyze(task, messages, opts) {
+  const trajectory = compactTrajectory(messages);
+  const analystModel = opts.analystModel ?? opts.model;
+  const inner = createChatClient({
+    transport: "router",
+    apiKey: opts.routerKey,
+    baseUrl: opts.routerBaseUrl,
+    defaultModel: analystModel
+  });
+  const tokens = { input: 0, output: 0 };
+  const chat = {
+    ...inner,
+    chat: async (req, callOpts) => {
+      const res = await inner.chat(req, callOpts);
+      const u = res.usage;
+      if (u) {
+        tokens.input += u.promptTokens ?? u.prompt_tokens ?? 0;
+        tokens.output += u.completionTokens ?? u.completion_tokens ?? 0;
+      }
+      return res;
+    }
+  };
   const obs = await observe(
     {
       task: task.userPrompt,
@@ -4432,14 +4785,8 @@ async function analyze(task, messages, opts) {
     }
   );
   const steer = obs.findings.map((f) => f.recommended_action).filter((a) => typeof a === "string" && a.trim().length > 0).join("\n").trim();
-  return steer || "COMPLETE";
+  return { steer: steer || "COMPLETE", tokens };
 }
-var spend = (iterations) => ({
-  iterations,
-  tokens: { input: 0, output: 0 },
-  usd: 0,
-  ms: 0
-});
 function shotExecutor(surface, opts) {
   let artifact;
   return {
@@ -4449,7 +4796,19 @@ function shotExecutor(surface, opts) {
       const own = !t.handle;
       const handle = t.handle ?? await surface.open(t.task);
       try {
-        const tools = await surface.tools(t.task, handle);
+        const allTools = await surface.tools(t.task, handle);
+        let tools = allTools;
+        if (t.tools) {
+          const known = new Set(allTools.map((tool) => tool.function.name));
+          const unknown = t.tools.filter((name) => !known.has(name));
+          if (unknown.length > 0) {
+            throw new Error(
+              `shot tools: unknown tool name(s) ${unknown.join(", ")} \u2014 domain offers: ${[...known].join(", ")}`
+            );
+          }
+          const want = new Set(t.tools);
+          tools = allTools.filter((tool) => want.has(tool.function.name));
+        }
         const messages = t.messages?.length ? t.messages : [
           { role: "system", content: t.persona?.systemPrompt ?? t.task.systemPrompt },
           { role: "user", content: `${t.task.userPrompt}
@@ -4483,7 +4842,7 @@ ${taskNudge}` }
           spent: {
             iterations: shot.completions,
             tokens: shot.tokens,
-            usd: isModelPriced2(opts.model) ? estimateCost2(shot.tokens.input, shot.tokens.output, opts.model) : 0,
+            usd: isModelPriced3(opts.model) ? estimateCost3(shot.tokens.input, shot.tokens.output, opts.model) : 0,
             ms: 0
           }
         };
@@ -4505,8 +4864,18 @@ function analystExecutor(opts) {
     runtime: "agentic-analyst",
     async execute(task) {
       const t = task;
-      const findings = await analyze(t.task, t.messages, opts);
-      artifact = { outRef: `analyst:${findings.length}`, out: findings, spent: spend(1) };
+      const { steer, tokens } = t.rawInstruction ? await consultAnalyst(t.task, t.messages, t.rawInstruction, opts) : await analyze(t.task, t.messages, opts);
+      const analystModel = opts.analystModel ?? opts.model;
+      artifact = {
+        outRef: `analyst:${steer.length}`,
+        out: steer,
+        spent: {
+          iterations: 1,
+          tokens,
+          usd: isModelPriced3(analystModel) ? estimateCost3(tokens.input, tokens.output, analystModel) : 0,
+          ms: 0
+        }
+      };
       return artifact;
     },
     teardown: () => Promise.resolve({ destroyed: true }),
@@ -4669,12 +5038,21 @@ function defineStrategy(name, run) {
         const innerTurns = opts.innerTurns ?? 4;
         let verifiedBest = 0;
         let verifiedResolved = false;
+        const openHandles = /* @__PURE__ */ new Set();
         const ctx = {
           // Narrowed to open/close — the body gets no raw call()/score() access.
           surface: {
             name: surface.name,
-            open: (t) => surface.open(t),
-            close: (h) => surface.close(h)
+            open: async (t) => {
+              const h = await surface.open(t);
+              openHandles.add(h.id);
+              return h;
+            },
+            close: async (h) => {
+              if (!h || !openHandles.has(h.id)) return;
+              openHandles.delete(h.id);
+              await surface.close(h);
+            }
           },
           task,
           opts,
@@ -4690,7 +5068,8 @@ function defineStrategy(name, run) {
                 handle: spec?.handle,
                 messages: spec?.messages,
                 steer: spec?.steer,
-                persona: spec?.persona
+                persona: spec?.persona,
+                tools: spec?.tools
               },
               { budget: perChild(innerTurns), label: child.name }
             );
@@ -4702,6 +5081,13 @@ function defineStrategy(name, run) {
             if (out.total > 0 && out.passes === out.total) verifiedResolved = true;
             return out;
           },
+          async listTools(handle) {
+            const tools = await surface.tools(task, handle);
+            return tools.map((t) => ({
+              name: t.function.name,
+              ...t.function.description ? { description: t.function.description } : {}
+            }));
+          },
           async critique(messages) {
             const child = leaf(`analyst:${seq}`, "analyst");
             seq += 1;
@@ -4715,12 +5101,33 @@ function defineStrategy(name, run) {
             if (settled.kind === "down") return null;
             const findings = settled.out;
             return /^\s*COMPLETE\b/i.test(findings) ? null : findings;
+          },
+          async consult(messages, instruction) {
+            const child = leaf(`analyst:${seq}`, "analyst");
+            seq += 1;
+            const res = scope.spawn(
+              child,
+              { task, messages, rawInstruction: instruction },
+              { budget: perChild(1), label: child.name }
+            );
+            if (!res.ok) return null;
+            const settled = await drainOne2(scope);
+            if (settled.kind === "down") return null;
+            return settled.out;
           }
         };
         const r = await run(ctx);
         return {
           kind: "done",
-          deliverable: { mode: name, ...r, score: verifiedBest, resolved: verifiedResolved }
+          deliverable: {
+            mode: name,
+            ...r,
+            progression: Array.isArray(r.progression) ? r.progression : [],
+            completions: typeof r.completions === "number" ? r.completions : 0,
+            shots: typeof r.shots === "number" ? r.shots : 0,
+            score: verifiedBest,
+            resolved: verifiedResolved
+          }
         };
       }
     })
@@ -4875,27 +5282,44 @@ async function runBenchmark(cfg) {
   let settled = 0;
   const perTask = await pool(cfg.tasks, concurrency, async (task) => {
     const cells = {};
+    const errors = {};
     let row;
     try {
       for (const s of strategies) {
-        const r = await runAgentic({
-          ...cfg.worker,
-          surface: cfg.environment,
-          task,
-          strategy: s,
-          budget,
-          ...cfg.hooks ? { hooks: cfg.hooks } : {}
-        });
-        cells[s.name] = {
-          score: r.score,
-          resolved: r.resolved,
-          progression: r.progression,
-          usd: r.usd,
-          ms: r.ms,
-          tokens: r.tokens
-        };
+        try {
+          const r = await runAgentic({
+            ...cfg.worker,
+            surface: cfg.environment,
+            task,
+            strategy: s,
+            budget,
+            ...cfg.hooks ? { hooks: cfg.hooks } : {}
+          });
+          cells[s.name] = {
+            score: r.score,
+            resolved: r.resolved,
+            progression: r.progression,
+            usd: r.usd,
+            ms: r.ms,
+            tokens: r.tokens
+          };
+        } catch (e) {
+          errors[s.name] = e instanceof Error ? e.message.slice(0, 300) : String(e);
+          cells[s.name] = {
+            score: 0,
+            resolved: false,
+            progression: [],
+            usd: 0,
+            ms: 0,
+            tokens: { input: 0, output: 0 }
+          };
+        }
       }
-      row = { taskId: task.id, cells };
+      row = {
+        taskId: task.id,
+        cells,
+        ...Object.keys(errors).length > 0 ? { errors } : {}
+      };
     } catch (e) {
       row = { taskId: task.id, error: e instanceof Error ? e.message.slice(0, 300) : String(e) };
     }
@@ -5200,7 +5624,7 @@ var strategyAuthorContract = `
 You author an OPTIMIZATION STRATEGY for an agentic loop system. A strategy decides how to
 spend a compute budget to beat a task's deployable check. You compose exactly two steps:
-  shot(spec?: { handle?, messages?, steer?, persona? }): Promise<ShotResult | null>
+  shot(spec?: { handle?, messages?, steer?, persona?, tools? }): Promise<ShotResult | null>
     Runs ONE worker attempt (a bounded tool loop) over an artifact.
     - omit handle  => the shot opens its OWN fresh artifact and closes it after (a sample).
     - pass handle  => the shot CONTINUES that artifact (state accumulates across shots).
@@ -5210,6 +5634,10 @@ spend a compute budget to beat a task's deployable check. You compose exactly tw
       (multi-agent strategies: a researcher shot then an engineer shot, a panel of k
       personas over one budget). On a fresh shot the systemPrompt replaces the task's; on
       a carried conversation it arrives as a hand-off message. Same conserved budget.
+    - tools        => string[] \u2014 restrict THIS shot to a subset of the task's tools by
+      name (focus an explore shot on read-only tools, an execute shot on write tools).
+      Restriction-only; unknown names make the shot fail. ALWAYS select from
+      await listTools(handle) \u2014 never hardcode. Omitted => the shot sees every tool.
     ShotResult = { messages, score (0..1 on the task's check), passes, total, completions, toolErrors }
     Returns null if the attempt failed infra-wise.
@@ -5217,10 +5645,23 @@ spend a compute budget to beat a task's deployable check. You compose exactly tw
     A firewalled trace-analyst reads the attempt's trajectory and returns ONE corrective
     instruction (or null when it judges the work complete). Costs ~1 completion.
+  consult(messages, instruction): Promise<string | null>
+    The RAW analyst channel: the same firewalled critic answers YOUR instruction over the
+    trajectory verbatim (no reformatting) \u2014 use it when you need a specific reply format
+    (a decision, a prediction). Costs ~1 completion.
   surface.open(task) / surface.close(handle)
     Open a persistent artifact you manage yourself (remember to close in a finally).
+    close is idempotent \u2014 closing an already-closed handle is a safe no-op.
+  listTools(handle): Promise<Array<{ name, description? }>>
+    The tools THIS task actually offers. TOOL SETS VARY PER TASK \u2014 if you restrict a
+    shot with \`tools\`, you MUST pick names from await listTools(handle); hardcoding
+    names from an example kills your shots on every task whose tools differ.
 Rules:
+- ALWAYS await every shot/critique/surface call \u2014 a floating promise that rejects
+  crashes the whole benchmark run.
 - Stay within ~budget total shots; every shot/critique spends from a conserved pool.
 - For a FRESH attempt OMIT \`messages\` entirely (never pass \`[]\` \u2014 an empty array is a
   fresh conversation too, but be explicit). To CONTINUE, pass the previous
@@ -5230,8 +5671,8 @@ Rules:
 - The module must be EXACTLY this shape (no other imports, no commentary outside code):
 import { defineStrategy } from '@tangle-network/agent-runtime/loops'
-export default defineStrategy('your-strategy-name', async ({ surface, task, budget, shot, critique }) => {
-  // your composition
+export default defineStrategy('your-strategy-name', async ({ surface, task, budget, shot, critique, listTools }) => {
+  // your composition (listTools comes from the destructured context \u2014 it is NOT a global)
 })
 `;
 function assertStrategyContract(code) {
@@ -5307,34 +5748,89 @@ async function authorStrategy(opts) {
 }
 // src/runtime/strategy-evolution.ts
+import { existsSync, readFileSync, writeFileSync as writeFileSync2 } from "fs";
 import { gzipSync } from "zlib";
-function selectChampion(report, fieldOrder, policy, epsilon) {
-  const entries = fieldOrder.map((name) => ({ name, summary: report.perStrategy[name] })).filter((e) => !!e.summary);
+function discriminatingMeans(report, fieldOrder) {
+  const rows = report.perTask.filter((r) => {
+    if (!r.cells) return false;
+    const scores = fieldOrder.map((n) => r.cells?.[n]?.score).filter((s) => s !== void 0);
+    if (scores.length < fieldOrder.length) return false;
+    return Math.max(...scores) - Math.min(...scores) > 0;
+  });
+  if (rows.length === 0) return null;
+  const out = {};
+  for (const name of fieldOrder) {
+    const cells = rows.map((r) => r.cells?.[name]).filter((c) => !!c);
+    out[name] = {
+      score: cells.reduce((s, c) => s + c.score, 0) / cells.length,
+      usd: cells.reduce((s, c) => s + c.usd, 0) / cells.length
+    };
+  }
+  return out;
+}
+/**
+ * Picks the champion from a `{ [name]: { score, usd } }` means table.
+ *
+ * Only names listed in `fieldOrder` that have an entry in `means` are considered.
+ * - policy "score": the first entry (in `fieldOrder` order) whose score equals the best score.
+ * - any other policy: among entries scoring within `epsilon` of the best, the one with
+ *   the lowest usd; equal usd is broken by the higher score.
+ *
+ * @returns `{ name, score, usd }` of the chosen entry.
+ * @throws Error when `means` carries none of the `fieldOrder` names.
+ */
+function pickChampion(means, fieldOrder, policy, epsilon) {
+  const entries = fieldOrder.map((name) => ({ name, summary: means[name] })).filter((e) => !!e.summary);
   if (entries.length === 0)
-    throw new Error("selectChampion: report carries none of the field strategies");
+    throw new Error("pickChampion: the means table carries none of the field strategies");
   const best = Math.max(...entries.map((e) => e.summary.score));
   const pick = policy === "score" ? entries.find((e) => e.summary.score === best) : entries.filter((e) => e.summary.score >= best - epsilon).sort((a, b) => a.summary.usd - b.summary.usd || b.summary.score - a.summary.score)[0];
-  if (!pick) throw new Error("selectChampion: empty pick (unreachable)");
+  if (!pick) throw new Error("pickChampion: empty pick (unreachable)");
   return { name: pick.name, score: pick.summary.score, usd: pick.summary.usd };
 }
+function selectChampion(report, fieldOrder, policy, epsilon) {
+  return pickChampion(report.perStrategy, fieldOrder, policy, epsilon);
+}
 var fieldSummary = (archive) => archive.map(
   (n) => `- ${n.name} (${n.source}, gen ${n.generation}, last score ${(n.score * 100).toFixed(0)}%)`
 ).join("\n");
+/**
+ * Serialises a benchmark report's per-task rows into a compact JSON string.
+ *
+ * Rows with cells become `{ task, errors?, cells }`: per-strategy error messages are
+ * cut to 100 characters, and each cell is reduced according to `detail`:
+ * - "binary": `{ resolved, usd }` only.
+ * - anything else: `{ score, resolved, usd, progression }`, with score and progression
+ *   rounded to 2 decimals.
+ * usd is rounded to 4 decimals in both modes. Rows without cells become
+ * `{ task, error }` with the error cut to 80 characters.
+ *
+ * @returns the JSON text, cut to at most 12000 characters.
+ */
-var compactLosses = (report) => {
+var compactLosses = (report, detail) => {
   const r2 = (x) => Math.round(x * 100) / 100;
   const rows = report.perTask.map(
     (row) => row.cells ? {
       task: row.taskId,
+      ...row.errors ? {
+        errors: Object.fromEntries(
+          Object.entries(row.errors).map(([n, msg]) => [n, msg.slice(0, 100)])
+        )
+      } : {},
       cells: Object.fromEntries(
         Object.entries(row.cells).map(([name, c]) => [
           name,
-          { score: r2(c.score), resolved: c.resolved, progression: c.progression.map(r2) }
+          // 'binary' is the leakage-bounded channel: the author learns pass/fail per
+          // task and nothing else — the per-generation leak from the evaluation data
+          // is capped at one bit per cell (arXiv:2606.11045 measured that exploration
+          // survives this; whether AUTHORING does is the E1-coarse A/B).
+          detail === "binary" ? { resolved: c.resolved, usd: Math.round(c.usd * 1e4) / 1e4 } : {
+            score: r2(c.score),
+            resolved: c.resolved,
+            usd: Math.round(c.usd * 1e4) / 1e4,
+            progression: (c.progression ?? []).map(r2)
+          }
         ])
       )
     } : { task: row.taskId, error: row.error?.slice(0, 80) }
   );
+  // The cut is a plain character slice, so an over-long payload can end mid-structure
+  // and is then no longer parseable JSON.
   return JSON.stringify(rows).slice(0, 12e3);
 };
+function renameStrategy(orig, unique) {
+  if (orig.name === unique) return orig;
+  return {
+    name: unique,
+    driver: (s, t, o, b) => {
+      const agent = orig.driver(s, t, o, b);
+      return {
+        ...agent,
+        name: unique,
+        act: async (task, scope) => {
+          const out = await agent.act(task, scope);
+          if (out.kind !== "done") return out;
+          const deliverable = { ...out.deliverable, mode: unique };
+          return { ...out, deliverable };
+        }
+      };
+    }
+  };
+}
 async function runStrategyEvolution(cfg) {
   const budget = cfg.budget ?? 3;
   const concurrency = cfg.concurrency ?? 3;
@@ -5342,37 +5838,72 @@ async function runStrategyEvolution(cfg) {
   const populationSize = cfg.populationSize ?? 2;
   const baselines = cfg.baselines ?? [sample, refine, sampleThenRefine];
   const policy = cfg.champion ?? "costAware";
-  const epsilon = cfg.championEpsilon ?? 0.01;
+  const epsilon = cfg.championEpsilon ?? (cfg.objective === "cost" ? cfg.scoreTolerance ?? 0.05 : 0.01);
   const byName = new Map(baselines.map((s) => [s.name, s]));
-  const bench = (phase, tasks, strategies) => runBenchmark({
-    environment: cfg.environment,
-    tasks,
-    worker: cfg.worker,
-    strategies,
+  const codeByName = /* @__PURE__ */ new Map();
+  const fingerprint = {
+    trainN: cfg.trainN,
+    holdoutN: cfg.holdoutN,
     budget,
-    concurrency,
-    ...cfg.onTask ? { onTask: (row, done, total) => cfg.onTask?.(phase, row, done, total) } : {},
-    ...cfg.hooks ? { hooks: cfg.hooks } : {}
-  });
+    generations,
+    populationSize
+  };
+  let ckpt;
+  if (cfg.checkpoint?.resume && existsSync(cfg.checkpoint.path)) {
+    const raw = JSON.parse(readFileSync(cfg.checkpoint.path, "utf8"));
+    if (JSON.stringify(raw.fingerprint) !== JSON.stringify(fingerprint)) {
+      throw new Error(
+        `evolution resume: checkpoint design mismatch \u2014 checkpoint ${JSON.stringify(raw.fingerprint)} vs config ${JSON.stringify(fingerprint)}; delete ${cfg.checkpoint.path} or match the config`
+      );
+    }
+    ckpt = raw;
+  }
+  const save = (state) => {
+    if (cfg.checkpoint)
+      writeFileSync2(cfg.checkpoint.path, JSON.stringify({ ...state, fingerprint }, null, 1));
+  };
+  const bench = async (phase, tasks, strategies) => {
+    await cfg.onPhase?.(phase);
+    return runBenchmark({
+      environment: cfg.environment,
+      tasks,
+      worker: cfg.worker,
+      strategies,
+      budget,
+      concurrency,
+      ...cfg.onTask ? { onTask: (row, done, total) => cfg.onTask?.(phase, row, done, total) } : {},
+      ...cfg.hooks ? { hooks: cfg.hooks } : {}
+    });
+  };
   const train = await cfg.tasks(0, cfg.trainN);
-  const gen0 = await bench("gen0", train, baselines);
-  const archive = baselines.map((s) => ({
+  const probeTask = train[0];
+  if (!probeTask) throw new Error("runStrategyEvolution: empty train slice");
+  const probe = await cfg.environment.open(probeTask);
+  let toolCatalog;
+  try {
+    const tools = await cfg.environment.tools(probeTask, probe);
+    toolCatalog = tools.map(
+      (t) => `- ${t.function.name}${t.function.description ? ` \u2014 ${t.function.description.slice(0, 120)}` : ""}`
+    ).join("\n");
+  } finally {
+    await cfg.environment.close(probe);
+  }
+  const gen0 = ckpt?.gen0 ?? await bench("gen0", train, baselines);
+  const archive = ckpt?.archive ? [...ckpt.archive] : baselines.map((s) => ({
     name: s.name,
     source: "baseline",
     generation: 0,
     score: gen0.perStrategy[s.name]?.score ?? 0,
     usd: gen0.perStrategy[s.name]?.usd ?? 0
   }));
-  const gen0Champion = selectChampion(
+  const gen0Champion = ckpt?.gen0Champion ?? selectChampion(
     gen0,
     baselines.map((s) => s.name),
     policy,
     epsilon
   );
-  let incumbent = gen0Champion;
-  let latestReport = gen0;
-  const generationRows = [];
-  const trajectory = [
+  const generationRows = ckpt?.generations ? [...ckpt.generations] : [];
+  const trajectory = ckpt?.trajectory ? [...ckpt.trajectory] : [
     {
       generation: 0,
       champion: gen0Champion.name,
@@ -5380,13 +5911,39 @@ async function runStrategyEvolution(cfg) {
       usd: gen0Champion.usd
     }
   ];
-  let authoredOk = 0;
-  for (let g = 1; g <= generations; g += 1) {
-    const lossesJson = compactLosses(latestReport);
+  for (const row of generationRows) {
+    for (const c of row.candidates) {
+      if (!c.file || c.error) continue;
+      const mod = await import(`file://${c.file}`);
+      if (!mod.default || typeof mod.default.driver !== "function") {
+        throw new Error(
+          `evolution resume: ${c.file} no longer exports a Strategy \u2014 cannot restore "${c.name}"`
+        );
+      }
+      byName.set(c.name, renameStrategy(mod.default, c.name));
+      codeByName.set(c.name, readFileSync(c.file, "utf8"));
+    }
+  }
+  let authoredOk = generationRows.reduce(
+    (n, row) => n + row.candidates.filter((c) => !c.error).length,
+    0
+  );
+  const lastRow = generationRows[generationRows.length - 1];
+  let incumbent = lastRow ? lastRow.champion : gen0Champion;
+  let latestReport = lastRow ? lastRow.report : gen0;
+  if (!ckpt) save({ gen0, gen0Champion, generations: generationRows, archive, trajectory });
+  for (let g = generationRows.length + 1; g <= generations; g += 1) {
+    const lossesJson = compactLosses(latestReport, cfg.lossesDetail ?? "exact");
     const candidates = [];
     const newStrategies = [];
     for (let i = 0; i < populationSize; i += 1) {
-      const contract = `${strategyAuthorContract}
+      const objectiveNote = cfg.objective === "cost" ? `
+YOUR OBJECTIVE: match or exceed the incumbent's SCORE while spending LESS (the losses include usd per task). Promotion requires proven score non-inferiority PLUS significant cost savings \u2014 a strategy that ties the score at half the cost WINS; a cheaper strategy that loses score by more than ${((cfg.scoreTolerance ?? 0.05) * 100).toFixed(0)}pp LOSES.` : "";
+      const contract = `${strategyAuthorContract}${objectiveNote}
+EXAMPLE TOOLS FROM ONE TASK (tool sets VARY per task on this domain \u2014 a strategy MUST select tool names from await listTools(handle) at runtime; hardcoding these example names will zero your score on most tasks):
+${toolCatalog}
 STRATEGIES ALREADY IN THE TOURNAMENT (author something MEANINGFULLY different \u2014 a new composition, not a rename):
 ${fieldSummary(archive)}
@@ -5406,26 +5963,9 @@ You are authoring candidate ${i + 1} of ${populationSize} this generation; explo
           outDir: cfg.outDir
         });
         const unique = byName.has(authored.strategy.name) ? `${authored.strategy.name}-g${g}c${i + 1}` : authored.strategy.name;
-        const strategy = unique === authored.strategy.name ? authored.strategy : {
-          name: unique,
-          driver: (s, t, o, b) => {
-            const agent = authored.strategy.driver(s, t, o, b);
-            return {
-              ...agent,
-              name: unique,
-              act: async (task, scope) => {
-                const out = await agent.act(task, scope);
-                if (out.kind !== "done") return out;
-                const deliverable = {
-                  ...out.deliverable,
-                  mode: unique
-                };
-                return { ...out, deliverable };
-              }
-            };
-          }
-        };
+        const strategy = renameStrategy(authored.strategy, unique);
         byName.set(unique, strategy);
+        codeByName.set(unique, authored.code);
         newStrategies.push(strategy);
         archive.push({
           name: unique,
@@ -5463,12 +6003,9 @@ You are authoring candidate ${i + 1} of ${populationSize} this generation; explo
         node.usd = cell.usd;
       }
     }
-    const champion = selectChampion(
-      report,
-      field.map((s) => s.name),
-      policy,
-      epsilon
-    );
+    const fieldNames = field.map((s) => s.name);
+    const means = cfg.band ? discriminatingMeans(report, fieldNames) ?? report.perStrategy : report.perStrategy;
+    const champion = pickChampion(means, fieldNames, policy, epsilon);
     generationRows.push({ generation: g, candidates, report, champion });
     trajectory.push({
       generation: g,
@@ -5478,21 +6015,134 @@ You are authoring candidate ${i + 1} of ${populationSize} this generation; explo
     });
     incumbent = champion;
     latestReport = report;
+    save({ gen0, gen0Champion, generations: generationRows, archive, trajectory });
   }
   if (authoredOk === 0) {
     throw new Error(
       "runStrategyEvolution: every author attempt failed across all generations \u2014 no search happened; see the candidates[].error entries"
     );
   }
-  const holdoutTasks = await cfg.tasks(cfg.trainN + (cfg.holdoutOffset ?? 0), cfg.holdoutN);
-  const finalists = [.../* @__PURE__ */ new Set([gen0Champion.name, incumbent.name])].map((n) => byName.get(n)).filter((s) => !!s);
-  const holdout = await bench("holdout", holdoutTasks, finalists);
-  const verdict = promotionGate({
-    report: holdout,
-    incumbent: gen0Champion.name,
-    candidate: incumbent.name,
-    ...cfg.minPairedTasks !== void 0 ? { minPairedTasks: cfg.minPairedTasks } : {}
-  });
+  const holdoutOffset = cfg.trainN + (cfg.holdoutOffset ?? 0);
+  let holdoutTasks = [];
+  let bandInfo;
+  if (ckpt?.holdout && ckpt.verdict) {
+    bandInfo = ckpt.band;
+    if (cfg.reproducerCheck && codeByName.has(incumbent.name)) {
+      const pool2 = await cfg.tasks(holdoutOffset, cfg.band?.holdoutPoolN ?? cfg.holdoutN);
+      const gateIds = new Set(ckpt.holdout.perTask.map((r) => r.taskId));
+      holdoutTasks = pool2.filter((t) => gateIds.has(t.id));
+    }
+  } else if (cfg.band) {
+    const maxRef = cfg.band.maxRefScore ?? 0.99;
+    const reference = baselines[0];
+    if (!reference)
+      throw new Error("evolution band: baselines[0] required as the screening reference");
+    const pool2 = await cfg.tasks(holdoutOffset, cfg.band.holdoutPoolN);
+    const screen = await bench("band-screen", pool2, [reference]);
+    const refScores = screen.perTask.filter((r) => r.cells?.[reference.name]).map((r) => ({ taskId: r.taskId, score: r.cells?.[reference.name]?.score ?? 0 }));
+    const inBandIds = new Set(refScores.filter((r) => r.score <= maxRef).map((r) => r.taskId));
+    const kept = pool2.filter((t) => inBandIds.has(t.id));
+    if (kept.length < cfg.holdoutN) {
+      throw new Error(
+        `evolution band: only ${kept.length}/${cfg.holdoutN} holdout tasks have headroom (pool ${cfg.band.holdoutPoolN}, reference "${reference.name}" \u2264 ${maxRef}) \u2014 widen holdoutPoolN or raise maxRefScore`
+      );
+    }
+    holdoutTasks = kept.slice(0, cfg.holdoutN);
+    bandInfo = { screened: refScores.length, inBand: kept.length, refScores };
+  } else {
+    holdoutTasks = await cfg.tasks(holdoutOffset, cfg.holdoutN);
+  }
+  let holdout;
+  let verdict;
+  if (ckpt?.holdout && ckpt.verdict) {
+    holdout = ckpt.holdout;
+    verdict = ckpt.verdict;
+  } else {
+    const finalists = [.../* @__PURE__ */ new Set([gen0Champion.name, incumbent.name])].map((n) => byName.get(n)).filter((s) => !!s);
+    holdout = await bench("holdout", holdoutTasks, finalists);
+    verdict = promotionGate({
+      report: holdout,
+      incumbent: gen0Champion.name,
+      candidate: incumbent.name,
+      ...cfg.objective === "cost" ? {
+        mode: "non-inferiority",
+        ...cfg.scoreTolerance !== void 0 ? { scoreTolerance: cfg.scoreTolerance } : {}
+      } : {},
+      ...cfg.minPairedTasks !== void 0 ? { minPairedTasks: cfg.minPairedTasks } : {}
+    });
+    save({
+      gen0,
+      gen0Champion,
+      generations: generationRows,
+      archive,
+      trajectory,
+      holdout,
+      verdict,
+      ...bandInfo ? { band: bandInfo } : {}
+    });
+  }
+  let reproduction;
+  const championCode = codeByName.get(incumbent.name);
+  if (cfg.reproducerCheck && championCode) {
+    const words = cfg.reproducerCheck.summaryMaxWords ?? 64;
+    const tolerance = cfg.reproducerCheck.tolerance ?? 0.05;
+    const championHoldoutScore = holdout.perStrategy[incumbent.name]?.score ?? 0;
+    try {
+      const summaryRes = await cfg.author.chat.chat({
+        ...cfg.author.model ? { model: cfg.author.model } : {},
+        temperature: 0.2,
+        maxTokens: 512,
+        messages: [
+          {
+            role: "system",
+            content: `Summarize the optimization strategy implemented by this code in at most ${words} words. Describe the COMPOSITION (shots, critique, artifact handling, restarts, stopping) \u2014 not the code. Output only the summary.`
+          },
+          { role: "user", content: championCode }
+        ]
+      });
+      const summary = summaryRes.content.trim();
+      const reproduced = await authorStrategy({
+        chat: cfg.author.chat,
+        ...cfg.author.model ? { model: cfg.author.model } : {},
+        ...cfg.author.fallbackModel ? { fallbackModel: cfg.author.fallbackModel } : {},
+        ...cfg.author.maxTokens !== void 0 ? { maxTokens: cfg.author.maxTokens } : {},
+        temperature: 0.2,
+        contract: `${strategyAuthorContract}
+IMPLEMENT EXACTLY THIS STRATEGY (a colleague's description \u2014 do not invent a different approach):
+${summary}`,
+        environmentName: cfg.environment.name,
+        lossesJson: "[]",
+        budget,
+        outDir: cfg.outDir
+      });
+      const reproStrategy = {
+        name: `${incumbent.name}-reproduced`,
+        driver: reproduced.strategy.driver
+      };
+      const reproReport = await bench("reproduce", holdoutTasks, [reproStrategy]);
+      const reproducedHoldoutScore = reproReport.perStrategy[reproStrategy.name]?.score ?? 0;
+      reproduction = {
+        summary,
+        reproducedName: reproStrategy.name,
+        file: reproduced.file,
+        championHoldoutScore,
+        reproducedHoldoutScore,
+        gap: championHoldoutScore - reproducedHoldoutScore,
+        reproducible: reproducedHoldoutScore >= championHoldoutScore - tolerance
+      };
+    } catch (e) {
+      reproduction = {
+        summary: "",
+        reproducedName: "",
+        championHoldoutScore,
+        reproducedHoldoutScore: 0,
+        gap: championHoldoutScore,
+        reproducible: false,
+        error: e instanceof Error ? e.message.slice(0, 300) : String(e)
+      };
+    }
+  }
   return {
     gen0,
     gen0Champion,
@@ -5501,6 +6151,8 @@ You are authoring candidate ${i + 1} of ${populationSize} this generation; explo
     finalChampion: incumbent,
     holdout,
     verdict,
+    ...bandInfo ? { band: bandInfo } : {},
+    ...reproduction ? { reproduction } : {},
     trajectory
   };
 }
@@ -5572,6 +6224,103 @@ function createVerifierEnvironment(opts) {
   };
 }
+// src/runtime/waterfall.ts
+function createWaterfallCollector() {
+  let spans = /* @__PURE__ */ new Map();
+  const onEvent = (event) => {
+    if (event.target === "agent.spawn") {
+      const p = event.payload ?? {};
+      const id = p.childId ?? event.id;
+      spans.set(id, {
+        id,
+        label: p.label ?? id,
+        runId: event.runId,
+        ...event.parentId !== void 0 ? { parentId: event.parentId } : {},
+        startMs: event.timestamp,
+        status: "running",
+        usd: 0,
+        tokens: { input: 0, output: 0 }
+      });
+      return;
+    }
+    if (event.target === "agent.child") {
+      const p = event.payload ?? {};
+      const id = p.childId;
+      if (!id) return;
+      const span = spans.get(id);
+      if (!span) return;
+      span.endMs = event.timestamp;
+      span.status = p.status === "down" ? "down" : "done";
+      span.usd = p.spent?.usd ?? 0;
+      span.tokens = {
+        input: p.spent?.tokens?.input ?? 0,
+        output: p.spent?.tokens?.output ?? 0
+      };
+      if (typeof p.score === "number") span.score = p.score;
+    }
+  };
+  const report = () => {
+    const all = [...spans.values()].sort((a, b) => a.startMs - b.startMs);
+    const start = all[0]?.startMs ?? 0;
+    const end = Math.max(start, ...all.map((s) => s.endMs ?? s.startMs));
+    const byKind = {};
+    let totalUsd = 0;
+    const totalTokens2 = { input: 0, output: 0 };
+    for (const s of all) {
+      totalUsd += s.usd;
+      totalTokens2.input += s.tokens.input;
+      totalTokens2.output += s.tokens.output;
+      const kind = s.label.includes(":") ? s.label.split(":")[0] : s.label;
+      const k = byKind[kind] ??= { count: 0, ms: 0, usd: 0, tokens: { input: 0, output: 0 } };
+      k.count += 1;
+      k.ms += (s.endMs ?? s.startMs) - s.startMs;
+      k.usd += s.usd;
+      k.tokens.input += s.tokens.input;
+      k.tokens.output += s.tokens.output;
+    }
+    return { spans: all, totalMs: end - start, totalUsd, totalTokens: totalTokens2, byKind };
+  };
+  const render = (opts) => {
+    const { spans: all, totalMs, totalUsd, byKind } = report();
+    if (all.length === 0) return "(no spans observed)";
+    const width = opts?.width ?? 48;
+    const maxRows = opts?.maxRows ?? 60;
+    const start = all[0]?.startMs ?? 0;
+    const scale = totalMs > 0 ? width / totalMs : 0;
+    const lines = [];
+    const labelWidth = Math.min(24, Math.max(...all.map((s) => s.label.length)) + 1);
+    for (const s of all.slice(0, maxRows)) {
+      const offset = Math.round((s.startMs - start) * scale);
+      const dur = (s.endMs ?? s.startMs) - s.startMs;
+      const len = Math.max(1, Math.round(dur * scale));
+      const bar = `${" ".repeat(Math.min(offset, width))}${(s.status === "down" ? "\u2591" : "\u2588").repeat(Math.max(1, Math.min(len, width - Math.min(offset, width) + 1)))}`;
+      const mark = s.status === "down" ? " DOWN" : s.score !== void 0 ? ` ${(s.score * 100).toFixed(0)}%` : "";
+      lines.push(
+        `${s.label.padEnd(labelWidth)}|${bar.padEnd(width + 1)}| ${(dur / 1e3).toFixed(1)}s $${s.usd.toFixed(4)} ${s.tokens.input}/${s.tokens.output}tok${mark}`
+      );
+    }
+    if (all.length > maxRows) lines.push(`\u2026 ${all.length - maxRows} more spans`);
+    lines.push("\u2014".repeat(labelWidth + width + 2));
+    for (const [kind, k] of Object.entries(byKind)) {
+      lines.push(
+        `${kind.padEnd(labelWidth)} \xD7${k.count}  ${(k.ms / 1e3).toFixed(1)}s busy  $${k.usd.toFixed(4)}  ${k.tokens.input}/${k.tokens.output}tok`
+      );
+    }
+    lines.push(
+      `TOTAL${" ".repeat(labelWidth - 5)} ${(totalMs / 1e3).toFixed(1)}s wall  $${totalUsd.toFixed(4)}`
+    );
+    return lines.join("\n");
+  };
+  return {
+    hooks: { onEvent },
+    report,
+    render,
+    reset: () => {
+      spans = /* @__PURE__ */ new Map();
+    }
+  };
+}
 // src/runtime/workspace.ts
 function localShell() {
   return async (args, cwd) => {
@@ -5674,6 +6423,10 @@ function tail(s) {
 }
 export {
+  deleteBoxSafe,
+  throwAbort,
+  throwIfAborted,
+  sleep,
   contentAddress,
   InMemoryResultBlobStore,
   FileResultBlobStore,
@@ -5681,6 +6434,8 @@ export {
   FileSpawnJournal,
   replaySpawnTree,
   materializeTreeView,
+  anytimeReport,
+  renderAnytimeTable,
   defaultAuditorInstruction,
   auditIntent,
   completionAuthorizes,
@@ -5723,6 +6478,9 @@ export {
   InMemoryCorpus,
   FileCorpus,
   renderCorpusToInstructions,
+  routerChatWithUsage,
+  routerChatWithTools,
+  routerToolLoop,
   createExecutor,
   createExecutorRegistry,
   spendFromUsageEvents,
@@ -5751,11 +6509,14 @@ export {
   strategyAuthorContract,
   assertStrategyContract,
   authorStrategy,
+  discriminatingMeans,
+  pickChampion,
   selectChampion,
   runStrategyEvolution,
   createVerifierEnvironment,
+  createWaterfallCollector,
   localShell,
   gitWorkspace,
   jjWorkspace
 };
-//# sourceMappingURL=chunk-IW2LMLK6.js.map
+//# sourceMappingURL=chunk-CM2IK7VS.js.map