@tangle-network/agent-runtime 0.48.0 → 0.50.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. package/README.md +79 -15
  2. package/dist/agent.d.ts +1 -1
  3. package/dist/agent.js +1 -1
  4. package/dist/analyst-loop.d.ts +1 -1
  5. package/dist/{chunk-656G2XCL.js → chunk-BKAIVNFA.js} +3 -3
  6. package/dist/{chunk-IW2LMLK6.js → chunk-CM2IK7VS.js} +913 -152
  7. package/dist/chunk-CM2IK7VS.js.map +1 -0
  8. package/dist/{chunk-VR4JIC5H.js → chunk-ML4IXGTV.js} +2 -2
  9. package/dist/{chunk-TJS7S3HJ.js → chunk-NDM5VXZW.js} +19 -8
  10. package/dist/chunk-NDM5VXZW.js.map +1 -0
  11. package/dist/chunk-OM3YNZIW.js +978 -0
  12. package/dist/chunk-OM3YNZIW.js.map +1 -0
  13. package/dist/{chunk-JNPK46YH.js → chunk-RHW75JW5.js} +498 -350
  14. package/dist/chunk-RHW75JW5.js.map +1 -0
  15. package/dist/{coder-CVZNGbyg.d.ts → coder-_YCf3BAK.d.ts} +2 -2
  16. package/dist/{driver-DYU2sgHr.d.ts → driver-DLI1io57.d.ts} +1 -1
  17. package/dist/index.d.ts +34 -9
  18. package/dist/index.js +117 -27
  19. package/dist/index.js.map +1 -1
  20. package/dist/kb-gate-CHAyt4aI.d.ts +1571 -0
  21. package/dist/{loop-runner-bin-DEm4roYF.d.ts → loop-runner-bin-DFUNgpeK.d.ts} +4 -4
  22. package/dist/loop-runner-bin.d.ts +5 -5
  23. package/dist/loop-runner-bin.js +3 -3
  24. package/dist/loops.d.ts +6 -6
  25. package/dist/loops.js +17 -1
  26. package/dist/mcp/bin.js +206 -29
  27. package/dist/mcp/bin.js.map +1 -1
  28. package/dist/mcp/index.d.ts +41 -177
  29. package/dist/mcp/index.js +40 -6
  30. package/dist/mcp/index.js.map +1 -1
  31. package/dist/openai-tools-D4HLDWgw.d.ts +45 -0
  32. package/dist/platform.js +2 -2
  33. package/dist/platform.js.map +1 -1
  34. package/dist/profiles.d.ts +2 -2
  35. package/dist/{run-loop-DvD4aGiE.d.ts → run-loop-BIineL1T.d.ts} +1 -1
  36. package/dist/runtime.d.ts +403 -24
  37. package/dist/runtime.js +17 -1
  38. package/dist/{types-BpDfCPUp.d.ts → types-5MGt5KTY.d.ts} +1 -1
  39. package/dist/{types-nBMuollC.d.ts → types-BEQsBhOE.d.ts} +1 -1
  40. package/dist/workflow.d.ts +2 -2
  41. package/dist/workflow.js +1 -1
  42. package/package.json +6 -5
  43. package/dist/chunk-IW2LMLK6.js.map +0 -1
  44. package/dist/chunk-JNPK46YH.js.map +0 -1
  45. package/dist/chunk-LX66I3SC.js +0 -218
  46. package/dist/chunk-LX66I3SC.js.map +0 -1
  47. package/dist/chunk-TJS7S3HJ.js.map +0 -1
  48. package/dist/kb-gate-51BlLlVM.d.ts +0 -529
  49. package/dist/otel-export-EzfsVUhh.d.ts +0 -191
  50. /package/dist/{chunk-656G2XCL.js.map → chunk-BKAIVNFA.js.map} +0 -0
  51. /package/dist/{chunk-VR4JIC5H.js.map → chunk-ML4IXGTV.js.map} +0 -0
@@ -426,6 +426,103 @@ function isNoEntError(err) {
426
426
  return typeof err === "object" && err !== null && "code" in err && err.code === "ENOENT";
427
427
  }
428
428
 
429
+ // src/runtime/anytime.ts
430
/**
 * Median of a list of numbers.
 *
 * The input is copied before sorting, so the caller's array is never reordered.
 *
 * @param xs Array of numbers.
 * @returns The middle value (mean of the two middle values for even lengths),
 *   or `null` when `xs` is empty.
 */
var median = (xs) => {
  const count = xs.length;
  if (count === 0) return null;
  const sorted = Array.from(xs);
  sorted.sort((lhs, rhs) => lhs - rhs);
  const upper = Math.floor(sorted.length / 2);
  if (sorted.length % 2 === 1) {
    return sorted[upper];
  }
  return (sorted[upper - 1] + sorted[upper]) / 2;
};
436
/**
 * Summarize "anytime" performance (best score reached versus effort spent) from shot spans.
 *
 * Only spans whose `label` starts with "shot:" are used. They are grouped by `runId`;
 * a runId of the form `agentic:<strategy>:<taskId>` is split into its two parts, any
 * other runId is used verbatim as both the strategy and the task id.
 *
 * @param spans Iterable of spans. Reads `label`, `runId`, `startMs`, `endMs`, `usd`, `score`.
 * @param opts Optional settings:
 *   - `targets`: score thresholds to report against (default `[1]`).
 *   - `targetFor(taskId)`: per-task threshold. When given it replaces `targets` for hit
 *     detection, and the per-strategy rows carry `target: NaN`.
 * @returns `{ targets, perTask, perStrategy }`, where `perTask` holds the cumulative
 *   curve and first-hit record per run and `perStrategy` holds one summary row per
 *   (strategy, target), sorted by strategy name and then target.
 */
function anytimeReport(spans, opts) {
  const targets = opts?.targets ?? [1];
  const byRun = /* @__PURE__ */ new Map();
  for (const s of spans) {
    if (!s.label.startsWith("shot:")) continue;
    const list = byRun.get(s.runId) ?? [];
    list.push(s);
    byRun.set(s.runId, list);
  }
  const perTask = [];
  for (const [runId, shots] of byRun) {
    const m = runId.match(/^agentic:(.+):(.+)$/);
    const strategy = m?.[1] ?? runId;
    const taskId = m?.[2] ?? runId;
    // Shots are replayed in completion order; a shot without endMs falls back to its start.
    const ordered = [...shots].sort((a, b) => (a.endMs ?? a.startMs) - (b.endMs ?? b.startMs));
    // Fold instead of Math.min(...array): spreading a very long array into call
    // arguments can exceed the engine's argument limit.
    const t0 = ordered.reduce((lo, s) => Math.min(lo, s.startMs), Infinity);
    const taskTargets = opts?.targetFor ? [opts.targetFor(taskId)] : targets;
    let best = 0;
    let cumUsd = 0;
    const points = [];
    const hits = {};
    for (const t of taskTargets) hits[String(t)] = null;
    for (const s of ordered) {
      // Guard usd the same way score is guarded: a span without a numeric cost must
      // not turn the running total (and every later point) into NaN.
      cumUsd += Number.isFinite(s.usd) ? s.usd : 0;
      if (typeof s.score === "number" && s.score > best) best = s.score;
      const elapsedMs = (s.endMs ?? s.startMs) - t0;
      points.push({ elapsedMs, cumUsd, best });
      for (const t of taskTargets) {
        // Record only the first shot at which the running best reaches the target.
        if (hits[String(t)] === null && best >= t) {
          hits[String(t)] = { ms: elapsedMs, shots: points.length, usd: cumUsd };
        }
      }
    }
    perTask.push({ taskId, strategy, points, hits });
  }
  const byStrategy = /* @__PURE__ */ new Map();
  for (const t of perTask) {
    const list = byStrategy.get(t.strategy) ?? [];
    list.push(t);
    byStrategy.set(t.strategy, list);
  }
  const perStrategy = [];
  for (const [strategy, tasks] of byStrategy) {
    // Totals count ALL tasks (successful or not); they feed the ertMs / erUsd ratios below.
    const totalMs = tasks.reduce((s, t) => s + (t.points[t.points.length - 1]?.elapsedMs ?? 0), 0);
    const totalUsd = tasks.reduce((s, t) => s + (t.points[t.points.length - 1]?.cumUsd ?? 0), 0);
    const maxShots = Math.max(0, ...tasks.map((t) => t.points.length));
    const curveByShot = [];
    for (let i = 0; i < maxShots; i += 1) {
      // A task with fewer shots than `i` contributes its final best (curve held flat).
      const vals = tasks.map(
        (t) => t.points[Math.min(i, t.points.length - 1)].best
      );
      curveByShot.push(vals.reduce((s, v) => s + v, 0) / vals.length);
    }
    const auc = curveByShot.length > 0 ? curveByShot.reduce((s, v) => s + v, 0) / curveByShot.length : 0;
    // With per-task targets there is no single threshold to report, so one row with NaN.
    const summaryTargets = opts?.targetFor ? [Number.NaN] : targets;
    for (const t of summaryTargets) {
      const key = (taskCurve) => opts?.targetFor ? Object.values(taskCurve.hits)[0] ?? null : taskCurve.hits[String(t)] ?? null;
      const reached = tasks.filter((x) => key(x) !== null);
      perStrategy.push({
        strategy,
        target: t,
        tasks: tasks.length,
        reachedTarget: reached.length,
        medianTttMs: median(reached.map((x) => key(x).ms)),
        medianShotsToTarget: median(reached.map((x) => key(x).shots)),
        ertMs: reached.length > 0 ? totalMs / reached.length : null,
        erUsd: reached.length > 0 ? totalUsd / reached.length : null,
        curveByShot,
        auc
      });
    }
  }
  perStrategy.sort((a, b) => a.strategy.localeCompare(b.strategy) || a.target - b.target);
  return { targets, perTask, perStrategy };
}
511
/**
 * Render an anytime report as a plain-text table, one row per `report.perStrategy` entry.
 *
 * @param report Object with `targets` (array) and `perStrategy` rows. Each row supplies
 *   `strategy`, `target`, `tasks`, `reachedTarget`, `medianTttMs`, `medianShotsToTarget`,
 *   `ertMs`, `erUsd`, `curveByShot` and `auc`.
 * @returns The table as a single newline-joined string (two header lines, then the rows).
 */
function renderAnytimeTable(report) {
  const glyphs = "\u2581\u2582\u2583\u2584\u2585\u2586\u2587\u2588";
  const missing = " \u2014";
  const lines = [
    `anytime metrics \xB7 satisficing targets [${report.targets.join(", ")}] \xB7 ERT = \u03A3 all wall-time / #successes (COCO)`,
    "strategy \u2265tgt reach med-TTT med-shots ERT(all-in) $/success AUC curve"
  ];
  for (const s of report.perStrategy) {
    const curve = s.curveByShot.map((v) => {
      // Clamp on both sides: a negative or NaN value would index outside the glyph
      // string and silently drop a column from the sparkline.
      const level = Math.floor(v * 8);
      return glyphs[Number.isNaN(level) ? 0 : Math.max(0, Math.min(7, level))];
    }).join("");
    // A NaN target marks a row built from per-task targets.
    const tgt = Number.isNaN(s.target) ? "task" : s.target.toFixed(2);
    const reach = `${String(s.reachedTarget).padStart(4)}/${String(s.tasks).padEnd(3)}`;
    const ttt = s.medianTttMs === null ? missing : `${(s.medianTttMs / 1e3).toFixed(1).padStart(6)}s`;
    const shots = s.medianShotsToTarget === null ? missing : String(s.medianShotsToTarget).padStart(5);
    const ert = s.ertMs === null ? missing : `${(s.ertMs / 1e3).toFixed(1).padStart(9)}s`;
    const usd = s.erUsd === null ? missing : `$${s.erUsd.toFixed(4)}`;
    lines.push(
      `${s.strategy.padEnd(19)} ${tgt.padStart(4)} ${reach} ${ttt} ${shots} ${ert} ${usd} ${s.auc.toFixed(2)} ${curve}`
    );
  }
  return lines.join("\n");
}
525
+
429
526
  // src/runtime/audit-intent.ts
430
527
  var defaultAuditorInstruction = "You audit whether an AI agent is on the RIGHT ROUTE \u2014 not whether it works hard, but whether its actions serve the stated intents. Infer the REVEALED intent from the action pattern (what the trajectory is actually optimizing). Compare against the declared task intent, the user intent when given, and the meta-intent when given. Flawless execution down the wrong route is DIVERGED. Busy-work that neither advances nor harms is DRIFTING. Judge only from the trajectory \u2014 be specific about which actions ground your verdict. Recommend abort only when continuing cannot serve the intent.";
431
528
  function summarize(trace, maxLines) {
@@ -2346,20 +2443,20 @@ async function finalizeSettlement(child, settlement, seq, args, now) {
2346
2443
  }
2347
2444
  async function runChild(live, executor, childAbort, task, opts, pool2, ticket, blobs) {
2348
2445
  let reconciled = false;
2349
- const reconcileOnce = (spend2) => {
2446
+ const reconcileOnce = (spend) => {
2350
2447
  if (reconciled) return;
2351
2448
  reconciled = true;
2352
- pool2.reconcile(ticket, clampSpend(spend2, opts.budget));
2449
+ pool2.reconcile(ticket, clampSpend(spend, opts.budget));
2353
2450
  };
2354
2451
  try {
2355
2452
  live.status = "running";
2356
2453
  const ran = executor.execute(task, childAbort.signal);
2357
2454
  let artifact;
2358
2455
  if (isAsyncIterable2(ran)) {
2359
- const spend2 = await foldStream(ran);
2360
- live.spent = spend2;
2456
+ const spend = await foldStream(ran);
2457
+ live.spent = spend;
2361
2458
  artifact = executor.resultArtifact();
2362
- reconcileOnce(spend2);
2459
+ reconcileOnce(spend);
2363
2460
  } else {
2364
2461
  const terminal = await ran;
2365
2462
  live.spent = terminal.spent;
@@ -2448,21 +2545,21 @@ async function foldStream(stream) {
2448
2545
  }
2449
2546
  return { iterations, tokens, usd, ms: 0 };
2450
2547
  }
2451
- function clampSpend(spend2, budget) {
2452
- const totalTokens2 = spend2.tokens.input + spend2.tokens.output;
2548
+ function clampSpend(spend, budget) {
2549
+ const totalTokens2 = spend.tokens.input + spend.tokens.output;
2453
2550
  const tokensOk = totalTokens2 <= budget.maxTokens;
2454
- const itersOk = spend2.iterations <= budget.maxIterations;
2455
- const usdOk = budget.maxUsd === void 0 || spend2.usd <= budget.maxUsd;
2456
- if (tokensOk && itersOk && usdOk) return spend2;
2551
+ const itersOk = spend.iterations <= budget.maxIterations;
2552
+ const usdOk = budget.maxUsd === void 0 || spend.usd <= budget.maxUsd;
2553
+ if (tokensOk && itersOk && usdOk) return spend;
2457
2554
  const ratio = !tokensOk && totalTokens2 > 0 ? budget.maxTokens / totalTokens2 : 1;
2458
2555
  return {
2459
- iterations: Math.min(spend2.iterations, budget.maxIterations),
2556
+ iterations: Math.min(spend.iterations, budget.maxIterations),
2460
2557
  tokens: ratio < 1 ? {
2461
- input: Math.floor(spend2.tokens.input * ratio),
2462
- output: Math.floor(spend2.tokens.output * ratio)
2463
- } : spend2.tokens,
2464
- usd: budget.maxUsd === void 0 ? spend2.usd : Math.min(spend2.usd, budget.maxUsd),
2465
- ms: spend2.ms
2558
+ input: Math.floor(spend.tokens.input * ratio),
2559
+ output: Math.floor(spend.tokens.output * ratio)
2560
+ } : spend.tokens,
2561
+ usd: budget.maxUsd === void 0 ? spend.usd : Math.min(spend.usd, budget.maxUsd),
2562
+ ms: spend.ms
2466
2563
  };
2467
2564
  }
2468
2565
  async function teardownSafe(executor, grace) {
@@ -3012,7 +3109,138 @@ function isNoEntError2(err) {
3012
3109
 
3013
3110
  // src/runtime/supervise/runtime.ts
3014
3111
  import { spawn } from "child_process";
3112
+ import { estimateCost as estimateCost2, isModelPriced as isModelPriced2 } from "@tangle-network/agent-eval";
3113
+
3114
+ // src/runtime/router-client.ts
3015
3115
  import { estimateCost, isModelPriced } from "@tangle-network/agent-eval";
3116
/**
 * POST one chat completion to the router and return the parsed result, retrying
 * transient failures.
 *
 * Up to five attempts are made. A 400 whose body mentions "temperature" switches the
 * request to temperature 1 and tries again immediately; statuses in the retryable list
 * back off exponentially (800ms, doubling) between attempts; any other status throws
 * at once.
 *
 * @param cfg `{ routerBaseUrl, routerKey, model }`.
 * @param messages Chat messages to send.
 * @param opts Optional `temperature` (default 0.2), `maxTokens` (default 8192), `signal`.
 * @returns Whatever `parseChatResult` produces for the response body.
 * @throws Error carrying the router status and the first 200 characters of the body.
 */
async function routerChatWithUsage(cfg, messages, opts) {
  const retryableStatuses = [408, 425, 429, 500, 502, 503, 504, 520, 522, 524];
  const endpoint = `${cfg.routerBaseUrl.replace(/\/$/, "")}/chat/completions`;
  const requestHeaders = {
    "content-type": "application/json",
    authorization: `Bearer ${cfg.routerKey}`
  };
  let currentTemperature = opts?.temperature ?? 0.2;
  let failure = "";
  for (let tryNo = 1; tryNo <= 5; tryNo += 1) {
    const init = {
      method: "POST",
      headers: requestHeaders,
      // max_tokens default is generous: THINKING models (kimi-k2.6) spend the budget on
      // reasoning_content first — a small router default yields EMPTY content.
      body: JSON.stringify({
        model: cfg.model,
        messages,
        temperature: currentTemperature,
        max_tokens: opts?.maxTokens ?? 8192
      })
    };
    if (opts?.signal) init.signal = opts.signal;
    const response = await fetch(endpoint, init);
    if (response.ok) {
      return parseChatResult(await response.json(), cfg.model);
    }
    const code = response.status;
    const snippet = (await response.text()).slice(0, 200);
    failure = `router ${code}: ${snippet}`;
    const temperatureRejected = code === 400 && /temperature/i.test(snippet) && currentTemperature !== 1;
    if (temperatureRejected) {
      // Retry straight away (no back-off) with the one temperature value being tried.
      currentTemperature = 1;
      continue;
    }
    if (!retryableStatuses.includes(code)) {
      throw new Error(failure);
    }
    if (tryNo < 5) {
      const delayMs = 800 * 2 ** (tryNo - 1);
      await new Promise((resolve) => setTimeout(resolve, delayMs));
    }
  }
  throw new Error(`${failure} (exhausted retries)`);
}
3149
/**
 * Convert a chat-completions response body into `{ content, usage?, costUsd? }`.
 *
 * @param json Decoded response body. Anything that is not an object (including `null`)
 *   is treated as an empty response instead of throwing.
 * @param model Model id, passed to `isModelPriced` / `estimateCost`.
 * @returns `content` is the first choice's message content ("" when absent). `usage` is
 *   included only when both token counts are numbers. `costUsd` is included only when
 *   usage is present and the model is priced.
 */
function parseChatResult(json, model) {
  // A 200 with a `null` or primitive body must degrade to empty content, like every
  // other missing field here, rather than crash on property access.
  const data = json !== null && typeof json === "object" ? json : {};
  const u = data.usage;
  const usage = u && typeof u.prompt_tokens === "number" && typeof u.completion_tokens === "number" ? { input: u.prompt_tokens, output: u.completion_tokens } : void 0;
  const costUsd = usage && isModelPriced(model) ? estimateCost(usage.input, usage.output, model) : void 0;
  return {
    content: data.choices?.[0]?.message?.content ?? "",
    ...usage ? { usage } : {},
    ...costUsd !== void 0 ? { costUsd } : {}
  };
}
3160
/**
 * POST one tool-enabled chat completion to the router (single attempt, no retry).
 *
 * @param cfg `{ routerBaseUrl, routerKey, model }`.
 * @param messages Chat messages to send.
 * @param tools Tool definitions forwarded as the request's `tools` field.
 * @param opts Optional `toolChoice` (default "auto"), `temperature` (default 0.3), `signal`.
 * @returns `{ content, toolCalls, usage?, costUsd? }`. `content` is `null` when the
 *   message has none. Each tool call is `{ id, name, arguments }`, where `arguments`
 *   is always a non-empty JSON string.
 * @throws Error carrying the router status and the first 200 characters of the body
 *   when the response is not ok.
 */
async function routerChatWithTools(cfg, messages, tools, opts) {
  const res = await fetch(`${cfg.routerBaseUrl.replace(/\/$/, "")}/chat/completions`, {
    method: "POST",
    headers: { "content-type": "application/json", authorization: `Bearer ${cfg.routerKey}` },
    body: JSON.stringify({
      model: cfg.model,
      messages,
      tools,
      tool_choice: opts?.toolChoice ?? "auto",
      temperature: opts?.temperature ?? 0.3
    }),
    ...opts?.signal ? { signal: opts.signal } : {}
  });
  if (!res.ok) throw new Error(`router ${res.status}: ${(await res.text()).slice(0, 200)}`);
  const data = await res.json();
  const msg = data.choices?.[0]?.message;
  const toolCalls = (msg?.tool_calls ?? []).map((tc, i) => {
    const raw = tc.function?.arguments;
    // Normalize to a JSON string: a blank string (zero-argument call) becomes "{}",
    // and an already-decoded object is re-serialized, so consumers can always
    // JSON.parse / slice the value.
    const args = typeof raw === "string" ? raw.trim() === "" ? "{}" : raw : JSON.stringify(raw ?? {});
    return {
      id: tc.id ?? `call_${i}`,
      name: tc.function?.name ?? "",
      arguments: args
    };
  });
  const u = data.usage;
  const usage = u && typeof u.prompt_tokens === "number" && typeof u.completion_tokens === "number" ? { input: u.prompt_tokens, output: u.completion_tokens } : void 0;
  const costUsd = usage && isModelPriced(cfg.model) ? estimateCost(usage.input, usage.output, cfg.model) : void 0;
  return {
    content: msg?.content ?? null,
    toolCalls,
    ...usage ? { usage } : {},
    ...costUsd !== void 0 ? { costUsd } : {}
  };
}
3191
/**
 * Drive a chat ↔ tool loop: ask the model, execute the tool calls it requests, feed the
 * results back, and repeat until it answers without tool calls or `maxTurns` is spent.
 *
 * @param cfg Router config passed through to `routerChatWithTools`.
 * @param system System prompt.
 * @param user User prompt.
 * @param tools Tool definitions offered to the model.
 * @param execute `(name, args) => result`; awaited once per tool call whose arguments
 *   parse as JSON. Its result is sent back as the tool message content.
 * @param opts Optional `maxTurns` (default 4), `temperature`, `signal`.
 * @returns `{ final, turns, toolCalls, toolTrace, usage }`. `final` is the last non-empty
 *   assistant text seen; `usage` sums the token counts reported per turn.
 */
async function routerToolLoop(cfg, system, user, tools, execute, opts) {
  const maxTurns = opts?.maxTurns ?? 4;
  const messages = [
    { role: "system", content: system },
    { role: "user", content: user }
  ];
  let toolCalls = 0;
  let lastText = "";
  const usage = { input: 0, output: 0 };
  const toolTrace = [];
  // A blank arguments string means "no arguments", and some providers hand back an
  // already-decoded object; both are normalized to a JSON string so parsing, replay
  // and the error path below all see a string.
  const normalizeArgs = (raw) => {
    if (typeof raw !== "string") return JSON.stringify(raw ?? {});
    return raw.trim() === "" ? "{}" : raw;
  };
  for (let turn = 1; turn <= maxTurns; turn += 1) {
    const r = await routerChatWithTools(cfg, messages, tools, {
      ...opts?.temperature !== void 0 ? { temperature: opts.temperature } : {},
      ...opts?.signal ? { signal: opts.signal } : {}
    });
    if (r.usage) {
      usage.input += r.usage.input;
      usage.output += r.usage.output;
    }
    if (r.content) lastText = r.content;
    if (r.toolCalls.length === 0)
      return { final: lastText, turns: turn, toolCalls, toolTrace, usage };
    const calls = r.toolCalls.map((tc) => ({ ...tc, arguments: normalizeArgs(tc.arguments) }));
    // The assistant turn must be replayed with its tool_calls so each tool message
    // below can reference a known tool_call_id.
    messages.push({
      role: "assistant",
      content: r.content ?? "",
      tool_calls: calls.map((tc) => ({
        id: tc.id,
        type: "function",
        function: { name: tc.name, arguments: tc.arguments }
      }))
    });
    for (const tc of calls) {
      toolCalls += 1;
      let args = {};
      try {
        args = JSON.parse(tc.arguments);
      } catch {
        // Report the malformed call back to the model instead of executing it.
        messages.push({
          role: "tool",
          tool_call_id: tc.id,
          content: `error: arguments were not valid JSON: ${tc.arguments.slice(0, 200)}`
        });
        continue;
      }
      const out = await execute(tc.name, args);
      messages.push({ role: "tool", tool_call_id: tc.id, content: out });
      toolTrace.push({ name: tc.name, args: tc.arguments, result: out });
    }
  }
  return { final: lastText, turns: maxTurns, toolCalls, toolTrace, usage };
}
3242
+
3243
+ // src/runtime/supervise/runtime.ts
3016
3244
  var routerSeamKey = "router";
3017
3245
  var sandboxSeamKey = "sandbox";
3018
3246
  var cliSeamKey = "cli";
@@ -3058,30 +3286,19 @@ var routerInlineExecutor = (spec, ctx) => {
3058
3286
  const messages = taskToMessages(task, spec);
3059
3287
  const started = Date.now();
3060
3288
  const linked = linkSignals(signal, controller.signal);
3061
- const res = await fetch(`${seam.routerBaseUrl.replace(/\/$/, "")}/chat/completions`, {
3062
- method: "POST",
3063
- headers: { "content-type": "application/json", authorization: `Bearer ${seam.routerKey}` },
3064
- body: JSON.stringify({ model, messages, temperature: 0.2 }),
3065
- ...linked ? { signal: linked } : {}
3066
- });
3067
- if (!res.ok) {
3068
- throw new ValidationError(
3069
- `routerInlineExecutor: router ${res.status}: ${(await res.text()).slice(0, 200)}`
3070
- );
3071
- }
3072
- const data = await res.json();
3073
- const u = data.usage;
3074
- const usage = u && typeof u.prompt_tokens === "number" && typeof u.completion_tokens === "number" ? { input: u.prompt_tokens, output: u.completion_tokens } : void 0;
3075
- const usd = usage && isModelPriced(model) ? estimateCost(usage.input, usage.output, model) : 0;
3076
- const content = data.choices?.[0]?.message?.content ?? "";
3289
+ const r = await routerChatWithUsage(
3290
+ { routerBaseUrl: seam.routerBaseUrl, routerKey: seam.routerKey, model },
3291
+ messages,
3292
+ linked ? { signal: linked } : {}
3293
+ );
3077
3294
  const spent = {
3078
3295
  iterations: 1,
3079
- tokens: usage ? { input: usage.input, output: usage.output } : zeroTokenUsage(),
3080
- usd,
3296
+ tokens: r.usage ? { input: r.usage.input, output: r.usage.output } : zeroTokenUsage(),
3297
+ usd: r.costUsd ?? 0,
3081
3298
  ms: Date.now() - started
3082
3299
  };
3083
- const out = { content };
3084
- artifact = { outRef: contentRef("router", { model, content }), out, spent };
3300
+ const out = { content: r.content };
3301
+ artifact = { outRef: contentRef("router", { model, content: r.content }), out, spent };
3085
3302
  return artifact;
3086
3303
  },
3087
3304
  teardown(_grace) {
@@ -3110,7 +3327,7 @@ var routerToolsInlineExecutor = (spec, ctx) => {
3110
3327
  "routerToolsInlineExecutor: RouterToolsSeam.routerBaseUrl + routerKey required"
3111
3328
  );
3112
3329
  }
3113
- const maxTurns = seam.maxTurns ?? 4;
3330
+ const maxTurns = seam.maxTurns ?? 200;
3114
3331
  const controller = new AbortController();
3115
3332
  const abortIfSignalled = () => {
3116
3333
  if (ctx.signal.aborted) controller.abort();
@@ -3188,7 +3405,7 @@ var routerToolsInlineExecutor = (spec, ctx) => {
3188
3405
  messages.push({ role: "tool", tool_call_id: id, content: result });
3189
3406
  }
3190
3407
  }
3191
- const usd = isModelPriced(model) ? estimateCost(tokens.input, tokens.output, model) : 0;
3408
+ const usd = isModelPriced2(model) ? estimateCost2(tokens.input, tokens.output, model) : 0;
3192
3409
  const spent = { iterations: turns, tokens, usd, ms: Date.now() - started };
3193
3410
  const out = { content: lastText };
3194
3411
  artifact = { outRef: contentRef("router-tools", { model, content: lastText }), out, spent };
@@ -4228,12 +4445,12 @@ function countStatuses(reported) {
4228
4445
  function zeroSpend4() {
4229
4446
  return { iterations: 0, tokens: zeroTokenUsage(), usd: 0, ms: 0 };
4230
4447
  }
4231
- function cloneSpend(spend2) {
4448
+ function cloneSpend(spend) {
4232
4449
  return {
4233
- iterations: spend2.iterations,
4234
- tokens: { input: spend2.tokens.input, output: spend2.tokens.output },
4235
- usd: spend2.usd,
4236
- ms: spend2.ms
4450
+ iterations: spend.iterations,
4451
+ tokens: { input: spend.tokens.input, output: spend.tokens.output },
4452
+ usd: spend.usd,
4453
+ ms: spend.ms
4237
4454
  };
4238
4455
  }
4239
4456
  function addSpend(acc, delta) {
@@ -4249,13 +4466,13 @@ function spreadOf(values) {
4249
4466
  function fractionalSpread(values) {
4250
4467
  const spread = spreadOf(values);
4251
4468
  if (spread === 0) return 0;
4252
- const median = medianOf(values);
4253
- if (median === 0) {
4469
+ const median2 = medianOf(values);
4470
+ if (median2 === 0) {
4254
4471
  throw new Error(
4255
4472
  "equalKOnCost: arms have a non-zero cost spread on a zero-median channel; cannot express it as a fraction"
4256
4473
  );
4257
4474
  }
4258
- return spread / median;
4475
+ return spread / median2;
4259
4476
  }
4260
4477
  function medianOf(values) {
4261
4478
  if (values.length === 0) {
@@ -4287,28 +4504,34 @@ function requireNode2(nodes, id, root) {
4287
4504
  return node;
4288
4505
  }
4289
4506
  function requireSpend(rolled, id, root) {
4290
- const spend2 = rolled.get(id);
4291
- if (!spend2) {
4507
+ const spend = rolled.get(id);
4508
+ if (!spend) {
4292
4509
  throw new Error(
4293
4510
  `trajectoryReport: node '${id}' was never rolled up in tree '${root}' (unreachable from root)`
4294
4511
  );
4295
4512
  }
4296
- return spend2;
4513
+ return spend;
4297
4514
  }
4298
4515
 
4299
4516
  // src/runtime/promotion-gate.ts
4300
4517
  import { heldoutSignificance } from "@tangle-network/agent-eval/campaign";
4301
4518
  function promotionGate(opts) {
4519
+ const mode = opts.mode ?? "superiority";
4302
4520
  if (opts.candidate === opts.incumbent) {
4303
4521
  return {
4304
4522
  promoted: false,
4305
4523
  reason: "identical-champion",
4524
+ mode,
4306
4525
  n: 0,
4307
4526
  lift: { mean: 0, median: 0, low: 0, high: 0 }
4308
4527
  };
4309
4528
  }
4310
4529
  const before = [];
4311
4530
  const after = [];
4531
+ const incUsd = [];
4532
+ const candUsd = [];
4533
+ const incMs = [];
4534
+ const candMs = [];
4312
4535
  const cellIds = [];
4313
4536
  for (const row of opts.report.perTask) {
4314
4537
  const inc = row.cells?.[opts.incumbent];
@@ -4316,6 +4539,10 @@ function promotionGate(opts) {
4316
4539
  if (!inc || !cand) continue;
4317
4540
  before.push(inc.score);
4318
4541
  after.push(cand.score);
4542
+ incUsd.push(inc.usd);
4543
+ candUsd.push(cand.usd);
4544
+ incMs.push(inc.ms);
4545
+ candMs.push(cand.ms);
4319
4546
  cellIds.push(row.taskId);
4320
4547
  }
4321
4548
  if (before.length === 0) {
@@ -4339,15 +4566,91 @@ function promotionGate(opts) {
4339
4566
  low: sig.bootstrap.low,
4340
4567
  high: sig.bootstrap.high
4341
4568
  };
4342
- if (sig.fewRuns) return { promoted: false, reason: "few-tasks", n: sig.n, lift };
4343
- return sig.significant ? { promoted: true, reason: "significant", n: sig.n, lift } : { promoted: false, reason: "no-margin", n: sig.n, lift };
4569
+ const latSig = heldoutSignificance(
4570
+ { before: incMs, after: candMs, cellIds },
4571
+ {
4572
+ deltaThreshold: 0,
4573
+ minProductiveRuns: 1,
4574
+ statistic: opts.statistic ?? "mean",
4575
+ ...opts.seed !== void 0 ? { seed: opts.seed } : {},
4576
+ ...opts.resamples !== void 0 ? { resamples: opts.resamples } : {}
4577
+ }
4578
+ );
4579
+ const latency = {
4580
+ mean: latSig.bootstrap.mean,
4581
+ median: latSig.bootstrap.median,
4582
+ low: latSig.bootstrap.low,
4583
+ high: latSig.bootstrap.high
4584
+ };
4585
+ if (mode === "superiority") {
4586
+ if (sig.fewRuns) return { promoted: false, reason: "few-tasks", mode, n: sig.n, lift, latency };
4587
+ return sig.significant ? { promoted: true, reason: "significant", mode, n: sig.n, lift, latency } : { promoted: false, reason: "no-margin", mode, n: sig.n, lift, latency };
4588
+ }
4589
+ const tolerance = opts.scoreTolerance ?? 0.05;
4590
+ const scoreSig = heldoutSignificance(
4591
+ { before, after, cellIds },
4592
+ {
4593
+ deltaThreshold: -tolerance,
4594
+ minProductiveRuns: opts.minPairedTasks ?? 6,
4595
+ statistic: opts.statistic ?? "mean",
4596
+ ...opts.seed !== void 0 ? { seed: opts.seed } : {},
4597
+ ...opts.resamples !== void 0 ? { resamples: opts.resamples } : {}
4598
+ }
4599
+ );
4600
+ const costSig = heldoutSignificance(
4601
+ { before: candUsd, after: incUsd, cellIds },
4602
+ {
4603
+ deltaThreshold: 0,
4604
+ minProductiveRuns: opts.minPairedTasks ?? 6,
4605
+ statistic: opts.statistic ?? "mean",
4606
+ ...opts.seed !== void 0 ? { seed: opts.seed } : {},
4607
+ ...opts.resamples !== void 0 ? { resamples: opts.resamples } : {}
4608
+ }
4609
+ );
4610
+ const costSavings = {
4611
+ mean: costSig.bootstrap.mean,
4612
+ median: costSig.bootstrap.median,
4613
+ low: costSig.bootstrap.low,
4614
+ high: costSig.bootstrap.high
4615
+ };
4616
+ if (scoreSig.fewRuns)
4617
+ return { promoted: false, reason: "few-tasks", mode, n: scoreSig.n, lift, costSavings, latency };
4618
+ if (!scoreSig.significant)
4619
+ return {
4620
+ promoted: false,
4621
+ reason: "non-inferiority-unproven",
4622
+ mode,
4623
+ n: scoreSig.n,
4624
+ lift,
4625
+ costSavings,
4626
+ latency
4627
+ };
4628
+ if (!costSig.significant)
4629
+ return {
4630
+ promoted: false,
4631
+ reason: "not-cheaper",
4632
+ mode,
4633
+ n: scoreSig.n,
4634
+ lift,
4635
+ costSavings,
4636
+ latency
4637
+ };
4638
+ return {
4639
+ promoted: true,
4640
+ reason: "non-inferior-and-cheaper",
4641
+ mode,
4642
+ n: scoreSig.n,
4643
+ lift,
4644
+ costSavings,
4645
+ latency
4646
+ };
4344
4647
  }
4345
4648
 
4346
4649
  // src/runtime/run-benchmark.ts
4347
4650
  import { pairedBootstrap, paretoFrontier } from "@tangle-network/agent-eval";
4348
4651
 
4349
4652
  // src/runtime/strategy.ts
4350
- import { createChatClient, estimateCost as estimateCost2, isModelPriced as isModelPriced2 } from "@tangle-network/agent-eval";
4653
+ import { createChatClient, estimateCost as estimateCost3, isModelPriced as isModelPriced3 } from "@tangle-network/agent-eval";
4351
4654
  var taskNudge = "Use the available tools to bring the artifact to the required final state. Address EVERY distinct change the request implies. After each tool result, check what remains and continue. Re-read the values you set to confirm they took. Reply DONE only once every required change is made and verified.";
4352
4655
  async function runShot(surface, _task, handle, tools, messages, opts, modelOverride) {
4353
4656
  const innerTurns = opts.innerTurns ?? 4;
@@ -4364,7 +4667,8 @@ async function runShot(surface, _task, handle, tools, messages, opts, modelOverr
4364
4667
  messages,
4365
4668
  tools,
4366
4669
  tool_choice: "auto",
4367
- temperature: opts.temperature ?? 0.7
4670
+ temperature: opts.temperature ?? 0.7,
4671
+ ...opts.maxTokens ? { max_tokens: opts.maxTokens } : {}
4368
4672
  })
4369
4673
  });
4370
4674
  if (!res.ok) throw new Error(`router ${res.status}: ${(await res.text()).slice(0, 200)}`);
@@ -4403,12 +4707,15 @@ async function runShot(surface, _task, handle, tools, messages, opts, modelOverr
4403
4707
  }
4404
4708
  return { messages, completions, toolCalls, toolErrors, tokens };
4405
4709
  }
4406
- async function analyze(task, messages, opts) {
4407
- const trajectory = messages.filter((m) => m.role === "assistant" || m.role === "tool").map((m) => {
4710
+ function compactTrajectory(messages) {
4711
+ return messages.filter((m) => m.role === "assistant" || m.role === "tool").map((m) => {
4408
4712
  if (m.role === "tool") return `RESULT ${String(m.content).slice(0, 280)}`;
4409
4713
  const calls = m.tool_calls?.map((c) => `${c.function.name}(${c.function.arguments})`).join(", ");
4410
4714
  return calls ? `CALL ${calls}` : `SAY ${String(m.content).slice(0, 200)}`;
4411
4715
  }).join("\n").slice(0, 7e3);
4716
+ }
4717
+ async function consultAnalyst(task, messages, instruction, opts) {
4718
+ const trajectory = compactTrajectory(messages);
4412
4719
  const analystModel = opts.analystModel ?? opts.model;
4413
4720
  const chat = createChatClient({
4414
4721
  transport: "router",
@@ -4416,6 +4723,52 @@ async function analyze(task, messages, opts) {
4416
4723
  baseUrl: opts.routerBaseUrl,
4417
4724
  defaultModel: analystModel
4418
4725
  });
4726
+ const res = await chat.chat({
4727
+ model: analystModel,
4728
+ temperature: 0.2,
4729
+ maxTokens: 1024,
4730
+ messages: [
4731
+ { role: "system", content: instruction },
4732
+ {
4733
+ role: "user",
4734
+ content: `TASK: ${task.userPrompt.slice(0, 1500)}
4735
+
4736
+ TRAJECTORY:
4737
+ ${trajectory}`
4738
+ }
4739
+ ]
4740
+ });
4741
+ const usage = res.usage;
4742
+ return {
4743
+ steer: res.content.trim(),
4744
+ tokens: {
4745
+ input: usage?.promptTokens ?? usage?.prompt_tokens ?? 0,
4746
+ output: usage?.completionTokens ?? usage?.completion_tokens ?? 0
4747
+ }
4748
+ };
4749
+ }
4750
+ async function analyze(task, messages, opts) {
4751
+ const trajectory = compactTrajectory(messages);
4752
+ const analystModel = opts.analystModel ?? opts.model;
4753
+ const inner = createChatClient({
4754
+ transport: "router",
4755
+ apiKey: opts.routerKey,
4756
+ baseUrl: opts.routerBaseUrl,
4757
+ defaultModel: analystModel
4758
+ });
4759
+ const tokens = { input: 0, output: 0 };
4760
+ const chat = {
4761
+ ...inner,
4762
+ chat: async (req, callOpts) => {
4763
+ const res = await inner.chat(req, callOpts);
4764
+ const u = res.usage;
4765
+ if (u) {
4766
+ tokens.input += u.promptTokens ?? u.prompt_tokens ?? 0;
4767
+ tokens.output += u.completionTokens ?? u.completion_tokens ?? 0;
4768
+ }
4769
+ return res;
4770
+ }
4771
+ };
4419
4772
  const obs = await observe(
4420
4773
  {
4421
4774
  task: task.userPrompt,
@@ -4432,14 +4785,8 @@ async function analyze(task, messages, opts) {
4432
4785
  }
4433
4786
  );
4434
4787
  const steer = obs.findings.map((f) => f.recommended_action).filter((a) => typeof a === "string" && a.trim().length > 0).join("\n").trim();
4435
- return steer || "COMPLETE";
4788
+ return { steer: steer || "COMPLETE", tokens };
4436
4789
  }
4437
- var spend = (iterations) => ({
4438
- iterations,
4439
- tokens: { input: 0, output: 0 },
4440
- usd: 0,
4441
- ms: 0
4442
- });
4443
4790
  function shotExecutor(surface, opts) {
4444
4791
  let artifact;
4445
4792
  return {
@@ -4449,7 +4796,19 @@ function shotExecutor(surface, opts) {
4449
4796
  const own = !t.handle;
4450
4797
  const handle = t.handle ?? await surface.open(t.task);
4451
4798
  try {
4452
- const tools = await surface.tools(t.task, handle);
4799
+ const allTools = await surface.tools(t.task, handle);
4800
+ let tools = allTools;
4801
+ if (t.tools) {
4802
+ const known = new Set(allTools.map((tool) => tool.function.name));
4803
+ const unknown = t.tools.filter((name) => !known.has(name));
4804
+ if (unknown.length > 0) {
4805
+ throw new Error(
4806
+ `shot tools: unknown tool name(s) ${unknown.join(", ")} \u2014 domain offers: ${[...known].join(", ")}`
4807
+ );
4808
+ }
4809
+ const want = new Set(t.tools);
4810
+ tools = allTools.filter((tool) => want.has(tool.function.name));
4811
+ }
4453
4812
  const messages = t.messages?.length ? t.messages : [
4454
4813
  { role: "system", content: t.persona?.systemPrompt ?? t.task.systemPrompt },
4455
4814
  { role: "user", content: `${t.task.userPrompt}
@@ -4483,7 +4842,7 @@ ${taskNudge}` }
4483
4842
  spent: {
4484
4843
  iterations: shot.completions,
4485
4844
  tokens: shot.tokens,
4486
- usd: isModelPriced2(opts.model) ? estimateCost2(shot.tokens.input, shot.tokens.output, opts.model) : 0,
4845
+ usd: isModelPriced3(opts.model) ? estimateCost3(shot.tokens.input, shot.tokens.output, opts.model) : 0,
4487
4846
  ms: 0
4488
4847
  }
4489
4848
  };
@@ -4505,8 +4864,18 @@ function analystExecutor(opts) {
4505
4864
  runtime: "agentic-analyst",
4506
4865
  async execute(task) {
4507
4866
  const t = task;
4508
- const findings = await analyze(t.task, t.messages, opts);
4509
- artifact = { outRef: `analyst:${findings.length}`, out: findings, spent: spend(1) };
4867
+ const { steer, tokens } = t.rawInstruction ? await consultAnalyst(t.task, t.messages, t.rawInstruction, opts) : await analyze(t.task, t.messages, opts);
4868
+ const analystModel = opts.analystModel ?? opts.model;
4869
+ artifact = {
4870
+ outRef: `analyst:${steer.length}`,
4871
+ out: steer,
4872
+ spent: {
4873
+ iterations: 1,
4874
+ tokens,
4875
+ usd: isModelPriced3(analystModel) ? estimateCost3(tokens.input, tokens.output, analystModel) : 0,
4876
+ ms: 0
4877
+ }
4878
+ };
4510
4879
  return artifact;
4511
4880
  },
4512
4881
  teardown: () => Promise.resolve({ destroyed: true }),
@@ -4669,12 +5038,21 @@ function defineStrategy(name, run) {
4669
5038
  const innerTurns = opts.innerTurns ?? 4;
4670
5039
  let verifiedBest = 0;
4671
5040
  let verifiedResolved = false;
5041
+ const openHandles = /* @__PURE__ */ new Set();
4672
5042
  const ctx = {
4673
5043
  // Narrowed to open/close — the body gets no raw call()/score() access.
4674
5044
  surface: {
4675
5045
  name: surface.name,
4676
- open: (t) => surface.open(t),
4677
- close: (h) => surface.close(h)
5046
+ open: async (t) => {
5047
+ const h = await surface.open(t);
5048
+ openHandles.add(h.id);
5049
+ return h;
5050
+ },
5051
+ close: async (h) => {
5052
+ if (!h || !openHandles.has(h.id)) return;
5053
+ openHandles.delete(h.id);
5054
+ await surface.close(h);
5055
+ }
4678
5056
  },
4679
5057
  task,
4680
5058
  opts,
@@ -4690,7 +5068,8 @@ function defineStrategy(name, run) {
4690
5068
  handle: spec?.handle,
4691
5069
  messages: spec?.messages,
4692
5070
  steer: spec?.steer,
4693
- persona: spec?.persona
5071
+ persona: spec?.persona,
5072
+ tools: spec?.tools
4694
5073
  },
4695
5074
  { budget: perChild(innerTurns), label: child.name }
4696
5075
  );
@@ -4702,6 +5081,13 @@ function defineStrategy(name, run) {
4702
5081
  if (out.total > 0 && out.passes === out.total) verifiedResolved = true;
4703
5082
  return out;
4704
5083
  },
5084
+ async listTools(handle) {
5085
+ const tools = await surface.tools(task, handle);
5086
+ return tools.map((t) => ({
5087
+ name: t.function.name,
5088
+ ...t.function.description ? { description: t.function.description } : {}
5089
+ }));
5090
+ },
4705
5091
  async critique(messages) {
4706
5092
  const child = leaf(`analyst:${seq}`, "analyst");
4707
5093
  seq += 1;
@@ -4715,12 +5101,33 @@ function defineStrategy(name, run) {
4715
5101
  if (settled.kind === "down") return null;
4716
5102
  const findings = settled.out;
4717
5103
  return /^\s*COMPLETE\b/i.test(findings) ? null : findings;
5104
+ },
5105
+ async consult(messages, instruction) {
5106
+ const child = leaf(`analyst:${seq}`, "analyst");
5107
+ seq += 1;
5108
+ const res = scope.spawn(
5109
+ child,
5110
+ { task, messages, rawInstruction: instruction },
5111
+ { budget: perChild(1), label: child.name }
5112
+ );
5113
+ if (!res.ok) return null;
5114
+ const settled = await drainOne2(scope);
5115
+ if (settled.kind === "down") return null;
5116
+ return settled.out;
4718
5117
  }
4719
5118
  };
4720
5119
  const r = await run(ctx);
4721
5120
  return {
4722
5121
  kind: "done",
4723
- deliverable: { mode: name, ...r, score: verifiedBest, resolved: verifiedResolved }
5122
+ deliverable: {
5123
+ mode: name,
5124
+ ...r,
5125
+ progression: Array.isArray(r.progression) ? r.progression : [],
5126
+ completions: typeof r.completions === "number" ? r.completions : 0,
5127
+ shots: typeof r.shots === "number" ? r.shots : 0,
5128
+ score: verifiedBest,
5129
+ resolved: verifiedResolved
5130
+ }
4724
5131
  };
4725
5132
  }
4726
5133
  })
@@ -4875,27 +5282,44 @@ async function runBenchmark(cfg) {
4875
5282
  let settled = 0;
4876
5283
  const perTask = await pool(cfg.tasks, concurrency, async (task) => {
4877
5284
  const cells = {};
5285
+ const errors = {};
4878
5286
  let row;
4879
5287
  try {
4880
5288
  for (const s of strategies) {
4881
- const r = await runAgentic({
4882
- ...cfg.worker,
4883
- surface: cfg.environment,
4884
- task,
4885
- strategy: s,
4886
- budget,
4887
- ...cfg.hooks ? { hooks: cfg.hooks } : {}
4888
- });
4889
- cells[s.name] = {
4890
- score: r.score,
4891
- resolved: r.resolved,
4892
- progression: r.progression,
4893
- usd: r.usd,
4894
- ms: r.ms,
4895
- tokens: r.tokens
4896
- };
5289
+ try {
5290
+ const r = await runAgentic({
5291
+ ...cfg.worker,
5292
+ surface: cfg.environment,
5293
+ task,
5294
+ strategy: s,
5295
+ budget,
5296
+ ...cfg.hooks ? { hooks: cfg.hooks } : {}
5297
+ });
5298
+ cells[s.name] = {
5299
+ score: r.score,
5300
+ resolved: r.resolved,
5301
+ progression: r.progression,
5302
+ usd: r.usd,
5303
+ ms: r.ms,
5304
+ tokens: r.tokens
5305
+ };
5306
+ } catch (e) {
5307
+ errors[s.name] = e instanceof Error ? e.message.slice(0, 300) : String(e);
5308
+ cells[s.name] = {
5309
+ score: 0,
5310
+ resolved: false,
5311
+ progression: [],
5312
+ usd: 0,
5313
+ ms: 0,
5314
+ tokens: { input: 0, output: 0 }
5315
+ };
5316
+ }
4897
5317
  }
4898
- row = { taskId: task.id, cells };
5318
+ row = {
5319
+ taskId: task.id,
5320
+ cells,
5321
+ ...Object.keys(errors).length > 0 ? { errors } : {}
5322
+ };
4899
5323
  } catch (e) {
4900
5324
  row = { taskId: task.id, error: e instanceof Error ? e.message.slice(0, 300) : String(e) };
4901
5325
  }
@@ -5200,7 +5624,7 @@ var strategyAuthorContract = `
5200
5624
  You author an OPTIMIZATION STRATEGY for an agentic loop system. A strategy decides how to
5201
5625
  spend a compute budget to beat a task's deployable check. You compose exactly two steps:
5202
5626
 
5203
- shot(spec?: { handle?, messages?, steer?, persona? }): Promise<ShotResult | null>
5627
+ shot(spec?: { handle?, messages?, steer?, persona?, tools? }): Promise<ShotResult | null>
5204
5628
  Runs ONE worker attempt (a bounded tool loop) over an artifact.
5205
5629
  - omit handle => the shot opens its OWN fresh artifact and closes it after (a sample).
5206
5630
  - pass handle => the shot CONTINUES that artifact (state accumulates across shots).
@@ -5210,6 +5634,10 @@ spend a compute budget to beat a task's deployable check. You compose exactly tw
5210
5634
  (multi-agent strategies: a researcher shot then an engineer shot, a panel of k
5211
5635
  personas over one budget). On a fresh shot the systemPrompt replaces the task's; on
5212
5636
  a carried conversation it arrives as a hand-off message. Same conserved budget.
5637
+ - tools => string[] \u2014 restrict THIS shot to a subset of the task's tools by
5638
+ name (focus an explore shot on read-only tools, an execute shot on write tools).
5639
+ Restriction-only; unknown names make the shot fail. ALWAYS select from
5640
+ await listTools(handle) \u2014 never hardcode. Omitted => the shot sees every tool.
5213
5641
  ShotResult = { messages, score (0..1 on the task's check), passes, total, completions, toolErrors }
5214
5642
  Returns null if the attempt failed infra-wise.
5215
5643
 
@@ -5217,10 +5645,23 @@ spend a compute budget to beat a task's deployable check. You compose exactly tw
5217
5645
  A firewalled trace-analyst reads the attempt's trajectory and returns ONE corrective
5218
5646
  instruction (or null when it judges the work complete). Costs ~1 completion.
5219
5647
 
5648
+ consult(messages, instruction): Promise<string | null>
5649
+ The RAW analyst channel: the same firewalled critic answers YOUR instruction over the
5650
+ trajectory verbatim (no reformatting) \u2014 use it when you need a specific reply format
5651
+ (a decision, a prediction). Costs ~1 completion.
5652
+
5220
5653
  surface.open(task) / surface.close(handle)
5221
5654
  Open a persistent artifact you manage yourself (remember to close in a finally).
5655
+ close is idempotent \u2014 closing an already-closed handle is a safe no-op.
5656
+
5657
+ listTools(handle): Promise<Array<{ name, description? }>>
5658
+ The tools THIS task actually offers. TOOL SETS VARY PER TASK \u2014 if you restrict a
5659
+ shot with \`tools\`, you MUST pick names from await listTools(handle); hardcoding
5660
+ names from an example kills your shots on every task whose tools differ.
5222
5661
 
5223
5662
  Rules:
5663
+ - ALWAYS await every shot/critique/surface call \u2014 a floating promise that rejects
5664
+ crashes the whole benchmark run.
5224
5665
  - Stay within ~budget total shots; every shot/critique spends from a conserved pool.
5225
5666
  - For a FRESH attempt OMIT \`messages\` entirely (never pass \`[]\` \u2014 an empty array is a
5226
5667
  fresh conversation too, but be explicit). To CONTINUE, pass the previous
@@ -5230,8 +5671,8 @@ Rules:
5230
5671
  - The module must be EXACTLY this shape (no other imports, no commentary outside code):
5231
5672
 
5232
5673
  import { defineStrategy } from '@tangle-network/agent-runtime/loops'
5233
- export default defineStrategy('your-strategy-name', async ({ surface, task, budget, shot, critique }) => {
5234
- // your composition
5674
+ export default defineStrategy('your-strategy-name', async ({ surface, task, budget, shot, critique, listTools }) => {
5675
+ // your composition (listTools comes from the destructured context \u2014 it is NOT a global)
5235
5676
  })
5236
5677
  `;
5237
5678
  function assertStrategyContract(code) {
@@ -5307,34 +5748,89 @@ async function authorStrategy(opts) {
5307
5748
  }
5308
5749
 
5309
5750
  // src/runtime/strategy-evolution.ts
5751
+ import { existsSync, readFileSync, writeFileSync as writeFileSync2 } from "fs";
5310
5752
  import { gzipSync } from "zlib";
5311
- function selectChampion(report, fieldOrder, policy, epsilon) {
5312
- const entries = fieldOrder.map((name) => ({ name, summary: report.perStrategy[name] })).filter((e) => !!e.summary);
5753
/**
 * Per-strategy mean score and mean usd, computed only over tasks that tell
 * the field apart.
 *
 * A task row counts when it has cells, every strategy named in `fieldOrder`
 * has a defined score on it, and the highest and lowest of those scores
 * differ. Rows where every strategy scored the same are left out.
 *
 * @param report     benchmark report; only `report.perTask` is read.
 * @param fieldOrder strategy names to average over.
 * @returns `{ [name]: { score, usd } }`, or `null` when no row qualifies.
 */
function discriminatingMeans(report, fieldOrder) {
  const discriminating = [];
  for (const row of report.perTask) {
    if (!row.cells) continue;
    const scores = [];
    for (const name of fieldOrder) {
      const score = row.cells?.[name]?.score;
      if (score !== void 0) scores.push(score);
    }
    // A strategy without a score on this task makes the row unusable.
    if (scores.length < fieldOrder.length) continue;
    const spread = Math.max(...scores) - Math.min(...scores);
    if (spread > 0) discriminating.push(row);
  }
  if (discriminating.length === 0) return null;

  const means = {};
  for (const name of fieldOrder) {
    let scoreSum = 0;
    let usdSum = 0;
    let count = 0;
    for (const row of discriminating) {
      const cell = row.cells?.[name];
      if (!cell) continue;
      scoreSum += cell.score;
      usdSum += cell.usd;
      count += 1;
    }
    means[name] = { score: scoreSum / count, usd: usdSum / count };
  }
  return means;
}
5771
/**
 * Chooses a champion from a `{ [name]: { score, usd } }` table.
 *
 * Names in `fieldOrder` with no entry in `means` are ignored.
 * - policy `"score"`: the first strategy (in `fieldOrder` order) whose score
 *   equals the maximum.
 * - any other policy: among strategies scoring at least `best - epsilon`,
 *   the cheapest by usd; equal cost is broken by the higher score.
 *
 * @returns `{ name, score, usd }` of the chosen strategy.
 * @throws Error when none of the `fieldOrder` names is present in `means`,
 *         or when the selection comes back empty.
 */
function pickChampion(means, fieldOrder, policy, epsilon) {
  const entries = [];
  for (const name of fieldOrder) {
    const summary = means[name];
    if (summary) entries.push({ name, summary });
  }
  if (entries.length === 0) {
    throw new Error("pickChampion: the means table carries none of the field strategies");
  }

  const best = Math.max(...entries.map((e) => e.summary.score));
  let pick;
  if (policy === "score") {
    pick = entries.find((e) => e.summary.score === best);
  } else {
    const cheapestThenBest = (a, b) =>
      a.summary.usd - b.summary.usd || b.summary.score - a.summary.score;
    const contenders = entries.filter((e) => e.summary.score >= best - epsilon);
    contenders.sort(cheapestThenBest);
    pick = contenders[0];
  }
  if (!pick) throw new Error("pickChampion: empty pick (unreachable)");

  const { name, summary } = pick;
  return { name, score: summary.score, usd: summary.usd };
}
5780
/**
 * Picks the champion straight from a benchmark report.
 * Thin wrapper: hands `report.perStrategy` to pickChampion with the same
 * field order, policy and epsilon.
 */
function selectChampion(report, fieldOrder, policy, epsilon) {
  const { perStrategy } = report;
  return pickChampion(perStrategy, fieldOrder, policy, epsilon);
}
5320
5783
  var fieldSummary = (archive) => archive.map(
5321
5784
  (n) => `- ${n.name} (${n.source}, gen ${n.generation}, last score ${(n.score * 100).toFixed(0)}%)`
5322
5785
  ).join("\n");
5323
// Serializes a benchmark report's per-task rows into a compact JSON string,
// cut off at 12,000 characters.
//
// Rows with cells become { task, errors?, cells }; rows without cells become
// { task, error } with the error text cut to 80 characters. Per-strategy
// error messages are cut to 100 characters.
//
// `detail` selects how much of each cell is exposed:
//  - "binary": only `resolved` and `usd` (the leakage-bounded channel: no
//    score and no progression are emitted for the cell).
//  - anything else: `score`, `resolved`, `usd` and `progression`, with score
//    and progression rounded to 2 decimals.
// usd is rounded to 4 decimals in both modes.
var compactLosses = (report, detail) => {
  const round2 = (x) => Math.round(x * 100) / 100;
  const round4 = (x) => Math.round(x * 1e4) / 1e4;

  const compactCell = (c) => {
    if (detail === "binary") {
      return { resolved: c.resolved, usd: round4(c.usd) };
    }
    return {
      score: round2(c.score),
      resolved: c.resolved,
      usd: round4(c.usd),
      // A cell may carry no progression; treat that as an empty one.
      progression: (c.progression ?? []).map(round2)
    };
  };

  const compactRow = (row) => {
    if (!row.cells) {
      return { task: row.taskId, error: row.error?.slice(0, 80) };
    }
    const compact = { task: row.taskId };
    if (row.errors) {
      const trimmed = Object.entries(row.errors).map(([n, msg]) => [n, msg.slice(0, 100)]);
      compact.errors = Object.fromEntries(trimmed);
    }
    const cellPairs = Object.entries(row.cells).map(([name, c]) => [name, compactCell(c)]);
    compact.cells = Object.fromEntries(cellPairs);
    return compact;
  };

  const rows = report.perTask.map((row) => compactRow(row));
  return JSON.stringify(rows).slice(0, 12e3);
};
5815
/**
 * Returns `orig` under a different name.
 *
 * When `orig.name` already equals `unique` the very same object is returned.
 * Otherwise the result is a new strategy whose driver builds the original
 * agent and re-labels it: the agent's `name` becomes `unique`, and any
 * `"done"` outcome of `act` has its deliverable's `mode` set to `unique`.
 * Outcomes of any other kind pass through untouched.
 *
 * @param orig   strategy with `name` and `driver(s, t, o, b)`.
 * @param unique the name the returned strategy reports.
 */
function renameStrategy(orig, unique) {
  if (orig.name === unique) return orig;

  const driver = (s, t, o, b) => {
    const agent = orig.driver(s, t, o, b);
    const act = async (task, scope) => {
      const out = await agent.act(task, scope);
      return out.kind === "done"
        ? { ...out, deliverable: { ...out.deliverable, mode: unique } }
        : out;
    };
    return { ...agent, name: unique, act };
  };

  return { name: unique, driver };
}
5338
5834
  async function runStrategyEvolution(cfg) {
5339
5835
  const budget = cfg.budget ?? 3;
5340
5836
  const concurrency = cfg.concurrency ?? 3;
@@ -5342,37 +5838,72 @@ async function runStrategyEvolution(cfg) {
5342
5838
  const populationSize = cfg.populationSize ?? 2;
5343
5839
  const baselines = cfg.baselines ?? [sample, refine, sampleThenRefine];
5344
5840
  const policy = cfg.champion ?? "costAware";
5345
- const epsilon = cfg.championEpsilon ?? 0.01;
5841
+ const epsilon = cfg.championEpsilon ?? (cfg.objective === "cost" ? cfg.scoreTolerance ?? 0.05 : 0.01);
5346
5842
  const byName = new Map(baselines.map((s) => [s.name, s]));
5347
- const bench = (phase, tasks, strategies) => runBenchmark({
5348
- environment: cfg.environment,
5349
- tasks,
5350
- worker: cfg.worker,
5351
- strategies,
5843
+ const codeByName = /* @__PURE__ */ new Map();
5844
+ const fingerprint = {
5845
+ trainN: cfg.trainN,
5846
+ holdoutN: cfg.holdoutN,
5352
5847
  budget,
5353
- concurrency,
5354
- ...cfg.onTask ? { onTask: (row, done, total) => cfg.onTask?.(phase, row, done, total) } : {},
5355
- ...cfg.hooks ? { hooks: cfg.hooks } : {}
5356
- });
5848
+ generations,
5849
+ populationSize
5850
+ };
5851
+ let ckpt;
5852
+ if (cfg.checkpoint?.resume && existsSync(cfg.checkpoint.path)) {
5853
+ const raw = JSON.parse(readFileSync(cfg.checkpoint.path, "utf8"));
5854
+ if (JSON.stringify(raw.fingerprint) !== JSON.stringify(fingerprint)) {
5855
+ throw new Error(
5856
+ `evolution resume: checkpoint design mismatch \u2014 checkpoint ${JSON.stringify(raw.fingerprint)} vs config ${JSON.stringify(fingerprint)}; delete ${cfg.checkpoint.path} or match the config`
5857
+ );
5858
+ }
5859
+ ckpt = raw;
5860
+ }
5861
+ const save = (state) => {
5862
+ if (cfg.checkpoint)
5863
+ writeFileSync2(cfg.checkpoint.path, JSON.stringify({ ...state, fingerprint }, null, 1));
5864
+ };
5865
+ const bench = async (phase, tasks, strategies) => {
5866
+ await cfg.onPhase?.(phase);
5867
+ return runBenchmark({
5868
+ environment: cfg.environment,
5869
+ tasks,
5870
+ worker: cfg.worker,
5871
+ strategies,
5872
+ budget,
5873
+ concurrency,
5874
+ ...cfg.onTask ? { onTask: (row, done, total) => cfg.onTask?.(phase, row, done, total) } : {},
5875
+ ...cfg.hooks ? { hooks: cfg.hooks } : {}
5876
+ });
5877
+ };
5357
5878
  const train = await cfg.tasks(0, cfg.trainN);
5358
- const gen0 = await bench("gen0", train, baselines);
5359
- const archive = baselines.map((s) => ({
5879
+ const probeTask = train[0];
5880
+ if (!probeTask) throw new Error("runStrategyEvolution: empty train slice");
5881
+ const probe = await cfg.environment.open(probeTask);
5882
+ let toolCatalog;
5883
+ try {
5884
+ const tools = await cfg.environment.tools(probeTask, probe);
5885
+ toolCatalog = tools.map(
5886
+ (t) => `- ${t.function.name}${t.function.description ? ` \u2014 ${t.function.description.slice(0, 120)}` : ""}`
5887
+ ).join("\n");
5888
+ } finally {
5889
+ await cfg.environment.close(probe);
5890
+ }
5891
+ const gen0 = ckpt?.gen0 ?? await bench("gen0", train, baselines);
5892
+ const archive = ckpt?.archive ? [...ckpt.archive] : baselines.map((s) => ({
5360
5893
  name: s.name,
5361
5894
  source: "baseline",
5362
5895
  generation: 0,
5363
5896
  score: gen0.perStrategy[s.name]?.score ?? 0,
5364
5897
  usd: gen0.perStrategy[s.name]?.usd ?? 0
5365
5898
  }));
5366
- const gen0Champion = selectChampion(
5899
+ const gen0Champion = ckpt?.gen0Champion ?? selectChampion(
5367
5900
  gen0,
5368
5901
  baselines.map((s) => s.name),
5369
5902
  policy,
5370
5903
  epsilon
5371
5904
  );
5372
- let incumbent = gen0Champion;
5373
- let latestReport = gen0;
5374
- const generationRows = [];
5375
- const trajectory = [
5905
+ const generationRows = ckpt?.generations ? [...ckpt.generations] : [];
5906
+ const trajectory = ckpt?.trajectory ? [...ckpt.trajectory] : [
5376
5907
  {
5377
5908
  generation: 0,
5378
5909
  champion: gen0Champion.name,
@@ -5380,13 +5911,39 @@ async function runStrategyEvolution(cfg) {
5380
5911
  usd: gen0Champion.usd
5381
5912
  }
5382
5913
  ];
5383
- let authoredOk = 0;
5384
- for (let g = 1; g <= generations; g += 1) {
5385
- const lossesJson = compactLosses(latestReport);
5914
+ for (const row of generationRows) {
5915
+ for (const c of row.candidates) {
5916
+ if (!c.file || c.error) continue;
5917
+ const mod = await import(`file://${c.file}`);
5918
+ if (!mod.default || typeof mod.default.driver !== "function") {
5919
+ throw new Error(
5920
+ `evolution resume: ${c.file} no longer exports a Strategy \u2014 cannot restore "${c.name}"`
5921
+ );
5922
+ }
5923
+ byName.set(c.name, renameStrategy(mod.default, c.name));
5924
+ codeByName.set(c.name, readFileSync(c.file, "utf8"));
5925
+ }
5926
+ }
5927
+ let authoredOk = generationRows.reduce(
5928
+ (n, row) => n + row.candidates.filter((c) => !c.error).length,
5929
+ 0
5930
+ );
5931
+ const lastRow = generationRows[generationRows.length - 1];
5932
+ let incumbent = lastRow ? lastRow.champion : gen0Champion;
5933
+ let latestReport = lastRow ? lastRow.report : gen0;
5934
+ if (!ckpt) save({ gen0, gen0Champion, generations: generationRows, archive, trajectory });
5935
+ for (let g = generationRows.length + 1; g <= generations; g += 1) {
5936
+ const lossesJson = compactLosses(latestReport, cfg.lossesDetail ?? "exact");
5386
5937
  const candidates = [];
5387
5938
  const newStrategies = [];
5388
5939
  for (let i = 0; i < populationSize; i += 1) {
5389
- const contract = `${strategyAuthorContract}
5940
+ const objectiveNote = cfg.objective === "cost" ? `
5941
+
5942
+ YOUR OBJECTIVE: match or exceed the incumbent's SCORE while spending LESS (the losses include usd per task). Promotion requires proven score non-inferiority PLUS significant cost savings \u2014 a strategy that ties the score at half the cost WINS; a cheaper strategy that loses score by more than ${((cfg.scoreTolerance ?? 0.05) * 100).toFixed(0)}pp LOSES.` : "";
5943
+ const contract = `${strategyAuthorContract}${objectiveNote}
5944
+
5945
+ EXAMPLE TOOLS FROM ONE TASK (tool sets VARY per task on this domain \u2014 a strategy MUST select tool names from await listTools(handle) at runtime; hardcoding these example names will zero your score on most tasks):
5946
+ ${toolCatalog}
5390
5947
 
5391
5948
  STRATEGIES ALREADY IN THE TOURNAMENT (author something MEANINGFULLY different \u2014 a new composition, not a rename):
5392
5949
  ${fieldSummary(archive)}
@@ -5406,26 +5963,9 @@ You are authoring candidate ${i + 1} of ${populationSize} this generation; explo
5406
5963
  outDir: cfg.outDir
5407
5964
  });
5408
5965
  const unique = byName.has(authored.strategy.name) ? `${authored.strategy.name}-g${g}c${i + 1}` : authored.strategy.name;
5409
- const strategy = unique === authored.strategy.name ? authored.strategy : {
5410
- name: unique,
5411
- driver: (s, t, o, b) => {
5412
- const agent = authored.strategy.driver(s, t, o, b);
5413
- return {
5414
- ...agent,
5415
- name: unique,
5416
- act: async (task, scope) => {
5417
- const out = await agent.act(task, scope);
5418
- if (out.kind !== "done") return out;
5419
- const deliverable = {
5420
- ...out.deliverable,
5421
- mode: unique
5422
- };
5423
- return { ...out, deliverable };
5424
- }
5425
- };
5426
- }
5427
- };
5966
+ const strategy = renameStrategy(authored.strategy, unique);
5428
5967
  byName.set(unique, strategy);
5968
+ codeByName.set(unique, authored.code);
5429
5969
  newStrategies.push(strategy);
5430
5970
  archive.push({
5431
5971
  name: unique,
@@ -5463,12 +6003,9 @@ You are authoring candidate ${i + 1} of ${populationSize} this generation; explo
5463
6003
  node.usd = cell.usd;
5464
6004
  }
5465
6005
  }
5466
- const champion = selectChampion(
5467
- report,
5468
- field.map((s) => s.name),
5469
- policy,
5470
- epsilon
5471
- );
6006
+ const fieldNames = field.map((s) => s.name);
6007
+ const means = cfg.band ? discriminatingMeans(report, fieldNames) ?? report.perStrategy : report.perStrategy;
6008
+ const champion = pickChampion(means, fieldNames, policy, epsilon);
5472
6009
  generationRows.push({ generation: g, candidates, report, champion });
5473
6010
  trajectory.push({
5474
6011
  generation: g,
@@ -5478,21 +6015,134 @@ You are authoring candidate ${i + 1} of ${populationSize} this generation; explo
5478
6015
  });
5479
6016
  incumbent = champion;
5480
6017
  latestReport = report;
6018
+ save({ gen0, gen0Champion, generations: generationRows, archive, trajectory });
5481
6019
  }
5482
6020
  if (authoredOk === 0) {
5483
6021
  throw new Error(
5484
6022
  "runStrategyEvolution: every author attempt failed across all generations \u2014 no search happened; see the candidates[].error entries"
5485
6023
  );
5486
6024
  }
5487
- const holdoutTasks = await cfg.tasks(cfg.trainN + (cfg.holdoutOffset ?? 0), cfg.holdoutN);
5488
- const finalists = [.../* @__PURE__ */ new Set([gen0Champion.name, incumbent.name])].map((n) => byName.get(n)).filter((s) => !!s);
5489
- const holdout = await bench("holdout", holdoutTasks, finalists);
5490
- const verdict = promotionGate({
5491
- report: holdout,
5492
- incumbent: gen0Champion.name,
5493
- candidate: incumbent.name,
5494
- ...cfg.minPairedTasks !== void 0 ? { minPairedTasks: cfg.minPairedTasks } : {}
5495
- });
6025
+ const holdoutOffset = cfg.trainN + (cfg.holdoutOffset ?? 0);
6026
+ let holdoutTasks = [];
6027
+ let bandInfo;
6028
+ if (ckpt?.holdout && ckpt.verdict) {
6029
+ bandInfo = ckpt.band;
6030
+ if (cfg.reproducerCheck && codeByName.has(incumbent.name)) {
6031
+ const pool2 = await cfg.tasks(holdoutOffset, cfg.band?.holdoutPoolN ?? cfg.holdoutN);
6032
+ const gateIds = new Set(ckpt.holdout.perTask.map((r) => r.taskId));
6033
+ holdoutTasks = pool2.filter((t) => gateIds.has(t.id));
6034
+ }
6035
+ } else if (cfg.band) {
6036
+ const maxRef = cfg.band.maxRefScore ?? 0.99;
6037
+ const reference = baselines[0];
6038
+ if (!reference)
6039
+ throw new Error("evolution band: baselines[0] required as the screening reference");
6040
+ const pool2 = await cfg.tasks(holdoutOffset, cfg.band.holdoutPoolN);
6041
+ const screen = await bench("band-screen", pool2, [reference]);
6042
+ const refScores = screen.perTask.filter((r) => r.cells?.[reference.name]).map((r) => ({ taskId: r.taskId, score: r.cells?.[reference.name]?.score ?? 0 }));
6043
+ const inBandIds = new Set(refScores.filter((r) => r.score <= maxRef).map((r) => r.taskId));
6044
+ const kept = pool2.filter((t) => inBandIds.has(t.id));
6045
+ if (kept.length < cfg.holdoutN) {
6046
+ throw new Error(
6047
+ `evolution band: only ${kept.length}/${cfg.holdoutN} holdout tasks have headroom (pool ${cfg.band.holdoutPoolN}, reference "${reference.name}" \u2264 ${maxRef}) \u2014 widen holdoutPoolN or raise maxRefScore`
6048
+ );
6049
+ }
6050
+ holdoutTasks = kept.slice(0, cfg.holdoutN);
6051
+ bandInfo = { screened: refScores.length, inBand: kept.length, refScores };
6052
+ } else {
6053
+ holdoutTasks = await cfg.tasks(holdoutOffset, cfg.holdoutN);
6054
+ }
6055
+ let holdout;
6056
+ let verdict;
6057
+ if (ckpt?.holdout && ckpt.verdict) {
6058
+ holdout = ckpt.holdout;
6059
+ verdict = ckpt.verdict;
6060
+ } else {
6061
+ const finalists = [.../* @__PURE__ */ new Set([gen0Champion.name, incumbent.name])].map((n) => byName.get(n)).filter((s) => !!s);
6062
+ holdout = await bench("holdout", holdoutTasks, finalists);
6063
+ verdict = promotionGate({
6064
+ report: holdout,
6065
+ incumbent: gen0Champion.name,
6066
+ candidate: incumbent.name,
6067
+ ...cfg.objective === "cost" ? {
6068
+ mode: "non-inferiority",
6069
+ ...cfg.scoreTolerance !== void 0 ? { scoreTolerance: cfg.scoreTolerance } : {}
6070
+ } : {},
6071
+ ...cfg.minPairedTasks !== void 0 ? { minPairedTasks: cfg.minPairedTasks } : {}
6072
+ });
6073
+ save({
6074
+ gen0,
6075
+ gen0Champion,
6076
+ generations: generationRows,
6077
+ archive,
6078
+ trajectory,
6079
+ holdout,
6080
+ verdict,
6081
+ ...bandInfo ? { band: bandInfo } : {}
6082
+ });
6083
+ }
6084
+ let reproduction;
6085
+ const championCode = codeByName.get(incumbent.name);
6086
+ if (cfg.reproducerCheck && championCode) {
6087
+ const words = cfg.reproducerCheck.summaryMaxWords ?? 64;
6088
+ const tolerance = cfg.reproducerCheck.tolerance ?? 0.05;
6089
+ const championHoldoutScore = holdout.perStrategy[incumbent.name]?.score ?? 0;
6090
+ try {
6091
+ const summaryRes = await cfg.author.chat.chat({
6092
+ ...cfg.author.model ? { model: cfg.author.model } : {},
6093
+ temperature: 0.2,
6094
+ maxTokens: 512,
6095
+ messages: [
6096
+ {
6097
+ role: "system",
6098
+ content: `Summarize the optimization strategy implemented by this code in at most ${words} words. Describe the COMPOSITION (shots, critique, artifact handling, restarts, stopping) \u2014 not the code. Output only the summary.`
6099
+ },
6100
+ { role: "user", content: championCode }
6101
+ ]
6102
+ });
6103
+ const summary = summaryRes.content.trim();
6104
+ const reproduced = await authorStrategy({
6105
+ chat: cfg.author.chat,
6106
+ ...cfg.author.model ? { model: cfg.author.model } : {},
6107
+ ...cfg.author.fallbackModel ? { fallbackModel: cfg.author.fallbackModel } : {},
6108
+ ...cfg.author.maxTokens !== void 0 ? { maxTokens: cfg.author.maxTokens } : {},
6109
+ temperature: 0.2,
6110
+ contract: `${strategyAuthorContract}
6111
+
6112
+ IMPLEMENT EXACTLY THIS STRATEGY (a colleague's description \u2014 do not invent a different approach):
6113
+ ${summary}`,
6114
+ environmentName: cfg.environment.name,
6115
+ lossesJson: "[]",
6116
+ budget,
6117
+ outDir: cfg.outDir
6118
+ });
6119
+ const reproStrategy = {
6120
+ name: `${incumbent.name}-reproduced`,
6121
+ driver: reproduced.strategy.driver
6122
+ };
6123
+ const reproReport = await bench("reproduce", holdoutTasks, [reproStrategy]);
6124
+ const reproducedHoldoutScore = reproReport.perStrategy[reproStrategy.name]?.score ?? 0;
6125
+ reproduction = {
6126
+ summary,
6127
+ reproducedName: reproStrategy.name,
6128
+ file: reproduced.file,
6129
+ championHoldoutScore,
6130
+ reproducedHoldoutScore,
6131
+ gap: championHoldoutScore - reproducedHoldoutScore,
6132
+ reproducible: reproducedHoldoutScore >= championHoldoutScore - tolerance
6133
+ };
6134
+ } catch (e) {
6135
+ reproduction = {
6136
+ summary: "",
6137
+ reproducedName: "",
6138
+ championHoldoutScore,
6139
+ reproducedHoldoutScore: 0,
6140
+ gap: championHoldoutScore,
6141
+ reproducible: false,
6142
+ error: e instanceof Error ? e.message.slice(0, 300) : String(e)
6143
+ };
6144
+ }
6145
+ }
5496
6146
  return {
5497
6147
  gen0,
5498
6148
  gen0Champion,
@@ -5501,6 +6151,8 @@ You are authoring candidate ${i + 1} of ${populationSize} this generation; explo
5501
6151
  finalChampion: incumbent,
5502
6152
  holdout,
5503
6153
  verdict,
6154
+ ...bandInfo ? { band: bandInfo } : {},
6155
+ ...reproduction ? { reproduction } : {},
5504
6156
  trajectory
5505
6157
  };
5506
6158
  }
@@ -5572,6 +6224,103 @@ function createVerifierEnvironment(opts) {
5572
6224
  };
5573
6225
  }
5574
6226
 
6227
+ // src/runtime/waterfall.ts
6228
/**
 * Creates a collector that folds runtime events into per-child timing spans
 * (a "waterfall") and can summarize or draw them.
 *
 * Feed events through `hooks.onEvent`:
 *  - `agent.spawn` opens a span keyed by `payload.childId` (falling back to
 *    the event id), labelled `payload.label` (falling back to that key), and
 *    starting at `event.timestamp`. A repeated key replaces the earlier span.
 *  - `agent.child` settles the span with the same `payload.childId`: end
 *    time, status (`"down"` or `"done"`), spend (missing usd/token values
 *    count as 0) and, when numeric, the score. Events for unknown children
 *    are ignored.
 *
 * @returns {{
 *   hooks: { onEvent: (event: object) => void },
 *   report: () => { spans: object[], totalMs: number, totalUsd: number,
 *                   totalTokens: { input: number, output: number }, byKind: object },
 *   render: (opts?: { width?: number, maxRows?: number }) => string,
 *   reset: () => void
 * }}
 */
function createWaterfallCollector() {
  let spans = /* @__PURE__ */ new Map();

  const onEvent = (event) => {
    if (event.target === "agent.spawn") {
      const p = event.payload ?? {};
      const id = p.childId ?? event.id;
      spans.set(id, {
        id,
        label: p.label ?? id,
        runId: event.runId,
        ...event.parentId !== void 0 ? { parentId: event.parentId } : {},
        startMs: event.timestamp,
        status: "running",
        usd: 0,
        tokens: { input: 0, output: 0 }
      });
      return;
    }
    if (event.target === "agent.child") {
      const p = event.payload ?? {};
      const id = p.childId;
      if (!id) return;
      const span = spans.get(id);
      if (!span) return;
      span.endMs = event.timestamp;
      span.status = p.status === "down" ? "down" : "done";
      span.usd = p.spent?.usd ?? 0;
      span.tokens = {
        input: p.spent?.tokens?.input ?? 0,
        output: p.spent?.tokens?.output ?? 0
      };
      if (typeof p.score === "number") span.score = p.score;
    }
  };

  /**
   * Aggregates the spans seen so far, sorted by start time. A span that has
   * not settled contributes zero duration. `byKind` groups spans by the part
   * of the label before the first ":" (or the whole label when there is none).
   */
  const report = () => {
    const all = [...spans.values()].sort((a, b) => a.startMs - b.startMs);
    const start = all[0]?.startMs ?? 0;
    // Fold instead of spreading into Math.max so a very large span count
    // cannot exceed the engine's argument limit.
    const end = all.reduce((latest, s) => Math.max(latest, s.endMs ?? s.startMs), start);
    // Kinds come from labels, so accumulate in a Map: a kind named "__proto__"
    // or "constructor" must not resolve to an inherited member of a plain object.
    const kinds = /* @__PURE__ */ new Map();
    let totalUsd = 0;
    const tokenTotals = { input: 0, output: 0 };
    for (const s of all) {
      totalUsd += s.usd;
      tokenTotals.input += s.tokens.input;
      tokenTotals.output += s.tokens.output;
      const kind = s.label.includes(":") ? s.label.split(":")[0] : s.label;
      let k = kinds.get(kind);
      if (!k) {
        k = { count: 0, ms: 0, usd: 0, tokens: { input: 0, output: 0 } };
        kinds.set(kind, k);
      }
      k.count += 1;
      k.ms += (s.endMs ?? s.startMs) - s.startMs;
      k.usd += s.usd;
      k.tokens.input += s.tokens.input;
      k.tokens.output += s.tokens.output;
    }
    return {
      spans: all,
      totalMs: end - start,
      totalUsd,
      totalTokens: tokenTotals,
      // Object.fromEntries defines own properties, so every kind is stored as data.
      byKind: Object.fromEntries(kinds)
    };
  };

  /**
   * Draws the spans as text: one bar per span (at most `maxRows`, default 60)
   * on a time axis `width` columns wide (default 48), then one line per kind
   * and a TOTAL line. Returns "(no spans observed)" when nothing was recorded.
   */
  const render = (opts) => {
    const { spans: all, totalMs, totalUsd, byKind } = report();
    if (all.length === 0) return "(no spans observed)";
    const width = opts?.width ?? 48;
    const maxRows = opts?.maxRows ?? 60;
    const start = all[0]?.startMs ?? 0;
    const scale = totalMs > 0 ? width / totalMs : 0;
    const lines = [];
    const longestLabel = all.reduce((longest, s) => Math.max(longest, s.label.length), 0);
    const labelWidth = Math.min(24, longestLabel + 1);
    for (const s of all.slice(0, maxRows)) {
      const offset = Math.round((s.startMs - start) * scale);
      const dur = (s.endMs ?? s.startMs) - s.startMs;
      const len = Math.max(1, Math.round(dur * scale));
      const bar = `${" ".repeat(Math.min(offset, width))}${(s.status === "down" ? "\u2591" : "\u2588").repeat(Math.max(1, Math.min(len, width - Math.min(offset, width) + 1)))}`;
      const mark = s.status === "down" ? " DOWN" : s.score !== void 0 ? ` ${(s.score * 100).toFixed(0)}%` : "";
      lines.push(
        `${s.label.padEnd(labelWidth)}|${bar.padEnd(width + 1)}| ${(dur / 1e3).toFixed(1)}s $${s.usd.toFixed(4)} ${s.tokens.input}/${s.tokens.output}tok${mark}`
      );
    }
    if (all.length > maxRows) lines.push(`\u2026 ${all.length - maxRows} more spans`);
    lines.push("\u2014".repeat(labelWidth + width + 2));
    for (const [kind, k] of Object.entries(byKind)) {
      lines.push(
        `${kind.padEnd(labelWidth)} \xD7${k.count} ${(k.ms / 1e3).toFixed(1)}s busy $${k.usd.toFixed(4)} ${k.tokens.input}/${k.tokens.output}tok`
      );
    }
    // padEnd never shrinks and never throws; a hand-computed repeat count goes
    // negative (RangeError) once every label is shorter than four characters.
    lines.push(
      `${"TOTAL".padEnd(labelWidth)} ${(totalMs / 1e3).toFixed(1)}s wall $${totalUsd.toFixed(4)}`
    );
    return lines.join("\n");
  };

  return {
    hooks: { onEvent },
    report,
    render,
    reset: () => {
      spans = /* @__PURE__ */ new Map();
    }
  };
}
6323
+
5575
6324
  // src/runtime/workspace.ts
5576
6325
  function localShell() {
5577
6326
  return async (args, cwd) => {
@@ -5674,6 +6423,10 @@ function tail(s) {
5674
6423
  }
5675
6424
 
5676
6425
  export {
6426
+ deleteBoxSafe,
6427
+ throwAbort,
6428
+ throwIfAborted,
6429
+ sleep,
5677
6430
  contentAddress,
5678
6431
  InMemoryResultBlobStore,
5679
6432
  FileResultBlobStore,
@@ -5681,6 +6434,8 @@ export {
5681
6434
  FileSpawnJournal,
5682
6435
  replaySpawnTree,
5683
6436
  materializeTreeView,
6437
+ anytimeReport,
6438
+ renderAnytimeTable,
5684
6439
  defaultAuditorInstruction,
5685
6440
  auditIntent,
5686
6441
  completionAuthorizes,
@@ -5723,6 +6478,9 @@ export {
5723
6478
  InMemoryCorpus,
5724
6479
  FileCorpus,
5725
6480
  renderCorpusToInstructions,
6481
+ routerChatWithUsage,
6482
+ routerChatWithTools,
6483
+ routerToolLoop,
5726
6484
  createExecutor,
5727
6485
  createExecutorRegistry,
5728
6486
  spendFromUsageEvents,
@@ -5751,11 +6509,14 @@ export {
5751
6509
  strategyAuthorContract,
5752
6510
  assertStrategyContract,
5753
6511
  authorStrategy,
6512
+ discriminatingMeans,
6513
+ pickChampion,
5754
6514
  selectChampion,
5755
6515
  runStrategyEvolution,
5756
6516
  createVerifierEnvironment,
6517
+ createWaterfallCollector,
5757
6518
  localShell,
5758
6519
  gitWorkspace,
5759
6520
  jjWorkspace
5760
6521
  };
5761
- //# sourceMappingURL=chunk-IW2LMLK6.js.map
6522
+ //# sourceMappingURL=chunk-CM2IK7VS.js.map