@tangle-network/agent-runtime 0.48.0 → 0.49.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/README.md +79 -15
  2. package/dist/agent.js +1 -1
  3. package/dist/chunk-GHX7XOJ2.js +433 -0
  4. package/dist/chunk-GHX7XOJ2.js.map +1 -0
  5. package/dist/{chunk-TJS7S3HJ.js → chunk-IQS4HI3F.js} +14 -5
  6. package/dist/chunk-IQS4HI3F.js.map +1 -0
  7. package/dist/{chunk-IW2LMLK6.js → chunk-PXUTIMGJ.js} +767 -129
  8. package/dist/chunk-PXUTIMGJ.js.map +1 -0
  9. package/dist/{chunk-656G2XCL.js → chunk-U2VEWKKK.js} +3 -3
  10. package/dist/{chunk-JNPK46YH.js → chunk-VIEDXELL.js} +408 -6
  11. package/dist/chunk-VIEDXELL.js.map +1 -0
  12. package/dist/{chunk-VR4JIC5H.js → chunk-XTEZ3YJ4.js} +2 -2
  13. package/dist/index.d.ts +29 -4
  14. package/dist/index.js +109 -21
  15. package/dist/index.js.map +1 -1
  16. package/dist/kb-gate-CsXpNRk7.d.ts +1145 -0
  17. package/dist/{loop-runner-bin-DEm4roYF.d.ts → loop-runner-bin-Cgn0A-NW.d.ts} +1 -1
  18. package/dist/loop-runner-bin.d.ts +2 -2
  19. package/dist/loop-runner-bin.js +3 -3
  20. package/dist/loops.d.ts +2 -2
  21. package/dist/loops.js +11 -1
  22. package/dist/mcp/bin.js +187 -24
  23. package/dist/mcp/bin.js.map +1 -1
  24. package/dist/mcp/index.d.ts +27 -124
  25. package/dist/mcp/index.js +28 -6
  26. package/dist/mcp/index.js.map +1 -1
  27. package/dist/platform.js +2 -2
  28. package/dist/platform.js.map +1 -1
  29. package/dist/runtime.d.ts +285 -8
  30. package/dist/runtime.js +11 -1
  31. package/dist/workflow.js +1 -1
  32. package/package.json +6 -5
  33. package/dist/chunk-IW2LMLK6.js.map +0 -1
  34. package/dist/chunk-JNPK46YH.js.map +0 -1
  35. package/dist/chunk-LX66I3SC.js +0 -218
  36. package/dist/chunk-LX66I3SC.js.map +0 -1
  37. package/dist/chunk-TJS7S3HJ.js.map +0 -1
  38. package/dist/kb-gate-51BlLlVM.d.ts +0 -529
  39. /package/dist/{chunk-656G2XCL.js.map → chunk-U2VEWKKK.js.map} +0 -0
  40. /package/dist/{chunk-VR4JIC5H.js.map → chunk-XTEZ3YJ4.js.map} +0 -0
@@ -426,6 +426,103 @@ function isNoEntError(err) {
426
426
  return typeof err === "object" && err !== null && "code" in err && err.code === "ENOENT";
427
427
  }
428
428
 
429
+ // src/runtime/anytime.ts
430
/**
 * Median of a list of numbers.
 *
 * @param {number[]} xs values to summarize; the caller's array is left untouched.
 * @returns {number|null} the middle value (mean of the two middle values for an
 *   even count), or null when the list is empty.
 */
var median = (xs) => {
  if (xs.length === 0) return null;
  // Sort a copy numerically; the default sort would compare as strings.
  const sorted = Array.from(xs).sort((lo, hi) => lo - hi);
  const half = Math.floor(sorted.length / 2);
  if (sorted.length % 2 === 1) {
    return sorted[half];
  }
  return (sorted[half - 1] + sorted[half]) / 2;
};
436
/**
 * Builds anytime metrics (time / shots / cost to reach a score target) from spans.
 *
 * Only spans whose label starts with "shot:" are considered. Spans are grouped by
 * `runId`; a run id of the form `agentic:<strategy>:<taskId>` is split into its
 * strategy and task, any other id is used for both.
 *
 * @param {Array<{runId: string, label: string, startMs: number, endMs?: number, usd: number, score?: number}>} spans
 * @param {{targets?: number[], targetFor?: (taskId: string) => number}} [opts]
 *   `targets` defaults to [1]. When `targetFor` is given, each task is measured
 *   against its own single target and the per-strategy rows carry `target: NaN`.
 * @returns {{targets: number[], perTask: object[], perStrategy: object[]}}
 *   `perTask` holds the best-so-far curve and first hit per target for each run;
 *   `perStrategy` holds one summary row per strategy and target, sorted by
 *   strategy name and then target.
 */
function anytimeReport(spans, opts) {
  const targets = opts?.targets ?? [1];
  const byRun = new Map();
  for (const s of spans) {
    if (!s.label.startsWith("shot:")) continue;
    const list = byRun.get(s.runId) ?? [];
    list.push(s);
    byRun.set(s.runId, list);
  }
  const perTask = [];
  for (const [runId, shots] of byRun) {
    const m = runId.match(/^agentic:(.+):(.+)$/);
    const strategy = m?.[1] ?? runId;
    const taskId = m?.[2] ?? runId;
    // Shots are replayed in completion order (start time when no end was recorded).
    const ordered = [...shots].sort((a, b) => (a.endMs ?? a.startMs) - (b.endMs ?? b.startMs));
    // Fold instead of Math.min(...spread): spreading one argument per shot can
    // exceed the engine's argument limit on very long runs.
    const t0 = ordered.reduce((earliest, s) => Math.min(earliest, s.startMs), Infinity);
    const taskTargets = opts?.targetFor ? [opts.targetFor(taskId)] : targets;
    let best = 0;
    let cumUsd = 0;
    const points = [];
    const hits = {};
    for (const t of taskTargets) hits[String(t)] = null;
    for (const s of ordered) {
      // A span without a numeric cost adds nothing, so a single gap cannot turn
      // every later cumulative cost (and the strategy's erUsd) into NaN.
      cumUsd += typeof s.usd === "number" && Number.isFinite(s.usd) ? s.usd : 0;
      if (typeof s.score === "number" && s.score > best) best = s.score;
      const elapsedMs = (s.endMs ?? s.startMs) - t0;
      points.push({ elapsedMs, cumUsd, best });
      for (const t of taskTargets) {
        // Record only the FIRST shot at which the best-so-far score meets the target.
        if (hits[String(t)] === null && best >= t) {
          hits[String(t)] = { ms: elapsedMs, shots: points.length, usd: cumUsd };
        }
      }
    }
    perTask.push({ taskId, strategy, points, hits });
  }
  const byStrategy = new Map();
  for (const t of perTask) {
    const list = byStrategy.get(t.strategy) ?? [];
    list.push(t);
    byStrategy.set(t.strategy, list);
  }
  const perStrategy = [];
  for (const [strategy, tasks] of byStrategy) {
    // Totals use every task's final point, whether or not it reached the target.
    const totalMs = tasks.reduce((s, t) => s + (t.points[t.points.length - 1]?.elapsedMs ?? 0), 0);
    const totalUsd = tasks.reduce((s, t) => s + (t.points[t.points.length - 1]?.cumUsd ?? 0), 0);
    const maxShots = Math.max(0, ...tasks.map((t) => t.points.length));
    const curveByShot = [];
    for (let i = 0; i < maxShots; i += 1) {
      // Tasks with fewer shots hold their final best-so-far value for later indices.
      const vals = tasks.map(
        (t) => t.points[Math.min(i, t.points.length - 1)].best
      );
      curveByShot.push(vals.reduce((s, v) => s + v, 0) / vals.length);
    }
    const auc = curveByShot.length > 0 ? curveByShot.reduce((s, v) => s + v, 0) / curveByShot.length : 0;
    // Per-task targets cannot be summarized under one number, so the row is tagged NaN.
    const summaryTargets = opts?.targetFor ? [Number.NaN] : targets;
    for (const t of summaryTargets) {
      const key = (taskCurve) => opts?.targetFor ? Object.values(taskCurve.hits)[0] ?? null : taskCurve.hits[String(t)] ?? null;
      const reached = tasks.filter((x) => key(x) !== null);
      perStrategy.push({
        strategy,
        target: t,
        tasks: tasks.length,
        reachedTarget: reached.length,
        medianTttMs: median(reached.map((x) => key(x).ms)),
        medianShotsToTarget: median(reached.map((x) => key(x).shots)),
        // Expected running time / cost: all time (cost) spent divided by the number of successes.
        ertMs: reached.length > 0 ? totalMs / reached.length : null,
        erUsd: reached.length > 0 ? totalUsd / reached.length : null,
        curveByShot,
        auc
      });
    }
  }
  perStrategy.sort((a, b) => a.strategy.localeCompare(b.strategy) || a.target - b.target);
  return { targets, perTask, perStrategy };
}
511
/**
 * Renders an anytime report as a plain-text table, one line per strategy/target row.
 *
 * @param {{targets: number[], perStrategy: object[]}} report a report with the
 *   fields read here: `targets`, and per row `strategy`, `target`, `tasks`,
 *   `reachedTarget`, `medianTttMs`, `medianShotsToTarget`, `ertMs`, `erUsd`,
 *   `auc`, `curveByShot`.
 * @returns {string} a title line, a column-header line, then one line per row,
 *   joined with newlines.
 */
function renderAnytimeTable(report) {
  // Rows with a NaN target were measured against per-task targets; in that case
  // report.targets played no part in the numbers and must not be advertised.
  const usesPerTaskTargets = report.perStrategy.some((s) => Number.isNaN(s.target));
  const targetsLabel = usesPerTaskTargets ? "per-task" : `[${report.targets.join(", ")}]`;
  const lines = [
    `anytime metrics \xB7 satisficing targets ${targetsLabel} \xB7 ERT = \u03A3 all wall-time / #successes (COCO)`,
    "strategy \u2265tgt reach med-TTT med-shots ERT(all-in) $/success AUC curve"
  ];
  for (const s of report.perStrategy) {
    // Sparkline: each mean score is bucketed into one of eight block heights.
    const curve = s.curveByShot.map((v) => "\u2581\u2582\u2583\u2584\u2585\u2586\u2587\u2588"[Math.min(7, Math.floor(v * 8))]).join("");
    const tgt = Number.isNaN(s.target) ? "task" : s.target.toFixed(2);
    // Null metrics (no task reached the target) render as an em dash.
    lines.push(
      `${s.strategy.padEnd(19)} ${tgt.padStart(4)} ${String(s.reachedTarget).padStart(4)}/${String(s.tasks).padEnd(3)} ${s.medianTttMs === null ? " \u2014" : `${(s.medianTttMs / 1e3).toFixed(1).padStart(6)}s`} ${s.medianShotsToTarget === null ? " \u2014" : String(s.medianShotsToTarget).padStart(5)} ${s.ertMs === null ? " \u2014" : `${(s.ertMs / 1e3).toFixed(1).padStart(9)}s`} ${s.erUsd === null ? " \u2014" : `$${s.erUsd.toFixed(4)}`} ${s.auc.toFixed(2)} ${curve}`
    );
  }
  return lines.join("\n");
}
525
+
429
526
  // src/runtime/audit-intent.ts
430
527
  var defaultAuditorInstruction = "You audit whether an AI agent is on the RIGHT ROUTE \u2014 not whether it works hard, but whether its actions serve the stated intents. Infer the REVEALED intent from the action pattern (what the trajectory is actually optimizing). Compare against the declared task intent, the user intent when given, and the meta-intent when given. Flawless execution down the wrong route is DIVERGED. Busy-work that neither advances nor harms is DRIFTING. Judge only from the trajectory \u2014 be specific about which actions ground your verdict. Recommend abort only when continuing cannot serve the intent.";
431
528
  function summarize(trace, maxLines) {
@@ -2346,20 +2443,20 @@ async function finalizeSettlement(child, settlement, seq, args, now) {
2346
2443
  }
2347
2444
  async function runChild(live, executor, childAbort, task, opts, pool2, ticket, blobs) {
2348
2445
  let reconciled = false;
2349
- const reconcileOnce = (spend2) => {
2446
+ const reconcileOnce = (spend) => {
2350
2447
  if (reconciled) return;
2351
2448
  reconciled = true;
2352
- pool2.reconcile(ticket, clampSpend(spend2, opts.budget));
2449
+ pool2.reconcile(ticket, clampSpend(spend, opts.budget));
2353
2450
  };
2354
2451
  try {
2355
2452
  live.status = "running";
2356
2453
  const ran = executor.execute(task, childAbort.signal);
2357
2454
  let artifact;
2358
2455
  if (isAsyncIterable2(ran)) {
2359
- const spend2 = await foldStream(ran);
2360
- live.spent = spend2;
2456
+ const spend = await foldStream(ran);
2457
+ live.spent = spend;
2361
2458
  artifact = executor.resultArtifact();
2362
- reconcileOnce(spend2);
2459
+ reconcileOnce(spend);
2363
2460
  } else {
2364
2461
  const terminal = await ran;
2365
2462
  live.spent = terminal.spent;
@@ -2448,21 +2545,21 @@ async function foldStream(stream) {
2448
2545
  }
2449
2546
  return { iterations, tokens, usd, ms: 0 };
2450
2547
  }
2451
- function clampSpend(spend2, budget) {
2452
- const totalTokens2 = spend2.tokens.input + spend2.tokens.output;
2548
+ function clampSpend(spend, budget) {
2549
+ const totalTokens2 = spend.tokens.input + spend.tokens.output;
2453
2550
  const tokensOk = totalTokens2 <= budget.maxTokens;
2454
- const itersOk = spend2.iterations <= budget.maxIterations;
2455
- const usdOk = budget.maxUsd === void 0 || spend2.usd <= budget.maxUsd;
2456
- if (tokensOk && itersOk && usdOk) return spend2;
2551
+ const itersOk = spend.iterations <= budget.maxIterations;
2552
+ const usdOk = budget.maxUsd === void 0 || spend.usd <= budget.maxUsd;
2553
+ if (tokensOk && itersOk && usdOk) return spend;
2457
2554
  const ratio = !tokensOk && totalTokens2 > 0 ? budget.maxTokens / totalTokens2 : 1;
2458
2555
  return {
2459
- iterations: Math.min(spend2.iterations, budget.maxIterations),
2556
+ iterations: Math.min(spend.iterations, budget.maxIterations),
2460
2557
  tokens: ratio < 1 ? {
2461
- input: Math.floor(spend2.tokens.input * ratio),
2462
- output: Math.floor(spend2.tokens.output * ratio)
2463
- } : spend2.tokens,
2464
- usd: budget.maxUsd === void 0 ? spend2.usd : Math.min(spend2.usd, budget.maxUsd),
2465
- ms: spend2.ms
2558
+ input: Math.floor(spend.tokens.input * ratio),
2559
+ output: Math.floor(spend.tokens.output * ratio)
2560
+ } : spend.tokens,
2561
+ usd: budget.maxUsd === void 0 ? spend.usd : Math.min(spend.usd, budget.maxUsd),
2562
+ ms: spend.ms
2466
2563
  };
2467
2564
  }
2468
2565
  async function teardownSafe(executor, grace) {
@@ -3110,7 +3207,7 @@ var routerToolsInlineExecutor = (spec, ctx) => {
3110
3207
  "routerToolsInlineExecutor: RouterToolsSeam.routerBaseUrl + routerKey required"
3111
3208
  );
3112
3209
  }
3113
- const maxTurns = seam.maxTurns ?? 4;
3210
+ const maxTurns = seam.maxTurns ?? 200;
3114
3211
  const controller = new AbortController();
3115
3212
  const abortIfSignalled = () => {
3116
3213
  if (ctx.signal.aborted) controller.abort();
@@ -4228,12 +4325,12 @@ function countStatuses(reported) {
4228
4325
  function zeroSpend4() {
4229
4326
  return { iterations: 0, tokens: zeroTokenUsage(), usd: 0, ms: 0 };
4230
4327
  }
4231
- function cloneSpend(spend2) {
4328
+ function cloneSpend(spend) {
4232
4329
  return {
4233
- iterations: spend2.iterations,
4234
- tokens: { input: spend2.tokens.input, output: spend2.tokens.output },
4235
- usd: spend2.usd,
4236
- ms: spend2.ms
4330
+ iterations: spend.iterations,
4331
+ tokens: { input: spend.tokens.input, output: spend.tokens.output },
4332
+ usd: spend.usd,
4333
+ ms: spend.ms
4237
4334
  };
4238
4335
  }
4239
4336
  function addSpend(acc, delta) {
@@ -4249,13 +4346,13 @@ function spreadOf(values) {
4249
4346
  function fractionalSpread(values) {
4250
4347
  const spread = spreadOf(values);
4251
4348
  if (spread === 0) return 0;
4252
- const median = medianOf(values);
4253
- if (median === 0) {
4349
+ const median2 = medianOf(values);
4350
+ if (median2 === 0) {
4254
4351
  throw new Error(
4255
4352
  "equalKOnCost: arms have a non-zero cost spread on a zero-median channel; cannot express it as a fraction"
4256
4353
  );
4257
4354
  }
4258
- return spread / median;
4355
+ return spread / median2;
4259
4356
  }
4260
4357
  function medianOf(values) {
4261
4358
  if (values.length === 0) {
@@ -4287,28 +4384,34 @@ function requireNode2(nodes, id, root) {
4287
4384
  return node;
4288
4385
  }
4289
4386
  function requireSpend(rolled, id, root) {
4290
- const spend2 = rolled.get(id);
4291
- if (!spend2) {
4387
+ const spend = rolled.get(id);
4388
+ if (!spend) {
4292
4389
  throw new Error(
4293
4390
  `trajectoryReport: node '${id}' was never rolled up in tree '${root}' (unreachable from root)`
4294
4391
  );
4295
4392
  }
4296
- return spend2;
4393
+ return spend;
4297
4394
  }
4298
4395
 
4299
4396
  // src/runtime/promotion-gate.ts
4300
4397
  import { heldoutSignificance } from "@tangle-network/agent-eval/campaign";
4301
4398
  function promotionGate(opts) {
4399
+ const mode = opts.mode ?? "superiority";
4302
4400
  if (opts.candidate === opts.incumbent) {
4303
4401
  return {
4304
4402
  promoted: false,
4305
4403
  reason: "identical-champion",
4404
+ mode,
4306
4405
  n: 0,
4307
4406
  lift: { mean: 0, median: 0, low: 0, high: 0 }
4308
4407
  };
4309
4408
  }
4310
4409
  const before = [];
4311
4410
  const after = [];
4411
+ const incUsd = [];
4412
+ const candUsd = [];
4413
+ const incMs = [];
4414
+ const candMs = [];
4312
4415
  const cellIds = [];
4313
4416
  for (const row of opts.report.perTask) {
4314
4417
  const inc = row.cells?.[opts.incumbent];
@@ -4316,6 +4419,10 @@ function promotionGate(opts) {
4316
4419
  if (!inc || !cand) continue;
4317
4420
  before.push(inc.score);
4318
4421
  after.push(cand.score);
4422
+ incUsd.push(inc.usd);
4423
+ candUsd.push(cand.usd);
4424
+ incMs.push(inc.ms);
4425
+ candMs.push(cand.ms);
4319
4426
  cellIds.push(row.taskId);
4320
4427
  }
4321
4428
  if (before.length === 0) {
@@ -4339,8 +4446,84 @@ function promotionGate(opts) {
4339
4446
  low: sig.bootstrap.low,
4340
4447
  high: sig.bootstrap.high
4341
4448
  };
4342
- if (sig.fewRuns) return { promoted: false, reason: "few-tasks", n: sig.n, lift };
4343
- return sig.significant ? { promoted: true, reason: "significant", n: sig.n, lift } : { promoted: false, reason: "no-margin", n: sig.n, lift };
4449
+ const latSig = heldoutSignificance(
4450
+ { before: incMs, after: candMs, cellIds },
4451
+ {
4452
+ deltaThreshold: 0,
4453
+ minProductiveRuns: 1,
4454
+ statistic: opts.statistic ?? "mean",
4455
+ ...opts.seed !== void 0 ? { seed: opts.seed } : {},
4456
+ ...opts.resamples !== void 0 ? { resamples: opts.resamples } : {}
4457
+ }
4458
+ );
4459
+ const latency = {
4460
+ mean: latSig.bootstrap.mean,
4461
+ median: latSig.bootstrap.median,
4462
+ low: latSig.bootstrap.low,
4463
+ high: latSig.bootstrap.high
4464
+ };
4465
+ if (mode === "superiority") {
4466
+ if (sig.fewRuns) return { promoted: false, reason: "few-tasks", mode, n: sig.n, lift, latency };
4467
+ return sig.significant ? { promoted: true, reason: "significant", mode, n: sig.n, lift, latency } : { promoted: false, reason: "no-margin", mode, n: sig.n, lift, latency };
4468
+ }
4469
+ const tolerance = opts.scoreTolerance ?? 0.05;
4470
+ const scoreSig = heldoutSignificance(
4471
+ { before, after, cellIds },
4472
+ {
4473
+ deltaThreshold: -tolerance,
4474
+ minProductiveRuns: opts.minPairedTasks ?? 6,
4475
+ statistic: opts.statistic ?? "mean",
4476
+ ...opts.seed !== void 0 ? { seed: opts.seed } : {},
4477
+ ...opts.resamples !== void 0 ? { resamples: opts.resamples } : {}
4478
+ }
4479
+ );
4480
+ const costSig = heldoutSignificance(
4481
+ { before: candUsd, after: incUsd, cellIds },
4482
+ {
4483
+ deltaThreshold: 0,
4484
+ minProductiveRuns: opts.minPairedTasks ?? 6,
4485
+ statistic: opts.statistic ?? "mean",
4486
+ ...opts.seed !== void 0 ? { seed: opts.seed } : {},
4487
+ ...opts.resamples !== void 0 ? { resamples: opts.resamples } : {}
4488
+ }
4489
+ );
4490
+ const costSavings = {
4491
+ mean: costSig.bootstrap.mean,
4492
+ median: costSig.bootstrap.median,
4493
+ low: costSig.bootstrap.low,
4494
+ high: costSig.bootstrap.high
4495
+ };
4496
+ if (scoreSig.fewRuns)
4497
+ return { promoted: false, reason: "few-tasks", mode, n: scoreSig.n, lift, costSavings, latency };
4498
+ if (!scoreSig.significant)
4499
+ return {
4500
+ promoted: false,
4501
+ reason: "non-inferiority-unproven",
4502
+ mode,
4503
+ n: scoreSig.n,
4504
+ lift,
4505
+ costSavings,
4506
+ latency
4507
+ };
4508
+ if (!costSig.significant)
4509
+ return {
4510
+ promoted: false,
4511
+ reason: "not-cheaper",
4512
+ mode,
4513
+ n: scoreSig.n,
4514
+ lift,
4515
+ costSavings,
4516
+ latency
4517
+ };
4518
+ return {
4519
+ promoted: true,
4520
+ reason: "non-inferior-and-cheaper",
4521
+ mode,
4522
+ n: scoreSig.n,
4523
+ lift,
4524
+ costSavings,
4525
+ latency
4526
+ };
4344
4527
  }
4345
4528
 
4346
4529
  // src/runtime/run-benchmark.ts
@@ -4364,7 +4547,8 @@ async function runShot(surface, _task, handle, tools, messages, opts, modelOverr
4364
4547
  messages,
4365
4548
  tools,
4366
4549
  tool_choice: "auto",
4367
- temperature: opts.temperature ?? 0.7
4550
+ temperature: opts.temperature ?? 0.7,
4551
+ ...opts.maxTokens ? { max_tokens: opts.maxTokens } : {}
4368
4552
  })
4369
4553
  });
4370
4554
  if (!res.ok) throw new Error(`router ${res.status}: ${(await res.text()).slice(0, 200)}`);
@@ -4403,12 +4587,15 @@ async function runShot(surface, _task, handle, tools, messages, opts, modelOverr
4403
4587
  }
4404
4588
  return { messages, completions, toolCalls, toolErrors, tokens };
4405
4589
  }
4406
- async function analyze(task, messages, opts) {
4407
- const trajectory = messages.filter((m) => m.role === "assistant" || m.role === "tool").map((m) => {
4590
/**
 * Flattens a chat transcript into a compact, line-per-step text trajectory.
 *
 * Only assistant and tool messages are kept. A tool message becomes
 * `RESULT <first 280 chars>`; an assistant message with tool calls becomes
 * `CALL name(args), ...`; any other assistant message becomes
 * `SAY <first 200 chars>`. The joined text is capped at 7000 characters.
 *
 * @param {Array<{role: string, content?: unknown, tool_calls?: Array<{function: {name: string, arguments: string}}>}>} messages
 * @returns {string} the newline-joined trajectory.
 */
function compactTrajectory(messages) {
  const steps = [];
  for (const msg of messages) {
    if (msg.role === "tool") {
      steps.push(`RESULT ${String(msg.content).slice(0, 280)}`);
      continue;
    }
    if (msg.role !== "assistant") continue;
    const invoked = msg.tool_calls?.map((call) => `${call.function.name}(${call.function.arguments})`).join(", ");
    if (invoked) {
      steps.push(`CALL ${invoked}`);
    } else {
      // Covers both "no tool_calls" and an empty tool_calls array.
      steps.push(`SAY ${String(msg.content).slice(0, 200)}`);
    }
  }
  return steps.join("\n").slice(0, 7e3);
}
4597
+ async function consultAnalyst(task, messages, instruction, opts) {
4598
+ const trajectory = compactTrajectory(messages);
4412
4599
  const analystModel = opts.analystModel ?? opts.model;
4413
4600
  const chat = createChatClient({
4414
4601
  transport: "router",
@@ -4416,6 +4603,52 @@ async function analyze(task, messages, opts) {
4416
4603
  baseUrl: opts.routerBaseUrl,
4417
4604
  defaultModel: analystModel
4418
4605
  });
4606
+ const res = await chat.chat({
4607
+ model: analystModel,
4608
+ temperature: 0.2,
4609
+ maxTokens: 1024,
4610
+ messages: [
4611
+ { role: "system", content: instruction },
4612
+ {
4613
+ role: "user",
4614
+ content: `TASK: ${task.userPrompt.slice(0, 1500)}
4615
+
4616
+ TRAJECTORY:
4617
+ ${trajectory}`
4618
+ }
4619
+ ]
4620
+ });
4621
+ const usage = res.usage;
4622
+ return {
4623
+ steer: res.content.trim(),
4624
+ tokens: {
4625
+ input: usage?.promptTokens ?? usage?.prompt_tokens ?? 0,
4626
+ output: usage?.completionTokens ?? usage?.completion_tokens ?? 0
4627
+ }
4628
+ };
4629
+ }
4630
+ async function analyze(task, messages, opts) {
4631
+ const trajectory = compactTrajectory(messages);
4632
+ const analystModel = opts.analystModel ?? opts.model;
4633
+ const inner = createChatClient({
4634
+ transport: "router",
4635
+ apiKey: opts.routerKey,
4636
+ baseUrl: opts.routerBaseUrl,
4637
+ defaultModel: analystModel
4638
+ });
4639
+ const tokens = { input: 0, output: 0 };
4640
+ const chat = {
4641
+ ...inner,
4642
+ chat: async (req, callOpts) => {
4643
+ const res = await inner.chat(req, callOpts);
4644
+ const u = res.usage;
4645
+ if (u) {
4646
+ tokens.input += u.promptTokens ?? u.prompt_tokens ?? 0;
4647
+ tokens.output += u.completionTokens ?? u.completion_tokens ?? 0;
4648
+ }
4649
+ return res;
4650
+ }
4651
+ };
4419
4652
  const obs = await observe(
4420
4653
  {
4421
4654
  task: task.userPrompt,
@@ -4432,14 +4665,8 @@ async function analyze(task, messages, opts) {
4432
4665
  }
4433
4666
  );
4434
4667
  const steer = obs.findings.map((f) => f.recommended_action).filter((a) => typeof a === "string" && a.trim().length > 0).join("\n").trim();
4435
- return steer || "COMPLETE";
4668
+ return { steer: steer || "COMPLETE", tokens };
4436
4669
  }
4437
- var spend = (iterations) => ({
4438
- iterations,
4439
- tokens: { input: 0, output: 0 },
4440
- usd: 0,
4441
- ms: 0
4442
- });
4443
4670
  function shotExecutor(surface, opts) {
4444
4671
  let artifact;
4445
4672
  return {
@@ -4449,7 +4676,19 @@ function shotExecutor(surface, opts) {
4449
4676
  const own = !t.handle;
4450
4677
  const handle = t.handle ?? await surface.open(t.task);
4451
4678
  try {
4452
- const tools = await surface.tools(t.task, handle);
4679
+ const allTools = await surface.tools(t.task, handle);
4680
+ let tools = allTools;
4681
+ if (t.tools) {
4682
+ const known = new Set(allTools.map((tool) => tool.function.name));
4683
+ const unknown = t.tools.filter((name) => !known.has(name));
4684
+ if (unknown.length > 0) {
4685
+ throw new Error(
4686
+ `shot tools: unknown tool name(s) ${unknown.join(", ")} \u2014 domain offers: ${[...known].join(", ")}`
4687
+ );
4688
+ }
4689
+ const want = new Set(t.tools);
4690
+ tools = allTools.filter((tool) => want.has(tool.function.name));
4691
+ }
4453
4692
  const messages = t.messages?.length ? t.messages : [
4454
4693
  { role: "system", content: t.persona?.systemPrompt ?? t.task.systemPrompt },
4455
4694
  { role: "user", content: `${t.task.userPrompt}
@@ -4505,8 +4744,18 @@ function analystExecutor(opts) {
4505
4744
  runtime: "agentic-analyst",
4506
4745
  async execute(task) {
4507
4746
  const t = task;
4508
- const findings = await analyze(t.task, t.messages, opts);
4509
- artifact = { outRef: `analyst:${findings.length}`, out: findings, spent: spend(1) };
4747
+ const { steer, tokens } = t.rawInstruction ? await consultAnalyst(t.task, t.messages, t.rawInstruction, opts) : await analyze(t.task, t.messages, opts);
4748
+ const analystModel = opts.analystModel ?? opts.model;
4749
+ artifact = {
4750
+ outRef: `analyst:${steer.length}`,
4751
+ out: steer,
4752
+ spent: {
4753
+ iterations: 1,
4754
+ tokens,
4755
+ usd: isModelPriced2(analystModel) ? estimateCost2(tokens.input, tokens.output, analystModel) : 0,
4756
+ ms: 0
4757
+ }
4758
+ };
4510
4759
  return artifact;
4511
4760
  },
4512
4761
  teardown: () => Promise.resolve({ destroyed: true }),
@@ -4669,12 +4918,21 @@ function defineStrategy(name, run) {
4669
4918
  const innerTurns = opts.innerTurns ?? 4;
4670
4919
  let verifiedBest = 0;
4671
4920
  let verifiedResolved = false;
4921
+ const openHandles = /* @__PURE__ */ new Set();
4672
4922
  const ctx = {
4673
4923
  // Narrowed to open/close — the body gets no raw call()/score() access.
4674
4924
  surface: {
4675
4925
  name: surface.name,
4676
- open: (t) => surface.open(t),
4677
- close: (h) => surface.close(h)
4926
+ open: async (t) => {
4927
+ const h = await surface.open(t);
4928
+ openHandles.add(h.id);
4929
+ return h;
4930
+ },
4931
+ close: async (h) => {
4932
+ if (!h || !openHandles.has(h.id)) return;
4933
+ openHandles.delete(h.id);
4934
+ await surface.close(h);
4935
+ }
4678
4936
  },
4679
4937
  task,
4680
4938
  opts,
@@ -4690,7 +4948,8 @@ function defineStrategy(name, run) {
4690
4948
  handle: spec?.handle,
4691
4949
  messages: spec?.messages,
4692
4950
  steer: spec?.steer,
4693
- persona: spec?.persona
4951
+ persona: spec?.persona,
4952
+ tools: spec?.tools
4694
4953
  },
4695
4954
  { budget: perChild(innerTurns), label: child.name }
4696
4955
  );
@@ -4702,6 +4961,13 @@ function defineStrategy(name, run) {
4702
4961
  if (out.total > 0 && out.passes === out.total) verifiedResolved = true;
4703
4962
  return out;
4704
4963
  },
4964
+ async listTools(handle) {
4965
+ const tools = await surface.tools(task, handle);
4966
+ return tools.map((t) => ({
4967
+ name: t.function.name,
4968
+ ...t.function.description ? { description: t.function.description } : {}
4969
+ }));
4970
+ },
4705
4971
  async critique(messages) {
4706
4972
  const child = leaf(`analyst:${seq}`, "analyst");
4707
4973
  seq += 1;
@@ -4715,12 +4981,33 @@ function defineStrategy(name, run) {
4715
4981
  if (settled.kind === "down") return null;
4716
4982
  const findings = settled.out;
4717
4983
  return /^\s*COMPLETE\b/i.test(findings) ? null : findings;
4984
+ },
4985
+ async consult(messages, instruction) {
4986
+ const child = leaf(`analyst:${seq}`, "analyst");
4987
+ seq += 1;
4988
+ const res = scope.spawn(
4989
+ child,
4990
+ { task, messages, rawInstruction: instruction },
4991
+ { budget: perChild(1), label: child.name }
4992
+ );
4993
+ if (!res.ok) return null;
4994
+ const settled = await drainOne2(scope);
4995
+ if (settled.kind === "down") return null;
4996
+ return settled.out;
4718
4997
  }
4719
4998
  };
4720
4999
  const r = await run(ctx);
4721
5000
  return {
4722
5001
  kind: "done",
4723
- deliverable: { mode: name, ...r, score: verifiedBest, resolved: verifiedResolved }
5002
+ deliverable: {
5003
+ mode: name,
5004
+ ...r,
5005
+ progression: Array.isArray(r.progression) ? r.progression : [],
5006
+ completions: typeof r.completions === "number" ? r.completions : 0,
5007
+ shots: typeof r.shots === "number" ? r.shots : 0,
5008
+ score: verifiedBest,
5009
+ resolved: verifiedResolved
5010
+ }
4724
5011
  };
4725
5012
  }
4726
5013
  })
@@ -4875,27 +5162,44 @@ async function runBenchmark(cfg) {
4875
5162
  let settled = 0;
4876
5163
  const perTask = await pool(cfg.tasks, concurrency, async (task) => {
4877
5164
  const cells = {};
5165
+ const errors = {};
4878
5166
  let row;
4879
5167
  try {
4880
5168
  for (const s of strategies) {
4881
- const r = await runAgentic({
4882
- ...cfg.worker,
4883
- surface: cfg.environment,
4884
- task,
4885
- strategy: s,
4886
- budget,
4887
- ...cfg.hooks ? { hooks: cfg.hooks } : {}
4888
- });
4889
- cells[s.name] = {
4890
- score: r.score,
4891
- resolved: r.resolved,
4892
- progression: r.progression,
4893
- usd: r.usd,
4894
- ms: r.ms,
4895
- tokens: r.tokens
4896
- };
5169
+ try {
5170
+ const r = await runAgentic({
5171
+ ...cfg.worker,
5172
+ surface: cfg.environment,
5173
+ task,
5174
+ strategy: s,
5175
+ budget,
5176
+ ...cfg.hooks ? { hooks: cfg.hooks } : {}
5177
+ });
5178
+ cells[s.name] = {
5179
+ score: r.score,
5180
+ resolved: r.resolved,
5181
+ progression: r.progression,
5182
+ usd: r.usd,
5183
+ ms: r.ms,
5184
+ tokens: r.tokens
5185
+ };
5186
+ } catch (e) {
5187
+ errors[s.name] = e instanceof Error ? e.message.slice(0, 300) : String(e);
5188
+ cells[s.name] = {
5189
+ score: 0,
5190
+ resolved: false,
5191
+ progression: [],
5192
+ usd: 0,
5193
+ ms: 0,
5194
+ tokens: { input: 0, output: 0 }
5195
+ };
5196
+ }
4897
5197
  }
4898
- row = { taskId: task.id, cells };
5198
+ row = {
5199
+ taskId: task.id,
5200
+ cells,
5201
+ ...Object.keys(errors).length > 0 ? { errors } : {}
5202
+ };
4899
5203
  } catch (e) {
4900
5204
  row = { taskId: task.id, error: e instanceof Error ? e.message.slice(0, 300) : String(e) };
4901
5205
  }
@@ -5200,7 +5504,7 @@ var strategyAuthorContract = `
5200
5504
  You author an OPTIMIZATION STRATEGY for an agentic loop system. A strategy decides how to
5201
5505
  spend a compute budget to beat a task's deployable check. You compose exactly two steps:
5202
5506
 
5203
- shot(spec?: { handle?, messages?, steer?, persona? }): Promise<ShotResult | null>
5507
+ shot(spec?: { handle?, messages?, steer?, persona?, tools? }): Promise<ShotResult | null>
5204
5508
  Runs ONE worker attempt (a bounded tool loop) over an artifact.
5205
5509
  - omit handle => the shot opens its OWN fresh artifact and closes it after (a sample).
5206
5510
  - pass handle => the shot CONTINUES that artifact (state accumulates across shots).
@@ -5210,6 +5514,10 @@ spend a compute budget to beat a task's deployable check. You compose exactly tw
5210
5514
  (multi-agent strategies: a researcher shot then an engineer shot, a panel of k
5211
5515
  personas over one budget). On a fresh shot the systemPrompt replaces the task's; on
5212
5516
  a carried conversation it arrives as a hand-off message. Same conserved budget.
5517
+ - tools => string[] \u2014 restrict THIS shot to a subset of the task's tools by
5518
+ name (focus an explore shot on read-only tools, an execute shot on write tools).
5519
+ Restriction-only; unknown names make the shot fail. ALWAYS select from
5520
+ await listTools(handle) \u2014 never hardcode. Omitted => the shot sees every tool.
5213
5521
  ShotResult = { messages, score (0..1 on the task's check), passes, total, completions, toolErrors }
5214
5522
  Returns null if the attempt failed infra-wise.
5215
5523
 
@@ -5217,10 +5525,23 @@ spend a compute budget to beat a task's deployable check. You compose exactly tw
5217
5525
  A firewalled trace-analyst reads the attempt's trajectory and returns ONE corrective
5218
5526
  instruction (or null when it judges the work complete). Costs ~1 completion.
5219
5527
 
5528
+ consult(messages, instruction): Promise<string | null>
5529
+ The RAW analyst channel: the same firewalled critic answers YOUR instruction over the
5530
+ trajectory verbatim (no reformatting) \u2014 use it when you need a specific reply format
5531
+ (a decision, a prediction). Costs ~1 completion.
5532
+
5220
5533
  surface.open(task) / surface.close(handle)
5221
5534
  Open a persistent artifact you manage yourself (remember to close in a finally).
5535
+ close is idempotent \u2014 closing an already-closed handle is a safe no-op.
5536
+
5537
+ listTools(handle): Promise<Array<{ name, description? }>>
5538
+ The tools THIS task actually offers. TOOL SETS VARY PER TASK \u2014 if you restrict a
5539
+ shot with \`tools\`, you MUST pick names from await listTools(handle); hardcoding
5540
+ names from an example kills your shots on every task whose tools differ.
5222
5541
 
5223
5542
  Rules:
5543
+ - ALWAYS await every shot/critique/surface call \u2014 a floating promise that rejects
5544
+ crashes the whole benchmark run.
5224
5545
  - Stay within ~budget total shots; every shot/critique spends from a conserved pool.
5225
5546
  - For a FRESH attempt OMIT \`messages\` entirely (never pass \`[]\` \u2014 an empty array is a
5226
5547
  fresh conversation too, but be explicit). To CONTINUE, pass the previous
@@ -5230,8 +5551,8 @@ Rules:
5230
5551
  - The module must be EXACTLY this shape (no other imports, no commentary outside code):
5231
5552
 
5232
5553
  import { defineStrategy } from '@tangle-network/agent-runtime/loops'
5233
- export default defineStrategy('your-strategy-name', async ({ surface, task, budget, shot, critique }) => {
5234
- // your composition
5554
+ export default defineStrategy('your-strategy-name', async ({ surface, task, budget, shot, critique, listTools }) => {
5555
+ // your composition (listTools comes from the destructured context \u2014 it is NOT a global)
5235
5556
  })
5236
5557
  `;
5237
5558
  function assertStrategyContract(code) {
@@ -5307,34 +5628,89 @@ async function authorStrategy(opts) {
5307
5628
  }
5308
5629
 
5309
5630
  // src/runtime/strategy-evolution.ts
5631
+ import { existsSync, readFileSync, writeFileSync as writeFileSync2 } from "fs";
5310
5632
  import { gzipSync } from "zlib";
5311
- function selectChampion(report, fieldOrder, policy, epsilon) {
5312
- const entries = fieldOrder.map((name) => ({ name, summary: report.perStrategy[name] })).filter((e) => !!e.summary);
5633
+ function discriminatingMeans(report, fieldOrder) {
5634
+ const rows = report.perTask.filter((r) => {
5635
+ if (!r.cells) return false;
5636
+ const scores = fieldOrder.map((n) => r.cells?.[n]?.score).filter((s) => s !== void 0);
5637
+ if (scores.length < fieldOrder.length) return false;
5638
+ return Math.max(...scores) - Math.min(...scores) > 0;
5639
+ });
5640
+ if (rows.length === 0) return null;
5641
+ const out = {};
5642
+ for (const name of fieldOrder) {
5643
+ const cells = rows.map((r) => r.cells?.[name]).filter((c) => !!c);
5644
+ out[name] = {
5645
+ score: cells.reduce((s, c) => s + c.score, 0) / cells.length,
5646
+ usd: cells.reduce((s, c) => s + c.usd, 0) / cells.length
5647
+ };
5648
+ }
5649
+ return out;
5650
+ }
5651
+ function pickChampion(means, fieldOrder, policy, epsilon) {
5652
+ const entries = fieldOrder.map((name) => ({ name, summary: means[name] })).filter((e) => !!e.summary);
5313
5653
  if (entries.length === 0)
5314
- throw new Error("selectChampion: report carries none of the field strategies");
5654
+ throw new Error("pickChampion: the means table carries none of the field strategies");
5315
5655
  const best = Math.max(...entries.map((e) => e.summary.score));
5316
5656
  const pick = policy === "score" ? entries.find((e) => e.summary.score === best) : entries.filter((e) => e.summary.score >= best - epsilon).sort((a, b) => a.summary.usd - b.summary.usd || b.summary.score - a.summary.score)[0];
5317
- if (!pick) throw new Error("selectChampion: empty pick (unreachable)");
5657
+ if (!pick) throw new Error("pickChampion: empty pick (unreachable)");
5318
5658
  return { name: pick.name, score: pick.summary.score, usd: pick.summary.usd };
5319
5659
  }
5660
+ function selectChampion(report, fieldOrder, policy, epsilon) {
5661
+ return pickChampion(report.perStrategy, fieldOrder, policy, epsilon);
5662
+ }
5320
5663
  var fieldSummary = (archive) => archive.map(
5321
5664
  (n) => `- ${n.name} (${n.source}, gen ${n.generation}, last score ${(n.score * 100).toFixed(0)}%)`
5322
5665
  ).join("\n");
5323
- var compactLosses = (report) => {
5666
+ var compactLosses = (report, detail) => {
5324
5667
  const r2 = (x) => Math.round(x * 100) / 100;
5325
5668
  const rows = report.perTask.map(
5326
5669
  (row) => row.cells ? {
5327
5670
  task: row.taskId,
5671
+ ...row.errors ? {
5672
+ errors: Object.fromEntries(
5673
+ Object.entries(row.errors).map(([n, msg]) => [n, msg.slice(0, 100)])
5674
+ )
5675
+ } : {},
5328
5676
  cells: Object.fromEntries(
5329
5677
  Object.entries(row.cells).map(([name, c]) => [
5330
5678
  name,
5331
- { score: r2(c.score), resolved: c.resolved, progression: c.progression.map(r2) }
5679
+ // 'binary' is the leakage-bounded channel: the author learns pass/fail per
5680
+ // task and nothing else — the per-generation leak from the evaluation data
5681
+ // is capped at one bit per cell (arXiv:2606.11045 measured that exploration
5682
+ // survives this; whether AUTHORING does is the E1-coarse A/B).
5683
+ detail === "binary" ? { resolved: c.resolved, usd: Math.round(c.usd * 1e4) / 1e4 } : {
5684
+ score: r2(c.score),
5685
+ resolved: c.resolved,
5686
+ usd: Math.round(c.usd * 1e4) / 1e4,
5687
+ progression: (c.progression ?? []).map(r2)
5688
+ }
5332
5689
  ])
5333
5690
  )
5334
5691
  } : { task: row.taskId, error: row.error?.slice(0, 80) }
5335
5692
  );
5336
5693
  return JSON.stringify(rows).slice(0, 12e3);
5337
5694
  };
5695
+ function renameStrategy(orig, unique) {
5696
+ if (orig.name === unique) return orig;
5697
+ return {
5698
+ name: unique,
5699
+ driver: (s, t, o, b) => {
5700
+ const agent = orig.driver(s, t, o, b);
5701
+ return {
5702
+ ...agent,
5703
+ name: unique,
5704
+ act: async (task, scope) => {
5705
+ const out = await agent.act(task, scope);
5706
+ if (out.kind !== "done") return out;
5707
+ const deliverable = { ...out.deliverable, mode: unique };
5708
+ return { ...out, deliverable };
5709
+ }
5710
+ };
5711
+ }
5712
+ };
5713
+ }
5338
5714
  async function runStrategyEvolution(cfg) {
5339
5715
  const budget = cfg.budget ?? 3;
5340
5716
  const concurrency = cfg.concurrency ?? 3;
@@ -5342,37 +5718,72 @@ async function runStrategyEvolution(cfg) {
5342
5718
  const populationSize = cfg.populationSize ?? 2;
5343
5719
  const baselines = cfg.baselines ?? [sample, refine, sampleThenRefine];
5344
5720
  const policy = cfg.champion ?? "costAware";
5345
- const epsilon = cfg.championEpsilon ?? 0.01;
5721
+ const epsilon = cfg.championEpsilon ?? (cfg.objective === "cost" ? cfg.scoreTolerance ?? 0.05 : 0.01);
5346
5722
  const byName = new Map(baselines.map((s) => [s.name, s]));
5347
- const bench = (phase, tasks, strategies) => runBenchmark({
5348
- environment: cfg.environment,
5349
- tasks,
5350
- worker: cfg.worker,
5351
- strategies,
5723
+ const codeByName = /* @__PURE__ */ new Map();
5724
+ const fingerprint = {
5725
+ trainN: cfg.trainN,
5726
+ holdoutN: cfg.holdoutN,
5352
5727
  budget,
5353
- concurrency,
5354
- ...cfg.onTask ? { onTask: (row, done, total) => cfg.onTask?.(phase, row, done, total) } : {},
5355
- ...cfg.hooks ? { hooks: cfg.hooks } : {}
5356
- });
5728
+ generations,
5729
+ populationSize
5730
+ };
5731
+ let ckpt;
5732
+ if (cfg.checkpoint?.resume && existsSync(cfg.checkpoint.path)) {
5733
+ const raw = JSON.parse(readFileSync(cfg.checkpoint.path, "utf8"));
5734
+ if (JSON.stringify(raw.fingerprint) !== JSON.stringify(fingerprint)) {
5735
+ throw new Error(
5736
+ `evolution resume: checkpoint design mismatch \u2014 checkpoint ${JSON.stringify(raw.fingerprint)} vs config ${JSON.stringify(fingerprint)}; delete ${cfg.checkpoint.path} or match the config`
5737
+ );
5738
+ }
5739
+ ckpt = raw;
5740
+ }
5741
+ const save = (state) => {
5742
+ if (cfg.checkpoint)
5743
+ writeFileSync2(cfg.checkpoint.path, JSON.stringify({ ...state, fingerprint }, null, 1));
5744
+ };
5745
+ const bench = async (phase, tasks, strategies) => {
5746
+ await cfg.onPhase?.(phase);
5747
+ return runBenchmark({
5748
+ environment: cfg.environment,
5749
+ tasks,
5750
+ worker: cfg.worker,
5751
+ strategies,
5752
+ budget,
5753
+ concurrency,
5754
+ ...cfg.onTask ? { onTask: (row, done, total) => cfg.onTask?.(phase, row, done, total) } : {},
5755
+ ...cfg.hooks ? { hooks: cfg.hooks } : {}
5756
+ });
5757
+ };
5357
5758
  const train = await cfg.tasks(0, cfg.trainN);
5358
- const gen0 = await bench("gen0", train, baselines);
5359
- const archive = baselines.map((s) => ({
5759
+ const probeTask = train[0];
5760
+ if (!probeTask) throw new Error("runStrategyEvolution: empty train slice");
5761
+ const probe = await cfg.environment.open(probeTask);
5762
+ let toolCatalog;
5763
+ try {
5764
+ const tools = await cfg.environment.tools(probeTask, probe);
5765
+ toolCatalog = tools.map(
5766
+ (t) => `- ${t.function.name}${t.function.description ? ` \u2014 ${t.function.description.slice(0, 120)}` : ""}`
5767
+ ).join("\n");
5768
+ } finally {
5769
+ await cfg.environment.close(probe);
5770
+ }
5771
+ const gen0 = ckpt?.gen0 ?? await bench("gen0", train, baselines);
5772
+ const archive = ckpt?.archive ? [...ckpt.archive] : baselines.map((s) => ({
5360
5773
  name: s.name,
5361
5774
  source: "baseline",
5362
5775
  generation: 0,
5363
5776
  score: gen0.perStrategy[s.name]?.score ?? 0,
5364
5777
  usd: gen0.perStrategy[s.name]?.usd ?? 0
5365
5778
  }));
5366
- const gen0Champion = selectChampion(
5779
+ const gen0Champion = ckpt?.gen0Champion ?? selectChampion(
5367
5780
  gen0,
5368
5781
  baselines.map((s) => s.name),
5369
5782
  policy,
5370
5783
  epsilon
5371
5784
  );
5372
- let incumbent = gen0Champion;
5373
- let latestReport = gen0;
5374
- const generationRows = [];
5375
- const trajectory = [
5785
+ const generationRows = ckpt?.generations ? [...ckpt.generations] : [];
5786
+ const trajectory = ckpt?.trajectory ? [...ckpt.trajectory] : [
5376
5787
  {
5377
5788
  generation: 0,
5378
5789
  champion: gen0Champion.name,
@@ -5380,13 +5791,39 @@ async function runStrategyEvolution(cfg) {
5380
5791
  usd: gen0Champion.usd
5381
5792
  }
5382
5793
  ];
5383
- let authoredOk = 0;
5384
- for (let g = 1; g <= generations; g += 1) {
5385
- const lossesJson = compactLosses(latestReport);
5794
+ for (const row of generationRows) {
5795
+ for (const c of row.candidates) {
5796
+ if (!c.file || c.error) continue;
5797
+ const mod = await import(`file://${c.file}`);
5798
+ if (!mod.default || typeof mod.default.driver !== "function") {
5799
+ throw new Error(
5800
+ `evolution resume: ${c.file} no longer exports a Strategy \u2014 cannot restore "${c.name}"`
5801
+ );
5802
+ }
5803
+ byName.set(c.name, renameStrategy(mod.default, c.name));
5804
+ codeByName.set(c.name, readFileSync(c.file, "utf8"));
5805
+ }
5806
+ }
5807
+ let authoredOk = generationRows.reduce(
5808
+ (n, row) => n + row.candidates.filter((c) => !c.error).length,
5809
+ 0
5810
+ );
5811
+ const lastRow = generationRows[generationRows.length - 1];
5812
+ let incumbent = lastRow ? lastRow.champion : gen0Champion;
5813
+ let latestReport = lastRow ? lastRow.report : gen0;
5814
+ if (!ckpt) save({ gen0, gen0Champion, generations: generationRows, archive, trajectory });
5815
+ for (let g = generationRows.length + 1; g <= generations; g += 1) {
5816
+ const lossesJson = compactLosses(latestReport, cfg.lossesDetail ?? "exact");
5386
5817
  const candidates = [];
5387
5818
  const newStrategies = [];
5388
5819
  for (let i = 0; i < populationSize; i += 1) {
5389
- const contract = `${strategyAuthorContract}
5820
+ const objectiveNote = cfg.objective === "cost" ? `
5821
+
5822
+ YOUR OBJECTIVE: match or exceed the incumbent's SCORE while spending LESS (the losses include usd per task). Promotion requires proven score non-inferiority PLUS significant cost savings \u2014 a strategy that ties the score at half the cost WINS; a cheaper strategy that loses score by more than ${((cfg.scoreTolerance ?? 0.05) * 100).toFixed(0)}pp LOSES.` : "";
5823
+ const contract = `${strategyAuthorContract}${objectiveNote}
5824
+
5825
+ EXAMPLE TOOLS FROM ONE TASK (tool sets VARY per task on this domain \u2014 a strategy MUST select tool names from await listTools(handle) at runtime; hardcoding these example names will zero your score on most tasks):
5826
+ ${toolCatalog}
5390
5827
 
5391
5828
  STRATEGIES ALREADY IN THE TOURNAMENT (author something MEANINGFULLY different \u2014 a new composition, not a rename):
5392
5829
  ${fieldSummary(archive)}
@@ -5406,26 +5843,9 @@ You are authoring candidate ${i + 1} of ${populationSize} this generation; explo
5406
5843
  outDir: cfg.outDir
5407
5844
  });
5408
5845
  const unique = byName.has(authored.strategy.name) ? `${authored.strategy.name}-g${g}c${i + 1}` : authored.strategy.name;
5409
- const strategy = unique === authored.strategy.name ? authored.strategy : {
5410
- name: unique,
5411
- driver: (s, t, o, b) => {
5412
- const agent = authored.strategy.driver(s, t, o, b);
5413
- return {
5414
- ...agent,
5415
- name: unique,
5416
- act: async (task, scope) => {
5417
- const out = await agent.act(task, scope);
5418
- if (out.kind !== "done") return out;
5419
- const deliverable = {
5420
- ...out.deliverable,
5421
- mode: unique
5422
- };
5423
- return { ...out, deliverable };
5424
- }
5425
- };
5426
- }
5427
- };
5846
+ const strategy = renameStrategy(authored.strategy, unique);
5428
5847
  byName.set(unique, strategy);
5848
+ codeByName.set(unique, authored.code);
5429
5849
  newStrategies.push(strategy);
5430
5850
  archive.push({
5431
5851
  name: unique,
@@ -5463,12 +5883,9 @@ You are authoring candidate ${i + 1} of ${populationSize} this generation; explo
5463
5883
  node.usd = cell.usd;
5464
5884
  }
5465
5885
  }
5466
- const champion = selectChampion(
5467
- report,
5468
- field.map((s) => s.name),
5469
- policy,
5470
- epsilon
5471
- );
5886
+ const fieldNames = field.map((s) => s.name);
5887
+ const means = cfg.band ? discriminatingMeans(report, fieldNames) ?? report.perStrategy : report.perStrategy;
5888
+ const champion = pickChampion(means, fieldNames, policy, epsilon);
5472
5889
  generationRows.push({ generation: g, candidates, report, champion });
5473
5890
  trajectory.push({
5474
5891
  generation: g,
@@ -5478,21 +5895,134 @@ You are authoring candidate ${i + 1} of ${populationSize} this generation; explo
5478
5895
  });
5479
5896
  incumbent = champion;
5480
5897
  latestReport = report;
5898
+ save({ gen0, gen0Champion, generations: generationRows, archive, trajectory });
5481
5899
  }
5482
5900
  if (authoredOk === 0) {
5483
5901
  throw new Error(
5484
5902
  "runStrategyEvolution: every author attempt failed across all generations \u2014 no search happened; see the candidates[].error entries"
5485
5903
  );
5486
5904
  }
5487
- const holdoutTasks = await cfg.tasks(cfg.trainN + (cfg.holdoutOffset ?? 0), cfg.holdoutN);
5488
- const finalists = [.../* @__PURE__ */ new Set([gen0Champion.name, incumbent.name])].map((n) => byName.get(n)).filter((s) => !!s);
5489
- const holdout = await bench("holdout", holdoutTasks, finalists);
5490
- const verdict = promotionGate({
5491
- report: holdout,
5492
- incumbent: gen0Champion.name,
5493
- candidate: incumbent.name,
5494
- ...cfg.minPairedTasks !== void 0 ? { minPairedTasks: cfg.minPairedTasks } : {}
5495
- });
5905
+ const holdoutOffset = cfg.trainN + (cfg.holdoutOffset ?? 0);
5906
+ let holdoutTasks = [];
5907
+ let bandInfo;
5908
+ if (ckpt?.holdout && ckpt.verdict) {
5909
+ bandInfo = ckpt.band;
5910
+ if (cfg.reproducerCheck && codeByName.has(incumbent.name)) {
5911
+ const pool2 = await cfg.tasks(holdoutOffset, cfg.band?.holdoutPoolN ?? cfg.holdoutN);
5912
+ const gateIds = new Set(ckpt.holdout.perTask.map((r) => r.taskId));
5913
+ holdoutTasks = pool2.filter((t) => gateIds.has(t.id));
5914
+ }
5915
+ } else if (cfg.band) {
5916
+ const maxRef = cfg.band.maxRefScore ?? 0.99;
5917
+ const reference = baselines[0];
5918
+ if (!reference)
5919
+ throw new Error("evolution band: baselines[0] required as the screening reference");
5920
+ const pool2 = await cfg.tasks(holdoutOffset, cfg.band.holdoutPoolN);
5921
+ const screen = await bench("band-screen", pool2, [reference]);
5922
+ const refScores = screen.perTask.filter((r) => r.cells?.[reference.name]).map((r) => ({ taskId: r.taskId, score: r.cells?.[reference.name]?.score ?? 0 }));
5923
+ const inBandIds = new Set(refScores.filter((r) => r.score <= maxRef).map((r) => r.taskId));
5924
+ const kept = pool2.filter((t) => inBandIds.has(t.id));
5925
+ if (kept.length < cfg.holdoutN) {
5926
+ throw new Error(
5927
+ `evolution band: only ${kept.length}/${cfg.holdoutN} holdout tasks have headroom (pool ${cfg.band.holdoutPoolN}, reference "${reference.name}" \u2264 ${maxRef}) \u2014 widen holdoutPoolN or raise maxRefScore`
5928
+ );
5929
+ }
5930
+ holdoutTasks = kept.slice(0, cfg.holdoutN);
5931
+ bandInfo = { screened: refScores.length, inBand: kept.length, refScores };
5932
+ } else {
5933
+ holdoutTasks = await cfg.tasks(holdoutOffset, cfg.holdoutN);
5934
+ }
5935
+ let holdout;
5936
+ let verdict;
5937
+ if (ckpt?.holdout && ckpt.verdict) {
5938
+ holdout = ckpt.holdout;
5939
+ verdict = ckpt.verdict;
5940
+ } else {
5941
+ const finalists = [.../* @__PURE__ */ new Set([gen0Champion.name, incumbent.name])].map((n) => byName.get(n)).filter((s) => !!s);
5942
+ holdout = await bench("holdout", holdoutTasks, finalists);
5943
+ verdict = promotionGate({
5944
+ report: holdout,
5945
+ incumbent: gen0Champion.name,
5946
+ candidate: incumbent.name,
5947
+ ...cfg.objective === "cost" ? {
5948
+ mode: "non-inferiority",
5949
+ ...cfg.scoreTolerance !== void 0 ? { scoreTolerance: cfg.scoreTolerance } : {}
5950
+ } : {},
5951
+ ...cfg.minPairedTasks !== void 0 ? { minPairedTasks: cfg.minPairedTasks } : {}
5952
+ });
5953
+ save({
5954
+ gen0,
5955
+ gen0Champion,
5956
+ generations: generationRows,
5957
+ archive,
5958
+ trajectory,
5959
+ holdout,
5960
+ verdict,
5961
+ ...bandInfo ? { band: bandInfo } : {}
5962
+ });
5963
+ }
5964
+ let reproduction;
5965
+ const championCode = codeByName.get(incumbent.name);
5966
+ if (cfg.reproducerCheck && championCode) {
5967
+ const words = cfg.reproducerCheck.summaryMaxWords ?? 64;
5968
+ const tolerance = cfg.reproducerCheck.tolerance ?? 0.05;
5969
+ const championHoldoutScore = holdout.perStrategy[incumbent.name]?.score ?? 0;
5970
+ try {
5971
+ const summaryRes = await cfg.author.chat.chat({
5972
+ ...cfg.author.model ? { model: cfg.author.model } : {},
5973
+ temperature: 0.2,
5974
+ maxTokens: 512,
5975
+ messages: [
5976
+ {
5977
+ role: "system",
5978
+ content: `Summarize the optimization strategy implemented by this code in at most ${words} words. Describe the COMPOSITION (shots, critique, artifact handling, restarts, stopping) \u2014 not the code. Output only the summary.`
5979
+ },
5980
+ { role: "user", content: championCode }
5981
+ ]
5982
+ });
5983
+ const summary = summaryRes.content.trim();
5984
+ const reproduced = await authorStrategy({
5985
+ chat: cfg.author.chat,
5986
+ ...cfg.author.model ? { model: cfg.author.model } : {},
5987
+ ...cfg.author.fallbackModel ? { fallbackModel: cfg.author.fallbackModel } : {},
5988
+ ...cfg.author.maxTokens !== void 0 ? { maxTokens: cfg.author.maxTokens } : {},
5989
+ temperature: 0.2,
5990
+ contract: `${strategyAuthorContract}
5991
+
5992
+ IMPLEMENT EXACTLY THIS STRATEGY (a colleague's description \u2014 do not invent a different approach):
5993
+ ${summary}`,
5994
+ environmentName: cfg.environment.name,
5995
+ lossesJson: "[]",
5996
+ budget,
5997
+ outDir: cfg.outDir
5998
+ });
5999
+ const reproStrategy = {
6000
+ name: `${incumbent.name}-reproduced`,
6001
+ driver: reproduced.strategy.driver
6002
+ };
6003
+ const reproReport = await bench("reproduce", holdoutTasks, [reproStrategy]);
6004
+ const reproducedHoldoutScore = reproReport.perStrategy[reproStrategy.name]?.score ?? 0;
6005
+ reproduction = {
6006
+ summary,
6007
+ reproducedName: reproStrategy.name,
6008
+ file: reproduced.file,
6009
+ championHoldoutScore,
6010
+ reproducedHoldoutScore,
6011
+ gap: championHoldoutScore - reproducedHoldoutScore,
6012
+ reproducible: reproducedHoldoutScore >= championHoldoutScore - tolerance
6013
+ };
6014
+ } catch (e) {
6015
+ reproduction = {
6016
+ summary: "",
6017
+ reproducedName: "",
6018
+ championHoldoutScore,
6019
+ reproducedHoldoutScore: 0,
6020
+ gap: championHoldoutScore,
6021
+ reproducible: false,
6022
+ error: e instanceof Error ? e.message.slice(0, 300) : String(e)
6023
+ };
6024
+ }
6025
+ }
5496
6026
  return {
5497
6027
  gen0,
5498
6028
  gen0Champion,
@@ -5501,6 +6031,8 @@ You are authoring candidate ${i + 1} of ${populationSize} this generation; explo
5501
6031
  finalChampion: incumbent,
5502
6032
  holdout,
5503
6033
  verdict,
6034
+ ...bandInfo ? { band: bandInfo } : {},
6035
+ ...reproduction ? { reproduction } : {},
5504
6036
  trajectory
5505
6037
  };
5506
6038
  }
@@ -5572,6 +6104,103 @@ function createVerifierEnvironment(opts) {
5572
6104
  };
5573
6105
  }
5574
6106
 
6107
+ // src/runtime/waterfall.ts
6108
+ function createWaterfallCollector() {
6109
+ let spans = /* @__PURE__ */ new Map();
6110
+ const onEvent = (event) => {
6111
+ if (event.target === "agent.spawn") {
6112
+ const p = event.payload ?? {};
6113
+ const id = p.childId ?? event.id;
6114
+ spans.set(id, {
6115
+ id,
6116
+ label: p.label ?? id,
6117
+ runId: event.runId,
6118
+ ...event.parentId !== void 0 ? { parentId: event.parentId } : {},
6119
+ startMs: event.timestamp,
6120
+ status: "running",
6121
+ usd: 0,
6122
+ tokens: { input: 0, output: 0 }
6123
+ });
6124
+ return;
6125
+ }
6126
+ if (event.target === "agent.child") {
6127
+ const p = event.payload ?? {};
6128
+ const id = p.childId;
6129
+ if (!id) return;
6130
+ const span = spans.get(id);
6131
+ if (!span) return;
6132
+ span.endMs = event.timestamp;
6133
+ span.status = p.status === "down" ? "down" : "done";
6134
+ span.usd = p.spent?.usd ?? 0;
6135
+ span.tokens = {
6136
+ input: p.spent?.tokens?.input ?? 0,
6137
+ output: p.spent?.tokens?.output ?? 0
6138
+ };
6139
+ if (typeof p.score === "number") span.score = p.score;
6140
+ }
6141
+ };
6142
+ const report = () => {
6143
+ const all = [...spans.values()].sort((a, b) => a.startMs - b.startMs);
6144
+ const start = all[0]?.startMs ?? 0;
6145
+ const end = Math.max(start, ...all.map((s) => s.endMs ?? s.startMs));
6146
+ const byKind = {};
6147
+ let totalUsd = 0;
6148
+ const totalTokens2 = { input: 0, output: 0 };
6149
+ for (const s of all) {
6150
+ totalUsd += s.usd;
6151
+ totalTokens2.input += s.tokens.input;
6152
+ totalTokens2.output += s.tokens.output;
6153
+ const kind = s.label.includes(":") ? s.label.split(":")[0] : s.label;
6154
+ const k = byKind[kind] ??= { count: 0, ms: 0, usd: 0, tokens: { input: 0, output: 0 } };
6155
+ k.count += 1;
6156
+ k.ms += (s.endMs ?? s.startMs) - s.startMs;
6157
+ k.usd += s.usd;
6158
+ k.tokens.input += s.tokens.input;
6159
+ k.tokens.output += s.tokens.output;
6160
+ }
6161
+ return { spans: all, totalMs: end - start, totalUsd, totalTokens: totalTokens2, byKind };
6162
+ };
6163
+ const render = (opts) => {
6164
+ const { spans: all, totalMs, totalUsd, byKind } = report();
6165
+ if (all.length === 0) return "(no spans observed)";
6166
+ const width = opts?.width ?? 48;
6167
+ const maxRows = opts?.maxRows ?? 60;
6168
+ const start = all[0]?.startMs ?? 0;
6169
+ const scale = totalMs > 0 ? width / totalMs : 0;
6170
+ const lines = [];
6171
+ const labelWidth = Math.min(24, Math.max(...all.map((s) => s.label.length)) + 1);
6172
+ for (const s of all.slice(0, maxRows)) {
6173
+ const offset = Math.round((s.startMs - start) * scale);
6174
+ const dur = (s.endMs ?? s.startMs) - s.startMs;
6175
+ const len = Math.max(1, Math.round(dur * scale));
6176
+ const bar = `${" ".repeat(Math.min(offset, width))}${(s.status === "down" ? "\u2591" : "\u2588").repeat(Math.max(1, Math.min(len, width - Math.min(offset, width) + 1)))}`;
6177
+ const mark = s.status === "down" ? " DOWN" : s.score !== void 0 ? ` ${(s.score * 100).toFixed(0)}%` : "";
6178
+ lines.push(
6179
+ `${s.label.padEnd(labelWidth)}|${bar.padEnd(width + 1)}| ${(dur / 1e3).toFixed(1)}s $${s.usd.toFixed(4)} ${s.tokens.input}/${s.tokens.output}tok${mark}`
6180
+ );
6181
+ }
6182
+ if (all.length > maxRows) lines.push(`\u2026 ${all.length - maxRows} more spans`);
6183
+ lines.push("\u2014".repeat(labelWidth + width + 2));
6184
+ for (const [kind, k] of Object.entries(byKind)) {
6185
+ lines.push(
6186
+ `${kind.padEnd(labelWidth)} \xD7${k.count} ${(k.ms / 1e3).toFixed(1)}s busy $${k.usd.toFixed(4)} ${k.tokens.input}/${k.tokens.output}tok`
6187
+ );
6188
+ }
6189
+ lines.push(
6190
+ `TOTAL${" ".repeat(labelWidth - 5)} ${(totalMs / 1e3).toFixed(1)}s wall $${totalUsd.toFixed(4)}`
6191
+ );
6192
+ return lines.join("\n");
6193
+ };
6194
+ return {
6195
+ hooks: { onEvent },
6196
+ report,
6197
+ render,
6198
+ reset: () => {
6199
+ spans = /* @__PURE__ */ new Map();
6200
+ }
6201
+ };
6202
+ }
6203
+
5575
6204
  // src/runtime/workspace.ts
5576
6205
  function localShell() {
5577
6206
  return async (args, cwd) => {
@@ -5674,6 +6303,10 @@ function tail(s) {
5674
6303
  }
5675
6304
 
5676
6305
  export {
6306
+ deleteBoxSafe,
6307
+ throwAbort,
6308
+ throwIfAborted,
6309
+ sleep,
5677
6310
  contentAddress,
5678
6311
  InMemoryResultBlobStore,
5679
6312
  FileResultBlobStore,
@@ -5681,6 +6314,8 @@ export {
5681
6314
  FileSpawnJournal,
5682
6315
  replaySpawnTree,
5683
6316
  materializeTreeView,
6317
+ anytimeReport,
6318
+ renderAnytimeTable,
5684
6319
  defaultAuditorInstruction,
5685
6320
  auditIntent,
5686
6321
  completionAuthorizes,
@@ -5751,11 +6386,14 @@ export {
5751
6386
  strategyAuthorContract,
5752
6387
  assertStrategyContract,
5753
6388
  authorStrategy,
6389
+ discriminatingMeans,
6390
+ pickChampion,
5754
6391
  selectChampion,
5755
6392
  runStrategyEvolution,
5756
6393
  createVerifierEnvironment,
6394
+ createWaterfallCollector,
5757
6395
  localShell,
5758
6396
  gitWorkspace,
5759
6397
  jjWorkspace
5760
6398
  };
5761
- //# sourceMappingURL=chunk-IW2LMLK6.js.map
6399
+ //# sourceMappingURL=chunk-PXUTIMGJ.js.map