@tangle-network/agent-runtime 0.48.0 → 0.49.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +79 -15
- package/dist/agent.js +1 -1
- package/dist/chunk-GHX7XOJ2.js +433 -0
- package/dist/chunk-GHX7XOJ2.js.map +1 -0
- package/dist/{chunk-TJS7S3HJ.js → chunk-IQS4HI3F.js} +14 -5
- package/dist/chunk-IQS4HI3F.js.map +1 -0
- package/dist/{chunk-IW2LMLK6.js → chunk-PXUTIMGJ.js} +767 -129
- package/dist/chunk-PXUTIMGJ.js.map +1 -0
- package/dist/{chunk-656G2XCL.js → chunk-U2VEWKKK.js} +3 -3
- package/dist/{chunk-JNPK46YH.js → chunk-VIEDXELL.js} +408 -6
- package/dist/chunk-VIEDXELL.js.map +1 -0
- package/dist/{chunk-VR4JIC5H.js → chunk-XTEZ3YJ4.js} +2 -2
- package/dist/index.d.ts +29 -4
- package/dist/index.js +109 -21
- package/dist/index.js.map +1 -1
- package/dist/kb-gate-CsXpNRk7.d.ts +1145 -0
- package/dist/{loop-runner-bin-DEm4roYF.d.ts → loop-runner-bin-Cgn0A-NW.d.ts} +1 -1
- package/dist/loop-runner-bin.d.ts +2 -2
- package/dist/loop-runner-bin.js +3 -3
- package/dist/loops.d.ts +2 -2
- package/dist/loops.js +11 -1
- package/dist/mcp/bin.js +187 -24
- package/dist/mcp/bin.js.map +1 -1
- package/dist/mcp/index.d.ts +27 -124
- package/dist/mcp/index.js +28 -6
- package/dist/mcp/index.js.map +1 -1
- package/dist/platform.js +2 -2
- package/dist/platform.js.map +1 -1
- package/dist/runtime.d.ts +285 -8
- package/dist/runtime.js +11 -1
- package/dist/workflow.js +1 -1
- package/package.json +6 -5
- package/dist/chunk-IW2LMLK6.js.map +0 -1
- package/dist/chunk-JNPK46YH.js.map +0 -1
- package/dist/chunk-LX66I3SC.js +0 -218
- package/dist/chunk-LX66I3SC.js.map +0 -1
- package/dist/chunk-TJS7S3HJ.js.map +0 -1
- package/dist/kb-gate-51BlLlVM.d.ts +0 -529
- /package/dist/{chunk-656G2XCL.js.map → chunk-U2VEWKKK.js.map} +0 -0
- /package/dist/{chunk-VR4JIC5H.js.map → chunk-XTEZ3YJ4.js.map} +0 -0
|
@@ -426,6 +426,103 @@ function isNoEntError(err) {
|
|
|
426
426
|
return typeof err === "object" && err !== null && "code" in err && err.code === "ENOENT";
|
|
427
427
|
}
|
|
428
428
|
|
|
429
|
+
// src/runtime/anytime.ts
|
|
430
|
+
/**
 * Median of a list of numbers.
 *
 * @param {number[]} xs - Values to summarise; the input array is not mutated.
 * @returns {number|null} The middle value (or the mean of the two middle
 *   values for an even count), or `null` when `xs` is empty.
 */
var median = (xs) => {
  const count = xs.length;
  if (count === 0) {
    return null;
  }
  // Sort a copy so the caller's array keeps its order.
  const ascending = Array.from(xs);
  ascending.sort((left, right) => left - right);
  const upper = Math.floor(count / 2);
  if (count % 2 === 1) {
    return ascending[upper];
  }
  return (ascending[upper - 1] + ascending[upper]) / 2;
};
|
|
436
|
+
/**
 * Build an "anytime" report from recorded spans: for every run, the
 * best-score-so-far curve over its shots, and for every strategy, how quickly
 * and how cheaply its tasks reached a target score.
 *
 * Only spans whose `label` starts with "shot:" are considered. Spans are
 * grouped by `runId`; a run id of the form `agentic:<strategy>:<taskId>` is
 * split into its parts, any other id is used as both strategy and task id.
 *
 * @param {Iterable<object>} spans - Spans carrying `label`, `runId`,
 *   `startMs`, optional `endMs`, `usd` and optional numeric `score`.
 * @param {object} [opts]
 * @param {number[]} [opts.targets] - Fixed score targets (default `[1]`).
 * @param {(taskId: string) => number} [opts.targetFor] - Per-task target; when
 *   given it replaces `targets` and the strategy rows carry `target: NaN`.
 * @returns {{targets: number[], perTask: object[], perStrategy: object[]}}
 *   `perTask` holds `{ taskId, strategy, points, hits }`; `perStrategy` holds
 *   one row per (strategy, target) sorted by strategy name, then target.
 */
function anytimeReport(spans, opts) {
  const targets = opts?.targets ?? [1];
  // A span without an end time is treated as ending when it started.
  const endOf = (span) => span.endMs ?? span.startMs;
  const lastPoint = (curve) => curve.points[curve.points.length - 1];

  const byRun = /* @__PURE__ */ new Map();
  for (const span of spans) {
    if (!span.label.startsWith("shot:")) continue;
    const list = byRun.get(span.runId) ?? [];
    list.push(span);
    byRun.set(span.runId, list);
  }

  const perTask = [];
  for (const [runId, shots] of byRun) {
    const m = runId.match(/^agentic:(.+):(.+)$/);
    const strategy = m?.[1] ?? runId;
    const taskId = m?.[2] ?? runId;
    const ordered = [...shots].sort((a, b) => endOf(a) - endOf(b));
    // Fold instead of Math.min(...array): spreading passes one argument per
    // shot and can exceed the engine's argument limit on very long runs.
    let t0 = Infinity;
    for (const shot of ordered) t0 = Math.min(t0, shot.startMs);
    const taskTargets = opts?.targetFor ? [opts.targetFor(taskId)] : targets;
    let best = 0;
    let cumUsd = 0;
    const points = [];
    const hits = {};
    for (const t of taskTargets) hits[String(t)] = null;
    for (const shot of ordered) {
      // A shot without a finite cost (e.g. an unpriced model) counts as 0
      // rather than turning every later cumulative figure into NaN.
      if (Number.isFinite(shot.usd)) cumUsd += shot.usd;
      if (typeof shot.score === "number" && shot.score > best) best = shot.score;
      const elapsedMs = endOf(shot) - t0;
      points.push({ elapsedMs, cumUsd, best });
      for (const t of taskTargets) {
        const key = String(t);
        // Record only the FIRST shot at which the target is met.
        if (hits[key] === null && best >= t) {
          hits[key] = { ms: elapsedMs, shots: points.length, usd: cumUsd };
        }
      }
    }
    perTask.push({ taskId, strategy, points, hits });
  }

  const byStrategy = /* @__PURE__ */ new Map();
  for (const curve of perTask) {
    const list = byStrategy.get(curve.strategy) ?? [];
    list.push(curve);
    byStrategy.set(curve.strategy, list);
  }

  const perStrategy = [];
  for (const [strategy, tasks] of byStrategy) {
    // Totals cover ALL tasks, successful or not (expected-running-time style).
    const totalMs = tasks.reduce((sum, t) => sum + (lastPoint(t)?.elapsedMs ?? 0), 0);
    const totalUsd = tasks.reduce((sum, t) => sum + (lastPoint(t)?.cumUsd ?? 0), 0);
    const maxShots = Math.max(0, ...tasks.map((t) => t.points.length));
    const curveByShot = [];
    for (let i = 0; i < maxShots; i += 1) {
      // Tasks that stopped early keep contributing their final best score.
      const vals = tasks.map(
        (t) => t.points[Math.min(i, t.points.length - 1)].best
      );
      curveByShot.push(vals.reduce((sum, v) => sum + v, 0) / vals.length);
    }
    const auc = curveByShot.length > 0 ? curveByShot.reduce((sum, v) => sum + v, 0) / curveByShot.length : 0;
    // Per-task targets differ between tasks, so the summary row uses NaN as
    // its target marker and reads each task's single recorded hit.
    const summaryTargets = opts?.targetFor ? [Number.NaN] : targets;
    for (const target of summaryTargets) {
      const hitOf = (taskCurve) => opts?.targetFor ? Object.values(taskCurve.hits)[0] ?? null : taskCurve.hits[String(target)] ?? null;
      const reachedHits = tasks.map(hitOf).filter((hit) => hit !== null);
      perStrategy.push({
        strategy,
        target,
        tasks: tasks.length,
        reachedTarget: reachedHits.length,
        medianTttMs: median(reachedHits.map((hit) => hit.ms)),
        medianShotsToTarget: median(reachedHits.map((hit) => hit.shots)),
        ertMs: reachedHits.length > 0 ? totalMs / reachedHits.length : null,
        erUsd: reachedHits.length > 0 ? totalUsd / reachedHits.length : null,
        curveByShot,
        auc
      });
    }
  }
  perStrategy.sort((a, b) => a.strategy.localeCompare(b.strategy) || a.target - b.target);
  return { targets, perTask, perStrategy };
}
|
|
511
|
+
/**
 * Render an anytime report as a plain-text table: a title line, a column
 * header, then one row per entry of `report.perStrategy`.
 *
 * @param {{targets: number[], perStrategy: object[]}} report - Report with
 *   the fields read below (`strategy`, `target`, `tasks`, `reachedTarget`,
 *   `medianTttMs`, `medianShotsToTarget`, `ertMs`, `erUsd`, `auc`,
 *   `curveByShot`).
 * @returns {string} The lines joined with "\n".
 */
function renderAnytimeTable(report) {
  const BARS = "\u2581\u2582\u2583\u2584\u2585\u2586\u2587\u2588";
  const DASH = " \u2014";
  // Rows with a NaN target were scored against per-task targets, so the fixed
  // `report.targets` list was not what they were measured against; say so in
  // the title instead of printing a list that contradicts the rows.
  const perTaskTargets = report.perStrategy.length > 0 && report.perStrategy.every((s) => Number.isNaN(s.target));
  const targetLabel = perTaskTargets ? "per-task" : report.targets.join(", ");
  const lines = [
    `anytime metrics \xB7 satisficing targets [${targetLabel}] \xB7 ERT = \u03A3 all wall-time / #successes (COCO)`,
    "strategy \u2265tgt reach med-TTT med-shots ERT(all-in) $/success AUC curve"
  ];
  for (const s of report.perStrategy) {
    const curve = s.curveByShot.map((v) => {
      const level = Math.min(7, Math.floor(v * 8));
      // NaN or negative values fall back to the lowest bar; indexing with
      // them would yield undefined and silently shorten the sparkline.
      return BARS[level > 0 ? level : 0];
    }).join("");
    const tgt = Number.isNaN(s.target) ? "task" : s.target.toFixed(2);
    const cells = [
      s.strategy.padEnd(19),
      tgt.padStart(4),
      `${String(s.reachedTarget).padStart(4)}/${String(s.tasks).padEnd(3)}`,
      s.medianTttMs === null ? DASH : `${(s.medianTttMs / 1e3).toFixed(1).padStart(6)}s`,
      s.medianShotsToTarget === null ? DASH : String(s.medianShotsToTarget).padStart(5),
      s.ertMs === null ? DASH : `${(s.ertMs / 1e3).toFixed(1).padStart(9)}s`,
      s.erUsd === null ? DASH : `$${s.erUsd.toFixed(4)}`,
      s.auc.toFixed(2),
      curve
    ];
    lines.push(cells.join(" "));
  }
  return lines.join("\n");
}
|
|
525
|
+
|
|
429
526
|
// src/runtime/audit-intent.ts
|
|
430
527
|
var defaultAuditorInstruction = "You audit whether an AI agent is on the RIGHT ROUTE \u2014 not whether it works hard, but whether its actions serve the stated intents. Infer the REVEALED intent from the action pattern (what the trajectory is actually optimizing). Compare against the declared task intent, the user intent when given, and the meta-intent when given. Flawless execution down the wrong route is DIVERGED. Busy-work that neither advances nor harms is DRIFTING. Judge only from the trajectory \u2014 be specific about which actions ground your verdict. Recommend abort only when continuing cannot serve the intent.";
|
|
431
528
|
function summarize(trace, maxLines) {
|
|
@@ -2346,20 +2443,20 @@ async function finalizeSettlement(child, settlement, seq, args, now) {
|
|
|
2346
2443
|
}
|
|
2347
2444
|
async function runChild(live, executor, childAbort, task, opts, pool2, ticket, blobs) {
|
|
2348
2445
|
let reconciled = false;
|
|
2349
|
-
const reconcileOnce = (
|
|
2446
|
+
const reconcileOnce = (spend) => {
|
|
2350
2447
|
if (reconciled) return;
|
|
2351
2448
|
reconciled = true;
|
|
2352
|
-
pool2.reconcile(ticket, clampSpend(
|
|
2449
|
+
pool2.reconcile(ticket, clampSpend(spend, opts.budget));
|
|
2353
2450
|
};
|
|
2354
2451
|
try {
|
|
2355
2452
|
live.status = "running";
|
|
2356
2453
|
const ran = executor.execute(task, childAbort.signal);
|
|
2357
2454
|
let artifact;
|
|
2358
2455
|
if (isAsyncIterable2(ran)) {
|
|
2359
|
-
const
|
|
2360
|
-
live.spent =
|
|
2456
|
+
const spend = await foldStream(ran);
|
|
2457
|
+
live.spent = spend;
|
|
2361
2458
|
artifact = executor.resultArtifact();
|
|
2362
|
-
reconcileOnce(
|
|
2459
|
+
reconcileOnce(spend);
|
|
2363
2460
|
} else {
|
|
2364
2461
|
const terminal = await ran;
|
|
2365
2462
|
live.spent = terminal.spent;
|
|
@@ -2448,21 +2545,21 @@ async function foldStream(stream) {
|
|
|
2448
2545
|
}
|
|
2449
2546
|
return { iterations, tokens, usd, ms: 0 };
|
|
2450
2547
|
}
|
|
2451
|
-
function clampSpend(
|
|
2452
|
-
const totalTokens2 =
|
|
2548
|
+
/**
 * Clamp a reported spend so it never exceeds the given budget.
 *
 * @param {{iterations: number, tokens: {input: number, output: number}, usd: number, ms: number}} spend
 * @param {{maxTokens: number, maxIterations: number, maxUsd?: number}} budget
 * @returns {object} `spend` itself when it is within every limit; otherwise a
 *   new object with iterations and usd capped and, when the token total is
 *   over budget, input/output scaled down proportionally (floored). `ms` is
 *   passed through unchanged.
 */
function clampSpend(spend, budget) {
  const { input, output } = spend.tokens;
  const tokenTotal = input + output;
  const overTokens = !(tokenTotal <= budget.maxTokens);
  const overIterations = !(spend.iterations <= budget.maxIterations);
  const overUsd = budget.maxUsd !== void 0 && !(spend.usd <= budget.maxUsd);
  if (!overTokens && !overIterations && !overUsd) {
    return spend;
  }

  // Scale both token channels by the same factor so their ratio is kept.
  let scale = 1;
  if (overTokens && tokenTotal > 0) {
    scale = budget.maxTokens / tokenTotal;
  }
  let tokens = spend.tokens;
  if (scale < 1) {
    tokens = {
      input: Math.floor(input * scale),
      output: Math.floor(output * scale)
    };
  }

  let usd = spend.usd;
  if (budget.maxUsd !== void 0) {
    usd = Math.min(spend.usd, budget.maxUsd);
  }

  return {
    iterations: Math.min(spend.iterations, budget.maxIterations),
    tokens,
    usd,
    ms: spend.ms
  };
}
|
|
2468
2565
|
async function teardownSafe(executor, grace) {
|
|
@@ -3110,7 +3207,7 @@ var routerToolsInlineExecutor = (spec, ctx) => {
|
|
|
3110
3207
|
"routerToolsInlineExecutor: RouterToolsSeam.routerBaseUrl + routerKey required"
|
|
3111
3208
|
);
|
|
3112
3209
|
}
|
|
3113
|
-
const maxTurns = seam.maxTurns ??
|
|
3210
|
+
const maxTurns = seam.maxTurns ?? 200;
|
|
3114
3211
|
const controller = new AbortController();
|
|
3115
3212
|
const abortIfSignalled = () => {
|
|
3116
3213
|
if (ctx.signal.aborted) controller.abort();
|
|
@@ -4228,12 +4325,12 @@ function countStatuses(reported) {
|
|
|
4228
4325
|
/**
 * Create an all-zero spend record.
 *
 * @returns {{iterations: number, tokens: object, usd: number, ms: number}}
 *   Zero iterations/usd/ms with `tokens` taken from `zeroTokenUsage()`.
 */
function zeroSpend4() {
  const tokens = zeroTokenUsage();
  return { iterations: 0, tokens, usd: 0, ms: 0 };
}
|
|
4231
|
-
function cloneSpend(
|
|
4328
|
+
/**
 * Copy a spend record, including a fresh `tokens` object, so the copy can be
 * mutated without touching the source.
 *
 * @param {{iterations: number, tokens: {input: number, output: number}, usd: number, ms: number}} spend
 * @returns {object} A new record with the same field values.
 */
function cloneSpend(spend) {
  const { input, output } = spend.tokens;
  const copy = {
    iterations: spend.iterations,
    tokens: { input, output },
    usd: spend.usd,
    ms: spend.ms
  };
  return copy;
}
|
|
4239
4336
|
function addSpend(acc, delta) {
|
|
@@ -4249,13 +4346,13 @@ function spreadOf(values) {
|
|
|
4249
4346
|
/**
 * Express the spread of `values` as a fraction of their median.
 *
 * @param {number[]} values
 * @returns {number} 0 when the spread is exactly 0, otherwise
 *   `spreadOf(values) / medianOf(values)`.
 * @throws {Error} When the spread is non-zero but the median is 0, since the
 *   ratio would be undefined.
 */
function fractionalSpread(values) {
  const range = spreadOf(values);
  if (range === 0) {
    return 0;
  }
  const centre = medianOf(values);
  if (centre !== 0) {
    return range / centre;
  }
  throw new Error(
    "equalKOnCost: arms have a non-zero cost spread on a zero-median channel; cannot express it as a fraction"
  );
}
|
|
4260
4357
|
function medianOf(values) {
|
|
4261
4358
|
if (values.length === 0) {
|
|
@@ -4287,28 +4384,34 @@ function requireNode2(nodes, id, root) {
|
|
|
4287
4384
|
return node;
|
|
4288
4385
|
}
|
|
4289
4386
|
/**
 * Look up the rolled-up spend for a node, failing loudly when it is absent.
 *
 * @param {Map<string, object>} rolled - Rolled-up spend keyed by node id.
 * @param {string} id - Node id to look up.
 * @param {string} root - Tree root id, used only in the error message.
 * @returns {object} The stored spend.
 * @throws {Error} When `rolled` holds no (truthy) entry for `id`.
 */
function requireSpend(rolled, id, root) {
  const found = rolled.get(id);
  if (found) {
    return found;
  }
  throw new Error(
    `trajectoryReport: node '${id}' was never rolled up in tree '${root}' (unreachable from root)`
  );
}
|
|
4298
4395
|
|
|
4299
4396
|
// src/runtime/promotion-gate.ts
|
|
4300
4397
|
import { heldoutSignificance } from "@tangle-network/agent-eval/campaign";
|
|
4301
4398
|
function promotionGate(opts) {
|
|
4399
|
+
const mode = opts.mode ?? "superiority";
|
|
4302
4400
|
if (opts.candidate === opts.incumbent) {
|
|
4303
4401
|
return {
|
|
4304
4402
|
promoted: false,
|
|
4305
4403
|
reason: "identical-champion",
|
|
4404
|
+
mode,
|
|
4306
4405
|
n: 0,
|
|
4307
4406
|
lift: { mean: 0, median: 0, low: 0, high: 0 }
|
|
4308
4407
|
};
|
|
4309
4408
|
}
|
|
4310
4409
|
const before = [];
|
|
4311
4410
|
const after = [];
|
|
4411
|
+
const incUsd = [];
|
|
4412
|
+
const candUsd = [];
|
|
4413
|
+
const incMs = [];
|
|
4414
|
+
const candMs = [];
|
|
4312
4415
|
const cellIds = [];
|
|
4313
4416
|
for (const row of opts.report.perTask) {
|
|
4314
4417
|
const inc = row.cells?.[opts.incumbent];
|
|
@@ -4316,6 +4419,10 @@ function promotionGate(opts) {
|
|
|
4316
4419
|
if (!inc || !cand) continue;
|
|
4317
4420
|
before.push(inc.score);
|
|
4318
4421
|
after.push(cand.score);
|
|
4422
|
+
incUsd.push(inc.usd);
|
|
4423
|
+
candUsd.push(cand.usd);
|
|
4424
|
+
incMs.push(inc.ms);
|
|
4425
|
+
candMs.push(cand.ms);
|
|
4319
4426
|
cellIds.push(row.taskId);
|
|
4320
4427
|
}
|
|
4321
4428
|
if (before.length === 0) {
|
|
@@ -4339,8 +4446,84 @@ function promotionGate(opts) {
|
|
|
4339
4446
|
low: sig.bootstrap.low,
|
|
4340
4447
|
high: sig.bootstrap.high
|
|
4341
4448
|
};
|
|
4342
|
-
|
|
4343
|
-
|
|
4449
|
+
const latSig = heldoutSignificance(
|
|
4450
|
+
{ before: incMs, after: candMs, cellIds },
|
|
4451
|
+
{
|
|
4452
|
+
deltaThreshold: 0,
|
|
4453
|
+
minProductiveRuns: 1,
|
|
4454
|
+
statistic: opts.statistic ?? "mean",
|
|
4455
|
+
...opts.seed !== void 0 ? { seed: opts.seed } : {},
|
|
4456
|
+
...opts.resamples !== void 0 ? { resamples: opts.resamples } : {}
|
|
4457
|
+
}
|
|
4458
|
+
);
|
|
4459
|
+
const latency = {
|
|
4460
|
+
mean: latSig.bootstrap.mean,
|
|
4461
|
+
median: latSig.bootstrap.median,
|
|
4462
|
+
low: latSig.bootstrap.low,
|
|
4463
|
+
high: latSig.bootstrap.high
|
|
4464
|
+
};
|
|
4465
|
+
if (mode === "superiority") {
|
|
4466
|
+
if (sig.fewRuns) return { promoted: false, reason: "few-tasks", mode, n: sig.n, lift, latency };
|
|
4467
|
+
return sig.significant ? { promoted: true, reason: "significant", mode, n: sig.n, lift, latency } : { promoted: false, reason: "no-margin", mode, n: sig.n, lift, latency };
|
|
4468
|
+
}
|
|
4469
|
+
const tolerance = opts.scoreTolerance ?? 0.05;
|
|
4470
|
+
const scoreSig = heldoutSignificance(
|
|
4471
|
+
{ before, after, cellIds },
|
|
4472
|
+
{
|
|
4473
|
+
deltaThreshold: -tolerance,
|
|
4474
|
+
minProductiveRuns: opts.minPairedTasks ?? 6,
|
|
4475
|
+
statistic: opts.statistic ?? "mean",
|
|
4476
|
+
...opts.seed !== void 0 ? { seed: opts.seed } : {},
|
|
4477
|
+
...opts.resamples !== void 0 ? { resamples: opts.resamples } : {}
|
|
4478
|
+
}
|
|
4479
|
+
);
|
|
4480
|
+
const costSig = heldoutSignificance(
|
|
4481
|
+
{ before: candUsd, after: incUsd, cellIds },
|
|
4482
|
+
{
|
|
4483
|
+
deltaThreshold: 0,
|
|
4484
|
+
minProductiveRuns: opts.minPairedTasks ?? 6,
|
|
4485
|
+
statistic: opts.statistic ?? "mean",
|
|
4486
|
+
...opts.seed !== void 0 ? { seed: opts.seed } : {},
|
|
4487
|
+
...opts.resamples !== void 0 ? { resamples: opts.resamples } : {}
|
|
4488
|
+
}
|
|
4489
|
+
);
|
|
4490
|
+
const costSavings = {
|
|
4491
|
+
mean: costSig.bootstrap.mean,
|
|
4492
|
+
median: costSig.bootstrap.median,
|
|
4493
|
+
low: costSig.bootstrap.low,
|
|
4494
|
+
high: costSig.bootstrap.high
|
|
4495
|
+
};
|
|
4496
|
+
if (scoreSig.fewRuns)
|
|
4497
|
+
return { promoted: false, reason: "few-tasks", mode, n: scoreSig.n, lift, costSavings, latency };
|
|
4498
|
+
if (!scoreSig.significant)
|
|
4499
|
+
return {
|
|
4500
|
+
promoted: false,
|
|
4501
|
+
reason: "non-inferiority-unproven",
|
|
4502
|
+
mode,
|
|
4503
|
+
n: scoreSig.n,
|
|
4504
|
+
lift,
|
|
4505
|
+
costSavings,
|
|
4506
|
+
latency
|
|
4507
|
+
};
|
|
4508
|
+
if (!costSig.significant)
|
|
4509
|
+
return {
|
|
4510
|
+
promoted: false,
|
|
4511
|
+
reason: "not-cheaper",
|
|
4512
|
+
mode,
|
|
4513
|
+
n: scoreSig.n,
|
|
4514
|
+
lift,
|
|
4515
|
+
costSavings,
|
|
4516
|
+
latency
|
|
4517
|
+
};
|
|
4518
|
+
return {
|
|
4519
|
+
promoted: true,
|
|
4520
|
+
reason: "non-inferior-and-cheaper",
|
|
4521
|
+
mode,
|
|
4522
|
+
n: scoreSig.n,
|
|
4523
|
+
lift,
|
|
4524
|
+
costSavings,
|
|
4525
|
+
latency
|
|
4526
|
+
};
|
|
4344
4527
|
}
|
|
4345
4528
|
|
|
4346
4529
|
// src/runtime/run-benchmark.ts
|
|
@@ -4364,7 +4547,8 @@ async function runShot(surface, _task, handle, tools, messages, opts, modelOverr
|
|
|
4364
4547
|
messages,
|
|
4365
4548
|
tools,
|
|
4366
4549
|
tool_choice: "auto",
|
|
4367
|
-
temperature: opts.temperature ?? 0.7
|
|
4550
|
+
temperature: opts.temperature ?? 0.7,
|
|
4551
|
+
...opts.maxTokens ? { max_tokens: opts.maxTokens } : {}
|
|
4368
4552
|
})
|
|
4369
4553
|
});
|
|
4370
4554
|
if (!res.ok) throw new Error(`router ${res.status}: ${(await res.text()).slice(0, 200)}`);
|
|
@@ -4403,12 +4587,15 @@ async function runShot(surface, _task, handle, tools, messages, opts, modelOverr
|
|
|
4403
4587
|
}
|
|
4404
4588
|
return { messages, completions, toolCalls, toolErrors, tokens };
|
|
4405
4589
|
}
|
|
4406
|
-
|
|
4407
|
-
|
|
4590
|
+
/**
 * Flatten a chat transcript into a compact, line-per-step trajectory string.
 *
 * Only `assistant` and `tool` messages are kept:
 *  - tool messages become `RESULT <first 280 chars of content>`
 *  - assistant messages with tool calls become `CALL name(args), ...`
 *  - other assistant messages become `SAY <first 200 chars of content>`
 *
 * @param {Array<object>} messages - Chat messages with `role`, `content` and
 *   optional `tool_calls`.
 * @returns {string} The lines joined with "\n", cut to at most 7000 characters.
 */
function compactTrajectory(messages) {
  const steps = [];
  for (const message of messages) {
    if (message.role === "tool") {
      steps.push(`RESULT ${String(message.content).slice(0, 280)}`);
      continue;
    }
    if (message.role !== "assistant") {
      continue;
    }
    const calls = message.tool_calls?.map((call) => `${call.function.name}(${call.function.arguments})`).join(", ");
    if (calls) {
      steps.push(`CALL ${calls}`);
    } else {
      steps.push(`SAY ${String(message.content).slice(0, 200)}`);
    }
  }
  return steps.join("\n").slice(0, 7e3);
}
|
|
4597
|
+
async function consultAnalyst(task, messages, instruction, opts) {
|
|
4598
|
+
const trajectory = compactTrajectory(messages);
|
|
4412
4599
|
const analystModel = opts.analystModel ?? opts.model;
|
|
4413
4600
|
const chat = createChatClient({
|
|
4414
4601
|
transport: "router",
|
|
@@ -4416,6 +4603,52 @@ async function analyze(task, messages, opts) {
|
|
|
4416
4603
|
baseUrl: opts.routerBaseUrl,
|
|
4417
4604
|
defaultModel: analystModel
|
|
4418
4605
|
});
|
|
4606
|
+
const res = await chat.chat({
|
|
4607
|
+
model: analystModel,
|
|
4608
|
+
temperature: 0.2,
|
|
4609
|
+
maxTokens: 1024,
|
|
4610
|
+
messages: [
|
|
4611
|
+
{ role: "system", content: instruction },
|
|
4612
|
+
{
|
|
4613
|
+
role: "user",
|
|
4614
|
+
content: `TASK: ${task.userPrompt.slice(0, 1500)}
|
|
4615
|
+
|
|
4616
|
+
TRAJECTORY:
|
|
4617
|
+
${trajectory}`
|
|
4618
|
+
}
|
|
4619
|
+
]
|
|
4620
|
+
});
|
|
4621
|
+
const usage = res.usage;
|
|
4622
|
+
return {
|
|
4623
|
+
steer: res.content.trim(),
|
|
4624
|
+
tokens: {
|
|
4625
|
+
input: usage?.promptTokens ?? usage?.prompt_tokens ?? 0,
|
|
4626
|
+
output: usage?.completionTokens ?? usage?.completion_tokens ?? 0
|
|
4627
|
+
}
|
|
4628
|
+
};
|
|
4629
|
+
}
|
|
4630
|
+
async function analyze(task, messages, opts) {
|
|
4631
|
+
const trajectory = compactTrajectory(messages);
|
|
4632
|
+
const analystModel = opts.analystModel ?? opts.model;
|
|
4633
|
+
const inner = createChatClient({
|
|
4634
|
+
transport: "router",
|
|
4635
|
+
apiKey: opts.routerKey,
|
|
4636
|
+
baseUrl: opts.routerBaseUrl,
|
|
4637
|
+
defaultModel: analystModel
|
|
4638
|
+
});
|
|
4639
|
+
const tokens = { input: 0, output: 0 };
|
|
4640
|
+
const chat = {
|
|
4641
|
+
...inner,
|
|
4642
|
+
chat: async (req, callOpts) => {
|
|
4643
|
+
const res = await inner.chat(req, callOpts);
|
|
4644
|
+
const u = res.usage;
|
|
4645
|
+
if (u) {
|
|
4646
|
+
tokens.input += u.promptTokens ?? u.prompt_tokens ?? 0;
|
|
4647
|
+
tokens.output += u.completionTokens ?? u.completion_tokens ?? 0;
|
|
4648
|
+
}
|
|
4649
|
+
return res;
|
|
4650
|
+
}
|
|
4651
|
+
};
|
|
4419
4652
|
const obs = await observe(
|
|
4420
4653
|
{
|
|
4421
4654
|
task: task.userPrompt,
|
|
@@ -4432,14 +4665,8 @@ async function analyze(task, messages, opts) {
|
|
|
4432
4665
|
}
|
|
4433
4666
|
);
|
|
4434
4667
|
const steer = obs.findings.map((f) => f.recommended_action).filter((a) => typeof a === "string" && a.trim().length > 0).join("\n").trim();
|
|
4435
|
-
return steer || "COMPLETE";
|
|
4668
|
+
return { steer: steer || "COMPLETE", tokens };
|
|
4436
4669
|
}
|
|
4437
|
-
var spend = (iterations) => ({
|
|
4438
|
-
iterations,
|
|
4439
|
-
tokens: { input: 0, output: 0 },
|
|
4440
|
-
usd: 0,
|
|
4441
|
-
ms: 0
|
|
4442
|
-
});
|
|
4443
4670
|
function shotExecutor(surface, opts) {
|
|
4444
4671
|
let artifact;
|
|
4445
4672
|
return {
|
|
@@ -4449,7 +4676,19 @@ function shotExecutor(surface, opts) {
|
|
|
4449
4676
|
const own = !t.handle;
|
|
4450
4677
|
const handle = t.handle ?? await surface.open(t.task);
|
|
4451
4678
|
try {
|
|
4452
|
-
const
|
|
4679
|
+
const allTools = await surface.tools(t.task, handle);
|
|
4680
|
+
let tools = allTools;
|
|
4681
|
+
if (t.tools) {
|
|
4682
|
+
const known = new Set(allTools.map((tool) => tool.function.name));
|
|
4683
|
+
const unknown = t.tools.filter((name) => !known.has(name));
|
|
4684
|
+
if (unknown.length > 0) {
|
|
4685
|
+
throw new Error(
|
|
4686
|
+
`shot tools: unknown tool name(s) ${unknown.join(", ")} \u2014 domain offers: ${[...known].join(", ")}`
|
|
4687
|
+
);
|
|
4688
|
+
}
|
|
4689
|
+
const want = new Set(t.tools);
|
|
4690
|
+
tools = allTools.filter((tool) => want.has(tool.function.name));
|
|
4691
|
+
}
|
|
4453
4692
|
const messages = t.messages?.length ? t.messages : [
|
|
4454
4693
|
{ role: "system", content: t.persona?.systemPrompt ?? t.task.systemPrompt },
|
|
4455
4694
|
{ role: "user", content: `${t.task.userPrompt}
|
|
@@ -4505,8 +4744,18 @@ function analystExecutor(opts) {
|
|
|
4505
4744
|
runtime: "agentic-analyst",
|
|
4506
4745
|
async execute(task) {
|
|
4507
4746
|
const t = task;
|
|
4508
|
-
const
|
|
4509
|
-
|
|
4747
|
+
const { steer, tokens } = t.rawInstruction ? await consultAnalyst(t.task, t.messages, t.rawInstruction, opts) : await analyze(t.task, t.messages, opts);
|
|
4748
|
+
const analystModel = opts.analystModel ?? opts.model;
|
|
4749
|
+
artifact = {
|
|
4750
|
+
outRef: `analyst:${steer.length}`,
|
|
4751
|
+
out: steer,
|
|
4752
|
+
spent: {
|
|
4753
|
+
iterations: 1,
|
|
4754
|
+
tokens,
|
|
4755
|
+
usd: isModelPriced2(analystModel) ? estimateCost2(tokens.input, tokens.output, analystModel) : 0,
|
|
4756
|
+
ms: 0
|
|
4757
|
+
}
|
|
4758
|
+
};
|
|
4510
4759
|
return artifact;
|
|
4511
4760
|
},
|
|
4512
4761
|
teardown: () => Promise.resolve({ destroyed: true }),
|
|
@@ -4669,12 +4918,21 @@ function defineStrategy(name, run) {
|
|
|
4669
4918
|
const innerTurns = opts.innerTurns ?? 4;
|
|
4670
4919
|
let verifiedBest = 0;
|
|
4671
4920
|
let verifiedResolved = false;
|
|
4921
|
+
const openHandles = /* @__PURE__ */ new Set();
|
|
4672
4922
|
const ctx = {
|
|
4673
4923
|
// Narrowed to open/close — the body gets no raw call()/score() access.
|
|
4674
4924
|
surface: {
|
|
4675
4925
|
name: surface.name,
|
|
4676
|
-
open: (t) =>
|
|
4677
|
-
|
|
4926
|
+
open: async (t) => {
|
|
4927
|
+
const h = await surface.open(t);
|
|
4928
|
+
openHandles.add(h.id);
|
|
4929
|
+
return h;
|
|
4930
|
+
},
|
|
4931
|
+
close: async (h) => {
|
|
4932
|
+
if (!h || !openHandles.has(h.id)) return;
|
|
4933
|
+
openHandles.delete(h.id);
|
|
4934
|
+
await surface.close(h);
|
|
4935
|
+
}
|
|
4678
4936
|
},
|
|
4679
4937
|
task,
|
|
4680
4938
|
opts,
|
|
@@ -4690,7 +4948,8 @@ function defineStrategy(name, run) {
|
|
|
4690
4948
|
handle: spec?.handle,
|
|
4691
4949
|
messages: spec?.messages,
|
|
4692
4950
|
steer: spec?.steer,
|
|
4693
|
-
persona: spec?.persona
|
|
4951
|
+
persona: spec?.persona,
|
|
4952
|
+
tools: spec?.tools
|
|
4694
4953
|
},
|
|
4695
4954
|
{ budget: perChild(innerTurns), label: child.name }
|
|
4696
4955
|
);
|
|
@@ -4702,6 +4961,13 @@ function defineStrategy(name, run) {
|
|
|
4702
4961
|
if (out.total > 0 && out.passes === out.total) verifiedResolved = true;
|
|
4703
4962
|
return out;
|
|
4704
4963
|
},
|
|
4964
|
+
async listTools(handle) {
|
|
4965
|
+
const tools = await surface.tools(task, handle);
|
|
4966
|
+
return tools.map((t) => ({
|
|
4967
|
+
name: t.function.name,
|
|
4968
|
+
...t.function.description ? { description: t.function.description } : {}
|
|
4969
|
+
}));
|
|
4970
|
+
},
|
|
4705
4971
|
async critique(messages) {
|
|
4706
4972
|
const child = leaf(`analyst:${seq}`, "analyst");
|
|
4707
4973
|
seq += 1;
|
|
@@ -4715,12 +4981,33 @@ function defineStrategy(name, run) {
|
|
|
4715
4981
|
if (settled.kind === "down") return null;
|
|
4716
4982
|
const findings = settled.out;
|
|
4717
4983
|
return /^\s*COMPLETE\b/i.test(findings) ? null : findings;
|
|
4984
|
+
},
|
|
4985
|
+
async consult(messages, instruction) {
|
|
4986
|
+
const child = leaf(`analyst:${seq}`, "analyst");
|
|
4987
|
+
seq += 1;
|
|
4988
|
+
const res = scope.spawn(
|
|
4989
|
+
child,
|
|
4990
|
+
{ task, messages, rawInstruction: instruction },
|
|
4991
|
+
{ budget: perChild(1), label: child.name }
|
|
4992
|
+
);
|
|
4993
|
+
if (!res.ok) return null;
|
|
4994
|
+
const settled = await drainOne2(scope);
|
|
4995
|
+
if (settled.kind === "down") return null;
|
|
4996
|
+
return settled.out;
|
|
4718
4997
|
}
|
|
4719
4998
|
};
|
|
4720
4999
|
const r = await run(ctx);
|
|
4721
5000
|
return {
|
|
4722
5001
|
kind: "done",
|
|
4723
|
-
deliverable: {
|
|
5002
|
+
deliverable: {
|
|
5003
|
+
mode: name,
|
|
5004
|
+
...r,
|
|
5005
|
+
progression: Array.isArray(r.progression) ? r.progression : [],
|
|
5006
|
+
completions: typeof r.completions === "number" ? r.completions : 0,
|
|
5007
|
+
shots: typeof r.shots === "number" ? r.shots : 0,
|
|
5008
|
+
score: verifiedBest,
|
|
5009
|
+
resolved: verifiedResolved
|
|
5010
|
+
}
|
|
4724
5011
|
};
|
|
4725
5012
|
}
|
|
4726
5013
|
})
|
|
@@ -4875,27 +5162,44 @@ async function runBenchmark(cfg) {
|
|
|
4875
5162
|
let settled = 0;
|
|
4876
5163
|
const perTask = await pool(cfg.tasks, concurrency, async (task) => {
|
|
4877
5164
|
const cells = {};
|
|
5165
|
+
const errors = {};
|
|
4878
5166
|
let row;
|
|
4879
5167
|
try {
|
|
4880
5168
|
for (const s of strategies) {
|
|
4881
|
-
|
|
4882
|
-
|
|
4883
|
-
|
|
4884
|
-
|
|
4885
|
-
|
|
4886
|
-
|
|
4887
|
-
|
|
4888
|
-
|
|
4889
|
-
|
|
4890
|
-
|
|
4891
|
-
|
|
4892
|
-
|
|
4893
|
-
|
|
4894
|
-
|
|
4895
|
-
|
|
4896
|
-
|
|
5169
|
+
try {
|
|
5170
|
+
const r = await runAgentic({
|
|
5171
|
+
...cfg.worker,
|
|
5172
|
+
surface: cfg.environment,
|
|
5173
|
+
task,
|
|
5174
|
+
strategy: s,
|
|
5175
|
+
budget,
|
|
5176
|
+
...cfg.hooks ? { hooks: cfg.hooks } : {}
|
|
5177
|
+
});
|
|
5178
|
+
cells[s.name] = {
|
|
5179
|
+
score: r.score,
|
|
5180
|
+
resolved: r.resolved,
|
|
5181
|
+
progression: r.progression,
|
|
5182
|
+
usd: r.usd,
|
|
5183
|
+
ms: r.ms,
|
|
5184
|
+
tokens: r.tokens
|
|
5185
|
+
};
|
|
5186
|
+
} catch (e) {
|
|
5187
|
+
errors[s.name] = e instanceof Error ? e.message.slice(0, 300) : String(e);
|
|
5188
|
+
cells[s.name] = {
|
|
5189
|
+
score: 0,
|
|
5190
|
+
resolved: false,
|
|
5191
|
+
progression: [],
|
|
5192
|
+
usd: 0,
|
|
5193
|
+
ms: 0,
|
|
5194
|
+
tokens: { input: 0, output: 0 }
|
|
5195
|
+
};
|
|
5196
|
+
}
|
|
4897
5197
|
}
|
|
4898
|
-
row = {
|
|
5198
|
+
row = {
|
|
5199
|
+
taskId: task.id,
|
|
5200
|
+
cells,
|
|
5201
|
+
...Object.keys(errors).length > 0 ? { errors } : {}
|
|
5202
|
+
};
|
|
4899
5203
|
} catch (e) {
|
|
4900
5204
|
row = { taskId: task.id, error: e instanceof Error ? e.message.slice(0, 300) : String(e) };
|
|
4901
5205
|
}
|
|
@@ -5200,7 +5504,7 @@ var strategyAuthorContract = `
|
|
|
5200
5504
|
You author an OPTIMIZATION STRATEGY for an agentic loop system. A strategy decides how to
|
|
5201
5505
|
spend a compute budget to beat a task's deployable check. You compose exactly two steps:
|
|
5202
5506
|
|
|
5203
|
-
shot(spec?: { handle?, messages?, steer?, persona? }): Promise<ShotResult | null>
|
|
5507
|
+
shot(spec?: { handle?, messages?, steer?, persona?, tools? }): Promise<ShotResult | null>
|
|
5204
5508
|
Runs ONE worker attempt (a bounded tool loop) over an artifact.
|
|
5205
5509
|
- omit handle => the shot opens its OWN fresh artifact and closes it after (a sample).
|
|
5206
5510
|
- pass handle => the shot CONTINUES that artifact (state accumulates across shots).
|
|
@@ -5210,6 +5514,10 @@ spend a compute budget to beat a task's deployable check. You compose exactly tw
|
|
|
5210
5514
|
(multi-agent strategies: a researcher shot then an engineer shot, a panel of k
|
|
5211
5515
|
personas over one budget). On a fresh shot the systemPrompt replaces the task's; on
|
|
5212
5516
|
a carried conversation it arrives as a hand-off message. Same conserved budget.
|
|
5517
|
+
- tools => string[] \u2014 restrict THIS shot to a subset of the task's tools by
|
|
5518
|
+
name (focus an explore shot on read-only tools, an execute shot on write tools).
|
|
5519
|
+
Restriction-only; unknown names make the shot fail. ALWAYS select from
|
|
5520
|
+
await listTools(handle) \u2014 never hardcode. Omitted => the shot sees every tool.
|
|
5213
5521
|
ShotResult = { messages, score (0..1 on the task's check), passes, total, completions, toolErrors }
|
|
5214
5522
|
Returns null if the attempt failed infra-wise.
|
|
5215
5523
|
|
|
@@ -5217,10 +5525,23 @@ spend a compute budget to beat a task's deployable check. You compose exactly tw
|
|
|
5217
5525
|
A firewalled trace-analyst reads the attempt's trajectory and returns ONE corrective
|
|
5218
5526
|
instruction (or null when it judges the work complete). Costs ~1 completion.
|
|
5219
5527
|
|
|
5528
|
+
consult(messages, instruction): Promise<string | null>
|
|
5529
|
+
The RAW analyst channel: the same firewalled critic answers YOUR instruction over the
|
|
5530
|
+
trajectory verbatim (no reformatting) \u2014 use it when you need a specific reply format
|
|
5531
|
+
(a decision, a prediction). Costs ~1 completion.
|
|
5532
|
+
|
|
5220
5533
|
surface.open(task) / surface.close(handle)
|
|
5221
5534
|
Open a persistent artifact you manage yourself (remember to close in a finally).
|
|
5535
|
+
close is idempotent \u2014 closing an already-closed handle is a safe no-op.
|
|
5536
|
+
|
|
5537
|
+
listTools(handle): Promise<Array<{ name, description? }>>
|
|
5538
|
+
The tools THIS task actually offers. TOOL SETS VARY PER TASK \u2014 if you restrict a
|
|
5539
|
+
shot with \`tools\`, you MUST pick names from await listTools(handle); hardcoding
|
|
5540
|
+
names from an example kills your shots on every task whose tools differ.
|
|
5222
5541
|
|
|
5223
5542
|
Rules:
|
|
5543
|
+
- ALWAYS await every shot/critique/surface call \u2014 a floating promise that rejects
|
|
5544
|
+
crashes the whole benchmark run.
|
|
5224
5545
|
- Stay within ~budget total shots; every shot/critique spends from a conserved pool.
|
|
5225
5546
|
- For a FRESH attempt OMIT \`messages\` entirely (never pass \`[]\` \u2014 an empty array is a
|
|
5226
5547
|
fresh conversation too, but be explicit). To CONTINUE, pass the previous
|
|
@@ -5230,8 +5551,8 @@ Rules:
|
|
|
5230
5551
|
- The module must be EXACTLY this shape (no other imports, no commentary outside code):
|
|
5231
5552
|
|
|
5232
5553
|
import { defineStrategy } from '@tangle-network/agent-runtime/loops'
|
|
5233
|
-
export default defineStrategy('your-strategy-name', async ({ surface, task, budget, shot, critique }) => {
|
|
5234
|
-
// your composition
|
|
5554
|
+
export default defineStrategy('your-strategy-name', async ({ surface, task, budget, shot, critique, listTools }) => {
|
|
5555
|
+
// your composition (listTools comes from the destructured context \u2014 it is NOT a global)
|
|
5235
5556
|
})
|
|
5236
5557
|
`;
|
|
5237
5558
|
function assertStrategyContract(code) {
|
|
@@ -5307,34 +5628,89 @@ async function authorStrategy(opts) {
|
|
|
5307
5628
|
}
|
|
5308
5629
|
|
|
5309
5630
|
// src/runtime/strategy-evolution.ts
|
|
5631
|
+
import { existsSync, readFileSync, writeFileSync as writeFileSync2 } from "fs";
|
|
5310
5632
|
import { gzipSync } from "zlib";
|
|
5311
|
-
function
|
|
5312
|
-
const
|
|
5633
|
+
/**
 * Averages each field strategy's score and usd over only the tasks that
 * discriminate between strategies.
 *
 * A task row is kept when it has `cells`, every name in `fieldOrder` has a
 * cell with a defined `score`, and the scores are not all equal.
 *
 * @param {{ perTask: Array<{ cells?: Record<string, { score: number, usd: number }> }> }} report
 * @param {string[]} fieldOrder - strategy names to average, in order.
 * @returns {Record<string, { score: number, usd: number }> | null} per-strategy
 *   means, or `null` when no task discriminates.
 */
function discriminatingMeans(report, fieldOrder) {
  const discriminating = report.perTask.filter((row) => {
    if (!row.cells) return false;
    const scores = [];
    for (const name of fieldOrder) {
      const score = row.cells?.[name]?.score;
      if (score !== void 0) scores.push(score);
    }
    // Every field strategy must have scored this task.
    if (scores.length < fieldOrder.length) return false;
    const spread = Math.max(...scores) - Math.min(...scores);
    return spread > 0;
  });
  if (discriminating.length === 0) return null;

  const means = {};
  for (const name of fieldOrder) {
    const cells = discriminating.map((row) => row.cells?.[name]).filter((cell) => !!cell);
    let scoreSum = 0;
    let usdSum = 0;
    for (const cell of cells) {
      scoreSum += cell.score;
      usdSum += cell.usd;
    }
    means[name] = {
      score: scoreSum / cells.length,
      usd: usdSum / cells.length
    };
  }
  return means;
}
|
|
5651
|
+
/**
 * Chooses the champion strategy from a table of per-strategy means.
 *
 * With policy "score" the first entry (in `fieldOrder`) holding the best score
 * wins. With any other policy, every entry within `epsilon` of the best score
 * is eligible and the cheapest wins, with ties on usd broken by higher score.
 *
 * @param {Record<string, { score: number, usd: number }>} means
 * @param {string[]} fieldOrder - candidate names; names missing from `means` are ignored.
 * @param {string} policy - "score" or a cost-aware policy.
 * @param {number} epsilon - score slack used by the cost-aware policy.
 * @returns {{ name: string, score: number, usd: number }}
 * @throws {Error} when none of `fieldOrder` appears in `means`, or no pick results.
 */
function pickChampion(means, fieldOrder, policy, epsilon) {
  const entries = [];
  for (const name of fieldOrder) {
    const summary = means[name];
    if (summary) entries.push({ name, summary });
  }
  if (entries.length === 0) {
    throw new Error("pickChampion: the means table carries none of the field strategies");
  }

  const best = Math.max(...entries.map((entry) => entry.summary.score));
  let pick;
  if (policy === "score") {
    pick = entries.find((entry) => entry.summary.score === best);
  } else {
    // Cheapest first; equal cost falls back to the higher score.
    const cheapestThenBest = (a, b) => a.summary.usd - b.summary.usd || b.summary.score - a.summary.score;
    const eligible = entries.filter((entry) => entry.summary.score >= best - epsilon);
    pick = eligible.sort(cheapestThenBest)[0];
  }
  if (!pick) throw new Error("pickChampion: empty pick (unreachable)");

  const { name, summary } = pick;
  return { name, score: summary.score, usd: summary.usd };
}
|
|
5660
|
+
/**
 * Picks the champion from a benchmark report's overall per-strategy summary.
 * Delegates to pickChampion using `report.perStrategy` as the means table.
 *
 * @param {{ perStrategy: Record<string, { score: number, usd: number }> }} report
 * @param {string[]} fieldOrder - candidate strategy names.
 * @param {string} policy - champion policy forwarded to pickChampion.
 * @param {number} epsilon - score slack forwarded to pickChampion.
 * @returns {{ name: string, score: number, usd: number }}
 */
function selectChampion(report, fieldOrder, policy, epsilon) {
  const { perStrategy } = report;
  return pickChampion(perStrategy, fieldOrder, policy, epsilon);
}
|
|
5320
5663
|
/**
 * Renders the tournament archive as a bullet list, one line per strategy,
 * showing its source, generation and last score as a whole-number percentage.
 *
 * @param {Array<{ name: string, source: string, generation: number, score: number }>} archive
 * @returns {string} newline-joined bullet lines ("" for an empty archive).
 */
var fieldSummary = (archive) => {
  const describe = (node) => {
    const percent = (node.score * 100).toFixed(0);
    return `- ${node.name} (${node.source}, gen ${node.generation}, last score ${percent}%)`;
  };
  const lines = archive.map((node) => describe(node));
  return lines.join("\n");
};
|
|
5323
|
-
var compactLosses = (report) => {
|
|
5666
|
+
/**
 * Serializes a benchmark report's per-task rows into a compact JSON string,
 * cut to at most 12,000 characters.
 *
 * Rows with `cells` emit the task id, any per-strategy error messages
 * (first 100 chars each) and one compact cell per strategy. Rows without
 * `cells` emit the task id and the first 80 chars of `row.error`.
 *
 * @param {{ perTask: Array<object> }} report
 * @param {string} detail - "binary" emits only `resolved` and usd per cell;
 *   any other value emits score, resolved, usd and the progression.
 * @returns {string} JSON text, possibly truncated mid-document by the length cap.
 */
var compactLosses = (report, detail) => {
  const toHundredths = (x) => Math.round(x * 100) / 100;
  const toTenThousandths = (x) => Math.round(x * 1e4) / 1e4;

  const compactCell = (cell) => {
    // 'binary' is the leakage-bounded channel: score and progression are
    // withheld so the author sees only resolved/unresolved (plus usd) per
    // cell. The original source cites arXiv:2606.11045 for this design.
    if (detail === "binary") {
      return { resolved: cell.resolved, usd: toTenThousandths(cell.usd) };
    }
    return {
      score: toHundredths(cell.score),
      resolved: cell.resolved,
      usd: toTenThousandths(cell.usd),
      progression: (cell.progression ?? []).map((p) => toHundredths(p))
    };
  };

  const compactRow = (row) => {
    if (!row.cells) {
      return { task: row.taskId, error: row.error?.slice(0, 80) };
    }
    // Key order (task, errors, cells) is part of the serialized output.
    const compact = { task: row.taskId };
    if (row.errors) {
      compact.errors = Object.fromEntries(
        Object.entries(row.errors).map(([n, msg]) => [n, msg.slice(0, 100)])
      );
    }
    compact.cells = Object.fromEntries(
      Object.entries(row.cells).map(([name, cell]) => [name, compactCell(cell)])
    );
    return compact;
  };

  const rows = report.perTask.map((row) => compactRow(row));
  return JSON.stringify(rows).slice(0, 12e3);
};
|
|
5695
|
+
/**
 * Returns a strategy that behaves like `orig` but reports itself as `unique`.
 *
 * When the names already match, `orig` is returned untouched. Otherwise the
 * driver is wrapped so the agent it builds carries the new name, and every
 * "done" result from `act` has its deliverable's `mode` set to `unique`.
 * Results of any other kind pass through unchanged.
 *
 * @param {{ name: string, driver: Function }} orig - strategy to wrap.
 * @param {string} unique - name the wrapped strategy should carry.
 * @returns {{ name: string, driver: Function }}
 */
function renameStrategy(orig, unique) {
  if (orig.name === unique) return orig;

  // Stamp the new name onto finished deliverables only.
  const relabel = (out) => {
    if (out.kind !== "done") return out;
    return { ...out, deliverable: { ...out.deliverable, mode: unique } };
  };

  const driver = (s, t, o, b) => {
    const agent = orig.driver(s, t, o, b);
    const act = async (task, scope) => relabel(await agent.act(task, scope));
    return { ...agent, name: unique, act };
  };

  return { name: unique, driver };
}
|
|
5338
5714
|
async function runStrategyEvolution(cfg) {
|
|
5339
5715
|
const budget = cfg.budget ?? 3;
|
|
5340
5716
|
const concurrency = cfg.concurrency ?? 3;
|
|
@@ -5342,37 +5718,72 @@ async function runStrategyEvolution(cfg) {
|
|
|
5342
5718
|
const populationSize = cfg.populationSize ?? 2;
|
|
5343
5719
|
const baselines = cfg.baselines ?? [sample, refine, sampleThenRefine];
|
|
5344
5720
|
const policy = cfg.champion ?? "costAware";
|
|
5345
|
-
const epsilon = cfg.championEpsilon ?? 0.01;
|
|
5721
|
+
const epsilon = cfg.championEpsilon ?? (cfg.objective === "cost" ? cfg.scoreTolerance ?? 0.05 : 0.01);
|
|
5346
5722
|
const byName = new Map(baselines.map((s) => [s.name, s]));
|
|
5347
|
-
const
|
|
5348
|
-
|
|
5349
|
-
|
|
5350
|
-
|
|
5351
|
-
strategies,
|
|
5723
|
+
const codeByName = /* @__PURE__ */ new Map();
|
|
5724
|
+
const fingerprint = {
|
|
5725
|
+
trainN: cfg.trainN,
|
|
5726
|
+
holdoutN: cfg.holdoutN,
|
|
5352
5727
|
budget,
|
|
5353
|
-
|
|
5354
|
-
|
|
5355
|
-
|
|
5356
|
-
|
|
5728
|
+
generations,
|
|
5729
|
+
populationSize
|
|
5730
|
+
};
|
|
5731
|
+
let ckpt;
|
|
5732
|
+
if (cfg.checkpoint?.resume && existsSync(cfg.checkpoint.path)) {
|
|
5733
|
+
const raw = JSON.parse(readFileSync(cfg.checkpoint.path, "utf8"));
|
|
5734
|
+
if (JSON.stringify(raw.fingerprint) !== JSON.stringify(fingerprint)) {
|
|
5735
|
+
throw new Error(
|
|
5736
|
+
`evolution resume: checkpoint design mismatch \u2014 checkpoint ${JSON.stringify(raw.fingerprint)} vs config ${JSON.stringify(fingerprint)}; delete ${cfg.checkpoint.path} or match the config`
|
|
5737
|
+
);
|
|
5738
|
+
}
|
|
5739
|
+
ckpt = raw;
|
|
5740
|
+
}
|
|
5741
|
+
const save = (state) => {
|
|
5742
|
+
if (cfg.checkpoint)
|
|
5743
|
+
writeFileSync2(cfg.checkpoint.path, JSON.stringify({ ...state, fingerprint }, null, 1));
|
|
5744
|
+
};
|
|
5745
|
+
const bench = async (phase, tasks, strategies) => {
|
|
5746
|
+
await cfg.onPhase?.(phase);
|
|
5747
|
+
return runBenchmark({
|
|
5748
|
+
environment: cfg.environment,
|
|
5749
|
+
tasks,
|
|
5750
|
+
worker: cfg.worker,
|
|
5751
|
+
strategies,
|
|
5752
|
+
budget,
|
|
5753
|
+
concurrency,
|
|
5754
|
+
...cfg.onTask ? { onTask: (row, done, total) => cfg.onTask?.(phase, row, done, total) } : {},
|
|
5755
|
+
...cfg.hooks ? { hooks: cfg.hooks } : {}
|
|
5756
|
+
});
|
|
5757
|
+
};
|
|
5357
5758
|
const train = await cfg.tasks(0, cfg.trainN);
|
|
5358
|
-
const
|
|
5359
|
-
|
|
5759
|
+
const probeTask = train[0];
|
|
5760
|
+
if (!probeTask) throw new Error("runStrategyEvolution: empty train slice");
|
|
5761
|
+
const probe = await cfg.environment.open(probeTask);
|
|
5762
|
+
let toolCatalog;
|
|
5763
|
+
try {
|
|
5764
|
+
const tools = await cfg.environment.tools(probeTask, probe);
|
|
5765
|
+
toolCatalog = tools.map(
|
|
5766
|
+
(t) => `- ${t.function.name}${t.function.description ? ` \u2014 ${t.function.description.slice(0, 120)}` : ""}`
|
|
5767
|
+
).join("\n");
|
|
5768
|
+
} finally {
|
|
5769
|
+
await cfg.environment.close(probe);
|
|
5770
|
+
}
|
|
5771
|
+
const gen0 = ckpt?.gen0 ?? await bench("gen0", train, baselines);
|
|
5772
|
+
const archive = ckpt?.archive ? [...ckpt.archive] : baselines.map((s) => ({
|
|
5360
5773
|
name: s.name,
|
|
5361
5774
|
source: "baseline",
|
|
5362
5775
|
generation: 0,
|
|
5363
5776
|
score: gen0.perStrategy[s.name]?.score ?? 0,
|
|
5364
5777
|
usd: gen0.perStrategy[s.name]?.usd ?? 0
|
|
5365
5778
|
}));
|
|
5366
|
-
const gen0Champion = selectChampion(
|
|
5779
|
+
const gen0Champion = ckpt?.gen0Champion ?? selectChampion(
|
|
5367
5780
|
gen0,
|
|
5368
5781
|
baselines.map((s) => s.name),
|
|
5369
5782
|
policy,
|
|
5370
5783
|
epsilon
|
|
5371
5784
|
);
|
|
5372
|
-
|
|
5373
|
-
|
|
5374
|
-
const generationRows = [];
|
|
5375
|
-
const trajectory = [
|
|
5785
|
+
const generationRows = ckpt?.generations ? [...ckpt.generations] : [];
|
|
5786
|
+
const trajectory = ckpt?.trajectory ? [...ckpt.trajectory] : [
|
|
5376
5787
|
{
|
|
5377
5788
|
generation: 0,
|
|
5378
5789
|
champion: gen0Champion.name,
|
|
@@ -5380,13 +5791,39 @@ async function runStrategyEvolution(cfg) {
|
|
|
5380
5791
|
usd: gen0Champion.usd
|
|
5381
5792
|
}
|
|
5382
5793
|
];
|
|
5383
|
-
|
|
5384
|
-
|
|
5385
|
-
|
|
5794
|
+
for (const row of generationRows) {
|
|
5795
|
+
for (const c of row.candidates) {
|
|
5796
|
+
if (!c.file || c.error) continue;
|
|
5797
|
+
const mod = await import(`file://${c.file}`);
|
|
5798
|
+
if (!mod.default || typeof mod.default.driver !== "function") {
|
|
5799
|
+
throw new Error(
|
|
5800
|
+
`evolution resume: ${c.file} no longer exports a Strategy \u2014 cannot restore "${c.name}"`
|
|
5801
|
+
);
|
|
5802
|
+
}
|
|
5803
|
+
byName.set(c.name, renameStrategy(mod.default, c.name));
|
|
5804
|
+
codeByName.set(c.name, readFileSync(c.file, "utf8"));
|
|
5805
|
+
}
|
|
5806
|
+
}
|
|
5807
|
+
let authoredOk = generationRows.reduce(
|
|
5808
|
+
(n, row) => n + row.candidates.filter((c) => !c.error).length,
|
|
5809
|
+
0
|
|
5810
|
+
);
|
|
5811
|
+
const lastRow = generationRows[generationRows.length - 1];
|
|
5812
|
+
let incumbent = lastRow ? lastRow.champion : gen0Champion;
|
|
5813
|
+
let latestReport = lastRow ? lastRow.report : gen0;
|
|
5814
|
+
if (!ckpt) save({ gen0, gen0Champion, generations: generationRows, archive, trajectory });
|
|
5815
|
+
for (let g = generationRows.length + 1; g <= generations; g += 1) {
|
|
5816
|
+
const lossesJson = compactLosses(latestReport, cfg.lossesDetail ?? "exact");
|
|
5386
5817
|
const candidates = [];
|
|
5387
5818
|
const newStrategies = [];
|
|
5388
5819
|
for (let i = 0; i < populationSize; i += 1) {
|
|
5389
|
-
const
|
|
5820
|
+
const objectiveNote = cfg.objective === "cost" ? `
|
|
5821
|
+
|
|
5822
|
+
YOUR OBJECTIVE: match or exceed the incumbent's SCORE while spending LESS (the losses include usd per task). Promotion requires proven score non-inferiority PLUS significant cost savings \u2014 a strategy that ties the score at half the cost WINS; a cheaper strategy that loses score by more than ${((cfg.scoreTolerance ?? 0.05) * 100).toFixed(0)}pp LOSES.` : "";
|
|
5823
|
+
const contract = `${strategyAuthorContract}${objectiveNote}
|
|
5824
|
+
|
|
5825
|
+
EXAMPLE TOOLS FROM ONE TASK (tool sets VARY per task on this domain \u2014 a strategy MUST select tool names from await listTools(handle) at runtime; hardcoding these example names will zero your score on most tasks):
|
|
5826
|
+
${toolCatalog}
|
|
5390
5827
|
|
|
5391
5828
|
STRATEGIES ALREADY IN THE TOURNAMENT (author something MEANINGFULLY different \u2014 a new composition, not a rename):
|
|
5392
5829
|
${fieldSummary(archive)}
|
|
@@ -5406,26 +5843,9 @@ You are authoring candidate ${i + 1} of ${populationSize} this generation; explo
|
|
|
5406
5843
|
outDir: cfg.outDir
|
|
5407
5844
|
});
|
|
5408
5845
|
const unique = byName.has(authored.strategy.name) ? `${authored.strategy.name}-g${g}c${i + 1}` : authored.strategy.name;
|
|
5409
|
-
const strategy =
|
|
5410
|
-
name: unique,
|
|
5411
|
-
driver: (s, t, o, b) => {
|
|
5412
|
-
const agent = authored.strategy.driver(s, t, o, b);
|
|
5413
|
-
return {
|
|
5414
|
-
...agent,
|
|
5415
|
-
name: unique,
|
|
5416
|
-
act: async (task, scope) => {
|
|
5417
|
-
const out = await agent.act(task, scope);
|
|
5418
|
-
if (out.kind !== "done") return out;
|
|
5419
|
-
const deliverable = {
|
|
5420
|
-
...out.deliverable,
|
|
5421
|
-
mode: unique
|
|
5422
|
-
};
|
|
5423
|
-
return { ...out, deliverable };
|
|
5424
|
-
}
|
|
5425
|
-
};
|
|
5426
|
-
}
|
|
5427
|
-
};
|
|
5846
|
+
const strategy = renameStrategy(authored.strategy, unique);
|
|
5428
5847
|
byName.set(unique, strategy);
|
|
5848
|
+
codeByName.set(unique, authored.code);
|
|
5429
5849
|
newStrategies.push(strategy);
|
|
5430
5850
|
archive.push({
|
|
5431
5851
|
name: unique,
|
|
@@ -5463,12 +5883,9 @@ You are authoring candidate ${i + 1} of ${populationSize} this generation; explo
|
|
|
5463
5883
|
node.usd = cell.usd;
|
|
5464
5884
|
}
|
|
5465
5885
|
}
|
|
5466
|
-
const
|
|
5467
|
-
|
|
5468
|
-
|
|
5469
|
-
policy,
|
|
5470
|
-
epsilon
|
|
5471
|
-
);
|
|
5886
|
+
const fieldNames = field.map((s) => s.name);
|
|
5887
|
+
const means = cfg.band ? discriminatingMeans(report, fieldNames) ?? report.perStrategy : report.perStrategy;
|
|
5888
|
+
const champion = pickChampion(means, fieldNames, policy, epsilon);
|
|
5472
5889
|
generationRows.push({ generation: g, candidates, report, champion });
|
|
5473
5890
|
trajectory.push({
|
|
5474
5891
|
generation: g,
|
|
@@ -5478,21 +5895,134 @@ You are authoring candidate ${i + 1} of ${populationSize} this generation; explo
|
|
|
5478
5895
|
});
|
|
5479
5896
|
incumbent = champion;
|
|
5480
5897
|
latestReport = report;
|
|
5898
|
+
save({ gen0, gen0Champion, generations: generationRows, archive, trajectory });
|
|
5481
5899
|
}
|
|
5482
5900
|
if (authoredOk === 0) {
|
|
5483
5901
|
throw new Error(
|
|
5484
5902
|
"runStrategyEvolution: every author attempt failed across all generations \u2014 no search happened; see the candidates[].error entries"
|
|
5485
5903
|
);
|
|
5486
5904
|
}
|
|
5487
|
-
const
|
|
5488
|
-
|
|
5489
|
-
|
|
5490
|
-
|
|
5491
|
-
|
|
5492
|
-
|
|
5493
|
-
|
|
5494
|
-
|
|
5495
|
-
|
|
5905
|
+
const holdoutOffset = cfg.trainN + (cfg.holdoutOffset ?? 0);
|
|
5906
|
+
let holdoutTasks = [];
|
|
5907
|
+
let bandInfo;
|
|
5908
|
+
if (ckpt?.holdout && ckpt.verdict) {
|
|
5909
|
+
bandInfo = ckpt.band;
|
|
5910
|
+
if (cfg.reproducerCheck && codeByName.has(incumbent.name)) {
|
|
5911
|
+
const pool2 = await cfg.tasks(holdoutOffset, cfg.band?.holdoutPoolN ?? cfg.holdoutN);
|
|
5912
|
+
const gateIds = new Set(ckpt.holdout.perTask.map((r) => r.taskId));
|
|
5913
|
+
holdoutTasks = pool2.filter((t) => gateIds.has(t.id));
|
|
5914
|
+
}
|
|
5915
|
+
} else if (cfg.band) {
|
|
5916
|
+
const maxRef = cfg.band.maxRefScore ?? 0.99;
|
|
5917
|
+
const reference = baselines[0];
|
|
5918
|
+
if (!reference)
|
|
5919
|
+
throw new Error("evolution band: baselines[0] required as the screening reference");
|
|
5920
|
+
const pool2 = await cfg.tasks(holdoutOffset, cfg.band.holdoutPoolN);
|
|
5921
|
+
const screen = await bench("band-screen", pool2, [reference]);
|
|
5922
|
+
const refScores = screen.perTask.filter((r) => r.cells?.[reference.name]).map((r) => ({ taskId: r.taskId, score: r.cells?.[reference.name]?.score ?? 0 }));
|
|
5923
|
+
const inBandIds = new Set(refScores.filter((r) => r.score <= maxRef).map((r) => r.taskId));
|
|
5924
|
+
const kept = pool2.filter((t) => inBandIds.has(t.id));
|
|
5925
|
+
if (kept.length < cfg.holdoutN) {
|
|
5926
|
+
throw new Error(
|
|
5927
|
+
`evolution band: only ${kept.length}/${cfg.holdoutN} holdout tasks have headroom (pool ${cfg.band.holdoutPoolN}, reference "${reference.name}" \u2264 ${maxRef}) \u2014 widen holdoutPoolN or raise maxRefScore`
|
|
5928
|
+
);
|
|
5929
|
+
}
|
|
5930
|
+
holdoutTasks = kept.slice(0, cfg.holdoutN);
|
|
5931
|
+
bandInfo = { screened: refScores.length, inBand: kept.length, refScores };
|
|
5932
|
+
} else {
|
|
5933
|
+
holdoutTasks = await cfg.tasks(holdoutOffset, cfg.holdoutN);
|
|
5934
|
+
}
|
|
5935
|
+
let holdout;
|
|
5936
|
+
let verdict;
|
|
5937
|
+
if (ckpt?.holdout && ckpt.verdict) {
|
|
5938
|
+
holdout = ckpt.holdout;
|
|
5939
|
+
verdict = ckpt.verdict;
|
|
5940
|
+
} else {
|
|
5941
|
+
const finalists = [.../* @__PURE__ */ new Set([gen0Champion.name, incumbent.name])].map((n) => byName.get(n)).filter((s) => !!s);
|
|
5942
|
+
holdout = await bench("holdout", holdoutTasks, finalists);
|
|
5943
|
+
verdict = promotionGate({
|
|
5944
|
+
report: holdout,
|
|
5945
|
+
incumbent: gen0Champion.name,
|
|
5946
|
+
candidate: incumbent.name,
|
|
5947
|
+
...cfg.objective === "cost" ? {
|
|
5948
|
+
mode: "non-inferiority",
|
|
5949
|
+
...cfg.scoreTolerance !== void 0 ? { scoreTolerance: cfg.scoreTolerance } : {}
|
|
5950
|
+
} : {},
|
|
5951
|
+
...cfg.minPairedTasks !== void 0 ? { minPairedTasks: cfg.minPairedTasks } : {}
|
|
5952
|
+
});
|
|
5953
|
+
save({
|
|
5954
|
+
gen0,
|
|
5955
|
+
gen0Champion,
|
|
5956
|
+
generations: generationRows,
|
|
5957
|
+
archive,
|
|
5958
|
+
trajectory,
|
|
5959
|
+
holdout,
|
|
5960
|
+
verdict,
|
|
5961
|
+
...bandInfo ? { band: bandInfo } : {}
|
|
5962
|
+
});
|
|
5963
|
+
}
|
|
5964
|
+
let reproduction;
|
|
5965
|
+
const championCode = codeByName.get(incumbent.name);
|
|
5966
|
+
if (cfg.reproducerCheck && championCode) {
|
|
5967
|
+
const words = cfg.reproducerCheck.summaryMaxWords ?? 64;
|
|
5968
|
+
const tolerance = cfg.reproducerCheck.tolerance ?? 0.05;
|
|
5969
|
+
const championHoldoutScore = holdout.perStrategy[incumbent.name]?.score ?? 0;
|
|
5970
|
+
try {
|
|
5971
|
+
const summaryRes = await cfg.author.chat.chat({
|
|
5972
|
+
...cfg.author.model ? { model: cfg.author.model } : {},
|
|
5973
|
+
temperature: 0.2,
|
|
5974
|
+
maxTokens: 512,
|
|
5975
|
+
messages: [
|
|
5976
|
+
{
|
|
5977
|
+
role: "system",
|
|
5978
|
+
content: `Summarize the optimization strategy implemented by this code in at most ${words} words. Describe the COMPOSITION (shots, critique, artifact handling, restarts, stopping) \u2014 not the code. Output only the summary.`
|
|
5979
|
+
},
|
|
5980
|
+
{ role: "user", content: championCode }
|
|
5981
|
+
]
|
|
5982
|
+
});
|
|
5983
|
+
const summary = summaryRes.content.trim();
|
|
5984
|
+
const reproduced = await authorStrategy({
|
|
5985
|
+
chat: cfg.author.chat,
|
|
5986
|
+
...cfg.author.model ? { model: cfg.author.model } : {},
|
|
5987
|
+
...cfg.author.fallbackModel ? { fallbackModel: cfg.author.fallbackModel } : {},
|
|
5988
|
+
...cfg.author.maxTokens !== void 0 ? { maxTokens: cfg.author.maxTokens } : {},
|
|
5989
|
+
temperature: 0.2,
|
|
5990
|
+
contract: `${strategyAuthorContract}
|
|
5991
|
+
|
|
5992
|
+
IMPLEMENT EXACTLY THIS STRATEGY (a colleague's description \u2014 do not invent a different approach):
|
|
5993
|
+
${summary}`,
|
|
5994
|
+
environmentName: cfg.environment.name,
|
|
5995
|
+
lossesJson: "[]",
|
|
5996
|
+
budget,
|
|
5997
|
+
outDir: cfg.outDir
|
|
5998
|
+
});
|
|
5999
|
+
const reproStrategy = {
|
|
6000
|
+
name: `${incumbent.name}-reproduced`,
|
|
6001
|
+
driver: reproduced.strategy.driver
|
|
6002
|
+
};
|
|
6003
|
+
const reproReport = await bench("reproduce", holdoutTasks, [reproStrategy]);
|
|
6004
|
+
const reproducedHoldoutScore = reproReport.perStrategy[reproStrategy.name]?.score ?? 0;
|
|
6005
|
+
reproduction = {
|
|
6006
|
+
summary,
|
|
6007
|
+
reproducedName: reproStrategy.name,
|
|
6008
|
+
file: reproduced.file,
|
|
6009
|
+
championHoldoutScore,
|
|
6010
|
+
reproducedHoldoutScore,
|
|
6011
|
+
gap: championHoldoutScore - reproducedHoldoutScore,
|
|
6012
|
+
reproducible: reproducedHoldoutScore >= championHoldoutScore - tolerance
|
|
6013
|
+
};
|
|
6014
|
+
} catch (e) {
|
|
6015
|
+
reproduction = {
|
|
6016
|
+
summary: "",
|
|
6017
|
+
reproducedName: "",
|
|
6018
|
+
championHoldoutScore,
|
|
6019
|
+
reproducedHoldoutScore: 0,
|
|
6020
|
+
gap: championHoldoutScore,
|
|
6021
|
+
reproducible: false,
|
|
6022
|
+
error: e instanceof Error ? e.message.slice(0, 300) : String(e)
|
|
6023
|
+
};
|
|
6024
|
+
}
|
|
6025
|
+
}
|
|
5496
6026
|
return {
|
|
5497
6027
|
gen0,
|
|
5498
6028
|
gen0Champion,
|
|
@@ -5501,6 +6031,8 @@ You are authoring candidate ${i + 1} of ${populationSize} this generation; explo
|
|
|
5501
6031
|
finalChampion: incumbent,
|
|
5502
6032
|
holdout,
|
|
5503
6033
|
verdict,
|
|
6034
|
+
...bandInfo ? { band: bandInfo } : {},
|
|
6035
|
+
...reproduction ? { reproduction } : {},
|
|
5504
6036
|
trajectory
|
|
5505
6037
|
};
|
|
5506
6038
|
}
|
|
@@ -5572,6 +6104,103 @@ function createVerifierEnvironment(opts) {
|
|
|
5572
6104
|
};
|
|
5573
6105
|
}
|
|
5574
6106
|
|
|
6107
|
+
// src/runtime/waterfall.ts
|
|
6108
|
+
/**
 * Creates a collector that turns agent lifecycle events into timed spans and
 * can summarize or render them as an ASCII waterfall.
 *
 * Events are consumed through `hooks.onEvent`:
 *  - target "agent.spawn" opens (or replaces) a span keyed by
 *    `payload.childId ?? event.id`, in status "running";
 *  - target "agent.child" closes the span named by `payload.childId`, recording
 *    end time, status ("down" or "done"), spend and an optional numeric score.
 * Any other event, or a child event for an unknown span, is ignored.
 *
 * @returns {{
 *   hooks: { onEvent: (event: object) => void },
 *   report: () => { spans: object[], totalMs: number, totalUsd: number,
 *                   totalTokens: { input: number, output: number },
 *                   byKind: Record<string, object> },
 *   render: (opts?: { width?: number, maxRows?: number }) => string,
 *   reset: () => void
 * }}
 */
function createWaterfallCollector() {
  let spans = /* @__PURE__ */ new Map();

  const onEvent = (event) => {
    if (event.target === "agent.spawn") {
      const p = event.payload ?? {};
      const id = p.childId ?? event.id;
      spans.set(id, {
        id,
        label: p.label ?? id,
        runId: event.runId,
        ...event.parentId !== void 0 ? { parentId: event.parentId } : {},
        startMs: event.timestamp,
        status: "running",
        usd: 0,
        tokens: { input: 0, output: 0 }
      });
      return;
    }
    if (event.target === "agent.child") {
      const p = event.payload ?? {};
      const id = p.childId;
      if (!id) return;
      const span = spans.get(id);
      if (!span) return;
      span.endMs = event.timestamp;
      span.status = p.status === "down" ? "down" : "done";
      span.usd = p.spent?.usd ?? 0;
      span.tokens = {
        input: p.spent?.tokens?.input ?? 0,
        output: p.spent?.tokens?.output ?? 0
      };
      if (typeof p.score === "number") span.score = p.score;
    }
  };

  const report = () => {
    const all = [...spans.values()].sort((a, b) => a.startMs - b.startMs);
    const start = all[0]?.startMs ?? 0;
    // Fold instead of Math.max(...array) so a very large span count cannot
    // exceed the engine's argument limit. A span that never closed counts as
    // zero-length.
    const end = all.reduce((latest, s) => Math.max(latest, s.endMs ?? s.startMs), start);
    const byKind = {};
    let totalUsd = 0;
    const totalTokens2 = { input: 0, output: 0 };
    for (const s of all) {
      totalUsd += s.usd;
      totalTokens2.input += s.tokens.input;
      totalTokens2.output += s.tokens.output;
      // The kind is the label prefix before the first ":" (or the whole label).
      const kind = s.label.includes(":") ? s.label.split(":")[0] : s.label;
      const k = byKind[kind] ??= { count: 0, ms: 0, usd: 0, tokens: { input: 0, output: 0 } };
      k.count += 1;
      k.ms += (s.endMs ?? s.startMs) - s.startMs;
      k.usd += s.usd;
      k.tokens.input += s.tokens.input;
      k.tokens.output += s.tokens.output;
    }
    return { spans: all, totalMs: end - start, totalUsd, totalTokens: totalTokens2, byKind };
  };

  const render = (opts) => {
    const { spans: all, totalMs, totalUsd, byKind } = report();
    if (all.length === 0) return "(no spans observed)";
    const width = opts?.width ?? 48;
    const maxRows = opts?.maxRows ?? 60;
    const start = all[0]?.startMs ?? 0;
    const scale = totalMs > 0 ? width / totalMs : 0;
    const lines = [];
    const longestLabel = all.reduce((longest, s) => Math.max(longest, s.label.length), 0);
    const labelWidth = Math.min(24, longestLabel + 1);
    for (const s of all.slice(0, maxRows)) {
      const offset = Math.round((s.startMs - start) * scale);
      const dur = (s.endMs ?? s.startMs) - s.startMs;
      const len = Math.max(1, Math.round(dur * scale));
      const indent = Math.min(offset, width);
      const glyph = s.status === "down" ? "\u2591" : "\u2588";
      // At least one glyph is always drawn; the bar never exceeds width + 1 columns.
      const bar = `${" ".repeat(indent)}${glyph.repeat(Math.max(1, Math.min(len, width - indent + 1)))}`;
      const mark = s.status === "down" ? " DOWN" : s.score !== void 0 ? ` ${(s.score * 100).toFixed(0)}%` : "";
      lines.push(
        `${s.label.padEnd(labelWidth)}|${bar.padEnd(width + 1)}| ${(dur / 1e3).toFixed(1)}s $${s.usd.toFixed(4)} ${s.tokens.input}/${s.tokens.output}tok${mark}`
      );
    }
    if (all.length > maxRows) lines.push(`\u2026 ${all.length - maxRows} more spans`);
    lines.push("\u2014".repeat(labelWidth + width + 2));
    for (const [kind, k] of Object.entries(byKind)) {
      lines.push(
        `${kind.padEnd(labelWidth)} \xD7${k.count} ${(k.ms / 1e3).toFixed(1)}s busy $${k.usd.toFixed(4)} ${k.tokens.input}/${k.tokens.output}tok`
      );
    }
    // padEnd rather than " ".repeat(labelWidth - 5): when every label is shorter
    // than 4 characters labelWidth is below 5, and a negative repeat count
    // throws a RangeError. For labelWidth >= 5 the output is unchanged.
    lines.push(
      `${"TOTAL".padEnd(labelWidth)} ${(totalMs / 1e3).toFixed(1)}s wall $${totalUsd.toFixed(4)}`
    );
    return lines.join("\n");
  };

  return {
    hooks: { onEvent },
    report,
    render,
    reset: () => {
      spans = /* @__PURE__ */ new Map();
    }
  };
}
|
|
6203
|
+
|
|
5575
6204
|
// src/runtime/workspace.ts
|
|
5576
6205
|
function localShell() {
|
|
5577
6206
|
return async (args, cwd) => {
|
|
@@ -5674,6 +6303,10 @@ function tail(s) {
|
|
|
5674
6303
|
}
|
|
5675
6304
|
|
|
5676
6305
|
export {
|
|
6306
|
+
deleteBoxSafe,
|
|
6307
|
+
throwAbort,
|
|
6308
|
+
throwIfAborted,
|
|
6309
|
+
sleep,
|
|
5677
6310
|
contentAddress,
|
|
5678
6311
|
InMemoryResultBlobStore,
|
|
5679
6312
|
FileResultBlobStore,
|
|
@@ -5681,6 +6314,8 @@ export {
|
|
|
5681
6314
|
FileSpawnJournal,
|
|
5682
6315
|
replaySpawnTree,
|
|
5683
6316
|
materializeTreeView,
|
|
6317
|
+
anytimeReport,
|
|
6318
|
+
renderAnytimeTable,
|
|
5684
6319
|
defaultAuditorInstruction,
|
|
5685
6320
|
auditIntent,
|
|
5686
6321
|
completionAuthorizes,
|
|
@@ -5751,11 +6386,14 @@ export {
|
|
|
5751
6386
|
strategyAuthorContract,
|
|
5752
6387
|
assertStrategyContract,
|
|
5753
6388
|
authorStrategy,
|
|
6389
|
+
discriminatingMeans,
|
|
6390
|
+
pickChampion,
|
|
5754
6391
|
selectChampion,
|
|
5755
6392
|
runStrategyEvolution,
|
|
5756
6393
|
createVerifierEnvironment,
|
|
6394
|
+
createWaterfallCollector,
|
|
5757
6395
|
localShell,
|
|
5758
6396
|
gitWorkspace,
|
|
5759
6397
|
jjWorkspace
|
|
5760
6398
|
};
|
|
5761
|
-
//# sourceMappingURL=chunk-
|
|
6399
|
+
//# sourceMappingURL=chunk-PXUTIMGJ.js.map
|