@tangle-network/agent-runtime 0.48.0 → 0.50.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +79 -15
- package/dist/agent.d.ts +1 -1
- package/dist/agent.js +1 -1
- package/dist/analyst-loop.d.ts +1 -1
- package/dist/{chunk-656G2XCL.js → chunk-BKAIVNFA.js} +3 -3
- package/dist/{chunk-IW2LMLK6.js → chunk-CM2IK7VS.js} +913 -152
- package/dist/chunk-CM2IK7VS.js.map +1 -0
- package/dist/{chunk-VR4JIC5H.js → chunk-ML4IXGTV.js} +2 -2
- package/dist/{chunk-TJS7S3HJ.js → chunk-NDM5VXZW.js} +19 -8
- package/dist/chunk-NDM5VXZW.js.map +1 -0
- package/dist/chunk-OM3YNZIW.js +978 -0
- package/dist/chunk-OM3YNZIW.js.map +1 -0
- package/dist/{chunk-JNPK46YH.js → chunk-RHW75JW5.js} +498 -350
- package/dist/chunk-RHW75JW5.js.map +1 -0
- package/dist/{coder-CVZNGbyg.d.ts → coder-_YCf3BAK.d.ts} +2 -2
- package/dist/{driver-DYU2sgHr.d.ts → driver-DLI1io57.d.ts} +1 -1
- package/dist/index.d.ts +34 -9
- package/dist/index.js +117 -27
- package/dist/index.js.map +1 -1
- package/dist/kb-gate-CHAyt4aI.d.ts +1571 -0
- package/dist/{loop-runner-bin-DEm4roYF.d.ts → loop-runner-bin-DFUNgpeK.d.ts} +4 -4
- package/dist/loop-runner-bin.d.ts +5 -5
- package/dist/loop-runner-bin.js +3 -3
- package/dist/loops.d.ts +6 -6
- package/dist/loops.js +17 -1
- package/dist/mcp/bin.js +206 -29
- package/dist/mcp/bin.js.map +1 -1
- package/dist/mcp/index.d.ts +41 -177
- package/dist/mcp/index.js +40 -6
- package/dist/mcp/index.js.map +1 -1
- package/dist/openai-tools-D4HLDWgw.d.ts +45 -0
- package/dist/platform.js +2 -2
- package/dist/platform.js.map +1 -1
- package/dist/profiles.d.ts +2 -2
- package/dist/{run-loop-DvD4aGiE.d.ts → run-loop-BIineL1T.d.ts} +1 -1
- package/dist/runtime.d.ts +403 -24
- package/dist/runtime.js +17 -1
- package/dist/{types-BpDfCPUp.d.ts → types-5MGt5KTY.d.ts} +1 -1
- package/dist/{types-nBMuollC.d.ts → types-BEQsBhOE.d.ts} +1 -1
- package/dist/workflow.d.ts +2 -2
- package/dist/workflow.js +1 -1
- package/package.json +6 -5
- package/dist/chunk-IW2LMLK6.js.map +0 -1
- package/dist/chunk-JNPK46YH.js.map +0 -1
- package/dist/chunk-LX66I3SC.js +0 -218
- package/dist/chunk-LX66I3SC.js.map +0 -1
- package/dist/chunk-TJS7S3HJ.js.map +0 -1
- package/dist/kb-gate-51BlLlVM.d.ts +0 -529
- package/dist/otel-export-EzfsVUhh.d.ts +0 -191
- /package/dist/{chunk-656G2XCL.js.map → chunk-BKAIVNFA.js.map} +0 -0
- /package/dist/{chunk-VR4JIC5H.js.map → chunk-ML4IXGTV.js.map} +0 -0
|
@@ -426,6 +426,103 @@ function isNoEntError(err) {
|
|
|
426
426
|
return typeof err === "object" && err !== null && "code" in err && err.code === "ENOENT";
|
|
427
427
|
}
|
|
428
428
|
|
|
429
|
+
// src/runtime/anytime.ts
|
|
430
|
+
/**
 * Median of a list of numbers.
 *
 * The input is never mutated: sorting happens on a copy.
 *
 * @param {number[]} xs - values to summarise.
 * @returns {number|null} the middle value (the mean of the two middle values
 *   when the length is even), or `null` when `xs` is empty.
 */
var median = (xs) => {
  if (xs.length === 0) return null;
  // Numeric comparator is required: the default sort is lexicographic.
  const sorted = [...xs].sort((a, b) => a - b);
  const mid = Math.floor(sorted.length / 2);
  return sorted.length % 2 === 1 ? sorted[mid] : (sorted[mid - 1] + sorted[mid]) / 2;
};
|
|
436
|
+
/**
 * Build "anytime" (best-score-so-far versus effort) metrics from recorded spans.
 *
 * Only spans whose `label` starts with "shot:" are considered. They are grouped
 * by `runId`. A runId shaped like "agentic:<strategy>:<taskId>" is split into
 * its two parts (the first capture is greedy, so any extra colons land in the
 * strategy part); any other runId is used as both strategy and task id.
 *
 * @param spans - iterable of span objects read as
 *   `{ label, runId, startMs, endMs?, usd, score? }`.
 * @param opts - optional settings:
 *   `targets` (score thresholds, default `[1]`) and `targetFor(taskId)`, which,
 *   when given, supplies a single per-task threshold instead of `targets`.
 * @returns `{ targets, perTask, perStrategy }` where
 *   - `perTask[i]` is `{ taskId, strategy, points, hits }`; `points` holds one
 *     `{ elapsedMs, cumUsd, best }` per shot and `hits` maps `String(target)` to
 *     `{ ms, shots, usd }` at the first shot whose running best reached the
 *     target, or `null` if it never did;
 *   - `perStrategy` holds one row per (strategy, target) — a single row with
 *     `target: NaN` per strategy when `targetFor` is used — sorted by strategy
 *     name and then target.
 */
function anytimeReport(spans, opts) {
  const targets = opts?.targets ?? [1];

  // 1. Group shot spans by run.
  const byRun = /* @__PURE__ */ new Map();
  for (const span of spans) {
    if (!span.label.startsWith("shot:")) continue;
    const list = byRun.get(span.runId) ?? [];
    list.push(span);
    byRun.set(span.runId, list);
  }

  // 2. One best-so-far curve per run.
  const perTask = [];
  for (const [runId, shots] of byRun) {
    const m = runId.match(/^agentic:(.+):(.+)$/);
    const strategy = m?.[1] ?? runId;
    const taskId = m?.[2] ?? runId;
    // Shots are ordered by completion time; a shot without endMs counts as
    // finishing when it started.
    const ordered = [...shots].sort((a, b) => (a.endMs ?? a.startMs) - (b.endMs ?? b.startMs));
    // Folded instead of Math.min(...array) so very long runs cannot exceed the
    // engine's argument-count limit.
    const t0 = ordered.reduce((min, s) => Math.min(min, s.startMs), Infinity);
    const taskTargets = opts?.targetFor ? [opts.targetFor(taskId)] : targets;
    let best = 0;
    let cumUsd = 0;
    const points = [];
    const hits = {};
    for (const t of taskTargets) hits[String(t)] = null;
    for (const s of ordered) {
      // A span without a numeric cost contributes nothing rather than turning
      // every later cumulative cost into NaN (score is guarded the same way).
      if (Number.isFinite(s.usd)) cumUsd += s.usd;
      if (typeof s.score === "number" && s.score > best) best = s.score;
      const elapsedMs = (s.endMs ?? s.startMs) - t0;
      points.push({ elapsedMs, cumUsd, best });
      for (const t of taskTargets) {
        if (hits[String(t)] === null && best >= t) {
          hits[String(t)] = { ms: elapsedMs, shots: points.length, usd: cumUsd };
        }
      }
    }
    perTask.push({ taskId, strategy, points, hits });
  }

  // 3. Aggregate the task curves per strategy.
  const byStrategy = /* @__PURE__ */ new Map();
  for (const t of perTask) {
    const list = byStrategy.get(t.strategy) ?? [];
    list.push(t);
    byStrategy.set(t.strategy, list);
  }
  const perStrategy = [];
  for (const [strategy, tasks] of byStrategy) {
    const totalMs = tasks.reduce((sum, t) => sum + (t.points[t.points.length - 1]?.elapsedMs ?? 0), 0);
    const totalUsd = tasks.reduce((sum, t) => sum + (t.points[t.points.length - 1]?.cumUsd ?? 0), 0);
    const maxShots = tasks.reduce((max, t) => Math.max(max, t.points.length), 0);
    // Mean best-so-far after shot i; a task with fewer shots holds its last value.
    const curveByShot = [];
    for (let i = 0; i < maxShots; i += 1) {
      const vals = tasks.map((t) => t.points[Math.min(i, t.points.length - 1)].best);
      curveByShot.push(vals.reduce((sum, v) => sum + v, 0) / vals.length);
    }
    const auc = curveByShot.length > 0 ? curveByShot.reduce((sum, v) => sum + v, 0) / curveByShot.length : 0;
    // With per-task targets there is no single threshold to report, so the row
    // carries NaN and each task contributes its only hit entry.
    const summaryTargets = opts?.targetFor ? [Number.NaN] : targets;
    for (const t of summaryTargets) {
      const key = (taskCurve) => opts?.targetFor ? Object.values(taskCurve.hits)[0] ?? null : taskCurve.hits[String(t)] ?? null;
      const reached = tasks.filter((x) => key(x) !== null);
      perStrategy.push({
        strategy,
        target: t,
        tasks: tasks.length,
        reachedTarget: reached.length,
        medianTttMs: median(reached.map((x) => key(x).ms)),
        medianShotsToTarget: median(reached.map((x) => key(x).shots)),
        // Total effort of ALL tasks divided by the number of successes.
        ertMs: reached.length > 0 ? totalMs / reached.length : null,
        erUsd: reached.length > 0 ? totalUsd / reached.length : null,
        curveByShot,
        auc
      });
    }
  }
  perStrategy.sort((a, b) => a.strategy.localeCompare(b.strategy) || a.target - b.target);
  return { targets, perTask, perStrategy };
}
|
|
511
|
+
/**
 * Render an anytime report as a plain-text table, one row per
 * (strategy, target) entry of `report.perStrategy`.
 *
 * @param report - object with `targets` (array) and `perStrategy` rows as
 *   produced by `anytimeReport`.
 * @returns {string} a title line, a column-header line, then one line per row,
 *   joined with "\n". Missing metrics (`null`) are shown as an em dash; a NaN
 *   target is shown as "task".
 */
function renderAnytimeTable(report) {
  const glyphs = "\u2581\u2582\u2583\u2584\u2585\u2586\u2587\u2588";
  // Map a curve value to one of eight bar heights. The index is clamped on BOTH
  // sides: a negative or NaN value would otherwise index outside the string and
  // the resulting `undefined` would be silently dropped by join(), shortening
  // the sparkline.
  const glyphFor = (v) => {
    const level = Number.isNaN(v) ? 0 : Math.floor(v * 8);
    return glyphs[Math.max(0, Math.min(7, level))];
  };
  const lines = [
    `anytime metrics \xB7 satisficing targets [${report.targets.join(", ")}] \xB7 ERT = \u03A3 all wall-time / #successes (COCO)`,
    "strategy \u2265tgt reach med-TTT med-shots ERT(all-in) $/success AUC curve"
  ];
  for (const s of report.perStrategy) {
    const curve = s.curveByShot.map(glyphFor).join("");
    const tgt = Number.isNaN(s.target) ? "task" : s.target.toFixed(2);
    const cells = [
      s.strategy.padEnd(19),
      tgt.padStart(4),
      `${String(s.reachedTarget).padStart(4)}/${String(s.tasks).padEnd(3)}`,
      s.medianTttMs === null ? " \u2014" : `${(s.medianTttMs / 1e3).toFixed(1).padStart(6)}s`,
      s.medianShotsToTarget === null ? " \u2014" : String(s.medianShotsToTarget).padStart(5),
      s.ertMs === null ? " \u2014" : `${(s.ertMs / 1e3).toFixed(1).padStart(9)}s`,
      s.erUsd === null ? " \u2014" : `$${s.erUsd.toFixed(4)}`,
      s.auc.toFixed(2),
      curve
    ];
    lines.push(cells.join(" "));
  }
  return lines.join("\n");
}
|
|
525
|
+
|
|
429
526
|
// src/runtime/audit-intent.ts
|
|
430
527
|
var defaultAuditorInstruction = "You audit whether an AI agent is on the RIGHT ROUTE \u2014 not whether it works hard, but whether its actions serve the stated intents. Infer the REVEALED intent from the action pattern (what the trajectory is actually optimizing). Compare against the declared task intent, the user intent when given, and the meta-intent when given. Flawless execution down the wrong route is DIVERGED. Busy-work that neither advances nor harms is DRIFTING. Judge only from the trajectory \u2014 be specific about which actions ground your verdict. Recommend abort only when continuing cannot serve the intent.";
|
|
431
528
|
function summarize(trace, maxLines) {
|
|
@@ -2346,20 +2443,20 @@ async function finalizeSettlement(child, settlement, seq, args, now) {
|
|
|
2346
2443
|
}
|
|
2347
2444
|
async function runChild(live, executor, childAbort, task, opts, pool2, ticket, blobs) {
|
|
2348
2445
|
let reconciled = false;
|
|
2349
|
-
const reconcileOnce = (
|
|
2446
|
+
const reconcileOnce = (spend) => {
|
|
2350
2447
|
if (reconciled) return;
|
|
2351
2448
|
reconciled = true;
|
|
2352
|
-
pool2.reconcile(ticket, clampSpend(
|
|
2449
|
+
pool2.reconcile(ticket, clampSpend(spend, opts.budget));
|
|
2353
2450
|
};
|
|
2354
2451
|
try {
|
|
2355
2452
|
live.status = "running";
|
|
2356
2453
|
const ran = executor.execute(task, childAbort.signal);
|
|
2357
2454
|
let artifact;
|
|
2358
2455
|
if (isAsyncIterable2(ran)) {
|
|
2359
|
-
const
|
|
2360
|
-
live.spent =
|
|
2456
|
+
const spend = await foldStream(ran);
|
|
2457
|
+
live.spent = spend;
|
|
2361
2458
|
artifact = executor.resultArtifact();
|
|
2362
|
-
reconcileOnce(
|
|
2459
|
+
reconcileOnce(spend);
|
|
2363
2460
|
} else {
|
|
2364
2461
|
const terminal = await ran;
|
|
2365
2462
|
live.spent = terminal.spent;
|
|
@@ -2448,21 +2545,21 @@ async function foldStream(stream) {
|
|
|
2448
2545
|
}
|
|
2449
2546
|
return { iterations, tokens, usd, ms: 0 };
|
|
2450
2547
|
}
|
|
2451
|
-
/**
 * Clamp a reported spend so it never exceeds the given budget.
 *
 * @param spend - `{ iterations, tokens: { input, output }, usd, ms }`.
 * @param budget - `{ maxTokens, maxIterations, maxUsd? }`; an undefined
 *   `maxUsd` means USD is not capped.
 * @returns the very same `spend` object when it is within budget on every
 *   capped channel; otherwise a new object in which iterations and USD are
 *   capped and, when the token total is over budget, input and output tokens
 *   are scaled down proportionally (rounded down). `ms` is passed through.
 */
function clampSpend(spend, budget) {
  const totalTokens2 = spend.tokens.input + spend.tokens.output;
  const tokensOk = totalTokens2 <= budget.maxTokens;
  const itersOk = spend.iterations <= budget.maxIterations;
  const usdOk = budget.maxUsd === void 0 || spend.usd <= budget.maxUsd;
  if (tokensOk && itersOk && usdOk) return spend;
  // Scale factor for the token split; 1 means "leave tokens untouched".
  const ratio = !tokensOk && totalTokens2 > 0 ? budget.maxTokens / totalTokens2 : 1;
  return {
    iterations: Math.min(spend.iterations, budget.maxIterations),
    tokens: ratio < 1 ? {
      input: Math.floor(spend.tokens.input * ratio),
      output: Math.floor(spend.tokens.output * ratio)
    } : spend.tokens,
    usd: budget.maxUsd === void 0 ? spend.usd : Math.min(spend.usd, budget.maxUsd),
    ms: spend.ms
  };
}
|
|
2468
2565
|
async function teardownSafe(executor, grace) {
|
|
@@ -3012,7 +3109,138 @@ function isNoEntError2(err) {
|
|
|
3012
3109
|
|
|
3013
3110
|
// src/runtime/supervise/runtime.ts
|
|
3014
3111
|
import { spawn } from "child_process";
|
|
3112
|
+
import { estimateCost as estimateCost2, isModelPriced as isModelPriced2 } from "@tangle-network/agent-eval";
|
|
3113
|
+
|
|
3114
|
+
// src/runtime/router-client.ts
|
|
3015
3115
|
import { estimateCost, isModelPriced } from "@tangle-network/agent-eval";
|
|
3116
|
+
/**
 * POST a chat-completions request to the router and return the parsed result.
 *
 * Makes up to five attempts:
 *  - a 400 whose body mentions "temperature" is retried once immediately with
 *    `temperature: 1`;
 *  - the statuses 408/425/429/500/502/503/504/520/522/524 are retried after an
 *    exponential back-off (800 ms, doubling each attempt);
 *  - any other non-OK status throws at once.
 *
 * @param cfg - `{ routerBaseUrl, routerKey, model }`.
 * @param messages - chat messages, sent as-is.
 * @param opts - optional `{ temperature?, maxTokens?, signal? }`; defaults are
 *   temperature 0.2 and max_tokens 8192.
 * @returns whatever `parseChatResult` produces for the response JSON.
 * @throws {Error} `router <status>: <first 200 chars of body>`, suffixed with
 *   " (exhausted retries)" when every attempt failed with a retryable status.
 *   If `opts.signal` aborts during a back-off wait, rejects immediately with
 *   the signal's abort reason.
 */
async function routerChatWithUsage(cfg, messages, opts) {
  const url = `${cfg.routerBaseUrl.replace(/\/$/, "")}/chat/completions`;
  const headers = { "content-type": "application/json", authorization: `Bearer ${cfg.routerKey}` };
  const retryableStatuses = [408, 425, 429, 500, 502, 503, 504, 520, 522, 524];
  const maxAttempts = 5;
  const signal = opts?.signal;
  // Back-off wait that honours the caller's abort signal, so a cancelled
  // request does not sit out the rest of a multi-second sleep.
  const backoff = (ms) => new Promise((resolve, reject) => {
    if (!signal) {
      setTimeout(resolve, ms);
      return;
    }
    if (signal.aborted) {
      reject(signal.reason ?? new Error("aborted"));
      return;
    }
    const onAbort = () => {
      clearTimeout(timer);
      reject(signal.reason ?? new Error("aborted"));
    };
    const timer = setTimeout(() => {
      signal.removeEventListener("abort", onAbort);
      resolve();
    }, ms);
    signal.addEventListener("abort", onAbort, { once: true });
  });
  let temperature = opts?.temperature ?? 0.2;
  let lastErr = "";
  for (let attempt = 0; attempt < maxAttempts; attempt += 1) {
    const res = await fetch(url, {
      method: "POST",
      headers,
      // max_tokens default is generous: THINKING models (kimi-k2.6) spend the budget on
      // reasoning_content first — a small router default yields EMPTY content.
      body: JSON.stringify({
        model: cfg.model,
        messages,
        temperature,
        max_tokens: opts?.maxTokens ?? 8192
      }),
      ...signal ? { signal } : {}
    });
    if (res.ok) return parseChatResult(await res.json(), cfg.model);
    const status = res.status;
    const text = (await res.text()).slice(0, 200);
    lastErr = `router ${status}: ${text}`;
    // Some models only accept temperature 1; retry once with that value.
    if (status === 400 && /temperature/i.test(text) && temperature !== 1) {
      temperature = 1;
      continue;
    }
    if (!retryableStatuses.includes(status)) throw new Error(lastErr);
    // No point sleeping after the final attempt.
    if (attempt < maxAttempts - 1) await backoff(800 * 2 ** attempt);
  }
  throw new Error(`${lastErr} (exhausted retries)`);
}
|
|
3149
|
+
/**
 * Normalise a chat-completions response body.
 *
 * @param json - parsed response body; `null`/`undefined` is treated as an
 *   empty response instead of throwing.
 * @param model - model id, used only to look up pricing.
 * @returns `{ content, usage?, costUsd? }`:
 *   - `content` is the first choice's message content, or "" when absent;
 *   - `usage` (`{ input, output }`) is present only when the body reports both
 *     `prompt_tokens` and `completion_tokens` as numbers;
 *   - `costUsd` is present only when usage is known and `isModelPriced(model)`
 *     is truthy.
 */
function parseChatResult(json, model) {
  const data = json ?? {};
  const u = data.usage;
  const hasUsage = Boolean(u) && typeof u.prompt_tokens === "number" && typeof u.completion_tokens === "number";
  const usage = hasUsage ? { input: u.prompt_tokens, output: u.completion_tokens } : void 0;
  const costUsd = usage && isModelPriced(model) ? estimateCost(usage.input, usage.output, model) : void 0;
  return {
    content: data.choices?.[0]?.message?.content ?? "",
    ...usage ? { usage } : {},
    ...costUsd !== void 0 ? { costUsd } : {}
  };
}
|
|
3160
|
+
/**
 * Single (non-retrying) chat-completions call with tool definitions.
 *
 * @param cfg - `{ routerBaseUrl, routerKey, model }`.
 * @param messages - chat messages, sent as-is.
 * @param tools - tool definitions, sent as-is.
 * @param opts - optional `{ toolChoice?, temperature?, maxTokens?, signal? }`;
 *   defaults are tool_choice "auto" and temperature 0.3. `max_tokens` is only
 *   sent when `maxTokens` is provided.
 * @returns `{ content, toolCalls, usage?, costUsd? }`. `content` is `null` when
 *   the message has none. Each tool call is `{ id, name, arguments }` where
 *   `arguments` is always a JSON string: a missing or blank value becomes "{}"
 *   and a non-string value is serialised, so callers can safely `JSON.parse`
 *   and slice it. `usage`/`costUsd` follow the same rules as `parseChatResult`.
 * @throws {Error} `router <status>: <first 200 chars of body>` on a non-OK response.
 */
async function routerChatWithTools(cfg, messages, tools, opts) {
  // Providers are not uniform here: some send "" for zero-argument calls and
  // some send an already-parsed object instead of a JSON string.
  const toJsonArgs = (raw) => {
    if (raw === void 0 || raw === null) return "{}";
    if (typeof raw !== "string") return JSON.stringify(raw);
    return raw.trim() === "" ? "{}" : raw;
  };
  const res = await fetch(`${cfg.routerBaseUrl.replace(/\/$/, "")}/chat/completions`, {
    method: "POST",
    headers: { "content-type": "application/json", authorization: `Bearer ${cfg.routerKey}` },
    body: JSON.stringify({
      model: cfg.model,
      messages,
      tools,
      tool_choice: opts?.toolChoice ?? "auto",
      temperature: opts?.temperature ?? 0.3,
      ...opts?.maxTokens !== void 0 ? { max_tokens: opts.maxTokens } : {}
    }),
    ...opts?.signal ? { signal: opts.signal } : {}
  });
  if (!res.ok) throw new Error(`router ${res.status}: ${(await res.text()).slice(0, 200)}`);
  const data = await res.json();
  const msg = data.choices?.[0]?.message;
  const toolCalls = (msg?.tool_calls ?? []).map((tc, i) => ({
    // Synthesise an id when the provider omits one so replies can be matched.
    id: tc.id ?? `call_${i}`,
    name: tc.function?.name ?? "",
    arguments: toJsonArgs(tc.function?.arguments)
  }));
  const u = data.usage;
  const usage = u && typeof u.prompt_tokens === "number" && typeof u.completion_tokens === "number" ? { input: u.prompt_tokens, output: u.completion_tokens } : void 0;
  const costUsd = usage && isModelPriced(cfg.model) ? estimateCost(usage.input, usage.output, cfg.model) : void 0;
  return {
    content: msg?.content ?? null,
    toolCalls,
    ...usage ? { usage } : {},
    ...costUsd !== void 0 ? { costUsd } : {}
  };
}
|
|
3191
|
+
/**
 * Drive a model/tool conversation until the model stops calling tools or the
 * turn limit is reached.
 *
 * Each turn calls `routerChatWithTools`; every returned tool call is executed
 * in order via `execute(name, args)` and its result is appended as a "tool"
 * message. Arguments that are not valid JSON, or that parse to something other
 * than a plain object, are NOT executed: an error message is sent back to the
 * model for that call instead (it still counts towards `toolCalls`).
 *
 * @param cfg - router configuration forwarded to `routerChatWithTools`.
 * @param system - system prompt.
 * @param user - first user message.
 * @param tools - tool definitions forwarded to the router.
 * @param execute - `async (name, args) => result`; the result is used directly
 *   as the tool message content.
 * @param opts - optional `{ maxTurns?, temperature?, signal? }`; maxTurns
 *   defaults to 4.
 * @returns `{ final, turns, toolCalls, toolTrace, usage }` — `final` is the last
 *   non-empty assistant text, `turns` the number of model calls made (equal to
 *   `maxTurns` when the limit ended the loop), `toolTrace` one
 *   `{ name, args, result }` per executed call (`args` is the raw JSON string),
 *   and `usage` the summed `{ input, output }` tokens.
 */
async function routerToolLoop(cfg, system, user, tools, execute, opts) {
  const maxTurns = opts?.maxTurns ?? 4;
  const messages = [
    { role: "system", content: system },
    { role: "user", content: user }
  ];
  let toolCalls = 0;
  let lastText = "";
  const usage = { input: 0, output: 0 };
  const toolTrace = [];
  // Parse a call's raw argument string into `{ value }` or `{ error }`.
  const parseArgs = (raw) => {
    let value;
    try {
      value = JSON.parse(raw);
    } catch {
      return { error: `error: arguments were not valid JSON: ${raw.slice(0, 200)}` };
    }
    // JSON such as null, 42 or [] parses fine but is not an argument object;
    // handing it to the executor would fail on its first property access.
    if (value === null || typeof value !== "object" || Array.isArray(value)) {
      return { error: `error: arguments must be a JSON object: ${raw.slice(0, 200)}` };
    }
    return { value };
  };
  for (let turn = 1; turn <= maxTurns; turn += 1) {
    const r = await routerChatWithTools(cfg, messages, tools, {
      ...opts?.temperature !== void 0 ? { temperature: opts.temperature } : {},
      ...opts?.signal ? { signal: opts.signal } : {}
    });
    if (r.usage) {
      usage.input += r.usage.input;
      usage.output += r.usage.output;
    }
    if (r.content) lastText = r.content;
    // No tool calls means the model has given its final answer.
    if (r.toolCalls.length === 0) {
      return { final: lastText, turns: turn, toolCalls, toolTrace, usage };
    }
    // Echo the assistant turn back so the tool results have a call to attach to.
    messages.push({
      role: "assistant",
      content: r.content ?? "",
      tool_calls: r.toolCalls.map((tc) => ({
        id: tc.id,
        type: "function",
        function: { name: tc.name, arguments: tc.arguments }
      }))
    });
    for (const tc of r.toolCalls) {
      toolCalls += 1;
      const parsed = parseArgs(tc.arguments);
      if (parsed.error !== void 0) {
        messages.push({ role: "tool", tool_call_id: tc.id, content: parsed.error });
        continue;
      }
      const out = await execute(tc.name, parsed.value);
      messages.push({ role: "tool", tool_call_id: tc.id, content: out });
      toolTrace.push({ name: tc.name, args: tc.arguments, result: out });
    }
  }
  return { final: lastText, turns: maxTurns, toolCalls, toolTrace, usage };
}
|
|
3242
|
+
|
|
3243
|
+
// src/runtime/supervise/runtime.ts
|
|
3016
3244
|
var routerSeamKey = "router";
|
|
3017
3245
|
var sandboxSeamKey = "sandbox";
|
|
3018
3246
|
var cliSeamKey = "cli";
|
|
@@ -3058,30 +3286,19 @@ var routerInlineExecutor = (spec, ctx) => {
|
|
|
3058
3286
|
const messages = taskToMessages(task, spec);
|
|
3059
3287
|
const started = Date.now();
|
|
3060
3288
|
const linked = linkSignals(signal, controller.signal);
|
|
3061
|
-
const
|
|
3062
|
-
|
|
3063
|
-
|
|
3064
|
-
|
|
3065
|
-
|
|
3066
|
-
});
|
|
3067
|
-
if (!res.ok) {
|
|
3068
|
-
throw new ValidationError(
|
|
3069
|
-
`routerInlineExecutor: router ${res.status}: ${(await res.text()).slice(0, 200)}`
|
|
3070
|
-
);
|
|
3071
|
-
}
|
|
3072
|
-
const data = await res.json();
|
|
3073
|
-
const u = data.usage;
|
|
3074
|
-
const usage = u && typeof u.prompt_tokens === "number" && typeof u.completion_tokens === "number" ? { input: u.prompt_tokens, output: u.completion_tokens } : void 0;
|
|
3075
|
-
const usd = usage && isModelPriced(model) ? estimateCost(usage.input, usage.output, model) : 0;
|
|
3076
|
-
const content = data.choices?.[0]?.message?.content ?? "";
|
|
3289
|
+
const r = await routerChatWithUsage(
|
|
3290
|
+
{ routerBaseUrl: seam.routerBaseUrl, routerKey: seam.routerKey, model },
|
|
3291
|
+
messages,
|
|
3292
|
+
linked ? { signal: linked } : {}
|
|
3293
|
+
);
|
|
3077
3294
|
const spent = {
|
|
3078
3295
|
iterations: 1,
|
|
3079
|
-
tokens: usage ? { input: usage.input, output: usage.output } : zeroTokenUsage(),
|
|
3080
|
-
usd,
|
|
3296
|
+
tokens: r.usage ? { input: r.usage.input, output: r.usage.output } : zeroTokenUsage(),
|
|
3297
|
+
usd: r.costUsd ?? 0,
|
|
3081
3298
|
ms: Date.now() - started
|
|
3082
3299
|
};
|
|
3083
|
-
const out = { content };
|
|
3084
|
-
artifact = { outRef: contentRef("router", { model, content }), out, spent };
|
|
3300
|
+
const out = { content: r.content };
|
|
3301
|
+
artifact = { outRef: contentRef("router", { model, content: r.content }), out, spent };
|
|
3085
3302
|
return artifact;
|
|
3086
3303
|
},
|
|
3087
3304
|
teardown(_grace) {
|
|
@@ -3110,7 +3327,7 @@ var routerToolsInlineExecutor = (spec, ctx) => {
|
|
|
3110
3327
|
"routerToolsInlineExecutor: RouterToolsSeam.routerBaseUrl + routerKey required"
|
|
3111
3328
|
);
|
|
3112
3329
|
}
|
|
3113
|
-
const maxTurns = seam.maxTurns ??
|
|
3330
|
+
const maxTurns = seam.maxTurns ?? 200;
|
|
3114
3331
|
const controller = new AbortController();
|
|
3115
3332
|
const abortIfSignalled = () => {
|
|
3116
3333
|
if (ctx.signal.aborted) controller.abort();
|
|
@@ -3188,7 +3405,7 @@ var routerToolsInlineExecutor = (spec, ctx) => {
|
|
|
3188
3405
|
messages.push({ role: "tool", tool_call_id: id, content: result });
|
|
3189
3406
|
}
|
|
3190
3407
|
}
|
|
3191
|
-
const usd =
|
|
3408
|
+
const usd = isModelPriced2(model) ? estimateCost2(tokens.input, tokens.output, model) : 0;
|
|
3192
3409
|
const spent = { iterations: turns, tokens, usd, ms: Date.now() - started };
|
|
3193
3410
|
const out = { content: lastText };
|
|
3194
3411
|
artifact = { outRef: contentRef("router-tools", { model, content: lastText }), out, spent };
|
|
@@ -4228,12 +4445,12 @@ function countStatuses(reported) {
|
|
|
4228
4445
|
/**
 * Create a fresh, empty spend record.
 *
 * @returns `{ iterations: 0, tokens, usd: 0, ms: 0 }` where `tokens` is
 *   whatever `zeroTokenUsage()` returns. A new object is built on every call.
 */
function zeroSpend4() {
  return { iterations: 0, tokens: zeroTokenUsage(), usd: 0, ms: 0 };
}
|
|
4231
|
-
/**
 * Copy a spend record so the copy can be mutated independently.
 *
 * @param spend - `{ iterations, tokens: { input, output }, usd, ms }`.
 * @returns a new object with the same values; `tokens` is a new object too,
 *   so changing the copy's token counts does not touch the original.
 */
function cloneSpend(spend) {
  const { iterations, tokens, usd, ms } = spend;
  return {
    iterations,
    tokens: { input: tokens.input, output: tokens.output },
    usd,
    ms
  };
}
|
|
4239
4456
|
function addSpend(acc, delta) {
|
|
@@ -4249,13 +4466,13 @@ function spreadOf(values) {
|
|
|
4249
4466
|
/**
 * Express the spread of `values` as a fraction of their median.
 *
 * @param values - numbers, forwarded to `spreadOf` and `medianOf`.
 * @returns {number} 0 when the spread is 0 (the median is not consulted in
 *   that case), otherwise `spreadOf(values) / medianOf(values)`.
 * @throws {Error} when the spread is non-zero but the median is 0, because the
 *   ratio would be undefined.
 */
function fractionalSpread(values) {
  const spread = spreadOf(values);
  if (spread === 0) return 0;
  const center = medianOf(values);
  if (center === 0) {
    throw new Error(
      "equalKOnCost: arms have a non-zero cost spread on a zero-median channel; cannot express it as a fraction"
    );
  }
  return spread / center;
}
|
|
4260
4477
|
function medianOf(values) {
|
|
4261
4478
|
if (values.length === 0) {
|
|
@@ -4287,28 +4504,34 @@ function requireNode2(nodes, id, root) {
|
|
|
4287
4504
|
return node;
|
|
4288
4505
|
}
|
|
4289
4506
|
/**
 * Look up the rolled-up spend for a node, failing loudly when it is missing.
 *
 * @param rolled - Map-like object (anything with `get(id)`) from node id to spend.
 * @param id - node id to look up.
 * @param root - id of the tree's root; used only in the error message.
 * @returns the stored spend for `id`.
 * @throws {Error} when `rolled` has no (truthy) entry for `id`.
 */
function requireSpend(rolled, id, root) {
  const spend = rolled.get(id);
  if (spend) return spend;
  throw new Error(
    `trajectoryReport: node '${id}' was never rolled up in tree '${root}' (unreachable from root)`
  );
}
|
|
4298
4515
|
|
|
4299
4516
|
// src/runtime/promotion-gate.ts
|
|
4300
4517
|
import { heldoutSignificance } from "@tangle-network/agent-eval/campaign";
|
|
4301
4518
|
function promotionGate(opts) {
|
|
4519
|
+
const mode = opts.mode ?? "superiority";
|
|
4302
4520
|
if (opts.candidate === opts.incumbent) {
|
|
4303
4521
|
return {
|
|
4304
4522
|
promoted: false,
|
|
4305
4523
|
reason: "identical-champion",
|
|
4524
|
+
mode,
|
|
4306
4525
|
n: 0,
|
|
4307
4526
|
lift: { mean: 0, median: 0, low: 0, high: 0 }
|
|
4308
4527
|
};
|
|
4309
4528
|
}
|
|
4310
4529
|
const before = [];
|
|
4311
4530
|
const after = [];
|
|
4531
|
+
const incUsd = [];
|
|
4532
|
+
const candUsd = [];
|
|
4533
|
+
const incMs = [];
|
|
4534
|
+
const candMs = [];
|
|
4312
4535
|
const cellIds = [];
|
|
4313
4536
|
for (const row of opts.report.perTask) {
|
|
4314
4537
|
const inc = row.cells?.[opts.incumbent];
|
|
@@ -4316,6 +4539,10 @@ function promotionGate(opts) {
|
|
|
4316
4539
|
if (!inc || !cand) continue;
|
|
4317
4540
|
before.push(inc.score);
|
|
4318
4541
|
after.push(cand.score);
|
|
4542
|
+
incUsd.push(inc.usd);
|
|
4543
|
+
candUsd.push(cand.usd);
|
|
4544
|
+
incMs.push(inc.ms);
|
|
4545
|
+
candMs.push(cand.ms);
|
|
4319
4546
|
cellIds.push(row.taskId);
|
|
4320
4547
|
}
|
|
4321
4548
|
if (before.length === 0) {
|
|
@@ -4339,15 +4566,91 @@ function promotionGate(opts) {
|
|
|
4339
4566
|
low: sig.bootstrap.low,
|
|
4340
4567
|
high: sig.bootstrap.high
|
|
4341
4568
|
};
|
|
4342
|
-
|
|
4343
|
-
|
|
4569
|
+
const latSig = heldoutSignificance(
|
|
4570
|
+
{ before: incMs, after: candMs, cellIds },
|
|
4571
|
+
{
|
|
4572
|
+
deltaThreshold: 0,
|
|
4573
|
+
minProductiveRuns: 1,
|
|
4574
|
+
statistic: opts.statistic ?? "mean",
|
|
4575
|
+
...opts.seed !== void 0 ? { seed: opts.seed } : {},
|
|
4576
|
+
...opts.resamples !== void 0 ? { resamples: opts.resamples } : {}
|
|
4577
|
+
}
|
|
4578
|
+
);
|
|
4579
|
+
const latency = {
|
|
4580
|
+
mean: latSig.bootstrap.mean,
|
|
4581
|
+
median: latSig.bootstrap.median,
|
|
4582
|
+
low: latSig.bootstrap.low,
|
|
4583
|
+
high: latSig.bootstrap.high
|
|
4584
|
+
};
|
|
4585
|
+
if (mode === "superiority") {
|
|
4586
|
+
if (sig.fewRuns) return { promoted: false, reason: "few-tasks", mode, n: sig.n, lift, latency };
|
|
4587
|
+
return sig.significant ? { promoted: true, reason: "significant", mode, n: sig.n, lift, latency } : { promoted: false, reason: "no-margin", mode, n: sig.n, lift, latency };
|
|
4588
|
+
}
|
|
4589
|
+
const tolerance = opts.scoreTolerance ?? 0.05;
|
|
4590
|
+
const scoreSig = heldoutSignificance(
|
|
4591
|
+
{ before, after, cellIds },
|
|
4592
|
+
{
|
|
4593
|
+
deltaThreshold: -tolerance,
|
|
4594
|
+
minProductiveRuns: opts.minPairedTasks ?? 6,
|
|
4595
|
+
statistic: opts.statistic ?? "mean",
|
|
4596
|
+
...opts.seed !== void 0 ? { seed: opts.seed } : {},
|
|
4597
|
+
...opts.resamples !== void 0 ? { resamples: opts.resamples } : {}
|
|
4598
|
+
}
|
|
4599
|
+
);
|
|
4600
|
+
const costSig = heldoutSignificance(
|
|
4601
|
+
{ before: candUsd, after: incUsd, cellIds },
|
|
4602
|
+
{
|
|
4603
|
+
deltaThreshold: 0,
|
|
4604
|
+
minProductiveRuns: opts.minPairedTasks ?? 6,
|
|
4605
|
+
statistic: opts.statistic ?? "mean",
|
|
4606
|
+
...opts.seed !== void 0 ? { seed: opts.seed } : {},
|
|
4607
|
+
...opts.resamples !== void 0 ? { resamples: opts.resamples } : {}
|
|
4608
|
+
}
|
|
4609
|
+
);
|
|
4610
|
+
const costSavings = {
|
|
4611
|
+
mean: costSig.bootstrap.mean,
|
|
4612
|
+
median: costSig.bootstrap.median,
|
|
4613
|
+
low: costSig.bootstrap.low,
|
|
4614
|
+
high: costSig.bootstrap.high
|
|
4615
|
+
};
|
|
4616
|
+
if (scoreSig.fewRuns)
|
|
4617
|
+
return { promoted: false, reason: "few-tasks", mode, n: scoreSig.n, lift, costSavings, latency };
|
|
4618
|
+
if (!scoreSig.significant)
|
|
4619
|
+
return {
|
|
4620
|
+
promoted: false,
|
|
4621
|
+
reason: "non-inferiority-unproven",
|
|
4622
|
+
mode,
|
|
4623
|
+
n: scoreSig.n,
|
|
4624
|
+
lift,
|
|
4625
|
+
costSavings,
|
|
4626
|
+
latency
|
|
4627
|
+
};
|
|
4628
|
+
if (!costSig.significant)
|
|
4629
|
+
return {
|
|
4630
|
+
promoted: false,
|
|
4631
|
+
reason: "not-cheaper",
|
|
4632
|
+
mode,
|
|
4633
|
+
n: scoreSig.n,
|
|
4634
|
+
lift,
|
|
4635
|
+
costSavings,
|
|
4636
|
+
latency
|
|
4637
|
+
};
|
|
4638
|
+
return {
|
|
4639
|
+
promoted: true,
|
|
4640
|
+
reason: "non-inferior-and-cheaper",
|
|
4641
|
+
mode,
|
|
4642
|
+
n: scoreSig.n,
|
|
4643
|
+
lift,
|
|
4644
|
+
costSavings,
|
|
4645
|
+
latency
|
|
4646
|
+
};
|
|
4344
4647
|
}
|
|
4345
4648
|
|
|
4346
4649
|
// src/runtime/run-benchmark.ts
|
|
4347
4650
|
import { pairedBootstrap, paretoFrontier } from "@tangle-network/agent-eval";
|
|
4348
4651
|
|
|
4349
4652
|
// src/runtime/strategy.ts
|
|
4350
|
-
import { createChatClient, estimateCost as
|
|
4653
|
+
import { createChatClient, estimateCost as estimateCost3, isModelPriced as isModelPriced3 } from "@tangle-network/agent-eval";
|
|
4351
4654
|
var taskNudge = "Use the available tools to bring the artifact to the required final state. Address EVERY distinct change the request implies. After each tool result, check what remains and continue. Re-read the values you set to confirm they took. Reply DONE only once every required change is made and verified.";
|
|
4352
4655
|
async function runShot(surface, _task, handle, tools, messages, opts, modelOverride) {
|
|
4353
4656
|
const innerTurns = opts.innerTurns ?? 4;
|
|
@@ -4364,7 +4667,8 @@ async function runShot(surface, _task, handle, tools, messages, opts, modelOverr
|
|
|
4364
4667
|
messages,
|
|
4365
4668
|
tools,
|
|
4366
4669
|
tool_choice: "auto",
|
|
4367
|
-
temperature: opts.temperature ?? 0.7
|
|
4670
|
+
temperature: opts.temperature ?? 0.7,
|
|
4671
|
+
...opts.maxTokens ? { max_tokens: opts.maxTokens } : {}
|
|
4368
4672
|
})
|
|
4369
4673
|
});
|
|
4370
4674
|
if (!res.ok) throw new Error(`router ${res.status}: ${(await res.text()).slice(0, 200)}`);
|
|
@@ -4403,12 +4707,15 @@ async function runShot(surface, _task, handle, tools, messages, opts, modelOverr
|
|
|
4403
4707
|
}
|
|
4404
4708
|
return { messages, completions, toolCalls, toolErrors, tokens };
|
|
4405
4709
|
}
|
|
4406
|
-
|
|
4407
|
-
|
|
4710
|
+
/**
 * Condense a chat transcript into a short text trajectory, one line per
 * assistant or tool message (all other roles are skipped):
 *   - tool message            -> "RESULT <first 280 chars of content>"
 *   - assistant with calls    -> "CALL name(args), name(args)"
 *   - assistant without calls -> "SAY <first 200 chars of content>"
 * Missing content (`null`/`undefined`) is rendered as empty text rather than
 * the literal words "null"/"undefined".
 *
 * @param messages - chat messages (`role`, `content`, optional `tool_calls`).
 * @returns {string} the lines joined with "\n", truncated to 7000 characters.
 */
function compactTrajectory(messages) {
  const textOf = (m) => String(m.content ?? "");
  return messages.filter((m) => m.role === "assistant" || m.role === "tool").map((m) => {
    if (m.role === "tool") return `RESULT ${textOf(m).slice(0, 280)}`;
    const calls = m.tool_calls?.map((c) => `${c.function.name}(${c.function.arguments})`).join(", ");
    // An empty tool_calls array joins to "", which falls through to SAY.
    return calls ? `CALL ${calls}` : `SAY ${textOf(m).slice(0, 200)}`;
  }).join("\n").slice(0, 7e3);
}
|
|
4717
|
+
async function consultAnalyst(task, messages, instruction, opts) {
|
|
4718
|
+
const trajectory = compactTrajectory(messages);
|
|
4412
4719
|
const analystModel = opts.analystModel ?? opts.model;
|
|
4413
4720
|
const chat = createChatClient({
|
|
4414
4721
|
transport: "router",
|
|
@@ -4416,6 +4723,52 @@ async function analyze(task, messages, opts) {
|
|
|
4416
4723
|
baseUrl: opts.routerBaseUrl,
|
|
4417
4724
|
defaultModel: analystModel
|
|
4418
4725
|
});
|
|
4726
|
+
const res = await chat.chat({
|
|
4727
|
+
model: analystModel,
|
|
4728
|
+
temperature: 0.2,
|
|
4729
|
+
maxTokens: 1024,
|
|
4730
|
+
messages: [
|
|
4731
|
+
{ role: "system", content: instruction },
|
|
4732
|
+
{
|
|
4733
|
+
role: "user",
|
|
4734
|
+
content: `TASK: ${task.userPrompt.slice(0, 1500)}
|
|
4735
|
+
|
|
4736
|
+
TRAJECTORY:
|
|
4737
|
+
${trajectory}`
|
|
4738
|
+
}
|
|
4739
|
+
]
|
|
4740
|
+
});
|
|
4741
|
+
const usage = res.usage;
|
|
4742
|
+
return {
|
|
4743
|
+
steer: res.content.trim(),
|
|
4744
|
+
tokens: {
|
|
4745
|
+
input: usage?.promptTokens ?? usage?.prompt_tokens ?? 0,
|
|
4746
|
+
output: usage?.completionTokens ?? usage?.completion_tokens ?? 0
|
|
4747
|
+
}
|
|
4748
|
+
};
|
|
4749
|
+
}
|
|
4750
|
+
async function analyze(task, messages, opts) {
|
|
4751
|
+
const trajectory = compactTrajectory(messages);
|
|
4752
|
+
const analystModel = opts.analystModel ?? opts.model;
|
|
4753
|
+
const inner = createChatClient({
|
|
4754
|
+
transport: "router",
|
|
4755
|
+
apiKey: opts.routerKey,
|
|
4756
|
+
baseUrl: opts.routerBaseUrl,
|
|
4757
|
+
defaultModel: analystModel
|
|
4758
|
+
});
|
|
4759
|
+
const tokens = { input: 0, output: 0 };
|
|
4760
|
+
const chat = {
|
|
4761
|
+
...inner,
|
|
4762
|
+
chat: async (req, callOpts) => {
|
|
4763
|
+
const res = await inner.chat(req, callOpts);
|
|
4764
|
+
const u = res.usage;
|
|
4765
|
+
if (u) {
|
|
4766
|
+
tokens.input += u.promptTokens ?? u.prompt_tokens ?? 0;
|
|
4767
|
+
tokens.output += u.completionTokens ?? u.completion_tokens ?? 0;
|
|
4768
|
+
}
|
|
4769
|
+
return res;
|
|
4770
|
+
}
|
|
4771
|
+
};
|
|
4419
4772
|
const obs = await observe(
|
|
4420
4773
|
{
|
|
4421
4774
|
task: task.userPrompt,
|
|
@@ -4432,14 +4785,8 @@ async function analyze(task, messages, opts) {
|
|
|
4432
4785
|
}
|
|
4433
4786
|
);
|
|
4434
4787
|
const steer = obs.findings.map((f) => f.recommended_action).filter((a) => typeof a === "string" && a.trim().length > 0).join("\n").trim();
|
|
4435
|
-
return steer || "COMPLETE";
|
|
4788
|
+
return { steer: steer || "COMPLETE", tokens };
|
|
4436
4789
|
}
|
|
4437
|
-
var spend = (iterations) => ({
|
|
4438
|
-
iterations,
|
|
4439
|
-
tokens: { input: 0, output: 0 },
|
|
4440
|
-
usd: 0,
|
|
4441
|
-
ms: 0
|
|
4442
|
-
});
|
|
4443
4790
|
function shotExecutor(surface, opts) {
|
|
4444
4791
|
let artifact;
|
|
4445
4792
|
return {
|
|
@@ -4449,7 +4796,19 @@ function shotExecutor(surface, opts) {
|
|
|
4449
4796
|
const own = !t.handle;
|
|
4450
4797
|
const handle = t.handle ?? await surface.open(t.task);
|
|
4451
4798
|
try {
|
|
4452
|
-
const
|
|
4799
|
+
const allTools = await surface.tools(t.task, handle);
|
|
4800
|
+
let tools = allTools;
|
|
4801
|
+
if (t.tools) {
|
|
4802
|
+
const known = new Set(allTools.map((tool) => tool.function.name));
|
|
4803
|
+
const unknown = t.tools.filter((name) => !known.has(name));
|
|
4804
|
+
if (unknown.length > 0) {
|
|
4805
|
+
throw new Error(
|
|
4806
|
+
`shot tools: unknown tool name(s) ${unknown.join(", ")} \u2014 domain offers: ${[...known].join(", ")}`
|
|
4807
|
+
);
|
|
4808
|
+
}
|
|
4809
|
+
const want = new Set(t.tools);
|
|
4810
|
+
tools = allTools.filter((tool) => want.has(tool.function.name));
|
|
4811
|
+
}
|
|
4453
4812
|
const messages = t.messages?.length ? t.messages : [
|
|
4454
4813
|
{ role: "system", content: t.persona?.systemPrompt ?? t.task.systemPrompt },
|
|
4455
4814
|
{ role: "user", content: `${t.task.userPrompt}
|
|
@@ -4483,7 +4842,7 @@ ${taskNudge}` }
|
|
|
4483
4842
|
spent: {
|
|
4484
4843
|
iterations: shot.completions,
|
|
4485
4844
|
tokens: shot.tokens,
|
|
4486
|
-
usd:
|
|
4845
|
+
usd: isModelPriced3(opts.model) ? estimateCost3(shot.tokens.input, shot.tokens.output, opts.model) : 0,
|
|
4487
4846
|
ms: 0
|
|
4488
4847
|
}
|
|
4489
4848
|
};
|
|
@@ -4505,8 +4864,18 @@ function analystExecutor(opts) {
|
|
|
4505
4864
|
runtime: "agentic-analyst",
|
|
4506
4865
|
async execute(task) {
|
|
4507
4866
|
const t = task;
|
|
4508
|
-
const
|
|
4509
|
-
|
|
4867
|
+
const { steer, tokens } = t.rawInstruction ? await consultAnalyst(t.task, t.messages, t.rawInstruction, opts) : await analyze(t.task, t.messages, opts);
|
|
4868
|
+
const analystModel = opts.analystModel ?? opts.model;
|
|
4869
|
+
artifact = {
|
|
4870
|
+
outRef: `analyst:${steer.length}`,
|
|
4871
|
+
out: steer,
|
|
4872
|
+
spent: {
|
|
4873
|
+
iterations: 1,
|
|
4874
|
+
tokens,
|
|
4875
|
+
usd: isModelPriced3(analystModel) ? estimateCost3(tokens.input, tokens.output, analystModel) : 0,
|
|
4876
|
+
ms: 0
|
|
4877
|
+
}
|
|
4878
|
+
};
|
|
4510
4879
|
return artifact;
|
|
4511
4880
|
},
|
|
4512
4881
|
teardown: () => Promise.resolve({ destroyed: true }),
|
|
@@ -4669,12 +5038,21 @@ function defineStrategy(name, run) {
|
|
|
4669
5038
|
const innerTurns = opts.innerTurns ?? 4;
|
|
4670
5039
|
let verifiedBest = 0;
|
|
4671
5040
|
let verifiedResolved = false;
|
|
5041
|
+
const openHandles = /* @__PURE__ */ new Set();
|
|
4672
5042
|
const ctx = {
|
|
4673
5043
|
// Narrowed to open/close — the body gets no raw call()/score() access.
|
|
4674
5044
|
surface: {
|
|
4675
5045
|
name: surface.name,
|
|
4676
|
-
open: (t) =>
|
|
4677
|
-
|
|
5046
|
+
open: async (t) => {
|
|
5047
|
+
const h = await surface.open(t);
|
|
5048
|
+
openHandles.add(h.id);
|
|
5049
|
+
return h;
|
|
5050
|
+
},
|
|
5051
|
+
close: async (h) => {
|
|
5052
|
+
if (!h || !openHandles.has(h.id)) return;
|
|
5053
|
+
openHandles.delete(h.id);
|
|
5054
|
+
await surface.close(h);
|
|
5055
|
+
}
|
|
4678
5056
|
},
|
|
4679
5057
|
task,
|
|
4680
5058
|
opts,
|
|
@@ -4690,7 +5068,8 @@ function defineStrategy(name, run) {
|
|
|
4690
5068
|
handle: spec?.handle,
|
|
4691
5069
|
messages: spec?.messages,
|
|
4692
5070
|
steer: spec?.steer,
|
|
4693
|
-
persona: spec?.persona
|
|
5071
|
+
persona: spec?.persona,
|
|
5072
|
+
tools: spec?.tools
|
|
4694
5073
|
},
|
|
4695
5074
|
{ budget: perChild(innerTurns), label: child.name }
|
|
4696
5075
|
);
|
|
@@ -4702,6 +5081,13 @@ function defineStrategy(name, run) {
|
|
|
4702
5081
|
if (out.total > 0 && out.passes === out.total) verifiedResolved = true;
|
|
4703
5082
|
return out;
|
|
4704
5083
|
},
|
|
5084
|
+
async listTools(handle) {
|
|
5085
|
+
const tools = await surface.tools(task, handle);
|
|
5086
|
+
return tools.map((t) => ({
|
|
5087
|
+
name: t.function.name,
|
|
5088
|
+
...t.function.description ? { description: t.function.description } : {}
|
|
5089
|
+
}));
|
|
5090
|
+
},
|
|
4705
5091
|
async critique(messages) {
|
|
4706
5092
|
const child = leaf(`analyst:${seq}`, "analyst");
|
|
4707
5093
|
seq += 1;
|
|
@@ -4715,12 +5101,33 @@ function defineStrategy(name, run) {
|
|
|
4715
5101
|
if (settled.kind === "down") return null;
|
|
4716
5102
|
const findings = settled.out;
|
|
4717
5103
|
return /^\s*COMPLETE\b/i.test(findings) ? null : findings;
|
|
5104
|
+
},
|
|
5105
|
+
async consult(messages, instruction) {
|
|
5106
|
+
const child = leaf(`analyst:${seq}`, "analyst");
|
|
5107
|
+
seq += 1;
|
|
5108
|
+
const res = scope.spawn(
|
|
5109
|
+
child,
|
|
5110
|
+
{ task, messages, rawInstruction: instruction },
|
|
5111
|
+
{ budget: perChild(1), label: child.name }
|
|
5112
|
+
);
|
|
5113
|
+
if (!res.ok) return null;
|
|
5114
|
+
const settled = await drainOne2(scope);
|
|
5115
|
+
if (settled.kind === "down") return null;
|
|
5116
|
+
return settled.out;
|
|
4718
5117
|
}
|
|
4719
5118
|
};
|
|
4720
5119
|
const r = await run(ctx);
|
|
4721
5120
|
return {
|
|
4722
5121
|
kind: "done",
|
|
4723
|
-
deliverable: {
|
|
5122
|
+
deliverable: {
|
|
5123
|
+
mode: name,
|
|
5124
|
+
...r,
|
|
5125
|
+
progression: Array.isArray(r.progression) ? r.progression : [],
|
|
5126
|
+
completions: typeof r.completions === "number" ? r.completions : 0,
|
|
5127
|
+
shots: typeof r.shots === "number" ? r.shots : 0,
|
|
5128
|
+
score: verifiedBest,
|
|
5129
|
+
resolved: verifiedResolved
|
|
5130
|
+
}
|
|
4724
5131
|
};
|
|
4725
5132
|
}
|
|
4726
5133
|
})
|
|
@@ -4875,27 +5282,44 @@ async function runBenchmark(cfg) {
|
|
|
4875
5282
|
let settled = 0;
|
|
4876
5283
|
const perTask = await pool(cfg.tasks, concurrency, async (task) => {
|
|
4877
5284
|
const cells = {};
|
|
5285
|
+
const errors = {};
|
|
4878
5286
|
let row;
|
|
4879
5287
|
try {
|
|
4880
5288
|
for (const s of strategies) {
|
|
4881
|
-
|
|
4882
|
-
|
|
4883
|
-
|
|
4884
|
-
|
|
4885
|
-
|
|
4886
|
-
|
|
4887
|
-
|
|
4888
|
-
|
|
4889
|
-
|
|
4890
|
-
|
|
4891
|
-
|
|
4892
|
-
|
|
4893
|
-
|
|
4894
|
-
|
|
4895
|
-
|
|
4896
|
-
|
|
5289
|
+
try {
|
|
5290
|
+
const r = await runAgentic({
|
|
5291
|
+
...cfg.worker,
|
|
5292
|
+
surface: cfg.environment,
|
|
5293
|
+
task,
|
|
5294
|
+
strategy: s,
|
|
5295
|
+
budget,
|
|
5296
|
+
...cfg.hooks ? { hooks: cfg.hooks } : {}
|
|
5297
|
+
});
|
|
5298
|
+
cells[s.name] = {
|
|
5299
|
+
score: r.score,
|
|
5300
|
+
resolved: r.resolved,
|
|
5301
|
+
progression: r.progression,
|
|
5302
|
+
usd: r.usd,
|
|
5303
|
+
ms: r.ms,
|
|
5304
|
+
tokens: r.tokens
|
|
5305
|
+
};
|
|
5306
|
+
} catch (e) {
|
|
5307
|
+
errors[s.name] = e instanceof Error ? e.message.slice(0, 300) : String(e);
|
|
5308
|
+
cells[s.name] = {
|
|
5309
|
+
score: 0,
|
|
5310
|
+
resolved: false,
|
|
5311
|
+
progression: [],
|
|
5312
|
+
usd: 0,
|
|
5313
|
+
ms: 0,
|
|
5314
|
+
tokens: { input: 0, output: 0 }
|
|
5315
|
+
};
|
|
5316
|
+
}
|
|
4897
5317
|
}
|
|
4898
|
-
row = {
|
|
5318
|
+
row = {
|
|
5319
|
+
taskId: task.id,
|
|
5320
|
+
cells,
|
|
5321
|
+
...Object.keys(errors).length > 0 ? { errors } : {}
|
|
5322
|
+
};
|
|
4899
5323
|
} catch (e) {
|
|
4900
5324
|
row = { taskId: task.id, error: e instanceof Error ? e.message.slice(0, 300) : String(e) };
|
|
4901
5325
|
}
|
|
@@ -5200,7 +5624,7 @@ var strategyAuthorContract = `
|
|
|
5200
5624
|
You author an OPTIMIZATION STRATEGY for an agentic loop system. A strategy decides how to
|
|
5201
5625
|
spend a compute budget to beat a task's deployable check. You compose exactly two steps:
|
|
5202
5626
|
|
|
5203
|
-
shot(spec?: { handle?, messages?, steer?, persona? }): Promise<ShotResult | null>
|
|
5627
|
+
shot(spec?: { handle?, messages?, steer?, persona?, tools? }): Promise<ShotResult | null>
|
|
5204
5628
|
Runs ONE worker attempt (a bounded tool loop) over an artifact.
|
|
5205
5629
|
- omit handle => the shot opens its OWN fresh artifact and closes it after (a sample).
|
|
5206
5630
|
- pass handle => the shot CONTINUES that artifact (state accumulates across shots).
|
|
@@ -5210,6 +5634,10 @@ spend a compute budget to beat a task's deployable check. You compose exactly tw
|
|
|
5210
5634
|
(multi-agent strategies: a researcher shot then an engineer shot, a panel of k
|
|
5211
5635
|
personas over one budget). On a fresh shot the systemPrompt replaces the task's; on
|
|
5212
5636
|
a carried conversation it arrives as a hand-off message. Same conserved budget.
|
|
5637
|
+
- tools => string[] \u2014 restrict THIS shot to a subset of the task's tools by
|
|
5638
|
+
name (focus an explore shot on read-only tools, an execute shot on write tools).
|
|
5639
|
+
Restriction-only; unknown names make the shot fail. ALWAYS select from
|
|
5640
|
+
await listTools(handle) \u2014 never hardcode. Omitted => the shot sees every tool.
|
|
5213
5641
|
ShotResult = { messages, score (0..1 on the task's check), passes, total, completions, toolErrors }
|
|
5214
5642
|
Returns null if the attempt failed infra-wise.
|
|
5215
5643
|
|
|
@@ -5217,10 +5645,23 @@ spend a compute budget to beat a task's deployable check. You compose exactly tw
|
|
|
5217
5645
|
A firewalled trace-analyst reads the attempt's trajectory and returns ONE corrective
|
|
5218
5646
|
instruction (or null when it judges the work complete). Costs ~1 completion.
|
|
5219
5647
|
|
|
5648
|
+
consult(messages, instruction): Promise<string | null>
|
|
5649
|
+
The RAW analyst channel: the same firewalled critic answers YOUR instruction over the
|
|
5650
|
+
trajectory verbatim (no reformatting) \u2014 use it when you need a specific reply format
|
|
5651
|
+
(a decision, a prediction). Costs ~1 completion.
|
|
5652
|
+
|
|
5220
5653
|
surface.open(task) / surface.close(handle)
|
|
5221
5654
|
Open a persistent artifact you manage yourself (remember to close in a finally).
|
|
5655
|
+
close is idempotent \u2014 closing an already-closed handle is a safe no-op.
|
|
5656
|
+
|
|
5657
|
+
listTools(handle): Promise<Array<{ name, description? }>>
|
|
5658
|
+
The tools THIS task actually offers. TOOL SETS VARY PER TASK \u2014 if you restrict a
|
|
5659
|
+
shot with \`tools\`, you MUST pick names from await listTools(handle); hardcoding
|
|
5660
|
+
names from an example kills your shots on every task whose tools differ.
|
|
5222
5661
|
|
|
5223
5662
|
Rules:
|
|
5663
|
+
- ALWAYS await every shot/critique/surface call \u2014 a floating promise that rejects
|
|
5664
|
+
crashes the whole benchmark run.
|
|
5224
5665
|
- Stay within ~budget total shots; every shot/critique spends from a conserved pool.
|
|
5225
5666
|
- For a FRESH attempt OMIT \`messages\` entirely (never pass \`[]\` \u2014 an empty array is a
|
|
5226
5667
|
fresh conversation too, but be explicit). To CONTINUE, pass the previous
|
|
@@ -5230,8 +5671,8 @@ Rules:
|
|
|
5230
5671
|
- The module must be EXACTLY this shape (no other imports, no commentary outside code):
|
|
5231
5672
|
|
|
5232
5673
|
import { defineStrategy } from '@tangle-network/agent-runtime/loops'
|
|
5233
|
-
export default defineStrategy('your-strategy-name', async ({ surface, task, budget, shot, critique }) => {
|
|
5234
|
-
// your composition
|
|
5674
|
+
export default defineStrategy('your-strategy-name', async ({ surface, task, budget, shot, critique, listTools }) => {
|
|
5675
|
+
// your composition (listTools comes from the destructured context \u2014 it is NOT a global)
|
|
5235
5676
|
})
|
|
5236
5677
|
`;
|
|
5237
5678
|
function assertStrategyContract(code) {
|
|
@@ -5307,34 +5748,89 @@ async function authorStrategy(opts) {
|
|
|
5307
5748
|
}
|
|
5308
5749
|
|
|
5309
5750
|
// src/runtime/strategy-evolution.ts
|
|
5751
|
+
import { existsSync, readFileSync, writeFileSync as writeFileSync2 } from "fs";
|
|
5310
5752
|
import { gzipSync } from "zlib";
|
|
5311
|
-
function
|
|
5312
|
-
const
|
|
5753
|
+
function discriminatingMeans(report, fieldOrder) {
|
|
5754
|
+
const rows = report.perTask.filter((r) => {
|
|
5755
|
+
if (!r.cells) return false;
|
|
5756
|
+
const scores = fieldOrder.map((n) => r.cells?.[n]?.score).filter((s) => s !== void 0);
|
|
5757
|
+
if (scores.length < fieldOrder.length) return false;
|
|
5758
|
+
return Math.max(...scores) - Math.min(...scores) > 0;
|
|
5759
|
+
});
|
|
5760
|
+
if (rows.length === 0) return null;
|
|
5761
|
+
const out = {};
|
|
5762
|
+
for (const name of fieldOrder) {
|
|
5763
|
+
const cells = rows.map((r) => r.cells?.[name]).filter((c) => !!c);
|
|
5764
|
+
out[name] = {
|
|
5765
|
+
score: cells.reduce((s, c) => s + c.score, 0) / cells.length,
|
|
5766
|
+
usd: cells.reduce((s, c) => s + c.usd, 0) / cells.length
|
|
5767
|
+
};
|
|
5768
|
+
}
|
|
5769
|
+
return out;
|
|
5770
|
+
}
|
|
5771
|
+
function pickChampion(means, fieldOrder, policy, epsilon) {
|
|
5772
|
+
const entries = fieldOrder.map((name) => ({ name, summary: means[name] })).filter((e) => !!e.summary);
|
|
5313
5773
|
if (entries.length === 0)
|
|
5314
|
-
throw new Error("
|
|
5774
|
+
throw new Error("pickChampion: the means table carries none of the field strategies");
|
|
5315
5775
|
const best = Math.max(...entries.map((e) => e.summary.score));
|
|
5316
5776
|
const pick = policy === "score" ? entries.find((e) => e.summary.score === best) : entries.filter((e) => e.summary.score >= best - epsilon).sort((a, b) => a.summary.usd - b.summary.usd || b.summary.score - a.summary.score)[0];
|
|
5317
|
-
if (!pick) throw new Error("
|
|
5777
|
+
if (!pick) throw new Error("pickChampion: empty pick (unreachable)");
|
|
5318
5778
|
return { name: pick.name, score: pick.summary.score, usd: pick.summary.usd };
|
|
5319
5779
|
}
|
|
5780
|
+
function selectChampion(report, fieldOrder, policy, epsilon) {
|
|
5781
|
+
return pickChampion(report.perStrategy, fieldOrder, policy, epsilon);
|
|
5782
|
+
}
|
|
5320
5783
|
var fieldSummary = (archive) => archive.map(
|
|
5321
5784
|
(n) => `- ${n.name} (${n.source}, gen ${n.generation}, last score ${(n.score * 100).toFixed(0)}%)`
|
|
5322
5785
|
).join("\n");
|
|
5323
|
-
var compactLosses = (report) => {
|
|
5786
|
+
var compactLosses = (report, detail) => {
|
|
5324
5787
|
const r2 = (x) => Math.round(x * 100) / 100;
|
|
5325
5788
|
const rows = report.perTask.map(
|
|
5326
5789
|
(row) => row.cells ? {
|
|
5327
5790
|
task: row.taskId,
|
|
5791
|
+
...row.errors ? {
|
|
5792
|
+
errors: Object.fromEntries(
|
|
5793
|
+
Object.entries(row.errors).map(([n, msg]) => [n, msg.slice(0, 100)])
|
|
5794
|
+
)
|
|
5795
|
+
} : {},
|
|
5328
5796
|
cells: Object.fromEntries(
|
|
5329
5797
|
Object.entries(row.cells).map(([name, c]) => [
|
|
5330
5798
|
name,
|
|
5331
|
-
|
|
5799
|
+
// 'binary' is the leakage-bounded channel: the author learns pass/fail per
|
|
5800
|
+
// task and nothing else — the per-generation leak from the evaluation data
|
|
5801
|
+
// is capped at one bit per cell (arXiv:2606.11045 measured that exploration
|
|
5802
|
+
// survives this; whether AUTHORING does is the E1-coarse A/B).
|
|
5803
|
+
detail === "binary" ? { resolved: c.resolved, usd: Math.round(c.usd * 1e4) / 1e4 } : {
|
|
5804
|
+
score: r2(c.score),
|
|
5805
|
+
resolved: c.resolved,
|
|
5806
|
+
usd: Math.round(c.usd * 1e4) / 1e4,
|
|
5807
|
+
progression: (c.progression ?? []).map(r2)
|
|
5808
|
+
}
|
|
5332
5809
|
])
|
|
5333
5810
|
)
|
|
5334
5811
|
} : { task: row.taskId, error: row.error?.slice(0, 80) }
|
|
5335
5812
|
);
|
|
5336
5813
|
return JSON.stringify(rows).slice(0, 12e3);
|
|
5337
5814
|
};
|
|
5815
|
+
function renameStrategy(orig, unique) {
|
|
5816
|
+
if (orig.name === unique) return orig;
|
|
5817
|
+
return {
|
|
5818
|
+
name: unique,
|
|
5819
|
+
driver: (s, t, o, b) => {
|
|
5820
|
+
const agent = orig.driver(s, t, o, b);
|
|
5821
|
+
return {
|
|
5822
|
+
...agent,
|
|
5823
|
+
name: unique,
|
|
5824
|
+
act: async (task, scope) => {
|
|
5825
|
+
const out = await agent.act(task, scope);
|
|
5826
|
+
if (out.kind !== "done") return out;
|
|
5827
|
+
const deliverable = { ...out.deliverable, mode: unique };
|
|
5828
|
+
return { ...out, deliverable };
|
|
5829
|
+
}
|
|
5830
|
+
};
|
|
5831
|
+
}
|
|
5832
|
+
};
|
|
5833
|
+
}
|
|
5338
5834
|
async function runStrategyEvolution(cfg) {
|
|
5339
5835
|
const budget = cfg.budget ?? 3;
|
|
5340
5836
|
const concurrency = cfg.concurrency ?? 3;
|
|
@@ -5342,37 +5838,72 @@ async function runStrategyEvolution(cfg) {
|
|
|
5342
5838
|
const populationSize = cfg.populationSize ?? 2;
|
|
5343
5839
|
const baselines = cfg.baselines ?? [sample, refine, sampleThenRefine];
|
|
5344
5840
|
const policy = cfg.champion ?? "costAware";
|
|
5345
|
-
const epsilon = cfg.championEpsilon ?? 0.01;
|
|
5841
|
+
const epsilon = cfg.championEpsilon ?? (cfg.objective === "cost" ? cfg.scoreTolerance ?? 0.05 : 0.01);
|
|
5346
5842
|
const byName = new Map(baselines.map((s) => [s.name, s]));
|
|
5347
|
-
const
|
|
5348
|
-
|
|
5349
|
-
|
|
5350
|
-
|
|
5351
|
-
strategies,
|
|
5843
|
+
const codeByName = /* @__PURE__ */ new Map();
|
|
5844
|
+
const fingerprint = {
|
|
5845
|
+
trainN: cfg.trainN,
|
|
5846
|
+
holdoutN: cfg.holdoutN,
|
|
5352
5847
|
budget,
|
|
5353
|
-
|
|
5354
|
-
|
|
5355
|
-
|
|
5356
|
-
|
|
5848
|
+
generations,
|
|
5849
|
+
populationSize
|
|
5850
|
+
};
|
|
5851
|
+
let ckpt;
|
|
5852
|
+
if (cfg.checkpoint?.resume && existsSync(cfg.checkpoint.path)) {
|
|
5853
|
+
const raw = JSON.parse(readFileSync(cfg.checkpoint.path, "utf8"));
|
|
5854
|
+
if (JSON.stringify(raw.fingerprint) !== JSON.stringify(fingerprint)) {
|
|
5855
|
+
throw new Error(
|
|
5856
|
+
`evolution resume: checkpoint design mismatch \u2014 checkpoint ${JSON.stringify(raw.fingerprint)} vs config ${JSON.stringify(fingerprint)}; delete ${cfg.checkpoint.path} or match the config`
|
|
5857
|
+
);
|
|
5858
|
+
}
|
|
5859
|
+
ckpt = raw;
|
|
5860
|
+
}
|
|
5861
|
+
const save = (state) => {
|
|
5862
|
+
if (cfg.checkpoint)
|
|
5863
|
+
writeFileSync2(cfg.checkpoint.path, JSON.stringify({ ...state, fingerprint }, null, 1));
|
|
5864
|
+
};
|
|
5865
|
+
const bench = async (phase, tasks, strategies) => {
|
|
5866
|
+
await cfg.onPhase?.(phase);
|
|
5867
|
+
return runBenchmark({
|
|
5868
|
+
environment: cfg.environment,
|
|
5869
|
+
tasks,
|
|
5870
|
+
worker: cfg.worker,
|
|
5871
|
+
strategies,
|
|
5872
|
+
budget,
|
|
5873
|
+
concurrency,
|
|
5874
|
+
...cfg.onTask ? { onTask: (row, done, total) => cfg.onTask?.(phase, row, done, total) } : {},
|
|
5875
|
+
...cfg.hooks ? { hooks: cfg.hooks } : {}
|
|
5876
|
+
});
|
|
5877
|
+
};
|
|
5357
5878
|
const train = await cfg.tasks(0, cfg.trainN);
|
|
5358
|
-
const
|
|
5359
|
-
|
|
5879
|
+
const probeTask = train[0];
|
|
5880
|
+
if (!probeTask) throw new Error("runStrategyEvolution: empty train slice");
|
|
5881
|
+
const probe = await cfg.environment.open(probeTask);
|
|
5882
|
+
let toolCatalog;
|
|
5883
|
+
try {
|
|
5884
|
+
const tools = await cfg.environment.tools(probeTask, probe);
|
|
5885
|
+
toolCatalog = tools.map(
|
|
5886
|
+
(t) => `- ${t.function.name}${t.function.description ? ` \u2014 ${t.function.description.slice(0, 120)}` : ""}`
|
|
5887
|
+
).join("\n");
|
|
5888
|
+
} finally {
|
|
5889
|
+
await cfg.environment.close(probe);
|
|
5890
|
+
}
|
|
5891
|
+
const gen0 = ckpt?.gen0 ?? await bench("gen0", train, baselines);
|
|
5892
|
+
const archive = ckpt?.archive ? [...ckpt.archive] : baselines.map((s) => ({
|
|
5360
5893
|
name: s.name,
|
|
5361
5894
|
source: "baseline",
|
|
5362
5895
|
generation: 0,
|
|
5363
5896
|
score: gen0.perStrategy[s.name]?.score ?? 0,
|
|
5364
5897
|
usd: gen0.perStrategy[s.name]?.usd ?? 0
|
|
5365
5898
|
}));
|
|
5366
|
-
const gen0Champion = selectChampion(
|
|
5899
|
+
const gen0Champion = ckpt?.gen0Champion ?? selectChampion(
|
|
5367
5900
|
gen0,
|
|
5368
5901
|
baselines.map((s) => s.name),
|
|
5369
5902
|
policy,
|
|
5370
5903
|
epsilon
|
|
5371
5904
|
);
|
|
5372
|
-
|
|
5373
|
-
|
|
5374
|
-
const generationRows = [];
|
|
5375
|
-
const trajectory = [
|
|
5905
|
+
const generationRows = ckpt?.generations ? [...ckpt.generations] : [];
|
|
5906
|
+
const trajectory = ckpt?.trajectory ? [...ckpt.trajectory] : [
|
|
5376
5907
|
{
|
|
5377
5908
|
generation: 0,
|
|
5378
5909
|
champion: gen0Champion.name,
|
|
@@ -5380,13 +5911,39 @@ async function runStrategyEvolution(cfg) {
|
|
|
5380
5911
|
usd: gen0Champion.usd
|
|
5381
5912
|
}
|
|
5382
5913
|
];
|
|
5383
|
-
|
|
5384
|
-
|
|
5385
|
-
|
|
5914
|
+
for (const row of generationRows) {
|
|
5915
|
+
for (const c of row.candidates) {
|
|
5916
|
+
if (!c.file || c.error) continue;
|
|
5917
|
+
const mod = await import(`file://${c.file}`);
|
|
5918
|
+
if (!mod.default || typeof mod.default.driver !== "function") {
|
|
5919
|
+
throw new Error(
|
|
5920
|
+
`evolution resume: ${c.file} no longer exports a Strategy \u2014 cannot restore "${c.name}"`
|
|
5921
|
+
);
|
|
5922
|
+
}
|
|
5923
|
+
byName.set(c.name, renameStrategy(mod.default, c.name));
|
|
5924
|
+
codeByName.set(c.name, readFileSync(c.file, "utf8"));
|
|
5925
|
+
}
|
|
5926
|
+
}
|
|
5927
|
+
let authoredOk = generationRows.reduce(
|
|
5928
|
+
(n, row) => n + row.candidates.filter((c) => !c.error).length,
|
|
5929
|
+
0
|
|
5930
|
+
);
|
|
5931
|
+
const lastRow = generationRows[generationRows.length - 1];
|
|
5932
|
+
let incumbent = lastRow ? lastRow.champion : gen0Champion;
|
|
5933
|
+
let latestReport = lastRow ? lastRow.report : gen0;
|
|
5934
|
+
if (!ckpt) save({ gen0, gen0Champion, generations: generationRows, archive, trajectory });
|
|
5935
|
+
for (let g = generationRows.length + 1; g <= generations; g += 1) {
|
|
5936
|
+
const lossesJson = compactLosses(latestReport, cfg.lossesDetail ?? "exact");
|
|
5386
5937
|
const candidates = [];
|
|
5387
5938
|
const newStrategies = [];
|
|
5388
5939
|
for (let i = 0; i < populationSize; i += 1) {
|
|
5389
|
-
const
|
|
5940
|
+
const objectiveNote = cfg.objective === "cost" ? `
|
|
5941
|
+
|
|
5942
|
+
YOUR OBJECTIVE: match or exceed the incumbent's SCORE while spending LESS (the losses include usd per task). Promotion requires proven score non-inferiority PLUS significant cost savings \u2014 a strategy that ties the score at half the cost WINS; a cheaper strategy that loses score by more than ${((cfg.scoreTolerance ?? 0.05) * 100).toFixed(0)}pp LOSES.` : "";
|
|
5943
|
+
const contract = `${strategyAuthorContract}${objectiveNote}
|
|
5944
|
+
|
|
5945
|
+
EXAMPLE TOOLS FROM ONE TASK (tool sets VARY per task on this domain \u2014 a strategy MUST select tool names from await listTools(handle) at runtime; hardcoding these example names will zero your score on most tasks):
|
|
5946
|
+
${toolCatalog}
|
|
5390
5947
|
|
|
5391
5948
|
STRATEGIES ALREADY IN THE TOURNAMENT (author something MEANINGFULLY different \u2014 a new composition, not a rename):
|
|
5392
5949
|
${fieldSummary(archive)}
|
|
@@ -5406,26 +5963,9 @@ You are authoring candidate ${i + 1} of ${populationSize} this generation; explo
|
|
|
5406
5963
|
outDir: cfg.outDir
|
|
5407
5964
|
});
|
|
5408
5965
|
const unique = byName.has(authored.strategy.name) ? `${authored.strategy.name}-g${g}c${i + 1}` : authored.strategy.name;
|
|
5409
|
-
const strategy =
|
|
5410
|
-
name: unique,
|
|
5411
|
-
driver: (s, t, o, b) => {
|
|
5412
|
-
const agent = authored.strategy.driver(s, t, o, b);
|
|
5413
|
-
return {
|
|
5414
|
-
...agent,
|
|
5415
|
-
name: unique,
|
|
5416
|
-
act: async (task, scope) => {
|
|
5417
|
-
const out = await agent.act(task, scope);
|
|
5418
|
-
if (out.kind !== "done") return out;
|
|
5419
|
-
const deliverable = {
|
|
5420
|
-
...out.deliverable,
|
|
5421
|
-
mode: unique
|
|
5422
|
-
};
|
|
5423
|
-
return { ...out, deliverable };
|
|
5424
|
-
}
|
|
5425
|
-
};
|
|
5426
|
-
}
|
|
5427
|
-
};
|
|
5966
|
+
const strategy = renameStrategy(authored.strategy, unique);
|
|
5428
5967
|
byName.set(unique, strategy);
|
|
5968
|
+
codeByName.set(unique, authored.code);
|
|
5429
5969
|
newStrategies.push(strategy);
|
|
5430
5970
|
archive.push({
|
|
5431
5971
|
name: unique,
|
|
@@ -5463,12 +6003,9 @@ You are authoring candidate ${i + 1} of ${populationSize} this generation; explo
|
|
|
5463
6003
|
node.usd = cell.usd;
|
|
5464
6004
|
}
|
|
5465
6005
|
}
|
|
5466
|
-
const
|
|
5467
|
-
|
|
5468
|
-
|
|
5469
|
-
policy,
|
|
5470
|
-
epsilon
|
|
5471
|
-
);
|
|
6006
|
+
const fieldNames = field.map((s) => s.name);
|
|
6007
|
+
const means = cfg.band ? discriminatingMeans(report, fieldNames) ?? report.perStrategy : report.perStrategy;
|
|
6008
|
+
const champion = pickChampion(means, fieldNames, policy, epsilon);
|
|
5472
6009
|
generationRows.push({ generation: g, candidates, report, champion });
|
|
5473
6010
|
trajectory.push({
|
|
5474
6011
|
generation: g,
|
|
@@ -5478,21 +6015,134 @@ You are authoring candidate ${i + 1} of ${populationSize} this generation; explo
|
|
|
5478
6015
|
});
|
|
5479
6016
|
incumbent = champion;
|
|
5480
6017
|
latestReport = report;
|
|
6018
|
+
save({ gen0, gen0Champion, generations: generationRows, archive, trajectory });
|
|
5481
6019
|
}
|
|
5482
6020
|
if (authoredOk === 0) {
|
|
5483
6021
|
throw new Error(
|
|
5484
6022
|
"runStrategyEvolution: every author attempt failed across all generations \u2014 no search happened; see the candidates[].error entries"
|
|
5485
6023
|
);
|
|
5486
6024
|
}
|
|
5487
|
-
const
|
|
5488
|
-
|
|
5489
|
-
|
|
5490
|
-
|
|
5491
|
-
|
|
5492
|
-
|
|
5493
|
-
|
|
5494
|
-
|
|
5495
|
-
|
|
6025
|
+
const holdoutOffset = cfg.trainN + (cfg.holdoutOffset ?? 0);
|
|
6026
|
+
let holdoutTasks = [];
|
|
6027
|
+
let bandInfo;
|
|
6028
|
+
if (ckpt?.holdout && ckpt.verdict) {
|
|
6029
|
+
bandInfo = ckpt.band;
|
|
6030
|
+
if (cfg.reproducerCheck && codeByName.has(incumbent.name)) {
|
|
6031
|
+
const pool2 = await cfg.tasks(holdoutOffset, cfg.band?.holdoutPoolN ?? cfg.holdoutN);
|
|
6032
|
+
const gateIds = new Set(ckpt.holdout.perTask.map((r) => r.taskId));
|
|
6033
|
+
holdoutTasks = pool2.filter((t) => gateIds.has(t.id));
|
|
6034
|
+
}
|
|
6035
|
+
} else if (cfg.band) {
|
|
6036
|
+
const maxRef = cfg.band.maxRefScore ?? 0.99;
|
|
6037
|
+
const reference = baselines[0];
|
|
6038
|
+
if (!reference)
|
|
6039
|
+
throw new Error("evolution band: baselines[0] required as the screening reference");
|
|
6040
|
+
const pool2 = await cfg.tasks(holdoutOffset, cfg.band.holdoutPoolN);
|
|
6041
|
+
const screen = await bench("band-screen", pool2, [reference]);
|
|
6042
|
+
const refScores = screen.perTask.filter((r) => r.cells?.[reference.name]).map((r) => ({ taskId: r.taskId, score: r.cells?.[reference.name]?.score ?? 0 }));
|
|
6043
|
+
const inBandIds = new Set(refScores.filter((r) => r.score <= maxRef).map((r) => r.taskId));
|
|
6044
|
+
const kept = pool2.filter((t) => inBandIds.has(t.id));
|
|
6045
|
+
if (kept.length < cfg.holdoutN) {
|
|
6046
|
+
throw new Error(
|
|
6047
|
+
`evolution band: only ${kept.length}/${cfg.holdoutN} holdout tasks have headroom (pool ${cfg.band.holdoutPoolN}, reference "${reference.name}" \u2264 ${maxRef}) \u2014 widen holdoutPoolN or raise maxRefScore`
|
|
6048
|
+
);
|
|
6049
|
+
}
|
|
6050
|
+
holdoutTasks = kept.slice(0, cfg.holdoutN);
|
|
6051
|
+
bandInfo = { screened: refScores.length, inBand: kept.length, refScores };
|
|
6052
|
+
} else {
|
|
6053
|
+
holdoutTasks = await cfg.tasks(holdoutOffset, cfg.holdoutN);
|
|
6054
|
+
}
|
|
6055
|
+
let holdout;
|
|
6056
|
+
let verdict;
|
|
6057
|
+
if (ckpt?.holdout && ckpt.verdict) {
|
|
6058
|
+
holdout = ckpt.holdout;
|
|
6059
|
+
verdict = ckpt.verdict;
|
|
6060
|
+
} else {
|
|
6061
|
+
const finalists = [.../* @__PURE__ */ new Set([gen0Champion.name, incumbent.name])].map((n) => byName.get(n)).filter((s) => !!s);
|
|
6062
|
+
holdout = await bench("holdout", holdoutTasks, finalists);
|
|
6063
|
+
verdict = promotionGate({
|
|
6064
|
+
report: holdout,
|
|
6065
|
+
incumbent: gen0Champion.name,
|
|
6066
|
+
candidate: incumbent.name,
|
|
6067
|
+
...cfg.objective === "cost" ? {
|
|
6068
|
+
mode: "non-inferiority",
|
|
6069
|
+
...cfg.scoreTolerance !== void 0 ? { scoreTolerance: cfg.scoreTolerance } : {}
|
|
6070
|
+
} : {},
|
|
6071
|
+
...cfg.minPairedTasks !== void 0 ? { minPairedTasks: cfg.minPairedTasks } : {}
|
|
6072
|
+
});
|
|
6073
|
+
save({
|
|
6074
|
+
gen0,
|
|
6075
|
+
gen0Champion,
|
|
6076
|
+
generations: generationRows,
|
|
6077
|
+
archive,
|
|
6078
|
+
trajectory,
|
|
6079
|
+
holdout,
|
|
6080
|
+
verdict,
|
|
6081
|
+
...bandInfo ? { band: bandInfo } : {}
|
|
6082
|
+
});
|
|
6083
|
+
}
|
|
6084
|
+
let reproduction;
|
|
6085
|
+
const championCode = codeByName.get(incumbent.name);
|
|
6086
|
+
if (cfg.reproducerCheck && championCode) {
|
|
6087
|
+
const words = cfg.reproducerCheck.summaryMaxWords ?? 64;
|
|
6088
|
+
const tolerance = cfg.reproducerCheck.tolerance ?? 0.05;
|
|
6089
|
+
const championHoldoutScore = holdout.perStrategy[incumbent.name]?.score ?? 0;
|
|
6090
|
+
try {
|
|
6091
|
+
const summaryRes = await cfg.author.chat.chat({
|
|
6092
|
+
...cfg.author.model ? { model: cfg.author.model } : {},
|
|
6093
|
+
temperature: 0.2,
|
|
6094
|
+
maxTokens: 512,
|
|
6095
|
+
messages: [
|
|
6096
|
+
{
|
|
6097
|
+
role: "system",
|
|
6098
|
+
content: `Summarize the optimization strategy implemented by this code in at most ${words} words. Describe the COMPOSITION (shots, critique, artifact handling, restarts, stopping) \u2014 not the code. Output only the summary.`
|
|
6099
|
+
},
|
|
6100
|
+
{ role: "user", content: championCode }
|
|
6101
|
+
]
|
|
6102
|
+
});
|
|
6103
|
+
const summary = summaryRes.content.trim();
|
|
6104
|
+
const reproduced = await authorStrategy({
|
|
6105
|
+
chat: cfg.author.chat,
|
|
6106
|
+
...cfg.author.model ? { model: cfg.author.model } : {},
|
|
6107
|
+
...cfg.author.fallbackModel ? { fallbackModel: cfg.author.fallbackModel } : {},
|
|
6108
|
+
...cfg.author.maxTokens !== void 0 ? { maxTokens: cfg.author.maxTokens } : {},
|
|
6109
|
+
temperature: 0.2,
|
|
6110
|
+
contract: `${strategyAuthorContract}
|
|
6111
|
+
|
|
6112
|
+
IMPLEMENT EXACTLY THIS STRATEGY (a colleague's description \u2014 do not invent a different approach):
|
|
6113
|
+
${summary}`,
|
|
6114
|
+
environmentName: cfg.environment.name,
|
|
6115
|
+
lossesJson: "[]",
|
|
6116
|
+
budget,
|
|
6117
|
+
outDir: cfg.outDir
|
|
6118
|
+
});
|
|
6119
|
+
const reproStrategy = {
|
|
6120
|
+
name: `${incumbent.name}-reproduced`,
|
|
6121
|
+
driver: reproduced.strategy.driver
|
|
6122
|
+
};
|
|
6123
|
+
const reproReport = await bench("reproduce", holdoutTasks, [reproStrategy]);
|
|
6124
|
+
const reproducedHoldoutScore = reproReport.perStrategy[reproStrategy.name]?.score ?? 0;
|
|
6125
|
+
reproduction = {
|
|
6126
|
+
summary,
|
|
6127
|
+
reproducedName: reproStrategy.name,
|
|
6128
|
+
file: reproduced.file,
|
|
6129
|
+
championHoldoutScore,
|
|
6130
|
+
reproducedHoldoutScore,
|
|
6131
|
+
gap: championHoldoutScore - reproducedHoldoutScore,
|
|
6132
|
+
reproducible: reproducedHoldoutScore >= championHoldoutScore - tolerance
|
|
6133
|
+
};
|
|
6134
|
+
} catch (e) {
|
|
6135
|
+
reproduction = {
|
|
6136
|
+
summary: "",
|
|
6137
|
+
reproducedName: "",
|
|
6138
|
+
championHoldoutScore,
|
|
6139
|
+
reproducedHoldoutScore: 0,
|
|
6140
|
+
gap: championHoldoutScore,
|
|
6141
|
+
reproducible: false,
|
|
6142
|
+
error: e instanceof Error ? e.message.slice(0, 300) : String(e)
|
|
6143
|
+
};
|
|
6144
|
+
}
|
|
6145
|
+
}
|
|
5496
6146
|
return {
|
|
5497
6147
|
gen0,
|
|
5498
6148
|
gen0Champion,
|
|
@@ -5501,6 +6151,8 @@ You are authoring candidate ${i + 1} of ${populationSize} this generation; explo
|
|
|
5501
6151
|
finalChampion: incumbent,
|
|
5502
6152
|
holdout,
|
|
5503
6153
|
verdict,
|
|
6154
|
+
...bandInfo ? { band: bandInfo } : {},
|
|
6155
|
+
...reproduction ? { reproduction } : {},
|
|
5504
6156
|
trajectory
|
|
5505
6157
|
};
|
|
5506
6158
|
}
|
|
@@ -5572,6 +6224,103 @@ function createVerifierEnvironment(opts) {
|
|
|
5572
6224
|
};
|
|
5573
6225
|
}
|
|
5574
6226
|
|
|
6227
|
+
// src/runtime/waterfall.ts
|
|
6228
|
+
/**
 * Create a collector that folds agent lifecycle events into timing/cost spans
 * and can summarize or draw them as a text waterfall.
 *
 * Only two event targets are consumed; every other event is ignored:
 * - `"agent.spawn"` opens a span keyed by `payload.childId` (falling back to
 *   `event.id`), labelled with `payload.label` (falling back to that id).
 * - `"agent.child"` closes the span whose id equals `payload.childId`, copying
 *   `payload.status`, `payload.spent` and a numeric `payload.score` onto it.
 *
 * @returns {{
 *   hooks: { onEvent: (event: object) => void },
 *   report: () => object,
 *   render: (opts?: { width?: number, maxRows?: number }) => string,
 *   reset: () => void
 * }} `hooks.onEvent` feeds the collector, `report` returns the aggregated
 *   data, `render` returns the text chart, `reset` drops all collected spans.
 */
function createWaterfallCollector() {
  let spans = /* @__PURE__ */ new Map();
  const onEvent = (event) => {
    if (event.target === "agent.spawn") {
      const p = event.payload ?? {};
      const id = p.childId ?? event.id;
      spans.set(id, {
        id,
        label: p.label ?? id,
        runId: event.runId,
        ...event.parentId !== void 0 ? { parentId: event.parentId } : {},
        startMs: event.timestamp,
        status: "running",
        usd: 0,
        tokens: { input: 0, output: 0 }
      });
      return;
    }
    if (event.target === "agent.child") {
      const p = event.payload ?? {};
      const id = p.childId;
      if (!id) return;
      const span = spans.get(id);
      // A completion for a span that was never opened is dropped.
      if (!span) return;
      span.endMs = event.timestamp;
      span.status = p.status === "down" ? "down" : "done";
      span.usd = p.spent?.usd ?? 0;
      span.tokens = {
        input: p.spent?.tokens?.input ?? 0,
        output: p.spent?.tokens?.output ?? 0
      };
      if (typeof p.score === "number") span.score = p.score;
    }
  };
  /**
   * Aggregate the collected spans.
   *
   * @returns {{ spans: object[], totalMs: number, totalUsd: number,
   *   totalTokens: { input: number, output: number }, byKind: object }}
   *   Spans sorted by `startMs`, the distance between the earliest start and
   *   the latest end, summed spend, and per-kind totals. A span's kind is the
   *   label text before its first ":" (the whole label when it has none).
   */
  const report = () => {
    const all = [...spans.values()].sort((a, b) => a.startMs - b.startMs);
    const start = all[0]?.startMs ?? 0;
    // Loop instead of spreading into Math.max so a very large span list cannot
    // exceed the engine's argument-count limit.
    let end = start;
    for (const s of all) end = Math.max(end, s.endMs ?? s.startMs);
    // Accumulate in a Map: kinds come from labels, and on a plain object a
    // kind such as "constructor" or "__proto__" would resolve to an inherited
    // member instead of a fresh bucket.
    const kinds = /* @__PURE__ */ new Map();
    let totalUsd = 0;
    const totalTokens2 = { input: 0, output: 0 };
    for (const s of all) {
      totalUsd += s.usd;
      totalTokens2.input += s.tokens.input;
      totalTokens2.output += s.tokens.output;
      const kind = s.label.includes(":") ? s.label.split(":")[0] : s.label;
      let k = kinds.get(kind);
      if (k === void 0) {
        k = { count: 0, ms: 0, usd: 0, tokens: { input: 0, output: 0 } };
        kinds.set(kind, k);
      }
      k.count += 1;
      // Spans that never finished contribute zero busy time.
      k.ms += (s.endMs ?? s.startMs) - s.startMs;
      k.usd += s.usd;
      k.tokens.input += s.tokens.input;
      k.tokens.output += s.tokens.output;
    }
    // Object.fromEntries defines own data properties, so every kind (even
    // "__proto__") lands on the result itself and callers still get a plain object.
    const byKind = Object.fromEntries(kinds);
    return { spans: all, totalMs: end - start, totalUsd, totalTokens: totalTokens2, byKind };
  };
  /**
   * Draw the spans as a text chart: one row per span, a separator, one row
   * per kind, then a TOTAL row.
   *
   * @param {{ width?: number, maxRows?: number }} [opts] Bar area width in
   *   characters (default 48) and maximum number of span rows (default 60).
   * @returns {string} Newline-joined chart, or "(no spans observed)" when the
   *   collector holds no spans.
   */
  const render = (opts) => {
    const { spans: all, totalMs, totalUsd, byKind } = report();
    if (all.length === 0) return "(no spans observed)";
    const width = opts?.width ?? 48;
    const maxRows = opts?.maxRows ?? 60;
    const start = all[0]?.startMs ?? 0;
    const scale = totalMs > 0 ? width / totalMs : 0;
    const lines = [];
    const labelWidth = Math.min(24, Math.max(...all.map((s) => s.label.length)) + 1);
    for (const s of all.slice(0, maxRows)) {
      const offset = Math.round((s.startMs - start) * scale);
      const dur = (s.endMs ?? s.startMs) - s.startMs;
      const len = Math.max(1, Math.round(dur * scale));
      // Every span draws at least one cell; the bar is clamped to the chart width.
      const bar = `${" ".repeat(Math.min(offset, width))}${(s.status === "down" ? "\u2591" : "\u2588").repeat(Math.max(1, Math.min(len, width - Math.min(offset, width) + 1)))}`;
      const mark = s.status === "down" ? " DOWN" : s.score !== void 0 ? ` ${(s.score * 100).toFixed(0)}%` : "";
      lines.push(
        `${s.label.padEnd(labelWidth)}|${bar.padEnd(width + 1)}| ${(dur / 1e3).toFixed(1)}s $${s.usd.toFixed(4)} ${s.tokens.input}/${s.tokens.output}tok${mark}`
      );
    }
    if (all.length > maxRows) lines.push(`\u2026 ${all.length - maxRows} more spans`);
    lines.push("\u2014".repeat(labelWidth + width + 2));
    for (const [kind, k] of Object.entries(byKind)) {
      lines.push(
        `${kind.padEnd(labelWidth)} \xD7${k.count} ${(k.ms / 1e3).toFixed(1)}s busy $${k.usd.toFixed(4)} ${k.tokens.input}/${k.tokens.output}tok`
      );
    }
    // padEnd never receives a negative count, unlike " ".repeat(labelWidth - 5),
    // which throws RangeError when every label is shorter than four characters.
    lines.push(
      `${"TOTAL".padEnd(labelWidth)} ${(totalMs / 1e3).toFixed(1)}s wall $${totalUsd.toFixed(4)}`
    );
    return lines.join("\n");
  };
  return {
    hooks: { onEvent },
    report,
    render,
    reset: () => {
      spans = /* @__PURE__ */ new Map();
    }
  };
}
|
|
6323
|
+
|
|
5575
6324
|
// src/runtime/workspace.ts
|
|
5576
6325
|
function localShell() {
|
|
5577
6326
|
return async (args, cwd) => {
|
|
@@ -5674,6 +6423,10 @@ function tail(s) {
|
|
|
5674
6423
|
}
|
|
5675
6424
|
|
|
5676
6425
|
export {
|
|
6426
|
+
deleteBoxSafe,
|
|
6427
|
+
throwAbort,
|
|
6428
|
+
throwIfAborted,
|
|
6429
|
+
sleep,
|
|
5677
6430
|
contentAddress,
|
|
5678
6431
|
InMemoryResultBlobStore,
|
|
5679
6432
|
FileResultBlobStore,
|
|
@@ -5681,6 +6434,8 @@ export {
|
|
|
5681
6434
|
FileSpawnJournal,
|
|
5682
6435
|
replaySpawnTree,
|
|
5683
6436
|
materializeTreeView,
|
|
6437
|
+
anytimeReport,
|
|
6438
|
+
renderAnytimeTable,
|
|
5684
6439
|
defaultAuditorInstruction,
|
|
5685
6440
|
auditIntent,
|
|
5686
6441
|
completionAuthorizes,
|
|
@@ -5723,6 +6478,9 @@ export {
|
|
|
5723
6478
|
InMemoryCorpus,
|
|
5724
6479
|
FileCorpus,
|
|
5725
6480
|
renderCorpusToInstructions,
|
|
6481
|
+
routerChatWithUsage,
|
|
6482
|
+
routerChatWithTools,
|
|
6483
|
+
routerToolLoop,
|
|
5726
6484
|
createExecutor,
|
|
5727
6485
|
createExecutorRegistry,
|
|
5728
6486
|
spendFromUsageEvents,
|
|
@@ -5751,11 +6509,14 @@ export {
|
|
|
5751
6509
|
strategyAuthorContract,
|
|
5752
6510
|
assertStrategyContract,
|
|
5753
6511
|
authorStrategy,
|
|
6512
|
+
discriminatingMeans,
|
|
6513
|
+
pickChampion,
|
|
5754
6514
|
selectChampion,
|
|
5755
6515
|
runStrategyEvolution,
|
|
5756
6516
|
createVerifierEnvironment,
|
|
6517
|
+
createWaterfallCollector,
|
|
5757
6518
|
localShell,
|
|
5758
6519
|
gitWorkspace,
|
|
5759
6520
|
jjWorkspace
|
|
5760
6521
|
};
|
|
5761
|
-
//# sourceMappingURL=chunk-
|
|
6522
|
+
//# sourceMappingURL=chunk-CM2IK7VS.js.map
|