@tangle-network/agent-runtime 0.47.0 → 0.49.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +79 -15
- package/dist/agent.js +1 -1
- package/dist/chunk-GHX7XOJ2.js +433 -0
- package/dist/chunk-GHX7XOJ2.js.map +1 -0
- package/dist/{chunk-T4OQQEE3.js → chunk-IQS4HI3F.js} +14 -5
- package/dist/chunk-IQS4HI3F.js.map +1 -0
- package/dist/{chunk-72JQCHOZ.js → chunk-PXUTIMGJ.js} +2318 -237
- package/dist/chunk-PXUTIMGJ.js.map +1 -0
- package/dist/{chunk-MGFEUYOH.js → chunk-U2VEWKKK.js} +3 -3
- package/dist/{chunk-JNPK46YH.js → chunk-VIEDXELL.js} +408 -6
- package/dist/chunk-VIEDXELL.js.map +1 -0
- package/dist/{chunk-VR4JIC5H.js → chunk-XTEZ3YJ4.js} +2 -2
- package/dist/index.d.ts +29 -4
- package/dist/index.js +109 -21
- package/dist/index.js.map +1 -1
- package/dist/kb-gate-CsXpNRk7.d.ts +1145 -0
- package/dist/{loop-runner-bin-DEm4roYF.d.ts → loop-runner-bin-Cgn0A-NW.d.ts} +1 -1
- package/dist/loop-runner-bin.d.ts +2 -2
- package/dist/loop-runner-bin.js +3 -3
- package/dist/loops.d.ts +3 -3
- package/dist/loops.js +57 -1
- package/dist/mcp/bin.js +187 -24
- package/dist/mcp/bin.js.map +1 -1
- package/dist/mcp/index.d.ts +28 -125
- package/dist/mcp/index.js +28 -6
- package/dist/mcp/index.js.map +1 -1
- package/dist/platform.js +2 -2
- package/dist/platform.js.map +1 -1
- package/dist/runtime.d.ts +1100 -62
- package/dist/runtime.js +57 -1
- package/dist/{types-Cbx3dNK5.d.ts → types-BpDfCPUp.d.ts} +1 -1
- package/dist/workflow.js +1 -1
- package/package.json +7 -6
- package/dist/chunk-5YDS7BLC.js +0 -218
- package/dist/chunk-5YDS7BLC.js.map +0 -1
- package/dist/chunk-72JQCHOZ.js.map +0 -1
- package/dist/chunk-JNPK46YH.js.map +0 -1
- package/dist/chunk-T4OQQEE3.js.map +0 -1
- package/dist/kb-gate-51BlLlVM.d.ts +0 -529
- /package/dist/{chunk-MGFEUYOH.js.map → chunk-U2VEWKKK.js.map} +0 -0
- /package/dist/{chunk-VR4JIC5H.js.map → chunk-XTEZ3YJ4.js.map} +0 -0
|
@@ -426,6 +426,180 @@ function isNoEntError(err) {
|
|
|
426
426
|
return typeof err === "object" && err !== null && "code" in err && err.code === "ENOENT";
|
|
427
427
|
}
|
|
428
428
|
|
|
429
|
+
// src/runtime/anytime.ts
|
|
430
|
+
/**
 * Median of a list of numbers.
 *
 * @param {number[]} xs - values to summarise; the input is not mutated.
 * @returns {number|null} the middle value (mean of the two middle values for
 *   an even count), or `null` when `xs` is empty.
 */
var median = (xs) => {
  if (xs.length === 0) return null;
  // Sort a copy numerically so the caller's array keeps its order.
  const ascending = [...xs];
  ascending.sort((lhs, rhs) => lhs - rhs);
  const upper = Math.floor(ascending.length / 2);
  if (ascending.length % 2 === 1) {
    return ascending[upper];
  }
  return (ascending[upper - 1] + ascending[upper]) / 2;
};
|
|
436
|
+
/**
 * Build "anytime" metrics (best score reached vs. time / shots / spend) from
 * shot spans.
 *
 * Only spans whose `label` starts with `"shot:"` are used. Spans are grouped
 * by `runId`; a run id of the form `agentic:<strategy>:<task>` is split into
 * strategy and task id, any other id is used for both.
 *
 * @param {Iterable<object>} spans - spans with `label`, `runId`, `startMs`,
 *   optional `endMs`, `usd` and optional numeric `score`.
 * @param {object} [opts]
 * @param {number[]} [opts.targets] - score thresholds to track (default `[1]`).
 * @param {(taskId: string) => number} [opts.targetFor] - per-task threshold;
 *   when given it replaces `targets` and the strategy rows carry
 *   `target: NaN`.
 * @returns {{targets: number[], perTask: object[], perStrategy: object[]}}
 *   per-task curves (`points`, `hits`) and per-strategy aggregates sorted by
 *   strategy name, then target.
 */
function anytimeReport(spans, opts) {
  const targets = opts?.targets ?? [1];
  const usesTaskTargets = Boolean(opts?.targetFor);
  const finishedAt = (span) => span.endMs ?? span.startMs;
  const mean = (values) => values.reduce((acc, v) => acc + v, 0) / values.length;
  const lastPoint = (curve) => curve.points[curve.points.length - 1];

  // 1. Bucket the shot spans by run.
  const shotsByRun = /* @__PURE__ */ new Map();
  for (const span of spans) {
    if (!span.label.startsWith("shot:")) continue;
    if (!shotsByRun.has(span.runId)) shotsByRun.set(span.runId, []);
    shotsByRun.get(span.runId).push(span);
  }

  // 2. One best-so-far curve per run, recording when each target is first met.
  const perTask = [];
  for (const [runId, shots] of shotsByRun) {
    const parts = runId.match(/^agentic:(.+):(.+)$/);
    const strategy = parts?.[1] ?? runId;
    const taskId = parts?.[2] ?? runId;
    const ordered = [...shots].sort((a, b) => finishedAt(a) - finishedAt(b));
    const t0 = Math.min(...ordered.map((shot) => shot.startMs));
    const taskTargets = usesTaskTargets ? [opts.targetFor(taskId)] : targets;
    const hits = {};
    for (const target of taskTargets) hits[String(target)] = null;
    const points = [];
    let best = 0;
    let cumUsd = 0;
    ordered.forEach((shot, position) => {
      cumUsd += shot.usd;
      if (typeof shot.score === "number" && shot.score > best) best = shot.score;
      const elapsedMs = finishedAt(shot) - t0;
      points.push({ elapsedMs, cumUsd, best });
      for (const target of taskTargets) {
        const slot = String(target);
        if (hits[slot] === null && best >= target) {
          hits[slot] = { ms: elapsedMs, shots: position + 1, usd: cumUsd };
        }
      }
    });
    perTask.push({ taskId, strategy, points, hits });
  }

  // 3. Aggregate the task curves per strategy.
  const tasksByStrategy = /* @__PURE__ */ new Map();
  for (const curve of perTask) {
    if (!tasksByStrategy.has(curve.strategy)) tasksByStrategy.set(curve.strategy, []);
    tasksByStrategy.get(curve.strategy).push(curve);
  }
  const perStrategy = [];
  for (const [strategy, tasks] of tasksByStrategy) {
    let totalMs = 0;
    let totalUsd = 0;
    let maxShots = 0;
    for (const curve of tasks) {
      totalMs += lastPoint(curve)?.elapsedMs ?? 0;
      maxShots = Math.max(maxShots, curve.points.length);
    }
    for (const curve of tasks) {
      totalUsd += lastPoint(curve)?.cumUsd ?? 0;
    }
    // Mean best-so-far after shot i; a task with fewer shots holds its final value.
    const curveByShot = Array.from(
      { length: maxShots },
      (_, i) => mean(tasks.map((curve) => curve.points[Math.min(i, curve.points.length - 1)].best))
    );
    const auc = curveByShot.length > 0 ? mean(curveByShot) : 0;
    const summaryTargets = usesTaskTargets ? [Number.NaN] : targets;
    for (const target of summaryTargets) {
      const hitOf = (curve) => (usesTaskTargets ? Object.values(curve.hits)[0] : curve.hits[String(target)]) ?? null;
      const reachedHits = tasks.map(hitOf).filter((hit) => hit !== null);
      const successes = reachedHits.length;
      perStrategy.push({
        strategy,
        target,
        tasks: tasks.length,
        reachedTarget: successes,
        medianTttMs: median(reachedHits.map((hit) => hit.ms)),
        medianShotsToTarget: median(reachedHits.map((hit) => hit.shots)),
        // Totals cover every task of the strategy, divided by the successes only.
        ertMs: successes > 0 ? totalMs / successes : null,
        erUsd: successes > 0 ? totalUsd / successes : null,
        curveByShot,
        auc
      });
    }
  }
  perStrategy.sort((a, b) => a.strategy.localeCompare(b.strategy) || a.target - b.target);
  return { targets, perTask, perStrategy };
}
|
|
511
|
+
/**
 * Render an `anytimeReport` result as a plain-text table.
 *
 * Emits a title line listing `report.targets`, a column header, then one row
 * per `report.perStrategy` entry. `null` metrics print as a dash, a `NaN`
 * target prints as `task`, and `curveByShot` becomes a block-character
 * sparkline (each value scaled by 8 and capped at the tallest glyph).
 *
 * @param {{targets: number[], perStrategy: object[]}} report
 * @returns {string} lines joined with "\n".
 */
function renderAnytimeTable(report) {
  const glyphs = "\u2581\u2582\u2583\u2584\u2585\u2586\u2587\u2588";
  const orDash = (value, format) => (value === null ? " \u2014" : format(value));
  const seconds = (width) => (ms) => `${(ms / 1e3).toFixed(1).padStart(width)}s`;
  const heading = [
    `anytime metrics \xB7 satisficing targets [${report.targets.join(", ")}] \xB7 ERT = \u03A3 all wall-time / #successes (COCO)`,
    "strategy \u2265tgt reach med-TTT med-shots ERT(all-in) $/success AUC curve"
  ];
  const rows = [];
  for (const row of report.perStrategy) {
    const sparkline = row.curveByShot.map((v) => glyphs[Math.min(7, Math.floor(v * 8))]).join("");
    const targetLabel = Number.isNaN(row.target) ? "task" : row.target.toFixed(2);
    const cells = [
      row.strategy.padEnd(19),
      targetLabel.padStart(4),
      `${String(row.reachedTarget).padStart(4)}/${String(row.tasks).padEnd(3)}`,
      orDash(row.medianTttMs, seconds(6)),
      orDash(row.medianShotsToTarget, (shots) => String(shots).padStart(5)),
      orDash(row.ertMs, seconds(9)),
      orDash(row.erUsd, (usd) => `$${usd.toFixed(4)}`),
      row.auc.toFixed(2),
      sparkline
    ];
    rows.push(cells.join(" "));
  }
  return heading.concat(rows).join("\n");
}
|
|
525
|
+
|
|
526
|
+
// src/runtime/audit-intent.ts
|
|
527
|
+
// Default system prompt for auditIntent(); used when the caller does not pass
// `opts.auditorInstruction`. The text is sent to the model verbatim.
var defaultAuditorInstruction = "You audit whether an AI agent is on the RIGHT ROUTE \u2014 not whether it works hard, but whether its actions serve the stated intents. Infer the REVEALED intent from the action pattern (what the trajectory is actually optimizing). Compare against the declared task intent, the user intent when given, and the meta-intent when given. Flawless execution down the wrong route is DIVERGED. Busy-work that neither advances nor harms is DRIFTING. Judge only from the trajectory \u2014 be specific about which actions ground your verdict. Recommend abort only when continuing cannot serve the intent.";
|
|
528
|
+
/**
 * Condense a chat-style trace into one short line per event for the auditor.
 *
 * - `tool` messages      -> `RESULT <first 200 chars of content>`
 * - `assistant` messages -> `CALL name(args), ...` when they carry tool calls
 *                           (each argument string cut to 120 chars), otherwise
 *                           `SAY <first 160 chars of content>`
 * - `user` messages      -> `USER <first 160 chars of content>`
 * - any other role (and non-object events) is skipped.
 *
 * @param {Iterable<object>} trace - messages with `role`, `content` and
 *   optional `tool_calls[].function.{name, arguments}`.
 * @param {number} maxLines - keep only the LAST `maxLines` lines; a numeric
 *   value of 0 or less yields an empty summary.
 * @returns {string} the kept lines joined with "\n".
 */
function summarize(trace, maxLines) {
  // `slice(-0)` is `slice(0)`, which would return the ENTIRE trace for a zero
  // budget, so non-positive budgets are answered explicitly.
  if (typeof maxLines === "number" && maxLines <= 0) return "";

  // Tool-call arguments are normally a JSON string; tolerate providers that
  // hand back an already-parsed object instead of throwing on `.slice`.
  const renderArgs = (args) => {
    if (args == null) return "";
    if (typeof args === "string") return args;
    return JSON.stringify(args) ?? String(args);
  };

  const lines = [];
  for (const ev of trace) {
    const role = ev?.role;
    if (role === "tool") {
      lines.push(`RESULT ${String(ev.content).slice(0, 200)}`);
    } else if (role === "assistant") {
      const toolCalls = Array.isArray(ev.tool_calls) ? ev.tool_calls : [];
      const calls = toolCalls
        .map((c) => `${c?.function?.name}(${renderArgs(c?.function?.arguments).slice(0, 120)})`)
        .join(", ");
      lines.push(calls ? `CALL ${calls}` : `SAY ${String(ev.content).slice(0, 160)}`);
    } else if (role === "user") {
      lines.push(`USER ${String(ev.content).slice(0, 160)}`);
    }
  }
  return lines.slice(-maxLines).join("\n");
}
|
|
541
|
+
// Structured-output schema passed as `jsonSchema` by auditIntent(): the
// auditor must answer with a single object carrying its revealed-intent
// reading, a verdict, supporting evidence, one recommendation and a
// confidence number.
var auditSchema = {
  name: "intent_audit",
  schema: {
    type: "object",
    additionalProperties: false,
    // `steer` is declared below but deliberately absent from `required`.
    // NOTE(review): some providers' strict structured-output modes require
    // every declared property to be listed here — confirm against the chat
    // backend in use.
    required: ["revealedIntent", "verdict", "evidence", "recommendation", "confidence"],
    properties: {
      revealedIntent: { type: "string" },
      verdict: { type: "string", enum: ["aligned", "drifting", "diverged"] },
      evidence: { type: "string" },
      recommendation: { type: "string", enum: ["continue", "steer", "abort"] },
      steer: { type: "string" },
      confidence: { type: "number" }
    }
  }
};
|
|
557
|
+
/**
 * Ask an auditor model whether an agent's trajectory still serves its intents.
 *
 * Sends one chat request (system instruction + a user message listing the
 * declared intent, the optional user/meta intents and a summarized trace),
 * constrained by `auditSchema`, and normalises the JSON reply.
 *
 * @param {object} input
 * @param {string} input.declaredIntent - the task as stated.
 * @param {string} [input.userIntent] - included in the prompt when truthy.
 * @param {string} [input.metaIntent] - included in the prompt when truthy.
 * @param {Iterable<object>} input.trace - messages handed to `summarize`.
 * @param {object} opts
 * @param {{chat: Function}} opts.chat - client whose `chat(request, options)`
 *   resolves to an object with a `content` string.
 * @param {string} [opts.model] - forwarded only when set.
 * @param {string} [opts.auditorInstruction] - overrides the default system prompt.
 * @param {number} [opts.maxTraceLines] - trace line budget (default 80).
 * @param {AbortSignal} [opts.signal] - forwarded only when set.
 * @returns {Promise<object>} `{ revealedIntent, verdict, evidence,
 *   recommendation, steer?, confidence }`; `confidence` falls back to 0.5
 *   when the reply's value is not a number.
 * @throws {Error} when the reply is not JSON, or lacks a verdict or
 *   recommendation.
 */
async function auditIntent(input, opts) {
  const sections = [`DECLARED INTENT (the task):\n${input.declaredIntent}\n\n`];
  if (input.userIntent) {
    sections.push(`USER INTENT (the principal's actual goal):\n${input.userIntent}\n\n`);
  }
  if (input.metaIntent) {
    sections.push(`META-INTENT (what the whole run is for):\n${input.metaIntent}\n\n`);
  }
  sections.push(
    `TRAJECTORY (in order):\n${summarize(input.trace, opts.maxTraceLines ?? 80)}\n\nAudit the route: revealed intent, verdict, evidence, one recommendation.`
  );

  const res = await opts.chat.chat(
    {
      ...opts.model ? { model: opts.model } : {},
      jsonSchema: auditSchema,
      messages: [
        { role: "system", content: opts.auditorInstruction ?? defaultAuditorInstruction },
        { role: "user", content: sections.join("") }
      ]
    },
    { ...opts.signal ? { signal: opts.signal } : {} }
  );

  // A missing or non-string reply must surface as the descriptive "non-JSON"
  // error below, not as a TypeError raised while building that message.
  const raw = typeof res?.content === "string" ? res.content : "";
  const preview = raw.slice(0, 200);
  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch (err) {
    throw new Error(`auditIntent: auditor returned non-JSON: ${preview}`, { cause: err });
  }
  // Valid JSON that is not an object (null, a number, ...) has no verdict either.
  if (parsed === null || typeof parsed !== "object" || !parsed.verdict || !parsed.recommendation) {
    throw new Error(`auditIntent: missing verdict/recommendation: ${preview}`);
  }
  return {
    revealedIntent: parsed.revealedIntent ?? "",
    verdict: parsed.verdict,
    evidence: parsed.evidence ?? "",
    recommendation: parsed.recommendation,
    ...parsed.steer ? { steer: parsed.steer } : {},
    confidence: typeof parsed.confidence === "number" ? parsed.confidence : 0.5
  };
}
|
|
602
|
+
|
|
429
603
|
// src/runtime/completion.ts
|
|
430
604
|
function completionAuthorizes(v, policy) {
|
|
431
605
|
if (!v?.done) return false;
|
|
@@ -674,8 +848,8 @@ function validateMove(move, maxFanout) {
|
|
|
674
848
|
);
|
|
675
849
|
}
|
|
676
850
|
}
|
|
677
|
-
async function runAnalyze(
|
|
678
|
-
const findings = await
|
|
851
|
+
async function runAnalyze(analyze2, task, history) {
|
|
852
|
+
const findings = await analyze2({ task, history });
|
|
679
853
|
if (!Array.isArray(findings)) {
|
|
680
854
|
throw new PlannerError(
|
|
681
855
|
`createDriver: analyze hook must return AnalystFinding[], got ${stringifySafe(findings)}`
|
|
@@ -703,6 +877,214 @@ function renderAnalyses(findings) {
|
|
|
703
877
|
${rows.join("\n")}`;
|
|
704
878
|
}
|
|
705
879
|
|
|
880
|
+
// src/runtime/observe.ts
|
|
881
|
+
import { makeFinding } from "@tangle-network/agent-eval";
|
|
882
|
+
// Identifier stamped on every finding as `analyst_id` by observe(); it is also
// the fallback run id for corpus records when the input carries no `runId`.
var observerId = "observe/trace";
// Default system prompt for observe(); used when the caller does not pass
// `opts.analystInstruction`. The text is sent to the model verbatim.
var defaultAnalystInstruction = "You are a third-person OBSERVER watching an AI agent work. You see its TRACE (what it did), not its grader. From the trace, name SPECIFIC, behavior-grounded findings: wasted/duplicated tool calls, thrash/retries, token/cost waste, missing verification, failure patterns. For each, a concrete recommended_action, and whether the AGENT (fix its skills/prompt/tools) or the OPERATOR (fix framing/decomposition/config) should act. Only claim what the trace shows. No findings if the run was clean.";
|
|
884
|
+
/**
 * Condense an event trace into short lines for the observer prompt.
 *
 * Per event (first matching rule wins):
 * - `data.part.type === "tool"`        -> `tool:<name>` plus `(<status>)` when present
 * - event `type` contains "error"      -> `ERROR: <message or detail, 200 chars>`
 * - `type` is "status" with a string   -> `status:<status>`
 * - `type` contains "tool"             -> `tool-event:<type>`
 * Everything else is dropped. Consecutive identical lines are collapsed into
 * `<line> xN`, and only the FIRST `maxLines` collapsed lines are kept.
 *
 * @param {Iterable<object>} trace - events with optional `type` and `data`.
 * @param {number} maxLines - maximum number of collapsed lines to return.
 * @returns {string} the summary, or a placeholder sentence when nothing matched.
 */
function summarizeTrace(trace, maxLines) {
  const lines = [];
  for (const ev of trace) {
    if (ev === null || typeof ev !== "object") continue;
    // Event types are matched case-insensitively; a non-string type matches nothing.
    const kind = typeof ev.type === "string" ? ev.type.toLowerCase() : "";
    const data = ev.data ?? {};
    const part = data.part ?? {};
    if (part.type === "tool") {
      lines.push(`tool:${part.tool}${part.state?.status ? `(${part.state.status})` : ""}`);
    } else if (kind.includes("error")) {
      lines.push(`ERROR: ${String(data.message ?? data.detail ?? "").slice(0, 200)}`);
    } else if (kind === "status" && typeof data.status === "string") {
      lines.push(`status:${data.status}`);
    } else if (kind.includes("tool")) {
      lines.push(`tool-event:${kind}`);
    }
  }

  // Run-length encode with explicit counters. Re-parsing the previously
  // rendered line would mistake a genuine " xN" suffix for a counter and
  // could never match a message that spans several lines.
  const runs = [];
  for (const line of lines) {
    const last = runs[runs.length - 1];
    if (last !== undefined && last.line === line) last.count += 1;
    else runs.push({ line, count: 1 });
  }
  const collapsed = runs.map(({ line, count }) => (count > 1 ? `${line} x${count}` : line));
  return collapsed.slice(0, maxLines).join("\n") || "(no tool/error events in trace)";
}
|
|
907
|
+
// Structured-output schema passed as `jsonSchema` by observe(): the observer
// must answer with `{ findings: [...] }`, each finding carrying all six
// fields listed in `required` below.
var findingsSchema = {
  name: "observer_findings",
  schema: {
    type: "object",
    additionalProperties: false,
    properties: {
      findings: {
        type: "array",
        items: {
          type: "object",
          additionalProperties: false,
          properties: {
            // Free-form string; the description only suggests the usual values.
            area: {
              type: "string",
              description: "tool-use | cost | verification | process | failure | latency"
            },
            severity: { type: "string", enum: ["critical", "high", "medium", "low", "info"] },
            claim: {
              type: "string",
              description: "what you OBSERVED in the trace (a fact, with the evidence)"
            },
            recommended_action: {
              type: "string",
              description: "the concrete change for the agent or operator"
            },
            // Used by observe() to tag corpus records and by renderReport()
            // to split the report into agent / operator sections.
            audience: {
              type: "string",
              enum: ["agent", "operator"],
              description: "who should act on this"
            },
            confidence: { type: "number" }
          },
          required: ["area", "severity", "claim", "recommended_action", "audience", "confidence"]
        }
      }
    },
    required: ["findings"]
  }
};
|
|
946
|
+
/**
 * Have an observer model review one agent run and turn its reply into findings.
 *
 * Sends a single chat request (system instruction + task, outcome, truncated
 * final output and summarized trace) constrained by `findingsSchema`, wraps
 * each returned item with `makeFinding`, and — when `opts.corpus` is given —
 * appends one record per finding, one at a time, keeping those whose append
 * reported `succeeded`.
 *
 * @param {object} input
 * @param {string} input.task - task description shown to the observer.
 * @param {string} [input.outcome] - defaults to "unknown" in the prompt.
 * @param {string} input.output - final output; only the first 1200 chars are sent.
 * @param {Iterable<object>} input.trace - events handed to `summarizeTrace`.
 * @param {string} [input.runId] - becomes each finding's `subject` and the
 *   corpus record's `runId`.
 * @param {object} opts
 * @param {{chat: Function}} opts.chat - client whose `chat(request, options)`
 *   resolves to an object with `content`.
 * @param {string} [opts.model] - forwarded only when set.
 * @param {string} [opts.analystInstruction] - overrides the default system prompt.
 * @param {number} [opts.maxTraceLines] - trace line budget (default 80).
 * @param {AbortSignal} [opts.signal] - forwarded only when set.
 * @param {{append: Function}} [opts.corpus] - sink for learned records.
 * @param {string[]} [opts.tags] - extra tags for every corpus record.
 * @returns {Promise<{findings: object[], learned: object[], report: string}>}
 */
async function observe(input, opts) {
  const traceSummary = summarizeTrace(input.trace, opts.maxTraceLines ?? 80);

  const userPrompt = [
    `TASK: ${input.task}`,
    "",
    `OUTCOME: ${input.outcome ?? "unknown"}`,
    "",
    "FINAL OUTPUT (truncated):",
    `${input.output.slice(0, 1200)}`,
    "",
    `TRACE (in order; "xN" = repeated):`,
    `${traceSummary}`
  ].join("\n");

  const request = {};
  if (opts.model) request.model = opts.model;
  request.jsonSchema = findingsSchema;
  request.messages = [
    { role: "system", content: opts.analystInstruction ?? defaultAnalystInstruction },
    { role: "user", content: userPrompt }
  ];
  const callOptions = opts.signal ? { signal: opts.signal } : {};
  const res = await opts.chat.chat(request, callOptions);

  const rawFindings = parseFindings(res.content);
  const producedAt = input.runId ? `${input.runId}` : observerId;
  const findings = rawFindings.map((raw) => {
    const draft = {
      analyst_id: observerId,
      area: `${raw.area}`,
      severity: raw.severity,
      claim: raw.claim,
      recommended_action: raw.recommended_action,
      confidence: typeof raw.confidence === "number" ? raw.confidence : 0.5,
      evidence_refs: [],
      // The observer reads BEHAVIOR, never the judge verdict — firewall provenance.
      derived_from_judge: false,
      metadata: { audience: raw.audience }
    };
    if (input.runId) draft.subject = input.runId;
    return makeFinding(draft);
  });

  const learned = [];
  if (opts.corpus) {
    for (const finding of findings) {
      const record = {
        schemaVersion: "1.0.0",
        id: finding.finding_id,
        runId: input.runId ?? observerId,
        producedAt: finding.produced_at ?? producedAt,
        area: finding.area,
        // The actionable lesson is stored as the claim; the observation backs it.
        claim: finding.recommended_action ?? finding.claim
      };
      if (finding.claim) record.rationale = finding.claim;
      record.tags = [...opts.tags ?? [], `audience:${finding.metadata?.audience ?? "agent"}`];
      record.confidence = finding.confidence;
      record.evidence = [{ kind: "finding", uri: finding.finding_id }];
      const outcome = await opts.corpus.append(record);
      if (outcome.succeeded) learned.push(record);
    }
  }
  return { findings, learned, report: renderReport(findings) };
}
|
|
1011
|
+
/**
 * Extract the `findings` array from the observer's reply.
 *
 * Tries the whole reply as JSON first; if that fails, retries on the span from
 * the first "{" to the last "}" (covers replies wrapped in prose or a code
 * fence). A reply with no JSON object at all, a JSON value that is not an
 * object, or a `findings` that is not an array all yield `[]`.
 *
 * @param {string} content - raw reply text.
 * @returns {object[]} the findings as returned by the model (unvalidated).
 * @throws {SyntaxError} when the reply contains a brace-delimited span that is
 *   still not valid JSON; the message names this function and previews the
 *   reply, and the parser's own error is attached as `cause`.
 */
function parseFindings(content) {
  // A missing reply is treated like an empty one: nothing to report.
  if (typeof content !== "string") return [];
  let obj;
  try {
    obj = JSON.parse(content);
  } catch {
    const embedded = content.match(/\{[\s\S]*\}/);
    if (!embedded) return [];
    try {
      obj = JSON.parse(embedded[0]);
    } catch (err) {
      throw new SyntaxError(
        `parseFindings: observer reply is not valid JSON: ${content.slice(0, 200)}`,
        { cause: err }
      );
    }
  }
  // `JSON.parse("null")` is valid JSON but has no properties to read.
  const arr = obj?.findings;
  return Array.isArray(arr) ? arr : [];
}
|
|
1022
|
+
/**
 * Render findings as a two-section markdown report.
 *
 * Findings are split by `metadata.audience` ("agent" when absent) into an
 * agent section and an operator section; empty sections are omitted. Each
 * finding prints its severity and claim, then its recommended action on the
 * next line.
 *
 * @param {object[]} findings
 * @returns {string} the report, or a "clean run" sentence for no findings.
 */
function renderReport(findings) {
  if (findings.length === 0) return "\u2713 clean run \u2014 the observer found nothing to change.";
  const sections = [
    ["For the agent (fix skills / prompt / tools)", "agent"],
    ["For you (the operator)", "operator"]
  ];
  const rendered = [];
  for (const [title, who] of sections) {
    const group = findings.filter((f) => (f.metadata?.audience ?? "agent") === who);
    if (group.length === 0) continue;
    const bullets = group.map(
      (f) => `- [${f.severity}] ${f.claim}\n\u2192 ${f.recommended_action ?? ""}`
    );
    rendered.push(`**${title}**\n${bullets.join("\n")}\n`);
  }
  return rendered.join("\n");
}
|
|
1036
|
+
|
|
1037
|
+
// src/runtime/harvest-corpus.ts
|
|
1038
|
+
/**
 * Run `observe` over a stream of finished runs with bounded concurrency and
 * tally what was learned.
 *
 * @param {object} opts
 * @param {Iterable<object>|AsyncIterable<object>} opts.runs - observe() inputs.
 * @param {{chat: Function}} opts.chat - forwarded to observe().
 * @param {object} opts.corpus - forwarded to observe().
 * @param {string} [opts.model] - forwarded only when set.
 * @param {string[]} [opts.tags] - forwarded (defaults to `[]`).
 * @param {string} [opts.analystInstruction] - forwarded only when set.
 * @param {AbortSignal} [opts.signal] - forwarded; workers also stop pulling
 *   new runs once it is aborted.
 * @param {number} [opts.concurrency] - number of workers (default 4, minimum 1).
 * @param {number} [opts.maxRuns] - hard cap on runs pulled from `opts.runs`.
 * @returns {Promise<{runsObserved: number, findings: number, learned: number,
 *   failures: {runId: string, error: string}[]}>}
 * @throws {Error} when at least one run was attempted and every one failed.
 */
async function harvestCorpus(opts) {
  const concurrency = Math.max(1, opts.concurrency ?? 4);
  const report = { runsObserved: 0, findings: 0, learned: 0, failures: [] };
  const iterator = Symbol.asyncIterator in Object(opts.runs) ? opts.runs[Symbol.asyncIterator]() : (async function* () {
    yield* opts.runs;
  })();
  const limit = opts.maxRuns;
  let claimed = 0;
  let dequeued = 0;
  let done = false;
  // Hands out the next run together with its 1-based position in the stream.
  const next = async () => {
    if (done || limit !== void 0 && claimed >= limit) return null;
    // Reserve the slot BEFORE awaiting: every worker calls next() in the same
    // tick, so checking a counter that is only bumped after the await would
    // let all of them through and overshoot `maxRuns`.
    claimed += 1;
    const r = await iterator.next();
    if (r.done) {
      done = true;
      return null;
    }
    dequeued += 1;
    return { input: r.value, ordinal: dequeued };
  };
  const worker = async () => {
    for (let item = await next(); item !== null; item = await next()) {
      if (opts.signal?.aborted) return;
      const { input, ordinal } = item;
      try {
        const obs = await observe(input, {
          chat: opts.chat,
          ...opts.model ? { model: opts.model } : {},
          corpus: opts.corpus,
          tags: opts.tags ?? [],
          ...opts.analystInstruction ? { analystInstruction: opts.analystInstruction } : {},
          ...opts.signal ? { signal: opts.signal } : {}
        });
        report.runsObserved += 1;
        report.findings += obs.findings.length;
        report.learned += obs.learned.length;
      } catch (e) {
        report.failures.push({
          // Label with this run's own position, not a shared counter that
          // other workers have advanced in the meantime.
          runId: input?.runId ?? `run-${ordinal}`,
          error: e instanceof Error ? e.message.slice(0, 300) : String(e)
        });
      }
    }
  };
  await Promise.all(Array.from({ length: concurrency }, () => worker()));
  if (report.runsObserved === 0 && report.failures.length > 0) {
    throw new Error(
      `harvestCorpus: every run failed analysis (${report.failures.length}) \u2014 first: ${report.failures[0]?.error}`
    );
  }
  return report;
}
|
|
1087
|
+
|
|
706
1088
|
// src/runtime/inline-sandbox-client.ts
|
|
707
1089
|
function isAsyncIterable(v) {
|
|
708
1090
|
return typeof v === "object" && v !== null && Symbol.asyncIterator in v;
|
|
@@ -1651,8 +2033,8 @@ function defaultSelectWinner(iterations) {
|
|
|
1651
2033
|
const candidates = iterations.filter((iter) => iter.output !== void 0 && !iter.error);
|
|
1652
2034
|
if (candidates.length === 0) return void 0;
|
|
1653
2035
|
const valid = candidates.filter((iter) => iter.verdict?.valid === true);
|
|
1654
|
-
const
|
|
1655
|
-
const sorted = [...
|
|
2036
|
+
const pool2 = valid.length > 0 ? valid : candidates;
|
|
2037
|
+
const sorted = [...pool2].sort(
|
|
1656
2038
|
(a, b) => (b.verdict?.score ?? 0) - (a.verdict?.score ?? 0) || a.index - b.index
|
|
1657
2039
|
);
|
|
1658
2040
|
const top = sorted[0];
|
|
@@ -1737,160 +2119,96 @@ function loopDispatch(opts) {
|
|
|
1737
2119
|
return (profile, scenario, ctx) => runLoopForCell(opts, scenario, profile, ctx);
|
|
1738
2120
|
}
|
|
1739
2121
|
|
|
1740
|
-
// src/runtime/
|
|
1741
|
-
|
|
1742
|
-
|
|
1743
|
-
|
|
1744
|
-
|
|
1745
|
-
|
|
1746
|
-
|
|
1747
|
-
|
|
1748
|
-
|
|
1749
|
-
|
|
1750
|
-
|
|
1751
|
-
|
|
1752
|
-
|
|
1753
|
-
|
|
1754
|
-
|
|
1755
|
-
|
|
2122
|
+
// src/runtime/mcp-environment.ts
|
|
2123
|
+
/**
 * POST one JSON-RPC request to an MCP endpoint and decode the reply.
 *
 * The reply body may be plain JSON or a server-sent-event stream; for the
 * latter the LAST `data:` line is taken as the payload. A body that cannot be
 * parsed as JSON is returned as raw text in `json`. The HTTP status is
 * returned, never checked here.
 *
 * Only a thrown error (from `fetch` or from reading the body) triggers a
 * retry: up to 4 attempts, waiting 1 s, 2 s, then 3 s between them.
 *
 * @param {{url: string, headers?: object}} endpoint
 * @param {object} body - JSON-RPC request, serialised with JSON.stringify.
 * @returns {Promise<{status: number, json: any}>}
 * @throws {Error} after the final attempt fails; the last error is attached
 *   as `cause`.
 */
async function rpc(endpoint, body) {
  const maxAttempts = 4;
  let lastErr;
  for (let attempt = 0; attempt < maxAttempts; attempt += 1) {
    try {
      const r = await fetch(endpoint.url, {
        method: "POST",
        headers: { "content-type": "application/json", ...endpoint.headers ?? {} },
        body: JSON.stringify(body)
      });
      const text = await r.text();
      const dataLines = text.split("\n").filter((l) => l.startsWith("data:")).map((l) => l.slice(5).trim());
      const payload = dataLines.length ? dataLines[dataLines.length - 1] : text;
      try {
        return { status: r.status, json: JSON.parse(payload ?? "null") };
      } catch {
        // Not JSON (e.g. an HTML error page): hand the raw body to the caller.
        return { status: r.status, json: text };
      }
    } catch (err) {
      lastErr = err;
      // Back off only when another attempt follows; sleeping after the final
      // failure would just delay the error by several seconds.
      if (attempt < maxAttempts - 1) {
        await new Promise((res) => setTimeout(res, 1e3 * (attempt + 1)));
      }
    }
  }
  throw new Error(
    `mcp rpc ${endpoint.url} failed after 4 attempts: ${lastErr instanceof Error ? lastErr.message : String(lastErr)}`,
    { cause: lastErr }
  );
}
|
|
2149
|
+
/**
 * Reduce a tool's input schema to a minimal object schema.
 *
 * A schema is passed through (keeping only `type`, `properties` and an array
 * `required`) when it is an object schema with a `properties` object and no
 * truthy top-level `oneOf` / `anyOf` / `allOf` / `not` / `enum`. Anything else
 * becomes an empty object schema. Nested property schemas are not inspected.
 *
 * @param {unknown} s - schema as reported by the tool server.
 * @returns {{type: "object", properties: object, required?: any[]}}
 */
function sanitizeSchema(s) {
  const emptySchema = { type: "object", properties: {} };
  if (!s || typeof s !== "object") return emptySchema;

  const usesUnsupportedKeyword = ["oneOf", "anyOf", "allOf", "not", "enum"].some((keyword) => s[keyword]);
  const isPlainObjectSchema =
    s.type === "object" && !usesUnsupportedKeyword && s.properties && typeof s.properties === "object";
  if (!isPlainObjectSchema) return emptySchema;

  const cleaned = { type: "object", properties: s.properties };
  if (Array.isArray(s.required)) cleaned.required = s.required;
  return cleaned;
}
|
|
1766
|
-
|
|
1767
|
-
|
|
1768
|
-
|
|
1769
|
-
|
|
1770
|
-
|
|
1771
|
-
|
|
1772
|
-
|
|
1773
|
-
|
|
1774
|
-
|
|
1775
|
-
type: "object",
|
|
1776
|
-
additionalProperties: false,
|
|
1777
|
-
properties: {
|
|
1778
|
-
area: {
|
|
1779
|
-
type: "string",
|
|
1780
|
-
description: "tool-use | cost | verification | process | failure | latency"
|
|
1781
|
-
},
|
|
1782
|
-
severity: { type: "string", enum: ["critical", "high", "medium", "low", "info"] },
|
|
1783
|
-
claim: {
|
|
1784
|
-
type: "string",
|
|
1785
|
-
description: "what you OBSERVED in the trace (a fact, with the evidence)"
|
|
1786
|
-
},
|
|
1787
|
-
recommended_action: {
|
|
1788
|
-
type: "string",
|
|
1789
|
-
description: "the concrete change for the agent or operator"
|
|
1790
|
-
},
|
|
1791
|
-
audience: {
|
|
1792
|
-
type: "string",
|
|
1793
|
-
enum: ["agent", "operator"],
|
|
1794
|
-
description: "who should act on this"
|
|
1795
|
-
},
|
|
1796
|
-
confidence: { type: "number" }
|
|
1797
|
-
},
|
|
1798
|
-
required: ["area", "severity", "claim", "recommended_action", "audience", "confidence"]
|
|
1799
|
-
}
|
|
1800
|
-
}
|
|
2161
|
+
/**
 * Build an environment whose tools are served by an MCP endpoint over
 * JSON-RPC (`tools/list`, `tools/call`).
 *
 * The endpoint for each workspace is remembered by `handle.id` between
 * `open` and `close`.
 *
 * @param {object} opts
 * @param {string} opts.name - environment name, also used in error messages.
 * @param {(task: any) => Promise<{handle: {id: any}, endpoint: object}>} opts.open
 * @param {(task: any, handle: object) => any} opts.score
 * @param {(handle: object) => any} [opts.close]
 * @param {(task: any, tools: object[]) => object[]} [opts.selectTools] -
 *   optional filter applied to the listed tools.
 * @param {number} [opts.maxResultChars] - cap on tool result text (default 1500).
 * @returns {object} `{ name, open, tools, call, score, close }`.
 */
function createMcpEnvironment(opts) {
  const endpoints = /* @__PURE__ */ new Map();
  const maxChars = opts.maxResultChars ?? 1500;
  return {
    name: opts.name,
    /** Open a workspace for `task` and remember its endpoint. */
    async open(task) {
      const { handle, endpoint } = await opts.open(task);
      endpoints.set(handle.id, endpoint);
      return handle;
    },
    /**
     * List the endpoint's tools as function-tool definitions (description cut
     * to 1000 chars, schema sanitized). A reply without a tool array yields
     * an empty list.
     * @throws {Error} when called for a handle that is not open.
     */
    async tools(task, handle) {
      const endpoint = endpoints.get(handle.id);
      if (!endpoint) throw new Error(`${opts.name}: tools() before open() for ${handle.id}`);
      const { json } = await rpc(endpoint, {
        jsonrpc: "2.0",
        id: 1,
        method: "tools/list",
        params: {}
      });
      // `json` may be null, raw text, or an error envelope — none of which
      // carries a tool array; treat them all as "no tools" rather than throw.
      const listed = json?.result?.tools;
      const all = (Array.isArray(listed) ? listed : []).map(
        (t) => ({
          type: "function",
          function: {
            name: t.name,
            description: (typeof t.description === "string" ? t.description : "").slice(0, 1e3),
            parameters: sanitizeSchema(t.inputSchema)
          }
        })
      );
      return opts.selectTools ? opts.selectTools(task, all) : all;
    },
    /**
     * Invoke a tool and return its result as text, cut to `maxResultChars`.
     * Failures the model should see are returned as `ERROR: ...` strings.
     */
    async call(handle, name, args) {
      const endpoint = endpoints.get(handle.id);
      if (!endpoint) return "ERROR: workspace closed";
      const { json } = await rpc(endpoint, {
        jsonrpc: "2.0",
        id: 2,
        method: "tools/call",
        params: { name, arguments: args }
      });
      const result = json ?? {};
      if (result.error) return `ERROR: ${JSON.stringify(result.error).slice(0, 300)}`;
      const content = result.result?.content;
      // Text parts are joined; any other result shape is shown as JSON.
      const text = Array.isArray(content)
        ? content.map((c) => c?.text ?? "").join("\n")
        : JSON.stringify(result.result ?? json) ?? "";
      return text.slice(0, maxChars);
    },
    score: (task, handle) => opts.score(task, handle),
    /** Forget the endpoint, then run the caller's own cleanup if provided. */
    async close(handle) {
      endpoints.delete(handle.id);
      await opts.close?.(handle);
    }
  };
}
|
|
1895
2213
|
|
|
1896
2214
|
// src/runtime/supervise/scope.ts
|
|
@@ -2123,12 +2441,12 @@ async function finalizeSettlement(child, settlement, seq, args, now) {
|
|
|
2123
2441
|
seq
|
|
2124
2442
|
};
|
|
2125
2443
|
}
|
|
2126
|
-
async function runChild(live, executor, childAbort, task, opts,
|
|
2444
|
+
async function runChild(live, executor, childAbort, task, opts, pool2, ticket, blobs) {
|
|
2127
2445
|
let reconciled = false;
|
|
2128
2446
|
const reconcileOnce = (spend) => {
|
|
2129
2447
|
if (reconciled) return;
|
|
2130
2448
|
reconciled = true;
|
|
2131
|
-
|
|
2449
|
+
pool2.reconcile(ticket, clampSpend(spend, opts.budget));
|
|
2132
2450
|
};
|
|
2133
2451
|
try {
|
|
2134
2452
|
live.status = "running";
|
|
@@ -2875,6 +3193,118 @@ var routerInlineExecutor = (spec, ctx) => {
|
|
|
2875
3193
|
}
|
|
2876
3194
|
};
|
|
2877
3195
|
};
|
|
3196
|
+
var routerToolsSeamKey = "router-tools";
|
|
3197
|
+
/**
 * Inline executor for the "router-tools" backend: drives a chat-completions
 * loop against the router, executing every tool call the model requests via
 * `seam.executeToolCall`, until the model stops asking for tools or
 * `maxTurns` (default 200) completions have been made.
 *
 * @param spec - Executor spec; `spec.profile.model?.default` is the fallback model.
 * @param ctx - Executor context; provides the seam (through `readSeam`) and `ctx.signal`.
 * @returns An executor exposing `runtime`, `execute`, `teardown` and `resultArtifact`.
 * @throws {ValidationError} When no model is configured, when
 *   `routerBaseUrl`/`routerKey` are missing, when the router answers with a
 *   non-OK status (from `execute`), or when `resultArtifact()` is read before
 *   `execute()` has produced an artifact.
 */
var routerToolsInlineExecutor = (spec, ctx) => {
  const seam = readSeam(ctx, routerToolsSeamKey, "router-tools");
  const model = seam.model ?? spec.profile.model?.default;
  if (!model) {
    throw new ValidationError(
      "routerToolsInlineExecutor: no model \u2014 set RouterToolsSeam.model or AgentProfile.model.default"
    );
  }
  if (!seam.routerBaseUrl || !seam.routerKey) {
    throw new ValidationError(
      "routerToolsInlineExecutor: RouterToolsSeam.routerBaseUrl + routerKey required"
    );
  }
  const maxTurns = seam.maxTurns ?? 200;
  // Private controller so teardown() can cancel in-flight requests even when
  // the context signal itself never fires.
  const controller = new AbortController();
  const abortIfSignalled = () => {
    if (ctx.signal.aborted) controller.abort();
  };
  abortIfSignalled();
  if (!ctx.signal.aborted) ctx.signal.addEventListener("abort", abortIfSignalled, { once: true });
  let artifact;
  return {
    runtime: "router",
    /**
     * Run the tool loop for one task.
     * @param task - Task converted to the initial messages by `taskToMessages`.
     * @param signal - Per-call signal, linked with the executor's own controller.
     * @returns The artifact `{ outRef, out, spent }`; `out.content` is the last
     *   non-empty assistant text seen.
     */
    async execute(task, signal) {
      const started = Date.now();
      const linked = linkSignals(signal, controller.signal);
      const messages = [
        ...taskToMessages(task, spec)
      ];
      const tokens = zeroTokenUsage();
      let turns = 0;
      let lastText = "";
      for (let t = 0; t < maxTurns; t += 1) {
        turns += 1;
        const res = await fetch(`${seam.routerBaseUrl.replace(/\/$/, "")}/chat/completions`, {
          method: "POST",
          headers: {
            "content-type": "application/json",
            authorization: `Bearer ${seam.routerKey}`
          },
          body: JSON.stringify({
            model,
            messages,
            tools: seam.tools,
            tool_choice: "auto",
            temperature: 0.2
          }),
          ...linked ? { signal: linked } : {}
        });
        if (!res.ok) {
          throw new ValidationError(
            `routerToolsInlineExecutor: router ${res.status}: ${(await res.text()).slice(0, 200)}`
          );
        }
        const data = await res.json();
        const u = data.usage;
        // Usage is only counted when the router reports both counters as numbers.
        if (u && typeof u.prompt_tokens === "number" && typeof u.completion_tokens === "number") {
          tokens.input += u.prompt_tokens;
          tokens.output += u.completion_tokens;
        }
        const msg = data.choices?.[0]?.message;
        if (msg?.content) lastText = msg.content;
        const toolCalls = msg?.tool_calls ?? [];
        // No tool requests means the model is finished.
        if (toolCalls.length === 0) break;
        messages.push({
          role: "assistant",
          content: msg?.content ?? "",
          tool_calls: toolCalls.map((tc, i) => ({
            id: tc.id ?? `call_${i}`,
            type: "function",
            function: { name: tc.function?.name ?? "", arguments: tc.function?.arguments ?? "{}" }
          }))
        });
        for (let i = 0; i < toolCalls.length; i += 1) {
          const tc = toolCalls[i];
          // Same fallback id as in the assistant message above, so replies pair up.
          const id = tc?.id ?? `call_${i}`;
          let args = {};
          try {
            args = JSON.parse(tc?.function?.arguments ?? "{}");
          } catch {
            // Report the malformed arguments back to the model instead of
            // invoking the tool with guessed input.
            messages.push({
              role: "tool",
              tool_call_id: id,
              content: "error: tool arguments were not valid JSON"
            });
            continue;
          }
          const result = await seam.executeToolCall(tc?.function?.name ?? "", args, task);
          messages.push({ role: "tool", tool_call_id: id, content: result });
        }
      }
      // Cost is only estimated for models the price table knows; otherwise 0.
      const usd = isModelPriced(model) ? estimateCost(tokens.input, tokens.output, model) : 0;
      const spent = { iterations: turns, tokens, usd, ms: Date.now() - started };
      const out = { content: lastText };
      artifact = { outRef: contentRef("router-tools", { model, content: lastText }), out, spent };
      return artifact;
    },
    /**
     * Cancel in-flight work and release the listener on the context signal.
     * @returns A promise resolving to `{ destroyed: true }`.
     */
    teardown(_grace) {
      // Detach from the context signal so a torn-down executor does not stay
      // registered on it until (and unless) that signal aborts.
      ctx.signal.removeEventListener("abort", abortIfSignalled);
      controller.abort();
      return Promise.resolve({ destroyed: true });
    },
    /**
     * @returns A shallow copy of the artifact produced by the last `execute()`.
     * @throws {ValidationError} When `execute()` has not produced an artifact yet.
     */
    resultArtifact() {
      if (!artifact) {
        throw new ValidationError(
          "routerToolsInlineExecutor: resultArtifact() read before execute()"
        );
      }
      return { ...artifact, spent: artifact.spent };
    }
  };
};
|
|
2878
3308
|
var sandboxExecutor = (spec, ctx) => {
|
|
2879
3309
|
if (spec.harness === null) {
|
|
2880
3310
|
throw new ValidationError("sandboxExecutor: harness is null (router/inline) \u2014 wrong executor");
|
|
@@ -3168,6 +3598,8 @@ function createExecutor(config) {
|
|
|
3168
3598
|
switch (config.backend) {
|
|
3169
3599
|
case "router":
|
|
3170
3600
|
return routerInlineExecutor(spec, seamed);
|
|
3601
|
+
case "router-tools":
|
|
3602
|
+
return routerToolsInlineExecutor(spec, seamed);
|
|
3171
3603
|
case "bridge":
|
|
3172
3604
|
return bridgeExecutor(spec, seamed);
|
|
3173
3605
|
case "cli":
|
|
@@ -3401,7 +3833,7 @@ function createSupervisor() {
|
|
|
3401
3833
|
let attached;
|
|
3402
3834
|
async function run(root, task, opts) {
|
|
3403
3835
|
const now = opts.now ?? Date.now;
|
|
3404
|
-
const
|
|
3836
|
+
const pool2 = createBudgetPool(opts.budget, now);
|
|
3405
3837
|
await opts.journal.beginTree(opts.runId, new Date(now()).toISOString());
|
|
3406
3838
|
await opts.journal.appendEvent(opts.runId, {
|
|
3407
3839
|
kind: "spawned",
|
|
@@ -3427,7 +3859,7 @@ function createSupervisor() {
|
|
|
3427
3859
|
const scope = createScope({
|
|
3428
3860
|
parentId: opts.runId,
|
|
3429
3861
|
root: opts.runId,
|
|
3430
|
-
pool,
|
|
3862
|
+
pool: pool2,
|
|
3431
3863
|
journal,
|
|
3432
3864
|
blobs: opts.blobs,
|
|
3433
3865
|
executors: opts.executors,
|
|
@@ -3455,7 +3887,7 @@ function createSupervisor() {
|
|
|
3455
3887
|
}
|
|
3456
3888
|
const tree = scope.view;
|
|
3457
3889
|
if (actOutcome.ok) {
|
|
3458
|
-
|
|
3890
|
+
pool2.assertNoOpenTickets();
|
|
3459
3891
|
const out = actOutcome.out;
|
|
3460
3892
|
const outRef = contentAddress(out);
|
|
3461
3893
|
await opts.blobs.put(outRef, out);
|
|
@@ -3469,7 +3901,7 @@ function createSupervisor() {
|
|
|
3469
3901
|
}
|
|
3470
3902
|
return {
|
|
3471
3903
|
kind: "no-winner",
|
|
3472
|
-
reason: classifyNoWinner(controller,
|
|
3904
|
+
reason: classifyNoWinner(controller, pool2, opts, breaker),
|
|
3473
3905
|
tree,
|
|
3474
3906
|
downCount: breaker.downCount()
|
|
3475
3907
|
};
|
|
@@ -3574,14 +4006,14 @@ async function drainCursor(scope) {
|
|
|
3574
4006
|
if (settled === null) return;
|
|
3575
4007
|
}
|
|
3576
4008
|
}
|
|
3577
|
-
function classifyNoWinner(controller,
|
|
4009
|
+
/**
 * Name the reason a supervised run ended without a winner.
 *
 * Checks run in a fixed order: a tripped breaker first, then an aborted
 * controller, then an exhausted budget pool; anything else is reported as
 * all children being down.
 *
 * @param controller - Object whose `signal.aborted` flags an abort.
 * @param pool2 - Budget pool, forwarded to `poolExhausted`.
 * @param opts - Run options, forwarded to `poolExhausted`.
 * @param breaker - Object whose `tripped()` reports the breaker state.
 * @returns "all-children-down", "aborted" or "budget-exhausted".
 */
function classifyNoWinner(controller, pool2, opts, breaker) {
  const childrenDown = "all-children-down";
  if (breaker.tripped()) {
    return childrenDown;
  }
  if (controller.signal.aborted) {
    return "aborted";
  }
  return poolExhausted(pool2, opts) ? "budget-exhausted" : childrenDown;
}
|
|
3583
|
-
function poolExhausted(
|
|
3584
|
-
const r =
|
|
4015
|
+
function poolExhausted(pool2, opts) {
|
|
4016
|
+
const r = pool2.readout();
|
|
3585
4017
|
if (r.tokensLeft <= 0) return true;
|
|
3586
4018
|
if (opts.budget.maxUsd !== void 0 && r.usdLeft <= 0) return true;
|
|
3587
4019
|
if (opts.budget.deadlineMs !== void 0 && r.deadlineMs > 0 && (opts.now ?? Date.now)() >= r.deadlineMs) {
|
|
@@ -3717,13 +4149,13 @@ function shapeName(shape, _resolved) {
|
|
|
3717
4149
|
}
|
|
3718
4150
|
/**
 * Resolve the per-child budget and fanout for a shape.
 *
 * An explicit `over.perChild` is used as-is. Otherwise the root budget is
 * split across `fanout` children: iterations and tokens are floored and
 * clamped to at least 1, `maxUsd` is divided evenly, and `deadlineMs` is
 * passed through unchanged. The two optional fields are only present when the
 * root budget defines them.
 *
 * @param root - Root budget (`maxIterations`, `maxTokens`, optional `maxUsd`, `deadlineMs`).
 * @param over - Optional overrides (`fanout`, `perChild`).
 * @returns `{ perChild, fanout }`.
 */
function resolveShapeBudget(root, over) {
  const width = over?.fanout ?? defaultFanout;
  let slice = over?.perChild;
  if (slice === null || slice === void 0) {
    const share = (total) => Math.max(1, Math.floor(total / width));
    slice = {
      maxIterations: share(root.maxIterations),
      maxTokens: share(root.maxTokens)
    };
    if (root.maxUsd !== void 0) {
      slice.maxUsd = root.maxUsd / width;
    }
    if (root.deadlineMs !== void 0) {
      slice.deadlineMs = root.deadlineMs;
    }
  }
  return { perChild: slice, fanout: width };
}
|
|
3728
4160
|
var defaultFanout = 3;
|
|
3729
4161
|
function personaRegistry(persona) {
|
|
@@ -3914,13 +4346,13 @@ function spreadOf(values) {
|
|
|
3914
4346
|
/**
 * Express the spread of `values` as a fraction of their median.
 *
 * @param values - Numeric samples, forwarded to `spreadOf` and `medianOf`.
 * @returns 0 when the spread is exactly 0, otherwise `spread / median`.
 * @throws {Error} When the spread is non-zero but the median is 0, because the
 *   ratio is then undefined.
 */
function fractionalSpread(values) {
  const range = spreadOf(values);
  if (range !== 0) {
    const mid = medianOf(values);
    if (mid !== 0) {
      return range / mid;
    }
    throw new Error(
      "equalKOnCost: arms have a non-zero cost spread on a zero-median channel; cannot express it as a fraction"
    );
  }
  return 0;
}
|
|
3925
4357
|
function medianOf(values) {
|
|
3926
4358
|
if (values.length === 0) {
|
|
@@ -3961,75 +4393,951 @@ function requireSpend(rolled, id, root) {
|
|
|
3961
4393
|
return spend;
|
|
3962
4394
|
}
|
|
3963
4395
|
|
|
3964
|
-
// src/runtime/
|
|
3965
|
-
|
|
3966
|
-
|
|
3967
|
-
const
|
|
3968
|
-
|
|
3969
|
-
|
|
3970
|
-
|
|
3971
|
-
|
|
3972
|
-
|
|
3973
|
-
|
|
3974
|
-
|
|
3975
|
-
|
|
3976
|
-
|
|
3977
|
-
|
|
3978
|
-
|
|
3979
|
-
|
|
3980
|
-
|
|
3981
|
-
|
|
3982
|
-
|
|
3983
|
-
|
|
3984
|
-
|
|
3985
|
-
|
|
3986
|
-
|
|
3987
|
-
|
|
3988
|
-
|
|
3989
|
-
|
|
3990
|
-
|
|
3991
|
-
|
|
4396
|
+
// src/runtime/promotion-gate.ts
|
|
4397
|
+
import { heldoutSignificance } from "@tangle-network/agent-eval/campaign";
|
|
4398
|
+
/**
 * Decide whether `opts.candidate` should be promoted over `opts.incumbent`
 * from the paired per-task cells of a benchmark report.
 *
 * Only tasks that carry a cell for BOTH strategies are compared. Two modes:
 *  - "superiority" (default): promote when the score comparison is significant
 *    against `deltaThreshold` (default 0).
 *  - any other mode (non-inferiority): promote when the score comparison is
 *    significant against `-scoreTolerance` (default 0.05) AND the cost
 *    comparison (candidate usd as `before`, incumbent usd as `after`) is
 *    significant.
 * A latency comparison over the `ms` cells is always reported but never gates.
 *
 * @param opts - `candidate`, `incumbent`, `report.perTask`, plus optional
 *   `mode`, `deltaThreshold`, `minPairedTasks` (default 6), `statistic`
 *   (default "mean"), `seed`, `resamples`, `scoreTolerance`.
 * @returns `{ promoted, reason, mode, n, lift, latency }`, with `costSavings`
 *   added outside superiority mode. The identical-champion short-circuit
 *   carries no `latency`.
 * @throws {Error} When no task has cells for both strategies.
 */
function promotionGate(opts) {
  const mode = opts.mode ?? "superiority";
  if (opts.candidate === opts.incumbent) {
    const noLift = { mean: 0, median: 0, low: 0, high: 0 };
    return { promoted: false, reason: "identical-champion", mode, n: 0, lift: noLift };
  }

  // Paired columns: index i of every array belongs to the same task.
  const scores = { inc: [], cand: [] };
  const usd = { inc: [], cand: [] };
  const ms = { inc: [], cand: [] };
  const cellIds = [];
  for (const row of opts.report.perTask) {
    const inc = row.cells?.[opts.incumbent];
    const cand = row.cells?.[opts.candidate];
    if (inc && cand) {
      scores.inc.push(inc.score);
      scores.cand.push(cand.score);
      usd.inc.push(inc.usd);
      usd.cand.push(cand.usd);
      ms.inc.push(inc.ms);
      ms.cand.push(cand.ms);
      cellIds.push(row.taskId);
    }
  }
  if (cellIds.length === 0) {
    throw new Error(
      `promotionGate: no holdout task carried cells for both "${opts.incumbent}" and "${opts.candidate}" \u2014 the report must come from a run that included both strategies`
    );
  }

  const minPaired = opts.minPairedTasks ?? 6;
  // Every comparison shares the statistic / seed / resamples settings and
  // differs only in the series, the threshold and the minimum pair count.
  const compare = (before, after, deltaThreshold, minProductiveRuns) => heldoutSignificance(
    { before, after, cellIds },
    {
      deltaThreshold,
      minProductiveRuns,
      statistic: opts.statistic ?? "mean",
      ...opts.seed !== void 0 ? { seed: opts.seed } : {},
      ...opts.resamples !== void 0 ? { resamples: opts.resamples } : {}
    }
  );
  const interval = ({ bootstrap }) => ({
    mean: bootstrap.mean,
    median: bootstrap.median,
    low: bootstrap.low,
    high: bootstrap.high
  });

  const sig = compare(scores.inc, scores.cand, opts.deltaThreshold ?? 0, minPaired);
  const lift = interval(sig);
  // Latency is informational, so it runs with the loosest settings.
  const latSig = compare(ms.inc, ms.cand, 0, 1);
  const latency = interval(latSig);

  if (mode === "superiority") {
    let reason = "no-margin";
    if (sig.fewRuns) {
      reason = "few-tasks";
    } else if (sig.significant) {
      reason = "significant";
    }
    return { promoted: reason === "significant", reason, mode, n: sig.n, lift, latency };
  }

  const tolerance = opts.scoreTolerance ?? 0.05;
  const scoreSig = compare(scores.inc, scores.cand, -tolerance, minPaired);
  // Series are swapped relative to the score comparison: candidate usd is
  // `before`, incumbent usd is `after`.
  const costSig = compare(usd.cand, usd.inc, 0, minPaired);
  const costSavings = interval(costSig);

  let reason;
  if (scoreSig.fewRuns) {
    reason = "few-tasks";
  } else if (!scoreSig.significant) {
    reason = "non-inferiority-unproven";
  } else if (!costSig.significant) {
    reason = "not-cheaper";
  } else {
    reason = "non-inferior-and-cheaper";
  }
  return {
    promoted: reason === "non-inferior-and-cheaper",
    reason,
    mode,
    n: scoreSig.n,
    lift,
    costSavings,
    latency
  };
}
|
|
4528
|
+
|
|
4529
|
+
// src/runtime/run-benchmark.ts
|
|
4530
|
+
import { pairedBootstrap, paretoFrontier } from "@tangle-network/agent-eval";
|
|
4531
|
+
|
|
4532
|
+
// src/runtime/strategy.ts
|
|
4533
|
+
import { createChatClient, estimateCost as estimateCost2, isModelPriced as isModelPriced2 } from "@tangle-network/agent-eval";
|
|
4534
|
+
// Standing instruction that shotExecutor appends (after a blank line) to the task's user prompt when it builds a fresh conversation.
var taskNudge = "Use the available tools to bring the artifact to the required final state. Address EVERY distinct change the request implies. After each tool result, check what remains and continue. Re-read the values you set to confirm they took. Reply DONE only once every required change is made and verified.";
|
|
4535
|
+
/**
 * Run one bounded tool-use "shot": up to `opts.innerTurns` (default 4) chat
 * completions against the router, executing each requested tool through
 * `surface.call` and appending the results to `messages`.
 *
 * `messages` is mutated in place (assistant and tool messages are pushed) and
 * the same array is returned, so a later shot can continue the conversation.
 *
 * @param surface - Provides `call(handle, toolName, args)`, which resolves to the tool output.
 * @param _task - Unused.
 * @param handle - Handle passed through to `surface.call`.
 * @param tools - Tool definitions sent with every completion request.
 * @param messages - Conversation so far; extended in place.
 * @param opts - `routerBaseUrl`, `routerKey`, `model`, optional `innerTurns`,
 *   `temperature` (default 0.7) and `maxTokens`.
 * @param modelOverride - Model to use instead of `opts.model` when provided.
 * @returns `{ messages, completions, toolCalls, toolErrors, tokens }`.
 * @throws {Error} When the router answers with a non-OK status.
 */
async function runShot(surface, _task, handle, tools, messages, opts, modelOverride) {
  const innerTurns = opts.innerTurns ?? 4;
  let completions = 0;
  let toolCalls = 0;
  let toolErrors = 0;
  const tokens = { input: 0, output: 0 };
  for (let t = 0; t < innerTurns; t += 1) {
    const res = await fetch(`${opts.routerBaseUrl.replace(/\/$/, "")}/chat/completions`, {
      method: "POST",
      headers: { "content-type": "application/json", authorization: `Bearer ${opts.routerKey}` },
      body: JSON.stringify({
        model: modelOverride ?? opts.model,
        messages,
        tools,
        tool_choice: "auto",
        temperature: opts.temperature ?? 0.7,
        ...opts.maxTokens ? { max_tokens: opts.maxTokens } : {}
      })
    });
    if (!res.ok) throw new Error(`router ${res.status}: ${(await res.text()).slice(0, 200)}`);
    completions += 1;
    const data = await res.json();
    if (typeof data.usage?.prompt_tokens === "number") tokens.input += data.usage.prompt_tokens;
    if (typeof data.usage?.completion_tokens === "number") {
      tokens.output += data.usage.completion_tokens;
    }
    const msg = data.choices?.[0]?.message;
    if (!msg) break;
    const calls = msg.tool_calls ?? [];
    messages.push({
      role: "assistant",
      content: msg.content ?? "",
      ...calls.length ? { tool_calls: calls } : {}
    });
    // No tool requests means the model considers this shot finished.
    if (calls.length === 0) break;
    for (const call of calls) {
      toolCalls += 1;
      let args;
      try {
        args = JSON.parse(call.function.arguments || "{}");
      } catch {
        // Malformed arguments: tell the model and do NOT invoke the tool.
        // Calling it with an empty object would execute something the model
        // never asked for.
        toolErrors += 1;
        messages.push({
          role: "tool",
          tool_call_id: call.id,
          content: "ERROR: tool arguments were not valid JSON"
        });
        continue;
      }
      let out;
      try {
        out = await surface.call(handle, call.function.name, args);
        // An "ERROR:" prefix in the tool output also counts as a tool error.
        if (out.startsWith("ERROR:")) toolErrors += 1;
      } catch (e) {
        toolErrors += 1;
        out = `ERROR: ${e instanceof Error ? e.message : String(e)}`;
      }
      messages.push({ role: "tool", tool_call_id: call.id, content: out });
    }
  }
  return { messages, completions, toolCalls, toolErrors, tokens };
}
|
|
4590
|
+
/**
 * Flatten a conversation into a compact, line-per-message transcript.
 *
 * Only assistant and tool messages are kept:
 *  - tool      -> `RESULT <first 280 chars of content>`
 *  - assistant -> `CALL name(args), ...` when it carries tool calls,
 *                 otherwise `SAY <first 200 chars of content>`
 * The joined transcript is capped at 7000 characters.
 *
 * @param messages - Chat messages (`role`, `content`, optional `tool_calls`).
 * @returns The newline-joined transcript.
 */
function compactTrajectory(messages) {
  const lines = [];
  for (const entry of messages) {
    if (entry.role === "tool") {
      lines.push(`RESULT ${String(entry.content).slice(0, 280)}`);
    } else if (entry.role === "assistant") {
      const rendered = entry.tool_calls?.map(({ function: fn }) => `${fn.name}(${fn.arguments})`).join(", ");
      // An empty tool_calls array renders as "" and falls back to SAY.
      lines.push(rendered ? `CALL ${rendered}` : `SAY ${String(entry.content).slice(0, 200)}`);
    }
  }
  return lines.join("\n").slice(0, 7e3);
}
|
|
4597
|
+
/**
 * Ask the analyst model for free-form steering, using a caller-supplied
 * system instruction and the compacted trajectory of the conversation.
 *
 * @param task - Task whose `userPrompt` (first 1500 chars) is shown to the analyst.
 * @param messages - Conversation to compact with `compactTrajectory`.
 * @param instruction - System prompt for the analyst.
 * @param opts - `routerKey`, `routerBaseUrl`, `model`, optional `analystModel`.
 * @returns `{ steer, tokens }`: the trimmed reply and the token usage, read
 *   from either camelCase or snake_case usage fields (0 when absent).
 */
async function consultAnalyst(task, messages, instruction, opts) {
  const transcript = compactTrajectory(messages);
  const analystModel = opts.analystModel ?? opts.model;
  const client = createChatClient({
    transport: "router",
    apiKey: opts.routerKey,
    baseUrl: opts.routerBaseUrl,
    defaultModel: analystModel
  });
  const question = `TASK: ${task.userPrompt.slice(0, 1500)}\n\nTRAJECTORY:\n${transcript}`;
  const reply = await client.chat({
    model: analystModel,
    temperature: 0.2,
    maxTokens: 1024,
    messages: [
      { role: "system", content: instruction },
      { role: "user", content: question }
    ]
  });
  const { usage } = reply;
  const input = usage?.promptTokens ?? usage?.prompt_tokens ?? 0;
  const output = usage?.completionTokens ?? usage?.completion_tokens ?? 0;
  return { steer: reply.content.trim(), tokens: { input, output } };
}
|
|
4630
|
+
/**
 * Run the structured analyst (`observe`) over a shot's trajectory and turn its
 * findings into a steering message.
 *
 * The chat client handed to `observe` is wrapped so the usage of every call it
 * makes is summed into the returned token totals.
 *
 * @param task - Task providing `userPrompt` and `id` (used as `runId`).
 * @param messages - Conversation; passed as the trace and compacted as the output.
 * @param opts - `routerKey`, `routerBaseUrl`, `model`, optional `analystModel`,
 *   `analystInstruction`, `corpus` and `corpusTags`.
 * @returns `{ steer, tokens }`; `steer` joins the non-blank
 *   `recommended_action` strings with newlines and is "COMPLETE" when there
 *   are none.
 */
async function analyze(task, messages, opts) {
  const transcript = compactTrajectory(messages);
  const analystModel = opts.analystModel ?? opts.model;
  const inner = createChatClient({
    transport: "router",
    apiKey: opts.routerKey,
    baseUrl: opts.routerBaseUrl,
    defaultModel: analystModel
  });
  const tokens = { input: 0, output: 0 };
  // Same client, with chat() intercepted to meter token usage.
  const meteredChat = async (req, callOpts) => {
    const reply = await inner.chat(req, callOpts);
    const usage = reply.usage;
    if (usage) {
      tokens.input += usage.promptTokens ?? usage.prompt_tokens ?? 0;
      tokens.output += usage.completionTokens ?? usage.completion_tokens ?? 0;
    }
    return reply;
  };
  const chat = { ...inner, chat: meteredChat };

  const subject = {
    task: task.userPrompt,
    output: transcript,
    trace: messages,
    outcome: "failed",
    runId: task.id
  };
  const settings = { chat, model: analystModel };
  if (opts.analystInstruction) {
    settings.analystInstruction = opts.analystInstruction;
  }
  if (opts.corpus) {
    settings.corpus = opts.corpus;
    settings.tags = opts.corpusTags ?? [];
  }
  const obs = await observe(subject, settings);

  const actions = [];
  for (const finding of obs.findings) {
    const action = finding.recommended_action;
    if (typeof action === "string" && action.trim().length > 0) {
      actions.push(action);
    }
  }
  const steer = actions.join("\n").trim();
  return { steer: steer || "COMPLETE", tokens };
}
|
|
4670
|
+
/**
 * Executor for an "agentic-shot" leaf: runs one bounded tool-use shot against
 * a surface handle, scores the resulting state, and records the spend.
 *
 * The task may carry `handle` (reuse an open handle), `messages` (continue a
 * conversation; extended in place), `steer` (extra user message), `persona`
 * (`systemPrompt` / `model` overrides) and `tools` (allow-list of tool names).
 * A handle is opened, and closed again afterwards, only when the task did not
 * supply one.
 *
 * @param surface - Provides `open`, `close`, `tools`, `score` (and `call`, used by `runShot`).
 * @param opts - Shot options forwarded to `runShot`; `opts.model` is the default model.
 * @returns An executor exposing `runtime`, `execute`, `teardown` and `resultArtifact`.
 */
function shotExecutor(surface, opts) {
  let artifact;
  return {
    runtime: "agentic-shot",
    /**
     * @param task - Shot task as described above.
     * @returns The artifact `{ outRef, out, verdict, spent }`.
     * @throws {Error} When `task.tools` names a tool the surface does not offer.
     */
    async execute(task) {
      const t = task;
      const own = !t.handle;
      const handle = t.handle ?? await surface.open(t.task);
      try {
        const allTools = await surface.tools(t.task, handle);
        let tools = allTools;
        if (t.tools) {
          // Reject unknown names loudly rather than silently narrowing to nothing.
          const known = new Set(allTools.map((tool) => tool.function.name));
          const unknown = t.tools.filter((name) => !known.has(name));
          if (unknown.length > 0) {
            throw new Error(
              `shot tools: unknown tool name(s) ${unknown.join(", ")} \u2014 domain offers: ${[...known].join(", ")}`
            );
          }
          const want = new Set(t.tools);
          tools = allTools.filter((tool) => want.has(tool.function.name));
        }
        const messages = t.messages?.length ? t.messages : [
          { role: "system", content: t.persona?.systemPrompt ?? t.task.systemPrompt },
          { role: "user", content: `${t.task.userPrompt}\n\n${taskNudge}` }
        ];
        // A continued conversation already has its system message, so a new
        // persona is announced as a hand-off message instead.
        if (t.messages?.length && t.persona?.systemPrompt) {
          messages.push({
            role: "user",
            content: `[hand-off] You are now acting as: ${t.persona.systemPrompt}`
          });
        }
        if (t.steer) messages.push({ role: "user", content: t.steer });
        const shot = await runShot(surface, t.task, handle, tools, messages, opts, t.persona?.model);
        const s = await surface.score(t.task, handle);
        const score = s.total > 0 ? s.passes / s.total : 0;
        const out = {
          messages: shot.messages,
          score,
          passes: s.passes,
          total: s.total,
          completions: shot.completions,
          toolErrors: shot.toolErrors
        };
        // Price the model the shot actually ran on: runShot uses the persona
        // override when one is set, so the cost estimate must follow it.
        const billedModel = t.persona?.model ?? opts.model;
        artifact = {
          outRef: `shot:${handle.id}:${shot.completions}:${s.passes}/${s.total}`,
          out,
          verdict: { valid: s.total > 0 && s.passes === s.total, score },
          // Real usage to the conserved pool: tokens from the router responses; usd only
          // when the model is in the price table (never a fabricated number).
          spent: {
            iterations: shot.completions,
            tokens: shot.tokens,
            usd: isModelPriced2(billedModel) ? estimateCost2(shot.tokens.input, shot.tokens.output, billedModel) : 0,
            ms: 0
          }
        };
        return artifact;
      } finally {
        // Only close handles this executor opened itself.
        if (own) await surface.close(handle);
      }
    },
    teardown: () => Promise.resolve({ destroyed: true }),
    /**
     * @returns The artifact produced by the last `execute()`.
     * @throws {Error} When `execute()` has not produced an artifact yet.
     */
    resultArtifact() {
      if (!artifact) throw new Error("shotExecutor: resultArtifact before execute");
      return artifact;
    }
  };
}
|
|
4741
|
+
/**
 * Executor for an "agentic-analyst" leaf: produces a steering string for a
 * conversation and records the analyst's spend.
 *
 * With `task.rawInstruction` set, the analyst is consulted free-form through
 * `consultAnalyst`; otherwise `analyze` is used.
 *
 * @param opts - Analyst options; pricing uses `opts.analystModel ?? opts.model`.
 * @returns An executor exposing `runtime`, `execute`, `teardown` and `resultArtifact`.
 */
function analystExecutor(opts) {
  let artifact;

  async function execute(task) {
    const { task: job, messages, rawInstruction } = task;
    const review = rawInstruction
      ? await consultAnalyst(job, messages, rawInstruction, opts)
      : await analyze(job, messages, opts);
    const { steer, tokens } = review;
    const analystModel = opts.analystModel ?? opts.model;
    // usd stays 0 for models missing from the price table.
    const usd = isModelPriced2(analystModel)
      ? estimateCost2(tokens.input, tokens.output, analystModel)
      : 0;
    artifact = {
      outRef: `analyst:${steer.length}`,
      out: steer,
      spent: { iterations: 1, tokens, usd, ms: 0 }
    };
    return artifact;
  }

  function resultArtifact() {
    if (artifact) return artifact;
    throw new Error("analystExecutor: resultArtifact before execute");
  }

  return {
    runtime: "agentic-analyst",
    execute,
    teardown: () => Promise.resolve({ destroyed: true }),
    resultArtifact
  };
}
|
|
4768
|
+
/**
 * Read-only executor registry for agentic strategies.
 *
 * `resolve` always succeeds: a spec whose `profile.metadata.role` is
 * "analyst" gets a factory for an analyst executor, every other spec gets a
 * factory for a shot executor. `register` is not supported.
 *
 * @param surface - Surface handed to shot executors.
 * @param opts - Options handed to both executor kinds.
 * @returns `{ register, resolve }`.
 */
function agenticRegistry(surface, opts) {
  function register() {
    throw new Error("agenticRegistry: register unsupported");
  }

  function resolve(spec) {
    // The role is read once, at resolve time; the factory only closes over it.
    const wantsAnalyst = spec.profile.metadata?.role === "analyst";
    const factory = (_s, _ctx) => {
      if (wantsAnalyst) {
        return analystExecutor(opts);
      }
      return shotExecutor(surface, opts);
    };
    return { succeeded: true, value: factory };
  }

  return { register, resolve };
}
|
|
4780
|
+
/**
 * Build a spawn-only agent: it carries an executor spec tagged with `role`
 * (harness `null`) and an `act()` that always throws, so it cannot be run as a
 * driver.
 *
 * @param name - Agent name, also used as the profile name.
 * @param role - Stored as `profile.metadata.role` in the executor spec.
 * @returns `{ name, executorSpec, act }`.
 */
function leaf(name, role) {
  const profile = { name, metadata: { role } };
  const executorSpec = { profile, harness: null };
  return {
    name,
    executorSpec,
    act() {
      throw new Error(`agentic: spawned leaf "${name}" run as a driver`);
    }
  };
}
|
|
4790
|
+
/**
 * Await the next settlement from `scope`.
 *
 * @param scope - Object whose `next()` resolves to the next settlement.
 * @returns The settlement.
 * @throws {Error} When `next()` resolves to a falsy value, i.e. nothing settled.
 */
async function drainOne2(scope) {
  const settlement = await scope.next();
  if (settlement) {
    return settlement;
  }
  throw new Error("agentic: spawned child never settled");
}
|
|
4795
|
+
/**
 * Budget for one spawned child: one iteration more than its inner turn count,
 * and a fixed cap of one million tokens.
 *
 * @param innerTurns - Number of inner turns the child may take.
 * @returns `{ maxIterations, maxTokens }`.
 */
var perChild = (innerTurns) => {
  const budget = {
    maxIterations: innerTurns + 1,
    maxTokens: 1e6
  };
  return budget;
};
|
|
4799
|
+
/**
 * Driver for the depth ("refine") strategy: repeated shots on ONE surface
 * handle, with an analyst consulted between shots to steer the next one.
 *
 * The loop stops when a shot scores >= 1, when `cfg.maxShots` shots have run,
 * when the pool refuses a spawn, when a child goes down, or when the analyst's
 * reply starts with "COMPLETE". The handle is always closed afterwards.
 *
 * @param surface - Provides `open`, `score` and `close`.
 * @param task - Task passed to the surface and to every spawned child.
 * @param opts - Strategy options; `opts.innerTurns` (default 4) sizes the shot budget.
 * @param cfg - `{ maxShots }`.
 * @returns An agent `{ name: "depth", act }`. `act` resolves to
 *   `{ kind: "done", deliverable }`, where `deliverable.shots` is the number
 *   of shots actually spawned.
 */
function depthDriver(surface, task, opts, cfg) {
  const innerTurns = opts.innerTurns ?? 4;
  let pendingSteer;
  return {
    name: "depth",
    async act(_t, scope) {
      const handle = await surface.open(task);
      const progression = [];
      let messages;
      let completions = 0;
      // Counts shots the pool admitted. A refused spawn, or a run with
      // maxShots <= 0, must not be reported as a shot taken.
      let spawned = 0;
      try {
        for (let shot = 0; shot < cfg.maxShots; shot += 1) {
          const child = leaf(`shot:${shot}`, "shot");
          const steer = shot === 0 ? void 0 : pendingSteer;
          const res = scope.spawn(child, { task, handle, messages, steer }, {
            budget: perChild(innerTurns),
            label: `shot:${shot}`
          });
          if (!res.ok) break;
          spawned += 1;
          const settled = await drainOne2(scope);
          if (settled.kind === "down") break;
          const out = settled.out;
          // Carry the conversation forward so the next shot continues it.
          messages = out.messages;
          completions += out.completions;
          progression.push(out.score);
          if (out.score >= 1 || shot === cfg.maxShots - 1) break;
          const aChild = leaf(`analyst:${shot}`, "analyst");
          const aRes = scope.spawn(
            aChild,
            { task, messages },
            { budget: perChild(1), label: `analyst:${shot}` }
          );
          if (!aRes.ok) break;
          const aSettled = await drainOne2(scope);
          // The analyst consultation counts as one completion even if it went down.
          completions += 1;
          if (aSettled.kind === "down") break;
          const findings = aSettled.out;
          if (/^\s*COMPLETE\b/i.test(findings)) break;
          pendingSteer = `A reviewer flagged unfinished items:\n${findings}\n\nAddress each with the tools, verify they took, then continue.`;
        }
        // The deliverable's score comes from a fresh read of the handle, not
        // from the last shot's own report.
        const final = await surface.score(task, handle);
        const score = final.total > 0 ? final.passes / final.total : 0;
        return {
          kind: "done",
          deliverable: {
            mode: "depth",
            score,
            resolved: final.total > 0 && final.passes === final.total,
            completions,
            progression,
            shots: spawned
          }
        };
      } finally {
        await surface.close(handle);
      }
    }
  };
}
|
|
4862
|
+
/**
 * Driver for the breadth ("sample") strategy: spawn `cfg.width` independent
 * rollouts of the same task and keep the best score among those that settle.
 *
 * @param _surface - Unused.
 * @param task - Task handed to every rollout.
 * @param opts - Strategy options; `opts.innerTurns` (default 4) sizes each rollout budget.
 * @param cfg - `{ width }`.
 * @returns An agent `{ name: "breadth", act }`. `act` resolves to
 *   `{ kind: "blocked", blockers }` when no rollout was admitted or every
 *   rollout went down, otherwise to `{ kind: "done", deliverable }`.
 */
function breadthDriver(_surface, task, opts, cfg) {
  const innerTurns = opts.innerTurns ?? 4;
  return {
    name: "breadth",
    async act(_t, scope) {
      let admitted = 0;
      let slot = 0;
      while (slot < cfg.width) {
        const spawned = scope.spawn(leaf(`rollout:${slot}`, "shot"), { task }, {
          budget: perChild(innerTurns),
          label: `rollout:${slot}`
        });
        if (spawned.ok) {
          admitted += 1;
        }
        slot += 1;
      }
      if (admitted === 0) {
        return { kind: "blocked", blockers: ["breadth: pool admitted no rollout"] };
      }

      // -1 is the "nothing settled successfully" sentinel.
      let topScore = -1;
      let anyResolved = false;
      let completions = 0;
      const progression = [];
      for (;;) {
        const settled = await scope.next();
        if (settled === null) break;
        if (settled.kind === "down") continue;
        const result = settled.out;
        completions += result.completions;
        if (result.score > topScore) {
          topScore = result.score;
        }
        if (result.total > 0 && result.passes === result.total) {
          anyResolved = true;
        }
        // Running best, one entry per successful rollout.
        progression.push(topScore);
      }
      if (topScore < 0) {
        return { kind: "blocked", blockers: ["breadth: every rollout went down"] };
      }
      return {
        kind: "done",
        deliverable: {
          mode: "breadth",
          score: topScore,
          resolved: anyResolved,
          completions,
          progression,
          shots: admitted
        }
      };
    }
  };
}
|
|
4903
|
+
/**
 * Built-in "sample" strategy: the numeric budget becomes the width of a
 * breadth driver.
 */
var sample = {
  name: "sample",
  driver: (surface, task, opts, budget) => {
    const cfg = { width: budget };
    return breadthDriver(surface, task, opts, cfg);
  }
};
|
|
4907
|
+
/**
 * Built-in "refine" strategy: the numeric budget becomes the maximum number of
 * shots of a depth driver.
 */
var refine = {
  name: "refine",
  driver: (surface, task, opts, budget) => {
    const cfg = { maxShots: budget };
    return depthDriver(surface, task, opts, cfg);
  }
};
|
|
4911
|
+
/**
 * Builds a Strategy (`{ name, driver }`) from a plain async body.
 *
 * `driver(surface, task, opts, budget)` returns an agent whose `act(_t, scope)` calls
 * `run(ctx)` with:
 *  - `surface`  — only `name`, `open` and `close` of the real surface. Opened handles are
 *                 tracked by `id`; `close` ignores a handle it is not tracking, so closing
 *                 twice is a no-op.
 *  - `task`, `opts`, `budget`, `scope` — passed through unchanged.
 *  - `shot(spec?)` — spawns one "shot" child (budget `perChild(opts.innerTurns ?? 4)`) and
 *                 awaits one settlement. Resolves to the child's output, or null when the
 *                 spawn is refused or the child goes down.
 *  - `listTools(handle)` — `{ name, description? }` for each tool of `surface.tools(task, handle)`.
 *  - `critique(messages)` — spawns one "analyst" child (budget `perChild(1)`); null when the
 *                 spawn is refused, the child goes down, or the reply starts with "COMPLETE".
 *  - `consult(messages, instruction)` — like critique, but forwards `instruction` as
 *                 `rawInstruction` and returns the reply unchanged.
 *
 * The deliverable's `score` / `resolved` are taken from shot outputs observed here, never from
 * what `run` returns; `progression`, `completions` and `shots` fall back to `[]` / `0` / `0`
 * when `run` does not supply them with the right type.
 *
 * Differences from the previous version:
 *  - a body that returns nothing (undefined / null / a non-object) no longer crashes while
 *    the deliverable is assembled; the verified score is still reported;
 *  - handles the body opened through `ctx.surface.open` and never closed are closed once the
 *    body finishes (also when it throws), so a forgetful body cannot leak artifacts.
 *
 * @param {string} name  Strategy name; also reported as the deliverable's `mode`.
 * @param {(ctx: object) => Promise<object|void>} run  Strategy body.
 * @returns {{ name: string, driver: Function }}
 */
function defineStrategy(name, run) {
  return {
    name,
    driver: (surface, task, opts, budget) => ({
      name,
      async act(_t, scope) {
        let seq = 0;
        const innerTurns = opts.innerTurns ?? 4;
        let verifiedBest = 0;
        let verifiedResolved = false;
        // id -> handle, so anything still open after the body can be closed.
        const openHandles = /* @__PURE__ */ new Map();

        // Spawn one child, await one settlement; null means "refused or went down".
        const runChild = async (prefix, role, input, turns) => {
          const child = leaf(`${prefix}:${seq}`, role);
          seq += 1;
          const res = scope.spawn(child, input, { budget: perChild(turns), label: child.name });
          if (!res.ok) return null;
          const settled = await drainOne2(scope);
          return settled.kind === "down" ? null : settled;
        };

        const ctx = {
          // Narrowed to open/close — the body gets no raw call()/score() access.
          surface: {
            name: surface.name,
            open: async (t) => {
              const h = await surface.open(t);
              openHandles.set(h.id, h);
              return h;
            },
            close: async (h) => {
              if (!h || !openHandles.has(h.id)) return;
              openHandles.delete(h.id);
              await surface.close(h);
            }
          },
          task,
          opts,
          budget,
          scope,
          async shot(spec) {
            const settled = await runChild(
              "shot",
              "shot",
              {
                task,
                handle: spec?.handle,
                messages: spec?.messages,
                steer: spec?.steer,
                persona: spec?.persona,
                tools: spec?.tools
              },
              innerTurns
            );
            if (!settled) return null;
            const out = settled.out;
            // Keep-best bookkeeping the body cannot tamper with.
            if (out.score > verifiedBest) verifiedBest = out.score;
            if (out.total > 0 && out.passes === out.total) verifiedResolved = true;
            return out;
          },
          async listTools(handle) {
            const tools = await surface.tools(task, handle);
            return tools.map((t) => ({
              name: t.function.name,
              ...t.function.description ? { description: t.function.description } : {}
            }));
          },
          async critique(messages) {
            const settled = await runChild("analyst", "analyst", { task, messages }, 1);
            if (!settled) return null;
            const findings = settled.out;
            return /^\s*COMPLETE\b/i.test(findings) ? null : findings;
          },
          async consult(messages, instruction) {
            const settled = await runChild(
              "analyst",
              "analyst",
              { task, messages, rawInstruction: instruction },
              1
            );
            return settled ? settled.out : null;
          }
        };

        let r;
        try {
          r = await run(ctx);
        } finally {
          // Close whatever the body left open; ctx.surface.close untracks each handle.
          for (const h of [...openHandles.values()]) await ctx.surface.close(h);
        }
        // A body may return nothing; treat that as "no self-reported fields".
        const reported = r !== null && typeof r === "object" ? r : {};
        return {
          kind: "done",
          deliverable: {
            mode: name,
            ...reported,
            progression: Array.isArray(reported.progression) ? reported.progression : [],
            completions: typeof reported.completions === "number" ? reported.completions : 0,
            shots: typeof reported.shots === "number" ? reported.shots : 0,
            score: verifiedBest,
            resolved: verifiedResolved
          }
        };
      }
    })
  };
}
|
|
5016
|
+
/**
 * "adaptiveRefine": keep refining one artifact while shots improve; restart on a fresh
 * artifact when a shot fails to beat the best score seen so far.
 *
 * Per iteration (at most `budget` of them):
 *  1. run a shot on the current handle; stop if it returns null;
 *  2. stop if the shot scored >= 1;
 *  3. if the score did not beat `best`, close the handle, open a new one, drop the carried
 *     messages/steer and try again;
 *  4. otherwise record the new best, ask the critic, and stop if it has nothing to flag;
 *     else carry its findings into the next shot as `steer`.
 *
 * Returns `{ score, resolved, completions, progression, shots }` where `score` is the
 * maximum of `progression` (0 when no shot completed).
 *
 * Fix: `shots` used to double as the loop counter, so leaving the loop through a `break`
 * after a completed shot (perfect score, or critic satisfied) under-reported it by one —
 * e.g. a first-shot success reported `shots: 0`. It now counts completed shots, matching
 * how sampleThenRefine counts them.
 */
var adaptiveRefine = defineStrategy(
  "adaptiveRefine",
  async ({ surface, task, budget, shot, critique }) => {
    let handle = await surface.open(task);
    const progression = [];
    let messages;
    let steer;
    let completions = 0;
    let best = -1;
    let shots = 0;
    try {
      for (let attempt = 0; attempt < budget; attempt += 1) {
        const out = await shot({ handle, messages, steer });
        if (!out) break;
        shots += 1;
        completions += out.completions;
        progression.push(out.score);
        if (out.score >= 1) break;
        if (out.score <= best) {
          // No improvement: abandon this artifact and start over from a clean one.
          await surface.close(handle);
          handle = await surface.open(task);
          messages = void 0;
          steer = void 0;
          continue;
        }
        best = out.score;
        messages = out.messages;
        const findings = await critique(out.messages);
        completions += 1;
        if (!findings) break;
        steer = `A reviewer flagged unfinished items:\n${findings}\n\nAddress each with the tools, verify they took, then continue.`;
      }
      const score = progression.length ? Math.max(...progression) : 0;
      return { score, resolved: score >= 1, completions, progression, shots };
    } finally {
      await surface.close(handle);
    }
  }
);
|
|
5057
|
+
/**
 * "sampleThenRefine": spend the first half of the budget (rounded up, at least one shot)
 * on independent fresh artifacts, keep the best-scoring one, then spend the remainder
 * refining it with critic feedback.
 *
 * Explore: each shot opens its own handle; a null shot is skipped; a score >= 1 ends the
 * phase early. Every handle except the leader's is closed before refining.
 * Refine: while budget remains and the top score is below 1, ask the critic (stop when it
 * has nothing to flag), then run a steered shot on the leader's handle (stop on null).
 * Any handle still open is closed on the way out.
 *
 * Returns `{ score, resolved, completions, progression, shots }`, `score` being the maximum
 * of `progression`.
 */
var sampleThenRefine = defineStrategy(
  "sampleThenRefine",
  async ({ surface, task, budget, shot, critique }) => {
    const exploreShots = Math.max(1, Math.ceil(budget / 2));
    const live = /* @__PURE__ */ new Set();
    const progression = [];
    let completions = 0;
    let shots = 0;
    try {
      // --- explore -------------------------------------------------------
      let leader;
      let explored = 0;
      while (explored < exploreShots) {
        explored += 1;
        const handle = await surface.open(task);
        live.add(handle);
        const out = await shot({ handle });
        if (out) {
          shots += 1;
          completions += out.completions;
          progression.push(out.score);
          if (!leader || out.score > leader.out.score) leader = { handle, out };
          if (out.score >= 1) break;
        }
      }
      if (!leader) {
        return { score: 0, resolved: false, completions, progression, shots };
      }

      // Only the leader's artifact survives into the refine phase.
      const losers = [...live].filter((h) => h !== leader.handle);
      for (const h of losers) {
        await surface.close(h);
        live.delete(h);
      }

      // --- refine --------------------------------------------------------
      let transcript = leader.out.messages;
      let topScore = leader.out.score;
      let step = exploreShots;
      while (step < budget && topScore < 1) {
        step += 1;
        const findings = await critique(transcript);
        completions += 1;
        if (!findings) break;
        const steer = `A reviewer flagged unfinished items:\n${findings}\n\nAddress each with the tools, verify they took, then continue.`;
        const out = await shot({ handle: leader.handle, messages: transcript, steer });
        if (!out) break;
        shots += 1;
        completions += out.completions;
        progression.push(out.score);
        transcript = out.messages;
        if (out.score > topScore) topScore = out.score;
      }

      const score = progression.length ? Math.max(...progression) : 0;
      return { score, resolved: score >= 1, completions, progression, shots };
    } finally {
      for (const h of live) await surface.close(h);
    }
  }
);
|
|
5113
|
+
/**
 * Runs one strategy against one task under a fresh supervisor and returns the strategy's
 * deliverable extended with the run's cost (`usd`, `tokens`) and wall time (`ms`).
 *
 * Strategy selection: `opts.strategy` when given; otherwise `sample` for
 * `opts.mode === "breadth"` and `refine` for anything else.
 * Root budget: `opts.rootBudget` when given; otherwise
 * `budget * ((innerTurns ?? 4) + 2)` iterations and 1e9 tokens.
 *
 * @param {object} opts  Needs `surface`, `task` (with `id`) and `budget`; optional
 *                       `strategy`, `mode`, `innerTurns`, `rootBudget`, `hooks`.
 * @returns {Promise<object>}
 * @throws {Error} when the supervisor produces no winner, or the winner is not "done".
 */
async function runAgentic(opts) {
  let strategy = opts.strategy;
  if (strategy === void 0 || strategy === null) {
    strategy = opts.mode === "breadth" ? sample : refine;
  }
  const driver = strategy.driver(opts.surface, opts.task, opts, opts.budget);
  const supervisor = createSupervisor();

  let root = opts.rootBudget;
  if (root === void 0 || root === null) {
    const turnsPerShot = (opts.innerTurns ?? 4) + 2;
    root = { maxIterations: opts.budget * turnsPerShot, maxTokens: 1e9 };
  }

  const started = Date.now();
  const runOptions = {
    budget: root,
    runId: `agentic:${strategy.name}:${opts.task.id}`,
    journal: new InMemorySpawnJournal(),
    blobs: new InMemoryResultBlobStore(),
    executors: agenticRegistry(opts.surface, opts),
    maxDepth: 3
  };
  if (opts.hooks) runOptions.hooks = opts.hooks;
  const result = await supervisor.run(driver, void 0, runOptions);

  const won = result.kind === "winner";
  if (won && result.out.kind === "done") {
    return {
      ...result.out.deliverable,
      usd: result.spentTotal.usd,
      tokens: result.spentTotal.tokens,
      ms: Date.now() - started
    };
  }
  const reason = won ? `blocked: ${result.out.blockers?.join("; ")}` : `no-winner: ${result.reason}`;
  throw new Error(`runAgentic(${strategy.name}) produced no result \u2014 ${reason}`);
}
|
|
5143
|
+
|
|
5144
|
+
// src/runtime/run-benchmark.ts
|
|
5145
|
+
/**
 * Maps `fn(item, index)` over `items` with at most `limit` calls in flight, returning the
 * results in input order.
 *
 * Workers pull the next unclaimed index from a shared cursor. At least one worker is always
 * started, and never more than there are items. A rejection from `fn` rejects the returned
 * promise.
 *
 * @param {Array} items
 * @param {number} limit  Maximum number of concurrent calls.
 * @param {(item: any, index: number) => Promise<any>} fn
 * @returns {Promise<Array>}
 */
async function pool(items, limit, fn) {
  const results = new Array(items.length);
  const workerCount = Math.max(1, Math.min(limit, items.length));
  let cursor = 0;
  const drain = async () => {
    for (;;) {
      if (cursor >= items.length) return;
      // Claim the index before awaiting so no two workers take the same item.
      const index = cursor;
      cursor = index + 1;
      results[index] = await fn(items[index], index);
    }
  };
  await Promise.all(Array.from({ length: workerCount }, () => drain()));
  return results;
}
|
|
5158
|
+
/**
 * Runs every strategy on every task and aggregates the results.
 *
 * Defaults: strategies `[sample, refine]`, budget 3, concurrency 3. Tasks run through
 * `pool`; within a task the strategies run one after another. A strategy that throws gets
 * an all-zero cell plus an entry in the row's `errors` (message clipped to 300 chars), so
 * it still counts toward that strategy's means. `cfg.onTask(row, settled, total)` fires
 * after each task.
 *
 * Report fields: `n` (rows with cells), `excluded` (rows without), `perStrategy` (mean
 * score / resolved rate / usd / ms), `perTask`, `pareto` (strategy names on the score-vs-usd
 * frontier) and — only when both "sample" and "refine" ran and at least two rows have
 * cells — `refineVsSample` from `pairedBootstrap`.
 *
 * @param {object} cfg  `environment`, `tasks`, optional `worker`, `strategies`, `budget`,
 *                      `concurrency`, `hooks`, `onTask`.
 * @returns {Promise<object>}
 */
async function runBenchmark(cfg) {
  const strategies = cfg.strategies ?? [sample, refine];
  const budget = cfg.budget ?? 3;
  const concurrency = cfg.concurrency ?? 3;
  const clip = (e) => e instanceof Error ? e.message.slice(0, 300) : String(e);
  let settled = 0;

  const evaluateTask = async (task) => {
    const cells = {};
    const errors = {};
    let row;
    try {
      for (const s of strategies) {
        try {
          const request = { ...cfg.worker, surface: cfg.environment, task, strategy: s, budget };
          if (cfg.hooks) request.hooks = cfg.hooks;
          const r = await runAgentic(request);
          cells[s.name] = {
            score: r.score,
            resolved: r.resolved,
            progression: r.progression,
            usd: r.usd,
            ms: r.ms,
            tokens: r.tokens
          };
        } catch (e) {
          errors[s.name] = clip(e);
          cells[s.name] = {
            score: 0,
            resolved: false,
            progression: [],
            usd: 0,
            ms: 0,
            tokens: { input: 0, output: 0 }
          };
        }
      }
      row = { taskId: task.id, cells };
      if (Object.keys(errors).length > 0) row.errors = errors;
    } catch (e) {
      row = { taskId: task.id, error: clip(e) };
    }
    settled += 1;
    cfg.onTask?.(row, settled, cfg.tasks.length);
    return row;
  };

  const perTask = await pool(cfg.tasks, concurrency, evaluateTask);
  const ok = perTask.filter((r) => !!r.cells);

  const mean = (xs) => {
    if (!xs.length) return 0;
    let total = 0;
    for (const x of xs) total += x;
    return total / xs.length;
  };

  const perStrategy = {};
  for (const s of strategies) {
    const cells = [];
    for (const r of ok) {
      const cell = r.cells[s.name];
      if (cell) cells.push(cell);
    }
    perStrategy[s.name] = {
      score: mean(cells.map((c) => c.score)),
      resolved: mean(cells.map((c) => c.resolved ? 1 : 0)),
      usd: mean(cells.map((c) => c.usd)),
      ms: mean(cells.map((c) => c.ms))
    };
  }

  const candidates = Object.entries(perStrategy).map(([name, v]) => ({ name, score: v.score, usd: v.usd }));
  const objectives = [
    { name: "score", direction: "maximize", value: (c) => c.score },
    { name: "usd", direction: "minimize", value: (c) => c.usd }
  ];
  const frontier = paretoFrontier(candidates, objectives).frontier.map((c) => c.name);

  const report = {
    n: ok.length,
    excluded: perTask.length - ok.length,
    perStrategy,
    perTask,
    pareto: frontier
  };

  const names = strategies.map((s) => s.name);
  const comparable = names.includes("refine") && names.includes("sample") && ok.length >= 2;
  if (comparable) {
    const sampleScores = ok.map((r) => r.cells.sample?.score ?? 0);
    const refineScores = ok.map((r) => r.cells.refine?.score ?? 0);
    const b = pairedBootstrap(sampleScores, refineScores);
    report.refineVsSample = { mean: b.mean, low: b.low, high: b.high, n: b.n };
  }
  return report;
}
|
|
5248
|
+
/**
 * Prints a benchmark report to the console: a banner with the task count (and how many
 * were excluded), one row per strategy (score, resolved rate, $/task, s/task; strategies
 * on the Pareto frontier are starred), any task-level errors, and — when present — the
 * refine-vs-sample interval with a significance verdict.
 *
 * @param {object} report  A report as returned by runBenchmark.
 */
function printBenchmarkReport(report) {
  const asPercent = (x) => `${(x * 100).toFixed(1)}%`;
  const asPoints = (x) => `${x >= 0 ? "+" : ""}${(x * 100).toFixed(1)}pp`;

  const excludedNote = report.excluded ? ` (excluded ${report.excluded})` : "";
  console.log(`\n=== benchmark \xB7 n=${report.n}${excludedNote} ===`);

  const header = [
    "strategy".padEnd(16),
    "score".padStart(7),
    "resolved".padStart(9),
    "$/task".padStart(8),
    "s/task".padStart(7)
  ];
  console.log(` ${header.join(" ")}`);

  for (const [name, v] of Object.entries(report.perStrategy)) {
    const label = report.pareto.includes(name) ? `${name} *` : name;
    const columns = [
      label.padEnd(16),
      asPercent(v.score).padStart(7),
      asPercent(v.resolved).padStart(9),
      `$${v.usd.toFixed(3)}`.padStart(8),
      `${(v.ms / 1e3).toFixed(0).padStart(6)}s`
    ];
    console.log(` ${columns.join(" ")}`);
  }
  if (report.pareto.length) console.log(` * = on the (score, $) Pareto frontier`);

  for (const row of report.perTask) {
    if (row.error) console.log(` \u26A0 ${row.taskId}: ${row.error.slice(0, 120)}`);
  }

  const lift = report.refineVsSample;
  if (!lift) return;
  // The interval excluding zero on either side is what counts as significant.
  let verdict = "n.s.";
  if (lift.low > 0) verdict = "SIGNIF +";
  else if (lift.high < 0) verdict = "SIGNIF -";
  console.log(` refine \u2212 sample: ${asPoints(lift.mean)} CI [${asPoints(lift.low)}, ${asPoints(lift.high)}] (${verdict})`);
}
|
|
5271
|
+
|
|
5272
|
+
// src/runtime/sandbox-run.ts
|
|
5273
|
+
async function openSandboxRun(client, options, deliverable) {
|
|
5274
|
+
const runId = options.runId ?? `sandbox-run-${randomSuffix()}`;
|
|
5275
|
+
const now = options.now ?? Date.now;
|
|
5276
|
+
const capabilities = await probeSandboxCapabilities(client);
|
|
5277
|
+
const lineage = createSandboxLineage(client, capabilities, {
|
|
5278
|
+
...options.maxConcurrency !== void 0 ? { maxConcurrency: options.maxConcurrency } : {}
|
|
5279
|
+
});
|
|
5280
|
+
let handle;
|
|
5281
|
+
let started = false;
|
|
5282
|
+
let runStartedAt;
|
|
5283
|
+
let failed = false;
|
|
5284
|
+
let turnCount = 0;
|
|
5285
|
+
function emit(event) {
|
|
5286
|
+
notifyRuntimeHookEvent(
|
|
5287
|
+
options.hooks,
|
|
5288
|
+
{
|
|
5289
|
+
id: `${runId}:${event.target}:${event.phase}${event.stepIndex === void 0 ? "" : `:${event.stepIndex}`}`,
|
|
5290
|
+
runId,
|
|
5291
|
+
scenarioId: options.scenarioId,
|
|
5292
|
+
target: event.target,
|
|
5293
|
+
phase: event.phase,
|
|
5294
|
+
timestamp: event.timestamp,
|
|
5295
|
+
stepIndex: event.stepIndex,
|
|
5296
|
+
payload: event.payload,
|
|
5297
|
+
metadata: { producer: "openSandboxRun" }
|
|
5298
|
+
},
|
|
5299
|
+
{ signal: options.signal }
|
|
5300
|
+
);
|
|
5301
|
+
}
|
|
5302
|
+
const runPayload = () => ({
|
|
5303
|
+
agentName: options.agentRun.name ?? options.agentRun.profile.name ?? "agent",
|
|
5304
|
+
profileName: options.agentRun.profile.name,
|
|
5305
|
+
backendType: backendType(options.agentRun),
|
|
5306
|
+
deliverableKind: deliverable.kind,
|
|
5307
|
+
...deliverable.kind === "artifact" ? { deliverablePath: deliverable.path } : {},
|
|
5308
|
+
...handle ? { sessionId: handle.sessionId, sandboxId: handle.box.id } : {}
|
|
5309
|
+
});
|
|
5310
|
+
const turnPayload = (prompt, turnKind, startedAt, result, error) => ({
|
|
5311
|
+
...runPayload(),
|
|
5312
|
+
turnKind,
|
|
5313
|
+
promptChars: prompt.length,
|
|
5314
|
+
promptHash: hashText(prompt),
|
|
5315
|
+
...result !== void 0 || error !== void 0 ? { durationMs: Math.max(0, now() - startedAt) } : {},
|
|
5316
|
+
...result ? {
|
|
5317
|
+
eventCount: result.events.length,
|
|
5318
|
+
eventTypes: eventTypeCounts(result.events),
|
|
5319
|
+
...result.readError !== void 0 ? { readError: result.readError } : {}
|
|
5320
|
+
} : {},
|
|
5321
|
+
...error !== void 0 ? { error: errorMessage(error) } : {}
|
|
5322
|
+
});
|
|
5323
|
+
async function settle2(box, events) {
|
|
5324
|
+
const collected = [];
|
|
5325
|
+
for await (const ev of events) collected.push(ev);
|
|
5326
|
+
if (deliverable.kind === "events") {
|
|
5327
|
+
return { out: deliverable.fromEvents(collected), events: collected };
|
|
5328
|
+
}
|
|
5329
|
+
throwIfAborted(options.signal);
|
|
5330
|
+
let raw = "";
|
|
5331
|
+
let readError;
|
|
5332
|
+
const readAttempts = 4;
|
|
5333
|
+
const readDelayMs = options.readRetryDelayMs ?? 1e3;
|
|
5334
|
+
for (let attempt = 0; attempt < readAttempts; attempt += 1) {
|
|
5335
|
+
throwIfAborted(options.signal);
|
|
5336
|
+
try {
|
|
5337
|
+
raw = await box.fs.read(deliverable.path);
|
|
5338
|
+
readError = void 0;
|
|
5339
|
+
break;
|
|
5340
|
+
} catch (err) {
|
|
4033
5341
|
readError = err instanceof Error ? err.message : String(err);
|
|
4034
5342
|
if (attempt < readAttempts - 1 && readDelayMs > 0)
|
|
4035
5343
|
await sleep(readDelayMs * (attempt + 1), options.signal);
|
|
@@ -4189,6 +5497,710 @@ function errorMessage(error) {
|
|
|
4189
5497
|
return error instanceof Error ? error.message : String(error);
|
|
4190
5498
|
}
|
|
4191
5499
|
|
|
5500
|
+
// src/runtime/strategy-author.ts
|
|
5501
|
+
import { mkdirSync, writeFileSync } from "fs";
|
|
5502
|
+
import { join } from "path";
|
|
5503
|
+
// Default authoring prompt: requestAuthoredCode sends this text to the author model
// whenever opts.contract is not supplied. Everything between the backticks is runtime
// text — editing it changes what the author model is told.
var strategyAuthorContract = `
|
|
5504
|
+
You author an OPTIMIZATION STRATEGY for an agentic loop system. A strategy decides how to
|
|
5505
|
+
spend a compute budget to beat a task's deployable check. You compose exactly two steps:
|
|
5506
|
+
|
|
5507
|
+
shot(spec?: { handle?, messages?, steer?, persona?, tools? }): Promise<ShotResult | null>
|
|
5508
|
+
Runs ONE worker attempt (a bounded tool loop) over an artifact.
|
|
5509
|
+
- omit handle => the shot opens its OWN fresh artifact and closes it after (a sample).
|
|
5510
|
+
- pass handle => the shot CONTINUES that artifact (state accumulates across shots).
|
|
5511
|
+
- messages => the carried conversation (pass the previous ShotResult.messages to continue).
|
|
5512
|
+
- steer => a corrective instruction injected before the shot.
|
|
5513
|
+
- persona => { systemPrompt?, model? } \u2014 give THIS shot its own role and/or model
|
|
5514
|
+
(multi-agent strategies: a researcher shot then an engineer shot, a panel of k
|
|
5515
|
+
personas over one budget). On a fresh shot the systemPrompt replaces the task's; on
|
|
5516
|
+
a carried conversation it arrives as a hand-off message. Same conserved budget.
|
|
5517
|
+
- tools => string[] \u2014 restrict THIS shot to a subset of the task's tools by
|
|
5518
|
+
name (focus an explore shot on read-only tools, an execute shot on write tools).
|
|
5519
|
+
Restriction-only; unknown names make the shot fail. ALWAYS select from
|
|
5520
|
+
await listTools(handle) \u2014 never hardcode. Omitted => the shot sees every tool.
|
|
5521
|
+
ShotResult = { messages, score (0..1 on the task's check), passes, total, completions, toolErrors }
|
|
5522
|
+
Returns null if the attempt failed infra-wise.
|
|
5523
|
+
|
|
5524
|
+
critique(messages): Promise<string | null>
|
|
5525
|
+
A firewalled trace-analyst reads the attempt's trajectory and returns ONE corrective
|
|
5526
|
+
instruction (or null when it judges the work complete). Costs ~1 completion.
|
|
5527
|
+
|
|
5528
|
+
consult(messages, instruction): Promise<string | null>
|
|
5529
|
+
The RAW analyst channel: the same firewalled critic answers YOUR instruction over the
|
|
5530
|
+
trajectory verbatim (no reformatting) \u2014 use it when you need a specific reply format
|
|
5531
|
+
(a decision, a prediction). Costs ~1 completion.
|
|
5532
|
+
|
|
5533
|
+
surface.open(task) / surface.close(handle)
|
|
5534
|
+
Open a persistent artifact you manage yourself (remember to close in a finally).
|
|
5535
|
+
close is idempotent \u2014 closing an already-closed handle is a safe no-op.
|
|
5536
|
+
|
|
5537
|
+
listTools(handle): Promise<Array<{ name, description? }>>
|
|
5538
|
+
The tools THIS task actually offers. TOOL SETS VARY PER TASK \u2014 if you restrict a
|
|
5539
|
+
shot with \`tools\`, you MUST pick names from await listTools(handle); hardcoding
|
|
5540
|
+
names from an example kills your shots on every task whose tools differ.
|
|
5541
|
+
|
|
5542
|
+
Rules:
|
|
5543
|
+
- ALWAYS await every shot/critique/surface call \u2014 a floating promise that rejects
|
|
5544
|
+
crashes the whole benchmark run.
|
|
5545
|
+
- Stay within ~budget total shots; every shot/critique spends from a conserved pool.
|
|
5546
|
+
- For a FRESH attempt OMIT \`messages\` entirely (never pass \`[]\` \u2014 an empty array is a
|
|
5547
|
+
fresh conversation too, but be explicit). To CONTINUE, pass the previous
|
|
5548
|
+
ShotResult.messages unchanged.
|
|
5549
|
+
- Return { score, resolved, completions, progression, shots } \u2014 score = the BEST checkpoint
|
|
5550
|
+
you reached (keep-best, never final-state), progression = score after each shot.
|
|
5551
|
+
- The module must be EXACTLY this shape (no other imports, no commentary outside code):
|
|
5552
|
+
|
|
5553
|
+
import { defineStrategy } from '@tangle-network/agent-runtime/loops'
|
|
5554
|
+
export default defineStrategy('your-strategy-name', async ({ surface, task, budget, shot, critique, listTools }) => {
|
|
5555
|
+
// your composition (listTools comes from the destructured context \u2014 it is NOT a global)
|
|
5556
|
+
})
|
|
5557
|
+
`;
|
|
5558
|
+
/**
 * Static gate for model-authored strategy source; throws on the first violation.
 *
 * Checks, in order:
 *  1. every line that starts with `import` must be a named import from
 *     `@tangle-network/agent-runtime/loops`;
 *  2. every static module specifier anywhere in the text — `import … from "x"`,
 *     `export … from "x"`, or a bare `import "x"` — must be that same module;
 *  3. none of the banned constructs may appear (require, dynamic import, eval,
 *     the Function constructor, process / globalThis access, fetch, selected node builtins).
 *
 * Step 2 is new: the line-start check alone let a foreign import through when it shared a
 * line with other code (`const a = 1; import fs from 'fs'`) or was written as a re-export,
 * and bare `fs` is not covered by the builtin ban list. The Function check now also
 * matches a call without `new`, which has the same effect as `new Function(...)`.
 *
 * This is a textual screen, not a sandbox: it can reject harmless text that merely looks
 * like a violation (e.g. inside a string), and it is not a security boundary on its own.
 *
 * @param {string} code  Module source returned by the author model.
 * @throws {Error} "authored code rejected: …" naming the violation.
 */
function assertStrategyContract(code) {
  const allowedModule = "@tangle-network/agent-runtime/loops";
  const allowedImport = /^\s*import\s+\{[^}]*\}\s+from\s+['"]@tangle-network\/agent-runtime\/loops['"]/;
  for (const line of code.split("\n")) {
    if (/^\s*import\s/.test(line) && !allowedImport.test(line)) {
      throw new Error(`authored code rejected: foreign import \u2014 ${line.trim().slice(0, 120)}`);
    }
  }

  // Catch static imports / re-exports that do not start a line. The scan from the keyword
  // to `from` may not cross a quote, backtick or `;`, so it stays inside one statement.
  const moduleSpecifier = /\b(?:import|export)\b[^'"`;]*?\bfrom\s*['"]([^'"]*)['"]|\bimport\s*['"]([^'"]*)['"]/g;
  for (const found of code.matchAll(moduleSpecifier)) {
    const specifier = found[1] ?? found[2];
    if (specifier !== allowedModule) {
      throw new Error(`authored code rejected: foreign import \u2014 ${found[0].trim().slice(0, 120)}`);
    }
  }

  const banned = [
    [/\brequire\s*\(/, "require()"],
    [/\bimport\s*\(/, "dynamic import()"],
    [/\beval\s*\(/, "eval()"],
    // With or without `new`: both compile a string into code.
    [/\bFunction\s*\(/, "new Function()"],
    [/\bprocess\s*[.[]/, "process access"],
    [/\bglobalThis\s*[.[]/, "globalThis access"],
    [/\bfetch\s*\(/, "network access"],
    [/child_process|node:fs|node:net|node:http|worker_threads/, "node builtin access"]
  ];
  for (const [re, what] of banned) {
    if (re.test(code)) throw new Error(`authored code rejected: ${what}`);
  }
}
|
|
5579
|
+
/**
 * Asks the author model for one strategy module and returns the body of the first fenced
 * code block in its reply.
 *
 * The request carries `model`, `temperature` and `maxTokens` only when they are set. The
 * user message is the contract (`opts.contract`, else the default strategyAuthorContract)
 * followed by the baseline results for `opts.environmentName` at `opts.budget`.
 * `opts.signal`, when set, is forwarded as the call option.
 *
 * @param {object} opts  `chat`, `environmentName`, `budget`, `lossesJson`, optional
 *                       `contract`, `temperature`, `maxTokens`, `signal`.
 * @param {string} [model]  Model to request; omitted from the request when falsy.
 * @returns {Promise<string>} The code inside the fence (without the fence lines).
 * @throws {Error} when the reply has no non-empty fenced code block.
 */
async function requestAuthoredCode(opts, model) {
  const request = {};
  if (model) request.model = model;
  if (opts.temperature !== void 0) request.temperature = opts.temperature;
  if (opts.maxTokens !== void 0) request.maxTokens = opts.maxTokens;

  const systemMessage = {
    role: "system",
    content: "You are a senior engineer authoring optimization strategies for agent loops. Output exactly one fenced ```ts code block and nothing else."
  };
  const userMessage = {
    role: "user",
    content: `${opts.contract ?? strategyAuthorContract}\n\nBASELINE RESULTS on the "${opts.environmentName}" environment (budget=${opts.budget}):\n${opts.lossesJson}\n\nAuthor ONE new strategy that you expect to beat the baselines on THIS environment at the same budget. Use the losses to target the observed failure mode. Output only the module code block.`
  };
  request.messages = [systemMessage, userMessage];

  const callOptions = {};
  if (opts.signal) callOptions.signal = opts.signal;

  const res = await opts.chat.chat(request, callOptions);
  const fenced = res.content.match(/```(?:ts|typescript)?\s*\n([\s\S]*?)```/);
  const code = fenced?.[1];
  if (!code) {
    throw new Error(
      `authorStrategy: no code block in the author's reply (model=${model ?? "default"}): ${res.content.slice(0, 300)}`
    );
  }
  return code;
}
|
|
5611
|
+
/**
 * Has a model author a strategy, vets the source, writes it to disk and loads it.
 *
 * Steps: request code with `opts.model` (retrying once with `opts.fallbackModel` when the
 * first request fails and a fallback is configured), run assertStrategyContract on it,
 * write it to `<outDir>/authored-<timestamp>.mts`, import that file and check that its
 * default export looks like a Strategy (`name` plus a `driver` function).
 *
 * Fix: the module used to be imported from the hand-built string `file://${file}`. That is
 * not a valid file URL when `outDir` is relative (the first path segment becomes the URL
 * host), on Windows paths, or when the path contains URL-special characters (`#`, `?`,
 * `%`, spaces). The URL is now produced by `pathToFileURL`, which resolves the path
 * absolutely and encodes it. The returned `file` is unchanged.
 *
 * @param {object} opts  Everything requestAuthoredCode needs, plus `outDir` and optional
 *                       `model` / `fallbackModel`.
 * @returns {Promise<{ strategy: object, file: string, code: string }>}
 * @throws {Error} when authoring fails, the code is rejected, or the module has no
 *                 default Strategy export.
 */
async function authorStrategy(opts) {
  let code;
  try {
    code = await requestAuthoredCode(opts, opts.model);
  } catch (primaryError) {
    if (!opts.fallbackModel) throw primaryError;
    code = await requestAuthoredCode(opts, opts.fallbackModel);
  }
  assertStrategyContract(code);
  mkdirSync(opts.outDir, { recursive: true });
  const file = join(opts.outDir, `authored-${Date.now()}.mts`);
  writeFileSync(file, code);
  // Loaded locally so this block does not depend on a file-level import of "url".
  const { pathToFileURL } = await import("node:url");
  const mod = await import(pathToFileURL(file).href);
  if (!mod.default || typeof mod.default.driver !== "function" || !mod.default.name) {
    throw new Error(`authorStrategy: ${file} does not export a default Strategy`);
  }
  return { strategy: mod.default, file, code };
}
|
|
5629
|
+
|
|
5630
|
+
// src/runtime/strategy-evolution.ts
|
|
5631
|
+
import { existsSync, readFileSync, writeFileSync as writeFileSync2 } from "fs";
|
|
5632
|
+
import { gzipSync } from "zlib";
|
|
5633
|
+
/**
 * Per-strategy mean score and cost over the "discriminating" tasks only — tasks where
 * every strategy in `fieldOrder` has a score and those scores are not all equal.
 *
 * @param {object} report  Benchmark report; only `perTask` is read.
 * @param {string[]} fieldOrder  Strategy names to compare.
 * @returns {Object<string, { score: number, usd: number }> | null}
 *          null when no task discriminates between the strategies.
 */
function discriminatingMeans(report, fieldOrder) {
  const discriminating = [];
  for (const row of report.perTask) {
    if (!row.cells) continue;
    const scores = [];
    for (const name of fieldOrder) {
      const score = row.cells[name]?.score;
      if (score !== void 0) scores.push(score);
    }
    // A strategy without a score on this task makes the task unusable for comparison.
    if (scores.length < fieldOrder.length) continue;
    const spread = Math.max(...scores) - Math.min(...scores);
    if (spread > 0) discriminating.push(row);
  }
  if (discriminating.length === 0) return null;

  const means = {};
  for (const name of fieldOrder) {
    const cells = discriminating.map((row) => row.cells?.[name]).filter((cell) => !!cell);
    let scoreSum = 0;
    let usdSum = 0;
    for (const cell of cells) {
      scoreSum += cell.score;
      usdSum += cell.usd;
    }
    means[name] = { score: scoreSum / cells.length, usd: usdSum / cells.length };
  }
  return means;
}
|
|
5651
|
+
/**
 * Chooses the champion strategy from a table of per-strategy means.
 *
 * - policy "score": the first strategy (in `fieldOrder`) holding the top score.
 * - any other policy: among strategies within `epsilon` of the top score, the cheapest;
 *   ties on cost go to the higher score.
 *
 * Names in `fieldOrder` that are missing from `means` are ignored.
 *
 * @param {Object<string, { score: number, usd: number }>} means
 * @param {string[]} fieldOrder
 * @param {string} policy
 * @param {number} epsilon  Score slack for the cost-aware policy.
 * @returns {{ name: string, score: number, usd: number }}
 * @throws {Error} when none of the named strategies is in `means`, or no pick results.
 */
function pickChampion(means, fieldOrder, policy, epsilon) {
  const entries = [];
  for (const name of fieldOrder) {
    const summary = means[name];
    if (summary) entries.push({ name, summary });
  }
  if (entries.length === 0) {
    throw new Error("pickChampion: the means table carries none of the field strategies");
  }

  const best = Math.max(...entries.map((e) => e.summary.score));
  let pick;
  if (policy === "score") {
    pick = entries.find((e) => e.summary.score === best);
  } else {
    const contenders = entries.filter((e) => e.summary.score >= best - epsilon);
    // Cheapest first; equal cost falls back to higher score first.
    contenders.sort((a, b) => a.summary.usd - b.summary.usd || b.summary.score - a.summary.score);
    pick = contenders[0];
  }
  if (!pick) throw new Error("pickChampion: empty pick (unreachable)");

  const { name, summary } = pick;
  return { name, score: summary.score, usd: summary.usd };
}
|
|
5660
|
+
/**
 * Picks the champion from a benchmark report by applying pickChampion to the report's
 * `perStrategy` means.
 *
 * @param {object} report  Benchmark report; only `perStrategy` is read.
 * @param {string[]} fieldOrder
 * @param {string} policy
 * @param {number} epsilon
 * @returns {{ name: string, score: number, usd: number }}
 */
function selectChampion(report, fieldOrder, policy, epsilon) {
  const { perStrategy } = report;
  return pickChampion(perStrategy, fieldOrder, policy, epsilon);
}
|
|
5663
|
+
/**
 * Renders the strategy archive as a bullet list, one line per entry:
 * `- <name> (<source>, gen <generation>, last score <N>%)`, with the score shown as a
 * whole-number percentage.
 *
 * @param {Array<{ name: string, source: string, generation: number, score: number }>} archive
 * @returns {string} Lines joined with "\n"; empty string for an empty archive.
 */
var fieldSummary = (archive) => {
  const lines = archive.map((node) => {
    const percent = (node.score * 100).toFixed(0);
    return `- ${node.name} (${node.source}, gen ${node.generation}, last score ${percent}%)`;
  });
  return lines.join("\n");
};
|
|
5666
|
+
/**
 * Serialises a benchmark report's per-task results into a compact JSON string for the
 * authoring prompt, capped at 12,000 characters.
 *
 * Each task with cells becomes `{ task, errors?, cells }` (error messages clipped to 100
 * chars); a task without cells becomes `{ task, error }` (clipped to 80 chars).
 *
 * `detail === "binary"` is the leakage-bounded channel: each cell carries only
 * `resolved` and the cost, so the author learns pass/fail per task and nothing finer.
 * (The original note cites arXiv:2606.11045 for exploration surviving this restriction,
 * and leaves whether authoring does as an open A/B.) Any other `detail` also includes
 * the rounded score and progression.
 *
 * @param {object} report  Benchmark report; only `perTask` is read.
 * @param {string} detail  "binary" for the one-bit-per-cell view.
 * @returns {string}
 */
var compactLosses = (report, detail) => {
  const round2 = (x) => Math.round(x * 100) / 100;
  const roundUsd = (x) => Math.round(x * 1e4) / 1e4;

  const compactCell = (c) => {
    if (detail === "binary") {
      return { resolved: c.resolved, usd: roundUsd(c.usd) };
    }
    return {
      score: round2(c.score),
      resolved: c.resolved,
      usd: roundUsd(c.usd),
      progression: (c.progression ?? []).map((p) => round2(p))
    };
  };

  const rows = [];
  for (const row of report.perTask) {
    if (!row.cells) {
      rows.push({ task: row.taskId, error: row.error?.slice(0, 80) });
      continue;
    }
    const entry = { task: row.taskId };
    if (row.errors) {
      const clipped = Object.entries(row.errors).map(([n, msg]) => [n, msg.slice(0, 100)]);
      entry.errors = Object.fromEntries(clipped);
    }
    const cellPairs = Object.entries(row.cells).map(([name, c]) => [name, compactCell(c)]);
    entry.cells = Object.fromEntries(cellPairs);
    rows.push(entry);
  }
  return JSON.stringify(rows).slice(0, 12e3);
};
|
|
5695
|
+
/**
 * Returns `orig` under a different name.
 *
 * When the name already matches, `orig` itself is returned. Otherwise the result wraps
 * `orig.driver`: the agent it builds keeps every property of the original agent but
 * reports `unique` as its name, and a "done" outcome from `act` gets `deliverable.mode`
 * rewritten to `unique`. Outcomes of any other kind pass through untouched.
 *
 * @param {{ name: string, driver: Function }} orig
 * @param {string} unique  Name the returned strategy should carry.
 * @returns {{ name: string, driver: Function }}
 */
function renameStrategy(orig, unique) {
  if (orig.name === unique) return orig;

  const driver = (s, t, o, b) => {
    const inner = orig.driver(s, t, o, b);
    const act = async (task, scope) => {
      const outcome = await inner.act(task, scope);
      if (outcome.kind === "done") {
        return { ...outcome, deliverable: { ...outcome.deliverable, mode: unique } };
      }
      return outcome;
    };
    return { ...inner, name: unique, act };
  };

  return { name: unique, driver };
}
|
|
5714
|
+
async function runStrategyEvolution(cfg) {
|
|
5715
|
+
const budget = cfg.budget ?? 3;
|
|
5716
|
+
const concurrency = cfg.concurrency ?? 3;
|
|
5717
|
+
const generations = cfg.generations ?? 2;
|
|
5718
|
+
const populationSize = cfg.populationSize ?? 2;
|
|
5719
|
+
const baselines = cfg.baselines ?? [sample, refine, sampleThenRefine];
|
|
5720
|
+
const policy = cfg.champion ?? "costAware";
|
|
5721
|
+
const epsilon = cfg.championEpsilon ?? (cfg.objective === "cost" ? cfg.scoreTolerance ?? 0.05 : 0.01);
|
|
5722
|
+
const byName = new Map(baselines.map((s) => [s.name, s]));
|
|
5723
|
+
const codeByName = /* @__PURE__ */ new Map();
|
|
5724
|
+
const fingerprint = {
|
|
5725
|
+
trainN: cfg.trainN,
|
|
5726
|
+
holdoutN: cfg.holdoutN,
|
|
5727
|
+
budget,
|
|
5728
|
+
generations,
|
|
5729
|
+
populationSize
|
|
5730
|
+
};
|
|
5731
|
+
let ckpt;
|
|
5732
|
+
if (cfg.checkpoint?.resume && existsSync(cfg.checkpoint.path)) {
|
|
5733
|
+
const raw = JSON.parse(readFileSync(cfg.checkpoint.path, "utf8"));
|
|
5734
|
+
if (JSON.stringify(raw.fingerprint) !== JSON.stringify(fingerprint)) {
|
|
5735
|
+
throw new Error(
|
|
5736
|
+
`evolution resume: checkpoint design mismatch \u2014 checkpoint ${JSON.stringify(raw.fingerprint)} vs config ${JSON.stringify(fingerprint)}; delete ${cfg.checkpoint.path} or match the config`
|
|
5737
|
+
);
|
|
5738
|
+
}
|
|
5739
|
+
ckpt = raw;
|
|
5740
|
+
}
|
|
5741
|
+
const save = (state) => {
|
|
5742
|
+
if (cfg.checkpoint)
|
|
5743
|
+
writeFileSync2(cfg.checkpoint.path, JSON.stringify({ ...state, fingerprint }, null, 1));
|
|
5744
|
+
};
|
|
5745
|
+
const bench = async (phase, tasks, strategies) => {
|
|
5746
|
+
await cfg.onPhase?.(phase);
|
|
5747
|
+
return runBenchmark({
|
|
5748
|
+
environment: cfg.environment,
|
|
5749
|
+
tasks,
|
|
5750
|
+
worker: cfg.worker,
|
|
5751
|
+
strategies,
|
|
5752
|
+
budget,
|
|
5753
|
+
concurrency,
|
|
5754
|
+
...cfg.onTask ? { onTask: (row, done, total) => cfg.onTask?.(phase, row, done, total) } : {},
|
|
5755
|
+
...cfg.hooks ? { hooks: cfg.hooks } : {}
|
|
5756
|
+
});
|
|
5757
|
+
};
|
|
5758
|
+
const train = await cfg.tasks(0, cfg.trainN);
|
|
5759
|
+
const probeTask = train[0];
|
|
5760
|
+
if (!probeTask) throw new Error("runStrategyEvolution: empty train slice");
|
|
5761
|
+
const probe = await cfg.environment.open(probeTask);
|
|
5762
|
+
let toolCatalog;
|
|
5763
|
+
try {
|
|
5764
|
+
const tools = await cfg.environment.tools(probeTask, probe);
|
|
5765
|
+
toolCatalog = tools.map(
|
|
5766
|
+
(t) => `- ${t.function.name}${t.function.description ? ` \u2014 ${t.function.description.slice(0, 120)}` : ""}`
|
|
5767
|
+
).join("\n");
|
|
5768
|
+
} finally {
|
|
5769
|
+
await cfg.environment.close(probe);
|
|
5770
|
+
}
|
|
5771
|
+
const gen0 = ckpt?.gen0 ?? await bench("gen0", train, baselines);
|
|
5772
|
+
const archive = ckpt?.archive ? [...ckpt.archive] : baselines.map((s) => ({
|
|
5773
|
+
name: s.name,
|
|
5774
|
+
source: "baseline",
|
|
5775
|
+
generation: 0,
|
|
5776
|
+
score: gen0.perStrategy[s.name]?.score ?? 0,
|
|
5777
|
+
usd: gen0.perStrategy[s.name]?.usd ?? 0
|
|
5778
|
+
}));
|
|
5779
|
+
const gen0Champion = ckpt?.gen0Champion ?? selectChampion(
|
|
5780
|
+
gen0,
|
|
5781
|
+
baselines.map((s) => s.name),
|
|
5782
|
+
policy,
|
|
5783
|
+
epsilon
|
|
5784
|
+
);
|
|
5785
|
+
const generationRows = ckpt?.generations ? [...ckpt.generations] : [];
|
|
5786
|
+
const trajectory = ckpt?.trajectory ? [...ckpt.trajectory] : [
|
|
5787
|
+
{
|
|
5788
|
+
generation: 0,
|
|
5789
|
+
champion: gen0Champion.name,
|
|
5790
|
+
score: gen0Champion.score,
|
|
5791
|
+
usd: gen0Champion.usd
|
|
5792
|
+
}
|
|
5793
|
+
];
|
|
5794
|
+
for (const row of generationRows) {
|
|
5795
|
+
for (const c of row.candidates) {
|
|
5796
|
+
if (!c.file || c.error) continue;
|
|
5797
|
+
const mod = await import(`file://${c.file}`);
|
|
5798
|
+
if (!mod.default || typeof mod.default.driver !== "function") {
|
|
5799
|
+
throw new Error(
|
|
5800
|
+
`evolution resume: ${c.file} no longer exports a Strategy \u2014 cannot restore "${c.name}"`
|
|
5801
|
+
);
|
|
5802
|
+
}
|
|
5803
|
+
byName.set(c.name, renameStrategy(mod.default, c.name));
|
|
5804
|
+
codeByName.set(c.name, readFileSync(c.file, "utf8"));
|
|
5805
|
+
}
|
|
5806
|
+
}
|
|
5807
|
+
let authoredOk = generationRows.reduce(
|
|
5808
|
+
(n, row) => n + row.candidates.filter((c) => !c.error).length,
|
|
5809
|
+
0
|
|
5810
|
+
);
|
|
5811
|
+
const lastRow = generationRows[generationRows.length - 1];
|
|
5812
|
+
let incumbent = lastRow ? lastRow.champion : gen0Champion;
|
|
5813
|
+
let latestReport = lastRow ? lastRow.report : gen0;
|
|
5814
|
+
if (!ckpt) save({ gen0, gen0Champion, generations: generationRows, archive, trajectory });
|
|
5815
|
+
for (let g = generationRows.length + 1; g <= generations; g += 1) {
|
|
5816
|
+
const lossesJson = compactLosses(latestReport, cfg.lossesDetail ?? "exact");
|
|
5817
|
+
const candidates = [];
|
|
5818
|
+
const newStrategies = [];
|
|
5819
|
+
for (let i = 0; i < populationSize; i += 1) {
|
|
5820
|
+
const objectiveNote = cfg.objective === "cost" ? `
|
|
5821
|
+
|
|
5822
|
+
YOUR OBJECTIVE: match or exceed the incumbent's SCORE while spending LESS (the losses include usd per task). Promotion requires proven score non-inferiority PLUS significant cost savings \u2014 a strategy that ties the score at half the cost WINS; a cheaper strategy that loses score by more than ${((cfg.scoreTolerance ?? 0.05) * 100).toFixed(0)}pp LOSES.` : "";
|
|
5823
|
+
const contract = `${strategyAuthorContract}${objectiveNote}
|
|
5824
|
+
|
|
5825
|
+
EXAMPLE TOOLS FROM ONE TASK (tool sets VARY per task on this domain \u2014 a strategy MUST select tool names from await listTools(handle) at runtime; hardcoding these example names will zero your score on most tasks):
|
|
5826
|
+
${toolCatalog}
|
|
5827
|
+
|
|
5828
|
+
STRATEGIES ALREADY IN THE TOURNAMENT (author something MEANINGFULLY different \u2014 a new composition, not a rename):
|
|
5829
|
+
${fieldSummary(archive)}
|
|
5830
|
+
|
|
5831
|
+
You are authoring candidate ${i + 1} of ${populationSize} this generation; explore a distinct region of the strategy space from your siblings.`;
|
|
5832
|
+
try {
|
|
5833
|
+
const authored = await authorStrategy({
|
|
5834
|
+
chat: cfg.author.chat,
|
|
5835
|
+
...cfg.author.model ? { model: cfg.author.model } : {},
|
|
5836
|
+
...cfg.author.fallbackModel ? { fallbackModel: cfg.author.fallbackModel } : {},
|
|
5837
|
+
...cfg.author.temperature !== void 0 ? { temperature: cfg.author.temperature } : {},
|
|
5838
|
+
...cfg.author.maxTokens !== void 0 ? { maxTokens: cfg.author.maxTokens } : {},
|
|
5839
|
+
contract,
|
|
5840
|
+
environmentName: cfg.environment.name,
|
|
5841
|
+
lossesJson,
|
|
5842
|
+
budget,
|
|
5843
|
+
outDir: cfg.outDir
|
|
5844
|
+
});
|
|
5845
|
+
const unique = byName.has(authored.strategy.name) ? `${authored.strategy.name}-g${g}c${i + 1}` : authored.strategy.name;
|
|
5846
|
+
const strategy = renameStrategy(authored.strategy, unique);
|
|
5847
|
+
byName.set(unique, strategy);
|
|
5848
|
+
codeByName.set(unique, authored.code);
|
|
5849
|
+
newStrategies.push(strategy);
|
|
5850
|
+
archive.push({
|
|
5851
|
+
name: unique,
|
|
5852
|
+
source: "authored",
|
|
5853
|
+
generation: g,
|
|
5854
|
+
parent: incumbent.name,
|
|
5855
|
+
gzipBits: gzipSync(Buffer.from(authored.code)).length * 8,
|
|
5856
|
+
file: authored.file,
|
|
5857
|
+
score: 0,
|
|
5858
|
+
usd: 0
|
|
5859
|
+
});
|
|
5860
|
+
candidates.push({
|
|
5861
|
+
name: unique,
|
|
5862
|
+
file: authored.file,
|
|
5863
|
+
gzipBits: gzipSync(Buffer.from(authored.code)).length * 8,
|
|
5864
|
+
codeChars: authored.code.length
|
|
5865
|
+
});
|
|
5866
|
+
authoredOk += 1;
|
|
5867
|
+
} catch (e) {
|
|
5868
|
+
candidates.push({
|
|
5869
|
+
name: `(author-failed g${g}c${i + 1})`,
|
|
5870
|
+
error: e instanceof Error ? e.message.slice(0, 300) : String(e)
|
|
5871
|
+
});
|
|
5872
|
+
}
|
|
5873
|
+
}
|
|
5874
|
+
const incumbentStrategy = byName.get(incumbent.name);
|
|
5875
|
+
if (!incumbentStrategy)
|
|
5876
|
+
throw new Error(`evolution: incumbent "${incumbent.name}" missing from the field`);
|
|
5877
|
+
const field = [incumbentStrategy, ...newStrategies];
|
|
5878
|
+
const report = await bench(`gen${g}`, train, field);
|
|
5879
|
+
for (const node of archive) {
|
|
5880
|
+
const cell = report.perStrategy[node.name];
|
|
5881
|
+
if (cell) {
|
|
5882
|
+
node.score = cell.score;
|
|
5883
|
+
node.usd = cell.usd;
|
|
5884
|
+
}
|
|
5885
|
+
}
|
|
5886
|
+
const fieldNames = field.map((s) => s.name);
|
|
5887
|
+
const means = cfg.band ? discriminatingMeans(report, fieldNames) ?? report.perStrategy : report.perStrategy;
|
|
5888
|
+
const champion = pickChampion(means, fieldNames, policy, epsilon);
|
|
5889
|
+
generationRows.push({ generation: g, candidates, report, champion });
|
|
5890
|
+
trajectory.push({
|
|
5891
|
+
generation: g,
|
|
5892
|
+
champion: champion.name,
|
|
5893
|
+
score: champion.score,
|
|
5894
|
+
usd: champion.usd
|
|
5895
|
+
});
|
|
5896
|
+
incumbent = champion;
|
|
5897
|
+
latestReport = report;
|
|
5898
|
+
save({ gen0, gen0Champion, generations: generationRows, archive, trajectory });
|
|
5899
|
+
}
|
|
5900
|
+
if (authoredOk === 0) {
|
|
5901
|
+
throw new Error(
|
|
5902
|
+
"runStrategyEvolution: every author attempt failed across all generations \u2014 no search happened; see the candidates[].error entries"
|
|
5903
|
+
);
|
|
5904
|
+
}
|
|
5905
|
+
const holdoutOffset = cfg.trainN + (cfg.holdoutOffset ?? 0);
|
|
5906
|
+
let holdoutTasks = [];
|
|
5907
|
+
let bandInfo;
|
|
5908
|
+
if (ckpt?.holdout && ckpt.verdict) {
|
|
5909
|
+
bandInfo = ckpt.band;
|
|
5910
|
+
if (cfg.reproducerCheck && codeByName.has(incumbent.name)) {
|
|
5911
|
+
const pool2 = await cfg.tasks(holdoutOffset, cfg.band?.holdoutPoolN ?? cfg.holdoutN);
|
|
5912
|
+
const gateIds = new Set(ckpt.holdout.perTask.map((r) => r.taskId));
|
|
5913
|
+
holdoutTasks = pool2.filter((t) => gateIds.has(t.id));
|
|
5914
|
+
}
|
|
5915
|
+
} else if (cfg.band) {
|
|
5916
|
+
const maxRef = cfg.band.maxRefScore ?? 0.99;
|
|
5917
|
+
const reference = baselines[0];
|
|
5918
|
+
if (!reference)
|
|
5919
|
+
throw new Error("evolution band: baselines[0] required as the screening reference");
|
|
5920
|
+
const pool2 = await cfg.tasks(holdoutOffset, cfg.band.holdoutPoolN);
|
|
5921
|
+
const screen = await bench("band-screen", pool2, [reference]);
|
|
5922
|
+
const refScores = screen.perTask.filter((r) => r.cells?.[reference.name]).map((r) => ({ taskId: r.taskId, score: r.cells?.[reference.name]?.score ?? 0 }));
|
|
5923
|
+
const inBandIds = new Set(refScores.filter((r) => r.score <= maxRef).map((r) => r.taskId));
|
|
5924
|
+
const kept = pool2.filter((t) => inBandIds.has(t.id));
|
|
5925
|
+
if (kept.length < cfg.holdoutN) {
|
|
5926
|
+
throw new Error(
|
|
5927
|
+
`evolution band: only ${kept.length}/${cfg.holdoutN} holdout tasks have headroom (pool ${cfg.band.holdoutPoolN}, reference "${reference.name}" \u2264 ${maxRef}) \u2014 widen holdoutPoolN or raise maxRefScore`
|
|
5928
|
+
);
|
|
5929
|
+
}
|
|
5930
|
+
holdoutTasks = kept.slice(0, cfg.holdoutN);
|
|
5931
|
+
bandInfo = { screened: refScores.length, inBand: kept.length, refScores };
|
|
5932
|
+
} else {
|
|
5933
|
+
holdoutTasks = await cfg.tasks(holdoutOffset, cfg.holdoutN);
|
|
5934
|
+
}
|
|
5935
|
+
let holdout;
|
|
5936
|
+
let verdict;
|
|
5937
|
+
if (ckpt?.holdout && ckpt.verdict) {
|
|
5938
|
+
holdout = ckpt.holdout;
|
|
5939
|
+
verdict = ckpt.verdict;
|
|
5940
|
+
} else {
|
|
5941
|
+
const finalists = [.../* @__PURE__ */ new Set([gen0Champion.name, incumbent.name])].map((n) => byName.get(n)).filter((s) => !!s);
|
|
5942
|
+
holdout = await bench("holdout", holdoutTasks, finalists);
|
|
5943
|
+
verdict = promotionGate({
|
|
5944
|
+
report: holdout,
|
|
5945
|
+
incumbent: gen0Champion.name,
|
|
5946
|
+
candidate: incumbent.name,
|
|
5947
|
+
...cfg.objective === "cost" ? {
|
|
5948
|
+
mode: "non-inferiority",
|
|
5949
|
+
...cfg.scoreTolerance !== void 0 ? { scoreTolerance: cfg.scoreTolerance } : {}
|
|
5950
|
+
} : {},
|
|
5951
|
+
...cfg.minPairedTasks !== void 0 ? { minPairedTasks: cfg.minPairedTasks } : {}
|
|
5952
|
+
});
|
|
5953
|
+
save({
|
|
5954
|
+
gen0,
|
|
5955
|
+
gen0Champion,
|
|
5956
|
+
generations: generationRows,
|
|
5957
|
+
archive,
|
|
5958
|
+
trajectory,
|
|
5959
|
+
holdout,
|
|
5960
|
+
verdict,
|
|
5961
|
+
...bandInfo ? { band: bandInfo } : {}
|
|
5962
|
+
});
|
|
5963
|
+
}
|
|
5964
|
+
let reproduction;
|
|
5965
|
+
const championCode = codeByName.get(incumbent.name);
|
|
5966
|
+
if (cfg.reproducerCheck && championCode) {
|
|
5967
|
+
const words = cfg.reproducerCheck.summaryMaxWords ?? 64;
|
|
5968
|
+
const tolerance = cfg.reproducerCheck.tolerance ?? 0.05;
|
|
5969
|
+
const championHoldoutScore = holdout.perStrategy[incumbent.name]?.score ?? 0;
|
|
5970
|
+
try {
|
|
5971
|
+
const summaryRes = await cfg.author.chat.chat({
|
|
5972
|
+
...cfg.author.model ? { model: cfg.author.model } : {},
|
|
5973
|
+
temperature: 0.2,
|
|
5974
|
+
maxTokens: 512,
|
|
5975
|
+
messages: [
|
|
5976
|
+
{
|
|
5977
|
+
role: "system",
|
|
5978
|
+
content: `Summarize the optimization strategy implemented by this code in at most ${words} words. Describe the COMPOSITION (shots, critique, artifact handling, restarts, stopping) \u2014 not the code. Output only the summary.`
|
|
5979
|
+
},
|
|
5980
|
+
{ role: "user", content: championCode }
|
|
5981
|
+
]
|
|
5982
|
+
});
|
|
5983
|
+
const summary = summaryRes.content.trim();
|
|
5984
|
+
const reproduced = await authorStrategy({
|
|
5985
|
+
chat: cfg.author.chat,
|
|
5986
|
+
...cfg.author.model ? { model: cfg.author.model } : {},
|
|
5987
|
+
...cfg.author.fallbackModel ? { fallbackModel: cfg.author.fallbackModel } : {},
|
|
5988
|
+
...cfg.author.maxTokens !== void 0 ? { maxTokens: cfg.author.maxTokens } : {},
|
|
5989
|
+
temperature: 0.2,
|
|
5990
|
+
contract: `${strategyAuthorContract}
|
|
5991
|
+
|
|
5992
|
+
IMPLEMENT EXACTLY THIS STRATEGY (a colleague's description \u2014 do not invent a different approach):
|
|
5993
|
+
${summary}`,
|
|
5994
|
+
environmentName: cfg.environment.name,
|
|
5995
|
+
lossesJson: "[]",
|
|
5996
|
+
budget,
|
|
5997
|
+
outDir: cfg.outDir
|
|
5998
|
+
});
|
|
5999
|
+
const reproStrategy = {
|
|
6000
|
+
name: `${incumbent.name}-reproduced`,
|
|
6001
|
+
driver: reproduced.strategy.driver
|
|
6002
|
+
};
|
|
6003
|
+
const reproReport = await bench("reproduce", holdoutTasks, [reproStrategy]);
|
|
6004
|
+
const reproducedHoldoutScore = reproReport.perStrategy[reproStrategy.name]?.score ?? 0;
|
|
6005
|
+
reproduction = {
|
|
6006
|
+
summary,
|
|
6007
|
+
reproducedName: reproStrategy.name,
|
|
6008
|
+
file: reproduced.file,
|
|
6009
|
+
championHoldoutScore,
|
|
6010
|
+
reproducedHoldoutScore,
|
|
6011
|
+
gap: championHoldoutScore - reproducedHoldoutScore,
|
|
6012
|
+
reproducible: reproducedHoldoutScore >= championHoldoutScore - tolerance
|
|
6013
|
+
};
|
|
6014
|
+
} catch (e) {
|
|
6015
|
+
reproduction = {
|
|
6016
|
+
summary: "",
|
|
6017
|
+
reproducedName: "",
|
|
6018
|
+
championHoldoutScore,
|
|
6019
|
+
reproducedHoldoutScore: 0,
|
|
6020
|
+
gap: championHoldoutScore,
|
|
6021
|
+
reproducible: false,
|
|
6022
|
+
error: e instanceof Error ? e.message.slice(0, 300) : String(e)
|
|
6023
|
+
};
|
|
6024
|
+
}
|
|
6025
|
+
}
|
|
6026
|
+
return {
|
|
6027
|
+
gen0,
|
|
6028
|
+
gen0Champion,
|
|
6029
|
+
generations: generationRows,
|
|
6030
|
+
archive,
|
|
6031
|
+
finalChampion: incumbent,
|
|
6032
|
+
holdout,
|
|
6033
|
+
verdict,
|
|
6034
|
+
...bandInfo ? { band: bandInfo } : {},
|
|
6035
|
+
...reproduction ? { reproduction } : {},
|
|
6036
|
+
trajectory
|
|
6037
|
+
};
|
|
6038
|
+
}
|
|
6039
|
+
|
|
6040
|
+
// src/runtime/verifier-environment.ts
|
|
6041
|
+
/**
 * Function-calling schema for the built-in `submit_answer` tool that every
 * verifier environment exposes. It takes a single required string, `answer`.
 */
var submitTool = {
  type: "function",
  function: {
    name: "submit_answer",
    description: "Submit your answer for evaluation. You may submit more than once \u2014 the best-scoring submission counts. Submit the COMPLETE final answer, not a fragment.",
    parameters: {
      type: "object",
      properties: {
        answer: {
          type: "string",
          description: "The complete final answer."
        }
      },
      required: ["answer"]
    }
  }
};
|
|
6053
|
+
/**
 * Build an environment in which a worker submits answers through the
 * `submit_answer` tool and is scored by a caller-supplied checker.
 *
 * @param {object} opts
 * @param {string} opts.name - Environment name; also used as the handle id prefix and `surface`.
 * @param {Function} opts.check - `(task, answer) => score`, awaited once per recorded submission.
 *   The score is read as `{ passes, total }`; the whole object is returned when it wins.
 * @param {Array} [opts.extraTools] - Additional tool definitions offered next to `submit_answer`.
 * @param {Function} [opts.callExtra] - `(task, name, args) => result`, invoked for `extraTools` calls.
 * @returns {{name: string, open: Function, tools: Function, call: Function, score: Function, close: Function}}
 * @throws {Error} When `extraTools` is non-empty but `callExtra` is missing.
 */
function createVerifierEnvironment(opts) {
  if (opts.extraTools?.length && !opts.callExtra) {
    throw new Error(`${opts.name}: extraTools requires callExtra`);
  }
  // handle.id -> { task, submissions }; an entry lives from open() to close().
  const states = /* @__PURE__ */ new Map();
  let seq = 0;
  return {
    name: opts.name,
    async open(task) {
      seq += 1;
      const handle = { id: `${opts.name}-${seq}`, surface: opts.name };
      states.set(handle.id, { task, submissions: [] });
      return handle;
    },
    async tools() {
      return [submitTool, ...opts.extraTools ?? []];
    },
    async call(handle, name, args) {
      const state = states.get(handle.id);
      if (!state) return "ERROR: workspace closed";
      if (name === "submit_answer") {
        // `args` may be null/undefined (e.g. a tool call with no arguments); report
        // that as an empty answer instead of throwing, like every other failure here.
        const answer = String(args?.answer ?? "").trim();
        if (!answer) return "ERROR: empty answer";
        state.submissions.push(answer);
        return `submission ${state.submissions.length} recorded`;
      }
      if (opts.callExtra && opts.extraTools?.some((t) => t.function.name === name)) {
        try {
          return await opts.callExtra(state.task, name, args);
        } catch (e) {
          return `ERROR: ${e instanceof Error ? e.message : String(e)}`;
        }
      }
      return `ERROR: unknown tool ${name}`;
    },
    // Keep-best across submissions — the measured law (workers reach correct answers,
    // then revise past them; final-state scoring undersells every strategy).
    async score(task, handle) {
      const state = states.get(handle.id);
      if (!state || state.submissions.length === 0) return { passes: 0, total: 1, errored: 0 };
      // A submission replaces the baseline only when its pass ratio is strictly
      // higher, so a run whose submissions all score zero returns this baseline.
      let best = { passes: 0, total: 1, errored: 0 };
      const ratio = (s) => s.total > 0 ? s.passes / s.total : 0;
      for (const answer of state.submissions) {
        const s = await opts.check(task, answer);
        if (ratio(s) > ratio(best)) best = s;
      }
      return best;
    },
    async close(handle) {
      states.delete(handle.id);
    }
  };
}
|
|
6106
|
+
|
|
6107
|
+
// src/runtime/waterfall.ts
|
|
6108
|
+
/**
 * Create a collector that folds runtime events into per-child timing/cost spans
 * and can summarize or render them as a text waterfall.
 *
 * Events consumed by `hooks.onEvent`:
 *  - `target === "agent.spawn"` opens a span keyed by `payload.childId` (or `event.id`).
 *  - `target === "agent.child"` closes the span for `payload.childId`, recording
 *    status, spend (`payload.spent.usd` / `.tokens`) and an optional numeric score.
 *
 * @returns {{hooks: {onEvent: Function}, report: Function, render: Function, reset: Function}}
 *   `report()` returns `{ spans, totalMs, totalUsd, totalTokens, byKind }`;
 *   `render({ width = 48, maxRows = 60 })` returns the waterfall as a string;
 *   `reset()` discards every span collected so far.
 */
function createWaterfallCollector() {
  // Span id -> span record; replaced wholesale by reset().
  let spans = /* @__PURE__ */ new Map();
  const onEvent = (event) => {
    if (event.target === "agent.spawn") {
      const p = event.payload ?? {};
      const id = p.childId ?? event.id;
      spans.set(id, {
        id,
        label: p.label ?? id,
        runId: event.runId,
        ...event.parentId !== void 0 ? { parentId: event.parentId } : {},
        startMs: event.timestamp,
        status: "running",
        usd: 0,
        tokens: { input: 0, output: 0 }
      });
      return;
    }
    if (event.target === "agent.child") {
      const p = event.payload ?? {};
      const id = p.childId;
      if (!id) return;
      const span = spans.get(id);
      // A completion for a child whose spawn was never observed is ignored.
      if (!span) return;
      span.endMs = event.timestamp;
      span.status = p.status === "down" ? "down" : "done";
      span.usd = p.spent?.usd ?? 0;
      span.tokens = {
        input: p.spent?.tokens?.input ?? 0,
        output: p.spent?.tokens?.output ?? 0
      };
      if (typeof p.score === "number") span.score = p.score;
    }
  };
  const report = () => {
    const all = [...spans.values()].sort((a, b) => a.startMs - b.startMs);
    const start = all[0]?.startMs ?? 0;
    // Spans still running contribute their start time as their end.
    const end = Math.max(start, ...all.map((s) => s.endMs ?? s.startMs));
    // Aggregate in a Map so a kind such as "constructor" or "toString" cannot
    // resolve to an inherited Object.prototype member; converted to a plain
    // object (own properties only) on return.
    const kinds = /* @__PURE__ */ new Map();
    let totalUsd = 0;
    const totalTokens2 = { input: 0, output: 0 };
    for (const s of all) {
      totalUsd += s.usd;
      totalTokens2.input += s.tokens.input;
      totalTokens2.output += s.tokens.output;
      // Kind is the label text before the first ":" (the whole label if none).
      const kind = s.label.split(":")[0];
      let k = kinds.get(kind);
      if (!k) {
        k = { count: 0, ms: 0, usd: 0, tokens: { input: 0, output: 0 } };
        kinds.set(kind, k);
      }
      k.count += 1;
      k.ms += (s.endMs ?? s.startMs) - s.startMs;
      k.usd += s.usd;
      k.tokens.input += s.tokens.input;
      k.tokens.output += s.tokens.output;
    }
    return {
      spans: all,
      totalMs: end - start,
      totalUsd,
      totalTokens: totalTokens2,
      byKind: Object.fromEntries(kinds)
    };
  };
  const render = (opts) => {
    const { spans: all, totalMs, totalUsd, byKind } = report();
    if (all.length === 0) return "(no spans observed)";
    const width = opts?.width ?? 48;
    const maxRows = opts?.maxRows ?? 60;
    const start = all[0]?.startMs ?? 0;
    // Characters per millisecond; 0 when every span is instantaneous.
    const scale = totalMs > 0 ? width / totalMs : 0;
    const lines = [];
    const labelWidth = Math.min(24, Math.max(...all.map((s) => s.label.length)) + 1);
    for (const s of all.slice(0, maxRows)) {
      const offset = Math.round((s.startMs - start) * scale);
      const dur = (s.endMs ?? s.startMs) - s.startMs;
      const len = Math.max(1, Math.round(dur * scale));
      const bar = `${" ".repeat(Math.min(offset, width))}${(s.status === "down" ? "\u2591" : "\u2588").repeat(Math.max(1, Math.min(len, width - Math.min(offset, width) + 1)))}`;
      const mark = s.status === "down" ? " DOWN" : s.score !== void 0 ? ` ${(s.score * 100).toFixed(0)}%` : "";
      lines.push(
        `${s.label.padEnd(labelWidth)}|${bar.padEnd(width + 1)}| ${(dur / 1e3).toFixed(1)}s $${s.usd.toFixed(4)} ${s.tokens.input}/${s.tokens.output}tok${mark}`
      );
    }
    if (all.length > maxRows) lines.push(`\u2026 ${all.length - maxRows} more spans`);
    lines.push("\u2014".repeat(labelWidth + width + 2));
    for (const [kind, k] of Object.entries(byKind)) {
      lines.push(
        `${kind.padEnd(labelWidth)} \xD7${k.count} ${(k.ms / 1e3).toFixed(1)}s busy $${k.usd.toFixed(4)} ${k.tokens.input}/${k.tokens.output}tok`
      );
    }
    // padEnd rather than " ".repeat(labelWidth - 5): the label column can be
    // narrower than "TOTAL", and repeat() throws on a negative count.
    lines.push(
      `${"TOTAL".padEnd(labelWidth)} ${(totalMs / 1e3).toFixed(1)}s wall $${totalUsd.toFixed(4)}`
    );
    return lines.join("\n");
  };
  return {
    hooks: { onEvent },
    report,
    render,
    reset: () => {
      spans = /* @__PURE__ */ new Map();
    }
  };
}
|
|
6203
|
+
|
|
4192
6204
|
// src/runtime/workspace.ts
|
|
4193
6205
|
function localShell() {
|
|
4194
6206
|
return async (args, cwd) => {
|
|
@@ -4249,11 +6261,52 @@ function gitWorkspace(opts) {
|
|
|
4249
6261
|
}
|
|
4250
6262
|
};
|
|
4251
6263
|
}
|
|
6264
|
+
/**
 * Workspace adapter that drives a repository through the `jj` CLI.
 *
 * @param {object} opts
 * @param {string} opts.ref - Remote to clone from and to query for the branch head.
 * @param {string} [opts.branch="main"] - Branch that `commit` pushes and `head` resolves.
 * @param {Function} [opts.shell] - `(argv, cwd) => { code, stdout, stderr }`; defaults to `localShell()`.
 * @returns {{ref: string, materialize: Function, commit: Function, head: Function}}
 */
function jjWorkspace(opts) {
  const run = opts.shell ?? localShell();
  const targetBranch = opts.branch ?? "main";
  // Fixed author identity passed on every jj invocation.
  const identityFlags = [
    "--config-toml",
    'user.name="workspace"',
    "--config-toml",
    'user.email="workspace@tangle.local"'
  ];
  const jjArgv = (args) => ["jj", ...identityFlags, ...args];
  // Run a jj subcommand and return its stdout; a non-zero exit becomes an Error.
  const jjOrThrow = async (args, cwd) => {
    const result = await run(jjArgv(args), cwd);
    if (result.code === 0) return result.stdout;
    throw new Error(
      `jj ${args.join(" ")} failed (${result.code}): ${tail(result.stderr || result.stdout)}`
    );
  };
  return {
    ref: opts.ref,
    // Colocated clone: jj manages history, git holds the durable remote.
    async materialize(dir) {
      await jjOrThrow(["git", "clone", "--colocate", opts.ref, dir]);
    },
    // Describe the working-copy change, start a new one, then push the branch.
    // A failed push is returned as `{ ok: false, conflict }` rather than thrown.
    async commit(dir, message) {
      await jjOrThrow(["describe", "-m", message], dir);
      await jjOrThrow(["new"], dir);
      const pushed = await run(jjArgv(["git", "push", "--branch", targetBranch]), dir);
      if (pushed.code !== 0) {
        return { ok: false, conflict: tail(pushed.stderr || pushed.stdout) };
      }
      const commitId = await jjOrThrow(["log", "--no-graph", "-r", "@-", "-T", "commit_id"], dir);
      return { ok: true, rev: commitId.trim() };
    },
    // First whitespace-separated field of `git ls-remote` output for the branch.
    async head() {
      const listing = await run(["git", "ls-remote", opts.ref, `refs/heads/${targetBranch}`]);
      const firstField = listing.stdout.split(/\s+/)[0];
      return firstField ?? "";
    }
  };
}
|
|
4252
6301
|
/**
 * Return the trailing portion of `s`, capped at 400 characters, so command
 * output embedded in error messages stays short.
 */
function tail(s) {
  const keep = 400;
  return s.slice(-keep);
}
|
|
4255
6304
|
|
|
4256
6305
|
export {
|
|
6306
|
+
deleteBoxSafe,
|
|
6307
|
+
throwAbort,
|
|
6308
|
+
throwIfAborted,
|
|
6309
|
+
sleep,
|
|
4257
6310
|
contentAddress,
|
|
4258
6311
|
InMemoryResultBlobStore,
|
|
4259
6312
|
FileResultBlobStore,
|
|
@@ -4261,6 +6314,10 @@ export {
|
|
|
4261
6314
|
FileSpawnJournal,
|
|
4262
6315
|
replaySpawnTree,
|
|
4263
6316
|
materializeTreeView,
|
|
6317
|
+
anytimeReport,
|
|
6318
|
+
renderAnytimeTable,
|
|
6319
|
+
defaultAuditorInstruction,
|
|
6320
|
+
auditIntent,
|
|
4264
6321
|
completionAuthorizes,
|
|
4265
6322
|
stopSentinel,
|
|
4266
6323
|
sentinelCompletion,
|
|
@@ -4270,6 +6327,10 @@ export {
|
|
|
4270
6327
|
buildSteerContext,
|
|
4271
6328
|
createDriver,
|
|
4272
6329
|
renderAnalyses,
|
|
6330
|
+
defaultAnalystInstruction,
|
|
6331
|
+
observe,
|
|
6332
|
+
renderReport,
|
|
6333
|
+
harvestCorpus,
|
|
4273
6334
|
inlineSandboxClient,
|
|
4274
6335
|
reportLoopUsage,
|
|
4275
6336
|
defineRuntimeHooks,
|
|
@@ -4284,8 +6345,7 @@ export {
|
|
|
4284
6345
|
createSandboxForSpec,
|
|
4285
6346
|
defaultSelectWinner,
|
|
4286
6347
|
loopDispatch,
|
|
4287
|
-
|
|
4288
|
-
renderReport,
|
|
6348
|
+
createMcpEnvironment,
|
|
4289
6349
|
createScope,
|
|
4290
6350
|
settledToIteration,
|
|
4291
6351
|
pipeline,
|
|
@@ -4311,8 +6371,29 @@ export {
|
|
|
4311
6371
|
runPersonified,
|
|
4312
6372
|
trajectoryReport,
|
|
4313
6373
|
equalKOnCost,
|
|
6374
|
+
promotionGate,
|
|
6375
|
+
depthDriver,
|
|
6376
|
+
breadthDriver,
|
|
6377
|
+
sample,
|
|
6378
|
+
refine,
|
|
6379
|
+
defineStrategy,
|
|
6380
|
+
adaptiveRefine,
|
|
6381
|
+
sampleThenRefine,
|
|
6382
|
+
runAgentic,
|
|
6383
|
+
runBenchmark,
|
|
6384
|
+
printBenchmarkReport,
|
|
4314
6385
|
openSandboxRun,
|
|
6386
|
+
strategyAuthorContract,
|
|
6387
|
+
assertStrategyContract,
|
|
6388
|
+
authorStrategy,
|
|
6389
|
+
discriminatingMeans,
|
|
6390
|
+
pickChampion,
|
|
6391
|
+
selectChampion,
|
|
6392
|
+
runStrategyEvolution,
|
|
6393
|
+
createVerifierEnvironment,
|
|
6394
|
+
createWaterfallCollector,
|
|
4315
6395
|
localShell,
|
|
4316
|
-
gitWorkspace
|
|
6396
|
+
gitWorkspace,
|
|
6397
|
+
jjWorkspace
|
|
4317
6398
|
};
|
|
4318
|
-
//# sourceMappingURL=chunk-
|
|
6399
|
+
//# sourceMappingURL=chunk-PXUTIMGJ.js.map
|