@tangle-network/agent-runtime 0.46.0 → 0.48.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent.d.ts +1 -1
- package/dist/agent.js +1 -1
- package/dist/analyst-loop.d.ts +1 -1
- package/dist/{chunk-GN75RGM6.js → chunk-656G2XCL.js} +3 -3
- package/dist/{chunk-65FQLI4V.js → chunk-IW2LMLK6.js} +1714 -42
- package/dist/chunk-IW2LMLK6.js.map +1 -0
- package/dist/{chunk-I42NHLKX.js → chunk-LX66I3SC.js} +11 -6
- package/dist/chunk-LX66I3SC.js.map +1 -0
- package/dist/{chunk-KPN7OQ64.js → chunk-TJS7S3HJ.js} +2 -2
- package/dist/{chunk-KPN7OQ64.js.map → chunk-TJS7S3HJ.js.map} +1 -1
- package/dist/{coder-DCWFQpmJ.d.ts → coder-CVZNGbyg.d.ts} +1 -1
- package/dist/{driver-C-mtBo7h.d.ts → driver-DYU2sgHr.d.ts} +1 -1
- package/dist/index.d.ts +7 -7
- package/dist/index.js +3 -3
- package/dist/{kb-gate-2Gwpz_27.d.ts → kb-gate-51BlLlVM.d.ts} +8 -2
- package/dist/{loop-runner-bin-D-K6bRp3.d.ts → loop-runner-bin-DEm4roYF.d.ts} +4 -4
- package/dist/loop-runner-bin.d.ts +5 -5
- package/dist/loop-runner-bin.js +3 -3
- package/dist/loops.d.ts +5 -5
- package/dist/loops.js +55 -1
- package/dist/mcp/bin.js +3 -3
- package/dist/mcp/index.d.ts +71 -70
- package/dist/mcp/index.js +199 -27
- package/dist/mcp/index.js.map +1 -1
- package/dist/{otel-export-nurzFwuJ.d.ts → otel-export-EzfsVUhh.d.ts} +1 -1
- package/dist/profiles.d.ts +2 -2
- package/dist/{run-loop-CU2Y00Si.d.ts → run-loop-DvD4aGiE.d.ts} +1 -1
- package/dist/runtime.d.ts +915 -71
- package/dist/runtime.js +55 -1
- package/dist/{types-BfoeiQRZ.d.ts → types-BpDfCPUp.d.ts} +5 -5
- package/dist/{types-DnYoHvvZ.d.ts → types-nBMuollC.d.ts} +17 -0
- package/dist/workflow.d.ts +2 -2
- package/dist/workflow.js +1 -1
- package/package.json +25 -14
- package/skills/loop-writer/SKILL.md +163 -0
- package/dist/chunk-65FQLI4V.js.map +0 -1
- package/dist/chunk-I42NHLKX.js.map +0 -1
- /package/dist/{chunk-GN75RGM6.js.map → chunk-656G2XCL.js.map} +0 -0
|
@@ -426,6 +426,83 @@ function isNoEntError(err) {
|
|
|
426
426
|
return typeof err === "object" && err !== null && "code" in err && err.code === "ENOENT";
|
|
427
427
|
}
|
|
428
428
|
|
|
429
|
+
// src/runtime/audit-intent.ts
|
|
430
|
+
var defaultAuditorInstruction = "You audit whether an AI agent is on the RIGHT ROUTE \u2014 not whether it works hard, but whether its actions serve the stated intents. Infer the REVEALED intent from the action pattern (what the trajectory is actually optimizing). Compare against the declared task intent, the user intent when given, and the meta-intent when given. Flawless execution down the wrong route is DIVERGED. Busy-work that neither advances nor harms is DRIFTING. Judge only from the trajectory \u2014 be specific about which actions ground your verdict. Recommend abort only when continuing cannot serve the intent.";
|
|
431
|
+
/**
 * Render a chat-style trajectory as one compact line per message for the intent auditor.
 *
 * - `tool` messages become `RESULT <content, first 200 chars>`
 * - `assistant` messages become `CALL name(args, first 120 chars), ...` when they carry tool
 *   calls, otherwise `SAY <content, first 160 chars>`
 * - `user` messages become `USER <content, first 160 chars>`
 * - any other role (and null / non-object entries) is skipped
 *
 * @param {Iterable<object>} trace - messages in chronological order.
 * @param {number} maxLines - keep only the LAST `maxLines` rendered lines.
 * @returns {string} the kept lines joined by "\n"; "" when `maxLines` is zero or negative.
 */
function summarize(trace, maxLines) {
  // Math.trunc also catches fractions in (-1, 1): `slice(-0.5)` would otherwise behave like
  // `slice(0)` and return the WHOLE trajectory instead of nothing.
  if (Math.trunc(maxLines) <= 0) return "";

  // Message payloads are not guaranteed to be strings (null content on tool-calling turns,
  // structured content parts, pre-parsed argument objects), so serialize before slicing.
  const asText = (value) => {
    if (typeof value === "string") return value;
    if (value === null || value === undefined) return "";
    try {
      return JSON.stringify(value) ?? String(value);
    } catch {
      return String(value);
    }
  };

  const lines = [];
  for (const ev of trace) {
    const role = ev?.role;
    if (role === "tool") {
      lines.push(`RESULT ${asText(ev.content).slice(0, 200)}`);
    } else if (role === "assistant") {
      const calls = Array.isArray(ev.tool_calls)
        ? ev.tool_calls
            .map((c) => `${c?.function?.name}(${asText(c?.function?.arguments).slice(0, 120)})`)
            .join(", ")
        : "";
      lines.push(calls ? `CALL ${calls}` : `SAY ${asText(ev.content).slice(0, 160)}`);
    } else if (role === "user") {
      lines.push(`USER ${asText(ev.content).slice(0, 160)}`);
    }
  }
  // Keep the tail: the most recent actions are the ones the auditor is asked about.
  return lines.slice(-maxLines).join("\n");
}
|
|
444
|
+
// JSON schema handed to the chat client as `jsonSchema` for the auditor's structured reply.
// `steer` is the only property not listed in `required`.
var auditSchema = {
  name: "intent_audit",
  schema: {
    type: "object",
    additionalProperties: false,
    required: ["revealedIntent", "verdict", "evidence", "recommendation", "confidence"],
    properties: {
      revealedIntent: { type: "string" },
      verdict: { type: "string", enum: ["aligned", "drifting", "diverged"] },
      evidence: { type: "string" },
      recommendation: { type: "string", enum: ["continue", "steer", "abort"] },
      steer: { type: "string" },
      confidence: { type: "number" }
    }
  }
};
/**
 * Ask an auditor model whether a trajectory serves the stated intents.
 *
 * @param {object} input
 * @param {string} input.declaredIntent - the task as declared; always included in the prompt.
 * @param {string} [input.userIntent] - included in the prompt only when truthy.
 * @param {string} [input.metaIntent] - included in the prompt only when truthy.
 * @param {Iterable<object>} input.trace - messages passed to `summarize`.
 * @param {object} opts
 * @param {{ chat: Function }} opts.chat - client whose `chat(request, callOptions)` resolves to
 *   an object carrying the reply text in `content`.
 * @param {string} [opts.model] - forwarded as `model` only when truthy.
 * @param {string} [opts.auditorInstruction] - system prompt; defaults to `defaultAuditorInstruction`.
 * @param {number} [opts.maxTraceLines=80] - line budget passed to `summarize`.
 * @param {AbortSignal} [opts.signal] - forwarded in the call options only when truthy.
 * @returns {Promise<object>} `{ revealedIntent, verdict, evidence, recommendation, confidence }`
 *   plus `steer` when the auditor supplied a truthy one. Missing `revealedIntent` / `evidence`
 *   default to "", a non-numeric `confidence` defaults to 0.5.
 * @throws {Error} when the reply is not JSON, is not a JSON object, or lacks a verdict or
 *   recommendation.
 */
async function auditIntent(input, opts) {
  const prompt = [
    `DECLARED INTENT (the task):\n${input.declaredIntent}\n\n`,
    input.userIntent ? `USER INTENT (the principal's actual goal):\n${input.userIntent}\n\n` : "",
    input.metaIntent ? `META-INTENT (what the whole run is for):\n${input.metaIntent}\n\n` : "",
    `TRAJECTORY (in order):\n${summarize(input.trace, opts.maxTraceLines ?? 80)}\n\n`,
    "Audit the route: revealed intent, verdict, evidence, one recommendation."
  ].join("");
  const res = await opts.chat.chat(
    {
      ...opts.model ? { model: opts.model } : {},
      jsonSchema: auditSchema,
      messages: [
        { role: "system", content: opts.auditorInstruction ?? defaultAuditorInstruction },
        { role: "user", content: prompt }
      ]
    },
    { ...opts.signal ? { signal: opts.signal } : {} }
  );
  // A missing / non-string reply must surface as the descriptive "non-JSON" error below,
  // not as a TypeError raised while formatting that very error.
  const raw = typeof res?.content === "string" ? res.content : "";
  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch {
    throw new Error(`auditIntent: auditor returned non-JSON: ${raw.slice(0, 200)}`);
  }
  // `null`, numbers and strings are valid JSON but carry no verdict; guard before reading
  // properties so they are reported the same way as an incomplete object.
  const isObject = parsed !== null && typeof parsed === "object";
  if (!isObject || !parsed.verdict || !parsed.recommendation) {
    throw new Error(`auditIntent: missing verdict/recommendation: ${raw.slice(0, 200)}`);
  }
  return {
    revealedIntent: parsed.revealedIntent ?? "",
    verdict: parsed.verdict,
    evidence: parsed.evidence ?? "",
    recommendation: parsed.recommendation,
    ...parsed.steer ? { steer: parsed.steer } : {},
    confidence: typeof parsed.confidence === "number" ? parsed.confidence : 0.5
  };
}
|
|
505
|
+
|
|
429
506
|
// src/runtime/completion.ts
|
|
430
507
|
function completionAuthorizes(v, policy) {
|
|
431
508
|
if (!v?.done) return false;
|
|
@@ -674,8 +751,8 @@ function validateMove(move, maxFanout) {
|
|
|
674
751
|
);
|
|
675
752
|
}
|
|
676
753
|
}
|
|
677
|
-
async function runAnalyze(
|
|
678
|
-
const findings = await
|
|
754
|
+
async function runAnalyze(analyze2, task, history) {
|
|
755
|
+
const findings = await analyze2({ task, history });
|
|
679
756
|
if (!Array.isArray(findings)) {
|
|
680
757
|
throw new PlannerError(
|
|
681
758
|
`createDriver: analyze hook must return AnalystFinding[], got ${stringifySafe(findings)}`
|
|
@@ -703,6 +780,214 @@ function renderAnalyses(findings) {
|
|
|
703
780
|
${rows.join("\n")}`;
|
|
704
781
|
}
|
|
705
782
|
|
|
783
|
+
// src/runtime/observe.ts
|
|
784
|
+
import { makeFinding } from "@tangle-network/agent-eval";
|
|
785
|
+
var observerId = "observe/trace";
|
|
786
|
+
var defaultAnalystInstruction = "You are a third-person OBSERVER watching an AI agent work. You see its TRACE (what it did), not its grader. From the trace, name SPECIFIC, behavior-grounded findings: wasted/duplicated tool calls, thrash/retries, token/cost waste, missing verification, failure patterns. For each, a concrete recommended_action, and whether the AGENT (fix its skills/prompt/tools) or the OPERATOR (fix framing/decomposition/config) should act. Only claim what the trace shows. No findings if the run was clean.";
|
|
787
|
+
/**
 * Condense an event trace into short lines for the observer prompt.
 *
 * Per event, the first matching rule wins:
 * - `data.part.type === "tool"`        -> `tool:<part.tool>` plus `(<status>)` when a status is set
 * - event type contains "error"        -> `ERROR: <data.message ?? data.detail, first 200 chars>`
 * - type is "status" with a string     -> `status:<data.status>`
 * - event type contains "tool"         -> `tool-event:<type>`
 * Events matching none of these (and null / non-object entries) produce no line.
 * Consecutive identical lines are collapsed into `<line> xN`.
 *
 * @param {Iterable<object>} trace - events in order.
 * @param {number} maxLines - keep only the FIRST `maxLines` collapsed lines.
 * @returns {string} lines joined by "\n", or "(no tool/error events in trace)" when empty.
 */
function summarizeTrace(trace, maxLines) {
  // Repeats are counted on structured { text, count } runs. Re-parsing the rendered
  // "<line> xN" text would confuse a line that itself ends in " x5" with a counter and
  // could never match a multi-line error message.
  const runs = [];
  for (const ev of trace) {
    if (ev === null || typeof ev !== "object") continue;
    const type = String(ev.type ?? "").toLowerCase();
    const data = ev.data ?? {};
    const part = data.part ?? {};
    let line;
    if (part.type === "tool") {
      line = `tool:${part.tool}${part.state?.status ? `(${part.state.status})` : ""}`;
    } else if (type.includes("error")) {
      line = `ERROR: ${String(data.message ?? data.detail ?? "").slice(0, 200)}`;
    } else if (type === "status" && typeof data.status === "string") {
      line = `status:${data.status}`;
    } else if (type.includes("tool")) {
      line = `tool-event:${type}`;
    } else {
      continue;
    }
    const last = runs[runs.length - 1];
    if (last !== undefined && last.text === line) last.count += 1;
    else runs.push({ text: line, count: 1 });
  }
  const rendered = runs.map(({ text, count }) => (count > 1 ? `${text} x${count}` : text));
  return rendered.slice(0, maxLines).join("\n") || "(no tool/error events in trace)";
}
|
|
810
|
+
// JSON schema handed to the chat client as `jsonSchema` for the observer's structured reply:
// an object with a required `findings` array whose items require every listed property.
var findingsSchema = {
  name: "observer_findings",
  schema: {
    type: "object",
    additionalProperties: false,
    properties: {
      findings: {
        type: "array",
        items: {
          type: "object",
          additionalProperties: false,
          properties: {
            area: {
              type: "string",
              description: "tool-use | cost | verification | process | failure | latency"
            },
            severity: { type: "string", enum: ["critical", "high", "medium", "low", "info"] },
            claim: {
              type: "string",
              description: "what you OBSERVED in the trace (a fact, with the evidence)"
            },
            recommended_action: {
              type: "string",
              description: "the concrete change for the agent or operator"
            },
            audience: {
              type: "string",
              enum: ["agent", "operator"],
              description: "who should act on this"
            },
            confidence: { type: "number" }
          },
          required: ["area", "severity", "claim", "recommended_action", "audience", "confidence"]
        }
      }
    },
    required: ["findings"]
  }
};
/**
 * Have an observer model review one run's trace and turn its reply into findings.
 *
 * @param {object} input
 * @param {string} input.task - task text placed in the prompt.
 * @param {Iterable<object>} input.trace - events passed to `summarizeTrace`.
 * @param {string} [input.outcome] - rendered as "unknown" when null/undefined.
 * @param {string} [input.output] - final output; the first 1200 characters are shown.
 * @param {string} [input.runId] - when truthy, set as each finding's `subject` and used as
 *   the corpus record `runId`.
 * @param {object} opts
 * @param {{ chat: Function }} opts.chat - client whose `chat(request, callOptions)` resolves
 *   to an object carrying the reply text in `content`.
 * @param {string} [opts.model] - forwarded as `model` only when truthy.
 * @param {string} [opts.analystInstruction] - system prompt; defaults to `defaultAnalystInstruction`.
 * @param {number} [opts.maxTraceLines=80] - line budget passed to `summarizeTrace`.
 * @param {{ append: Function }} [opts.corpus] - when set, one record per finding is appended
 *   (sequentially); records whose append result has a truthy `succeeded` are returned in `learned`.
 * @param {string[]} [opts.tags] - tags copied onto every corpus record.
 * @param {AbortSignal} [opts.signal] - forwarded in the call options only when truthy.
 * @returns {Promise<{findings: object[], learned: object[], report: string}>}
 */
async function observe(input, opts) {
  const traceSummary = summarizeTrace(input.trace, opts.maxTraceLines ?? 80);
  // A run may have produced no final output; coerce so `.slice` cannot throw.
  const finalOutput = typeof input.output === "string" ? input.output : String(input.output ?? "");
  const userPrompt = [
    `TASK: ${input.task}`,
    "",
    `OUTCOME: ${input.outcome ?? "unknown"}`,
    "",
    "FINAL OUTPUT (truncated):",
    finalOutput.slice(0, 1200),
    "",
    'TRACE (in order; "xN" = repeated):',
    traceSummary
  ].join("\n");
  const res = await opts.chat.chat(
    {
      ...opts.model ? { model: opts.model } : {},
      jsonSchema: findingsSchema,
      messages: [
        { role: "system", content: opts.analystInstruction ?? defaultAnalystInstruction },
        { role: "user", content: userPrompt }
      ]
    },
    { ...opts.signal ? { signal: opts.signal } : {} }
  );
  // The model's array is not validated upstream; drop entries that are not objects so a
  // stray `null` cannot abort the whole observation.
  const parsed = parseFindings(res.content).filter((f) => f !== null && typeof f === "object");
  // NOTE(review): this fallback for a record's `producedAt` is a run id / observer id, not a
  // timestamp — confirm that is what the corpus expects.
  const producedAt = input.runId ? `${input.runId}` : observerId;
  const findings = parsed.map(
    (f) => makeFinding({
      analyst_id: observerId,
      area: `${f.area}`,
      severity: f.severity,
      claim: f.claim,
      recommended_action: f.recommended_action,
      confidence: typeof f.confidence === "number" ? f.confidence : 0.5,
      evidence_refs: [],
      // The observer reads BEHAVIOR, never the judge verdict — firewall provenance.
      derived_from_judge: false,
      metadata: { audience: f.audience },
      ...input.runId ? { subject: input.runId } : {}
    })
  );
  const learned = [];
  if (opts.corpus) {
    for (const f of findings) {
      const record = {
        schemaVersion: "1.0.0",
        id: f.finding_id,
        runId: input.runId ?? observerId,
        producedAt: f.produced_at ?? producedAt,
        area: f.area,
        // The corpus entry's claim is the recommended action; the observed fact is kept as
        // its rationale.
        claim: f.recommended_action ?? f.claim,
        ...f.claim ? { rationale: f.claim } : {},
        tags: [...opts.tags ?? [], `audience:${f.metadata?.audience ?? "agent"}`],
        confidence: f.confidence,
        evidence: [{ kind: "finding", uri: f.finding_id }]
      };
      const r = await opts.corpus.append(record);
      if (r.succeeded) learned.push(record);
    }
  }
  return { findings, learned, report: renderReport(findings) };
}
|
|
914
|
+
/**
 * Extract the `findings` array from the observer's reply.
 *
 * The reply is parsed as JSON; when that fails, the span from the first "{" to the last "}"
 * is parsed instead (covers replies wrapped in prose or a code fence).
 *
 * @param {string} content - raw reply text.
 * @returns {Array} the `findings` array, or [] when the reply has no brace-delimited span,
 *   is not an object, or carries no array under `findings`.
 * @throws {SyntaxError} when a brace-delimited span exists but is not valid JSON.
 */
function parseFindings(content) {
  // A missing / non-string reply is treated as "no findings" rather than a TypeError.
  const text = typeof content === "string" ? content : "";
  let obj;
  try {
    obj = JSON.parse(text);
  } catch {
    const embedded = text.match(/\{[\s\S]*\}/);
    obj = embedded ? JSON.parse(embedded[0]) : { findings: [] };
  }
  // `JSON.parse("null")` succeeds, so the parsed value may be null; read it defensively.
  const arr = obj?.findings;
  return Array.isArray(arr) ? arr : [];
}
|
|
925
|
+
/**
 * Render findings as a two-section markdown report: one section for the agent, one for the
 * operator. Each finding prints as `- [severity] claim` followed by a `→ recommended_action`
 * line (empty action when none is set). Empty sections are omitted.
 *
 * @param {object[]} findings - findings carrying `severity`, `claim`, optional
 *   `recommended_action` and optional `metadata.audience`.
 * @returns {string} the report, or a fixed "clean run" line when there are no findings.
 */
function renderReport(findings) {
  if (findings.length === 0) return "\u2713 clean run \u2014 the observer found nothing to change.";
  // Only an explicit "operator" audience routes to the operator section. Everything else —
  // missing OR unrecognised — lands with the agent, so no finding silently drops out of the
  // report.
  const isForOperator = (f) => f.metadata?.audience === "operator";
  const forAgent = findings.filter((f) => !isForOperator(f));
  const forOperator = findings.filter(isForOperator);
  const renderItem = (f) => `- [${f.severity}] ${f.claim}\n\u2192 ${f.recommended_action ?? ""}`;
  const renderSection = (title, items) =>
    items.length === 0 ? "" : `**${title}**\n${items.map(renderItem).join("\n")}\n`;
  return [
    renderSection("For the agent (fix skills / prompt / tools)", forAgent),
    renderSection("For you (the operator)", forOperator)
  ].filter(Boolean).join("\n");
}
|
|
939
|
+
|
|
940
|
+
// src/runtime/harvest-corpus.ts
|
|
941
|
+
/**
 * Run `observe` over many runs with bounded concurrency and tally the results.
 *
 * @param {object} opts
 * @param {Iterable<object>|AsyncIterable<object>} opts.runs - observe inputs.
 * @param {object} opts.chat - forwarded to `observe`.
 * @param {object} opts.corpus - forwarded to `observe`.
 * @param {string} [opts.model] - forwarded to `observe` only when truthy.
 * @param {string[]} [opts.tags] - forwarded to `observe` (defaults to []).
 * @param {string} [opts.analystInstruction] - forwarded to `observe` only when truthy.
 * @param {AbortSignal} [opts.signal] - forwarded to `observe`; a worker stops once it is aborted.
 * @param {number} [opts.concurrency=4] - number of parallel workers (at least 1).
 * @param {number} [opts.maxRuns] - hard cap on how many runs are pulled from `opts.runs`.
 * @returns {Promise<{runsObserved: number, findings: number, learned: number,
 *   failures: Array<{runId: string, error: string}>}>}
 * @throws {Error} when no run was observed successfully and at least one failed.
 */
async function harvestCorpus(opts) {
  // A non-finite concurrency (e.g. NaN) would create zero workers and return an empty report
  // without looking at a single run; fall back to the default instead.
  const wanted = Number(opts.concurrency ?? 4);
  const concurrency = Number.isFinite(wanted) ? Math.max(1, Math.floor(wanted)) : 4;
  const report = { runsObserved: 0, findings: 0, learned: 0, failures: [] };
  const iterator = Symbol.asyncIterator in Object(opts.runs) ? opts.runs[Symbol.asyncIterator]() : (async function* () {
    yield* opts.runs;
  })();
  let claimed = 0;
  let done = false;
  const next = async () => {
    if (done || opts.maxRuns !== void 0 && claimed >= opts.maxRuns) return null;
    // Reserve the slot BEFORE awaiting. Workers call next() back-to-back, so counting only
    // after the await lets every worker pass the cap check and overshoot `maxRuns`.
    claimed += 1;
    const ordinal = claimed;
    const r = await iterator.next();
    if (r.done) {
      done = true;
      return null;
    }
    // The ordinal travels with the item so a failure is labelled with the position of the
    // run that failed, not with whatever the shared counter holds by then.
    return { input: r.value, ordinal };
  };
  const worker = async () => {
    for (let slot = await next(); slot !== null; slot = await next()) {
      if (opts.signal?.aborted) return;
      const { input, ordinal } = slot;
      try {
        const obs = await observe(input, {
          chat: opts.chat,
          ...opts.model ? { model: opts.model } : {},
          corpus: opts.corpus,
          tags: opts.tags ?? [],
          ...opts.analystInstruction ? { analystInstruction: opts.analystInstruction } : {},
          ...opts.signal ? { signal: opts.signal } : {}
        });
        report.runsObserved += 1;
        report.findings += obs.findings.length;
        report.learned += obs.learned.length;
      } catch (e) {
        // One bad run must not stop the harvest; record it and move on.
        report.failures.push({
          runId: input?.runId ?? `run-${ordinal}`,
          error: e instanceof Error ? e.message.slice(0, 300) : String(e)
        });
      }
    }
  };
  await Promise.all(Array.from({ length: concurrency }, () => worker()));
  if (report.runsObserved === 0 && report.failures.length > 0) {
    throw new Error(
      `harvestCorpus: every run failed analysis (${report.failures.length}) \u2014 first: ${report.failures[0]?.error}`
    );
  }
  return report;
}
|
|
990
|
+
|
|
706
991
|
// src/runtime/inline-sandbox-client.ts
|
|
707
992
|
function isAsyncIterable(v) {
|
|
708
993
|
return typeof v === "object" && v !== null && Symbol.asyncIterator in v;
|
|
@@ -1024,6 +1309,7 @@ function createSandboxLineage(client, capabilities, options = {}) {
|
|
|
1024
1309
|
if (signal.aborted) throwAbort();
|
|
1025
1310
|
const opts = buildBackendOptions(spec.profile, spec.sandboxOverrides);
|
|
1026
1311
|
const box = await acquireSandbox(client, opts, { signal });
|
|
1312
|
+
await spec.prepareBox?.(box, { signal });
|
|
1027
1313
|
owned.push(box);
|
|
1028
1314
|
return box;
|
|
1029
1315
|
};
|
|
@@ -1052,6 +1338,7 @@ function createSandboxLineage(client, capabilities, options = {}) {
|
|
|
1052
1338
|
if (checkpointId !== void 0) {
|
|
1053
1339
|
const box2 = await forkFromCheckpoint(parent.box, checkpointId, signal);
|
|
1054
1340
|
owned.push(box2);
|
|
1341
|
+
await spec.prepareBox?.(box2, { signal });
|
|
1055
1342
|
const sessionId2 = mintSessionId();
|
|
1056
1343
|
return {
|
|
1057
1344
|
handle: { box: box2, sessionId: sessionId2 },
|
|
@@ -1475,6 +1762,7 @@ async function executeIteration(args) {
|
|
|
1475
1762
|
if (args.validator) {
|
|
1476
1763
|
slot.verdict = await args.validator.validate(slot.output, {
|
|
1477
1764
|
iteration: args.item.index,
|
|
1765
|
+
...box ? { box } : {},
|
|
1478
1766
|
signal: args.signal,
|
|
1479
1767
|
traceEmitter: args.ctx.traceEmitter
|
|
1480
1768
|
});
|
|
@@ -1571,7 +1859,9 @@ function readSandboxId(box) {
|
|
|
1571
1859
|
async function createSandboxForSpec(client, spec, signal) {
|
|
1572
1860
|
const opts = buildBackendOptions(spec.profile, spec.sandboxOverrides);
|
|
1573
1861
|
if (signal.aborted) throwAbort();
|
|
1574
|
-
|
|
1862
|
+
const box = await acquireSandbox(client, opts, { signal });
|
|
1863
|
+
await spec.prepareBox?.(box, { signal });
|
|
1864
|
+
return box;
|
|
1575
1865
|
}
|
|
1576
1866
|
function finalize(args) {
|
|
1577
1867
|
const winner = args.options.selectWinner ? args.options.selectWinner(args.iterations) : args.options.driver.selectWinner?.(args.iterations) ?? defaultSelectWinner(args.iterations);
|
|
@@ -1646,8 +1936,8 @@ function defaultSelectWinner(iterations) {
|
|
|
1646
1936
|
const candidates = iterations.filter((iter) => iter.output !== void 0 && !iter.error);
|
|
1647
1937
|
if (candidates.length === 0) return void 0;
|
|
1648
1938
|
const valid = candidates.filter((iter) => iter.verdict?.valid === true);
|
|
1649
|
-
const
|
|
1650
|
-
const sorted = [...
|
|
1939
|
+
const pool2 = valid.length > 0 ? valid : candidates;
|
|
1940
|
+
const sorted = [...pool2].sort(
|
|
1651
1941
|
(a, b) => (b.verdict?.score ?? 0) - (a.verdict?.score ?? 0) || a.index - b.index
|
|
1652
1942
|
);
|
|
1653
1943
|
const top = sorted[0];
|
|
@@ -1732,6 +2022,98 @@ function loopDispatch(opts) {
|
|
|
1732
2022
|
return (profile, scenario, ctx) => runLoopForCell(opts, scenario, profile, ctx);
|
|
1733
2023
|
}
|
|
1734
2024
|
|
|
2025
|
+
// src/runtime/mcp-environment.ts
|
|
2026
|
+
/**
 * POST a JSON-RPC body to an MCP endpoint and decode the reply.
 *
 * The reply body is read as text. When it contains `data:` lines (event-stream framing) the
 * LAST such line is taken as the payload, otherwise the whole body is. The payload is parsed
 * as JSON; if that fails the raw body text is returned in `json` instead.
 *
 * Only failures of the request itself (a rejected `fetch` / body read) are retried; an HTTP
 * error status is returned to the caller as-is.
 *
 * @param {{url: string, headers?: object}} endpoint - target URL plus optional extra headers.
 * @param {object} body - request body, serialized with JSON.stringify.
 * @param {object} [options]
 * @param {number} [options.attempts=4] - total tries (at least 1).
 * @param {number} [options.baseDelayMs=1000] - wait before retry k is `baseDelayMs * k`.
 * @returns {Promise<{status: number, json: *}>}
 * @throws {Error} after the last attempt fails, carrying the last failure's message.
 */
async function rpc(endpoint, body, { attempts = 4, baseDelayMs = 1e3 } = {}) {
  const tries = Math.max(1, attempts);
  let lastErr;
  for (let attempt = 0; attempt < tries; attempt += 1) {
    try {
      const r = await fetch(endpoint.url, {
        method: "POST",
        headers: { "content-type": "application/json", ...endpoint.headers ?? {} },
        body: JSON.stringify(body)
      });
      const text = await r.text();
      const dataLines = text.split("\n").filter((l) => l.startsWith("data:")).map((l) => l.slice(5).trim());
      const payload = dataLines.length ? dataLines[dataLines.length - 1] : text;
      try {
        return { status: r.status, json: JSON.parse(payload) };
      } catch {
        // Not JSON (e.g. a plain-text gateway error): hand the raw body back.
        return { status: r.status, json: text };
      }
    } catch (err) {
      lastErr = err;
      // Back off only when another attempt follows; sleeping after the final failure just
      // delays the error.
      if (attempt < tries - 1) {
        await new Promise((res) => setTimeout(res, baseDelayMs * (attempt + 1)));
      }
    }
  }
  throw new Error(
    `mcp rpc ${endpoint.url} failed after ${tries} attempts: ${lastErr instanceof Error ? lastErr.message : String(lastErr)}`
  );
}
|
|
2052
|
+
/**
 * Reduce a tool's input schema to a plain object schema.
 *
 * A schema is kept (as `type`, `properties` and, when it is an array, `required`) only if it
 * is an object schema with a `properties` map and none of `oneOf` / `anyOf` / `allOf` /
 * `not` / `enum` at the top level. Every other key is dropped. Anything else collapses to an
 * empty object schema. Nested property schemas are passed through untouched.
 *
 * @param {*} s - candidate schema (any value).
 * @returns {{type: "object", properties: object, required?: Array}}
 */
function sanitizeSchema(s) {
  const empty = { type: "object", properties: {} };
  if (s === null || typeof s !== "object") return empty;
  const usesCombinators = s.oneOf || s.anyOf || s.allOf || s.not || s.enum;
  if (s.type !== "object" || usesCombinators) return empty;
  // Arrays are `typeof "object"` too, but an array is not a property map.
  const props = s.properties;
  if (props === null || typeof props !== "object" || Array.isArray(props)) return empty;
  return {
    type: "object",
    properties: props,
    ...Array.isArray(s.required) ? { required: s.required } : {}
  };
}
|
|
2064
|
+
/**
 * Build an environment whose tools are served by an MCP endpoint over JSON-RPC.
 *
 * @param {object} opts
 * @param {string} opts.name - environment name; also prefixes the `tools()` ordering error.
 * @param {Function} opts.open - `(task) => Promise<{ handle, endpoint }>`; the endpoint is
 *   remembered under `handle.id` until `close(handle)`.
 * @param {Function} opts.score - `(task, handle)`; called as-is by `score`.
 * @param {Function} [opts.close] - `(handle)`; awaited by `close` after the endpoint is forgotten.
 * @param {Function} [opts.selectTools] - `(task, allTools)`; when set, its result is what
 *   `tools()` returns.
 * @param {number} [opts.maxResultChars=1500] - cap on the text returned by `call()`.
 * @returns {object} `{ name, open, tools, call, score, close }`
 */
function createMcpEnvironment(opts) {
  const endpoints = /* @__PURE__ */ new Map();
  const maxChars = opts.maxResultChars ?? 1500;
  return {
    name: opts.name,
    async open(task) {
      const { handle, endpoint } = await opts.open(task);
      endpoints.set(handle.id, endpoint);
      return handle;
    },
    /** List the endpoint's tools as function-tool definitions. Throws if the handle is not open. */
    async tools(task, handle) {
      const endpoint = endpoints.get(handle.id);
      if (!endpoint) throw new Error(`${opts.name}: tools() before open() for ${handle.id}`);
      const { json } = await rpc(endpoint, {
        jsonrpc: "2.0",
        id: 1,
        method: "tools/list",
        params: {}
      });
      // `rpc` can hand back null or raw text for a non-JSON-RPC body; treat anything without
      // a tool array as "no tools" instead of throwing on a property read.
      const listed = json?.result?.tools;
      const all = (Array.isArray(listed) ? listed : []).map(
        (t) => ({
          type: "function",
          function: {
            name: t.name,
            description: (typeof t.description === "string" ? t.description : "").slice(0, 1e3),
            parameters: sanitizeSchema(t.inputSchema)
          }
        })
      );
      return opts.selectTools ? opts.selectTools(task, all) : all;
    },
    /**
     * Invoke one tool and return its result as text capped at `maxResultChars`.
     * Failures are returned as "ERROR: ..." strings rather than thrown.
     */
    async call(handle, name, args) {
      const endpoint = endpoints.get(handle.id);
      if (!endpoint) return "ERROR: workspace closed";
      const { json } = await rpc(endpoint, {
        jsonrpc: "2.0",
        id: 2,
        method: "tools/call",
        params: { name, arguments: args }
      });
      const result = json ?? {};
      if (result.error) return `ERROR: ${JSON.stringify(result.error).slice(0, 300)}`;
      // Join the text of the content parts when there are any; otherwise (no content, or
      // content that is not an array) fall back to the serialized result.
      const parts = result.result?.content;
      const text = Array.isArray(parts)
        ? parts.map((c) => c?.text ?? "").join("\n")
        : JSON.stringify(result.result ?? json);
      return text.slice(0, maxChars);
    },
    score: (task, handle) => opts.score(task, handle),
    async close(handle) {
      endpoints.delete(handle.id);
      await opts.close?.(handle);
    }
  };
}
|
|
2116
|
+
|
|
1735
2117
|
// src/runtime/supervise/scope.ts
|
|
1736
2118
|
function createScope(args) {
|
|
1737
2119
|
const children = /* @__PURE__ */ new Map();
|
|
@@ -1962,22 +2344,22 @@ async function finalizeSettlement(child, settlement, seq, args, now) {
|
|
|
1962
2344
|
seq
|
|
1963
2345
|
};
|
|
1964
2346
|
}
|
|
1965
|
-
async function runChild(live, executor, childAbort, task, opts,
|
|
2347
|
+
async function runChild(live, executor, childAbort, task, opts, pool2, ticket, blobs) {
|
|
1966
2348
|
let reconciled = false;
|
|
1967
|
-
const reconcileOnce = (
|
|
2349
|
+
const reconcileOnce = (spend2) => {
|
|
1968
2350
|
if (reconciled) return;
|
|
1969
2351
|
reconciled = true;
|
|
1970
|
-
|
|
2352
|
+
pool2.reconcile(ticket, clampSpend(spend2, opts.budget));
|
|
1971
2353
|
};
|
|
1972
2354
|
try {
|
|
1973
2355
|
live.status = "running";
|
|
1974
2356
|
const ran = executor.execute(task, childAbort.signal);
|
|
1975
2357
|
let artifact;
|
|
1976
2358
|
if (isAsyncIterable2(ran)) {
|
|
1977
|
-
const
|
|
1978
|
-
live.spent =
|
|
2359
|
+
const spend2 = await foldStream(ran);
|
|
2360
|
+
live.spent = spend2;
|
|
1979
2361
|
artifact = executor.resultArtifact();
|
|
1980
|
-
reconcileOnce(
|
|
2362
|
+
reconcileOnce(spend2);
|
|
1981
2363
|
} else {
|
|
1982
2364
|
const terminal = await ran;
|
|
1983
2365
|
live.spent = terminal.spent;
|
|
@@ -2066,21 +2448,21 @@ async function foldStream(stream) {
|
|
|
2066
2448
|
}
|
|
2067
2449
|
return { iterations, tokens, usd, ms: 0 };
|
|
2068
2450
|
}
|
|
2069
|
-
function clampSpend(
|
|
2070
|
-
const totalTokens2 =
|
|
2451
|
+
function clampSpend(spend2, budget) {
|
|
2452
|
+
const totalTokens2 = spend2.tokens.input + spend2.tokens.output;
|
|
2071
2453
|
const tokensOk = totalTokens2 <= budget.maxTokens;
|
|
2072
|
-
const itersOk =
|
|
2073
|
-
const usdOk = budget.maxUsd === void 0 ||
|
|
2074
|
-
if (tokensOk && itersOk && usdOk) return
|
|
2454
|
+
const itersOk = spend2.iterations <= budget.maxIterations;
|
|
2455
|
+
const usdOk = budget.maxUsd === void 0 || spend2.usd <= budget.maxUsd;
|
|
2456
|
+
if (tokensOk && itersOk && usdOk) return spend2;
|
|
2075
2457
|
const ratio = !tokensOk && totalTokens2 > 0 ? budget.maxTokens / totalTokens2 : 1;
|
|
2076
2458
|
return {
|
|
2077
|
-
iterations: Math.min(
|
|
2459
|
+
iterations: Math.min(spend2.iterations, budget.maxIterations),
|
|
2078
2460
|
tokens: ratio < 1 ? {
|
|
2079
|
-
input: Math.floor(
|
|
2080
|
-
output: Math.floor(
|
|
2081
|
-
} :
|
|
2082
|
-
usd: budget.maxUsd === void 0 ?
|
|
2083
|
-
ms:
|
|
2461
|
+
input: Math.floor(spend2.tokens.input * ratio),
|
|
2462
|
+
output: Math.floor(spend2.tokens.output * ratio)
|
|
2463
|
+
} : spend2.tokens,
|
|
2464
|
+
usd: budget.maxUsd === void 0 ? spend2.usd : Math.min(spend2.usd, budget.maxUsd),
|
|
2465
|
+
ms: spend2.ms
|
|
2084
2466
|
};
|
|
2085
2467
|
}
|
|
2086
2468
|
async function teardownSafe(executor, grace) {
|
|
@@ -2714,6 +3096,118 @@ var routerInlineExecutor = (spec, ctx) => {
|
|
|
2714
3096
|
}
|
|
2715
3097
|
};
|
|
2716
3098
|
};
|
|
3099
|
+
var routerToolsSeamKey = "router-tools";
|
|
3100
|
+
/**
 * Executor that drives a tool-calling chat loop against a router's `/chat/completions`
 * endpoint.
 *
 * Each turn posts the running message list together with `seam.tools`. If the reply carries
 * tool calls, every call is executed through `seam.executeToolCall(name, args, task)` and its
 * result appended as a `tool` message before the next turn; a reply without tool calls ends
 * the loop. The loop also ends after `seam.maxTurns` turns (default 4). The artifact's
 * content is the last non-empty assistant text seen.
 *
 * @param {object} spec - agent spec; `spec.profile.model.default` is the model fallback.
 * @param {object} ctx - executor context; provides `signal` and the "router-tools" seam.
 * @returns {object} `{ runtime, execute, teardown, resultArtifact }`
 * @throws {ValidationError} when no model is configured or the seam lacks
 *   `routerBaseUrl` / `routerKey`.
 */
var routerToolsInlineExecutor = (spec, ctx) => {
  const seam = readSeam(ctx, routerToolsSeamKey, "router-tools");
  const model = seam.model ?? spec.profile.model?.default;
  if (!model) {
    throw new ValidationError(
      "routerToolsInlineExecutor: no model \u2014 set RouterToolsSeam.model or AgentProfile.model.default"
    );
  }
  if (!seam.routerBaseUrl || !seam.routerKey) {
    throw new ValidationError(
      "routerToolsInlineExecutor: RouterToolsSeam.routerBaseUrl + routerKey required"
    );
  }
  const maxTurns = seam.maxTurns ?? 4;
  // Built once: the endpoint does not change between turns.
  const completionsUrl = `${seam.routerBaseUrl.replace(/\/$/, "")}/chat/completions`;
  const controller = new AbortController();
  const abortIfSignalled = () => {
    if (ctx.signal.aborted) controller.abort();
  };
  abortIfSignalled();
  if (!ctx.signal.aborted) ctx.signal.addEventListener("abort", abortIfSignalled, { once: true });
  let artifact;
  return {
    runtime: "router",
    /**
     * Run the loop for `task`.
     * @throws {ValidationError} when the router answers with a non-OK status.
     */
    async execute(task, signal) {
      const started = Date.now();
      const linked = linkSignals(signal, controller.signal);
      const messages = [...taskToMessages(task, spec)];
      const tokens = zeroTokenUsage();
      let turns = 0;
      let lastText = "";
      while (turns < maxTurns) {
        turns += 1;
        const res = await fetch(completionsUrl, {
          method: "POST",
          headers: {
            "content-type": "application/json",
            authorization: `Bearer ${seam.routerKey}`
          },
          body: JSON.stringify({
            model,
            messages,
            tools: seam.tools,
            tool_choice: "auto",
            temperature: 0.2
          }),
          ...linked ? { signal: linked } : {}
        });
        if (!res.ok) {
          throw new ValidationError(
            `routerToolsInlineExecutor: router ${res.status}: ${(await res.text()).slice(0, 200)}`
          );
        }
        const data = await res.json();
        const u = data.usage;
        // Usage is counted only when both counters are numeric.
        if (u && typeof u.prompt_tokens === "number" && typeof u.completion_tokens === "number") {
          tokens.input += u.prompt_tokens;
          tokens.output += u.completion_tokens;
        }
        const msg = data.choices?.[0]?.message;
        if (msg?.content) lastText = msg.content;
        // Normalize once so the assistant echo and the tool replies are guaranteed to use the
        // same (possibly synthesized) call ids.
        const calls = (msg?.tool_calls ?? []).map((tc, i) => ({
          id: tc?.id ?? `call_${i}`,
          name: tc?.function?.name ?? "",
          rawArgs: tc?.function?.arguments ?? "{}"
        }));
        if (calls.length === 0) break;
        messages.push({
          role: "assistant",
          content: msg?.content ?? "",
          tool_calls: calls.map((c) => ({
            id: c.id,
            type: "function",
            function: { name: c.name, arguments: c.rawArgs }
          }))
        });
        for (const call of calls) {
          let args;
          try {
            args = JSON.parse(call.rawArgs);
          } catch {
            // Report the malformed arguments back to the model instead of failing the run.
            messages.push({
              role: "tool",
              tool_call_id: call.id,
              content: "error: tool arguments were not valid JSON"
            });
            continue;
          }
          const result = await seam.executeToolCall(call.name, args, task);
          // The content of a tool message is sent as text; serialize structured results
          // rather than posting a raw object.
          const content = typeof result === "string" ? result : (JSON.stringify(result) ?? String(result));
          messages.push({ role: "tool", tool_call_id: call.id, content });
        }
      }
      const usd = isModelPriced(model) ? estimateCost(tokens.input, tokens.output, model) : 0;
      const spent = { iterations: turns, tokens, usd, ms: Date.now() - started };
      const out = { content: lastText };
      artifact = { outRef: contentRef("router-tools", { model, content: lastText }), out, spent };
      return artifact;
    },
    /** Abort any in-flight request and stop listening to the context signal. */
    teardown(_grace) {
      ctx.signal.removeEventListener("abort", abortIfSignalled);
      controller.abort();
      return Promise.resolve({ destroyed: true });
    },
    /**
     * Return a shallow copy of the last artifact.
     * @throws {ValidationError} when `execute()` has not completed yet.
     */
    resultArtifact() {
      if (!artifact) {
        throw new ValidationError(
          "routerToolsInlineExecutor: resultArtifact() read before execute()"
        );
      }
      return { ...artifact };
    }
  };
};
|
|
2717
3211
|
var sandboxExecutor = (spec, ctx) => {
|
|
2718
3212
|
if (spec.harness === null) {
|
|
2719
3213
|
throw new ValidationError("sandboxExecutor: harness is null (router/inline) \u2014 wrong executor");
|
|
@@ -3007,6 +3501,8 @@ function createExecutor(config) {
|
|
|
3007
3501
|
switch (config.backend) {
|
|
3008
3502
|
case "router":
|
|
3009
3503
|
return routerInlineExecutor(spec, seamed);
|
|
3504
|
+
case "router-tools":
|
|
3505
|
+
return routerToolsInlineExecutor(spec, seamed);
|
|
3010
3506
|
case "bridge":
|
|
3011
3507
|
return bridgeExecutor(spec, seamed);
|
|
3012
3508
|
case "cli":
|
|
@@ -3240,7 +3736,7 @@ function createSupervisor() {
|
|
|
3240
3736
|
let attached;
|
|
3241
3737
|
async function run(root, task, opts) {
|
|
3242
3738
|
const now = opts.now ?? Date.now;
|
|
3243
|
-
const
|
|
3739
|
+
const pool2 = createBudgetPool(opts.budget, now);
|
|
3244
3740
|
await opts.journal.beginTree(opts.runId, new Date(now()).toISOString());
|
|
3245
3741
|
await opts.journal.appendEvent(opts.runId, {
|
|
3246
3742
|
kind: "spawned",
|
|
@@ -3266,7 +3762,7 @@ function createSupervisor() {
|
|
|
3266
3762
|
const scope = createScope({
|
|
3267
3763
|
parentId: opts.runId,
|
|
3268
3764
|
root: opts.runId,
|
|
3269
|
-
pool,
|
|
3765
|
+
pool: pool2,
|
|
3270
3766
|
journal,
|
|
3271
3767
|
blobs: opts.blobs,
|
|
3272
3768
|
executors: opts.executors,
|
|
@@ -3294,7 +3790,7 @@ function createSupervisor() {
|
|
|
3294
3790
|
}
|
|
3295
3791
|
const tree = scope.view;
|
|
3296
3792
|
if (actOutcome.ok) {
|
|
3297
|
-
|
|
3793
|
+
pool2.assertNoOpenTickets();
|
|
3298
3794
|
const out = actOutcome.out;
|
|
3299
3795
|
const outRef = contentAddress(out);
|
|
3300
3796
|
await opts.blobs.put(outRef, out);
|
|
@@ -3308,7 +3804,7 @@ function createSupervisor() {
|
|
|
3308
3804
|
}
|
|
3309
3805
|
return {
|
|
3310
3806
|
kind: "no-winner",
|
|
3311
|
-
reason: classifyNoWinner(controller,
|
|
3807
|
+
reason: classifyNoWinner(controller, pool2, opts, breaker),
|
|
3312
3808
|
tree,
|
|
3313
3809
|
downCount: breaker.downCount()
|
|
3314
3810
|
};
|
|
@@ -3413,14 +3909,14 @@ async function drainCursor(scope) {
|
|
|
3413
3909
|
if (settled === null) return;
|
|
3414
3910
|
}
|
|
3415
3911
|
}
|
|
3416
|
-
function classifyNoWinner(controller,
|
|
3912
|
+
/**
 * Chooses the reason string reported when a run ends without a winner.
 *
 * Precedence: a tripped breaker wins outright, then an aborted controller
 * signal, then an exhausted budget pool. Anything else falls back to
 * "all-children-down".
 *
 * @param controller object exposing `signal.aborted`
 * @param pool2 budget pool handed to `poolExhausted`
 * @param opts run options handed to `poolExhausted`
 * @param breaker object exposing `tripped()`
 * @returns {"all-children-down" | "aborted" | "budget-exhausted"}
 */
function classifyNoWinner(controller, pool2, opts, breaker) {
  const breakerTripped = breaker.tripped();
  if (!breakerTripped) {
    if (controller.signal.aborted) {
      return "aborted";
    }
    if (poolExhausted(pool2, opts)) {
      return "budget-exhausted";
    }
  }
  return "all-children-down";
}
|
|
3422
|
-
function poolExhausted(
|
|
3423
|
-
const r =
|
|
3918
|
+
function poolExhausted(pool2, opts) {
|
|
3919
|
+
const r = pool2.readout();
|
|
3424
3920
|
if (r.tokensLeft <= 0) return true;
|
|
3425
3921
|
if (opts.budget.maxUsd !== void 0 && r.usdLeft <= 0) return true;
|
|
3426
3922
|
if (opts.budget.deadlineMs !== void 0 && r.deadlineMs > 0 && (opts.now ?? Date.now)() >= r.deadlineMs) {
|
|
@@ -3556,13 +4052,13 @@ function shapeName(shape, _resolved) {
|
|
|
3556
4052
|
}
|
|
3557
4053
|
/**
 * Resolves the fan-out width and per-child budget for a shape.
 *
 * Explicit values in `over` win. Otherwise the fan-out falls back to
 * `defaultFanout` and the per-child budget is the root budget divided by the
 * fan-out: iterations and tokens are floored and clamped to at least 1, USD is
 * divided without rounding, and the deadline is passed through unchanged.
 *
 * @param root root budget ({ maxIterations, maxTokens, maxUsd?, deadlineMs? })
 * @param over optional overrides ({ fanout?, perChild? })
 * @returns {{ perChild: object, fanout: number }}
 */
function resolveShapeBudget(root, over) {
  const width = over?.fanout ?? defaultFanout;
  let childBudget = over?.perChild;
  if (childBudget === null || childBudget === undefined) {
    childBudget = {
      maxIterations: Math.max(1, Math.floor(root.maxIterations / width)),
      maxTokens: Math.max(1, Math.floor(root.maxTokens / width))
    };
    if (root.maxUsd !== undefined) {
      childBudget.maxUsd = root.maxUsd / width;
    }
    if (root.deadlineMs !== undefined) {
      childBudget.deadlineMs = root.deadlineMs;
    }
  }
  return { perChild: childBudget, fanout: width };
}
|
|
3567
4063
|
var defaultFanout = 3;
|
|
3568
4064
|
function personaRegistry(persona) {
|
|
@@ -3732,12 +4228,12 @@ function countStatuses(reported) {
|
|
|
3732
4228
|
/**
 * Returns a fresh spend record with every counter at zero; the token counters
 * come from `zeroTokenUsage()`.
 */
function zeroSpend4() {
  const tokens = zeroTokenUsage();
  return { iterations: 0, tokens, usd: 0, ms: 0 };
}
|
|
3735
|
-
function cloneSpend(
|
|
4231
|
+
/**
 * Copies a spend record, including a fresh `tokens` object, so the copy can be
 * mutated without touching the source.
 *
 * @param spend2 spend record ({ iterations, tokens: { input, output }, usd, ms })
 * @returns a new spend record with the same values
 */
function cloneSpend(spend2) {
  const { iterations, tokens, usd, ms } = spend2;
  const tokensCopy = { input: tokens.input, output: tokens.output };
  return { iterations, tokens: tokensCopy, usd, ms };
}
|
|
3743
4239
|
function addSpend(acc, delta) {
|
|
@@ -3791,13 +4287,682 @@ function requireNode2(nodes, id, root) {
|
|
|
3791
4287
|
return node;
|
|
3792
4288
|
}
|
|
3793
4289
|
/**
 * Looks up the rolled-up spend for a node and fails loudly when it is missing.
 *
 * @param rolled map-like object with `get(id)`
 * @param id node id to look up
 * @param root tree id, used only in the error message
 * @returns the stored spend
 * @throws {Error} when no truthy spend is stored for `id`
 */
function requireSpend(rolled, id, root) {
  const found = rolled.get(id);
  if (found) {
    return found;
  }
  throw new Error(
    `trajectoryReport: node '${id}' was never rolled up in tree '${root}' (unreachable from root)`
  );
}
|
|
4298
|
+
|
|
4299
|
+
// src/runtime/promotion-gate.ts
|
|
4300
|
+
import { heldoutSignificance } from "@tangle-network/agent-eval/campaign";
|
|
4301
|
+
/**
 * Decides whether a candidate strategy should replace the incumbent, based on
 * paired per-task scores from a benchmark report.
 *
 * Only tasks that carry a cell for BOTH strategies are paired. The pairs are
 * handed to `heldoutSignificance`, whose bootstrap summary becomes `lift`.
 *
 * @param opts { candidate, incumbent, report, deltaThreshold?, minPairedTasks?,
 *   statistic?, seed?, resamples? }
 * @returns {{ promoted: boolean, reason: string, n: number, lift: object }}
 *   reason is one of "identical-champion", "few-tasks", "significant", "no-margin"
 * @throws {Error} when no task has cells for both strategies
 */
function promotionGate(opts) {
  const { candidate, incumbent } = opts;
  if (candidate === incumbent) {
    // Nothing to compare: a strategy cannot beat itself.
    return {
      promoted: false,
      reason: "identical-champion",
      n: 0,
      lift: { mean: 0, median: 0, low: 0, high: 0 }
    };
  }

  const pairs = [];
  for (const row of opts.report.perTask) {
    const incumbentCell = row.cells?.[incumbent];
    const candidateCell = row.cells?.[candidate];
    if (incumbentCell && candidateCell) {
      pairs.push({ before: incumbentCell.score, after: candidateCell.score, id: row.taskId });
    }
  }
  if (pairs.length === 0) {
    throw new Error(
      `promotionGate: no holdout task carried cells for both "${incumbent}" and "${candidate}" \u2014 the report must come from a run that included both strategies`
    );
  }

  const settings = {
    deltaThreshold: opts.deltaThreshold ?? 0,
    minProductiveRuns: opts.minPairedTasks ?? 6,
    statistic: opts.statistic ?? "mean"
  };
  // seed / resamples are forwarded only when the caller set them.
  if (opts.seed !== undefined) {
    settings.seed = opts.seed;
  }
  if (opts.resamples !== undefined) {
    settings.resamples = opts.resamples;
  }

  const sig = heldoutSignificance(
    {
      before: pairs.map((p) => p.before),
      after: pairs.map((p) => p.after),
      cellIds: pairs.map((p) => p.id)
    },
    settings
  );

  const { mean, median, low, high } = sig.bootstrap;
  const lift = { mean, median, low, high };
  if (sig.fewRuns) {
    return { promoted: false, reason: "few-tasks", n: sig.n, lift };
  }
  if (sig.significant) {
    return { promoted: true, reason: "significant", n: sig.n, lift };
  }
  return { promoted: false, reason: "no-margin", n: sig.n, lift };
}
|
|
4345
|
+
|
|
4346
|
+
// src/runtime/run-benchmark.ts
|
|
4347
|
+
import { pairedBootstrap, paretoFrontier } from "@tangle-network/agent-eval";
|
|
4348
|
+
|
|
4349
|
+
// src/runtime/strategy.ts
|
|
4350
|
+
import { createChatClient, estimateCost as estimateCost2, isModelPriced as isModelPriced2 } from "@tangle-network/agent-eval";
|
|
4351
|
+
// Standing instruction that shotExecutor appends (after a blank line) to the task's user prompt when it starts a fresh conversation.
var taskNudge = "Use the available tools to bring the artifact to the required final state. Address EVERY distinct change the request implies. After each tool result, check what remains and continue. Re-read the values you set to confirm they took. Reply DONE only once every required change is made and verified.";
|
|
4352
|
+
/**
 * Runs ONE bounded tool-calling loop ("shot") against the router's
 * `/chat/completions` endpoint.
 *
 * Each turn posts the running conversation, appends the assistant reply to
 * `messages`, executes every requested tool call through `surface.call`, and
 * appends one `tool` message per call. The loop stops after `opts.innerTurns`
 * turns (default 4), when the reply carries no message, or when the assistant
 * requests no tools.
 *
 * @param surface object exposing `call(handle, toolName, args)`
 * @param _task unused; kept for signature compatibility
 * @param handle artifact handle forwarded to `surface.call`
 * @param tools tool definitions forwarded to the router
 * @param messages conversation so far; MUTATED in place and also returned
 * @param opts { routerBaseUrl, routerKey, model, innerTurns?, temperature? }
 * @param modelOverride optional model that replaces `opts.model` for this shot
 * @returns {Promise<{ messages, completions, toolCalls, toolErrors, tokens }>}
 * @throws {Error} when the router answers with a non-2xx status
 */
async function runShot(surface, _task, handle, tools, messages, opts, modelOverride) {
  const innerTurns = opts.innerTurns ?? 4;
  // Loop-invariant request pieces, computed once.
  const endpoint = `${opts.routerBaseUrl.replace(/\/$/, "")}/chat/completions`;
  const model = modelOverride ?? opts.model;
  let completions = 0;
  let toolCalls = 0;
  let toolErrors = 0;
  const tokens = { input: 0, output: 0 };
  for (let t = 0; t < innerTurns; t += 1) {
    const res = await fetch(endpoint, {
      method: "POST",
      headers: { "content-type": "application/json", authorization: `Bearer ${opts.routerKey}` },
      body: JSON.stringify({
        model,
        messages,
        tools,
        tool_choice: "auto",
        temperature: opts.temperature ?? 0.7
      })
    });
    if (!res.ok) throw new Error(`router ${res.status}: ${(await res.text()).slice(0, 200)}`);
    completions += 1;
    const data = await res.json();
    // Usage is optional in the response; only numeric counters are accumulated.
    if (typeof data.usage?.prompt_tokens === "number") tokens.input += data.usage.prompt_tokens;
    if (typeof data.usage?.completion_tokens === "number") {
      tokens.output += data.usage.completion_tokens;
    }
    const msg = data.choices?.[0]?.message;
    if (!msg) break;
    const calls = msg.tool_calls ?? [];
    messages.push({
      role: "assistant",
      content: msg.content ?? "",
      ...calls.length ? { tool_calls: calls } : {}
    });
    if (calls.length === 0) break;
    for (const call of calls) {
      toolCalls += 1;
      let args;
      try {
        args = JSON.parse(call.function.arguments || "{}");
      } catch (e) {
        // Malformed arguments: do NOT run the tool with made-up empty args.
        // Report the problem back as the tool result so the model can retry,
        // and count exactly one error for this call.
        toolErrors += 1;
        messages.push({
          role: "tool",
          tool_call_id: call.id,
          content: `ERROR: tool arguments were not valid JSON (${e instanceof Error ? e.message : String(e)})`
        });
        continue;
      }
      let out;
      try {
        const raw = await surface.call(handle, call.function.name, args);
        // Tool message content must be text; serialise structured results
        // instead of failing on `.startsWith` and mislabelling a success.
        out = typeof raw === "string" ? raw : JSON.stringify(raw) ?? String(raw);
        if (out.startsWith("ERROR:")) toolErrors += 1;
      } catch (e) {
        toolErrors += 1;
        out = `ERROR: ${e instanceof Error ? e.message : String(e)}`;
      }
      messages.push({ role: "tool", tool_call_id: call.id, content: out });
    }
  }
  return { messages, completions, toolCalls, toolErrors, tokens };
}
|
|
4406
|
+
/**
 * Asks the analyst for corrective steering over a finished shot.
 *
 * The assistant and tool messages are flattened into a text trajectory
 * (tool results cut to 280 chars, assistant text to 200, the whole thing to
 * 7000) and handed to `observe` together with a router chat client.
 *
 * @param task task record; `userPrompt` and `id` are read
 * @param messages conversation to review
 * @param opts { routerKey, routerBaseUrl, model, analystModel?,
 *   analystInstruction?, corpus?, corpusTags? }
 * @returns {Promise<string>} the non-empty `recommended_action` strings joined
 *   by newlines, or "COMPLETE" when there are none
 */
async function analyze(task, messages, opts) {
  const lines = [];
  for (const m of messages) {
    if (m.role === "tool") {
      lines.push(`RESULT ${String(m.content).slice(0, 280)}`);
      continue;
    }
    if (m.role !== "assistant") {
      continue;
    }
    const calls = m.tool_calls?.map((c) => `${c.function.name}(${c.function.arguments})`).join(", ");
    lines.push(calls ? `CALL ${calls}` : `SAY ${String(m.content).slice(0, 200)}`);
  }
  const trajectory = lines.join("\n").slice(0, 7e3);

  const analystModel = opts.analystModel ?? opts.model;
  const chat = createChatClient({
    transport: "router",
    apiKey: opts.routerKey,
    baseUrl: opts.routerBaseUrl,
    defaultModel: analystModel
  });

  const observeOptions = { chat, model: analystModel };
  if (opts.analystInstruction) {
    observeOptions.analystInstruction = opts.analystInstruction;
  }
  if (opts.corpus) {
    observeOptions.corpus = opts.corpus;
    observeOptions.tags = opts.corpusTags ?? [];
  }

  const obs = await observe(
    {
      task: task.userPrompt,
      output: trajectory,
      trace: messages,
      outcome: "failed",
      runId: task.id
    },
    observeOptions
  );

  const actions = [];
  for (const finding of obs.findings) {
    const action = finding.recommended_action;
    if (typeof action === "string" && action.trim().length > 0) {
      actions.push(action);
    }
  }
  const steer = actions.join("\n").trim();
  return steer === "" ? "COMPLETE" : steer;
}
|
|
4437
|
+
/**
 * Builds a spend record that charges only iterations: tokens, usd and ms are
 * all zero.
 *
 * @param iterations iteration count to record
 */
var spend = (iterations) => {
  const tokens = { input: 0, output: 0 };
  return { iterations, tokens, usd: 0, ms: 0 };
};
|
|
4443
|
+
/**
 * Executor for one worker shot: runs `runShot` over an artifact, scores the
 * artifact through the surface, and records the result as an artifact.
 *
 * Task payload fields read by `execute`:
 * - `task`     the task record (`systemPrompt`, `userPrompt`)
 * - `handle`   optional artifact to continue; when absent the executor opens
 *              its own via `surface.open` and closes it afterwards
 * - `messages` optional carried conversation (appended to in place)
 * - `persona`  optional `{ systemPrompt?, model? }`
 * - `steer`    optional corrective instruction appended as a user message
 *
 * @param surface object exposing open/close/tools/score (and `call`, used by runShot)
 * @param opts worker options; `opts.model` is the default model
 * @returns executor with `runtime`, `execute`, `teardown`, `resultArtifact`
 */
function shotExecutor(surface, opts) {
  let artifact;
  return {
    runtime: "agentic-shot",
    async execute(task) {
      const t = task;
      // Only close handles this executor opened itself.
      const own = !t.handle;
      const handle = t.handle ?? await surface.open(t.task);
      try {
        const tools = await surface.tools(t.task, handle);
        const messages = t.messages?.length ? t.messages : [
          { role: "system", content: t.persona?.systemPrompt ?? t.task.systemPrompt },
          { role: "user", content: `${t.task.userPrompt}\n\n${taskNudge}` }
        ];
        // On a carried conversation the persona cannot replace the system
        // prompt, so it arrives as a hand-off message instead.
        if (t.messages?.length && t.persona?.systemPrompt) {
          messages.push({
            role: "user",
            content: `[hand-off] You are now acting as: ${t.persona.systemPrompt}`
          });
        }
        if (t.steer) messages.push({ role: "user", content: t.steer });
        // The model that actually serves this shot; pricing must use the same one.
        const model = t.persona?.model ?? opts.model;
        const shot = await runShot(surface, t.task, handle, tools, messages, opts, t.persona?.model);
        const s = await surface.score(t.task, handle);
        const score = s.total > 0 ? s.passes / s.total : 0;
        const out = {
          messages: shot.messages,
          score,
          passes: s.passes,
          total: s.total,
          completions: shot.completions,
          toolErrors: shot.toolErrors
        };
        artifact = {
          outRef: `shot:${handle.id}:${shot.completions}:${s.passes}/${s.total}`,
          out,
          verdict: { valid: s.total > 0 && s.passes === s.total, score },
          // Real usage to the conserved pool: tokens from the router responses; usd only
          // when the model is in the price table (never a fabricated number).
          spent: {
            iterations: shot.completions,
            tokens: shot.tokens,
            usd: isModelPriced2(model) ? estimateCost2(shot.tokens.input, shot.tokens.output, model) : 0,
            ms: 0
          }
        };
        return artifact;
      } finally {
        if (own) await surface.close(handle);
      }
    },
    teardown: () => Promise.resolve({ destroyed: true }),
    resultArtifact() {
      if (!artifact) throw new Error("shotExecutor: resultArtifact before execute");
      return artifact;
    }
  };
}
|
|
4502
|
+
/**
 * Executor for one analyst pass: runs `analyze` over the task payload's
 * conversation and stores the returned steering text as the artifact.
 *
 * @param opts worker options forwarded to `analyze`
 * @returns executor with `runtime`, `execute`, `teardown`, `resultArtifact`
 */
function analystExecutor(opts) {
  let artifact;

  async function execute(task) {
    const payload = task;
    const findings = await analyze(payload.task, payload.messages, opts);
    // One analyst pass is charged as a single iteration.
    artifact = { outRef: `analyst:${findings.length}`, out: findings, spent: spend(1) };
    return artifact;
  }

  function resultArtifact() {
    if (artifact) {
      return artifact;
    }
    throw new Error("analystExecutor: resultArtifact before execute");
  }

  return {
    runtime: "agentic-analyst",
    execute,
    teardown: () => Promise.resolve({ destroyed: true }),
    resultArtifact
  };
}
|
|
4519
|
+
/**
 * Minimal executor registry for agentic runs.
 *
 * `resolve` never fails: it reads `spec.profile.metadata?.role` and returns a
 * factory that builds an analyst executor for role "analyst" and a shot
 * executor for anything else. `register` is unsupported and always throws.
 *
 * @param surface surface handed to shot executors
 * @param opts worker options handed to both executor kinds
 */
function agenticRegistry(surface, opts) {
  function register() {
    throw new Error("agenticRegistry: register unsupported");
  }

  function resolve(spec) {
    const role = spec.profile.metadata?.role;
    const factory = (_s, _ctx) => {
      if (role === "analyst") {
        return analystExecutor(opts);
      }
      return shotExecutor(surface, opts);
    };
    return { succeeded: true, value: factory };
  }

  return { register, resolve };
}
|
|
4531
|
+
/**
 * Builds a spawn-only child agent: it carries an executor spec (role stored in
 * the profile metadata, no harness) and must never be run as a driver.
 *
 * @param name agent / profile name
 * @param role role tag stored at `executorSpec.profile.metadata.role`
 * @returns agent record whose `act()` always throws
 */
function leaf(name, role) {
  const profile = { name, metadata: { role } };
  return {
    name,
    executorSpec: { profile, harness: null },
    act() {
      throw new Error(`agentic: spawned leaf "${name}" run as a driver`);
    }
  };
}
|
|
4541
|
+
/**
 * Awaits the next settled child from a scope.
 *
 * @param scope object exposing `next()`
 * @returns the settled record
 * @throws {Error} when `next()` yields a falsy value
 */
async function drainOne2(scope) {
  const settled = await scope.next();
  if (settled) {
    return settled;
  }
  throw new Error("agentic: spawned child never settled");
}
|
|
4546
|
+
/**
 * Budget handed to one spawned child: one iteration more than its inner turn
 * count, and a fixed ceiling of one million tokens.
 *
 * @param innerTurns number of inner turns the child may take
 */
var perChild = (innerTurns) => {
  const maxIterations = innerTurns + 1;
  return { maxIterations, maxTokens: 1e6 };
};
|
|
4550
|
+
/**
 * Builds the "depth" driver: repeated shots over ONE artifact, with an analyst
 * pass between shots whose findings steer the next shot.
 *
 * The loop ends when a shot scores 1, the last allowed shot has run, the pool
 * refuses a spawn, a child goes down, or the analyst answers COMPLETE. The
 * final score is re-read from the surface, not taken from the last shot.
 *
 * @param surface object exposing open/close/score
 * @param task task record passed to every child
 * @param opts worker options; `innerTurns` (default 4) sizes each shot's budget
 * @param cfg { maxShots } upper bound on shots
 * @returns driver `{ name: "depth", act }`; `act` resolves to
 *   `{ kind: "done", deliverable: { mode, score, resolved, completions, progression, shots } }`
 */
function depthDriver(surface, task, opts, cfg) {
  const innerTurns = opts.innerTurns ?? 4;
  let pendingSteer;
  return {
    name: "depth",
    async act(_t, scope) {
      const handle = await surface.open(task);
      const progression = [];
      let messages;
      let completions = 0;
      // Shot children the scope actually admitted. Reporting `index + 1`
      // overcounted by one when a spawn was refused or maxShots was <= 0.
      let admitted = 0;
      try {
        for (let shots = 0; shots < cfg.maxShots; shots += 1) {
          const child = leaf(`shot:${shots}`, "shot");
          // The first shot never carries steering, even if a previous act() left some behind.
          const steer = shots === 0 ? void 0 : pendingSteer;
          const res = scope.spawn(child, { task, handle, messages, steer }, {
            budget: perChild(innerTurns),
            label: `shot:${shots}`
          });
          if (!res.ok) break;
          admitted += 1;
          const settled = await drainOne2(scope);
          if (settled.kind === "down") break;
          const out = settled.out;
          messages = out.messages;
          completions += out.completions;
          progression.push(out.score);
          if (out.score >= 1 || shots === cfg.maxShots - 1) break;
          const aChild = leaf(`analyst:${shots}`, "analyst");
          const aRes = scope.spawn(
            aChild,
            { task, messages },
            { budget: perChild(1), label: `analyst:${shots}` }
          );
          if (!aRes.ok) break;
          const aSettled = await drainOne2(scope);
          completions += 1;
          if (aSettled.kind === "down") break;
          const findings = aSettled.out;
          if (/^\s*COMPLETE\b/i.test(findings)) break;
          pendingSteer = `A reviewer flagged unfinished items:\n${findings}\n\nAddress each with the tools, verify they took, then continue.`;
        }
        const final = await surface.score(task, handle);
        const score = final.total > 0 ? final.passes / final.total : 0;
        return {
          kind: "done",
          deliverable: {
            mode: "depth",
            score,
            resolved: final.total > 0 && final.passes === final.total,
            completions,
            progression,
            shots: admitted
          }
        };
      } finally {
        await surface.close(handle);
      }
    }
  };
}
|
|
4613
|
+
/**
 * Builds the "breadth" driver: spawns `cfg.width` independent rollouts of the
 * task and keeps the best score among those that settle.
 *
 * Each rollout payload carries only the task (no handle). `progression`
 * records the running best after every settled rollout; children that went
 * down are skipped.
 *
 * @param _surface unused
 * @param task task record passed to every rollout
 * @param opts worker options; `innerTurns` (default 4) sizes each rollout's budget
 * @param cfg { width } number of rollouts to attempt
 * @returns driver `{ name: "breadth", act }`; `act` resolves to a "done"
 *   deliverable, or "blocked" when no rollout was admitted or all went down
 */
function breadthDriver(_surface, task, opts, cfg) {
  const innerTurns = opts.innerTurns ?? 4;

  async function act(_t, scope) {
    let opened = 0;
    let k = 0;
    while (k < cfg.width) {
      const label = `rollout:${k}`;
      const admitted = scope.spawn(leaf(label, "shot"), { task }, {
        budget: perChild(innerTurns),
        label
      });
      if (admitted.ok) {
        opened += 1;
      }
      k += 1;
    }
    if (opened === 0) {
      return { kind: "blocked", blockers: ["breadth: pool admitted no rollout"] };
    }

    let best = -1;
    let bestResolved = false;
    let completions = 0;
    const progression = [];
    // Drain until the scope reports there is nothing left (null).
    for (;;) {
      const settled = await scope.next();
      if (settled === null) {
        break;
      }
      if (settled.kind === "down") {
        continue;
      }
      const out = settled.out;
      completions += out.completions;
      if (out.score > best) {
        best = out.score;
      }
      if (out.total > 0 && out.passes === out.total) {
        bestResolved = true;
      }
      progression.push(best);
    }
    // best stays at -1 only when no rollout produced a result.
    if (best < 0) {
      return { kind: "blocked", blockers: ["breadth: every rollout went down"] };
    }
    return {
      kind: "done",
      deliverable: {
        mode: "breadth",
        score: best,
        resolved: bestResolved,
        completions,
        progression,
        shots: opened
      }
    };
  }

  return { name: "breadth", act };
}
|
|
4654
|
+
/** Built-in "sample" strategy: the whole budget becomes the width of a breadth run. */
var sample = {
  name: "sample",
  driver(surface, task, opts, budget) {
    return breadthDriver(surface, task, opts, { width: budget });
  }
};
|
|
4658
|
+
/** Built-in "refine" strategy: the whole budget becomes the shot limit of a depth run. */
var refine = {
  name: "refine",
  driver(surface, task, opts, budget) {
    return depthDriver(surface, task, opts, { maxShots: budget });
  }
};
|
|
4662
|
+
/**
 * Wraps a strategy body into a named strategy whose driver exposes two
 * composable steps, `shot` and `critique`, over a spawn scope.
 *
 * The body receives `{ surface, task, opts, budget, scope, shot, critique }`,
 * where `surface` is narrowed to name/open/close. Whatever the body returns is
 * spread into the deliverable, but `score` and `resolved` are always
 * overwritten with the best values observed from settled shots, so a body
 * cannot report a result that no shot produced.
 *
 * @param name strategy name; also used as the deliverable's `mode`
 * @param run async strategy body
 * @returns {{ name: string, driver: Function }}
 */
function defineStrategy(name, run) {
  const driver = (surface, task, opts, budget) => {
    const act = async (_t, scope) => {
      const innerTurns = opts.innerTurns ?? 4;
      let seq = 0;
      let verifiedBest = 0;
      let verifiedResolved = false;

      // Spawns one leaf child and waits for it; null means "refused or went down".
      const runLeaf = async (role, payload, turns) => {
        const child = leaf(`${role}:${seq}`, role);
        seq += 1;
        const admitted = scope.spawn(child, payload, { budget: perChild(turns), label: child.name });
        if (!admitted.ok) {
          return null;
        }
        const settled = await drainOne2(scope);
        return settled.kind === "down" ? null : settled;
      };

      // Narrowed to open/close — the body gets no raw call()/score() access.
      const narrowedSurface = {
        name: surface.name,
        open: (t) => surface.open(t),
        close: (h) => surface.close(h)
      };

      const shot = async (spec) => {
        const payload = {
          task,
          handle: spec?.handle,
          messages: spec?.messages,
          steer: spec?.steer,
          persona: spec?.persona
        };
        const settled = await runLeaf("shot", payload, innerTurns);
        if (settled === null) {
          return null;
        }
        const out = settled.out;
        if (out.score > verifiedBest) {
          verifiedBest = out.score;
        }
        if (out.total > 0 && out.passes === out.total) {
          verifiedResolved = true;
        }
        return out;
      };

      const critique = async (messages) => {
        const settled = await runLeaf("analyst", { task, messages }, 1);
        if (settled === null) {
          return null;
        }
        const findings = settled.out;
        // An analyst answer starting with COMPLETE means "nothing left to fix".
        return /^\s*COMPLETE\b/i.test(findings) ? null : findings;
      };

      const r = await run({ surface: narrowedSurface, task, opts, budget, scope, shot, critique });
      return {
        kind: "done",
        deliverable: { mode: name, ...r, score: verifiedBest, resolved: verifiedResolved }
      };
    };
    return { name, act };
  };
  return { name, driver };
}
|
|
4729
|
+
/**
 * "adaptiveRefine" strategy: refine one artifact shot after shot, but restart
 * on a fresh artifact whenever a shot fails to beat the best score so far.
 *
 * Per attempt:
 * - no result from the shot  -> stop
 * - score >= 1               -> stop (solved)
 * - score <= best so far     -> close the artifact, open a new one, drop the
 *                               carried conversation and steering, try again
 * - otherwise                -> keep the conversation, ask for a critique and
 *                               steer the next shot with it (stop if none)
 *
 * `shots` in the result counts shots that produced a result, so it always
 * equals `progression.length`.
 */
var adaptiveRefine = defineStrategy(
  "adaptiveRefine",
  async ({ surface, task, budget, shot, critique }) => {
    let handle = await surface.open(task);
    const progression = [];
    let messages;
    let steer;
    let completions = 0;
    let best = -1;
    let shots = 0;
    try {
      for (let attempt = 0; attempt < budget; attempt += 1) {
        const out = await shot({ handle, messages, steer });
        if (!out) break;
        // Count the shot as soon as it yields a result. Using the loop index
        // under-reported by one whenever the loop broke after a good shot.
        shots += 1;
        completions += out.completions;
        progression.push(out.score);
        if (out.score >= 1) break;
        if (out.score <= best) {
          // No progress: abandon this artifact and start clean.
          await surface.close(handle);
          handle = await surface.open(task);
          messages = void 0;
          steer = void 0;
          continue;
        }
        best = out.score;
        messages = out.messages;
        const findings = await critique(out.messages);
        completions += 1;
        if (!findings) break;
        steer = `A reviewer flagged unfinished items:\n${findings}\n\nAddress each with the tools, verify they took, then continue.`;
      }
      const score = progression.length ? Math.max(...progression) : 0;
      return { score, resolved: score >= 1, completions, progression, shots };
    } finally {
      await surface.close(handle);
    }
  }
);
|
|
4770
|
+
/**
 * "sampleThenRefine" strategy: spend the first half of the budget (rounded up,
 * at least one) on independent samples, then refine the best sample with the
 * remainder.
 *
 * Exploration opens a fresh artifact per sample and stops early on a perfect
 * score. All artifacts except the best one are then closed. Refinement
 * alternates critique and steered shot on the kept artifact until the budget
 * runs out, the score reaches 1, the critique has nothing to say, or a shot
 * yields no result. Any artifact still open is closed on the way out.
 */
var sampleThenRefine = defineStrategy(
  "sampleThenRefine",
  async ({ surface, task, budget, shot, critique }) => {
    const exploreCount = Math.max(1, Math.ceil(budget / 2));
    const liveHandles = /* @__PURE__ */ new Set();
    const progression = [];
    let completions = 0;
    let shots = 0;

    // Bookkeeping shared by both phases for every shot that produced a result.
    const record = (out) => {
      shots += 1;
      completions += out.completions;
      progression.push(out.score);
    };

    try {
      let champion;
      let sampled = 0;
      while (sampled < exploreCount) {
        sampled += 1;
        const handle = await surface.open(task);
        liveHandles.add(handle);
        const out = await shot({ handle });
        if (!out) {
          continue;
        }
        record(out);
        if (!champion || out.score > champion.out.score) {
          champion = { handle, out };
        }
        if (out.score >= 1) {
          break;
        }
      }
      if (!champion) {
        return { score: 0, resolved: false, completions, progression, shots };
      }

      // Keep only the winning artifact open for the refinement phase.
      for (const h of Array.from(liveHandles)) {
        if (h === champion.handle) {
          continue;
        }
        await surface.close(h);
        liveHandles.delete(h);
      }

      let messages = champion.out.messages;
      let topScore = champion.out.score;
      let round = exploreCount;
      while (round < budget && topScore < 1) {
        round += 1;
        const findings = await critique(messages);
        completions += 1;
        if (!findings) {
          break;
        }
        const out = await shot({
          handle: champion.handle,
          messages,
          steer: `A reviewer flagged unfinished items:\n${findings}\n\nAddress each with the tools, verify they took, then continue.`
        });
        if (!out) {
          break;
        }
        record(out);
        messages = out.messages;
        if (out.score > topScore) {
          topScore = out.score;
        }
      }

      const score = progression.length ? Math.max(...progression) : 0;
      return { score, resolved: score >= 1, completions, progression, shots };
    } finally {
      for (const h of liveHandles) await surface.close(h);
    }
  }
);
|
|
4826
|
+
/**
 * Runs one task under one strategy through a fresh supervisor and returns the
 * strategy's deliverable plus the measured spend.
 *
 * The strategy is `opts.strategy`, or `sample` when `opts.mode` is "breadth",
 * or `refine` otherwise. Unless `opts.rootBudget` is given, the root budget
 * allows `budget * (innerTurns + 2)` iterations (innerTurns defaults to 4) and
 * one billion tokens. Journal and blob store are in-memory.
 *
 * @param opts { surface, task, budget, strategy?, mode?, rootBudget?,
 *   innerTurns?, hooks?, ...worker options }
 * @returns {Promise<object>} the deliverable extended with `usd`, `tokens`
 *   (from the supervisor's total spend) and wall-clock `ms`
 * @throws {Error} when the run has no winner or the winner is not "done"
 */
async function runAgentic(opts) {
  let strategy = opts.strategy;
  if (strategy === null || strategy === undefined) {
    strategy = opts.mode === "breadth" ? sample : refine;
  }
  const driver = strategy.driver(opts.surface, opts.task, opts, opts.budget);
  const supervisor = createSupervisor();
  const root = opts.rootBudget ?? {
    maxIterations: opts.budget * ((opts.innerTurns ?? 4) + 2),
    maxTokens: 1e9
  };
  const started = Date.now();

  const runOptions = {
    budget: root,
    runId: `agentic:${strategy.name}:${opts.task.id}`,
    journal: new InMemorySpawnJournal(),
    blobs: new InMemoryResultBlobStore(),
    executors: agenticRegistry(opts.surface, opts),
    maxDepth: 3
  };
  if (opts.hooks) {
    runOptions.hooks = opts.hooks;
  }
  const result = await supervisor.run(driver, void 0, runOptions);

  const succeeded = result.kind === "winner" && result.out.kind === "done";
  if (!succeeded) {
    let reason;
    if (result.kind === "winner") {
      reason = `blocked: ${result.out.blockers?.join("; ")}`;
    } else {
      reason = `no-winner: ${result.reason}`;
    }
    throw new Error(`runAgentic(${strategy.name}) produced no result \u2014 ${reason}`);
  }

  const { usd, tokens } = result.spentTotal;
  return {
    ...result.out.deliverable,
    usd,
    tokens,
    ms: Date.now() - started
  };
}
|
|
4856
|
+
|
|
4857
|
+
// src/runtime/run-benchmark.ts
|
|
4858
|
+
/**
 * Maps `fn` over `items` with at most `limit` calls in flight, preserving
 * input order in the result.
 *
 * A fixed set of workers (at least one, never more than there are items)
 * pulls the next unclaimed index from a shared cursor until none are left.
 *
 * @param items inputs
 * @param limit maximum number of concurrent `fn` calls
 * @param fn async mapper `(item, index) => result`
 * @returns {Promise<Array>} results, positionally aligned with `items`
 */
async function pool(items, limit, fn) {
  const results = new Array(items.length);
  const width = Math.max(1, Math.min(limit, items.length));
  let cursor = 0;

  const worker = async () => {
    for (;;) {
      const index = cursor;
      if (index >= items.length) {
        return;
      }
      // Claim the slot before awaiting so no other worker takes it.
      cursor = index + 1;
      results[index] = await fn(items[index], index);
    }
  };

  await Promise.all(Array.from({ length: width }, () => worker()));
  return results;
}
|
|
4871
|
+
/**
 * Runs every strategy on every task and aggregates the outcome into a report.
 *
 * Tasks run through `pool` with bounded concurrency; within one task the
 * strategies run one after another. If any strategy throws for a task, the
 * whole task is recorded as an error row and excluded from the aggregates.
 *
 * Defaults: strategies `[sample, refine]`, budget 3, concurrency 3.
 *
 * @param cfg { tasks, environment, worker, strategies?, budget?, concurrency?,
 *   hooks?, onTask? } — `onTask(row, settledCount, totalTasks)` fires after each task
 * @returns {Promise<object>} `{ n, excluded, perStrategy, perTask, pareto }`,
 *   plus `refineVsSample` when both built-in strategy names are present and at
 *   least two tasks succeeded
 */
async function runBenchmark(cfg) {
  const strategies = cfg.strategies ?? [sample, refine];
  const budget = cfg.budget ?? 3;
  const concurrency = cfg.concurrency ?? 3;
  let settled = 0;

  // Runs all strategies for one task; failures are folded into the row.
  const runTask = async (task) => {
    let row;
    try {
      const cells = {};
      for (const strategy of strategies) {
        const request = {
          ...cfg.worker,
          surface: cfg.environment,
          task,
          strategy,
          budget
        };
        if (cfg.hooks) {
          request.hooks = cfg.hooks;
        }
        const { score, resolved, progression, usd, ms, tokens } = await runAgentic(request);
        cells[strategy.name] = { score, resolved, progression, usd, ms, tokens };
      }
      row = { taskId: task.id, cells };
    } catch (e) {
      row = { taskId: task.id, error: e instanceof Error ? e.message.slice(0, 300) : String(e) };
    }
    settled += 1;
    cfg.onTask?.(row, settled, cfg.tasks.length);
    return row;
  };

  const perTask = await pool(cfg.tasks, concurrency, runTask);
  const ok = perTask.filter((row) => Boolean(row.cells));

  const average = (values) => {
    if (values.length === 0) {
      return 0;
    }
    let total = 0;
    for (const value of values) {
      total += value;
    }
    return total / values.length;
  };

  const perStrategy = {};
  for (const strategy of strategies) {
    const cells = [];
    for (const row of ok) {
      const cell = row.cells[strategy.name];
      if (cell) {
        cells.push(cell);
      }
    }
    perStrategy[strategy.name] = {
      score: average(cells.map((c) => c.score)),
      resolved: average(cells.map((c) => (c.resolved ? 1 : 0))),
      usd: average(cells.map((c) => c.usd)),
      ms: average(cells.map((c) => c.ms))
    };
  }

  const candidates = Object.entries(perStrategy).map(([name, v]) => ({ name, score: v.score, usd: v.usd }));
  const objectives = [
    { name: "score", direction: "maximize", value: (c) => c.score },
    { name: "usd", direction: "minimize", value: (c) => c.usd }
  ];
  const pareto = paretoFrontier(candidates, objectives).frontier.map((c) => c.name);

  const report = {
    n: ok.length,
    excluded: perTask.length - ok.length,
    perStrategy,
    perTask,
    pareto
  };

  // The paired lift is only defined for the two built-in strategies.
  const names = strategies.map((s) => s.name);
  const comparable = names.includes("refine") && names.includes("sample") && ok.length >= 2;
  if (comparable) {
    const sampleScores = ok.map((r) => r.cells.sample?.score ?? 0);
    const refineScores = ok.map((r) => r.cells.refine?.score ?? 0);
    const { mean, low, high, n } = pairedBootstrap(sampleScores, refineScores);
    report.refineVsSample = { mean, low, high, n };
  }
  return report;
}
|
|
4944
|
+
/**
 * Prints a benchmark report to the console: a header, one row per strategy
 * (Pareto-frontier members are starred), one warning line per failed task, and
 * the refine-minus-sample lift with its confidence interval when present.
 *
 * @param report report object as produced by `runBenchmark`
 */
function printBenchmarkReport(report) {
  const asPercent = (x) => `${(x * 100).toFixed(1)}%`;
  const asPoints = (x) => `${x >= 0 ? "+" : ""}${(x * 100).toFixed(1)}pp`;

  const excludedNote = report.excluded ? ` (excluded ${report.excluded})` : "";
  console.log(`\n=== benchmark \xB7 n=${report.n}${excludedNote} ===`);

  const header = [
    "strategy".padEnd(16),
    "score".padStart(7),
    "resolved".padStart(9),
    "$/task".padStart(8),
    "s/task".padStart(7)
  ];
  console.log(` ${header.join(" ")}`);

  for (const [name, stats] of Object.entries(report.perStrategy)) {
    const label = report.pareto.includes(name) ? `${name} *` : name;
    const columns = [
      label.padEnd(16),
      asPercent(stats.score).padStart(7),
      asPercent(stats.resolved).padStart(9),
      `$${stats.usd.toFixed(3)}`.padStart(8),
      `${(stats.ms / 1e3).toFixed(0).padStart(6)}s`
    ];
    console.log(` ${columns.join(" ")}`);
  }
  if (report.pareto.length) {
    console.log(` * = on the (score, $) Pareto frontier`);
  }

  for (const row of report.perTask) {
    if (row.error) {
      console.log(` \u26A0 ${row.taskId}: ${row.error.slice(0, 120)}`);
    }
  }

  const lift = report.refineVsSample;
  if (lift) {
    // Significant only when the whole interval sits on one side of zero.
    let verdict = "n.s.";
    if (lift.low > 0) {
      verdict = "SIGNIF +";
    } else if (lift.high < 0) {
      verdict = "SIGNIF -";
    }
    console.log(` refine \u2212 sample: ${asPoints(lift.mean)} CI [${asPoints(lift.low)}, ${asPoints(lift.high)}] (${verdict})`);
  }
}
|
|
3802
4967
|
|
|
3803
4968
|
// src/runtime/sandbox-run.ts
|
|
@@ -4028,6 +5193,486 @@ function errorMessage(error) {
|
|
|
4028
5193
|
return error instanceof Error ? error.message : String(error);
|
|
4029
5194
|
}
|
|
4030
5195
|
|
|
5196
|
+
// src/runtime/strategy-author.ts
|
|
5197
|
+
import { mkdirSync, writeFileSync } from "fs";
|
|
5198
|
+
import { join } from "path";
|
|
5199
|
+
// Default prompt text handed to the strategy-authoring model. It describes the
// `shot`, `critique` and `surface.open/close` steps, the rules a strategy must
// follow, and the exact module shape expected back. requestAuthoredCode uses it
// when `opts.contract` is not supplied, and runStrategyEvolution prepends it to
// its per-candidate contract. This is runtime text: editing it changes the prompt.
var strategyAuthorContract = `
|
|
5200
|
+
You author an OPTIMIZATION STRATEGY for an agentic loop system. A strategy decides how to
|
|
5201
|
+
spend a compute budget to beat a task's deployable check. You compose exactly two steps:
|
|
5202
|
+
|
|
5203
|
+
shot(spec?: { handle?, messages?, steer?, persona? }): Promise<ShotResult | null>
|
|
5204
|
+
Runs ONE worker attempt (a bounded tool loop) over an artifact.
|
|
5205
|
+
- omit handle => the shot opens its OWN fresh artifact and closes it after (a sample).
|
|
5206
|
+
- pass handle => the shot CONTINUES that artifact (state accumulates across shots).
|
|
5207
|
+
- messages => the carried conversation (pass the previous ShotResult.messages to continue).
|
|
5208
|
+
- steer => a corrective instruction injected before the shot.
|
|
5209
|
+
- persona => { systemPrompt?, model? } \u2014 give THIS shot its own role and/or model
|
|
5210
|
+
(multi-agent strategies: a researcher shot then an engineer shot, a panel of k
|
|
5211
|
+
personas over one budget). On a fresh shot the systemPrompt replaces the task's; on
|
|
5212
|
+
a carried conversation it arrives as a hand-off message. Same conserved budget.
|
|
5213
|
+
ShotResult = { messages, score (0..1 on the task's check), passes, total, completions, toolErrors }
|
|
5214
|
+
Returns null if the attempt failed infra-wise.
|
|
5215
|
+
|
|
5216
|
+
critique(messages): Promise<string | null>
|
|
5217
|
+
A firewalled trace-analyst reads the attempt's trajectory and returns ONE corrective
|
|
5218
|
+
instruction (or null when it judges the work complete). Costs ~1 completion.
|
|
5219
|
+
|
|
5220
|
+
surface.open(task) / surface.close(handle)
|
|
5221
|
+
Open a persistent artifact you manage yourself (remember to close in a finally).
|
|
5222
|
+
|
|
5223
|
+
Rules:
|
|
5224
|
+
- Stay within ~budget total shots; every shot/critique spends from a conserved pool.
|
|
5225
|
+
- For a FRESH attempt OMIT \`messages\` entirely (never pass \`[]\` \u2014 an empty array is a
|
|
5226
|
+
fresh conversation too, but be explicit). To CONTINUE, pass the previous
|
|
5227
|
+
ShotResult.messages unchanged.
|
|
5228
|
+
- Return { score, resolved, completions, progression, shots } \u2014 score = the BEST checkpoint
|
|
5229
|
+
you reached (keep-best, never final-state), progression = score after each shot.
|
|
5230
|
+
- The module must be EXACTLY this shape (no other imports, no commentary outside code):
|
|
5231
|
+
|
|
5232
|
+
import { defineStrategy } from '@tangle-network/agent-runtime/loops'
|
|
5233
|
+
export default defineStrategy('your-strategy-name', async ({ surface, task, budget, shot, critique }) => {
|
|
5234
|
+
// your composition
|
|
5235
|
+
})
|
|
5236
|
+
`;
|
|
5237
|
+
/**
 * Statically screen model-authored strategy source before it is written to
 * disk and imported.
 *
 * This is a best-effort regex screen, not a sandbox: it rejects any static
 * `import` other than a named import from `@tangle-network/agent-runtime/loops`
 * and a fixed list of dangerous constructs.
 *
 * @param {string} code - The authored module source.
 * @returns {void}
 * @throws {Error} `authored code rejected: ...` naming the first violation.
 */
function assertStrategyContract(code) {
  // Whitespace around the braces is optional so compact-but-legitimate
  // imports of the allowed module are still accepted.
  const allowedImport = /^\s*import\s*\{[^}]*\}\s*from\s*['"]@tangle-network\/agent-runtime\/loops['"]/;
  // A static import may be followed directly by a quote, brace or star
  // (`import"fs"`, `import{x}from"fs"`, `import*as fs from"fs"`), not only by
  // whitespace. `import(` and `import.meta` are deliberately not matched here;
  // dynamic import is handled by the banned list below.
  const importStart = /^\s*import(?:\s|["'{*])/;
  // A second import statement hiding after the allowed one on the same line.
  const trailingImport = /(?:^|;)\s*import(?:\s|["'{*])/;

  for (const line of code.split("\n")) {
    if (!importStart.test(line)) continue;
    const isAllowed = allowedImport.test(line);
    const remainder = isAllowed ? line.replace(allowedImport, "") : "";
    if (!isAllowed || trailingImport.test(remainder)) {
      throw new Error(`authored code rejected: foreign import \u2014 ${line.trim().slice(0, 120)}`);
    }
  }

  const banned = [
    [/\brequire\s*\(/, "require()"],
    [/\bimport\s*\(/, "dynamic import()"],
    [/\beval\s*\(/, "eval()"],
    [/new\s+Function\s*\(/, "new Function()"],
    [/\bprocess\s*[.[]/, "process access"],
    [/\bglobalThis\s*[.[]/, "globalThis access"],
    [/\bfetch\s*\(/, "network access"],
    [/child_process|node:fs|node:net|node:http|worker_threads/, "node builtin access"]
  ];
  for (const [pattern, what] of banned) {
    if (pattern.test(code)) throw new Error(`authored code rejected: ${what}`);
  }
}
|
|
5258
|
+
/**
 * Ask the authoring chat model for one strategy module and extract its source.
 *
 * The user prompt is the contract (`opts.contract`, or the default
 * `strategyAuthorContract`), followed by the baseline results and the authoring
 * instruction. `model`, `temperature` and `maxTokens` are sent only when set.
 *
 * @param {object} opts - Needs `chat.chat(request, options)`, `environmentName`,
 *   `budget`, `lossesJson`; optional `contract`, `temperature`, `maxTokens`, `signal`.
 * @param {string} [model] - Model to request; omitted from the request when falsy.
 * @returns {Promise<string>} Contents of the first fenced code block in the reply.
 * @throws {Error} When the reply has no non-empty fenced code block. This includes
 *   a reply whose `content` is not a string, which previously surfaced as an
 *   opaque TypeError.
 */
async function requestAuthoredCode(opts, model) {
  const userPrompt = `${opts.contract ?? strategyAuthorContract}

BASELINE RESULTS on the "${opts.environmentName}" environment (budget=${opts.budget}):
${opts.lossesJson}

Author ONE new strategy that you expect to beat the baselines on THIS environment at the same budget. Use the losses to target the observed failure mode. Output only the module code block.`;

  const request = {
    ...(model ? { model } : {}),
    ...(opts.temperature !== void 0 ? { temperature: opts.temperature } : {}),
    ...(opts.maxTokens !== void 0 ? { maxTokens: opts.maxTokens } : {}),
    messages: [
      {
        role: "system",
        content: "You are a senior engineer authoring optimization strategies for agent loops. Output exactly one fenced ```ts code block and nothing else."
      },
      { role: "user", content: userPrompt }
    ]
  };
  const res = await opts.chat.chat(request, { ...(opts.signal ? { signal: opts.signal } : {}) });

  // Treat a missing or non-string reply as an empty reply so the caller gets the
  // same diagnostic error (and authorStrategy's fallback) as for any other
  // unusable answer.
  const reply = typeof res?.content === "string" ? res.content : "";
  const match = reply.match(/```(?:ts|typescript)?\s*\n([\s\S]*?)```/);
  if (!match?.[1]) {
    throw new Error(
      `authorStrategy: no code block in the author's reply (model=${model ?? "default"}): ${reply.slice(0, 300)}`
    );
  }
  return match[1];
}
|
|
5290
|
+
/**
 * Author one strategy with the chat model, screen it, persist it and load it.
 *
 * Steps: request code with `opts.model` (retrying once with `opts.fallbackModel`
 * when the first request throws and a fallback is configured), run
 * `assertStrategyContract`, write the source to
 * `<outDir>/authored-<Date.now()>.mts`, then dynamically import that file.
 *
 * @param {object} opts - Options forwarded to `requestAuthoredCode`, plus
 *   `outDir` (created recursively if missing), `model` and `fallbackModel`.
 * @returns {Promise<{strategy: object, file: string, code: string}>} The module's
 *   default export, the path it was written to, and the raw source.
 * @throws {Error} If authoring fails, the code is rejected by the contract
 *   screen, or the module's default export lacks a `name` or a `driver` function.
 */
async function authorStrategy(opts) {
  let code;
  try {
    code = await requestAuthoredCode(opts, opts.model);
  } catch (primaryError) {
    if (!opts.fallbackModel) throw primaryError;
    code = await requestAuthoredCode(opts, opts.fallbackModel);
  }
  assertStrategyContract(code);

  mkdirSync(opts.outDir, { recursive: true });
  const file = join(opts.outDir, `authored-${Date.now()}.mts`);
  writeFileSync(file, code);

  // Build the module URL with pathToFileURL rather than string concatenation:
  // it resolves a relative outDir against the working directory and
  // percent-encodes characters that would otherwise corrupt a hand-built
  // `file://` URL (spaces, `#`, `?`, Windows drive paths).
  const { pathToFileURL } = await import("url");
  const mod = await import(pathToFileURL(file).href);

  const exported = mod.default;
  if (!exported || typeof exported.driver !== "function" || !exported.name) {
    throw new Error(`authorStrategy: ${file} does not export a default Strategy`);
  }
  return { strategy: exported, file, code };
}
|
|
5308
|
+
|
|
5309
|
+
// src/runtime/strategy-evolution.ts
|
|
5310
|
+
import { gzipSync } from "zlib";
|
|
5311
|
+
/**
 * Pick the champion strategy from a benchmark report.
 *
 * Only names in `fieldOrder` that have an entry in `report.perStrategy` compete.
 * With policy `"score"` the first competitor (in `fieldOrder` order) holding the
 * top score wins. With any other policy the competitors within `epsilon` of the
 * top score are ranked by ascending `usd`, ties broken by descending score, and
 * the first one wins.
 *
 * @param {object} report - Report exposing `perStrategy[name] = { score, usd }`.
 * @param {string[]} fieldOrder - Candidate names, in tie-break order.
 * @param {string} policy - `"score"` for pure score; anything else is cost-aware.
 * @param {number} epsilon - Score tolerance used by the cost-aware policy.
 * @returns {{name: string, score: number, usd: number}} The selected champion.
 * @throws {Error} When none of the field strategies appear in the report.
 */
function selectChampion(report, fieldOrder, policy, epsilon) {
  const contenders = [];
  for (const name of fieldOrder) {
    const summary = report.perStrategy[name];
    if (summary) contenders.push({ name, summary });
  }
  if (contenders.length === 0) {
    throw new Error("selectChampion: report carries none of the field strategies");
  }

  const topScore = contenders.reduce(
    (highest, entry) => Math.max(highest, entry.summary.score),
    -Infinity
  );

  let winner;
  if (policy === "score") {
    winner = contenders.find((entry) => entry.summary.score === topScore);
  } else {
    const cheapestThenBest = (a, b) =>
      a.summary.usd - b.summary.usd || b.summary.score - a.summary.score;
    const nearTop = contenders.filter((entry) => entry.summary.score >= topScore - epsilon);
    nearTop.sort(cheapestThenBest);
    winner = nearTop[0];
  }

  if (!winner) throw new Error("selectChampion: empty pick (unreachable)");
  const { score, usd } = winner.summary;
  return { name: winner.name, score, usd };
}
|
|
5320
|
+
/**
 * Render the tournament archive as a bullet list, one line per strategy, giving
 * its name, source, generation and last score as a whole-number percentage.
 *
 * @param {Array<{name: string, source: string, generation: number, score: number}>} archive
 * @returns {string} Newline-joined bullet lines (empty string for an empty archive).
 */
var fieldSummary = (archive) => {
  const bullets = [];
  for (const node of archive) {
    const lastScore = (node.score * 100).toFixed(0);
    bullets.push(`- ${node.name} (${node.source}, gen ${node.generation}, last score ${lastScore}%)`);
  }
  return bullets.join("\n");
};
|
|
5323
|
+
/**
 * Serialize a report's per-task outcomes into a compact JSON string for the
 * authoring prompt.
 *
 * Rows with `cells` keep, per strategy, the score and progression rounded to two
 * decimals plus the `resolved` flag. Rows without `cells` keep the first 80
 * characters of their error. The JSON text is cut at 12000 characters, so a long
 * result may end mid-token.
 *
 * @param {object} report - Report exposing a `perTask` array.
 * @returns {string} JSON text, at most 12000 characters long.
 */
var compactLosses = (report) => {
  const toTwoDecimals = (value) => Math.round(value * 100) / 100;
  const summarizeCell = (cell) => ({
    score: toTwoDecimals(cell.score),
    resolved: cell.resolved,
    progression: cell.progression.map((step) => toTwoDecimals(step))
  });

  const rows = [];
  for (const row of report.perTask) {
    if (!row.cells) {
      rows.push({ task: row.taskId, error: row.error?.slice(0, 80) });
      continue;
    }
    const cellPairs = Object.entries(row.cells).map(([name, cell]) => [name, summarizeCell(cell)]);
    rows.push({ task: row.taskId, cells: Object.fromEntries(cellPairs) });
  }

  const serialized = JSON.stringify(rows);
  return serialized.slice(0, 12e3);
};
|
|
5338
|
+
/**
 * Run an evolutionary search over optimization strategies.
 *
 * Flow:
 *  1. Benchmark the baseline strategies on the training tasks ("gen0") and pick
 *     a champion with `selectChampion`.
 *  2. For each generation, ask the author model for `populationSize` new
 *     strategies (failures are recorded as candidates, not thrown), benchmark
 *     the incumbent plus the new strategies on the same training tasks, and
 *     promote that generation's champion to incumbent.
 *  3. Benchmark the gen0 champion and the final incumbent on holdout tasks and
 *     pass the result to `promotionGate`.
 *
 * @param {object} cfg - Reads `environment`, `worker`, `tasks(a, b)` (called as
 *   `tasks(0, trainN)` and `tasks(trainN + holdoutOffset, holdoutN)`; presumably
 *   (offset, count) — confirm against the task source), `trainN`, `holdoutN`,
 *   `author` (`chat`, optional `model`, `fallbackModel`, `temperature`,
 *   `maxTokens`) and `outDir`. Optional with defaults: `budget` (3),
 *   `concurrency` (3), `generations` (2), `populationSize` (2), `baselines`
 *   (sample, refine, sampleThenRefine), `champion` ("costAware"),
 *   `championEpsilon` (0.01), `holdoutOffset` (0). Optional: `onTask`, `hooks`,
 *   `minPairedTasks`.
 * @returns {Promise<object>} `{ gen0, gen0Champion, generations, archive,
 *   finalChampion, holdout, verdict, trajectory }`.
 * @throws {Error} When no author attempt succeeded in any generation; the
 *   thrown error carries the recorded candidate entries on `error.candidates`.
 */
async function runStrategyEvolution(cfg) {
  const budget = cfg.budget ?? 3;
  const concurrency = cfg.concurrency ?? 3;
  const generations = cfg.generations ?? 2;
  const populationSize = cfg.populationSize ?? 2;
  const baselines = cfg.baselines ?? [sample, refine, sampleThenRefine];
  const policy = cfg.champion ?? "costAware";
  const epsilon = cfg.championEpsilon ?? 0.01;
  const byName = new Map(baselines.map((s) => [s.name, s]));

  const bench = (phase, tasks, strategies) => runBenchmark({
    environment: cfg.environment,
    tasks,
    worker: cfg.worker,
    strategies,
    budget,
    concurrency,
    ...(cfg.onTask ? { onTask: (row, done, total) => cfg.onTask?.(phase, row, done, total) } : {}),
    ...(cfg.hooks ? { hooks: cfg.hooks } : {})
  });

  const train = await cfg.tasks(0, cfg.trainN);
  const gen0 = await bench("gen0", train, baselines);
  const archive = baselines.map((s) => ({
    name: s.name,
    source: "baseline",
    generation: 0,
    score: gen0.perStrategy[s.name]?.score ?? 0,
    usd: gen0.perStrategy[s.name]?.usd ?? 0
  }));
  const gen0Champion = selectChampion(
    gen0,
    baselines.map((s) => s.name),
    policy,
    epsilon
  );

  let incumbent = gen0Champion;
  let latestReport = gen0;
  const generationRows = [];
  const trajectory = [
    {
      generation: 0,
      champion: gen0Champion.name,
      score: gen0Champion.score,
      usd: gen0Champion.usd
    }
  ];
  let authoredOk = 0;

  for (let g = 1; g <= generations; g += 1) {
    const lossesJson = compactLosses(latestReport);
    const candidates = [];
    const newStrategies = [];

    for (let i = 0; i < populationSize; i += 1) {
      const contract = `${strategyAuthorContract}

STRATEGIES ALREADY IN THE TOURNAMENT (author something MEANINGFULLY different \u2014 a new composition, not a rename):
${fieldSummary(archive)}

You are authoring candidate ${i + 1} of ${populationSize} this generation; explore a distinct region of the strategy space from your siblings.`;
      try {
        const authored = await authorStrategy({
          chat: cfg.author.chat,
          ...(cfg.author.model ? { model: cfg.author.model } : {}),
          ...(cfg.author.fallbackModel ? { fallbackModel: cfg.author.fallbackModel } : {}),
          ...(cfg.author.temperature !== void 0 ? { temperature: cfg.author.temperature } : {}),
          ...(cfg.author.maxTokens !== void 0 ? { maxTokens: cfg.author.maxTokens } : {}),
          contract,
          environmentName: cfg.environment.name,
          lossesJson,
          budget,
          outDir: cfg.outDir
        });

        // A name already in the field is disambiguated with a generation/candidate suffix.
        const unique = byName.has(authored.strategy.name)
          ? `${authored.strategy.name}-g${g}c${i + 1}`
          : authored.strategy.name;
        // When renamed, wrap the driver so both the agent's name and the `mode`
        // of any "done" deliverable report the unique name.
        const strategy = unique === authored.strategy.name ? authored.strategy : {
          name: unique,
          driver: (s, t, o, b) => {
            const agent = authored.strategy.driver(s, t, o, b);
            return {
              ...agent,
              name: unique,
              act: async (task, scope) => {
                const out = await agent.act(task, scope);
                if (out.kind !== "done") return out;
                const deliverable = {
                  ...out.deliverable,
                  mode: unique
                };
                return { ...out, deliverable };
              }
            };
          }
        };

        // Size in bits of the gzip-compressed source; computed once and
        // recorded on both the archive node and the candidate entry.
        const gzipBits = gzipSync(Buffer.from(authored.code)).length * 8;

        byName.set(unique, strategy);
        newStrategies.push(strategy);
        archive.push({
          name: unique,
          source: "authored",
          generation: g,
          parent: incumbent.name,
          gzipBits,
          file: authored.file,
          score: 0,
          usd: 0
        });
        candidates.push({
          name: unique,
          file: authored.file,
          gzipBits,
          codeChars: authored.code.length
        });
        authoredOk += 1;
      } catch (e) {
        // An author failure is recorded and the search continues.
        candidates.push({
          name: `(author-failed g${g}c${i + 1})`,
          error: e instanceof Error ? e.message.slice(0, 300) : String(e)
        });
      }
    }

    const incumbentStrategy = byName.get(incumbent.name);
    if (!incumbentStrategy) {
      throw new Error(`evolution: incumbent "${incumbent.name}" missing from the field`);
    }
    const field = [incumbentStrategy, ...newStrategies];
    const report = await bench(`gen${g}`, train, field);

    // Refresh archive scores for every strategy that took part in this round.
    for (const node of archive) {
      const cell = report.perStrategy[node.name];
      if (cell) {
        node.score = cell.score;
        node.usd = cell.usd;
      }
    }

    const champion = selectChampion(
      report,
      field.map((s) => s.name),
      policy,
      epsilon
    );
    generationRows.push({ generation: g, candidates, report, champion });
    trajectory.push({
      generation: g,
      champion: champion.name,
      score: champion.score,
      usd: champion.usd
    });
    incumbent = champion;
    latestReport = report;
  }

  if (authoredOk === 0) {
    const failure = new Error(
      "runStrategyEvolution: every author attempt failed across all generations \u2014 no search happened; see the candidates[].error entries"
    );
    // The message points at candidates[].error, so hand those entries to the
    // caller; the generation rows are otherwise lost when this throws.
    failure.candidates = generationRows.flatMap((row) => row.candidates);
    throw failure;
  }

  const holdoutTasks = await cfg.tasks(cfg.trainN + (cfg.holdoutOffset ?? 0), cfg.holdoutN);
  const finalists = [...new Set([gen0Champion.name, incumbent.name])]
    .map((n) => byName.get(n))
    .filter((s) => !!s);
  const holdout = await bench("holdout", holdoutTasks, finalists);
  const verdict = promotionGate({
    report: holdout,
    incumbent: gen0Champion.name,
    candidate: incumbent.name,
    ...(cfg.minPairedTasks !== void 0 ? { minPairedTasks: cfg.minPairedTasks } : {})
  });

  return {
    gen0,
    gen0Champion,
    generations: generationRows,
    archive,
    finalChampion: incumbent,
    holdout,
    verdict,
    trajectory
  };
}
|
|
5507
|
+
|
|
5508
|
+
// src/runtime/verifier-environment.ts
|
|
5509
|
+
// Function-tool definition for `submit_answer`, the built-in tool listed first
// by every verifier environment. It takes a single required string, `answer`.
var submitTool = {
  type: "function",
  function: {
    name: "submit_answer",
    description: "Submit your answer for evaluation. You may submit more than once \u2014 the best-scoring submission counts. Submit the COMPLETE final answer, not a fragment.",
    parameters: {
      type: "object",
      properties: {
        answer: { type: "string", description: "The complete final answer." }
      },
      required: ["answer"]
    }
  }
};
|
|
5521
|
+
/**
 * Build an environment whose artifact is a list of submitted answers, scored by
 * a caller-supplied `check`.
 *
 * @param {object} opts
 * @param {string} opts.name - Environment name; also the prefix of handle ids.
 * @param {Function} opts.check - `(task, answer) => { passes, total, errored }`
 *   (may be async); called once per recorded submission during `score`.
 * @param {Array} [opts.extraTools] - Additional tool definitions to expose.
 * @param {Function} [opts.callExtra] - `(task, name, args)` handler for the extra
 *   tools; required when `extraTools` is non-empty.
 * @returns {object} `{ name, open, tools, call, score, close }`.
 * @throws {Error} When `extraTools` is non-empty but `callExtra` is missing.
 */
function createVerifierEnvironment(opts) {
  const declaresExtraTools = Boolean(opts.extraTools?.length);
  if (declaresExtraTools && !opts.callExtra) {
    throw new Error(`${opts.name}: extraTools requires callExtra`);
  }

  // Per-handle state ({ task, submissions }) keyed by handle id.
  const states = new Map();
  let seq = 0;

  return {
    name: opts.name,

    /** Open a fresh workspace for `task` and return its handle. */
    async open(task) {
      seq += 1;
      const handle = { id: `${opts.name}-${seq}`, surface: opts.name };
      states.set(handle.id, { task, submissions: [] });
      return handle;
    },

    /** List the tools on offer: `submit_answer` first, then any extras. */
    async tools() {
      return [submitTool, ...(opts.extraTools ?? [])];
    },

    /** Dispatch a tool call; failures come back as `ERROR: ...` strings rather than throwing. */
    async call(handle, name, args) {
      const state = states.get(handle.id);
      if (!state) return "ERROR: workspace closed";

      if (name === "submit_answer") {
        const answer = String(args.answer ?? "").trim();
        if (!answer) return "ERROR: empty answer";
        state.submissions.push(answer);
        return `submission ${state.submissions.length} recorded`;
      }

      const isExtraTool = opts.callExtra && opts.extraTools?.some((t) => t.function.name === name);
      if (!isExtraTool) return `ERROR: unknown tool ${name}`;
      try {
        return await opts.callExtra(state.task, name, args);
      } catch (e) {
        return `ERROR: ${e instanceof Error ? e.message : String(e)}`;
      }
    },

    // Keep-best across submissions — the measured law (workers reach correct answers,
    // then revise past them; final-state scoring undersells every strategy).
    async score(task, handle) {
      const state = states.get(handle.id);
      let best = { passes: 0, total: 1, errored: 0 };
      if (!state || state.submissions.length === 0) return best;

      const passRatio = (s) => (s.total > 0 ? s.passes / s.total : 0);
      for (const answer of state.submissions) {
        const outcome = await opts.check(task, answer);
        // Strictly greater: the earliest submission wins ties.
        if (passRatio(outcome) > passRatio(best)) best = outcome;
      }
      return best;
    },

    /** Drop the workspace state; later calls on the handle report it closed. */
    async close(handle) {
      states.delete(handle.id);
    }
  };
}
|
|
5574
|
+
|
|
5575
|
+
// src/runtime/workspace.ts
|
|
5576
|
+
/**
 * Create a shell runner that executes a command on the local machine with
 * `child_process.execFile` (no shell interpolation).
 *
 * The returned function takes `(args, cwd)`, where `args[0]` is the binary and
 * the rest are its arguments. A failing command resolves rather than rejects:
 * `code` is 0 on success, otherwise the error's `code` (falling back to 1 when
 * the error has none). Output is decoded as UTF-8 and capped at 64 MiB per stream.
 *
 * @returns {(args: string[], cwd?: string) => Promise<{stdout: string, stderr: string, code: *}>}
 */
function localShell() {
  return async (args, cwd) => {
    const { execFile: runFile } = await import("child_process");
    const [bin, ...rest] = args;
    const execOptions = { cwd, encoding: "utf-8", maxBuffer: 64 * 1024 * 1024 };

    return new Promise((resolve) => {
      const onFinished = (err, stdout, stderr) => {
        const code = err ? err.code ?? 1 : 0;
        resolve({ stdout: stdout ?? "", stderr: stderr ?? "", code });
      };
      runFile(bin ?? "", rest, execOptions, onFinished);
    });
  };
}
|
|
5596
|
+
/**
 * Create a workspace backed by a git remote.
 *
 * Every git invocation carries a fixed commit identity and, unless
 * `opts.noHooks === false`, `core.hooksPath=/dev/null`.
 *
 * @param {object} opts
 * @param {string} opts.ref - Remote to clone from and query.
 * @param {string} [opts.branch="main"] - Branch to clone, pull and push.
 * @param {Function} [opts.shell] - `(args, cwd) => Promise<{stdout, stderr, code}>`;
 *   defaults to `localShell()`.
 * @param {boolean} [opts.noHooks] - Pass `false` to leave hooks enabled.
 * @returns {object} `{ ref, materialize(dir), commit(dir, message), head() }`.
 */
function gitWorkspace(opts) {
  const shell = opts.shell ?? localShell();
  const branch = opts.branch ?? "main";
  const hookFlags = opts.noHooks === false ? [] : ["-c", "core.hooksPath=/dev/null"];
  const identityFlags = ["-c", "user.email=workspace@tangle.local", "-c", "user.name=workspace"];

  // Run git and return the raw { stdout, stderr, code } without throwing.
  const gitRaw = (args, cwd) => shell(["git", ...hookFlags, ...identityFlags, ...args], cwd);

  // Run git and return stdout, throwing on a non-zero exit code.
  const git = async (args, cwd) => {
    const res = await gitRaw(args, cwd);
    if (res.code !== 0) {
      throw new Error(
        `git ${args.join(" ")} failed (${res.code}): ${tail(res.stderr || res.stdout)}`
      );
    }
    return res.stdout;
  };

  const currentRev = async (dir) => (await git(["rev-parse", "HEAD"], dir)).trim();

  return {
    ref: opts.ref,

    /** Clone `branch` of the remote into `dir`. */
    async materialize(dir) {
      await git(["clone", "--branch", branch, opts.ref, dir]);
    },

    /**
     * Stage everything, commit, rebase onto the remote branch and push.
     * Resolves `{ ok: true, rev }` on success (including when there was nothing
     * to commit) or `{ ok: false, conflict }` when the pull or push fails.
     */
    async commit(dir, message) {
      await git(["add", "-A"], dir);
      const pending = (await git(["status", "--porcelain"], dir)).trim();
      if (!pending) return { ok: true, rev: await currentRev(dir) };

      await git(["commit", "-m", message], dir);

      const pull = await gitRaw(["pull", "--rebase", "origin", branch], dir);
      if (pull.code !== 0) {
        // Best effort: abort the failed rebase and ignore a rejected shell call.
        await shell(["git", ...hookFlags, "rebase", "--abort"], dir).catch(() => {
        });
        return { ok: false, conflict: tail(pull.stderr || pull.stdout) };
      }

      const push = await gitRaw(["push", "origin", branch], dir);
      if (push.code !== 0) return { ok: false, conflict: tail(push.stderr || push.stdout) };

      return { ok: true, rev: await currentRev(dir) };
    },

    /** First whitespace-separated token of `git ls-remote` for the branch ("" if none). */
    async head() {
      const listing = await git(["ls-remote", opts.ref, `refs/heads/${branch}`]);
      const [sha = ""] = listing.split(/\s+/);
      return sha;
    }
  };
}
|
|
5635
|
+
/**
 * Create a workspace driven by Jujutsu (`jj`) on top of a git remote.
 *
 * Every jj invocation carries a fixed user name and e-mail via `--config-toml`.
 *
 * @param {object} opts
 * @param {string} opts.ref - Git remote to clone from and query.
 * @param {string} [opts.branch="main"] - Branch to push and to resolve in `head`.
 * @param {Function} [opts.shell] - `(args, cwd) => Promise<{stdout, stderr, code}>`;
 *   defaults to `localShell()`.
 * @returns {object} `{ ref, materialize(dir), commit(dir, message), head() }`.
 */
function jjWorkspace(opts) {
  const shell = opts.shell ?? localShell();
  const branch = opts.branch ?? "main";
  const identityFlags = [
    "--config-toml",
    'user.name="workspace"',
    "--config-toml",
    'user.email="workspace@tangle.local"'
  ];

  // Run jj and return the raw { stdout, stderr, code } without throwing.
  const jjRaw = (args, cwd) => shell(["jj", ...identityFlags, ...args], cwd);

  // Run jj and return stdout, throwing on a non-zero exit code.
  const jj = async (args, cwd) => {
    const res = await jjRaw(args, cwd);
    if (res.code !== 0) {
      throw new Error(
        `jj ${args.join(" ")} failed (${res.code}): ${tail(res.stderr || res.stdout)}`
      );
    }
    return res.stdout;
  };

  return {
    ref: opts.ref,

    // Colocated clone: jj manages history, git holds the durable remote.
    async materialize(dir) {
      await jj(["git", "clone", "--colocate", opts.ref, dir]);
    },

    /**
     * Describe the working-copy change, start a new one, and push the branch.
     * Resolves `{ ok: true, rev }` with the commit id of `@-`, or
     * `{ ok: false, conflict }` when the push fails.
     */
    async commit(dir, message) {
      await jj(["describe", "-m", message], dir);
      await jj(["new"], dir);

      const push = await jjRaw(["git", "push", "--branch", branch], dir);
      if (push.code !== 0) return { ok: false, conflict: tail(push.stderr || push.stdout) };

      const logged = await jj(["log", "--no-graph", "-r", "@-", "-T", "commit_id"], dir);
      return { ok: true, rev: logged.trim() };
    },

    /**
     * First whitespace-separated token of `git ls-remote` for the branch.
     * The exit code is not checked, so a failed lookup yields whatever stdout held.
     */
    async head() {
      const listing = await shell(["git", "ls-remote", opts.ref, `refs/heads/${branch}`]);
      const [sha = ""] = listing.stdout.split(/\s+/);
      return sha;
    }
  };
}
|
|
5672
|
+
/**
 * Return at most the last 400 characters of `s`, keeping error messages built
 * from command output short.
 *
 * @param {string} s - Text to truncate from the front.
 * @returns {string} `s` unchanged when it has 400 characters or fewer.
 */
function tail(s) {
  const keep = 400;
  return s.slice(-keep);
}
|
|
5675
|
+
|
|
4031
5676
|
export {
|
|
4032
5677
|
contentAddress,
|
|
4033
5678
|
InMemoryResultBlobStore,
|
|
@@ -4036,6 +5681,8 @@ export {
|
|
|
4036
5681
|
FileSpawnJournal,
|
|
4037
5682
|
replaySpawnTree,
|
|
4038
5683
|
materializeTreeView,
|
|
5684
|
+
defaultAuditorInstruction,
|
|
5685
|
+
auditIntent,
|
|
4039
5686
|
completionAuthorizes,
|
|
4040
5687
|
stopSentinel,
|
|
4041
5688
|
sentinelCompletion,
|
|
@@ -4045,6 +5692,10 @@ export {
|
|
|
4045
5692
|
buildSteerContext,
|
|
4046
5693
|
createDriver,
|
|
4047
5694
|
renderAnalyses,
|
|
5695
|
+
defaultAnalystInstruction,
|
|
5696
|
+
observe,
|
|
5697
|
+
renderReport,
|
|
5698
|
+
harvestCorpus,
|
|
4048
5699
|
inlineSandboxClient,
|
|
4049
5700
|
reportLoopUsage,
|
|
4050
5701
|
defineRuntimeHooks,
|
|
@@ -4059,6 +5710,7 @@ export {
|
|
|
4059
5710
|
createSandboxForSpec,
|
|
4060
5711
|
defaultSelectWinner,
|
|
4061
5712
|
loopDispatch,
|
|
5713
|
+
createMcpEnvironment,
|
|
4062
5714
|
createScope,
|
|
4063
5715
|
settledToIteration,
|
|
4064
5716
|
pipeline,
|
|
@@ -4084,6 +5736,26 @@ export {
|
|
|
4084
5736
|
runPersonified,
|
|
4085
5737
|
trajectoryReport,
|
|
4086
5738
|
equalKOnCost,
|
|
4087
|
-
|
|
5739
|
+
promotionGate,
|
|
5740
|
+
depthDriver,
|
|
5741
|
+
breadthDriver,
|
|
5742
|
+
sample,
|
|
5743
|
+
refine,
|
|
5744
|
+
defineStrategy,
|
|
5745
|
+
adaptiveRefine,
|
|
5746
|
+
sampleThenRefine,
|
|
5747
|
+
runAgentic,
|
|
5748
|
+
runBenchmark,
|
|
5749
|
+
printBenchmarkReport,
|
|
5750
|
+
openSandboxRun,
|
|
5751
|
+
strategyAuthorContract,
|
|
5752
|
+
assertStrategyContract,
|
|
5753
|
+
authorStrategy,
|
|
5754
|
+
selectChampion,
|
|
5755
|
+
runStrategyEvolution,
|
|
5756
|
+
createVerifierEnvironment,
|
|
5757
|
+
localShell,
|
|
5758
|
+
gitWorkspace,
|
|
5759
|
+
jjWorkspace
|
|
4088
5760
|
};
|
|
4089
|
-
//# sourceMappingURL=chunk-
|
|
5761
|
+
//# sourceMappingURL=chunk-IW2LMLK6.js.map
|