@tangle-network/agent-eval 0.65.0 → 0.66.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +12 -0
- package/dist/adapters/otel.d.ts +1 -1
- package/dist/campaign/index.d.ts +4 -3
- package/dist/campaign/index.js +18 -19
- package/dist/campaign/index.js.map +1 -1
- package/dist/{chunk-7TPYV2ER.js → chunk-6XQIEUQ2.js} +140 -7
- package/dist/chunk-6XQIEUQ2.js.map +1 -0
- package/dist/{chunk-HKINEDRZ.js → chunk-DFS3FEXO.js} +3 -2
- package/dist/chunk-DFS3FEXO.js.map +1 -0
- package/dist/{chunk-4ODZXQV2.js → chunk-Q56RRLEC.js} +635 -2
- package/dist/chunk-Q56RRLEC.js.map +1 -0
- package/dist/chunk-RDK3P4JE.js +482 -0
- package/dist/chunk-RDK3P4JE.js.map +1 -0
- package/dist/contract/index.d.ts +10 -8
- package/dist/contract/index.js +11 -12
- package/dist/contract/index.js.map +1 -1
- package/dist/hosted/index.d.ts +1 -1
- package/dist/hosted/index.js +1 -1
- package/dist/{index-CzhtwYBT.d.ts → index-DSEHMwvS.d.ts} +4 -2
- package/dist/index.d.ts +246 -3
- package/dist/index.js +292 -2
- package/dist/index.js.map +1 -1
- package/dist/openapi.json +1 -1
- package/dist/provenance-BZUFC1_D.d.ts +292 -0
- package/dist/{registry-DPly4_hZ.d.ts → registry-BzAEvqAt.d.ts} +1 -1
- package/dist/{run-campaign-5J3ED2UJ.js → run-campaign-BVY3RGAZ.js} +2 -3
- package/dist/{provenance-lqyLpOYR.d.ts → run-improvement-loop-BKpM5T4t.d.ts} +51 -329
- package/package.json +1 -1
- package/dist/chunk-4ODZXQV2.js.map +0 -1
- package/dist/chunk-7TPYV2ER.js.map +0 -1
- package/dist/chunk-CZRKD2X2.js +0 -1104
- package/dist/chunk-CZRKD2X2.js.map +0 -1
- package/dist/chunk-E22YUOAL.js +0 -111
- package/dist/chunk-E22YUOAL.js.map +0 -1
- package/dist/chunk-HKINEDRZ.js.map +0 -1
- /package/dist/{run-campaign-5J3ED2UJ.js.map → run-campaign-BVY3RGAZ.js.map} +0 -0
package/dist/chunk-CZRKD2X2.js
DELETED
|
@@ -1,1104 +0,0 @@
|
|
|
1
|
-
import {
|
|
2
|
-
runCampaign
|
|
3
|
-
} from "./chunk-7TPYV2ER.js";
|
|
4
|
-
import {
|
|
5
|
-
buildReflectionPrompt,
|
|
6
|
-
paretoFrontier,
|
|
7
|
-
parseReflectionResponse,
|
|
8
|
-
runCanaries,
|
|
9
|
-
scoreRedTeamOutput
|
|
10
|
-
} from "./chunk-4ODZXQV2.js";
|
|
11
|
-
import {
|
|
12
|
-
summarizeBackendIntegrity
|
|
13
|
-
} from "./chunk-E22YUOAL.js";
|
|
14
|
-
import {
|
|
15
|
-
detectRewardHacking
|
|
16
|
-
} from "./chunk-YV7J7X5N.js";
|
|
17
|
-
import {
|
|
18
|
-
callLlm
|
|
19
|
-
} from "./chunk-IHDHUN2X.js";
|
|
20
|
-
|
|
21
|
-
// src/campaign/auto-pr.ts
|
|
22
|
-
import { execSync } from "child_process";
|
|
23
|
-
import { writeFileSync } from "fs";
|
|
24
|
-
import { tmpdir } from "os";
|
|
25
|
-
import { join } from "path";
|
|
26
|
-
/**
 * Open a GitHub pull request for a campaign result, but only when the gate
 * verdict is "ship".
 *
 * The rendered PR body is always written to a temp file (also in dry-run
 * mode, so the would-be body can be inspected). Dry-run defaults to "on"
 * whenever the GH_AUTO_PR_TOKEN environment variable is unset or empty.
 *
 * @param {object} options
 * @param {object} options.gate - Gate result; `decision` must be "ship".
 * @param {object} options.result - Campaign result (reads `manifestHash`).
 * @param {string} options.promotedDiff - Diff text embedded in the PR body.
 * @param {string} options.ghOwner - Repository owner.
 * @param {string} options.ghRepo - Repository name.
 * @param {boolean} [options.dryRun] - Force or disable dry-run.
 * @param {string} [options.branch] - Head branch; derived from the manifest hash when absent.
 * @param {string} [options.title] - PR title; derived from the manifest hash when absent.
 * @param {Function} [options.ghExec] - Injectable `gh` runner returning `{ stdout, stderr, status }`.
 * @returns {{opened: boolean, dryRun: boolean, reason: string, prUrl?: string}}
 */
function openAutoPr(options) {
  const { gate, result: campaignResult } = options;
  const notOpened = (reason, wasDryRun) => ({ opened: false, dryRun: wasDryRun, reason });

  if (gate.decision !== "ship") {
    return notOpened(`gate verdict was "${gate.decision}" \u2014 refusing to open PR`, false);
  }

  const isDryRun = options.dryRun ?? !process.env.GH_AUTO_PR_TOKEN;
  const headBranch = options.branch ?? `auto/${campaignResult.manifestHash.slice(0, 12)}`;
  const prTitle = options.title ?? `auto: campaign ${campaignResult.manifestHash.slice(0, 8)} promoted by gate`;

  // Persist the body first: `gh` reads it via --body-file, and dry-runs point at it.
  const markdown = renderPrBody(campaignResult, gate, options.promotedDiff);
  const bodyPath = join(tmpdir(), `auto-pr-body-${Date.now()}.md`);
  writeFileSync(bodyPath, markdown);

  if (isDryRun) {
    return notOpened(
      `dry-run (GH_AUTO_PR_TOKEN not set). Would create PR on ${options.ghOwner}/${options.ghRepo} branch ${headBranch}. Body at ${bodyPath}.`,
      true
    );
  }

  const runGh = options.ghExec ?? defaultGhExec;
  const ghArgs = [
    "pr",
    "create",
    "--repo",
    `${options.ghOwner}/${options.ghRepo}`,
    "--head",
    headBranch,
    "--title",
    prTitle,
    "--body-file",
    bodyPath
  ];
  const outcome = runGh(ghArgs);

  if (outcome.status !== 0) {
    return notOpened(`gh pr create failed (exit ${outcome.status}): ${outcome.stderr.slice(0, 400)}`, false);
  }

  // `gh pr create` prints the new PR's URL on stdout.
  return { opened: true, prUrl: outcome.stdout.trim(), dryRun: false, reason: "PR opened" };
}
|
|
70
|
-
/**
 * Render the markdown body of an automated promotion PR.
 *
 * Sections, in order: run summary, gate verdict with reasons, a table of
 * contributing gates, the promoted diff (first 8000 characters) and a table
 * of per-judge aggregates.
 *
 * @param {object} result - Campaign result (`manifestHash`, `seed`, `durationMs`, `aggregates`).
 * @param {object} gate - Gate result (`decision`, `reasons`, `contributingGates`, optional `delta`).
 * @param {string} diff - Diff of the promoted surface.
 * @returns {string} Markdown text joined with "\n".
 */
function renderPrBody(result, gate, diff) {
  const { aggregates } = result;

  const summary = [
    "## Automated promotion by `runImprovementLoop`",
    "",
    `**Manifest**: \`${result.manifestHash}\``,
    `**Seed**: ${result.seed}`,
    `**Duration**: ${Math.round(result.durationMs / 1e3)}s`,
    `**Cells**: executed ${aggregates.cellsExecuted}, cached ${aggregates.cellsCached}, skipped ${aggregates.cellsSkipped}, failed ${aggregates.cellsFailed}`,
    `**Total spend**: $${aggregates.totalCostUsd.toFixed(2)}`,
    ""
  ];

  const verdict = [
    `### Gate verdict: \`${gate.decision}\``,
    "",
    ...Array.from(gate.reasons, (reason) => `- ${reason}`),
    ...(gate.delta !== void 0 ? [`- delta: ${gate.delta.toFixed(3)}`] : []),
    ""
  ];

  // Each detail cell is capped at 80 characters to keep the table readable.
  const gateRows = Array.from(gate.contributingGates, (entry) => {
    const raw = typeof entry.detail === "object" ? JSON.stringify(entry.detail) : String(entry.detail);
    return `| ${entry.name} | ${entry.passed ? "\u2713" : "\u2717"} | ${raw.slice(0, 80)} |`;
  });
  const gatesTable = ["### Contributing gates", "", "| gate | passed | detail |", "|---|---|---|", ...gateRows, ""];

  const surface = ["### Promoted surface", "", "```diff", diff.slice(0, 8e3), "```", ""];

  const judgeRows = Object.entries(aggregates.byJudge).map(
    ([judge, stats]) =>
      `| ${judge} | ${stats.mean.toFixed(3)} | [${stats.ci95[0].toFixed(3)}, ${stats.ci95[1].toFixed(3)}] | ${stats.n} |`
  );
  const judgesTable = ["### By-judge aggregates", "", "| judge | mean | ci95 | n |", "|---|---|---|---|", ...judgeRows];

  return [...summary, ...verdict, ...gatesTable, ...surface, ...judgesTable].join("\n");
}
|
|
113
|
-
/**
 * Default `gh` runner: executes the GitHub CLI synchronously through a shell.
 *
 * GH_TOKEN for the child process is taken from GH_AUTO_PR_TOKEN, then the
 * existing GH_TOKEN, then the empty string. Failures never throw; they are
 * reported through the returned `status` / `stderr`.
 *
 * @param {string[]} args - Arguments passed to `gh`, each quoted via `quoteArg`.
 * @returns {{stdout: string, stderr: string, status: number}}
 */
function defaultGhExec(args) {
  try {
    // Quoting stays inside the try so a bad argument is reported as a failed run.
    const commandLine = `gh ${args.map((arg) => quoteArg(arg)).join(" ")}`;
    const childEnv = {
      ...process.env,
      GH_TOKEN: process.env.GH_AUTO_PR_TOKEN ?? process.env.GH_TOKEN ?? ""
    };
    const output = execSync(commandLine, { env: childEnv, stdio: ["ignore", "pipe", "pipe"] });
    return { stdout: output.toString("utf8"), stderr: "", status: 0 };
  } catch (err) {
    // execSync attaches stdout/stderr/status to the thrown error when the command ran.
    const failure = err;
    const decode = (buffer) => buffer?.toString("utf8") ?? "";
    return {
      stdout: decode(failure.stdout),
      stderr: decode(failure.stderr),
      status: failure.status ?? 1
    };
  }
}
|
|
129
|
-
/**
 * Quote a single argument for inclusion in a shell command line.
 *
 * Arguments made only of "safe" characters are returned unchanged. Anything
 * else is wrapped in double quotes. On POSIX shells the characters that stay
 * special inside double quotes (backslash, double quote, `$` and backtick)
 * are backslash-escaped, so values such as PR titles cannot trigger command
 * or variable substitution. On Windows only the double quote is escaped,
 * because backslash is not a shell escape character there.
 *
 * @param {string} arg - Raw argument value.
 * @returns {string} The argument, quoted when necessary.
 */
function quoteArg(arg) {
  if (/^[a-zA-Z0-9_/\-:.@]+$/.test(arg)) return arg;
  if (process.platform === "win32") {
    return `"${arg.replace(/"/g, '\\"')}"`;
  }
  const escaped = arg.replace(/[\\"$`]/g, "\\$&");
  return `"${escaped}"`;
}
|
|
133
|
-
|
|
134
|
-
// src/campaign/drivers/evolutionary.ts
|
|
135
|
-
/**
 * Wrap a mutator as an optimization driver.
 *
 * The driver's `kind` is `evolutionary:<mutator kind>`. `propose` forwards
 * to `opts.mutator.mutate`, using the findings from the call context when
 * there are any, otherwise `opts.findings` (or an empty list).
 *
 * @param {object} opts
 * @param {{kind: string, mutate: Function}} opts.mutator - Mutator to delegate to.
 * @param {Array} [opts.findings] - Fallback findings used when the context has none.
 * @returns {{kind: string, propose: Function}}
 */
function evolutionaryDriver(opts) {
  const propose = async (ctx) => {
    const { currentSurface, findings, populationSize, signal } = ctx;
    const effectiveFindings = findings.length > 0 ? findings : opts.findings ?? [];
    return opts.mutator.mutate({
      findings: effectiveFindings,
      currentSurface,
      populationSize,
      signal
    });
  };

  return { kind: `evolutionary:${opts.mutator.kind}`, propose };
}
|
|
148
|
-
|
|
149
|
-
// src/campaign/drivers/gepa.ts
|
|
150
|
-
// System prompt for the reflection call: the model must answer with JSON only.
var REFLECTION_SYSTEM = 'You are an expert prompt engineer. Output ONLY a JSON object of shape {"proposals":[{"label":string,"rationale":string,"payload":string}]} where each `payload` is the FULL improved surface text. No prose outside the JSON.';
// System prompt for the merge call that combines several frontier versions into one.
var COMBINE_SYSTEM = 'You are an expert prompt engineer performing a GEPA "combine complementary lessons" merge. You are given several non-dominated versions of one surface; each is uniquely best on different scenarios. Produce ONE new version that keeps what makes each version strong on its winning scenarios and resolves conflicts in favor of the more general rule. Output ONLY a JSON object of shape {"proposals":[{"label":string,"rationale":string,"payload":string}]} with exactly one proposal whose `payload` is the FULL merged surface text. No prose outside the JSON.';

/**
 * Build a "gepa" driver that proposes new surface texts through an LLM.
 *
 * Each `propose` call does up to two LLM calls:
 *  1. When combining is enabled and more than one string-valued Pareto parent
 *     is available, it asks for ONE merged version of the top parents.
 *  2. It asks for reflection proposals to fill the remaining population slots.
 *
 * A proposal is dropped when it is empty, equal to the parent, a duplicate,
 * misses a required `##` section, or exceeds twice `maxSentenceEdits`
 * according to `countSentenceEdits`.
 *
 * @param {object} opts
 * @param {string} opts.model - Model passed to `callLlm`.
 * @param {string} opts.target - Description of the surface being optimized.
 * @param {object} [opts.llm] - Second argument forwarded to `callLlm`.
 * @param {number} [opts.evidenceK=3] - Evidence items per side.
 * @param {boolean} [opts.combineParents=true] - Enable the merge step.
 * @param {number} [opts.combineMaxParents=4] - Maximum parents merged at once.
 * @param {object} [opts.constraints] - Optional `preserveSections` / `maxSentenceEdits`.
 * @param {number} [opts.temperature=0.7]
 * @param {number} [opts.maxTokens=6000]
 * @param {*} [opts.mutationPrimitives] - Forwarded to `buildReflectionPrompt`.
 * @returns {{kind: "gepa", propose: Function}}
 * @throws {Error} When combining is enabled with `combineMaxParents < 1`.
 */
function gepaDriver(opts) {
  const evidenceK = opts.evidenceK ?? 3;
  const mergeEnabled = opts.combineParents ?? true;
  const mergeCap = opts.combineMaxParents ?? 4;
  if (mergeEnabled && mergeCap < 1) {
    throw new Error("gepaDriver: combineMaxParents must be >= 1 when combineParents is enabled");
  }

  // Both LLM calls share everything except the two prompts.
  const askModel = (systemPrompt, userPrompt) =>
    callLlm(
      {
        model: opts.model,
        messages: [
          { role: "system", content: systemPrompt },
          { role: "user", content: userPrompt }
        ],
        jsonMode: true,
        temperature: opts.temperature ?? 0.7,
        maxTokens: opts.maxTokens ?? 6e3
      },
      opts.llm
    );

  return {
    kind: "gepa",
    async propose(ctx) {
      const parentText = typeof ctx.currentSurface === "string" ? ctx.currentSurface : JSON.stringify(ctx.currentSurface);
      const constraints = opts.constraints;

      // An explicitly empty `preserveSections` means "preserve every H2 of the parent".
      let requiredSections = null;
      if (constraints?.preserveSections !== void 0) {
        requiredSections = constraints.preserveSections.length === 0 ? extractH2Sections(parentText) : constraints.preserveSections;
      }
      const editBudget = constraints?.maxSentenceEdits;

      const accepted = [];
      const seenTexts = /* @__PURE__ */ new Set();
      const accept = (payload, label, rationale) => {
        const text = typeof payload === "string" ? payload.trim() : "";
        const isNovel = Boolean(text) && text !== parentText && !seenTexts.has(text);
        if (!isNovel) return;
        if (requiredSections && !validatePreservedSections(text, requiredSections)) return;
        if (editBudget !== void 0 && countSentenceEdits(parentText, text) > editBudget * 2) return;
        seenTexts.add(text);
        accepted.push({ surface: text, label, rationale });
      };

      // Step 1: merge the best string-valued frontier parents, if there are at least two.
      const frontier = mergeEnabled ? ctx.paretoParents ?? [] : [];
      const mergeParents = frontier
        .filter((p) => typeof p.surface === "string")
        .sort((a, b) => b.composite - a.composite)
        .slice(0, mergeCap);
      if (mergeParents.length > 1) {
        const mergePrompt = buildCombinePrompt({
          target: opts.target,
          parents: mergeParents,
          evidenceK
        });
        const mergeReply = await askModel(COMBINE_SYSTEM, mergePrompt);
        const merged = parseReflectionResponse(mergeReply.content, 1)[0];
        if (merged) {
          accept(
            merged.payload,
            merged.label || "pareto-combine",
            merged.rationale || `combined ${mergeParents.length} non-dominated parents (gen ${mergeParents.map((p) => p.generation).join(",")})`
          );
        }
      }

      // Step 2: fill whatever population slots remain with reflection proposals.
      const slotsLeft = Math.max(0, ctx.populationSize - accepted.length);
      if (slotsLeft > 0) {
        const { top, bottom, target } = buildEvidence(ctx, evidenceK, opts.target);
        const reflectionPrompt = buildReflectionPrompt({
          target,
          parentPayload: parentText,
          topTrials: top,
          bottomTrials: bottom,
          childCount: slotsLeft,
          mutationPrimitives: opts.mutationPrimitives
        });
        const reflectionReply = await askModel(REFLECTION_SYSTEM, reflectionPrompt);
        for (const proposal of parseReflectionResponse(reflectionReply.content, slotsLeft)) {
          accept(proposal.payload, proposal.label, proposal.rationale);
        }
      }

      return accepted.slice(0, ctx.populationSize);
    }
  };
}
|
|
237
|
-
/**
 * Build the user prompt for the "combine complementary lessons" merge call.
 *
 * Each parent is shown as "Version A", "Version B", ... with its mean score,
 * its `evidenceK` highest-scoring objectives and its full surface text.
 *
 * @param {object} args
 * @param {string} args.target - Description of the surface being merged.
 * @param {Array<{surface: string, composite: number, objectives: Object<string, number>}>} args.parents
 * @param {number} args.evidenceK - How many top objectives to list per parent.
 * @returns {string} Prompt text joined with "\n".
 */
function buildCombinePrompt(args) {
  const intro = [
    `You are merging ${args.parents.length} versions of: ${args.target}.`,
    "",
    "Each version is on the Pareto frontier \u2014 none dominates the others; each",
    "wins on different scenarios. Combine their complementary strengths into",
    "ONE version. Below, each version lists the scenarios it scores highest on.",
    ""
  ];

  const versions = args.parents.flatMap((parent, index) => {
    const letter = String.fromCharCode(65 + index); // 65 is "A"
    const strongest = Object.entries(parent.objectives)
      .sort((left, right) => right[1] - left[1])
      .slice(0, args.evidenceK)
      .map(([scenarioId, score]) => `${scenarioId} (${score.toFixed(2)})`)
      .join(", ");
    return [
      `### Version ${letter} (mean ${parent.composite.toFixed(2)}; strongest on: ${strongest || "n/a"})`,
      "```",
      parent.surface,
      "```",
      ""
    ];
  });

  const instructions = [
    "Return ONE merged version that would score well on the union of every",
    "version's winning scenarios. Keep each version's specific winning rule;",
    "where two rules conflict, prefer the more general one and note the choice",
    "in your rationale."
  ];

  return [...intro, ...versions, ...instructions].join("\n");
}
|
|
265
|
-
/**
 * Collect the titles of all markdown level-2 headings (`## Title`) in a text.
 *
 * Lines are examined one at a time; trailing whitespace is not part of the
 * title. Deeper headings (`###`) and `##` without a following space are ignored.
 *
 * @param {string} text - Markdown source.
 * @returns {string[]} Heading titles in document order.
 */
function extractH2Sections(text) {
  const headingPattern = /^##\s+(.+?)\s*$/;
  return text.split("\n").flatMap((line) => {
    const found = headingPattern.exec(line);
    return found ? [found[1]] : [];
  });
}
|
|
273
|
-
/**
 * Count how many distinct sentences differ between two texts.
 *
 * Both texts are split on newlines and on whitespace following `.`, `!` or
 * `?`; pieces are trimmed and empty ones dropped. The result is the size of
 * the symmetric difference of the two sentence sets, so a reworded sentence
 * counts twice (one removal plus one addition).
 *
 * @param {string} baseline - Original text.
 * @param {string} candidate - Edited text.
 * @returns {number} Number of sentences present in exactly one of the texts.
 */
function countSentenceEdits(baseline, candidate) {
  const toSentenceSet = (text) => {
    const sentences = new Set();
    for (const piece of text.split(/(?<=[.!?])\s+|\n/g)) {
      const sentence = piece.trim();
      if (sentence.length > 0) sentences.add(sentence);
    }
    return sentences;
  };

  const before = toSentenceSet(baseline);
  const after = toSentenceSet(candidate);
  const removed = [...before].filter((sentence) => !after.has(sentence)).length;
  const added = [...after].filter((sentence) => !before.has(sentence)).length;
  return removed + added;
}
|
|
282
|
-
/**
 * Check that a candidate text still contains every required `##` heading.
 *
 * @param {string} candidate - Candidate markdown text.
 * @param {string[]} required - Heading titles that must be present.
 * @returns {boolean} True when nothing is required or all titles are found.
 */
function validatePreservedSections(candidate, required) {
  if (required.length === 0) return true;
  const present = new Set(extractH2Sections(candidate));
  return [...required].every((title) => present.has(title));
}
|
|
290
|
-
/**
 * Derive reflection evidence from the most recent generation in `ctx.history`.
 *
 * Picks the best candidate (highest composite) of the last generation and
 * returns its `evidenceK` best and worst scenarios, plus a target string
 * annotated with its (up to three) weakest dimensions.
 *
 * A non-positive `evidenceK` yields empty `top` and `bottom` lists. The
 * `bottom` slice is guarded explicitly because `slice(-0)` would otherwise
 * return every scenario.
 *
 * @param {{history: Array<{candidates: Array}>}} ctx - Proposal context.
 * @param {number} evidenceK - Number of scenarios per side.
 * @param {string} baseTarget - Target description to annotate.
 * @returns {{top: Array<{id: string, score: number}>, bottom: Array<{id: string, score: number}>, target: string}}
 */
function buildEvidence(ctx, evidenceK, baseTarget) {
  const last = ctx.history.at(-1);
  if (!last || last.candidates.length === 0) {
    return { top: [], bottom: [], target: baseTarget };
  }
  const best = [...last.candidates].sort((a, b) => b.composite - a.composite)[0];
  if (!best) return { top: [], bottom: [], target: baseTarget };

  const byScore = [...best.scenarios].sort((a, b) => b.composite - a.composite);
  const toTrace = (s) => ({
    id: s.scenarioId,
    score: s.composite
  });

  const k = Math.max(0, evidenceK);
  const top = byScore.slice(0, k).map(toTrace);
  // Worst scenarios, listed worst-first.
  const bottom = k > 0 ? byScore.slice(-k).reverse().map(toTrace) : [];

  const weakest = Object.entries(best.dimensions)
    .sort((a, b) => a[1] - b[1])
    .slice(0, 3)
    .map(([dim, value]) => `${dim} (${value.toFixed(2)})`);
  const target = weakest.length > 0 ? `${baseTarget} \u2014 weakest dimensions: ${weakest.join(", ")}` : baseTarget;
  return { top, bottom, target };
}
|
|
308
|
-
|
|
309
|
-
// src/campaign/gates/compose.ts
|
|
310
|
-
/**
 * Combine several gates into one.
 *
 * All gates are evaluated in order (sequentially). The combined decision is
 * "ship" only if every gate says "ship"; otherwise the first of
 * "arch_ceiling", "model_ceiling", "hold" that any gate returned, or
 * "need_more_work" if none of those appear. Reasons are prefixed with the
 * originating gate's name, and `delta` is taken from the first gate.
 *
 * @param {...{name: string, decide: Function}} gates - Gates to combine.
 * @returns {{name: string, decide: Function}}
 * @throws {Error} When called without any gate.
 */
function composeGate(...gates) {
  if (gates.length === 0) {
    throw new Error("composeGate requires at least one gate");
  }

  const blockingPriority = ["arch_ceiling", "model_ceiling", "hold"];
  const combineDecisions = (decisions) => {
    if (decisions.every((d) => d === "ship")) return "ship";
    return blockingPriority.find((verdict) => decisions.includes(verdict)) ?? "need_more_work";
  };

  return {
    name: `composed(${gates.map((g) => g.name).join(",")})`,
    async decide(ctx) {
      const outcomes = [];
      for (const gate of gates) {
        outcomes.push({ gate, res: await gate.decide(ctx) });
      }

      // A gate that reports no contributing gates of its own is listed as a single entry.
      const contributingGates = outcomes.flatMap(({ gate, res }) =>
        res.contributingGates.length > 0
          ? res.contributingGates
          : [{ name: gate.name, passed: res.decision === "ship", detail: res }]
      );
      const reasons = outcomes.flatMap(({ gate, res }) => res.reasons.map((reason) => `[${gate.name}] ${reason}`));

      return {
        decision: combineDecisions(outcomes.map(({ res }) => res.decision)),
        reasons,
        contributingGates,
        delta: outcomes[0]?.res.delta
      };
    }
  };
}
|
|
339
|
-
|
|
340
|
-
// src/campaign/gates/default-production-gate.ts
|
|
341
|
-
/**
 * Build the default production promotion gate.
 *
 * `decide` runs five checks, in this order, and ships only if all pass:
 *  - "heldout-delta": candidate mean composite minus baseline mean composite
 *    on the holdout scenarios must reach `deltaThreshold` (default 0.5).
 *  - "budget": candidate + baseline cost must not exceed `budgetUsd` (if set).
 *  - "red-team": no red-team finding on candidate artifacts (if a battery is set).
 *  - "reward-hacking": with at least 10 `recentRuns`, no finding of severity
 *    >= 0.6 and no "gaming" verdict (unless `blockOnRewardHackingGaming` is false).
 *  - "canary": with at least 10 `recentRuns`, no canary alert of severity "error".
 *
 * @param {object} options
 * @param {Array<{id: string}>} options.holdoutScenarios
 * @param {number} [options.deltaThreshold=0.5]
 * @param {number} [options.budgetUsd]
 * @param {Array} [options.redTeamBattery]
 * @param {Array} [options.recentRuns]
 * @param {boolean} [options.blockOnRewardHackingGaming=true]
 * @returns {{name: string, decide: Function}} Gate whose decision is "ship" or "hold".
 */
function defaultProductionGate(options) {
  const deltaThreshold = options.deltaThreshold ?? 0.5;
  const gamingBlocks = options.blockOnRewardHackingGaming ?? true;
  const GAMING_SEVERITY = 0.6;
  const MIN_RECENT_RUNS = 10;

  async function decide(ctx) {
    const reasons = [];
    const contributingGates = [];
    // Registers one check; the failure message is built lazily, only when needed.
    const record = (name, passed, detail, describeFailure) => {
      contributingGates.push({ name, passed, detail });
      if (!passed) reasons.push(describeFailure());
    };
    const hasEnoughRuns = () => options.recentRuns && options.recentRuns.length >= MIN_RECENT_RUNS;

    // 1. Held-out improvement over the baseline.
    const baselineComposite = meanComposite(
      ctx.baselineArtifacts,
      ctx.baselineJudgeScores ?? ctx.judgeScores,
      options.holdoutScenarios
    );
    const candidateComposite = meanComposite(ctx.candidateArtifacts, ctx.judgeScores, options.holdoutScenarios);
    const delta = candidateComposite - baselineComposite;
    record(
      "heldout-delta",
      delta >= deltaThreshold,
      { baselineComposite, candidateComposite, delta, deltaThreshold },
      () => `heldout delta ${delta.toFixed(3)} < threshold ${deltaThreshold}`
    );

    // 2. Spend versus budget.
    const withinBudget = options.budgetUsd === void 0 || ctx.cost.candidate + ctx.cost.baseline <= options.budgetUsd;
    record(
      "budget",
      withinBudget,
      {
        candidateUsd: ctx.cost.candidate,
        baselineUsd: ctx.cost.baseline,
        budgetUsd: options.budgetUsd
      },
      () => `spend ${(ctx.cost.candidate + ctx.cost.baseline).toFixed(2)} > budget ${options.budgetUsd}`
    );

    // 3. Red-team probe (skipped, and treated as passed, without a battery).
    const redTeam = options.redTeamBattery
      ? probeRedTeam(ctx.candidateArtifacts, options.redTeamBattery)
      : { passed: true, findings: [] };
    record(
      "red-team",
      redTeam.passed,
      {
        failures: redTeam.findings.length,
        sample: redTeam.findings.slice(0, 3)
      },
      () => `red-team probe failed (${redTeam.findings.length} findings)`
    );

    // 4. Reward-hacking detector (only with enough recent runs).
    const rewardHackingReport = hasEnoughRuns() ? detectRewardHacking({ runs: options.recentRuns }) : null;
    const gamingFindings = (rewardHackingReport?.findings ?? []).filter((f) => f.severity >= GAMING_SEVERITY);
    const rewardHackingPass =
      !rewardHackingReport ||
      !gamingBlocks ||
      (gamingFindings.length === 0 && rewardHackingReport.verdict !== "gaming");
    record(
      "reward-hacking",
      rewardHackingPass,
      { report: rewardHackingReport, gamingFindingCount: gamingFindings.length },
      () =>
        `reward-hacking detector flagged ${gamingFindings.length} gaming-severity findings (verdict=${rewardHackingReport.verdict})`
    );

    // 5. Canary alerts (only with enough recent runs).
    const canaryReport = hasEnoughRuns() ? runCanaries(options.recentRuns, {}) : null;
    const errorAlerts = (canaryReport?.alerts ?? []).filter((a) => a.severity === "error");
    record(
      "canary",
      errorAlerts.length === 0,
      { totalAlerts: canaryReport?.alerts.length ?? 0, errorAlerts: errorAlerts.length },
      () => `canary error alerts: ${errorAlerts.length}`
    );

    const decision = contributingGates.every((c) => c.passed) ? "ship" : "hold";
    return {
      decision,
      reasons: reasons.length > 0 ? reasons : ["all gates passed"],
      contributingGates,
      delta
    };
  }

  return { name: "defaultProductionGate", decide };
}
|
|
440
|
-
/**
 * Mean composite score over the cells that belong to the given scenarios.
 *
 * A cell's score is the mean of its judges' `composite` values; the result is
 * the mean of those cell scores. The scenario id of a cell is the part of the
 * cell id before the first ":". `artifacts` is only used as an emptiness check.
 *
 * @param {{size: number}|null|undefined} artifacts - Artifact collection; 0 is returned when missing or empty.
 * @param {Iterable<[string, Object<string, {composite: number}>]>} judgeScoresByCell
 * @param {Array<{id: string}>} scenarios - Scenarios to include.
 * @returns {number} Mean of per-cell means, or 0 when nothing qualifies.
 */
function meanComposite(artifacts, judgeScoresByCell, scenarios) {
  if (!artifacts || artifacts.size === 0) return 0;

  const wantedIds = new Set(scenarios.map((scenario) => scenario.id));
  let total = 0;
  let cellCount = 0;

  for (const [cellId, scores] of judgeScoresByCell) {
    const scenarioId = cellId.split(":")[0] ?? "";
    if (!wantedIds.has(scenarioId)) continue;

    const judgeComposites = Object.values(scores).map((score) => score.composite);
    if (judgeComposites.length === 0) continue; // cell without any judge score

    let cellSum = 0;
    for (const value of judgeComposites) cellSum += value;
    total += cellSum / judgeComposites.length;
    cellCount += 1;
  }

  return cellCount === 0 ? 0 : total / cellCount;
}
|
|
454
|
-
/**
 * Score every artifact's text against every red-team case.
 *
 * Artifacts from which no text can be extracted are skipped. Each failing
 * (artifact, case) pair adds one finding keyed by the case id.
 *
 * @param {Iterable<[string, *]>} artifacts - `[cellId, artifact]` pairs.
 * @param {Array<{id: string}>} battery - Red-team cases.
 * @returns {{passed: boolean, findings: Array<{scenarioId: string, reason: string}>}}
 */
function probeRedTeam(artifacts, battery) {
  const findings = [];

  for (const [, artifact] of artifacts) {
    const text = extractText(artifact);
    if (text === void 0) continue;

    for (const rtCase of battery) {
      const verdict = scoreRedTeamOutput(text, [], rtCase);
      if (verdict.passed) continue;
      findings.push({ scenarioId: rtCase.id, reason: verdict.reason ?? "red-team probe failed" });
    }
  }

  return { passed: findings.length === 0, findings };
}
|
|
468
|
-
/**
 * Pull a text payload out of an artifact.
 *
 * Strings are returned as-is. For objects, the first of `text`, `output`,
 * `content` that holds a string wins.
 *
 * @param {*} artifact - Artifact value of unknown shape.
 * @returns {string|undefined} The text, or undefined when none is found.
 */
function extractText(artifact) {
  if (typeof artifact === "string") return artifact;
  if (!artifact || typeof artifact !== "object") return void 0;

  for (const field of ["text", "output", "content"]) {
    const value = artifact[field];
    if (typeof value === "string") return value;
  }
  return void 0;
}
|
|
478
|
-
|
|
479
|
-
// src/campaign/gates/heldout-gate.ts
|
|
480
|
-
/**
 * Gate that ships only when the candidate beats the baseline on held-out
 * scenarios by at least `deltaThreshold` (default 0.5).
 *
 * The baseline uses `ctx.baselineJudgeScores` when present, otherwise the
 * same `ctx.judgeScores` as the candidate.
 *
 * @param {object} options
 * @param {Array<{id: string}>} options.scenarios - Held-out scenarios.
 * @param {number} [options.deltaThreshold=0.5]
 * @returns {{name: string, decide: Function}} Gate whose decision is "ship" or "hold".
 */
function heldOutGate(options) {
  const deltaThreshold = options.deltaThreshold ?? 0.5;
  const gateName = "heldOutGate";

  return {
    name: gateName,
    async decide(ctx) {
      const heldOutIds = new Set(options.scenarios.map((scenario) => scenario.id));
      const baselineScores = ctx.baselineJudgeScores ?? ctx.judgeScores;
      const baseline = meanForScenarios(baselineScores, heldOutIds);
      const candidate = meanForScenarios(ctx.judgeScores, heldOutIds);
      const delta = candidate - baseline;
      const passed = delta >= deltaThreshold;
      const relation = passed ? "\u2265" : "<";

      return {
        decision: passed ? "ship" : "hold",
        reasons: [`held-out delta ${delta.toFixed(3)} ${relation} ${deltaThreshold}`],
        contributingGates: [
          { name: gateName, passed, detail: { baseline, candidate, delta, deltaThreshold } }
        ],
        delta
      };
    }
  };
}
|
|
501
|
-
/**
 * Mean of per-cell mean composites, restricted to a set of scenario ids.
 *
 * The scenario id of a cell is the part of the cell id before the first ":".
 * Cells without any judge score are ignored.
 *
 * @param {Iterable<[string, Object<string, {composite: number}>]>} judgeScoresByCell
 * @param {Set<string>} scenarioIds - Scenario ids to include.
 * @returns {number} The mean, or 0 when no cell qualifies.
 */
function meanForScenarios(judgeScoresByCell, scenarioIds) {
  let total = 0;
  let cellCount = 0;

  for (const [cellId, scores] of judgeScoresByCell) {
    const scenarioId = cellId.split(":")[0] ?? "";
    if (!scenarioIds.has(scenarioId)) continue;

    const judgeComposites = Object.values(scores).map((score) => score.composite);
    if (judgeComposites.length === 0) continue;

    let cellSum = 0;
    for (const value of judgeComposites) cellSum += value;
    total += cellSum / judgeComposites.length;
    cellCount += 1;
  }

  return cellCount === 0 ? 0 : total / cellCount;
}
|
|
511
|
-
|
|
512
|
-
// src/campaign/types.ts
|
|
513
|
-
/**
 * Type guard: is `value` a proposed-candidate record rather than a bare surface?
 *
 * @param {*} value - Item returned by a driver's `propose`.
 * @returns {boolean} True for non-null objects exposing `surface`, `label` and `rationale`.
 */
function isProposedCandidate(value) {
  if (typeof value !== "object" || value === null) return false;
  return ["surface", "label", "rationale"].every((key) => key in value);
}
|
|
516
|
-
// Ordering of label trust levels; a higher number means more trusted.
var LABEL_TRUST_RANK = {
  unverified: 0,
  "verified-signal": 1,
  "human-rated": 2
};

/**
 * Numeric rank of a label trust level.
 *
 * A missing (null/undefined) trust is treated as "unverified". Only the
 * table's own keys are consulted, so inherited property names such as
 * "toString" or "constructor" are treated like any other unknown level.
 *
 * @param {string|null|undefined} trust - Trust level name.
 * @returns {number|undefined} The rank, or undefined for an unknown level.
 */
function labelTrustRank(trust) {
  const level = trust ?? "unverified";
  return Object.hasOwn(LABEL_TRUST_RANK, level) ? LABEL_TRUST_RANK[level] : void 0;
}
|
|
524
|
-
|
|
525
|
-
// src/campaign/score-utils.ts
|
|
526
|
-
/**
 * Mean composite of a campaign: the mean, over all cells that have at least
 * one judge score, of each cell's mean judge composite.
 *
 * @param {{cells: Array<{judgeScores: Object<string, {composite: number}>}>}} campaign
 * @returns {number} The mean, or 0 when no cell has judge scores.
 */
function campaignMeanComposite(campaign) {
  let total = 0;
  let scoredCells = 0;

  for (const cell of campaign.cells) {
    const judgeComposites = Object.values(cell.judgeScores).map((score) => score.composite);
    if (judgeComposites.length === 0) continue;

    let cellSum = 0;
    for (const value of judgeComposites) cellSum += value;
    total += cellSum / judgeComposites.length;
    scoredCells += 1;
  }

  return scoredCells === 0 ? 0 : total / scoredCells;
}
|
|
536
|
-
/**
 * Break a campaign's scores down per dimension and per scenario.
 *
 * - `dimensions`: for every dimension key seen in any judge score, the mean
 *   of its values across all judge scores that report it.
 * - `scenarios`: for every scenario id, the mean of its cells' mean judge
 *   composites, in order of first appearance.
 *
 * Cells without judge scores are ignored.
 *
 * @param {{cells: Array<{scenarioId: string, judgeScores: Object}>}} campaign
 * @returns {{dimensions: Object<string, number>, scenarios: Array<{scenarioId: string, composite: number}>}}
 */
function campaignBreakdown(campaign) {
  const totalsByDimension = {};
  const samplesByDimension = {};
  const compositesByScenario = /* @__PURE__ */ new Map();

  for (const cell of campaign.cells) {
    const judgeScores = Object.values(cell.judgeScores);
    if (judgeScores.length === 0) continue;

    let compositeSum = 0;
    for (const score of judgeScores) compositeSum += score.composite;
    const cellComposite = compositeSum / judgeScores.length;

    const bucket = compositesByScenario.get(cell.scenarioId) ?? [];
    bucket.push(cellComposite);
    compositesByScenario.set(cell.scenarioId, bucket);

    for (const score of judgeScores) {
      for (const [dimension, value] of Object.entries(score.dimensions)) {
        totalsByDimension[dimension] = (totalsByDimension[dimension] ?? 0) + value;
        samplesByDimension[dimension] = (samplesByDimension[dimension] ?? 0) + 1;
      }
    }
  }

  const dimensions = {};
  for (const dimension of Object.keys(totalsByDimension)) {
    const samples = samplesByDimension[dimension] ?? 0;
    dimensions[dimension] = samples > 0 ? (totalsByDimension[dimension] ?? 0) / samples : 0;
  }

  const scenarios = [];
  for (const [scenarioId, composites] of compositesByScenario) {
    let sum = 0;
    for (const value of composites) sum += value;
    scenarios.push({ scenarioId, composite: sum / composites.length });
  }

  return { dimensions, scenarios };
}
|
|
565
|
-
|
|
566
|
-
// src/campaign/presets/run-optimization.ts
|
|
567
|
-
import { createHash } from "crypto";
|
|
568
|
-
/**
 * Run a multi-generation surface optimization.
 *
 * First evaluates the baseline surface with `runCampaign`. Then, for up to
 * `opts.maxGenerations` generations (or until `opts.driver.decide` asks to
 * stop), it asks the driver for candidate surfaces, evaluates each one with
 * its own campaign (sequentially), promotes the `promoteTopK` best and keeps
 * track of the best surface seen so far. A candidate only replaces the
 * winner when its composite is strictly higher.
 *
 * @param {object} opts - Campaign options plus:
 * @param {*} opts.baselineSurface - Starting surface.
 * @param {Function} opts.dispatchWithSurface - `(surface, scenario, ctx)` dispatcher.
 * @param {{propose: Function, decide?: Function}} opts.driver - Proposal driver.
 * @param {number} opts.maxGenerations
 * @param {number} opts.populationSize
 * @param {string} opts.runDir - Base directory; sub-runs go to `baseline` and `gen-N/candidate-M`.
 * @param {number} [opts.promoteTopK=2]
 * @returns {Promise<object>} `{ generations, winnerSurface, winnerSurfaceHash, winnerLabel, winnerRationale, baselineCampaign, paretoFrontier }`.
 */
async function runOptimization(opts) {
  const keepTopK = opts.promoteTopK ?? 2;

  const baselineCampaign = await runCampaign({
    ...opts,
    dispatch: (scenario, ctx) => opts.dispatchWithSurface(opts.baselineSurface, scenario, ctx),
    runDir: `${opts.runDir}/baseline`
  });

  const generations = [];
  const history = [];
  let promotedSurfaces = [opts.baselineSurface];

  // Best surface so far; starts out as the baseline.
  const baselineHash = surfaceHash(opts.baselineSurface);
  const winner = {
    surface: opts.baselineSurface,
    hash: baselineHash,
    composite: campaignMeanComposite(baselineCampaign),
    label: void 0,
    rationale: void 0
  };

  // Every evaluated surface, used to compute the Pareto frontier (baseline is generation -1).
  const scored = [toParetoParent(opts.baselineSurface, baselineHash, baselineCampaign, -1)];

  for (let gen = 0; gen < opts.maxGenerations; gen += 1) {
    if (opts.driver.decide?.({ history }).stop) break;

    const paretoParents = computeParetoFrontier(scored);
    const proposals = await opts.driver.propose({
      currentSurface: promotedSurfaces[0] ?? opts.baselineSurface,
      history,
      findings: [],
      populationSize: opts.populationSize,
      generation: gen,
      signal: new AbortController().signal,
      report: opts.report,
      dataset: opts.labeledStore && opts.labeledStore !== "off" ? opts.labeledStore : void 0,
      maxImprovementShots: opts.maxImprovementShots,
      paretoParents
    });

    // Drivers may return bare surfaces; normalize them to candidate records.
    const candidates = proposals.map((proposal) =>
      isProposedCandidate(proposal) ? proposal : { surface: proposal, label: "", rationale: "" }
    );

    const evaluated = [];
    for (let index = 0; index < candidates.length; index += 1) {
      const { surface, label, rationale } = candidates[index];
      const hash = surfaceHash(surface);
      const campaign = await runCampaign({
        ...opts,
        dispatch: (scenario, ctx) => opts.dispatchWithSurface(surface, scenario, ctx),
        runDir: `${opts.runDir}/gen-${gen}/candidate-${index}`
      });
      const composite = campaignMeanComposite(campaign);
      evaluated.push({ surfaceHash: hash, surface, label, rationale, campaign, composite });
      scored.push(toParetoParent(surface, hash, campaign, gen, label || void 0, rationale || void 0));
    }

    evaluated.sort((a, b) => b.composite - a.composite);
    const promoted = evaluated.slice(0, keepTopK);
    promotedSurfaces = promoted.map((entry) => entry.surface);

    const leader = evaluated[0];
    if (leader && leader.composite > winner.composite) {
      winner.surface = leader.surface;
      winner.hash = leader.surfaceHash;
      winner.composite = leader.composite;
      winner.label = leader.label || void 0;
      winner.rationale = leader.rationale || void 0;
    }

    const record = {
      generationIndex: gen,
      candidates: evaluated.map((entry) => {
        const breakdown = campaignBreakdown(entry.campaign);
        return {
          surfaceHash: entry.surfaceHash,
          composite: entry.composite,
          ci95: [entry.composite, entry.composite],
          dimensions: breakdown.dimensions,
          scenarios: breakdown.scenarios,
          ...(entry.label ? { label: entry.label } : {}),
          ...(entry.rationale ? { rationale: entry.rationale } : {})
        };
      }),
      promoted: promoted.map((entry) => entry.surfaceHash)
    };
    history.push(record);
    generations.push({
      record,
      surfaces: evaluated.map((entry) => ({
        surfaceHash: entry.surfaceHash,
        surface: entry.surface,
        campaign: entry.campaign
      }))
    });
  }

  return {
    generations,
    winnerSurface: winner.surface,
    winnerSurfaceHash: winner.hash,
    winnerLabel: winner.label,
    winnerRationale: winner.rationale,
    baselineCampaign,
    paretoFrontier: computeParetoFrontier(scored)
  };
}
|
|
667
|
-
/**
 * Build the Pareto-parent record for one evaluated surface.
 *
 * Each scenario row reported by `campaignBreakdown(campaign)` becomes one
 * objective keyed by its `scenarioId`; the overall score is
 * `campaignMeanComposite(campaign)`.
 *
 * @param surface     The candidate surface that was evaluated.
 * @param hash        Hash identifying the surface (stored as `surfaceHash`).
 * @param campaign    Campaign result the scores are derived from.
 * @param generation  Generation index the surface belongs to.
 * @param label       Optional label; only stored when truthy.
 * @param rationale   Optional rationale; only stored when truthy.
 * @returns The parent record `{ surface, surfaceHash, objectives, composite, generation, label?, rationale? }`.
 */
function toParetoParent(surface, hash, campaign, generation, label, rationale) {
  const perScenario = {};
  for (const row of campaignBreakdown(campaign).scenarios) {
    perScenario[row.scenarioId] = row.composite;
  }
  return {
    surface,
    surfaceHash: hash,
    objectives: perScenario,
    composite: campaignMeanComposite(campaign),
    generation,
    // Optional fields are omitted entirely (not set to undefined) when empty.
    ...(label ? { label } : {}),
    ...(rationale ? { rationale } : {})
  };
}
|
|
683
|
-
/**
 * Compute the Pareto frontier of scored parents, treating every scenario id
 * seen in any parent's `objectives` as a "maximize" objective.
 *
 * A parent that lacks a finite numeric value for some scenario is scored at
 * the lowest finite value observed for that scenario across all parents
 * (or 0 when no parent has a finite value).
 *
 * @param scored  Parent records, each carrying an `objectives` map.
 * @returns A shallow copy of `scored` when there are fewer than two parents
 *          or no objectives at all; otherwise the `frontier` reported by
 *          `paretoFrontier`.
 */
function computeParetoFrontier(scored) {
  if (scored.length <= 1) return [...scored];

  const scenarioIds = /* @__PURE__ */ new Set();
  for (const parent of scored) {
    for (const key of Object.keys(parent.objectives)) scenarioIds.add(key);
  }
  if (scenarioIds.size === 0) return [...scored];

  const isUsable = (value) => typeof value === "number" && Number.isFinite(value);

  // Lowest finite score per scenario; used as the stand-in for missing values.
  const floor = {};
  for (const id of scenarioIds) {
    let lowest = Number.POSITIVE_INFINITY;
    for (const parent of scored) {
      const value = parent.objectives[id];
      if (isUsable(value) && value < lowest) lowest = value;
    }
    floor[id] = Number.isFinite(lowest) ? lowest : 0;
  }

  const objectives = Array.from(scenarioIds, (id) => ({
    name: id,
    direction: "maximize",
    value: (parent) => {
      const value = parent.objectives[id];
      if (isUsable(value)) return value;
      return floor[id] ?? 0;
    }
  }));

  return paretoFrontier(scored, objectives).frontier;
}
|
|
707
|
-
/**
 * Short identifier for a surface: the first 16 hex characters of a SHA-256.
 *
 * String surfaces are hashed as-is. Any other surface is hashed over the JSON
 * of `{ kind, worktreeRef, baseRef }` only (a missing `baseRef` is encoded as
 * `null`), so other fields on the object do not affect the hash.
 *
 * @param surface  A string surface or an object with `kind` / `worktreeRef` / optional `baseRef`.
 * @returns 16-character lowercase hex string.
 */
function surfaceHash(surface) {
  let material;
  if (typeof surface === "string") {
    material = surface;
  } else {
    const identity = {
      kind: surface.kind,
      worktreeRef: surface.worktreeRef,
      baseRef: surface.baseRef ?? null
    };
    material = JSON.stringify(identity);
  }
  const hasher = createHash("sha256");
  hasher.update(material);
  return hasher.digest("hex").slice(0, 16);
}
|
|
715
|
-
|
|
716
|
-
// src/campaign/presets/run-improvement-loop.ts
|
|
717
|
-
/**
 * Run an optimization, then score baseline and winner on the holdout
 * scenarios, ask the gate for a decision and optionally open a PR.
 *
 * Steps, in order:
 *  1. Validate option combinations (throws before any work is done).
 *  2. `runOptimization(opts)`.
 *  3. Run a holdout campaign for `opts.baselineSurface`, then one for the
 *     optimization winner (sequentially, under `<runDir>/holdout-baseline`
 *     and `<runDir>/holdout-winner`).
 *  4. Call `opts.gate.decide(...)` with per-cell artifacts, judge scores and costs.
 *  5. Render the promoted diff (empty string when the winner hash equals the
 *     baseline hash).
 *  6. When `autoOnPromote === "pr"` and the gate decision is `"ship"`, call `openAutoPr`.
 *
 * @param opts  Loop options; also spread into every campaign run.
 * @returns The optimization result extended with `baselineOnHoldout`,
 *          `winnerOnHoldout`, `gateResult`, `promotedDiff` and `prResult`
 *          (`undefined` when no PR was opened).
 * @throws {Error} If `autoOnPromote` is `"config"`, if tracing is `"off"` while a
 *         driver is set, or if `autoOnPromote` is `"pr"` without `ghOwner` and `ghRepo`.
 */
async function runImprovementLoop(opts) {
  if (opts.autoOnPromote === "config") {
    throw new Error(
      "runImprovementLoop: autoOnPromote='config' is deferred to Pass B (requires shadow deploy + rollback + ensemble judges). Use 'pr' or 'none' in v0.40."
    );
  }
  if (opts.tracing === "off" && opts.driver) {
    throw new Error(
      "runImprovementLoop: tracing='off' is forbidden when a driver is wired. The improvement loop without traces is unattributable; candidate surfaces cannot be cited back to spans and the optimization dataset goes unfed."
    );
  }
  const wantsPr = opts.autoOnPromote === "pr";
  if (wantsPr && !(opts.ghOwner && opts.ghRepo)) {
    throw new Error("runImprovementLoop: autoOnPromote='pr' requires ghOwner + ghRepo.");
  }

  const optimization = await runOptimization(opts);
  const { runCampaign: runCampaign2 } = await import("./run-campaign-5J3ED2UJ.js");

  // The surface is looked up lazily, at dispatch time, via `pickSurface`.
  const runOnHoldout = (pickSurface, leafDir) => runCampaign2({
    ...opts,
    scenarios: opts.holdoutScenarios,
    dispatch: (scenario, ctx) => opts.dispatchWithSurface(pickSurface(), scenario, ctx),
    runDir: `${opts.runDir}/${leafDir}`
  });
  const baselineOnHoldout = await runOnHoldout(() => opts.baselineSurface, "holdout-baseline");
  const winnerOnHoldout = await runOnHoldout(() => optimization.winnerSurface, "holdout-winner");

  // Index every holdout cell by cellId for the gate.
  const indexCells = (campaign) => {
    const artifacts = /* @__PURE__ */ new Map();
    const scores = /* @__PURE__ */ new Map();
    for (const cell of campaign.cells) {
      artifacts.set(cell.cellId, cell.artifact);
      scores.set(cell.cellId, cell.judgeScores);
    }
    return { artifacts, scores };
  };
  const winnerIndex = indexCells(winnerOnHoldout);
  const baselineIndex = indexCells(baselineOnHoldout);

  const gateResult = await opts.gate.decide({
    candidateArtifacts: winnerIndex.artifacts,
    baselineArtifacts: baselineIndex.artifacts,
    judgeScores: winnerIndex.scores,
    baselineJudgeScores: baselineIndex.scores,
    scenarios: opts.holdoutScenarios,
    cost: {
      candidate: winnerOnHoldout.aggregates.totalCostUsd,
      baseline: baselineOnHoldout.aggregates.totalCostUsd
    },
    // A fresh controller is created here, so this signal is never aborted by this function.
    signal: new AbortController().signal
  });

  const render = opts.renderPromotedDiff ?? defaultRenderDiff;
  const winnerIsBaseline = optimization.winnerSurfaceHash === surfaceHash(opts.baselineSurface);
  const promotedDiff = winnerIsBaseline ? "" : render(optimization.winnerSurface, opts.baselineSurface);

  let prResult;
  if (wantsPr && gateResult.decision === "ship") {
    prResult = openAutoPr({
      result: winnerOnHoldout,
      gate: gateResult,
      promotedDiff,
      ghOwner: opts.ghOwner,
      ghRepo: opts.ghRepo
    });
  }

  return {
    ...optimization,
    baselineOnHoldout,
    winnerOnHoldout,
    gateResult,
    promotedDiff,
    prResult
  };
}
|
|
790
|
-
/**
 * Default textual "diff" between the winning surface and the baseline.
 *
 * - Both surfaces are strings: emits the `--- baseline` / `+++ winner` headers,
 *   then every baseline line prefixed with `- `, then every winner line
 *   prefixed with `+ ` (no line-level matching is attempted).
 * - Otherwise: emits a short description of each side. A string side is shown
 *   as `(prompt surface)`; an object side as `worktree=<ref>`, plus
 *   ` base=<ref>` and a summary line when those fields are truthy.
 *
 * @param winnerSurface    Winning surface (string or object).
 * @param baselineSurface  Baseline surface (string or object).
 * @returns The rendered text, lines joined with "\n".
 */
function defaultRenderDiff(winnerSurface, baselineSurface) {
  const bothAreText = typeof winnerSurface === "string" && typeof baselineSurface === "string";

  if (bothAreText) {
    const removed = baselineSurface.split("\n").map((line) => `- ${line}`);
    const added = winnerSurface.split("\n").map((line) => `+ ${line}`);
    return ["--- baseline", "+++ winner", ...removed, ...added].join("\n");
  }

  const describe = (side) => {
    if (typeof side === "string") return "(prompt surface)";
    let text = `worktree=${side.worktreeRef}`;
    if (side.baseRef) text += ` base=${side.baseRef}`;
    if (side.summary) text += `\n${side.summary}`;
    return text;
  };

  const baselineText = describe(baselineSurface);
  const winnerText = describe(winnerSurface);
  return ["--- baseline", baselineText, "+++ winner", winnerText].join("\n");
}
|
|
806
|
-
|
|
807
|
-
// src/campaign/presets/run-eval.ts
|
|
808
|
-
/**
 * Preset entry point that forwards its options unchanged to `runCampaign`.
 *
 * @param opts  Options passed straight through to `runCampaign`.
 * @returns A promise for whatever `runCampaign(opts)` resolves to.
 */
async function runEval(opts) {
  const pendingCampaign = runCampaign(opts);
  return pendingCampaign;
}
|
|
811
|
-
|
|
812
|
-
// src/campaign/provenance.ts
|
|
813
|
-
import { createHash as createHash2 } from "crypto";
|
|
814
|
-
import { join as join2 } from "path";
|
|
815
|
-
/**
 * Full-length content hash of a surface, formatted as `sha256:<64 hex chars>`.
 *
 * String surfaces are hashed as-is. Any other surface is hashed over the JSON
 * of `{ kind, worktreeRef, baseRef }` only, with a missing `baseRef` encoded
 * as `null`.
 *
 * @param surface  A string surface or an object with `kind` / `worktreeRef` / optional `baseRef`.
 * @returns The prefixed hex digest.
 */
function surfaceContentHash(surface) {
  let material;
  if (typeof surface === "string") {
    material = surface;
  } else {
    const identity = {
      kind: surface.kind,
      worktreeRef: surface.worktreeRef,
      baseRef: surface.baseRef ?? null
    };
    material = JSON.stringify(identity);
  }
  const hexDigest = createHash2("sha256").update(material).digest("hex");
  return `sha256:${hexDigest}`;
}
|
|
823
|
-
/**
 * Mean composite score of a campaign, computed as a mean of per-cell means.
 *
 * Cells with a truthy `error` are skipped, as are cells without any judge
 * scores. For every remaining cell the judges' `composite` values are
 * averaged; the result is the average of those per-cell values.
 *
 * @param campaign  Campaign result exposing `cells`, each with `judgeScores`.
 * @returns The mean composite, or 0 when no cell contributes.
 */
function meanHoldoutComposite(campaign) {
  const average = (values) => {
    let total = 0;
    for (const value of values) total += value;
    return total / values.length;
  };

  const cellMeans = [];
  for (const cell of campaign.cells) {
    if (cell.error) continue;
    const composites = Object.values(cell.judgeScores).map((judge) => judge.composite);
    if (composites.length === 0) continue;
    cellMeans.push(average(composites));
  }

  return cellMeans.length === 0 ? 0 : average(cellMeans);
}
|
|
832
|
-
/**
 * Assemble the `tangle.loop-provenance.v1` record for one improvement-loop run.
 *
 * The record combines:
 *  - one entry per candidate of every generation (hashes, composite, whether
 *    it was promoted, optional label / rationale),
 *  - the gate outcome (decision, reasons, delta and name/passed per contributing gate),
 *  - baseline and winner holdout composites and their difference (`heldOutLift`),
 *  - backend totals taken from `summarizeBackendIntegrity(args.workerRecords)`
 *    plus the sorted, de-duplicated list of worker models.
 *
 * @param args  Run data: `workerRecords`, `generations`, `baselineOnHoldout`,
 *              `winnerOnHoldout`, `baselineSurface`, `winnerSurface`, `gate`,
 *              `diff`, `runId`, `runDir`, `timestamp`, `totalCostUsd`,
 *              `totalDurationMs`, and optional `winnerLabel` / `winnerRationale`.
 * @returns The provenance record object.
 */
function buildLoopProvenanceRecord(args) {
  const integrity = summarizeBackendIntegrity(args.workerRecords);
  const models = Array.from(new Set(args.workerRecords.map((worker) => worker.model))).sort();

  const candidates = [];
  for (const generation of args.generations) {
    const promotedHashes = new Set(generation.promoted);
    const surfacesByHash = new Map(
      generation.surfaces.map((item) => [item.surfaceHash, item.surface])
    );
    for (const candidate of generation.candidates) {
      const surface = surfacesByHash.get(candidate.surfaceHash);
      // When the surface itself is not available, fall back to the prefixed surface hash.
      const contentHash = surface !== void 0
        ? surfaceContentHash(surface)
        : `sha256:${candidate.surfaceHash}`;
      const entry = {
        generation: generation.generationIndex,
        surfaceHash: candidate.surfaceHash,
        contentHash,
        composite: candidate.composite,
        promoted: promotedHashes.has(candidate.surfaceHash)
      };
      if (candidate.label) entry.label = candidate.label;
      if (candidate.rationale) entry.rationale = candidate.rationale;
      candidates.push(entry);
    }
  }

  const baselineHoldoutComposite = meanHoldoutComposite(args.baselineOnHoldout);
  const winnerHoldoutComposite = meanHoldoutComposite(args.winnerOnHoldout);

  const record = {
    schema: "tangle.loop-provenance.v1",
    runId: args.runId,
    runDir: args.runDir,
    timestamp: args.timestamp,
    baselineContentHash: surfaceContentHash(args.baselineSurface),
    winnerContentHash: surfaceContentHash(args.winnerSurface),
    diff: args.diff,
    candidates,
    gate: {
      decision: args.gate.decision,
      reasons: args.gate.reasons,
      delta: args.gate.delta,
      // Only name and pass/fail of each contributing gate are recorded.
      contributingGates: args.gate.contributingGates.map(({ name, passed }) => ({ name, passed }))
    },
    baselineHoldoutComposite,
    winnerHoldoutComposite,
    heldOutLift: winnerHoldoutComposite - baselineHoldoutComposite,
    backend: {
      verdict: integrity.verdict,
      workerCallCount: integrity.totalRecords,
      models,
      totalInputTokens: integrity.totalInputTokens,
      totalOutputTokens: integrity.totalOutputTokens,
      totalCostUsd: integrity.totalCostUsd
    },
    totalCostUsd: args.totalCostUsd,
    totalDurationMs: args.totalDurationMs
  };
  if (args.winnerLabel) record.winnerLabel = args.winnerLabel;
  if (args.winnerRationale) record.winnerRationale = args.winnerRationale;
  return record;
}
|
|
891
|
-
// Gate decisions that gateStatus() reports as span status "OK"; every other decision is reported as "ERROR".
var DECISION_OK = ["ship"];
|
|
892
|
-
/**
 * Deterministic identifier derived from a list of parts.
 *
 * @param parts  Values that are joined with ":" before hashing.
 * @returns The full SHA-256 hex digest (64 characters) of the joined string.
 */
function hashId(parts) {
  const joined = parts.join(":");
  const hasher = createHash2("sha256");
  hasher.update(joined);
  return hasher.digest("hex");
}
|
|
895
|
-
/**
 * Translate a gate decision into a span status object.
 *
 * @param decision  Gate decision value.
 * @returns `{ code: "OK" }` when the decision is listed in `DECISION_OK`,
 *          otherwise `{ code: "ERROR", message }` naming the decision.
 */
function gateStatus(decision) {
  if (DECISION_OK.includes(decision)) {
    return { code: "OK" };
  }
  return { code: "ERROR", message: `gate decision: ${decision}` };
}
|
|
898
|
-
/**
 * Turn a loop-provenance record into a flat list of span objects:
 * one root span, one span per generation, one span per candidate and a final
 * gate-decision span.
 *
 * Trace and span ids are derived from `record.runId` via `hashId`, so they
 * depend only on the record contents. The root, generation and candidate
 * spans all share the same start/end time; the gate span starts and ends at
 * the end time.
 *
 * @param record  Provenance record (as built by `buildLoopProvenanceRecord`).
 * @param opts    Optional settings. `opts.baseTimeMs` overrides the start time
 *                (ms); otherwise `record.timestamp` is parsed, falling back to
 *                the current time when parsing yields a falsy value.
 * @returns Array of span objects in the order root, generations (ascending,
 *          each followed by its candidates), gate.
 */
function loopProvenanceSpans(record, opts = {}) {
  const runId = record.runId;
  const spanIdFor = (parts) => hashId(parts).slice(0, 16);

  const traceId = hashId(["trace", runId]).slice(0, 32);
  const startNano = (opts.baseTimeMs ?? (Date.parse(record.timestamp) || Date.now())) * 1e6;
  // The duration is clamped to at least 1 ms so the end time is after the start.
  const finishNano = startNano + Math.max(1, record.totalDurationMs) * 1e6;
  const rootSpanId = spanIdFor(["root", runId]);

  const spans = [
    {
      traceId,
      spanId: rootSpanId,
      name: "improvement-loop",
      startTimeUnixNano: startNano,
      endTimeUnixNano: finishNano,
      attributes: {
        "tangle.runId": runId,
        "tangle.runDir": record.runDir,
        "tangle.baselineContentHash": record.baselineContentHash,
        "tangle.winnerContentHash": record.winnerContentHash,
        "tangle.heldOutLift": record.heldOutLift,
        "tangle.gateDecision": record.gate.decision,
        "tangle.backendVerdict": record.backend.verdict,
        "tangle.workerCallCount": record.backend.workerCallCount,
        "tangle.totalCostUsd": record.totalCostUsd
      },
      status: gateStatus(record.gate.decision),
      "tangle.runId": runId
    }
  ];

  // Bucket candidates by generation, keeping their original relative order.
  const candidatesByGeneration = /* @__PURE__ */ new Map();
  for (const candidate of record.candidates) {
    const bucket = candidatesByGeneration.get(candidate.generation);
    if (bucket) {
      bucket.push(candidate);
    } else {
      candidatesByGeneration.set(candidate.generation, [candidate]);
    }
  }
  const orderedGenerations = Array.from(candidatesByGeneration).sort(
    ([left], [right]) => left - right
  );

  for (const [generation, members] of orderedGenerations) {
    const genSpanId = spanIdFor(["gen", runId, String(generation)]);

    // Seeded with 0, so the reported best composite is never below 0.
    let bestComposite = 0;
    for (const member of members) bestComposite = Math.max(bestComposite, member.composite);

    spans.push({
      traceId,
      spanId: genSpanId,
      parentSpanId: rootSpanId,
      name: `generation-${generation}`,
      startTimeUnixNano: startNano,
      endTimeUnixNano: finishNano,
      attributes: {
        "tangle.runId": runId,
        "tangle.generation": generation,
        "tangle.populationSize": members.length,
        "tangle.bestComposite": bestComposite
      },
      "tangle.runId": runId,
      "tangle.generation": generation
    });

    for (const member of members) {
      const attributes = {
        "tangle.runId": runId,
        "tangle.generation": generation,
        "tangle.surfaceHash": member.surfaceHash,
        "tangle.contentHash": member.contentHash,
        "tangle.composite": member.composite,
        "tangle.promoted": member.promoted
      };
      if (member.label) attributes["tangle.candidateLabel"] = member.label;
      if (member.rationale) attributes["tangle.candidateRationale"] = member.rationale;

      spans.push({
        traceId,
        spanId: spanIdFor(["cand", runId, String(generation), member.surfaceHash]),
        parentSpanId: genSpanId,
        name: `candidate-${member.surfaceHash}`,
        startTimeUnixNano: startNano,
        endTimeUnixNano: finishNano,
        attributes,
        "tangle.runId": runId,
        "tangle.generation": generation
      });
    }
  }

  spans.push({
    traceId,
    spanId: spanIdFor(["gate", runId]),
    parentSpanId: rootSpanId,
    name: "gate-decision",
    startTimeUnixNano: finishNano,
    endTimeUnixNano: finishNano,
    attributes: {
      "tangle.runId": runId,
      "tangle.gateDecision": record.gate.decision,
      // Falls back to the held-out lift when the gate reported no delta.
      "tangle.gateDelta": record.gate.delta ?? record.heldOutLift,
      "tangle.gateReasons": JSON.stringify(record.gate.reasons),
      "tangle.heldOutLift": record.heldOutLift,
      "tangle.baselineHoldoutComposite": record.baselineHoldoutComposite,
      "tangle.winnerHoldoutComposite": record.winnerHoldoutComposite
    },
    status: gateStatus(record.gate.decision),
    "tangle.runId": runId
  });

  return spans;
}
|
|
1000
|
-
/**
 * Location of the provenance record file inside a run directory.
 *
 * @param runDir  Run directory.
 * @returns `runDir` joined with "loop-provenance.json".
 */
function provenanceRecordPath(runDir) {
  const fileName = "loop-provenance.json";
  return join2(runDir, fileName);
}
|
|
1003
|
-
/**
 * Location of the provenance spans file inside a run directory.
 *
 * @param runDir  Run directory.
 * @returns `runDir` joined with "loop-provenance-spans.jsonl".
 */
function provenanceSpansPath(runDir) {
  const fileName = "loop-provenance-spans.jsonl";
  return join2(runDir, fileName);
}
|
|
1006
|
-
/**
 * Summarize one holdout campaign as a snapshot object.
 *
 * Every cell (including cells with an error) contributes one score entry
 * whose `compositeMean` is the mean of its judges' `composite` values, or 0
 * when the cell has no judge scores. The snapshot's own `compositeMean` is the
 * mean over all of those entries, or 0 when there are no cells.
 *
 * @param index         Snapshot index stored on the result.
 * @param surfaceHash2  Hash stored as `surfaceHash` on the result.
 * @param surface       Surface stored on the result.
 * @param campaign      Campaign result exposing `cells`, `aggregates.totalCostUsd` and `durationMs`.
 * @returns `{ index, surfaceHash, surface, cells, compositeMean, costUsd, durationMs }`.
 */
function snapshotFromHoldout(index, surfaceHash2, surface, campaign) {
  const summarizeCell = (cell) => {
    const judges = Object.values(cell.judgeScores);
    let compositeMean = 0;
    if (judges.length !== 0) {
      let total = 0;
      for (const judge of judges) total += judge.composite;
      compositeMean = total / judges.length;
    }
    const dimensionPairs = Object.entries(cell.judgeScores).map(
      ([judgeName, judge]) => [judgeName, judge.dimensions]
    );
    const summary = {
      scenarioId: cell.scenarioId,
      rep: cell.rep,
      compositeMean,
      dimensions: Object.fromEntries(dimensionPairs)
    };
    if (cell.error) summary.errorMessage = cell.error;
    return summary;
  };

  const cells = campaign.cells.map((cell) => summarizeCell(cell));

  let compositeMean = 0;
  if (cells.length !== 0) {
    let total = 0;
    for (const entry of cells) total += entry.compositeMean;
    compositeMean = total / cells.length;
  }

  return {
    index,
    surfaceHash: surfaceHash2,
    surface,
    cells,
    compositeMean,
    costUsd: campaign.aggregates.totalCostUsd,
    durationMs: campaign.durationMs
  };
}
|
|
1032
|
-
/**
 * Build the eval-run event sent to the hosted client for a finished loop.
 *
 * The baseline holdout campaign becomes the `baseline` snapshot (index 0) and
 * the winner holdout campaign becomes the single entry of `generations`
 * (index 1). Content hashes and the holdout lift are taken from `record`;
 * everything else comes from `args`.
 *
 * @param args    Loop data (`runId`, `runDir`, `timestamp`, surfaces, holdout
 *                campaigns, `gate`, `totalCostUsd`, `totalDurationMs`).
 * @param record  Provenance record supplying `baselineContentHash`,
 *                `winnerContentHash` and `heldOutLift`.
 * @returns The event object with `status: "finished"` and empty `labels`.
 */
function buildEvalRunEvent(args, record) {
  const baselineSnapshot = snapshotFromHoldout(
    0,
    record.baselineContentHash,
    args.baselineSurface,
    args.baselineOnHoldout
  );
  const winnerSnapshot = snapshotFromHoldout(
    1,
    record.winnerContentHash,
    args.winnerSurface,
    args.winnerOnHoldout
  );
  const { runId, runDir, timestamp, totalCostUsd, totalDurationMs } = args;

  return {
    runId,
    runDir,
    timestamp,
    status: "finished",
    labels: {},
    baseline: baselineSnapshot,
    generations: [winnerSnapshot],
    gateDecision: args.gate.decision,
    holdoutLift: record.heldOutLift,
    totalCostUsd,
    totalDurationMs
  };
}
|
|
1054
|
-
/**
 * Build the provenance record and spans for a loop run, write both through
 * `args.storage`, and, when `args.hostedClient` is set, forward them to the
 * hosted service.
 *
 * Files written (inside `args.runDir`):
 *  - the record as pretty-printed JSON,
 *  - the spans as one JSON document per line.
 *
 * Hosted ingestion is best-effort: a failure of either the eval-run ingest or
 * the trace ingest is logged with `console.warn` and does not abort the
 * function or prevent the other ingest from being attempted.
 *
 * @param args  Loop data plus `storage` (`ensureDir`, `write`) and optional `hostedClient`.
 * @returns `{ record, spans, recordPath, spansPath }`.
 */
async function emitLoopProvenance(args) {
  const record = buildLoopProvenanceRecord(args);
  const spans = loopProvenanceSpans(record);

  args.storage.ensureDir(args.runDir);
  const recordPath = provenanceRecordPath(args.runDir);
  const spansPath = provenanceSpansPath(args.runDir);
  args.storage.write(recordPath, JSON.stringify(record, null, 2));
  const spanLines = spans.map((span) => JSON.stringify(span));
  args.storage.write(spansPath, spanLines.join("\n"));

  const client = args.hostedClient;
  if (client) {
    // Runs `send` and downgrades any failure to a warning prefixed with `warningPrefix`.
    const bestEffort = async (warningPrefix, send) => {
      try {
        await send();
      } catch (err) {
        const msg = err instanceof Error ? err.message : String(err);
        console.warn(`${warningPrefix}${msg}`);
      }
    };
    await bestEffort(
      "[agent-eval] hosted eval-run ingest failed (continuing): ",
      () => args.hostedClient.ingestEvalRun(buildEvalRunEvent(args, record))
    );
    await bestEffort(
      "[agent-eval] provenance span ingest failed (continuing): ",
      () => args.hostedClient.ingestTraces(spans)
    );
  }

  return { record, spans, recordPath, spansPath };
}
|
|
1078
|
-
|
|
1079
|
-
export {
|
|
1080
|
-
openAutoPr,
|
|
1081
|
-
evolutionaryDriver,
|
|
1082
|
-
gepaDriver,
|
|
1083
|
-
extractH2Sections,
|
|
1084
|
-
countSentenceEdits,
|
|
1085
|
-
composeGate,
|
|
1086
|
-
defaultProductionGate,
|
|
1087
|
-
heldOutGate,
|
|
1088
|
-
isProposedCandidate,
|
|
1089
|
-
labelTrustRank,
|
|
1090
|
-
campaignMeanComposite,
|
|
1091
|
-
campaignBreakdown,
|
|
1092
|
-
runOptimization,
|
|
1093
|
-
surfaceHash,
|
|
1094
|
-
runImprovementLoop,
|
|
1095
|
-
defaultRenderDiff,
|
|
1096
|
-
runEval,
|
|
1097
|
-
surfaceContentHash,
|
|
1098
|
-
buildLoopProvenanceRecord,
|
|
1099
|
-
loopProvenanceSpans,
|
|
1100
|
-
provenanceRecordPath,
|
|
1101
|
-
provenanceSpansPath,
|
|
1102
|
-
emitLoopProvenance
|
|
1103
|
-
};
|
|
1104
|
-
//# sourceMappingURL=chunk-CZRKD2X2.js.map
|