@tangle-network/agent-eval 0.43.2 → 0.44.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/adapters/langchain.d.ts +91 -0
- package/dist/adapters/langchain.js +34 -0
- package/dist/adapters/langchain.js.map +1 -0
- package/dist/campaign/index.d.ts +7 -401
- package/dist/campaign/index.js +24 -634
- package/dist/campaign/index.js.map +1 -1
- package/dist/chunk-3RF76KTD.js +84 -0
- package/dist/chunk-3RF76KTD.js.map +1 -0
- package/dist/chunk-H5BGRSN4.js +642 -0
- package/dist/chunk-H5BGRSN4.js.map +1 -0
- package/dist/contract/index.d.ts +10 -0
- package/dist/contract/index.js +41 -0
- package/dist/contract/index.js.map +1 -0
- package/dist/governance/index.d.ts +1 -1
- package/dist/index.d.ts +1 -1
- package/dist/meta-eval/index.d.ts +3 -3
- package/dist/meta-eval/index.js +4 -79
- package/dist/meta-eval/index.js.map +1 -1
- package/dist/openapi.json +1 -1
- package/dist/{outcome-store-D6KWmYvj.d.ts → outcome-store-BxJ3DQKJ.d.ts} +1 -1
- package/dist/reporting.d.ts +2 -2
- package/dist/rl.d.ts +5 -4
- package/dist/rl.js +6 -0
- package/dist/rl.js.map +1 -1
- package/dist/{rubric-predictive-validity-ByZEC3BX.d.ts → rubric-predictive-validity-CJ08tGwq.d.ts} +1 -1
- package/dist/run-improvement-loop-CbilHQAb.d.ts +401 -0
- package/dist/{types-BLbRTxoc.d.ts → types-DToGONFA.d.ts} +1 -1
- package/docs/quickstart-external.md +190 -0
- package/package.json +11 -1
package/dist/campaign/index.js
CHANGED
|
@@ -1,420 +1,33 @@
|
|
|
1
|
+
import {
|
|
2
|
+
composeGate,
|
|
3
|
+
defaultProductionGate,
|
|
4
|
+
evolutionaryDriver,
|
|
5
|
+
gepaDriver,
|
|
6
|
+
heldOutGate,
|
|
7
|
+
openAutoPr,
|
|
8
|
+
runEval,
|
|
9
|
+
runImprovementLoop,
|
|
10
|
+
runOptimization,
|
|
11
|
+
surfaceHash
|
|
12
|
+
} from "../chunk-H5BGRSN4.js";
|
|
1
13
|
import {
|
|
2
14
|
fsCampaignStorage,
|
|
3
15
|
inMemoryCampaignStorage,
|
|
4
16
|
runCampaign
|
|
5
17
|
} from "../chunk-RXK7FXLV.js";
|
|
6
|
-
import {
|
|
7
|
-
|
|
8
|
-
parseReflectionResponse,
|
|
9
|
-
runCanaries,
|
|
10
|
-
scoreRedTeamOutput
|
|
11
|
-
} from "../chunk-N4SBKEPJ.js";
|
|
12
|
-
import {
|
|
13
|
-
detectRewardHacking
|
|
14
|
-
} from "../chunk-YV7J7X5N.js";
|
|
18
|
+
import "../chunk-N4SBKEPJ.js";
|
|
19
|
+
import "../chunk-YV7J7X5N.js";
|
|
15
20
|
import "../chunk-WP7SY7AI.js";
|
|
16
21
|
import "../chunk-GGE4NNQT.js";
|
|
17
|
-
import {
|
|
18
|
-
callLlm
|
|
19
|
-
} from "../chunk-VXNVVBZO.js";
|
|
22
|
+
import "../chunk-VXNVVBZO.js";
|
|
20
23
|
import "../chunk-PC4UYEBM.js";
|
|
21
24
|
import "../chunk-QYJT52YW.js";
|
|
22
25
|
import "../chunk-NSBPE2FW.js";
|
|
23
26
|
|
|
24
|
-
// src/campaign/auto-pr.ts
|
|
25
|
-
import { execSync } from "child_process";
|
|
26
|
-
import { writeFileSync } from "fs";
|
|
27
|
-
import { tmpdir } from "os";
|
|
28
|
-
import { join } from "path";
|
|
29
|
-
function openAutoPr(options) {
|
|
30
|
-
if (options.gate.decision !== "ship") {
|
|
31
|
-
return {
|
|
32
|
-
opened: false,
|
|
33
|
-
dryRun: false,
|
|
34
|
-
reason: `gate verdict was "${options.gate.decision}" \u2014 refusing to open PR`
|
|
35
|
-
};
|
|
36
|
-
}
|
|
37
|
-
const dryRun = options.dryRun ?? !process.env.GH_AUTO_PR_TOKEN;
|
|
38
|
-
const branch = options.branch ?? `auto/${options.result.manifestHash.slice(0, 12)}`;
|
|
39
|
-
const title = options.title ?? `auto: campaign ${options.result.manifestHash.slice(0, 8)} promoted by gate`;
|
|
40
|
-
const body = renderPrBody(options.result, options.gate, options.promotedDiff);
|
|
41
|
-
const bodyPath = join(tmpdir(), `auto-pr-body-${Date.now()}.md`);
|
|
42
|
-
writeFileSync(bodyPath, body);
|
|
43
|
-
if (dryRun) {
|
|
44
|
-
return {
|
|
45
|
-
opened: false,
|
|
46
|
-
dryRun: true,
|
|
47
|
-
reason: `dry-run (GH_AUTO_PR_TOKEN not set). Would create PR on ${options.ghOwner}/${options.ghRepo} branch ${branch}. Body at ${bodyPath}.`
|
|
48
|
-
};
|
|
49
|
-
}
|
|
50
|
-
const ghExec = options.ghExec ?? defaultGhExec;
|
|
51
|
-
const result = ghExec([
|
|
52
|
-
"pr",
|
|
53
|
-
"create",
|
|
54
|
-
"--repo",
|
|
55
|
-
`${options.ghOwner}/${options.ghRepo}`,
|
|
56
|
-
"--head",
|
|
57
|
-
branch,
|
|
58
|
-
"--title",
|
|
59
|
-
title,
|
|
60
|
-
"--body-file",
|
|
61
|
-
bodyPath
|
|
62
|
-
]);
|
|
63
|
-
if (result.status !== 0) {
|
|
64
|
-
return {
|
|
65
|
-
opened: false,
|
|
66
|
-
dryRun: false,
|
|
67
|
-
reason: `gh pr create failed (exit ${result.status}): ${result.stderr.slice(0, 400)}`
|
|
68
|
-
};
|
|
69
|
-
}
|
|
70
|
-
const prUrl = result.stdout.trim();
|
|
71
|
-
return { opened: true, prUrl, dryRun: false, reason: "PR opened" };
|
|
72
|
-
}
|
|
73
|
-
function renderPrBody(result, gate, diff) {
|
|
74
|
-
const lines = [];
|
|
75
|
-
lines.push(`## Automated promotion by \`runImprovementLoop\``);
|
|
76
|
-
lines.push("");
|
|
77
|
-
lines.push(`**Manifest**: \`${result.manifestHash}\``);
|
|
78
|
-
lines.push(`**Seed**: ${result.seed}`);
|
|
79
|
-
lines.push(`**Duration**: ${Math.round(result.durationMs / 1e3)}s`);
|
|
80
|
-
lines.push(
|
|
81
|
-
`**Cells**: executed ${result.aggregates.cellsExecuted}, cached ${result.aggregates.cellsCached}, skipped ${result.aggregates.cellsSkipped}, failed ${result.aggregates.cellsFailed}`
|
|
82
|
-
);
|
|
83
|
-
lines.push(`**Total spend**: $${result.aggregates.totalCostUsd.toFixed(2)}`);
|
|
84
|
-
lines.push("");
|
|
85
|
-
lines.push(`### Gate verdict: \`${gate.decision}\``);
|
|
86
|
-
lines.push("");
|
|
87
|
-
for (const reason of gate.reasons) lines.push(`- ${reason}`);
|
|
88
|
-
if (gate.delta !== void 0) lines.push(`- delta: ${gate.delta.toFixed(3)}`);
|
|
89
|
-
lines.push("");
|
|
90
|
-
lines.push("### Contributing gates");
|
|
91
|
-
lines.push("");
|
|
92
|
-
lines.push("| gate | passed | detail |");
|
|
93
|
-
lines.push("|---|---|---|");
|
|
94
|
-
for (const c of gate.contributingGates) {
|
|
95
|
-
const detail = typeof c.detail === "object" ? JSON.stringify(c.detail).slice(0, 80) : String(c.detail).slice(0, 80);
|
|
96
|
-
lines.push(`| ${c.name} | ${c.passed ? "\u2713" : "\u2717"} | ${detail} |`);
|
|
97
|
-
}
|
|
98
|
-
lines.push("");
|
|
99
|
-
lines.push("### Promoted surface");
|
|
100
|
-
lines.push("");
|
|
101
|
-
lines.push("```diff");
|
|
102
|
-
lines.push(diff.slice(0, 8e3));
|
|
103
|
-
lines.push("```");
|
|
104
|
-
lines.push("");
|
|
105
|
-
lines.push("### By-judge aggregates");
|
|
106
|
-
lines.push("");
|
|
107
|
-
lines.push("| judge | mean | ci95 | n |");
|
|
108
|
-
lines.push("|---|---|---|---|");
|
|
109
|
-
for (const [name, agg] of Object.entries(result.aggregates.byJudge)) {
|
|
110
|
-
lines.push(
|
|
111
|
-
`| ${name} | ${agg.mean.toFixed(3)} | [${agg.ci95[0].toFixed(3)}, ${agg.ci95[1].toFixed(3)}] | ${agg.n} |`
|
|
112
|
-
);
|
|
113
|
-
}
|
|
114
|
-
return lines.join("\n");
|
|
115
|
-
}
|
|
116
|
-
function defaultGhExec(args) {
|
|
117
|
-
try {
|
|
118
|
-
const stdout = execSync(`gh ${args.map(quoteArg).join(" ")}`, {
|
|
119
|
-
env: { ...process.env, GH_TOKEN: process.env.GH_AUTO_PR_TOKEN ?? process.env.GH_TOKEN ?? "" },
|
|
120
|
-
stdio: ["ignore", "pipe", "pipe"]
|
|
121
|
-
}).toString("utf8");
|
|
122
|
-
return { stdout, stderr: "", status: 0 };
|
|
123
|
-
} catch (err) {
|
|
124
|
-
const e = err;
|
|
125
|
-
return {
|
|
126
|
-
stdout: e.stdout?.toString("utf8") ?? "",
|
|
127
|
-
stderr: e.stderr?.toString("utf8") ?? "",
|
|
128
|
-
status: e.status ?? 1
|
|
129
|
-
};
|
|
130
|
-
}
|
|
131
|
-
}
|
|
132
|
-
function quoteArg(arg) {
|
|
133
|
-
if (/^[a-zA-Z0-9_/\-:.@]+$/.test(arg)) return arg;
|
|
134
|
-
return `"${arg.replace(/"/g, '\\"')}"`;
|
|
135
|
-
}
|
|
136
|
-
|
|
137
|
-
// src/campaign/drivers/evolutionary.ts
|
|
138
|
-
function evolutionaryDriver(opts) {
|
|
139
|
-
return {
|
|
140
|
-
kind: `evolutionary:${opts.mutator.kind}`,
|
|
141
|
-
async propose({ currentSurface, findings, populationSize, signal }) {
|
|
142
|
-
return opts.mutator.mutate({
|
|
143
|
-
findings: findings.length > 0 ? findings : opts.findings ?? [],
|
|
144
|
-
currentSurface,
|
|
145
|
-
populationSize,
|
|
146
|
-
signal
|
|
147
|
-
});
|
|
148
|
-
}
|
|
149
|
-
};
|
|
150
|
-
}
|
|
151
|
-
|
|
152
|
-
// src/campaign/drivers/gepa.ts
|
|
153
|
-
var REFLECTION_SYSTEM = 'You are an expert prompt engineer. Output ONLY a JSON object of shape {"proposals":[{"label":string,"rationale":string,"payload":string}]} where each `payload` is the FULL improved surface text. No prose outside the JSON.';
|
|
154
|
-
function gepaDriver(opts) {
|
|
155
|
-
const evidenceK = opts.evidenceK ?? 3;
|
|
156
|
-
return {
|
|
157
|
-
kind: "gepa",
|
|
158
|
-
async propose(ctx) {
|
|
159
|
-
const parent = typeof ctx.currentSurface === "string" ? ctx.currentSurface : JSON.stringify(ctx.currentSurface);
|
|
160
|
-
const { top, bottom, target } = buildEvidence(ctx, evidenceK, opts.target);
|
|
161
|
-
const userPrompt = buildReflectionPrompt({
|
|
162
|
-
target,
|
|
163
|
-
parentPayload: parent,
|
|
164
|
-
topTrials: top,
|
|
165
|
-
bottomTrials: bottom,
|
|
166
|
-
childCount: ctx.populationSize,
|
|
167
|
-
mutationPrimitives: opts.mutationPrimitives
|
|
168
|
-
});
|
|
169
|
-
const result = await callLlm(
|
|
170
|
-
{
|
|
171
|
-
model: opts.model,
|
|
172
|
-
messages: [
|
|
173
|
-
{ role: "system", content: REFLECTION_SYSTEM },
|
|
174
|
-
{ role: "user", content: userPrompt }
|
|
175
|
-
],
|
|
176
|
-
jsonMode: true,
|
|
177
|
-
temperature: opts.temperature ?? 0.7,
|
|
178
|
-
maxTokens: opts.maxTokens ?? 6e3
|
|
179
|
-
},
|
|
180
|
-
opts.llm
|
|
181
|
-
);
|
|
182
|
-
const proposals = parseReflectionResponse(result.content, ctx.populationSize);
|
|
183
|
-
const out = [];
|
|
184
|
-
for (const proposal of proposals) {
|
|
185
|
-
const text = typeof proposal.payload === "string" ? proposal.payload.trim() : "";
|
|
186
|
-
if (text && text !== parent && !out.includes(text)) out.push(text);
|
|
187
|
-
}
|
|
188
|
-
return out;
|
|
189
|
-
}
|
|
190
|
-
};
|
|
191
|
-
}
|
|
192
|
-
function buildEvidence(ctx, evidenceK, baseTarget) {
|
|
193
|
-
const last = ctx.history.at(-1);
|
|
194
|
-
if (!last || last.candidates.length === 0) {
|
|
195
|
-
return { top: [], bottom: [], target: baseTarget };
|
|
196
|
-
}
|
|
197
|
-
const best = [...last.candidates].sort((a, b) => b.composite - a.composite)[0];
|
|
198
|
-
if (!best) return { top: [], bottom: [], target: baseTarget };
|
|
199
|
-
const byScore = [...best.scenarios].sort((a, b) => b.composite - a.composite);
|
|
200
|
-
const toTrace = (s) => ({
|
|
201
|
-
id: s.scenarioId,
|
|
202
|
-
score: s.composite
|
|
203
|
-
});
|
|
204
|
-
const top = byScore.slice(0, evidenceK).map(toTrace);
|
|
205
|
-
const bottom = byScore.slice(-evidenceK).reverse().map(toTrace);
|
|
206
|
-
const weakest = Object.entries(best.dimensions).sort((a, b) => a[1] - b[1]).slice(0, 3).map(([dim, value]) => `${dim} (${value.toFixed(2)})`);
|
|
207
|
-
const target = weakest.length > 0 ? `${baseTarget} \u2014 weakest dimensions: ${weakest.join(", ")}` : baseTarget;
|
|
208
|
-
return { top, bottom, target };
|
|
209
|
-
}
|
|
210
|
-
|
|
211
|
-
// src/campaign/gates/compose.ts
|
|
212
|
-
function composeGate(...gates) {
|
|
213
|
-
if (gates.length === 0) {
|
|
214
|
-
throw new Error("composeGate requires at least one gate");
|
|
215
|
-
}
|
|
216
|
-
return {
|
|
217
|
-
name: `composed(${gates.map((g) => g.name).join(",")})`,
|
|
218
|
-
async decide(ctx) {
|
|
219
|
-
const results = [];
|
|
220
|
-
for (const gate of gates) {
|
|
221
|
-
const res = await gate.decide(ctx);
|
|
222
|
-
results.push({ gate, res });
|
|
223
|
-
}
|
|
224
|
-
const decisions = results.map((r) => r.res.decision);
|
|
225
|
-
const overall = decisions.every((d) => d === "ship") ? "ship" : decisions.includes("arch_ceiling") ? "arch_ceiling" : decisions.includes("model_ceiling") ? "model_ceiling" : decisions.includes("hold") ? "hold" : "need_more_work";
|
|
226
|
-
const contributing = results.flatMap(
|
|
227
|
-
(r) => r.res.contributingGates.length > 0 ? r.res.contributingGates : [{ name: r.gate.name, passed: r.res.decision === "ship", detail: r.res }]
|
|
228
|
-
);
|
|
229
|
-
const reasons = results.flatMap(
|
|
230
|
-
(r) => r.res.reasons.map((reason) => `[${r.gate.name}] ${reason}`)
|
|
231
|
-
);
|
|
232
|
-
return {
|
|
233
|
-
decision: overall,
|
|
234
|
-
reasons,
|
|
235
|
-
contributingGates: contributing,
|
|
236
|
-
delta: results[0]?.res.delta
|
|
237
|
-
};
|
|
238
|
-
}
|
|
239
|
-
};
|
|
240
|
-
}
|
|
241
|
-
|
|
242
|
-
// src/campaign/gates/default-production-gate.ts
|
|
243
|
-
function defaultProductionGate(options) {
|
|
244
|
-
const deltaThreshold = options.deltaThreshold ?? 0.5;
|
|
245
|
-
const blockOnGaming = options.blockOnRewardHackingGaming ?? true;
|
|
246
|
-
return {
|
|
247
|
-
name: "defaultProductionGate",
|
|
248
|
-
async decide(ctx) {
|
|
249
|
-
const reasons = [];
|
|
250
|
-
const contributing = [];
|
|
251
|
-
const baselineComposite = meanComposite(
|
|
252
|
-
ctx.baselineArtifacts,
|
|
253
|
-
ctx.baselineJudgeScores ?? ctx.judgeScores,
|
|
254
|
-
options.holdoutScenarios
|
|
255
|
-
);
|
|
256
|
-
const candidateComposite = meanComposite(
|
|
257
|
-
ctx.candidateArtifacts,
|
|
258
|
-
ctx.judgeScores,
|
|
259
|
-
options.holdoutScenarios
|
|
260
|
-
);
|
|
261
|
-
const delta = candidateComposite - baselineComposite;
|
|
262
|
-
const heldoutPass = delta >= deltaThreshold;
|
|
263
|
-
contributing.push({
|
|
264
|
-
name: "heldout-delta",
|
|
265
|
-
passed: heldoutPass,
|
|
266
|
-
detail: { baselineComposite, candidateComposite, delta, deltaThreshold }
|
|
267
|
-
});
|
|
268
|
-
if (!heldoutPass) {
|
|
269
|
-
reasons.push(`heldout delta ${delta.toFixed(3)} < threshold ${deltaThreshold}`);
|
|
270
|
-
}
|
|
271
|
-
const budgetPass = options.budgetUsd === void 0 || ctx.cost.candidate + ctx.cost.baseline <= options.budgetUsd;
|
|
272
|
-
contributing.push({
|
|
273
|
-
name: "budget",
|
|
274
|
-
passed: budgetPass,
|
|
275
|
-
detail: {
|
|
276
|
-
candidateUsd: ctx.cost.candidate,
|
|
277
|
-
baselineUsd: ctx.cost.baseline,
|
|
278
|
-
budgetUsd: options.budgetUsd
|
|
279
|
-
}
|
|
280
|
-
});
|
|
281
|
-
if (!budgetPass) {
|
|
282
|
-
reasons.push(
|
|
283
|
-
`spend ${(ctx.cost.candidate + ctx.cost.baseline).toFixed(2)} > budget ${options.budgetUsd}`
|
|
284
|
-
);
|
|
285
|
-
}
|
|
286
|
-
const redTeamFindings = options.redTeamBattery ? probeRedTeam(ctx.candidateArtifacts, options.redTeamBattery) : { passed: true, findings: [] };
|
|
287
|
-
contributing.push({
|
|
288
|
-
name: "red-team",
|
|
289
|
-
passed: redTeamFindings.passed,
|
|
290
|
-
detail: {
|
|
291
|
-
failures: redTeamFindings.findings.length,
|
|
292
|
-
sample: redTeamFindings.findings.slice(0, 3)
|
|
293
|
-
}
|
|
294
|
-
});
|
|
295
|
-
if (!redTeamFindings.passed) {
|
|
296
|
-
reasons.push(`red-team probe failed (${redTeamFindings.findings.length} findings)`);
|
|
297
|
-
}
|
|
298
|
-
let rewardHackingReport = null;
|
|
299
|
-
if (options.recentRuns && options.recentRuns.length >= 10) {
|
|
300
|
-
rewardHackingReport = detectRewardHacking({ runs: options.recentRuns });
|
|
301
|
-
}
|
|
302
|
-
const gamingThreshold = 0.6;
|
|
303
|
-
const gamingFindings = (rewardHackingReport?.findings ?? []).filter(
|
|
304
|
-
(f) => f.severity >= gamingThreshold
|
|
305
|
-
);
|
|
306
|
-
const rewardHackingPass = !rewardHackingReport || !blockOnGaming || gamingFindings.length === 0 && rewardHackingReport.verdict !== "gaming";
|
|
307
|
-
contributing.push({
|
|
308
|
-
name: "reward-hacking",
|
|
309
|
-
passed: rewardHackingPass,
|
|
310
|
-
detail: { report: rewardHackingReport, gamingFindingCount: gamingFindings.length }
|
|
311
|
-
});
|
|
312
|
-
if (!rewardHackingPass) {
|
|
313
|
-
reasons.push(
|
|
314
|
-
`reward-hacking detector flagged ${gamingFindings.length} gaming-severity findings (verdict=${rewardHackingReport.verdict})`
|
|
315
|
-
);
|
|
316
|
-
}
|
|
317
|
-
let canaryReport = null;
|
|
318
|
-
if (options.recentRuns && options.recentRuns.length >= 10) {
|
|
319
|
-
canaryReport = runCanaries(options.recentRuns, {});
|
|
320
|
-
}
|
|
321
|
-
const errorAlerts = (canaryReport?.alerts ?? []).filter((a) => a.severity === "error");
|
|
322
|
-
const canaryPass = errorAlerts.length === 0;
|
|
323
|
-
contributing.push({
|
|
324
|
-
name: "canary",
|
|
325
|
-
passed: canaryPass,
|
|
326
|
-
detail: { totalAlerts: canaryReport?.alerts.length ?? 0, errorAlerts: errorAlerts.length }
|
|
327
|
-
});
|
|
328
|
-
if (!canaryPass) {
|
|
329
|
-
reasons.push(`canary error alerts: ${errorAlerts.length}`);
|
|
330
|
-
}
|
|
331
|
-
const allPassed = contributing.every((c) => c.passed);
|
|
332
|
-
const decision = allPassed ? "ship" : "hold";
|
|
333
|
-
return {
|
|
334
|
-
decision,
|
|
335
|
-
reasons: reasons.length > 0 ? reasons : ["all gates passed"],
|
|
336
|
-
contributingGates: contributing,
|
|
337
|
-
delta
|
|
338
|
-
};
|
|
339
|
-
}
|
|
340
|
-
};
|
|
341
|
-
}
|
|
342
|
-
function meanComposite(artifacts, judgeScoresByCell, scenarios) {
|
|
343
|
-
if (!artifacts || artifacts.size === 0) return 0;
|
|
344
|
-
const scenarioIds = new Set(scenarios.map((s) => s.id));
|
|
345
|
-
const composites = [];
|
|
346
|
-
for (const [cellId, scores] of judgeScoresByCell) {
|
|
347
|
-
const scenarioId = cellId.split(":")[0] ?? "";
|
|
348
|
-
if (!scenarioIds.has(scenarioId)) continue;
|
|
349
|
-
const cellComposites = Object.values(scores).map((s) => s.composite);
|
|
350
|
-
if (cellComposites.length === 0) continue;
|
|
351
|
-
composites.push(cellComposites.reduce((a, b) => a + b, 0) / cellComposites.length);
|
|
352
|
-
}
|
|
353
|
-
if (composites.length === 0) return 0;
|
|
354
|
-
return composites.reduce((a, b) => a + b, 0) / composites.length;
|
|
355
|
-
}
|
|
356
|
-
function probeRedTeam(artifacts, battery) {
|
|
357
|
-
const findings = [];
|
|
358
|
-
for (const [_cellId, artifact] of artifacts) {
|
|
359
|
-
const text = extractText(artifact);
|
|
360
|
-
if (text === void 0) continue;
|
|
361
|
-
for (const rtCase of battery) {
|
|
362
|
-
const finding = scoreRedTeamOutput(text, [], rtCase);
|
|
363
|
-
if (!finding.passed) {
|
|
364
|
-
findings.push({ scenarioId: rtCase.id, reason: finding.reason ?? "red-team probe failed" });
|
|
365
|
-
}
|
|
366
|
-
}
|
|
367
|
-
}
|
|
368
|
-
return { passed: findings.length === 0, findings };
|
|
369
|
-
}
|
|
370
|
-
function extractText(artifact) {
|
|
371
|
-
if (typeof artifact === "string") return artifact;
|
|
372
|
-
if (artifact && typeof artifact === "object") {
|
|
373
|
-
const rec = artifact;
|
|
374
|
-
if (typeof rec.text === "string") return rec.text;
|
|
375
|
-
if (typeof rec.output === "string") return rec.output;
|
|
376
|
-
if (typeof rec.content === "string") return rec.content;
|
|
377
|
-
}
|
|
378
|
-
return void 0;
|
|
379
|
-
}
|
|
380
|
-
|
|
381
|
-
// src/campaign/gates/heldout-gate.ts
|
|
382
|
-
function heldOutGate(options) {
|
|
383
|
-
const deltaThreshold = options.deltaThreshold ?? 0.5;
|
|
384
|
-
return {
|
|
385
|
-
name: "heldOutGate",
|
|
386
|
-
async decide(ctx) {
|
|
387
|
-
const scenarioIds = new Set(options.scenarios.map((s) => s.id));
|
|
388
|
-
const baseline = meanForScenarios(ctx.baselineJudgeScores ?? ctx.judgeScores, scenarioIds);
|
|
389
|
-
const candidate = meanForScenarios(ctx.judgeScores, scenarioIds);
|
|
390
|
-
const delta = candidate - baseline;
|
|
391
|
-
const passed = delta >= deltaThreshold;
|
|
392
|
-
return {
|
|
393
|
-
decision: passed ? "ship" : "hold",
|
|
394
|
-
reasons: passed ? [`held-out delta ${delta.toFixed(3)} \u2265 ${deltaThreshold}`] : [`held-out delta ${delta.toFixed(3)} < ${deltaThreshold}`],
|
|
395
|
-
contributingGates: [
|
|
396
|
-
{ name: "heldOutGate", passed, detail: { baseline, candidate, delta, deltaThreshold } }
|
|
397
|
-
],
|
|
398
|
-
delta
|
|
399
|
-
};
|
|
400
|
-
}
|
|
401
|
-
};
|
|
402
|
-
}
|
|
403
|
-
function meanForScenarios(judgeScoresByCell, scenarioIds) {
|
|
404
|
-
const composites = [];
|
|
405
|
-
for (const [cellId, scores] of judgeScoresByCell) {
|
|
406
|
-
const scenarioId = cellId.split(":")[0] ?? "";
|
|
407
|
-
if (!scenarioIds.has(scenarioId)) continue;
|
|
408
|
-
const vals = Object.values(scores).map((s) => s.composite);
|
|
409
|
-
if (vals.length > 0) composites.push(vals.reduce((a, b) => a + b, 0) / vals.length);
|
|
410
|
-
}
|
|
411
|
-
return composites.length === 0 ? 0 : composites.reduce((a, b) => a + b, 0) / composites.length;
|
|
412
|
-
}
|
|
413
|
-
|
|
414
27
|
// src/campaign/labeled-store/fs-adapter.ts
|
|
415
28
|
import { createHash } from "crypto";
|
|
416
|
-
import { existsSync, mkdirSync, readFileSync, writeFileSync as writeFileSync2 } from "fs";
|
|
417
|
-
import { join as join2 } from "path";
|
|
29
|
+
import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs";
|
|
30
|
+
import { join } from "path";
|
|
418
31
|
var LabeledScenarioStoreError = class extends Error {
|
|
419
32
|
constructor(code, message) {
|
|
420
33
|
super(message);
|
|
@@ -561,7 +174,7 @@ var FsLabeledScenarioStore = class {
|
|
|
561
174
|
};
|
|
562
175
|
}
|
|
563
176
|
pathForSource(source) {
|
|
564
|
-
return join2(this.options.root, `${source}.jsonl`);
|
|
177
|
+
return join(this.options.root, `${source}.jsonl`);
|
|
565
178
|
}
|
|
566
179
|
};
|
|
567
180
|
var ALL_SOURCES = [
|
|
@@ -600,239 +213,16 @@ function sha256(input) {
|
|
|
600
213
|
function appendLine(path, line) {
|
|
601
214
|
if (existsSync(path)) {
|
|
602
215
|
const existing = readFileSync(path, "utf8");
|
|
603
|
-
|
|
216
|
+
writeFileSync(path, existing + line);
|
|
604
217
|
} else {
|
|
605
|
-
|
|
606
|
-
}
|
|
607
|
-
}
|
|
608
|
-
|
|
609
|
-
// src/campaign/presets/run-eval.ts
|
|
610
|
-
async function runEval(opts) {
|
|
611
|
-
return runCampaign(opts);
|
|
612
|
-
}
|
|
613
|
-
|
|
614
|
-
// src/campaign/presets/run-optimization.ts
|
|
615
|
-
import { createHash as createHash2 } from "crypto";
|
|
616
|
-
async function runOptimization(opts) {
|
|
617
|
-
const promoteTopK = opts.promoteTopK ?? 2;
|
|
618
|
-
const baselineCampaign = await runCampaign({
|
|
619
|
-
...opts,
|
|
620
|
-
dispatch: (scenario, ctx) => opts.dispatchWithSurface(opts.baselineSurface, scenario, ctx),
|
|
621
|
-
runDir: `${opts.runDir}/baseline`
|
|
622
|
-
});
|
|
623
|
-
const generations = [];
|
|
624
|
-
const history = [];
|
|
625
|
-
let currentSurfaces = [opts.baselineSurface];
|
|
626
|
-
let winnerSurface = opts.baselineSurface;
|
|
627
|
-
let winnerSurfaceHash = surfaceHash(opts.baselineSurface);
|
|
628
|
-
let winnerComposite = meanComposite2(baselineCampaign);
|
|
629
|
-
for (let gen = 0; gen < opts.maxGenerations; gen++) {
|
|
630
|
-
if (opts.driver.decide?.({ history }).stop) break;
|
|
631
|
-
const candidates = await opts.driver.propose({
|
|
632
|
-
currentSurface: currentSurfaces[0] ?? opts.baselineSurface,
|
|
633
|
-
history,
|
|
634
|
-
findings: [],
|
|
635
|
-
populationSize: opts.populationSize,
|
|
636
|
-
generation: gen,
|
|
637
|
-
signal: new AbortController().signal,
|
|
638
|
-
report: opts.report,
|
|
639
|
-
dataset: opts.labeledStore && opts.labeledStore !== "off" ? opts.labeledStore : void 0,
|
|
640
|
-
maxImprovementShots: opts.maxImprovementShots
|
|
641
|
-
});
|
|
642
|
-
const surfaceResults = [];
|
|
643
|
-
for (let i = 0; i < candidates.length; i++) {
|
|
644
|
-
const surface = candidates[i];
|
|
645
|
-
const hash = surfaceHash(surface);
|
|
646
|
-
const campaign = await runCampaign({
|
|
647
|
-
...opts,
|
|
648
|
-
dispatch: (scenario, ctx) => opts.dispatchWithSurface(surface, scenario, ctx),
|
|
649
|
-
runDir: `${opts.runDir}/gen-${gen}/candidate-${i}`
|
|
650
|
-
});
|
|
651
|
-
const composite = meanComposite2(campaign);
|
|
652
|
-
surfaceResults.push({ surfaceHash: hash, surface, campaign, composite });
|
|
653
|
-
}
|
|
654
|
-
surfaceResults.sort((a, b) => b.composite - a.composite);
|
|
655
|
-
const promoted = surfaceResults.slice(0, promoteTopK);
|
|
656
|
-
currentSurfaces = promoted.map((p) => p.surface);
|
|
657
|
-
const top = surfaceResults[0];
|
|
658
|
-
if (top && top.composite > winnerComposite) {
|
|
659
|
-
winnerSurface = top.surface;
|
|
660
|
-
winnerSurfaceHash = top.surfaceHash;
|
|
661
|
-
winnerComposite = top.composite;
|
|
662
|
-
}
|
|
663
|
-
const record = {
|
|
664
|
-
generationIndex: gen,
|
|
665
|
-
candidates: surfaceResults.map((s) => {
|
|
666
|
-
const breakdown = candidateBreakdown(s.campaign);
|
|
667
|
-
return {
|
|
668
|
-
surfaceHash: s.surfaceHash,
|
|
669
|
-
composite: s.composite,
|
|
670
|
-
ci95: [s.composite, s.composite],
|
|
671
|
-
dimensions: breakdown.dimensions,
|
|
672
|
-
scenarios: breakdown.scenarios
|
|
673
|
-
};
|
|
674
|
-
}),
|
|
675
|
-
promoted: promoted.map((p) => p.surfaceHash)
|
|
676
|
-
};
|
|
677
|
-
history.push(record);
|
|
678
|
-
generations.push({
|
|
679
|
-
record,
|
|
680
|
-
surfaces: surfaceResults.map((s) => ({
|
|
681
|
-
surfaceHash: s.surfaceHash,
|
|
682
|
-
surface: s.surface,
|
|
683
|
-
campaign: s.campaign
|
|
684
|
-
}))
|
|
685
|
-
});
|
|
686
|
-
}
|
|
687
|
-
return {
|
|
688
|
-
generations,
|
|
689
|
-
winnerSurface,
|
|
690
|
-
winnerSurfaceHash,
|
|
691
|
-
baselineCampaign
|
|
692
|
-
};
|
|
693
|
-
}
|
|
694
|
-
function surfaceHash(surface) {
|
|
695
|
-
const material = typeof surface === "string" ? surface : JSON.stringify({
|
|
696
|
-
kind: surface.kind,
|
|
697
|
-
worktreeRef: surface.worktreeRef,
|
|
698
|
-
baseRef: surface.baseRef ?? null
|
|
699
|
-
});
|
|
700
|
-
return createHash2("sha256").update(material).digest("hex").slice(0, 16);
|
|
701
|
-
}
|
|
702
|
-
function meanComposite2(campaign) {
|
|
703
|
-
const composites = [];
|
|
704
|
-
for (const cell of campaign.cells) {
|
|
705
|
-
const cellComposites = Object.values(cell.judgeScores).map((s) => s.composite);
|
|
706
|
-
if (cellComposites.length > 0) {
|
|
707
|
-
composites.push(cellComposites.reduce((a, b) => a + b, 0) / cellComposites.length);
|
|
708
|
-
}
|
|
709
|
-
}
|
|
710
|
-
return composites.length === 0 ? 0 : composites.reduce((a, b) => a + b, 0) / composites.length;
|
|
711
|
-
}
|
|
712
|
-
function candidateBreakdown(campaign) {
|
|
713
|
-
const dimSums = {};
|
|
714
|
-
const dimCounts = {};
|
|
715
|
-
const byScenario = /* @__PURE__ */ new Map();
|
|
716
|
-
for (const cell of campaign.cells) {
|
|
717
|
-
const judgeScores = Object.values(cell.judgeScores);
|
|
718
|
-
if (judgeScores.length === 0) continue;
|
|
719
|
-
const cellComposite = judgeScores.reduce((a, s) => a + s.composite, 0) / judgeScores.length;
|
|
720
|
-
const arr = byScenario.get(cell.scenarioId) ?? [];
|
|
721
|
-
arr.push(cellComposite);
|
|
722
|
-
byScenario.set(cell.scenarioId, arr);
|
|
723
|
-
for (const score of judgeScores) {
|
|
724
|
-
for (const [key, value] of Object.entries(score.dimensions)) {
|
|
725
|
-
dimSums[key] = (dimSums[key] ?? 0) + value;
|
|
726
|
-
dimCounts[key] = (dimCounts[key] ?? 0) + 1;
|
|
727
|
-
}
|
|
728
|
-
}
|
|
729
|
-
}
|
|
730
|
-
const dimensions = {};
|
|
731
|
-
for (const key of Object.keys(dimSums)) {
|
|
732
|
-
const count = dimCounts[key] ?? 0;
|
|
733
|
-
dimensions[key] = count > 0 ? (dimSums[key] ?? 0) / count : 0;
|
|
734
|
-
}
|
|
735
|
-
const scenarios = [...byScenario.entries()].map(([scenarioId, comps]) => ({
|
|
736
|
-
scenarioId,
|
|
737
|
-
composite: comps.reduce((a, b) => a + b, 0) / comps.length
|
|
738
|
-
}));
|
|
739
|
-
return { dimensions, scenarios };
|
|
740
|
-
}
|
|
741
|
-
|
|
742
|
-
// src/campaign/presets/run-improvement-loop.ts
|
|
743
|
-
async function runImprovementLoop(opts) {
|
|
744
|
-
if (opts.autoOnPromote === "config") {
|
|
745
|
-
throw new Error(
|
|
746
|
-
"runImprovementLoop: autoOnPromote='config' is deferred to Pass B (requires shadow deploy + rollback + ensemble judges). Use 'pr' or 'none' in v0.40."
|
|
747
|
-
);
|
|
748
|
-
}
|
|
749
|
-
if (opts.tracing === "off" && opts.driver) {
|
|
750
|
-
throw new Error(
|
|
751
|
-
"runImprovementLoop: tracing='off' is forbidden when a driver is wired. The improvement loop without traces is unattributable; candidate surfaces cannot be cited back to spans and the optimization dataset goes unfed."
|
|
752
|
-
);
|
|
753
|
-
}
|
|
754
|
-
if (opts.autoOnPromote === "pr" && (!opts.ghOwner || !opts.ghRepo)) {
|
|
755
|
-
throw new Error("runImprovementLoop: autoOnPromote='pr' requires ghOwner + ghRepo.");
|
|
756
|
-
}
|
|
757
|
-
const optimization = await runOptimization(opts);
|
|
758
|
-
const { runCampaign: runCampaign2 } = await import("../run-campaign-GNDO66B4.js");
|
|
759
|
-
const baselineOnHoldout = await runCampaign2({
|
|
760
|
-
...opts,
|
|
761
|
-
scenarios: opts.holdoutScenarios,
|
|
762
|
-
dispatch: (scenario, ctx) => opts.dispatchWithSurface(opts.baselineSurface, scenario, ctx),
|
|
763
|
-
runDir: `${opts.runDir}/holdout-baseline`
|
|
764
|
-
});
|
|
765
|
-
const winnerOnHoldout = await runCampaign2({
|
|
766
|
-
...opts,
|
|
767
|
-
scenarios: opts.holdoutScenarios,
|
|
768
|
-
dispatch: (scenario, ctx) => opts.dispatchWithSurface(optimization.winnerSurface, scenario, ctx),
|
|
769
|
-
runDir: `${opts.runDir}/holdout-winner`
|
|
770
|
-
});
|
|
771
|
-
const candidateArtifacts = /* @__PURE__ */ new Map();
|
|
772
|
-
const baselineArtifacts = /* @__PURE__ */ new Map();
|
|
773
|
-
const judgeScores = /* @__PURE__ */ new Map();
|
|
774
|
-
const baselineJudgeScores = /* @__PURE__ */ new Map();
|
|
775
|
-
for (const cell of winnerOnHoldout.cells) {
|
|
776
|
-
candidateArtifacts.set(cell.cellId, cell.artifact);
|
|
777
|
-
judgeScores.set(cell.cellId, cell.judgeScores);
|
|
778
|
-
}
|
|
779
|
-
for (const cell of baselineOnHoldout.cells) {
|
|
780
|
-
baselineArtifacts.set(cell.cellId, cell.artifact);
|
|
781
|
-
baselineJudgeScores.set(cell.cellId, cell.judgeScores);
|
|
782
|
-
}
|
|
783
|
-
const gateResult = await opts.gate.decide({
|
|
784
|
-
candidateArtifacts,
|
|
785
|
-
baselineArtifacts,
|
|
786
|
-
judgeScores,
|
|
787
|
-
baselineJudgeScores,
|
|
788
|
-
scenarios: opts.holdoutScenarios,
|
|
789
|
-
cost: {
|
|
790
|
-
candidate: winnerOnHoldout.aggregates.totalCostUsd,
|
|
791
|
-
baseline: baselineOnHoldout.aggregates.totalCostUsd
|
|
792
|
-
},
|
|
793
|
-
signal: new AbortController().signal
|
|
794
|
-
});
|
|
795
|
-
let prResult;
|
|
796
|
-
if (opts.autoOnPromote === "pr" && gateResult.decision === "ship") {
|
|
797
|
-
const render = opts.renderPromotedDiff ?? defaultRenderDiff;
|
|
798
|
-
const promotedDiff = render(optimization.winnerSurface, opts.baselineSurface);
|
|
799
|
-
prResult = openAutoPr({
|
|
800
|
-
result: winnerOnHoldout,
|
|
801
|
-
gate: gateResult,
|
|
802
|
-
promotedDiff,
|
|
803
|
-
ghOwner: opts.ghOwner,
|
|
804
|
-
ghRepo: opts.ghRepo
|
|
805
|
-
});
|
|
806
|
-
}
|
|
807
|
-
return {
|
|
808
|
-
...optimization,
|
|
809
|
-
baselineOnHoldout,
|
|
810
|
-
winnerOnHoldout,
|
|
811
|
-
gateResult,
|
|
812
|
-
prResult
|
|
813
|
-
};
|
|
814
|
-
}
|
|
815
|
-
function defaultRenderDiff(winnerSurface, baselineSurface) {
|
|
816
|
-
if (typeof winnerSurface !== "string" || typeof baselineSurface !== "string") {
|
|
817
|
-
const fmt = (s) => typeof s === "string" ? "(prompt surface)" : `worktree=${s.worktreeRef}${s.baseRef ? ` base=${s.baseRef}` : ""}${s.summary ? `
|
|
818
|
-
${s.summary}` : ""}`;
|
|
819
|
-
return `--- baseline
|
|
820
|
-
${fmt(baselineSurface)}
|
|
821
|
-
+++ winner
|
|
822
|
-
${fmt(winnerSurface)}`;
|
|
218
|
+
writeFileSync(path, line);
|
|
823
219
|
}
|
|
824
|
-
const lines = [];
|
|
825
|
-
lines.push("--- baseline");
|
|
826
|
-
lines.push("+++ winner");
|
|
827
|
-
for (const l of baselineSurface.split("\n")) lines.push(`- ${l}`);
|
|
828
|
-
for (const l of winnerSurface.split("\n")) lines.push(`+ ${l}`);
|
|
829
|
-
return lines.join("\n");
|
|
830
220
|
}
|
|
831
221
|
|
|
832
222
|
// src/campaign/worktree/index.ts
|
|
833
223
|
import { execFileSync } from "child_process";
|
|
834
224
|
import { existsSync as existsSync2 } from "fs";
|
|
835
|
-
import { basename, isAbsolute, join as join3 } from "path";
|
|
225
|
+
import { basename, isAbsolute, join as join2 } from "path";
|
|
836
226
|
var WorktreeAdapterError = class extends Error {
|
|
837
227
|
constructor(message, cause) {
|
|
838
228
|
super(message);
|
|
@@ -854,13 +244,13 @@ function slug(label) {
|
|
|
854
244
|
}
|
|
855
245
|
function gitWorktreeAdapter(opts) {
|
|
856
246
|
const git = opts.git ?? defaultGit;
|
|
857
|
-
const worktreeDir = opts.worktreeDir ?? join3(opts.repoRoot, ".worktrees");
|
|
247
|
+
const worktreeDir = opts.worktreeDir ?? join2(opts.repoRoot, ".worktrees");
|
|
858
248
|
const branchPrefix = opts.branchPrefix ?? "improve";
|
|
859
249
|
return {
|
|
860
250
|
async create({ baseRef, label }) {
|
|
861
251
|
const id = `${slug(label)}-${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 6)}`;
|
|
862
252
|
const branch = `${branchPrefix}/${id}`;
|
|
863
|
-
const path = join3(worktreeDir, id);
|
|
253
|
+
const path = join2(worktreeDir, id);
|
|
864
254
|
git(["worktree", "add", "-b", branch, path, baseRef], opts.repoRoot);
|
|
865
255
|
return { path, branch, baseRef };
|
|
866
256
|
},
|
|
@@ -885,7 +275,7 @@ function gitWorktreeAdapter(opts) {
|
|
|
885
275
|
}
|
|
886
276
|
function resolveWorktreePath(surface, worktreeDir) {
|
|
887
277
|
if (isAbsolute(surface.worktreeRef) && existsSync2(surface.worktreeRef)) return surface.worktreeRef;
|
|
888
|
-
if (worktreeDir) return join3(worktreeDir, basename(surface.worktreeRef));
|
|
278
|
+
if (worktreeDir) return join2(worktreeDir, basename(surface.worktreeRef));
|
|
889
279
|
return surface.worktreeRef;
|
|
890
280
|
}
|
|
891
281
|
export {
|