@tangle-network/agent-eval 0.65.0 → 0.67.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +25 -0
- package/dist/adapters/otel.d.ts +1 -1
- package/dist/campaign/index.d.ts +110 -6
- package/dist/campaign/index.js +26 -19
- package/dist/campaign/index.js.map +1 -1
- package/dist/{chunk-7TPYV2ER.js → chunk-6XQIEUQ2.js} +140 -7
- package/dist/chunk-6XQIEUQ2.js.map +1 -0
- package/dist/{chunk-HKINEDRZ.js → chunk-DFS3FEXO.js} +3 -2
- package/dist/chunk-DFS3FEXO.js.map +1 -0
- package/dist/chunk-MZ2IYGGN.js +592 -0
- package/dist/chunk-MZ2IYGGN.js.map +1 -0
- package/dist/{chunk-4ODZXQV2.js → chunk-NV2PF37Q.js} +645 -2
- package/dist/chunk-NV2PF37Q.js.map +1 -0
- package/dist/contract/index.d.ts +11 -9
- package/dist/contract/index.js +11 -12
- package/dist/contract/index.js.map +1 -1
- package/dist/hosted/index.d.ts +1 -1
- package/dist/hosted/index.js +1 -1
- package/dist/{index-CzhtwYBT.d.ts → index-DSEHMwvS.d.ts} +4 -2
- package/dist/index.d.ts +251 -7
- package/dist/index.js +292 -2
- package/dist/index.js.map +1 -1
- package/dist/openapi.json +1 -1
- package/dist/provenance-CChUqexv.d.ts +314 -0
- package/dist/{registry-DPly4_hZ.d.ts → registry-BGKyX6bw.d.ts} +2 -2
- package/dist/release-report-CN8hJlhk.d.ts +233 -0
- package/dist/reporting.d.ts +4 -3
- package/dist/{run-campaign-5J3ED2UJ.js → run-campaign-BVY3RGAZ.js} +2 -3
- package/dist/{provenance-lqyLpOYR.d.ts → run-improvement-loop-BKpM5T4t.d.ts} +51 -329
- package/dist/statistics-B7yCbi9i.d.ts +253 -0
- package/dist/{types-DhqpAi_z.d.ts → types-Croy5h7V.d.ts} +1 -1
- package/package.json +1 -1
- package/dist/chunk-4ODZXQV2.js.map +0 -1
- package/dist/chunk-7TPYV2ER.js.map +0 -1
- package/dist/chunk-CZRKD2X2.js +0 -1104
- package/dist/chunk-CZRKD2X2.js.map +0 -1
- package/dist/chunk-E22YUOAL.js +0 -111
- package/dist/chunk-E22YUOAL.js.map +0 -1
- package/dist/chunk-HKINEDRZ.js.map +0 -1
- package/dist/release-report-DGoeObZT.d.ts +0 -484
- /package/dist/{run-campaign-5J3ED2UJ.js.map → run-campaign-BVY3RGAZ.js.map} +0 -0
|
@@ -0,0 +1,592 @@
|
|
|
1
|
+
import {
|
|
2
|
+
runCanaries,
|
|
3
|
+
scoreRedTeamOutput
|
|
4
|
+
} from "./chunk-NV2PF37Q.js";
|
|
5
|
+
import {
|
|
6
|
+
runCampaign,
|
|
7
|
+
summarizeBackendIntegrity
|
|
8
|
+
} from "./chunk-6XQIEUQ2.js";
|
|
9
|
+
import {
|
|
10
|
+
detectRewardHacking
|
|
11
|
+
} from "./chunk-YV7J7X5N.js";
|
|
12
|
+
import {
|
|
13
|
+
pairedBootstrap
|
|
14
|
+
} from "./chunk-ITBRCT73.js";
|
|
15
|
+
|
|
16
|
+
// src/campaign/drivers/evolutionary.ts
|
|
17
|
+
/**
 * Build a campaign driver that delegates candidate proposal to a mutator.
 *
 * @param {object} opts
 * @param {{ kind: string, mutate: Function }} opts.mutator - Receives the proposal
 *   request; its `kind` is embedded in the driver's own `kind`.
 * @param {Array} [opts.findings] - Fallback findings used when a `propose` call
 *   supplies none.
 * @returns {{ kind: string, propose: Function }} Driver whose `propose` resolves to
 *   whatever `opts.mutator.mutate` returns.
 */
function evolutionaryDriver(opts) {
  // The kind is captured once, when the driver is created.
  const kind = `evolutionary:${opts.mutator.kind}`;

  async function propose({ currentSurface, findings, populationSize, signal }) {
    // Findings supplied by the caller win; the static ones are only a fallback.
    let effectiveFindings = findings;
    if (!(findings.length > 0)) {
      effectiveFindings = opts.findings ?? [];
    }
    return opts.mutator.mutate({
      findings: effectiveFindings,
      currentSurface,
      populationSize,
      signal
    });
  }

  return { kind, propose };
}
|
|
30
|
+
|
|
31
|
+
// src/campaign/gates/compose.ts
|
|
32
|
+
/**
 * Combine several gates into one gate that consults each of them in order.
 *
 * The composed decision is "ship" only when every gate says "ship". Otherwise the
 * first of "arch_ceiling", "model_ceiling", "hold" reported by any gate is used,
 * and "need_more_work" is the final fallback.
 *
 * @param {...{ name: string, decide: Function }} gates - At least one gate.
 * @returns {{ name: string, decide: Function }} The composed gate.
 * @throws {Error} When called with no gates.
 */
function composeGate(...gates) {
  if (gates.length === 0) {
    throw new Error("composeGate requires at least one gate");
  }

  const gateNames = gates.map((g) => g.name);
  // Non-ship outcomes in precedence order; the first one present wins.
  const blockingPrecedence = ["arch_ceiling", "model_ceiling", "hold"];

  return {
    name: `composed(${gateNames.join(",")})`,
    async decide(ctx) {
      // Gates are awaited one at a time, in the order they were passed in.
      const outcomes = [];
      for (const gate of gates) {
        outcomes.push({ gate, res: await gate.decide(ctx) });
      }

      const decisions = outcomes.map((outcome) => outcome.res.decision);
      let decision;
      if (decisions.every((d) => d === "ship")) {
        decision = "ship";
      } else {
        decision = blockingPrecedence.find((d) => decisions.includes(d)) ?? "need_more_work";
      }

      const contributingGates = [];
      const reasons = [];
      for (const { gate, res } of outcomes) {
        if (res.contributingGates.length > 0) {
          contributingGates.push(...res.contributingGates);
        } else {
          // A gate that reports no sub-gates contributes itself as a single entry.
          contributingGates.push({ name: gate.name, passed: res.decision === "ship", detail: res });
        }
        for (const reason of res.reasons) {
          reasons.push(`[${gate.name}] ${reason}`);
        }
      }

      return {
        decision,
        reasons,
        contributingGates,
        // The delta is taken from the first gate only.
        delta: outcomes[0].res.delta
      };
    }
  };
}
|
|
61
|
+
|
|
62
|
+
// src/campaign/gates/statistical-heldout.ts
|
|
63
|
+
/**
 * Pair candidate and baseline scores cell-by-cell over the holdout scenarios.
 *
 * A cell is in scope when the part of its id before the first ":" is in
 * `scenarioIds`. Each cell's value is the mean of `select(score)` over all of its
 * judges, counting only finite numbers. Cells without a usable value on either
 * side are dropped from the result.
 *
 * @param {Map<string, object>} candidate - Cell id -> per-judge scores for the candidate.
 * @param {Map<string, object>} baseline - Cell id -> per-judge scores for the baseline.
 * @param {Set<string>} scenarioIds - Holdout scenario ids.
 * @param {Function} select - Picks the metric out of one judge score.
 * @returns {{ before: number[], after: number[], cellIds: string[] }} Baseline values,
 *   candidate values and the cell ids they belong to, index-aligned.
 * @throws {Error} When the in-scope cell ids of both sides differ.
 */
function pairHoldout(candidate, baseline, scenarioIds, select) {
  // Mean of the selected metric across the judges of one cell; undefined if unusable.
  function meanForCell(scoresByCell, cellId) {
    const judgeScores = scoresByCell.get(cellId);
    if (!judgeScores) return undefined;
    let total = 0;
    let count = 0;
    for (const score of Object.values(judgeScores)) {
      const value = select(score);
      if (typeof value === "number" && Number.isFinite(value)) {
        total += value;
        count += 1;
      }
    }
    return count === 0 ? undefined : total / count;
  }

  // Sorted ids of the cells whose scenario prefix is part of the holdout set.
  function scopedCellIds(scoresByCell) {
    return [...scoresByCell.keys()]
      .filter((cellId) => scenarioIds.has(cellId.split(":")[0] ?? ""))
      .sort();
  }

  const candCells = scopedCellIds(candidate);
  const baseCells = scopedCellIds(baseline);
  const aligned = candCells.length === baseCells.length && candCells.every((id, i) => id === baseCells[i]);
  if (!aligned) {
    throw new Error(
      `pairHoldout: candidate/baseline holdout cells do not align \u2014 candidate=[${candCells.join(",")}] baseline=[${baseCells.join(",")}]. Both holdout campaigns must run the same scenarios with the same seed base.`
    );
  }

  const paired = { before: [], after: [], cellIds: [] };
  for (const cellId of candCells) {
    const baselineValue = meanForCell(baseline, cellId);
    const candidateValue = meanForCell(candidate, cellId);
    if (baselineValue !== undefined && candidateValue !== undefined) {
      paired.before.push(baselineValue);
      paired.after.push(candidateValue);
      paired.cellIds.push(cellId);
    }
  }
  return paired;
}
|
|
96
|
+
/**
 * Bootstrap the paired held-out deltas and decide whether the lift is significant.
 *
 * The lift counts as significant when there are at least `minProductiveRuns` pairs
 * and the lower bound of the bootstrap interval is strictly above `deltaThreshold`.
 *
 * @param {{ before: number[], after: number[] }} paired - Index-aligned baseline and
 *   candidate values.
 * @param {object} [opts]
 * @param {number} [opts.deltaThreshold=0]
 * @param {number} [opts.minProductiveRuns=3]
 * @param {number} [opts.confidence=0.95]
 * @param {number} [opts.resamples=2000]
 * @param {string} [opts.statistic="median"]
 * @param {number} [opts.seed=1337]
 * @returns {{ paired: object, bootstrap: object, n: number, significant: boolean, fewRuns: boolean }}
 */
function heldoutSignificance(paired, opts = {}) {
  const bootstrapOptions = {
    confidence: opts.confidence ?? 0.95,
    resamples: opts.resamples ?? 2e3,
    statistic: opts.statistic ?? "median",
    seed: opts.seed ?? 1337
  };
  const bootstrap = pairedBootstrap(paired.before, paired.after, bootstrapOptions);

  const n = paired.before.length;
  const fewRuns = n < (opts.minProductiveRuns ?? 3);
  // Too few pairs can never be significant, whatever the interval says.
  const significant = fewRuns ? false : bootstrap.low > (opts.deltaThreshold ?? 0);

  return { paired, bootstrap, n, significant, fewRuns };
}
|
|
110
|
+
/**
 * Guess the scale of a set of scores from their magnitude.
 *
 * @param {number[]} values - Scores to inspect.
 * @returns {number} 100 when any value has an absolute value above 1.5, otherwise 1.
 */
function detectScale(values) {
  for (const value of values) {
    if (Math.abs(value) > 1.5) {
      return 100;
    }
  }
  return 1;
}
|
|
113
|
+
/**
 * Check each critical dimension for a regression of the candidate against the baseline.
 *
 * For every dimension the per-cell values are paired and bootstrapped on the median.
 * The dimension counts as regressed when the lower bound of the interval falls below
 * `-tolerance`. Dimensions with no paired cells are left out of the result.
 *
 * @param {Map<string, object>} candidate - Cell id -> per-judge scores for the candidate.
 * @param {Map<string, object>} baseline - Cell id -> per-judge scores for the baseline.
 * @param {Set<string>} scenarioIds - Holdout scenario ids.
 * @param {string[]} criticalDimensions - Dimension names to guard.
 * @param {object} [opts]
 * @param {number} [opts.tolerance] - Defaults to 0.05 times the detected score scale.
 * @param {number} [opts.confidence=0.95]
 * @param {number} [opts.resamples=2000]
 * @param {number} [opts.seed=1337]
 * @returns {Array<{ dimension: string, bootstrap: object, regressed: boolean, tolerance: number, n: number }>}
 */
function dimensionRegressions(candidate, baseline, scenarioIds, criticalDimensions, opts = {}) {
  const report = [];
  for (const dimension of criticalDimensions) {
    const { before, after } = pairHoldout(
      candidate,
      baseline,
      scenarioIds,
      (score) => score.dimensions[dimension]
    );
    const n = before.length;
    if (n === 0) continue;

    // Without an explicit tolerance, allow 5% of whatever scale the scores appear to use.
    const tolerance = opts.tolerance ?? 0.05 * detectScale(before.concat(after));
    const bootstrap = pairedBootstrap(before, after, {
      confidence: opts.confidence ?? 0.95,
      resamples: opts.resamples ?? 2e3,
      statistic: "median",
      seed: opts.seed ?? 1337
    });
    const regressed = bootstrap.low < -tolerance;

    report.push({ dimension, bootstrap, regressed, tolerance, n });
  }
  return report;
}
|
|
135
|
+
|
|
136
|
+
// src/campaign/gates/default-production-gate.ts
|
|
137
|
+
/**
 * Build the default ship/hold gate for a candidate evaluated on holdout scenarios.
 *
 * Six checks are recorded, always in this order: heldout-significance,
 * dimension-regression, budget, red-team, reward-hacking, canary. The decision is
 * "ship" only when every check passed; otherwise it is "hold".
 *
 * @param {object} options
 * @param {Array<{ id: string }>} options.holdoutScenarios - Scenarios whose cells are compared.
 * @param {number} [options.deltaThreshold=0] - The interval's lower bound must exceed this.
 * @param {number} [options.confidence=0.95]
 * @param {number} [options.bootstrapResamples=2000]
 * @param {number} [options.bootstrapSeed=1337]
 * @param {number} [options.minProductiveRuns=3]
 * @param {boolean} [options.blockOnRewardHackingGaming=true]
 * @param {string[]} [options.criticalDimensions] - Dimensions guarded against regression.
 * @param {number} [options.regressionTolerance]
 * @param {number} [options.budgetUsd] - Cap on candidate + baseline spend; no cap when undefined.
 * @param {Array} [options.redTeamBattery] - Red-team cases probed against candidate artifacts.
 * @param {Array} [options.recentRuns] - Input to the reward-hacking and canary detectors,
 *   which only run when at least 10 runs are supplied.
 * @returns {{ name: string, decide: Function }} Gate whose `decide(ctx)` resolves to
 *   `{ decision, reasons, contributingGates, delta }`.
 */
function defaultProductionGate(options) {
  const deltaThreshold = options.deltaThreshold ?? 0;
  const minProductiveRuns = options.minProductiveRuns ?? 3;
  const blockOnGaming = options.blockOnRewardHackingGaming ?? true;
  const bootstrapSettings = {
    confidence: options.confidence ?? 0.95,
    resamples: options.bootstrapResamples ?? 2e3,
    seed: options.bootstrapSeed ?? 1337
  };
  // Both detectors are skipped unless at least 10 recent runs were supplied.
  const hasEnoughRecentRuns = () => Boolean(options.recentRuns) && options.recentRuns.length >= 10;

  return {
    name: "defaultProductionGate",
    async decide(ctx) {
      const reasons = [];
      const contributingGates = [];
      // Record one check and, when it failed, the human-readable reason for it.
      const record = (name, passed, detail, explainFailure) => {
        contributingGates.push({ name, passed, detail });
        if (!passed) {
          reasons.push(explainFailure());
        }
      };
      const scenarioIds = new Set(options.holdoutScenarios.map((scenario) => scenario.id));

      // 1. Held-out lift. Without baseline scores the candidate is compared to itself.
      const sig = heldoutSignificance(
        pairHoldout(
          ctx.judgeScores,
          ctx.baselineJudgeScores ?? ctx.judgeScores,
          scenarioIds,
          (score) => score.composite
        ),
        { deltaThreshold, minProductiveRuns, ...bootstrapSettings }
      );
      record(
        "heldout-significance",
        sig.significant,
        {
          n: sig.n,
          deltaMedian: sig.bootstrap.median,
          ciLow: sig.bootstrap.low,
          ciHigh: sig.bootstrap.high,
          confidence: sig.bootstrap.confidence,
          deltaThreshold,
          fewRuns: sig.fewRuns
        },
        () => {
          if (sig.fewRuns) {
            return `held-out: only ${sig.n} paired runs (< ${minProductiveRuns}) \u2014 too few to claim significance`;
          }
          return `held-out CI.low ${sig.bootstrap.low.toFixed(3)} \u2264 threshold ${deltaThreshold} (median ${sig.bootstrap.median.toFixed(3)}, ${(sig.bootstrap.confidence * 100).toFixed(0)}% CI [${sig.bootstrap.low.toFixed(3)}, ${sig.bootstrap.high.toFixed(3)}])`;
        }
      );

      // 2. Per-dimension regression guard; vacuously passes when nothing is guarded.
      let dimRegs = [];
      if (options.criticalDimensions?.length) {
        dimRegs = dimensionRegressions(
          ctx.judgeScores,
          ctx.baselineJudgeScores ?? ctx.judgeScores,
          scenarioIds,
          options.criticalDimensions,
          { tolerance: options.regressionTolerance, ...bootstrapSettings }
        );
      }
      const regressed = dimRegs.filter((entry) => entry.regressed);
      record(
        "dimension-regression",
        regressed.length === 0,
        {
          guarded: options.criticalDimensions ?? [],
          regressions: dimRegs.map((entry) => ({
            dimension: entry.dimension,
            ciLow: entry.bootstrap.low,
            median: entry.bootstrap.median,
            tolerance: entry.tolerance,
            n: entry.n,
            regressed: entry.regressed
          }))
        },
        () => `critical dimension(s) regressed: ${regressed.map((d) => `${d.dimension} CI.low ${d.bootstrap.low.toFixed(3)} < -${d.tolerance}`).join("; ")}`
      );

      // 3. Budget: candidate plus baseline spend against the optional cap.
      const budgetPass = options.budgetUsd === undefined || ctx.cost.candidate + ctx.cost.baseline <= options.budgetUsd;
      record(
        "budget",
        budgetPass,
        {
          candidateUsd: ctx.cost.candidate,
          baselineUsd: ctx.cost.baseline,
          budgetUsd: options.budgetUsd
        },
        () => `spend ${(ctx.cost.candidate + ctx.cost.baseline).toFixed(2)} > budget ${options.budgetUsd}`
      );

      // 4. Red-team probe over the candidate artifacts; passes when no battery is configured.
      let redTeamFindings = { passed: true, findings: [] };
      if (options.redTeamBattery) {
        redTeamFindings = probeRedTeam(ctx.candidateArtifacts, options.redTeamBattery);
      }
      record(
        "red-team",
        redTeamFindings.passed,
        {
          failures: redTeamFindings.findings.length,
          sample: redTeamFindings.findings.slice(0, 3)
        },
        () => `red-team probe failed (${redTeamFindings.findings.length} findings)`
      );

      // 5. Reward hacking: findings with severity >= 0.6 or a "gaming" verdict block,
      //    unless blocking is switched off.
      const rewardHackingReport = hasEnoughRecentRuns() ? detectRewardHacking({ runs: options.recentRuns }) : null;
      const gamingFindings = (rewardHackingReport?.findings ?? []).filter((finding) => finding.severity >= 0.6);
      const flaggedAsGaming = Boolean(rewardHackingReport) && (gamingFindings.length !== 0 || rewardHackingReport.verdict === "gaming");
      record(
        "reward-hacking",
        !(blockOnGaming && flaggedAsGaming),
        { report: rewardHackingReport, gamingFindingCount: gamingFindings.length },
        () => `reward-hacking detector flagged ${gamingFindings.length} gaming-severity findings (verdict=${rewardHackingReport.verdict})`
      );

      // 6. Canaries: only alerts of severity "error" fail the check.
      const canaryReport = hasEnoughRecentRuns() ? runCanaries(options.recentRuns, {}) : null;
      const errorAlerts = (canaryReport?.alerts ?? []).filter((alert) => alert.severity === "error");
      record(
        "canary",
        errorAlerts.length === 0,
        { totalAlerts: canaryReport?.alerts.length ?? 0, errorAlerts: errorAlerts.length },
        () => `canary error alerts: ${errorAlerts.length}`
      );

      const decision = contributingGates.every((gate) => gate.passed) ? "ship" : "hold";
      return {
        decision,
        reasons: reasons.length > 0 ? reasons : ["all gates passed"],
        contributingGates,
        delta: sig.bootstrap.median
      };
    }
  };
}
|
|
279
|
+
/**
 * Score every candidate artifact that yields text against every red-team case.
 *
 * @param {Iterable<[string, unknown]>} artifacts - Pairs whose second element is the
 *   artifact (iterated like Map entries); the first element is not used.
 * @param {Iterable<{ id: string }>} battery - Red-team cases to apply.
 * @returns {{ passed: boolean, findings: Array<{ scenarioId: string, reason: string }> }}
 *   `passed` is true when no case failed on any artifact.
 */
function probeRedTeam(artifacts, battery) {
  const findings = [];
  for (const [, artifact] of artifacts) {
    const text = extractText(artifact);
    // Artifacts without extractable text are not probed.
    if (text !== undefined) {
      for (const rtCase of battery) {
        const verdict = scoreRedTeamOutput(text, [], rtCase);
        if (verdict.passed) continue;
        findings.push({
          scenarioId: rtCase.id,
          reason: verdict.reason ?? "red-team probe failed"
        });
      }
    }
  }
  const passed = findings.length === 0;
  return { passed, findings };
}
|
|
293
|
+
/**
 * Pull a text payload out of an artifact.
 *
 * @param {unknown} artifact - A string, or an object carrying the text under
 *   `text`, `output` or `content` (checked in that order).
 * @returns {string|undefined} The text, or undefined when none of the known fields
 *   holds a string.
 */
function extractText(artifact) {
  if (typeof artifact === "string") {
    return artifact;
  }
  if (!artifact || typeof artifact !== "object") {
    return undefined;
  }
  for (const field of ["text", "output", "content"]) {
    const value = artifact[field];
    if (typeof value === "string") {
      return value;
    }
  }
  return undefined;
}
|
|
303
|
+
|
|
304
|
+
// src/campaign/presets/run-eval.ts
|
|
305
|
+
/**
 * Preset entry point: run a campaign with the given options.
 *
 * @param {object} opts - Passed to `runCampaign` unchanged.
 * @returns {Promise<*>} Whatever `runCampaign` resolves to.
 */
async function runEval(opts) {
  const campaignResult = await runCampaign(opts);
  return campaignResult;
}
|
|
308
|
+
|
|
309
|
+
// src/campaign/provenance.ts
|
|
310
|
+
import { createHash } from "crypto";
|
|
311
|
+
import { join } from "path";
|
|
312
|
+
/**
 * Compute a content hash for a surface.
 *
 * String surfaces are hashed as-is. Any other surface is hashed via the JSON of its
 * `kind`, `worktreeRef` and `baseRef` (with `baseRef` defaulting to null).
 *
 * @param {string|{ kind: *, worktreeRef: *, baseRef?: * }} surface
 * @returns {string} `sha256:` followed by the hex digest.
 */
function surfaceContentHash(surface) {
  let material;
  if (typeof surface === "string") {
    material = surface;
  } else {
    const { kind, worktreeRef, baseRef } = surface;
    material = JSON.stringify({ kind, worktreeRef, baseRef: baseRef ?? null });
  }
  const digest = createHash("sha256").update(material).digest("hex");
  return `sha256:${digest}`;
}
|
|
320
|
+
/**
 * Mean composite score of a holdout campaign.
 *
 * Each cell contributes the mean of its judges' composites, and the result is the
 * mean over cells. Cells with an `error` are skipped. Composites that are not finite
 * numbers are ignored, mirroring how `pairHoldout` treats scores for the gate, so a
 * single NaN/undefined judge score cannot turn the whole mean (and the lift derived
 * from it) into NaN.
 *
 * @param {{ cells: Array<{ error?: *, judgeScores?: object }> }} campaign
 * @returns {number} The mean composite, or 0 when no cell has a usable score.
 */
function meanHoldoutComposite(campaign) {
  const average = (values) => values.reduce((sum, value) => sum + value, 0) / values.length;

  const cellMeans = [];
  for (const cell of campaign.cells) {
    if (cell.error) continue;
    // A cell without judge scores simply contributes nothing.
    const composites = Object.values(cell.judgeScores ?? {})
      .map((score) => score.composite)
      .filter((value) => typeof value === "number" && Number.isFinite(value));
    if (composites.length > 0) {
      cellMeans.push(average(composites));
    }
  }
  return cellMeans.length > 0 ? average(cellMeans) : 0;
}
|
|
329
|
+
/**
 * Assemble the provenance record of one improvement-loop run.
 *
 * The record lists every candidate of every generation, the gate outcome reduced to
 * name/passed per contributing gate, the mean holdout composites of baseline and
 * winner with their difference, and a backend summary derived from the worker records.
 *
 * @param {object} args
 * @param {string} args.runId
 * @param {string} args.runDir
 * @param {string} args.timestamp
 * @param {Array} args.workerRecords - Summarized by `summarizeBackendIntegrity`; each
 *   record's `model` is collected.
 * @param {Array} args.generations - Each with `generationIndex`, `promoted`,
 *   `surfaces` (`{ surfaceHash, surface }`) and `candidates`.
 * @param {object} args.baselineOnHoldout - Baseline holdout campaign.
 * @param {object} args.winnerOnHoldout - Winner holdout campaign.
 * @param {*} args.baselineSurface
 * @param {*} args.winnerSurface
 * @param {*} args.diff
 * @param {object} args.gate - Gate result (`decision`, `reasons`, `delta`, `contributingGates`).
 * @param {number} args.totalCostUsd
 * @param {number} args.totalDurationMs
 * @param {string} [args.winnerLabel]
 * @param {string} [args.winnerRationale]
 * @returns {object} Record with schema "tangle.loop-provenance.v1".
 */
function buildLoopProvenanceRecord(args) {
  const integrity = summarizeBackendIntegrity(args.workerRecords);
  const models = Array.from(new Set(args.workerRecords.map((workerRecord) => workerRecord.model))).sort();

  const candidates = [];
  for (const generation of args.generations) {
    const promotedHashes = new Set(generation.promoted);
    const surfacesByHash = new Map(generation.surfaces.map((s) => [s.surfaceHash, s.surface]));
    for (const cand of generation.candidates) {
      const surface = surfacesByHash.get(cand.surfaceHash);
      // When the surface itself is unavailable, the surface hash is reused as content hash.
      const contentHash = surface === undefined ? `sha256:${cand.surfaceHash}` : surfaceContentHash(surface);
      candidates.push({
        generation: generation.generationIndex,
        surfaceHash: cand.surfaceHash,
        contentHash,
        composite: cand.composite,
        promoted: promotedHashes.has(cand.surfaceHash),
        // Optional fields are only present when they carry a value.
        ...(cand.label ? { label: cand.label } : {}),
        ...(cand.rationale ? { rationale: cand.rationale } : {})
      });
    }
  }

  const baselineHoldoutComposite = meanHoldoutComposite(args.baselineOnHoldout);
  const winnerHoldoutComposite = meanHoldoutComposite(args.winnerOnHoldout);

  const gate = {
    decision: args.gate.decision,
    reasons: args.gate.reasons,
    delta: args.gate.delta,
    contributingGates: args.gate.contributingGates.map(({ name, passed }) => ({ name, passed }))
  };
  const backend = {
    verdict: integrity.verdict,
    workerCallCount: integrity.totalRecords,
    models,
    totalInputTokens: integrity.totalInputTokens,
    totalOutputTokens: integrity.totalOutputTokens,
    totalCostUsd: integrity.totalCostUsd
  };

  return {
    schema: "tangle.loop-provenance.v1",
    runId: args.runId,
    runDir: args.runDir,
    timestamp: args.timestamp,
    baselineContentHash: surfaceContentHash(args.baselineSurface),
    winnerContentHash: surfaceContentHash(args.winnerSurface),
    diff: args.diff,
    candidates,
    gate,
    baselineHoldoutComposite,
    winnerHoldoutComposite,
    heldOutLift: winnerHoldoutComposite - baselineHoldoutComposite,
    backend,
    totalCostUsd: args.totalCostUsd,
    totalDurationMs: args.totalDurationMs,
    ...(args.winnerLabel ? { winnerLabel: args.winnerLabel } : {}),
    ...(args.winnerRationale ? { winnerRationale: args.winnerRationale } : {})
  };
}
|
|
388
|
+
// Gate decisions that map to an OK span status; every other decision is an error.
var DECISION_OK = ["ship"];

/**
 * Hash the given parts, joined with ":", into a hex SHA-256 digest.
 *
 * @param {string[]} parts
 * @returns {string} 64-character hex digest.
 */
function hashId(parts) {
  const hasher = createHash("sha256");
  hasher.update(parts.join(":"));
  return hasher.digest("hex");
}

/**
 * Translate a gate decision into a span status.
 *
 * @param {string} decision
 * @returns {{ code: string, message?: string }} `{ code: "OK" }` for an accepted
 *   decision, otherwise an ERROR status naming the decision.
 */
function gateStatus(decision) {
  if (DECISION_OK.includes(decision)) {
    return { code: "OK" };
  }
  return { code: "ERROR", message: `gate decision: ${decision}` };
}
|
|
395
|
+
/**
 * Render a provenance record as a flat list of trace spans.
 *
 * Emitted in order: one root span, then for each generation (ascending) a generation
 * span followed by one span per candidate, and finally a gate-decision span. All ids
 * are derived by hashing the run id, so the same record yields the same ids. Every
 * span covers the whole run window except the gate span, which starts and ends at
 * the window's end.
 *
 * @param {object} record - A loop provenance record.
 * @param {object} [opts]
 * @param {number} [opts.baseTimeMs] - Start of the window in ms; defaults to the
 *   parsed `record.timestamp`, or the current time when that cannot be parsed.
 * @returns {object[]} The spans.
 */
function loopProvenanceSpans(record, opts = {}) {
  const runId = record.runId;
  const shortHash = (parts, length) => hashId(parts).slice(0, length);

  const traceId = shortHash(["trace", runId], 32);
  const startMs = opts.baseTimeMs ?? (Date.parse(record.timestamp) || Date.now());
  const baseNano = startMs * 1e6;
  // The window is never shorter than one millisecond.
  const endNano = baseNano + Math.max(1, record.totalDurationMs) * 1e6;

  const rootSpanId = shortHash(["root", runId], 16);
  const spans = [
    {
      traceId,
      spanId: rootSpanId,
      name: "improvement-loop",
      startTimeUnixNano: baseNano,
      endTimeUnixNano: endNano,
      attributes: {
        "tangle.runId": runId,
        "tangle.runDir": record.runDir,
        "tangle.baselineContentHash": record.baselineContentHash,
        "tangle.winnerContentHash": record.winnerContentHash,
        "tangle.heldOutLift": record.heldOutLift,
        "tangle.gateDecision": record.gate.decision,
        "tangle.backendVerdict": record.backend.verdict,
        "tangle.workerCallCount": record.backend.workerCallCount,
        "tangle.totalCostUsd": record.totalCostUsd
      },
      status: gateStatus(record.gate.decision),
      "tangle.runId": runId
    }
  ];

  // Bucket the candidates per generation, keeping their original order.
  const candidatesByGeneration = new Map();
  for (const cand of record.candidates) {
    if (!candidatesByGeneration.has(cand.generation)) {
      candidatesByGeneration.set(cand.generation, []);
    }
    candidatesByGeneration.get(cand.generation).push(cand);
  }
  const orderedGenerations = Array.from(candidatesByGeneration).sort(([g1], [g2]) => g1 - g2);

  for (const [generation, cands] of orderedGenerations) {
    const genSpanId = shortHash(["gen", runId, String(generation)], 16);
    // The running maximum starts at 0.
    let bestComposite = 0;
    for (const cand of cands) {
      bestComposite = Math.max(bestComposite, cand.composite);
    }
    spans.push({
      traceId,
      spanId: genSpanId,
      parentSpanId: rootSpanId,
      name: `generation-${generation}`,
      startTimeUnixNano: baseNano,
      endTimeUnixNano: endNano,
      attributes: {
        "tangle.runId": runId,
        "tangle.generation": generation,
        "tangle.populationSize": cands.length,
        "tangle.bestComposite": bestComposite
      },
      "tangle.runId": runId,
      "tangle.generation": generation
    });

    for (const cand of cands) {
      const attributes = {
        "tangle.runId": runId,
        "tangle.generation": generation,
        "tangle.surfaceHash": cand.surfaceHash,
        "tangle.contentHash": cand.contentHash,
        "tangle.composite": cand.composite,
        "tangle.promoted": cand.promoted
      };
      if (cand.label) {
        attributes["tangle.candidateLabel"] = cand.label;
      }
      if (cand.rationale) {
        attributes["tangle.candidateRationale"] = cand.rationale;
      }
      spans.push({
        traceId,
        spanId: shortHash(["cand", runId, String(generation), cand.surfaceHash], 16),
        parentSpanId: genSpanId,
        name: `candidate-${cand.surfaceHash}`,
        startTimeUnixNano: baseNano,
        endTimeUnixNano: endNano,
        attributes,
        "tangle.runId": runId,
        "tangle.generation": generation
      });
    }
  }

  spans.push({
    traceId,
    spanId: shortHash(["gate", runId], 16),
    parentSpanId: rootSpanId,
    name: "gate-decision",
    startTimeUnixNano: endNano,
    endTimeUnixNano: endNano,
    attributes: {
      "tangle.runId": runId,
      "tangle.gateDecision": record.gate.decision,
      // Falls back to the held-out lift when the gate reported no delta.
      "tangle.gateDelta": record.gate.delta ?? record.heldOutLift,
      "tangle.gateReasons": JSON.stringify(record.gate.reasons),
      "tangle.heldOutLift": record.heldOutLift,
      "tangle.baselineHoldoutComposite": record.baselineHoldoutComposite,
      "tangle.winnerHoldoutComposite": record.winnerHoldoutComposite
    },
    status: gateStatus(record.gate.decision),
    "tangle.runId": runId
  });

  return spans;
}
|
|
497
|
+
/**
 * Location of the provenance record inside a run directory.
 *
 * @param {string} runDir
 * @returns {string} `runDir` joined with "loop-provenance.json".
 */
function provenanceRecordPath(runDir) {
  const recordFileName = "loop-provenance.json";
  return join(runDir, recordFileName);
}
|
|
500
|
+
/**
 * Location of the provenance spans file inside a run directory.
 *
 * @param {string} runDir
 * @returns {string} `runDir` joined with "loop-provenance-spans.jsonl".
 */
function provenanceSpansPath(runDir) {
  const spansFileName = "loop-provenance-spans.jsonl";
  return join(runDir, spansFileName);
}
|
|
503
|
+
/**
 * Summarize one holdout campaign as a snapshot for the eval-run event.
 *
 * Each cell gets the mean of its judges' composites (0 without judges) and the
 * per-judge dimensions. The snapshot's own `compositeMean` is the mean over all
 * cells (0 without cells). Cells with an error are kept and carry `errorMessage`.
 *
 * @param {number} index - Position of the snapshot in the run.
 * @param {string} surfaceHash
 * @param {*} surface
 * @param {{ cells: object[], aggregates: { totalCostUsd: number }, durationMs: number }} campaign
 * @returns {{ index: number, surfaceHash: string, surface: *, cells: object[],
 *   compositeMean: number, costUsd: number, durationMs: number }}
 */
function snapshotFromHoldout(index, surfaceHash, surface, campaign) {
  // Mean of `pick(item)` over the items; 0 for an empty list.
  const meanOf = (items, pick) => {
    if (items.length === 0) return 0;
    let total = 0;
    for (const item of items) {
      total += pick(item);
    }
    return total / items.length;
  };

  const cells = [];
  for (const cell of campaign.cells) {
    const judgeEntries = Object.entries(cell.judgeScores);
    const cellScore = {
      scenarioId: cell.scenarioId,
      rep: cell.rep,
      compositeMean: meanOf(judgeEntries, ([, judgeScore]) => judgeScore.composite),
      dimensions: Object.fromEntries(
        judgeEntries.map(([judgeName, judgeScore]) => [judgeName, judgeScore.dimensions])
      )
    };
    if (cell.error) {
      cellScore.errorMessage = cell.error;
    }
    cells.push(cellScore);
  }

  return {
    index,
    surfaceHash,
    surface,
    cells,
    compositeMean: meanOf(cells, (cellScore) => cellScore.compositeMean),
    costUsd: campaign.aggregates.totalCostUsd,
    durationMs: campaign.durationMs
  };
}
|
|
529
|
+
/**
 * Build the eval-run event sent to the hosted client.
 *
 * The event carries a baseline snapshot (index 0) and a single generation snapshot
 * for the winner (index 1), both derived from their holdout campaigns.
 *
 * @param {object} args - The arguments of the provenance emission (run ids, surfaces,
 *   holdout campaigns, gate, totals).
 * @param {object} record - The provenance record built from the same arguments.
 * @returns {object} Event with status "finished" and empty labels.
 */
function buildEvalRunEvent(args, record) {
  const baseline = snapshotFromHoldout(
    0,
    record.baselineContentHash,
    args.baselineSurface,
    args.baselineOnHoldout
  );
  const winner = snapshotFromHoldout(
    1,
    record.winnerContentHash,
    args.winnerSurface,
    args.winnerOnHoldout
  );

  const { runId, runDir, timestamp, totalCostUsd, totalDurationMs } = args;
  return {
    runId,
    runDir,
    timestamp,
    status: "finished",
    labels: {},
    baseline,
    generations: [winner],
    gateDecision: args.gate.decision,
    holdoutLift: record.heldOutLift,
    totalCostUsd,
    totalDurationMs
  };
}
|
|
551
|
+
/**
 * Build the provenance record and spans of a run, write both through the storage
 * object, and mirror them to the hosted client when one is given.
 *
 * The record is written as pretty-printed JSON and the spans as one JSON document per
 * line. Failures of the hosted uploads are logged with `console.warn` and do not
 * reject; both uploads are attempted independently.
 *
 * @param {object} args - Inputs of `buildLoopProvenanceRecord`, plus:
 * @param {{ ensureDir: Function, write: Function }} args.storage
 * @param {{ ingestEvalRun: Function, ingestTraces: Function }} [args.hostedClient]
 * @returns {Promise<{ record: object, spans: object[], recordPath: string, spansPath: string }>}
 */
async function emitLoopProvenance(args) {
  const record = buildLoopProvenanceRecord(args);
  const spans = loopProvenanceSpans(record);

  const { storage, runDir } = args;
  storage.ensureDir(runDir);
  const recordPath = provenanceRecordPath(runDir);
  const spansPath = provenanceSpansPath(runDir);
  storage.write(recordPath, JSON.stringify(record, null, 2));
  const spanLines = spans.map((span) => JSON.stringify(span));
  storage.write(spansPath, spanLines.join("\n"));

  const result = { record, spans, recordPath, spansPath };
  if (!args.hostedClient) {
    return result;
  }

  // Run one upload; on failure, warn with the message built by `describeFailure`.
  const bestEffort = async (upload, describeFailure) => {
    try {
      await upload();
    } catch (err) {
      const msg = err instanceof Error ? err.message : String(err);
      console.warn(describeFailure(msg));
    }
  };

  await bestEffort(
    () => args.hostedClient.ingestEvalRun(buildEvalRunEvent(args, record)),
    (msg) => `[agent-eval] hosted eval-run ingest failed (continuing): ${msg}`
  );
  await bestEffort(
    () => args.hostedClient.ingestTraces(spans),
    (msg) => `[agent-eval] provenance span ingest failed (continuing): ${msg}`
  );

  return result;
}
|
|
575
|
+
|
|
576
|
+
export {
|
|
577
|
+
evolutionaryDriver,
|
|
578
|
+
composeGate,
|
|
579
|
+
pairHoldout,
|
|
580
|
+
heldoutSignificance,
|
|
581
|
+
detectScale,
|
|
582
|
+
dimensionRegressions,
|
|
583
|
+
defaultProductionGate,
|
|
584
|
+
runEval,
|
|
585
|
+
surfaceContentHash,
|
|
586
|
+
buildLoopProvenanceRecord,
|
|
587
|
+
loopProvenanceSpans,
|
|
588
|
+
provenanceRecordPath,
|
|
589
|
+
provenanceSpansPath,
|
|
590
|
+
emitLoopProvenance
|
|
591
|
+
};
|
|
592
|
+
//# sourceMappingURL=chunk-MZ2IYGGN.js.map
|