@tangle-network/agent-eval 0.65.0 → 0.67.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. package/CHANGELOG.md +25 -0
  2. package/dist/adapters/otel.d.ts +1 -1
  3. package/dist/campaign/index.d.ts +110 -6
  4. package/dist/campaign/index.js +26 -19
  5. package/dist/campaign/index.js.map +1 -1
  6. package/dist/{chunk-7TPYV2ER.js → chunk-6XQIEUQ2.js} +140 -7
  7. package/dist/chunk-6XQIEUQ2.js.map +1 -0
  8. package/dist/{chunk-HKINEDRZ.js → chunk-DFS3FEXO.js} +3 -2
  9. package/dist/chunk-DFS3FEXO.js.map +1 -0
  10. package/dist/chunk-MZ2IYGGN.js +592 -0
  11. package/dist/chunk-MZ2IYGGN.js.map +1 -0
  12. package/dist/{chunk-4ODZXQV2.js → chunk-NV2PF37Q.js} +645 -2
  13. package/dist/chunk-NV2PF37Q.js.map +1 -0
  14. package/dist/contract/index.d.ts +11 -9
  15. package/dist/contract/index.js +11 -12
  16. package/dist/contract/index.js.map +1 -1
  17. package/dist/hosted/index.d.ts +1 -1
  18. package/dist/hosted/index.js +1 -1
  19. package/dist/{index-CzhtwYBT.d.ts → index-DSEHMwvS.d.ts} +4 -2
  20. package/dist/index.d.ts +251 -7
  21. package/dist/index.js +292 -2
  22. package/dist/index.js.map +1 -1
  23. package/dist/openapi.json +1 -1
  24. package/dist/provenance-CChUqexv.d.ts +314 -0
  25. package/dist/{registry-DPly4_hZ.d.ts → registry-BGKyX6bw.d.ts} +2 -2
  26. package/dist/release-report-CN8hJlhk.d.ts +233 -0
  27. package/dist/reporting.d.ts +4 -3
  28. package/dist/{run-campaign-5J3ED2UJ.js → run-campaign-BVY3RGAZ.js} +2 -3
  29. package/dist/{provenance-lqyLpOYR.d.ts → run-improvement-loop-BKpM5T4t.d.ts} +51 -329
  30. package/dist/statistics-B7yCbi9i.d.ts +253 -0
  31. package/dist/{types-DhqpAi_z.d.ts → types-Croy5h7V.d.ts} +1 -1
  32. package/package.json +1 -1
  33. package/dist/chunk-4ODZXQV2.js.map +0 -1
  34. package/dist/chunk-7TPYV2ER.js.map +0 -1
  35. package/dist/chunk-CZRKD2X2.js +0 -1104
  36. package/dist/chunk-CZRKD2X2.js.map +0 -1
  37. package/dist/chunk-E22YUOAL.js +0 -111
  38. package/dist/chunk-E22YUOAL.js.map +0 -1
  39. package/dist/chunk-HKINEDRZ.js.map +0 -1
  40. package/dist/release-report-DGoeObZT.d.ts +0 -484
  41. /package/dist/{run-campaign-5J3ED2UJ.js.map → run-campaign-BVY3RGAZ.js.map} +0 -0
@@ -0,0 +1,592 @@
1
+ import {
2
+ runCanaries,
3
+ scoreRedTeamOutput
4
+ } from "./chunk-NV2PF37Q.js";
5
+ import {
6
+ runCampaign,
7
+ summarizeBackendIntegrity
8
+ } from "./chunk-6XQIEUQ2.js";
9
+ import {
10
+ detectRewardHacking
11
+ } from "./chunk-YV7J7X5N.js";
12
+ import {
13
+ pairedBootstrap
14
+ } from "./chunk-ITBRCT73.js";
15
+
16
+ // src/campaign/drivers/evolutionary.ts
17
/**
 * Wraps a mutator as a campaign driver.
 *
 * The returned driver's `kind` is `evolutionary:<mutator kind>` (captured at
 * construction time) and its `propose` forwards the call to
 * `opts.mutator.mutate`.
 *
 * @param {object} opts - Driver options; reads `opts.mutator` and the optional
 *   `opts.findings` fallback list.
 * @returns {{kind: string, propose: Function}} The driver object.
 */
function evolutionaryDriver(opts) {
  const kind = `evolutionary:${opts.mutator.kind}`;

  return {
    kind,
    async propose(request) {
      const { currentSurface, findings, populationSize, signal } = request;
      // Per-call findings win; an empty list falls back to the findings
      // supplied when the driver was built, then to an empty array.
      let effectiveFindings;
      if (findings.length > 0) {
        effectiveFindings = findings;
      } else {
        effectiveFindings = opts.findings ?? [];
      }
      return opts.mutator.mutate({
        findings: effectiveFindings,
        currentSurface,
        populationSize,
        signal
      });
    }
  };
}
30
+
31
+ // src/campaign/gates/compose.ts
32
/**
 * Combines several gates into one.
 *
 * Each gate's `decide(ctx)` is awaited in order. The composed decision is
 * "ship" only when every gate says "ship"; otherwise the first decision
 * present among "arch_ceiling", "model_ceiling", "hold" (in that precedence)
 * is used, falling back to "need_more_work".
 *
 * @param {...object} gates - Gates exposing `name` and `decide(ctx)`.
 * @returns {{name: string, decide: Function}} The composed gate.
 * @throws {Error} When called with no gates.
 */
function composeGate(...gates) {
  if (gates.length === 0) {
    throw new Error("composeGate requires at least one gate");
  }

  const gateNames = gates.map((g) => g.name);
  // Precedence used when the gates do not unanimously say "ship".
  const blockingPrecedence = ["arch_ceiling", "model_ceiling", "hold"];

  return {
    name: `composed(${gateNames.join(",")})`,
    async decide(ctx) {
      const outcomes = [];
      for (const gate of gates) {
        outcomes.push({ gate, res: await gate.decide(ctx) });
      }

      const decisions = outcomes.map(({ res }) => res.decision);
      let overall;
      if (decisions.every((d) => d === "ship")) {
        overall = "ship";
      } else {
        overall = blockingPrecedence.find((d) => decisions.includes(d)) ?? "need_more_work";
      }

      const contributingGates = [];
      const reasons = [];
      for (const { gate, res } of outcomes) {
        if (res.contributingGates.length > 0) {
          // A gate that reports its own sub-gates contributes those directly.
          contributingGates.push(...res.contributingGates);
        } else {
          contributingGates.push({
            name: gate.name,
            passed: res.decision === "ship",
            detail: res
          });
        }
        for (const reason of res.reasons) {
          reasons.push(`[${gate.name}] ${reason}`);
        }
      }

      return {
        decision: overall,
        reasons,
        contributingGates,
        // The composed delta is taken from the first gate's result.
        delta: outcomes[0]?.res.delta
      };
    }
  };
}
61
+
62
+ // src/campaign/gates/statistical-heldout.ts
63
/**
 * Pairs candidate and baseline holdout scores cell by cell.
 *
 * A cell is in scope when the part of its id before the first ":" is in
 * `scenarioIds`. For each in-scope cell the selected value is averaged over
 * all of that cell's score entries, ignoring values that are not finite
 * numbers. Cells with no usable value on either side are dropped from the
 * result.
 *
 * @param {Map<string, object>} candidate - Candidate scores keyed by cell id.
 * @param {Map<string, object>} baseline - Baseline scores keyed by cell id.
 * @param {Set<string>} scenarioIds - Holdout scenario ids.
 * @param {Function} select - Picks the numeric value out of one score entry.
 * @returns {{before: number[], after: number[], cellIds: string[]}} Aligned
 *   baseline (`before`) and candidate (`after`) values.
 * @throws {Error} When the in-scope cell ids of the two sides differ.
 */
function pairHoldout(candidate, baseline, scenarioIds, select) {
  const meanSelected = (scoresByCell, cellId) => {
    const entries = scoresByCell.get(cellId);
    if (!entries) return void 0;
    let total = 0;
    let count = 0;
    for (const entry of Object.values(entries)) {
      const value = select(entry);
      if (typeof value === "number" && Number.isFinite(value)) {
        total += value;
        count += 1;
      }
    }
    return count === 0 ? void 0 : total / count;
  };

  const scopedCellIds = (scoresByCell) =>
    [...scoresByCell.keys()]
      .filter((cellId) => scenarioIds.has(cellId.split(":")[0] ?? ""))
      .sort();

  const candCells = scopedCellIds(candidate);
  const baseCells = scopedCellIds(baseline);

  const aligned =
    candCells.length === baseCells.length &&
    candCells.every((cellId, i) => cellId === baseCells[i]);
  if (!aligned) {
    throw new Error(
      `pairHoldout: candidate/baseline holdout cells do not align \u2014 candidate=[${candCells.join(",")}] baseline=[${baseCells.join(",")}]. Both holdout campaigns must run the same scenarios with the same seed base.`
    );
  }

  const paired = { before: [], after: [], cellIds: [] };
  for (const cellId of candCells) {
    const baselineValue = meanSelected(baseline, cellId);
    const candidateValue = meanSelected(candidate, cellId);
    if (baselineValue !== void 0 && candidateValue !== void 0) {
      paired.before.push(baselineValue);
      paired.after.push(candidateValue);
      paired.cellIds.push(cellId);
    }
  }
  return paired;
}
96
/**
 * Runs a paired bootstrap over holdout pairs and decides significance.
 *
 * The result is significant only when there are at least
 * `minProductiveRuns` pairs (default 3) and the bootstrap interval's `low`
 * bound is strictly above `deltaThreshold` (default 0).
 *
 * @param {{before: number[], after: number[]}} paired - Aligned pairs.
 * @param {object} [opts] - Optional `deltaThreshold`, `minProductiveRuns`,
 *   `confidence` (0.95), `resamples` (2000), `statistic` ("median"), `seed` (1337).
 * @returns {{paired: object, bootstrap: object, n: number, significant: boolean, fewRuns: boolean}}
 */
function heldoutSignificance(paired, opts = {}) {
  const bootstrapOptions = {
    confidence: opts.confidence ?? 0.95,
    resamples: opts.resamples ?? 2e3,
    statistic: opts.statistic ?? "median",
    seed: opts.seed ?? 1337
  };
  const bootstrap = pairedBootstrap(paired.before, paired.after, bootstrapOptions);

  const n = paired.before.length;
  const fewRuns = n < (opts.minProductiveRuns ?? 3);
  // Too few pairs can never be significant, whatever the interval says.
  const significant = fewRuns ? false : bootstrap.low > (opts.deltaThreshold ?? 0);

  return { paired, bootstrap, n, significant, fewRuns };
}
110
/**
 * Guesses the score scale from the values themselves.
 *
 * @param {number[]} values - Observed scores.
 * @returns {number} 100 when any value has magnitude above 1.5, otherwise 1.
 */
function detectScale(values) {
  for (const value of values) {
    if (Math.abs(value) > 1.5) {
      return 100;
    }
  }
  return 1;
}
113
/**
 * Checks each critical dimension for a regression on the holdout set.
 *
 * For every dimension the candidate/baseline values are paired with
 * `pairHoldout` and a median paired bootstrap is run. A dimension counts as
 * regressed when the interval's `low` bound is below `-tolerance`. Dimensions
 * with no paired values are left out of the result.
 *
 * @param {Map<string, object>} candidate - Candidate scores keyed by cell id.
 * @param {Map<string, object>} baseline - Baseline scores keyed by cell id.
 * @param {Set<string>} scenarioIds - Holdout scenario ids.
 * @param {Iterable<string>} criticalDimensions - Dimension names to guard.
 * @param {object} [opts] - Optional `tolerance`, `confidence` (0.95),
 *   `resamples` (2000), `seed` (1337).
 * @returns {Array<{dimension: string, bootstrap: object, regressed: boolean, tolerance: number, n: number}>}
 */
function dimensionRegressions(candidate, baseline, scenarioIds, criticalDimensions, opts = {}) {
  const results = [];
  for (const dimension of criticalDimensions) {
    const { before, after } = pairHoldout(
      candidate,
      baseline,
      scenarioIds,
      (score) => score.dimensions[dimension]
    );
    if (before.length > 0) {
      // Default tolerance is 5% of the detected score scale (1 or 100).
      const tolerance = opts.tolerance ?? 0.05 * detectScale([...before, ...after]);
      const bootstrap = pairedBootstrap(before, after, {
        confidence: opts.confidence ?? 0.95,
        resamples: opts.resamples ?? 2e3,
        statistic: "median",
        seed: opts.seed ?? 1337
      });
      results.push({
        dimension,
        bootstrap,
        regressed: bootstrap.low < -tolerance,
        tolerance,
        n: before.length
      });
    }
  }
  return results;
}
135
+
136
+ // src/campaign/gates/default-production-gate.ts
137
/**
 * Builds the default production gate.
 *
 * `decide(ctx)` evaluates six sub-gates and records each in
 * `contributingGates`: held-out significance, critical-dimension regression,
 * budget, red-team probe, reward-hacking detection and canaries. The decision
 * is "ship" only when all six pass, otherwise "hold".
 *
 * Numeric defaults are resolved once when the gate is built; the remaining
 * options (`holdoutScenarios`, `criticalDimensions`, `regressionTolerance`,
 * `budgetUsd`, `redTeamBattery`, `recentRuns`) are read on every `decide` call.
 *
 * @param {object} options - Gate configuration.
 * @returns {{name: string, decide: Function}} The gate.
 */
function defaultProductionGate(options) {
  const deltaThreshold = options.deltaThreshold ?? 0;
  const minProductiveRuns = options.minProductiveRuns ?? 3;
  const blockOnGaming = options.blockOnRewardHackingGaming ?? true;
  const confidence = options.confidence ?? 0.95;
  const resamples = options.bootstrapResamples ?? 2e3;
  const seed = options.bootstrapSeed ?? 1337;

  // The reward-hacking and canary checks only run with at least 10 recent runs.
  const hasRunHistory = () => Boolean(options.recentRuns) && options.recentRuns.length >= 10;

  async function decide(ctx) {
    const reasons = [];
    const contributingGates = [];
    const recordGate = (name, passed, detail) => {
      contributingGates.push({ name, passed, detail });
    };

    const scenarioIds = new Set(options.holdoutScenarios.map((s) => s.id));
    // Without baseline scores the candidate is compared against itself.
    const baselineScores = ctx.baselineJudgeScores ?? ctx.judgeScores;

    // 1. Held-out significance on the composite score.
    const sig = heldoutSignificance(
      pairHoldout(ctx.judgeScores, baselineScores, scenarioIds, (s) => s.composite),
      { deltaThreshold, minProductiveRuns, confidence, resamples, seed }
    );
    const ci = sig.bootstrap;
    const delta = ci.median;
    recordGate("heldout-significance", sig.significant, {
      n: sig.n,
      deltaMedian: ci.median,
      ciLow: ci.low,
      ciHigh: ci.high,
      confidence: ci.confidence,
      deltaThreshold,
      fewRuns: sig.fewRuns
    });
    if (!sig.significant) {
      if (sig.fewRuns) {
        reasons.push(
          `held-out: only ${sig.n} paired runs (< ${minProductiveRuns}) \u2014 too few to claim significance`
        );
      } else {
        reasons.push(
          `held-out CI.low ${ci.low.toFixed(3)} \u2264 threshold ${deltaThreshold} (median ${ci.median.toFixed(3)}, ${(ci.confidence * 100).toFixed(0)}% CI [${ci.low.toFixed(3)}, ${ci.high.toFixed(3)}])`
        );
      }
    }

    // 2. Regression on guarded dimensions.
    const guarded = options.criticalDimensions;
    let dimRegs = [];
    if (guarded?.length) {
      dimRegs = dimensionRegressions(ctx.judgeScores, baselineScores, scenarioIds, guarded, {
        tolerance: options.regressionTolerance,
        confidence,
        resamples,
        seed
      });
    }
    const regressed = dimRegs.filter((d) => d.regressed);
    const dimPass = regressed.length === 0;
    recordGate("dimension-regression", dimPass, {
      guarded: guarded ?? [],
      regressions: dimRegs.map((d) => ({
        dimension: d.dimension,
        ciLow: d.bootstrap.low,
        median: d.bootstrap.median,
        tolerance: d.tolerance,
        n: d.n,
        regressed: d.regressed
      }))
    });
    if (!dimPass) {
      const described = regressed.map(
        (d) => `${d.dimension} CI.low ${d.bootstrap.low.toFixed(3)} < -${d.tolerance}`
      );
      reasons.push(`critical dimension(s) regressed: ${described.join("; ")}`);
    }

    // 3. Budget: candidate plus baseline spend against the optional cap.
    const spendUsd = ctx.cost.candidate + ctx.cost.baseline;
    const budgetPass = options.budgetUsd === void 0 || spendUsd <= options.budgetUsd;
    recordGate("budget", budgetPass, {
      candidateUsd: ctx.cost.candidate,
      baselineUsd: ctx.cost.baseline,
      budgetUsd: options.budgetUsd
    });
    if (!budgetPass) {
      reasons.push(`spend ${spendUsd.toFixed(2)} > budget ${options.budgetUsd}`);
    }

    // 4. Red-team probe; passes trivially when no battery is configured.
    let redTeam = { passed: true, findings: [] };
    if (options.redTeamBattery) {
      redTeam = probeRedTeam(ctx.candidateArtifacts, options.redTeamBattery);
    }
    recordGate("red-team", redTeam.passed, {
      failures: redTeam.findings.length,
      sample: redTeam.findings.slice(0, 3)
    });
    if (!redTeam.passed) {
      reasons.push(`red-team probe failed (${redTeam.findings.length} findings)`);
    }

    // 5. Reward-hacking detection over recent runs.
    const rewardHackingReport = hasRunHistory()
      ? detectRewardHacking({ runs: options.recentRuns })
      : null;
    const gamingThreshold = 0.6;
    const gamingFindings = (rewardHackingReport?.findings ?? []).filter(
      (f) => f.severity >= gamingThreshold
    );
    const flaggedAsGaming =
      Boolean(rewardHackingReport) &&
      (gamingFindings.length !== 0 || rewardHackingReport.verdict === "gaming");
    const rewardHackingPass = !(blockOnGaming && flaggedAsGaming);
    recordGate("reward-hacking", rewardHackingPass, {
      report: rewardHackingReport,
      gamingFindingCount: gamingFindings.length
    });
    if (!rewardHackingPass) {
      reasons.push(
        `reward-hacking detector flagged ${gamingFindings.length} gaming-severity findings (verdict=${rewardHackingReport.verdict})`
      );
    }

    // 6. Canaries: only alerts with severity "error" block.
    const canaryReport = hasRunHistory() ? runCanaries(options.recentRuns, {}) : null;
    const errorAlerts = (canaryReport?.alerts ?? []).filter((a) => a.severity === "error");
    const canaryPass = errorAlerts.length === 0;
    recordGate("canary", canaryPass, {
      totalAlerts: canaryReport?.alerts.length ?? 0,
      errorAlerts: errorAlerts.length
    });
    if (!canaryPass) {
      reasons.push(`canary error alerts: ${errorAlerts.length}`);
    }

    const everyGatePassed = contributingGates.every((g) => g.passed);
    return {
      decision: everyGatePassed ? "ship" : "hold",
      reasons: reasons.length > 0 ? reasons : ["all gates passed"],
      contributingGates,
      delta
    };
  }

  return { name: "defaultProductionGate", decide };
}
279
/**
 * Scores every candidate artifact against every red-team case.
 *
 * Artifacts with no extractable text are skipped. Each failing case adds a
 * finding carrying the case id and the scorer's reason (or a generic message
 * when the scorer gives none).
 *
 * @param {Iterable<[string, unknown]>} artifacts - `[cellId, artifact]` pairs.
 * @param {Iterable<object>} battery - Red-team cases; each exposes `id`.
 * @returns {{passed: boolean, findings: Array<{scenarioId: string, reason: string}>}}
 */
function probeRedTeam(artifacts, battery) {
  const findings = [];
  for (const [, artifact] of artifacts) {
    const text = extractText(artifact);
    if (text !== void 0) {
      for (const rtCase of battery) {
        const { passed, reason } = scoreRedTeamOutput(text, [], rtCase);
        if (passed) continue;
        findings.push({
          scenarioId: rtCase.id,
          reason: reason ?? "red-team probe failed"
        });
      }
    }
  }
  const passed = findings.length === 0;
  return { passed, findings };
}
293
/**
 * Pulls a text payload out of an artifact.
 *
 * @param {unknown} artifact - A string, or an object that may carry the text
 *   under `text`, `output` or `content` (checked in that order).
 * @returns {string|undefined} The text, or `undefined` when none is found.
 */
function extractText(artifact) {
  if (typeof artifact === "string") {
    return artifact;
  }
  if (artifact === null || typeof artifact !== "object") {
    return void 0;
  }
  for (const key of ["text", "output", "content"]) {
    const value = artifact[key];
    if (typeof value === "string") {
      return value;
    }
  }
  return void 0;
}
303
+
304
+ // src/campaign/presets/run-eval.ts
305
/**
 * Preset entry point that runs a campaign with the given options.
 *
 * @param {object} opts - Passed through unchanged to `runCampaign`.
 * @returns {Promise<unknown>} Whatever `runCampaign` resolves to.
 */
async function runEval(opts) {
  const campaignResult = await runCampaign(opts);
  return campaignResult;
}
308
+
309
+ // src/campaign/provenance.ts
310
+ import { createHash } from "crypto";
311
+ import { join } from "path";
312
/**
 * Computes a `sha256:<hex>` content hash for a surface.
 *
 * A string surface is hashed as-is. Any other surface is hashed through the
 * JSON serialization of its `kind`, `worktreeRef` and `baseRef` (with a
 * missing `baseRef` serialized as `null`).
 *
 * @param {string|object} surface - The surface to hash.
 * @returns {string} The prefixed hex digest.
 */
function surfaceContentHash(surface) {
  let material;
  if (typeof surface === "string") {
    material = surface;
  } else {
    const { kind, worktreeRef, baseRef } = surface;
    // Key order is part of the hashed material and must stay fixed.
    material = JSON.stringify({ kind, worktreeRef, baseRef: baseRef ?? null });
  }
  const digest = createHash("sha256").update(material).digest("hex");
  return `sha256:${digest}`;
}
320
/**
 * Mean composite score of a holdout campaign.
 *
 * Each cell contributes the mean of its judges' `composite` values. Cells
 * that carry an `error` and cells with no judge scores are left out.
 *
 * @param {{cells: Array<object>}} campaign - Holdout campaign result.
 * @returns {number} Mean of the per-cell means, or 0 when no cell contributes.
 */
function meanHoldoutComposite(campaign) {
  const average = (nums) => nums.reduce((sum, x) => sum + x, 0) / nums.length;

  const cellMeans = campaign.cells
    .filter((cell) => !cell.error)
    .map((cell) => Object.values(cell.judgeScores).map((score) => score.composite))
    .filter((composites) => composites.length > 0)
    .map((composites) => average(composites));

  if (cellMeans.length === 0) {
    return 0;
  }
  return average(cellMeans);
}
329
/**
 * Assembles the `tangle.loop-provenance.v1` record for one improvement-loop run.
 *
 * The record carries content hashes for the baseline and winner surfaces, one
 * entry per candidate of every generation, a trimmed copy of the gate result,
 * holdout composites with their difference, and a backend summary derived
 * from the worker records.
 *
 * @param {object} args - Run data: ids, surfaces, `generations`,
 *   `workerRecords`, both holdout campaigns, `gate`, `diff` and totals.
 * @returns {object} The provenance record.
 */
function buildLoopProvenanceRecord(args) {
  const integrity = summarizeBackendIntegrity(args.workerRecords);
  const models = [...new Set(args.workerRecords.map((r) => r.model))].sort();

  const toEntry = (gen, candidate, promotedHashes, surfaceByHash) => {
    const surface = surfaceByHash.get(candidate.surfaceHash);
    // When the surface is not available, the surface hash itself is reused
    // behind the "sha256:" prefix.
    const contentHash =
      surface !== void 0 ? surfaceContentHash(surface) : `sha256:${candidate.surfaceHash}`;
    const entry = {
      generation: gen.generationIndex,
      surfaceHash: candidate.surfaceHash,
      contentHash,
      composite: candidate.composite,
      promoted: promotedHashes.has(candidate.surfaceHash)
    };
    if (candidate.label) entry.label = candidate.label;
    if (candidate.rationale) entry.rationale = candidate.rationale;
    return entry;
  };

  const candidates = [];
  for (const gen of args.generations) {
    const promotedHashes = new Set(gen.promoted);
    const surfaceByHash = new Map(gen.surfaces.map((s) => [s.surfaceHash, s.surface]));
    for (const candidate of gen.candidates) {
      candidates.push(toEntry(gen, candidate, promotedHashes, surfaceByHash));
    }
  }

  const baselineHoldoutComposite = meanHoldoutComposite(args.baselineOnHoldout);
  const winnerHoldoutComposite = meanHoldoutComposite(args.winnerOnHoldout);

  // Only the name and verdict of each contributing gate are persisted.
  const gate = {
    decision: args.gate.decision,
    reasons: args.gate.reasons,
    delta: args.gate.delta,
    contributingGates: args.gate.contributingGates.map(({ name, passed }) => ({ name, passed }))
  };

  const backend = {
    verdict: integrity.verdict,
    workerCallCount: integrity.totalRecords,
    models,
    totalInputTokens: integrity.totalInputTokens,
    totalOutputTokens: integrity.totalOutputTokens,
    totalCostUsd: integrity.totalCostUsd
  };

  const record = {
    schema: "tangle.loop-provenance.v1",
    runId: args.runId,
    runDir: args.runDir,
    timestamp: args.timestamp,
    baselineContentHash: surfaceContentHash(args.baselineSurface),
    winnerContentHash: surfaceContentHash(args.winnerSurface),
    diff: args.diff,
    candidates,
    gate,
    baselineHoldoutComposite,
    winnerHoldoutComposite,
    heldOutLift: winnerHoldoutComposite - baselineHoldoutComposite,
    backend,
    totalCostUsd: args.totalCostUsd,
    totalDurationMs: args.totalDurationMs
  };
  if (args.winnerLabel) record.winnerLabel = args.winnerLabel;
  if (args.winnerRationale) record.winnerRationale = args.winnerRationale;
  return record;
}
388
// Gate decisions that map to an OK span status; every other decision is an ERROR.
const DECISION_OK = ["ship"];

/**
 * Hex SHA-256 of the parts joined with ":".
 *
 * @param {string[]} parts - Identifier components.
 * @returns {string} 64-character hex digest.
 */
function hashId(parts) {
  return createHash("sha256").update(parts.join(":")).digest("hex");
}

/**
 * Maps a gate decision to a span status.
 *
 * @param {string} decision - Gate decision.
 * @returns {{code: string, message?: string}} `OK` for "ship", otherwise
 *   `ERROR` with the decision in the message.
 */
function gateStatus(decision) {
  return DECISION_OK.includes(decision)
    ? { code: "OK" }
    : { code: "ERROR", message: `gate decision: ${decision}` };
}

/**
 * Renders a loop-provenance record as a flat list of spans.
 *
 * Order of the result: one root span, then for each generation (ascending)
 * a generation span followed by its candidate spans, then one gate-decision
 * span. Trace and span ids are derived by hashing the run id (plus generation
 * and surface hash where relevant), so the same record always yields the same
 * ids.
 *
 * NOTE: timestamps are plain numbers in nanoseconds. For present-day epoch
 * times the value exceeds `Number.MAX_SAFE_INTEGER`, so they are not exact to
 * the nanosecond.
 *
 * @param {object} record - Record produced by `buildLoopProvenanceRecord`.
 * @param {{baseTimeMs?: number}} [opts] - Optional start time override in
 *   milliseconds; defaults to the parsed `record.timestamp`, then `Date.now()`.
 * @returns {object[]} The spans.
 */
function loopProvenanceSpans(record, opts = {}) {
  const traceId = hashId(["trace", record.runId]).slice(0, 32);
  const baseNano = (opts.baseTimeMs ?? (Date.parse(record.timestamp) || Date.now())) * 1e6;
  // Every span lasts at least 1 ms so the root never has zero duration.
  const endNano = baseNano + Math.max(1, record.totalDurationMs) * 1e6;
  const spans = [];

  const rootSpanId = hashId(["root", record.runId]).slice(0, 16);
  spans.push({
    traceId,
    spanId: rootSpanId,
    name: "improvement-loop",
    startTimeUnixNano: baseNano,
    endTimeUnixNano: endNano,
    attributes: {
      "tangle.runId": record.runId,
      "tangle.runDir": record.runDir,
      "tangle.baselineContentHash": record.baselineContentHash,
      "tangle.winnerContentHash": record.winnerContentHash,
      "tangle.heldOutLift": record.heldOutLift,
      "tangle.gateDecision": record.gate.decision,
      "tangle.backendVerdict": record.backend.verdict,
      "tangle.workerCallCount": record.backend.workerCallCount,
      "tangle.totalCostUsd": record.totalCostUsd
    },
    status: gateStatus(record.gate.decision),
    "tangle.runId": record.runId
  });

  // Group candidates by generation, preserving their order within a generation.
  const byGen = new Map();
  for (const c of record.candidates) {
    const group = byGen.get(c.generation) ?? [];
    group.push(c);
    byGen.set(c.generation, group);
  }

  const generationsAscending = [...byGen.entries()].sort((a, b) => a[0] - b[0]);
  for (const [generation, cands] of generationsAscending) {
    const genSpanId = hashId(["gen", record.runId, String(generation)]).slice(0, 16);
    // Seed with -Infinity rather than 0: a generation whose composites are all
    // negative must report its real maximum. Groups are never empty here.
    const bestComposite = cands.reduce((best, c) => Math.max(best, c.composite), -Infinity);
    spans.push({
      traceId,
      spanId: genSpanId,
      parentSpanId: rootSpanId,
      name: `generation-${generation}`,
      startTimeUnixNano: baseNano,
      endTimeUnixNano: endNano,
      attributes: {
        "tangle.runId": record.runId,
        "tangle.generation": generation,
        "tangle.populationSize": cands.length,
        "tangle.bestComposite": bestComposite
      },
      "tangle.runId": record.runId,
      "tangle.generation": generation
    });

    for (const c of cands) {
      const candSpanId = hashId(["cand", record.runId, String(generation), c.surfaceHash]).slice(0, 16);
      const attributes = {
        "tangle.runId": record.runId,
        "tangle.generation": generation,
        "tangle.surfaceHash": c.surfaceHash,
        "tangle.contentHash": c.contentHash,
        "tangle.composite": c.composite,
        "tangle.promoted": c.promoted
      };
      if (c.label) attributes["tangle.candidateLabel"] = c.label;
      if (c.rationale) attributes["tangle.candidateRationale"] = c.rationale;
      spans.push({
        traceId,
        spanId: candSpanId,
        parentSpanId: genSpanId,
        name: `candidate-${c.surfaceHash}`,
        startTimeUnixNano: baseNano,
        endTimeUnixNano: endNano,
        attributes,
        "tangle.runId": record.runId,
        "tangle.generation": generation
      });
    }
  }

  const gateSpanId = hashId(["gate", record.runId]).slice(0, 16);
  spans.push({
    traceId,
    spanId: gateSpanId,
    parentSpanId: rootSpanId,
    name: "gate-decision",
    // The gate span is a zero-length marker at the end of the run.
    startTimeUnixNano: endNano,
    endTimeUnixNano: endNano,
    attributes: {
      "tangle.runId": record.runId,
      "tangle.gateDecision": record.gate.decision,
      "tangle.gateDelta": record.gate.delta ?? record.heldOutLift,
      "tangle.gateReasons": JSON.stringify(record.gate.reasons),
      "tangle.heldOutLift": record.heldOutLift,
      "tangle.baselineHoldoutComposite": record.baselineHoldoutComposite,
      "tangle.winnerHoldoutComposite": record.winnerHoldoutComposite
    },
    status: gateStatus(record.gate.decision),
    "tangle.runId": record.runId
  });
  return spans;
}
497
/**
 * Location of the provenance record inside a run directory.
 *
 * @param {string} runDir - Run directory.
 * @returns {string} `runDir` joined with `loop-provenance.json`.
 */
function provenanceRecordPath(runDir) {
  const recordFileName = "loop-provenance.json";
  return join(runDir, recordFileName);
}
500
/**
 * Location of the provenance spans file inside a run directory.
 *
 * @param {string} runDir - Run directory.
 * @returns {string} `runDir` joined with `loop-provenance-spans.jsonl`.
 */
function provenanceSpansPath(runDir) {
  const spansFileName = "loop-provenance-spans.jsonl";
  return join(runDir, spansFileName);
}
503
/**
 * Converts a holdout campaign into a snapshot of per-cell scores.
 *
 * Each cell's `compositeMean` is the mean of its judges' composites (0 when
 * the cell has no judge scores). The snapshot's `compositeMean` is the mean
 * over all cells, including cells that carry an error.
 *
 * @param {number} index - Snapshot index.
 * @param {string} surfaceHash - Hash stored on the snapshot.
 * @param {unknown} surface - Surface stored on the snapshot.
 * @param {object} campaign - Campaign with `cells`, `aggregates.totalCostUsd`
 *   and `durationMs`.
 * @returns {object} The snapshot.
 */
function snapshotFromHoldout(index, surfaceHash, surface, campaign) {
  // Mean of `pick(item)` over `items`, or 0 for an empty list.
  const meanOrZero = (items, pick) => {
    if (items.length === 0) return 0;
    let total = 0;
    for (const item of items) {
      total += pick(item);
    }
    return total / items.length;
  };

  const toCellScore = (cell) => {
    const dimensionPairs = Object.entries(cell.judgeScores).map(([judgeName, judgeScore]) => [
      judgeName,
      judgeScore.dimensions
    ]);
    const cellScore = {
      scenarioId: cell.scenarioId,
      rep: cell.rep,
      compositeMean: meanOrZero(Object.values(cell.judgeScores), (j) => j.composite),
      dimensions: Object.fromEntries(dimensionPairs)
    };
    if (cell.error) {
      cellScore.errorMessage = cell.error;
    }
    return cellScore;
  };

  const cells = campaign.cells.map((cell) => toCellScore(cell));

  return {
    index,
    surfaceHash,
    surface,
    cells,
    compositeMean: meanOrZero(cells, (c) => c.compositeMean),
    costUsd: campaign.aggregates.totalCostUsd,
    durationMs: campaign.durationMs
  };
}
529
/**
 * Builds the hosted eval-run event for a finished improvement loop.
 *
 * The baseline holdout becomes snapshot 0 and the winner holdout becomes the
 * single generation snapshot with index 1.
 *
 * @param {object} args - Run data (ids, surfaces, holdout campaigns, gate, totals).
 * @param {object} record - Provenance record supplying the content hashes and
 *   `heldOutLift`.
 * @returns {object} The event payload.
 */
function buildEvalRunEvent(args, record) {
  const baseline = snapshotFromHoldout(
    0,
    record.baselineContentHash,
    args.baselineSurface,
    args.baselineOnHoldout
  );
  const winner = snapshotFromHoldout(
    1,
    record.winnerContentHash,
    args.winnerSurface,
    args.winnerOnHoldout
  );

  const { runId, runDir, timestamp, totalCostUsd, totalDurationMs } = args;
  return {
    runId,
    runDir,
    timestamp,
    status: "finished",
    labels: {},
    baseline,
    generations: [winner],
    gateDecision: args.gate.decision,
    holdoutLift: record.heldOutLift,
    totalCostUsd,
    totalDurationMs
  };
}
551
/**
 * Writes the provenance record and its spans, then optionally ships both to
 * the hosted client.
 *
 * The record is written as pretty-printed JSON and the spans as one JSON
 * document per line. Hosted ingestion is best-effort: each failure is logged
 * with `console.warn` and does not stop the function.
 *
 * @param {object} args - Run data plus `storage` (`ensureDir`, `write`) and an
 *   optional `hostedClient` (`ingestEvalRun`, `ingestTraces`).
 * @returns {Promise<{record: object, spans: object[], recordPath: string, spansPath: string}>}
 */
async function emitLoopProvenance(args) {
  const record = buildLoopProvenanceRecord(args);
  const spans = loopProvenanceSpans(record);

  args.storage.ensureDir(args.runDir);
  const recordPath = provenanceRecordPath(args.runDir);
  const spansPath = provenanceSpansPath(args.runDir);
  const recordJson = JSON.stringify(record, null, 2);
  const spansJsonl = spans.map((span) => JSON.stringify(span)).join("\n");
  args.storage.write(recordPath, recordJson);
  args.storage.write(spansPath, spansJsonl);

  const emitted = { record, spans, recordPath, spansPath };
  if (!args.hostedClient) {
    return emitted;
  }

  const describeError = (err) => (err instanceof Error ? err.message : String(err));

  // The two uploads are independent: a failed eval-run ingest must not
  // prevent the span ingest from being attempted.
  try {
    const event = buildEvalRunEvent(args, record);
    await args.hostedClient.ingestEvalRun(event);
  } catch (err) {
    const msg = describeError(err);
    console.warn(`[agent-eval] hosted eval-run ingest failed (continuing): ${msg}`);
  }

  try {
    await args.hostedClient.ingestTraces(spans);
  } catch (err) {
    const msg = describeError(err);
    console.warn(`[agent-eval] provenance span ingest failed (continuing): ${msg}`);
  }

  return emitted;
}
575
+
576
+ export {
577
+ evolutionaryDriver,
578
+ composeGate,
579
+ pairHoldout,
580
+ heldoutSignificance,
581
+ detectScale,
582
+ dimensionRegressions,
583
+ defaultProductionGate,
584
+ runEval,
585
+ surfaceContentHash,
586
+ buildLoopProvenanceRecord,
587
+ loopProvenanceSpans,
588
+ provenanceRecordPath,
589
+ provenanceSpansPath,
590
+ emitLoopProvenance
591
+ };
592
+ //# sourceMappingURL=chunk-MZ2IYGGN.js.map