@tangle-network/agent-eval 0.20.12 → 0.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. package/CHANGELOG.md +76 -0
  2. package/README.md +39 -1
  3. package/dist/{chunk-75MCTH7P.js → chunk-3GN6U53I.js} +198 -3
  4. package/dist/chunk-3GN6U53I.js.map +1 -0
  5. package/dist/chunk-3IX6QTB7.js +1349 -0
  6. package/dist/chunk-3IX6QTB7.js.map +1 -0
  7. package/dist/{chunk-PKCVBYTQ.js → chunk-5IIQKMD5.js} +38 -2
  8. package/dist/chunk-5IIQKMD5.js.map +1 -0
  9. package/dist/{chunk-MCMV7DUL.js → chunk-ARZ6BEV6.js} +2 -2
  10. package/dist/{chunk-HKYRWNHV.js → chunk-HRZELXCR.js} +2 -2
  11. package/dist/{chunk-ODFINDLQ.js → chunk-KRR4VMH7.js} +11 -1
  12. package/dist/chunk-KRR4VMH7.js.map +1 -0
  13. package/dist/chunk-SNUHRBDL.js +154 -0
  14. package/dist/chunk-SNUHRBDL.js.map +1 -0
  15. package/dist/{chunk-KWUAAIHR.js → chunk-WOK2RTWG.js} +157 -1
  16. package/dist/chunk-WOK2RTWG.js.map +1 -0
  17. package/dist/{chunk-HNJLMAJ2.js → chunk-WOPGKVN4.js} +2 -2
  18. package/dist/cli.js +3 -2
  19. package/dist/cli.js.map +1 -1
  20. package/dist/{control-C8NKbF3w.d.ts → control-cxwMOAsy.d.ts} +3 -2
  21. package/dist/control.d.ts +4 -3
  22. package/dist/control.js +2 -2
  23. package/dist/emitter-B2XqDKFU.d.ts +121 -0
  24. package/dist/{feedback-trajectory-BGQ_ANCN.d.ts → feedback-trajectory-CB0A32o3.d.ts} +2 -1
  25. package/dist/index.d.ts +71 -83
  26. package/dist/index.js +48 -60
  27. package/dist/index.js.map +1 -1
  28. package/dist/openapi.json +1 -1
  29. package/dist/optimization.d.ts +3 -2
  30. package/dist/optimization.js +2 -2
  31. package/dist/reporting-Da2ihlcM.d.ts +672 -0
  32. package/dist/reporting.d.ts +5 -426
  33. package/dist/reporting.js +6 -2
  34. package/dist/{emitter-BYO2nSDA.d.ts → store-u47QaJ9G.d.ts} +1 -91
  35. package/dist/traces.d.ts +259 -3
  36. package/dist/traces.js +24 -4
  37. package/dist/wire/index.js +3 -2
  38. package/docs/research-report-methodology.md +155 -0
  39. package/package.json +10 -12
  40. package/dist/chunk-75MCTH7P.js.map +0 -1
  41. package/dist/chunk-IKFVX537.js +0 -717
  42. package/dist/chunk-IKFVX537.js.map +0 -1
  43. package/dist/chunk-KWUAAIHR.js.map +0 -1
  44. package/dist/chunk-ODFINDLQ.js.map +0 -1
  45. package/dist/chunk-PKCVBYTQ.js.map +0 -1
  46. /package/dist/{chunk-MCMV7DUL.js.map → chunk-ARZ6BEV6.js.map} +0 -0
  47. /package/dist/{chunk-HKYRWNHV.js.map → chunk-HRZELXCR.js.map} +0 -0
  48. /package/dist/{chunk-HNJLMAJ2.js.map → chunk-WOPGKVN4.js.map} +0 -0
@@ -1,717 +0,0 @@
1
- import {
2
- benjaminiHochberg,
3
- cohensD,
4
- confidenceInterval,
5
- pairedBootstrap,
6
- wilcoxonSignedRank
7
- } from "./chunk-ODFINDLQ.js";
8
-
9
- // src/release-confidence.ts
/**
 * Default release-confidence thresholds. Caller-supplied thresholds are
 * spread over these, so any key a caller omits falls back to the value here.
 */
var DEFAULT_THRESHOLDS = {
  // Corpus: a dataset manifest or scenario list must be supplied.
  requireCorpus: true,
  minScenarioCount: 1,
  // Minimum number of runs per split.
  minSearchRuns: 1,
  minHoldoutRuns: 1,
  requireHoldout: true,
  // Quality floors.
  minPassRate: 0.8,
  minMeanScore: 0.7,
  // Largest tolerated (search mean - holdout mean).
  maxOverfitGap: 0.15,
  // Budgets are unbounded unless the caller sets them.
  maxMeanCostUsd: Infinity,
  maxP95WallMs: Infinity,
  // Failed rows must carry actionable side information (ASI).
  requireAsiForFailures: true,
  // Scores strictly below this value count as failures.
  failureScoreThreshold: 0.5,
};
/**
 * Converts multi-shot trial records into release trace-evidence rows.
 *
 * @param {Array<object>} trials - Trial records; the fields read are
 *   scenarioId, variantId, split, score, ok, trace, cost, durationMs, error,
 *   asi and metadata.
 * @returns {Array<object>} One evidence row per trial, in input order.
 */
function releaseTraceEvidenceFromMultiShotTrials(trials) {
  const toEvidence = (trial) => {
    // Only "holdout" and "dev" are kept as-is; every other split is "search".
    let split = "search";
    if (trial.split === "holdout" || trial.split === "dev") {
      split = trial.split;
    }
    const turns = trial.trace?.turns;
    return {
      scenarioId: trial.scenarioId,
      candidateId: trial.variantId,
      split,
      score: trial.score,
      ok: trial.ok,
      turnCount: Array.isArray(turns) ? turns.length : undefined,
      costUsd: trial.cost,
      durationMs: trial.durationMs,
      // Any truthy error is reported under a single generic failure mode.
      failureMode: trial.error ? "runtime_error" : undefined,
      asi: trial.asi,
      metadata: trial.metadata,
    };
  };
  return trials.map(toEvidence);
}
/**
 * Builds a release-confidence scorecard for a candidate.
 *
 * Runs and traces are first narrowed to the candidate (or to everything that
 * is not the baseline), aggregate metrics are computed, and the corpus,
 * quality, generalization, diagnostics and efficiency checks each append
 * issues. The overall status is "fail" if any issue is critical, "warn" if
 * there are only non-critical issues, and "pass" otherwise.
 *
 * @param {object} input - Evaluation input. Fields read: target, candidateId,
 *   baselineId, thresholds, runs, traces, scenarios, dataset, gateDecision.
 * @returns {object} Scorecard with target, candidateId, baselineId, status,
 *   promote, axes, issues, metrics, dataset, gateDecision and summary.
 */
function evaluateReleaseConfidence(input) {
  const thresholds = { ...DEFAULT_THRESHOLDS, ...input.thresholds };
  const candidateId = input.candidateId ?? null;
  const gateDecision = input.gateDecision ?? null;
  const runs = filterCandidate(input.runs ?? [], candidateId, input.baselineId);
  const traces = filterTraceCandidate(input.traces ?? [], candidateId, input.baselineId);
  const scenarios = input.scenarios ?? [];
  // A dataset manifest, when present, overrides counts derived from scenarios.
  const scenarioCount = input.dataset?.scenarioCount ?? scenarios.length;
  const splitCounts = input.dataset?.splitCounts ?? countScenarioSplits(scenarios);
  const searchScores = scoresFor(runs, "search");
  const holdoutScores = scoresFor(runs, "holdout");
  const allScores = [...searchScores, ...holdoutScores];
  const traceScores = traces.map((t) => t.score).filter(isFiniteNumber);
  // Trace scores are only used for the mean when no run produced a score.
  const scoreUniverse = allScores.length > 0 ? allScores : traceScores;
  const searchRuns = runs.filter((r) => r.splitTag === "search").length;
  const holdoutRuns = runs.filter((r) => r.splitTag === "holdout").length;
  const searchMeanScore = mean(searchScores);
  const holdoutMeanScore = mean(holdoutScores);
  // Classify failures once; both failure metrics derive from the same rows.
  const failures = failedRows(runs, traces, thresholds.failureScoreThreshold);
  const metrics = {
    scenarioCount,
    searchRuns,
    holdoutRuns,
    passRate: passRate(runs, traces, thresholds.failureScoreThreshold),
    meanScore: mean(scoreUniverse),
    searchMeanScore,
    holdoutMeanScore,
    overfitGap: safeDiff(searchMeanScore, holdoutMeanScore),
    meanCostUsd: mean([...runs.map((r) => r.costUsd), ...traces.map((t) => t.costUsd).filter(isFiniteNumber)]),
    p95WallMs: percentile([...runs.map((r) => r.wallMs), ...traces.map((t) => t.durationMs).filter(isFiniteNumber)], 0.95),
    failedRows: failures.length,
    failuresWithAsi: failures.filter((row) => row.hasAsi).length,
    singleShotTraces: traces.filter((t) => t.turnCount === 1).length,
    multiShotTraces: traces.filter((t) => (t.turnCount ?? 0) > 1).length,
    splitCounts,
    domainCounts: countDomains(scenarios),
    failureModeCounts: countFailureModes(runs, traces, thresholds.failureScoreThreshold),
    responsibleSurfaceCounts: countResponsibleSurfaces(traces)
  };
  const issues = [];
  checkCorpus(input, thresholds, metrics, issues);
  checkQuality(thresholds, metrics, issues);
  checkGeneralization(gateDecision, thresholds, metrics, issues);
  checkDiagnostics(thresholds, metrics, issues);
  checkEfficiency(thresholds, metrics, issues);
  const axes = buildAxes(metrics, thresholds, gateDecision, issues);
  const status = issues.some((i) => i.severity === "critical") ? "fail" : issues.length > 0 ? "warn" : "pass";
  return {
    target: input.target,
    candidateId,
    baselineId: input.baselineId ?? null,
    status,
    // Without a gate decision, a passing scorecard promotes by default.
    promote: status === "pass" && (gateDecision ? gateDecision.promote : true),
    axes,
    issues,
    metrics,
    dataset: input.dataset ?? null,
    gateDecision,
    summary: renderSummary(input.target, status, metrics, issues)
  };
}
/**
 * Evaluates release confidence and throws when the scorecard fails.
 *
 * @param {object} input - Same input as evaluateReleaseConfidence.
 * @returns {object} The scorecard, when its status is not "fail".
 * @throws {Error} With the scorecard summary as message when status is "fail".
 */
function assertReleaseConfidence(input) {
  const result = evaluateReleaseConfidence(input);
  if (result.status !== "fail") {
    return result;
  }
  throw new Error(result.summary);
}
/**
 * Narrows runs to the candidate under evaluation.
 *
 * @param {Array<object>} runs - Run records carrying a candidateId.
 * @param {string|null} candidateId - When truthy, keep only this candidate.
 * @param {string|undefined} baselineId - Otherwise, when truthy, drop this one.
 * @returns {Array<object>} A new array; a shallow copy when neither id is set.
 */
function filterCandidate(runs, candidateId, baselineId) {
  let keep = null;
  if (candidateId) {
    keep = (run) => run.candidateId === candidateId;
  } else if (baselineId) {
    keep = (run) => run.candidateId !== baselineId;
  }
  return keep === null ? [...runs] : runs.filter(keep);
}
/**
 * Narrows traces to the candidate under evaluation. Traces with no
 * candidateId (undefined) are always kept.
 *
 * @param {Array<object>} traces - Trace rows with an optional candidateId.
 * @param {string|null} candidateId - When truthy, keep matching traces.
 * @param {string|undefined} baselineId - Otherwise, when truthy, drop these.
 * @returns {Array<object>} A new array; a shallow copy when neither id is set.
 */
function filterTraceCandidate(traces, candidateId, baselineId) {
  const untagged = (trace) => trace.candidateId === undefined;
  if (candidateId) {
    return traces.filter((trace) => untagged(trace) || trace.candidateId === candidateId);
  }
  if (baselineId) {
    return traces.filter((trace) => untagged(trace) || trace.candidateId !== baselineId);
  }
  return [...traces];
}
/**
 * Appends corpus-axis issues: missing corpus, too few scenarios, and a
 * missing holdout split.
 *
 * @param {object} input - Evaluation input; reads dataset and scenarios.
 * @param {object} thresholds - Reads requireCorpus, minScenarioCount and
 *   requireHoldout.
 * @param {object} metrics - Reads scenarioCount and splitCounts.holdout.
 * @param {Array<object>} issues - Mutated in place; issues are pushed onto it.
 * @returns {void}
 */
function checkCorpus(input, thresholds, metrics, issues) {
  const report = (code, detail) => {
    issues.push({ axis: "corpus", severity: "critical", code, detail });
  };
  if (thresholds.requireCorpus && !input.dataset && (input.scenarios?.length ?? 0) === 0) {
    report("missing_corpus", "No Dataset manifest or scenarios supplied.");
  }
  if (metrics.scenarioCount < thresholds.minScenarioCount) {
    report("few_scenarios", `${metrics.scenarioCount} scenario(s) < min ${thresholds.minScenarioCount}.`);
  }
  // A split-count record that omits the holdout key has no holdout scenarios;
  // comparing the raw value to 0 would let that case slip through.
  if (thresholds.requireHoldout && (metrics.splitCounts.holdout ?? 0) === 0) {
    report("missing_holdout_split", "Corpus has no holdout scenarios.");
  }
}
/**
 * Appends quality-axis issues for too few search runs, a low pass rate, or a
 * low mean score.
 *
 * @param {object} thresholds - Reads minSearchRuns, minPassRate, minMeanScore.
 * @param {object} metrics - Reads searchRuns, passRate and meanScore.
 * @param {Array<object>} issues - Mutated in place.
 * @returns {void}
 */
function checkQuality(thresholds, metrics, issues) {
  const report = (code, detail) => {
    issues.push({ axis: "quality", severity: "critical", code, detail });
  };
  const { searchRuns, passRate, meanScore } = metrics;
  if (searchRuns < thresholds.minSearchRuns) {
    report("few_search_runs", `${searchRuns} search run(s) < min ${thresholds.minSearchRuns}.`);
  }
  if (passRate < thresholds.minPassRate) {
    report("low_pass_rate", `passRate ${fmt(passRate)} < ${fmt(thresholds.minPassRate)}.`);
  }
  if (meanScore < thresholds.minMeanScore) {
    report("low_mean_score", `meanScore ${fmt(meanScore)} < ${fmt(thresholds.minMeanScore)}.`);
  }
}
/**
 * Appends generalization-axis issues: too few holdout runs, an excessive
 * search-vs-holdout gap, or a gate decision that rejected promotion.
 *
 * @param {object|null} gateDecision - Optional gate result; reads promote,
 *   rejectionCode and reason.
 * @param {object} thresholds - Reads requireHoldout, minHoldoutRuns,
 *   maxOverfitGap.
 * @param {object} metrics - Reads holdoutRuns and overfitGap.
 * @param {Array<object>} issues - Mutated in place.
 * @returns {void}
 */
function checkGeneralization(gateDecision, thresholds, metrics, issues) {
  const report = (code, detail) => {
    issues.push({ axis: "generalization", severity: "critical", code, detail });
  };
  const { holdoutRuns, overfitGap } = metrics;
  if (thresholds.requireHoldout && holdoutRuns < thresholds.minHoldoutRuns) {
    report("few_holdout_runs", `${holdoutRuns} holdout run(s) < min ${thresholds.minHoldoutRuns}.`);
  }
  // A non-finite gap (one of the means is missing) is not judged here.
  if (Number.isFinite(overfitGap) && overfitGap > thresholds.maxOverfitGap) {
    report("overfit_gap", `search-holdout gap ${fmt(overfitGap)} > ${fmt(thresholds.maxOverfitGap)}.`);
  }
  if (gateDecision && !gateDecision.promote) {
    report(`gate_${gateDecision.rejectionCode ?? "reject"}`, gateDecision.reason);
  }
}
/**
 * Appends a diagnostics-axis issue when some failed rows carry no actionable
 * side information (ASI).
 *
 * @param {object} thresholds - Reads requireAsiForFailures.
 * @param {object} metrics - Reads failedRows and failuresWithAsi.
 * @param {Array<object>} issues - Mutated in place.
 * @returns {void}
 */
function checkDiagnostics(thresholds, metrics, issues) {
  const { failedRows: failed, failuresWithAsi: explained } = metrics;
  if (thresholds.requireAsiForFailures && failed > explained) {
    const unexplained = failed - explained;
    issues.push({
      axis: "diagnostics",
      severity: "critical",
      code: "missing_failure_asi",
      detail: `${unexplained} failed row(s) have no actionable side information.`,
    });
  }
}
/**
 * Appends efficiency-axis issues when mean cost or p95 wall time exceeds its
 * budget.
 *
 * @param {object} thresholds - Reads maxMeanCostUsd and maxP95WallMs.
 * @param {object} metrics - Reads meanCostUsd and p95WallMs.
 * @param {Array<object>} issues - Mutated in place.
 * @returns {void}
 */
function checkEfficiency(thresholds, metrics, issues) {
  const budgets = [
    ["cost_budget", "meanCostUsd", metrics.meanCostUsd, thresholds.maxMeanCostUsd],
    ["latency_budget", "p95WallMs", metrics.p95WallMs, thresholds.maxP95WallMs],
  ];
  for (const [code, label, actual, limit] of budgets) {
    if (actual > limit) {
      issues.push({ axis: "efficiency", severity: "critical", code, detail: `${label} ${fmt(actual)} > ${fmt(limit)}.` });
    }
  }
}
/**
 * Produces the five per-axis summaries (corpus, quality, generalization,
 * diagnostics, efficiency), each with a status, a score and a detail string.
 *
 * @param {object} metrics - Aggregate metrics.
 * @param {object} thresholds - Effective thresholds.
 * @param {object|null} gateDecision - Optional gate result.
 * @param {Array<object>} issues - Issues already collected by the checks.
 * @returns {Array<object>} Axis summaries in a fixed order.
 */
function buildAxes(metrics, thresholds, gateDecision, issues) {
  const corpusScore = bounded(metrics.scenarioCount / Math.max(1, thresholds.minScenarioCount));
  const qualityScore = Math.min(metrics.passRate, metrics.meanScore);
  // A rejecting gate zeroes the generalization score regardless of the gap.
  const gateRejected = gateDecision && !gateDecision.promote;
  const generalizationScore = gateRejected ? 0 : gapScore(metrics.overfitGap, thresholds.maxOverfitGap);
  const diagnosticsScore = metrics.failedRows === 0 ? 1 : metrics.failuresWithAsi / metrics.failedRows;

  const corpus = axis("corpus", issues, corpusScore, `${metrics.scenarioCount} scenarios; holdout=${metrics.splitCounts.holdout}`);
  const quality = axis("quality", issues, qualityScore, `passRate=${fmt(metrics.passRate)} meanScore=${fmt(metrics.meanScore)}`);
  const generalization = axis("generalization", issues, generalizationScore, `holdoutRuns=${metrics.holdoutRuns} overfitGap=${fmt(metrics.overfitGap)}`);
  const diagnostics = axis("diagnostics", issues, diagnosticsScore, `failuresWithAsi=${metrics.failuresWithAsi}/${metrics.failedRows}`);
  const efficiency = axis("efficiency", issues, efficiencyScore(metrics, thresholds), `meanCostUsd=${fmt(metrics.meanCostUsd)} p95WallMs=${fmt(metrics.p95WallMs)}`);
  return [corpus, quality, generalization, diagnostics, efficiency];
}
/**
 * Summarizes one axis: its status comes from the issues filed against it and
 * its score is clamped by bounded().
 *
 * @param {string} name - Axis name to match against issue.axis.
 * @param {Array<object>} issues - All collected issues.
 * @param {number} score - Raw axis score.
 * @param {string} detail - Human-readable detail text.
 * @returns {{name: string, status: string, score: number, detail: string}}
 */
function axis(name, issues, score, detail) {
  let status = "pass";
  for (const issue of issues) {
    if (issue.axis !== name) continue;
    if (issue.severity === "critical") {
      // One critical issue decides the status; no need to look further.
      status = "fail";
      break;
    }
    status = "warn";
  }
  return { name, status, score: bounded(score), detail };
}
/**
 * Counts scenarios per split. Scenarios with no split (null or undefined)
 * are counted as "train".
 *
 * The four standard splits are always present in the result. A scenario with
 * any other split label gets its own counter starting at zero; incrementing
 * a missing key directly would store NaN.
 *
 * @param {Array<object>} scenarios - Scenarios with an optional split field.
 * @returns {Object<string, number>} Split label to scenario count.
 */
function countScenarioSplits(scenarios) {
  const counts = { train: 0, dev: 0, test: 0, holdout: 0 };
  for (const scenario of scenarios) {
    const split = scenario.split ?? "train";
    // Own-property check so labels such as "constructor" start from 0 too.
    const seen = Object.prototype.hasOwnProperty.call(counts, split) ? counts[split] : 0;
    counts[split] = seen + 1;
  }
  return counts;
}
/**
 * Counts scenarios per domain. The domain is tags.domain, falling back to
 * tags.category, falling back to "uncategorized".
 *
 * @param {Array<object>} scenarios - Scenarios with optional tags.
 * @returns {Object<string, number>} Domain label to scenario count.
 */
function countDomains(scenarios) {
  const tally = {};
  for (const { tags } of scenarios) {
    const label = tags?.domain ?? tags?.category ?? "uncategorized";
    const previous = tally[label] ?? 0;
    tally[label] = previous + 1;
  }
  return tally;
}
/**
 * Tallies failure modes across runs and traces.
 *
 * A run fails when it has a truthy failureMode or its score (holdoutScore,
 * else searchScore) is defined and below the threshold. A trace fails when
 * it has a truthy failureMode, ok === false, or a defined score below the
 * threshold. Rows without a failureMode are labelled "not_ok" (traces with
 * ok === false) or "low_score".
 *
 * @param {Array<object>} runs - Run records.
 * @param {Array<object>} traces - Trace rows.
 * @param {number} threshold - Failure score threshold.
 * @returns {Object<string, number>} Failure mode to count.
 */
function countFailureModes(runs, traces, threshold) {
  const tally = {};
  const record = (mode) => {
    tally[mode] = (tally[mode] ?? 0) + 1;
  };
  const isLow = (score) => score !== undefined && score < threshold;

  for (const run of runs) {
    const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
    if (run.failureMode || isLow(score)) {
      record(run.failureMode ?? "low_score");
    }
  }
  for (const trace of traces) {
    const failed = trace.failureMode || trace.ok === false || isLow(trace.score);
    if (!failed) continue;
    const fallback = trace.ok === false ? "not_ok" : "low_score";
    record(trace.failureMode ?? fallback);
  }
  return tally;
}
/**
 * Tallies the responsibleSurface of every ASI entry attached to the traces.
 * Entries without one are counted under "unknown".
 *
 * @param {Array<object>} traces - Trace rows with an optional asi list.
 * @returns {Object<string, number>} Surface name to count.
 */
function countResponsibleSurfaces(traces) {
  const tally = {};
  for (const { asi } of traces) {
    if (asi == null) continue;
    for (const entry of asi) {
      const surface = entry.responsibleSurface ?? "unknown";
      tally[surface] = (tally[surface] ?? 0) + 1;
    }
  }
  return tally;
}
/**
 * Lists one entry per failed run or trace, flagging whether the row carries
 * actionable side information (ASI).
 *
 * Runs count as having ASI when outcome.raw.asi is a number greater than 0;
 * traces when their asi list is non-empty. Failure criteria match
 * countFailureModes.
 *
 * @param {Array<object>} runs - Run records.
 * @param {Array<object>} traces - Trace rows.
 * @param {number} threshold - Failure score threshold.
 * @returns {Array<{hasAsi: boolean}>} Failed rows, runs first, then traces.
 */
function failedRows(runs, traces, threshold) {
  const isLow = (score) => score !== undefined && score < threshold;
  const rows = [];
  for (const run of runs) {
    const { outcome } = run;
    const score = outcome.holdoutScore ?? outcome.searchScore;
    if (!(run.failureMode || isLow(score))) continue;
    const asiCount = outcome.raw.asi;
    rows.push({ hasAsi: typeof asiCount === "number" && asiCount > 0 });
  }
  for (const trace of traces) {
    const failed = trace.failureMode || trace.ok === false || isLow(trace.score);
    if (!failed) continue;
    rows.push({ hasAsi: (trace.asi?.length ?? 0) > 0 });
  }
  return rows;
}
/**
 * Fraction of runs and traces that pass.
 *
 * A run passes only with no failureMode and a defined score (holdoutScore,
 * else searchScore) at or above the threshold. A trace passes unless
 * ok === false or it has a defined score below the threshold.
 *
 * @param {Array<object>} runs - Run records.
 * @param {Array<object>} traces - Trace rows.
 * @param {number} threshold - Failure score threshold.
 * @returns {number} Pass fraction in [0, 1]; 0 when there are no rows.
 */
function passRate(runs, traces, threshold) {
  let total = 0;
  let passed = 0;
  for (const run of runs) {
    total += 1;
    const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
    if (!run.failureMode && score !== undefined && score >= threshold) {
      passed += 1;
    }
  }
  for (const trace of traces) {
    total += 1;
    const scoreOk = trace.score === undefined || trace.score >= threshold;
    if (trace.ok !== false && scoreOk) {
      passed += 1;
    }
  }
  if (total === 0) return 0;
  return passed / total;
}
/**
 * Collects the finite scores of runs tagged with the given split. Holdout
 * runs contribute outcome.holdoutScore; all others outcome.searchScore.
 *
 * @param {Array<object>} runs - Run records.
 * @param {string} split - Split tag to select.
 * @returns {number[]} Finite scores in run order.
 */
function scoresFor(runs, split) {
  const field = split === "holdout" ? "holdoutScore" : "searchScore";
  const scores = [];
  for (const run of runs) {
    if (run.splitTag !== split) continue;
    const score = run.outcome[field];
    if (isFiniteNumber(score)) scores.push(score);
  }
  return scores;
}
/**
 * Arithmetic mean.
 *
 * @param {number[]} xs - Values to average.
 * @returns {number} The mean, or NaN for an empty list.
 */
function mean(xs) {
  const count = xs.length;
  if (count === 0) return NaN;
  let total = 0;
  for (const x of xs) total += x;
  return total / count;
}
/**
 * Nearest-rank percentile: the value at index ceil(p * n) - 1 of the sorted
 * copy, with the index clamped into the array. The input is not mutated.
 *
 * @param {number[]} xs - Values.
 * @param {number} p - Percentile as a fraction, e.g. 0.95.
 * @returns {number} The selected value, or NaN for an empty list.
 */
function percentile(xs, p) {
  const n = xs.length;
  if (n === 0) return NaN;
  const ascending = [...xs];
  ascending.sort((a, b) => a - b);
  const rank = Math.ceil(p * n) - 1;
  const index = Math.min(n - 1, Math.max(0, rank));
  return ascending[index];
}
/**
 * True only for values of type number that are neither NaN nor infinite.
 *
 * @param {*} value - Anything.
 * @returns {boolean}
 */
function isFiniteNumber(value) {
  // Number.isFinite never coerces, so non-number inputs are already rejected.
  return Number.isFinite(value);
}
/**
 * Difference a - b, or NaN when either operand is not a finite number.
 *
 * @param {number} a - Minuend.
 * @param {number} b - Subtrahend.
 * @returns {number}
 */
function safeDiff(a, b) {
  const bothFinite = Number.isFinite(a) && Number.isFinite(b);
  return bothFinite ? a - b : NaN;
}
/**
 * Maps an overfit gap to a score: 1 when there is no positive gap, falling
 * linearly to 0 as the gap reaches maxGap.
 *
 * @param {number} gap - Search mean minus holdout mean.
 * @param {number} maxGap - Largest tolerated gap.
 * @returns {number} Score in [0, 1]; 0 when the gap is not finite.
 */
function gapScore(gap, maxGap) {
  if (!Number.isFinite(gap)) {
    return 0;
  }
  if (maxGap <= 0) {
    // No tolerance: any positive gap scores 0.
    return gap > 0 ? 0 : 1;
  }
  const positiveGap = Math.max(0, gap);
  return bounded(1 - positiveGap / maxGap);
}
/**
 * Efficiency score: the smaller of the cost and latency headroom ratios
 * (budget / actual, clamped by bounded()). A dimension with a non-finite
 * budget or a non-finite measurement scores 1.
 *
 * @param {object} metrics - Reads meanCostUsd and p95WallMs.
 * @param {object} thresholds - Reads maxMeanCostUsd and maxP95WallMs.
 * @returns {number} Score in [0, 1].
 */
function efficiencyScore(metrics, thresholds) {
  const headroom = (limit, actual) => {
    if (!Number.isFinite(limit) || !Number.isFinite(actual)) return 1;
    // The floor keeps the division defined for zero or negative measurements.
    return bounded(limit / Math.max(actual, 1e-12));
  };
  const cost = headroom(thresholds.maxMeanCostUsd, metrics.meanCostUsd);
  const latency = headroom(thresholds.maxP95WallMs, metrics.p95WallMs);
  return Math.min(cost, latency);
}
/**
 * Clamps a value into [0, 1]; anything that is not a finite number becomes 0.
 *
 * @param {number} x - Value to clamp.
 * @returns {number}
 */
function bounded(x) {
  if (!Number.isFinite(x)) {
    return 0;
  }
  const capped = Math.min(1, x);
  return Math.max(0, capped);
}
/**
 * Renders the one-line scorecard summary, with issue codes appended when
 * there are any.
 *
 * @param {string} target - Release target name.
 * @param {string} status - Overall status.
 * @param {object} metrics - Aggregate metrics.
 * @param {Array<object>} issues - Collected issues.
 * @returns {string}
 */
function renderSummary(target, status, metrics, issues) {
  const parts = [
    `release confidence ${status}: ${target}`,
    `scenarios=${metrics.scenarioCount} searchRuns=${metrics.searchRuns} holdoutRuns=${metrics.holdoutRuns} passRate=${fmt(metrics.passRate)} meanScore=${fmt(metrics.meanScore)}`,
  ];
  if (issues.length > 0) {
    const codes = issues.map((issue) => issue.code);
    parts.push(`issues=${codes.join(",")}`);
  }
  return parts.join("; ");
}
/**
 * Formats a number with four decimals; non-finite values are stringified
 * as-is (for example "NaN" or "Infinity").
 *
 * @param {number} x - Value to format.
 * @returns {string}
 */
function fmt(x) {
  return Number.isFinite(x) ? x.toFixed(4) : String(x);
}
291
-
292
- // src/summary-report.ts
/**
 * Builds a per-candidate summary table for one split.
 *
 * Each candidate gets its sample size, mean and confidence interval. When a
 * comparator is given and present in the data, every other candidate also
 * gets Cohen's d against it and, with at least six paired scores, a Wilcoxon
 * signed-rank p-value. Those p-values are then adjusted together with
 * Benjamini-Hochberg and reported as qValue.
 *
 * @param {Array<object>} runs - Run records.
 * @param {object} [opts] - Optional split ("holdout" by default), confidence
 *   (0.95), fdr (0.05) and comparator (candidate id).
 * @returns {{rows: Array<object>, comparator: (string|null), split: string,
 *   markdown: string}}
 */
function summaryTable(runs, opts = {}) {
  const split = opts.split ?? "holdout";
  const confidence = opts.confidence ?? 0.95;
  const fdr = opts.fdr ?? 0.05;
  const comparator = opts.comparator ?? null;
  const scoreField = split === "holdout" ? "holdoutScore" : "searchScore";

  // Group usable runs (matching split, finite score) by candidate.
  const groups = new Map();
  for (const run of runs) {
    if (run.splitTag !== split) continue;
    const score = run.outcome[scoreField];
    if (typeof score !== "number" || !Number.isFinite(score)) continue;
    let group = groups.get(run.candidateId);
    if (group === undefined) {
      group = { runs: [], scores: [] };
      groups.set(run.candidateId, group);
    }
    group.runs.push(run);
    group.scores.push(score);
  }

  const reference = comparator ? groups.get(comparator) : undefined;
  const rows = [];
  // Unadjusted p-values, index-aligned with rows; NaN means "not tested".
  const rawPs = [];
  for (const candidateId of [...groups.keys()].sort()) {
    const group = groups.get(candidateId);
    const interval = confidenceInterval(group.scores, confidence);
    let rawP = NaN;
    let effect = NaN;
    if (comparator && reference && candidateId !== comparator) {
      const { before, after } = pairScoresByKey(group.runs, reference.runs, scoreField);
      if (before.length >= 6) {
        rawP = wilcoxonSignedRank(before, after).p;
      }
      effect = cohensD(reference.scores, group.scores);
    }
    rows.push({
      candidateId,
      n: group.scores.length,
      mean: interval.mean,
      ciLow: interval.lower,
      ciHigh: interval.upper,
      qValue: rawP,
      cohensD: effect,
    });
    rawPs.push(rawP);
  }

  if (comparator) {
    const tested = [];
    rows.forEach((row, index) => {
      if (row.candidateId !== comparator && Number.isFinite(rawPs[index])) {
        tested.push(index);
      }
    });
    if (tested.length > 0) {
      const { qValues } = benjaminiHochberg(tested.map((index) => rawPs[index]), fdr);
      tested.forEach((rowIndex, k) => {
        rows[rowIndex].qValue = qValues[k];
      });
    }
  }

  const markdown = renderSummaryTableMarkdown(rows, comparator, split);
  return { rows, comparator, split, markdown };
}
/**
 * Pairs candidate and baseline scores that share an experimentId and seed.
 * Runs without a finite score, and candidate runs with no baseline match,
 * are skipped. When several baseline runs share a key, the last one wins.
 *
 * @param {Array<object>} candidate - Candidate runs.
 * @param {Array<object>} baseline - Baseline runs.
 * @param {string} scoreField - Name of the outcome field to read.
 * @returns {{before: number[], after: number[]}} Baseline scores (before) and
 *   candidate scores (after), index-aligned in candidate order.
 */
function pairScoresByKey(candidate, baseline, scoreField) {
  const usable = (value) => typeof value === "number" && Number.isFinite(value);
  const keyOf = (run) => `${run.experimentId}::${run.seed}`;

  const baselineByKey = new Map();
  for (const run of baseline) {
    const score = run.outcome[scoreField];
    if (usable(score)) {
      baselineByKey.set(keyOf(run), score);
    }
  }

  const before = [];
  const after = [];
  for (const run of candidate) {
    const score = run.outcome[scoreField];
    if (!usable(score)) continue;
    const matched = baselineByKey.get(keyOf(run));
    if (matched === undefined) continue;
    before.push(matched);
    after.push(score);
  }
  return { before, after };
}
/**
 * Renders summary rows as a Markdown table preceded by a title line.
 * Non-finite q-values and effect sizes are shown as an em dash.
 *
 * @param {Array<object>} rows - Rows with candidateId, n, mean, ciLow,
 *   ciHigh, qValue and cohensD.
 * @param {string|null} comparator - Comparator id, shown in the title if set.
 * @param {string} split - Split name, shown in the title.
 * @returns {string} Lines joined with "\n".
 */
function renderSummaryTableMarkdown(rows, comparator, split) {
  const suffix = comparator ? ` (vs ${comparator})` : "";
  const header = [
    `Summary Table \u2014 ${split} split${suffix}`,
    "",
    "| Candidate | N | Mean | 95% CI | q (BH) | Cohen's d |",
    "|---|---:|---:|---|---:|---:|",
  ];
  const body = rows.map((row) => {
    const interval = `[${fmt2(row.ciLow)}, ${fmt2(row.ciHigh)}]`;
    const q = Number.isFinite(row.qValue) ? row.qValue.toFixed(4) : "\u2014";
    const d = Number.isFinite(row.cohensD) ? row.cohensD.toFixed(3) : "\u2014";
    return `| ${row.candidateId} | ${row.n} | ${fmt2(row.mean)} | ${interval} | ${q} | ${d} |`;
  });
  return [...header, ...body].join("\n");
}
/**
 * Builds cost-vs-quality chart data with one point per candidate and marks
 * the points that no other point dominates.
 *
 * @param {Array<object>} runs - Run records.
 * @param {object} [opts] - Optional split ("holdout" by default) and
 *   gateDecisions (candidate id to gate decision).
 * @returns {{kind: string, split: string, axes: object, points: Array<object>}}
 */
function paretoChart(runs, opts = {}) {
  const split = opts.split ?? "holdout";
  const scoreField = split === "holdout" ? "holdoutScore" : "searchScore";

  const grouped = new Map();
  for (const run of runs) {
    if (run.splitTag !== split) continue;
    const quality = run.outcome[scoreField];
    if (!(typeof quality === "number" && Number.isFinite(quality))) continue;
    let bucket = grouped.get(run.candidateId);
    if (bucket === undefined) {
      bucket = { cost: [], quality: [] };
      grouped.set(run.candidateId, bucket);
    }
    bucket.cost.push(run.costUsd);
    bucket.quality.push(quality);
  }

  const points = Array.from(grouped, ([candidateId, bucket]) => {
    const decision = opts.gateDecisions?.[candidateId];
    return {
      candidateId,
      cost: avg(bucket.cost),
      quality: avg(bucket.quality),
      n: bucket.cost.length,
      onFrontier: false,
      gate: decision ? gateLabel(decision) : undefined,
    };
  });

  // Frontier membership needs the full point set, hence the second pass.
  for (const point of points) {
    point.onFrontier = points.every((other) => other === point || !dominates(other, point));
  }

  return {
    kind: "pareto-cost-quality",
    split,
    axes: { x: "costUsd", y: "score" },
    points,
  };
}
/**
 * Pareto dominance: a is no worse than b on both cost and quality, and
 * strictly better on at least one.
 *
 * @param {{cost: number, quality: number}} a
 * @param {{cost: number, quality: number}} b
 * @returns {boolean}
 */
function dominates(a, b) {
  const noWorse = a.cost <= b.cost && a.quality >= b.quality;
  if (!noWorse) {
    return false;
  }
  return a.cost < b.cost || a.quality > b.quality;
}
/**
 * Maps a gate decision to a chart label.
 *
 * @param {{promote: *, rejectionCode: (string|undefined)}} d - Gate decision.
 * @returns {string|null} "promote", "reject_<code>" for the three known
 *   rejection codes, or null for any other rejection.
 */
function gateLabel(d) {
  if (d.promote) {
    return "promote";
  }
  switch (d.rejectionCode) {
    case "few_runs":
    case "negative_delta":
    case "overfit_gap":
      return `reject_${d.rejectionCode}`;
    default:
      return null;
  }
}
/**
 * Builds a histogram of paired score gains (candidate minus comparator) with
 * the median gain and a paired-bootstrap confidence interval for it.
 *
 * Bins are symmetric around zero: the range is [-bound, +bound], where bound
 * is the largest absolute gain (at least 1e-6).
 *
 * @param {Array<object>} runs - Run records.
 * @param {string} candidateId - Candidate to evaluate.
 * @param {string} comparator - Candidate id used as the baseline.
 * @param {object} [opts] - Optional split ("holdout" by default), bins (11),
 *   confidence (0.95), resamples (2000) and seed.
 * @returns {object} Chart data with kind, candidateId, comparator, split, n,
 *   bins, median and ci. With no paired scores, n is 0, bins is empty and
 *   median/ci are 0.
 * @throws {Error} When bins is not an integer greater than or equal to 1.
 */
function gainHistogram(runs, candidateId, comparator, opts = {}) {
  const split = opts.split ?? "holdout";
  const scoreField = split === "holdout" ? "holdoutScore" : "searchScore";
  const binCount = opts.bins ?? 11;
  // NaN and fractional counts pass a plain "< 1" test, then index outside the
  // bin array further down; reject them here instead.
  if (!Number.isInteger(binCount) || binCount < 1) {
    throw new Error("gainHistogram: bins must be an integer \u2265 1");
  }
  const candidate = runs.filter((r) => r.candidateId === candidateId && r.splitTag === split);
  const baseline = runs.filter((r) => r.candidateId === comparator && r.splitTag === split);
  const { before, after } = pairScoresByKey(candidate, baseline, scoreField);
  const n = before.length;
  if (n === 0) {
    return {
      kind: "gain-distribution",
      candidateId,
      comparator,
      split,
      n: 0,
      bins: [],
      median: 0,
      ci: { low: 0, high: 0 }
    };
  }
  const deltas = before.map((b, i) => after[i] - b);
  const sortedDeltas = [...deltas].sort((a, b) => a - b);
  const median = medianOfSorted(sortedDeltas);
  const min = sortedDeltas[0];
  const max = sortedDeltas[sortedDeltas.length - 1];
  // The 1e-6 floor keeps the bin width non-zero when every gain is 0.
  const bound = Math.max(Math.abs(min), Math.abs(max), 1e-6);
  const lo = -bound;
  const hi = bound;
  const width = (hi - lo) / binCount;
  const bins = [];
  for (let i = 0; i < binCount; i++) {
    bins.push({ lo: lo + i * width, hi: lo + (i + 1) * width, count: 0 });
  }
  for (const d of deltas) {
    // Clamp so a gain exactly at the upper bound lands in the last bin.
    const idx = Math.min(binCount - 1, Math.max(0, Math.floor((d - lo) / width)));
    bins[idx].count += 1;
  }
  const ci = pairedBootstrap(before, after, {
    confidence: opts.confidence ?? 0.95,
    resamples: opts.resamples ?? 2e3,
    statistic: "median",
    seed: opts.seed
  });
  return {
    kind: "gain-distribution",
    candidateId,
    comparator,
    split,
    n,
    bins,
    median,
    ci: { low: ci.low, high: ci.high }
  };
}
/**
 * Arithmetic mean.
 *
 * @param {number[]} xs - Values to average.
 * @returns {number} The mean, or NaN for an empty list.
 */
function avg(xs) {
  if (xs.length === 0) {
    return NaN;
  }
  let total = 0;
  for (const x of xs) {
    total += x;
  }
  return total / xs.length;
}
/**
 * Median of an array that is already sorted ascending.
 *
 * @param {number[]} sorted - Sorted values.
 * @returns {number} The middle value, the mean of the two middle values for
 *   an even length, or 0 for an empty array.
 */
function medianOfSorted(sorted) {
  const n = sorted.length;
  if (n === 0) return 0;
  const mid = Math.floor(n / 2);
  if (n % 2 !== 0) {
    return sorted[mid];
  }
  return (sorted[mid - 1] + sorted[mid]) / 2;
}
/**
 * Formats a number with four decimals; non-finite values are stringified
 * as-is.
 *
 * @param {number} x - Value to format.
 * @returns {string}
 */
function fmt2(x) {
  return Number.isFinite(x) ? x.toFixed(4) : String(x);
}
506
-
507
- // src/release-report.ts
/**
 * Renders a release-confidence scorecard as a Markdown report.
 *
 * Sections: header (status, promote, ids, summary), metrics table, then
 * issues, responsible surfaces, failure modes, run summary, TraceAnalyst
 * findings and next actions, each only when it has content.
 *
 * Cost and wall-time cells render "n/a" when the metric is not finite (for
 * example when there were no runs), matching the other metric cells.
 *
 * @param {object} scorecard - Scorecard from evaluateReleaseConfidence.
 * @param {object} [options] - Optional title, runs, comparator,
 *   traceAnalystFindings and nextActions.
 * @returns {string} Markdown text ending with a single newline.
 */
function renderReleaseReport(scorecard, options = {}) {
  const { metrics } = scorecard;
  const title = options.title ?? `Release Report: ${scorecard.target}`;
  const lines = [];
  lines.push(`# ${title}`);
  lines.push("");
  lines.push(`Status: **${scorecard.status.toUpperCase()}**`);
  lines.push(`Promote: **${scorecard.promote ? "yes" : "no"}**`);
  if (scorecard.candidateId) lines.push(`Candidate: \`${scorecard.candidateId}\``);
  if (scorecard.baselineId) lines.push(`Baseline: \`${scorecard.baselineId}\``);
  lines.push("");
  lines.push(scorecard.summary);
  lines.push("");
  lines.push("## Metrics");
  lines.push("");
  lines.push("| Metric | Value |");
  lines.push("|---|---:|");
  lines.push(`| Scenarios | ${metrics.scenarioCount} |`);
  lines.push(`| Search runs | ${metrics.searchRuns} |`);
  lines.push(`| Holdout runs | ${metrics.holdoutRuns} |`);
  lines.push(`| Pass rate | ${pct(metrics.passRate)} |`);
  lines.push(`| Mean score | ${num(metrics.meanScore)} |`);
  lines.push(`| Search mean | ${num(metrics.searchMeanScore)} |`);
  lines.push(`| Holdout mean | ${num(metrics.holdoutMeanScore)} |`);
  lines.push(`| Overfit gap | ${num(metrics.overfitGap)} |`);
  // Keep the unit off the placeholder: "n/a", not "$n/a" or "NaN ms".
  const costCell = Number.isFinite(metrics.meanCostUsd) ? `$${num(metrics.meanCostUsd)}` : "n/a";
  const wallCell = Number.isFinite(metrics.p95WallMs) ? `${Math.round(metrics.p95WallMs)} ms` : "n/a";
  lines.push(`| Mean cost | ${costCell} |`);
  lines.push(`| p95 wall time | ${wallCell} |`);
  lines.push("");
  if (scorecard.issues.length > 0) {
    lines.push("## Issues");
    lines.push("");
    for (const issue of scorecard.issues) {
      lines.push(`- **${issue.severity}** \`${issue.code}\` (${issue.axis}): ${issue.detail}`);
    }
    lines.push("");
  }
  const surfaces = entries(metrics.responsibleSurfaceCounts);
  if (surfaces.length > 0) {
    lines.push("## Responsible Surfaces");
    lines.push("");
    for (const [surface, count] of surfaces) lines.push(`- ${surface}: ${count}`);
    lines.push("");
  }
  const failures = entries(metrics.failureModeCounts);
  if (failures.length > 0) {
    lines.push("## Failure Modes");
    lines.push("");
    for (const [mode, count] of failures) lines.push(`- ${mode}: ${count}`);
    lines.push("");
  }
  if (options.runs && options.runs.length > 0) {
    lines.push("## Run Summary");
    lines.push("");
    lines.push(summaryTable([...options.runs], {
      comparator: options.comparator ?? scorecard.baselineId ?? void 0,
      split: "holdout"
    }).markdown);
    lines.push("");
  }
  if (options.traceAnalystFindings && options.traceAnalystFindings.length > 0) {
    lines.push("## TraceAnalyst Findings");
    lines.push("");
    for (const finding of options.traceAnalystFindings) lines.push(`- ${finding}`);
    lines.push("");
  }
  const nextActions = options.nextActions ?? defaultNextActions(scorecard);
  if (nextActions.length > 0) {
    lines.push("## Next Actions");
    lines.push("");
    for (const action of nextActions) lines.push(`- ${action}`);
    lines.push("");
  }
  return lines.join("\n").trimEnd() + "\n";
}
/**
 * Default follow-up actions for a scorecard: a single promotion note when it
 * promotes, otherwise one "Resolve ..." line per critical issue.
 *
 * @param {object} scorecard - Reads promote and issues.
 * @returns {string[]}
 */
function defaultNextActions(scorecard) {
  if (scorecard.promote) {
    return ["Promote the candidate and keep canaries enabled."];
  }
  const actions = [];
  for (const issue of scorecard.issues) {
    if (issue.severity === "critical") {
      actions.push(`Resolve ${issue.code}: ${issue.detail}`);
    }
  }
  return actions;
}
/**
 * Lists the positive entries of a count map, highest count first, with ties
 * broken by name via localeCompare.
 *
 * @param {Object<string, number>} values - Name to count.
 * @returns {Array<[string, number]>}
 */
function entries(values) {
  const positive = Object.entries(values).filter((pair) => pair[1] > 0);
  const byCountThenName = (a, b) => b[1] - a[1] || a[0].localeCompare(b[0]);
  positive.sort(byCountThenName);
  return positive;
}
/**
 * Formats a fraction as a percentage with one decimal, or "n/a" when the
 * value is not finite.
 *
 * @param {number} value - Fraction, e.g. 0.5 for 50%.
 * @returns {string}
 */
function pct(value) {
  if (!Number.isFinite(value)) {
    return "n/a";
  }
  const percent = value * 100;
  return `${percent.toFixed(1)}%`;
}
/**
 * Formats a number with three decimals, or "n/a" when it is not finite.
 *
 * @param {number} value - Value to format.
 * @returns {string}
 */
function num(value) {
  if (!Number.isFinite(value)) {
    return "n/a";
  }
  return value.toFixed(3);
}
594
-
595
- // src/promotion-gate.ts
/**
 * Bootstrap confidence interval for the difference in means
 * (candidate - baseline), with a promotion verdict.
 *
 * Both samples are resampled independently with replacement. The verdict is
 * "ADVANCE" when the whole interval is above zero, "REVERT" when it is below
 * zero, "KEEP" when it straddles zero and the observed delta is not
 * negative, and "INCONCLUSIVE" otherwise.
 *
 * The result is "INCONCLUSIVE" with an infinite interval and iterations: 0
 * when either sample is empty, when the combined sample count is below
 * minTotalSamples, or when fewer than one resampling iteration is requested
 * (no resamples means there are no percentiles to read).
 *
 * @param {number[]} baseline - Baseline scores.
 * @param {number[]} candidate - Candidate scores.
 * @param {object} [options] - Optional alpha (0.05), iterations (1000),
 *   minTotalSamples (6) and seed. Without a seed, one is derived from the
 *   data, so identical inputs give identical results.
 * @returns {object} baselineMean, candidateMean, delta, ciLower, ciUpper,
 *   iterations, alpha and verdict.
 */
function bootstrapCi(baseline, candidate, options = {}) {
  const alpha = options.alpha ?? 0.05;
  const iterations = options.iterations ?? 1e3;
  const minTotal = options.minTotalSamples ?? 6;
  const rng = mulberry32(options.seed ?? hashSeed(baseline, candidate));
  const baselineMean = mean2(baseline);
  const candidateMean = mean2(candidate);
  const delta = candidateMean - baselineMean;
  const tooFewSamples = baseline.length + candidate.length < minTotal || baseline.length === 0 || candidate.length === 0;
  // "!(x >= 1)" also catches NaN, which a plain "x < 1" would let through.
  if (tooFewSamples || !(iterations >= 1)) {
    return {
      baselineMean,
      candidateMean,
      delta,
      ciLower: -Infinity,
      ciUpper: Infinity,
      iterations: 0,
      alpha,
      verdict: "INCONCLUSIVE"
    };
  }
  const deltas = new Array(iterations);
  for (let i = 0; i < iterations; i++) {
    const bResample = resample(baseline, rng);
    const cResample = resample(candidate, rng);
    deltas[i] = mean2(cResample) - mean2(bResample);
  }
  deltas.sort((a, b) => a - b);
  // Clamp both percentile indices into the array: with very few iterations
  // the upper index computes to -1, which would read undefined.
  const clampIndex = (index) => Math.min(iterations - 1, Math.max(0, index));
  const lowerIdx = clampIndex(Math.floor(alpha / 2 * iterations));
  const upperIdx = clampIndex(Math.floor((1 - alpha / 2) * iterations) - 1);
  const ciLower = deltas[lowerIdx];
  const ciUpper = deltas[upperIdx];
  let verdict;
  if (ciLower > 0) verdict = "ADVANCE";
  else if (ciUpper < 0) verdict = "REVERT";
  else if (delta >= 0) verdict = "KEEP";
  else verdict = "INCONCLUSIVE";
  return {
    baselineMean,
    candidateMean,
    delta,
    ciLower,
    ciUpper,
    iterations,
    alpha,
    verdict
  };
}
/**
 * Arithmetic mean that treats an empty list as 0.
 *
 * @param {number[]} xs - Values to average.
 * @returns {number} The mean, or 0 for an empty list.
 */
function mean2(xs) {
  const count = xs.length;
  if (count === 0) {
    return 0;
  }
  let total = 0;
  for (let i = 0; i < count; i++) {
    total += xs[i];
  }
  return total / count;
}
/**
 * Draws xs.length values from xs with replacement.
 *
 * @param {Array} xs - Source values.
 * @param {function(): number} rng - Returns a number in [0, 1); called once
 *   per drawn value.
 * @returns {Array} A new array of the same length as xs.
 */
function resample(xs, rng) {
  const size = xs.length;
  const drawn = [];
  for (let i = 0; i < size; i++) {
    const pick = Math.floor(rng() * size);
    drawn.push(xs[pick]);
  }
  return drawn;
}
/**
 * Creates a deterministic pseudo-random generator from a seed.
 *
 * @param {number} seed - Seed; converted to an unsigned 32-bit integer.
 * @returns {function(): number} Generator returning values in [0, 1). The
 *   same seed always yields the same sequence.
 */
function mulberry32(seed) {
  let state = seed >>> 0;
  return function next() {
    state += 0x6d2b79f5;
    let mixed = state;
    mixed = Math.imul(mixed ^ (mixed >>> 15), mixed | 1);
    mixed ^= mixed + Math.imul(mixed ^ (mixed >>> 7), mixed | 61);
    const unsigned = (mixed ^ (mixed >>> 14)) >>> 0;
    // Divide by 2^32 to land in [0, 1).
    return unsigned / 0x100000000;
  };
}
/**
 * Derives a 32-bit seed from two score arrays by hashing the bytes of every
 * value (as a 64-bit float), all of a followed by all of b.
 *
 * @param {number[]} a - First array.
 * @param {number[]} b - Second array.
 * @returns {number} Unsigned 32-bit integer.
 */
function hashSeed(a, b) {
  // One reusable 8-byte scratch buffer, viewed both as a float and as bytes.
  const scratch = new Float64Array(1);
  const bytes = new Uint8Array(scratch.buffer);
  let hash = 0x811c9dc5;
  const absorb = (values) => {
    for (const value of values) {
      scratch[0] = value;
      for (let i = 0; i < bytes.length; i++) {
        hash ^= bytes[i];
        hash = Math.imul(hash, 0x01000193);
      }
    }
  };
  absorb(a);
  absorb(b);
  return hash >>> 0;
}
/**
 * Scores baseline and candidate outputs with a judge, then compares the two
 * score sets with bootstrapCi.
 *
 * @param {object} args - baselineOutputs, candidateOutputs and judge, plus
 *   optional judgeConcurrency (4 by default), alpha, iterations and seed.
 * @returns {Promise<object>} The bootstrapCi result plus baselineSamples and
 *   candidateSamples.
 */
async function judgeReplayGate(args) {
  const concurrency = args.judgeConcurrency ?? 4;
  // Baseline is scored to completion before candidate scoring starts.
  const baselineScores = await scoreAll(args.baselineOutputs, args.judge, concurrency);
  const candidateScores = await scoreAll(args.candidateOutputs, args.judge, concurrency);

  // Forward only the options the caller set so bootstrapCi defaults apply.
  const ciOptions = {};
  for (const key of ["alpha", "iterations", "seed"]) {
    if (args[key] !== undefined) {
      ciOptions[key] = args[key];
    }
  }
  const ci = bootstrapCi(baselineScores, candidateScores, ciOptions);
  return {
    ...ci,
    baselineSamples: baselineScores.length,
    candidateSamples: candidateScores.length,
  };
}
/**
 * Scores every output with the judge, running a bounded number of judge
 * calls at a time.
 *
 * @param {Array} outputs - Items to score.
 * @param {function(*): (number|Promise<number>)} judge - Scoring function.
 *   Results that are not finite numbers are recorded as 0.
 * @param {number} concurrency - Maximum simultaneous judge calls. Values
 *   below 1 and values that are not numbers fall back to 1; fractions are
 *   rounded down; the count never exceeds outputs.length.
 * @returns {Promise<number[]>} Scores, index-aligned with outputs.
 */
async function scoreAll(outputs, judge, concurrency) {
  const results = new Array(outputs.length);
  // A NaN worker count would start no workers and return an array of holes,
  // so normalize it before sizing the pool.
  const requested = Math.floor(Number(concurrency));
  const workerCount = Math.min(
    outputs.length,
    Number.isNaN(requested) ? 1 : Math.max(1, requested),
  );
  let next = 0;
  async function worker() {
    // Each worker claims the next unclaimed index until none remain.
    while (next < outputs.length) {
      const i = next++;
      const v = await judge(outputs[i]);
      results[i] = Number.isFinite(v) ? v : 0;
    }
  }
  await Promise.all(Array.from({ length: workerCount }, () => worker()));
  return results;
}
705
-
706
- export {
707
- releaseTraceEvidenceFromMultiShotTrials,
708
- evaluateReleaseConfidence,
709
- assertReleaseConfidence,
710
- summaryTable,
711
- paretoChart,
712
- gainHistogram,
713
- renderReleaseReport,
714
- bootstrapCi,
715
- judgeReplayGate
716
- };
717
- //# sourceMappingURL=chunk-IKFVX537.js.map