@tangle-network/agent-eval 0.20.12 → 0.22.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. package/CHANGELOG.md +177 -0
  2. package/README.md +43 -1
  3. package/dist/{chunk-KWUAAIHR.js → chunk-4W4NCYM2.js} +182 -1
  4. package/dist/chunk-4W4NCYM2.js.map +1 -0
  5. package/dist/{chunk-PKCVBYTQ.js → chunk-5IIQKMD5.js} +38 -2
  6. package/dist/chunk-5IIQKMD5.js.map +1 -0
  7. package/dist/{chunk-HNJLMAJ2.js → chunk-6KQG5HAH.js} +2 -2
  8. package/dist/chunk-6M774GY6.js +53 -0
  9. package/dist/chunk-6M774GY6.js.map +1 -0
  10. package/dist/{chunk-MCMV7DUL.js → chunk-ARZ6BEV6.js} +2 -2
  11. package/dist/chunk-IOXMGMHQ.js +1226 -0
  12. package/dist/chunk-IOXMGMHQ.js.map +1 -0
  13. package/dist/{chunk-75MCTH7P.js → chunk-KAO3Q65R.js} +198 -3
  14. package/dist/chunk-KAO3Q65R.js.map +1 -0
  15. package/dist/chunk-QUKKGHTZ.js +121 -0
  16. package/dist/chunk-QUKKGHTZ.js.map +1 -0
  17. package/dist/chunk-SQQLHODJ.js +163 -0
  18. package/dist/chunk-SQQLHODJ.js.map +1 -0
  19. package/dist/{chunk-IKFVX537.js → chunk-UAND2LOT.js} +232 -211
  20. package/dist/chunk-UAND2LOT.js.map +1 -0
  21. package/dist/{chunk-HKYRWNHV.js → chunk-USHQBPMH.js} +283 -7
  22. package/dist/chunk-USHQBPMH.js.map +1 -0
  23. package/dist/cli.js +3 -2
  24. package/dist/cli.js.map +1 -1
  25. package/dist/{control-C8NKbF3w.d.ts → control-cxwMOAsy.d.ts} +3 -2
  26. package/dist/control.d.ts +4 -3
  27. package/dist/control.js +2 -2
  28. package/dist/emitter-B2XqDKFU.d.ts +121 -0
  29. package/dist/{feedback-trajectory-BGQ_ANCN.d.ts → feedback-trajectory-CB0A32o3.d.ts} +2 -1
  30. package/dist/index.d.ts +16 -302
  31. package/dist/index.js +70 -62
  32. package/dist/index.js.map +1 -1
  33. package/dist/integrity-K2oVlF57.d.ts +210 -0
  34. package/dist/openapi.json +1 -1
  35. package/dist/optimization-UVDNKaO6.d.ts +574 -0
  36. package/dist/optimization.d.ts +7 -144
  37. package/dist/optimization.js +9 -2
  38. package/dist/reporting-B82RSv9C.d.ts +593 -0
  39. package/dist/reporting.d.ts +5 -426
  40. package/dist/reporting.js +17 -6
  41. package/dist/{emitter-BYO2nSDA.d.ts → store-u47QaJ9G.d.ts} +1 -91
  42. package/dist/{multi-shot-optimization-Bvtz294B.d.ts → summary-report-D4p7RlDu.d.ts} +381 -1
  43. package/dist/traces.d.ts +179 -3
  44. package/dist/traces.js +35 -4
  45. package/dist/wire/index.js +3 -2
  46. package/docs/research-report-methodology.md +170 -0
  47. package/docs/wire-protocol.md +1 -1
  48. package/package.json +11 -13
  49. package/dist/chunk-75MCTH7P.js.map +0 -1
  50. package/dist/chunk-HKYRWNHV.js.map +0 -1
  51. package/dist/chunk-IKFVX537.js.map +0 -1
  52. package/dist/chunk-KWUAAIHR.js.map +0 -1
  53. package/dist/chunk-ODFINDLQ.js +0 -413
  54. package/dist/chunk-ODFINDLQ.js.map +0 -1
  55. package/dist/chunk-PKCVBYTQ.js.map +0 -1
  56. /package/dist/{chunk-HNJLMAJ2.js.map → chunk-6KQG5HAH.js.map} +0 -0
  57. /package/dist/{chunk-MCMV7DUL.js.map → chunk-ARZ6BEV6.js.map} +0 -0
@@ -1,10 +1,6 @@
1
1
  import {
2
- benjaminiHochberg,
3
- cohensD,
4
- confidenceInterval,
5
- pairedBootstrap,
6
- wilcoxonSignedRank
7
- } from "./chunk-ODFINDLQ.js";
2
+ summaryTable
3
+ } from "./chunk-IOXMGMHQ.js";
8
4
 
9
5
  // src/release-confidence.ts
10
6
  var DEFAULT_THRESHOLDS = {
@@ -289,219 +285,244 @@ function fmt(x) {
289
285
  return x.toFixed(4);
290
286
  }
291
287
 
292
- // src/summary-report.ts
293
- function summaryTable(runs, opts = {}) {
294
- const split = opts.split ?? "holdout";
295
- const confidence = opts.confidence ?? 0.95;
296
- const fdr = opts.fdr ?? 0.05;
297
- const comparator = opts.comparator ?? null;
298
- const scoreField = split === "holdout" ? "holdoutScore" : "searchScore";
299
- const byCandidate = /* @__PURE__ */ new Map();
300
- for (const r of runs) {
301
- if (r.splitTag !== split) continue;
302
- const v = r.outcome[scoreField];
303
- if (typeof v !== "number" || !Number.isFinite(v)) continue;
304
- const bucket = byCandidate.get(r.candidateId) ?? { runs: [], scores: [] };
305
- bucket.runs.push(r);
306
- bucket.scores.push(v);
307
- byCandidate.set(r.candidateId, bucket);
308
- }
309
- const candidateIds = [...byCandidate.keys()].sort();
310
- const compRuns = comparator ? byCandidate.get(comparator) : void 0;
311
- const tentative = [];
312
- for (const id of candidateIds) {
313
- const bucket = byCandidate.get(id);
314
- const ci = confidenceInterval(bucket.scores, confidence);
315
- let rawP = Number.NaN;
316
- let d = Number.NaN;
317
- if (comparator && compRuns && id !== comparator) {
318
- const paired = pairScoresByKey(bucket.runs, compRuns.runs, scoreField);
319
- if (paired.before.length >= 6) {
320
- rawP = wilcoxonSignedRank(paired.before, paired.after).p;
321
- }
322
- d = cohensD(compRuns.scores, bucket.scores);
288
+ // src/meta-eval/rubric-predictive-validity.ts
289
+ async function rubricPredictiveValidity(input) {
290
+ const minSamples = input.minSamples ?? 8;
291
+ const reduction = input.reduction ?? "latest";
292
+ const resamples = input.bootstrapResamples ?? 500;
293
+ const rng = makeRng(input.seed);
294
+ const outcomes = await input.outcomes.list();
295
+ const outcomesByRun = /* @__PURE__ */ new Map();
296
+ for (const o of outcomes) {
297
+ const arr = outcomesByRun.get(o.runId) ?? [];
298
+ arr.push(o);
299
+ outcomesByRun.set(o.runId, arr);
300
+ }
301
+ const observedRubrics = /* @__PURE__ */ new Set();
302
+ for (const r of input.runs) {
303
+ for (const k of Object.keys(r.outcome.raw)) observedRubrics.add(k);
304
+ }
305
+ const rubrics = input.rubrics ?? [...observedRubrics];
306
+ const buckets = [];
307
+ for (const r of rubrics) {
308
+ for (const o of input.outcomeMetrics) {
309
+ buckets.push({ rubric: r, outcome: o, xs: [], ys: [] });
323
310
  }
324
- tentative.push({
325
- candidateId: id,
326
- n: bucket.scores.length,
327
- mean: ci.mean,
328
- ciLow: ci.lower,
329
- ciHigh: ci.upper,
330
- qValue: rawP,
331
- cohensD: d,
332
- rawP
333
- });
334
311
  }
335
- if (comparator) {
336
- const idxs = [];
337
- const ps = [];
338
- for (let i = 0; i < tentative.length; i++) {
339
- const r = tentative[i];
340
- if (r.candidateId === comparator) continue;
341
- if (!Number.isFinite(r.rawP)) continue;
342
- idxs.push(i);
343
- ps.push(r.rawP);
312
+ let joined = 0;
313
+ let skipped = 0;
314
+ for (const run of input.runs) {
315
+ const os = outcomesByRun.get(run.runId);
316
+ if (!os || os.length === 0) {
317
+ skipped++;
318
+ continue;
344
319
  }
345
- if (ps.length > 0) {
346
- const { qValues } = benjaminiHochberg(ps, fdr);
347
- for (let k = 0; k < idxs.length; k++) {
348
- tentative[idxs[k]].qValue = qValues[k];
320
+ let joinedThisRun = false;
321
+ for (const r of rubrics) {
322
+ const x = run.outcome.raw[r];
323
+ if (typeof x !== "number" || !Number.isFinite(x)) continue;
324
+ for (const o of input.outcomeMetrics) {
325
+ const values = os.map((row) => row.metrics[o]).filter((v) => typeof v === "number" && Number.isFinite(v));
326
+ if (values.length === 0) continue;
327
+ const y = reduce(values, os, o, reduction);
328
+ if (y === null) continue;
329
+ const bucket = buckets.find((b) => b.rubric === r && b.outcome === o);
330
+ bucket.xs.push(x);
331
+ bucket.ys.push(y);
332
+ joinedThisRun = true;
349
333
  }
350
334
  }
351
- }
352
- const rows = tentative.map(({ rawP: _rawP, ...rest }) => rest);
353
- const markdown = renderSummaryTableMarkdown(rows, comparator, split);
354
- return { rows, comparator, split, markdown };
355
- }
356
- function pairScoresByKey(candidate, baseline, scoreField) {
357
- const baseIdx = /* @__PURE__ */ new Map();
358
- for (const r of baseline) {
359
- const v = r.outcome[scoreField];
360
- if (typeof v === "number" && Number.isFinite(v)) {
361
- baseIdx.set(`${r.experimentId}::${r.seed}`, v);
362
- }
363
- }
364
- const before = [];
365
- const after = [];
366
- for (const r of candidate) {
367
- const v = r.outcome[scoreField];
368
- if (typeof v !== "number" || !Number.isFinite(v)) continue;
369
- const key = `${r.experimentId}::${r.seed}`;
370
- const b = baseIdx.get(key);
371
- if (b === void 0) continue;
372
- before.push(b);
373
- after.push(v);
374
- }
375
- return { before, after };
376
- }
377
- function renderSummaryTableMarkdown(rows, comparator, split) {
378
- const lines = [];
379
- const cmpLabel = comparator ? ` (vs ${comparator})` : "";
380
- lines.push(`Summary Table \u2014 ${split} split${cmpLabel}`);
381
- lines.push("");
382
- lines.push("| Candidate | N | Mean | 95% CI | q (BH) | Cohen's d |");
383
- lines.push("|---|---:|---:|---|---:|---:|");
384
- for (const r of rows) {
385
- const ci = `[${fmt2(r.ciLow)}, ${fmt2(r.ciHigh)}]`;
386
- const q = Number.isFinite(r.qValue) ? r.qValue.toFixed(4) : "\u2014";
387
- const d = Number.isFinite(r.cohensD) ? r.cohensD.toFixed(3) : "\u2014";
388
- lines.push(`| ${r.candidateId} | ${r.n} | ${fmt2(r.mean)} | ${ci} | ${q} | ${d} |`);
389
- }
390
- return lines.join("\n");
391
- }
392
- function paretoChart(runs, opts = {}) {
393
- const split = opts.split ?? "holdout";
394
- const scoreField = split === "holdout" ? "holdoutScore" : "searchScore";
395
- const buckets = /* @__PURE__ */ new Map();
396
- for (const r of runs) {
397
- if (r.splitTag !== split) continue;
398
- const v = r.outcome[scoreField];
399
- if (typeof v !== "number" || !Number.isFinite(v)) continue;
400
- const bucket = buckets.get(r.candidateId) ?? { cost: [], quality: [] };
401
- bucket.cost.push(r.costUsd);
402
- bucket.quality.push(v);
403
- buckets.set(r.candidateId, bucket);
404
- }
405
- const points = [];
406
- for (const [candidateId, bucket] of buckets.entries()) {
407
- points.push({
408
- candidateId,
409
- cost: avg(bucket.cost),
410
- quality: avg(bucket.quality),
411
- n: bucket.cost.length,
412
- onFrontier: false,
413
- gate: opts.gateDecisions?.[candidateId] ? gateLabel(opts.gateDecisions[candidateId]) : void 0
335
+ if (joinedThisRun) joined++;
336
+ }
337
+ const pairs = [];
338
+ for (const b of buckets) {
339
+ if (b.xs.length < minSamples) continue;
340
+ const pearson = pearsonR(b.xs, b.ys);
341
+ const spearman = pearsonR(rankWithTies(b.xs), rankWithTies(b.ys));
342
+ const ci = bootstrapCi(b.xs, b.ys, resamples, rng);
343
+ const verdict = Math.abs(spearman) >= 0.7 ? "load_bearing" : Math.abs(spearman) >= 0.4 ? "informative" : "decorative";
344
+ pairs.push({
345
+ rubric: b.rubric,
346
+ outcome: b.outcome,
347
+ n: b.xs.length,
348
+ pearson,
349
+ spearman,
350
+ ci95: ci,
351
+ verdict
414
352
  });
415
353
  }
416
- for (const p of points) {
417
- p.onFrontier = !points.some((q) => q !== p && dominates(q, p));
354
+ const byRubric = /* @__PURE__ */ new Map();
355
+ for (const p of pairs) {
356
+ const arr = byRubric.get(p.rubric) ?? [];
357
+ arr.push(p);
358
+ byRubric.set(p.rubric, arr);
418
359
  }
419
- return {
420
- kind: "pareto-cost-quality",
421
- split,
422
- axes: { x: "costUsd", y: "score" },
423
- points
424
- };
425
- }
426
- function dominates(a, b) {
427
- return a.cost <= b.cost && a.quality >= b.quality && (a.cost < b.cost || a.quality > b.quality);
428
- }
429
- function gateLabel(d) {
430
- if (d.promote) return "promote";
431
- if (d.rejectionCode === "few_runs") return "reject_few_runs";
432
- if (d.rejectionCode === "negative_delta") return "reject_negative_delta";
433
- if (d.rejectionCode === "overfit_gap") return "reject_overfit_gap";
434
- return null;
435
- }
436
- function gainHistogram(runs, candidateId, comparator, opts = {}) {
437
- const split = opts.split ?? "holdout";
438
- const scoreField = split === "holdout" ? "holdoutScore" : "searchScore";
439
- const binCount = opts.bins ?? 11;
440
- if (binCount < 1) throw new Error("gainHistogram: bins must be \u2265 1");
441
- const candidate = runs.filter((r) => r.candidateId === candidateId && r.splitTag === split);
442
- const baseline = runs.filter((r) => r.candidateId === comparator && r.splitTag === split);
443
- const { before, after } = pairScoresByKey(candidate, baseline, scoreField);
444
- const n = before.length;
445
- if (n === 0) {
360
+ const ranked = [...byRubric.entries()].map(([rubric, ps]) => {
361
+ const best = ps.reduce((a, b) => Math.abs(b.spearman) > Math.abs(a.spearman) ? b : a);
446
362
  return {
447
- kind: "gain-distribution",
448
- candidateId,
449
- comparator,
450
- split,
451
- n: 0,
452
- bins: [],
453
- median: 0,
454
- ci: { low: 0, high: 0 }
363
+ rubric,
364
+ bestOutcome: best.outcome,
365
+ spearman: best.spearman,
366
+ pearson: best.pearson,
367
+ n: best.n,
368
+ verdict: best.verdict
455
369
  };
370
+ }).sort((a, b) => Math.abs(b.spearman) - Math.abs(a.spearman));
371
+ const rubricsWithoutData = rubrics.filter((r) => !byRubric.has(r));
372
+ return { pairs, ranked, joinedSamples: joined, skippedRuns: skipped, rubricsWithoutData };
373
+ }
374
+ function reduce(values, outcomes, metric, kind) {
375
+ if (values.length === 0) return null;
376
+ if (kind === "mean") return values.reduce((s, v) => s + v, 0) / values.length;
377
+ if (kind === "max") return Math.max(...values);
378
+ const sorted = [...outcomes].filter((o) => typeof o.metrics[metric] === "number").sort((a, b) => b.capturedAt - a.capturedAt);
379
+ return sorted[0]?.metrics[metric] ?? null;
380
+ }
381
+ function pearsonR(a, b) {
382
+ if (a.length !== b.length || a.length < 2) return Number.NaN;
383
+ const ma = a.reduce((s, v) => s + v, 0) / a.length;
384
+ const mb = b.reduce((s, v) => s + v, 0) / b.length;
385
+ let num2 = 0, da = 0, db = 0;
386
+ for (let i = 0; i < a.length; i++) {
387
+ const xa = a[i] - ma;
388
+ const xb = b[i] - mb;
389
+ num2 += xa * xb;
390
+ da += xa * xa;
391
+ db += xb * xb;
392
+ }
393
+ if (da === 0 || db === 0) return da === 0 && db === 0 ? 1 : 0;
394
+ return num2 / Math.sqrt(da * db);
395
+ }
396
+ function rankWithTies(xs) {
397
+ const indexed = xs.map((v, i) => ({ v, i })).sort((a, b) => a.v - b.v);
398
+ const r = new Array(xs.length);
399
+ for (let i = 0; i < indexed.length; ) {
400
+ let j = i;
401
+ while (j + 1 < indexed.length && indexed[j + 1].v === indexed[i].v) j++;
402
+ const avg = (i + j + 2) / 2;
403
+ for (let k = i; k <= j; k++) r[indexed[k].i] = avg;
404
+ i = j + 1;
405
+ }
406
+ return r;
407
+ }
408
+ function bootstrapCi(xs, ys, iterations, rng) {
409
+ const n = xs.length;
410
+ if (n < 3) return { low: Number.NaN, high: Number.NaN };
411
+ const samples = [];
412
+ for (let b = 0; b < iterations; b++) {
413
+ const rx = new Array(n);
414
+ const ry = new Array(n);
415
+ for (let i = 0; i < n; i++) {
416
+ const idx = Math.floor(rng() * n);
417
+ rx[i] = xs[idx];
418
+ ry[i] = ys[idx];
419
+ }
420
+ const r = pearsonR(rx, ry);
421
+ if (Number.isFinite(r)) samples.push(r);
456
422
  }
457
- const deltas = before.map((b, i) => after[i] - b);
458
- const sortedDeltas = [...deltas].sort((a, b) => a - b);
459
- const median = medianOfSorted(sortedDeltas);
460
- const min = sortedDeltas[0];
461
- const max = sortedDeltas[sortedDeltas.length - 1];
462
- const bound = Math.max(Math.abs(min), Math.abs(max), 1e-6);
463
- const lo = -bound;
464
- const hi = bound;
465
- const width = (hi - lo) / binCount;
466
- const bins = [];
467
- for (let i = 0; i < binCount; i++) {
468
- bins.push({ lo: lo + i * width, hi: lo + (i + 1) * width, count: 0 });
469
- }
470
- for (const d of deltas) {
471
- let idx = Math.floor((d - lo) / width);
472
- if (idx < 0) idx = 0;
473
- if (idx >= binCount) idx = binCount - 1;
474
- bins[idx].count += 1;
475
- }
476
- const ci = pairedBootstrap(before, after, {
477
- confidence: opts.confidence ?? 0.95,
478
- resamples: opts.resamples ?? 2e3,
479
- statistic: "median",
480
- seed: opts.seed
481
- });
423
+ samples.sort((a, b) => a - b);
424
+ if (samples.length === 0) return { low: Number.NaN, high: Number.NaN };
482
425
  return {
483
- kind: "gain-distribution",
484
- candidateId,
485
- comparator,
486
- split,
487
- n,
488
- bins,
489
- median,
490
- ci: { low: ci.low, high: ci.high }
426
+ low: samples[Math.floor(0.025 * samples.length)],
427
+ high: samples[Math.min(samples.length - 1, Math.floor(0.975 * samples.length))]
491
428
  };
492
429
  }
493
- function avg(xs) {
494
- if (xs.length === 0) return Number.NaN;
495
- return xs.reduce((s, x) => s + x, 0) / xs.length;
496
- }
497
- function medianOfSorted(sorted) {
498
- if (sorted.length === 0) return 0;
499
- const mid = Math.floor(sorted.length / 2);
500
- return sorted.length % 2 === 0 ? (sorted[mid - 1] + sorted[mid]) / 2 : sorted[mid];
430
+ function makeRng(seed) {
431
+ if (seed === void 0) return Math.random;
432
+ let s = seed >>> 0;
433
+ return () => {
434
+ s = s + 1831565813 >>> 0;
435
+ let t = s;
436
+ t = Math.imul(t ^ t >>> 15, t | 1);
437
+ t ^= t + Math.imul(t ^ t >>> 7, t | 61);
438
+ return ((t ^ t >>> 14) >>> 0) / 4294967296;
439
+ };
501
440
  }
502
- function fmt2(x) {
503
- if (!Number.isFinite(x)) return String(x);
504
- return x.toFixed(4);
441
+
442
+ // src/sequential.ts
443
+ function pairedEvalueSequence(deltas, opts = {}) {
444
+ const c = opts.bound ?? 1;
445
+ const alpha = opts.alpha ?? 0.05;
446
+ const initialShrink = opts.initialBetShrinkage ?? 0.5;
447
+ const rope = opts.rope ?? null;
448
+ if (c <= 0) throw new Error("pairedEvalueSequence: bound must be > 0");
449
+ if (alpha <= 0 || alpha >= 1) throw new Error("pairedEvalueSequence: alpha must be in (0,1)");
450
+ if (rope && !(Number.isFinite(rope.low) && Number.isFinite(rope.high) && rope.low <= rope.high)) {
451
+ throw new Error("pairedEvalueSequence: rope must satisfy low \u2264 high");
452
+ }
453
+ const steps = [];
454
+ let clipped = false;
455
+ let evalue = 1;
456
+ let decisionFiredAt = null;
457
+ let sum = 0;
458
+ let sumSq = 0;
459
+ let count = 0;
460
+ for (let i = 0; i < deltas.length; i++) {
461
+ let d = deltas[i];
462
+ if (d < -c || d > c) {
463
+ d = Math.max(-c, Math.min(c, d));
464
+ clipped = true;
465
+ }
466
+ const muHat = count === 0 ? 0 : sum / count;
467
+ const varHat = count === 0 ? c * c : Math.max(1e-12, sumSq / count - muHat * muHat);
468
+ const t = i + 1;
469
+ const shrink = initialShrink * Math.min(1, count / 32);
470
+ let lambda = muHat / (varHat + c * c) * shrink;
471
+ const lambdaMax = 0.99 / c;
472
+ if (lambda > lambdaMax) lambda = lambdaMax;
473
+ if (lambda < -lambdaMax) lambda = -lambdaMax;
474
+ evalue = evalue * (1 + lambda * d);
475
+ if (!Number.isFinite(evalue) || evalue < 0) evalue = 0;
476
+ sum += d;
477
+ sumSq += d * d;
478
+ count += 1;
479
+ const pValue = Math.min(1, 1 / Math.max(evalue, 1e-300));
480
+ const cs = empiricalBernsteinCs(sum, sumSq, count, c, alpha);
481
+ let decision = "continue";
482
+ if (rope && cs.low >= rope.low && cs.high <= rope.high) decision = "equivalent";
483
+ else if (evalue >= 2 / alpha && muHat > 0) decision = "promote_now";
484
+ else if (evalue >= 2 / alpha && muHat < 0) decision = "reject_now";
485
+ else if (rope && cs.high < rope.low) decision = "reject_now";
486
+ if (decision !== "continue" && decisionFiredAt === null) decisionFiredAt = t;
487
+ steps.push({ t, delta: d, evalue, pValue, csLow: cs.low, csHigh: cs.high, decision });
488
+ }
489
+ const finalDecision = steps.length === 0 ? "continue" : steps[steps.length - 1].decision;
490
+ return { steps, finalDecision, decisionFiredAt, clipped };
491
+ }
492
+ function evaluateInterimReleaseConfidence(input) {
493
+ const candidates = input.deltaSeries.map((s) => {
494
+ const seq = pairedEvalueSequence(s.deltas, {
495
+ alpha: input.alpha,
496
+ bound: input.bound,
497
+ rope: input.rope
498
+ });
499
+ const last = seq.steps[seq.steps.length - 1];
500
+ return {
501
+ candidateId: s.candidateId,
502
+ decision: seq.finalDecision,
503
+ decisionFiredAt: seq.decisionFiredAt,
504
+ finalEvalue: last?.evalue ?? 1,
505
+ finalPValue: last?.pValue ?? 1,
506
+ pairs: seq.steps.length,
507
+ csLow: last?.csLow ?? Number.NEGATIVE_INFINITY,
508
+ csHigh: last?.csHigh ?? Number.POSITIVE_INFINITY
509
+ };
510
+ });
511
+ const promote = candidates.find((c) => c.decision === "promote_now");
512
+ if (promote) return { candidates, recommendation: { decision: "promote_now", candidateId: promote.candidateId } };
513
+ const live = candidates.find((c) => c.decision === "continue");
514
+ if (live) return { candidates, recommendation: { decision: "continue", candidateId: null } };
515
+ const equiv = candidates.find((c) => c.decision === "equivalent");
516
+ if (equiv) return { candidates, recommendation: { decision: "equivalent", candidateId: equiv.candidateId } };
517
+ return { candidates, recommendation: { decision: "reject_now", candidateId: null } };
518
+ }
519
+ function empiricalBernsteinCs(sum, sumSq, n, bound, alpha) {
520
+ if (n === 0) return { low: -bound, high: bound };
521
+ const mean3 = sum / n;
522
+ const variance = Math.max(0, sumSq / n - mean3 * mean3);
523
+ const psi = Math.log(2 / alpha) + 1.7 * Math.log(Math.log(Math.max(Math.E, n)) + 1);
524
+ const radius = Math.sqrt(2 * variance * psi / n) + 3 * bound * psi / n;
525
+ return { low: mean3 - radius, high: mean3 + radius };
505
526
  }
506
527
 
507
528
  // src/release-report.ts
@@ -593,7 +614,7 @@ function num(value) {
593
614
  }
594
615
 
595
616
  // src/promotion-gate.ts
596
- function bootstrapCi(baseline, candidate, options = {}) {
617
+ function bootstrapCi2(baseline, candidate, options = {}) {
597
618
  const alpha = options.alpha ?? 0.05;
598
619
  const iterations = options.iterations ?? 1e3;
599
620
  const minTotal = options.minTotalSamples ?? 6;
@@ -677,7 +698,7 @@ async function judgeReplayGate(args) {
677
698
  const concurrency = args.judgeConcurrency ?? 4;
678
699
  const baselineScores = await scoreAll(args.baselineOutputs, args.judge, concurrency);
679
700
  const candidateScores = await scoreAll(args.candidateOutputs, args.judge, concurrency);
680
- const ci = bootstrapCi(baselineScores, candidateScores, {
701
+ const ci = bootstrapCi2(baselineScores, candidateScores, {
681
702
  ...args.alpha !== void 0 ? { alpha: args.alpha } : {},
682
703
  ...args.iterations !== void 0 ? { iterations: args.iterations } : {},
683
704
  ...args.seed !== void 0 ? { seed: args.seed } : {}
@@ -707,11 +728,11 @@ export {
707
728
  releaseTraceEvidenceFromMultiShotTrials,
708
729
  evaluateReleaseConfidence,
709
730
  assertReleaseConfidence,
710
- summaryTable,
711
- paretoChart,
712
- gainHistogram,
731
+ rubricPredictiveValidity,
732
+ pairedEvalueSequence,
733
+ evaluateInterimReleaseConfidence,
713
734
  renderReleaseReport,
714
- bootstrapCi,
735
+ bootstrapCi2 as bootstrapCi,
715
736
  judgeReplayGate
716
737
  };
717
- //# sourceMappingURL=chunk-IKFVX537.js.map
738
+ //# sourceMappingURL=chunk-UAND2LOT.js.map