@tangle-network/agent-eval 0.20.12 → 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +177 -0
- package/README.md +43 -1
- package/dist/{chunk-KWUAAIHR.js → chunk-4W4NCYM2.js} +182 -1
- package/dist/chunk-4W4NCYM2.js.map +1 -0
- package/dist/{chunk-PKCVBYTQ.js → chunk-5IIQKMD5.js} +38 -2
- package/dist/chunk-5IIQKMD5.js.map +1 -0
- package/dist/{chunk-HNJLMAJ2.js → chunk-6KQG5HAH.js} +2 -2
- package/dist/chunk-6M774GY6.js +53 -0
- package/dist/chunk-6M774GY6.js.map +1 -0
- package/dist/{chunk-MCMV7DUL.js → chunk-ARZ6BEV6.js} +2 -2
- package/dist/chunk-IOXMGMHQ.js +1226 -0
- package/dist/chunk-IOXMGMHQ.js.map +1 -0
- package/dist/{chunk-75MCTH7P.js → chunk-KAO3Q65R.js} +198 -3
- package/dist/chunk-KAO3Q65R.js.map +1 -0
- package/dist/chunk-QUKKGHTZ.js +121 -0
- package/dist/chunk-QUKKGHTZ.js.map +1 -0
- package/dist/chunk-SQQLHODJ.js +163 -0
- package/dist/chunk-SQQLHODJ.js.map +1 -0
- package/dist/{chunk-IKFVX537.js → chunk-UAND2LOT.js} +232 -211
- package/dist/chunk-UAND2LOT.js.map +1 -0
- package/dist/{chunk-HKYRWNHV.js → chunk-USHQBPMH.js} +283 -7
- package/dist/chunk-USHQBPMH.js.map +1 -0
- package/dist/cli.js +3 -2
- package/dist/cli.js.map +1 -1
- package/dist/{control-C8NKbF3w.d.ts → control-cxwMOAsy.d.ts} +3 -2
- package/dist/control.d.ts +4 -3
- package/dist/control.js +2 -2
- package/dist/emitter-B2XqDKFU.d.ts +121 -0
- package/dist/{feedback-trajectory-BGQ_ANCN.d.ts → feedback-trajectory-CB0A32o3.d.ts} +2 -1
- package/dist/index.d.ts +16 -302
- package/dist/index.js +70 -62
- package/dist/index.js.map +1 -1
- package/dist/integrity-K2oVlF57.d.ts +210 -0
- package/dist/openapi.json +1 -1
- package/dist/optimization-UVDNKaO6.d.ts +574 -0
- package/dist/optimization.d.ts +7 -144
- package/dist/optimization.js +9 -2
- package/dist/reporting-B82RSv9C.d.ts +593 -0
- package/dist/reporting.d.ts +5 -426
- package/dist/reporting.js +17 -6
- package/dist/{emitter-BYO2nSDA.d.ts → store-u47QaJ9G.d.ts} +1 -91
- package/dist/{multi-shot-optimization-Bvtz294B.d.ts → summary-report-D4p7RlDu.d.ts} +381 -1
- package/dist/traces.d.ts +179 -3
- package/dist/traces.js +35 -4
- package/dist/wire/index.js +3 -2
- package/docs/research-report-methodology.md +170 -0
- package/docs/wire-protocol.md +1 -1
- package/package.json +11 -13
- package/dist/chunk-75MCTH7P.js.map +0 -1
- package/dist/chunk-HKYRWNHV.js.map +0 -1
- package/dist/chunk-IKFVX537.js.map +0 -1
- package/dist/chunk-KWUAAIHR.js.map +0 -1
- package/dist/chunk-ODFINDLQ.js +0 -413
- package/dist/chunk-ODFINDLQ.js.map +0 -1
- package/dist/chunk-PKCVBYTQ.js.map +0 -1
- /package/dist/{chunk-HNJLMAJ2.js.map → chunk-6KQG5HAH.js.map} +0 -0
- /package/dist/{chunk-MCMV7DUL.js.map → chunk-ARZ6BEV6.js.map} +0 -0
|
@@ -1,10 +1,6 @@
|
|
|
1
1
|
import {
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
confidenceInterval,
|
|
5
|
-
pairedBootstrap,
|
|
6
|
-
wilcoxonSignedRank
|
|
7
|
-
} from "./chunk-ODFINDLQ.js";
|
|
2
|
+
summaryTable
|
|
3
|
+
} from "./chunk-IOXMGMHQ.js";
|
|
8
4
|
|
|
9
5
|
// src/release-confidence.ts
|
|
10
6
|
var DEFAULT_THRESHOLDS = {
|
|
@@ -289,219 +285,244 @@ function fmt(x) {
|
|
|
289
285
|
return x.toFixed(4);
|
|
290
286
|
}
|
|
291
287
|
|
|
292
|
-
// src/
|
|
293
|
-
function
|
|
294
|
-
const
|
|
295
|
-
const
|
|
296
|
-
const
|
|
297
|
-
const
|
|
298
|
-
const
|
|
299
|
-
const
|
|
300
|
-
for (const
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
}
|
|
309
|
-
const
|
|
310
|
-
const
|
|
311
|
-
const
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
const ci = confidenceInterval(bucket.scores, confidence);
|
|
315
|
-
let rawP = Number.NaN;
|
|
316
|
-
let d = Number.NaN;
|
|
317
|
-
if (comparator && compRuns && id !== comparator) {
|
|
318
|
-
const paired = pairScoresByKey(bucket.runs, compRuns.runs, scoreField);
|
|
319
|
-
if (paired.before.length >= 6) {
|
|
320
|
-
rawP = wilcoxonSignedRank(paired.before, paired.after).p;
|
|
321
|
-
}
|
|
322
|
-
d = cohensD(compRuns.scores, bucket.scores);
|
|
288
|
+
// src/meta-eval/rubric-predictive-validity.ts
|
|
289
|
+
async function rubricPredictiveValidity(input) {
|
|
290
|
+
const minSamples = input.minSamples ?? 8;
|
|
291
|
+
const reduction = input.reduction ?? "latest";
|
|
292
|
+
const resamples = input.bootstrapResamples ?? 500;
|
|
293
|
+
const rng = makeRng(input.seed);
|
|
294
|
+
const outcomes = await input.outcomes.list();
|
|
295
|
+
const outcomesByRun = /* @__PURE__ */ new Map();
|
|
296
|
+
for (const o of outcomes) {
|
|
297
|
+
const arr = outcomesByRun.get(o.runId) ?? [];
|
|
298
|
+
arr.push(o);
|
|
299
|
+
outcomesByRun.set(o.runId, arr);
|
|
300
|
+
}
|
|
301
|
+
const observedRubrics = /* @__PURE__ */ new Set();
|
|
302
|
+
for (const r of input.runs) {
|
|
303
|
+
for (const k of Object.keys(r.outcome.raw)) observedRubrics.add(k);
|
|
304
|
+
}
|
|
305
|
+
const rubrics = input.rubrics ?? [...observedRubrics];
|
|
306
|
+
const buckets = [];
|
|
307
|
+
for (const r of rubrics) {
|
|
308
|
+
for (const o of input.outcomeMetrics) {
|
|
309
|
+
buckets.push({ rubric: r, outcome: o, xs: [], ys: [] });
|
|
323
310
|
}
|
|
324
|
-
tentative.push({
|
|
325
|
-
candidateId: id,
|
|
326
|
-
n: bucket.scores.length,
|
|
327
|
-
mean: ci.mean,
|
|
328
|
-
ciLow: ci.lower,
|
|
329
|
-
ciHigh: ci.upper,
|
|
330
|
-
qValue: rawP,
|
|
331
|
-
cohensD: d,
|
|
332
|
-
rawP
|
|
333
|
-
});
|
|
334
311
|
}
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
idxs.push(i);
|
|
343
|
-
ps.push(r.rawP);
|
|
312
|
+
let joined = 0;
|
|
313
|
+
let skipped = 0;
|
|
314
|
+
for (const run of input.runs) {
|
|
315
|
+
const os = outcomesByRun.get(run.runId);
|
|
316
|
+
if (!os || os.length === 0) {
|
|
317
|
+
skipped++;
|
|
318
|
+
continue;
|
|
344
319
|
}
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
320
|
+
let joinedThisRun = false;
|
|
321
|
+
for (const r of rubrics) {
|
|
322
|
+
const x = run.outcome.raw[r];
|
|
323
|
+
if (typeof x !== "number" || !Number.isFinite(x)) continue;
|
|
324
|
+
for (const o of input.outcomeMetrics) {
|
|
325
|
+
const values = os.map((row) => row.metrics[o]).filter((v) => typeof v === "number" && Number.isFinite(v));
|
|
326
|
+
if (values.length === 0) continue;
|
|
327
|
+
const y = reduce(values, os, o, reduction);
|
|
328
|
+
if (y === null) continue;
|
|
329
|
+
const bucket = buckets.find((b) => b.rubric === r && b.outcome === o);
|
|
330
|
+
bucket.xs.push(x);
|
|
331
|
+
bucket.ys.push(y);
|
|
332
|
+
joinedThisRun = true;
|
|
349
333
|
}
|
|
350
334
|
}
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
const
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
const
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
if (typeof v !== "number" || !Number.isFinite(v)) continue;
|
|
369
|
-
const key = `${r.experimentId}::${r.seed}`;
|
|
370
|
-
const b = baseIdx.get(key);
|
|
371
|
-
if (b === void 0) continue;
|
|
372
|
-
before.push(b);
|
|
373
|
-
after.push(v);
|
|
374
|
-
}
|
|
375
|
-
return { before, after };
|
|
376
|
-
}
|
|
377
|
-
function renderSummaryTableMarkdown(rows, comparator, split) {
|
|
378
|
-
const lines = [];
|
|
379
|
-
const cmpLabel = comparator ? ` (vs ${comparator})` : "";
|
|
380
|
-
lines.push(`Summary Table \u2014 ${split} split${cmpLabel}`);
|
|
381
|
-
lines.push("");
|
|
382
|
-
lines.push("| Candidate | N | Mean | 95% CI | q (BH) | Cohen's d |");
|
|
383
|
-
lines.push("|---|---:|---:|---|---:|---:|");
|
|
384
|
-
for (const r of rows) {
|
|
385
|
-
const ci = `[${fmt2(r.ciLow)}, ${fmt2(r.ciHigh)}]`;
|
|
386
|
-
const q = Number.isFinite(r.qValue) ? r.qValue.toFixed(4) : "\u2014";
|
|
387
|
-
const d = Number.isFinite(r.cohensD) ? r.cohensD.toFixed(3) : "\u2014";
|
|
388
|
-
lines.push(`| ${r.candidateId} | ${r.n} | ${fmt2(r.mean)} | ${ci} | ${q} | ${d} |`);
|
|
389
|
-
}
|
|
390
|
-
return lines.join("\n");
|
|
391
|
-
}
|
|
392
|
-
function paretoChart(runs, opts = {}) {
|
|
393
|
-
const split = opts.split ?? "holdout";
|
|
394
|
-
const scoreField = split === "holdout" ? "holdoutScore" : "searchScore";
|
|
395
|
-
const buckets = /* @__PURE__ */ new Map();
|
|
396
|
-
for (const r of runs) {
|
|
397
|
-
if (r.splitTag !== split) continue;
|
|
398
|
-
const v = r.outcome[scoreField];
|
|
399
|
-
if (typeof v !== "number" || !Number.isFinite(v)) continue;
|
|
400
|
-
const bucket = buckets.get(r.candidateId) ?? { cost: [], quality: [] };
|
|
401
|
-
bucket.cost.push(r.costUsd);
|
|
402
|
-
bucket.quality.push(v);
|
|
403
|
-
buckets.set(r.candidateId, bucket);
|
|
404
|
-
}
|
|
405
|
-
const points = [];
|
|
406
|
-
for (const [candidateId, bucket] of buckets.entries()) {
|
|
407
|
-
points.push({
|
|
408
|
-
candidateId,
|
|
409
|
-
cost: avg(bucket.cost),
|
|
410
|
-
quality: avg(bucket.quality),
|
|
411
|
-
n: bucket.cost.length,
|
|
412
|
-
onFrontier: false,
|
|
413
|
-
gate: opts.gateDecisions?.[candidateId] ? gateLabel(opts.gateDecisions[candidateId]) : void 0
|
|
335
|
+
if (joinedThisRun) joined++;
|
|
336
|
+
}
|
|
337
|
+
const pairs = [];
|
|
338
|
+
for (const b of buckets) {
|
|
339
|
+
if (b.xs.length < minSamples) continue;
|
|
340
|
+
const pearson = pearsonR(b.xs, b.ys);
|
|
341
|
+
const spearman = pearsonR(rankWithTies(b.xs), rankWithTies(b.ys));
|
|
342
|
+
const ci = bootstrapCi(b.xs, b.ys, resamples, rng);
|
|
343
|
+
const verdict = Math.abs(spearman) >= 0.7 ? "load_bearing" : Math.abs(spearman) >= 0.4 ? "informative" : "decorative";
|
|
344
|
+
pairs.push({
|
|
345
|
+
rubric: b.rubric,
|
|
346
|
+
outcome: b.outcome,
|
|
347
|
+
n: b.xs.length,
|
|
348
|
+
pearson,
|
|
349
|
+
spearman,
|
|
350
|
+
ci95: ci,
|
|
351
|
+
verdict
|
|
414
352
|
});
|
|
415
353
|
}
|
|
416
|
-
|
|
417
|
-
|
|
354
|
+
const byRubric = /* @__PURE__ */ new Map();
|
|
355
|
+
for (const p of pairs) {
|
|
356
|
+
const arr = byRubric.get(p.rubric) ?? [];
|
|
357
|
+
arr.push(p);
|
|
358
|
+
byRubric.set(p.rubric, arr);
|
|
418
359
|
}
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
split,
|
|
422
|
-
axes: { x: "costUsd", y: "score" },
|
|
423
|
-
points
|
|
424
|
-
};
|
|
425
|
-
}
|
|
426
|
-
function dominates(a, b) {
|
|
427
|
-
return a.cost <= b.cost && a.quality >= b.quality && (a.cost < b.cost || a.quality > b.quality);
|
|
428
|
-
}
|
|
429
|
-
function gateLabel(d) {
|
|
430
|
-
if (d.promote) return "promote";
|
|
431
|
-
if (d.rejectionCode === "few_runs") return "reject_few_runs";
|
|
432
|
-
if (d.rejectionCode === "negative_delta") return "reject_negative_delta";
|
|
433
|
-
if (d.rejectionCode === "overfit_gap") return "reject_overfit_gap";
|
|
434
|
-
return null;
|
|
435
|
-
}
|
|
436
|
-
function gainHistogram(runs, candidateId, comparator, opts = {}) {
|
|
437
|
-
const split = opts.split ?? "holdout";
|
|
438
|
-
const scoreField = split === "holdout" ? "holdoutScore" : "searchScore";
|
|
439
|
-
const binCount = opts.bins ?? 11;
|
|
440
|
-
if (binCount < 1) throw new Error("gainHistogram: bins must be \u2265 1");
|
|
441
|
-
const candidate = runs.filter((r) => r.candidateId === candidateId && r.splitTag === split);
|
|
442
|
-
const baseline = runs.filter((r) => r.candidateId === comparator && r.splitTag === split);
|
|
443
|
-
const { before, after } = pairScoresByKey(candidate, baseline, scoreField);
|
|
444
|
-
const n = before.length;
|
|
445
|
-
if (n === 0) {
|
|
360
|
+
const ranked = [...byRubric.entries()].map(([rubric, ps]) => {
|
|
361
|
+
const best = ps.reduce((a, b) => Math.abs(b.spearman) > Math.abs(a.spearman) ? b : a);
|
|
446
362
|
return {
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
n:
|
|
452
|
-
|
|
453
|
-
median: 0,
|
|
454
|
-
ci: { low: 0, high: 0 }
|
|
363
|
+
rubric,
|
|
364
|
+
bestOutcome: best.outcome,
|
|
365
|
+
spearman: best.spearman,
|
|
366
|
+
pearson: best.pearson,
|
|
367
|
+
n: best.n,
|
|
368
|
+
verdict: best.verdict
|
|
455
369
|
};
|
|
370
|
+
}).sort((a, b) => Math.abs(b.spearman) - Math.abs(a.spearman));
|
|
371
|
+
const rubricsWithoutData = rubrics.filter((r) => !byRubric.has(r));
|
|
372
|
+
return { pairs, ranked, joinedSamples: joined, skippedRuns: skipped, rubricsWithoutData };
|
|
373
|
+
}
|
|
374
|
+
function reduce(values, outcomes, metric, kind) {
|
|
375
|
+
if (values.length === 0) return null;
|
|
376
|
+
if (kind === "mean") return values.reduce((s, v) => s + v, 0) / values.length;
|
|
377
|
+
if (kind === "max") return Math.max(...values);
|
|
378
|
+
const sorted = [...outcomes].filter((o) => typeof o.metrics[metric] === "number").sort((a, b) => b.capturedAt - a.capturedAt);
|
|
379
|
+
return sorted[0]?.metrics[metric] ?? null;
|
|
380
|
+
}
|
|
381
|
+
function pearsonR(a, b) {
|
|
382
|
+
if (a.length !== b.length || a.length < 2) return Number.NaN;
|
|
383
|
+
const ma = a.reduce((s, v) => s + v, 0) / a.length;
|
|
384
|
+
const mb = b.reduce((s, v) => s + v, 0) / b.length;
|
|
385
|
+
let num2 = 0, da = 0, db = 0;
|
|
386
|
+
for (let i = 0; i < a.length; i++) {
|
|
387
|
+
const xa = a[i] - ma;
|
|
388
|
+
const xb = b[i] - mb;
|
|
389
|
+
num2 += xa * xb;
|
|
390
|
+
da += xa * xa;
|
|
391
|
+
db += xb * xb;
|
|
392
|
+
}
|
|
393
|
+
if (da === 0 || db === 0) return da === 0 && db === 0 ? 1 : 0;
|
|
394
|
+
return num2 / Math.sqrt(da * db);
|
|
395
|
+
}
|
|
396
|
+
function rankWithTies(xs) {
|
|
397
|
+
const indexed = xs.map((v, i) => ({ v, i })).sort((a, b) => a.v - b.v);
|
|
398
|
+
const r = new Array(xs.length);
|
|
399
|
+
for (let i = 0; i < indexed.length; ) {
|
|
400
|
+
let j = i;
|
|
401
|
+
while (j + 1 < indexed.length && indexed[j + 1].v === indexed[i].v) j++;
|
|
402
|
+
const avg = (i + j + 2) / 2;
|
|
403
|
+
for (let k = i; k <= j; k++) r[indexed[k].i] = avg;
|
|
404
|
+
i = j + 1;
|
|
405
|
+
}
|
|
406
|
+
return r;
|
|
407
|
+
}
|
|
408
|
+
function bootstrapCi(xs, ys, iterations, rng) {
|
|
409
|
+
const n = xs.length;
|
|
410
|
+
if (n < 3) return { low: Number.NaN, high: Number.NaN };
|
|
411
|
+
const samples = [];
|
|
412
|
+
for (let b = 0; b < iterations; b++) {
|
|
413
|
+
const rx = new Array(n);
|
|
414
|
+
const ry = new Array(n);
|
|
415
|
+
for (let i = 0; i < n; i++) {
|
|
416
|
+
const idx = Math.floor(rng() * n);
|
|
417
|
+
rx[i] = xs[idx];
|
|
418
|
+
ry[i] = ys[idx];
|
|
419
|
+
}
|
|
420
|
+
const r = pearsonR(rx, ry);
|
|
421
|
+
if (Number.isFinite(r)) samples.push(r);
|
|
456
422
|
}
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
const median = medianOfSorted(sortedDeltas);
|
|
460
|
-
const min = sortedDeltas[0];
|
|
461
|
-
const max = sortedDeltas[sortedDeltas.length - 1];
|
|
462
|
-
const bound = Math.max(Math.abs(min), Math.abs(max), 1e-6);
|
|
463
|
-
const lo = -bound;
|
|
464
|
-
const hi = bound;
|
|
465
|
-
const width = (hi - lo) / binCount;
|
|
466
|
-
const bins = [];
|
|
467
|
-
for (let i = 0; i < binCount; i++) {
|
|
468
|
-
bins.push({ lo: lo + i * width, hi: lo + (i + 1) * width, count: 0 });
|
|
469
|
-
}
|
|
470
|
-
for (const d of deltas) {
|
|
471
|
-
let idx = Math.floor((d - lo) / width);
|
|
472
|
-
if (idx < 0) idx = 0;
|
|
473
|
-
if (idx >= binCount) idx = binCount - 1;
|
|
474
|
-
bins[idx].count += 1;
|
|
475
|
-
}
|
|
476
|
-
const ci = pairedBootstrap(before, after, {
|
|
477
|
-
confidence: opts.confidence ?? 0.95,
|
|
478
|
-
resamples: opts.resamples ?? 2e3,
|
|
479
|
-
statistic: "median",
|
|
480
|
-
seed: opts.seed
|
|
481
|
-
});
|
|
423
|
+
samples.sort((a, b) => a - b);
|
|
424
|
+
if (samples.length === 0) return { low: Number.NaN, high: Number.NaN };
|
|
482
425
|
return {
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
comparator,
|
|
486
|
-
split,
|
|
487
|
-
n,
|
|
488
|
-
bins,
|
|
489
|
-
median,
|
|
490
|
-
ci: { low: ci.low, high: ci.high }
|
|
426
|
+
low: samples[Math.floor(0.025 * samples.length)],
|
|
427
|
+
high: samples[Math.min(samples.length - 1, Math.floor(0.975 * samples.length))]
|
|
491
428
|
};
|
|
492
429
|
}
|
|
493
|
-
function
|
|
494
|
-
if (
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
|
|
430
|
+
function makeRng(seed) {
|
|
431
|
+
if (seed === void 0) return Math.random;
|
|
432
|
+
let s = seed >>> 0;
|
|
433
|
+
return () => {
|
|
434
|
+
s = s + 1831565813 >>> 0;
|
|
435
|
+
let t = s;
|
|
436
|
+
t = Math.imul(t ^ t >>> 15, t | 1);
|
|
437
|
+
t ^= t + Math.imul(t ^ t >>> 7, t | 61);
|
|
438
|
+
return ((t ^ t >>> 14) >>> 0) / 4294967296;
|
|
439
|
+
};
|
|
501
440
|
}
|
|
502
|
-
|
|
503
|
-
|
|
504
|
-
|
|
441
|
+
|
|
442
|
+
// src/sequential.ts
|
|
443
|
+
function pairedEvalueSequence(deltas, opts = {}) {
|
|
444
|
+
const c = opts.bound ?? 1;
|
|
445
|
+
const alpha = opts.alpha ?? 0.05;
|
|
446
|
+
const initialShrink = opts.initialBetShrinkage ?? 0.5;
|
|
447
|
+
const rope = opts.rope ?? null;
|
|
448
|
+
if (c <= 0) throw new Error("pairedEvalueSequence: bound must be > 0");
|
|
449
|
+
if (alpha <= 0 || alpha >= 1) throw new Error("pairedEvalueSequence: alpha must be in (0,1)");
|
|
450
|
+
if (rope && !(Number.isFinite(rope.low) && Number.isFinite(rope.high) && rope.low <= rope.high)) {
|
|
451
|
+
throw new Error("pairedEvalueSequence: rope must satisfy low \u2264 high");
|
|
452
|
+
}
|
|
453
|
+
const steps = [];
|
|
454
|
+
let clipped = false;
|
|
455
|
+
let evalue = 1;
|
|
456
|
+
let decisionFiredAt = null;
|
|
457
|
+
let sum = 0;
|
|
458
|
+
let sumSq = 0;
|
|
459
|
+
let count = 0;
|
|
460
|
+
for (let i = 0; i < deltas.length; i++) {
|
|
461
|
+
let d = deltas[i];
|
|
462
|
+
if (d < -c || d > c) {
|
|
463
|
+
d = Math.max(-c, Math.min(c, d));
|
|
464
|
+
clipped = true;
|
|
465
|
+
}
|
|
466
|
+
const muHat = count === 0 ? 0 : sum / count;
|
|
467
|
+
const varHat = count === 0 ? c * c : Math.max(1e-12, sumSq / count - muHat * muHat);
|
|
468
|
+
const t = i + 1;
|
|
469
|
+
const shrink = initialShrink * Math.min(1, count / 32);
|
|
470
|
+
let lambda = muHat / (varHat + c * c) * shrink;
|
|
471
|
+
const lambdaMax = 0.99 / c;
|
|
472
|
+
if (lambda > lambdaMax) lambda = lambdaMax;
|
|
473
|
+
if (lambda < -lambdaMax) lambda = -lambdaMax;
|
|
474
|
+
evalue = evalue * (1 + lambda * d);
|
|
475
|
+
if (!Number.isFinite(evalue) || evalue < 0) evalue = 0;
|
|
476
|
+
sum += d;
|
|
477
|
+
sumSq += d * d;
|
|
478
|
+
count += 1;
|
|
479
|
+
const pValue = Math.min(1, 1 / Math.max(evalue, 1e-300));
|
|
480
|
+
const cs = empiricalBernsteinCs(sum, sumSq, count, c, alpha);
|
|
481
|
+
let decision = "continue";
|
|
482
|
+
if (rope && cs.low >= rope.low && cs.high <= rope.high) decision = "equivalent";
|
|
483
|
+
else if (evalue >= 2 / alpha && muHat > 0) decision = "promote_now";
|
|
484
|
+
else if (evalue >= 2 / alpha && muHat < 0) decision = "reject_now";
|
|
485
|
+
else if (rope && cs.high < rope.low) decision = "reject_now";
|
|
486
|
+
if (decision !== "continue" && decisionFiredAt === null) decisionFiredAt = t;
|
|
487
|
+
steps.push({ t, delta: d, evalue, pValue, csLow: cs.low, csHigh: cs.high, decision });
|
|
488
|
+
}
|
|
489
|
+
const finalDecision = steps.length === 0 ? "continue" : steps[steps.length - 1].decision;
|
|
490
|
+
return { steps, finalDecision, decisionFiredAt, clipped };
|
|
491
|
+
}
|
|
492
|
+
function evaluateInterimReleaseConfidence(input) {
|
|
493
|
+
const candidates = input.deltaSeries.map((s) => {
|
|
494
|
+
const seq = pairedEvalueSequence(s.deltas, {
|
|
495
|
+
alpha: input.alpha,
|
|
496
|
+
bound: input.bound,
|
|
497
|
+
rope: input.rope
|
|
498
|
+
});
|
|
499
|
+
const last = seq.steps[seq.steps.length - 1];
|
|
500
|
+
return {
|
|
501
|
+
candidateId: s.candidateId,
|
|
502
|
+
decision: seq.finalDecision,
|
|
503
|
+
decisionFiredAt: seq.decisionFiredAt,
|
|
504
|
+
finalEvalue: last?.evalue ?? 1,
|
|
505
|
+
finalPValue: last?.pValue ?? 1,
|
|
506
|
+
pairs: seq.steps.length,
|
|
507
|
+
csLow: last?.csLow ?? Number.NEGATIVE_INFINITY,
|
|
508
|
+
csHigh: last?.csHigh ?? Number.POSITIVE_INFINITY
|
|
509
|
+
};
|
|
510
|
+
});
|
|
511
|
+
const promote = candidates.find((c) => c.decision === "promote_now");
|
|
512
|
+
if (promote) return { candidates, recommendation: { decision: "promote_now", candidateId: promote.candidateId } };
|
|
513
|
+
const live = candidates.find((c) => c.decision === "continue");
|
|
514
|
+
if (live) return { candidates, recommendation: { decision: "continue", candidateId: null } };
|
|
515
|
+
const equiv = candidates.find((c) => c.decision === "equivalent");
|
|
516
|
+
if (equiv) return { candidates, recommendation: { decision: "equivalent", candidateId: equiv.candidateId } };
|
|
517
|
+
return { candidates, recommendation: { decision: "reject_now", candidateId: null } };
|
|
518
|
+
}
|
|
519
|
+
function empiricalBernsteinCs(sum, sumSq, n, bound, alpha) {
|
|
520
|
+
if (n === 0) return { low: -bound, high: bound };
|
|
521
|
+
const mean3 = sum / n;
|
|
522
|
+
const variance = Math.max(0, sumSq / n - mean3 * mean3);
|
|
523
|
+
const psi = Math.log(2 / alpha) + 1.7 * Math.log(Math.log(Math.max(Math.E, n)) + 1);
|
|
524
|
+
const radius = Math.sqrt(2 * variance * psi / n) + 3 * bound * psi / n;
|
|
525
|
+
return { low: mean3 - radius, high: mean3 + radius };
|
|
505
526
|
}
|
|
506
527
|
|
|
507
528
|
// src/release-report.ts
|
|
@@ -593,7 +614,7 @@ function num(value) {
|
|
|
593
614
|
}
|
|
594
615
|
|
|
595
616
|
// src/promotion-gate.ts
|
|
596
|
-
function
|
|
617
|
+
function bootstrapCi2(baseline, candidate, options = {}) {
|
|
597
618
|
const alpha = options.alpha ?? 0.05;
|
|
598
619
|
const iterations = options.iterations ?? 1e3;
|
|
599
620
|
const minTotal = options.minTotalSamples ?? 6;
|
|
@@ -677,7 +698,7 @@ async function judgeReplayGate(args) {
|
|
|
677
698
|
const concurrency = args.judgeConcurrency ?? 4;
|
|
678
699
|
const baselineScores = await scoreAll(args.baselineOutputs, args.judge, concurrency);
|
|
679
700
|
const candidateScores = await scoreAll(args.candidateOutputs, args.judge, concurrency);
|
|
680
|
-
const ci =
|
|
701
|
+
const ci = bootstrapCi2(baselineScores, candidateScores, {
|
|
681
702
|
...args.alpha !== void 0 ? { alpha: args.alpha } : {},
|
|
682
703
|
...args.iterations !== void 0 ? { iterations: args.iterations } : {},
|
|
683
704
|
...args.seed !== void 0 ? { seed: args.seed } : {}
|
|
@@ -707,11 +728,11 @@ export {
|
|
|
707
728
|
releaseTraceEvidenceFromMultiShotTrials,
|
|
708
729
|
evaluateReleaseConfidence,
|
|
709
730
|
assertReleaseConfidence,
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
731
|
+
rubricPredictiveValidity,
|
|
732
|
+
pairedEvalueSequence,
|
|
733
|
+
evaluateInterimReleaseConfidence,
|
|
713
734
|
renderReleaseReport,
|
|
714
|
-
bootstrapCi,
|
|
735
|
+
bootstrapCi2 as bootstrapCi,
|
|
715
736
|
judgeReplayGate
|
|
716
737
|
};
|
|
717
|
-
//# sourceMappingURL=chunk-
|
|
738
|
+
//# sourceMappingURL=chunk-UAND2LOT.js.map
|