@tangle-network/agent-eval 0.19.0 → 0.19.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -417,7 +417,7 @@ function confidenceInterval(scores2, confidence = 0.95) {
417
417
  if (scores2.length === 0) return { mean: 0, lower: 0, upper: 0 };
418
418
  if (scores2.length === 1) return { mean: scores2[0], lower: scores2[0], upper: scores2[0] };
419
419
  const n = scores2.length;
420
- const mean9 = scores2.reduce((a, b) => a + b, 0) / n;
420
+ const mean10 = scores2.reduce((a, b) => a + b, 0) / n;
421
421
  const B = 1e3;
422
422
  const bootstrapMeans = [];
423
423
  for (let i = 0; i < B; i++) {
@@ -432,7 +432,7 @@ function confidenceInterval(scores2, confidence = 0.95) {
432
432
  const lowerIdx = Math.floor(alpha / 2 * B);
433
433
  const upperIdx = Math.floor((1 - alpha / 2) * B) - 1;
434
434
  return {
435
- mean: mean9,
435
+ mean: mean10,
436
436
  lower: bootstrapMeans[lowerIdx],
437
437
  upper: bootstrapMeans[Math.min(upperIdx, B - 1)]
438
438
  };
@@ -520,11 +520,11 @@ function pairedTTest(before, after) {
520
520
  const n = before.length;
521
521
  if (n < 2) return { t: 0, df: 0, p: 1 };
522
522
  const diffs = before.map((b, i) => after[i] - b);
523
- const mean9 = diffs.reduce((a, b) => a + b, 0) / n;
524
- const variance2 = diffs.reduce((acc, d) => acc + (d - mean9) ** 2, 0) / (n - 1);
523
+ const mean10 = diffs.reduce((a, b) => a + b, 0) / n;
524
+ const variance2 = diffs.reduce((acc, d) => acc + (d - mean10) ** 2, 0) / (n - 1);
525
525
  const se = Math.sqrt(variance2 / n);
526
- if (se === 0) return { t: mean9 === 0 ? 0 : Infinity, df: n - 1, p: mean9 === 0 ? 1 : 0 };
527
- const t = mean9 / se;
526
+ if (se === 0) return { t: mean10 === 0 ? 0 : Infinity, df: n - 1, p: mean10 === 0 ? 1 : 0 };
527
+ const t = mean10 / se;
528
528
  const df = n - 1;
529
529
  const p = 2 * (1 - studentTCdf(Math.abs(t), df));
530
530
  return { t, df, p };
@@ -548,9 +548,9 @@ function wilcoxonSignedRank(before, after) {
548
548
  }
549
549
  let wPlus = 0;
550
550
  for (let k = 0; k < n; k++) if (diffs[k] > 0) wPlus += ranks3[k];
551
- const mean9 = n * (n + 1) / 4;
551
+ const mean10 = n * (n + 1) / 4;
552
552
  const variance2 = n * (n + 1) * (2 * n + 1) / 24;
553
- const z = (wPlus - mean9) / Math.sqrt(variance2);
553
+ const z = (wPlus - mean10) / Math.sqrt(variance2);
554
554
  const p = 2 * (1 - normalCdf(Math.abs(z)));
555
555
  return { w: wPlus, p };
556
556
  }
@@ -6457,10 +6457,10 @@ function analyzeSeries(values, options = {}) {
6457
6457
  return { state: "insufficient-data", windowMean: 0, windowCv: 0, tailRun: 0, stable: false };
6458
6458
  }
6459
6459
  const tail = values.slice(-window);
6460
- const mean9 = tail.reduce((a, b) => a + b, 0) / tail.length;
6461
- const variance2 = tail.reduce((acc, v) => acc + (v - mean9) ** 2, 0) / tail.length;
6460
+ const mean10 = tail.reduce((a, b) => a + b, 0) / tail.length;
6461
+ const variance2 = tail.reduce((acc, v) => acc + (v - mean10) ** 2, 0) / tail.length;
6462
6462
  const stdDev = Math.sqrt(variance2);
6463
- const refMean = Math.abs(mean9) > 1e-9 ? Math.abs(mean9) : 1;
6463
+ const refMean = Math.abs(mean10) > 1e-9 ? Math.abs(mean10) : 1;
6464
6464
  const cv = stdDev / refMean;
6465
6465
  const stable = tail.length >= window && cv <= stableCv;
6466
6466
  let tailRun = 0;
@@ -6481,7 +6481,7 @@ function analyzeSeries(values, options = {}) {
6481
6481
  } else {
6482
6482
  state = "noisy";
6483
6483
  }
6484
- return { state, windowMean: mean9, windowCv: cv, tailRun, stable };
6484
+ return { state, windowMean: mean10, windowCv: cv, tailRun, stable };
6485
6485
  }
6486
6486
 
6487
6487
  // src/state-continuity.ts
@@ -6938,9 +6938,9 @@ function benjaminiHochberg(pValues, fdr = 0.05) {
6938
6938
  for (let k = n - 1; k >= 0; k--) {
6939
6939
  const rank = k + 1;
6940
6940
  const raw = indexed[k].p * n / rank;
6941
- const bounded = Math.min(minRight, raw);
6942
- minRight = bounded;
6943
- q[indexed[k].i] = Math.min(1, bounded);
6941
+ const bounded2 = Math.min(minRight, raw);
6942
+ minRight = bounded2;
6943
+ q[indexed[k].i] = Math.min(1, bounded2);
6944
6944
  }
6945
6945
  const significant = q.map((v) => v < fdr);
6946
6946
  return { qValues: q, significant };
@@ -7470,12 +7470,12 @@ async function paraphraseRobustness(prompt, mutators, scoreFn, options = {}) {
7470
7470
  variantScores.push({ mutator: id, score, mutated });
7471
7471
  all.push(score);
7472
7472
  }
7473
- const mean9 = all.reduce((a, b) => a + b, 0) / all.length;
7474
- const variance2 = all.reduce((a, v) => a + (v - mean9) ** 2, 0) / all.length;
7473
+ const mean10 = all.reduce((a, b) => a + b, 0) / all.length;
7474
+ const variance2 = all.reduce((a, v) => a + (v - mean10) ** 2, 0) / all.length;
7475
7475
  const stdDev = Math.sqrt(variance2);
7476
- const ref = Math.abs(mean9) > 1e-9 ? Math.abs(mean9) : 1;
7476
+ const ref = Math.abs(mean10) > 1e-9 ? Math.abs(mean10) : 1;
7477
7477
  const robustness = Math.max(0, 1 - stdDev / ref);
7478
- return { originalScore, variantScores, meanScore: mean9, stdDev, robustness };
7478
+ return { originalScore, variantScores, meanScore: mean10, stdDev, robustness };
7479
7479
  }
7480
7480
  var lowercaseMutator = (p) => p.toLowerCase();
7481
7481
  var sentenceReorderMutator = (p, seed) => {
@@ -8396,8 +8396,8 @@ async function prmBestOfN(store, grader, runIds) {
8396
8396
  if (runIds.length === 0) throw new Error("prmBestOfN: at least 1 candidate required");
8397
8397
  const graded = await Promise.all(runIds.map((id) => grader.grade(store, id)));
8398
8398
  const ranked = [...graded].sort((a, b) => b.aggregateScore - a.aggregateScore);
8399
- const mean9 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
8400
- const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean9) ** 2, 0) / graded.length;
8399
+ const mean10 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
8400
+ const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean10) ** 2, 0) / graded.length;
8401
8401
  return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
8402
8402
  }
8403
8403
  async function prmEnsembleBestOfN(store, graders, runIds) {
@@ -8419,8 +8419,8 @@ async function prmEnsembleBestOfN(store, graders, runIds) {
8419
8419
  const ranked = [...byRun.values()].sort(
8420
8420
  (a, b) => (bordaScores.get(b.runId) ?? 0) - (bordaScores.get(a.runId) ?? 0)
8421
8421
  );
8422
- const mean9 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
8423
- const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean9) ** 2, 0) / ranked.length;
8422
+ const mean10 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
8423
+ const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean10) ** 2, 0) / ranked.length;
8424
8424
  return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
8425
8425
  }
8426
8426
 
@@ -8950,8 +8950,8 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
8950
8950
  const sRuns = runs.filter((r) => r.scenarioId === s.id);
8951
8951
  const scores2 = sRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
8952
8952
  if (scores2.length < 3) continue;
8953
- const mean9 = scores2.reduce((a, b) => a + b, 0) / scores2.length;
8954
- const variance2 = scores2.reduce((a, b) => a + (b - mean9) ** 2, 0) / scores2.length;
8953
+ const mean10 = scores2.reduce((a, b) => a + b, 0) / scores2.length;
8954
+ const variance2 = scores2.reduce((a, b) => a + (b - mean10) ** 2, 0) / scores2.length;
8955
8955
  if (variance2 > varianceThreshold) {
8956
8956
  targets.push({
8957
8957
  reason: "high-variance",
@@ -12813,6 +12813,289 @@ function traceExcerpt(trace) {
12813
12813
  return void 0;
12814
12814
  }
12815
12815
 
12816
+ // src/release-confidence.ts
12817
+ var DEFAULT_THRESHOLDS = {
12818
+ requireCorpus: true,
12819
+ minScenarioCount: 1,
12820
+ minSearchRuns: 1,
12821
+ minHoldoutRuns: 1,
12822
+ requireHoldout: true,
12823
+ minPassRate: 0.8,
12824
+ minMeanScore: 0.7,
12825
+ maxOverfitGap: 0.15,
12826
+ maxMeanCostUsd: Number.POSITIVE_INFINITY,
12827
+ maxP95WallMs: Number.POSITIVE_INFINITY,
12828
+ requireAsiForFailures: true,
12829
+ failureScoreThreshold: 0.5
12830
+ };
12831
+ function releaseTraceEvidenceFromMultiShotTrials(trials) {
12832
+ return trials.map((trial) => ({
12833
+ scenarioId: trial.scenarioId,
12834
+ candidateId: trial.variantId,
12835
+ split: trial.split === "holdout" ? "holdout" : trial.split === "dev" ? "dev" : "search",
12836
+ score: trial.score,
12837
+ ok: trial.ok,
12838
+ turnCount: Array.isArray(trial.trace?.turns) ? trial.trace.turns.length : void 0,
12839
+ costUsd: trial.cost,
12840
+ durationMs: trial.durationMs,
12841
+ failureMode: trial.error ? "runtime_error" : void 0,
12842
+ asi: trial.asi,
12843
+ metadata: trial.metadata
12844
+ }));
12845
+ }
12846
+ function evaluateReleaseConfidence(input) {
12847
+ const thresholds = { ...DEFAULT_THRESHOLDS, ...input.thresholds };
12848
+ const candidateId = input.candidateId ?? null;
12849
+ const runs = filterCandidate(input.runs ?? [], candidateId, input.baselineId);
12850
+ const traces = filterTraceCandidate(input.traces ?? [], candidateId, input.baselineId);
12851
+ const scenarios = input.scenarios ?? [];
12852
+ const scenarioCount = input.dataset?.scenarioCount ?? scenarios.length;
12853
+ const splitCounts = input.dataset?.splitCounts ?? countScenarioSplits(scenarios);
12854
+ const searchScores = scoresFor(runs, "search");
12855
+ const holdoutScores = scoresFor(runs, "holdout");
12856
+ const allScores = [...searchScores, ...holdoutScores];
12857
+ const traceScores = traces.map((t) => t.score).filter(isFiniteNumber);
12858
+ const scoreUniverse = allScores.length > 0 ? allScores : traceScores;
12859
+ const searchRuns = runs.filter((r) => r.splitTag === "search").length;
12860
+ const holdoutRuns = runs.filter((r) => r.splitTag === "holdout").length;
12861
+ const searchMeanScore = mean8(searchScores);
12862
+ const holdoutMeanScore = mean8(holdoutScores);
12863
+ const metrics = {
12864
+ scenarioCount,
12865
+ searchRuns,
12866
+ holdoutRuns,
12867
+ passRate: passRate(runs, traces, thresholds.failureScoreThreshold),
12868
+ meanScore: mean8(scoreUniverse),
12869
+ searchMeanScore,
12870
+ holdoutMeanScore,
12871
+ overfitGap: safeDiff2(searchMeanScore, holdoutMeanScore),
12872
+ meanCostUsd: mean8([...runs.map((r) => r.costUsd), ...traces.map((t) => t.costUsd).filter(isFiniteNumber)]),
12873
+ p95WallMs: percentile([...runs.map((r) => r.wallMs), ...traces.map((t) => t.durationMs).filter(isFiniteNumber)], 0.95),
12874
+ failedRows: failedRows(runs, traces, thresholds.failureScoreThreshold).length,
12875
+ failuresWithAsi: failedRows(runs, traces, thresholds.failureScoreThreshold).filter((row) => row.hasAsi).length,
12876
+ singleShotTraces: traces.filter((t) => t.turnCount === 1).length,
12877
+ multiShotTraces: traces.filter((t) => (t.turnCount ?? 0) > 1).length,
12878
+ splitCounts,
12879
+ domainCounts: countDomains(scenarios),
12880
+ failureModeCounts: countFailureModes(runs, traces, thresholds.failureScoreThreshold),
12881
+ responsibleSurfaceCounts: countResponsibleSurfaces(traces)
12882
+ };
12883
+ const issues = [];
12884
+ checkCorpus(input, thresholds, metrics, issues);
12885
+ checkQuality(thresholds, metrics, issues);
12886
+ checkGeneralization(input.gateDecision ?? null, thresholds, metrics, issues);
12887
+ checkDiagnostics(thresholds, metrics, issues);
12888
+ checkEfficiency(thresholds, metrics, issues);
12889
+ const axes = buildAxes(metrics, thresholds, input.gateDecision ?? null, issues);
12890
+ const status = issues.some((i) => i.severity === "critical") ? "fail" : issues.length > 0 ? "warn" : "pass";
12891
+ return {
12892
+ target: input.target,
12893
+ candidateId,
12894
+ baselineId: input.baselineId ?? null,
12895
+ status,
12896
+ promote: status === "pass" && (input.gateDecision ? input.gateDecision.promote : true),
12897
+ axes,
12898
+ issues,
12899
+ metrics,
12900
+ dataset: input.dataset ?? null,
12901
+ gateDecision: input.gateDecision ?? null,
12902
+ summary: renderSummary(input.target, status, metrics, issues)
12903
+ };
12904
+ }
12905
+ function assertReleaseConfidence(input) {
12906
+ const scorecard = evaluateReleaseConfidence(input);
12907
+ if (scorecard.status === "fail") {
12908
+ throw new Error(scorecard.summary);
12909
+ }
12910
+ return scorecard;
12911
+ }
12912
+ function filterCandidate(runs, candidateId, baselineId) {
12913
+ if (candidateId) return runs.filter((r) => r.candidateId === candidateId);
12914
+ if (baselineId) return runs.filter((r) => r.candidateId !== baselineId);
12915
+ return [...runs];
12916
+ }
12917
+ function filterTraceCandidate(traces, candidateId, baselineId) {
12918
+ if (candidateId) return traces.filter((t) => t.candidateId === void 0 || t.candidateId === candidateId);
12919
+ if (baselineId) return traces.filter((t) => t.candidateId === void 0 || t.candidateId !== baselineId);
12920
+ return [...traces];
12921
+ }
12922
+ function checkCorpus(input, thresholds, metrics, issues) {
12923
+ if (thresholds.requireCorpus && !input.dataset && (input.scenarios?.length ?? 0) === 0) {
12924
+ issues.push({ axis: "corpus", severity: "critical", code: "missing_corpus", detail: "No Dataset manifest or scenarios supplied." });
12925
+ }
12926
+ if (metrics.scenarioCount < thresholds.minScenarioCount) {
12927
+ issues.push({ axis: "corpus", severity: "critical", code: "few_scenarios", detail: `${metrics.scenarioCount} scenario(s) < min ${thresholds.minScenarioCount}.` });
12928
+ }
12929
+ if (thresholds.requireHoldout && metrics.splitCounts.holdout === 0) {
12930
+ issues.push({ axis: "corpus", severity: "critical", code: "missing_holdout_split", detail: "Corpus has no holdout scenarios." });
12931
+ }
12932
+ }
12933
+ function checkQuality(thresholds, metrics, issues) {
12934
+ if (metrics.searchRuns < thresholds.minSearchRuns) {
12935
+ issues.push({ axis: "quality", severity: "critical", code: "few_search_runs", detail: `${metrics.searchRuns} search run(s) < min ${thresholds.minSearchRuns}.` });
12936
+ }
12937
+ if (metrics.passRate < thresholds.minPassRate) {
12938
+ issues.push({ axis: "quality", severity: "critical", code: "low_pass_rate", detail: `passRate ${fmt3(metrics.passRate)} < ${fmt3(thresholds.minPassRate)}.` });
12939
+ }
12940
+ if (metrics.meanScore < thresholds.minMeanScore) {
12941
+ issues.push({ axis: "quality", severity: "critical", code: "low_mean_score", detail: `meanScore ${fmt3(metrics.meanScore)} < ${fmt3(thresholds.minMeanScore)}.` });
12942
+ }
12943
+ }
12944
+ function checkGeneralization(gateDecision, thresholds, metrics, issues) {
12945
+ if (thresholds.requireHoldout && metrics.holdoutRuns < thresholds.minHoldoutRuns) {
12946
+ issues.push({ axis: "generalization", severity: "critical", code: "few_holdout_runs", detail: `${metrics.holdoutRuns} holdout run(s) < min ${thresholds.minHoldoutRuns}.` });
12947
+ }
12948
+ if (Number.isFinite(metrics.overfitGap) && metrics.overfitGap > thresholds.maxOverfitGap) {
12949
+ issues.push({ axis: "generalization", severity: "critical", code: "overfit_gap", detail: `search-holdout gap ${fmt3(metrics.overfitGap)} > ${fmt3(thresholds.maxOverfitGap)}.` });
12950
+ }
12951
+ if (gateDecision && !gateDecision.promote) {
12952
+ issues.push({ axis: "generalization", severity: "critical", code: `gate_${gateDecision.rejectionCode ?? "reject"}`, detail: gateDecision.reason });
12953
+ }
12954
+ }
12955
+ function checkDiagnostics(thresholds, metrics, issues) {
12956
+ if (!thresholds.requireAsiForFailures) return;
12957
+ if (metrics.failedRows > metrics.failuresWithAsi) {
12958
+ issues.push({
12959
+ axis: "diagnostics",
12960
+ severity: "critical",
12961
+ code: "missing_failure_asi",
12962
+ detail: `${metrics.failedRows - metrics.failuresWithAsi} failed row(s) have no actionable side information.`
12963
+ });
12964
+ }
12965
+ }
12966
+ function checkEfficiency(thresholds, metrics, issues) {
12967
+ if (metrics.meanCostUsd > thresholds.maxMeanCostUsd) {
12968
+ issues.push({ axis: "efficiency", severity: "critical", code: "cost_budget", detail: `meanCostUsd ${fmt3(metrics.meanCostUsd)} > ${fmt3(thresholds.maxMeanCostUsd)}.` });
12969
+ }
12970
+ if (metrics.p95WallMs > thresholds.maxP95WallMs) {
12971
+ issues.push({ axis: "efficiency", severity: "critical", code: "latency_budget", detail: `p95WallMs ${fmt3(metrics.p95WallMs)} > ${fmt3(thresholds.maxP95WallMs)}.` });
12972
+ }
12973
+ }
12974
+ function buildAxes(metrics, thresholds, gateDecision, issues) {
12975
+ return [
12976
+ axis("corpus", issues, bounded(metrics.scenarioCount / Math.max(1, thresholds.minScenarioCount)), `${metrics.scenarioCount} scenarios; holdout=${metrics.splitCounts.holdout}`),
12977
+ axis("quality", issues, Math.min(metrics.passRate, metrics.meanScore), `passRate=${fmt3(metrics.passRate)} meanScore=${fmt3(metrics.meanScore)}`),
12978
+ axis("generalization", issues, gateDecision && !gateDecision.promote ? 0 : gapScore(metrics.overfitGap, thresholds.maxOverfitGap), `holdoutRuns=${metrics.holdoutRuns} overfitGap=${fmt3(metrics.overfitGap)}`),
12979
+ axis("diagnostics", issues, metrics.failedRows === 0 ? 1 : metrics.failuresWithAsi / metrics.failedRows, `failuresWithAsi=${metrics.failuresWithAsi}/${metrics.failedRows}`),
12980
+ axis("efficiency", issues, efficiencyScore(metrics, thresholds), `meanCostUsd=${fmt3(metrics.meanCostUsd)} p95WallMs=${fmt3(metrics.p95WallMs)}`)
12981
+ ];
12982
+ }
12983
+ function axis(name, issues, score, detail) {
12984
+ const own = issues.filter((i) => i.axis === name);
12985
+ const status = own.some((i) => i.severity === "critical") ? "fail" : own.length > 0 ? "warn" : "pass";
12986
+ return { name, status, score: bounded(score), detail };
12987
+ }
12988
+ function countScenarioSplits(scenarios) {
12989
+ const counts = { train: 0, dev: 0, test: 0, holdout: 0 };
12990
+ for (const scenario of scenarios) counts[scenario.split ?? "train"]++;
12991
+ return counts;
12992
+ }
12993
+ function countDomains(scenarios) {
12994
+ const out = {};
12995
+ for (const scenario of scenarios) {
12996
+ const domain = scenario.tags?.domain ?? scenario.tags?.category ?? "uncategorized";
12997
+ out[domain] = (out[domain] ?? 0) + 1;
12998
+ }
12999
+ return out;
13000
+ }
13001
+ function countFailureModes(runs, traces, threshold) {
13002
+ const out = {};
13003
+ for (const run of runs) {
13004
+ const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
13005
+ if (run.failureMode || score !== void 0 && score < threshold) {
13006
+ const mode = run.failureMode ?? "low_score";
13007
+ out[mode] = (out[mode] ?? 0) + 1;
13008
+ }
13009
+ }
13010
+ for (const trace of traces) {
13011
+ if (trace.failureMode || trace.ok === false || trace.score !== void 0 && trace.score < threshold) {
13012
+ const mode = trace.failureMode ?? (trace.ok === false ? "not_ok" : "low_score");
13013
+ out[mode] = (out[mode] ?? 0) + 1;
13014
+ }
13015
+ }
13016
+ return out;
13017
+ }
13018
+ function countResponsibleSurfaces(traces) {
13019
+ const out = {};
13020
+ for (const trace of traces) {
13021
+ for (const asi of trace.asi ?? []) {
13022
+ const surface = asi.responsibleSurface ?? "unknown";
13023
+ out[surface] = (out[surface] ?? 0) + 1;
13024
+ }
13025
+ }
13026
+ return out;
13027
+ }
13028
+ function failedRows(runs, traces, threshold) {
13029
+ const out = [];
13030
+ for (const run of runs) {
13031
+ const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
13032
+ if (run.failureMode || score !== void 0 && score < threshold) {
13033
+ const asiMetric = run.outcome.raw.asi;
13034
+ out.push({ hasAsi: typeof asiMetric === "number" && asiMetric > 0 });
13035
+ }
13036
+ }
13037
+ for (const trace of traces) {
13038
+ if (trace.failureMode || trace.ok === false || trace.score !== void 0 && trace.score < threshold) {
13039
+ out.push({ hasAsi: (trace.asi?.length ?? 0) > 0 });
13040
+ }
13041
+ }
13042
+ return out;
13043
+ }
13044
+ function passRate(runs, traces, threshold) {
13045
+ const outcomes = [
13046
+ ...runs.map((run) => {
13047
+ const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
13048
+ return !run.failureMode && score !== void 0 && score >= threshold;
13049
+ }),
13050
+ ...traces.map((trace) => trace.ok !== false && (trace.score === void 0 || trace.score >= threshold))
13051
+ ];
13052
+ if (outcomes.length === 0) return 0;
13053
+ return outcomes.filter(Boolean).length / outcomes.length;
13054
+ }
13055
+ function scoresFor(runs, split) {
13056
+ return runs.filter((run) => run.splitTag === split).map((run) => split === "holdout" ? run.outcome.holdoutScore : run.outcome.searchScore).filter(isFiniteNumber);
13057
+ }
13058
+ function mean8(xs) {
13059
+ if (xs.length === 0) return Number.NaN;
13060
+ return xs.reduce((sum2, x) => sum2 + x, 0) / xs.length;
13061
+ }
13062
+ function percentile(xs, p) {
13063
+ if (xs.length === 0) return Number.NaN;
13064
+ const sorted = [...xs].sort((a, b) => a - b);
13065
+ return sorted[Math.min(sorted.length - 1, Math.max(0, Math.ceil(p * sorted.length) - 1))];
13066
+ }
13067
+ function isFiniteNumber(value) {
13068
+ return typeof value === "number" && Number.isFinite(value);
13069
+ }
13070
+ function safeDiff2(a, b) {
13071
+ if (!Number.isFinite(a) || !Number.isFinite(b)) return Number.NaN;
13072
+ return a - b;
13073
+ }
13074
+ function gapScore(gap, maxGap) {
13075
+ if (!Number.isFinite(gap)) return 0;
13076
+ if (maxGap <= 0) return gap <= 0 ? 1 : 0;
13077
+ return bounded(1 - Math.max(0, gap) / maxGap);
13078
+ }
13079
+ function efficiencyScore(metrics, thresholds) {
13080
+ const cost = Number.isFinite(thresholds.maxMeanCostUsd) && Number.isFinite(metrics.meanCostUsd) ? bounded(thresholds.maxMeanCostUsd / Math.max(metrics.meanCostUsd, 1e-12)) : 1;
13081
+ const latency = Number.isFinite(thresholds.maxP95WallMs) && Number.isFinite(metrics.p95WallMs) ? bounded(thresholds.maxP95WallMs / Math.max(metrics.p95WallMs, 1e-12)) : 1;
13082
+ return Math.min(cost, latency);
13083
+ }
13084
+ function bounded(x) {
13085
+ if (!Number.isFinite(x)) return 0;
13086
+ return Math.max(0, Math.min(1, x));
13087
+ }
13088
+ function renderSummary(target, status, metrics, issues) {
13089
+ const prefix = `release confidence ${status}: ${target}`;
13090
+ const metricText = `scenarios=${metrics.scenarioCount} searchRuns=${metrics.searchRuns} holdoutRuns=${metrics.holdoutRuns} passRate=${fmt3(metrics.passRate)} meanScore=${fmt3(metrics.meanScore)}`;
13091
+ if (issues.length === 0) return `${prefix}; ${metricText}`;
13092
+ return `${prefix}; ${metricText}; issues=${issues.map((i) => i.code).join(",")}`;
13093
+ }
13094
+ function fmt3(x) {
13095
+ if (!Number.isFinite(x)) return String(x);
13096
+ return x.toFixed(4);
13097
+ }
13098
+
12816
13099
  // src/jsonl-trial-cache.ts
12817
13100
  import { appendFileSync as appendFileSync4, existsSync as existsSync6, mkdirSync as mkdirSync4, readFileSync as readFileSync5 } from "fs";
12818
13101
  import { dirname as dirname4 } from "path";
@@ -13458,9 +13741,9 @@ function passOrthogonality(input) {
13458
13741
  sims.push(cosineSimilarity(vectors[i], vectors[j]));
13459
13742
  }
13460
13743
  }
13461
- const mean9 = sims.length === 0 ? 0 : sims.reduce((a, b) => a + b, 0) / sims.length;
13744
+ const mean10 = sims.length === 0 ? 0 : sims.reduce((a, b) => a + b, 0) / sims.length;
13462
13745
  return {
13463
- orthogonality: Math.max(0, Math.min(1, 1 - mean9)),
13746
+ orthogonality: Math.max(0, Math.min(1, 1 - mean10)),
13464
13747
  passCount: passes.length,
13465
13748
  similarities: sims
13466
13749
  };
@@ -13506,8 +13789,8 @@ function bootstrapCi(baseline, candidate, options = {}) {
13506
13789
  const iterations = options.iterations ?? 1e3;
13507
13790
  const minTotal = options.minTotalSamples ?? 6;
13508
13791
  const rng = mulberry32(options.seed ?? hashSeed(baseline, candidate));
13509
- const baselineMean = mean8(baseline);
13510
- const candidateMean = mean8(candidate);
13792
+ const baselineMean = mean9(baseline);
13793
+ const candidateMean = mean9(candidate);
13511
13794
  const delta = candidateMean - baselineMean;
13512
13795
  if (baseline.length + candidate.length < minTotal || baseline.length === 0 || candidate.length === 0) {
13513
13796
  return {
@@ -13525,7 +13808,7 @@ function bootstrapCi(baseline, candidate, options = {}) {
13525
13808
  for (let i = 0; i < iterations; i++) {
13526
13809
  const bResample = resample(baseline, rng);
13527
13810
  const cResample = resample(candidate, rng);
13528
- deltas[i] = mean8(cResample) - mean8(bResample);
13811
+ deltas[i] = mean9(cResample) - mean9(bResample);
13529
13812
  }
13530
13813
  deltas.sort((a, b) => a - b);
13531
13814
  const lowerIdx = Math.floor(alpha / 2 * iterations);
@@ -13548,7 +13831,7 @@ function bootstrapCi(baseline, candidate, options = {}) {
13548
13831
  verdict
13549
13832
  };
13550
13833
  }
13551
- function mean8(xs) {
13834
+ function mean9(xs) {
13552
13835
  if (xs.length === 0) return 0;
13553
13836
  let s = 0;
13554
13837
  for (const x of xs) s += x;
@@ -13872,6 +14155,7 @@ export {
13872
14155
  analyzeAntiSlop,
13873
14156
  analyzeSeries,
13874
14157
  argHash,
14158
+ assertReleaseConfidence,
13875
14159
  assignFeedbackSplit,
13876
14160
  attributeCounterfactuals,
13877
14161
  deterministicSplit as benchmarkDeterministicSplit,
@@ -13942,6 +14226,7 @@ export {
13942
14226
  evaluateContract,
13943
14227
  evaluateHypothesis,
13944
14228
  evaluateOracles,
14229
+ evaluateReleaseConfidence,
13945
14230
  executeScenario,
13946
14231
  expectAgent,
13947
14232
  exportRewardModel,
@@ -14041,6 +14326,7 @@ export {
14041
14326
  regexMatch,
14042
14327
  regexMatches,
14043
14328
  regressionView,
14329
+ releaseTraceEvidenceFromMultiShotTrials,
14044
14330
  renderMarkdown,
14045
14331
  renderMarkdownReport,
14046
14332
  renderPlaybookMarkdown,