@tangle-network/agent-eval 0.19.0 → 0.19.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +38 -0
- package/dist/index.d.ts +109 -1
- package/dist/index.js +317 -31
- package/dist/index.js.map +1 -1
- package/package.json +12 -10
package/dist/index.js
CHANGED
|
@@ -417,7 +417,7 @@ function confidenceInterval(scores2, confidence = 0.95) {
|
|
|
417
417
|
if (scores2.length === 0) return { mean: 0, lower: 0, upper: 0 };
|
|
418
418
|
if (scores2.length === 1) return { mean: scores2[0], lower: scores2[0], upper: scores2[0] };
|
|
419
419
|
const n = scores2.length;
|
|
420
|
-
const
|
|
420
|
+
const mean10 = scores2.reduce((a, b) => a + b, 0) / n;
|
|
421
421
|
const B = 1e3;
|
|
422
422
|
const bootstrapMeans = [];
|
|
423
423
|
for (let i = 0; i < B; i++) {
|
|
@@ -432,7 +432,7 @@ function confidenceInterval(scores2, confidence = 0.95) {
|
|
|
432
432
|
const lowerIdx = Math.floor(alpha / 2 * B);
|
|
433
433
|
const upperIdx = Math.floor((1 - alpha / 2) * B) - 1;
|
|
434
434
|
return {
|
|
435
|
-
mean:
|
|
435
|
+
mean: mean10,
|
|
436
436
|
lower: bootstrapMeans[lowerIdx],
|
|
437
437
|
upper: bootstrapMeans[Math.min(upperIdx, B - 1)]
|
|
438
438
|
};
|
|
@@ -520,11 +520,11 @@ function pairedTTest(before, after) {
|
|
|
520
520
|
const n = before.length;
|
|
521
521
|
if (n < 2) return { t: 0, df: 0, p: 1 };
|
|
522
522
|
const diffs = before.map((b, i) => after[i] - b);
|
|
523
|
-
const
|
|
524
|
-
const variance2 = diffs.reduce((acc, d) => acc + (d -
|
|
523
|
+
const mean10 = diffs.reduce((a, b) => a + b, 0) / n;
|
|
524
|
+
const variance2 = diffs.reduce((acc, d) => acc + (d - mean10) ** 2, 0) / (n - 1);
|
|
525
525
|
const se = Math.sqrt(variance2 / n);
|
|
526
|
-
if (se === 0) return { t:
|
|
527
|
-
const t =
|
|
526
|
+
if (se === 0) return { t: mean10 === 0 ? 0 : Infinity, df: n - 1, p: mean10 === 0 ? 1 : 0 };
|
|
527
|
+
const t = mean10 / se;
|
|
528
528
|
const df = n - 1;
|
|
529
529
|
const p = 2 * (1 - studentTCdf(Math.abs(t), df));
|
|
530
530
|
return { t, df, p };
|
|
@@ -548,9 +548,9 @@ function wilcoxonSignedRank(before, after) {
|
|
|
548
548
|
}
|
|
549
549
|
let wPlus = 0;
|
|
550
550
|
for (let k = 0; k < n; k++) if (diffs[k] > 0) wPlus += ranks3[k];
|
|
551
|
-
const
|
|
551
|
+
const mean10 = n * (n + 1) / 4;
|
|
552
552
|
const variance2 = n * (n + 1) * (2 * n + 1) / 24;
|
|
553
|
-
const z = (wPlus -
|
|
553
|
+
const z = (wPlus - mean10) / Math.sqrt(variance2);
|
|
554
554
|
const p = 2 * (1 - normalCdf(Math.abs(z)));
|
|
555
555
|
return { w: wPlus, p };
|
|
556
556
|
}
|
|
@@ -6457,10 +6457,10 @@ function analyzeSeries(values, options = {}) {
|
|
|
6457
6457
|
return { state: "insufficient-data", windowMean: 0, windowCv: 0, tailRun: 0, stable: false };
|
|
6458
6458
|
}
|
|
6459
6459
|
const tail = values.slice(-window);
|
|
6460
|
-
const
|
|
6461
|
-
const variance2 = tail.reduce((acc, v) => acc + (v -
|
|
6460
|
+
const mean10 = tail.reduce((a, b) => a + b, 0) / tail.length;
|
|
6461
|
+
const variance2 = tail.reduce((acc, v) => acc + (v - mean10) ** 2, 0) / tail.length;
|
|
6462
6462
|
const stdDev = Math.sqrt(variance2);
|
|
6463
|
-
const refMean = Math.abs(
|
|
6463
|
+
const refMean = Math.abs(mean10) > 1e-9 ? Math.abs(mean10) : 1;
|
|
6464
6464
|
const cv = stdDev / refMean;
|
|
6465
6465
|
const stable = tail.length >= window && cv <= stableCv;
|
|
6466
6466
|
let tailRun = 0;
|
|
@@ -6481,7 +6481,7 @@ function analyzeSeries(values, options = {}) {
|
|
|
6481
6481
|
} else {
|
|
6482
6482
|
state = "noisy";
|
|
6483
6483
|
}
|
|
6484
|
-
return { state, windowMean:
|
|
6484
|
+
return { state, windowMean: mean10, windowCv: cv, tailRun, stable };
|
|
6485
6485
|
}
|
|
6486
6486
|
|
|
6487
6487
|
// src/state-continuity.ts
|
|
@@ -6938,9 +6938,9 @@ function benjaminiHochberg(pValues, fdr = 0.05) {
|
|
|
6938
6938
|
for (let k = n - 1; k >= 0; k--) {
|
|
6939
6939
|
const rank = k + 1;
|
|
6940
6940
|
const raw = indexed[k].p * n / rank;
|
|
6941
|
-
const
|
|
6942
|
-
minRight =
|
|
6943
|
-
q[indexed[k].i] = Math.min(1,
|
|
6941
|
+
const bounded2 = Math.min(minRight, raw);
|
|
6942
|
+
minRight = bounded2;
|
|
6943
|
+
q[indexed[k].i] = Math.min(1, bounded2);
|
|
6944
6944
|
}
|
|
6945
6945
|
const significant = q.map((v) => v < fdr);
|
|
6946
6946
|
return { qValues: q, significant };
|
|
@@ -7470,12 +7470,12 @@ async function paraphraseRobustness(prompt, mutators, scoreFn, options = {}) {
|
|
|
7470
7470
|
variantScores.push({ mutator: id, score, mutated });
|
|
7471
7471
|
all.push(score);
|
|
7472
7472
|
}
|
|
7473
|
-
const
|
|
7474
|
-
const variance2 = all.reduce((a, v) => a + (v -
|
|
7473
|
+
const mean10 = all.reduce((a, b) => a + b, 0) / all.length;
|
|
7474
|
+
const variance2 = all.reduce((a, v) => a + (v - mean10) ** 2, 0) / all.length;
|
|
7475
7475
|
const stdDev = Math.sqrt(variance2);
|
|
7476
|
-
const ref = Math.abs(
|
|
7476
|
+
const ref = Math.abs(mean10) > 1e-9 ? Math.abs(mean10) : 1;
|
|
7477
7477
|
const robustness = Math.max(0, 1 - stdDev / ref);
|
|
7478
|
-
return { originalScore, variantScores, meanScore:
|
|
7478
|
+
return { originalScore, variantScores, meanScore: mean10, stdDev, robustness };
|
|
7479
7479
|
}
|
|
7480
7480
|
var lowercaseMutator = (p) => p.toLowerCase();
|
|
7481
7481
|
var sentenceReorderMutator = (p, seed) => {
|
|
@@ -8396,8 +8396,8 @@ async function prmBestOfN(store, grader, runIds) {
|
|
|
8396
8396
|
if (runIds.length === 0) throw new Error("prmBestOfN: at least 1 candidate required");
|
|
8397
8397
|
const graded = await Promise.all(runIds.map((id) => grader.grade(store, id)));
|
|
8398
8398
|
const ranked = [...graded].sort((a, b) => b.aggregateScore - a.aggregateScore);
|
|
8399
|
-
const
|
|
8400
|
-
const variance2 = graded.reduce((a, g) => a + (g.aggregateScore -
|
|
8399
|
+
const mean10 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
|
|
8400
|
+
const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean10) ** 2, 0) / graded.length;
|
|
8401
8401
|
return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
|
|
8402
8402
|
}
|
|
8403
8403
|
async function prmEnsembleBestOfN(store, graders, runIds) {
|
|
@@ -8419,8 +8419,8 @@ async function prmEnsembleBestOfN(store, graders, runIds) {
|
|
|
8419
8419
|
const ranked = [...byRun.values()].sort(
|
|
8420
8420
|
(a, b) => (bordaScores.get(b.runId) ?? 0) - (bordaScores.get(a.runId) ?? 0)
|
|
8421
8421
|
);
|
|
8422
|
-
const
|
|
8423
|
-
const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore -
|
|
8422
|
+
const mean10 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
|
|
8423
|
+
const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean10) ** 2, 0) / ranked.length;
|
|
8424
8424
|
return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
|
|
8425
8425
|
}
|
|
8426
8426
|
|
|
@@ -8950,8 +8950,8 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
|
|
|
8950
8950
|
const sRuns = runs.filter((r) => r.scenarioId === s.id);
|
|
8951
8951
|
const scores2 = sRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
|
|
8952
8952
|
if (scores2.length < 3) continue;
|
|
8953
|
-
const
|
|
8954
|
-
const variance2 = scores2.reduce((a, b) => a + (b -
|
|
8953
|
+
const mean10 = scores2.reduce((a, b) => a + b, 0) / scores2.length;
|
|
8954
|
+
const variance2 = scores2.reduce((a, b) => a + (b - mean10) ** 2, 0) / scores2.length;
|
|
8955
8955
|
if (variance2 > varianceThreshold) {
|
|
8956
8956
|
targets.push({
|
|
8957
8957
|
reason: "high-variance",
|
|
@@ -12813,6 +12813,289 @@ function traceExcerpt(trace) {
|
|
|
12813
12813
|
return void 0;
|
|
12814
12814
|
}
|
|
12815
12815
|
|
|
12816
|
+
// src/release-confidence.ts
|
|
12817
|
+
var DEFAULT_THRESHOLDS = {
|
|
12818
|
+
requireCorpus: true,
|
|
12819
|
+
minScenarioCount: 1,
|
|
12820
|
+
minSearchRuns: 1,
|
|
12821
|
+
minHoldoutRuns: 1,
|
|
12822
|
+
requireHoldout: true,
|
|
12823
|
+
minPassRate: 0.8,
|
|
12824
|
+
minMeanScore: 0.7,
|
|
12825
|
+
maxOverfitGap: 0.15,
|
|
12826
|
+
maxMeanCostUsd: Number.POSITIVE_INFINITY,
|
|
12827
|
+
maxP95WallMs: Number.POSITIVE_INFINITY,
|
|
12828
|
+
requireAsiForFailures: true,
|
|
12829
|
+
failureScoreThreshold: 0.5
|
|
12830
|
+
};
|
|
12831
|
+
function releaseTraceEvidenceFromMultiShotTrials(trials) {
|
|
12832
|
+
return trials.map((trial) => ({
|
|
12833
|
+
scenarioId: trial.scenarioId,
|
|
12834
|
+
candidateId: trial.variantId,
|
|
12835
|
+
split: trial.split === "holdout" ? "holdout" : trial.split === "dev" ? "dev" : "search",
|
|
12836
|
+
score: trial.score,
|
|
12837
|
+
ok: trial.ok,
|
|
12838
|
+
turnCount: Array.isArray(trial.trace?.turns) ? trial.trace.turns.length : void 0,
|
|
12839
|
+
costUsd: trial.cost,
|
|
12840
|
+
durationMs: trial.durationMs,
|
|
12841
|
+
failureMode: trial.error ? "runtime_error" : void 0,
|
|
12842
|
+
asi: trial.asi,
|
|
12843
|
+
metadata: trial.metadata
|
|
12844
|
+
}));
|
|
12845
|
+
}
|
|
12846
|
+
function evaluateReleaseConfidence(input) {
|
|
12847
|
+
const thresholds = { ...DEFAULT_THRESHOLDS, ...input.thresholds };
|
|
12848
|
+
const candidateId = input.candidateId ?? null;
|
|
12849
|
+
const runs = filterCandidate(input.runs ?? [], candidateId, input.baselineId);
|
|
12850
|
+
const traces = filterTraceCandidate(input.traces ?? [], candidateId, input.baselineId);
|
|
12851
|
+
const scenarios = input.scenarios ?? [];
|
|
12852
|
+
const scenarioCount = input.dataset?.scenarioCount ?? scenarios.length;
|
|
12853
|
+
const splitCounts = input.dataset?.splitCounts ?? countScenarioSplits(scenarios);
|
|
12854
|
+
const searchScores = scoresFor(runs, "search");
|
|
12855
|
+
const holdoutScores = scoresFor(runs, "holdout");
|
|
12856
|
+
const allScores = [...searchScores, ...holdoutScores];
|
|
12857
|
+
const traceScores = traces.map((t) => t.score).filter(isFiniteNumber);
|
|
12858
|
+
const scoreUniverse = allScores.length > 0 ? allScores : traceScores;
|
|
12859
|
+
const searchRuns = runs.filter((r) => r.splitTag === "search").length;
|
|
12860
|
+
const holdoutRuns = runs.filter((r) => r.splitTag === "holdout").length;
|
|
12861
|
+
const searchMeanScore = mean8(searchScores);
|
|
12862
|
+
const holdoutMeanScore = mean8(holdoutScores);
|
|
12863
|
+
const metrics = {
|
|
12864
|
+
scenarioCount,
|
|
12865
|
+
searchRuns,
|
|
12866
|
+
holdoutRuns,
|
|
12867
|
+
passRate: passRate(runs, traces, thresholds.failureScoreThreshold),
|
|
12868
|
+
meanScore: mean8(scoreUniverse),
|
|
12869
|
+
searchMeanScore,
|
|
12870
|
+
holdoutMeanScore,
|
|
12871
|
+
overfitGap: safeDiff2(searchMeanScore, holdoutMeanScore),
|
|
12872
|
+
meanCostUsd: mean8([...runs.map((r) => r.costUsd), ...traces.map((t) => t.costUsd).filter(isFiniteNumber)]),
|
|
12873
|
+
p95WallMs: percentile([...runs.map((r) => r.wallMs), ...traces.map((t) => t.durationMs).filter(isFiniteNumber)], 0.95),
|
|
12874
|
+
failedRows: failedRows(runs, traces, thresholds.failureScoreThreshold).length,
|
|
12875
|
+
failuresWithAsi: failedRows(runs, traces, thresholds.failureScoreThreshold).filter((row) => row.hasAsi).length,
|
|
12876
|
+
singleShotTraces: traces.filter((t) => t.turnCount === 1).length,
|
|
12877
|
+
multiShotTraces: traces.filter((t) => (t.turnCount ?? 0) > 1).length,
|
|
12878
|
+
splitCounts,
|
|
12879
|
+
domainCounts: countDomains(scenarios),
|
|
12880
|
+
failureModeCounts: countFailureModes(runs, traces, thresholds.failureScoreThreshold),
|
|
12881
|
+
responsibleSurfaceCounts: countResponsibleSurfaces(traces)
|
|
12882
|
+
};
|
|
12883
|
+
const issues = [];
|
|
12884
|
+
checkCorpus(input, thresholds, metrics, issues);
|
|
12885
|
+
checkQuality(thresholds, metrics, issues);
|
|
12886
|
+
checkGeneralization(input.gateDecision ?? null, thresholds, metrics, issues);
|
|
12887
|
+
checkDiagnostics(thresholds, metrics, issues);
|
|
12888
|
+
checkEfficiency(thresholds, metrics, issues);
|
|
12889
|
+
const axes = buildAxes(metrics, thresholds, input.gateDecision ?? null, issues);
|
|
12890
|
+
const status = issues.some((i) => i.severity === "critical") ? "fail" : issues.length > 0 ? "warn" : "pass";
|
|
12891
|
+
return {
|
|
12892
|
+
target: input.target,
|
|
12893
|
+
candidateId,
|
|
12894
|
+
baselineId: input.baselineId ?? null,
|
|
12895
|
+
status,
|
|
12896
|
+
promote: status === "pass" && (input.gateDecision ? input.gateDecision.promote : true),
|
|
12897
|
+
axes,
|
|
12898
|
+
issues,
|
|
12899
|
+
metrics,
|
|
12900
|
+
dataset: input.dataset ?? null,
|
|
12901
|
+
gateDecision: input.gateDecision ?? null,
|
|
12902
|
+
summary: renderSummary(input.target, status, metrics, issues)
|
|
12903
|
+
};
|
|
12904
|
+
}
|
|
12905
|
+
function assertReleaseConfidence(input) {
|
|
12906
|
+
const scorecard = evaluateReleaseConfidence(input);
|
|
12907
|
+
if (scorecard.status === "fail") {
|
|
12908
|
+
throw new Error(scorecard.summary);
|
|
12909
|
+
}
|
|
12910
|
+
return scorecard;
|
|
12911
|
+
}
|
|
12912
|
+
function filterCandidate(runs, candidateId, baselineId) {
|
|
12913
|
+
if (candidateId) return runs.filter((r) => r.candidateId === candidateId);
|
|
12914
|
+
if (baselineId) return runs.filter((r) => r.candidateId !== baselineId);
|
|
12915
|
+
return [...runs];
|
|
12916
|
+
}
|
|
12917
|
+
function filterTraceCandidate(traces, candidateId, baselineId) {
|
|
12918
|
+
if (candidateId) return traces.filter((t) => t.candidateId === void 0 || t.candidateId === candidateId);
|
|
12919
|
+
if (baselineId) return traces.filter((t) => t.candidateId === void 0 || t.candidateId !== baselineId);
|
|
12920
|
+
return [...traces];
|
|
12921
|
+
}
|
|
12922
|
+
function checkCorpus(input, thresholds, metrics, issues) {
|
|
12923
|
+
if (thresholds.requireCorpus && !input.dataset && (input.scenarios?.length ?? 0) === 0) {
|
|
12924
|
+
issues.push({ axis: "corpus", severity: "critical", code: "missing_corpus", detail: "No Dataset manifest or scenarios supplied." });
|
|
12925
|
+
}
|
|
12926
|
+
if (metrics.scenarioCount < thresholds.minScenarioCount) {
|
|
12927
|
+
issues.push({ axis: "corpus", severity: "critical", code: "few_scenarios", detail: `${metrics.scenarioCount} scenario(s) < min ${thresholds.minScenarioCount}.` });
|
|
12928
|
+
}
|
|
12929
|
+
if (thresholds.requireHoldout && metrics.splitCounts.holdout === 0) {
|
|
12930
|
+
issues.push({ axis: "corpus", severity: "critical", code: "missing_holdout_split", detail: "Corpus has no holdout scenarios." });
|
|
12931
|
+
}
|
|
12932
|
+
}
|
|
12933
|
+
function checkQuality(thresholds, metrics, issues) {
|
|
12934
|
+
if (metrics.searchRuns < thresholds.minSearchRuns) {
|
|
12935
|
+
issues.push({ axis: "quality", severity: "critical", code: "few_search_runs", detail: `${metrics.searchRuns} search run(s) < min ${thresholds.minSearchRuns}.` });
|
|
12936
|
+
}
|
|
12937
|
+
if (metrics.passRate < thresholds.minPassRate) {
|
|
12938
|
+
issues.push({ axis: "quality", severity: "critical", code: "low_pass_rate", detail: `passRate ${fmt3(metrics.passRate)} < ${fmt3(thresholds.minPassRate)}.` });
|
|
12939
|
+
}
|
|
12940
|
+
if (metrics.meanScore < thresholds.minMeanScore) {
|
|
12941
|
+
issues.push({ axis: "quality", severity: "critical", code: "low_mean_score", detail: `meanScore ${fmt3(metrics.meanScore)} < ${fmt3(thresholds.minMeanScore)}.` });
|
|
12942
|
+
}
|
|
12943
|
+
}
|
|
12944
|
+
function checkGeneralization(gateDecision, thresholds, metrics, issues) {
|
|
12945
|
+
if (thresholds.requireHoldout && metrics.holdoutRuns < thresholds.minHoldoutRuns) {
|
|
12946
|
+
issues.push({ axis: "generalization", severity: "critical", code: "few_holdout_runs", detail: `${metrics.holdoutRuns} holdout run(s) < min ${thresholds.minHoldoutRuns}.` });
|
|
12947
|
+
}
|
|
12948
|
+
if (Number.isFinite(metrics.overfitGap) && metrics.overfitGap > thresholds.maxOverfitGap) {
|
|
12949
|
+
issues.push({ axis: "generalization", severity: "critical", code: "overfit_gap", detail: `search-holdout gap ${fmt3(metrics.overfitGap)} > ${fmt3(thresholds.maxOverfitGap)}.` });
|
|
12950
|
+
}
|
|
12951
|
+
if (gateDecision && !gateDecision.promote) {
|
|
12952
|
+
issues.push({ axis: "generalization", severity: "critical", code: `gate_${gateDecision.rejectionCode ?? "reject"}`, detail: gateDecision.reason });
|
|
12953
|
+
}
|
|
12954
|
+
}
|
|
12955
|
+
function checkDiagnostics(thresholds, metrics, issues) {
|
|
12956
|
+
if (!thresholds.requireAsiForFailures) return;
|
|
12957
|
+
if (metrics.failedRows > metrics.failuresWithAsi) {
|
|
12958
|
+
issues.push({
|
|
12959
|
+
axis: "diagnostics",
|
|
12960
|
+
severity: "critical",
|
|
12961
|
+
code: "missing_failure_asi",
|
|
12962
|
+
detail: `${metrics.failedRows - metrics.failuresWithAsi} failed row(s) have no actionable side information.`
|
|
12963
|
+
});
|
|
12964
|
+
}
|
|
12965
|
+
}
|
|
12966
|
+
function checkEfficiency(thresholds, metrics, issues) {
|
|
12967
|
+
if (metrics.meanCostUsd > thresholds.maxMeanCostUsd) {
|
|
12968
|
+
issues.push({ axis: "efficiency", severity: "critical", code: "cost_budget", detail: `meanCostUsd ${fmt3(metrics.meanCostUsd)} > ${fmt3(thresholds.maxMeanCostUsd)}.` });
|
|
12969
|
+
}
|
|
12970
|
+
if (metrics.p95WallMs > thresholds.maxP95WallMs) {
|
|
12971
|
+
issues.push({ axis: "efficiency", severity: "critical", code: "latency_budget", detail: `p95WallMs ${fmt3(metrics.p95WallMs)} > ${fmt3(thresholds.maxP95WallMs)}.` });
|
|
12972
|
+
}
|
|
12973
|
+
}
|
|
12974
|
+
function buildAxes(metrics, thresholds, gateDecision, issues) {
|
|
12975
|
+
return [
|
|
12976
|
+
axis("corpus", issues, bounded(metrics.scenarioCount / Math.max(1, thresholds.minScenarioCount)), `${metrics.scenarioCount} scenarios; holdout=${metrics.splitCounts.holdout}`),
|
|
12977
|
+
axis("quality", issues, Math.min(metrics.passRate, metrics.meanScore), `passRate=${fmt3(metrics.passRate)} meanScore=${fmt3(metrics.meanScore)}`),
|
|
12978
|
+
axis("generalization", issues, gateDecision && !gateDecision.promote ? 0 : gapScore(metrics.overfitGap, thresholds.maxOverfitGap), `holdoutRuns=${metrics.holdoutRuns} overfitGap=${fmt3(metrics.overfitGap)}`),
|
|
12979
|
+
axis("diagnostics", issues, metrics.failedRows === 0 ? 1 : metrics.failuresWithAsi / metrics.failedRows, `failuresWithAsi=${metrics.failuresWithAsi}/${metrics.failedRows}`),
|
|
12980
|
+
axis("efficiency", issues, efficiencyScore(metrics, thresholds), `meanCostUsd=${fmt3(metrics.meanCostUsd)} p95WallMs=${fmt3(metrics.p95WallMs)}`)
|
|
12981
|
+
];
|
|
12982
|
+
}
|
|
12983
|
+
function axis(name, issues, score, detail) {
|
|
12984
|
+
const own = issues.filter((i) => i.axis === name);
|
|
12985
|
+
const status = own.some((i) => i.severity === "critical") ? "fail" : own.length > 0 ? "warn" : "pass";
|
|
12986
|
+
return { name, status, score: bounded(score), detail };
|
|
12987
|
+
}
|
|
12988
|
+
function countScenarioSplits(scenarios) {
|
|
12989
|
+
const counts = { train: 0, dev: 0, test: 0, holdout: 0 };
|
|
12990
|
+
for (const scenario of scenarios) counts[scenario.split ?? "train"]++;
|
|
12991
|
+
return counts;
|
|
12992
|
+
}
|
|
12993
|
+
function countDomains(scenarios) {
|
|
12994
|
+
const out = {};
|
|
12995
|
+
for (const scenario of scenarios) {
|
|
12996
|
+
const domain = scenario.tags?.domain ?? scenario.tags?.category ?? "uncategorized";
|
|
12997
|
+
out[domain] = (out[domain] ?? 0) + 1;
|
|
12998
|
+
}
|
|
12999
|
+
return out;
|
|
13000
|
+
}
|
|
13001
|
+
function countFailureModes(runs, traces, threshold) {
|
|
13002
|
+
const out = {};
|
|
13003
|
+
for (const run of runs) {
|
|
13004
|
+
const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
|
|
13005
|
+
if (run.failureMode || score !== void 0 && score < threshold) {
|
|
13006
|
+
const mode = run.failureMode ?? "low_score";
|
|
13007
|
+
out[mode] = (out[mode] ?? 0) + 1;
|
|
13008
|
+
}
|
|
13009
|
+
}
|
|
13010
|
+
for (const trace of traces) {
|
|
13011
|
+
if (trace.failureMode || trace.ok === false || trace.score !== void 0 && trace.score < threshold) {
|
|
13012
|
+
const mode = trace.failureMode ?? (trace.ok === false ? "not_ok" : "low_score");
|
|
13013
|
+
out[mode] = (out[mode] ?? 0) + 1;
|
|
13014
|
+
}
|
|
13015
|
+
}
|
|
13016
|
+
return out;
|
|
13017
|
+
}
|
|
13018
|
+
function countResponsibleSurfaces(traces) {
|
|
13019
|
+
const out = {};
|
|
13020
|
+
for (const trace of traces) {
|
|
13021
|
+
for (const asi of trace.asi ?? []) {
|
|
13022
|
+
const surface = asi.responsibleSurface ?? "unknown";
|
|
13023
|
+
out[surface] = (out[surface] ?? 0) + 1;
|
|
13024
|
+
}
|
|
13025
|
+
}
|
|
13026
|
+
return out;
|
|
13027
|
+
}
|
|
13028
|
+
function failedRows(runs, traces, threshold) {
|
|
13029
|
+
const out = [];
|
|
13030
|
+
for (const run of runs) {
|
|
13031
|
+
const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
|
|
13032
|
+
if (run.failureMode || score !== void 0 && score < threshold) {
|
|
13033
|
+
const asiMetric = run.outcome.raw.asi;
|
|
13034
|
+
out.push({ hasAsi: typeof asiMetric === "number" && asiMetric > 0 });
|
|
13035
|
+
}
|
|
13036
|
+
}
|
|
13037
|
+
for (const trace of traces) {
|
|
13038
|
+
if (trace.failureMode || trace.ok === false || trace.score !== void 0 && trace.score < threshold) {
|
|
13039
|
+
out.push({ hasAsi: (trace.asi?.length ?? 0) > 0 });
|
|
13040
|
+
}
|
|
13041
|
+
}
|
|
13042
|
+
return out;
|
|
13043
|
+
}
|
|
13044
|
+
function passRate(runs, traces, threshold) {
|
|
13045
|
+
const outcomes = [
|
|
13046
|
+
...runs.map((run) => {
|
|
13047
|
+
const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
|
|
13048
|
+
return !run.failureMode && score !== void 0 && score >= threshold;
|
|
13049
|
+
}),
|
|
13050
|
+
...traces.map((trace) => trace.ok !== false && (trace.score === void 0 || trace.score >= threshold))
|
|
13051
|
+
];
|
|
13052
|
+
if (outcomes.length === 0) return 0;
|
|
13053
|
+
return outcomes.filter(Boolean).length / outcomes.length;
|
|
13054
|
+
}
|
|
13055
|
+
function scoresFor(runs, split) {
|
|
13056
|
+
return runs.filter((run) => run.splitTag === split).map((run) => split === "holdout" ? run.outcome.holdoutScore : run.outcome.searchScore).filter(isFiniteNumber);
|
|
13057
|
+
}
|
|
13058
|
+
function mean8(xs) {
|
|
13059
|
+
if (xs.length === 0) return Number.NaN;
|
|
13060
|
+
return xs.reduce((sum2, x) => sum2 + x, 0) / xs.length;
|
|
13061
|
+
}
|
|
13062
|
+
function percentile(xs, p) {
|
|
13063
|
+
if (xs.length === 0) return Number.NaN;
|
|
13064
|
+
const sorted = [...xs].sort((a, b) => a - b);
|
|
13065
|
+
return sorted[Math.min(sorted.length - 1, Math.max(0, Math.ceil(p * sorted.length) - 1))];
|
|
13066
|
+
}
|
|
13067
|
+
function isFiniteNumber(value) {
|
|
13068
|
+
return typeof value === "number" && Number.isFinite(value);
|
|
13069
|
+
}
|
|
13070
|
+
function safeDiff2(a, b) {
|
|
13071
|
+
if (!Number.isFinite(a) || !Number.isFinite(b)) return Number.NaN;
|
|
13072
|
+
return a - b;
|
|
13073
|
+
}
|
|
13074
|
+
function gapScore(gap, maxGap) {
|
|
13075
|
+
if (!Number.isFinite(gap)) return 0;
|
|
13076
|
+
if (maxGap <= 0) return gap <= 0 ? 1 : 0;
|
|
13077
|
+
return bounded(1 - Math.max(0, gap) / maxGap);
|
|
13078
|
+
}
|
|
13079
|
+
function efficiencyScore(metrics, thresholds) {
|
|
13080
|
+
const cost = Number.isFinite(thresholds.maxMeanCostUsd) && Number.isFinite(metrics.meanCostUsd) ? bounded(thresholds.maxMeanCostUsd / Math.max(metrics.meanCostUsd, 1e-12)) : 1;
|
|
13081
|
+
const latency = Number.isFinite(thresholds.maxP95WallMs) && Number.isFinite(metrics.p95WallMs) ? bounded(thresholds.maxP95WallMs / Math.max(metrics.p95WallMs, 1e-12)) : 1;
|
|
13082
|
+
return Math.min(cost, latency);
|
|
13083
|
+
}
|
|
13084
|
+
function bounded(x) {
|
|
13085
|
+
if (!Number.isFinite(x)) return 0;
|
|
13086
|
+
return Math.max(0, Math.min(1, x));
|
|
13087
|
+
}
|
|
13088
|
+
function renderSummary(target, status, metrics, issues) {
|
|
13089
|
+
const prefix = `release confidence ${status}: ${target}`;
|
|
13090
|
+
const metricText = `scenarios=${metrics.scenarioCount} searchRuns=${metrics.searchRuns} holdoutRuns=${metrics.holdoutRuns} passRate=${fmt3(metrics.passRate)} meanScore=${fmt3(metrics.meanScore)}`;
|
|
13091
|
+
if (issues.length === 0) return `${prefix}; ${metricText}`;
|
|
13092
|
+
return `${prefix}; ${metricText}; issues=${issues.map((i) => i.code).join(",")}`;
|
|
13093
|
+
}
|
|
13094
|
+
function fmt3(x) {
|
|
13095
|
+
if (!Number.isFinite(x)) return String(x);
|
|
13096
|
+
return x.toFixed(4);
|
|
13097
|
+
}
|
|
13098
|
+
|
|
12816
13099
|
// src/jsonl-trial-cache.ts
|
|
12817
13100
|
import { appendFileSync as appendFileSync4, existsSync as existsSync6, mkdirSync as mkdirSync4, readFileSync as readFileSync5 } from "fs";
|
|
12818
13101
|
import { dirname as dirname4 } from "path";
|
|
@@ -13458,9 +13741,9 @@ function passOrthogonality(input) {
|
|
|
13458
13741
|
sims.push(cosineSimilarity(vectors[i], vectors[j]));
|
|
13459
13742
|
}
|
|
13460
13743
|
}
|
|
13461
|
-
const
|
|
13744
|
+
const mean10 = sims.length === 0 ? 0 : sims.reduce((a, b) => a + b, 0) / sims.length;
|
|
13462
13745
|
return {
|
|
13463
|
-
orthogonality: Math.max(0, Math.min(1, 1 -
|
|
13746
|
+
orthogonality: Math.max(0, Math.min(1, 1 - mean10)),
|
|
13464
13747
|
passCount: passes.length,
|
|
13465
13748
|
similarities: sims
|
|
13466
13749
|
};
|
|
@@ -13506,8 +13789,8 @@ function bootstrapCi(baseline, candidate, options = {}) {
|
|
|
13506
13789
|
const iterations = options.iterations ?? 1e3;
|
|
13507
13790
|
const minTotal = options.minTotalSamples ?? 6;
|
|
13508
13791
|
const rng = mulberry32(options.seed ?? hashSeed(baseline, candidate));
|
|
13509
|
-
const baselineMean =
|
|
13510
|
-
const candidateMean =
|
|
13792
|
+
const baselineMean = mean9(baseline);
|
|
13793
|
+
const candidateMean = mean9(candidate);
|
|
13511
13794
|
const delta = candidateMean - baselineMean;
|
|
13512
13795
|
if (baseline.length + candidate.length < minTotal || baseline.length === 0 || candidate.length === 0) {
|
|
13513
13796
|
return {
|
|
@@ -13525,7 +13808,7 @@ function bootstrapCi(baseline, candidate, options = {}) {
|
|
|
13525
13808
|
for (let i = 0; i < iterations; i++) {
|
|
13526
13809
|
const bResample = resample(baseline, rng);
|
|
13527
13810
|
const cResample = resample(candidate, rng);
|
|
13528
|
-
deltas[i] =
|
|
13811
|
+
deltas[i] = mean9(cResample) - mean9(bResample);
|
|
13529
13812
|
}
|
|
13530
13813
|
deltas.sort((a, b) => a - b);
|
|
13531
13814
|
const lowerIdx = Math.floor(alpha / 2 * iterations);
|
|
@@ -13548,7 +13831,7 @@ function bootstrapCi(baseline, candidate, options = {}) {
|
|
|
13548
13831
|
verdict
|
|
13549
13832
|
};
|
|
13550
13833
|
}
|
|
13551
|
-
function
|
|
13834
|
+
function mean9(xs) {
|
|
13552
13835
|
if (xs.length === 0) return 0;
|
|
13553
13836
|
let s = 0;
|
|
13554
13837
|
for (const x of xs) s += x;
|
|
@@ -13872,6 +14155,7 @@ export {
|
|
|
13872
14155
|
analyzeAntiSlop,
|
|
13873
14156
|
analyzeSeries,
|
|
13874
14157
|
argHash,
|
|
14158
|
+
assertReleaseConfidence,
|
|
13875
14159
|
assignFeedbackSplit,
|
|
13876
14160
|
attributeCounterfactuals,
|
|
13877
14161
|
deterministicSplit as benchmarkDeterministicSplit,
|
|
@@ -13942,6 +14226,7 @@ export {
|
|
|
13942
14226
|
evaluateContract,
|
|
13943
14227
|
evaluateHypothesis,
|
|
13944
14228
|
evaluateOracles,
|
|
14229
|
+
evaluateReleaseConfidence,
|
|
13945
14230
|
executeScenario,
|
|
13946
14231
|
expectAgent,
|
|
13947
14232
|
exportRewardModel,
|
|
@@ -14041,6 +14326,7 @@ export {
|
|
|
14041
14326
|
regexMatch,
|
|
14042
14327
|
regexMatches,
|
|
14043
14328
|
regressionView,
|
|
14329
|
+
releaseTraceEvidenceFromMultiShotTrials,
|
|
14044
14330
|
renderMarkdown,
|
|
14045
14331
|
renderMarkdownReport,
|
|
14046
14332
|
renderPlaybookMarkdown,
|