@tangle-network/agent-eval 0.19.0 → 0.20.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +38 -0
- package/dist/index.d.ts +352 -4
- package/dist/index.js +634 -45
- package/dist/index.js.map +1 -1
- package/docs/knowledge-readiness.md +84 -0
- package/docs/multi-shot-optimization.md +7 -0
- package/package.json +12 -10
package/dist/index.js
CHANGED
|
@@ -417,7 +417,7 @@ function confidenceInterval(scores2, confidence = 0.95) {
|
|
|
417
417
|
if (scores2.length === 0) return { mean: 0, lower: 0, upper: 0 };
|
|
418
418
|
if (scores2.length === 1) return { mean: scores2[0], lower: scores2[0], upper: scores2[0] };
|
|
419
419
|
const n = scores2.length;
|
|
420
|
-
const
|
|
420
|
+
const mean10 = scores2.reduce((a, b) => a + b, 0) / n;
|
|
421
421
|
const B = 1e3;
|
|
422
422
|
const bootstrapMeans = [];
|
|
423
423
|
for (let i = 0; i < B; i++) {
|
|
@@ -432,7 +432,7 @@ function confidenceInterval(scores2, confidence = 0.95) {
|
|
|
432
432
|
const lowerIdx = Math.floor(alpha / 2 * B);
|
|
433
433
|
const upperIdx = Math.floor((1 - alpha / 2) * B) - 1;
|
|
434
434
|
return {
|
|
435
|
-
mean:
|
|
435
|
+
mean: mean10,
|
|
436
436
|
lower: bootstrapMeans[lowerIdx],
|
|
437
437
|
upper: bootstrapMeans[Math.min(upperIdx, B - 1)]
|
|
438
438
|
};
|
|
@@ -520,11 +520,11 @@ function pairedTTest(before, after) {
|
|
|
520
520
|
const n = before.length;
|
|
521
521
|
if (n < 2) return { t: 0, df: 0, p: 1 };
|
|
522
522
|
const diffs = before.map((b, i) => after[i] - b);
|
|
523
|
-
const
|
|
524
|
-
const variance2 = diffs.reduce((acc, d) => acc + (d -
|
|
523
|
+
const mean10 = diffs.reduce((a, b) => a + b, 0) / n;
|
|
524
|
+
const variance2 = diffs.reduce((acc, d) => acc + (d - mean10) ** 2, 0) / (n - 1);
|
|
525
525
|
const se = Math.sqrt(variance2 / n);
|
|
526
|
-
if (se === 0) return { t:
|
|
527
|
-
const t =
|
|
526
|
+
if (se === 0) return { t: mean10 === 0 ? 0 : Infinity, df: n - 1, p: mean10 === 0 ? 1 : 0 };
|
|
527
|
+
const t = mean10 / se;
|
|
528
528
|
const df = n - 1;
|
|
529
529
|
const p = 2 * (1 - studentTCdf(Math.abs(t), df));
|
|
530
530
|
return { t, df, p };
|
|
@@ -548,9 +548,9 @@ function wilcoxonSignedRank(before, after) {
|
|
|
548
548
|
}
|
|
549
549
|
let wPlus = 0;
|
|
550
550
|
for (let k = 0; k < n; k++) if (diffs[k] > 0) wPlus += ranks3[k];
|
|
551
|
-
const
|
|
551
|
+
const mean10 = n * (n + 1) / 4;
|
|
552
552
|
const variance2 = n * (n + 1) * (2 * n + 1) / 24;
|
|
553
|
-
const z = (wPlus -
|
|
553
|
+
const z = (wPlus - mean10) / Math.sqrt(variance2);
|
|
554
554
|
const p = 2 * (1 - normalCdf(Math.abs(z)));
|
|
555
555
|
return { w: wPlus, p };
|
|
556
556
|
}
|
|
@@ -2251,6 +2251,151 @@ async function finish(emitter, result) {
|
|
|
2251
2251
|
return result;
|
|
2252
2252
|
}
|
|
2253
2253
|
|
|
2254
|
+
// src/knowledge/readiness.ts
|
|
2255
|
+
/**
 * Builds a knowledge-readiness report for a task.
 *
 * Every requirement is normalized first. A requirement counts as "missing" when
 * its current confidence is below the confidence it needs; missing requirements
 * are then split into blocking and non-blocking gaps.
 *
 * @param {object} options - Reads `taskId`, `requirements`, and the optional
 *   `evidenceIds`, `claimIds`, `wikiPageIds`, `userAnswers` and `metadata`.
 * @returns {object} Report containing the readiness score, both gap lists, the
 *   recommended action, the evidence bundle, a severity and a reason string.
 */
function scoreKnowledgeReadiness(options) {
  const requirements = options.requirements.map(normalizeRequirement);
  const missing = requirements.filter((req) => req.currentConfidence < req.confidenceNeeded);

  // Partition the missing requirements in a single pass, preserving their order.
  const blockingMissingRequirements = [];
  const nonBlockingGaps = [];
  for (const gap of missing) {
    if (isBlockingGap(gap)) {
      blockingMissingRequirements.push(gap);
    } else {
      nonBlockingGaps.push(gap);
    }
  }

  const readinessScore = weightedReadiness(requirements);

  // Evidence ids given on the options come first, followed by per-requirement ids.
  const requirementEvidenceIds = requirements.flatMap((req) => req.evidenceIds);
  const bundle = {
    taskId: options.taskId,
    requirements,
    evidenceIds: unique([...(options.evidenceIds ?? []), ...requirementEvidenceIds]),
    claimIds: unique(options.claimIds ?? []),
    wikiPageIds: unique(options.wikiPageIds ?? []),
    userAnswers: options.userAnswers ?? {},
    missing,
    readinessScore,
    metadata: options.metadata
  };

  const recommendedAction = chooseRecommendedAction(blockingMissingRequirements, nonBlockingGaps);

  let severity;
  let reason;
  if (blockingMissingRequirements.length > 0) {
    severity = "critical";
    reason = `${blockingMissingRequirements.length} blocking knowledge requirement(s) are missing.`;
  } else {
    severity = nonBlockingGaps.some((gap) => gap.importance === "high") ? "warning" : "info";
    reason = nonBlockingGaps.length > 0
      ? `${nonBlockingGaps.length} non-blocking knowledge gap(s) remain.`
      : "All declared knowledge requirements are ready.";
  }

  return {
    taskId: options.taskId,
    readinessScore,
    blockingMissingRequirements,
    nonBlockingGaps,
    recommendedAction,
    bundle,
    severity,
    reason
  };
}
|
|
2286
|
+
/**
 * Converts a knowledge-readiness report into an objective eval result.
 *
 * The eval passes only when the report lists no blocking missing requirements
 * and its readiness score reaches the minimum score.
 *
 * @param {object} report - Readiness report (reads `blockingMissingRequirements`,
 *   `readinessScore`, `severity` and `reason`).
 * @param {object} [options] - Optional `id` (default "knowledge-ready") and
 *   `minimumScore` (default 0.7).
 * @returns {*} Whatever `objectiveEval` returns for the assembled input.
 */
function blockingKnowledgeEval(report, options = {}) {
  const threshold = options.minimumScore ?? 0.7;
  const blockerIds = report.blockingMissingRequirements.map((r) => r.id);
  const passed = blockerIds.length === 0 && report.readinessScore >= threshold;
  // An empty id list joins to "", which is reported as "no evidence" (undefined).
  const evidence = blockerIds.join(", ") || undefined;
  return objectiveEval({
    id: options.id ?? "knowledge-ready",
    passed,
    score: report.readinessScore,
    severity: passed ? "info" : report.severity,
    detail: report.reason,
    evidence,
    metadata: { knowledgeReadiness: report }
  });
}
|
|
2299
|
+
/**
 * Produces one user-facing question for each gap that should be put to the user.
 *
 * A gap qualifies when its acquisition mode is "ask_user" or its fallback policy
 * is "ask"; all other gaps are ignored.
 *
 * @param {Array<object>} gaps - Knowledge gaps (reads `id`, `description`,
 *   `requiredFor`, `importance`, `sensitivity`, `acquisitionMode`, `fallbackPolicy`).
 * @returns {Array<object>} Question descriptors, in the order of the qualifying gaps.
 */
function userQuestionsForKnowledgeGaps(gaps) {
  const questions = [];
  for (const gap of gaps) {
    const shouldAsk = gap.acquisitionMode === "ask_user" || gap.fallbackPolicy === "ask";
    if (!shouldAsk) continue;
    // An empty `requiredFor` list falls back to the generic phrase "the task".
    const requiredFor = gap.requiredFor.join(", ") || "the task";
    questions.push({
      id: `question_${gap.id}`,
      question: `Please provide: ${gap.description}`,
      reason: `Required for ${requiredFor}.`,
      requirementId: gap.id,
      importance: gap.importance,
      answerType: gap.sensitivity === "secret" ? "credential" : "free_text",
      impactIfUnknown: impactFor(gap)
    });
  }
  return questions;
}
|
|
2310
|
+
/**
 * Groups knowledge gaps by acquisition mode and emits one plan per mode.
 *
 * Gaps whose mode `planMode` maps to null are left out. Plans appear in the
 * order in which each mode is first encountered.
 *
 * @param {Array<object>} gaps - Knowledge gaps to plan for.
 * @returns {Array<object>} Plans with `id`, `requirementIds`, `mode`,
 *   `description`, `priority` and, for "ask_user" only, `questions`.
 */
function acquisitionPlansForKnowledgeGaps(gaps) {
  const grouped = /* @__PURE__ */ new Map();
  for (const gap of gaps) {
    const mode = planMode(gap.acquisitionMode);
    if (!mode) continue;
    if (!grouped.has(mode)) grouped.set(mode, []);
    grouped.get(mode).push(gap);
  }

  const plans = [];
  for (const [mode, requirements] of grouped) {
    plans.push({
      id: `acquire_${mode}`,
      requirementIds: requirements.map((r) => r.id),
      mode,
      description: descriptionForPlan(mode, requirements),
      priority: maxImportance(requirements.map((r) => r.importance)),
      // Only the ask_user plan carries concrete questions.
      questions: mode === "ask_user" ? userQuestionsForKnowledgeGaps(requirements) : undefined
    });
  }
  return plans;
}
|
|
2328
|
+
/**
 * Returns a shallow copy of a requirement with both confidence values clamped
 * through `clamp01` and its evidence ids de-duplicated through `unique`.
 *
 * @param {object} requirement - Raw requirement; it is not mutated.
 * @returns {object} Normalized copy of the requirement.
 */
function normalizeRequirement(requirement) {
  const normalized = { ...requirement };
  normalized.confidenceNeeded = clamp01(requirement.confidenceNeeded);
  normalized.currentConfidence = clamp01(requirement.currentConfidence);
  normalized.evidenceIds = unique(requirement.evidenceIds);
  return normalized;
}
|
|
2336
|
+
/**
 * Computes the importance-weighted readiness of a set of requirements.
 *
 * Each requirement contributes `currentConfidence / confidenceNeeded`, capped at
 * 1 (or exactly 1 when nothing is needed), weighted by `importanceWeight`.
 *
 * @param {Array<object>} requirements - Normalized requirements.
 * @returns {number} Weighted mean passed through `clamp01`; 1 for an empty list.
 */
function weightedReadiness(requirements) {
  if (requirements.length === 0) return 1;
  const totals = requirements.reduce(
    (acc, req) => {
      const weight = importanceWeight(req.importance);
      const ratio = req.confidenceNeeded <= 0 ? 1 : Math.min(1, req.currentConfidence / req.confidenceNeeded);
      acc.weight += weight;
      acc.score += weight * ratio;
      return acc;
    },
    { weight: 0, score: 0 }
  );
  return clamp01(totals.score / totals.weight);
}
|
|
2348
|
+
/**
 * Tells whether a requirement is treated as a blocking gap.
 *
 * @param {object} requirement - Reads `importance`, `fallbackPolicy`, `sensitivity`.
 * @returns {boolean} True when importance is "blocking", the fallback policy is
 *   "block", or the sensitivity is "secret".
 */
function isBlockingGap(requirement) {
  if (requirement.importance === "blocking") return true;
  if (requirement.fallbackPolicy === "block") return true;
  return requirement.sensitivity === "secret";
}
|
|
2351
|
+
/**
 * Picks the next recommended action from the blocking and non-blocking gaps.
 *
 * With no gaps at all the agent may run. Otherwise the blocking gaps are checked
 * against a fixed priority ladder of acquisition modes; if none applies, a
 * high-importance non-blocking gap suggests building the domain wiki, and
 * anything else continues with a caveat.
 *
 * NOTE(review): blocking gaps whose acquisition mode is not on the ladder fall
 * through to the non-blocking outcomes below — confirm that this is intended.
 *
 * @param {Array<object>} blocking - Blocking missing requirements.
 * @param {Array<object>} nonBlocking - Non-blocking gaps.
 * @returns {string} Action identifier.
 */
function chooseRecommendedAction(blocking, nonBlocking) {
  if (blocking.length === 0 && nonBlocking.length === 0) return "run_agent";

  const mustAskUser = blocking.some(
    (gap) => gap.acquisitionMode === "ask_user" || gap.fallbackPolicy === "ask"
  );
  if (mustAskUser) return "ask_user";

  // Ordered ladder: the first entry matching any blocking gap wins.
  const ladder = [
    [["query_connector"], "query_connectors"],
    [["inspect_repo", "run_command"], "inspect_repo"],
    [["search_web"], "collect_web_data"],
    [["not_available"], "abort_or_rescope"]
  ];
  for (const [modes, action] of ladder) {
    if (blocking.some((gap) => modes.includes(gap.acquisitionMode))) return action;
  }

  const hasHighNonBlocking = nonBlocking.some((gap) => gap.importance === "high");
  return hasHighNonBlocking ? "build_domain_wiki" : "continue_with_caveat";
}
|
|
2362
|
+
/**
 * Maps an acquisition mode to the mode used for planning.
 *
 * @param {string} mode - Acquisition mode of a gap.
 * @returns {string|null} null for "infer_low_confidence" and "not_available";
 *   otherwise the mode unchanged.
 */
function planMode(mode) {
  const hasNoPlan = mode === "infer_low_confidence" || mode === "not_available";
  return hasNoPlan ? null : mode;
}
|
|
2366
|
+
/**
 * Builds the human-readable description of an acquisition plan.
 *
 * @param {string} mode - Plan mode; unrecognized modes get the wiki wording.
 * @param {Array<object>} requirements - Requirements covered by the plan; their
 *   `description` fields are joined with "; ".
 * @returns {string} Description sentence for the plan.
 */
function descriptionForPlan(mode, requirements) {
  const labels = requirements.map((r) => r.description).join("; ");
  switch (mode) {
    case "ask_user":
      return `Ask the user for: ${labels}`;
    case "search_web":
      return `Search web or documentation sources for: ${labels}`;
    case "query_connector":
      return `Query configured connectors for: ${labels}`;
    case "inspect_repo":
      return `Inspect repository context for: ${labels}`;
    case "run_command":
      return `Run local commands to collect: ${labels}`;
    default:
      return `Build domain wiki evidence for: ${labels}`;
  }
}
|
|
2375
|
+
/**
 * Describes what happens when a requirement stays unknown, based on its
 * fallback policy.
 *
 * @param {object} requirement - Reads `fallbackPolicy`.
 * @returns {string} Impact sentence; any unrecognized policy gets the
 *   "ask before continuing" wording.
 */
function impactFor(requirement) {
  switch (requirement.fallbackPolicy) {
    case "block":
      return "The agent should not run until this is known.";
    case "continue_with_caveat":
      return "The agent may continue, but must disclose uncertainty.";
    case "use_default":
      return "The agent will use the configured default if skipped.";
    default:
      return "The agent should ask before continuing.";
  }
}
|
|
2381
|
+
/**
 * Returns the highest importance level present in a list.
 *
 * @param {Array<string>} values - Importance labels.
 * @returns {string} First of "blocking", "high", "medium", "low" found in
 *   `values`; "low" when none is present.
 */
function maxImportance(values) {
  // Levels are listed from most to least important.
  for (const level of ["blocking", "high", "medium", "low"]) {
    if (values.includes(level)) return level;
  }
  return "low";
}
|
|
2385
|
+
/**
 * Numeric weight of an importance label, as used by the readiness average.
 *
 * @param {string} importance - Importance label.
 * @returns {number} 8 for "blocking", 4 for "high", 2 for "medium", otherwise 1.
 */
function importanceWeight(importance) {
  switch (importance) {
    case "blocking":
      return 8;
    case "high":
      return 4;
    case "medium":
      return 2;
    default:
      return 1;
  }
}
|
|
2391
|
+
/**
 * Clamps a number into the closed interval [0, 1].
 *
 * @param {number} value - Candidate value.
 * @returns {number} 0 when `value` is not a finite number, otherwise `value`
 *   limited to the range 0..1.
 */
function clamp01(value) {
  return Number.isFinite(value) ? Math.max(0, Math.min(1, value)) : 0;
}
|
|
2395
|
+
/**
 * Removes duplicates from an iterable, keeping first-seen order.
 *
 * @param {Iterable<*>} items - Values to de-duplicate.
 * @returns {Array<*>} New array containing each distinct value once.
 */
function unique(items) {
  const seen = new Set(items);
  return Array.from(seen);
}
|
|
2398
|
+
|
|
2254
2399
|
// src/feedback-trajectory.ts
|
|
2255
2400
|
var DEFAULT_SPLIT_POLICY = {
|
|
2256
2401
|
trainPct: 70,
|
|
@@ -3521,9 +3666,9 @@ var DEFAULT_RUN_SCORE_WEIGHTS = {
|
|
|
3521
3666
|
};
|
|
3522
3667
|
/**
 * Collapses a multi-dimensional run score into a single weighted sum.
 *
 * The nine unit-interval dimensions are clamped with `clamp012` before
 * weighting. Cost is weighted as non-negative USD and wall time as non-negative
 * minutes (`wallSeconds / 60`).
 *
 * @param {object} score - Per-dimension run score.
 * @param {object} [weights] - Overrides merged over DEFAULT_RUN_SCORE_WEIGHTS.
 * @returns {number} Weighted sum of all dimensions.
 */
function aggregateRunScore(score, weights = {}) {
  const w = { ...DEFAULT_RUN_SCORE_WEIGHTS, ...weights };
  const unitDimensions = [
    "success",
    "goalProgress",
    "repoGroundedness",
    "driftPenalty",
    "toolUseQuality",
    "patchQuality",
    "testReality",
    "finalGate",
    "reviewerBlockers"
  ];
  const terms = unitDimensions.map((key) => w[key] * clamp012(score[key]));
  terms.push(w.costUsd * Math.max(0, score.costUsd));
  terms.push(w.wallSeconds * Math.max(0, score.wallSeconds / 60));
  // No initial value on purpose: the sum then starts from the first term, which
  // keeps the left-to-right addition identical to a plain `a + b + c + ...`.
  return terms.reduce((sum, term) => sum + term);
}
|
|
3526
|
-
function
|
|
3671
|
+
/**
 * Clamps a number into [0, 1]; any non-finite input maps to 0.
 *
 * @param {number} value - Candidate value.
 * @returns {number} Value limited to the range 0..1.
 */
function clamp012(value) {
  if (Number.isFinite(value)) {
    const capped = Math.min(1, value);
    return Math.max(0, capped);
  }
  return 0;
}
|
|
@@ -3567,13 +3712,13 @@ var RunCritic = class {
|
|
|
3567
3712
|
const success = trace.run.outcome?.pass === true ? 1 : trace.run.status === "completed" ? 0.5 : 0;
|
|
3568
3713
|
if (!success) notes.push("run did not complete with pass=true");
|
|
3569
3714
|
const judgeAverage = judgeSpans2.length ? judgeSpans2.reduce((sum2, span) => sum2 + normalizeJudgeScore(span.score), 0) / judgeSpans2.length : void 0;
|
|
3570
|
-
const outcomeScore = typeof trace.run.outcome?.score === "number" ?
|
|
3715
|
+
const outcomeScore = typeof trace.run.outcome?.score === "number" ? clamp012(trace.run.outcome.score > 1 ? trace.run.outcome.score / 100 : trace.run.outcome.score) : void 0;
|
|
3571
3716
|
const goalProgress = outcomeScore ?? judgeAverage ?? success;
|
|
3572
3717
|
const successfulTools = toolSpans2.filter((span) => span.status !== "error").length;
|
|
3573
3718
|
const toolUseQuality = toolSpans2.length === 0 ? 0 : successfulTools / toolSpans2.length;
|
|
3574
3719
|
if (toolSpans2.length === 0) notes.push("no tool spans recorded");
|
|
3575
3720
|
const patchEvidence = trace.artifacts.length + toolSpans2.filter((span) => /write|edit|patch|apply/i.test(span.toolName)).length;
|
|
3576
|
-
const patchQuality = patchEvidence > 0 ?
|
|
3721
|
+
const patchQuality = patchEvidence > 0 ? clamp012(patchEvidence / 4) : 0;
|
|
3577
3722
|
if (!patchQuality) notes.push("no artifact or edit evidence recorded");
|
|
3578
3723
|
const sandboxTests = sandboxSpans.filter((span) => typeof span.testsTotal === "number" && span.testsTotal > 0);
|
|
3579
3724
|
const testReality = sandboxTests.length ? sandboxTests.reduce((sum2, span) => sum2 + (span.testsPassed ?? 0) / Math.max(1, span.testsTotal ?? 1), 0) / sandboxTests.length : toolSpans2.some((span) => /\btest|vitest|pytest|jest|build|tsc\b/i.test(JSON.stringify(span.args))) ? 0.4 : 0;
|
|
@@ -3617,7 +3762,7 @@ var RunCritic = class {
|
|
|
3617
3762
|
}
|
|
3618
3763
|
};
|
|
3619
3764
|
/**
 * Normalizes a judge score into [0, 1].
 *
 * Scores above 1 are divided by 10 before clamping; other scores are clamped
 * as-is.
 *
 * @param {number} score - Raw judge score.
 * @returns {number} Score clamped through `clamp012`.
 */
function normalizeJudgeScore(score) {
  const scaled = score > 1 ? score / 10 : score;
  return clamp012(scaled);
}
|
|
3622
3767
|
function looksRepoGrounded(text) {
|
|
3623
3768
|
return /(?:src\/|tests?\/|package\.json|tsconfig|\.ts\b|\.tsx\b|git status|pnpm |npm |vitest|pytest|jest)/i.test(text);
|
|
@@ -4973,6 +5118,17 @@ var FAILURE_CLASSES = [
|
|
|
4973
5118
|
"cost_overrun",
|
|
4974
5119
|
"timeout",
|
|
4975
5120
|
"sandbox_failure",
|
|
5121
|
+
"missing_user_data",
|
|
5122
|
+
"missing_domain_data",
|
|
5123
|
+
"missing_codebase_context",
|
|
5124
|
+
"missing_runtime_context",
|
|
5125
|
+
"missing_credentials",
|
|
5126
|
+
"stale_external_data",
|
|
5127
|
+
"bad_retrieval",
|
|
5128
|
+
"insufficient_evidence",
|
|
5129
|
+
"contradictory_evidence",
|
|
5130
|
+
"ambiguous_user_intent",
|
|
5131
|
+
"knowledge_readiness_blocked",
|
|
4976
5132
|
"unknown"
|
|
4977
5133
|
];
|
|
4978
5134
|
function isLlmSpan(s) {
|
|
@@ -5329,6 +5485,62 @@ var DEFAULT_RULES = [
|
|
|
5329
5485
|
return null;
|
|
5330
5486
|
}
|
|
5331
5487
|
},
|
|
5488
|
+
{
|
|
5489
|
+
id: "knowledge-readiness-blocked",
|
|
5490
|
+
match: ({ events }) => {
|
|
5491
|
+
const event = events.find((e) => e.kind === "custom" && e.payload.kind === "readiness_scored" && e.payload.passed === false);
|
|
5492
|
+
return event ? {
|
|
5493
|
+
failureClass: "knowledge_readiness_blocked",
|
|
5494
|
+
reason: "knowledge readiness report blocked execution",
|
|
5495
|
+
triggerEventId: event.eventId
|
|
5496
|
+
} : null;
|
|
5497
|
+
}
|
|
5498
|
+
},
|
|
5499
|
+
{
|
|
5500
|
+
id: "missing-credentials",
|
|
5501
|
+
match: ({ events }) => {
|
|
5502
|
+
const event = events.find((e) => e.kind === "custom" && e.payload.kind === "knowledge_gap" && e.payload.category === "credential_or_secret");
|
|
5503
|
+
return event ? {
|
|
5504
|
+
failureClass: "missing_credentials",
|
|
5505
|
+
reason: "required credential or secret was missing",
|
|
5506
|
+
triggerEventId: event.eventId
|
|
5507
|
+
} : null;
|
|
5508
|
+
}
|
|
5509
|
+
},
|
|
5510
|
+
{
|
|
5511
|
+
id: "bad-retrieval",
|
|
5512
|
+
match: ({ run, spans }) => {
|
|
5513
|
+
if (run.outcome?.pass !== false) return null;
|
|
5514
|
+
const retrieval = spans.find((s) => s.kind === "retrieval" && (s.hits.length === 0 || s.hits.every((hit) => hit.score <= 0)));
|
|
5515
|
+
return retrieval ? {
|
|
5516
|
+
failureClass: "bad_retrieval",
|
|
5517
|
+
reason: "retrieval returned no useful hits for a failed run",
|
|
5518
|
+
triggerSpanId: retrieval.spanId
|
|
5519
|
+
} : null;
|
|
5520
|
+
}
|
|
5521
|
+
},
|
|
5522
|
+
{
|
|
5523
|
+
id: "insufficient-evidence",
|
|
5524
|
+
match: ({ events }) => {
|
|
5525
|
+
const event = events.find((e) => e.kind === "custom" && e.payload.kind === "knowledge_gap" && e.payload.reason === "insufficient_evidence");
|
|
5526
|
+
return event ? {
|
|
5527
|
+
failureClass: "insufficient_evidence",
|
|
5528
|
+
reason: "task proceeded with insufficient supporting evidence",
|
|
5529
|
+
triggerEventId: event.eventId
|
|
5530
|
+
} : null;
|
|
5531
|
+
}
|
|
5532
|
+
},
|
|
5533
|
+
{
|
|
5534
|
+
id: "contradictory-evidence",
|
|
5535
|
+
match: ({ events }) => {
|
|
5536
|
+
const event = events.find((e) => e.kind === "custom" && e.payload.kind === "knowledge_gap" && e.payload.reason === "contradictory_evidence");
|
|
5537
|
+
return event ? {
|
|
5538
|
+
failureClass: "contradictory_evidence",
|
|
5539
|
+
reason: "supporting evidence contradicted itself",
|
|
5540
|
+
triggerEventId: event.eventId
|
|
5541
|
+
} : null;
|
|
5542
|
+
}
|
|
5543
|
+
},
|
|
5332
5544
|
// Budget breach events
|
|
5333
5545
|
{
|
|
5334
5546
|
id: "budget-breach",
|
|
@@ -5667,11 +5879,14 @@ async function failureClusterView(store, options = {}) {
|
|
|
5667
5879
|
const cls = classifyFailure({ run, spans, events }, rules);
|
|
5668
5880
|
let toolName;
|
|
5669
5881
|
let argPrefix;
|
|
5882
|
+
let dimension;
|
|
5670
5883
|
if (cls.triggerSpanId) {
|
|
5671
5884
|
const trig = spans.find((s) => s.spanId === cls.triggerSpanId);
|
|
5672
5885
|
if (trig?.kind === "tool") {
|
|
5673
5886
|
toolName = trig.toolName;
|
|
5674
5887
|
argPrefix = argHash(trig.args).slice(0, 16);
|
|
5888
|
+
} else if (trig?.kind === "judge") {
|
|
5889
|
+
dimension = trig.dimension;
|
|
5675
5890
|
}
|
|
5676
5891
|
}
|
|
5677
5892
|
if (!toolName) {
|
|
@@ -5682,13 +5897,18 @@ async function failureClusterView(store, options = {}) {
|
|
|
5682
5897
|
argPrefix = argHash(errored.args).slice(0, 16);
|
|
5683
5898
|
}
|
|
5684
5899
|
}
|
|
5685
|
-
|
|
5900
|
+
if (!dimension) {
|
|
5901
|
+
const judge = spans.find((s) => s.kind === "judge" && typeof s.dimension === "string");
|
|
5902
|
+
if (judge?.kind === "judge") dimension = judge.dimension;
|
|
5903
|
+
}
|
|
5904
|
+
const key = `${cls.failureClass}|${toolName ?? ""}|${argPrefix ?? ""}|${dimension ?? ""}`;
|
|
5686
5905
|
let cluster = clusters.get(key);
|
|
5687
5906
|
if (!cluster) {
|
|
5688
5907
|
cluster = {
|
|
5689
5908
|
failureClass: cls.failureClass,
|
|
5690
5909
|
toolName,
|
|
5691
5910
|
argPrefix,
|
|
5911
|
+
dimension,
|
|
5692
5912
|
runCount: 0,
|
|
5693
5913
|
scenarioIds: [],
|
|
5694
5914
|
exampleRunId: run.runId,
|
|
@@ -6457,10 +6677,10 @@ function analyzeSeries(values, options = {}) {
|
|
|
6457
6677
|
return { state: "insufficient-data", windowMean: 0, windowCv: 0, tailRun: 0, stable: false };
|
|
6458
6678
|
}
|
|
6459
6679
|
const tail = values.slice(-window);
|
|
6460
|
-
const
|
|
6461
|
-
const variance2 = tail.reduce((acc, v) => acc + (v -
|
|
6680
|
+
const mean10 = tail.reduce((a, b) => a + b, 0) / tail.length;
|
|
6681
|
+
const variance2 = tail.reduce((acc, v) => acc + (v - mean10) ** 2, 0) / tail.length;
|
|
6462
6682
|
const stdDev = Math.sqrt(variance2);
|
|
6463
|
-
const refMean = Math.abs(
|
|
6683
|
+
const refMean = Math.abs(mean10) > 1e-9 ? Math.abs(mean10) : 1;
|
|
6464
6684
|
const cv = stdDev / refMean;
|
|
6465
6685
|
const stable = tail.length >= window && cv <= stableCv;
|
|
6466
6686
|
let tailRun = 0;
|
|
@@ -6481,7 +6701,7 @@ function analyzeSeries(values, options = {}) {
|
|
|
6481
6701
|
} else {
|
|
6482
6702
|
state = "noisy";
|
|
6483
6703
|
}
|
|
6484
|
-
return { state, windowMean:
|
|
6704
|
+
return { state, windowMean: mean10, windowCv: cv, tailRun, stable };
|
|
6485
6705
|
}
|
|
6486
6706
|
|
|
6487
6707
|
// src/state-continuity.ts
|
|
@@ -6673,6 +6893,46 @@ function checkCanaries(output, scenarios) {
|
|
|
6673
6893
|
}
|
|
6674
6894
|
return leaks;
|
|
6675
6895
|
}
|
|
6896
|
+
/**
 * Checks one output against a scenario's forbidden pattern.
 *
 * The pattern is `scenario.forbiddenPattern`, falling back to `scenario.canary`.
 *
 * @param {string} output - Text produced by the run.
 * @param {object} scenario - Reads `id`, `forbiddenPattern` and `canary`.
 * @returns {object|null} `{ scenarioId, canary, evidence }` when the pattern is
 *   found; null when there is no pattern or no match.
 */
function checkBehavioralCanary(output, scenario) {
  const pattern = scenario.forbiddenPattern ?? scenario.canary;
  const hit = pattern ? matchForbidden(output, pattern) : null;
  if (!hit) return null;
  const evidence = excerpt2(output, hit);
  return { scenarioId: scenario.id, canary: pattern, evidence };
}
|
|
6907
|
+
/**
 * Runs the behavioral canary check over a list of cases and collects the leaks.
 *
 * @param {Array<object>} cases - Entries with `output`, `scenario` and an
 *   optional `runId`.
 * @returns {Array<object>} One leak record per matching case, in input order;
 *   each record carries the case's `runId` when one was given.
 */
function runBehavioralCanaries(cases) {
  return cases.flatMap((entry) => {
    const leak = checkBehavioralCanary(entry.output, entry.scenario);
    if (!leak) return [];
    return [{ ...leak, runId: entry.runId ?? leak.runId }];
  });
}
|
|
6915
|
+
/**
 * Finds a forbidden pattern in an output string.
 *
 * A pattern that `tryParseRegex` accepts is matched as a regular expression;
 * anything else is searched as a literal substring.
 *
 * @param {string} output - Text to search.
 * @param {string} pattern - Literal text or a "/body/flags" regex string.
 * @returns {string|null} The matched text, or null when nothing matches. A
 *   zero-length regex match counts as no match.
 */
function matchForbidden(output, pattern) {
  const regex = tryParseRegex(pattern);
  if (!regex) {
    return output.includes(pattern) ? pattern : null;
  }
  const found = output.match(regex);
  if (!found) return null;
  const first = found[0];
  return first.length > 0 ? first : null;
}
|
|
6923
|
+
/**
 * Parses a "/body/flags" string into a RegExp.
 *
 * @param {string} pattern - Candidate pattern text.
 * @returns {RegExp|null} Compiled expression, or null when `pattern` is not
 *   written as "/body/flags", has an empty body, uses flags outside `gimsuy`,
 *   or fails to compile. Callers are expected to fall back to literal matching
 *   on null.
 */
function tryParseRegex(pattern) {
  if (pattern.length < 2 || pattern[0] !== "/") return null;
  const closing = pattern.lastIndexOf("/");
  if (closing <= 0) return null;
  const body = pattern.slice(1, closing);
  // An empty body ("//", "//i") would compile to a regex that only ever matches
  // the empty string, so such a pattern could never report a hit. Treat it as
  // not-a-regex so the caller searches for the literal text instead.
  if (body === "") return null;
  const flags = pattern.slice(closing + 1);
  if (!/^[gimsuy]*$/.test(flags)) return null;
  try {
    return new RegExp(body, flags);
  } catch {
    // Invalid body or repeated flags: not a usable regex.
    return null;
  }
}
|
|
6676
6936
|
async function canaryLeakView(store, scenarios) {
|
|
6677
6937
|
const targets = scenarios.filter((s) => !!s.canary);
|
|
6678
6938
|
if (targets.length === 0) return [];
|
|
@@ -6938,9 +7198,9 @@ function benjaminiHochberg(pValues, fdr = 0.05) {
|
|
|
6938
7198
|
for (let k = n - 1; k >= 0; k--) {
|
|
6939
7199
|
const rank = k + 1;
|
|
6940
7200
|
const raw = indexed[k].p * n / rank;
|
|
6941
|
-
const
|
|
6942
|
-
minRight =
|
|
6943
|
-
q[indexed[k].i] = Math.min(1,
|
|
7201
|
+
const bounded2 = Math.min(minRight, raw);
|
|
7202
|
+
minRight = bounded2;
|
|
7203
|
+
q[indexed[k].i] = Math.min(1, bounded2);
|
|
6944
7204
|
}
|
|
6945
7205
|
const significant = q.map((v) => v < fdr);
|
|
6946
7206
|
return { qValues: q, significant };
|
|
@@ -7470,12 +7730,12 @@ async function paraphraseRobustness(prompt, mutators, scoreFn, options = {}) {
|
|
|
7470
7730
|
variantScores.push({ mutator: id, score, mutated });
|
|
7471
7731
|
all.push(score);
|
|
7472
7732
|
}
|
|
7473
|
-
const
|
|
7474
|
-
const variance2 = all.reduce((a, v) => a + (v -
|
|
7733
|
+
const mean10 = all.reduce((a, b) => a + b, 0) / all.length;
|
|
7734
|
+
const variance2 = all.reduce((a, v) => a + (v - mean10) ** 2, 0) / all.length;
|
|
7475
7735
|
const stdDev = Math.sqrt(variance2);
|
|
7476
|
-
const ref = Math.abs(
|
|
7736
|
+
const ref = Math.abs(mean10) > 1e-9 ? Math.abs(mean10) : 1;
|
|
7477
7737
|
const robustness = Math.max(0, 1 - stdDev / ref);
|
|
7478
|
-
return { originalScore, variantScores, meanScore:
|
|
7738
|
+
return { originalScore, variantScores, meanScore: mean10, stdDev, robustness };
|
|
7479
7739
|
}
|
|
7480
7740
|
var lowercaseMutator = (p) => p.toLowerCase();
|
|
7481
7741
|
var sentenceReorderMutator = (p, seed) => {
|
|
@@ -7519,6 +7779,41 @@ var DEFAULT_MUTATORS = [
|
|
|
7519
7779
|
{ id: "politeness-prefix", fn: politenessPrefixMutator },
|
|
7520
7780
|
{ id: "whitespace-collapse", fn: whitespaceCollapseMutator }
|
|
7521
7781
|
];
|
|
7782
|
+
/**
 * Measures how much scenario scores change when user turns are paraphrased.
 *
 * For each scenario the unmodified turns are scored once as the baseline; then
 * every mutator is applied to every turn and the scenario is re-scored `reps`
 * times. Runs are awaited one at a time, in scenario → mutator → repetition
 * order.
 *
 * @param {object} args
 * @param {Array<{id: *, userTurns: Array<string>}>} args.scenarios - Scenarios to evaluate.
 * @param {Array<{name: string, mutator: function(string): string}>} args.mutators - Turn mutators.
 * @param {function(object): Promise<{score: number}>} args.runScenario - Scores one run.
 * @param {number} [args.reps=1] - Repetitions per mutator. Values below 1 and
 *   non-finite values are treated as 1; fractional values are rounded up.
 * @returns {Promise<{score: number, perScenario: Array<object>, mutators: Array<string>}>}
 *   `score` is mean paraphrased score divided by mean original score, clamped
 *   to [0, 1] (0 when the mean original score is not positive).
 */
async function paraphraseRobustnessScenarios(args) {
  // Math.max(1, NaN) is NaN, which would skip every repetition and turn the
  // per-mutator mean into 0/0; Infinity would never terminate. Fall back to one
  // repetition for both. Math.ceil matches the `r < reps` loop bound for
  // fractional inputs.
  const requestedReps = Math.max(1, args.reps ?? 1);
  const reps = Number.isFinite(requestedReps) ? Math.ceil(requestedReps) : 1;
  const mutatorNames = args.mutators.map((m) => m.name);
  const perScenario = [];
  for (const scenario of args.scenarios) {
    const baseline = await args.runScenario({
      id: scenario.id,
      userTurns: scenario.userTurns
    });
    const originalScore = baseline.score;
    const deltas = {};
    const paraphrasedAll = [];
    for (const m of args.mutators) {
      const scores = [];
      for (let r = 0; r < reps; r++) {
        const mutatedTurns = scenario.userTurns.map((t) => m.mutator(t));
        const out = await args.runScenario({
          id: scenario.id,
          userTurns: mutatedTurns
        });
        scores.push(out.score);
      }
      const mutatorMean = scores.reduce((a, b) => a + b, 0) / scores.length;
      deltas[m.name] = mutatorMean - originalScore;
      paraphrasedAll.push(...scores);
    }
    // With no mutators there is nothing to compare against; report the baseline.
    const paraphrasedMean = paraphrasedAll.length === 0
      ? originalScore
      : paraphrasedAll.reduce((a, b) => a + b, 0) / paraphrasedAll.length;
    perScenario.push({ id: scenario.id, originalScore, paraphrasedMean, deltas });
  }
  const scenarioCount = perScenario.length;
  const meanOriginal = scenarioCount === 0 ? 0 : perScenario.reduce((a, p) => a + p.originalScore, 0) / scenarioCount;
  const meanParaphrased = scenarioCount === 0 ? 0 : perScenario.reduce((a, p) => a + p.paraphrasedMean, 0) / scenarioCount;
  const ratio = meanOriginal <= 0 ? 0 : meanParaphrased / meanOriginal;
  const score = Math.max(0, Math.min(1, ratio));
  return { score, perScenario, mutators: mutatorNames };
}
|
|
7522
7817
|
|
|
7523
7818
|
// src/visual-diff.ts
|
|
7524
7819
|
function visualDiff(a, b, options = {}) {
|
|
@@ -8396,8 +8691,8 @@ async function prmBestOfN(store, grader, runIds) {
|
|
|
8396
8691
|
if (runIds.length === 0) throw new Error("prmBestOfN: at least 1 candidate required");
|
|
8397
8692
|
const graded = await Promise.all(runIds.map((id) => grader.grade(store, id)));
|
|
8398
8693
|
const ranked = [...graded].sort((a, b) => b.aggregateScore - a.aggregateScore);
|
|
8399
|
-
const
|
|
8400
|
-
const variance2 = graded.reduce((a, g) => a + (g.aggregateScore -
|
|
8694
|
+
const mean10 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
|
|
8695
|
+
const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean10) ** 2, 0) / graded.length;
|
|
8401
8696
|
return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
|
|
8402
8697
|
}
|
|
8403
8698
|
async function prmEnsembleBestOfN(store, graders, runIds) {
|
|
@@ -8419,8 +8714,8 @@ async function prmEnsembleBestOfN(store, graders, runIds) {
|
|
|
8419
8714
|
const ranked = [...byRun.values()].sort(
|
|
8420
8715
|
(a, b) => (bordaScores.get(b.runId) ?? 0) - (bordaScores.get(a.runId) ?? 0)
|
|
8421
8716
|
);
|
|
8422
|
-
const
|
|
8423
|
-
const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore -
|
|
8717
|
+
const mean10 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
|
|
8718
|
+
const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean10) ** 2, 0) / ranked.length;
|
|
8424
8719
|
return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
|
|
8425
8720
|
}
|
|
8426
8721
|
|
|
@@ -8747,10 +9042,11 @@ async function signManifest(m) {
|
|
|
8747
9042
|
const bytes = new TextEncoder().encode(JSON.stringify(canonical));
|
|
8748
9043
|
const digest = await globalThis.crypto.subtle.digest("SHA-256", bytes);
|
|
8749
9044
|
const hash = Array.from(new Uint8Array(digest)).map((b) => b.toString(16).padStart(2, "0")).join("");
|
|
8750
|
-
return { ...m, contentHash: hash };
|
|
9045
|
+
return { ...m, contentHash: hash, algo: "sha256-content" };
|
|
8751
9046
|
}
|
|
8752
9047
|
/**
 * Verifies a signed manifest by re-signing its content and comparing hashes.
 *
 * The `contentHash` and `algo` fields are removed before re-signing, so only
 * the remaining fields are fed back into `signManifest`.
 *
 * @param {object} m - Signed manifest.
 * @returns {Promise<boolean>} True when the recomputed hash equals `m.contentHash`.
 */
async function verifyManifest(m) {
  const expectedHash = m.contentHash;
  const unsigned = { ...m };
  delete unsigned.contentHash;
  delete unsigned.algo;
  const { contentHash: recomputedHash } = await signManifest(unsigned);
  return recomputedHash === expectedHash;
}
|
|
@@ -8950,8 +9246,8 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
|
|
|
8950
9246
|
const sRuns = runs.filter((r) => r.scenarioId === s.id);
|
|
8951
9247
|
const scores2 = sRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
|
|
8952
9248
|
if (scores2.length < 3) continue;
|
|
8953
|
-
const
|
|
8954
|
-
const variance2 = scores2.reduce((a, b) => a + (b -
|
|
9249
|
+
const mean10 = scores2.reduce((a, b) => a + b, 0) / scores2.length;
|
|
9250
|
+
const variance2 = scores2.reduce((a, b) => a + (b - mean10) ** 2, 0) / scores2.length;
|
|
8955
9251
|
if (variance2 > varianceThreshold) {
|
|
8956
9252
|
targets.push({
|
|
8957
9253
|
reason: "high-variance",
|
|
@@ -10989,7 +11285,7 @@ function defaultReferenceReplayMatcher(reference, candidate) {
|
|
|
10989
11285
|
const textScore = tokenJaccard(referenceText, candidateText);
|
|
10990
11286
|
const severityScore = reference.severity && candidate.severity ? normalize(reference.severity) === normalize(candidate.severity) ? 0.1 : -0.05 : 0;
|
|
10991
11287
|
const tagScore = tagOverlap(reference.tags, candidate.tags) * 0.15;
|
|
10992
|
-
const score =
|
|
11288
|
+
const score = clamp013(textScore * 0.85 + tagScore + severityScore);
|
|
10993
11289
|
return { score, reason: `token=${textScore.toFixed(2)} tags=${tagScore.toFixed(2)} severity=${severityScore.toFixed(2)}` };
|
|
10994
11290
|
}
|
|
10995
11291
|
function scoreScenario(scenario, matcher, threshold, matchStrategy) {
|
|
@@ -11089,7 +11385,7 @@ function scorePair(scenario, matcher, reference, candidate) {
|
|
|
11089
11385
|
if (!Number.isFinite(result.score)) {
|
|
11090
11386
|
throw new Error(`reference replay matcher returned non-finite score for ${scenario.id}:${reference.id}:${candidate.id}`);
|
|
11091
11387
|
}
|
|
11092
|
-
return { score:
|
|
11388
|
+
return { score: clamp013(result.score), reason: result.reason ?? "" };
|
|
11093
11389
|
}
|
|
11094
11390
|
function buildScenarioScore(scenario, matches2, falsePositives) {
|
|
11095
11391
|
const matched = matches2.filter((match) => match.matched).length;
|
|
@@ -11188,7 +11484,7 @@ function tokens(text) {
|
|
|
11188
11484
|
function normalize(text) {
|
|
11189
11485
|
return text.toLowerCase().replace(/[^a-z0-9]+/g, " ").trim();
|
|
11190
11486
|
}
|
|
11191
|
-
function
|
|
11487
|
+
/**
 * Limits a score to the range 0..1, treating non-finite input as 0.
 *
 * @param {number} value - Candidate value.
 * @returns {number} Clamped value.
 */
function clamp013(value) {
  const capped = Number.isFinite(value) ? Math.min(1, value) : 0;
  return Math.max(0, capped);
}
|
|
@@ -12653,7 +12949,7 @@ async function scoreOne(config, variant, scenarioId, rep, split) {
|
|
|
12653
12949
|
scenarioId,
|
|
12654
12950
|
rep,
|
|
12655
12951
|
ok: scored.ok ?? true,
|
|
12656
|
-
score:
|
|
12952
|
+
score: clamp014(scored.score),
|
|
12657
12953
|
cost: scored.costUsd ?? run.costUsd ?? 0,
|
|
12658
12954
|
durationMs: scored.durationMs ?? run.durationMs ?? 0,
|
|
12659
12955
|
metrics: {
|
|
@@ -12765,7 +13061,7 @@ function stableHash2(input) {
|
|
|
12765
13061
|
}
|
|
12766
13062
|
return h >>> 0;
|
|
12767
13063
|
}
|
|
12768
|
-
function
|
|
13064
|
+
/**
 * Limits a score to the range 0..1.
 *
 * @param {number} n - Candidate score.
 * @returns {number} `0` when `n` is not a finite number, otherwise `n`
 *   clamped to [0, 1].
 */
function clamp014(n) {
  if (Number.isFinite(n)) {
    const floored = Math.max(0, n);
    return Math.min(1, floored);
  }
  return 0;
}
|
|
@@ -12813,6 +13109,289 @@ function traceExcerpt(trace) {
|
|
|
12813
13109
|
return void 0;
|
|
12814
13110
|
}
|
|
12815
13111
|
|
|
13112
|
+
// src/release-confidence.ts
|
|
13113
|
+
// Default gate thresholds for evaluateReleaseConfidence. Caller-supplied
// `input.thresholds` are spread over these, so any key may be overridden.
var DEFAULT_THRESHOLDS = {
  // Flag `missing_corpus` when neither a dataset manifest nor scenarios are supplied.
  requireCorpus: true,
  // Minimum scenario count before `few_scenarios` is raised.
  minScenarioCount: 1,
  // Minimum number of runs tagged "search" / "holdout".
  minSearchRuns: 1,
  minHoldoutRuns: 1,
  // Require holdout scenarios in the corpus and holdout runs in the evidence.
  requireHoldout: true,
  // Quality floors for the pass rate and the mean score.
  minPassRate: 0.8,
  minMeanScore: 0.7,
  // Largest tolerated (search mean - holdout mean) score gap.
  maxOverfitGap: 0.15,
  // Budgets; infinite by default, so the efficiency checks never fire.
  maxMeanCostUsd: Infinity,
  maxP95WallMs: Infinity,
  // Every failed row must carry actionable side information (ASI).
  requireAsiForFailures: true,
  // A row whose score is below this value counts as failed.
  failureScoreThreshold: 0.5
};
|
|
13127
|
+
/**
 * Converts multi-shot trial records into the trace-evidence rows consumed by
 * evaluateReleaseConfidence.
 *
 * Field mapping per trial:
 *  - `variantId` becomes `candidateId`, `cost` becomes `costUsd`;
 *  - `split` is kept when it is "holdout" or "dev", anything else becomes "search";
 *  - `turnCount` is the length of `trace.turns` when that is an array, else undefined;
 *  - `failureMode` is "runtime_error" when `trial.error` is truthy, else undefined.
 *
 * @param {Array<object>} trials - Trial records.
 * @returns {Array<object>} One evidence row per trial, in input order.
 */
function releaseTraceEvidenceFromMultiShotTrials(trials) {
  const toEvidenceSplit = (split) => {
    if (split === "holdout") return "holdout";
    if (split === "dev") return "dev";
    return "search";
  };
  const countTurns = (trial) => {
    const turns = trial.trace?.turns;
    return Array.isArray(turns) ? turns.length : undefined;
  };
  return trials.map((trial) => {
    const failureMode = trial.error ? "runtime_error" : undefined;
    return {
      scenarioId: trial.scenarioId,
      candidateId: trial.variantId,
      split: toEvidenceSplit(trial.split),
      score: trial.score,
      ok: trial.ok,
      turnCount: countTurns(trial),
      costUsd: trial.cost,
      durationMs: trial.durationMs,
      failureMode,
      asi: trial.asi,
      metadata: trial.metadata
    };
  });
}
|
|
13142
|
+
/**
 * Builds a release-confidence scorecard from run records, trace evidence and
 * corpus information.
 *
 * Steps:
 *  1. Merge `input.thresholds` over DEFAULT_THRESHOLDS.
 *  2. Narrow runs and traces to the candidate (or exclude the baseline).
 *  3. Derive metrics: counts, pass rate, mean scores, overfit gap, cost,
 *     p95 latency, failure/ASI tallies and distribution counts.
 *  4. Run the corpus, quality, generalization, diagnostics and efficiency
 *     checks, each of which appends to a shared issue list.
 *  5. Derive per-axis scores and the overall status.
 *
 * @param {object} input - Evaluation input. Fields read here: `thresholds`,
 *   `candidateId`, `baselineId`, `runs`, `traces`, `scenarios`, `dataset`
 *   (`scenarioCount`, `splitCounts`), `gateDecision`, `target`.
 * @returns {object} Scorecard with `status` ("fail" when any issue is
 *   critical, "warn" when only non-critical issues exist, otherwise "pass"),
 *   `promote` (true only when status is "pass" and the gate decision, if one
 *   was supplied, also promotes), plus `axes`, `issues`, `metrics`, `dataset`,
 *   `gateDecision` and a one-line `summary`.
 */
function evaluateReleaseConfidence(input) {
  const thresholds = { ...DEFAULT_THRESHOLDS, ...input.thresholds };
  const candidateId = input.candidateId ?? null;
  const gateDecision = input.gateDecision ?? null;
  const runs = filterCandidate(input.runs ?? [], candidateId, input.baselineId);
  const traces = filterTraceCandidate(input.traces ?? [], candidateId, input.baselineId);
  const scenarios = input.scenarios ?? [];

  // A dataset manifest, when present, takes precedence over counting scenarios.
  const scenarioCount = input.dataset?.scenarioCount ?? scenarios.length;
  const splitCounts = input.dataset?.splitCounts ?? countScenarioSplits(scenarios);

  const searchScores = scoresFor(runs, "search");
  const holdoutScores = scoresFor(runs, "holdout");
  const allScores = [...searchScores, ...holdoutScores];
  const traceScores = traces.map((t) => t.score).filter(isFiniteNumber);
  // Trace scores are only used for the mean when no run produced a score.
  const scoreUniverse = allScores.length > 0 ? allScores : traceScores;

  const searchRuns = runs.filter((r) => r.splitTag === "search").length;
  const holdoutRuns = runs.filter((r) => r.splitTag === "holdout").length;
  const searchMeanScore = mean8(searchScores);
  const holdoutMeanScore = mean8(holdoutScores);

  // Computed once; both the failure count and the ASI coverage derive from it.
  const failures = failedRows(runs, traces, thresholds.failureScoreThreshold);

  const metrics = {
    scenarioCount,
    searchRuns,
    holdoutRuns,
    passRate: passRate(runs, traces, thresholds.failureScoreThreshold),
    meanScore: mean8(scoreUniverse),
    searchMeanScore,
    holdoutMeanScore,
    overfitGap: safeDiff2(searchMeanScore, holdoutMeanScore),
    meanCostUsd: mean8([...runs.map((r) => r.costUsd), ...traces.map((t) => t.costUsd).filter(isFiniteNumber)]),
    p95WallMs: percentile([...runs.map((r) => r.wallMs), ...traces.map((t) => t.durationMs).filter(isFiniteNumber)], 0.95),
    failedRows: failures.length,
    failuresWithAsi: failures.filter((row) => row.hasAsi).length,
    singleShotTraces: traces.filter((t) => t.turnCount === 1).length,
    multiShotTraces: traces.filter((t) => (t.turnCount ?? 0) > 1).length,
    splitCounts,
    domainCounts: countDomains(scenarios),
    failureModeCounts: countFailureModes(runs, traces, thresholds.failureScoreThreshold),
    responsibleSurfaceCounts: countResponsibleSurfaces(traces)
  };

  // Each check appends to `issues`; the order fixes the order of reported codes.
  const issues = [];
  checkCorpus(input, thresholds, metrics, issues);
  checkQuality(thresholds, metrics, issues);
  checkGeneralization(gateDecision, thresholds, metrics, issues);
  checkDiagnostics(thresholds, metrics, issues);
  checkEfficiency(thresholds, metrics, issues);

  const axes = buildAxes(metrics, thresholds, gateDecision, issues);
  const status = issues.some((i) => i.severity === "critical") ? "fail" : issues.length > 0 ? "warn" : "pass";

  return {
    target: input.target,
    candidateId,
    baselineId: input.baselineId ?? null,
    status,
    promote: status === "pass" && (gateDecision ? gateDecision.promote : true),
    axes,
    issues,
    metrics,
    dataset: input.dataset ?? null,
    gateDecision,
    summary: renderSummary(input.target, status, metrics, issues)
  };
}
|
|
13201
|
+
/**
 * Evaluates release confidence and throws when the result is a failure.
 *
 * @param {object} input - Same input as evaluateReleaseConfidence.
 * @returns {object} The scorecard, when its status is not "fail".
 * @throws {Error} With the scorecard summary as message when status is "fail".
 */
function assertReleaseConfidence(input) {
  const scorecard = evaluateReleaseConfidence(input);
  const failed = scorecard.status === "fail";
  if (!failed) {
    return scorecard;
  }
  throw new Error(scorecard.summary);
}
|
|
13208
|
+
/**
 * Selects the run records that belong to the candidate under evaluation.
 *
 * @param {Array<object>} runs - Run records carrying a `candidateId`.
 * @param {string|null|undefined} candidateId - When truthy, keep only runs with this id.
 * @param {string|null|undefined} baselineId - Otherwise, when truthy, drop runs with this id.
 * @returns {Array<object>} A new array; a shallow copy of `runs` when neither id is given.
 */
function filterCandidate(runs, candidateId, baselineId) {
  let keep = null;
  if (candidateId) {
    keep = (run) => run.candidateId === candidateId;
  } else if (baselineId) {
    keep = (run) => run.candidateId !== baselineId;
  }
  return keep === null ? [...runs] : runs.filter(keep);
}
|
|
13213
|
+
/**
 * Selects the trace-evidence rows that belong to the candidate under evaluation.
 *
 * Traces whose `candidateId` is undefined are always kept.
 *
 * @param {Array<object>} traces - Trace evidence rows.
 * @param {string|null|undefined} candidateId - When truthy, keep traces with this id.
 * @param {string|null|undefined} baselineId - Otherwise, when truthy, drop traces with this id.
 * @returns {Array<object>} A new array; a shallow copy of `traces` when neither id is given.
 */
function filterTraceCandidate(traces, candidateId, baselineId) {
  const unattributed = (trace) => trace.candidateId === undefined;
  if (candidateId) {
    return traces.filter((trace) => unattributed(trace) || trace.candidateId === candidateId);
  }
  if (baselineId) {
    return traces.filter((trace) => unattributed(trace) || trace.candidateId !== baselineId);
  }
  return [...traces];
}
|
|
13218
|
+
/**
 * Appends corpus-axis issues to `issues`.
 *
 * Checks, in order:
 *  - `missing_corpus`: a corpus is required but neither a dataset manifest
 *    nor any scenarios were supplied;
 *  - `few_scenarios`: fewer scenarios than `thresholds.minScenarioCount`;
 *  - `missing_holdout_split`: a holdout is required but the split counts
 *    report no holdout scenarios.
 *
 * @param {object} input - Evaluation input (`dataset`, `scenarios` are read).
 * @param {object} thresholds - Effective thresholds.
 * @param {object} metrics - Computed metrics (`scenarioCount`, `splitCounts`).
 * @param {Array<object>} issues - Issue list, mutated in place.
 * @returns {void}
 */
function checkCorpus(input, thresholds, metrics, issues) {
  if (thresholds.requireCorpus && !input.dataset && (input.scenarios?.length ?? 0) === 0) {
    issues.push({ axis: "corpus", severity: "critical", code: "missing_corpus", detail: "No Dataset manifest or scenarios supplied." });
  }
  if (metrics.scenarioCount < thresholds.minScenarioCount) {
    issues.push({ axis: "corpus", severity: "critical", code: "few_scenarios", detail: `${metrics.scenarioCount} scenario(s) < min ${thresholds.minScenarioCount}.` });
  }
  // Split counts may come from a caller-supplied manifest that has no
  // `holdout` key. A missing key means "no holdout scenarios" and must not
  // slip past the gate, so anything that is not a positive count is flagged.
  const holdoutCount = metrics.splitCounts.holdout ?? 0;
  if (thresholds.requireHoldout && !(holdoutCount > 0)) {
    issues.push({ axis: "corpus", severity: "critical", code: "missing_holdout_split", detail: "Corpus has no holdout scenarios." });
  }
}
|
|
13229
|
+
/**
 * Appends quality-axis issues to `issues`.
 *
 * Raises `few_search_runs`, `low_pass_rate` and `low_mean_score` when the
 * corresponding metric is below its threshold. A NaN metric compares false
 * and therefore raises nothing.
 *
 * @param {object} thresholds - Effective thresholds.
 * @param {object} metrics - Computed metrics (`searchRuns`, `passRate`, `meanScore`).
 * @param {Array<object>} issues - Issue list, mutated in place.
 * @returns {void}
 */
function checkQuality(thresholds, metrics, issues) {
  const report = (code, detail) => {
    issues.push({ axis: "quality", severity: "critical", code, detail });
  };
  const tooFewRuns = metrics.searchRuns < thresholds.minSearchRuns;
  if (tooFewRuns) {
    report("few_search_runs", `${metrics.searchRuns} search run(s) < min ${thresholds.minSearchRuns}.`);
  }
  const lowPassRate = metrics.passRate < thresholds.minPassRate;
  if (lowPassRate) {
    report("low_pass_rate", `passRate ${fmt3(metrics.passRate)} < ${fmt3(thresholds.minPassRate)}.`);
  }
  const lowMean = metrics.meanScore < thresholds.minMeanScore;
  if (lowMean) {
    report("low_mean_score", `meanScore ${fmt3(metrics.meanScore)} < ${fmt3(thresholds.minMeanScore)}.`);
  }
}
|
|
13240
|
+
/**
 * Appends generalization-axis issues to `issues`.
 *
 * Raises `few_holdout_runs` (only when a holdout is required), `overfit_gap`
 * (only when the gap is finite and above the maximum) and a `gate_<code>`
 * issue when a gate decision is present and does not promote.
 *
 * @param {object|null} gateDecision - Optional gate decision (`promote`,
 *   `rejectionCode`, `reason` are read).
 * @param {object} thresholds - Effective thresholds.
 * @param {object} metrics - Computed metrics (`holdoutRuns`, `overfitGap`).
 * @param {Array<object>} issues - Issue list, mutated in place.
 * @returns {void}
 */
function checkGeneralization(gateDecision, thresholds, metrics, issues) {
  const report = (code, detail) => {
    issues.push({ axis: "generalization", severity: "critical", code, detail });
  };
  const { holdoutRuns, overfitGap } = metrics;
  if (thresholds.requireHoldout && holdoutRuns < thresholds.minHoldoutRuns) {
    report("few_holdout_runs", `${holdoutRuns} holdout run(s) < min ${thresholds.minHoldoutRuns}.`);
  }
  // A non-finite gap means it could not be measured; it is skipped here.
  if (Number.isFinite(overfitGap) && overfitGap > thresholds.maxOverfitGap) {
    report("overfit_gap", `search-holdout gap ${fmt3(overfitGap)} > ${fmt3(thresholds.maxOverfitGap)}.`);
  }
  if (gateDecision && !gateDecision.promote) {
    const suffix = gateDecision.rejectionCode ?? "reject";
    report(`gate_${suffix}`, gateDecision.reason);
  }
}
|
|
13251
|
+
/**
 * Appends a diagnostics-axis issue when some failed rows lack actionable
 * side information (ASI).
 *
 * Does nothing when `thresholds.requireAsiForFailures` is falsy.
 *
 * @param {object} thresholds - Effective thresholds.
 * @param {object} metrics - Computed metrics (`failedRows`, `failuresWithAsi`).
 * @param {Array<object>} issues - Issue list, mutated in place.
 * @returns {void}
 */
function checkDiagnostics(thresholds, metrics, issues) {
  if (thresholds.requireAsiForFailures && metrics.failedRows > metrics.failuresWithAsi) {
    const uncovered = metrics.failedRows - metrics.failuresWithAsi;
    const issue = {
      axis: "diagnostics",
      severity: "critical",
      code: "missing_failure_asi",
      detail: `${uncovered} failed row(s) have no actionable side information.`
    };
    issues.push(issue);
  }
}
|
|
13262
|
+
/**
 * Appends efficiency-axis issues to `issues`.
 *
 * Raises `cost_budget` when the mean cost exceeds `maxMeanCostUsd` and
 * `latency_budget` when the p95 wall time exceeds `maxP95WallMs`. NaN metrics
 * and infinite budgets never trigger, because the comparisons are false.
 *
 * @param {object} thresholds - Effective thresholds.
 * @param {object} metrics - Computed metrics (`meanCostUsd`, `p95WallMs`).
 * @param {Array<object>} issues - Issue list, mutated in place.
 * @returns {void}
 */
function checkEfficiency(thresholds, metrics, issues) {
  const overCostBudget = metrics.meanCostUsd > thresholds.maxMeanCostUsd;
  if (overCostBudget) {
    const detail = `meanCostUsd ${fmt3(metrics.meanCostUsd)} > ${fmt3(thresholds.maxMeanCostUsd)}.`;
    issues.push({ axis: "efficiency", severity: "critical", code: "cost_budget", detail });
  }
  const overLatencyBudget = metrics.p95WallMs > thresholds.maxP95WallMs;
  if (overLatencyBudget) {
    const detail = `p95WallMs ${fmt3(metrics.p95WallMs)} > ${fmt3(thresholds.maxP95WallMs)}.`;
    issues.push({ axis: "efficiency", severity: "critical", code: "latency_budget", detail });
  }
}
|
|
13270
|
+
/**
 * Produces the five axis entries of the scorecard: corpus, quality,
 * generalization, diagnostics and efficiency, in that order.
 *
 * @param {object} metrics - Computed metrics.
 * @param {object} thresholds - Effective thresholds.
 * @param {object|null} gateDecision - Optional gate decision; a non-promoting
 *   decision forces the generalization score to 0.
 * @param {Array<object>} issues - All issues raised so far; each axis derives
 *   its status from the issues tagged with its name.
 * @returns {Array<object>} Axis entries as built by `axis`.
 */
function buildAxes(metrics, thresholds, gateDecision, issues) {
  const corpusScore = bounded(metrics.scenarioCount / Math.max(1, thresholds.minScenarioCount));
  const corpusDetail = `${metrics.scenarioCount} scenarios; holdout=${metrics.splitCounts.holdout}`;

  // The weaker of pass rate and mean score drives the quality axis.
  const qualityScore = Math.min(metrics.passRate, metrics.meanScore);
  const qualityDetail = `passRate=${fmt3(metrics.passRate)} meanScore=${fmt3(metrics.meanScore)}`;

  const gateRejected = Boolean(gateDecision) && !gateDecision.promote;
  const generalizationScore = gateRejected ? 0 : gapScore(metrics.overfitGap, thresholds.maxOverfitGap);
  const generalizationDetail = `holdoutRuns=${metrics.holdoutRuns} overfitGap=${fmt3(metrics.overfitGap)}`;

  // With no failed rows there is nothing to diagnose, so the axis scores 1.
  const diagnosticsScore = metrics.failedRows === 0 ? 1 : metrics.failuresWithAsi / metrics.failedRows;
  const diagnosticsDetail = `failuresWithAsi=${metrics.failuresWithAsi}/${metrics.failedRows}`;

  const efficiency = efficiencyScore(metrics, thresholds);
  const efficiencyDetail = `meanCostUsd=${fmt3(metrics.meanCostUsd)} p95WallMs=${fmt3(metrics.p95WallMs)}`;

  return [
    axis("corpus", issues, corpusScore, corpusDetail),
    axis("quality", issues, qualityScore, qualityDetail),
    axis("generalization", issues, generalizationScore, generalizationDetail),
    axis("diagnostics", issues, diagnosticsScore, diagnosticsDetail),
    axis("efficiency", issues, efficiency, efficiencyDetail)
  ];
}
|
|
13279
|
+
/**
 * Builds one scorecard axis entry.
 *
 * @param {string} name - Axis name; issues are matched on `issue.axis`.
 * @param {Array<object>} issues - All issues raised by the checks.
 * @param {number} score - Raw axis score; clamped through `bounded`.
 * @param {string} detail - Human-readable detail string.
 * @returns {{name: string, status: string, score: number, detail: string}}
 *   `status` is "fail" if any matching issue is critical, "warn" if there are
 *   matching issues but none critical, otherwise "pass".
 */
function axis(name, issues, score, detail) {
  let hasIssue = false;
  let hasCritical = false;
  for (const issue of issues) {
    if (issue.axis !== name) continue;
    hasIssue = true;
    if (issue.severity === "critical") {
      hasCritical = true;
      break;
    }
  }
  let status = "pass";
  if (hasCritical) {
    status = "fail";
  } else if (hasIssue) {
    status = "warn";
  }
  return { name, status, score: bounded(score), detail };
}
|
|
13284
|
+
/**
 * Counts scenarios per split.
 *
 * The result always contains `train`, `dev`, `test` and `holdout`. A scenario
 * without a `split` is counted as `train`. A scenario with any other split
 * name is counted under that name, instead of producing `NaN` by incrementing
 * a key that does not exist.
 *
 * @param {Array<{split?: string}>} scenarios - Scenarios to tally.
 * @returns {Record<string, number>} Scenario count per split.
 */
function countScenarioSplits(scenarios) {
  const counts = { train: 0, dev: 0, test: 0, holdout: 0 };
  for (const scenario of scenarios) {
    const split = scenario.split ?? "train";
    // Own-property check so inherited names (e.g. "constructor") start at 0.
    const previous = Object.prototype.hasOwnProperty.call(counts, split) ? counts[split] : 0;
    counts[split] = previous + 1;
  }
  return counts;
}
|
|
13289
|
+
/**
 * Counts scenarios per domain.
 *
 * The domain is `tags.domain`, falling back to `tags.category`, falling back
 * to the literal "uncategorized".
 *
 * @param {Array<{tags?: object}>} scenarios - Scenarios to tally.
 * @returns {Record<string, number>} Scenario count per domain label.
 */
function countDomains(scenarios) {
  const tally = {};
  for (const { tags } of scenarios) {
    const label = tags?.domain ?? tags?.category ?? "uncategorized";
    const seen = tally[label] ?? 0;
    tally[label] = seen + 1;
  }
  return tally;
}
|
|
13297
|
+
/**
 * Tallies failure modes across runs and traces.
 *
 * A run counts as failed when it has a truthy `failureMode` or its score
 * (`outcome.holdoutScore`, else `outcome.searchScore`) is defined and below
 * `threshold`; its mode is `failureMode`, else "low_score".
 * A trace counts as failed when it has a truthy `failureMode`, `ok === false`,
 * or a defined score below `threshold`; its mode is `failureMode`, else
 * "not_ok" when `ok === false`, else "low_score".
 *
 * @param {Array<object>} runs - Run records.
 * @param {Array<object>} traces - Trace evidence rows.
 * @param {number} threshold - Failure score threshold.
 * @returns {Record<string, number>} Count per failure mode.
 */
function countFailureModes(runs, traces, threshold) {
  const tally = {};
  const bump = (mode) => {
    tally[mode] = (tally[mode] ?? 0) + 1;
  };
  const belowThreshold = (score) => score !== undefined && score < threshold;

  for (const run of runs) {
    const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
    if (!run.failureMode && !belowThreshold(score)) continue;
    bump(run.failureMode ?? "low_score");
  }

  for (const trace of traces) {
    const notOk = trace.ok === false;
    if (!trace.failureMode && !notOk && !belowThreshold(trace.score)) continue;
    const fallback = notOk ? "not_ok" : "low_score";
    bump(trace.failureMode ?? fallback);
  }

  return tally;
}
|
|
13314
|
+
/**
 * Tallies the `responsibleSurface` of every ASI entry attached to the traces.
 *
 * Traces without `asi` contribute nothing; entries without a
 * `responsibleSurface` are counted under "unknown".
 *
 * @param {Array<{asi?: Array<object>}>} traces - Trace evidence rows.
 * @returns {Record<string, number>} Count per responsible surface.
 */
function countResponsibleSurfaces(traces) {
  const tally = {};
  for (const trace of traces) {
    const entries = trace.asi ?? [];
    for (const entry of entries) {
      const key = entry.responsibleSurface ?? "unknown";
      const seen = tally[key] ?? 0;
      tally[key] = seen + 1;
    }
  }
  return tally;
}
|
|
13324
|
+
/**
 * Lists the failed runs and traces, recording for each whether actionable
 * side information (ASI) is available.
 *
 * Runs fail on a truthy `failureMode` or a defined score (`holdoutScore`,
 * else `searchScore`) below `threshold`; a failed run has ASI when
 * `outcome.raw.asi` is a number greater than 0.
 * Traces fail on a truthy `failureMode`, `ok === false`, or a defined score
 * below `threshold`; a failed trace has ASI when its `asi` array is non-empty.
 *
 * @param {Array<object>} runs - Run records.
 * @param {Array<object>} traces - Trace evidence rows.
 * @param {number} threshold - Failure score threshold.
 * @returns {Array<{hasAsi: boolean}>} One row per failure, runs first.
 */
function failedRows(runs, traces, threshold) {
  const rows = [];
  const belowThreshold = (score) => score !== undefined && score < threshold;

  for (const run of runs) {
    const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
    if (!run.failureMode && !belowThreshold(score)) continue;
    const asiMetric = run.outcome.raw.asi;
    const hasAsi = typeof asiMetric === "number" && asiMetric > 0;
    rows.push({ hasAsi });
  }

  for (const trace of traces) {
    const failed = trace.failureMode || trace.ok === false || belowThreshold(trace.score);
    if (!failed) continue;
    const asiCount = trace.asi?.length ?? 0;
    rows.push({ hasAsi: asiCount > 0 });
  }

  return rows;
}
|
|
13340
|
+
/**
 * Fraction of runs and traces that passed.
 *
 * A run passes when it has no truthy `failureMode` and its score
 * (`holdoutScore`, else `searchScore`) is defined and at least `threshold`.
 * A trace passes when `ok` is not `false` and its score is either undefined
 * or at least `threshold`.
 *
 * @param {Array<object>} runs - Run records.
 * @param {Array<object>} traces - Trace evidence rows.
 * @param {number} threshold - Failure score threshold.
 * @returns {number} Passed / total, or 0 when there is nothing to evaluate.
 */
function passRate(runs, traces, threshold) {
  const runPassed = (run) => {
    const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
    if (run.failureMode) return false;
    return score !== undefined && score >= threshold;
  };
  const tracePassed = (trace) => {
    if (trace.ok === false) return false;
    return trace.score === undefined || trace.score >= threshold;
  };

  const verdicts = runs.map((run) => runPassed(run)).concat(traces.map((trace) => tracePassed(trace)));
  const total = verdicts.length;
  if (total === 0) return 0;

  let passed = 0;
  for (const verdict of verdicts) {
    if (verdict) passed += 1;
  }
  return passed / total;
}
|
|
13351
|
+
/**
 * Collects the finite scores of the runs tagged with the given split.
 *
 * For "holdout" the score is `outcome.holdoutScore`; for any other split it
 * is `outcome.searchScore`. Non-finite or missing scores are dropped.
 *
 * @param {Array<object>} runs - Run records.
 * @param {string} split - Split tag to select (`run.splitTag`).
 * @returns {Array<number>} Finite scores, in run order.
 */
function scoresFor(runs, split) {
  const readScore = split === "holdout"
    ? (run) => run.outcome.holdoutScore
    : (run) => run.outcome.searchScore;
  const inSplit = runs.filter((run) => run.splitTag === split);
  const rawScores = inSplit.map((run) => readScore(run));
  return rawScores.filter(isFiniteNumber);
}
|
|
13354
|
+
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {Array<number>} xs - Values to average.
 * @returns {number} The mean, or NaN for an empty list.
 */
function mean8(xs) {
  const count = xs.length;
  if (count === 0) {
    return Number.NaN;
  }
  let total = 0;
  for (const x of xs) {
    total = total + x;
  }
  return total / count;
}
|
|
13358
|
+
/**
 * Nearest-rank percentile of a list of numbers.
 *
 * Sorts a copy ascending and returns the element at index
 * `ceil(p * length) - 1`, clamped to the valid index range.
 *
 * @param {Array<number>} xs - Values; the input is not mutated.
 * @param {number} p - Percentile as a fraction (e.g. 0.95).
 * @returns {number} The selected element, or NaN for an empty list.
 */
function percentile(xs, p) {
  if (xs.length === 0) return Number.NaN;
  const ascending = [...xs];
  ascending.sort((left, right) => left - right);
  const lastIndex = ascending.length - 1;
  const rank = Math.ceil(p * ascending.length) - 1;
  const index = Math.min(lastIndex, Math.max(0, rank));
  return ascending[index];
}
|
|
13363
|
+
/**
 * Type guard for finite numbers.
 *
 * @param {unknown} value - Value to test.
 * @returns {boolean} True only for primitive numbers that are neither NaN
 *   nor infinite.
 */
function isFiniteNumber(value) {
  if (typeof value !== "number") {
    return false;
  }
  return Number.isFinite(value);
}
|
|
13366
|
+
/**
 * Difference `a - b` that propagates "unknown" as NaN.
 *
 * @param {number} a - Minuend.
 * @param {number} b - Subtrahend.
 * @returns {number} `a - b` when both are finite, otherwise NaN.
 */
function safeDiff2(a, b) {
  const bothFinite = Number.isFinite(a) && Number.isFinite(b);
  return bothFinite ? a - b : Number.NaN;
}
|
|
13370
|
+
/**
 * Converts an overfit gap into a 0..1 score.
 *
 * @param {number} gap - Observed gap (search mean minus holdout mean).
 * @param {number} maxGap - Largest tolerated gap.
 * @returns {number} 0 when the gap is not finite. When `maxGap <= 0`, 1 if
 *   the gap is non-positive, else 0. Otherwise `1 - max(0, gap) / maxGap`
 *   passed through `bounded`.
 */
function gapScore(gap, maxGap) {
  const measurable = Number.isFinite(gap);
  if (!measurable) {
    return 0;
  }
  if (maxGap <= 0) {
    // No tolerance configured: all-or-nothing.
    if (gap <= 0) return 1;
    return 0;
  }
  const excess = Math.max(0, gap);
  return bounded(1 - excess / maxGap);
}
|
|
13375
|
+
/**
 * Scores efficiency as the worse of the cost and latency budget ratios.
 *
 * Each ratio is `budget / max(observed, 1e-12)` passed through `bounded`.
 * A ratio is 1 when either the budget or the observation is not finite
 * (no budget configured, or nothing measured).
 *
 * @param {object} metrics - Computed metrics (`meanCostUsd`, `p95WallMs`).
 * @param {object} thresholds - Effective thresholds (`maxMeanCostUsd`, `maxP95WallMs`).
 * @returns {number} The smaller of the two ratios.
 */
function efficiencyScore(metrics, thresholds) {
  const budgetRatio = (budget, observed) => {
    if (!Number.isFinite(budget) || !Number.isFinite(observed)) {
      return 1;
    }
    // The floor on the divisor avoids dividing by zero for free / instant runs.
    return bounded(budget / Math.max(observed, 1e-12));
  };
  const costRatio = budgetRatio(thresholds.maxMeanCostUsd, metrics.meanCostUsd);
  const latencyRatio = budgetRatio(thresholds.maxP95WallMs, metrics.p95WallMs);
  return Math.min(costRatio, latencyRatio);
}
|
|
13380
|
+
/**
 * Clamps a value into [0, 1].
 *
 * @param {number} x - Candidate value.
 * @returns {number} 0 for anything that is not a finite number, otherwise `x`
 *   limited to the range 0..1.
 */
function bounded(x) {
  if (Number.isFinite(x)) {
    const nonNegative = Math.max(0, x);
    return Math.min(1, nonNegative);
  }
  return 0;
}
|
|
13384
|
+
/**
 * Renders the one-line scorecard summary.
 *
 * Format: `release confidence <status>: <target>; <metrics>` followed by
 * `; issues=<comma-separated codes>` when there is at least one issue.
 *
 * @param {string} target - Release target label.
 * @param {string} status - Overall status.
 * @param {object} metrics - Computed metrics.
 * @param {Array<{code: string}>} issues - Issues raised by the checks.
 * @returns {string} The summary line.
 */
function renderSummary(target, status, metrics, issues) {
  const segments = [
    `release confidence ${status}: ${target}`,
    `scenarios=${metrics.scenarioCount} searchRuns=${metrics.searchRuns} holdoutRuns=${metrics.holdoutRuns} passRate=${fmt3(metrics.passRate)} meanScore=${fmt3(metrics.meanScore)}`
  ];
  if (issues.length !== 0) {
    const codes = issues.map((issue) => issue.code);
    segments.push(`issues=${codes.join(",")}`);
  }
  return segments.join("; ");
}
|
|
13390
|
+
/**
 * Formats a metric for display.
 *
 * @param {number} x - Value to format.
 * @returns {string} Four fixed decimals for finite numbers; otherwise the
 *   plain string form (e.g. "NaN", "Infinity").
 */
function fmt3(x) {
  return Number.isFinite(x) ? x.toFixed(4) : String(x);
}
|
|
13394
|
+
|
|
12816
13395
|
// src/jsonl-trial-cache.ts
|
|
12817
13396
|
import { appendFileSync as appendFileSync4, existsSync as existsSync6, mkdirSync as mkdirSync4, readFileSync as readFileSync5 } from "fs";
|
|
12818
13397
|
import { dirname as dirname4 } from "path";
|
|
@@ -13458,9 +14037,9 @@ function passOrthogonality(input) {
|
|
|
13458
14037
|
sims.push(cosineSimilarity(vectors[i], vectors[j]));
|
|
13459
14038
|
}
|
|
13460
14039
|
}
|
|
13461
|
-
const
|
|
14040
|
+
const mean10 = sims.length === 0 ? 0 : sims.reduce((a, b) => a + b, 0) / sims.length;
|
|
13462
14041
|
return {
|
|
13463
|
-
orthogonality: Math.max(0, Math.min(1, 1 -
|
|
14042
|
+
orthogonality: Math.max(0, Math.min(1, 1 - mean10)),
|
|
13464
14043
|
passCount: passes.length,
|
|
13465
14044
|
similarities: sims
|
|
13466
14045
|
};
|
|
@@ -13506,8 +14085,8 @@ function bootstrapCi(baseline, candidate, options = {}) {
|
|
|
13506
14085
|
const iterations = options.iterations ?? 1e3;
|
|
13507
14086
|
const minTotal = options.minTotalSamples ?? 6;
|
|
13508
14087
|
const rng = mulberry32(options.seed ?? hashSeed(baseline, candidate));
|
|
13509
|
-
const baselineMean =
|
|
13510
|
-
const candidateMean =
|
|
14088
|
+
const baselineMean = mean9(baseline);
|
|
14089
|
+
const candidateMean = mean9(candidate);
|
|
13511
14090
|
const delta = candidateMean - baselineMean;
|
|
13512
14091
|
if (baseline.length + candidate.length < minTotal || baseline.length === 0 || candidate.length === 0) {
|
|
13513
14092
|
return {
|
|
@@ -13525,7 +14104,7 @@ function bootstrapCi(baseline, candidate, options = {}) {
|
|
|
13525
14104
|
for (let i = 0; i < iterations; i++) {
|
|
13526
14105
|
const bResample = resample(baseline, rng);
|
|
13527
14106
|
const cResample = resample(candidate, rng);
|
|
13528
|
-
deltas[i] =
|
|
14107
|
+
deltas[i] = mean9(cResample) - mean9(bResample);
|
|
13529
14108
|
}
|
|
13530
14109
|
deltas.sort((a, b) => a - b);
|
|
13531
14110
|
const lowerIdx = Math.floor(alpha / 2 * iterations);
|
|
@@ -13548,7 +14127,7 @@ function bootstrapCi(baseline, candidate, options = {}) {
|
|
|
13548
14127
|
verdict
|
|
13549
14128
|
};
|
|
13550
14129
|
}
|
|
13551
|
-
function
|
|
14130
|
+
function mean9(xs) {
|
|
13552
14131
|
if (xs.length === 0) return 0;
|
|
13553
14132
|
let s = 0;
|
|
13554
14133
|
for (const x of xs) s += x;
|
|
@@ -13865,6 +14444,7 @@ export {
|
|
|
13865
14444
|
TraceEmitter,
|
|
13866
14445
|
TrialTelemetry,
|
|
13867
14446
|
UNIVERSAL_FINDERS,
|
|
14447
|
+
acquisitionPlansForKnowledgeGaps,
|
|
13868
14448
|
adversarialJudge,
|
|
13869
14449
|
aggregateLlm,
|
|
13870
14450
|
aggregateRunScore,
|
|
@@ -13872,6 +14452,7 @@ export {
|
|
|
13872
14452
|
analyzeAntiSlop,
|
|
13873
14453
|
analyzeSeries,
|
|
13874
14454
|
argHash,
|
|
14455
|
+
assertReleaseConfidence,
|
|
13875
14456
|
assignFeedbackSplit,
|
|
13876
14457
|
attributeCounterfactuals,
|
|
13877
14458
|
deterministicSplit as benchmarkDeterministicSplit,
|
|
@@ -13879,6 +14460,7 @@ export {
|
|
|
13879
14460
|
benjaminiHochberg,
|
|
13880
14461
|
bhAdjust,
|
|
13881
14462
|
bisect,
|
|
14463
|
+
blockingKnowledgeEval,
|
|
13882
14464
|
bonferroni,
|
|
13883
14465
|
bootstrapCi,
|
|
13884
14466
|
budgetBreachView,
|
|
@@ -13892,9 +14474,10 @@ export {
|
|
|
13892
14474
|
callLlmJson,
|
|
13893
14475
|
canaryLeakView,
|
|
13894
14476
|
causalAttribution,
|
|
14477
|
+
checkBehavioralCanary,
|
|
13895
14478
|
checkCanaries,
|
|
13896
14479
|
checkSlos,
|
|
13897
|
-
clamp01,
|
|
14480
|
+
clamp012 as clamp01,
|
|
13898
14481
|
classifyEuAiRisk,
|
|
13899
14482
|
classifyFailure,
|
|
13900
14483
|
codeExecutionJudge,
|
|
@@ -13942,6 +14525,7 @@ export {
|
|
|
13942
14525
|
evaluateContract,
|
|
13943
14526
|
evaluateHypothesis,
|
|
13944
14527
|
evaluateOracles,
|
|
14528
|
+
evaluateReleaseConfidence,
|
|
13945
14529
|
executeScenario,
|
|
13946
14530
|
expectAgent,
|
|
13947
14531
|
exportRewardModel,
|
|
@@ -14014,6 +14598,7 @@ export {
|
|
|
14014
14598
|
pairedTTest,
|
|
14015
14599
|
pairedWilcoxon,
|
|
14016
14600
|
paraphraseRobustness,
|
|
14601
|
+
paraphraseRobustnessScenarios,
|
|
14017
14602
|
paretoChart,
|
|
14018
14603
|
paretoFrontier,
|
|
14019
14604
|
paretoFrontierWithCrowding,
|
|
@@ -14041,6 +14626,7 @@ export {
|
|
|
14041
14626
|
regexMatch,
|
|
14042
14627
|
regexMatches,
|
|
14043
14628
|
regressionView,
|
|
14629
|
+
releaseTraceEvidenceFromMultiShotTrials,
|
|
14044
14630
|
renderMarkdown,
|
|
14045
14631
|
renderMarkdownReport,
|
|
14046
14632
|
renderPlaybookMarkdown,
|
|
@@ -14058,6 +14644,7 @@ export {
|
|
|
14058
14644
|
rowWhere,
|
|
14059
14645
|
runAgentControlLoop,
|
|
14060
14646
|
runAssertions,
|
|
14647
|
+
runBehavioralCanaries,
|
|
14061
14648
|
runCanaries,
|
|
14062
14649
|
runCounterfactual,
|
|
14063
14650
|
runE2EWorkflow,
|
|
@@ -14081,6 +14668,7 @@ export {
|
|
|
14081
14668
|
scanForMuffledGates,
|
|
14082
14669
|
scoreAllProjects,
|
|
14083
14670
|
scoreContinuity,
|
|
14671
|
+
scoreKnowledgeReadiness,
|
|
14084
14672
|
scoreProject,
|
|
14085
14673
|
scoreRedTeamOutput,
|
|
14086
14674
|
scoreReferenceReplay,
|
|
@@ -14115,6 +14703,7 @@ export {
|
|
|
14115
14703
|
trialTraceFromMultiShotTrial,
|
|
14116
14704
|
typoMutator,
|
|
14117
14705
|
urlContains,
|
|
14706
|
+
userQuestionsForKnowledgeGaps,
|
|
14118
14707
|
validateRunRecord,
|
|
14119
14708
|
verbosityBias,
|
|
14120
14709
|
verifyManifest,
|