@tangle-network/agent-eval 0.19.0 → 0.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -417,7 +417,7 @@ function confidenceInterval(scores2, confidence = 0.95) {
417
417
  if (scores2.length === 0) return { mean: 0, lower: 0, upper: 0 };
418
418
  if (scores2.length === 1) return { mean: scores2[0], lower: scores2[0], upper: scores2[0] };
419
419
  const n = scores2.length;
420
- const mean9 = scores2.reduce((a, b) => a + b, 0) / n;
420
+ const mean10 = scores2.reduce((a, b) => a + b, 0) / n;
421
421
  const B = 1e3;
422
422
  const bootstrapMeans = [];
423
423
  for (let i = 0; i < B; i++) {
@@ -432,7 +432,7 @@ function confidenceInterval(scores2, confidence = 0.95) {
432
432
  const lowerIdx = Math.floor(alpha / 2 * B);
433
433
  const upperIdx = Math.floor((1 - alpha / 2) * B) - 1;
434
434
  return {
435
- mean: mean9,
435
+ mean: mean10,
436
436
  lower: bootstrapMeans[lowerIdx],
437
437
  upper: bootstrapMeans[Math.min(upperIdx, B - 1)]
438
438
  };
@@ -520,11 +520,11 @@ function pairedTTest(before, after) {
520
520
  const n = before.length;
521
521
  if (n < 2) return { t: 0, df: 0, p: 1 };
522
522
  const diffs = before.map((b, i) => after[i] - b);
523
- const mean9 = diffs.reduce((a, b) => a + b, 0) / n;
524
- const variance2 = diffs.reduce((acc, d) => acc + (d - mean9) ** 2, 0) / (n - 1);
523
+ const mean10 = diffs.reduce((a, b) => a + b, 0) / n;
524
+ const variance2 = diffs.reduce((acc, d) => acc + (d - mean10) ** 2, 0) / (n - 1);
525
525
  const se = Math.sqrt(variance2 / n);
526
- if (se === 0) return { t: mean9 === 0 ? 0 : Infinity, df: n - 1, p: mean9 === 0 ? 1 : 0 };
527
- const t = mean9 / se;
526
+ if (se === 0) return { t: mean10 === 0 ? 0 : Infinity, df: n - 1, p: mean10 === 0 ? 1 : 0 };
527
+ const t = mean10 / se;
528
528
  const df = n - 1;
529
529
  const p = 2 * (1 - studentTCdf(Math.abs(t), df));
530
530
  return { t, df, p };
@@ -548,9 +548,9 @@ function wilcoxonSignedRank(before, after) {
548
548
  }
549
549
  let wPlus = 0;
550
550
  for (let k = 0; k < n; k++) if (diffs[k] > 0) wPlus += ranks3[k];
551
- const mean9 = n * (n + 1) / 4;
551
+ const mean10 = n * (n + 1) / 4;
552
552
  const variance2 = n * (n + 1) * (2 * n + 1) / 24;
553
- const z = (wPlus - mean9) / Math.sqrt(variance2);
553
+ const z = (wPlus - mean10) / Math.sqrt(variance2);
554
554
  const p = 2 * (1 - normalCdf(Math.abs(z)));
555
555
  return { w: wPlus, p };
556
556
  }
@@ -2251,6 +2251,151 @@ async function finish(emitter, result) {
2251
2251
  return result;
2252
2252
  }
2253
2253
 
2254
+ // src/knowledge/readiness.ts
2255
+ function scoreKnowledgeReadiness(options) {
2256
+ const requirements = options.requirements.map(normalizeRequirement);
2257
+ const missing = requirements.filter((requirement) => requirement.currentConfidence < requirement.confidenceNeeded);
2258
+ const blockingMissingRequirements = missing.filter(isBlockingGap);
2259
+ const nonBlockingGaps = missing.filter((requirement) => !isBlockingGap(requirement));
2260
+ const readinessScore = weightedReadiness(requirements);
2261
+ const bundle = {
2262
+ taskId: options.taskId,
2263
+ requirements,
2264
+ evidenceIds: unique([...options.evidenceIds ?? [], ...requirements.flatMap((r) => r.evidenceIds)]),
2265
+ claimIds: unique(options.claimIds ?? []),
2266
+ wikiPageIds: unique(options.wikiPageIds ?? []),
2267
+ userAnswers: options.userAnswers ?? {},
2268
+ missing,
2269
+ readinessScore,
2270
+ metadata: options.metadata
2271
+ };
2272
+ const recommendedAction = chooseRecommendedAction(blockingMissingRequirements, nonBlockingGaps);
2273
+ const severity = blockingMissingRequirements.length > 0 ? "critical" : nonBlockingGaps.some((gap) => gap.importance === "high") ? "warning" : "info";
2274
+ const reason = blockingMissingRequirements.length > 0 ? `${blockingMissingRequirements.length} blocking knowledge requirement(s) are missing.` : nonBlockingGaps.length > 0 ? `${nonBlockingGaps.length} non-blocking knowledge gap(s) remain.` : "All declared knowledge requirements are ready.";
2275
+ return {
2276
+ taskId: options.taskId,
2277
+ readinessScore,
2278
+ blockingMissingRequirements,
2279
+ nonBlockingGaps,
2280
+ recommendedAction,
2281
+ bundle,
2282
+ severity,
2283
+ reason
2284
+ };
2285
+ }
2286
+ function blockingKnowledgeEval(report, options = {}) {
2287
+ const minimumScore = options.minimumScore ?? 0.7;
2288
+ const passed = report.blockingMissingRequirements.length === 0 && report.readinessScore >= minimumScore;
2289
+ return objectiveEval({
2290
+ id: options.id ?? "knowledge-ready",
2291
+ passed,
2292
+ score: report.readinessScore,
2293
+ severity: passed ? "info" : report.severity,
2294
+ detail: report.reason,
2295
+ evidence: report.blockingMissingRequirements.map((r) => r.id).join(", ") || void 0,
2296
+ metadata: { knowledgeReadiness: report }
2297
+ });
2298
+ }
2299
+ function userQuestionsForKnowledgeGaps(gaps) {
2300
+ return gaps.filter((gap) => gap.acquisitionMode === "ask_user" || gap.fallbackPolicy === "ask").map((gap) => ({
2301
+ id: `question_${gap.id}`,
2302
+ question: `Please provide: ${gap.description}`,
2303
+ reason: `Required for ${gap.requiredFor.join(", ") || "the task"}.`,
2304
+ requirementId: gap.id,
2305
+ importance: gap.importance,
2306
+ answerType: gap.sensitivity === "secret" ? "credential" : "free_text",
2307
+ impactIfUnknown: impactFor(gap)
2308
+ }));
2309
+ }
2310
+ function acquisitionPlansForKnowledgeGaps(gaps) {
2311
+ const byMode = /* @__PURE__ */ new Map();
2312
+ for (const gap of gaps) {
2313
+ const mode = planMode(gap.acquisitionMode);
2314
+ if (!mode) continue;
2315
+ const bucket = byMode.get(mode) ?? [];
2316
+ bucket.push(gap);
2317
+ byMode.set(mode, bucket);
2318
+ }
2319
+ return [...byMode.entries()].map(([mode, requirements]) => ({
2320
+ id: `acquire_${mode}`,
2321
+ requirementIds: requirements.map((r) => r.id),
2322
+ mode,
2323
+ description: descriptionForPlan(mode, requirements),
2324
+ priority: maxImportance(requirements.map((r) => r.importance)),
2325
+ questions: mode === "ask_user" ? userQuestionsForKnowledgeGaps(requirements) : void 0
2326
+ }));
2327
+ }
2328
+ function normalizeRequirement(requirement) {
2329
+ return {
2330
+ ...requirement,
2331
+ confidenceNeeded: clamp01(requirement.confidenceNeeded),
2332
+ currentConfidence: clamp01(requirement.currentConfidence),
2333
+ evidenceIds: unique(requirement.evidenceIds)
2334
+ };
2335
+ }
2336
+ function weightedReadiness(requirements) {
2337
+ if (requirements.length === 0) return 1;
2338
+ let weightSum = 0;
2339
+ let scoreSum = 0;
2340
+ for (const requirement of requirements) {
2341
+ const weight = importanceWeight(requirement.importance);
2342
+ const score = requirement.confidenceNeeded <= 0 ? 1 : Math.min(1, requirement.currentConfidence / requirement.confidenceNeeded);
2343
+ weightSum += weight;
2344
+ scoreSum += weight * score;
2345
+ }
2346
+ return clamp01(scoreSum / weightSum);
2347
+ }
2348
+ function isBlockingGap(requirement) {
2349
+ return requirement.importance === "blocking" || requirement.fallbackPolicy === "block" || requirement.sensitivity === "secret";
2350
+ }
2351
+ function chooseRecommendedAction(blocking, nonBlocking) {
2352
+ const gaps = blocking.length > 0 ? blocking : nonBlocking;
2353
+ if (gaps.length === 0) return "run_agent";
2354
+ if (blocking.some((gap) => gap.acquisitionMode === "ask_user" || gap.fallbackPolicy === "ask")) return "ask_user";
2355
+ if (blocking.some((gap) => gap.acquisitionMode === "query_connector")) return "query_connectors";
2356
+ if (blocking.some((gap) => gap.acquisitionMode === "inspect_repo" || gap.acquisitionMode === "run_command")) return "inspect_repo";
2357
+ if (blocking.some((gap) => gap.acquisitionMode === "search_web")) return "collect_web_data";
2358
+ if (blocking.some((gap) => gap.acquisitionMode === "not_available")) return "abort_or_rescope";
2359
+ if (nonBlocking.some((gap) => gap.importance === "high")) return "build_domain_wiki";
2360
+ return "continue_with_caveat";
2361
+ }
2362
+ function planMode(mode) {
2363
+ if (mode === "infer_low_confidence" || mode === "not_available") return null;
2364
+ return mode;
2365
+ }
2366
+ function descriptionForPlan(mode, requirements) {
2367
+ const labels = requirements.map((r) => r.description).join("; ");
2368
+ if (mode === "ask_user") return `Ask the user for: ${labels}`;
2369
+ if (mode === "search_web") return `Search web or documentation sources for: ${labels}`;
2370
+ if (mode === "query_connector") return `Query configured connectors for: ${labels}`;
2371
+ if (mode === "inspect_repo") return `Inspect repository context for: ${labels}`;
2372
+ if (mode === "run_command") return `Run local commands to collect: ${labels}`;
2373
+ return `Build domain wiki evidence for: ${labels}`;
2374
+ }
2375
+ function impactFor(requirement) {
2376
+ if (requirement.fallbackPolicy === "block") return "The agent should not run until this is known.";
2377
+ if (requirement.fallbackPolicy === "continue_with_caveat") return "The agent may continue, but must disclose uncertainty.";
2378
+ if (requirement.fallbackPolicy === "use_default") return "The agent will use the configured default if skipped.";
2379
+ return "The agent should ask before continuing.";
2380
+ }
2381
+ function maxImportance(values) {
2382
+ const order = ["blocking", "high", "medium", "low"];
2383
+ return order.find((value) => values.includes(value)) ?? "low";
2384
+ }
2385
+ function importanceWeight(importance) {
2386
+ if (importance === "blocking") return 8;
2387
+ if (importance === "high") return 4;
2388
+ if (importance === "medium") return 2;
2389
+ return 1;
2390
+ }
2391
+ function clamp01(value) {
2392
+ if (!Number.isFinite(value)) return 0;
2393
+ return Math.max(0, Math.min(1, value));
2394
+ }
2395
+ function unique(items) {
2396
+ return [...new Set(items)];
2397
+ }
2398
+
2254
2399
  // src/feedback-trajectory.ts
2255
2400
  var DEFAULT_SPLIT_POLICY = {
2256
2401
  trainPct: 70,
@@ -3521,9 +3666,9 @@ var DEFAULT_RUN_SCORE_WEIGHTS = {
3521
3666
  };
3522
3667
  function aggregateRunScore(score, weights = {}) {
3523
3668
  const w = { ...DEFAULT_RUN_SCORE_WEIGHTS, ...weights };
3524
- return w.success * clamp01(score.success) + w.goalProgress * clamp01(score.goalProgress) + w.repoGroundedness * clamp01(score.repoGroundedness) + w.driftPenalty * clamp01(score.driftPenalty) + w.toolUseQuality * clamp01(score.toolUseQuality) + w.patchQuality * clamp01(score.patchQuality) + w.testReality * clamp01(score.testReality) + w.finalGate * clamp01(score.finalGate) + w.reviewerBlockers * clamp01(score.reviewerBlockers) + w.costUsd * Math.max(0, score.costUsd) + w.wallSeconds * Math.max(0, score.wallSeconds / 60);
3669
+ return w.success * clamp012(score.success) + w.goalProgress * clamp012(score.goalProgress) + w.repoGroundedness * clamp012(score.repoGroundedness) + w.driftPenalty * clamp012(score.driftPenalty) + w.toolUseQuality * clamp012(score.toolUseQuality) + w.patchQuality * clamp012(score.patchQuality) + w.testReality * clamp012(score.testReality) + w.finalGate * clamp012(score.finalGate) + w.reviewerBlockers * clamp012(score.reviewerBlockers) + w.costUsd * Math.max(0, score.costUsd) + w.wallSeconds * Math.max(0, score.wallSeconds / 60);
3525
3670
  }
3526
- function clamp01(value) {
3671
+ function clamp012(value) {
3527
3672
  if (!Number.isFinite(value)) return 0;
3528
3673
  return Math.max(0, Math.min(1, value));
3529
3674
  }
@@ -3567,13 +3712,13 @@ var RunCritic = class {
3567
3712
  const success = trace.run.outcome?.pass === true ? 1 : trace.run.status === "completed" ? 0.5 : 0;
3568
3713
  if (!success) notes.push("run did not complete with pass=true");
3569
3714
  const judgeAverage = judgeSpans2.length ? judgeSpans2.reduce((sum2, span) => sum2 + normalizeJudgeScore(span.score), 0) / judgeSpans2.length : void 0;
3570
- const outcomeScore = typeof trace.run.outcome?.score === "number" ? clamp01(trace.run.outcome.score > 1 ? trace.run.outcome.score / 100 : trace.run.outcome.score) : void 0;
3715
+ const outcomeScore = typeof trace.run.outcome?.score === "number" ? clamp012(trace.run.outcome.score > 1 ? trace.run.outcome.score / 100 : trace.run.outcome.score) : void 0;
3571
3716
  const goalProgress = outcomeScore ?? judgeAverage ?? success;
3572
3717
  const successfulTools = toolSpans2.filter((span) => span.status !== "error").length;
3573
3718
  const toolUseQuality = toolSpans2.length === 0 ? 0 : successfulTools / toolSpans2.length;
3574
3719
  if (toolSpans2.length === 0) notes.push("no tool spans recorded");
3575
3720
  const patchEvidence = trace.artifacts.length + toolSpans2.filter((span) => /write|edit|patch|apply/i.test(span.toolName)).length;
3576
- const patchQuality = patchEvidence > 0 ? clamp01(patchEvidence / 4) : 0;
3721
+ const patchQuality = patchEvidence > 0 ? clamp012(patchEvidence / 4) : 0;
3577
3722
  if (!patchQuality) notes.push("no artifact or edit evidence recorded");
3578
3723
  const sandboxTests = sandboxSpans.filter((span) => typeof span.testsTotal === "number" && span.testsTotal > 0);
3579
3724
  const testReality = sandboxTests.length ? sandboxTests.reduce((sum2, span) => sum2 + (span.testsPassed ?? 0) / Math.max(1, span.testsTotal ?? 1), 0) / sandboxTests.length : toolSpans2.some((span) => /\btest|vitest|pytest|jest|build|tsc\b/i.test(JSON.stringify(span.args))) ? 0.4 : 0;
@@ -3617,7 +3762,7 @@ var RunCritic = class {
3617
3762
  }
3618
3763
  };
3619
3764
  function normalizeJudgeScore(score) {
3620
- return score > 1 ? clamp01(score / 10) : clamp01(score);
3765
+ return score > 1 ? clamp012(score / 10) : clamp012(score);
3621
3766
  }
3622
3767
  function looksRepoGrounded(text) {
3623
3768
  return /(?:src\/|tests?\/|package\.json|tsconfig|\.ts\b|\.tsx\b|git status|pnpm |npm |vitest|pytest|jest)/i.test(text);
@@ -4973,6 +5118,17 @@ var FAILURE_CLASSES = [
4973
5118
  "cost_overrun",
4974
5119
  "timeout",
4975
5120
  "sandbox_failure",
5121
+ "missing_user_data",
5122
+ "missing_domain_data",
5123
+ "missing_codebase_context",
5124
+ "missing_runtime_context",
5125
+ "missing_credentials",
5126
+ "stale_external_data",
5127
+ "bad_retrieval",
5128
+ "insufficient_evidence",
5129
+ "contradictory_evidence",
5130
+ "ambiguous_user_intent",
5131
+ "knowledge_readiness_blocked",
4976
5132
  "unknown"
4977
5133
  ];
4978
5134
  function isLlmSpan(s) {
@@ -5329,6 +5485,62 @@ var DEFAULT_RULES = [
5329
5485
  return null;
5330
5486
  }
5331
5487
  },
5488
+ {
5489
+ id: "knowledge-readiness-blocked",
5490
+ match: ({ events }) => {
5491
+ const event = events.find((e) => e.kind === "custom" && e.payload.kind === "readiness_scored" && e.payload.passed === false);
5492
+ return event ? {
5493
+ failureClass: "knowledge_readiness_blocked",
5494
+ reason: "knowledge readiness report blocked execution",
5495
+ triggerEventId: event.eventId
5496
+ } : null;
5497
+ }
5498
+ },
5499
+ {
5500
+ id: "missing-credentials",
5501
+ match: ({ events }) => {
5502
+ const event = events.find((e) => e.kind === "custom" && e.payload.kind === "knowledge_gap" && e.payload.category === "credential_or_secret");
5503
+ return event ? {
5504
+ failureClass: "missing_credentials",
5505
+ reason: "required credential or secret was missing",
5506
+ triggerEventId: event.eventId
5507
+ } : null;
5508
+ }
5509
+ },
5510
+ {
5511
+ id: "bad-retrieval",
5512
+ match: ({ run, spans }) => {
5513
+ if (run.outcome?.pass !== false) return null;
5514
+ const retrieval = spans.find((s) => s.kind === "retrieval" && (s.hits.length === 0 || s.hits.every((hit) => hit.score <= 0)));
5515
+ return retrieval ? {
5516
+ failureClass: "bad_retrieval",
5517
+ reason: "retrieval returned no useful hits for a failed run",
5518
+ triggerSpanId: retrieval.spanId
5519
+ } : null;
5520
+ }
5521
+ },
5522
+ {
5523
+ id: "insufficient-evidence",
5524
+ match: ({ events }) => {
5525
+ const event = events.find((e) => e.kind === "custom" && e.payload.kind === "knowledge_gap" && e.payload.reason === "insufficient_evidence");
5526
+ return event ? {
5527
+ failureClass: "insufficient_evidence",
5528
+ reason: "task proceeded with insufficient supporting evidence",
5529
+ triggerEventId: event.eventId
5530
+ } : null;
5531
+ }
5532
+ },
5533
+ {
5534
+ id: "contradictory-evidence",
5535
+ match: ({ events }) => {
5536
+ const event = events.find((e) => e.kind === "custom" && e.payload.kind === "knowledge_gap" && e.payload.reason === "contradictory_evidence");
5537
+ return event ? {
5538
+ failureClass: "contradictory_evidence",
5539
+ reason: "supporting evidence contradicted itself",
5540
+ triggerEventId: event.eventId
5541
+ } : null;
5542
+ }
5543
+ },
5332
5544
  // Budget breach events
5333
5545
  {
5334
5546
  id: "budget-breach",
@@ -5667,11 +5879,14 @@ async function failureClusterView(store, options = {}) {
5667
5879
  const cls = classifyFailure({ run, spans, events }, rules);
5668
5880
  let toolName;
5669
5881
  let argPrefix;
5882
+ let dimension;
5670
5883
  if (cls.triggerSpanId) {
5671
5884
  const trig = spans.find((s) => s.spanId === cls.triggerSpanId);
5672
5885
  if (trig?.kind === "tool") {
5673
5886
  toolName = trig.toolName;
5674
5887
  argPrefix = argHash(trig.args).slice(0, 16);
5888
+ } else if (trig?.kind === "judge") {
5889
+ dimension = trig.dimension;
5675
5890
  }
5676
5891
  }
5677
5892
  if (!toolName) {
@@ -5682,13 +5897,18 @@ async function failureClusterView(store, options = {}) {
5682
5897
  argPrefix = argHash(errored.args).slice(0, 16);
5683
5898
  }
5684
5899
  }
5685
- const key = `${cls.failureClass}|${toolName ?? ""}|${argPrefix ?? ""}`;
5900
+ if (!dimension) {
5901
+ const judge = spans.find((s) => s.kind === "judge" && typeof s.dimension === "string");
5902
+ if (judge?.kind === "judge") dimension = judge.dimension;
5903
+ }
5904
+ const key = `${cls.failureClass}|${toolName ?? ""}|${argPrefix ?? ""}|${dimension ?? ""}`;
5686
5905
  let cluster = clusters.get(key);
5687
5906
  if (!cluster) {
5688
5907
  cluster = {
5689
5908
  failureClass: cls.failureClass,
5690
5909
  toolName,
5691
5910
  argPrefix,
5911
+ dimension,
5692
5912
  runCount: 0,
5693
5913
  scenarioIds: [],
5694
5914
  exampleRunId: run.runId,
@@ -6457,10 +6677,10 @@ function analyzeSeries(values, options = {}) {
6457
6677
  return { state: "insufficient-data", windowMean: 0, windowCv: 0, tailRun: 0, stable: false };
6458
6678
  }
6459
6679
  const tail = values.slice(-window);
6460
- const mean9 = tail.reduce((a, b) => a + b, 0) / tail.length;
6461
- const variance2 = tail.reduce((acc, v) => acc + (v - mean9) ** 2, 0) / tail.length;
6680
+ const mean10 = tail.reduce((a, b) => a + b, 0) / tail.length;
6681
+ const variance2 = tail.reduce((acc, v) => acc + (v - mean10) ** 2, 0) / tail.length;
6462
6682
  const stdDev = Math.sqrt(variance2);
6463
- const refMean = Math.abs(mean9) > 1e-9 ? Math.abs(mean9) : 1;
6683
+ const refMean = Math.abs(mean10) > 1e-9 ? Math.abs(mean10) : 1;
6464
6684
  const cv = stdDev / refMean;
6465
6685
  const stable = tail.length >= window && cv <= stableCv;
6466
6686
  let tailRun = 0;
@@ -6481,7 +6701,7 @@ function analyzeSeries(values, options = {}) {
6481
6701
  } else {
6482
6702
  state = "noisy";
6483
6703
  }
6484
- return { state, windowMean: mean9, windowCv: cv, tailRun, stable };
6704
+ return { state, windowMean: mean10, windowCv: cv, tailRun, stable };
6485
6705
  }
6486
6706
 
6487
6707
  // src/state-continuity.ts
@@ -6673,6 +6893,46 @@ function checkCanaries(output, scenarios) {
6673
6893
  }
6674
6894
  return leaks;
6675
6895
  }
6896
+ function checkBehavioralCanary(output, scenario) {
6897
+ const pattern = scenario.forbiddenPattern ?? scenario.canary;
6898
+ if (!pattern) return null;
6899
+ const hit = matchForbidden(output, pattern);
6900
+ if (!hit) return null;
6901
+ return {
6902
+ scenarioId: scenario.id,
6903
+ canary: pattern,
6904
+ evidence: excerpt2(output, hit)
6905
+ };
6906
+ }
6907
+ function runBehavioralCanaries(cases) {
6908
+ const leaks = [];
6909
+ for (const c of cases) {
6910
+ const leak = checkBehavioralCanary(c.output, c.scenario);
6911
+ if (leak) leaks.push({ ...leak, runId: c.runId ?? leak.runId });
6912
+ }
6913
+ return leaks;
6914
+ }
6915
+ function matchForbidden(output, pattern) {
6916
+ const re = tryParseRegex(pattern);
6917
+ if (re) {
6918
+ const m = output.match(re);
6919
+ return m && m[0].length > 0 ? m[0] : null;
6920
+ }
6921
+ return output.includes(pattern) ? pattern : null;
6922
+ }
6923
+ function tryParseRegex(pattern) {
6924
+ if (pattern.length < 2 || pattern[0] !== "/") return null;
6925
+ const last = pattern.lastIndexOf("/");
6926
+ if (last <= 0) return null;
6927
+ const body = pattern.slice(1, last);
6928
+ const flags = pattern.slice(last + 1);
6929
+ if (!/^[gimsuy]*$/.test(flags)) return null;
6930
+ try {
6931
+ return new RegExp(body, flags);
6932
+ } catch {
6933
+ return null;
6934
+ }
6935
+ }
6676
6936
  async function canaryLeakView(store, scenarios) {
6677
6937
  const targets = scenarios.filter((s) => !!s.canary);
6678
6938
  if (targets.length === 0) return [];
@@ -6938,9 +7198,9 @@ function benjaminiHochberg(pValues, fdr = 0.05) {
6938
7198
  for (let k = n - 1; k >= 0; k--) {
6939
7199
  const rank = k + 1;
6940
7200
  const raw = indexed[k].p * n / rank;
6941
- const bounded = Math.min(minRight, raw);
6942
- minRight = bounded;
6943
- q[indexed[k].i] = Math.min(1, bounded);
7201
+ const bounded2 = Math.min(minRight, raw);
7202
+ minRight = bounded2;
7203
+ q[indexed[k].i] = Math.min(1, bounded2);
6944
7204
  }
6945
7205
  const significant = q.map((v) => v < fdr);
6946
7206
  return { qValues: q, significant };
@@ -7470,12 +7730,12 @@ async function paraphraseRobustness(prompt, mutators, scoreFn, options = {}) {
7470
7730
  variantScores.push({ mutator: id, score, mutated });
7471
7731
  all.push(score);
7472
7732
  }
7473
- const mean9 = all.reduce((a, b) => a + b, 0) / all.length;
7474
- const variance2 = all.reduce((a, v) => a + (v - mean9) ** 2, 0) / all.length;
7733
+ const mean10 = all.reduce((a, b) => a + b, 0) / all.length;
7734
+ const variance2 = all.reduce((a, v) => a + (v - mean10) ** 2, 0) / all.length;
7475
7735
  const stdDev = Math.sqrt(variance2);
7476
- const ref = Math.abs(mean9) > 1e-9 ? Math.abs(mean9) : 1;
7736
+ const ref = Math.abs(mean10) > 1e-9 ? Math.abs(mean10) : 1;
7477
7737
  const robustness = Math.max(0, 1 - stdDev / ref);
7478
- return { originalScore, variantScores, meanScore: mean9, stdDev, robustness };
7738
+ return { originalScore, variantScores, meanScore: mean10, stdDev, robustness };
7479
7739
  }
7480
7740
  var lowercaseMutator = (p) => p.toLowerCase();
7481
7741
  var sentenceReorderMutator = (p, seed) => {
@@ -7519,6 +7779,41 @@ var DEFAULT_MUTATORS = [
7519
7779
  { id: "politeness-prefix", fn: politenessPrefixMutator },
7520
7780
  { id: "whitespace-collapse", fn: whitespaceCollapseMutator }
7521
7781
  ];
7782
+ async function paraphraseRobustnessScenarios(args) {
7783
+ const reps = Math.max(1, args.reps ?? 1);
7784
+ const mutatorNames = args.mutators.map((m) => m.name);
7785
+ const perScenario = [];
7786
+ for (const scenario of args.scenarios) {
7787
+ const baseline = await args.runScenario({
7788
+ id: scenario.id,
7789
+ userTurns: scenario.userTurns
7790
+ });
7791
+ const originalScore = baseline.score;
7792
+ const deltas = {};
7793
+ const paraphrasedAll = [];
7794
+ for (const m of args.mutators) {
7795
+ const scores2 = [];
7796
+ for (let r = 0; r < reps; r++) {
7797
+ const mutatedTurns = scenario.userTurns.map((t) => m.mutator(t));
7798
+ const out = await args.runScenario({
7799
+ id: scenario.id,
7800
+ userTurns: mutatedTurns
7801
+ });
7802
+ scores2.push(out.score);
7803
+ }
7804
+ const mean10 = scores2.reduce((a, b) => a + b, 0) / scores2.length;
7805
+ deltas[m.name] = mean10 - originalScore;
7806
+ paraphrasedAll.push(...scores2);
7807
+ }
7808
+ const paraphrasedMean = paraphrasedAll.length === 0 ? originalScore : paraphrasedAll.reduce((a, b) => a + b, 0) / paraphrasedAll.length;
7809
+ perScenario.push({ id: scenario.id, originalScore, paraphrasedMean, deltas });
7810
+ }
7811
+ const meanOriginal = perScenario.length === 0 ? 0 : perScenario.reduce((a, p) => a + p.originalScore, 0) / perScenario.length;
7812
+ const meanParaphrased = perScenario.length === 0 ? 0 : perScenario.reduce((a, p) => a + p.paraphrasedMean, 0) / perScenario.length;
7813
+ const ratio2 = meanOriginal <= 0 ? 0 : meanParaphrased / meanOriginal;
7814
+ const score = Math.max(0, Math.min(1, ratio2));
7815
+ return { score, perScenario, mutators: mutatorNames };
7816
+ }
7522
7817
 
7523
7818
  // src/visual-diff.ts
7524
7819
  function visualDiff(a, b, options = {}) {
@@ -8396,8 +8691,8 @@ async function prmBestOfN(store, grader, runIds) {
8396
8691
  if (runIds.length === 0) throw new Error("prmBestOfN: at least 1 candidate required");
8397
8692
  const graded = await Promise.all(runIds.map((id) => grader.grade(store, id)));
8398
8693
  const ranked = [...graded].sort((a, b) => b.aggregateScore - a.aggregateScore);
8399
- const mean9 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
8400
- const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean9) ** 2, 0) / graded.length;
8694
+ const mean10 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
8695
+ const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean10) ** 2, 0) / graded.length;
8401
8696
  return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
8402
8697
  }
8403
8698
  async function prmEnsembleBestOfN(store, graders, runIds) {
@@ -8419,8 +8714,8 @@ async function prmEnsembleBestOfN(store, graders, runIds) {
8419
8714
  const ranked = [...byRun.values()].sort(
8420
8715
  (a, b) => (bordaScores.get(b.runId) ?? 0) - (bordaScores.get(a.runId) ?? 0)
8421
8716
  );
8422
- const mean9 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
8423
- const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean9) ** 2, 0) / ranked.length;
8717
+ const mean10 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
8718
+ const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean10) ** 2, 0) / ranked.length;
8424
8719
  return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
8425
8720
  }
8426
8721
 
@@ -8747,10 +9042,11 @@ async function signManifest(m) {
8747
9042
  const bytes = new TextEncoder().encode(JSON.stringify(canonical));
8748
9043
  const digest = await globalThis.crypto.subtle.digest("SHA-256", bytes);
8749
9044
  const hash = Array.from(new Uint8Array(digest)).map((b) => b.toString(16).padStart(2, "0")).join("");
8750
- return { ...m, contentHash: hash };
9045
+ return { ...m, contentHash: hash, algo: "sha256-content" };
8751
9046
  }
8752
9047
  async function verifyManifest(m) {
8753
- const { contentHash, ...rest } = m;
9048
+ const { contentHash, algo: _algo, ...rest } = m;
9049
+ void _algo;
8754
9050
  const resigned = await signManifest(rest);
8755
9051
  return resigned.contentHash === contentHash;
8756
9052
  }
@@ -8950,8 +9246,8 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
8950
9246
  const sRuns = runs.filter((r) => r.scenarioId === s.id);
8951
9247
  const scores2 = sRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
8952
9248
  if (scores2.length < 3) continue;
8953
- const mean9 = scores2.reduce((a, b) => a + b, 0) / scores2.length;
8954
- const variance2 = scores2.reduce((a, b) => a + (b - mean9) ** 2, 0) / scores2.length;
9249
+ const mean10 = scores2.reduce((a, b) => a + b, 0) / scores2.length;
9250
+ const variance2 = scores2.reduce((a, b) => a + (b - mean10) ** 2, 0) / scores2.length;
8955
9251
  if (variance2 > varianceThreshold) {
8956
9252
  targets.push({
8957
9253
  reason: "high-variance",
@@ -10989,7 +11285,7 @@ function defaultReferenceReplayMatcher(reference, candidate) {
10989
11285
  const textScore = tokenJaccard(referenceText, candidateText);
10990
11286
  const severityScore = reference.severity && candidate.severity ? normalize(reference.severity) === normalize(candidate.severity) ? 0.1 : -0.05 : 0;
10991
11287
  const tagScore = tagOverlap(reference.tags, candidate.tags) * 0.15;
10992
- const score = clamp012(textScore * 0.85 + tagScore + severityScore);
11288
+ const score = clamp013(textScore * 0.85 + tagScore + severityScore);
10993
11289
  return { score, reason: `token=${textScore.toFixed(2)} tags=${tagScore.toFixed(2)} severity=${severityScore.toFixed(2)}` };
10994
11290
  }
10995
11291
  function scoreScenario(scenario, matcher, threshold, matchStrategy) {
@@ -11089,7 +11385,7 @@ function scorePair(scenario, matcher, reference, candidate) {
11089
11385
  if (!Number.isFinite(result.score)) {
11090
11386
  throw new Error(`reference replay matcher returned non-finite score for ${scenario.id}:${reference.id}:${candidate.id}`);
11091
11387
  }
11092
- return { score: clamp012(result.score), reason: result.reason ?? "" };
11388
+ return { score: clamp013(result.score), reason: result.reason ?? "" };
11093
11389
  }
11094
11390
  function buildScenarioScore(scenario, matches2, falsePositives) {
11095
11391
  const matched = matches2.filter((match) => match.matched).length;
@@ -11188,7 +11484,7 @@ function tokens(text) {
11188
11484
  function normalize(text) {
11189
11485
  return text.toLowerCase().replace(/[^a-z0-9]+/g, " ").trim();
11190
11486
  }
11191
- function clamp012(value) {
11487
+ function clamp013(value) {
11192
11488
  if (!Number.isFinite(value)) return 0;
11193
11489
  return Math.max(0, Math.min(1, value));
11194
11490
  }
@@ -12653,7 +12949,7 @@ async function scoreOne(config, variant, scenarioId, rep, split) {
12653
12949
  scenarioId,
12654
12950
  rep,
12655
12951
  ok: scored.ok ?? true,
12656
- score: clamp013(scored.score),
12952
+ score: clamp014(scored.score),
12657
12953
  cost: scored.costUsd ?? run.costUsd ?? 0,
12658
12954
  durationMs: scored.durationMs ?? run.durationMs ?? 0,
12659
12955
  metrics: {
@@ -12765,7 +13061,7 @@ function stableHash2(input) {
12765
13061
  }
12766
13062
  return h >>> 0;
12767
13063
  }
12768
- function clamp013(n) {
13064
+ function clamp014(n) {
12769
13065
  if (!Number.isFinite(n)) return 0;
12770
13066
  return Math.max(0, Math.min(1, n));
12771
13067
  }
@@ -12813,6 +13109,289 @@ function traceExcerpt(trace) {
12813
13109
  return void 0;
12814
13110
  }
12815
13111
 
13112
+ // src/release-confidence.ts
13113
+ var DEFAULT_THRESHOLDS = {
13114
+ requireCorpus: true,
13115
+ minScenarioCount: 1,
13116
+ minSearchRuns: 1,
13117
+ minHoldoutRuns: 1,
13118
+ requireHoldout: true,
13119
+ minPassRate: 0.8,
13120
+ minMeanScore: 0.7,
13121
+ maxOverfitGap: 0.15,
13122
+ maxMeanCostUsd: Number.POSITIVE_INFINITY,
13123
+ maxP95WallMs: Number.POSITIVE_INFINITY,
13124
+ requireAsiForFailures: true,
13125
+ failureScoreThreshold: 0.5
13126
+ };
13127
+ function releaseTraceEvidenceFromMultiShotTrials(trials) {
13128
+ return trials.map((trial) => ({
13129
+ scenarioId: trial.scenarioId,
13130
+ candidateId: trial.variantId,
13131
+ split: trial.split === "holdout" ? "holdout" : trial.split === "dev" ? "dev" : "search",
13132
+ score: trial.score,
13133
+ ok: trial.ok,
13134
+ turnCount: Array.isArray(trial.trace?.turns) ? trial.trace.turns.length : void 0,
13135
+ costUsd: trial.cost,
13136
+ durationMs: trial.durationMs,
13137
+ failureMode: trial.error ? "runtime_error" : void 0,
13138
+ asi: trial.asi,
13139
+ metadata: trial.metadata
13140
+ }));
13141
+ }
13142
+ function evaluateReleaseConfidence(input) {
13143
+ const thresholds = { ...DEFAULT_THRESHOLDS, ...input.thresholds };
13144
+ const candidateId = input.candidateId ?? null;
13145
+ const runs = filterCandidate(input.runs ?? [], candidateId, input.baselineId);
13146
+ const traces = filterTraceCandidate(input.traces ?? [], candidateId, input.baselineId);
13147
+ const scenarios = input.scenarios ?? [];
13148
+ const scenarioCount = input.dataset?.scenarioCount ?? scenarios.length;
13149
+ const splitCounts = input.dataset?.splitCounts ?? countScenarioSplits(scenarios);
13150
+ const searchScores = scoresFor(runs, "search");
13151
+ const holdoutScores = scoresFor(runs, "holdout");
13152
+ const allScores = [...searchScores, ...holdoutScores];
13153
+ const traceScores = traces.map((t) => t.score).filter(isFiniteNumber);
13154
+ const scoreUniverse = allScores.length > 0 ? allScores : traceScores;
13155
+ const searchRuns = runs.filter((r) => r.splitTag === "search").length;
13156
+ const holdoutRuns = runs.filter((r) => r.splitTag === "holdout").length;
13157
+ const searchMeanScore = mean8(searchScores);
13158
+ const holdoutMeanScore = mean8(holdoutScores);
13159
+ const metrics = {
13160
+ scenarioCount,
13161
+ searchRuns,
13162
+ holdoutRuns,
13163
+ passRate: passRate(runs, traces, thresholds.failureScoreThreshold),
13164
+ meanScore: mean8(scoreUniverse),
13165
+ searchMeanScore,
13166
+ holdoutMeanScore,
13167
+ overfitGap: safeDiff2(searchMeanScore, holdoutMeanScore),
13168
+ meanCostUsd: mean8([...runs.map((r) => r.costUsd), ...traces.map((t) => t.costUsd).filter(isFiniteNumber)]),
13169
+ p95WallMs: percentile([...runs.map((r) => r.wallMs), ...traces.map((t) => t.durationMs).filter(isFiniteNumber)], 0.95),
13170
+ failedRows: failedRows(runs, traces, thresholds.failureScoreThreshold).length,
13171
+ failuresWithAsi: failedRows(runs, traces, thresholds.failureScoreThreshold).filter((row) => row.hasAsi).length,
13172
+ singleShotTraces: traces.filter((t) => t.turnCount === 1).length,
13173
+ multiShotTraces: traces.filter((t) => (t.turnCount ?? 0) > 1).length,
13174
+ splitCounts,
13175
+ domainCounts: countDomains(scenarios),
13176
+ failureModeCounts: countFailureModes(runs, traces, thresholds.failureScoreThreshold),
13177
+ responsibleSurfaceCounts: countResponsibleSurfaces(traces)
13178
+ };
13179
+ const issues = [];
13180
+ checkCorpus(input, thresholds, metrics, issues);
13181
+ checkQuality(thresholds, metrics, issues);
13182
+ checkGeneralization(input.gateDecision ?? null, thresholds, metrics, issues);
13183
+ checkDiagnostics(thresholds, metrics, issues);
13184
+ checkEfficiency(thresholds, metrics, issues);
13185
+ const axes = buildAxes(metrics, thresholds, input.gateDecision ?? null, issues);
13186
+ const status = issues.some((i) => i.severity === "critical") ? "fail" : issues.length > 0 ? "warn" : "pass";
13187
+ return {
13188
+ target: input.target,
13189
+ candidateId,
13190
+ baselineId: input.baselineId ?? null,
13191
+ status,
13192
+ promote: status === "pass" && (input.gateDecision ? input.gateDecision.promote : true),
13193
+ axes,
13194
+ issues,
13195
+ metrics,
13196
+ dataset: input.dataset ?? null,
13197
+ gateDecision: input.gateDecision ?? null,
13198
+ summary: renderSummary(input.target, status, metrics, issues)
13199
+ };
13200
+ }
13201
+ function assertReleaseConfidence(input) {
13202
+ const scorecard = evaluateReleaseConfidence(input);
13203
+ if (scorecard.status === "fail") {
13204
+ throw new Error(scorecard.summary);
13205
+ }
13206
+ return scorecard;
13207
+ }
13208
+ function filterCandidate(runs, candidateId, baselineId) {
13209
+ if (candidateId) return runs.filter((r) => r.candidateId === candidateId);
13210
+ if (baselineId) return runs.filter((r) => r.candidateId !== baselineId);
13211
+ return [...runs];
13212
+ }
13213
+ function filterTraceCandidate(traces, candidateId, baselineId) {
13214
+ if (candidateId) return traces.filter((t) => t.candidateId === void 0 || t.candidateId === candidateId);
13215
+ if (baselineId) return traces.filter((t) => t.candidateId === void 0 || t.candidateId !== baselineId);
13216
+ return [...traces];
13217
+ }
13218
+ function checkCorpus(input, thresholds, metrics, issues) {
13219
+ if (thresholds.requireCorpus && !input.dataset && (input.scenarios?.length ?? 0) === 0) {
13220
+ issues.push({ axis: "corpus", severity: "critical", code: "missing_corpus", detail: "No Dataset manifest or scenarios supplied." });
13221
+ }
13222
+ if (metrics.scenarioCount < thresholds.minScenarioCount) {
13223
+ issues.push({ axis: "corpus", severity: "critical", code: "few_scenarios", detail: `${metrics.scenarioCount} scenario(s) < min ${thresholds.minScenarioCount}.` });
13224
+ }
13225
+ if (thresholds.requireHoldout && metrics.splitCounts.holdout === 0) {
13226
+ issues.push({ axis: "corpus", severity: "critical", code: "missing_holdout_split", detail: "Corpus has no holdout scenarios." });
13227
+ }
13228
+ }
13229
+ function checkQuality(thresholds, metrics, issues) {
13230
+ if (metrics.searchRuns < thresholds.minSearchRuns) {
13231
+ issues.push({ axis: "quality", severity: "critical", code: "few_search_runs", detail: `${metrics.searchRuns} search run(s) < min ${thresholds.minSearchRuns}.` });
13232
+ }
13233
+ if (metrics.passRate < thresholds.minPassRate) {
13234
+ issues.push({ axis: "quality", severity: "critical", code: "low_pass_rate", detail: `passRate ${fmt3(metrics.passRate)} < ${fmt3(thresholds.minPassRate)}.` });
13235
+ }
13236
+ if (metrics.meanScore < thresholds.minMeanScore) {
13237
+ issues.push({ axis: "quality", severity: "critical", code: "low_mean_score", detail: `meanScore ${fmt3(metrics.meanScore)} < ${fmt3(thresholds.minMeanScore)}.` });
13238
+ }
13239
+ }
13240
+ function checkGeneralization(gateDecision, thresholds, metrics, issues) {
13241
+ if (thresholds.requireHoldout && metrics.holdoutRuns < thresholds.minHoldoutRuns) {
13242
+ issues.push({ axis: "generalization", severity: "critical", code: "few_holdout_runs", detail: `${metrics.holdoutRuns} holdout run(s) < min ${thresholds.minHoldoutRuns}.` });
13243
+ }
13244
+ if (Number.isFinite(metrics.overfitGap) && metrics.overfitGap > thresholds.maxOverfitGap) {
13245
+ issues.push({ axis: "generalization", severity: "critical", code: "overfit_gap", detail: `search-holdout gap ${fmt3(metrics.overfitGap)} > ${fmt3(thresholds.maxOverfitGap)}.` });
13246
+ }
13247
+ if (gateDecision && !gateDecision.promote) {
13248
+ issues.push({ axis: "generalization", severity: "critical", code: `gate_${gateDecision.rejectionCode ?? "reject"}`, detail: gateDecision.reason });
13249
+ }
13250
+ }
13251
+ function checkDiagnostics(thresholds, metrics, issues) {
13252
+ if (!thresholds.requireAsiForFailures) return;
13253
+ if (metrics.failedRows > metrics.failuresWithAsi) {
13254
+ issues.push({
13255
+ axis: "diagnostics",
13256
+ severity: "critical",
13257
+ code: "missing_failure_asi",
13258
+ detail: `${metrics.failedRows - metrics.failuresWithAsi} failed row(s) have no actionable side information.`
13259
+ });
13260
+ }
13261
+ }
13262
+ function checkEfficiency(thresholds, metrics, issues) {
13263
+ if (metrics.meanCostUsd > thresholds.maxMeanCostUsd) {
13264
+ issues.push({ axis: "efficiency", severity: "critical", code: "cost_budget", detail: `meanCostUsd ${fmt3(metrics.meanCostUsd)} > ${fmt3(thresholds.maxMeanCostUsd)}.` });
13265
+ }
13266
+ if (metrics.p95WallMs > thresholds.maxP95WallMs) {
13267
+ issues.push({ axis: "efficiency", severity: "critical", code: "latency_budget", detail: `p95WallMs ${fmt3(metrics.p95WallMs)} > ${fmt3(thresholds.maxP95WallMs)}.` });
13268
+ }
13269
+ }
13270
+ function buildAxes(metrics, thresholds, gateDecision, issues) {
13271
+ return [
13272
+ axis("corpus", issues, bounded(metrics.scenarioCount / Math.max(1, thresholds.minScenarioCount)), `${metrics.scenarioCount} scenarios; holdout=${metrics.splitCounts.holdout}`),
13273
+ axis("quality", issues, Math.min(metrics.passRate, metrics.meanScore), `passRate=${fmt3(metrics.passRate)} meanScore=${fmt3(metrics.meanScore)}`),
13274
+ axis("generalization", issues, gateDecision && !gateDecision.promote ? 0 : gapScore(metrics.overfitGap, thresholds.maxOverfitGap), `holdoutRuns=${metrics.holdoutRuns} overfitGap=${fmt3(metrics.overfitGap)}`),
13275
+ axis("diagnostics", issues, metrics.failedRows === 0 ? 1 : metrics.failuresWithAsi / metrics.failedRows, `failuresWithAsi=${metrics.failuresWithAsi}/${metrics.failedRows}`),
13276
+ axis("efficiency", issues, efficiencyScore(metrics, thresholds), `meanCostUsd=${fmt3(metrics.meanCostUsd)} p95WallMs=${fmt3(metrics.p95WallMs)}`)
13277
+ ];
13278
+ }
13279
+ function axis(name, issues, score, detail) {
13280
+ const own = issues.filter((i) => i.axis === name);
13281
+ const status = own.some((i) => i.severity === "critical") ? "fail" : own.length > 0 ? "warn" : "pass";
13282
+ return { name, status, score: bounded(score), detail };
13283
+ }
13284
+ function countScenarioSplits(scenarios) {
13285
+ const counts = { train: 0, dev: 0, test: 0, holdout: 0 };
13286
+ for (const scenario of scenarios) counts[scenario.split ?? "train"]++;
13287
+ return counts;
13288
+ }
13289
+ function countDomains(scenarios) {
13290
+ const out = {};
13291
+ for (const scenario of scenarios) {
13292
+ const domain = scenario.tags?.domain ?? scenario.tags?.category ?? "uncategorized";
13293
+ out[domain] = (out[domain] ?? 0) + 1;
13294
+ }
13295
+ return out;
13296
+ }
13297
+ function countFailureModes(runs, traces, threshold) {
13298
+ const out = {};
13299
+ for (const run of runs) {
13300
+ const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
13301
+ if (run.failureMode || score !== void 0 && score < threshold) {
13302
+ const mode = run.failureMode ?? "low_score";
13303
+ out[mode] = (out[mode] ?? 0) + 1;
13304
+ }
13305
+ }
13306
+ for (const trace of traces) {
13307
+ if (trace.failureMode || trace.ok === false || trace.score !== void 0 && trace.score < threshold) {
13308
+ const mode = trace.failureMode ?? (trace.ok === false ? "not_ok" : "low_score");
13309
+ out[mode] = (out[mode] ?? 0) + 1;
13310
+ }
13311
+ }
13312
+ return out;
13313
+ }
13314
+ function countResponsibleSurfaces(traces) {
13315
+ const out = {};
13316
+ for (const trace of traces) {
13317
+ for (const asi of trace.asi ?? []) {
13318
+ const surface = asi.responsibleSurface ?? "unknown";
13319
+ out[surface] = (out[surface] ?? 0) + 1;
13320
+ }
13321
+ }
13322
+ return out;
13323
+ }
13324
+ function failedRows(runs, traces, threshold) {
13325
+ const out = [];
13326
+ for (const run of runs) {
13327
+ const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
13328
+ if (run.failureMode || score !== void 0 && score < threshold) {
13329
+ const asiMetric = run.outcome.raw.asi;
13330
+ out.push({ hasAsi: typeof asiMetric === "number" && asiMetric > 0 });
13331
+ }
13332
+ }
13333
+ for (const trace of traces) {
13334
+ if (trace.failureMode || trace.ok === false || trace.score !== void 0 && trace.score < threshold) {
13335
+ out.push({ hasAsi: (trace.asi?.length ?? 0) > 0 });
13336
+ }
13337
+ }
13338
+ return out;
13339
+ }
13340
+ function passRate(runs, traces, threshold) {
13341
+ const outcomes = [
13342
+ ...runs.map((run) => {
13343
+ const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
13344
+ return !run.failureMode && score !== void 0 && score >= threshold;
13345
+ }),
13346
+ ...traces.map((trace) => trace.ok !== false && (trace.score === void 0 || trace.score >= threshold))
13347
+ ];
13348
+ if (outcomes.length === 0) return 0;
13349
+ return outcomes.filter(Boolean).length / outcomes.length;
13350
+ }
13351
+ function scoresFor(runs, split) {
13352
+ return runs.filter((run) => run.splitTag === split).map((run) => split === "holdout" ? run.outcome.holdoutScore : run.outcome.searchScore).filter(isFiniteNumber);
13353
+ }
13354
+ function mean8(xs) {
13355
+ if (xs.length === 0) return Number.NaN;
13356
+ return xs.reduce((sum2, x) => sum2 + x, 0) / xs.length;
13357
+ }
13358
+ function percentile(xs, p) {
13359
+ if (xs.length === 0) return Number.NaN;
13360
+ const sorted = [...xs].sort((a, b) => a - b);
13361
+ return sorted[Math.min(sorted.length - 1, Math.max(0, Math.ceil(p * sorted.length) - 1))];
13362
+ }
13363
+ function isFiniteNumber(value) {
13364
+ return typeof value === "number" && Number.isFinite(value);
13365
+ }
13366
+ function safeDiff2(a, b) {
13367
+ if (!Number.isFinite(a) || !Number.isFinite(b)) return Number.NaN;
13368
+ return a - b;
13369
+ }
13370
+ function gapScore(gap, maxGap) {
13371
+ if (!Number.isFinite(gap)) return 0;
13372
+ if (maxGap <= 0) return gap <= 0 ? 1 : 0;
13373
+ return bounded(1 - Math.max(0, gap) / maxGap);
13374
+ }
13375
+ function efficiencyScore(metrics, thresholds) {
13376
+ const cost = Number.isFinite(thresholds.maxMeanCostUsd) && Number.isFinite(metrics.meanCostUsd) ? bounded(thresholds.maxMeanCostUsd / Math.max(metrics.meanCostUsd, 1e-12)) : 1;
13377
+ const latency = Number.isFinite(thresholds.maxP95WallMs) && Number.isFinite(metrics.p95WallMs) ? bounded(thresholds.maxP95WallMs / Math.max(metrics.p95WallMs, 1e-12)) : 1;
13378
+ return Math.min(cost, latency);
13379
+ }
13380
+ function bounded(x) {
13381
+ if (!Number.isFinite(x)) return 0;
13382
+ return Math.max(0, Math.min(1, x));
13383
+ }
13384
+ function renderSummary(target, status, metrics, issues) {
13385
+ const prefix = `release confidence ${status}: ${target}`;
13386
+ const metricText = `scenarios=${metrics.scenarioCount} searchRuns=${metrics.searchRuns} holdoutRuns=${metrics.holdoutRuns} passRate=${fmt3(metrics.passRate)} meanScore=${fmt3(metrics.meanScore)}`;
13387
+ if (issues.length === 0) return `${prefix}; ${metricText}`;
13388
+ return `${prefix}; ${metricText}; issues=${issues.map((i) => i.code).join(",")}`;
13389
+ }
13390
+ function fmt3(x) {
13391
+ if (!Number.isFinite(x)) return String(x);
13392
+ return x.toFixed(4);
13393
+ }
13394
+
12816
13395
  // src/jsonl-trial-cache.ts
12817
13396
  import { appendFileSync as appendFileSync4, existsSync as existsSync6, mkdirSync as mkdirSync4, readFileSync as readFileSync5 } from "fs";
12818
13397
  import { dirname as dirname4 } from "path";
@@ -13458,9 +14037,9 @@ function passOrthogonality(input) {
13458
14037
  sims.push(cosineSimilarity(vectors[i], vectors[j]));
13459
14038
  }
13460
14039
  }
13461
- const mean9 = sims.length === 0 ? 0 : sims.reduce((a, b) => a + b, 0) / sims.length;
14040
+ const mean10 = sims.length === 0 ? 0 : sims.reduce((a, b) => a + b, 0) / sims.length;
13462
14041
  return {
13463
- orthogonality: Math.max(0, Math.min(1, 1 - mean9)),
14042
+ orthogonality: Math.max(0, Math.min(1, 1 - mean10)),
13464
14043
  passCount: passes.length,
13465
14044
  similarities: sims
13466
14045
  };
@@ -13506,8 +14085,8 @@ function bootstrapCi(baseline, candidate, options = {}) {
13506
14085
  const iterations = options.iterations ?? 1e3;
13507
14086
  const minTotal = options.minTotalSamples ?? 6;
13508
14087
  const rng = mulberry32(options.seed ?? hashSeed(baseline, candidate));
13509
- const baselineMean = mean8(baseline);
13510
- const candidateMean = mean8(candidate);
14088
+ const baselineMean = mean9(baseline);
14089
+ const candidateMean = mean9(candidate);
13511
14090
  const delta = candidateMean - baselineMean;
13512
14091
  if (baseline.length + candidate.length < minTotal || baseline.length === 0 || candidate.length === 0) {
13513
14092
  return {
@@ -13525,7 +14104,7 @@ function bootstrapCi(baseline, candidate, options = {}) {
13525
14104
  for (let i = 0; i < iterations; i++) {
13526
14105
  const bResample = resample(baseline, rng);
13527
14106
  const cResample = resample(candidate, rng);
13528
- deltas[i] = mean8(cResample) - mean8(bResample);
14107
+ deltas[i] = mean9(cResample) - mean9(bResample);
13529
14108
  }
13530
14109
  deltas.sort((a, b) => a - b);
13531
14110
  const lowerIdx = Math.floor(alpha / 2 * iterations);
@@ -13548,7 +14127,7 @@ function bootstrapCi(baseline, candidate, options = {}) {
13548
14127
  verdict
13549
14128
  };
13550
14129
  }
13551
- function mean8(xs) {
14130
+ function mean9(xs) {
13552
14131
  if (xs.length === 0) return 0;
13553
14132
  let s = 0;
13554
14133
  for (const x of xs) s += x;
@@ -13865,6 +14444,7 @@ export {
13865
14444
  TraceEmitter,
13866
14445
  TrialTelemetry,
13867
14446
  UNIVERSAL_FINDERS,
14447
+ acquisitionPlansForKnowledgeGaps,
13868
14448
  adversarialJudge,
13869
14449
  aggregateLlm,
13870
14450
  aggregateRunScore,
@@ -13872,6 +14452,7 @@ export {
13872
14452
  analyzeAntiSlop,
13873
14453
  analyzeSeries,
13874
14454
  argHash,
14455
+ assertReleaseConfidence,
13875
14456
  assignFeedbackSplit,
13876
14457
  attributeCounterfactuals,
13877
14458
  deterministicSplit as benchmarkDeterministicSplit,
@@ -13879,6 +14460,7 @@ export {
13879
14460
  benjaminiHochberg,
13880
14461
  bhAdjust,
13881
14462
  bisect,
14463
+ blockingKnowledgeEval,
13882
14464
  bonferroni,
13883
14465
  bootstrapCi,
13884
14466
  budgetBreachView,
@@ -13892,9 +14474,10 @@ export {
13892
14474
  callLlmJson,
13893
14475
  canaryLeakView,
13894
14476
  causalAttribution,
14477
+ checkBehavioralCanary,
13895
14478
  checkCanaries,
13896
14479
  checkSlos,
13897
- clamp01,
14480
+ clamp012 as clamp01,
13898
14481
  classifyEuAiRisk,
13899
14482
  classifyFailure,
13900
14483
  codeExecutionJudge,
@@ -13942,6 +14525,7 @@ export {
13942
14525
  evaluateContract,
13943
14526
  evaluateHypothesis,
13944
14527
  evaluateOracles,
14528
+ evaluateReleaseConfidence,
13945
14529
  executeScenario,
13946
14530
  expectAgent,
13947
14531
  exportRewardModel,
@@ -14014,6 +14598,7 @@ export {
14014
14598
  pairedTTest,
14015
14599
  pairedWilcoxon,
14016
14600
  paraphraseRobustness,
14601
+ paraphraseRobustnessScenarios,
14017
14602
  paretoChart,
14018
14603
  paretoFrontier,
14019
14604
  paretoFrontierWithCrowding,
@@ -14041,6 +14626,7 @@ export {
14041
14626
  regexMatch,
14042
14627
  regexMatches,
14043
14628
  regressionView,
14629
+ releaseTraceEvidenceFromMultiShotTrials,
14044
14630
  renderMarkdown,
14045
14631
  renderMarkdownReport,
14046
14632
  renderPlaybookMarkdown,
@@ -14058,6 +14644,7 @@ export {
14058
14644
  rowWhere,
14059
14645
  runAgentControlLoop,
14060
14646
  runAssertions,
14647
+ runBehavioralCanaries,
14061
14648
  runCanaries,
14062
14649
  runCounterfactual,
14063
14650
  runE2EWorkflow,
@@ -14081,6 +14668,7 @@ export {
14081
14668
  scanForMuffledGates,
14082
14669
  scoreAllProjects,
14083
14670
  scoreContinuity,
14671
+ scoreKnowledgeReadiness,
14084
14672
  scoreProject,
14085
14673
  scoreRedTeamOutput,
14086
14674
  scoreReferenceReplay,
@@ -14115,6 +14703,7 @@ export {
14115
14703
  trialTraceFromMultiShotTrial,
14116
14704
  typoMutator,
14117
14705
  urlContains,
14706
+ userQuestionsForKnowledgeGaps,
14118
14707
  validateRunRecord,
14119
14708
  verbosityBias,
14120
14709
  verifyManifest,