npm - @tangle-network/agent-eval - Versions diffs - 0.19.0 → 0.20.0 - Mend

@tangle-network/agent-eval 0.19.0 → 0.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/README.md +38 -0
package/dist/index.d.ts +352 -4
package/dist/index.js +634 -45
package/dist/index.js.map +1 -1
package/docs/knowledge-readiness.md +84 -0
package/docs/multi-shot-optimization.md +7 -0
package/package.json +12 -10

package/dist/index.js CHANGED Viewed

@@ -417,7 +417,7 @@ function confidenceInterval(scores2, confidence = 0.95) {
   if (scores2.length === 0) return { mean: 0, lower: 0, upper: 0 };
   if (scores2.length === 1) return { mean: scores2[0], lower: scores2[0], upper: scores2[0] };
   const n = scores2.length;
-  const mean9 = scores2.reduce((a, b) => a + b, 0) / n;
+  const mean10 = scores2.reduce((a, b) => a + b, 0) / n;
   const B = 1e3;
   const bootstrapMeans = [];
   for (let i = 0; i < B; i++) {
@@ -432,7 +432,7 @@ function confidenceInterval(scores2, confidence = 0.95) {
   const lowerIdx = Math.floor(alpha / 2 * B);
   const upperIdx = Math.floor((1 - alpha / 2) * B) - 1;
   return {
-    mean: mean9,
+    mean: mean10,
     lower: bootstrapMeans[lowerIdx],
     upper: bootstrapMeans[Math.min(upperIdx, B - 1)]
   };
@@ -520,11 +520,11 @@ function pairedTTest(before, after) {
   const n = before.length;
   if (n < 2) return { t: 0, df: 0, p: 1 };
   const diffs = before.map((b, i) => after[i] - b);
-  const mean9 = diffs.reduce((a, b) => a + b, 0) / n;
-  const variance2 = diffs.reduce((acc, d) => acc + (d - mean9) ** 2, 0) / (n - 1);
+  const mean10 = diffs.reduce((a, b) => a + b, 0) / n;
+  const variance2 = diffs.reduce((acc, d) => acc + (d - mean10) ** 2, 0) / (n - 1);
   const se = Math.sqrt(variance2 / n);
-  if (se === 0) return { t: mean9 === 0 ? 0 : Infinity, df: n - 1, p: mean9 === 0 ? 1 : 0 };
-  const t = mean9 / se;
+  if (se === 0) return { t: mean10 === 0 ? 0 : Infinity, df: n - 1, p: mean10 === 0 ? 1 : 0 };
+  const t = mean10 / se;
   const df = n - 1;
   const p = 2 * (1 - studentTCdf(Math.abs(t), df));
   return { t, df, p };
@@ -548,9 +548,9 @@ function wilcoxonSignedRank(before, after) {
   }
   let wPlus = 0;
   for (let k = 0; k < n; k++) if (diffs[k] > 0) wPlus += ranks3[k];
-  const mean9 = n * (n + 1) / 4;
+  const mean10 = n * (n + 1) / 4;
   const variance2 = n * (n + 1) * (2 * n + 1) / 24;
-  const z = (wPlus - mean9) / Math.sqrt(variance2);
+  const z = (wPlus - mean10) / Math.sqrt(variance2);
   const p = 2 * (1 - normalCdf(Math.abs(z)));
   return { w: wPlus, p };
 }
@@ -2251,6 +2251,151 @@ async function finish(emitter, result) {
   return result;
 }
+// src/knowledge/readiness.ts
+/**
+ * Scores how ready a task's declared knowledge requirements are.
+ *
+ * Every requirement is normalized first. A requirement whose current
+ * confidence is below the confidence it needs counts as missing, and missing
+ * requirements are split into blocking and non-blocking gaps.
+ *
+ * @param options Task id, requirements, and optional evidence/claim/wiki ids,
+ *   user answers and metadata.
+ * @returns Report with the readiness score, both gap lists, the recommended
+ *   action, the assembled bundle, a severity and a human-readable reason.
+ */
+function scoreKnowledgeReadiness(options) {
+  const { taskId } = options;
+  const requirements = options.requirements.map(normalizeRequirement);
+  const missing = requirements.filter((req) => req.currentConfidence < req.confidenceNeeded);
+  // Partition the missing requirements in a single pass, preserving order.
+  const blockingMissingRequirements = [];
+  const nonBlockingGaps = [];
+  for (const req of missing) {
+    (isBlockingGap(req) ? blockingMissingRequirements : nonBlockingGaps).push(req);
+  }
+  const readinessScore = weightedReadiness(requirements);
+  const requirementEvidenceIds = requirements.flatMap((req) => req.evidenceIds);
+  const bundle = {
+    taskId,
+    requirements,
+    evidenceIds: unique([...(options.evidenceIds ?? []), ...requirementEvidenceIds]),
+    claimIds: unique(options.claimIds ?? []),
+    wikiPageIds: unique(options.wikiPageIds ?? []),
+    userAnswers: options.userAnswers ?? {},
+    missing,
+    readinessScore,
+    metadata: options.metadata
+  };
+  const recommendedAction = chooseRecommendedAction(blockingMissingRequirements, nonBlockingGaps);
+  const blockingCount = blockingMissingRequirements.length;
+  let severity;
+  let reason;
+  if (blockingCount > 0) {
+    severity = "critical";
+    reason = `${blockingCount} blocking knowledge requirement(s) are missing.`;
+  } else {
+    severity = nonBlockingGaps.some((gap) => gap.importance === "high") ? "warning" : "info";
+    reason = nonBlockingGaps.length > 0 ? `${nonBlockingGaps.length} non-blocking knowledge gap(s) remain.` : "All declared knowledge requirements are ready.";
+  }
+  return {
+    taskId,
+    readinessScore,
+    blockingMissingRequirements,
+    nonBlockingGaps,
+    recommendedAction,
+    bundle,
+    severity,
+    reason
+  };
+}
+/**
+ * Wraps a knowledge readiness report in an objectiveEval result.
+ *
+ * The eval passes only when the report lists no blocking missing requirements
+ * and its readiness score is at least `options.minimumScore` (default 0.7).
+ *
+ * @param report Readiness report as returned by scoreKnowledgeReadiness.
+ * @param options Optional `id` (default "knowledge-ready") and `minimumScore`.
+ * @returns Whatever objectiveEval returns for the assembled descriptor.
+ */
+function blockingKnowledgeEval(report, options = {}) {
+  const threshold = options.minimumScore ?? 0.7;
+  const blockers = report.blockingMissingRequirements;
+  const passed = blockers.length === 0 && report.readinessScore >= threshold;
+  const blockerIds = blockers.map((r) => r.id).join(", ");
+  return objectiveEval({
+    id: options.id ?? "knowledge-ready",
+    passed,
+    score: report.readinessScore,
+    severity: passed ? "info" : report.severity,
+    detail: report.reason,
+    // An empty id list is reported as "no evidence" rather than "".
+    evidence: blockerIds === "" ? void 0 : blockerIds,
+    metadata: { knowledgeReadiness: report }
+  });
+}
+/**
+ * Builds one user-facing question for each gap that should be resolved by
+ * asking: acquisitionMode "ask_user" or fallbackPolicy "ask".
+ *
+ * @param gaps Knowledge requirements that are still missing.
+ * @returns Question descriptors, in the same order as the qualifying gaps.
+ */
+function userQuestionsForKnowledgeGaps(gaps) {
+  const questions = [];
+  for (const gap of gaps) {
+    const shouldAsk = gap.acquisitionMode === "ask_user" || gap.fallbackPolicy === "ask";
+    if (!shouldAsk) continue;
+    const requiredFor = gap.requiredFor.join(", ");
+    questions.push({
+      id: `question_${gap.id}`,
+      question: `Please provide: ${gap.description}`,
+      reason: `Required for ${requiredFor || "the task"}.`,
+      requirementId: gap.id,
+      importance: gap.importance,
+      // Secrets are collected as credentials, everything else as free text.
+      answerType: gap.sensitivity === "secret" ? "credential" : "free_text",
+      impactIfUnknown: impactFor(gap)
+    });
+  }
+  return questions;
+}
+/**
+ * Groups gaps by acquisition mode and emits one acquisition plan per mode.
+ *
+ * Gaps whose mode planMode() maps to a falsy value are left out. Plans come
+ * out in the order their mode first appears in `gaps`.
+ *
+ * @param gaps Knowledge requirements that are still missing.
+ * @returns One plan per mode; only "ask_user" plans carry `questions`.
+ */
+function acquisitionPlansForKnowledgeGaps(gaps) {
+  const grouped = /* @__PURE__ */ new Map();
+  for (const gap of gaps) {
+    const mode = planMode(gap.acquisitionMode);
+    if (!mode) continue;
+    if (!grouped.has(mode)) grouped.set(mode, []);
+    grouped.get(mode).push(gap);
+  }
+  const plans = [];
+  for (const [mode, requirements] of grouped) {
+    plans.push({
+      id: `acquire_${mode}`,
+      requirementIds: requirements.map((r) => r.id),
+      mode,
+      description: descriptionForPlan(mode, requirements),
+      priority: maxImportance(requirements.map((r) => r.importance)),
+      questions: mode === "ask_user" ? userQuestionsForKnowledgeGaps(requirements) : void 0
+    });
+  }
+  return plans;
+}
+/**
+ * Returns a shallow copy of a requirement with both confidence fields clamped
+ * through clamp01 and its evidence ids de-duplicated.
+ *
+ * @param requirement Raw knowledge requirement.
+ * @returns New requirement object; the input is not modified.
+ */
+function normalizeRequirement(requirement) {
+  const normalized = { ...requirement };
+  normalized.confidenceNeeded = clamp01(requirement.confidenceNeeded);
+  normalized.currentConfidence = clamp01(requirement.currentConfidence);
+  normalized.evidenceIds = unique(requirement.evidenceIds);
+  return normalized;
+}
+/**
+ * Importance-weighted average of how far each requirement is towards the
+ * confidence it needs.
+ *
+ * A requirement that needs no confidence (<= 0) counts as fully met, and a
+ * requirement never contributes more than 1 however confident it is.
+ *
+ * @param requirements Normalized knowledge requirements.
+ * @returns Score in [0, 1]; 1 when there are no requirements.
+ */
+function weightedReadiness(requirements) {
+  if (requirements.length === 0) return 1;
+  const totals = requirements.reduce(
+    (acc, req) => {
+      const weight = importanceWeight(req.importance);
+      const progress = req.confidenceNeeded <= 0 ? 1 : Math.min(1, req.currentConfidence / req.confidenceNeeded);
+      acc.weight += weight;
+      acc.score += weight * progress;
+      return acc;
+    },
+    { weight: 0, score: 0 }
+  );
+  return clamp01(totals.score / totals.weight);
+}
+/**
+ * Tells whether a missing requirement blocks execution: importance
+ * "blocking", fallbackPolicy "block", or sensitivity "secret".
+ *
+ * @param requirement Knowledge requirement to classify.
+ * @returns true when any of the three conditions holds.
+ */
+function isBlockingGap(requirement) {
+  const { importance, fallbackPolicy, sensitivity } = requirement;
+  if (importance === "blocking") return true;
+  if (fallbackPolicy === "block") return true;
+  return sensitivity === "secret";
+}
+/**
+ * Picks the next action from the outstanding gaps.
+ *
+ * With no gaps at all the agent can run. Otherwise the blocking gaps are
+ * checked against an ordered ladder and the first rung that matches wins.
+ * Non-blocking gaps are only consulted at the end, and only for their
+ * importance; their acquisition modes are not looked at.
+ *
+ * @param blocking Missing requirements classified as blocking.
+ * @param nonBlocking Missing requirements that do not block.
+ * @returns Name of the recommended action.
+ */
+function chooseRecommendedAction(blocking, nonBlocking) {
+  if (blocking.length === 0 && nonBlocking.length === 0) return "run_agent";
+  // Ordered by precedence: the first predicate any blocking gap satisfies decides.
+  const ladder = [
+    [(gap) => gap.acquisitionMode === "ask_user" || gap.fallbackPolicy === "ask", "ask_user"],
+    [(gap) => gap.acquisitionMode === "query_connector", "query_connectors"],
+    [(gap) => gap.acquisitionMode === "inspect_repo" || gap.acquisitionMode === "run_command", "inspect_repo"],
+    [(gap) => gap.acquisitionMode === "search_web", "collect_web_data"],
+    [(gap) => gap.acquisitionMode === "not_available", "abort_or_rescope"]
+  ];
+  for (const [applies, action] of ladder) {
+    if (blocking.some(applies)) return action;
+  }
+  const hasHighImportanceGap = nonBlocking.some((gap) => gap.importance === "high");
+  return hasHighImportanceGap ? "build_domain_wiki" : "continue_with_caveat";
+}
+/**
+ * Maps an acquisition mode to the mode used for planning.
+ *
+ * @param mode Acquisition mode of a requirement.
+ * @returns null for "infer_low_confidence" and "not_available" (no plan is
+ *   built for those); otherwise the mode unchanged.
+ */
+function planMode(mode) {
+  const unplannable = mode === "infer_low_confidence" || mode === "not_available";
+  return unplannable ? null : mode;
+}
+/**
+ * Produces the human-readable description of an acquisition plan.
+ *
+ * @param mode Plan mode; unrecognized modes get the domain-wiki wording.
+ * @param requirements Requirements covered by the plan; their descriptions
+ *   are joined with "; ".
+ * @returns Description sentence for the plan.
+ */
+function descriptionForPlan(mode, requirements) {
+  const labels = requirements.map((r) => r.description).join("; ");
+  switch (mode) {
+    case "ask_user":
+      return `Ask the user for: ${labels}`;
+    case "search_web":
+      return `Search web or documentation sources for: ${labels}`;
+    case "query_connector":
+      return `Query configured connectors for: ${labels}`;
+    case "inspect_repo":
+      return `Inspect repository context for: ${labels}`;
+    case "run_command":
+      return `Run local commands to collect: ${labels}`;
+    default:
+      return `Build domain wiki evidence for: ${labels}`;
+  }
+}
+/**
+ * Describes what happens if a requirement stays unknown, based on its
+ * fallbackPolicy.
+ *
+ * @param requirement Knowledge requirement being asked about.
+ * @returns Impact sentence; any policy not listed gets the "ask" wording.
+ */
+function impactFor(requirement) {
+  switch (requirement.fallbackPolicy) {
+    case "block":
+      return "The agent should not run until this is known.";
+    case "continue_with_caveat":
+      return "The agent may continue, but must disclose uncertainty.";
+    case "use_default":
+      return "The agent will use the configured default if skipped.";
+    default:
+      return "The agent should ask before continuing.";
+  }
+}
+/**
+ * Returns the highest importance level present in `values`.
+ *
+ * @param values Importance labels to scan.
+ * @returns The first of "blocking", "high", "medium", "low" that occurs in
+ *   `values`; "low" when none of them occurs.
+ */
+function maxImportance(values) {
+  // Listed from most to least important, so the first hit is the maximum.
+  for (const level of ["blocking", "high", "medium", "low"]) {
+    if (values.includes(level)) return level;
+  }
+  return "low";
+}
+/**
+ * Numeric weight of an importance level.
+ *
+ * @param importance Importance label of a requirement.
+ * @returns 8 for "blocking", 4 for "high", 2 for "medium", 1 for anything else.
+ */
+function importanceWeight(importance) {
+  switch (importance) {
+    case "blocking":
+      return 8;
+    case "high":
+      return 4;
+    case "medium":
+      return 2;
+    default:
+      return 1;
+  }
+}
+/**
+ * Clamps a number into the closed interval [0, 1].
+ *
+ * @param value Candidate number.
+ * @returns 0 when `value` is not a finite number (NaN, +/-Infinity or a
+ *   non-number); otherwise `value` limited to [0, 1].
+ */
+function clamp01(value) {
+  return Number.isFinite(value) ? Math.min(1, Math.max(0, value)) : 0;
+}
+/**
+ * De-duplicates a list while keeping first-seen order.
+ *
+ * @param items Iterable of values; null or undefined yields an empty array.
+ * @returns New array containing each distinct value once.
+ */
+function unique(items) {
+  return Array.from(new Set(items));
+}
 // src/feedback-trajectory.ts
 var DEFAULT_SPLIT_POLICY = {
   trainPct: 70,
@@ -3521,9 +3666,9 @@ var DEFAULT_RUN_SCORE_WEIGHTS = {
 };
 function aggregateRunScore(score, weights = {}) {
   const w = { ...DEFAULT_RUN_SCORE_WEIGHTS, ...weights };
-  return w.success * clamp01(score.success) + w.goalProgress * clamp01(score.goalProgress) + w.repoGroundedness * clamp01(score.repoGroundedness) + w.driftPenalty * clamp01(score.driftPenalty) + w.toolUseQuality * clamp01(score.toolUseQuality) + w.patchQuality * clamp01(score.patchQuality) + w.testReality * clamp01(score.testReality) + w.finalGate * clamp01(score.finalGate) + w.reviewerBlockers * clamp01(score.reviewerBlockers) + w.costUsd * Math.max(0, score.costUsd) + w.wallSeconds * Math.max(0, score.wallSeconds / 60);
+  return w.success * clamp012(score.success) + w.goalProgress * clamp012(score.goalProgress) + w.repoGroundedness * clamp012(score.repoGroundedness) + w.driftPenalty * clamp012(score.driftPenalty) + w.toolUseQuality * clamp012(score.toolUseQuality) + w.patchQuality * clamp012(score.patchQuality) + w.testReality * clamp012(score.testReality) + w.finalGate * clamp012(score.finalGate) + w.reviewerBlockers * clamp012(score.reviewerBlockers) + w.costUsd * Math.max(0, score.costUsd) + w.wallSeconds * Math.max(0, score.wallSeconds / 60);
 }
-function clamp01(value) {
+function clamp012(value) {
   if (!Number.isFinite(value)) return 0;
   return Math.max(0, Math.min(1, value));
 }
@@ -3567,13 +3712,13 @@ var RunCritic = class {
     const success = trace.run.outcome?.pass === true ? 1 : trace.run.status === "completed" ? 0.5 : 0;
     if (!success) notes.push("run did not complete with pass=true");
     const judgeAverage = judgeSpans2.length ? judgeSpans2.reduce((sum2, span) => sum2 + normalizeJudgeScore(span.score), 0) / judgeSpans2.length : void 0;
-    const outcomeScore = typeof trace.run.outcome?.score === "number" ? clamp01(trace.run.outcome.score > 1 ? trace.run.outcome.score / 100 : trace.run.outcome.score) : void 0;
+    const outcomeScore = typeof trace.run.outcome?.score === "number" ? clamp012(trace.run.outcome.score > 1 ? trace.run.outcome.score / 100 : trace.run.outcome.score) : void 0;
     const goalProgress = outcomeScore ?? judgeAverage ?? success;
     const successfulTools = toolSpans2.filter((span) => span.status !== "error").length;
     const toolUseQuality = toolSpans2.length === 0 ? 0 : successfulTools / toolSpans2.length;
     if (toolSpans2.length === 0) notes.push("no tool spans recorded");
     const patchEvidence = trace.artifacts.length + toolSpans2.filter((span) => /write|edit|patch|apply/i.test(span.toolName)).length;
-    const patchQuality = patchEvidence > 0 ? clamp01(patchEvidence / 4) : 0;
+    const patchQuality = patchEvidence > 0 ? clamp012(patchEvidence / 4) : 0;
     if (!patchQuality) notes.push("no artifact or edit evidence recorded");
     const sandboxTests = sandboxSpans.filter((span) => typeof span.testsTotal === "number" && span.testsTotal > 0);
     const testReality = sandboxTests.length ? sandboxTests.reduce((sum2, span) => sum2 + (span.testsPassed ?? 0) / Math.max(1, span.testsTotal ?? 1), 0) / sandboxTests.length : toolSpans2.some((span) => /\btest|vitest|pytest|jest|build|tsc\b/i.test(JSON.stringify(span.args))) ? 0.4 : 0;
@@ -3617,7 +3762,7 @@ var RunCritic = class {
   }
 };
 function normalizeJudgeScore(score) {
-  return score > 1 ? clamp01(score / 10) : clamp01(score);
+  return score > 1 ? clamp012(score / 10) : clamp012(score);
 }
 function looksRepoGrounded(text) {
   return /(?:src\/|tests?\/|package\.json|tsconfig|\.ts\b|\.tsx\b|git status|pnpm |npm |vitest|pytest|jest)/i.test(text);
@@ -4973,6 +5118,17 @@ var FAILURE_CLASSES = [
   "cost_overrun",
   "timeout",
   "sandbox_failure",
+  "missing_user_data",
+  "missing_domain_data",
+  "missing_codebase_context",
+  "missing_runtime_context",
+  "missing_credentials",
+  "stale_external_data",
+  "bad_retrieval",
+  "insufficient_evidence",
+  "contradictory_evidence",
+  "ambiguous_user_intent",
+  "knowledge_readiness_blocked",
   "unknown"
 ];
 function isLlmSpan(s) {
@@ -5329,6 +5485,62 @@ var DEFAULT_RULES = [
       return null;
     }
   },
+  {
+    id: "knowledge-readiness-blocked",
+    match: ({ events }) => {
+      const event = events.find((e) => e.kind === "custom" && e.payload.kind === "readiness_scored" && e.payload.passed === false);
+      return event ? {
+        failureClass: "knowledge_readiness_blocked",
+        reason: "knowledge readiness report blocked execution",
+        triggerEventId: event.eventId
+      } : null;
+    }
+  },
+  {
+    id: "missing-credentials",
+    match: ({ events }) => {
+      const event = events.find((e) => e.kind === "custom" && e.payload.kind === "knowledge_gap" && e.payload.category === "credential_or_secret");
+      return event ? {
+        failureClass: "missing_credentials",
+        reason: "required credential or secret was missing",
+        triggerEventId: event.eventId
+      } : null;
+    }
+  },
+  {
+    id: "bad-retrieval",
+    match: ({ run, spans }) => {
+      if (run.outcome?.pass !== false) return null;
+      const retrieval = spans.find((s) => s.kind === "retrieval" && (s.hits.length === 0 || s.hits.every((hit) => hit.score <= 0)));
+      return retrieval ? {
+        failureClass: "bad_retrieval",
+        reason: "retrieval returned no useful hits for a failed run",
+        triggerSpanId: retrieval.spanId
+      } : null;
+    }
+  },
+  {
+    id: "insufficient-evidence",
+    match: ({ events }) => {
+      const event = events.find((e) => e.kind === "custom" && e.payload.kind === "knowledge_gap" && e.payload.reason === "insufficient_evidence");
+      return event ? {
+        failureClass: "insufficient_evidence",
+        reason: "task proceeded with insufficient supporting evidence",
+        triggerEventId: event.eventId
+      } : null;
+    }
+  },
+  {
+    id: "contradictory-evidence",
+    match: ({ events }) => {
+      const event = events.find((e) => e.kind === "custom" && e.payload.kind === "knowledge_gap" && e.payload.reason === "contradictory_evidence");
+      return event ? {
+        failureClass: "contradictory_evidence",
+        reason: "supporting evidence contradicted itself",
+        triggerEventId: event.eventId
+      } : null;
+    }
+  },
   // Budget breach events
   {
     id: "budget-breach",
@@ -5667,11 +5879,14 @@ async function failureClusterView(store, options = {}) {
     const cls = classifyFailure({ run, spans, events }, rules);
     let toolName;
     let argPrefix;
+    let dimension;
     if (cls.triggerSpanId) {
       const trig = spans.find((s) => s.spanId === cls.triggerSpanId);
       if (trig?.kind === "tool") {
         toolName = trig.toolName;
         argPrefix = argHash(trig.args).slice(0, 16);
+      } else if (trig?.kind === "judge") {
+        dimension = trig.dimension;
       }
     }
     if (!toolName) {
@@ -5682,13 +5897,18 @@ async function failureClusterView(store, options = {}) {
         argPrefix = argHash(errored.args).slice(0, 16);
       }
     }
-    const key = `${cls.failureClass}|${toolName ?? ""}|${argPrefix ?? ""}`;
+    if (!dimension) {
+      const judge = spans.find((s) => s.kind === "judge" && typeof s.dimension === "string");
+      if (judge?.kind === "judge") dimension = judge.dimension;
+    }
+    const key = `${cls.failureClass}|${toolName ?? ""}|${argPrefix ?? ""}|${dimension ?? ""}`;
     let cluster = clusters.get(key);
     if (!cluster) {
       cluster = {
         failureClass: cls.failureClass,
         toolName,
         argPrefix,
+        dimension,
         runCount: 0,
         scenarioIds: [],
         exampleRunId: run.runId,
@@ -6457,10 +6677,10 @@ function analyzeSeries(values, options = {}) {
     return { state: "insufficient-data", windowMean: 0, windowCv: 0, tailRun: 0, stable: false };
   }
   const tail = values.slice(-window);
-  const mean9 = tail.reduce((a, b) => a + b, 0) / tail.length;
-  const variance2 = tail.reduce((acc, v) => acc + (v - mean9) ** 2, 0) / tail.length;
+  const mean10 = tail.reduce((a, b) => a + b, 0) / tail.length;
+  const variance2 = tail.reduce((acc, v) => acc + (v - mean10) ** 2, 0) / tail.length;
   const stdDev = Math.sqrt(variance2);
-  const refMean = Math.abs(mean9) > 1e-9 ? Math.abs(mean9) : 1;
+  const refMean = Math.abs(mean10) > 1e-9 ? Math.abs(mean10) : 1;
   const cv = stdDev / refMean;
   const stable = tail.length >= window && cv <= stableCv;
   let tailRun = 0;
@@ -6481,7 +6701,7 @@ function analyzeSeries(values, options = {}) {
   } else {
     state = "noisy";
   }
-  return { state, windowMean: mean9, windowCv: cv, tailRun, stable };
+  return { state, windowMean: mean10, windowCv: cv, tailRun, stable };
 }
 // src/state-continuity.ts
@@ -6673,6 +6893,46 @@ function checkCanaries(output, scenarios) {
   }
   return leaks;
 }
+/**
+ * Checks one output against a scenario's forbidden pattern.
+ *
+ * The pattern is `scenario.forbiddenPattern`, falling back to
+ * `scenario.canary`; a scenario with neither is never reported.
+ *
+ * @param output Text produced by the run.
+ * @param scenario Scenario carrying `id` and the pattern fields.
+ * @returns Leak record with scenario id, pattern and an excerpt around the
+ *   hit, or null when nothing matched.
+ */
+function checkBehavioralCanary(output, scenario) {
+  const pattern = scenario.forbiddenPattern ?? scenario.canary;
+  const hit = pattern ? matchForbidden(output, pattern) : null;
+  if (!hit) return null;
+  return {
+    scenarioId: scenario.id,
+    canary: pattern,
+    evidence: excerpt2(output, hit)
+  };
+}
+/**
+ * Runs checkBehavioralCanary over a batch of cases and collects the leaks.
+ *
+ * @param cases Entries with `output`, `scenario` and an optional `runId`.
+ * @returns Leak records in input order, each carrying a `runId` property
+ *   taken from the case when it has one.
+ */
+function runBehavioralCanaries(cases) {
+  const leaks = [];
+  for (const { output, scenario, runId } of cases) {
+    const leak = checkBehavioralCanary(output, scenario);
+    if (!leak) continue;
+    leaks.push({ ...leak, runId: runId ?? leak.runId });
+  }
+  return leaks;
+}
+/**
+ * Finds a forbidden pattern in `output`.
+ *
+ * A pattern that tryParseRegex accepts is applied as a regular expression;
+ * any other pattern is searched for as a literal substring.
+ *
+ * @param output Text to search.
+ * @param pattern Literal text or a "/body/flags" regular expression.
+ * @returns The matched text (the pattern itself for literal hits), or null
+ *   when there is no match or the regex match is empty.
+ */
+function matchForbidden(output, pattern) {
+  const re = tryParseRegex(pattern);
+  if (!re) return output.includes(pattern) ? pattern : null;
+  const found = output.match(re)?.[0];
+  // A zero-length regex match is treated as no hit.
+  return found ? found : null;
+}
+/**
+ * Parses a "/body/flags" string into a RegExp.
+ *
+ * @param pattern Candidate pattern string.
+ * @returns The compiled RegExp, or null when the string does not start with
+ *   "/", has no closing "/", uses flags outside g, i, m, s, u, y, or fails to
+ *   compile.
+ */
+function tryParseRegex(pattern) {
+  if (pattern.length < 2 || !pattern.startsWith("/")) return null;
+  // The last "/" closes the body; index 0 would mean there is only the opener.
+  const closing = pattern.lastIndexOf("/");
+  if (closing <= 0) return null;
+  const flags = pattern.slice(closing + 1);
+  if (!/^[gimsuy]*$/.test(flags)) return null;
+  try {
+    return new RegExp(pattern.slice(1, closing), flags);
+  } catch {
+    return null;
+  }
+}
 async function canaryLeakView(store, scenarios) {
   const targets = scenarios.filter((s) => !!s.canary);
   if (targets.length === 0) return [];
@@ -6938,9 +7198,9 @@ function benjaminiHochberg(pValues, fdr = 0.05) {
   for (let k = n - 1; k >= 0; k--) {
     const rank = k + 1;
     const raw = indexed[k].p * n / rank;
-    const bounded = Math.min(minRight, raw);
-    minRight = bounded;
-    q[indexed[k].i] = Math.min(1, bounded);
+    const bounded2 = Math.min(minRight, raw);
+    minRight = bounded2;
+    q[indexed[k].i] = Math.min(1, bounded2);
   }
   const significant = q.map((v) => v < fdr);
   return { qValues: q, significant };
@@ -7470,12 +7730,12 @@ async function paraphraseRobustness(prompt, mutators, scoreFn, options = {}) {
     variantScores.push({ mutator: id, score, mutated });
     all.push(score);
   }
-  const mean9 = all.reduce((a, b) => a + b, 0) / all.length;
-  const variance2 = all.reduce((a, v) => a + (v - mean9) ** 2, 0) / all.length;
+  const mean10 = all.reduce((a, b) => a + b, 0) / all.length;
+  const variance2 = all.reduce((a, v) => a + (v - mean10) ** 2, 0) / all.length;
   const stdDev = Math.sqrt(variance2);
-  const ref = Math.abs(mean9) > 1e-9 ? Math.abs(mean9) : 1;
+  const ref = Math.abs(mean10) > 1e-9 ? Math.abs(mean10) : 1;
   const robustness = Math.max(0, 1 - stdDev / ref);
-  return { originalScore, variantScores, meanScore: mean9, stdDev, robustness };
+  return { originalScore, variantScores, meanScore: mean10, stdDev, robustness };
 }
 var lowercaseMutator = (p) => p.toLowerCase();
 var sentenceReorderMutator = (p, seed) => {
@@ -7519,6 +7779,41 @@ var DEFAULT_MUTATORS = [
   { id: "politeness-prefix", fn: politenessPrefixMutator },
   { id: "whitespace-collapse", fn: whitespaceCollapseMutator }
 ];
+/**
+ * Measures how much scenario scores move when user turns are paraphrased.
+ *
+ * For every scenario the original turns are run once as a baseline, then each
+ * mutator is applied to every turn and the scenario is re-run `reps` times.
+ * Runs are awaited one at a time, in order.
+ *
+ * @param args.scenarios Scenarios with `id` and `userTurns`.
+ * @param args.mutators Entries with `name` and `mutator(turn)`.
+ * @param args.runScenario Async runner resolving to an object with `score`.
+ * @param args.reps Repetitions per mutator. Fractional values round up; a
+ *   missing, NaN or infinite value falls back to 1 so the loop always runs a
+ *   finite, non-zero number of times.
+ * @returns `score` (mean paraphrased / mean original, clamped to [0, 1], and
+ *   0 when the original mean is <= 0), per-scenario details and mutator names.
+ */
+async function paraphraseRobustnessScenarios(args) {
+  // Math.max(1, NaN) is NaN, which would skip every rep and turn each delta
+  // into 0/0; an infinite count would never finish. Normalize up front.
+  const requestedReps = Number(args.reps ?? 1);
+  const reps = Number.isFinite(requestedReps) ? Math.max(1, Math.ceil(requestedReps)) : 1;
+  const mutatorNames = args.mutators.map((m) => m.name);
+  const average = (values) => values.reduce((a, b) => a + b, 0) / values.length;
+  const perScenario = [];
+  for (const scenario of args.scenarios) {
+    const baseline = await args.runScenario({
+      id: scenario.id,
+      userTurns: scenario.userTurns
+    });
+    const originalScore = baseline.score;
+    const deltas = {};
+    const paraphrasedAll = [];
+    for (const m of args.mutators) {
+      const scores2 = [];
+      for (let r = 0; r < reps; r++) {
+        const mutatedTurns = scenario.userTurns.map((t) => m.mutator(t));
+        const out = await args.runScenario({
+          id: scenario.id,
+          userTurns: mutatedTurns
+        });
+        scores2.push(out.score);
+      }
+      deltas[m.name] = average(scores2) - originalScore;
+      paraphrasedAll.push(...scores2);
+    }
+    // Without mutators there is nothing to compare, so the scenario is neutral.
+    const paraphrasedMean = paraphrasedAll.length === 0 ? originalScore : average(paraphrasedAll);
+    perScenario.push({ id: scenario.id, originalScore, paraphrasedMean, deltas });
+  }
+  const meanOriginal = perScenario.length === 0 ? 0 : average(perScenario.map((p) => p.originalScore));
+  const meanParaphrased = perScenario.length === 0 ? 0 : average(perScenario.map((p) => p.paraphrasedMean));
+  const ratio2 = meanOriginal <= 0 ? 0 : meanParaphrased / meanOriginal;
+  const score = Math.max(0, Math.min(1, ratio2));
+  return { score, perScenario, mutators: mutatorNames };
+}
 // src/visual-diff.ts
 function visualDiff(a, b, options = {}) {
@@ -8396,8 +8691,8 @@ async function prmBestOfN(store, grader, runIds) {
   if (runIds.length === 0) throw new Error("prmBestOfN: at least 1 candidate required");
   const graded = await Promise.all(runIds.map((id) => grader.grade(store, id)));
   const ranked = [...graded].sort((a, b) => b.aggregateScore - a.aggregateScore);
-  const mean9 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
-  const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean9) ** 2, 0) / graded.length;
+  const mean10 = graded.reduce((a, g) => a + g.aggregateScore, 0) / graded.length;
+  const variance2 = graded.reduce((a, g) => a + (g.aggregateScore - mean10) ** 2, 0) / graded.length;
   return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
 }
 async function prmEnsembleBestOfN(store, graders, runIds) {
@@ -8419,8 +8714,8 @@ async function prmEnsembleBestOfN(store, graders, runIds) {
   const ranked = [...byRun.values()].sort(
     (a, b) => (bordaScores.get(b.runId) ?? 0) - (bordaScores.get(a.runId) ?? 0)
   );
-  const mean9 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
-  const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean9) ** 2, 0) / ranked.length;
+  const mean10 = ranked.reduce((a, g) => a + g.aggregateScore, 0) / ranked.length;
+  const variance2 = ranked.reduce((a, g) => a + (g.aggregateScore - mean10) ** 2, 0) / ranked.length;
   return { winner: ranked[0], ranked, stdDev: Math.sqrt(variance2) };
 }
@@ -8747,10 +9042,11 @@ async function signManifest(m) {
   const bytes = new TextEncoder().encode(JSON.stringify(canonical));
   const digest = await globalThis.crypto.subtle.digest("SHA-256", bytes);
   const hash = Array.from(new Uint8Array(digest)).map((b) => b.toString(16).padStart(2, "0")).join("");
-  return { ...m, contentHash: hash };
+  return { ...m, contentHash: hash, algo: "sha256-content" };
 }
 async function verifyManifest(m) {
-  const { contentHash, ...rest } = m;
+  const { contentHash, algo: _algo, ...rest } = m;
+  void _algo;
   const resigned = await signManifest(rest);
   return resigned.contentHash === contentHash;
 }
@@ -8950,8 +9246,8 @@ async function proposeSynthesisTargets(dataset, traceStore, options = {}) {
     const sRuns = runs.filter((r) => r.scenarioId === s.id);
     const scores2 = sRuns.map((r) => r.outcome?.score).filter((x) => typeof x === "number");
     if (scores2.length < 3) continue;
-    const mean9 = scores2.reduce((a, b) => a + b, 0) / scores2.length;
-    const variance2 = scores2.reduce((a, b) => a + (b - mean9) ** 2, 0) / scores2.length;
+    const mean10 = scores2.reduce((a, b) => a + b, 0) / scores2.length;
+    const variance2 = scores2.reduce((a, b) => a + (b - mean10) ** 2, 0) / scores2.length;
     if (variance2 > varianceThreshold) {
       targets.push({
         reason: "high-variance",
@@ -10989,7 +11285,7 @@ function defaultReferenceReplayMatcher(reference, candidate) {
   const textScore = tokenJaccard(referenceText, candidateText);
   const severityScore = reference.severity && candidate.severity ? normalize(reference.severity) === normalize(candidate.severity) ? 0.1 : -0.05 : 0;
   const tagScore = tagOverlap(reference.tags, candidate.tags) * 0.15;
-  const score = clamp012(textScore * 0.85 + tagScore + severityScore);
+  const score = clamp013(textScore * 0.85 + tagScore + severityScore);
   return { score, reason: `token=${textScore.toFixed(2)} tags=${tagScore.toFixed(2)} severity=${severityScore.toFixed(2)}` };
 }
 function scoreScenario(scenario, matcher, threshold, matchStrategy) {
@@ -11089,7 +11385,7 @@ function scorePair(scenario, matcher, reference, candidate) {
   if (!Number.isFinite(result.score)) {
     throw new Error(`reference replay matcher returned non-finite score for ${scenario.id}:${reference.id}:${candidate.id}`);
   }
-  return { score: clamp012(result.score), reason: result.reason ?? "" };
+  return { score: clamp013(result.score), reason: result.reason ?? "" };
 }
 function buildScenarioScore(scenario, matches2, falsePositives) {
   const matched = matches2.filter((match) => match.matched).length;
@@ -11188,7 +11484,7 @@ function tokens(text) {
 function normalize(text) {
   return text.toLowerCase().replace(/[^a-z0-9]+/g, " ").trim();
 }
-function clamp012(value) {
+function clamp013(value) {
   if (!Number.isFinite(value)) return 0;
   return Math.max(0, Math.min(1, value));
 }
@@ -12653,7 +12949,7 @@ async function scoreOne(config, variant, scenarioId, rep, split) {
       scenarioId,
       rep,
       ok: scored.ok ?? true,
-      score: clamp013(scored.score),
+      score: clamp014(scored.score),
       cost: scored.costUsd ?? run.costUsd ?? 0,
       durationMs: scored.durationMs ?? run.durationMs ?? 0,
       metrics: {
@@ -12765,7 +13061,7 @@ function stableHash2(input) {
   }
   return h >>> 0;
 }
-function clamp013(n) {
+function clamp014(n) {
   if (!Number.isFinite(n)) return 0;
   return Math.max(0, Math.min(1, n));
 }
@@ -12813,6 +13109,289 @@ function traceExcerpt(trace) {
   return void 0;
 }
+// src/release-confidence.ts
+var DEFAULT_THRESHOLDS = { // Default gate settings; evaluateReleaseConfidence spreads caller-supplied thresholds over these.
+  requireCorpus: true, // checkCorpus raises missing_corpus when neither a dataset nor scenarios are supplied
+  minScenarioCount: 1, // checkCorpus raises few_scenarios below this count
+  minSearchRuns: 1, // checkQuality raises few_search_runs below this count
+  minHoldoutRuns: 1, // checkGeneralization raises few_holdout_runs below this count (only when requireHoldout)
+  requireHoldout: true, // enables the holdout-split and holdout-run checks
+  minPassRate: 0.8, // checkQuality raises low_pass_rate below this value
+  minMeanScore: 0.7, // checkQuality raises low_mean_score below this value
+  maxOverfitGap: 0.15, // checkGeneralization raises overfit_gap when the finite search-minus-holdout gap exceeds this
+  maxMeanCostUsd: Number.POSITIVE_INFINITY, // nothing compares greater than +Infinity, so cost_budget never fires by default
+  maxP95WallMs: Number.POSITIVE_INFINITY, // likewise, latency_budget never fires by default
+  requireAsiForFailures: true, // checkDiagnostics returns early when this is false
+  failureScoreThreshold: 0.5 // scores strictly below this are counted as failures by the row/pass-rate helpers
+};
+/**
+ * Converts multi-shot trial records into release trace evidence rows.
+ *
+ * @param {Array<object>} trials - Trial records; each is read for scenarioId, variantId, split,
+ *   score, ok, trace.turns, cost, durationMs, error, asi and metadata.
+ * @returns {Array<object>} One evidence row per trial, in input order.
+ */
+function releaseTraceEvidenceFromMultiShotTrials(trials) {
+  // Only "holdout" and "dev" are kept as-is; every other split value is reported as "search".
+  const toEvidenceSplit = (split) => split === "holdout" || split === "dev" ? split : "search";
+  return trials.map((trial) => {
+    const turns = trial.trace?.turns;
+    return {
+      scenarioId: trial.scenarioId,
+      candidateId: trial.variantId,
+      split: toEvidenceSplit(trial.split),
+      score: trial.score,
+      ok: trial.ok,
+      // turnCount stays undefined unless the trace carries an actual turns array.
+      turnCount: Array.isArray(turns) ? turns.length : void 0,
+      costUsd: trial.cost,
+      durationMs: trial.durationMs,
+      // Any truthy error on the trial is labelled as a runtime error.
+      failureMode: trial.error ? "runtime_error" : void 0,
+      asi: trial.asi,
+      metadata: trial.metadata
+    };
+  });
+}
+/**
+ * Builds a release-confidence scorecard for one candidate.
+ *
+ * Caller thresholds are spread over DEFAULT_THRESHOLDS, runs and traces are filtered to the
+ * candidate, metrics are derived, and the corpus / quality / generalization / diagnostics /
+ * efficiency checks append issues that determine the final status.
+ *
+ * @param {object} input - Reads target, candidateId, baselineId, thresholds, runs, traces,
+ *   scenarios, dataset (scenarioCount, splitCounts) and gateDecision; all but target are
+ *   defaulted or null-guarded when absent.
+ * @returns {object} Scorecard with target, candidateId, baselineId, status, promote, axes,
+ *   issues, metrics, dataset, gateDecision and summary.
+ */
+function evaluateReleaseConfidence(input) {
+  const thresholds = { ...DEFAULT_THRESHOLDS, ...input.thresholds };
+  const candidateId = input.candidateId ?? null;
+  const gateDecision = input.gateDecision ?? null;
+  const runs = filterCandidate(input.runs ?? [], candidateId, input.baselineId);
+  const traces = filterTraceCandidate(input.traces ?? [], candidateId, input.baselineId);
+  const scenarios = input.scenarios ?? [];
+  // An explicit dataset manifest takes precedence over values derived from the scenario list.
+  const scenarioCount = input.dataset?.scenarioCount ?? scenarios.length;
+  const splitCounts = input.dataset?.splitCounts ?? countScenarioSplits(scenarios);
+  const searchScores = scoresFor(runs, "search");
+  const holdoutScores = scoresFor(runs, "holdout");
+  const allScores = [...searchScores, ...holdoutScores];
+  const traceScores = traces.map((t) => t.score).filter(isFiniteNumber);
+  // Trace scores are only used for the mean when the runs contributed no finite scores.
+  const scoreUniverse = allScores.length > 0 ? allScores : traceScores;
+  const searchRuns = runs.filter((r) => r.splitTag === "search").length;
+  const holdoutRuns = runs.filter((r) => r.splitTag === "holdout").length;
+  const searchMeanScore = mean8(searchScores);
+  const holdoutMeanScore = mean8(holdoutScores);
+  // Collect failed rows once; both failure metrics below are derived from the same list.
+  const failed = failedRows(runs, traces, thresholds.failureScoreThreshold);
+  const metrics = {
+    scenarioCount,
+    searchRuns,
+    holdoutRuns,
+    passRate: passRate(runs, traces, thresholds.failureScoreThreshold),
+    meanScore: mean8(scoreUniverse),
+    searchMeanScore,
+    holdoutMeanScore,
+    overfitGap: safeDiff2(searchMeanScore, holdoutMeanScore),
+    meanCostUsd: mean8([...runs.map((r) => r.costUsd), ...traces.map((t) => t.costUsd).filter(isFiniteNumber)]),
+    p95WallMs: percentile([...runs.map((r) => r.wallMs), ...traces.map((t) => t.durationMs).filter(isFiniteNumber)], 0.95),
+    failedRows: failed.length,
+    failuresWithAsi: failed.filter((row) => row.hasAsi).length,
+    singleShotTraces: traces.filter((t) => t.turnCount === 1).length,
+    multiShotTraces: traces.filter((t) => (t.turnCount ?? 0) > 1).length,
+    splitCounts,
+    domainCounts: countDomains(scenarios),
+    failureModeCounts: countFailureModes(runs, traces, thresholds.failureScoreThreshold),
+    responsibleSurfaceCounts: countResponsibleSurfaces(traces)
+  };
+  const issues = [];
+  checkCorpus(input, thresholds, metrics, issues);
+  checkQuality(thresholds, metrics, issues);
+  checkGeneralization(gateDecision, thresholds, metrics, issues);
+  checkDiagnostics(thresholds, metrics, issues);
+  checkEfficiency(thresholds, metrics, issues);
+  const axes = buildAxes(metrics, thresholds, gateDecision, issues);
+  // Any critical issue fails the release; any other issue downgrades it to a warning.
+  const status = issues.some((i) => i.severity === "critical") ? "fail" : issues.length > 0 ? "warn" : "pass";
+  return {
+    target: input.target,
+    candidateId,
+    baselineId: input.baselineId ?? null,
+    status,
+    // Promotion needs a clean pass and, when a gate decision was supplied, its approval too.
+    promote: status === "pass" && (gateDecision ? gateDecision.promote : true),
+    axes,
+    issues,
+    metrics,
+    dataset: input.dataset ?? null,
+    gateDecision,
+    summary: renderSummary(input.target, status, metrics, issues)
+  };
+}
+/**
+ * Evaluates release confidence and throws when the scorecard status is "fail".
+ *
+ * @param {object} input - Passed through unchanged to evaluateReleaseConfidence.
+ * @returns {object} The scorecard, for any status other than "fail".
+ * @throws {Error} With the scorecard summary as its message when status is "fail".
+ */
+function assertReleaseConfidence(input) {
+  const scorecard = evaluateReleaseConfidence(input);
+  if (scorecard.status !== "fail") return scorecard;
+  throw new Error(scorecard.summary);
+}
+/**
+ * Selects the runs that belong to the candidate under evaluation.
+ *
+ * @param {Array<object>} runs - Run records carrying a candidateId.
+ * @param {*} candidateId - When truthy, only runs with exactly this candidateId are kept.
+ * @param {*} baselineId - Used only when candidateId is falsy: runs with this id are dropped.
+ * @returns {Array<object>} A new array; a shallow copy of runs when neither id is truthy.
+ */
+function filterCandidate(runs, candidateId, baselineId) {
+  let keep = null;
+  if (candidateId) {
+    keep = (run) => run.candidateId === candidateId;
+  } else if (baselineId) {
+    keep = (run) => run.candidateId !== baselineId;
+  }
+  return keep ? runs.filter(keep) : [...runs];
+}
+/**
+ * Selects the traces that belong to the candidate under evaluation.
+ * Traces whose candidateId is undefined are always kept.
+ *
+ * @param {Array<object>} traces - Trace evidence rows with an optional candidateId.
+ * @param {*} candidateId - When truthy, attributed traces must match this id.
+ * @param {*} baselineId - Used only when candidateId is falsy: traces with this id are dropped.
+ * @returns {Array<object>} A new array; a shallow copy of traces when neither id is truthy.
+ */
+function filterTraceCandidate(traces, candidateId, baselineId) {
+  if (!candidateId && !baselineId) return [...traces];
+  return traces.filter((trace) => {
+    const owner = trace.candidateId;
+    if (owner === void 0) return true;
+    return candidateId ? owner === candidateId : owner !== baselineId;
+  });
+}
+/**
+ * Appends corpus-axis issues: missing corpus, too few scenarios, and missing holdout split.
+ *
+ * @param {object} input - Read for dataset and scenarios.
+ * @param {object} thresholds - Read for requireCorpus, minScenarioCount and requireHoldout.
+ * @param {object} metrics - Read for scenarioCount and splitCounts.holdout.
+ * @param {Array<object>} issues - Mutated in place; every issue pushed here is critical.
+ * @returns {void}
+ */
+function checkCorpus(input, thresholds, metrics, issues) {
+  const hasScenarios = (input.scenarios?.length ?? 0) > 0;
+  if (thresholds.requireCorpus && !input.dataset && !hasScenarios) {
+    issues.push({ axis: "corpus", severity: "critical", code: "missing_corpus", detail: "No Dataset manifest or scenarios supplied." });
+  }
+  if (metrics.scenarioCount < thresholds.minScenarioCount) {
+    issues.push({ axis: "corpus", severity: "critical", code: "few_scenarios", detail: `${metrics.scenarioCount} scenario(s) < min ${thresholds.minScenarioCount}.` });
+  }
+  // A split table with no holdout entry means zero holdout scenarios; without the fallback
+  // `undefined === 0` is false and the gate would silently pass.
+  const holdoutCount = metrics.splitCounts.holdout ?? 0;
+  if (thresholds.requireHoldout && holdoutCount === 0) {
+    issues.push({ axis: "corpus", severity: "critical", code: "missing_holdout_split", detail: "Corpus has no holdout scenarios." });
+  }
+}
+/**
+ * Appends quality-axis issues for too few search runs, low pass rate and low mean score.
+ * Each test is a plain `<` comparison, so a NaN metric raises no issue.
+ *
+ * @param {object} thresholds - Read for minSearchRuns, minPassRate and minMeanScore.
+ * @param {object} metrics - Read for searchRuns, passRate and meanScore.
+ * @param {Array<object>} issues - Mutated in place; every issue pushed here is critical.
+ * @returns {void}
+ */
+function checkQuality(thresholds, metrics, issues) {
+  const flag = (code, detail) => {
+    issues.push({ axis: "quality", severity: "critical", code, detail });
+  };
+  const { searchRuns, passRate: rate, meanScore } = metrics;
+  if (searchRuns < thresholds.minSearchRuns) {
+    flag("few_search_runs", `${searchRuns} search run(s) < min ${thresholds.minSearchRuns}.`);
+  }
+  if (rate < thresholds.minPassRate) {
+    flag("low_pass_rate", `passRate ${fmt3(rate)} < ${fmt3(thresholds.minPassRate)}.`);
+  }
+  if (meanScore < thresholds.minMeanScore) {
+    flag("low_mean_score", `meanScore ${fmt3(meanScore)} < ${fmt3(thresholds.minMeanScore)}.`);
+  }
+}
+/**
+ * Appends generalization-axis issues: too few holdout runs, an excessive search-holdout gap,
+ * and a gate decision that refuses promotion.
+ *
+ * @param {object|null} gateDecision - Optional; read for promote, rejectionCode and reason.
+ * @param {object} thresholds - Read for requireHoldout, minHoldoutRuns and maxOverfitGap.
+ * @param {object} metrics - Read for holdoutRuns and overfitGap.
+ * @param {Array<object>} issues - Mutated in place; every issue pushed here is critical.
+ * @returns {void}
+ */
+function checkGeneralization(gateDecision, thresholds, metrics, issues) {
+  const flag = (code, detail) => {
+    issues.push({ axis: "generalization", severity: "critical", code, detail });
+  };
+  const holdoutShort = thresholds.requireHoldout && metrics.holdoutRuns < thresholds.minHoldoutRuns;
+  if (holdoutShort) {
+    flag("few_holdout_runs", `${metrics.holdoutRuns} holdout run(s) < min ${thresholds.minHoldoutRuns}.`);
+  }
+  // A non-finite gap is skipped rather than treated as a violation.
+  const gap = metrics.overfitGap;
+  if (Number.isFinite(gap) && gap > thresholds.maxOverfitGap) {
+    flag("overfit_gap", `search-holdout gap ${fmt3(gap)} > ${fmt3(thresholds.maxOverfitGap)}.`);
+  }
+  if (gateDecision && !gateDecision.promote) {
+    // The issue code embeds the gate's rejection code, defaulting to "reject".
+    flag(`gate_${gateDecision.rejectionCode ?? "reject"}`, gateDecision.reason);
+  }
+}
+/**
+ * Appends a diagnostics-axis issue when more rows failed than carry actionable side information.
+ *
+ * @param {object} thresholds - Read for requireAsiForFailures; the check is skipped when falsy.
+ * @param {object} metrics - Read for failedRows and failuresWithAsi.
+ * @param {Array<object>} issues - Mutated in place with at most one critical issue.
+ * @returns {void}
+ */
+function checkDiagnostics(thresholds, metrics, issues) {
+  const enabled = Boolean(thresholds.requireAsiForFailures);
+  if (enabled && metrics.failedRows > metrics.failuresWithAsi) {
+    const uncovered = metrics.failedRows - metrics.failuresWithAsi;
+    issues.push({
+      axis: "diagnostics",
+      severity: "critical",
+      code: "missing_failure_asi",
+      detail: `${uncovered} failed row(s) have no actionable side information.`
+    });
+  }
+}
+/**
+ * Appends efficiency-axis issues when mean cost or p95 wall time exceeds its budget.
+ * Each test is a plain `>` comparison, so a NaN metric raises no issue.
+ *
+ * @param {object} thresholds - Read for maxMeanCostUsd and maxP95WallMs.
+ * @param {object} metrics - Read for meanCostUsd and p95WallMs.
+ * @param {Array<object>} issues - Mutated in place; every issue pushed here is critical.
+ * @returns {void}
+ */
+function checkEfficiency(thresholds, metrics, issues) {
+  const { meanCostUsd, p95WallMs } = metrics;
+  const overCost = meanCostUsd > thresholds.maxMeanCostUsd;
+  const overLatency = p95WallMs > thresholds.maxP95WallMs;
+  if (overCost) {
+    issues.push({ axis: "efficiency", severity: "critical", code: "cost_budget", detail: `meanCostUsd ${fmt3(meanCostUsd)} > ${fmt3(thresholds.maxMeanCostUsd)}.` });
+  }
+  if (overLatency) {
+    issues.push({ axis: "efficiency", severity: "critical", code: "latency_budget", detail: `p95WallMs ${fmt3(p95WallMs)} > ${fmt3(thresholds.maxP95WallMs)}.` });
+  }
+}
+/**
+ * Builds the five axis entries (corpus, quality, generalization, diagnostics, efficiency),
+ * in that order.
+ *
+ * @param {object} metrics - Computed release metrics.
+ * @param {object} thresholds - Read for minScenarioCount and maxOverfitGap; also passed to efficiencyScore.
+ * @param {object|null} gateDecision - A decision with a falsy promote zeroes the generalization score.
+ * @param {Array<object>} issues - Issues already collected; passed to axis for each entry.
+ * @returns {Array<object>} Axis objects as produced by axis().
+ */
+function buildAxes(metrics, thresholds, gateDecision, issues) {
+  const gateRejected = Boolean(gateDecision && !gateDecision.promote);
+  const corpusScore = bounded(metrics.scenarioCount / Math.max(1, thresholds.minScenarioCount));
+  // Quality is limited by the weaker of pass rate and mean score.
+  const qualityScore = Math.min(metrics.passRate, metrics.meanScore);
+  const generalizationScore = gateRejected ? 0 : gapScore(metrics.overfitGap, thresholds.maxOverfitGap);
+  // With no failed rows there is nothing to diagnose, so the axis scores 1.
+  const diagnosticsScore = metrics.failedRows === 0 ? 1 : metrics.failuresWithAsi / metrics.failedRows;
+  const efficiency = efficiencyScore(metrics, thresholds);
+  return [
+    axis("corpus", issues, corpusScore, `${metrics.scenarioCount} scenarios; holdout=${metrics.splitCounts.holdout}`),
+    axis("quality", issues, qualityScore, `passRate=${fmt3(metrics.passRate)} meanScore=${fmt3(metrics.meanScore)}`),
+    axis("generalization", issues, generalizationScore, `holdoutRuns=${metrics.holdoutRuns} overfitGap=${fmt3(metrics.overfitGap)}`),
+    axis("diagnostics", issues, diagnosticsScore, `failuresWithAsi=${metrics.failuresWithAsi}/${metrics.failedRows}`),
+    axis("efficiency", issues, efficiency, `meanCostUsd=${fmt3(metrics.meanCostUsd)} p95WallMs=${fmt3(metrics.p95WallMs)}`)
+  ];
+}
+/**
+ * Builds one axis entry, deriving its status from the issues filed against that axis.
+ *
+ * @param {string} name - Axis name; issues are matched on their axis field.
+ * @param {Array<object>} issues - All collected issues.
+ * @param {number} score - Raw score; passed through bounded() before being stored.
+ * @param {string} detail - Text stored on the entry as-is.
+ * @returns {{name: string, status: string, score: number, detail: string}}
+ */
+function axis(name, issues, score, detail) {
+  const own = issues.filter((issue) => issue.axis === name);
+  let status = "pass";
+  if (own.some((issue) => issue.severity === "critical")) {
+    status = "fail";
+  } else if (own.length > 0) {
+    status = "warn";
+  }
+  return { name, status, score: bounded(score), detail };
+}
+/**
+ * Tallies scenarios per split. Scenarios with a nullish split count as "train".
+ *
+ * @param {Iterable<object>} scenarios - Scenarios with an optional split field.
+ * @returns {object} Counts keyed by split; train, dev, test and holdout are always present.
+ */
+function countScenarioSplits(scenarios) {
+  const counts = { train: 0, dev: 0, test: 0, holdout: 0 };
+  for (const scenario of scenarios) {
+    const split = scenario.split ?? "train";
+    // Start unseen split names at 0; a bare `counts[split]++` would store NaN for them.
+    counts[split] = (counts[split] ?? 0) + 1;
+  }
+  return counts;
+}
+/**
+ * Tallies scenarios by domain label.
+ * The label is tags.domain, else tags.category, else "uncategorized".
+ *
+ * @param {Iterable<object>} scenarios - Scenarios with an optional tags object.
+ * @returns {object} Counts keyed by label.
+ */
+function countDomains(scenarios) {
+  const tally = {};
+  for (const scenario of scenarios) {
+    const tags = scenario.tags;
+    const label = tags?.domain ?? tags?.category ?? "uncategorized";
+    const seen = tally[label] ?? 0;
+    tally[label] = seen + 1;
+  }
+  return tally;
+}
+/**
+ * Tallies failure modes across runs and traces.
+ *
+ * A run counts when it has a truthy failureMode or a defined score below threshold; its score
+ * is outcome.holdoutScore, falling back to outcome.searchScore. A trace counts when it has a
+ * truthy failureMode, ok === false, or a defined score below threshold.
+ *
+ * @param {Iterable<object>} runs - Run records with outcome and optional failureMode.
+ * @param {Iterable<object>} traces - Trace rows with optional failureMode, ok and score.
+ * @param {number} threshold - Scores strictly below this count as failures.
+ * @returns {object} Counts keyed by failure mode.
+ */
+function countFailureModes(runs, traces, threshold) {
+  const tally = {};
+  const bump = (mode) => {
+    tally[mode] = (tally[mode] ?? 0) + 1;
+  };
+  const isLow = (score) => score !== void 0 && score < threshold;
+  for (const run of runs) {
+    const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
+    if (!run.failureMode && !isLow(score)) continue;
+    // Failures without an explicit mode are bucketed as "low_score".
+    bump(run.failureMode ?? "low_score");
+  }
+  for (const trace of traces) {
+    const notOk = trace.ok === false;
+    if (!trace.failureMode && !notOk && !isLow(trace.score)) continue;
+    // An explicit mode wins; otherwise ok === false takes precedence over a low score.
+    bump(trace.failureMode ?? (notOk ? "not_ok" : "low_score"));
+  }
+  return tally;
+}
+/**
+ * Tallies ASI entries by responsibleSurface across all traces.
+ * Entries with a nullish responsibleSurface are counted under "unknown".
+ *
+ * @param {Iterable<object>} traces - Trace rows with an optional asi list.
+ * @returns {object} Counts keyed by responsible surface.
+ */
+function countResponsibleSurfaces(traces) {
+  const tally = {};
+  for (const trace of traces) {
+    const entries = trace.asi ?? [];
+    for (const entry of entries) {
+      const key = entry.responsibleSurface ?? "unknown";
+      const seen = tally[key] ?? 0;
+      tally[key] = seen + 1;
+    }
+  }
+  return tally;
+}
+/**
+ * Lists the failed runs and traces, recording whether each carries actionable side information.
+ *
+ * A run fails on a truthy failureMode or a defined score below threshold (holdoutScore, else
+ * searchScore); it has ASI when outcome.raw.asi is a number greater than 0. A trace fails on
+ * a truthy failureMode, ok === false, or a defined score below threshold; it has ASI when its
+ * asi list is non-empty.
+ *
+ * @param {Iterable<object>} runs - Run records; outcome.raw is read for every failed run.
+ * @param {Iterable<object>} traces - Trace rows.
+ * @param {number} threshold - Scores strictly below this count as failures.
+ * @returns {Array<{hasAsi: boolean}>} One entry per failed row, runs first.
+ */
+function failedRows(runs, traces, threshold) {
+  const isLow = (score) => score !== void 0 && score < threshold;
+  const rows = [];
+  for (const run of runs) {
+    const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
+    if (!run.failureMode && !isLow(score)) continue;
+    const asiMetric = run.outcome.raw.asi;
+    rows.push({ hasAsi: typeof asiMetric === "number" && asiMetric > 0 });
+  }
+  for (const trace of traces) {
+    const failed = trace.failureMode || trace.ok === false || isLow(trace.score);
+    if (!failed) continue;
+    const asiCount = trace.asi?.length ?? 0;
+    rows.push({ hasAsi: asiCount > 0 });
+  }
+  return rows;
+}
+/**
+ * Computes the fraction of runs and traces that pass.
+ *
+ * A run passes only when it has no truthy failureMode and a defined score (holdoutScore, else
+ * searchScore) of at least threshold. A trace passes when ok is not false and its score is
+ * either undefined or at least threshold.
+ *
+ * @param {Array<object>} runs - Run records.
+ * @param {Array<object>} traces - Trace rows.
+ * @param {number} threshold - Minimum passing score.
+ * @returns {number} Passed rows divided by all rows; 0 when there are no rows.
+ */
+function passRate(runs, traces, threshold) {
+  const total = runs.length + traces.length;
+  if (total === 0) return 0;
+  const passingRuns = runs.filter((run) => {
+    if (run.failureMode) return false;
+    const score = run.outcome.holdoutScore ?? run.outcome.searchScore;
+    return score !== void 0 && score >= threshold;
+  });
+  const passingTraces = traces.filter((trace) => {
+    if (trace.ok === false) return false;
+    return trace.score === void 0 || trace.score >= threshold;
+  });
+  return (passingRuns.length + passingTraces.length) / total;
+}
+/**
+ * Collects the finite scores of the runs tagged with the given split.
+ *
+ * @param {Array<object>} runs - Run records with splitTag and outcome.
+ * @param {string} split - "holdout" reads outcome.holdoutScore; any other value reads outcome.searchScore.
+ * @returns {Array<number>} Scores that pass isFiniteNumber, in run order.
+ */
+function scoresFor(runs, split) {
+  const readScore = split === "holdout" ? (run) => run.outcome.holdoutScore : (run) => run.outcome.searchScore;
+  const tagged = runs.filter((run) => run.splitTag === split);
+  return tagged.map((run) => readScore(run)).filter(isFiniteNumber);
+}
+/**
+ * Arithmetic mean of a list.
+ *
+ * @param {Array<number>} xs - Values to average.
+ * @returns {number} The mean, or NaN for an empty list.
+ */
+function mean8(xs) {
+  const count = xs.length;
+  if (count === 0) return Number.NaN;
+  let total = 0;
+  xs.forEach((x) => {
+    total += x;
+  });
+  return total / count;
+}
+/**
+ * Picks the p-th percentile from a sorted copy of xs, at index ceil(p * length) - 1
+ * clamped into the valid index range.
+ *
+ * @param {Array<number>} xs - Values; the input array is not mutated.
+ * @param {number} p - Percentile as a fraction (the caller in this file passes 0.95).
+ * @returns {number} The selected element, or NaN for an empty list.
+ */
+function percentile(xs, p) {
+  if (xs.length === 0) return Number.NaN;
+  const ordered = Array.from(xs);
+  ordered.sort((lo, hi) => lo - hi);
+  const lastIndex = ordered.length - 1;
+  const rank = Math.ceil(p * ordered.length) - 1;
+  const index = Math.min(lastIndex, Math.max(0, rank));
+  return ordered[index];
+}
+/**
+ * Tests whether a value is a primitive number that is neither NaN nor infinite.
+ *
+ * @param {*} value - Any value.
+ * @returns {boolean} True only for finite numbers.
+ */
+function isFiniteNumber(value) {
+  if (typeof value !== "number") return false;
+  return Number.isFinite(value);
+}
+/**
+ * Subtracts b from a, yielding NaN unless both operands are finite.
+ *
+ * @param {number} a - Minuend.
+ * @param {number} b - Subtrahend.
+ * @returns {number} a - b, or NaN when either operand is not finite.
+ */
+function safeDiff2(a, b) {
+  const bothFinite = Number.isFinite(a) && Number.isFinite(b);
+  return bothFinite ? a - b : Number.NaN;
+}
+/**
+ * Scores an overfit gap against its allowed maximum.
+ *
+ * @param {number} gap - Observed gap; a non-finite gap scores 0.
+ * @param {number} maxGap - Allowed gap; when <= 0 the result is 1 for gap <= 0 and 0 otherwise.
+ * @returns {number} Otherwise bounded(1 - max(0, gap) / maxGap).
+ */
+function gapScore(gap, maxGap) {
+  if (!Number.isFinite(gap)) return 0;
+  if (maxGap <= 0) {
+    // With no tolerance available, only a non-positive gap earns the full score.
+    return gap <= 0 ? 1 : 0;
+  }
+  // Negative gaps are floored at 0 before the division.
+  const overshoot = Math.max(0, gap);
+  return bounded(1 - overshoot / maxGap);
+}
+/**
+ * Scores efficiency as the smaller of the cost and latency sub-scores.
+ *
+ * Each sub-score is bounded(budget / max(actual, 1e-12)), or 1 when either the budget or the
+ * measured value is not finite.
+ *
+ * @param {object} metrics - Read for meanCostUsd and p95WallMs.
+ * @param {object} thresholds - Read for maxMeanCostUsd and maxP95WallMs.
+ * @returns {number} min(cost sub-score, latency sub-score).
+ */
+function efficiencyScore(metrics, thresholds) {
+  const headroom = (budget, actual) => {
+    if (!Number.isFinite(budget) || !Number.isFinite(actual)) return 1;
+    // The 1e-12 floor keeps the divisor positive when the measured value is 0.
+    return bounded(budget / Math.max(actual, 1e-12));
+  };
+  const cost = headroom(thresholds.maxMeanCostUsd, metrics.meanCostUsd);
+  const latency = headroom(thresholds.maxP95WallMs, metrics.p95WallMs);
+  return Math.min(cost, latency);
+}
+/**
+ * Clamps a value into [0, 1].
+ *
+ * @param {number} x - Value to clamp.
+ * @returns {number} x limited to [0, 1]; 0 when x is not a finite number.
+ */
+function bounded(x) {
+  return Number.isFinite(x) ? Math.min(1, Math.max(0, x)) : 0;
+}
+/**
+ * Renders the one-line scorecard summary.
+ *
+ * @param {string} target - Release target label.
+ * @param {string} status - Scorecard status.
+ * @param {object} metrics - Read for scenarioCount, searchRuns, holdoutRuns, passRate and meanScore.
+ * @param {Array<object>} issues - When non-empty, their codes are appended as a comma-joined list.
+ * @returns {string} "; "-separated summary text.
+ */
+function renderSummary(target, status, metrics, issues) {
+  const segments = [
+    `release confidence ${status}: ${target}`,
+    `scenarios=${metrics.scenarioCount} searchRuns=${metrics.searchRuns} holdoutRuns=${metrics.holdoutRuns} passRate=${fmt3(metrics.passRate)} meanScore=${fmt3(metrics.meanScore)}`
+  ];
+  if (issues.length > 0) {
+    segments.push(`issues=${issues.map((issue) => issue.code).join(",")}`);
+  }
+  return segments.join("; ");
+}
+/**
+ * Formats a metric for display.
+ *
+ * @param {*} x - Value to format.
+ * @returns {string} Four decimal places for a finite number; String(x) for anything else.
+ */
+function fmt3(x) {
+  return Number.isFinite(x) ? x.toFixed(4) : String(x);
+}
 // src/jsonl-trial-cache.ts
 import { appendFileSync as appendFileSync4, existsSync as existsSync6, mkdirSync as mkdirSync4, readFileSync as readFileSync5 } from "fs";
 import { dirname as dirname4 } from "path";
@@ -13458,9 +14037,9 @@ function passOrthogonality(input) {
       sims.push(cosineSimilarity(vectors[i], vectors[j]));
     }
   }
-  const mean9 = sims.length === 0 ? 0 : sims.reduce((a, b) => a + b, 0) / sims.length;
+  const mean10 = sims.length === 0 ? 0 : sims.reduce((a, b) => a + b, 0) / sims.length;
   return {
-    orthogonality: Math.max(0, Math.min(1, 1 - mean9)),
+    orthogonality: Math.max(0, Math.min(1, 1 - mean10)),
     passCount: passes.length,
     similarities: sims
   };
@@ -13506,8 +14085,8 @@ function bootstrapCi(baseline, candidate, options = {}) {
   const iterations = options.iterations ?? 1e3;
   const minTotal = options.minTotalSamples ?? 6;
   const rng = mulberry32(options.seed ?? hashSeed(baseline, candidate));
-  const baselineMean = mean8(baseline);
-  const candidateMean = mean8(candidate);
+  const baselineMean = mean9(baseline);
+  const candidateMean = mean9(candidate);
   const delta = candidateMean - baselineMean;
   if (baseline.length + candidate.length < minTotal || baseline.length === 0 || candidate.length === 0) {
     return {
@@ -13525,7 +14104,7 @@ function bootstrapCi(baseline, candidate, options = {}) {
   for (let i = 0; i < iterations; i++) {
     const bResample = resample(baseline, rng);
     const cResample = resample(candidate, rng);
-    deltas[i] = mean8(cResample) - mean8(bResample);
+    deltas[i] = mean9(cResample) - mean9(bResample);
   }
   deltas.sort((a, b) => a - b);
   const lowerIdx = Math.floor(alpha / 2 * iterations);
@@ -13548,7 +14127,7 @@ function bootstrapCi(baseline, candidate, options = {}) {
     verdict
   };
 }
-function mean8(xs) {
+function mean9(xs) {
   if (xs.length === 0) return 0;
   let s = 0;
   for (const x of xs) s += x;
@@ -13865,6 +14444,7 @@ export {
   TraceEmitter,
   TrialTelemetry,
   UNIVERSAL_FINDERS,
+  acquisitionPlansForKnowledgeGaps,
   adversarialJudge,
   aggregateLlm,
   aggregateRunScore,
@@ -13872,6 +14452,7 @@ export {
   analyzeAntiSlop,
   analyzeSeries,
   argHash,
+  assertReleaseConfidence,
   assignFeedbackSplit,
   attributeCounterfactuals,
   deterministicSplit as benchmarkDeterministicSplit,
@@ -13879,6 +14460,7 @@ export {
   benjaminiHochberg,
   bhAdjust,
   bisect,
+  blockingKnowledgeEval,
   bonferroni,
   bootstrapCi,
   budgetBreachView,
@@ -13892,9 +14474,10 @@ export {
   callLlmJson,
   canaryLeakView,
   causalAttribution,
+  checkBehavioralCanary,
   checkCanaries,
   checkSlos,
-  clamp01,
+  clamp012 as clamp01,
   classifyEuAiRisk,
   classifyFailure,
   codeExecutionJudge,
@@ -13942,6 +14525,7 @@ export {
   evaluateContract,
   evaluateHypothesis,
   evaluateOracles,
+  evaluateReleaseConfidence,
   executeScenario,
   expectAgent,
   exportRewardModel,
@@ -14014,6 +14598,7 @@ export {
   pairedTTest,
   pairedWilcoxon,
   paraphraseRobustness,
+  paraphraseRobustnessScenarios,
   paretoChart,
   paretoFrontier,
   paretoFrontierWithCrowding,
@@ -14041,6 +14626,7 @@ export {
   regexMatch,
   regexMatches,
   regressionView,
+  releaseTraceEvidenceFromMultiShotTrials,
   renderMarkdown,
   renderMarkdownReport,
   renderPlaybookMarkdown,
@@ -14058,6 +14644,7 @@ export {
   rowWhere,
   runAgentControlLoop,
   runAssertions,
+  runBehavioralCanaries,
   runCanaries,
   runCounterfactual,
   runE2EWorkflow,
@@ -14081,6 +14668,7 @@ export {
   scanForMuffledGates,
   scoreAllProjects,
   scoreContinuity,
+  scoreKnowledgeReadiness,
   scoreProject,
   scoreRedTeamOutput,
   scoreReferenceReplay,
@@ -14115,6 +14703,7 @@ export {
   trialTraceFromMultiShotTrial,
   typoMutator,
   urlContains,
+  userQuestionsForKnowledgeGaps,
   validateRunRecord,
   verbosityBias,
   verifyManifest,