npm - incremnt - Versions diffs - 0.8.1 → 0.8.2 - Mend

incremnt 0.8.1 → 0.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

package/package.json +6 -1
package/src/ask-answer-verifier.js +249 -14
package/src/ask-coach.js +309 -21
package/src/openrouter.js +55 -30
package/src/promptfoo-evals.js +20 -3
package/src/queries.js +113 -18
package/src/score-prelude.js +16 -13
package/src/summary-evals.js +106 -474
package/src/sync-service.js +46 -11

package/src/summary-evals.js CHANGED Viewed

@@ -6,6 +6,7 @@ import {
   canonicalExerciseName,
   checkpointContext,
   cycleSummaryContext,
+  dateOnlyString,
   executeCoachReadTool,
   normalizeExerciseName,
   workoutSummaryContext,
@@ -24,12 +25,30 @@ import {
 import { computeScoreBand } from './score-context.js';
 import { stripXMLTagBlocks } from './prompt-security.js';
 import { extractAskProgramDraft, hasProgramDraftBlock } from './program-draft.js';
+import { findAskAnswerExerciseMentions, verifyAskAnswer } from './ask-answer-verifier.js';
 const __filename = fileURLToPath(import.meta.url);
 const __dirname = path.dirname(__filename);
 export const summaryEvalFixturesRoot = path.resolve(__dirname, '../test/fixtures/summary-evals');
+const ASK_PROVENANCE_VERIFIER_KEYS = new Set([
+  'unsupported_weight_claim',
+  'unrouted_weight_claim',
+  'unsupported_weighted_set_claim',
+  'unrouted_weighted_set_claim',
+  'e1rm_without_records',
+  'direction_inversion',
+  'tool_replay_failed'
+]);
+const ASK_CLAIM_VERIFIER_KEYS = new Set([
+  'target_hit_contradiction',
+  'target_hit_without_session_evidence',
+  'clean_consistency_contradiction',
+  'unsupported_fatigue_recovery'
+]);
 export function defaultCaseSetName() {
   return process.env.SUMMARY_EVAL_CASE_SET || 'synthetic';
 }
@@ -108,7 +127,8 @@ export function buildSummaryEvalContext(snapshot, testCase) {
       // prelude to the routed context. Including it here means a live eval feeds
       // the model the same dump-prone material, so evaluateAskScoreVoice actually
       // guards the prompt, not just the checker.
-      const prelude = formatIncrementScorePrelude(scoreHistoryFromSnapshot(snapshot), { question });
+      const responseProfile = routed?.metadata?.responseProfile ?? routed?.metadata?.intent?.responseProfile ?? testCase.context?.responseProfile;
+      const prelude = formatIncrementScorePrelude(scoreHistoryFromSnapshot(snapshot), { question, responseProfile });
       const routedContext = routed?.context ?? null;
       const trainingData = testCase.context?.trainingData
         ?? (prelude && routedContext ? `${prelude}\n\n${routedContext}` : (routedContext ?? prelude));
@@ -222,6 +242,49 @@ function normalizeText(value) {
   return String(value ?? '').trim();
 }
+const askVerifierCacheKey = Symbol('summaryEvalAskVerifierCache');
+export function summaryEvalAskVerifierToday(testCase) {
+  return dateOnlyString(testCase.context?.today ?? testCase.today ?? new Date());
+}
+function getAskVerifierResult(output, context, snapshot, testCase) {
+  const answer = normalizeText(output);
+  const routingMetadata = context?.routedMetadata ?? testCase.context?.routedMetadata ?? {};
+  const today = summaryEvalAskVerifierToday(testCase);
+  const exclude = testCase.exclude ?? [];
+  const cacheKey = `${answer}\n${JSON.stringify(routingMetadata)}\n${today}\n${JSON.stringify(exclude)}`;
+  if (context && typeof context === 'object') {
+    if (!context[askVerifierCacheKey]) {
+      Object.defineProperty(context, askVerifierCacheKey, {
+        value: new Map(),
+        enumerable: false
+      });
+    }
+    const cached = context[askVerifierCacheKey].get(cacheKey);
+    if (cached) return cached;
+  }
+  const verification = verifyAskAnswer({
+    answer,
+    snapshot,
+    routingMetadata,
+    today,
+    exclude,
+    strictMentionProvenance: false
+  });
+  const result = {
+    verification,
+    provenanceFailures: (verification.blockingFailures ?? [])
+      .filter((failure) => ASK_PROVENANCE_VERIFIER_KEYS.has(failure.key)),
+    claimFailures: (verification.blockingFailures ?? [])
+      .filter((failure) => ASK_CLAIM_VERIFIER_KEYS.has(failure.key))
+  };
+  if (context && typeof context === 'object') {
+    context[askVerifierCacheKey].set(cacheKey, result);
+  }
+  return result;
+}
 function parseJsonOutput(output) {
   const normalized = normalizeText(output);
   if (!normalized) return null;
@@ -697,7 +760,7 @@ function evaluateNoInsight(output, testCase) {
   };
 }
-function evaluateShape(output, testCase) {
+function evaluateShape(output, testCase, context = null) {
   const normalized = testCase.surface === 'scoreCommentary'
     ? scoreCommentaryText(output)
     : normalizeText(output);
@@ -768,12 +831,20 @@ function evaluateShape(output, testCase) {
         reasons.push(`Checkpoint summaries must be 2-3 paragraphs, got ${paragraphs}.`);
       }
       break;
-    case 'ask':
-      if (sentences < 1 || sentences > 12) {
+    case 'ask': {
+      // Expansive answers are intentionally richer; the old 12-sentence cap was
+      // the pre-expansive policy. Allow more for expansive (still bounded so a
+      // genuine wall of text is flagged), keep the tight cap for defensive.
+      const profile = context?.routedMetadata?.responseProfile
+        ?? context?.routedMetadata?.intent?.responseProfile
+        ?? askResponseProfileFromTestCase(testCase);
+      const maxAskSentences = profile === 'expansive' ? 20 : 12;
+      if (sentences < 1 || sentences > maxAskSentences) {
         passed = false;
-        reasons.push(`Ask-coach answers must be 1-12 sentences, got ${sentences}.`);
+        reasons.push(`Ask-coach answers must be 1-${maxAskSentences} sentences, got ${sentences}.`);
       }
       break;
+    }
     case 'scoreCommentary':
       if (sentences < 1 || sentences > 8) {
         passed = false;
@@ -888,45 +959,6 @@ function hasFatigueLanguage(output) {
   return /\b(fatigue|fatigued|underrecovered|recovery debt|fatigue ceiling|limited by recovery|limited by fatigue|accumulated fatigue)\b/i.test(output);
 }
-function hasAskFatigueRecoveryLanguage(output) {
-  return hasFatigueLanguage(output)
-    || /\b(?:poor|low|bad|incomplete)\s+recovery\b/i.test(output)
-    || /\bunder[-\s]?recovery\b/i.test(output)
-    || /\brecovery\s+(?:limited|held back|caused|explains|drove|deficit|issue|problem)\b/i.test(output);
-}
-function hasAskFatigueRecoveryUncertaintyLanguage(output) {
-  const missingRecoveryData = /\b(?:no|not enough|without|missing|lack(?:ing)?|insufficient)\s+(?:\w+\s+){0,4}?(?:recovery|readiness|vitals?|sleep|hrv|heart rate|data|info|signals?|metrics?)\b/i.test(output);
-  const refusesInference = /\b(?:cannot|can't|do not|don't|does not|doesn't|would not|wouldn't|not enough|isn't enough|is not enough|no basis to|hard to)\s+(?:\w+\s+){0,12}?(?:infer|tie|connect|attribute|blame|claim|say|show|prove|know|call)\s+(?:\w+\s+){0,12}?(?:fatigue|recovery|readiness|why)\b/i.test(output);
-  const recoveryDoesNotExplain = /\b(?:fatigue|recovery|readiness)\b\s+(?:\w+\s+){0,10}?(?:cannot|can't|does not|doesn't|would not|wouldn't|isn't|is not)\s+(?:\w+\s+){0,10}?(?:explain|prove|show|tell|account for)\b/i.test(output);
-  return missingRecoveryData || refusesInference || recoveryDoesNotExplain;
-}
-function hasAskPositiveFatigueRecoveryAttribution(output) {
-  const concept = String.raw`(?:fatigue|fatigued|under[-\s]?recovered|under[-\s]?recovery|poor recovery|low recovery|incomplete recovery|recovery debt|fatigue ceiling|accumulated fatigue)`;
-  const causeVerb = String.raw`(?:because|due to|caused by|from|reflects?|suggests?|indicates?|points? to|explains?|limited|held back|drove|contributed to|tied to|tie\s+\w+\s+to)`;
-  const patterns = [
-    new RegExp(String.raw`\b${causeVerb}\b.{0,80}\b${concept}\b`, 'gi'),
-    new RegExp(String.raw`\b${concept}\b.{0,80}\b(?:caused|limited|held back|explains?|drove|led to|contributed to|accounts? for)\b`, 'gi')
-  ];
-  for (const pattern of patterns) {
-    for (const match of output.matchAll(pattern)) {
-      const start = Math.max(0, (match.index ?? 0) - 40);
-      const window = output.slice(start, (match.index ?? 0) + match[0].length);
-      if (!/\b(?:not|no|cannot|can't|doesn't|does not|would not|wouldn't|isn't|is not)\b/i.test(window)) {
-        return true;
-      }
-    }
-  }
-  return false;
-}
-function hasUnsupportedAskFatigueRecoveryClaim(output) {
-  if (!hasAskFatigueRecoveryLanguage(output)) return false;
-  if (hasAskPositiveFatigueRecoveryAttribution(output)) return true;
-  return !hasAskFatigueRecoveryUncertaintyLanguage(output);
-}
 function matchesHistoricalFamilyName(claimName, actualName) {
   const claimVariants = new Set(historicalExerciseVariants(claimName));
   const actualVariants = new Set(historicalExerciseVariants(actualName));
@@ -1086,37 +1118,6 @@ function evaluateWorkoutClaims(output, context, testCase) {
   };
 }
-function extractAskTargetHitClaims(text) {
-  const claims = [];
-  const patterns = [
-    /\b(?:you\s+)?hit(?:ting)?\s+all\s+(?:your\s+)?target(?:ed)?\s+reps?\b/gi,
-    /\b(?:you\s+)?hit\s+all\s+(?:the\s+)?targets?\b/gi,
-    /\b(?:you\s+)?hit\s+(?:the|your)\s+target\b(?!\s+(?:of|for|on))/gi
-  ];
-  for (const pattern of patterns) {
-    for (const match of text.matchAll(pattern)) {
-      claims.push({ text: match[0] });
-    }
-  }
-  return claims;
-}
-function extractAskCleanConsistencyClaims(text) {
-  const claims = [];
-  const patterns = [
-    /\bclean,\s+consistent\b/gi,
-    /\bclean\s+and\s+consistent\b/gi,
-    /\bconsistent\s+set\s+of\s+work\b/gi,
-    /\bacross\s+the\s+board\b/gi
-  ];
-  for (const pattern of patterns) {
-    for (const match of text.matchAll(pattern)) {
-      claims.push({ text: match[0] });
-    }
-  }
-  return claims;
-}
 function extractAskPlannedListClaims(text) {
   const claims = [];
   const pattern = /\((\s*\d+(?:\s*,\s*\d+){2,})\s+planned\s*\)/gi;
@@ -1137,77 +1138,6 @@ function sessionPlannedReps(session) {
   return values;
 }
-function findMentionedExercises(text, snapshot) {
-  const exercisesByName = new Map();
-  for (const session of snapshot?.sessions ?? []) {
-    for (const exercise of session.exercises ?? []) {
-      if (!exercise?.name) continue;
-      const normalizedName = normalizeExerciseName(exercise.name);
-      if (!normalizedName || exercisesByName.has(normalizedName)) continue;
-      exercisesByName.set(normalizedName, exercise.name);
-    }
-    for (const exercise of session.prescriptionSnapshot?.exercises ?? []) {
-      if (!exercise?.exerciseName) continue;
-      const normalizedName = normalizeExerciseName(exercise.exerciseName);
-      if (!normalizedName || exercisesByName.has(normalizedName)) continue;
-      exercisesByName.set(normalizedName, exercise.exerciseName);
-    }
-  }
-  const mentions = [];
-  for (const [normalizedName, displayName] of exercisesByName) {
-    const pattern = new RegExp(`\\b${escapeRegExp(displayName)}\\b`, 'gi');
-    for (const match of text.matchAll(pattern)) {
-      mentions.push({
-        index: match.index ?? -1,
-        end: (match.index ?? -1) + match[0].length,
-        name: displayName,
-        normalizedName
-      });
-    }
-  }
-  return mentions
-    .filter((mention, index, allMentions) => !allMentions.some((candidate, candidateIndex) =>
-      candidateIndex !== index &&
-      candidate.index <= mention.index &&
-      candidate.end >= mention.end &&
-      candidate.normalizedName.length > mention.normalizedName.length
-    ))
-    .sort((lhs, rhs) => lhs.index - rhs.index);
-}
-function findRecentSessionMisses(snapshot, { lookbackDays = 7, exerciseNames = null } = {}) {
-  const sessions = snapshot?.sessions ?? [];
-  const cutoff = Date.now() - lookbackDays * 24 * 60 * 60 * 1000;
-  const scopedExerciseNames = exerciseNames && exerciseNames.length > 0 ? new Set(exerciseNames) : null;
-  const misses = [];
-  for (const session of sessions) {
-    const completedAt = session.completedAt || session.date;
-    const completedTime = Date.parse(completedAt);
-    if (!Number.isFinite(completedTime) || completedTime < cutoff) continue;
-    const targetByExercise = new Map();
-    for (const planned of session.prescriptionSnapshot?.exercises ?? []) {
-      const target = Number(planned.targetReps);
-      if (Number.isFinite(target) && target > 0) {
-        targetByExercise.set(normalizeExerciseName(planned.exerciseName), target);
-      }
-    }
-    for (const exercise of session.exercises ?? []) {
-      const normalizedExerciseName = normalizeExerciseName(exercise.name);
-      if (scopedExerciseNames && !scopedExerciseNames.has(normalizedExerciseName)) continue;
-      const target = targetByExercise.get(normalizedExerciseName);
-      if (!Number.isFinite(target)) continue;
-      for (const set of exercise.sets ?? []) {
-        const reps = Number(set.reps);
-        if (set.isComplete && Number.isFinite(reps) && reps < target) {
-          misses.push({ sessionId: session.id, exerciseName: exercise.name, reps, target });
-        }
-      }
-    }
-  }
-  return misses;
-}
 function findNearestMentionedExercise(mentions, index) {
   let candidate = null;
   for (const mention of mentions) {
@@ -1217,170 +1147,6 @@ function findNearestMentionedExercise(mentions, index) {
   return candidate;
 }
-function hasAskFatigueSupport(snapshot, lookbackDays = 7) {
-  const cutoff = Date.now() - lookbackDays * 24 * 60 * 60 * 1000;
-  const withinCutoff = (dateValue) => {
-    const ms = Date.parse(dateValue);
-    return Number.isFinite(ms) && ms >= cutoff;
-  };
-  const vitalsSummaries = snapshot?.vitalsSummaries ?? [];
-  if (vitalsSummaries.some((entry) => withinCutoff(entry.date))) return true;
-  const metrics = snapshot?.healthMetrics ?? {};
-  for (const key of ['restingHR', 'hrv', 'sleep']) {
-    const readings = Array.isArray(metrics[key]) ? metrics[key] : [];
-    if (readings.some((reading) => withinCutoff(reading.date))) return true;
-  }
-  for (const session of snapshot?.sessions ?? []) {
-    const completedAt = session.completedAt || session.date;
-    if (!withinCutoff(completedAt)) continue;
-    for (const exercise of session.exercises ?? []) {
-      const reps = (exercise.sets ?? [])
-        .map((set) => Number(set.reps))
-        .filter((value) => Number.isFinite(value) && value > 0);
-      if (reps.length < 2) continue;
-      const first = reps[0];
-      const last = reps[reps.length - 1];
-      if (first > 0 && (first - last) / first >= 0.3) return true;
-    }
-  }
-  return false;
-}
-function parseWeightNumber(raw) {
-  return Number(String(raw).replace(/,/g, ''));
-}
-function extractAskWeightClaims(text) {
-  const claims = [];
-  // Accept comma-grouped thousands ("40,500 kg") as a single number so volume
-  // figures are not shredded into bogus "500 kg" / "000 kg" claims. Volume/total
-  // figures are excluded by isVolumeWeightClaim at the call sites, not by a
-  // magnitude cap — heavy machine work (leg press, sled) legitimately exceeds
-  // 1000 kg, and a fabricated heavy load must still be graded.
-  const pattern = /\b(\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d+(?:\.\d+)?)\s*(?:kg|kilograms?)\b/gi;
-  for (const match of text.matchAll(pattern)) {
-    claims.push({
-      text: match[0],
-      value: parseWeightNumber(match[1]),
-      index: match.index ?? -1
-    });
-  }
-  return claims;
-}
-function extractAskWeightedSetClaims(text) {
-  const claims = [];
-  // A weight×reps pair is only unambiguous with "x"/"×" (e.g. "70 kg x 5"), or
-  // an explicit "for N rep(s)". Bare "X kg for N" is NOT a rep claim — N is
-  // almost always a SET count ("70 kg for 4 working sets") or a duration, and
-  // treating it as reps flags real data as a fabricated pair. So match only the
-  // unambiguous forms; the plain-weight loop still grounds the weight itself.
-  const pattern = /\b(\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d+(?:\.\d+)?)\s*(?:kg|kilograms?)\s*(?:(?:x|×)\s*(\d+)|for\s+(\d+)\s*reps?)\b/gi;
-  for (const match of text.matchAll(pattern)) {
-    claims.push({
-      text: match[0],
-      weight: parseWeightNumber(match[1]),
-      reps: Number(match[2] ?? match[3]),
-      index: match.index ?? -1,
-      end: (match.index ?? -1) + match[0].length
-    });
-  }
-  return claims;
-}
-function allowedWeightsForExercise(snapshot, normalizedExerciseName) {
-  const weights = [];
-  for (const session of snapshot?.sessions ?? []) {
-    for (const exercise of session.exercises ?? []) {
-      if (normalizeExerciseName(exercise.name) !== normalizedExerciseName) continue;
-      for (const set of exercise.sets ?? []) {
-        const weight = Number(set.weight);
-        if (Number.isFinite(weight)) weights.push(weight);
-      }
-    }
-    for (const exercise of session.prescriptionSnapshot?.exercises ?? []) {
-      if (normalizeExerciseName(exercise.exerciseName) !== normalizedExerciseName) continue;
-      const targetWeight = Number(exercise.targetWeight);
-      if (Number.isFinite(targetWeight)) weights.push(targetWeight);
-      for (const targetSet of exercise.targetSets ?? []) {
-        const weight = Number(targetSet.weight ?? targetSet.targetWeight);
-        if (Number.isFinite(weight)) weights.push(weight);
-      }
-    }
-  }
-  return weights;
-}
-function weightClaimSupported(claim, allowedWeights) {
-  return allowedWeights.some((weight) => Math.abs(weight - claim.value) < 0.01);
-}
-function isEstimatedOneRepMaxWeightClaim(text, claim) {
-  const start = Math.max(0, claim.index - 40);
-  const end = Math.min(text.length, claim.index + claim.text.length + 40);
-  const window = text.slice(start, end);
-  return /\b(?:estimated\s+)?(?:1rm|one[-\s]?rep\s+max)\b/i.test(window);
-}
-function isVolumeWeightClaim(text, claim) {
-  // A kg figure in a clause about volume/tonnage/total load is a workload total
-  // (e.g. "weekly strength volume fell from 44,000 kg to 40,500 kg"), not an
-  // exercise load. Scope to the claim's clause so a fabricated exercise load
-  // earlier in the same sentence is still graded.
-  return /\b(?:volume|tonnage|total\s+(?:load|work|volume|tonnage))\b/i.test(claimClause(text, claim));
-}
-function claimClause(text, claim) {
-  const boundaries = [
-    '\n',
-    '. ',
-    ';',
-    ', while',
-    ', whereas',
-    ', but',
-    ' while ',
-    ' whereas ',
-    ' but '
-  ];
-  let start = 0;
-  for (const boundary of boundaries) {
-    const index = text.lastIndexOf(boundary, claim.index);
-    if (index >= 0) start = Math.max(start, index + boundary.length);
-  }
-  let end = text.length;
-  for (const boundary of boundaries) {
-    const index = text.indexOf(boundary, claim.index + claim.text.length);
-    if (index >= 0) end = Math.min(end, index);
-  }
-  return text.slice(start, end);
-}
-// Returns the sentence containing the claim, so context guards can look at the
-// whole clause rather than a fixed-width window (body-weight phrasing can put
-// the "body weight" anchor well before the kg figure).
-function claimSentence(text, claim) {
-  const before = text.slice(0, claim.index);
-  const startBreak = Math.max(before.lastIndexOf('. '), before.lastIndexOf('\n'));
-  const start = startBreak >= 0 ? startBreak + 1 : 0;
-  const after = text.slice(claim.index);
-  const breaks = [after.indexOf('. '), after.indexOf('\n')].filter((i) => i >= 0);
-  const end = breaks.length ? claim.index + Math.min(...breaks) : text.length;
-  return text.slice(start, end);
-}
-// Body-weight figures ("body weight is up 0.6 kg", "80.0 kg latest") are not
-// exercise-load claims. findNearestMentionedExercise would otherwise attribute
-// them to the previously named lift and flag a correct answer as a
-// hallucination, so skip any kg figure stated in a body-weight clause.
-function isBodyWeightClaim(text, claim) {
-  return /\bbody\s*weight\b|\bbodyweight\b|\bweight\s+trend\b/i.test(claimSentence(text, claim));
-}
 function askWorkingTopSetRows(snapshot) {
   const rows = [];
   for (const session of stableSortByDateDesc(snapshot?.sessions ?? [], (session) => session.completedAt ?? session.date)) {
@@ -1529,7 +1295,7 @@ const SCORE_NUMBER = '\\d{1,3}(?:\\.\\d+)?';
 // number directly followed by one of these is left alone.
 const NON_SCORE_UNIT =
   '(?:kg|kilo|lbs?|pounds?|reps?|sets?|%|percent|pct|x\\b|for\\s+\\d|sessions?|days?|nights?|weeks?|months?|' +
-  'years?|yrs?|hrs?|hours?|mins?|minutes?|secs?|seconds?|rpe|rir|am|pm|out\\s+of|of\\b|/\\s*\\d)';
+  'years?|yrs?|h\\b|hrs?|hours?|mins?|minutes?|secs?|seconds?|bpm|ms|rpe|rir|am|pm|out\\s+of|of\\b|/\\s*\\d)';
 // Heuristic, not a parser. Flags a component name followed — within a short,
 // period/newline-free gap (one clause) — by a score-like number that is not a
@@ -1632,11 +1398,19 @@ function evaluateAskSelfReference(output, testCase) {
 // On a question that is not about the Increment Score, the coach must not
 // volunteer the bare overall score number (e.g. "your score is 92/100"). The
 // prelude withholds the number for non-score questions; this guards the answer.
-function evaluateAskVolunteeredScore(output, testCase) {
+function evaluateAskVolunteeredScore(output, testCase, context = {}) {
   if (testCase.surface !== 'ask') {
     return { key: 'ask_volunteered_score', passed: true, reason: 'Not an ask answer.' };
   }
   const question = testCase.context?.question ?? testCase.question ?? '';
+  const responseProfile = context?.routedMetadata?.responseProfile
+    ?? context?.routedMetadata?.intent?.responseProfile
+    ?? testCase.context?.routedMetadata?.responseProfile
+    ?? testCase.context?.routedMetadata?.intent?.responseProfile
+    ?? testCase.context?.responseProfile;
+  if (responseProfile === 'expansive') {
+    return { key: 'ask_volunteered_score', passed: true, reason: 'Expansive Ask answers may name the rounded score headline.' };
+  }
   if (isScoreQuestion(question)) {
     return { key: 'ask_volunteered_score', passed: true, reason: 'Question is about the score; naming it is allowed.' };
   }
@@ -1682,7 +1456,14 @@ function evaluateAskStaleness(output, snapshot, testCase) {
   };
 }
-function evaluateAskClaims(output, snapshot, testCase) {
+function askResponseProfileFromTestCase(testCase) {
+  return testCase?.context?.routedMetadata?.responseProfile
+    ?? testCase?.context?.routedMetadata?.intent?.responseProfile
+    ?? testCase?.context?.responseProfile
+    ?? null;
+}
+function evaluateAskClaims(output, snapshot, testCase, context = null) {
   if (testCase.surface !== 'ask') {
     return { key: 'ask_claims', passed: true, reason: 'Not an ask answer.' };
   }
@@ -1693,26 +1474,9 @@ function evaluateAskClaims(output, snapshot, testCase) {
   }
   const failures = [];
-  const mentionedExercises = findMentionedExercises(output, snapshot);
-  const scopedExerciseNames = uniqueStrings(mentionedExercises.map((mention) => mention.normalizedName));
-  const targetHitClaims = extractAskTargetHitClaims(normalized);
-  if (targetHitClaims.length > 0) {
-    const misses = findRecentSessionMisses(snapshot, { exerciseNames: scopedExerciseNames });
-    if (misses.length > 0) {
-      const sample = misses[0];
-      failures.push(`Ask answer claims targets hit ("${targetHitClaims[0].text}") but recent session ${sample.sessionId} has ${sample.exerciseName} at ${sample.reps} reps below target ${sample.target}.`);
-    }
-  }
-  const cleanConsistencyClaims = extractAskCleanConsistencyClaims(normalized);
-  if (cleanConsistencyClaims.length > 0) {
-    const misses = findRecentSessionMisses(snapshot, { exerciseNames: scopedExerciseNames });
-    if (misses.length > 0) {
-      const sample = misses[0];
-      failures.push(`Ask answer frames missed target reps as "${cleanConsistencyClaims[0].text}", but recent session ${sample.sessionId} has ${sample.exerciseName} at ${sample.reps} reps below target ${sample.target}.`);
-    }
-  }
+  const { claimFailures } = getAskVerifierResult(output, context, snapshot, testCase);
+  failures.push(...claimFailures.map((failure) => failure.reason));
+  const mentionedExercises = findAskAnswerExerciseMentions(output, snapshot);
   for (const claim of extractAskPlannedListClaims(normalized)) {
     const uniquePlanned = new Set(claim.reps);
@@ -1742,23 +1506,6 @@ function evaluateAskClaims(output, snapshot, testCase) {
     }
   }
-  for (const claim of extractAskWeightClaims(normalized)) {
-    if (isEstimatedOneRepMaxWeightClaim(normalized, claim)) continue;
-    if (isVolumeWeightClaim(normalized, claim)) continue;
-    if (isBodyWeightClaim(normalized, claim)) continue;
-    const referencedExercise = findNearestMentionedExercise(mentionedExercises, claim.index);
-    if (!referencedExercise) continue;
-    const allowedWeights = allowedWeightsForExercise(snapshot, referencedExercise.normalizedName);
-    if (allowedWeights.length === 0) continue;
-    if (!weightClaimSupported(claim, allowedWeights)) {
-      failures.push(`Ask answer asserts ${claim.text} for ${referencedExercise.name}, but that weight is not present in recorded or planned sets for that exercise.`);
-    }
-  }
-  if (hasUnsupportedAskFatigueRecoveryClaim(normalized) && !hasAskFatigueSupport(snapshot)) {
-    failures.push('Ask answer uses fatigue/recovery language but the snapshot has no recent vitals, sleep, or rep-dropoff signals to support it.');
-  }
   return {
     key: 'ask_claims',
     passed: failures.length === 0,
@@ -1826,55 +1573,6 @@ function askToolEvidenceRows(toolResults = []) {
   return rows;
 }
-function askToolEvidenceWeights(rows = []) {
-  const weights = [];
-  for (const row of rows) {
-    for (const set of row.sets ?? []) {
-      const weight = Number(set.weight);
-      if (Number.isFinite(weight)) weights.push(weight);
-    }
-    const topWeight = Number(row.topSet?.weight);
-    if (Number.isFinite(topWeight)) weights.push(topWeight);
-    const previousTopWeight = Number(row.comparedToPreviousSession?.previousTopSet?.weight);
-    if (Number.isFinite(previousTopWeight)) weights.push(previousTopWeight);
-  }
-  return weights;
-}
-function askToolEvidenceSetPairs(rows = []) {
-  const pairs = [];
-  for (const row of rows) {
-    for (const set of row.sets ?? []) {
-      const weight = Number(set.weight);
-      const reps = Number(set.reps);
-      if (Number.isFinite(weight) && Number.isFinite(reps)) pairs.push({ weight, reps });
-    }
-    const topWeight = Number(row.topSet?.weight);
-    const topReps = Number(row.topSet?.reps);
-    if (Number.isFinite(topWeight) && Number.isFinite(topReps)) pairs.push({ weight: topWeight, reps: topReps });
-    const previousTopWeight = Number(row.comparedToPreviousSession?.previousTopSet?.weight);
-    const previousTopReps = Number(row.comparedToPreviousSession?.previousTopSet?.reps);
-    if (Number.isFinite(previousTopWeight) && Number.isFinite(previousTopReps)) {
-      pairs.push({ weight: previousTopWeight, reps: previousTopReps });
-    }
-  }
-  return pairs;
-}
-function toolEvidenceSupportsWeightClaim(claim, rows) {
-  if (weightClaimSupported(claim, askToolEvidenceWeights(rows))) return true;
-  return false;
-}
-function toolEvidenceSupportsWeightedSetClaim(claim, rows) {
-  if (askToolEvidenceSetPairs(rows).some((pair) => (
-    Math.abs(pair.weight - claim.weight) < 0.01 && pair.reps === claim.reps
-  ))) {
-    return true;
-  }
-  return false;
-}
 function compareToolEvidenceRecency(lhs, rhs) {
   const lhsDaysAgo = Number(lhs?.daysAgo);
   const rhsDaysAgo = Number(rhs?.daysAgo);
@@ -1890,14 +1588,6 @@ function newestToolEvidenceRow(rows = [], predicate = () => true) {
     .sort(compareToolEvidenceRecency)[0] ?? null;
 }
-function latestComparableToolRow(rows = []) {
-  return newestToolEvidenceRow(rows, (row) => row.comparedToPreviousSession?.loadDirection) ?? null;
-}
-function isWithinWeightedSetClaim(claim, weightedSetClaims) {
-  return weightedSetClaims.some((setClaim) => claim.index >= setClaim.index && claim.index < setClaim.end);
-}
 function rowIsStaleForEval(row, testCase) {
   const daysAgo = Number(row?.daysAgo);
   const cutoff = Number(testCase.staleness?.maxRecentDays ?? row?.recencyCutoffDays);
@@ -1952,80 +1642,22 @@ function evaluateAskToolProvenance(output, context, testCase, snapshot) {
   const routedMetadata = context?.routedMetadata ?? {};
   const toolsUsed = new Set(routedMetadata.toolsUsed ?? []);
-  const { toolResults, replayFailures } = routedToolResultsForEval(snapshot, context);
+  const { toolResults } = routedToolResultsForEval(snapshot, context);
   const evidenceRows = askToolEvidenceRows(toolResults);
-  const mentionedExercises = findMentionedExercises(output, snapshot);
-  const unroutedMentionNames = new Set();
-  const failures = [...replayFailures];
+  const failures = [];
   for (const toolName of uniqueStrings(testCase.requiredTools)) {
     if (!toolsUsed.has(toolName)) {
       failures.push(`Expected routed Ask Coach context to use ${toolName}.`);
     }
   }
-  if (/\b(?:estimated\s+)?(?:e1rm|1rm|one[- ]rep max)\b/i.test(output) && !toolsUsed.has('get_records')) {
-    failures.push('Ask answer mentions e1RM/1RM, but routed context did not use get_records.');
-  }
-  const weightedSetClaims = extractAskWeightedSetClaims(output);
-  for (const claim of weightedSetClaims) {
-    if (isEstimatedOneRepMaxWeightClaim(output, claim)) continue;
-    const referencedExercise = findNearestMentionedExercise(mentionedExercises, claim.index);
-    if (!referencedExercise) continue;
-    const rows = evidenceRows.filter((row) => row.normalizedName === referencedExercise.normalizedName);
-    if (rows.length === 0) {
-      unroutedMentionNames.add(referencedExercise.normalizedName);
-      failures.push(`Ask answer asserts ${claim.text} for ${referencedExercise.name}, but ${referencedExercise.name} was not present in routed tool outputs.`);
-      continue;
-    }
-    if (!toolEvidenceSupportsWeightedSetClaim(claim, rows)) {
-      failures.push(`Ask answer asserts ${claim.text} for ${referencedExercise.name}, but routed tool outputs for ${referencedExercise.name} did not include that weight/reps pair.`);
-    }
-  }
-  for (const claim of extractAskWeightClaims(output)) {
-    if (isWithinWeightedSetClaim(claim, weightedSetClaims)) continue;
-    if (isEstimatedOneRepMaxWeightClaim(output, claim)) continue;
-    if (isVolumeWeightClaim(output, claim)) continue;
-    if (isBodyWeightClaim(output, claim)) continue;
-    const referencedExercise = findNearestMentionedExercise(mentionedExercises, claim.index);
-    if (!referencedExercise) continue;
-    const rows = evidenceRows.filter((row) => row.normalizedName === referencedExercise.normalizedName);
-    if (rows.length === 0) {
-      unroutedMentionNames.add(referencedExercise.normalizedName);
-      failures.push(`Ask answer asserts ${claim.text} for ${referencedExercise.name}, but ${referencedExercise.name} was not present in routed tool outputs.`);
-      continue;
-    }
-    if (!toolEvidenceSupportsWeightClaim(claim, rows)) {
-      failures.push(`Ask answer asserts ${claim.text} for ${referencedExercise.name}, but routed tool outputs for ${referencedExercise.name} did not include that weight.`);
-    }
-  }
+  const { provenanceFailures } = getAskVerifierResult(output, context, snapshot, testCase);
+  failures.push(...provenanceFailures.map((failure) => failure.reason));
   const exerciseNames = evidenceRows.map((row) => row.exerciseName);
-  for (const mention of mentionedExercises) {
+  for (const mention of findAskAnswerExerciseMentions(output, snapshot)) {
     const rows = evidenceRows.filter((row) => row.normalizedName === mention.normalizedName);
-    if (rows.length === 0) {
-      if (toolResults.length > 0 && !unroutedMentionNames.has(mention.normalizedName)) {
-        unroutedMentionNames.add(mention.normalizedName);
-        failures.push(`Ask answer mentions ${mention.name}, but ${mention.name} was not present in routed tool outputs.`);
-      }
-      continue;
-    }
-    const comparable = latestComparableToolRow(rows);
-    if (comparable) {
-      const direction = comparable.comparedToPreviousSession.loadDirection;
-      const previous = comparable.comparedToPreviousSession.previousTopSet;
-      const windows = directionEvaluationWindows(output, mention.name, exerciseNames);
-      if (direction === 'up' && windows.some(hasUnqualifiedDeclineLanguage)) {
-        failures.push(`Ask answer frames ${mention.name} as declining/drop-off, but routed ${comparable.toolName} evidence says top load increased from ${previous.weight} kg to ${comparable.topSet.weight} kg.`);
-      }
-      if (direction === 'down' && windows.some(hasUnqualifiedImprovementLanguage)) {
-        failures.push(`Ask answer frames ${mention.name} as improving, but routed ${comparable.toolName} evidence says top load decreased from ${previous.weight} kg to ${comparable.topSet.weight} kg.`);
-      }
-      if (direction === 'flat' && windows.some((window) => hasUnqualifiedDeclineLanguage(window) || hasUnqualifiedImprovementLanguage(window))) {
-        failures.push(`Ask answer invents a load direction for ${mention.name}, but routed ${comparable.toolName} evidence says top load was flat at ${comparable.topSet.weight} kg.`);
-      }
-    }
+    if (rows.length === 0) continue;
     const latestDatedRow = newestToolEvidenceRow(rows, (row) => row.daysAgo != null);
     if (latestDatedRow && rowIsStaleForEval(latestDatedRow, testCase)) {
@@ -2665,18 +2297,18 @@ export function evaluateSummaryOutputFromSnapshot(testCase, snapshot, output) {
   const checks = [
     evaluateNoInsight(visibleOutput, testCase),
-    evaluateShape(visibleOutput, testCase),
+    evaluateShape(visibleOutput, testCase, context),
     evaluateRequiredMentions(visibleOutput, testCase),
     evaluateAnyOfMentions(visibleOutput, testCase),
     evaluateForbiddenPhrases(visibleOutput, testCase),
     evaluateForbiddenMentions(visibleOutput, testCase),
     evaluateExerciseMentions(visibleOutput, snapshot, context, testCase.surface, testCase),
     evaluateWorkoutClaims(visibleOutput, context, testCase),
-    evaluateAskClaims(visibleOutput, snapshot, testCase),
+    evaluateAskClaims(visibleOutput, snapshot, testCase, context),
     evaluateAskDirectionalConsistency(visibleOutput, snapshot, testCase),
     evaluateAskScoreVoice(visibleOutput, testCase),
     evaluateAskSelfReference(visibleOutput, testCase),
-    evaluateAskVolunteeredScore(visibleOutput, testCase),
+    evaluateAskVolunteeredScore(visibleOutput, testCase, context),
     evaluateAskStaleness(visibleOutput, snapshot, testCase),
     evaluateAskToolProvenance(visibleOutput, context, testCase, snapshot),
     evaluateFormulaVersion(visibleOutput, snapshot, testCase),