npm - incremnt - Versions diffs - 0.7.0 → 0.7.2 - Mend

incremnt 0.7.0 → 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

package/src/summary-evals.js CHANGED Viewed

@@ -6,6 +6,7 @@ import {
   askRoutedContext,
   checkpointContext,
   cycleSummaryContext,
+  executeCoachReadTool,
   normalizeExerciseName,
   workoutSummaryContext,
   vitalsSummaryContext
@@ -86,8 +87,9 @@ export function buildSummaryEvalContext(snapshot, testCase) {
       return vitalsSummaryContext(snapshot, { exclude: new Set(testCase.exclude ?? []) });
     case 'ask': {
       const question = testCase.context?.question ?? testCase.question ?? '';
+      const today = testCase.context?.today ?? testCase.today ?? null;
       const routed = question
-        ? askRoutedContext(snapshot, question, { exclude: new Set(testCase.exclude ?? []) })
+        ? askRoutedContext(snapshot, question, { exclude: new Set(testCase.exclude ?? []), today: today ?? new Date() })
         : null;
       return {
         ...(testCase.context ?? {}),
@@ -305,7 +307,6 @@ const historicalExerciseModifiers = new Set([
   'leg',
   'weighted',
   'romanian',
-  'hack',
   'full',
   'grip'
 ]);
@@ -358,6 +359,18 @@ function collectAllowedExerciseNames(surface, context) {
     for (const sc of context.planComparison?.setsComparison ?? []) {
       if (sc.exercise) names.add(sc.exercise);
     }
+    // Planned-but-skipped and unplanned-but-added exercises are legitimate
+    // planned-vs-actual subjects: the model is handed "skipped X, added Y" in
+    // its context, so a correct "you skipped X" note must not be flagged as an
+    // unauthorized mention. `setsComparison` only carries *performed* planned
+    // exercises (queries.js builds it with a performedNames filter), so skipped
+    // lifts would otherwise never be authorized.
+    for (const exerciseName of context.planComparison?.skipped ?? []) {
+      if (exerciseName) names.add(exerciseName);
+    }
+    for (const exerciseName of context.planComparison?.added ?? []) {
+      if (exerciseName) names.add(exerciseName);
+    }
   }
   if (surface === 'cycle' && context && typeof context === 'object') {
@@ -399,6 +412,49 @@ function collectAllowedExerciseNames(surface, context) {
   return [...names];
 }
+// Derives the exercise allow-set from the context object the model was actually
+// handed, rather than from a hand-maintained list of fields. The context is
+// JSON-serialized and normalized, and every name in `knownNames` whose
+// normalized form appears in that text as a whole whitespace-delimited run is
+// treated as something the model could legitimately reference. Because the
+// whole serialized context is scanned, a newly added context field cannot leave
+// the allow-set behind (the failure mode that flagged a correct
+// "you skipped Hack Squat" note and a lift echoed from a free-text note).
+//
+// context    - object given to the model; anything that is not an object yields [].
+// knownNames - iterable of exercise names to look for (the snapshot vocabulary).
+// Returns the matched names, de-duplicated through uniqueStrings.
+function collectContextExerciseNames(context, knownNames) {
+  if (!context || typeof context !== 'object') return [];
+  let contextJson;
+  try {
+    contextJson = JSON.stringify(context);
+  } catch {
+    // JSON.stringify throws on circular structures and BigInt values; an
+    // unserializable context simply contributes no names.
+    return [];
+  }
+  const haystack = normalizeExerciseName(contextJson);
+  if (!haystack) return [];
+  const hits = [];
+  for (const name of knownNames) {
+    const normalized = normalizeExerciseName(name);
+    if (!normalized) continue;
+    // Lookarounds on \S make the match a whole token run, so a name never
+    // matches inside a longer word.
+    const wholeRun = new RegExp(`(?<!\\S)${escapeRegex(normalized)}(?!\\S)`, 'g');
+    for (const found of haystack.matchAll(wholeRun)) {
+      const start = found.index;
+      hits.push({ name, normalized, start, end: (start ?? 0) + normalized.length });
+    }
+  }
+  // A shorter name fully covered by a longer hit over the same text (e.g. a
+  // one-word lift inside a two-word lift) is not a separate occurrence.
+  const isCoveredByLongerHit = (hit) => hits.some((other) =>
+    other !== hit &&
+    other.normalized.length > hit.normalized.length &&
+    other.start <= hit.start &&
+    other.end >= hit.end);
+  const survivingNames = [];
+  for (const hit of hits) {
+    if (!isCoveredByLongerHit(hit)) survivingNames.push(hit.name);
+  }
+  return uniqueStrings(survivingNames);
+}
 function historicalExerciseVariants(name) {
   const normalized = normalizeExerciseName(name);
   if (!normalized) return [];
@@ -431,9 +487,15 @@ function evaluateExerciseMentions(output, snapshot, context, surface, testCase)
   const outputText = surface === 'scoreCommentary' ? scoreCommentaryText(output) : output;
   const isStored = testCase.source === 'stored';
+  const allNames = collectAllExerciseNames(snapshot);
+  // Union the hand-built field list with a projection of the actual context, so
+  // this can only ever *reduce* false positives. testCase.allowedExerciseMentions
+  // stays as an explicit override for cases that need it.
   const allowed = new Set();
   for (const name of [
     ...collectAllowedExerciseNames(surface, context),
+    ...collectContextExerciseNames(context, allNames),
     ...(testCase.allowedExerciseMentions ?? [])
   ]) {
     const variants = isStored ? historicalExerciseVariants(name) : [normalizeExerciseName(name)];
@@ -441,7 +503,6 @@ function evaluateExerciseMentions(output, snapshot, context, surface, testCase)
       allowed.add(variant);
     }
   }
-  const allNames = collectAllExerciseNames(snapshot);
   const normalizedOutput = normalizeExerciseName(outputText);
   const mentions = [];
@@ -463,8 +524,14 @@ function evaluateExerciseMentions(output, snapshot, context, surface, testCase)
   const unauthorized = mentions
     .filter((mention) => !mention.allowed)
+    // Collapse a shorter mention into any longer mention that spans the same
+    // text — regardless of whether the covering mention is itself allowed.
+    // "Squat" matches inside "Hack Squat" (whitespace word boundary), so without
+    // this an unauthorized "Hack Squat" was double-counted as both "Hack Squat"
+    // and "Squat". The covering mention carries the real verdict; the substring
+    // is never a distinct mention.
     .filter((mention) => !mentions.some((candidate) =>
-      candidate.allowed &&
+      candidate !== mention &&
       candidate.normalizedName.length > mention.normalizedName.length &&
       candidate.start <= mention.start &&
       candidate.end >= mention.end
@@ -1108,6 +1175,21 @@ function extractAskWeightClaims(text) {
   return claims;
 }
+// Finds "<weight> kg x <reps>" style claims in free text. The weight may carry a
+// decimal part, the unit may be "kg", "kilogram" or "kilograms", and the
+// separator may be "x", "×" or "for"; matching is case-insensitive.
+//
+// text - string to scan.
+// Returns one record per match: the matched text, numeric weight and reps, the
+// match start index, and the exclusive end index.
+function extractAskWeightedSetClaims(text) {
+  const weightedSetPattern = /\b(\d+(?:\.\d+)?)\s*(?:kg|kilograms?)\s*(?:x|×|for)\s*(\d+)\b/gi;
+  return Array.from(text.matchAll(weightedSetPattern), (found) => {
+    const [phrase, weight, reps] = found;
+    const index = found.index ?? -1;
+    return {
+      text: phrase,
+      weight: Number(weight),
+      reps: Number(reps),
+      index,
+      end: index + phrase.length
+    };
+  });
+}
 function allowedWeightsForExercise(snapshot, normalizedExerciseName) {
   const weights = [];
   for (const session of snapshot?.sessions ?? []) {
@@ -1149,6 +1231,184 @@ function isVolumeWeightClaim(text, claim) {
   return /\bvolume\b/i.test(window);
 }
+// Builds one row per (session, exercise) describing that exercise's top working
+// set. Sessions are visited in the order produced by stableSortByDateDesc, keyed
+// on completedAt with date as fallback. Only sets that are complete and not
+// warm-ups count; exercises without such a set produce no row. The top set is
+// the heaviest one, with ties broken by the higher rep count.
+//
+// snapshot - object with an optional `sessions` array.
+// Returns rows of { sessionId, date (first 10 chars of the session timestamp),
+// exerciseName, normalizedName, weight, reps }.
+function askWorkingTopSetRows(snapshot) {
+  const sessionTimestamp = (session) => session.completedAt ?? session.date;
+  const heaviestFirst = (a, b) => b.weight - a.weight || b.reps - a.reps;
+  const rows = [];
+  for (const session of stableSortByDateDesc(snapshot?.sessions ?? [], sessionTimestamp)) {
+    const date = String(sessionTimestamp(session) ?? '').slice(0, 10);
+    for (const exercise of session.exercises ?? []) {
+      const working = [];
+      for (const set of exercise.sets ?? []) {
+        if (!set?.isComplete || set?.isWarmup) continue;
+        // Non-numeric or missing values are counted as 0.
+        working.push({ weight: Number(set.weight) || 0, reps: Number(set.reps) || 0 });
+      }
+      if (working.length === 0) continue;
+      const [topSet] = working.sort(heaviestFirst);
+      rows.push({
+        sessionId: session.id ?? null,
+        date,
+        exerciseName: exercise.name,
+        normalizedName: normalizeExerciseName(exercise.name),
+        ...topSet
+      });
+    }
+  }
+  return rows;
+}
+// Whole-day distance between `date` and the eval's pinned "today"
+// (testCase.context.today, falling back to testCase.today). Both values are cut
+// to their first 10 characters and parsed as UTC midnight, so time-of-day is
+// ignored.
+//
+// Returns null when no "today" is configured or either value does not parse;
+// otherwise a non-negative integer (dates after "today" clamp to 0).
+function daysAgoForEval(date, testCase) {
+  const today = testCase.context?.today ?? testCase.today;
+  if (!today) return null;
+  const MS_PER_DAY = 24 * 60 * 60 * 1000;
+  const utcMidnightMs = (value) => Date.parse(`${String(value).slice(0, 10)}T00:00:00.000Z`);
+  const thenMs = utcMidnightMs(date ?? '');
+  const nowMs = utcMidnightMs(today);
+  if (!Number.isFinite(thenMs) || !Number.isFinite(nowMs)) return null;
+  const elapsedDays = Math.round((nowMs - thenMs) / MS_PER_DAY);
+  return Math.max(0, elapsedDays);
+}
+// True when the (normalized) window contains decline wording — drop, decline,
+// regress, fell, lower, worse, ... — that is not excused by one of three
+// qualifiers: a negation shortly before it ("not", "without", "rather than",
+// ...), or the word "rep"/"reps" within 20 characters on either side (the
+// wording is then about reps, not load).
+function hasUnqualifiedDeclineLanguage(window) {
+  const text = normalizeText(window);
+  const declineWording = /\b(drop(?:ped|ping|s)?(?: off)?|drop-off|declin(?:e|ed|ing)|regress(?:ed|ion|ing)?|fell|fall(?:ing)?|decreas(?:e|ed|ing)|lower|worse|slid|slipped)\b/i;
+  const qualifiers = [
+    // Negated decline: "no ... drop", "rather than ... regression".
+    /\b(?:no|not|isn'?t|wasn'?t|without|rather than)\b.{0,45}\b(drop(?:ped|ping|s)?(?: off)?|drop-off|declin(?:e|ed|ing)?|decreas(?:e|ed|ing)?|regress(?:ed|ion|ing)?|fall(?:ing)?|fell|lower|worse|slid|slipped)\b/i,
+    // "reps ... dropped".
+    /\b(?:rep|reps)\b.{0,20}\b(drop(?:ped|ping|s)?(?: off)?|drop-off|slip(?:ped|ping)?|fell|fall(?:ing)?|lower|declin(?:e|ed|ing)?|decreas(?:e|ed|ing)?|worse)\b/i,
+    // "dropped ... reps".
+    /\b(drop(?:ped|ping|s)?(?: off)?|drop-off|slip(?:ped|ping)?|fell|fall(?:ing)?|lower|declin(?:e|ed|ing)?|decreas(?:e|ed|ing)?|worse)\b.{0,20}\b(?:rep|reps)\b/i
+  ];
+  return declineWording.test(text) && !qualifiers.some((qualifier) => qualifier.test(text));
+}
+// True when the (normalized) window contains improvement wording — improve,
+// progress, stronger, increase, "went up", "load jump", ... — that is not
+// excused by one of three qualifiers: a negation shortly before it, or the word
+// "rep"/"reps" within 20 characters on either side (the wording is then about
+// reps, not load).
+function hasUnqualifiedImprovementLanguage(window) {
+  const text = normalizeText(window);
+  const improvementWording = /\b(improv(?:e|ed|ing|ement)|progress(?:ed|ing)?|stronger|increas(?:e|ed|ing)|moving up|went up|up from|load jump|jumped)\b/i;
+  const qualifiers = [
+    // Negated improvement: "not ... stronger", "without ... progress".
+    /\b(?:no|not|isn'?t|wasn'?t|without|rather than)\b.{0,35}\b(improv(?:e|ed|ing|ement)?|progress(?:ed|ing)?|stronger|increas(?:e|ed|ing)?|moving up|went up|up from|load jump|jump(?:ed|ing)?)\b/i,
+    // "reps ... improved".
+    /\b(?:rep|reps)\b.{0,20}\b(improv(?:e|ed|ing|ement)?|increas(?:e|ed|ing)?|better|moving up|went up|up from|load jump|jump(?:ed|ing)?)\b/i,
+    // "improved ... reps".
+    /\b(improv(?:e|ed|ing|ement)?|increas(?:e|ed|ing)?|better|moving up|went up|up from|load jump|jump(?:ed|ing)?)\b.{0,20}\b(?:rep|reps)\b/i
+  ];
+  return improvementWording.test(text) && !qualifiers.some((qualifier) => qualifier.test(text));
+}
+// True when a sentence opens with a word or phrase that refers back to the
+// previous sentence's subject ("that", "this", "it", "there", "still",
+// "the latest", "the top", "top set", "same load"), matched case-insensitively
+// at the very start of the string.
+function isReferentialDirectionContinuation(sentence) {
+  const referentialOpener = /^(?:that|this|it|there|still|the\s+(?:latest|top)|top\s+set|same\s+load)\b/i;
+  return referentialOpener.test(sentence);
+}
+// Selects the sentences of `outputText` in which direction wording should be
+// attributed to `exerciseName`.
+//
+// The text is split into sentences after ".", "!" or "?". Each sentence whose
+// normalized form contains the normalized exercise name is kept, together with
+// the following sentences for as long as they open referentially (see
+// isReferentialDirectionContinuation) and do not mention any other name from
+// `exerciseNames`.
+//
+// Returns all sentences when the exercise name normalizes to nothing, the
+// de-duplicated selected sentences when there are any, and otherwise the whole
+// text as a single window.
+function directionEvaluationWindows(outputText, exerciseName, exerciseNames = []) {
+  const target = normalizeExerciseName(exerciseName);
+  const rivals = [...new Set(exerciseNames.map(normalizeExerciseName))]
+    .filter((name) => name && name !== target);
+  const sentences = outputText
+    .split(/(?<=[.!?])\s+/)
+    .map((sentence) => sentence.trim())
+    .filter(Boolean);
+  if (!target) return sentences;
+  // A Set keeps first-seen order and drops repeats.
+  const selected = new Set();
+  sentences.forEach((sentence, position) => {
+    if (!normalizeExerciseName(sentence).includes(target)) return;
+    selected.add(sentence);
+    for (const follower of sentences.slice(position + 1)) {
+      const normalizedFollower = normalizeExerciseName(follower);
+      const namesRival = rivals.some((name) => normalizedFollower.includes(name));
+      if (namesRival || !isReferentialDirectionContinuation(follower)) break;
+      selected.add(follower);
+    }
+  });
+  return selected.size > 0 ? [...selected] : [outputText];
+}
+// Eval check `ask_directional_consistency`: for each exercise listed in
+// testCase.directionalConsistency, compares the top working-set load of its two
+// most recent snapshot rows and fails when the answer's wording contradicts
+// that direction (decline wording on an increase, improvement wording on a
+// decrease, either on a flat load). An expectation whose configured
+// `loadDirection` disagrees with the snapshot is itself reported as a failure.
+// Expectations with fewer than two history rows are skipped.
+//
+// Returns { key, passed, reason }; it passes trivially for non-ask surfaces and
+// when no expectations are configured.
+function evaluateAskDirectionalConsistency(output, snapshot, testCase) {
+  const key = 'ask_directional_consistency';
+  if (testCase.surface !== 'ask') {
+    return { key, passed: true, reason: 'Not an ask answer.' };
+  }
+  const expectations = Array.isArray(testCase.directionalConsistency)
+    ? testCase.directionalConsistency
+    : [];
+  if (expectations.length === 0) {
+    return { key, passed: true, reason: 'No directional assertions configured.' };
+  }
+  const topSetRows = askWorkingTopSetRows(snapshot);
+  const answerText = normalizeText(output);
+  const failures = [];
+  for (const expectation of expectations) {
+    const subject = expectation.exercise ?? expectation.exerciseName;
+    const subjectKey = normalizeExerciseName(subject);
+    // Rows arrive newest-first, so the first two are latest and previous.
+    const [latest, previous] = topSetRows.filter((row) => row.normalizedName === subjectKey);
+    if (previous === undefined) continue;
+    const loadDelta = latest.weight - previous.weight;
+    let actualDirection = 'flat';
+    if (loadDelta > 0) actualDirection = 'up';
+    else if (loadDelta < 0) actualDirection = 'down';
+    const expectedDirection = expectation.loadDirection ?? actualDirection;
+    if (expectedDirection !== actualDirection) {
+      failures.push(`Configured expected direction for ${latest.exerciseName} is ${expectedDirection}, but snapshot top-load direction is ${actualDirection}.`);
+      continue;
+    }
+    const windows = directionEvaluationWindows(
+      answerText,
+      subject,
+      topSetRows.map((row) => row.exerciseName)
+    );
+    switch (actualDirection) {
+      case 'up':
+        if (windows.some(hasUnqualifiedDeclineLanguage)) {
+          failures.push(`Ask answer frames ${latest.exerciseName} as declining/drop-off even though top load increased from ${previous.weight} kg to ${latest.weight} kg.`);
+        }
+        break;
+      case 'down':
+        if (windows.some(hasUnqualifiedImprovementLanguage)) {
+          failures.push(`Ask answer frames ${latest.exerciseName} as improving even though top load decreased from ${previous.weight} kg to ${latest.weight} kg.`);
+        }
+        break;
+      default:
+        if (windows.some((window) => hasUnqualifiedDeclineLanguage(window) || hasUnqualifiedImprovementLanguage(window))) {
+          failures.push(`Ask answer invents a load direction for ${latest.exerciseName}, but top load was flat at ${latest.weight} kg.`);
+        }
+    }
+  }
+  const passed = failures.length === 0;
+  return {
+    key,
+    passed,
+    reason: passed
+      ? 'Ask answer does not invert configured load directions.'
+      : failures.join(' ')
+  };
+}
+// Picks the sessions the staleness check should look at. The exercise comes
+// from testCase.staleness (exercise, then exerciseName), falling back to the
+// first directionalConsistency entry. Without a configured exercise every
+// snapshot session is relevant; otherwise only sessions containing an exercise
+// whose normalized name equals the configured one.
+function relevantSessionsForStaleness(snapshot, testCase) {
+  const staleness = testCase.staleness;
+  const firstDirectional = testCase.directionalConsistency?.[0];
+  const configuredExercise = staleness?.exercise
+    ?? staleness?.exerciseName
+    ?? firstDirectional?.exercise
+    ?? firstDirectional?.exerciseName
+    ?? null;
+  const sessions = snapshot?.sessions ?? [];
+  if (!configuredExercise) return sessions;
+  const wanted = normalizeExerciseName(configuredExercise);
+  const includesWanted = (session) => {
+    for (const exercise of session.exercises ?? []) {
+      if (normalizeExerciseName(exercise.name) === wanted) return true;
+    }
+    return false;
+  };
+  return sessions.filter(includesWanted);
+}
+// Eval check `ask_staleness`: when the latest relevant session is older than
+// testCase.staleness.maxRecentDays, the answer must not call it "recent"
+// unless it also negates that ("not recent") or states the exact age
+// ("<n> days ago").
+//
+// Returns { key, passed, reason }. Passes trivially for non-ask surfaces, when
+// no cutoff is configured, when no age can be computed (no pinned "today" or no
+// parseable session date), and when the session is inside the window.
+function evaluateAskStaleness(output, snapshot, testCase) {
+  const key = 'ask_staleness';
+  if (testCase.surface !== 'ask') {
+    return { key, passed: true, reason: 'Not an ask answer.' };
+  }
+  const maxRecentDays = testCase.staleness?.maxRecentDays;
+  // Number(null) and Number('') are both 0, which would silently turn an
+  // explicitly empty setting into a zero-day cutoff; treat them as absent.
+  const cutoffDays = maxRecentDays == null || maxRecentDays === '' ? NaN : Number(maxRecentDays);
+  if (!Number.isFinite(cutoffDays)) {
+    return { key, passed: true, reason: 'No staleness assertion configured.' };
+  }
+  const latestSession = stableSortByDateDesc(
+    relevantSessionsForStaleness(snapshot, testCase),
+    (session) => session.completedAt ?? session.date
+  )[0] ?? null;
+  const daysAgo = daysAgoForEval(latestSession?.completedAt ?? latestSession?.date, testCase);
+  if (daysAgo == null || daysAgo <= cutoffDays) {
+    return { key, passed: true, reason: 'Latest session is inside the configured recency window.' };
+  }
+  // Same "recent" / "not recent" / "<n> days ago" wording rules as the
+  // tool-provenance check, so the two checks cannot drift apart.
+  const passed = !outputCallsStaleEvidenceRecent(output, { daysAgo });
+  return {
+    key,
+    passed,
+    reason: passed
+      ? 'Ask answer does not present stale sessions as simply recent.'
+      : `Ask answer calls a ${daysAgo}-day-old session recent without the days-ago label.`
+  };
+}
 function evaluateAskClaims(output, snapshot, testCase) {
   if (testCase.surface !== 'ask') {
     return { key: 'ask_claims', passed: true, reason: 'Not an ask answer.' };
@@ -1234,14 +1494,195 @@ function evaluateAskClaims(output, snapshot, testCase) {
   };
 }
-function evaluateAskToolProvenance(output, context, testCase) {
+// Re-runs every tool named in context.routedMetadata.toolsUsed (de-duplicated)
+// against the snapshot via executeCoachReadTool, passing the parameters
+// recorded under routedMetadata.toolParams[toolName] (or {} when none).
+//
+// Returns { toolResults, replayFailures }: the results of the calls that
+// succeeded, and one message per call that threw. A throwing tool never aborts
+// the replay of the others.
+function routedToolResultsForEval(snapshot, context) {
+  const metadata = context?.routedMetadata ?? {};
+  const paramsByTool = metadata.toolParams ?? {};
+  const toolResults = [];
+  const replayFailures = [];
+  for (const toolName of uniqueStrings(metadata.toolsUsed ?? [])) {
+    const params = paramsByTool[toolName] ?? {};
+    try {
+      const result = executeCoachReadTool(snapshot, toolName, params);
+      toolResults.push(result);
+    } catch (error) {
+      const detail = error?.message ?? String(error);
+      replayFailures.push(`Could not replay routed tool ${toolName}: ${detail}`);
+    }
+  }
+  return { toolResults, replayFailures };
+}
+// Appends one normalized evidence row to `rows` for a tool-output row.
+//
+// rows      - array to append to (mutated).
+// toolName  - name of the tool that produced the row.
+// row       - tool-output row; its exercise is read from `exerciseName`, then `name`.
+// inherited - values (exerciseName, date, daysAgo, recencyLabel, isStale,
+//             recencyCutoffDays) used when the row itself lacks them.
+//
+// Nothing is appended when the exercise name normalizes to an empty value.
+// Missing fields are stored as null, except isStale (false), warmupSetCount (0)
+// and sets ([] unless the row carries an array).
+function addAskToolEvidenceRow(rows, toolName, row, inherited = {}) {
+  const exerciseName = row?.exerciseName ?? row?.name ?? inherited.exerciseName ?? null;
+  const normalizedName = normalizeExerciseName(exerciseName);
+  if (!normalizedName) return;
+  const ownOrInherited = (field, fallback) => row?.[field] ?? inherited[field] ?? fallback;
+  const ownOnly = (field, fallback) => row?.[field] ?? fallback;
+  rows.push({
+    toolName,
+    exerciseName,
+    normalizedName,
+    date: ownOrInherited('date', null),
+    daysAgo: ownOrInherited('daysAgo', null),
+    recencyLabel: ownOrInherited('recencyLabel', null),
+    isStale: ownOrInherited('isStale', false),
+    recencyCutoffDays: ownOrInherited('recencyCutoffDays', null),
+    warmupSetCount: ownOnly('warmupSetCount', 0),
+    workingSetCount: ownOnly('workingSetCount', null),
+    topSet: ownOnly('topSet', null),
+    comparedToPreviousSession: ownOnly('comparedToPreviousSession', null),
+    sets: Array.isArray(row?.sets) ? row.sets : []
+  });
+}
+// Flattens replayed tool results into per-exercise evidence rows. A result row
+// that carries an `exercises` array contributes one evidence row per exercise,
+// each inheriting the parent row's date and recency fields; any other result
+// row is itself treated as the exercise row.
+function askToolEvidenceRows(toolResults = []) {
+  const evidence = [];
+  for (const toolResult of toolResults) {
+    for (const row of toolResult?.rows ?? []) {
+      if (!Array.isArray(row?.exercises)) {
+        addAskToolEvidenceRow(evidence, toolResult.toolName, row);
+        continue;
+      }
+      const { date, daysAgo, recencyLabel, isStale, recencyCutoffDays } = row;
+      const sessionFields = { date, daysAgo, recencyLabel, isStale, recencyCutoffDays };
+      for (const exercise of row.exercises) {
+        addAskToolEvidenceRow(evidence, toolResult.toolName, exercise, sessionFields);
+      }
+    }
+  }
+  return evidence;
+}
+// Collects every weight the evidence rows can vouch for: each set's weight,
+// then the row's top-set weight, then the previous session's top-set weight.
+// Values are coerced with Number() and kept only when finite; duplicates are
+// kept.
+function askToolEvidenceWeights(rows = []) {
+  const weights = [];
+  for (const row of rows) {
+    const candidates = [
+      ...Array.from(row.sets ?? [], (set) => set.weight),
+      row.topSet?.weight,
+      row.comparedToPreviousSession?.previousTopSet?.weight
+    ];
+    for (const candidate of candidates) {
+      const weight = Number(candidate);
+      if (Number.isFinite(weight)) weights.push(weight);
+    }
+  }
+  return weights;
+}
+// Collects every { weight, reps } pair the evidence rows can vouch for: each
+// set, then the row's top set, then the previous session's top set. Both values
+// are coerced with Number() and a pair is kept only when both are finite.
+function askToolEvidenceSetPairs(rows = []) {
+  const pairs = [];
+  for (const row of rows) {
+    const previousTopSet = row.comparedToPreviousSession?.previousTopSet;
+    const candidates = [
+      ...Array.from(row.sets ?? [], (set) => [set.weight, set.reps]),
+      [row.topSet?.weight, row.topSet?.reps],
+      [previousTopSet?.weight, previousTopSet?.reps]
+    ];
+    for (const [rawWeight, rawReps] of candidates) {
+      const weight = Number(rawWeight);
+      const reps = Number(rawReps);
+      if (Number.isFinite(weight) && Number.isFinite(reps)) pairs.push({ weight, reps });
+    }
+  }
+  return pairs;
+}
+// Whether a bare weight claim is backed by any weight found in the evidence
+// rows, as judged by weightClaimSupported. Always returns a strict boolean.
+function toolEvidenceSupportsWeightClaim(claim, rows) {
+  const evidencedWeights = askToolEvidenceWeights(rows);
+  return Boolean(weightClaimSupported(claim, evidencedWeights));
+}
+// Whether a "weight x reps" claim matches some evidenced pair: the weights must
+// agree to within 0.01 and the rep counts must be exactly equal.
+function toolEvidenceSupportsWeightedSetClaim(claim, rows) {
+  const WEIGHT_TOLERANCE = 0.01;
+  for (const pair of askToolEvidenceSetPairs(rows)) {
+    const sameWeight = Math.abs(pair.weight - claim.weight) < WEIGHT_TOLERANCE;
+    if (sameWeight && pair.reps === claim.reps) return true;
+  }
+  return false;
+}
+// Sort comparator that puts the most recent evidence row first.
+//
+// Rows with a usable `daysAgo` are ordered by it (smaller = newer) and always
+// precede rows without one; rows that both lack it fall back to comparing
+// their `date` strings in descending order.
+//
+// A missing `daysAgo` is stored as null on evidence rows, and Number(null) is 0,
+// so coercing it directly would rank undated rows as "0 days ago" (newest) and
+// make the two fallbacks below unreachable for them. null, undefined and ''
+// are therefore treated as "no value".
+function compareToolEvidenceRecency(lhs, rhs) {
+  const daysAgoOf = (row) => {
+    const raw = row?.daysAgo;
+    return raw == null || raw === '' ? NaN : Number(raw);
+  };
+  const lhsDaysAgo = daysAgoOf(lhs);
+  const rhsDaysAgo = daysAgoOf(rhs);
+  const lhsDated = Number.isFinite(lhsDaysAgo);
+  const rhsDated = Number.isFinite(rhsDaysAgo);
+  if (lhsDated && rhsDated) return lhsDaysAgo - rhsDaysAgo;
+  if (lhsDated) return -1;
+  if (rhsDated) return 1;
+  return String(rhs?.date ?? '').localeCompare(String(lhs?.date ?? ''));
+}
+// Returns the most recent evidence row accepted by `predicate` (every row by
+// default), ordered by compareToolEvidenceRecency, or null when none qualifies.
+// The input array is not reordered: sorting happens on the filtered copy.
+function newestToolEvidenceRow(rows = [], predicate = () => true) {
+  const candidates = rows.filter(predicate);
+  candidates.sort(compareToolEvidenceRecency);
+  const newest = candidates[0];
+  return newest ?? null;
+}
+// Most recent evidence row that carries a load direction relative to the
+// previous session (comparedToPreviousSession.loadDirection), or null.
+function latestComparableToolRow(rows = []) {
+  const hasLoadDirection = (row) => row.comparedToPreviousSession?.loadDirection;
+  const newestComparable = newestToolEvidenceRow(rows, hasLoadDirection);
+  return newestComparable ?? null;
+}
+// True when the claim starts inside the text span [index, end) of any
+// weighted-set claim, i.e. the bare weight is just the weight half of a
+// "weight x reps" phrase that is checked separately.
+function isWithinWeightedSetClaim(claim, weightedSetClaims) {
+  for (const { index, end } of weightedSetClaims) {
+    if (index <= claim.index && claim.index < end) return true;
+  }
+  return false;
+}
+// Decides whether an evidence row counts as stale for this test case.
+//
+// The cutoff is testCase.staleness.maxRecentDays, falling back to the row's
+// own recencyCutoffDays. When both the row's age and a cutoff are usable
+// numbers the row is stale iff daysAgo > cutoff; otherwise the row's own
+// isStale flag decides.
+//
+// Evidence rows store a missing cutoff (and a missing age) as null, and
+// Number(null) is 0. Coercing directly would invent a zero-day cutoff and flag
+// every row at least one day old as stale, so null, undefined and '' are
+// treated as "no value".
+function rowIsStaleForEval(row, testCase) {
+  const toNumberOrNaN = (value) => (value == null || value === '' ? NaN : Number(value));
+  const daysAgo = toNumberOrNaN(row?.daysAgo);
+  const cutoff = toNumberOrNaN(testCase.staleness?.maxRecentDays ?? row?.recencyCutoffDays);
+  if (!Number.isFinite(daysAgo) || !Number.isFinite(cutoff)) return Boolean(row?.isStale);
+  return daysAgo > cutoff;
+}
+// True when the (normalized) text describes something as "recent"/"recently"
+// without either negating it ("not recent", "recent ... wasn't") or stating the
+// row's exact age as "<daysAgo> day(s) ago". When the row has no numeric
+// daysAgo there is no age label to look for, so any un-negated "recent" counts.
+function outputCallsStaleEvidenceRecent(outputText, row) {
+  const normalized = normalizeText(outputText);
+  if (!/\brecent(?:ly)?\b/i.test(normalized)) return false;
+  const negatedRecent = [
+    /\b(?:not|isn'?t|wasn'?t|no longer)\s+(?:a\s+)?recent\b/i,
+    /\brecent\b.{0,20}\b(?:not|isn'?t|wasn'?t)\b/i
+  ];
+  if (negatedRecent.some((pattern) => pattern.test(normalized))) return false;
+  const daysAgo = Number(row?.daysAgo);
+  if (!Number.isFinite(daysAgo)) return true;
+  const ageLabel = new RegExp(`\\b${daysAgo}\\s+days?\\s+ago\\b`, 'i');
+  return !ageLabel.test(normalized);
+}
+// Narrows the windows from directionEvaluationWindows to clause level, so a
+// "recent" said about one exercise is not attributed to another one named in
+// the same sentence.
+//
+// Each window is split into clauses on ".", ";", ":", a comma followed by
+// whitespace, and the words while / whereas / but / and. Every clause that
+// mentions the exercise starts a scoped window that absorbs the following
+// clauses up to (not including) the first one naming a different exercise from
+// `exerciseNames`. A window with no clause mentioning the exercise is kept
+// whole.
+//
+// Returns the de-duplicated scoped windows, or the unscoped windows when the
+// exercise name normalizes to nothing or nothing was collected.
+function recencyEvaluationWindows(outputText, exerciseName, exerciseNames = []) {
+  const target = normalizeExerciseName(exerciseName);
+  const rivals = [...new Set(exerciseNames.map(normalizeExerciseName))]
+    .filter((name) => name && name !== target);
+  const windows = directionEvaluationWindows(outputText, exerciseName, exerciseNames);
+  if (!target) return windows;
+  // A Set keeps first-seen order and drops repeats.
+  const scoped = new Set();
+  for (const window of windows) {
+    const clauses = window
+      .split(/\s*(?:[.;:]|,\s+|\b(?:while|whereas|but|and)\b)\s*/i)
+      .map((clause) => clause.trim())
+      .filter(Boolean);
+    let mentionsTarget = false;
+    clauses.forEach((clause, position) => {
+      if (!normalizeExerciseName(clause).includes(target)) return;
+      mentionsTarget = true;
+      const parts = [clause];
+      for (const follower of clauses.slice(position + 1)) {
+        const normalizedFollower = normalizeExerciseName(follower);
+        if (rivals.some((name) => normalizedFollower.includes(name))) break;
+        parts.push(follower);
+      }
+      scoped.add(parts.join(' '));
+    });
+    if (!mentionsTarget) scoped.add(window);
+  }
+  return scoped.size > 0 ? [...scoped] : windows;
+}
+function evaluateAskToolProvenance(output, context, testCase, snapshot) {
   if (testCase.surface !== 'ask') {
     return { key: 'ask_tool_provenance', passed: true, reason: 'Not an ask answer.' };
   }
   const routedMetadata = context?.routedMetadata ?? {};
   const toolsUsed = new Set(routedMetadata.toolsUsed ?? []);
-  const failures = [];
+  const { toolResults, replayFailures } = routedToolResultsForEval(snapshot, context);
+  const evidenceRows = askToolEvidenceRows(toolResults);
+  const mentionedExercises = findMentionedExercises(output, snapshot);
+  const unroutedMentionNames = new Set();
+  const failures = [...replayFailures];
   for (const toolName of uniqueStrings(testCase.requiredTools)) {
     if (!toolsUsed.has(toolName)) {
       failures.push(`Expected routed Ask Coach context to use ${toolName}.`);
@@ -1252,6 +1693,74 @@ function evaluateAskToolProvenance(output, context, testCase) {
     failures.push('Ask answer mentions e1RM/1RM, but routed context did not use get_records.');
   }
+  const weightedSetClaims = extractAskWeightedSetClaims(output);
+  for (const claim of weightedSetClaims) {
+    if (isEstimatedOneRepMaxWeightClaim(output, claim)) continue;
+    const referencedExercise = findNearestMentionedExercise(mentionedExercises, claim.index);
+    if (!referencedExercise) continue;
+    const rows = evidenceRows.filter((row) => row.normalizedName === referencedExercise.normalizedName);
+    if (rows.length === 0) {
+      unroutedMentionNames.add(referencedExercise.normalizedName);
+      failures.push(`Ask answer asserts ${claim.text} for ${referencedExercise.name}, but ${referencedExercise.name} was not present in routed tool outputs.`);
+      continue;
+    }
+    if (!toolEvidenceSupportsWeightedSetClaim(claim, rows)) {
+      failures.push(`Ask answer asserts ${claim.text} for ${referencedExercise.name}, but routed tool outputs for ${referencedExercise.name} did not include that weight/reps pair.`);
+    }
+  }
+  for (const claim of extractAskWeightClaims(output)) {
+    if (isWithinWeightedSetClaim(claim, weightedSetClaims)) continue;
+    if (isEstimatedOneRepMaxWeightClaim(output, claim)) continue;
+    if (isVolumeWeightClaim(output, claim)) continue;
+    const referencedExercise = findNearestMentionedExercise(mentionedExercises, claim.index);
+    if (!referencedExercise) continue;
+    const rows = evidenceRows.filter((row) => row.normalizedName === referencedExercise.normalizedName);
+    if (rows.length === 0) {
+      unroutedMentionNames.add(referencedExercise.normalizedName);
+      failures.push(`Ask answer asserts ${claim.text} for ${referencedExercise.name}, but ${referencedExercise.name} was not present in routed tool outputs.`);
+      continue;
+    }
+    if (!toolEvidenceSupportsWeightClaim(claim, rows)) {
+      failures.push(`Ask answer asserts ${claim.text} for ${referencedExercise.name}, but routed tool outputs for ${referencedExercise.name} did not include that weight.`);
+    }
+  }
+  const exerciseNames = evidenceRows.map((row) => row.exerciseName);
+  for (const mention of mentionedExercises) {
+    const rows = evidenceRows.filter((row) => row.normalizedName === mention.normalizedName);
+    if (rows.length === 0) {
+      if (toolResults.length > 0 && !unroutedMentionNames.has(mention.normalizedName)) {
+        unroutedMentionNames.add(mention.normalizedName);
+        failures.push(`Ask answer mentions ${mention.name}, but ${mention.name} was not present in routed tool outputs.`);
+      }
+      continue;
+    }
+    const comparable = latestComparableToolRow(rows);
+    if (comparable) {
+      const direction = comparable.comparedToPreviousSession.loadDirection;
+      const previous = comparable.comparedToPreviousSession.previousTopSet;
+      const windows = directionEvaluationWindows(output, mention.name, exerciseNames);
+      if (direction === 'up' && windows.some(hasUnqualifiedDeclineLanguage)) {
+        failures.push(`Ask answer frames ${mention.name} as declining/drop-off, but routed ${comparable.toolName} evidence says top load increased from ${previous.weight} kg to ${comparable.topSet.weight} kg.`);
+      }
+      if (direction === 'down' && windows.some(hasUnqualifiedImprovementLanguage)) {
+        failures.push(`Ask answer frames ${mention.name} as improving, but routed ${comparable.toolName} evidence says top load decreased from ${previous.weight} kg to ${comparable.topSet.weight} kg.`);
+      }
+      if (direction === 'flat' && windows.some((window) => hasUnqualifiedDeclineLanguage(window) || hasUnqualifiedImprovementLanguage(window))) {
+        failures.push(`Ask answer invents a load direction for ${mention.name}, but routed ${comparable.toolName} evidence says top load was flat at ${comparable.topSet.weight} kg.`);
+      }
+    }
+    const latestDatedRow = newestToolEvidenceRow(rows, (row) => row.daysAgo != null);
+    if (latestDatedRow && rowIsStaleForEval(latestDatedRow, testCase)) {
+      const windows = recencyEvaluationWindows(output, mention.name, exerciseNames);
+      if (windows.some((window) => outputCallsStaleEvidenceRecent(window, latestDatedRow))) {
+        failures.push(`Ask answer calls ${mention.name} recent, but routed tool evidence says the latest relevant session was ${latestDatedRow.daysAgo} days ago.`);
+      }
+    }
+  }
   return {
     key: 'ask_tool_provenance',
     passed: failures.length === 0,
@@ -1537,7 +2046,9 @@ export function evaluateSummaryOutputFromSnapshot(testCase, snapshot, output) {
     evaluateExerciseMentions(output, snapshot, context, testCase.surface, testCase),
     evaluateWorkoutClaims(output, context, testCase),
     evaluateAskClaims(output, snapshot, testCase),
-    evaluateAskToolProvenance(output, context, testCase),
+    evaluateAskDirectionalConsistency(output, snapshot, testCase),
+    evaluateAskStaleness(output, snapshot, testCase),
+    evaluateAskToolProvenance(output, context, testCase, snapshot),
     evaluateScoreCommentaryAction(output, context, testCase),
     evaluateScoreCommentarySynthesis(output, context, testCase),
     evaluateScoreCommentaryExerciseInvention(output, snapshot, context, testCase),
@@ -1567,12 +2078,14 @@ export async function runSummaryEvalCaseFromSnapshot(testCase, snapshot) {
   return evaluateSummaryOutputFromSnapshot(testCase, snapshot, output);
 }
-function genericForbiddenPhrasesForSurface(surface) {
+export function genericForbiddenPhrasesForSurface(surface) {
   switch (surface) {
     case 'workout':
       return ['solid progress', 'trust the process', 'keep it up', 'quality work', 'in a great place', 'continue progressive overload', 'as fatigue accumulates'];
     case 'cycle':
-      return ['solid progress', 'trust the process', 'in a great place', 'continue progressive overload', 'as fatigue accumulates', 'solid session', 'quality work'];
+      // 'solid first week' enforces the FIRST_WEEK_CYCLE_PROMPT's "do not say
+      // solid first week" rule, which was previously prompt-only (unguarded).
+      return ['solid progress', 'trust the process', 'in a great place', 'continue progressive overload', 'as fatigue accumulates', 'solid session', 'quality work', 'solid first week'];
     case 'checkpoint':
       return ['solid progress', 'quality work', 'trust the process', 'in a great place'];
     case 'vitals':