npm - incremnt - Versions diffs - 0.7.2 → 0.8.0 - Mend

incremnt 0.7.2 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24) hide show

package/README.md +57 -1
package/package.json +2 -1
package/src/ask-answer-verifier.js +857 -0
package/src/ask-coach.js +2634 -0
package/src/ask-replay.js +358 -0
package/src/auth.js +169 -15
package/src/contract.js +160 -3
package/src/format.js +24 -1
package/src/lib.js +205 -17
package/src/mcp.js +88 -24
package/src/openrouter.js +242 -19
package/src/plan-changeset.js +132 -0
package/src/program-draft.js +230 -0
package/src/prompt-changelog.js +90 -0
package/src/promptfoo-evals.js +10 -4
package/src/promptfoo-langfuse-scores.js +55 -0
package/src/queries.js +992 -987
package/src/remote.js +465 -12
package/src/score-context.js +14 -7
package/src/score-prelude.js +113 -0
package/src/service-url.js +9 -0
package/src/summary-evals.js +677 -42
package/src/sync-service.js +1259 -352
package/src/transport.js +119 -3

package/src/summary-evals.js CHANGED Viewed

@@ -3,7 +3,7 @@ import path from 'node:path';
 import { fileURLToPath } from 'node:url';
 import {
   askContext,
-  askRoutedContext,
+  canonicalExerciseName,
   checkpointContext,
   cycleSummaryContext,
   executeCoachReadTool,
@@ -11,6 +11,8 @@ import {
   workoutSummaryContext,
   vitalsSummaryContext
 } from './queries.js';
+import { askRoutedContext, buildAskStructuredResponse } from './ask-coach.js';
+import { formatIncrementScorePrelude, isScoreQuestion } from './score-prelude.js';
 import {
   AI_PROMPT_VERSIONS,
   generateAskAnswer,
@@ -20,6 +22,8 @@ import {
   generateWorkoutCoachingSummary
 } from './openrouter.js';
 import { computeScoreBand } from './score-context.js';
+import { stripXMLTagBlocks } from './prompt-security.js';
+import { extractAskProgramDraft, hasProgramDraftBlock } from './program-draft.js';
 const __filename = fileURLToPath(import.meta.url);
 const __dirname = path.dirname(__filename);
@@ -30,6 +34,14 @@ export function defaultCaseSetName() {
   return process.env.SUMMARY_EVAL_CASE_SET || 'synthetic';
 }
+function envFlag(name, env = process.env) {
+  return ['1', 'true', 'yes'].includes(String(env[name] ?? '').toLowerCase());
+}
+export function summaryEvalsLiveGenerationEnabled(env = process.env) {
+  return envFlag('SUMMARY_EVALS_LIVE', env) || envFlag('PROMPTFOO_LIVE', env);
+}
 function stableSortByDateDesc(items, selector) {
   return [...items].sort((lhs, rhs) => String(selector(rhs)).localeCompare(String(selector(lhs))));
 }
@@ -88,12 +100,21 @@ export function buildSummaryEvalContext(snapshot, testCase) {
     case 'ask': {
       const question = testCase.context?.question ?? testCase.question ?? '';
       const today = testCase.context?.today ?? testCase.today ?? null;
+      const history = Array.isArray(testCase.context?.history) ? testCase.context.history : [];
       const routed = question
-        ? askRoutedContext(snapshot, question, { exclude: new Set(testCase.exclude ?? []), today: today ?? new Date() })
+        ? askRoutedContext(snapshot, question, { exclude: new Set(testCase.exclude ?? []), history, today: today ?? new Date() })
         : null;
+      // Mirror production: the live /cli/ask path prepends the Increment Score
+      // prelude to the routed context. Including it here means a live eval feeds
+      // the model the same dump-prone material, so evaluateAskScoreVoice actually
+      // guards the prompt, not just the checker.
+      const prelude = formatIncrementScorePrelude(scoreHistoryFromSnapshot(snapshot), { question });
+      const routedContext = routed?.context ?? null;
+      const trainingData = testCase.context?.trainingData
+        ?? (prelude && routedContext ? `${prelude}\n\n${routedContext}` : (routedContext ?? prelude));
       return {
         ...(testCase.context ?? {}),
-        trainingData: testCase.context?.trainingData ?? routed?.context ?? null,
+        trainingData,
         routedMetadata: routed?.metadata ?? null
       };
     }
@@ -117,11 +138,38 @@ function summaryEvalGenerationMetadata(result) {
   );
 }
+function buildAskEvalStructuredMetadata(testCase, context, output) {
+  if (testCase.surface !== 'ask') return {};
+  const parsedAsk = extractAskProgramDraft(output, {
+    canonicalizeExerciseName: canonicalExerciseName
+  });
+  const answer = stripXMLTagBlocks(parsedAsk.answerText);
+  const question = context?.question ?? testCase.context?.question ?? testCase.question ?? '';
+  const routingMetadata = context?.routedMetadata ?? null;
+  return {
+    routingMetadata,
+    structured: buildAskStructuredResponse(answer, routingMetadata ?? {}, {
+      programDraft: askStructuredProgramDraft(parsedAsk, routingMetadata),
+      question
+    })
+  };
+}
+function summaryEvalProviderMetadata(testCase, context, output, result = null) {
+  return {
+    ...summaryEvalGenerationMetadata(result),
+    ...buildAskEvalStructuredMetadata(testCase, context, output)
+  };
+}
 export async function generateSummaryEvalOutputWithMetadata(testCase, context, snapshot = null) {
-  const liveGenerationEnabled = process.env.SUMMARY_EVALS_LIVE === '1';
+  const liveGenerationEnabled = summaryEvalsLiveGenerationEnabled();
   const apiKey = process.env.OPENROUTER_API_KEY;
   if (!liveGenerationEnabled || !apiKey || testCase.shouldPass === false) {
-    return { output: testCase.output, metadata: {} };
+    return {
+      output: testCase.output,
+      metadata: summaryEvalProviderMetadata(testCase, context, testCase.output)
+    };
   }
   let result;
@@ -151,7 +199,8 @@ export async function generateSummaryEvalOutputWithMetadata(testCase, context, s
         apiKey,
         history: context.history ?? [],
         tone: context.tone,
-        model: context.model
+        model: context.model,
+        routingMetadata: context.routedMetadata ?? undefined
       });
       break;
     }
@@ -161,7 +210,7 @@ export async function generateSummaryEvalOutputWithMetadata(testCase, context, s
   return {
     output: result.text,
-    metadata: summaryEvalGenerationMetadata(result)
+    metadata: summaryEvalProviderMetadata(testCase, context, result.text, result)
   };
 }
@@ -260,8 +309,44 @@ function isSingleParagraph(text) {
   return !normalizeText(text).includes('\n\n');
 }
-function lowerIncludes(text, snippet) {
-  return normalizeText(text).toLowerCase().includes(String(snippet).toLowerCase());
+// Canonicalizes free-form coach text and required-mention snippets to the same
+// surface form before substring matching. The goal is to keep grounding checks
+// (does the answer cite this real number?) while tolerating the formatting an
+// LLM legitimately varies: unicode × vs ASCII x, set-token unit placement,
+// signed deltas, and rep-sequence separators (8/8/7 vs "8, 8, and 7").
+// It only adds equivalences; it never strips the digits a check is grounded on,
+// so a genuinely absent number still fails.
+function normalizeForMention(value) {
+  let s = normalizeText(value).toLowerCase();
+  // Unicode multiplication / bullet / asterisk between digits -> ASCII x.
+  s = s.replace(/(\d)\s*[×✕╳·∗*]\s*(\d)/g, '$1x$2');
+  // Drop weight units that sit inside set tokens: "80kg x 7" / "40 kg" -> "80 x 7" / "40".
+  s = s.replace(/(\d(?:\.\d+)?)\s*(?:kgs?|lbs?|pounds)\b/g, '$1');
+  // Collapse spaces around an x that joins two numbers: "80 x 7" -> "80x7".
+  s = s.replace(/(\d)\s*x\s*(?=\d)/g, '$1x');
+  // Unify rep-sequence separators: "8/8/7" and "8, 8, and 7" -> "8,8,7".
+  // Lookahead keeps the trailing digit so chained separators all collapse.
+  s = s.replace(/(\d)\s*\/\s*(?=\d)/g, '$1,');
+  s = s.replace(/(\d)\s*,\s*and\s+(?=\d)/g, '$1,');
+  s = s.replace(/(\d)\s*,\s*(?=\d)/g, '$1,');
+  return s.replace(/\s+/g, ' ').trim();
+}
+// A required/any-of mention may be a string or an array of acceptable
+// alternatives (matches if any alternative is present). Arrays express
+// AND-of-ORs at the fixture level: every top-level entry must match, and an
+// array entry matches when any of its phrasings appears.
+function mentionMatches(output, mention) {
+  const normalizedOutput = normalizeForMention(output);
+  const alternatives = Array.isArray(mention) ? mention : [mention];
+  return alternatives
+    .map((alternative) => normalizeForMention(alternative))
+    .filter(Boolean)
+    .some((alternative) => normalizedOutput.includes(alternative));
+}
+function describeMention(mention) {
+  return Array.isArray(mention) ? `one of [${mention.join(' | ')}]` : String(mention);
 }
 function phraseIncludes(text, snippet) {
@@ -548,11 +633,14 @@ function evaluateExerciseMentions(output, snapshot, context, surface, testCase)
 }
 function evaluateRequiredMentions(output, testCase) {
-  const missing = uniqueStrings(testCase.requiredMentions).filter((mention) => !lowerIncludes(output, mention));
+  const mentions = Array.isArray(testCase.requiredMentions) ? testCase.requiredMentions : [];
+  const missing = mentions.filter((mention) => !mentionMatches(output, mention));
   return {
     key: 'required_mentions',
     passed: missing.length === 0,
-    reason: missing.length === 0 ? 'All required mentions present.' : `Missing required mention(s): ${missing.join(', ')}`
+    reason: missing.length === 0
+      ? 'All required mentions present.'
+      : `Missing required mention(s): ${missing.map(describeMention).join(', ')}`
   };
 }
@@ -566,7 +654,7 @@ function evaluateAnyOfMentions(output, testCase) {
     };
   }
-  const matched = candidates.some((mention) => lowerIncludes(output, mention));
+  const matched = candidates.some((mention) => mentionMatches(output, mention));
   return {
     key: 'required_any_of_mentions',
     passed: matched,
@@ -1162,13 +1250,22 @@ function hasAskFatigueSupport(snapshot, lookbackDays = 7) {
   return false;
 }
+function parseWeightNumber(raw) {
+  return Number(String(raw).replace(/,/g, ''));
+}
 function extractAskWeightClaims(text) {
   const claims = [];
-  const pattern = /\b(\d+(?:\.\d+)?)\s*(?:kg|kilograms?)\b/gi;
+  // Accept comma-grouped thousands ("40,500 kg") as a single number so volume
+  // figures are not shredded into bogus "500 kg" / "000 kg" claims. Volume/total
+  // figures are excluded by isVolumeWeightClaim at the call sites, not by a
+  // magnitude cap — heavy machine work (leg press, sled) legitimately exceeds
+  // 1000 kg, and a fabricated heavy load must still be graded.
+  const pattern = /\b(\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d+(?:\.\d+)?)\s*(?:kg|kilograms?)\b/gi;
   for (const match of text.matchAll(pattern)) {
     claims.push({
       text: match[0],
-      value: Number(match[1]),
+      value: parseWeightNumber(match[1]),
       index: match.index ?? -1
     });
   }
@@ -1177,12 +1274,17 @@ function extractAskWeightClaims(text) {
 function extractAskWeightedSetClaims(text) {
   const claims = [];
-  const pattern = /\b(\d+(?:\.\d+)?)\s*(?:kg|kilograms?)\s*(?:x|×|for)\s*(\d+)\b/gi;
+  // A weight×reps pair is only unambiguous with "x"/"×" (e.g. "70 kg x 5"), or
+  // an explicit "for N rep(s)". Bare "X kg for N" is NOT a rep claim — N is
+  // almost always a SET count ("70 kg for 4 working sets") or a duration, and
+  // treating it as reps flags real data as a fabricated pair. So match only the
+  // unambiguous forms; the plain-weight loop still grounds the weight itself.
+  const pattern = /\b(\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d+(?:\.\d+)?)\s*(?:kg|kilograms?)\s*(?:(?:x|×)\s*(\d+)|for\s+(\d+)\s*reps?)\b/gi;
   for (const match of text.matchAll(pattern)) {
     claims.push({
       text: match[0],
-      weight: Number(match[1]),
-      reps: Number(match[2]),
+      weight: parseWeightNumber(match[1]),
+      reps: Number(match[2] ?? match[3]),
       index: match.index ?? -1,
       end: (match.index ?? -1) + match[0].length
     });
@@ -1225,10 +1327,58 @@ function isEstimatedOneRepMaxWeightClaim(text, claim) {
 }
 function isVolumeWeightClaim(text, claim) {
-  const start = Math.max(0, claim.index - 30);
-  const end = Math.min(text.length, claim.index + claim.text.length + 30);
-  const window = text.slice(start, end);
-  return /\bvolume\b/i.test(window);
+  // A kg figure in a clause about volume/tonnage/total load is a workload total
+  // (e.g. "weekly strength volume fell from 44,000 kg to 40,500 kg"), not an
+  // exercise load. Scope to the claim's clause so a fabricated exercise load
+  // earlier in the same sentence is still graded.
+  return /\b(?:volume|tonnage|total\s+(?:load|work|volume|tonnage))\b/i.test(claimClause(text, claim));
+}
+function claimClause(text, claim) {
+  const boundaries = [
+    '\n',
+    '. ',
+    ';',
+    ', while',
+    ', whereas',
+    ', but',
+    ' while ',
+    ' whereas ',
+    ' but '
+  ];
+  let start = 0;
+  for (const boundary of boundaries) {
+    const index = text.lastIndexOf(boundary, claim.index);
+    if (index >= 0) start = Math.max(start, index + boundary.length);
+  }
+  let end = text.length;
+  for (const boundary of boundaries) {
+    const index = text.indexOf(boundary, claim.index + claim.text.length);
+    if (index >= 0) end = Math.min(end, index);
+  }
+  return text.slice(start, end);
+}
+// Returns the sentence containing the claim, so context guards can look at the
+// whole clause rather than a fixed-width window (body-weight phrasing can put
+// the "body weight" anchor well before the kg figure).
+function claimSentence(text, claim) {
+  const before = text.slice(0, claim.index);
+  const startBreak = Math.max(before.lastIndexOf('. '), before.lastIndexOf('\n'));
+  const start = startBreak >= 0 ? startBreak + 1 : 0;
+  const after = text.slice(claim.index);
+  const breaks = [after.indexOf('. '), after.indexOf('\n')].filter((i) => i >= 0);
+  const end = breaks.length ? claim.index + Math.min(...breaks) : text.length;
+  return text.slice(start, end);
+}
+// Body-weight figures ("body weight is up 0.6 kg", "80.0 kg latest") are not
+// exercise-load claims. findNearestMentionedExercise would otherwise attribute
+// them to the previously named lift and flag a correct answer as a
+// hallucination, so skip any kg figure stated in a body-weight clause.
+function isBodyWeightClaim(text, claim) {
+  return /\bbody\s*weight\b|\bbodyweight\b|\bweight\s+trend\b/i.test(claimSentence(text, claim));
 }
 function askWorkingTopSetRows(snapshot) {
@@ -1367,6 +1517,70 @@ function evaluateAskDirectionalConsistency(output, snapshot, testCase) {
   };
 }
+// Increment Score component names. Recited with a number, these are the raw
+// sub-scores the coach-observation-voice spec marks Tier 1 — never surface.
+const SCORE_COMPONENT_NAMES = ['coverage', 'stimulus', 'execution', 'progression', 'recovery'];
+// A score-like magnitude: 1-3 digits, optional one decimal place.
+const SCORE_NUMBER = '\\d{1,3}(?:\\.\\d+)?';
+// Contexts that mean the number is real-world data — reps, load, time, counts,
+// ratios, percentages, the /100 headline — not a raw component sub-score. A
+// number directly followed by one of these is left alone.
+const NON_SCORE_UNIT =
+  '(?:kg|kilo|lbs?|pounds?|reps?|sets?|%|percent|pct|x\\b|for\\s+\\d|sessions?|days?|nights?|weeks?|months?|' +
+  'years?|yrs?|hrs?|hours?|mins?|minutes?|secs?|seconds?|rpe|rir|am|pm|out\\s+of|of\\b|/\\s*\\d)';
+// Heuristic, not a parser. Flags a component name followed — within a short,
+// period/newline-free gap (one clause) — by a score-like number that is not a
+// real-world unit. The bounded gap (excludes digits, so it can't skip a unit'd
+// number) catches the natural phrasings an LLM actually emits — "recovery 35",
+// "recovery is 35", "recovery is sitting at 35", "recovery came in at 35",
+// "recovery (35)", "recovery is much lower at 42.8", "coverage 100" — while the
+// unit lookahead keeps clean prose ("recovery over the last 3 sessions",
+// "recovery after 3 hours of sleep", "execution at 9/10 RPE") from tripping.
+const SCORE_COMPONENT_DUMP_PATTERN = new RegExp(
+  // `(?!\\.\\d)` rejects a number that is really the integer part of a decimal —
+  // without it, backtracking matches "2" in "progression of 2.5 kg" (the unit
+  // guard only sees the ".5 kg" tail) and false-flags real load/time data.
+  `\\b(${SCORE_COMPONENT_NAMES.join('|')})\\b[^.\\d\\n]{0,25}?(${SCORE_NUMBER})\\b(?!\\.\\d)(?!\\s*${NON_SCORE_UNIT})`,
+  'gi'
+);
+// The other dump the prelude used to emit and the model parroted: an explicit
+// day-over-day delta number ("-13 day-over-day delta", "down 11 points day over
+// day"). A bare "down day-over-day" with no number is fine.
+const SCORE_DELTA_DUMP_PATTERN =
+  /[+-]\d+(?:\.\d+)?[^.\n]{0,16}?day[- ]over[- ]day|(?:\d+(?:\.\d+)?\s*points?)[^.\n]{0,16}?day[- ]over[- ]day|day[- ]over[- ]day[^.\n]{0,16}?(?:[+-]\d+(?:\.\d+)?|\d+(?:\.\d+)?\s*points?)/i;
+export function evaluateAskScoreVoice(output, testCase) {
+  if (testCase.surface !== 'ask') {
+    return { key: 'ask_score_voice', passed: true, reason: 'Not an ask answer.' };
+  }
+  // Escape hatch for cases that legitimately need raw component values
+  // (e.g. an ask case paired with the numbers-only tone).
+  if (testCase.allowScoreComponents === true) {
+    return { key: 'ask_score_voice', passed: true, reason: 'Score-component voice check opted out for this case.' };
+  }
+  const text = normalizeText(output);
+  const hits = new Set();
+  for (const match of text.matchAll(SCORE_COMPONENT_DUMP_PATTERN)) {
+    hits.add(`${match[1]} ${match[2]}`);
+  }
+  if (SCORE_DELTA_DUMP_PATTERN.test(text)) {
+    hits.add('day-over-day delta number');
+  }
+  return {
+    key: 'ask_score_voice',
+    passed: hits.size === 0,
+    reason: hits.size === 0
+      ? 'Ask answer does not recite raw Increment Score component sub-scores.'
+      : `Ask answer recites raw score internals: ${[...hits].join(', ')}. Speak in training reality, not raw sub-scores.`
+  };
+}
 function relevantSessionsForStaleness(snapshot, testCase) {
   const configuredExercise = testCase.staleness?.exercise ?? testCase.staleness?.exerciseName
     ?? testCase.directionalConsistency?.[0]?.exercise
@@ -1379,6 +1593,65 @@ function relevantSessionsForStaleness(snapshot, testCase) {
   ));
 }
+// The coach IS the coach — it must speak in the first person and never refer to
+// itself or its own outputs as a third party ("the coach observation says…",
+// "the system shows…"). Own the observation instead ("I flagged…").
+const ASK_SELF_REFERENCE_PATTERNS = [
+  /\bthe coach observations?\b/i,
+  /\bthe coach\b/i,
+  /\bthe ai coach\b/i,
+  /\byour coach\b/i,
+  /\bthis coach\b/i,
+  /\bthe system\b/i,
+  /\bthe assistant\b/i
+];
+function evaluateAskSelfReference(output, testCase) {
+  if (testCase.surface !== 'ask') {
+    return { key: 'ask_self_reference', passed: true, reason: 'Not an ask answer.' };
+  }
+  const text = normalizeText(output);
+  if (text === 'NO_INSIGHT' || !text) {
+    return { key: 'ask_self_reference', passed: true, reason: 'No answer text.' };
+  }
+  const hits = [];
+  for (const pattern of ASK_SELF_REFERENCE_PATTERNS) {
+    const match = text.match(pattern);
+    if (match) hits.push(match[0]);
+  }
+  const unique = uniqueStrings(hits);
+  return {
+    key: 'ask_self_reference',
+    passed: unique.length === 0,
+    reason: unique.length === 0
+      ? 'Ask answer speaks in the first person.'
+      : `Ask answer refers to itself in the third person: ${unique.join(', ')}. You ARE the coach — own it ("I flagged…", "your data shows…").`
+  };
+}
+// On a question that is not about the Increment Score, the coach must not
+// volunteer the bare overall score number (e.g. "your score is 92/100"). The
+// prelude withholds the number for non-score questions; this guards the answer.
+function evaluateAskVolunteeredScore(output, testCase) {
+  if (testCase.surface !== 'ask') {
+    return { key: 'ask_volunteered_score', passed: true, reason: 'Not an ask answer.' };
+  }
+  const question = testCase.context?.question ?? testCase.question ?? '';
+  if (isScoreQuestion(question)) {
+    return { key: 'ask_volunteered_score', passed: true, reason: 'Question is about the score; naming it is allowed.' };
+  }
+  const text = normalizeText(output);
+  const volunteered = /\b\d{2,3}\s*\/\s*100\b/.test(text)
+    || /\b(?:increment\s+)?score\s+(?:is|of|at|sits at|currently|was)\b[^.\n]*\b\d{2,3}\b/i.test(text);
+  return {
+    key: 'ask_volunteered_score',
+    passed: !volunteered,
+    reason: volunteered
+      ? 'Ask answer volunteers the overall Increment Score number on a question that was not about the score. Translate it to the limiter instead.'
+      : 'Ask answer does not volunteer the score number unprompted.'
+  };
+}
 function evaluateAskStaleness(output, snapshot, testCase) {
   if (testCase.surface !== 'ask') {
     return { key: 'ask_staleness', passed: true, reason: 'Not an ask answer.' };
@@ -1472,6 +1745,7 @@ function evaluateAskClaims(output, snapshot, testCase) {
   for (const claim of extractAskWeightClaims(normalized)) {
     if (isEstimatedOneRepMaxWeightClaim(normalized, claim)) continue;
     if (isVolumeWeightClaim(normalized, claim)) continue;
+    if (isBodyWeightClaim(normalized, claim)) continue;
     const referencedExercise = findNearestMentionedExercise(mentionedExercises, claim.index);
     if (!referencedExercise) continue;
     const allowedWeights = allowedWeightsForExercise(snapshot, referencedExercise.normalizedName);
@@ -1713,6 +1987,7 @@ function evaluateAskToolProvenance(output, context, testCase, snapshot) {
     if (isWithinWeightedSetClaim(claim, weightedSetClaims)) continue;
     if (isEstimatedOneRepMaxWeightClaim(output, claim)) continue;
     if (isVolumeWeightClaim(output, claim)) continue;
+    if (isBodyWeightClaim(output, claim)) continue;
     const referencedExercise = findNearestMentionedExercise(mentionedExercises, claim.index);
     if (!referencedExercise) continue;
     const rows = evidenceRows.filter((row) => row.normalizedName === referencedExercise.normalizedName);
@@ -1770,6 +2045,309 @@ function evaluateAskToolProvenance(output, context, testCase, snapshot) {
   };
 }
+function scoreFormulaEntries(snapshot) {
+  const seen = new Set();
+  return scoreHistoryFromSnapshot(snapshot).filter((entry) => {
+    if (!entry) return false;
+    const key = entry.id ?? entry.snapshotAt;
+    if (key == null) return true;
+    if (seen.has(key)) return false;
+    seen.add(key);
+    return true;
+  });
+}
+function evaluateFormulaVersion(_output, snapshot, testCase) {
+  const expected = testCase.expectedFormulaVersion ?? testCase.formulaVersion ?? null;
+  if (!expected) {
+    return { key: 'formula_version', passed: true, reason: 'No formula version pin configured.' };
+  }
+  const entries = scoreFormulaEntries(snapshot);
+  const missingCount = entries.filter((entry) => !entry?.formulaVersion).length;
+  const versions = uniqueStrings(entries.map((entry) => entry?.formulaVersion));
+  const passed = entries.length > 0 && missingCount === 0 && versions.every((version) => version === expected);
+  return {
+    key: 'formula_version',
+    passed,
+    reason: passed
+      ? `Formula version is pinned to ${expected}.`
+      : missingCount > 0
+        ? `Expected formula version ${expected}, but ${missingCount} score snapshot(s) have no formula version.`
+        : versions.length > 0
+        ? `Expected formula version ${expected}, got ${versions.join(', ')}.`
+        : `Expected formula version ${expected}, but snapshot has no increment score formula version.`
+  };
+}
+function arrayContainsAll(actual = [], expected = []) {
+  const actualSet = new Set(actual ?? []);
+  return (expected ?? []).every((item) => actualSet.has(item));
+}
+function arrayEquals(actual = [], expected = []) {
+  if (!Array.isArray(actual) || !Array.isArray(expected) || actual.length !== expected.length) {
+    return false;
+  }
+  const sortedActual = [...actual].sort();
+  const sortedExpected = [...expected].sort();
+  return sortedActual.every((item, index) => item === sortedExpected[index]);
+}
+function askObservationCheckMatches(actualCheck, expectedCheck) {
+  return Object.entries(expectedCheck ?? {}).every(([key, value]) => {
+    if (Array.isArray(value)) return arrayContainsAll(actualCheck?.[key], value);
+    return actualCheck?.[key] === value;
+  });
+}
+function evaluateAskEvidencePlan(_output, context, testCase) {
+  if (testCase.surface !== 'ask') {
+    return { key: 'ask_evidence_plan', passed: true, reason: 'Not an ask answer.' };
+  }
+  const expected = testCase.expectedEvidencePlan ?? null;
+  if (!expected) {
+    return { key: 'ask_evidence_plan', passed: true, reason: 'No evidence plan assertion configured.' };
+  }
+  const plan = context?.routedMetadata?.evidencePlan ?? null;
+  const failures = [];
+  if (!plan) {
+    failures.push('Routed Ask context did not expose metadata.evidencePlan.');
+  } else {
+    for (const key of ['route', 'effectiveRoute', 'fallbackRoute']) {
+      if (key in expected && plan[key] !== expected[key]) {
+        failures.push(`Expected evidencePlan.${key}=${expected[key] ?? 'null'}, got ${plan[key] ?? 'null'}.`);
+      }
+    }
+    for (const key of ['requiredTools', 'optionalTools', 'executedTools', 'evidenceGaps']) {
+      if (Array.isArray(expected[key]) && !arrayEquals(plan[key], expected[key])) {
+        failures.push(`Expected evidencePlan.${key} to equal ${expected[key].join(', ')}; got ${(plan[key] ?? []).join(', ')}.`);
+      }
+    }
+    if (Array.isArray(expected.excludedExecutedTools)) {
+      const executed = new Set(plan.executedTools ?? []);
+      const hits = expected.excludedExecutedTools.filter((toolName) => executed.has(toolName));
+      if (hits.length > 0) {
+        failures.push(`Expected evidencePlan.executedTools to exclude ${hits.join(', ')}.`);
+      }
+    }
+    for (const expectedCheck of expected.observationChecks ?? []) {
+      const matched = (plan.observationChecks ?? []).some((actualCheck) => askObservationCheckMatches(actualCheck, expectedCheck));
+      if (!matched) {
+        failures.push(`Expected observation check ${JSON.stringify(expectedCheck)}; got ${JSON.stringify(plan.observationChecks ?? [])}.`);
+      }
+    }
+    if (Array.isArray(expected.observationChecks) && (plan.observationChecks ?? []).length !== expected.observationChecks.length) {
+      failures.push(`Expected ${expected.observationChecks.length} observation check(s), got ${(plan.observationChecks ?? []).length}.`);
+    }
+  }
+  return {
+    key: 'ask_evidence_plan',
+    passed: failures.length === 0,
+    reason: failures.length === 0
+      ? 'Ask evidence plan matches configured assertions.'
+      : failures.join(' ')
+  };
+}
+function askMetadataObservationReferences(metadata) {
+  const references = new Set([
+    ...(metadata?.includedCoachObservationIds ?? []),
+    ...(metadata?.coachObservationIds ?? [])
+  ]);
+  for (const comparison of metadata?.sessionObservationComparisons ?? []) {
+    if (comparison?.observationId) references.add(comparison.observationId);
+  }
+  for (const item of metadata?.provenance ?? []) {
+    for (const sourceId of item?.sourceIds ?? []) {
+      references.add(sourceId);
+    }
+  }
+  return references;
+}
+function evaluateAskMetadata(output, context, testCase) {
+  if (testCase.surface !== 'ask') {
+    return { key: 'ask_metadata', passed: true, reason: 'Not an ask answer.' };
+  }
+  const expected = testCase.expectedMetadata ?? null;
+  if (!expected) {
+    return { key: 'ask_metadata', passed: true, reason: 'No Ask metadata assertion configured.' };
+  }
+  const metadata = context?.routedMetadata ?? {};
+  const failures = [];
+  if (Array.isArray(expected.includedCoachObservationIds)) {
+    const included = new Set(metadata.includedCoachObservationIds ?? []);
+    const missing = expected.includedCoachObservationIds.filter((id) => !included.has(id));
+    if (missing.length > 0) {
+      failures.push(`Expected included coach observation id(s): ${missing.join(', ')}.`);
+    }
+  }
+  if (Array.isArray(expected.excludedCoachObservationIds)) {
+    const references = askMetadataObservationReferences(metadata);
+    const hits = expected.excludedCoachObservationIds.filter((id) => references.has(id));
+    if (hits.length > 0) {
+      failures.push(`Expected coach observation id(s) to be excluded from rendered metadata: ${hits.join(', ')}.`);
+    }
+  }
+  if (Array.isArray(expected.forbiddenObservationPhrases)) {
+    const hits = uniqueStrings(expected.forbiddenObservationPhrases).filter((phrase) => phraseIncludes(output, phrase));
+    if (hits.length > 0) {
+      failures.push(`Dismissed or excluded observation phrase(s) leaked into Ask answer: ${hits.join(', ')}.`);
+    }
+  }
+  return {
+    key: 'ask_metadata',
+    passed: failures.length === 0,
+    reason: failures.length === 0
+      ? 'Ask metadata matches configured assertions.'
+      : failures.join(' ')
+  };
+}
+function normalizedStructuredText(value) {
+  return String(value ?? '')
+    .toLowerCase()
+    .replace(/[^a-z0-9]+/g, ' ')
+    .replace(/\b(my|the|a|an)\b/g, ' ')
+    .replace(/\s+/g, ' ')
+    .trim();
+}
+function structuredStringArray(value) {
+  return Array.isArray(value)
+    ? value.map((item) => String(item ?? '').trim()).filter(Boolean)
+    : [];
+}
+function structuredObjectStringArray(items, key) {
+  return Array.isArray(items)
+    ? items.map((item) => String(item?.[key] ?? '').trim()).filter(Boolean)
+    : [];
+}
+function requireStructuredStrings(actual, expected, label, failures) {
+  if (!Array.isArray(expected)) return;
+  const actualSet = new Set(structuredStringArray(actual));
+  const missing = expected.filter((item) => !actualSet.has(item));
+  if (missing.length > 0) {
+    failures.push(`Expected structured ${label}: ${missing.join(', ')}.`);
+  }
+}
+function forbidStructuredSuggestions(actual, forbidden, failures) {
+  if (!Array.isArray(forbidden)) return;
+  const normalizedActual = new Set(structuredStringArray(actual).map(normalizedStructuredText).filter(Boolean));
+  const hits = forbidden.filter((item) => normalizedActual.has(normalizedStructuredText(item)));
+  if (hits.length > 0) {
+    failures.push(`Forbidden follow-up suggestion(s) present: ${hits.join(', ')}.`);
+  }
+}
+function evaluateAskStructuredResponse(_output, context, testCase, structured) {
+  if (testCase.surface !== 'ask') {
+    return { key: 'ask_structured_response', passed: true, reason: 'Not an ask answer.' };
+  }
+  const expected = testCase.expectedStructuredResponse ?? null;
+  if (!expected) {
+    return { key: 'ask_structured_response', passed: true, reason: 'No structured response assertion configured.' };
+  }
+  const failures = [];
+  if (!structured || typeof structured !== 'object' || Array.isArray(structured)) {
+    failures.push('Ask structured response was not generated.');
+  } else {
+    if (expected.confidence && structured.confidence !== expected.confidence) {
+      failures.push(`Expected structured confidence ${expected.confidence}, got ${structured.confidence ?? 'null'}.`);
+    }
+    requireStructuredStrings(
+      structuredObjectStringArray(structured.evidenceUsed, 'toolName'),
+      expected.requiredEvidenceTools,
+      'evidence tool(s)',
+      failures
+    );
+    requireStructuredStrings(
+      structuredObjectStringArray(structured.evidenceUsed, 'label'),
+      expected.requiredEvidenceLabels,
+      'evidence label(s)',
+      failures
+    );
+    requireStructuredStrings(
+      structuredObjectStringArray(structured.recommendedActions, 'label'),
+      expected.requiredRecommendedActionLabels,
+      'recommended action label(s)',
+      failures
+    );
+    requireStructuredStrings(
+      structured.followUpSuggestions,
+      expected.requiredFollowUpSuggestions,
+      'follow-up suggestion(s)',
+      failures
+    );
+    requireStructuredStrings(
+      structured.limitations,
+      expected.requiredLimitations,
+      'limitation(s)',
+      failures
+    );
+    forbidStructuredSuggestions(structured.followUpSuggestions, expected.forbiddenFollowUpSuggestions, failures);
+    const followUps = structuredStringArray(structured.followUpSuggestions);
+    const normalizedFollowUps = followUps.map(normalizedStructuredText).filter(Boolean);
+    const duplicateCount = normalizedFollowUps.length - new Set(normalizedFollowUps).size;
+    if (duplicateCount > 0) {
+      failures.push('Structured follow-up suggestions must be unique.');
+    }
+    const normalizedQuestion = normalizedStructuredText(context?.question ?? testCase.context?.question ?? testCase.question ?? '');
+    if (normalizedQuestion && normalizedFollowUps.includes(normalizedQuestion)) {
+      failures.push('Structured follow-up suggestions must not repeat the current user question.');
+    }
+    if (Number.isFinite(expected.maxFollowUpSuggestions) && followUps.length > expected.maxFollowUpSuggestions) {
+      failures.push(`Expected at most ${expected.maxFollowUpSuggestions} follow-up suggestion(s), got ${followUps.length}.`);
+    }
+    if (Number.isFinite(expected.minFollowUpSuggestions) && followUps.length < expected.minFollowUpSuggestions) {
+      failures.push(`Expected at least ${expected.minFollowUpSuggestions} follow-up suggestion(s), got ${followUps.length}.`);
+    }
+    if (typeof expected.programDraftPresent === 'boolean') {
+      const hasProgramDraft = structured.programDraft != null;
+      if (hasProgramDraft !== expected.programDraftPresent) {
+        failures.push(`Expected programDraft present=${expected.programDraftPresent}, got ${hasProgramDraft}.`);
+      }
+    }
+  }
+  return {
+    key: 'ask_structured_response',
+    passed: failures.length === 0,
+    reason: failures.length === 0
+      ? 'Ask structured response matches configured assertions.'
+      : failures.join(' ')
+  };
+}
+function askStructuredProgramDraft(parsedAsk, routingMetadata) {
+  const shouldSuppressDraft = routingMetadata?.requestedCoachObservationIntent === 'successor_plan'
+    && routingMetadata?.coachObservationFollowUpMissing === true;
+  return shouldSuppressDraft ? undefined : parsedAsk?.programDraft;
+}
 function firstAction(payload) {
   const actions = Array.isArray(payload?.recommendedNextActions) ? payload.recommendedNextActions : [];
   return actions.find((action) => typeof action?.action === 'string' && action.action.trim());
@@ -1998,17 +2576,20 @@ function evaluatePersonaMotivation(output, context, testCase) {
     failures.push('Feedback acknowledges a PR/positive result but frames later-set dropoff in a demotivating way.');
   }
+  const dataLimitationCaveat = /\bnot enough\s+(?:\w+\s+){0,5}?(?:data|details?|context|evidence|information|info|signals?|metrics?)\b/i.test(normalized)
+    || /\bnot enough\s+(?:\w+\s+){0,8}?to\s+(?:separate|infer|tie|connect|attribute|blame|claim|say|show|prove|know|call)\b/i.test(normalized);
   const discouragingPatterns = [
     /\bdisappointing\b/i,
     /\bunderwhelming\b/i,
     /\bunderperformed\b/i,
     /\bpoor\b/i,
-    /\bnot enough\b/i,
+    /\bnot enough\s+(?:effort|work|volume|intensity|reps?|sets?|weight|load|progress|consistency)\b/i,
     /\bfailed to\b/i,
     /\bstruggled\b/i
   ];
-  if (discouragingPatterns.some((pattern) => pattern.test(normalized))) {
+  if (!dataLimitationCaveat && discouragingPatterns.some((pattern) => pattern.test(normalized))) {
     failures.push('Feedback uses discouraging language that is likely to reduce motivation.');
   }
@@ -2026,6 +2607,33 @@ export async function runSummaryEvalCase(testCase) {
   return runSummaryEvalCaseFromSnapshot(testCase, snapshot);
 }
+// When an ask answer emits a <program_draft> block, it must be valid JSON in the
+// exact Program shape (enums, limits, no forbidden keys) — validated by the same
+// normalizer the runtime uses to accept/drop drafts. Catches malformed drafts in
+// CI instead of silently dropping them in prod. No block = nothing to check.
+function evaluateProgramDraft(output, testCase, parsedAsk = null) {
+  if (testCase.surface !== 'ask') {
+    return { key: 'program_draft', passed: true, reason: 'Not an ask answer.' };
+  }
+  if (!hasProgramDraftBlock(output)) {
+    return { key: 'program_draft', passed: true, reason: 'No program draft block.' };
+  }
+  // Validate against the EXACT runtime rules — the runtime passes
+  // canonicalExerciseName, which strips non-alphanumerics; without it the eval
+  // would green-light drafts (e.g. punctuation-only names) that prod silently drops.
+  const { programDraft } = parsedAsk ?? extractAskProgramDraft(output, {
+    canonicalizeExerciseName: canonicalExerciseName,
+    strict: true
+  });
+  return {
+    key: 'program_draft',
+    passed: programDraft != null,
+    reason: programDraft != null
+      ? 'Program draft is valid JSON matching the required shape.'
+      : 'Program draft block is malformed (invalid JSON, or fails shape/enum/limit validation).'
+  };
+}
 export function evaluateSummaryOutputFromSnapshot(testCase, snapshot, output) {
   const context = buildSummaryEvalContext(snapshot, testCase);
   if (context == null) {
@@ -2036,26 +2644,53 @@ export function evaluateSummaryOutputFromSnapshot(testCase, snapshot, output) {
     throw new Error(`Eval case ${testCase.id} produced an empty output`);
   }
+  // strict: eval rejects a draft with any malformed nested item (the runtime
+  // salvages it, but partial malformation is a regression signal). The parsed
+  // result also feeds <program_draft> stripping for the other checks.
+  const parsedAsk = testCase.surface === 'ask'
+    ? extractAskProgramDraft(output, { canonicalizeExerciseName: canonicalExerciseName, strict: true })
+    : null;
+  const structuredParsedAsk = testCase.surface === 'ask'
+    ? extractAskProgramDraft(output, { canonicalizeExerciseName: canonicalExerciseName })
+    : null;
+  const visibleOutput = parsedAsk
+    ? stripXMLTagBlocks(parsedAsk.answerText)
+    : output;
+  const structuredAsk = testCase.surface === 'ask'
+    ? buildAskStructuredResponse(visibleOutput, context.routedMetadata ?? {}, {
+        programDraft: askStructuredProgramDraft(structuredParsedAsk, context.routedMetadata),
+        question: context.question ?? testCase.context?.question ?? testCase.question ?? ''
+      })
+    : null;
   const checks = [
-    evaluateNoInsight(output, testCase),
-    evaluateShape(output, testCase),
-    evaluateRequiredMentions(output, testCase),
-    evaluateAnyOfMentions(output, testCase),
-    evaluateForbiddenPhrases(output, testCase),
-    evaluateForbiddenMentions(output, testCase),
-    evaluateExerciseMentions(output, snapshot, context, testCase.surface, testCase),
-    evaluateWorkoutClaims(output, context, testCase),
-    evaluateAskClaims(output, snapshot, testCase),
-    evaluateAskDirectionalConsistency(output, snapshot, testCase),
-    evaluateAskStaleness(output, snapshot, testCase),
-    evaluateAskToolProvenance(output, context, testCase, snapshot),
-    evaluateScoreCommentaryAction(output, context, testCase),
-    evaluateScoreCommentarySynthesis(output, context, testCase),
-    evaluateScoreCommentaryExerciseInvention(output, snapshot, context, testCase),
-    evaluateScoreCommentaryBand(output, context, testCase),
-    evaluateScoreCommentaryTone(output, testCase),
-    evaluateScoreCommentaryLength(output, testCase),
-    evaluatePersonaMotivation(output, context, testCase)
+    evaluateNoInsight(visibleOutput, testCase),
+    evaluateShape(visibleOutput, testCase),
+    evaluateRequiredMentions(visibleOutput, testCase),
+    evaluateAnyOfMentions(visibleOutput, testCase),
+    evaluateForbiddenPhrases(visibleOutput, testCase),
+    evaluateForbiddenMentions(visibleOutput, testCase),
+    evaluateExerciseMentions(visibleOutput, snapshot, context, testCase.surface, testCase),
+    evaluateWorkoutClaims(visibleOutput, context, testCase),
+    evaluateAskClaims(visibleOutput, snapshot, testCase),
+    evaluateAskDirectionalConsistency(visibleOutput, snapshot, testCase),
+    evaluateAskScoreVoice(visibleOutput, testCase),
+    evaluateAskSelfReference(visibleOutput, testCase),
+    evaluateAskVolunteeredScore(visibleOutput, testCase),
+    evaluateAskStaleness(visibleOutput, snapshot, testCase),
+    evaluateAskToolProvenance(visibleOutput, context, testCase, snapshot),
+    evaluateFormulaVersion(visibleOutput, snapshot, testCase),
+    evaluateAskEvidencePlan(visibleOutput, context, testCase),
+    evaluateAskMetadata(visibleOutput, context, testCase),
+    evaluateAskStructuredResponse(visibleOutput, context, testCase, structuredAsk),
+    evaluateScoreCommentaryAction(visibleOutput, context, testCase),
+    evaluateScoreCommentarySynthesis(visibleOutput, context, testCase),
+    evaluateScoreCommentaryExerciseInvention(visibleOutput, snapshot, context, testCase),
+    evaluateScoreCommentaryBand(visibleOutput, context, testCase),
+    evaluateScoreCommentaryTone(visibleOutput, testCase),
+    evaluateScoreCommentaryLength(visibleOutput, testCase),
+    evaluatePersonaMotivation(visibleOutput, context, testCase),
+    evaluateProgramDraft(output, testCase, parsedAsk)
   ];
   return {