incremnt 0.7.2 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,7 +3,7 @@ import path from 'node:path';
3
3
  import { fileURLToPath } from 'node:url';
4
4
  import {
5
5
  askContext,
6
- askRoutedContext,
6
+ canonicalExerciseName,
7
7
  checkpointContext,
8
8
  cycleSummaryContext,
9
9
  executeCoachReadTool,
@@ -11,6 +11,8 @@ import {
11
11
  workoutSummaryContext,
12
12
  vitalsSummaryContext
13
13
  } from './queries.js';
14
+ import { askRoutedContext, buildAskStructuredResponse } from './ask-coach.js';
15
+ import { formatIncrementScorePrelude, isScoreQuestion } from './score-prelude.js';
14
16
  import {
15
17
  AI_PROMPT_VERSIONS,
16
18
  generateAskAnswer,
@@ -20,6 +22,8 @@ import {
20
22
  generateWorkoutCoachingSummary
21
23
  } from './openrouter.js';
22
24
  import { computeScoreBand } from './score-context.js';
25
+ import { stripXMLTagBlocks } from './prompt-security.js';
26
+ import { extractAskProgramDraft, hasProgramDraftBlock } from './program-draft.js';
23
27
 
24
28
  const __filename = fileURLToPath(import.meta.url);
25
29
  const __dirname = path.dirname(__filename);
@@ -30,6 +34,14 @@ export function defaultCaseSetName() {
30
34
  return process.env.SUMMARY_EVAL_CASE_SET || 'synthetic';
31
35
  }
32
36
 
37
/**
 * Reads a boolean-style flag from an environment map.
 *
 * The raw value is stringified and lower-cased (not trimmed) before being
 * compared against the accepted truthy spellings.
 *
 * @param {string} name - Key to look up in `env`.
 * @param {object} [env=process.env] - Environment map to read from.
 * @returns {boolean} True only when the value is '1', 'true', or 'yes' (any letter case).
 */
function envFlag(name, env = process.env) {
  const truthySpellings = ['1', 'true', 'yes'];
  const rawValue = env[name] ?? '';
  const lowered = String(rawValue).toLowerCase();
  return truthySpellings.includes(lowered);
}
40
+
41
/**
 * Reports whether live generation is switched on for summary evals.
 *
 * Either SUMMARY_EVALS_LIVE or PROMPTFOO_LIVE enables it; the flags are checked
 * in that order through envFlag.
 *
 * @param {object} [env=process.env] - Environment map to read from.
 * @returns {boolean} True when at least one of the two flags is set.
 */
export function summaryEvalsLiveGenerationEnabled(env = process.env) {
  const liveFlagNames = ['SUMMARY_EVALS_LIVE', 'PROMPTFOO_LIVE'];
  return liveFlagNames.some((flagName) => envFlag(flagName, env));
}
44
+
33
45
  function stableSortByDateDesc(items, selector) {
34
46
  return [...items].sort((lhs, rhs) => String(selector(rhs)).localeCompare(String(selector(lhs))));
35
47
  }
@@ -88,12 +100,21 @@ export function buildSummaryEvalContext(snapshot, testCase) {
88
100
  case 'ask': {
89
101
  const question = testCase.context?.question ?? testCase.question ?? '';
90
102
  const today = testCase.context?.today ?? testCase.today ?? null;
103
+ const history = Array.isArray(testCase.context?.history) ? testCase.context.history : [];
91
104
  const routed = question
92
- ? askRoutedContext(snapshot, question, { exclude: new Set(testCase.exclude ?? []), today: today ?? new Date() })
105
+ ? askRoutedContext(snapshot, question, { exclude: new Set(testCase.exclude ?? []), history, today: today ?? new Date() })
93
106
  : null;
107
+ // Mirror production: the live /cli/ask path prepends the Increment Score
108
+ // prelude to the routed context. Including it here means a live eval feeds
109
+ // the model the same dump-prone material, so evaluateAskScoreVoice actually
110
+ // guards the prompt, not just the checker.
111
+ const prelude = formatIncrementScorePrelude(scoreHistoryFromSnapshot(snapshot), { question });
112
+ const routedContext = routed?.context ?? null;
113
+ const trainingData = testCase.context?.trainingData
114
+ ?? (prelude && routedContext ? `${prelude}\n\n${routedContext}` : (routedContext ?? prelude));
94
115
  return {
95
116
  ...(testCase.context ?? {}),
96
- trainingData: testCase.context?.trainingData ?? routed?.context ?? null,
117
+ trainingData,
97
118
  routedMetadata: routed?.metadata ?? null
98
119
  };
99
120
  }
@@ -117,11 +138,38 @@ function summaryEvalGenerationMetadata(result) {
117
138
  );
118
139
  }
119
140
 
141
/**
 * Builds the structured-response metadata attached to an ask eval result.
 *
 * Non-ask surfaces get an empty object. For ask cases the output is parsed with
 * extractAskProgramDraft, XML tag blocks are stripped from the parsed answer
 * text, and the result is handed to buildAskStructuredResponse together with
 * the routed metadata, program draft, and question.
 *
 * @param {object} testCase - Eval case; `surface`, `context.question`, and `question` are read.
 * @param {object|null|undefined} context - Eval context; `question` and `routedMetadata` are read.
 * @param {*} output - Model (or fixture) output, passed to extractAskProgramDraft.
 * @returns {object} `{}` for non-ask cases, otherwise `{ routingMetadata, structured }`.
 */
function buildAskEvalStructuredMetadata(testCase, context, output) {
  const isAskSurface = testCase.surface === 'ask';
  if (!isAskSurface) {
    return {};
  }

  const draftOptions = { canonicalizeExerciseName: canonicalExerciseName };
  const parsedAsk = extractAskProgramDraft(output, draftOptions);
  const answer = stripXMLTagBlocks(parsedAsk.answerText);

  // Prefer the built context's question, then the fixture's own fields.
  const question = context?.question ?? testCase.context?.question ?? testCase.question ?? '';
  const routingMetadata = context?.routedMetadata ?? null;

  const programDraft = askStructuredProgramDraft(parsedAsk, routingMetadata);
  const structured = buildAskStructuredResponse(answer, routingMetadata ?? {}, { programDraft, question });
  return { routingMetadata, structured };
}
157
+
158
/**
 * Merges generation metadata with ask structured metadata for one eval result.
 * Keys produced by buildAskEvalStructuredMetadata win on collision because they
 * are applied last.
 *
 * @param {object} testCase - Eval case forwarded to buildAskEvalStructuredMetadata.
 * @param {object|null|undefined} context - Eval context forwarded to buildAskEvalStructuredMetadata.
 * @param {*} output - Output text forwarded to buildAskEvalStructuredMetadata.
 * @param {object|null} [result=null] - Generation result forwarded to summaryEvalGenerationMetadata.
 * @returns {object} Combined metadata object.
 */
function summaryEvalProviderMetadata(testCase, context, output, result = null) {
  const generationMetadata = summaryEvalGenerationMetadata(result);
  const askMetadata = buildAskEvalStructuredMetadata(testCase, context, output);
  return Object.assign({}, generationMetadata, askMetadata);
}
164
+
120
165
  export async function generateSummaryEvalOutputWithMetadata(testCase, context, snapshot = null) {
121
- const liveGenerationEnabled = process.env.SUMMARY_EVALS_LIVE === '1';
166
+ const liveGenerationEnabled = summaryEvalsLiveGenerationEnabled();
122
167
  const apiKey = process.env.OPENROUTER_API_KEY;
123
168
  if (!liveGenerationEnabled || !apiKey || testCase.shouldPass === false) {
124
- return { output: testCase.output, metadata: {} };
169
+ return {
170
+ output: testCase.output,
171
+ metadata: summaryEvalProviderMetadata(testCase, context, testCase.output)
172
+ };
125
173
  }
126
174
 
127
175
  let result;
@@ -151,7 +199,8 @@ export async function generateSummaryEvalOutputWithMetadata(testCase, context, s
151
199
  apiKey,
152
200
  history: context.history ?? [],
153
201
  tone: context.tone,
154
- model: context.model
202
+ model: context.model,
203
+ routingMetadata: context.routedMetadata ?? undefined
155
204
  });
156
205
  break;
157
206
  }
@@ -161,7 +210,7 @@ export async function generateSummaryEvalOutputWithMetadata(testCase, context, s
161
210
 
162
211
  return {
163
212
  output: result.text,
164
- metadata: summaryEvalGenerationMetadata(result)
213
+ metadata: summaryEvalProviderMetadata(testCase, context, result.text, result)
165
214
  };
166
215
  }
167
216
 
@@ -260,8 +309,44 @@ function isSingleParagraph(text) {
260
309
  return !normalizeText(text).includes('\n\n');
261
310
  }
262
311
 
263
- function lowerIncludes(text, snippet) {
264
- return normalizeText(text).toLowerCase().includes(String(snippet).toLowerCase());
312
/**
 * Canonicalizes free-form coach text and required-mention snippets to the same
 * surface form before substring matching.
 *
 * The rewrite only adds equivalences for formatting that legitimately varies
 * (unicode × vs ASCII x, weight-unit placement inside set tokens, and
 * rep-sequence separators such as 8/8/7 vs "8, 8, and 7"). Digits are never
 * removed, so a number that is genuinely absent still fails to match.
 *
 * Every digit-joining rule uses a lookahead for the trailing digit so chained
 * tokens collapse completely: "3×8×80" becomes "3x8x80". Consuming the trailing
 * digit instead would leave the second separator of a chain unmatched whenever
 * the middle term is a single digit.
 *
 * @param {*} value - Text to canonicalize; passed through normalizeText first.
 * @returns {string} Lower-cased, whitespace-collapsed canonical form.
 */
function normalizeForMention(value) {
  let s = normalizeText(value).toLowerCase();
  // Unicode multiplication / bullet / asterisk between digits -> ASCII x.
  s = s.replace(/(\d)\s*[×✕╳·∗*]\s*(?=\d)/g, '$1x');
  // Drop weight units that sit inside set tokens: "80kg x 7" / "40 kg" -> "80 x 7" / "40".
  s = s.replace(/(\d(?:\.\d+)?)\s*(?:kgs?|lbs?|pounds)\b/g, '$1');
  // Collapse spaces around an x that joins two numbers: "80 x 7" -> "80x7".
  s = s.replace(/(\d)\s*x\s*(?=\d)/g, '$1x');
  // Unify rep-sequence separators: "8/8/7" and "8, 8, and 7" -> "8,8,7".
  s = s.replace(/(\d)\s*\/\s*(?=\d)/g, '$1,');
  s = s.replace(/(\d)\s*,\s*and\s+(?=\d)/g, '$1,');
  s = s.replace(/(\d)\s*,\s*(?=\d)/g, '$1,');
  return s.replace(/\s+/g, ' ').trim();
}
334
+
335
/**
 * Tests whether `output` contains a mention.
 *
 * A mention is either a single value or an array of acceptable alternatives;
 * an array matches when any alternative appears. Both sides are canonicalized
 * with normalizeForMention, and alternatives that canonicalize to an empty
 * string are ignored. Fixtures express AND-of-ORs by listing several mentions,
 * each of which may itself be an array of phrasings.
 *
 * @param {*} output - Answer text to search.
 * @param {*|Array} mention - One snippet, or an array of alternative snippets.
 * @returns {boolean} True when at least one non-empty alternative is found.
 */
function mentionMatches(output, mention) {
  const haystack = normalizeForMention(output);
  const candidates = Array.isArray(mention) ? mention : [mention];

  const needles = [];
  for (const candidate of candidates) {
    const needle = normalizeForMention(candidate);
    if (needle) {
      needles.push(needle);
    }
  }
  return needles.some((needle) => haystack.includes(needle));
}
347
+
348
/**
 * Formats a mention for a failure message.
 *
 * @param {*|Array} mention - A single snippet or an array of alternatives.
 * @returns {string} `one of [a | b]` for arrays, otherwise the stringified value.
 */
function describeMention(mention) {
  if (!Array.isArray(mention)) {
    return String(mention);
  }
  const alternatives = mention.join(' | ');
  return `one of [${alternatives}]`;
}
266
351
 
267
352
  function phraseIncludes(text, snippet) {
@@ -548,11 +633,14 @@ function evaluateExerciseMentions(output, snapshot, context, surface, testCase)
548
633
  }
549
634
 
550
635
  function evaluateRequiredMentions(output, testCase) {
551
- const missing = uniqueStrings(testCase.requiredMentions).filter((mention) => !lowerIncludes(output, mention));
636
+ const mentions = Array.isArray(testCase.requiredMentions) ? testCase.requiredMentions : [];
637
+ const missing = mentions.filter((mention) => !mentionMatches(output, mention));
552
638
  return {
553
639
  key: 'required_mentions',
554
640
  passed: missing.length === 0,
555
- reason: missing.length === 0 ? 'All required mentions present.' : `Missing required mention(s): ${missing.join(', ')}`
641
+ reason: missing.length === 0
642
+ ? 'All required mentions present.'
643
+ : `Missing required mention(s): ${missing.map(describeMention).join(', ')}`
556
644
  };
557
645
  }
558
646
 
@@ -566,7 +654,7 @@ function evaluateAnyOfMentions(output, testCase) {
566
654
  };
567
655
  }
568
656
 
569
- const matched = candidates.some((mention) => lowerIncludes(output, mention));
657
+ const matched = candidates.some((mention) => mentionMatches(output, mention));
570
658
  return {
571
659
  key: 'required_any_of_mentions',
572
660
  passed: matched,
@@ -1162,13 +1250,22 @@ function hasAskFatigueSupport(snapshot, lookbackDays = 7) {
1162
1250
  return false;
1163
1251
  }
1164
1252
 
1253
/**
 * Converts a captured weight figure to a number, removing every comma first so
 * comma-grouped thousands ("40,500") parse as a single value.
 *
 * @param {*} raw - Captured numeric text (stringified before parsing).
 * @returns {number} Parsed number, or NaN when the text is not numeric.
 */
function parseWeightNumber(raw) {
  const withoutGrouping = String(raw).split(',').join('');
  return Number(withoutGrouping);
}
1256
+
1165
1257
  function extractAskWeightClaims(text) {
1166
1258
  const claims = [];
1167
- const pattern = /\b(\d+(?:\.\d+)?)\s*(?:kg|kilograms?)\b/gi;
1259
+ // Accept comma-grouped thousands ("40,500 kg") as a single number so volume
1260
+ // figures are not shredded into bogus "500 kg" / "000 kg" claims. Volume/total
1261
+ // figures are excluded by isVolumeWeightClaim at the call sites, not by a
1262
+ // magnitude cap — heavy machine work (leg press, sled) legitimately exceeds
1263
+ // 1000 kg, and a fabricated heavy load must still be graded.
1264
+ const pattern = /\b(\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d+(?:\.\d+)?)\s*(?:kg|kilograms?)\b/gi;
1168
1265
  for (const match of text.matchAll(pattern)) {
1169
1266
  claims.push({
1170
1267
  text: match[0],
1171
- value: Number(match[1]),
1268
+ value: parseWeightNumber(match[1]),
1172
1269
  index: match.index ?? -1
1173
1270
  });
1174
1271
  }
@@ -1177,12 +1274,17 @@ function extractAskWeightClaims(text) {
1177
1274
 
1178
1275
  function extractAskWeightedSetClaims(text) {
1179
1276
  const claims = [];
1180
- const pattern = /\b(\d+(?:\.\d+)?)\s*(?:kg|kilograms?)\s*(?:x|×|for)\s*(\d+)\b/gi;
1277
+ // A weight×reps pair is only unambiguous with "x"/"×" (e.g. "70 kg x 5"), or
1278
+ // an explicit "for N rep(s)". Bare "X kg for N" is NOT a rep claim — N is
1279
+ // almost always a SET count ("70 kg for 4 working sets") or a duration, and
1280
+ // treating it as reps flags real data as a fabricated pair. So match only the
1281
+ // unambiguous forms; the plain-weight loop still grounds the weight itself.
1282
+ const pattern = /\b(\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d+(?:\.\d+)?)\s*(?:kg|kilograms?)\s*(?:(?:x|×)\s*(\d+)|for\s+(\d+)\s*reps?)\b/gi;
1181
1283
  for (const match of text.matchAll(pattern)) {
1182
1284
  claims.push({
1183
1285
  text: match[0],
1184
- weight: Number(match[1]),
1185
- reps: Number(match[2]),
1286
+ weight: parseWeightNumber(match[1]),
1287
+ reps: Number(match[2] ?? match[3]),
1186
1288
  index: match.index ?? -1,
1187
1289
  end: (match.index ?? -1) + match[0].length
1188
1290
  });
@@ -1225,10 +1327,58 @@ function isEstimatedOneRepMaxWeightClaim(text, claim) {
1225
1327
  }
1226
1328
 
1227
1329
  function isVolumeWeightClaim(text, claim) {
1228
- const start = Math.max(0, claim.index - 30);
1229
- const end = Math.min(text.length, claim.index + claim.text.length + 30);
1230
- const window = text.slice(start, end);
1231
- return /\bvolume\b/i.test(window);
1330
+ // A kg figure in a clause about volume/tonnage/total load is a workload total
1331
+ // (e.g. "weekly strength volume fell from 44,000 kg to 40,500 kg"), not an
1332
+ // exercise load. Scope to the claim's clause so a fabricated exercise load
1333
+ // earlier in the same sentence is still graded.
1334
+ return /\b(?:volume|tonnage|total\s+(?:load|work|volume|tonnage))\b/i.test(claimClause(text, claim));
1335
+ }
1336
+
1337
/**
 * Returns the clause of `text` that surrounds a claim.
 *
 * The clause starts just after the closest boundary found at or before the
 * claim's start index and ends at the closest boundary found at or after the
 * end of the claim text. Boundaries are newlines, ". ", ";", and the
 * conjunctions while / whereas / but (with or without a leading comma). When no
 * boundary is found on a side, the clause extends to that end of the text.
 *
 * @param {string} text - Full answer text.
 * @param {{index: number, text: string}} claim - Claim position and matched text.
 * @returns {string} The surrounding clause.
 */
function claimClause(text, claim) {
  const clauseBoundaries = [
    '\n',
    '. ',
    ';',
    ', while',
    ', whereas',
    ', but',
    ' while ',
    ' whereas ',
    ' but '
  ];
  const claimEnd = claim.index + claim.text.length;

  // Latest position at which any boundary finishes before the claim.
  const start = clauseBoundaries.reduce((latest, boundary) => {
    const at = text.lastIndexOf(boundary, claim.index);
    return at >= 0 ? Math.max(latest, at + boundary.length) : latest;
  }, 0);

  // Earliest position at which any boundary begins after the claim.
  const end = clauseBoundaries.reduce((earliest, boundary) => {
    const at = text.indexOf(boundary, claimEnd);
    return at >= 0 ? Math.min(earliest, at) : earliest;
  }, text.length);

  return text.slice(start, end);
}
1362
+
1363
/**
 * Returns the sentence containing a claim, so context guards can inspect the
 * whole sentence instead of a fixed-width window (body-weight phrasing can put
 * the "body weight" anchor well before the kg figure).
 *
 * Sentence breaks are ". " and newline. The slice begins one character after
 * the last break before the claim and ends at the first break at or after the
 * claim's start index.
 *
 * @param {string} text - Full answer text.
 * @param {{index: number}} claim - Claim position.
 * @returns {string} The surrounding sentence.
 */
function claimSentence(text, claim) {
  const sentenceBreaks = ['. ', '\n'];

  const head = text.slice(0, claim.index);
  const lastBreak = Math.max(...sentenceBreaks.map((mark) => head.lastIndexOf(mark)));
  const start = lastBreak >= 0 ? lastBreak + 1 : 0;

  const tail = text.slice(claim.index);
  const upcoming = sentenceBreaks.map((mark) => tail.indexOf(mark)).filter((at) => at >= 0);
  let end = text.length;
  if (upcoming.length > 0) {
    end = claim.index + Math.min(...upcoming);
  }
  return text.slice(start, end);
}
1375
+
1376
/**
 * Decides whether a kg figure belongs to a body-weight statement ("body weight
 * is up 0.6 kg", "80.0 kg latest") rather than an exercise load.
 *
 * Callers skip such claims so they are not attributed to the nearest named lift
 * and graded as a fabricated load. The test looks for "body weight",
 * "bodyweight", or "weight trend" anywhere in the claim's sentence.
 *
 * @param {string} text - Full answer text.
 * @param {{index: number}} claim - Extracted weight claim.
 * @returns {boolean} True when the claim's sentence is about body weight.
 */
function isBodyWeightClaim(text, claim) {
  const bodyWeightPattern = /\bbody\s*weight\b|\bbodyweight\b|\bweight\s+trend\b/i;
  const sentence = claimSentence(text, claim);
  return bodyWeightPattern.test(sentence);
}
1233
1383
 
1234
1384
  function askWorkingTopSetRows(snapshot) {
@@ -1367,6 +1517,70 @@ function evaluateAskDirectionalConsistency(output, snapshot, testCase) {
1367
1517
  };
1368
1518
  }
1369
1519
 
1520
// Increment Score component names. Recited together with a number, these are
// the raw sub-scores the coach-observation-voice spec marks Tier 1 — never
// surface.
const SCORE_COMPONENT_NAMES = ['coverage', 'stimulus', 'execution', 'progression', 'recovery'];

// A score-like magnitude: 1-3 digits with an optional decimal part.
const SCORE_NUMBER = '\\d{1,3}(?:\\.\\d+)?';

// Contexts that mark the number as real-world data — reps, load, time, counts,
// ratios, percentages, the /100 headline — rather than a raw component
// sub-score. A number directly followed by one of these is left alone.
const NON_SCORE_UNIT =
  '(?:kg|kilo|lbs?|pounds?|reps?|sets?|%|percent|pct|x\\b|for\\s+\\d|sessions?|days?|nights?|weeks?|months?|' +
  'years?|yrs?|hrs?|hours?|mins?|minutes?|secs?|seconds?|rpe|rir|am|pm|out\\s+of|of\\b|/\\s*\\d)';

// Heuristic, not a parser. Flags a component name followed, within a short gap
// that contains no period, digit, or newline, by a score-like number that is
// not followed by a real-world unit. Because the gap cannot contain digits it
// can never skip over a unit-bearing number. This catches phrasings such as
// "recovery 35", "recovery is sitting at 35", "recovery (35)", and
// "coverage 100", while the unit lookahead keeps prose such as "recovery over
// the last 3 sessions" or "execution at 9/10 RPE" from tripping.
const SCORE_COMPONENT_DUMP_PATTERN = new RegExp(
  // `(?!\\.\\d)` rejects a number that is only the integer part of a decimal.
  // Without it, backtracking would match "2" in "progression of 2.5 kg" (the
  // unit guard would only see the ".5 kg" tail) and flag real load/time data.
  `\\b(${SCORE_COMPONENT_NAMES.join('|')})\\b[^.\\d\\n]{0,25}?(${SCORE_NUMBER})\\b(?!\\.\\d)(?!\\s*${NON_SCORE_UNIT})`,
  'gi'
);

// The other kind of dump: an explicit day-over-day delta number ("-13
// day-over-day delta", "down 11 points day over day"). A bare "down
// day-over-day" without a number is fine.
const SCORE_DELTA_DUMP_PATTERN =
  /[+-]\d+(?:\.\d+)?[^.\n]{0,16}?day[- ]over[- ]day|(?:\d+(?:\.\d+)?\s*points?)[^.\n]{0,16}?day[- ]over[- ]day|day[- ]over[- ]day[^.\n]{0,16}?(?:[+-]\d+(?:\.\d+)?|\d+(?:\.\d+)?\s*points?)/i;

/**
 * Checks that an ask answer does not recite raw Increment Score internals:
 * component sub-scores or an explicit day-over-day delta number.
 *
 * Non-ask cases pass, and so do cases that set `allowScoreComponents: true`
 * (the opt-out for cases that legitimately need raw component values).
 *
 * @param {*} output - Answer text; passed through normalizeText.
 * @param {object} testCase - Eval case; `surface` and `allowScoreComponents` are read.
 * @returns {{key: string, passed: boolean, reason: string}} Check result.
 */
export function evaluateAskScoreVoice(output, testCase) {
  const key = 'ask_score_voice';
  if (testCase.surface !== 'ask') {
    return { key, passed: true, reason: 'Not an ask answer.' };
  }
  if (testCase.allowScoreComponents === true) {
    return { key, passed: true, reason: 'Score-component voice check opted out for this case.' };
  }

  const text = normalizeText(output);
  // A Set collapses repeated "<component> <number>" recitations into one entry.
  const hits = new Set(
    Array.from(text.matchAll(SCORE_COMPONENT_DUMP_PATTERN), (match) => `${match[1]} ${match[2]}`)
  );
  if (SCORE_DELTA_DUMP_PATTERN.test(text)) {
    hits.add('day-over-day delta number');
  }

  const passed = hits.size === 0;
  const reason = passed
    ? 'Ask answer does not recite raw Increment Score component sub-scores.'
    : `Ask answer recites raw score internals: ${[...hits].join(', ')}. Speak in training reality, not raw sub-scores.`;
  return { key, passed, reason };
}
1583
+
1370
1584
  function relevantSessionsForStaleness(snapshot, testCase) {
1371
1585
  const configuredExercise = testCase.staleness?.exercise ?? testCase.staleness?.exerciseName
1372
1586
  ?? testCase.directionalConsistency?.[0]?.exercise
@@ -1379,6 +1593,65 @@ function relevantSessionsForStaleness(snapshot, testCase) {
1379
1593
  ));
1380
1594
  }
1381
1595
 
1596
// The coach IS the coach: it must speak in the first person and never refer to
// itself or its own outputs as a third party ("the coach observation says…",
// "the system shows…"). It should own the observation instead ("I flagged…").
const ASK_SELF_REFERENCE_PATTERNS = [
  /\bthe coach observations?\b/i,
  /\bthe coach\b/i,
  /\bthe ai coach\b/i,
  /\byour coach\b/i,
  /\bthis coach\b/i,
  /\bthe system\b/i,
  /\bthe assistant\b/i
];

/**
 * Checks that an ask answer does not refer to the coach in the third person.
 *
 * Non-ask cases pass, as do empty answers and the literal 'NO_INSIGHT'. The
 * first match of each pattern is collected and de-duplicated with uniqueStrings
 * for the failure message.
 *
 * @param {*} output - Answer text; passed through normalizeText.
 * @param {object} testCase - Eval case; `surface` is read.
 * @returns {{key: string, passed: boolean, reason: string}} Check result.
 */
function evaluateAskSelfReference(output, testCase) {
  const key = 'ask_self_reference';
  if (testCase.surface !== 'ask') {
    return { key, passed: true, reason: 'Not an ask answer.' };
  }

  const text = normalizeText(output);
  if (!text || text === 'NO_INSIGHT') {
    return { key, passed: true, reason: 'No answer text.' };
  }

  const hits = ASK_SELF_REFERENCE_PATTERNS
    .map((pattern) => text.match(pattern))
    .filter((match) => match)
    .map((match) => match[0]);
  const unique = uniqueStrings(hits);

  const passed = unique.length === 0;
  const reason = passed
    ? 'Ask answer speaks in the first person.'
    : `Ask answer refers to itself in the third person: ${unique.join(', ')}. You ARE the coach — own it ("I flagged…", "your data shows…").`;
  return { key, passed, reason };
}
1631
+
1632
/**
 * Checks that the coach does not volunteer the bare overall score number (e.g.
 * "your score is 92/100") on a question that is not about the Increment Score.
 *
 * Non-ask cases pass, and so do questions that isScoreQuestion recognizes as
 * score questions. Otherwise the answer fails when it contains an "NN/100"
 * figure, or a "score is/of/at/sits at/currently/was … NN" phrase inside a
 * single sentence.
 *
 * @param {*} output - Answer text; passed through normalizeText.
 * @param {object} testCase - Eval case; `surface`, `context.question`, and `question` are read.
 * @returns {{key: string, passed: boolean, reason: string}} Check result.
 */
function evaluateAskVolunteeredScore(output, testCase) {
  const key = 'ask_volunteered_score';
  if (testCase.surface !== 'ask') {
    return { key, passed: true, reason: 'Not an ask answer.' };
  }

  const question = testCase.context?.question ?? testCase.question ?? '';
  if (isScoreQuestion(question)) {
    return { key, passed: true, reason: 'Question is about the score; naming it is allowed.' };
  }

  const text = normalizeText(output);
  const headlinePattern = /\b\d{2,3}\s*\/\s*100\b/;
  const statedScorePattern = /\b(?:increment\s+)?score\s+(?:is|of|at|sits at|currently|was)\b[^.\n]*\b\d{2,3}\b/i;
  const volunteered = headlinePattern.test(text) || statedScorePattern.test(text);

  let reason = 'Ask answer does not volunteer the score number unprompted.';
  if (volunteered) {
    reason = 'Ask answer volunteers the overall Increment Score number on a question that was not about the score. Translate it to the limiter instead.';
  }
  return { key, passed: !volunteered, reason };
}
1654
+
1382
1655
  function evaluateAskStaleness(output, snapshot, testCase) {
1383
1656
  if (testCase.surface !== 'ask') {
1384
1657
  return { key: 'ask_staleness', passed: true, reason: 'Not an ask answer.' };
@@ -1472,6 +1745,7 @@ function evaluateAskClaims(output, snapshot, testCase) {
1472
1745
  for (const claim of extractAskWeightClaims(normalized)) {
1473
1746
  if (isEstimatedOneRepMaxWeightClaim(normalized, claim)) continue;
1474
1747
  if (isVolumeWeightClaim(normalized, claim)) continue;
1748
+ if (isBodyWeightClaim(normalized, claim)) continue;
1475
1749
  const referencedExercise = findNearestMentionedExercise(mentionedExercises, claim.index);
1476
1750
  if (!referencedExercise) continue;
1477
1751
  const allowedWeights = allowedWeightsForExercise(snapshot, referencedExercise.normalizedName);
@@ -1713,6 +1987,7 @@ function evaluateAskToolProvenance(output, context, testCase, snapshot) {
1713
1987
  if (isWithinWeightedSetClaim(claim, weightedSetClaims)) continue;
1714
1988
  if (isEstimatedOneRepMaxWeightClaim(output, claim)) continue;
1715
1989
  if (isVolumeWeightClaim(output, claim)) continue;
1990
+ if (isBodyWeightClaim(output, claim)) continue;
1716
1991
  const referencedExercise = findNearestMentionedExercise(mentionedExercises, claim.index);
1717
1992
  if (!referencedExercise) continue;
1718
1993
  const rows = evidenceRows.filter((row) => row.normalizedName === referencedExercise.normalizedName);
@@ -1770,6 +2045,309 @@ function evaluateAskToolProvenance(output, context, testCase, snapshot) {
1770
2045
  };
1771
2046
  }
1772
2047
 
2048
/**
 * Lists the snapshot's score-history entries with falsy entries dropped and
 * duplicates removed.
 *
 * Entries are keyed by `id`, falling back to `snapshotAt`; the first entry for
 * each key is kept. Entries with neither key cannot be de-duplicated and are
 * all kept.
 *
 * @param {*} snapshot - Forwarded to scoreHistoryFromSnapshot.
 * @returns {Array} Filtered entries in their original order.
 */
function scoreFormulaEntries(snapshot) {
  const seenKeys = new Set();
  const isFirstOccurrence = (entry) => {
    if (!entry) return false;
    const dedupeKey = entry.id ?? entry.snapshotAt;
    if (dedupeKey == null) return true;
    const alreadySeen = seenKeys.has(dedupeKey);
    seenKeys.add(dedupeKey);
    return !alreadySeen;
  };
  return scoreHistoryFromSnapshot(snapshot).filter((entry) => isFirstOccurrence(entry));
}
2059
+
2060
/**
 * Checks that every de-duplicated score snapshot carries the pinned formula
 * version.
 *
 * The pin comes from `testCase.expectedFormulaVersion`, falling back to
 * `testCase.formulaVersion`; with no pin the check passes. It fails when there
 * are no entries, when any entry lacks a formula version, or when any version
 * differs from the pin.
 *
 * @param {*} _output - Unused.
 * @param {*} snapshot - Forwarded to scoreFormulaEntries.
 * @param {object} testCase - Eval case carrying the version pin.
 * @returns {{key: string, passed: boolean, reason: string}} Check result.
 */
function evaluateFormulaVersion(_output, snapshot, testCase) {
  const key = 'formula_version';
  const expected = testCase.expectedFormulaVersion ?? testCase.formulaVersion ?? null;
  if (!expected) {
    return { key, passed: true, reason: 'No formula version pin configured.' };
  }

  const entries = scoreFormulaEntries(snapshot);
  const missingCount = entries.reduce((count, entry) => (entry?.formulaVersion ? count : count + 1), 0);
  const versions = uniqueStrings(entries.map((entry) => entry?.formulaVersion));
  const allPinned = versions.every((version) => version === expected);
  const passed = entries.length > 0 && missingCount === 0 && allPinned;

  let reason;
  if (passed) {
    reason = `Formula version is pinned to ${expected}.`;
  } else if (missingCount > 0) {
    reason = `Expected formula version ${expected}, but ${missingCount} score snapshot(s) have no formula version.`;
  } else if (versions.length > 0) {
    reason = `Expected formula version ${expected}, got ${versions.join(', ')}.`;
  } else {
    reason = `Expected formula version ${expected}, but snapshot has no increment score formula version.`;
  }
  return { key, passed, reason };
}
2082
+
2083
/**
 * Tests whether every item of `expected` is present in `actual` (Set
 * membership, so duplicates and order are ignored). Null or undefined on either
 * side is treated as empty.
 *
 * @param {Iterable|null} [actual=[]] - Values that are present.
 * @param {Array|null} [expected=[]] - Values that must all be present.
 * @returns {boolean} True when nothing in `expected` is missing from `actual`.
 */
function arrayContainsAll(actual = [], expected = []) {
  const available = new Set(actual ?? []);
  const wanted = expected ?? [];
  const hasMissingItem = wanted.some((item) => !available.has(item));
  return !hasMissingItem;
}
2087
+
2088
/**
 * Order-insensitive array equality: both arguments must be arrays of the same
 * length whose default-sorted copies match element by element with `===`.
 * The inputs themselves are not reordered.
 *
 * @param {*} [actual=[]] - Observed values.
 * @param {*} [expected=[]] - Expected values.
 * @returns {boolean} False when either side is not an array or the contents differ.
 */
function arrayEquals(actual = [], expected = []) {
  const comparable = Array.isArray(actual)
    && Array.isArray(expected)
    && actual.length === expected.length;
  if (!comparable) {
    return false;
  }

  const left = [...actual].sort();
  const right = [...expected].sort();
  for (let index = 0; index < left.length; index += 1) {
    if (left[index] !== right[index]) {
      return false;
    }
  }
  return true;
}
2096
+
2097
/**
 * Tests whether an actual observation check satisfies an expected one.
 *
 * Only the keys present on `expectedCheck` are compared: array values must all
 * be contained in the actual value (arrayContainsAll), and any other value must
 * be strictly equal. A null or undefined `expectedCheck` matches anything.
 *
 * @param {object|null|undefined} actualCheck - Check produced by the evidence plan.
 * @param {object|null|undefined} expectedCheck - Partial check from the fixture.
 * @returns {boolean} True when every expected key matches.
 */
function askObservationCheckMatches(actualCheck, expectedCheck) {
  for (const [field, wanted] of Object.entries(expectedCheck ?? {})) {
    const found = actualCheck?.[field];
    const fieldMatches = Array.isArray(wanted) ? arrayContainsAll(found, wanted) : found === wanted;
    if (!fieldMatches) {
      return false;
    }
  }
  return true;
}
2103
+
2104
/**
 * Compares the routed ask evidence plan (`context.routedMetadata.evidencePlan`)
 * against `testCase.expectedEvidencePlan`.
 *
 * Non-ask cases and cases without an expectation pass. Otherwise these
 * assertions are collected, in order:
 *   - route / effectiveRoute / fallbackRoute: strict equality, only for keys
 *     present on the expectation;
 *   - requiredTools / optionalTools / executedTools / evidenceGaps:
 *     order-insensitive equality via arrayEquals, only for array expectations;
 *   - excludedExecutedTools: none of these may appear in plan.executedTools;
 *   - observationChecks: each expected check must match some actual check, and
 *     the counts must be equal.
 *
 * @param {*} _output - Unused.
 * @param {object|null|undefined} context - Eval context holding `routedMetadata.evidencePlan`.
 * @param {object} testCase - Eval case; `surface` and `expectedEvidencePlan` are read.
 * @returns {{key: string, passed: boolean, reason: string}} Check result; failures are joined with spaces.
 */
function evaluateAskEvidencePlan(_output, context, testCase) {
  const key = 'ask_evidence_plan';
  if (testCase.surface !== 'ask') {
    return { key, passed: true, reason: 'Not an ask answer.' };
  }

  const expected = testCase.expectedEvidencePlan ?? null;
  if (!expected) {
    return { key, passed: true, reason: 'No evidence plan assertion configured.' };
  }

  const plan = context?.routedMetadata?.evidencePlan ?? null;
  const failures = [];

  if (plan) {
    ['route', 'effectiveRoute', 'fallbackRoute'].forEach((field) => {
      if (field in expected && plan[field] !== expected[field]) {
        failures.push(`Expected evidencePlan.${field}=${expected[field] ?? 'null'}, got ${plan[field] ?? 'null'}.`);
      }
    });

    ['requiredTools', 'optionalTools', 'executedTools', 'evidenceGaps'].forEach((field) => {
      const wanted = expected[field];
      if (Array.isArray(wanted) && !arrayEquals(plan[field], wanted)) {
        failures.push(`Expected evidencePlan.${field} to equal ${wanted.join(', ')}; got ${(plan[field] ?? []).join(', ')}.`);
      }
    });

    if (Array.isArray(expected.excludedExecutedTools)) {
      const executed = new Set(plan.executedTools ?? []);
      const leaked = expected.excludedExecutedTools.filter((toolName) => executed.has(toolName));
      if (leaked.length > 0) {
        failures.push(`Expected evidencePlan.executedTools to exclude ${leaked.join(', ')}.`);
      }
    }

    const actualChecks = plan.observationChecks ?? [];
    for (const expectedCheck of expected.observationChecks ?? []) {
      const found = actualChecks.some((actualCheck) => askObservationCheckMatches(actualCheck, expectedCheck));
      if (!found) {
        failures.push(`Expected observation check ${JSON.stringify(expectedCheck)}; got ${JSON.stringify(actualChecks)}.`);
      }
    }
    // The counts must agree as well, so unexpected extra checks are reported.
    if (Array.isArray(expected.observationChecks) && actualChecks.length !== expected.observationChecks.length) {
      failures.push(`Expected ${expected.observationChecks.length} observation check(s), got ${actualChecks.length}.`);
    }
  } else {
    failures.push('Routed Ask context did not expose metadata.evidencePlan.');
  }

  const passed = failures.length === 0;
  return {
    key,
    passed,
    reason: passed ? 'Ask evidence plan matches configured assertions.' : failures.join(' ')
  };
}
2158
+
2159
/**
 * Collects every observation id that the routed ask metadata refers to.
 *
 * Sources, in insertion order: `includedCoachObservationIds`,
 * `coachObservationIds`, the truthy `observationId` of each entry in
 * `sessionObservationComparisons`, and every `sourceIds` item of each entry in
 * `provenance`. Missing fields are treated as empty.
 *
 * @param {object|null|undefined} metadata - Routed ask metadata.
 * @returns {Set} Set of referenced ids.
 */
function askMetadataObservationReferences(metadata) {
  const directIds = [
    ...(metadata?.includedCoachObservationIds ?? []),
    ...(metadata?.coachObservationIds ?? [])
  ];
  const references = new Set(directIds);

  for (const comparison of metadata?.sessionObservationComparisons ?? []) {
    const observationId = comparison?.observationId;
    if (observationId) {
      references.add(observationId);
    }
  }

  for (const item of metadata?.provenance ?? []) {
    const sourceIds = item?.sourceIds ?? [];
    for (const sourceId of sourceIds) {
      references.add(sourceId);
    }
  }
  return references;
}
2174
+
2175
/**
 * Checks the routed ask metadata and the answer text against
 * `testCase.expectedMetadata`.
 *
 * Non-ask cases and cases without an expectation pass. Supported assertions,
 * each applied only when the expectation field is an array:
 *   - includedCoachObservationIds: each id must be in metadata.includedCoachObservationIds;
 *   - excludedCoachObservationIds: no id may be referenced anywhere in the
 *     metadata (see askMetadataObservationReferences);
 *   - forbiddenObservationPhrases: no phrase may appear in the answer (phraseIncludes).
 *
 * @param {*} output - Answer text checked for forbidden phrases.
 * @param {object|null|undefined} context - Eval context holding `routedMetadata`.
 * @param {object} testCase - Eval case; `surface` and `expectedMetadata` are read.
 * @returns {{key: string, passed: boolean, reason: string}} Check result; failures are joined with spaces.
 */
function evaluateAskMetadata(output, context, testCase) {
  const key = 'ask_metadata';
  if (testCase.surface !== 'ask') {
    return { key, passed: true, reason: 'Not an ask answer.' };
  }

  const expected = testCase.expectedMetadata ?? null;
  if (!expected) {
    return { key, passed: true, reason: 'No Ask metadata assertion configured.' };
  }

  const metadata = context?.routedMetadata ?? {};
  const failures = [];

  const mustInclude = expected.includedCoachObservationIds;
  if (Array.isArray(mustInclude)) {
    const included = new Set(metadata.includedCoachObservationIds ?? []);
    const absent = mustInclude.filter((id) => !included.has(id));
    if (absent.length > 0) {
      failures.push(`Expected included coach observation id(s): ${absent.join(', ')}.`);
    }
  }

  const mustExclude = expected.excludedCoachObservationIds;
  if (Array.isArray(mustExclude)) {
    const referenced = askMetadataObservationReferences(metadata);
    const leakedIds = mustExclude.filter((id) => referenced.has(id));
    if (leakedIds.length > 0) {
      failures.push(`Expected coach observation id(s) to be excluded from rendered metadata: ${leakedIds.join(', ')}.`);
    }
  }

  const forbiddenPhrases = expected.forbiddenObservationPhrases;
  if (Array.isArray(forbiddenPhrases)) {
    const leakedPhrases = uniqueStrings(forbiddenPhrases).filter((phrase) => phraseIncludes(output, phrase));
    if (leakedPhrases.length > 0) {
      failures.push(`Dismissed or excluded observation phrase(s) leaked into Ask answer: ${leakedPhrases.join(', ')}.`);
    }
  }

  const passed = failures.length === 0;
  return {
    key,
    passed,
    reason: passed ? 'Ask metadata matches configured assertions.' : failures.join(' ')
  };
}
2219
+
2220
/**
 * Reduces text to a loose comparison key: lower-cased, every run of characters
 * outside a-z/0-9 replaced by a space, the filler words my / the / a / an
 * removed, and whitespace collapsed and trimmed.
 *
 * @param {*} value - Text to normalize; null and undefined become ''.
 * @returns {string} Normalized comparison key.
 */
function normalizedStructuredText(value) {
  const lowered = String(value ?? '').toLowerCase();
  const alphanumericOnly = lowered.replace(/[^a-z0-9]+/g, ' ');
  const withoutFillers = alphanumericOnly.replace(/\b(my|the|a|an)\b/g, ' ');
  return withoutFillers.replace(/\s+/g, ' ').trim();
}
2228
+
2229
/**
 * Coerces a value into a clean list of strings: each item is stringified (null
 * and undefined become '') and trimmed, and empty results are dropped.
 *
 * @param {*} value - Candidate array.
 * @returns {string[]} Trimmed, non-empty strings; `[]` when `value` is not an array.
 */
function structuredStringArray(value) {
  if (!Array.isArray(value)) {
    return [];
  }
  const strings = [];
  for (const item of value) {
    const trimmed = String(item ?? '').trim();
    if (trimmed) {
      strings.push(trimmed);
    }
  }
  return strings;
}
2234
+
2235
/**
 * Extracts one property from each item of an array as a clean string list: the
 * property value is stringified (null and undefined become '') and trimmed, and
 * empty results are dropped. Null or undefined items are tolerated.
 *
 * @param {*} items - Candidate array of objects.
 * @param {string} key - Property to read from each item.
 * @returns {string[]} Trimmed, non-empty strings; `[]` when `items` is not an array.
 */
function structuredObjectStringArray(items, key) {
  if (!Array.isArray(items)) {
    return [];
  }
  const values = [];
  for (const item of items) {
    const trimmed = String(item?.[key] ?? '').trim();
    if (trimmed) {
      values.push(trimmed);
    }
  }
  return values;
}
2240
+
2241
/**
 * Appends a failure when any expected string is missing from `actual`.
 *
 * `actual` is cleaned with structuredStringArray first; comparison is exact.
 * Nothing happens when `expected` is not an array.
 *
 * @param {*} actual - Observed values.
 * @param {*} expected - Strings that must all be present.
 * @param {string} label - Noun phrase used in the failure message.
 * @param {string[]} failures - Accumulator, mutated in place.
 * @returns {void}
 */
function requireStructuredStrings(actual, expected, label, failures) {
  if (!Array.isArray(expected)) {
    return;
  }
  const present = new Set(structuredStringArray(actual));
  const missing = expected.filter((item) => !present.has(item));
  if (missing.length === 0) {
    return;
  }
  failures.push(`Expected structured ${label}: ${missing.join(', ')}.`);
}
2249
+
2250
/**
 * Appends a failure when any forbidden follow-up suggestion is present.
 *
 * Both sides are compared through normalizedStructuredText, so letter case,
 * punctuation, and the filler words it removes do not matter. Nothing happens
 * when `forbidden` is not an array.
 *
 * @param {*} actual - Follow-up suggestions that were produced.
 * @param {*} forbidden - Suggestions that must not appear.
 * @param {string[]} failures - Accumulator, mutated in place.
 * @returns {void}
 */
function forbidStructuredSuggestions(actual, forbidden, failures) {
  if (!Array.isArray(forbidden)) {
    return;
  }
  const producedKeys = new Set(
    structuredStringArray(actual)
      .map((suggestion) => normalizedStructuredText(suggestion))
      .filter(Boolean)
  );
  const hits = forbidden.filter((item) => producedKeys.has(normalizedStructuredText(item)));
  if (hits.length === 0) {
    return;
  }
  failures.push(`Forbidden follow-up suggestion(s) present: ${hits.join(', ')}.`);
}
2258
+
2259
/**
 * Checks an Ask structured response against a test case's
 * `expectedStructuredResponse` assertions.
 *
 * The check passes trivially when the case is not on the 'ask' surface or
 * configures no structured assertions. Otherwise every configured
 * assertion is evaluated and all failures are joined into one reason.
 *
 * @param {string} _output - Rendered answer text (unused by this check).
 * @param {object} context - Eval context; `context.question` is the
 *   preferred source of the current user question.
 * @param {object} testCase - Eval case definition.
 * @param {object|null} structured - Structured response to validate.
 * @returns {{key: string, passed: boolean, reason: string}} Check result.
 */
function evaluateAskStructuredResponse(_output, context, testCase, structured) {
  const key = 'ask_structured_response';

  if (testCase.surface !== 'ask') {
    return { key, passed: true, reason: 'Not an ask answer.' };
  }

  const expected = testCase.expectedStructuredResponse ?? null;
  if (!expected) {
    return { key, passed: true, reason: 'No structured response assertion configured.' };
  }

  const failures = [];
  const isUsableObject = Boolean(structured) && typeof structured === 'object' && !Array.isArray(structured);

  if (isUsableObject) {
    if (expected.confidence && structured.confidence !== expected.confidence) {
      failures.push(`Expected structured confidence ${expected.confidence}, got ${structured.confidence ?? 'null'}.`);
    }

    // Each row: [actual values, required values, label for the message].
    // Row order fixes the order in which failures are reported.
    const requiredChecks = [
      [structuredObjectStringArray(structured.evidenceUsed, 'toolName'), expected.requiredEvidenceTools, 'evidence tool(s)'],
      [structuredObjectStringArray(structured.evidenceUsed, 'label'), expected.requiredEvidenceLabels, 'evidence label(s)'],
      [structuredObjectStringArray(structured.recommendedActions, 'label'), expected.requiredRecommendedActionLabels, 'recommended action label(s)'],
      [structured.followUpSuggestions, expected.requiredFollowUpSuggestions, 'follow-up suggestion(s)'],
      [structured.limitations, expected.requiredLimitations, 'limitation(s)']
    ];
    for (const [actualValues, requiredValues, label] of requiredChecks) {
      requireStructuredStrings(actualValues, requiredValues, label, failures);
    }
    forbidStructuredSuggestions(structured.followUpSuggestions, expected.forbiddenFollowUpSuggestions, failures);

    const followUps = structuredStringArray(structured.followUpSuggestions);
    const followUpKeys = followUps.map(normalizedStructuredText).filter(Boolean);

    // A Set collapses repeated keys, so a smaller size means duplicates.
    if (new Set(followUpKeys).size !== followUpKeys.length) {
      failures.push('Structured follow-up suggestions must be unique.');
    }

    const questionKey = normalizedStructuredText(context?.question ?? testCase.context?.question ?? testCase.question ?? '');
    if (questionKey && followUpKeys.includes(questionKey)) {
      failures.push('Structured follow-up suggestions must not repeat the current user question.');
    }

    const { maxFollowUpSuggestions, minFollowUpSuggestions, programDraftPresent } = expected;
    if (Number.isFinite(maxFollowUpSuggestions) && followUps.length > maxFollowUpSuggestions) {
      failures.push(`Expected at most ${maxFollowUpSuggestions} follow-up suggestion(s), got ${followUps.length}.`);
    }
    if (Number.isFinite(minFollowUpSuggestions) && followUps.length < minFollowUpSuggestions) {
      failures.push(`Expected at least ${minFollowUpSuggestions} follow-up suggestion(s), got ${followUps.length}.`);
    }

    if (typeof programDraftPresent === 'boolean') {
      const hasProgramDraft = structured.programDraft != null;
      if (hasProgramDraft !== programDraftPresent) {
        failures.push(`Expected programDraft present=${programDraftPresent}, got ${hasProgramDraft}.`);
      }
    }
  } else {
    failures.push('Ask structured response was not generated.');
  }

  const passed = failures.length === 0;
  return {
    key,
    passed,
    reason: passed ? 'Ask structured response matches configured assertions.' : failures.join(' ')
  };
}
2344
+
2345
/**
 * Picks the program draft to attach to the structured Ask response.
 *
 * The draft is withheld when routing metadata reports a 'successor_plan'
 * coach-observation intent whose follow-up is flagged as missing.
 *
 * @param {object|null|undefined} parsedAsk - Parsed Ask output that may
 *   carry a `programDraft`.
 * @param {object|null|undefined} routingMetadata - Routing metadata for
 *   the question.
 * @returns {*} `parsedAsk.programDraft`, or undefined when suppressed or
 *   when `parsedAsk` is nullish.
 */
function askStructuredProgramDraft(parsedAsk, routingMetadata) {
  if (routingMetadata?.requestedCoachObservationIntent === 'successor_plan') {
    if (routingMetadata?.coachObservationFollowUpMissing === true) {
      return undefined;
    }
  }
  return parsedAsk?.programDraft;
}
2350
+
1773
2351
  function firstAction(payload) {
1774
2352
  const actions = Array.isArray(payload?.recommendedNextActions) ? payload.recommendedNextActions : [];
1775
2353
  return actions.find((action) => typeof action?.action === 'string' && action.action.trim());
@@ -1998,17 +2576,20 @@ function evaluatePersonaMotivation(output, context, testCase) {
1998
2576
  failures.push('Feedback acknowledges a PR/positive result but frames later-set dropoff in a demotivating way.');
1999
2577
  }
2000
2578
 
2579
+ const dataLimitationCaveat = /\bnot enough\s+(?:\w+\s+){0,5}?(?:data|details?|context|evidence|information|info|signals?|metrics?)\b/i.test(normalized)
2580
+ || /\bnot enough\s+(?:\w+\s+){0,8}?to\s+(?:separate|infer|tie|connect|attribute|blame|claim|say|show|prove|know|call)\b/i.test(normalized);
2581
+
2001
2582
  const discouragingPatterns = [
2002
2583
  /\bdisappointing\b/i,
2003
2584
  /\bunderwhelming\b/i,
2004
2585
  /\bunderperformed\b/i,
2005
2586
  /\bpoor\b/i,
2006
- /\bnot enough\b/i,
2587
+ /\bnot enough\s+(?:effort|work|volume|intensity|reps?|sets?|weight|load|progress|consistency)\b/i,
2007
2588
  /\bfailed to\b/i,
2008
2589
  /\bstruggled\b/i
2009
2590
  ];
2010
2591
 
2011
- if (discouragingPatterns.some((pattern) => pattern.test(normalized))) {
2592
+ if (!dataLimitationCaveat && discouragingPatterns.some((pattern) => pattern.test(normalized))) {
2012
2593
  failures.push('Feedback uses discouraging language that is likely to reduce motivation.');
2013
2594
  }
2014
2595
 
@@ -2026,6 +2607,33 @@ export async function runSummaryEvalCase(testCase) {
2026
2607
  return runSummaryEvalCaseFromSnapshot(testCase, snapshot);
2027
2608
  }
2028
2609
 
2610
/**
 * Checks the <program_draft> block of an Ask answer, when one is present.
 *
 * A block that is emitted must be valid JSON in the exact Program shape
 * (enums, limits, no forbidden keys), validated by the same normalizer the
 * runtime uses to accept or drop drafts. This surfaces malformed drafts in
 * CI instead of letting them be silently dropped in prod. Non-ask cases
 * and answers without a block pass with nothing to check.
 *
 * @param {string} output - Raw Ask answer text, including any draft block.
 * @param {object} testCase - Eval case; only `surface` is read here.
 * @param {object|null} [parsedAsk=null] - Already-parsed Ask result; when
 *   omitted the output is parsed here in strict mode.
 * @returns {{key: string, passed: boolean, reason: string}} Check result.
 */
function evaluateProgramDraft(output, testCase, parsedAsk = null) {
  const key = 'program_draft';

  if (testCase.surface !== 'ask') {
    return { key, passed: true, reason: 'Not an ask answer.' };
  }
  if (!hasProgramDraftBlock(output)) {
    return { key, passed: true, reason: 'No program draft block.' };
  }

  // Validate against the exact runtime rules: the runtime passes
  // canonicalExerciseName, which strips non-alphanumerics. Without it the
  // eval would green-light drafts (e.g. punctuation-only names) that prod
  // silently drops.
  const parsed = parsedAsk ?? extractAskProgramDraft(output, {
    canonicalizeExerciseName: canonicalExerciseName,
    strict: true
  });
  const { programDraft } = parsed;
  const isValid = programDraft != null;

  return {
    key,
    passed: isValid,
    reason: isValid
      ? 'Program draft is valid JSON matching the required shape.'
      : 'Program draft block is malformed (invalid JSON, or fails shape/enum/limit validation).'
  };
}
2636
+
2029
2637
  export function evaluateSummaryOutputFromSnapshot(testCase, snapshot, output) {
2030
2638
  const context = buildSummaryEvalContext(snapshot, testCase);
2031
2639
  if (context == null) {
@@ -2036,26 +2644,53 @@ export function evaluateSummaryOutputFromSnapshot(testCase, snapshot, output) {
2036
2644
  throw new Error(`Eval case ${testCase.id} produced an empty output`);
2037
2645
  }
2038
2646
 
2647
+ // strict: eval rejects a draft with any malformed nested item (the runtime
2648
+ // salvages it, but partial malformation is a regression signal). The parsed
2649
+ // result also feeds <program_draft> stripping for the other checks.
2650
+ const parsedAsk = testCase.surface === 'ask'
2651
+ ? extractAskProgramDraft(output, { canonicalizeExerciseName: canonicalExerciseName, strict: true })
2652
+ : null;
2653
+ const structuredParsedAsk = testCase.surface === 'ask'
2654
+ ? extractAskProgramDraft(output, { canonicalizeExerciseName: canonicalExerciseName })
2655
+ : null;
2656
+ const visibleOutput = parsedAsk
2657
+ ? stripXMLTagBlocks(parsedAsk.answerText)
2658
+ : output;
2659
+ const structuredAsk = testCase.surface === 'ask'
2660
+ ? buildAskStructuredResponse(visibleOutput, context.routedMetadata ?? {}, {
2661
+ programDraft: askStructuredProgramDraft(structuredParsedAsk, context.routedMetadata),
2662
+ question: context.question ?? testCase.context?.question ?? testCase.question ?? ''
2663
+ })
2664
+ : null;
2665
+
2039
2666
  const checks = [
2040
- evaluateNoInsight(output, testCase),
2041
- evaluateShape(output, testCase),
2042
- evaluateRequiredMentions(output, testCase),
2043
- evaluateAnyOfMentions(output, testCase),
2044
- evaluateForbiddenPhrases(output, testCase),
2045
- evaluateForbiddenMentions(output, testCase),
2046
- evaluateExerciseMentions(output, snapshot, context, testCase.surface, testCase),
2047
- evaluateWorkoutClaims(output, context, testCase),
2048
- evaluateAskClaims(output, snapshot, testCase),
2049
- evaluateAskDirectionalConsistency(output, snapshot, testCase),
2050
- evaluateAskStaleness(output, snapshot, testCase),
2051
- evaluateAskToolProvenance(output, context, testCase, snapshot),
2052
- evaluateScoreCommentaryAction(output, context, testCase),
2053
- evaluateScoreCommentarySynthesis(output, context, testCase),
2054
- evaluateScoreCommentaryExerciseInvention(output, snapshot, context, testCase),
2055
- evaluateScoreCommentaryBand(output, context, testCase),
2056
- evaluateScoreCommentaryTone(output, testCase),
2057
- evaluateScoreCommentaryLength(output, testCase),
2058
- evaluatePersonaMotivation(output, context, testCase)
2667
+ evaluateNoInsight(visibleOutput, testCase),
2668
+ evaluateShape(visibleOutput, testCase),
2669
+ evaluateRequiredMentions(visibleOutput, testCase),
2670
+ evaluateAnyOfMentions(visibleOutput, testCase),
2671
+ evaluateForbiddenPhrases(visibleOutput, testCase),
2672
+ evaluateForbiddenMentions(visibleOutput, testCase),
2673
+ evaluateExerciseMentions(visibleOutput, snapshot, context, testCase.surface, testCase),
2674
+ evaluateWorkoutClaims(visibleOutput, context, testCase),
2675
+ evaluateAskClaims(visibleOutput, snapshot, testCase),
2676
+ evaluateAskDirectionalConsistency(visibleOutput, snapshot, testCase),
2677
+ evaluateAskScoreVoice(visibleOutput, testCase),
2678
+ evaluateAskSelfReference(visibleOutput, testCase),
2679
+ evaluateAskVolunteeredScore(visibleOutput, testCase),
2680
+ evaluateAskStaleness(visibleOutput, snapshot, testCase),
2681
+ evaluateAskToolProvenance(visibleOutput, context, testCase, snapshot),
2682
+ evaluateFormulaVersion(visibleOutput, snapshot, testCase),
2683
+ evaluateAskEvidencePlan(visibleOutput, context, testCase),
2684
+ evaluateAskMetadata(visibleOutput, context, testCase),
2685
+ evaluateAskStructuredResponse(visibleOutput, context, testCase, structuredAsk),
2686
+ evaluateScoreCommentaryAction(visibleOutput, context, testCase),
2687
+ evaluateScoreCommentarySynthesis(visibleOutput, context, testCase),
2688
+ evaluateScoreCommentaryExerciseInvention(visibleOutput, snapshot, context, testCase),
2689
+ evaluateScoreCommentaryBand(visibleOutput, context, testCase),
2690
+ evaluateScoreCommentaryTone(visibleOutput, testCase),
2691
+ evaluateScoreCommentaryLength(visibleOutput, testCase),
2692
+ evaluatePersonaMotivation(visibleOutput, context, testCase),
2693
+ evaluateProgramDraft(output, testCase, parsedAsk)
2059
2694
  ];
2060
2695
 
2061
2696
  return {