incremnt 0.8.0 → 0.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,6 +6,7 @@ import {
6
6
  canonicalExerciseName,
7
7
  checkpointContext,
8
8
  cycleSummaryContext,
9
+ dateOnlyString,
9
10
  executeCoachReadTool,
10
11
  normalizeExerciseName,
11
12
  workoutSummaryContext,
@@ -24,12 +25,30 @@ import {
24
25
  import { computeScoreBand } from './score-context.js';
25
26
  import { stripXMLTagBlocks } from './prompt-security.js';
26
27
  import { extractAskProgramDraft, hasProgramDraftBlock } from './program-draft.js';
28
+ import { findAskAnswerExerciseMentions, verifyAskAnswer } from './ask-answer-verifier.js';
27
29
 
28
30
  const __filename = fileURLToPath(import.meta.url);
29
31
  const __dirname = path.dirname(__filename);
30
32
 
31
33
  export const summaryEvalFixturesRoot = path.resolve(__dirname, '../test/fixtures/summary-evals');
32
34
 
35
// Failure keys that getAskVerifierResult collects into `provenanceFailures`:
// a blocking failure from verifyAskAnswer lands there when its `key` is in
// this set.
// NOTE(review): the meaning of each key is defined by ask-answer-verifier.js,
// which is not visible here — confirm the list matches the keys it emits.
+ const ASK_PROVENANCE_VERIFIER_KEYS = new Set([
36
+ 'unsupported_weight_claim',
37
+ 'unrouted_weight_claim',
38
+ 'unsupported_weighted_set_claim',
39
+ 'unrouted_weighted_set_claim',
40
+ 'e1rm_without_records',
41
+ 'direction_inversion',
42
+ 'tool_replay_failed'
43
+ ]);
44
+
45
// Failure keys that getAskVerifierResult collects into `claimFailures`; the
// reasons of those failures are what evaluateAskClaims appends to its own
// failure list.
// NOTE(review): a blocking failure whose key is in neither this set nor
// ASK_PROVENANCE_VERIFIER_KEYS appears only on the raw `verification` object,
// not in either filtered list — confirm that is intended.
+ const ASK_CLAIM_VERIFIER_KEYS = new Set([
46
+ 'target_hit_contradiction',
47
+ 'target_hit_without_session_evidence',
48
+ 'clean_consistency_contradiction',
49
+ 'unsupported_fatigue_recovery'
50
+ ]);
51
+
33
52
  export function defaultCaseSetName() {
34
53
  return process.env.SUMMARY_EVAL_CASE_SET || 'synthetic';
35
54
  }
@@ -108,7 +127,8 @@ export function buildSummaryEvalContext(snapshot, testCase) {
108
127
  // prelude to the routed context. Including it here means a live eval feeds
109
128
  // the model the same dump-prone material, so evaluateAskScoreVoice actually
110
129
  // guards the prompt, not just the checker.
111
- const prelude = formatIncrementScorePrelude(scoreHistoryFromSnapshot(snapshot), { question });
130
+ const responseProfile = routed?.metadata?.responseProfile ?? routed?.metadata?.intent?.responseProfile ?? testCase.context?.responseProfile;
131
+ const prelude = formatIncrementScorePrelude(scoreHistoryFromSnapshot(snapshot), { question, responseProfile });
112
132
  const routedContext = routed?.context ?? null;
113
133
  const trainingData = testCase.context?.trainingData
114
134
  ?? (prelude && routedContext ? `${prelude}\n\n${routedContext}` : (routedContext ?? prelude));
@@ -222,6 +242,49 @@ function normalizeText(value) {
222
242
  return String(value ?? '').trim();
223
243
  }
224
244
 
245
// Module-private Symbol used as the property key under which
// getAskVerifierResult attaches its Map cache to a context object.
+ const askVerifierCacheKey = Symbol('summaryEvalAskVerifierCache');
246
+
247
// Resolves the "today" value handed to the Ask verifier: testCase.context.today
// if set, otherwise testCase.today, otherwise the current time; the chosen
// value is passed through dateOnlyString (imported helper, output format not
// visible here).
// `testCase` must be non-nullish — `testCase.context` is read without optional
// chaining and would throw on undefined/null.
// NOTE(review): when a fixture defines no `today`, the result depends on the
// wall clock, so verifier outcomes may differ between runs — confirm fixtures
// that care about recency always pin `today`.
+ export function summaryEvalAskVerifierToday(testCase) {
248
+ return dateOnlyString(testCase.context?.today ?? testCase.today ?? new Date());
249
+ }
250
+
251
// Runs verifyAskAnswer for an Ask answer and splits its blocking failures into
// a provenance list and a claim list, memoizing the result on `context`.
//
// - output: raw answer; normalized with normalizeText before verification.
// - context: optional. When it is an object, results are cached on it under
//   the askVerifierCacheKey symbol. Its `routedMetadata` takes precedence over
//   testCase.context.routedMetadata; an empty object is used if neither exists.
// - snapshot: passed to verifyAskAnswer unchanged.
// - testCase: must be non-nullish; supplies `exclude` (default []) and the
//   `today` value via summaryEvalAskVerifierToday.
//
// Returns { verification, provenanceFailures, claimFailures }.
//
// NOTE(review): the cache key is built from the answer, routing metadata,
// today and exclude only — `snapshot` is not part of it. A context reused with
// a different snapshot would get the earlier result back; confirm a context is
// never shared across snapshots.
// NOTE(review): Object.defineProperty throws a TypeError on a frozen or
// non-extensible context — confirm callers never pass one.
+ function getAskVerifierResult(output, context, snapshot, testCase) {
252
+ const answer = normalizeText(output);
253
+ const routingMetadata = context?.routedMetadata ?? testCase.context?.routedMetadata ?? {};
254
+ const today = summaryEvalAskVerifierToday(testCase);
255
+ const exclude = testCase.exclude ?? [];
256
+ const cacheKey = `${answer}\n${JSON.stringify(routingMetadata)}\n${today}\n${JSON.stringify(exclude)}`;
257
// Caching is skipped entirely when no context object is supplied.
+ if (context && typeof context === 'object') {
258
+ if (!context[askVerifierCacheKey]) {
259
// Attach the cache lazily. It is non-enumerable, so object spread and
// Object.assign copies of the context do not carry the cache along.
+ Object.defineProperty(context, askVerifierCacheKey, {
260
+ value: new Map(),
261
+ enumerable: false
262
+ });
263
+ }
264
+ const cached = context[askVerifierCacheKey].get(cacheKey);
265
+ if (cached) return cached;
266
+ }
267
+ const verification = verifyAskAnswer({
268
+ answer,
269
+ snapshot,
270
+ routingMetadata,
271
+ today,
272
+ exclude,
273
// Semantics of this option are defined by verifyAskAnswer (not visible here).
+ strictMentionProvenance: false
274
+ });
275
+ const result = {
276
+ verification,
277
// A missing blockingFailures array is treated as "no failures".
+ provenanceFailures: (verification.blockingFailures ?? [])
278
+ .filter((failure) => ASK_PROVENANCE_VERIFIER_KEYS.has(failure.key)),
279
+ claimFailures: (verification.blockingFailures ?? [])
280
+ .filter((failure) => ASK_CLAIM_VERIFIER_KEYS.has(failure.key))
281
+ };
282
+ if (context && typeof context === 'object') {
283
+ context[askVerifierCacheKey].set(cacheKey, result);
284
+ }
285
+ return result;
286
+ }
287
+
225
288
  function parseJsonOutput(output) {
226
289
  const normalized = normalizeText(output);
227
290
  if (!normalized) return null;
@@ -697,7 +760,7 @@ function evaluateNoInsight(output, testCase) {
697
760
  };
698
761
  }
699
762
 
700
- function evaluateShape(output, testCase) {
763
+ function evaluateShape(output, testCase, context = null) {
701
764
  const normalized = testCase.surface === 'scoreCommentary'
702
765
  ? scoreCommentaryText(output)
703
766
  : normalizeText(output);
@@ -768,12 +831,20 @@ function evaluateShape(output, testCase) {
768
831
  reasons.push(`Checkpoint summaries must be 2-3 paragraphs, got ${paragraphs}.`);
769
832
  }
770
833
  break;
771
- case 'ask':
772
- if (sentences < 1 || sentences > 12) {
834
+ case 'ask': {
835
+ // Expansive answers are intentionally richer; the old 12-sentence cap was
836
+ // the pre-expansive policy. Allow more for expansive (still bounded so a
837
+ // genuine wall of text is flagged), keep the tight cap for defensive.
838
+ const profile = context?.routedMetadata?.responseProfile
839
+ ?? context?.routedMetadata?.intent?.responseProfile
840
+ ?? askResponseProfileFromTestCase(testCase);
841
+ const maxAskSentences = profile === 'expansive' ? 20 : 12;
842
+ if (sentences < 1 || sentences > maxAskSentences) {
773
843
  passed = false;
774
- reasons.push(`Ask-coach answers must be 1-12 sentences, got ${sentences}.`);
844
+ reasons.push(`Ask-coach answers must be 1-${maxAskSentences} sentences, got ${sentences}.`);
775
845
  }
776
846
  break;
847
+ }
777
848
  case 'scoreCommentary':
778
849
  if (sentences < 1 || sentences > 8) {
779
850
  passed = false;
@@ -888,45 +959,6 @@ function hasFatigueLanguage(output) {
888
959
  return /\b(fatigue|fatigued|underrecovered|recovery debt|fatigue ceiling|limited by recovery|limited by fatigue|accumulated fatigue)\b/i.test(output);
889
960
  }
890
961
 
891
- function hasAskFatigueRecoveryLanguage(output) {
892
- return hasFatigueLanguage(output)
893
- || /\b(?:poor|low|bad|incomplete)\s+recovery\b/i.test(output)
894
- || /\bunder[-\s]?recovery\b/i.test(output)
895
- || /\brecovery\s+(?:limited|held back|caused|explains|drove|deficit|issue|problem)\b/i.test(output);
896
- }
897
-
898
- function hasAskFatigueRecoveryUncertaintyLanguage(output) {
899
- const missingRecoveryData = /\b(?:no|not enough|without|missing|lack(?:ing)?|insufficient)\s+(?:\w+\s+){0,4}?(?:recovery|readiness|vitals?|sleep|hrv|heart rate|data|info|signals?|metrics?)\b/i.test(output);
900
- const refusesInference = /\b(?:cannot|can't|do not|don't|does not|doesn't|would not|wouldn't|not enough|isn't enough|is not enough|no basis to|hard to)\s+(?:\w+\s+){0,12}?(?:infer|tie|connect|attribute|blame|claim|say|show|prove|know|call)\s+(?:\w+\s+){0,12}?(?:fatigue|recovery|readiness|why)\b/i.test(output);
901
- const recoveryDoesNotExplain = /\b(?:fatigue|recovery|readiness)\b\s+(?:\w+\s+){0,10}?(?:cannot|can't|does not|doesn't|would not|wouldn't|isn't|is not)\s+(?:\w+\s+){0,10}?(?:explain|prove|show|tell|account for)\b/i.test(output);
902
- return missingRecoveryData || refusesInference || recoveryDoesNotExplain;
903
- }
904
-
905
- function hasAskPositiveFatigueRecoveryAttribution(output) {
906
- const concept = String.raw`(?:fatigue|fatigued|under[-\s]?recovered|under[-\s]?recovery|poor recovery|low recovery|incomplete recovery|recovery debt|fatigue ceiling|accumulated fatigue)`;
907
- const causeVerb = String.raw`(?:because|due to|caused by|from|reflects?|suggests?|indicates?|points? to|explains?|limited|held back|drove|contributed to|tied to|tie\s+\w+\s+to)`;
908
- const patterns = [
909
- new RegExp(String.raw`\b${causeVerb}\b.{0,80}\b${concept}\b`, 'gi'),
910
- new RegExp(String.raw`\b${concept}\b.{0,80}\b(?:caused|limited|held back|explains?|drove|led to|contributed to|accounts? for)\b`, 'gi')
911
- ];
912
- for (const pattern of patterns) {
913
- for (const match of output.matchAll(pattern)) {
914
- const start = Math.max(0, (match.index ?? 0) - 40);
915
- const window = output.slice(start, (match.index ?? 0) + match[0].length);
916
- if (!/\b(?:not|no|cannot|can't|doesn't|does not|would not|wouldn't|isn't|is not)\b/i.test(window)) {
917
- return true;
918
- }
919
- }
920
- }
921
- return false;
922
- }
923
-
924
- function hasUnsupportedAskFatigueRecoveryClaim(output) {
925
- if (!hasAskFatigueRecoveryLanguage(output)) return false;
926
- if (hasAskPositiveFatigueRecoveryAttribution(output)) return true;
927
- return !hasAskFatigueRecoveryUncertaintyLanguage(output);
928
- }
929
-
930
962
  function matchesHistoricalFamilyName(claimName, actualName) {
931
963
  const claimVariants = new Set(historicalExerciseVariants(claimName));
932
964
  const actualVariants = new Set(historicalExerciseVariants(actualName));
@@ -1086,37 +1118,6 @@ function evaluateWorkoutClaims(output, context, testCase) {
1086
1118
  };
1087
1119
  }
1088
1120
 
1089
- function extractAskTargetHitClaims(text) {
1090
- const claims = [];
1091
- const patterns = [
1092
- /\b(?:you\s+)?hit(?:ting)?\s+all\s+(?:your\s+)?target(?:ed)?\s+reps?\b/gi,
1093
- /\b(?:you\s+)?hit\s+all\s+(?:the\s+)?targets?\b/gi,
1094
- /\b(?:you\s+)?hit\s+(?:the|your)\s+target\b(?!\s+(?:of|for|on))/gi
1095
- ];
1096
- for (const pattern of patterns) {
1097
- for (const match of text.matchAll(pattern)) {
1098
- claims.push({ text: match[0] });
1099
- }
1100
- }
1101
- return claims;
1102
- }
1103
-
1104
- function extractAskCleanConsistencyClaims(text) {
1105
- const claims = [];
1106
- const patterns = [
1107
- /\bclean,\s+consistent\b/gi,
1108
- /\bclean\s+and\s+consistent\b/gi,
1109
- /\bconsistent\s+set\s+of\s+work\b/gi,
1110
- /\bacross\s+the\s+board\b/gi
1111
- ];
1112
- for (const pattern of patterns) {
1113
- for (const match of text.matchAll(pattern)) {
1114
- claims.push({ text: match[0] });
1115
- }
1116
- }
1117
- return claims;
1118
- }
1119
-
1120
1121
  function extractAskPlannedListClaims(text) {
1121
1122
  const claims = [];
1122
1123
  const pattern = /\((\s*\d+(?:\s*,\s*\d+){2,})\s+planned\s*\)/gi;
@@ -1137,77 +1138,6 @@ function sessionPlannedReps(session) {
1137
1138
  return values;
1138
1139
  }
1139
1140
 
1140
- function findMentionedExercises(text, snapshot) {
1141
- const exercisesByName = new Map();
1142
- for (const session of snapshot?.sessions ?? []) {
1143
- for (const exercise of session.exercises ?? []) {
1144
- if (!exercise?.name) continue;
1145
- const normalizedName = normalizeExerciseName(exercise.name);
1146
- if (!normalizedName || exercisesByName.has(normalizedName)) continue;
1147
- exercisesByName.set(normalizedName, exercise.name);
1148
- }
1149
- for (const exercise of session.prescriptionSnapshot?.exercises ?? []) {
1150
- if (!exercise?.exerciseName) continue;
1151
- const normalizedName = normalizeExerciseName(exercise.exerciseName);
1152
- if (!normalizedName || exercisesByName.has(normalizedName)) continue;
1153
- exercisesByName.set(normalizedName, exercise.exerciseName);
1154
- }
1155
- }
1156
-
1157
- const mentions = [];
1158
- for (const [normalizedName, displayName] of exercisesByName) {
1159
- const pattern = new RegExp(`\\b${escapeRegExp(displayName)}\\b`, 'gi');
1160
- for (const match of text.matchAll(pattern)) {
1161
- mentions.push({
1162
- index: match.index ?? -1,
1163
- end: (match.index ?? -1) + match[0].length,
1164
- name: displayName,
1165
- normalizedName
1166
- });
1167
- }
1168
- }
1169
- return mentions
1170
- .filter((mention, index, allMentions) => !allMentions.some((candidate, candidateIndex) =>
1171
- candidateIndex !== index &&
1172
- candidate.index <= mention.index &&
1173
- candidate.end >= mention.end &&
1174
- candidate.normalizedName.length > mention.normalizedName.length
1175
- ))
1176
- .sort((lhs, rhs) => lhs.index - rhs.index);
1177
- }
1178
-
1179
- function findRecentSessionMisses(snapshot, { lookbackDays = 7, exerciseNames = null } = {}) {
1180
- const sessions = snapshot?.sessions ?? [];
1181
- const cutoff = Date.now() - lookbackDays * 24 * 60 * 60 * 1000;
1182
- const scopedExerciseNames = exerciseNames && exerciseNames.length > 0 ? new Set(exerciseNames) : null;
1183
- const misses = [];
1184
- for (const session of sessions) {
1185
- const completedAt = session.completedAt || session.date;
1186
- const completedTime = Date.parse(completedAt);
1187
- if (!Number.isFinite(completedTime) || completedTime < cutoff) continue;
1188
- const targetByExercise = new Map();
1189
- for (const planned of session.prescriptionSnapshot?.exercises ?? []) {
1190
- const target = Number(planned.targetReps);
1191
- if (Number.isFinite(target) && target > 0) {
1192
- targetByExercise.set(normalizeExerciseName(planned.exerciseName), target);
1193
- }
1194
- }
1195
- for (const exercise of session.exercises ?? []) {
1196
- const normalizedExerciseName = normalizeExerciseName(exercise.name);
1197
- if (scopedExerciseNames && !scopedExerciseNames.has(normalizedExerciseName)) continue;
1198
- const target = targetByExercise.get(normalizedExerciseName);
1199
- if (!Number.isFinite(target)) continue;
1200
- for (const set of exercise.sets ?? []) {
1201
- const reps = Number(set.reps);
1202
- if (set.isComplete && Number.isFinite(reps) && reps < target) {
1203
- misses.push({ sessionId: session.id, exerciseName: exercise.name, reps, target });
1204
- }
1205
- }
1206
- }
1207
- }
1208
- return misses;
1209
- }
1210
-
1211
1141
  function findNearestMentionedExercise(mentions, index) {
1212
1142
  let candidate = null;
1213
1143
  for (const mention of mentions) {
@@ -1217,170 +1147,6 @@ function findNearestMentionedExercise(mentions, index) {
1217
1147
  return candidate;
1218
1148
  }
1219
1149
 
1220
- function hasAskFatigueSupport(snapshot, lookbackDays = 7) {
1221
- const cutoff = Date.now() - lookbackDays * 24 * 60 * 60 * 1000;
1222
- const withinCutoff = (dateValue) => {
1223
- const ms = Date.parse(dateValue);
1224
- return Number.isFinite(ms) && ms >= cutoff;
1225
- };
1226
-
1227
- const vitalsSummaries = snapshot?.vitalsSummaries ?? [];
1228
- if (vitalsSummaries.some((entry) => withinCutoff(entry.date))) return true;
1229
-
1230
- const metrics = snapshot?.healthMetrics ?? {};
1231
- for (const key of ['restingHR', 'hrv', 'sleep']) {
1232
- const readings = Array.isArray(metrics[key]) ? metrics[key] : [];
1233
- if (readings.some((reading) => withinCutoff(reading.date))) return true;
1234
- }
1235
-
1236
- for (const session of snapshot?.sessions ?? []) {
1237
- const completedAt = session.completedAt || session.date;
1238
- if (!withinCutoff(completedAt)) continue;
1239
- for (const exercise of session.exercises ?? []) {
1240
- const reps = (exercise.sets ?? [])
1241
- .map((set) => Number(set.reps))
1242
- .filter((value) => Number.isFinite(value) && value > 0);
1243
- if (reps.length < 2) continue;
1244
- const first = reps[0];
1245
- const last = reps[reps.length - 1];
1246
- if (first > 0 && (first - last) / first >= 0.3) return true;
1247
- }
1248
- }
1249
-
1250
- return false;
1251
- }
1252
-
1253
- function parseWeightNumber(raw) {
1254
- return Number(String(raw).replace(/,/g, ''));
1255
- }
1256
-
1257
- function extractAskWeightClaims(text) {
1258
- const claims = [];
1259
- // Accept comma-grouped thousands ("40,500 kg") as a single number so volume
1260
- // figures are not shredded into bogus "500 kg" / "000 kg" claims. Volume/total
1261
- // figures are excluded by isVolumeWeightClaim at the call sites, not by a
1262
- // magnitude cap — heavy machine work (leg press, sled) legitimately exceeds
1263
- // 1000 kg, and a fabricated heavy load must still be graded.
1264
- const pattern = /\b(\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d+(?:\.\d+)?)\s*(?:kg|kilograms?)\b/gi;
1265
- for (const match of text.matchAll(pattern)) {
1266
- claims.push({
1267
- text: match[0],
1268
- value: parseWeightNumber(match[1]),
1269
- index: match.index ?? -1
1270
- });
1271
- }
1272
- return claims;
1273
- }
1274
-
1275
- function extractAskWeightedSetClaims(text) {
1276
- const claims = [];
1277
- // A weight×reps pair is only unambiguous with "x"/"×" (e.g. "70 kg x 5"), or
1278
- // an explicit "for N rep(s)". Bare "X kg for N" is NOT a rep claim — N is
1279
- // almost always a SET count ("70 kg for 4 working sets") or a duration, and
1280
- // treating it as reps flags real data as a fabricated pair. So match only the
1281
- // unambiguous forms; the plain-weight loop still grounds the weight itself.
1282
- const pattern = /\b(\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d+(?:\.\d+)?)\s*(?:kg|kilograms?)\s*(?:(?:x|×)\s*(\d+)|for\s+(\d+)\s*reps?)\b/gi;
1283
- for (const match of text.matchAll(pattern)) {
1284
- claims.push({
1285
- text: match[0],
1286
- weight: parseWeightNumber(match[1]),
1287
- reps: Number(match[2] ?? match[3]),
1288
- index: match.index ?? -1,
1289
- end: (match.index ?? -1) + match[0].length
1290
- });
1291
- }
1292
- return claims;
1293
- }
1294
-
1295
- function allowedWeightsForExercise(snapshot, normalizedExerciseName) {
1296
- const weights = [];
1297
- for (const session of snapshot?.sessions ?? []) {
1298
- for (const exercise of session.exercises ?? []) {
1299
- if (normalizeExerciseName(exercise.name) !== normalizedExerciseName) continue;
1300
- for (const set of exercise.sets ?? []) {
1301
- const weight = Number(set.weight);
1302
- if (Number.isFinite(weight)) weights.push(weight);
1303
- }
1304
- }
1305
- for (const exercise of session.prescriptionSnapshot?.exercises ?? []) {
1306
- if (normalizeExerciseName(exercise.exerciseName) !== normalizedExerciseName) continue;
1307
- const targetWeight = Number(exercise.targetWeight);
1308
- if (Number.isFinite(targetWeight)) weights.push(targetWeight);
1309
- for (const targetSet of exercise.targetSets ?? []) {
1310
- const weight = Number(targetSet.weight ?? targetSet.targetWeight);
1311
- if (Number.isFinite(weight)) weights.push(weight);
1312
- }
1313
- }
1314
- }
1315
- return weights;
1316
- }
1317
-
1318
- function weightClaimSupported(claim, allowedWeights) {
1319
- return allowedWeights.some((weight) => Math.abs(weight - claim.value) < 0.01);
1320
- }
1321
-
1322
- function isEstimatedOneRepMaxWeightClaim(text, claim) {
1323
- const start = Math.max(0, claim.index - 40);
1324
- const end = Math.min(text.length, claim.index + claim.text.length + 40);
1325
- const window = text.slice(start, end);
1326
- return /\b(?:estimated\s+)?(?:1rm|one[-\s]?rep\s+max)\b/i.test(window);
1327
- }
1328
-
1329
- function isVolumeWeightClaim(text, claim) {
1330
- // A kg figure in a clause about volume/tonnage/total load is a workload total
1331
- // (e.g. "weekly strength volume fell from 44,000 kg to 40,500 kg"), not an
1332
- // exercise load. Scope to the claim's clause so a fabricated exercise load
1333
- // earlier in the same sentence is still graded.
1334
- return /\b(?:volume|tonnage|total\s+(?:load|work|volume|tonnage))\b/i.test(claimClause(text, claim));
1335
- }
1336
-
1337
- function claimClause(text, claim) {
1338
- const boundaries = [
1339
- '\n',
1340
- '. ',
1341
- ';',
1342
- ', while',
1343
- ', whereas',
1344
- ', but',
1345
- ' while ',
1346
- ' whereas ',
1347
- ' but '
1348
- ];
1349
- let start = 0;
1350
- for (const boundary of boundaries) {
1351
- const index = text.lastIndexOf(boundary, claim.index);
1352
- if (index >= 0) start = Math.max(start, index + boundary.length);
1353
- }
1354
-
1355
- let end = text.length;
1356
- for (const boundary of boundaries) {
1357
- const index = text.indexOf(boundary, claim.index + claim.text.length);
1358
- if (index >= 0) end = Math.min(end, index);
1359
- }
1360
- return text.slice(start, end);
1361
- }
1362
-
1363
- // Returns the sentence containing the claim, so context guards can look at the
1364
- // whole clause rather than a fixed-width window (body-weight phrasing can put
1365
- // the "body weight" anchor well before the kg figure).
1366
- function claimSentence(text, claim) {
1367
- const before = text.slice(0, claim.index);
1368
- const startBreak = Math.max(before.lastIndexOf('. '), before.lastIndexOf('\n'));
1369
- const start = startBreak >= 0 ? startBreak + 1 : 0;
1370
- const after = text.slice(claim.index);
1371
- const breaks = [after.indexOf('. '), after.indexOf('\n')].filter((i) => i >= 0);
1372
- const end = breaks.length ? claim.index + Math.min(...breaks) : text.length;
1373
- return text.slice(start, end);
1374
- }
1375
-
1376
- // Body-weight figures ("body weight is up 0.6 kg", "80.0 kg latest") are not
1377
- // exercise-load claims. findNearestMentionedExercise would otherwise attribute
1378
- // them to the previously named lift and flag a correct answer as a
1379
- // hallucination, so skip any kg figure stated in a body-weight clause.
1380
- function isBodyWeightClaim(text, claim) {
1381
- return /\bbody\s*weight\b|\bbodyweight\b|\bweight\s+trend\b/i.test(claimSentence(text, claim));
1382
- }
1383
-
1384
1150
  function askWorkingTopSetRows(snapshot) {
1385
1151
  const rows = [];
1386
1152
  for (const session of stableSortByDateDesc(snapshot?.sessions ?? [], (session) => session.completedAt ?? session.date)) {
@@ -1529,7 +1295,7 @@ const SCORE_NUMBER = '\\d{1,3}(?:\\.\\d+)?';
1529
1295
  // number directly followed by one of these is left alone.
1530
1296
  const NON_SCORE_UNIT =
1531
1297
  '(?:kg|kilo|lbs?|pounds?|reps?|sets?|%|percent|pct|x\\b|for\\s+\\d|sessions?|days?|nights?|weeks?|months?|' +
1532
- 'years?|yrs?|hrs?|hours?|mins?|minutes?|secs?|seconds?|rpe|rir|am|pm|out\\s+of|of\\b|/\\s*\\d)';
1298
+ 'years?|yrs?|h\\b|hrs?|hours?|mins?|minutes?|secs?|seconds?|bpm|ms|rpe|rir|am|pm|out\\s+of|of\\b|/\\s*\\d)';
1533
1299
 
1534
1300
  // Heuristic, not a parser. Flags a component name followed — within a short,
1535
1301
  // period/newline-free gap (one clause) — by a score-like number that is not a
@@ -1632,11 +1398,19 @@ function evaluateAskSelfReference(output, testCase) {
1632
1398
  // On a question that is not about the Increment Score, the coach must not
1633
1399
  // volunteer the bare overall score number (e.g. "your score is 92/100"). The
1634
1400
  // prelude withholds the number for non-score questions; this guards the answer.
1635
- function evaluateAskVolunteeredScore(output, testCase) {
1401
+ function evaluateAskVolunteeredScore(output, testCase, context = {}) {
1636
1402
  if (testCase.surface !== 'ask') {
1637
1403
  return { key: 'ask_volunteered_score', passed: true, reason: 'Not an ask answer.' };
1638
1404
  }
1639
1405
  const question = testCase.context?.question ?? testCase.question ?? '';
1406
+ const responseProfile = context?.routedMetadata?.responseProfile
1407
+ ?? context?.routedMetadata?.intent?.responseProfile
1408
+ ?? testCase.context?.routedMetadata?.responseProfile
1409
+ ?? testCase.context?.routedMetadata?.intent?.responseProfile
1410
+ ?? testCase.context?.responseProfile;
1411
+ if (responseProfile === 'expansive') {
1412
+ return { key: 'ask_volunteered_score', passed: true, reason: 'Expansive Ask answers may name the rounded score headline.' };
1413
+ }
1640
1414
  if (isScoreQuestion(question)) {
1641
1415
  return { key: 'ask_volunteered_score', passed: true, reason: 'Question is about the score; naming it is allowed.' };
1642
1416
  }
@@ -1682,7 +1456,14 @@ function evaluateAskStaleness(output, snapshot, testCase) {
1682
1456
  };
1683
1457
  }
1684
1458
 
1685
- function evaluateAskClaims(output, snapshot, testCase) {
1459
// Reads the response profile recorded on a test case, checking in order:
// context.routedMetadata.responseProfile,
// context.routedMetadata.intent.responseProfile, then context.responseProfile.
// Returns null when none of them is set; a nullish `testCase` is tolerated
// because every access uses optional chaining.
+ function askResponseProfileFromTestCase(testCase) {
1460
+ return testCase?.context?.routedMetadata?.responseProfile
1461
+ ?? testCase?.context?.routedMetadata?.intent?.responseProfile
1462
+ ?? testCase?.context?.responseProfile
1463
+ ?? null;
1464
+ }
1465
+
1466
+ function evaluateAskClaims(output, snapshot, testCase, context = null) {
1686
1467
  if (testCase.surface !== 'ask') {
1687
1468
  return { key: 'ask_claims', passed: true, reason: 'Not an ask answer.' };
1688
1469
  }
@@ -1693,26 +1474,9 @@ function evaluateAskClaims(output, snapshot, testCase) {
1693
1474
  }
1694
1475
 
1695
1476
  const failures = [];
1696
- const mentionedExercises = findMentionedExercises(output, snapshot);
1697
- const scopedExerciseNames = uniqueStrings(mentionedExercises.map((mention) => mention.normalizedName));
1698
-
1699
- const targetHitClaims = extractAskTargetHitClaims(normalized);
1700
- if (targetHitClaims.length > 0) {
1701
- const misses = findRecentSessionMisses(snapshot, { exerciseNames: scopedExerciseNames });
1702
- if (misses.length > 0) {
1703
- const sample = misses[0];
1704
- failures.push(`Ask answer claims targets hit ("${targetHitClaims[0].text}") but recent session ${sample.sessionId} has ${sample.exerciseName} at ${sample.reps} reps below target ${sample.target}.`);
1705
- }
1706
- }
1707
-
1708
- const cleanConsistencyClaims = extractAskCleanConsistencyClaims(normalized);
1709
- if (cleanConsistencyClaims.length > 0) {
1710
- const misses = findRecentSessionMisses(snapshot, { exerciseNames: scopedExerciseNames });
1711
- if (misses.length > 0) {
1712
- const sample = misses[0];
1713
- failures.push(`Ask answer frames missed target reps as "${cleanConsistencyClaims[0].text}", but recent session ${sample.sessionId} has ${sample.exerciseName} at ${sample.reps} reps below target ${sample.target}.`);
1714
- }
1715
- }
1477
+ const { claimFailures } = getAskVerifierResult(output, context, snapshot, testCase);
1478
+ failures.push(...claimFailures.map((failure) => failure.reason));
1479
+ const mentionedExercises = findAskAnswerExerciseMentions(output, snapshot);
1716
1480
 
1717
1481
  for (const claim of extractAskPlannedListClaims(normalized)) {
1718
1482
  const uniquePlanned = new Set(claim.reps);
@@ -1742,23 +1506,6 @@ function evaluateAskClaims(output, snapshot, testCase) {
1742
1506
  }
1743
1507
  }
1744
1508
 
1745
- for (const claim of extractAskWeightClaims(normalized)) {
1746
- if (isEstimatedOneRepMaxWeightClaim(normalized, claim)) continue;
1747
- if (isVolumeWeightClaim(normalized, claim)) continue;
1748
- if (isBodyWeightClaim(normalized, claim)) continue;
1749
- const referencedExercise = findNearestMentionedExercise(mentionedExercises, claim.index);
1750
- if (!referencedExercise) continue;
1751
- const allowedWeights = allowedWeightsForExercise(snapshot, referencedExercise.normalizedName);
1752
- if (allowedWeights.length === 0) continue;
1753
- if (!weightClaimSupported(claim, allowedWeights)) {
1754
- failures.push(`Ask answer asserts ${claim.text} for ${referencedExercise.name}, but that weight is not present in recorded or planned sets for that exercise.`);
1755
- }
1756
- }
1757
-
1758
- if (hasUnsupportedAskFatigueRecoveryClaim(normalized) && !hasAskFatigueSupport(snapshot)) {
1759
- failures.push('Ask answer uses fatigue/recovery language but the snapshot has no recent vitals, sleep, or rep-dropoff signals to support it.');
1760
- }
1761
-
1762
1509
  return {
1763
1510
  key: 'ask_claims',
1764
1511
  passed: failures.length === 0,
@@ -1826,55 +1573,6 @@ function askToolEvidenceRows(toolResults = []) {
1826
1573
  return rows;
1827
1574
  }
1828
1575
 
1829
- function askToolEvidenceWeights(rows = []) {
1830
- const weights = [];
1831
- for (const row of rows) {
1832
- for (const set of row.sets ?? []) {
1833
- const weight = Number(set.weight);
1834
- if (Number.isFinite(weight)) weights.push(weight);
1835
- }
1836
- const topWeight = Number(row.topSet?.weight);
1837
- if (Number.isFinite(topWeight)) weights.push(topWeight);
1838
- const previousTopWeight = Number(row.comparedToPreviousSession?.previousTopSet?.weight);
1839
- if (Number.isFinite(previousTopWeight)) weights.push(previousTopWeight);
1840
- }
1841
- return weights;
1842
- }
1843
-
1844
- function askToolEvidenceSetPairs(rows = []) {
1845
- const pairs = [];
1846
- for (const row of rows) {
1847
- for (const set of row.sets ?? []) {
1848
- const weight = Number(set.weight);
1849
- const reps = Number(set.reps);
1850
- if (Number.isFinite(weight) && Number.isFinite(reps)) pairs.push({ weight, reps });
1851
- }
1852
- const topWeight = Number(row.topSet?.weight);
1853
- const topReps = Number(row.topSet?.reps);
1854
- if (Number.isFinite(topWeight) && Number.isFinite(topReps)) pairs.push({ weight: topWeight, reps: topReps });
1855
- const previousTopWeight = Number(row.comparedToPreviousSession?.previousTopSet?.weight);
1856
- const previousTopReps = Number(row.comparedToPreviousSession?.previousTopSet?.reps);
1857
- if (Number.isFinite(previousTopWeight) && Number.isFinite(previousTopReps)) {
1858
- pairs.push({ weight: previousTopWeight, reps: previousTopReps });
1859
- }
1860
- }
1861
- return pairs;
1862
- }
1863
-
1864
- function toolEvidenceSupportsWeightClaim(claim, rows) {
1865
- if (weightClaimSupported(claim, askToolEvidenceWeights(rows))) return true;
1866
- return false;
1867
- }
1868
-
1869
- function toolEvidenceSupportsWeightedSetClaim(claim, rows) {
1870
- if (askToolEvidenceSetPairs(rows).some((pair) => (
1871
- Math.abs(pair.weight - claim.weight) < 0.01 && pair.reps === claim.reps
1872
- ))) {
1873
- return true;
1874
- }
1875
- return false;
1876
- }
1877
-
1878
1576
  function compareToolEvidenceRecency(lhs, rhs) {
1879
1577
  const lhsDaysAgo = Number(lhs?.daysAgo);
1880
1578
  const rhsDaysAgo = Number(rhs?.daysAgo);
@@ -1890,14 +1588,6 @@ function newestToolEvidenceRow(rows = [], predicate = () => true) {
1890
1588
  .sort(compareToolEvidenceRecency)[0] ?? null;
1891
1589
  }
1892
1590
 
1893
- function latestComparableToolRow(rows = []) {
1894
- return newestToolEvidenceRow(rows, (row) => row.comparedToPreviousSession?.loadDirection) ?? null;
1895
- }
1896
-
1897
- function isWithinWeightedSetClaim(claim, weightedSetClaims) {
1898
- return weightedSetClaims.some((setClaim) => claim.index >= setClaim.index && claim.index < setClaim.end);
1899
- }
1900
-
1901
1591
  function rowIsStaleForEval(row, testCase) {
1902
1592
  const daysAgo = Number(row?.daysAgo);
1903
1593
  const cutoff = Number(testCase.staleness?.maxRecentDays ?? row?.recencyCutoffDays);
@@ -1952,80 +1642,22 @@ function evaluateAskToolProvenance(output, context, testCase, snapshot) {
1952
1642
 
1953
1643
  const routedMetadata = context?.routedMetadata ?? {};
1954
1644
  const toolsUsed = new Set(routedMetadata.toolsUsed ?? []);
1955
- const { toolResults, replayFailures } = routedToolResultsForEval(snapshot, context);
1645
+ const { toolResults } = routedToolResultsForEval(snapshot, context);
1956
1646
  const evidenceRows = askToolEvidenceRows(toolResults);
1957
- const mentionedExercises = findMentionedExercises(output, snapshot);
1958
- const unroutedMentionNames = new Set();
1959
- const failures = [...replayFailures];
1647
+ const failures = [];
1960
1648
  for (const toolName of uniqueStrings(testCase.requiredTools)) {
1961
1649
  if (!toolsUsed.has(toolName)) {
1962
1650
  failures.push(`Expected routed Ask Coach context to use ${toolName}.`);
1963
1651
  }
1964
1652
  }
1965
1653
 
1966
- if (/\b(?:estimated\s+)?(?:e1rm|1rm|one[- ]rep max)\b/i.test(output) && !toolsUsed.has('get_records')) {
1967
- failures.push('Ask answer mentions e1RM/1RM, but routed context did not use get_records.');
1968
- }
1969
-
1970
- const weightedSetClaims = extractAskWeightedSetClaims(output);
1971
- for (const claim of weightedSetClaims) {
1972
- if (isEstimatedOneRepMaxWeightClaim(output, claim)) continue;
1973
- const referencedExercise = findNearestMentionedExercise(mentionedExercises, claim.index);
1974
- if (!referencedExercise) continue;
1975
- const rows = evidenceRows.filter((row) => row.normalizedName === referencedExercise.normalizedName);
1976
- if (rows.length === 0) {
1977
- unroutedMentionNames.add(referencedExercise.normalizedName);
1978
- failures.push(`Ask answer asserts ${claim.text} for ${referencedExercise.name}, but ${referencedExercise.name} was not present in routed tool outputs.`);
1979
- continue;
1980
- }
1981
- if (!toolEvidenceSupportsWeightedSetClaim(claim, rows)) {
1982
- failures.push(`Ask answer asserts ${claim.text} for ${referencedExercise.name}, but routed tool outputs for ${referencedExercise.name} did not include that weight/reps pair.`);
1983
- }
1984
- }
1985
-
1986
- for (const claim of extractAskWeightClaims(output)) {
1987
- if (isWithinWeightedSetClaim(claim, weightedSetClaims)) continue;
1988
- if (isEstimatedOneRepMaxWeightClaim(output, claim)) continue;
1989
- if (isVolumeWeightClaim(output, claim)) continue;
1990
- if (isBodyWeightClaim(output, claim)) continue;
1991
- const referencedExercise = findNearestMentionedExercise(mentionedExercises, claim.index);
1992
- if (!referencedExercise) continue;
1993
- const rows = evidenceRows.filter((row) => row.normalizedName === referencedExercise.normalizedName);
1994
- if (rows.length === 0) {
1995
- unroutedMentionNames.add(referencedExercise.normalizedName);
1996
- failures.push(`Ask answer asserts ${claim.text} for ${referencedExercise.name}, but ${referencedExercise.name} was not present in routed tool outputs.`);
1997
- continue;
1998
- }
1999
- if (!toolEvidenceSupportsWeightClaim(claim, rows)) {
2000
- failures.push(`Ask answer asserts ${claim.text} for ${referencedExercise.name}, but routed tool outputs for ${referencedExercise.name} did not include that weight.`);
2001
- }
2002
- }
1654
+ const { provenanceFailures } = getAskVerifierResult(output, context, snapshot, testCase);
1655
+ failures.push(...provenanceFailures.map((failure) => failure.reason));
2003
1656
 
2004
1657
  const exerciseNames = evidenceRows.map((row) => row.exerciseName);
2005
- for (const mention of mentionedExercises) {
1658
+ for (const mention of findAskAnswerExerciseMentions(output, snapshot)) {
2006
1659
  const rows = evidenceRows.filter((row) => row.normalizedName === mention.normalizedName);
2007
- if (rows.length === 0) {
2008
- if (toolResults.length > 0 && !unroutedMentionNames.has(mention.normalizedName)) {
2009
- unroutedMentionNames.add(mention.normalizedName);
2010
- failures.push(`Ask answer mentions ${mention.name}, but ${mention.name} was not present in routed tool outputs.`);
2011
- }
2012
- continue;
2013
- }
2014
- const comparable = latestComparableToolRow(rows);
2015
- if (comparable) {
2016
- const direction = comparable.comparedToPreviousSession.loadDirection;
2017
- const previous = comparable.comparedToPreviousSession.previousTopSet;
2018
- const windows = directionEvaluationWindows(output, mention.name, exerciseNames);
2019
- if (direction === 'up' && windows.some(hasUnqualifiedDeclineLanguage)) {
2020
- failures.push(`Ask answer frames ${mention.name} as declining/drop-off, but routed ${comparable.toolName} evidence says top load increased from ${previous.weight} kg to ${comparable.topSet.weight} kg.`);
2021
- }
2022
- if (direction === 'down' && windows.some(hasUnqualifiedImprovementLanguage)) {
2023
- failures.push(`Ask answer frames ${mention.name} as improving, but routed ${comparable.toolName} evidence says top load decreased from ${previous.weight} kg to ${comparable.topSet.weight} kg.`);
2024
- }
2025
- if (direction === 'flat' && windows.some((window) => hasUnqualifiedDeclineLanguage(window) || hasUnqualifiedImprovementLanguage(window))) {
2026
- failures.push(`Ask answer invents a load direction for ${mention.name}, but routed ${comparable.toolName} evidence says top load was flat at ${comparable.topSet.weight} kg.`);
2027
- }
2028
- }
1660
+ if (rows.length === 0) continue;
2029
1661
 
2030
1662
  const latestDatedRow = newestToolEvidenceRow(rows, (row) => row.daysAgo != null);
2031
1663
  if (latestDatedRow && rowIsStaleForEval(latestDatedRow, testCase)) {
@@ -2665,18 +2297,18 @@ export function evaluateSummaryOutputFromSnapshot(testCase, snapshot, output) {
2665
2297
 
2666
2298
  const checks = [
2667
2299
  evaluateNoInsight(visibleOutput, testCase),
2668
- evaluateShape(visibleOutput, testCase),
2300
+ evaluateShape(visibleOutput, testCase, context),
2669
2301
  evaluateRequiredMentions(visibleOutput, testCase),
2670
2302
  evaluateAnyOfMentions(visibleOutput, testCase),
2671
2303
  evaluateForbiddenPhrases(visibleOutput, testCase),
2672
2304
  evaluateForbiddenMentions(visibleOutput, testCase),
2673
2305
  evaluateExerciseMentions(visibleOutput, snapshot, context, testCase.surface, testCase),
2674
2306
  evaluateWorkoutClaims(visibleOutput, context, testCase),
2675
- evaluateAskClaims(visibleOutput, snapshot, testCase),
2307
+ evaluateAskClaims(visibleOutput, snapshot, testCase, context),
2676
2308
  evaluateAskDirectionalConsistency(visibleOutput, snapshot, testCase),
2677
2309
  evaluateAskScoreVoice(visibleOutput, testCase),
2678
2310
  evaluateAskSelfReference(visibleOutput, testCase),
2679
- evaluateAskVolunteeredScore(visibleOutput, testCase),
2311
+ evaluateAskVolunteeredScore(visibleOutput, testCase, context),
2680
2312
  evaluateAskStaleness(visibleOutput, snapshot, testCase),
2681
2313
  evaluateAskToolProvenance(visibleOutput, context, testCase, snapshot),
2682
2314
  evaluateFormulaVersion(visibleOutput, snapshot, testCase),