incremnt 0.7.0 → 0.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,6 +6,7 @@ import {
6
6
  askRoutedContext,
7
7
  checkpointContext,
8
8
  cycleSummaryContext,
9
+ executeCoachReadTool,
9
10
  normalizeExerciseName,
10
11
  workoutSummaryContext,
11
12
  vitalsSummaryContext
@@ -86,8 +87,9 @@ export function buildSummaryEvalContext(snapshot, testCase) {
86
87
  return vitalsSummaryContext(snapshot, { exclude: new Set(testCase.exclude ?? []) });
87
88
  case 'ask': {
88
89
  const question = testCase.context?.question ?? testCase.question ?? '';
90
+ const today = testCase.context?.today ?? testCase.today ?? null;
89
91
  const routed = question
90
- ? askRoutedContext(snapshot, question, { exclude: new Set(testCase.exclude ?? []) })
92
+ ? askRoutedContext(snapshot, question, { exclude: new Set(testCase.exclude ?? []), today: today ?? new Date() })
91
93
  : null;
92
94
  return {
93
95
  ...(testCase.context ?? {}),
@@ -305,7 +307,6 @@ const historicalExerciseModifiers = new Set([
305
307
  'leg',
306
308
  'weighted',
307
309
  'romanian',
308
- 'hack',
309
310
  'full',
310
311
  'grip'
311
312
  ]);
@@ -358,6 +359,18 @@ function collectAllowedExerciseNames(surface, context) {
358
359
  for (const sc of context.planComparison?.setsComparison ?? []) {
359
360
  if (sc.exercise) names.add(sc.exercise);
360
361
  }
362
+ // Planned-but-skipped and unplanned-but-added exercises are legitimate
363
+ // planned-vs-actual subjects: the model is handed "skipped X, added Y" in
364
+ // its context, so a correct "you skipped X" note must not be flagged as an
365
+ // unauthorized mention. `setsComparison` only carries *performed* planned
366
+ // exercises (queries.js builds it with a performedNames filter), so skipped
367
+ // lifts would otherwise never be authorized.
368
+ for (const exerciseName of context.planComparison?.skipped ?? []) {
369
+ if (exerciseName) names.add(exerciseName);
370
+ }
371
+ for (const exerciseName of context.planComparison?.added ?? []) {
372
+ if (exerciseName) names.add(exerciseName);
373
+ }
361
374
  }
362
375
 
363
376
  if (surface === 'cycle' && context && typeof context === 'object') {
@@ -399,6 +412,49 @@ function collectAllowedExerciseNames(surface, context) {
399
412
  return [...names];
400
413
  }
401
414
 
415
// Project the allow-set from the actual context object the model was handed.
// Every known exercise name (taken from `knownNames`) that occurs anywhere in
// the JSON-serialized context is treated as something the model may mention.
// Because the whole context is scanned, the allow-set cannot fall behind a newly
// added context field, which is what previously flagged a correct
// "you skipped Hack Squat" note and a lift echoed from a free-text note.
//
// Matching is done on normalized text with whitespace boundaries. A shorter name
// whose match lies entirely inside a longer name's match is dropped, so only the
// covering name is reported for that span. Returns [] when `context` is not an
// object, cannot be serialized, or normalizes to an empty string.
function collectContextExerciseNames(context, knownNames) {
  if (!context || typeof context !== 'object') return [];

  let serialized;
  try {
    serialized = JSON.stringify(context);
  } catch {
    // JSON.stringify throws on cyclic structures and BigInt values.
    return [];
  }

  const contextText = normalizeExerciseName(serialized);
  if (!contextText) return [];

  const hits = [];
  for (const name of knownNames) {
    const normalized = normalizeExerciseName(name);
    if (!normalized) continue;
    const pattern = new RegExp(`(?<!\\S)${escapeRegex(normalized)}(?!\\S)`, 'g');
    for (const found of contextText.matchAll(pattern)) {
      hits.push({
        name,
        normalized,
        start: found.index,
        end: (found.index ?? 0) + normalized.length
      });
    }
  }

  // True when a strictly longer name's match spans this hit's text.
  const isCoveredByLongerHit = (hit) => hits.some((other) =>
    other !== hit &&
    other.normalized.length > hit.normalized.length &&
    other.start <= hit.start &&
    other.end >= hit.end
  );

  const survivingNames = [];
  for (const hit of hits) {
    if (!isCoveredByLongerHit(hit)) survivingNames.push(hit.name);
  }
  return uniqueStrings(survivingNames);
}
457
+
402
458
  function historicalExerciseVariants(name) {
403
459
  const normalized = normalizeExerciseName(name);
404
460
  if (!normalized) return [];
@@ -431,9 +487,15 @@ function evaluateExerciseMentions(output, snapshot, context, surface, testCase)
431
487
 
432
488
  const outputText = surface === 'scoreCommentary' ? scoreCommentaryText(output) : output;
433
489
  const isStored = testCase.source === 'stored';
490
+ const allNames = collectAllExerciseNames(snapshot);
491
+
492
+ // Union the hand-built field list with a projection of the actual context, so
493
+ // this can only ever *reduce* false positives. testCase.allowedExerciseMentions
494
+ // stays as an explicit override for cases that need it.
434
495
  const allowed = new Set();
435
496
  for (const name of [
436
497
  ...collectAllowedExerciseNames(surface, context),
498
+ ...collectContextExerciseNames(context, allNames),
437
499
  ...(testCase.allowedExerciseMentions ?? [])
438
500
  ]) {
439
501
  const variants = isStored ? historicalExerciseVariants(name) : [normalizeExerciseName(name)];
@@ -441,7 +503,6 @@ function evaluateExerciseMentions(output, snapshot, context, surface, testCase)
441
503
  allowed.add(variant);
442
504
  }
443
505
  }
444
- const allNames = collectAllExerciseNames(snapshot);
445
506
  const normalizedOutput = normalizeExerciseName(outputText);
446
507
  const mentions = [];
447
508
 
@@ -463,8 +524,14 @@ function evaluateExerciseMentions(output, snapshot, context, surface, testCase)
463
524
 
464
525
  const unauthorized = mentions
465
526
  .filter((mention) => !mention.allowed)
527
+ // Collapse a shorter mention into any longer mention that spans the same
528
+ // text — regardless of whether the covering mention is itself allowed.
529
+ // "Squat" matches inside "Hack Squat" (whitespace word boundary), so without
530
+ // this an unauthorized "Hack Squat" was double-counted as both "Hack Squat"
531
+ // and "Squat". The covering mention carries the real verdict; the substring
532
+ // is never a distinct mention.
466
533
  .filter((mention) => !mentions.some((candidate) =>
467
- candidate.allowed &&
534
+ candidate !== mention &&
468
535
  candidate.normalizedName.length > mention.normalizedName.length &&
469
536
  candidate.start <= mention.start &&
470
537
  candidate.end >= mention.end
@@ -1108,6 +1175,21 @@ function extractAskWeightClaims(text) {
1108
1175
  return claims;
1109
1176
  }
1110
1177
 
1178
// Finds "<weight> kg x <reps>" style claims in `text`. The unit may be written
// as kg / kilogram / kilograms and the separator as "x", "×" or "for"; matching
// is case-insensitive. Each claim records the matched text, the numeric weight
// and reps, and the [index, end) character span of the match.
function extractAskWeightedSetClaims(text) {
  const weightedSetPattern = /\b(\d+(?:\.\d+)?)\s*(?:kg|kilograms?)\s*(?:x|×|for)\s*(\d+)\b/gi;
  return Array.from(text.matchAll(weightedSetPattern), (found) => {
    const [matchedText, weightText, repsText] = found;
    const start = found.index ?? -1;
    return {
      text: matchedText,
      weight: Number(weightText),
      reps: Number(repsText),
      index: start,
      end: start + matchedText.length
    };
  });
}
1192
+
1111
1193
  function allowedWeightsForExercise(snapshot, normalizedExerciseName) {
1112
1194
  const weights = [];
1113
1195
  for (const session of snapshot?.sessions ?? []) {
@@ -1149,6 +1231,184 @@ function isVolumeWeightClaim(text, claim) {
1149
1231
  return /\bvolume\b/i.test(window);
1150
1232
  }
1151
1233
 
1234
// Builds one row per (session, exercise) describing that exercise's top working
// set. Only sets that are complete and not warm-ups are considered; the top set
// is the heaviest one, with ties broken by higher reps. Non-numeric weight/reps
// are recorded as 0. Exercises without any working set produce no row.
// Rows follow the session order returned by stableSortByDateDesc, keyed on
// completedAt with date as fallback.
function askWorkingTopSetRows(snapshot) {
  const sessionTimestamp = (session) => session.completedAt ?? session.date;
  const heaviestThenMostReps = (a, b) => b.weight - a.weight || b.reps - a.reps;

  const rows = [];
  for (const session of stableSortByDateDesc(snapshot?.sessions ?? [], sessionTimestamp)) {
    // Keep only the leading YYYY-MM-DD-sized prefix of the timestamp.
    const date = String(sessionTimestamp(session) ?? '').slice(0, 10);

    for (const exercise of session.exercises ?? []) {
      const workingSets = [];
      for (const set of exercise.sets ?? []) {
        if (set?.isComplete && !set?.isWarmup) {
          workingSets.push({
            weight: Number(set.weight) || 0,
            reps: Number(set.reps) || 0
          });
        }
      }
      if (workingSets.length === 0) continue;

      workingSets.sort(heaviestThenMostReps);
      rows.push({
        sessionId: session.id ?? null,
        date,
        exerciseName: exercise.name,
        normalizedName: normalizeExerciseName(exercise.name),
        ...workingSets[0]
      });
    }
  }
  return rows;
}
1258
+
1259
// Whole days between `date` and the eval's configured "today"
// (testCase.context.today, falling back to testCase.today).
//
// Both values are reduced to their calendar day and compared at UTC midnight,
// so time-of-day never affects the count. Strings are expected to start with
// YYYY-MM-DD; Date instances are also accepted.
//
// Returns null when no "today" is configured or either value cannot be parsed.
// Dates after "today" clamp to 0 rather than going negative.
function daysAgoForEval(date, testCase) {
  const today = testCase.context?.today ?? testCase.today;
  if (!today) return null;

  // String(new Date()) looks like "Mon Jan 05 2026 ...", whose first ten
  // characters are not a calendar date, so Date instances go through
  // toISOString() before the YYYY-MM-DD prefix is taken.
  const utcMidnightMs = (value) => {
    let text;
    if (value instanceof Date) {
      // toISOString() throws RangeError on an invalid Date.
      if (Number.isNaN(value.getTime())) return NaN;
      text = value.toISOString();
    } else {
      text = String(value ?? '');
    }
    return Date.parse(`${text.slice(0, 10)}T00:00:00.000Z`);
  };

  const dateMs = utcMidnightMs(date);
  const todayMs = utcMidnightMs(today);
  if (!Number.isFinite(dateMs) || !Number.isFinite(todayMs)) return null;
  return Math.max(0, Math.round((todayMs - dateMs) / (24 * 60 * 60 * 1000)));
}
1267
+
1268
// Reports whether `window` (after normalizeText) describes a decline that is
// not otherwise explained away. It returns false when:
//   - no decline word (drop, decline, regress, fell, lower, worse, ...) occurs;
//   - a negation ("no", "not", "without", "rather than", ...) precedes a decline
//     word within 45 characters;
//   - "rep"/"reps" occurs within 20 characters before or after a decline word,
//     i.e. the wording is about reps rather than load.
function hasUnqualifiedDeclineLanguage(window) {
  const declineWord = /\b(drop(?:ped|ping|s)?(?: off)?|drop-off|declin(?:e|ed|ing)|regress(?:ed|ion|ing)?|fell|fall(?:ing)?|decreas(?:e|ed|ing)|lower|worse|slid|slipped)\b/i;
  const negatedDecline = /\b(?:no|not|isn'?t|wasn'?t|without|rather than)\b.{0,45}\b(drop(?:ped|ping|s)?(?: off)?|drop-off|declin(?:e|ed|ing)?|decreas(?:e|ed|ing)?|regress(?:ed|ion|ing)?|fall(?:ing)?|fell|lower|worse|slid|slipped)\b/i;
  const repsThenDecline = /\b(?:rep|reps)\b.{0,20}\b(drop(?:ped|ping|s)?(?: off)?|drop-off|slip(?:ped|ping)?|fell|fall(?:ing)?|lower|declin(?:e|ed|ing)?|decreas(?:e|ed|ing)?|worse)\b/i;
  const declineThenReps = /\b(drop(?:ped|ping|s)?(?: off)?|drop-off|slip(?:ped|ping)?|fell|fall(?:ing)?|lower|declin(?:e|ed|ing)?|decreas(?:e|ed|ing)?|worse)\b.{0,20}\b(?:rep|reps)\b/i;

  const text = normalizeText(window);
  if (!declineWord.test(text)) return false;

  const isQualified = negatedDecline.test(text)
    || repsThenDecline.test(text)
    || declineThenReps.test(text);
  return !isQualified;
}
1277
+
1278
// Reports whether `window` (after normalizeText) describes an improvement that
// is not otherwise explained away. It returns false when:
//   - no improvement wording (improve, progress, stronger, increase, "went up",
//     "load jump", ...) occurs;
//   - a negation ("no", "not", "without", "rather than", ...) precedes such
//     wording within 35 characters;
//   - "rep"/"reps" occurs within 20 characters before or after improvement
//     wording, i.e. the wording is about reps rather than load.
function hasUnqualifiedImprovementLanguage(window) {
  const improvementWord = /\b(improv(?:e|ed|ing|ement)|progress(?:ed|ing)?|stronger|increas(?:e|ed|ing)|moving up|went up|up from|load jump|jumped)\b/i;
  const negatedImprovement = /\b(?:no|not|isn'?t|wasn'?t|without|rather than)\b.{0,35}\b(improv(?:e|ed|ing|ement)?|progress(?:ed|ing)?|stronger|increas(?:e|ed|ing)?|moving up|went up|up from|load jump|jump(?:ed|ing)?)\b/i;
  const repsThenImprovement = /\b(?:rep|reps)\b.{0,20}\b(improv(?:e|ed|ing|ement)?|increas(?:e|ed|ing)?|better|moving up|went up|up from|load jump|jump(?:ed|ing)?)\b/i;
  const improvementThenReps = /\b(improv(?:e|ed|ing|ement)?|increas(?:e|ed|ing)?|better|moving up|went up|up from|load jump|jump(?:ed|ing)?)\b.{0,20}\b(?:rep|reps)\b/i;

  const text = normalizeText(window);
  if (!improvementWord.test(text)) return false;

  const isQualified = negatedImprovement.test(text)
    || repsThenImprovement.test(text)
    || improvementThenReps.test(text);
  return !isQualified;
}
1287
+
1288
// True when `sentence` opens with a word or phrase that points back at what was
// just discussed: that / this / it / there / still / "the latest" / "the top" /
// "top set" / "same load" (case-insensitive, whole words only).
function isReferentialDirectionContinuation(sentence) {
  const referentialOpener = /^(?:that|this|it|there|still|the\s+(?:latest|top)|top\s+set|same\s+load)\b/i;
  return referentialOpener.test(sentence);
}
1291
+
1292
// Selects the sentences of `outputText` that talk about `exerciseName`.
//
// The text is split after ".", "!" or "?". Each sentence whose normalized form
// contains the normalized exercise name is kept, together with the sentences
// that directly follow it for as long as they (a) do not mention any other name
// from `exerciseNames` and (b) start with a referential opener (see
// isReferentialDirectionContinuation).
//
// Returns all sentences when the exercise name normalizes to nothing, and
// [outputText] when no sentence mentions the exercise. Duplicates are removed,
// keeping first-seen order.
function directionEvaluationWindows(outputText, exerciseName, exerciseNames = []) {
  const target = normalizeExerciseName(exerciseName);
  const rivals = [...new Set(exerciseNames.map(normalizeExerciseName))]
    .filter((name) => name && name !== target);

  const sentences = outputText
    .split(/(?<=[.!?])\s+/)
    .map((sentence) => sentence.trim())
    .filter(Boolean);
  if (!target) return sentences;

  const collected = new Set();
  sentences.forEach((sentence, position) => {
    if (!normalizeExerciseName(sentence).includes(target)) return;
    collected.add(sentence);

    for (const follower of sentences.slice(position + 1)) {
      const normalizedFollower = normalizeExerciseName(follower);
      if (rivals.some((name) => normalizedFollower.includes(name))) break;
      if (!isReferentialDirectionContinuation(follower)) break;
      collected.add(follower);
    }
  });

  return collected.size > 0 ? [...collected] : [outputText];
}
1314
+
1315
// Eval check 'ask_directional_consistency': the answer must not describe an
// exercise's load as moving opposite to what the snapshot shows.
//
// For every entry in testCase.directionalConsistency the two first top-set rows
// for that exercise (as ordered by askWorkingTopSetRows) are compared to derive
// the actual direction: 'up', 'down' or 'flat'. Entries with fewer than two rows
// are skipped. If the entry configures `loadDirection` and it disagrees with
// the snapshot, that is reported as a failure by itself. Otherwise the sentences
// about the exercise are scanned for contradicting decline/improvement wording.
//
// Passes trivially for non-ask surfaces and when nothing is configured.
function evaluateAskDirectionalConsistency(output, snapshot, testCase) {
  const key = 'ask_directional_consistency';

  if (testCase.surface !== 'ask') {
    return { key, passed: true, reason: 'Not an ask answer.' };
  }

  const expectations = Array.isArray(testCase.directionalConsistency)
    ? testCase.directionalConsistency
    : [];
  if (expectations.length === 0) {
    return { key, passed: true, reason: 'No directional assertions configured.' };
  }

  const topSetRows = askWorkingTopSetRows(snapshot);
  const answerText = normalizeText(output);
  const problems = [];

  for (const expectation of expectations) {
    const configuredName = expectation.exercise ?? expectation.exerciseName;
    const normalizedName = normalizeExerciseName(configuredName);
    const history = topSetRows.filter((row) => row.normalizedName === normalizedName);
    if (history.length < 2) continue;

    const [latest, previous] = history;
    const loadDelta = latest.weight - previous.weight;
    let actualDirection = 'flat';
    if (loadDelta > 0) {
      actualDirection = 'up';
    } else if (loadDelta < 0) {
      actualDirection = 'down';
    }

    const expectedDirection = expectation.loadDirection ?? actualDirection;
    if (expectedDirection !== actualDirection) {
      problems.push(`Configured expected direction for ${latest.exerciseName} is ${expectedDirection}, but snapshot top-load direction is ${actualDirection}.`);
      continue;
    }

    const windows = directionEvaluationWindows(
      answerText,
      configuredName,
      topSetRows.map((row) => row.exerciseName)
    );

    switch (actualDirection) {
      case 'up':
        if (windows.some(hasUnqualifiedDeclineLanguage)) {
          problems.push(`Ask answer frames ${latest.exerciseName} as declining/drop-off even though top load increased from ${previous.weight} kg to ${latest.weight} kg.`);
        }
        break;
      case 'down':
        if (windows.some(hasUnqualifiedImprovementLanguage)) {
          problems.push(`Ask answer frames ${latest.exerciseName} as improving even though top load decreased from ${previous.weight} kg to ${latest.weight} kg.`);
        }
        break;
      default:
        // Flat load: any directional wording at all is an invention.
        if (windows.some((window) => hasUnqualifiedDeclineLanguage(window) || hasUnqualifiedImprovementLanguage(window))) {
          problems.push(`Ask answer invents a load direction for ${latest.exerciseName}, but top load was flat at ${latest.weight} kg.`);
        }
        break;
    }
  }

  const passed = problems.length === 0;
  return {
    key,
    passed,
    reason: passed
      ? 'Ask answer does not invert configured load directions.'
      : problems.join(' ')
  };
}
1369
+
1370
// Picks the sessions the staleness check should look at.
//
// The exercise of interest is taken from testCase.staleness (exercise, then
// exerciseName), falling back to the first directionalConsistency entry. When
// none is configured, every snapshot session is returned; otherwise only the
// sessions containing an exercise whose normalized name equals the configured
// one.
function relevantSessionsForStaleness(snapshot, testCase) {
  const sessions = snapshot?.sessions ?? [];
  const { staleness, directionalConsistency } = testCase;
  const firstDirectional = directionalConsistency?.[0];

  const configuredExercise = staleness?.exercise
    ?? staleness?.exerciseName
    ?? firstDirectional?.exercise
    ?? firstDirectional?.exerciseName
    ?? null;
  if (!configuredExercise) return sessions;

  const target = normalizeExerciseName(configuredExercise);
  const includesTarget = (session) => {
    const exercises = session.exercises ?? [];
    return exercises.some((exercise) => normalizeExerciseName(exercise.name) === target);
  };
  return sessions.filter(includesTarget);
}
1381
+
1382
// Eval check 'ask_staleness': an answer must not call a session "recent" when
// the first relevant session (as ordered by stableSortByDateDesc) is older than
// testCase.staleness.maxRecentDays.
//
// The check passes when the surface is not 'ask', when maxRecentDays is not a
// finite number, when the session age is unknown or within the window, or when
// the answer either avoids "recent"/"recently", explicitly says it is not
// recent, or states the exact "<n> days ago" age.
function evaluateAskStaleness(output, snapshot, testCase) {
  const key = 'ask_staleness';
  const pass = (reason) => ({ key, passed: true, reason });

  if (testCase.surface !== 'ask') return pass('Not an ask answer.');

  const maxRecentDays = testCase.staleness?.maxRecentDays;
  if (!Number.isFinite(Number(maxRecentDays))) {
    return pass('No staleness assertion configured.');
  }

  const sessionTimestamp = (session) => session.completedAt ?? session.date;
  const candidates = stableSortByDateDesc(relevantSessionsForStaleness(snapshot, testCase), sessionTimestamp);
  const latestSession = candidates[0] ?? null;
  const daysAgo = daysAgoForEval(latestSession?.completedAt ?? latestSession?.date, testCase);
  if (daysAgo == null || daysAgo <= Number(maxRecentDays)) {
    return pass('Latest session is inside the configured recency window.');
  }

  const stalePassReason = 'Ask answer does not present stale sessions as simply recent.';
  const text = normalizeText(output);

  if (!/\brecent(?:ly)?\b/i.test(text)) return pass(stalePassReason);

  const disclaimsRecency = /\b(?:not|isn'?t|wasn'?t|no longer)\s+(?:a\s+)?recent\b/i.test(text)
    || /\brecent\b.{0,20}\b(?:not|isn'?t|wasn'?t)\b/i.test(text);
  if (disclaimsRecency) return pass(stalePassReason);

  const statesAge = new RegExp(`\\b${daysAgo}\\s+days?\\s+ago\\b`, 'i').test(text);
  if (statesAge) return pass(stalePassReason);

  return {
    key,
    passed: false,
    reason: `Ask answer calls a ${daysAgo}-day-old session recent without the days-ago label.`
  };
}
1411
+
1152
1412
  function evaluateAskClaims(output, snapshot, testCase) {
1153
1413
  if (testCase.surface !== 'ask') {
1154
1414
  return { key: 'ask_claims', passed: true, reason: 'Not an ask answer.' };
@@ -1234,14 +1494,195 @@ function evaluateAskClaims(output, snapshot, testCase) {
1234
1494
  };
1235
1495
  }
1236
1496
 
1237
- function evaluateAskToolProvenance(output, context, testCase) {
1497
// Re-runs every tool listed in context.routedMetadata.toolsUsed (deduplicated
// via uniqueStrings) against `snapshot`, passing the parameters recorded under
// routedMetadata.toolParams[toolName] (or {} when none were recorded).
//
// A tool that throws does not abort the replay: its error is turned into a
// message in `replayFailures` and the remaining tools still run.
// Returns { toolResults, replayFailures }.
function routedToolResultsForEval(snapshot, context) {
  const metadata = context?.routedMetadata ?? {};
  const paramsByTool = metadata.toolParams ?? {};
  const outcome = { toolResults: [], replayFailures: [] };

  for (const toolName of uniqueStrings(metadata.toolsUsed ?? [])) {
    try {
      const result = executeCoachReadTool(snapshot, toolName, paramsByTool[toolName] ?? {});
      outcome.toolResults.push(result);
    } catch (error) {
      outcome.replayFailures.push(`Could not replay routed tool ${toolName}: ${error?.message ?? String(error)}`);
    }
  }
  return outcome;
}
1511
+
1512
// Appends one normalized evidence record for `row` to `rows` (mutating `rows`).
//
// The exercise name comes from row.exerciseName, then row.name, then
// inherited.exerciseName; if it normalizes to nothing the row is ignored.
// date / daysAgo / recencyLabel / isStale / recencyCutoffDays are read from
// the row first and from `inherited` second, so session-level values can be
// handed down to per-exercise rows. Missing fields default to null, except
// isStale (false), warmupSetCount (0) and sets ([] unless an array).
function addAskToolEvidenceRow(rows, toolName, row, inherited = {}) {
  const fromRowOrInherited = (field, fallback = null) => (
    row?.[field] ?? inherited[field] ?? fallback
  );

  const exerciseName = row?.exerciseName ?? row?.name ?? inherited.exerciseName ?? null;
  const normalizedName = normalizeExerciseName(exerciseName);
  if (!normalizedName) return;

  const evidence = {
    toolName,
    exerciseName,
    normalizedName,
    date: fromRowOrInherited('date'),
    daysAgo: fromRowOrInherited('daysAgo'),
    recencyLabel: fromRowOrInherited('recencyLabel'),
    isStale: fromRowOrInherited('isStale', false),
    recencyCutoffDays: fromRowOrInherited('recencyCutoffDays'),
    warmupSetCount: row?.warmupSetCount ?? 0,
    workingSetCount: row?.workingSetCount ?? null,
    topSet: row?.topSet ?? null,
    comparedToPreviousSession: row?.comparedToPreviousSession ?? null,
    sets: Array.isArray(row?.sets) ? row.sets : []
  };
  rows.push(evidence);
}
1532
+
1533
// Flattens replayed tool results into per-exercise evidence rows.
//
// A result row that carries an `exercises` array is treated as a session: each
// exercise becomes its own evidence row and inherits the session's date,
// daysAgo, recencyLabel, isStale and recencyCutoffDays. Any other row is added
// as-is. Rows without a usable exercise name are dropped by
// addAskToolEvidenceRow.
function askToolEvidenceRows(toolResults = []) {
  const evidence = [];
  for (const toolResult of toolResults) {
    for (const row of toolResult?.rows ?? []) {
      if (!Array.isArray(row?.exercises)) {
        addAskToolEvidenceRow(evidence, toolResult.toolName, row);
        continue;
      }

      const sessionFields = {
        date: row.date,
        daysAgo: row.daysAgo,
        recencyLabel: row.recencyLabel,
        isStale: row.isStale,
        recencyCutoffDays: row.recencyCutoffDays
      };
      row.exercises.forEach((exercise) => {
        addAskToolEvidenceRow(evidence, toolResult.toolName, exercise, sessionFields);
      });
    }
  }
  return evidence;
}
1554
+
1555
// Collects every weight that appears in the evidence rows: each set's weight,
// then the row's top-set weight, then the previous session's top-set weight.
// Values are coerced with Number() and kept only when finite; duplicates are
// preserved and order follows the rows.
function askToolEvidenceWeights(rows = []) {
  const collected = [];
  const keepIfFinite = (rawWeight) => {
    const weight = Number(rawWeight);
    if (Number.isFinite(weight)) collected.push(weight);
  };

  for (const row of rows) {
    for (const set of row.sets ?? []) {
      keepIfFinite(set.weight);
    }
    keepIfFinite(row.topSet?.weight);
    keepIfFinite(row.comparedToPreviousSession?.previousTopSet?.weight);
  }
  return collected;
}
1569
+
1570
// Collects every { weight, reps } pair that appears in the evidence rows: each
// set, then the row's top set, then the previous session's top set. A pair is
// kept only when both values are finite after Number() coercion; duplicates are
// preserved and order follows the rows.
function askToolEvidenceSetPairs(rows = []) {
  const collected = [];
  const keepIfComplete = (rawWeight, rawReps) => {
    const weight = Number(rawWeight);
    const reps = Number(rawReps);
    if (Number.isFinite(weight) && Number.isFinite(reps)) {
      collected.push({ weight, reps });
    }
  };

  for (const row of rows) {
    for (const set of row.sets ?? []) {
      keepIfComplete(set.weight, set.reps);
    }
    const topSet = row.topSet;
    keepIfComplete(topSet?.weight, topSet?.reps);
    const previousTopSet = row.comparedToPreviousSession?.previousTopSet;
    keepIfComplete(previousTopSet?.weight, previousTopSet?.reps);
  }
  return collected;
}
1589
+
1590
// True when weightClaimSupported accepts `claim` against the weights found in
// the given evidence rows; always returns a strict boolean.
function toolEvidenceSupportsWeightClaim(claim, rows) {
  const evidenceWeights = askToolEvidenceWeights(rows);
  return Boolean(weightClaimSupported(claim, evidenceWeights));
}
1594
+
1595
// True when some weight/reps pair in the evidence rows matches the claim: the
// weight within 0.01 of claim.weight and the reps exactly equal to claim.reps.
function toolEvidenceSupportsWeightedSetClaim(claim, rows) {
  const matchesClaim = ({ weight, reps }) => (
    Math.abs(weight - claim.weight) < 0.01 && reps === claim.reps
  );
  return askToolEvidenceSetPairs(rows).some(matchesClaim);
}
1603
+
1604
// Sort comparator that orders tool evidence rows from most to least recent.
//
// Rows with a known `daysAgo` are ordered by it (smaller first) and always come
// before rows whose age is unknown. When neither row has a known age, the
// `date` strings are compared so that the later date sorts first.
function compareToolEvidenceRecency(lhs, rhs) {
  // Number(null) and Number('') are both 0, which would rank a row of unknown
  // age as "0 days ago" ahead of every dated row. Missing values therefore map
  // to NaN so they take the unknown-age branches below.
  const knownDaysAgo = (row) => {
    const raw = row?.daysAgo;
    if (raw == null) return NaN;
    if (typeof raw === 'string' && raw.trim() === '') return NaN;
    return Number(raw);
  };

  const lhsDaysAgo = knownDaysAgo(lhs);
  const rhsDaysAgo = knownDaysAgo(rhs);
  if (Number.isFinite(lhsDaysAgo) && Number.isFinite(rhsDaysAgo)) return lhsDaysAgo - rhsDaysAgo;
  if (Number.isFinite(lhsDaysAgo)) return -1;
  if (Number.isFinite(rhsDaysAgo)) return 1;
  return String(rhs?.date ?? '').localeCompare(String(lhs?.date ?? ''));
}
1612
+
1613
// Returns the row that compareToolEvidenceRecency ranks first among the rows
// accepted by `predicate`, or null when none qualifies. `rows` itself is not
// reordered, because sorting happens on the filtered copy.
function newestToolEvidenceRow(rows = [], predicate = () => true) {
  const candidates = rows.filter(predicate);
  candidates.sort(compareToolEvidenceRecency);
  const [newest] = candidates;
  return newest ?? null;
}
1618
+
1619
// Newest evidence row that carries a truthy
// comparedToPreviousSession.loadDirection, or null when there is none.
function latestComparableToolRow(rows = []) {
  const hasLoadDirection = (row) => row.comparedToPreviousSession?.loadDirection;
  const newest = newestToolEvidenceRow(rows, hasLoadDirection);
  return newest ?? null;
}
1622
+
1623
// True when claim.index falls inside the [index, end) span of any weighted-set
// claim, i.e. the bare weight is part of a "<weight> kg x <reps>" phrase that
// has its own claim.
function isWithinWeightedSetClaim(claim, weightedSetClaims) {
  for (const { index: spanStart, end: spanEnd } of weightedSetClaims) {
    if (claim.index >= spanStart && claim.index < spanEnd) return true;
  }
  return false;
}
1626
+
1627
// Decides whether an evidence row counts as stale for this eval case.
//
// The cutoff is testCase.staleness.maxRecentDays when set, otherwise the row's
// own recencyCutoffDays. The row is stale when its daysAgo is strictly greater
// than the cutoff. When either number is unknown, the row's own `isStale` flag
// is used instead.
function rowIsStaleForEval(row, testCase) {
  // Number(null) and Number('') are 0. Without this guard a missing cutoff
  // would become "0 days" and mark every row older than today as stale, while
  // a missing daysAgo would become "today" and never be stale. Both would
  // bypass the `isStale` fallback.
  const toKnownNumber = (value) => {
    if (value == null) return NaN;
    if (typeof value === 'string' && value.trim() === '') return NaN;
    return Number(value);
  };

  const daysAgo = toKnownNumber(row?.daysAgo);
  const cutoff = toKnownNumber(testCase.staleness?.maxRecentDays ?? row?.recencyCutoffDays);
  if (!Number.isFinite(daysAgo) || !Number.isFinite(cutoff)) return Boolean(row?.isStale);
  return daysAgo > cutoff;
}
1633
+
1634
// True when `outputText` presents stale evidence as simply "recent".
//
// The text must contain "recent"/"recently" and must not explicitly deny
// recency ("not recent", "no longer a recent", "recent ... isn't", ...). It
// then still counts as acceptable if it states the row's exact age as
// "<daysAgo> days ago"; when the row has no finite daysAgo there is no age to
// state, so the text is reported.
function outputCallsStaleEvidenceRecent(outputText, row) {
  const text = normalizeText(outputText);

  if (!/\brecent(?:ly)?\b/i.test(text)) return false;

  const recencyDisclaimers = [
    /\b(?:not|isn'?t|wasn'?t|no longer)\s+(?:a\s+)?recent\b/i,
    /\brecent\b.{0,20}\b(?:not|isn'?t|wasn'?t)\b/i
  ];
  if (recencyDisclaimers.some((pattern) => pattern.test(text))) return false;

  const daysAgo = Number(row?.daysAgo);
  if (!Number.isFinite(daysAgo)) return true;

  const agePattern = new RegExp(`\\b${daysAgo}\\s+days?\\s+ago\\b`, 'i');
  return !agePattern.test(text);
}
1644
+
1645
// Narrows the sentence-level windows from directionEvaluationWindows down to
// clause level, so a recency word attached to a different exercise in the same
// sentence is not attributed to `exerciseName`.
//
// Each window is split on ".", ";", ":", a comma followed by whitespace, and
// the words while / whereas / but / and. For every clause mentioning the
// exercise, that clause plus the following clauses are joined with spaces,
// stopping at the first clause that mentions another name from `exerciseNames`.
// A window with no clause mentioning the exercise is kept whole.
// Duplicates are removed; if nothing was collected the unscoped windows are
// returned, as they are when the exercise name normalizes to nothing.
function recencyEvaluationWindows(outputText, exerciseName, exerciseNames = []) {
  const target = normalizeExerciseName(exerciseName);
  const rivals = [...new Set(exerciseNames.map(normalizeExerciseName))]
    .filter((name) => name && name !== target);
  const windows = directionEvaluationWindows(outputText, exerciseName, exerciseNames);
  if (!target) return windows;

  const clauseBoundary = /\s*(?:[.;:]|,\s+|\b(?:while|whereas|but|and)\b)\s*/i;
  const scoped = new Set();

  for (const window of windows) {
    const clauses = window
      .split(clauseBoundary)
      .map((clause) => clause.trim())
      .filter(Boolean);

    let mentionsTarget = false;
    clauses.forEach((clause, position) => {
      if (!normalizeExerciseName(clause).includes(target)) return;
      mentionsTarget = true;

      const parts = [clause];
      for (const follower of clauses.slice(position + 1)) {
        const normalizedFollower = normalizeExerciseName(follower);
        if (rivals.some((name) => normalizedFollower.includes(name))) break;
        parts.push(follower);
      }
      scoped.add(parts.join(' '));
    });

    if (!mentionsTarget) scoped.add(window);
  }

  return scoped.size > 0 ? [...scoped] : windows;
}
1673
+
1674
+ function evaluateAskToolProvenance(output, context, testCase, snapshot) {
1238
1675
  if (testCase.surface !== 'ask') {
1239
1676
  return { key: 'ask_tool_provenance', passed: true, reason: 'Not an ask answer.' };
1240
1677
  }
1241
1678
 
1242
1679
  const routedMetadata = context?.routedMetadata ?? {};
1243
1680
  const toolsUsed = new Set(routedMetadata.toolsUsed ?? []);
1244
- const failures = [];
1681
+ const { toolResults, replayFailures } = routedToolResultsForEval(snapshot, context);
1682
+ const evidenceRows = askToolEvidenceRows(toolResults);
1683
+ const mentionedExercises = findMentionedExercises(output, snapshot);
1684
+ const unroutedMentionNames = new Set();
1685
+ const failures = [...replayFailures];
1245
1686
  for (const toolName of uniqueStrings(testCase.requiredTools)) {
1246
1687
  if (!toolsUsed.has(toolName)) {
1247
1688
  failures.push(`Expected routed Ask Coach context to use ${toolName}.`);
@@ -1252,6 +1693,74 @@ function evaluateAskToolProvenance(output, context, testCase) {
1252
1693
  failures.push('Ask answer mentions e1RM/1RM, but routed context did not use get_records.');
1253
1694
  }
1254
1695
 
1696
+ const weightedSetClaims = extractAskWeightedSetClaims(output);
1697
+ for (const claim of weightedSetClaims) {
1698
+ if (isEstimatedOneRepMaxWeightClaim(output, claim)) continue;
1699
+ const referencedExercise = findNearestMentionedExercise(mentionedExercises, claim.index);
1700
+ if (!referencedExercise) continue;
1701
+ const rows = evidenceRows.filter((row) => row.normalizedName === referencedExercise.normalizedName);
1702
+ if (rows.length === 0) {
1703
+ unroutedMentionNames.add(referencedExercise.normalizedName);
1704
+ failures.push(`Ask answer asserts ${claim.text} for ${referencedExercise.name}, but ${referencedExercise.name} was not present in routed tool outputs.`);
1705
+ continue;
1706
+ }
1707
+ if (!toolEvidenceSupportsWeightedSetClaim(claim, rows)) {
1708
+ failures.push(`Ask answer asserts ${claim.text} for ${referencedExercise.name}, but routed tool outputs for ${referencedExercise.name} did not include that weight/reps pair.`);
1709
+ }
1710
+ }
1711
+
1712
+ for (const claim of extractAskWeightClaims(output)) {
1713
+ if (isWithinWeightedSetClaim(claim, weightedSetClaims)) continue;
1714
+ if (isEstimatedOneRepMaxWeightClaim(output, claim)) continue;
1715
+ if (isVolumeWeightClaim(output, claim)) continue;
1716
+ const referencedExercise = findNearestMentionedExercise(mentionedExercises, claim.index);
1717
+ if (!referencedExercise) continue;
1718
+ const rows = evidenceRows.filter((row) => row.normalizedName === referencedExercise.normalizedName);
1719
+ if (rows.length === 0) {
1720
+ unroutedMentionNames.add(referencedExercise.normalizedName);
1721
+ failures.push(`Ask answer asserts ${claim.text} for ${referencedExercise.name}, but ${referencedExercise.name} was not present in routed tool outputs.`);
1722
+ continue;
1723
+ }
1724
+ if (!toolEvidenceSupportsWeightClaim(claim, rows)) {
1725
+ failures.push(`Ask answer asserts ${claim.text} for ${referencedExercise.name}, but routed tool outputs for ${referencedExercise.name} did not include that weight.`);
1726
+ }
1727
+ }
1728
+
1729
+ const exerciseNames = evidenceRows.map((row) => row.exerciseName);
1730
+ for (const mention of mentionedExercises) {
1731
+ const rows = evidenceRows.filter((row) => row.normalizedName === mention.normalizedName);
1732
+ if (rows.length === 0) {
1733
+ if (toolResults.length > 0 && !unroutedMentionNames.has(mention.normalizedName)) {
1734
+ unroutedMentionNames.add(mention.normalizedName);
1735
+ failures.push(`Ask answer mentions ${mention.name}, but ${mention.name} was not present in routed tool outputs.`);
1736
+ }
1737
+ continue;
1738
+ }
1739
+ const comparable = latestComparableToolRow(rows);
1740
+ if (comparable) {
1741
+ const direction = comparable.comparedToPreviousSession.loadDirection;
1742
+ const previous = comparable.comparedToPreviousSession.previousTopSet;
1743
+ const windows = directionEvaluationWindows(output, mention.name, exerciseNames);
1744
+ if (direction === 'up' && windows.some(hasUnqualifiedDeclineLanguage)) {
1745
+ failures.push(`Ask answer frames ${mention.name} as declining/drop-off, but routed ${comparable.toolName} evidence says top load increased from ${previous.weight} kg to ${comparable.topSet.weight} kg.`);
1746
+ }
1747
+ if (direction === 'down' && windows.some(hasUnqualifiedImprovementLanguage)) {
1748
+ failures.push(`Ask answer frames ${mention.name} as improving, but routed ${comparable.toolName} evidence says top load decreased from ${previous.weight} kg to ${comparable.topSet.weight} kg.`);
1749
+ }
1750
+ if (direction === 'flat' && windows.some((window) => hasUnqualifiedDeclineLanguage(window) || hasUnqualifiedImprovementLanguage(window))) {
1751
+ failures.push(`Ask answer invents a load direction for ${mention.name}, but routed ${comparable.toolName} evidence says top load was flat at ${comparable.topSet.weight} kg.`);
1752
+ }
1753
+ }
1754
+
1755
+ const latestDatedRow = newestToolEvidenceRow(rows, (row) => row.daysAgo != null);
1756
+ if (latestDatedRow && rowIsStaleForEval(latestDatedRow, testCase)) {
1757
+ const windows = recencyEvaluationWindows(output, mention.name, exerciseNames);
1758
+ if (windows.some((window) => outputCallsStaleEvidenceRecent(window, latestDatedRow))) {
1759
+ failures.push(`Ask answer calls ${mention.name} recent, but routed tool evidence says the latest relevant session was ${latestDatedRow.daysAgo} days ago.`);
1760
+ }
1761
+ }
1762
+ }
1763
+
1255
1764
  return {
1256
1765
  key: 'ask_tool_provenance',
1257
1766
  passed: failures.length === 0,
@@ -1537,7 +2046,9 @@ export function evaluateSummaryOutputFromSnapshot(testCase, snapshot, output) {
1537
2046
  evaluateExerciseMentions(output, snapshot, context, testCase.surface, testCase),
1538
2047
  evaluateWorkoutClaims(output, context, testCase),
1539
2048
  evaluateAskClaims(output, snapshot, testCase),
1540
- evaluateAskToolProvenance(output, context, testCase),
2049
+ evaluateAskDirectionalConsistency(output, snapshot, testCase),
2050
+ evaluateAskStaleness(output, snapshot, testCase),
2051
+ evaluateAskToolProvenance(output, context, testCase, snapshot),
1541
2052
  evaluateScoreCommentaryAction(output, context, testCase),
1542
2053
  evaluateScoreCommentarySynthesis(output, context, testCase),
1543
2054
  evaluateScoreCommentaryExerciseInvention(output, snapshot, context, testCase),
@@ -1567,12 +2078,14 @@ export async function runSummaryEvalCaseFromSnapshot(testCase, snapshot) {
1567
2078
  return evaluateSummaryOutputFromSnapshot(testCase, snapshot, output);
1568
2079
  }
1569
2080
 
1570
- function genericForbiddenPhrasesForSurface(surface) {
2081
+ export function genericForbiddenPhrasesForSurface(surface) {
1571
2082
  switch (surface) {
1572
2083
  case 'workout':
1573
2084
  return ['solid progress', 'trust the process', 'keep it up', 'quality work', 'in a great place', 'continue progressive overload', 'as fatigue accumulates'];
1574
2085
  case 'cycle':
1575
- return ['solid progress', 'trust the process', 'in a great place', 'continue progressive overload', 'as fatigue accumulates', 'solid session', 'quality work'];
2086
+ // 'solid first week' enforces the FIRST_WEEK_CYCLE_PROMPT's "do not say
2087
+ // solid first week" rule, which was previously prompt-only (unguarded).
2088
+ return ['solid progress', 'trust the process', 'in a great place', 'continue progressive overload', 'as fatigue accumulates', 'solid session', 'quality work', 'solid first week'];
1576
2089
  case 'checkpoint':
1577
2090
  return ['solid progress', 'quality work', 'trust the process', 'in a great place'];
1578
2091
  case 'vitals':