incremnt 0.7.0 → 0.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -1
- package/SKILL.md +5 -2
- package/package.json +1 -1
- package/src/coach-facts.js +14 -1
- package/src/contract.js +36 -1
- package/src/format.js +78 -2
- package/src/mcp.js +1 -1
- package/src/openrouter.js +25 -20
- package/src/plan-comparison.js +245 -0
- package/src/prompt-changelog.js +94 -0
- package/src/queries.js +867 -174
- package/src/remote.js +103 -1
- package/src/summary-evals.js +522 -9
- package/src/sync-service.js +265 -4
package/src/summary-evals.js
CHANGED
|
@@ -6,6 +6,7 @@ import {
|
|
|
6
6
|
askRoutedContext,
|
|
7
7
|
checkpointContext,
|
|
8
8
|
cycleSummaryContext,
|
|
9
|
+
executeCoachReadTool,
|
|
9
10
|
normalizeExerciseName,
|
|
10
11
|
workoutSummaryContext,
|
|
11
12
|
vitalsSummaryContext
|
|
@@ -86,8 +87,9 @@ export function buildSummaryEvalContext(snapshot, testCase) {
|
|
|
86
87
|
return vitalsSummaryContext(snapshot, { exclude: new Set(testCase.exclude ?? []) });
|
|
87
88
|
case 'ask': {
|
|
88
89
|
const question = testCase.context?.question ?? testCase.question ?? '';
|
|
90
|
+
const today = testCase.context?.today ?? testCase.today ?? null;
|
|
89
91
|
const routed = question
|
|
90
|
-
? askRoutedContext(snapshot, question, { exclude: new Set(testCase.exclude ?? []) })
|
|
92
|
+
? askRoutedContext(snapshot, question, { exclude: new Set(testCase.exclude ?? []), today: today ?? new Date() })
|
|
91
93
|
: null;
|
|
92
94
|
return {
|
|
93
95
|
...(testCase.context ?? {}),
|
|
@@ -305,7 +307,6 @@ const historicalExerciseModifiers = new Set([
|
|
|
305
307
|
'leg',
|
|
306
308
|
'weighted',
|
|
307
309
|
'romanian',
|
|
308
|
-
'hack',
|
|
309
310
|
'full',
|
|
310
311
|
'grip'
|
|
311
312
|
]);
|
|
@@ -358,6 +359,18 @@ function collectAllowedExerciseNames(surface, context) {
|
|
|
358
359
|
for (const sc of context.planComparison?.setsComparison ?? []) {
|
|
359
360
|
if (sc.exercise) names.add(sc.exercise);
|
|
360
361
|
}
|
|
362
|
+
// Planned-but-skipped and unplanned-but-added exercises are legitimate
|
|
363
|
+
// planned-vs-actual subjects: the model is handed "skipped X, added Y" in
|
|
364
|
+
// its context, so a correct "you skipped X" note must not be flagged as an
|
|
365
|
+
// unauthorized mention. `setsComparison` only carries *performed* planned
|
|
366
|
+
// exercises (queries.js builds it with a performedNames filter), so skipped
|
|
367
|
+
// lifts would otherwise never be authorized.
|
|
368
|
+
for (const exerciseName of context.planComparison?.skipped ?? []) {
|
|
369
|
+
if (exerciseName) names.add(exerciseName);
|
|
370
|
+
}
|
|
371
|
+
for (const exerciseName of context.planComparison?.added ?? []) {
|
|
372
|
+
if (exerciseName) names.add(exerciseName);
|
|
373
|
+
}
|
|
361
374
|
}
|
|
362
375
|
|
|
363
376
|
if (surface === 'cycle' && context && typeof context === 'object') {
|
|
@@ -399,6 +412,49 @@ function collectAllowedExerciseNames(surface, context) {
|
|
|
399
412
|
return [...names];
|
|
400
413
|
}
|
|
401
414
|
|
|
415
|
+
// Project the allow-set from the actual context object the model was handed.
|
|
416
|
+
// Any known exercise name (from the snapshot's vocabulary) that appears anywhere
|
|
417
|
+
// in the serialized context — structured fields, plan comparison, prior-session
|
|
418
|
+
// comparisons, nearby cardio, or free-text session/exercise notes — is something
|
|
419
|
+
// the model could legitimately reference. Deriving authorization this way means
|
|
420
|
+
// the allow-set can never drift behind a newly added context field: the failure
|
|
421
|
+
// mode that flagged a correct "you skipped Hack Squat" note and a note-echoed
|
|
422
|
+
// lift. A genuine invention is a known exercise present in the output but absent
|
|
423
|
+
// from the context entirely.
|
|
424
|
+
// Returns the known exercise names that occur somewhere in the serialized
// context. `context` is JSON-stringified and normalized; each entry of
// `knownNames` is normalized and searched for as a whitespace-delimited phrase.
// A hit that lies entirely inside a longer hit (e.g. a short name embedded in a
// longer one) is discarded, so only the longest covering name is reported for
// that stretch of text. Non-object or unserializable contexts yield [].
function collectContextExerciseNames(context, knownNames) {
  if (!context || typeof context !== 'object') return [];

  let json;
  try {
    json = JSON.stringify(context);
  } catch {
    // Best effort: a context that cannot be serialized authorizes nothing.
    return [];
  }

  const haystack = normalizeExerciseName(json);
  if (!haystack) return [];

  const hits = [];
  for (const name of knownNames) {
    const normalized = normalizeExerciseName(name);
    if (!normalized) continue;
    const phrase = new RegExp(`(?<!\\S)${escapeRegex(normalized)}(?!\\S)`, 'g');
    for (const occurrence of haystack.matchAll(phrase)) {
      hits.push({
        name,
        normalized,
        start: occurrence.index,
        end: (occurrence.index ?? 0) + normalized.length
      });
    }
  }

  // True when some other, strictly longer hit spans this hit's text range.
  const isShadowed = (hit) => hits.some((other) => (
    other !== hit &&
    other.normalized.length > hit.normalized.length &&
    other.start <= hit.start &&
    other.end >= hit.end
  ));

  const survivingNames = [];
  for (const hit of hits) {
    if (!isShadowed(hit)) survivingNames.push(hit.name);
  }
  return uniqueStrings(survivingNames);
}
|
|
457
|
+
|
|
402
458
|
function historicalExerciseVariants(name) {
|
|
403
459
|
const normalized = normalizeExerciseName(name);
|
|
404
460
|
if (!normalized) return [];
|
|
@@ -431,9 +487,15 @@ function evaluateExerciseMentions(output, snapshot, context, surface, testCase)
|
|
|
431
487
|
|
|
432
488
|
const outputText = surface === 'scoreCommentary' ? scoreCommentaryText(output) : output;
|
|
433
489
|
const isStored = testCase.source === 'stored';
|
|
490
|
+
const allNames = collectAllExerciseNames(snapshot);
|
|
491
|
+
|
|
492
|
+
// Union the hand-built field list with a projection of the actual context, so
|
|
493
|
+
// this can only ever *reduce* false positives. testCase.allowedExerciseMentions
|
|
494
|
+
// stays as an explicit override for cases that need it.
|
|
434
495
|
const allowed = new Set();
|
|
435
496
|
for (const name of [
|
|
436
497
|
...collectAllowedExerciseNames(surface, context),
|
|
498
|
+
...collectContextExerciseNames(context, allNames),
|
|
437
499
|
...(testCase.allowedExerciseMentions ?? [])
|
|
438
500
|
]) {
|
|
439
501
|
const variants = isStored ? historicalExerciseVariants(name) : [normalizeExerciseName(name)];
|
|
@@ -441,7 +503,6 @@ function evaluateExerciseMentions(output, snapshot, context, surface, testCase)
|
|
|
441
503
|
allowed.add(variant);
|
|
442
504
|
}
|
|
443
505
|
}
|
|
444
|
-
const allNames = collectAllExerciseNames(snapshot);
|
|
445
506
|
const normalizedOutput = normalizeExerciseName(outputText);
|
|
446
507
|
const mentions = [];
|
|
447
508
|
|
|
@@ -463,8 +524,14 @@ function evaluateExerciseMentions(output, snapshot, context, surface, testCase)
|
|
|
463
524
|
|
|
464
525
|
const unauthorized = mentions
|
|
465
526
|
.filter((mention) => !mention.allowed)
|
|
527
|
+
// Collapse a shorter mention into any longer mention that spans the same
|
|
528
|
+
// text — regardless of whether the covering mention is itself allowed.
|
|
529
|
+
// "Squat" matches inside "Hack Squat" (whitespace word boundary), so without
|
|
530
|
+
// this an unauthorized "Hack Squat" was double-counted as both "Hack Squat"
|
|
531
|
+
// and "Squat". The covering mention carries the real verdict; the substring
|
|
532
|
+
// is never a distinct mention.
|
|
466
533
|
.filter((mention) => !mentions.some((candidate) =>
|
|
467
|
-
candidate
|
|
534
|
+
candidate !== mention &&
|
|
468
535
|
candidate.normalizedName.length > mention.normalizedName.length &&
|
|
469
536
|
candidate.start <= mention.start &&
|
|
470
537
|
candidate.end >= mention.end
|
|
@@ -1108,6 +1175,21 @@ function extractAskWeightClaims(text) {
|
|
|
1108
1175
|
return claims;
|
|
1109
1176
|
}
|
|
1110
1177
|
|
|
1178
|
+
// Finds "<weight> kg x <reps>" style claims in free text (separators: "x",
// "×" or "for"; unit "kg" or "kilogram(s)"). Each claim records the matched
// text, numeric weight and reps, and the [index, end) character span.
function extractAskWeightedSetClaims(text) {
  const weightedSetPattern = /\b(\d+(?:\.\d+)?)\s*(?:kg|kilograms?)\s*(?:x|×|for)\s*(\d+)\b/gi;
  return Array.from(text.matchAll(weightedSetPattern), (hit) => {
    const [matchedText, weightText, repsText] = hit;
    const start = hit.index ?? -1;
    return {
      text: matchedText,
      weight: Number(weightText),
      reps: Number(repsText),
      index: start,
      end: start + matchedText.length
    };
  });
}
|
|
1192
|
+
|
|
1111
1193
|
function allowedWeightsForExercise(snapshot, normalizedExerciseName) {
|
|
1112
1194
|
const weights = [];
|
|
1113
1195
|
for (const session of snapshot?.sessions ?? []) {
|
|
@@ -1149,6 +1231,184 @@ function isVolumeWeightClaim(text, claim) {
|
|
|
1149
1231
|
return /\bvolume\b/i.test(window);
|
|
1150
1232
|
}
|
|
1151
1233
|
|
|
1234
|
+
// Builds one row per (session, exercise) describing the top working set:
// the completed, non-warmup set with the highest weight (ties broken by reps).
// Sessions are visited in the order returned by stableSortByDateDesc keyed on
// completedAt (falling back to date); exercises without working sets are skipped.
function askWorkingTopSetRows(snapshot) {
  const sessionTimestamp = (session) => session.completedAt ?? session.date;
  const heaviestFirst = (a, b) => b.weight - a.weight || b.reps - a.reps;
  const isWorkingSet = (set) => set?.isComplete && !set?.isWarmup;
  const toLoad = (set) => ({
    weight: Number(set.weight) || 0,
    reps: Number(set.reps) || 0
  });

  const topSetRows = [];
  const orderedSessions = stableSortByDateDesc(snapshot?.sessions ?? [], (session) => session.completedAt ?? session.date);
  for (const session of orderedSessions) {
    const day = String(sessionTimestamp(session) ?? '').slice(0, 10);
    for (const exercise of session.exercises ?? []) {
      const loads = (exercise.sets ?? []).filter(isWorkingSet).map(toLoad);
      if (loads.length === 0) continue;
      // `loads` is a fresh array, so sorting in place touches nothing shared.
      const [topSet] = loads.sort(heaviestFirst);
      topSetRows.push({
        sessionId: session.id ?? null,
        date: day,
        exerciseName: exercise.name,
        normalizedName: normalizeExerciseName(exercise.name),
        ...topSet
      });
    }
  }
  return topSetRows;
}
|
|
1258
|
+
|
|
1259
|
+
// Whole calendar days between `date` and the eval's reference day
// (testCase.context.today, falling back to testCase.today), never negative.
// Both values are reduced to their YYYY-MM-DD prefix and compared at UTC
// midnight. Returns null when no reference day is configured or when either
// value cannot be parsed.
//
// Accepts ISO-like strings and Date instances. A Date stringifies as
// "Mon Jan 01 2024 ...", whose 10-character prefix is not parseable, so Dates
// are converted through toISOString() (UTC calendar day) instead of String().
function daysAgoForEval(date, testCase) {
  const MS_PER_DAY = 24 * 60 * 60 * 1000;

  // Epoch ms of UTC midnight for the value's calendar day, or NaN.
  const utcMidnightMs = (value) => {
    let day;
    if (value instanceof Date) {
      // toISOString() throws on an invalid Date; report it as unparseable.
      if (Number.isNaN(value.getTime())) return Number.NaN;
      day = value.toISOString().slice(0, 10);
    } else {
      day = String(value ?? '').slice(0, 10);
    }
    return Date.parse(`${day}T00:00:00.000Z`);
  };

  const today = testCase.context?.today ?? testCase.today;
  if (!today) return null;
  const dateMs = utcMidnightMs(date);
  const todayMs = utcMidnightMs(today);
  if (!Number.isFinite(dateMs) || !Number.isFinite(todayMs)) return null;
  // Future-dated sessions clamp to 0 rather than going negative.
  return Math.max(0, Math.round((todayMs - dateMs) / MS_PER_DAY));
}
|
|
1267
|
+
|
|
1268
|
+
// True when the (normalized) window uses decline wording — drop, decline,
// regress, fell, lower, worse, ... — that is not excused by one of three
// qualifiers: a nearby negation ("not", "without", "rather than", ...), or
// the decline word sitting within 20 characters before or after "rep(s)"
// (a rep drop is not a load drop).
function hasUnqualifiedDeclineLanguage(window) {
  const text = normalizeText(window);

  const declineWording = /\b(drop(?:ped|ping|s)?(?: off)?|drop-off|declin(?:e|ed|ing)|regress(?:ed|ion|ing)?|fell|fall(?:ing)?|decreas(?:e|ed|ing)|lower|worse|slid|slipped)\b/i;
  if (!declineWording.test(text)) return false;

  const qualifiers = [
    // Negated decline: "no ... drop", "rather than ... regression".
    /\b(?:no|not|isn'?t|wasn'?t|without|rather than)\b.{0,45}\b(drop(?:ped|ping|s)?(?: off)?|drop-off|declin(?:e|ed|ing)?|decreas(?:e|ed|ing)?|regress(?:ed|ion|ing)?|fall(?:ing)?|fell|lower|worse|slid|slipped)\b/i,
    // "reps ... dropped"
    /\b(?:rep|reps)\b.{0,20}\b(drop(?:ped|ping|s)?(?: off)?|drop-off|slip(?:ped|ping)?|fell|fall(?:ing)?|lower|declin(?:e|ed|ing)?|decreas(?:e|ed|ing)?|worse)\b/i,
    // "dropped ... reps"
    /\b(drop(?:ped|ping|s)?(?: off)?|drop-off|slip(?:ped|ping)?|fell|fall(?:ing)?|lower|declin(?:e|ed|ing)?|decreas(?:e|ed|ing)?|worse)\b.{0,20}\b(?:rep|reps)\b/i
  ];
  return !qualifiers.some((qualifier) => qualifier.test(text));
}
|
|
1277
|
+
|
|
1278
|
+
// True when the (normalized) window uses improvement wording — improve,
// progress, stronger, increase, "went up", "load jump", ... — that is not
// excused by a nearby negation or by sitting within 20 characters before or
// after "rep(s)" (a rep increase is not a load increase).
function hasUnqualifiedImprovementLanguage(window) {
  const text = normalizeText(window);

  const improvementWording = /\b(improv(?:e|ed|ing|ement)|progress(?:ed|ing)?|stronger|increas(?:e|ed|ing)|moving up|went up|up from|load jump|jumped)\b/i;
  if (!improvementWording.test(text)) return false;

  const qualifiers = [
    // Negated improvement: "no ... progress", "without ... increase".
    /\b(?:no|not|isn'?t|wasn'?t|without|rather than)\b.{0,35}\b(improv(?:e|ed|ing|ement)?|progress(?:ed|ing)?|stronger|increas(?:e|ed|ing)?|moving up|went up|up from|load jump|jump(?:ed|ing)?)\b/i,
    // "reps ... improved"
    /\b(?:rep|reps)\b.{0,20}\b(improv(?:e|ed|ing|ement)?|increas(?:e|ed|ing)?|better|moving up|went up|up from|load jump|jump(?:ed|ing)?)\b/i,
    // "improved ... reps"
    /\b(improv(?:e|ed|ing|ement)?|increas(?:e|ed|ing)?|better|moving up|went up|up from|load jump|jump(?:ed|ing)?)\b.{0,20}\b(?:rep|reps)\b/i
  ];
  return !qualifiers.some((qualifier) => qualifier.test(text));
}
|
|
1287
|
+
|
|
1288
|
+
// True when a sentence opens with a back-reference ("That", "This", "It",
// "There", "Still", "The latest/top", "Top set", "Same load"), i.e. it reads
// as a continuation of whatever the previous sentence was talking about.
function isReferentialDirectionContinuation(sentence) {
  const referentialOpener = /^(?:that|this|it|there|still|the\s+(?:latest|top)|top\s+set|same\s+load)\b/i;
  return referentialOpener.test(sentence);
}
|
|
1291
|
+
|
|
1292
|
+
// Selects the sentences of `outputText` that talk about `exerciseName`.
// A sentence qualifies when its normalized form contains the normalized
// exercise name; the sentences that follow it are attached for as long as they
// open with a referential phrase and do not name any other exercise from
// `exerciseNames`. Result is de-duplicated in first-seen order.
// Fallbacks: every sentence when the exercise name normalizes to empty, and
// the whole text as a single window when no sentence mentions the exercise.
function directionEvaluationWindows(outputText, exerciseName, exerciseNames = []) {
  const target = normalizeExerciseName(exerciseName);
  const competitors = [...new Set(exerciseNames.map(normalizeExerciseName))]
    .filter((name) => name && name !== target);
  const sentences = outputText
    .split(/(?<=[.!?])\s+/)
    .map((sentence) => sentence.trim())
    .filter(Boolean);
  if (!target) return sentences;

  const mentionsCompetitor = (sentence) => {
    const normalizedSentence = normalizeExerciseName(sentence);
    return competitors.some((name) => normalizedSentence.includes(name));
  };

  const selected = new Set();
  sentences.forEach((sentence, position) => {
    if (!normalizeExerciseName(sentence).includes(target)) return;
    selected.add(sentence);
    let cursor = position + 1;
    while (cursor < sentences.length) {
      const follower = sentences[cursor];
      if (mentionsCompetitor(follower)) break;
      if (!isReferentialDirectionContinuation(follower)) break;
      selected.add(follower);
      cursor += 1;
    }
  });

  return selected.size > 0 ? [...selected] : [outputText];
}
|
|
1314
|
+
|
|
1315
|
+
// Eval check `ask_directional_consistency`: for each expectation in
// testCase.directionalConsistency, compares the top working-set load of the
// two most recent rows for that exercise and fails when the answer's wording
// contradicts the actual load direction (decline wording on an increase,
// improvement wording on a decrease, either on a flat load). A configured
// `loadDirection` that disagrees with the snapshot is itself reported.
// Passes trivially for non-ask surfaces, when nothing is configured, and for
// exercises with fewer than two rows of history.
function evaluateAskDirectionalConsistency(output, snapshot, testCase) {
  const key = 'ask_directional_consistency';
  if (testCase.surface !== 'ask') {
    return { key, passed: true, reason: 'Not an ask answer.' };
  }

  const expectations = Array.isArray(testCase.directionalConsistency)
    ? testCase.directionalConsistency
    : [];
  if (expectations.length === 0) {
    return { key, passed: true, reason: 'No directional assertions configured.' };
  }

  const topSetRows = askWorkingTopSetRows(snapshot);
  const answerText = normalizeText(output);
  const problems = [];

  for (const expectation of expectations) {
    const subject = expectation.exercise ?? expectation.exerciseName;
    const subjectKey = normalizeExerciseName(subject);
    const history = topSetRows.filter((row) => row.normalizedName === subjectKey);
    if (history.length < 2) continue;

    // Rows arrive newest-first, so the first two are latest and previous.
    const [latest, previous] = history;
    const loadDelta = latest.weight - previous.weight;
    let actualDirection = 'flat';
    if (loadDelta > 0) {
      actualDirection = 'up';
    } else if (loadDelta < 0) {
      actualDirection = 'down';
    }

    const expectedDirection = expectation.loadDirection ?? actualDirection;
    if (expectedDirection !== actualDirection) {
      problems.push(`Configured expected direction for ${latest.exerciseName} is ${expectedDirection}, but snapshot top-load direction is ${actualDirection}.`);
      continue;
    }

    const windows = directionEvaluationWindows(
      answerText,
      subject,
      topSetRows.map((row) => row.exerciseName)
    );
    switch (actualDirection) {
      case 'up':
        if (windows.some(hasUnqualifiedDeclineLanguage)) {
          problems.push(`Ask answer frames ${latest.exerciseName} as declining/drop-off even though top load increased from ${previous.weight} kg to ${latest.weight} kg.`);
        }
        break;
      case 'down':
        if (windows.some(hasUnqualifiedImprovementLanguage)) {
          problems.push(`Ask answer frames ${latest.exerciseName} as improving even though top load decreased from ${previous.weight} kg to ${latest.weight} kg.`);
        }
        break;
      default:
        if (windows.some((window) => hasUnqualifiedDeclineLanguage(window) || hasUnqualifiedImprovementLanguage(window))) {
          problems.push(`Ask answer invents a load direction for ${latest.exerciseName}, but top load was flat at ${latest.weight} kg.`);
        }
        break;
    }
  }

  const passed = problems.length === 0;
  return {
    key,
    passed,
    reason: passed
      ? 'Ask answer does not invert configured load directions.'
      : problems.join(' ')
  };
}
|
|
1369
|
+
|
|
1370
|
+
// Sessions the staleness check should look at. The exercise of interest comes
// from testCase.staleness (exercise / exerciseName) or, failing that, from the
// first directionalConsistency entry. With no exercise configured every
// snapshot session is relevant; otherwise only sessions containing an exercise
// whose normalized name matches.
function relevantSessionsForStaleness(snapshot, testCase) {
  const { staleness, directionalConsistency } = testCase;
  const firstExpectation = directionalConsistency?.[0];
  const configuredExercise = staleness?.exercise
    ?? staleness?.exerciseName
    ?? firstExpectation?.exercise
    ?? firstExpectation?.exerciseName
    ?? null;

  const sessions = snapshot?.sessions ?? [];
  if (!configuredExercise) return sessions;

  const wanted = normalizeExerciseName(configuredExercise);
  const includesWanted = (session) => {
    const exercises = session.exercises ?? [];
    return exercises.some((exercise) => normalizeExerciseName(exercise.name) === wanted);
  };
  return sessions.filter(includesWanted);
}
|
|
1381
|
+
|
|
1382
|
+
// Eval check `ask_staleness`: when the newest relevant session is older than
// testCase.staleness.maxRecentDays, the answer must not describe it as
// "recent"/"recently" unless it negates that wording or states the exact
// "<n> days ago" age.
//
// Passes trivially for non-ask surfaces, when no cutoff is configured, when
// the session age cannot be determined, or when the session is inside the
// window.
function evaluateAskStaleness(output, snapshot, testCase) {
  if (testCase.surface !== 'ask') {
    return { key: 'ask_staleness', passed: true, reason: 'Not an ask answer.' };
  }
  const maxRecentDays = testCase.staleness?.maxRecentDays;
  // Number(null) and Number('') are both 0, so check for "missing" explicitly;
  // otherwise a null cutoff silently becomes a 0-day recency window.
  const cutoffMissing = maxRecentDays == null || maxRecentDays === '';
  if (cutoffMissing || !Number.isFinite(Number(maxRecentDays))) {
    return { key: 'ask_staleness', passed: true, reason: 'No staleness assertion configured.' };
  }

  const latestSession = stableSortByDateDesc(relevantSessionsForStaleness(snapshot, testCase), (session) => session.completedAt ?? session.date)[0] ?? null;
  const daysAgo = daysAgoForEval(latestSession?.completedAt ?? latestSession?.date, testCase);
  if (daysAgo == null || daysAgo <= Number(maxRecentDays)) {
    return { key: 'ask_staleness', passed: true, reason: 'Latest session is inside the configured recency window.' };
  }

  const normalized = normalizeText(output);
  const claimsRecent = /\brecent(?:ly)?\b/i.test(normalized);
  // The claim pattern matches "recently", so the negation must too: "not
  // recently" is an explicit denial, not a recency claim.
  const explicitlyNotRecent = /\b(?:not|isn'?t|wasn'?t|no longer)\s+(?:a\s+)?recent(?:ly)?\b/i.test(normalized)
    || /\brecent\b.{0,20}\b(?:not|isn'?t|wasn'?t)\b/i.test(normalized);
  const includesAge = new RegExp(`\\b${daysAgo}\\s+days?\\s+ago\\b`, 'i').test(normalized);
  const passed = !claimsRecent || explicitlyNotRecent || includesAge;
  return {
    key: 'ask_staleness',
    passed,
    reason: passed
      ? 'Ask answer does not present stale sessions as simply recent.'
      : `Ask answer calls a ${daysAgo}-day-old session recent without the days-ago label.`
  };
}
|
|
1411
|
+
|
|
1152
1412
|
function evaluateAskClaims(output, snapshot, testCase) {
|
|
1153
1413
|
if (testCase.surface !== 'ask') {
|
|
1154
1414
|
return { key: 'ask_claims', passed: true, reason: 'Not an ask answer.' };
|
|
@@ -1234,14 +1494,195 @@ function evaluateAskClaims(output, snapshot, testCase) {
|
|
|
1234
1494
|
};
|
|
1235
1495
|
}
|
|
1236
1496
|
|
|
1237
|
-
function
|
|
1497
|
+
// Re-runs every distinct tool listed in context.routedMetadata.toolsUsed
// against the snapshot, using the per-tool parameters recorded in
// routedMetadata.toolParams (empty params when none were recorded).
// Returns the successful results plus one message per tool that threw, so a
// replay failure is reported instead of aborting the evaluation.
function routedToolResultsForEval(snapshot, context) {
  const metadata = context?.routedMetadata ?? {};
  const paramsByTool = metadata.toolParams ?? {};
  const toolNames = uniqueStrings(metadata.toolsUsed ?? []);

  const toolResults = [];
  const replayFailures = [];
  toolNames.forEach((toolName) => {
    try {
      const replayed = executeCoachReadTool(snapshot, toolName, paramsByTool[toolName] ?? {});
      toolResults.push(replayed);
    } catch (error) {
      replayFailures.push(`Could not replay routed tool ${toolName}: ${error?.message ?? String(error)}`);
    }
  });
  return { toolResults, replayFailures };
}
|
|
1511
|
+
|
|
1512
|
+
// Appends one normalized evidence row to `rows` for a tool result row.
// The exercise name is taken from row.exerciseName, then row.name, then the
// inherited value; rows whose name normalizes to empty are dropped. Recency
// fields fall back to `inherited` (the enclosing session-level row) and then
// to a neutral default; set-level fields come from the row only.
function addAskToolEvidenceRow(rows, toolName, row, inherited = {}) {
  const exerciseName = row?.exerciseName ?? row?.name ?? inherited.exerciseName ?? null;
  const normalizedName = normalizeExerciseName(exerciseName);
  if (!normalizedName) return;

  // Row value first, then the inherited session-level value, then the default.
  const resolve = (field, fallback) => row?.[field] ?? inherited[field] ?? fallback;

  const evidence = {
    toolName,
    exerciseName,
    normalizedName,
    date: resolve('date', null),
    daysAgo: resolve('daysAgo', null),
    recencyLabel: resolve('recencyLabel', null),
    isStale: resolve('isStale', false),
    recencyCutoffDays: resolve('recencyCutoffDays', null),
    warmupSetCount: row?.warmupSetCount ?? 0,
    workingSetCount: row?.workingSetCount ?? null,
    topSet: row?.topSet ?? null,
    comparedToPreviousSession: row?.comparedToPreviousSession ?? null,
    sets: Array.isArray(row?.sets) ? row.sets : []
  };
  rows.push(evidence);
}
|
|
1532
|
+
|
|
1533
|
+
// Flattens replayed tool results into per-exercise evidence rows.
// A result row carrying an `exercises` array is treated as a session: each
// exercise becomes its own evidence row and inherits the session's recency
// fields. Any other row is treated as an exercise-level row directly.
function askToolEvidenceRows(toolResults = []) {
  const evidence = [];
  for (const toolResult of toolResults) {
    for (const row of toolResult?.rows ?? []) {
      if (!Array.isArray(row?.exercises)) {
        addAskToolEvidenceRow(evidence, toolResult.toolName, row);
        continue;
      }
      row.exercises.forEach((exercise) => {
        const sessionRecency = {
          date: row.date,
          daysAgo: row.daysAgo,
          recencyLabel: row.recencyLabel,
          isStale: row.isStale,
          recencyCutoffDays: row.recencyCutoffDays
        };
        addAskToolEvidenceRow(evidence, toolResult.toolName, exercise, sessionRecency);
      });
    }
  }
  return evidence;
}
|
|
1554
|
+
|
|
1555
|
+
// Collects every finite weight that the evidence rows expose: each set's
// weight, the row's top-set weight, and the previous session's top-set weight,
// in that order per row. Values are coerced with Number().
function askToolEvidenceWeights(rows = []) {
  const collected = [];
  const keepIfFinite = (raw) => {
    const value = Number(raw);
    if (Number.isFinite(value)) collected.push(value);
  };

  rows.forEach((row) => {
    for (const set of row.sets ?? []) {
      keepIfFinite(set.weight);
    }
    keepIfFinite(row.topSet?.weight);
    keepIfFinite(row.comparedToPreviousSession?.previousTopSet?.weight);
  });
  return collected;
}
|
|
1569
|
+
|
|
1570
|
+
// Collects every { weight, reps } pair that the evidence rows expose: each
// set, the row's top set, and the previous session's top set, in that order
// per row. A pair is kept only when both numbers are finite after Number().
function askToolEvidenceSetPairs(rows = []) {
  const collected = [];
  const keepIfComplete = (source) => {
    const weight = Number(source?.weight);
    const reps = Number(source?.reps);
    if (Number.isFinite(weight) && Number.isFinite(reps)) collected.push({ weight, reps });
  };

  rows.forEach((row) => {
    for (const set of row.sets ?? []) {
      // Direct property access on purpose: a null set entry throws, as before.
      keepIfComplete({ weight: set.weight, reps: set.reps });
    }
    keepIfComplete(row.topSet);
    keepIfComplete(row.comparedToPreviousSession?.previousTopSet);
  });
  return collected;
}
|
|
1589
|
+
|
|
1590
|
+
// Whether a bare weight claim is backed by any weight found in the routed
// tool evidence rows. The matching rule itself lives in weightClaimSupported;
// the result is coerced to a strict boolean.
function toolEvidenceSupportsWeightClaim(claim, rows) {
  const evidenceWeights = askToolEvidenceWeights(rows);
  return weightClaimSupported(claim, evidenceWeights) ? true : false;
}
|
|
1594
|
+
|
|
1595
|
+
// Whether a "<weight> x <reps>" claim is backed by the routed tool evidence:
// some evidence pair must have the same reps and a weight within 0.01 of the
// claimed weight.
function toolEvidenceSupportsWeightedSetClaim(claim, rows) {
  const matchesClaim = ({ weight, reps }) => (
    Math.abs(weight - claim.weight) < 0.01 && reps === claim.reps
  );
  return askToolEvidenceSetPairs(rows).some(matchesClaim);
}
|
|
1603
|
+
|
|
1604
|
+
// Sort comparator that puts the most recent evidence row first.
// Rows with a known `daysAgo` are ordered by it (smaller = newer) and always
// precede rows whose age is unknown; when neither age is known, rows are
// ordered by `date` string, newest first.
//
// A missing age reaches this function as null or undefined, and Number(null)
// is 0, so plain Number() coercion would rank every unknown-age row as
// "0 days ago" and never reach the date fallback. Missing values are mapped
// to NaN first.
function compareToolEvidenceRecency(lhs, rhs) {
  const knownDaysAgo = (row) => {
    const raw = row?.daysAgo;
    if (raw == null || raw === '') return Number.NaN;
    return Number(raw);
  };

  const lhsDaysAgo = knownDaysAgo(lhs);
  const rhsDaysAgo = knownDaysAgo(rhs);
  if (Number.isFinite(lhsDaysAgo) && Number.isFinite(rhsDaysAgo)) return lhsDaysAgo - rhsDaysAgo;
  if (Number.isFinite(lhsDaysAgo)) return -1;
  if (Number.isFinite(rhsDaysAgo)) return 1;
  return String(rhs?.date ?? '').localeCompare(String(lhs?.date ?? ''));
}
|
|
1612
|
+
|
|
1613
|
+
// Most recent evidence row (per compareToolEvidenceRecency) among those
// accepted by `predicate`, or null when none qualifies. The caller's array is
// left untouched because filter() hands back a copy before sorting.
function newestToolEvidenceRow(rows = [], predicate = () => true) {
  const candidates = rows.filter(predicate);
  candidates.sort(compareToolEvidenceRecency);
  const [newest = null] = candidates;
  return newest;
}
|
|
1618
|
+
|
|
1619
|
+
// Most recent evidence row that carries a previous-session comparison with a
// load direction, or null when no row has one.
function latestComparableToolRow(rows = []) {
  const hasLoadDirection = (row) => row.comparedToPreviousSession?.loadDirection;
  const newestComparable = newestToolEvidenceRow(rows, hasLoadDirection);
  return newestComparable ?? null;
}
|
|
1622
|
+
|
|
1623
|
+
// True when the claim starts inside the [index, end) span of any weighted-set
// claim, i.e. the bare weight is just the weight part of a "weight x reps"
// claim that has already been evaluated.
function isWithinWeightedSetClaim(claim, weightedSetClaims) {
  for (const setClaim of weightedSetClaims) {
    const startsAtOrAfter = claim.index >= setClaim.index;
    const startsBeforeEnd = claim.index < setClaim.end;
    if (startsAtOrAfter && startsBeforeEnd) return true;
  }
  return false;
}
|
|
1626
|
+
|
|
1627
|
+
// Whether an evidence row counts as stale for this test case.
// The cutoff is testCase.staleness.maxRecentDays, falling back to the row's
// own recencyCutoffDays; the row is stale when its daysAgo exceeds the cutoff.
// When either number is unavailable the row's own `isStale` flag decides.
//
// Missing values arrive as null/undefined, and Number(null) is 0, so they are
// mapped to NaN before the finiteness check. Without that, a row of unknown
// age would be judged "0 days old" (never stale) and a row without a cutoff
// would be compared against 0 (always stale), both ignoring `isStale`.
function rowIsStaleForEval(row, testCase) {
  const numberOrNaN = (value) => (
    value == null || value === '' ? Number.NaN : Number(value)
  );

  const daysAgo = numberOrNaN(row?.daysAgo);
  const cutoff = numberOrNaN(testCase.staleness?.maxRecentDays ?? row?.recencyCutoffDays);
  if (!Number.isFinite(daysAgo) || !Number.isFinite(cutoff)) return Boolean(row?.isStale);
  return daysAgo > cutoff;
}
|
|
1633
|
+
|
|
1634
|
+
// True when the answer describes evidence as "recent"/"recently" without
// either negating that wording or stating the row's exact "<n> days ago" age.
// Intended for rows already judged stale; the staleness decision itself is
// made by the caller.
function outputCallsStaleEvidenceRecent(outputText, row) {
  const normalized = normalizeText(outputText);
  const claimsRecent = /\brecent(?:ly)?\b/i.test(normalized);
  if (!claimsRecent) return false;

  // The claim pattern matches "recently", so the negation must too: "not
  // recently" is an explicit denial, not a recency claim.
  const explicitlyNotRecent = /\b(?:not|isn'?t|wasn'?t|no longer)\s+(?:a\s+)?recent(?:ly)?\b/i.test(normalized)
    || /\brecent\b.{0,20}\b(?:not|isn'?t|wasn'?t)\b/i.test(normalized);
  if (explicitlyNotRecent) return false;

  // Number(null) is 0, which would turn an unknown age into a search for a
  // "0 days ago" label; treat a missing age as unknown instead.
  const rawDaysAgo = row?.daysAgo;
  const daysAgo = rawDaysAgo == null || rawDaysAgo === '' ? Number.NaN : Number(rawDaysAgo);
  if (!Number.isFinite(daysAgo)) return true;
  return !new RegExp(`\\b${daysAgo}\\s+days?\\s+ago\\b`, 'i').test(normalized);
}
|
|
1644
|
+
|
|
1645
|
+
// Narrows the sentence-level windows from directionEvaluationWindows down to
// clause level for recency checks. Each window is split on sentence
// punctuation, commas and the conjunctions while/whereas/but/and; for every
// clause naming the exercise, that clause plus the clauses after it (up to the
// first clause naming a different exercise) form one scoped window. A window
// with no clause naming the exercise is kept whole. Result is de-duplicated
// in first-seen order; the unscoped windows are returned when the exercise
// name normalizes to empty or nothing was collected.
function recencyEvaluationWindows(outputText, exerciseName, exerciseNames = []) {
  const target = normalizeExerciseName(exerciseName);
  const competitors = [...new Set(exerciseNames.map(normalizeExerciseName))]
    .filter((name) => name && name !== target);
  const windows = directionEvaluationWindows(outputText, exerciseName, exerciseNames);
  if (!target) return windows;

  const namesCompetitor = (clause) => {
    const normalizedClause = normalizeExerciseName(clause);
    return competitors.some((name) => normalizedClause.includes(name));
  };

  const scoped = [];
  for (const window of windows) {
    const clauses = window
      .split(/\s*(?:[.;:]|,\s+|\b(?:while|whereas|but|and)\b)\s*/i)
      .map((clause) => clause.trim())
      .filter(Boolean);

    const countBefore = scoped.length;
    clauses.forEach((clause, position) => {
      if (!normalizeExerciseName(clause).includes(target)) return;
      const parts = [clause];
      for (const follower of clauses.slice(position + 1)) {
        if (namesCompetitor(follower)) break;
        parts.push(follower);
      }
      scoped.push(parts.join(' '));
    });
    // No clause named the exercise: keep the window unsplit.
    if (scoped.length === countBefore) scoped.push(window);
  }
  return scoped.length > 0 ? [...new Set(scoped)] : windows;
}
|
|
1673
|
+
|
|
1674
|
+
function evaluateAskToolProvenance(output, context, testCase, snapshot) {
|
|
1238
1675
|
if (testCase.surface !== 'ask') {
|
|
1239
1676
|
return { key: 'ask_tool_provenance', passed: true, reason: 'Not an ask answer.' };
|
|
1240
1677
|
}
|
|
1241
1678
|
|
|
1242
1679
|
const routedMetadata = context?.routedMetadata ?? {};
|
|
1243
1680
|
const toolsUsed = new Set(routedMetadata.toolsUsed ?? []);
|
|
1244
|
-
const
|
|
1681
|
+
const { toolResults, replayFailures } = routedToolResultsForEval(snapshot, context);
|
|
1682
|
+
const evidenceRows = askToolEvidenceRows(toolResults);
|
|
1683
|
+
const mentionedExercises = findMentionedExercises(output, snapshot);
|
|
1684
|
+
const unroutedMentionNames = new Set();
|
|
1685
|
+
const failures = [...replayFailures];
|
|
1245
1686
|
for (const toolName of uniqueStrings(testCase.requiredTools)) {
|
|
1246
1687
|
if (!toolsUsed.has(toolName)) {
|
|
1247
1688
|
failures.push(`Expected routed Ask Coach context to use ${toolName}.`);
|
|
@@ -1252,6 +1693,74 @@ function evaluateAskToolProvenance(output, context, testCase) {
|
|
|
1252
1693
|
failures.push('Ask answer mentions e1RM/1RM, but routed context did not use get_records.');
|
|
1253
1694
|
}
|
|
1254
1695
|
|
|
1696
|
+
const weightedSetClaims = extractAskWeightedSetClaims(output);
|
|
1697
|
+
for (const claim of weightedSetClaims) {
|
|
1698
|
+
if (isEstimatedOneRepMaxWeightClaim(output, claim)) continue;
|
|
1699
|
+
const referencedExercise = findNearestMentionedExercise(mentionedExercises, claim.index);
|
|
1700
|
+
if (!referencedExercise) continue;
|
|
1701
|
+
const rows = evidenceRows.filter((row) => row.normalizedName === referencedExercise.normalizedName);
|
|
1702
|
+
if (rows.length === 0) {
|
|
1703
|
+
unroutedMentionNames.add(referencedExercise.normalizedName);
|
|
1704
|
+
failures.push(`Ask answer asserts ${claim.text} for ${referencedExercise.name}, but ${referencedExercise.name} was not present in routed tool outputs.`);
|
|
1705
|
+
continue;
|
|
1706
|
+
}
|
|
1707
|
+
if (!toolEvidenceSupportsWeightedSetClaim(claim, rows)) {
|
|
1708
|
+
failures.push(`Ask answer asserts ${claim.text} for ${referencedExercise.name}, but routed tool outputs for ${referencedExercise.name} did not include that weight/reps pair.`);
|
|
1709
|
+
}
|
|
1710
|
+
}
|
|
1711
|
+
|
|
1712
|
+
for (const claim of extractAskWeightClaims(output)) {
|
|
1713
|
+
if (isWithinWeightedSetClaim(claim, weightedSetClaims)) continue;
|
|
1714
|
+
if (isEstimatedOneRepMaxWeightClaim(output, claim)) continue;
|
|
1715
|
+
if (isVolumeWeightClaim(output, claim)) continue;
|
|
1716
|
+
const referencedExercise = findNearestMentionedExercise(mentionedExercises, claim.index);
|
|
1717
|
+
if (!referencedExercise) continue;
|
|
1718
|
+
const rows = evidenceRows.filter((row) => row.normalizedName === referencedExercise.normalizedName);
|
|
1719
|
+
if (rows.length === 0) {
|
|
1720
|
+
unroutedMentionNames.add(referencedExercise.normalizedName);
|
|
1721
|
+
failures.push(`Ask answer asserts ${claim.text} for ${referencedExercise.name}, but ${referencedExercise.name} was not present in routed tool outputs.`);
|
|
1722
|
+
continue;
|
|
1723
|
+
}
|
|
1724
|
+
if (!toolEvidenceSupportsWeightClaim(claim, rows)) {
|
|
1725
|
+
failures.push(`Ask answer asserts ${claim.text} for ${referencedExercise.name}, but routed tool outputs for ${referencedExercise.name} did not include that weight.`);
|
|
1726
|
+
}
|
|
1727
|
+
}
|
|
1728
|
+
|
|
1729
|
+
const exerciseNames = evidenceRows.map((row) => row.exerciseName);
|
|
1730
|
+
for (const mention of mentionedExercises) {
|
|
1731
|
+
const rows = evidenceRows.filter((row) => row.normalizedName === mention.normalizedName);
|
|
1732
|
+
if (rows.length === 0) {
|
|
1733
|
+
if (toolResults.length > 0 && !unroutedMentionNames.has(mention.normalizedName)) {
|
|
1734
|
+
unroutedMentionNames.add(mention.normalizedName);
|
|
1735
|
+
failures.push(`Ask answer mentions ${mention.name}, but ${mention.name} was not present in routed tool outputs.`);
|
|
1736
|
+
}
|
|
1737
|
+
continue;
|
|
1738
|
+
}
|
|
1739
|
+
const comparable = latestComparableToolRow(rows);
|
|
1740
|
+
if (comparable) {
|
|
1741
|
+
const direction = comparable.comparedToPreviousSession.loadDirection;
|
|
1742
|
+
const previous = comparable.comparedToPreviousSession.previousTopSet;
|
|
1743
|
+
const windows = directionEvaluationWindows(output, mention.name, exerciseNames);
|
|
1744
|
+
if (direction === 'up' && windows.some(hasUnqualifiedDeclineLanguage)) {
|
|
1745
|
+
failures.push(`Ask answer frames ${mention.name} as declining/drop-off, but routed ${comparable.toolName} evidence says top load increased from ${previous.weight} kg to ${comparable.topSet.weight} kg.`);
|
|
1746
|
+
}
|
|
1747
|
+
if (direction === 'down' && windows.some(hasUnqualifiedImprovementLanguage)) {
|
|
1748
|
+
failures.push(`Ask answer frames ${mention.name} as improving, but routed ${comparable.toolName} evidence says top load decreased from ${previous.weight} kg to ${comparable.topSet.weight} kg.`);
|
|
1749
|
+
}
|
|
1750
|
+
if (direction === 'flat' && windows.some((window) => hasUnqualifiedDeclineLanguage(window) || hasUnqualifiedImprovementLanguage(window))) {
|
|
1751
|
+
failures.push(`Ask answer invents a load direction for ${mention.name}, but routed ${comparable.toolName} evidence says top load was flat at ${comparable.topSet.weight} kg.`);
|
|
1752
|
+
}
|
|
1753
|
+
}
|
|
1754
|
+
|
|
1755
|
+
const latestDatedRow = newestToolEvidenceRow(rows, (row) => row.daysAgo != null);
|
|
1756
|
+
if (latestDatedRow && rowIsStaleForEval(latestDatedRow, testCase)) {
|
|
1757
|
+
const windows = recencyEvaluationWindows(output, mention.name, exerciseNames);
|
|
1758
|
+
if (windows.some((window) => outputCallsStaleEvidenceRecent(window, latestDatedRow))) {
|
|
1759
|
+
failures.push(`Ask answer calls ${mention.name} recent, but routed tool evidence says the latest relevant session was ${latestDatedRow.daysAgo} days ago.`);
|
|
1760
|
+
}
|
|
1761
|
+
}
|
|
1762
|
+
}
|
|
1763
|
+
|
|
1255
1764
|
return {
|
|
1256
1765
|
key: 'ask_tool_provenance',
|
|
1257
1766
|
passed: failures.length === 0,
|
|
@@ -1537,7 +2046,9 @@ export function evaluateSummaryOutputFromSnapshot(testCase, snapshot, output) {
|
|
|
1537
2046
|
evaluateExerciseMentions(output, snapshot, context, testCase.surface, testCase),
|
|
1538
2047
|
evaluateWorkoutClaims(output, context, testCase),
|
|
1539
2048
|
evaluateAskClaims(output, snapshot, testCase),
|
|
1540
|
-
|
|
2049
|
+
evaluateAskDirectionalConsistency(output, snapshot, testCase),
|
|
2050
|
+
evaluateAskStaleness(output, snapshot, testCase),
|
|
2051
|
+
evaluateAskToolProvenance(output, context, testCase, snapshot),
|
|
1541
2052
|
evaluateScoreCommentaryAction(output, context, testCase),
|
|
1542
2053
|
evaluateScoreCommentarySynthesis(output, context, testCase),
|
|
1543
2054
|
evaluateScoreCommentaryExerciseInvention(output, snapshot, context, testCase),
|
|
@@ -1567,12 +2078,14 @@ export async function runSummaryEvalCaseFromSnapshot(testCase, snapshot) {
|
|
|
1567
2078
|
return evaluateSummaryOutputFromSnapshot(testCase, snapshot, output);
|
|
1568
2079
|
}
|
|
1569
2080
|
|
|
1570
|
-
function genericForbiddenPhrasesForSurface(surface) {
|
|
2081
|
+
export function genericForbiddenPhrasesForSurface(surface) {
|
|
1571
2082
|
switch (surface) {
|
|
1572
2083
|
case 'workout':
|
|
1573
2084
|
return ['solid progress', 'trust the process', 'keep it up', 'quality work', 'in a great place', 'continue progressive overload', 'as fatigue accumulates'];
|
|
1574
2085
|
case 'cycle':
|
|
1575
|
-
|
|
2086
|
+
// 'solid first week' enforces the FIRST_WEEK_CYCLE_PROMPT's "do not say
|
|
2087
|
+
// solid first week" rule, which was previously prompt-only (unguarded).
|
|
2088
|
+
return ['solid progress', 'trust the process', 'in a great place', 'continue progressive overload', 'as fatigue accumulates', 'solid session', 'quality work', 'solid first week'];
|
|
1576
2089
|
case 'checkpoint':
|
|
1577
2090
|
return ['solid progress', 'quality work', 'trust the process', 'in a great place'];
|
|
1578
2091
|
case 'vitals':
|