incremnt 0.8.1 → 0.8.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +6 -1
- package/src/ask-answer-verifier.js +249 -14
- package/src/ask-coach.js +309 -21
- package/src/openrouter.js +55 -30
- package/src/promptfoo-evals.js +20 -3
- package/src/queries.js +113 -18
- package/src/score-prelude.js +16 -13
- package/src/summary-evals.js +106 -474
- package/src/sync-service.js +46 -11
package/src/summary-evals.js
CHANGED
|
@@ -6,6 +6,7 @@ import {
|
|
|
6
6
|
canonicalExerciseName,
|
|
7
7
|
checkpointContext,
|
|
8
8
|
cycleSummaryContext,
|
|
9
|
+
dateOnlyString,
|
|
9
10
|
executeCoachReadTool,
|
|
10
11
|
normalizeExerciseName,
|
|
11
12
|
workoutSummaryContext,
|
|
@@ -24,12 +25,30 @@ import {
|
|
|
24
25
|
import { computeScoreBand } from './score-context.js';
|
|
25
26
|
import { stripXMLTagBlocks } from './prompt-security.js';
|
|
26
27
|
import { extractAskProgramDraft, hasProgramDraftBlock } from './program-draft.js';
|
|
28
|
+
import { findAskAnswerExerciseMentions, verifyAskAnswer } from './ask-answer-verifier.js';
|
|
27
29
|
|
|
28
30
|
const __filename = fileURLToPath(import.meta.url);
|
|
29
31
|
const __dirname = path.dirname(__filename);
|
|
30
32
|
|
|
31
33
|
export const summaryEvalFixturesRoot = path.resolve(__dirname, '../test/fixtures/summary-evals');
|
|
32
34
|
|
|
35
|
+
const ASK_PROVENANCE_VERIFIER_KEYS = new Set([
|
|
36
|
+
'unsupported_weight_claim',
|
|
37
|
+
'unrouted_weight_claim',
|
|
38
|
+
'unsupported_weighted_set_claim',
|
|
39
|
+
'unrouted_weighted_set_claim',
|
|
40
|
+
'e1rm_without_records',
|
|
41
|
+
'direction_inversion',
|
|
42
|
+
'tool_replay_failed'
|
|
43
|
+
]);
|
|
44
|
+
|
|
45
|
+
const ASK_CLAIM_VERIFIER_KEYS = new Set([
|
|
46
|
+
'target_hit_contradiction',
|
|
47
|
+
'target_hit_without_session_evidence',
|
|
48
|
+
'clean_consistency_contradiction',
|
|
49
|
+
'unsupported_fatigue_recovery'
|
|
50
|
+
]);
|
|
51
|
+
|
|
33
52
|
export function defaultCaseSetName() {
|
|
34
53
|
return process.env.SUMMARY_EVAL_CASE_SET || 'synthetic';
|
|
35
54
|
}
|
|
@@ -108,7 +127,8 @@ export function buildSummaryEvalContext(snapshot, testCase) {
|
|
|
108
127
|
// prelude to the routed context. Including it here means a live eval feeds
|
|
109
128
|
// the model the same dump-prone material, so evaluateAskScoreVoice actually
|
|
110
129
|
// guards the prompt, not just the checker.
|
|
111
|
-
const
|
|
130
|
+
const responseProfile = routed?.metadata?.responseProfile ?? routed?.metadata?.intent?.responseProfile ?? testCase.context?.responseProfile;
|
|
131
|
+
const prelude = formatIncrementScorePrelude(scoreHistoryFromSnapshot(snapshot), { question, responseProfile });
|
|
112
132
|
const routedContext = routed?.context ?? null;
|
|
113
133
|
const trainingData = testCase.context?.trainingData
|
|
114
134
|
?? (prelude && routedContext ? `${prelude}\n\n${routedContext}` : (routedContext ?? prelude));
|
|
@@ -222,6 +242,49 @@ function normalizeText(value) {
|
|
|
222
242
|
return String(value ?? '').trim();
|
|
223
243
|
}
|
|
224
244
|
|
|
245
|
+
const askVerifierCacheKey = Symbol('summaryEvalAskVerifierCache');
|
|
246
|
+
|
|
247
|
+
export function summaryEvalAskVerifierToday(testCase) {
|
|
248
|
+
return dateOnlyString(testCase.context?.today ?? testCase.today ?? new Date());
|
|
249
|
+
}
|
|
250
|
+
|
|
251
|
+
function getAskVerifierResult(output, context, snapshot, testCase) {
|
|
252
|
+
const answer = normalizeText(output);
|
|
253
|
+
const routingMetadata = context?.routedMetadata ?? testCase.context?.routedMetadata ?? {};
|
|
254
|
+
const today = summaryEvalAskVerifierToday(testCase);
|
|
255
|
+
const exclude = testCase.exclude ?? [];
|
|
256
|
+
const cacheKey = `${answer}\n${JSON.stringify(routingMetadata)}\n${today}\n${JSON.stringify(exclude)}`;
|
|
257
|
+
if (context && typeof context === 'object') {
|
|
258
|
+
if (!context[askVerifierCacheKey]) {
|
|
259
|
+
Object.defineProperty(context, askVerifierCacheKey, {
|
|
260
|
+
value: new Map(),
|
|
261
|
+
enumerable: false
|
|
262
|
+
});
|
|
263
|
+
}
|
|
264
|
+
const cached = context[askVerifierCacheKey].get(cacheKey);
|
|
265
|
+
if (cached) return cached;
|
|
266
|
+
}
|
|
267
|
+
const verification = verifyAskAnswer({
|
|
268
|
+
answer,
|
|
269
|
+
snapshot,
|
|
270
|
+
routingMetadata,
|
|
271
|
+
today,
|
|
272
|
+
exclude,
|
|
273
|
+
strictMentionProvenance: false
|
|
274
|
+
});
|
|
275
|
+
const result = {
|
|
276
|
+
verification,
|
|
277
|
+
provenanceFailures: (verification.blockingFailures ?? [])
|
|
278
|
+
.filter((failure) => ASK_PROVENANCE_VERIFIER_KEYS.has(failure.key)),
|
|
279
|
+
claimFailures: (verification.blockingFailures ?? [])
|
|
280
|
+
.filter((failure) => ASK_CLAIM_VERIFIER_KEYS.has(failure.key))
|
|
281
|
+
};
|
|
282
|
+
if (context && typeof context === 'object') {
|
|
283
|
+
context[askVerifierCacheKey].set(cacheKey, result);
|
|
284
|
+
}
|
|
285
|
+
return result;
|
|
286
|
+
}
|
|
287
|
+
|
|
225
288
|
function parseJsonOutput(output) {
|
|
226
289
|
const normalized = normalizeText(output);
|
|
227
290
|
if (!normalized) return null;
|
|
@@ -697,7 +760,7 @@ function evaluateNoInsight(output, testCase) {
|
|
|
697
760
|
};
|
|
698
761
|
}
|
|
699
762
|
|
|
700
|
-
function evaluateShape(output, testCase) {
|
|
763
|
+
function evaluateShape(output, testCase, context = null) {
|
|
701
764
|
const normalized = testCase.surface === 'scoreCommentary'
|
|
702
765
|
? scoreCommentaryText(output)
|
|
703
766
|
: normalizeText(output);
|
|
@@ -768,12 +831,20 @@ function evaluateShape(output, testCase) {
|
|
|
768
831
|
reasons.push(`Checkpoint summaries must be 2-3 paragraphs, got ${paragraphs}.`);
|
|
769
832
|
}
|
|
770
833
|
break;
|
|
771
|
-
case 'ask':
|
|
772
|
-
|
|
834
|
+
case 'ask': {
|
|
835
|
+
// Expansive answers are intentionally richer; the old 12-sentence cap was
|
|
836
|
+
// the pre-expansive policy. Allow more for expansive (still bounded so a
|
|
837
|
+
// genuine wall of text is flagged), keep the tight cap for defensive.
|
|
838
|
+
const profile = context?.routedMetadata?.responseProfile
|
|
839
|
+
?? context?.routedMetadata?.intent?.responseProfile
|
|
840
|
+
?? askResponseProfileFromTestCase(testCase);
|
|
841
|
+
const maxAskSentences = profile === 'expansive' ? 20 : 12;
|
|
842
|
+
if (sentences < 1 || sentences > maxAskSentences) {
|
|
773
843
|
passed = false;
|
|
774
|
-
reasons.push(`Ask-coach answers must be 1
|
|
844
|
+
reasons.push(`Ask-coach answers must be 1-${maxAskSentences} sentences, got ${sentences}.`);
|
|
775
845
|
}
|
|
776
846
|
break;
|
|
847
|
+
}
|
|
777
848
|
case 'scoreCommentary':
|
|
778
849
|
if (sentences < 1 || sentences > 8) {
|
|
779
850
|
passed = false;
|
|
@@ -888,45 +959,6 @@ function hasFatigueLanguage(output) {
|
|
|
888
959
|
return /\b(fatigue|fatigued|underrecovered|recovery debt|fatigue ceiling|limited by recovery|limited by fatigue|accumulated fatigue)\b/i.test(output);
|
|
889
960
|
}
|
|
890
961
|
|
|
891
|
-
function hasAskFatigueRecoveryLanguage(output) {
|
|
892
|
-
return hasFatigueLanguage(output)
|
|
893
|
-
|| /\b(?:poor|low|bad|incomplete)\s+recovery\b/i.test(output)
|
|
894
|
-
|| /\bunder[-\s]?recovery\b/i.test(output)
|
|
895
|
-
|| /\brecovery\s+(?:limited|held back|caused|explains|drove|deficit|issue|problem)\b/i.test(output);
|
|
896
|
-
}
|
|
897
|
-
|
|
898
|
-
function hasAskFatigueRecoveryUncertaintyLanguage(output) {
|
|
899
|
-
const missingRecoveryData = /\b(?:no|not enough|without|missing|lack(?:ing)?|insufficient)\s+(?:\w+\s+){0,4}?(?:recovery|readiness|vitals?|sleep|hrv|heart rate|data|info|signals?|metrics?)\b/i.test(output);
|
|
900
|
-
const refusesInference = /\b(?:cannot|can't|do not|don't|does not|doesn't|would not|wouldn't|not enough|isn't enough|is not enough|no basis to|hard to)\s+(?:\w+\s+){0,12}?(?:infer|tie|connect|attribute|blame|claim|say|show|prove|know|call)\s+(?:\w+\s+){0,12}?(?:fatigue|recovery|readiness|why)\b/i.test(output);
|
|
901
|
-
const recoveryDoesNotExplain = /\b(?:fatigue|recovery|readiness)\b\s+(?:\w+\s+){0,10}?(?:cannot|can't|does not|doesn't|would not|wouldn't|isn't|is not)\s+(?:\w+\s+){0,10}?(?:explain|prove|show|tell|account for)\b/i.test(output);
|
|
902
|
-
return missingRecoveryData || refusesInference || recoveryDoesNotExplain;
|
|
903
|
-
}
|
|
904
|
-
|
|
905
|
-
function hasAskPositiveFatigueRecoveryAttribution(output) {
|
|
906
|
-
const concept = String.raw`(?:fatigue|fatigued|under[-\s]?recovered|under[-\s]?recovery|poor recovery|low recovery|incomplete recovery|recovery debt|fatigue ceiling|accumulated fatigue)`;
|
|
907
|
-
const causeVerb = String.raw`(?:because|due to|caused by|from|reflects?|suggests?|indicates?|points? to|explains?|limited|held back|drove|contributed to|tied to|tie\s+\w+\s+to)`;
|
|
908
|
-
const patterns = [
|
|
909
|
-
new RegExp(String.raw`\b${causeVerb}\b.{0,80}\b${concept}\b`, 'gi'),
|
|
910
|
-
new RegExp(String.raw`\b${concept}\b.{0,80}\b(?:caused|limited|held back|explains?|drove|led to|contributed to|accounts? for)\b`, 'gi')
|
|
911
|
-
];
|
|
912
|
-
for (const pattern of patterns) {
|
|
913
|
-
for (const match of output.matchAll(pattern)) {
|
|
914
|
-
const start = Math.max(0, (match.index ?? 0) - 40);
|
|
915
|
-
const window = output.slice(start, (match.index ?? 0) + match[0].length);
|
|
916
|
-
if (!/\b(?:not|no|cannot|can't|doesn't|does not|would not|wouldn't|isn't|is not)\b/i.test(window)) {
|
|
917
|
-
return true;
|
|
918
|
-
}
|
|
919
|
-
}
|
|
920
|
-
}
|
|
921
|
-
return false;
|
|
922
|
-
}
|
|
923
|
-
|
|
924
|
-
function hasUnsupportedAskFatigueRecoveryClaim(output) {
|
|
925
|
-
if (!hasAskFatigueRecoveryLanguage(output)) return false;
|
|
926
|
-
if (hasAskPositiveFatigueRecoveryAttribution(output)) return true;
|
|
927
|
-
return !hasAskFatigueRecoveryUncertaintyLanguage(output);
|
|
928
|
-
}
|
|
929
|
-
|
|
930
962
|
function matchesHistoricalFamilyName(claimName, actualName) {
|
|
931
963
|
const claimVariants = new Set(historicalExerciseVariants(claimName));
|
|
932
964
|
const actualVariants = new Set(historicalExerciseVariants(actualName));
|
|
@@ -1086,37 +1118,6 @@ function evaluateWorkoutClaims(output, context, testCase) {
|
|
|
1086
1118
|
};
|
|
1087
1119
|
}
|
|
1088
1120
|
|
|
1089
|
-
function extractAskTargetHitClaims(text) {
|
|
1090
|
-
const claims = [];
|
|
1091
|
-
const patterns = [
|
|
1092
|
-
/\b(?:you\s+)?hit(?:ting)?\s+all\s+(?:your\s+)?target(?:ed)?\s+reps?\b/gi,
|
|
1093
|
-
/\b(?:you\s+)?hit\s+all\s+(?:the\s+)?targets?\b/gi,
|
|
1094
|
-
/\b(?:you\s+)?hit\s+(?:the|your)\s+target\b(?!\s+(?:of|for|on))/gi
|
|
1095
|
-
];
|
|
1096
|
-
for (const pattern of patterns) {
|
|
1097
|
-
for (const match of text.matchAll(pattern)) {
|
|
1098
|
-
claims.push({ text: match[0] });
|
|
1099
|
-
}
|
|
1100
|
-
}
|
|
1101
|
-
return claims;
|
|
1102
|
-
}
|
|
1103
|
-
|
|
1104
|
-
function extractAskCleanConsistencyClaims(text) {
|
|
1105
|
-
const claims = [];
|
|
1106
|
-
const patterns = [
|
|
1107
|
-
/\bclean,\s+consistent\b/gi,
|
|
1108
|
-
/\bclean\s+and\s+consistent\b/gi,
|
|
1109
|
-
/\bconsistent\s+set\s+of\s+work\b/gi,
|
|
1110
|
-
/\bacross\s+the\s+board\b/gi
|
|
1111
|
-
];
|
|
1112
|
-
for (const pattern of patterns) {
|
|
1113
|
-
for (const match of text.matchAll(pattern)) {
|
|
1114
|
-
claims.push({ text: match[0] });
|
|
1115
|
-
}
|
|
1116
|
-
}
|
|
1117
|
-
return claims;
|
|
1118
|
-
}
|
|
1119
|
-
|
|
1120
1121
|
function extractAskPlannedListClaims(text) {
|
|
1121
1122
|
const claims = [];
|
|
1122
1123
|
const pattern = /\((\s*\d+(?:\s*,\s*\d+){2,})\s+planned\s*\)/gi;
|
|
@@ -1137,77 +1138,6 @@ function sessionPlannedReps(session) {
|
|
|
1137
1138
|
return values;
|
|
1138
1139
|
}
|
|
1139
1140
|
|
|
1140
|
-
function findMentionedExercises(text, snapshot) {
|
|
1141
|
-
const exercisesByName = new Map();
|
|
1142
|
-
for (const session of snapshot?.sessions ?? []) {
|
|
1143
|
-
for (const exercise of session.exercises ?? []) {
|
|
1144
|
-
if (!exercise?.name) continue;
|
|
1145
|
-
const normalizedName = normalizeExerciseName(exercise.name);
|
|
1146
|
-
if (!normalizedName || exercisesByName.has(normalizedName)) continue;
|
|
1147
|
-
exercisesByName.set(normalizedName, exercise.name);
|
|
1148
|
-
}
|
|
1149
|
-
for (const exercise of session.prescriptionSnapshot?.exercises ?? []) {
|
|
1150
|
-
if (!exercise?.exerciseName) continue;
|
|
1151
|
-
const normalizedName = normalizeExerciseName(exercise.exerciseName);
|
|
1152
|
-
if (!normalizedName || exercisesByName.has(normalizedName)) continue;
|
|
1153
|
-
exercisesByName.set(normalizedName, exercise.exerciseName);
|
|
1154
|
-
}
|
|
1155
|
-
}
|
|
1156
|
-
|
|
1157
|
-
const mentions = [];
|
|
1158
|
-
for (const [normalizedName, displayName] of exercisesByName) {
|
|
1159
|
-
const pattern = new RegExp(`\\b${escapeRegExp(displayName)}\\b`, 'gi');
|
|
1160
|
-
for (const match of text.matchAll(pattern)) {
|
|
1161
|
-
mentions.push({
|
|
1162
|
-
index: match.index ?? -1,
|
|
1163
|
-
end: (match.index ?? -1) + match[0].length,
|
|
1164
|
-
name: displayName,
|
|
1165
|
-
normalizedName
|
|
1166
|
-
});
|
|
1167
|
-
}
|
|
1168
|
-
}
|
|
1169
|
-
return mentions
|
|
1170
|
-
.filter((mention, index, allMentions) => !allMentions.some((candidate, candidateIndex) =>
|
|
1171
|
-
candidateIndex !== index &&
|
|
1172
|
-
candidate.index <= mention.index &&
|
|
1173
|
-
candidate.end >= mention.end &&
|
|
1174
|
-
candidate.normalizedName.length > mention.normalizedName.length
|
|
1175
|
-
))
|
|
1176
|
-
.sort((lhs, rhs) => lhs.index - rhs.index);
|
|
1177
|
-
}
|
|
1178
|
-
|
|
1179
|
-
function findRecentSessionMisses(snapshot, { lookbackDays = 7, exerciseNames = null } = {}) {
|
|
1180
|
-
const sessions = snapshot?.sessions ?? [];
|
|
1181
|
-
const cutoff = Date.now() - lookbackDays * 24 * 60 * 60 * 1000;
|
|
1182
|
-
const scopedExerciseNames = exerciseNames && exerciseNames.length > 0 ? new Set(exerciseNames) : null;
|
|
1183
|
-
const misses = [];
|
|
1184
|
-
for (const session of sessions) {
|
|
1185
|
-
const completedAt = session.completedAt || session.date;
|
|
1186
|
-
const completedTime = Date.parse(completedAt);
|
|
1187
|
-
if (!Number.isFinite(completedTime) || completedTime < cutoff) continue;
|
|
1188
|
-
const targetByExercise = new Map();
|
|
1189
|
-
for (const planned of session.prescriptionSnapshot?.exercises ?? []) {
|
|
1190
|
-
const target = Number(planned.targetReps);
|
|
1191
|
-
if (Number.isFinite(target) && target > 0) {
|
|
1192
|
-
targetByExercise.set(normalizeExerciseName(planned.exerciseName), target);
|
|
1193
|
-
}
|
|
1194
|
-
}
|
|
1195
|
-
for (const exercise of session.exercises ?? []) {
|
|
1196
|
-
const normalizedExerciseName = normalizeExerciseName(exercise.name);
|
|
1197
|
-
if (scopedExerciseNames && !scopedExerciseNames.has(normalizedExerciseName)) continue;
|
|
1198
|
-
const target = targetByExercise.get(normalizedExerciseName);
|
|
1199
|
-
if (!Number.isFinite(target)) continue;
|
|
1200
|
-
for (const set of exercise.sets ?? []) {
|
|
1201
|
-
const reps = Number(set.reps);
|
|
1202
|
-
if (set.isComplete && Number.isFinite(reps) && reps < target) {
|
|
1203
|
-
misses.push({ sessionId: session.id, exerciseName: exercise.name, reps, target });
|
|
1204
|
-
}
|
|
1205
|
-
}
|
|
1206
|
-
}
|
|
1207
|
-
}
|
|
1208
|
-
return misses;
|
|
1209
|
-
}
|
|
1210
|
-
|
|
1211
1141
|
function findNearestMentionedExercise(mentions, index) {
|
|
1212
1142
|
let candidate = null;
|
|
1213
1143
|
for (const mention of mentions) {
|
|
@@ -1217,170 +1147,6 @@ function findNearestMentionedExercise(mentions, index) {
|
|
|
1217
1147
|
return candidate;
|
|
1218
1148
|
}
|
|
1219
1149
|
|
|
1220
|
-
function hasAskFatigueSupport(snapshot, lookbackDays = 7) {
|
|
1221
|
-
const cutoff = Date.now() - lookbackDays * 24 * 60 * 60 * 1000;
|
|
1222
|
-
const withinCutoff = (dateValue) => {
|
|
1223
|
-
const ms = Date.parse(dateValue);
|
|
1224
|
-
return Number.isFinite(ms) && ms >= cutoff;
|
|
1225
|
-
};
|
|
1226
|
-
|
|
1227
|
-
const vitalsSummaries = snapshot?.vitalsSummaries ?? [];
|
|
1228
|
-
if (vitalsSummaries.some((entry) => withinCutoff(entry.date))) return true;
|
|
1229
|
-
|
|
1230
|
-
const metrics = snapshot?.healthMetrics ?? {};
|
|
1231
|
-
for (const key of ['restingHR', 'hrv', 'sleep']) {
|
|
1232
|
-
const readings = Array.isArray(metrics[key]) ? metrics[key] : [];
|
|
1233
|
-
if (readings.some((reading) => withinCutoff(reading.date))) return true;
|
|
1234
|
-
}
|
|
1235
|
-
|
|
1236
|
-
for (const session of snapshot?.sessions ?? []) {
|
|
1237
|
-
const completedAt = session.completedAt || session.date;
|
|
1238
|
-
if (!withinCutoff(completedAt)) continue;
|
|
1239
|
-
for (const exercise of session.exercises ?? []) {
|
|
1240
|
-
const reps = (exercise.sets ?? [])
|
|
1241
|
-
.map((set) => Number(set.reps))
|
|
1242
|
-
.filter((value) => Number.isFinite(value) && value > 0);
|
|
1243
|
-
if (reps.length < 2) continue;
|
|
1244
|
-
const first = reps[0];
|
|
1245
|
-
const last = reps[reps.length - 1];
|
|
1246
|
-
if (first > 0 && (first - last) / first >= 0.3) return true;
|
|
1247
|
-
}
|
|
1248
|
-
}
|
|
1249
|
-
|
|
1250
|
-
return false;
|
|
1251
|
-
}
|
|
1252
|
-
|
|
1253
|
-
function parseWeightNumber(raw) {
|
|
1254
|
-
return Number(String(raw).replace(/,/g, ''));
|
|
1255
|
-
}
|
|
1256
|
-
|
|
1257
|
-
function extractAskWeightClaims(text) {
|
|
1258
|
-
const claims = [];
|
|
1259
|
-
// Accept comma-grouped thousands ("40,500 kg") as a single number so volume
|
|
1260
|
-
// figures are not shredded into bogus "500 kg" / "000 kg" claims. Volume/total
|
|
1261
|
-
// figures are excluded by isVolumeWeightClaim at the call sites, not by a
|
|
1262
|
-
// magnitude cap — heavy machine work (leg press, sled) legitimately exceeds
|
|
1263
|
-
// 1000 kg, and a fabricated heavy load must still be graded.
|
|
1264
|
-
const pattern = /\b(\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d+(?:\.\d+)?)\s*(?:kg|kilograms?)\b/gi;
|
|
1265
|
-
for (const match of text.matchAll(pattern)) {
|
|
1266
|
-
claims.push({
|
|
1267
|
-
text: match[0],
|
|
1268
|
-
value: parseWeightNumber(match[1]),
|
|
1269
|
-
index: match.index ?? -1
|
|
1270
|
-
});
|
|
1271
|
-
}
|
|
1272
|
-
return claims;
|
|
1273
|
-
}
|
|
1274
|
-
|
|
1275
|
-
function extractAskWeightedSetClaims(text) {
|
|
1276
|
-
const claims = [];
|
|
1277
|
-
// A weight×reps pair is only unambiguous with "x"/"×" (e.g. "70 kg x 5"), or
|
|
1278
|
-
// an explicit "for N rep(s)". Bare "X kg for N" is NOT a rep claim — N is
|
|
1279
|
-
// almost always a SET count ("70 kg for 4 working sets") or a duration, and
|
|
1280
|
-
// treating it as reps flags real data as a fabricated pair. So match only the
|
|
1281
|
-
// unambiguous forms; the plain-weight loop still grounds the weight itself.
|
|
1282
|
-
const pattern = /\b(\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d+(?:\.\d+)?)\s*(?:kg|kilograms?)\s*(?:(?:x|×)\s*(\d+)|for\s+(\d+)\s*reps?)\b/gi;
|
|
1283
|
-
for (const match of text.matchAll(pattern)) {
|
|
1284
|
-
claims.push({
|
|
1285
|
-
text: match[0],
|
|
1286
|
-
weight: parseWeightNumber(match[1]),
|
|
1287
|
-
reps: Number(match[2] ?? match[3]),
|
|
1288
|
-
index: match.index ?? -1,
|
|
1289
|
-
end: (match.index ?? -1) + match[0].length
|
|
1290
|
-
});
|
|
1291
|
-
}
|
|
1292
|
-
return claims;
|
|
1293
|
-
}
|
|
1294
|
-
|
|
1295
|
-
function allowedWeightsForExercise(snapshot, normalizedExerciseName) {
|
|
1296
|
-
const weights = [];
|
|
1297
|
-
for (const session of snapshot?.sessions ?? []) {
|
|
1298
|
-
for (const exercise of session.exercises ?? []) {
|
|
1299
|
-
if (normalizeExerciseName(exercise.name) !== normalizedExerciseName) continue;
|
|
1300
|
-
for (const set of exercise.sets ?? []) {
|
|
1301
|
-
const weight = Number(set.weight);
|
|
1302
|
-
if (Number.isFinite(weight)) weights.push(weight);
|
|
1303
|
-
}
|
|
1304
|
-
}
|
|
1305
|
-
for (const exercise of session.prescriptionSnapshot?.exercises ?? []) {
|
|
1306
|
-
if (normalizeExerciseName(exercise.exerciseName) !== normalizedExerciseName) continue;
|
|
1307
|
-
const targetWeight = Number(exercise.targetWeight);
|
|
1308
|
-
if (Number.isFinite(targetWeight)) weights.push(targetWeight);
|
|
1309
|
-
for (const targetSet of exercise.targetSets ?? []) {
|
|
1310
|
-
const weight = Number(targetSet.weight ?? targetSet.targetWeight);
|
|
1311
|
-
if (Number.isFinite(weight)) weights.push(weight);
|
|
1312
|
-
}
|
|
1313
|
-
}
|
|
1314
|
-
}
|
|
1315
|
-
return weights;
|
|
1316
|
-
}
|
|
1317
|
-
|
|
1318
|
-
function weightClaimSupported(claim, allowedWeights) {
|
|
1319
|
-
return allowedWeights.some((weight) => Math.abs(weight - claim.value) < 0.01);
|
|
1320
|
-
}
|
|
1321
|
-
|
|
1322
|
-
function isEstimatedOneRepMaxWeightClaim(text, claim) {
|
|
1323
|
-
const start = Math.max(0, claim.index - 40);
|
|
1324
|
-
const end = Math.min(text.length, claim.index + claim.text.length + 40);
|
|
1325
|
-
const window = text.slice(start, end);
|
|
1326
|
-
return /\b(?:estimated\s+)?(?:1rm|one[-\s]?rep\s+max)\b/i.test(window);
|
|
1327
|
-
}
|
|
1328
|
-
|
|
1329
|
-
function isVolumeWeightClaim(text, claim) {
|
|
1330
|
-
// A kg figure in a clause about volume/tonnage/total load is a workload total
|
|
1331
|
-
// (e.g. "weekly strength volume fell from 44,000 kg to 40,500 kg"), not an
|
|
1332
|
-
// exercise load. Scope to the claim's clause so a fabricated exercise load
|
|
1333
|
-
// earlier in the same sentence is still graded.
|
|
1334
|
-
return /\b(?:volume|tonnage|total\s+(?:load|work|volume|tonnage))\b/i.test(claimClause(text, claim));
|
|
1335
|
-
}
|
|
1336
|
-
|
|
1337
|
-
function claimClause(text, claim) {
|
|
1338
|
-
const boundaries = [
|
|
1339
|
-
'\n',
|
|
1340
|
-
'. ',
|
|
1341
|
-
';',
|
|
1342
|
-
', while',
|
|
1343
|
-
', whereas',
|
|
1344
|
-
', but',
|
|
1345
|
-
' while ',
|
|
1346
|
-
' whereas ',
|
|
1347
|
-
' but '
|
|
1348
|
-
];
|
|
1349
|
-
let start = 0;
|
|
1350
|
-
for (const boundary of boundaries) {
|
|
1351
|
-
const index = text.lastIndexOf(boundary, claim.index);
|
|
1352
|
-
if (index >= 0) start = Math.max(start, index + boundary.length);
|
|
1353
|
-
}
|
|
1354
|
-
|
|
1355
|
-
let end = text.length;
|
|
1356
|
-
for (const boundary of boundaries) {
|
|
1357
|
-
const index = text.indexOf(boundary, claim.index + claim.text.length);
|
|
1358
|
-
if (index >= 0) end = Math.min(end, index);
|
|
1359
|
-
}
|
|
1360
|
-
return text.slice(start, end);
|
|
1361
|
-
}
|
|
1362
|
-
|
|
1363
|
-
// Returns the sentence containing the claim, so context guards can look at the
|
|
1364
|
-
// whole clause rather than a fixed-width window (body-weight phrasing can put
|
|
1365
|
-
// the "body weight" anchor well before the kg figure).
|
|
1366
|
-
function claimSentence(text, claim) {
|
|
1367
|
-
const before = text.slice(0, claim.index);
|
|
1368
|
-
const startBreak = Math.max(before.lastIndexOf('. '), before.lastIndexOf('\n'));
|
|
1369
|
-
const start = startBreak >= 0 ? startBreak + 1 : 0;
|
|
1370
|
-
const after = text.slice(claim.index);
|
|
1371
|
-
const breaks = [after.indexOf('. '), after.indexOf('\n')].filter((i) => i >= 0);
|
|
1372
|
-
const end = breaks.length ? claim.index + Math.min(...breaks) : text.length;
|
|
1373
|
-
return text.slice(start, end);
|
|
1374
|
-
}
|
|
1375
|
-
|
|
1376
|
-
// Body-weight figures ("body weight is up 0.6 kg", "80.0 kg latest") are not
|
|
1377
|
-
// exercise-load claims. findNearestMentionedExercise would otherwise attribute
|
|
1378
|
-
// them to the previously named lift and flag a correct answer as a
|
|
1379
|
-
// hallucination, so skip any kg figure stated in a body-weight clause.
|
|
1380
|
-
function isBodyWeightClaim(text, claim) {
|
|
1381
|
-
return /\bbody\s*weight\b|\bbodyweight\b|\bweight\s+trend\b/i.test(claimSentence(text, claim));
|
|
1382
|
-
}
|
|
1383
|
-
|
|
1384
1150
|
function askWorkingTopSetRows(snapshot) {
|
|
1385
1151
|
const rows = [];
|
|
1386
1152
|
for (const session of stableSortByDateDesc(snapshot?.sessions ?? [], (session) => session.completedAt ?? session.date)) {
|
|
@@ -1529,7 +1295,7 @@ const SCORE_NUMBER = '\\d{1,3}(?:\\.\\d+)?';
|
|
|
1529
1295
|
// number directly followed by one of these is left alone.
|
|
1530
1296
|
const NON_SCORE_UNIT =
|
|
1531
1297
|
'(?:kg|kilo|lbs?|pounds?|reps?|sets?|%|percent|pct|x\\b|for\\s+\\d|sessions?|days?|nights?|weeks?|months?|' +
|
|
1532
|
-
'years?|yrs?|hrs?|hours?|mins?|minutes?|secs?|seconds?|rpe|rir|am|pm|out\\s+of|of\\b|/\\s*\\d)';
|
|
1298
|
+
'years?|yrs?|h\\b|hrs?|hours?|mins?|minutes?|secs?|seconds?|bpm|ms|rpe|rir|am|pm|out\\s+of|of\\b|/\\s*\\d)';
|
|
1533
1299
|
|
|
1534
1300
|
// Heuristic, not a parser. Flags a component name followed — within a short,
|
|
1535
1301
|
// period/newline-free gap (one clause) — by a score-like number that is not a
|
|
@@ -1632,11 +1398,19 @@ function evaluateAskSelfReference(output, testCase) {
|
|
|
1632
1398
|
// On a question that is not about the Increment Score, the coach must not
|
|
1633
1399
|
// volunteer the bare overall score number (e.g. "your score is 92/100"). The
|
|
1634
1400
|
// prelude withholds the number for non-score questions; this guards the answer.
|
|
1635
|
-
function evaluateAskVolunteeredScore(output, testCase) {
|
|
1401
|
+
function evaluateAskVolunteeredScore(output, testCase, context = {}) {
|
|
1636
1402
|
if (testCase.surface !== 'ask') {
|
|
1637
1403
|
return { key: 'ask_volunteered_score', passed: true, reason: 'Not an ask answer.' };
|
|
1638
1404
|
}
|
|
1639
1405
|
const question = testCase.context?.question ?? testCase.question ?? '';
|
|
1406
|
+
const responseProfile = context?.routedMetadata?.responseProfile
|
|
1407
|
+
?? context?.routedMetadata?.intent?.responseProfile
|
|
1408
|
+
?? testCase.context?.routedMetadata?.responseProfile
|
|
1409
|
+
?? testCase.context?.routedMetadata?.intent?.responseProfile
|
|
1410
|
+
?? testCase.context?.responseProfile;
|
|
1411
|
+
if (responseProfile === 'expansive') {
|
|
1412
|
+
return { key: 'ask_volunteered_score', passed: true, reason: 'Expansive Ask answers may name the rounded score headline.' };
|
|
1413
|
+
}
|
|
1640
1414
|
if (isScoreQuestion(question)) {
|
|
1641
1415
|
return { key: 'ask_volunteered_score', passed: true, reason: 'Question is about the score; naming it is allowed.' };
|
|
1642
1416
|
}
|
|
@@ -1682,7 +1456,14 @@ function evaluateAskStaleness(output, snapshot, testCase) {
|
|
|
1682
1456
|
};
|
|
1683
1457
|
}
|
|
1684
1458
|
|
|
1685
|
-
function evaluateAskClaims(output, snapshot, testCase) {
|
|
1459
|
+
function askResponseProfileFromTestCase(testCase) {
|
|
1460
|
+
return testCase?.context?.routedMetadata?.responseProfile
|
|
1461
|
+
?? testCase?.context?.routedMetadata?.intent?.responseProfile
|
|
1462
|
+
?? testCase?.context?.responseProfile
|
|
1463
|
+
?? null;
|
|
1464
|
+
}
|
|
1465
|
+
|
|
1466
|
+
function evaluateAskClaims(output, snapshot, testCase, context = null) {
|
|
1686
1467
|
if (testCase.surface !== 'ask') {
|
|
1687
1468
|
return { key: 'ask_claims', passed: true, reason: 'Not an ask answer.' };
|
|
1688
1469
|
}
|
|
@@ -1693,26 +1474,9 @@ function evaluateAskClaims(output, snapshot, testCase) {
|
|
|
1693
1474
|
}
|
|
1694
1475
|
|
|
1695
1476
|
const failures = [];
|
|
1696
|
-
const
|
|
1697
|
-
|
|
1698
|
-
|
|
1699
|
-
const targetHitClaims = extractAskTargetHitClaims(normalized);
|
|
1700
|
-
if (targetHitClaims.length > 0) {
|
|
1701
|
-
const misses = findRecentSessionMisses(snapshot, { exerciseNames: scopedExerciseNames });
|
|
1702
|
-
if (misses.length > 0) {
|
|
1703
|
-
const sample = misses[0];
|
|
1704
|
-
failures.push(`Ask answer claims targets hit ("${targetHitClaims[0].text}") but recent session ${sample.sessionId} has ${sample.exerciseName} at ${sample.reps} reps below target ${sample.target}.`);
|
|
1705
|
-
}
|
|
1706
|
-
}
|
|
1707
|
-
|
|
1708
|
-
const cleanConsistencyClaims = extractAskCleanConsistencyClaims(normalized);
|
|
1709
|
-
if (cleanConsistencyClaims.length > 0) {
|
|
1710
|
-
const misses = findRecentSessionMisses(snapshot, { exerciseNames: scopedExerciseNames });
|
|
1711
|
-
if (misses.length > 0) {
|
|
1712
|
-
const sample = misses[0];
|
|
1713
|
-
failures.push(`Ask answer frames missed target reps as "${cleanConsistencyClaims[0].text}", but recent session ${sample.sessionId} has ${sample.exerciseName} at ${sample.reps} reps below target ${sample.target}.`);
|
|
1714
|
-
}
|
|
1715
|
-
}
|
|
1477
|
+
const { claimFailures } = getAskVerifierResult(output, context, snapshot, testCase);
|
|
1478
|
+
failures.push(...claimFailures.map((failure) => failure.reason));
|
|
1479
|
+
const mentionedExercises = findAskAnswerExerciseMentions(output, snapshot);
|
|
1716
1480
|
|
|
1717
1481
|
for (const claim of extractAskPlannedListClaims(normalized)) {
|
|
1718
1482
|
const uniquePlanned = new Set(claim.reps);
|
|
@@ -1742,23 +1506,6 @@ function evaluateAskClaims(output, snapshot, testCase) {
|
|
|
1742
1506
|
}
|
|
1743
1507
|
}
|
|
1744
1508
|
|
|
1745
|
-
for (const claim of extractAskWeightClaims(normalized)) {
|
|
1746
|
-
if (isEstimatedOneRepMaxWeightClaim(normalized, claim)) continue;
|
|
1747
|
-
if (isVolumeWeightClaim(normalized, claim)) continue;
|
|
1748
|
-
if (isBodyWeightClaim(normalized, claim)) continue;
|
|
1749
|
-
const referencedExercise = findNearestMentionedExercise(mentionedExercises, claim.index);
|
|
1750
|
-
if (!referencedExercise) continue;
|
|
1751
|
-
const allowedWeights = allowedWeightsForExercise(snapshot, referencedExercise.normalizedName);
|
|
1752
|
-
if (allowedWeights.length === 0) continue;
|
|
1753
|
-
if (!weightClaimSupported(claim, allowedWeights)) {
|
|
1754
|
-
failures.push(`Ask answer asserts ${claim.text} for ${referencedExercise.name}, but that weight is not present in recorded or planned sets for that exercise.`);
|
|
1755
|
-
}
|
|
1756
|
-
}
|
|
1757
|
-
|
|
1758
|
-
if (hasUnsupportedAskFatigueRecoveryClaim(normalized) && !hasAskFatigueSupport(snapshot)) {
|
|
1759
|
-
failures.push('Ask answer uses fatigue/recovery language but the snapshot has no recent vitals, sleep, or rep-dropoff signals to support it.');
|
|
1760
|
-
}
|
|
1761
|
-
|
|
1762
1509
|
return {
|
|
1763
1510
|
key: 'ask_claims',
|
|
1764
1511
|
passed: failures.length === 0,
|
|
@@ -1826,55 +1573,6 @@ function askToolEvidenceRows(toolResults = []) {
|
|
|
1826
1573
|
return rows;
|
|
1827
1574
|
}
|
|
1828
1575
|
|
|
1829
|
-
function askToolEvidenceWeights(rows = []) {
|
|
1830
|
-
const weights = [];
|
|
1831
|
-
for (const row of rows) {
|
|
1832
|
-
for (const set of row.sets ?? []) {
|
|
1833
|
-
const weight = Number(set.weight);
|
|
1834
|
-
if (Number.isFinite(weight)) weights.push(weight);
|
|
1835
|
-
}
|
|
1836
|
-
const topWeight = Number(row.topSet?.weight);
|
|
1837
|
-
if (Number.isFinite(topWeight)) weights.push(topWeight);
|
|
1838
|
-
const previousTopWeight = Number(row.comparedToPreviousSession?.previousTopSet?.weight);
|
|
1839
|
-
if (Number.isFinite(previousTopWeight)) weights.push(previousTopWeight);
|
|
1840
|
-
}
|
|
1841
|
-
return weights;
|
|
1842
|
-
}
|
|
1843
|
-
|
|
1844
|
-
function askToolEvidenceSetPairs(rows = []) {
|
|
1845
|
-
const pairs = [];
|
|
1846
|
-
for (const row of rows) {
|
|
1847
|
-
for (const set of row.sets ?? []) {
|
|
1848
|
-
const weight = Number(set.weight);
|
|
1849
|
-
const reps = Number(set.reps);
|
|
1850
|
-
if (Number.isFinite(weight) && Number.isFinite(reps)) pairs.push({ weight, reps });
|
|
1851
|
-
}
|
|
1852
|
-
const topWeight = Number(row.topSet?.weight);
|
|
1853
|
-
const topReps = Number(row.topSet?.reps);
|
|
1854
|
-
if (Number.isFinite(topWeight) && Number.isFinite(topReps)) pairs.push({ weight: topWeight, reps: topReps });
|
|
1855
|
-
const previousTopWeight = Number(row.comparedToPreviousSession?.previousTopSet?.weight);
|
|
1856
|
-
const previousTopReps = Number(row.comparedToPreviousSession?.previousTopSet?.reps);
|
|
1857
|
-
if (Number.isFinite(previousTopWeight) && Number.isFinite(previousTopReps)) {
|
|
1858
|
-
pairs.push({ weight: previousTopWeight, reps: previousTopReps });
|
|
1859
|
-
}
|
|
1860
|
-
}
|
|
1861
|
-
return pairs;
|
|
1862
|
-
}
|
|
1863
|
-
|
|
1864
|
-
function toolEvidenceSupportsWeightClaim(claim, rows) {
|
|
1865
|
-
if (weightClaimSupported(claim, askToolEvidenceWeights(rows))) return true;
|
|
1866
|
-
return false;
|
|
1867
|
-
}
|
|
1868
|
-
|
|
1869
|
-
function toolEvidenceSupportsWeightedSetClaim(claim, rows) {
|
|
1870
|
-
if (askToolEvidenceSetPairs(rows).some((pair) => (
|
|
1871
|
-
Math.abs(pair.weight - claim.weight) < 0.01 && pair.reps === claim.reps
|
|
1872
|
-
))) {
|
|
1873
|
-
return true;
|
|
1874
|
-
}
|
|
1875
|
-
return false;
|
|
1876
|
-
}
|
|
1877
|
-
|
|
1878
1576
|
function compareToolEvidenceRecency(lhs, rhs) {
|
|
1879
1577
|
const lhsDaysAgo = Number(lhs?.daysAgo);
|
|
1880
1578
|
const rhsDaysAgo = Number(rhs?.daysAgo);
|
|
@@ -1890,14 +1588,6 @@ function newestToolEvidenceRow(rows = [], predicate = () => true) {
|
|
|
1890
1588
|
.sort(compareToolEvidenceRecency)[0] ?? null;
|
|
1891
1589
|
}
|
|
1892
1590
|
|
|
1893
|
-
function latestComparableToolRow(rows = []) {
|
|
1894
|
-
return newestToolEvidenceRow(rows, (row) => row.comparedToPreviousSession?.loadDirection) ?? null;
|
|
1895
|
-
}
|
|
1896
|
-
|
|
1897
|
-
function isWithinWeightedSetClaim(claim, weightedSetClaims) {
|
|
1898
|
-
return weightedSetClaims.some((setClaim) => claim.index >= setClaim.index && claim.index < setClaim.end);
|
|
1899
|
-
}
|
|
1900
|
-
|
|
1901
1591
|
function rowIsStaleForEval(row, testCase) {
|
|
1902
1592
|
const daysAgo = Number(row?.daysAgo);
|
|
1903
1593
|
const cutoff = Number(testCase.staleness?.maxRecentDays ?? row?.recencyCutoffDays);
|
|
@@ -1952,80 +1642,22 @@ function evaluateAskToolProvenance(output, context, testCase, snapshot) {
|
|
|
1952
1642
|
|
|
1953
1643
|
const routedMetadata = context?.routedMetadata ?? {};
|
|
1954
1644
|
const toolsUsed = new Set(routedMetadata.toolsUsed ?? []);
|
|
1955
|
-
const { toolResults
|
|
1645
|
+
const { toolResults } = routedToolResultsForEval(snapshot, context);
|
|
1956
1646
|
const evidenceRows = askToolEvidenceRows(toolResults);
|
|
1957
|
-
const
|
|
1958
|
-
const unroutedMentionNames = new Set();
|
|
1959
|
-
const failures = [...replayFailures];
|
|
1647
|
+
const failures = [];
|
|
1960
1648
|
for (const toolName of uniqueStrings(testCase.requiredTools)) {
|
|
1961
1649
|
if (!toolsUsed.has(toolName)) {
|
|
1962
1650
|
failures.push(`Expected routed Ask Coach context to use ${toolName}.`);
|
|
1963
1651
|
}
|
|
1964
1652
|
}
|
|
1965
1653
|
|
|
1966
|
-
|
|
1967
|
-
|
|
1968
|
-
}
|
|
1969
|
-
|
|
1970
|
-
const weightedSetClaims = extractAskWeightedSetClaims(output);
|
|
1971
|
-
for (const claim of weightedSetClaims) {
|
|
1972
|
-
if (isEstimatedOneRepMaxWeightClaim(output, claim)) continue;
|
|
1973
|
-
const referencedExercise = findNearestMentionedExercise(mentionedExercises, claim.index);
|
|
1974
|
-
if (!referencedExercise) continue;
|
|
1975
|
-
const rows = evidenceRows.filter((row) => row.normalizedName === referencedExercise.normalizedName);
|
|
1976
|
-
if (rows.length === 0) {
|
|
1977
|
-
unroutedMentionNames.add(referencedExercise.normalizedName);
|
|
1978
|
-
failures.push(`Ask answer asserts ${claim.text} for ${referencedExercise.name}, but ${referencedExercise.name} was not present in routed tool outputs.`);
|
|
1979
|
-
continue;
|
|
1980
|
-
}
|
|
1981
|
-
if (!toolEvidenceSupportsWeightedSetClaim(claim, rows)) {
|
|
1982
|
-
failures.push(`Ask answer asserts ${claim.text} for ${referencedExercise.name}, but routed tool outputs for ${referencedExercise.name} did not include that weight/reps pair.`);
|
|
1983
|
-
}
|
|
1984
|
-
}
|
|
1985
|
-
|
|
1986
|
-
for (const claim of extractAskWeightClaims(output)) {
|
|
1987
|
-
if (isWithinWeightedSetClaim(claim, weightedSetClaims)) continue;
|
|
1988
|
-
if (isEstimatedOneRepMaxWeightClaim(output, claim)) continue;
|
|
1989
|
-
if (isVolumeWeightClaim(output, claim)) continue;
|
|
1990
|
-
if (isBodyWeightClaim(output, claim)) continue;
|
|
1991
|
-
const referencedExercise = findNearestMentionedExercise(mentionedExercises, claim.index);
|
|
1992
|
-
if (!referencedExercise) continue;
|
|
1993
|
-
const rows = evidenceRows.filter((row) => row.normalizedName === referencedExercise.normalizedName);
|
|
1994
|
-
if (rows.length === 0) {
|
|
1995
|
-
unroutedMentionNames.add(referencedExercise.normalizedName);
|
|
1996
|
-
failures.push(`Ask answer asserts ${claim.text} for ${referencedExercise.name}, but ${referencedExercise.name} was not present in routed tool outputs.`);
|
|
1997
|
-
continue;
|
|
1998
|
-
}
|
|
1999
|
-
if (!toolEvidenceSupportsWeightClaim(claim, rows)) {
|
|
2000
|
-
failures.push(`Ask answer asserts ${claim.text} for ${referencedExercise.name}, but routed tool outputs for ${referencedExercise.name} did not include that weight.`);
|
|
2001
|
-
}
|
|
2002
|
-
}
|
|
1654
|
+
const { provenanceFailures } = getAskVerifierResult(output, context, snapshot, testCase);
|
|
1655
|
+
failures.push(...provenanceFailures.map((failure) => failure.reason));
|
|
2003
1656
|
|
|
2004
1657
|
const exerciseNames = evidenceRows.map((row) => row.exerciseName);
|
|
2005
|
-
for (const mention of
|
|
1658
|
+
for (const mention of findAskAnswerExerciseMentions(output, snapshot)) {
|
|
2006
1659
|
const rows = evidenceRows.filter((row) => row.normalizedName === mention.normalizedName);
|
|
2007
|
-
if (rows.length === 0)
|
|
2008
|
-
if (toolResults.length > 0 && !unroutedMentionNames.has(mention.normalizedName)) {
|
|
2009
|
-
unroutedMentionNames.add(mention.normalizedName);
|
|
2010
|
-
failures.push(`Ask answer mentions ${mention.name}, but ${mention.name} was not present in routed tool outputs.`);
|
|
2011
|
-
}
|
|
2012
|
-
continue;
|
|
2013
|
-
}
|
|
2014
|
-
const comparable = latestComparableToolRow(rows);
|
|
2015
|
-
if (comparable) {
|
|
2016
|
-
const direction = comparable.comparedToPreviousSession.loadDirection;
|
|
2017
|
-
const previous = comparable.comparedToPreviousSession.previousTopSet;
|
|
2018
|
-
const windows = directionEvaluationWindows(output, mention.name, exerciseNames);
|
|
2019
|
-
if (direction === 'up' && windows.some(hasUnqualifiedDeclineLanguage)) {
|
|
2020
|
-
failures.push(`Ask answer frames ${mention.name} as declining/drop-off, but routed ${comparable.toolName} evidence says top load increased from ${previous.weight} kg to ${comparable.topSet.weight} kg.`);
|
|
2021
|
-
}
|
|
2022
|
-
if (direction === 'down' && windows.some(hasUnqualifiedImprovementLanguage)) {
|
|
2023
|
-
failures.push(`Ask answer frames ${mention.name} as improving, but routed ${comparable.toolName} evidence says top load decreased from ${previous.weight} kg to ${comparable.topSet.weight} kg.`);
|
|
2024
|
-
}
|
|
2025
|
-
if (direction === 'flat' && windows.some((window) => hasUnqualifiedDeclineLanguage(window) || hasUnqualifiedImprovementLanguage(window))) {
|
|
2026
|
-
failures.push(`Ask answer invents a load direction for ${mention.name}, but routed ${comparable.toolName} evidence says top load was flat at ${comparable.topSet.weight} kg.`);
|
|
2027
|
-
}
|
|
2028
|
-
}
|
|
1660
|
+
if (rows.length === 0) continue;
|
|
2029
1661
|
|
|
2030
1662
|
const latestDatedRow = newestToolEvidenceRow(rows, (row) => row.daysAgo != null);
|
|
2031
1663
|
if (latestDatedRow && rowIsStaleForEval(latestDatedRow, testCase)) {
|
|
@@ -2665,18 +2297,18 @@ export function evaluateSummaryOutputFromSnapshot(testCase, snapshot, output) {
|
|
|
2665
2297
|
|
|
2666
2298
|
const checks = [
|
|
2667
2299
|
evaluateNoInsight(visibleOutput, testCase),
|
|
2668
|
-
evaluateShape(visibleOutput, testCase),
|
|
2300
|
+
evaluateShape(visibleOutput, testCase, context),
|
|
2669
2301
|
evaluateRequiredMentions(visibleOutput, testCase),
|
|
2670
2302
|
evaluateAnyOfMentions(visibleOutput, testCase),
|
|
2671
2303
|
evaluateForbiddenPhrases(visibleOutput, testCase),
|
|
2672
2304
|
evaluateForbiddenMentions(visibleOutput, testCase),
|
|
2673
2305
|
evaluateExerciseMentions(visibleOutput, snapshot, context, testCase.surface, testCase),
|
|
2674
2306
|
evaluateWorkoutClaims(visibleOutput, context, testCase),
|
|
2675
|
-
evaluateAskClaims(visibleOutput, snapshot, testCase),
|
|
2307
|
+
evaluateAskClaims(visibleOutput, snapshot, testCase, context),
|
|
2676
2308
|
evaluateAskDirectionalConsistency(visibleOutput, snapshot, testCase),
|
|
2677
2309
|
evaluateAskScoreVoice(visibleOutput, testCase),
|
|
2678
2310
|
evaluateAskSelfReference(visibleOutput, testCase),
|
|
2679
|
-
evaluateAskVolunteeredScore(visibleOutput, testCase),
|
|
2311
|
+
evaluateAskVolunteeredScore(visibleOutput, testCase, context),
|
|
2680
2312
|
evaluateAskStaleness(visibleOutput, snapshot, testCase),
|
|
2681
2313
|
evaluateAskToolProvenance(visibleOutput, context, testCase, snapshot),
|
|
2682
2314
|
evaluateFormulaVersion(visibleOutput, snapshot, testCase),
|