incremnt 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +6 -1
- package/src/contract.js +37 -1
- package/src/format.js +5 -0
- package/src/openrouter.js +81 -24
- package/src/prompt-security.js +13 -0
- package/src/queries.js +190 -25
- package/src/remote.js +98 -1
- package/src/stored-summary-eval-report.js +138 -0
- package/src/summary-evals.js +839 -0
- package/src/sync-service.js +370 -39
- package/src/workout-prompt-variants.js +52 -0
|
@@ -0,0 +1,839 @@
|
|
|
1
|
+
import { readFile, readdir } from 'node:fs/promises';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
import { fileURLToPath } from 'node:url';
|
|
4
|
+
import {
|
|
5
|
+
checkpointContext,
|
|
6
|
+
cycleSummaryContext,
|
|
7
|
+
normalizeExerciseName,
|
|
8
|
+
workoutSummaryContext,
|
|
9
|
+
vitalsSummaryContext
|
|
10
|
+
} from './queries.js';
|
|
11
|
+
import {
|
|
12
|
+
generateCheckpointSummary,
|
|
13
|
+
generateCoachingSummary,
|
|
14
|
+
generateVitalsSummary,
|
|
15
|
+
generateWorkoutCoachingSummary
|
|
16
|
+
} from './openrouter.js';
|
|
17
|
+
|
|
18
|
+
// ES modules have no built-in __filename/__dirname, so derive both from this module's URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

/** Absolute path of the directory holding the summary-eval fixture sets, resolved relative to this module. */
export const summaryEvalFixturesRoot = path.resolve(__dirname, '..', 'test', 'fixtures', 'summary-evals');
|
|
22
|
+
|
|
23
|
+
/**
 * Name of the fixture case set used when a caller does not choose one.
 *
 * @returns {string} The `SUMMARY_EVAL_CASE_SET` environment variable, or
 *   'synthetic' when it is unset or empty.
 */
export function defaultCaseSetName() {
  const { SUMMARY_EVAL_CASE_SET: configured } = process.env;
  // Truthiness check on purpose: an empty variable falls back as well.
  return configured ? configured : 'synthetic';
}
|
|
26
|
+
|
|
27
|
+
/**
 * Return a copy of `items` ordered newest-first by a date-like string key.
 *
 * Keys are compared as strings, so they are expected to sort chronologically
 * in string order. Items whose key is null/undefined are placed last instead
 * of being stringified to "undefined"/"null", which would otherwise sort ahead
 * of every real date in descending order.
 *
 * @param {Array<object>} items - Records to sort; the input array is not mutated.
 * @param {(item: object) => unknown} selector - Extracts the sort key from an item.
 * @returns {Array<object>} New array sorted by key, descending.
 */
function stableSortByDateDesc(items, selector) {
  const sortKey = (item) => {
    const value = selector(item);
    // An empty string compares lower than any date, so missing keys end up last.
    return value == null ? '' : String(value);
  };
  return [...items].sort((lhs, rhs) => sortKey(rhs).localeCompare(sortKey(lhs)));
}
|
|
30
|
+
|
|
31
|
+
/**
 * Load every JSON eval case of a case set from `<fixtures>/<caseSet>/cases`.
 *
 * Files are read in sorted filename order. Each parsed case is tagged with
 * the `caseSet` it came from and its `fixtureFile` name; those two fields
 * override same-named keys in the JSON.
 *
 * @param {string} [caseSet] - Case set directory name; defaults to `defaultCaseSetName()`.
 * @returns {Promise<object[]>} Parsed cases, or an empty array when the cases directory does not exist.
 * @throws Any read error other than ENOENT on the directory, and JSON parse errors.
 */
export async function loadSummaryEvalCases(caseSet = defaultCaseSetName()) {
  const casesDir = path.join(summaryEvalFixturesRoot, caseSet, 'cases');

  let entries;
  try {
    entries = await readdir(casesDir);
  } catch (error) {
    // A missing case set is an expected situation; anything else is a real failure.
    if (error?.code !== 'ENOENT') {
      throw error;
    }
    return [];
  }

  const jsonFiles = entries.filter((entry) => entry.endsWith('.json')).sort();
  const cases = [];
  for (const fixtureFile of jsonFiles) {
    const raw = await readFile(path.join(casesDir, fixtureFile), 'utf8');
    cases.push({ ...JSON.parse(raw), caseSet, fixtureFile });
  }
  return cases;
}
|
|
56
|
+
|
|
57
|
+
/**
 * Read and parse the snapshot JSON referenced by an eval case.
 *
 * The file is looked up at `<fixtures>/<testCase.caseSet>/snapshots/<testCase.snapshotFile>`.
 *
 * @param {object} testCase - Eval case carrying `snapshotFile` and `caseSet`.
 * @returns {Promise<object>} The parsed snapshot.
 * @throws {Error} When the case has no `snapshotFile`; read and parse errors propagate.
 */
export async function loadSummaryEvalSnapshot(testCase) {
  const { snapshotFile, caseSet } = testCase;
  if (!snapshotFile) {
    throw new Error(`Eval case ${testCase.id} is missing snapshotFile`);
  }
  const raw = await readFile(
    path.join(summaryEvalFixturesRoot, caseSet, 'snapshots', snapshotFile),
    'utf8'
  );
  return JSON.parse(raw);
}
|
|
64
|
+
|
|
65
|
+
/**
 * Build the prompt context for an eval case by dispatching on its surface.
 *
 * Every context builder receives `{ exclude }`, a Set made from
 * `testCase.exclude` (empty when absent). Selector fields are optional;
 * a missing `programId` is passed as null.
 *
 * @param {object} snapshot - Snapshot handed to the context builders.
 * @param {object} testCase - Eval case with `surface`, optional `selector` and `exclude`.
 * @returns {*} Whatever the surface's context builder returns.
 * @throws {Error} For a surface other than workout, cycle, checkpoint or vitals.
 */
export function buildSummaryEvalContext(snapshot, testCase) {
  const { surface, selector } = testCase;
  // Built lazily so an unsupported surface fails before `exclude` is touched.
  const buildOptions = () => ({ exclude: new Set(testCase.exclude ?? []) });

  if (surface === 'workout') {
    return workoutSummaryContext(snapshot, selector?.sessionId, buildOptions());
  }
  if (surface === 'cycle') {
    return cycleSummaryContext(snapshot, selector?.programId ?? null, buildOptions());
  }
  if (surface === 'checkpoint') {
    const programId = selector?.programId ?? null;
    return checkpointContext(snapshot, programId, selector?.checkpointWeek, buildOptions());
  }
  if (surface === 'vitals') {
    return vitalsSummaryContext(snapshot, buildOptions());
  }
  throw new Error(`Unsupported summary eval surface: ${testCase.surface}`);
}
|
|
86
|
+
|
|
87
|
+
/**
 * Produce the text that an eval case will be checked against.
 *
 * The canned `testCase.output` is returned unless all of these hold:
 * `SUMMARY_EVALS_LIVE` is exactly '1', `OPENROUTER_API_KEY` is set, and the
 * case is not marked `shouldPass === false`. In that live mode the surface's
 * generator is called and the `text` of its result is returned.
 *
 * @param {object} testCase - Eval case with `surface`, `output` and optional `shouldPass`.
 * @param {*} context - Context passed to the generator in live mode.
 * @returns {Promise<*>} The canned output or the generated text.
 * @throws {Error} In live mode, for an unsupported surface.
 */
export async function generateSummaryEvalOutput(testCase, context) {
  const { SUMMARY_EVALS_LIVE: liveFlag, OPENROUTER_API_KEY: apiKey } = process.env;
  const useLiveModel = liveFlag === '1' && Boolean(apiKey) && testCase.shouldPass !== false;
  if (!useLiveModel) {
    return testCase.output;
  }

  const { surface } = testCase;
  let result;
  if (surface === 'workout') {
    result = await generateWorkoutCoachingSummary(context, { apiKey });
  } else if (surface === 'cycle') {
    result = await generateCoachingSummary(context, { apiKey });
  } else if (surface === 'checkpoint') {
    result = await generateCheckpointSummary(context, { apiKey });
  } else if (surface === 'vitals') {
    result = await generateVitalsSummary(context, { apiKey });
  } else {
    throw new Error(`Unsupported summary eval surface: ${testCase.surface}`);
  }
  return result.text;
}
|
|
107
|
+
|
|
108
|
+
/**
 * Coerce a value to a trimmed string; null and undefined become ''.
 *
 * @param {*} value - Any value.
 * @returns {string} The trimmed string form.
 */
function normalizeText(value) {
  const text = value == null ? '' : String(value);
  return text.trim();
}
|
|
111
|
+
|
|
112
|
+
/**
 * Count the paragraphs in a text, where paragraphs are separated by a blank
 * line (a newline, optional whitespace, then another newline).
 *
 * @param {*} text - Text to inspect; normalized with `normalizeText` first.
 * @returns {number} Number of non-empty paragraphs (0 for empty text).
 */
function paragraphCount(text) {
  const chunks = normalizeText(text).split(/\n\s*\n/);
  let count = 0;
  for (const chunk of chunks) {
    if (chunk) count += 1;
  }
  return count;
}
|
|
118
|
+
|
|
119
|
+
/**
 * Count the sentences in a text.
 *
 * A sentence is a run of text ending in one or more of `.`, `!`, `?`, where
 * the terminator is followed (after optional closing quotes or brackets) by
 * whitespace or the end of the text. Requiring that boundary keeps decimal
 * points inside numbers such as "82.5 kg" from being counted as sentence
 * ends. Trailing text without a terminator is not counted. Abbreviations
 * such as "e.g. " still count as a sentence end.
 *
 * @param {*} text - Text to inspect; normalized with `normalizeText` first.
 * @returns {number} Number of terminated sentences.
 */
function sentenceCount(text) {
  const matches = normalizeText(text).match(/[^.!?]+[.!?]+(?=["'”’)\]]*(?:\s|$))/g);
  return matches ? matches.length : 0;
}
|
|
123
|
+
|
|
124
|
+
/**
 * Tell whether a text contains no paragraph break.
 *
 * Uses the same blank-line separator as `paragraphCount` (newline, optional
 * whitespace, newline), so CRLF line endings and whitespace-only blank lines
 * are recognised as paragraph breaks too.
 *
 * @param {*} text - Text to inspect; normalized with `normalizeText` first.
 * @returns {boolean} True when the trimmed text has no blank-line separator.
 */
function isSingleParagraph(text) {
  return !/\n\s*\n/.test(normalizeText(text));
}
|
|
127
|
+
|
|
128
|
+
/**
 * Case-insensitive substring test.
 *
 * @param {*} text - Text to search; normalized with `normalizeText` first.
 * @param {*} snippet - Value to look for; stringified but not trimmed.
 * @returns {boolean} True when the lower-cased text contains the lower-cased snippet.
 */
function lowerIncludes(text, snippet) {
  const haystack = normalizeText(text).toLowerCase();
  const needle = String(snippet).toLowerCase();
  return haystack.indexOf(needle) !== -1;
}
|
|
131
|
+
|
|
132
|
+
/**
 * Drop falsy entries and duplicates from a list, keeping first-seen order.
 *
 * @param {Array|null|undefined} values - Input list; nullish is treated as empty.
 * @returns {Array} De-duplicated truthy values.
 */
function uniqueStrings(values) {
  const truthy = (values ?? []).filter((value) => Boolean(value));
  return Array.from(new Set(truthy));
}
|
|
135
|
+
|
|
136
|
+
/**
 * Backslash-escape regular-expression metacharacters so a value can be
 * embedded literally in a `RegExp` source.
 *
 * @param {*} value - Value to escape; stringified first.
 * @returns {string} The escaped string.
 */
function escapeRegex(value) {
  return String(value).replace(/[.*+?^${}()|[\]\\]/g, (metaChar) => `\\${metaChar}`);
}
|
|
139
|
+
|
|
140
|
+
/**
 * Qualifier tokens that `historicalExerciseVariants` may strip from the front
 * of a normalized exercise name when building looser name variants.
 */
const historicalExerciseModifiers = new Set([
  'barbell', 'dumbbell', 'cable', 'machine',
  'seated', 'standing', 'smith',
  'wide', 'narrow', 'close', 'reverse',
  'incline', 'decline',
  'single', 'one', 'arm', 'leg',
  'weighted', 'romanian', 'hack', 'full', 'grip'
]);
|
|
164
|
+
|
|
165
|
+
/**
 * Gather every distinct exercise name found in a snapshot.
 *
 * Sources, in order: `sessions[].exercises[].name`,
 * `programs[].days[].exercises[].name`, and
 * `strengthPlans[].liftGoals[].exerciseDisplayName`. Missing collections are
 * treated as empty and falsy names are skipped.
 *
 * @param {object} snapshot - Snapshot to scan.
 * @returns {string[]} Distinct names in first-seen order.
 */
function collectAllExerciseNames(snapshot) {
  const sessionNames = (snapshot.sessions ?? []).flatMap((session) =>
    (session.exercises ?? []).map((exercise) => exercise.name)
  );
  const programNames = (snapshot.programs ?? []).flatMap((program) =>
    (program.days ?? []).flatMap((day) => (day.exercises ?? []).map((exercise) => exercise.name))
  );
  const goalNames = (snapshot.strengthPlans ?? []).flatMap((plan) =>
    (plan.liftGoals ?? []).map((goal) => goal.exerciseDisplayName)
  );

  const present = [...sessionNames, ...programNames, ...goalNames].filter(Boolean);
  return [...new Set(present)];
}
|
|
190
|
+
|
|
191
|
+
/**
 * Collect the exercise names a summary for the given surface may mention,
 * based on what its context contains.
 *
 * - workout: exercises, next-session exercise names, and all PR lists
 * - cycle: session exercises, goal progress, progression decisions, and cycle PR lists
 * - checkpoint: exercises (by `name`)
 * - any other surface, or a non-object context: nothing
 *
 * @param {string} surface - Summary surface.
 * @param {object} context - Context built for that surface.
 * @returns {Array} Distinct names in first-seen order.
 */
function collectAllowedExerciseNames(surface, context) {
  if (!context || typeof context !== 'object') {
    return [];
  }

  const names = new Set();
  // Adds the truthy `field` value of every record in a possibly-missing list.
  const addField = (records, field) => {
    for (const record of records ?? []) {
      if (record[field]) names.add(record[field]);
    }
  };

  switch (surface) {
    case 'workout':
      addField(context.exercises, 'exerciseName');
      // Next-session names are plain values and are added without a truthiness filter.
      for (const upcoming of context.nextSession?.exerciseNames ?? []) {
        names.add(upcoming);
      }
      addField(context.prs, 'exerciseName');
      addField(context.bwPrs, 'exerciseName');
      addField(context.repPrs, 'exerciseName');
      break;
    case 'cycle':
      for (const session of context.sessions ?? []) {
        addField(session.exercises, 'exerciseName');
      }
      addField(context.goalProgress, 'exerciseName');
      addField(context.progressionDecisions, 'exerciseName');
      addField(context.prsThisCycle, 'exerciseName');
      addField(context.bwPrsThisCycle, 'exerciseName');
      break;
    case 'checkpoint':
      addField(context.exercises, 'name');
      break;
    default:
      break;
  }

  return [...names];
}
|
|
240
|
+
|
|
241
|
+
/**
 * Build the set of looser names under which an exercise may appear in
 * historical (stored) summaries.
 *
 * Variants are: the normalized name, the name with successive leading
 * modifier tokens removed (always leaving at least one token), and the last
 * two tokens of the name.
 *
 * @param {*} name - Exercise name; normalized with `normalizeExerciseName`.
 * @returns {string[]} Distinct variants in first-seen order; empty when the name normalizes to nothing.
 */
function historicalExerciseVariants(name) {
  const normalized = normalizeExerciseName(name);
  if (!normalized) return [];

  const tokens = normalized.split(' ').filter((token) => token !== '');
  const variants = [normalized];

  // Peel leading modifiers one at a time, never consuming the final token.
  let stripped = 0;
  while (stripped < tokens.length - 1 && historicalExerciseModifiers.has(tokens[stripped])) {
    stripped += 1;
    variants.push(tokens.slice(stripped).join(' '));
  }

  if (tokens.length > 1) {
    variants.push(tokens.slice(-2).join(' '));
  }

  return [...new Set(variants)].filter(Boolean);
}
|
|
261
|
+
|
|
262
|
+
/**
 * Check that a summary only mentions exercises it is allowed to mention.
 *
 * Every exercise name known to the snapshot is searched for in the normalized
 * output as a whitespace-delimited phrase. A mention is authorized when its
 * normalized name is in the allowed set (names from the context plus
 * `testCase.allowedExerciseMentions`; for stored cases each name is expanded
 * with `historicalExerciseVariants`). An unauthorized mention is forgiven
 * when it lies inside a longer authorized mention at the same position.
 *
 * @param {*} output - Summary text.
 * @param {object} snapshot - Snapshot supplying the universe of exercise names.
 * @param {object} context - Context used to derive allowed names.
 * @param {string} surface - Summary surface.
 * @param {object} testCase - Eval case (`source`, `allowedExerciseMentions`).
 * @returns {{key: string, passed: boolean, reason: string}} Check result.
 */
function evaluateExerciseMentions(output, snapshot, context, surface, testCase) {
  const expandName = testCase.source === 'stored'
    ? (name) => historicalExerciseVariants(name)
    : (name) => [normalizeExerciseName(name)];

  const allowed = new Set(
    [
      ...collectAllowedExerciseNames(surface, context),
      ...(testCase.allowedExerciseMentions ?? [])
    ].flatMap((name) => expandName(name))
  );

  const allNames = collectAllExerciseNames(snapshot);
  const haystack = normalizeExerciseName(output);
  const mentions = [];

  for (const exerciseName of allNames) {
    const normalizedName = normalizeExerciseName(exerciseName);
    if (!normalizedName) continue;

    // Lookarounds require whitespace or a string boundary on both sides of the name.
    const pattern = new RegExp(`(?<!\\S)${escapeRegex(normalizedName)}(?!\\S)`, 'g');
    const isAllowed = allowed.has(normalizedName);
    for (const { index } of haystack.matchAll(pattern)) {
      mentions.push({
        exerciseName,
        normalizedName,
        start: index,
        end: (index ?? 0) + normalizedName.length,
        allowed: isAllowed
      });
    }
  }

  // e.g. "press" inside an authorized "bench press" is not a separate violation.
  const coveredByAllowed = (mention) => mentions.some((other) =>
    other.allowed &&
    other.normalizedName.length > mention.normalizedName.length &&
    other.start <= mention.start &&
    other.end >= mention.end
  );

  const unauthorized = [];
  for (const mention of mentions) {
    if (!mention.allowed && !coveredByAllowed(mention)) {
      unauthorized.push(mention.exerciseName);
    }
  }

  const passed = unauthorized.length === 0;
  return {
    key: 'exercise_mentions',
    passed,
    reason: passed
      ? 'No unauthorized exercise mentions.'
      : `Unauthorized exercise mention(s): ${uniqueStrings(unauthorized).join(', ')}`
  };
}
|
|
312
|
+
|
|
313
|
+
/**
 * Check that the output contains every phrase in `testCase.requiredMentions`
 * (case-insensitive substring match).
 *
 * @param {*} output - Summary text.
 * @param {object} testCase - Eval case with optional `requiredMentions`.
 * @returns {{key: string, passed: boolean, reason: string}} Check result.
 */
function evaluateRequiredMentions(output, testCase) {
  const missing = [];
  for (const mention of uniqueStrings(testCase.requiredMentions)) {
    if (!lowerIncludes(output, mention)) missing.push(mention);
  }

  const passed = missing.length === 0;
  return {
    key: 'required_mentions',
    passed,
    reason: passed ? 'All required mentions present.' : `Missing required mention(s): ${missing.join(', ')}`
  };
}
|
|
321
|
+
|
|
322
|
+
/**
 * Check that the output contains at least one phrase from
 * `testCase.requiredAnyOfMentions` (case-insensitive substring match).
 * The check passes trivially when no candidates are configured.
 *
 * @param {*} output - Summary text.
 * @param {object} testCase - Eval case with optional `requiredAnyOfMentions`.
 * @returns {{key: string, passed: boolean, reason: string}} Check result.
 */
function evaluateAnyOfMentions(output, testCase) {
  const key = 'required_any_of_mentions';
  const candidates = uniqueStrings(testCase.requiredAnyOfMentions);

  if (candidates.length === 0) {
    return { key, passed: true, reason: 'No any-of mention requirement.' };
  }

  const passed = candidates.findIndex((mention) => lowerIncludes(output, mention)) !== -1;
  const reason = passed
    ? 'Matched at least one required any-of mention.'
    : `Missing all any-of mentions: ${candidates.join(', ')}`;
  return { key, passed, reason };
}
|
|
341
|
+
|
|
342
|
+
/**
 * Check that the output contains none of `testCase.forbiddenPhrases`
 * (case-insensitive substring match).
 *
 * @param {*} output - Summary text.
 * @param {object} testCase - Eval case with optional `forbiddenPhrases`.
 * @returns {{key: string, passed: boolean, reason: string}} Check result.
 */
function evaluateForbiddenPhrases(output, testCase) {
  const hits = [];
  for (const phrase of uniqueStrings(testCase.forbiddenPhrases)) {
    if (lowerIncludes(output, phrase)) hits.push(phrase);
  }

  const passed = hits.length === 0;
  return {
    key: 'forbidden_phrases',
    passed,
    reason: passed ? 'No forbidden phrases detected.' : `Forbidden phrase(s): ${hits.join(', ')}`
  };
}
|
|
350
|
+
|
|
351
|
+
/**
 * Check that the output contains none of `testCase.forbiddenMentions`
 * (case-insensitive substring match).
 *
 * @param {*} output - Summary text.
 * @param {object} testCase - Eval case with optional `forbiddenMentions`.
 * @returns {{key: string, passed: boolean, reason: string}} Check result.
 */
function evaluateForbiddenMentions(output, testCase) {
  const hits = [];
  for (const phrase of uniqueStrings(testCase.forbiddenMentions)) {
    if (lowerIncludes(output, phrase)) hits.push(phrase);
  }

  const passed = hits.length === 0;
  return {
    key: 'forbidden_mentions',
    passed,
    reason: passed ? 'No forbidden mentions detected.' : `Forbidden mention(s): ${hits.join(', ')}`
  };
}
|
|
359
|
+
|
|
360
|
+
/**
 * Check the NO_INSIGHT sentinel against the case's expectation.
 *
 * When `testCase.expectNoInsight` is truthy the trimmed output must be
 * exactly 'NO_INSIGHT'; otherwise it must be anything else.
 *
 * @param {*} output - Summary text.
 * @param {object} testCase - Eval case with optional `expectNoInsight`.
 * @returns {{key: string, passed: boolean, reason: string}} Check result.
 */
function evaluateNoInsight(output, testCase) {
  const key = 'no_insight';
  const isNoInsight = normalizeText(output) === 'NO_INSIGHT';

  if (testCase.expectNoInsight) {
    return {
      key,
      passed: isNoInsight,
      reason: isNoInsight ? 'Correctly returned NO_INSIGHT.' : 'Expected exact NO_INSIGHT output.'
    };
  }

  return {
    key,
    passed: !isNoInsight,
    reason: isNoInsight ? 'Unexpected NO_INSIGHT output.' : 'Output is not NO_INSIGHT.'
  };
}
|
|
374
|
+
|
|
375
|
+
/**
 * Check the sentence/paragraph shape of a summary against its surface rules.
 *
 * Rules (stored cases use the looser "stored" limits):
 * - workout: live = single paragraph, 2-4 sentences; stored = 1-12 sentences, 1-3 paragraphs
 * - vitals:  live = single paragraph, 2-3 sentences; stored = 1-5 sentences, 1-2 paragraphs
 * - cycle: 1-4 paragraphs
 * - checkpoint: 2-3 paragraphs
 * - any other surface: no rules
 * A 'NO_INSIGHT' output skips all shape rules.
 *
 * @param {*} output - Summary text.
 * @param {object} testCase - Eval case with `surface` and optional `source`.
 * @returns {{key: string, passed: boolean, reason: string}} Check result; failures are joined with spaces.
 */
function evaluateShape(output, testCase) {
  const key = 'shape';
  if (normalizeText(output) === 'NO_INSIGHT') {
    return { key, passed: true, reason: 'NO_INSIGHT bypasses shape rules.' };
  }

  const sentences = sentenceCount(output);
  const paragraphs = paragraphCount(output);
  const isStored = testCase.source === 'stored';
  const outside = (value, min, max) => value < min || value > max;
  const reasons = [];
  const { surface } = testCase;

  if (surface === 'workout') {
    if (isStored) {
      if (outside(sentences, 1, 12)) {
        reasons.push(`Stored workout summaries must be 1-12 sentences, got ${sentences}.`);
      }
      if (outside(paragraphs, 1, 3)) {
        reasons.push(`Stored workout summaries must be 1-3 paragraphs, got ${paragraphs}.`);
      }
    } else {
      if (!isSingleParagraph(output)) {
        reasons.push('Workout summaries must be a single paragraph.');
      }
      if (outside(sentences, 2, 4)) {
        reasons.push(`Workout summaries must be 2-4 sentences, got ${sentences}.`);
      }
    }
  } else if (surface === 'vitals') {
    if (isStored) {
      if (outside(sentences, 1, 5)) {
        reasons.push(`Stored vitals summaries must be 1-5 sentences, got ${sentences}.`);
      }
      if (outside(paragraphs, 1, 2)) {
        reasons.push(`Stored vitals summaries must be 1-2 paragraphs, got ${paragraphs}.`);
      }
    } else {
      if (!isSingleParagraph(output)) {
        reasons.push('Vitals summaries must be a single paragraph.');
      }
      if (outside(sentences, 2, 3)) {
        reasons.push(`Vitals summaries must be 2-3 sentences, got ${sentences}.`);
      }
    }
  } else if (surface === 'cycle') {
    if (outside(paragraphs, 1, 4)) {
      reasons.push(`Cycle summaries must be 1-4 paragraphs, got ${paragraphs}.`);
    }
  } else if (surface === 'checkpoint') {
    if (outside(paragraphs, 2, 3)) {
      reasons.push(`Checkpoint summaries must be 2-3 paragraphs, got ${paragraphs}.`);
    }
  }

  const passed = reasons.length === 0;
  return {
    key,
    passed,
    reason: passed ? 'Output shape matches surface rules.' : reasons.join(' ')
  };
}
|
|
446
|
+
|
|
447
|
+
/**
 * Parse a count written either as digits or as an English word (one-five).
 *
 * The word lookup uses a Map rather than a plain object so that inherited
 * property names such as "constructor" or "toString" cannot be returned as
 * a bogus count.
 *
 * @param {*} rawValue - Value to parse; stringified and lower-cased first.
 * @returns {number|null} The count, or null when the value is not recognised.
 */
function parseClaimCount(rawValue) {
  const normalized = String(rawValue).toLowerCase();
  if (/^\d+$/.test(normalized)) return Number(normalized);

  const wordCounts = new Map([
    ['one', 1],
    ['two', 2],
    ['three', 3],
    ['four', 4],
    ['five', 5]
  ]);
  return wordCounts.get(normalized) ?? null;
}
|
|
458
|
+
|
|
459
|
+
/**
 * Extract percentage volume-change claims ("volume is down 12%",
 * "20% volume jump", ...) from a summary.
 *
 * Results are grouped by pattern, in the order the patterns are listed, not
 * by position in the text.
 *
 * @param {*} output - Summary text; normalized with `normalizeText` first.
 * @returns {Array<{direction: 'down'|'up', percent: number, text: string}>} Found claims.
 */
function extractWorkoutVolumeClaims(output) {
  const text = normalizeText(output);
  const patterns = [
    /volume (?:is )?(?:down|dropped|drop|lower)\s+(\d{1,3})%/gi,
    /(\d{1,3})%\s+volume (?:drop|decrease|lower)/gi,
    /volume (?:is )?(?:up|increased|increase|jumped|jump)\s+(\d{1,3})%/gi,
    /(\d{1,3})%\s+volume (?:jump|increase|rise|up)/gi
  ];

  return patterns.flatMap((pattern) =>
    Array.from(text.matchAll(pattern), ([matched, percent]) => ({
      // Direction is inferred from the wording of the matched phrase itself.
      direction: /down|drop|lower|decrease/.test(matched.toLowerCase()) ? 'down' : 'up',
      percent: Number(percent),
      text: matched
    }))
  );
}
|
|
482
|
+
|
|
483
|
+
/**
 * Extract claims about how many PRs were set ("two new PRs", "3 rep PRs").
 *
 * @param {*} output - Summary text; normalized with `normalizeText` first.
 * @returns {Array<{count: number, text: string}>} Claims whose count could be parsed.
 */
function extractWorkoutPrCountClaims(output) {
  const countPattern = /\b(one|two|three|four|five|\d+)\s+(?:new\s+)?(?:rep\s+)?prs?\b/gi;
  const found = [];
  for (const [matched, rawCount] of normalizeText(output).matchAll(countPattern)) {
    const count = parseClaimCount(rawCount);
    if (count == null) continue;
    found.push({ count, text: matched });
  }
  return found;
}
|
|
495
|
+
|
|
496
|
+
/**
 * Extract "skipped <exercise>" claims from a summary.
 *
 * The captured name stops at punctuation, " but ", " and ", or the end of
 * the text; an optional trailing qualifier such as "today" or "entirely" is
 * part of the matched text but not of the name.
 *
 * @param {*} output - Summary text; normalized with `normalizeText` first.
 * @returns {Array<{exerciseName: string, text: string}>} Found claims.
 */
function extractSkippedExerciseClaims(output) {
  const skippedPattern = /\bskipped\s+([A-Za-z][A-Za-z0-9\- ]{1,40}?)(?:\s+(?:entirely|today|this session|altogether))?(?=[,.!;]| but | and |$)/gi;
  return Array.from(normalizeText(output).matchAll(skippedPattern), ([matched, rawName]) => ({
    // Collapse internal whitespace runs so names compare cleanly.
    exerciseName: rawName.trim().replace(/\s+/g, ' '),
    text: matched
  })).filter((claim) => claim.exerciseName);
}
|
|
508
|
+
|
|
509
|
+
/**
 * Extract claims that tie a PR to a named exercise, in either word order:
 * "<exercise> hit a new PR" or "PR on/for <exercise>".
 *
 * Results are grouped by pattern, in the order the patterns are listed.
 *
 * @param {*} output - Summary text; normalized with `normalizeText` first.
 * @returns {Array<{exerciseName: string, text: string}>} Found claims.
 */
function extractExerciseSpecificPrClaims(output) {
  const text = normalizeText(output);
  const patterns = [
    /\b([A-Za-z][A-Za-z0-9\- ]{1,50}?)\s+(?:hit|hits|landed|lands|got|gets|notched|posted)\s+(?:a\s+)?(?:new\s+)?(?:rep\s+)?pr\b/gi,
    /\b(?:new\s+)?(?:rep\s+)?pr\s+(?:on|for)\s+([A-Za-z][A-Za-z0-9\- ]{1,50}?)(?=[,.!;]| and | but |$)/gi
  ];

  return patterns.flatMap((pattern) =>
    Array.from(text.matchAll(pattern), ([matched, rawName]) => ({
      exerciseName: rawName.trim(),
      text: matched
    }))
  );
}
|
|
523
|
+
|
|
524
|
+
/**
 * Extract claims that a named exercise has stalled or plateaued, e.g.
 * "<exercise> has stalled", "stalled on <exercise>", "plateau on <exercise>".
 *
 * Results are grouped by pattern, in the order the patterns are listed.
 *
 * @param {*} output - Summary text; normalized with `normalizeText` first.
 * @returns {Array<{exerciseName: string, text: string}>} Found claims.
 */
function extractStallClaims(output) {
  const text = normalizeText(output);
  const patterns = [
    /\b([A-Za-z][A-Za-z0-9\- ]{1,50}?)\s+(?:has\s+)?(?:stalled|plateaued)\b/gi,
    /\bstall(?:ed)?\s+on\s+([A-Za-z][A-Za-z0-9\- ]{1,50}?)(?=[,.!;]| and | but |$)/gi,
    /\bplateau(?:ed)?\s+on\s+([A-Za-z][A-Za-z0-9\- ]{1,50}?)(?=[,.!;]| and | but |$)/gi
  ];

  return patterns.flatMap((pattern) =>
    Array.from(text.matchAll(pattern), ([matched, rawName]) => ({
      exerciseName: rawName.trim(),
      text: matched
    }))
  );
}
|
|
539
|
+
|
|
540
|
+
/**
 * Tell whether a text uses fatigue/recovery wording (whole-word,
 * case-insensitive match against a fixed phrase list).
 *
 * @param {string} output - Text to inspect.
 * @returns {boolean} True when any listed fatigue phrase appears.
 */
function hasFatigueLanguage(output) {
  const fatiguePattern = /\b(fatigue|fatigued|underrecovered|recovery debt|fatigue ceiling|limited by recovery|limited by fatigue|accumulated fatigue)\b/i;
  return fatiguePattern.exec(output) !== null;
}
|
|
543
|
+
|
|
544
|
+
/**
 * Loose name match: true when the claimed and the actual exercise name share
 * at least one variant produced by `historicalExerciseVariants`.
 *
 * @param {*} claimName - Name as written in the summary.
 * @param {*} actualName - Name as recorded in the data.
 * @returns {boolean} Whether the two variant sets intersect.
 */
function matchesHistoricalFamilyName(claimName, actualName) {
  const claimVariants = historicalExerciseVariants(claimName);
  const actualVariants = new Set(historicalExerciseVariants(actualName));
  return claimVariants.some((variant) => actualVariants.has(variant));
}
|
|
552
|
+
|
|
553
|
+
/**
 * Strict name match: true when the normalized claimed name is exactly one of
 * the actual exercise's variants. Unlike `matchesHistoricalFamilyName`, the
 * claimed name itself is not expanded into variants.
 *
 * @param {*} claimName - Name as written in the summary.
 * @param {*} actualName - Name as recorded in the data.
 * @returns {boolean} Whether the claim names the actual exercise.
 */
function supportedSpecificExerciseClaim(claimName, actualName) {
  const normalizedClaim = normalizeExerciseName(claimName);
  if (!normalizedClaim) {
    return false;
  }
  const actualVariants = historicalExerciseVariants(actualName);
  return actualVariants.indexOf(normalizedClaim) !== -1;
}
|
|
558
|
+
|
|
559
|
+
/**
 * Find the first exercise entry whose `exerciseName` loosely matches a
 * claimed name (see `matchesHistoricalFamilyName`).
 *
 * @param {Array<object>|null|undefined} exercises - Exercise entries from the context.
 * @param {*} claimName - Name as written in the summary.
 * @returns {object|undefined} The matching entry, or undefined when none matches.
 */
function findExerciseContext(exercises, claimName) {
  const candidates = exercises ?? [];
  const position = candidates.findIndex((exercise) =>
    matchesHistoricalFamilyName(claimName, exercise.exerciseName)
  );
  return position === -1 ? undefined : candidates[position];
}
|
|
562
|
+
|
|
563
|
+
/**
 * Decide whether the data backs a "stalled" claim for an exercise.
 *
 * A stall is supported when the current top-set weight is a positive finite
 * number and the first two valid (positive, finite) entries of
 * `recentWeights` both equal it.
 *
 * @param {object|null|undefined} exerciseContext - Entry with `topSet.weight` and `recentWeights[].topWeight`.
 * @returns {boolean} True when the stall is supported.
 */
function isStallSupported(exerciseContext) {
  const topSet = exerciseContext?.topSet;
  const history = exerciseContext?.recentWeights;
  if (!topSet || !Array.isArray(history)) return false;

  const current = Number(topSet.weight);
  if (!(Number.isFinite(current) && current > 0)) return false;

  // Invalid history entries are dropped before taking the first two.
  const validWeights = [];
  for (const entry of history) {
    const weight = Number(entry.topWeight);
    if (Number.isFinite(weight) && weight > 0) validWeights.push(weight);
  }
  if (validWeights.length < 2) return false;

  return validWeights[0] === current && validWeights[1] === current;
}
|
|
574
|
+
|
|
575
|
+
/**
 * Count how many independent signals in a workout context support a
 * fatigue/recovery claim.
 *
 * Signals (one point each):
 * - readiness has a dominant signal or an applied adaptation
 * - HRV on the day is at most 90% of baseline
 * - resting HR on the day is at least 4 above baseline
 * - sleep duration is at most 360 minutes
 * - nearby cardio totals at least 180 minutes
 * - total volume is at most 85% of any recent comparison's volume
 *
 * @param {object} context - Workout context.
 * @returns {number} Number of signals present (0-6).
 */
function fatigueSupportSignals(context) {
  const { readiness, hrvOnDay, hrvBaseline, restingHROnDay, restingHRBaseline, sleepNight } = context;

  let cardioMinutes = 0;
  for (const workout of context.nearbyCardio ?? []) {
    cardioMinutes += (Number(workout.durationSecs) || 0) / 60;
  }

  const currentVolume = Number(context.totalVolume);
  const volumeDropped = Number.isFinite(currentVolume) &&
    (context.recentComparisons ?? []).some((comparison) => {
      const baseline = Number(comparison.totalVolume);
      return Number.isFinite(baseline) && baseline > 0 && currentVolume <= baseline * 0.85;
    });

  const signals = [
    Boolean(readiness?.dominantSignal || readiness?.adaptationApplied),
    Number.isFinite(hrvOnDay) && Number.isFinite(hrvBaseline) && hrvOnDay <= hrvBaseline * 0.9,
    Number.isFinite(restingHROnDay) && Number.isFinite(restingHRBaseline) && restingHROnDay >= restingHRBaseline + 4,
    Number.isFinite(sleepNight?.durationMins) && sleepNight.durationMins <= 360,
    cardioMinutes >= 180,
    volumeDropped
  ];

  return signals.filter(Boolean).length;
}
|
|
610
|
+
|
|
611
|
+
/**
 * Verify that factual claims made in a workout summary are backed by its
 * context. Non-workout surfaces and 'NO_INSIGHT' outputs pass without checks.
 *
 * Claims checked, in order:
 * 1. any PR wording requires at least one recorded PR
 * 2. stated PR counts must equal the total number of recorded PRs
 * 3. exercise-specific PR claims must name an exercise with a recorded PR
 * 4. volume-change percentages must match some recent comparison
 *    (same direction, within 5 percentage points)
 * 5. skipped-exercise claims must appear in `planComparison.skipped`
 * 6. stall claims must be backed by repeated top weights
 * 7. fatigue wording needs at least two supporting signals
 *
 * @param {*} output - Summary text.
 * @param {object} context - Workout context.
 * @param {object} testCase - Eval case with `surface`.
 * @returns {{key: string, passed: boolean, reason: string}} Check result; failures are joined with spaces.
 */
function evaluateWorkoutClaims(output, context, testCase) {
  const key = 'workout_claims';
  if (testCase.surface !== 'workout') {
    return { key, passed: true, reason: 'Not a workout summary.' };
  }

  const summary = normalizeText(output);
  if (summary === 'NO_INSIGHT') {
    return { key, passed: true, reason: 'NO_INSIGHT bypasses workout claim validation.' };
  }

  const failures = [];

  // Weight, bodyweight and rep PRs all count towards the total.
  const prRecords = [...(context.prs ?? []), ...(context.bwPrs ?? []), ...(context.repPrs ?? [])];
  const totalPrCount = prRecords.length;
  const prExerciseNames = prRecords.map((pr) => pr.exerciseName);

  if (totalPrCount === 0 && /\b(?:new\s+)?(?:rep\s+)?prs?\b/i.test(summary)) {
    failures.push('Summary claims a PR but the context contains no PRs.');
  }

  for (const claim of extractWorkoutPrCountClaims(summary)) {
    if (claim.count !== totalPrCount) {
      failures.push(`PR count claim "${claim.text}" does not match actual PR count ${totalPrCount}.`);
    }
  }

  for (const claim of extractExerciseSpecificPrClaims(summary)) {
    const supported = prExerciseNames.some((exerciseName) =>
      supportedSpecificExerciseClaim(claim.exerciseName, exerciseName)
    );
    if (!supported) {
      failures.push(`Exercise-specific PR claim "${claim.text}" is not supported by the recorded PR exercises.`);
    }
  }

  const comparisons = context.recentComparisons ?? [];
  const volumeClaimHolds = (claim) => comparisons.some((comparison) => {
    const baseline = Number(comparison.totalVolume);
    const current = Number(context.totalVolume);
    if (!(Number.isFinite(baseline) && baseline > 0 && Number.isFinite(current))) return false;
    const delta = ((current - baseline) / baseline) * 100;
    const sameDirection = claim.direction === 'down' ? delta < 0 : delta > 0;
    // Tolerate up to 5 percentage points between claimed and actual change.
    return sameDirection && Math.abs(Math.abs(delta) - claim.percent) <= 5;
  });
  for (const claim of extractWorkoutVolumeClaims(summary)) {
    if (comparisons.length === 0) {
      failures.push(`Volume claim "${claim.text}" cannot be verified because no comparison sessions are available.`);
    } else if (!volumeClaimHolds(claim)) {
      failures.push(`Volume claim "${claim.text}" is not supported by recent comparison volumes.`);
    }
  }

  const skippedExercises = context.planComparison?.skipped ?? [];
  for (const claim of extractSkippedExerciseClaims(summary)) {
    const supported = skippedExercises.some((exerciseName) =>
      matchesHistoricalFamilyName(claim.exerciseName, exerciseName)
    );
    if (!supported) {
      failures.push(`Skipped-exercise claim "${claim.text}" is not supported by the plan comparison.`);
    }
  }

  for (const claim of extractStallClaims(summary)) {
    if (!isStallSupported(findExerciseContext(context.exercises, claim.exerciseName))) {
      failures.push(`Stall claim "${claim.text}" is not supported by repeated top weights in recent sessions.`);
    }
  }

  if (hasFatigueLanguage(summary) && fatigueSupportSignals(context) < 2) {
    failures.push('Fatigue/recovery claim is not supported by enough recovery or load signals.');
  }

  const passed = failures.length === 0;
  return {
    key,
    passed,
    reason: passed ? 'Workout claims are supported by the context.' : failures.join(' ')
  };
}
|
|
702
|
+
|
|
703
|
+
/**
 * Run one eval case end to end: load its snapshot fixture, then evaluate it.
 *
 * @param {object} testCase - Eval case that references a snapshot file.
 * @returns {Promise<object>} The result produced by `runSummaryEvalCaseFromSnapshot`.
 */
export async function runSummaryEvalCase(testCase) {
  return runSummaryEvalCaseFromSnapshot(testCase, await loadSummaryEvalSnapshot(testCase));
}
|
|
707
|
+
|
|
708
|
+
/**
 * Evaluate one eval case against an already-loaded snapshot.
 *
 * Builds the surface context, obtains the output (canned or generated), and
 * runs every check. The case passes only when all checks pass.
 *
 * @param {object} testCase - Eval case to run.
 * @param {object} snapshot - Snapshot the case refers to.
 * @returns {Promise<{id: *, surface: *, name: *, output: *, passed: boolean, checks: object[]}>} Case result.
 * @throws {Error} When no context can be built or the output is empty.
 */
export async function runSummaryEvalCaseFromSnapshot(testCase, snapshot) {
  const context = buildSummaryEvalContext(snapshot, testCase);
  if (context == null) {
    throw new Error(`Eval case ${testCase.id} produced no context`);
  }

  const output = await generateSummaryEvalOutput(testCase, context);
  if (normalizeText(output) === '') {
    throw new Error(`Eval case ${testCase.id} produced an empty output`);
  }

  const checks = [
    evaluateNoInsight(output, testCase),
    evaluateShape(output, testCase),
    evaluateRequiredMentions(output, testCase),
    evaluateAnyOfMentions(output, testCase),
    evaluateForbiddenPhrases(output, testCase),
    evaluateForbiddenMentions(output, testCase),
    evaluateExerciseMentions(output, snapshot, context, testCase.surface, testCase),
    evaluateWorkoutClaims(output, context, testCase)
  ];
  const hasFailure = checks.some((check) => !check.passed);

  return {
    id: testCase.id,
    surface: testCase.surface,
    name: testCase.name,
    output,
    passed: !hasFailure,
    checks
  };
}
|
|
739
|
+
|
|
740
|
+
/**
 * Generic filler phrases that a summary for the given surface must not use.
 *
 * @param {string} surface - Summary surface.
 * @returns {string[]} A fresh list of phrases; empty for 'vitals' and unknown surfaces.
 */
function genericForbiddenPhrasesForSurface(surface) {
  // Built per call so callers always receive their own arrays.
  const phrasesBySurface = new Map([
    ['workout', ['solid progress', 'trust the process', 'keep it up', 'quality work', 'in a great place', 'continue progressive overload', 'as fatigue accumulates']],
    ['cycle', ['solid progress', 'trust the process', 'in a great place', 'continue progressive overload', 'as fatigue accumulates', 'solid session', 'quality work']],
    ['checkpoint', ['solid progress', 'quality work', 'trust the process', 'in a great place']]
  ]);
  return phrasesBySurface.get(surface) ?? [];
}
|
|
754
|
+
|
|
755
|
+
/**
 * Pick, per program, the most recently completed cycle summary that has an
 * AI-written summary.
 *
 * @param {object} snapshot - Snapshot with optional `cycleSummaries`.
 * @returns {object[]} One summary per `programId`, in newest-first encounter order.
 */
function latestCycleSummariesWithAI(snapshot) {
  const newestFirst = stableSortByDateDesc(
    snapshot.cycleSummaries ?? [],
    (item) => item.completedDate
  );

  const latestByProgram = new Map();
  newestFirst.forEach((summary) => {
    // The first AI summary seen for a program is its newest one.
    if (summary.aiSummary && !latestByProgram.has(summary.programId)) {
      latestByProgram.set(summary.programId, summary);
    }
  });

  return Array.from(latestByProgram.values());
}
|
|
766
|
+
|
|
767
|
+
/**
 * Turn AI summaries already stored in a snapshot into eval cases.
 *
 * Harvests, in order:
 * - every session that has `summary.aiCoachNotes` (workout surface)
 * - the latest AI cycle summary per program (cycle surface)
 * - the most recent vitals summary, if it has text (vitals surface)
 *
 * Each case is marked `source: 'stored'`, carries the stored text as
 * `output`, and requires at least one of a small set of expected mentions.
 *
 * @param {object} snapshot - Snapshot to harvest from.
 * @param {string} [snapshotLabel='snapshot'] - Label copied onto every case.
 * @returns {object[]} Harvested eval cases.
 */
export function harvestStoredSummaryEvalCases(snapshot, snapshotLabel = 'snapshot') {
  // Shared case skeleton; the key order matches across all surfaces.
  const storedCase = ({ id, name, surface, selector, output, anyOf }) => ({
    id,
    name,
    surface,
    source: 'stored',
    snapshotLabel,
    selector,
    output,
    requiredMentions: [],
    requiredAnyOfMentions: anyOf,
    forbiddenPhrases: genericForbiddenPhrasesForSurface(surface),
    forbiddenMentions: [],
    expectNoInsight: false,
    shouldPass: true
  });

  const harvested = [];

  for (const session of snapshot.sessions ?? []) {
    const notes = session.summary?.aiCoachNotes;
    if (!notes) continue;
    const exerciseNames = (session.exercises ?? []).map((exercise) => exercise.name);
    harvested.push(storedCase({
      id: `stored-workout-${session.id}`,
      name: `Stored workout summary ${session.id}`,
      surface: 'workout',
      selector: { sessionId: session.id },
      output: notes,
      anyOf: [...exerciseNames, session.dayName].filter(Boolean)
    }));
  }

  for (const summary of latestCycleSummariesWithAI(snapshot)) {
    const updateNames = (summary.progressionUpdates ?? []).map((update) => update.exerciseName);
    const prNames = (summary.prs ?? []).map((pr) => pr.exerciseName);
    harvested.push(storedCase({
      id: `stored-cycle-${summary.id}`,
      name: `Stored cycle summary ${summary.id}`,
      surface: 'cycle',
      selector: { programId: summary.programId },
      output: summary.aiSummary,
      anyOf: [...updateNames, ...prNames, 'sleep', 'hrv', 'resting hr'].filter(Boolean)
    }));
  }

  const [latestVitals] = stableSortByDateDesc(snapshot.vitalsSummaries ?? [], (entry) => entry.date);
  if (latestVitals?.summary) {
    harvested.push(storedCase({
      id: `stored-vitals-${latestVitals.id ?? latestVitals.date}`,
      name: `Stored vitals summary ${latestVitals.date}`,
      surface: 'vitals',
      selector: {},
      output: latestVitals.summary,
      anyOf: ['recovery', 'readiness', 'train', 'session', 'today']
    }));
  }

  return harvested;
}
|