incremnt 0.7.2 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +57 -1
- package/package.json +2 -1
- package/src/ask-answer-verifier.js +857 -0
- package/src/ask-coach.js +2634 -0
- package/src/ask-replay.js +358 -0
- package/src/auth.js +169 -15
- package/src/contract.js +160 -3
- package/src/format.js +24 -1
- package/src/lib.js +205 -17
- package/src/mcp.js +88 -24
- package/src/openrouter.js +242 -19
- package/src/plan-changeset.js +132 -0
- package/src/program-draft.js +230 -0
- package/src/prompt-changelog.js +90 -0
- package/src/promptfoo-evals.js +10 -4
- package/src/promptfoo-langfuse-scores.js +55 -0
- package/src/queries.js +992 -987
- package/src/remote.js +465 -12
- package/src/score-context.js +14 -7
- package/src/score-prelude.js +113 -0
- package/src/service-url.js +9 -0
- package/src/summary-evals.js +677 -42
- package/src/sync-service.js +1259 -352
- package/src/transport.js +119 -3
package/src/summary-evals.js
CHANGED
|
@@ -3,7 +3,7 @@ import path from 'node:path';
|
|
|
3
3
|
import { fileURLToPath } from 'node:url';
|
|
4
4
|
import {
|
|
5
5
|
askContext,
|
|
6
|
-
|
|
6
|
+
canonicalExerciseName,
|
|
7
7
|
checkpointContext,
|
|
8
8
|
cycleSummaryContext,
|
|
9
9
|
executeCoachReadTool,
|
|
@@ -11,6 +11,8 @@ import {
|
|
|
11
11
|
workoutSummaryContext,
|
|
12
12
|
vitalsSummaryContext
|
|
13
13
|
} from './queries.js';
|
|
14
|
+
import { askRoutedContext, buildAskStructuredResponse } from './ask-coach.js';
|
|
15
|
+
import { formatIncrementScorePrelude, isScoreQuestion } from './score-prelude.js';
|
|
14
16
|
import {
|
|
15
17
|
AI_PROMPT_VERSIONS,
|
|
16
18
|
generateAskAnswer,
|
|
@@ -20,6 +22,8 @@ import {
|
|
|
20
22
|
generateWorkoutCoachingSummary
|
|
21
23
|
} from './openrouter.js';
|
|
22
24
|
import { computeScoreBand } from './score-context.js';
|
|
25
|
+
import { stripXMLTagBlocks } from './prompt-security.js';
|
|
26
|
+
import { extractAskProgramDraft, hasProgramDraftBlock } from './program-draft.js';
|
|
23
27
|
|
|
24
28
|
const __filename = fileURLToPath(import.meta.url);
|
|
25
29
|
const __dirname = path.dirname(__filename);
|
|
@@ -30,6 +34,14 @@ export function defaultCaseSetName() {
|
|
|
30
34
|
return process.env.SUMMARY_EVAL_CASE_SET || 'synthetic';
|
|
31
35
|
}
|
|
32
36
|
|
|
37
|
+
/**
 * Reads a boolean feature flag from an environment-like object.
 *
 * @param {string} name - Variable to look up.
 * @param {Record<string, unknown>} [env=process.env] - Object to read the variable from.
 * @returns {boolean} True when the value is "1", "true" or "yes" (case-insensitive,
 *   surrounding whitespace ignored); false for anything else, including an unset variable.
 */
function envFlag(name, env = process.env) {
  // Trim before comparing: a value carrying padding or a trailing newline
  // would otherwise silently read as "off".
  const normalized = String(env[name] ?? '').trim().toLowerCase();
  return ['1', 'true', 'yes'].includes(normalized);
}
|
|
40
|
+
|
|
41
|
+
/**
 * Tells whether live generation is switched on for summary evals.
 *
 * @param {Record<string, unknown>} [env=process.env] - Object to read the flags from.
 * @returns {boolean} True when either SUMMARY_EVALS_LIVE or PROMPTFOO_LIVE is an enabled flag.
 */
export function summaryEvalsLiveGenerationEnabled(env = process.env) {
  const liveFlagNames = ['SUMMARY_EVALS_LIVE', 'PROMPTFOO_LIVE'];
  return liveFlagNames.some((flagName) => envFlag(flagName, env));
}
|
|
44
|
+
|
|
33
45
|
/**
 * Returns a sorted copy of `items`, descending by the key `selector` picks.
 * Keys are stringified and compared with `localeCompare`; the input array is
 * left untouched.
 *
 * @param {Iterable<any>} items - Values to order.
 * @param {(item: any) => unknown} selector - Picks the sort key (presumably a date string — confirm with callers).
 * @returns {any[]} New array ordered from the largest key to the smallest.
 */
function stableSortByDateDesc(items, selector) {
  const keyOf = (item) => String(selector(item));
  const descending = (lhs, rhs) => keyOf(rhs).localeCompare(keyOf(lhs));
  const copy = [...items];
  copy.sort(descending);
  return copy;
}
|
|
@@ -88,12 +100,21 @@ export function buildSummaryEvalContext(snapshot, testCase) {
|
|
|
88
100
|
case 'ask': {
|
|
89
101
|
const question = testCase.context?.question ?? testCase.question ?? '';
|
|
90
102
|
const today = testCase.context?.today ?? testCase.today ?? null;
|
|
103
|
+
const history = Array.isArray(testCase.context?.history) ? testCase.context.history : [];
|
|
91
104
|
const routed = question
|
|
92
|
-
? askRoutedContext(snapshot, question, { exclude: new Set(testCase.exclude ?? []), today: today ?? new Date() })
|
|
105
|
+
? askRoutedContext(snapshot, question, { exclude: new Set(testCase.exclude ?? []), history, today: today ?? new Date() })
|
|
93
106
|
: null;
|
|
107
|
+
// Mirror production: the live /cli/ask path prepends the Increment Score
|
|
108
|
+
// prelude to the routed context. Including it here means a live eval feeds
|
|
109
|
+
// the model the same dump-prone material, so evaluateAskScoreVoice actually
|
|
110
|
+
// guards the prompt, not just the checker.
|
|
111
|
+
const prelude = formatIncrementScorePrelude(scoreHistoryFromSnapshot(snapshot), { question });
|
|
112
|
+
const routedContext = routed?.context ?? null;
|
|
113
|
+
const trainingData = testCase.context?.trainingData
|
|
114
|
+
?? (prelude && routedContext ? `${prelude}\n\n${routedContext}` : (routedContext ?? prelude));
|
|
94
115
|
return {
|
|
95
116
|
...(testCase.context ?? {}),
|
|
96
|
-
trainingData
|
|
117
|
+
trainingData,
|
|
97
118
|
routedMetadata: routed?.metadata ?? null
|
|
98
119
|
};
|
|
99
120
|
}
|
|
@@ -117,11 +138,38 @@ function summaryEvalGenerationMetadata(result) {
|
|
|
117
138
|
);
|
|
118
139
|
}
|
|
119
140
|
|
|
141
|
+
/**
 * Builds the Ask-only part of the eval provider metadata.
 *
 * For any surface other than 'ask' this returns an empty object. For Ask cases
 * it parses the raw output with extractAskProgramDraft, strips XML tag blocks
 * from the answer text, and hands the result to buildAskStructuredResponse.
 *
 * @param {object} testCase - Eval case; `surface`, `context.question` and `question` are read.
 * @param {object|null|undefined} context - Eval context; `question` and `routedMetadata` are read.
 * @param {string} output - Raw model (or fixture) output.
 * @returns {object} `{}` or `{ routingMetadata, structured }`.
 */
function buildAskEvalStructuredMetadata(testCase, context, output) {
  if (testCase.surface !== 'ask') {
    return {};
  }

  const routingMetadata = context?.routedMetadata ?? null;
  // Question lookup order: built context, then the case's context, then the case itself.
  const question = context?.question ?? testCase.context?.question ?? testCase.question ?? '';

  const parsedAsk = extractAskProgramDraft(output, { canonicalizeExerciseName: canonicalExerciseName });
  const answer = stripXMLTagBlocks(parsedAsk.answerText);
  const programDraft = askStructuredProgramDraft(parsedAsk, routingMetadata);
  const structured = buildAskStructuredResponse(answer, routingMetadata ?? {}, { programDraft, question });

  return { routingMetadata, structured };
}
|
|
157
|
+
|
|
158
|
+
/**
 * Merges generation metadata with the Ask structured metadata for one eval case.
 * Keys from the Ask metadata win when both objects define the same key.
 *
 * @param {object} testCase - Eval case being scored.
 * @param {object|null|undefined} context - Eval context for the case.
 * @param {string} output - Output text the metadata describes.
 * @param {object|null} [result=null] - Generation result, or null when no live call was made.
 * @returns {object} Combined metadata object.
 */
function summaryEvalProviderMetadata(testCase, context, output, result = null) {
  const generationMetadata = summaryEvalGenerationMetadata(result);
  const askMetadata = buildAskEvalStructuredMetadata(testCase, context, output);
  return Object.assign({}, generationMetadata, askMetadata);
}
|
|
164
|
+
|
|
120
165
|
export async function generateSummaryEvalOutputWithMetadata(testCase, context, snapshot = null) {
|
|
121
|
-
const liveGenerationEnabled =
|
|
166
|
+
const liveGenerationEnabled = summaryEvalsLiveGenerationEnabled();
|
|
122
167
|
const apiKey = process.env.OPENROUTER_API_KEY;
|
|
123
168
|
if (!liveGenerationEnabled || !apiKey || testCase.shouldPass === false) {
|
|
124
|
-
return {
|
|
169
|
+
return {
|
|
170
|
+
output: testCase.output,
|
|
171
|
+
metadata: summaryEvalProviderMetadata(testCase, context, testCase.output)
|
|
172
|
+
};
|
|
125
173
|
}
|
|
126
174
|
|
|
127
175
|
let result;
|
|
@@ -151,7 +199,8 @@ export async function generateSummaryEvalOutputWithMetadata(testCase, context, s
|
|
|
151
199
|
apiKey,
|
|
152
200
|
history: context.history ?? [],
|
|
153
201
|
tone: context.tone,
|
|
154
|
-
model: context.model
|
|
202
|
+
model: context.model,
|
|
203
|
+
routingMetadata: context.routedMetadata ?? undefined
|
|
155
204
|
});
|
|
156
205
|
break;
|
|
157
206
|
}
|
|
@@ -161,7 +210,7 @@ export async function generateSummaryEvalOutputWithMetadata(testCase, context, s
|
|
|
161
210
|
|
|
162
211
|
return {
|
|
163
212
|
output: result.text,
|
|
164
|
-
metadata:
|
|
213
|
+
metadata: summaryEvalProviderMetadata(testCase, context, result.text, result)
|
|
165
214
|
};
|
|
166
215
|
}
|
|
167
216
|
|
|
@@ -260,8 +309,44 @@ function isSingleParagraph(text) {
|
|
|
260
309
|
return !normalizeText(text).includes('\n\n');
|
|
261
310
|
}
|
|
262
311
|
|
|
263
|
-
|
|
264
|
-
|
|
312
|
+
// Canonicalizes free-form coach text and required-mention snippets to the same
// surface form before substring matching. The goal is to keep grounding checks
// (does the answer cite this real number?) while tolerating the formatting an
// LLM legitimately varies: unicode × vs ASCII x, set-token unit placement,
// and rep-sequence separators (8/8/7 vs "8, 8, and 7").
// It only adds equivalences; it never strips the digits a check is grounded on,
// so a genuinely absent number still fails.
//
// Step order matters: weight units are removed BEFORE multiplication signs are
// unified. Otherwise "80 kg × 7" keeps its "×" (the sign is not between two
// digits until "kg" is gone) and never meets the "80x7" form.
function normalizeForMention(value) {
  let s = normalizeText(value).toLowerCase();
  // Drop weight units that follow a number: "80kg x 7" / "40 kg" -> "80 x 7" / "40".
  s = s.replace(/(\d(?:\.\d+)?)\s*(?:kgs?|lbs?|pounds)\b/g, '$1');
  // Any multiplication sign (ASCII x, unicode ×, bullet, asterisk) joining two
  // numbers -> bare "x" with no spaces: "80 × 7" / "80 x 7" -> "80x7".
  // The lookahead keeps the trailing digit so chains ("3×3×3") fully collapse.
  s = s.replace(/(\d)\s*[x×✕╳·∗*]\s*(?=\d)/g, '$1x');
  // Unify rep-sequence separators: "8/8/7" and "8, 8, and 7" -> "8,8,7".
  // Lookahead keeps the trailing digit so chained separators all collapse.
  s = s.replace(/(\d)\s*\/\s*(?=\d)/g, '$1,');
  s = s.replace(/(\d)\s*,\s*and\s+(?=\d)/g, '$1,');
  s = s.replace(/(\d)\s*,\s*(?=\d)/g, '$1,');
  return s.replace(/\s+/g, ' ').trim();
}
|
|
334
|
+
|
|
335
|
+
// Decides whether `output` contains a mention. A mention is either a single
// string or an array of acceptable phrasings; an array matches as soon as one
// phrasing is found. Fixtures therefore express AND-of-ORs: callers require
// every top-level entry, and each array entry is an OR over its alternatives.
// Both sides go through normalizeForMention; alternatives that normalize to an
// empty string are ignored.
function mentionMatches(output, mention) {
  const haystack = normalizeForMention(output);
  const candidates = Array.isArray(mention) ? mention : [mention];
  for (const candidate of candidates) {
    const needle = normalizeForMention(candidate);
    if (needle && haystack.includes(needle)) {
      return true;
    }
  }
  return false;
}
|
|
347
|
+
|
|
348
|
+
/**
 * Renders a mention for a failure message.
 *
 * @param {unknown} mention - A single mention or an array of alternatives.
 * @returns {string} `one of [a | b]` for arrays, otherwise the stringified mention.
 */
function describeMention(mention) {
  if (!Array.isArray(mention)) {
    return String(mention);
  }
  const alternatives = mention.join(' | ');
  return `one of [${alternatives}]`;
}
|
|
266
351
|
|
|
267
352
|
function phraseIncludes(text, snippet) {
|
|
@@ -548,11 +633,14 @@ function evaluateExerciseMentions(output, snapshot, context, surface, testCase)
|
|
|
548
633
|
}
|
|
549
634
|
|
|
550
635
|
/**
 * Checks that every entry of `testCase.requiredMentions` appears in the output.
 * A missing or non-array `requiredMentions` is treated as "nothing required".
 *
 * @param {string} output - Text being graded.
 * @param {object} testCase - Eval case; only `requiredMentions` is read.
 * @returns {{key: string, passed: boolean, reason: string}} Check result.
 */
function evaluateRequiredMentions(output, testCase) {
  const required = Array.isArray(testCase.requiredMentions) ? testCase.requiredMentions : [];
  const missing = required.filter((mention) => !mentionMatches(output, mention));

  if (missing.length === 0) {
    return { key: 'required_mentions', passed: true, reason: 'All required mentions present.' };
  }

  const described = missing.map((mention) => describeMention(mention)).join(', ');
  return {
    key: 'required_mentions',
    passed: false,
    reason: `Missing required mention(s): ${described}`
  };
}
|
|
558
646
|
|
|
@@ -566,7 +654,7 @@ function evaluateAnyOfMentions(output, testCase) {
|
|
|
566
654
|
};
|
|
567
655
|
}
|
|
568
656
|
|
|
569
|
-
const matched = candidates.some((mention) =>
|
|
657
|
+
const matched = candidates.some((mention) => mentionMatches(output, mention));
|
|
570
658
|
return {
|
|
571
659
|
key: 'required_any_of_mentions',
|
|
572
660
|
passed: matched,
|
|
@@ -1162,13 +1250,22 @@ function hasAskFatigueSupport(snapshot, lookbackDays = 7) {
|
|
|
1162
1250
|
return false;
|
|
1163
1251
|
}
|
|
1164
1252
|
|
|
1253
|
+
/**
 * Converts a matched number string to a Number, ignoring comma grouping.
 *
 * @param {unknown} raw - Matched text such as "40,500" or "72.5".
 * @returns {number} Parsed value; NaN when the remaining text is not numeric.
 */
function parseWeightNumber(raw) {
  const withoutGrouping = String(raw).split(',').join('');
  return Number(withoutGrouping);
}
|
|
1256
|
+
|
|
1165
1257
|
function extractAskWeightClaims(text) {
|
|
1166
1258
|
const claims = [];
|
|
1167
|
-
|
|
1259
|
+
// Accept comma-grouped thousands ("40,500 kg") as a single number so volume
|
|
1260
|
+
// figures are not shredded into bogus "500 kg" / "000 kg" claims. Volume/total
|
|
1261
|
+
// figures are excluded by isVolumeWeightClaim at the call sites, not by a
|
|
1262
|
+
// magnitude cap — heavy machine work (leg press, sled) legitimately exceeds
|
|
1263
|
+
// 1000 kg, and a fabricated heavy load must still be graded.
|
|
1264
|
+
const pattern = /\b(\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d+(?:\.\d+)?)\s*(?:kg|kilograms?)\b/gi;
|
|
1168
1265
|
for (const match of text.matchAll(pattern)) {
|
|
1169
1266
|
claims.push({
|
|
1170
1267
|
text: match[0],
|
|
1171
|
-
value:
|
|
1268
|
+
value: parseWeightNumber(match[1]),
|
|
1172
1269
|
index: match.index ?? -1
|
|
1173
1270
|
});
|
|
1174
1271
|
}
|
|
@@ -1177,12 +1274,17 @@ function extractAskWeightClaims(text) {
|
|
|
1177
1274
|
|
|
1178
1275
|
function extractAskWeightedSetClaims(text) {
|
|
1179
1276
|
const claims = [];
|
|
1180
|
-
|
|
1277
|
+
// A weight×reps pair is only unambiguous with "x"/"×" (e.g. "70 kg x 5"), or
|
|
1278
|
+
// an explicit "for N rep(s)". Bare "X kg for N" is NOT a rep claim — N is
|
|
1279
|
+
// almost always a SET count ("70 kg for 4 working sets") or a duration, and
|
|
1280
|
+
// treating it as reps flags real data as a fabricated pair. So match only the
|
|
1281
|
+
// unambiguous forms; the plain-weight loop still grounds the weight itself.
|
|
1282
|
+
const pattern = /\b(\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d+(?:\.\d+)?)\s*(?:kg|kilograms?)\s*(?:(?:x|×)\s*(\d+)|for\s+(\d+)\s*reps?)\b/gi;
|
|
1181
1283
|
for (const match of text.matchAll(pattern)) {
|
|
1182
1284
|
claims.push({
|
|
1183
1285
|
text: match[0],
|
|
1184
|
-
weight:
|
|
1185
|
-
reps: Number(match[2]),
|
|
1286
|
+
weight: parseWeightNumber(match[1]),
|
|
1287
|
+
reps: Number(match[2] ?? match[3]),
|
|
1186
1288
|
index: match.index ?? -1,
|
|
1187
1289
|
end: (match.index ?? -1) + match[0].length
|
|
1188
1290
|
});
|
|
@@ -1225,10 +1327,58 @@ function isEstimatedOneRepMaxWeightClaim(text, claim) {
|
|
|
1225
1327
|
}
|
|
1226
1328
|
|
|
1227
1329
|
/**
 * Tells whether a kg figure sits in a clause about workload totals.
 *
 * A clause mentioning volume, tonnage or a "total load/work/volume/tonnage"
 * (e.g. "weekly strength volume fell from 44,000 kg to 40,500 kg") is a
 * workload total rather than an exercise load. Only the claim's own clause is
 * inspected, so an exercise load stated elsewhere in the sentence is not
 * excused by a volume word further along.
 *
 * @param {string} text - Full answer text.
 * @param {{index: number, text: string}} claim - Weight claim located in `text`.
 * @returns {boolean} True when the claim's clause contains a workload-total term.
 */
function isVolumeWeightClaim(text, claim) {
  const clause = claimClause(text, claim);
  const workloadTerms = /\b(?:volume|tonnage|total\s+(?:load|work|volume|tonnage))\b/i;
  return workloadTerms.test(clause);
}
|
|
1336
|
+
|
|
1337
|
+
/**
 * Extracts the clause of `text` that surrounds a claim.
 *
 * The clause starts right after the closest boundary at or before the claim's
 * start and ends right before the first boundary after the claim's end.
 * Boundaries are newlines, ". ", ";" and the contrast connectors
 * while / whereas / but (with or without a leading comma).
 *
 * @param {string} text - Full answer text.
 * @param {{index: number, text: string}} claim - Claim position and matched text.
 * @returns {string} The clause; the whole text when no boundary is found.
 */
function claimClause(text, claim) {
  const boundaries = ['\n', '. ', ';', ', while', ', whereas', ', but', ' while ', ' whereas ', ' but '];
  const claimEnd = claim.index + claim.text.length;

  // Latest position that immediately follows a boundary before the claim.
  const start = boundaries.reduce((latest, boundary) => {
    const at = text.lastIndexOf(boundary, claim.index);
    return at >= 0 ? Math.max(latest, at + boundary.length) : latest;
  }, 0);

  // Earliest boundary that begins after the claim.
  const end = boundaries.reduce((earliest, boundary) => {
    const at = text.indexOf(boundary, claimEnd);
    return at >= 0 ? Math.min(earliest, at) : earliest;
  }, text.length);

  return text.slice(start, end);
}
|
|
1362
|
+
|
|
1363
|
+
// Slices out the sentence around a claim so context guards can inspect all of
// it instead of a fixed-width window (a "body weight" anchor can sit well
// before the kg figure). A sentence is delimited by ". " or a newline; the
// slice starts one character after the preceding delimiter and stops just
// before the following one.
function claimSentence(text, claim) {
  const { index } = claim;
  const head = text.slice(0, index);
  const tail = text.slice(index);

  const previousBreak = Math.max(head.lastIndexOf('. '), head.lastIndexOf('\n'));
  const start = previousBreak < 0 ? 0 : previousBreak + 1;

  let end = text.length;
  for (const marker of ['. ', '\n']) {
    const offset = tail.indexOf(marker);
    if (offset >= 0) {
      end = Math.min(end, index + offset);
    }
  }
  return text.slice(start, end);
}
|
|
1375
|
+
|
|
1376
|
+
// Recognizes kg figures that describe body weight ("body weight is up 0.6 kg",
// "80.0 kg latest") rather than an exercise load. Without this guard,
// findNearestMentionedExercise would pin such a figure on the last lift named
// and a correct answer would be graded as a hallucination. Any kg figure whose
// sentence mentions body weight / bodyweight / weight trend is skipped.
function isBodyWeightClaim(text, claim) {
  const sentence = claimSentence(text, claim);
  const bodyWeightAnchor = /\bbody\s*weight\b|\bbodyweight\b|\bweight\s+trend\b/i;
  return bodyWeightAnchor.test(sentence);
}
|
|
1233
1383
|
|
|
1234
1384
|
function askWorkingTopSetRows(snapshot) {
|
|
@@ -1367,6 +1517,70 @@ function evaluateAskDirectionalConsistency(output, snapshot, testCase) {
|
|
|
1367
1517
|
};
|
|
1368
1518
|
}
|
|
1369
1519
|
|
|
1520
|
+
// Component names of the Increment Score. When an answer recites one of these
// together with a number it is dumping a raw sub-score, which the
// coach-observation-voice spec classifies as Tier 1 — never surface.
const SCORE_COMPONENT_NAMES = ['coverage', 'stimulus', 'execution', 'progression', 'recovery'];

// Regex source for a score-sized number: one to three digits with an optional
// decimal part.
const SCORE_NUMBER = '\\d{1,3}(?:\\.\\d+)?';

// Regex source for what may directly follow a number to mark it as real-world
// data (reps, load, time, counts, ratios, percentages, the /100 headline)
// instead of a component sub-score. Numbers followed by one of these are left
// alone.
const NON_SCORE_UNIT =
  '(?:kg|kilo|lbs?|pounds?|reps?|sets?|%|percent|pct|x\\b|for\\s+\\d|sessions?|days?|nights?|weeks?|months?|' +
  'years?|yrs?|hrs?|hours?|mins?|minutes?|secs?|seconds?|rpe|rir|am|pm|out\\s+of|of\\b|/\\s*\\d)';

// Heuristic, not a parser: a component name, then a short gap (at most 25
// characters, no period, digit or newline — i.e. the same clause), then a
// score-sized number that is not followed by a real-world unit. Because the
// gap cannot contain digits it can never hop over a number that carries a
// unit. Flags "recovery 35", "recovery is sitting at 35", "recovery (35)",
// "recovery is much lower at 42.8", "coverage 100"; leaves alone "recovery over
// the last 3 sessions", "recovery after 3 hours of sleep", "execution at 9/10
// RPE".
const SCORE_COMPONENT_DUMP_PATTERN = new RegExp(
  // `(?!\\.\\d)` stops the match from settling on the integer part of a
  // decimal: without it, backtracking would accept the "2" of "progression of
  // 2.5 kg" (the unit guard would only see ".5 kg") and flag genuine load data.
  `\\b(${SCORE_COMPONENT_NAMES.join('|')})\\b[^.\\d\\n]{0,25}?(${SCORE_NUMBER})\\b(?!\\.\\d)(?!\\s*${NON_SCORE_UNIT})`,
  'gi'
);

// Second kind of dump: an explicit day-over-day delta number ("-13
// day-over-day delta", "down 11 points day over day"). "down day-over-day"
// without a number is acceptable.
const SCORE_DELTA_DUMP_PATTERN =
  /[+-]\d+(?:\.\d+)?[^.\n]{0,16}?day[- ]over[- ]day|(?:\d+(?:\.\d+)?\s*points?)[^.\n]{0,16}?day[- ]over[- ]day|day[- ]over[- ]day[^.\n]{0,16}?(?:[+-]\d+(?:\.\d+)?|\d+(?:\.\d+)?\s*points?)/i;

/**
 * Fails an Ask answer that recites raw Increment Score internals: a component
 * name paired with a sub-score number, or a numeric day-over-day delta.
 *
 * @param {string} output - Answer text to grade.
 * @param {object} testCase - Eval case; `surface` and `allowScoreComponents` are read.
 *   `allowScoreComponents === true` opts the case out (e.g. numbers-only tone).
 * @returns {{key: string, passed: boolean, reason: string}} Check result.
 */
export function evaluateAskScoreVoice(output, testCase) {
  const key = 'ask_score_voice';

  if (testCase.surface !== 'ask') {
    return { key, passed: true, reason: 'Not an ask answer.' };
  }
  if (testCase.allowScoreComponents === true) {
    return { key, passed: true, reason: 'Score-component voice check opted out for this case.' };
  }

  const text = normalizeText(output);

  // A Set, so the same "name number" pair is only reported once.
  const hits = new Set(
    Array.from(text.matchAll(SCORE_COMPONENT_DUMP_PATTERN), ([, component, value]) => `${component} ${value}`)
  );
  if (SCORE_DELTA_DUMP_PATTERN.test(text)) {
    hits.add('day-over-day delta number');
  }

  const passed = hits.size === 0;
  const reason = passed
    ? 'Ask answer does not recite raw Increment Score component sub-scores.'
    : `Ask answer recites raw score internals: ${[...hits].join(', ')}. Speak in training reality, not raw sub-scores.`;
  return { key, passed, reason };
}
|
|
1583
|
+
|
|
1370
1584
|
function relevantSessionsForStaleness(snapshot, testCase) {
|
|
1371
1585
|
const configuredExercise = testCase.staleness?.exercise ?? testCase.staleness?.exerciseName
|
|
1372
1586
|
?? testCase.directionalConsistency?.[0]?.exercise
|
|
@@ -1379,6 +1593,65 @@ function relevantSessionsForStaleness(snapshot, testCase) {
|
|
|
1379
1593
|
));
|
|
1380
1594
|
}
|
|
1381
1595
|
|
|
1596
|
+
// The Ask answer is written BY the coach, so it has to stay in the first
// person. Phrases that talk about the coach or its outputs as somebody else
// ("the coach observation says…", "the system shows…") are flagged; the answer
// should own the observation instead ("I flagged…").
const ASK_SELF_REFERENCE_PATTERNS = [
  /\bthe coach observations?\b/i,
  /\bthe coach\b/i,
  /\bthe ai coach\b/i,
  /\byour coach\b/i,
  /\bthis coach\b/i,
  /\bthe system\b/i,
  /\bthe assistant\b/i
];

/**
 * Fails an Ask answer that refers to the coach in the third person.
 *
 * @param {string} output - Answer text to grade.
 * @param {object} testCase - Eval case; only `surface` is read.
 * @returns {{key: string, passed: boolean, reason: string}} Check result. Empty
 *   answers and the literal `NO_INSIGHT` pass without inspection.
 */
function evaluateAskSelfReference(output, testCase) {
  const key = 'ask_self_reference';

  if (testCase.surface !== 'ask') {
    return { key, passed: true, reason: 'Not an ask answer.' };
  }

  const text = normalizeText(output);
  if (!text || text === 'NO_INSIGHT') {
    return { key, passed: true, reason: 'No answer text.' };
  }

  // First occurrence per pattern, kept exactly as it was written in the answer.
  const hits = ASK_SELF_REFERENCE_PATTERNS
    .map((pattern) => text.match(pattern))
    .filter((found) => found !== null)
    .map((found) => found[0]);
  const unique = uniqueStrings(hits);

  const passed = unique.length === 0;
  const reason = passed
    ? 'Ask answer speaks in the first person.'
    : `Ask answer refers to itself in the third person: ${unique.join(', ')}. You ARE the coach — own it ("I flagged…", "your data shows…").`;
  return { key, passed, reason };
}
|
|
1631
|
+
|
|
1632
|
+
// Guards the answer side of the "do not volunteer the score" rule. When the
// question is not about the Increment Score, the answer must not state the
// bare overall number (e.g. "your score is 92/100"). The prelude already
// withholds the number for such questions; this check covers the answer.
function evaluateAskVolunteeredScore(output, testCase) {
  const key = 'ask_volunteered_score';

  if (testCase.surface !== 'ask') {
    return { key, passed: true, reason: 'Not an ask answer.' };
  }

  const question = testCase.context?.question ?? testCase.question ?? '';
  if (isScoreQuestion(question)) {
    return { key, passed: true, reason: 'Question is about the score; naming it is allowed.' };
  }

  const text = normalizeText(output);
  // Either the "NN/100" headline form, or "score is|of|at|… <2-3 digit number>"
  // within the same sentence.
  const headlineForm = /\b\d{2,3}\s*\/\s*100\b/;
  const statedForm = /\b(?:increment\s+)?score\s+(?:is|of|at|sits at|currently|was)\b[^.\n]*\b\d{2,3}\b/i;
  const volunteered = headlineForm.test(text) || statedForm.test(text);

  if (volunteered) {
    return {
      key,
      passed: false,
      reason: 'Ask answer volunteers the overall Increment Score number on a question that was not about the score. Translate it to the limiter instead.'
    };
  }
  return { key, passed: true, reason: 'Ask answer does not volunteer the score number unprompted.' };
}
|
|
1654
|
+
|
|
1382
1655
|
function evaluateAskStaleness(output, snapshot, testCase) {
|
|
1383
1656
|
if (testCase.surface !== 'ask') {
|
|
1384
1657
|
return { key: 'ask_staleness', passed: true, reason: 'Not an ask answer.' };
|
|
@@ -1472,6 +1745,7 @@ function evaluateAskClaims(output, snapshot, testCase) {
|
|
|
1472
1745
|
for (const claim of extractAskWeightClaims(normalized)) {
|
|
1473
1746
|
if (isEstimatedOneRepMaxWeightClaim(normalized, claim)) continue;
|
|
1474
1747
|
if (isVolumeWeightClaim(normalized, claim)) continue;
|
|
1748
|
+
if (isBodyWeightClaim(normalized, claim)) continue;
|
|
1475
1749
|
const referencedExercise = findNearestMentionedExercise(mentionedExercises, claim.index);
|
|
1476
1750
|
if (!referencedExercise) continue;
|
|
1477
1751
|
const allowedWeights = allowedWeightsForExercise(snapshot, referencedExercise.normalizedName);
|
|
@@ -1713,6 +1987,7 @@ function evaluateAskToolProvenance(output, context, testCase, snapshot) {
|
|
|
1713
1987
|
if (isWithinWeightedSetClaim(claim, weightedSetClaims)) continue;
|
|
1714
1988
|
if (isEstimatedOneRepMaxWeightClaim(output, claim)) continue;
|
|
1715
1989
|
if (isVolumeWeightClaim(output, claim)) continue;
|
|
1990
|
+
if (isBodyWeightClaim(output, claim)) continue;
|
|
1716
1991
|
const referencedExercise = findNearestMentionedExercise(mentionedExercises, claim.index);
|
|
1717
1992
|
if (!referencedExercise) continue;
|
|
1718
1993
|
const rows = evidenceRows.filter((row) => row.normalizedName === referencedExercise.normalizedName);
|
|
@@ -1770,6 +2045,309 @@ function evaluateAskToolProvenance(output, context, testCase, snapshot) {
|
|
|
1770
2045
|
};
|
|
1771
2046
|
}
|
|
1772
2047
|
|
|
2048
|
+
/**
 * Lists the score-history entries of a snapshot with duplicates removed.
 *
 * Entries are identified by `id`, falling back to `snapshotAt`. Falsy entries
 * are dropped; entries with neither identifier are all kept because they
 * cannot be compared.
 *
 * @param {object} snapshot - Snapshot handed to scoreHistoryFromSnapshot.
 * @returns {object[]} Entries in original order, first occurrence of each key.
 */
function scoreFormulaEntries(snapshot) {
  const seenKeys = new Set();
  const kept = [];
  for (const entry of scoreHistoryFromSnapshot(snapshot)) {
    if (!entry) continue;
    const identity = entry.id ?? entry.snapshotAt;
    if (identity != null) {
      if (seenKeys.has(identity)) continue;
      seenKeys.add(identity);
    }
    kept.push(entry);
  }
  return kept;
}
|
|
2059
|
+
|
|
2060
|
+
/**
 * Verifies that every score snapshot uses the formula version the case pins.
 *
 * The pin is `testCase.expectedFormulaVersion`, falling back to
 * `testCase.formulaVersion`; without a pin the check passes. With a pin it
 * passes only when there is at least one entry, none lacks a version, and all
 * versions equal the pin.
 *
 * @param {string} _output - Unused; present to keep the evaluator signature.
 * @param {object} snapshot - Snapshot whose score entries are inspected.
 * @param {object} testCase - Eval case carrying the optional pin.
 * @returns {{key: string, passed: boolean, reason: string}} Check result.
 */
function evaluateFormulaVersion(_output, snapshot, testCase) {
  const key = 'formula_version';
  const expected = testCase.expectedFormulaVersion ?? testCase.formulaVersion ?? null;
  if (!expected) {
    return { key, passed: true, reason: 'No formula version pin configured.' };
  }

  const entries = scoreFormulaEntries(snapshot);
  const missingCount = entries.filter((entry) => !entry?.formulaVersion).length;
  const versions = uniqueStrings(entries.map((entry) => entry?.formulaVersion));
  const passed = entries.length > 0 && missingCount === 0 && versions.every((version) => version === expected);

  // Most specific explanation first: unversioned entries, then wrong versions,
  // then no score data at all.
  let reason;
  if (passed) {
    reason = `Formula version is pinned to ${expected}.`;
  } else if (missingCount > 0) {
    reason = `Expected formula version ${expected}, but ${missingCount} score snapshot(s) have no formula version.`;
  } else if (versions.length > 0) {
    reason = `Expected formula version ${expected}, got ${versions.join(', ')}.`;
  } else {
    reason = `Expected formula version ${expected}, but snapshot has no increment score formula version.`;
  }
  return { key, passed, reason };
}
|
|
2082
|
+
|
|
2083
|
+
/**
 * Tests whether every item of `expected` is present in `actual`.
 * Items are compared with Set membership; null/undefined arguments act as
 * empty lists.
 *
 * @param {Iterable<unknown>|null} [actual=[]] - Values that are present.
 * @param {unknown[]|null} [expected=[]] - Values that must be present.
 * @returns {boolean} True when nothing from `expected` is missing.
 */
function arrayContainsAll(actual = [], expected = []) {
  const present = new Set(actual ?? []);
  for (const item of expected ?? []) {
    if (!present.has(item)) {
      return false;
    }
  }
  return true;
}
|
|
2087
|
+
|
|
2088
|
+
/**
 * Order-insensitive equality of two arrays.
 * Both inputs must be arrays of the same length; sorted copies (default sort
 * order) are then compared element by element with `===`.
 *
 * @param {unknown} [actual=[]] - Observed values.
 * @param {unknown} [expected=[]] - Wanted values.
 * @returns {boolean} True when both hold the same items regardless of order.
 */
function arrayEquals(actual = [], expected = []) {
  if (!Array.isArray(actual) || !Array.isArray(expected)) {
    return false;
  }
  if (actual.length !== expected.length) {
    return false;
  }

  const left = [...actual].sort();
  const right = [...expected].sort();
  for (let i = 0; i < left.length; i += 1) {
    if (left[i] !== right[i]) {
      return false;
    }
  }
  return true;
}
|
|
2096
|
+
|
|
2097
|
+
/**
 * Tests whether an actual observation check satisfies an expected one.
 * Only keys present on `expectedCheck` are compared: array values must be
 * contained in the actual array, every other value must be strictly equal.
 *
 * @param {object|null|undefined} actualCheck - Check taken from the evidence plan.
 * @param {object|null|undefined} expectedCheck - Partial check from the fixture.
 * @returns {boolean} True when every expected key matches (vacuously true for no keys).
 */
function askObservationCheckMatches(actualCheck, expectedCheck) {
  for (const [field, wanted] of Object.entries(expectedCheck ?? {})) {
    const found = actualCheck?.[field];
    const fieldMatches = Array.isArray(wanted) ? arrayContainsAll(found, wanted) : found === wanted;
    if (!fieldMatches) {
      return false;
    }
  }
  return true;
}
|
|
2103
|
+
|
|
2104
|
+
/**
 * Compares the routed Ask evidence plan with a fixture's `expectedEvidencePlan`.
 *
 * Assertions applied when present on the expectation:
 * - `route`, `effectiveRoute`, `fallbackRoute`: strict equality.
 * - `requiredTools`, `optionalTools`, `executedTools`, `evidenceGaps`:
 *   order-insensitive array equality.
 * - `excludedExecutedTools`: none of them may appear in `plan.executedTools`.
 * - `observationChecks`: each expected check must match some actual check, and
 *   the number of actual checks must equal the number expected.
 *
 * A plan field with an unexpected shape (for example a string where a list is
 * expected) is reported as a failed assertion rather than raising a TypeError
 * while the failure message is being built.
 *
 * @param {string} _output - Unused; present to keep the evaluator signature.
 * @param {object|null|undefined} context - Eval context; `routedMetadata.evidencePlan` is read.
 * @param {object} testCase - Eval case; `surface` and `expectedEvidencePlan` are read.
 * @returns {{key: string, passed: boolean, reason: string}} Check result.
 */
function evaluateAskEvidencePlan(_output, context, testCase) {
  const key = 'ask_evidence_plan';

  if (testCase.surface !== 'ask') {
    return { key, passed: true, reason: 'Not an ask answer.' };
  }

  const expected = testCase.expectedEvidencePlan ?? null;
  if (!expected) {
    return { key, passed: true, reason: 'No evidence plan assertion configured.' };
  }

  // Shape guards: only real arrays are iterated or joined.
  const listOf = (value) => (Array.isArray(value) ? value : []);
  const describeList = (value) => (Array.isArray(value) ? value.join(', ') : String(value ?? ''));

  const plan = context?.routedMetadata?.evidencePlan ?? null;
  const failures = [];
  if (!plan) {
    failures.push('Routed Ask context did not expose metadata.evidencePlan.');
  } else {
    for (const field of ['route', 'effectiveRoute', 'fallbackRoute']) {
      if (field in expected && plan[field] !== expected[field]) {
        failures.push(`Expected evidencePlan.${field}=${expected[field] ?? 'null'}, got ${plan[field] ?? 'null'}.`);
      }
    }

    for (const field of ['requiredTools', 'optionalTools', 'executedTools', 'evidenceGaps']) {
      if (Array.isArray(expected[field]) && !arrayEquals(plan[field], expected[field])) {
        failures.push(`Expected evidencePlan.${field} to equal ${expected[field].join(', ')}; got ${describeList(plan[field])}.`);
      }
    }

    if (Array.isArray(expected.excludedExecutedTools)) {
      const executed = new Set(listOf(plan.executedTools));
      const hits = expected.excludedExecutedTools.filter((toolName) => executed.has(toolName));
      if (hits.length > 0) {
        failures.push(`Expected evidencePlan.executedTools to exclude ${hits.join(', ')}.`);
      }
    }

    const actualChecks = listOf(plan.observationChecks);
    for (const expectedCheck of expected.observationChecks ?? []) {
      const matched = actualChecks.some((actualCheck) => askObservationCheckMatches(actualCheck, expectedCheck));
      if (!matched) {
        failures.push(`Expected observation check ${JSON.stringify(expectedCheck)}; got ${JSON.stringify(plan.observationChecks ?? [])}.`);
      }
    }
    // The count check rejects extra, unasserted observation checks.
    if (Array.isArray(expected.observationChecks) && actualChecks.length !== expected.observationChecks.length) {
      failures.push(`Expected ${expected.observationChecks.length} observation check(s), got ${actualChecks.length}.`);
    }
  }

  return {
    key,
    passed: failures.length === 0,
    reason: failures.length === 0
      ? 'Ask evidence plan matches configured assertions.'
      : failures.join(' ')
  };
}
|
|
2158
|
+
|
|
2159
|
+
/**
 * Collects every id that the routed Ask metadata references, across four
 * places: `includedCoachObservationIds`, `coachObservationIds`, the
 * `observationId` of each session-observation comparison, and the `sourceIds`
 * of each provenance item.
 *
 * @param {object|null|undefined} metadata - Routed Ask metadata.
 * @returns {Set<unknown>} Referenced ids; empty when metadata has none.
 */
function askMetadataObservationReferences(metadata) {
  const references = new Set();
  const addAll = (ids) => {
    for (const id of ids) references.add(id);
  };

  addAll(metadata?.includedCoachObservationIds ?? []);
  addAll(metadata?.coachObservationIds ?? []);

  for (const comparison of metadata?.sessionObservationComparisons ?? []) {
    // Comparisons without an observation id contribute nothing.
    if (comparison?.observationId) {
      references.add(comparison.observationId);
    }
  }

  for (const item of metadata?.provenance ?? []) {
    addAll(item?.sourceIds ?? []);
  }

  return references;
}
|
|
2174
|
+
|
|
2175
|
+
/**
 * Eval check: compares routed Ask metadata (and the answer text) against the
 * optional `testCase.expectedMetadata` assertions.
 *
 * Supported assertions, each applied only when the expected value is an array:
 * - `includedCoachObservationIds`: every id must appear in
 *   `metadata.includedCoachObservationIds`.
 * - `excludedCoachObservationIds`: no id may appear anywhere in the
 *   references collected by `askMetadataObservationReferences`.
 * - `forbiddenObservationPhrases`: no phrase may occur in `output`
 *   (matching is delegated to `phraseIncludes`).
 *
 * @param {string} output - Answer text shown to the user.
 * @param {object} context - Eval context; `routedMetadata` is read from it.
 * @param {object} testCase - Eval case (`surface`, `expectedMetadata`).
 * @returns {{key: string, passed: boolean, reason: string}} Check result.
 */
function evaluateAskMetadata(output, context, testCase) {
  const key = 'ask_metadata';

  if (testCase.surface !== 'ask') {
    return { key, passed: true, reason: 'Not an ask answer.' };
  }

  const expected = testCase.expectedMetadata ?? null;
  if (!expected) {
    return { key, passed: true, reason: 'No Ask metadata assertion configured.' };
  }

  const metadata = context?.routedMetadata ?? {};
  const failures = [];

  const mustInclude = expected.includedCoachObservationIds;
  if (Array.isArray(mustInclude)) {
    const present = new Set(metadata.includedCoachObservationIds ?? []);
    const absent = mustInclude.filter((id) => !present.has(id));
    if (absent.length > 0) {
      failures.push(`Expected included coach observation id(s): ${absent.join(', ')}.`);
    }
  }

  const mustExclude = expected.excludedCoachObservationIds;
  if (Array.isArray(mustExclude)) {
    const referenced = askMetadataObservationReferences(metadata);
    const leakedIds = mustExclude.filter((id) => referenced.has(id));
    if (leakedIds.length > 0) {
      failures.push(`Expected coach observation id(s) to be excluded from rendered metadata: ${leakedIds.join(', ')}.`);
    }
  }

  const bannedPhrases = expected.forbiddenObservationPhrases;
  if (Array.isArray(bannedPhrases)) {
    const leakedPhrases = uniqueStrings(bannedPhrases).filter((phrase) => phraseIncludes(output, phrase));
    if (leakedPhrases.length > 0) {
      failures.push(`Dismissed or excluded observation phrase(s) leaked into Ask answer: ${leakedPhrases.join(', ')}.`);
    }
  }

  const passed = failures.length === 0;
  return {
    key,
    passed,
    reason: passed ? 'Ask metadata matches configured assertions.' : failures.join(' ')
  };
}
|
|
2219
|
+
|
|
2220
|
+
/**
 * Normalizes free text for loose equality comparisons between structured
 * strings (follow-up suggestions, the user question, forbidden phrases).
 *
 * Steps: coerce to string (nullish becomes ''), apply Unicode NFKC
 * normalization, lowercase, replace every run of characters that are not
 * letters, combining marks or numbers with a single space, drop the filler
 * tokens "my", "the", "a" and "an", and join what remains with single spaces.
 *
 * For ASCII input the result is identical to an ASCII-only `[a-z0-9]` filter.
 * Unlike such a filter, non-ASCII letters are preserved, so accented or
 * non-Latin text does not collapse into truncated or empty strings.
 *
 * @param {*} value - Text to normalize; nullish is treated as an empty string.
 * @returns {string} Normalized, space-separated token string (possibly '').
 */
function normalizedStructuredText(value) {
  const fillerTokens = new Set(['my', 'the', 'a', 'an']);
  return String(value ?? '')
    // NFKC so composed and decomposed forms of the same text compare equal.
    .normalize('NFKC')
    .toLowerCase()
    // Keep marks (\p{M}) so lowercasing results such as "i̇" stay one token.
    .replace(/[^\p{L}\p{M}\p{N}]+/gu, ' ')
    .split(' ')
    // Whole-token filtering avoids ASCII-only `\b` semantics next to non-ASCII letters.
    .filter((token) => token !== '' && !fillerTokens.has(token))
    .join(' ');
}
|
|
2228
|
+
|
|
2229
|
+
/**
 * Coerces an array into a list of trimmed, non-empty strings.
 *
 * @param {*} value - Candidate array; anything that is not an array yields [].
 * @returns {string[]} Each entry stringified (nullish becomes '') and trimmed,
 *   with empty results dropped.
 */
function structuredStringArray(value) {
  if (!Array.isArray(value)) {
    return [];
  }
  const strings = [];
  for (const entry of value) {
    const text = String(entry ?? '').trim();
    if (text) {
      strings.push(text);
    }
  }
  return strings;
}
|
|
2234
|
+
|
|
2235
|
+
/**
 * Plucks one property from each item of an array as a trimmed string.
 *
 * @param {*} items - Candidate array of objects; non-arrays yield [].
 * @param {string} key - Property to read from each item.
 * @returns {string[]} Trimmed, non-empty string values of `item[key]`;
 *   nullish items or values are skipped.
 */
function structuredObjectStringArray(items, key) {
  if (!Array.isArray(items)) {
    return [];
  }
  return items.flatMap((entry) => {
    const text = String(entry?.[key] ?? '').trim();
    return text ? [text] : [];
  });
}
|
|
2240
|
+
|
|
2241
|
+
/**
 * Asserts that every expected string is present among the actual strings,
 * appending one failure message that lists all missing entries.
 *
 * Both sides go through `structuredStringArray`, so they are stringified and
 * trimmed identically. Without this, an expected entry with stray whitespace
 * or a non-string type could never match the already-normalized actual values.
 * Blank expected entries are ignored.
 *
 * @param {*} actual - Actual values; non-arrays are treated as empty.
 * @param {*} expected - Required values; the check is skipped unless this is an array.
 * @param {string} label - Human-readable noun used in the failure message.
 * @param {string[]} failures - Accumulator that receives the failure message (mutated).
 * @returns {void}
 */
function requireStructuredStrings(actual, expected, label, failures) {
  if (!Array.isArray(expected)) return;
  const actualSet = new Set(structuredStringArray(actual));
  const missing = structuredStringArray(expected).filter((item) => !actualSet.has(item));
  if (missing.length > 0) {
    failures.push(`Expected structured ${label}: ${missing.join(', ')}.`);
  }
}
|
|
2249
|
+
|
|
2250
|
+
/**
 * Flags follow-up suggestions that match a forbidden suggestion after
 * normalization with `normalizedStructuredText`.
 *
 * @param {*} actual - Suggestions produced; read through `structuredStringArray`.
 * @param {*} forbidden - Forbidden suggestions; the check is skipped unless this is an array.
 * @param {string[]} failures - Accumulator that receives the failure message (mutated).
 * @returns {void}
 */
function forbidStructuredSuggestions(actual, forbidden, failures) {
  if (!Array.isArray(forbidden)) {
    return;
  }

  // Suggestions that normalize to an empty string can never match.
  const seen = new Set();
  for (const suggestion of structuredStringArray(actual)) {
    const normalized = normalizedStructuredText(suggestion);
    if (normalized) {
      seen.add(normalized);
    }
  }

  const offenders = [];
  for (const candidate of forbidden) {
    if (seen.has(normalizedStructuredText(candidate))) {
      offenders.push(candidate);
    }
  }

  if (offenders.length === 0) {
    return;
  }
  // The message reports the forbidden entries as configured, not their normalized form.
  failures.push(`Forbidden follow-up suggestion(s) present: ${offenders.join(', ')}.`);
}
|
|
2258
|
+
|
|
2259
|
+
/**
 * Eval check: validates the structured Ask response against the optional
 * `testCase.expectedStructuredResponse` assertions.
 *
 * Checks run in this order when `structured` is a plain object:
 * confidence equality; required evidence tools, evidence labels, recommended
 * action labels, follow-up suggestions and limitations; forbidden follow-ups;
 * follow-up uniqueness (after normalization); follow-ups must not repeat the
 * current question; max/min follow-up counts; program draft presence.
 * Uniqueness and the repeat-question rule apply whenever assertions are
 * configured; the other checks only when their expected value is set.
 *
 * @param {string} _output - Unused; kept for a uniform check signature.
 * @param {object} context - Eval context; `question` is read from it.
 * @param {object} testCase - Eval case (`surface`, `expectedStructuredResponse`, question fallbacks).
 * @param {object | null} structured - Structured response to validate.
 * @returns {{key: string, passed: boolean, reason: string}} Check result.
 */
function evaluateAskStructuredResponse(_output, context, testCase, structured) {
  const key = 'ask_structured_response';

  if (testCase.surface !== 'ask') {
    return { key, passed: true, reason: 'Not an ask answer.' };
  }

  const expected = testCase.expectedStructuredResponse ?? null;
  if (!expected) {
    return { key, passed: true, reason: 'No structured response assertion configured.' };
  }

  const failures = [];
  const isUsableObject = Boolean(structured) && typeof structured === 'object' && !Array.isArray(structured);

  if (isUsableObject) {
    if (expected.confidence && structured.confidence !== expected.confidence) {
      failures.push(`Expected structured confidence ${expected.confidence}, got ${structured.confidence ?? 'null'}.`);
    }

    // Actual values are read lazily so each read happens right before its check.
    const requiredChecks = [
      {
        read: () => structuredObjectStringArray(structured.evidenceUsed, 'toolName'),
        wanted: expected.requiredEvidenceTools,
        label: 'evidence tool(s)'
      },
      {
        read: () => structuredObjectStringArray(structured.evidenceUsed, 'label'),
        wanted: expected.requiredEvidenceLabels,
        label: 'evidence label(s)'
      },
      {
        read: () => structuredObjectStringArray(structured.recommendedActions, 'label'),
        wanted: expected.requiredRecommendedActionLabels,
        label: 'recommended action label(s)'
      },
      {
        read: () => structured.followUpSuggestions,
        wanted: expected.requiredFollowUpSuggestions,
        label: 'follow-up suggestion(s)'
      },
      {
        read: () => structured.limitations,
        wanted: expected.requiredLimitations,
        label: 'limitation(s)'
      }
    ];
    for (const { read, wanted, label } of requiredChecks) {
      requireStructuredStrings(read(), wanted, label, failures);
    }

    forbidStructuredSuggestions(structured.followUpSuggestions, expected.forbiddenFollowUpSuggestions, failures);

    const followUps = structuredStringArray(structured.followUpSuggestions);
    const normalizedFollowUps = followUps.map(normalizedStructuredText).filter(Boolean);

    // Any shrinkage after de-duplication means at least two suggestions collide.
    const distinctCount = new Set(normalizedFollowUps).size;
    if (normalizedFollowUps.length - distinctCount > 0) {
      failures.push('Structured follow-up suggestions must be unique.');
    }

    const rawQuestion = context?.question ?? testCase.context?.question ?? testCase.question ?? '';
    const normalizedQuestion = normalizedStructuredText(rawQuestion);
    if (normalizedQuestion && normalizedFollowUps.includes(normalizedQuestion)) {
      failures.push('Structured follow-up suggestions must not repeat the current user question.');
    }

    const { maxFollowUpSuggestions, minFollowUpSuggestions, programDraftPresent } = expected;
    if (Number.isFinite(maxFollowUpSuggestions) && followUps.length > maxFollowUpSuggestions) {
      failures.push(`Expected at most ${maxFollowUpSuggestions} follow-up suggestion(s), got ${followUps.length}.`);
    }
    if (Number.isFinite(minFollowUpSuggestions) && followUps.length < minFollowUpSuggestions) {
      failures.push(`Expected at least ${minFollowUpSuggestions} follow-up suggestion(s), got ${followUps.length}.`);
    }

    if (typeof programDraftPresent === 'boolean') {
      const hasProgramDraft = structured.programDraft != null;
      if (hasProgramDraft !== programDraftPresent) {
        failures.push(`Expected programDraft present=${programDraftPresent}, got ${hasProgramDraft}.`);
      }
    }
  } else {
    failures.push('Ask structured response was not generated.');
  }

  const passed = failures.length === 0;
  return {
    key,
    passed,
    reason: passed ? 'Ask structured response matches configured assertions.' : failures.join(' ')
  };
}
|
|
2344
|
+
|
|
2345
|
+
/**
 * Selects the program draft to attach to the structured Ask response.
 *
 * The draft is withheld (undefined) when routing metadata marks the request
 * as a `successor_plan` coach-observation intent AND flags
 * `coachObservationFollowUpMissing` as exactly `true`.
 *
 * @param {object | null | undefined} parsedAsk - Parsed Ask output carrying `programDraft`.
 * @param {object | null | undefined} routingMetadata - Routed Ask metadata.
 * @returns {*} `parsedAsk.programDraft`, or undefined when suppressed or absent.
 */
function askStructuredProgramDraft(parsedAsk, routingMetadata) {
  if (routingMetadata?.requestedCoachObservationIntent === 'successor_plan') {
    if (routingMetadata?.coachObservationFollowUpMissing === true) {
      return undefined;
    }
  }
  return parsedAsk?.programDraft;
}
|
|
2350
|
+
|
|
1773
2351
|
function firstAction(payload) {
|
|
1774
2352
|
const actions = Array.isArray(payload?.recommendedNextActions) ? payload.recommendedNextActions : [];
|
|
1775
2353
|
return actions.find((action) => typeof action?.action === 'string' && action.action.trim());
|
|
@@ -1998,17 +2576,20 @@ function evaluatePersonaMotivation(output, context, testCase) {
|
|
|
1998
2576
|
failures.push('Feedback acknowledges a PR/positive result but frames later-set dropoff in a demotivating way.');
|
|
1999
2577
|
}
|
|
2000
2578
|
|
|
2579
|
+
const dataLimitationCaveat = /\bnot enough\s+(?:\w+\s+){0,5}?(?:data|details?|context|evidence|information|info|signals?|metrics?)\b/i.test(normalized)
|
|
2580
|
+
|| /\bnot enough\s+(?:\w+\s+){0,8}?to\s+(?:separate|infer|tie|connect|attribute|blame|claim|say|show|prove|know|call)\b/i.test(normalized);
|
|
2581
|
+
|
|
2001
2582
|
const discouragingPatterns = [
|
|
2002
2583
|
/\bdisappointing\b/i,
|
|
2003
2584
|
/\bunderwhelming\b/i,
|
|
2004
2585
|
/\bunderperformed\b/i,
|
|
2005
2586
|
/\bpoor\b/i,
|
|
2006
|
-
/\bnot enough\b/i,
|
|
2587
|
+
/\bnot enough\s+(?:effort|work|volume|intensity|reps?|sets?|weight|load|progress|consistency)\b/i,
|
|
2007
2588
|
/\bfailed to\b/i,
|
|
2008
2589
|
/\bstruggled\b/i
|
|
2009
2590
|
];
|
|
2010
2591
|
|
|
2011
|
-
if (discouragingPatterns.some((pattern) => pattern.test(normalized))) {
|
|
2592
|
+
if (!dataLimitationCaveat && discouragingPatterns.some((pattern) => pattern.test(normalized))) {
|
|
2012
2593
|
failures.push('Feedback uses discouraging language that is likely to reduce motivation.');
|
|
2013
2594
|
}
|
|
2014
2595
|
|
|
@@ -2026,6 +2607,33 @@ export async function runSummaryEvalCase(testCase) {
|
|
|
2026
2607
|
return runSummaryEvalCaseFromSnapshot(testCase, snapshot);
|
|
2027
2608
|
}
|
|
2028
2609
|
|
|
2610
|
+
/**
 * Eval check for `<program_draft>` blocks in Ask answers.
 *
 * If the answer contains a program draft block, the block has to parse into a
 * non-null `programDraft` under the same normalizer the runtime uses to accept
 * or drop drafts (valid JSON in the exact Program shape: enums, limits, no
 * forbidden keys). This surfaces malformed drafts in CI rather than letting
 * prod drop them silently. Answers without a block pass trivially.
 *
 * @param {string} output - Raw Ask answer, including any draft block.
 * @param {object} testCase - Eval case; only `surface` is read.
 * @param {object | null} [parsedAsk=null] - Previously parsed result to reuse;
 *   when nullish the output is parsed here in strict mode.
 * @returns {{key: string, passed: boolean, reason: string}} Check result.
 */
function evaluateProgramDraft(output, testCase, parsedAsk = null) {
  const key = 'program_draft';

  if (testCase.surface !== 'ask') {
    return { key, passed: true, reason: 'Not an ask answer.' };
  }
  if (!hasProgramDraftBlock(output)) {
    return { key, passed: true, reason: 'No program draft block.' };
  }

  // Mirror the runtime rules exactly: the runtime supplies canonicalExerciseName
  // (which strips non-alphanumerics). Omitting it would let the eval approve
  // drafts, such as punctuation-only exercise names, that prod silently drops.
  const parsed = parsedAsk ?? extractAskProgramDraft(output, {
    canonicalizeExerciseName: canonicalExerciseName,
    strict: true
  });

  if (parsed.programDraft != null) {
    return {
      key,
      passed: true,
      reason: 'Program draft is valid JSON matching the required shape.'
    };
  }
  return {
    key,
    passed: false,
    reason: 'Program draft block is malformed (invalid JSON, or fails shape/enum/limit validation).'
  };
}
|
|
2636
|
+
|
|
2029
2637
|
export function evaluateSummaryOutputFromSnapshot(testCase, snapshot, output) {
|
|
2030
2638
|
const context = buildSummaryEvalContext(snapshot, testCase);
|
|
2031
2639
|
if (context == null) {
|
|
@@ -2036,26 +2644,53 @@ export function evaluateSummaryOutputFromSnapshot(testCase, snapshot, output) {
|
|
|
2036
2644
|
throw new Error(`Eval case ${testCase.id} produced an empty output`);
|
|
2037
2645
|
}
|
|
2038
2646
|
|
|
2647
|
+
// strict: eval rejects a draft with any malformed nested item (the runtime
|
|
2648
|
+
// salvages it, but partial malformation is a regression signal). The parsed
|
|
2649
|
+
// result also feeds <program_draft> stripping for the other checks.
|
|
2650
|
+
const parsedAsk = testCase.surface === 'ask'
|
|
2651
|
+
? extractAskProgramDraft(output, { canonicalizeExerciseName: canonicalExerciseName, strict: true })
|
|
2652
|
+
: null;
|
|
2653
|
+
const structuredParsedAsk = testCase.surface === 'ask'
|
|
2654
|
+
? extractAskProgramDraft(output, { canonicalizeExerciseName: canonicalExerciseName })
|
|
2655
|
+
: null;
|
|
2656
|
+
const visibleOutput = parsedAsk
|
|
2657
|
+
? stripXMLTagBlocks(parsedAsk.answerText)
|
|
2658
|
+
: output;
|
|
2659
|
+
const structuredAsk = testCase.surface === 'ask'
|
|
2660
|
+
? buildAskStructuredResponse(visibleOutput, context.routedMetadata ?? {}, {
|
|
2661
|
+
programDraft: askStructuredProgramDraft(structuredParsedAsk, context.routedMetadata),
|
|
2662
|
+
question: context.question ?? testCase.context?.question ?? testCase.question ?? ''
|
|
2663
|
+
})
|
|
2664
|
+
: null;
|
|
2665
|
+
|
|
2039
2666
|
const checks = [
|
|
2040
|
-
evaluateNoInsight(
|
|
2041
|
-
evaluateShape(
|
|
2042
|
-
evaluateRequiredMentions(
|
|
2043
|
-
evaluateAnyOfMentions(
|
|
2044
|
-
evaluateForbiddenPhrases(
|
|
2045
|
-
evaluateForbiddenMentions(
|
|
2046
|
-
evaluateExerciseMentions(
|
|
2047
|
-
evaluateWorkoutClaims(
|
|
2048
|
-
evaluateAskClaims(
|
|
2049
|
-
evaluateAskDirectionalConsistency(
|
|
2050
|
-
|
|
2051
|
-
|
|
2052
|
-
|
|
2053
|
-
|
|
2054
|
-
|
|
2055
|
-
|
|
2056
|
-
|
|
2057
|
-
|
|
2058
|
-
|
|
2667
|
+
evaluateNoInsight(visibleOutput, testCase),
|
|
2668
|
+
evaluateShape(visibleOutput, testCase),
|
|
2669
|
+
evaluateRequiredMentions(visibleOutput, testCase),
|
|
2670
|
+
evaluateAnyOfMentions(visibleOutput, testCase),
|
|
2671
|
+
evaluateForbiddenPhrases(visibleOutput, testCase),
|
|
2672
|
+
evaluateForbiddenMentions(visibleOutput, testCase),
|
|
2673
|
+
evaluateExerciseMentions(visibleOutput, snapshot, context, testCase.surface, testCase),
|
|
2674
|
+
evaluateWorkoutClaims(visibleOutput, context, testCase),
|
|
2675
|
+
evaluateAskClaims(visibleOutput, snapshot, testCase),
|
|
2676
|
+
evaluateAskDirectionalConsistency(visibleOutput, snapshot, testCase),
|
|
2677
|
+
evaluateAskScoreVoice(visibleOutput, testCase),
|
|
2678
|
+
evaluateAskSelfReference(visibleOutput, testCase),
|
|
2679
|
+
evaluateAskVolunteeredScore(visibleOutput, testCase),
|
|
2680
|
+
evaluateAskStaleness(visibleOutput, snapshot, testCase),
|
|
2681
|
+
evaluateAskToolProvenance(visibleOutput, context, testCase, snapshot),
|
|
2682
|
+
evaluateFormulaVersion(visibleOutput, snapshot, testCase),
|
|
2683
|
+
evaluateAskEvidencePlan(visibleOutput, context, testCase),
|
|
2684
|
+
evaluateAskMetadata(visibleOutput, context, testCase),
|
|
2685
|
+
evaluateAskStructuredResponse(visibleOutput, context, testCase, structuredAsk),
|
|
2686
|
+
evaluateScoreCommentaryAction(visibleOutput, context, testCase),
|
|
2687
|
+
evaluateScoreCommentarySynthesis(visibleOutput, context, testCase),
|
|
2688
|
+
evaluateScoreCommentaryExerciseInvention(visibleOutput, snapshot, context, testCase),
|
|
2689
|
+
evaluateScoreCommentaryBand(visibleOutput, context, testCase),
|
|
2690
|
+
evaluateScoreCommentaryTone(visibleOutput, testCase),
|
|
2691
|
+
evaluateScoreCommentaryLength(visibleOutput, testCase),
|
|
2692
|
+
evaluatePersonaMotivation(visibleOutput, context, testCase),
|
|
2693
|
+
evaluateProgramDraft(output, testCase, parsedAsk)
|
|
2059
2694
|
];
|
|
2060
2695
|
|
|
2061
2696
|
return {
|