incremnt 0.4.0 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +16 -3
- package/package.json +20 -4
- package/src/anonymize.js +12 -0
- package/src/coach-bakeoff.js +300 -0
- package/src/coach-facts.js +100 -0
- package/src/coach-prompt-variants.js +106 -0
- package/src/contract.js +32 -5
- package/src/exercise-aliases.js +163 -0
- package/src/format.js +104 -1
- package/src/increment-score-replay-data.js +486 -0
- package/src/increment-score-replay.js +822 -0
- package/src/lib.js +14 -2
- package/src/local.js +3 -3
- package/src/mcp.js +67 -0
- package/src/openrouter.js +979 -182
- package/src/program-phase-resolver.js +206 -0
- package/src/prompt-security.js +1 -1
- package/src/promptfoo-domain-assert.cjs +4 -0
- package/src/promptfoo-evals.js +166 -0
- package/src/promptfoo-langfuse-scores.js +354 -0
- package/src/promptfoo-provider.cjs +14 -0
- package/src/promptfoo-tests.cjs +4 -0
- package/src/queries.js +2281 -197
- package/src/remote.js +99 -6
- package/src/score-context.js +182 -0
- package/src/state.js +9 -2
- package/src/stored-summary-eval-report.js +85 -52
- package/src/summary-evals.js +900 -21
- package/src/sync-service.js +1275 -131
- package/src/transport.js +9 -1
package/src/remote.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import fs from 'node:fs/promises';
|
|
2
2
|
import { readSnapshot } from './local.js';
|
|
3
|
-
import { executeReadCommand } from './queries.js';
|
|
3
|
+
import { executeCoachReadTool as executeLocalCoachReadTool, executeReadCommand } from './queries.js';
|
|
4
4
|
import { resolveServiceUrl } from './service-url.js';
|
|
5
5
|
|
|
6
6
|
function notImplementedError() {
|
|
@@ -40,7 +40,9 @@ const remoteCommandHandlers = {
|
|
|
40
40
|
'training-load': executeRemoteRead,
|
|
41
41
|
'ask-history': executeRemoteRead,
|
|
42
42
|
'ask-show': executeRemoteRead,
|
|
43
|
-
'program-share-fetch': executeRemoteRead
|
|
43
|
+
'program-share-fetch': executeRemoteRead,
|
|
44
|
+
'increment-score-current': executeRemoteRead,
|
|
45
|
+
'increment-score-history': executeRemoteRead
|
|
44
46
|
};
|
|
45
47
|
|
|
46
48
|
async function executeRemoteRead(options, sessionState, normalizedCommand) {
|
|
@@ -161,6 +163,18 @@ function endpointForCommand(baseUrl, normalizedCommand, options) {
|
|
|
161
163
|
return resolveServiceUrl(baseUrl, `/cli/ask/history/${options.id}`);
|
|
162
164
|
case 'program-share-fetch':
|
|
163
165
|
return resolveServiceUrl(baseUrl, `/program-share/${options.token}`);
|
|
166
|
+
case 'increment-score-current': {
|
|
167
|
+
const url = resolveServiceUrl(baseUrl, '/cli/increment-score/current');
|
|
168
|
+
if (options.historyDays) url.searchParams.set('historyDays', options.historyDays);
|
|
169
|
+
return url;
|
|
170
|
+
}
|
|
171
|
+
case 'increment-score-history': {
|
|
172
|
+
const url = resolveServiceUrl(baseUrl, '/mobile/score-snapshots');
|
|
173
|
+
if (options.from) url.searchParams.set('from', options.from);
|
|
174
|
+
if (options.to) url.searchParams.set('to', options.to);
|
|
175
|
+
if (options.limit) url.searchParams.set('limit', options.limit);
|
|
176
|
+
return url;
|
|
177
|
+
}
|
|
164
178
|
default:
|
|
165
179
|
return resolveServiceUrl(baseUrl, '/');
|
|
166
180
|
}
|
|
@@ -190,6 +204,40 @@ function resourceNotFoundMessage(normalizedCommand, options) {
|
|
|
190
204
|
return 'Requested resource was not found.';
|
|
191
205
|
}
|
|
192
206
|
|
|
207
|
+
/**
 * Run a coach read tool for the current session.
 *
 * When the session transport has a `baseUrl`, the tool input is POSTed as JSON
 * to `/cli/coach-tools/<toolName>` on that service. Otherwise the tool is run
 * locally against the snapshot stored at the transport's `fixturePath`.
 *
 * @param {string} toolName - Tool identifier; URL-encoded into the endpoint path.
 * @param {object} [input] - Tool arguments; `null`/`undefined` is sent as `{}`.
 * @param {object} sessionState - Holder of `session.transport` and `session.auth`.
 * @returns {Promise<*>} Parsed JSON response, or the local tool result.
 * @throws {Error} Authentication error on 401/403, `REMOTE_NOT_FOUND` on 404,
 *   `REMOTE_HTTP_ERROR` on any other non-OK status, or the not-implemented
 *   error when neither `baseUrl` nor `fixturePath` is configured.
 */
async function executeRemoteCoachReadTool(toolName, input, sessionState) {
  const baseUrl = sessionState.session?.transport?.baseUrl;

  // No remote service configured: fall back to the local fixture snapshot.
  if (!baseUrl) {
    const fixturePath = sessionState.session?.transport?.fixturePath;
    if (!fixturePath) throw notImplementedError();
    const snapshot = await readSnapshot(fixturePath);
    return executeLocalCoachReadTool(snapshot, toolName, input);
  }

  const fail = (message, code) => {
    const error = new Error(message);
    error.code = code;
    return error;
  };

  const endpoint = resolveServiceUrl(baseUrl, `/cli/coach-tools/${encodeURIComponent(toolName)}`);
  const response = await fetch(endpoint, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      Authorization: `Bearer ${sessionState.session?.auth?.accessToken ?? ''}`
    },
    body: JSON.stringify(input ?? {})
  });

  const { status } = response;
  if (status === 401 || status === 403) throw authenticationFailedError();
  if (status === 404) throw fail(`Unknown coach read tool: ${toolName}`, 'REMOTE_NOT_FOUND');

  if (response.ok) return response.json();

  // Prefer the service-provided error message; tolerate non-JSON error bodies.
  const payload = await response.json().catch(() => null);
  throw fail(
    payload?.error ?? `Unexpected error from incremnt sync service (HTTP ${response.status}).`,
    'REMOTE_HTTP_ERROR'
  );
}
|
|
240
|
+
|
|
193
241
|
const remoteWriteCommandHandlers = {
|
|
194
242
|
'programs-propose': async (options, sessionState) => {
|
|
195
243
|
const baseUrl = sessionState.session?.transport?.baseUrl;
|
|
@@ -346,16 +394,54 @@ const remoteWriteCommandHandlers = {
|
|
|
346
394
|
return response.json();
|
|
347
395
|
},
|
|
348
396
|
|
|
397
|
+
'increment-score-upload': async (options, sessionState) => {
|
|
398
|
+
const baseUrl = sessionState.session?.transport?.baseUrl;
|
|
399
|
+
if (!baseUrl) throw notImplementedError();
|
|
400
|
+
if (!options.file) {
|
|
401
|
+
const error = new Error('--file is required for increment-score upload.');
|
|
402
|
+
error.code = 'MISSING_OPTION';
|
|
403
|
+
throw error;
|
|
404
|
+
}
|
|
405
|
+
|
|
406
|
+
const raw = await fs.readFile(options.file, 'utf8');
|
|
407
|
+
const body = JSON.parse(raw);
|
|
408
|
+
if (!body || !Array.isArray(body.snapshots)) {
|
|
409
|
+
const error = new Error('Invalid file: expected an object with a snapshots array.');
|
|
410
|
+
error.code = 'INVALID_PAYLOAD';
|
|
411
|
+
throw error;
|
|
412
|
+
}
|
|
413
|
+
|
|
414
|
+
const endpoint = resolveServiceUrl(baseUrl, '/mobile/score-snapshots');
|
|
415
|
+
const response = await fetch(endpoint, {
|
|
416
|
+
method: 'POST',
|
|
417
|
+
headers: {
|
|
418
|
+
'Content-Type': 'application/json',
|
|
419
|
+
Authorization: `Bearer ${sessionState.session?.auth?.accessToken ?? ''}`
|
|
420
|
+
},
|
|
421
|
+
body: JSON.stringify(body)
|
|
422
|
+
});
|
|
423
|
+
|
|
424
|
+
if (response.status === 401 || response.status === 403) throw authenticationFailedError();
|
|
425
|
+
if (!response.ok) {
|
|
426
|
+
const payload = await response.json().catch(() => null);
|
|
427
|
+
const error = new Error(payload?.error ?? `Unexpected error (HTTP ${response.status}).`);
|
|
428
|
+
error.code = 'REMOTE_HTTP_ERROR';
|
|
429
|
+
throw error;
|
|
430
|
+
}
|
|
431
|
+
|
|
432
|
+
return response.json();
|
|
433
|
+
},
|
|
434
|
+
|
|
349
435
|
'program-share-revoke': async (options, sessionState) => {
|
|
350
436
|
const baseUrl = sessionState.session?.transport?.baseUrl;
|
|
351
437
|
if (!baseUrl) throw notImplementedError();
|
|
352
|
-
if (!options
|
|
353
|
-
const error = new Error('--
|
|
438
|
+
if (!options['share-id']) {
|
|
439
|
+
const error = new Error('--share-id is required for programs share revoke.');
|
|
354
440
|
error.code = 'MISSING_OPTION';
|
|
355
441
|
throw error;
|
|
356
442
|
}
|
|
357
443
|
|
|
358
|
-
const endpoint = resolveServiceUrl(baseUrl, `/cli/program-share/${options
|
|
444
|
+
const endpoint = resolveServiceUrl(baseUrl, `/cli/program-share/${options['share-id']}/revoke`);
|
|
359
445
|
const response = await fetch(endpoint, {
|
|
360
446
|
method: 'POST',
|
|
361
447
|
headers: {
|
|
@@ -365,7 +451,7 @@ const remoteWriteCommandHandlers = {
|
|
|
365
451
|
|
|
366
452
|
if (response.status === 401 || response.status === 403) throw authenticationFailedError();
|
|
367
453
|
if (response.status === 404) {
|
|
368
|
-
const error = new Error(`Program share not found: ${options
|
|
454
|
+
const error = new Error(`Program share not found: ${options['share-id']}`);
|
|
369
455
|
error.code = 'REMOTE_NOT_FOUND';
|
|
370
456
|
throw error;
|
|
371
457
|
}
|
|
@@ -407,6 +493,13 @@ export function createRemoteTransport(sessionState, transportOptions = {}) {
|
|
|
407
493
|
|
|
408
494
|
return handler(options, sessionState, normalizedCommand);
|
|
409
495
|
},
|
|
496
|
+
async executeCoachReadTool(toolName, input = {}) {
|
|
497
|
+
if (transportOptions.expired) {
|
|
498
|
+
throw expiredSessionError();
|
|
499
|
+
}
|
|
500
|
+
|
|
501
|
+
return executeRemoteCoachReadTool(toolName, input, sessionState);
|
|
502
|
+
},
|
|
410
503
|
async executeWriteCommand(normalizedCommand, options = {}) {
|
|
411
504
|
if (transportOptions.expired) {
|
|
412
505
|
throw expiredSessionError();
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
// Derived context for INCREMNT Score snapshots.
|
|
2
|
+
//
|
|
3
|
+
// These fields are computed at response time from existing snapshot data.
|
|
4
|
+
// They are NOT persisted — pure projections of (current snapshot, previous
|
|
5
|
+
// snapshot) into agent-friendly explanatory shape. See GitHub issue #498.
|
|
6
|
+
|
|
7
|
+
// Score bands, ordered from lowest to highest. Each band covers
// [min, max): the lower bound is inclusive and the upper bound exclusive.
// 'peak' uses max 101 so that a score of exactly 100 still lands in it.
//   0..40    weak
//   40..60   developing
//   60..75   solid
//   75..90   strong
//   90..100  peak
export const SCORE_BANDS = [
  { name: 'weak', min: 0, max: 40 },
  { name: 'developing', min: 40, max: 60 },
  { name: 'solid', min: 60, max: 75 },
  { name: 'strong', min: 75, max: 90 },
  { name: 'peak', min: 90, max: 101 }
];

/**
 * Map a numeric score onto the name of its band.
 *
 * @param {number} score - Score to classify.
 * @returns {string|null} Band name, or null when `score` is not a finite
 *   number or falls outside every band.
 */
export function computeScoreBand(score) {
  // Number.isFinite is false for every non-number, so it also covers the type check.
  if (!Number.isFinite(score)) return null;

  const match = SCORE_BANDS.find(({ min, max }) => min <= score && score < max);
  return match ? match.name : null;
}
|
|
28
|
+
|
|
29
|
+
// Component-keyed action templates surfaced as `recommendedNextActions` for
// each top-2 negative driver. Keep these short, imperative, single-line.
const COMPONENT_ACTIONS = {
  coverage: 'Add the missing muscle groups to your next session to close coverage gaps.',
  recovery: 'Prioritise sleep and an easier session to let recovery rebound.',
  stimulus: 'Push closer to productive weekly volume on your lagging muscle groups.',
  execution: 'Hit your planned sets and reps with cleaner technique next session.',
  progression: 'Add a small load or rep increase on your main lifts next session.'
};

// Fallback used when a driver has no recognised component.
const GENERIC_ACTION = 'Address this driver in your next session to lift the score.';

/**
 * Pick the action text for a negative driver.
 *
 * @param {object} driver - Driver record; only its string `component` is read.
 * @returns {string} The template for the driver's component, or the generic
 *   action when the driver is not an object or its component is unknown.
 */
function actionForDriver(driver) {
  if (!driver || typeof driver !== 'object') return GENERIC_ACTION;

  const { component } = driver;
  if (typeof component !== 'string') return GENERIC_ACTION;

  // Own-property check: a bare lookup would resolve names such as 'toString'
  // or 'constructor' through Object.prototype and return a function.
  if (!Object.prototype.hasOwnProperty.call(COMPONENT_ACTIONS, component)) return GENERIC_ACTION;

  return COMPONENT_ACTIONS[component] || GENERIC_ACTION;
}
|
|
49
|
+
|
|
50
|
+
/**
 * Choose the human-readable text for a driver.
 *
 * @param {object} driver - Driver record with optional `message` / `label`.
 * @returns {string|null} `message` if it is a non-blank string, otherwise
 *   `label` if that is a non-blank string, otherwise null. The chosen value
 *   is returned as stored (not trimmed).
 */
function driverDisplayMessage(driver) {
  if (!driver || typeof driver !== 'object') return null;

  // Fields in priority order; the first non-blank string wins.
  for (const field of ['message', 'label']) {
    const text = driver[field];
    if (typeof text === 'string' && text.trim()) return text;
  }
  return null;
}
|
|
56
|
+
|
|
57
|
+
/**
 * Build the recommended next actions for the first two negative drivers.
 *
 * @param {Array<object>} topNegativeDrivers - Negative drivers, in the order supplied.
 * @returns {Array<{component: string|null, driverMessage: string|null, action: string}>}
 *   One entry per driver (at most two); an empty array when the input is not
 *   an array or is empty.
 */
export function computeRecommendedNextActions(topNegativeDrivers) {
  if (!Array.isArray(topNegativeDrivers)) return [];

  const toRecommendation = (driver) => {
    const component = typeof driver?.component === 'string' ? driver.component : null;
    return {
      component,
      driverMessage: driverDisplayMessage(driver),
      action: actionForDriver(driver)
    };
  };

  return topNegativeDrivers.slice(0, 2).map(toRecommendation);
}
|
|
65
|
+
|
|
66
|
+
/**
 * Top-2 component movers (by absolute change) between the current and the
 * previous snapshot.
 *
 * Keys present on either side are considered; a value that is missing or not
 * numeric on one side is treated as 0, and keys that are non-numeric on both
 * sides are ignored. Deltas are rounded to two decimals, and components whose
 * rounded delta is zero are not reported.
 *
 * @param {object} currentComponents - Component values of the current snapshot.
 * @param {object} previousComponents - Component values of the previous snapshot.
 * @returns {Array<{component: string, previousValue: number, currentValue: number, delta: number}>}
 *   At most two movers, largest absolute delta first; an empty array when
 *   either argument is not an object.
 */
export function computeDeltaDrivers(currentComponents, previousComponents) {
  if (
    !currentComponents || typeof currentComponents !== 'object' ||
    !previousComponents || typeof previousComponents !== 'object'
  ) {
    return [];
  }

  const keys = new Set([
    ...Object.keys(currentComponents),
    ...Object.keys(previousComponents)
  ]);

  const moves = [];
  for (const key of keys) {
    const cur = Number(currentComponents[key]);
    const prev = Number(previousComponents[key]);
    if (!Number.isFinite(cur) && !Number.isFinite(prev)) continue;

    const currentValue = Number.isFinite(cur) ? cur : 0;
    const previousValue = Number.isFinite(prev) ? prev : 0;

    // Round before the zero check so floating-point noise (e.g. 0.1 + 0.2
    // versus 0.3) is not reported as a mover with a delta of 0.
    const delta = Number((currentValue - previousValue).toFixed(2));
    if (delta === 0) continue;

    moves.push({ component: key, previousValue, currentValue, delta });
  }

  moves.sort((a, b) => Math.abs(b.delta) - Math.abs(a.delta));
  return moves.slice(0, 2);
}
|
|
99
|
+
|
|
100
|
+
/**
 * Render a score delta as a short trend phrase.
 *
 * The delta is rounded to two decimals first, so floating-point noise from a
 * subtraction does not leak into the text.
 *
 * @param {number|null|undefined} delta - Change in score.
 * @returns {string|null} 'flat' when the rounded delta is zero, `up N` or
 *   `down N` otherwise; null when the delta is missing or not a finite number.
 */
function formatDelta(delta) {
  if (delta === null || delta === undefined) return null;

  const value = Number(delta);
  if (!Number.isFinite(value)) return null;

  const rounded = Number(value.toFixed(2));
  if (rounded === 0) return 'flat';
  return rounded > 0 ? `up ${rounded}` : `down ${Math.abs(rounded)}`;
}
|
|
105
|
+
|
|
106
|
+
/**
 * Compose a one-paragraph plain-text summary of an enriched score snapshot.
 *
 * Sentences, in order: the score (with its band when set), the comparison
 * with the previous score, the top negative driver, and the first
 * recommended action.
 *
 * @param {object} enriched - Snapshot with the derived fields (`scoreBand`,
 *   `previousScore`, `delta`, `comparisonSafe`, `topNegativeDrivers`,
 *   `recommendedNextActions`).
 * @returns {string|null} The summary, or null when `enriched` is missing or
 *   its `score` is not a number.
 */
export function computeSummaryText(enriched) {
  if (!enriched || typeof enriched.score !== 'number') return null;

  const parts = [];
  const bandLabel = enriched.scoreBand ? ` (${enriched.scoreBand})` : '';
  parts.push(`INCREMNT Score ${enriched.score}${bandLabel}.`);

  if (typeof enriched.previousScore === 'number') {
    const trend = formatDelta(enriched.delta);
    if (!enriched.comparisonSafe) {
      parts.push(`Previous ${enriched.previousScore} (formula version differs or is unavailable, comparison unsafe).`);
    } else if (trend === null) {
      // No usable delta: state the previous score rather than print "Trend null".
      parts.push(`Previous ${enriched.previousScore}.`);
    } else {
      parts.push(`${trend === 'flat' ? 'Essentially flat' : `Trend ${trend}`} vs previous ${enriched.previousScore}.`);
    }
  } else {
    parts.push('No prior snapshot for comparison.');
  }

  const topNeg = Array.isArray(enriched.topNegativeDrivers) ? enriched.topNegativeDrivers[0] : null;
  const topNegMessage = driverDisplayMessage(topNeg);
  if (topNegMessage) {
    // Avoid a doubled full stop when the message already ends in punctuation.
    const text = topNegMessage.trim();
    parts.push(/[.!?]$/.test(text) ? `Top drag: ${text}` : `Top drag: ${text}.`);
  }

  const firstAction = Array.isArray(enriched.recommendedNextActions) ? enriched.recommendedNextActions[0] : null;
  if (firstAction && typeof firstAction.action === 'string') {
    parts.push(`Next: ${firstAction.action}`);
  }

  return parts.join(' ');
}
|
|
136
|
+
|
|
137
|
+
/**
 * Enrich a list of snapshots ordered newest first (as returned by
 * listScoreSnapshots). Each snapshot is paired with the entry that follows it
 * in the list, which serves as its "previous" snapshot; the last entry has none.
 *
 * @param {Array<object>} snapshots - Snapshots, newest first.
 * @returns {Array<object>} One enriched result per input snapshot, in the
 *   same order; an empty array when the input is not an array or is empty.
 */
export function enrichScoreSnapshots(snapshots) {
  if (!Array.isArray(snapshots)) return [];

  return snapshots.map((snapshot, position, all) => {
    const older = all[position + 1] ?? null;
    return enrichScoreSnapshot(snapshot, older);
  });
}
|
|
147
|
+
|
|
148
|
+
/**
 * Return a shallow copy of `current` extended with derived context fields.
 *
 * Added fields:
 *   - previousScore: the previous snapshot's score, or null when unavailable
 *   - delta: current score minus previous score, rounded to two decimals, or null
 *   - comparisonSafe: true only when both snapshots carry the same string `formulaVersion`
 *   - deltaDrivers: top component movers (only when the comparison is safe)
 *   - scoreBand, recommendedNextActions, summaryText
 *
 * @param {object} current - Snapshot to enrich; returned unchanged when it is not an object.
 * @param {object|null} previous - The snapshot immediately before `current`, if any.
 * @returns {object} The enriched copy; existing fields are preserved.
 */
export function enrichScoreSnapshot(current, previous) {
  if (!current || typeof current !== 'object') return current;

  // Number.isFinite also rejects NaN, which `typeof x === 'number'` would let through.
  const previousScore = previous && Number.isFinite(previous.score) ? previous.score : null;

  // Rounded to two decimals (as computeDeltaDrivers does) so float subtraction
  // noise such as 2.3000000000000114 never reaches callers or the summary text.
  const delta = previousScore !== null && Number.isFinite(current.score)
    ? Number((current.score - previousScore).toFixed(2))
    : null;

  const comparisonSafe = !!(
    previous &&
    previousScore !== null &&
    typeof current.formulaVersion === 'string' &&
    typeof previous.formulaVersion === 'string' &&
    current.formulaVersion === previous.formulaVersion
  );

  // Component deltas are only meaningful when both scores use the same formula.
  const deltaDrivers = comparisonSafe
    ? computeDeltaDrivers(current.components, previous.components)
    : [];

  const scoreBand = computeScoreBand(current.score);
  const recommendedNextActions = computeRecommendedNextActions(current.topNegativeDrivers);

  const enriched = {
    ...current,
    previousScore,
    delta,
    comparisonSafe,
    deltaDrivers,
    scoreBand,
    recommendedNextActions
  };
  // The summary reads the derived fields, so it is computed last.
  enriched.summaryText = computeSummaryText(enriched);
  return enriched;
}
|
package/src/state.js
CHANGED
|
@@ -4,12 +4,19 @@ import path from 'node:path';
|
|
|
4
4
|
|
|
5
5
|
export const sessionSchemaVersion = 1;
|
|
6
6
|
|
|
7
|
+
/**
 * Resolve the user's home directory, preferring an explicit HOME override.
 *
 * A non-empty HOME environment variable always wins, so test fixtures and
 * explicit overrides behave the same on every platform. Node's os.homedir()
 * already honours $HOME on POSIX, but on Windows it consults USERPROFILE
 * instead, so checking HOME here keeps the override consistent there too.
 *
 * @returns {string} HOME when set and non-empty, otherwise os.homedir().
 */
export function userHomeDir() {
  const { HOME: homeOverride } = process.env;
  if (homeOverride) return homeOverride;
  return os.homedir();
}
|
|
13
|
+
|
|
7
14
|
/**
 * Default configuration root under the user's home directory:
 * `Library/Application Support` on macOS (`darwin`), `.config` elsewhere.
 *
 * @returns {string} The platform's fallback configuration root path.
 */
function fallbackConfigRoot() {
  const segments = process.platform === 'darwin'
    ? ['Library', 'Application Support']
    : ['.config'];
  return path.join(userHomeDir(), ...segments);
}
|
|
14
21
|
|
|
15
22
|
export function resolveConfigDir() {
|
|
@@ -1,3 +1,10 @@
|
|
|
1
|
+
/**
 * Copy a tally and attach its pass rate.
 *
 * @param {{total: number, passed: number}} entry - Tally to extend.
 * @returns {object} A copy of `entry` with `passRate` set to passed / total,
 *   or 0 when `total` is not positive.
 */
function withPassRate(entry) {
  const { total, passed } = entry;
  const passRate = total > 0 ? passed / total : 0;
  return { ...entry, passRate };
}
|
|
7
|
+
|
|
1
8
|
export function summarizeResults(results) {
|
|
2
9
|
const counts = {
|
|
3
10
|
total: results.length,
|
|
@@ -8,75 +15,85 @@ export function summarizeResults(results) {
|
|
|
8
15
|
return counts;
|
|
9
16
|
}
|
|
10
17
|
|
|
11
|
-
|
|
18
|
+
/**
 * Tally pass/fail counts per group and attach a pass rate to each group.
 *
 * @param {Array<{passed: boolean}>} results - Results to group.
 * @param {(result: object) => *} keyFn - Returns the group key for a result.
 *   Keys are coerced to strings, so a missing value groups under 'undefined'.
 * @returns {Object<string, {total: number, passed: number, failed: number, passRate: number}>}
 *   Groups keyed by name, inserted in `localeCompare` order.
 */
function summarizeByKey(results, keyFn) {
  const grouped = new Map();
  for (const result of results) {
    // Coerce up front: the sort below calls localeCompare, which throws on
    // non-string keys (e.g. a result with no `surface`).
    const key = String(keyFn(result));
    const entry = grouped.get(key) ?? { total: 0, passed: 0, failed: 0 };
    entry.total += 1;
    if (result.passed) entry.passed += 1;
    else entry.failed += 1;
    grouped.set(key, entry);
  }

  return Object.fromEntries(
    [...grouped.entries()]
      .sort(([left], [right]) => left.localeCompare(right))
      .map(([key, entry]) => [key, withPassRate(entry)])
  );
}
|
|
31
35
|
|
|
32
|
-
export function
|
|
36
|
+
/**
 * Group pass/fail tallies by each result's `surface`.
 *
 * @param {Array<object>} results - Results carrying a `surface` field.
 * @returns {object} Per-surface tallies as produced by summarizeByKey.
 */
export function summarizeBySurface(results) {
  const surfaceOf = (result) => result.surface;
  return summarizeByKey(results, surfaceOf);
}
|
|
39
|
+
|
|
40
|
+
/**
 * Reduce a `generatedAt` value to its first ten characters.
 *
 * @param {*} generatedAt - Timestamp string taken from result metadata.
 * @returns {string} The first ten characters of `generatedAt`, or 'legacy'
 *   when it is not a string or is blank.
 */
function normalizeGeneratedDate(generatedAt) {
  const usable = typeof generatedAt === 'string' && generatedAt.trim().length > 0;
  return usable ? generatedAt.slice(0, 10) : 'legacy';
}
|
|
46
|
+
|
|
47
|
+
/**
 * Normalise a metadata field used as a grouping key.
 *
 * @param {*} value - Raw metadata value.
 * @returns {string} `value` unchanged when it is a non-blank string,
 *   otherwise 'legacy'.
 */
function metadataValue(value) {
  if (typeof value !== 'string') return 'legacy';
  if (value.trim().length === 0) return 'legacy';
  return value;
}
|
|
50
|
+
|
|
51
|
+
/**
 * Tally results along each metadata dimension.
 *
 * Missing or blank metadata values are grouped under 'legacy'. A cohort is
 * the combination "surface / promptVersion / model / generatedDate".
 *
 * @param {Array<object>} results - Results with an optional `metadata` object.
 * @returns {{byPromptVersion: object, byModel: object, byGeneratedDate: object,
 *   byGitSha: object, byCohort: object}} Per-dimension tallies.
 */
function summarizeMetadata(results) {
  const fieldOf = (result, field) => metadataValue(result.metadata?.[field]);
  const dateOf = (result) => normalizeGeneratedDate(result.metadata?.generatedAt);

  const cohortOf = (result) => {
    const promptVersion = fieldOf(result, 'promptVersion');
    const model = fieldOf(result, 'model');
    const generatedDate = dateOf(result);
    return `${result.surface} / ${promptVersion} / ${model} / ${generatedDate}`;
  };

  return {
    byPromptVersion: summarizeByKey(results, (result) => fieldOf(result, 'promptVersion')),
    byModel: summarizeByKey(results, (result) => fieldOf(result, 'model')),
    byGeneratedDate: summarizeByKey(results, dateOf),
    byGitSha: summarizeByKey(results, (result) => fieldOf(result, 'gitSha')),
    byCohort: summarizeByKey(results, cohortOf)
  };
}
|
|
65
|
+
|
|
66
|
+
/**
 * Produce the three aggregate views shared by single and batch reports.
 *
 * @param {Array<object>} results - Evaluated results.
 * @returns {{summary: object, bySurface: object, metadata: object}} Overall
 *   counts, per-surface tallies, and per-metadata-dimension tallies.
 */
function summarizeStoredResults(results) {
  const summary = summarizeResults(results);
  const bySurface = summarizeBySurface(results);
  const metadata = summarizeMetadata(results);
  return { summary, bySurface, metadata };
}
|
|
73
|
+
|
|
74
|
+
/**
 * Build the report for one snapshot file.
 *
 * @param {string} snapshotPath - Path of the snapshot the results came from.
 * @param {Array<object>} results - Evaluated results; each must carry a `checks` array.
 * @returns {object} `snapshotPath`, the aggregate views (`summary`,
 *   `bySurface`, `metadata`), and a trimmed `results` list in which each entry
 *   keeps only its failed checks.
 */
export function buildStoredSummaryReport(snapshotPath, results) {
  const { summary, bySurface, metadata } = summarizeStoredResults(results);

  const toReportRow = (result) => {
    const failedChecks = result.checks.filter((check) => !check.passed);
    return {
      id: result.id,
      surface: result.surface,
      passed: result.passed,
      output: result.output,
      // Results without metadata are reported with an explicit null.
      metadata: result.metadata ?? null,
      failedChecks
    };
  };

  return {
    snapshotPath,
    summary,
    bySurface,
    metadata,
    results: results.map(toReportRow)
  };
}
|
|
46
91
|
|
|
47
92
|
/**
 * Aggregate several snapshot reports into one batch summary.
 *
 * The aggregates are recomputed from the reports' individual results rather
 * than by adding up the per-report counts.
 *
 * @param {Array<object>} reports - Reports as built by buildStoredSummaryReport;
 *   a report without `results` contributes nothing.
 * @returns {object} `snapshotCount` plus the `summary`, `bySurface` and
 *   `metadata` views over all results.
 */
export function summarizeBatchReports(reports) {
  const allResults = reports.flatMap(({ results }) => results ?? []);
  const aggregate = summarizeStoredResults(allResults);
  return {
    snapshotCount: reports.length,
    ...aggregate
  };
}
|
|
82
99
|
|
|
@@ -90,8 +107,8 @@ export function evaluateBatchThresholds(summary, {
|
|
|
90
107
|
} = {}) {
|
|
91
108
|
const failures = [];
|
|
92
109
|
|
|
93
|
-
if (typeof minPassRate === 'number' && summary.passRate < minPassRate) {
|
|
94
|
-
failures.push(`Overall pass rate ${percentage(summary.passRate)} is below required ${percentage(minPassRate)}.`);
|
|
110
|
+
if (typeof minPassRate === 'number' && summary.summary.passRate < minPassRate) {
|
|
111
|
+
failures.push(`Overall pass rate ${percentage(summary.summary.passRate)} is below required ${percentage(minPassRate)}.`);
|
|
95
112
|
}
|
|
96
113
|
|
|
97
114
|
for (const [surface, minimum] of Object.entries(minSurfacePassRates)) {
|
|
@@ -105,22 +122,38 @@ export function evaluateBatchThresholds(summary, {
|
|
|
105
122
|
return failures;
|
|
106
123
|
}
|
|
107
124
|
|
|
125
|
+
/**
 * Render tallies as markdown bullet lines.
 *
 * @param {Object<string, {passed: number, total: number, passRate: number}>} entries
 *   Tallies keyed by label.
 * @param {{limit?: number|null}} [options] - `limit` caps the number of lines;
 *   null or undefined means no cap.
 * @returns {string[]} One line per entry: "- label: passed/total passed (rate%)".
 */
function formatSummaryLines(entries, { limit = null } = {}) {
  const lines = [];
  for (const [label, { passed, total, passRate }] of Object.entries(entries)) {
    const percent = (passRate * 100).toFixed(1);
    lines.push(`- ${label}: ${passed}/${total} passed (${percent}%)`);
  }
  return limit == null ? lines : lines.slice(0, limit);
}
|
|
131
|
+
|
|
108
132
|
export function formatBatchSummaryMarkdown(summary, reports, failures = []) {
|
|
109
133
|
const lines = [
|
|
110
134
|
'# Stored Summary Eval Report',
|
|
111
135
|
'',
|
|
112
136
|
`- Snapshots: ${summary.snapshotCount}`,
|
|
113
|
-
`- Total summaries: ${summary.total}`,
|
|
114
|
-
`- Passed: ${summary.passed}`,
|
|
115
|
-
`- Failed: ${summary.failed}`,
|
|
116
|
-
`- Pass rate: ${(summary.passRate * 100).toFixed(1)}%`,
|
|
137
|
+
`- Total summaries: ${summary.summary.total}`,
|
|
138
|
+
`- Passed: ${summary.summary.passed}`,
|
|
139
|
+
`- Failed: ${summary.summary.failed}`,
|
|
140
|
+
`- Pass rate: ${(summary.summary.passRate * 100).toFixed(1)}%`,
|
|
117
141
|
'',
|
|
118
|
-
'## By Surface'
|
|
142
|
+
'## By Surface',
|
|
143
|
+
...formatSummaryLines(summary.bySurface)
|
|
119
144
|
];
|
|
120
145
|
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
146
|
+
lines.push('', '## By Prompt Version');
|
|
147
|
+
lines.push(...formatSummaryLines(summary.metadata.byPromptVersion));
|
|
148
|
+
|
|
149
|
+
lines.push('', '## By Model');
|
|
150
|
+
lines.push(...formatSummaryLines(summary.metadata.byModel));
|
|
151
|
+
|
|
152
|
+
lines.push('', '## By Generated Date');
|
|
153
|
+
lines.push(...formatSummaryLines(summary.metadata.byGeneratedDate));
|
|
154
|
+
|
|
155
|
+
lines.push('', '## Versioned Cohorts');
|
|
156
|
+
lines.push(...formatSummaryLines(summary.metadata.byCohort, { limit: 20 }));
|
|
124
157
|
|
|
125
158
|
lines.push('', '## Snapshots');
|
|
126
159
|
for (const report of reports) {
|