incremnt 0.7.2 → 0.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +57 -1
- package/package.json +2 -1
- package/src/ask-answer-verifier.js +857 -0
- package/src/ask-coach.js +2634 -0
- package/src/ask-replay.js +358 -0
- package/src/auth.js +169 -15
- package/src/contract.js +160 -3
- package/src/format.js +28 -2
- package/src/lib.js +205 -17
- package/src/mcp.js +88 -24
- package/src/openrouter.js +242 -19
- package/src/plan-changeset.js +132 -0
- package/src/program-draft.js +230 -0
- package/src/prompt-changelog.js +90 -0
- package/src/promptfoo-evals.js +10 -4
- package/src/promptfoo-langfuse-scores.js +55 -0
- package/src/queries.js +992 -987
- package/src/remote.js +465 -12
- package/src/score-context.js +14 -7
- package/src/score-prelude.js +113 -0
- package/src/service-url.js +9 -0
- package/src/summary-evals.js +677 -42
- package/src/sync-service.js +1259 -352
- package/src/transport.js +119 -3
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
// Single source of truth for the AI coach's <program_draft> block: extraction,
|
|
2
|
+
// JSON-shape validation, and normalization. Lives here (not in sync-service.js)
|
|
3
|
+
// so both the runtime (askCoach drops invalid drafts) and the eval harness
|
|
4
|
+
// (summary-evals.js catches malformed drafts in CI, before they ship and get
|
|
5
|
+
// silently dropped in prod) validate against the exact same rules. Moved verbatim
|
|
6
|
+
// from sync-service.js — behaviour-preserving.
|
|
7
|
+
|
|
8
|
+
/** Version number recorded in the provenance of each extracted program draft. */
export const PROGRAM_DRAFT_VERSION = 1;

/** Values accepted for a draft's `equipmentTier` field. */
export const VALID_PROGRAM_DRAFT_EQUIPMENT_TIERS = new Set([
  'fullGym',
  'benchDumbbells',
  'dumbbellsOnly',
  'bodyweightOnly'
]);

/** Values accepted for a draft's `volumeLevel` field. */
export const VALID_PROGRAM_DRAFT_VOLUME_LEVELS = new Set([
  'minimum',
  'moderate',
  'high'
]);

/**
 * Inclusive bounds applied while normalizing a program draft. A value outside
 * its range makes the enclosing set, exercise, day, or program invalid.
 */
export const PROGRAM_DRAFT_LIMITS = {
  // Maximum string lengths. `nameMaxLen` applies to both program and exercise names.
  nameMaxLen: 120,
  muscleGroupMaxLen: 60,
  dayLabelMaxLen: 60,
  dayTitleMaxLen: 120,
  daySubtitleMaxLen: 120,
  noteMaxLen: 1000,

  // Per-set numeric ranges.
  minWeight: 0,
  maxWeight: 600,
  minReps: 1,
  maxReps: 30,

  // Optional per-exercise RIR range.
  minRir: 0,
  maxRir: 5,

  // Collection sizes.
  minSetsPerExercise: 1,
  maxSetsPerExercise: 12,
  minExercisesPerDay: 1,
  maxExercisesPerDay: 24,
  minDaysPerWeek: 1,
  maxDaysPerWeek: 7,
  minDays: 1,
  maxDays: 14
};
|
|
34
|
+
|
|
35
|
+
/**
 * Collapses any run of two or more blank lines into a single blank line and
 * trims leading/trailing whitespace from the result.
 *
 * A line counts as blank when it is empty or holds only spaces/tabs, so the
 * gap left behind by removing an indented block is collapsed as well.
 *
 * @param {unknown} text - Value to clean up; null/undefined become ''.
 * @returns {string} The cleaned-up text.
 */
function collapseBlankLines(text) {
  return String(text ?? '')
    // One newline followed by 2+ lines that are empty or whitespace-only.
    .replace(/\n(?:[ \t]*\n){2,}/g, '\n\n')
    .trim();
}
|
|
40
|
+
|
|
41
|
+
/**
 * Upper-cases the first character of every space-separated word in `name`.
 * Empty segments produced by repeated spaces are dropped, so the words are
 * re-joined with exactly one space. The rest of each word is left as-is.
 *
 * @param {unknown} name - Exercise name; null/undefined become ''.
 * @returns {string} The re-cased name.
 */
function titleCaseExerciseName(name) {
  const words = [];
  for (const word of String(name ?? '').split(' ')) {
    if (word === '') continue;
    words.push(`${word.charAt(0).toUpperCase()}${word.slice(1)}`);
  }
  return words.join(' ');
}
|
|
48
|
+
|
|
49
|
+
/**
 * Builds the display form of an exercise name: trim it, canonicalize it, then
 * title-case the result.
 *
 * @param {unknown} name - Raw exercise name; null/undefined become ''.
 * @param {((name: string) => string)=} canonicalizeExerciseName - Optional
 *   canonicalizer applied to the trimmed name. When omitted the trimmed name
 *   is simply lower-cased.
 * @returns {string} The display name, or '' when the input is blank.
 */
function normalizedExerciseDisplayName(name, canonicalizeExerciseName) {
  const cleaned = String(name ?? '').trim();
  if (cleaned === '') return '';

  if (canonicalizeExerciseName) {
    return titleCaseExerciseName(canonicalizeExerciseName(cleaned));
  }
  return titleCaseExerciseName(cleaned.toLowerCase());
}
|
|
55
|
+
|
|
56
|
+
/**
 * Checks that `value` is a non-null, non-array object whose own enumerable
 * keys are all members of `allowedKeys`.
 *
 * @param {unknown} value - Candidate object.
 * @param {Set<string>} allowedKeys - Keys the object is permitted to carry.
 * @returns {boolean} False for primitives, null, arrays, or any unknown key.
 */
function hasOnlyAllowedKeys(value, allowedKeys) {
  const isKeyedObject = Boolean(value) && typeof value === 'object' && !Array.isArray(value);
  if (!isKeyedObject) return false;

  for (const key of Object.keys(value)) {
    if (!allowedKeys.has(key)) return false;
  }
  return true;
}
|
|
60
|
+
|
|
61
|
+
// Keys a draft set object may carry; anything else rejects the set.
const PROGRAM_DRAFT_SET_KEYS = new Set(['weight', 'reps', 'isWarmup']);

/**
 * Validates one set from a program draft and converts it to the stored shape.
 *
 * `weight` and `reps` are coerced with Number(); weight must be finite and
 * reps an integer, each inside its PROGRAM_DRAFT_LIMITS range.
 *
 * @param {unknown} set - Raw set object from the draft JSON.
 * @returns {{weight: number, reps: number, isComplete: boolean, isWarmup: boolean} | null}
 *   The normalized set (always `isComplete: false`), or null when invalid.
 */
function normalizeProgramDraftSet(set) {
  if (!hasOnlyAllowedKeys(set, PROGRAM_DRAFT_SET_KEYS)) return null;

  const { minWeight, maxWeight, minReps, maxReps } = PROGRAM_DRAFT_LIMITS;
  const weight = Number(set.weight);
  const reps = Number(set.reps);

  const weightOk = Number.isFinite(weight) && weight >= minWeight && weight <= maxWeight;
  const repsOk = Number.isInteger(reps) && reps >= minReps && reps <= maxReps;
  if (!weightOk || !repsOk) return null;

  // Only a literal `true` marks a warmup; any other value is treated as false.
  return { weight, reps, isComplete: false, isWarmup: set.isWarmup === true };
}
|
|
80
|
+
|
|
81
|
+
// Keys a draft exercise object may carry; anything else rejects the exercise.
const PROGRAM_DRAFT_EXERCISE_KEYS = new Set(['name', 'muscleGroup', 'sets', 'rir', 'note']);

/**
 * Validates one exercise from a program draft and converts it to the stored
 * shape.
 *
 * @param {unknown} exercise - Raw exercise object from the draft JSON.
 * @param {((name: string) => string)=} canonicalizeExerciseName - Optional
 *   canonicalizer forwarded to normalizedExerciseDisplayName.
 * @param {boolean} [strict=false] - Strict (eval) mode rejects the exercise as
 *   soon as any set is invalid, so partial malformation shows up as a
 *   regression. Lenient (runtime) mode drops invalid sets and keeps the rest.
 * @returns {object | null} The normalized exercise, or null when invalid.
 *   `note` and `rir` are only present on the result when they were supplied.
 */
function normalizeProgramDraftExercise(exercise, canonicalizeExerciseName, strict = false) {
  if (!hasOnlyAllowedKeys(exercise, PROGRAM_DRAFT_EXERCISE_KEYS)) return null;

  const limits = PROGRAM_DRAFT_LIMITS;
  const name = normalizedExerciseDisplayName(exercise.name, canonicalizeExerciseName);
  const muscleGroup = String(exercise.muscleGroup ?? '').trim();

  const normalizedSets = Array.isArray(exercise.sets)
    ? exercise.sets.map((rawSet) => normalizeProgramDraftSet(rawSet))
    : [];
  if (strict && !normalizedSets.every(Boolean)) return null;
  const sets = normalizedSets.filter((set) => set !== null);

  const nameOk = name !== '' && name.length <= limits.nameMaxLen;
  const muscleGroupOk = muscleGroup !== '' && muscleGroup.length <= limits.muscleGroupMaxLen;
  const setCountOk =
    sets.length >= limits.minSetsPerExercise && sets.length <= limits.maxSetsPerExercise;
  if (!nameOk || !muscleGroupOk || !setCountOk) return null;

  // rir is optional: null/undefined means "not provided", otherwise it must be
  // an integer inside the configured range.
  let rir = null;
  if (exercise.rir != null) {
    rir = Number(exercise.rir);
    if (!Number.isInteger(rir) || rir < limits.minRir || rir > limits.maxRir) return null;
  }

  const note = exercise.note == null ? null : String(exercise.note);
  if (note && note.length > limits.noteMaxLen) return null;

  const normalized = {
    name,
    muscleGroup,
    lastSuggestion: '',
    nextSuggestion: '',
    sets
  };
  if (note) normalized.note = note;
  if (rir != null) normalized.rir = rir;
  return normalized;
}
|
|
120
|
+
|
|
121
|
+
// Keys a draft day object may carry; anything else rejects the day.
const PROGRAM_DRAFT_DAY_KEYS = new Set(['dayLabel', 'title', 'subtitle', 'exercises']);

/**
 * Validates one training day from a program draft and converts it to the
 * stored shape.
 *
 * @param {unknown} day - Raw day object from the draft JSON.
 * @param {((name: string) => string)=} canonicalizeExerciseName - Optional
 *   canonicalizer forwarded to each exercise.
 * @param {boolean} [strict=false] - When true, one invalid exercise rejects
 *   the whole day; when false, invalid exercises are dropped.
 * @returns {{dayLabel: string, title: string, subtitle: string, exercises: object[]} | null}
 *   The normalized day, or null when invalid. `subtitle` may be ''.
 */
function normalizeProgramDraftDay(day, canonicalizeExerciseName, strict = false) {
  if (!hasOnlyAllowedKeys(day, PROGRAM_DRAFT_DAY_KEYS)) return null;

  const trimmedField = (value) => String(value ?? '').trim();
  const dayLabel = trimmedField(day.dayLabel);
  const title = trimmedField(day.title);
  const subtitle = trimmedField(day.subtitle);

  const normalizedExercises = Array.isArray(day.exercises)
    ? day.exercises.map((rawExercise) =>
      normalizeProgramDraftExercise(rawExercise, canonicalizeExerciseName, strict))
    : [];
  if (strict && !normalizedExercises.every(Boolean)) return null;
  const exercises = normalizedExercises.filter((exercise) => exercise !== null);

  const limits = PROGRAM_DRAFT_LIMITS;
  const labelOk = dayLabel !== '' && dayLabel.length <= limits.dayLabelMaxLen;
  const titleOk = title !== '' && title.length <= limits.dayTitleMaxLen;
  // The subtitle is optional, so only its upper bound is checked.
  const subtitleOk = subtitle.length <= limits.daySubtitleMaxLen;
  const exerciseCountOk =
    exercises.length >= limits.minExercisesPerDay &&
    exercises.length <= limits.maxExercisesPerDay;
  if (!(labelOk && titleOk && subtitleOk && exerciseCountOk)) return null;

  return { dayLabel, title, subtitle, exercises };
}
|
|
143
|
+
|
|
144
|
+
// Keys the top-level draft object may carry; anything else rejects the draft.
const PROGRAM_DRAFT_ROOT_KEYS = new Set([
  'name',
  'daysPerWeek',
  'equipmentTier',
  'volumeLevel',
  'currentDayIndex',
  'days'
]);

/**
 * Validates a parsed program draft and converts it to the stored program
 * shape.
 *
 * Defaults applied when a field is null/undefined: `currentDayIndex` 0,
 * `equipmentTier` 'fullGym', `volumeLevel` 'moderate'. The result always
 * carries `source: 'guided'`.
 *
 * @param {unknown} rawProgram - Parsed JSON payload of the draft.
 * @param {object} [options]
 * @param {((name: string) => string)=} options.canonicalizeExerciseName -
 *   Optional canonicalizer forwarded to every exercise.
 * @param {boolean} [options.strict=false] - When true, one invalid day rejects
 *   the whole draft; when false, invalid days are dropped.
 * @returns {object | null} The normalized program, or null when invalid.
 */
export function normalizeProgramDraft(rawProgram, { canonicalizeExerciseName, strict = false } = {}) {
  // hasOnlyAllowedKeys also rejects null, primitives, and arrays.
  if (!hasOnlyAllowedKeys(rawProgram, PROGRAM_DRAFT_ROOT_KEYS)) return null;

  const limits = PROGRAM_DRAFT_LIMITS;
  const name = String(rawProgram.name ?? '').trim();

  const normalizedDays = Array.isArray(rawProgram.days)
    ? rawProgram.days.map((rawDay) =>
      normalizeProgramDraftDay(rawDay, canonicalizeExerciseName, strict))
    : [];
  if (strict && !normalizedDays.every(Boolean)) return null;
  const days = normalizedDays.filter((day) => day !== null);

  const daysPerWeek = Number(rawProgram.daysPerWeek);
  const currentDayIndex =
    rawProgram.currentDayIndex == null ? 0 : Number(rawProgram.currentDayIndex);
  const equipmentTier = String(rawProgram.equipmentTier ?? 'fullGym').trim();
  const volumeLevel = String(rawProgram.volumeLevel ?? 'moderate').trim();

  const isIntegerWithin = (value, low, high) =>
    Number.isInteger(value) && value >= low && value <= high;

  const isValid =
    name !== '' &&
    name.length <= limits.nameMaxLen &&
    days.length >= limits.minDays &&
    days.length <= limits.maxDays &&
    isIntegerWithin(daysPerWeek, limits.minDaysPerWeek, limits.maxDaysPerWeek) &&
    // The index must point at one of the days that survived normalization.
    isIntegerWithin(currentDayIndex, 0, days.length - 1) &&
    VALID_PROGRAM_DRAFT_EQUIPMENT_TIERS.has(equipmentTier) &&
    VALID_PROGRAM_DRAFT_VOLUME_LEVELS.has(volumeLevel);
  if (!isValid) return null;

  return {
    name,
    daysPerWeek,
    equipmentTier,
    volumeLevel,
    source: 'guided',
    days,
    currentDayIndex
  };
}
|
|
186
|
+
|
|
187
|
+
/**
 * Removes all <program_draft> markup from `text`: every closed block, plus an
 * opening tag that is never closed together with everything after it.
 *
 * @param {string} text - Raw model output.
 * @returns {string} The text with draft markup removed (not yet trimmed).
 */
function stripProgramDraftBlocks(text) {
  return text
    .replace(/<program_draft>[\s\S]*?<\/program_draft>/gi, '')
    // Whatever opening tag is left has no closing tag, so the rest of the text
    // is an unterminated payload rather than prose.
    .replace(/<program_draft>[\s\S]*$/i, '');
}

/**
 * Splits raw model output into the user-visible answer text and the validated
 * program draft carried in its first <program_draft>…</program_draft> block.
 *
 * The draft is dropped (`programDraft: null`) when there is no closed block,
 * when the block's JSON does not parse, or when it fails normalization; a
 * warning is logged in the last two cases and for an unclosed block. Draft
 * markup never stays in `answerText`: additional blocks and an unclosed
 * opening tag (with everything after it) are stripped too, so raw JSON is not
 * shown to the user.
 *
 * @param {unknown} rawText - Raw model output; null/undefined become ''.
 * @param {object} [options]
 * @param {((name: string) => string)=} options.canonicalizeExerciseName -
 *   Optional canonicalizer forwarded to normalizeProgramDraft.
 * @param {boolean} [options.strict=false] - Forwarded to normalizeProgramDraft.
 * @returns {{answerText: string, programDraft: {program: object, provenance: object} | null}}
 */
export function extractAskProgramDraft(rawText, { canonicalizeExerciseName, strict = false } = {}) {
  const text = String(rawText ?? '');
  const match = text.match(/<program_draft>\s*([\s\S]*?)\s*<\/program_draft>/i);
  if (!match) {
    const withoutDraftMarkup = stripProgramDraftBlocks(text);
    if (withoutDraftMarkup === text) {
      return { answerText: text.trim(), programDraft: null };
    }
    console.warn('askCoach: <program_draft> block was never closed — dropping draft');
    return { answerText: collapseBlankLines(withoutDraftMarkup), programDraft: null };
  }

  const answerText = collapseBlankLines(stripProgramDraftBlocks(text));
  let parsed;
  try {
    parsed = JSON.parse(match[1]);
  } catch (err) {
    console.warn('askCoach: <program_draft> JSON parse failed — dropping draft:', err.message);
    return { answerText, programDraft: null };
  }

  const program = normalizeProgramDraft(parsed, { canonicalizeExerciseName, strict });
  if (!program) {
    console.warn('askCoach: <program_draft> payload failed validation — dropping draft');
    return { answerText, programDraft: null };
  }

  return {
    answerText,
    programDraft: {
      program,
      provenance: {
        source: 'ai-coach',
        type: 'program',
        version: PROGRAM_DRAFT_VERSION,
        createdAt: new Date().toISOString(),
        tokenHint: null
      }
    }
  };
}
|
|
223
|
+
|
|
224
|
+
// Matches an opening or closing program_draft tag, tolerating whitespace after
// `<` and `/` and any trailing characters before `>`.
const PROGRAM_DRAFT_TAG_PATTERN = /<\s*\/?\s*program_draft\b[^>]*>/i;

/**
 * Reports whether `rawText` mentions a <program_draft> tag in any form —
 * opening or closing, well-formed block or not. This is what lets the eval
 * tell "the model emitted no draft" apart from "the model emitted a malformed
 * draft".
 *
 * @param {unknown} rawText - Raw model output; null/undefined become ''.
 * @returns {boolean} True when a program_draft tag is present.
 */
export function hasProgramDraftBlock(rawText) {
  const haystack = String(rawText ?? '');
  return haystack.search(PROGRAM_DRAFT_TAG_PATTERN) !== -1;
}
|
package/src/prompt-changelog.js
CHANGED
|
@@ -22,6 +22,69 @@ export const PROMPT_CHANGELOG_TYPES = Object.freeze([
|
|
|
22
22
|
]);
|
|
23
23
|
|
|
24
24
|
export const PROMPT_CHANGELOG = Object.freeze([
|
|
25
|
+
{
|
|
26
|
+
version: 'ask_agentic_v2026_06_02_1',
|
|
27
|
+
surface: 'askAgentic',
|
|
28
|
+
date: '2026-06-02',
|
|
29
|
+
type: 'feature',
|
|
30
|
+
summary:
|
|
31
|
+
'Broad progress/bodyweight/on-track answers use coach-operator shape: verdict, signal, evidence, caveat, and the next decision. Progress reviews may ask one goal-defining question when body-composition tradeoffs depend on missing goal context, and now synthesize bodyweight/readiness evidence when routed context provides it.',
|
|
32
|
+
eval: 'ask_progress_review_golden'
|
|
33
|
+
},
|
|
34
|
+
{
|
|
35
|
+
version: 'ask_v2026_06_02_1',
|
|
36
|
+
surface: 'ask',
|
|
37
|
+
date: '2026-06-02',
|
|
38
|
+
type: 'feature',
|
|
39
|
+
summary:
|
|
40
|
+
'Broad progress/bodyweight/on-track answers use coach-operator shape: verdict, signal, evidence, caveat, and the next decision. Progress reviews may ask one goal-defining question when body-composition tradeoffs depend on missing goal context, and now synthesize bodyweight/readiness evidence when routed context provides it.',
|
|
41
|
+
eval: 'ask_progress_review_golden'
|
|
42
|
+
},
|
|
43
|
+
{
|
|
44
|
+
version: 'ask_agentic_v2026_06_01_1',
|
|
45
|
+
surface: 'askAgentic',
|
|
46
|
+
date: '2026-06-01',
|
|
47
|
+
type: 'safety',
|
|
48
|
+
summary:
|
|
49
|
+
'Hoist a high-salience "Hard limits" block to the top of ASK_RULES restating the most-violated nevers (no 1RM/PR/records unless asked, except the routed broad-review PR count; no fatigue/recovery/readiness language without an explicit signal; no warmup/backoff loads as working sets; no raw Increment Score sub-scores). Also: speak in the first person (never "the coach"/"the coach observation"/"the system") and never volunteer the overall score number unless asked — paired with a question-gated score prelude that withholds the numeric headline on non-score questions. Reinforcement of buried rules plus the self-reference and volunteered-score fixes the live history showed.',
|
|
50
|
+
eval: 'ask_why_failed_no_vitals'
|
|
51
|
+
},
|
|
52
|
+
{
|
|
53
|
+
version: 'ask_v2026_06_01_1',
|
|
54
|
+
surface: 'ask',
|
|
55
|
+
date: '2026-06-01',
|
|
56
|
+
type: 'safety',
|
|
57
|
+
summary:
|
|
58
|
+
'Hoist a high-salience "Hard limits" block to the top of ASK_RULES restating the most-violated nevers (no 1RM/PR/records unless asked, except the routed broad-review PR count; no fatigue/recovery/readiness language without an explicit signal; no warmup/backoff loads as working sets; no raw Increment Score sub-scores; speak in the first person, never "the coach"/"the system"; never volunteer the overall score number unless asked). Reinforcement of buried rules plus self-reference and volunteered-score fixes.',
|
|
59
|
+
eval: 'ask_why_failed_no_vitals'
|
|
60
|
+
},
|
|
61
|
+
{
|
|
62
|
+
version: 'ask_agentic_v2026_05_30_3',
|
|
63
|
+
surface: 'askAgentic',
|
|
64
|
+
date: '2026-05-30',
|
|
65
|
+
type: 'fix',
|
|
66
|
+
summary:
|
|
67
|
+
'Broad progress reviews must include the observed training frequency/session count alongside volume, body-weight, and recent PR-count evidence, so live Ask replays do not skip the basic activity denominator.',
|
|
68
|
+
eval: 'ask_progress_review_golden'
|
|
69
|
+
},
|
|
70
|
+
{
|
|
71
|
+
version: 'ask_agentic_v2026_05_30_2',
|
|
72
|
+
surface: 'askAgentic',
|
|
73
|
+
date: '2026-05-30',
|
|
74
|
+
type: 'fix',
|
|
75
|
+
summary:
|
|
76
|
+
'For broad progress-review questions, carry the base Ask rule that recent all-time estimated 1RM PR counts must be mentioned when the routed context provides them; preserves the bounded read-only tool loop from ask_agentic_v2026_05_30_1.',
|
|
77
|
+
eval: 'ask_progress_review_golden'
|
|
78
|
+
},
|
|
79
|
+
{
|
|
80
|
+
version: 'ask_agentic_v2026_05_30_1',
|
|
81
|
+
surface: 'askAgentic',
|
|
82
|
+
date: '2026-05-30',
|
|
83
|
+
type: 'feature',
|
|
84
|
+
summary:
|
|
85
|
+
'Agentic Ask generation: the model receives the routed context as a warm start plus a read-only tool menu (records, body weight, weekly volume, readiness, etc.) and fetches missing evidence over a bounded, deduped loop instead of answering one-shot from a fixed route. Server-side privacy exclusions are forced into every tool call; all fetched tools are folded into provenance. Inherits the ask_v2026_05_30_1 rules via an appended tool-use addendum.',
|
|
86
|
+
eval: 'ask_tool_provenance'
|
|
87
|
+
},
|
|
25
88
|
{
|
|
26
89
|
version: 'workout_v2026_05_23_1',
|
|
27
90
|
surface: 'workout',
|
|
@@ -31,6 +94,33 @@ export const PROMPT_CHANGELOG = Object.freeze([
|
|
|
31
94
|
'Keep skipped-exercise mentions generic unless plan comparison supports naming the lift; anchor the note to completed-session work.',
|
|
32
95
|
eval: 'exercise_mentions'
|
|
33
96
|
},
|
|
97
|
+
{
|
|
98
|
+
version: 'ask_v2026_05_30_3',
|
|
99
|
+
surface: 'ask',
|
|
100
|
+
date: '2026-05-30',
|
|
101
|
+
type: 'fix',
|
|
102
|
+
summary:
|
|
103
|
+
'Broad progress reviews must include the observed training frequency/session count alongside volume, body-weight, and recent PR-count evidence, so the answer keeps the activity denominator visible.',
|
|
104
|
+
eval: 'ask_progress_review_golden'
|
|
105
|
+
},
|
|
106
|
+
{
|
|
107
|
+
version: 'ask_v2026_05_30_2',
|
|
108
|
+
surface: 'ask',
|
|
109
|
+
date: '2026-05-30',
|
|
110
|
+
type: 'fix',
|
|
111
|
+
summary:
|
|
112
|
+
'Broad progress reviews must explicitly mention the recent all-time estimated 1RM PR count when the context includes it, preventing recent PR density from being softened into vague "several lifts moved" language.',
|
|
113
|
+
eval: 'ask_progress_review_golden'
|
|
114
|
+
},
|
|
115
|
+
{
|
|
116
|
+
version: 'ask_v2026_05_30_1',
|
|
117
|
+
surface: 'ask',
|
|
118
|
+
date: '2026-05-30',
|
|
119
|
+
type: 'safety',
|
|
120
|
+
summary:
|
|
121
|
+
'Enforce score-voice: name the Increment Score and its overall value/direction, but never recite raw component sub-scores, decimals, or daily score lists; translate the score into training reality. Answer training questions first (no score-dump lead), do not re-recite the breakdown on follow-ups, and answer retrospectives at the multi-week altitude asked. Paired with a voice-safe formatIncrementScorePrelude.',
|
|
122
|
+
eval: 'ask_score_voice'
|
|
123
|
+
},
|
|
34
124
|
{
|
|
35
125
|
version: 'ask_v2026_05_23_1',
|
|
36
126
|
surface: 'ask',
|
package/src/promptfoo-evals.js
CHANGED
|
@@ -4,7 +4,8 @@ import {
|
|
|
4
4
|
loadSummaryEvalSnapshot,
|
|
5
5
|
summaryEvalFixturesRoot,
|
|
6
6
|
buildSummaryEvalContext,
|
|
7
|
-
generateSummaryEvalOutputWithMetadata
|
|
7
|
+
generateSummaryEvalOutputWithMetadata,
|
|
8
|
+
summaryEvalsLiveGenerationEnabled
|
|
8
9
|
} from './summary-evals.js';
|
|
9
10
|
import { publishPromptfooLangfuseScore } from './promptfoo-langfuse-scores.js';
|
|
10
11
|
|
|
@@ -130,15 +131,20 @@ export async function assertPromptfooDomain(output, context = {}) {
|
|
|
130
131
|
|
|
131
132
|
export async function callPromptfooProvider(prompt, context = {}) {
|
|
132
133
|
const { testCase, snapshot } = await resolvePromptfooEval(context.vars ?? {});
|
|
133
|
-
const liveGenerationEnabled =
|
|
134
|
+
const liveGenerationEnabled = summaryEvalsLiveGenerationEnabled();
|
|
134
135
|
|
|
135
136
|
if (!liveGenerationEnabled) {
|
|
137
|
+
const evalContext = buildSummaryEvalContext(snapshot, testCase);
|
|
138
|
+
const generation = await generateSummaryEvalOutputWithMetadata(testCase, evalContext, snapshot);
|
|
139
|
+
promptfooProviderMetadata.set(promptfooMetadataKey(context.vars ?? {}), generation.metadata);
|
|
140
|
+
|
|
136
141
|
return {
|
|
137
|
-
output:
|
|
142
|
+
output: generation.output,
|
|
138
143
|
metadata: {
|
|
139
144
|
caseId: testCase.id,
|
|
140
145
|
surface: testCase.surface,
|
|
141
|
-
mode: 'stored'
|
|
146
|
+
mode: 'stored',
|
|
147
|
+
...generation.metadata
|
|
142
148
|
}
|
|
143
149
|
};
|
|
144
150
|
}
|
|
@@ -100,6 +100,23 @@ function failedChecks(result) {
|
|
|
100
100
|
return (result.checks ?? []).filter((check) => !check.passed);
|
|
101
101
|
}
|
|
102
102
|
|
|
103
|
+
/**
 * Collects the distinct, non-empty, trimmed strings from `values`, in first-
 * seen order, keeping at most `max` of them.
 *
 * Non-string entries are ignored. The cap is enforced before an entry is
 * added, so `max: 0` (or a negative cap) yields no entries at all.
 *
 * @param {unknown} values - Candidate list; anything but an array counts as empty.
 * @param {object} [options]
 * @param {number} [options.max=12] - Maximum number of strings to keep.
 * @returns {string[] | undefined} The collected strings, or undefined when
 *   there are none (so the field is omitted rather than sent as []).
 */
function nonEmptyStrings(values, { max = 12 } = {}) {
  if (!Array.isArray(values)) return undefined;

  const unique = new Set();
  for (const value of values) {
    if (unique.size >= max) break;
    if (typeof value !== 'string') continue;
    const text = value.trim();
    // Set insertion order is first-seen order; re-adding a duplicate is a no-op.
    if (text) unique.add(text);
  }
  return unique.size > 0 ? [...unique] : undefined;
}
|
|
115
|
+
|
|
116
|
+
/**
 * Returns the number of entries in `values` when it is an array.
 *
 * @param {unknown} values - Candidate array.
 * @returns {number | undefined} The array length, or undefined for anything
 *   that is not an array.
 */
function countArray(values) {
  if (!Array.isArray(values)) return undefined;
  return values.length;
}
|
|
119
|
+
|
|
103
120
|
function langfuseScoreTarget({ testCase = {}, context = {}, sessionId }) {
|
|
104
121
|
const providerMetadata = context.providerResponse?.metadata ?? {};
|
|
105
122
|
const caseMetadata = testCase.metadata ?? {};
|
|
@@ -138,6 +155,21 @@ export function buildPromptfooLangfuseScorePayload({
|
|
|
138
155
|
const runId = scoreRunId(context, env, now);
|
|
139
156
|
const mode = promptfooEvalMode(env);
|
|
140
157
|
const provider = providerLabel(context);
|
|
158
|
+
const providerMetadata = context.providerResponse?.metadata ?? {};
|
|
159
|
+
const routingMetadata = providerMetadata.routingMetadata && typeof providerMetadata.routingMetadata === 'object'
|
|
160
|
+
? providerMetadata.routingMetadata
|
|
161
|
+
: {};
|
|
162
|
+
const evidencePlan = routingMetadata.evidencePlan && typeof routingMetadata.evidencePlan === 'object'
|
|
163
|
+
? routingMetadata.evidencePlan
|
|
164
|
+
: providerMetadata.evidencePlan && typeof providerMetadata.evidencePlan === 'object'
|
|
165
|
+
? providerMetadata.evidencePlan
|
|
166
|
+
: {};
|
|
167
|
+
const contextBundle = routingMetadata.contextBundle && typeof routingMetadata.contextBundle === 'object'
|
|
168
|
+
? routingMetadata.contextBundle
|
|
169
|
+
: {};
|
|
170
|
+
const structured = providerMetadata.structured && typeof providerMetadata.structured === 'object'
|
|
171
|
+
? providerMetadata.structured
|
|
172
|
+
: {};
|
|
141
173
|
const failed = failedChecks(result);
|
|
142
174
|
const promptVersion = vars.promptVersion
|
|
143
175
|
?? testCase.metadata?.promptVersion
|
|
@@ -179,6 +211,29 @@ export function buildPromptfooLangfuseScorePayload({
|
|
|
179
211
|
assertionCount: result.checks?.length ?? 0,
|
|
180
212
|
failedAssertionKeys: failed.map((check) => check.key),
|
|
181
213
|
failedAssertionReasons: failed.map((check) => check.reason).slice(0, 10),
|
|
214
|
+
route: firstString(routingMetadata.route, evidencePlan.route),
|
|
215
|
+
effectiveRoute: firstString(routingMetadata.effectiveRoute, evidencePlan.effectiveRoute),
|
|
216
|
+
requestedAction: firstString(routingMetadata.intent?.requestedAction),
|
|
217
|
+
intentConfidence: typeof routingMetadata.intent?.confidence === 'number' ? routingMetadata.intent.confidence : undefined,
|
|
218
|
+
contextCharCount: typeof routingMetadata.contextCharCount === 'number' ? routingMetadata.contextCharCount : undefined,
|
|
219
|
+
historyTurnCount: typeof routingMetadata.historyTurnCount === 'number' ? routingMetadata.historyTurnCount : undefined,
|
|
220
|
+
requiredTools: nonEmptyStrings(evidencePlan.requiredTools),
|
|
221
|
+
executedTools: nonEmptyStrings(evidencePlan.executedTools ?? routingMetadata.toolsUsed ?? contextBundle.executedTools),
|
|
222
|
+
evidenceGaps: nonEmptyStrings(evidencePlan.evidenceGaps),
|
|
223
|
+
missingDataFlags: nonEmptyStrings([
|
|
224
|
+
...(routingMetadata.missingDataFlags ?? []),
|
|
225
|
+
...(contextBundle.missingDataFlags ?? []),
|
|
226
|
+
...(evidencePlan.evidenceGaps ?? [])
|
|
227
|
+
]),
|
|
228
|
+
structuredResponsePresent: Object.keys(structured).length > 0 ? true : undefined,
|
|
229
|
+
structuredConfidence: firstString(structured.confidence),
|
|
230
|
+
followUpSuggestionCount: countArray(structured.followUpSuggestions),
|
|
231
|
+
limitationCount: countArray(structured.limitations),
|
|
232
|
+
evidenceUsedLabels: nonEmptyStrings((structured.evidenceUsed ?? []).map((item) => item?.label)),
|
|
233
|
+
evidenceUsedTools: nonEmptyStrings((structured.evidenceUsed ?? []).map((item) => item?.toolName)),
|
|
234
|
+
recommendedActionIds: nonEmptyStrings((structured.recommendedActions ?? []).map((item) => item?.id)),
|
|
235
|
+
recommendedActionLabels: nonEmptyStrings((structured.recommendedActions ?? []).map((item) => item?.label)),
|
|
236
|
+
hasProgramDraft: structured.programDraft != null ? true : undefined,
|
|
182
237
|
generatedAt: now.toISOString()
|
|
183
238
|
}),
|
|
184
239
|
environment: env.LANGFUSE_ENVIRONMENT ?? env.NODE_ENV ?? 'development'
|