incremnt 0.3.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -2
- package/package.json +25 -4
- package/src/anonymize.js +12 -0
- package/src/coach-bakeoff.js +300 -0
- package/src/coach-facts.js +100 -0
- package/src/coach-prompt-variants.js +106 -0
- package/src/contract.js +56 -1
- package/src/exercise-aliases.js +163 -0
- package/src/format.js +64 -1
- package/src/increment-score-replay-data.js +486 -0
- package/src/increment-score-replay.js +822 -0
- package/src/lib.js +14 -2
- package/src/local.js +3 -3
- package/src/openrouter.js +1033 -179
- package/src/program-phase-resolver.js +206 -0
- package/src/prompt-security.js +13 -0
- package/src/promptfoo-domain-assert.cjs +4 -0
- package/src/promptfoo-evals.js +166 -0
- package/src/promptfoo-langfuse-scores.js +354 -0
- package/src/promptfoo-provider.cjs +14 -0
- package/src/promptfoo-tests.cjs +4 -0
- package/src/queries.js +2307 -164
- package/src/remote.js +144 -1
- package/src/state.js +9 -2
- package/src/stored-summary-eval-report.js +171 -0
- package/src/summary-evals.js +1445 -0
- package/src/sync-service.js +1557 -158
- package/src/workout-prompt-variants.js +52 -0
|
@@ -0,0 +1,354 @@
|
|
|
1
|
+
import { createHash, randomUUID } from 'node:crypto';
|
|
2
|
+
import { AI_PROMPT_VERSIONS } from './openrouter.js';
|
|
3
|
+
|
|
4
|
+
// Base URL used when neither LANGFUSE_BASE_URL nor LANGFUSE_HOST is configured.
const DEFAULT_LANGFUSE_HOST = 'https://cloud.langfuse.com';
// Name under which each promptfoo domain-check result is recorded as a score.
const SCORE_NAME = 'promptfoo_domain_pass';
// Run-trace ids that this process has already published, so a run trace is not re-sent once it succeeded.
const promptfooRunTracesCreated = new Set();
|
|
7
|
+
|
|
8
|
+
/**
 * Builds a shallow copy of `obj` that omits every own enumerable property
 * whose value is `null` or `undefined`. Other falsy values (0, '', false) are kept.
 *
 * @param {object} obj - Source object; it is not modified.
 * @returns {object} New object containing only the non-nullish entries.
 */
function compactObject(obj) {
  const keptEntries = Object.entries(obj).filter((entry) => entry[1] != null);
  return Object.fromEntries(keptEntries);
}
|
|
13
|
+
|
|
14
|
+
/**
 * Picks the first argument that is a string with non-whitespace content.
 *
 * @param {...*} values - Candidates, checked in order; non-strings are skipped.
 * @returns {string|null} The trimmed first match, or `null` when none qualifies.
 */
function firstString(...values) {
  const match = values.find((candidate) => typeof candidate === 'string' && candidate.trim() !== '');
  return match === undefined ? null : match.trim();
}
|
|
20
|
+
|
|
21
|
+
/**
 * Reads an environment variable as a boolean switch.
 *
 * @param {string} name - Variable name to look up.
 * @param {object} [env=process.env] - Environment map to read from.
 * @returns {boolean} `true` only when the value, lower-cased, is exactly `1`, `true` or `yes`.
 */
function envFlag(name, env = process.env) {
  const normalized = String(env[name] ?? '').toLowerCase();
  return normalized === '1' || normalized === 'true' || normalized === 'yes';
}
|
|
24
|
+
|
|
25
|
+
/**
 * Looks up the current commit id from the environment.
 *
 * Checks RENDER_GIT_COMMIT, GIT_SHA, COMMIT_SHA and VERCEL_GIT_COMMIT_SHA in
 * that order and returns the first one that is not null/undefined.
 *
 * @param {object} [env=process.env] - Environment map to read from.
 * @returns {*} The first defined value, or `null` when none is set.
 */
function currentGitSha(env = process.env) {
  const candidates = [
    env.RENDER_GIT_COMMIT,
    env.GIT_SHA,
    env.COMMIT_SHA,
    env.VERCEL_GIT_COMMIT_SHA
  ];
  const sha = candidates.find((candidate) => candidate != null);
  return sha ?? null;
}
|
|
32
|
+
|
|
33
|
+
/**
 * Decides whether promptfoo results should be published to Langfuse.
 *
 * Publishing is off when PROMPTFOO_LANGFUSE_SCORES holds an explicit "off"
 * value (`0`, `false`, `no` or `off`, case-insensitive, surrounding whitespace
 * ignored). Otherwise it is on only when both LANGFUSE_PUBLIC_KEY and
 * LANGFUSE_SECRET_KEY are set.
 *
 * @param {object} [env=process.env] - Environment map to read from.
 * @returns {boolean} `true` when scores should be sent.
 */
export function shouldPublishPromptfooLangfuseScores(env = process.env) {
  // Accept the usual spellings of "off", not just "0", so that
  // PROMPTFOO_LANGFUSE_SCORES=false really disables publishing.
  const toggle = String(env.PROMPTFOO_LANGFUSE_SCORES ?? '').trim().toLowerCase();
  if (['0', 'false', 'no', 'off'].includes(toggle)) return false;
  return Boolean(env.LANGFUSE_PUBLIC_KEY && env.LANGFUSE_SECRET_KEY);
}
|
|
37
|
+
|
|
38
|
+
/**
 * Resolves the Langfuse base URL.
 *
 * Prefers LANGFUSE_BASE_URL, then LANGFUSE_HOST, then the built-in default,
 * and strips any trailing slashes from the result.
 *
 * @param {object} [env=process.env] - Environment map to read from.
 * @returns {string} Base URL without a trailing slash.
 */
export function langfuseHost(env = process.env) {
  const configured = firstString(env.LANGFUSE_BASE_URL, env.LANGFUSE_HOST);
  const host = configured ?? DEFAULT_LANGFUSE_HOST;
  return host.replace(/\/+$/, '');
}
|
|
41
|
+
|
|
42
|
+
/**
 * Maps an eval surface name to the matching entry of `AI_PROMPT_VERSIONS`.
 *
 * @param {*} surface - One of `ask`, `workout`, `cycle`, `vitals`,
 *   `checkpoint` or `weekly-checkin`.
 * @returns {*} The corresponding `AI_PROMPT_VERSIONS` value, or the string
 *   `'unknown'` for any other surface.
 */
export function promptVersionForSurface(surface) {
  // Surface name -> property name on AI_PROMPT_VERSIONS.
  const versionKeyBySurface = new Map([
    ['ask', 'ask'],
    ['workout', 'workout'],
    ['cycle', 'cycle'],
    ['vitals', 'vitals'],
    ['checkpoint', 'checkpoint'],
    ['weekly-checkin', 'weeklyCheckin']
  ]);

  if (!versionKeyBySurface.has(surface)) return 'unknown';
  return AI_PROMPT_VERSIONS[versionKeyBySurface.get(surface)];
}
|
|
60
|
+
|
|
61
|
+
/**
 * Reports whether the eval run is live or replaying stored output.
 *
 * @param {object} [env=process.env] - Environment map to read from.
 * @returns {'live'|'stored'} `'live'` when PROMPTFOO_LIVE or
 *   SUMMARY_EVALS_LIVE is switched on, otherwise `'stored'`.
 */
export function promptfooEvalMode(env = process.env) {
  const liveFlags = ['PROMPTFOO_LIVE', 'SUMMARY_EVALS_LIVE'];
  const isLive = liveFlags.some((flag) => envFlag(flag, env));
  return isLive ? 'live' : 'stored';
}
|
|
64
|
+
|
|
65
|
+
/**
 * Derives a deterministic score id from the identifying fields of one eval
 * result, so the same case in the same run always maps to the same id.
 *
 * @param {object} fields
 * @param {*} fields.runId
 * @param {*} fields.caseSet
 * @param {*} fields.caseId
 * @param {*} fields.provider
 * @param {*} fields.mode
 * @returns {string} `pf-` followed by the first 24 hex characters of a SHA-256 digest.
 */
function stableScoreId({ runId, caseSet, caseId, provider, mode }) {
  // Array#join renders null/undefined segments as empty strings.
  const parts = [SCORE_NAME, runId, caseSet, caseId, provider, mode];
  const digest = createHash('sha256').update(parts.join(':')).digest('hex');
  return `pf-${digest.substring(0, 24)}`;
}
|
|
69
|
+
|
|
70
|
+
/**
 * Derives a deterministic trace id for a promptfoo run.
 *
 * @param {object} fields
 * @param {*} fields.sessionId
 * @param {*} fields.runId
 * @param {*} fields.mode
 * @returns {string} The first 32 hex characters of a SHA-256 digest over the fields.
 */
function stablePromptfooRunTraceId({ sessionId, runId, mode }) {
  // Array#join renders null/undefined segments as empty strings.
  const parts = ['promptfoo_run_trace', sessionId, runId, mode];
  const digest = createHash('sha256').update(parts.join(':')).digest('hex');
  return digest.substring(0, 32);
}
|
|
74
|
+
|
|
75
|
+
/**
 * Chooses the identifier of the current eval run.
 *
 * Uses, in order, `context.evaluationId`, PROMPTFOO_RUN_ID, GITHUB_RUN_ID and
 * CI_PIPELINE_ID. When none is defined it falls back to `local-` plus the
 * timestamp truncated to whole seconds.
 *
 * @param {object} [context={}] - Context object that may carry `evaluationId`.
 * @param {object} [env=process.env] - Environment map to read from.
 * @param {Date} [now=new Date()] - Clock used for the local fallback.
 * @returns {*} The run identifier.
 */
function scoreRunId(context = {}, env = process.env, now = new Date()) {
  const configured = [
    context.evaluationId,
    env.PROMPTFOO_RUN_ID,
    env.GITHUB_RUN_ID,
    env.CI_PIPELINE_ID
  ].find((candidate) => candidate != null);

  if (configured != null) return configured;

  const secondPrecision = now.toISOString().slice(0, 19);
  return `local-${secondPrecision}`;
}
|
|
82
|
+
|
|
83
|
+
/**
 * Picks a display label for the provider under evaluation.
 *
 * @param {object} [context={}] - Context that may carry `provider` and `vars`.
 * @returns {*} `provider.label`, else `provider.id`, else `vars.provider`,
 *   else the literal `'incremnt-coach-current'`.
 */
function providerLabel(context = {}) {
  const { provider, vars } = context;
  return provider?.label ?? provider?.id ?? vars?.provider ?? 'incremnt-coach-current';
}
|
|
89
|
+
|
|
90
|
+
/**
 * Picks the model name to report for an eval result.
 *
 * Sources are consulted in order and the first non-nullish one wins:
 * provider response metadata, `context.vars.model`, `testCase.metadata.model`,
 * SUMMARY_EVAL_MODEL, OPENROUTER_MODEL, and finally `'current-chain'`.
 *
 * @param {object} testCase - Test case; only read when the context yields no model.
 * @param {object} [context={}] - Context with optional `providerResponse` and `vars`.
 * @param {object} [env=process.env] - Environment map to read from.
 * @returns {*} The selected model label.
 */
function modelLabel(testCase, context = {}, env = process.env) {
  const fromResponse = context.providerResponse?.metadata?.model;
  if (fromResponse != null) return fromResponse;

  const fromVars = context.vars?.model;
  if (fromVars != null) return fromVars;

  // testCase is dereferenced only once the context sources are exhausted.
  const fromCase = testCase.metadata?.model;
  if (fromCase != null) return fromCase;

  return env.SUMMARY_EVAL_MODEL ?? env.OPENROUTER_MODEL ?? 'current-chain';
}
|
|
98
|
+
|
|
99
|
+
/**
 * Lists the checks of a result that did not pass.
 *
 * @param {object} result - Eval result with an optional `checks` array.
 * @returns {Array} Checks whose `passed` flag is falsy; empty when `checks` is absent.
 */
function failedChecks(result) {
  const checks = result.checks ?? [];
  return checks.filter(({ passed }) => !passed);
}
|
|
102
|
+
|
|
103
|
+
/**
 * Works out what a score should be attached to.
 *
 * Trace and observation ids are looked up first in the provider response
 * metadata and then in the test case metadata, accepting both the
 * `langfuse`-prefixed and the plain key. Without a trace id the score is
 * attached to the session instead, and any observation id is ignored.
 *
 * @param {object} args
 * @param {object} [args.testCase={}] - Test case with optional `metadata`.
 * @param {object} [args.context={}] - Context with optional `providerResponse.metadata`.
 * @param {*} args.sessionId - Session id used when no trace id is available.
 * @returns {object} `{ traceId, observationId? }` or `{ sessionId }`.
 */
function langfuseScoreTarget({ testCase = {}, context = {}, sessionId }) {
  const fromProvider = context.providerResponse?.metadata ?? {};
  const fromCase = testCase.metadata ?? {};

  // Provider metadata wins over case metadata; prefixed key wins over plain key.
  const lookup = (prefixedKey, plainKey) => firstString(
    fromProvider[prefixedKey],
    fromProvider[plainKey],
    fromCase[prefixedKey],
    fromCase[plainKey]
  );

  const traceId = lookup('langfuseTraceId', 'traceId');
  if (!traceId) return { sessionId };

  const observationId = lookup('langfuseObservationId', 'observationId');
  return compactObject({ traceId, observationId });
}
|
|
128
|
+
|
|
129
|
+
/**
 * Builds the Langfuse score payload for one promptfoo eval result.
 *
 * The score is a BOOLEAN named by `SCORE_NAME` with value 1 (passed) or 0
 * (failed). It targets an observation or trace when the provider response or
 * test case metadata supplies ids, and otherwise the `promptfoo:<runId>`
 * session. Nullish metadata entries are dropped.
 *
 * @param {object} args
 * @param {object} args.result - Eval result (`id`, `name`, `surface`, `passed`, `checks`).
 * @param {object} [args.testCase] - Test case definition; may be omitted or null.
 * @param {object} [args.context={}] - Promptfoo context (`vars`, `provider`, `providerResponse`, `evaluationId`).
 * @param {Date} [args.now=new Date()] - Clock used for `generatedAt` and the local run id.
 * @param {object} [args.env=process.env] - Environment map to read from.
 * @returns {object} Score payload including `id`, `name`, `value`, `dataType`,
 *   the target ids, `comment`, `metadata` and `environment`.
 */
export function buildPromptfooLangfuseScorePayload({
  result,
  testCase,
  context = {},
  now = new Date(),
  env = process.env
}) {
  // A missing or null test case is treated as empty instead of throwing.
  const caseInfo = testCase ?? {};
  const vars = context.vars ?? {};
  const caseSet = vars.caseSet ?? caseInfo.caseSet ?? env.SUMMARY_EVAL_CASE_SET ?? 'synthetic';
  const runId = scoreRunId(context, env, now);
  const mode = promptfooEvalMode(env);
  const provider = providerLabel(context);
  const failed = failedChecks(result);
  const promptVersion = vars.promptVersion
    ?? caseInfo.metadata?.promptVersion
    ?? promptVersionForSurface(result.surface);
  const sessionId = `promptfoo:${runId}`;
  const target = langfuseScoreTarget({ testCase: caseInfo, context, sessionId });
  const targetKind = target.observationId ? 'observation' : target.traceId ? 'trace' : 'session';

  // Keep the comment bounded, and never empty for a failure that lists no failing checks.
  const failureComment = failed
    .map((check) => `${check.key}: ${check.reason}`)
    .join(' | ')
    .slice(0, 1000);

  return {
    id: stableScoreId({ runId, caseSet, caseId: result.id, provider, mode }),
    name: SCORE_NAME,
    value: result.passed ? 1 : 0,
    dataType: 'BOOLEAN',
    ...target,
    comment: result.passed
      ? 'Promptfoo domain checks passed.'
      : failureComment || 'Promptfoo domain checks failed.',
    metadata: compactObject({
      kind: 'promptfoo_eval',
      caseId: result.id,
      caseName: result.name,
      caseSet,
      fixtureFile: vars.fixtureFile ?? caseInfo.fixtureFile,
      snapshotFile: vars.snapshotFile ?? caseInfo.snapshotFile,
      surface: result.surface,
      promptVersion,
      model: modelLabel(caseInfo, context, env),
      provider,
      mode,
      runId,
      // `vars.shouldPass` wins; otherwise anything but an explicit `false` counts as "should pass".
      shouldPass: vars.shouldPass ?? (caseInfo.shouldPass !== false),
      source: caseInfo.source ?? 'fixture',
      gitSha: currentGitSha(env),
      passed: result.passed,
      scoreTarget: targetKind,
      promptfooSessionId: sessionId,
      langfuseTraceId: target.traceId,
      langfuseObservationId: target.observationId,
      assertionCount: result.checks?.length ?? 0,
      failedAssertionKeys: failed.map((check) => check.key),
      failedAssertionReasons: failed.map((check) => check.reason).slice(0, 10),
      generatedAt: now.toISOString()
    }),
    environment: env.LANGFUSE_ENVIRONMENT ?? env.NODE_ENV ?? 'development'
  };
}
|
|
187
|
+
|
|
188
|
+
/**
 * Builds the ingestion batch that creates the run-level trace for a
 * session-targeted score.
 *
 * @param {object} scorePayload - Score payload as returned by
 *   `buildPromptfooLangfuseScorePayload`.
 * @param {object} [options]
 * @param {Date} [options.now=new Date()] - Clock used for all timestamps.
 * @param {string} [options.eventId=randomUUID()] - Id of the ingestion event.
 * @returns {object|null} A `{ batch, metadata }` object holding one
 *   `trace-create` event, or `null` when the score payload has no `sessionId`.
 */
export function buildPromptfooLangfuseRunTracePayload(scorePayload, {
  now = new Date(),
  eventId = randomUUID()
} = {}) {
  const sessionId = scorePayload?.sessionId;
  if (!sessionId) return null;

  const details = scorePayload.metadata ?? {};
  const { caseSet, mode, scoreTarget } = details;
  // Recover the run id from the session id when the metadata does not carry it.
  const runId = details.runId ?? String(sessionId).replace(/^promptfoo:/, '');
  const traceId = stablePromptfooRunTraceId({ sessionId, runId, mode });
  const timestamp = now.toISOString();

  const tags = ['source:promptfoo'];
  if (mode) tags.push(`mode:${mode}`);
  if (caseSet) tags.push(`case-set:${caseSet}`);

  const traceBody = {
    id: traceId,
    timestamp,
    name: 'promptfoo eval run',
    sessionId,
    userId: 'promptfoo',
    input: {
      redacted: true,
      source: 'promptfoo',
      runId,
      caseSet,
      mode
    },
    output: {
      redacted: true,
      scoreName: scorePayload.name,
      scoreTarget
    },
    metadata: compactObject({
      kind: 'promptfoo_eval_run',
      runId,
      caseSet,
      mode,
      provider: details.provider,
      model: details.model,
      promptfooSessionId: sessionId,
      scoreName: scorePayload.name,
      scoreTarget,
      gitSha: details.gitSha,
      generatedAt: timestamp
    }),
    tags,
    environment: scorePayload.environment
  };

  return {
    batch: [
      {
        type: 'trace-create',
        id: eventId,
        timestamp,
        body: traceBody
      }
    ],
    metadata: {
      source: 'incremnt-promptfoo-langfuse-scores',
      runId
    }
  };
}
|
|
254
|
+
|
|
255
|
+
/**
 * Builds the HTTP Basic `Authorization` header value from the Langfuse key pair.
 *
 * @param {object} env - Environment map providing LANGFUSE_PUBLIC_KEY and LANGFUSE_SECRET_KEY.
 * @returns {string} `Basic ` followed by base64 of `<public key>:<secret key>`.
 */
function authHeader(env) {
  const credentials = `${env.LANGFUSE_PUBLIC_KEY}:${env.LANGFUSE_SECRET_KEY}`;
  const encoded = Buffer.from(credentials).toString('base64');
  return `Basic ${encoded}`;
}
|
|
258
|
+
|
|
259
|
+
/**
 * Publishes the run-level trace for a session-targeted score, unless this
 * process has already published a trace with the same id.
 *
 * The trace id is remembered only after a successful publish, so a failed
 * attempt is retried on the next call.
 *
 * @param {object} payload - Score payload; ignored unless it has a `sessionId`.
 * @param {object} deps
 * @param {object} deps.env - Environment map (host and credentials).
 * @param {Function} deps.fetchImpl - `fetch`-compatible function.
 * @param {Date} deps.now - Clock forwarded to the trace payload builder.
 * @returns {Promise<object|null>} `null` without a session id, otherwise
 *   `{ status: 'already-created' | 'published', traceId }`.
 * @throws {Error} When the response is not ok, its body cannot be parsed as
 *   JSON, or the parsed body lists ingestion errors.
 */
async function ensurePromptfooRunTrace(payload, { env, fetchImpl, now }) {
  if (!payload.sessionId) return null;
  const runTracePayload = buildPromptfooLangfuseRunTracePayload(payload, { now });
  const runTraceId = runTracePayload.batch[0].body.id;

  if (promptfooRunTracesCreated.has(runTraceId)) {
    return { status: 'already-created', traceId: runTraceId };
  }

  const response = await fetchImpl(`${langfuseHost(env)}/api/public/ingestion`, {
    method: 'POST',
    headers: {
      Authorization: authHeader(env),
      'Content-Type': 'application/json'
    },
    body: JSON.stringify(runTracePayload)
  });

  // A fetch Response body can be read only once. Keep a clone so the raw text
  // is still available for the error message when JSON parsing fails.
  const textSource = typeof response?.clone === 'function' ? response.clone() : response;

  let body = null;
  let parseError = null;
  let rawText = '';

  if (typeof response?.json === 'function') {
    try {
      body = await response.json();
    } catch (error) {
      parseError = error;
      try {
        rawText = typeof textSource?.text === 'function' ? await textSource.text() : '';
      } catch {
        // The body was unreadable; report the failure without the raw text
        // instead of masking it with a "body already used" error.
        rawText = '';
      }
    }
  }

  if (!response?.ok || parseError || (Array.isArray(body?.errors) && body.errors.length > 0)) {
    const parseDetail = parseError instanceof Error ? parseError.message : '';
    const detail = body ? JSON.stringify(body) : rawText || parseDetail;
    throw new Error(`Langfuse promptfoo run trace publish failed: ${response?.status ?? 'unknown'} ${detail}`.trim());
  }

  promptfooRunTracesCreated.add(runTraceId);
  return { status: 'published', traceId: runTraceId };
}
|
|
298
|
+
|
|
299
|
+
/**
 * Publishes one promptfoo eval result to Langfuse as a score.
 *
 * This is best-effort: every failure is logged as a warning and reported in
 * the returned object rather than thrown. A failure to publish the run trace
 * does not prevent the score itself from being sent.
 *
 * @param {object} input - `{ result, testCase, context }` for the payload builder.
 * @param {object} [options]
 * @param {object} [options.env=process.env] - Environment map (toggle, host, credentials).
 * @param {Function} [options.fetchImpl=globalThis.fetch] - `fetch`-compatible function.
 * @param {object} [options.logger=console] - Logger; only `warn` is used.
 * @param {Date} [options.now=new Date()] - Clock used for payload timestamps.
 * @returns {Promise<object>} One of
 *   `{ status: 'disabled', reason }`,
 *   `{ status: 'failed', error }`, or
 *   `{ status: 'published', scoreId?, sessionId?, runTraceId?, runTraceStatus? }`.
 */
export async function publishPromptfooLangfuseScore(input, {
  env = process.env,
  fetchImpl = globalThis.fetch,
  logger = console,
  now = new Date()
} = {}) {
  if (!shouldPublishPromptfooLangfuseScores(env)) {
    return {
      status: 'disabled',
      reason: env.LANGFUSE_PUBLIC_KEY && env.LANGFUSE_SECRET_KEY ? 'disabled-by-env' : 'missing-credentials'
    };
  }

  if (typeof fetchImpl !== 'function') {
    return { status: 'failed', error: 'fetch is unavailable' };
  }

  try {
    // Built inside the try so malformed input is reported as a failed publish
    // instead of rejecting.
    const payload = buildPromptfooLangfuseScorePayload({ ...input, env, now });

    let runTrace = null;
    try {
      runTrace = await ensurePromptfooRunTrace(payload, { env, fetchImpl, now });
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      logger?.warn?.('Promptfoo Langfuse run trace publish failed:', message);
      runTrace = { status: 'failed', error: message };
    }

    const response = await fetchImpl(`${langfuseHost(env)}/api/public/scores`, {
      method: 'POST',
      headers: {
        Authorization: authHeader(env),
        'Content-Type': 'application/json'
      },
      body: JSON.stringify(payload)
    });

    if (!response?.ok) {
      const body = typeof response?.text === 'function' ? await response.text() : '';
      throw new Error(`Langfuse score publish failed: ${response?.status ?? 'unknown'} ${body}`.trim());
    }

    // The request already succeeded; an empty or non-JSON body only means the
    // score id is unknown, not that publishing failed.
    let data = null;
    if (typeof response.json === 'function') {
      try {
        data = await response.json();
      } catch {
        data = null;
      }
    }

    return compactObject({
      status: 'published',
      scoreId: data?.id,
      sessionId: payload.sessionId,
      runTraceId: runTrace?.traceId,
      runTraceStatus: runTrace?.status
    });
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    logger?.warn?.('Promptfoo Langfuse score publish skipped:', message);
    return { status: 'failed', error: message };
  }
}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
/**
 * Provider class exported for promptfoo. It forwards each call to
 * `callPromptfooProvider` from `./promptfoo-evals.js`, which is loaded with a
 * dynamic `import()` on every call.
 */
class IncremntCoachProvider {
  /**
   * @param {object} [options={}] - Provider options; `id` overrides the default provider id.
   */
  constructor(options = {}) {
    const { id } = options;
    this.providerId = id || 'incremnt-coach';
  }

  /**
   * @returns {string} The provider id chosen at construction time.
   */
  id() {
    return this.providerId;
  }

  /**
   * Forwards the prompt and context to `callPromptfooProvider`.
   *
   * @param {*} prompt - Passed through unchanged.
   * @param {*} context - Passed through unchanged.
   * @returns {Promise<*>} Whatever `callPromptfooProvider` returns.
   */
  async callApi(prompt, context) {
    const evalsModule = await import('./promptfoo-evals.js');
    const { callPromptfooProvider: forward } = evalsModule;
    return forward(prompt, context);
  }
}

module.exports = IncremntCoachProvider;
|