incremnt 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +9 -2
- package/package.json +20 -4
- package/src/anonymize.js +12 -0
- package/src/coach-bakeoff.js +300 -0
- package/src/coach-facts.js +100 -0
- package/src/coach-prompt-variants.js +106 -0
- package/src/contract.js +24 -5
- package/src/exercise-aliases.js +163 -0
- package/src/format.js +59 -1
- package/src/increment-score-replay-data.js +486 -0
- package/src/increment-score-replay.js +822 -0
- package/src/lib.js +14 -2
- package/src/local.js +3 -3
- package/src/openrouter.js +979 -182
- package/src/program-phase-resolver.js +206 -0
- package/src/prompt-security.js +1 -1
- package/src/promptfoo-domain-assert.cjs +4 -0
- package/src/promptfoo-evals.js +166 -0
- package/src/promptfoo-langfuse-scores.js +354 -0
- package/src/promptfoo-provider.cjs +14 -0
- package/src/promptfoo-tests.cjs +4 -0
- package/src/queries.js +2175 -197
- package/src/remote.js +51 -5
- package/src/state.js +9 -2
- package/src/stored-summary-eval-report.js +85 -52
- package/src/summary-evals.js +623 -17
- package/src/sync-service.js +1199 -131
package/src/remote.js
CHANGED
|
@@ -40,7 +40,8 @@ const remoteCommandHandlers = {
|
|
|
40
40
|
'training-load': executeRemoteRead,
|
|
41
41
|
'ask-history': executeRemoteRead,
|
|
42
42
|
'ask-show': executeRemoteRead,
|
|
43
|
-
'program-share-fetch': executeRemoteRead
|
|
43
|
+
'program-share-fetch': executeRemoteRead,
|
|
44
|
+
'increment-score-history': executeRemoteRead
|
|
44
45
|
};
|
|
45
46
|
|
|
46
47
|
async function executeRemoteRead(options, sessionState, normalizedCommand) {
|
|
@@ -161,6 +162,13 @@ function endpointForCommand(baseUrl, normalizedCommand, options) {
|
|
|
161
162
|
return resolveServiceUrl(baseUrl, `/cli/ask/history/${options.id}`);
|
|
162
163
|
case 'program-share-fetch':
|
|
163
164
|
return resolveServiceUrl(baseUrl, `/program-share/${options.token}`);
|
|
165
|
+
case 'increment-score-history': {
|
|
166
|
+
const url = resolveServiceUrl(baseUrl, '/mobile/score-snapshots');
|
|
167
|
+
if (options.from) url.searchParams.set('from', options.from);
|
|
168
|
+
if (options.to) url.searchParams.set('to', options.to);
|
|
169
|
+
if (options.limit) url.searchParams.set('limit', options.limit);
|
|
170
|
+
return url;
|
|
171
|
+
}
|
|
164
172
|
default:
|
|
165
173
|
return resolveServiceUrl(baseUrl, '/');
|
|
166
174
|
}
|
|
@@ -346,16 +354,54 @@ const remoteWriteCommandHandlers = {
|
|
|
346
354
|
return response.json();
|
|
347
355
|
},
|
|
348
356
|
|
|
357
|
+
'increment-score-upload': async (options, sessionState) => {
|
|
358
|
+
const baseUrl = sessionState.session?.transport?.baseUrl;
|
|
359
|
+
if (!baseUrl) throw notImplementedError();
|
|
360
|
+
if (!options.file) {
|
|
361
|
+
const error = new Error('--file is required for increment-score upload.');
|
|
362
|
+
error.code = 'MISSING_OPTION';
|
|
363
|
+
throw error;
|
|
364
|
+
}
|
|
365
|
+
|
|
366
|
+
const raw = await fs.readFile(options.file, 'utf8');
|
|
367
|
+
const body = JSON.parse(raw);
|
|
368
|
+
if (!body || !Array.isArray(body.snapshots)) {
|
|
369
|
+
const error = new Error('Invalid file: expected an object with a snapshots array.');
|
|
370
|
+
error.code = 'INVALID_PAYLOAD';
|
|
371
|
+
throw error;
|
|
372
|
+
}
|
|
373
|
+
|
|
374
|
+
const endpoint = resolveServiceUrl(baseUrl, '/mobile/score-snapshots');
|
|
375
|
+
const response = await fetch(endpoint, {
|
|
376
|
+
method: 'POST',
|
|
377
|
+
headers: {
|
|
378
|
+
'Content-Type': 'application/json',
|
|
379
|
+
Authorization: `Bearer ${sessionState.session?.auth?.accessToken ?? ''}`
|
|
380
|
+
},
|
|
381
|
+
body: JSON.stringify(body)
|
|
382
|
+
});
|
|
383
|
+
|
|
384
|
+
if (response.status === 401 || response.status === 403) throw authenticationFailedError();
|
|
385
|
+
if (!response.ok) {
|
|
386
|
+
const payload = await response.json().catch(() => null);
|
|
387
|
+
const error = new Error(payload?.error ?? `Unexpected error (HTTP ${response.status}).`);
|
|
388
|
+
error.code = 'REMOTE_HTTP_ERROR';
|
|
389
|
+
throw error;
|
|
390
|
+
}
|
|
391
|
+
|
|
392
|
+
return response.json();
|
|
393
|
+
},
|
|
394
|
+
|
|
349
395
|
'program-share-revoke': async (options, sessionState) => {
|
|
350
396
|
const baseUrl = sessionState.session?.transport?.baseUrl;
|
|
351
397
|
if (!baseUrl) throw notImplementedError();
|
|
352
|
-
if (!options
|
|
353
|
-
const error = new Error('--
|
|
398
|
+
if (!options['share-id']) {
|
|
399
|
+
const error = new Error('--share-id is required for programs share revoke.');
|
|
354
400
|
error.code = 'MISSING_OPTION';
|
|
355
401
|
throw error;
|
|
356
402
|
}
|
|
357
403
|
|
|
358
|
-
const endpoint = resolveServiceUrl(baseUrl, `/cli/program-share/${options
|
|
404
|
+
const endpoint = resolveServiceUrl(baseUrl, `/cli/program-share/${options['share-id']}/revoke`);
|
|
359
405
|
const response = await fetch(endpoint, {
|
|
360
406
|
method: 'POST',
|
|
361
407
|
headers: {
|
|
@@ -365,7 +411,7 @@ const remoteWriteCommandHandlers = {
|
|
|
365
411
|
|
|
366
412
|
if (response.status === 401 || response.status === 403) throw authenticationFailedError();
|
|
367
413
|
if (response.status === 404) {
|
|
368
|
-
const error = new Error(`Program share not found: ${options
|
|
414
|
+
const error = new Error(`Program share not found: ${options['share-id']}`);
|
|
369
415
|
error.code = 'REMOTE_NOT_FOUND';
|
|
370
416
|
throw error;
|
|
371
417
|
}
|
package/src/state.js
CHANGED
|
@@ -4,12 +4,19 @@ import path from 'node:path';
|
|
|
4
4
|
|
|
5
5
|
export const sessionSchemaVersion = 1;
|
|
6
6
|
|
|
7
|
+
/**
 * Resolve the current user's home directory, preferring an explicit HOME
 * environment variable over os.homedir().
 *
 * Node.js documents that os.homedir() itself honors $HOME on POSIX and only
 * falls back to a user-database lookup when HOME is not defined; on Windows
 * it consults USERPROFILE instead. Checking HOME here first gives test
 * fixtures and explicit overrides a single knob that applies on every
 * platform.
 *
 * @returns {string} HOME when it is set to a non-empty string, otherwise the
 *   value reported by os.homedir().
 */
export function userHomeDir() {
  // `||` rather than `??`: an empty HOME also falls through to os.homedir().
  return process.env.HOME || os.homedir();
}
|
|
13
|
+
|
|
7
14
|
/**
 * Pick the per-user configuration root beneath the home directory.
 *
 * @returns {string} `<home>/Library/Application Support` when
 *   process.platform is 'darwin', `<home>/.config` on every other platform.
 */
function fallbackConfigRoot() {
  const segments = process.platform === 'darwin'
    ? ['Library', 'Application Support']
    : ['.config'];
  return path.join(userHomeDir(), ...segments);
}
|
|
14
21
|
|
|
15
22
|
export function resolveConfigDir() {
|
|
@@ -1,3 +1,10 @@
|
|
|
1
|
+
/**
 * Return a copy of a tally entry with a derived `passRate` field.
 *
 * @param {{ total: number, passed: number }} entry - Tally to extend; it is
 *   not mutated.
 * @returns {object} The entry's own properties plus `passRate`
 *   (`passed / total`, or 0 when `total` is not greater than zero).
 */
function withPassRate(entry) {
  const { total, passed } = entry;
  let passRate = 0;
  if (total > 0) {
    passRate = passed / total;
  }
  return Object.assign({}, entry, { passRate });
}
|
|
7
|
+
|
|
1
8
|
export function summarizeResults(results) {
|
|
2
9
|
const counts = {
|
|
3
10
|
total: results.length,
|
|
@@ -8,75 +15,85 @@ export function summarizeResults(results) {
|
|
|
8
15
|
return counts;
|
|
9
16
|
}
|
|
10
17
|
|
|
11
|
-
|
|
18
|
+
/**
 * Group results by a derived key and tally pass/fail counts per group.
 *
 * @param {Iterable<{ passed: * }>} results - Results to tally; a result
 *   counts as passed when its `passed` property is truthy.
 * @param {(result: object) => *} keyFn - Derives the grouping key for a result.
 * @returns {Object<string, object>} One entry per key, ordered by
 *   `localeCompare` on the key, each holding `total`, `passed`, `failed` and
 *   whatever withPassRate() adds.
 */
function summarizeByKey(results, keyFn) {
  const grouped = new Map();
  for (const result of results) {
    // Normalize to a string before grouping: the returned object can only
    // have string keys, and the sort below calls localeCompare, which would
    // throw on a non-string key such as an undefined surface.
    const key = String(keyFn(result));
    const entry = grouped.get(key) ?? { total: 0, passed: 0, failed: 0 };
    entry.total += 1;
    if (result.passed) entry.passed += 1;
    else entry.failed += 1;
    grouped.set(key, entry);
  }

  return Object.fromEntries(
    [...grouped.entries()]
      .sort(([left], [right]) => left.localeCompare(right))
      .map(([key, entry]) => [key, withPassRate(entry)])
  );
}
|
|
31
35
|
|
|
32
|
-
export function
|
|
36
|
+
/**
 * Tally results grouped by their `surface` property.
 *
 * @param {Array<object>} results - Results to group.
 * @returns {*} Whatever summarizeByKey() returns for the surface key.
 */
export function summarizeBySurface(results) {
  const surfaceOf = ({ surface }) => surface;
  return summarizeByKey(results, surfaceOf);
}
|
|
39
|
+
|
|
40
|
+
/**
 * Reduce a generation timestamp to its leading 10 characters (the
 * `YYYY-MM-DD` part of an ISO-style string) for use as a grouping key.
 *
 * @param {*} generatedAt - Timestamp string; any other type is treated as
 *   missing.
 * @returns {string} The first 10 characters of the trimmed timestamp, or
 *   'legacy' when the value is not a string or is blank.
 */
function normalizeGeneratedDate(generatedAt) {
  if (typeof generatedAt !== 'string') {
    return 'legacy';
  }
  // Slice the trimmed text, not the raw input: otherwise leading whitespace
  // would eat into the 10 characters and corrupt the date bucket.
  const trimmed = generatedAt.trim();
  if (trimmed.length === 0) {
    return 'legacy';
  }
  return trimmed.slice(0, 10);
}
|
|
46
|
+
|
|
47
|
+
/**
 * Normalize an optional metadata field into a grouping label.
 *
 * @param {*} value - Raw metadata value.
 * @returns {string} The value unchanged when it is a string with
 *   non-whitespace content, otherwise 'legacy'.
 */
function metadataValue(value) {
  if (typeof value !== 'string') {
    return 'legacy';
  }
  if (value.trim().length === 0) {
    return 'legacy';
  }
  return value;
}
|
|
50
|
+
|
|
51
|
+
/**
 * Build pass/fail breakdowns of the results along each metadata dimension.
 *
 * Missing or blank metadata fields are labelled by metadataValue() /
 * normalizeGeneratedDate(); the cohort key joins surface, prompt version,
 * model and generated date with ' / '.
 *
 * @param {Array<object>} results - Results whose optional `metadata` object
 *   may carry `promptVersion`, `model`, `generatedAt` and `gitSha`.
 * @returns {{ byPromptVersion: *, byModel: *, byGeneratedDate: *, byGitSha: *, byCohort: * }}
 *   One summarizeByKey() result per dimension.
 */
function summarizeMetadata(results) {
  const promptVersionOf = (result) => metadataValue(result.metadata?.promptVersion);
  const modelOf = (result) => metadataValue(result.metadata?.model);
  const generatedDateOf = (result) => normalizeGeneratedDate(result.metadata?.generatedAt);
  const gitShaOf = (result) => metadataValue(result.metadata?.gitSha);
  const cohortOf = (result) => {
    const segments = [promptVersionOf(result), modelOf(result), generatedDateOf(result)];
    return `${result.surface} / ${segments.join(' / ')}`;
  };

  const byPromptVersion = summarizeByKey(results, promptVersionOf);
  const byModel = summarizeByKey(results, modelOf);
  const byGeneratedDate = summarizeByKey(results, generatedDateOf);
  const byGitSha = summarizeByKey(results, gitShaOf);
  const byCohort = summarizeByKey(results, cohortOf);

  return { byPromptVersion, byModel, byGeneratedDate, byGitSha, byCohort };
}
|
|
65
|
+
|
|
66
|
+
/**
 * Aggregate a flat list of results into the three summary views.
 *
 * @param {Array<object>} results - Results to aggregate.
 * @returns {{ summary: *, bySurface: *, metadata: * }} Outputs of
 *   summarizeResults(), summarizeBySurface() and summarizeMetadata().
 */
function summarizeStoredResults(results) {
  const summary = summarizeResults(results);
  const bySurface = summarizeBySurface(results);
  const metadata = summarizeMetadata(results);
  return { summary, bySurface, metadata };
}
|
|
73
|
+
|
|
74
|
+
/**
 * Build the report object for one stored snapshot.
 *
 * @param {*} snapshotPath - Identifier of the snapshot, copied into the
 *   report as-is.
 * @param {Array<object>} results - Evaluated results; each must carry a
 *   `checks` array.
 * @returns {object} `snapshotPath`, the `summary` / `bySurface` / `metadata`
 *   aggregates from summarizeStoredResults(), and a trimmed `results` list
 *   holding id, surface, passed, output, metadata (null when absent) and
 *   only the checks that did not pass.
 */
export function buildStoredSummaryReport(snapshotPath, results) {
  const { summary, bySurface, metadata } = summarizeStoredResults(results);

  const toReportEntry = (result) => {
    const failedChecks = result.checks.filter((check) => !check.passed);
    return {
      id: result.id,
      surface: result.surface,
      passed: result.passed,
      output: result.output,
      metadata: result.metadata ?? null,
      failedChecks
    };
  };

  return {
    snapshotPath,
    summary,
    bySurface,
    metadata,
    results: results.map((result) => toReportEntry(result))
  };
}
|
|
46
91
|
|
|
47
92
|
/**
 * Combine several snapshot reports into one batch summary by re-aggregating
 * their individual results.
 *
 * @param {Array<{ results?: Array<object> }>} reports - Snapshot reports; a
 *   report without `results` contributes nothing.
 * @returns {object} `snapshotCount` plus every field returned by
 *   summarizeStoredResults() for the concatenated results.
 */
export function summarizeBatchReports(reports) {
  const perReport = reports.map((report) => report.results ?? []);
  const combined = perReport.flat();
  const snapshotCount = reports.length;
  const aggregate = summarizeStoredResults(combined);
  return { snapshotCount, ...aggregate };
}
|
|
82
99
|
|
|
@@ -90,8 +107,8 @@ export function evaluateBatchThresholds(summary, {
|
|
|
90
107
|
} = {}) {
|
|
91
108
|
const failures = [];
|
|
92
109
|
|
|
93
|
-
if (typeof minPassRate === 'number' && summary.passRate < minPassRate) {
|
|
94
|
-
failures.push(`Overall pass rate ${percentage(summary.passRate)} is below required ${percentage(minPassRate)}.`);
|
|
110
|
+
if (typeof minPassRate === 'number' && summary.summary.passRate < minPassRate) {
|
|
111
|
+
failures.push(`Overall pass rate ${percentage(summary.summary.passRate)} is below required ${percentage(minPassRate)}.`);
|
|
95
112
|
}
|
|
96
113
|
|
|
97
114
|
for (const [surface, minimum] of Object.entries(minSurfacePassRates)) {
|
|
@@ -105,22 +122,38 @@ export function evaluateBatchThresholds(summary, {
|
|
|
105
122
|
return failures;
|
|
106
123
|
}
|
|
107
124
|
|
|
125
|
+
/**
 * Render a keyed pass/fail summary as markdown bullet lines.
 *
 * @param {Object<string, { passed: number, total: number, passRate: number }>} entries
 *   Summary keyed by label.
 * @param {{ limit?: number | null }} [options] - When `limit` is not
 *   null/undefined, only the first `limit` lines are returned.
 * @returns {string[]} Lines of the form
 *   `- <label>: <passed>/<total> passed (<rate>%)`, rate with one decimal.
 */
function formatSummaryLines(entries, { limit = null } = {}) {
  const lines = [];
  for (const [label, entry] of Object.entries(entries)) {
    const percent = (entry.passRate * 100).toFixed(1);
    lines.push(`- ${label}: ${entry.passed}/${entry.total} passed (${percent}%)`);
  }

  if (limit == null) {
    return lines;
  }
  return lines.slice(0, limit);
}
|
|
131
|
+
|
|
108
132
|
export function formatBatchSummaryMarkdown(summary, reports, failures = []) {
|
|
109
133
|
const lines = [
|
|
110
134
|
'# Stored Summary Eval Report',
|
|
111
135
|
'',
|
|
112
136
|
`- Snapshots: ${summary.snapshotCount}`,
|
|
113
|
-
`- Total summaries: ${summary.total}`,
|
|
114
|
-
`- Passed: ${summary.passed}`,
|
|
115
|
-
`- Failed: ${summary.failed}`,
|
|
116
|
-
`- Pass rate: ${(summary.passRate * 100).toFixed(1)}%`,
|
|
137
|
+
`- Total summaries: ${summary.summary.total}`,
|
|
138
|
+
`- Passed: ${summary.summary.passed}`,
|
|
139
|
+
`- Failed: ${summary.summary.failed}`,
|
|
140
|
+
`- Pass rate: ${(summary.summary.passRate * 100).toFixed(1)}%`,
|
|
117
141
|
'',
|
|
118
|
-
'## By Surface'
|
|
142
|
+
'## By Surface',
|
|
143
|
+
...formatSummaryLines(summary.bySurface)
|
|
119
144
|
];
|
|
120
145
|
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
146
|
+
lines.push('', '## By Prompt Version');
|
|
147
|
+
lines.push(...formatSummaryLines(summary.metadata.byPromptVersion));
|
|
148
|
+
|
|
149
|
+
lines.push('', '## By Model');
|
|
150
|
+
lines.push(...formatSummaryLines(summary.metadata.byModel));
|
|
151
|
+
|
|
152
|
+
lines.push('', '## By Generated Date');
|
|
153
|
+
lines.push(...formatSummaryLines(summary.metadata.byGeneratedDate));
|
|
154
|
+
|
|
155
|
+
lines.push('', '## Versioned Cohorts');
|
|
156
|
+
lines.push(...formatSummaryLines(summary.metadata.byCohort, { limit: 20 }));
|
|
124
157
|
|
|
125
158
|
lines.push('', '## Snapshots');
|
|
126
159
|
for (const report of reports) {
|