promptfoo 0.10.0 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +45 -35
- package/dist/assertions.d.ts +1 -1
- package/dist/assertions.d.ts.map +1 -1
- package/dist/assertions.js +176 -34
- package/dist/assertions.js.map +1 -1
- package/dist/evaluator.js +1 -1
- package/dist/evaluator.js.map +1 -1
- package/dist/main.js +10 -6
- package/dist/main.js.map +1 -1
- package/dist/types.d.ts +6 -2
- package/dist/types.d.ts.map +1 -1
- package/dist/util.d.ts +2 -2
- package/dist/util.d.ts.map +1 -1
- package/dist/util.js +6 -3
- package/dist/util.js.map +1 -1
- package/dist/web/client/assets/index-15dfcd18.js +172 -0
- package/dist/web/client/assets/index-87905193.css +1 -0
- package/dist/web/client/index.html +2 -2
- package/package.json +3 -1
- package/src/assertions.ts +246 -35
- package/src/evaluator.ts +1 -1
- package/src/main.ts +18 -6
- package/src/types.ts +23 -2
- package/src/util.ts +6 -3
- package/src/web/client/src/ResultsTable.css +18 -6
- package/src/web/client/src/ResultsTable.tsx +101 -35
- package/src/web/client/src/ResultsView.tsx +76 -11
- package/src/web/client/src/index.css +6 -0
- package/src/web/client/src/types.ts +2 -0
- package/dist/web/client/assets/index-9a9ba400.css +0 -1
- package/dist/web/client/assets/index-b72d3ca9.js +0 -172
|
@@ -0,0 +1 @@
|
|
|
1
|
+
:root{font-family:system-ui,Avenir,Helvetica,Arial,sans-serif;font-synthesis:none;text-rendering:optimizeLegibility;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale;-webkit-text-size-adjust:100%;--background-color: #ffffff;--text-color: #404040;--border-color: lightgray;--table-border-color: lightgray;--pass-color: green;--fail-color: #ad0000;--smalltext-color: gray;--success-background-color: #d1ffd7;--variable-background-color: #f7f7f7;--header-background-color: #fffdf7}@media (prefers-color-scheme: dark){:root{--background-color: #1a1a1a;--text-color: #f0f0f0;--border-color: #444444;--table-border-color: #444444;--pass-color: #4caf50;--fail-color: #f44336;--smalltext-color: #888888}}[data-theme=dark]{--background-color: #1a1a1a;--text-color: #f0f0f0;--border-color: #444444;--table-border-color: #444444;--pass-color: #4caf50;--fail-color: #f44336;--smalltext-color: #888888;--success-background-color: #216d2b;--variable-background-color: #333;--header-background-color: #333}html{font-size:calc(14px + (18 - 14) * ((100vw - 300px) / (1600 - 300)))}*{box-sizing:border-box}html{font-family:-apple-system,BlinkMacSystemFont,Segoe UI,Helvetica,Arial,sans-serif,"Apple Color Emoji","Segoe UI Emoji",Segoe UI Symbol;font-size:16px;background-color:var(--background-color);color:var(--text-color)}table,.divTable{border:1px solid var(--table-border-color);border-collapse:collapse;width:100%;margin:1rem 0;box-shadow:0 2px 4px #0000001a}.tr{display:flex}tr,.tr{width:fit-content}tr:hover,.tr:hover{background-color:#0000000d}th,.th,td,.td{position:relative;box-shadow:inset 0 0 0 1px var(--border-color);vertical-align:top;padding:1.5rem}th.variable,.th.variable,td.variable,.td.variable{background-color:var(--variable-background-color)}tr.header{background-color:var(--header-background-color)}th,.th{padding:1rem;position:relative;text-align:center;vertical-align:bottom}tr 
.cell-rating{visibility:hidden;position:absolute;bottom:1.25rem;right:0;line-height:0;font-size:1.75rem}tr:hover .cell-rating{visibility:visible}tr .cell-rating .rating{cursor:pointer}tr .cell-rating .rating:first-child{margin-right:.5rem}th .smalltext{visibility:hidden;font-weight:400;font-size:.75rem;color:var(--smalltext-color)}th:hover .smalltext{visibility:visible}th .summary{font-weight:400;font-size:.8rem;padding:.25rem}th .summary.highlight{background-color:var(--success-background-color)}td .status{margin-bottom:.5rem;font-weight:700}td .pass{color:var(--pass-color)}td .fail{color:var(--fail-color)}.first-prompt-col{border-left:2px solid #888}.first-prompt-row{border-top:2px solid #888}.resizer{position:absolute;right:0;top:0;height:100%;width:5px;cursor:col-resize;user-select:none;touch-action:none;background:var(--text-color);opacity:.5}.resizer.isResizing{background:var(--text-color);opacity:1}@media (hover: hover){.resizer{opacity:0}*:hover>.resizer{opacity:1}}.logo{display:flex;align-items:center;gap:4px}.logo img{width:30px}.logo span{margin-bottom:6px;color:var(--text-color)}[data-theme=dark] .logo img{filter:invert(1)}nav{display:flex;justify-content:space-between;align-items:center;margin-bottom:1rem;color:var(--text-color)}.dark-mode-toggle{background-color:transparent;border:none;color:var(--text-color);cursor:pointer;font-size:16px;padding:8px;transition:color .3s}.dark-mode-toggle:hover{color:var(--pass-color)}body{background-color:var(--background-color);color:var(--text-color)}
|
package/dist/web/client/index.html
CHANGED
|
@@ -5,8 +5,8 @@
|
|
|
5
5
|
<link rel="icon" type="image/svg+xml" href="favicon.ico" />
|
|
6
6
|
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
|
7
7
|
<title>promptfoo web viewer</title>
|
|
8
|
-
<script type="module" crossorigin src="/assets/index-b72d3ca9.js"></script>
|
|
9
|
-
<link rel="stylesheet" href="/assets/index-9a9ba400.css">
|
|
8
|
+
<script type="module" crossorigin src="/assets/index-15dfcd18.js"></script>
|
|
9
|
+
<link rel="stylesheet" href="/assets/index-87905193.css">
|
|
10
10
|
</head>
|
|
11
11
|
<body>
|
|
12
12
|
<div id="root"></div>
|
package/package.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"name": "promptfoo",
|
|
3
3
|
"description": "Prompt engineering toolkit",
|
|
4
4
|
"author": "Ian Webster",
|
|
5
|
-
"version": "0.10.0",
|
|
5
|
+
"version": "0.11.0",
|
|
6
6
|
"license": "MIT",
|
|
7
7
|
"type": "commonjs",
|
|
8
8
|
"main": "dist/index.js",
|
|
@@ -59,6 +59,7 @@
|
|
|
59
59
|
"typescript": "^5.0.4"
|
|
60
60
|
},
|
|
61
61
|
"dependencies": {
|
|
62
|
+
"@apidevtools/json-schema-ref-parser": "^10.1.0",
|
|
62
63
|
"async": "^3.2.4",
|
|
63
64
|
"cache-manager": "^4.1.0",
|
|
64
65
|
"cache-manager-fs-hash": "^1.0.0",
|
|
@@ -76,6 +77,7 @@
|
|
|
76
77
|
"node-fetch": "^2.6.7",
|
|
77
78
|
"nunjucks": "^3.2.4",
|
|
78
79
|
"opener": "^1.5.2",
|
|
80
|
+
"rouge": "^1.0.3",
|
|
79
81
|
"socket.io": "^4.6.1",
|
|
80
82
|
"tiny-invariant": "^1.3.1",
|
|
81
83
|
"winston": "^3.8.2"
|
package/src/assertions.ts
CHANGED
|
@@ -1,17 +1,49 @@
|
|
|
1
|
+
import rouge from 'rouge';
|
|
1
2
|
import invariant from 'tiny-invariant';
|
|
2
3
|
import nunjucks from 'nunjucks';
|
|
3
4
|
|
|
4
5
|
import { DefaultEmbeddingProvider, DefaultGradingProvider } from './providers/openai';
|
|
5
|
-
import { cosineSimilarity } from './util';
|
|
6
|
+
import { cosineSimilarity, fetchWithTimeout } from './util';
|
|
6
7
|
import { loadApiProvider } from './providers';
|
|
7
8
|
import { DEFAULT_GRADING_PROMPT } from './prompts';
|
|
8
9
|
|
|
9
|
-
import type {
|
|
10
|
+
import type {
|
|
11
|
+
Assertion,
|
|
12
|
+
AssertionType,
|
|
13
|
+
GradingConfig,
|
|
14
|
+
GradingResult,
|
|
15
|
+
AtomicTestCase,
|
|
16
|
+
} from './types';
|
|
10
17
|
|
|
11
18
|
const SIMILAR_REGEX = /similar(?::|\((\d+(\.\d+)?)\):)/;
|
|
12
19
|
|
|
13
20
|
const DEFAULT_SEMANTIC_SIMILARITY_THRESHOLD = 0.8;
|
|
14
21
|
|
|
22
|
+
function handleRougeScore(
|
|
23
|
+
baseType: 'rouge-n',
|
|
24
|
+
assertion: Assertion,
|
|
25
|
+
expected: string | string[],
|
|
26
|
+
output: string,
|
|
27
|
+
inverted: boolean,
|
|
28
|
+
): GradingResult {
|
|
29
|
+
const fnName = baseType[baseType.length - 1] as 'n' | 'l' | 's';
|
|
30
|
+
const rougeMethod = rouge[fnName];
|
|
31
|
+
const score = rougeMethod(output, expected);
|
|
32
|
+
console.log(output, expected, score);
|
|
33
|
+
const pass = score >= (assertion.threshold || 0.75) != inverted;
|
|
34
|
+
|
|
35
|
+
return {
|
|
36
|
+
pass,
|
|
37
|
+
reason: pass
|
|
38
|
+
? `${baseType.toUpperCase()} score ${score} is greater than or equal to threshold ${
|
|
39
|
+
assertion.threshold || 0.75
|
|
40
|
+
}`
|
|
41
|
+
: `${baseType.toUpperCase()} score ${score} is less than threshold ${
|
|
42
|
+
assertion.threshold || 0.75
|
|
43
|
+
}`,
|
|
44
|
+
};
|
|
45
|
+
}
|
|
46
|
+
|
|
15
47
|
export async function runAssertions(test: AtomicTestCase, output: string): Promise<GradingResult> {
|
|
16
48
|
const tokensUsed = {
|
|
17
49
|
total: 0,
|
|
@@ -46,7 +78,12 @@ export async function runAssertion(
|
|
|
46
78
|
): Promise<GradingResult> {
|
|
47
79
|
let pass: boolean = false;
|
|
48
80
|
|
|
49
|
-
|
|
81
|
+
invariant(assertion.type, `Assertion must have a type: ${JSON.stringify(assertion)}`);
|
|
82
|
+
|
|
83
|
+
const inverse = assertion.type.startsWith('not-');
|
|
84
|
+
const baseType = inverse ? assertion.type.slice(4) : assertion.type;
|
|
85
|
+
|
|
86
|
+
if (baseType === 'equals') {
|
|
50
87
|
pass = assertion.value === output;
|
|
51
88
|
return {
|
|
52
89
|
pass,
|
|
@@ -54,52 +91,194 @@ export async function runAssertion(
|
|
|
54
91
|
};
|
|
55
92
|
}
|
|
56
93
|
|
|
57
|
-
if (
|
|
94
|
+
if (baseType === 'is-json') {
|
|
58
95
|
try {
|
|
59
96
|
JSON.parse(output);
|
|
60
|
-
|
|
97
|
+
pass = !inverse;
|
|
61
98
|
} catch (err) {
|
|
62
|
-
|
|
63
|
-
pass: false,
|
|
64
|
-
reason: `Expected output to be valid JSON, but it isn't.\nError: ${err}`,
|
|
65
|
-
};
|
|
99
|
+
pass = inverse;
|
|
66
100
|
}
|
|
101
|
+
return { pass, reason: pass ? 'Assertion passed' : 'Expected output to be valid JSON' };
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
if (baseType === 'contains') {
|
|
105
|
+
invariant(assertion.value, '"contains" assertion type must have a string value');
|
|
106
|
+
invariant(
|
|
107
|
+
typeof assertion.value === 'string',
|
|
108
|
+
'"contains" assertion type must have a string value',
|
|
109
|
+
);
|
|
110
|
+
pass = output.includes(assertion.value) !== inverse;
|
|
111
|
+
return {
|
|
112
|
+
pass,
|
|
113
|
+
reason: pass
|
|
114
|
+
? 'Assertion passed'
|
|
115
|
+
: `Expected output to ${inverse ? 'not ' : ''}contain "${assertion.value}"`,
|
|
116
|
+
};
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
if (baseType === 'contains-any') {
|
|
120
|
+
invariant(assertion.value, '"contains-any" assertion type must have a value');
|
|
121
|
+
invariant(
|
|
122
|
+
Array.isArray(assertion.value),
|
|
123
|
+
'"contains-any" assertion type must have an array value',
|
|
124
|
+
);
|
|
125
|
+
pass = assertion.value.some((value) => output.includes(value)) !== inverse;
|
|
126
|
+
return {
|
|
127
|
+
pass,
|
|
128
|
+
reason: pass
|
|
129
|
+
? 'Assertion passed'
|
|
130
|
+
: `Expected output to ${inverse ? 'not ' : ''}contain one of "${assertion.value.join(
|
|
131
|
+
', ',
|
|
132
|
+
)}"`,
|
|
133
|
+
};
|
|
67
134
|
}
|
|
68
135
|
|
|
69
|
-
if (
|
|
70
|
-
|
|
136
|
+
if (baseType === 'contains-all') {
|
|
137
|
+
invariant(assertion.value, '"contains-all" assertion type must have a value');
|
|
138
|
+
invariant(
|
|
139
|
+
Array.isArray(assertion.value),
|
|
140
|
+
'"contains-all" assertion type must have an array value',
|
|
141
|
+
);
|
|
142
|
+
pass = assertion.value.every((value) => output.includes(value)) !== inverse;
|
|
143
|
+
return {
|
|
144
|
+
pass,
|
|
145
|
+
reason: pass
|
|
146
|
+
? 'Assertion passed'
|
|
147
|
+
: `Expected output to ${inverse ? 'not ' : ''}contain all of "${assertion.value.join(
|
|
148
|
+
', ',
|
|
149
|
+
)}"`,
|
|
150
|
+
};
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
if (baseType === 'regex') {
|
|
154
|
+
invariant(assertion.value, '"regex" assertion type must have a string value');
|
|
155
|
+
invariant(
|
|
156
|
+
typeof assertion.value === 'string',
|
|
157
|
+
'"contains" assertion type must have a string value',
|
|
158
|
+
);
|
|
159
|
+
const regex = new RegExp(assertion.value);
|
|
160
|
+
pass = regex.test(output) !== inverse;
|
|
161
|
+
return {
|
|
162
|
+
pass,
|
|
163
|
+
reason: pass
|
|
164
|
+
? 'Assertion passed'
|
|
165
|
+
: `Expected output to ${inverse ? 'not ' : ''}match regex "${assertion.value}"`,
|
|
166
|
+
};
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
if (baseType === 'icontains') {
|
|
170
|
+
invariant(assertion.value, '"icontains" assertion type must have a string value');
|
|
171
|
+
invariant(
|
|
172
|
+
typeof assertion.value === 'string',
|
|
173
|
+
'"icontains" assertion type must have a string value',
|
|
174
|
+
);
|
|
175
|
+
pass = output.toLowerCase().includes(assertion.value.toLowerCase()) !== inverse;
|
|
176
|
+
return {
|
|
177
|
+
pass,
|
|
178
|
+
reason: pass
|
|
179
|
+
? 'Assertion passed'
|
|
180
|
+
: `Expected output to ${inverse ? 'not ' : ''}contain "${assertion.value}"`,
|
|
181
|
+
};
|
|
182
|
+
}
|
|
183
|
+
|
|
184
|
+
if (baseType === 'contains-json') {
|
|
185
|
+
pass = containsJSON(output) !== inverse;
|
|
71
186
|
return {
|
|
72
187
|
pass,
|
|
73
|
-
reason: pass
|
|
188
|
+
reason: pass
|
|
189
|
+
? 'Assertion passed'
|
|
190
|
+
: `Expected output to ${inverse ? 'not ' : ''}contain valid JSON`,
|
|
74
191
|
};
|
|
75
192
|
}
|
|
76
193
|
|
|
77
|
-
if (
|
|
194
|
+
if (baseType === 'javascript') {
|
|
78
195
|
try {
|
|
79
|
-
const customFunction = new Function('output', `return ${assertion.value}`);
|
|
80
|
-
|
|
196
|
+
const customFunction = new Function('output', 'context', `return ${assertion.value}`);
|
|
197
|
+
const context = {
|
|
198
|
+
vars: test.vars || {},
|
|
199
|
+
};
|
|
200
|
+
pass = customFunction(output, context) !== inverse;
|
|
81
201
|
} catch (err) {
|
|
82
202
|
return {
|
|
83
203
|
pass: false,
|
|
84
|
-
reason: `Custom function threw error: ${(err as Error).message}
|
|
204
|
+
reason: `Custom function threw error: ${(err as Error).message}
|
|
205
|
+
${assertion.value}`,
|
|
85
206
|
};
|
|
86
207
|
}
|
|
87
208
|
return {
|
|
88
209
|
pass,
|
|
89
|
-
reason: pass
|
|
210
|
+
reason: pass
|
|
211
|
+
? 'Assertion passed'
|
|
212
|
+
: `Custom function returned ${inverse ? 'true' : 'false'}
|
|
213
|
+
${assertion.value}`,
|
|
90
214
|
};
|
|
91
215
|
}
|
|
92
216
|
|
|
93
|
-
if (
|
|
217
|
+
if (baseType === 'similar') {
|
|
94
218
|
invariant(assertion.value, 'Similarity assertion must have a string value');
|
|
95
|
-
|
|
219
|
+
invariant(
|
|
220
|
+
typeof assertion.value === 'string',
|
|
221
|
+
'"contains" assertion type must have a string value',
|
|
222
|
+
);
|
|
223
|
+
return matchesSimilarity(assertion.value, output, assertion.threshold || 0.75, inverse);
|
|
96
224
|
}
|
|
97
225
|
|
|
98
|
-
if (
|
|
226
|
+
if (baseType === 'llm-rubric') {
|
|
99
227
|
invariant(assertion.value, 'Similarity assertion must have a string value');
|
|
228
|
+
invariant(
|
|
229
|
+
typeof assertion.value === 'string',
|
|
230
|
+
'"contains" assertion type must have a string value',
|
|
231
|
+
);
|
|
100
232
|
return matchesLlmRubric(assertion.value, output, test.options);
|
|
101
233
|
}
|
|
102
234
|
|
|
235
|
+
if (baseType === 'webhook') {
|
|
236
|
+
invariant(assertion.value, '"webhook" assertion type must have a URL value');
|
|
237
|
+
invariant(
|
|
238
|
+
typeof assertion.value === 'string',
|
|
239
|
+
'"webhook" assertion type must have a URL value',
|
|
240
|
+
);
|
|
241
|
+
|
|
242
|
+
try {
|
|
243
|
+
const context = {
|
|
244
|
+
vars: test.vars || {},
|
|
245
|
+
};
|
|
246
|
+
const response = await fetchWithTimeout(
|
|
247
|
+
assertion.value,
|
|
248
|
+
{
|
|
249
|
+
method: 'POST',
|
|
250
|
+
headers: {
|
|
251
|
+
'Content-Type': 'application/json',
|
|
252
|
+
},
|
|
253
|
+
body: JSON.stringify({ output, context }),
|
|
254
|
+
},
|
|
255
|
+
process.env.WEBHOOK_TIMEOUT ? parseInt(process.env.WEBHOOK_TIMEOUT, 10) : 5000,
|
|
256
|
+
);
|
|
257
|
+
|
|
258
|
+
if (!response.ok) {
|
|
259
|
+
throw new Error(`Webhook response status: ${response.status}`);
|
|
260
|
+
}
|
|
261
|
+
|
|
262
|
+
const jsonResponse = await response.json();
|
|
263
|
+
pass = jsonResponse.pass !== inverse;
|
|
264
|
+
} catch (err) {
|
|
265
|
+
return {
|
|
266
|
+
pass: false,
|
|
267
|
+
reason: `Webhook error: ${(err as Error).message}`,
|
|
268
|
+
};
|
|
269
|
+
}
|
|
270
|
+
|
|
271
|
+
return {
|
|
272
|
+
pass,
|
|
273
|
+
reason: pass ? 'Assertion passed' : `Webhook returned ${inverse ? 'true' : 'false'}`,
|
|
274
|
+
};
|
|
275
|
+
}
|
|
276
|
+
|
|
277
|
+
if (baseType === 'rouge-n') {
|
|
278
|
+
invariant(assertion.value, '"rouge" assertion type must a value (string or string array)');
|
|
279
|
+
return handleRougeScore(baseType, assertion, assertion.value, output, inverse);
|
|
280
|
+
}
|
|
281
|
+
|
|
103
282
|
throw new Error('Unknown assertion type: ' + assertion.type);
|
|
104
283
|
}
|
|
105
284
|
|
|
@@ -125,6 +304,7 @@ export async function matchesSimilarity(
|
|
|
125
304
|
expected: string,
|
|
126
305
|
output: string,
|
|
127
306
|
threshold: number,
|
|
307
|
+
inverse: boolean = false,
|
|
128
308
|
): Promise<GradingResult> {
|
|
129
309
|
const expectedEmbedding = await DefaultEmbeddingProvider.callEmbeddingApi(expected);
|
|
130
310
|
const outputEmbedding = await DefaultEmbeddingProvider.callEmbeddingApi(output);
|
|
@@ -155,16 +335,19 @@ export async function matchesSimilarity(
|
|
|
155
335
|
}
|
|
156
336
|
|
|
157
337
|
const similarity = cosineSimilarity(expectedEmbedding.embedding, outputEmbedding.embedding);
|
|
158
|
-
|
|
338
|
+
const pass = inverse ? similarity <= threshold : similarity >= threshold;
|
|
339
|
+
const greaterThanReason = `Similarity ${similarity} is greater than threshold ${threshold}`;
|
|
340
|
+
const lessThanReason = `Similarity ${similarity} is less than threshold ${threshold}`;
|
|
341
|
+
if (pass) {
|
|
159
342
|
return {
|
|
160
|
-
pass:
|
|
161
|
-
reason:
|
|
343
|
+
pass: true,
|
|
344
|
+
reason: inverse ? lessThanReason : greaterThanReason,
|
|
162
345
|
tokensUsed,
|
|
163
346
|
};
|
|
164
347
|
}
|
|
165
348
|
return {
|
|
166
|
-
pass:
|
|
167
|
-
reason:
|
|
349
|
+
pass: false,
|
|
350
|
+
reason: inverse ? greaterThanReason : lessThanReason,
|
|
168
351
|
tokensUsed,
|
|
169
352
|
};
|
|
170
353
|
}
|
|
@@ -224,16 +407,7 @@ export async function matchesLlmRubric(
|
|
|
224
407
|
}
|
|
225
408
|
|
|
226
409
|
export function assertionFromString(expected: string): Assertion {
|
|
227
|
-
|
|
228
|
-
if (match) {
|
|
229
|
-
const threshold = parseFloat(match[1]) || DEFAULT_SEMANTIC_SIMILARITY_THRESHOLD;
|
|
230
|
-
const rest = expected.replace(SIMILAR_REGEX, '').trim();
|
|
231
|
-
return {
|
|
232
|
-
type: 'similar',
|
|
233
|
-
value: rest,
|
|
234
|
-
threshold,
|
|
235
|
-
};
|
|
236
|
-
}
|
|
410
|
+
// Legacy options
|
|
237
411
|
if (expected.startsWith('fn:') || expected.startsWith('eval:')) {
|
|
238
412
|
// TODO(1.0): delete eval: legacy option
|
|
239
413
|
const sliceLength = expected.startsWith('fn:') ? 'fn:'.length : 'eval:'.length;
|
|
@@ -249,11 +423,48 @@ export function assertionFromString(expected: string): Assertion {
|
|
|
249
423
|
value: expected.slice(6),
|
|
250
424
|
};
|
|
251
425
|
}
|
|
426
|
+
|
|
427
|
+
// New options
|
|
428
|
+
const assertionRegex =
|
|
429
|
+
/^(not-)?(equals|contains|contains-any|contains-all|regex|icontains):(.+)$/;
|
|
430
|
+
const regexMatch = expected.match(assertionRegex);
|
|
431
|
+
|
|
432
|
+
if (regexMatch) {
|
|
433
|
+
const [_, notPrefix, type, value] = regexMatch;
|
|
434
|
+
const fullType = notPrefix ? `not-${type}` : type;
|
|
435
|
+
|
|
436
|
+
if (type === 'contains-any' || type === 'contains-all') {
|
|
437
|
+
return {
|
|
438
|
+
type: fullType as AssertionType,
|
|
439
|
+
value: value.split(',').map((s) => s.trim()),
|
|
440
|
+
};
|
|
441
|
+
} else {
|
|
442
|
+
return {
|
|
443
|
+
type: fullType as AssertionType,
|
|
444
|
+
value,
|
|
445
|
+
};
|
|
446
|
+
}
|
|
447
|
+
}
|
|
448
|
+
|
|
449
|
+
// Options that require some special handling
|
|
450
|
+
const match = expected.match(SIMILAR_REGEX);
|
|
451
|
+
if (match) {
|
|
452
|
+
const threshold = parseFloat(match[1]) || DEFAULT_SEMANTIC_SIMILARITY_THRESHOLD;
|
|
453
|
+
const rest = expected.replace(SIMILAR_REGEX, '').trim();
|
|
454
|
+
return {
|
|
455
|
+
type: 'similar',
|
|
456
|
+
value: rest,
|
|
457
|
+
threshold,
|
|
458
|
+
};
|
|
459
|
+
}
|
|
460
|
+
|
|
252
461
|
if (expected === 'is-json' || expected === 'contains-json') {
|
|
253
462
|
return {
|
|
254
463
|
type: expected,
|
|
255
464
|
};
|
|
256
465
|
}
|
|
466
|
+
|
|
467
|
+
// Default to equality
|
|
257
468
|
return {
|
|
258
469
|
type: 'equals',
|
|
259
470
|
value: expected,
|
package/src/evaluator.ts
CHANGED
package/src/main.ts
CHANGED
|
@@ -68,7 +68,7 @@ async function main() {
|
|
|
68
68
|
];
|
|
69
69
|
let config: Partial<UnifiedConfig> = {};
|
|
70
70
|
for (const path of potentialPaths) {
|
|
71
|
-
const maybeConfig = maybeReadConfig(path);
|
|
71
|
+
const maybeConfig = await maybeReadConfig(path);
|
|
72
72
|
if (maybeConfig) {
|
|
73
73
|
config = maybeConfig;
|
|
74
74
|
break;
|
|
@@ -154,8 +154,16 @@ async function main() {
|
|
|
154
154
|
'This suffix is append to every prompt',
|
|
155
155
|
config.defaultTest?.options?.suffix,
|
|
156
156
|
)
|
|
157
|
-
.option(
|
|
158
|
-
|
|
157
|
+
.option(
|
|
158
|
+
'--no-write',
|
|
159
|
+
'Do not write results to promptfoo directory',
|
|
160
|
+
config?.commandLineOptions?.write,
|
|
161
|
+
)
|
|
162
|
+
.option(
|
|
163
|
+
'--no-cache',
|
|
164
|
+
'Do not read or write results to disk cache',
|
|
165
|
+
config?.commandLineOptions?.cache,
|
|
166
|
+
)
|
|
159
167
|
.option('--grader', 'Model that will grade outputs', config?.commandLineOptions?.grader)
|
|
160
168
|
.option('--verbose', 'Show debug logs', config?.commandLineOptions?.verbose)
|
|
161
169
|
.option('--view [port]', 'View in browser ui')
|
|
@@ -172,7 +180,7 @@ async function main() {
|
|
|
172
180
|
const maxConcurrency = parseInt(cmdObj.maxConcurrency || '', 10);
|
|
173
181
|
const configPath = cmdObj.config;
|
|
174
182
|
if (configPath) {
|
|
175
|
-
config = readConfig(configPath);
|
|
183
|
+
config = await readConfig(configPath);
|
|
176
184
|
} else {
|
|
177
185
|
config = {
|
|
178
186
|
prompts: cmdObj.prompts || config.prompts,
|
|
@@ -256,8 +264,9 @@ async function main() {
|
|
|
256
264
|
},
|
|
257
265
|
});
|
|
258
266
|
// Skip first row (header) and add the rest. Color PASS/FAIL
|
|
259
|
-
for (const row of summary.table.body) {
|
|
267
|
+
for (const row of summary.table.body.slice(0, 25)) {
|
|
260
268
|
table.push([
|
|
269
|
+
...row.vars,
|
|
261
270
|
...row.outputs.map((col) => {
|
|
262
271
|
const tableCellMaxLength = parseInt(cmdObj.tableCellMaxLength || '', 10);
|
|
263
272
|
if (!isNaN(tableCellMaxLength) && col.length > tableCellMaxLength) {
|
|
@@ -275,11 +284,14 @@ async function main() {
|
|
|
275
284
|
}
|
|
276
285
|
return col;
|
|
277
286
|
}),
|
|
278
|
-
...row.vars,
|
|
279
287
|
]);
|
|
280
288
|
}
|
|
281
289
|
|
|
282
290
|
logger.info('\n' + table.toString());
|
|
291
|
+
if (summary.table.body.length > 25) {
|
|
292
|
+
const rowsLeft = summary.table.body.length - 25;
|
|
293
|
+
logger.info(`... ${rowsLeft} more row${rowsLeft === 1 ? '' : 's'} not shown ...\n`);
|
|
294
|
+
}
|
|
283
295
|
}
|
|
284
296
|
if (cmdObj.view || !cmdObj.write) {
|
|
285
297
|
logger.info('Evaluation complete');
|
package/src/types.ts
CHANGED
|
@@ -113,13 +113,34 @@ export interface GradingResult {
|
|
|
113
113
|
tokensUsed?: TokenUsage;
|
|
114
114
|
}
|
|
115
115
|
|
|
116
|
+
type BaseAssertionTypes =
|
|
117
|
+
| 'equals'
|
|
118
|
+
| 'contains'
|
|
119
|
+
| 'icontains'
|
|
120
|
+
| 'contains-all'
|
|
121
|
+
| 'contains-any'
|
|
122
|
+
| 'regex'
|
|
123
|
+
| 'is-json'
|
|
124
|
+
| 'contains-json'
|
|
125
|
+
| 'javascript'
|
|
126
|
+
| 'similar'
|
|
127
|
+
| 'llm-rubric'
|
|
128
|
+
| 'webhook'
|
|
129
|
+
| 'rouge-n'
|
|
130
|
+
| 'rouge-s'
|
|
131
|
+
| 'rouge-l';
|
|
132
|
+
|
|
133
|
+
type NotPrefixed<T extends string> = `not-${T}`;
|
|
134
|
+
|
|
135
|
+
export type AssertionType = BaseAssertionTypes | NotPrefixed<BaseAssertionTypes>;
|
|
136
|
+
|
|
116
137
|
// TODO(ian): maybe Assertion should support {type: config} to make the yaml cleaner
|
|
117
138
|
export interface Assertion {
|
|
118
139
|
// Type of assertion
|
|
119
|
-
type:
|
|
140
|
+
type: AssertionType;
|
|
120
141
|
|
|
121
142
|
// The expected value, if applicable
|
|
122
|
-
value?: string;
|
|
143
|
+
value?: string | string[];
|
|
123
144
|
|
|
124
145
|
// The threshold value, only applicable for similarity (cosine distance)
|
|
125
146
|
threshold?: number;
|
package/src/util.ts
CHANGED
|
@@ -2,6 +2,7 @@ import * as fs from 'fs';
|
|
|
2
2
|
import * as path from 'node:path';
|
|
3
3
|
import * as os from 'node:os';
|
|
4
4
|
|
|
5
|
+
import $RefParser from '@apidevtools/json-schema-ref-parser';
|
|
5
6
|
import fetch from 'node-fetch';
|
|
6
7
|
import yaml from 'js-yaml';
|
|
7
8
|
import nunjucks from 'nunjucks';
|
|
@@ -28,14 +29,14 @@ function parseJson(json: string): any | undefined {
|
|
|
28
29
|
}
|
|
29
30
|
}
|
|
30
31
|
|
|
31
|
-
export function maybeReadConfig(configPath: string): UnifiedConfig | undefined {
|
|
32
|
+
export async function maybeReadConfig(configPath: string): Promise<UnifiedConfig | undefined> {
|
|
32
33
|
if (!fs.existsSync(configPath)) {
|
|
33
34
|
return undefined;
|
|
34
35
|
}
|
|
35
36
|
return readConfig(configPath);
|
|
36
37
|
}
|
|
37
38
|
|
|
38
|
-
export function readConfig(configPath: string): UnifiedConfig {
|
|
39
|
+
export async function readConfig(configPath: string): Promise<UnifiedConfig> {
|
|
39
40
|
const ext = path.parse(configPath).ext;
|
|
40
41
|
switch (ext) {
|
|
41
42
|
case '.json':
|
|
@@ -45,7 +46,9 @@ export function readConfig(configPath: string): UnifiedConfig {
|
|
|
45
46
|
return require(configPath) as UnifiedConfig;
|
|
46
47
|
case '.yaml':
|
|
47
48
|
case '.yml':
|
|
48
|
-
|
|
49
|
+
let ret = yaml.load(fs.readFileSync(configPath, 'utf-8')) as UnifiedConfig;
|
|
50
|
+
ret = (await $RefParser.dereference(ret)) as UnifiedConfig;
|
|
51
|
+
return ret;
|
|
49
52
|
default:
|
|
50
53
|
throw new Error(`Unsupported configuration file format: ${ext}`);
|
|
51
54
|
}
|
package/src/web/client/src/ResultsTable.css
CHANGED
|
@@ -40,7 +40,6 @@ td,
|
|
|
40
40
|
.td {
|
|
41
41
|
position: relative;
|
|
42
42
|
box-shadow: inset 0 0 0 1px var(--border-color);
|
|
43
|
-
word-break: break-all;
|
|
44
43
|
vertical-align: top;
|
|
45
44
|
|
|
46
45
|
padding: 1.5rem;
|
|
@@ -50,11 +49,11 @@ th.variable,
|
|
|
50
49
|
.th.variable,
|
|
51
50
|
td.variable,
|
|
52
51
|
.td.variable {
|
|
53
|
-
background-color:
|
|
52
|
+
background-color: var(--variable-background-color);
|
|
54
53
|
}
|
|
55
54
|
|
|
56
55
|
tr.header {
|
|
57
|
-
background-color:
|
|
56
|
+
background-color: var(--header-background-color);
|
|
58
57
|
}
|
|
59
58
|
|
|
60
59
|
th,
|
|
@@ -62,7 +61,7 @@ th,
|
|
|
62
61
|
padding: 1rem;
|
|
63
62
|
position: relative;
|
|
64
63
|
text-align: center;
|
|
65
|
-
|
|
64
|
+
vertical-align: bottom;
|
|
66
65
|
}
|
|
67
66
|
|
|
68
67
|
tr .cell {
|
|
@@ -72,7 +71,7 @@ tr .cell-rating {
|
|
|
72
71
|
visibility: hidden;
|
|
73
72
|
position: absolute;
|
|
74
73
|
bottom: 1.25rem;
|
|
75
|
-
right:
|
|
74
|
+
right: 0;
|
|
76
75
|
line-height: 0;
|
|
77
76
|
font-size: 1.75rem;
|
|
78
77
|
}
|
|
@@ -83,7 +82,10 @@ tr:hover .cell-rating {
|
|
|
83
82
|
|
|
84
83
|
tr .cell-rating .rating {
|
|
85
84
|
cursor: pointer;
|
|
86
|
-
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
tr .cell-rating .rating:first-child {
|
|
88
|
+
margin-right: 0.5rem;
|
|
87
89
|
}
|
|
88
90
|
|
|
89
91
|
th .smalltext {
|
|
@@ -97,6 +99,16 @@ th:hover .smalltext {
|
|
|
97
99
|
visibility: visible;
|
|
98
100
|
}
|
|
99
101
|
|
|
102
|
+
th .summary {
|
|
103
|
+
font-weight: normal;
|
|
104
|
+
font-size: 0.8rem;
|
|
105
|
+
padding: 0.25rem;
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
th .summary.highlight {
|
|
109
|
+
background-color: var(--success-background-color);
|
|
110
|
+
}
|
|
111
|
+
|
|
100
112
|
td,
|
|
101
113
|
.td {
|
|
102
114
|
}
|