promptfoo 0.18.1 → 0.18.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/package.json +1 -1
- package/dist/src/assertions.d.ts +2 -2
- package/dist/src/assertions.d.ts.map +1 -1
- package/dist/src/assertions.js +42 -11
- package/dist/src/assertions.js.map +1 -1
- package/dist/src/cache.d.ts +1 -1
- package/dist/src/cache.d.ts.map +1 -1
- package/dist/src/cache.js +4 -4
- package/dist/src/cache.js.map +1 -1
- package/dist/src/evaluator.d.ts.map +1 -1
- package/dist/src/evaluator.js +5 -2
- package/dist/src/evaluator.js.map +1 -1
- package/dist/src/main.js +4 -4
- package/dist/src/main.js.map +1 -1
- package/dist/src/providers/azureopenai.d.ts +2 -2
- package/dist/src/providers/azureopenai.d.ts.map +1 -1
- package/dist/src/providers/azureopenai.js +7 -5
- package/dist/src/providers/azureopenai.js.map +1 -1
- package/dist/src/providers/llama.js +1 -1
- package/dist/src/providers/llama.js.map +1 -1
- package/dist/src/providers/localai.js +2 -2
- package/dist/src/providers/localai.js.map +1 -1
- package/dist/src/providers/ollama.d.ts +9 -0
- package/dist/src/providers/ollama.d.ts.map +1 -0
- package/dist/src/providers/ollama.js +66 -0
- package/dist/src/providers/ollama.js.map +1 -0
- package/dist/src/providers/openai.d.ts +2 -2
- package/dist/src/providers/openai.d.ts.map +1 -1
- package/dist/src/providers/openai.js +7 -5
- package/dist/src/providers/openai.js.map +1 -1
- package/dist/src/providers.d.ts.map +1 -1
- package/dist/src/providers.js +11 -5
- package/dist/src/providers.js.map +1 -1
- package/dist/src/types.d.ts +6 -2
- package/dist/src/types.d.ts.map +1 -1
- package/dist/src/util.d.ts +2 -0
- package/dist/src/util.d.ts.map +1 -1
- package/dist/src/util.js +24 -12
- package/dist/src/util.js.map +1 -1
- package/dist/src/web/client/assets/index-6d2a3573.js +200 -0
- package/dist/src/web/client/index.html +1 -1
- package/package.json +1 -1
- package/src/assertions.ts +45 -11
- package/src/cache.ts +3 -2
- package/src/evaluator.ts +5 -1
- package/src/main.ts +4 -4
- package/src/providers/azureopenai.ts +18 -6
- package/src/providers/llama.ts +2 -2
- package/src/providers/localai.ts +3 -3
- package/src/providers/ollama.ts +88 -0
- package/src/providers/openai.ts +8 -6
- package/src/providers.ts +20 -5
- package/src/types.ts +6 -2
- package/src/util.ts +25 -17
- package/src/web/client/package-lock.json +5726 -0
- package/src/web/client/src/EvalOutputPromptDialog.tsx +78 -16
- package/src/web/client/src/ResultsTable.tsx +32 -9
- package/src/web/client/src/ResultsView.tsx +1 -1
- package/src/web/client/src/types.ts +3 -1
- package/dist/src/web/client/assets/index-8388d689.js +0 -199
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { useState, useEffect } from 'react';
|
|
2
|
+
import Box from '@mui/material/Box';
|
|
2
3
|
import Button from '@mui/material/Button';
|
|
3
4
|
import Dialog from '@mui/material/Dialog';
|
|
4
5
|
import DialogActions from '@mui/material/DialogActions';
|
|
@@ -8,12 +9,60 @@ import TextareaAutosize from '@mui/base/TextareaAutosize';
|
|
|
8
9
|
import IconButton from '@mui/material/IconButton';
|
|
9
10
|
import ContentCopyIcon from '@mui/icons-material/ContentCopy';
|
|
10
11
|
import CheckIcon from '@mui/icons-material/Check';
|
|
12
|
+
import Table from '@mui/material/Table';
|
|
13
|
+
import TableBody from '@mui/material/TableBody';
|
|
14
|
+
import TableCell from '@mui/material/TableCell';
|
|
15
|
+
import TableContainer from '@mui/material/TableContainer';
|
|
16
|
+
import TableHead from '@mui/material/TableHead';
|
|
17
|
+
import TableRow from '@mui/material/TableRow';
|
|
18
|
+
import Typography from '@mui/material/Typography';
|
|
19
|
+
|
|
20
|
+
import type { GradingResult } from '../../../types';
|
|
11
21
|
|
|
12
22
|
interface EvalOutputPromptDialogProps {
|
|
13
23
|
open: boolean;
|
|
14
24
|
onClose: () => void;
|
|
15
25
|
prompt: string;
|
|
16
26
|
output?: string;
|
|
27
|
+
gradingResults?: GradingResult[];
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
function AssertionResults({ gradingResults }: { gradingResults?: GradingResult[] }) {
|
|
31
|
+
if (!gradingResults) {
|
|
32
|
+
return null;
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
return (
|
|
36
|
+
<Box mt={2}>
|
|
37
|
+
<Typography variant="subtitle1">Assertions</Typography>
|
|
38
|
+
<TableContainer>
|
|
39
|
+
<Table>
|
|
40
|
+
<TableHead>
|
|
41
|
+
<TableRow>
|
|
42
|
+
<TableCell style={{ fontWeight: 'bold' }}>Pass</TableCell>
|
|
43
|
+
<TableCell style={{ fontWeight: 'bold' }}>Score</TableCell>
|
|
44
|
+
<TableCell style={{ fontWeight: 'bold' }}>Type</TableCell>
|
|
45
|
+
<TableCell style={{ fontWeight: 'bold' }}>Value</TableCell>
|
|
46
|
+
<TableCell style={{ fontWeight: 'bold' }}>Reason</TableCell>
|
|
47
|
+
</TableRow>
|
|
48
|
+
</TableHead>
|
|
49
|
+
<TableBody>
|
|
50
|
+
{gradingResults.map((result, i) => (
|
|
51
|
+
<TableRow key={i}>
|
|
52
|
+
<TableCell>{result.pass ? '✅' : '❌'}</TableCell>
|
|
53
|
+
<TableCell>{result.score}</TableCell>
|
|
54
|
+
<TableCell>{result.assertion?.type || ''}</TableCell>
|
|
55
|
+
<TableCell>
|
|
56
|
+
{result.assertion?.value ? String(result.assertion.value) : '-'}
|
|
57
|
+
</TableCell>
|
|
58
|
+
<TableCell>{result.reason}</TableCell>
|
|
59
|
+
</TableRow>
|
|
60
|
+
))}
|
|
61
|
+
</TableBody>
|
|
62
|
+
</Table>
|
|
63
|
+
</TableContainer>
|
|
64
|
+
</Box>
|
|
65
|
+
);
|
|
17
66
|
}
|
|
18
67
|
|
|
19
68
|
export default function EvalOutputPromptDialog({
|
|
@@ -21,6 +70,7 @@ export default function EvalOutputPromptDialog({
|
|
|
21
70
|
onClose,
|
|
22
71
|
prompt,
|
|
23
72
|
output,
|
|
73
|
+
gradingResults,
|
|
24
74
|
}: EvalOutputPromptDialogProps) {
|
|
25
75
|
const [copied, setCopied] = useState(false);
|
|
26
76
|
|
|
@@ -35,28 +85,40 @@ export default function EvalOutputPromptDialog({
|
|
|
35
85
|
|
|
36
86
|
return (
|
|
37
87
|
<Dialog open={open} onClose={onClose} fullWidth maxWidth="lg">
|
|
38
|
-
<DialogTitle>
|
|
88
|
+
<DialogTitle>Details</DialogTitle>
|
|
39
89
|
<DialogContent>
|
|
40
|
-
<
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
<
|
|
51
|
-
|
|
90
|
+
<Box mb={2}>
|
|
91
|
+
<Typography variant="subtitle1" style={{ marginBottom: '1rem' }}>
|
|
92
|
+
Prompt
|
|
93
|
+
</Typography>
|
|
94
|
+
<TextareaAutosize
|
|
95
|
+
readOnly
|
|
96
|
+
value={prompt}
|
|
97
|
+
style={{ width: '100%', padding: '0.75rem' }}
|
|
98
|
+
maxRows={20}
|
|
99
|
+
/>
|
|
100
|
+
<IconButton
|
|
101
|
+
onClick={() => copyToClipboard(prompt)}
|
|
102
|
+
style={{ position: 'absolute', right: '10px', top: '10px' }}
|
|
103
|
+
>
|
|
104
|
+
{copied ? <CheckIcon /> : <ContentCopyIcon />}
|
|
105
|
+
</IconButton>
|
|
106
|
+
</Box>
|
|
107
|
+
{output && (
|
|
108
|
+
<Box my={2}>
|
|
109
|
+
<Typography variant="subtitle1" style={{ marginBottom: '1rem', marginTop: '1rem' }}>
|
|
110
|
+
Output
|
|
111
|
+
</Typography>
|
|
52
112
|
<TextareaAutosize
|
|
53
113
|
readOnly
|
|
114
|
+
maxRows={20}
|
|
54
115
|
value={output}
|
|
55
116
|
style={{ width: '100%', padding: '0.75rem' }}
|
|
56
117
|
/>
|
|
57
|
-
</
|
|
58
|
-
|
|
59
|
-
|
|
118
|
+
</Box>
|
|
119
|
+
)}
|
|
120
|
+
<AssertionResults gradingResults={gradingResults} />
|
|
121
|
+
</DialogContent>
|
|
60
122
|
<DialogActions>
|
|
61
123
|
<Button onClick={onClose}>Close</Button>
|
|
62
124
|
</DialogActions>
|
|
@@ -155,6 +155,7 @@ function EvalOutputCell({
|
|
|
155
155
|
open={openPrompt}
|
|
156
156
|
onClose={handlePromptClose}
|
|
157
157
|
prompt={output.prompt}
|
|
158
|
+
gradingResults={output.gradingResult?.componentResults}
|
|
158
159
|
output={text}
|
|
159
160
|
/>
|
|
160
161
|
</>
|
|
@@ -223,12 +224,25 @@ export default function ResultsTable({
|
|
|
223
224
|
const { table, setTable } = useStore();
|
|
224
225
|
invariant(table, 'Table should be defined');
|
|
225
226
|
const { head, body } = table;
|
|
226
|
-
const
|
|
227
|
+
const numGoodTests = head.prompts.map((_, idx) =>
|
|
227
228
|
body.reduce((acc, row) => {
|
|
228
229
|
return acc + (row.outputs[idx].pass ? 1 : 0);
|
|
229
230
|
}, 0),
|
|
230
231
|
);
|
|
231
232
|
|
|
233
|
+
const numAsserts = head.prompts.map((_, idx) =>
|
|
234
|
+
body.reduce((acc, row) => {
|
|
235
|
+
return acc + (row.outputs[idx].gradingResult?.componentResults?.length || 0);
|
|
236
|
+
}, 0),
|
|
237
|
+
);
|
|
238
|
+
|
|
239
|
+
const numGoodAsserts = head.prompts.map((_, idx) =>
|
|
240
|
+
body.reduce((acc, row) => {
|
|
241
|
+
const componentResults = row.outputs[idx].gradingResult?.componentResults;
|
|
242
|
+
return acc + (componentResults ? componentResults.filter((r) => r.pass).length : 0);
|
|
243
|
+
}, 0),
|
|
244
|
+
);
|
|
245
|
+
|
|
232
246
|
const handleRating = (rowIndex: number, promptIndex: number, isPass: boolean) => {
|
|
233
247
|
const updatedData = [...body];
|
|
234
248
|
const updatedRow = { ...updatedData[rowIndex] };
|
|
@@ -243,10 +257,13 @@ export default function ResultsTable({
|
|
|
243
257
|
});
|
|
244
258
|
};
|
|
245
259
|
|
|
246
|
-
const highestPassingIndex =
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
260
|
+
const highestPassingIndex = numGoodTests.reduce(
|
|
261
|
+
(maxIndex, currentPassCount, currentIndex, array) => {
|
|
262
|
+
return currentPassCount > array[maxIndex] ? currentIndex : maxIndex;
|
|
263
|
+
},
|
|
264
|
+
0,
|
|
265
|
+
);
|
|
266
|
+
const highestPassingCount = numGoodTests[highestPassingIndex];
|
|
250
267
|
const columnHelper = createColumnHelper<EvalRow>();
|
|
251
268
|
const columns = [
|
|
252
269
|
columnHelper.group({
|
|
@@ -282,9 +299,9 @@ export default function ResultsTable({
|
|
|
282
299
|
columnHelper.accessor((row: EvalRow) => formatRowOutput(row.outputs[idx]), {
|
|
283
300
|
id: `Prompt ${idx + 1}`,
|
|
284
301
|
header: () => {
|
|
285
|
-
const pct = ((
|
|
302
|
+
const pct = ((numGoodTests[idx] / body.length) * 100.0).toFixed(2);
|
|
286
303
|
const isHighestPassing =
|
|
287
|
-
|
|
304
|
+
numGoodTests[idx] === highestPassingCount && highestPassingCount !== 0;
|
|
288
305
|
const columnId = `Prompt ${idx + 1}`;
|
|
289
306
|
const isChecked = failureFilter[columnId] || false;
|
|
290
307
|
// TODO(ian): prompt string support for backwards compatibility, remove after 0.17.0
|
|
@@ -313,12 +330,18 @@ export default function ResultsTable({
|
|
|
313
330
|
/>
|
|
314
331
|
)}
|
|
315
332
|
<div className={`summary ${isHighestPassing ? 'highlight' : ''}`}>
|
|
316
|
-
Passing: <strong>{pct}%</strong> ({
|
|
333
|
+
Passing: <strong>{pct}%</strong> ({numGoodTests[idx]}/{body.length} cases
|
|
334
|
+
{numAsserts[idx] ? (
|
|
335
|
+
<span>
|
|
336
|
+
, {numGoodAsserts[idx]}/{numAsserts[idx]} asserts
|
|
337
|
+
</span>
|
|
338
|
+
) : null}
|
|
339
|
+
)
|
|
317
340
|
</div>
|
|
318
341
|
</>
|
|
319
342
|
);
|
|
320
343
|
},
|
|
321
|
-
cell: (info: CellContext<EvalRow,
|
|
344
|
+
cell: (info: CellContext<EvalRow, EvalRowOutput>) => (
|
|
322
345
|
<EvalOutputCell
|
|
323
346
|
output={info.getValue() as unknown as EvalRowOutput}
|
|
324
347
|
maxTextLength={maxTextLength}
|
|
@@ -86,7 +86,7 @@ export default function ResultsView({ recentFiles, onRecentFileSelected }: Resul
|
|
|
86
86
|
setFailureFilter(newFailureFilter);
|
|
87
87
|
};
|
|
88
88
|
|
|
89
|
-
const [wordBreak, setWordBreak] = React.useState<'break-word' | 'break-all'>('break-
|
|
89
|
+
const [wordBreak, setWordBreak] = React.useState<'break-word' | 'break-all'>('break-word');
|
|
90
90
|
const handleWordBreakChange = (event: React.ChangeEvent<HTMLInputElement>) => {
|
|
91
91
|
setWordBreak(event.target.checked ? 'break-all' : 'break-word');
|
|
92
92
|
};
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { TokenUsage } from '../../../types';
|
|
1
|
+
import { EvaluateResult, TokenUsage } from '../../../types';
|
|
2
2
|
|
|
3
3
|
type Prompt = {
|
|
4
4
|
display: string;
|
|
@@ -10,6 +10,7 @@ export type EvalHead = {
|
|
|
10
10
|
vars: string[];
|
|
11
11
|
};
|
|
12
12
|
|
|
13
|
+
// TODO(ian): Remove this and replace with EvaluateResult
|
|
13
14
|
export type EvalRowOutput = {
|
|
14
15
|
pass: boolean;
|
|
15
16
|
score: number;
|
|
@@ -17,6 +18,7 @@ export type EvalRowOutput = {
|
|
|
17
18
|
prompt: string;
|
|
18
19
|
latencyMs: number;
|
|
19
20
|
tokenUsage?: Partial<TokenUsage>;
|
|
21
|
+
gradingResult: EvaluateResult['gradingResult'];
|
|
20
22
|
};
|
|
21
23
|
|
|
22
24
|
export type EvalRow = {
|