promptfoo 0.18.1 → 0.18.3

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (60)
  1. package/dist/package.json +1 -1
  2. package/dist/src/assertions.d.ts +2 -2
  3. package/dist/src/assertions.d.ts.map +1 -1
  4. package/dist/src/assertions.js +42 -11
  5. package/dist/src/assertions.js.map +1 -1
  6. package/dist/src/cache.d.ts +1 -1
  7. package/dist/src/cache.d.ts.map +1 -1
  8. package/dist/src/cache.js +4 -4
  9. package/dist/src/cache.js.map +1 -1
  10. package/dist/src/evaluator.d.ts.map +1 -1
  11. package/dist/src/evaluator.js +5 -2
  12. package/dist/src/evaluator.js.map +1 -1
  13. package/dist/src/main.js +4 -4
  14. package/dist/src/main.js.map +1 -1
  15. package/dist/src/providers/azureopenai.d.ts +2 -2
  16. package/dist/src/providers/azureopenai.d.ts.map +1 -1
  17. package/dist/src/providers/azureopenai.js +7 -5
  18. package/dist/src/providers/azureopenai.js.map +1 -1
  19. package/dist/src/providers/llama.js +1 -1
  20. package/dist/src/providers/llama.js.map +1 -1
  21. package/dist/src/providers/localai.js +2 -2
  22. package/dist/src/providers/localai.js.map +1 -1
  23. package/dist/src/providers/ollama.d.ts +9 -0
  24. package/dist/src/providers/ollama.d.ts.map +1 -0
  25. package/dist/src/providers/ollama.js +66 -0
  26. package/dist/src/providers/ollama.js.map +1 -0
  27. package/dist/src/providers/openai.d.ts +2 -2
  28. package/dist/src/providers/openai.d.ts.map +1 -1
  29. package/dist/src/providers/openai.js +7 -5
  30. package/dist/src/providers/openai.js.map +1 -1
  31. package/dist/src/providers.d.ts.map +1 -1
  32. package/dist/src/providers.js +11 -5
  33. package/dist/src/providers.js.map +1 -1
  34. package/dist/src/types.d.ts +6 -2
  35. package/dist/src/types.d.ts.map +1 -1
  36. package/dist/src/util.d.ts +2 -0
  37. package/dist/src/util.d.ts.map +1 -1
  38. package/dist/src/util.js +24 -12
  39. package/dist/src/util.js.map +1 -1
  40. package/dist/src/web/client/assets/index-6d2a3573.js +200 -0
  41. package/dist/src/web/client/index.html +1 -1
  42. package/package.json +1 -1
  43. package/src/assertions.ts +45 -11
  44. package/src/cache.ts +3 -2
  45. package/src/evaluator.ts +5 -1
  46. package/src/main.ts +4 -4
  47. package/src/providers/azureopenai.ts +18 -6
  48. package/src/providers/llama.ts +2 -2
  49. package/src/providers/localai.ts +3 -3
  50. package/src/providers/ollama.ts +88 -0
  51. package/src/providers/openai.ts +8 -6
  52. package/src/providers.ts +20 -5
  53. package/src/types.ts +6 -2
  54. package/src/util.ts +25 -17
  55. package/src/web/client/package-lock.json +5726 -0
  56. package/src/web/client/src/EvalOutputPromptDialog.tsx +78 -16
  57. package/src/web/client/src/ResultsTable.tsx +32 -9
  58. package/src/web/client/src/ResultsView.tsx +1 -1
  59. package/src/web/client/src/types.ts +3 -1
  60. package/dist/src/web/client/assets/index-8388d689.js +0 -199
@@ -1,4 +1,5 @@
1
1
  import { useState, useEffect } from 'react';
2
+ import Box from '@mui/material/Box';
2
3
  import Button from '@mui/material/Button';
3
4
  import Dialog from '@mui/material/Dialog';
4
5
  import DialogActions from '@mui/material/DialogActions';
@@ -8,12 +9,60 @@ import TextareaAutosize from '@mui/base/TextareaAutosize';
8
9
  import IconButton from '@mui/material/IconButton';
9
10
  import ContentCopyIcon from '@mui/icons-material/ContentCopy';
10
11
  import CheckIcon from '@mui/icons-material/Check';
12
+ import Table from '@mui/material/Table';
13
+ import TableBody from '@mui/material/TableBody';
14
+ import TableCell from '@mui/material/TableCell';
15
+ import TableContainer from '@mui/material/TableContainer';
16
+ import TableHead from '@mui/material/TableHead';
17
+ import TableRow from '@mui/material/TableRow';
18
+ import Typography from '@mui/material/Typography';
19
+
20
+ import type { GradingResult } from '../../../types';
11
21
 
12
22
  interface EvalOutputPromptDialogProps {
13
23
  open: boolean;
14
24
  onClose: () => void;
15
25
  prompt: string;
16
26
  output?: string;
27
+ gradingResults?: GradingResult[];
28
+ }
29
+
30
+ function AssertionResults({ gradingResults }: { gradingResults?: GradingResult[] }) {
31
+ if (!gradingResults) {
32
+ return null;
33
+ }
34
+
35
+ return (
36
+ <Box mt={2}>
37
+ <Typography variant="subtitle1">Assertions</Typography>
38
+ <TableContainer>
39
+ <Table>
40
+ <TableHead>
41
+ <TableRow>
42
+ <TableCell style={{ fontWeight: 'bold' }}>Pass</TableCell>
43
+ <TableCell style={{ fontWeight: 'bold' }}>Score</TableCell>
44
+ <TableCell style={{ fontWeight: 'bold' }}>Type</TableCell>
45
+ <TableCell style={{ fontWeight: 'bold' }}>Value</TableCell>
46
+ <TableCell style={{ fontWeight: 'bold' }}>Reason</TableCell>
47
+ </TableRow>
48
+ </TableHead>
49
+ <TableBody>
50
+ {gradingResults.map((result, i) => (
51
+ <TableRow key={i}>
52
+ <TableCell>{result.pass ? '✅' : '❌'}</TableCell>
53
+ <TableCell>{result.score}</TableCell>
54
+ <TableCell>{result.assertion?.type || ''}</TableCell>
55
+ <TableCell>
56
+ {result.assertion?.value ? String(result.assertion.value) : '-'}
57
+ </TableCell>
58
+ <TableCell>{result.reason}</TableCell>
59
+ </TableRow>
60
+ ))}
61
+ </TableBody>
62
+ </Table>
63
+ </TableContainer>
64
+ </Box>
65
+ );
17
66
  }
18
67
 
19
68
  export default function EvalOutputPromptDialog({
@@ -21,6 +70,7 @@ export default function EvalOutputPromptDialog({
21
70
  onClose,
22
71
  prompt,
23
72
  output,
73
+ gradingResults,
24
74
  }: EvalOutputPromptDialogProps) {
25
75
  const [copied, setCopied] = useState(false);
26
76
 
@@ -35,28 +85,40 @@ export default function EvalOutputPromptDialog({
35
85
 
36
86
  return (
37
87
  <Dialog open={open} onClose={onClose} fullWidth maxWidth="lg">
38
- <DialogTitle>Prompt</DialogTitle>
88
+ <DialogTitle>Details</DialogTitle>
39
89
  <DialogContent>
40
- <TextareaAutosize readOnly value={prompt} style={{ width: '100%', padding: '0.75rem' }} />
41
- <IconButton
42
- onClick={() => copyToClipboard(prompt)}
43
- style={{ position: 'absolute', right: '10px', top: '10px' }}
44
- >
45
- {copied ? <CheckIcon /> : <ContentCopyIcon />}
46
- </IconButton>
47
- </DialogContent>
48
- {output && (
49
- <>
50
- <DialogTitle>Output</DialogTitle>
51
- <DialogContent>
90
+ <Box mb={2}>
91
+ <Typography variant="subtitle1" style={{ marginBottom: '1rem' }}>
92
+ Prompt
93
+ </Typography>
94
+ <TextareaAutosize
95
+ readOnly
96
+ value={prompt}
97
+ style={{ width: '100%', padding: '0.75rem' }}
98
+ maxRows={20}
99
+ />
100
+ <IconButton
101
+ onClick={() => copyToClipboard(prompt)}
102
+ style={{ position: 'absolute', right: '10px', top: '10px' }}
103
+ >
104
+ {copied ? <CheckIcon /> : <ContentCopyIcon />}
105
+ </IconButton>
106
+ </Box>
107
+ {output && (
108
+ <Box my={2}>
109
+ <Typography variant="subtitle1" style={{ marginBottom: '1rem', marginTop: '1rem' }}>
110
+ Output
111
+ </Typography>
52
112
  <TextareaAutosize
53
113
  readOnly
114
+ maxRows={20}
54
115
  value={output}
55
116
  style={{ width: '100%', padding: '0.75rem' }}
56
117
  />
57
- </DialogContent>
58
- </>
59
- )}
118
+ </Box>
119
+ )}
120
+ <AssertionResults gradingResults={gradingResults} />
121
+ </DialogContent>
60
122
  <DialogActions>
61
123
  <Button onClick={onClose}>Close</Button>
62
124
  </DialogActions>
@@ -155,6 +155,7 @@ function EvalOutputCell({
155
155
  open={openPrompt}
156
156
  onClose={handlePromptClose}
157
157
  prompt={output.prompt}
158
+ gradingResults={output.gradingResult?.componentResults}
158
159
  output={text}
159
160
  />
160
161
  </>
@@ -223,12 +224,25 @@ export default function ResultsTable({
223
224
  const { table, setTable } = useStore();
224
225
  invariant(table, 'Table should be defined');
225
226
  const { head, body } = table;
226
- const numGood = head.prompts.map((_, idx) =>
227
+ const numGoodTests = head.prompts.map((_, idx) =>
227
228
  body.reduce((acc, row) => {
228
229
  return acc + (row.outputs[idx].pass ? 1 : 0);
229
230
  }, 0),
230
231
  );
231
232
 
233
+ const numAsserts = head.prompts.map((_, idx) =>
234
+ body.reduce((acc, row) => {
235
+ return acc + (row.outputs[idx].gradingResult?.componentResults?.length || 0);
236
+ }, 0),
237
+ );
238
+
239
+ const numGoodAsserts = head.prompts.map((_, idx) =>
240
+ body.reduce((acc, row) => {
241
+ const componentResults = row.outputs[idx].gradingResult?.componentResults;
242
+ return acc + (componentResults ? componentResults.filter((r) => r.pass).length : 0);
243
+ }, 0),
244
+ );
245
+
232
246
  const handleRating = (rowIndex: number, promptIndex: number, isPass: boolean) => {
233
247
  const updatedData = [...body];
234
248
  const updatedRow = { ...updatedData[rowIndex] };
@@ -243,10 +257,13 @@ export default function ResultsTable({
243
257
  });
244
258
  };
245
259
 
246
- const highestPassingIndex = numGood.reduce((maxIndex, currentPassCount, currentIndex, array) => {
247
- return currentPassCount > array[maxIndex] ? currentIndex : maxIndex;
248
- }, 0);
249
- const highestPassingCount = numGood[highestPassingIndex];
260
+ const highestPassingIndex = numGoodTests.reduce(
261
+ (maxIndex, currentPassCount, currentIndex, array) => {
262
+ return currentPassCount > array[maxIndex] ? currentIndex : maxIndex;
263
+ },
264
+ 0,
265
+ );
266
+ const highestPassingCount = numGoodTests[highestPassingIndex];
250
267
  const columnHelper = createColumnHelper<EvalRow>();
251
268
  const columns = [
252
269
  columnHelper.group({
@@ -282,9 +299,9 @@ export default function ResultsTable({
282
299
  columnHelper.accessor((row: EvalRow) => formatRowOutput(row.outputs[idx]), {
283
300
  id: `Prompt ${idx + 1}`,
284
301
  header: () => {
285
- const pct = ((numGood[idx] / body.length) * 100.0).toFixed(2);
302
+ const pct = ((numGoodTests[idx] / body.length) * 100.0).toFixed(2);
286
303
  const isHighestPassing =
287
- numGood[idx] === highestPassingCount && highestPassingCount !== 0;
304
+ numGoodTests[idx] === highestPassingCount && highestPassingCount !== 0;
288
305
  const columnId = `Prompt ${idx + 1}`;
289
306
  const isChecked = failureFilter[columnId] || false;
290
307
  // TODO(ian): prompt string support for backwards compatibility, remove after 0.17.0
@@ -313,12 +330,18 @@ export default function ResultsTable({
313
330
  />
314
331
  )}
315
332
  <div className={`summary ${isHighestPassing ? 'highlight' : ''}`}>
316
- Passing: <strong>{pct}%</strong> ({numGood[idx]} / {body.length})
333
+ Passing: <strong>{pct}%</strong> ({numGoodTests[idx]}/{body.length} cases
334
+ {numAsserts[idx] ? (
335
+ <span>
336
+ , {numGoodAsserts[idx]}/{numAsserts[idx]} asserts
337
+ </span>
338
+ ) : null}
339
+ )
317
340
  </div>
318
341
  </>
319
342
  );
320
343
  },
321
- cell: (info: CellContext<EvalRow, string>) => (
344
+ cell: (info: CellContext<EvalRow, EvalRowOutput>) => (
322
345
  <EvalOutputCell
323
346
  output={info.getValue() as unknown as EvalRowOutput}
324
347
  maxTextLength={maxTextLength}
@@ -86,7 +86,7 @@ export default function ResultsView({ recentFiles, onRecentFileSelected }: Resul
86
86
  setFailureFilter(newFailureFilter);
87
87
  };
88
88
 
89
- const [wordBreak, setWordBreak] = React.useState<'break-word' | 'break-all'>('break-all');
89
+ const [wordBreak, setWordBreak] = React.useState<'break-word' | 'break-all'>('break-word');
90
90
  const handleWordBreakChange = (event: React.ChangeEvent<HTMLInputElement>) => {
91
91
  setWordBreak(event.target.checked ? 'break-all' : 'break-word');
92
92
  };
@@ -1,4 +1,4 @@
1
- import { TokenUsage } from '../../../types';
1
+ import { EvaluateResult, TokenUsage } from '../../../types';
2
2
 
3
3
  type Prompt = {
4
4
  display: string;
@@ -10,6 +10,7 @@ export type EvalHead = {
10
10
  vars: string[];
11
11
  };
12
12
 
13
+ // TODO(ian): Remove this and replace with EvaluateResult
13
14
  export type EvalRowOutput = {
14
15
  pass: boolean;
15
16
  score: number;
@@ -17,6 +18,7 @@ export type EvalRowOutput = {
17
18
  prompt: string;
18
19
  latencyMs: number;
19
20
  tokenUsage?: Partial<TokenUsage>;
21
+ gradingResult: EvaluateResult['gradingResult'];
20
22
  };
21
23
 
22
24
  export type EvalRow = {