promptfoo 0.10.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,12 +9,14 @@ import {
9
9
  getCoreRowModel,
10
10
  useReactTable,
11
11
  } from '@tanstack/react-table';
12
+ import Checkbox from '@mui/material/Checkbox';
13
+ import FormControlLabel from '@mui/material/FormControlLabel';
12
14
 
13
15
  import { useStore } from './store.js';
14
16
 
15
17
  import type { CellContext, VisibilityState } from '@tanstack/table-core';
16
18
 
17
- import type { EvalRow } from './types.js';
19
+ import type { EvalRow, FilterMode } from './types.js';
18
20
 
19
21
  import './ResultsTable.css';
20
22
 
@@ -114,12 +116,23 @@ function TableHeader({ text, maxLength, smallText }: TruncatedTextProps & { smal
114
116
  );
115
117
  }
116
118
 
117
- interface ResultsViewProps {
119
/**
 * Props accepted by the ResultsTable component.
 */
interface ResultsTableProps {
  /** Forwarded as `maxLength` to the truncated text cells and column headers. */
  maxTextLength: number;
  /** Column visibility state, typed with @tanstack/table-core's `VisibilityState`. */
  columnVisibility: VisibilityState;
  /** Applied as the `wordBreak` style of the rendered `<table>`. */
  wordBreak: 'break-all' | 'break-word';
  /** Active filter mode; the per-column checkboxes render only when it is 'failures'. */
  filterMode: FilterMode;
  /** Per-column flags keyed by column id (for example "Prompt 1"). */
  failureFilter: Record<string, boolean>;
  /** Invoked with the column id and the new checked state when a column checkbox changes. */
  onFailureFilterToggle: (columnId: string, checked: boolean) => void;
}
121
127
 
122
- export default function ResultsTable({ maxTextLength, columnVisibility }: ResultsViewProps) {
128
+ export default function ResultsTable({
129
+ maxTextLength,
130
+ columnVisibility,
131
+ wordBreak,
132
+ filterMode,
133
+ failureFilter,
134
+ onFailureFilterToggle,
135
+ }: ResultsTableProps) {
123
136
  const { table, setTable } = useStore();
124
137
  invariant(table, 'Table should be defined');
125
138
  const { head, body } = table;
@@ -146,6 +159,10 @@ export default function ResultsTable({ maxTextLength, columnVisibility }: Result
146
159
  });
147
160
  };
148
161
 
162
// Find the column whose entry in numGood is largest; on ties the earliest
// column wins because only a strictly greater count replaces the current best.
// NOTE(review): numGood is defined outside this hunk - presumably one pass
// count per prompt column; confirm.
let highestPassingIndex = 0;
numGood.forEach((passCount, columnIndex) => {
  if (passCount > numGood[highestPassingIndex]) {
    highestPassingIndex = columnIndex;
  }
});
const highestPassingCount = numGood[highestPassingIndex];
149
166
  const columnHelper = createColumnHelper<EvalRow>();
150
167
  const columns = [
151
168
  columnHelper.group({
@@ -164,6 +181,8 @@ export default function ResultsTable({ maxTextLength, columnVisibility }: Result
164
181
  cell: (info: CellContext<EvalRow, string>) => (
165
182
  <TruncatedText text={info.getValue()} maxLength={maxTextLength} />
166
183
  ),
184
+ // Minimize the size of Variable columns.
185
+ size: 50,
167
186
  }),
168
187
  ),
169
188
  }),
@@ -173,16 +192,41 @@ export default function ResultsTable({ maxTextLength, columnVisibility }: Result
173
192
  columns: head.prompts.map((prompt, idx) =>
174
193
  columnHelper.accessor((row: EvalRow) => row.outputs[idx], {
175
194
  id: `Prompt ${idx + 1}`,
176
- header: () => (
177
- <>
178
- <TableHeader
179
- smallText={`Prompt ${idx + 1}`}
180
- text={prompt}
181
- maxLength={maxTextLength}
182
- />
183
- {numGood[idx]} / {body.length} 👍
184
- </>
185
- ),
195
// Header renderer for a prompt column: shows the prompt text, an optional
// "Show failures" checkbox (only in 'failures' filter mode) and a pass-rate
// summary that is highlighted for the best-performing column(s).
header: () => {
  const columnId = `Prompt ${idx + 1}`;
  const passCount = numGood[idx];
  // With zero rows the ratio is 0 / 0 (NaN) and would render as "NaN%";
  // report 0.00 instead.
  const pct = body.length > 0 ? ((passCount / body.length) * 100.0).toFixed(2) : '0.00';
  // A column is only highlighted when it ties the best count and that count is non-zero.
  const isHighestPassing = passCount === highestPassingCount && highestPassingCount !== 0;
  const isChecked = failureFilter[columnId] || false;
  return (
    <>
      <TableHeader smallText={columnId} text={prompt} maxLength={maxTextLength} />
      {filterMode === 'failures' && (
        <FormControlLabel
          sx={{
            '& .MuiFormControlLabel-label': {
              fontSize: '0.75rem',
            },
          }}
          control={
            <Checkbox
              checked={isChecked}
              onChange={(event) => onFailureFilterToggle(columnId, event.target.checked)}
            />
          }
          label="Show failures"
        />
      )}
      <div className={`summary ${isHighestPassing ? 'highlight' : ''}`}>
        Passing: <strong>{pct}%</strong> ({passCount} / {body.length})
      </div>
    </>
  );
},
186
230
  cell: (info: CellContext<EvalRow, string>) => (
187
231
  <PromptOutput
188
232
  text={info.getValue()}
@@ -197,8 +241,24 @@ export default function ResultsTable({ maxTextLength, columnVisibility }: Result
197
241
  }),
198
242
  ];
199
243
 
244
// Rows handed to the table. Outside 'failures' mode, or when no column flag in
// failureFilter is set, every row is kept. Otherwise a row is kept when at
// least one flagged column's output starts with the '[FAIL] ' prefix.
const filteredBody = React.useMemo(() => {
  if (filterMode !== 'failures') {
    return body;
  }
  const anyColumnFlagged = Object.values(failureFilter).some((flag) => flag);
  if (!anyColumnFlagged) {
    return body;
  }
  return body.filter((row) =>
    row.outputs.some((text, columnIndex) => {
      const isFail = text.startsWith('[FAIL] ');
      return failureFilter[`Prompt ${columnIndex + 1}`] && isFail;
    }),
  );
}, [body, failureFilter, filterMode]);
259
+
200
260
  const reactTable = useReactTable({
201
- data: body,
261
+ data: filteredBody,
202
262
  columns,
203
263
  columnResizeMode: 'onChange',
204
264
  getCoreRowModel: getCoreRowModel(),
@@ -209,32 +269,38 @@ export default function ResultsTable({ maxTextLength, columnVisibility }: Result
209
269
  });
210
270
 
211
271
  return (
212
- <table>
272
+ <table
273
+ style={{
274
+ wordBreak,
275
+ }}
276
+ >
213
277
  <thead>
214
278
  {reactTable.getHeaderGroups().map((headerGroup) => (
215
279
  <tr key={headerGroup.id} className="header">
216
- {headerGroup.headers.map((header) => (
217
- <th
218
- {...{
219
- key: header.id,
220
- colSpan: header.colSpan,
221
- style: {
222
- width: header.getSize(),
223
- },
224
- }}
225
- >
226
- {header.isPlaceholder
227
- ? null
228
- : flexRender(header.column.columnDef.header, header.getContext())}
229
- <div
280
+ {headerGroup.headers.map((header) => {
281
+ return (
282
+ <th
230
283
  {...{
231
- onMouseDown: header.getResizeHandler(),
232
- onTouchStart: header.getResizeHandler(),
233
- className: `resizer ${header.column.getIsResizing() ? 'isResizing' : ''}`,
284
+ key: header.id,
285
+ colSpan: header.colSpan,
286
+ style: {
287
+ width: header.getSize(),
288
+ },
234
289
  }}
235
- />
236
- </th>
237
- ))}
290
+ >
291
+ {header.isPlaceholder
292
+ ? null
293
+ : flexRender(header.column.columnDef.header, header.getContext())}
294
+ <div
295
+ {...{
296
+ onMouseDown: header.getResizeHandler(),
297
+ onTouchStart: header.getResizeHandler(),
298
+ className: `resizer ${header.column.getIsResizing() ? 'isResizing' : ''}`,
299
+ }}
300
+ />
301
+ </th>
302
+ );
303
+ })}
238
304
  </tr>
239
305
  ))}
240
306
  </thead>
@@ -2,22 +2,32 @@ import * as React from 'react';
2
2
 
3
3
  import invariant from 'tiny-invariant';
4
4
  import Box from '@mui/material/Box';
5
- import Paper from '@mui/material/Box';
6
- import Stack from '@mui/material/Stack';
7
- import Slider from '@mui/material/Slider';
8
- import Typography from '@mui/material/Typography';
9
- import OutlinedInput from '@mui/material/OutlinedInput';
10
- import InputLabel from '@mui/material/InputLabel';
11
- import MenuItem from '@mui/material/MenuItem';
5
+ import Checkbox from '@mui/material/Checkbox';
12
6
  import FormControl from '@mui/material/FormControl';
7
+ import FormControlLabel from '@mui/material/FormControlLabel';
8
+ import InputLabel from '@mui/material/InputLabel';
13
9
  import ListItemText from '@mui/material/ListItemText';
10
+ import MenuItem from '@mui/material/MenuItem';
11
+ import OutlinedInput from '@mui/material/OutlinedInput';
12
+ import Paper from '@mui/material/Box';
14
13
  import Select, { SelectChangeEvent } from '@mui/material/Select';
15
- import Checkbox from '@mui/material/Checkbox';
14
+ import Slider from '@mui/material/Slider';
15
+ import Stack from '@mui/material/Stack';
16
+ import Tooltip from '@mui/material/Tooltip';
17
+ import Typography from '@mui/material/Typography';
18
+ import { styled } from '@mui/system';
16
19
 
17
20
  import ResultsTable from './ResultsTable.js';
18
21
  import { useStore } from './store.js';
19
22
 
20
23
  import type { VisibilityState } from '@tanstack/table-core';
24
+ import { FilterMode } from './types.js';
25
+
26
/**
 * Stack variant that sets `flexDirection: 'column'` under the style key
 * returned by `theme.breakpoints.down('sm')`.
 */
const ResponsiveStack = styled(Stack)(({ theme }) => {
  const belowSmall = theme.breakpoints.down('sm');
  return {
    [belowSmall]: {
      flexDirection: 'column',
    },
  };
});
21
31
 
22
32
  export default function ResultsView() {
23
33
  const { table } = useStore();
@@ -25,6 +35,29 @@ export default function ResultsView() {
25
35
  const [columnVisibility, setColumnVisibility] = React.useState<VisibilityState>({});
26
36
  const [selectedColumns, setSelectedColumns] = React.useState<string[]>([]);
27
37
 
38
// Per-column "show failures" flags, keyed by column id.
const [failureFilter, setFailureFilter] = React.useState<{ [key: string]: boolean }>({});

/** Sets the flag for one column while keeping every other column's flag. */
const handleFailureFilterToggle = (columnId: string, checked: boolean) => {
  setFailureFilter((current) => {
    return { ...current, [columnId]: checked };
  });
};
42
+
43
// Active filter mode; starts at 'all'.
const [filterMode, setFilterMode] = React.useState<FilterMode>('all');

/**
 * Stores the selected mode and rebuilds the per-column flags: every prompt
 * column is flagged when the mode is 'failures' and unflagged otherwise.
 */
const handleFilterModeChange = (event: SelectChangeEvent<unknown>) => {
  const mode = event.target.value as FilterMode;
  setFilterMode(mode);

  const flagAllColumns = mode === 'failures';
  const newFailureFilter: { [key: string]: boolean } = {};
  for (let promptIndex = 0; promptIndex < head.prompts.length; promptIndex += 1) {
    newFailureFilter[`Prompt ${promptIndex + 1}`] = flagAllColumns;
  }
  setFailureFilter(newFailureFilter);
};
55
+
56
// Word-break mode passed down to the results table; starts at 'break-all'.
const [wordBreak, setWordBreak] = React.useState<'break-word' | 'break-all'>('break-all');

/** Checked selects 'break-all'; unchecked selects 'break-word'. */
const handleWordBreakChange = ({ target }: React.ChangeEvent<HTMLInputElement>) => {
  if (target.checked) {
    setWordBreak('break-all');
  } else {
    setWordBreak('break-word');
  }
};
60
+
28
61
  invariant(table, 'Table data must be loaded before rendering ResultsView');
29
62
  const { head } = table;
30
63
 
@@ -71,7 +104,7 @@ export default function ResultsView() {
71
104
  return (
72
105
  <div>
73
106
  <Paper py="md">
74
- <Stack direction="row" spacing={2} alignItems="center">
107
+ <ResponsiveStack direction="row" spacing={8} alignItems="center">
75
108
  <Box>
76
109
  <FormControl sx={{ m: 1, minWidth: 300 }} size="small">
77
110
  <InputLabel id="visible-columns-label">Visible columns</InputLabel>
@@ -93,6 +126,21 @@ export default function ResultsView() {
93
126
  </Select>
94
127
  </FormControl>
95
128
  </Box>
129
<Box>
  {/* Filter-mode selector. The InputLabel id and the Select labelId must be
      the same string so the select is labelled by the "Filter" label. */}
  <FormControl sx={{ minWidth: 180 }} size="small">
    <InputLabel id="filter-mode-label">Filter</InputLabel>
    <Select
      labelId="filter-mode-label"
      id="filter-mode"
      value={filterMode}
      onChange={handleFilterModeChange}
      label="Filter"
    >
      <MenuItem value="all">Show all results</MenuItem>
      <MenuItem value="failures">Show only failures</MenuItem>
    </Select>
  </FormControl>
</Box>
96
144
  <Box>
97
145
  <Typography mt={2}>Max text length: {maxTextLength}</Typography>
98
146
  <Slider
@@ -102,9 +150,26 @@ export default function ResultsView() {
102
150
  onChange={(_, val: number | number[]) => setMaxTextLength(val as number)}
103
151
  />
104
152
  </Box>
105
- </Stack>
153
+ <Box>
154
+ <Tooltip title="Forcing line breaks makes it easier to adjust column widths to your liking">
155
+ <FormControlLabel
156
+ control={
157
+ <Checkbox checked={wordBreak === 'break-all'} onChange={handleWordBreakChange} />
158
+ }
159
+ label="Force line breaks"
160
+ />
161
+ </Tooltip>
162
+ </Box>
163
+ </ResponsiveStack>
106
164
  </Paper>
107
- <ResultsTable maxTextLength={maxTextLength} columnVisibility={columnVisibility} />
165
+ <ResultsTable
166
+ maxTextLength={maxTextLength}
167
+ columnVisibility={columnVisibility}
168
+ wordBreak={wordBreak}
169
+ filterMode={filterMode}
170
+ failureFilter={failureFilter}
171
+ onFailureFilterToggle={handleFailureFilterToggle}
172
+ />
108
173
  </div>
109
174
  );
110
175
  }
@@ -15,6 +15,9 @@
15
15
  --pass-color: green;
16
16
  --fail-color: #ad0000;
17
17
  --smalltext-color: gray;
18
+ --success-background-color: #d1ffd7;
19
+ --variable-background-color: #f7f7f7;
20
+ --header-background-color: #fffdf7;
18
21
  }
19
22
 
20
23
  /* Dark mode colors */
@@ -38,6 +41,9 @@
38
41
  --pass-color: #4caf50;
39
42
  --fail-color: #f44336;
40
43
  --smalltext-color: #888888;
44
+ --success-background-color: #216d2b;
45
+ --variable-background-color: #333;
46
+ --header-background-color: #333;
41
47
  }
42
48
 
43
49
  html {
@@ -12,3 +12,5 @@ export type EvalTable = {
12
12
  head: EvalHead;
13
13
  body: EvalRow[];
14
14
  };
15
+
16
/** Filter modes for the results view: 'all' or 'failures'. */
export type FilterMode =
  | 'all'
  | 'failures';
@@ -1 +0,0 @@
1
- :root{font-family:system-ui,Avenir,Helvetica,Arial,sans-serif;font-synthesis:none;text-rendering:optimizeLegibility;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale;-webkit-text-size-adjust:100%;--background-color: #ffffff;--text-color: #404040;--border-color: lightgray;--table-border-color: lightgray;--pass-color: green;--fail-color: #ad0000;--smalltext-color: gray}@media (prefers-color-scheme: dark){:root{--background-color: #1a1a1a;--text-color: #f0f0f0;--border-color: #444444;--table-border-color: #444444;--pass-color: #4caf50;--fail-color: #f44336;--smalltext-color: #888888}}[data-theme=dark]{--background-color: #1a1a1a;--text-color: #f0f0f0;--border-color: #444444;--table-border-color: #444444;--pass-color: #4caf50;--fail-color: #f44336;--smalltext-color: #888888}html{font-size:calc(14px + (18 - 14) * ((100vw - 300px) / (1600 - 300)))}*{box-sizing:border-box}html{font-family:-apple-system,BlinkMacSystemFont,Segoe UI,Helvetica,Arial,sans-serif,"Apple Color Emoji","Segoe UI Emoji",Segoe UI Symbol;font-size:16px;background-color:var(--background-color);color:var(--text-color)}table,.divTable{border:1px solid var(--table-border-color);border-collapse:collapse;width:100%;margin:1rem 0;box-shadow:0 2px 4px #0000001a}.tr{display:flex}tr,.tr{width:fit-content}tr:hover,.tr:hover{background-color:#0000000d}th,.th,td,.td{position:relative;box-shadow:inset 0 0 0 1px var(--border-color);word-break:break-all;vertical-align:top;padding:1.5rem}th.variable,.th.variable,td.variable,.td.variable{background-color:#f8fbff}tr.header{background-color:#fffdf7}th,.th{padding:1rem;position:relative;text-align:center;font-weight:semi-bold}tr .cell-rating{visibility:hidden;position:absolute;bottom:1.25rem;right:-1rem;line-height:0;font-size:1.75rem}tr:hover .cell-rating{visibility:visible}tr .cell-rating .rating{cursor:pointer;margin-right:1rem}th .smalltext{visibility:hidden;font-weight:400;font-size:.75rem;color:var(--smalltext-color)}th:hover 
.smalltext{visibility:visible}td .status{margin-bottom:.5rem;font-weight:700}td .pass{color:var(--pass-color)}td .fail{color:var(--fail-color)}.first-prompt-col{border-left:2px solid #888}.first-prompt-row{border-top:2px solid #888}.resizer{position:absolute;right:0;top:0;height:100%;width:5px;cursor:col-resize;user-select:none;touch-action:none;background:var(--text-color);opacity:.5}.resizer.isResizing{background:var(--text-color);opacity:1}@media (hover: hover){.resizer{opacity:0}*:hover>.resizer{opacity:1}}.logo{display:flex;align-items:center;gap:4px}.logo img{width:30px}.logo span{margin-bottom:6px;color:var(--text-color)}[data-theme=dark] .logo img{filter:invert(1)}nav{display:flex;justify-content:space-between;align-items:center;margin-bottom:1rem;color:var(--text-color)}.dark-mode-toggle{background-color:transparent;border:none;color:var(--text-color);cursor:pointer;font-size:16px;padding:8px;transition:color .3s}.dark-mode-toggle:hover{color:var(--pass-color)}body{background-color:var(--background-color);color:var(--text-color)}