promptfoo 0.14.2 → 0.15.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -16,10 +16,37 @@ import { useStore } from './store.js';
16
16
 
17
17
  import type { CellContext, VisibilityState } from '@tanstack/table-core';
18
18
 
19
- import type { EvalRow, FilterMode } from './types.js';
19
+ import type { EvalRow, EvalRowOutput, FilterMode } from './types.js';
20
20
 
21
21
  import './ResultsTable.css';
22
22
 
23
/**
 * Normalizes a table cell's output into the structured `EvalRowOutput` shape.
 *
 * Structured outputs are returned unchanged (same object reference). Plain
 * strings are the pre-0.15.0 format, in which the verdict was encoded as a
 * `[PASS] ` / `[FAIL] ` prefix on the text; they are parsed here for backwards
 * compatibility with the 0.15.0 breaking change. Remove eventually.
 *
 * @param output - A structured output, or a legacy prefixed string.
 * @returns The structured output. For a legacy string: `pass` is true only when
 *   the string starts with `[PASS]`, `score` is 1 for a pass and 0 otherwise,
 *   and `text` is the string without the verdict marker and its separating space.
 */
function formatRowOutput(output: EvalRowOutput | string): EvalRowOutput {
  if (typeof output !== 'string') {
    return output;
  }

  const marker = ['[PASS]', '[FAIL]'].find((candidate) => output.startsWith(candidate));
  const pass = marker === '[PASS]';

  let text = output;
  if (marker !== undefined) {
    text = output.slice(marker.length);
    // Legacy strings were written as "[PASS] text": drop the single separating
    // space as well, so the displayed text does not start with stray whitespace.
    if (text.startsWith(' ')) {
      text = text.slice(1);
    }
  }

  return {
    text,
    pass,
    score: pass ? 1 : 0,
  };
}
41
+
42
/**
 * Formats a score for display next to the PASS/FAIL status label.
 *
 * @param score - Score of a single output.
 * @returns An empty string when the score is exactly 0 or 1 (boolean-style
 *   scores add nothing beyond the status label) or when it is not a finite
 *   number; otherwise the score rounded to two decimals in parentheses,
 *   e.g. `(0.75)`.
 */
function scoreToString(score: number): string {
  // Don't show boolean scores.
  if (score === 0 || score === 1) {
    return '';
  }
  // A missing or non-finite score would otherwise throw on `toFixed` or
  // render as "(NaN)"; show nothing instead. `Number.isFinite` is false for
  // non-numbers as well as NaN/±Infinity.
  if (!Number.isFinite(score)) {
    return '';
  }
  return `(${score.toFixed(2)})`;
}
49
+
23
50
  interface TruncatedTextProps {
24
51
  text: string | number;
25
52
  maxLength: number;
@@ -60,30 +87,26 @@ function TruncatedText({ text: rawText, maxLength }: TruncatedTextProps) {
60
87
  }
61
88
 
62
89
  interface PromptOutputProps {
63
- text: string | number;
90
+ output: EvalRowOutput;
64
91
  maxTextLength: number;
65
92
  rowIndex: number;
66
93
  promptIndex: number;
67
94
  onRating: (rowIndex: number, promptIndex: number, isPass: boolean) => void;
68
95
  }
69
96
 
70
- function PromptOutput({ text: rawText, maxTextLength, rowIndex, promptIndex, onRating }: PromptOutputProps) {
71
- let text = String(rawText);
72
- const isPass = text.startsWith('[PASS] ');
73
- const isFail = text.startsWith('[FAIL] ');
97
+ function PromptOutput({
98
+ output,
99
+ maxTextLength,
100
+ rowIndex,
101
+ promptIndex,
102
+ onRating,
103
+ }: PromptOutputProps) {
104
+ let text = String(output.text);
74
105
  let chunks: string[] = [];
75
- if (isPass) {
76
- text = text.substring(7);
77
- } else if (isFail) {
78
- if (text.includes('---')) {
79
- chunks = text.split('---');
80
- text = chunks.slice(1).join('---');
81
- } else {
82
- chunks = ['[FAIL]'];
83
- if (text.startsWith('[FAIL] ')) {
84
- text = text.substring(7);
85
- }
86
- }
106
+ if (!output.pass && text.includes('---')) {
107
+ // TODO(ian): Plumb through failure message instead of parsing it out.
108
+ chunks = text.split('---');
109
+ text = chunks.slice(1).join('---');
87
110
  }
88
111
 
89
112
  const handleClick = (isPass: boolean) => {
@@ -93,8 +116,16 @@ function PromptOutput({ text: rawText, maxTextLength, rowIndex, promptIndex, onR
93
116
  return (
94
117
  <>
95
118
  <div className="cell">
96
- {isPass && <div className="status pass">[PASS]</div>}
97
- {isFail && <div className="status fail">{chunks[0]}</div>}{' '}
119
+ {output.pass && (
120
+ <div className="status pass">
121
+ PASS <span className="score">{scoreToString(output.score)}</span>
122
+ </div>
123
+ )}
124
+ {!output.pass && (
125
+ <div className="status fail">
126
+ [FAIL<span className="score">{scoreToString(output.score)}</span>] {chunks[0]}
127
+ </div>
128
+ )}{' '}
98
129
  <TruncatedText text={text} maxLength={maxTextLength} />
99
130
  </div>
100
131
  <div className="cell-rating">
@@ -138,10 +169,9 @@ export default function ResultsTable({
138
169
  const { table, setTable } = useStore();
139
170
  invariant(table, 'Table should be defined');
140
171
  const { head, body } = table;
141
- // TODO(ian): Correctly plumb through the results instead of parsing the string.
142
172
  const numGood = head.prompts.map((_, idx) =>
143
173
  body.reduce((acc, row) => {
144
- return acc + (row.outputs[idx].startsWith('[PASS]') ? 1 : 0);
174
+ return acc + (row.outputs[idx].pass ? 1 : 0);
145
175
  }, 0),
146
176
  );
147
177
 
@@ -149,10 +179,8 @@ export default function ResultsTable({
149
179
  const updatedData = [...body];
150
180
  const updatedRow = { ...updatedData[rowIndex] };
151
181
  const updatedOutputs = [...updatedRow.outputs];
152
- const updatedOutput = isPass
153
- ? `[PASS] ${updatedOutputs[promptIndex].replace(/^\[(PASS|FAIL)\] /, '')}`
154
- : `[FAIL] ${updatedOutputs[promptIndex].replace(/^\[(PASS|FAIL)\] /, '')}`;
155
- updatedOutputs[promptIndex] = updatedOutput;
182
+ updatedOutputs[promptIndex].pass = isPass;
183
+ updatedOutputs[promptIndex].score = isPass ? 1 : 0;
156
184
  updatedRow.outputs = updatedOutputs;
157
185
  updatedData[rowIndex] = updatedRow;
158
186
  setTable({
@@ -192,7 +220,7 @@ export default function ResultsTable({
192
220
  id: 'prompts',
193
221
  header: () => <span>Outputs</span>,
194
222
  columns: head.prompts.map((prompt, idx) =>
195
- columnHelper.accessor((row: EvalRow) => row.outputs[idx], {
223
+ columnHelper.accessor((row: EvalRow) => formatRowOutput(row.outputs[idx]), {
196
224
  id: `Prompt ${idx + 1}`,
197
225
  header: () => {
198
226
  const pct = ((numGood[idx] / body.length) * 100.0).toFixed(2);
@@ -231,7 +259,7 @@ export default function ResultsTable({
231
259
  },
232
260
  cell: (info: CellContext<EvalRow, string>) => (
233
261
  <PromptOutput
234
- text={info.getValue()}
262
+ output={info.getValue() as unknown as EvalRowOutput}
235
263
  maxTextLength={maxTextLength}
236
264
  rowIndex={info.row.index}
237
265
  promptIndex={idx}
@@ -251,7 +279,7 @@ export default function ResultsTable({
251
279
  return body.filter((row) => {
252
280
  return row.outputs.some((output, idx) => {
253
281
  const columnId = `Prompt ${idx + 1}`;
254
- const isFail = output.startsWith('[FAIL] ');
282
+ const isFail = !output.pass;
255
283
  return failureFilter[columnId] && isFail;
256
284
  });
257
285
  });
@@ -3,8 +3,14 @@ export type EvalHead = {
3
3
  vars: string[];
4
4
  };
5
5
 
6
/** Structured result for one prompt's output in a single table row. */
export type EvalRowOutput = {
  /** Whether the output is counted as passing (rendered as PASS vs. FAIL). */
  pass: boolean;
  /** Numeric score; the UI sets it to 1 or 0 when a row is manually rated pass/fail. */
  score: number;
  /** The output text shown in the cell. */
  text: string;
};

/** One row of the results table. */
export type EvalRow = {
  outputs: EvalRowOutput[]; // model outputs, one per prompt column
  vars: string[]; // var inputs
};
  };
10
16