promptfoo 0.14.1 → 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. package/dist/package.json +2 -2
  2. package/dist/src/assertions.d.ts.map +1 -1
  3. package/dist/src/assertions.js +63 -10
  4. package/dist/src/assertions.js.map +1 -1
  5. package/dist/src/evaluator.d.ts.map +1 -1
  6. package/dist/src/evaluator.js +16 -7
  7. package/dist/src/evaluator.js.map +1 -1
  8. package/dist/src/main.js +1 -0
  9. package/dist/src/main.js.map +1 -1
  10. package/dist/src/prompts.d.ts +8 -0
  11. package/dist/src/prompts.d.ts.map +1 -1
  12. package/dist/src/prompts.js +14 -6
  13. package/dist/src/prompts.js.map +1 -1
  14. package/dist/src/providers/openai.d.ts.map +1 -1
  15. package/dist/src/providers/openai.js +1 -1
  16. package/dist/src/providers/openai.js.map +1 -1
  17. package/dist/src/table.d.ts.map +1 -1
  18. package/dist/src/table.js +12 -12
  19. package/dist/src/table.js.map +1 -1
  20. package/dist/src/types.d.ts +9 -1
  21. package/dist/src/types.d.ts.map +1 -1
  22. package/dist/src/util.d.ts.map +1 -1
  23. package/dist/src/util.js +3 -2
  24. package/dist/src/util.js.map +1 -1
  25. package/dist/src/web/client/assets/{index-70e6ca57.js → index-9d27a707.js} +25 -25
  26. package/dist/src/web/client/assets/{index-87905193.css → index-c3faa651.css} +1 -1
  27. package/dist/src/web/client/index.html +2 -2
  28. package/dist/src/web/server.js +1 -1
  29. package/dist/src/web/server.js.map +1 -1
  30. package/package.json +2 -2
  31. package/src/assertions.ts +64 -12
  32. package/src/evaluator.ts +16 -7
  33. package/src/main.ts +1 -0
  34. package/src/prompts.ts +15 -5
  35. package/src/providers/openai.ts +1 -1
  36. package/src/table.ts +14 -12
  37. package/src/types.ts +12 -1
  38. package/src/util.ts +14 -3
  39. package/src/web/client/src/ResultsTable.css +4 -0
  40. package/src/web/client/src/ResultsTable.tsx +60 -30
  41. package/src/web/client/src/types.ts +7 -1
  42. package/src/web/server.ts +1 -1
  43. package/src/web/client/package-lock.json +0 -5726
@@ -16,17 +16,45 @@ import { useStore } from './store.js';
16
16
 
17
17
  import type { CellContext, VisibilityState } from '@tanstack/table-core';
18
18
 
19
- import type { EvalRow, FilterMode } from './types.js';
19
+ import type { EvalRow, EvalRowOutput, FilterMode } from './types.js';
20
20
 
21
21
  import './ResultsTable.css';
22
22
 
23
/**
 * Normalizes a single table cell to the structured `EvalRowOutput` shape.
 *
 * Structured outputs are returned as-is (same object reference). Legacy
 * string outputs, written as `[PASS] <text>` or `[FAIL] <text>`, are converted:
 * the status marker and the one space that separates it from the text are
 * removed, `pass` is true only for a `[PASS]` marker, and `score` is 1 for a
 * pass and 0 otherwise. A string without a marker is kept verbatim and
 * treated as a failure.
 *
 * @param output - Structured output, or a pre-0.15.0 marker-prefixed string.
 * @returns The structured form of `output`.
 */
function formatRowOutput(output: EvalRowOutput | string): EvalRowOutput {
  if (typeof output !== 'string') {
    return output;
  }

  // Backwards compatibility for 0.15.0 breaking change. Remove eventually.
  const pass = output.startsWith('[PASS]');
  // Drop the marker together with its separator space; slicing by the marker
  // length alone would leave every legacy text starting with a stray blank.
  const text = output.replace(/^\[(?:PASS|FAIL)\] ?/, '');

  return {
    text,
    pass,
    score: pass ? 1 : 0,
  };
}
41
+
42
/**
 * Renders a score as a parenthesised label with two decimals, e.g. `(0.75)`.
 *
 * Scores of exactly 0 or 1 yield an empty string, so plain pass/fail results
 * carry no numeric suffix.
 */
function scoreToString(score: number) {
  // Don't show boolean scores.
  const isBooleanScore = score === 0 || score === 1;
  return isBooleanScore ? '' : `(${score.toFixed(2)})`;
}
49
+
23
50
  interface TruncatedTextProps {
24
- text: string;
51
+ text: string | number;
25
52
  maxLength: number;
26
53
  }
27
54
 
28
- function TruncatedText({ text, maxLength }: TruncatedTextProps) {
55
+ function TruncatedText({ text: rawText, maxLength }: TruncatedTextProps) {
29
56
  const [isTruncated, setIsTruncated] = React.useState<boolean>(true);
57
+ const text = String(rawText);
30
58
 
31
59
  const toggleTruncate = () => {
32
60
  setIsTruncated(!isTruncated);
@@ -59,29 +87,26 @@ function TruncatedText({ text, maxLength }: TruncatedTextProps) {
59
87
  }
60
88
 
61
89
  interface PromptOutputProps {
62
- text: string;
90
+ output: EvalRowOutput;
63
91
  maxTextLength: number;
64
92
  rowIndex: number;
65
93
  promptIndex: number;
66
94
  onRating: (rowIndex: number, promptIndex: number, isPass: boolean) => void;
67
95
  }
68
96
 
69
- function PromptOutput({ text, maxTextLength, rowIndex, promptIndex, onRating }: PromptOutputProps) {
70
- const isPass = text.startsWith('[PASS] ');
71
- const isFail = text.startsWith('[FAIL] ');
97
+ function PromptOutput({
98
+ output,
99
+ maxTextLength,
100
+ rowIndex,
101
+ promptIndex,
102
+ onRating,
103
+ }: PromptOutputProps) {
104
+ let text = String(output.text);
72
105
  let chunks: string[] = [];
73
- if (isPass) {
74
- text = text.substring(7);
75
- } else if (isFail) {
76
- if (text.includes('---')) {
77
- chunks = text.split('---');
78
- text = chunks.slice(1).join('---');
79
- } else {
80
- chunks = ['[FAIL]'];
81
- if (text.startsWith('[FAIL] ')) {
82
- text = text.substring(7);
83
- }
84
- }
106
+ if (!output.pass && text.includes('---')) {
107
+ // TODO(ian): Plumb through failure message instead of parsing it out.
108
+ chunks = text.split('---');
109
+ text = chunks.slice(1).join('---');
85
110
  }
86
111
 
87
112
  const handleClick = (isPass: boolean) => {
@@ -91,8 +116,16 @@ function PromptOutput({ text, maxTextLength, rowIndex, promptIndex, onRating }:
91
116
  return (
92
117
  <>
93
118
  <div className="cell">
94
- {isPass && <div className="status pass">[PASS]</div>}
95
- {isFail && <div className="status fail">{chunks[0]}</div>}{' '}
119
+ {output.pass && (
120
+ <div className="status pass">
121
+ PASS <span className="score">{scoreToString(output.score)}</span>
122
+ </div>
123
+ )}
124
+ {!output.pass && (
125
+ <div className="status fail">
126
+ [FAIL<span className="score">{scoreToString(output.score)}</span>] {chunks[0]}
127
+ </div>
128
+ )}{' '}
96
129
  <TruncatedText text={text} maxLength={maxTextLength} />
97
130
  </div>
98
131
  <div className="cell-rating">
@@ -136,10 +169,9 @@ export default function ResultsTable({
136
169
  const { table, setTable } = useStore();
137
170
  invariant(table, 'Table should be defined');
138
171
  const { head, body } = table;
139
- // TODO(ian): Correctly plumb through the results instead of parsing the string.
140
172
  const numGood = head.prompts.map((_, idx) =>
141
173
  body.reduce((acc, row) => {
142
- return acc + (row.outputs[idx].startsWith('[PASS]') ? 1 : 0);
174
+ return acc + (row.outputs[idx].pass ? 1 : 0);
143
175
  }, 0),
144
176
  );
145
177
 
@@ -147,10 +179,8 @@ export default function ResultsTable({
147
179
  const updatedData = [...body];
148
180
  const updatedRow = { ...updatedData[rowIndex] };
149
181
  const updatedOutputs = [...updatedRow.outputs];
150
- const updatedOutput = isPass
151
- ? `[PASS] ${updatedOutputs[promptIndex].replace(/^\[(PASS|FAIL)\] /, '')}`
152
- : `[FAIL] ${updatedOutputs[promptIndex].replace(/^\[(PASS|FAIL)\] /, '')}`;
153
- updatedOutputs[promptIndex] = updatedOutput;
182
+ updatedOutputs[promptIndex].pass = isPass;
183
+ updatedOutputs[promptIndex].score = isPass ? 1 : 0;
154
184
  updatedRow.outputs = updatedOutputs;
155
185
  updatedData[rowIndex] = updatedRow;
156
186
  setTable({
@@ -190,7 +220,7 @@ export default function ResultsTable({
190
220
  id: 'prompts',
191
221
  header: () => <span>Outputs</span>,
192
222
  columns: head.prompts.map((prompt, idx) =>
193
- columnHelper.accessor((row: EvalRow) => row.outputs[idx], {
223
+ columnHelper.accessor((row: EvalRow) => formatRowOutput(row.outputs[idx]), {
194
224
  id: `Prompt ${idx + 1}`,
195
225
  header: () => {
196
226
  const pct = ((numGood[idx] / body.length) * 100.0).toFixed(2);
@@ -229,7 +259,7 @@ export default function ResultsTable({
229
259
  },
230
260
  cell: (info: CellContext<EvalRow, string>) => (
231
261
  <PromptOutput
232
- text={info.getValue()}
262
+ output={info.getValue() as unknown as EvalRowOutput}
233
263
  maxTextLength={maxTextLength}
234
264
  rowIndex={info.row.index}
235
265
  promptIndex={idx}
@@ -249,7 +279,7 @@ export default function ResultsTable({
249
279
  return body.filter((row) => {
250
280
  return row.outputs.some((output, idx) => {
251
281
  const columnId = `Prompt ${idx + 1}`;
252
- const isFail = output.startsWith('[FAIL] ');
282
+ const isFail = !output.pass;
253
283
  return failureFilter[columnId] && isFail;
254
284
  });
255
285
  });
@@ -3,8 +3,14 @@ export type EvalHead = {
3
3
  vars: string[];
4
4
  };
5
5
 
6
+ export type EvalRowOutput = {
7
+ pass: boolean;
8
+ score: number;
9
+ text: string;
10
+ };
11
+
6
12
  export type EvalRow = {
7
- outputs: string[]; // var inputs
13
+ outputs: EvalRowOutput[];
8
14
  vars: string[]; // model outputs
9
15
  };
10
16
 
package/src/web/server.ts CHANGED
@@ -44,7 +44,7 @@ export function init(port = 15500) {
44
44
  latestJsonPath,
45
45
  debounce((event: string) => {
46
46
  if (event === 'change') {
47
- socket.emit('update', readLatestJson);
47
+ socket.emit('update', readLatestJson());
48
48
  }
49
49
  }, 250),
50
50
  );