promptfoo 0.14.2 → 0.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/package.json +2 -2
- package/dist/src/assertions.d.ts.map +1 -1
- package/dist/src/assertions.js +63 -10
- package/dist/src/assertions.js.map +1 -1
- package/dist/src/evaluator.d.ts.map +1 -1
- package/dist/src/evaluator.js +13 -5
- package/dist/src/evaluator.js.map +1 -1
- package/dist/src/main.js +1 -0
- package/dist/src/main.js.map +1 -1
- package/dist/src/prompts.d.ts +8 -0
- package/dist/src/prompts.d.ts.map +1 -1
- package/dist/src/prompts.js +14 -6
- package/dist/src/prompts.js.map +1 -1
- package/dist/src/table.d.ts.map +1 -1
- package/dist/src/table.js +12 -12
- package/dist/src/table.js.map +1 -1
- package/dist/src/types.d.ts +9 -1
- package/dist/src/types.d.ts.map +1 -1
- package/dist/src/util.d.ts.map +1 -1
- package/dist/src/util.js +3 -2
- package/dist/src/util.js.map +1 -1
- package/dist/src/web/client/assets/{index-820d9559.js → index-9d27a707.js} +25 -25
- package/dist/src/web/client/assets/{index-87905193.css → index-c3faa651.css} +1 -1
- package/dist/src/web/client/index.html +2 -2
- package/package.json +2 -2
- package/src/assertions.ts +64 -12
- package/src/evaluator.ts +13 -5
- package/src/main.ts +1 -0
- package/src/prompts.ts +15 -5
- package/src/table.ts +14 -12
- package/src/types.ts +12 -1
- package/src/util.ts +14 -3
- package/src/web/client/src/ResultsTable.css +4 -0
- package/src/web/client/src/ResultsTable.tsx +57 -29
- package/src/web/client/src/types.ts +7 -1
- package/src/web/client/package-lock.json +0 -5726
|
@@ -16,10 +16,37 @@ import { useStore } from './store.js';
|
|
|
16
16
|
|
|
17
17
|
import type { CellContext, VisibilityState } from '@tanstack/table-core';
|
|
18
18
|
|
|
19
|
-
import type { EvalRow, FilterMode } from './types.js';
|
|
19
|
+
import type { EvalRow, EvalRowOutput, FilterMode } from './types.js';
|
|
20
20
|
|
|
21
21
|
import './ResultsTable.css';
|
|
22
22
|
|
|
23
|
+
/**
 * Normalizes one output cell into the structured `{ text, pass, score }` shape.
 *
 * Structured outputs are returned unchanged (same object reference). Plain
 * strings are the pre-0.15.0 format, in which the verdict was encoded as a
 * leading `[PASS]` / `[FAIL]` tag followed by a single space; they are
 * converted here for backwards compatibility. Remove the string branch
 * eventually.
 *
 * @param output - A structured output, or a legacy tagged string.
 * @returns The structured output. For legacy strings: `text` is the string
 *   without the leading tag and its separator space, `pass` is true only when
 *   the tag is `[PASS]` (untagged strings count as not passing), and `score`
 *   is 1 for a pass and 0 otherwise.
 */
function formatRowOutput(output: EvalRowOutput | string) {
  if (typeof output !== 'string') {
    return output;
  }

  // Backwards compatibility for 0.15.0 breaking change. Remove eventually.
  // Legacy strings look like "[PASS] text" / "[FAIL] text"; strip the tag and
  // the one separator space so the text does not start with stray whitespace.
  const legacyTag = /^\[(PASS|FAIL)\] ?/.exec(output);
  const pass = legacyTag !== null && legacyTag[1] === 'PASS';
  const text = legacyTag !== null ? output.slice(legacyTag[0].length) : output;

  return {
    text,
    pass,
    score: pass ? 1 : 0,
  };
}
|
|
41
|
+
|
|
42
|
+
/**
 * Formats a score as a parenthesised suffix for the PASS/FAIL status label.
 *
 * @param score - Numeric score of an output.
 * @returns An empty string for the boolean-like scores 0 and 1, and for
 *   non-finite or missing values (which would otherwise render as "(NaN)" or
 *   throw); otherwise the score with two decimals, e.g. `(0.50)`.
 */
function scoreToString(score: number) {
  // Don't show boolean scores, and never render NaN/Infinity/undefined.
  if (score === 0 || score === 1 || !Number.isFinite(score)) {
    return '';
  }
  return `(${score.toFixed(2)})`;
}
|
|
49
|
+
|
|
23
50
|
interface TruncatedTextProps {
|
|
24
51
|
text: string | number;
|
|
25
52
|
maxLength: number;
|
|
@@ -60,30 +87,26 @@ function TruncatedText({ text: rawText, maxLength }: TruncatedTextProps) {
|
|
|
60
87
|
}
|
|
61
88
|
|
|
62
89
|
/** Props accepted by the `PromptOutput` table cell component. */
interface PromptOutputProps {
  /** Structured output to render: its text plus pass/fail status and score. */
  output: EvalRowOutput;
  /** Forwarded to `TruncatedText` as its `maxLength`. */
  maxTextLength: number;
  /** Index of the table row this cell belongs to. */
  rowIndex: number;
  /** Index of the prompt column this cell belongs to. */
  promptIndex: number;
  /** Callback for recording a pass/fail rating for the given row and prompt. */
  onRating: (rowIndex: number, promptIndex: number, isPass: boolean) => void;
}
|
|
69
96
|
|
|
70
|
-
function PromptOutput({
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
97
|
+
function PromptOutput({
|
|
98
|
+
output,
|
|
99
|
+
maxTextLength,
|
|
100
|
+
rowIndex,
|
|
101
|
+
promptIndex,
|
|
102
|
+
onRating,
|
|
103
|
+
}: PromptOutputProps) {
|
|
104
|
+
let text = String(output.text);
|
|
74
105
|
let chunks: string[] = [];
|
|
75
|
-
if (
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
chunks = text.split('---');
|
|
80
|
-
text = chunks.slice(1).join('---');
|
|
81
|
-
} else {
|
|
82
|
-
chunks = ['[FAIL]'];
|
|
83
|
-
if (text.startsWith('[FAIL] ')) {
|
|
84
|
-
text = text.substring(7);
|
|
85
|
-
}
|
|
86
|
-
}
|
|
106
|
+
if (!output.pass && text.includes('---')) {
|
|
107
|
+
// TODO(ian): Plumb through failure message instead of parsing it out.
|
|
108
|
+
chunks = text.split('---');
|
|
109
|
+
text = chunks.slice(1).join('---');
|
|
87
110
|
}
|
|
88
111
|
|
|
89
112
|
const handleClick = (isPass: boolean) => {
|
|
@@ -93,8 +116,16 @@ function PromptOutput({ text: rawText, maxTextLength, rowIndex, promptIndex, onR
|
|
|
93
116
|
return (
|
|
94
117
|
<>
|
|
95
118
|
<div className="cell">
|
|
96
|
-
{
|
|
97
|
-
|
|
119
|
+
{output.pass && (
|
|
120
|
+
<div className="status pass">
|
|
121
|
+
PASS <span className="score">{scoreToString(output.score)}</span>
|
|
122
|
+
</div>
|
|
123
|
+
)}
|
|
124
|
+
{!output.pass && (
|
|
125
|
+
<div className="status fail">
|
|
126
|
+
[FAIL<span className="score">{scoreToString(output.score)}</span>] {chunks[0]}
|
|
127
|
+
</div>
|
|
128
|
+
)}{' '}
|
|
98
129
|
<TruncatedText text={text} maxLength={maxTextLength} />
|
|
99
130
|
</div>
|
|
100
131
|
<div className="cell-rating">
|
|
@@ -138,10 +169,9 @@ export default function ResultsTable({
|
|
|
138
169
|
const { table, setTable } = useStore();
|
|
139
170
|
invariant(table, 'Table should be defined');
|
|
140
171
|
const { head, body } = table;
|
|
141
|
-
// TODO(ian): Correctly plumb through the results instead of parsing the string.
|
|
142
172
|
const numGood = head.prompts.map((_, idx) =>
|
|
143
173
|
body.reduce((acc, row) => {
|
|
144
|
-
return acc + (row.outputs[idx].
|
|
174
|
+
return acc + (row.outputs[idx].pass ? 1 : 0);
|
|
145
175
|
}, 0),
|
|
146
176
|
);
|
|
147
177
|
|
|
@@ -149,10 +179,8 @@ export default function ResultsTable({
|
|
|
149
179
|
const updatedData = [...body];
|
|
150
180
|
const updatedRow = { ...updatedData[rowIndex] };
|
|
151
181
|
const updatedOutputs = [...updatedRow.outputs];
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
: `[FAIL] ${updatedOutputs[promptIndex].replace(/^\[(PASS|FAIL)\] /, '')}`;
|
|
155
|
-
updatedOutputs[promptIndex] = updatedOutput;
|
|
182
|
+
updatedOutputs[promptIndex].pass = isPass;
|
|
183
|
+
updatedOutputs[promptIndex].score = isPass ? 1 : 0;
|
|
156
184
|
updatedRow.outputs = updatedOutputs;
|
|
157
185
|
updatedData[rowIndex] = updatedRow;
|
|
158
186
|
setTable({
|
|
@@ -192,7 +220,7 @@ export default function ResultsTable({
|
|
|
192
220
|
id: 'prompts',
|
|
193
221
|
header: () => <span>Outputs</span>,
|
|
194
222
|
columns: head.prompts.map((prompt, idx) =>
|
|
195
|
-
columnHelper.accessor((row: EvalRow) => row.outputs[idx], {
|
|
223
|
+
columnHelper.accessor((row: EvalRow) => formatRowOutput(row.outputs[idx]), {
|
|
196
224
|
id: `Prompt ${idx + 1}`,
|
|
197
225
|
header: () => {
|
|
198
226
|
const pct = ((numGood[idx] / body.length) * 100.0).toFixed(2);
|
|
@@ -231,7 +259,7 @@ export default function ResultsTable({
|
|
|
231
259
|
},
|
|
232
260
|
cell: (info: CellContext<EvalRow, string>) => (
|
|
233
261
|
<PromptOutput
|
|
234
|
-
|
|
262
|
+
output={info.getValue() as unknown as EvalRowOutput}
|
|
235
263
|
maxTextLength={maxTextLength}
|
|
236
264
|
rowIndex={info.row.index}
|
|
237
265
|
promptIndex={idx}
|
|
@@ -251,7 +279,7 @@ export default function ResultsTable({
|
|
|
251
279
|
return body.filter((row) => {
|
|
252
280
|
return row.outputs.some((output, idx) => {
|
|
253
281
|
const columnId = `Prompt ${idx + 1}`;
|
|
254
|
-
const isFail = output.
|
|
282
|
+
const isFail = !output.pass;
|
|
255
283
|
return failureFilter[columnId] && isFail;
|
|
256
284
|
});
|
|
257
285
|
});
|
|
@@ -3,8 +3,14 @@ export type EvalHead = {
|
|
|
3
3
|
vars: string[];
|
|
4
4
|
};
|
|
5
5
|
|
|
6
|
+
/**
 * Structured result for a single output cell of the results table.
 * Replaces the pre-0.15.0 representation, a plain string carrying a leading
 * `[PASS]` / `[FAIL]` tag.
 */
export type EvalRowOutput = {
  /** Whether this output counts as passing. */
  pass: boolean;
  /** Numeric score; the UI sets it to 1 or 0 when an output is rated manually. */
  score: number;
  /** Text shown in the cell. */
  text: string;
};
|
|
11
|
+
|
|
6
12
|
/** One row of the results table. */
export type EvalRow = {
  /** Model outputs, indexed by prompt column. */
  outputs: EvalRowOutput[];
  // NOTE(review): presumably the values for the variables named in
  // `EvalHead.vars` — confirm against the producer of this data.
  vars: string[];
};
|
|
10
16
|
|