promptfoo 0.14.1 → 0.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/package.json +2 -2
- package/dist/src/assertions.d.ts.map +1 -1
- package/dist/src/assertions.js +63 -10
- package/dist/src/assertions.js.map +1 -1
- package/dist/src/evaluator.d.ts.map +1 -1
- package/dist/src/evaluator.js +16 -7
- package/dist/src/evaluator.js.map +1 -1
- package/dist/src/main.js +1 -0
- package/dist/src/main.js.map +1 -1
- package/dist/src/prompts.d.ts +8 -0
- package/dist/src/prompts.d.ts.map +1 -1
- package/dist/src/prompts.js +14 -6
- package/dist/src/prompts.js.map +1 -1
- package/dist/src/providers/openai.d.ts.map +1 -1
- package/dist/src/providers/openai.js +1 -1
- package/dist/src/providers/openai.js.map +1 -1
- package/dist/src/table.d.ts.map +1 -1
- package/dist/src/table.js +12 -12
- package/dist/src/table.js.map +1 -1
- package/dist/src/types.d.ts +9 -1
- package/dist/src/types.d.ts.map +1 -1
- package/dist/src/util.d.ts.map +1 -1
- package/dist/src/util.js +3 -2
- package/dist/src/util.js.map +1 -1
- package/dist/src/web/client/assets/{index-70e6ca57.js → index-9d27a707.js} +25 -25
- package/dist/src/web/client/assets/{index-87905193.css → index-c3faa651.css} +1 -1
- package/dist/src/web/client/index.html +2 -2
- package/dist/src/web/server.js +1 -1
- package/dist/src/web/server.js.map +1 -1
- package/package.json +2 -2
- package/src/assertions.ts +64 -12
- package/src/evaluator.ts +16 -7
- package/src/main.ts +1 -0
- package/src/prompts.ts +15 -5
- package/src/providers/openai.ts +1 -1
- package/src/table.ts +14 -12
- package/src/types.ts +12 -1
- package/src/util.ts +14 -3
- package/src/web/client/src/ResultsTable.css +4 -0
- package/src/web/client/src/ResultsTable.tsx +60 -30
- package/src/web/client/src/types.ts +7 -1
- package/src/web/server.ts +1 -1
- package/src/web/client/package-lock.json +0 -5726
|
@@ -16,17 +16,45 @@ import { useStore } from './store.js';
|
|
|
16
16
|
|
|
17
17
|
import type { CellContext, VisibilityState } from '@tanstack/table-core';
|
|
18
18
|
|
|
19
|
-
import type { EvalRow, FilterMode } from './types.js';
|
|
19
|
+
import type { EvalRow, EvalRowOutput, FilterMode } from './types.js';
|
|
20
20
|
|
|
21
21
|
import './ResultsTable.css';
|
|
22
22
|
|
|
23
|
+
/**
 * Normalizes one table cell value into the structured `EvalRowOutput` shape.
 *
 * Structured outputs are returned as-is (same object reference). Plain strings
 * are the legacy, pre-0.15.0 encoding, in which the result was carried as a
 * `[PASS]` or `[FAIL]` tag prefixed to the output text.
 *
 * @param output - Either an already-structured output or a legacy tagged string.
 * @returns The structured output. For legacy strings, `pass` is true only when
 *   the string starts with `[PASS]`, `score` is 1 for a pass and 0 otherwise,
 *   and `text` is the string with the tag (and its separator space) removed.
 *   A string without either tag is kept whole and treated as a failure.
 */
function formatRowOutput(output: EvalRowOutput | string): EvalRowOutput {
  if (typeof output !== 'string') {
    return output;
  }

  // Backwards compatibility for 0.15.0 breaking change. Remove eventually.
  const pass = output.startsWith('[PASS]');
  let text = output;
  if (pass || output.startsWith('[FAIL]')) {
    // Both tags have the same length, so one slice covers either of them.
    text = output.slice('[PASS]'.length);
    // Legacy strings separate the tag from the body with a single space
    // ("[PASS] body"); drop it so the body does not start with whitespace.
    if (text.startsWith(' ')) {
      text = text.slice(1);
    }
  }

  return {
    text,
    pass,
    score: pass ? 1 : 0,
  };
}
|
|
41
|
+
|
|
42
|
+
/**
 * Formats a score as a parenthesised, two-decimal label such as `(0.50)`.
 *
 * @param score - The numeric score to render.
 * @returns An empty string when the score is exactly 0 or 1, otherwise the
 *   score fixed to two decimals and wrapped in parentheses.
 */
function scoreToString(score: number) {
  // Don't show boolean scores.
  const isBooleanScore = score === 0 || score === 1;
  return isBooleanScore ? '' : `(${score.toFixed(2)})`;
}
|
|
49
|
+
|
|
23
50
|
interface TruncatedTextProps {
|
|
24
|
-
text: string;
|
|
51
|
+
text: string | number;
|
|
25
52
|
maxLength: number;
|
|
26
53
|
}
|
|
27
54
|
|
|
28
|
-
function TruncatedText({ text, maxLength }: TruncatedTextProps) {
|
|
55
|
+
function TruncatedText({ text: rawText, maxLength }: TruncatedTextProps) {
|
|
29
56
|
const [isTruncated, setIsTruncated] = React.useState<boolean>(true);
|
|
57
|
+
const text = String(rawText);
|
|
30
58
|
|
|
31
59
|
const toggleTruncate = () => {
|
|
32
60
|
setIsTruncated(!isTruncated);
|
|
@@ -59,29 +87,26 @@ function TruncatedText({ text, maxLength }: TruncatedTextProps) {
|
|
|
59
87
|
}
|
|
60
88
|
|
|
61
89
|
interface PromptOutputProps {
|
|
62
|
-
|
|
90
|
+
output: EvalRowOutput;
|
|
63
91
|
maxTextLength: number;
|
|
64
92
|
rowIndex: number;
|
|
65
93
|
promptIndex: number;
|
|
66
94
|
onRating: (rowIndex: number, promptIndex: number, isPass: boolean) => void;
|
|
67
95
|
}
|
|
68
96
|
|
|
69
|
-
function PromptOutput({
|
|
70
|
-
|
|
71
|
-
|
|
97
|
+
function PromptOutput({
|
|
98
|
+
output,
|
|
99
|
+
maxTextLength,
|
|
100
|
+
rowIndex,
|
|
101
|
+
promptIndex,
|
|
102
|
+
onRating,
|
|
103
|
+
}: PromptOutputProps) {
|
|
104
|
+
let text = String(output.text);
|
|
72
105
|
let chunks: string[] = [];
|
|
73
|
-
if (
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
chunks = text.split('---');
|
|
78
|
-
text = chunks.slice(1).join('---');
|
|
79
|
-
} else {
|
|
80
|
-
chunks = ['[FAIL]'];
|
|
81
|
-
if (text.startsWith('[FAIL] ')) {
|
|
82
|
-
text = text.substring(7);
|
|
83
|
-
}
|
|
84
|
-
}
|
|
106
|
+
if (!output.pass && text.includes('---')) {
|
|
107
|
+
// TODO(ian): Plumb through failure message instead of parsing it out.
|
|
108
|
+
chunks = text.split('---');
|
|
109
|
+
text = chunks.slice(1).join('---');
|
|
85
110
|
}
|
|
86
111
|
|
|
87
112
|
const handleClick = (isPass: boolean) => {
|
|
@@ -91,8 +116,16 @@ function PromptOutput({ text, maxTextLength, rowIndex, promptIndex, onRating }:
|
|
|
91
116
|
return (
|
|
92
117
|
<>
|
|
93
118
|
<div className="cell">
|
|
94
|
-
{
|
|
95
|
-
|
|
119
|
+
{output.pass && (
|
|
120
|
+
<div className="status pass">
|
|
121
|
+
PASS <span className="score">{scoreToString(output.score)}</span>
|
|
122
|
+
</div>
|
|
123
|
+
)}
|
|
124
|
+
{!output.pass && (
|
|
125
|
+
<div className="status fail">
|
|
126
|
+
[FAIL<span className="score">{scoreToString(output.score)}</span>] {chunks[0]}
|
|
127
|
+
</div>
|
|
128
|
+
)}{' '}
|
|
96
129
|
<TruncatedText text={text} maxLength={maxTextLength} />
|
|
97
130
|
</div>
|
|
98
131
|
<div className="cell-rating">
|
|
@@ -136,10 +169,9 @@ export default function ResultsTable({
|
|
|
136
169
|
const { table, setTable } = useStore();
|
|
137
170
|
invariant(table, 'Table should be defined');
|
|
138
171
|
const { head, body } = table;
|
|
139
|
-
// TODO(ian): Correctly plumb through the results instead of parsing the string.
|
|
140
172
|
const numGood = head.prompts.map((_, idx) =>
|
|
141
173
|
body.reduce((acc, row) => {
|
|
142
|
-
return acc + (row.outputs[idx].
|
|
174
|
+
return acc + (row.outputs[idx].pass ? 1 : 0);
|
|
143
175
|
}, 0),
|
|
144
176
|
);
|
|
145
177
|
|
|
@@ -147,10 +179,8 @@ export default function ResultsTable({
|
|
|
147
179
|
const updatedData = [...body];
|
|
148
180
|
const updatedRow = { ...updatedData[rowIndex] };
|
|
149
181
|
const updatedOutputs = [...updatedRow.outputs];
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
: `[FAIL] ${updatedOutputs[promptIndex].replace(/^\[(PASS|FAIL)\] /, '')}`;
|
|
153
|
-
updatedOutputs[promptIndex] = updatedOutput;
|
|
182
|
+
updatedOutputs[promptIndex].pass = isPass;
|
|
183
|
+
updatedOutputs[promptIndex].score = isPass ? 1 : 0;
|
|
154
184
|
updatedRow.outputs = updatedOutputs;
|
|
155
185
|
updatedData[rowIndex] = updatedRow;
|
|
156
186
|
setTable({
|
|
@@ -190,7 +220,7 @@ export default function ResultsTable({
|
|
|
190
220
|
id: 'prompts',
|
|
191
221
|
header: () => <span>Outputs</span>,
|
|
192
222
|
columns: head.prompts.map((prompt, idx) =>
|
|
193
|
-
columnHelper.accessor((row: EvalRow) => row.outputs[idx], {
|
|
223
|
+
columnHelper.accessor((row: EvalRow) => formatRowOutput(row.outputs[idx]), {
|
|
194
224
|
id: `Prompt ${idx + 1}`,
|
|
195
225
|
header: () => {
|
|
196
226
|
const pct = ((numGood[idx] / body.length) * 100.0).toFixed(2);
|
|
@@ -229,7 +259,7 @@ export default function ResultsTable({
|
|
|
229
259
|
},
|
|
230
260
|
cell: (info: CellContext<EvalRow, string>) => (
|
|
231
261
|
<PromptOutput
|
|
232
|
-
|
|
262
|
+
output={info.getValue() as unknown as EvalRowOutput}
|
|
233
263
|
maxTextLength={maxTextLength}
|
|
234
264
|
rowIndex={info.row.index}
|
|
235
265
|
promptIndex={idx}
|
|
@@ -249,7 +279,7 @@ export default function ResultsTable({
|
|
|
249
279
|
return body.filter((row) => {
|
|
250
280
|
return row.outputs.some((output, idx) => {
|
|
251
281
|
const columnId = `Prompt ${idx + 1}`;
|
|
252
|
-
const isFail = output.
|
|
282
|
+
const isFail = !output.pass;
|
|
253
283
|
return failureFilter[columnId] && isFail;
|
|
254
284
|
});
|
|
255
285
|
});
|
|
@@ -3,8 +3,14 @@ export type EvalHead = {
|
|
|
3
3
|
vars: string[];
|
|
4
4
|
};
|
|
5
5
|
|
|
6
|
+
/** Structured result held in one output cell of an evaluation row. */
export type EvalRowOutput = {
  /** Whether this output is counted as passing. */
  pass: boolean;
  /** Numeric score for this output. */
  score: number;
  /** The output text for this cell. */
  text: string;
};
|
|
11
|
+
|
|
6
12
|
export type EvalRow = {
|
|
7
|
-
outputs:
|
|
13
|
+
outputs: EvalRowOutput[];
|
|
8
14
|
vars: string[]; // model outputs
|
|
9
15
|
};
|
|
10
16
|
|
package/src/web/server.ts
CHANGED