promptfoo 0.16.0 → 0.17.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/package.json +2 -2
- package/dist/src/assertions.js +6 -6
- package/dist/src/assertions.js.map +1 -1
- package/dist/src/cache.d.ts +1 -0
- package/dist/src/cache.d.ts.map +1 -1
- package/dist/src/cache.js +6 -1
- package/dist/src/cache.js.map +1 -1
- package/dist/src/evaluator.d.ts.map +1 -1
- package/dist/src/evaluator.js +2 -1
- package/dist/src/evaluator.js.map +1 -1
- package/dist/src/main.js +12 -0
- package/dist/src/main.js.map +1 -1
- package/dist/src/providers/openai.d.ts.map +1 -1
- package/dist/src/providers/openai.js +9 -4
- package/dist/src/providers/openai.js.map +1 -1
- package/dist/src/providers/scriptCompletion.d.ts +9 -0
- package/dist/src/providers/scriptCompletion.d.ts.map +1 -0
- package/dist/src/providers/scriptCompletion.js +27 -0
- package/dist/src/providers/scriptCompletion.js.map +1 -0
- package/dist/src/providers.d.ts.map +1 -1
- package/dist/src/providers.js +7 -1
- package/dist/src/providers.js.map +1 -1
- package/dist/src/table.js +1 -1
- package/dist/src/table.js.map +1 -1
- package/dist/src/types.d.ts +2 -1
- package/dist/src/types.d.ts.map +1 -1
- package/dist/src/util.d.ts.map +1 -1
- package/dist/src/util.js +3 -1
- package/dist/src/util.js.map +1 -1
- package/dist/src/web/client/assets/{index-eb1e9052.js → index-b6617ee8.js} +26 -26
- package/dist/src/web/client/assets/{index-c3faa651.css → index-b82d0138.css} +1 -1
- package/dist/src/web/client/index.html +2 -2
- package/package.json +2 -2
- package/src/assertions.ts +8 -8
- package/src/cache.ts +5 -0
- package/src/evaluator.ts +2 -1
- package/src/main.ts +14 -1
- package/src/providers/openai.ts +15 -6
- package/src/providers/scriptCompletion.ts +23 -0
- package/src/providers.ts +6 -1
- package/src/table.ts +1 -1
- package/src/types.ts +2 -1
- package/src/util.ts +3 -1
- package/src/web/client/package-lock.json +31 -31
- package/src/web/client/src/EvalOutputPromptDialog.tsx +61 -0
- package/src/web/client/src/ResultsTable.css +10 -7
- package/src/web/client/src/ResultsTable.tsx +63 -18
- package/src/web/client/src/types.ts +7 -1
|
@@ -1 +1 @@
|
|
|
1
|
-
:root{font-family:system-ui,Avenir,Helvetica,Arial,sans-serif;font-synthesis:none;text-rendering:optimizeLegibility;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale;-webkit-text-size-adjust:100%;--background-color: #ffffff;--text-color: #404040;--border-color: lightgray;--table-border-color: lightgray;--pass-color: green;--fail-color: #ad0000;--smalltext-color: gray;--success-background-color: #d1ffd7;--variable-background-color: #f7f7f7;--header-background-color: #fffdf7}@media (prefers-color-scheme: dark){:root{--background-color: #1a1a1a;--text-color: #f0f0f0;--border-color: #444444;--table-border-color: #444444;--pass-color: #4caf50;--fail-color: #f44336;--smalltext-color: #888888}}[data-theme=dark]{--background-color: #1a1a1a;--text-color: #f0f0f0;--border-color: #444444;--table-border-color: #444444;--pass-color: #4caf50;--fail-color: #f44336;--smalltext-color: #888888;--success-background-color: #216d2b;--variable-background-color: #333;--header-background-color: #333}html{font-size:calc(14px + (18 - 14) * ((100vw - 300px) / (1600 - 300)))}*{box-sizing:border-box}html{font-family:-apple-system,BlinkMacSystemFont,Segoe UI,Helvetica,Arial,sans-serif,"Apple Color Emoji","Segoe UI Emoji",Segoe UI Symbol;font-size:16px;background-color:var(--background-color);color:var(--text-color)}table,.divTable{border:1px solid var(--table-border-color);border-collapse:collapse;width:100%;margin:1rem 0;box-shadow:0 2px 4px #0000001a}.tr{display:flex}tr,.tr{width:fit-content}tr:hover,.tr:hover{background-color:#0000000d}th,.th,td,.td{position:relative;box-shadow:inset 0 0 0 1px var(--border-color);vertical-align:top;padding:1.5rem}th.variable,.th.variable,td.variable,.td.variable{background-color:var(--variable-background-color)}tr.header{background-color:var(--header-background-color)}th,.th{padding:1rem;position:relative;text-align:center;vertical-align:bottom}tr .cell-
|
|
1
|
+
:root{font-family:system-ui,Avenir,Helvetica,Arial,sans-serif;font-synthesis:none;text-rendering:optimizeLegibility;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale;-webkit-text-size-adjust:100%;--background-color: #ffffff;--text-color: #404040;--border-color: lightgray;--table-border-color: lightgray;--pass-color: green;--fail-color: #ad0000;--smalltext-color: gray;--success-background-color: #d1ffd7;--variable-background-color: #f7f7f7;--header-background-color: #fffdf7}@media (prefers-color-scheme: dark){:root{--background-color: #1a1a1a;--text-color: #f0f0f0;--border-color: #444444;--table-border-color: #444444;--pass-color: #4caf50;--fail-color: #f44336;--smalltext-color: #888888}}[data-theme=dark]{--background-color: #1a1a1a;--text-color: #f0f0f0;--border-color: #444444;--table-border-color: #444444;--pass-color: #4caf50;--fail-color: #f44336;--smalltext-color: #888888;--success-background-color: #216d2b;--variable-background-color: #333;--header-background-color: #333}html{font-size:calc(14px + (18 - 14) * ((100vw - 300px) / (1600 - 300)))}*{box-sizing:border-box}html{font-family:-apple-system,BlinkMacSystemFont,Segoe UI,Helvetica,Arial,sans-serif,"Apple Color Emoji","Segoe UI Emoji",Segoe UI Symbol;font-size:16px;background-color:var(--background-color);color:var(--text-color)}table,.divTable{border:1px solid var(--table-border-color);border-collapse:collapse;width:100%;margin:1rem 0;box-shadow:0 2px 4px #0000001a}.tr{display:flex}tr,.tr{width:fit-content}tr:hover,.tr:hover{background-color:#0000000d}th,.th,td,.td{position:relative;box-shadow:inset 0 0 0 1px var(--border-color);vertical-align:top;padding:1.5rem}th.variable,.th.variable,td.variable,.td.variable{background-color:var(--variable-background-color)}tr.header{background-color:var(--header-background-color)}th,.th{padding:1rem;position:relative;text-align:center;vertical-align:bottom}th .action{cursor:pointer;margin-left:.5rem}tr 
.cell-actions{display:flex;gap:.5rem;visibility:hidden;position:absolute;bottom:1.25rem;right:0;line-height:0;font-size:1.75rem}tr:hover .cell-actions{visibility:visible}tr .cell-actions .action{cursor:pointer}th .smalltext{visibility:hidden;font-weight:400;font-size:.75rem;color:var(--smalltext-color)}th:hover .smalltext{visibility:visible}th .summary{font-weight:400;font-size:.8rem;padding:.25rem}th .summary.highlight{background-color:var(--success-background-color)}td .status{margin-bottom:.5rem;font-weight:700}td .score{font-weight:400}td .pass{color:var(--pass-color)}td .fail{color:var(--fail-color)}.first-prompt-col{border-left:2px solid #888}.first-prompt-row{border-top:2px solid #888}.resizer{position:absolute;right:0;top:0;height:100%;width:5px;cursor:col-resize;user-select:none;touch-action:none;background:var(--text-color);opacity:.5}.resizer.isResizing{background:var(--text-color);opacity:1}@media (hover: hover){.resizer{opacity:0}*:hover>.resizer{opacity:1}}.logo{display:flex;align-items:center;gap:4px}.logo img{width:30px}.logo span{margin-bottom:6px;color:var(--text-color)}[data-theme=dark] .logo img{filter:invert(1)}nav{display:flex;justify-content:space-between;align-items:center;margin-bottom:1rem;color:var(--text-color)}.dark-mode-toggle{background-color:transparent;border:none;color:var(--text-color);cursor:pointer;font-size:16px;padding:8px;transition:color .3s}.dark-mode-toggle:hover{color:var(--pass-color)}body{background-color:var(--background-color);color:var(--text-color)}
|
|
@@ -5,8 +5,8 @@
|
|
|
5
5
|
<link rel="icon" type="image/svg+xml" href="favicon.ico" />
|
|
6
6
|
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
|
7
7
|
<title>promptfoo web viewer</title>
|
|
8
|
-
<script type="module" crossorigin src="/assets/index-eb1e9052.js"></script>
|
|
9
|
-
<link rel="stylesheet" href="/assets/index-c3faa651.css">
|
|
8
|
+
<script type="module" crossorigin src="/assets/index-b6617ee8.js"></script>
|
|
9
|
+
<link rel="stylesheet" href="/assets/index-b82d0138.css">
|
|
10
10
|
</head>
|
|
11
11
|
<body>
|
|
12
12
|
<div id="root"></div>
|
package/package.json
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "promptfoo",
|
|
3
|
-
"description": "
|
|
3
|
+
"description": "LLM eval & testing toolkit",
|
|
4
4
|
"author": "Ian Webster",
|
|
5
|
-
"version": "0.16.0",
|
|
5
|
+
"version": "0.17.1",
|
|
6
6
|
"license": "MIT",
|
|
7
7
|
"type": "commonjs",
|
|
8
8
|
"main": "dist/src/index.js",
|
package/src/assertions.ts
CHANGED
|
@@ -123,12 +123,12 @@ export async function runAssertion(
|
|
|
123
123
|
}
|
|
124
124
|
|
|
125
125
|
if (baseType === 'contains') {
|
|
126
|
-
invariant(assertion.value, '"contains" assertion type must have a string value');
|
|
126
|
+
invariant(assertion.value, '"contains" assertion type must have a string or number value');
|
|
127
127
|
invariant(
|
|
128
|
-
typeof assertion.value === 'string',
|
|
129
|
-
'"contains" assertion type must have a string value',
|
|
128
|
+
typeof assertion.value === 'string' || typeof assertion.value === 'number',
|
|
129
|
+
'"contains" assertion type must have a string or number value',
|
|
130
130
|
);
|
|
131
|
-
pass = output.includes(assertion.value) !== inverse;
|
|
131
|
+
pass = output.includes(String(assertion.value)) !== inverse;
|
|
132
132
|
return {
|
|
133
133
|
pass,
|
|
134
134
|
score: pass ? 1 : 0,
|
|
@@ -192,12 +192,12 @@ export async function runAssertion(
|
|
|
192
192
|
}
|
|
193
193
|
|
|
194
194
|
if (baseType === 'icontains') {
|
|
195
|
-
invariant(assertion.value, '"icontains" assertion type must have a string value');
|
|
195
|
+
invariant(assertion.value, '"icontains" assertion type must have a string or number value');
|
|
196
196
|
invariant(
|
|
197
|
-
typeof assertion.value === 'string',
|
|
198
|
-
'"icontains" assertion type must have a string value',
|
|
197
|
+
typeof assertion.value === 'string' || typeof assertion.value === 'number',
|
|
198
|
+
'"icontains" assertion type must have a string or number value',
|
|
199
199
|
);
|
|
200
|
-
pass = output.toLowerCase().includes(assertion.value.toLowerCase()) !== inverse;
|
|
200
|
+
pass = output.toLowerCase().includes(String(assertion.value).toLowerCase()) !== inverse;
|
|
201
201
|
return {
|
|
202
202
|
pass,
|
|
203
203
|
score: pass ? 1 : 0,
|
package/src/cache.ts
CHANGED
package/src/evaluator.ts
CHANGED
|
@@ -293,7 +293,7 @@ class Evaluator {
|
|
|
293
293
|
|
|
294
294
|
const table: EvaluateTable = {
|
|
295
295
|
head: {
|
|
296
|
-
prompts
|
|
296
|
+
prompts,
|
|
297
297
|
vars: Array.from(varNames).sort(),
|
|
298
298
|
// TODO(ian): add assertions to table?
|
|
299
299
|
},
|
|
@@ -383,6 +383,7 @@ class Evaluator {
|
|
|
383
383
|
pass: row.success,
|
|
384
384
|
score: row.score,
|
|
385
385
|
text: resultText,
|
|
386
|
+
prompt: row.prompt.raw,
|
|
386
387
|
};
|
|
387
388
|
},
|
|
388
389
|
);
|
package/src/main.ts
CHANGED
|
@@ -19,7 +19,7 @@ import {
|
|
|
19
19
|
writeOutput,
|
|
20
20
|
} from './util';
|
|
21
21
|
import { DEFAULT_README, DEFAULT_YAML_CONFIG, DEFAULT_PROMPTS } from './onboarding';
|
|
22
|
-
import { disableCache } from './cache';
|
|
22
|
+
import { disableCache, clearCache } from './cache';
|
|
23
23
|
import { getDirectory } from './esm';
|
|
24
24
|
import { init } from './web/server';
|
|
25
25
|
import { checkForUpdates } from './updates';
|
|
@@ -139,6 +139,19 @@ async function main() {
|
|
|
139
139
|
logger.info(`View results: ${chalk.greenBright.bold(url)}`);
|
|
140
140
|
});
|
|
141
141
|
|
|
142
|
+
program
|
|
143
|
+
.command('cache')
|
|
144
|
+
.description('Manage cache')
|
|
145
|
+
.command('clear')
|
|
146
|
+
.description('Clear cache')
|
|
147
|
+
.action(async () => {
|
|
148
|
+
await clearCache();
|
|
149
|
+
telemetry.record('command_used', {
|
|
150
|
+
name: 'cache_clear',
|
|
151
|
+
});
|
|
152
|
+
await telemetry.send();
|
|
153
|
+
});
|
|
154
|
+
|
|
142
155
|
program
|
|
143
156
|
.command('eval')
|
|
144
157
|
.description('Evaluate prompts')
|
package/src/providers/openai.ts
CHANGED
|
@@ -204,6 +204,8 @@ export class OpenAiChatCompletionProvider extends OpenAiGenericProvider {
|
|
|
204
204
|
'gpt-3.5-turbo',
|
|
205
205
|
'gpt-3.5-turbo-0301',
|
|
206
206
|
'gpt-3.5-turbo-0613',
|
|
207
|
+
'gpt-3.5-turbo-16k',
|
|
208
|
+
'gpt-3.5-turbo-16k-0613',
|
|
207
209
|
];
|
|
208
210
|
|
|
209
211
|
options: OpenAiCompletionOptions;
|
|
@@ -216,7 +218,6 @@ export class OpenAiChatCompletionProvider extends OpenAiGenericProvider {
|
|
|
216
218
|
this.options = context || {};
|
|
217
219
|
}
|
|
218
220
|
|
|
219
|
-
// TODO(ian): support passing in `messages` directly
|
|
220
221
|
async callApi(prompt: string, options?: OpenAiCompletionOptions): Promise<ProviderResponse> {
|
|
221
222
|
if (!this.apiKey) {
|
|
222
223
|
throw new Error(
|
|
@@ -224,12 +225,20 @@ export class OpenAiChatCompletionProvider extends OpenAiGenericProvider {
|
|
|
224
225
|
);
|
|
225
226
|
}
|
|
226
227
|
|
|
227
|
-
let messages: { role: string; content: string }[];
|
|
228
|
+
let messages: { role: string; content: string; name?: string }[];
|
|
228
229
|
try {
|
|
229
|
-
|
|
230
|
-
// string prompt into a `messages` array.
|
|
231
|
-
messages = JSON.parse(prompt);
|
|
230
|
+
messages = JSON.parse(prompt) as { role: string; content: string }[];
|
|
232
231
|
} catch (err) {
|
|
232
|
+
const trimmedPrompt = prompt.trim();
|
|
233
|
+
if (
|
|
234
|
+
process.env.PROMPTFOO_REQUIRE_JSON_PROMPTS ||
|
|
235
|
+
trimmedPrompt.startsWith('{') ||
|
|
236
|
+
trimmedPrompt.startsWith('[')
|
|
237
|
+
) {
|
|
238
|
+
throw new Error(
|
|
239
|
+
`OpenAI Chat Completion prompt is not a valid JSON string: ${err}\n\n${prompt}`,
|
|
240
|
+
);
|
|
241
|
+
}
|
|
233
242
|
messages = [{ role: 'user', content: prompt }];
|
|
234
243
|
}
|
|
235
244
|
|
|
@@ -292,4 +301,4 @@ export class OpenAiChatCompletionProvider extends OpenAiGenericProvider {
|
|
|
292
301
|
|
|
293
302
|
export const DefaultEmbeddingProvider = new OpenAiEmbeddingProvider('text-embedding-ada-002');
|
|
294
303
|
export const DefaultGradingProvider = new OpenAiChatCompletionProvider('gpt-4-0613');
|
|
295
|
-
export const DefaultSuggestionsProvider = new OpenAiChatCompletionProvider('gpt-4');
|
|
304
|
+
export const DefaultSuggestionsProvider = new OpenAiChatCompletionProvider('gpt-4-0613');
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import { exec } from 'child_process';
|
|
2
|
+
|
|
3
|
+
import { ApiProvider, ProviderConfig, ProviderResponse } from '../types';
|
|
4
|
+
|
|
5
|
+
export class ScriptCompletionProvider implements ApiProvider {
|
|
6
|
+
constructor(private scriptPath: string, private config?: ProviderConfig) {}
|
|
7
|
+
|
|
8
|
+
id() {
|
|
9
|
+
return 'script';
|
|
10
|
+
}
|
|
11
|
+
|
|
12
|
+
async callApi(prompt: string) {
|
|
13
|
+
return new Promise((resolve, reject) => {
|
|
14
|
+
exec(`${this.scriptPath} "${prompt}"`, (error, stdout, stderr) => {
|
|
15
|
+
if (error) {
|
|
16
|
+
reject(error);
|
|
17
|
+
} else {
|
|
18
|
+
resolve({ output: stdout.trim() });
|
|
19
|
+
}
|
|
20
|
+
});
|
|
21
|
+
}) as Promise<ProviderResponse>;
|
|
22
|
+
}
|
|
23
|
+
}
|
package/src/providers.ts
CHANGED
|
@@ -4,6 +4,7 @@ import { ApiProvider, ProviderConfig, ProviderId, RawProviderConfig } from './ty
|
|
|
4
4
|
|
|
5
5
|
import { OpenAiCompletionProvider, OpenAiChatCompletionProvider } from './providers/openai';
|
|
6
6
|
import { LocalAiCompletionProvider, LocalAiChatProvider } from './providers/localai';
|
|
7
|
+
import { ScriptCompletionProvider } from './providers/scriptCompletion';
|
|
7
8
|
|
|
8
9
|
export async function loadApiProviders(
|
|
9
10
|
providerPaths: ProviderId | ProviderId[] | RawProviderConfig[],
|
|
@@ -30,7 +31,11 @@ export async function loadApiProvider(
|
|
|
30
31
|
providerPath: string,
|
|
31
32
|
context: ProviderConfig | undefined = undefined,
|
|
32
33
|
): Promise<ApiProvider> {
|
|
33
|
-
if (providerPath?.startsWith('openai:')) {
|
|
34
|
+
if (providerPath?.startsWith('script:')) {
|
|
35
|
+
// Load script module
|
|
36
|
+
const scriptPath = providerPath.split(':')[1];
|
|
37
|
+
return new ScriptCompletionProvider(scriptPath, context?.config);
|
|
38
|
+
} else if (providerPath?.startsWith('openai:')) {
|
|
34
39
|
// Load OpenAI module
|
|
35
40
|
const options = providerPath.split(':');
|
|
36
41
|
const modelType = options[1];
|
package/src/table.ts
CHANGED
|
@@ -7,7 +7,7 @@ export function generateTable(summary: EvaluateSummary, tableCellMaxLength = 250
|
|
|
7
7
|
const head = summary.table.head;
|
|
8
8
|
const headLength = head.prompts.length + head.vars.length;
|
|
9
9
|
const table = new Table({
|
|
10
|
-
head: [...head.prompts, ...head.vars],
|
|
10
|
+
head: [...head.prompts.map((prompt) => prompt.display), ...head.vars],
|
|
11
11
|
colWidths: Array(headLength).fill(Math.floor(maxWidth / headLength)),
|
|
12
12
|
wordWrap: true,
|
|
13
13
|
wrapOnWordBoundary: false,
|
package/src/types.ts
CHANGED
|
@@ -95,11 +95,12 @@ export interface EvaluateTableOutput {
|
|
|
95
95
|
pass: boolean;
|
|
96
96
|
score: number;
|
|
97
97
|
text: string;
|
|
98
|
+
prompt: string;
|
|
98
99
|
}
|
|
99
100
|
|
|
100
101
|
export interface EvaluateTable {
|
|
101
102
|
head: {
|
|
102
|
-
prompts: string[];
|
|
103
|
+
prompts: Prompt[];
|
|
103
104
|
vars: string[];
|
|
104
105
|
};
|
|
105
106
|
|
package/src/util.ts
CHANGED
|
@@ -347,7 +347,9 @@ export function testCaseFromCsvRow(row: CsvRow): TestCase {
|
|
|
347
347
|
const asserts: Assertion[] = [];
|
|
348
348
|
for (const [key, value] of Object.entries(row)) {
|
|
349
349
|
if (key === '__expected') {
|
|
350
|
-
|
|
350
|
+
if (value.trim() !== '') {
|
|
351
|
+
asserts.push(assertionFromString(value));
|
|
352
|
+
}
|
|
351
353
|
} else {
|
|
352
354
|
vars[key] = value;
|
|
353
355
|
}
|
|
@@ -35,6 +35,15 @@
|
|
|
35
35
|
"vite": "^4.3.2"
|
|
36
36
|
}
|
|
37
37
|
},
|
|
38
|
+
"node_modules/@aashutoshrathi/word-wrap": {
|
|
39
|
+
"version": "1.2.6",
|
|
40
|
+
"resolved": "https://registry.npmjs.org/@aashutoshrathi/word-wrap/-/word-wrap-1.2.6.tgz",
|
|
41
|
+
"integrity": "sha512-1Yjs2SvM8TflER/OD3cOjhWWOZb58A2t7wpE2S9XfBYTiIl+XFhQG2bjy4Pu1I+EAlCNUzRDYDdFwFYUKvXcIA==",
|
|
42
|
+
"dev": true,
|
|
43
|
+
"engines": {
|
|
44
|
+
"node": ">=0.10.0"
|
|
45
|
+
}
|
|
46
|
+
},
|
|
38
47
|
"node_modules/@babel/code-frame": {
|
|
39
48
|
"version": "7.21.4",
|
|
40
49
|
"resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.21.4.tgz",
|
|
@@ -2685,17 +2694,17 @@
|
|
|
2685
2694
|
}
|
|
2686
2695
|
},
|
|
2687
2696
|
"node_modules/optionator": {
|
|
2688
|
-
"version": "0.9.
|
|
2689
|
-
"resolved": "https://registry.npmjs.org/optionator/-/optionator-0.9.
|
|
2690
|
-
"integrity": "sha512-
|
|
2697
|
+
"version": "0.9.3",
|
|
2698
|
+
"resolved": "https://registry.npmjs.org/optionator/-/optionator-0.9.3.tgz",
|
|
2699
|
+
"integrity": "sha512-JjCoypp+jKn1ttEFExxhetCKeJt9zhAgAve5FXHixTvFDW/5aEktX9bufBKLRRMdU7bNtpLfcGu94B3cdEJgjg==",
|
|
2691
2700
|
"dev": true,
|
|
2692
2701
|
"dependencies": {
|
|
2702
|
+
"@aashutoshrathi/word-wrap": "^1.2.3",
|
|
2693
2703
|
"deep-is": "^0.1.3",
|
|
2694
2704
|
"fast-levenshtein": "^2.0.6",
|
|
2695
2705
|
"levn": "^0.4.1",
|
|
2696
2706
|
"prelude-ls": "^1.2.1",
|
|
2697
|
-
"type-check": "^0.4.0"
|
|
2698
|
-
"word-wrap": "^1.2.3"
|
|
2707
|
+
"type-check": "^0.4.0"
|
|
2699
2708
|
},
|
|
2700
2709
|
"engines": {
|
|
2701
2710
|
"node": ">= 0.8.0"
|
|
@@ -3083,9 +3092,9 @@
|
|
|
3083
3092
|
}
|
|
3084
3093
|
},
|
|
3085
3094
|
"node_modules/semver": {
|
|
3086
|
-
"version": "7.5.
|
|
3087
|
-
"resolved": "https://registry.npmjs.org/semver/-/semver-7.5.
|
|
3088
|
-
"integrity": "sha512
|
|
3095
|
+
"version": "7.5.3",
|
|
3096
|
+
"resolved": "https://registry.npmjs.org/semver/-/semver-7.5.3.tgz",
|
|
3097
|
+
"integrity": "sha512-QBlUtyVk/5EeHbi7X0fw6liDZc7BBmEaSYn01fMU1OUYbf6GPsbTtd8WmnqbI20SeycoHSeiybkE/q1Q+qlThQ==",
|
|
3089
3098
|
"dev": true,
|
|
3090
3099
|
"dependencies": {
|
|
3091
3100
|
"lru-cache": "^6.0.0"
|
|
@@ -3391,15 +3400,6 @@
|
|
|
3391
3400
|
"node": ">= 8"
|
|
3392
3401
|
}
|
|
3393
3402
|
},
|
|
3394
|
-
"node_modules/word-wrap": {
|
|
3395
|
-
"version": "1.2.3",
|
|
3396
|
-
"resolved": "https://registry.npmjs.org/word-wrap/-/word-wrap-1.2.3.tgz",
|
|
3397
|
-
"integrity": "sha512-Hz/mrNwitNRh/HUAtM/VT/5VH+ygD6DV7mYKZAtHOrbs8U7lvPS6xf7EJKMF0uW1KJCl0H701g3ZGus+muE5vQ==",
|
|
3398
|
-
"dev": true,
|
|
3399
|
-
"engines": {
|
|
3400
|
-
"node": ">=0.10.0"
|
|
3401
|
-
}
|
|
3402
|
-
},
|
|
3403
3403
|
"node_modules/wrappy": {
|
|
3404
3404
|
"version": "1.0.2",
|
|
3405
3405
|
"resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz",
|
|
@@ -3485,6 +3485,12 @@
|
|
|
3485
3485
|
}
|
|
3486
3486
|
},
|
|
3487
3487
|
"dependencies": {
|
|
3488
|
+
"@aashutoshrathi/word-wrap": {
|
|
3489
|
+
"version": "1.2.6",
|
|
3490
|
+
"resolved": "https://registry.npmjs.org/@aashutoshrathi/word-wrap/-/word-wrap-1.2.6.tgz",
|
|
3491
|
+
"integrity": "sha512-1Yjs2SvM8TflER/OD3cOjhWWOZb58A2t7wpE2S9XfBYTiIl+XFhQG2bjy4Pu1I+EAlCNUzRDYDdFwFYUKvXcIA==",
|
|
3492
|
+
"dev": true
|
|
3493
|
+
},
|
|
3488
3494
|
"@babel/code-frame": {
|
|
3489
3495
|
"version": "7.21.4",
|
|
3490
3496
|
"resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.21.4.tgz",
|
|
@@ -5231,17 +5237,17 @@
|
|
|
5231
5237
|
}
|
|
5232
5238
|
},
|
|
5233
5239
|
"optionator": {
|
|
5234
|
-
"version": "0.9.
|
|
5235
|
-
"resolved": "https://registry.npmjs.org/optionator/-/optionator-0.9.
|
|
5236
|
-
"integrity": "sha512-
|
|
5240
|
+
"version": "0.9.3",
|
|
5241
|
+
"resolved": "https://registry.npmjs.org/optionator/-/optionator-0.9.3.tgz",
|
|
5242
|
+
"integrity": "sha512-JjCoypp+jKn1ttEFExxhetCKeJt9zhAgAve5FXHixTvFDW/5aEktX9bufBKLRRMdU7bNtpLfcGu94B3cdEJgjg==",
|
|
5237
5243
|
"dev": true,
|
|
5238
5244
|
"requires": {
|
|
5245
|
+
"@aashutoshrathi/word-wrap": "^1.2.3",
|
|
5239
5246
|
"deep-is": "^0.1.3",
|
|
5240
5247
|
"fast-levenshtein": "^2.0.6",
|
|
5241
5248
|
"levn": "^0.4.1",
|
|
5242
5249
|
"prelude-ls": "^1.2.1",
|
|
5243
|
-
"type-check": "^0.4.0"
|
|
5244
|
-
"word-wrap": "^1.2.3"
|
|
5250
|
+
"type-check": "^0.4.0"
|
|
5245
5251
|
}
|
|
5246
5252
|
},
|
|
5247
5253
|
"p-limit": {
|
|
@@ -5483,9 +5489,9 @@
|
|
|
5483
5489
|
}
|
|
5484
5490
|
},
|
|
5485
5491
|
"semver": {
|
|
5486
|
-
"version": "7.5.
|
|
5487
|
-
"resolved": "https://registry.npmjs.org/semver/-/semver-7.5.
|
|
5488
|
-
"integrity": "sha512
|
|
5492
|
+
"version": "7.5.3",
|
|
5493
|
+
"resolved": "https://registry.npmjs.org/semver/-/semver-7.5.3.tgz",
|
|
5494
|
+
"integrity": "sha512-QBlUtyVk/5EeHbi7X0fw6liDZc7BBmEaSYn01fMU1OUYbf6GPsbTtd8WmnqbI20SeycoHSeiybkE/q1Q+qlThQ==",
|
|
5489
5495
|
"dev": true,
|
|
5490
5496
|
"requires": {
|
|
5491
5497
|
"lru-cache": "^6.0.0"
|
|
@@ -5674,12 +5680,6 @@
|
|
|
5674
5680
|
"isexe": "^2.0.0"
|
|
5675
5681
|
}
|
|
5676
5682
|
},
|
|
5677
|
-
"word-wrap": {
|
|
5678
|
-
"version": "1.2.3",
|
|
5679
|
-
"resolved": "https://registry.npmjs.org/word-wrap/-/word-wrap-1.2.3.tgz",
|
|
5680
|
-
"integrity": "sha512-Hz/mrNwitNRh/HUAtM/VT/5VH+ygD6DV7mYKZAtHOrbs8U7lvPS6xf7EJKMF0uW1KJCl0H701g3ZGus+muE5vQ==",
|
|
5681
|
-
"dev": true
|
|
5682
|
-
},
|
|
5683
5683
|
"wrappy": {
|
|
5684
5684
|
"version": "1.0.2",
|
|
5685
5685
|
"resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz",
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import { useState, useEffect } from 'react';
|
|
2
|
+
import Button from '@mui/material/Button';
|
|
3
|
+
import Dialog from '@mui/material/Dialog';
|
|
4
|
+
import DialogActions from '@mui/material/DialogActions';
|
|
5
|
+
import DialogContent from '@mui/material/DialogContent';
|
|
6
|
+
import DialogTitle from '@mui/material/DialogTitle';
|
|
7
|
+
import TextareaAutosize from '@mui/base/TextareaAutosize';
|
|
8
|
+
import IconButton from '@mui/material/IconButton';
|
|
9
|
+
import ContentCopyIcon from '@mui/icons-material/ContentCopy';
|
|
10
|
+
import CheckIcon from '@mui/icons-material/Check';
|
|
11
|
+
|
|
12
|
+
interface EvalOutputPromptDialogProps {
|
|
13
|
+
open: boolean;
|
|
14
|
+
onClose: () => void;
|
|
15
|
+
prompt: string;
|
|
16
|
+
output?: string;
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
export default function EvalOutputPromptDialog({
|
|
20
|
+
open,
|
|
21
|
+
onClose,
|
|
22
|
+
prompt,
|
|
23
|
+
output,
|
|
24
|
+
}: EvalOutputPromptDialogProps) {
|
|
25
|
+
const [copied, setCopied] = useState(false);
|
|
26
|
+
|
|
27
|
+
useEffect(() => {
|
|
28
|
+
setCopied(false);
|
|
29
|
+
}, [prompt]);
|
|
30
|
+
|
|
31
|
+
const copyToClipboard = async (text: string) => {
|
|
32
|
+
await navigator.clipboard.writeText(text);
|
|
33
|
+
setCopied(true);
|
|
34
|
+
};
|
|
35
|
+
|
|
36
|
+
return (
|
|
37
|
+
<Dialog open={open} onClose={onClose} fullWidth maxWidth="lg">
|
|
38
|
+
<DialogTitle>Prompt</DialogTitle>
|
|
39
|
+
<DialogContent>
|
|
40
|
+
<TextareaAutosize readOnly value={prompt} style={{ width: '100%' }} />
|
|
41
|
+
<IconButton
|
|
42
|
+
onClick={() => copyToClipboard(prompt)}
|
|
43
|
+
style={{ position: 'absolute', right: '10px', top: '10px' }}
|
|
44
|
+
>
|
|
45
|
+
{copied ? <CheckIcon /> : <ContentCopyIcon />}
|
|
46
|
+
</IconButton>
|
|
47
|
+
</DialogContent>
|
|
48
|
+
{output && (
|
|
49
|
+
<>
|
|
50
|
+
<DialogTitle>Output</DialogTitle>
|
|
51
|
+
<DialogContent>
|
|
52
|
+
<TextareaAutosize readOnly value={output} style={{ width: '100%' }} />
|
|
53
|
+
</DialogContent>
|
|
54
|
+
</>
|
|
55
|
+
)}
|
|
56
|
+
<DialogActions>
|
|
57
|
+
<Button onClick={onClose}>Close</Button>
|
|
58
|
+
</DialogActions>
|
|
59
|
+
</Dialog>
|
|
60
|
+
);
|
|
61
|
+
}
|
|
@@ -64,10 +64,17 @@ th,
|
|
|
64
64
|
vertical-align: bottom;
|
|
65
65
|
}
|
|
66
66
|
|
|
67
|
+
th .action {
|
|
68
|
+
cursor: pointer;
|
|
69
|
+
margin-left: 0.5rem;
|
|
70
|
+
}
|
|
71
|
+
|
|
67
72
|
tr .cell {
|
|
68
73
|
}
|
|
69
74
|
|
|
70
|
-
tr .cell-rating {
|
|
75
|
+
tr .cell-actions {
|
|
76
|
+
display: flex;
|
|
77
|
+
gap: 0.5rem;
|
|
71
78
|
visibility: hidden;
|
|
72
79
|
position: absolute;
|
|
73
80
|
bottom: 1.25rem;
|
|
@@ -76,18 +83,14 @@ tr .cell-rating {
|
|
|
76
83
|
font-size: 1.75rem;
|
|
77
84
|
}
|
|
78
85
|
|
|
79
|
-
tr:hover .cell-rating {
|
|
86
|
+
tr:hover .cell-actions {
|
|
80
87
|
visibility: visible;
|
|
81
88
|
}
|
|
82
89
|
|
|
83
|
-
tr .cell-rating .rating {
|
|
90
|
+
tr .cell-actions .action {
|
|
84
91
|
cursor: pointer;
|
|
85
92
|
}
|
|
86
93
|
|
|
87
|
-
tr .cell-rating .rating:first-child {
|
|
88
|
-
margin-right: 0.5rem;
|
|
89
|
-
}
|
|
90
|
-
|
|
91
94
|
th .smalltext {
|
|
92
95
|
visibility: hidden;
|
|
93
96
|
font-weight: normal;
|
|
@@ -16,6 +16,8 @@ import { useStore } from './store.js';
|
|
|
16
16
|
|
|
17
17
|
import type { CellContext, VisibilityState } from '@tanstack/table-core';
|
|
18
18
|
|
|
19
|
+
import EvalOutputPromptDialog from './EvalOutputPromptDialog';
|
|
20
|
+
|
|
19
21
|
import type { EvalRow, EvalRowOutput, FilterMode } from './types.js';
|
|
20
22
|
|
|
21
23
|
import './ResultsTable.css';
|
|
@@ -62,28 +64,24 @@ function TruncatedText({ text: rawText, maxLength }: TruncatedTextProps) {
|
|
|
62
64
|
|
|
63
65
|
const renderTruncatedText = () => {
|
|
64
66
|
if (text.length <= maxLength) {
|
|
65
|
-
return text
|
|
67
|
+
return <span>{text}</span>;
|
|
66
68
|
}
|
|
67
69
|
if (isTruncated) {
|
|
68
70
|
return (
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
</span>
|
|
73
|
-
</>
|
|
71
|
+
<span style={{ cursor: 'pointer' }} onClick={toggleTruncate}>
|
|
72
|
+
{text.substring(0, maxLength)} ...
|
|
73
|
+
</span>
|
|
74
74
|
);
|
|
75
75
|
} else {
|
|
76
76
|
return (
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
</span>
|
|
81
|
-
</>
|
|
77
|
+
<span style={{ cursor: 'pointer' }} onClick={toggleTruncate}>
|
|
78
|
+
{text}
|
|
79
|
+
</span>
|
|
82
80
|
);
|
|
83
81
|
}
|
|
84
82
|
};
|
|
85
83
|
|
|
86
|
-
return
|
|
84
|
+
return renderTruncatedText();
|
|
87
85
|
}
|
|
88
86
|
|
|
89
87
|
interface PromptOutputProps {
|
|
@@ -101,6 +99,13 @@ function EvalOutputCell({
|
|
|
101
99
|
promptIndex,
|
|
102
100
|
onRating,
|
|
103
101
|
}: PromptOutputProps) {
|
|
102
|
+
const [openPrompt, setOpen] = React.useState(false);
|
|
103
|
+
const handlePromptOpen = () => {
|
|
104
|
+
setOpen(true);
|
|
105
|
+
};
|
|
106
|
+
const handlePromptClose = () => {
|
|
107
|
+
setOpen(false);
|
|
108
|
+
};
|
|
104
109
|
let text = typeof output.text === 'string' ? output.text : JSON.stringify(output.text);
|
|
105
110
|
let chunks: string[] = [];
|
|
106
111
|
if (!output.pass && text.includes('---')) {
|
|
@@ -113,6 +118,7 @@ function EvalOutputCell({
|
|
|
113
118
|
onRating(rowIndex, promptIndex, isPass);
|
|
114
119
|
};
|
|
115
120
|
|
|
121
|
+
// TODO(ian): output.prompt check for backwards compatibility, remove after 0.17.0
|
|
116
122
|
return (
|
|
117
123
|
<>
|
|
118
124
|
<div className="cell">
|
|
@@ -128,11 +134,24 @@ function EvalOutputCell({
|
|
|
128
134
|
)}{' '}
|
|
129
135
|
<TruncatedText text={text} maxLength={maxTextLength} />
|
|
130
136
|
</div>
|
|
131
|
-
<div className="cell-rating">
|
|
132
|
-
|
|
137
|
+
<div className="cell-actions">
|
|
138
|
+
{output.prompt && (
|
|
139
|
+
<>
|
|
140
|
+
<span className="action" onClick={handlePromptOpen}>
|
|
141
|
+
🔎
|
|
142
|
+
</span>
|
|
143
|
+
<EvalOutputPromptDialog
|
|
144
|
+
open={openPrompt}
|
|
145
|
+
onClose={handlePromptClose}
|
|
146
|
+
prompt={output.prompt}
|
|
147
|
+
output={text}
|
|
148
|
+
/>
|
|
149
|
+
</>
|
|
150
|
+
)}
|
|
151
|
+
<span className="action" onClick={() => handleClick(true)}>
|
|
133
152
|
👍
|
|
134
153
|
</span>
|
|
135
|
-
<span className="rating" onClick={() => handleClick(false)}>
|
|
154
|
+
<span className="action" onClick={() => handleClick(false)}>
|
|
136
155
|
👎
|
|
137
156
|
</span>
|
|
138
157
|
</div>
|
|
@@ -140,11 +159,35 @@ function EvalOutputCell({
|
|
|
140
159
|
);
|
|
141
160
|
}
|
|
142
161
|
|
|
143
|
-
function TableHeader({
|
|
162
|
+
function TableHeader({
|
|
163
|
+
text,
|
|
164
|
+
maxLength,
|
|
165
|
+
smallText,
|
|
166
|
+
expandedText,
|
|
167
|
+
}: TruncatedTextProps & { smallText: string; expandedText?: string }) {
|
|
168
|
+
const [openPrompt, setOpen] = React.useState(false);
|
|
169
|
+
const handlePromptOpen = () => {
|
|
170
|
+
setOpen(true);
|
|
171
|
+
};
|
|
172
|
+
const handlePromptClose = () => {
|
|
173
|
+
setOpen(false);
|
|
174
|
+
};
|
|
144
175
|
return (
|
|
145
176
|
<div>
|
|
146
177
|
<TruncatedText text={text} maxLength={maxLength} />
|
|
147
|
-
|
|
178
|
+
{expandedText && (
|
|
179
|
+
<>
|
|
180
|
+
<span className="action" onClick={handlePromptOpen}>
|
|
181
|
+
🔎
|
|
182
|
+
</span>
|
|
183
|
+
<EvalOutputPromptDialog
|
|
184
|
+
open={openPrompt}
|
|
185
|
+
onClose={handlePromptClose}
|
|
186
|
+
prompt={expandedText}
|
|
187
|
+
/>
|
|
188
|
+
</>
|
|
189
|
+
)}
|
|
190
|
+
<div className="smalltext">{smallText}</div>
|
|
148
191
|
</div>
|
|
149
192
|
);
|
|
150
193
|
}
|
|
@@ -233,11 +276,13 @@ export default function ResultsTable({
|
|
|
233
276
|
numGood[idx] === highestPassingCount && highestPassingCount !== 0;
|
|
234
277
|
const columnId = `Prompt ${idx + 1}`;
|
|
235
278
|
const isChecked = failureFilter[columnId] || false;
|
|
279
|
+
// TODO(ian): prompt string support for backwards compatibility, remove after 0.17.0
|
|
236
280
|
return (
|
|
237
281
|
<>
|
|
238
282
|
<TableHeader
|
|
239
283
|
smallText={`Prompt ${idx + 1}`}
|
|
240
|
-
text={prompt}
|
|
284
|
+
text={typeof prompt === 'string' ? prompt : prompt.display}
|
|
285
|
+
expandedText={typeof prompt === 'string' ? undefined : prompt.raw}
|
|
241
286
|
maxLength={maxTextLength}
|
|
242
287
|
/>
|
|
243
288
|
{filterMode === 'failures' && (
|
|
@@ -1,5 +1,10 @@
|
|
|
1
|
+
type Prompt = {
|
|
2
|
+
display: string;
|
|
3
|
+
raw: string;
|
|
4
|
+
};
|
|
5
|
+
|
|
1
6
|
export type EvalHead = {
|
|
2
|
-
prompts: string[];
|
|
7
|
+
prompts: Prompt[];
|
|
3
8
|
vars: string[];
|
|
4
9
|
};
|
|
5
10
|
|
|
@@ -7,6 +12,7 @@ export type EvalRowOutput = {
|
|
|
7
12
|
pass: boolean;
|
|
8
13
|
score: number;
|
|
9
14
|
text: string | object;
|
|
15
|
+
prompt: string;
|
|
10
16
|
};
|
|
11
17
|
|
|
12
18
|
export type EvalRow = {
|