promptfoo 0.16.0 → 0.17.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. package/dist/package.json +2 -2
  2. package/dist/src/assertions.js +6 -6
  3. package/dist/src/assertions.js.map +1 -1
  4. package/dist/src/cache.d.ts +1 -0
  5. package/dist/src/cache.d.ts.map +1 -1
  6. package/dist/src/cache.js +6 -1
  7. package/dist/src/cache.js.map +1 -1
  8. package/dist/src/evaluator.d.ts.map +1 -1
  9. package/dist/src/evaluator.js +2 -1
  10. package/dist/src/evaluator.js.map +1 -1
  11. package/dist/src/main.js +12 -0
  12. package/dist/src/main.js.map +1 -1
  13. package/dist/src/providers/openai.d.ts.map +1 -1
  14. package/dist/src/providers/openai.js +9 -4
  15. package/dist/src/providers/openai.js.map +1 -1
  16. package/dist/src/providers/scriptCompletion.d.ts +9 -0
  17. package/dist/src/providers/scriptCompletion.d.ts.map +1 -0
  18. package/dist/src/providers/scriptCompletion.js +27 -0
  19. package/dist/src/providers/scriptCompletion.js.map +1 -0
  20. package/dist/src/providers.d.ts.map +1 -1
  21. package/dist/src/providers.js +7 -1
  22. package/dist/src/providers.js.map +1 -1
  23. package/dist/src/table.js +1 -1
  24. package/dist/src/table.js.map +1 -1
  25. package/dist/src/types.d.ts +2 -1
  26. package/dist/src/types.d.ts.map +1 -1
  27. package/dist/src/util.d.ts.map +1 -1
  28. package/dist/src/util.js +3 -1
  29. package/dist/src/util.js.map +1 -1
  30. package/dist/src/web/client/assets/{index-eb1e9052.js → index-b6617ee8.js} +26 -26
  31. package/dist/src/web/client/assets/{index-c3faa651.css → index-b82d0138.css} +1 -1
  32. package/dist/src/web/client/index.html +2 -2
  33. package/package.json +2 -2
  34. package/src/assertions.ts +8 -8
  35. package/src/cache.ts +5 -0
  36. package/src/evaluator.ts +2 -1
  37. package/src/main.ts +14 -1
  38. package/src/providers/openai.ts +15 -6
  39. package/src/providers/scriptCompletion.ts +23 -0
  40. package/src/providers.ts +6 -1
  41. package/src/table.ts +1 -1
  42. package/src/types.ts +2 -1
  43. package/src/util.ts +3 -1
  44. package/src/web/client/package-lock.json +31 -31
  45. package/src/web/client/src/EvalOutputPromptDialog.tsx +61 -0
  46. package/src/web/client/src/ResultsTable.css +10 -7
  47. package/src/web/client/src/ResultsTable.tsx +63 -18
  48. package/src/web/client/src/types.ts +7 -1
@@ -1 +1 @@
1
- :root{font-family:system-ui,Avenir,Helvetica,Arial,sans-serif;font-synthesis:none;text-rendering:optimizeLegibility;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale;-webkit-text-size-adjust:100%;--background-color: #ffffff;--text-color: #404040;--border-color: lightgray;--table-border-color: lightgray;--pass-color: green;--fail-color: #ad0000;--smalltext-color: gray;--success-background-color: #d1ffd7;--variable-background-color: #f7f7f7;--header-background-color: #fffdf7}@media (prefers-color-scheme: dark){:root{--background-color: #1a1a1a;--text-color: #f0f0f0;--border-color: #444444;--table-border-color: #444444;--pass-color: #4caf50;--fail-color: #f44336;--smalltext-color: #888888}}[data-theme=dark]{--background-color: #1a1a1a;--text-color: #f0f0f0;--border-color: #444444;--table-border-color: #444444;--pass-color: #4caf50;--fail-color: #f44336;--smalltext-color: #888888;--success-background-color: #216d2b;--variable-background-color: #333;--header-background-color: #333}html{font-size:calc(14px + (18 - 14) * ((100vw - 300px) / (1600 - 300)))}*{box-sizing:border-box}html{font-family:-apple-system,BlinkMacSystemFont,Segoe UI,Helvetica,Arial,sans-serif,"Apple Color Emoji","Segoe UI Emoji",Segoe UI Symbol;font-size:16px;background-color:var(--background-color);color:var(--text-color)}table,.divTable{border:1px solid var(--table-border-color);border-collapse:collapse;width:100%;margin:1rem 0;box-shadow:0 2px 4px #0000001a}.tr{display:flex}tr,.tr{width:fit-content}tr:hover,.tr:hover{background-color:#0000000d}th,.th,td,.td{position:relative;box-shadow:inset 0 0 0 1px var(--border-color);vertical-align:top;padding:1.5rem}th.variable,.th.variable,td.variable,.td.variable{background-color:var(--variable-background-color)}tr.header{background-color:var(--header-background-color)}th,.th{padding:1rem;position:relative;text-align:center;vertical-align:bottom}tr .cell-rating{visibility:hidden;position:absolute;bottom:1.25rem;right:0;line-height:0;font-size:1.75rem}tr:hover .cell-rating{visibility:visible}tr .cell-rating .rating{cursor:pointer}tr .cell-rating .rating:first-child{margin-right:.5rem}th .smalltext{visibility:hidden;font-weight:400;font-size:.75rem;color:var(--smalltext-color)}th:hover .smalltext{visibility:visible}th .summary{font-weight:400;font-size:.8rem;padding:.25rem}th .summary.highlight{background-color:var(--success-background-color)}td .status{margin-bottom:.5rem;font-weight:700}td .score{font-weight:400}td .pass{color:var(--pass-color)}td .fail{color:var(--fail-color)}.first-prompt-col{border-left:2px solid #888}.first-prompt-row{border-top:2px solid #888}.resizer{position:absolute;right:0;top:0;height:100%;width:5px;cursor:col-resize;user-select:none;touch-action:none;background:var(--text-color);opacity:.5}.resizer.isResizing{background:var(--text-color);opacity:1}@media (hover: hover){.resizer{opacity:0}*:hover>.resizer{opacity:1}}.logo{display:flex;align-items:center;gap:4px}.logo img{width:30px}.logo span{margin-bottom:6px;color:var(--text-color)}[data-theme=dark] .logo img{filter:invert(1)}nav{display:flex;justify-content:space-between;align-items:center;margin-bottom:1rem;color:var(--text-color)}.dark-mode-toggle{background-color:transparent;border:none;color:var(--text-color);cursor:pointer;font-size:16px;padding:8px;transition:color .3s}.dark-mode-toggle:hover{color:var(--pass-color)}body{background-color:var(--background-color);color:var(--text-color)}
1
+ :root{font-family:system-ui,Avenir,Helvetica,Arial,sans-serif;font-synthesis:none;text-rendering:optimizeLegibility;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale;-webkit-text-size-adjust:100%;--background-color: #ffffff;--text-color: #404040;--border-color: lightgray;--table-border-color: lightgray;--pass-color: green;--fail-color: #ad0000;--smalltext-color: gray;--success-background-color: #d1ffd7;--variable-background-color: #f7f7f7;--header-background-color: #fffdf7}@media (prefers-color-scheme: dark){:root{--background-color: #1a1a1a;--text-color: #f0f0f0;--border-color: #444444;--table-border-color: #444444;--pass-color: #4caf50;--fail-color: #f44336;--smalltext-color: #888888}}[data-theme=dark]{--background-color: #1a1a1a;--text-color: #f0f0f0;--border-color: #444444;--table-border-color: #444444;--pass-color: #4caf50;--fail-color: #f44336;--smalltext-color: #888888;--success-background-color: #216d2b;--variable-background-color: #333;--header-background-color: #333}html{font-size:calc(14px + (18 - 14) * ((100vw - 300px) / (1600 - 300)))}*{box-sizing:border-box}html{font-family:-apple-system,BlinkMacSystemFont,Segoe UI,Helvetica,Arial,sans-serif,"Apple Color Emoji","Segoe UI Emoji",Segoe UI Symbol;font-size:16px;background-color:var(--background-color);color:var(--text-color)}table,.divTable{border:1px solid var(--table-border-color);border-collapse:collapse;width:100%;margin:1rem 0;box-shadow:0 2px 4px #0000001a}.tr{display:flex}tr,.tr{width:fit-content}tr:hover,.tr:hover{background-color:#0000000d}th,.th,td,.td{position:relative;box-shadow:inset 0 0 0 1px var(--border-color);vertical-align:top;padding:1.5rem}th.variable,.th.variable,td.variable,.td.variable{background-color:var(--variable-background-color)}tr.header{background-color:var(--header-background-color)}th,.th{padding:1rem;position:relative;text-align:center;vertical-align:bottom}th .action{cursor:pointer;margin-left:.5rem}tr .cell-actions{display:flex;gap:.5rem;visibility:hidden;position:absolute;bottom:1.25rem;right:0;line-height:0;font-size:1.75rem}tr:hover .cell-actions{visibility:visible}tr .cell-actions .action{cursor:pointer}th .smalltext{visibility:hidden;font-weight:400;font-size:.75rem;color:var(--smalltext-color)}th:hover .smalltext{visibility:visible}th .summary{font-weight:400;font-size:.8rem;padding:.25rem}th .summary.highlight{background-color:var(--success-background-color)}td .status{margin-bottom:.5rem;font-weight:700}td .score{font-weight:400}td .pass{color:var(--pass-color)}td .fail{color:var(--fail-color)}.first-prompt-col{border-left:2px solid #888}.first-prompt-row{border-top:2px solid #888}.resizer{position:absolute;right:0;top:0;height:100%;width:5px;cursor:col-resize;user-select:none;touch-action:none;background:var(--text-color);opacity:.5}.resizer.isResizing{background:var(--text-color);opacity:1}@media (hover: hover){.resizer{opacity:0}*:hover>.resizer{opacity:1}}.logo{display:flex;align-items:center;gap:4px}.logo img{width:30px}.logo span{margin-bottom:6px;color:var(--text-color)}[data-theme=dark] .logo img{filter:invert(1)}nav{display:flex;justify-content:space-between;align-items:center;margin-bottom:1rem;color:var(--text-color)}.dark-mode-toggle{background-color:transparent;border:none;color:var(--text-color);cursor:pointer;font-size:16px;padding:8px;transition:color .3s}.dark-mode-toggle:hover{color:var(--pass-color)}body{background-color:var(--background-color);color:var(--text-color)}
@@ -5,8 +5,8 @@
5
5
  <link rel="icon" type="image/svg+xml" href="favicon.ico" />
6
6
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
7
7
  <title>promptfoo web viewer</title>
8
- <script type="module" crossorigin src="/assets/index-eb1e9052.js"></script>
9
- <link rel="stylesheet" href="/assets/index-c3faa651.css">
8
+ <script type="module" crossorigin src="/assets/index-b6617ee8.js"></script>
9
+ <link rel="stylesheet" href="/assets/index-b82d0138.css">
10
10
  </head>
11
11
  <body>
12
12
  <div id="root"></div>
package/package.json CHANGED
@@ -1,8 +1,8 @@
1
1
  {
2
2
  "name": "promptfoo",
3
- "description": "Prompt engineering toolkit",
3
+ "description": "LLM eval & testing toolkit",
4
4
  "author": "Ian Webster",
5
- "version": "0.16.0",
5
+ "version": "0.17.1",
6
6
  "license": "MIT",
7
7
  "type": "commonjs",
8
8
  "main": "dist/src/index.js",
package/src/assertions.ts CHANGED
@@ -123,12 +123,12 @@ export async function runAssertion(
123
123
  }
124
124
 
125
125
  if (baseType === 'contains') {
126
- invariant(assertion.value, '"contains" assertion type must have a string value');
126
+ invariant(assertion.value, '"contains" assertion type must have a string or number value');
127
127
  invariant(
128
- typeof assertion.value === 'string',
129
- '"contains" assertion type must have a string value',
128
+ typeof assertion.value === 'string' || typeof assertion.value === 'number',
129
+ '"contains" assertion type must have a string or number value',
130
130
  );
131
- pass = output.includes(assertion.value) !== inverse;
131
+ pass = output.includes(String(assertion.value)) !== inverse;
132
132
  return {
133
133
  pass,
134
134
  score: pass ? 1 : 0,
@@ -192,12 +192,12 @@ export async function runAssertion(
192
192
  }
193
193
 
194
194
  if (baseType === 'icontains') {
195
- invariant(assertion.value, '"icontains" assertion type must have a string value');
195
+ invariant(assertion.value, '"icontains" assertion type must have a string or number value');
196
196
  invariant(
197
- typeof assertion.value === 'string',
198
- '"icontains" assertion type must have a string value',
197
+ typeof assertion.value === 'string' || typeof assertion.value === 'number',
198
+ '"icontains" assertion type must have a string or number value',
199
199
  );
200
- pass = output.toLowerCase().includes(assertion.value.toLowerCase()) !== inverse;
200
+ pass = output.toLowerCase().includes(String(assertion.value).toLowerCase()) !== inverse;
201
201
  return {
202
202
  pass,
203
203
  score: pass ? 1 : 0,
package/src/cache.ts CHANGED
@@ -97,3 +97,8 @@ export function disableCache() {
97
97
  logger.info('Cache is disabled.');
98
98
  enabled = false;
99
99
  }
100
+
101
+ export async function clearCache() {
102
+ logger.info('Clearing cache...');
103
+ return getCache().reset();
104
+ }
package/src/evaluator.ts CHANGED
@@ -293,7 +293,7 @@ class Evaluator {
293
293
 
294
294
  const table: EvaluateTable = {
295
295
  head: {
296
- prompts: prompts.map((p) => p.display),
296
+ prompts,
297
297
  vars: Array.from(varNames).sort(),
298
298
  // TODO(ian): add assertions to table?
299
299
  },
@@ -383,6 +383,7 @@ class Evaluator {
383
383
  pass: row.success,
384
384
  score: row.score,
385
385
  text: resultText,
386
+ prompt: row.prompt.raw,
386
387
  };
387
388
  },
388
389
  );
package/src/main.ts CHANGED
@@ -19,7 +19,7 @@ import {
19
19
  writeOutput,
20
20
  } from './util';
21
21
  import { DEFAULT_README, DEFAULT_YAML_CONFIG, DEFAULT_PROMPTS } from './onboarding';
22
- import { disableCache } from './cache';
22
+ import { disableCache, clearCache } from './cache';
23
23
  import { getDirectory } from './esm';
24
24
  import { init } from './web/server';
25
25
  import { checkForUpdates } from './updates';
@@ -139,6 +139,19 @@ async function main() {
139
139
  logger.info(`View results: ${chalk.greenBright.bold(url)}`);
140
140
  });
141
141
 
142
+ program
143
+ .command('cache')
144
+ .description('Manage cache')
145
+ .command('clear')
146
+ .description('Clear cache')
147
+ .action(async () => {
148
+ await clearCache();
149
+ telemetry.record('command_used', {
150
+ name: 'cache_clear',
151
+ });
152
+ await telemetry.send();
153
+ });
154
+
142
155
  program
143
156
  .command('eval')
144
157
  .description('Evaluate prompts')
@@ -204,6 +204,8 @@ export class OpenAiChatCompletionProvider extends OpenAiGenericProvider {
204
204
  'gpt-3.5-turbo',
205
205
  'gpt-3.5-turbo-0301',
206
206
  'gpt-3.5-turbo-0613',
207
+ 'gpt-3.5-turbo-16k',
208
+ 'gpt-3.5-turbo-16k-0613',
207
209
  ];
208
210
 
209
211
  options: OpenAiCompletionOptions;
@@ -216,7 +218,6 @@ export class OpenAiChatCompletionProvider extends OpenAiGenericProvider {
216
218
  this.options = context || {};
217
219
  }
218
220
 
219
- // TODO(ian): support passing in `messages` directly
220
221
  async callApi(prompt: string, options?: OpenAiCompletionOptions): Promise<ProviderResponse> {
221
222
  if (!this.apiKey) {
222
223
  throw new Error(
@@ -224,12 +225,20 @@ export class OpenAiChatCompletionProvider extends OpenAiGenericProvider {
224
225
  );
225
226
  }
226
227
 
227
- let messages: { role: string; content: string }[];
228
+ let messages: { role: string; content: string; name?: string }[];
228
229
  try {
229
- // User can specify `messages` payload as JSON, or we'll just put the
230
- // string prompt into a `messages` array.
231
- messages = JSON.parse(prompt);
230
+ messages = JSON.parse(prompt) as { role: string; content: string }[];
232
231
  } catch (err) {
232
+ const trimmedPrompt = prompt.trim();
233
+ if (
234
+ process.env.PROMPTFOO_REQUIRE_JSON_PROMPTS ||
235
+ trimmedPrompt.startsWith('{') ||
236
+ trimmedPrompt.startsWith('[')
237
+ ) {
238
+ throw new Error(
239
+ `OpenAI Chat Completion prompt is not a valid JSON string: ${err}\n\n${prompt}`,
240
+ );
241
+ }
233
242
  messages = [{ role: 'user', content: prompt }];
234
243
  }
235
244
 
@@ -292,4 +301,4 @@ export class OpenAiChatCompletionProvider extends OpenAiGenericProvider {
292
301
 
293
302
  export const DefaultEmbeddingProvider = new OpenAiEmbeddingProvider('text-embedding-ada-002');
294
303
  export const DefaultGradingProvider = new OpenAiChatCompletionProvider('gpt-4-0613');
295
- export const DefaultSuggestionsProvider = new OpenAiChatCompletionProvider('gpt-4');
304
+ export const DefaultSuggestionsProvider = new OpenAiChatCompletionProvider('gpt-4-0613');
@@ -0,0 +1,23 @@
1
+ import { exec } from 'child_process';
2
+
3
+ import { ApiProvider, ProviderConfig, ProviderResponse } from '../types';
4
+
5
+ export class ScriptCompletionProvider implements ApiProvider {
6
+ constructor(private scriptPath: string, private config?: ProviderConfig) {}
7
+
8
+ id() {
9
+ return 'script';
10
+ }
11
+
12
+ async callApi(prompt: string) {
13
+ return new Promise((resolve, reject) => {
14
+ exec(`${this.scriptPath} "${prompt}"`, (error, stdout, stderr) => {
15
+ if (error) {
16
+ reject(error);
17
+ } else {
18
+ resolve({ output: stdout.trim() });
19
+ }
20
+ });
21
+ }) as Promise<ProviderResponse>;
22
+ }
23
+ }
package/src/providers.ts CHANGED
@@ -4,6 +4,7 @@ import { ApiProvider, ProviderConfig, ProviderId, RawProviderConfig } from './ty
4
4
 
5
5
  import { OpenAiCompletionProvider, OpenAiChatCompletionProvider } from './providers/openai';
6
6
  import { LocalAiCompletionProvider, LocalAiChatProvider } from './providers/localai';
7
+ import { ScriptCompletionProvider } from './providers/scriptCompletion';
7
8
 
8
9
  export async function loadApiProviders(
9
10
  providerPaths: ProviderId | ProviderId[] | RawProviderConfig[],
@@ -30,7 +31,11 @@ export async function loadApiProvider(
30
31
  providerPath: string,
31
32
  context: ProviderConfig | undefined = undefined,
32
33
  ): Promise<ApiProvider> {
33
- if (providerPath?.startsWith('openai:')) {
34
+ if (providerPath?.startsWith('script:')) {
35
+ // Load script module
36
+ const scriptPath = providerPath.split(':')[1];
37
+ return new ScriptCompletionProvider(scriptPath, context?.config);
38
+ } else if (providerPath?.startsWith('openai:')) {
34
39
  // Load OpenAI module
35
40
  const options = providerPath.split(':');
36
41
  const modelType = options[1];
package/src/table.ts CHANGED
@@ -7,7 +7,7 @@ export function generateTable(summary: EvaluateSummary, tableCellMaxLength = 250
7
7
  const head = summary.table.head;
8
8
  const headLength = head.prompts.length + head.vars.length;
9
9
  const table = new Table({
10
- head: [...head.prompts, ...head.vars],
10
+ head: [...head.prompts.map((prompt) => prompt.display), ...head.vars],
11
11
  colWidths: Array(headLength).fill(Math.floor(maxWidth / headLength)),
12
12
  wordWrap: true,
13
13
  wrapOnWordBoundary: false,
package/src/types.ts CHANGED
@@ -95,11 +95,12 @@ export interface EvaluateTableOutput {
95
95
  pass: boolean;
96
96
  score: number;
97
97
  text: string;
98
+ prompt: string;
98
99
  }
99
100
 
100
101
  export interface EvaluateTable {
101
102
  head: {
102
- prompts: string[];
103
+ prompts: Prompt[];
103
104
  vars: string[];
104
105
  };
105
106
 
package/src/util.ts CHANGED
@@ -347,7 +347,9 @@ export function testCaseFromCsvRow(row: CsvRow): TestCase {
347
347
  const asserts: Assertion[] = [];
348
348
  for (const [key, value] of Object.entries(row)) {
349
349
  if (key === '__expected') {
350
- asserts.push(assertionFromString(value));
350
+ if (value.trim() !== '') {
351
+ asserts.push(assertionFromString(value));
352
+ }
351
353
  } else {
352
354
  vars[key] = value;
353
355
  }
@@ -35,6 +35,15 @@
35
35
  "vite": "^4.3.2"
36
36
  }
37
37
  },
38
+ "node_modules/@aashutoshrathi/word-wrap": {
39
+ "version": "1.2.6",
40
+ "resolved": "https://registry.npmjs.org/@aashutoshrathi/word-wrap/-/word-wrap-1.2.6.tgz",
41
+ "integrity": "sha512-1Yjs2SvM8TflER/OD3cOjhWWOZb58A2t7wpE2S9XfBYTiIl+XFhQG2bjy4Pu1I+EAlCNUzRDYDdFwFYUKvXcIA==",
42
+ "dev": true,
43
+ "engines": {
44
+ "node": ">=0.10.0"
45
+ }
46
+ },
38
47
  "node_modules/@babel/code-frame": {
39
48
  "version": "7.21.4",
40
49
  "resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.21.4.tgz",
@@ -2685,17 +2694,17 @@
2685
2694
  }
2686
2695
  },
2687
2696
  "node_modules/optionator": {
2688
- "version": "0.9.1",
2689
- "resolved": "https://registry.npmjs.org/optionator/-/optionator-0.9.1.tgz",
2690
- "integrity": "sha512-74RlY5FCnhq4jRxVUPKDaRwrVNXMqsGsiW6AJw4XK8hmtm10wC0ypZBLw5IIp85NZMr91+qd1RvvENwg7jjRFw==",
2697
+ "version": "0.9.3",
2698
+ "resolved": "https://registry.npmjs.org/optionator/-/optionator-0.9.3.tgz",
2699
+ "integrity": "sha512-JjCoypp+jKn1ttEFExxhetCKeJt9zhAgAve5FXHixTvFDW/5aEktX9bufBKLRRMdU7bNtpLfcGu94B3cdEJgjg==",
2691
2700
  "dev": true,
2692
2701
  "dependencies": {
2702
+ "@aashutoshrathi/word-wrap": "^1.2.3",
2693
2703
  "deep-is": "^0.1.3",
2694
2704
  "fast-levenshtein": "^2.0.6",
2695
2705
  "levn": "^0.4.1",
2696
2706
  "prelude-ls": "^1.2.1",
2697
- "type-check": "^0.4.0",
2698
- "word-wrap": "^1.2.3"
2707
+ "type-check": "^0.4.0"
2699
2708
  },
2700
2709
  "engines": {
2701
2710
  "node": ">= 0.8.0"
@@ -3083,9 +3092,9 @@
3083
3092
  }
3084
3093
  },
3085
3094
  "node_modules/semver": {
3086
- "version": "7.5.0",
3087
- "resolved": "https://registry.npmjs.org/semver/-/semver-7.5.0.tgz",
3088
- "integrity": "sha512-+XC0AD/R7Q2mPSRuy2Id0+CGTZ98+8f+KvwirxOKIEyid+XSx6HbC63p+O4IndTHuX5Z+JxQ0TghCkO5Cg/2HA==",
3095
+ "version": "7.5.3",
3096
+ "resolved": "https://registry.npmjs.org/semver/-/semver-7.5.3.tgz",
3097
+ "integrity": "sha512-QBlUtyVk/5EeHbi7X0fw6liDZc7BBmEaSYn01fMU1OUYbf6GPsbTtd8WmnqbI20SeycoHSeiybkE/q1Q+qlThQ==",
3089
3098
  "dev": true,
3090
3099
  "dependencies": {
3091
3100
  "lru-cache": "^6.0.0"
@@ -3391,15 +3400,6 @@
3391
3400
  "node": ">= 8"
3392
3401
  }
3393
3402
  },
3394
- "node_modules/word-wrap": {
3395
- "version": "1.2.3",
3396
- "resolved": "https://registry.npmjs.org/word-wrap/-/word-wrap-1.2.3.tgz",
3397
- "integrity": "sha512-Hz/mrNwitNRh/HUAtM/VT/5VH+ygD6DV7mYKZAtHOrbs8U7lvPS6xf7EJKMF0uW1KJCl0H701g3ZGus+muE5vQ==",
3398
- "dev": true,
3399
- "engines": {
3400
- "node": ">=0.10.0"
3401
- }
3402
- },
3403
3403
  "node_modules/wrappy": {
3404
3404
  "version": "1.0.2",
3405
3405
  "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz",
@@ -3485,6 +3485,12 @@
3485
3485
  }
3486
3486
  },
3487
3487
  "dependencies": {
3488
+ "@aashutoshrathi/word-wrap": {
3489
+ "version": "1.2.6",
3490
+ "resolved": "https://registry.npmjs.org/@aashutoshrathi/word-wrap/-/word-wrap-1.2.6.tgz",
3491
+ "integrity": "sha512-1Yjs2SvM8TflER/OD3cOjhWWOZb58A2t7wpE2S9XfBYTiIl+XFhQG2bjy4Pu1I+EAlCNUzRDYDdFwFYUKvXcIA==",
3492
+ "dev": true
3493
+ },
3488
3494
  "@babel/code-frame": {
3489
3495
  "version": "7.21.4",
3490
3496
  "resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.21.4.tgz",
@@ -5231,17 +5237,17 @@
5231
5237
  }
5232
5238
  },
5233
5239
  "optionator": {
5234
- "version": "0.9.1",
5235
- "resolved": "https://registry.npmjs.org/optionator/-/optionator-0.9.1.tgz",
5236
- "integrity": "sha512-74RlY5FCnhq4jRxVUPKDaRwrVNXMqsGsiW6AJw4XK8hmtm10wC0ypZBLw5IIp85NZMr91+qd1RvvENwg7jjRFw==",
5240
+ "version": "0.9.3",
5241
+ "resolved": "https://registry.npmjs.org/optionator/-/optionator-0.9.3.tgz",
5242
+ "integrity": "sha512-JjCoypp+jKn1ttEFExxhetCKeJt9zhAgAve5FXHixTvFDW/5aEktX9bufBKLRRMdU7bNtpLfcGu94B3cdEJgjg==",
5237
5243
  "dev": true,
5238
5244
  "requires": {
5245
+ "@aashutoshrathi/word-wrap": "^1.2.3",
5239
5246
  "deep-is": "^0.1.3",
5240
5247
  "fast-levenshtein": "^2.0.6",
5241
5248
  "levn": "^0.4.1",
5242
5249
  "prelude-ls": "^1.2.1",
5243
- "type-check": "^0.4.0",
5244
- "word-wrap": "^1.2.3"
5250
+ "type-check": "^0.4.0"
5245
5251
  }
5246
5252
  },
5247
5253
  "p-limit": {
@@ -5483,9 +5489,9 @@
5483
5489
  }
5484
5490
  },
5485
5491
  "semver": {
5486
- "version": "7.5.0",
5487
- "resolved": "https://registry.npmjs.org/semver/-/semver-7.5.0.tgz",
5488
- "integrity": "sha512-+XC0AD/R7Q2mPSRuy2Id0+CGTZ98+8f+KvwirxOKIEyid+XSx6HbC63p+O4IndTHuX5Z+JxQ0TghCkO5Cg/2HA==",
5492
+ "version": "7.5.3",
5493
+ "resolved": "https://registry.npmjs.org/semver/-/semver-7.5.3.tgz",
5494
+ "integrity": "sha512-QBlUtyVk/5EeHbi7X0fw6liDZc7BBmEaSYn01fMU1OUYbf6GPsbTtd8WmnqbI20SeycoHSeiybkE/q1Q+qlThQ==",
5489
5495
  "dev": true,
5490
5496
  "requires": {
5491
5497
  "lru-cache": "^6.0.0"
@@ -5674,12 +5680,6 @@
5674
5680
  "isexe": "^2.0.0"
5675
5681
  }
5676
5682
  },
5677
- "word-wrap": {
5678
- "version": "1.2.3",
5679
- "resolved": "https://registry.npmjs.org/word-wrap/-/word-wrap-1.2.3.tgz",
5680
- "integrity": "sha512-Hz/mrNwitNRh/HUAtM/VT/5VH+ygD6DV7mYKZAtHOrbs8U7lvPS6xf7EJKMF0uW1KJCl0H701g3ZGus+muE5vQ==",
5681
- "dev": true
5682
- },
5683
5683
  "wrappy": {
5684
5684
  "version": "1.0.2",
5685
5685
  "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz",
@@ -0,0 +1,61 @@
1
+ import { useState, useEffect } from 'react';
2
+ import Button from '@mui/material/Button';
3
+ import Dialog from '@mui/material/Dialog';
4
+ import DialogActions from '@mui/material/DialogActions';
5
+ import DialogContent from '@mui/material/DialogContent';
6
+ import DialogTitle from '@mui/material/DialogTitle';
7
+ import TextareaAutosize from '@mui/base/TextareaAutosize';
8
+ import IconButton from '@mui/material/IconButton';
9
+ import ContentCopyIcon from '@mui/icons-material/ContentCopy';
10
+ import CheckIcon from '@mui/icons-material/Check';
11
+
12
+ interface EvalOutputPromptDialogProps {
13
+ open: boolean;
14
+ onClose: () => void;
15
+ prompt: string;
16
+ output?: string;
17
+ }
18
+
19
+ export default function EvalOutputPromptDialog({
20
+ open,
21
+ onClose,
22
+ prompt,
23
+ output,
24
+ }: EvalOutputPromptDialogProps) {
25
+ const [copied, setCopied] = useState(false);
26
+
27
+ useEffect(() => {
28
+ setCopied(false);
29
+ }, [prompt]);
30
+
31
+ const copyToClipboard = async (text: string) => {
32
+ await navigator.clipboard.writeText(text);
33
+ setCopied(true);
34
+ };
35
+
36
+ return (
37
+ <Dialog open={open} onClose={onClose} fullWidth maxWidth="lg">
38
+ <DialogTitle>Prompt</DialogTitle>
39
+ <DialogContent>
40
+ <TextareaAutosize readOnly value={prompt} style={{ width: '100%' }} />
41
+ <IconButton
42
+ onClick={() => copyToClipboard(prompt)}
43
+ style={{ position: 'absolute', right: '10px', top: '10px' }}
44
+ >
45
+ {copied ? <CheckIcon /> : <ContentCopyIcon />}
46
+ </IconButton>
47
+ </DialogContent>
48
+ {output && (
49
+ <>
50
+ <DialogTitle>Output</DialogTitle>
51
+ <DialogContent>
52
+ <TextareaAutosize readOnly value={output} style={{ width: '100%' }} />
53
+ </DialogContent>
54
+ </>
55
+ )}
56
+ <DialogActions>
57
+ <Button onClick={onClose}>Close</Button>
58
+ </DialogActions>
59
+ </Dialog>
60
+ );
61
+ }
@@ -64,10 +64,17 @@ th,
64
64
  vertical-align: bottom;
65
65
  }
66
66
 
67
+ th .action {
68
+ cursor: pointer;
69
+ margin-left: 0.5rem;
70
+ }
71
+
67
72
  tr .cell {
68
73
  }
69
74
 
70
- tr .cell-rating {
75
+ tr .cell-actions {
76
+ display: flex;
77
+ gap: 0.5rem;
71
78
  visibility: hidden;
72
79
  position: absolute;
73
80
  bottom: 1.25rem;
@@ -76,18 +83,14 @@ tr .cell-rating {
76
83
  font-size: 1.75rem;
77
84
  }
78
85
 
79
- tr:hover .cell-rating {
86
+ tr:hover .cell-actions {
80
87
  visibility: visible;
81
88
  }
82
89
 
83
- tr .cell-rating .rating {
90
+ tr .cell-actions .action {
84
91
  cursor: pointer;
85
92
  }
86
93
 
87
- tr .cell-rating .rating:first-child {
88
- margin-right: 0.5rem;
89
- }
90
-
91
94
  th .smalltext {
92
95
  visibility: hidden;
93
96
  font-weight: normal;
@@ -16,6 +16,8 @@ import { useStore } from './store.js';
16
16
 
17
17
  import type { CellContext, VisibilityState } from '@tanstack/table-core';
18
18
 
19
+ import EvalOutputPromptDialog from './EvalOutputPromptDialog';
20
+
19
21
  import type { EvalRow, EvalRowOutput, FilterMode } from './types.js';
20
22
 
21
23
  import './ResultsTable.css';
@@ -62,28 +64,24 @@ function TruncatedText({ text: rawText, maxLength }: TruncatedTextProps) {
62
64
 
63
65
  const renderTruncatedText = () => {
64
66
  if (text.length <= maxLength) {
65
- return text;
67
+ return <span>{text}</span>;
66
68
  }
67
69
  if (isTruncated) {
68
70
  return (
69
- <>
70
- <span style={{ cursor: 'pointer' }} onClick={toggleTruncate}>
71
- {text.substring(0, maxLength)} ...
72
- </span>
73
- </>
71
+ <span style={{ cursor: 'pointer' }} onClick={toggleTruncate}>
72
+ {text.substring(0, maxLength)} ...
73
+ </span>
74
74
  );
75
75
  } else {
76
76
  return (
77
- <>
78
- <span style={{ cursor: 'pointer' }} onClick={toggleTruncate}>
79
- {text}
80
- </span>
81
- </>
77
+ <span style={{ cursor: 'pointer' }} onClick={toggleTruncate}>
78
+ {text}
79
+ </span>
82
80
  );
83
81
  }
84
82
  };
85
83
 
86
- return <div>{renderTruncatedText()}</div>;
84
+ return renderTruncatedText();
87
85
  }
88
86
 
89
87
  interface PromptOutputProps {
@@ -101,6 +99,13 @@ function EvalOutputCell({
101
99
  promptIndex,
102
100
  onRating,
103
101
  }: PromptOutputProps) {
102
+ const [openPrompt, setOpen] = React.useState(false);
103
+ const handlePromptOpen = () => {
104
+ setOpen(true);
105
+ };
106
+ const handlePromptClose = () => {
107
+ setOpen(false);
108
+ };
104
109
  let text = typeof output.text === 'string' ? output.text : JSON.stringify(output.text);
105
110
  let chunks: string[] = [];
106
111
  if (!output.pass && text.includes('---')) {
@@ -113,6 +118,7 @@ function EvalOutputCell({
113
118
  onRating(rowIndex, promptIndex, isPass);
114
119
  };
115
120
 
121
+ // TODO(ian): output.prompt check for backwards compatibility, remove after 0.17.0
116
122
  return (
117
123
  <>
118
124
  <div className="cell">
@@ -128,11 +134,24 @@ function EvalOutputCell({
128
134
  )}{' '}
129
135
  <TruncatedText text={text} maxLength={maxTextLength} />
130
136
  </div>
131
- <div className="cell-rating">
132
- <span className="rating" onClick={() => handleClick(true)}>
137
+ <div className="cell-actions">
138
+ {output.prompt && (
139
+ <>
140
+ <span className="action" onClick={handlePromptOpen}>
141
+ 🔎
142
+ </span>
143
+ <EvalOutputPromptDialog
144
+ open={openPrompt}
145
+ onClose={handlePromptClose}
146
+ prompt={output.prompt}
147
+ output={text}
148
+ />
149
+ </>
150
+ )}
151
+ <span className="action" onClick={() => handleClick(true)}>
133
152
  👍
134
153
  </span>
135
- <span className="rating" onClick={() => handleClick(false)}>
154
+ <span className="action" onClick={() => handleClick(false)}>
136
155
  👎
137
156
  </span>
138
157
  </div>
@@ -140,11 +159,35 @@ function EvalOutputCell({
140
159
  );
141
160
  }
142
161
 
143
- function TableHeader({ text, maxLength, smallText }: TruncatedTextProps & { smallText: string }) {
162
+ function TableHeader({
163
+ text,
164
+ maxLength,
165
+ smallText,
166
+ expandedText,
167
+ }: TruncatedTextProps & { smallText: string; expandedText?: string }) {
168
+ const [openPrompt, setOpen] = React.useState(false);
169
+ const handlePromptOpen = () => {
170
+ setOpen(true);
171
+ };
172
+ const handlePromptClose = () => {
173
+ setOpen(false);
174
+ };
144
175
  return (
145
176
  <div>
146
177
  <TruncatedText text={text} maxLength={maxLength} />
147
- <span className="smalltext">{smallText}</span>
178
+ {expandedText && (
179
+ <>
180
+ <span className="action" onClick={handlePromptOpen}>
181
+ 🔎
182
+ </span>
183
+ <EvalOutputPromptDialog
184
+ open={openPrompt}
185
+ onClose={handlePromptClose}
186
+ prompt={expandedText}
187
+ />
188
+ </>
189
+ )}
190
+ <div className="smalltext">{smallText}</div>
148
191
  </div>
149
192
  );
150
193
  }
@@ -233,11 +276,13 @@ export default function ResultsTable({
233
276
  numGood[idx] === highestPassingCount && highestPassingCount !== 0;
234
277
  const columnId = `Prompt ${idx + 1}`;
235
278
  const isChecked = failureFilter[columnId] || false;
279
+ // TODO(ian): prompt string support for backwards compatibility, remove after 0.17.0
236
280
  return (
237
281
  <>
238
282
  <TableHeader
239
283
  smallText={`Prompt ${idx + 1}`}
240
- text={prompt}
284
+ text={typeof prompt === 'string' ? prompt : prompt.display}
285
+ expandedText={typeof prompt === 'string' ? undefined : prompt.raw}
241
286
  maxLength={maxTextLength}
242
287
  />
243
288
  {filterMode === 'failures' && (
@@ -1,5 +1,10 @@
1
+ type Prompt = {
2
+ display: string;
3
+ raw: string;
4
+ };
5
+
1
6
  export type EvalHead = {
2
- prompts: string[];
7
+ prompts: Prompt[];
3
8
  vars: string[];
4
9
  };
5
10
 
@@ -7,6 +12,7 @@ export type EvalRowOutput = {
7
12
  pass: boolean;
8
13
  score: number;
9
14
  text: string | object;
15
+ prompt: string;
10
16
  };
11
17
 
12
18
  export type EvalRow = {