promptfoo 0.17.5 → 0.17.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. package/dist/package.json +2 -1
  2. package/dist/src/cache.d.ts +3 -0
  3. package/dist/src/cache.d.ts.map +1 -1
  4. package/dist/src/cache.js +6 -1
  5. package/dist/src/cache.js.map +1 -1
  6. package/dist/src/evaluator.d.ts.map +1 -1
  7. package/dist/src/evaluator.js +21 -17
  8. package/dist/src/evaluator.js.map +1 -1
  9. package/dist/src/index.d.ts +1 -0
  10. package/dist/src/index.d.ts.map +1 -1
  11. package/dist/src/main.js +9 -5
  12. package/dist/src/main.js.map +1 -1
  13. package/dist/src/providers/azureopenai.d.ts.map +1 -1
  14. package/dist/src/providers/azureopenai.js +1 -13
  15. package/dist/src/providers/azureopenai.js.map +1 -1
  16. package/dist/src/providers/localai.d.ts.map +1 -1
  17. package/dist/src/providers/localai.js +2 -1
  18. package/dist/src/providers/localai.js.map +1 -1
  19. package/dist/src/providers/openai.d.ts.map +1 -1
  20. package/dist/src/providers/openai.js +1 -27
  21. package/dist/src/providers/openai.js.map +1 -1
  22. package/dist/src/providers/replicate.d.ts +11 -0
  23. package/dist/src/providers/replicate.d.ts.map +1 -0
  24. package/dist/src/providers/replicate.js +78 -0
  25. package/dist/src/providers/replicate.js.map +1 -0
  26. package/dist/src/providers/shared.d.ts +5 -0
  27. package/dist/src/providers/shared.d.ts.map +1 -1
  28. package/dist/src/providers/shared.js +33 -1
  29. package/dist/src/providers/shared.js.map +1 -1
  30. package/dist/src/providers.d.ts +2 -0
  31. package/dist/src/providers.d.ts.map +1 -1
  32. package/dist/src/providers.js +8 -0
  33. package/dist/src/providers.js.map +1 -1
  34. package/dist/src/types.d.ts +2 -0
  35. package/dist/src/types.d.ts.map +1 -1
  36. package/dist/src/web/client/assets/{index-c2756e5d.js → index-13198388.js} +23 -23
  37. package/dist/src/web/client/assets/index-f9b230d1.css +1 -0
  38. package/dist/src/web/client/index.html +2 -2
  39. package/package.json +2 -1
  40. package/src/cache.ts +5 -1
  41. package/src/evaluator.ts +23 -17
  42. package/src/main.ts +13 -5
  43. package/src/providers/azureopenai.ts +2 -18
  44. package/src/providers/localai.ts +3 -2
  45. package/src/providers/openai.ts +2 -33
  46. package/src/providers/replicate.ts +86 -0
  47. package/src/providers/shared.ts +29 -0
  48. package/src/providers.ts +8 -0
  49. package/src/types.ts +2 -0
  50. package/src/web/client/src/App.tsx +6 -0
  51. package/src/web/client/src/EvalOutputPromptDialog.tsx +6 -2
  52. package/src/web/client/src/ResultsTable.tsx +5 -0
  53. package/src/web/client/src/ResultsView.tsx +2 -1
  54. package/src/web/client/src/index.css +1 -12
  55. package/src/web/client/src/types.ts +1 -1
  56. package/dist/src/web/client/assets/index-b82d0138.css +0 -1
@@ -0,0 +1 @@
1
+ :root{font-family:system-ui,Avenir,Helvetica,Arial,sans-serif;font-synthesis:none;text-rendering:optimizeLegibility;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale;-webkit-text-size-adjust:100%;--background-color: #ffffff;--text-color: #404040;--border-color: lightgray;--table-border-color: lightgray;--pass-color: green;--fail-color: #ad0000;--smalltext-color: gray;--success-background-color: #d1ffd7;--variable-background-color: #f7f7f7;--header-background-color: #fffdf7}[data-theme=dark]{--background-color: #1a1a1a;--text-color: #f0f0f0;--border-color: #444444;--table-border-color: #444444;--pass-color: #4caf50;--fail-color: #f44336;--smalltext-color: #888888;--success-background-color: #216d2b;--variable-background-color: #333;--header-background-color: #333}html{font-size:calc(14px + (18 - 14) * ((100vw - 300px) / (1600 - 300)))}*{box-sizing:border-box}html{font-family:-apple-system,BlinkMacSystemFont,Segoe UI,Helvetica,Arial,sans-serif,"Apple Color Emoji","Segoe UI Emoji",Segoe UI Symbol;font-size:16px;background-color:var(--background-color);color:var(--text-color)}table,.divTable{border:1px solid var(--table-border-color);border-collapse:collapse;width:100%;margin:1rem 0;box-shadow:0 2px 4px #0000001a}.tr{display:flex}tr,.tr{width:fit-content}tr:hover,.tr:hover{background-color:#0000000d}th,.th,td,.td{position:relative;box-shadow:inset 0 0 0 1px var(--border-color);vertical-align:top;padding:1.5rem}th.variable,.th.variable,td.variable,.td.variable{background-color:var(--variable-background-color)}tr.header{background-color:var(--header-background-color)}th,.th{padding:1rem;position:relative;text-align:center;vertical-align:bottom}th .action{cursor:pointer;margin-left:.5rem}tr .cell-actions{display:flex;gap:.5rem;visibility:hidden;position:absolute;bottom:1.25rem;right:0;line-height:0;font-size:1.75rem}tr:hover .cell-actions{visibility:visible}tr .cell-actions .action{cursor:pointer}th .smalltext{visibility:hidden;font-weight:400;font-size:.75rem;color:var(--smalltext-color)}th:hover .smalltext{visibility:visible}th .summary{font-weight:400;font-size:.8rem;padding:.25rem}th .summary.highlight{background-color:var(--success-background-color)}td .status{margin-bottom:.5rem;font-weight:700}td .score{font-weight:400}td .pass{color:var(--pass-color)}td .fail{color:var(--fail-color)}.first-prompt-col{border-left:2px solid #888}.first-prompt-row{border-top:2px solid #888}.resizer{position:absolute;right:0;top:0;height:100%;width:5px;cursor:col-resize;user-select:none;touch-action:none;background:var(--text-color);opacity:.5}.resizer.isResizing{background:var(--text-color);opacity:1}@media (hover: hover){.resizer{opacity:0}*:hover>.resizer{opacity:1}}.logo{display:flex;align-items:center;gap:4px}.logo img{width:30px}.logo span{margin-bottom:6px;color:var(--text-color)}[data-theme=dark] .logo img{filter:invert(1)}nav{display:flex;justify-content:space-between;align-items:center;margin-bottom:1rem;color:var(--text-color)}.dark-mode-toggle{background-color:transparent;border:none;color:var(--text-color);cursor:pointer;font-size:16px;padding:8px;transition:color .3s}.dark-mode-toggle:hover{color:var(--pass-color)}body{background-color:var(--background-color);color:var(--text-color)}
@@ -5,8 +5,8 @@
5
5
  <link rel="icon" type="image/svg+xml" href="favicon.ico" />
6
6
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
7
7
  <title>promptfoo web viewer</title>
8
- <script type="module" crossorigin src="/assets/index-c2756e5d.js"></script>
9
- <link rel="stylesheet" href="/assets/index-b82d0138.css">
8
+ <script type="module" crossorigin src="/assets/index-13198388.js"></script>
9
+ <link rel="stylesheet" href="/assets/index-f9b230d1.css">
10
10
  </head>
11
11
  <body>
12
12
  <div id="root"></div>
package/package.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "name": "promptfoo",
3
3
  "description": "LLM eval & testing toolkit",
4
4
  "author": "Ian Webster",
5
- "version": "0.17.5",
5
+ "version": "0.17.6",
6
6
  "license": "MIT",
7
7
  "type": "commonjs",
8
8
  "main": "dist/src/index.js",
@@ -79,6 +79,7 @@
79
79
  "node-fetch": "^2.6.7",
80
80
  "nunjucks": "^3.2.4",
81
81
  "opener": "^1.5.2",
82
+ "replicate": "^0.12.3",
82
83
  "rouge": "^1.0.3",
83
84
  "semver": "^7.5.3",
84
85
  "socket.io": "^4.6.1",
package/src/cache.ts CHANGED
@@ -20,7 +20,7 @@ let enabled =
20
20
  const cacheType =
21
21
  process.env.PROMPTFOO_CACHE_TYPE || (process.env.NODE_ENV === 'test' ? 'memory' : 'disk');
22
22
 
23
- function getCache() {
23
+ export function getCache() {
24
24
  if (!cacheInstance) {
25
25
  const cachePath =
26
26
  process.env.PROMPTFOO_CACHE_PATH || path.join(getConfigDirectoryPath(), 'cache');
@@ -102,3 +102,7 @@ export async function clearCache() {
102
102
  logger.info('Clearing cache...');
103
103
  return getCache().reset();
104
104
  }
105
+
106
+ export function isCacheEnabled() {
107
+ return enabled;
108
+ }
package/src/evaluator.ts CHANGED
@@ -33,6 +33,7 @@ interface RunEvalOptions {
33
33
 
34
34
  rowIndex: number;
35
35
  colIndex: number;
36
+ repeatIndex: number;
36
37
  }
37
38
 
38
39
  const DEFAULT_MAX_CONCURRENCY = 4;
@@ -266,25 +267,30 @@ class Evaluator {
266
267
  // Finalize test case eval
267
268
  const varCombinations = generateVarCombinations(testCase.vars || {});
268
269
  totalVarCombinations += varCombinations.length;
269
- for (const vars of varCombinations) {
270
- let colIndex = 0;
271
- for (const prompt of testSuite.prompts) {
272
- for (const provider of testSuite.providers) {
273
- runEvalOptions.push({
274
- provider,
275
- prompt: {
276
- ...prompt,
277
- raw: prependToPrompt + prompt.raw + appendToPrompt,
278
- },
279
- test: { ...testCase, vars },
280
- includeProviderId: testSuite.providers.length > 1,
281
- rowIndex,
282
- colIndex,
283
- });
284
- colIndex++;
270
+
271
+ const numRepeat = this.options.repeat || 1;
272
+ for (let repeatIndex = 0; repeatIndex < numRepeat; repeatIndex++) {
273
+ for (const vars of varCombinations) {
274
+ let colIndex = 0;
275
+ for (const prompt of testSuite.prompts) {
276
+ for (const provider of testSuite.providers) {
277
+ runEvalOptions.push({
278
+ provider,
279
+ prompt: {
280
+ ...prompt,
281
+ raw: prependToPrompt + prompt.raw + appendToPrompt,
282
+ },
283
+ test: { ...testCase, vars },
284
+ includeProviderId: testSuite.providers.length > 1,
285
+ rowIndex,
286
+ colIndex,
287
+ repeatIndex,
288
+ });
289
+ colIndex++;
290
+ }
285
291
  }
292
+ rowIndex++;
286
293
  }
287
- rowIndex++;
288
294
  }
289
295
  }
290
296
 
package/src/main.ts CHANGED
@@ -130,7 +130,7 @@ async function main() {
130
130
 
131
131
  program
132
132
  .command('share')
133
- .description('Share your most recent result')
133
+ .description('Create a shareable URL of your most recent eval')
134
134
  .option('-y, --yes', 'Skip confirmation')
135
135
  .action(async (cmdObj: { yes: boolean } & Command) => {
136
136
  telemetry.maybeShowNotice();
@@ -158,10 +158,9 @@ async function main() {
158
158
  });
159
159
 
160
160
  reader.question(
161
- 'Are you sure you want to create a public URL? [y/N] ',
161
+ 'Are you sure you want to create a shareable URL of your most recent eval? Anyone you give this URL to will be able to view the results [Y/n] ',
162
162
  async function (answer: string) {
163
- if (answer.toLowerCase() !== 'yes' && answer.toLowerCase() !== 'y') {
164
- logger.info('Did not create a public URL.');
163
+ if (answer.toLowerCase() !== 'yes' && answer.toLowerCase() !== 'y' && answer !== '') {
165
164
  reader.close();
166
165
  return;
167
166
  }
@@ -218,6 +217,13 @@ async function main() {
218
217
  ? String(defaultConfig.evaluateOptions.maxConcurrency)
219
218
  : undefined,
220
219
  )
220
+ .option(
221
+ '--repeat <number>',
222
+ 'Number of times to run each test',
223
+ defaultConfig.evaluateOptions?.repeat
224
+ ? String(defaultConfig.evaluateOptions.repeat)
225
+ : undefined,
226
+ )
221
227
  .option(
222
228
  '--table-cell-max-length <number>',
223
229
  'Truncate console table cells to this length',
@@ -263,7 +269,6 @@ async function main() {
263
269
  }
264
270
 
265
271
  // Config parsing
266
- const maxConcurrency = parseInt(cmdObj.maxConcurrency || '', 10);
267
272
  let fileConfig: Partial<UnifiedConfig> = {};
268
273
  const configPath = cmdObj.config;
269
274
  if (configPath) {
@@ -326,12 +331,15 @@ async function main() {
326
331
  defaultTest,
327
332
  };
328
333
 
334
+ const maxConcurrency = parseInt(cmdObj.maxConcurrency || '', 10);
335
+ const iterations = parseInt(cmdObj.repeat || '', 10);
329
336
  const options: EvaluateOptions = {
330
337
  showProgressBar:
331
338
  typeof cmdObj.progressBar === 'undefined'
332
339
  ? getLogLevel() !== 'debug'
333
340
  : cmdObj.progressBar,
334
341
  maxConcurrency: !isNaN(maxConcurrency) && maxConcurrency > 0 ? maxConcurrency : undefined,
342
+ repeat: !isNaN(iterations) && iterations > 0 ? iterations : 1,
335
343
  ...evaluateOptions,
336
344
  };
337
345
 
@@ -1,6 +1,6 @@
1
1
  import logger from '../logger';
2
2
  import { fetchJsonWithCache } from '../cache';
3
- import { REQUEST_TIMEOUT_MS } from './shared';
3
+ import { REQUEST_TIMEOUT_MS, parseChatPrompt } from './shared';
4
4
 
5
5
  import type { ApiProvider, ProviderEmbeddingResponse, ProviderResponse } from '../types.js';
6
6
 
@@ -205,23 +205,7 @@ export class AzureOpenAiChatCompletionProvider extends AzureOpenAiGenericProvide
205
205
  throw new Error('Azure OpenAI API host must be set');
206
206
  }
207
207
 
208
- let messages: { role: string; content: string; name?: string }[];
209
- try {
210
- messages = JSON.parse(prompt) as { role: string; content: string }[];
211
- } catch (err) {
212
- const trimmedPrompt = prompt.trim();
213
- if (
214
- process.env.PROMPTFOO_REQUIRE_JSON_PROMPTS ||
215
- trimmedPrompt.startsWith('{') ||
216
- trimmedPrompt.startsWith('[')
217
- ) {
218
- throw new Error(
219
- `Azure OpenAI Chat Completion prompt is not a valid JSON string: ${err}\n\n${prompt}`,
220
- );
221
- }
222
- messages = [{ role: 'user', content: prompt }];
223
- }
224
-
208
+ const messages = parseChatPrompt(prompt);
225
209
  const body = {
226
210
  model: this.deploymentName,
227
211
  messages: messages,
@@ -1,6 +1,6 @@
1
1
  import logger from '../logger';
2
2
  import { fetchJsonWithCache } from '../cache';
3
- import { REQUEST_TIMEOUT_MS } from './shared';
3
+ import { REQUEST_TIMEOUT_MS, parseChatPrompt } from './shared';
4
4
 
5
5
  import type { ApiProvider, ProviderResponse } from '../types.js';
6
6
 
@@ -29,9 +29,10 @@ class LocalAiGenericProvider implements ApiProvider {
29
29
 
30
30
  export class LocalAiChatProvider extends LocalAiGenericProvider {
31
31
  async callApi(prompt: string): Promise<ProviderResponse> {
32
+ const messages = parseChatPrompt(prompt);
32
33
  const body = {
33
34
  model: this.modelName,
34
- prompt,
35
+ messages: messages,
35
36
  temperature: process.env.LOCALAI_TEMPERATURE || 0.7,
36
37
  };
37
38
  logger.debug(`Calling LocalAI API: ${JSON.stringify(body)}`);
@@ -1,8 +1,7 @@
1
- import yaml from 'js-yaml';
2
1
 
3
2
  import logger from '../logger';
4
3
  import { fetchJsonWithCache } from '../cache';
5
- import { REQUEST_TIMEOUT_MS } from './shared';
4
+ import { REQUEST_TIMEOUT_MS, parseChatPrompt } from './shared';
6
5
 
7
6
  import type { ApiProvider, ProviderEmbeddingResponse, ProviderResponse } from '../types.js';
8
7
 
@@ -227,37 +226,7 @@ export class OpenAiChatCompletionProvider extends OpenAiGenericProvider {
227
226
  );
228
227
  }
229
228
 
230
- let messages: { role: string; content: string; name?: string }[];
231
- const trimmedPrompt = prompt.trim();
232
- if (trimmedPrompt.startsWith('- role:')) {
233
- try {
234
- // Try YAML
235
- messages = yaml.load(prompt) as { role: string; content: string }[];
236
- } catch (err) {
237
- throw new Error(
238
- `OpenAI Chat Completion prompt is not a valid YAML string: ${err}\n\n${prompt}`,
239
- );
240
- }
241
- } else {
242
- try {
243
- // Try JSON
244
- messages = JSON.parse(prompt) as { role: string; content: string }[];
245
- } catch (err) {
246
- if (
247
- process.env.PROMPTFOO_REQUIRE_JSON_PROMPTS ||
248
- trimmedPrompt.startsWith('{') ||
249
- trimmedPrompt.startsWith('[')
250
- ) {
251
- throw new Error(
252
- `OpenAI Chat Completion prompt is not a valid JSON string: ${err}\n\n${prompt}`,
253
- );
254
- }
255
-
256
- // Fall back to wrapping the prompt in a user message
257
- messages = [{ role: 'user', content: prompt }];
258
- }
259
- }
260
-
229
+ const messages = parseChatPrompt(prompt);
261
230
  const body = {
262
231
  model: this.modelName,
263
232
  messages: messages,
@@ -0,0 +1,86 @@
1
+ import Replicate from 'replicate';
2
+
3
+ import fetch from 'node-fetch';
4
+ import logger from '../logger';
5
+ import { getCache, isCacheEnabled } from '../cache';
6
+
7
+ import type { ApiProvider, ProviderResponse } from '../types.js';
8
+
9
+ export class ReplicateProvider implements ApiProvider {
10
+ modelName: string;
11
+ apiKey?: string;
12
+ replicate: any;
13
+
14
+ constructor(modelName: string, apiKey?: string) {
15
+ this.modelName = modelName;
16
+ this.apiKey = apiKey || process.env.REPLICATE_API_TOKEN || process.env.REPLICATE_API_KEY;
17
+ }
18
+
19
+ id(): string {
20
+ return `replicate:${this.modelName}`;
21
+ }
22
+
23
+ toString(): string {
24
+ return `[Replicate Provider ${this.modelName}]`;
25
+ }
26
+
27
+ async callApi(prompt: string): Promise<ProviderResponse> {
28
+ if (!this.apiKey) {
29
+ throw new Error(
30
+ 'Replicate API key is not set. Set REPLICATE_API_TOKEN environment variable or pass it as an argument to the constructor.',
31
+ );
32
+ }
33
+
34
+ let cache;
35
+ let cacheKey;
36
+ if (isCacheEnabled()) {
37
+ cache = await getCache();
38
+ cacheKey = `replicate:${this.modelName}:${prompt}`;
39
+
40
+ // Try to get the cached response
41
+ const cachedResponse = await cache.get(cacheKey);
42
+
43
+ if (cachedResponse) {
44
+ logger.debug(`Returning cached response for ${prompt}: ${cachedResponse}`);
45
+ return JSON.parse(cachedResponse as string);
46
+ }
47
+ }
48
+
49
+ const replicate = new Replicate({
50
+ auth: this.apiKey,
51
+ fetch,
52
+ });
53
+
54
+ logger.debug(`Calling Replicate: ${prompt}`);
55
+ let response;
56
+ try {
57
+ response = await replicate.run(this.modelName as any, {
58
+ input: {
59
+ prompt,
60
+ max_length: process.env.REPLICATE_MAX_LENGTH || 2046,
61
+ temperature: process.env.REPLICATE_TEMPERATURE || 0.5,
62
+ repetition_penalty: process.env.REPLICATE_REPETITION_PENALTY || 1.0,
63
+ },
64
+ });
65
+ } catch (err) {
66
+ return {
67
+ error: `API call error: ${String(err)}`,
68
+ };
69
+ }
70
+ logger.debug(`\tReplicate API response: ${JSON.stringify(response)}`);
71
+ try {
72
+ const result = {
73
+ output: (response as string[]).join(''),
74
+ tokenUsage: {}, // TODO: add token usage once Replicate API supports it
75
+ };
76
+ if (cache && cacheKey) {
77
+ await cache.set(cacheKey, JSON.stringify(result));
78
+ }
79
+ return result;
80
+ } catch (err) {
81
+ return {
82
+ error: `API response error: ${String(err)}: ${JSON.stringify(response)}`,
83
+ };
84
+ }
85
+ }
86
+ }
@@ -1,3 +1,32 @@
1
+ import yaml from 'js-yaml';
2
+
1
3
  export const REQUEST_TIMEOUT_MS = process.env.REQUEST_TIMEOUT_MS
2
4
  ? parseInt(process.env.REQUEST_TIMEOUT_MS, 10)
3
5
  : 300_000;
6
+
7
+ export function parseChatPrompt(prompt: string): { role: string; content: string; name?: string }[] {
8
+ const trimmedPrompt = prompt.trim();
9
+ if (trimmedPrompt.startsWith('- role:')) {
10
+ try {
11
+ // Try YAML
12
+ return yaml.load(prompt) as { role: string; content: string }[];
13
+ } catch (err) {
14
+ throw new Error(`Chat Completion prompt is not a valid YAML string: ${err}\n\n${prompt}`);
15
+ }
16
+ } else {
17
+ try {
18
+ // Try JSON
19
+ return JSON.parse(prompt) as { role: string; content: string }[];
20
+ } catch (err) {
21
+ if (
22
+ process.env.PROMPTFOO_REQUIRE_JSON_PROMPTS ||
23
+ trimmedPrompt.startsWith('{') ||
24
+ trimmedPrompt.startsWith('[')
25
+ ) {
26
+ throw new Error(`Chat Completion prompt is not a valid JSON string: ${err}\n\n${prompt}`);
27
+ }
28
+ // Fall back to wrapping the prompt in a user message
29
+ return [{ role: 'user', content: prompt }];
30
+ }
31
+ }
32
+ }
package/src/providers.ts CHANGED
@@ -4,6 +4,7 @@ import { ApiProvider, ProviderConfig, ProviderId, RawProviderConfig } from './ty
4
4
 
5
5
  import { OpenAiCompletionProvider, OpenAiChatCompletionProvider } from './providers/openai';
6
6
  import { AnthropicCompletionProvider } from './providers/anthropic';
7
+ import { ReplicateProvider } from './providers/replicate';
7
8
  import { LocalAiCompletionProvider, LocalAiChatProvider } from './providers/localai';
8
9
  import { ScriptCompletionProvider } from './providers/scriptCompletion';
9
10
  import {
@@ -106,6 +107,12 @@ export async function loadApiProvider(
106
107
  `Unknown Anthropic model type: ${modelType}. Use one of the following providers: anthropic:completion:<model name>`,
107
108
  );
108
109
  }
110
+ } else if (providerPath?.startsWith('replicate:')) {
111
+ // Load Replicate module
112
+ const options = providerPath.split(':');
113
+ const modelName = options.slice(1).join(':');
114
+
115
+ return new ReplicateProvider(modelName, undefined);
109
116
  }
110
117
 
111
118
  if (providerPath?.startsWith('localai:')) {
@@ -131,6 +138,7 @@ export default {
131
138
  OpenAiCompletionProvider,
132
139
  OpenAiChatCompletionProvider,
133
140
  AnthropicCompletionProvider,
141
+ ReplicateProvider,
134
142
  LocalAiCompletionProvider,
135
143
  LocalAiChatProvider,
136
144
  loadApiProvider,
package/src/types.ts CHANGED
@@ -6,6 +6,7 @@ export interface CommandLineOptions {
6
6
 
7
7
  // Shared with EvaluateOptions
8
8
  maxConcurrency: string;
9
+ repeat: string;
9
10
 
10
11
  // Command line only
11
12
  vars?: string;
@@ -75,6 +76,7 @@ export interface EvaluateOptions {
75
76
  maxConcurrency?: number;
76
77
  showProgressBar?: boolean;
77
78
  generateSuggestions?: boolean;
79
+ repeat?: number;
78
80
  }
79
81
 
80
82
  export interface Prompt {
@@ -37,6 +37,12 @@ function App() {
37
37
  }
38
38
  };
39
39
 
40
+ React.useEffect(() => {
41
+ if (prefersDarkMode) {
42
+ document.documentElement.setAttribute('data-theme', 'dark');
43
+ }
44
+ }, [prefersDarkMode]);
45
+
40
46
  React.useEffect(() => {
41
47
  const fetchEvalData = async (id: string) => {
42
48
  if (loadedFromApi.current) {
@@ -37,7 +37,7 @@ export default function EvalOutputPromptDialog({
37
37
  <Dialog open={open} onClose={onClose} fullWidth maxWidth="lg">
38
38
  <DialogTitle>Prompt</DialogTitle>
39
39
  <DialogContent>
40
- <TextareaAutosize readOnly value={prompt} style={{ width: '100%' }} />
40
+ <TextareaAutosize readOnly value={prompt} style={{ width: '100%', padding: '0.75rem' }} />
41
41
  <IconButton
42
42
  onClick={() => copyToClipboard(prompt)}
43
43
  style={{ position: 'absolute', right: '10px', top: '10px' }}
@@ -49,7 +49,11 @@ export default function EvalOutputPromptDialog({
49
49
  <>
50
50
  <DialogTitle>Output</DialogTitle>
51
51
  <DialogContent>
52
- <TextareaAutosize readOnly value={output} style={{ width: '100%' }} />
52
+ <TextareaAutosize
53
+ readOnly
54
+ value={output}
55
+ style={{ width: '100%', padding: '0.75rem' }}
56
+ />
53
57
  </DialogContent>
54
58
  </>
55
59
  )}
@@ -334,6 +334,11 @@ export default function ResultsTable({
334
334
  return failureFilter[columnId] && isFail;
335
335
  });
336
336
  });
337
+ } else if (filterMode === 'different') {
338
+ return body.filter((row) => {
339
+ // TODO(ian): This works for strings, but not objects.
340
+ return !row.outputs.every((output) => output.text === row.outputs[0].text);
341
+ });
337
342
  }
338
343
  return body;
339
344
  }, [body, failureFilter, filterMode]);
@@ -181,7 +181,8 @@ export default function ResultsView() {
181
181
  label="Filter"
182
182
  >
183
183
  <MenuItem value="all">Show all results</MenuItem>
184
- <MenuItem value="failures">Show only failures</MenuItem>
184
+ <MenuItem value="failures">Show failures only</MenuItem>
185
+ <MenuItem value="different">Show different only</MenuItem>
185
186
  </Select>
186
187
  </FormControl>
187
188
  </Box>
@@ -21,19 +21,8 @@
21
21
  }
22
22
 
23
23
  /* Dark mode colors */
24
- @media (prefers-color-scheme: dark) {
25
- :root {
26
- --background-color: #1a1a1a;
27
- --text-color: #f0f0f0;
28
- --border-color: #444444;
29
- --table-border-color: #444444;
30
- --pass-color: #4caf50;
31
- --fail-color: #f44336;
32
- --smalltext-color: #888888;
33
- }
34
- }
35
-
36
24
  [data-theme='dark'] {
25
+ /* Keep synced with prefers-color-scheme above */
37
26
  --background-color: #1a1a1a;
38
27
  --text-color: #f0f0f0;
39
28
  --border-color: #444444;
@@ -25,6 +25,6 @@ export type EvalTable = {
25
25
  body: EvalRow[];
26
26
  };
27
27
 
28
- export type FilterMode = 'all' | 'failures';
28
+ export type FilterMode = 'all' | 'failures' | 'different';
29
29
 
30
30
  export type { UnifiedConfig } from '../../../types';
@@ -1 +0,0 @@
1
- :root{font-family:system-ui,Avenir,Helvetica,Arial,sans-serif;font-synthesis:none;text-rendering:optimizeLegibility;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale;-webkit-text-size-adjust:100%;--background-color: #ffffff;--text-color: #404040;--border-color: lightgray;--table-border-color: lightgray;--pass-color: green;--fail-color: #ad0000;--smalltext-color: gray;--success-background-color: #d1ffd7;--variable-background-color: #f7f7f7;--header-background-color: #fffdf7}@media (prefers-color-scheme: dark){:root{--background-color: #1a1a1a;--text-color: #f0f0f0;--border-color: #444444;--table-border-color: #444444;--pass-color: #4caf50;--fail-color: #f44336;--smalltext-color: #888888}}[data-theme=dark]{--background-color: #1a1a1a;--text-color: #f0f0f0;--border-color: #444444;--table-border-color: #444444;--pass-color: #4caf50;--fail-color: #f44336;--smalltext-color: #888888;--success-background-color: #216d2b;--variable-background-color: #333;--header-background-color: #333}html{font-size:calc(14px + (18 - 14) * ((100vw - 300px) / (1600 - 300)))}*{box-sizing:border-box}html{font-family:-apple-system,BlinkMacSystemFont,Segoe UI,Helvetica,Arial,sans-serif,"Apple Color Emoji","Segoe UI Emoji",Segoe UI Symbol;font-size:16px;background-color:var(--background-color);color:var(--text-color)}table,.divTable{border:1px solid var(--table-border-color);border-collapse:collapse;width:100%;margin:1rem 0;box-shadow:0 2px 4px #0000001a}.tr{display:flex}tr,.tr{width:fit-content}tr:hover,.tr:hover{background-color:#0000000d}th,.th,td,.td{position:relative;box-shadow:inset 0 0 0 1px var(--border-color);vertical-align:top;padding:1.5rem}th.variable,.th.variable,td.variable,.td.variable{background-color:var(--variable-background-color)}tr.header{background-color:var(--header-background-color)}th,.th{padding:1rem;position:relative;text-align:center;vertical-align:bottom}th .action{cursor:pointer;margin-left:.5rem}tr .cell-actions{display:flex;gap:.5rem;visibility:hidden;position:absolute;bottom:1.25rem;right:0;line-height:0;font-size:1.75rem}tr:hover .cell-actions{visibility:visible}tr .cell-actions .action{cursor:pointer}th .smalltext{visibility:hidden;font-weight:400;font-size:.75rem;color:var(--smalltext-color)}th:hover .smalltext{visibility:visible}th .summary{font-weight:400;font-size:.8rem;padding:.25rem}th .summary.highlight{background-color:var(--success-background-color)}td .status{margin-bottom:.5rem;font-weight:700}td .score{font-weight:400}td .pass{color:var(--pass-color)}td .fail{color:var(--fail-color)}.first-prompt-col{border-left:2px solid #888}.first-prompt-row{border-top:2px solid #888}.resizer{position:absolute;right:0;top:0;height:100%;width:5px;cursor:col-resize;user-select:none;touch-action:none;background:var(--text-color);opacity:.5}.resizer.isResizing{background:var(--text-color);opacity:1}@media (hover: hover){.resizer{opacity:0}*:hover>.resizer{opacity:1}}.logo{display:flex;align-items:center;gap:4px}.logo img{width:30px}.logo span{margin-bottom:6px;color:var(--text-color)}[data-theme=dark] .logo img{filter:invert(1)}nav{display:flex;justify-content:space-between;align-items:center;margin-bottom:1rem;color:var(--text-color)}.dark-mode-toggle{background-color:transparent;border:none;color:var(--text-color);cursor:pointer;font-size:16px;padding:8px;transition:color .3s}.dark-mode-toggle:hover{color:var(--pass-color)}body{background-color:var(--background-color);color:var(--text-color)}