promptfoo 0.17.7 → 0.17.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. package/README.md +2 -0
  2. package/dist/package.json +1 -1
  3. package/dist/src/evaluator.d.ts.map +1 -1
  4. package/dist/src/evaluator.js +31 -6
  5. package/dist/src/evaluator.js.map +1 -1
  6. package/dist/src/main.js +2 -0
  7. package/dist/src/main.js.map +1 -1
  8. package/dist/src/providers/azureopenai.d.ts +4 -0
  9. package/dist/src/providers/azureopenai.d.ts.map +1 -1
  10. package/dist/src/providers/azureopenai.js +15 -0
  11. package/dist/src/providers/azureopenai.js.map +1 -1
  12. package/dist/src/providers/openai.d.ts +4 -0
  13. package/dist/src/providers/openai.d.ts.map +1 -1
  14. package/dist/src/providers/openai.js +21 -2
  15. package/dist/src/providers/openai.js.map +1 -1
  16. package/dist/src/providers/replicate.d.ts.map +1 -1
  17. package/dist/src/providers/replicate.js +2 -1
  18. package/dist/src/providers/replicate.js.map +1 -1
  19. package/dist/src/providers/shared.d.ts.map +1 -1
  20. package/dist/src/providers/shared.js.map +1 -1
  21. package/dist/src/types.d.ts +9 -2
  22. package/dist/src/types.d.ts.map +1 -1
  23. package/dist/src/util.d.ts +10 -3
  24. package/dist/src/util.d.ts.map +1 -1
  25. package/dist/src/util.js +125 -40
  26. package/dist/src/util.js.map +1 -1
  27. package/dist/src/web/client/assets/{index-13198388.js → index-8388d689.js} +25 -25
  28. package/dist/src/web/client/assets/{index-f9b230d1.css → index-d2b6a160.css} +1 -1
  29. package/dist/src/web/client/index.html +2 -2
  30. package/dist/src/web/server.d.ts.map +1 -1
  31. package/dist/src/web/server.js +26 -3
  32. package/dist/src/web/server.js.map +1 -1
  33. package/package.json +1 -1
  34. package/src/evaluator.ts +37 -6
  35. package/src/main.ts +3 -0
  36. package/src/providers/azureopenai.ts +24 -0
  37. package/src/providers/openai.ts +32 -3
  38. package/src/providers/replicate.ts +7 -3
  39. package/src/providers/shared.ts +3 -1
  40. package/src/types.ts +12 -2
  41. package/src/util.ts +140 -42
  42. package/src/web/client/src/App.tsx +24 -1
  43. package/src/web/client/src/ResultsTable.css +11 -1
  44. package/src/web/client/src/ResultsTable.tsx +10 -0
  45. package/src/web/client/src/ResultsView.tsx +48 -3
  46. package/src/web/client/src/types.ts +4 -0
  47. package/src/web/server.ts +33 -10
package/src/util.ts CHANGED
@@ -29,7 +29,10 @@ import type {
29
29
  TestSuite,
30
30
  } from './types';
31
31
 
32
- export function readProviderPromptMap(config: Partial<UnifiedConfig>, parsedPrompts: Prompt[]): TestSuite["providerPromptMap"] {
32
+ export function readProviderPromptMap(
33
+ config: Partial<UnifiedConfig>,
34
+ parsedPrompts: Prompt[],
35
+ ): TestSuite['providerPromptMap'] {
33
36
  const ret: Record<string, string[]> = {};
34
37
 
35
38
  if (!config.providers) {
@@ -221,7 +224,31 @@ export async function fetchCsvFromGoogleSheet(url: string): Promise<string> {
221
224
  return csvData;
222
225
  }
223
226
 
224
- export async function readVars(varsPath: string, basePath: string = ''): Promise<CsvRow[]> {
227
+ export async function readVarsFiles(
228
+ pathOrGlobs: string | string[],
229
+ basePath: string = '',
230
+ ): Promise<Record<string, string | string[] | object>> {
231
+ if (typeof pathOrGlobs === 'string') {
232
+ pathOrGlobs = [pathOrGlobs];
233
+ }
234
+
235
+ const ret: Record<string, string | string[] | object> = {};
236
+ for (const pathOrGlob of pathOrGlobs) {
237
+ const resolvedPath = path.resolve(basePath, pathOrGlob);
238
+ const paths = globSync(resolvedPath);
239
+
240
+ for (const p of paths) {
241
+ const yamlData = yaml.load(fs.readFileSync(p, 'utf-8'));
242
+ Object.assign(ret, yamlData);
243
+ }
244
+ }
245
+
246
+ return ret;
247
+ }
248
+
249
+ export async function readTestsFile(varsPath: string, basePath: string = ''): Promise<CsvRow[]> {
250
+ // This function is confusingly named - it reads a CSV, JSON, or YAML file of
251
+ // TESTS or test equivalents.
225
252
  const resolvedVarsPath = path.resolve(basePath, varsPath);
226
253
  const fileExtension = parsePath(varsPath).ext.slice(1);
227
254
  let rows: CsvRow[] = [];
@@ -243,25 +270,53 @@ export async function readVars(varsPath: string, basePath: string = ''): Promise
243
270
  }
244
271
 
245
272
  export async function readTests(
246
- tests: string | TestCase[] | undefined,
273
+ tests: string | string[] | TestCase[] | undefined,
247
274
  basePath: string = '',
248
275
  ): Promise<TestCase[]> {
249
- if (!tests) {
250
- return [];
251
- }
276
+ const ret: TestCase[] = [];
277
+
278
+ const loadTestsFromGlob = async (loadTestsGlob: string) => {
279
+ const resolvedPath = path.resolve(basePath, loadTestsGlob);
280
+ const testFiles = globSync(resolvedPath);
281
+ for (const testFile of testFiles) {
282
+ const testFileContent = yaml.load(fs.readFileSync(testFile, 'utf-8')) as TestCase[];
283
+ for (const testCase of testFileContent) {
284
+ if (typeof testCase.vars === 'string' || Array.isArray(testCase.vars)) {
285
+ const testcaseBasePath = path.dirname(testFile);
286
+ testCase.vars = await readVarsFiles(testCase.vars, testcaseBasePath);
287
+ }
288
+ }
289
+ ret.push(...testFileContent);
290
+ }
291
+ };
252
292
 
253
293
  if (typeof tests === 'string') {
254
- // It's a filepath, load from CSV
255
- const vars = await readVars(tests, basePath);
256
- return vars.map((row, idx) => {
257
- const test = testCaseFromCsvRow(row);
258
- test.description = `Row #${idx + 1}`;
259
- return test;
260
- });
294
+ if (tests.endsWith('yaml') || tests.endsWith('yml')) {
295
+ // Load testcase config from yaml
296
+ await loadTestsFromGlob(tests);
297
+ } else {
298
+ // Legacy load CSV
299
+ const vars = await readTestsFile(tests, basePath);
300
+ return vars.map((row, idx) => {
301
+ const test = testCaseFromCsvRow(row);
302
+ test.description = `Row #${idx + 1}`;
303
+ return test;
304
+ });
305
+ }
306
+ } else if (Array.isArray(tests)) {
307
+ for (const maybeTestsGlob of tests) {
308
+ if (typeof maybeTestsGlob === 'string') {
309
+ // Assume it's a filepath
310
+ await loadTestsFromGlob(maybeTestsGlob);
311
+ } else {
312
+ // Assume it's a full test case
313
+ ret.push(maybeTestsGlob);
314
+ }
315
+ }
261
316
  }
262
317
 
263
318
  // Some validation of the shape of tests
264
- for (const test of tests) {
319
+ for (const test of ret) {
265
320
  if (!test.assert && !test.vars) {
266
321
  throw new Error(
267
322
  `Test case must have either "assert" or "vars" property. Instead got ${JSON.stringify(
@@ -273,7 +328,7 @@ export async function readTests(
273
328
  }
274
329
  }
275
330
 
276
- return tests;
331
+ return ret;
277
332
  }
278
333
 
279
334
  export function writeOutput(
@@ -313,28 +368,31 @@ export function writeOutput(
313
368
  }
314
369
  }
315
370
 
316
- export async function fetchWithTimeout(
371
+ export function fetchWithTimeout(
317
372
  url: RequestInfo,
318
373
  options: RequestInit = {},
319
374
  timeout: number,
320
375
  ): Promise<Response> {
321
- const controller = new AbortController();
322
- const { signal } = controller;
323
- options.signal = signal;
324
-
325
- const timeoutId = setTimeout(() => {
326
- controller.abort();
327
- throw new Error(`Request timed out after ${timeout} ms`);
328
- }, timeout);
329
-
330
- try {
331
- const response = await fetch(url, options);
332
- clearTimeout(timeoutId);
333
- return response;
334
- } catch (error) {
335
- clearTimeout(timeoutId);
336
- throw error;
337
- }
376
+ return new Promise((resolve, reject) => {
377
+ const controller = new AbortController();
378
+ const { signal } = controller;
379
+ options.signal = signal;
380
+
381
+ const timeoutId = setTimeout(() => {
382
+ controller.abort();
383
+ reject(new Error(`Request timed out after ${timeout} ms`));
384
+ }, timeout);
385
+
386
+ fetch(url, options)
387
+ .then((response) => {
388
+ clearTimeout(timeoutId);
389
+ resolve(response);
390
+ })
391
+ .catch((error) => {
392
+ clearTimeout(timeoutId);
393
+ reject(error);
394
+ });
395
+ });
338
396
  }
339
397
 
340
398
  export async function fetchWithRetries(
@@ -356,6 +414,8 @@ export async function fetchWithRetries(
356
414
  throw new Error(`Request failed after ${retries} retries: ${(lastError as Error).message}`);
357
415
  }
358
416
 
417
+ const RESULT_HISTORY_LENGTH = 50;
418
+
359
419
  export function getConfigDirectoryPath(): string {
360
420
  return path.join(os.homedir(), '.promptfoo');
361
421
  }
@@ -365,11 +425,17 @@ export function getLatestResultsPath(): string {
365
425
  }
366
426
 
367
427
  export function writeLatestResults(results: EvaluateSummary, config: Partial<UnifiedConfig>) {
428
+ const resultsDirectory = path.join(getConfigDirectoryPath(), 'output');
429
+
430
+ // Replace colons with hyphens (Windows compatibility).
431
+ const timestamp = new Date().toISOString().replace(/:/g, '-');
432
+
433
+ const newResultsPath = path.join(resultsDirectory, `eval-${timestamp}.json`);
368
434
  const latestResultsPath = getLatestResultsPath();
369
435
  try {
370
- fs.mkdirSync(path.dirname(latestResultsPath), { recursive: true });
436
+ fs.mkdirSync(resultsDirectory, { recursive: true });
371
437
  fs.writeFileSync(
372
- latestResultsPath,
438
+ newResultsPath,
373
439
  JSON.stringify(
374
440
  {
375
441
  version: 1,
@@ -380,8 +446,45 @@ export function writeLatestResults(results: EvaluateSummary, config: Partial<Uni
380
446
  2,
381
447
  ),
382
448
  );
449
+ if (fs.existsSync(latestResultsPath)) {
450
+ fs.unlinkSync(latestResultsPath);
451
+ }
452
+ fs.symlinkSync(newResultsPath, latestResultsPath);
453
+ cleanupOldResults();
383
454
  } catch (err) {
384
- logger.error(`Failed to write latest results to ${latestResultsPath}:\n${err}`);
455
+ logger.error(`Failed to write latest results to ${newResultsPath}:\n${err}`);
456
+ }
457
+ }
458
+
459
+ export function listPreviousResults(): string[] {
460
+ const directory = path.join(getConfigDirectoryPath(), 'output');
461
+ const files = fs.readdirSync(directory);
462
+ const resultsFiles = files.filter((file) => file.startsWith('eval-') && file.endsWith('.json'));
463
+ const sortedFiles = resultsFiles.sort((a, b) => {
464
+ const statA = fs.statSync(path.join(directory, a));
465
+ const statB = fs.statSync(path.join(directory, b));
466
+ return statB.birthtime.getTime() - statA.birthtime.getTime(); // sort in descending order
467
+ });
468
+ return sortedFiles;
469
+ }
470
+
471
+ export function cleanupOldResults(remaining = RESULT_HISTORY_LENGTH) {
472
+ const sortedFiles = listPreviousResults();
473
+ for (let i = 0; i < sortedFiles.length - remaining; i++) {
474
+ fs.unlinkSync(path.join(getConfigDirectoryPath(), 'output', sortedFiles[i]));
475
+ }
476
+ }
477
+
478
+ export function readResult(
479
+ name: string,
480
+ ): { results: EvaluateSummary; config: Partial<UnifiedConfig> } | undefined {
481
+ const resultsDirectory = path.join(getConfigDirectoryPath(), 'output');
482
+ const resultsPath = path.join(resultsDirectory, name);
483
+ try {
484
+ const results = JSON.parse(fs.readFileSync(fs.realpathSync(resultsPath), 'utf-8'));
485
+ return results;
486
+ } catch (err) {
487
+ logger.error(`Failed to read results from ${resultsPath}:\n${err}`);
385
488
  }
386
489
  }
387
490
 
@@ -389,12 +492,7 @@ export function readLatestResults():
389
492
  | { results: EvaluateSummary; config: Partial<UnifiedConfig> }
390
493
  | undefined {
391
494
  const latestResultsPath = getLatestResultsPath();
392
- try {
393
- const latestResults = JSON.parse(fs.readFileSync(latestResultsPath, 'utf-8'));
394
- return latestResults;
395
- } catch (err) {
396
- logger.error(`Failed to read latest results from ${latestResultsPath}:\n${err}`);
397
- }
495
+ return readResult(latestResultsPath);
398
496
  }
399
497
 
400
498
  export function cosineSimilarity(vecA: number[], vecB: number[]) {
package/src/web/client/src/App.tsx CHANGED
@@ -14,6 +14,7 @@ function App() {
14
14
  const { table, setTable, setConfig } = useStore();
15
15
  const [loaded, setLoaded] = React.useState<boolean>(false);
16
16
  const loadedFromApi = React.useRef(false);
17
+ const [recentFiles, setRecentFiles] = React.useState<string[]>([]);
17
18
 
18
19
  const prefersDarkMode = useMediaQuery('(prefers-color-scheme: dark)');
19
20
  const [darkMode, setDarkMode] = React.useState(prefersDarkMode);
@@ -43,6 +44,22 @@ function App() {
43
44
  }
44
45
  }, [prefersDarkMode]);
45
46
 
47
+ const fetchRecentFiles = async () => {
48
+ if (!window.location.href.includes('localhost')) {
49
+ return;
50
+ }
51
+ const resp = await fetch(`http://localhost:15500/results`);
52
+ const body = await resp.json();
53
+ setRecentFiles(body.data);
54
+ };
55
+
56
+ const handleRecentFileSelection = async (file: string) => {
57
+ const resp = await fetch(`http://localhost:15500/results/${file}`);
58
+ const body = await resp.json();
59
+ setTable(body.data.results.table);
60
+ setConfig(body.data.config);
61
+ };
62
+
46
63
  React.useEffect(() => {
47
64
  const fetchEvalData = async (id: string) => {
48
65
  if (loadedFromApi.current) {
@@ -72,12 +89,14 @@ function App() {
72
89
  setLoaded(true);
73
90
  setTable(data.results.table);
74
91
  setConfig(data.config);
92
+ fetchRecentFiles();
75
93
  });
76
94
 
77
95
  socket.on('update', (data) => {
78
96
  console.log('Received data update', data);
79
97
  setTable(data.results.table);
80
98
  setConfig(data.config);
99
+ fetchRecentFiles();
81
100
  });
82
101
  }
83
102
 
@@ -89,7 +108,11 @@ function App() {
89
108
  return (
90
109
  <ThemeProvider theme={theme}>
91
110
  <NavBar darkMode={darkMode} onToggleDarkMode={toggleDarkMode} />
92
- {loaded && table ? <ResultsView /> : <div>Loading...</div>}
111
+ {loaded && table ? (
112
+ <ResultsView recentFiles={recentFiles} onRecentFileSelected={handleRecentFileSelection} />
113
+ ) : (
114
+ <div>Loading...</div>
115
+ )}
93
116
  </ThemeProvider>
94
117
  );
95
118
  }
package/src/web/client/src/ResultsTable.css CHANGED
@@ -83,7 +83,17 @@ tr .cell-actions {
83
83
  font-size: 1.75rem;
84
84
  }
85
85
 
86
- tr:hover .cell-actions {
86
+ tr .cell-detail {
87
+ visibility: hidden;
88
+ position: absolute;
89
+ bottom: 0.25rem;
90
+ margin-top: 1rem;
91
+ font-size: 0.75rem;
92
+ color: #888;
93
+ }
94
+
95
+ tr:hover .cell-actions,
96
+ tr:hover .cell-detail {
87
97
  visibility: visible;
88
98
  }
89
99
 
package/src/web/client/src/ResultsTable.tsx CHANGED
@@ -135,6 +135,16 @@ function EvalOutputCell({
135
135
  )}{' '}
136
136
  <TruncatedText text={text} maxLength={maxTextLength} />
137
137
  </div>
138
+ <div className="cell-detail">
139
+ {output.tokenUsage?.cached ? (
140
+ <span>{output.tokenUsage.cached} tokens (cached)</span>
141
+ ) : (
142
+ <>
143
+ {output.tokenUsage?.total && <span>{output.tokenUsage.total} tokens</span>} |{' '}
144
+ <span>{output.latencyMs} ms</span>
145
+ </>
146
+ )}
147
+ </div>
138
148
  <div className="cell-actions">
139
149
  {output.prompt && (
140
150
  <>
package/src/web/client/src/ResultsView.tsx CHANGED
@@ -37,7 +37,32 @@ const ResponsiveStack = styled(Stack)(({ theme }) => ({
37
37
  },
38
38
  }));
39
39
 
40
- export default function ResultsView() {
40
+ function filenameToDate(filename: string) {
41
+ const dateString = filename.slice('eval-'.length, filename.length - '.json'.length);
42
+
43
+ // Replace hyphens with colons where necessary (Windows compatibility).
44
+ const dateParts = dateString.split('T');
45
+ const timePart = dateParts[1].replace(/-/g, ':');
46
+ const formattedDateString = `${dateParts[0]}T${timePart}`;
47
+
48
+ const date = new Date(formattedDateString);
49
+ return date.toLocaleDateString('en-US', {
50
+ year: 'numeric',
51
+ month: 'long',
52
+ day: 'numeric',
53
+ hour: '2-digit',
54
+ minute: '2-digit',
55
+ second: '2-digit',
56
+ timeZoneName: 'short',
57
+ });
58
+ }
59
+
60
+ interface ResultsViewProps {
61
+ recentFiles: string[];
62
+ onRecentFileSelected: (file: string) => void;
63
+ }
64
+
65
+ export default function ResultsView({ recentFiles, onRecentFileSelected }: ResultsViewProps) {
41
66
  const { table, config } = useStore();
42
67
  const [maxTextLength, setMaxTextLength] = React.useState(250);
43
68
  const [columnVisibility, setColumnVisibility] = React.useState<VisibilityState>({});
@@ -148,10 +173,30 @@ export default function ResultsView() {
148
173
  return (
149
174
  <div>
150
175
  <Paper py="md">
151
- <ResponsiveStack direction="row" spacing={8} alignItems="center">
176
+ <ResponsiveStack direction="row" spacing={4} alignItems="center">
177
+ <Box>
178
+ {recentFiles && recentFiles.length > 0 && (
179
+ <FormControl sx={{ m: 1, minWidth: 200 }} size="small">
180
+ <InputLabel>View run</InputLabel>
181
+ <Select
182
+ key={recentFiles.join(',')}
183
+ className="recent-files"
184
+ label="Previous runs"
185
+ defaultValue={recentFiles[0]}
186
+ onChange={(e: SelectChangeEvent) => onRecentFileSelected(e.target.value)}
187
+ >
188
+ {recentFiles.map((file) => (
189
+ <MenuItem key={file} value={file}>
190
+ {filenameToDate(file)}
191
+ </MenuItem>
192
+ ))}
193
+ </Select>
194
+ </FormControl>
195
+ )}
196
+ </Box>
152
197
  <Box>
153
198
  <FormControl sx={{ m: 1, minWidth: 200 }} size="small">
154
- <InputLabel id="visible-columns-label">Visible columns</InputLabel>
199
+ <InputLabel id="visible-columns-label">Show columns</InputLabel>
155
200
  <Select
156
201
  labelId="visible-columns-label"
157
202
  id="visible-columns"
package/src/web/client/src/types.ts CHANGED
@@ -1,3 +1,5 @@
1
+ import { TokenUsage } from '../../../types';
2
+
1
3
  type Prompt = {
2
4
  display: string;
3
5
  raw: string;
@@ -13,6 +15,8 @@ export type EvalRowOutput = {
13
15
  score: number;
14
16
  text: string | object;
15
17
  prompt: string;
18
+ latencyMs: number;
19
+ tokenUsage?: Partial<TokenUsage>;
16
20
  };
17
21
 
18
22
  export type EvalRow = {
package/src/web/server.ts CHANGED
@@ -1,4 +1,4 @@
1
- import fs from 'fs';
1
+ import fs, { Stats } from 'fs';
2
2
  import path from 'node:path';
3
3
  import readline from 'node:readline';
4
4
  import http from 'node:http';
@@ -11,7 +11,7 @@ import { Server as SocketIOServer } from 'socket.io';
11
11
 
12
12
  import logger from '../logger';
13
13
  import { getDirectory } from '../esm';
14
- import { getLatestResultsPath } from '../util';
14
+ import { getLatestResultsPath, listPreviousResults, readResult } from '../util';
15
15
 
16
16
  export function init(port = 15500) {
17
17
  const app = express();
@@ -40,14 +40,37 @@ export function init(port = 15500) {
40
40
  socket.emit('init', readLatestJson());
41
41
 
42
42
  // Watch for changes to latest.json and emit the update event
43
- fs.watch(
44
- latestJsonPath,
45
- debounce((event: string) => {
46
- if (event === 'change') {
47
- socket.emit('update', readLatestJson());
48
- }
49
- }, 250),
50
- );
43
+ const watcher = debounce((curr: Stats, prev: Stats) => {
44
+ if (curr.mtime !== prev.mtime) {
45
+ socket.emit('update', readLatestJson());
46
+ }
47
+ }, 250);
48
+ fs.watchFile(latestJsonPath, watcher);
49
+
50
+ // Stop watching the file when the socket connection is closed
51
+ socket.on('disconnect', () => {
52
+ fs.unwatchFile(latestJsonPath, watcher);
53
+ });
54
+ });
55
+
56
+ app.get('/results', (req, res) => {
57
+ const previousResults = listPreviousResults();
58
+ res.json({ data: previousResults });
59
+ });
60
+
61
+ app.get('/results/:filename', (req, res) => {
62
+ const filename = req.params.filename;
63
+ const safeFilename = path.basename(filename);
64
+ if (safeFilename !== filename || !listPreviousResults().includes(safeFilename)) {
65
+ res.status(400).send('Invalid filename');
66
+ return;
67
+ }
68
+ const result = readResult(safeFilename);
69
+ if (!result) {
70
+ res.status(404).send('Result not found');
71
+ return;
72
+ }
73
+ res.json({ data: result });
51
74
  });
52
75
 
53
76
  httpServer.listen(port, () => {