promptfoo 0.17.8 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. package/README.md +2 -0
  2. package/dist/package.json +1 -1
  3. package/dist/src/assertions.d.ts.map +1 -1
  4. package/dist/src/assertions.js +97 -42
  5. package/dist/src/assertions.js.map +1 -1
  6. package/dist/src/evaluator.d.ts.map +1 -1
  7. package/dist/src/evaluator.js +43 -7
  8. package/dist/src/evaluator.js.map +1 -1
  9. package/dist/src/index.d.ts.map +1 -1
  10. package/dist/src/index.js +3 -0
  11. package/dist/src/index.js.map +1 -1
  12. package/dist/src/main.js +9 -0
  13. package/dist/src/main.js.map +1 -1
  14. package/dist/src/providers.d.ts +2 -2
  15. package/dist/src/providers.d.ts.map +1 -1
  16. package/dist/src/providers.js +15 -1
  17. package/dist/src/providers.js.map +1 -1
  18. package/dist/src/table.js +2 -2
  19. package/dist/src/table.js.map +1 -1
  20. package/dist/src/types.d.ts +15 -4
  21. package/dist/src/types.d.ts.map +1 -1
  22. package/dist/src/util.d.ts +3 -2
  23. package/dist/src/util.d.ts.map +1 -1
  24. package/dist/src/util.js +70 -18
  25. package/dist/src/util.js.map +1 -1
  26. package/dist/src/web/client/assets/{index-0c6f887d.js → index-8388d689.js} +1 -1
  27. package/dist/src/web/client/assets/{index-f9b230d1.css → index-d2b6a160.css} +1 -1
  28. package/dist/src/web/client/index.html +2 -2
  29. package/package.json +1 -1
  30. package/src/assertions.ts +102 -49
  31. package/src/evaluator.ts +42 -4
  32. package/src/index.ts +6 -1
  33. package/src/main.ts +14 -0
  34. package/src/providers.ts +22 -4
  35. package/src/table.ts +2 -2
  36. package/src/types.ts +29 -3
  37. package/src/util.ts +82 -17
  38. package/src/web/client/package-lock.json +5726 -0
  39. package/src/web/client/src/ResultsTable.css +11 -1
  40. package/src/web/client/src/ResultsTable.tsx +10 -0
  41. package/src/web/client/src/ResultsView.tsx +7 -1
  42. package/src/web/client/src/types.ts +4 -0
package/src/main.ts CHANGED
@@ -281,6 +281,7 @@ async function main() {
281
281
  prompts: cmdObj.prompts || fileConfig.prompts || defaultConfig.prompts,
282
282
  providers: cmdObj.providers || fileConfig.providers || defaultConfig.providers,
283
283
  tests: cmdObj.tests || cmdObj.vars || fileConfig.tests || defaultConfig.tests,
284
+ scenarios: fileConfig.scenarios || defaultConfig.scenarios,
284
285
  sharing:
285
286
  process.env.PROMPTFOO_DISABLE_SHARING === '1'
286
287
  ? false
@@ -310,6 +311,18 @@ async function main() {
310
311
  config.tests,
311
312
  cmdObj.tests ? undefined : basePath,
312
313
  );
314
+
315
+ // Parse the test cases for each scenario
316
+ if (fileConfig.scenarios) {
317
+ for (const scenario of fileConfig.scenarios) {
318
+ const parsedScenarioTests: TestCase[] = await readTests(
319
+ scenario.tests,
320
+ cmdObj.tests ? undefined : basePath,
321
+ );
322
+ scenario.tests = parsedScenarioTests;
323
+ }
324
+ }
325
+
313
326
  const parsedProviderPromptMap = readProviderPromptMap(config, parsedPrompts);
314
327
 
315
328
  if (parsedPrompts.length === 0) {
@@ -334,6 +347,7 @@ async function main() {
334
347
  providers: parsedProviders,
335
348
  providerPromptMap: parsedProviderPromptMap,
336
349
  tests: parsedTests,
350
+ scenarios: config.scenarios,
337
351
  defaultTest,
338
352
  };
339
353
 
package/src/providers.ts CHANGED
@@ -1,7 +1,5 @@
1
1
  import path from 'path';
2
2
 
3
- import { ApiProvider, ProviderConfig, ProviderId, RawProviderConfig } from './types';
4
-
5
3
  import { OpenAiCompletionProvider, OpenAiChatCompletionProvider } from './providers/openai';
6
4
  import { AnthropicCompletionProvider } from './providers/anthropic';
7
5
  import { ReplicateProvider } from './providers/replicate';
@@ -12,17 +10,37 @@ import {
12
10
  AzureOpenAiCompletionProvider,
13
11
  } from './providers/azureopenai';
14
12
 
13
+ import type {
14
+ ApiProvider,
15
+ ProviderConfig,
16
+ ProviderFunction,
17
+ ProviderId,
18
+ RawProviderConfig,
19
+ } from './types';
20
+
15
21
  export async function loadApiProviders(
16
- providerPaths: ProviderId | ProviderId[] | RawProviderConfig[],
22
+ providerPaths: ProviderId | ProviderId[] | RawProviderConfig[] | ProviderFunction,
17
23
  basePath?: string,
18
24
  ): Promise<ApiProvider[]> {
19
25
  if (typeof providerPaths === 'string') {
20
26
  return [await loadApiProvider(providerPaths, undefined, basePath)];
27
+ } else if (typeof providerPaths === 'function') {
28
+ return [
29
+ {
30
+ id: () => 'custom-function',
31
+ callApi: providerPaths,
32
+ },
33
+ ];
21
34
  } else if (Array.isArray(providerPaths)) {
22
35
  return Promise.all(
23
- providerPaths.map((provider) => {
36
+ providerPaths.map((provider, idx) => {
24
37
  if (typeof provider === 'string') {
25
38
  return loadApiProvider(provider, undefined, basePath);
39
+ } else if (typeof provider === 'function') {
40
+ return {
41
+ id: () => `custom-function-${idx}`,
42
+ callApi: provider,
43
+ };
26
44
  } else {
27
45
  const id = Object.keys(provider)[0];
28
46
  const context = { ...provider[id], id };
package/src/table.ts CHANGED
@@ -24,11 +24,11 @@ export function generateTable(summary: EvaluateSummary, tableCellMaxLength = 250
24
24
  text = text.slice(0, tableCellMaxLength) + '...';
25
25
  }
26
26
  if (pass) {
27
- return chalk.green.bold('[PASS] ') + text;
27
+ return chalk.green('[PASS] ') + text;
28
28
  } else if (!pass) {
29
29
  // color everything red up until '---'
30
30
  return (
31
- chalk.red.bold('[FAIL] ') +
31
+ chalk.red('[FAIL] ') +
32
32
  text
33
33
  .split('---')
34
34
  .map((c, idx) => (idx === 0 ? chalk.red.bold(c) : c))
package/src/types.ts CHANGED
@@ -96,6 +96,7 @@ export interface EvaluateResult {
96
96
  error?: string;
97
97
  success: boolean;
98
98
  score: number;
99
+ latencyMs: number;
99
100
  }
100
101
 
101
102
  export interface EvaluateTableOutput {
@@ -103,6 +104,8 @@ export interface EvaluateTableOutput {
103
104
  score: number;
104
105
  text: string;
105
106
  prompt: string;
107
+ latencyMs: number;
108
+ tokenUsage?: Partial<TokenUsage>;
106
109
  }
107
110
 
108
111
  export interface EvaluateTable {
@@ -148,6 +151,7 @@ type BaseAssertionTypes =
148
151
  | 'is-json'
149
152
  | 'contains-json'
150
153
  | 'javascript'
154
+ | 'python'
151
155
  | 'similar'
152
156
  | 'llm-rubric'
153
157
  | 'webhook'
@@ -165,7 +169,10 @@ export interface Assertion {
165
169
  type: AssertionType;
166
170
 
167
171
  // The expected value, if applicable
168
- value?: string | string[];
172
+ value?:
173
+ | string
174
+ | string[]
175
+ | ((output: string, testCase: AtomicTestCase, assertion: Assertion) => Promise<GradingResult>);
169
176
 
170
177
  // The threshold value, only applicable for similarity (cosine distance)
171
178
  threshold?: number;
@@ -192,6 +199,17 @@ export interface TestCase {
192
199
  options?: PromptConfig & OutputConfig & GradingConfig;
193
200
  }
194
201
 
202
+ export interface Scenario {
203
+ // Optional description of what you're testing
204
+ description?: string;
205
+
206
+ // List of partial test case configs for this scenario
207
+ config: Partial<TestCase>[];
208
+
209
+ // Test cases to evaluate for this scenario (required)
210
+ tests: TestCase[];
211
+ }
212
+
195
213
  // Same as a TestCase, except the `vars` object has been flattened into its final form.
196
214
  export interface AtomicTestCase extends TestCase {
197
215
  vars?: Record<string, string | object>;
@@ -215,12 +233,17 @@ export interface TestSuite {
215
233
  // Test cases
216
234
  tests?: TestCase[];
217
235
 
236
+ // scenarios
237
+ scenarios?: Scenario[];
238
+
218
239
  // Default test case config
219
240
  defaultTest?: Partial<TestCase>;
220
241
  }
221
242
 
222
243
  export type ProviderId = string;
223
244
 
245
+ export type ProviderFunction = (prompt: string) => Promise<ProviderResponse>;
246
+
224
247
  export type RawProviderConfig = Record<ProviderId, Omit<ProviderConfig, 'id'>>;
225
248
 
226
249
  // TestSuiteConfig = Test Suite, but before everything is parsed and resolved. Providers are just strings, prompts are filepaths, tests can be filepath or inline.
@@ -229,13 +252,16 @@ export interface TestSuiteConfig {
229
252
  description?: string;
230
253
 
231
254
  // One or more LLM APIs to use, for example: openai:gpt-3.5-turbo, openai:gpt-4, localai:chat:vicuna
232
- providers: ProviderId | ProviderId[] | RawProviderConfig[];
255
+ providers: ProviderId | ProviderId[] | RawProviderConfig[] | ProviderFunction;
233
256
 
234
257
  // One or more prompt files to load
235
258
  prompts: string | string[];
236
259
 
237
260
  // Path to a test file, OR list of LLM prompt variations (aka "test case")
238
- tests: string | TestCase[];
261
+ tests: string | string[] | TestCase[];
262
+
263
+ // Scenarios, groupings of data and tests to be evaluated
264
+ scenarios?: Scenario[];
239
265
 
240
266
  // Sets the default properties for each test case. Useful for setting an assertion, on all test cases, for example.
241
267
  defaultTest?: Omit<TestCase, 'description'>;
package/src/util.ts CHANGED
@@ -4,6 +4,7 @@ import * as os from 'os';
4
4
 
5
5
  import $RefParser from '@apidevtools/json-schema-ref-parser';
6
6
  import fetch from 'node-fetch';
7
+ import invariant from 'tiny-invariant';
7
8
  import yaml from 'js-yaml';
8
9
  import nunjucks from 'nunjucks';
9
10
  import { globSync } from 'glob';
@@ -44,6 +45,15 @@ export function readProviderPromptMap(
44
45
  allPrompts.push(prompt.display);
45
46
  }
46
47
 
48
+ invariant(
49
+ typeof config.providers !== 'string',
50
+ 'In order to use a provider-prompt map, config.providers should be an array of objects, not a string',
51
+ );
52
+ invariant(
53
+ typeof config.providers !== 'function',
54
+ 'In order to use a provider-prompt map, config.providers should be an array of objects, not a function',
55
+ );
56
+
47
57
  for (const provider of config.providers) {
48
58
  if (typeof provider === 'object') {
49
59
  const rawProvider = provider as RawProviderConfig;
@@ -224,7 +234,31 @@ export async function fetchCsvFromGoogleSheet(url: string): Promise<string> {
224
234
  return csvData;
225
235
  }
226
236
 
227
- export async function readVars(varsPath: string, basePath: string = ''): Promise<CsvRow[]> {
237
+ export async function readVarsFiles(
238
+ pathOrGlobs: string | string[],
239
+ basePath: string = '',
240
+ ): Promise<Record<string, string | string[] | object>> {
241
+ if (typeof pathOrGlobs === 'string') {
242
+ pathOrGlobs = [pathOrGlobs];
243
+ }
244
+
245
+ const ret: Record<string, string | string[] | object> = {};
246
+ for (const pathOrGlob of pathOrGlobs) {
247
+ const resolvedPath = path.resolve(basePath, pathOrGlob);
248
+ const paths = globSync(resolvedPath);
249
+
250
+ for (const p of paths) {
251
+ const yamlData = yaml.load(fs.readFileSync(p, 'utf-8'));
252
+ Object.assign(ret, yamlData);
253
+ }
254
+ }
255
+
256
+ return ret;
257
+ }
258
+
259
+ export async function readTestsFile(varsPath: string, basePath: string = ''): Promise<CsvRow[]> {
260
+ // This function is confusingly named - it reads a CSV, JSON, or YAML file of
261
+ // TESTS or test equivalents.
228
262
  const resolvedVarsPath = path.resolve(basePath, varsPath);
229
263
  const fileExtension = parsePath(varsPath).ext.slice(1);
230
264
  let rows: CsvRow[] = [];
@@ -246,25 +280,53 @@ export async function readVars(varsPath: string, basePath: string = ''): Promise
246
280
  }
247
281
 
248
282
  export async function readTests(
249
- tests: string | TestCase[] | undefined,
283
+ tests: string | string[] | TestCase[] | undefined,
250
284
  basePath: string = '',
251
285
  ): Promise<TestCase[]> {
252
- if (!tests) {
253
- return [];
254
- }
286
+ const ret: TestCase[] = [];
287
+
288
+ const loadTestsFromGlob = async (loadTestsGlob: string) => {
289
+ const resolvedPath = path.resolve(basePath, loadTestsGlob);
290
+ const testFiles = globSync(resolvedPath);
291
+ for (const testFile of testFiles) {
292
+ const testFileContent = yaml.load(fs.readFileSync(testFile, 'utf-8')) as TestCase[];
293
+ for (const testCase of testFileContent) {
294
+ if (typeof testCase.vars === 'string' || Array.isArray(testCase.vars)) {
295
+ const testcaseBasePath = path.dirname(testFile);
296
+ testCase.vars = await readVarsFiles(testCase.vars, testcaseBasePath);
297
+ }
298
+ }
299
+ ret.push(...testFileContent);
300
+ }
301
+ };
255
302
 
256
303
  if (typeof tests === 'string') {
257
- // It's a filepath, load from CSV
258
- const vars = await readVars(tests, basePath);
259
- return vars.map((row, idx) => {
260
- const test = testCaseFromCsvRow(row);
261
- test.description = `Row #${idx + 1}`;
262
- return test;
263
- });
304
+ if (tests.endsWith('yaml') || tests.endsWith('yml')) {
305
+ // Load testcase config from yaml
306
+ await loadTestsFromGlob(tests);
307
+ } else {
308
+ // Legacy load CSV
309
+ const vars = await readTestsFile(tests, basePath);
310
+ return vars.map((row, idx) => {
311
+ const test = testCaseFromCsvRow(row);
312
+ test.description = `Row #${idx + 1}`;
313
+ return test;
314
+ });
315
+ }
316
+ } else if (Array.isArray(tests)) {
317
+ for (const maybeTestsGlob of tests) {
318
+ if (typeof maybeTestsGlob === 'string') {
319
+ // Assume it's a filepath
320
+ await loadTestsFromGlob(maybeTestsGlob);
321
+ } else {
322
+ // Assume it's a full test case
323
+ ret.push(maybeTestsGlob);
324
+ }
325
+ }
264
326
  }
265
327
 
266
328
  // Some validation of the shape of tests
267
- for (const test of tests) {
329
+ for (const test of ret) {
268
330
  if (!test.assert && !test.vars) {
269
331
  throw new Error(
270
332
  `Test case must have either "assert" or "vars" property. Instead got ${JSON.stringify(
@@ -276,7 +338,7 @@ export async function readTests(
276
338
  }
277
339
  }
278
340
 
279
- return tests;
341
+ return ret;
280
342
  }
281
343
 
282
344
  export function writeOutput(
@@ -374,7 +436,10 @@ export function getLatestResultsPath(): string {
374
436
 
375
437
  export function writeLatestResults(results: EvaluateSummary, config: Partial<UnifiedConfig>) {
376
438
  const resultsDirectory = path.join(getConfigDirectoryPath(), 'output');
377
- const timestamp = new Date().toISOString();
439
+
440
+ // Replace colons with hyphens (colons are not allowed in Windows filenames).
441
+ const timestamp = new Date().toISOString().replace(/:/g, '-');
442
+
378
443
  const newResultsPath = path.join(resultsDirectory, `eval-${timestamp}.json`);
379
444
  const latestResultsPath = getLatestResultsPath();
380
445
  try {
@@ -391,7 +456,7 @@ export function writeLatestResults(results: EvaluateSummary, config: Partial<Uni
391
456
  2,
392
457
  ),
393
458
  );
394
- if (fs.existsSync(latestResultsPath)) {
459
+ if (fs.existsSync(latestResultsPath) || fs.lstatSync(latestResultsPath).isSymbolicLink()) {
395
460
  fs.unlinkSync(latestResultsPath);
396
461
  }
397
462
  fs.symlinkSync(newResultsPath, latestResultsPath);
@@ -408,7 +473,7 @@ export function listPreviousResults(): string[] {
408
473
  const sortedFiles = resultsFiles.sort((a, b) => {
409
474
  const statA = fs.statSync(path.join(directory, a));
410
475
  const statB = fs.statSync(path.join(directory, b));
411
- return statB.birthtime.getTime() - statA.birthtime.getTime(); // sort in descending order
476
+ return statA.birthtime.getTime() - statB.birthtime.getTime(); // sort in ascending order
412
477
  });
413
478
  return sortedFiles;
414
479
  }