promptfoo 0.17.9 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/index.ts CHANGED
@@ -3,7 +3,7 @@ import providers from './providers';
3
3
  import telemetry from './telemetry';
4
4
  import { evaluate as doEvaluate } from './evaluator';
5
5
  import { loadApiProviders } from './providers';
6
- import { readTests } from './util';
6
+ import { readTests, writeOutput } from './util';
7
7
  import type { EvaluateOptions, TestSuite, TestSuiteConfig } from './types';
8
8
 
9
9
  export * from './types';
@@ -28,6 +28,11 @@ async function evaluate(testSuite: EvaluateTestSuite, options: EvaluateOptions =
28
28
  };
29
29
  telemetry.maybeShowNotice();
30
30
  const ret = await doEvaluate(constructedTestSuite, options);
31
+
32
+ if (testSuite.outputPath) {
33
+ writeOutput(testSuite.outputPath, ret, testSuite, null);
34
+ }
35
+
31
36
  await telemetry.send();
32
37
  return ret;
33
38
  }
package/src/main.ts CHANGED
@@ -281,6 +281,7 @@ async function main() {
281
281
  prompts: cmdObj.prompts || fileConfig.prompts || defaultConfig.prompts,
282
282
  providers: cmdObj.providers || fileConfig.providers || defaultConfig.providers,
283
283
  tests: cmdObj.tests || cmdObj.vars || fileConfig.tests || defaultConfig.tests,
284
+ scenarios: fileConfig.scenarios || defaultConfig.scenarios,
284
285
  sharing:
285
286
  process.env.PROMPTFOO_DISABLE_SHARING === '1'
286
287
  ? false
@@ -310,6 +311,18 @@ async function main() {
310
311
  config.tests,
311
312
  cmdObj.tests ? undefined : basePath,
312
313
  );
314
+
315
+ //parse testCases for each scenario
316
+ if (fileConfig.scenarios) {
317
+ for (const scenario of fileConfig.scenarios) {
318
+ const parsedScenarioTests: TestCase[] = await readTests(
319
+ scenario.tests,
320
+ cmdObj.tests ? undefined : basePath,
321
+ );
322
+ scenario.tests = parsedScenarioTests;
323
+ }
324
+ }
325
+
313
326
  const parsedProviderPromptMap = readProviderPromptMap(config, parsedPrompts);
314
327
 
315
328
  if (parsedPrompts.length === 0) {
@@ -334,6 +347,7 @@ async function main() {
334
347
  providers: parsedProviders,
335
348
  providerPromptMap: parsedProviderPromptMap,
336
349
  tests: parsedTests,
350
+ scenarios: config.scenarios,
337
351
  defaultTest,
338
352
  };
339
353
 
package/src/providers.ts CHANGED
@@ -1,7 +1,5 @@
1
1
  import path from 'path';
2
2
 
3
- import { ApiProvider, ProviderConfig, ProviderId, RawProviderConfig } from './types';
4
-
5
3
  import { OpenAiCompletionProvider, OpenAiChatCompletionProvider } from './providers/openai';
6
4
  import { AnthropicCompletionProvider } from './providers/anthropic';
7
5
  import { ReplicateProvider } from './providers/replicate';
@@ -12,17 +10,37 @@ import {
12
10
  AzureOpenAiCompletionProvider,
13
11
  } from './providers/azureopenai';
14
12
 
13
+ import type {
14
+ ApiProvider,
15
+ ProviderConfig,
16
+ ProviderFunction,
17
+ ProviderId,
18
+ RawProviderConfig,
19
+ } from './types';
20
+
15
21
  export async function loadApiProviders(
16
- providerPaths: ProviderId | ProviderId[] | RawProviderConfig[],
22
+ providerPaths: ProviderId | ProviderId[] | RawProviderConfig[] | ProviderFunction,
17
23
  basePath?: string,
18
24
  ): Promise<ApiProvider[]> {
19
25
  if (typeof providerPaths === 'string') {
20
26
  return [await loadApiProvider(providerPaths, undefined, basePath)];
27
+ } else if (typeof providerPaths === 'function') {
28
+ return [
29
+ {
30
+ id: () => 'custom-function',
31
+ callApi: providerPaths,
32
+ },
33
+ ];
21
34
  } else if (Array.isArray(providerPaths)) {
22
35
  return Promise.all(
23
- providerPaths.map((provider) => {
36
+ providerPaths.map((provider, idx) => {
24
37
  if (typeof provider === 'string') {
25
38
  return loadApiProvider(provider, undefined, basePath);
39
+ } else if (typeof provider === 'function') {
40
+ return {
41
+ id: () => `custom-function-${idx}`,
42
+ callApi: provider,
43
+ };
26
44
  } else {
27
45
  const id = Object.keys(provider)[0];
28
46
  const context = { ...provider[id], id };
package/src/table.ts CHANGED
@@ -24,11 +24,11 @@ export function generateTable(summary: EvaluateSummary, tableCellMaxLength = 250
24
24
  text = text.slice(0, tableCellMaxLength) + '...';
25
25
  }
26
26
  if (pass) {
27
- return chalk.green.bold('[PASS] ') + text;
27
+ return chalk.green('[PASS] ') + text;
28
28
  } else if (!pass) {
29
29
  // color everything red up until '---'
30
30
  return (
31
- chalk.red.bold('[FAIL] ') +
31
+ chalk.red('[FAIL] ') +
32
32
  text
33
33
  .split('---')
34
34
  .map((c, idx) => (idx === 0 ? chalk.red.bold(c) : c))
package/src/types.ts CHANGED
@@ -151,6 +151,7 @@ type BaseAssertionTypes =
151
151
  | 'is-json'
152
152
  | 'contains-json'
153
153
  | 'javascript'
154
+ | 'python'
154
155
  | 'similar'
155
156
  | 'llm-rubric'
156
157
  | 'webhook'
@@ -168,7 +169,10 @@ export interface Assertion {
168
169
  type: AssertionType;
169
170
 
170
171
  // The expected value, if applicable
171
- value?: string | string[];
172
+ value?:
173
+ | string
174
+ | string[]
175
+ | ((output: string, testCase: AtomicTestCase, assertion: Assertion) => Promise<GradingResult>);
172
176
 
173
177
  // The threshold value, only applicable for similarity (cosine distance)
174
178
  threshold?: number;
@@ -188,9 +192,6 @@ export interface TestCase {
188
192
  // Key-value pairs to substitute in the prompt
189
193
  vars?: Record<string, string | string[] | object>;
190
194
 
191
- // Optional filepath or glob pattern to load vars from
192
- loadVars?: string | string[];
193
-
194
195
  // Optional list of automatic checks to run on the LLM output
195
196
  assert?: Assertion[];
196
197
 
@@ -198,6 +199,17 @@ export interface TestCase {
198
199
  options?: PromptConfig & OutputConfig & GradingConfig;
199
200
  }
200
201
 
202
+ export interface Scenario { // A grouping of data (config) and tests that are evaluated together
203
+ // Optional description of what you're testing
204
+ description?: string;
205
+
206
+ // Required list of partial test case configs belonging to this scenario
207
+ config: Partial<TestCase>[];
208
+
209
+ // Required list of test cases belonging to this scenario (main.ts resolves them via readTests)
210
+ tests: TestCase[];
211
+ }
212
+
201
213
  // Same as a TestCase, except the `vars` object has been flattened into its final form.
202
214
  export interface AtomicTestCase extends TestCase {
203
215
  vars?: Record<string, string | object>;
@@ -221,12 +233,17 @@ export interface TestSuite {
221
233
  // Test cases
222
234
  tests?: TestCase[];
223
235
 
236
+ // scenarios
237
+ scenarios?: Scenario[];
238
+
224
239
  // Default test case config
225
240
  defaultTest?: Partial<TestCase>;
226
241
  }
227
242
 
228
243
  export type ProviderId = string;
229
244
 
245
+ export type ProviderFunction = (prompt: string) => Promise<ProviderResponse>; // Custom provider given as a plain Promise-returning function; loadApiProviders wraps it as an ApiProvider's callApi
246
+
230
247
  export type RawProviderConfig = Record<ProviderId, Omit<ProviderConfig, 'id'>>;
231
248
 
232
249
  // TestSuiteConfig = Test Suite, but before everything is parsed and resolved. Providers are just strings, prompts are filepaths, tests can be filepath or inline.
@@ -235,7 +252,7 @@ export interface TestSuiteConfig {
235
252
  description?: string;
236
253
 
237
254
  // One or more LLM APIs to use, for example: openai:gpt-3.5-turbo, openai:gpt-4, localai:chat:vicuna
238
- providers: ProviderId | ProviderId[] | RawProviderConfig[];
255
+ providers: ProviderId | ProviderId[] | RawProviderConfig[] | ProviderFunction;
239
256
 
240
257
  // One or more prompt files to load
241
258
  prompts: string | string[];
@@ -243,6 +260,9 @@ export interface TestSuiteConfig {
243
260
  // Path to a test file, OR list of LLM prompt variations (aka "test case")
244
261
  tests: string | string[] | TestCase[];
245
262
 
263
+ // Scenarios, groupings of data and tests to be evaluated
264
+ scenarios?: Scenario[];
265
+
246
266
  // Sets the default properties for each test case. Useful for setting an assertion, on all test cases, for example.
247
267
  defaultTest?: Omit<TestCase, 'description'>;
248
268
 
package/src/util.ts CHANGED
@@ -4,6 +4,7 @@ import * as os from 'os';
4
4
 
5
5
  import $RefParser from '@apidevtools/json-schema-ref-parser';
6
6
  import fetch from 'node-fetch';
7
+ import invariant from 'tiny-invariant';
7
8
  import yaml from 'js-yaml';
8
9
  import nunjucks from 'nunjucks';
9
10
  import { globSync } from 'glob';
@@ -44,6 +45,15 @@ export function readProviderPromptMap(
44
45
  allPrompts.push(prompt.display);
45
46
  }
46
47
 
48
+ invariant(
49
+ typeof config.providers !== 'string',
50
+ 'In order to use a provider-prompt map, config.providers should be an array of objects, not a string',
51
+ );
52
+ invariant(
53
+ typeof config.providers !== 'function',
54
+ 'In order to use a provider-prompt map, config.providers should be an array of objects, not a function',
55
+ );
56
+
47
57
  for (const provider of config.providers) {
48
58
  if (typeof provider === 'object') {
49
59
  const rawProvider = provider as RawProviderConfig;
@@ -446,7 +456,7 @@ export function writeLatestResults(results: EvaluateSummary, config: Partial<Uni
446
456
  2,
447
457
  ),
448
458
  );
449
- if (fs.existsSync(latestResultsPath)) {
459
+ if (fs.existsSync(latestResultsPath) || fs.lstatSync(latestResultsPath).isSymbolicLink()) {
450
460
  fs.unlinkSync(latestResultsPath);
451
461
  }
452
462
  fs.symlinkSync(newResultsPath, latestResultsPath);
@@ -463,7 +473,7 @@ export function listPreviousResults(): string[] {
463
473
  const sortedFiles = resultsFiles.sort((a, b) => {
464
474
  const statA = fs.statSync(path.join(directory, a));
465
475
  const statB = fs.statSync(path.join(directory, b));
466
- return statB.birthtime.getTime() - statA.birthtime.getTime(); // sort in descending order
476
+ return statA.birthtime.getTime() - statB.birthtime.getTime(); // sort in ascending order
467
477
  });
468
478
  return sortedFiles;
469
479
  }