promptfoo 0.7.0 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. package/README.md +123 -73
  2. package/dist/assertions.d.ts +4 -10
  3. package/dist/assertions.d.ts.map +1 -1
  4. package/dist/assertions.js +126 -20
  5. package/dist/assertions.js.map +1 -1
  6. package/dist/cache.d.ts.map +1 -1
  7. package/dist/cache.js.map +1 -1
  8. package/dist/evaluator.d.ts +2 -2
  9. package/dist/evaluator.d.ts.map +1 -1
  10. package/dist/evaluator.js +72 -41
  11. package/dist/evaluator.js.map +1 -1
  12. package/dist/index.d.ts +6 -4
  13. package/dist/index.d.ts.map +1 -1
  14. package/dist/index.js +8 -21
  15. package/dist/index.js.map +1 -1
  16. package/dist/main.js +88 -81
  17. package/dist/main.js.map +1 -1
  18. package/dist/onboarding.d.ts +4 -0
  19. package/dist/onboarding.d.ts.map +1 -0
  20. package/dist/onboarding.js +63 -0
  21. package/dist/onboarding.js.map +1 -0
  22. package/dist/providers.d.ts +1 -0
  23. package/dist/providers.d.ts.map +1 -1
  24. package/dist/providers.js +11 -1
  25. package/dist/providers.js.map +1 -1
  26. package/dist/types.d.ts +40 -9
  27. package/dist/types.d.ts.map +1 -1
  28. package/dist/util.d.ts +6 -3
  29. package/dist/util.d.ts.map +1 -1
  30. package/dist/util.js +73 -1
  31. package/dist/util.js.map +1 -1
  32. package/dist/web/server.d.ts.map +1 -1
  33. package/dist/web/server.js +0 -11
  34. package/dist/web/server.js.map +1 -1
  35. package/package.json +2 -1
  36. package/src/assertions.ts +141 -28
  37. package/src/cache.ts +0 -1
  38. package/src/evaluator.ts +88 -44
  39. package/src/index.ts +14 -26
  40. package/src/main.ts +115 -102
  41. package/src/onboarding.ts +61 -0
  42. package/src/providers.ts +9 -0
  43. package/src/types.ts +89 -12
  44. package/src/util.ts +90 -3
  45. package/src/web/server.ts +0 -18
package/src/onboarding.ts ADDED
@@ -0,0 +1,61 @@
1
+ export const DEFAULT_PROMPTS = `Your first prompt goes here
2
+ ---
3
+ Next prompt goes here. You can substitute variables like this: {{var1}} {{var2}} {{var3}}
4
+ ---
5
+ This is the next prompt.
6
+
7
+ These prompts are nunjucks templates, so you can use logic like this:
8
+ {% if var1 %}
9
+ {{ var1 }}
10
+ {% endif %}
11
+ ---
12
+ If you prefer, you can break prompts into multiple files (make sure to edit promptfooconfig.yaml accordingly)
13
+ `;
14
+
15
+ export const DEFAULT_YAML_CONFIG = `# This configuration runs each prompt through a series of example inputs and checks if they meet requirements.
16
+
17
+ prompts: [prompts.txt]
18
+ providers: [openai:gpt-3.5-turbo]
19
+ tests:
20
+ - description: First test case - automatic review
21
+ vars:
22
+ var1: first variable's value
23
+ var2: another value
24
+ var3: some other value
25
+ assert:
26
+ - type: equality
27
+ value: expected LLM output goes here
28
+ - type: function
29
+ value: output.includes('some text')
30
+
31
+ - description: Second test case - manual review
32
+ # Test cases don't need assertions if you prefer to manually review the output
33
+ vars:
34
+ var1: new value
35
+ var2: another value
36
+ var3: third value
37
+
38
+ - description: Third test case - other types of automatic review
39
+ vars:
40
+ var1: yet another value
41
+ var2: and another
42
+ var3: dear llm, please output your response in json format
43
+ assert:
44
+ - type: contains-json
45
+ - type: similarity
46
+ value: ensures that output is semantically similar to this text
47
+ - type: llm-rubric
48
+ value: ensure that output contains a reference to X
49
+ `;
50
+
51
+ export const DEFAULT_README = `To get started, set your OPENAI_API_KEY environment variable.
52
+
53
+ Next, change a few of the prompts in prompts.txt and edit promptfooconfig.yaml.
54
+
55
+ Then run:
56
+ \`\`\`
57
+ promptfoo eval
58
+ \`\`\`
59
+
60
+ Afterwards, you can view the results by running \`promptfoo view\`
61
+ `;
package/src/providers.ts CHANGED
@@ -5,6 +5,15 @@ import { ApiProvider } from './types.js';
5
5
  import { OpenAiCompletionProvider, OpenAiChatCompletionProvider } from './providers/openai.js';
6
6
  import { LocalAiCompletionProvider, LocalAiChatProvider } from './providers/localai.js';
7
7
 
8
+ export async function loadApiProviders(providerPaths: string | string[]): Promise<ApiProvider[]> {
9
+ if (typeof providerPaths === 'string') {
10
+ return [await loadApiProvider(providerPaths)];
11
+ } else if (Array.isArray(providerPaths)) {
12
+ return Promise.all(providerPaths.map((provider) => loadApiProvider(provider)));
13
+ }
14
+ throw new Error('Invalid providers list');
15
+ }
16
+
8
17
  export async function loadApiProvider(providerPath: string): Promise<ApiProvider> {
9
18
  if (providerPath?.startsWith('openai:')) {
10
19
  // Load OpenAI module
package/src/types.ts CHANGED
@@ -1,11 +1,16 @@
1
1
  export interface CommandLineOptions {
2
+ // Shared with TestSuite
2
3
  prompts: string[];
3
4
  providers: string[];
4
- output?: string;
5
+ output: string;
6
+
7
+ // Shared with EvaluateOptions
8
+ maxConcurrency: string;
9
+
10
+ // Command line only
5
11
  vars?: string;
6
12
  config?: string;
7
13
  verbose?: boolean;
8
- maxConcurrency?: string;
9
14
  grader?: string;
10
15
  view?: string;
11
16
  tableCellMaxLength?: string;
@@ -48,27 +53,19 @@ export interface CsvRow {
48
53
  export type VarMapping = Record<string, string>;
49
54
 
50
55
  export interface GradingConfig {
51
- prompt?: string;
56
+ rubricPrompt?: string;
52
57
  provider?: string | ApiProvider;
53
58
  }
54
59
 
55
60
  export interface PromptConfig {
56
61
  prefix?: string;
57
62
  suffix?: string;
58
- generateSuggestions?: boolean;
59
63
  }
60
64
 
61
65
  export interface EvaluateOptions {
62
- providers: ApiProvider[];
63
- prompts: string[];
64
- vars?: VarMapping[];
65
-
66
66
  maxConcurrency?: number;
67
67
  showProgressBar?: boolean;
68
-
69
- grading?: GradingConfig;
70
-
71
- prompt?: PromptConfig;
68
+ generateSuggestions?: boolean;
72
69
  }
73
70
 
74
71
  export interface Prompt {
@@ -108,3 +105,83 @@ export interface EvaluateSummary {
108
105
  table: EvaluateTable;
109
106
  stats: EvaluateStats;
110
107
  }
108
+
109
+ export interface GradingResult {
110
+ pass: boolean;
111
+ reason: string;
112
+ tokensUsed?: TokenUsage;
113
+ }
114
+
115
+ // TODO(ian): maybe Assertion should support {type: config} to make the yaml cleaner
116
+ export interface Assertion {
117
+ // Type of assertion
118
+ type: 'equals' | 'is-json' | 'contains-json' | 'javascript' | 'similar' | 'llm-rubric';
119
+
120
+ // The expected value, if applicable
121
+ value?: string;
122
+
123
+ // The threshold value, only applicable for similarity (cosine distance)
124
+ threshold?: number;
125
+
126
+ // Some assertions (similarity, llm-rubric) require an LLM provider
127
+ provider?: ApiProvider;
128
+ }
129
+
130
+ // Each test case is graded pass/fail. A test case represents a unique input to the LLM after substituting `vars` in the prompt.
131
+ export interface TestCase {
132
+ // Optional description of what you're testing
133
+ description?: string;
134
+
135
+ // Key-value pairs to substitute in the prompt
136
+ vars?: Record<string, string>;
137
+
138
+ // Optional list of automatic checks to run on the LLM output
139
+ assert?: Assertion[];
140
+
141
+ // Additional configuration settings for the prompt
142
+ options?: PromptConfig & GradingConfig;
143
+ }
144
+
145
+ // The test suite defines the "knobs" that we are tuning in prompt engineering: providers and prompts
146
+ export interface TestSuite {
147
+ // Optional description of what your LLM is trying to do
148
+ description?: string;
149
+
150
+ // One or more LLM APIs to use
151
+ providers: ApiProvider[];
152
+
153
+ // One or more prompt strings
154
+ prompts: string[];
155
+
156
+ // Test cases
157
+ tests?: TestCase[];
158
+
159
+ // Default test case config
160
+ defaultTest?: Partial<TestCase>;
161
+ }
162
+
163
+ // TestSuiteConfig = Test Suite, but before everything is parsed and resolved. Providers are just strings, prompts are filepaths, tests can be filepath or inline.
164
+ export interface TestSuiteConfig {
165
+ // Optional description of what your LLM is trying to do
166
+ description?: string;
167
+
168
+ // One or more LLM APIs to use, for example: openai:gpt-3.5-turbo, openai:gpt-4, localai:chat:vicuna
169
+ providers: string | string[];
170
+
171
+ // One or more prompt files to load
172
+ prompts: string | string[];
173
+
174
+ // Path to a test file, OR list of LLM prompt variations (aka "test case")
175
+ tests: string | TestCase[];
176
+
177
+ // Sets the default properties for each test case. Useful for setting an assertion, on all test cases, for example.
178
+ defaultTest?: Omit<TestCase, 'description'>;
179
+
180
+ // Path to write output. Writes to console/web viewer if not set.
181
+ outputPath?: string;
182
+ }
183
+
184
+ export type UnifiedConfig = TestSuiteConfig & {
185
+ evaluateOptions: EvaluateOptions;
186
+ commandLineOptions: Partial<CommandLineOptions>;
187
+ };
package/src/util.ts CHANGED
@@ -7,7 +7,6 @@ import yaml from 'js-yaml';
7
7
  import nunjucks from 'nunjucks';
8
8
  import { globSync } from 'glob';
9
9
  import { parse as parsePath } from 'path';
10
- import { CsvRow } from './types.js';
11
10
  import { parse as parseCsv } from 'csv-parse/sync';
12
11
  import { stringify } from 'csv-stringify/sync';
13
12
 
@@ -16,7 +15,16 @@ import { getDirectory } from './esm.js';
16
15
 
17
16
  import type { RequestInfo, RequestInit, Response } from 'node-fetch';
18
17
 
19
- import type { EvaluateSummary } from './types.js';
18
+ import type {
19
+ Assertion,
20
+ CsvRow,
21
+ EvaluateSummary,
22
+ CommandLineOptions,
23
+ TestSuite,
24
+ UnifiedConfig,
25
+ TestCase,
26
+ } from './types.js';
27
+ import { assertionFromString } from './assertions.js';
20
28
 
21
29
  const PROMPT_DELIMITER = '---';
22
30
 
@@ -28,7 +36,35 @@ function parseJson(json: string): any | undefined {
28
36
  }
29
37
  }
30
38
 
31
- export function readPrompts(promptPathsOrGlobs: string[]): string[] {
39
+ export function maybeReadConfig(configPath: string): UnifiedConfig | undefined {
40
+ try {
41
+ return readConfig(configPath);
42
+ } catch {
43
+ return undefined;
44
+ }
45
+ }
46
+
47
+ export function readConfig(configPath: string): UnifiedConfig {
48
+ if (!fs.existsSync(configPath)) {
49
+ throw new Error(`Config file not found: ${configPath}`);
50
+ }
51
+ const ext = path.parse(configPath).ext;
52
+ switch (ext) {
53
+ case '.json':
54
+ const content = fs.readFileSync(configPath, 'utf-8');
55
+ return JSON.parse(content) as UnifiedConfig;
56
+ case '.js':
57
+ return require(configPath) as UnifiedConfig;
58
+ case '.yaml':
59
+ return yaml.load(fs.readFileSync(configPath, 'utf-8')) as UnifiedConfig;
60
+ default:
61
+ throw new Error(`Unsupported configuration file format: ${ext}`);
62
+ }
63
+ }
64
+
65
+ export function readPrompts(promptPathsOrGlobs: string | string[]): string[] {
66
+ promptPathsOrGlobs =
67
+ typeof promptPathsOrGlobs === 'string' ? [promptPathsOrGlobs] : promptPathsOrGlobs;
32
68
  const promptPaths = promptPathsOrGlobs.flatMap((pathOrGlob) => globSync(pathOrGlob));
33
69
  let promptContents: string[] = [];
34
70
 
@@ -49,6 +85,9 @@ export function readPrompts(promptPathsOrGlobs: string[]): string[] {
49
85
  if (promptContents.length === 1) {
50
86
  promptContents = promptContents[0].split(PROMPT_DELIMITER).map((p) => p.trim());
51
87
  }
88
+ if (promptContents.length === 0) {
89
+ throw new Error(`There are no prompts in ${promptPathsOrGlobs.join(', ')}`);
90
+ }
52
91
  return promptContents;
53
92
  }
54
93
 
@@ -67,6 +106,37 @@ export function readVars(varsPath: string): CsvRow[] {
67
106
  return rows;
68
107
  }
69
108
 
109
+ export function readTests(tests: string | TestCase[] | undefined): TestCase[] {
110
+ if (!tests) {
111
+ return [];
112
+ }
113
+
114
+ if (typeof tests === 'string') {
115
+ // It's a filepath, load from CSV
116
+ const vars = readVars(tests);
117
+ return vars.map((row, idx) => {
118
+ const test = testCaseFromCsvRow(row);
119
+ test.description = `Row #${idx + 1}`;
120
+ return test;
121
+ });
122
+ }
123
+
124
+ // Some validation of the shape of tests
125
+ for (const test of tests) {
126
+ if (!test.assert && !test.vars) {
127
+ throw new Error(
128
+ `Test case must have either "assert" or "vars" property. Instead got ${JSON.stringify(
129
+ test,
130
+ null,
131
+ 2,
132
+ )}`,
133
+ );
134
+ }
135
+ }
136
+
137
+ return tests;
138
+ }
139
+
70
140
  export function writeOutput(outputPath: string, summary: EvaluateSummary): void {
71
141
  const outputExtension = outputPath.split('.').pop()?.toLowerCase();
72
142
 
@@ -153,3 +223,20 @@ export function cosineSimilarity(vecA: number[], vecB: number[]) {
153
223
  const vecBMagnitude = Math.sqrt(vecB.reduce((acc, val) => acc + val * val, 0));
154
224
  return dotProduct / (vecAMagnitude * vecBMagnitude);
155
225
  }
226
+
227
+ export function testCaseFromCsvRow(row: CsvRow): TestCase {
228
+ const vars: Record<string, string> = {};
229
+ const asserts: Assertion[] = [];
230
+ for (const [key, value] of Object.entries(row)) {
231
+ if (key === '__expected') {
232
+ asserts.push(assertionFromString(value));
233
+ } else {
234
+ vars[key] = value;
235
+ }
236
+ }
237
+
238
+ return {
239
+ vars,
240
+ assert: asserts,
241
+ };
242
+ }
package/src/web/server.ts CHANGED
@@ -32,24 +32,6 @@ export function init(port = 15500) {
32
32
  },
33
33
  });
34
34
 
35
- interface EvaluateRequestBody {
36
- provider: string;
37
- options: {
38
- prompts: string[];
39
- vars: Record<string, string>[];
40
- };
41
- }
42
-
43
- app.post('/evaluate', async (req: Request, res: Response) => {
44
- try {
45
- const { provider, options } = req.body as EvaluateRequestBody;
46
- const summary = await promptfoo.evaluate(provider, options);
47
- res.json(summary);
48
- } catch (error) {
49
- res.status(500).json({ message: 'Error evaluating prompts' });
50
- }
51
- });
52
-
53
35
  const latestJsonPath = getLatestResultsPath();
54
36
  const readLatestJson = () => {
55
37
  const data = fs.readFileSync(latestJsonPath, 'utf8');