promptfoo 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/evaluator.ts CHANGED
@@ -1,9 +1,18 @@
1
1
  import async from 'async';
2
2
  import nunjucks from 'nunjucks';
3
3
 
4
- import type { SingleBar } from 'cli-progress';
4
+ import { DEFAULT_GRADING_PROMPT } from './prompts.js';
5
5
 
6
- import { EvaluateOptions, EvaluateSummary, EvaluateResult, ApiProvider, Prompt } from './types.js';
6
+ import type { SingleBar } from 'cli-progress';
7
+ import type {
8
+ ApiProvider,
9
+ EvaluateOptions,
10
+ EvaluateResult,
11
+ EvaluateStats,
12
+ EvaluateSummary,
13
+ Prompt,
14
+ TokenUsage,
15
+ } from './types.js';
7
16
 
8
17
  interface RunEvalOptions {
9
18
  provider: ApiProvider;
@@ -12,193 +21,297 @@ interface RunEvalOptions {
12
21
  includeProviderId?: boolean;
13
22
  }
14
23
 
15
- const DEFAULT_MAX_CONCURRENCY = 3;
16
-
17
- function checkExpectedValue(expected: string, output: string): boolean {
18
- if (expected.startsWith('eval:')) {
19
- const evalBody = expected.slice(5);
20
- const evalFunction = new Function('output', `return ${evalBody}`);
21
- return evalFunction(output);
22
- } else if (expected.startsWith('grade:')) {
23
- // NYI
24
- return false;
25
- } else {
26
- return expected === output;
27
- }
24
+ interface GradingResult {
25
+ pass: boolean;
26
+ reason: string;
27
+ tokensUsed: TokenUsage;
28
28
  }
29
29
 
30
- async function runEval({
31
- provider,
32
- prompt,
33
- vars,
34
- includeProviderId,
35
- }: RunEvalOptions): Promise<EvaluateResult> {
36
- vars = vars || {};
37
- const renderedPrompt = nunjucks.renderString(prompt, vars);
38
-
39
- // Note that we're using original prompt, not renderedPrompt
40
- const promptDisplay = includeProviderId ? `[${provider.id()}] ${prompt}` : prompt;
41
-
42
- const setup = {
43
- prompt: {
44
- raw: renderedPrompt,
45
- display: promptDisplay,
46
- },
47
- vars,
48
- };
49
-
50
- try {
51
- const response = await provider.callApi(renderedPrompt);
52
- const success = vars.__expected ? checkExpectedValue(vars.__expected, response.output) : true;
53
- const ret: EvaluateResult = {
54
- ...setup,
55
- response,
56
- success,
57
- };
58
- if (!success) {
59
- ret.error = `Expected ${vars.__expected}, got "${response.output}"`;
60
- }
61
- return ret;
62
- } catch (err) {
63
- return {
64
- ...setup,
65
- error: String(err),
66
- success: false,
30
+ const DEFAULT_MAX_CONCURRENCY = 4;
31
+
32
+ class Evaluator {
33
+ options: EvaluateOptions;
34
+ stats: EvaluateStats;
35
+
36
+ constructor(options: EvaluateOptions) {
37
+ this.options = options;
38
+ this.stats = {
39
+ successes: 0,
40
+ failures: 0,
41
+ tokenUsage: {
42
+ total: 0,
43
+ prompt: 0,
44
+ completion: 0,
45
+ },
67
46
  };
68
47
  }
69
- }
70
48
 
71
- export async function evaluate(options: EvaluateOptions): Promise<EvaluateSummary> {
72
- const prompts: Prompt[] = [];
73
- const results: EvaluateResult[] = [];
49
+ async gradeOutput(expected: string, output: string): Promise<GradingResult> {
50
+ const { grading } = this.options;
74
51
 
75
- for (const promptContent of options.prompts) {
76
- for (const provider of options.providers) {
77
- prompts.push({
78
- raw: promptContent,
79
- display:
80
- options.providers.length > 1 ? `[${provider.id()}] ${promptContent}` : promptContent,
81
- });
52
+ if (!grading) {
53
+ throw new Error(
54
+ 'Cannot grade output without grading config. Specify --grader option or grading config.',
55
+ );
82
56
  }
83
- }
84
57
 
85
- const vars = options.vars && options.vars.length > 0 ? options.vars : [{}];
86
- const varsWithExpectedKeyRemoved = vars.map((v) => {
87
- const ret = { ...v };
88
- delete ret.__expected;
89
- return ret;
90
- });
91
- const isTest = vars[0].__expected;
92
- const table: string[][] = [
93
- isTest
94
- ? [
95
- 'RESULT',
96
- [...prompts.map((p) => p.display), ...Object.keys(varsWithExpectedKeyRemoved[0])],
97
- ].flat()
98
- : [...prompts.map((p) => p.display), ...Object.keys(varsWithExpectedKeyRemoved[0])],
99
- ];
100
-
101
- const stats = {
102
- successes: 0,
103
- failures: 0,
104
- tokenUsage: {
105
- total: 0,
106
- prompt: 0,
107
- completion: 0,
108
- },
109
- };
110
-
111
- let progressbar: SingleBar | undefined;
112
- if (options.showProgressBar) {
113
- const totalNumRuns =
114
- options.prompts.length * options.providers.length * (options.vars?.length || 1);
115
- const cliProgress = await import('cli-progress');
116
- progressbar = new cliProgress.SingleBar(
117
- {
118
- format:
119
- 'Eval: [{bar}] {percentage}% | ETA: {eta}s | {value}/{total} | {provider} "{prompt}" {vars}',
120
- },
121
- cliProgress.Presets.shades_classic,
122
- );
123
- progressbar.start(totalNumRuns, 0, {
124
- provider: '',
125
- prompt: '',
126
- vars: '',
58
+ const prompt = nunjucks.renderString(grading.prompt || DEFAULT_GRADING_PROMPT, {
59
+ content: output,
60
+ rubric: expected,
127
61
  });
62
+
63
+ const resp = await grading.provider.callApi(prompt);
64
+ if (resp.error || !resp.output) {
65
+ return {
66
+ pass: false,
67
+ reason: resp.error || 'No output',
68
+ tokensUsed: {
69
+ total: resp.tokenUsage?.total || 0,
70
+ prompt: resp.tokenUsage?.prompt || 0,
71
+ completion: resp.tokenUsage?.completion || 0,
72
+ },
73
+ };
74
+ }
75
+
76
+ try {
77
+ const parsed = JSON.parse(resp.output) as GradingResult;
78
+ parsed.tokensUsed = {
79
+ total: resp.tokenUsage?.total || 0,
80
+ prompt: resp.tokenUsage?.prompt || 0,
81
+ completion: resp.tokenUsage?.completion || 0,
82
+ };
83
+ return parsed;
84
+ } catch (err) {
85
+ return {
86
+ pass: false,
87
+ reason: `Output is not valid JSON: ${resp.output}`,
88
+ tokensUsed: {
89
+ total: resp.tokenUsage?.total || 0,
90
+ prompt: resp.tokenUsage?.prompt || 0,
91
+ completion: resp.tokenUsage?.completion || 0,
92
+ },
93
+ };
94
+ }
128
95
  }
129
96
 
130
- const runEvalOptions: RunEvalOptions[] = [];
131
- for (const row of vars) {
132
- for (const promptContent of options.prompts) {
133
- for (const provider of options.providers) {
134
- runEvalOptions.push({
135
- provider,
136
- prompt: promptContent,
137
- vars: row,
138
- includeProviderId: options.providers.length > 1,
139
- });
140
- }
97
+ async checkExpectedValue(
98
+ expected: string,
99
+ output: string,
100
+ ): Promise<{ pass: boolean; reason?: string }> {
101
+ if (expected.startsWith('eval:')) {
102
+ const evalBody = expected.slice(5);
103
+ const evalFunction = new Function('output', `return ${evalBody}`);
104
+ return { pass: evalFunction(output) };
105
+ } else if (expected.startsWith('grade:')) {
106
+ const gradingResult = await this.gradeOutput(expected.slice(6), output);
107
+ return {
108
+ pass: gradingResult.pass,
109
+ reason: gradingResult.reason,
110
+ };
111
+ } else {
112
+ const pass = expected === output;
113
+ return {
114
+ pass,
115
+ reason: pass ? undefined : `Expected: ${expected}, Output: ${output}`,
116
+ };
141
117
  }
142
118
  }
143
119
 
144
- const combinedOutputs: string[][] = new Array(vars.length).fill(null).map(() => []);
145
- await async.forEachOfLimit(
146
- runEvalOptions,
147
- options.maxConcurrency || DEFAULT_MAX_CONCURRENCY,
148
- async (options: RunEvalOptions, index: number | string) => {
149
- const row = await runEval(options);
150
- results.push(row);
151
- if (row.error) {
152
- stats.failures++;
153
- } else {
154
- if (row.success) {
155
- stats.successes++;
156
- } else {
157
- stats.failures++;
120
+ async runEval({
121
+ provider,
122
+ prompt,
123
+ vars,
124
+ includeProviderId,
125
+ }: RunEvalOptions): Promise<EvaluateResult> {
126
+ vars = vars || {};
127
+ const renderedPrompt = nunjucks.renderString(prompt, vars);
128
+
129
+ // Note that we're using original prompt, not renderedPrompt
130
+ const promptDisplay = includeProviderId ? `[${provider.id()}] ${prompt}` : prompt;
131
+
132
+ const setup = {
133
+ prompt: {
134
+ raw: renderedPrompt,
135
+ display: promptDisplay,
136
+ },
137
+ vars,
138
+ };
139
+
140
+ try {
141
+ const response = await provider.callApi(renderedPrompt);
142
+ const ret: EvaluateResult = {
143
+ ...setup,
144
+ response,
145
+ success: false,
146
+ };
147
+ if (response.error) {
148
+ ret.error = response.error;
149
+ } else if (response.output) {
150
+ const checkResult = vars.__expected
151
+ ? await this.checkExpectedValue(vars.__expected, response.output)
152
+ : { pass: true };
153
+ if (!checkResult.pass) {
154
+ ret.error = checkResult.reason || `Expected: ${vars.__expected}`;
158
155
  }
159
- stats.tokenUsage.total += row.response?.tokenUsage?.total || 0;
160
- stats.tokenUsage.prompt += row.response?.tokenUsage?.prompt || 0;
161
- stats.tokenUsage.completion += row.response?.tokenUsage?.completion || 0;
156
+ ret.success = checkResult.pass;
157
+ } else {
158
+ ret.success = false;
159
+ ret.error = 'No output';
160
+ }
161
+
162
+ // Update token usage stats
163
+ this.stats.tokenUsage.total += response.tokenUsage?.total || 0;
164
+ this.stats.tokenUsage.prompt += response.tokenUsage?.prompt || 0;
165
+ this.stats.tokenUsage.completion += response.tokenUsage?.completion || 0;
166
+
167
+ if (ret.success) {
168
+ this.stats.successes++;
169
+ } else {
170
+ this.stats.failures++;
162
171
  }
163
172
 
164
- if (progressbar) {
165
- progressbar.increment({
166
- provider: options.provider.id(),
167
- prompt: options.prompt.slice(0, 10),
168
- vars: Object.entries(options.vars || {})
169
- .map(([k, v]) => `${k}=${v}`)
170
- .join(' ')
171
- .slice(0, 10),
173
+ return ret;
174
+ } catch (err) {
175
+ return {
176
+ ...setup,
177
+ error: String(err),
178
+ success: false,
179
+ };
180
+ }
181
+ }
182
+
183
+ async evaluate(): Promise<EvaluateSummary> {
184
+ const options = this.options;
185
+ const prompts: Prompt[] = [];
186
+
187
+ for (const promptContent of options.prompts) {
188
+ for (const provider of options.providers) {
189
+ const display =
190
+ options.providers.length > 1 ? `[${provider.id()}] ${promptContent}` : promptContent;
191
+ prompts.push({
192
+ raw: promptContent,
193
+ display,
172
194
  });
173
195
  }
196
+ }
174
197
 
175
- // Bookkeeping for table
176
- if (typeof index !== 'number') {
177
- throw new Error('Expected index to be a number');
198
+ const vars = options.vars && options.vars.length > 0 ? options.vars : [{}];
199
+ const varsWithExpectedKeyRemoved = vars.map((v) => {
200
+ const ret = { ...v };
201
+ delete ret.__expected;
202
+ return ret;
203
+ });
204
+ const isTest = vars[0].__expected;
205
+ const table: string[][] = [
206
+ [...prompts.map((p) => p.display), ...Object.keys(varsWithExpectedKeyRemoved[0])],
207
+ ];
208
+
209
+ let progressbar: SingleBar | undefined;
210
+ if (options.showProgressBar) {
211
+ const totalNumRuns =
212
+ options.prompts.length * options.providers.length * (options.vars?.length || 1);
213
+ const cliProgress = await import('cli-progress');
214
+ progressbar = new cliProgress.SingleBar(
215
+ {
216
+ format:
217
+ 'Eval: [{bar}] {percentage}% | ETA: {eta}s | {value}/{total} | {provider} "{prompt}" {vars}',
218
+ },
219
+ cliProgress.Presets.shades_classic,
220
+ );
221
+ progressbar.start(totalNumRuns, 0, {
222
+ provider: '',
223
+ prompt: '',
224
+ vars: '',
225
+ });
226
+ }
227
+
228
+ const runEvalOptions: RunEvalOptions[] = [];
229
+ for (const row of vars) {
230
+ for (const promptContent of options.prompts) {
231
+ for (const provider of options.providers) {
232
+ runEvalOptions.push({
233
+ provider,
234
+ prompt: promptContent,
235
+ vars: row,
236
+ includeProviderId: options.providers.length > 1,
237
+ });
238
+ }
178
239
  }
179
- const combinedOutputIndex = Math.floor(index / prompts.length);
180
- combinedOutputs[combinedOutputIndex].push(row.response?.output || '');
181
- },
182
- );
240
+ }
183
241
 
184
- if (progressbar) {
185
- progressbar.stop();
186
- }
242
+ const tempResults: { index: number; row: EvaluateResult }[] = [];
243
+ const combinedOutputs: string[][] = new Array(vars.length).fill(null).map(() => []);
244
+ await async.forEachOfLimit(
245
+ runEvalOptions,
246
+ options.maxConcurrency || DEFAULT_MAX_CONCURRENCY,
247
+ async (options: RunEvalOptions, index: number | string) => {
248
+ const row = await this.runEval(options);
249
+ //results[index as number] = row;
250
+ tempResults.push({ index: index as number, row });
187
251
 
188
- // TODO(ian): Display errors in table UI.
189
- if (isTest) {
190
- table.push(
191
- ...combinedOutputs.map((output, index) => [
192
- results[index].success ? 'PASS' : `FAIL: ${results[index].error}`,
193
- ...output,
194
- ...Object.values(varsWithExpectedKeyRemoved[index]),
195
- ]),
196
- );
197
- } else {
198
- table.push(
199
- ...combinedOutputs.map((output, index) => [...output, ...Object.values(vars[index])]),
252
+ if (progressbar) {
253
+ progressbar.increment({
254
+ provider: options.provider.id(),
255
+ prompt: options.prompt.slice(0, 10),
256
+ vars: Object.entries(options.vars || {})
257
+ .map(([k, v]) => `${k}=${v}`)
258
+ .join(' ')
259
+ .slice(0, 10),
260
+ });
261
+ }
262
+
263
+ // Bookkeeping for table
264
+ if (typeof index !== 'number') {
265
+ throw new Error('Expected index to be a number');
266
+ }
267
+ const combinedOutputIndex = Math.floor(index / prompts.length);
268
+ combinedOutputs[combinedOutputIndex].push(row.response?.output || row.error || '');
269
+ },
200
270
  );
271
+
272
+ if (progressbar) {
273
+ progressbar.stop();
274
+ }
275
+
276
+ const results: EvaluateResult[] = [];
277
+ tempResults
278
+ .sort((a, b) => a.index - b.index)
279
+ .forEach(({ index, row }) => {
280
+ results[index] = row;
281
+ });
282
+
283
+ // TODO(ian): Provide full context in table cells, and have the caller
284
+ // construct the table contents itself.
285
+ if (isTest) {
286
+ // Iterate through each combined output
287
+ combinedOutputs.forEach((output, index) => {
288
+ // Create a new array to store the modified output with [PASS] or [FAIL] prepended
289
+ const modifiedOutput: string[] = [];
290
+
291
+ // Iterate through each output value and prepend [PASS] or [FAIL] based on the success status
292
+ output.forEach((o, outputIndex) => {
293
+ const resultIndex = index * prompts.length + outputIndex;
294
+ const result = results[resultIndex];
295
+ // TODO(ian): sometimes output and result.error can be identical (in the case of exception)
296
+ const resultStatus = result.success ? `[PASS] ${o}` : `[FAIL] ${result.error}\n---\n${o}`;
297
+ modifiedOutput.push(resultStatus);
298
+ });
299
+
300
+ // Add the modified output and the corresponding values from varsWithExpectedKeyRemoved to the table
301
+ const tableRow = [...modifiedOutput, ...Object.values(varsWithExpectedKeyRemoved[index])];
302
+ table.push(tableRow);
303
+ });
304
+ } else {
305
+ table.push(
306
+ ...combinedOutputs.map((output, index) => [...output, ...Object.values(vars[index])]),
307
+ );
308
+ }
309
+
310
+ return { results, stats: this.stats, table };
201
311
  }
312
+ }
202
313
 
203
- return { results, stats, table };
314
+ export function evaluate(options: EvaluateOptions) {
315
+ const ev = new Evaluator(options);
316
+ return ev.evaluate();
204
317
  }
package/src/main.ts CHANGED
@@ -10,6 +10,7 @@ import logger, { setLogLevel } from './logger.js';
10
10
  import { loadApiProvider } from './providers.js';
11
11
  import { evaluate } from './evaluator.js';
12
12
  import { readPrompts, readVars, writeOutput } from './util.js';
13
+ import { getDirectory } from './esm.js';
13
14
 
14
15
  import type { CommandLineOptions, EvaluateOptions, VarMapping } from './types.js';
15
16
 
@@ -36,7 +37,7 @@ These prompts are nunjucks templates, so you can use logic like this:
36
37
  prompts: ['prompts.txt'],
37
38
  providers: ['openai:gpt-3.5-turbo'],
38
39
  vars: 'vars.csv',
39
- maxConcurrency: 3,
40
+ maxConcurrency: 4,
40
41
  };`;
41
42
  const readme = `To get started, set your OPENAI_API_KEY environment variable. Then run:
42
43
  \`\`\`
@@ -80,6 +81,14 @@ async function main() {
80
81
 
81
82
  const program = new Command();
82
83
 
84
+ program.option('--version', 'Print version', () => {
85
+ const packageJson = JSON.parse(
86
+ readFileSync(pathJoin(getDirectory(), '../package.json'), 'utf8'),
87
+ );
88
+ console.log(packageJson.version);
89
+ process.exit(0);
90
+ });
91
+
83
92
  program
84
93
  .command('init [directory]')
85
94
  .description('Initialize project with dummy files')
@@ -120,6 +129,7 @@ async function main() {
120
129
  'Maximum number of concurrent API calls',
121
130
  String(defaultConfig.maxConcurrency),
122
131
  )
132
+ .option('--grader', 'Model that will grade outputs', defaultConfig.grader)
123
133
  .option('--verbose', 'Show debug logs', defaultConfig.verbose)
124
134
  .action(async (cmdObj: CommandLineOptions & Command) => {
125
135
  if (cmdObj.verbose) {
@@ -161,6 +171,12 @@ async function main() {
161
171
  ...config,
162
172
  };
163
173
 
174
+ if (cmdObj.grader) {
175
+ options.grading = {
176
+ provider: await loadApiProvider(cmdObj.grader),
177
+ };
178
+ }
179
+
164
180
  const summary = await evaluate(options);
165
181
 
166
182
  if (cmdObj.output) {
@@ -179,10 +195,23 @@ async function main() {
179
195
  head: ['blue', 'bold'],
180
196
  },
181
197
  });
182
- // Skip first row (header) and add the rest. Color the first column green if it's a success, red if it's a failure.
198
+ // Skip first row (header) and add the rest. Color PASS/FAIL
183
199
  for (const row of summary.table.slice(1)) {
184
- const color = row[0] === 'PASS' ? 'green' : row[0].startsWith('FAIL') ? 'red' : undefined;
185
- table.push(row.map((col, i) => (i === 0 && color ? chalk[color](col) : col)));
200
+ table.push(
201
+ row.map((col) => {
202
+ if (col.startsWith('[PASS]')) {
203
+ // color '[PASS]' green
204
+ return chalk.green.bold(col.slice(0, 6)) + col.slice(6);
205
+ } else if (col.startsWith('[FAIL]')) {
206
+ // color everything red up until '---'
207
+ return col
208
+ .split('---')
209
+ .map((c, idx) => (idx === 0 ? chalk.red.bold(c) : c))
210
+ .join('---');
211
+ }
212
+ return col;
213
+ }),
214
+ );
186
215
  }
187
216
 
188
217
  logger.info('\n' + table.toString());
package/src/prompts.ts ADDED
@@ -0,0 +1,20 @@
1
+ export const DEFAULT_GRADING_PROMPT = JSON.stringify([
2
+ {
3
+ role: 'system',
4
+ content: `You are grading content according to a user-specified rubric. If the statement in the rubric is true, then the content passes the test. You respond with a JSON object with this structure: {pass: boolean; reason: string;}.
5
+
6
+ Examples:
7
+
8
+ Content: Hello world
9
+ Rubric: Contains a greeting
10
+ {"pass": true, "reason": "the content contains the word 'world'"}
11
+
12
+ Content: Avast ye swabs, repel the invaders!
13
+ Rubric: Does not speak like a pirate
14
+ {"pass": false, "reason": "'avast ye' is a common pirate term"}`,
15
+ },
16
+ {
17
+ role: 'user',
18
+ content: 'Content: {{ content }}\nRubric: {{ rubric }}',
19
+ },
20
+ ]);