promptfoo 0.5.1 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84)
  1. package/README.md +35 -250
  2. package/dist/__mocks__/esm.js +5 -1
  3. package/dist/__mocks__/esm.js.map +1 -1
  4. package/dist/assertions.d.ts +18 -0
  5. package/dist/assertions.d.ts.map +1 -0
  6. package/dist/assertions.js +128 -0
  7. package/dist/assertions.js.map +1 -0
  8. package/dist/cache.d.ts +8 -0
  9. package/dist/cache.d.ts.map +1 -0
  10. package/dist/cache.js +78 -0
  11. package/dist/cache.js.map +1 -0
  12. package/dist/esm.d.ts.map +1 -1
  13. package/dist/esm.js +10 -3
  14. package/dist/esm.js.map +1 -1
  15. package/dist/evaluator.d.ts.map +1 -1
  16. package/dist/evaluator.js +90 -117
  17. package/dist/evaluator.js.map +1 -1
  18. package/dist/index.d.ts +13 -0
  19. package/dist/index.d.ts.map +1 -1
  20. package/dist/index.js +34 -5
  21. package/dist/index.js.map +1 -1
  22. package/dist/logger.js +18 -11
  23. package/dist/logger.js.map +1 -1
  24. package/dist/main.js +103 -56
  25. package/dist/main.js.map +1 -1
  26. package/dist/prompts.d.ts +4 -0
  27. package/dist/prompts.d.ts.map +1 -1
  28. package/dist/prompts.js +12 -1
  29. package/dist/prompts.js.map +1 -1
  30. package/dist/providers/localai.d.ts.map +1 -1
  31. package/dist/providers/localai.js +23 -17
  32. package/dist/providers/localai.js.map +1 -1
  33. package/dist/providers/openai.d.ts +9 -4
  34. package/dist/providers/openai.d.ts.map +1 -1
  35. package/dist/providers/openai.js +61 -58
  36. package/dist/providers/openai.js.map +1 -1
  37. package/dist/providers/shared.d.ts.map +1 -1
  38. package/dist/providers/shared.js +5 -2
  39. package/dist/providers/shared.js.map +1 -1
  40. package/dist/providers.d.ts +10 -0
  41. package/dist/providers.d.ts.map +1 -1
  42. package/dist/providers.js +51 -14
  43. package/dist/providers.js.map +1 -1
  44. package/dist/suggestions.d.ts +9 -0
  45. package/dist/suggestions.d.ts.map +1 -0
  46. package/dist/suggestions.js +54 -0
  47. package/dist/suggestions.js.map +1 -0
  48. package/dist/types.d.ts +17 -6
  49. package/dist/types.d.ts.map +1 -1
  50. package/dist/types.js +2 -1
  51. package/dist/util.d.ts +1 -1
  52. package/dist/util.d.ts.map +1 -1
  53. package/dist/util.js +85 -31
  54. package/dist/util.js.map +1 -1
  55. package/dist/web/client/assets/index-207192fc.css +1 -0
  56. package/dist/web/client/assets/index-8751749f.js +172 -0
  57. package/dist/web/client/index.html +2 -2
  58. package/dist/web/server.js +38 -31
  59. package/dist/web/server.js.map +1 -1
  60. package/package.json +18 -5
  61. package/src/assertions.ts +154 -0
  62. package/src/cache.ts +91 -0
  63. package/src/esm.ts +5 -2
  64. package/src/evaluator.ts +63 -139
  65. package/src/index.ts +12 -0
  66. package/src/main.ts +39 -9
  67. package/src/prompts.ts +9 -0
  68. package/src/providers/localai.ts +9 -11
  69. package/src/providers/openai.ts +49 -50
  70. package/src/providers/shared.ts +1 -1
  71. package/src/providers.ts +8 -0
  72. package/src/suggestions.ts +63 -0
  73. package/src/types.ts +20 -6
  74. package/src/util.ts +24 -4
  75. package/src/web/client/package.json +1 -0
  76. package/src/web/client/src/App.css +4 -0
  77. package/src/web/client/src/App.tsx +29 -5
  78. package/src/web/client/src/Logo.css +5 -0
  79. package/src/web/client/src/NavBar.css +18 -0
  80. package/src/web/client/src/NavBar.tsx +12 -1
  81. package/src/web/client/src/index.css +10 -0
  82. package/src/web/server.ts +2 -2
  83. package/dist/web/client/assets/index-710f1308.css +0 -1
  84. package/dist/web/client/assets/index-900b20c0.js +0 -172
package/src/evaluator.ts CHANGED
@@ -1,8 +1,11 @@
1
+ import readline from 'node:readline';
2
+
1
3
  import async from 'async';
4
+ import chalk from 'chalk';
2
5
  import nunjucks from 'nunjucks';
3
6
 
4
- import { DEFAULT_GRADING_PROMPT } from './prompts.js';
5
- import { DefaultEmbeddingProvider } from './providers/openai.js';
7
+ import logger from './logger.js';
8
+ import { matchesExpectedValue } from './assertions.js';
6
9
 
7
10
  import type { SingleBar } from 'cli-progress';
8
11
  import type {
@@ -13,9 +16,8 @@ import type {
13
16
  EvaluateSummary,
14
17
  EvaluateTable,
15
18
  Prompt,
16
- TokenUsage,
17
19
  } from './types.js';
18
- import { cosineSimilarity } from './util.js';
20
+ import { generatePrompts } from './suggestions.js';
19
21
 
20
22
  interface RunEvalOptions {
21
23
  provider: ApiProvider;
@@ -27,16 +29,8 @@ interface RunEvalOptions {
27
29
  colIndex: number;
28
30
  }
29
31
 
30
- interface GradingResult {
31
- pass: boolean;
32
- reason: string;
33
- tokensUsed: TokenUsage;
34
- }
35
-
36
32
  const DEFAULT_MAX_CONCURRENCY = 4;
37
33
 
38
- const SIMILAR_REGEX = /similar(?::|\((\d+(\.\d+)?)\):)/;
39
-
40
34
  class Evaluator {
41
35
  options: EvaluateOptions;
42
36
  stats: EvaluateStats;
@@ -50,132 +44,11 @@ class Evaluator {
50
44
  total: 0,
51
45
  prompt: 0,
52
46
  completion: 0,
47
+ cached: 0,
53
48
  },
54
49
  };
55
50
  }
56
51
 
57
- async gradeOutput(expected: string, output: string): Promise<GradingResult> {
58
- const { grading } = this.options;
59
-
60
- if (!grading) {
61
- throw new Error(
62
- 'Cannot grade output without grading config. Specify --grader option or grading config.',
63
- );
64
- }
65
-
66
- const prompt = nunjucks.renderString(grading.prompt || DEFAULT_GRADING_PROMPT, {
67
- content: output,
68
- rubric: expected,
69
- });
70
-
71
- const resp = await grading.provider.callApi(prompt);
72
- if (resp.error || !resp.output) {
73
- return {
74
- pass: false,
75
- reason: resp.error || 'No output',
76
- tokensUsed: {
77
- total: resp.tokenUsage?.total || 0,
78
- prompt: resp.tokenUsage?.prompt || 0,
79
- completion: resp.tokenUsage?.completion || 0,
80
- },
81
- };
82
- }
83
-
84
- try {
85
- const parsed = JSON.parse(resp.output) as GradingResult;
86
- parsed.tokensUsed = {
87
- total: resp.tokenUsage?.total || 0,
88
- prompt: resp.tokenUsage?.prompt || 0,
89
- completion: resp.tokenUsage?.completion || 0,
90
- };
91
- return parsed;
92
- } catch (err) {
93
- return {
94
- pass: false,
95
- reason: `Output is not valid JSON: ${resp.output}`,
96
- tokensUsed: {
97
- total: resp.tokenUsage?.total || 0,
98
- prompt: resp.tokenUsage?.prompt || 0,
99
- completion: resp.tokenUsage?.completion || 0,
100
- },
101
- };
102
- }
103
- }
104
-
105
- async checkSimilarity(
106
- expected: string,
107
- output: string,
108
- threshold: number,
109
- ): Promise<GradingResult> {
110
- const expectedEmbedding = await DefaultEmbeddingProvider.callEmbeddingApi(expected);
111
- const outputEmbedding = await DefaultEmbeddingProvider.callEmbeddingApi(output);
112
-
113
- const tokensUsed = {
114
- total: (expectedEmbedding.tokenUsage?.total || 0) + (outputEmbedding.tokenUsage?.total || 0),
115
- prompt:
116
- (expectedEmbedding.tokenUsage?.prompt || 0) + (outputEmbedding.tokenUsage?.prompt || 0),
117
- completion:
118
- (expectedEmbedding.tokenUsage?.completion || 0) +
119
- (outputEmbedding.tokenUsage?.completion || 0),
120
- };
121
-
122
- if (expectedEmbedding.error || outputEmbedding.error) {
123
- return {
124
- pass: false,
125
- reason:
126
- expectedEmbedding.error || outputEmbedding.error || 'Unknown error fetching embeddings',
127
- tokensUsed,
128
- };
129
- }
130
-
131
- if (!expectedEmbedding.embedding || !outputEmbedding.embedding) {
132
- return {
133
- pass: false,
134
- reason: 'Embedding not found',
135
- tokensUsed,
136
- };
137
- }
138
-
139
- const similarity = cosineSimilarity(expectedEmbedding.embedding, outputEmbedding.embedding);
140
- if (similarity < threshold) {
141
- return {
142
- pass: false,
143
- reason: `Similarity ${similarity} is less than threshold ${threshold}`,
144
- tokensUsed,
145
- };
146
- }
147
- return {
148
- pass: true,
149
- reason: `Similarity ${similarity} is greater than threshold ${threshold}`,
150
- tokensUsed,
151
- };
152
- }
153
-
154
- async checkExpectedValue(
155
- expected: string,
156
- output: string,
157
- ): Promise<{ pass: boolean; reason?: string }> {
158
- const match = expected.match(SIMILAR_REGEX);
159
-
160
- if (match) {
161
- const threshold = parseFloat(match[1]) || 0.8;
162
- const rest = expected.replace(SIMILAR_REGEX, '').trim();
163
- return this.checkSimilarity(rest, output, threshold);
164
- } else if (expected.startsWith('eval:')) {
165
- const evalBody = expected.slice(5);
166
- const evalFunction = new Function('output', `return ${evalBody}`);
167
- return { pass: evalFunction(output) };
168
- } else if (expected.startsWith('grade:')) {
169
- return this.gradeOutput(expected.slice(6), output);
170
- } else {
171
- const pass = expected === output;
172
- return {
173
- pass,
174
- reason: pass ? undefined : `Expected: ${expected}, Output: ${output}`,
175
- };
176
- }
177
- }
178
-
179
52
  async runEval({
180
53
  provider,
181
54
  prompt,
@@ -207,7 +80,7 @@ class Evaluator {
207
80
  ret.error = response.error;
208
81
  } else if (response.output) {
209
82
  const checkResult = vars.__expected
210
- ? await this.checkExpectedValue(vars.__expected, response.output)
83
+ ? await matchesExpectedValue(vars.__expected, response.output, this.options)
211
84
  : { pass: true };
212
85
  if (!checkResult.pass) {
213
86
  ret.error = checkResult.reason || `Expected: ${vars.__expected}`;
@@ -222,6 +95,7 @@ class Evaluator {
222
95
  this.stats.tokenUsage.total += response.tokenUsage?.total || 0;
223
96
  this.stats.tokenUsage.prompt += response.tokenUsage?.prompt || 0;
224
97
  this.stats.tokenUsage.completion += response.tokenUsage?.completion || 0;
98
+ this.stats.tokenUsage.cached += response.tokenUsage?.cached || 0;
225
99
 
226
100
  if (ret.success) {
227
101
  this.stats.successes++;
@@ -243,6 +117,48 @@ class Evaluator {
243
117
  const options = this.options;
244
118
  const prompts: Prompt[] = [];
245
119
 
120
+ if (options.prompt?.generateSuggestions) {
121
+ logger.info(`Generating prompt variations...`);
122
+ const { prompts: newPrompts, error } = await generatePrompts(options.prompts[0], 1);
123
+ if (error || !newPrompts) {
124
+ throw new Error(`Failed to generate prompts: ${error}`);
125
+ }
126
+
127
+ logger.info(chalk.blue('Generated prompts:'));
128
+ let numAdded = 0;
129
+ for (const prompt of newPrompts) {
130
+ logger.info('--------------------------------------------------------');
131
+ logger.info(`${prompt}`);
132
+ logger.info('--------------------------------------------------------');
133
+
134
+ // Ask the user if they want to continue
135
+ await new Promise((resolve) => {
136
+ const rl = readline.createInterface({
137
+ input: process.stdin,
138
+ output: process.stdout,
139
+ });
140
+ rl.question(
141
+ `${chalk.blue('Do you want to test this prompt?')} (y/N): `,
142
+ async (answer) => {
143
+ rl.close();
144
+ if (answer.toLowerCase().startsWith('y')) {
145
+ options.prompts.push(prompt);
146
+ numAdded++;
147
+ } else {
148
+ logger.info('Skipping this prompt.');
149
+ }
150
+ resolve(true);
151
+ },
152
+ );
153
+ });
154
+ }
155
+
156
+ if (numAdded < 1) {
157
+ logger.info(chalk.red('No prompts selected. Aborting.'));
158
+ process.exit(1);
159
+ }
160
+ }
161
+
246
162
  for (const promptContent of options.prompts) {
247
163
  for (const provider of options.providers) {
248
164
  const display =
@@ -255,16 +171,20 @@ class Evaluator {
255
171
  }
256
172
 
257
173
  const vars = options.vars && options.vars.length > 0 ? options.vars : [{}];
258
- const varsWithExpectedKeyRemoved = vars.map((v) => {
174
+ const varsWithSpecialColsRemoved = vars.map((v) => {
259
175
  const ret = { ...v };
260
- delete ret.__expected;
176
+ Object.keys(ret).forEach((key) => {
177
+ if (key.startsWith('__')) {
178
+ delete ret[key];
179
+ }
180
+ });
261
181
  return ret;
262
182
  });
263
183
  const isTest = vars[0].__expected;
264
184
  const table: EvaluateTable = {
265
185
  head: {
266
186
  prompts: prompts.map((p) => p.display),
267
- vars: Object.keys(varsWithExpectedKeyRemoved[0]),
187
+ vars: Object.keys(varsWithSpecialColsRemoved[0]),
268
188
  },
269
189
  body: [],
270
190
  };
@@ -292,11 +212,15 @@ class Evaluator {
292
212
  let rowIndex = 0;
293
213
  for (const row of vars) {
294
214
  let colIndex = 0;
215
+
216
+ const prependToPrompt = row.__prefix || options.prompt?.prefix || '';
217
+ const appendToPrompt = row.__suffix || options.prompt?.suffix || '';
218
+
295
219
  for (const promptContent of options.prompts) {
296
220
  for (const provider of options.providers) {
297
221
  runEvalOptions.push({
298
222
  provider,
299
- prompt: promptContent,
223
+ prompt: prependToPrompt + promptContent + appendToPrompt,
300
224
  vars: row,
301
225
  includeProviderId: options.providers.length > 1,
302
226
  rowIndex,
package/src/index.ts CHANGED
@@ -1,8 +1,12 @@
1
1
  import { evaluate as doEvaluate } from './evaluator.js';
2
2
  import { loadApiProvider } from './providers.js';
3
+ import assertions from './assertions.js';
4
+ import providers from './providers.js';
3
5
 
4
6
  import type { ApiProvider, EvaluateOptions, EvaluateSummary } from './types.js';
5
7
 
8
+ export * from './types.js';
9
+
6
10
  async function evaluate(
7
11
  providers: (string | ApiProvider)[] | (string | ApiProvider),
8
12
  options: Omit<EvaluateOptions, 'providers'>,
@@ -30,6 +34,14 @@ async function evaluate(
30
34
  });
31
35
  }
32
36
 
37
+ module.exports = {
38
+ evaluate,
39
+ assertions,
40
+ providers,
41
+ };
42
+
33
43
  export default {
34
44
  evaluate,
45
+ assertions,
46
+ providers,
35
47
  };
package/src/main.ts CHANGED
@@ -14,6 +14,7 @@ import { getDirectory } from './esm.js';
14
14
  import { init } from './web/server.js';
15
15
 
16
16
  import type { CommandLineOptions, EvaluateOptions, VarMapping } from './types.js';
17
+ import { disableCache } from './cache.js';
17
18
 
18
19
  function createDummyFiles(directory: string | null) {
19
20
  if (directory) {
@@ -34,7 +35,7 @@ These prompts are nunjucks templates, so you can use logic like this:
34
35
  {% endif %}`;
35
36
  const dummyVars =
36
37
  'var1,var2,var3\nvalue1,value2,value3\nanother value1,another value2,another value3';
37
- const dummyConfig = `export default {
38
+ const dummyConfig = `module.exports = {
38
39
  prompts: ['prompts.txt'],
39
40
  providers: ['openai:gpt-3.5-turbo'],
40
41
  vars: 'vars.csv',
@@ -79,6 +80,10 @@ async function main() {
79
80
  defaultConfig = (await import(pathJoin(process.cwd(), './promptfooconfig.js'))).default;
80
81
  logger.info('Loaded default config from promptfooconfig.js');
81
82
  }
83
+ if (existsSync('promptfooconfig.json')) {
84
+ defaultConfig = JSON.parse(readFileSync('promptfooconfig.json', 'utf-8'));
85
+ logger.info('Loaded default config from promptfooconfig.json');
86
+ }
82
87
 
83
88
  const program = new Command();
84
89
 
@@ -143,15 +148,26 @@ async function main() {
143
148
  'Truncate console table cells to this length',
144
149
  '250',
145
150
  )
151
+ .option(
152
+ '--suggest-prompts <number>',
153
+ 'Generate N new prompts and append them to the prompt list',
154
+ )
155
+ .option(
156
+ '--prompt-prefix <path>',
157
+ 'This prefix is prepended to every prompt',
158
+ defaultConfig.promptPrefix,
159
+ )
160
+ .option(
161
+ '--prompt-suffix <path>',
162
+ 'This suffix is append to every prompt',
163
+ defaultConfig.promptSuffix,
164
+ )
146
165
  .option('--no-write', 'Do not write results to promptfoo directory')
166
+ .option('--no-cache', 'Do not read or write results to disk cache')
147
167
  .option('--grader', 'Model that will grade outputs', defaultConfig.grader)
148
168
  .option('--verbose', 'Show debug logs', defaultConfig.verbose)
149
- .option('--view', 'View in browser ui')
169
+ .option('--view [port]', 'View in browser ui')
150
170
  .action(async (cmdObj: CommandLineOptions & Command) => {
151
- if (cmdObj.verbose) {
152
- setLogLevel('debug');
153
- }
154
-
155
171
  const configPath = cmdObj.config;
156
172
  let config = {};
157
173
  if (configPath) {
@@ -169,6 +185,13 @@ async function main() {
169
185
  }
170
186
  }
171
187
 
188
+ if (cmdObj.verbose) {
189
+ setLogLevel('debug');
190
+ }
191
+ if (!cmdObj.cache) {
192
+ disableCache();
193
+ }
194
+
172
195
  let vars: VarMapping[] = [];
173
196
  if (cmdObj.vars) {
174
197
  vars = readVars(cmdObj.vars);
@@ -184,6 +207,10 @@ async function main() {
184
207
  providers,
185
208
  showProgressBar: true,
186
209
  maxConcurrency: !isNaN(maxConcurrency) && maxConcurrency > 0 ? maxConcurrency : undefined,
210
+ prompt: {
211
+ prefix: cmdObj.promptPrefix,
212
+ suffix: cmdObj.promptSuffix,
213
+ },
187
214
  ...config,
188
215
  };
189
216
 
@@ -192,6 +219,9 @@ async function main() {
192
219
  provider: await loadApiProvider(cmdObj.grader),
193
220
  };
194
221
  }
222
+ if (cmdObj.generateSuggestions) {
223
+ options.prompt!.generateSuggestions = true;
224
+ }
195
225
 
196
226
  const summary = await evaluate(options);
197
227
 
@@ -238,7 +268,7 @@ async function main() {
238
268
 
239
269
  logger.info('\n' + table.toString());
240
270
  }
241
- if (cmdObj.noWrite || cmdObj.view) {
271
+ if (cmdObj.view || !cmdObj.write) {
242
272
  logger.info('Evaluation complete');
243
273
  } else {
244
274
  writeLatestResults(summary);
@@ -247,12 +277,12 @@ async function main() {
247
277
  logger.info(chalk.green.bold(`Successes: ${summary.stats.successes}`));
248
278
  logger.info(chalk.red.bold(`Failures: ${summary.stats.failures}`));
249
279
  logger.info(
250
- `Token usage: Total ${summary.stats.tokenUsage.total} Prompt ${summary.stats.tokenUsage.prompt} Completion ${summary.stats.tokenUsage.completion}`,
280
+ `Token usage: Total ${summary.stats.tokenUsage.total}, Prompt ${summary.stats.tokenUsage.prompt}, Completion ${summary.stats.tokenUsage.completion}, Cached ${summary.stats.tokenUsage.cached}`,
251
281
  );
252
282
  logger.info('Done.');
253
283
 
254
284
  if (cmdObj.view) {
255
- init(15500);
285
+ init(parseInt(cmdObj.view, 10) || 15500);
256
286
  }
257
287
  });
258
288
 
package/src/prompts.ts CHANGED
@@ -18,3 +18,12 @@ Rubric: Does not speak like a pirate
18
18
  content: 'Content: {{ content }}\nRubric: {{ rubric }}',
19
19
  },
20
20
  ]);
21
+
22
+ export const SUGGEST_PROMPTS_SYSTEM_MESSAGE = {
23
+ role: 'system',
24
+ content: `You're helping a scientist who is tuning a prompt for a large language model. You will receive messages, and each message is a full prompt. Generate a candidate variation of the given prompt. This variation will be tested for quality in order to select a winner.
25
+
26
+ Substantially revise the prompt, revising its structure and content however necessary to make it perform better, while preserving the original intent and including important details.
27
+
28
+ Your output is going to be copied directly into the program. It should contain the prompt ONLY`,
29
+ };
@@ -1,5 +1,5 @@
1
1
  import logger from '../logger.js';
2
- import { fetchWithTimeout } from '../util.js';
2
+ import { fetchJsonWithCache } from '../cache.js';
3
3
  import { REQUEST_TIMEOUT_MS } from './shared.js';
4
4
 
5
5
  import type { ApiProvider, ProviderResponse } from '../types.js';
@@ -36,9 +36,10 @@ export class LocalAiChatProvider extends LocalAiGenericProvider {
36
36
  };
37
37
  logger.debug(`Calling LocalAI API: ${JSON.stringify(body)}`);
38
38
 
39
- let response, data;
39
+ let data,
40
+ cached = false;
40
41
  try {
41
- response = await fetchWithTimeout(
42
+ ({ data, cached } = (await fetchJsonWithCache(
42
43
  `${this.apiBaseUrl}/chat/completions`,
43
44
  {
44
45
  method: 'POST',
@@ -48,9 +49,7 @@ export class LocalAiChatProvider extends LocalAiGenericProvider {
48
49
  body: JSON.stringify(body),
49
50
  },
50
51
  REQUEST_TIMEOUT_MS,
51
- );
52
-
53
- data = (await response.json()) as unknown as any;
52
+ )) as unknown as any);
54
53
  } catch (err) {
55
54
  return {
56
55
  error: `API call error: ${String(err)}`,
@@ -78,9 +77,10 @@ export class LocalAiCompletionProvider extends LocalAiGenericProvider {
78
77
  };
79
78
  logger.debug(`Calling LocalAI API: ${JSON.stringify(body)}`);
80
79
 
81
- let response, data;
80
+ let data,
81
+ cached = false;
82
82
  try {
83
- response = await fetchWithTimeout(
83
+ ({ data, cached } = (await fetchJsonWithCache(
84
84
  `${this.apiBaseUrl}/completions`,
85
85
  {
86
86
  method: 'POST',
@@ -90,9 +90,7 @@ export class LocalAiCompletionProvider extends LocalAiGenericProvider {
90
90
  body: JSON.stringify(body),
91
91
  },
92
92
  REQUEST_TIMEOUT_MS,
93
- );
94
-
95
- data = (await response.json()) as unknown as any;
93
+ )) as unknown as any);
96
94
  } catch (err) {
97
95
  return {
98
96
  error: `API call error: ${String(err)}`,