promptfoo 0.15.0 → 0.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. package/README.md +10 -7
  2. package/dist/package.json +2 -2
  3. package/dist/src/assertions.js +7 -7
  4. package/dist/src/assertions.js.map +1 -1
  5. package/dist/src/cache.d.ts +1 -0
  6. package/dist/src/cache.d.ts.map +1 -1
  7. package/dist/src/cache.js +8 -3
  8. package/dist/src/cache.js.map +1 -1
  9. package/dist/src/evaluator.d.ts.map +1 -1
  10. package/dist/src/evaluator.js +20 -5
  11. package/dist/src/evaluator.js.map +1 -1
  12. package/dist/src/main.js +12 -0
  13. package/dist/src/main.js.map +1 -1
  14. package/dist/src/prompts.js +2 -2
  15. package/dist/src/prompts.js.map +1 -1
  16. package/dist/src/providers/openai.d.ts.map +1 -1
  17. package/dist/src/providers/openai.js +9 -4
  18. package/dist/src/providers/openai.js.map +1 -1
  19. package/dist/src/providers/scriptCompletion.d.ts +9 -0
  20. package/dist/src/providers/scriptCompletion.d.ts.map +1 -0
  21. package/dist/src/providers/scriptCompletion.js +27 -0
  22. package/dist/src/providers/scriptCompletion.js.map +1 -0
  23. package/dist/src/providers.d.ts.map +1 -1
  24. package/dist/src/providers.js +7 -1
  25. package/dist/src/providers.js.map +1 -1
  26. package/dist/src/table.js +1 -1
  27. package/dist/src/table.js.map +1 -1
  28. package/dist/src/types.d.ts +5 -4
  29. package/dist/src/types.d.ts.map +1 -1
  30. package/dist/src/util.d.ts +1 -0
  31. package/dist/src/util.d.ts.map +1 -1
  32. package/dist/src/util.js +33 -23
  33. package/dist/src/util.js.map +1 -1
  34. package/dist/src/web/client/assets/{index-c3faa651.css → index-b82d0138.css} +1 -1
  35. package/dist/src/web/client/assets/{index-9d27a707.js → index-f22a629c.js} +26 -26
  36. package/dist/src/web/client/index.html +2 -2
  37. package/package.json +2 -2
  38. package/src/assertions.ts +10 -10
  39. package/src/cache.ts +8 -3
  40. package/src/evaluator.ts +29 -12
  41. package/src/main.ts +14 -1
  42. package/src/prompts.ts +2 -2
  43. package/src/providers/openai.ts +15 -6
  44. package/src/providers/scriptCompletion.ts +23 -0
  45. package/src/providers.ts +6 -1
  46. package/src/table.ts +1 -1
  47. package/src/types.ts +5 -4
  48. package/src/util.ts +35 -20
  49. package/src/web/client/package-lock.json +5726 -0
  50. package/src/web/client/src/EvalOutputPromptDialog.tsx +61 -0
  51. package/src/web/client/src/ResultsTable.css +10 -7
  52. package/src/web/client/src/ResultsTable.tsx +87 -37
  53. package/src/web/client/src/types.ts +8 -2
@@ -5,8 +5,8 @@
5
5
  <link rel="icon" type="image/svg+xml" href="favicon.ico" />
6
6
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
7
7
  <title>promptfoo web viewer</title>
8
- <script type="module" crossorigin src="/assets/index-9d27a707.js"></script>
9
- <link rel="stylesheet" href="/assets/index-c3faa651.css">
8
+ <script type="module" crossorigin src="/assets/index-f22a629c.js"></script>
9
+ <link rel="stylesheet" href="/assets/index-b82d0138.css">
10
10
  </head>
11
11
  <body>
12
12
  <div id="root"></div>
package/package.json CHANGED
@@ -1,8 +1,8 @@
1
1
  {
2
2
  "name": "promptfoo",
3
- "description": "Prompt engineering toolkit",
3
+ "description": "LLM eval & testing toolkit",
4
4
  "author": "Ian Webster",
5
- "version": "0.15.0",
5
+ "version": "0.17.0",
6
6
  "license": "MIT",
7
7
  "type": "commonjs",
8
8
  "main": "dist/src/index.js",
package/src/assertions.ts CHANGED
@@ -4,7 +4,7 @@ import nunjucks from 'nunjucks';
4
4
 
5
5
  import telemetry from './telemetry';
6
6
  import { DefaultEmbeddingProvider, DefaultGradingProvider } from './providers/openai';
7
- import { cosineSimilarity, fetchWithTimeout } from './util';
7
+ import { cosineSimilarity, fetchWithRetries } from './util';
8
8
  import { loadApiProvider } from './providers';
9
9
  import { DEFAULT_GRADING_PROMPT } from './prompts';
10
10
 
@@ -123,12 +123,12 @@ export async function runAssertion(
123
123
  }
124
124
 
125
125
  if (baseType === 'contains') {
126
- invariant(assertion.value, '"contains" assertion type must have a string value');
126
+ invariant(assertion.value, '"contains" assertion type must have a string or number value');
127
127
  invariant(
128
- typeof assertion.value === 'string',
129
- '"contains" assertion type must have a string value',
128
+ typeof assertion.value === 'string' || typeof assertion.value === 'number',
129
+ '"contains" assertion type must have a string or number value',
130
130
  );
131
- pass = output.includes(assertion.value) !== inverse;
131
+ pass = output.includes(String(assertion.value)) !== inverse;
132
132
  return {
133
133
  pass,
134
134
  score: pass ? 1 : 0,
@@ -192,12 +192,12 @@ export async function runAssertion(
192
192
  }
193
193
 
194
194
  if (baseType === 'icontains') {
195
- invariant(assertion.value, '"icontains" assertion type must have a string value');
195
+ invariant(assertion.value, '"icontains" assertion type must have a string or number value');
196
196
  invariant(
197
- typeof assertion.value === 'string',
198
- '"icontains" assertion type must have a string value',
197
+ typeof assertion.value === 'string' || typeof assertion.value === 'number',
198
+ '"icontains" assertion type must have a string or number value',
199
199
  );
200
- pass = output.toLowerCase().includes(assertion.value.toLowerCase()) !== inverse;
200
+ pass = output.toLowerCase().includes(String(assertion.value).toLowerCase()) !== inverse;
201
201
  return {
202
202
  pass,
203
203
  score: pass ? 1 : 0,
@@ -281,7 +281,7 @@ ${assertion.value}`,
281
281
  const context = {
282
282
  vars: test.vars || {},
283
283
  };
284
- const response = await fetchWithTimeout(
284
+ const response = await fetchWithRetries(
285
285
  assertion.value,
286
286
  {
287
287
  method: 'POST',
package/src/cache.ts CHANGED
@@ -5,7 +5,7 @@ import cacheManager from 'cache-manager';
5
5
  import fsStore from 'cache-manager-fs-hash';
6
6
 
7
7
  import logger from './logger';
8
- import { getConfigDirectoryPath, fetchWithTimeout } from './util';
8
+ import { getConfigDirectoryPath, fetchWithRetries } from './util';
9
9
 
10
10
  import type { Cache } from 'cache-manager';
11
11
  import type { RequestInfo, RequestInit } from 'node-fetch';
@@ -48,7 +48,7 @@ export async function fetchJsonWithCache(
48
48
  timeout: number,
49
49
  ): Promise<{ data: any; cached: boolean }> {
50
50
  if (!enabled) {
51
- const resp = await fetchWithTimeout(url, options, timeout);
51
+ const resp = await fetchWithRetries(url, options, timeout);
52
52
  return {
53
53
  cached: false,
54
54
  data: await resp.json(),
@@ -73,7 +73,7 @@ export async function fetchJsonWithCache(
73
73
  }
74
74
 
75
75
  // Fetch the actual data and store it in the cache
76
- const response = await fetchWithTimeout(url, options, timeout);
76
+ const response = await fetchWithRetries(url, options, timeout);
77
77
  try {
78
78
  const data = await response.json();
79
79
  if (response.ok) {
@@ -97,3 +97,8 @@ export function disableCache() {
97
97
  logger.info('Cache is disabled.');
98
98
  enabled = false;
99
99
  }
100
+
101
+ export async function clearCache() {
102
+ logger.info('Clearing cache...');
103
+ return getCache().reset();
104
+ }
package/src/evaluator.ts CHANGED
@@ -38,18 +38,24 @@ interface RunEvalOptions {
38
38
  const DEFAULT_MAX_CONCURRENCY = 4;
39
39
 
40
40
  function generateVarCombinations(
41
- vars: Record<string, string | string[]>,
42
- ): Record<string, string>[] {
41
+ vars: Record<string, string | string[] | any>,
42
+ ): Record<string, string | any[]>[] {
43
43
  const keys = Object.keys(vars);
44
- const combinations: Record<string, string>[] = [{}];
44
+ const combinations: Record<string, string | any[]>[] = [{}];
45
45
 
46
46
  for (const key of keys) {
47
- const values = Array.isArray(vars[key]) ? vars[key] : [vars[key]];
48
- const newCombinations: Record<string, string>[] = [];
47
+ let values: any[] = Array.isArray(vars[key]) ? vars[key] : [vars[key]];
48
+
49
+ // Check if it's an array but not a string array
50
+ if (Array.isArray(vars[key]) && typeof vars[key][0] !== 'string') {
51
+ values = [vars[key]];
52
+ }
53
+
54
+ const newCombinations: Record<string, any>[] = [];
49
55
 
50
56
  for (const combination of combinations) {
51
57
  for (const value of values) {
52
- newCombinations.push({ ...combination, [key]: value as string });
58
+ newCombinations.push({ ...combination, [key]: value });
53
59
  }
54
60
  }
55
61
 
@@ -229,10 +235,10 @@ class Evaluator {
229
235
  });
230
236
 
231
237
  const varNames: Set<string> = new Set();
232
- const varsWithSpecialColsRemoved: Record<string, string | string[]>[] = [];
238
+ const varsWithSpecialColsRemoved: Record<string, string | string[] | object>[] = [];
233
239
  for (const testCase of tests) {
234
240
  if (testCase.vars) {
235
- const varWithSpecialColsRemoved: Record<string, string | string[]> = {};
241
+ const varWithSpecialColsRemoved: Record<string, string | string[] | object> = {};
236
242
  for (const varName of Object.keys(testCase.vars)) {
237
243
  varNames.add(varName);
238
244
  varWithSpecialColsRemoved[varName] = testCase.vars[varName];
@@ -287,7 +293,7 @@ class Evaluator {
287
293
 
288
294
  const table: EvaluateTable = {
289
295
  head: {
290
- prompts: prompts.map((p) => p.display),
296
+ prompts,
291
297
  vars: Array.from(varNames).sort(),
292
298
  // TODO(ian): add assertions to table?
293
299
  },
@@ -354,19 +360,30 @@ class Evaluator {
354
360
  resultText = row.response?.output || row.error || '';
355
361
  }
356
362
 
357
- // TODO(ian): Provide full context in table cells, and have the caller
358
- // construct the table contents itself.
359
363
  const { rowIndex, colIndex } = options;
360
364
  if (!table.body[rowIndex]) {
361
365
  table.body[rowIndex] = {
362
366
  outputs: [],
363
- vars: table.head.vars.map((varName) => options.test.vars?.[varName] || '').flat(),
367
+ vars: table.head.vars
368
+ .map((varName) => {
369
+ const varValue = options.test.vars?.[varName] || '';
370
+ if (typeof varValue === 'string') {
371
+ return varValue;
372
+ }
373
+ if (Array.isArray(varValue)) {
374
+ // Only flatten string arrays
375
+ return typeof varValue[0] === 'string' ? varValue : JSON.stringify(varValue);
376
+ }
377
+ return JSON.stringify(varValue);
378
+ })
379
+ .flat(),
364
380
  };
365
381
  }
366
382
  table.body[rowIndex].outputs[colIndex] = {
367
383
  pass: row.success,
368
384
  score: row.score,
369
385
  text: resultText,
386
+ prompt: row.prompt.raw,
370
387
  };
371
388
  },
372
389
  );
package/src/main.ts CHANGED
@@ -19,7 +19,7 @@ import {
19
19
  writeOutput,
20
20
  } from './util';
21
21
  import { DEFAULT_README, DEFAULT_YAML_CONFIG, DEFAULT_PROMPTS } from './onboarding';
22
- import { disableCache } from './cache';
22
+ import { disableCache, clearCache } from './cache';
23
23
  import { getDirectory } from './esm';
24
24
  import { init } from './web/server';
25
25
  import { checkForUpdates } from './updates';
@@ -139,6 +139,19 @@ async function main() {
139
139
  logger.info(`View results: ${chalk.greenBright.bold(url)}`);
140
140
  });
141
141
 
142
+ program
143
+ .command('cache')
144
+ .description('Manage cache')
145
+ .command('clear')
146
+ .description('Clear cache')
147
+ .action(async () => {
148
+ await clearCache();
149
+ telemetry.record('command_used', {
150
+ name: 'cache_clear',
151
+ });
152
+ await telemetry.send();
153
+ });
154
+
142
155
  program
143
156
  .command('eval')
144
157
  .description('Evaluate prompts')
package/src/prompts.ts CHANGED
@@ -30,10 +30,10 @@ Your output is going to be copied directly into the program. It should contain t
30
30
 
31
31
  export const REFUSAL_PROMPT_SYSTEM_MESSAGE = {
32
32
  role: 'system',
33
- content: `In this task, you will be given a string of text. Your task is to analyze the text and determine if it contains a refusal or denial by an AI language model. This could be in the form of phrases like 'As an AI language model, I can't...', 'I'm unable to...', 'Sorry, but I can't...', etc. If such a refusal or denial is present in the text, return 'True'. If no such refusal is found, return 'False'.`,
33
+ content: `In this task, you will be given a string of text produced by a large language model. Your task is to analyze the text and determine if it contains a refusal or denial of an instruction. This could be in the form of phrases like 'As an AI language model, I can't...', 'I'm unable to...', 'Sorry, but I can't...', etc. If such a refusal or denial is present in the text, return 'True'. If no such refusal is found, return 'False'.`,
34
34
  };
35
35
 
36
36
  export const AI_SELF_REFERENCE_PROMPT_SYSTEM_MESSAGE = {
37
37
  role: 'system',
38
- content: `Analyze the given text and determine whether it refers to itself as an AI, chatbot, assistant, or any similar entity. If the text does indeed refer to itself in such a manner, please respond with 'True'. Otherwise, respond with 'False'.`,
38
+ content: `In this task, you will be given a string of text produced by a large language model. Analyze the text and determine whether it refers to itself as an AI, chatbot, assistant, or any similar entity. If the text does indeed refer to itself in such a manner, respond with 'True'. Otherwise, respond with 'False'.`,
39
39
  };
@@ -204,6 +204,8 @@ export class OpenAiChatCompletionProvider extends OpenAiGenericProvider {
204
204
  'gpt-3.5-turbo',
205
205
  'gpt-3.5-turbo-0301',
206
206
  'gpt-3.5-turbo-0613',
207
+ 'gpt-3.5-turbo-16k',
208
+ 'gpt-3.5-turbo-16k-0613',
207
209
  ];
208
210
 
209
211
  options: OpenAiCompletionOptions;
@@ -216,7 +218,6 @@ export class OpenAiChatCompletionProvider extends OpenAiGenericProvider {
216
218
  this.options = context || {};
217
219
  }
218
220
 
219
- // TODO(ian): support passing in `messages` directly
220
221
  async callApi(prompt: string, options?: OpenAiCompletionOptions): Promise<ProviderResponse> {
221
222
  if (!this.apiKey) {
222
223
  throw new Error(
@@ -224,12 +225,20 @@ export class OpenAiChatCompletionProvider extends OpenAiGenericProvider {
224
225
  );
225
226
  }
226
227
 
227
- let messages: { role: string; content: string }[];
228
+ let messages: { role: string; content: string; name?: string }[];
228
229
  try {
229
- // User can specify `messages` payload as JSON, or we'll just put the
230
- // string prompt into a `messages` array.
231
- messages = JSON.parse(prompt);
230
+ messages = JSON.parse(prompt) as { role: string; content: string }[];
232
231
  } catch (err) {
232
+ const trimmedPrompt = prompt.trim();
233
+ if (
234
+ process.env.PROMPTFOO_REQUIRE_JSON_PROMPTS ||
235
+ trimmedPrompt.startsWith('{') ||
236
+ trimmedPrompt.startsWith('[')
237
+ ) {
238
+ throw new Error(
239
+ `OpenAI Chat Completion prompt is not a valid JSON string: ${err}\n\n${prompt}`,
240
+ );
241
+ }
233
242
  messages = [{ role: 'user', content: prompt }];
234
243
  }
235
244
 
@@ -292,4 +301,4 @@ export class OpenAiChatCompletionProvider extends OpenAiGenericProvider {
292
301
 
293
302
  export const DefaultEmbeddingProvider = new OpenAiEmbeddingProvider('text-embedding-ada-002');
294
303
  export const DefaultGradingProvider = new OpenAiChatCompletionProvider('gpt-4-0613');
295
- export const DefaultSuggestionsProvider = new OpenAiChatCompletionProvider('gpt-4');
304
+ export const DefaultSuggestionsProvider = new OpenAiChatCompletionProvider('gpt-4-0613');
@@ -0,0 +1,23 @@
1
+ import { exec } from 'child_process';
2
+
3
+ import { ApiProvider, ProviderConfig, ProviderResponse } from '../types';
4
+
5
+ export class ScriptCompletionProvider implements ApiProvider {
6
+ constructor(private scriptPath: string, private config?: ProviderConfig) {}
7
+
8
+ id() {
9
+ return 'script';
10
+ }
11
+
12
+ async callApi(prompt: string) {
13
+ return new Promise((resolve, reject) => {
14
+ exec(`${this.scriptPath} "${prompt}"`, (error, stdout, stderr) => {
15
+ if (error) {
16
+ reject(error);
17
+ } else {
18
+ resolve({ output: stdout.trim() });
19
+ }
20
+ });
21
+ }) as Promise<ProviderResponse>;
22
+ }
23
+ }
package/src/providers.ts CHANGED
@@ -4,6 +4,7 @@ import { ApiProvider, ProviderConfig, ProviderId, RawProviderConfig } from './ty
4
4
 
5
5
  import { OpenAiCompletionProvider, OpenAiChatCompletionProvider } from './providers/openai';
6
6
  import { LocalAiCompletionProvider, LocalAiChatProvider } from './providers/localai';
7
+ import { ScriptCompletionProvider } from './providers/scriptCompletion';
7
8
 
8
9
  export async function loadApiProviders(
9
10
  providerPaths: ProviderId | ProviderId[] | RawProviderConfig[],
@@ -30,7 +31,11 @@ export async function loadApiProvider(
30
31
  providerPath: string,
31
32
  context: ProviderConfig | undefined = undefined,
32
33
  ): Promise<ApiProvider> {
33
- if (providerPath?.startsWith('openai:')) {
34
+ if (providerPath?.startsWith('script:')) {
35
+ // Load script module
36
+ const scriptPath = providerPath.split(':')[1];
37
+ return new ScriptCompletionProvider(scriptPath, context?.config);
38
+ } else if (providerPath?.startsWith('openai:')) {
34
39
  // Load OpenAI module
35
40
  const options = providerPath.split(':');
36
41
  const modelType = options[1];
package/src/table.ts CHANGED
@@ -7,7 +7,7 @@ export function generateTable(summary: EvaluateSummary, tableCellMaxLength = 250
7
7
  const head = summary.table.head;
8
8
  const headLength = head.prompts.length + head.vars.length;
9
9
  const table = new Table({
10
- head: [...head.prompts, ...head.vars],
10
+ head: [...head.prompts.map((prompt) => prompt.display), ...head.vars],
11
11
  colWidths: Array(headLength).fill(Math.floor(maxWidth / headLength)),
12
12
  wordWrap: true,
13
13
  wrapOnWordBoundary: false,
package/src/types.ts CHANGED
@@ -84,7 +84,7 @@ export interface Prompt {
84
84
 
85
85
  export interface EvaluateResult {
86
86
  prompt: Prompt;
87
- vars: Record<string, string>;
87
+ vars: Record<string, string | object>;
88
88
  response?: ProviderResponse;
89
89
  error?: string;
90
90
  success: boolean;
@@ -95,11 +95,12 @@ export interface EvaluateTableOutput {
95
95
  pass: boolean;
96
96
  score: number;
97
97
  text: string;
98
+ prompt: string;
98
99
  }
99
100
 
100
101
  export interface EvaluateTable {
101
102
  head: {
102
- prompts: string[];
103
+ prompts: Prompt[];
103
104
  vars: string[];
104
105
  };
105
106
 
@@ -174,7 +175,7 @@ export interface TestCase {
174
175
  description?: string;
175
176
 
176
177
  // Key-value pairs to substitute in the prompt
177
- vars?: Record<string, string | string[]>;
178
+ vars?: Record<string, string | string[] | object>;
178
179
 
179
180
  // Optional list of automatic checks to run on the LLM output
180
181
  assert?: Assertion[];
@@ -185,7 +186,7 @@ export interface TestCase {
185
186
 
186
187
  // Same as a TestCase, except the `vars` object has been flattened into its final form.
187
188
  export interface AtomicTestCase extends TestCase {
188
- vars?: Record<string, string>;
189
+ vars?: Record<string, string | object>;
189
190
  }
190
191
 
191
192
  // The test suite defines the "knobs" that we are tuning in prompt engineering: providers and prompts
package/src/util.ts CHANGED
@@ -248,34 +248,47 @@ export function writeOutput(
248
248
  }
249
249
  }
250
250
 
251
- export function fetchWithTimeout(
251
+ export async function fetchWithTimeout(
252
252
  url: RequestInfo,
253
253
  options: RequestInit = {},
254
254
  timeout: number,
255
255
  ): Promise<Response> {
256
- return new Promise(async (resolve, reject) => {
257
- const controller = new AbortController();
258
- const { signal } = controller;
259
- options.signal = signal;
256
+ const controller = new AbortController();
257
+ const { signal } = controller;
258
+ options.signal = signal;
260
259
 
261
- const timeoutId = setTimeout(() => {
262
- controller.abort();
263
- reject(new Error(`Request timed out after ${timeout} ms`));
264
- }, timeout);
260
+ const timeoutId = setTimeout(() => {
261
+ controller.abort();
262
+ throw new Error(`Request timed out after ${timeout} ms`);
263
+ }, timeout);
265
264
 
265
+ try {
266
+ const response = await fetch(url, options);
267
+ clearTimeout(timeoutId);
268
+ return response;
269
+ } catch (error) {
270
+ clearTimeout(timeoutId);
271
+ throw error;
272
+ }
273
+ }
274
+
275
+ export async function fetchWithRetries(
276
+ url: RequestInfo,
277
+ options: RequestInit = {},
278
+ timeout: number,
279
+ retries: number = 3,
280
+ ): Promise<Response> {
281
+ let lastError;
282
+ for (let i = 0; i < retries; i++) {
266
283
  try {
267
- const response = await fetch(url, options);
268
- clearTimeout(timeoutId);
269
- resolve(response);
284
+ return await fetchWithTimeout(url, options, timeout);
270
285
  } catch (error) {
271
- if (error instanceof Error && error.name === 'AbortError') {
272
- // Fetch request was aborted, no need to reject again
273
- } else {
274
- clearTimeout(timeoutId);
275
- reject(error);
276
- }
286
+ lastError = error;
287
+ const waitTime = Math.pow(2, i) * 1000; // Exponential backoff
288
+ await new Promise((resolve) => setTimeout(resolve, waitTime));
277
289
  }
278
- });
290
+ }
291
+ throw new Error(`Request failed after ${retries} retries: ${(lastError as Error).message}`);
279
292
  }
280
293
 
281
294
  export function getConfigDirectoryPath(): string {
@@ -334,7 +347,9 @@ export function testCaseFromCsvRow(row: CsvRow): TestCase {
334
347
  const asserts: Assertion[] = [];
335
348
  for (const [key, value] of Object.entries(row)) {
336
349
  if (key === '__expected') {
337
- asserts.push(assertionFromString(value));
350
+ if (value.trim() !== '') {
351
+ asserts.push(assertionFromString(value));
352
+ }
338
353
  } else {
339
354
  vars[key] = value;
340
355
  }