promptfoo 0.8.3 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. package/README.md +5 -5
  2. package/dist/assertions.d.ts +3 -3
  3. package/dist/assertions.d.ts.map +1 -1
  4. package/dist/assertions.js +11 -12
  5. package/dist/assertions.js.map +1 -1
  6. package/dist/cache.d.ts.map +1 -1
  7. package/dist/cache.js +9 -9
  8. package/dist/cache.js.map +1 -1
  9. package/dist/evaluator.d.ts +1 -1
  10. package/dist/evaluator.d.ts.map +1 -1
  11. package/dist/evaluator.js +60 -34
  12. package/dist/evaluator.js.map +1 -1
  13. package/dist/index.d.ts +10 -10
  14. package/dist/index.d.ts.map +1 -1
  15. package/dist/index.js +18 -14
  16. package/dist/index.js.map +1 -1
  17. package/dist/main.js +41 -40
  18. package/dist/main.js.map +1 -1
  19. package/dist/providers/localai.js +11 -11
  20. package/dist/providers/localai.js.map +1 -1
  21. package/dist/providers/openai.d.ts.map +1 -1
  22. package/dist/providers/openai.js +30 -21
  23. package/dist/providers/openai.js.map +1 -1
  24. package/dist/providers.d.ts +3 -3
  25. package/dist/providers.d.ts.map +1 -1
  26. package/dist/providers.js +15 -15
  27. package/dist/providers.js.map +1 -1
  28. package/dist/types.d.ts +5 -2
  29. package/dist/types.d.ts.map +1 -1
  30. package/dist/util.d.ts +2 -2
  31. package/dist/util.d.ts.map +1 -1
  32. package/dist/util.js +43 -15
  33. package/dist/util.js.map +1 -1
  34. package/dist/web/client/assets/index-9a9ba400.css +1 -0
  35. package/dist/web/client/assets/{index-8751749f.js → index-b72d3ca9.js} +12 -12
  36. package/dist/web/client/index.html +2 -2
  37. package/dist/web/server.js +9 -9
  38. package/dist/web/server.js.map +1 -1
  39. package/package.json +3 -1
  40. package/src/assertions.ts +8 -9
  41. package/src/cache.ts +5 -4
  42. package/src/evaluator.ts +66 -33
  43. package/src/index.ts +13 -8
  44. package/src/main.ts +13 -18
  45. package/src/providers/localai.ts +3 -3
  46. package/src/providers/openai.ts +16 -8
  47. package/src/providers.ts +3 -3
  48. package/src/types.ts +7 -2
  49. package/src/util.ts +42 -20
  50. package/src/web/client/package-lock.json +5729 -0
  51. package/src/web/client/src/ResultsTable.css +19 -0
  52. package/src/web/client/src/ResultsTable.tsx +51 -37
  53. package/src/web/client/src/ResultsView.tsx +7 -7
  54. package/src/web/server.ts +3 -3
  55. package/dist/web/client/assets/index-207192fc.css +0 -1
@@ -5,8 +5,8 @@
5
5
  <link rel="icon" type="image/svg+xml" href="favicon.ico" />
6
6
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
7
7
  <title>promptfoo web viewer</title>
8
- <script type="module" crossorigin src="/assets/index-8751749f.js"></script>
9
- <link rel="stylesheet" href="/assets/index-207192fc.css">
8
+ <script type="module" crossorigin src="/assets/index-b72d3ca9.js"></script>
9
+ <link rel="stylesheet" href="/assets/index-9a9ba400.css">
10
10
  </head>
11
11
  <body>
12
12
  <div id="root"></div>
@@ -13,12 +13,12 @@ const express_1 = __importDefault(require("express"));
13
13
  const cors_1 = __importDefault(require("cors"));
14
14
  const opener_1 = __importDefault(require("opener"));
15
15
  const socket_io_1 = require("socket.io");
16
- const logger_js_1 = __importDefault(require("../logger.js"));
17
- const esm_js_1 = require("../esm.js");
18
- const util_js_1 = require("../util.js");
16
+ const logger_1 = __importDefault(require("../logger"));
17
+ const esm_1 = require("../esm");
18
+ const util_1 = require("../util");
19
19
  function init(port = 15500) {
20
20
  const app = (0, express_1.default)();
21
- const staticDir = node_path_1.default.join((0, esm_js_1.getDirectory)(), 'web', 'client');
21
+ const staticDir = node_path_1.default.join((0, esm_1.getDirectory)(), 'web', 'client');
22
22
  app.use((0, cors_1.default)());
23
23
  app.use(express_1.default.json());
24
24
  app.use(express_1.default.static(staticDir));
@@ -28,7 +28,7 @@ function init(port = 15500) {
28
28
  origin: '*',
29
29
  },
30
30
  });
31
- const latestJsonPath = (0, util_js_1.getLatestResultsPath)();
31
+ const latestJsonPath = (0, util_1.getLatestResultsPath)();
32
32
  const readLatestJson = () => {
33
33
  const data = fs_1.default.readFileSync(latestJsonPath, 'utf8');
34
34
  const jsonData = JSON.parse(data);
@@ -46,7 +46,7 @@ function init(port = 15500) {
46
46
  });
47
47
  httpServer.listen(port, () => {
48
48
  const url = `http://localhost:${port}`;
49
- logger_js_1.default.info(`Server listening at ${url}`);
49
+ logger_1.default.info(`Server listening at ${url}`);
50
50
  const rl = node_readline_1.default.createInterface({
51
51
  input: process.stdin,
52
52
  output: process.stdout,
@@ -55,14 +55,14 @@ function init(port = 15500) {
55
55
  if (answer.toLowerCase().startsWith('y')) {
56
56
  try {
57
57
  await (0, opener_1.default)(url);
58
- logger_js_1.default.info(`Opening browser to: ${url}`);
58
+ logger_1.default.info(`Opening browser to: ${url}`);
59
59
  }
60
60
  catch (err) {
61
- logger_js_1.default.error(`Failed to open browser: ${String(err)}`);
61
+ logger_1.default.error(`Failed to open browser: ${String(err)}`);
62
62
  }
63
63
  }
64
64
  rl.close();
65
- logger_js_1.default.info('Press Ctrl+C to stop the server');
65
+ logger_1.default.info('Press Ctrl+C to stop the server');
66
66
  });
67
67
  });
68
68
  }
@@ -1 +1 @@
1
- {"version":3,"file":"server.js","sourceRoot":"","sources":["../../src/web/server.ts"],"names":[],"mappings":";;;;;;AAAA,4CAAoB;AACpB,0DAA6B;AAC7B,kEAAqC;AACrC,0DAA6B;AAE7B,wDAAgC;AAChC,sDAA8B;AAC9B,gDAAwB;AACxB,oDAA4B;AAC5B,yCAAqD;AAGrD,6DAAkC;AAClC,sCAAyC;AACzC,wCAAkD;AAIlD,SAAgB,IAAI,CAAC,IAAI,GAAG,KAAK;IAC/B,MAAM,GAAG,GAAG,IAAA,iBAAO,GAAE,CAAC;IAEtB,MAAM,SAAS,GAAG,mBAAI,CAAC,IAAI,CAAC,IAAA,qBAAY,GAAE,EAAE,KAAK,EAAE,QAAQ,CAAC,CAAC;IAE7D,GAAG,CAAC,GAAG,CAAC,IAAA,cAAI,GAAE,CAAC,CAAC;IAChB,GAAG,CAAC,GAAG,CAAC,iBAAO,CAAC,IAAI,EAAE,CAAC,CAAC;IACxB,GAAG,CAAC,GAAG,CAAC,iBAAO,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,CAAC;IAEnC,MAAM,UAAU,GAAG,mBAAI,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;IAC1C,MAAM,EAAE,GAAG,IAAI,kBAAc,CAAC,UAAU,EAAE;QACxC,IAAI,EAAE;YACJ,MAAM,EAAE,GAAG;SACZ;KACF,CAAC,CAAC;IAEH,MAAM,cAAc,GAAG,IAAA,8BAAoB,GAAE,CAAC;IAC9C,MAAM,cAAc,GAAG,GAAG,EAAE;QAC1B,MAAM,IAAI,GAAG,YAAE,CAAC,YAAY,CAAC,cAAc,EAAE,MAAM,CAAC,CAAC;QACrD,MAAM,QAAQ,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAClC,OAAO,QAAQ,CAAC,KAAK,CAAC;IACxB,CAAC,CAAC;IAEF,EAAE,CAAC,EAAE,CAAC,YAAY,EAAE,CAAC,MAAM,EAAE,EAAE;QAC7B,qDAAqD;QACrD,MAAM,CAAC,IAAI,CAAC,MAAM,EAAE,EAAE,KAAK,EAAE,cAAc,EAAE,EAAE,CAAC,CAAC;QAEjD,6DAA6D;QAC7D,YAAE,CAAC,KAAK,CACN,cAAc,EACd,IAAA,kBAAQ,EAAC,CAAC,KAAa,EAAE,EAAE;YACzB,IAAI,KAAK,KAAK,QAAQ,EAAE;gBACtB,MAAM,CAAC,IAAI,CAAC,QAAQ,EAAE,EAAE,KAAK,EAAE,cAAc,EAAE,EAAE,CAAC,CAAC;aACpD;QACH,CAAC,EAAE,GAAG,CAAC,CACR,CAAC;IACJ,CAAC,CAAC,CAAC;IAEH,UAAU,CAAC,MAAM,CAAC,IAAI,EAAE,GAAG,EAAE;QAC3B,MAAM,GAAG,GAAG,oBAAoB,IAAI,EAAE,CAAC;QACvC,mBAAM,CAAC,IAAI,CAAC,uBAAuB,GAAG,EAAE,CAAC,CAAC;QAE1C,MAAM,EAAE,GAAG,uBAAQ,CAAC,eAAe,CAAC;YAClC,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,MAAM,EAAE,OAAO,CAAC,MAAM;SACvB,CAAC,CAAC;QACH,EAAE,CAAC,QAAQ,CAAC,qDAAqD,EAAE,KAAK,EAAE,MAAM,EAAE,EAAE;YAClF,IAAI,MAAM,CAAC,WAAW,EAAE,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE;gBACxC,IAAI;oBACF,MAAM,IAAA,gBAAM,EAAC,GAAG,CAAC,CAAC;oBAClB,mBAAM,CAAC,IAAI,CAAC,uBAAuB,GAAG,EAAE,CAAC,CAAC;iBAC3C;gBAAC,OAAO,GAAG,EAAE;oBACZ,mBAAM,CAAC,KAAK,CAAC,2BAA2B,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;iBACxD;aACF;YACD,EAAE,CAAC,KAAK,EAAE,CAAC;YACX,mBAAM,CAAC,IAAI,CAAC,iCAAiC,CAAC,CAAC;QACjD,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC;AA3DD,oBA2DC"}
1
+ {"version":3,"file":"server.js","sourceRoot":"","sources":["../../src/web/server.ts"],"names":[],"mappings":";;;;;;AAAA,4CAAoB;AACpB,0DAA6B;AAC7B,kEAAqC;AACrC,0DAA6B;AAE7B,wDAAgC;AAChC,sDAA8B;AAC9B,gDAAwB;AACxB,oDAA4B;AAC5B,yCAAqD;AAGrD,uDAA+B;AAC/B,gCAAsC;AACtC,kCAA+C;AAI/C,SAAgB,IAAI,CAAC,IAAI,GAAG,KAAK;IAC/B,MAAM,GAAG,GAAG,IAAA,iBAAO,GAAE,CAAC;IAEtB,MAAM,SAAS,GAAG,mBAAI,CAAC,IAAI,CAAC,IAAA,kBAAY,GAAE,EAAE,KAAK,EAAE,QAAQ,CAAC,CAAC;IAE7D,GAAG,CAAC,GAAG,CAAC,IAAA,cAAI,GAAE,CAAC,CAAC;IAChB,GAAG,CAAC,GAAG,CAAC,iBAAO,CAAC,IAAI,EAAE,CAAC,CAAC;IACxB,GAAG,CAAC,GAAG,CAAC,iBAAO,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,CAAC;IAEnC,MAAM,UAAU,GAAG,mBAAI,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;IAC1C,MAAM,EAAE,GAAG,IAAI,kBAAc,CAAC,UAAU,EAAE;QACxC,IAAI,EAAE;YACJ,MAAM,EAAE,GAAG;SACZ;KACF,CAAC,CAAC;IAEH,MAAM,cAAc,GAAG,IAAA,2BAAoB,GAAE,CAAC;IAC9C,MAAM,cAAc,GAAG,GAAG,EAAE;QAC1B,MAAM,IAAI,GAAG,YAAE,CAAC,YAAY,CAAC,cAAc,EAAE,MAAM,CAAC,CAAC;QACrD,MAAM,QAAQ,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAClC,OAAO,QAAQ,CAAC,KAAK,CAAC;IACxB,CAAC,CAAC;IAEF,EAAE,CAAC,EAAE,CAAC,YAAY,EAAE,CAAC,MAAM,EAAE,EAAE;QAC7B,qDAAqD;QACrD,MAAM,CAAC,IAAI,CAAC,MAAM,EAAE,EAAE,KAAK,EAAE,cAAc,EAAE,EAAE,CAAC,CAAC;QAEjD,6DAA6D;QAC7D,YAAE,CAAC,KAAK,CACN,cAAc,EACd,IAAA,kBAAQ,EAAC,CAAC,KAAa,EAAE,EAAE;YACzB,IAAI,KAAK,KAAK,QAAQ,EAAE;gBACtB,MAAM,CAAC,IAAI,CAAC,QAAQ,EAAE,EAAE,KAAK,EAAE,cAAc,EAAE,EAAE,CAAC,CAAC;aACpD;QACH,CAAC,EAAE,GAAG,CAAC,CACR,CAAC;IACJ,CAAC,CAAC,CAAC;IAEH,UAAU,CAAC,MAAM,CAAC,IAAI,EAAE,GAAG,EAAE;QAC3B,MAAM,GAAG,GAAG,oBAAoB,IAAI,EAAE,CAAC;QACvC,gBAAM,CAAC,IAAI,CAAC,uBAAuB,GAAG,EAAE,CAAC,CAAC;QAE1C,MAAM,EAAE,GAAG,uBAAQ,CAAC,eAAe,CAAC;YAClC,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,MAAM,EAAE,OAAO,CAAC,MAAM;SACvB,CAAC,CAAC;QACH,EAAE,CAAC,QAAQ,CAAC,qDAAqD,EAAE,KAAK,EAAE,MAAM,EAAE,EAAE;YAClF,IAAI,MAAM,CAAC,WAAW,EAAE,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE;gBACxC,IAAI;oBACF,MAAM,IAAA,gBAAM,EAAC,GAAG,CAAC,CAAC;oBAClB,gBAAM,CAAC,IAAI,CAAC,uBAAuB,GAAG,EAAE,CAAC,CAAC;iBAC3C;gBAAC,OAAO,GAAG,EAAE;oBACZ,gBAAM,CAAC,KAAK,CAAC,2BAA2B,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;iBACxD;aACF;YACD,EAAE,CAAC,KAAK,EAAE,CAAC;YACX,gBAAM,CAAC,IAAI,CAAC,iCAAiC,CAAC,CAAC;QACjD,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC;AA3DD,oBA2DC"}
package/package.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "name": "promptfoo",
3
3
  "description": "Prompt engineering toolkit",
4
4
  "author": "Ian Webster",
5
- "version": "0.8.3",
5
+ "version": "0.10.0",
6
6
  "license": "MIT",
7
7
  "type": "commonjs",
8
8
  "main": "dist/index.js",
@@ -38,6 +38,7 @@
38
38
  },
39
39
  "devDependencies": {
40
40
  "@types/async": "^3.2.20",
41
+ "@types/cache-manager": "^4.0.2",
41
42
  "@types/cache-manager-fs-hash": "^0.0.1",
42
43
  "@types/cli-progress": "^3.11.0",
43
44
  "@types/cors": "^2.8.13",
@@ -50,6 +51,7 @@
50
51
  "@types/nunjucks": "^3.2.2",
51
52
  "@types/opener": "^1.4.0",
52
53
  "babel-jest": "^29.5.0",
54
+ "jest": "^29.5.0",
53
55
  "jest-watch-typeahead": "^2.2.2",
54
56
  "prettier": "^2.8.8",
55
57
  "ts-jest": "^29.1.0",
package/src/assertions.ts CHANGED
@@ -1,18 +1,18 @@
1
1
  import invariant from 'tiny-invariant';
2
2
  import nunjucks from 'nunjucks';
3
3
 
4
- import { DefaultEmbeddingProvider, DefaultGradingProvider } from './providers/openai.js';
5
- import { cosineSimilarity } from './util.js';
6
- import { loadApiProvider } from './providers.js';
7
- import { DEFAULT_GRADING_PROMPT } from './prompts.js';
4
+ import { DefaultEmbeddingProvider, DefaultGradingProvider } from './providers/openai';
5
+ import { cosineSimilarity } from './util';
6
+ import { loadApiProvider } from './providers';
7
+ import { DEFAULT_GRADING_PROMPT } from './prompts';
8
8
 
9
- import type { Assertion, GradingConfig, TestCase, GradingResult } from './types.js';
9
+ import type { Assertion, GradingConfig, TestCase, GradingResult, AtomicTestCase } from './types';
10
10
 
11
11
  const SIMILAR_REGEX = /similar(?::|\((\d+(\.\d+)?)\):)/;
12
12
 
13
13
  const DEFAULT_SEMANTIC_SIMILARITY_THRESHOLD = 0.8;
14
14
 
15
- export async function runAssertions(test: TestCase, output: string): Promise<GradingResult> {
15
+ export async function runAssertions(test: AtomicTestCase, output: string): Promise<GradingResult> {
16
16
  const tokensUsed = {
17
17
  total: 0,
18
18
  prompt: 0,
@@ -41,7 +41,7 @@ export async function runAssertions(test: TestCase, output: string): Promise<Gra
41
41
 
42
42
  export async function runAssertion(
43
43
  assertion: Assertion,
44
- test: TestCase,
44
+ test: AtomicTestCase,
45
45
  output: string,
46
46
  ): Promise<GradingResult> {
47
47
  let pass: boolean = false;
@@ -92,8 +92,7 @@ export async function runAssertion(
92
92
 
93
93
  if (assertion.type === 'similar') {
94
94
  invariant(assertion.value, 'Similarity assertion must have a string value');
95
- invariant(assertion.threshold, 'Similarity assertion must have a threshold');
96
- return matchesSimilarity(assertion.value, output, assertion.threshold);
95
+ return matchesSimilarity(assertion.value, output, assertion.threshold || 0.75);
97
96
  }
98
97
 
99
98
  if (assertion.type === 'llm-rubric') {
package/src/cache.ts CHANGED
@@ -4,8 +4,8 @@ import path from 'node:path';
4
4
  import cacheManager from 'cache-manager';
5
5
  import fsStore from 'cache-manager-fs-hash';
6
6
 
7
- import logger from './logger.js';
8
- import { getConfigDirectoryPath, fetchWithTimeout } from './util.js';
7
+ import logger from './logger';
8
+ import { getConfigDirectoryPath, fetchWithTimeout } from './util';
9
9
 
10
10
  import type { Cache } from 'cache-manager';
11
11
  import type { RequestInfo, RequestInit } from 'node-fetch';
@@ -22,9 +22,10 @@ const cacheType =
22
22
 
23
23
  function getCache() {
24
24
  if (!cacheInstance) {
25
- const cachePath = process.env.PROMPTFOO_CACHE_PATH || path.join(getConfigDirectoryPath(), 'cache');
25
+ const cachePath =
26
+ process.env.PROMPTFOO_CACHE_PATH || path.join(getConfigDirectoryPath(), 'cache');
26
27
  if (!fs.existsSync(cachePath)) {
27
- logger.info('Creating cache folder at ${cachePath}.');
28
+ logger.info(`Creating cache folder at ${cachePath}.`);
28
29
  fs.mkdirSync(cachePath, { recursive: true });
29
30
  }
30
31
  cacheInstance = cacheManager.caching({
package/src/evaluator.ts CHANGED
@@ -4,8 +4,8 @@ import async from 'async';
4
4
  import chalk from 'chalk';
5
5
  import nunjucks from 'nunjucks';
6
6
 
7
- import logger from './logger.js';
8
- import { runAssertions } from './assertions.js';
7
+ import logger from './logger';
8
+ import { runAssertions } from './assertions';
9
9
 
10
10
  import type { SingleBar } from 'cli-progress';
11
11
  import type {
@@ -18,14 +18,15 @@ import type {
18
18
  TestSuite,
19
19
  Prompt,
20
20
  TestCase,
21
- } from './types.js';
22
- import { generatePrompts } from './suggestions.js';
21
+ AtomicTestCase,
22
+ } from './types';
23
+ import { generatePrompts } from './suggestions';
23
24
 
24
25
  interface RunEvalOptions {
25
26
  provider: ApiProvider;
26
- prompt: string;
27
+ prompt: Prompt;
27
28
 
28
- test: TestCase;
29
+ test: AtomicTestCase;
29
30
 
30
31
  includeProviderId?: boolean;
31
32
 
@@ -35,6 +36,29 @@ interface RunEvalOptions {
35
36
 
36
37
  const DEFAULT_MAX_CONCURRENCY = 4;
37
38
 
39
+ function generateVarCombinations(
40
+ vars: Record<string, string | string[]>,
41
+ ): Record<string, string>[] {
42
+ const keys = Object.keys(vars);
43
+ const combinations: Record<string, string>[] = [{}];
44
+
45
+ for (const key of keys) {
46
+ const values = Array.isArray(vars[key]) ? vars[key] : [vars[key]];
47
+ const newCombinations: Record<string, string>[] = [];
48
+
49
+ for (const combination of combinations) {
50
+ for (const value of values) {
51
+ newCombinations.push({ ...combination, [key]: value as string });
52
+ }
53
+ }
54
+
55
+ combinations.length = 0;
56
+ combinations.push(...newCombinations);
57
+ }
58
+
59
+ return combinations;
60
+ }
61
+
38
62
  class Evaluator {
39
63
  testSuite: TestSuite;
40
64
  options: EvaluateOptions;
@@ -62,10 +86,13 @@ class Evaluator {
62
86
  includeProviderId,
63
87
  }: RunEvalOptions): Promise<EvaluateResult> {
64
88
  const vars = test.vars || {};
65
- const renderedPrompt = nunjucks.renderString(prompt, vars);
89
+ const renderedPrompt = nunjucks.renderString(prompt.raw, vars);
66
90
 
67
91
  // Note that we're using original prompt, not renderedPrompt
68
- const promptDisplay = includeProviderId ? `[${provider.id()}] ${prompt}` : prompt;
92
+ let promptDisplay = prompt.display;
93
+ if (includeProviderId) {
94
+ promptDisplay = `[${provider.id()}] ${promptDisplay}`;
95
+ }
69
96
 
70
97
  const setup = {
71
98
  prompt: {
@@ -131,7 +158,7 @@ class Evaluator {
131
158
  if (options.generateSuggestions) {
132
159
  // TODO(ian): Move this into its own command/file
133
160
  logger.info(`Generating prompt variations...`);
134
- const { prompts: newPrompts, error } = await generatePrompts(testSuite.prompts[0], 1);
161
+ const { prompts: newPrompts, error } = await generatePrompts(testSuite.prompts[0].raw, 1);
135
162
  if (error || !newPrompts) {
136
163
  throw new Error(`Failed to generate prompts: ${error}`);
137
164
  }
@@ -154,7 +181,7 @@ class Evaluator {
154
181
  async (answer) => {
155
182
  rl.close();
156
183
  if (answer.toLowerCase().startsWith('y')) {
157
- testSuite.prompts.push(prompt);
184
+ testSuite.prompts.push({ raw: prompt, display: prompt });
158
185
  numAdded++;
159
186
  } else {
160
187
  logger.info('Skipping this prompt.');
@@ -172,13 +199,13 @@ class Evaluator {
172
199
  }
173
200
 
174
201
  // Split prompts by provider
175
- for (const promptContent of testSuite.prompts) {
202
+ for (const prompt of testSuite.prompts) {
176
203
  for (const provider of testSuite.providers) {
177
- const display =
178
- testSuite.providers.length > 1 ? `[${provider.id()}] ${promptContent}` : promptContent;
204
+ const updatedDisplay =
205
+ testSuite.providers.length > 1 ? `[${provider.id()}] ${prompt.display}` : prompt.display;
179
206
  prompts.push({
180
- raw: promptContent,
181
- display,
207
+ ...prompt,
208
+ display: updatedDisplay,
182
209
  });
183
210
  }
184
211
  }
@@ -197,10 +224,10 @@ class Evaluator {
197
224
  });
198
225
 
199
226
  const varNames: Set<string> = new Set();
200
- const varsWithSpecialColsRemoved: Record<string, string>[] = [];
227
+ const varsWithSpecialColsRemoved: Record<string, string | string[]>[] = [];
201
228
  for (const testCase of tests) {
202
229
  if (testCase.vars) {
203
- const varWithSpecialColsRemoved: Record<string, string> = {};
230
+ const varWithSpecialColsRemoved: Record<string, string | string[]> = {};
204
231
  for (const varName of Object.keys(testCase.vars)) {
205
232
  varNames.add(varName);
206
233
  varWithSpecialColsRemoved[varName] = testCase.vars[varName];
@@ -224,6 +251,7 @@ class Evaluator {
224
251
  // And progress bar...
225
252
  let progressbar: SingleBar | undefined;
226
253
  if (options.showProgressBar) {
254
+ // FIXME(ian): Add var combinations too
227
255
  const totalNumRuns =
228
256
  testSuite.prompts.length * testSuite.providers.length * (tests.length || 1);
229
257
  const cliProgress = await import('cli-progress');
@@ -245,8 +273,6 @@ class Evaluator {
245
273
  const runEvalOptions: RunEvalOptions[] = [];
246
274
  let rowIndex = 0;
247
275
  for (const testCase of tests) {
248
- let colIndex = 0;
249
-
250
276
  // Handle default properties
251
277
  testCase.vars = Object.assign({}, testSuite.defaultTest?.vars, testCase.vars);
252
278
  testCase.assert = [...(testSuite.defaultTest?.assert || []), ...(testCase.assert || [])];
@@ -259,20 +285,27 @@ class Evaluator {
259
285
  testCase.options?.suffix || testSuite.defaultTest?.options?.suffix || '';
260
286
 
261
287
  // Finalize test case eval
262
- for (const promptContent of testSuite.prompts) {
263
- for (const provider of testSuite.providers) {
264
- runEvalOptions.push({
265
- provider,
266
- prompt: prependToPrompt + promptContent + appendToPrompt,
267
- test: testCase,
268
- includeProviderId: testSuite.providers.length > 1,
269
- rowIndex,
270
- colIndex,
271
- });
272
- colIndex++;
288
+ const varCombinations = generateVarCombinations(testCase.vars || {});
289
+ for (const vars of varCombinations) {
290
+ let colIndex = 0;
291
+ for (const prompt of testSuite.prompts) {
292
+ for (const provider of testSuite.providers) {
293
+ runEvalOptions.push({
294
+ provider,
295
+ prompt: {
296
+ ...prompt,
297
+ raw: prependToPrompt + prompt.raw + appendToPrompt,
298
+ },
299
+ test: { ...testCase, vars },
300
+ includeProviderId: testSuite.providers.length > 1,
301
+ rowIndex,
302
+ colIndex,
303
+ });
304
+ colIndex++;
305
+ }
273
306
  }
307
+ rowIndex++;
274
308
  }
275
- rowIndex++;
276
309
  }
277
310
 
278
311
  // Actually run the eval
@@ -288,7 +321,7 @@ class Evaluator {
288
321
  if (progressbar) {
289
322
  progressbar.increment({
290
323
  provider: options.provider.id(),
291
- prompt: options.prompt.slice(0, 10),
324
+ prompt: options.prompt.raw.slice(0, 10),
292
325
  vars: Object.entries(options.test.vars || {})
293
326
  .map(([k, v]) => `${k}=${v}`)
294
327
  .join(' ')
@@ -320,7 +353,7 @@ class Evaluator {
320
353
  if (!table.body[rowIndex]) {
321
354
  table.body[rowIndex] = {
322
355
  outputs: [],
323
- vars: table.head.vars.map((varName) => options.test.vars?.[varName] || ''),
356
+ vars: table.head.vars.map((varName) => options.test.vars?.[varName] || '').flat(),
324
357
  };
325
358
  }
326
359
  table.body[rowIndex].outputs[colIndex] = resultText;
package/src/index.ts CHANGED
@@ -1,12 +1,12 @@
1
- import { evaluate as doEvaluate } from './evaluator.js';
2
- import { loadApiProviders } from './providers.js';
3
- import assertions from './assertions.js';
4
- import providers from './providers.js';
1
+ import { evaluate as doEvaluate } from './evaluator';
2
+ import { loadApiProviders } from './providers';
3
+ import assertions from './assertions';
4
+ import providers from './providers';
5
5
 
6
- import type { EvaluateOptions, TestSuite, TestSuiteConfig } from './types.js';
7
- import { readTests } from './util.js';
6
+ import type { EvaluateOptions, TestSuite, TestSuiteConfig } from './types';
7
+ import { readTests } from './util';
8
8
 
9
- export * from './types.js';
9
+ export * from './types';
10
10
 
11
11
  interface EvaluateTestSuite extends TestSuiteConfig {
12
12
  prompts: string[];
@@ -15,9 +15,14 @@ interface EvaluateTestSuite extends TestSuiteConfig {
15
15
  async function evaluate(testSuite: EvaluateTestSuite, options: EvaluateOptions = {}) {
16
16
  const constructedTestSuite: TestSuite = {
17
17
  ...testSuite,
18
- prompts: testSuite.prompts, // raw prompts expected
19
18
  providers: await loadApiProviders(testSuite.providers),
20
19
  tests: await readTests(testSuite.tests),
20
+
21
+ // Full prompts expected (not filepaths)
22
+ prompts: testSuite.prompts.map((promptContent) => ({
23
+ raw: promptContent,
24
+ display: promptContent,
25
+ })),
21
26
  };
22
27
  return doEvaluate(constructedTestSuite, options);
23
28
  }
package/src/main.ts CHANGED
@@ -6,9 +6,9 @@ import Table from 'cli-table3';
6
6
  import chalk from 'chalk';
7
7
  import { Command } from 'commander';
8
8
 
9
- import logger, { setLogLevel } from './logger.js';
10
- import { loadApiProvider, loadApiProviders } from './providers.js';
11
- import { evaluate } from './evaluator.js';
9
+ import logger, { setLogLevel } from './logger';
10
+ import { loadApiProvider, loadApiProviders } from './providers';
11
+ import { evaluate } from './evaluator';
12
12
  import {
13
13
  maybeReadConfig,
14
14
  readConfig,
@@ -16,10 +16,10 @@ import {
16
16
  readTests,
17
17
  writeLatestResults,
18
18
  writeOutput,
19
- } from './util.js';
20
- import { getDirectory } from './esm.js';
21
- import { init } from './web/server.js';
22
- import { disableCache } from './cache.js';
19
+ } from './util';
20
+ import { getDirectory } from './esm';
21
+ import { init } from './web/server';
22
+ import { disableCache } from './cache';
23
23
 
24
24
  import type {
25
25
  CommandLineOptions,
@@ -27,8 +27,8 @@ import type {
27
27
  TestCase,
28
28
  TestSuite,
29
29
  UnifiedConfig,
30
- } from './types.js';
31
- import { DEFAULT_README, DEFAULT_YAML_CONFIG, DEFAULT_PROMPTS } from './onboarding.js';
30
+ } from './types';
31
+ import { DEFAULT_README, DEFAULT_YAML_CONFIG, DEFAULT_PROMPTS } from './onboarding';
32
32
 
33
33
  function createDummyFiles(directory: string | null) {
34
34
  if (directory) {
@@ -52,11 +52,9 @@ function createDummyFiles(directory: string | null) {
52
52
  writeFileSync(pathJoin(process.cwd(), directory, 'README.md'), DEFAULT_README);
53
53
 
54
54
  if (directory === '.') {
55
- logger.info(
56
- 'Wrote prompts.txt, vars.csv, and promptfooconfig.js. Open README.md to get started!',
57
- );
55
+ logger.info('Wrote prompts.txt and promptfooconfig.js. Open README.md to get started!');
58
56
  } else {
59
- logger.info(`Wrote prompts.txt, vars.csv, and promptfooconfig.js to ${directory}`);
57
+ logger.info(`Wrote prompts.txt and promptfooconfig.js to ./${directory}`);
60
58
  logger.info(`\`cd ${directory}\` and open README.md to get started!`);
61
59
  }
62
60
  }
@@ -128,11 +126,7 @@ async function main() {
128
126
  'Path to CSV with test cases',
129
127
  config?.commandLineOptions?.vars,
130
128
  )
131
- .option(
132
- '-t, --tests <path>',
133
- 'Path to CSV with test cases',
134
- config?.commandLineOptions?.tests,
135
- )
129
+ .option('-t, --tests <path>', 'Path to CSV with test cases', config?.commandLineOptions?.tests)
136
130
  .option('-o, --output <path>', 'Path to output file (csv, json, yaml, html)', config.outputPath)
137
131
  .option(
138
132
  '-j, --max-concurrency <number>',
@@ -184,6 +178,7 @@ async function main() {
184
178
  prompts: cmdObj.prompts || config.prompts,
185
179
  providers: cmdObj.providers || config.providers,
186
180
  tests: cmdObj.tests || cmdObj.vars || config.tests,
181
+ defaultTest: config.defaultTest,
187
182
  };
188
183
  }
189
184
 
@@ -1,6 +1,6 @@
1
- import logger from '../logger.js';
2
- import { fetchJsonWithCache } from '../cache.js';
3
- import { REQUEST_TIMEOUT_MS } from './shared.js';
1
+ import logger from '../logger';
2
+ import { fetchJsonWithCache } from '../cache';
3
+ import { REQUEST_TIMEOUT_MS } from './shared';
4
4
 
5
5
  import type { ApiProvider, ProviderResponse } from '../types.js';
6
6
 
@@ -1,6 +1,6 @@
1
- import logger from '../logger.js';
2
- import { fetchJsonWithCache } from '../cache.js';
3
- import { REQUEST_TIMEOUT_MS } from './shared.js';
1
+ import logger from '../logger';
2
+ import { fetchJsonWithCache } from '../cache';
3
+ import { REQUEST_TIMEOUT_MS } from './shared';
4
4
 
5
5
  import type { ApiProvider, ProviderEmbeddingResponse, ProviderResponse } from '../types.js';
6
6
 
@@ -126,12 +126,20 @@ export class OpenAiCompletionProvider extends OpenAiGenericProvider {
126
126
  );
127
127
  }
128
128
 
129
+ let stop: string;
130
+ try {
131
+ stop = process.env.OPENAI_STOP
132
+ ? JSON.parse(process.env.OPENAI_STOP)
133
+ : ['<|im_end|>', '<|endoftext|>'];
134
+ } catch (err) {
135
+ throw new Error(`OPENAI_STOP is not a valid JSON string: ${err}`);
136
+ }
129
137
  const body = {
130
138
  model: this.modelName,
131
139
  prompt,
132
- max_tokens: process.env.OPENAI_MAX_TOKENS || 1024,
133
- temperature: options?.temperature ?? (process.env.OPENAI_MAX_TEMPERATURE || 0),
134
- stop: process.env.OPENAI_STOP ? JSON.parse(process.env.OPENAI_STOP) : undefined,
140
+ max_tokens: parseInt(process.env.OPENAI_MAX_TOKENS || '1024'),
141
+ temperature: options?.temperature ?? parseFloat(process.env.OPENAI_TEMPERATURE || '0'),
142
+ stop,
135
143
  };
136
144
  logger.debug(`Calling OpenAI API: ${JSON.stringify(body)}`);
137
145
  let data,
@@ -210,8 +218,8 @@ export class OpenAiChatCompletionProvider extends OpenAiGenericProvider {
210
218
  const body = {
211
219
  model: this.modelName,
212
220
  messages: messages,
213
- max_tokens: process.env.OPENAI_MAX_TOKENS || 1024,
214
- temperature: options?.temperature ?? (process.env.OPENAI_MAX_TEMPERATURE || 0),
221
+ max_tokens: parseInt(process.env.OPENAI_MAX_TOKENS || '1024'),
222
+ temperature: options?.temperature ?? parseFloat(process.env.OPENAI_TEMPERATURE || '0'),
215
223
  };
216
224
  logger.debug(`Calling OpenAI API: ${JSON.stringify(body)}`);
217
225
 
package/src/providers.ts CHANGED
@@ -1,9 +1,9 @@
1
1
  import path from 'node:path';
2
2
 
3
- import { ApiProvider } from './types.js';
3
+ import { ApiProvider } from './types';
4
4
 
5
- import { OpenAiCompletionProvider, OpenAiChatCompletionProvider } from './providers/openai.js';
6
- import { LocalAiCompletionProvider, LocalAiChatProvider } from './providers/localai.js';
5
+ import { OpenAiCompletionProvider, OpenAiChatCompletionProvider } from './providers/openai';
6
+ import { LocalAiCompletionProvider, LocalAiChatProvider } from './providers/localai';
7
7
 
8
8
  export async function loadApiProviders(providerPaths: string | string[]): Promise<ApiProvider[]> {
9
9
  if (typeof providerPaths === 'string') {
package/src/types.ts CHANGED
@@ -134,7 +134,7 @@ export interface TestCase {
134
134
  description?: string;
135
135
 
136
136
  // Key-value pairs to substitute in the prompt
137
- vars?: Record<string, string>;
137
+ vars?: Record<string, string | string[]>;
138
138
 
139
139
  // Optional list of automatic checks to run on the LLM output
140
140
  assert?: Assertion[];
@@ -143,6 +143,11 @@ export interface TestCase {
143
143
  options?: PromptConfig & GradingConfig;
144
144
  }
145
145
 
146
+ // Same as a TestCase, except the `vars` object has been flattened into its final form.
147
+ export interface AtomicTestCase extends TestCase {
148
+ vars?: Record<string, string>;
149
+ }
150
+
146
151
  // The test suite defines the "knobs" that we are tuning in prompt engineering: providers and prompts
147
152
  export interface TestSuite {
148
153
  // Optional description of what your LLM is trying to do
@@ -152,7 +157,7 @@ export interface TestSuite {
152
157
  providers: ApiProvider[];
153
158
 
154
159
  // One or more prompt strings
155
- prompts: string[];
160
+ prompts: Prompt[];
156
161
 
157
162
  // Test cases
158
163
  tests?: TestCase[];