promptfoo 0.9.0 → 0.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. package/README.md +50 -40
  2. package/dist/assertions.d.ts +2 -2
  3. package/dist/assertions.d.ts.map +1 -1
  4. package/dist/assertions.js +186 -44
  5. package/dist/assertions.js.map +1 -1
  6. package/dist/cache.js +9 -9
  7. package/dist/cache.js.map +1 -1
  8. package/dist/evaluator.d.ts +1 -1
  9. package/dist/evaluator.d.ts.map +1 -1
  10. package/dist/evaluator.js +30 -23
  11. package/dist/evaluator.js.map +1 -1
  12. package/dist/index.d.ts +10 -10
  13. package/dist/index.d.ts.map +1 -1
  14. package/dist/index.js +18 -14
  15. package/dist/index.js.map +1 -1
  16. package/dist/main.js +49 -44
  17. package/dist/main.js.map +1 -1
  18. package/dist/providers/localai.js +11 -11
  19. package/dist/providers/localai.js.map +1 -1
  20. package/dist/providers/openai.d.ts.map +1 -1
  21. package/dist/providers/openai.js +30 -21
  22. package/dist/providers/openai.js.map +1 -1
  23. package/dist/providers.d.ts +3 -3
  24. package/dist/providers.d.ts.map +1 -1
  25. package/dist/providers.js +15 -15
  26. package/dist/providers.js.map +1 -1
  27. package/dist/types.d.ts +7 -3
  28. package/dist/types.d.ts.map +1 -1
  29. package/dist/util.d.ts +4 -4
  30. package/dist/util.d.ts.map +1 -1
  31. package/dist/util.js +49 -18
  32. package/dist/util.js.map +1 -1
  33. package/dist/web/client/assets/index-15dfcd18.js +172 -0
  34. package/dist/web/client/assets/index-87905193.css +1 -0
  35. package/dist/web/client/index.html +2 -2
  36. package/dist/web/server.js +9 -9
  37. package/dist/web/server.js.map +1 -1
  38. package/package.json +3 -1
  39. package/src/assertions.ts +249 -38
  40. package/src/cache.ts +2 -2
  41. package/src/evaluator.ts +25 -18
  42. package/src/index.ts +13 -8
  43. package/src/main.ts +28 -15
  44. package/src/providers/localai.ts +3 -3
  45. package/src/providers/openai.ts +16 -8
  46. package/src/providers.ts +3 -3
  47. package/src/types.ts +24 -3
  48. package/src/util.ts +48 -17
  49. package/src/web/client/package-lock.json +5729 -0
  50. package/src/web/client/src/ResultsTable.css +35 -4
  51. package/src/web/client/src/ResultsTable.tsx +150 -70
  52. package/src/web/client/src/ResultsView.tsx +83 -18
  53. package/src/web/client/src/index.css +6 -0
  54. package/src/web/client/src/types.ts +2 -0
  55. package/src/web/server.ts +3 -3
  56. package/dist/web/client/assets/index-207192fc.css +0 -1
  57. package/dist/web/client/assets/index-8751749f.js +0 -172
@@ -0,0 +1 @@
1
+ :root{font-family:system-ui,Avenir,Helvetica,Arial,sans-serif;font-synthesis:none;text-rendering:optimizeLegibility;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale;-webkit-text-size-adjust:100%;--background-color: #ffffff;--text-color: #404040;--border-color: lightgray;--table-border-color: lightgray;--pass-color: green;--fail-color: #ad0000;--smalltext-color: gray;--success-background-color: #d1ffd7;--variable-background-color: #f7f7f7;--header-background-color: #fffdf7}@media (prefers-color-scheme: dark){:root{--background-color: #1a1a1a;--text-color: #f0f0f0;--border-color: #444444;--table-border-color: #444444;--pass-color: #4caf50;--fail-color: #f44336;--smalltext-color: #888888}}[data-theme=dark]{--background-color: #1a1a1a;--text-color: #f0f0f0;--border-color: #444444;--table-border-color: #444444;--pass-color: #4caf50;--fail-color: #f44336;--smalltext-color: #888888;--success-background-color: #216d2b;--variable-background-color: #333;--header-background-color: #333}html{font-size:calc(14px + (18 - 14) * ((100vw - 300px) / (1600 - 300)))}*{box-sizing:border-box}html{font-family:-apple-system,BlinkMacSystemFont,Segoe UI,Helvetica,Arial,sans-serif,"Apple Color Emoji","Segoe UI Emoji",Segoe UI Symbol;font-size:16px;background-color:var(--background-color);color:var(--text-color)}table,.divTable{border:1px solid var(--table-border-color);border-collapse:collapse;width:100%;margin:1rem 0;box-shadow:0 2px 4px #0000001a}.tr{display:flex}tr,.tr{width:fit-content}tr:hover,.tr:hover{background-color:#0000000d}th,.th,td,.td{position:relative;box-shadow:inset 0 0 0 1px var(--border-color);vertical-align:top;padding:1.5rem}th.variable,.th.variable,td.variable,.td.variable{background-color:var(--variable-background-color)}tr.header{background-color:var(--header-background-color)}th,.th{padding:1rem;position:relative;text-align:center;vertical-align:bottom}tr 
.cell-rating{visibility:hidden;position:absolute;bottom:1.25rem;right:0;line-height:0;font-size:1.75rem}tr:hover .cell-rating{visibility:visible}tr .cell-rating .rating{cursor:pointer}tr .cell-rating .rating:first-child{margin-right:.5rem}th .smalltext{visibility:hidden;font-weight:400;font-size:.75rem;color:var(--smalltext-color)}th:hover .smalltext{visibility:visible}th .summary{font-weight:400;font-size:.8rem;padding:.25rem}th .summary.highlight{background-color:var(--success-background-color)}td .status{margin-bottom:.5rem;font-weight:700}td .pass{color:var(--pass-color)}td .fail{color:var(--fail-color)}.first-prompt-col{border-left:2px solid #888}.first-prompt-row{border-top:2px solid #888}.resizer{position:absolute;right:0;top:0;height:100%;width:5px;cursor:col-resize;user-select:none;touch-action:none;background:var(--text-color);opacity:.5}.resizer.isResizing{background:var(--text-color);opacity:1}@media (hover: hover){.resizer{opacity:0}*:hover>.resizer{opacity:1}}.logo{display:flex;align-items:center;gap:4px}.logo img{width:30px}.logo span{margin-bottom:6px;color:var(--text-color)}[data-theme=dark] .logo img{filter:invert(1)}nav{display:flex;justify-content:space-between;align-items:center;margin-bottom:1rem;color:var(--text-color)}.dark-mode-toggle{background-color:transparent;border:none;color:var(--text-color);cursor:pointer;font-size:16px;padding:8px;transition:color .3s}.dark-mode-toggle:hover{color:var(--pass-color)}body{background-color:var(--background-color);color:var(--text-color)}
@@ -5,8 +5,8 @@
5
5
  <link rel="icon" type="image/svg+xml" href="favicon.ico" />
6
6
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
7
7
  <title>promptfoo web viewer</title>
8
- <script type="module" crossorigin src="/assets/index-8751749f.js"></script>
9
- <link rel="stylesheet" href="/assets/index-207192fc.css">
8
+ <script type="module" crossorigin src="/assets/index-15dfcd18.js"></script>
9
+ <link rel="stylesheet" href="/assets/index-87905193.css">
10
10
  </head>
11
11
  <body>
12
12
  <div id="root"></div>
@@ -13,12 +13,12 @@ const express_1 = __importDefault(require("express"));
13
13
  const cors_1 = __importDefault(require("cors"));
14
14
  const opener_1 = __importDefault(require("opener"));
15
15
  const socket_io_1 = require("socket.io");
16
- const logger_js_1 = __importDefault(require("../logger.js"));
17
- const esm_js_1 = require("../esm.js");
18
- const util_js_1 = require("../util.js");
16
+ const logger_1 = __importDefault(require("../logger"));
17
+ const esm_1 = require("../esm");
18
+ const util_1 = require("../util");
19
19
  function init(port = 15500) {
20
20
  const app = (0, express_1.default)();
21
- const staticDir = node_path_1.default.join((0, esm_js_1.getDirectory)(), 'web', 'client');
21
+ const staticDir = node_path_1.default.join((0, esm_1.getDirectory)(), 'web', 'client');
22
22
  app.use((0, cors_1.default)());
23
23
  app.use(express_1.default.json());
24
24
  app.use(express_1.default.static(staticDir));
@@ -28,7 +28,7 @@ function init(port = 15500) {
28
28
  origin: '*',
29
29
  },
30
30
  });
31
- const latestJsonPath = (0, util_js_1.getLatestResultsPath)();
31
+ const latestJsonPath = (0, util_1.getLatestResultsPath)();
32
32
  const readLatestJson = () => {
33
33
  const data = fs_1.default.readFileSync(latestJsonPath, 'utf8');
34
34
  const jsonData = JSON.parse(data);
@@ -46,7 +46,7 @@ function init(port = 15500) {
46
46
  });
47
47
  httpServer.listen(port, () => {
48
48
  const url = `http://localhost:${port}`;
49
- logger_js_1.default.info(`Server listening at ${url}`);
49
+ logger_1.default.info(`Server listening at ${url}`);
50
50
  const rl = node_readline_1.default.createInterface({
51
51
  input: process.stdin,
52
52
  output: process.stdout,
@@ -55,14 +55,14 @@ function init(port = 15500) {
55
55
  if (answer.toLowerCase().startsWith('y')) {
56
56
  try {
57
57
  await (0, opener_1.default)(url);
58
- logger_js_1.default.info(`Opening browser to: ${url}`);
58
+ logger_1.default.info(`Opening browser to: ${url}`);
59
59
  }
60
60
  catch (err) {
61
- logger_js_1.default.error(`Failed to open browser: ${String(err)}`);
61
+ logger_1.default.error(`Failed to open browser: ${String(err)}`);
62
62
  }
63
63
  }
64
64
  rl.close();
65
- logger_js_1.default.info('Press Ctrl+C to stop the server');
65
+ logger_1.default.info('Press Ctrl+C to stop the server');
66
66
  });
67
67
  });
68
68
  }
@@ -1 +1 @@
1
- {"version":3,"file":"server.js","sourceRoot":"","sources":["../../src/web/server.ts"],"names":[],"mappings":";;;;;;AAAA,4CAAoB;AACpB,0DAA6B;AAC7B,kEAAqC;AACrC,0DAA6B;AAE7B,wDAAgC;AAChC,sDAA8B;AAC9B,gDAAwB;AACxB,oDAA4B;AAC5B,yCAAqD;AAGrD,6DAAkC;AAClC,sCAAyC;AACzC,wCAAkD;AAIlD,SAAgB,IAAI,CAAC,IAAI,GAAG,KAAK;IAC/B,MAAM,GAAG,GAAG,IAAA,iBAAO,GAAE,CAAC;IAEtB,MAAM,SAAS,GAAG,mBAAI,CAAC,IAAI,CAAC,IAAA,qBAAY,GAAE,EAAE,KAAK,EAAE,QAAQ,CAAC,CAAC;IAE7D,GAAG,CAAC,GAAG,CAAC,IAAA,cAAI,GAAE,CAAC,CAAC;IAChB,GAAG,CAAC,GAAG,CAAC,iBAAO,CAAC,IAAI,EAAE,CAAC,CAAC;IACxB,GAAG,CAAC,GAAG,CAAC,iBAAO,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,CAAC;IAEnC,MAAM,UAAU,GAAG,mBAAI,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;IAC1C,MAAM,EAAE,GAAG,IAAI,kBAAc,CAAC,UAAU,EAAE;QACxC,IAAI,EAAE;YACJ,MAAM,EAAE,GAAG;SACZ;KACF,CAAC,CAAC;IAEH,MAAM,cAAc,GAAG,IAAA,8BAAoB,GAAE,CAAC;IAC9C,MAAM,cAAc,GAAG,GAAG,EAAE;QAC1B,MAAM,IAAI,GAAG,YAAE,CAAC,YAAY,CAAC,cAAc,EAAE,MAAM,CAAC,CAAC;QACrD,MAAM,QAAQ,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAClC,OAAO,QAAQ,CAAC,KAAK,CAAC;IACxB,CAAC,CAAC;IAEF,EAAE,CAAC,EAAE,CAAC,YAAY,EAAE,CAAC,MAAM,EAAE,EAAE;QAC7B,qDAAqD;QACrD,MAAM,CAAC,IAAI,CAAC,MAAM,EAAE,EAAE,KAAK,EAAE,cAAc,EAAE,EAAE,CAAC,CAAC;QAEjD,6DAA6D;QAC7D,YAAE,CAAC,KAAK,CACN,cAAc,EACd,IAAA,kBAAQ,EAAC,CAAC,KAAa,EAAE,EAAE;YACzB,IAAI,KAAK,KAAK,QAAQ,EAAE;gBACtB,MAAM,CAAC,IAAI,CAAC,QAAQ,EAAE,EAAE,KAAK,EAAE,cAAc,EAAE,EAAE,CAAC,CAAC;aACpD;QACH,CAAC,EAAE,GAAG,CAAC,CACR,CAAC;IACJ,CAAC,CAAC,CAAC;IAEH,UAAU,CAAC,MAAM,CAAC,IAAI,EAAE,GAAG,EAAE;QAC3B,MAAM,GAAG,GAAG,oBAAoB,IAAI,EAAE,CAAC;QACvC,mBAAM,CAAC,IAAI,CAAC,uBAAuB,GAAG,EAAE,CAAC,CAAC;QAE1C,MAAM,EAAE,GAAG,uBAAQ,CAAC,eAAe,CAAC;YAClC,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,MAAM,EAAE,OAAO,CAAC,MAAM;SACvB,CAAC,CAAC;QACH,EAAE,CAAC,QAAQ,CAAC,qDAAqD,EAAE,KAAK,EAAE,MAAM,EAAE,EAAE;YAClF,IAAI,MAAM,CAAC,WAAW,EAAE,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE;gBACxC,IAAI;oBACF,MAAM,IAAA,gBAAM,EAAC,GAAG,CAAC,CAAC;oBAClB,mBAAM,CAAC,IAAI,CAAC,uBAAuB,GAAG,EAAE,CAAC,CAAC;iBAC3C;gBAAC,OAAO,GAAG,EAAE;oBACZ,mBAAM,CAAC,KAAK,CAAC,2BAA2B,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,
CAAC;iBACxD;aACF;YACD,EAAE,CAAC,KAAK,EAAE,CAAC;YACX,mBAAM,CAAC,IAAI,CAAC,iCAAiC,CAAC,CAAC;QACjD,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC;AA3DD,oBA2DC"}
1
+ {"version":3,"file":"server.js","sourceRoot":"","sources":["../../src/web/server.ts"],"names":[],"mappings":";;;;;;AAAA,4CAAoB;AACpB,0DAA6B;AAC7B,kEAAqC;AACrC,0DAA6B;AAE7B,wDAAgC;AAChC,sDAA8B;AAC9B,gDAAwB;AACxB,oDAA4B;AAC5B,yCAAqD;AAGrD,uDAA+B;AAC/B,gCAAsC;AACtC,kCAA+C;AAI/C,SAAgB,IAAI,CAAC,IAAI,GAAG,KAAK;IAC/B,MAAM,GAAG,GAAG,IAAA,iBAAO,GAAE,CAAC;IAEtB,MAAM,SAAS,GAAG,mBAAI,CAAC,IAAI,CAAC,IAAA,kBAAY,GAAE,EAAE,KAAK,EAAE,QAAQ,CAAC,CAAC;IAE7D,GAAG,CAAC,GAAG,CAAC,IAAA,cAAI,GAAE,CAAC,CAAC;IAChB,GAAG,CAAC,GAAG,CAAC,iBAAO,CAAC,IAAI,EAAE,CAAC,CAAC;IACxB,GAAG,CAAC,GAAG,CAAC,iBAAO,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,CAAC;IAEnC,MAAM,UAAU,GAAG,mBAAI,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;IAC1C,MAAM,EAAE,GAAG,IAAI,kBAAc,CAAC,UAAU,EAAE;QACxC,IAAI,EAAE;YACJ,MAAM,EAAE,GAAG;SACZ;KACF,CAAC,CAAC;IAEH,MAAM,cAAc,GAAG,IAAA,2BAAoB,GAAE,CAAC;IAC9C,MAAM,cAAc,GAAG,GAAG,EAAE;QAC1B,MAAM,IAAI,GAAG,YAAE,CAAC,YAAY,CAAC,cAAc,EAAE,MAAM,CAAC,CAAC;QACrD,MAAM,QAAQ,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAClC,OAAO,QAAQ,CAAC,KAAK,CAAC;IACxB,CAAC,CAAC;IAEF,EAAE,CAAC,EAAE,CAAC,YAAY,EAAE,CAAC,MAAM,EAAE,EAAE;QAC7B,qDAAqD;QACrD,MAAM,CAAC,IAAI,CAAC,MAAM,EAAE,EAAE,KAAK,EAAE,cAAc,EAAE,EAAE,CAAC,CAAC;QAEjD,6DAA6D;QAC7D,YAAE,CAAC,KAAK,CACN,cAAc,EACd,IAAA,kBAAQ,EAAC,CAAC,KAAa,EAAE,EAAE;YACzB,IAAI,KAAK,KAAK,QAAQ,EAAE;gBACtB,MAAM,CAAC,IAAI,CAAC,QAAQ,EAAE,EAAE,KAAK,EAAE,cAAc,EAAE,EAAE,CAAC,CAAC;aACpD;QACH,CAAC,EAAE,GAAG,CAAC,CACR,CAAC;IACJ,CAAC,CAAC,CAAC;IAEH,UAAU,CAAC,MAAM,CAAC,IAAI,EAAE,GAAG,EAAE;QAC3B,MAAM,GAAG,GAAG,oBAAoB,IAAI,EAAE,CAAC;QACvC,gBAAM,CAAC,IAAI,CAAC,uBAAuB,GAAG,EAAE,CAAC,CAAC;QAE1C,MAAM,EAAE,GAAG,uBAAQ,CAAC,eAAe,CAAC;YAClC,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,MAAM,EAAE,OAAO,CAAC,MAAM;SACvB,CAAC,CAAC;QACH,EAAE,CAAC,QAAQ,CAAC,qDAAqD,EAAE,KAAK,EAAE,MAAM,EAAE,EAAE;YAClF,IAAI,MAAM,CAAC,WAAW,EAAE,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE;gBACxC,IAAI;oBACF,MAAM,IAAA,gBAAM,EAAC,GAAG,CAAC,CAAC;oBAClB,gBAAM,CAAC,IAAI,CAAC,uBAAuB,GAAG,EAAE,CAAC,CAAC;iBAC3C;gBAAC,OAAO,GAAG,EAAE;oBACZ,gBAAM,CAAC,KAAK,CAAC,2BAA2B,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,
CAAC;iBACxD;aACF;YACD,EAAE,CAAC,KAAK,EAAE,CAAC;YACX,gBAAM,CAAC,IAAI,CAAC,iCAAiC,CAAC,CAAC;QACjD,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC;AA3DD,oBA2DC"}
package/package.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "name": "promptfoo",
3
3
  "description": "Prompt engineering toolkit",
4
4
  "author": "Ian Webster",
5
- "version": "0.9.0",
5
+ "version": "0.11.0",
6
6
  "license": "MIT",
7
7
  "type": "commonjs",
8
8
  "main": "dist/index.js",
@@ -59,6 +59,7 @@
59
59
  "typescript": "^5.0.4"
60
60
  },
61
61
  "dependencies": {
62
+ "@apidevtools/json-schema-ref-parser": "^10.1.0",
62
63
  "async": "^3.2.4",
63
64
  "cache-manager": "^4.1.0",
64
65
  "cache-manager-fs-hash": "^1.0.0",
@@ -76,6 +77,7 @@
76
77
  "node-fetch": "^2.6.7",
77
78
  "nunjucks": "^3.2.4",
78
79
  "opener": "^1.5.2",
80
+ "rouge": "^1.0.3",
79
81
  "socket.io": "^4.6.1",
80
82
  "tiny-invariant": "^1.3.1",
81
83
  "winston": "^3.8.2"
package/src/assertions.ts CHANGED
@@ -1,17 +1,49 @@
1
+ import rouge from 'rouge';
1
2
  import invariant from 'tiny-invariant';
2
3
  import nunjucks from 'nunjucks';
3
4
 
4
- import { DefaultEmbeddingProvider, DefaultGradingProvider } from './providers/openai.js';
5
- import { cosineSimilarity } from './util.js';
6
- import { loadApiProvider } from './providers.js';
7
- import { DEFAULT_GRADING_PROMPT } from './prompts.js';
5
+ import { DefaultEmbeddingProvider, DefaultGradingProvider } from './providers/openai';
6
+ import { cosineSimilarity, fetchWithTimeout } from './util';
7
+ import { loadApiProvider } from './providers';
8
+ import { DEFAULT_GRADING_PROMPT } from './prompts';
8
9
 
9
- import type { Assertion, GradingConfig, TestCase, GradingResult, AtomicTestCase } from './types.js';
10
+ import type {
11
+ Assertion,
12
+ AssertionType,
13
+ GradingConfig,
14
+ GradingResult,
15
+ AtomicTestCase,
16
+ } from './types';
10
17
 
11
18
  const SIMILAR_REGEX = /similar(?::|\((\d+(\.\d+)?)\):)/;
12
19
 
13
20
  const DEFAULT_SEMANTIC_SIMILARITY_THRESHOLD = 0.8;
14
21
 
22
+ function handleRougeScore(
23
+ baseType: 'rouge-n',
24
+ assertion: Assertion,
25
+ expected: string | string[],
26
+ output: string,
27
+ inverted: boolean,
28
+ ): GradingResult {
29
+ const fnName = baseType[baseType.length - 1] as 'n' | 'l' | 's';
30
+ const rougeMethod = rouge[fnName];
31
+ const score = rougeMethod(output, expected);
32
+ console.log(output, expected, score);
33
+ const pass = score >= (assertion.threshold || 0.75) != inverted;
34
+
35
+ return {
36
+ pass,
37
+ reason: pass
38
+ ? `${baseType.toUpperCase()} score ${score} is greater than or equal to threshold ${
39
+ assertion.threshold || 0.75
40
+ }`
41
+ : `${baseType.toUpperCase()} score ${score} is less than threshold ${
42
+ assertion.threshold || 0.75
43
+ }`,
44
+ };
45
+ }
46
+
15
47
  export async function runAssertions(test: AtomicTestCase, output: string): Promise<GradingResult> {
16
48
  const tokensUsed = {
17
49
  total: 0,
@@ -46,7 +78,12 @@ export async function runAssertion(
46
78
  ): Promise<GradingResult> {
47
79
  let pass: boolean = false;
48
80
 
49
- if (assertion.type === 'equals') {
81
+ invariant(assertion.type, `Assertion must have a type: ${JSON.stringify(assertion)}`);
82
+
83
+ const inverse = assertion.type.startsWith('not-');
84
+ const baseType = inverse ? assertion.type.slice(4) : assertion.type;
85
+
86
+ if (baseType === 'equals') {
50
87
  pass = assertion.value === output;
51
88
  return {
52
89
  pass,
@@ -54,52 +91,194 @@ export async function runAssertion(
54
91
  };
55
92
  }
56
93
 
57
- if (assertion.type === 'is-json') {
94
+ if (baseType === 'is-json') {
58
95
  try {
59
96
  JSON.parse(output);
60
- return { pass: true, reason: 'Assertion passed' };
97
+ pass = !inverse;
61
98
  } catch (err) {
62
- return {
63
- pass: false,
64
- reason: `Expected output to be valid JSON, but it isn't.\nError: ${err}`,
65
- };
99
+ pass = inverse;
66
100
  }
101
+ return { pass, reason: pass ? 'Assertion passed' : 'Expected output to be valid JSON' };
102
+ }
103
+
104
+ if (baseType === 'contains') {
105
+ invariant(assertion.value, '"contains" assertion type must have a string value');
106
+ invariant(
107
+ typeof assertion.value === 'string',
108
+ '"contains" assertion type must have a string value',
109
+ );
110
+ pass = output.includes(assertion.value) !== inverse;
111
+ return {
112
+ pass,
113
+ reason: pass
114
+ ? 'Assertion passed'
115
+ : `Expected output to ${inverse ? 'not ' : ''}contain "${assertion.value}"`,
116
+ };
117
+ }
118
+
119
+ if (baseType === 'contains-any') {
120
+ invariant(assertion.value, '"contains-any" assertion type must have a value');
121
+ invariant(
122
+ Array.isArray(assertion.value),
123
+ '"contains-any" assertion type must have an array value',
124
+ );
125
+ pass = assertion.value.some((value) => output.includes(value)) !== inverse;
126
+ return {
127
+ pass,
128
+ reason: pass
129
+ ? 'Assertion passed'
130
+ : `Expected output to ${inverse ? 'not ' : ''}contain one of "${assertion.value.join(
131
+ ', ',
132
+ )}"`,
133
+ };
67
134
  }
68
135
 
69
- if (assertion.type === 'contains-json') {
70
- const pass = containsJSON(output);
136
+ if (baseType === 'contains-all') {
137
+ invariant(assertion.value, '"contains-all" assertion type must have a value');
138
+ invariant(
139
+ Array.isArray(assertion.value),
140
+ '"contains-all" assertion type must have an array value',
141
+ );
142
+ pass = assertion.value.every((value) => output.includes(value)) !== inverse;
143
+ return {
144
+ pass,
145
+ reason: pass
146
+ ? 'Assertion passed'
147
+ : `Expected output to ${inverse ? 'not ' : ''}contain all of "${assertion.value.join(
148
+ ', ',
149
+ )}"`,
150
+ };
151
+ }
152
+
153
+ if (baseType === 'regex') {
154
+ invariant(assertion.value, '"regex" assertion type must have a string value');
155
+ invariant(
156
+ typeof assertion.value === 'string',
157
+ '"contains" assertion type must have a string value',
158
+ );
159
+ const regex = new RegExp(assertion.value);
160
+ pass = regex.test(output) !== inverse;
161
+ return {
162
+ pass,
163
+ reason: pass
164
+ ? 'Assertion passed'
165
+ : `Expected output to ${inverse ? 'not ' : ''}match regex "${assertion.value}"`,
166
+ };
167
+ }
168
+
169
+ if (baseType === 'icontains') {
170
+ invariant(assertion.value, '"icontains" assertion type must have a string value');
171
+ invariant(
172
+ typeof assertion.value === 'string',
173
+ '"icontains" assertion type must have a string value',
174
+ );
175
+ pass = output.toLowerCase().includes(assertion.value.toLowerCase()) !== inverse;
176
+ return {
177
+ pass,
178
+ reason: pass
179
+ ? 'Assertion passed'
180
+ : `Expected output to ${inverse ? 'not ' : ''}contain "${assertion.value}"`,
181
+ };
182
+ }
183
+
184
+ if (baseType === 'contains-json') {
185
+ pass = containsJSON(output) !== inverse;
71
186
  return {
72
187
  pass,
73
- reason: pass ? 'Assertion passed' : 'Expected output to contain valid JSON',
188
+ reason: pass
189
+ ? 'Assertion passed'
190
+ : `Expected output to ${inverse ? 'not ' : ''}contain valid JSON`,
74
191
  };
75
192
  }
76
193
 
77
- if (assertion.type === 'javascript') {
194
+ if (baseType === 'javascript') {
78
195
  try {
79
- const customFunction = new Function('output', `return ${assertion.value}`);
80
- pass = customFunction(output);
196
+ const customFunction = new Function('output', 'context', `return ${assertion.value}`);
197
+ const context = {
198
+ vars: test.vars || {},
199
+ };
200
+ pass = customFunction(output, context) !== inverse;
81
201
  } catch (err) {
82
202
  return {
83
203
  pass: false,
84
- reason: `Custom function threw error: ${(err as Error).message}`,
204
+ reason: `Custom function threw error: ${(err as Error).message}
205
+ ${assertion.value}`,
85
206
  };
86
207
  }
87
208
  return {
88
209
  pass,
89
- reason: pass ? 'Assertion passed' : `Custom function returned false`,
210
+ reason: pass
211
+ ? 'Assertion passed'
212
+ : `Custom function returned ${inverse ? 'true' : 'false'}
213
+ ${assertion.value}`,
90
214
  };
91
215
  }
92
216
 
93
- if (assertion.type === 'similar') {
217
+ if (baseType === 'similar') {
94
218
  invariant(assertion.value, 'Similarity assertion must have a string value');
95
- return matchesSimilarity(assertion.value, output, assertion.threshold || 0.75);
219
+ invariant(
220
+ typeof assertion.value === 'string',
221
+ '"contains" assertion type must have a string value',
222
+ );
223
+ return matchesSimilarity(assertion.value, output, assertion.threshold || 0.75, inverse);
96
224
  }
97
225
 
98
- if (assertion.type === 'llm-rubric') {
226
+ if (baseType === 'llm-rubric') {
99
227
  invariant(assertion.value, 'Similarity assertion must have a string value');
228
+ invariant(
229
+ typeof assertion.value === 'string',
230
+ '"contains" assertion type must have a string value',
231
+ );
100
232
  return matchesLlmRubric(assertion.value, output, test.options);
101
233
  }
102
234
 
235
+ if (baseType === 'webhook') {
236
+ invariant(assertion.value, '"webhook" assertion type must have a URL value');
237
+ invariant(
238
+ typeof assertion.value === 'string',
239
+ '"webhook" assertion type must have a URL value',
240
+ );
241
+
242
+ try {
243
+ const context = {
244
+ vars: test.vars || {},
245
+ };
246
+ const response = await fetchWithTimeout(
247
+ assertion.value,
248
+ {
249
+ method: 'POST',
250
+ headers: {
251
+ 'Content-Type': 'application/json',
252
+ },
253
+ body: JSON.stringify({ output, context }),
254
+ },
255
+ process.env.WEBHOOK_TIMEOUT ? parseInt(process.env.WEBHOOK_TIMEOUT, 10) : 5000,
256
+ );
257
+
258
+ if (!response.ok) {
259
+ throw new Error(`Webhook response status: ${response.status}`);
260
+ }
261
+
262
+ const jsonResponse = await response.json();
263
+ pass = jsonResponse.pass !== inverse;
264
+ } catch (err) {
265
+ return {
266
+ pass: false,
267
+ reason: `Webhook error: ${(err as Error).message}`,
268
+ };
269
+ }
270
+
271
+ return {
272
+ pass,
273
+ reason: pass ? 'Assertion passed' : `Webhook returned ${inverse ? 'true' : 'false'}`,
274
+ };
275
+ }
276
+
277
+ if (baseType === 'rouge-n') {
278
+ invariant(assertion.value, '"rouge" assertion type must a value (string or string array)');
279
+ return handleRougeScore(baseType, assertion, assertion.value, output, inverse);
280
+ }
281
+
103
282
  throw new Error('Unknown assertion type: ' + assertion.type);
104
283
  }
105
284
 
@@ -125,6 +304,7 @@ export async function matchesSimilarity(
125
304
  expected: string,
126
305
  output: string,
127
306
  threshold: number,
307
+ inverse: boolean = false,
128
308
  ): Promise<GradingResult> {
129
309
  const expectedEmbedding = await DefaultEmbeddingProvider.callEmbeddingApi(expected);
130
310
  const outputEmbedding = await DefaultEmbeddingProvider.callEmbeddingApi(output);
@@ -155,16 +335,19 @@ export async function matchesSimilarity(
155
335
  }
156
336
 
157
337
  const similarity = cosineSimilarity(expectedEmbedding.embedding, outputEmbedding.embedding);
158
- if (similarity < threshold) {
338
+ const pass = inverse ? similarity <= threshold : similarity >= threshold;
339
+ const greaterThanReason = `Similarity ${similarity} is greater than threshold ${threshold}`;
340
+ const lessThanReason = `Similarity ${similarity} is less than threshold ${threshold}`;
341
+ if (pass) {
159
342
  return {
160
- pass: false,
161
- reason: `Similarity ${similarity} is less than threshold ${threshold}`,
343
+ pass: true,
344
+ reason: inverse ? lessThanReason : greaterThanReason,
162
345
  tokensUsed,
163
346
  };
164
347
  }
165
348
  return {
166
- pass: true,
167
- reason: `Similarity ${similarity} is greater than threshold ${threshold}`,
349
+ pass: false,
350
+ reason: inverse ? greaterThanReason : lessThanReason,
168
351
  tokensUsed,
169
352
  };
170
353
  }
@@ -224,16 +407,7 @@ export async function matchesLlmRubric(
224
407
  }
225
408
 
226
409
  export function assertionFromString(expected: string): Assertion {
227
- const match = expected.match(SIMILAR_REGEX);
228
- if (match) {
229
- const threshold = parseFloat(match[1]) || DEFAULT_SEMANTIC_SIMILARITY_THRESHOLD;
230
- const rest = expected.replace(SIMILAR_REGEX, '').trim();
231
- return {
232
- type: 'similar',
233
- value: rest,
234
- threshold,
235
- };
236
- }
410
+ // Legacy options
237
411
  if (expected.startsWith('fn:') || expected.startsWith('eval:')) {
238
412
  // TODO(1.0): delete eval: legacy option
239
413
  const sliceLength = expected.startsWith('fn:') ? 'fn:'.length : 'eval:'.length;
@@ -249,11 +423,48 @@ export function assertionFromString(expected: string): Assertion {
249
423
  value: expected.slice(6),
250
424
  };
251
425
  }
426
+
427
+ // New options
428
+ const assertionRegex =
429
+ /^(not-)?(equals|contains|contains-any|contains-all|regex|icontains):(.+)$/;
430
+ const regexMatch = expected.match(assertionRegex);
431
+
432
+ if (regexMatch) {
433
+ const [_, notPrefix, type, value] = regexMatch;
434
+ const fullType = notPrefix ? `not-${type}` : type;
435
+
436
+ if (type === 'contains-any' || type === 'contains-all') {
437
+ return {
438
+ type: fullType as AssertionType,
439
+ value: value.split(',').map((s) => s.trim()),
440
+ };
441
+ } else {
442
+ return {
443
+ type: fullType as AssertionType,
444
+ value,
445
+ };
446
+ }
447
+ }
448
+
449
+ // Options that require some special handling
450
+ const match = expected.match(SIMILAR_REGEX);
451
+ if (match) {
452
+ const threshold = parseFloat(match[1]) || DEFAULT_SEMANTIC_SIMILARITY_THRESHOLD;
453
+ const rest = expected.replace(SIMILAR_REGEX, '').trim();
454
+ return {
455
+ type: 'similar',
456
+ value: rest,
457
+ threshold,
458
+ };
459
+ }
460
+
252
461
  if (expected === 'is-json' || expected === 'contains-json') {
253
462
  return {
254
463
  type: expected,
255
464
  };
256
465
  }
466
+
467
+ // Default to equality
257
468
  return {
258
469
  type: 'equals',
259
470
  value: expected,
package/src/cache.ts CHANGED
@@ -4,8 +4,8 @@ import path from 'node:path';
4
4
  import cacheManager from 'cache-manager';
5
5
  import fsStore from 'cache-manager-fs-hash';
6
6
 
7
- import logger from './logger.js';
8
- import { getConfigDirectoryPath, fetchWithTimeout } from './util.js';
7
+ import logger from './logger';
8
+ import { getConfigDirectoryPath, fetchWithTimeout } from './util';
9
9
 
10
10
  import type { Cache } from 'cache-manager';
11
11
  import type { RequestInfo, RequestInit } from 'node-fetch';
package/src/evaluator.ts CHANGED
@@ -4,8 +4,8 @@ import async from 'async';
4
4
  import chalk from 'chalk';
5
5
  import nunjucks from 'nunjucks';
6
6
 
7
- import logger from './logger.js';
8
- import { runAssertions } from './assertions.js';
7
+ import logger from './logger';
8
+ import { runAssertions } from './assertions';
9
9
 
10
10
  import type { SingleBar } from 'cli-progress';
11
11
  import type {
@@ -19,12 +19,12 @@ import type {
19
19
  Prompt,
20
20
  TestCase,
21
21
  AtomicTestCase,
22
- } from './types.js';
23
- import { generatePrompts } from './suggestions.js';
22
+ } from './types';
23
+ import { generatePrompts } from './suggestions';
24
24
 
25
25
  interface RunEvalOptions {
26
26
  provider: ApiProvider;
27
- prompt: string;
27
+ prompt: Prompt;
28
28
 
29
29
  test: AtomicTestCase;
30
30
 
@@ -86,10 +86,13 @@ class Evaluator {
86
86
  includeProviderId,
87
87
  }: RunEvalOptions): Promise<EvaluateResult> {
88
88
  const vars = test.vars || {};
89
- const renderedPrompt = nunjucks.renderString(prompt, vars);
89
+ const renderedPrompt = nunjucks.renderString(prompt.raw, vars);
90
90
 
91
91
  // Note that we're using original prompt, not renderedPrompt
92
- const promptDisplay = includeProviderId ? `[${provider.id()}] ${prompt}` : prompt;
92
+ let promptDisplay = prompt.display;
93
+ if (includeProviderId) {
94
+ promptDisplay = `[${provider.id()}] ${promptDisplay}`;
95
+ }
93
96
 
94
97
  const setup = {
95
98
  prompt: {
@@ -142,7 +145,7 @@ class Evaluator {
142
145
  } catch (err) {
143
146
  return {
144
147
  ...setup,
145
- error: String(err),
148
+ error: String(err) + '\n\n' + (err as Error).stack,
146
149
  success: false,
147
150
  };
148
151
  }
@@ -155,7 +158,7 @@ class Evaluator {
155
158
  if (options.generateSuggestions) {
156
159
  // TODO(ian): Move this into its own command/file
157
160
  logger.info(`Generating prompt variations...`);
158
- const { prompts: newPrompts, error } = await generatePrompts(testSuite.prompts[0], 1);
161
+ const { prompts: newPrompts, error } = await generatePrompts(testSuite.prompts[0].raw, 1);
159
162
  if (error || !newPrompts) {
160
163
  throw new Error(`Failed to generate prompts: ${error}`);
161
164
  }
@@ -178,7 +181,7 @@ class Evaluator {
178
181
  async (answer) => {
179
182
  rl.close();
180
183
  if (answer.toLowerCase().startsWith('y')) {
181
- testSuite.prompts.push(prompt);
184
+ testSuite.prompts.push({ raw: prompt, display: prompt });
182
185
  numAdded++;
183
186
  } else {
184
187
  logger.info('Skipping this prompt.');
@@ -196,13 +199,13 @@ class Evaluator {
196
199
  }
197
200
 
198
201
  // Split prompts by provider
199
- for (const promptContent of testSuite.prompts) {
202
+ for (const prompt of testSuite.prompts) {
200
203
  for (const provider of testSuite.providers) {
201
- const display =
202
- testSuite.providers.length > 1 ? `[${provider.id()}] ${promptContent}` : promptContent;
204
+ const updatedDisplay =
205
+ testSuite.providers.length > 1 ? `[${provider.id()}] ${prompt.display}` : prompt.display;
203
206
  prompts.push({
204
- raw: promptContent,
205
- display,
207
+ ...prompt,
208
+ display: updatedDisplay,
206
209
  });
207
210
  }
208
211
  }
@@ -248,6 +251,7 @@ class Evaluator {
248
251
  // And progress bar...
249
252
  let progressbar: SingleBar | undefined;
250
253
  if (options.showProgressBar) {
254
+ // FIXME(ian): Add var combinations too
251
255
  const totalNumRuns =
252
256
  testSuite.prompts.length * testSuite.providers.length * (tests.length || 1);
253
257
  const cliProgress = await import('cli-progress');
@@ -284,11 +288,14 @@ class Evaluator {
284
288
  const varCombinations = generateVarCombinations(testCase.vars || {});
285
289
  for (const vars of varCombinations) {
286
290
  let colIndex = 0;
287
- for (const promptContent of testSuite.prompts) {
291
+ for (const prompt of testSuite.prompts) {
288
292
  for (const provider of testSuite.providers) {
289
293
  runEvalOptions.push({
290
294
  provider,
291
- prompt: prependToPrompt + promptContent + appendToPrompt,
295
+ prompt: {
296
+ ...prompt,
297
+ raw: prependToPrompt + prompt.raw + appendToPrompt,
298
+ },
292
299
  test: { ...testCase, vars },
293
300
  includeProviderId: testSuite.providers.length > 1,
294
301
  rowIndex,
@@ -314,7 +321,7 @@ class Evaluator {
314
321
  if (progressbar) {
315
322
  progressbar.increment({
316
323
  provider: options.provider.id(),
317
- prompt: options.prompt.slice(0, 10),
324
+ prompt: options.prompt.raw.slice(0, 10),
318
325
  vars: Object.entries(options.test.vars || {})
319
326
  .map(([k, v]) => `${k}=${v}`)
320
327
  .join(' ')
package/src/index.ts CHANGED
@@ -1,12 +1,12 @@
1
- import { evaluate as doEvaluate } from './evaluator.js';
2
- import { loadApiProviders } from './providers.js';
3
- import assertions from './assertions.js';
4
- import providers from './providers.js';
1
+ import { evaluate as doEvaluate } from './evaluator';
2
+ import { loadApiProviders } from './providers';
3
+ import assertions from './assertions';
4
+ import providers from './providers';
5
5
 
6
- import type { EvaluateOptions, TestSuite, TestSuiteConfig } from './types.js';
7
- import { readTests } from './util.js';
6
+ import type { EvaluateOptions, TestSuite, TestSuiteConfig } from './types';
7
+ import { readTests } from './util';
8
8
 
9
- export * from './types.js';
9
+ export * from './types';
10
10
 
11
11
  interface EvaluateTestSuite extends TestSuiteConfig {
12
12
  prompts: string[];
@@ -15,9 +15,14 @@ interface EvaluateTestSuite extends TestSuiteConfig {
15
15
  async function evaluate(testSuite: EvaluateTestSuite, options: EvaluateOptions = {}) {
16
16
  const constructedTestSuite: TestSuite = {
17
17
  ...testSuite,
18
- prompts: testSuite.prompts, // raw prompts expected
19
18
  providers: await loadApiProviders(testSuite.providers),
20
19
  tests: await readTests(testSuite.tests),
20
+
21
+ // Full prompts expected (not filepaths)
22
+ prompts: testSuite.prompts.map((promptContent) => ({
23
+ raw: promptContent,
24
+ display: promptContent,
25
+ })),
21
26
  };
22
27
  return doEvaluate(constructedTestSuite, options);
23
28
  }