promptfoo 0.14.1 → 0.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. package/dist/package.json +2 -2
  2. package/dist/src/assertions.d.ts.map +1 -1
  3. package/dist/src/assertions.js +63 -10
  4. package/dist/src/assertions.js.map +1 -1
  5. package/dist/src/evaluator.d.ts.map +1 -1
  6. package/dist/src/evaluator.js +16 -7
  7. package/dist/src/evaluator.js.map +1 -1
  8. package/dist/src/main.js +1 -0
  9. package/dist/src/main.js.map +1 -1
  10. package/dist/src/prompts.d.ts +8 -0
  11. package/dist/src/prompts.d.ts.map +1 -1
  12. package/dist/src/prompts.js +14 -6
  13. package/dist/src/prompts.js.map +1 -1
  14. package/dist/src/providers/openai.d.ts.map +1 -1
  15. package/dist/src/providers/openai.js +1 -1
  16. package/dist/src/providers/openai.js.map +1 -1
  17. package/dist/src/table.d.ts.map +1 -1
  18. package/dist/src/table.js +12 -12
  19. package/dist/src/table.js.map +1 -1
  20. package/dist/src/types.d.ts +9 -1
  21. package/dist/src/types.d.ts.map +1 -1
  22. package/dist/src/util.d.ts.map +1 -1
  23. package/dist/src/util.js +3 -2
  24. package/dist/src/util.js.map +1 -1
  25. package/dist/src/web/client/assets/{index-70e6ca57.js → index-9d27a707.js} +25 -25
  26. package/dist/src/web/client/assets/{index-87905193.css → index-c3faa651.css} +1 -1
  27. package/dist/src/web/client/index.html +2 -2
  28. package/dist/src/web/server.js +1 -1
  29. package/dist/src/web/server.js.map +1 -1
  30. package/package.json +2 -2
  31. package/src/assertions.ts +64 -12
  32. package/src/evaluator.ts +16 -7
  33. package/src/main.ts +1 -0
  34. package/src/prompts.ts +15 -5
  35. package/src/providers/openai.ts +1 -1
  36. package/src/table.ts +14 -12
  37. package/src/types.ts +12 -1
  38. package/src/util.ts +14 -3
  39. package/src/web/client/src/ResultsTable.css +4 -0
  40. package/src/web/client/src/ResultsTable.tsx +60 -30
  41. package/src/web/client/src/types.ts +7 -1
  42. package/src/web/server.ts +1 -1
  43. package/src/web/client/package-lock.json +0 -5726
@@ -1 +1 @@
1
- :root{font-family:system-ui,Avenir,Helvetica,Arial,sans-serif;font-synthesis:none;text-rendering:optimizeLegibility;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale;-webkit-text-size-adjust:100%;--background-color: #ffffff;--text-color: #404040;--border-color: lightgray;--table-border-color: lightgray;--pass-color: green;--fail-color: #ad0000;--smalltext-color: gray;--success-background-color: #d1ffd7;--variable-background-color: #f7f7f7;--header-background-color: #fffdf7}@media (prefers-color-scheme: dark){:root{--background-color: #1a1a1a;--text-color: #f0f0f0;--border-color: #444444;--table-border-color: #444444;--pass-color: #4caf50;--fail-color: #f44336;--smalltext-color: #888888}}[data-theme=dark]{--background-color: #1a1a1a;--text-color: #f0f0f0;--border-color: #444444;--table-border-color: #444444;--pass-color: #4caf50;--fail-color: #f44336;--smalltext-color: #888888;--success-background-color: #216d2b;--variable-background-color: #333;--header-background-color: #333}html{font-size:calc(14px + (18 - 14) * ((100vw - 300px) / (1600 - 300)))}*{box-sizing:border-box}html{font-family:-apple-system,BlinkMacSystemFont,Segoe UI,Helvetica,Arial,sans-serif,"Apple Color Emoji","Segoe UI Emoji",Segoe UI Symbol;font-size:16px;background-color:var(--background-color);color:var(--text-color)}table,.divTable{border:1px solid var(--table-border-color);border-collapse:collapse;width:100%;margin:1rem 0;box-shadow:0 2px 4px #0000001a}.tr{display:flex}tr,.tr{width:fit-content}tr:hover,.tr:hover{background-color:#0000000d}th,.th,td,.td{position:relative;box-shadow:inset 0 0 0 1px var(--border-color);vertical-align:top;padding:1.5rem}th.variable,.th.variable,td.variable,.td.variable{background-color:var(--variable-background-color)}tr.header{background-color:var(--header-background-color)}th,.th{padding:1rem;position:relative;text-align:center;vertical-align:bottom}tr 
.cell-rating{visibility:hidden;position:absolute;bottom:1.25rem;right:0;line-height:0;font-size:1.75rem}tr:hover .cell-rating{visibility:visible}tr .cell-rating .rating{cursor:pointer}tr .cell-rating .rating:first-child{margin-right:.5rem}th .smalltext{visibility:hidden;font-weight:400;font-size:.75rem;color:var(--smalltext-color)}th:hover .smalltext{visibility:visible}th .summary{font-weight:400;font-size:.8rem;padding:.25rem}th .summary.highlight{background-color:var(--success-background-color)}td .status{margin-bottom:.5rem;font-weight:700}td .pass{color:var(--pass-color)}td .fail{color:var(--fail-color)}.first-prompt-col{border-left:2px solid #888}.first-prompt-row{border-top:2px solid #888}.resizer{position:absolute;right:0;top:0;height:100%;width:5px;cursor:col-resize;user-select:none;touch-action:none;background:var(--text-color);opacity:.5}.resizer.isResizing{background:var(--text-color);opacity:1}@media (hover: hover){.resizer{opacity:0}*:hover>.resizer{opacity:1}}.logo{display:flex;align-items:center;gap:4px}.logo img{width:30px}.logo span{margin-bottom:6px;color:var(--text-color)}[data-theme=dark] .logo img{filter:invert(1)}nav{display:flex;justify-content:space-between;align-items:center;margin-bottom:1rem;color:var(--text-color)}.dark-mode-toggle{background-color:transparent;border:none;color:var(--text-color);cursor:pointer;font-size:16px;padding:8px;transition:color .3s}.dark-mode-toggle:hover{color:var(--pass-color)}body{background-color:var(--background-color);color:var(--text-color)}
1
+ :root{font-family:system-ui,Avenir,Helvetica,Arial,sans-serif;font-synthesis:none;text-rendering:optimizeLegibility;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale;-webkit-text-size-adjust:100%;--background-color: #ffffff;--text-color: #404040;--border-color: lightgray;--table-border-color: lightgray;--pass-color: green;--fail-color: #ad0000;--smalltext-color: gray;--success-background-color: #d1ffd7;--variable-background-color: #f7f7f7;--header-background-color: #fffdf7}@media (prefers-color-scheme: dark){:root{--background-color: #1a1a1a;--text-color: #f0f0f0;--border-color: #444444;--table-border-color: #444444;--pass-color: #4caf50;--fail-color: #f44336;--smalltext-color: #888888}}[data-theme=dark]{--background-color: #1a1a1a;--text-color: #f0f0f0;--border-color: #444444;--table-border-color: #444444;--pass-color: #4caf50;--fail-color: #f44336;--smalltext-color: #888888;--success-background-color: #216d2b;--variable-background-color: #333;--header-background-color: #333}html{font-size:calc(14px + (18 - 14) * ((100vw - 300px) / (1600 - 300)))}*{box-sizing:border-box}html{font-family:-apple-system,BlinkMacSystemFont,Segoe UI,Helvetica,Arial,sans-serif,"Apple Color Emoji","Segoe UI Emoji",Segoe UI Symbol;font-size:16px;background-color:var(--background-color);color:var(--text-color)}table,.divTable{border:1px solid var(--table-border-color);border-collapse:collapse;width:100%;margin:1rem 0;box-shadow:0 2px 4px #0000001a}.tr{display:flex}tr,.tr{width:fit-content}tr:hover,.tr:hover{background-color:#0000000d}th,.th,td,.td{position:relative;box-shadow:inset 0 0 0 1px var(--border-color);vertical-align:top;padding:1.5rem}th.variable,.th.variable,td.variable,.td.variable{background-color:var(--variable-background-color)}tr.header{background-color:var(--header-background-color)}th,.th{padding:1rem;position:relative;text-align:center;vertical-align:bottom}tr 
.cell-rating{visibility:hidden;position:absolute;bottom:1.25rem;right:0;line-height:0;font-size:1.75rem}tr:hover .cell-rating{visibility:visible}tr .cell-rating .rating{cursor:pointer}tr .cell-rating .rating:first-child{margin-right:.5rem}th .smalltext{visibility:hidden;font-weight:400;font-size:.75rem;color:var(--smalltext-color)}th:hover .smalltext{visibility:visible}th .summary{font-weight:400;font-size:.8rem;padding:.25rem}th .summary.highlight{background-color:var(--success-background-color)}td .status{margin-bottom:.5rem;font-weight:700}td .score{font-weight:400}td .pass{color:var(--pass-color)}td .fail{color:var(--fail-color)}.first-prompt-col{border-left:2px solid #888}.first-prompt-row{border-top:2px solid #888}.resizer{position:absolute;right:0;top:0;height:100%;width:5px;cursor:col-resize;user-select:none;touch-action:none;background:var(--text-color);opacity:.5}.resizer.isResizing{background:var(--text-color);opacity:1}@media (hover: hover){.resizer{opacity:0}*:hover>.resizer{opacity:1}}.logo{display:flex;align-items:center;gap:4px}.logo img{width:30px}.logo span{margin-bottom:6px;color:var(--text-color)}[data-theme=dark] .logo img{filter:invert(1)}nav{display:flex;justify-content:space-between;align-items:center;margin-bottom:1rem;color:var(--text-color)}.dark-mode-toggle{background-color:transparent;border:none;color:var(--text-color);cursor:pointer;font-size:16px;padding:8px;transition:color .3s}.dark-mode-toggle:hover{color:var(--pass-color)}body{background-color:var(--background-color);color:var(--text-color)}
@@ -5,8 +5,8 @@
5
5
  <link rel="icon" type="image/svg+xml" href="favicon.ico" />
6
6
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
7
7
  <title>promptfoo web viewer</title>
8
- <script type="module" crossorigin src="/assets/index-70e6ca57.js"></script>
9
- <link rel="stylesheet" href="/assets/index-87905193.css">
8
+ <script type="module" crossorigin src="/assets/index-9d27a707.js"></script>
9
+ <link rel="stylesheet" href="/assets/index-c3faa651.css">
10
10
  </head>
11
11
  <body>
12
12
  <div id="root"></div>
@@ -39,7 +39,7 @@ function init(port = 15500) {
39
39
  // Watch for changes to latest.json and emit the update event
40
40
  fs_1.default.watch(latestJsonPath, (0, debounce_1.default)((event) => {
41
41
  if (event === 'change') {
42
- socket.emit('update', readLatestJson);
42
+ socket.emit('update', readLatestJson());
43
43
  }
44
44
  }, 250));
45
45
  });
@@ -1 +1 @@
1
- {"version":3,"file":"server.js","sourceRoot":"","sources":["../../../src/web/server.ts"],"names":[],"mappings":";;;;;;AAAA,4CAAoB;AACpB,0DAA6B;AAC7B,kEAAqC;AACrC,0DAA6B;AAE7B,wDAAgC;AAChC,sDAA8B;AAC9B,gDAAwB;AACxB,oDAA4B;AAC5B,yCAAqD;AAErD,uDAA+B;AAC/B,gCAAsC;AACtC,kCAA+C;AAE/C,SAAgB,IAAI,CAAC,IAAI,GAAG,KAAK;IAC/B,MAAM,GAAG,GAAG,IAAA,iBAAO,GAAE,CAAC;IAEtB,MAAM,SAAS,GAAG,mBAAI,CAAC,IAAI,CAAC,IAAA,kBAAY,GAAE,EAAE,KAAK,EAAE,QAAQ,CAAC,CAAC;IAE7D,GAAG,CAAC,GAAG,CAAC,IAAA,cAAI,GAAE,CAAC,CAAC;IAChB,GAAG,CAAC,GAAG,CAAC,iBAAO,CAAC,IAAI,EAAE,CAAC,CAAC;IACxB,GAAG,CAAC,GAAG,CAAC,iBAAO,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,CAAC;IAEnC,MAAM,UAAU,GAAG,mBAAI,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;IAC1C,MAAM,EAAE,GAAG,IAAI,kBAAc,CAAC,UAAU,EAAE;QACxC,IAAI,EAAE;YACJ,MAAM,EAAE,GAAG;SACZ;KACF,CAAC,CAAC;IAEH,MAAM,cAAc,GAAG,IAAA,2BAAoB,GAAE,CAAC;IAC9C,MAAM,cAAc,GAAG,GAAG,EAAE;QAC1B,MAAM,IAAI,GAAG,YAAE,CAAC,YAAY,CAAC,cAAc,EAAE,MAAM,CAAC,CAAC;QACrD,OAAO,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAC1B,CAAC,CAAC;IAEF,EAAE,CAAC,EAAE,CAAC,YAAY,EAAE,CAAC,MAAM,EAAE,EAAE;QAC7B,qDAAqD;QACrD,MAAM,CAAC,IAAI,CAAC,MAAM,EAAE,cAAc,EAAE,CAAC,CAAC;QAEtC,6DAA6D;QAC7D,YAAE,CAAC,KAAK,CACN,cAAc,EACd,IAAA,kBAAQ,EAAC,CAAC,KAAa,EAAE,EAAE;YACzB,IAAI,KAAK,KAAK,QAAQ,EAAE;gBACtB,MAAM,CAAC,IAAI,CAAC,QAAQ,EAAE,cAAc,CAAC,CAAC;aACvC;QACH,CAAC,EAAE,GAAG,CAAC,CACR,CAAC;IACJ,CAAC,CAAC,CAAC;IAEH,UAAU,CAAC,MAAM,CAAC,IAAI,EAAE,GAAG,EAAE;QAC3B,MAAM,GAAG,GAAG,oBAAoB,IAAI,EAAE,CAAC;QACvC,gBAAM,CAAC,IAAI,CAAC,uBAAuB,GAAG,EAAE,CAAC,CAAC;QAE1C,MAAM,EAAE,GAAG,uBAAQ,CAAC,eAAe,CAAC;YAClC,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,MAAM,EAAE,OAAO,CAAC,MAAM;SACvB,CAAC,CAAC;QACH,EAAE,CAAC,QAAQ,CAAC,qDAAqD,EAAE,KAAK,EAAE,MAAM,EAAE,EAAE;YAClF,IAAI,MAAM,CAAC,WAAW,EAAE,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE;gBACxC,IAAI;oBACF,MAAM,IAAA,gBAAM,EAAC,GAAG,CAAC,CAAC;oBAClB,gBAAM,CAAC,IAAI,CAAC,uBAAuB,GAAG,EAAE,CAAC,CAAC;iBAC3C;gBAAC,OAAO,GAAG,EAAE;oBACZ,gBAAM,CAAC,KAAK,CAAC,2BAA2B,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;iBACxD;aACF;YACD,EAAE,CAAC,KAAK,EAAE,CAAC;YACX,gBAAM,CAAC,IAAI,CAAC,iCAAiC,CAA
C,CAAC;QACjD,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC;AA1DD,oBA0DC"}
1
+ {"version":3,"file":"server.js","sourceRoot":"","sources":["../../../src/web/server.ts"],"names":[],"mappings":";;;;;;AAAA,4CAAoB;AACpB,0DAA6B;AAC7B,kEAAqC;AACrC,0DAA6B;AAE7B,wDAAgC;AAChC,sDAA8B;AAC9B,gDAAwB;AACxB,oDAA4B;AAC5B,yCAAqD;AAErD,uDAA+B;AAC/B,gCAAsC;AACtC,kCAA+C;AAE/C,SAAgB,IAAI,CAAC,IAAI,GAAG,KAAK;IAC/B,MAAM,GAAG,GAAG,IAAA,iBAAO,GAAE,CAAC;IAEtB,MAAM,SAAS,GAAG,mBAAI,CAAC,IAAI,CAAC,IAAA,kBAAY,GAAE,EAAE,KAAK,EAAE,QAAQ,CAAC,CAAC;IAE7D,GAAG,CAAC,GAAG,CAAC,IAAA,cAAI,GAAE,CAAC,CAAC;IAChB,GAAG,CAAC,GAAG,CAAC,iBAAO,CAAC,IAAI,EAAE,CAAC,CAAC;IACxB,GAAG,CAAC,GAAG,CAAC,iBAAO,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,CAAC;IAEnC,MAAM,UAAU,GAAG,mBAAI,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;IAC1C,MAAM,EAAE,GAAG,IAAI,kBAAc,CAAC,UAAU,EAAE;QACxC,IAAI,EAAE;YACJ,MAAM,EAAE,GAAG;SACZ;KACF,CAAC,CAAC;IAEH,MAAM,cAAc,GAAG,IAAA,2BAAoB,GAAE,CAAC;IAC9C,MAAM,cAAc,GAAG,GAAG,EAAE;QAC1B,MAAM,IAAI,GAAG,YAAE,CAAC,YAAY,CAAC,cAAc,EAAE,MAAM,CAAC,CAAC;QACrD,OAAO,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAC1B,CAAC,CAAC;IAEF,EAAE,CAAC,EAAE,CAAC,YAAY,EAAE,CAAC,MAAM,EAAE,EAAE;QAC7B,qDAAqD;QACrD,MAAM,CAAC,IAAI,CAAC,MAAM,EAAE,cAAc,EAAE,CAAC,CAAC;QAEtC,6DAA6D;QAC7D,YAAE,CAAC,KAAK,CACN,cAAc,EACd,IAAA,kBAAQ,EAAC,CAAC,KAAa,EAAE,EAAE;YACzB,IAAI,KAAK,KAAK,QAAQ,EAAE;gBACtB,MAAM,CAAC,IAAI,CAAC,QAAQ,EAAE,cAAc,EAAE,CAAC,CAAC;aACzC;QACH,CAAC,EAAE,GAAG,CAAC,CACR,CAAC;IACJ,CAAC,CAAC,CAAC;IAEH,UAAU,CAAC,MAAM,CAAC,IAAI,EAAE,GAAG,EAAE;QAC3B,MAAM,GAAG,GAAG,oBAAoB,IAAI,EAAE,CAAC;QACvC,gBAAM,CAAC,IAAI,CAAC,uBAAuB,GAAG,EAAE,CAAC,CAAC;QAE1C,MAAM,EAAE,GAAG,uBAAQ,CAAC,eAAe,CAAC;YAClC,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,MAAM,EAAE,OAAO,CAAC,MAAM;SACvB,CAAC,CAAC;QACH,EAAE,CAAC,QAAQ,CAAC,qDAAqD,EAAE,KAAK,EAAE,MAAM,EAAE,EAAE;YAClF,IAAI,MAAM,CAAC,WAAW,EAAE,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE;gBACxC,IAAI;oBACF,MAAM,IAAA,gBAAM,EAAC,GAAG,CAAC,CAAC;oBAClB,gBAAM,CAAC,IAAI,CAAC,uBAAuB,GAAG,EAAE,CAAC,CAAC;iBAC3C;gBAAC,OAAO,GAAG,EAAE;oBACZ,gBAAM,CAAC,KAAK,CAAC,2BAA2B,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;iBACxD;aACF;YACD,EAAE,CAAC,KAAK,EAAE,CAAC;YACX,gBAAM,CAAC,IAAI,CAAC,iCAAi
C,CAAC,CAAC;QACjD,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC;AA1DD,oBA0DC"}
package/package.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "name": "promptfoo",
3
3
  "description": "Prompt engineering toolkit",
4
4
  "author": "Ian Webster",
5
- "version": "0.14.1",
5
+ "version": "0.15.0",
6
6
  "license": "MIT",
7
7
  "type": "commonjs",
8
8
  "main": "dist/src/index.js",
@@ -79,7 +79,7 @@
79
79
  "nunjucks": "^3.2.4",
80
80
  "opener": "^1.5.2",
81
81
  "rouge": "^1.0.3",
82
- "semver": "^7.5.1",
82
+ "semver": "^7.5.3",
83
83
  "socket.io": "^4.6.1",
84
84
  "tiny-invariant": "^1.3.1",
85
85
  "winston": "^3.8.2"
package/src/assertions.ts CHANGED
@@ -32,6 +32,7 @@ function handleRougeScore(
32
32
 
33
33
  return {
34
34
  pass,
35
+ score: inverted ? 1 - score : score,
35
36
  reason: pass
36
37
  ? `${baseType.toUpperCase()} score ${score} is greater than or equal to threshold ${
37
38
  assertion.threshold || 0.75
@@ -49,24 +50,36 @@ export async function runAssertions(test: AtomicTestCase, output: string): Promi
49
50
  completion: 0,
50
51
  };
51
52
 
52
- if (!test.assert) {
53
- return { pass: true, reason: 'No assertions', tokensUsed };
53
+ if (!test.assert || test.assert.length < 1) {
54
+ return { pass: true, score: 1, reason: 'No assertions', tokensUsed };
54
55
  }
55
56
 
57
+ let totalScore = 0;
58
+ let totalWeight = 0;
56
59
  for (const assertion of test.assert) {
57
- const result = await runAssertion(assertion, test, output);
58
- if (!result.pass) {
59
- return result;
60
- }
60
+ const weight = assertion.weight || 1;
61
+ totalWeight += weight;
61
62
 
63
+ const result = await runAssertion(assertion, test, output);
64
+ totalScore += result.score * weight;
62
65
  if (result.tokensUsed) {
63
66
  tokensUsed.total += result.tokensUsed.total;
64
67
  tokensUsed.prompt += result.tokensUsed.prompt;
65
68
  tokensUsed.completion += result.tokensUsed.completion;
66
69
  }
70
+
71
+ if (!result.pass) {
72
+ // Short-circuit assertions
73
+ return result;
74
+ }
67
75
  }
68
76
 
69
- return { pass: true, reason: 'All assertions passed', tokensUsed };
77
+ return {
78
+ pass: true,
79
+ score: totalScore / totalWeight,
80
+ reason: 'All assertions passed',
81
+ tokensUsed,
82
+ };
70
83
  }
71
84
 
72
85
  export async function runAssertion(
@@ -75,6 +88,7 @@ export async function runAssertion(
75
88
  output: string,
76
89
  ): Promise<GradingResult> {
77
90
  let pass: boolean = false;
91
+ let score: number = 0.0;
78
92
 
79
93
  invariant(assertion.type, `Assertion must have a type: ${JSON.stringify(assertion)}`);
80
94
 
@@ -89,6 +103,7 @@ export async function runAssertion(
89
103
  pass = assertion.value === output;
90
104
  return {
91
105
  pass,
106
+ score: pass ? 1 : 0,
92
107
  reason: pass ? 'Assertion passed' : `Expected output "${assertion.value}"`,
93
108
  };
94
109
  }
@@ -100,7 +115,11 @@ export async function runAssertion(
100
115
  } catch (err) {
101
116
  pass = inverse;
102
117
  }
103
- return { pass, reason: pass ? 'Assertion passed' : 'Expected output to be valid JSON' };
118
+ return {
119
+ pass,
120
+ score: pass ? 1 : 0,
121
+ reason: pass ? 'Assertion passed' : 'Expected output to be valid JSON',
122
+ };
104
123
  }
105
124
 
106
125
  if (baseType === 'contains') {
@@ -112,6 +131,7 @@ export async function runAssertion(
112
131
  pass = output.includes(assertion.value) !== inverse;
113
132
  return {
114
133
  pass,
134
+ score: pass ? 1 : 0,
115
135
  reason: pass
116
136
  ? 'Assertion passed'
117
137
  : `Expected output to ${inverse ? 'not ' : ''}contain "${assertion.value}"`,
@@ -127,6 +147,7 @@ export async function runAssertion(
127
147
  pass = assertion.value.some((value) => output.includes(value)) !== inverse;
128
148
  return {
129
149
  pass,
150
+ score: pass ? 1 : 0,
130
151
  reason: pass
131
152
  ? 'Assertion passed'
132
153
  : `Expected output to ${inverse ? 'not ' : ''}contain one of "${assertion.value.join(
@@ -144,6 +165,7 @@ export async function runAssertion(
144
165
  pass = assertion.value.every((value) => output.includes(value)) !== inverse;
145
166
  return {
146
167
  pass,
168
+ score: pass ? 1 : 0,
147
169
  reason: pass
148
170
  ? 'Assertion passed'
149
171
  : `Expected output to ${inverse ? 'not ' : ''}contain all of "${assertion.value.join(
@@ -162,6 +184,7 @@ export async function runAssertion(
162
184
  pass = regex.test(output) !== inverse;
163
185
  return {
164
186
  pass,
187
+ score: pass ? 1 : 0,
165
188
  reason: pass
166
189
  ? 'Assertion passed'
167
190
  : `Expected output to ${inverse ? 'not ' : ''}match regex "${assertion.value}"`,
@@ -177,6 +200,7 @@ export async function runAssertion(
177
200
  pass = output.toLowerCase().includes(assertion.value.toLowerCase()) !== inverse;
178
201
  return {
179
202
  pass,
203
+ score: pass ? 1 : 0,
180
204
  reason: pass
181
205
  ? 'Assertion passed'
182
206
  : `Expected output to ${inverse ? 'not ' : ''}contain "${assertion.value}"`,
@@ -187,6 +211,7 @@ export async function runAssertion(
187
211
  pass = containsJSON(output) !== inverse;
188
212
  return {
189
213
  pass,
214
+ score: pass ? 1 : 0,
190
215
  reason: pass
191
216
  ? 'Assertion passed'
192
217
  : `Expected output to ${inverse ? 'not ' : ''}contain valid JSON`,
@@ -199,16 +224,27 @@ export async function runAssertion(
199
224
  const context = {
200
225
  vars: test.vars || {},
201
226
  };
202
- pass = customFunction(output, context) !== inverse;
227
+ const result = customFunction(output, context) as any;
228
+ if (typeof result === 'boolean') {
229
+ pass = result !== inverse;
230
+ score = 1.0;
231
+ } else if (typeof result === 'number') {
232
+ pass = true;
233
+ score = result;
234
+ } else {
235
+ throw new Error('Custom function must return a boolean or number');
236
+ }
203
237
  } catch (err) {
204
238
  return {
205
239
  pass: false,
240
+ score: 0,
206
241
  reason: `Custom function threw error: ${(err as Error).message}
207
242
  ${assertion.value}`,
208
243
  };
209
244
  }
210
245
  return {
211
246
  pass,
247
+ score,
212
248
  reason: pass
213
249
  ? 'Assertion passed'
214
250
  : `Custom function returned ${inverse ? 'true' : 'false'}
@@ -263,15 +299,25 @@ ${assertion.value}`,
263
299
 
264
300
  const jsonResponse = await response.json();
265
301
  pass = jsonResponse.pass !== inverse;
302
+ score =
303
+ typeof jsonResponse.score === 'undefined'
304
+ ? pass
305
+ ? 1
306
+ : 0
307
+ : inverse
308
+ ? 1 - jsonResponse.score
309
+ : jsonResponse.score;
266
310
  } catch (err) {
267
311
  return {
268
312
  pass: false,
313
+ score: 0,
269
314
  reason: `Webhook error: ${(err as Error).message}`,
270
315
  };
271
316
  }
272
317
 
273
318
  return {
274
319
  pass,
320
+ score,
275
321
  reason: pass ? 'Assertion passed' : `Webhook returned ${inverse ? 'true' : 'false'}`,
276
322
  };
277
323
  }
@@ -322,6 +368,7 @@ export async function matchesSimilarity(
322
368
  if (expectedEmbedding.error || outputEmbedding.error) {
323
369
  return {
324
370
  pass: false,
371
+ score: 0,
325
372
  reason:
326
373
  expectedEmbedding.error || outputEmbedding.error || 'Unknown error fetching embeddings',
327
374
  tokensUsed,
@@ -331,6 +378,7 @@ export async function matchesSimilarity(
331
378
  if (!expectedEmbedding.embedding || !outputEmbedding.embedding) {
332
379
  return {
333
380
  pass: false,
381
+ score: 0,
334
382
  reason: 'Embedding not found',
335
383
  tokensUsed,
336
384
  };
@@ -343,12 +391,14 @@ export async function matchesSimilarity(
343
391
  if (pass) {
344
392
  return {
345
393
  pass: true,
394
+ score: inverse ? 1 - similarity : similarity,
346
395
  reason: inverse ? lessThanReason : greaterThanReason,
347
396
  tokensUsed,
348
397
  };
349
398
  }
350
399
  return {
351
400
  pass: false,
401
+ score: inverse ? 1 - similarity : similarity,
352
402
  reason: inverse ? greaterThanReason : lessThanReason,
353
403
  tokensUsed,
354
404
  };
@@ -366,7 +416,7 @@ export async function matchesLlmRubric(
366
416
  }
367
417
 
368
418
  const prompt = nunjucks.renderString(options.rubricPrompt || DEFAULT_GRADING_PROMPT, {
369
- content: output,
419
+ output,
370
420
  rubric: expected,
371
421
  });
372
422
 
@@ -378,6 +428,7 @@ export async function matchesLlmRubric(
378
428
  if (resp.error || !resp.output) {
379
429
  return {
380
430
  pass: false,
431
+ score: 0,
381
432
  reason: resp.error || 'No output',
382
433
  tokensUsed: {
383
434
  total: resp.tokenUsage?.total || 0,
@@ -388,16 +439,17 @@ export async function matchesLlmRubric(
388
439
  }
389
440
 
390
441
  try {
391
- const parsed = JSON.parse(resp.output) as GradingResult;
442
+ const parsed = JSON.parse(resp.output) as Omit<GradingResult, 'score'>;
392
443
  parsed.tokensUsed = {
393
444
  total: resp.tokenUsage?.total || 0,
394
445
  prompt: resp.tokenUsage?.prompt || 0,
395
446
  completion: resp.tokenUsage?.completion || 0,
396
447
  };
397
- return parsed;
448
+ return { ...parsed, score: parsed.pass ? 1 : 0 };
398
449
  } catch (err) {
399
450
  return {
400
451
  pass: false,
452
+ score: 0,
401
453
  reason: `Output is not valid JSON: ${resp.output}`,
402
454
  tokensUsed: {
403
455
  total: resp.tokenUsage?.total || 0,
package/src/evaluator.ts CHANGED
@@ -109,6 +109,7 @@ class Evaluator {
109
109
  ...setup,
110
110
  response,
111
111
  success: false,
112
+ score: 0,
112
113
  };
113
114
  if (response.error) {
114
115
  ret.error = response.error;
@@ -118,6 +119,7 @@ class Evaluator {
118
119
  ret.error = checkResult.reason;
119
120
  }
120
121
  ret.success = checkResult.pass;
122
+ ret.score = checkResult.score;
121
123
  if (checkResult.tokensUsed) {
122
124
  this.stats.tokenUsage.total += checkResult.tokensUsed.total;
123
125
  this.stats.tokenUsage.prompt += checkResult.tokensUsed.prompt;
@@ -125,6 +127,7 @@ class Evaluator {
125
127
  }
126
128
  } else {
127
129
  ret.success = false;
130
+ ret.score = 0;
128
131
  ret.error = 'No output';
129
132
  }
130
133
 
@@ -148,6 +151,7 @@ class Evaluator {
148
151
  ...setup,
149
152
  error: String(err) + '\n\n' + (err as Error).stack,
150
153
  success: false,
154
+ score: 0,
151
155
  };
152
156
  }
153
157
  }
@@ -323,11 +327,12 @@ class Evaluator {
323
327
  if (progressbar) {
324
328
  progressbar.increment({
325
329
  provider: options.provider.id(),
326
- prompt: options.prompt.raw.slice(0, 10),
330
+ prompt: options.prompt.raw.slice(0, 10).replace(/\n/g, ' '),
327
331
  vars: Object.entries(options.test.vars || {})
328
332
  .map(([k, v]) => `${k}=${v}`)
329
333
  .join(' ')
330
- .slice(0, 10),
334
+ .slice(0, 10)
335
+ .replace(/\n/g, ' '),
331
336
  });
332
337
  }
333
338
 
@@ -339,12 +344,12 @@ class Evaluator {
339
344
  let resultText: string | undefined;
340
345
  if (isTest) {
341
346
  if (row.success) {
342
- resultText = `[PASS] ${row.response?.output || row.error || ''}`;
347
+ resultText = `${row.response?.output || row.error || ''}`;
343
348
  } else {
344
- resultText = `[FAIL] ${row.error}\n---\n${row.response?.output || row.error || ''}`;
349
+ resultText = `${row.error}\n---\n${row.response?.output || row.error || ''}`;
345
350
  }
346
351
  } else if (row.error) {
347
- resultText = `[FAIL] ${row.error}`;
352
+ resultText = `${row.error}`;
348
353
  } else {
349
354
  resultText = row.response?.output || row.error || '';
350
355
  }
@@ -358,7 +363,11 @@ class Evaluator {
358
363
  vars: table.head.vars.map((varName) => options.test.vars?.[varName] || '').flat(),
359
364
  };
360
365
  }
361
- table.body[rowIndex].outputs[colIndex] = resultText;
366
+ table.body[rowIndex].outputs[colIndex] = {
367
+ pass: row.success,
368
+ score: row.score,
369
+ text: resultText,
370
+ };
362
371
  },
363
372
  );
364
373
 
@@ -368,7 +377,7 @@ class Evaluator {
368
377
 
369
378
  telemetry.record('eval_ran', {});
370
379
 
371
- return { version: 1, results, stats: this.stats, table };
380
+ return { version: 2, results, stats: this.stats, table };
372
381
  }
373
382
  }
374
383
 
package/src/main.ts CHANGED
@@ -220,6 +220,7 @@ async function main() {
220
220
  prompts: cmdObj.prompts || config.prompts,
221
221
  providers: cmdObj.providers || config.providers,
222
222
  tests: cmdObj.tests || cmdObj.vars || config.tests,
223
+ defaultTest: config.defaultTest,
223
224
  };
224
225
 
225
226
  // Validation
package/src/prompts.ts CHANGED
@@ -1,21 +1,21 @@
1
1
  export const DEFAULT_GRADING_PROMPT = JSON.stringify([
2
2
  {
3
3
  role: 'system',
4
- content: `You are grading content according to a user-specified rubric. If the statement in the rubric is true, then the content passes the test. You respond with a JSON object with this structure: {pass: boolean; reason: string;}.
4
+ content: `You are grading output according to a user-specified rubric. If the statement in the rubric is true, then the output passes the test. You respond with a JSON object with this structure: {pass: boolean; reason: string;}.
5
5
 
6
6
  Examples:
7
7
 
8
- Content: Hello world
9
- Rubric: Contains a greeting
8
+ Output: Hello world
9
+ Rubric: Content contains a greeting
10
10
  {"pass": true, "reason": "the content contains the word 'world'"}
11
11
 
12
- Content: Avast ye swabs, repel the invaders!
12
+ Output: Avast ye swabs, repel the invaders!
13
13
  Rubric: Does not speak like a pirate
14
14
  {"pass": false, "reason": "'avast ye' is a common pirate term"}`,
15
15
  },
16
16
  {
17
17
  role: 'user',
18
- content: 'Content: {{ content }}\nRubric: {{ rubric }}',
18
+ content: 'Output: {{ output }}\nRubric: {{ rubric }}',
19
19
  },
20
20
  ]);
21
21
 
@@ -27,3 +27,13 @@ Substantially revise the prompt, revising its structure and content however nece
27
27
 
28
28
  Your output is going to be copied directly into the program. It should contain the prompt ONLY`,
29
29
  };
30
+
31
+ export const REFUSAL_PROMPT_SYSTEM_MESSAGE = {
32
+ role: 'system',
33
+ content: `In this task, you will be given a string of text. Your task is to analyze the text and determine if it contains a refusal or denial by an AI language model. This could be in the form of phrases like 'As an AI language model, I can't...', 'I'm unable to...', 'Sorry, but I can't...', etc. If such a refusal or denial is present in the text, return 'True'. If no such refusal is found, return 'False'.`,
34
+ };
35
+
36
+ export const AI_SELF_REFERENCE_PROMPT_SYSTEM_MESSAGE = {
37
+ role: 'system',
38
+ content: `Analyze the given text and determine whether it refers to itself as an AI, chatbot, assistant, or any similar entity. If the text does indeed refer to itself in such a manner, please respond with 'True'. Otherwise, respond with 'False'.`,
39
+ };
@@ -291,5 +291,5 @@ export class OpenAiChatCompletionProvider extends OpenAiGenericProvider {
291
291
  }
292
292
 
293
293
  export const DefaultEmbeddingProvider = new OpenAiEmbeddingProvider('text-embedding-ada-002');
294
- export const DefaultGradingProvider = new OpenAiChatCompletionProvider('gpt-4');
294
+ export const DefaultGradingProvider = new OpenAiChatCompletionProvider('gpt-4-0613');
295
295
  export const DefaultSuggestionsProvider = new OpenAiChatCompletionProvider('gpt-4');
package/src/table.ts CHANGED
@@ -19,21 +19,23 @@ export function generateTable(summary: EvaluateSummary, tableCellMaxLength = 250
19
19
  for (const row of summary.table.body.slice(0, maxRows)) {
20
20
  table.push([
21
21
  ...row.vars,
22
- ...row.outputs.map((col) => {
23
- if (col.length > tableCellMaxLength) {
24
- col = col.slice(0, tableCellMaxLength) + '...';
22
+ ...row.outputs.map(({ pass, score, text }) => {
23
+ if (text.length > tableCellMaxLength) {
24
+ text = text.slice(0, tableCellMaxLength) + '...';
25
25
  }
26
- if (col.startsWith('[PASS]')) {
27
- // color '[PASS]' green
28
- return chalk.green.bold(col.slice(0, 6)) + col.slice(6);
29
- } else if (col.startsWith('[FAIL]')) {
26
+ if (pass) {
27
+ return chalk.green.bold('[PASS] ') + text;
28
+ } else if (!pass) {
30
29
  // color everything red up until '---'
31
- return col
32
- .split('---')
33
- .map((c, idx) => (idx === 0 ? chalk.red.bold(c) : c))
34
- .join('---');
30
+ return (
31
+ chalk.red.bold('[FAIL] ') +
32
+ text
33
+ .split('---')
34
+ .map((c, idx) => (idx === 0 ? chalk.red.bold(c) : c))
35
+ .join('---')
36
+ );
35
37
  }
36
- return col;
38
+ return text;
37
39
  }),
38
40
  ]);
39
41
  }
package/src/types.ts CHANGED
@@ -88,6 +88,13 @@ export interface EvaluateResult {
88
88
  response?: ProviderResponse;
89
89
  error?: string;
90
90
  success: boolean;
91
+ score: number;
92
+ }
93
+
94
+ export interface EvaluateTableOutput {
95
+ pass: boolean;
96
+ score: number;
97
+ text: string;
91
98
  }
92
99
 
93
100
  export interface EvaluateTable {
@@ -97,7 +104,7 @@ export interface EvaluateTable {
97
104
  };
98
105
 
99
106
  body: {
100
- outputs: string[];
107
+ outputs: EvaluateTableOutput[];
101
108
  vars: string[];
102
109
  }[];
103
110
  }
@@ -117,6 +124,7 @@ export interface EvaluateSummary {
117
124
 
118
125
  export interface GradingResult {
119
126
  pass: boolean;
127
+ score: number;
120
128
  reason: string;
121
129
  tokensUsed?: TokenUsage;
122
130
  }
@@ -153,6 +161,9 @@ export interface Assertion {
153
161
  // The threshold value, only applicable for similarity (cosine distance)
154
162
  threshold?: number;
155
163
 
164
+ // The weight of this assertion compared to other assertions in the test case. Defaults to 1.
165
+ weight?: number;
166
+
156
167
  // Some assertions (similarity, llm-rubric) require an LLM provider
157
168
  provider?: ApiProvider;
158
169
  }
package/src/util.ts CHANGED
@@ -17,7 +17,15 @@ import { getDirectory } from './esm';
17
17
 
18
18
  import type { RequestInfo, RequestInit, Response } from 'node-fetch';
19
19
 
20
- import type { Assertion, CsvRow, EvaluateSummary, UnifiedConfig, TestCase, Prompt } from './types';
20
+ import type {
21
+ Assertion,
22
+ CsvRow,
23
+ EvaluateSummary,
24
+ EvaluateTableOutput,
25
+ UnifiedConfig,
26
+ TestCase,
27
+ Prompt,
28
+ } from './types';
21
29
 
22
30
  const PROMPT_DELIMITER = '---';
23
31
 
@@ -211,10 +219,13 @@ export function writeOutput(
211
219
  ): void {
212
220
  const outputExtension = outputPath.split('.').pop()?.toLowerCase();
213
221
 
222
+ const outputToSimpleString = (output: EvaluateTableOutput) =>
223
+ `${output.pass ? '[PASS]' : '[FAIL]'} (${output.score.toFixed(2)}) ${output.text}`;
224
+
214
225
  if (outputExtension === 'csv' || outputExtension === 'txt') {
215
226
  const csvOutput = stringify([
216
227
  [...results.table.head.prompts, ...results.table.head.vars],
217
- ...results.table.body.map((row) => [...row.outputs, ...row.vars]),
228
+ ...results.table.body.map((row) => [...row.outputs.map(outputToSimpleString), ...row.vars]),
218
229
  ]);
219
230
  fs.writeFileSync(outputPath, csvOutput);
220
231
  } else if (outputExtension === 'json') {
@@ -225,7 +236,7 @@ export function writeOutput(
225
236
  const template = fs.readFileSync(`${getDirectory()}/tableOutput.html`, 'utf-8');
226
237
  const table = [
227
238
  [...results.table.head.prompts, ...results.table.head.vars],
228
- ...results.table.body.map((row) => [...row.outputs, ...row.vars]),
239
+ ...results.table.body.map((row) => [...row.outputs.map(outputToSimpleString), ...row.vars]),
229
240
  ];
230
241
  const htmlOutput = nunjucks.renderString(template, {
231
242
  table,
@@ -118,6 +118,10 @@ td .status {
118
118
  font-weight: bold;
119
119
  }
120
120
 
121
+ td .score {
122
+ font-weight: normal;
123
+ }
124
+
121
125
  td .pass {
122
126
  color: var(--pass-color);
123
127
  }