promptfoo 0.5.1 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. package/README.md +20 -248
  2. package/dist/__mocks__/esm.js +5 -1
  3. package/dist/__mocks__/esm.js.map +1 -1
  4. package/dist/assertions.d.ts +18 -0
  5. package/dist/assertions.d.ts.map +1 -0
  6. package/dist/assertions.js +128 -0
  7. package/dist/assertions.js.map +1 -0
  8. package/dist/esm.d.ts.map +1 -1
  9. package/dist/esm.js +10 -3
  10. package/dist/esm.js.map +1 -1
  11. package/dist/evaluator.d.ts.map +1 -1
  12. package/dist/evaluator.js +88 -117
  13. package/dist/evaluator.js.map +1 -1
  14. package/dist/index.d.ts +13 -0
  15. package/dist/index.d.ts.map +1 -1
  16. package/dist/index.js +34 -5
  17. package/dist/index.js.map +1 -1
  18. package/dist/logger.js +18 -11
  19. package/dist/logger.js.map +1 -1
  20. package/dist/main.js +95 -53
  21. package/dist/main.js.map +1 -1
  22. package/dist/prompts.d.ts +4 -0
  23. package/dist/prompts.d.ts.map +1 -1
  24. package/dist/prompts.js +12 -1
  25. package/dist/prompts.js.map +1 -1
  26. package/dist/providers/localai.js +21 -13
  27. package/dist/providers/localai.js.map +1 -1
  28. package/dist/providers/openai.d.ts +9 -4
  29. package/dist/providers/openai.d.ts.map +1 -1
  30. package/dist/providers/openai.js +39 -29
  31. package/dist/providers/openai.js.map +1 -1
  32. package/dist/providers/shared.d.ts.map +1 -1
  33. package/dist/providers/shared.js +5 -2
  34. package/dist/providers/shared.js.map +1 -1
  35. package/dist/providers.d.ts +10 -0
  36. package/dist/providers.d.ts.map +1 -1
  37. package/dist/providers.js +51 -14
  38. package/dist/providers.js.map +1 -1
  39. package/dist/suggestions.d.ts +9 -0
  40. package/dist/suggestions.d.ts.map +1 -0
  41. package/dist/suggestions.js +54 -0
  42. package/dist/suggestions.js.map +1 -0
  43. package/dist/types.d.ts +11 -2
  44. package/dist/types.d.ts.map +1 -1
  45. package/dist/types.js +2 -1
  46. package/dist/util.d.ts +1 -1
  47. package/dist/util.d.ts.map +1 -1
  48. package/dist/util.js +86 -31
  49. package/dist/util.js.map +1 -1
  50. package/dist/web/client/assets/index-207192fc.css +1 -0
  51. package/dist/web/client/assets/index-8751749f.js +172 -0
  52. package/dist/web/client/index.html +2 -2
  53. package/dist/web/server.js +38 -31
  54. package/dist/web/server.js.map +1 -1
  55. package/package.json +14 -4
  56. package/src/assertions.ts +154 -0
  57. package/src/esm.ts +5 -2
  58. package/src/evaluator.ts +61 -139
  59. package/src/index.ts +12 -0
  60. package/src/main.ts +28 -3
  61. package/src/prompts.ts +9 -0
  62. package/src/providers/openai.ts +16 -9
  63. package/src/providers/shared.ts +1 -1
  64. package/src/providers.ts +8 -0
  65. package/src/suggestions.ts +63 -0
  66. package/src/types.ts +14 -2
  67. package/src/util.ts +24 -3
  68. package/src/web/client/package.json +1 -0
  69. package/src/web/client/src/App.css +4 -0
  70. package/src/web/client/src/App.tsx +29 -5
  71. package/src/web/client/src/Logo.css +5 -0
  72. package/src/web/client/src/NavBar.css +18 -0
  73. package/src/web/client/src/NavBar.tsx +12 -1
  74. package/src/web/client/src/index.css +10 -0
  75. package/src/web/server.ts +2 -2
  76. package/dist/web/client/assets/index-710f1308.css +0 -1
  77. package/dist/web/client/assets/index-900b20c0.js +0 -172
@@ -5,8 +5,8 @@
5
5
  <link rel="icon" type="image/svg+xml" href="favicon.ico" />
6
6
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
7
7
  <title>promptfoo web viewer</title>
8
- <script type="module" crossorigin src="/assets/index-900b20c0.js"></script>
9
- <link rel="stylesheet" href="/assets/index-710f1308.css">
8
+ <script type="module" crossorigin src="/assets/index-8751749f.js"></script>
9
+ <link rel="stylesheet" href="/assets/index-207192fc.css">
10
10
  </head>
11
11
  <body>
12
12
  <div id="root"></div>
@@ -1,24 +1,30 @@
1
- import fs from 'fs';
2
- import path from 'node:path';
3
- import readline from 'node:readline';
4
- import http from 'node:http';
5
- import debounce from 'debounce';
6
- import open from 'open';
7
- import express from 'express';
8
- import cors from 'cors';
9
- import { Server as SocketIOServer } from 'socket.io';
10
- import promptfoo from '../index.js';
11
- import logger from '../logger.js';
12
- import { getDirectory } from '../esm.js';
13
- import { getLatestResultsPath } from '../util.js';
14
- export function init(port = 15500) {
15
- const app = express();
16
- const staticDir = path.join(getDirectory(), 'web', 'client');
17
- app.use(cors());
18
- app.use(express.json());
19
- app.use(express.static(staticDir));
20
- const httpServer = http.createServer(app);
21
- const io = new SocketIOServer(httpServer, {
1
+ "use strict";
2
+ var __importDefault = (this && this.__importDefault) || function (mod) {
3
+ return (mod && mod.__esModule) ? mod : { "default": mod };
4
+ };
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.init = void 0;
7
+ const fs_1 = __importDefault(require("fs"));
8
+ const node_path_1 = __importDefault(require("node:path"));
9
+ const node_readline_1 = __importDefault(require("node:readline"));
10
+ const node_http_1 = __importDefault(require("node:http"));
11
+ const debounce_1 = __importDefault(require("debounce"));
12
+ const express_1 = __importDefault(require("express"));
13
+ const cors_1 = __importDefault(require("cors"));
14
+ const opener_1 = __importDefault(require("opener"));
15
+ const socket_io_1 = require("socket.io");
16
+ const index_js_1 = __importDefault(require("../index.js"));
17
+ const logger_js_1 = __importDefault(require("../logger.js"));
18
+ const esm_js_1 = require("../esm.js");
19
+ const util_js_1 = require("../util.js");
20
+ function init(port = 15500) {
21
+ const app = (0, express_1.default)();
22
+ const staticDir = node_path_1.default.join((0, esm_js_1.getDirectory)(), 'web', 'client');
23
+ app.use((0, cors_1.default)());
24
+ app.use(express_1.default.json());
25
+ app.use(express_1.default.static(staticDir));
26
+ const httpServer = node_http_1.default.createServer(app);
27
+ const io = new socket_io_1.Server(httpServer, {
22
28
  cors: {
23
29
  origin: '*',
24
30
  },
@@ -26,16 +32,16 @@ export function init(port = 15500) {
26
32
  app.post('/evaluate', async (req, res) => {
27
33
  try {
28
34
  const { provider, options } = req.body;
29
- const summary = await promptfoo.evaluate(provider, options);
35
+ const summary = await index_js_1.default.evaluate(provider, options);
30
36
  res.json(summary);
31
37
  }
32
38
  catch (error) {
33
39
  res.status(500).json({ message: 'Error evaluating prompts' });
34
40
  }
35
41
  });
36
- const latestJsonPath = getLatestResultsPath();
42
+ const latestJsonPath = (0, util_js_1.getLatestResultsPath)();
37
43
  const readLatestJson = () => {
38
- const data = fs.readFileSync(latestJsonPath, 'utf8');
44
+ const data = fs_1.default.readFileSync(latestJsonPath, 'utf8');
39
45
  const jsonData = JSON.parse(data);
40
46
  return jsonData.table;
41
47
  };
@@ -43,7 +49,7 @@ export function init(port = 15500) {
43
49
  // Send the initial table data when a client connects
44
50
  socket.emit('init', { table: readLatestJson() });
45
51
  // Watch for changes to latest.json and emit the update event
46
- fs.watch(latestJsonPath, debounce((event) => {
52
+ fs_1.default.watch(latestJsonPath, (0, debounce_1.default)((event) => {
47
53
  if (event === 'change') {
48
54
  socket.emit('update', { table: readLatestJson() });
49
55
  }
@@ -51,24 +57,25 @@ export function init(port = 15500) {
51
57
  });
52
58
  httpServer.listen(port, () => {
53
59
  const url = `http://localhost:${port}`;
54
- logger.info(`Server listening at ${url}`);
55
- const rl = readline.createInterface({
60
+ logger_js_1.default.info(`Server listening at ${url}`);
61
+ const rl = node_readline_1.default.createInterface({
56
62
  input: process.stdin,
57
63
  output: process.stdout,
58
64
  });
59
65
  rl.question('Do you want to open the browser to the URL? (y/N): ', async (answer) => {
60
66
  if (answer.toLowerCase().startsWith('y')) {
61
67
  try {
62
- await open(url);
63
- logger.info(`Opening browser to: ${url}`);
68
+ await (0, opener_1.default)(url);
69
+ logger_js_1.default.info(`Opening browser to: ${url}`);
64
70
  }
65
71
  catch (err) {
66
- logger.error(`Failed to open browser: ${String(err)}`);
72
+ logger_js_1.default.error(`Failed to open browser: ${String(err)}`);
67
73
  }
68
74
  }
69
75
  rl.close();
70
- logger.info('Press Ctrl+C to stop the server');
76
+ logger_js_1.default.info('Press Ctrl+C to stop the server');
71
77
  });
72
78
  });
73
79
  }
80
+ exports.init = init;
74
81
  //# sourceMappingURL=server.js.map
@@ -1 +1 @@
1
- {"version":3,"file":"server.js","sourceRoot":"","sources":["../../src/web/server.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,IAAI,CAAC;AACpB,OAAO,IAAI,MAAM,WAAW,CAAC;AAC7B,OAAO,QAAQ,MAAM,eAAe,CAAC;AACrC,OAAO,IAAI,MAAM,WAAW,CAAC;AAE7B,OAAO,QAAQ,MAAM,UAAU,CAAC;AAChC,OAAO,IAAI,MAAM,MAAM,CAAC;AACxB,OAAO,OAAO,MAAM,SAAS,CAAC;AAC9B,OAAO,IAAI,MAAM,MAAM,CAAC;AACxB,OAAO,EAAE,MAAM,IAAI,cAAc,EAAE,MAAM,WAAW,CAAC;AAErD,OAAO,SAAS,MAAM,aAAa,CAAC;AACpC,OAAO,MAAM,MAAM,cAAc,CAAC;AAClC,OAAO,EAAE,YAAY,EAAE,MAAM,WAAW,CAAC;AACzC,OAAO,EAAE,oBAAoB,EAAE,MAAM,YAAY,CAAC;AAIlD,MAAM,UAAU,IAAI,CAAC,IAAI,GAAG,KAAK;IAC/B,MAAM,GAAG,GAAG,OAAO,EAAE,CAAC;IAEtB,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,YAAY,EAAE,EAAE,KAAK,EAAE,QAAQ,CAAC,CAAC;IAE7D,GAAG,CAAC,GAAG,CAAC,IAAI,EAAE,CAAC,CAAC;IAChB,GAAG,CAAC,GAAG,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC,CAAC;IACxB,GAAG,CAAC,GAAG,CAAC,OAAO,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,CAAC;IAEnC,MAAM,UAAU,GAAG,IAAI,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;IAC1C,MAAM,EAAE,GAAG,IAAI,cAAc,CAAC,UAAU,EAAE;QACxC,IAAI,EAAE;YACJ,MAAM,EAAE,GAAG;SACZ;KACF,CAAC,CAAC;IAUH,GAAG,CAAC,IAAI,CAAC,WAAW,EAAE,KAAK,EAAE,GAAY,EAAE,GAAa,EAAE,EAAE;QAC1D,IAAI;YACF,MAAM,EAAE,QAAQ,EAAE,OAAO,EAAE,GAAG,GAAG,CAAC,IAA2B,CAAC;YAC9D,MAAM,OAAO,GAAG,MAAM,SAAS,CAAC,QAAQ,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;YAC5D,GAAG,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;SACnB;QAAC,OAAO,KAAK,EAAE;YACd,GAAG,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,EAAE,OAAO,EAAE,0BAA0B,EAAE,CAAC,CAAC;SAC/D;IACH,CAAC,CAAC,CAAC;IAEH,MAAM,cAAc,GAAG,oBAAoB,EAAE,CAAC;IAC9C,MAAM,cAAc,GAAG,GAAG,EAAE;QAC1B,MAAM,IAAI,GAAG,EAAE,CAAC,YAAY,CAAC,cAAc,EAAE,MAAM,CAAC,CAAC;QACrD,MAAM,QAAQ,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAClC,OAAO,QAAQ,CAAC,KAAK,CAAC;IACxB,CAAC,CAAC;IAEF,EAAE,CAAC,EAAE,CAAC,YAAY,EAAE,CAAC,MAAM,EAAE,EAAE;QAC7B,qDAAqD;QACrD,MAAM,CAAC,IAAI,CAAC,MAAM,EAAE,EAAE,KAAK,EAAE,cAAc,EAAE,EAAE,CAAC,CAAC;QAEjD,6DAA6D;QAC7D,EAAE,CAAC,KAAK,CACN,cAAc,EACd,QAAQ,CAAC,CAAC,KAAa,EAAE,EAAE;YACzB,IAAI,KAAK,KAAK,QAAQ,EAAE;gBACtB,MAAM,CAAC,IAAI,CAAC,QAAQ,EAAE,EAAE,KAAK,EAAE,cAAc,EAAE,EAAE,CAAC,CAAC;aACpD;QACH,CAAC,EAAE,GAAG,CAAC,CACR,CAAC;IACJ,CAAC,CAAC,CAAC;IAEH,UAAU,CAAC,MAAM,CAAC,IAAI,EAAE,GAAG,EAAE;QAC3B,MAAM,GAAG,GAAG,oBAAoB,IAAI,EAAE,CAAC;QACvC,MAAM,CAAC,IAAI,CAAC,uBAAuB,GAAG,EAAE,CAAC,CAAC;QAE1C,MAAM,EAAE,GAAG,QAAQ,CAAC,eAAe,CAAC;YAClC,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,MAAM,EAAE,OAAO,CAAC,MAAM;SACvB,CAAC,CAAC;QACH,EAAE,CAAC,QAAQ,CAAC,qDAAqD,EAAE,KAAK,EAAE,MAAM,EAAE,EAAE;YAClF,IAAI,MAAM,CAAC,WAAW,EAAE,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE;gBACxC,IAAI;oBACF,MAAM,IAAI,CAAC,GAAG,CAAC,CAAC;oBAChB,MAAM,CAAC,IAAI,CAAC,uBAAuB,GAAG,EAAE,CAAC,CAAC;iBAC3C;gBAAC,OAAO,GAAG,EAAE;oBACZ,MAAM,CAAC,KAAK,CAAC,2BAA2B,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;iBACxD;aACF;YACD,EAAE,CAAC,KAAK,EAAE,CAAC;YACX,MAAM,CAAC,IAAI,CAAC,iCAAiC,CAAC,CAAC;QACjD,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC"}
1
+ {"version":3,"file":"server.js","sourceRoot":"","sources":["../../src/web/server.ts"],"names":[],"mappings":";;;;;;AAAA,4CAAoB;AACpB,0DAA6B;AAC7B,kEAAqC;AACrC,0DAA6B;AAE7B,wDAAgC;AAChC,sDAA8B;AAC9B,gDAAwB;AACxB,oDAA4B;AAC5B,yCAAqD;AAErD,2DAAoC;AACpC,6DAAkC;AAClC,sCAAyC;AACzC,wCAAkD;AAIlD,SAAgB,IAAI,CAAC,IAAI,GAAG,KAAK;IAC/B,MAAM,GAAG,GAAG,IAAA,iBAAO,GAAE,CAAC;IAEtB,MAAM,SAAS,GAAG,mBAAI,CAAC,IAAI,CAAC,IAAA,qBAAY,GAAE,EAAE,KAAK,EAAE,QAAQ,CAAC,CAAC;IAE7D,GAAG,CAAC,GAAG,CAAC,IAAA,cAAI,GAAE,CAAC,CAAC;IAChB,GAAG,CAAC,GAAG,CAAC,iBAAO,CAAC,IAAI,EAAE,CAAC,CAAC;IACxB,GAAG,CAAC,GAAG,CAAC,iBAAO,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,CAAC;IAEnC,MAAM,UAAU,GAAG,mBAAI,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;IAC1C,MAAM,EAAE,GAAG,IAAI,kBAAc,CAAC,UAAU,EAAE;QACxC,IAAI,EAAE;YACJ,MAAM,EAAE,GAAG;SACZ;KACF,CAAC,CAAC;IAUH,GAAG,CAAC,IAAI,CAAC,WAAW,EAAE,KAAK,EAAE,GAAY,EAAE,GAAa,EAAE,EAAE;QAC1D,IAAI;YACF,MAAM,EAAE,QAAQ,EAAE,OAAO,EAAE,GAAG,GAAG,CAAC,IAA2B,CAAC;YAC9D,MAAM,OAAO,GAAG,MAAM,kBAAS,CAAC,QAAQ,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;YAC5D,GAAG,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;SACnB;QAAC,OAAO,KAAK,EAAE;YACd,GAAG,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,EAAE,OAAO,EAAE,0BAA0B,EAAE,CAAC,CAAC;SAC/D;IACH,CAAC,CAAC,CAAC;IAEH,MAAM,cAAc,GAAG,IAAA,8BAAoB,GAAE,CAAC;IAC9C,MAAM,cAAc,GAAG,GAAG,EAAE;QAC1B,MAAM,IAAI,GAAG,YAAE,CAAC,YAAY,CAAC,cAAc,EAAE,MAAM,CAAC,CAAC;QACrD,MAAM,QAAQ,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAClC,OAAO,QAAQ,CAAC,KAAK,CAAC;IACxB,CAAC,CAAC;IAEF,EAAE,CAAC,EAAE,CAAC,YAAY,EAAE,CAAC,MAAM,EAAE,EAAE;QAC7B,qDAAqD;QACrD,MAAM,CAAC,IAAI,CAAC,MAAM,EAAE,EAAE,KAAK,EAAE,cAAc,EAAE,EAAE,CAAC,CAAC;QAEjD,6DAA6D;QAC7D,YAAE,CAAC,KAAK,CACN,cAAc,EACd,IAAA,kBAAQ,EAAC,CAAC,KAAa,EAAE,EAAE;YACzB,IAAI,KAAK,KAAK,QAAQ,EAAE;gBACtB,MAAM,CAAC,IAAI,CAAC,QAAQ,EAAE,EAAE,KAAK,EAAE,cAAc,EAAE,EAAE,CAAC,CAAC;aACpD;QACH,CAAC,EAAE,GAAG,CAAC,CACR,CAAC;IACJ,CAAC,CAAC,CAAC;IAEH,UAAU,CAAC,MAAM,CAAC,IAAI,EAAE,GAAG,EAAE;QAC3B,MAAM,GAAG,GAAG,oBAAoB,IAAI,EAAE,CAAC;QACvC,mBAAM,CAAC,IAAI,CAAC,uBAAuB,GAAG,EAAE,CAAC,CAAC;QAE1C,MAAM,EAAE,GAAG,uBAAQ,CAAC,eAAe,CAAC;YAClC,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,MAAM,EAAE,OAAO,CAAC,MAAM;SACvB,CAAC,CAAC;QACH,EAAE,CAAC,QAAQ,CAAC,qDAAqD,EAAE,KAAK,EAAE,MAAM,EAAE,EAAE;YAClF,IAAI,MAAM,CAAC,WAAW,EAAE,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE;gBACxC,IAAI;oBACF,MAAM,IAAA,gBAAM,EAAC,GAAG,CAAC,CAAC;oBAClB,mBAAM,CAAC,IAAI,CAAC,uBAAuB,GAAG,EAAE,CAAC,CAAC;iBAC3C;gBAAC,OAAO,GAAG,EAAE;oBACZ,mBAAM,CAAC,KAAK,CAAC,2BAA2B,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;iBACxD;aACF;YACD,EAAE,CAAC,KAAK,EAAE,CAAC;YACX,mBAAM,CAAC,IAAI,CAAC,iCAAiC,CAAC,CAAC;QACjD,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC;AA7ED,oBA6EC"}
package/package.json CHANGED
@@ -1,10 +1,16 @@
1
1
  {
2
2
  "name": "promptfoo",
3
3
  "author": "Ian Webster",
4
- "version": "0.5.1",
4
+ "version": "0.6.0",
5
5
  "license": "MIT",
6
- "type": "module",
6
+ "type": "commonjs",
7
7
  "main": "dist/index.js",
8
+ "exports": {
9
+ ".": {
10
+ "import": "./dist/index.js",
11
+ "require": "./dist/index.js"
12
+ }
13
+ },
8
14
  "types": "dist/index.d.ts",
9
15
  "typings": "dist/index.d.ts",
10
16
  "files": [
@@ -35,9 +41,12 @@
35
41
  "@types/cors": "^2.8.13",
36
42
  "@types/debounce": "^1.2.1",
37
43
  "@types/express": "^4.17.17",
44
+ "@types/glob": "^8.1.0",
38
45
  "@types/jest": "^29.5.1",
39
46
  "@types/js-yaml": "^4.0.5",
47
+ "@types/node-fetch": "^2.6.4",
40
48
  "@types/nunjucks": "^3.2.2",
49
+ "@types/opener": "^1.4.0",
41
50
  "babel-jest": "^29.5.0",
42
51
  "jest-watch-typeahead": "^2.2.2",
43
52
  "prettier": "^2.8.8",
@@ -56,11 +65,12 @@
56
65
  "csv-stringify": "^6.3.2",
57
66
  "debounce": "^1.2.1",
58
67
  "express": "^4.18.2",
68
+ "glob": "^10.2.6",
59
69
  "js-yaml": "^4.1.0",
60
70
  "lru-cache": "^9.1.1",
61
- "node-fetch": "^3.3.1",
71
+ "node-fetch": "^2.6.7",
62
72
  "nunjucks": "^3.2.4",
63
- "open": "^9.1.0",
73
+ "opener": "^1.5.2",
64
74
  "socket.io": "^4.6.1",
65
75
  "winston": "^3.8.2"
66
76
  }
@@ -0,0 +1,154 @@
1
+ import nunjucks from 'nunjucks';
2
+
3
+ import { DefaultEmbeddingProvider, DefaultGradingProvider } from './providers/openai.js';
4
+ import { cosineSimilarity } from './util.js';
5
+ import { loadApiProvider } from './providers.js';
6
+ import { DEFAULT_GRADING_PROMPT } from './prompts.js';
7
+
8
+ import type { EvaluateOptions, GradingConfig, TokenUsage } from './types.js';
9
+
10
+ interface GradingResult {
11
+ pass: boolean;
12
+ reason: string;
13
+ tokensUsed: TokenUsage;
14
+ }
15
+
16
+ const SIMILAR_REGEX = /similar(?::|\((\d+(\.\d+)?)\):)/;
17
+
18
+ const DEFAULT_SEMANTIC_SIMILARITY_THRESHOLD = 0.8;
19
+
20
+ export async function matchesExpectedValue(
21
+ expected: string,
22
+ output: string,
23
+ options: EvaluateOptions,
24
+ ): Promise<{ pass: boolean; reason?: string }> {
25
+ const match = expected.match(SIMILAR_REGEX);
26
+
27
+ if (match) {
28
+ const threshold = parseFloat(match[1]) || DEFAULT_SEMANTIC_SIMILARITY_THRESHOLD;
29
+ const rest = expected.replace(SIMILAR_REGEX, '').trim();
30
+ return matchesSimilarity(rest, output, threshold);
31
+ } else if (expected.startsWith('fn:') || expected.startsWith('eval:')) {
32
+ // TODO(1.0): delete eval: legacy option
33
+ const sliceLength = expected.startsWith('fn:') ? 'fn:'.length : 'eval:'.length;
34
+ const functionBody = expected.slice(sliceLength);
35
+
36
+ const customFunction = new Function('output', `return ${functionBody}`);
37
+ return { pass: customFunction(output) };
38
+ } else if (expected.startsWith('grade:')) {
39
+ return matchesLlmRubric(expected.slice(6), output, options.grading);
40
+ } else {
41
+ const pass = expected === output;
42
+ return {
43
+ pass,
44
+ reason: pass ? undefined : `Expected: ${expected}, Output: ${output}`,
45
+ };
46
+ }
47
+ }
48
+
49
+ export async function matchesSimilarity(
50
+ expected: string,
51
+ output: string,
52
+ threshold: number,
53
+ ): Promise<GradingResult> {
54
+ const expectedEmbedding = await DefaultEmbeddingProvider.callEmbeddingApi(expected);
55
+ const outputEmbedding = await DefaultEmbeddingProvider.callEmbeddingApi(output);
56
+
57
+ const tokensUsed = {
58
+ total: (expectedEmbedding.tokenUsage?.total || 0) + (outputEmbedding.tokenUsage?.total || 0),
59
+ prompt: (expectedEmbedding.tokenUsage?.prompt || 0) + (outputEmbedding.tokenUsage?.prompt || 0),
60
+ completion:
61
+ (expectedEmbedding.tokenUsage?.completion || 0) +
62
+ (outputEmbedding.tokenUsage?.completion || 0),
63
+ };
64
+
65
+ if (expectedEmbedding.error || outputEmbedding.error) {
66
+ return {
67
+ pass: false,
68
+ reason:
69
+ expectedEmbedding.error || outputEmbedding.error || 'Unknown error fetching embeddings',
70
+ tokensUsed,
71
+ };
72
+ }
73
+
74
+ if (!expectedEmbedding.embedding || !outputEmbedding.embedding) {
75
+ return {
76
+ pass: false,
77
+ reason: 'Embedding not found',
78
+ tokensUsed,
79
+ };
80
+ }
81
+
82
+ const similarity = cosineSimilarity(expectedEmbedding.embedding, outputEmbedding.embedding);
83
+ if (similarity < threshold) {
84
+ return {
85
+ pass: false,
86
+ reason: `Similarity ${similarity} is less than threshold ${threshold}`,
87
+ tokensUsed,
88
+ };
89
+ }
90
+ return {
91
+ pass: true,
92
+ reason: `Similarity ${similarity} is greater than threshold ${threshold}`,
93
+ tokensUsed,
94
+ };
95
+ }
96
+
97
+ export async function matchesLlmRubric(
98
+ expected: string,
99
+ output: string,
100
+ options?: GradingConfig,
101
+ ): Promise<GradingResult> {
102
+ if (!options) {
103
+ throw new Error(
104
+ 'Cannot grade output without grading config. Specify --grader option or grading config.',
105
+ );
106
+ }
107
+
108
+ const prompt = nunjucks.renderString(options.prompt || DEFAULT_GRADING_PROMPT, {
109
+ content: output,
110
+ rubric: expected,
111
+ });
112
+
113
+ let provider = options.provider || DefaultGradingProvider;
114
+ if (typeof provider === 'string') {
115
+ provider = await loadApiProvider(provider);
116
+ }
117
+ const resp = await provider.callApi(prompt);
118
+ if (resp.error || !resp.output) {
119
+ return {
120
+ pass: false,
121
+ reason: resp.error || 'No output',
122
+ tokensUsed: {
123
+ total: resp.tokenUsage?.total || 0,
124
+ prompt: resp.tokenUsage?.prompt || 0,
125
+ completion: resp.tokenUsage?.completion || 0,
126
+ },
127
+ };
128
+ }
129
+
130
+ try {
131
+ const parsed = JSON.parse(resp.output) as GradingResult;
132
+ parsed.tokensUsed = {
133
+ total: resp.tokenUsage?.total || 0,
134
+ prompt: resp.tokenUsage?.prompt || 0,
135
+ completion: resp.tokenUsage?.completion || 0,
136
+ };
137
+ return parsed;
138
+ } catch (err) {
139
+ return {
140
+ pass: false,
141
+ reason: `Output is not valid JSON: ${resp.output}`,
142
+ tokensUsed: {
143
+ total: resp.tokenUsage?.total || 0,
144
+ prompt: resp.tokenUsage?.prompt || 0,
145
+ completion: resp.tokenUsage?.completion || 0,
146
+ },
147
+ };
148
+ }
149
+ }
150
+
151
+ export default {
152
+ matchesSimilarity,
153
+ matchesLlmRubric,
154
+ };
package/src/esm.ts CHANGED
@@ -1,10 +1,13 @@
1
1
  // esm-specific crap that needs to get mocked out in tests
2
2
 
3
- import path from 'path';
4
- import { fileURLToPath } from 'url';
3
+ //import path from 'path';
4
+ //import { fileURLToPath } from 'url';
5
5
 
6
6
  export function getDirectory(): string {
7
+ /*
7
8
  // @ts-ignore: Jest chokes on this
8
9
  const __filename = fileURLToPath(import.meta.url);
9
10
  return path.dirname(__filename);
11
+ */
12
+ return __dirname;
10
13
  }
package/src/evaluator.ts CHANGED
@@ -1,8 +1,11 @@
1
+ import readline from 'node:readline';
2
+
1
3
  import async from 'async';
4
+ import chalk from 'chalk';
2
5
  import nunjucks from 'nunjucks';
3
6
 
4
- import { DEFAULT_GRADING_PROMPT } from './prompts.js';
5
- import { DefaultEmbeddingProvider } from './providers/openai.js';
7
+ import logger from './logger.js';
8
+ import { matchesExpectedValue } from './assertions.js';
6
9
 
7
10
  import type { SingleBar } from 'cli-progress';
8
11
  import type {
@@ -13,9 +16,8 @@ import type {
13
16
  EvaluateSummary,
14
17
  EvaluateTable,
15
18
  Prompt,
16
- TokenUsage,
17
19
  } from './types.js';
18
- import { cosineSimilarity } from './util.js';
20
+ import { generatePrompts } from './suggestions.js';
19
21
 
20
22
  interface RunEvalOptions {
21
23
  provider: ApiProvider;
@@ -27,16 +29,8 @@ interface RunEvalOptions {
27
29
  colIndex: number;
28
30
  }
29
31
 
30
- interface GradingResult {
31
- pass: boolean;
32
- reason: string;
33
- tokensUsed: TokenUsage;
34
- }
35
-
36
32
  const DEFAULT_MAX_CONCURRENCY = 4;
37
33
 
38
- const SIMILAR_REGEX = /similar(?::|\((\d+(\.\d+)?)\):)/;
39
-
40
34
  class Evaluator {
41
35
  options: EvaluateOptions;
42
36
  stats: EvaluateStats;
@@ -54,128 +48,6 @@ class Evaluator {
54
48
  };
55
49
  }
56
50
 
57
- async gradeOutput(expected: string, output: string): Promise<GradingResult> {
58
- const { grading } = this.options;
59
-
60
- if (!grading) {
61
- throw new Error(
62
- 'Cannot grade output without grading config. Specify --grader option or grading config.',
63
- );
64
- }
65
-
66
- const prompt = nunjucks.renderString(grading.prompt || DEFAULT_GRADING_PROMPT, {
67
- content: output,
68
- rubric: expected,
69
- });
70
-
71
- const resp = await grading.provider.callApi(prompt);
72
- if (resp.error || !resp.output) {
73
- return {
74
- pass: false,
75
- reason: resp.error || 'No output',
76
- tokensUsed: {
77
- total: resp.tokenUsage?.total || 0,
78
- prompt: resp.tokenUsage?.prompt || 0,
79
- completion: resp.tokenUsage?.completion || 0,
80
- },
81
- };
82
- }
83
-
84
- try {
85
- const parsed = JSON.parse(resp.output) as GradingResult;
86
- parsed.tokensUsed = {
87
- total: resp.tokenUsage?.total || 0,
88
- prompt: resp.tokenUsage?.prompt || 0,
89
- completion: resp.tokenUsage?.completion || 0,
90
- };
91
- return parsed;
92
- } catch (err) {
93
- return {
94
- pass: false,
95
- reason: `Output is not valid JSON: ${resp.output}`,
96
- tokensUsed: {
97
- total: resp.tokenUsage?.total || 0,
98
- prompt: resp.tokenUsage?.prompt || 0,
99
- completion: resp.tokenUsage?.completion || 0,
100
- },
101
- };
102
- }
103
- }
104
-
105
- async checkSimilarity(
106
- expected: string,
107
- output: string,
108
- threshold: number,
109
- ): Promise<GradingResult> {
110
- const expectedEmbedding = await DefaultEmbeddingProvider.callEmbeddingApi(expected);
111
- const outputEmbedding = await DefaultEmbeddingProvider.callEmbeddingApi(output);
112
-
113
- const tokensUsed = {
114
- total: (expectedEmbedding.tokenUsage?.total || 0) + (outputEmbedding.tokenUsage?.total || 0),
115
- prompt:
116
- (expectedEmbedding.tokenUsage?.prompt || 0) + (outputEmbedding.tokenUsage?.prompt || 0),
117
- completion:
118
- (expectedEmbedding.tokenUsage?.completion || 0) +
119
- (outputEmbedding.tokenUsage?.completion || 0),
120
- };
121
-
122
- if (expectedEmbedding.error || outputEmbedding.error) {
123
- return {
124
- pass: false,
125
- reason:
126
- expectedEmbedding.error || outputEmbedding.error || 'Unknown error fetching embeddings',
127
- tokensUsed,
128
- };
129
- }
130
-
131
- if (!expectedEmbedding.embedding || !outputEmbedding.embedding) {
132
- return {
133
- pass: false,
134
- reason: 'Embedding not found',
135
- tokensUsed,
136
- };
137
- }
138
-
139
- const similarity = cosineSimilarity(expectedEmbedding.embedding, outputEmbedding.embedding);
140
- if (similarity < threshold) {
141
- return {
142
- pass: false,
143
- reason: `Similarity ${similarity} is less than threshold ${threshold}`,
144
- tokensUsed,
145
- };
146
- }
147
- return {
148
- pass: true,
149
- reason: `Similarity ${similarity} is greater than threshold ${threshold}`,
150
- tokensUsed,
151
- };
152
- }
153
-
154
- async checkExpectedValue(
155
- expected: string,
156
- output: string,
157
- ): Promise<{ pass: boolean; reason?: string }> {
158
- const match = expected.match(SIMILAR_REGEX);
159
-
160
- if (match) {
161
- const threshold = parseFloat(match[1]) || 0.8;
162
- const rest = expected.replace(SIMILAR_REGEX, '').trim();
163
- return this.checkSimilarity(rest, output, threshold);
164
- } else if (expected.startsWith('eval:')) {
165
- const evalBody = expected.slice(5);
166
- const evalFunction = new Function('output', `return ${evalBody}`);
167
- return { pass: evalFunction(output) };
168
- } else if (expected.startsWith('grade:')) {
169
- return this.gradeOutput(expected.slice(6), output);
170
- } else {
171
- const pass = expected === output;
172
- return {
173
- pass,
174
- reason: pass ? undefined : `Expected: ${expected}, Output: ${output}`,
175
- };
176
- }
177
- }
178
-
179
51
  async runEval({
180
52
  provider,
181
53
  prompt,
@@ -207,7 +79,7 @@ class Evaluator {
207
79
  ret.error = response.error;
208
80
  } else if (response.output) {
209
81
  const checkResult = vars.__expected
210
- ? await this.checkExpectedValue(vars.__expected, response.output)
82
+ ? await matchesExpectedValue(vars.__expected, response.output, this.options)
211
83
  : { pass: true };
212
84
  if (!checkResult.pass) {
213
85
  ret.error = checkResult.reason || `Expected: ${vars.__expected}`;
@@ -243,6 +115,48 @@ class Evaluator {
243
115
  const options = this.options;
244
116
  const prompts: Prompt[] = [];
245
117
 
118
+ if (options.prompt?.generateSuggestions) {
119
+ logger.info(`Generating prompt variations...`);
120
+ const { prompts: newPrompts, error } = await generatePrompts(options.prompts[0], 1);
121
+ if (error || !newPrompts) {
122
+ throw new Error(`Failed to generate prompts: ${error}`);
123
+ }
124
+
125
+ logger.info(chalk.blue('Generated prompts:'));
126
+ let numAdded = 0;
127
+ for (const prompt of newPrompts) {
128
+ logger.info('--------------------------------------------------------');
129
+ logger.info(`${prompt}`);
130
+ logger.info('--------------------------------------------------------');
131
+
132
+ // Ask the user if they want to continue
133
+ await new Promise((resolve) => {
134
+ const rl = readline.createInterface({
135
+ input: process.stdin,
136
+ output: process.stdout,
137
+ });
138
+ rl.question(
139
+ `${chalk.blue('Do you want to test this prompt?')} (y/N): `,
140
+ async (answer) => {
141
+ rl.close();
142
+ if (answer.toLowerCase().startsWith('y')) {
143
+ options.prompts.push(prompt);
144
+ numAdded++;
145
+ } else {
146
+ logger.info('Skipping this prompt.');
147
+ }
148
+ resolve(true);
149
+ },
150
+ );
151
+ });
152
+ }
153
+
154
+ if (numAdded < 1) {
155
+ logger.info(chalk.red('No prompts selected. Aborting.'));
156
+ process.exit(1);
157
+ }
158
+ }
159
+
246
160
  for (const promptContent of options.prompts) {
247
161
  for (const provider of options.providers) {
248
162
  const display =
@@ -255,16 +169,20 @@ class Evaluator {
255
169
  }
256
170
 
257
171
  const vars = options.vars && options.vars.length > 0 ? options.vars : [{}];
258
- const varsWithExpectedKeyRemoved = vars.map((v) => {
172
+ const varsWithSpecialColsRemoved = vars.map((v) => {
259
173
  const ret = { ...v };
260
- delete ret.__expected;
174
+ Object.keys(ret).forEach((key) => {
175
+ if (key.startsWith('__')) {
176
+ delete ret[key];
177
+ }
178
+ });
261
179
  return ret;
262
180
  });
263
181
  const isTest = vars[0].__expected;
264
182
  const table: EvaluateTable = {
265
183
  head: {
266
184
  prompts: prompts.map((p) => p.display),
267
- vars: Object.keys(varsWithExpectedKeyRemoved[0]),
185
+ vars: Object.keys(varsWithSpecialColsRemoved[0]),
268
186
  },
269
187
  body: [],
270
188
  };
@@ -292,11 +210,15 @@ class Evaluator {
292
210
  let rowIndex = 0;
293
211
  for (const row of vars) {
294
212
  let colIndex = 0;
213
+
214
+ const prependToPrompt = row.__prefix || options.prompt?.prefix || '';
215
+ const appendToPrompt = row.__suffix || options.prompt?.suffix || '';
216
+
295
217
  for (const promptContent of options.prompts) {
296
218
  for (const provider of options.providers) {
297
219
  runEvalOptions.push({
298
220
  provider,
299
- prompt: promptContent,
221
+ prompt: prependToPrompt + promptContent + appendToPrompt,
300
222
  vars: row,
301
223
  includeProviderId: options.providers.length > 1,
302
224
  rowIndex,
package/src/index.ts CHANGED
@@ -1,8 +1,12 @@
1
1
  import { evaluate as doEvaluate } from './evaluator.js';
2
2
  import { loadApiProvider } from './providers.js';
3
+ import assertions from './assertions.js';
4
+ import providers from './providers.js';
3
5
 
4
6
  import type { ApiProvider, EvaluateOptions, EvaluateSummary } from './types.js';
5
7
 
8
+ export * from './types.js';
9
+
6
10
  async function evaluate(
7
11
  providers: (string | ApiProvider)[] | (string | ApiProvider),
8
12
  options: Omit<EvaluateOptions, 'providers'>,
@@ -30,6 +34,14 @@ async function evaluate(
30
34
  });
31
35
  }
32
36
 
37
+ module.exports = {
38
+ evaluate,
39
+ assertions,
40
+ providers,
41
+ };
42
+
33
43
  export default {
34
44
  evaluate,
45
+ assertions,
46
+ providers,
35
47
  };