promptfoo 0.91.3 → 0.92.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181) hide show
  1. package/dist/drizzle/0006_harsh_caretaker.sql +42 -0
  2. package/dist/drizzle/0007_cloudy_wong.sql +1 -0
  3. package/dist/drizzle/meta/0006_snapshot.json +721 -0
  4. package/dist/drizzle/meta/0007_snapshot.json +723 -0
  5. package/dist/drizzle/meta/_journal.json +14 -0
  6. package/dist/package.json +10 -8
  7. package/dist/src/app/assets/{index-C6z1nbLN.js → index-BpjzEMiv.js} +243 -241
  8. package/dist/src/app/assets/{index.es-oqbvfIxR.js → index.es-ihzvEu35.js} +1 -1
  9. package/dist/src/app/assets/{sync-D2s75VlC.js → sync-BosjlpGJ.js} +1 -1
  10. package/dist/src/app/index.html +3 -3
  11. package/dist/src/assertions.js +2 -2
  12. package/dist/src/assertions.js.map +1 -1
  13. package/dist/src/commands/cache.d.ts.map +1 -1
  14. package/dist/src/commands/cache.js +0 -2
  15. package/dist/src/commands/cache.js.map +1 -1
  16. package/dist/src/commands/eval.d.ts.map +1 -1
  17. package/dist/src/commands/eval.js +19 -16
  18. package/dist/src/commands/eval.js.map +1 -1
  19. package/dist/src/commands/export.d.ts.map +1 -1
  20. package/dist/src/commands/export.js +8 -31
  21. package/dist/src/commands/export.js.map +1 -1
  22. package/dist/src/commands/import.d.ts.map +1 -1
  23. package/dist/src/commands/import.js +52 -13
  24. package/dist/src/commands/import.js.map +1 -1
  25. package/dist/src/commands/list.d.ts.map +1 -1
  26. package/dist/src/commands/list.js +35 -7
  27. package/dist/src/commands/list.js.map +1 -1
  28. package/dist/src/commands/share.d.ts +2 -2
  29. package/dist/src/commands/share.d.ts.map +1 -1
  30. package/dist/src/commands/share.js +12 -13
  31. package/dist/src/commands/share.js.map +1 -1
  32. package/dist/src/commands/show.d.ts.map +1 -1
  33. package/dist/src/commands/show.js +10 -6
  34. package/dist/src/commands/show.js.map +1 -1
  35. package/dist/src/constants.d.ts +1 -0
  36. package/dist/src/constants.d.ts.map +1 -1
  37. package/dist/src/constants.js +2 -1
  38. package/dist/src/constants.js.map +1 -1
  39. package/dist/src/database/index.js +1 -1
  40. package/dist/src/database/index.js.map +1 -1
  41. package/dist/src/database/tables.d.ts +609 -11
  42. package/dist/src/database/tables.d.ts.map +1 -1
  43. package/dist/src/database/tables.js +111 -52
  44. package/dist/src/database/tables.js.map +1 -1
  45. package/dist/src/database/types.d.ts +3 -3
  46. package/dist/src/database/types.d.ts.map +1 -1
  47. package/dist/src/evaluator.d.ts +3 -2
  48. package/dist/src/evaluator.d.ts.map +1 -1
  49. package/dist/src/evaluator.js +75 -104
  50. package/dist/src/evaluator.js.map +1 -1
  51. package/dist/src/evaluatorHelpers.d.ts.map +1 -1
  52. package/dist/src/evaluatorHelpers.js +2 -1
  53. package/dist/src/evaluatorHelpers.js.map +1 -1
  54. package/dist/src/index.d.ts +2 -1
  55. package/dist/src/index.d.ts.map +1 -1
  56. package/dist/src/index.js +18 -10
  57. package/dist/src/index.js.map +1 -1
  58. package/dist/src/models/eval.d.ts +95 -0
  59. package/dist/src/models/eval.d.ts.map +1 -0
  60. package/dist/src/models/eval.js +390 -0
  61. package/dist/src/models/eval.js.map +1 -0
  62. package/dist/src/models/evalResult.d.ts +50 -0
  63. package/dist/src/models/evalResult.d.ts.map +1 -0
  64. package/dist/src/models/evalResult.js +122 -0
  65. package/dist/src/models/evalResult.js.map +1 -0
  66. package/dist/src/models/provider.d.ts +9 -0
  67. package/dist/src/models/provider.d.ts.map +1 -0
  68. package/dist/src/models/provider.js +47 -0
  69. package/dist/src/models/provider.js.map +1 -0
  70. package/dist/src/prompts/index.d.ts.map +1 -1
  71. package/dist/src/prompts/index.js +2 -1
  72. package/dist/src/prompts/index.js.map +1 -1
  73. package/dist/src/prompts/utils.d.ts +1 -0
  74. package/dist/src/prompts/utils.d.ts.map +1 -1
  75. package/dist/src/prompts/utils.js +7 -0
  76. package/dist/src/prompts/utils.js.map +1 -1
  77. package/dist/src/providers/fal.d.ts +2 -2
  78. package/dist/src/providers/fal.d.ts.map +1 -1
  79. package/dist/src/providers/fal.js +2 -1
  80. package/dist/src/providers/fal.js.map +1 -1
  81. package/dist/src/providers/http.js +2 -2
  82. package/dist/src/providers/http.js.map +1 -1
  83. package/dist/src/providers/palm.d.ts +4 -3
  84. package/dist/src/providers/palm.d.ts.map +1 -1
  85. package/dist/src/providers/palm.js +13 -3
  86. package/dist/src/providers/palm.js.map +1 -1
  87. package/dist/src/providers.js +5 -5
  88. package/dist/src/providers.js.map +1 -1
  89. package/dist/src/redteam/eval/excessive-agency/llm_rubric-20240617.json +10 -0
  90. package/dist/src/redteam/eval/excessive-agency/llm_rubric-20240618.json +10 -0
  91. package/dist/src/redteam/eval/harmful/llm_rubric-20240723.json +10 -0
  92. package/dist/src/redteam/eval/harmful/llm_rubric-20240724.json +10 -0
  93. package/dist/src/server/server.d.ts +1 -0
  94. package/dist/src/server/server.d.ts.map +1 -1
  95. package/dist/src/server/server.js +70 -31
  96. package/dist/src/server/server.js.map +1 -1
  97. package/dist/src/share.d.ts +2 -2
  98. package/dist/src/share.d.ts.map +1 -1
  99. package/dist/src/share.js +93 -34
  100. package/dist/src/share.js.map +1 -1
  101. package/dist/src/table.d.ts +2 -2
  102. package/dist/src/table.d.ts.map +1 -1
  103. package/dist/src/table.js +3 -3
  104. package/dist/src/table.js.map +1 -1
  105. package/dist/src/types/index.d.ts +163 -11
  106. package/dist/src/types/index.d.ts.map +1 -1
  107. package/dist/src/types/index.js +21 -1
  108. package/dist/src/types/index.js.map +1 -1
  109. package/dist/src/util/config/load.d.ts.map +1 -1
  110. package/dist/src/util/config/load.js +2 -1
  111. package/dist/src/util/config/load.js.map +1 -1
  112. package/dist/src/util/config/manage.d.ts.map +1 -1
  113. package/dist/src/util/config/manage.js.map +1 -1
  114. package/dist/src/util/convertEvalResultsToTable.d.ts +16 -0
  115. package/dist/src/util/convertEvalResultsToTable.d.ts.map +1 -0
  116. package/dist/src/util/convertEvalResultsToTable.js +136 -0
  117. package/dist/src/util/convertEvalResultsToTable.js.map +1 -0
  118. package/dist/src/util/createHash.d.ts +1 -0
  119. package/dist/src/util/createHash.d.ts.map +1 -1
  120. package/dist/src/util/createHash.js +9 -0
  121. package/dist/src/util/createHash.js.map +1 -1
  122. package/dist/src/util/file.d.ts +8 -0
  123. package/dist/src/util/file.d.ts.map +1 -0
  124. package/dist/src/util/file.js +13 -0
  125. package/dist/src/util/file.js.map +1 -0
  126. package/dist/src/util/index.d.ts +9 -14
  127. package/dist/src/util/index.d.ts.map +1 -1
  128. package/dist/src/util/index.js +132 -268
  129. package/dist/src/util/index.js.map +1 -1
  130. package/dist/src/util/time.d.ts +2 -0
  131. package/dist/src/util/time.d.ts.map +1 -0
  132. package/dist/src/util/time.js +7 -0
  133. package/dist/src/util/time.js.map +1 -0
  134. package/dist/src/util/transform.js +2 -2
  135. package/dist/src/util/transform.js.map +1 -1
  136. package/dist/src/validators/providers.d.ts +6 -0
  137. package/dist/src/validators/providers.d.ts.map +1 -1
  138. package/dist/src/validators/providers.js +1 -0
  139. package/dist/src/validators/providers.js.map +1 -1
  140. package/dist/src/validators/redteam.d.ts +6 -0
  141. package/dist/src/validators/redteam.d.ts.map +1 -1
  142. package/dist/test/commands/eval/filterFailingTests.test.js +24 -2
  143. package/dist/test/commands/eval/filterFailingTests.test.js.map +1 -1
  144. package/dist/test/evaluator.test.js +152 -74
  145. package/dist/test/evaluator.test.js.map +1 -1
  146. package/dist/test/factories/data/eval/database_records.d.ts +142 -0
  147. package/dist/test/factories/data/eval/database_records.d.ts.map +1 -0
  148. package/dist/test/factories/data/eval/database_records.js +251 -0
  149. package/dist/test/factories/data/eval/database_records.js.map +1 -0
  150. package/dist/test/factories/evalFactory.d.ts +768 -0
  151. package/dist/test/factories/evalFactory.d.ts.map +1 -0
  152. package/dist/test/factories/evalFactory.js +121 -0
  153. package/dist/test/factories/evalFactory.js.map +1 -0
  154. package/dist/test/index.test.js +20 -35
  155. package/dist/test/index.test.js.map +1 -1
  156. package/dist/test/models/eval.test.d.ts +2 -0
  157. package/dist/test/models/eval.test.d.ts.map +1 -0
  158. package/dist/test/models/eval.test.js +34 -0
  159. package/dist/test/models/eval.test.js.map +1 -0
  160. package/dist/test/providers.test.js +3 -3
  161. package/dist/test/providers.test.js.map +1 -1
  162. package/dist/test/server/share.test.d.ts +2 -0
  163. package/dist/test/server/share.test.d.ts.map +1 -0
  164. package/dist/test/server/share.test.js +36 -0
  165. package/dist/test/server/share.test.js.map +1 -0
  166. package/dist/test/server/v3evalToShare.json +507 -0
  167. package/dist/test/server/v4evalToShare.json +421 -0
  168. package/dist/test/types.test.js +56 -3
  169. package/dist/test/types.test.js.map +1 -1
  170. package/dist/test/util.file.test.d.ts +2 -0
  171. package/dist/test/util.file.test.d.ts.map +1 -0
  172. package/dist/test/util.file.test.js +32 -0
  173. package/dist/test/util.file.test.js.map +1 -0
  174. package/dist/test/util.listPrevious.test.d.ts +2 -0
  175. package/dist/test/util.listPrevious.test.d.ts.map +1 -0
  176. package/dist/test/util.listPrevious.test.js +37 -0
  177. package/dist/test/util.listPrevious.test.js.map +1 -0
  178. package/dist/test/util.test.js +38 -311
  179. package/dist/test/util.test.js.map +1 -1
  180. package/dist/tsconfig.tsbuildinfo +1 -0
  181. package/package.json +10 -8
@@ -3,9 +3,12 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
3
3
  return (mod && mod.__esModule) ? mod : { "default": mod };
4
4
  };
5
5
  Object.defineProperty(exports, "__esModule", { value: true });
6
+ const crypto_1 = require("crypto");
6
7
  const glob_1 = __importDefault(require("glob"));
7
8
  const evaluator_1 = require("../src/evaluator");
8
9
  const evaluatorHelpers_1 = require("../src/evaluatorHelpers");
10
+ const migrate_1 = require("../src/migrate");
11
+ const eval_1 = __importDefault(require("../src/models/eval"));
9
12
  jest.mock('node-fetch', () => jest.fn());
10
13
  jest.mock('proxy-agent', () => ({
11
14
  ProxyAgent: jest.fn().mockImplementation(() => ({})),
@@ -13,21 +16,7 @@ jest.mock('proxy-agent', () => ({
13
16
  jest.mock('glob', () => ({
14
17
  globSync: jest.fn(),
15
18
  }));
16
- jest.mock('fs', () => ({
17
- readFileSync: jest.fn(),
18
- writeFileSync: jest.fn(),
19
- statSync: jest.fn(),
20
- readdirSync: jest.fn(),
21
- existsSync: jest.fn(),
22
- mkdirSync: jest.fn(),
23
- promises: {
24
- readFile: jest.fn(),
25
- },
26
- }));
27
19
  jest.mock('../src/esm');
28
- jest.mock('../src/database', () => ({
29
- getDb: jest.fn(),
30
- }));
31
20
  jest.mock('../src/logger');
32
21
  jest.mock('../src/evaluatorHelpers', () => ({
33
22
  ...jest.requireActual('../src/evaluatorHelpers'),
@@ -40,6 +29,13 @@ const mockApiProvider = {
40
29
  tokenUsage: { total: 10, prompt: 5, completion: 5, cached: 0 },
41
30
  }),
42
31
  };
32
+ const mockApiProvider2 = {
33
+ id: jest.fn().mockReturnValue('test-provider-2'),
34
+ callApi: jest.fn().mockResolvedValue({
35
+ output: 'Test output',
36
+ tokenUsage: { total: 10, prompt: 5, completion: 5, cached: 0 },
37
+ }),
38
+ };
43
39
  const mockGradingApiProviderPasses = {
44
40
  id: jest.fn().mockReturnValue('test-grading-provider'),
45
41
  callApi: jest.fn().mockResolvedValue({
@@ -58,6 +54,9 @@ function toPrompt(text) {
58
54
  return { raw: text, label: text };
59
55
  }
60
56
  describe('evaluator', () => {
57
+ beforeAll(async () => {
58
+ await (0, migrate_1.runDbMigrations)();
59
+ });
61
60
  beforeEach(() => {
62
61
  jest.clearAllMocks();
63
62
  });
@@ -74,7 +73,9 @@ describe('evaluator', () => {
74
73
  },
75
74
  ],
76
75
  };
77
- const summary = await (0, evaluator_1.evaluate)(testSuite, {});
76
+ const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
77
+ await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
78
+ const summary = await evalRecord.toEvaluateSummary();
78
79
  expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
79
80
  expect(summary.stats.successes).toBe(1);
80
81
  expect(summary.stats.failures).toBe(0);
@@ -93,7 +94,9 @@ describe('evaluator', () => {
93
94
  },
94
95
  ],
95
96
  };
96
- const summary = await (0, evaluator_1.evaluate)(testSuite, {});
97
+ const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
98
+ await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
99
+ const summary = await evalRecord.toEvaluateSummary();
97
100
  expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
98
101
  expect(summary.stats.successes).toBe(1);
99
102
  expect(summary.stats.failures).toBe(0);
@@ -112,7 +115,9 @@ describe('evaluator', () => {
112
115
  },
113
116
  ],
114
117
  };
115
- const summary = await (0, evaluator_1.evaluate)(testSuite, {});
118
+ const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
119
+ await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
120
+ const summary = await evalRecord.toEvaluateSummary();
116
121
  expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
117
122
  expect(summary.stats.successes).toBe(1);
118
123
  expect(summary.stats.failures).toBe(0);
@@ -131,7 +136,9 @@ describe('evaluator', () => {
131
136
  },
132
137
  ],
133
138
  };
134
- const summary = await (0, evaluator_1.evaluate)(testSuite, {});
139
+ const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
140
+ await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
141
+ const summary = await evalRecord.toEvaluateSummary();
135
142
  expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
136
143
  expect(summary.stats.successes).toBe(1);
137
144
  expect(summary.stats.failures).toBe(0);
@@ -150,7 +157,9 @@ describe('evaluator', () => {
150
157
  },
151
158
  ],
152
159
  };
153
- const summary = await (0, evaluator_1.evaluate)(testSuite, {});
160
+ const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
161
+ await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
162
+ const summary = await evalRecord.toEvaluateSummary();
154
163
  expect(mockApiProvider.callApi).toHaveBeenCalledTimes(4);
155
164
  expect(summary.stats.successes).toBe(4);
156
165
  expect(summary.stats.failures).toBe(0);
@@ -169,7 +178,9 @@ describe('evaluator', () => {
169
178
  },
170
179
  ],
171
180
  };
172
- const summary = await (0, evaluator_1.evaluate)(testSuite, {});
181
+ const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
182
+ await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
183
+ const summary = await evalRecord.toEvaluateSummary();
173
184
  expect(mockApiProvider.callApi).toHaveBeenCalledTimes(2);
174
185
  expect(summary.stats.successes).toBe(2);
175
186
  expect(summary.stats.failures).toBe(0);
@@ -183,7 +194,9 @@ describe('evaluator', () => {
183
194
  providers: [mockApiProvider],
184
195
  prompts: [toPrompt('Test prompt')],
185
196
  };
186
- const summary = await (0, evaluator_1.evaluate)(testSuite, {});
197
+ const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
198
+ await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
199
+ const summary = await evalRecord.toEvaluateSummary();
187
200
  expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
188
201
  expect(summary.stats.successes).toBe(1);
189
202
  expect(summary.stats.failures).toBe(0);
@@ -197,7 +210,9 @@ describe('evaluator', () => {
197
210
  providers: [mockApiProvider, mockApiProvider, mockApiProvider],
198
211
  prompts: [toPrompt('Test prompt')],
199
212
  };
200
- const summary = await (0, evaluator_1.evaluate)(testSuite, {});
213
+ const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
214
+ await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
215
+ const summary = await evalRecord.toEvaluateSummary();
201
216
  expect(mockApiProvider.callApi).toHaveBeenCalledTimes(3);
202
217
  expect(summary.stats.successes).toBe(3);
203
218
  expect(summary.stats.failures).toBe(0);
@@ -221,7 +236,9 @@ describe('evaluator', () => {
221
236
  },
222
237
  ],
223
238
  };
224
- const summary = await (0, evaluator_1.evaluate)(testSuite, {});
239
+ const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
240
+ await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
241
+ const summary = await evalRecord.toEvaluateSummary();
225
242
  expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
226
243
  expect(summary.stats.successes).toBe(1);
227
244
  expect(summary.stats.failures).toBe(0);
@@ -243,7 +260,9 @@ describe('evaluator', () => {
243
260
  },
244
261
  ],
245
262
  };
246
- const summary = await (0, evaluator_1.evaluate)(testSuite, {});
263
+ const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
264
+ await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
265
+ const summary = await evalRecord.toEvaluateSummary();
247
266
  expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
248
267
  expect(summary.stats.successes).toBe(0);
249
268
  expect(summary.stats.failures).toBe(1);
@@ -265,7 +284,9 @@ describe('evaluator', () => {
265
284
  },
266
285
  ],
267
286
  };
268
- const summary = await (0, evaluator_1.evaluate)(testSuite, {});
287
+ const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
288
+ await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
289
+ const summary = await evalRecord.toEvaluateSummary();
269
290
  expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
270
291
  expect(summary.stats.successes).toBe(1);
271
292
  expect(summary.stats.failures).toBe(0);
@@ -287,7 +308,9 @@ describe('evaluator', () => {
287
308
  },
288
309
  ],
289
310
  };
290
- const summary = await (0, evaluator_1.evaluate)(testSuite, {});
311
+ const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
312
+ await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
313
+ const summary = await evalRecord.toEvaluateSummary();
291
314
  expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
292
315
  expect(summary.stats.successes).toBe(0);
293
316
  expect(summary.stats.failures).toBe(1);
@@ -314,7 +337,9 @@ describe('evaluator', () => {
314
337
  },
315
338
  },
316
339
  };
317
- const summary = await (0, evaluator_1.evaluate)(testSuite, {});
340
+ const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
341
+ await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
342
+ const summary = await evalRecord.toEvaluateSummary();
318
343
  expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
319
344
  expect(summary.stats.successes).toBe(1);
320
345
  expect(summary.stats.failures).toBe(0);
@@ -341,7 +366,9 @@ describe('evaluator', () => {
341
366
  },
342
367
  },
343
368
  };
344
- const summary = await (0, evaluator_1.evaluate)(testSuite, {});
369
+ const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
370
+ await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
371
+ const summary = await evalRecord.toEvaluateSummary();
345
372
  expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
346
373
  expect(summary.stats.successes).toBe(0);
347
374
  expect(summary.stats.failures).toBe(1);
@@ -358,7 +385,9 @@ describe('evaluator', () => {
358
385
  },
359
386
  },
360
387
  };
361
- const summary = await (0, evaluator_1.evaluate)(testSuite, {});
388
+ const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
389
+ await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
390
+ const summary = await evalRecord.toEvaluateSummary();
362
391
  expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
363
392
  expect(summary.stats.successes).toBe(1);
364
393
  expect(summary.stats.failures).toBe(0);
@@ -382,7 +411,9 @@ describe('evaluator', () => {
382
411
  },
383
412
  ],
384
413
  };
385
- const summary = await (0, evaluator_1.evaluate)(testSuite, {});
414
+ const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
415
+ await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
416
+ const summary = await evalRecord.toEvaluateSummary();
386
417
  expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
387
418
  expect(summary.stats.successes).toBe(1);
388
419
  expect(summary.stats.failures).toBe(0);
@@ -413,7 +444,9 @@ describe('evaluator', () => {
413
444
  },
414
445
  ],
415
446
  };
416
- const summary = await (0, evaluator_1.evaluate)(testSuite, {});
447
+ const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
448
+ await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
449
+ const summary = await evalRecord.toEvaluateSummary();
417
450
  expect(mockApiJsonProvider.callApi).toHaveBeenCalledTimes(1);
418
451
  expect(summary.stats.successes).toBe(1);
419
452
  expect(summary.stats.failures).toBe(0);
@@ -443,7 +476,9 @@ describe('evaluator', () => {
443
476
  },
444
477
  ],
445
478
  };
446
- const summary = await (0, evaluator_1.evaluate)(testSuite, {});
479
+ const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
480
+ await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
481
+ const summary = await evalRecord.toEvaluateSummary();
447
482
  expect(mockApiProviderWithTransform.callApi).toHaveBeenCalledTimes(1);
448
483
  expect(summary.stats.successes).toBe(1);
449
484
  expect(summary.stats.failures).toBe(0);
@@ -470,7 +505,10 @@ describe('evaluator', () => {
470
505
  },
471
506
  },
472
507
  };
473
- await expect((0, evaluator_1.evaluate)(testSuite, {})).resolves.toEqual(expect.objectContaining({
508
+ const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
509
+ await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
510
+ const summary = await evalRecord.toEvaluateSummary();
511
+ expect(summary).toEqual(expect.objectContaining({
474
512
  stats: expect.objectContaining({
475
513
  successes: 2,
476
514
  failures: 0,
@@ -534,7 +572,10 @@ describe('evaluator', () => {
534
572
  },
535
573
  ],
536
574
  };
537
- await expect((0, evaluator_1.evaluate)(testSuite, {})).resolves.toEqual(expect.objectContaining({
575
+ const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
576
+ await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
577
+ const summary = await evalRecord.toEvaluateSummary();
578
+ expect(summary).toEqual(expect.objectContaining({
538
579
  stats: expect.objectContaining({
539
580
  successes: 1,
540
581
  failures: 0,
@@ -562,7 +603,9 @@ describe('evaluator', () => {
562
603
  },
563
604
  ],
564
605
  };
565
- const summary = await (0, evaluator_1.evaluate)(testSuite, {});
606
+ const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
607
+ await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
608
+ const summary = await evalRecord.toEvaluateSummary();
566
609
  expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
567
610
  expect(summary.stats.successes).toBe(1);
568
611
  expect(summary.stats.failures).toBe(0);
@@ -595,7 +638,9 @@ describe('evaluator', () => {
595
638
  },
596
639
  ],
597
640
  };
598
- const summary = await (0, evaluator_1.evaluate)(testSuite, {});
641
+ const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
642
+ await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
643
+ const summary = await evalRecord.toEvaluateSummary();
599
644
  expect(mockApiProvider.callApi).toHaveBeenCalledTimes(2);
600
645
  expect(summary).toMatchObject({
601
646
  stats: {
@@ -651,7 +696,9 @@ describe('evaluator', () => {
651
696
  },
652
697
  ],
653
698
  };
654
- const summary = await (0, evaluator_1.evaluate)(testSuite, {});
699
+ const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
700
+ await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
701
+ const summary = await evalRecord.toEvaluateSummary();
655
702
  expect(mockApiProvider.callApi).toHaveBeenCalledTimes(2);
656
703
  expect(summary.stats.successes).toBe(2);
657
704
  expect(summary.stats.failures).toBe(0);
@@ -706,7 +753,9 @@ describe('evaluator', () => {
706
753
  },
707
754
  ],
708
755
  };
709
- const summary = await (0, evaluator_1.evaluate)(testSuite, {});
756
+ const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
757
+ await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
758
+ const summary = await evalRecord.toEvaluateSummary();
710
759
  expect(mockApiProvider.callApi).toHaveBeenCalledTimes(4);
711
760
  expect(summary.stats.successes).toBe(4);
712
761
  expect(summary.stats.failures).toBe(0);
@@ -764,8 +813,10 @@ describe('evaluator', () => {
764
813
  },
765
814
  ],
766
815
  };
767
- const result = await (0, evaluator_1.evaluate)(testSuite, {});
768
- expect(result).toMatchObject({
816
+ const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
817
+ await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
818
+ const summary = await evalRecord.toEvaluateSummary();
819
+ expect(summary).toMatchObject({
769
820
  stats: {
770
821
  successes: 2,
771
822
  failures: 0,
@@ -787,7 +838,7 @@ describe('evaluator', () => {
787
838
  }),
788
839
  ]),
789
840
  });
790
- expect(result.table.body[0].test.metadata).toEqual({
841
+ expect(summary.results[0].testCase.metadata).toEqual({
791
842
  defaultKey: 'defaultValue',
792
843
  configKey: 'configValue',
793
844
  testKey: 'testValue',
@@ -807,8 +858,9 @@ describe('evaluator', () => {
807
858
  },
808
859
  ],
809
860
  };
810
- const summary = await (0, evaluator_1.evaluate)(testSuite, {});
811
- expect(summary.table.body[0].test.metadata).toEqual({
861
+ const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
862
+ await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
863
+ expect(evalRecord.results[0].testCase.metadata).toEqual({
812
864
  defaultKey: 'defaultValue',
813
865
  testKey: 'testValue',
814
866
  });
@@ -835,7 +887,9 @@ describe('evaluator', () => {
835
887
  },
836
888
  ],
837
889
  };
838
- const summary = await (0, evaluator_1.evaluate)(testSuite, {});
890
+ const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
891
+ await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
892
+ const summary = await evalRecord.toEvaluateSummary();
839
893
  expect(mockApiProvider.callApi).toHaveBeenCalledTimes(2);
840
894
  expect(summary.stats.successes).toBe(2);
841
895
  expect(summary.stats.failures).toBe(0);
@@ -875,7 +929,10 @@ describe('evaluator', () => {
875
929
  'unlabeled-provider-id': ['prompt2'],
876
930
  },
877
931
  };
878
- await expect((0, evaluator_1.evaluate)(testSuite, {})).resolves.toEqual(expect.objectContaining({
932
+ const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
933
+ await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
934
+ const summary = await evalRecord.toEvaluateSummary();
935
+ expect(summary).toMatchObject({
879
936
  stats: expect.objectContaining({
880
937
  successes: 2,
881
938
  failures: 0,
@@ -900,19 +957,15 @@ describe('evaluator', () => {
900
957
  }),
901
958
  }),
902
959
  ],
903
- table: expect.objectContaining({
904
- head: expect.objectContaining({
905
- prompts: [
906
- expect.objectContaining({
907
- provider: 'Labeled Provider',
908
- }),
909
- expect.objectContaining({
910
- provider: 'unlabeled-provider-id',
911
- }),
912
- ],
913
- }),
960
+ });
961
+ expect(evalRecord.prompts).toEqual(expect.arrayContaining([
962
+ expect.objectContaining({
963
+ provider: 'Labeled Provider',
914
964
  }),
915
- }));
965
+ expect.objectContaining({
966
+ provider: 'unlabeled-provider-id',
967
+ }),
968
+ ]));
916
969
  expect(mockLabeledProvider.callApi).toHaveBeenCalledTimes(1);
917
970
  expect(mockUnlabeledProvider.callApi).toHaveBeenCalledTimes(1);
918
971
  });
@@ -929,7 +982,9 @@ describe('evaluator', () => {
929
982
  },
930
983
  ],
931
984
  };
932
- const summary = await (0, evaluator_1.evaluate)(testSuite, {});
985
+ const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
986
+ await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
987
+ const summary = await evalRecord.toEvaluateSummary();
933
988
  expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
934
989
  expect(summary.stats.successes).toBe(1);
935
990
  expect(summary.stats.failures).toBe(0);
@@ -961,7 +1016,9 @@ describe('evaluator', () => {
961
1016
  },
962
1017
  ],
963
1018
  };
964
- const summary = await (0, evaluator_1.evaluate)(testSuite, {});
1019
+ const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
1020
+ await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
1021
+ const summary = await evalRecord.toEvaluateSummary();
965
1022
  expect(mockApiProviderWithTransform.callApi).toHaveBeenCalledTimes(1);
966
1023
  expect(summary.stats.successes).toBe(1);
967
1024
  expect(summary.stats.failures).toBe(0);
@@ -993,19 +1050,16 @@ describe('evaluator', () => {
993
1050
  },
994
1051
  ],
995
1052
  };
996
- await expect((0, evaluator_1.evaluate)(testSuite, {})).resolves.toEqual(expect.objectContaining({
1053
+ const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
1054
+ await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
1055
+ const summary = await evalRecord.toEvaluateSummary();
1056
+ expect(summary).toMatchObject({
997
1057
  stats: expect.objectContaining({
998
1058
  successes: 1,
999
1059
  failures: 0,
1000
1060
  }),
1001
- results: expect.arrayContaining([
1002
- expect.objectContaining({
1003
- response: expect.objectContaining({
1004
- output: 'Postprocess: Provider: Original output',
1005
- }),
1006
- }),
1007
- ]),
1008
- }));
1061
+ });
1062
+ expect(summary.results[0].response?.output).toBe('Postprocess: Provider: Original output');
1009
1063
  expect(mockApiProviderWithTransform.callApi).toHaveBeenCalledTimes(1);
1010
1064
  });
1011
1065
  it('evaluate with provider transform, test transform, and test postprocess (deprecated)', async () => {
@@ -1035,7 +1089,10 @@ describe('evaluator', () => {
1035
1089
  },
1036
1090
  ],
1037
1091
  };
1038
- await expect((0, evaluator_1.evaluate)(testSuite, {})).resolves.toEqual(expect.objectContaining({
1092
+ const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
1093
+ await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
1094
+ const summary = await evalRecord.toEvaluateSummary();
1095
+ expect(summary).toMatchObject({
1039
1096
  stats: expect.objectContaining({
1040
1097
  successes: 1,
1041
1098
  failures: 0,
@@ -1047,7 +1104,7 @@ describe('evaluator', () => {
1047
1104
  }),
1048
1105
  }),
1049
1106
  ]),
1050
- }));
1107
+ });
1051
1108
  expect(mockApiProviderWithTransform.callApi).toHaveBeenCalledTimes(1);
1052
1109
  });
1053
1110
  it('evaluate with no output', async () => {
@@ -1063,7 +1120,9 @@ describe('evaluator', () => {
1063
1120
  prompts: [toPrompt('Test prompt')],
1064
1121
  tests: [],
1065
1122
  };
1066
- const summary = await (0, evaluator_1.evaluate)(testSuite, {});
1123
+ const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
1124
+ await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
1125
+ const summary = await evalRecord.toEvaluateSummary();
1067
1126
  expect(summary.stats.successes).toBe(0);
1068
1127
  expect(summary.stats.failures).toBe(1);
1069
1128
  expect(summary.results[0].error).toBe('No output');
@@ -1084,7 +1143,9 @@ describe('evaluator', () => {
1084
1143
  prompts: [toPrompt('Test prompt')],
1085
1144
  tests: [],
1086
1145
  };
1087
- const summary = await (0, evaluator_1.evaluate)(testSuite, {});
1146
+ const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
1147
+ await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
1148
+ const summary = await evalRecord.toEvaluateSummary();
1088
1149
  expect(summary.stats.successes).toBe(1);
1089
1150
  expect(summary.stats.failures).toBe(0);
1090
1151
  expect(summary.results[0].success).toBe(true);
@@ -1126,7 +1187,9 @@ describe('evaluator', () => {
1126
1187
  ],
1127
1188
  tests: [{ vars: { problem: '8x + 31 = 2' } }],
1128
1189
  };
1129
- const summary = await (0, evaluator_1.evaluate)(testSuite, {});
1190
+ const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
1191
+ await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
1192
+ const summary = await evalRecord.toEvaluateSummary();
1130
1193
  expect(summary.stats.successes).toBe(1);
1131
1194
  expect(summary.stats.failures).toBe(0);
1132
1195
  expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
@@ -1167,7 +1230,8 @@ describe('evaluator', () => {
1167
1230
  };
1168
1231
  const mockedRunExtensionHook = jest.mocked(evaluatorHelpers_1.runExtensionHook);
1169
1232
  mockedRunExtensionHook.mockClear();
1170
- await (0, evaluator_1.evaluate)(testSuite, {});
1233
+ const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
1234
+ await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
1171
1235
  // Check if runExtensionHook was called 4 times (beforeAll, beforeEach, afterEach, afterAll)
1172
1236
  expect(mockedRunExtensionHook).toHaveBeenCalledTimes(4);
1173
1237
  // Check beforeAll call
@@ -1207,6 +1271,20 @@ describe('evaluator', () => {
1207
1271
  suite: testSuite,
1208
1272
  }));
1209
1273
  });
1274
+ it('should handle multiple providers', async () => {
1275
+ const testSuite = {
1276
+ providers: [mockApiProvider, mockApiProvider2],
1277
+ prompts: [toPrompt('Test prompt')],
1278
+ tests: [],
1279
+ };
1280
+ const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
1281
+ await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
1282
+ const summary = await evalRecord.toEvaluateSummary();
1283
+ expect(summary.stats.successes).toBe(2);
1284
+ expect(summary.stats.failures).toBe(0);
1285
+ expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
1286
+ expect(mockApiProvider2.callApi).toHaveBeenCalledTimes(1);
1287
+ });
1210
1288
  });
1211
1289
  describe('generateVarCombinations', () => {
1212
1290
  it('should generate combinations for simple variables', () => {