promptfoo 0.91.3 → 0.92.1

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (181)
  1. package/dist/drizzle/0006_harsh_caretaker.sql +42 -0
  2. package/dist/drizzle/0007_cloudy_wong.sql +1 -0
  3. package/dist/drizzle/meta/0006_snapshot.json +721 -0
  4. package/dist/drizzle/meta/0007_snapshot.json +723 -0
  5. package/dist/drizzle/meta/_journal.json +14 -0
  6. package/dist/package.json +10 -8
  7. package/dist/src/app/assets/{index-C6z1nbLN.js → index-BpjzEMiv.js} +243 -241
  8. package/dist/src/app/assets/{index.es-oqbvfIxR.js → index.es-ihzvEu35.js} +1 -1
  9. package/dist/src/app/assets/{sync-D2s75VlC.js → sync-BosjlpGJ.js} +1 -1
  10. package/dist/src/app/index.html +3 -3
  11. package/dist/src/assertions.js +2 -2
  12. package/dist/src/assertions.js.map +1 -1
  13. package/dist/src/commands/cache.d.ts.map +1 -1
  14. package/dist/src/commands/cache.js +0 -2
  15. package/dist/src/commands/cache.js.map +1 -1
  16. package/dist/src/commands/eval.d.ts.map +1 -1
  17. package/dist/src/commands/eval.js +19 -16
  18. package/dist/src/commands/eval.js.map +1 -1
  19. package/dist/src/commands/export.d.ts.map +1 -1
  20. package/dist/src/commands/export.js +8 -31
  21. package/dist/src/commands/export.js.map +1 -1
  22. package/dist/src/commands/import.d.ts.map +1 -1
  23. package/dist/src/commands/import.js +52 -13
  24. package/dist/src/commands/import.js.map +1 -1
  25. package/dist/src/commands/list.d.ts.map +1 -1
  26. package/dist/src/commands/list.js +35 -7
  27. package/dist/src/commands/list.js.map +1 -1
  28. package/dist/src/commands/share.d.ts +2 -2
  29. package/dist/src/commands/share.d.ts.map +1 -1
  30. package/dist/src/commands/share.js +12 -13
  31. package/dist/src/commands/share.js.map +1 -1
  32. package/dist/src/commands/show.d.ts.map +1 -1
  33. package/dist/src/commands/show.js +10 -6
  34. package/dist/src/commands/show.js.map +1 -1
  35. package/dist/src/constants.d.ts +1 -0
  36. package/dist/src/constants.d.ts.map +1 -1
  37. package/dist/src/constants.js +2 -1
  38. package/dist/src/constants.js.map +1 -1
  39. package/dist/src/database/index.js +1 -1
  40. package/dist/src/database/index.js.map +1 -1
  41. package/dist/src/database/tables.d.ts +609 -11
  42. package/dist/src/database/tables.d.ts.map +1 -1
  43. package/dist/src/database/tables.js +111 -52
  44. package/dist/src/database/tables.js.map +1 -1
  45. package/dist/src/database/types.d.ts +3 -3
  46. package/dist/src/database/types.d.ts.map +1 -1
  47. package/dist/src/evaluator.d.ts +3 -2
  48. package/dist/src/evaluator.d.ts.map +1 -1
  49. package/dist/src/evaluator.js +75 -104
  50. package/dist/src/evaluator.js.map +1 -1
  51. package/dist/src/evaluatorHelpers.d.ts.map +1 -1
  52. package/dist/src/evaluatorHelpers.js +2 -1
  53. package/dist/src/evaluatorHelpers.js.map +1 -1
  54. package/dist/src/index.d.ts +2 -1
  55. package/dist/src/index.d.ts.map +1 -1
  56. package/dist/src/index.js +18 -10
  57. package/dist/src/index.js.map +1 -1
  58. package/dist/src/models/eval.d.ts +95 -0
  59. package/dist/src/models/eval.d.ts.map +1 -0
  60. package/dist/src/models/eval.js +390 -0
  61. package/dist/src/models/eval.js.map +1 -0
  62. package/dist/src/models/evalResult.d.ts +50 -0
  63. package/dist/src/models/evalResult.d.ts.map +1 -0
  64. package/dist/src/models/evalResult.js +122 -0
  65. package/dist/src/models/evalResult.js.map +1 -0
  66. package/dist/src/models/provider.d.ts +9 -0
  67. package/dist/src/models/provider.d.ts.map +1 -0
  68. package/dist/src/models/provider.js +47 -0
  69. package/dist/src/models/provider.js.map +1 -0
  70. package/dist/src/prompts/index.d.ts.map +1 -1
  71. package/dist/src/prompts/index.js +2 -1
  72. package/dist/src/prompts/index.js.map +1 -1
  73. package/dist/src/prompts/utils.d.ts +1 -0
  74. package/dist/src/prompts/utils.d.ts.map +1 -1
  75. package/dist/src/prompts/utils.js +7 -0
  76. package/dist/src/prompts/utils.js.map +1 -1
  77. package/dist/src/providers/fal.d.ts +2 -2
  78. package/dist/src/providers/fal.d.ts.map +1 -1
  79. package/dist/src/providers/fal.js +2 -1
  80. package/dist/src/providers/fal.js.map +1 -1
  81. package/dist/src/providers/http.js +2 -2
  82. package/dist/src/providers/http.js.map +1 -1
  83. package/dist/src/providers/palm.d.ts +4 -3
  84. package/dist/src/providers/palm.d.ts.map +1 -1
  85. package/dist/src/providers/palm.js +13 -3
  86. package/dist/src/providers/palm.js.map +1 -1
  87. package/dist/src/providers.js +5 -5
  88. package/dist/src/providers.js.map +1 -1
  89. package/dist/src/redteam/eval/excessive-agency/llm_rubric-20240617.json +10 -0
  90. package/dist/src/redteam/eval/excessive-agency/llm_rubric-20240618.json +10 -0
  91. package/dist/src/redteam/eval/harmful/llm_rubric-20240723.json +10 -0
  92. package/dist/src/redteam/eval/harmful/llm_rubric-20240724.json +10 -0
  93. package/dist/src/server/server.d.ts +1 -0
  94. package/dist/src/server/server.d.ts.map +1 -1
  95. package/dist/src/server/server.js +70 -31
  96. package/dist/src/server/server.js.map +1 -1
  97. package/dist/src/share.d.ts +2 -2
  98. package/dist/src/share.d.ts.map +1 -1
  99. package/dist/src/share.js +93 -34
  100. package/dist/src/share.js.map +1 -1
  101. package/dist/src/table.d.ts +2 -2
  102. package/dist/src/table.d.ts.map +1 -1
  103. package/dist/src/table.js +3 -3
  104. package/dist/src/table.js.map +1 -1
  105. package/dist/src/types/index.d.ts +163 -11
  106. package/dist/src/types/index.d.ts.map +1 -1
  107. package/dist/src/types/index.js +21 -1
  108. package/dist/src/types/index.js.map +1 -1
  109. package/dist/src/util/config/load.d.ts.map +1 -1
  110. package/dist/src/util/config/load.js +2 -1
  111. package/dist/src/util/config/load.js.map +1 -1
  112. package/dist/src/util/config/manage.d.ts.map +1 -1
  113. package/dist/src/util/config/manage.js.map +1 -1
  114. package/dist/src/util/convertEvalResultsToTable.d.ts +16 -0
  115. package/dist/src/util/convertEvalResultsToTable.d.ts.map +1 -0
  116. package/dist/src/util/convertEvalResultsToTable.js +136 -0
  117. package/dist/src/util/convertEvalResultsToTable.js.map +1 -0
  118. package/dist/src/util/createHash.d.ts +1 -0
  119. package/dist/src/util/createHash.d.ts.map +1 -1
  120. package/dist/src/util/createHash.js +9 -0
  121. package/dist/src/util/createHash.js.map +1 -1
  122. package/dist/src/util/file.d.ts +8 -0
  123. package/dist/src/util/file.d.ts.map +1 -0
  124. package/dist/src/util/file.js +13 -0
  125. package/dist/src/util/file.js.map +1 -0
  126. package/dist/src/util/index.d.ts +9 -14
  127. package/dist/src/util/index.d.ts.map +1 -1
  128. package/dist/src/util/index.js +132 -268
  129. package/dist/src/util/index.js.map +1 -1
  130. package/dist/src/util/time.d.ts +2 -0
  131. package/dist/src/util/time.d.ts.map +1 -0
  132. package/dist/src/util/time.js +7 -0
  133. package/dist/src/util/time.js.map +1 -0
  134. package/dist/src/util/transform.js +2 -2
  135. package/dist/src/util/transform.js.map +1 -1
  136. package/dist/src/validators/providers.d.ts +6 -0
  137. package/dist/src/validators/providers.d.ts.map +1 -1
  138. package/dist/src/validators/providers.js +1 -0
  139. package/dist/src/validators/providers.js.map +1 -1
  140. package/dist/src/validators/redteam.d.ts +6 -0
  141. package/dist/src/validators/redteam.d.ts.map +1 -1
  142. package/dist/test/commands/eval/filterFailingTests.test.js +24 -2
  143. package/dist/test/commands/eval/filterFailingTests.test.js.map +1 -1
  144. package/dist/test/evaluator.test.js +152 -74
  145. package/dist/test/evaluator.test.js.map +1 -1
  146. package/dist/test/factories/data/eval/database_records.d.ts +142 -0
  147. package/dist/test/factories/data/eval/database_records.d.ts.map +1 -0
  148. package/dist/test/factories/data/eval/database_records.js +251 -0
  149. package/dist/test/factories/data/eval/database_records.js.map +1 -0
  150. package/dist/test/factories/evalFactory.d.ts +768 -0
  151. package/dist/test/factories/evalFactory.d.ts.map +1 -0
  152. package/dist/test/factories/evalFactory.js +121 -0
  153. package/dist/test/factories/evalFactory.js.map +1 -0
  154. package/dist/test/index.test.js +20 -35
  155. package/dist/test/index.test.js.map +1 -1
  156. package/dist/test/models/eval.test.d.ts +2 -0
  157. package/dist/test/models/eval.test.d.ts.map +1 -0
  158. package/dist/test/models/eval.test.js +34 -0
  159. package/dist/test/models/eval.test.js.map +1 -0
  160. package/dist/test/providers.test.js +3 -3
  161. package/dist/test/providers.test.js.map +1 -1
  162. package/dist/test/server/share.test.d.ts +2 -0
  163. package/dist/test/server/share.test.d.ts.map +1 -0
  164. package/dist/test/server/share.test.js +36 -0
  165. package/dist/test/server/share.test.js.map +1 -0
  166. package/dist/test/server/v3evalToShare.json +507 -0
  167. package/dist/test/server/v4evalToShare.json +421 -0
  168. package/dist/test/types.test.js +56 -3
  169. package/dist/test/types.test.js.map +1 -1
  170. package/dist/test/util.file.test.d.ts +2 -0
  171. package/dist/test/util.file.test.d.ts.map +1 -0
  172. package/dist/test/util.file.test.js +32 -0
  173. package/dist/test/util.file.test.js.map +1 -0
  174. package/dist/test/util.listPrevious.test.d.ts +2 -0
  175. package/dist/test/util.listPrevious.test.d.ts.map +1 -0
  176. package/dist/test/util.listPrevious.test.js +37 -0
  177. package/dist/test/util.listPrevious.test.js.map +1 -0
  178. package/dist/test/util.test.js +38 -311
  179. package/dist/test/util.test.js.map +1 -1
  180. package/dist/tsconfig.tsbuildinfo +1 -0
  181. package/package.json +10 -8
@@ -26,7 +26,6 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
26
26
  return (mod && mod.__esModule) ? mod : { "default": mod };
27
27
  };
28
28
  Object.defineProperty(exports, "__esModule", { value: true });
29
- exports.isJavascriptFile = isJavascriptFile;
30
29
  exports.writeOutput = writeOutput;
31
30
  exports.writeMultipleOutputs = writeMultipleOutputs;
32
31
  exports.readOutput = readOutput;
@@ -39,7 +38,6 @@ exports.filenameToDate = filenameToDate;
39
38
  exports.dateToFilename = dateToFilename;
40
39
  exports.readResult_fileSystem = readResult_fileSystem;
41
40
  exports.migrateResultsFromFileSystemToDatabase = migrateResultsFromFileSystemToDatabase;
42
- exports.cleanupOldFileResults = cleanupOldFileResults;
43
41
  exports.readResult = readResult;
44
42
  exports.updateResult = updateResult;
45
43
  exports.getLatestEval = getLatestEval;
@@ -88,21 +86,14 @@ const accounts_1 = require("../globalConfig/accounts");
88
86
  const googleSheets_1 = require("../googleSheets");
89
87
  const logger_1 = __importDefault(require("../logger"));
90
88
  const migrate_1 = require("../migrate");
89
+ const eval_1 = __importStar(require("../models/eval"));
91
90
  const prompt_1 = require("../models/prompt");
92
91
  const types_1 = require("../types");
93
92
  const manage_1 = require("./config/manage");
94
93
  const createHash_1 = require("./createHash");
94
+ const file_1 = require("./file");
95
95
  const templates_1 = require("./templates");
96
96
  const DEFAULT_QUERY_LIMIT = 100;
97
- /**
98
- * Checks if a file is a JavaScript or TypeScript file based on its extension.
99
- *
100
- * @param filePath - The path of the file to check.
101
- * @returns True if the file has a JavaScript or TypeScript extension, false otherwise.
102
- */
103
- function isJavascriptFile(filePath) {
104
- return /\.(js|cjs|mjs|ts|cts|mts)$/.test(filePath);
105
- }
106
97
  const outputToSimpleString = (output) => {
107
98
  const passFailText = output.pass ? '[PASS]' : '[FAIL]';
108
99
  const namedScoresText = Object.entries(output.namedScores)
@@ -122,14 +113,16 @@ const outputToSimpleString = (output) => {
122
113
  ${gradingResultText}
123
114
  `.trim();
124
115
  };
125
- async function writeOutput(outputPath, evalId, results, config, shareableUrl) {
116
+ async function writeOutput(outputPath, evalRecord, shareableUrl) {
117
+ const table = await evalRecord.getTable();
118
+ (0, tiny_invariant_1.default)(table, 'Table is required');
126
119
  if (outputPath.match(/^https:\/\/docs\.google\.com\/spreadsheets\//)) {
127
- const rows = results.table.body.map((row) => {
120
+ const rows = table.body.map((row) => {
128
121
  const csvRow = {};
129
- results.table.head.vars.forEach((varName, index) => {
122
+ table.head.vars.forEach((varName, index) => {
130
123
  csvRow[varName] = row.vars[index];
131
124
  });
132
- results.table.head.prompts.forEach((prompt, index) => {
125
+ table.head.prompts.forEach((prompt, index) => {
133
126
  csvRow[prompt.label] = outputToSimpleString(row.outputs[index]);
134
127
  });
135
128
  return csvRow;
@@ -148,38 +141,51 @@ async function writeOutput(outputPath, evalId, results, config, shareableUrl) {
148
141
  if (outputExtension === 'csv') {
149
142
  const csvOutput = (0, sync_1.stringify)([
150
143
  [
151
- ...results.table.head.vars,
152
- ...results.table.head.prompts.map((prompt) => `[${prompt.provider}] ${prompt.label}`),
144
+ ...table.head.vars,
145
+ ...table.head.prompts.map((prompt) => `[${prompt.provider}] ${prompt.label}`),
153
146
  ],
154
- ...results.table.body.map((row) => [...row.vars, ...row.outputs.map(outputToSimpleString)]),
147
+ ...table.body.map((row) => [...row.vars, ...row.outputs.map(outputToSimpleString)]),
155
148
  ]);
156
149
  fs.writeFileSync(outputPath, csvOutput);
157
150
  }
158
151
  else if (outputExtension === 'json') {
159
- fs.writeFileSync(outputPath, JSON.stringify({ evalId, results, config, shareableUrl }, null, 2));
152
+ const summary = await evalRecord.toEvaluateSummary();
153
+ fs.writeFileSync(outputPath, JSON.stringify({
154
+ evalId: evalRecord.id,
155
+ results: summary,
156
+ config: evalRecord.config,
157
+ shareableUrl,
158
+ }, null, 2));
160
159
  }
161
160
  else if (outputExtension === 'yaml' || outputExtension === 'yml' || outputExtension === 'txt') {
162
- fs.writeFileSync(outputPath, js_yaml_1.default.dump({ results, config, shareableUrl }));
161
+ const summary = await evalRecord.toEvaluateSummary();
162
+ fs.writeFileSync(outputPath, js_yaml_1.default.dump({
163
+ evalId: evalRecord.id,
164
+ results: summary,
165
+ config: evalRecord.config,
166
+ shareableUrl,
167
+ }));
163
168
  }
164
169
  else if (outputExtension === 'html') {
170
+ const summary = await evalRecord.toEvaluateSummary();
165
171
  const template = fs.readFileSync(`${(0, esm_1.getDirectory)()}/tableOutput.html`, 'utf-8');
166
- const table = [
172
+ const htmlTable = [
167
173
  [
168
- ...results.table.head.vars,
169
- ...results.table.head.prompts.map((prompt) => `[${prompt.provider}] ${prompt.label}`),
174
+ ...table.head.vars,
175
+ ...table.head.prompts.map((prompt) => `[${prompt.provider}] ${prompt.label}`),
170
176
  ],
171
- ...results.table.body.map((row) => [...row.vars, ...row.outputs.map(outputToSimpleString)]),
177
+ ...table.body.map((row) => [...row.vars, ...row.outputs.map(outputToSimpleString)]),
172
178
  ];
173
179
  const htmlOutput = (0, templates_1.getNunjucksEngine)().renderString(template, {
174
- config,
175
- table,
176
- results: results.results,
180
+ config: evalRecord.config,
181
+ table: htmlTable,
182
+ results: summary,
177
183
  });
178
184
  fs.writeFileSync(outputPath, htmlOutput);
179
185
  }
180
186
  }
181
- async function writeMultipleOutputs(outputPaths, evalId, results, config, shareableUrl) {
182
- await Promise.all(outputPaths.map((outputPath) => writeOutput(outputPath, evalId, results, config, shareableUrl)));
187
+ async function writeMultipleOutputs(outputPaths, evalRecord, shareableUrl) {
188
+ await Promise.all(outputPaths.map((outputPath) => writeOutput(outputPath, evalRecord, shareableUrl)));
183
189
  }
184
190
  async function readOutput(outputPath) {
185
191
  const ext = path.parse(outputPath).ext.slice(1);
@@ -197,13 +203,13 @@ async function readOutput(outputPath) {
197
203
  function getLatestResultsPath() {
198
204
  return path.join((0, manage_1.getConfigDirectoryPath)(), 'output', 'latest.json');
199
205
  }
200
- async function writeResultsToDatabase(results, config, createdAt) {
206
+ async function writeResultsToDatabase(results, config, createdAt = new Date()) {
201
207
  createdAt = createdAt || (results.timestamp ? new Date(results.timestamp) : new Date());
202
- const evalId = `eval-${createdAt.toISOString().slice(0, 19)}`;
208
+ const evalId = (0, eval_1.createEvalId)(createdAt);
203
209
  const db = (0, database_1.getDb)();
204
210
  const promises = [];
205
211
  promises.push(db
206
- .insert(tables_1.evals)
212
+ .insert(tables_1.evalsTable)
207
213
  .values({
208
214
  id: evalId,
209
215
  createdAt: createdAt.getTime(),
@@ -216,11 +222,12 @@ async function writeResultsToDatabase(results, config, createdAt) {
216
222
  .run());
217
223
  logger_1.default.debug(`Inserting eval ${evalId}`);
218
224
  // Record prompt relation
225
+ (0, tiny_invariant_1.default)(results.table, 'Table is required');
219
226
  for (const prompt of results.table.head.prompts) {
220
227
  const label = prompt.label || prompt.display || prompt.raw;
221
- const promptId = prompt.id || (0, prompt_1.generateIdFromPrompt)(prompt);
228
+ const promptId = (0, prompt_1.generateIdFromPrompt)(prompt);
222
229
  promises.push(db
223
- .insert(tables_1.prompts)
230
+ .insert(tables_1.promptsTable)
224
231
  .values({
225
232
  id: promptId,
226
233
  prompt: label,
@@ -228,7 +235,7 @@ async function writeResultsToDatabase(results, config, createdAt) {
228
235
  .onConflictDoNothing()
229
236
  .run());
230
237
  promises.push(db
231
- .insert(tables_1.evalsToPrompts)
238
+ .insert(tables_1.evalsToPromptsTable)
232
239
  .values({
233
240
  evalId,
234
241
  promptId,
@@ -240,7 +247,7 @@ async function writeResultsToDatabase(results, config, createdAt) {
240
247
  // Record dataset relation
241
248
  const datasetId = (0, createHash_1.sha256)(JSON.stringify(config.tests || []));
242
249
  promises.push(db
243
- .insert(tables_1.datasets)
250
+ .insert(tables_1.datasetsTable)
244
251
  .values({
245
252
  id: datasetId,
246
253
  tests: config.tests,
@@ -248,7 +255,7 @@ async function writeResultsToDatabase(results, config, createdAt) {
248
255
  .onConflictDoNothing()
249
256
  .run());
250
257
  promises.push(db
251
- .insert(tables_1.evalsToDatasets)
258
+ .insert(tables_1.evalsToDatasetsTable)
252
259
  .values({
253
260
  evalId,
254
261
  datasetId,
@@ -261,7 +268,7 @@ async function writeResultsToDatabase(results, config, createdAt) {
261
268
  for (const [tagKey, tagValue] of Object.entries(config.tags)) {
262
269
  const tagId = (0, createHash_1.sha256)(`${tagKey}:${tagValue}`);
263
270
  promises.push(db
264
- .insert(tables_1.tags)
271
+ .insert(tables_1.tagsTable)
265
272
  .values({
266
273
  id: tagId,
267
274
  name: tagKey,
@@ -270,7 +277,7 @@ async function writeResultsToDatabase(results, config, createdAt) {
270
277
  .onConflictDoNothing()
271
278
  .run());
272
279
  promises.push(db
273
- .insert(tables_1.evalsToTags)
280
+ .insert(tables_1.evalsToTagsTable)
274
281
  .values({
275
282
  evalId,
276
283
  tagId,
@@ -297,21 +304,21 @@ async function writeResultsToDatabase(results, config, createdAt) {
297
304
  *
298
305
  * @returns Last n evals in descending order.
299
306
  */
300
- function listPreviousResults(limit = DEFAULT_QUERY_LIMIT, filterDescription, datasetId) {
307
+ async function listPreviousResults(limit = DEFAULT_QUERY_LIMIT, filterDescription, datasetId) {
301
308
  const db = (0, database_1.getDb)();
302
309
  const startTime = performance.now();
303
310
  const query = db
304
311
  .select({
305
- evalId: tables_1.evals.id,
306
- createdAt: tables_1.evals.createdAt,
307
- description: tables_1.evals.description,
308
- numTests: (0, drizzle_orm_1.sql) `json_array_length(${tables_1.evals.results}->'table'->'body')`,
309
- datasetId: tables_1.evalsToDatasets.datasetId,
312
+ evalId: tables_1.evalsTable.id,
313
+ createdAt: tables_1.evalsTable.createdAt,
314
+ description: tables_1.evalsTable.description,
315
+ numTests: (0, drizzle_orm_1.sql) `json_array_length(${tables_1.evalsTable.results}->'table'->'body')`,
316
+ datasetId: tables_1.evalsToDatasetsTable.datasetId,
310
317
  })
311
- .from(tables_1.evals)
312
- .leftJoin(tables_1.evalsToDatasets, (0, drizzle_orm_1.eq)(tables_1.evals.id, tables_1.evalsToDatasets.evalId))
313
- .where((0, drizzle_orm_1.and)(datasetId ? (0, drizzle_orm_1.eq)(tables_1.evalsToDatasets.datasetId, datasetId) : undefined, filterDescription ? (0, drizzle_orm_1.like)(tables_1.evals.description, `%${filterDescription}%`) : undefined));
314
- const results = query.orderBy((0, drizzle_orm_1.desc)(tables_1.evals.createdAt)).limit(limit).all();
318
+ .from(tables_1.evalsTable)
319
+ .leftJoin(tables_1.evalsToDatasetsTable, (0, drizzle_orm_1.eq)(tables_1.evalsTable.id, tables_1.evalsToDatasetsTable.evalId))
320
+ .where((0, drizzle_orm_1.and)(datasetId ? (0, drizzle_orm_1.eq)(tables_1.evalsToDatasetsTable.datasetId, datasetId) : undefined, filterDescription ? (0, drizzle_orm_1.like)(tables_1.evalsTable.description, `%${filterDescription}%`) : undefined, (0, drizzle_orm_1.not)((0, drizzle_orm_1.eq)(tables_1.evalsTable.results, {}))));
321
+ const results = query.orderBy((0, drizzle_orm_1.desc)(tables_1.evalsTable.createdAt)).limit(limit).all();
315
322
  const mappedResults = results.map((result) => ({
316
323
  evalId: result.evalId,
317
324
  createdAt: result.createdAt,
@@ -321,8 +328,10 @@ function listPreviousResults(limit = DEFAULT_QUERY_LIMIT, filterDescription, dat
321
328
  }));
322
329
  const endTime = performance.now();
323
330
  const executionTime = endTime - startTime;
331
+ const evalResults = await (0, eval_1.getSummaryofLatestEvals)(undefined, filterDescription, datasetId);
324
332
  logger_1.default.debug(`listPreviousResults execution time: ${executionTime.toFixed(2)}ms`);
325
- return mappedResults;
333
+ const combinedResults = [...evalResults, ...mappedResults];
334
+ return combinedResults;
326
335
  }
327
336
  /**
328
337
  * @deprecated Used only for migration to sqlite
@@ -409,100 +418,19 @@ function readResult_fileSystem(name) {
409
418
  logger_1.default.error(`Failed to read results from ${resultsPath}:\n${err}`);
410
419
  }
411
420
  }
412
- let attemptedMigration = false;
413
421
  async function migrateResultsFromFileSystemToDatabase() {
414
- if (attemptedMigration) {
415
- // TODO(ian): Record this bit in the database.
416
- return;
417
- }
418
422
  // First run db migrations
419
423
  logger_1.default.debug('Running db migrations...');
420
424
  await (0, migrate_1.runDbMigrations)();
421
- const fileNames = listPreviousResultFilenames_fileSystem();
422
- if (fileNames.length === 0) {
423
- return;
424
- }
425
- logger_1.default.info(`🔁 Migrating ${fileNames.length} flat files to local database.`);
426
- logger_1.default.info('This is a one-time operation and may take a minute...');
427
- attemptedMigration = true;
428
- const outputDir = path.join((0, manage_1.getConfigDirectoryPath)(true /* createIfNotExists */), 'output');
429
- const backupDir = `${outputDir}-backup-${new Date()
430
- .toISOString()
431
- .slice(0, 10)
432
- .replace(/-/g, '')}`;
433
- try {
434
- fs.cpSync(outputDir, backupDir, { recursive: true });
435
- logger_1.default.info(`Backup of output directory created at ${backupDir}`);
436
- }
437
- catch (backupError) {
438
- logger_1.default.error(`Failed to create backup of output directory: ${backupError}`);
439
- return;
440
- }
441
- logger_1.default.info('Moving files into database...');
442
- const migrationPromises = fileNames.map(async (fileName) => {
443
- const fileData = readResult_fileSystem(fileName);
444
- if (fileData) {
445
- await writeResultsToDatabase(fileData.result.results, fileData.result.config, filenameToDate(fileName));
446
- logger_1.default.debug(`Migrated ${fileName} to database.`);
447
- try {
448
- fs.unlinkSync(path.join(outputDir, fileName));
449
- }
450
- catch (err) {
451
- logger_1.default.warn(`Failed to delete ${fileName} after migration: ${err}`);
452
- }
453
- }
454
- else {
455
- logger_1.default.warn(`Failed to migrate result ${fileName} due to read error.`);
456
- }
457
- });
458
- await Promise.all(migrationPromises);
459
- try {
460
- fs.unlinkSync(getLatestResultsPath());
461
- }
462
- catch (err) {
463
- logger_1.default.warn(`Failed to delete latest.json: ${err}`);
464
- }
465
- logger_1.default.info('Migration complete. Please restart your web server if it is running.');
466
- }
467
- const RESULT_HISTORY_LENGTH = (0, envars_1.getEnvInt)('RESULT_HISTORY_LENGTH', DEFAULT_QUERY_LIMIT);
468
- function cleanupOldFileResults(remaining = RESULT_HISTORY_LENGTH) {
469
- const sortedFilenames = listPreviousResultFilenames_fileSystem();
470
- for (let i = 0; i < sortedFilenames.length - remaining; i++) {
471
- fs.unlinkSync(path.join((0, manage_1.getConfigDirectoryPath)(), 'output', sortedFilenames[i]));
472
- }
473
425
  }
474
426
  async function readResult(id) {
475
- const db = (0, database_1.getDb)();
476
427
  try {
477
- const evalResult = await db
478
- .select({
479
- id: tables_1.evals.id,
480
- createdAt: tables_1.evals.createdAt,
481
- author: tables_1.evals.author,
482
- results: tables_1.evals.results,
483
- config: tables_1.evals.config,
484
- datasetId: tables_1.evalsToDatasets.datasetId,
485
- })
486
- .from(tables_1.evals)
487
- .leftJoin(tables_1.evalsToDatasets, (0, drizzle_orm_1.eq)(tables_1.evals.id, tables_1.evalsToDatasets.evalId))
488
- .where((0, drizzle_orm_1.eq)(tables_1.evals.id, id))
489
- .execute();
490
- if (evalResult.length === 0) {
491
- return undefined;
492
- }
493
- const { id: resultId, createdAt, results, config, author, datasetId } = evalResult[0];
494
- const result = {
495
- version: 3,
496
- createdAt: new Date(createdAt).toISOString().slice(0, 10),
497
- author,
498
- results,
499
- config,
500
- datasetId,
501
- };
428
+ const eval_ = await eval_1.default.findById(id);
429
+ (0, tiny_invariant_1.default)(eval_, `Eval with ID ${id} not found.`);
502
430
  return {
503
- id: resultId,
504
- result,
505
- createdAt: new Date(createdAt),
431
+ id,
432
+ result: await eval_.toResultsFile(),
433
+ createdAt: new Date(eval_.createdAt),
506
434
  };
507
435
  }
508
436
  catch (err) {
@@ -510,38 +438,20 @@ async function readResult(id) {
510
438
  }
511
439
  }
512
440
  async function updateResult(id, newConfig, newTable) {
513
- const db = (0, database_1.getDb)();
514
441
  try {
515
442
  // Fetch the existing eval data from the database
516
- const existingEval = await db
517
- .select({
518
- config: tables_1.evals.config,
519
- results: tables_1.evals.results,
520
- })
521
- .from(tables_1.evals)
522
- .where((0, drizzle_orm_1.eq)(tables_1.evals.id, id))
523
- .limit(1)
524
- .all();
525
- if (existingEval.length === 0) {
443
+ const existingEval = await eval_1.default.findById(id);
444
+ if (!existingEval) {
526
445
  logger_1.default.error(`Eval with ID ${id} not found.`);
527
446
  return;
528
447
  }
529
- const evalData = existingEval[0];
530
448
  if (newConfig) {
531
- evalData.config = newConfig;
449
+ existingEval.config = newConfig;
532
450
  }
533
451
  if (newTable) {
534
- evalData.results.table = newTable;
452
+ existingEval.setTable(newTable);
535
453
  }
536
- await db
537
- .update(tables_1.evals)
538
- .set({
539
- description: evalData.config.description,
540
- config: evalData.config,
541
- results: evalData.results,
542
- })
543
- .where((0, drizzle_orm_1.eq)(tables_1.evals.id, id))
544
- .run();
454
+ await existingEval.save();
545
455
  logger_1.default.info(`Updated eval with ID ${id}`);
546
456
  }
547
457
  catch (err) {
@@ -549,61 +459,18 @@ async function updateResult(id, newConfig, newTable) {
549
459
  }
550
460
  }
551
461
  async function getLatestEval(filterDescription) {
552
- const db = (0, database_1.getDb)();
553
- let latestResults = await db
554
- .select({
555
- id: tables_1.evals.id,
556
- createdAt: tables_1.evals.createdAt,
557
- author: tables_1.evals.author,
558
- description: tables_1.evals.description,
559
- results: tables_1.evals.results,
560
- config: tables_1.evals.config,
561
- })
562
- .from(tables_1.evals)
563
- .orderBy((0, drizzle_orm_1.desc)(tables_1.evals.createdAt))
564
- .limit(1);
565
- if (filterDescription) {
566
- const regex = new RegExp(filterDescription, 'i');
567
- latestResults = latestResults.filter((result) => regex.test(result.description || ''));
568
- }
569
- if (!latestResults.length) {
570
- return undefined;
571
- }
572
- const latestResult = latestResults[0];
573
- return {
574
- version: 3,
575
- createdAt: new Date(latestResult.createdAt).toISOString(),
576
- author: latestResult.author,
577
- results: latestResult.results,
578
- config: latestResult.config,
579
- };
462
+ const eval_ = await eval_1.default.latest();
463
+ return await eval_?.toResultsFile();
580
464
  }
581
465
  async function getPromptsWithPredicate(predicate, limit) {
582
466
  // TODO(ian): Make this use a proper database query
583
- const db = (0, database_1.getDb)();
584
- const evals_ = await db
585
- .select({
586
- id: tables_1.evals.id,
587
- createdAt: tables_1.evals.createdAt,
588
- author: tables_1.evals.author,
589
- results: tables_1.evals.results,
590
- config: tables_1.evals.config,
591
- })
592
- .from(tables_1.evals)
593
- .limit(limit)
594
- .all();
467
+ const evals_ = await eval_1.default.getMany(limit);
595
468
  const groupedPrompts = {};
596
469
  for (const eval_ of evals_) {
597
470
  const createdAt = new Date(eval_.createdAt).toISOString();
598
- const resultWrapper = {
599
- version: 3,
600
- createdAt,
601
- author: eval_.author,
602
- results: eval_.results,
603
- config: eval_.config,
604
- };
471
+ const resultWrapper = await eval_.toResultsFile();
605
472
  if (predicate(resultWrapper)) {
606
- for (const prompt of resultWrapper.results.table.head.prompts) {
473
+ for (const prompt of eval_.getPrompts()) {
607
474
  const promptId = (0, createHash_1.sha256)(prompt.raw);
608
475
  const datasetId = resultWrapper.config.tests
609
476
  ? (0, createHash_1.sha256)(JSON.stringify(resultWrapper.config.tests))
@@ -651,29 +518,11 @@ function getPromptsForTestCases(testCases) {
651
518
  return getPromptsForTestCasesHash(testCasesSha256);
652
519
  }
653
520
  async function getTestCasesWithPredicate(predicate, limit) {
654
- const db = (0, database_1.getDb)();
655
- const evals_ = await db
656
- .select({
657
- id: tables_1.evals.id,
658
- createdAt: tables_1.evals.createdAt,
659
- author: tables_1.evals.author,
660
- results: tables_1.evals.results,
661
- config: tables_1.evals.config,
662
- })
663
- .from(tables_1.evals)
664
- .orderBy((0, drizzle_orm_1.desc)(tables_1.evals.createdAt))
665
- .limit(limit)
666
- .all();
521
+ const evals_ = await eval_1.default.getMany(limit);
667
522
  const groupedTestCases = {};
668
523
  for (const eval_ of evals_) {
669
524
  const createdAt = new Date(eval_.createdAt).toISOString();
670
- const resultWrapper = {
671
- version: 3,
672
- createdAt,
673
- author: eval_.author,
674
- results: eval_.results,
675
- config: eval_.config,
676
- };
525
+ const resultWrapper = await eval_.toResultsFile();
677
526
  const testCases = resultWrapper.config.tests;
678
527
  if (testCases && predicate(resultWrapper)) {
679
528
  const evalId = eval_.id;
@@ -681,7 +530,7 @@ async function getTestCasesWithPredicate(predicate, limit) {
681
530
  if (datasetId in groupedTestCases) {
682
531
  groupedTestCases[datasetId].recentEvalDate = new Date(Math.max(groupedTestCases[datasetId].recentEvalDate.getTime(), eval_.createdAt));
683
532
  groupedTestCases[datasetId].count += 1;
684
- const newPrompts = resultWrapper.results.table.head.prompts.map((prompt) => ({
533
+ const newPrompts = eval_.getPrompts().map((prompt) => ({
685
534
  id: (0, createHash_1.sha256)(prompt.raw),
686
535
  prompt,
687
536
  evalId,
@@ -695,7 +544,7 @@ async function getTestCasesWithPredicate(predicate, limit) {
695
544
  groupedTestCases[datasetId].prompts = Object.values(promptsById);
696
545
  }
697
546
  else {
698
- const newPrompts = resultWrapper.results.table.head.prompts.map((prompt) => ({
547
+ const newPrompts = eval_.getPrompts().map((prompt) => ({
699
548
  id: (0, createHash_1.sha256)(prompt.raw),
700
549
  prompt,
701
550
  evalId,
@@ -747,15 +596,15 @@ async function getEvalsWithPredicate(predicate, limit) {
747
596
  const db = (0, database_1.getDb)();
748
597
  const evals_ = await db
749
598
  .select({
750
- id: tables_1.evals.id,
751
- createdAt: tables_1.evals.createdAt,
752
- author: tables_1.evals.author,
753
- results: tables_1.evals.results,
754
- config: tables_1.evals.config,
755
- description: tables_1.evals.description,
599
+ id: tables_1.evalsTable.id,
600
+ createdAt: tables_1.evalsTable.createdAt,
601
+ author: tables_1.evalsTable.author,
602
+ results: tables_1.evalsTable.results,
603
+ config: tables_1.evalsTable.config,
604
+ description: tables_1.evalsTable.description,
756
605
  })
757
- .from(tables_1.evals)
758
- .orderBy((0, drizzle_orm_1.desc)(tables_1.evals.createdAt))
606
+ .from(tables_1.evalsTable)
607
+ .orderBy((0, drizzle_orm_1.desc)(tables_1.evalsTable.createdAt))
759
608
  .limit(limit)
760
609
  .all();
761
610
  const ret = [];
@@ -765,6 +614,7 @@ async function getEvalsWithPredicate(predicate, limit) {
765
614
  version: 3,
766
615
  createdAt,
767
616
  author: eval_.author,
617
+ // @ts-ignore
768
618
  results: eval_.results,
769
619
  config: eval_.config,
770
620
  };
@@ -774,6 +624,7 @@ async function getEvalsWithPredicate(predicate, limit) {
774
624
  id: evalId,
775
625
  date: new Date(eval_.createdAt),
776
626
  config: eval_.config,
627
+ // @ts-ignore
777
628
  results: eval_.results,
778
629
  description: eval_.description || undefined,
779
630
  });
@@ -797,10 +648,13 @@ async function deleteEval(evalId) {
797
648
  const db = (0, database_1.getDb)();
798
649
  await db.transaction(async () => {
799
650
  // We need to clean up foreign keys first. We don't have onDelete: 'cascade' set on all these relationships.
800
- await db.delete(tables_1.evalsToPrompts).where((0, drizzle_orm_1.eq)(tables_1.evalsToPrompts.evalId, evalId)).run();
801
- await db.delete(tables_1.evalsToDatasets).where((0, drizzle_orm_1.eq)(tables_1.evalsToDatasets.evalId, evalId)).run();
651
+ await db.delete(tables_1.evalsToPromptsTable).where((0, drizzle_orm_1.eq)(tables_1.evalsToPromptsTable.evalId, evalId)).run();
652
+ await db.delete(tables_1.evalsToDatasetsTable).where((0, drizzle_orm_1.eq)(tables_1.evalsToDatasetsTable.evalId, evalId)).run();
653
+ await db.delete(tables_1.evalsToTagsTable).where((0, drizzle_orm_1.eq)(tables_1.evalsToTagsTable.evalId, evalId)).run();
654
+ await db.delete(tables_1.evalResultsTable).where((0, drizzle_orm_1.eq)(tables_1.evalResultsTable.evalId, evalId)).run();
655
+ await db.delete(tables_1.evalsToProvidersTable).where((0, drizzle_orm_1.eq)(tables_1.evalsToProvidersTable.evalId, evalId)).run();
802
656
  // Finally, delete the eval record
803
- const deletedIds = await db.delete(tables_1.evals).where((0, drizzle_orm_1.eq)(tables_1.evals.id, evalId)).run();
657
+ const deletedIds = await db.delete(tables_1.evalsTable).where((0, drizzle_orm_1.eq)(tables_1.evalsTable.id, evalId)).run();
804
658
  if (deletedIds.changes === 0) {
805
659
  throw new Error(`Eval with ID ${evalId} not found`);
806
660
  }
@@ -814,10 +668,10 @@ async function deleteEval(evalId) {
814
668
  async function deleteAllEvals() {
815
669
  const db = (0, database_1.getDb)();
816
670
  await db.transaction(async (tx) => {
817
- await tx.delete(tables_1.evalsToPrompts).run();
818
- await tx.delete(tables_1.evalsToDatasets).run();
819
- await tx.delete(tables_1.evalsToTags).run();
820
- await tx.delete(tables_1.evals).run();
671
+ await tx.delete(tables_1.evalsToPromptsTable).run();
672
+ await tx.delete(tables_1.evalsToDatasetsTable).run();
673
+ await tx.delete(tables_1.evalsToTagsTable).run();
674
+ await tx.delete(tables_1.evalsTable).run();
821
675
  });
822
676
  }
823
677
  async function readFilters(filters, basePath = '') {
@@ -848,7 +702,7 @@ function setupEnv(envPath) {
848
702
  }
849
703
  }
850
704
  const standaloneEvalCache = new node_cache_1.default({ stdTTL: 60 * 60 * 2 }); // Cache for 2 hours
851
- function getStandaloneEvals({ limit = DEFAULT_QUERY_LIMIT, tag, description, } = {}) {
705
+ async function getStandaloneEvals({ limit = DEFAULT_QUERY_LIMIT, tag, description, } = {}) {
852
706
  const cacheKey = `standalone_evals_${limit}_${tag?.key}_${tag?.value}`;
853
707
  const cachedResult = standaloneEvalCache.get(cacheKey);
854
708
  if (cachedResult) {
@@ -857,35 +711,45 @@ function getStandaloneEvals({ limit = DEFAULT_QUERY_LIMIT, tag, description, } =
857
711
  const db = (0, database_1.getDb)();
858
712
  const results = db
859
713
  .select({
860
- evalId: tables_1.evals.id,
861
- description: tables_1.evals.description,
862
- results: tables_1.evals.results,
863
- createdAt: tables_1.evals.createdAt,
864
- promptId: tables_1.evalsToPrompts.promptId,
865
- datasetId: tables_1.evalsToDatasets.datasetId,
866
- tagName: tables_1.tags.name,
867
- tagValue: tables_1.tags.value,
714
+ evalId: tables_1.evalsTable.id,
715
+ description: tables_1.evalsTable.description,
716
+ results: tables_1.evalsTable.results,
717
+ createdAt: tables_1.evalsTable.createdAt,
718
+ promptId: tables_1.evalsToPromptsTable.promptId,
719
+ datasetId: tables_1.evalsToDatasetsTable.datasetId,
720
+ tagName: tables_1.tagsTable.name,
721
+ tagValue: tables_1.tagsTable.value,
868
722
  isRedteam: (0, drizzle_orm_1.sql) `json_extract(evals.config, '$.redteam') IS NOT NULL`.as('isRedteam'),
869
723
  })
870
- .from(tables_1.evals)
871
- .leftJoin(tables_1.evalsToPrompts, (0, drizzle_orm_1.eq)(tables_1.evals.id, tables_1.evalsToPrompts.evalId))
872
- .leftJoin(tables_1.evalsToDatasets, (0, drizzle_orm_1.eq)(tables_1.evals.id, tables_1.evalsToDatasets.evalId))
873
- .leftJoin(tables_1.evalsToTags, (0, drizzle_orm_1.eq)(tables_1.evals.id, tables_1.evalsToTags.evalId))
874
- .leftJoin(tables_1.tags, (0, drizzle_orm_1.eq)(tables_1.evalsToTags.tagId, tables_1.tags.id))
875
- .where((0, drizzle_orm_1.and)(tag ? (0, drizzle_orm_1.and)((0, drizzle_orm_1.eq)(tables_1.tags.name, tag.key), (0, drizzle_orm_1.eq)(tables_1.tags.value, tag.value)) : undefined, description ? (0, drizzle_orm_1.eq)(tables_1.evals.description, description) : undefined))
876
- .orderBy((0, drizzle_orm_1.desc)(tables_1.evals.createdAt))
724
+ .from(tables_1.evalsTable)
725
+ .leftJoin(tables_1.evalsToPromptsTable, (0, drizzle_orm_1.eq)(tables_1.evalsTable.id, tables_1.evalsToPromptsTable.evalId))
726
+ .leftJoin(tables_1.evalsToDatasetsTable, (0, drizzle_orm_1.eq)(tables_1.evalsTable.id, tables_1.evalsToDatasetsTable.evalId))
727
+ .leftJoin(tables_1.evalsToTagsTable, (0, drizzle_orm_1.eq)(tables_1.evalsTable.id, tables_1.evalsToTagsTable.evalId))
728
+ .leftJoin(tables_1.tagsTable, (0, drizzle_orm_1.eq)(tables_1.evalsToTagsTable.tagId, tables_1.tagsTable.id))
729
+ .where((0, drizzle_orm_1.and)(tag ? (0, drizzle_orm_1.and)((0, drizzle_orm_1.eq)(tables_1.tagsTable.name, tag.key), (0, drizzle_orm_1.eq)(tables_1.tagsTable.value, tag.value)) : undefined, description ? (0, drizzle_orm_1.eq)(tables_1.evalsTable.description, description) : undefined))
730
+ .orderBy((0, drizzle_orm_1.desc)(tables_1.evalsTable.createdAt))
877
731
  .limit(limit)
878
732
  .all();
879
- const standaloneEvals = results.flatMap((result) => {
880
- const { description, createdAt, evalId, promptId, datasetId, results: { table }, isRedteam, } = result;
881
- return table.head.prompts.map((col, index) => {
733
+ const standaloneEvals = (await Promise.all(results.map(async (result) => {
734
+ const { description, createdAt, evalId, promptId, datasetId,
735
+ // @ts-ignore
736
+ isRedteam, } = result;
737
+ const eval_ = await eval_1.default.findById(evalId);
738
+ (0, tiny_invariant_1.default)(eval_, `Eval with ID ${evalId} not found`);
739
+ const table = (await eval_.getTable()) || { body: [] };
740
+ // @ts-ignore
741
+ return eval_.getPrompts().map((col, index) => {
882
742
  // Compute some stats
883
- const pluginCounts = table.body.reduce((acc, row) => {
743
+ const pluginCounts = table.body.reduce(
744
+ // @ts-ignore
745
+ (acc, row) => {
884
746
  const pluginId = row.test.metadata?.pluginId;
885
747
  if (pluginId) {
886
748
  const isPass = row.outputs[index].pass;
887
- acc.pluginPassCount[pluginId] = (acc.pluginPassCount[pluginId] || 0) + (isPass ? 1 : 0);
888
- acc.pluginFailCount[pluginId] = (acc.pluginFailCount[pluginId] || 0) + (isPass ? 0 : 1);
749
+ acc.pluginPassCount[pluginId] =
750
+ (acc.pluginPassCount[pluginId] || 0) + (isPass ? 1 : 0);
751
+ acc.pluginFailCount[pluginId] =
752
+ (acc.pluginFailCount[pluginId] || 0) + (isPass ? 0 : 1);
889
753
  }
890
754
  return acc;
891
755
  }, { pluginPassCount: {}, pluginFailCount: {} });
@@ -900,7 +764,7 @@ function getStandaloneEvals({ limit = DEFAULT_QUERY_LIMIT, tag, description, } =
900
764
  ...col,
901
765
  };
902
766
  });
903
- });
767
+ }))).flat();
904
768
  standaloneEvalCache.set(cacheKey, standaloneEvals);
905
769
  return standaloneEvals;
906
770
  }
@@ -975,7 +839,7 @@ function parsePathOrGlob(basePath, promptPath) {
975
839
  let functionName;
976
840
  if (filename.includes(':')) {
977
841
  const splits = filename.split(':');
978
- if (splits[0] && (isJavascriptFile(splits[0]) || splits[0].endsWith('.py'))) {
842
+ if (splits[0] && ((0, file_1.isJavascriptFile)(splits[0]) || splits[0].endsWith('.py'))) {
979
843
  [filename, functionName] = splits;
980
844
  }
981
845
  }