promptfoo 0.46.0 → 0.48.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -0
- package/dist/drizzle/0000_lush_hellion.sql +36 -0
- package/dist/drizzle/0001_wide_calypso.sql +3 -0
- package/dist/drizzle/meta/0000_snapshot.json +244 -0
- package/dist/drizzle/meta/0001_snapshot.json +237 -0
- package/dist/drizzle/meta/_journal.json +20 -0
- package/dist/package.json +10 -3
- package/dist/src/__mocks__/database.d.ts +5 -0
- package/dist/src/__mocks__/database.d.ts.map +1 -0
- package/dist/src/__mocks__/database.js +27 -0
- package/dist/src/__mocks__/database.js.map +1 -0
- package/dist/src/assertions.d.ts.map +1 -1
- package/dist/src/assertions.js +51 -42
- package/dist/src/assertions.js.map +1 -1
- package/dist/src/commands/list.d.ts.map +1 -1
- package/dist/src/commands/list.js +4 -5
- package/dist/src/commands/list.js.map +1 -1
- package/dist/src/commands/show.d.ts +1 -1
- package/dist/src/commands/show.d.ts.map +1 -1
- package/dist/src/commands/show.js +7 -7
- package/dist/src/commands/show.js.map +1 -1
- package/dist/src/csv.d.ts +1 -1
- package/dist/src/csv.d.ts.map +1 -1
- package/dist/src/csv.js +5 -0
- package/dist/src/csv.js.map +1 -1
- package/dist/src/database.d.ts +238 -0
- package/dist/src/database.d.ts.map +1 -0
- package/dist/src/database.js +141 -0
- package/dist/src/database.js.map +1 -0
- package/dist/src/evaluator.d.ts.map +1 -1
- package/dist/src/evaluator.js +8 -6
- package/dist/src/evaluator.js.map +1 -1
- package/dist/src/index.d.ts.map +1 -1
- package/dist/src/index.js +2 -1
- package/dist/src/index.js.map +1 -1
- package/dist/src/main.js +6 -4
- package/dist/src/main.js.map +1 -1
- package/dist/src/migrate.d.ts +5 -0
- package/dist/src/migrate.d.ts.map +1 -0
- package/dist/src/migrate.js +50 -0
- package/dist/src/migrate.js.map +1 -0
- package/dist/src/prompts.d.ts.map +1 -1
- package/dist/src/prompts.js +3 -0
- package/dist/src/prompts.js.map +1 -1
- package/dist/src/providers/anthropic.d.ts.map +1 -1
- package/dist/src/providers/anthropic.js +37 -1
- package/dist/src/providers/anthropic.js.map +1 -1
- package/dist/src/providers/azureopenai.d.ts +16 -0
- package/dist/src/providers/azureopenai.d.ts.map +1 -1
- package/dist/src/providers/azureopenai.js +6 -2
- package/dist/src/providers/azureopenai.js.map +1 -1
- package/dist/src/providers/replicate.d.ts.map +1 -1
- package/dist/src/providers/replicate.js +16 -1
- package/dist/src/providers/replicate.js.map +1 -1
- package/dist/src/providers.d.ts.map +1 -1
- package/dist/src/providers.js +2 -1
- package/dist/src/providers.js.map +1 -1
- package/dist/src/python/wrapper.d.ts +9 -1
- package/dist/src/python/wrapper.d.ts.map +1 -1
- package/dist/src/python/wrapper.js +32 -4
- package/dist/src/python/wrapper.js.map +1 -1
- package/dist/src/types.d.ts +5 -6
- package/dist/src/types.d.ts.map +1 -1
- package/dist/src/util.d.ts +52 -19
- package/dist/src/util.d.ts.map +1 -1
- package/dist/src/util.js +378 -125
- package/dist/src/util.js.map +1 -1
- package/dist/src/web/nextui/404/index.html +1 -1
- package/dist/src/web/nextui/404.html +1 -1
- package/dist/src/web/nextui/_next/static/chunks/952-ede6b209625d42a2.js +1 -0
- package/dist/src/web/nextui/_next/static/chunks/app/datasets/page-ad55f89d622ef8e7.js +1 -0
- package/dist/src/web/nextui/_next/static/chunks/app/prompts/page-01ab4878803b7068.js +1 -0
- package/dist/src/web/nextui/_next/static/chunks/app/setup/page-9c163111247d8da5.js +1 -0
- package/dist/src/web/nextui/api/results +1 -1
- package/dist/src/web/nextui/auth/login/index.html +1 -1
- package/dist/src/web/nextui/auth/login/index.txt +3 -3
- package/dist/src/web/nextui/auth/signup/index.html +1 -1
- package/dist/src/web/nextui/auth/signup/index.txt +3 -3
- package/dist/src/web/nextui/datasets/index.html +1 -1
- package/dist/src/web/nextui/datasets/index.txt +3 -3
- package/dist/src/web/nextui/eval/index.html +1 -1
- package/dist/src/web/nextui/eval/index.txt +3 -3
- package/dist/src/web/nextui/index.html +1 -1
- package/dist/src/web/nextui/index.txt +2 -2
- package/dist/src/web/nextui/prompts/index.html +1 -1
- package/dist/src/web/nextui/prompts/index.txt +3 -3
- package/dist/src/web/nextui/setup/index.html +1 -1
- package/dist/src/web/nextui/setup/index.txt +3 -3
- package/dist/src/web/server.d.ts +1 -1
- package/dist/src/web/server.d.ts.map +1 -1
- package/dist/src/web/server.js +25 -43
- package/dist/src/web/server.js.map +1 -1
- package/package.json +10 -3
- package/dist/src/web/nextui/_next/static/chunks/952-1367984f076e3060.js +0 -1
- package/dist/src/web/nextui/_next/static/chunks/app/datasets/page-44ab188f3b846712.js +0 -1
- package/dist/src/web/nextui/_next/static/chunks/app/prompts/page-0bf3409d6a6bfa22.js +0 -1
- package/dist/src/web/nextui/_next/static/chunks/app/setup/page-83c7e62787113081.js +0 -1
- /package/dist/src/web/nextui/_next/static/{Np8tRhZUzimy-v_hu8F8W → 8yxA5JzS0wXTxJptFRKTo}/_buildManifest.js +0 -0
- /package/dist/src/web/nextui/_next/static/{Np8tRhZUzimy-v_hu8F8W → 8yxA5JzS0wXTxJptFRKTo}/_ssgManifest.js +0 -0
- /package/dist/src/web/nextui/_next/static/chunks/{82-ca0360e473d81167.js → 82-6e8c9ebc91ff932b.js} +0 -0
package/dist/src/util.js
CHANGED
|
@@ -26,7 +26,7 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
|
26
26
|
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
27
27
|
};
|
|
28
28
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
29
|
-
exports.transformOutput = exports.printBorder = exports.getNunjucksEngine = exports.readFilters = exports.getEvalsWithPredicate = exports.getEvalFromHash = exports.getEvals = exports.getDatasetFromHash = exports.getPromptFromHash = exports.getTestCasesWithPredicate = exports.getTestCases = exports.getPromptsWithPredicate = exports.getPrompts = exports.sha256 = exports.getPromptsForTestCasesHash = exports.getPromptsForTestCases = exports.readLatestResults = exports.updateResult = exports.readResult = exports.dateToFilename = exports.filenameToDate = exports.
|
|
29
|
+
exports.transformOutput = exports.printBorder = exports.getNunjucksEngine = exports.readFilters = exports.getEvalsWithPredicate = exports.getEvalFromHash = exports.getEvals = exports.getDatasetFromHash = exports.getPromptFromHash = exports.getTestCasesWithPredicate = exports.getTestCases = exports.getPromptsWithPredicate = exports.getPrompts = exports.sha256 = exports.getPromptsForTestCasesHash = exports.getPromptsForTestCases = exports.readLatestResults = exports.updateResult = exports.readResult_fileSystem = exports.readResult = exports.dateToFilename = exports.filenameToDate = exports.cleanupOldFileResults = exports.migrateResultsFromFileSystemToDatabase = exports.listPreviousResults_fileSystem = exports.listPreviousResultFilenames_fileSystem = exports.listPreviousResults = exports.writeResultsToDatabase = exports.getLatestResultsPath = exports.setConfigDirectoryPath = exports.getConfigDirectoryPath = exports.writeOutput = exports.writeMultipleOutputs = exports.readConfigs = exports.readConfig = exports.dereferenceConfig = exports.maybeReadConfig = exports.maybeRecordFirstRun = exports.readGlobalConfig = exports.resetGlobalConfig = void 0;
|
|
30
30
|
const fs = __importStar(require("fs"));
|
|
31
31
|
const path = __importStar(require("path"));
|
|
32
32
|
const os = __importStar(require("os"));
|
|
@@ -37,9 +37,12 @@ const nunjucks_1 = __importDefault(require("nunjucks"));
|
|
|
37
37
|
const js_yaml_1 = __importDefault(require("js-yaml"));
|
|
38
38
|
const sync_1 = require("csv-stringify/sync");
|
|
39
39
|
const glob_1 = require("glob");
|
|
40
|
+
const drizzle_orm_1 = require("drizzle-orm");
|
|
40
41
|
const logger_1 = __importDefault(require("./logger"));
|
|
41
42
|
const esm_1 = require("./esm");
|
|
42
43
|
const testCases_1 = require("./testCases");
|
|
44
|
+
const database_1 = require("./database");
|
|
45
|
+
const migrate_1 = require("./migrate");
|
|
43
46
|
let globalConfigCache = null;
|
|
44
47
|
function resetGlobalConfig() {
|
|
45
48
|
globalConfigCache = null;
|
|
@@ -182,6 +185,12 @@ async function readConfig(configPath) {
|
|
|
182
185
|
}
|
|
183
186
|
}
|
|
184
187
|
exports.readConfig = readConfig;
|
|
188
|
+
/**
|
|
189
|
+
* Reads multiple configuration files and combines them into a single UnifiedConfig.
|
|
190
|
+
*
|
|
191
|
+
* @param {string[]} configPaths - An array of paths to configuration files. Supports glob patterns.
|
|
192
|
+
* @returns {Promise<UnifiedConfig>} A promise that resolves to a unified configuration object.
|
|
193
|
+
*/
|
|
185
194
|
async function readConfigs(configPaths) {
|
|
186
195
|
const configs = [];
|
|
187
196
|
for (const configPath of configPaths) {
|
|
@@ -230,16 +239,18 @@ async function readConfigs(configPaths) {
|
|
|
230
239
|
}
|
|
231
240
|
return relativePath;
|
|
232
241
|
};
|
|
242
|
+
const seenPrompts = new Set();
|
|
233
243
|
configs.forEach((config, idx) => {
|
|
234
244
|
if (typeof config.prompts === 'string') {
|
|
235
245
|
(0, tiny_invariant_1.default)(Array.isArray(prompts), 'Cannot mix string and map-type prompts');
|
|
236
|
-
|
|
237
|
-
|
|
246
|
+
const absolutePrompt = makeAbsolute(configPaths[idx], config.prompts);
|
|
247
|
+
seenPrompts.add(absolutePrompt);
|
|
238
248
|
}
|
|
239
249
|
else if (Array.isArray(config.prompts)) {
|
|
240
250
|
(0, tiny_invariant_1.default)(Array.isArray(prompts), 'Cannot mix configs with map and array-type prompts');
|
|
241
|
-
config.prompts
|
|
242
|
-
|
|
251
|
+
config.prompts
|
|
252
|
+
.map((prompt) => makeAbsolute(configPaths[idx], prompt))
|
|
253
|
+
.forEach((prompt) => seenPrompts.add(prompt));
|
|
243
254
|
}
|
|
244
255
|
else {
|
|
245
256
|
// Object format such as { 'prompts/prompt1.txt': 'foo', 'prompts/prompt2.txt': 'bar' }
|
|
@@ -247,6 +258,9 @@ async function readConfigs(configPaths) {
|
|
|
247
258
|
prompts = { ...prompts, ...config.prompts };
|
|
248
259
|
}
|
|
249
260
|
});
|
|
261
|
+
if (Array.isArray(prompts)) {
|
|
262
|
+
prompts.push(...Array.from(seenPrompts));
|
|
263
|
+
}
|
|
250
264
|
// Combine all configs into a single UnifiedConfig
|
|
251
265
|
const combinedConfig = {
|
|
252
266
|
description: configs.map((config) => config.description).join(', '),
|
|
@@ -345,44 +359,114 @@ function setConfigDirectoryPath(newPath) {
|
|
|
345
359
|
configDirectoryPath = newPath;
|
|
346
360
|
}
|
|
347
361
|
exports.setConfigDirectoryPath = setConfigDirectoryPath;
|
|
362
|
+
/**
|
|
363
|
+
* TODO(ian): Remove this
|
|
364
|
+
* @deprecated Use readLatestResults directly instead.
|
|
365
|
+
*/
|
|
348
366
|
function getLatestResultsPath() {
|
|
349
367
|
return path.join(getConfigDirectoryPath(), 'output', 'latest.json');
|
|
350
368
|
}
|
|
351
369
|
exports.getLatestResultsPath = getLatestResultsPath;
|
|
352
|
-
function
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
const
|
|
356
|
-
const
|
|
357
|
-
|
|
370
|
+
async function writeResultsToDatabase(results, config, createdAt) {
|
|
371
|
+
createdAt = createdAt || new Date();
|
|
372
|
+
const evalId = `eval-${createdAt.toISOString().slice(0, 19)}`;
|
|
373
|
+
const db = (0, database_1.getDb)();
|
|
374
|
+
const promises = [];
|
|
375
|
+
promises.push(db
|
|
376
|
+
.insert(database_1.evals)
|
|
377
|
+
.values({
|
|
378
|
+
id: evalId,
|
|
379
|
+
createdAt: createdAt.getTime(),
|
|
380
|
+
description: config.description,
|
|
381
|
+
config,
|
|
382
|
+
results,
|
|
383
|
+
})
|
|
384
|
+
.onConflictDoNothing()
|
|
385
|
+
.run());
|
|
386
|
+
logger_1.default.debug(`Inserting eval ${evalId}`);
|
|
387
|
+
// Record prompt relation
|
|
388
|
+
for (const prompt of results.table.head.prompts) {
|
|
389
|
+
const promptId = sha256(prompt.display);
|
|
390
|
+
promises.push(db
|
|
391
|
+
.insert(database_1.prompts)
|
|
392
|
+
.values({
|
|
393
|
+
id: promptId,
|
|
394
|
+
prompt: prompt.display,
|
|
395
|
+
})
|
|
396
|
+
.onConflictDoNothing()
|
|
397
|
+
.run());
|
|
398
|
+
promises.push(db
|
|
399
|
+
.insert(database_1.evalsToPrompts)
|
|
400
|
+
.values({
|
|
401
|
+
evalId,
|
|
402
|
+
promptId,
|
|
403
|
+
})
|
|
404
|
+
.onConflictDoNothing()
|
|
405
|
+
.run());
|
|
406
|
+
logger_1.default.debug(`Inserting prompt ${promptId}`);
|
|
407
|
+
}
|
|
408
|
+
// Record dataset relation
|
|
409
|
+
const datasetId = sha256(JSON.stringify(config.tests || []));
|
|
410
|
+
promises.push(db
|
|
411
|
+
.insert(database_1.datasets)
|
|
412
|
+
.values({
|
|
413
|
+
id: datasetId,
|
|
414
|
+
tests: config.tests,
|
|
415
|
+
})
|
|
416
|
+
.onConflictDoNothing()
|
|
417
|
+
.run());
|
|
418
|
+
promises.push(db
|
|
419
|
+
.insert(database_1.evalsToDatasets)
|
|
420
|
+
.values({
|
|
421
|
+
evalId,
|
|
422
|
+
datasetId,
|
|
423
|
+
})
|
|
424
|
+
.onConflictDoNothing()
|
|
425
|
+
.run());
|
|
426
|
+
logger_1.default.debug(`Inserting dataset ${datasetId}`);
|
|
427
|
+
logger_1.default.debug(`Awaiting ${promises.length} promises to database...`);
|
|
428
|
+
await Promise.all(promises);
|
|
429
|
+
// "touch" db signal path
|
|
430
|
+
const filePath = (0, database_1.getDbSignalPath)();
|
|
358
431
|
try {
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
version: 2,
|
|
362
|
-
createdAt: new Date().toISOString(),
|
|
363
|
-
config,
|
|
364
|
-
results,
|
|
365
|
-
};
|
|
366
|
-
fs.writeFileSync(newResultsPath, JSON.stringify(resultsFileData, null, 2));
|
|
367
|
-
// Use copy instead of symlink to avoid issues with Windows permissions.
|
|
368
|
-
try {
|
|
369
|
-
// Backwards compatibility: delete old symlink.
|
|
370
|
-
fs.unlinkSync(latestResultsPath);
|
|
371
|
-
}
|
|
372
|
-
catch { }
|
|
373
|
-
fs.copyFileSync(newResultsPath, latestResultsPath);
|
|
374
|
-
cleanupOldResults();
|
|
375
|
-
return filename;
|
|
432
|
+
const now = new Date();
|
|
433
|
+
fs.utimesSync(filePath, now, now);
|
|
376
434
|
}
|
|
377
435
|
catch (err) {
|
|
378
|
-
|
|
379
|
-
return null;
|
|
436
|
+
fs.closeSync(fs.openSync(filePath, 'w'));
|
|
380
437
|
}
|
|
438
|
+
return evalId;
|
|
381
439
|
}
|
|
382
|
-
exports.
|
|
383
|
-
|
|
384
|
-
|
|
440
|
+
exports.writeResultsToDatabase = writeResultsToDatabase;
|
|
441
|
+
/**
|
|
442
|
+
*
|
|
443
|
+
* @returns Last 100 evals in descending order.
|
|
444
|
+
*/
|
|
445
|
+
function listPreviousResults() {
|
|
446
|
+
const db = (0, database_1.getDb)();
|
|
447
|
+
const results = db
|
|
448
|
+
.select({
|
|
449
|
+
name: database_1.evals.id,
|
|
450
|
+
description: database_1.evals.description,
|
|
451
|
+
})
|
|
452
|
+
.from(database_1.evals)
|
|
453
|
+
.orderBy((0, drizzle_orm_1.desc)(database_1.evals.createdAt))
|
|
454
|
+
.limit(100)
|
|
455
|
+
.all();
|
|
456
|
+
return results.map((result) => ({
|
|
457
|
+
evalId: result.name,
|
|
458
|
+
description: result.description,
|
|
459
|
+
}));
|
|
460
|
+
}
|
|
461
|
+
exports.listPreviousResults = listPreviousResults;
|
|
462
|
+
/**
|
|
463
|
+
* @deprecated Used only for migration to sqlite
|
|
464
|
+
*/
|
|
465
|
+
function listPreviousResultFilenames_fileSystem() {
|
|
385
466
|
const directory = path.join(getConfigDirectoryPath(), 'output');
|
|
467
|
+
if (!fs.existsSync(directory)) {
|
|
468
|
+
return [];
|
|
469
|
+
}
|
|
386
470
|
const files = fs.readdirSync(directory);
|
|
387
471
|
const resultsFiles = files.filter((file) => file.startsWith('eval-') && file.endsWith('.json'));
|
|
388
472
|
return resultsFiles.sort((a, b) => {
|
|
@@ -391,10 +475,17 @@ function listPreviousResultFilenames() {
|
|
|
391
475
|
return statA.birthtime.getTime() - statB.birthtime.getTime(); // sort in ascending order
|
|
392
476
|
});
|
|
393
477
|
}
|
|
394
|
-
exports.
|
|
395
|
-
|
|
478
|
+
exports.listPreviousResultFilenames_fileSystem = listPreviousResultFilenames_fileSystem;
|
|
479
|
+
const resultsCache = {};
|
|
480
|
+
/**
|
|
481
|
+
* @deprecated Used only for migration to sqlite
|
|
482
|
+
*/
|
|
483
|
+
function listPreviousResults_fileSystem() {
|
|
396
484
|
const directory = path.join(getConfigDirectoryPath(), 'output');
|
|
397
|
-
|
|
485
|
+
if (!fs.existsSync(directory)) {
|
|
486
|
+
return [];
|
|
487
|
+
}
|
|
488
|
+
const sortedFiles = listPreviousResultFilenames_fileSystem();
|
|
398
489
|
return sortedFiles.map((fileName) => {
|
|
399
490
|
if (!resultsCache[fileName]) {
|
|
400
491
|
try {
|
|
@@ -412,15 +503,71 @@ function listPreviousResults() {
|
|
|
412
503
|
};
|
|
413
504
|
});
|
|
414
505
|
}
|
|
415
|
-
exports.
|
|
506
|
+
exports.listPreviousResults_fileSystem = listPreviousResults_fileSystem;
|
|
507
|
+
let attemptedMigration = false;
|
|
508
|
+
async function migrateResultsFromFileSystemToDatabase() {
|
|
509
|
+
if (attemptedMigration) {
|
|
510
|
+
// TODO(ian): Record this bit in the database.
|
|
511
|
+
return;
|
|
512
|
+
}
|
|
513
|
+
// First run db migrations
|
|
514
|
+
logger_1.default.debug('Running db migrations...');
|
|
515
|
+
await (0, migrate_1.runDbMigrations)();
|
|
516
|
+
const fileNames = listPreviousResultFilenames_fileSystem();
|
|
517
|
+
if (fileNames.length === 0) {
|
|
518
|
+
return;
|
|
519
|
+
}
|
|
520
|
+
logger_1.default.info(`🔁 Migrating ${fileNames.length} flat files to local database.`);
|
|
521
|
+
logger_1.default.info('This is a one-time operation and may take a minute...');
|
|
522
|
+
attemptedMigration = true;
|
|
523
|
+
const outputDir = path.join(getConfigDirectoryPath(), 'output');
|
|
524
|
+
const backupDir = `${outputDir}-backup-${new Date()
|
|
525
|
+
.toISOString()
|
|
526
|
+
.slice(0, 10)
|
|
527
|
+
.replace(/-/g, '')}`;
|
|
528
|
+
try {
|
|
529
|
+
fs.cpSync(outputDir, backupDir, { recursive: true });
|
|
530
|
+
logger_1.default.info(`Backup of output directory created at ${backupDir}`);
|
|
531
|
+
}
|
|
532
|
+
catch (backupError) {
|
|
533
|
+
logger_1.default.error(`Failed to create backup of output directory: ${backupError}`);
|
|
534
|
+
return;
|
|
535
|
+
}
|
|
536
|
+
logger_1.default.info('Moving files into database...');
|
|
537
|
+
const migrationPromises = fileNames.map(async (fileName) => {
|
|
538
|
+
const fileData = readResult_fileSystem(fileName);
|
|
539
|
+
if (fileData) {
|
|
540
|
+
await writeResultsToDatabase(fileData.result.results, fileData.result.config, filenameToDate(fileName));
|
|
541
|
+
logger_1.default.debug(`Migrated ${fileName} to database.`);
|
|
542
|
+
try {
|
|
543
|
+
fs.unlinkSync(path.join(outputDir, fileName));
|
|
544
|
+
}
|
|
545
|
+
catch (err) {
|
|
546
|
+
logger_1.default.warn(`Failed to delete ${fileName} after migration: ${err}`);
|
|
547
|
+
}
|
|
548
|
+
}
|
|
549
|
+
else {
|
|
550
|
+
logger_1.default.warn(`Failed to migrate result ${fileName} due to read error.`);
|
|
551
|
+
}
|
|
552
|
+
});
|
|
553
|
+
await Promise.all(migrationPromises);
|
|
554
|
+
try {
|
|
555
|
+
fs.unlinkSync(getLatestResultsPath());
|
|
556
|
+
}
|
|
557
|
+
catch (err) {
|
|
558
|
+
logger_1.default.warn(`Failed to delete latest.json: ${err}`);
|
|
559
|
+
}
|
|
560
|
+
logger_1.default.info('Migration complete. Please restart your web server if it is running.');
|
|
561
|
+
}
|
|
562
|
+
exports.migrateResultsFromFileSystemToDatabase = migrateResultsFromFileSystemToDatabase;
|
|
416
563
|
const RESULT_HISTORY_LENGTH = parseInt(process.env.RESULT_HISTORY_LENGTH || '', 10) || 100;
|
|
417
|
-
function
|
|
418
|
-
const sortedFilenames =
|
|
564
|
+
function cleanupOldFileResults(remaining = RESULT_HISTORY_LENGTH) {
|
|
565
|
+
const sortedFilenames = listPreviousResultFilenames_fileSystem();
|
|
419
566
|
for (let i = 0; i < sortedFilenames.length - remaining; i++) {
|
|
420
567
|
fs.unlinkSync(path.join(getConfigDirectoryPath(), 'output', sortedFilenames[i]));
|
|
421
568
|
}
|
|
422
569
|
}
|
|
423
|
-
exports.
|
|
570
|
+
exports.cleanupOldFileResults = cleanupOldFileResults;
|
|
424
571
|
function filenameToDate(filename) {
|
|
425
572
|
const dateString = filename.slice('eval-'.length, filename.length - '.json'.length);
|
|
426
573
|
// Replace hyphens with colons where necessary (Windows compatibility).
|
|
@@ -428,27 +575,67 @@ function filenameToDate(filename) {
|
|
|
428
575
|
const timePart = dateParts[1].replace(/-/g, ':');
|
|
429
576
|
const formattedDateString = `${dateParts[0]}T${timePart}`;
|
|
430
577
|
const date = new Date(formattedDateString);
|
|
578
|
+
return date;
|
|
579
|
+
/*
|
|
431
580
|
return date.toLocaleDateString('en-US', {
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
581
|
+
year: 'numeric',
|
|
582
|
+
month: 'long',
|
|
583
|
+
day: 'numeric',
|
|
584
|
+
hour: '2-digit',
|
|
585
|
+
minute: '2-digit',
|
|
586
|
+
second: '2-digit',
|
|
587
|
+
timeZoneName: 'short',
|
|
439
588
|
});
|
|
589
|
+
*/
|
|
440
590
|
}
|
|
441
591
|
exports.filenameToDate = filenameToDate;
|
|
442
592
|
function dateToFilename(date) {
|
|
443
593
|
return `eval-${date.toISOString().replace(/:/g, '-')}.json`;
|
|
444
594
|
}
|
|
445
595
|
exports.dateToFilename = dateToFilename;
|
|
446
|
-
function readResult(
|
|
596
|
+
async function readResult(id) {
|
|
597
|
+
const db = (0, database_1.getDb)();
|
|
598
|
+
try {
|
|
599
|
+
const evalResult = await db
|
|
600
|
+
.select({
|
|
601
|
+
id: database_1.evals.id,
|
|
602
|
+
createdAt: database_1.evals.createdAt,
|
|
603
|
+
results: database_1.evals.results,
|
|
604
|
+
config: database_1.evals.config,
|
|
605
|
+
})
|
|
606
|
+
.from(database_1.evals)
|
|
607
|
+
.where((0, drizzle_orm_1.eq)(database_1.evals.id, id))
|
|
608
|
+
.execute();
|
|
609
|
+
if (evalResult.length === 0) {
|
|
610
|
+
return undefined;
|
|
611
|
+
}
|
|
612
|
+
const { id: resultId, createdAt, results, config } = evalResult[0];
|
|
613
|
+
const result = {
|
|
614
|
+
version: 3,
|
|
615
|
+
createdAt: new Date(createdAt).toISOString().slice(0, 10),
|
|
616
|
+
results,
|
|
617
|
+
config,
|
|
618
|
+
};
|
|
619
|
+
return {
|
|
620
|
+
id: resultId,
|
|
621
|
+
result,
|
|
622
|
+
createdAt: new Date(createdAt),
|
|
623
|
+
};
|
|
624
|
+
}
|
|
625
|
+
catch (err) {
|
|
626
|
+
logger_1.default.error(`Failed to read result with ID ${id} from database:\n${err}`);
|
|
627
|
+
}
|
|
628
|
+
}
|
|
629
|
+
exports.readResult = readResult;
|
|
630
|
+
/**
|
|
631
|
+
* @deprecated Used only for migration to sqlite
|
|
632
|
+
*/
|
|
633
|
+
function readResult_fileSystem(name) {
|
|
447
634
|
const resultsDirectory = path.join(getConfigDirectoryPath(), 'output');
|
|
448
635
|
const resultsPath = path.join(resultsDirectory, name);
|
|
449
636
|
try {
|
|
450
637
|
const result = JSON.parse(fs.readFileSync(fs.realpathSync(resultsPath), 'utf-8'));
|
|
451
|
-
const createdAt =
|
|
638
|
+
const createdAt = filenameToDate(name);
|
|
452
639
|
return {
|
|
453
640
|
id: sha256(JSON.stringify(result.config)),
|
|
454
641
|
result,
|
|
@@ -459,35 +646,70 @@ function readResult(name) {
|
|
|
459
646
|
logger_1.default.error(`Failed to read results from ${resultsPath}:\n${err}`);
|
|
460
647
|
}
|
|
461
648
|
}
|
|
462
|
-
exports.
|
|
463
|
-
function updateResult(
|
|
464
|
-
const
|
|
465
|
-
const safeFilename = path.basename(filename);
|
|
466
|
-
const resultsPath = path.join(resultsDirectory, safeFilename);
|
|
649
|
+
exports.readResult_fileSystem = readResult_fileSystem;
|
|
650
|
+
async function updateResult(id, newConfig, newTable) {
|
|
651
|
+
const db = (0, database_1.getDb)();
|
|
467
652
|
try {
|
|
468
|
-
|
|
653
|
+
// Fetch the existing eval data from the database
|
|
654
|
+
const existingEval = await db
|
|
655
|
+
.select({
|
|
656
|
+
config: database_1.evals.config,
|
|
657
|
+
results: database_1.evals.results,
|
|
658
|
+
})
|
|
659
|
+
.from(database_1.evals)
|
|
660
|
+
.where((0, drizzle_orm_1.eq)(database_1.evals.id, id))
|
|
661
|
+
.limit(1)
|
|
662
|
+
.all();
|
|
663
|
+
if (existingEval.length === 0) {
|
|
664
|
+
logger_1.default.error(`Eval with ID ${id} not found.`);
|
|
665
|
+
return;
|
|
666
|
+
}
|
|
667
|
+
const evalData = existingEval[0];
|
|
469
668
|
if (newConfig) {
|
|
470
669
|
evalData.config = newConfig;
|
|
471
670
|
}
|
|
472
671
|
if (newTable) {
|
|
473
672
|
evalData.results.table = newTable;
|
|
474
673
|
}
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
674
|
+
await db
|
|
675
|
+
.update(database_1.evals)
|
|
676
|
+
.set({
|
|
677
|
+
description: evalData.config.description,
|
|
678
|
+
config: evalData.config,
|
|
679
|
+
results: evalData.results,
|
|
680
|
+
})
|
|
681
|
+
.where((0, drizzle_orm_1.eq)(database_1.evals.id, id))
|
|
682
|
+
.run();
|
|
683
|
+
logger_1.default.info(`Updated eval with ID ${id}`);
|
|
483
684
|
}
|
|
484
685
|
catch (err) {
|
|
485
|
-
logger_1.default.error(`Failed to update eval
|
|
686
|
+
logger_1.default.error(`Failed to update eval with ID ${id}:\n${err}`);
|
|
486
687
|
}
|
|
487
688
|
}
|
|
488
689
|
exports.updateResult = updateResult;
|
|
489
|
-
function readLatestResults() {
|
|
490
|
-
|
|
690
|
+
async function readLatestResults() {
|
|
691
|
+
const db = (0, database_1.getDb)();
|
|
692
|
+
const latestResults = await db
|
|
693
|
+
.select({
|
|
694
|
+
id: database_1.evals.id,
|
|
695
|
+
createdAt: database_1.evals.createdAt,
|
|
696
|
+
description: database_1.evals.description,
|
|
697
|
+
results: database_1.evals.results,
|
|
698
|
+
config: database_1.evals.config,
|
|
699
|
+
})
|
|
700
|
+
.from(database_1.evals)
|
|
701
|
+
.orderBy((0, drizzle_orm_1.desc)(database_1.evals.createdAt))
|
|
702
|
+
.limit(1);
|
|
703
|
+
if (!latestResults || latestResults.length === 0) {
|
|
704
|
+
return undefined;
|
|
705
|
+
}
|
|
706
|
+
const latestResult = latestResults[0];
|
|
707
|
+
return {
|
|
708
|
+
version: 3,
|
|
709
|
+
createdAt: new Date(latestResult.createdAt).toISOString(),
|
|
710
|
+
results: latestResult.results,
|
|
711
|
+
config: latestResult.config,
|
|
712
|
+
};
|
|
491
713
|
}
|
|
492
714
|
exports.readLatestResults = readLatestResults;
|
|
493
715
|
function getPromptsForTestCases(testCases) {
|
|
@@ -512,26 +734,39 @@ function getPrompts() {
|
|
|
512
734
|
return getPromptsWithPredicate(() => true);
|
|
513
735
|
}
|
|
514
736
|
exports.getPrompts = getPrompts;
|
|
515
|
-
function getPromptsWithPredicate(predicate) {
|
|
516
|
-
|
|
737
|
+
async function getPromptsWithPredicate(predicate) {
|
|
738
|
+
// TODO(ian): Make this use a proper database query
|
|
739
|
+
const db = (0, database_1.getDb)();
|
|
740
|
+
const evals_ = await db
|
|
741
|
+
.select({
|
|
742
|
+
id: database_1.evals.id,
|
|
743
|
+
createdAt: database_1.evals.createdAt,
|
|
744
|
+
results: database_1.evals.results,
|
|
745
|
+
config: database_1.evals.config,
|
|
746
|
+
})
|
|
747
|
+
.from(database_1.evals)
|
|
748
|
+
.limit(100)
|
|
749
|
+
.all();
|
|
517
750
|
const groupedPrompts = {};
|
|
518
|
-
for (const
|
|
519
|
-
const
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
751
|
+
for (const eval_ of evals_) {
|
|
752
|
+
const createdAt = new Date(eval_.createdAt).toISOString();
|
|
753
|
+
const resultWrapper = {
|
|
754
|
+
version: 3,
|
|
755
|
+
createdAt,
|
|
756
|
+
results: eval_.results,
|
|
757
|
+
config: eval_.config,
|
|
758
|
+
};
|
|
759
|
+
if (predicate(resultWrapper)) {
|
|
760
|
+
for (const prompt of resultWrapper.results.table.head.prompts) {
|
|
527
761
|
const promptId = sha256(prompt.raw);
|
|
528
|
-
const datasetId =
|
|
762
|
+
const datasetId = resultWrapper.config.tests
|
|
763
|
+
? sha256(JSON.stringify(resultWrapper.config.tests))
|
|
764
|
+
: '-';
|
|
529
765
|
if (promptId in groupedPrompts) {
|
|
530
766
|
groupedPrompts[promptId].recentEvalDate = new Date(Math.max(groupedPrompts[promptId].recentEvalDate.getTime(), new Date(createdAt).getTime()));
|
|
531
767
|
groupedPrompts[promptId].count += 1;
|
|
532
768
|
groupedPrompts[promptId].evals.push({
|
|
533
|
-
id:
|
|
534
|
-
filePath: fileName,
|
|
769
|
+
id: eval_.id,
|
|
535
770
|
datasetId,
|
|
536
771
|
metrics: prompt.metrics,
|
|
537
772
|
});
|
|
@@ -542,12 +777,10 @@ function getPromptsWithPredicate(predicate) {
|
|
|
542
777
|
id: promptId,
|
|
543
778
|
prompt,
|
|
544
779
|
recentEvalDate: new Date(createdAt),
|
|
545
|
-
recentEvalId:
|
|
546
|
-
recentEvalFilepath: fileName,
|
|
780
|
+
recentEvalId: eval_.id,
|
|
547
781
|
evals: [
|
|
548
782
|
{
|
|
549
|
-
id:
|
|
550
|
-
filePath: fileName,
|
|
783
|
+
id: eval_.id,
|
|
551
784
|
datasetId,
|
|
552
785
|
metrics: prompt.metrics,
|
|
553
786
|
},
|
|
@@ -560,31 +793,42 @@ function getPromptsWithPredicate(predicate) {
|
|
|
560
793
|
return Object.values(groupedPrompts);
|
|
561
794
|
}
|
|
562
795
|
exports.getPromptsWithPredicate = getPromptsWithPredicate;
|
|
563
|
-
function getTestCases() {
|
|
796
|
+
async function getTestCases() {
|
|
564
797
|
return getTestCasesWithPredicate(() => true);
|
|
565
798
|
}
|
|
566
799
|
exports.getTestCases = getTestCases;
|
|
567
|
-
function getTestCasesWithPredicate(predicate) {
|
|
568
|
-
const
|
|
800
|
+
async function getTestCasesWithPredicate(predicate) {
|
|
801
|
+
const db = (0, database_1.getDb)();
|
|
802
|
+
const evals_ = await db
|
|
803
|
+
.select({
|
|
804
|
+
id: database_1.evals.id,
|
|
805
|
+
createdAt: database_1.evals.createdAt,
|
|
806
|
+
results: database_1.evals.results,
|
|
807
|
+
config: database_1.evals.config,
|
|
808
|
+
})
|
|
809
|
+
.from(database_1.evals)
|
|
810
|
+
.limit(100)
|
|
811
|
+
.all();
|
|
569
812
|
const groupedTestCases = {};
|
|
570
|
-
for (const
|
|
571
|
-
const
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
813
|
+
for (const eval_ of evals_) {
|
|
814
|
+
const createdAt = new Date(eval_.createdAt).toISOString();
|
|
815
|
+
const resultWrapper = {
|
|
816
|
+
version: 3,
|
|
817
|
+
createdAt,
|
|
818
|
+
results: eval_.results,
|
|
819
|
+
config: eval_.config,
|
|
820
|
+
};
|
|
821
|
+
const testCases = resultWrapper.config.tests;
|
|
822
|
+
if (testCases && predicate(resultWrapper)) {
|
|
823
|
+
const evalId = eval_.id;
|
|
579
824
|
const datasetId = sha256(JSON.stringify(testCases));
|
|
580
825
|
if (datasetId in groupedTestCases) {
|
|
581
|
-
groupedTestCases[datasetId].recentEvalDate = new Date(Math.max(groupedTestCases[datasetId].recentEvalDate.getTime(),
|
|
826
|
+
groupedTestCases[datasetId].recentEvalDate = new Date(Math.max(groupedTestCases[datasetId].recentEvalDate.getTime(), eval_.createdAt));
|
|
582
827
|
groupedTestCases[datasetId].count += 1;
|
|
583
|
-
const newPrompts =
|
|
828
|
+
const newPrompts = resultWrapper.results.table.head.prompts.map((prompt) => ({
|
|
584
829
|
id: sha256(prompt.raw),
|
|
585
830
|
prompt,
|
|
586
831
|
evalId,
|
|
587
|
-
evalFilepath: fileName,
|
|
588
832
|
}));
|
|
589
833
|
const promptsById = {};
|
|
590
834
|
for (const prompt of groupedTestCases[datasetId].prompts.concat(newPrompts)) {
|
|
@@ -595,11 +839,10 @@ function getTestCasesWithPredicate(predicate) {
|
|
|
595
839
|
groupedTestCases[datasetId].prompts = Object.values(promptsById);
|
|
596
840
|
}
|
|
597
841
|
else {
|
|
598
|
-
const newPrompts =
|
|
599
|
-
id:
|
|
842
|
+
const newPrompts = resultWrapper.results.table.head.prompts.map((prompt) => ({
|
|
843
|
+
id: sha256(prompt.raw),
|
|
600
844
|
prompt,
|
|
601
845
|
evalId,
|
|
602
|
-
evalFilepath: fileName,
|
|
603
846
|
}));
|
|
604
847
|
const promptsById = {};
|
|
605
848
|
for (const prompt of newPrompts) {
|
|
@@ -613,7 +856,6 @@ function getTestCasesWithPredicate(predicate) {
|
|
|
613
856
|
testCases,
|
|
614
857
|
recentEvalDate: new Date(createdAt),
|
|
615
858
|
recentEvalId: evalId,
|
|
616
|
-
recentEvalFilepath: fileName,
|
|
617
859
|
prompts: Object.values(promptsById),
|
|
618
860
|
};
|
|
619
861
|
}
|
|
@@ -622,8 +864,8 @@ function getTestCasesWithPredicate(predicate) {
|
|
|
622
864
|
return Object.values(groupedTestCases);
|
|
623
865
|
}
|
|
624
866
|
exports.getTestCasesWithPredicate = getTestCasesWithPredicate;
|
|
625
|
-
function getPromptFromHash(hash) {
|
|
626
|
-
const prompts = getPrompts();
|
|
867
|
+
async function getPromptFromHash(hash) {
|
|
868
|
+
const prompts = await getPrompts();
|
|
627
869
|
for (const prompt of prompts) {
|
|
628
870
|
if (prompt.id.startsWith(hash)) {
|
|
629
871
|
return prompt;
|
|
@@ -632,8 +874,8 @@ function getPromptFromHash(hash) {
|
|
|
632
874
|
return undefined;
|
|
633
875
|
}
|
|
634
876
|
exports.getPromptFromHash = getPromptFromHash;
|
|
635
|
-
function getDatasetFromHash(hash) {
|
|
636
|
-
const datasets = getTestCases();
|
|
877
|
+
async function getDatasetFromHash(hash) {
|
|
878
|
+
const datasets = await getTestCases();
|
|
637
879
|
for (const dataset of datasets) {
|
|
638
880
|
if (dataset.id.startsWith(hash)) {
|
|
639
881
|
return dataset;
|
|
@@ -642,13 +884,13 @@ function getDatasetFromHash(hash) {
|
|
|
642
884
|
return undefined;
|
|
643
885
|
}
|
|
644
886
|
exports.getDatasetFromHash = getDatasetFromHash;
|
|
645
|
-
function getEvals() {
|
|
887
|
+
async function getEvals() {
|
|
646
888
|
return getEvalsWithPredicate(() => true);
|
|
647
889
|
}
|
|
648
890
|
exports.getEvals = getEvals;
|
|
649
|
-
function getEvalFromHash(hash) {
|
|
650
|
-
const
|
|
651
|
-
for (const eval_ of
|
|
891
|
+
async function getEvalFromHash(hash) {
|
|
892
|
+
const evals_ = await getEvals();
|
|
893
|
+
for (const eval_ of evals_) {
|
|
652
894
|
if (eval_.id.startsWith(hash)) {
|
|
653
895
|
return eval_;
|
|
654
896
|
}
|
|
@@ -656,23 +898,34 @@ function getEvalFromHash(hash) {
|
|
|
656
898
|
return undefined;
|
|
657
899
|
}
|
|
658
900
|
exports.getEvalFromHash = getEvalFromHash;
|
|
659
|
-
function getEvalsWithPredicate(predicate) {
|
|
901
|
+
async function getEvalsWithPredicate(predicate) {
|
|
902
|
+
const db = (0, database_1.getDb)();
|
|
903
|
+
const evals_ = await db
|
|
904
|
+
.select({
|
|
905
|
+
id: database_1.evals.id,
|
|
906
|
+
createdAt: database_1.evals.createdAt,
|
|
907
|
+
results: database_1.evals.results,
|
|
908
|
+
config: database_1.evals.config,
|
|
909
|
+
})
|
|
910
|
+
.from(database_1.evals)
|
|
911
|
+
.limit(100)
|
|
912
|
+
.all();
|
|
660
913
|
const ret = [];
|
|
661
|
-
const
|
|
662
|
-
|
|
663
|
-
const
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
914
|
+
for (const eval_ of evals_) {
|
|
915
|
+
const createdAt = new Date(eval_.createdAt).toISOString();
|
|
916
|
+
const resultWrapper = {
|
|
917
|
+
version: 3,
|
|
918
|
+
createdAt: createdAt,
|
|
919
|
+
results: eval_.results,
|
|
920
|
+
config: eval_.config,
|
|
921
|
+
};
|
|
922
|
+
if (predicate(resultWrapper)) {
|
|
923
|
+
const evalId = eval_.id;
|
|
670
924
|
ret.push({
|
|
671
925
|
id: evalId,
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
results: result.results,
|
|
926
|
+
date: new Date(eval_.createdAt),
|
|
927
|
+
config: eval_.config,
|
|
928
|
+
results: eval_.results,
|
|
676
929
|
});
|
|
677
930
|
}
|
|
678
931
|
}
|