promptfoo 0.91.2 → 0.92.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/drizzle/0006_harsh_caretaker.sql +42 -0
- package/dist/drizzle/0007_cloudy_wong.sql +1 -0
- package/dist/drizzle/meta/0006_snapshot.json +721 -0
- package/dist/drizzle/meta/0007_snapshot.json +723 -0
- package/dist/drizzle/meta/_journal.json +14 -0
- package/dist/package.json +10 -8
- package/dist/src/app/assets/{index-Bc-q9rGp.js → index-CMDD1oSm.js} +233 -231
- package/dist/src/app/assets/{index.es-b3UhzAjj.js → index.es-D8cSwMq4.js} +1 -1
- package/dist/src/app/assets/{sync-D-OjEwME.js → sync-DJZvzYiS.js} +1 -1
- package/dist/src/app/index.html +1 -1
- package/dist/src/assertions.js +2 -2
- package/dist/src/assertions.js.map +1 -1
- package/dist/src/commands/cache.d.ts.map +1 -1
- package/dist/src/commands/cache.js +0 -2
- package/dist/src/commands/cache.js.map +1 -1
- package/dist/src/commands/eval.d.ts.map +1 -1
- package/dist/src/commands/eval.js +19 -16
- package/dist/src/commands/eval.js.map +1 -1
- package/dist/src/commands/export.d.ts.map +1 -1
- package/dist/src/commands/export.js +8 -31
- package/dist/src/commands/export.js.map +1 -1
- package/dist/src/commands/import.d.ts.map +1 -1
- package/dist/src/commands/import.js +52 -13
- package/dist/src/commands/import.js.map +1 -1
- package/dist/src/commands/list.d.ts.map +1 -1
- package/dist/src/commands/list.js +35 -7
- package/dist/src/commands/list.js.map +1 -1
- package/dist/src/commands/share.d.ts +2 -2
- package/dist/src/commands/share.d.ts.map +1 -1
- package/dist/src/commands/share.js +12 -13
- package/dist/src/commands/share.js.map +1 -1
- package/dist/src/commands/show.d.ts.map +1 -1
- package/dist/src/commands/show.js +10 -6
- package/dist/src/commands/show.js.map +1 -1
- package/dist/src/constants.d.ts +1 -0
- package/dist/src/constants.d.ts.map +1 -1
- package/dist/src/constants.js +2 -1
- package/dist/src/constants.js.map +1 -1
- package/dist/src/database/index.js +1 -1
- package/dist/src/database/index.js.map +1 -1
- package/dist/src/database/tables.d.ts +602 -4
- package/dist/src/database/tables.d.ts.map +1 -1
- package/dist/src/database/tables.js +67 -8
- package/dist/src/database/tables.js.map +1 -1
- package/dist/src/database/types.d.ts +3 -3
- package/dist/src/database/types.d.ts.map +1 -1
- package/dist/src/evaluator.d.ts +3 -2
- package/dist/src/evaluator.d.ts.map +1 -1
- package/dist/src/evaluator.js +75 -104
- package/dist/src/evaluator.js.map +1 -1
- package/dist/src/evaluatorHelpers.d.ts.map +1 -1
- package/dist/src/evaluatorHelpers.js +2 -1
- package/dist/src/evaluatorHelpers.js.map +1 -1
- package/dist/src/index.d.ts +4 -1
- package/dist/src/index.d.ts.map +1 -1
- package/dist/src/index.js +12 -9
- package/dist/src/index.js.map +1 -1
- package/dist/src/models/eval.d.ts +95 -0
- package/dist/src/models/eval.d.ts.map +1 -0
- package/dist/src/models/eval.js +390 -0
- package/dist/src/models/eval.js.map +1 -0
- package/dist/src/models/evalResult.d.ts +50 -0
- package/dist/src/models/evalResult.d.ts.map +1 -0
- package/dist/src/models/evalResult.js +122 -0
- package/dist/src/models/evalResult.js.map +1 -0
- package/dist/src/models/provider.d.ts +9 -0
- package/dist/src/models/provider.d.ts.map +1 -0
- package/dist/src/models/provider.js +47 -0
- package/dist/src/models/provider.js.map +1 -0
- package/dist/src/prompts/index.d.ts.map +1 -1
- package/dist/src/prompts/index.js +2 -1
- package/dist/src/prompts/index.js.map +1 -1
- package/dist/src/prompts/utils.d.ts +1 -0
- package/dist/src/prompts/utils.d.ts.map +1 -1
- package/dist/src/prompts/utils.js +7 -0
- package/dist/src/prompts/utils.js.map +1 -1
- package/dist/src/providers/http.js +2 -2
- package/dist/src/providers/http.js.map +1 -1
- package/dist/src/providers.js +5 -5
- package/dist/src/providers.js.map +1 -1
- package/dist/src/redteam/constants.d.ts +1 -1
- package/dist/src/redteam/constants.d.ts.map +1 -1
- package/dist/src/redteam/constants.js +7 -5
- package/dist/src/redteam/constants.js.map +1 -1
- package/dist/src/redteam/eval/excessive-agency/llm_rubric-20240617.json +10 -0
- package/dist/src/redteam/eval/excessive-agency/llm_rubric-20240618.json +10 -0
- package/dist/src/redteam/eval/harmful/llm_rubric-20240723.json +10 -0
- package/dist/src/redteam/eval/harmful/llm_rubric-20240724.json +10 -0
- package/dist/src/redteam/graders.d.ts +2 -0
- package/dist/src/redteam/graders.d.ts.map +1 -1
- package/dist/src/redteam/graders.js +2 -0
- package/dist/src/redteam/graders.js.map +1 -1
- package/dist/src/redteam/plugins/index.d.ts.map +1 -1
- package/dist/src/redteam/plugins/index.js +1 -0
- package/dist/src/redteam/plugins/index.js.map +1 -1
- package/dist/src/redteam/plugins/religion.d.ts +6 -0
- package/dist/src/redteam/plugins/religion.d.ts.map +1 -0
- package/dist/src/redteam/plugins/religion.js +14 -0
- package/dist/src/redteam/plugins/religion.js.map +1 -0
- package/dist/src/server/routes/evalRoutes.d.ts +1 -0
- package/dist/src/server/routes/evalRoutes.d.ts.map +1 -0
- package/dist/src/server/routes/evalRoutes.js +2 -0
- package/dist/src/server/routes/evalRoutes.js.map +1 -0
- package/dist/src/server/server.d.ts +1 -0
- package/dist/src/server/server.d.ts.map +1 -1
- package/dist/src/server/server.js +70 -31
- package/dist/src/server/server.js.map +1 -1
- package/dist/src/share.d.ts +2 -2
- package/dist/src/share.d.ts.map +1 -1
- package/dist/src/share.js +93 -34
- package/dist/src/share.js.map +1 -1
- package/dist/src/table.d.ts +2 -2
- package/dist/src/table.d.ts.map +1 -1
- package/dist/src/table.js +3 -3
- package/dist/src/table.js.map +1 -1
- package/dist/src/types/index.d.ts +163 -11
- package/dist/src/types/index.d.ts.map +1 -1
- package/dist/src/types/index.js +21 -1
- package/dist/src/types/index.js.map +1 -1
- package/dist/src/util/config/load.d.ts.map +1 -1
- package/dist/src/util/config/load.js +2 -1
- package/dist/src/util/config/load.js.map +1 -1
- package/dist/src/util/config/manage.d.ts.map +1 -1
- package/dist/src/util/config/manage.js.map +1 -1
- package/dist/src/util/convertEvalResultsToTable.d.ts +16 -0
- package/dist/src/util/convertEvalResultsToTable.d.ts.map +1 -0
- package/dist/src/util/convertEvalResultsToTable.js +137 -0
- package/dist/src/util/convertEvalResultsToTable.js.map +1 -0
- package/dist/src/util/createHash.d.ts +1 -0
- package/dist/src/util/createHash.d.ts.map +1 -1
- package/dist/src/util/createHash.js +9 -0
- package/dist/src/util/createHash.js.map +1 -1
- package/dist/src/util/file.d.ts +8 -0
- package/dist/src/util/file.d.ts.map +1 -0
- package/dist/src/util/file.js +13 -0
- package/dist/src/util/file.js.map +1 -0
- package/dist/src/util/index.d.ts +9 -14
- package/dist/src/util/index.d.ts.map +1 -1
- package/dist/src/util/index.js +87 -223
- package/dist/src/util/index.js.map +1 -1
- package/dist/src/util/time.d.ts +2 -0
- package/dist/src/util/time.d.ts.map +1 -0
- package/dist/src/util/time.js +7 -0
- package/dist/src/util/time.js.map +1 -0
- package/dist/src/util/transform.js +2 -2
- package/dist/src/util/transform.js.map +1 -1
- package/dist/src/validators/providers.d.ts +6 -0
- package/dist/src/validators/providers.d.ts.map +1 -1
- package/dist/src/validators/providers.js +1 -0
- package/dist/src/validators/providers.js.map +1 -1
- package/dist/src/validators/redteam.d.ts +6 -0
- package/dist/src/validators/redteam.d.ts.map +1 -1
- package/dist/test/commands/eval/filterFailingTests.test.js +24 -2
- package/dist/test/commands/eval/filterFailingTests.test.js.map +1 -1
- package/dist/test/evaluator.test.js +153 -74
- package/dist/test/evaluator.test.js.map +1 -1
- package/dist/test/factories/data/eval/database_records.d.ts +142 -0
- package/dist/test/factories/data/eval/database_records.d.ts.map +1 -0
- package/dist/test/factories/data/eval/database_records.js +251 -0
- package/dist/test/factories/data/eval/database_records.js.map +1 -0
- package/dist/test/factories/evalFactory.d.ts +768 -0
- package/dist/test/factories/evalFactory.d.ts.map +1 -0
- package/dist/test/factories/evalFactory.js +121 -0
- package/dist/test/factories/evalFactory.js.map +1 -0
- package/dist/test/factories/index.d.ts +1 -0
- package/dist/test/factories/index.d.ts.map +1 -0
- package/dist/test/factories/index.js +2 -0
- package/dist/test/factories/index.js.map +1 -0
- package/dist/test/index.test.js +17 -33
- package/dist/test/index.test.js.map +1 -1
- package/dist/test/models/eval.test.d.ts +2 -0
- package/dist/test/models/eval.test.d.ts.map +1 -0
- package/dist/test/models/eval.test.js +34 -0
- package/dist/test/models/eval.test.js.map +1 -0
- package/dist/test/providers.test.js +3 -3
- package/dist/test/providers.test.js.map +1 -1
- package/dist/test/server/share.test.d.ts +2 -0
- package/dist/test/server/share.test.d.ts.map +1 -0
- package/dist/test/server/share.test.js +36 -0
- package/dist/test/server/share.test.js.map +1 -0
- package/dist/test/server/v3evalToShare.json +507 -0
- package/dist/test/server/v4evalToShare.json +421 -0
- package/dist/test/types.test.js +58 -0
- package/dist/test/types.test.js.map +1 -1
- package/dist/test/util.file.test.d.ts +2 -0
- package/dist/test/util.file.test.d.ts.map +1 -0
- package/dist/test/util.file.test.js +32 -0
- package/dist/test/util.file.test.js.map +1 -0
- package/dist/test/util.listPrevious.test.d.ts +2 -0
- package/dist/test/util.listPrevious.test.d.ts.map +1 -0
- package/dist/test/util.listPrevious.test.js +37 -0
- package/dist/test/util.listPrevious.test.js.map +1 -0
- package/dist/test/util.test.js +38 -311
- package/dist/test/util.test.js.map +1 -1
- package/dist/tsconfig.tsbuildinfo +1 -0
- package/package.json +10 -8
package/dist/src/util/index.d.ts
CHANGED
|
@@ -1,25 +1,19 @@
|
|
|
1
|
-
import
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
* @param filePath - The path of the file to check.
|
|
6
|
-
* @returns True if the file has a JavaScript or TypeScript extension, false otherwise.
|
|
7
|
-
*/
|
|
8
|
-
export declare function isJavascriptFile(filePath: string): boolean;
|
|
9
|
-
export declare function writeOutput(outputPath: string, evalId: string | null, results: EvaluateSummary, config: Partial<UnifiedConfig>, shareableUrl: string | null): Promise<void>;
|
|
10
|
-
export declare function writeMultipleOutputs(outputPaths: string[], evalId: string | null, results: EvaluateSummary, config: Partial<UnifiedConfig>, shareableUrl: string | null): Promise<void>;
|
|
1
|
+
import Eval from '../models/eval';
|
|
2
|
+
import { type EvalWithMetadata, type EvaluateResult, type EvaluateTable, type NunjucksFilterMap, type PromptWithMetadata, type ResultsFile, type TestCase, type TestCasesWithMetadata, type UnifiedConfig, type OutputFile, type CompletedPrompt, type ResultLightweight, type EvaluateSummaryV2 } from '../types';
|
|
3
|
+
export declare function writeOutput(outputPath: string, evalRecord: Eval, shareableUrl: string | null): Promise<void>;
|
|
4
|
+
export declare function writeMultipleOutputs(outputPaths: string[], evalRecord: Eval, shareableUrl: string | null): Promise<void>;
|
|
11
5
|
export declare function readOutput(outputPath: string): Promise<OutputFile>;
|
|
12
6
|
/**
|
|
13
7
|
* TODO(ian): Remove this
|
|
14
8
|
* @deprecated Use readLatestResults directly instead.
|
|
15
9
|
*/
|
|
16
10
|
export declare function getLatestResultsPath(): string;
|
|
17
|
-
export declare function writeResultsToDatabase(results:
|
|
11
|
+
export declare function writeResultsToDatabase(results: EvaluateSummaryV2, config: Partial<UnifiedConfig>, createdAt?: Date): Promise<string>;
|
|
18
12
|
/**
|
|
19
13
|
*
|
|
20
14
|
* @returns Last n evals in descending order.
|
|
21
15
|
*/
|
|
22
|
-
export declare function listPreviousResults(limit?: number, filterDescription?: string, datasetId?: string): ResultLightweight[]
|
|
16
|
+
export declare function listPreviousResults(limit?: number, filterDescription?: string, datasetId?: string): Promise<ResultLightweight[]>;
|
|
23
17
|
/**
|
|
24
18
|
* @deprecated Used only for migration to sqlite
|
|
25
19
|
*/
|
|
@@ -42,7 +36,6 @@ export declare function readResult_fileSystem(name: string): {
|
|
|
42
36
|
createdAt: Date;
|
|
43
37
|
} | undefined;
|
|
44
38
|
export declare function migrateResultsFromFileSystemToDatabase(): Promise<void>;
|
|
45
|
-
export declare function cleanupOldFileResults(remaining?: number): void;
|
|
46
39
|
export declare function readResult(id: string): Promise<{
|
|
47
40
|
id: string;
|
|
48
41
|
result: ResultsFile;
|
|
@@ -170,6 +163,7 @@ export declare function getTestCases(limit?: number): Promise<{
|
|
|
170
163
|
} | {
|
|
171
164
|
callApi: import("../types").CallApiFunction;
|
|
172
165
|
id: (...args: unknown[]) => string;
|
|
166
|
+
config?: any;
|
|
173
167
|
label?: string | undefined;
|
|
174
168
|
transform?: string | undefined;
|
|
175
169
|
delay?: number | undefined;
|
|
@@ -334,6 +328,7 @@ export declare function getDatasetFromHash(hash: string): Promise<{
|
|
|
334
328
|
} | {
|
|
335
329
|
callApi: import("../types").CallApiFunction;
|
|
336
330
|
id: (...args: unknown[]) => string;
|
|
331
|
+
config?: any;
|
|
337
332
|
label?: string | undefined;
|
|
338
333
|
transform?: string | undefined;
|
|
339
334
|
delay?: number | undefined;
|
|
@@ -412,7 +407,7 @@ export declare function getStandaloneEvals({ limit, tag, description, }?: {
|
|
|
412
407
|
value: string;
|
|
413
408
|
};
|
|
414
409
|
description?: string;
|
|
415
|
-
}): StandaloneEval[]
|
|
410
|
+
}): Promise<StandaloneEval[]>;
|
|
416
411
|
export declare function providerToIdentifier(provider: TestCase['provider']): string | undefined;
|
|
417
412
|
export declare function varsMatch(vars1: Record<string, string | string[] | object> | undefined, vars2: Record<string, string | string[] | object> | undefined): boolean;
|
|
418
413
|
export declare function resultIsForTestCase(result: EvaluateResult, testCase: TestCase): boolean;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/util/index.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/util/index.ts"],"names":[],"mappings":"AAgCA,OAAO,IAA+C,MAAM,gBAAgB,CAAC;AAE7E,OAAO,EACL,KAAK,gBAAgB,EACrB,KAAK,cAAc,EACnB,KAAK,aAAa,EAElB,KAAK,iBAAiB,EACtB,KAAK,kBAAkB,EACvB,KAAK,WAAW,EAChB,KAAK,QAAQ,EACb,KAAK,qBAAqB,EAE1B,KAAK,aAAa,EAClB,KAAK,UAAU,EACf,KAAK,eAAe,EAEpB,KAAK,iBAAiB,EAItB,KAAK,iBAAiB,EACvB,MAAM,UAAU,CAAC;AA6BlB,wBAAsB,WAAW,CAC/B,UAAU,EAAE,MAAM,EAClB,UAAU,EAAE,IAAI,EAChB,YAAY,EAAE,MAAM,GAAG,IAAI,iBAuF5B;AAED,wBAAsB,oBAAoB,CACxC,WAAW,EAAE,MAAM,EAAE,EACrB,UAAU,EAAE,IAAI,EAChB,YAAY,EAAE,MAAM,GAAG,IAAI,iBAK5B;AAED,wBAAsB,UAAU,CAAC,UAAU,EAAE,MAAM,GAAG,OAAO,CAAC,UAAU,CAAC,CASxE;AAED;;;GAGG;AACH,wBAAgB,oBAAoB,IAAI,MAAM,CAE7C;AAED,wBAAsB,sBAAsB,CAC1C,OAAO,EAAE,iBAAiB,EAC1B,MAAM,EAAE,OAAO,CAAC,aAAa,CAAC,EAC9B,SAAS,GAAE,IAAiB,GAC3B,OAAO,CAAC,MAAM,CAAC,CA8HjB;AAED;;;GAGG;AACH,wBAAsB,mBAAmB,CACvC,KAAK,GAAE,MAA4B,EACnC,iBAAiB,CAAC,EAAE,MAAM,EAC1B,SAAS,CAAC,EAAE,MAAM,GACjB,OAAO,CAAC,iBAAiB,EAAE,CAAC,CAqC9B;AAED;;GAEG;AACH,wBAAgB,sCAAsC,IAAI,MAAM,EAAE,CAYjE;AAID;;GAEG;AACH,wBAAgB,8BAA8B,IAAI;IAAE,QAAQ,EAAE,MAAM,CAAC;IAAC,WAAW,CAAC,EAAE,MAAM,CAAA;CAAE,EAAE,CAqB7F;AAED,wBAAgB,cAAc,CAAC,QAAQ,EAAE,MAAM,QAqB9C;AAED,wBAAgB,cAAc,CAAC,IAAI,EAAE,IAAI,UAExC;AAED;;GAEG;AACH,wBAAgB,qBAAqB,CACnC,IAAI,EAAE,MAAM,GACX;IAAE,EAAE,EAAE,MAAM,CAAC;IAAC,MAAM,EAAE,WAAW,CAAC;IAAC,SAAS,EAAE,IAAI,CAAA;CAAE,GAAG,SAAS,CAgBlE;AAED,wBAAsB,sCAAsC,kBAI3D;AAED,wBAAsB,UAAU,CAC9B,EAAE,EAAE,MAAM,GACT,OAAO,CAAC;IAAE,EAAE,EAAE,MAAM,CAAC;IAAC,MAAM,EAAE,WAAW,CAAC;IAAC,SAAS,EAAE,IAAI,CAAA;CAAE,GAAG,SAAS,CAAC,CAY3E;AAED,wBAAsB,YAAY,CAChC,EAAE,EAAE,MAAM,EACV,SAAS,CAAC,EAAE,OAAO,CAAC,aAAa,CAAC,EAClC,QAAQ,CAAC,EAAE,aAAa,GACvB,OAAO,CAAC,IAAI,CAAC,CAuBf;AAED,wBAAsB,aAAa,CAAC,iBAAiB,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,GAAG,SAAS,CAAC,CAGhG;AAED,wBAAsB,uBAAuB,CAC3C,SAAS,EAAE,CAAC,MAAM,EAAE,WAAW,KAAK,OAAO,EAC3C,KAAK,EAAE,MAAM,GACZ,OAAO,CAAC,kBAAkB,EAAE,CAAC,CAiD/B;AAED,wBAAgB,0BAA0B,CACxC,eAAe,EAAE,MAAM,EACvB,KAAK,GAAE,MAA4B,iCAOpC;AAED,wBAAgB,sBAAsB,CAAC,SAAS,EAAE,QAAQ,EAAE,iCAI3D;AAED,wBAAsB,yBAAyB,CAC7C,SAAS,EAAE,CAAC,MAAM,EAAE,WAAW,KAAK,OAAO,EAC3C,KAAK,EAAE,MAAM,GACZ,OAAO,CAAC,qBAAqB,EAAE,CAAC,CAuDlC;AAED,wBAAgB,UAAU,CAAC,KAAK,GAAE,MAA4B,iCAE7D;AAED,wBAAsB,YAAY,CAAC,KAAK,GAAE,MAA4B;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;KAErE;AAED,wBAAsB,iBAAiB,CAAC,IAAI,EAAE,MAAM,2CAQnD;AAED,wBAAsB,kBAAkB,CAAC,IAAI,EAAE,MAAM;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;eAQpD;AAED,wBAAsB,qBAAqB,CACzC,SAAS,EAAE,CAAC,MAAM,EAAE,WAAW,KAAK,OAAO,EAC3C,KAAK,EAAE,MAAM,GACZ,OAAO,CAAC,gBAAgB,EAAE,CAAC,CA0C7B;AAED,wBAAsB,QAAQ,CAAC,KAAK,GAAE,MAA4B,+BAEjE;AAED,wBAAsB,aAAa,CAAC,IAAI,EAAE,MAAM,yCAQ/C;AAED,wBAAsB,UAAU,CAAC,MAAM,EAAE,MAAM,iBAe9C;AAED;;;;GAIG;AACH,wBAAsB,cAAc,IAAI,OAAO,CAAC,IAAI,CAAC,CAQpD;AAED,wBAAsB,WAAW,CAC/B,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,EAC/B,QAAQ,GAAE,MAAW,GACpB,OAAO,CAAC,iBAAiB,CAAC,CAa5B;AAED,wBAAgB,WAAW,SAG1B;AAED,wBAAgB,QAAQ,CAAC,OAAO,EAAE,MAAM,GAAG,SAAS,QAOnD;AAED,MAAM,MAAM,cAAc,GAAG,eAAe,GAAG;IAC7C,MAAM,EAAE,MAAM,CAAC;IACf,WAAW,EAAE,MAAM,GAAG,IAAI,CAAC;IAC3B,SAAS,EAAE,MAAM,GAAG,IAAI,CAAC;IACzB,QAAQ,EAAE,MAAM,GAAG,IAAI,CAAC;IACxB,SAAS,EAAE,OAAO,CAAC;IACnB,SAAS,EAAE,MAAM,CAAC;IAElB,eAAe,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACxC,eAAe,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CACzC,CAAC;AAIF,wBAAsB,kBAAkB,CAAC,EACvC,KAA2B,EAC3B,GAAG,EACH,WAAW,GACZ,GAAE;IACD,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,GAAG,CAAC,EAAE;QAAE,GAAG,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAA;KAAE,CAAC;IACrC,WAAW,CAAC,EAAE,MAAM,CAAC;CACjB,GAAG,OAAO,CAAC,cAAc,EAAE,CAAC,CA0FjC;AAED,wBAAgB,oBAAoB,CAAC,QAAQ,EAAE,QAAQ,CAAC,UAAU,CAAC,GAAG,MAAM,GAAG,SAAS,CASvF;AAED,wBAAgB,SAAS,CACvB,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,EAAE,GAAG,MAAM,CAAC,GAAG,SAAS,EAC7D,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,EAAE,GAAG,MAAM,CAAC,GAAG,SAAS,WAG9D;AAED,wBAAgB,mBAAmB,CAAC,MAAM,EAAE,cAAc,EAAE,QAAQ,EAAE,QAAQ,GAAG,OAAO,CAMvF;AAED,wBAAgB,kBAAkB,CAAC,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,IAAI,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,CAAC,GAAG,CAAC,CAsBvF;AAED;;;;;;;GAOG;AACH,wBAAgB,eAAe,CAC7B,QAAQ,EAAE,MAAM,EAChB,UAAU,EAAE,MAAM,GACjB;IACD,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,aAAa,EAAE,OAAO,CAAC;IACvB,QAAQ,EAAE,MAAM,CAAC;CAClB,CAoCA;AAED;;;;;;;;;;;GAWG;AACH,wBAAgB,yBAAyB,CAAC,QAAQ,EAAE,MAAM,GAAG,MAAM,GAAG,QAAQ,GAAG,SAAS,GAAG,IAAI,OA+BhG"}
|
package/dist/src/util/index.js
CHANGED
|
@@ -26,7 +26,6 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
|
26
26
|
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
27
27
|
};
|
|
28
28
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
29
|
-
exports.isJavascriptFile = isJavascriptFile;
|
|
30
29
|
exports.writeOutput = writeOutput;
|
|
31
30
|
exports.writeMultipleOutputs = writeMultipleOutputs;
|
|
32
31
|
exports.readOutput = readOutput;
|
|
@@ -39,7 +38,6 @@ exports.filenameToDate = filenameToDate;
|
|
|
39
38
|
exports.dateToFilename = dateToFilename;
|
|
40
39
|
exports.readResult_fileSystem = readResult_fileSystem;
|
|
41
40
|
exports.migrateResultsFromFileSystemToDatabase = migrateResultsFromFileSystemToDatabase;
|
|
42
|
-
exports.cleanupOldFileResults = cleanupOldFileResults;
|
|
43
41
|
exports.readResult = readResult;
|
|
44
42
|
exports.updateResult = updateResult;
|
|
45
43
|
exports.getLatestEval = getLatestEval;
|
|
@@ -88,21 +86,14 @@ const accounts_1 = require("../globalConfig/accounts");
|
|
|
88
86
|
const googleSheets_1 = require("../googleSheets");
|
|
89
87
|
const logger_1 = __importDefault(require("../logger"));
|
|
90
88
|
const migrate_1 = require("../migrate");
|
|
89
|
+
const eval_1 = __importStar(require("../models/eval"));
|
|
91
90
|
const prompt_1 = require("../models/prompt");
|
|
92
91
|
const types_1 = require("../types");
|
|
93
92
|
const manage_1 = require("./config/manage");
|
|
94
93
|
const createHash_1 = require("./createHash");
|
|
94
|
+
const file_1 = require("./file");
|
|
95
95
|
const templates_1 = require("./templates");
|
|
96
96
|
const DEFAULT_QUERY_LIMIT = 100;
|
|
97
|
-
/**
|
|
98
|
-
* Checks if a file is a JavaScript or TypeScript file based on its extension.
|
|
99
|
-
*
|
|
100
|
-
* @param filePath - The path of the file to check.
|
|
101
|
-
* @returns True if the file has a JavaScript or TypeScript extension, false otherwise.
|
|
102
|
-
*/
|
|
103
|
-
function isJavascriptFile(filePath) {
|
|
104
|
-
return /\.(js|cjs|mjs|ts|cts|mts)$/.test(filePath);
|
|
105
|
-
}
|
|
106
97
|
const outputToSimpleString = (output) => {
|
|
107
98
|
const passFailText = output.pass ? '[PASS]' : '[FAIL]';
|
|
108
99
|
const namedScoresText = Object.entries(output.namedScores)
|
|
@@ -122,14 +113,16 @@ const outputToSimpleString = (output) => {
|
|
|
122
113
|
${gradingResultText}
|
|
123
114
|
`.trim();
|
|
124
115
|
};
|
|
125
|
-
async function writeOutput(outputPath,
|
|
116
|
+
async function writeOutput(outputPath, evalRecord, shareableUrl) {
|
|
117
|
+
const table = await evalRecord.getTable();
|
|
118
|
+
(0, tiny_invariant_1.default)(table, 'Table is required');
|
|
126
119
|
if (outputPath.match(/^https:\/\/docs\.google\.com\/spreadsheets\//)) {
|
|
127
|
-
const rows =
|
|
120
|
+
const rows = table.body.map((row) => {
|
|
128
121
|
const csvRow = {};
|
|
129
|
-
|
|
122
|
+
table.head.vars.forEach((varName, index) => {
|
|
130
123
|
csvRow[varName] = row.vars[index];
|
|
131
124
|
});
|
|
132
|
-
|
|
125
|
+
table.head.prompts.forEach((prompt, index) => {
|
|
133
126
|
csvRow[prompt.label] = outputToSimpleString(row.outputs[index]);
|
|
134
127
|
});
|
|
135
128
|
return csvRow;
|
|
@@ -148,38 +141,51 @@ async function writeOutput(outputPath, evalId, results, config, shareableUrl) {
|
|
|
148
141
|
if (outputExtension === 'csv') {
|
|
149
142
|
const csvOutput = (0, sync_1.stringify)([
|
|
150
143
|
[
|
|
151
|
-
...
|
|
152
|
-
...
|
|
144
|
+
...table.head.vars,
|
|
145
|
+
...table.head.prompts.map((prompt) => `[${prompt.provider}] ${prompt.label}`),
|
|
153
146
|
],
|
|
154
|
-
...
|
|
147
|
+
...table.body.map((row) => [...row.vars, ...row.outputs.map(outputToSimpleString)]),
|
|
155
148
|
]);
|
|
156
149
|
fs.writeFileSync(outputPath, csvOutput);
|
|
157
150
|
}
|
|
158
151
|
else if (outputExtension === 'json') {
|
|
159
|
-
|
|
152
|
+
const summary = await evalRecord.toEvaluateSummary();
|
|
153
|
+
fs.writeFileSync(outputPath, JSON.stringify({
|
|
154
|
+
evalId: evalRecord.id,
|
|
155
|
+
results: summary,
|
|
156
|
+
config: evalRecord.config,
|
|
157
|
+
shareableUrl,
|
|
158
|
+
}, null, 2));
|
|
160
159
|
}
|
|
161
160
|
else if (outputExtension === 'yaml' || outputExtension === 'yml' || outputExtension === 'txt') {
|
|
162
|
-
|
|
161
|
+
const summary = await evalRecord.toEvaluateSummary();
|
|
162
|
+
fs.writeFileSync(outputPath, js_yaml_1.default.dump({
|
|
163
|
+
evalId: evalRecord.id,
|
|
164
|
+
results: summary,
|
|
165
|
+
config: evalRecord.config,
|
|
166
|
+
shareableUrl,
|
|
167
|
+
}));
|
|
163
168
|
}
|
|
164
169
|
else if (outputExtension === 'html') {
|
|
170
|
+
const summary = await evalRecord.toEvaluateSummary();
|
|
165
171
|
const template = fs.readFileSync(`${(0, esm_1.getDirectory)()}/tableOutput.html`, 'utf-8');
|
|
166
|
-
const
|
|
172
|
+
const htmlTable = [
|
|
167
173
|
[
|
|
168
|
-
...
|
|
169
|
-
...
|
|
174
|
+
...table.head.vars,
|
|
175
|
+
...table.head.prompts.map((prompt) => `[${prompt.provider}] ${prompt.label}`),
|
|
170
176
|
],
|
|
171
|
-
...
|
|
177
|
+
...table.body.map((row) => [...row.vars, ...row.outputs.map(outputToSimpleString)]),
|
|
172
178
|
];
|
|
173
179
|
const htmlOutput = (0, templates_1.getNunjucksEngine)().renderString(template, {
|
|
174
|
-
config,
|
|
175
|
-
table,
|
|
176
|
-
results:
|
|
180
|
+
config: evalRecord.config,
|
|
181
|
+
table: htmlTable,
|
|
182
|
+
results: summary,
|
|
177
183
|
});
|
|
178
184
|
fs.writeFileSync(outputPath, htmlOutput);
|
|
179
185
|
}
|
|
180
186
|
}
|
|
181
|
-
async function writeMultipleOutputs(outputPaths,
|
|
182
|
-
await Promise.all(outputPaths.map((outputPath) => writeOutput(outputPath,
|
|
187
|
+
async function writeMultipleOutputs(outputPaths, evalRecord, shareableUrl) {
|
|
188
|
+
await Promise.all(outputPaths.map((outputPath) => writeOutput(outputPath, evalRecord, shareableUrl)));
|
|
183
189
|
}
|
|
184
190
|
async function readOutput(outputPath) {
|
|
185
191
|
const ext = path.parse(outputPath).ext.slice(1);
|
|
@@ -197,9 +203,9 @@ async function readOutput(outputPath) {
|
|
|
197
203
|
function getLatestResultsPath() {
|
|
198
204
|
return path.join((0, manage_1.getConfigDirectoryPath)(), 'output', 'latest.json');
|
|
199
205
|
}
|
|
200
|
-
async function writeResultsToDatabase(results, config, createdAt) {
|
|
206
|
+
async function writeResultsToDatabase(results, config, createdAt = new Date()) {
|
|
201
207
|
createdAt = createdAt || (results.timestamp ? new Date(results.timestamp) : new Date());
|
|
202
|
-
const evalId =
|
|
208
|
+
const evalId = (0, eval_1.createEvalId)(createdAt);
|
|
203
209
|
const db = (0, database_1.getDb)();
|
|
204
210
|
const promises = [];
|
|
205
211
|
promises.push(db
|
|
@@ -216,9 +222,10 @@ async function writeResultsToDatabase(results, config, createdAt) {
|
|
|
216
222
|
.run());
|
|
217
223
|
logger_1.default.debug(`Inserting eval ${evalId}`);
|
|
218
224
|
// Record prompt relation
|
|
225
|
+
(0, tiny_invariant_1.default)(results.table, 'Table is required');
|
|
219
226
|
for (const prompt of results.table.head.prompts) {
|
|
220
227
|
const label = prompt.label || prompt.display || prompt.raw;
|
|
221
|
-
const promptId =
|
|
228
|
+
const promptId = (0, prompt_1.generateIdFromPrompt)(prompt);
|
|
222
229
|
promises.push(db
|
|
223
230
|
.insert(tables_1.prompts)
|
|
224
231
|
.values({
|
|
@@ -297,7 +304,7 @@ async function writeResultsToDatabase(results, config, createdAt) {
|
|
|
297
304
|
*
|
|
298
305
|
* @returns Last n evals in descending order.
|
|
299
306
|
*/
|
|
300
|
-
function listPreviousResults(limit = DEFAULT_QUERY_LIMIT, filterDescription, datasetId) {
|
|
307
|
+
async function listPreviousResults(limit = DEFAULT_QUERY_LIMIT, filterDescription, datasetId) {
|
|
301
308
|
const db = (0, database_1.getDb)();
|
|
302
309
|
const startTime = performance.now();
|
|
303
310
|
const query = db
|
|
@@ -310,7 +317,7 @@ function listPreviousResults(limit = DEFAULT_QUERY_LIMIT, filterDescription, dat
|
|
|
310
317
|
})
|
|
311
318
|
.from(tables_1.evals)
|
|
312
319
|
.leftJoin(tables_1.evalsToDatasets, (0, drizzle_orm_1.eq)(tables_1.evals.id, tables_1.evalsToDatasets.evalId))
|
|
313
|
-
.where((0, drizzle_orm_1.and)(datasetId ? (0, drizzle_orm_1.eq)(tables_1.evalsToDatasets.datasetId, datasetId) : undefined, filterDescription ? (0, drizzle_orm_1.like)(tables_1.evals.description, `%${filterDescription}%`) : undefined));
|
|
320
|
+
.where((0, drizzle_orm_1.and)(datasetId ? (0, drizzle_orm_1.eq)(tables_1.evalsToDatasets.datasetId, datasetId) : undefined, filterDescription ? (0, drizzle_orm_1.like)(tables_1.evals.description, `%${filterDescription}%`) : undefined, (0, drizzle_orm_1.not)((0, drizzle_orm_1.eq)(tables_1.evals.results, {}))));
|
|
314
321
|
const results = query.orderBy((0, drizzle_orm_1.desc)(tables_1.evals.createdAt)).limit(limit).all();
|
|
315
322
|
const mappedResults = results.map((result) => ({
|
|
316
323
|
evalId: result.evalId,
|
|
@@ -321,8 +328,10 @@ function listPreviousResults(limit = DEFAULT_QUERY_LIMIT, filterDescription, dat
|
|
|
321
328
|
}));
|
|
322
329
|
const endTime = performance.now();
|
|
323
330
|
const executionTime = endTime - startTime;
|
|
331
|
+
const evalResults = await (0, eval_1.getSummaryofLatestEvals)(undefined, filterDescription, datasetId);
|
|
324
332
|
logger_1.default.debug(`listPreviousResults execution time: ${executionTime.toFixed(2)}ms`);
|
|
325
|
-
|
|
333
|
+
const combinedResults = [...evalResults, ...mappedResults];
|
|
334
|
+
return combinedResults;
|
|
326
335
|
}
|
|
327
336
|
/**
|
|
328
337
|
* @deprecated Used only for migration to sqlite
|
|
@@ -409,100 +418,19 @@ function readResult_fileSystem(name) {
|
|
|
409
418
|
logger_1.default.error(`Failed to read results from ${resultsPath}:\n${err}`);
|
|
410
419
|
}
|
|
411
420
|
}
|
|
412
|
-
let attemptedMigration = false;
|
|
413
421
|
async function migrateResultsFromFileSystemToDatabase() {
|
|
414
|
-
if (attemptedMigration) {
|
|
415
|
-
// TODO(ian): Record this bit in the database.
|
|
416
|
-
return;
|
|
417
|
-
}
|
|
418
422
|
// First run db migrations
|
|
419
423
|
logger_1.default.debug('Running db migrations...');
|
|
420
424
|
await (0, migrate_1.runDbMigrations)();
|
|
421
|
-
const fileNames = listPreviousResultFilenames_fileSystem();
|
|
422
|
-
if (fileNames.length === 0) {
|
|
423
|
-
return;
|
|
424
|
-
}
|
|
425
|
-
logger_1.default.info(`🔁 Migrating ${fileNames.length} flat files to local database.`);
|
|
426
|
-
logger_1.default.info('This is a one-time operation and may take a minute...');
|
|
427
|
-
attemptedMigration = true;
|
|
428
|
-
const outputDir = path.join((0, manage_1.getConfigDirectoryPath)(true /* createIfNotExists */), 'output');
|
|
429
|
-
const backupDir = `${outputDir}-backup-${new Date()
|
|
430
|
-
.toISOString()
|
|
431
|
-
.slice(0, 10)
|
|
432
|
-
.replace(/-/g, '')}`;
|
|
433
|
-
try {
|
|
434
|
-
fs.cpSync(outputDir, backupDir, { recursive: true });
|
|
435
|
-
logger_1.default.info(`Backup of output directory created at ${backupDir}`);
|
|
436
|
-
}
|
|
437
|
-
catch (backupError) {
|
|
438
|
-
logger_1.default.error(`Failed to create backup of output directory: ${backupError}`);
|
|
439
|
-
return;
|
|
440
|
-
}
|
|
441
|
-
logger_1.default.info('Moving files into database...');
|
|
442
|
-
const migrationPromises = fileNames.map(async (fileName) => {
|
|
443
|
-
const fileData = readResult_fileSystem(fileName);
|
|
444
|
-
if (fileData) {
|
|
445
|
-
await writeResultsToDatabase(fileData.result.results, fileData.result.config, filenameToDate(fileName));
|
|
446
|
-
logger_1.default.debug(`Migrated ${fileName} to database.`);
|
|
447
|
-
try {
|
|
448
|
-
fs.unlinkSync(path.join(outputDir, fileName));
|
|
449
|
-
}
|
|
450
|
-
catch (err) {
|
|
451
|
-
logger_1.default.warn(`Failed to delete ${fileName} after migration: ${err}`);
|
|
452
|
-
}
|
|
453
|
-
}
|
|
454
|
-
else {
|
|
455
|
-
logger_1.default.warn(`Failed to migrate result ${fileName} due to read error.`);
|
|
456
|
-
}
|
|
457
|
-
});
|
|
458
|
-
await Promise.all(migrationPromises);
|
|
459
|
-
try {
|
|
460
|
-
fs.unlinkSync(getLatestResultsPath());
|
|
461
|
-
}
|
|
462
|
-
catch (err) {
|
|
463
|
-
logger_1.default.warn(`Failed to delete latest.json: ${err}`);
|
|
464
|
-
}
|
|
465
|
-
logger_1.default.info('Migration complete. Please restart your web server if it is running.');
|
|
466
|
-
}
|
|
467
|
-
const RESULT_HISTORY_LENGTH = (0, envars_1.getEnvInt)('RESULT_HISTORY_LENGTH', DEFAULT_QUERY_LIMIT);
|
|
468
|
-
function cleanupOldFileResults(remaining = RESULT_HISTORY_LENGTH) {
|
|
469
|
-
const sortedFilenames = listPreviousResultFilenames_fileSystem();
|
|
470
|
-
for (let i = 0; i < sortedFilenames.length - remaining; i++) {
|
|
471
|
-
fs.unlinkSync(path.join((0, manage_1.getConfigDirectoryPath)(), 'output', sortedFilenames[i]));
|
|
472
|
-
}
|
|
473
425
|
}
|
|
474
426
|
async function readResult(id) {
|
|
475
|
-
const db = (0, database_1.getDb)();
|
|
476
427
|
try {
|
|
477
|
-
const
|
|
478
|
-
|
|
479
|
-
id: tables_1.evals.id,
|
|
480
|
-
createdAt: tables_1.evals.createdAt,
|
|
481
|
-
author: tables_1.evals.author,
|
|
482
|
-
results: tables_1.evals.results,
|
|
483
|
-
config: tables_1.evals.config,
|
|
484
|
-
datasetId: tables_1.evalsToDatasets.datasetId,
|
|
485
|
-
})
|
|
486
|
-
.from(tables_1.evals)
|
|
487
|
-
.leftJoin(tables_1.evalsToDatasets, (0, drizzle_orm_1.eq)(tables_1.evals.id, tables_1.evalsToDatasets.evalId))
|
|
488
|
-
.where((0, drizzle_orm_1.eq)(tables_1.evals.id, id))
|
|
489
|
-
.execute();
|
|
490
|
-
if (evalResult.length === 0) {
|
|
491
|
-
return undefined;
|
|
492
|
-
}
|
|
493
|
-
const { id: resultId, createdAt, results, config, author, datasetId } = evalResult[0];
|
|
494
|
-
const result = {
|
|
495
|
-
version: 3,
|
|
496
|
-
createdAt: new Date(createdAt).toISOString().slice(0, 10),
|
|
497
|
-
author,
|
|
498
|
-
results,
|
|
499
|
-
config,
|
|
500
|
-
datasetId,
|
|
501
|
-
};
|
|
428
|
+
const eval_ = await eval_1.default.findById(id);
|
|
429
|
+
(0, tiny_invariant_1.default)(eval_, `Eval with ID ${id} not found.`);
|
|
502
430
|
return {
|
|
503
|
-
id
|
|
504
|
-
result,
|
|
505
|
-
createdAt: new Date(createdAt),
|
|
431
|
+
id,
|
|
432
|
+
result: await eval_.toResultsFile(),
|
|
433
|
+
createdAt: new Date(eval_.createdAt),
|
|
506
434
|
};
|
|
507
435
|
}
|
|
508
436
|
catch (err) {
|
|
@@ -510,38 +438,20 @@ async function readResult(id) {
|
|
|
510
438
|
}
|
|
511
439
|
}
|
|
512
440
|
async function updateResult(id, newConfig, newTable) {
|
|
513
|
-
const db = (0, database_1.getDb)();
|
|
514
441
|
try {
|
|
515
442
|
// Fetch the existing eval data from the database
|
|
516
|
-
const existingEval = await
|
|
517
|
-
|
|
518
|
-
config: tables_1.evals.config,
|
|
519
|
-
results: tables_1.evals.results,
|
|
520
|
-
})
|
|
521
|
-
.from(tables_1.evals)
|
|
522
|
-
.where((0, drizzle_orm_1.eq)(tables_1.evals.id, id))
|
|
523
|
-
.limit(1)
|
|
524
|
-
.all();
|
|
525
|
-
if (existingEval.length === 0) {
|
|
443
|
+
const existingEval = await eval_1.default.findById(id);
|
|
444
|
+
if (!existingEval) {
|
|
526
445
|
logger_1.default.error(`Eval with ID ${id} not found.`);
|
|
527
446
|
return;
|
|
528
447
|
}
|
|
529
|
-
const evalData = existingEval[0];
|
|
530
448
|
if (newConfig) {
|
|
531
|
-
|
|
449
|
+
existingEval.config = newConfig;
|
|
532
450
|
}
|
|
533
451
|
if (newTable) {
|
|
534
|
-
|
|
452
|
+
existingEval.setTable(newTable);
|
|
535
453
|
}
|
|
536
|
-
await
|
|
537
|
-
.update(tables_1.evals)
|
|
538
|
-
.set({
|
|
539
|
-
description: evalData.config.description,
|
|
540
|
-
config: evalData.config,
|
|
541
|
-
results: evalData.results,
|
|
542
|
-
})
|
|
543
|
-
.where((0, drizzle_orm_1.eq)(tables_1.evals.id, id))
|
|
544
|
-
.run();
|
|
454
|
+
await existingEval.save();
|
|
545
455
|
logger_1.default.info(`Updated eval with ID ${id}`);
|
|
546
456
|
}
|
|
547
457
|
catch (err) {
|
|
@@ -549,61 +459,18 @@ async function updateResult(id, newConfig, newTable) {
|
|
|
549
459
|
}
|
|
550
460
|
}
|
|
551
461
|
async function getLatestEval(filterDescription) {
|
|
552
|
-
const
|
|
553
|
-
|
|
554
|
-
.select({
|
|
555
|
-
id: tables_1.evals.id,
|
|
556
|
-
createdAt: tables_1.evals.createdAt,
|
|
557
|
-
author: tables_1.evals.author,
|
|
558
|
-
description: tables_1.evals.description,
|
|
559
|
-
results: tables_1.evals.results,
|
|
560
|
-
config: tables_1.evals.config,
|
|
561
|
-
})
|
|
562
|
-
.from(tables_1.evals)
|
|
563
|
-
.orderBy((0, drizzle_orm_1.desc)(tables_1.evals.createdAt))
|
|
564
|
-
.limit(1);
|
|
565
|
-
if (filterDescription) {
|
|
566
|
-
const regex = new RegExp(filterDescription, 'i');
|
|
567
|
-
latestResults = latestResults.filter((result) => regex.test(result.description || ''));
|
|
568
|
-
}
|
|
569
|
-
if (!latestResults.length) {
|
|
570
|
-
return undefined;
|
|
571
|
-
}
|
|
572
|
-
const latestResult = latestResults[0];
|
|
573
|
-
return {
|
|
574
|
-
version: 3,
|
|
575
|
-
createdAt: new Date(latestResult.createdAt).toISOString(),
|
|
576
|
-
author: latestResult.author,
|
|
577
|
-
results: latestResult.results,
|
|
578
|
-
config: latestResult.config,
|
|
579
|
-
};
|
|
462
|
+
const eval_ = await eval_1.default.latest();
|
|
463
|
+
return await eval_?.toResultsFile();
|
|
580
464
|
}
|
|
581
465
|
async function getPromptsWithPredicate(predicate, limit) {
|
|
582
466
|
// TODO(ian): Make this use a proper database query
|
|
583
|
-
const
|
|
584
|
-
const evals_ = await db
|
|
585
|
-
.select({
|
|
586
|
-
id: tables_1.evals.id,
|
|
587
|
-
createdAt: tables_1.evals.createdAt,
|
|
588
|
-
author: tables_1.evals.author,
|
|
589
|
-
results: tables_1.evals.results,
|
|
590
|
-
config: tables_1.evals.config,
|
|
591
|
-
})
|
|
592
|
-
.from(tables_1.evals)
|
|
593
|
-
.limit(limit)
|
|
594
|
-
.all();
|
|
467
|
+
const evals_ = await eval_1.default.getMany(limit);
|
|
595
468
|
const groupedPrompts = {};
|
|
596
469
|
for (const eval_ of evals_) {
|
|
597
470
|
const createdAt = new Date(eval_.createdAt).toISOString();
|
|
598
|
-
const resultWrapper =
|
|
599
|
-
version: 3,
|
|
600
|
-
createdAt,
|
|
601
|
-
author: eval_.author,
|
|
602
|
-
results: eval_.results,
|
|
603
|
-
config: eval_.config,
|
|
604
|
-
};
|
|
471
|
+
const resultWrapper = await eval_.toResultsFile();
|
|
605
472
|
if (predicate(resultWrapper)) {
|
|
606
|
-
for (const prompt of
|
|
473
|
+
for (const prompt of eval_.getPrompts()) {
|
|
607
474
|
const promptId = (0, createHash_1.sha256)(prompt.raw);
|
|
608
475
|
const datasetId = resultWrapper.config.tests
|
|
609
476
|
? (0, createHash_1.sha256)(JSON.stringify(resultWrapper.config.tests))
|
|
@@ -651,29 +518,11 @@ function getPromptsForTestCases(testCases) {
|
|
|
651
518
|
return getPromptsForTestCasesHash(testCasesSha256);
|
|
652
519
|
}
|
|
653
520
|
async function getTestCasesWithPredicate(predicate, limit) {
|
|
654
|
-
const
|
|
655
|
-
const evals_ = await db
|
|
656
|
-
.select({
|
|
657
|
-
id: tables_1.evals.id,
|
|
658
|
-
createdAt: tables_1.evals.createdAt,
|
|
659
|
-
author: tables_1.evals.author,
|
|
660
|
-
results: tables_1.evals.results,
|
|
661
|
-
config: tables_1.evals.config,
|
|
662
|
-
})
|
|
663
|
-
.from(tables_1.evals)
|
|
664
|
-
.orderBy((0, drizzle_orm_1.desc)(tables_1.evals.createdAt))
|
|
665
|
-
.limit(limit)
|
|
666
|
-
.all();
|
|
521
|
+
const evals_ = await eval_1.default.getMany(limit);
|
|
667
522
|
const groupedTestCases = {};
|
|
668
523
|
for (const eval_ of evals_) {
|
|
669
524
|
const createdAt = new Date(eval_.createdAt).toISOString();
|
|
670
|
-
const resultWrapper =
|
|
671
|
-
version: 3,
|
|
672
|
-
createdAt,
|
|
673
|
-
author: eval_.author,
|
|
674
|
-
results: eval_.results,
|
|
675
|
-
config: eval_.config,
|
|
676
|
-
};
|
|
525
|
+
const resultWrapper = await eval_.toResultsFile();
|
|
677
526
|
const testCases = resultWrapper.config.tests;
|
|
678
527
|
if (testCases && predicate(resultWrapper)) {
|
|
679
528
|
const evalId = eval_.id;
|
|
@@ -681,7 +530,7 @@ async function getTestCasesWithPredicate(predicate, limit) {
|
|
|
681
530
|
if (datasetId in groupedTestCases) {
|
|
682
531
|
groupedTestCases[datasetId].recentEvalDate = new Date(Math.max(groupedTestCases[datasetId].recentEvalDate.getTime(), eval_.createdAt));
|
|
683
532
|
groupedTestCases[datasetId].count += 1;
|
|
684
|
-
const newPrompts =
|
|
533
|
+
const newPrompts = eval_.getPrompts().map((prompt) => ({
|
|
685
534
|
id: (0, createHash_1.sha256)(prompt.raw),
|
|
686
535
|
prompt,
|
|
687
536
|
evalId,
|
|
@@ -695,7 +544,7 @@ async function getTestCasesWithPredicate(predicate, limit) {
|
|
|
695
544
|
groupedTestCases[datasetId].prompts = Object.values(promptsById);
|
|
696
545
|
}
|
|
697
546
|
else {
|
|
698
|
-
const newPrompts =
|
|
547
|
+
const newPrompts = eval_.getPrompts().map((prompt) => ({
|
|
699
548
|
id: (0, createHash_1.sha256)(prompt.raw),
|
|
700
549
|
prompt,
|
|
701
550
|
evalId,
|
|
@@ -765,6 +614,7 @@ async function getEvalsWithPredicate(predicate, limit) {
|
|
|
765
614
|
version: 3,
|
|
766
615
|
createdAt,
|
|
767
616
|
author: eval_.author,
|
|
617
|
+
// @ts-ignore
|
|
768
618
|
results: eval_.results,
|
|
769
619
|
config: eval_.config,
|
|
770
620
|
};
|
|
@@ -774,6 +624,7 @@ async function getEvalsWithPredicate(predicate, limit) {
|
|
|
774
624
|
id: evalId,
|
|
775
625
|
date: new Date(eval_.createdAt),
|
|
776
626
|
config: eval_.config,
|
|
627
|
+
// @ts-ignore
|
|
777
628
|
results: eval_.results,
|
|
778
629
|
description: eval_.description || undefined,
|
|
779
630
|
});
|
|
@@ -799,6 +650,9 @@ async function deleteEval(evalId) {
|
|
|
799
650
|
// We need to clean up foreign keys first. We don't have onDelete: 'cascade' set on all these relationships.
|
|
800
651
|
await db.delete(tables_1.evalsToPrompts).where((0, drizzle_orm_1.eq)(tables_1.evalsToPrompts.evalId, evalId)).run();
|
|
801
652
|
await db.delete(tables_1.evalsToDatasets).where((0, drizzle_orm_1.eq)(tables_1.evalsToDatasets.evalId, evalId)).run();
|
|
653
|
+
await db.delete(tables_1.evalsToTags).where((0, drizzle_orm_1.eq)(tables_1.evalsToTags.evalId, evalId)).run();
|
|
654
|
+
await db.delete(tables_1.evalResultsTable).where((0, drizzle_orm_1.eq)(tables_1.evalResultsTable.evalId, evalId)).run();
|
|
655
|
+
await db.delete(tables_1.evalsToProviders).where((0, drizzle_orm_1.eq)(tables_1.evalsToProviders.evalId, evalId)).run();
|
|
802
656
|
// Finally, delete the eval record
|
|
803
657
|
const deletedIds = await db.delete(tables_1.evals).where((0, drizzle_orm_1.eq)(tables_1.evals.id, evalId)).run();
|
|
804
658
|
if (deletedIds.changes === 0) {
|
|
@@ -848,7 +702,7 @@ function setupEnv(envPath) {
|
|
|
848
702
|
}
|
|
849
703
|
}
|
|
850
704
|
const standaloneEvalCache = new node_cache_1.default({ stdTTL: 60 * 60 * 2 }); // Cache for 2 hours
|
|
851
|
-
function getStandaloneEvals({ limit = DEFAULT_QUERY_LIMIT, tag, description, } = {}) {
|
|
705
|
+
async function getStandaloneEvals({ limit = DEFAULT_QUERY_LIMIT, tag, description, } = {}) {
|
|
852
706
|
const cacheKey = `standalone_evals_${limit}_${tag?.key}_${tag?.value}`;
|
|
853
707
|
const cachedResult = standaloneEvalCache.get(cacheKey);
|
|
854
708
|
if (cachedResult) {
|
|
@@ -876,16 +730,26 @@ function getStandaloneEvals({ limit = DEFAULT_QUERY_LIMIT, tag, description, } =
|
|
|
876
730
|
.orderBy((0, drizzle_orm_1.desc)(tables_1.evals.createdAt))
|
|
877
731
|
.limit(limit)
|
|
878
732
|
.all();
|
|
879
|
-
const standaloneEvals = results.
|
|
880
|
-
const { description, createdAt, evalId, promptId, datasetId,
|
|
881
|
-
|
|
733
|
+
const standaloneEvals = (await Promise.all(results.map(async (result) => {
|
|
734
|
+
const { description, createdAt, evalId, promptId, datasetId,
|
|
735
|
+
// @ts-ignore
|
|
736
|
+
isRedteam, } = result;
|
|
737
|
+
const eval_ = await eval_1.default.findById(evalId);
|
|
738
|
+
(0, tiny_invariant_1.default)(eval_, `Eval with ID ${evalId} not found`);
|
|
739
|
+
const table = (await eval_.getTable()) || { body: [] };
|
|
740
|
+
// @ts-ignore
|
|
741
|
+
return eval_.getPrompts().map((col, index) => {
|
|
882
742
|
// Compute some stats
|
|
883
|
-
const pluginCounts = table.body.reduce(
|
|
743
|
+
const pluginCounts = table.body.reduce(
|
|
744
|
+
// @ts-ignore
|
|
745
|
+
(acc, row) => {
|
|
884
746
|
const pluginId = row.test.metadata?.pluginId;
|
|
885
747
|
if (pluginId) {
|
|
886
748
|
const isPass = row.outputs[index].pass;
|
|
887
|
-
acc.pluginPassCount[pluginId] =
|
|
888
|
-
|
|
749
|
+
acc.pluginPassCount[pluginId] =
|
|
750
|
+
(acc.pluginPassCount[pluginId] || 0) + (isPass ? 1 : 0);
|
|
751
|
+
acc.pluginFailCount[pluginId] =
|
|
752
|
+
(acc.pluginFailCount[pluginId] || 0) + (isPass ? 0 : 1);
|
|
889
753
|
}
|
|
890
754
|
return acc;
|
|
891
755
|
}, { pluginPassCount: {}, pluginFailCount: {} });
|
|
@@ -900,7 +764,7 @@ function getStandaloneEvals({ limit = DEFAULT_QUERY_LIMIT, tag, description, } =
|
|
|
900
764
|
...col,
|
|
901
765
|
};
|
|
902
766
|
});
|
|
903
|
-
});
|
|
767
|
+
}))).flat();
|
|
904
768
|
standaloneEvalCache.set(cacheKey, standaloneEvals);
|
|
905
769
|
return standaloneEvals;
|
|
906
770
|
}
|
|
@@ -975,7 +839,7 @@ function parsePathOrGlob(basePath, promptPath) {
|
|
|
975
839
|
let functionName;
|
|
976
840
|
if (filename.includes(':')) {
|
|
977
841
|
const splits = filename.split(':');
|
|
978
|
-
if (splits[0] && (isJavascriptFile(splits[0]) || splits[0].endsWith('.py'))) {
|
|
842
|
+
if (splits[0] && ((0, file_1.isJavascriptFile)(splits[0]) || splits[0].endsWith('.py'))) {
|
|
979
843
|
[filename, functionName] = splits;
|
|
980
844
|
}
|
|
981
845
|
}
|