promptfoo 0.91.2 → 0.92.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/drizzle/0006_harsh_caretaker.sql +42 -0
- package/dist/drizzle/0007_cloudy_wong.sql +1 -0
- package/dist/drizzle/meta/0006_snapshot.json +721 -0
- package/dist/drizzle/meta/0007_snapshot.json +723 -0
- package/dist/drizzle/meta/_journal.json +14 -0
- package/dist/package.json +10 -8
- package/dist/src/app/assets/{index-Bc-q9rGp.js → index-CMDD1oSm.js} +233 -231
- package/dist/src/app/assets/{index.es-b3UhzAjj.js → index.es-D8cSwMq4.js} +1 -1
- package/dist/src/app/assets/{sync-D-OjEwME.js → sync-DJZvzYiS.js} +1 -1
- package/dist/src/app/index.html +1 -1
- package/dist/src/assertions.js +2 -2
- package/dist/src/assertions.js.map +1 -1
- package/dist/src/commands/cache.d.ts.map +1 -1
- package/dist/src/commands/cache.js +0 -2
- package/dist/src/commands/cache.js.map +1 -1
- package/dist/src/commands/eval.d.ts.map +1 -1
- package/dist/src/commands/eval.js +19 -16
- package/dist/src/commands/eval.js.map +1 -1
- package/dist/src/commands/export.d.ts.map +1 -1
- package/dist/src/commands/export.js +8 -31
- package/dist/src/commands/export.js.map +1 -1
- package/dist/src/commands/import.d.ts.map +1 -1
- package/dist/src/commands/import.js +52 -13
- package/dist/src/commands/import.js.map +1 -1
- package/dist/src/commands/list.d.ts.map +1 -1
- package/dist/src/commands/list.js +35 -7
- package/dist/src/commands/list.js.map +1 -1
- package/dist/src/commands/share.d.ts +2 -2
- package/dist/src/commands/share.d.ts.map +1 -1
- package/dist/src/commands/share.js +12 -13
- package/dist/src/commands/share.js.map +1 -1
- package/dist/src/commands/show.d.ts.map +1 -1
- package/dist/src/commands/show.js +10 -6
- package/dist/src/commands/show.js.map +1 -1
- package/dist/src/constants.d.ts +1 -0
- package/dist/src/constants.d.ts.map +1 -1
- package/dist/src/constants.js +2 -1
- package/dist/src/constants.js.map +1 -1
- package/dist/src/database/index.js +1 -1
- package/dist/src/database/index.js.map +1 -1
- package/dist/src/database/tables.d.ts +602 -4
- package/dist/src/database/tables.d.ts.map +1 -1
- package/dist/src/database/tables.js +67 -8
- package/dist/src/database/tables.js.map +1 -1
- package/dist/src/database/types.d.ts +3 -3
- package/dist/src/database/types.d.ts.map +1 -1
- package/dist/src/evaluator.d.ts +3 -2
- package/dist/src/evaluator.d.ts.map +1 -1
- package/dist/src/evaluator.js +75 -104
- package/dist/src/evaluator.js.map +1 -1
- package/dist/src/evaluatorHelpers.d.ts.map +1 -1
- package/dist/src/evaluatorHelpers.js +2 -1
- package/dist/src/evaluatorHelpers.js.map +1 -1
- package/dist/src/index.d.ts +4 -1
- package/dist/src/index.d.ts.map +1 -1
- package/dist/src/index.js +12 -9
- package/dist/src/index.js.map +1 -1
- package/dist/src/models/eval.d.ts +95 -0
- package/dist/src/models/eval.d.ts.map +1 -0
- package/dist/src/models/eval.js +390 -0
- package/dist/src/models/eval.js.map +1 -0
- package/dist/src/models/evalResult.d.ts +50 -0
- package/dist/src/models/evalResult.d.ts.map +1 -0
- package/dist/src/models/evalResult.js +122 -0
- package/dist/src/models/evalResult.js.map +1 -0
- package/dist/src/models/provider.d.ts +9 -0
- package/dist/src/models/provider.d.ts.map +1 -0
- package/dist/src/models/provider.js +47 -0
- package/dist/src/models/provider.js.map +1 -0
- package/dist/src/prompts/index.d.ts.map +1 -1
- package/dist/src/prompts/index.js +2 -1
- package/dist/src/prompts/index.js.map +1 -1
- package/dist/src/prompts/utils.d.ts +1 -0
- package/dist/src/prompts/utils.d.ts.map +1 -1
- package/dist/src/prompts/utils.js +7 -0
- package/dist/src/prompts/utils.js.map +1 -1
- package/dist/src/providers/http.js +2 -2
- package/dist/src/providers/http.js.map +1 -1
- package/dist/src/providers.js +5 -5
- package/dist/src/providers.js.map +1 -1
- package/dist/src/redteam/constants.d.ts +1 -1
- package/dist/src/redteam/constants.d.ts.map +1 -1
- package/dist/src/redteam/constants.js +7 -5
- package/dist/src/redteam/constants.js.map +1 -1
- package/dist/src/redteam/eval/excessive-agency/llm_rubric-20240617.json +10 -0
- package/dist/src/redteam/eval/excessive-agency/llm_rubric-20240618.json +10 -0
- package/dist/src/redteam/eval/harmful/llm_rubric-20240723.json +10 -0
- package/dist/src/redteam/eval/harmful/llm_rubric-20240724.json +10 -0
- package/dist/src/redteam/graders.d.ts +2 -0
- package/dist/src/redteam/graders.d.ts.map +1 -1
- package/dist/src/redteam/graders.js +2 -0
- package/dist/src/redteam/graders.js.map +1 -1
- package/dist/src/redteam/plugins/index.d.ts.map +1 -1
- package/dist/src/redteam/plugins/index.js +1 -0
- package/dist/src/redteam/plugins/index.js.map +1 -1
- package/dist/src/redteam/plugins/religion.d.ts +6 -0
- package/dist/src/redteam/plugins/religion.d.ts.map +1 -0
- package/dist/src/redteam/plugins/religion.js +14 -0
- package/dist/src/redteam/plugins/religion.js.map +1 -0
- package/dist/src/server/routes/evalRoutes.d.ts +1 -0
- package/dist/src/server/routes/evalRoutes.d.ts.map +1 -0
- package/dist/src/server/routes/evalRoutes.js +2 -0
- package/dist/src/server/routes/evalRoutes.js.map +1 -0
- package/dist/src/server/server.d.ts +1 -0
- package/dist/src/server/server.d.ts.map +1 -1
- package/dist/src/server/server.js +70 -31
- package/dist/src/server/server.js.map +1 -1
- package/dist/src/share.d.ts +2 -2
- package/dist/src/share.d.ts.map +1 -1
- package/dist/src/share.js +93 -34
- package/dist/src/share.js.map +1 -1
- package/dist/src/table.d.ts +2 -2
- package/dist/src/table.d.ts.map +1 -1
- package/dist/src/table.js +3 -3
- package/dist/src/table.js.map +1 -1
- package/dist/src/types/index.d.ts +163 -11
- package/dist/src/types/index.d.ts.map +1 -1
- package/dist/src/types/index.js +21 -1
- package/dist/src/types/index.js.map +1 -1
- package/dist/src/util/config/load.d.ts.map +1 -1
- package/dist/src/util/config/load.js +2 -1
- package/dist/src/util/config/load.js.map +1 -1
- package/dist/src/util/config/manage.d.ts.map +1 -1
- package/dist/src/util/config/manage.js.map +1 -1
- package/dist/src/util/convertEvalResultsToTable.d.ts +16 -0
- package/dist/src/util/convertEvalResultsToTable.d.ts.map +1 -0
- package/dist/src/util/convertEvalResultsToTable.js +137 -0
- package/dist/src/util/convertEvalResultsToTable.js.map +1 -0
- package/dist/src/util/createHash.d.ts +1 -0
- package/dist/src/util/createHash.d.ts.map +1 -1
- package/dist/src/util/createHash.js +9 -0
- package/dist/src/util/createHash.js.map +1 -1
- package/dist/src/util/file.d.ts +8 -0
- package/dist/src/util/file.d.ts.map +1 -0
- package/dist/src/util/file.js +13 -0
- package/dist/src/util/file.js.map +1 -0
- package/dist/src/util/index.d.ts +9 -14
- package/dist/src/util/index.d.ts.map +1 -1
- package/dist/src/util/index.js +87 -223
- package/dist/src/util/index.js.map +1 -1
- package/dist/src/util/time.d.ts +2 -0
- package/dist/src/util/time.d.ts.map +1 -0
- package/dist/src/util/time.js +7 -0
- package/dist/src/util/time.js.map +1 -0
- package/dist/src/util/transform.js +2 -2
- package/dist/src/util/transform.js.map +1 -1
- package/dist/src/validators/providers.d.ts +6 -0
- package/dist/src/validators/providers.d.ts.map +1 -1
- package/dist/src/validators/providers.js +1 -0
- package/dist/src/validators/providers.js.map +1 -1
- package/dist/src/validators/redteam.d.ts +6 -0
- package/dist/src/validators/redteam.d.ts.map +1 -1
- package/dist/test/commands/eval/filterFailingTests.test.js +24 -2
- package/dist/test/commands/eval/filterFailingTests.test.js.map +1 -1
- package/dist/test/evaluator.test.js +153 -74
- package/dist/test/evaluator.test.js.map +1 -1
- package/dist/test/factories/data/eval/database_records.d.ts +142 -0
- package/dist/test/factories/data/eval/database_records.d.ts.map +1 -0
- package/dist/test/factories/data/eval/database_records.js +251 -0
- package/dist/test/factories/data/eval/database_records.js.map +1 -0
- package/dist/test/factories/evalFactory.d.ts +768 -0
- package/dist/test/factories/evalFactory.d.ts.map +1 -0
- package/dist/test/factories/evalFactory.js +121 -0
- package/dist/test/factories/evalFactory.js.map +1 -0
- package/dist/test/factories/index.d.ts +1 -0
- package/dist/test/factories/index.d.ts.map +1 -0
- package/dist/test/factories/index.js +2 -0
- package/dist/test/factories/index.js.map +1 -0
- package/dist/test/index.test.js +17 -33
- package/dist/test/index.test.js.map +1 -1
- package/dist/test/models/eval.test.d.ts +2 -0
- package/dist/test/models/eval.test.d.ts.map +1 -0
- package/dist/test/models/eval.test.js +34 -0
- package/dist/test/models/eval.test.js.map +1 -0
- package/dist/test/providers.test.js +3 -3
- package/dist/test/providers.test.js.map +1 -1
- package/dist/test/server/share.test.d.ts +2 -0
- package/dist/test/server/share.test.d.ts.map +1 -0
- package/dist/test/server/share.test.js +36 -0
- package/dist/test/server/share.test.js.map +1 -0
- package/dist/test/server/v3evalToShare.json +507 -0
- package/dist/test/server/v4evalToShare.json +421 -0
- package/dist/test/types.test.js +58 -0
- package/dist/test/types.test.js.map +1 -1
- package/dist/test/util.file.test.d.ts +2 -0
- package/dist/test/util.file.test.d.ts.map +1 -0
- package/dist/test/util.file.test.js +32 -0
- package/dist/test/util.file.test.js.map +1 -0
- package/dist/test/util.listPrevious.test.d.ts +2 -0
- package/dist/test/util.listPrevious.test.d.ts.map +1 -0
- package/dist/test/util.listPrevious.test.js +37 -0
- package/dist/test/util.listPrevious.test.js.map +1 -0
- package/dist/test/util.test.js +38 -311
- package/dist/test/util.test.js.map +1 -1
- package/dist/tsconfig.tsbuildinfo +1 -0
- package/package.json +10 -8
|
@@ -1,14 +1,20 @@
|
|
|
1
1
|
"use strict";
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.evalsToDatasetsRelations = exports.evalsToPromptsRelations = exports.evalsRelations = exports.datasetsRelations = exports.evalsToDatasets = exports.datasets = exports.evalsToTagsRelations = exports.tagsRelations = exports.evalsToTags = exports.promptsRelations = exports.evalsToPrompts = exports.evals = exports.tags = exports.prompts = void 0;
|
|
3
|
+
exports.evalsToDatasetsRelations = exports.evalsToPromptsRelations = exports.evalsRelations = exports.datasetsRelations = exports.evalsToDatasets = exports.datasets = exports.evalsToProvidersRelations = exports.evalsToProviders = exports.evalsToTagsRelations = exports.tagsRelations = exports.evalsToTags = exports.promptsRelations = exports.evalsToPrompts = exports.evalResultsTable = exports.evals = exports.tags = exports.prompts = exports.providers = void 0;
|
|
4
4
|
const drizzle_orm_1 = require("drizzle-orm");
|
|
5
5
|
const sqlite_core_1 = require("drizzle-orm/sqlite-core");
|
|
6
|
+
// ------------ Providers ------------
|
|
7
|
+
exports.providers = (0, sqlite_core_1.sqliteTable)('providers', {
|
|
8
|
+
id: (0, sqlite_core_1.text)('id').primaryKey(),
|
|
9
|
+
providerId: (0, sqlite_core_1.text)('provider_id').notNull(),
|
|
10
|
+
config: (0, sqlite_core_1.text)('options', { mode: 'json' }).$type().notNull(),
|
|
11
|
+
});
|
|
6
12
|
// ------------ Prompts ------------
|
|
7
13
|
exports.prompts = (0, sqlite_core_1.sqliteTable)('prompts', {
|
|
8
14
|
id: (0, sqlite_core_1.text)('id').primaryKey(),
|
|
9
15
|
createdAt: (0, sqlite_core_1.integer)('created_at')
|
|
10
16
|
.notNull()
|
|
11
|
-
.default((0, drizzle_orm_1.sql) `
|
|
17
|
+
.default((0, drizzle_orm_1.sql) `cast(unixepoch() as int)`),
|
|
12
18
|
prompt: (0, sqlite_core_1.text)('prompt').notNull(),
|
|
13
19
|
}, (table) => ({
|
|
14
20
|
createdAtIdx: (0, sqlite_core_1.index)('prompts_created_at_idx').on(table.createdAt),
|
|
@@ -27,21 +33,54 @@ exports.evals = (0, sqlite_core_1.sqliteTable)('evals', {
|
|
|
27
33
|
id: (0, sqlite_core_1.text)('id').primaryKey(),
|
|
28
34
|
createdAt: (0, sqlite_core_1.integer)('created_at')
|
|
29
35
|
.notNull()
|
|
30
|
-
.default((0, drizzle_orm_1.sql) `
|
|
36
|
+
.default((0, drizzle_orm_1.sql) `cast(unixepoch() as int)`),
|
|
31
37
|
author: (0, sqlite_core_1.text)('author'),
|
|
32
38
|
description: (0, sqlite_core_1.text)('description'),
|
|
33
39
|
results: (0, sqlite_core_1.text)('results', { mode: 'json' }).$type().notNull(),
|
|
34
40
|
config: (0, sqlite_core_1.text)('config', { mode: 'json' }).$type().notNull(),
|
|
41
|
+
prompts: (0, sqlite_core_1.text)('prompts', { mode: 'json' }).$type(),
|
|
35
42
|
}, (table) => ({
|
|
36
43
|
createdAtIdx: (0, sqlite_core_1.index)('evals_created_at_idx').on(table.createdAt),
|
|
37
44
|
authorIdx: (0, sqlite_core_1.index)('evals_author_idx').on(table.author),
|
|
38
45
|
}));
|
|
39
|
-
exports.
|
|
46
|
+
exports.evalResultsTable = (0, sqlite_core_1.sqliteTable)('eval_results', {
|
|
47
|
+
id: (0, sqlite_core_1.text)('id').primaryKey(),
|
|
48
|
+
createdAt: (0, sqlite_core_1.integer)('created_at')
|
|
49
|
+
.notNull()
|
|
50
|
+
.default((0, drizzle_orm_1.sql) `cast(unixepoch() as int)`),
|
|
51
|
+
updatedAt: (0, sqlite_core_1.integer)('updated_at')
|
|
52
|
+
.notNull()
|
|
53
|
+
.default((0, drizzle_orm_1.sql) `cast(unixepoch() as int)`),
|
|
40
54
|
evalId: (0, sqlite_core_1.text)('eval_id')
|
|
41
55
|
.notNull()
|
|
42
56
|
.references(() => exports.evals.id),
|
|
43
|
-
|
|
44
|
-
|
|
57
|
+
promptIdx: (0, sqlite_core_1.integer)('prompt_idx').notNull(),
|
|
58
|
+
testIdx: (0, sqlite_core_1.integer)('test_idx').notNull(),
|
|
59
|
+
testCase: (0, sqlite_core_1.text)('test_case', { mode: 'json' }).$type().notNull(),
|
|
60
|
+
prompt: (0, sqlite_core_1.text)('prompt', { mode: 'json' }).$type().notNull(),
|
|
61
|
+
promptId: (0, sqlite_core_1.text)('prompt_id').references(() => exports.prompts.id),
|
|
62
|
+
// Provider-related fields
|
|
63
|
+
provider: (0, sqlite_core_1.text)('provider', { mode: 'json' }).$type().notNull(),
|
|
64
|
+
providerId: (0, sqlite_core_1.text)('provider_id').references(() => exports.providers.id),
|
|
65
|
+
latencyMs: (0, sqlite_core_1.integer)('latency_ms'),
|
|
66
|
+
cost: (0, sqlite_core_1.real)('cost'),
|
|
67
|
+
// Output-related fields
|
|
68
|
+
response: (0, sqlite_core_1.text)('response', { mode: 'json' }).$type(),
|
|
69
|
+
error: (0, sqlite_core_1.text)('error'),
|
|
70
|
+
// Result-related fields
|
|
71
|
+
success: (0, sqlite_core_1.integer)('success', { mode: 'boolean' }).notNull(),
|
|
72
|
+
score: (0, sqlite_core_1.real)('score').notNull(),
|
|
73
|
+
gradingResult: (0, sqlite_core_1.text)('grading_result', { mode: 'json' }).$type(),
|
|
74
|
+
namedScores: (0, sqlite_core_1.text)('named_scores', { mode: 'json' }).$type(),
|
|
75
|
+
// Metadata fields
|
|
76
|
+
metadata: (0, sqlite_core_1.text)('metadata', { mode: 'json' }).$type(),
|
|
77
|
+
}, (table) => ({
|
|
78
|
+
evalIdIdx: (0, sqlite_core_1.index)('eval_result_eval_id_idx').on(table.evalId),
|
|
79
|
+
}));
|
|
80
|
+
exports.evalsToPrompts = (0, sqlite_core_1.sqliteTable)('evals_to_prompts', {
|
|
81
|
+
evalId: (0, sqlite_core_1.text)('eval_id')
|
|
82
|
+
.notNull()
|
|
83
|
+
.references(() => exports.evals.id, { onDelete: 'cascade' }),
|
|
45
84
|
promptId: (0, sqlite_core_1.text)('prompt_id')
|
|
46
85
|
.notNull()
|
|
47
86
|
.references(() => exports.prompts.id),
|
|
@@ -78,13 +117,33 @@ exports.evalsToTagsRelations = (0, drizzle_orm_1.relations)(exports.evalsToTags,
|
|
|
78
117
|
references: [exports.tags.id],
|
|
79
118
|
}),
|
|
80
119
|
}));
|
|
120
|
+
exports.evalsToProviders = (0, sqlite_core_1.sqliteTable)('evals_to_providers', {
|
|
121
|
+
providerId: (0, sqlite_core_1.text)('provider_id')
|
|
122
|
+
.notNull()
|
|
123
|
+
.references(() => exports.providers.id),
|
|
124
|
+
evalId: (0, sqlite_core_1.text)('eval_id')
|
|
125
|
+
.notNull()
|
|
126
|
+
.references(() => exports.evals.id),
|
|
127
|
+
}, (t) => ({
|
|
128
|
+
pk: (0, sqlite_core_1.primaryKey)({ columns: [t.providerId, t.evalId] }),
|
|
129
|
+
}));
|
|
130
|
+
exports.evalsToProvidersRelations = (0, drizzle_orm_1.relations)(exports.evalsToProviders, ({ one }) => ({
|
|
131
|
+
provider: one(exports.providers, {
|
|
132
|
+
fields: [exports.evalsToProviders.providerId],
|
|
133
|
+
references: [exports.providers.id],
|
|
134
|
+
}),
|
|
135
|
+
eval: one(exports.evals, {
|
|
136
|
+
fields: [exports.evalsToProviders.evalId],
|
|
137
|
+
references: [exports.evals.id],
|
|
138
|
+
}),
|
|
139
|
+
}));
|
|
81
140
|
// ------------ Datasets ------------
|
|
82
141
|
exports.datasets = (0, sqlite_core_1.sqliteTable)('datasets', {
|
|
83
142
|
id: (0, sqlite_core_1.text)('id').primaryKey(),
|
|
84
143
|
tests: (0, sqlite_core_1.text)('tests', { mode: 'json' }).$type(),
|
|
85
144
|
createdAt: (0, sqlite_core_1.integer)('created_at')
|
|
86
145
|
.notNull()
|
|
87
|
-
.default((0, drizzle_orm_1.sql) `
|
|
146
|
+
.default((0, drizzle_orm_1.sql) `cast(unixepoch() as int)`),
|
|
88
147
|
}, (table) => ({
|
|
89
148
|
createdAtIdx: (0, sqlite_core_1.index)('datasets_created_at_idx').on(table.createdAt),
|
|
90
149
|
}));
|
|
@@ -140,7 +199,7 @@ export const llmOutputs = sqliteTable(
|
|
|
140
199
|
id: text('id')
|
|
141
200
|
.notNull()
|
|
142
201
|
.unique(),
|
|
143
|
-
createdAt: integer('created_at').notNull().default(sql`
|
|
202
|
+
createdAt: integer('created_at').notNull().default(sql`cast(unixepoch() as int)`),
|
|
144
203
|
evalId: text('eval_id')
|
|
145
204
|
.notNull()
|
|
146
205
|
.references(() => evals.id),
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"tables.js","sourceRoot":"","sources":["../../../src/database/tables.ts"],"names":[],"mappings":";;;AAAA,6CAA6C;AAC7C,
|
|
1
|
+
{"version":3,"file":"tables.js","sourceRoot":"","sources":["../../../src/database/tables.ts"],"names":[],"mappings":";;;AAAA,6CAA6C;AAC7C,yDAQiC;AAYjC,sCAAsC;AAEzB,QAAA,SAAS,GAAG,IAAA,yBAAW,EAAC,WAAW,EAAE;IAChD,EAAE,EAAE,IAAA,kBAAI,EAAC,IAAI,CAAC,CAAC,UAAU,EAAE;IAC3B,UAAU,EAAE,IAAA,kBAAI,EAAC,aAAa,CAAC,CAAC,OAAO,EAAE;IACzC,MAAM,EAAE,IAAA,kBAAI,EAAC,SAAS,EAAE,EAAE,IAAI,EAAE,MAAM,EAAE,CAAC,CAAC,KAAK,EAAuB,CAAC,OAAO,EAAE;CACjF,CAAC,CAAC;AAEH,oCAAoC;AAEvB,QAAA,OAAO,GAAG,IAAA,yBAAW,EAChC,SAAS,EACT;IACE,EAAE,EAAE,IAAA,kBAAI,EAAC,IAAI,CAAC,CAAC,UAAU,EAAE;IAC3B,SAAS,EAAE,IAAA,qBAAO,EAAC,YAAY,CAAC;SAC7B,OAAO,EAAE;SACT,OAAO,CAAC,IAAA,iBAAG,EAAA,0BAA0B,CAAC;IACzC,MAAM,EAAE,IAAA,kBAAI,EAAC,QAAQ,CAAC,CAAC,OAAO,EAAE;CACjC,EACD,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;IACV,YAAY,EAAE,IAAA,mBAAK,EAAC,wBAAwB,CAAC,CAAC,EAAE,CAAC,KAAK,CAAC,SAAS,CAAC;CAClE,CAAC,CACH,CAAC;AAEF,iCAAiC;AAEpB,QAAA,IAAI,GAAG,IAAA,yBAAW,EAC7B,MAAM,EACN;IACE,EAAE,EAAE,IAAA,kBAAI,EAAC,IAAI,CAAC,CAAC,UAAU,EAAE;IAC3B,IAAI,EAAE,IAAA,kBAAI,EAAC,MAAM,CAAC,CAAC,OAAO,EAAE;IAC5B,KAAK,EAAE,IAAA,kBAAI,EAAC,OAAO,CAAC,CAAC,OAAO,EAAE;CAC/B,EACD,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;IACV,OAAO,EAAE,IAAA,mBAAK,EAAC,eAAe,CAAC,CAAC,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC;IAC9C,eAAe,EAAE,IAAA,yBAAW,EAAC,wBAAwB,CAAC,CAAC,EAAE,CAAC,KAAK,CAAC,IAAI,EAAE,KAAK,CAAC,KAAK,CAAC;CACnF,CAAC,CACH,CAAC;AAEF,kCAAkC;AAErB,QAAA,KAAK,GAAG,IAAA,yBAAW,EAC9B,OAAO,EACP;IACE,EAAE,EAAE,IAAA,kBAAI,EAAC,IAAI,CAAC,CAAC,UAAU,EAAE;IAC3B,SAAS,EAAE,IAAA,qBAAO,EAAC,YAAY,CAAC;SAC7B,OAAO,EAAE;SACT,OAAO,CAAC,IAAA,iBAAG,EAAA,0BAA0B,CAAC;IACzC,MAAM,EAAE,IAAA,kBAAI,EAAC,QAAQ,CAAC;IACtB,WAAW,EAAE,IAAA,kBAAI,EAAC,aAAa,CAAC;IAChC,OAAO,EAAE,IAAA,kBAAI,EAAC,SAAS,EAAE,EAAE,IAAI,EAAE,MAAM,EAAE,CAAC,CAAC,KAAK,EAA8B,CAAC,OAAO,EAAE;IACxF,MAAM,EAAE,IAAA,kBAAI,EAAC,QAAQ,EAAE,EAAE,IAAI,EAAE,MAAM,EAAE,CAAC,CAAC,KAAK,EAA0B,CAAC,OAAO,EAAE;IAClF,OAAO,EAAE,IAAA,kBAAI,EAAC,SAAS,EAAE,EAAE,IAAI,EAAE,MAAM,EAAE,CAAC,CAAC,KAAK,EAAqB;CACtE,EACD,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;IACV,YAAY,EAAE,IAAA,mBAAK,EAAC,sBAAsB,CAAC,CAAC,EAAE,CAAC,KAAK,CAAC,SAAS,CAAC;IAC/D,SAAS,EAAE,IAAA,mBAAK,EAAC,kBAAkB,CAAC,CAAC,EAAE,CAAC,KAAK,CAAC,MAAM,CAAC;CACtD,CAAC,CACH,CAAC;AAEW,QAAA,gBAAgB,GAAG,IAAA,yBAAW,EACzC,cAAc,EACd;IACE,EAAE,EAAE,IAAA,kBAAI,EAAC,IAAI,CAAC,CAAC,UAAU,EAAE;IAC3B,SAAS,EAAE,IAAA,qBAAO,EAAC,YAAY,CAAC;SAC7B,OAAO,EAAE;SACT,OAAO,CAAC,IAAA,iBAAG,EAAA,0BAA0B,CAAC;IACzC,SAAS,EAAE,IAAA,qBAAO,EAAC,YAAY,CAAC;SAC7B,OAAO,EAAE;SACT,OAAO,CAAC,IAAA,iBAAG,EAAA,0BAA0B,CAAC;IACzC,MAAM,EAAE,IAAA,kBAAI,EAAC,SAAS,CAAC;SACpB,OAAO,EAAE;SACT,UAAU,CAAC,GAAG,EAAE,CAAC,aAAK,CAAC,EAAE,CAAC;IAC7B,SAAS,EAAE,IAAA,qBAAO,EAAC,YAAY,CAAC,CAAC,OAAO,EAAE;IAC1C,OAAO,EAAE,IAAA,qBAAO,EAAC,UAAU,CAAC,CAAC,OAAO,EAAE;IAEtC,QAAQ,EAAE,IAAA,kBAAI,EAAC,WAAW,EAAE,EAAE,IAAI,EAAE,MAAM,EAAE,CAAC,CAAC,KAAK,EAAkB,CAAC,OAAO,EAAE;IAC/E,MAAM,EAAE,IAAA,kBAAI,EAAC,QAAQ,EAAE,EAAE,IAAI,EAAE,MAAM,EAAE,CAAC,CAAC,KAAK,EAAU,CAAC,OAAO,EAAE;IAClE,QAAQ,EAAE,IAAA,kBAAI,EAAC,WAAW,CAAC,CAAC,UAAU,CAAC,GAAG,EAAE,CAAC,eAAO,CAAC,EAAE,CAAC;IAExD,0BAA0B;IAC1B,QAAQ,EAAE,IAAA,kBAAI,EAAC,UAAU,EAAE,EAAE,IAAI,EAAE,MAAM,EAAE,CAAC,CAAC,KAAK,EAAmB,CAAC,OAAO,EAAE;IAC/E,UAAU,EAAE,IAAA,kBAAI,EAAC,aAAa,CAAC,CAAC,UAAU,CAAC,GAAG,EAAE,CAAC,iBAAS,CAAC,EAAE,CAAC;IAE9D,SAAS,EAAE,IAAA,qBAAO,EAAC,YAAY,CAAC;IAChC,IAAI,EAAE,IAAA,kBAAI,EAAC,MAAM,CAAC;IAElB,wBAAwB;IACxB,QAAQ,EAAE,IAAA,kBAAI,EAAC,UAAU,EAAE,EAAE,IAAI,EAAE,MAAM,EAAE,CAAC,CAAC,KAAK,EAAoB;IACtE,KAAK,EAAE,IAAA,kBAAI,EAAC,OAAO,CAAC;IAEpB,wBAAwB;IACxB,OAAO,EAAE,IAAA,qBAAO,EAAC,SAAS,EAAE,EAAE,IAAI,EAAE,SAAS,EAAE,CAAC,CAAC,OAAO,EAAE;IAC1D,KAAK,EAAE,IAAA,kBAAI,EAAC,OAAO,CAAC,CAAC,OAAO,EAAE;IAC9B,aAAa,EAAE,IAAA,kBAAI,EAAC,gBAAgB,EAAE,EAAE,IAAI,EAAE,MAAM,EAAE,CAAC,CAAC,KAAK,EAAiB;IAC9E,WAAW,EAAE,IAAA,kBAAI,EAAC,cAAc,EAAE,EAAE,IAAI,EAAE,MAAM,EAAE,CAAC,CAAC,KAAK,EAA0B;IAEnF,kBAAkB;IAClB,QAAQ,EAAE,IAAA,kBAAI,EAAC,UAAU,EAAE,EAAE,IAAI,EAAE,MAAM,EAAE,CAAC,CAAC,KAAK,EAA0B;CAC7E,EACD,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;IACV,SAAS,EAAE,IAAA,mBAAK,EAAC,yBAAyB,CAAC,CAAC,EAAE,CAAC,KAAK,CAAC,MAAM,CAAC;CAC7D,CAAC,CACH,CAAC;AAEW,QAAA,cAAc,GAAG,IAAA,yBAAW,EACvC,kBAAkB,EAClB;IACE,MAAM,EAAE,IAAA,kBAAI,EAAC,SAAS,CAAC;SACpB,OAAO,EAAE;SACT,UAAU,CAAC,GAAG,EAAE,CAAC,aAAK,CAAC,EAAE,EAAE,EAAE,QAAQ,EAAE,SAAS,EAAE,CAAC;IACtD,QAAQ,EAAE,IAAA,kBAAI,EAAC,WAAW,CAAC;SACxB,OAAO,EAAE;SACT,UAAU,CAAC,GAAG,EAAE,CAAC,eAAO,CAAC,EAAE,CAAC;CAChC,EACD,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;IACN,EAAE,EAAE,IAAA,wBAAU,EAAC,EAAE,OAAO,EAAE,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,QAAQ,CAAC,EAAE,CAAC;IACnD,SAAS,EAAE,IAAA,mBAAK,EAAC,8BAA8B,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC;IAC7D,WAAW,EAAE,IAAA,mBAAK,EAAC,gCAAgC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC;CACpE,CAAC,CACH,CAAC;AAEW,QAAA,gBAAgB,GAAG,IAAA,uBAAS,EAAC,eAAO,EAAE,CAAC,EAAE,IAAI,EAAE,EAAE,EAAE,CAAC,CAAC;IAChE,cAAc,EAAE,IAAI,CAAC,sBAAc,CAAC;CACrC,CAAC,CAAC,CAAC;AAES,QAAA,WAAW,GAAG,IAAA,yBAAW,EACpC,eAAe,EACf;IACE,MAAM,EAAE,IAAA,kBAAI,EAAC,SAAS,CAAC;SACpB,OAAO,EAAE;SACT,UAAU,CAAC,GAAG,EAAE,CAAC,aAAK,CAAC,EAAE,CAAC;IAC7B,KAAK,EAAE,IAAA,kBAAI,EAAC,QAAQ,CAAC;SAClB,OAAO,EAAE;SACT,UAAU,CAAC,GAAG,EAAE,CAAC,YAAI,CAAC,EAAE,CAAC;CAC7B,EACD,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;IACN,EAAE,EAAE,IAAA,wBAAU,EAAC,EAAE,OAAO,EAAE,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,KAAK,CAAC,EAAE,CAAC;IAChD,SAAS,EAAE,IAAA,mBAAK,EAAC,2BAA2B,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC;IAC1D,QAAQ,EAAE,IAAA,mBAAK,EAAC,0BAA0B,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC;CACxD,CAAC,CACH,CAAC;AAEW,QAAA,aAAa,GAAG,IAAA,uBAAS,EAAC,YAAI,EAAE,CAAC,EAAE,IAAI,EAAE,EAAE,EAAE,CAAC,CAAC;IAC1D,WAAW,EAAE,IAAI,CAAC,mBAAW,CAAC;CAC/B,CAAC,CAAC,CAAC;AAES,QAAA,oBAAoB,GAAG,IAAA,uBAAS,EAAC,mBAAW,EAAE,CAAC,EAAE,GAAG,EAAE,EAAE,EAAE,CAAC,CAAC;IACvE,IAAI,EAAE,GAAG,CAAC,aAAK,EAAE;QACf,MAAM,EAAE,CAAC,mBAAW,CAAC,MAAM,CAAC;QAC5B,UAAU,EAAE,CAAC,aAAK,CAAC,EAAE,CAAC;KACvB,CAAC;IACF,GAAG,EAAE,GAAG,CAAC,YAAI,EAAE;QACb,MAAM,EAAE,CAAC,mBAAW,CAAC,KAAK,CAAC;QAC3B,UAAU,EAAE,CAAC,YAAI,CAAC,EAAE,CAAC;KACtB,CAAC;CACH,CAAC,CAAC,CAAC;AAES,QAAA,gBAAgB,GAAG,IAAA,yBAAW,EACzC,oBAAoB,EACpB;IACE,UAAU,EAAE,IAAA,kBAAI,EAAC,aAAa,CAAC;SAC5B,OAAO,EAAE;SACT,UAAU,CAAC,GAAG,EAAE,CAAC,iBAAS,CAAC,EAAE,CAAC;IACjC,MAAM,EAAE,IAAA,kBAAI,EAAC,SAAS,CAAC;SACpB,OAAO,EAAE;SACT,UAAU,CAAC,GAAG,EAAE,CAAC,aAAK,CAAC,EAAE,CAAC;CAC9B,EACD,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;IACN,EAAE,EAAE,IAAA,wBAAU,EAAC,EAAE,OAAO,EAAE,CAAC,CAAC,CAAC,UAAU,EAAE,CAAC,CAAC,MAAM,CAAC,EAAE,CAAC;CACtD,CAAC,CACH,CAAC;AAEW,QAAA,yBAAyB,GAAG,IAAA,uBAAS,EAAC,wBAAgB,EAAE,CAAC,EAAE,GAAG,EAAE,EAAE,EAAE,CAAC,CAAC;IACjF,QAAQ,EAAE,GAAG,CAAC,iBAAS,EAAE;QACvB,MAAM,EAAE,CAAC,wBAAgB,CAAC,UAAU,CAAC;QACrC,UAAU,EAAE,CAAC,iBAAS,CAAC,EAAE,CAAC;KAC3B,CAAC;IACF,IAAI,EAAE,GAAG,CAAC,aAAK,EAAE;QACf,MAAM,EAAE,CAAC,wBAAgB,CAAC,MAAM,CAAC;QACjC,UAAU,EAAE,CAAC,aAAK,CAAC,EAAE,CAAC;KACvB,CAAC;CACH,CAAC,CAAC,CAAC;AAEJ,qCAAqC;AAExB,QAAA,QAAQ,GAAG,IAAA,yBAAW,EACjC,UAAU,EACV;IACE,EAAE,EAAE,IAAA,kBAAI,EAAC,IAAI,CAAC,CAAC,UAAU,EAAE;IAC3B,KAAK,EAAE,IAAA,kBAAI,EAAC,OAAO,EAAE,EAAE,IAAI,EAAE,MAAM,EAAE,CAAC,CAAC,KAAK,EAA0B;IACtE,SAAS,EAAE,IAAA,qBAAO,EAAC,YAAY,CAAC;SAC7B,OAAO,EAAE;SACT,OAAO,CAAC,IAAA,iBAAG,EAAA,0BAA0B,CAAC;CAC1C,EACD,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;IACV,YAAY,EAAE,IAAA,mBAAK,EAAC,yBAAyB,CAAC,CAAC,EAAE,CAAC,KAAK,CAAC,SAAS,CAAC;CACnE,CAAC,CACH,CAAC;AAEW,QAAA,eAAe,GAAG,IAAA,yBAAW,EACxC,mBAAmB,EACnB;IACE,MAAM,EAAE,IAAA,kBAAI,EAAC,SAAS,CAAC;SACpB,OAAO,EAAE;SACT,UAAU,CAAC,GAAG,EAAE,CAAC,aAAK,CAAC,EAAE,CAAC;IAC7B,yFAAyF;IACzF,uDAAuD;IACvD,SAAS,EAAE,IAAA,kBAAI,EAAC,YAAY,CAAC;SAC1B,OAAO,EAAE;SACT,UAAU,CAAC,GAAG,EAAE,CAAC,gBAAQ,CAAC,EAAE,CAAC;CACjC,EACD,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;IACN,EAAE,EAAE,IAAA,wBAAU,EAAC,EAAE,OAAO,EAAE,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,SAAS,CAAC,EAAE,CAAC;IACpD,SAAS,EAAE,IAAA,mBAAK,EAAC,+BAA+B,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC;IAC9D,YAAY,EAAE,IAAA,mBAAK,EAAC,kCAAkC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC;CACxE,CAAC,CACH,CAAC;AAEW,QAAA,iBAAiB,GAAG,IAAA,uBAAS,EAAC,gBAAQ,EAAE,CAAC,EAAE,IAAI,EAAE,EAAE,EAAE,CAAC,CAAC;IAClE,eAAe,EAAE,IAAI,CAAC,uBAAe,CAAC;CACvC,CAAC,CAAC,CAAC;AAEJ,kCAAkC;AAErB,QAAA,cAAc,GAAG,IAAA,uBAAS,EAAC,aAAK,EAAE,CAAC,EAAE,IAAI,EAAE,EAAE,EAAE,CAAC,CAAC;IAC5D,cAAc,EAAE,IAAI,CAAC,sBAAc,CAAC;IACpC,eAAe,EAAE,IAAI,CAAC,uBAAe,CAAC;IACtC,WAAW,EAAE,IAAI,CAAC,mBAAW,CAAC;CAC/B,CAAC,CAAC,CAAC;AAES,QAAA,uBAAuB,GAAG,IAAA,uBAAS,EAAC,sBAAc,EAAE,CAAC,EAAE,GAAG,EAAE,EAAE,EAAE,CAAC,CAAC;IAC7E,IAAI,EAAE,GAAG,CAAC,aAAK,EAAE;QACf,MAAM,EAAE,CAAC,sBAAc,CAAC,MAAM,CAAC;QAC/B,UAAU,EAAE,CAAC,aAAK,CAAC,EAAE,CAAC;KACvB,CAAC;IACF,MAAM,EAAE,GAAG,CAAC,eAAO,EAAE;QACnB,MAAM,EAAE,CAAC,sBAAc,CAAC,QAAQ,CAAC;QACjC,UAAU,EAAE,CAAC,eAAO,CAAC,EAAE,CAAC;KACzB,CAAC;CACH,CAAC,CAAC,CAAC;AAES,QAAA,wBAAwB,GAAG,IAAA,uBAAS,EAAC,uBAAe,EAAE,CAAC,EAAE,GAAG,EAAE,EAAE,EAAE,CAAC,CAAC;IAC/E,IAAI,EAAE,GAAG,CAAC,aAAK,EAAE;QACf,MAAM,EAAE,CAAC,uBAAe,CAAC,MAAM,CAAC;QAChC,UAAU,EAAE,CAAC,aAAK,CAAC,EAAE,CAAC;KACvB,CAAC;IACF,OAAO,EAAE,GAAG,CAAC,gBAAQ,EAAE;QACrB,MAAM,EAAE,CAAC,uBAAe,CAAC,SAAS,CAAC;QACnC,UAAU,EAAE,CAAC,gBAAQ,CAAC,EAAE,CAAC;KAC1B,CAAC;CACH,CAAC,CAAC,CAAC;AAEJ,oCAAoC;AACpC,wDAAwD;AAExD;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAsCE"}
|
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
import type {
|
|
1
|
+
import type { EvaluateSummaryV3, UnifiedConfig } from '../types';
|
|
2
2
|
export interface ResultsFile {
|
|
3
3
|
version: number;
|
|
4
4
|
createdAt: string;
|
|
5
5
|
author: string | null;
|
|
6
|
-
results:
|
|
6
|
+
results: EvaluateSummaryV3;
|
|
7
7
|
config: Partial<UnifiedConfig>;
|
|
8
8
|
datasetId?: string;
|
|
9
9
|
}
|
|
@@ -11,7 +11,7 @@ export interface EvalWithMetadata {
|
|
|
11
11
|
id: string;
|
|
12
12
|
date: Date;
|
|
13
13
|
config: Partial<UnifiedConfig>;
|
|
14
|
-
results:
|
|
14
|
+
results: EvaluateSummaryV3;
|
|
15
15
|
description?: string;
|
|
16
16
|
}
|
|
17
17
|
//# sourceMappingURL=types.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../../src/database/types.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../../src/database/types.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,iBAAiB,EAAE,aAAa,EAAE,MAAM,UAAU,CAAC;AAEjE,MAAM,WAAW,WAAW;IAC1B,OAAO,EAAE,MAAM,CAAC;IAChB,SAAS,EAAE,MAAM,CAAC;IAClB,MAAM,EAAE,MAAM,GAAG,IAAI,CAAC;IACtB,OAAO,EAAE,iBAAiB,CAAC;IAC3B,MAAM,EAAE,OAAO,CAAC,aAAa,CAAC,CAAC;IAC/B,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED,MAAM,WAAW,gBAAgB;IAC/B,EAAE,EAAE,MAAM,CAAC;IACX,IAAI,EAAE,IAAI,CAAC;IACX,MAAM,EAAE,OAAO,CAAC,aAAa,CAAC,CAAC;IAC/B,OAAO,EAAE,iBAAiB,CAAC;IAC3B,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB"}
|
package/dist/src/evaluator.d.ts
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
import type
|
|
1
|
+
import type Eval from './models/eval';
|
|
2
|
+
import type { EvaluateOptions, Prompt, TestSuite } from './types';
|
|
2
3
|
export declare const DEFAULT_MAX_CONCURRENCY = 4;
|
|
3
4
|
/**
|
|
4
5
|
* Validates if a given prompt is allowed based on the provided list of allowed
|
|
@@ -19,5 +20,5 @@ export declare const DEFAULT_MAX_CONCURRENCY = 4;
|
|
|
19
20
|
*/
|
|
20
21
|
export declare function isAllowedPrompt(prompt: Prompt, allowedPrompts: string[] | undefined): boolean;
|
|
21
22
|
export declare function generateVarCombinations(vars: Record<string, string | string[] | any>): Record<string, string | any[]>[];
|
|
22
|
-
export declare function evaluate(testSuite: TestSuite, options: EvaluateOptions): Promise<
|
|
23
|
+
export declare function evaluate(testSuite: TestSuite, evalRecord: Eval, options: EvaluateOptions): Promise<Eval>;
|
|
23
24
|
//# sourceMappingURL=evaluator.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"evaluator.d.ts","sourceRoot":"","sources":["../../src/evaluator.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"evaluator.d.ts","sourceRoot":"","sources":["../../src/evaluator.ts"],"names":[],"mappings":"AAaA,OAAO,KAAK,IAAI,MAAM,eAAe,CAAC;AAMtC,OAAO,KAAK,EAIV,eAAe,EAGf,MAAM,EAGN,SAAS,EACV,MAAM,SAAS,CAAC;AAGjB,eAAO,MAAM,uBAAuB,IAAI,CAAC;AAEzC;;;;;;;;;;;;;;;;GAgBG;AACH,wBAAgB,eAAe,CAAC,MAAM,EAAE,MAAM,EAAE,cAAc,EAAE,MAAM,EAAE,GAAG,SAAS,GAAG,OAAO,CAM7F;AAED,wBAAgB,uBAAuB,CACrC,IAAI,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,EAAE,GAAG,GAAG,CAAC,GAC5C,MAAM,CAAC,MAAM,EAAE,MAAM,GAAG,GAAG,EAAE,CAAC,EAAE,CAqClC;AAixBD,wBAAgB,QAAQ,CAAC,SAAS,EAAE,SAAS,EAAE,UAAU,EAAE,IAAI,EAAE,OAAO,EAAE,eAAe,iBAGxF"}
|
package/dist/src/evaluator.js
CHANGED
|
@@ -43,6 +43,7 @@ const envars_1 = require("./envars");
|
|
|
43
43
|
const evaluatorHelpers_1 = require("./evaluatorHelpers");
|
|
44
44
|
const logger_1 = __importDefault(require("./logger"));
|
|
45
45
|
const prompt_1 = require("./models/prompt");
|
|
46
|
+
const provider_1 = __importDefault(require("./models/provider"));
|
|
46
47
|
const azureopenaiUtil_1 = require("./providers/azureopenaiUtil");
|
|
47
48
|
const suggestions_1 = require("./suggestions");
|
|
48
49
|
const telemetry_1 = __importDefault(require("./telemetry"));
|
|
@@ -103,8 +104,9 @@ function generateVarCombinations(vars) {
|
|
|
103
104
|
return combinations;
|
|
104
105
|
}
|
|
105
106
|
class Evaluator {
|
|
106
|
-
constructor(testSuite, options) {
|
|
107
|
+
constructor(testSuite, evalRecord, options) {
|
|
107
108
|
this.testSuite = testSuite;
|
|
109
|
+
this.evalRecord = evalRecord;
|
|
108
110
|
this.options = options;
|
|
109
111
|
this.stats = {
|
|
110
112
|
successes: 0,
|
|
@@ -120,7 +122,7 @@ class Evaluator {
|
|
|
120
122
|
this.registers = {};
|
|
121
123
|
}
|
|
122
124
|
async runEval({ provider, prompt, // raw prompt
|
|
123
|
-
test, delay, nunjucksFilters: filters, evaluateOptions, }) {
|
|
125
|
+
test, delay, nunjucksFilters: filters, evaluateOptions, testIdx, promptIdx, }) {
|
|
124
126
|
// Use the original prompt to set the label, not renderedPrompt
|
|
125
127
|
const promptLabel = prompt.label;
|
|
126
128
|
// Set up the special _conversation variable
|
|
@@ -145,6 +147,7 @@ class Evaluator {
|
|
|
145
147
|
provider: {
|
|
146
148
|
id: provider.id(),
|
|
147
149
|
label: provider.label,
|
|
150
|
+
config: provider.config,
|
|
148
151
|
},
|
|
149
152
|
prompt: {
|
|
150
153
|
raw: renderedPrompt,
|
|
@@ -215,6 +218,10 @@ class Evaluator {
|
|
|
215
218
|
latencyMs,
|
|
216
219
|
cost: response.cost,
|
|
217
220
|
metadata: response.metadata,
|
|
221
|
+
promptIdx,
|
|
222
|
+
testIdx,
|
|
223
|
+
testCase: test,
|
|
224
|
+
promptId: prompt.id || '',
|
|
218
225
|
};
|
|
219
226
|
if (response.error) {
|
|
220
227
|
ret.error = response.error;
|
|
@@ -291,12 +298,17 @@ class Evaluator {
|
|
|
291
298
|
score: 0,
|
|
292
299
|
namedScores: {},
|
|
293
300
|
latencyMs,
|
|
301
|
+
promptIdx,
|
|
302
|
+
testIdx,
|
|
303
|
+
testCase: test,
|
|
304
|
+
promptId: prompt.id || '',
|
|
294
305
|
};
|
|
295
306
|
}
|
|
296
307
|
}
|
|
297
308
|
async evaluate() {
|
|
298
309
|
const { testSuite, options } = this;
|
|
299
310
|
const prompts = [];
|
|
311
|
+
const rowsWithSelectBestAssertion = new Set();
|
|
300
312
|
await (0, evaluatorHelpers_1.runExtensionHook)(testSuite.extensions, 'beforeAll', { suite: testSuite });
|
|
301
313
|
if (options.generateSuggestions) {
|
|
302
314
|
// TODO(ian): Move this into its own command/file
|
|
@@ -449,7 +461,7 @@ class Evaluator {
|
|
|
449
461
|
}
|
|
450
462
|
// Set up eval cases
|
|
451
463
|
const runEvalOptions = [];
|
|
452
|
-
let
|
|
464
|
+
let testIdx = 0;
|
|
453
465
|
for (let index = 0; index < tests.length; index++) {
|
|
454
466
|
const testCase = tests[index];
|
|
455
467
|
(0, tiny_invariant_1.default)(Array.isArray(testSuite.defaultTest?.assert || []), `defaultTest.assert is not an array in test case #${index + 1}`);
|
|
@@ -469,7 +481,7 @@ class Evaluator {
|
|
|
469
481
|
const numRepeat = this.options.repeat || 1;
|
|
470
482
|
for (let repeatIndex = 0; repeatIndex < numRepeat; repeatIndex++) {
|
|
471
483
|
for (const vars of varCombinations) {
|
|
472
|
-
let
|
|
484
|
+
let promptIdx = 0;
|
|
473
485
|
// Order matters - keep provider in outer loop to reduce need to swap models during local inference.
|
|
474
486
|
for (const provider of testSuite.providers) {
|
|
475
487
|
for (const prompt of testSuite.prompts) {
|
|
@@ -486,30 +498,18 @@ class Evaluator {
|
|
|
486
498
|
},
|
|
487
499
|
test: { ...testCase, vars, options: testCase.options },
|
|
488
500
|
nunjucksFilters: testSuite.nunjucksFilters,
|
|
489
|
-
|
|
490
|
-
|
|
501
|
+
testIdx,
|
|
502
|
+
promptIdx,
|
|
491
503
|
repeatIndex,
|
|
492
504
|
evaluateOptions: options,
|
|
493
505
|
});
|
|
494
|
-
|
|
506
|
+
promptIdx++;
|
|
495
507
|
}
|
|
496
508
|
}
|
|
497
|
-
|
|
509
|
+
testIdx++;
|
|
498
510
|
}
|
|
499
511
|
}
|
|
500
512
|
}
|
|
501
|
-
// Set up table...
|
|
502
|
-
const isTest = tests.some((t) => !!t.assert);
|
|
503
|
-
const table = {
|
|
504
|
-
head: {
|
|
505
|
-
prompts,
|
|
506
|
-
vars: [
|
|
507
|
-
...Object.keys(testSuite.defaultTest?.vars || {}).sort(),
|
|
508
|
-
...Array.from(varNames).sort(),
|
|
509
|
-
],
|
|
510
|
-
},
|
|
511
|
-
body: [],
|
|
512
|
-
};
|
|
513
513
|
// Determine run parameters
|
|
514
514
|
let concurrency = options.maxConcurrency || exports.DEFAULT_MAX_CONCURRENCY;
|
|
515
515
|
if (concurrency > 1) {
|
|
@@ -525,7 +525,6 @@ class Evaluator {
|
|
|
525
525
|
}
|
|
526
526
|
}
|
|
527
527
|
// Actually run the eval
|
|
528
|
-
const results = [];
|
|
529
528
|
let numComplete = 0;
|
|
530
529
|
const processEvalStep = async (evalStep, index) => {
|
|
531
530
|
if (typeof index !== 'number') {
|
|
@@ -535,61 +534,21 @@ class Evaluator {
|
|
|
535
534
|
test: evalStep.test,
|
|
536
535
|
});
|
|
537
536
|
const row = await this.runEval(evalStep);
|
|
538
|
-
|
|
537
|
+
if (evalStep.test.assert?.some((a) => a.type === 'select-best')) {
|
|
538
|
+
rowsWithSelectBestAssertion.add(row.testIdx);
|
|
539
|
+
}
|
|
539
540
|
numComplete++;
|
|
540
541
|
if (options.progressCallback) {
|
|
541
|
-
options.progressCallback(results.length, runEvalOptions.length, index, evalStep);
|
|
542
|
-
}
|
|
543
|
-
// Bookkeeping for table
|
|
544
|
-
let resultText;
|
|
545
|
-
const outputTextDisplay = (typeof row.response?.output === 'object'
|
|
546
|
-
? JSON.stringify(row.response.output)
|
|
547
|
-
: row.response?.output || row.error || '');
|
|
548
|
-
if (isTest) {
|
|
549
|
-
if (row.success) {
|
|
550
|
-
resultText = `${outputTextDisplay || row.error || ''}`;
|
|
551
|
-
}
|
|
552
|
-
else {
|
|
553
|
-
resultText = `${row.error}\n---\n${outputTextDisplay}`;
|
|
554
|
-
}
|
|
542
|
+
options.progressCallback(this.evalRecord.results.length, runEvalOptions.length, index, evalStep);
|
|
555
543
|
}
|
|
556
|
-
|
|
557
|
-
|
|
544
|
+
try {
|
|
545
|
+
await this.evalRecord.addResult(row, evalStep.test);
|
|
558
546
|
}
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
}
|
|
562
|
-
const { rowIndex, colIndex } = evalStep;
|
|
563
|
-
if (!table.body[rowIndex]) {
|
|
564
|
-
table.body[rowIndex] = {
|
|
565
|
-
description: evalStep.test.description,
|
|
566
|
-
outputs: [],
|
|
567
|
-
test: evalStep.test,
|
|
568
|
-
vars: table.head.vars
|
|
569
|
-
.map((varName) => {
|
|
570
|
-
const varValue = evalStep.test.vars?.[varName] || '';
|
|
571
|
-
if (typeof varValue === 'string') {
|
|
572
|
-
return varValue;
|
|
573
|
-
}
|
|
574
|
-
return JSON.stringify(varValue);
|
|
575
|
-
})
|
|
576
|
-
.flat(),
|
|
577
|
-
};
|
|
547
|
+
catch (error) {
|
|
548
|
+
logger_1.default.error(`Error saving result: ${error} ${JSON.stringify(row)}`);
|
|
578
549
|
}
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
score: row.score,
|
|
582
|
-
namedScores: row.namedScores,
|
|
583
|
-
text: resultText,
|
|
584
|
-
prompt: row.prompt.raw,
|
|
585
|
-
provider: row.provider.label || row.provider.id,
|
|
586
|
-
latencyMs: row.latencyMs,
|
|
587
|
-
tokenUsage: row.response?.tokenUsage,
|
|
588
|
-
gradingResult: row.gradingResult,
|
|
589
|
-
cost: row.cost || 0,
|
|
590
|
-
metadata: row.metadata,
|
|
591
|
-
};
|
|
592
|
-
const metrics = table.head.prompts[colIndex].metrics;
|
|
550
|
+
const { promptIdx } = row;
|
|
551
|
+
const metrics = prompts[promptIdx].metrics;
|
|
593
552
|
(0, tiny_invariant_1.default)(metrics, 'Expected prompt.metrics to be set');
|
|
594
553
|
metrics.score += row.score;
|
|
595
554
|
for (const [key, value] of Object.entries(row.namedScores)) {
|
|
@@ -705,7 +664,7 @@ class Evaluator {
|
|
|
705
664
|
// Then run concurrent evaluations
|
|
706
665
|
await async_1.default.forEachOfLimit(concurrentRunEvalOptions, concurrency, processEvalStep);
|
|
707
666
|
// Do we have to run comparisons between row outputs?
|
|
708
|
-
const compareRowsCount =
|
|
667
|
+
const compareRowsCount = rowsWithSelectBestAssertion.size;
|
|
709
668
|
let progressBar;
|
|
710
669
|
if (compareRowsCount > 0 && multibar) {
|
|
711
670
|
progressBar = multibar.create(compareRowsCount, 0, {
|
|
@@ -714,60 +673,72 @@ class Evaluator {
|
|
|
714
673
|
vars: '',
|
|
715
674
|
});
|
|
716
675
|
}
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
676
|
+
let compareCount = 0;
|
|
677
|
+
for (const testIdx of rowsWithSelectBestAssertion) {
|
|
678
|
+
compareCount++;
|
|
679
|
+
const resultsToCompare = this.evalRecord.results.filter((r) => r.testIdx === testIdx);
|
|
680
|
+
if (resultsToCompare.length === 0) {
|
|
681
|
+
logger_1.default.warn(`Expected results to be found for test index ${testIdx}`);
|
|
682
|
+
continue;
|
|
683
|
+
}
|
|
684
|
+
const compareAssertion = resultsToCompare[0].testCase.assert?.find((a) => a.type === 'select-best');
|
|
720
685
|
if (compareAssertion) {
|
|
721
|
-
const outputs =
|
|
722
|
-
const gradingResults = await (0, assertions_1.runCompareAssertion)(
|
|
723
|
-
|
|
686
|
+
const outputs = resultsToCompare.map((r) => r.response?.output || '');
|
|
687
|
+
const gradingResults = await (0, assertions_1.runCompareAssertion)(resultsToCompare[0].testCase, compareAssertion, outputs);
|
|
688
|
+
for (let index = 0; index < resultsToCompare.length; index++) {
|
|
689
|
+
const result = resultsToCompare[index];
|
|
724
690
|
const gradingResult = gradingResults[index];
|
|
725
|
-
if (
|
|
726
|
-
|
|
691
|
+
if (result.gradingResult) {
|
|
692
|
+
result.gradingResult.tokensUsed = result.gradingResult.tokensUsed || {
|
|
727
693
|
total: 0,
|
|
728
694
|
prompt: 0,
|
|
729
695
|
completion: 0,
|
|
730
696
|
};
|
|
731
|
-
|
|
697
|
+
result.gradingResult.tokensUsed = result.gradingResult.tokensUsed || {
|
|
732
698
|
total: 0,
|
|
733
699
|
prompt: 0,
|
|
734
700
|
completion: 0,
|
|
735
701
|
};
|
|
736
|
-
|
|
737
|
-
(
|
|
738
|
-
|
|
739
|
-
(
|
|
702
|
+
result.gradingResult.tokensUsed.total =
|
|
703
|
+
(result.gradingResult.tokensUsed.total || 0) + (gradingResult.tokensUsed?.total || 0);
|
|
704
|
+
result.gradingResult.tokensUsed.prompt =
|
|
705
|
+
(result.gradingResult.tokensUsed.prompt || 0) +
|
|
740
706
|
(gradingResult.tokensUsed?.prompt || 0);
|
|
741
|
-
|
|
742
|
-
(
|
|
707
|
+
result.gradingResult.tokensUsed.completion =
|
|
708
|
+
(result.gradingResult.tokensUsed.completion || 0) +
|
|
743
709
|
(gradingResult.tokensUsed?.completion || 0);
|
|
744
|
-
|
|
745
|
-
|
|
710
|
+
result.success = result.gradingResult.pass =
|
|
711
|
+
result.gradingResult.pass && gradingResult.pass;
|
|
746
712
|
if (!gradingResult.pass) {
|
|
747
713
|
// Failure overrides the reason and the score
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
output.text = `${gradingResult.reason}\n---\n${output.text}`;
|
|
714
|
+
result.gradingResult.reason = gradingResult.reason;
|
|
715
|
+
result.score = result.gradingResult.score = gradingResult.score;
|
|
751
716
|
}
|
|
752
|
-
if (!
|
|
753
|
-
|
|
717
|
+
if (!result.gradingResult.componentResults) {
|
|
718
|
+
result.gradingResult.componentResults = [];
|
|
754
719
|
}
|
|
755
|
-
|
|
720
|
+
result.gradingResult.componentResults.push(gradingResult);
|
|
756
721
|
}
|
|
757
722
|
else {
|
|
758
|
-
|
|
723
|
+
result.gradingResult = gradingResult;
|
|
759
724
|
}
|
|
760
|
-
|
|
725
|
+
if (this.evalRecord.persisted) {
|
|
726
|
+
await result.save();
|
|
727
|
+
}
|
|
728
|
+
}
|
|
761
729
|
if (progressBar) {
|
|
762
730
|
progressBar.increment({
|
|
763
|
-
prompt:
|
|
731
|
+
prompt: resultsToCompare[0].prompt.raw.slice(0, 10).replace(/\n/g, ''),
|
|
764
732
|
});
|
|
765
733
|
}
|
|
766
734
|
else {
|
|
767
|
-
logger_1.default.debug(`Model-graded comparison #${
|
|
735
|
+
logger_1.default.debug(`Model-graded comparison #${compareCount} of ${compareRowsCount} complete`);
|
|
768
736
|
}
|
|
769
737
|
}
|
|
770
738
|
}
|
|
739
|
+
await this.evalRecord.addPrompts(prompts);
|
|
740
|
+
const providers = await provider_1.default.createMultiple(testSuite.providers);
|
|
741
|
+
await this.evalRecord.addProviders(providers);
|
|
771
742
|
// Finish up
|
|
772
743
|
if (multibar) {
|
|
773
744
|
multibar.stop();
|
|
@@ -776,7 +747,7 @@ class Evaluator {
|
|
|
776
747
|
progressBar.stop();
|
|
777
748
|
}
|
|
778
749
|
await (0, evaluatorHelpers_1.runExtensionHook)(testSuite.extensions, 'afterAll', {
|
|
779
|
-
results,
|
|
750
|
+
results: this.evalRecord.results.map((r) => r.toEvaluateResult()),
|
|
780
751
|
suite: testSuite,
|
|
781
752
|
});
|
|
782
753
|
telemetry_1.default.record('eval_ran', {
|
|
@@ -792,14 +763,14 @@ class Evaluator {
|
|
|
792
763
|
assertionTypes: Array.from(new Set(tests.flatMap((t) => t.assert || []).map((a) => a.type))).sort(),
|
|
793
764
|
eventSource: options.eventSource || 'default',
|
|
794
765
|
ci: (0, envars_1.isCI)(),
|
|
795
|
-
hasAnyPass: results.some((r) => r.success),
|
|
766
|
+
hasAnyPass: this.evalRecord.results.some((r) => r.success),
|
|
796
767
|
isRedteam: Boolean(testSuite.redteam),
|
|
797
768
|
});
|
|
798
|
-
return
|
|
769
|
+
return this.evalRecord;
|
|
799
770
|
}
|
|
800
771
|
}
|
|
801
|
-
function evaluate(testSuite, options) {
|
|
802
|
-
const ev = new Evaluator(testSuite, options);
|
|
772
|
+
function evaluate(testSuite, evalRecord, options) {
|
|
773
|
+
const ev = new Evaluator(testSuite, evalRecord, options);
|
|
803
774
|
return ev.evaluate();
|
|
804
775
|
}
|
|
805
776
|
//# sourceMappingURL=evaluator.js.map
|