promptfoo 0.91.3 → 0.92.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/drizzle/0006_harsh_caretaker.sql +42 -0
- package/dist/drizzle/0007_cloudy_wong.sql +1 -0
- package/dist/drizzle/meta/0006_snapshot.json +721 -0
- package/dist/drizzle/meta/0007_snapshot.json +723 -0
- package/dist/drizzle/meta/_journal.json +14 -0
- package/dist/package.json +10 -8
- package/dist/src/app/assets/{index-C6z1nbLN.js → index-BpjzEMiv.js} +243 -241
- package/dist/src/app/assets/{index.es-oqbvfIxR.js → index.es-ihzvEu35.js} +1 -1
- package/dist/src/app/assets/{sync-D2s75VlC.js → sync-BosjlpGJ.js} +1 -1
- package/dist/src/app/index.html +3 -3
- package/dist/src/assertions.js +2 -2
- package/dist/src/assertions.js.map +1 -1
- package/dist/src/commands/cache.d.ts.map +1 -1
- package/dist/src/commands/cache.js +0 -2
- package/dist/src/commands/cache.js.map +1 -1
- package/dist/src/commands/eval.d.ts.map +1 -1
- package/dist/src/commands/eval.js +19 -16
- package/dist/src/commands/eval.js.map +1 -1
- package/dist/src/commands/export.d.ts.map +1 -1
- package/dist/src/commands/export.js +8 -31
- package/dist/src/commands/export.js.map +1 -1
- package/dist/src/commands/import.d.ts.map +1 -1
- package/dist/src/commands/import.js +52 -13
- package/dist/src/commands/import.js.map +1 -1
- package/dist/src/commands/list.d.ts.map +1 -1
- package/dist/src/commands/list.js +35 -7
- package/dist/src/commands/list.js.map +1 -1
- package/dist/src/commands/share.d.ts +2 -2
- package/dist/src/commands/share.d.ts.map +1 -1
- package/dist/src/commands/share.js +12 -13
- package/dist/src/commands/share.js.map +1 -1
- package/dist/src/commands/show.d.ts.map +1 -1
- package/dist/src/commands/show.js +10 -6
- package/dist/src/commands/show.js.map +1 -1
- package/dist/src/constants.d.ts +1 -0
- package/dist/src/constants.d.ts.map +1 -1
- package/dist/src/constants.js +2 -1
- package/dist/src/constants.js.map +1 -1
- package/dist/src/database/index.js +1 -1
- package/dist/src/database/index.js.map +1 -1
- package/dist/src/database/tables.d.ts +609 -11
- package/dist/src/database/tables.d.ts.map +1 -1
- package/dist/src/database/tables.js +111 -52
- package/dist/src/database/tables.js.map +1 -1
- package/dist/src/database/types.d.ts +3 -3
- package/dist/src/database/types.d.ts.map +1 -1
- package/dist/src/evaluator.d.ts +3 -2
- package/dist/src/evaluator.d.ts.map +1 -1
- package/dist/src/evaluator.js +75 -104
- package/dist/src/evaluator.js.map +1 -1
- package/dist/src/evaluatorHelpers.d.ts.map +1 -1
- package/dist/src/evaluatorHelpers.js +2 -1
- package/dist/src/evaluatorHelpers.js.map +1 -1
- package/dist/src/index.d.ts +2 -1
- package/dist/src/index.d.ts.map +1 -1
- package/dist/src/index.js +18 -10
- package/dist/src/index.js.map +1 -1
- package/dist/src/models/eval.d.ts +95 -0
- package/dist/src/models/eval.d.ts.map +1 -0
- package/dist/src/models/eval.js +390 -0
- package/dist/src/models/eval.js.map +1 -0
- package/dist/src/models/evalResult.d.ts +50 -0
- package/dist/src/models/evalResult.d.ts.map +1 -0
- package/dist/src/models/evalResult.js +122 -0
- package/dist/src/models/evalResult.js.map +1 -0
- package/dist/src/models/provider.d.ts +9 -0
- package/dist/src/models/provider.d.ts.map +1 -0
- package/dist/src/models/provider.js +47 -0
- package/dist/src/models/provider.js.map +1 -0
- package/dist/src/prompts/index.d.ts.map +1 -1
- package/dist/src/prompts/index.js +2 -1
- package/dist/src/prompts/index.js.map +1 -1
- package/dist/src/prompts/utils.d.ts +1 -0
- package/dist/src/prompts/utils.d.ts.map +1 -1
- package/dist/src/prompts/utils.js +7 -0
- package/dist/src/prompts/utils.js.map +1 -1
- package/dist/src/providers/fal.d.ts +2 -2
- package/dist/src/providers/fal.d.ts.map +1 -1
- package/dist/src/providers/fal.js +2 -1
- package/dist/src/providers/fal.js.map +1 -1
- package/dist/src/providers/http.js +2 -2
- package/dist/src/providers/http.js.map +1 -1
- package/dist/src/providers/palm.d.ts +4 -3
- package/dist/src/providers/palm.d.ts.map +1 -1
- package/dist/src/providers/palm.js +13 -3
- package/dist/src/providers/palm.js.map +1 -1
- package/dist/src/providers.js +5 -5
- package/dist/src/providers.js.map +1 -1
- package/dist/src/redteam/eval/excessive-agency/llm_rubric-20240617.json +10 -0
- package/dist/src/redteam/eval/excessive-agency/llm_rubric-20240618.json +10 -0
- package/dist/src/redteam/eval/harmful/llm_rubric-20240723.json +10 -0
- package/dist/src/redteam/eval/harmful/llm_rubric-20240724.json +10 -0
- package/dist/src/server/server.d.ts +1 -0
- package/dist/src/server/server.d.ts.map +1 -1
- package/dist/src/server/server.js +70 -31
- package/dist/src/server/server.js.map +1 -1
- package/dist/src/share.d.ts +2 -2
- package/dist/src/share.d.ts.map +1 -1
- package/dist/src/share.js +93 -34
- package/dist/src/share.js.map +1 -1
- package/dist/src/table.d.ts +2 -2
- package/dist/src/table.d.ts.map +1 -1
- package/dist/src/table.js +3 -3
- package/dist/src/table.js.map +1 -1
- package/dist/src/types/index.d.ts +163 -11
- package/dist/src/types/index.d.ts.map +1 -1
- package/dist/src/types/index.js +21 -1
- package/dist/src/types/index.js.map +1 -1
- package/dist/src/util/config/load.d.ts.map +1 -1
- package/dist/src/util/config/load.js +2 -1
- package/dist/src/util/config/load.js.map +1 -1
- package/dist/src/util/config/manage.d.ts.map +1 -1
- package/dist/src/util/config/manage.js.map +1 -1
- package/dist/src/util/convertEvalResultsToTable.d.ts +16 -0
- package/dist/src/util/convertEvalResultsToTable.d.ts.map +1 -0
- package/dist/src/util/convertEvalResultsToTable.js +136 -0
- package/dist/src/util/convertEvalResultsToTable.js.map +1 -0
- package/dist/src/util/createHash.d.ts +1 -0
- package/dist/src/util/createHash.d.ts.map +1 -1
- package/dist/src/util/createHash.js +9 -0
- package/dist/src/util/createHash.js.map +1 -1
- package/dist/src/util/file.d.ts +8 -0
- package/dist/src/util/file.d.ts.map +1 -0
- package/dist/src/util/file.js +13 -0
- package/dist/src/util/file.js.map +1 -0
- package/dist/src/util/index.d.ts +9 -14
- package/dist/src/util/index.d.ts.map +1 -1
- package/dist/src/util/index.js +132 -268
- package/dist/src/util/index.js.map +1 -1
- package/dist/src/util/time.d.ts +2 -0
- package/dist/src/util/time.d.ts.map +1 -0
- package/dist/src/util/time.js +7 -0
- package/dist/src/util/time.js.map +1 -0
- package/dist/src/util/transform.js +2 -2
- package/dist/src/util/transform.js.map +1 -1
- package/dist/src/validators/providers.d.ts +6 -0
- package/dist/src/validators/providers.d.ts.map +1 -1
- package/dist/src/validators/providers.js +1 -0
- package/dist/src/validators/providers.js.map +1 -1
- package/dist/src/validators/redteam.d.ts +6 -0
- package/dist/src/validators/redteam.d.ts.map +1 -1
- package/dist/test/commands/eval/filterFailingTests.test.js +24 -2
- package/dist/test/commands/eval/filterFailingTests.test.js.map +1 -1
- package/dist/test/evaluator.test.js +152 -74
- package/dist/test/evaluator.test.js.map +1 -1
- package/dist/test/factories/data/eval/database_records.d.ts +142 -0
- package/dist/test/factories/data/eval/database_records.d.ts.map +1 -0
- package/dist/test/factories/data/eval/database_records.js +251 -0
- package/dist/test/factories/data/eval/database_records.js.map +1 -0
- package/dist/test/factories/evalFactory.d.ts +768 -0
- package/dist/test/factories/evalFactory.d.ts.map +1 -0
- package/dist/test/factories/evalFactory.js +121 -0
- package/dist/test/factories/evalFactory.js.map +1 -0
- package/dist/test/index.test.js +20 -35
- package/dist/test/index.test.js.map +1 -1
- package/dist/test/models/eval.test.d.ts +2 -0
- package/dist/test/models/eval.test.d.ts.map +1 -0
- package/dist/test/models/eval.test.js +34 -0
- package/dist/test/models/eval.test.js.map +1 -0
- package/dist/test/providers.test.js +3 -3
- package/dist/test/providers.test.js.map +1 -1
- package/dist/test/server/share.test.d.ts +2 -0
- package/dist/test/server/share.test.d.ts.map +1 -0
- package/dist/test/server/share.test.js +36 -0
- package/dist/test/server/share.test.js.map +1 -0
- package/dist/test/server/v3evalToShare.json +507 -0
- package/dist/test/server/v4evalToShare.json +421 -0
- package/dist/test/types.test.js +56 -3
- package/dist/test/types.test.js.map +1 -1
- package/dist/test/util.file.test.d.ts +2 -0
- package/dist/test/util.file.test.d.ts.map +1 -0
- package/dist/test/util.file.test.js +32 -0
- package/dist/test/util.file.test.js.map +1 -0
- package/dist/test/util.listPrevious.test.d.ts +2 -0
- package/dist/test/util.listPrevious.test.d.ts.map +1 -0
- package/dist/test/util.listPrevious.test.js +37 -0
- package/dist/test/util.listPrevious.test.js.map +1 -0
- package/dist/test/util.test.js +38 -311
- package/dist/test/util.test.js.map +1 -1
- package/dist/tsconfig.tsbuildinfo +1 -0
- package/package.json +10 -8
|
@@ -3,9 +3,12 @@ var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
|
3
3
|
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
4
|
};
|
|
5
5
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
const crypto_1 = require("crypto");
|
|
6
7
|
const glob_1 = __importDefault(require("glob"));
|
|
7
8
|
const evaluator_1 = require("../src/evaluator");
|
|
8
9
|
const evaluatorHelpers_1 = require("../src/evaluatorHelpers");
|
|
10
|
+
const migrate_1 = require("../src/migrate");
|
|
11
|
+
const eval_1 = __importDefault(require("../src/models/eval"));
|
|
9
12
|
jest.mock('node-fetch', () => jest.fn());
|
|
10
13
|
jest.mock('proxy-agent', () => ({
|
|
11
14
|
ProxyAgent: jest.fn().mockImplementation(() => ({})),
|
|
@@ -13,21 +16,7 @@ jest.mock('proxy-agent', () => ({
|
|
|
13
16
|
jest.mock('glob', () => ({
|
|
14
17
|
globSync: jest.fn(),
|
|
15
18
|
}));
|
|
16
|
-
jest.mock('fs', () => ({
|
|
17
|
-
readFileSync: jest.fn(),
|
|
18
|
-
writeFileSync: jest.fn(),
|
|
19
|
-
statSync: jest.fn(),
|
|
20
|
-
readdirSync: jest.fn(),
|
|
21
|
-
existsSync: jest.fn(),
|
|
22
|
-
mkdirSync: jest.fn(),
|
|
23
|
-
promises: {
|
|
24
|
-
readFile: jest.fn(),
|
|
25
|
-
},
|
|
26
|
-
}));
|
|
27
19
|
jest.mock('../src/esm');
|
|
28
|
-
jest.mock('../src/database', () => ({
|
|
29
|
-
getDb: jest.fn(),
|
|
30
|
-
}));
|
|
31
20
|
jest.mock('../src/logger');
|
|
32
21
|
jest.mock('../src/evaluatorHelpers', () => ({
|
|
33
22
|
...jest.requireActual('../src/evaluatorHelpers'),
|
|
@@ -40,6 +29,13 @@ const mockApiProvider = {
|
|
|
40
29
|
tokenUsage: { total: 10, prompt: 5, completion: 5, cached: 0 },
|
|
41
30
|
}),
|
|
42
31
|
};
|
|
32
|
+
const mockApiProvider2 = {
|
|
33
|
+
id: jest.fn().mockReturnValue('test-provider-2'),
|
|
34
|
+
callApi: jest.fn().mockResolvedValue({
|
|
35
|
+
output: 'Test output',
|
|
36
|
+
tokenUsage: { total: 10, prompt: 5, completion: 5, cached: 0 },
|
|
37
|
+
}),
|
|
38
|
+
};
|
|
43
39
|
const mockGradingApiProviderPasses = {
|
|
44
40
|
id: jest.fn().mockReturnValue('test-grading-provider'),
|
|
45
41
|
callApi: jest.fn().mockResolvedValue({
|
|
@@ -58,6 +54,9 @@ function toPrompt(text) {
|
|
|
58
54
|
return { raw: text, label: text };
|
|
59
55
|
}
|
|
60
56
|
describe('evaluator', () => {
|
|
57
|
+
beforeAll(async () => {
|
|
58
|
+
await (0, migrate_1.runDbMigrations)();
|
|
59
|
+
});
|
|
61
60
|
beforeEach(() => {
|
|
62
61
|
jest.clearAllMocks();
|
|
63
62
|
});
|
|
@@ -74,7 +73,9 @@ describe('evaluator', () => {
|
|
|
74
73
|
},
|
|
75
74
|
],
|
|
76
75
|
};
|
|
77
|
-
const
|
|
76
|
+
const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
|
|
77
|
+
await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
|
|
78
|
+
const summary = await evalRecord.toEvaluateSummary();
|
|
78
79
|
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
|
|
79
80
|
expect(summary.stats.successes).toBe(1);
|
|
80
81
|
expect(summary.stats.failures).toBe(0);
|
|
@@ -93,7 +94,9 @@ describe('evaluator', () => {
|
|
|
93
94
|
},
|
|
94
95
|
],
|
|
95
96
|
};
|
|
96
|
-
const
|
|
97
|
+
const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
|
|
98
|
+
await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
|
|
99
|
+
const summary = await evalRecord.toEvaluateSummary();
|
|
97
100
|
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
|
|
98
101
|
expect(summary.stats.successes).toBe(1);
|
|
99
102
|
expect(summary.stats.failures).toBe(0);
|
|
@@ -112,7 +115,9 @@ describe('evaluator', () => {
|
|
|
112
115
|
},
|
|
113
116
|
],
|
|
114
117
|
};
|
|
115
|
-
const
|
|
118
|
+
const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
|
|
119
|
+
await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
|
|
120
|
+
const summary = await evalRecord.toEvaluateSummary();
|
|
116
121
|
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
|
|
117
122
|
expect(summary.stats.successes).toBe(1);
|
|
118
123
|
expect(summary.stats.failures).toBe(0);
|
|
@@ -131,7 +136,9 @@ describe('evaluator', () => {
|
|
|
131
136
|
},
|
|
132
137
|
],
|
|
133
138
|
};
|
|
134
|
-
const
|
|
139
|
+
const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
|
|
140
|
+
await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
|
|
141
|
+
const summary = await evalRecord.toEvaluateSummary();
|
|
135
142
|
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
|
|
136
143
|
expect(summary.stats.successes).toBe(1);
|
|
137
144
|
expect(summary.stats.failures).toBe(0);
|
|
@@ -150,7 +157,9 @@ describe('evaluator', () => {
|
|
|
150
157
|
},
|
|
151
158
|
],
|
|
152
159
|
};
|
|
153
|
-
const
|
|
160
|
+
const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
|
|
161
|
+
await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
|
|
162
|
+
const summary = await evalRecord.toEvaluateSummary();
|
|
154
163
|
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(4);
|
|
155
164
|
expect(summary.stats.successes).toBe(4);
|
|
156
165
|
expect(summary.stats.failures).toBe(0);
|
|
@@ -169,7 +178,9 @@ describe('evaluator', () => {
|
|
|
169
178
|
},
|
|
170
179
|
],
|
|
171
180
|
};
|
|
172
|
-
const
|
|
181
|
+
const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
|
|
182
|
+
await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
|
|
183
|
+
const summary = await evalRecord.toEvaluateSummary();
|
|
173
184
|
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(2);
|
|
174
185
|
expect(summary.stats.successes).toBe(2);
|
|
175
186
|
expect(summary.stats.failures).toBe(0);
|
|
@@ -183,7 +194,9 @@ describe('evaluator', () => {
|
|
|
183
194
|
providers: [mockApiProvider],
|
|
184
195
|
prompts: [toPrompt('Test prompt')],
|
|
185
196
|
};
|
|
186
|
-
const
|
|
197
|
+
const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
|
|
198
|
+
await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
|
|
199
|
+
const summary = await evalRecord.toEvaluateSummary();
|
|
187
200
|
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
|
|
188
201
|
expect(summary.stats.successes).toBe(1);
|
|
189
202
|
expect(summary.stats.failures).toBe(0);
|
|
@@ -197,7 +210,9 @@ describe('evaluator', () => {
|
|
|
197
210
|
providers: [mockApiProvider, mockApiProvider, mockApiProvider],
|
|
198
211
|
prompts: [toPrompt('Test prompt')],
|
|
199
212
|
};
|
|
200
|
-
const
|
|
213
|
+
const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
|
|
214
|
+
await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
|
|
215
|
+
const summary = await evalRecord.toEvaluateSummary();
|
|
201
216
|
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(3);
|
|
202
217
|
expect(summary.stats.successes).toBe(3);
|
|
203
218
|
expect(summary.stats.failures).toBe(0);
|
|
@@ -221,7 +236,9 @@ describe('evaluator', () => {
|
|
|
221
236
|
},
|
|
222
237
|
],
|
|
223
238
|
};
|
|
224
|
-
const
|
|
239
|
+
const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
|
|
240
|
+
await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
|
|
241
|
+
const summary = await evalRecord.toEvaluateSummary();
|
|
225
242
|
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
|
|
226
243
|
expect(summary.stats.successes).toBe(1);
|
|
227
244
|
expect(summary.stats.failures).toBe(0);
|
|
@@ -243,7 +260,9 @@ describe('evaluator', () => {
|
|
|
243
260
|
},
|
|
244
261
|
],
|
|
245
262
|
};
|
|
246
|
-
const
|
|
263
|
+
const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
|
|
264
|
+
await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
|
|
265
|
+
const summary = await evalRecord.toEvaluateSummary();
|
|
247
266
|
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
|
|
248
267
|
expect(summary.stats.successes).toBe(0);
|
|
249
268
|
expect(summary.stats.failures).toBe(1);
|
|
@@ -265,7 +284,9 @@ describe('evaluator', () => {
|
|
|
265
284
|
},
|
|
266
285
|
],
|
|
267
286
|
};
|
|
268
|
-
const
|
|
287
|
+
const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
|
|
288
|
+
await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
|
|
289
|
+
const summary = await evalRecord.toEvaluateSummary();
|
|
269
290
|
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
|
|
270
291
|
expect(summary.stats.successes).toBe(1);
|
|
271
292
|
expect(summary.stats.failures).toBe(0);
|
|
@@ -287,7 +308,9 @@ describe('evaluator', () => {
|
|
|
287
308
|
},
|
|
288
309
|
],
|
|
289
310
|
};
|
|
290
|
-
const
|
|
311
|
+
const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
|
|
312
|
+
await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
|
|
313
|
+
const summary = await evalRecord.toEvaluateSummary();
|
|
291
314
|
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
|
|
292
315
|
expect(summary.stats.successes).toBe(0);
|
|
293
316
|
expect(summary.stats.failures).toBe(1);
|
|
@@ -314,7 +337,9 @@ describe('evaluator', () => {
|
|
|
314
337
|
},
|
|
315
338
|
},
|
|
316
339
|
};
|
|
317
|
-
const
|
|
340
|
+
const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
|
|
341
|
+
await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
|
|
342
|
+
const summary = await evalRecord.toEvaluateSummary();
|
|
318
343
|
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
|
|
319
344
|
expect(summary.stats.successes).toBe(1);
|
|
320
345
|
expect(summary.stats.failures).toBe(0);
|
|
@@ -341,7 +366,9 @@ describe('evaluator', () => {
|
|
|
341
366
|
},
|
|
342
367
|
},
|
|
343
368
|
};
|
|
344
|
-
const
|
|
369
|
+
const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
|
|
370
|
+
await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
|
|
371
|
+
const summary = await evalRecord.toEvaluateSummary();
|
|
345
372
|
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
|
|
346
373
|
expect(summary.stats.successes).toBe(0);
|
|
347
374
|
expect(summary.stats.failures).toBe(1);
|
|
@@ -358,7 +385,9 @@ describe('evaluator', () => {
|
|
|
358
385
|
},
|
|
359
386
|
},
|
|
360
387
|
};
|
|
361
|
-
const
|
|
388
|
+
const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
|
|
389
|
+
await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
|
|
390
|
+
const summary = await evalRecord.toEvaluateSummary();
|
|
362
391
|
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
|
|
363
392
|
expect(summary.stats.successes).toBe(1);
|
|
364
393
|
expect(summary.stats.failures).toBe(0);
|
|
@@ -382,7 +411,9 @@ describe('evaluator', () => {
|
|
|
382
411
|
},
|
|
383
412
|
],
|
|
384
413
|
};
|
|
385
|
-
const
|
|
414
|
+
const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
|
|
415
|
+
await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
|
|
416
|
+
const summary = await evalRecord.toEvaluateSummary();
|
|
386
417
|
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
|
|
387
418
|
expect(summary.stats.successes).toBe(1);
|
|
388
419
|
expect(summary.stats.failures).toBe(0);
|
|
@@ -413,7 +444,9 @@ describe('evaluator', () => {
|
|
|
413
444
|
},
|
|
414
445
|
],
|
|
415
446
|
};
|
|
416
|
-
const
|
|
447
|
+
const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
|
|
448
|
+
await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
|
|
449
|
+
const summary = await evalRecord.toEvaluateSummary();
|
|
417
450
|
expect(mockApiJsonProvider.callApi).toHaveBeenCalledTimes(1);
|
|
418
451
|
expect(summary.stats.successes).toBe(1);
|
|
419
452
|
expect(summary.stats.failures).toBe(0);
|
|
@@ -443,7 +476,9 @@ describe('evaluator', () => {
|
|
|
443
476
|
},
|
|
444
477
|
],
|
|
445
478
|
};
|
|
446
|
-
const
|
|
479
|
+
const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
|
|
480
|
+
await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
|
|
481
|
+
const summary = await evalRecord.toEvaluateSummary();
|
|
447
482
|
expect(mockApiProviderWithTransform.callApi).toHaveBeenCalledTimes(1);
|
|
448
483
|
expect(summary.stats.successes).toBe(1);
|
|
449
484
|
expect(summary.stats.failures).toBe(0);
|
|
@@ -470,7 +505,10 @@ describe('evaluator', () => {
|
|
|
470
505
|
},
|
|
471
506
|
},
|
|
472
507
|
};
|
|
473
|
-
await
|
|
508
|
+
const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
|
|
509
|
+
await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
|
|
510
|
+
const summary = await evalRecord.toEvaluateSummary();
|
|
511
|
+
expect(summary).toEqual(expect.objectContaining({
|
|
474
512
|
stats: expect.objectContaining({
|
|
475
513
|
successes: 2,
|
|
476
514
|
failures: 0,
|
|
@@ -534,7 +572,10 @@ describe('evaluator', () => {
|
|
|
534
572
|
},
|
|
535
573
|
],
|
|
536
574
|
};
|
|
537
|
-
await
|
|
575
|
+
const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
|
|
576
|
+
await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
|
|
577
|
+
const summary = await evalRecord.toEvaluateSummary();
|
|
578
|
+
expect(summary).toEqual(expect.objectContaining({
|
|
538
579
|
stats: expect.objectContaining({
|
|
539
580
|
successes: 1,
|
|
540
581
|
failures: 0,
|
|
@@ -562,7 +603,9 @@ describe('evaluator', () => {
|
|
|
562
603
|
},
|
|
563
604
|
],
|
|
564
605
|
};
|
|
565
|
-
const
|
|
606
|
+
const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
|
|
607
|
+
await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
|
|
608
|
+
const summary = await evalRecord.toEvaluateSummary();
|
|
566
609
|
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
|
|
567
610
|
expect(summary.stats.successes).toBe(1);
|
|
568
611
|
expect(summary.stats.failures).toBe(0);
|
|
@@ -595,7 +638,9 @@ describe('evaluator', () => {
|
|
|
595
638
|
},
|
|
596
639
|
],
|
|
597
640
|
};
|
|
598
|
-
const
|
|
641
|
+
const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
|
|
642
|
+
await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
|
|
643
|
+
const summary = await evalRecord.toEvaluateSummary();
|
|
599
644
|
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(2);
|
|
600
645
|
expect(summary).toMatchObject({
|
|
601
646
|
stats: {
|
|
@@ -651,7 +696,9 @@ describe('evaluator', () => {
|
|
|
651
696
|
},
|
|
652
697
|
],
|
|
653
698
|
};
|
|
654
|
-
const
|
|
699
|
+
const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
|
|
700
|
+
await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
|
|
701
|
+
const summary = await evalRecord.toEvaluateSummary();
|
|
655
702
|
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(2);
|
|
656
703
|
expect(summary.stats.successes).toBe(2);
|
|
657
704
|
expect(summary.stats.failures).toBe(0);
|
|
@@ -706,7 +753,9 @@ describe('evaluator', () => {
|
|
|
706
753
|
},
|
|
707
754
|
],
|
|
708
755
|
};
|
|
709
|
-
const
|
|
756
|
+
const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
|
|
757
|
+
await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
|
|
758
|
+
const summary = await evalRecord.toEvaluateSummary();
|
|
710
759
|
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(4);
|
|
711
760
|
expect(summary.stats.successes).toBe(4);
|
|
712
761
|
expect(summary.stats.failures).toBe(0);
|
|
@@ -764,8 +813,10 @@ describe('evaluator', () => {
|
|
|
764
813
|
},
|
|
765
814
|
],
|
|
766
815
|
};
|
|
767
|
-
const
|
|
768
|
-
|
|
816
|
+
const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
|
|
817
|
+
await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
|
|
818
|
+
const summary = await evalRecord.toEvaluateSummary();
|
|
819
|
+
expect(summary).toMatchObject({
|
|
769
820
|
stats: {
|
|
770
821
|
successes: 2,
|
|
771
822
|
failures: 0,
|
|
@@ -787,7 +838,7 @@ describe('evaluator', () => {
|
|
|
787
838
|
}),
|
|
788
839
|
]),
|
|
789
840
|
});
|
|
790
|
-
expect(
|
|
841
|
+
expect(summary.results[0].testCase.metadata).toEqual({
|
|
791
842
|
defaultKey: 'defaultValue',
|
|
792
843
|
configKey: 'configValue',
|
|
793
844
|
testKey: 'testValue',
|
|
@@ -807,8 +858,9 @@ describe('evaluator', () => {
|
|
|
807
858
|
},
|
|
808
859
|
],
|
|
809
860
|
};
|
|
810
|
-
const
|
|
811
|
-
|
|
861
|
+
const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
|
|
862
|
+
await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
|
|
863
|
+
expect(evalRecord.results[0].testCase.metadata).toEqual({
|
|
812
864
|
defaultKey: 'defaultValue',
|
|
813
865
|
testKey: 'testValue',
|
|
814
866
|
});
|
|
@@ -835,7 +887,9 @@ describe('evaluator', () => {
|
|
|
835
887
|
},
|
|
836
888
|
],
|
|
837
889
|
};
|
|
838
|
-
const
|
|
890
|
+
const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
|
|
891
|
+
await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
|
|
892
|
+
const summary = await evalRecord.toEvaluateSummary();
|
|
839
893
|
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(2);
|
|
840
894
|
expect(summary.stats.successes).toBe(2);
|
|
841
895
|
expect(summary.stats.failures).toBe(0);
|
|
@@ -875,7 +929,10 @@ describe('evaluator', () => {
|
|
|
875
929
|
'unlabeled-provider-id': ['prompt2'],
|
|
876
930
|
},
|
|
877
931
|
};
|
|
878
|
-
await
|
|
932
|
+
const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
|
|
933
|
+
await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
|
|
934
|
+
const summary = await evalRecord.toEvaluateSummary();
|
|
935
|
+
expect(summary).toMatchObject({
|
|
879
936
|
stats: expect.objectContaining({
|
|
880
937
|
successes: 2,
|
|
881
938
|
failures: 0,
|
|
@@ -900,19 +957,15 @@ describe('evaluator', () => {
|
|
|
900
957
|
}),
|
|
901
958
|
}),
|
|
902
959
|
],
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
|
|
906
|
-
|
|
907
|
-
provider: 'Labeled Provider',
|
|
908
|
-
}),
|
|
909
|
-
expect.objectContaining({
|
|
910
|
-
provider: 'unlabeled-provider-id',
|
|
911
|
-
}),
|
|
912
|
-
],
|
|
913
|
-
}),
|
|
960
|
+
});
|
|
961
|
+
expect(evalRecord.prompts).toEqual(expect.arrayContaining([
|
|
962
|
+
expect.objectContaining({
|
|
963
|
+
provider: 'Labeled Provider',
|
|
914
964
|
}),
|
|
915
|
-
|
|
965
|
+
expect.objectContaining({
|
|
966
|
+
provider: 'unlabeled-provider-id',
|
|
967
|
+
}),
|
|
968
|
+
]));
|
|
916
969
|
expect(mockLabeledProvider.callApi).toHaveBeenCalledTimes(1);
|
|
917
970
|
expect(mockUnlabeledProvider.callApi).toHaveBeenCalledTimes(1);
|
|
918
971
|
});
|
|
@@ -929,7 +982,9 @@ describe('evaluator', () => {
|
|
|
929
982
|
},
|
|
930
983
|
],
|
|
931
984
|
};
|
|
932
|
-
const
|
|
985
|
+
const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
|
|
986
|
+
await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
|
|
987
|
+
const summary = await evalRecord.toEvaluateSummary();
|
|
933
988
|
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
|
|
934
989
|
expect(summary.stats.successes).toBe(1);
|
|
935
990
|
expect(summary.stats.failures).toBe(0);
|
|
@@ -961,7 +1016,9 @@ describe('evaluator', () => {
|
|
|
961
1016
|
},
|
|
962
1017
|
],
|
|
963
1018
|
};
|
|
964
|
-
const
|
|
1019
|
+
const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
|
|
1020
|
+
await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
|
|
1021
|
+
const summary = await evalRecord.toEvaluateSummary();
|
|
965
1022
|
expect(mockApiProviderWithTransform.callApi).toHaveBeenCalledTimes(1);
|
|
966
1023
|
expect(summary.stats.successes).toBe(1);
|
|
967
1024
|
expect(summary.stats.failures).toBe(0);
|
|
@@ -993,19 +1050,16 @@ describe('evaluator', () => {
|
|
|
993
1050
|
},
|
|
994
1051
|
],
|
|
995
1052
|
};
|
|
996
|
-
await
|
|
1053
|
+
const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
|
|
1054
|
+
await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
|
|
1055
|
+
const summary = await evalRecord.toEvaluateSummary();
|
|
1056
|
+
expect(summary).toMatchObject({
|
|
997
1057
|
stats: expect.objectContaining({
|
|
998
1058
|
successes: 1,
|
|
999
1059
|
failures: 0,
|
|
1000
1060
|
}),
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
response: expect.objectContaining({
|
|
1004
|
-
output: 'Postprocess: Provider: Original output',
|
|
1005
|
-
}),
|
|
1006
|
-
}),
|
|
1007
|
-
]),
|
|
1008
|
-
}));
|
|
1061
|
+
});
|
|
1062
|
+
expect(summary.results[0].response?.output).toBe('Postprocess: Provider: Original output');
|
|
1009
1063
|
expect(mockApiProviderWithTransform.callApi).toHaveBeenCalledTimes(1);
|
|
1010
1064
|
});
|
|
1011
1065
|
it('evaluate with provider transform, test transform, and test postprocess (deprecated)', async () => {
|
|
@@ -1035,7 +1089,10 @@ describe('evaluator', () => {
|
|
|
1035
1089
|
},
|
|
1036
1090
|
],
|
|
1037
1091
|
};
|
|
1038
|
-
await
|
|
1092
|
+
const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
|
|
1093
|
+
await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
|
|
1094
|
+
const summary = await evalRecord.toEvaluateSummary();
|
|
1095
|
+
expect(summary).toMatchObject({
|
|
1039
1096
|
stats: expect.objectContaining({
|
|
1040
1097
|
successes: 1,
|
|
1041
1098
|
failures: 0,
|
|
@@ -1047,7 +1104,7 @@ describe('evaluator', () => {
|
|
|
1047
1104
|
}),
|
|
1048
1105
|
}),
|
|
1049
1106
|
]),
|
|
1050
|
-
})
|
|
1107
|
+
});
|
|
1051
1108
|
expect(mockApiProviderWithTransform.callApi).toHaveBeenCalledTimes(1);
|
|
1052
1109
|
});
|
|
1053
1110
|
it('evaluate with no output', async () => {
|
|
@@ -1063,7 +1120,9 @@ describe('evaluator', () => {
|
|
|
1063
1120
|
prompts: [toPrompt('Test prompt')],
|
|
1064
1121
|
tests: [],
|
|
1065
1122
|
};
|
|
1066
|
-
const
|
|
1123
|
+
const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
|
|
1124
|
+
await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
|
|
1125
|
+
const summary = await evalRecord.toEvaluateSummary();
|
|
1067
1126
|
expect(summary.stats.successes).toBe(0);
|
|
1068
1127
|
expect(summary.stats.failures).toBe(1);
|
|
1069
1128
|
expect(summary.results[0].error).toBe('No output');
|
|
@@ -1084,7 +1143,9 @@ describe('evaluator', () => {
|
|
|
1084
1143
|
prompts: [toPrompt('Test prompt')],
|
|
1085
1144
|
tests: [],
|
|
1086
1145
|
};
|
|
1087
|
-
const
|
|
1146
|
+
const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
|
|
1147
|
+
await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
|
|
1148
|
+
const summary = await evalRecord.toEvaluateSummary();
|
|
1088
1149
|
expect(summary.stats.successes).toBe(1);
|
|
1089
1150
|
expect(summary.stats.failures).toBe(0);
|
|
1090
1151
|
expect(summary.results[0].success).toBe(true);
|
|
@@ -1126,7 +1187,9 @@ describe('evaluator', () => {
|
|
|
1126
1187
|
],
|
|
1127
1188
|
tests: [{ vars: { problem: '8x + 31 = 2' } }],
|
|
1128
1189
|
};
|
|
1129
|
-
const
|
|
1190
|
+
const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
|
|
1191
|
+
await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
|
|
1192
|
+
const summary = await evalRecord.toEvaluateSummary();
|
|
1130
1193
|
expect(summary.stats.successes).toBe(1);
|
|
1131
1194
|
expect(summary.stats.failures).toBe(0);
|
|
1132
1195
|
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
|
|
@@ -1167,7 +1230,8 @@ describe('evaluator', () => {
|
|
|
1167
1230
|
};
|
|
1168
1231
|
const mockedRunExtensionHook = jest.mocked(evaluatorHelpers_1.runExtensionHook);
|
|
1169
1232
|
mockedRunExtensionHook.mockClear();
|
|
1170
|
-
await (0,
|
|
1233
|
+
const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
|
|
1234
|
+
await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
|
|
1171
1235
|
// Check if runExtensionHook was called 4 times (beforeAll, beforeEach, afterEach, afterAll)
|
|
1172
1236
|
expect(mockedRunExtensionHook).toHaveBeenCalledTimes(4);
|
|
1173
1237
|
// Check beforeAll call
|
|
@@ -1207,6 +1271,20 @@ describe('evaluator', () => {
|
|
|
1207
1271
|
suite: testSuite,
|
|
1208
1272
|
}));
|
|
1209
1273
|
});
|
|
1274
|
+
it('should handle multiple providers', async () => {
|
|
1275
|
+
const testSuite = {
|
|
1276
|
+
providers: [mockApiProvider, mockApiProvider2],
|
|
1277
|
+
prompts: [toPrompt('Test prompt')],
|
|
1278
|
+
tests: [],
|
|
1279
|
+
};
|
|
1280
|
+
const evalRecord = await eval_1.default.create({}, testSuite.prompts, { id: (0, crypto_1.randomUUID)() });
|
|
1281
|
+
await (0, evaluator_1.evaluate)(testSuite, evalRecord, {});
|
|
1282
|
+
const summary = await evalRecord.toEvaluateSummary();
|
|
1283
|
+
expect(summary.stats.successes).toBe(2);
|
|
1284
|
+
expect(summary.stats.failures).toBe(0);
|
|
1285
|
+
expect(mockApiProvider.callApi).toHaveBeenCalledTimes(1);
|
|
1286
|
+
expect(mockApiProvider2.callApi).toHaveBeenCalledTimes(1);
|
|
1287
|
+
});
|
|
1210
1288
|
});
|
|
1211
1289
|
describe('generateVarCombinations', () => {
|
|
1212
1290
|
it('should generate combinations for simple variables', () => {
|