promptfoo 0.17.8 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -0
- package/dist/package.json +1 -1
- package/dist/src/assertions.d.ts.map +1 -1
- package/dist/src/assertions.js +97 -42
- package/dist/src/assertions.js.map +1 -1
- package/dist/src/evaluator.d.ts.map +1 -1
- package/dist/src/evaluator.js +43 -7
- package/dist/src/evaluator.js.map +1 -1
- package/dist/src/index.d.ts.map +1 -1
- package/dist/src/index.js +3 -0
- package/dist/src/index.js.map +1 -1
- package/dist/src/main.js +9 -0
- package/dist/src/main.js.map +1 -1
- package/dist/src/providers.d.ts +2 -2
- package/dist/src/providers.d.ts.map +1 -1
- package/dist/src/providers.js +15 -1
- package/dist/src/providers.js.map +1 -1
- package/dist/src/table.js +2 -2
- package/dist/src/table.js.map +1 -1
- package/dist/src/types.d.ts +15 -4
- package/dist/src/types.d.ts.map +1 -1
- package/dist/src/util.d.ts +3 -2
- package/dist/src/util.d.ts.map +1 -1
- package/dist/src/util.js +70 -18
- package/dist/src/util.js.map +1 -1
- package/dist/src/web/client/assets/{index-0c6f887d.js → index-8388d689.js} +1 -1
- package/dist/src/web/client/assets/{index-f9b230d1.css → index-d2b6a160.css} +1 -1
- package/dist/src/web/client/index.html +2 -2
- package/package.json +1 -1
- package/src/assertions.ts +102 -49
- package/src/evaluator.ts +42 -4
- package/src/index.ts +6 -1
- package/src/main.ts +14 -0
- package/src/providers.ts +22 -4
- package/src/table.ts +2 -2
- package/src/types.ts +29 -3
- package/src/util.ts +82 -17
- package/src/web/client/package-lock.json +5726 -0
- package/src/web/client/src/ResultsTable.css +11 -1
- package/src/web/client/src/ResultsTable.tsx +10 -0
- package/src/web/client/src/ResultsView.tsx +7 -1
- package/src/web/client/src/types.ts +4 -0
package/src/main.ts
CHANGED
|
@@ -281,6 +281,7 @@ async function main() {
|
|
|
281
281
|
prompts: cmdObj.prompts || fileConfig.prompts || defaultConfig.prompts,
|
|
282
282
|
providers: cmdObj.providers || fileConfig.providers || defaultConfig.providers,
|
|
283
283
|
tests: cmdObj.tests || cmdObj.vars || fileConfig.tests || defaultConfig.tests,
|
|
284
|
+
scenarios: fileConfig.scenarios || defaultConfig.scenarios,
|
|
284
285
|
sharing:
|
|
285
286
|
process.env.PROMPTFOO_DISABLE_SHARING === '1'
|
|
286
287
|
? false
|
|
@@ -310,6 +311,18 @@ async function main() {
|
|
|
310
311
|
config.tests,
|
|
311
312
|
cmdObj.tests ? undefined : basePath,
|
|
312
313
|
);
|
|
314
|
+
|
|
315
|
+
//parse testCases for each scenario
|
|
316
|
+
if (fileConfig.scenarios) {
|
|
317
|
+
for (const scenario of fileConfig.scenarios) {
|
|
318
|
+
const parsedScenarioTests: TestCase[] = await readTests(
|
|
319
|
+
scenario.tests,
|
|
320
|
+
cmdObj.tests ? undefined : basePath,
|
|
321
|
+
);
|
|
322
|
+
scenario.tests = parsedScenarioTests;
|
|
323
|
+
}
|
|
324
|
+
}
|
|
325
|
+
|
|
313
326
|
const parsedProviderPromptMap = readProviderPromptMap(config, parsedPrompts);
|
|
314
327
|
|
|
315
328
|
if (parsedPrompts.length === 0) {
|
|
@@ -334,6 +347,7 @@ async function main() {
|
|
|
334
347
|
providers: parsedProviders,
|
|
335
348
|
providerPromptMap: parsedProviderPromptMap,
|
|
336
349
|
tests: parsedTests,
|
|
350
|
+
scenarios: config.scenarios,
|
|
337
351
|
defaultTest,
|
|
338
352
|
};
|
|
339
353
|
|
package/src/providers.ts
CHANGED
|
@@ -1,7 +1,5 @@
|
|
|
1
1
|
import path from 'path';
|
|
2
2
|
|
|
3
|
-
import { ApiProvider, ProviderConfig, ProviderId, RawProviderConfig } from './types';
|
|
4
|
-
|
|
5
3
|
import { OpenAiCompletionProvider, OpenAiChatCompletionProvider } from './providers/openai';
|
|
6
4
|
import { AnthropicCompletionProvider } from './providers/anthropic';
|
|
7
5
|
import { ReplicateProvider } from './providers/replicate';
|
|
@@ -12,17 +10,37 @@ import {
|
|
|
12
10
|
AzureOpenAiCompletionProvider,
|
|
13
11
|
} from './providers/azureopenai';
|
|
14
12
|
|
|
13
|
+
import type {
|
|
14
|
+
ApiProvider,
|
|
15
|
+
ProviderConfig,
|
|
16
|
+
ProviderFunction,
|
|
17
|
+
ProviderId,
|
|
18
|
+
RawProviderConfig,
|
|
19
|
+
} from './types';
|
|
20
|
+
|
|
15
21
|
export async function loadApiProviders(
|
|
16
|
-
providerPaths: ProviderId | ProviderId[] | RawProviderConfig[],
|
|
22
|
+
providerPaths: ProviderId | ProviderId[] | RawProviderConfig[] | ProviderFunction,
|
|
17
23
|
basePath?: string,
|
|
18
24
|
): Promise<ApiProvider[]> {
|
|
19
25
|
if (typeof providerPaths === 'string') {
|
|
20
26
|
return [await loadApiProvider(providerPaths, undefined, basePath)];
|
|
27
|
+
} else if (typeof providerPaths === 'function') {
|
|
28
|
+
return [
|
|
29
|
+
{
|
|
30
|
+
id: () => 'custom-function',
|
|
31
|
+
callApi: providerPaths,
|
|
32
|
+
},
|
|
33
|
+
];
|
|
21
34
|
} else if (Array.isArray(providerPaths)) {
|
|
22
35
|
return Promise.all(
|
|
23
|
-
providerPaths.map((provider) => {
|
|
36
|
+
providerPaths.map((provider, idx) => {
|
|
24
37
|
if (typeof provider === 'string') {
|
|
25
38
|
return loadApiProvider(provider, undefined, basePath);
|
|
39
|
+
} else if (typeof provider === 'function') {
|
|
40
|
+
return {
|
|
41
|
+
id: () => `custom-function-${idx}`,
|
|
42
|
+
callApi: provider,
|
|
43
|
+
};
|
|
26
44
|
} else {
|
|
27
45
|
const id = Object.keys(provider)[0];
|
|
28
46
|
const context = { ...provider[id], id };
|
package/src/table.ts
CHANGED
|
@@ -24,11 +24,11 @@ export function generateTable(summary: EvaluateSummary, tableCellMaxLength = 250
|
|
|
24
24
|
text = text.slice(0, tableCellMaxLength) + '...';
|
|
25
25
|
}
|
|
26
26
|
if (pass) {
|
|
27
|
-
return chalk.green
|
|
27
|
+
return chalk.green('[PASS] ') + text;
|
|
28
28
|
} else if (!pass) {
|
|
29
29
|
// color everything red up until '---'
|
|
30
30
|
return (
|
|
31
|
-
chalk.red
|
|
31
|
+
chalk.red('[FAIL] ') +
|
|
32
32
|
text
|
|
33
33
|
.split('---')
|
|
34
34
|
.map((c, idx) => (idx === 0 ? chalk.red.bold(c) : c))
|
package/src/types.ts
CHANGED
|
@@ -96,6 +96,7 @@ export interface EvaluateResult {
|
|
|
96
96
|
error?: string;
|
|
97
97
|
success: boolean;
|
|
98
98
|
score: number;
|
|
99
|
+
latencyMs: number;
|
|
99
100
|
}
|
|
100
101
|
|
|
101
102
|
export interface EvaluateTableOutput {
|
|
@@ -103,6 +104,8 @@ export interface EvaluateTableOutput {
|
|
|
103
104
|
score: number;
|
|
104
105
|
text: string;
|
|
105
106
|
prompt: string;
|
|
107
|
+
latencyMs: number;
|
|
108
|
+
tokenUsage?: Partial<TokenUsage>;
|
|
106
109
|
}
|
|
107
110
|
|
|
108
111
|
export interface EvaluateTable {
|
|
@@ -148,6 +151,7 @@ type BaseAssertionTypes =
|
|
|
148
151
|
| 'is-json'
|
|
149
152
|
| 'contains-json'
|
|
150
153
|
| 'javascript'
|
|
154
|
+
| 'python'
|
|
151
155
|
| 'similar'
|
|
152
156
|
| 'llm-rubric'
|
|
153
157
|
| 'webhook'
|
|
@@ -165,7 +169,10 @@ export interface Assertion {
|
|
|
165
169
|
type: AssertionType;
|
|
166
170
|
|
|
167
171
|
// The expected value, if applicable
|
|
168
|
-
value?:
|
|
172
|
+
value?:
|
|
173
|
+
| string
|
|
174
|
+
| string[]
|
|
175
|
+
| ((output: string, testCase: AtomicTestCase, assertion: Assertion) => Promise<GradingResult>);
|
|
169
176
|
|
|
170
177
|
// The threshold value, only applicable for similarity (cosine distance)
|
|
171
178
|
threshold?: number;
|
|
@@ -192,6 +199,17 @@ export interface TestCase {
|
|
|
192
199
|
options?: PromptConfig & OutputConfig & GradingConfig;
|
|
193
200
|
}
|
|
194
201
|
|
|
202
|
+
export interface Scenario {
|
|
203
|
+
// Optional description of what you're testing
|
|
204
|
+
description?: string;
|
|
205
|
+
|
|
206
|
+
// List of partial test case configs for this scenario
|
|
207
|
+
config: Partial<TestCase>[];
|
|
208
|
+
|
|
209
|
+
// Test cases belonging to this scenario
|
|
210
|
+
tests: TestCase[];
|
|
211
|
+
}
|
|
212
|
+
|
|
195
213
|
// Same as a TestCase, except the `vars` object has been flattened into its final form.
|
|
196
214
|
export interface AtomicTestCase extends TestCase {
|
|
197
215
|
vars?: Record<string, string | object>;
|
|
@@ -215,12 +233,17 @@ export interface TestSuite {
|
|
|
215
233
|
// Test cases
|
|
216
234
|
tests?: TestCase[];
|
|
217
235
|
|
|
236
|
+
// scenarios
|
|
237
|
+
scenarios?: Scenario[];
|
|
238
|
+
|
|
218
239
|
// Default test case config
|
|
219
240
|
defaultTest?: Partial<TestCase>;
|
|
220
241
|
}
|
|
221
242
|
|
|
222
243
|
export type ProviderId = string;
|
|
223
244
|
|
|
245
|
+
export type ProviderFunction = (prompt: string) => Promise<ProviderResponse>;
|
|
246
|
+
|
|
224
247
|
export type RawProviderConfig = Record<ProviderId, Omit<ProviderConfig, 'id'>>;
|
|
225
248
|
|
|
226
249
|
// TestSuiteConfig = Test Suite, but before everything is parsed and resolved. Providers are just strings, prompts are filepaths, tests can be filepath or inline.
|
|
@@ -229,13 +252,16 @@ export interface TestSuiteConfig {
|
|
|
229
252
|
description?: string;
|
|
230
253
|
|
|
231
254
|
// One or more LLM APIs to use, for example: openai:gpt-3.5-turbo, openai:gpt-4, localai:chat:vicuna
|
|
232
|
-
providers: ProviderId | ProviderId[] | RawProviderConfig[];
|
|
255
|
+
providers: ProviderId | ProviderId[] | RawProviderConfig[] | ProviderFunction;
|
|
233
256
|
|
|
234
257
|
// One or more prompt files to load
|
|
235
258
|
prompts: string | string[];
|
|
236
259
|
|
|
237
260
|
// Path to a test file, OR list of LLM prompt variations (aka "test case")
|
|
238
|
-
tests: string | TestCase[];
|
|
261
|
+
tests: string | string[] | TestCase[];
|
|
262
|
+
|
|
263
|
+
// Scenarios, groupings of data and tests to be evaluated
|
|
264
|
+
scenarios?: Scenario[];
|
|
239
265
|
|
|
240
266
|
// Sets the default properties for each test case. Useful for setting an assertion, on all test cases, for example.
|
|
241
267
|
defaultTest?: Omit<TestCase, 'description'>;
|
package/src/util.ts
CHANGED
|
@@ -4,6 +4,7 @@ import * as os from 'os';
|
|
|
4
4
|
|
|
5
5
|
import $RefParser from '@apidevtools/json-schema-ref-parser';
|
|
6
6
|
import fetch from 'node-fetch';
|
|
7
|
+
import invariant from 'tiny-invariant';
|
|
7
8
|
import yaml from 'js-yaml';
|
|
8
9
|
import nunjucks from 'nunjucks';
|
|
9
10
|
import { globSync } from 'glob';
|
|
@@ -44,6 +45,15 @@ export function readProviderPromptMap(
|
|
|
44
45
|
allPrompts.push(prompt.display);
|
|
45
46
|
}
|
|
46
47
|
|
|
48
|
+
invariant(
|
|
49
|
+
typeof config.providers !== 'string',
|
|
50
|
+
'In order to use a provider-prompt map, config.providers should be an array of objects, not a string',
|
|
51
|
+
);
|
|
52
|
+
invariant(
|
|
53
|
+
typeof config.providers !== 'function',
|
|
54
|
+
'In order to use a provider-prompt map, config.providers should be an array of objects, not a function',
|
|
55
|
+
);
|
|
56
|
+
|
|
47
57
|
for (const provider of config.providers) {
|
|
48
58
|
if (typeof provider === 'object') {
|
|
49
59
|
const rawProvider = provider as RawProviderConfig;
|
|
@@ -224,7 +234,31 @@ export async function fetchCsvFromGoogleSheet(url: string): Promise<string> {
|
|
|
224
234
|
return csvData;
|
|
225
235
|
}
|
|
226
236
|
|
|
227
|
-
export async function
|
|
237
|
+
export async function readVarsFiles(
|
|
238
|
+
pathOrGlobs: string | string[],
|
|
239
|
+
basePath: string = '',
|
|
240
|
+
): Promise<Record<string, string | string[] | object>> {
|
|
241
|
+
if (typeof pathOrGlobs === 'string') {
|
|
242
|
+
pathOrGlobs = [pathOrGlobs];
|
|
243
|
+
}
|
|
244
|
+
|
|
245
|
+
const ret: Record<string, string | string[] | object> = {};
|
|
246
|
+
for (const pathOrGlob of pathOrGlobs) {
|
|
247
|
+
const resolvedPath = path.resolve(basePath, pathOrGlob);
|
|
248
|
+
const paths = globSync(resolvedPath);
|
|
249
|
+
|
|
250
|
+
for (const p of paths) {
|
|
251
|
+
const yamlData = yaml.load(fs.readFileSync(p, 'utf-8'));
|
|
252
|
+
Object.assign(ret, yamlData);
|
|
253
|
+
}
|
|
254
|
+
}
|
|
255
|
+
|
|
256
|
+
return ret;
|
|
257
|
+
}
|
|
258
|
+
|
|
259
|
+
export async function readTestsFile(varsPath: string, basePath: string = ''): Promise<CsvRow[]> {
|
|
260
|
+
// This function is confusingly named - it reads a CSV, JSON, or YAML file of
|
|
261
|
+
// TESTS or test equivalents.
|
|
228
262
|
const resolvedVarsPath = path.resolve(basePath, varsPath);
|
|
229
263
|
const fileExtension = parsePath(varsPath).ext.slice(1);
|
|
230
264
|
let rows: CsvRow[] = [];
|
|
@@ -246,25 +280,53 @@ export async function readVars(varsPath: string, basePath: string = ''): Promise
|
|
|
246
280
|
}
|
|
247
281
|
|
|
248
282
|
export async function readTests(
|
|
249
|
-
tests: string | TestCase[] | undefined,
|
|
283
|
+
tests: string | string[] | TestCase[] | undefined,
|
|
250
284
|
basePath: string = '',
|
|
251
285
|
): Promise<TestCase[]> {
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
286
|
+
const ret: TestCase[] = [];
|
|
287
|
+
|
|
288
|
+
const loadTestsFromGlob = async (loadTestsGlob: string) => {
|
|
289
|
+
const resolvedPath = path.resolve(basePath, loadTestsGlob);
|
|
290
|
+
const testFiles = globSync(resolvedPath);
|
|
291
|
+
for (const testFile of testFiles) {
|
|
292
|
+
const testFileContent = yaml.load(fs.readFileSync(testFile, 'utf-8')) as TestCase[];
|
|
293
|
+
for (const testCase of testFileContent) {
|
|
294
|
+
if (typeof testCase.vars === 'string' || Array.isArray(testCase.vars)) {
|
|
295
|
+
const testcaseBasePath = path.dirname(testFile);
|
|
296
|
+
testCase.vars = await readVarsFiles(testCase.vars, testcaseBasePath);
|
|
297
|
+
}
|
|
298
|
+
}
|
|
299
|
+
ret.push(...testFileContent);
|
|
300
|
+
}
|
|
301
|
+
};
|
|
255
302
|
|
|
256
303
|
if (typeof tests === 'string') {
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
304
|
+
if (tests.endsWith('yaml') || tests.endsWith('yml')) {
|
|
305
|
+
// Load testcase config from yaml
|
|
306
|
+
await loadTestsFromGlob(tests);
|
|
307
|
+
} else {
|
|
308
|
+
// Legacy load CSV
|
|
309
|
+
const vars = await readTestsFile(tests, basePath);
|
|
310
|
+
return vars.map((row, idx) => {
|
|
311
|
+
const test = testCaseFromCsvRow(row);
|
|
312
|
+
test.description = `Row #${idx + 1}`;
|
|
313
|
+
return test;
|
|
314
|
+
});
|
|
315
|
+
}
|
|
316
|
+
} else if (Array.isArray(tests)) {
|
|
317
|
+
for (const maybeTestsGlob of tests) {
|
|
318
|
+
if (typeof maybeTestsGlob === 'string') {
|
|
319
|
+
// Assume it's a filepath
|
|
320
|
+
await loadTestsFromGlob(maybeTestsGlob);
|
|
321
|
+
} else {
|
|
322
|
+
// Assume it's a full test case
|
|
323
|
+
ret.push(maybeTestsGlob);
|
|
324
|
+
}
|
|
325
|
+
}
|
|
264
326
|
}
|
|
265
327
|
|
|
266
328
|
// Some validation of the shape of tests
|
|
267
|
-
for (const test of
|
|
329
|
+
for (const test of ret) {
|
|
268
330
|
if (!test.assert && !test.vars) {
|
|
269
331
|
throw new Error(
|
|
270
332
|
`Test case must have either "assert" or "vars" property. Instead got ${JSON.stringify(
|
|
@@ -276,7 +338,7 @@ export async function readTests(
|
|
|
276
338
|
}
|
|
277
339
|
}
|
|
278
340
|
|
|
279
|
-
return
|
|
341
|
+
return ret;
|
|
280
342
|
}
|
|
281
343
|
|
|
282
344
|
export function writeOutput(
|
|
@@ -374,7 +436,10 @@ export function getLatestResultsPath(): string {
|
|
|
374
436
|
|
|
375
437
|
export function writeLatestResults(results: EvaluateSummary, config: Partial<UnifiedConfig>) {
|
|
376
438
|
const resultsDirectory = path.join(getConfigDirectoryPath(), 'output');
|
|
377
|
-
|
|
439
|
+
|
|
440
|
+
// Replace colons with hyphens (Windows compatibility).
|
|
441
|
+
const timestamp = new Date().toISOString().replace(/:/g, '-');
|
|
442
|
+
|
|
378
443
|
const newResultsPath = path.join(resultsDirectory, `eval-${timestamp}.json`);
|
|
379
444
|
const latestResultsPath = getLatestResultsPath();
|
|
380
445
|
try {
|
|
@@ -391,7 +456,7 @@ export function writeLatestResults(results: EvaluateSummary, config: Partial<Uni
|
|
|
391
456
|
2,
|
|
392
457
|
),
|
|
393
458
|
);
|
|
394
|
-
if (fs.existsSync(latestResultsPath)) {
|
|
459
|
+
if (fs.existsSync(latestResultsPath) || fs.lstatSync(latestResultsPath).isSymbolicLink()) {
|
|
395
460
|
fs.unlinkSync(latestResultsPath);
|
|
396
461
|
}
|
|
397
462
|
fs.symlinkSync(newResultsPath, latestResultsPath);
|
|
@@ -408,7 +473,7 @@ export function listPreviousResults(): string[] {
|
|
|
408
473
|
const sortedFiles = resultsFiles.sort((a, b) => {
|
|
409
474
|
const statA = fs.statSync(path.join(directory, a));
|
|
410
475
|
const statB = fs.statSync(path.join(directory, b));
|
|
411
|
-
return
|
|
476
|
+
return statA.birthtime.getTime() - statB.birthtime.getTime(); // sort in ascending order
|
|
412
477
|
});
|
|
413
478
|
return sortedFiles;
|
|
414
479
|
}
|