promptfoo 0.17.9 → 0.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/package.json +1 -1
- package/dist/src/assertions.d.ts.map +1 -1
- package/dist/src/assertions.js +97 -42
- package/dist/src/assertions.js.map +1 -1
- package/dist/src/evaluator.d.ts.map +1 -1
- package/dist/src/evaluator.js +35 -7
- package/dist/src/evaluator.js.map +1 -1
- package/dist/src/index.d.ts.map +1 -1
- package/dist/src/index.js +3 -0
- package/dist/src/index.js.map +1 -1
- package/dist/src/main.js +9 -0
- package/dist/src/main.js.map +1 -1
- package/dist/src/providers.d.ts +2 -2
- package/dist/src/providers.d.ts.map +1 -1
- package/dist/src/providers.js +15 -1
- package/dist/src/providers.js.map +1 -1
- package/dist/src/table.js +2 -2
- package/dist/src/table.js.map +1 -1
- package/dist/src/types.d.ts +11 -4
- package/dist/src/types.d.ts.map +1 -1
- package/dist/src/util.d.ts.map +1 -1
- package/dist/src/util.js +5 -2
- package/dist/src/util.js.map +1 -1
- package/package.json +1 -1
- package/src/assertions.ts +102 -49
- package/src/evaluator.ts +33 -4
- package/src/index.ts +6 -1
- package/src/main.ts +14 -0
- package/src/providers.ts +22 -4
- package/src/table.ts +2 -2
- package/src/types.ts +25 -5
- package/src/util.ts +12 -2
package/src/index.ts
CHANGED
|
@@ -3,7 +3,7 @@ import providers from './providers';
|
|
|
3
3
|
import telemetry from './telemetry';
|
|
4
4
|
import { evaluate as doEvaluate } from './evaluator';
|
|
5
5
|
import { loadApiProviders } from './providers';
|
|
6
|
-
import { readTests } from './util';
|
|
6
|
+
import { readTests, writeOutput } from './util';
|
|
7
7
|
import type { EvaluateOptions, TestSuite, TestSuiteConfig } from './types';
|
|
8
8
|
|
|
9
9
|
export * from './types';
|
|
@@ -28,6 +28,11 @@ async function evaluate(testSuite: EvaluateTestSuite, options: EvaluateOptions =
|
|
|
28
28
|
};
|
|
29
29
|
telemetry.maybeShowNotice();
|
|
30
30
|
const ret = await doEvaluate(constructedTestSuite, options);
|
|
31
|
+
|
|
32
|
+
if (testSuite.outputPath) {
|
|
33
|
+
writeOutput(testSuite.outputPath, ret, testSuite, null);
|
|
34
|
+
}
|
|
35
|
+
|
|
31
36
|
await telemetry.send();
|
|
32
37
|
return ret;
|
|
33
38
|
}
|
package/src/main.ts
CHANGED
|
@@ -281,6 +281,7 @@ async function main() {
|
|
|
281
281
|
prompts: cmdObj.prompts || fileConfig.prompts || defaultConfig.prompts,
|
|
282
282
|
providers: cmdObj.providers || fileConfig.providers || defaultConfig.providers,
|
|
283
283
|
tests: cmdObj.tests || cmdObj.vars || fileConfig.tests || defaultConfig.tests,
|
|
284
|
+
scenarios: fileConfig.scenarios || defaultConfig.scenarios,
|
|
284
285
|
sharing:
|
|
285
286
|
process.env.PROMPTFOO_DISABLE_SHARING === '1'
|
|
286
287
|
? false
|
|
@@ -310,6 +311,18 @@ async function main() {
|
|
|
310
311
|
config.tests,
|
|
311
312
|
cmdObj.tests ? undefined : basePath,
|
|
312
313
|
);
|
|
314
|
+
|
|
315
|
+
// Parse the test cases for each scenario
|
|
316
|
+
if (fileConfig.scenarios) {
|
|
317
|
+
for (const scenario of fileConfig.scenarios) {
|
|
318
|
+
const parsedScenarioTests: TestCase[] = await readTests(
|
|
319
|
+
scenario.tests,
|
|
320
|
+
cmdObj.tests ? undefined : basePath,
|
|
321
|
+
);
|
|
322
|
+
scenario.tests = parsedScenarioTests;
|
|
323
|
+
}
|
|
324
|
+
}
|
|
325
|
+
|
|
313
326
|
const parsedProviderPromptMap = readProviderPromptMap(config, parsedPrompts);
|
|
314
327
|
|
|
315
328
|
if (parsedPrompts.length === 0) {
|
|
@@ -334,6 +347,7 @@ async function main() {
|
|
|
334
347
|
providers: parsedProviders,
|
|
335
348
|
providerPromptMap: parsedProviderPromptMap,
|
|
336
349
|
tests: parsedTests,
|
|
350
|
+
scenarios: config.scenarios,
|
|
337
351
|
defaultTest,
|
|
338
352
|
};
|
|
339
353
|
|
package/src/providers.ts
CHANGED
|
@@ -1,7 +1,5 @@
|
|
|
1
1
|
import path from 'path';
|
|
2
2
|
|
|
3
|
-
import { ApiProvider, ProviderConfig, ProviderId, RawProviderConfig } from './types';
|
|
4
|
-
|
|
5
3
|
import { OpenAiCompletionProvider, OpenAiChatCompletionProvider } from './providers/openai';
|
|
6
4
|
import { AnthropicCompletionProvider } from './providers/anthropic';
|
|
7
5
|
import { ReplicateProvider } from './providers/replicate';
|
|
@@ -12,17 +10,37 @@ import {
|
|
|
12
10
|
AzureOpenAiCompletionProvider,
|
|
13
11
|
} from './providers/azureopenai';
|
|
14
12
|
|
|
13
|
+
import type {
|
|
14
|
+
ApiProvider,
|
|
15
|
+
ProviderConfig,
|
|
16
|
+
ProviderFunction,
|
|
17
|
+
ProviderId,
|
|
18
|
+
RawProviderConfig,
|
|
19
|
+
} from './types';
|
|
20
|
+
|
|
15
21
|
export async function loadApiProviders(
|
|
16
|
-
providerPaths: ProviderId | ProviderId[] | RawProviderConfig[],
|
|
22
|
+
providerPaths: ProviderId | ProviderId[] | RawProviderConfig[] | ProviderFunction,
|
|
17
23
|
basePath?: string,
|
|
18
24
|
): Promise<ApiProvider[]> {
|
|
19
25
|
if (typeof providerPaths === 'string') {
|
|
20
26
|
return [await loadApiProvider(providerPaths, undefined, basePath)];
|
|
27
|
+
} else if (typeof providerPaths === 'function') {
|
|
28
|
+
return [
|
|
29
|
+
{
|
|
30
|
+
id: () => 'custom-function',
|
|
31
|
+
callApi: providerPaths,
|
|
32
|
+
},
|
|
33
|
+
];
|
|
21
34
|
} else if (Array.isArray(providerPaths)) {
|
|
22
35
|
return Promise.all(
|
|
23
|
-
providerPaths.map((provider) => {
|
|
36
|
+
providerPaths.map((provider, idx) => {
|
|
24
37
|
if (typeof provider === 'string') {
|
|
25
38
|
return loadApiProvider(provider, undefined, basePath);
|
|
39
|
+
} else if (typeof provider === 'function') {
|
|
40
|
+
return {
|
|
41
|
+
id: () => `custom-function-${idx}`,
|
|
42
|
+
callApi: provider,
|
|
43
|
+
};
|
|
26
44
|
} else {
|
|
27
45
|
const id = Object.keys(provider)[0];
|
|
28
46
|
const context = { ...provider[id], id };
|
package/src/table.ts
CHANGED
|
@@ -24,11 +24,11 @@ export function generateTable(summary: EvaluateSummary, tableCellMaxLength = 250
|
|
|
24
24
|
text = text.slice(0, tableCellMaxLength) + '...';
|
|
25
25
|
}
|
|
26
26
|
if (pass) {
|
|
27
|
-
return chalk.green
|
|
27
|
+
return chalk.green('[PASS] ') + text;
|
|
28
28
|
} else if (!pass) {
|
|
29
29
|
// color everything red up until '---'
|
|
30
30
|
return (
|
|
31
|
-
chalk.red
|
|
31
|
+
chalk.red('[FAIL] ') +
|
|
32
32
|
text
|
|
33
33
|
.split('---')
|
|
34
34
|
.map((c, idx) => (idx === 0 ? chalk.red.bold(c) : c))
|
package/src/types.ts
CHANGED
|
@@ -151,6 +151,7 @@ type BaseAssertionTypes =
|
|
|
151
151
|
| 'is-json'
|
|
152
152
|
| 'contains-json'
|
|
153
153
|
| 'javascript'
|
|
154
|
+
| 'python'
|
|
154
155
|
| 'similar'
|
|
155
156
|
| 'llm-rubric'
|
|
156
157
|
| 'webhook'
|
|
@@ -168,7 +169,10 @@ export interface Assertion {
|
|
|
168
169
|
type: AssertionType;
|
|
169
170
|
|
|
170
171
|
// The expected value, if applicable
|
|
171
|
-
value?:
|
|
172
|
+
value?:
|
|
173
|
+
| string
|
|
174
|
+
| string[]
|
|
175
|
+
| ((output: string, testCase: AtomicTestCase, assertion: Assertion) => Promise<GradingResult>);
|
|
172
176
|
|
|
173
177
|
// The threshold value, only applicable for similarity (cosine distance)
|
|
174
178
|
threshold?: number;
|
|
@@ -188,9 +192,6 @@ export interface TestCase {
|
|
|
188
192
|
// Key-value pairs to substitute in the prompt
|
|
189
193
|
vars?: Record<string, string | string[] | object>;
|
|
190
194
|
|
|
191
|
-
// Optional filepath or glob pattern to load vars from
|
|
192
|
-
loadVars?: string | string[];
|
|
193
|
-
|
|
194
195
|
// Optional list of automatic checks to run on the LLM output
|
|
195
196
|
assert?: Assertion[];
|
|
196
197
|
|
|
@@ -198,6 +199,17 @@ export interface TestCase {
|
|
|
198
199
|
options?: PromptConfig & OutputConfig & GradingConfig;
|
|
199
200
|
}
|
|
200
201
|
|
|
202
|
+
export interface Scenario {
|
|
203
|
+
// Optional description of what you're testing
|
|
204
|
+
description?: string;
|
|
205
|
+
|
|
206
|
+
// One or more partial test case configs (data groupings) for this scenario
|
|
207
|
+
config: Partial<TestCase>[];
|
|
208
|
+
|
|
209
|
+
// Test cases to evaluate in this scenario (required)
|
|
210
|
+
tests: TestCase[];
|
|
211
|
+
}
|
|
212
|
+
|
|
201
213
|
// Same as a TestCase, except the `vars` object has been flattened into its final form.
|
|
202
214
|
export interface AtomicTestCase extends TestCase {
|
|
203
215
|
vars?: Record<string, string | object>;
|
|
@@ -221,12 +233,17 @@ export interface TestSuite {
|
|
|
221
233
|
// Test cases
|
|
222
234
|
tests?: TestCase[];
|
|
223
235
|
|
|
236
|
+
// Scenarios: groupings of data and tests to be evaluated
|
|
237
|
+
scenarios?: Scenario[];
|
|
238
|
+
|
|
224
239
|
// Default test case config
|
|
225
240
|
defaultTest?: Partial<TestCase>;
|
|
226
241
|
}
|
|
227
242
|
|
|
228
243
|
export type ProviderId = string;
|
|
229
244
|
|
|
245
|
+
export type ProviderFunction = (prompt: string) => Promise<ProviderResponse>;
|
|
246
|
+
|
|
230
247
|
export type RawProviderConfig = Record<ProviderId, Omit<ProviderConfig, 'id'>>;
|
|
231
248
|
|
|
232
249
|
// TestSuiteConfig = Test Suite, but before everything is parsed and resolved. Providers are just strings, prompts are filepaths, tests can be filepath or inline.
|
|
@@ -235,7 +252,7 @@ export interface TestSuiteConfig {
|
|
|
235
252
|
description?: string;
|
|
236
253
|
|
|
237
254
|
// One or more LLM APIs to use, for example: openai:gpt-3.5-turbo, openai:gpt-4, localai:chat:vicuna
|
|
238
|
-
providers: ProviderId | ProviderId[] | RawProviderConfig[];
|
|
255
|
+
providers: ProviderId | ProviderId[] | RawProviderConfig[] | ProviderFunction;
|
|
239
256
|
|
|
240
257
|
// One or more prompt files to load
|
|
241
258
|
prompts: string | string[];
|
|
@@ -243,6 +260,9 @@ export interface TestSuiteConfig {
|
|
|
243
260
|
// Path to a test file, OR list of LLM prompt variations (aka "test case")
|
|
244
261
|
tests: string | string[] | TestCase[];
|
|
245
262
|
|
|
263
|
+
// Scenarios, groupings of data and tests to be evaluated
|
|
264
|
+
scenarios?: Scenario[];
|
|
265
|
+
|
|
246
266
|
// Sets the default properties for each test case. Useful for setting an assertion, on all test cases, for example.
|
|
247
267
|
defaultTest?: Omit<TestCase, 'description'>;
|
|
248
268
|
|
package/src/util.ts
CHANGED
|
@@ -4,6 +4,7 @@ import * as os from 'os';
|
|
|
4
4
|
|
|
5
5
|
import $RefParser from '@apidevtools/json-schema-ref-parser';
|
|
6
6
|
import fetch from 'node-fetch';
|
|
7
|
+
import invariant from 'tiny-invariant';
|
|
7
8
|
import yaml from 'js-yaml';
|
|
8
9
|
import nunjucks from 'nunjucks';
|
|
9
10
|
import { globSync } from 'glob';
|
|
@@ -44,6 +45,15 @@ export function readProviderPromptMap(
|
|
|
44
45
|
allPrompts.push(prompt.display);
|
|
45
46
|
}
|
|
46
47
|
|
|
48
|
+
invariant(
|
|
49
|
+
typeof config.providers !== 'string',
|
|
50
|
+
'In order to use a provider-prompt map, config.providers should be an array of objects, not a string',
|
|
51
|
+
);
|
|
52
|
+
invariant(
|
|
53
|
+
typeof config.providers !== 'function',
|
|
54
|
+
'In order to use a provider-prompt map, config.providers should be an array of objects, not a function',
|
|
55
|
+
);
|
|
56
|
+
|
|
47
57
|
for (const provider of config.providers) {
|
|
48
58
|
if (typeof provider === 'object') {
|
|
49
59
|
const rawProvider = provider as RawProviderConfig;
|
|
@@ -446,7 +456,7 @@ export function writeLatestResults(results: EvaluateSummary, config: Partial<Uni
|
|
|
446
456
|
2,
|
|
447
457
|
),
|
|
448
458
|
);
|
|
449
|
-
if (fs.existsSync(latestResultsPath)) {
|
|
459
|
+
if (fs.existsSync(latestResultsPath) || fs.lstatSync(latestResultsPath).isSymbolicLink()) {
|
|
450
460
|
fs.unlinkSync(latestResultsPath);
|
|
451
461
|
}
|
|
452
462
|
fs.symlinkSync(newResultsPath, latestResultsPath);
|
|
@@ -463,7 +473,7 @@ export function listPreviousResults(): string[] {
|
|
|
463
473
|
const sortedFiles = resultsFiles.sort((a, b) => {
|
|
464
474
|
const statA = fs.statSync(path.join(directory, a));
|
|
465
475
|
const statB = fs.statSync(path.join(directory, b));
|
|
466
|
-
return
|
|
476
|
+
return statA.birthtime.getTime() - statB.birthtime.getTime(); // sort in ascending order
|
|
467
477
|
});
|
|
468
478
|
return sortedFiles;
|
|
469
479
|
}
|