promptfoo 0.12.0 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/package.json +2 -2
- package/dist/src/cache.js +1 -1
- package/dist/src/cache.js.map +1 -1
- package/dist/src/evaluator.d.ts.map +1 -1
- package/dist/src/evaluator.js +26 -25
- package/dist/src/evaluator.js.map +1 -1
- package/dist/src/index.d.ts +4 -0
- package/dist/src/index.d.ts.map +1 -1
- package/dist/src/index.js +6 -5
- package/dist/src/index.js.map +1 -1
- package/dist/src/logger.d.ts +3 -2
- package/dist/src/logger.d.ts.map +1 -1
- package/dist/src/logger.js +13 -5
- package/dist/src/logger.js.map +1 -1
- package/dist/src/main.js +5 -40
- package/dist/src/main.js.map +1 -1
- package/dist/src/onboarding.d.ts +1 -1
- package/dist/src/onboarding.d.ts.map +1 -1
- package/dist/src/onboarding.js +6 -0
- package/dist/src/onboarding.js.map +1 -1
- package/dist/src/providers/openai.d.ts +11 -3
- package/dist/src/providers/openai.d.ts.map +1 -1
- package/dist/src/providers/openai.js +17 -5
- package/dist/src/providers/openai.js.map +1 -1
- package/dist/src/providers.d.ts +3 -3
- package/dist/src/providers.d.ts.map +1 -1
- package/dist/src/providers.js +16 -7
- package/dist/src/providers.js.map +1 -1
- package/dist/src/table.d.ts +4 -0
- package/dist/src/table.d.ts.map +1 -0
- package/dist/src/table.js +48 -0
- package/dist/src/table.js.map +1 -0
- package/dist/src/types.d.ts +7 -1
- package/dist/src/types.d.ts.map +1 -1
- package/dist/src/util.d.ts.map +1 -1
- package/dist/src/util.js +9 -0
- package/dist/src/util.js.map +1 -1
- package/dist/src/web/client/assets/{index-eb6d3769.js → index-0f6d6b29.js} +13 -13
- package/dist/src/web/client/index.html +1 -1
- package/package.json +2 -2
- package/src/cache.ts +1 -1
- package/src/evaluator.ts +34 -33
- package/src/index.ts +3 -6
- package/src/logger.ts +13 -5
- package/src/main.ts +6 -41
- package/src/onboarding.ts +6 -0
- package/src/providers/openai.ts +32 -6
- package/src/providers.ts +32 -9
- package/src/table.ts +41 -0
- package/src/types.ts +10 -1
- package/src/util.ts +12 -1
- package/src/web/client/src/App.tsx +5 -1
- package/src/web/client/src/ResultsView.tsx +12 -10
- package/src/web/client/package-lock.json +0 -5726
- /package/dist/{tableOutput.html → src/tableOutput.html} +0 -0
package/dist/src/web/client/index.html
CHANGED
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
<link rel="icon" type="image/svg+xml" href="favicon.ico" />
|
|
6
6
|
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
|
7
7
|
<title>promptfoo web viewer</title>
|
|
8
|
-
<script type="module" crossorigin src="/assets/index-eb6d3769.js"></script>
|
|
8
|
+
<script type="module" crossorigin src="/assets/index-0f6d6b29.js"></script>
|
|
9
9
|
<link rel="stylesheet" href="/assets/index-87905193.css">
|
|
10
10
|
</head>
|
|
11
11
|
<body>
|
package/package.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"name": "promptfoo",
|
|
3
3
|
"description": "Prompt engineering toolkit",
|
|
4
4
|
"author": "Ian Webster",
|
|
5
|
-
"version": "0.12.0",
|
|
5
|
+
"version": "0.13.0",
|
|
6
6
|
"license": "MIT",
|
|
7
7
|
"type": "commonjs",
|
|
8
8
|
"main": "dist/src/index.js",
|
|
@@ -30,7 +30,7 @@
|
|
|
30
30
|
"build:clean": "rm -rf dist",
|
|
31
31
|
"build:client": "cd src/web/client && npm run build && cp -r dist/ ../../../dist/src/web/client",
|
|
32
32
|
"build:watch": "tsc --watch",
|
|
33
|
-
"build": "tsc && cp src/*.html dist/ && npm run build:client && chmod +x dist/src/main.js",
|
|
33
|
+
"build": "tsc && cp src/*.html dist/src && npm run build:client && chmod +x dist/src/main.js",
|
|
34
34
|
"prepare": "npm run install:client && npm run build:clean && npm run build",
|
|
35
35
|
"test": "jest",
|
|
36
36
|
"test:watch": "jest --watch",
|
package/src/cache.ts
CHANGED
|
@@ -77,7 +77,7 @@ export async function fetchJsonWithCache(
|
|
|
77
77
|
try {
|
|
78
78
|
const data = await response.json();
|
|
79
79
|
if (response.ok) {
|
|
80
|
-
logger.debug(`Storing ${url} response in cache: ${data}`);
|
|
80
|
+
logger.debug(`Storing ${url} response in cache: ${JSON.stringify(data)}`);
|
|
81
81
|
await cache.set(cacheKey, JSON.stringify(data));
|
|
82
82
|
}
|
|
83
83
|
return {
|
package/src/evaluator.ts
CHANGED
|
@@ -237,41 +237,9 @@ class Evaluator {
|
|
|
237
237
|
}
|
|
238
238
|
}
|
|
239
239
|
|
|
240
|
-
// Set up table...
|
|
241
|
-
const isTest = tests.some((t) => !!t.assert);
|
|
242
|
-
|
|
243
|
-
const table: EvaluateTable = {
|
|
244
|
-
head: {
|
|
245
|
-
prompts: prompts.map((p) => p.display),
|
|
246
|
-
vars: Array.from(varNames).sort(),
|
|
247
|
-
// TODO(ian): add assertions to table?
|
|
248
|
-
},
|
|
249
|
-
body: [],
|
|
250
|
-
};
|
|
251
|
-
|
|
252
|
-
// And progress bar...
|
|
253
|
-
let progressbar: SingleBar | undefined;
|
|
254
|
-
if (options.showProgressBar) {
|
|
255
|
-
// FIXME(ian): Add var combinations too
|
|
256
|
-
const totalNumRuns =
|
|
257
|
-
testSuite.prompts.length * testSuite.providers.length * (tests.length || 1);
|
|
258
|
-
const cliProgress = await import('cli-progress');
|
|
259
|
-
progressbar = new cliProgress.SingleBar(
|
|
260
|
-
{
|
|
261
|
-
format:
|
|
262
|
-
'Eval: [{bar}] {percentage}% | ETA: {eta}s | {value}/{total} | {provider} "{prompt}" {vars}',
|
|
263
|
-
},
|
|
264
|
-
cliProgress.Presets.shades_classic,
|
|
265
|
-
);
|
|
266
|
-
progressbar.start(totalNumRuns, 0, {
|
|
267
|
-
provider: '',
|
|
268
|
-
prompt: '',
|
|
269
|
-
vars: '',
|
|
270
|
-
});
|
|
271
|
-
}
|
|
272
|
-
|
|
273
240
|
// Set up eval cases
|
|
274
241
|
const runEvalOptions: RunEvalOptions[] = [];
|
|
242
|
+
let totalVarCombinations = 0;
|
|
275
243
|
let rowIndex = 0;
|
|
276
244
|
for (const testCase of tests) {
|
|
277
245
|
// Handle default properties
|
|
@@ -287,6 +255,7 @@ class Evaluator {
|
|
|
287
255
|
|
|
288
256
|
// Finalize test case eval
|
|
289
257
|
const varCombinations = generateVarCombinations(testCase.vars || {});
|
|
258
|
+
totalVarCombinations += varCombinations.length;
|
|
290
259
|
for (const vars of varCombinations) {
|
|
291
260
|
let colIndex = 0;
|
|
292
261
|
for (const prompt of testSuite.prompts) {
|
|
@@ -309,6 +278,38 @@ class Evaluator {
|
|
|
309
278
|
}
|
|
310
279
|
}
|
|
311
280
|
|
|
281
|
+
// Set up table...
|
|
282
|
+
const isTest = tests.some((t) => !!t.assert);
|
|
283
|
+
|
|
284
|
+
const table: EvaluateTable = {
|
|
285
|
+
head: {
|
|
286
|
+
prompts: prompts.map((p) => p.display),
|
|
287
|
+
vars: Array.from(varNames).sort(),
|
|
288
|
+
// TODO(ian): add assertions to table?
|
|
289
|
+
},
|
|
290
|
+
body: [],
|
|
291
|
+
};
|
|
292
|
+
|
|
293
|
+
// Set up progress bar...
|
|
294
|
+
let progressbar: SingleBar | undefined;
|
|
295
|
+
if (options.showProgressBar) {
|
|
296
|
+
const totalNumRuns =
|
|
297
|
+
testSuite.prompts.length * testSuite.providers.length * (totalVarCombinations || 1);
|
|
298
|
+
const cliProgress = await import('cli-progress');
|
|
299
|
+
progressbar = new cliProgress.SingleBar(
|
|
300
|
+
{
|
|
301
|
+
format:
|
|
302
|
+
'Eval: [{bar}] {percentage}% | ETA: {eta}s | {value}/{total} | {provider} "{prompt}" {vars}',
|
|
303
|
+
},
|
|
304
|
+
cliProgress.Presets.shades_classic,
|
|
305
|
+
);
|
|
306
|
+
progressbar.start(totalNumRuns, 0, {
|
|
307
|
+
provider: '',
|
|
308
|
+
prompt: '',
|
|
309
|
+
vars: '',
|
|
310
|
+
});
|
|
311
|
+
}
|
|
312
|
+
|
|
312
313
|
// Actually run the eval
|
|
313
314
|
const results: EvaluateResult[] = [];
|
|
314
315
|
await async.forEachOfLimit(
|
package/src/index.ts
CHANGED
|
@@ -4,11 +4,12 @@ import telemetry from './telemetry';
|
|
|
4
4
|
import { evaluate as doEvaluate } from './evaluator';
|
|
5
5
|
import { loadApiProviders } from './providers';
|
|
6
6
|
import { readTests } from './util';
|
|
7
|
-
|
|
8
7
|
import type { EvaluateOptions, TestSuite, TestSuiteConfig } from './types';
|
|
9
8
|
|
|
10
9
|
export * from './types';
|
|
11
10
|
|
|
11
|
+
export { generateTable } from './table';
|
|
12
|
+
|
|
12
13
|
interface EvaluateTestSuite extends TestSuiteConfig {
|
|
13
14
|
prompts: string[];
|
|
14
15
|
}
|
|
@@ -30,11 +31,7 @@ async function evaluate(testSuite: EvaluateTestSuite, options: EvaluateOptions =
|
|
|
30
31
|
return ret;
|
|
31
32
|
}
|
|
32
33
|
|
|
33
|
-
|
|
34
|
-
evaluate,
|
|
35
|
-
assertions,
|
|
36
|
-
providers,
|
|
37
|
-
};
|
|
34
|
+
export { evaluate, assertions, providers };
|
|
38
35
|
|
|
39
36
|
export default {
|
|
40
37
|
evaluate,
|
package/src/logger.ts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import chalk from 'chalk';
|
|
2
2
|
import winston from 'winston';
|
|
3
3
|
|
|
4
|
-
const
|
|
4
|
+
export const LOG_LEVELS = {
|
|
5
5
|
error: 0,
|
|
6
6
|
warn: 1,
|
|
7
7
|
info: 2,
|
|
@@ -22,13 +22,21 @@ const customFormatter = winston.format.printf(({ level, message, ...args }) => {
|
|
|
22
22
|
});
|
|
23
23
|
|
|
24
24
|
const logger = winston.createLogger({
|
|
25
|
-
levels:
|
|
25
|
+
levels: LOG_LEVELS,
|
|
26
26
|
format: winston.format.combine(winston.format.simple(), customFormatter),
|
|
27
|
-
transports: [
|
|
27
|
+
transports: [
|
|
28
|
+
new winston.transports.Console({
|
|
29
|
+
level: process.env.LOG_LEVEL || 'info',
|
|
30
|
+
}),
|
|
31
|
+
],
|
|
28
32
|
});
|
|
29
33
|
|
|
30
|
-
export function
|
|
31
|
-
|
|
34
|
+
export function getLogLevel() {
|
|
35
|
+
return logger.transports[0].level;
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
export function setLogLevel(level: keyof typeof LOG_LEVELS) {
|
|
39
|
+
if (LOG_LEVELS.hasOwnProperty(level)) {
|
|
32
40
|
logger.transports[0].level = level;
|
|
33
41
|
} else {
|
|
34
42
|
throw new Error(`Invalid log level: ${level}`);
|
package/src/main.ts
CHANGED
|
@@ -2,12 +2,11 @@
|
|
|
2
2
|
import { readFileSync, writeFileSync, existsSync, mkdirSync } from 'fs';
|
|
3
3
|
import { join as pathJoin } from 'path';
|
|
4
4
|
|
|
5
|
-
import Table from 'cli-table3';
|
|
6
5
|
import chalk from 'chalk';
|
|
7
6
|
import { Command } from 'commander';
|
|
8
7
|
|
|
9
8
|
import telemetry from './telemetry';
|
|
10
|
-
import logger, { setLogLevel } from './logger';
|
|
9
|
+
import logger, { getLogLevel, setLogLevel } from './logger';
|
|
11
10
|
import { loadApiProvider, loadApiProviders } from './providers';
|
|
12
11
|
import { evaluate } from './evaluator';
|
|
13
12
|
import {
|
|
@@ -31,6 +30,7 @@ import type {
|
|
|
31
30
|
TestSuite,
|
|
32
31
|
UnifiedConfig,
|
|
33
32
|
} from './types';
|
|
33
|
+
import { generateTable } from './table';
|
|
34
34
|
|
|
35
35
|
function createDummyFiles(directory: string | null) {
|
|
36
36
|
if (directory) {
|
|
@@ -123,10 +123,9 @@ async function main() {
|
|
|
123
123
|
.command('eval')
|
|
124
124
|
.description('Evaluate prompts')
|
|
125
125
|
.requiredOption('-p, --prompts <paths...>', 'Paths to prompt files (.txt)', config.prompts)
|
|
126
|
-
.
|
|
126
|
+
.option(
|
|
127
127
|
'-r, --providers <name or path...>',
|
|
128
128
|
'One of: openai:chat, openai:completion, openai:<model name>, or path to custom API caller module',
|
|
129
|
-
config?.providers,
|
|
130
129
|
)
|
|
131
130
|
.option(
|
|
132
131
|
'-c, --config <path>',
|
|
@@ -243,7 +242,7 @@ async function main() {
|
|
|
243
242
|
};
|
|
244
243
|
|
|
245
244
|
const options: EvaluateOptions = {
|
|
246
|
-
showProgressBar:
|
|
245
|
+
showProgressBar: getLogLevel() !== 'debug',
|
|
247
246
|
maxConcurrency: !isNaN(maxConcurrency) && maxConcurrency > 0 ? maxConcurrency : undefined,
|
|
248
247
|
...evaluateOptions,
|
|
249
248
|
};
|
|
@@ -261,43 +260,9 @@ async function main() {
|
|
|
261
260
|
if (cmdObj.output) {
|
|
262
261
|
logger.info(chalk.yellow(`Writing output to ${cmdObj.output}`));
|
|
263
262
|
writeOutput(cmdObj.output, summary);
|
|
264
|
-
} else {
|
|
263
|
+
} else if (getLogLevel() !== 'debug') {
|
|
265
264
|
// Output table by default
|
|
266
|
-
const maxWidth = process.stdout.columns ? process.stdout.columns - 10 : 120;
|
|
267
|
-
const head = summary.table.head;
|
|
268
|
-
const headLength = head.prompts.length + head.vars.length;
|
|
269
|
-
const table = new Table({
|
|
270
|
-
head: [...head.prompts, ...head.vars],
|
|
271
|
-
colWidths: Array(headLength).fill(Math.floor(maxWidth / headLength)),
|
|
272
|
-
wordWrap: true,
|
|
273
|
-
wrapOnWordBoundary: false,
|
|
274
|
-
style: {
|
|
275
|
-
head: ['blue', 'bold'],
|
|
276
|
-
},
|
|
277
|
-
});
|
|
278
|
-
// Skip first row (header) and add the rest. Color PASS/FAIL
|
|
279
|
-
for (const row of summary.table.body.slice(0, 25)) {
|
|
280
|
-
table.push([
|
|
281
|
-
...row.vars,
|
|
282
|
-
...row.outputs.map((col) => {
|
|
283
|
-
const tableCellMaxLength = parseInt(cmdObj.tableCellMaxLength || '', 10);
|
|
284
|
-
if (!isNaN(tableCellMaxLength) && col.length > tableCellMaxLength) {
|
|
285
|
-
col = col.slice(0, tableCellMaxLength) + '...';
|
|
286
|
-
}
|
|
287
|
-
if (col.startsWith('[PASS]')) {
|
|
288
|
-
// color '[PASS]' green
|
|
289
|
-
return chalk.green.bold(col.slice(0, 6)) + col.slice(6);
|
|
290
|
-
} else if (col.startsWith('[FAIL]')) {
|
|
291
|
-
// color everything red up until '---'
|
|
292
|
-
return col
|
|
293
|
-
.split('---')
|
|
294
|
-
.map((c, idx) => (idx === 0 ? chalk.red.bold(c) : c))
|
|
295
|
-
.join('---');
|
|
296
|
-
}
|
|
297
|
-
return col;
|
|
298
|
-
}),
|
|
299
|
-
]);
|
|
300
|
-
}
|
|
265
|
+
const table = generateTable(summary, parseInt(cmdObj.tableCellMaxLength || '', 10));
|
|
301
266
|
|
|
302
267
|
logger.info('\n' + table.toString());
|
|
303
268
|
if (summary.table.body.length > 25) {
|
package/src/onboarding.ts
CHANGED
|
@@ -9,6 +9,12 @@ These prompts are nunjucks templates, so you can use logic like this:
|
|
|
9
9
|
{{ var1 }}
|
|
10
10
|
{% endif %}
|
|
11
11
|
---
|
|
12
|
+
[
|
|
13
|
+
{"role": "system", "content": "Use JSON too for more complex payloads"},
|
|
14
|
+
{"role": "user", "content": "Such as multi-shot prompts"}
|
|
15
|
+
{"role": "user", "content": "Variable substitution still works: {{ var3 }}"}
|
|
16
|
+
]
|
|
17
|
+
---
|
|
12
18
|
If you prefer, you can break prompts into multiple files (make sure to edit promptfooconfig.yaml accordingly)
|
|
13
19
|
`;
|
|
14
20
|
|
package/src/providers/openai.ts
CHANGED
|
@@ -7,7 +7,13 @@ import type { ApiProvider, ProviderEmbeddingResponse, ProviderResponse } from '.
|
|
|
7
7
|
const DEFAULT_OPENAI_HOST = 'api.openai.com';
|
|
8
8
|
|
|
9
9
|
interface OpenAiCompletionOptions {
|
|
10
|
-
temperature
|
|
10
|
+
temperature?: number;
|
|
11
|
+
functions?: {
|
|
12
|
+
name: string;
|
|
13
|
+
description?: string;
|
|
14
|
+
parameters: any;
|
|
15
|
+
}[];
|
|
16
|
+
function_call?: 'none' | 'auto';
|
|
11
17
|
}
|
|
12
18
|
|
|
13
19
|
class OpenAiGenericProvider implements ApiProvider {
|
|
@@ -112,11 +118,14 @@ export class OpenAiCompletionProvider extends OpenAiGenericProvider {
|
|
|
112
118
|
'text-ada-001',
|
|
113
119
|
];
|
|
114
120
|
|
|
115
|
-
|
|
121
|
+
options: OpenAiCompletionOptions;
|
|
122
|
+
|
|
123
|
+
constructor(modelName: string, apiKey?: string, context?: OpenAiCompletionOptions) {
|
|
116
124
|
if (!OpenAiCompletionProvider.OPENAI_COMPLETION_MODELS.includes(modelName)) {
|
|
117
125
|
logger.warn(`Using unknown OpenAI completion model: ${modelName}`);
|
|
118
126
|
}
|
|
119
127
|
super(modelName, apiKey);
|
|
128
|
+
this.options = context || {};
|
|
120
129
|
}
|
|
121
130
|
|
|
122
131
|
async callApi(prompt: string, options?: OpenAiCompletionOptions): Promise<ProviderResponse> {
|
|
@@ -138,7 +147,10 @@ export class OpenAiCompletionProvider extends OpenAiGenericProvider {
|
|
|
138
147
|
model: this.modelName,
|
|
139
148
|
prompt,
|
|
140
149
|
max_tokens: parseInt(process.env.OPENAI_MAX_TOKENS || '1024'),
|
|
141
|
-
temperature:
|
|
150
|
+
temperature:
|
|
151
|
+
options?.temperature ??
|
|
152
|
+
this.options.temperature ??
|
|
153
|
+
parseFloat(process.env.OPENAI_TEMPERATURE || '0'),
|
|
142
154
|
stop,
|
|
143
155
|
};
|
|
144
156
|
logger.debug(`Calling OpenAI API: ${JSON.stringify(body)}`);
|
|
@@ -186,17 +198,22 @@ export class OpenAiChatCompletionProvider extends OpenAiGenericProvider {
|
|
|
186
198
|
static OPENAI_CHAT_MODELS = [
|
|
187
199
|
'gpt-4',
|
|
188
200
|
'gpt-4-0314',
|
|
201
|
+
'gpt-4-0613',
|
|
189
202
|
'gpt-4-32k',
|
|
190
203
|
'gpt-4-32k-0314',
|
|
191
204
|
'gpt-3.5-turbo',
|
|
192
205
|
'gpt-3.5-turbo-0301',
|
|
206
|
+
'gpt-3.5-turbo-0613',
|
|
193
207
|
];
|
|
194
208
|
|
|
195
|
-
|
|
209
|
+
options: OpenAiCompletionOptions;
|
|
210
|
+
|
|
211
|
+
constructor(modelName: string, apiKey?: string, context?: OpenAiCompletionOptions) {
|
|
196
212
|
if (!OpenAiChatCompletionProvider.OPENAI_CHAT_MODELS.includes(modelName)) {
|
|
197
213
|
logger.warn(`Using unknown OpenAI chat model: ${modelName}`);
|
|
198
214
|
}
|
|
199
215
|
super(modelName, apiKey);
|
|
216
|
+
this.options = context || {};
|
|
200
217
|
}
|
|
201
218
|
|
|
202
219
|
// TODO(ian): support passing in `messages` directly
|
|
@@ -215,11 +232,17 @@ export class OpenAiChatCompletionProvider extends OpenAiGenericProvider {
|
|
|
215
232
|
} catch (err) {
|
|
216
233
|
messages = [{ role: 'user', content: prompt }];
|
|
217
234
|
}
|
|
235
|
+
|
|
218
236
|
const body = {
|
|
219
237
|
model: this.modelName,
|
|
220
238
|
messages: messages,
|
|
221
239
|
max_tokens: parseInt(process.env.OPENAI_MAX_TOKENS || '1024'),
|
|
222
|
-
temperature:
|
|
240
|
+
temperature:
|
|
241
|
+
options?.temperature ??
|
|
242
|
+
this.options.temperature ??
|
|
243
|
+
parseFloat(process.env.OPENAI_TEMPERATURE || '0'),
|
|
244
|
+
functions: options?.functions || this.options.functions || undefined,
|
|
245
|
+
function_call: options?.function_call || this.options.function_call || undefined,
|
|
223
246
|
};
|
|
224
247
|
logger.debug(`Calling OpenAI API: ${JSON.stringify(body)}`);
|
|
225
248
|
|
|
@@ -246,8 +269,11 @@ export class OpenAiChatCompletionProvider extends OpenAiGenericProvider {
|
|
|
246
269
|
|
|
247
270
|
logger.debug(`\tOpenAI API response: ${JSON.stringify(data)}`);
|
|
248
271
|
try {
|
|
272
|
+
const message = data.choices[0].message;
|
|
273
|
+
const output =
|
|
274
|
+
message.content === null ? JSON.stringify(message.function_call) : message.content;
|
|
249
275
|
return {
|
|
250
|
-
output
|
|
276
|
+
output,
|
|
251
277
|
tokenUsage: cached
|
|
252
278
|
? { cached: data.usage.total_tokens }
|
|
253
279
|
: {
|
package/src/providers.ts
CHANGED
|
@@ -1,20 +1,35 @@
|
|
|
1
1
|
import path from 'node:path';
|
|
2
2
|
|
|
3
|
-
import { ApiProvider } from './types';
|
|
3
|
+
import { ApiProvider, ProviderConfig, ProviderId, RawProviderConfig } from './types';
|
|
4
4
|
|
|
5
5
|
import { OpenAiCompletionProvider, OpenAiChatCompletionProvider } from './providers/openai';
|
|
6
6
|
import { LocalAiCompletionProvider, LocalAiChatProvider } from './providers/localai';
|
|
7
7
|
|
|
8
|
-
export async function loadApiProviders(
|
|
8
|
+
export async function loadApiProviders(
|
|
9
|
+
providerPaths: ProviderId | ProviderId[] | RawProviderConfig[],
|
|
10
|
+
): Promise<ApiProvider[]> {
|
|
9
11
|
if (typeof providerPaths === 'string') {
|
|
10
12
|
return [await loadApiProvider(providerPaths)];
|
|
11
13
|
} else if (Array.isArray(providerPaths)) {
|
|
12
|
-
return Promise.all(
|
|
14
|
+
return Promise.all(
|
|
15
|
+
providerPaths.map((provider) => {
|
|
16
|
+
if (typeof provider === 'string') {
|
|
17
|
+
return loadApiProvider(provider);
|
|
18
|
+
} else {
|
|
19
|
+
const id = Object.keys(provider)[0];
|
|
20
|
+
const context = { ...provider[id], id };
|
|
21
|
+
return loadApiProvider(id, context);
|
|
22
|
+
}
|
|
23
|
+
}),
|
|
24
|
+
);
|
|
13
25
|
}
|
|
14
26
|
throw new Error('Invalid providers list');
|
|
15
27
|
}
|
|
16
28
|
|
|
17
|
-
export async function loadApiProvider(providerPath: string): Promise<ApiProvider> {
|
|
29
|
+
export async function loadApiProvider(
|
|
30
|
+
providerPath: string,
|
|
31
|
+
context: ProviderConfig | undefined = undefined,
|
|
32
|
+
): Promise<ApiProvider> {
|
|
18
33
|
if (providerPath?.startsWith('openai:')) {
|
|
19
34
|
// Load OpenAI module
|
|
20
35
|
const options = providerPath.split(':');
|
|
@@ -22,13 +37,21 @@ export async function loadApiProvider(providerPath: string): Promise<ApiProvider
|
|
|
22
37
|
const modelName = options[2];
|
|
23
38
|
|
|
24
39
|
if (modelType === 'chat') {
|
|
25
|
-
return new OpenAiChatCompletionProvider(
|
|
40
|
+
return new OpenAiChatCompletionProvider(
|
|
41
|
+
modelName || 'gpt-3.5-turbo',
|
|
42
|
+
undefined,
|
|
43
|
+
context?.config,
|
|
44
|
+
);
|
|
26
45
|
} else if (modelType === 'completion') {
|
|
27
|
-
return new OpenAiCompletionProvider(
|
|
46
|
+
return new OpenAiCompletionProvider(
|
|
47
|
+
modelName || 'text-davinci-003',
|
|
48
|
+
undefined,
|
|
49
|
+
context?.config,
|
|
50
|
+
);
|
|
28
51
|
} else if (OpenAiChatCompletionProvider.OPENAI_CHAT_MODELS.includes(modelType)) {
|
|
29
|
-
return new OpenAiChatCompletionProvider(modelType);
|
|
52
|
+
return new OpenAiChatCompletionProvider(modelType, undefined, context?.config);
|
|
30
53
|
} else if (OpenAiCompletionProvider.OPENAI_COMPLETION_MODELS.includes(modelType)) {
|
|
31
|
-
return new OpenAiCompletionProvider(modelType);
|
|
54
|
+
return new OpenAiCompletionProvider(modelType, undefined, context?.config);
|
|
32
55
|
} else {
|
|
33
56
|
throw new Error(
|
|
34
57
|
`Unknown OpenAI model type: ${modelType}. Use one of the following providers: openai:chat:<model name>, openai:completion:<model name>`,
|
|
@@ -52,7 +75,7 @@ export async function loadApiProvider(providerPath: string): Promise<ApiProvider
|
|
|
52
75
|
|
|
53
76
|
// Load custom module
|
|
54
77
|
const CustomApiProvider = (await import(path.join(process.cwd(), providerPath))).default;
|
|
55
|
-
return new CustomApiProvider();
|
|
78
|
+
return new CustomApiProvider(context);
|
|
56
79
|
}
|
|
57
80
|
|
|
58
81
|
export default {
|
package/src/table.ts
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import Table from 'cli-table3';
|
|
2
|
+
import chalk from 'chalk';
|
|
3
|
+
import type { EvaluateSummary } from './types';
|
|
4
|
+
|
|
5
|
+
export function generateTable(summary: EvaluateSummary, tableCellMaxLength = 250, maxRows = 25) {
|
|
6
|
+
const maxWidth = process.stdout.columns ? process.stdout.columns - 10 : 120;
|
|
7
|
+
const head = summary.table.head;
|
|
8
|
+
const headLength = head.prompts.length + head.vars.length;
|
|
9
|
+
const table = new Table({
|
|
10
|
+
head: [...head.prompts, ...head.vars],
|
|
11
|
+
colWidths: Array(headLength).fill(Math.floor(maxWidth / headLength)),
|
|
12
|
+
wordWrap: true,
|
|
13
|
+
wrapOnWordBoundary: false,
|
|
14
|
+
style: {
|
|
15
|
+
head: ['blue', 'bold'],
|
|
16
|
+
},
|
|
17
|
+
});
|
|
18
|
+
// Skip first row (header) and add the rest. Color PASS/FAIL
|
|
19
|
+
for (const row of summary.table.body.slice(0, maxRows)) {
|
|
20
|
+
table.push([
|
|
21
|
+
...row.vars,
|
|
22
|
+
...row.outputs.map((col) => {
|
|
23
|
+
if (col.length > tableCellMaxLength) {
|
|
24
|
+
col = col.slice(0, tableCellMaxLength) + '...';
|
|
25
|
+
}
|
|
26
|
+
if (col.startsWith('[PASS]')) {
|
|
27
|
+
// color '[PASS]' green
|
|
28
|
+
return chalk.green.bold(col.slice(0, 6)) + col.slice(6);
|
|
29
|
+
} else if (col.startsWith('[FAIL]')) {
|
|
30
|
+
// color everything red up until '---'
|
|
31
|
+
return col
|
|
32
|
+
.split('---')
|
|
33
|
+
.map((c, idx) => (idx === 0 ? chalk.red.bold(c) : c))
|
|
34
|
+
.join('---');
|
|
35
|
+
}
|
|
36
|
+
return col;
|
|
37
|
+
}),
|
|
38
|
+
]);
|
|
39
|
+
}
|
|
40
|
+
return table;
|
|
41
|
+
}
|
package/src/types.ts
CHANGED
|
@@ -23,6 +23,11 @@ export interface CommandLineOptions {
|
|
|
23
23
|
promptSuffix?: string;
|
|
24
24
|
}
|
|
25
25
|
|
|
26
|
+
export interface ProviderConfig {
|
|
27
|
+
id: ProviderId;
|
|
28
|
+
config?: any;
|
|
29
|
+
}
|
|
30
|
+
|
|
26
31
|
export interface ApiProvider {
|
|
27
32
|
id: () => string;
|
|
28
33
|
callApi: (prompt: string) => Promise<ProviderResponse>;
|
|
@@ -187,13 +192,17 @@ export interface TestSuite {
|
|
|
187
192
|
defaultTest?: Partial<TestCase>;
|
|
188
193
|
}
|
|
189
194
|
|
|
195
|
+
export type ProviderId = string;
|
|
196
|
+
|
|
197
|
+
export type RawProviderConfig = Record<ProviderId, Omit<ProviderConfig, 'id'>>;
|
|
198
|
+
|
|
190
199
|
// TestSuiteConfig = Test Suite, but before everything is parsed and resolved. Providers are just strings, prompts are filepaths, tests can be filepath or inline.
|
|
191
200
|
export interface TestSuiteConfig {
|
|
192
201
|
// Optional description of what your LLM is trying to do
|
|
193
202
|
description?: string;
|
|
194
203
|
|
|
195
204
|
// One or more LLM APIs to use, for example: openai:gpt-3.5-turbo, openai:gpt-4, localai:chat:vicuna
|
|
196
|
-
providers:
|
|
205
|
+
providers: ProviderId | ProviderId[] | RawProviderConfig[];
|
|
197
206
|
|
|
198
207
|
// One or more prompt files to load
|
|
199
208
|
prompts: string | string[];
|
package/src/util.ts
CHANGED
|
@@ -96,11 +96,22 @@ export function readPrompts(
|
|
|
96
96
|
promptContents.push(...fileContents.map((content) => ({ raw: content, display: content })));
|
|
97
97
|
} else {
|
|
98
98
|
const fileContent = fs.readFileSync(promptPath, 'utf-8');
|
|
99
|
-
|
|
99
|
+
|
|
100
|
+
let display: string | undefined;
|
|
100
101
|
if (inputType === PromptInputType.NAMED) {
|
|
101
102
|
display = (promptPathOrGlobs as Record<string, string>)[promptPath];
|
|
102
103
|
} else {
|
|
103
104
|
display = fileContent.length > 200 ? promptPath : fileContent;
|
|
105
|
+
|
|
106
|
+
const ext = path.parse(promptPath).ext;
|
|
107
|
+
if (ext === '.jsonl') {
|
|
108
|
+
// Special case for JSONL file
|
|
109
|
+
const jsonLines = fileContent.split(/\r?\n/).filter((line) => line.length > 0);
|
|
110
|
+
for (const json of jsonLines) {
|
|
111
|
+
promptContents.push({ raw: json, display: json });
|
|
112
|
+
}
|
|
113
|
+
continue;
|
|
114
|
+
}
|
|
104
115
|
}
|
|
105
116
|
promptContents.push({ raw: fileContent, display });
|
|
106
117
|
}
|
package/src/web/client/src/App.tsx
CHANGED
|
@@ -45,7 +45,11 @@ function App() {
|
|
|
45
45
|
loadedFromApi.current = true;
|
|
46
46
|
const response = await fetch(`https://api.promptfoo.dev/eval/${id}`);
|
|
47
47
|
const body = await response.json();
|
|
48
|
-
setTable(
|
|
48
|
+
setTable(
|
|
49
|
+
body.data.results?.table ||
|
|
50
|
+
// Backwards compatibility with <= 0.12.0
|
|
51
|
+
body.data.table,
|
|
52
|
+
);
|
|
49
53
|
setConfig(body.data.config);
|
|
50
54
|
setLoaded(true);
|
|
51
55
|
};
|
package/src/web/client/src/ResultsView.tsx
CHANGED
|
@@ -38,7 +38,7 @@ const ResponsiveStack = styled(Stack)(({ theme }) => ({
|
|
|
38
38
|
}));
|
|
39
39
|
|
|
40
40
|
export default function ResultsView() {
|
|
41
|
-
const { table } = useStore();
|
|
41
|
+
const { table, config } = useStore();
|
|
42
42
|
const [maxTextLength, setMaxTextLength] = React.useState(250);
|
|
43
43
|
const [columnVisibility, setColumnVisibility] = React.useState<VisibilityState>({});
|
|
44
44
|
const [selectedColumns, setSelectedColumns] = React.useState<string[]>([]);
|
|
@@ -204,15 +204,17 @@ export default function ResultsView() {
|
|
|
204
204
|
<Box flexGrow={1} />
|
|
205
205
|
<Box display="flex" justifyContent="flex-end">
|
|
206
206
|
<ResponsiveStack direction="row" spacing={2}>
|
|
207
|
-
|
|
208
|
-
<
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
207
|
+
{config && (
|
|
208
|
+
<Tooltip title="View config">
|
|
209
|
+
<Button
|
|
210
|
+
color="primary"
|
|
211
|
+
onClick={() => setConfigModalOpen(true)}
|
|
212
|
+
startIcon={<VisibilityIcon />}
|
|
213
|
+
>
|
|
214
|
+
Config
|
|
215
|
+
</Button>
|
|
216
|
+
</Tooltip>
|
|
217
|
+
)}
|
|
216
218
|
<Tooltip title="Generate a unique URL that others can access">
|
|
217
219
|
<Button
|
|
218
220
|
color="primary"
|