promptfoo 0.8.3 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -5
- package/dist/assertions.d.ts +3 -3
- package/dist/assertions.d.ts.map +1 -1
- package/dist/assertions.js +11 -12
- package/dist/assertions.js.map +1 -1
- package/dist/cache.d.ts.map +1 -1
- package/dist/cache.js +9 -9
- package/dist/cache.js.map +1 -1
- package/dist/evaluator.d.ts +1 -1
- package/dist/evaluator.d.ts.map +1 -1
- package/dist/evaluator.js +60 -34
- package/dist/evaluator.js.map +1 -1
- package/dist/index.d.ts +10 -10
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +18 -14
- package/dist/index.js.map +1 -1
- package/dist/main.js +41 -40
- package/dist/main.js.map +1 -1
- package/dist/providers/localai.js +11 -11
- package/dist/providers/localai.js.map +1 -1
- package/dist/providers/openai.d.ts.map +1 -1
- package/dist/providers/openai.js +30 -21
- package/dist/providers/openai.js.map +1 -1
- package/dist/providers.d.ts +3 -3
- package/dist/providers.d.ts.map +1 -1
- package/dist/providers.js +15 -15
- package/dist/providers.js.map +1 -1
- package/dist/types.d.ts +5 -2
- package/dist/types.d.ts.map +1 -1
- package/dist/util.d.ts +2 -2
- package/dist/util.d.ts.map +1 -1
- package/dist/util.js +43 -15
- package/dist/util.js.map +1 -1
- package/dist/web/client/assets/index-9a9ba400.css +1 -0
- package/dist/web/client/assets/{index-8751749f.js → index-b72d3ca9.js} +12 -12
- package/dist/web/client/index.html +2 -2
- package/dist/web/server.js +9 -9
- package/dist/web/server.js.map +1 -1
- package/package.json +3 -1
- package/src/assertions.ts +8 -9
- package/src/cache.ts +5 -4
- package/src/evaluator.ts +66 -33
- package/src/index.ts +13 -8
- package/src/main.ts +13 -18
- package/src/providers/localai.ts +3 -3
- package/src/providers/openai.ts +16 -8
- package/src/providers.ts +3 -3
- package/src/types.ts +7 -2
- package/src/util.ts +42 -20
- package/src/web/client/package-lock.json +5729 -0
- package/src/web/client/src/ResultsTable.css +19 -0
- package/src/web/client/src/ResultsTable.tsx +51 -37
- package/src/web/client/src/ResultsView.tsx +7 -7
- package/src/web/server.ts +3 -3
- package/dist/web/client/assets/index-207192fc.css +0 -1
|
@@ -5,8 +5,8 @@
|
|
|
5
5
|
<link rel="icon" type="image/svg+xml" href="favicon.ico" />
|
|
6
6
|
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
|
7
7
|
<title>promptfoo web viewer</title>
|
|
8
|
-
<script type="module" crossorigin src="/assets/index-8751749f.js"></script>
|
|
9
|
-
<link rel="stylesheet" href="/assets/index-207192fc.css">
|
|
8
|
+
<script type="module" crossorigin src="/assets/index-b72d3ca9.js"></script>
|
|
9
|
+
<link rel="stylesheet" href="/assets/index-9a9ba400.css">
|
|
10
10
|
</head>
|
|
11
11
|
<body>
|
|
12
12
|
<div id="root"></div>
|
package/dist/web/server.js
CHANGED
|
@@ -13,12 +13,12 @@ const express_1 = __importDefault(require("express"));
|
|
|
13
13
|
const cors_1 = __importDefault(require("cors"));
|
|
14
14
|
const opener_1 = __importDefault(require("opener"));
|
|
15
15
|
const socket_io_1 = require("socket.io");
|
|
16
|
-
const
|
|
17
|
-
const
|
|
18
|
-
const
|
|
16
|
+
const logger_1 = __importDefault(require("../logger"));
|
|
17
|
+
const esm_1 = require("../esm");
|
|
18
|
+
const util_1 = require("../util");
|
|
19
19
|
function init(port = 15500) {
|
|
20
20
|
const app = (0, express_1.default)();
|
|
21
|
-
const staticDir = node_path_1.default.join((0,
|
|
21
|
+
const staticDir = node_path_1.default.join((0, esm_1.getDirectory)(), 'web', 'client');
|
|
22
22
|
app.use((0, cors_1.default)());
|
|
23
23
|
app.use(express_1.default.json());
|
|
24
24
|
app.use(express_1.default.static(staticDir));
|
|
@@ -28,7 +28,7 @@ function init(port = 15500) {
|
|
|
28
28
|
origin: '*',
|
|
29
29
|
},
|
|
30
30
|
});
|
|
31
|
-
const latestJsonPath = (0,
|
|
31
|
+
const latestJsonPath = (0, util_1.getLatestResultsPath)();
|
|
32
32
|
const readLatestJson = () => {
|
|
33
33
|
const data = fs_1.default.readFileSync(latestJsonPath, 'utf8');
|
|
34
34
|
const jsonData = JSON.parse(data);
|
|
@@ -46,7 +46,7 @@ function init(port = 15500) {
|
|
|
46
46
|
});
|
|
47
47
|
httpServer.listen(port, () => {
|
|
48
48
|
const url = `http://localhost:${port}`;
|
|
49
|
-
|
|
49
|
+
logger_1.default.info(`Server listening at ${url}`);
|
|
50
50
|
const rl = node_readline_1.default.createInterface({
|
|
51
51
|
input: process.stdin,
|
|
52
52
|
output: process.stdout,
|
|
@@ -55,14 +55,14 @@ function init(port = 15500) {
|
|
|
55
55
|
if (answer.toLowerCase().startsWith('y')) {
|
|
56
56
|
try {
|
|
57
57
|
await (0, opener_1.default)(url);
|
|
58
|
-
|
|
58
|
+
logger_1.default.info(`Opening browser to: ${url}`);
|
|
59
59
|
}
|
|
60
60
|
catch (err) {
|
|
61
|
-
|
|
61
|
+
logger_1.default.error(`Failed to open browser: ${String(err)}`);
|
|
62
62
|
}
|
|
63
63
|
}
|
|
64
64
|
rl.close();
|
|
65
|
-
|
|
65
|
+
logger_1.default.info('Press Ctrl+C to stop the server');
|
|
66
66
|
});
|
|
67
67
|
});
|
|
68
68
|
}
|
package/dist/web/server.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"server.js","sourceRoot":"","sources":["../../src/web/server.ts"],"names":[],"mappings":";;;;;;AAAA,4CAAoB;AACpB,0DAA6B;AAC7B,kEAAqC;AACrC,0DAA6B;AAE7B,wDAAgC;AAChC,sDAA8B;AAC9B,gDAAwB;AACxB,oDAA4B;AAC5B,yCAAqD;AAGrD,
|
|
1
|
+
{"version":3,"file":"server.js","sourceRoot":"","sources":["../../src/web/server.ts"],"names":[],"mappings":";;;;;;AAAA,4CAAoB;AACpB,0DAA6B;AAC7B,kEAAqC;AACrC,0DAA6B;AAE7B,wDAAgC;AAChC,sDAA8B;AAC9B,gDAAwB;AACxB,oDAA4B;AAC5B,yCAAqD;AAGrD,uDAA+B;AAC/B,gCAAsC;AACtC,kCAA+C;AAI/C,SAAgB,IAAI,CAAC,IAAI,GAAG,KAAK;IAC/B,MAAM,GAAG,GAAG,IAAA,iBAAO,GAAE,CAAC;IAEtB,MAAM,SAAS,GAAG,mBAAI,CAAC,IAAI,CAAC,IAAA,kBAAY,GAAE,EAAE,KAAK,EAAE,QAAQ,CAAC,CAAC;IAE7D,GAAG,CAAC,GAAG,CAAC,IAAA,cAAI,GAAE,CAAC,CAAC;IAChB,GAAG,CAAC,GAAG,CAAC,iBAAO,CAAC,IAAI,EAAE,CAAC,CAAC;IACxB,GAAG,CAAC,GAAG,CAAC,iBAAO,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,CAAC;IAEnC,MAAM,UAAU,GAAG,mBAAI,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;IAC1C,MAAM,EAAE,GAAG,IAAI,kBAAc,CAAC,UAAU,EAAE;QACxC,IAAI,EAAE;YACJ,MAAM,EAAE,GAAG;SACZ;KACF,CAAC,CAAC;IAEH,MAAM,cAAc,GAAG,IAAA,2BAAoB,GAAE,CAAC;IAC9C,MAAM,cAAc,GAAG,GAAG,EAAE;QAC1B,MAAM,IAAI,GAAG,YAAE,CAAC,YAAY,CAAC,cAAc,EAAE,MAAM,CAAC,CAAC;QACrD,MAAM,QAAQ,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAClC,OAAO,QAAQ,CAAC,KAAK,CAAC;IACxB,CAAC,CAAC;IAEF,EAAE,CAAC,EAAE,CAAC,YAAY,EAAE,CAAC,MAAM,EAAE,EAAE;QAC7B,qDAAqD;QACrD,MAAM,CAAC,IAAI,CAAC,MAAM,EAAE,EAAE,KAAK,EAAE,cAAc,EAAE,EAAE,CAAC,CAAC;QAEjD,6DAA6D;QAC7D,YAAE,CAAC,KAAK,CACN,cAAc,EACd,IAAA,kBAAQ,EAAC,CAAC,KAAa,EAAE,EAAE;YACzB,IAAI,KAAK,KAAK,QAAQ,EAAE;gBACtB,MAAM,CAAC,IAAI,CAAC,QAAQ,EAAE,EAAE,KAAK,EAAE,cAAc,EAAE,EAAE,CAAC,CAAC;aACpD;QACH,CAAC,EAAE,GAAG,CAAC,CACR,CAAC;IACJ,CAAC,CAAC,CAAC;IAEH,UAAU,CAAC,MAAM,CAAC,IAAI,EAAE,GAAG,EAAE;QAC3B,MAAM,GAAG,GAAG,oBAAoB,IAAI,EAAE,CAAC;QACvC,gBAAM,CAAC,IAAI,CAAC,uBAAuB,GAAG,EAAE,CAAC,CAAC;QAE1C,MAAM,EAAE,GAAG,uBAAQ,CAAC,eAAe,CAAC;YAClC,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,MAAM,EAAE,OAAO,CAAC,MAAM;SACvB,CAAC,CAAC;QACH,EAAE,CAAC,QAAQ,CAAC,qDAAqD,EAAE,KAAK,EAAE,MAAM,EAAE,EAAE;YAClF,IAAI,MAAM,CAAC,WAAW,EAAE,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE;gBACxC,IAAI;oBACF,MAAM,IAAA,gBAAM,EAAC,GAAG,CAAC,CAAC;oBAClB,gBAAM,CAAC,IAAI,CAAC,uBAAuB,GAAG,EAAE,CAAC,CAAC;iBAC3C;gBAAC,OAAO,GAAG,EAAE;oBACZ,gBAAM,CAAC,KAAK,CAAC,2BAA2B,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CA
AC;iBACxD;aACF;YACD,EAAE,CAAC,KAAK,EAAE,CAAC;YACX,gBAAM,CAAC,IAAI,CAAC,iCAAiC,CAAC,CAAC;QACjD,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC;AA3DD,oBA2DC"}
|
package/package.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"name": "promptfoo",
|
|
3
3
|
"description": "Prompt engineering toolkit",
|
|
4
4
|
"author": "Ian Webster",
|
|
5
|
-
"version": "0.
|
|
5
|
+
"version": "0.10.0",
|
|
6
6
|
"license": "MIT",
|
|
7
7
|
"type": "commonjs",
|
|
8
8
|
"main": "dist/index.js",
|
|
@@ -38,6 +38,7 @@
|
|
|
38
38
|
},
|
|
39
39
|
"devDependencies": {
|
|
40
40
|
"@types/async": "^3.2.20",
|
|
41
|
+
"@types/cache-manager": "^4.0.2",
|
|
41
42
|
"@types/cache-manager-fs-hash": "^0.0.1",
|
|
42
43
|
"@types/cli-progress": "^3.11.0",
|
|
43
44
|
"@types/cors": "^2.8.13",
|
|
@@ -50,6 +51,7 @@
|
|
|
50
51
|
"@types/nunjucks": "^3.2.2",
|
|
51
52
|
"@types/opener": "^1.4.0",
|
|
52
53
|
"babel-jest": "^29.5.0",
|
|
54
|
+
"jest": "^29.5.0",
|
|
53
55
|
"jest-watch-typeahead": "^2.2.2",
|
|
54
56
|
"prettier": "^2.8.8",
|
|
55
57
|
"ts-jest": "^29.1.0",
|
package/src/assertions.ts
CHANGED
|
@@ -1,18 +1,18 @@
|
|
|
1
1
|
import invariant from 'tiny-invariant';
|
|
2
2
|
import nunjucks from 'nunjucks';
|
|
3
3
|
|
|
4
|
-
import { DefaultEmbeddingProvider, DefaultGradingProvider } from './providers/openai
|
|
5
|
-
import { cosineSimilarity } from './util
|
|
6
|
-
import { loadApiProvider } from './providers
|
|
7
|
-
import { DEFAULT_GRADING_PROMPT } from './prompts
|
|
4
|
+
import { DefaultEmbeddingProvider, DefaultGradingProvider } from './providers/openai';
|
|
5
|
+
import { cosineSimilarity } from './util';
|
|
6
|
+
import { loadApiProvider } from './providers';
|
|
7
|
+
import { DEFAULT_GRADING_PROMPT } from './prompts';
|
|
8
8
|
|
|
9
|
-
import type { Assertion, GradingConfig, TestCase, GradingResult } from './types
|
|
9
|
+
import type { Assertion, GradingConfig, TestCase, GradingResult, AtomicTestCase } from './types';
|
|
10
10
|
|
|
11
11
|
const SIMILAR_REGEX = /similar(?::|\((\d+(\.\d+)?)\):)/;
|
|
12
12
|
|
|
13
13
|
const DEFAULT_SEMANTIC_SIMILARITY_THRESHOLD = 0.8;
|
|
14
14
|
|
|
15
|
-
export async function runAssertions(test: TestCase, output: string): Promise<GradingResult> {
|
|
15
|
+
export async function runAssertions(test: AtomicTestCase, output: string): Promise<GradingResult> {
|
|
16
16
|
const tokensUsed = {
|
|
17
17
|
total: 0,
|
|
18
18
|
prompt: 0,
|
|
@@ -41,7 +41,7 @@ export async function runAssertions(test: TestCase, output: string): Promise<Gra
|
|
|
41
41
|
|
|
42
42
|
export async function runAssertion(
|
|
43
43
|
assertion: Assertion,
|
|
44
|
-
test: TestCase,
|
|
44
|
+
test: AtomicTestCase,
|
|
45
45
|
output: string,
|
|
46
46
|
): Promise<GradingResult> {
|
|
47
47
|
let pass: boolean = false;
|
|
@@ -92,8 +92,7 @@ export async function runAssertion(
|
|
|
92
92
|
|
|
93
93
|
if (assertion.type === 'similar') {
|
|
94
94
|
invariant(assertion.value, 'Similarity assertion must have a string value');
|
|
95
|
-
|
|
96
|
-
return matchesSimilarity(assertion.value, output, assertion.threshold);
|
|
95
|
+
return matchesSimilarity(assertion.value, output, assertion.threshold || 0.75);
|
|
97
96
|
}
|
|
98
97
|
|
|
99
98
|
if (assertion.type === 'llm-rubric') {
|
package/src/cache.ts
CHANGED
|
@@ -4,8 +4,8 @@ import path from 'node:path';
|
|
|
4
4
|
import cacheManager from 'cache-manager';
|
|
5
5
|
import fsStore from 'cache-manager-fs-hash';
|
|
6
6
|
|
|
7
|
-
import logger from './logger
|
|
8
|
-
import { getConfigDirectoryPath, fetchWithTimeout } from './util
|
|
7
|
+
import logger from './logger';
|
|
8
|
+
import { getConfigDirectoryPath, fetchWithTimeout } from './util';
|
|
9
9
|
|
|
10
10
|
import type { Cache } from 'cache-manager';
|
|
11
11
|
import type { RequestInfo, RequestInit } from 'node-fetch';
|
|
@@ -22,9 +22,10 @@ const cacheType =
|
|
|
22
22
|
|
|
23
23
|
function getCache() {
|
|
24
24
|
if (!cacheInstance) {
|
|
25
|
-
const cachePath =
|
|
25
|
+
const cachePath =
|
|
26
|
+
process.env.PROMPTFOO_CACHE_PATH || path.join(getConfigDirectoryPath(), 'cache');
|
|
26
27
|
if (!fs.existsSync(cachePath)) {
|
|
27
|
-
logger.info(
|
|
28
|
+
logger.info(`Creating cache folder at ${cachePath}.`);
|
|
28
29
|
fs.mkdirSync(cachePath, { recursive: true });
|
|
29
30
|
}
|
|
30
31
|
cacheInstance = cacheManager.caching({
|
package/src/evaluator.ts
CHANGED
|
@@ -4,8 +4,8 @@ import async from 'async';
|
|
|
4
4
|
import chalk from 'chalk';
|
|
5
5
|
import nunjucks from 'nunjucks';
|
|
6
6
|
|
|
7
|
-
import logger from './logger
|
|
8
|
-
import { runAssertions } from './assertions
|
|
7
|
+
import logger from './logger';
|
|
8
|
+
import { runAssertions } from './assertions';
|
|
9
9
|
|
|
10
10
|
import type { SingleBar } from 'cli-progress';
|
|
11
11
|
import type {
|
|
@@ -18,14 +18,15 @@ import type {
|
|
|
18
18
|
TestSuite,
|
|
19
19
|
Prompt,
|
|
20
20
|
TestCase,
|
|
21
|
-
|
|
22
|
-
|
|
21
|
+
AtomicTestCase,
|
|
22
|
+
} from './types';
|
|
23
|
+
import { generatePrompts } from './suggestions';
|
|
23
24
|
|
|
24
25
|
interface RunEvalOptions {
|
|
25
26
|
provider: ApiProvider;
|
|
26
|
-
prompt:
|
|
27
|
+
prompt: Prompt;
|
|
27
28
|
|
|
28
|
-
test:
|
|
29
|
+
test: AtomicTestCase;
|
|
29
30
|
|
|
30
31
|
includeProviderId?: boolean;
|
|
31
32
|
|
|
@@ -35,6 +36,29 @@ interface RunEvalOptions {
|
|
|
35
36
|
|
|
36
37
|
const DEFAULT_MAX_CONCURRENCY = 4;
|
|
37
38
|
|
|
39
|
+
function generateVarCombinations(
|
|
40
|
+
vars: Record<string, string | string[]>,
|
|
41
|
+
): Record<string, string>[] {
|
|
42
|
+
const keys = Object.keys(vars);
|
|
43
|
+
const combinations: Record<string, string>[] = [{}];
|
|
44
|
+
|
|
45
|
+
for (const key of keys) {
|
|
46
|
+
const values = Array.isArray(vars[key]) ? vars[key] : [vars[key]];
|
|
47
|
+
const newCombinations: Record<string, string>[] = [];
|
|
48
|
+
|
|
49
|
+
for (const combination of combinations) {
|
|
50
|
+
for (const value of values) {
|
|
51
|
+
newCombinations.push({ ...combination, [key]: value as string });
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
combinations.length = 0;
|
|
56
|
+
combinations.push(...newCombinations);
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
return combinations;
|
|
60
|
+
}
|
|
61
|
+
|
|
38
62
|
class Evaluator {
|
|
39
63
|
testSuite: TestSuite;
|
|
40
64
|
options: EvaluateOptions;
|
|
@@ -62,10 +86,13 @@ class Evaluator {
|
|
|
62
86
|
includeProviderId,
|
|
63
87
|
}: RunEvalOptions): Promise<EvaluateResult> {
|
|
64
88
|
const vars = test.vars || {};
|
|
65
|
-
const renderedPrompt = nunjucks.renderString(prompt, vars);
|
|
89
|
+
const renderedPrompt = nunjucks.renderString(prompt.raw, vars);
|
|
66
90
|
|
|
67
91
|
// Note that we're using original prompt, not renderedPrompt
|
|
68
|
-
|
|
92
|
+
let promptDisplay = prompt.display;
|
|
93
|
+
if (includeProviderId) {
|
|
94
|
+
promptDisplay = `[${provider.id()}] ${promptDisplay}`;
|
|
95
|
+
}
|
|
69
96
|
|
|
70
97
|
const setup = {
|
|
71
98
|
prompt: {
|
|
@@ -131,7 +158,7 @@ class Evaluator {
|
|
|
131
158
|
if (options.generateSuggestions) {
|
|
132
159
|
// TODO(ian): Move this into its own command/file
|
|
133
160
|
logger.info(`Generating prompt variations...`);
|
|
134
|
-
const { prompts: newPrompts, error } = await generatePrompts(testSuite.prompts[0], 1);
|
|
161
|
+
const { prompts: newPrompts, error } = await generatePrompts(testSuite.prompts[0].raw, 1);
|
|
135
162
|
if (error || !newPrompts) {
|
|
136
163
|
throw new Error(`Failed to generate prompts: ${error}`);
|
|
137
164
|
}
|
|
@@ -154,7 +181,7 @@ class Evaluator {
|
|
|
154
181
|
async (answer) => {
|
|
155
182
|
rl.close();
|
|
156
183
|
if (answer.toLowerCase().startsWith('y')) {
|
|
157
|
-
testSuite.prompts.push(prompt);
|
|
184
|
+
testSuite.prompts.push({ raw: prompt, display: prompt });
|
|
158
185
|
numAdded++;
|
|
159
186
|
} else {
|
|
160
187
|
logger.info('Skipping this prompt.');
|
|
@@ -172,13 +199,13 @@ class Evaluator {
|
|
|
172
199
|
}
|
|
173
200
|
|
|
174
201
|
// Split prompts by provider
|
|
175
|
-
for (const
|
|
202
|
+
for (const prompt of testSuite.prompts) {
|
|
176
203
|
for (const provider of testSuite.providers) {
|
|
177
|
-
const
|
|
178
|
-
testSuite.providers.length > 1 ? `[${provider.id()}] ${
|
|
204
|
+
const updatedDisplay =
|
|
205
|
+
testSuite.providers.length > 1 ? `[${provider.id()}] ${prompt.display}` : prompt.display;
|
|
179
206
|
prompts.push({
|
|
180
|
-
|
|
181
|
-
display,
|
|
207
|
+
...prompt,
|
|
208
|
+
display: updatedDisplay,
|
|
182
209
|
});
|
|
183
210
|
}
|
|
184
211
|
}
|
|
@@ -197,10 +224,10 @@ class Evaluator {
|
|
|
197
224
|
});
|
|
198
225
|
|
|
199
226
|
const varNames: Set<string> = new Set();
|
|
200
|
-
const varsWithSpecialColsRemoved: Record<string, string>[] = [];
|
|
227
|
+
const varsWithSpecialColsRemoved: Record<string, string | string[]>[] = [];
|
|
201
228
|
for (const testCase of tests) {
|
|
202
229
|
if (testCase.vars) {
|
|
203
|
-
const varWithSpecialColsRemoved: Record<string, string> = {};
|
|
230
|
+
const varWithSpecialColsRemoved: Record<string, string | string[]> = {};
|
|
204
231
|
for (const varName of Object.keys(testCase.vars)) {
|
|
205
232
|
varNames.add(varName);
|
|
206
233
|
varWithSpecialColsRemoved[varName] = testCase.vars[varName];
|
|
@@ -224,6 +251,7 @@ class Evaluator {
|
|
|
224
251
|
// And progress bar...
|
|
225
252
|
let progressbar: SingleBar | undefined;
|
|
226
253
|
if (options.showProgressBar) {
|
|
254
|
+
// FIXME(ian): Add var combinations too
|
|
227
255
|
const totalNumRuns =
|
|
228
256
|
testSuite.prompts.length * testSuite.providers.length * (tests.length || 1);
|
|
229
257
|
const cliProgress = await import('cli-progress');
|
|
@@ -245,8 +273,6 @@ class Evaluator {
|
|
|
245
273
|
const runEvalOptions: RunEvalOptions[] = [];
|
|
246
274
|
let rowIndex = 0;
|
|
247
275
|
for (const testCase of tests) {
|
|
248
|
-
let colIndex = 0;
|
|
249
|
-
|
|
250
276
|
// Handle default properties
|
|
251
277
|
testCase.vars = Object.assign({}, testSuite.defaultTest?.vars, testCase.vars);
|
|
252
278
|
testCase.assert = [...(testSuite.defaultTest?.assert || []), ...(testCase.assert || [])];
|
|
@@ -259,20 +285,27 @@ class Evaluator {
|
|
|
259
285
|
testCase.options?.suffix || testSuite.defaultTest?.options?.suffix || '';
|
|
260
286
|
|
|
261
287
|
// Finalize test case eval
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
288
|
+
const varCombinations = generateVarCombinations(testCase.vars || {});
|
|
289
|
+
for (const vars of varCombinations) {
|
|
290
|
+
let colIndex = 0;
|
|
291
|
+
for (const prompt of testSuite.prompts) {
|
|
292
|
+
for (const provider of testSuite.providers) {
|
|
293
|
+
runEvalOptions.push({
|
|
294
|
+
provider,
|
|
295
|
+
prompt: {
|
|
296
|
+
...prompt,
|
|
297
|
+
raw: prependToPrompt + prompt.raw + appendToPrompt,
|
|
298
|
+
},
|
|
299
|
+
test: { ...testCase, vars },
|
|
300
|
+
includeProviderId: testSuite.providers.length > 1,
|
|
301
|
+
rowIndex,
|
|
302
|
+
colIndex,
|
|
303
|
+
});
|
|
304
|
+
colIndex++;
|
|
305
|
+
}
|
|
273
306
|
}
|
|
307
|
+
rowIndex++;
|
|
274
308
|
}
|
|
275
|
-
rowIndex++;
|
|
276
309
|
}
|
|
277
310
|
|
|
278
311
|
// Actually run the eval
|
|
@@ -288,7 +321,7 @@ class Evaluator {
|
|
|
288
321
|
if (progressbar) {
|
|
289
322
|
progressbar.increment({
|
|
290
323
|
provider: options.provider.id(),
|
|
291
|
-
prompt: options.prompt.slice(0, 10),
|
|
324
|
+
prompt: options.prompt.raw.slice(0, 10),
|
|
292
325
|
vars: Object.entries(options.test.vars || {})
|
|
293
326
|
.map(([k, v]) => `${k}=${v}`)
|
|
294
327
|
.join(' ')
|
|
@@ -320,7 +353,7 @@ class Evaluator {
|
|
|
320
353
|
if (!table.body[rowIndex]) {
|
|
321
354
|
table.body[rowIndex] = {
|
|
322
355
|
outputs: [],
|
|
323
|
-
vars: table.head.vars.map((varName) => options.test.vars?.[varName] || ''),
|
|
356
|
+
vars: table.head.vars.map((varName) => options.test.vars?.[varName] || '').flat(),
|
|
324
357
|
};
|
|
325
358
|
}
|
|
326
359
|
table.body[rowIndex].outputs[colIndex] = resultText;
|
package/src/index.ts
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
|
-
import { evaluate as doEvaluate } from './evaluator
|
|
2
|
-
import { loadApiProviders } from './providers
|
|
3
|
-
import assertions from './assertions
|
|
4
|
-
import providers from './providers
|
|
1
|
+
import { evaluate as doEvaluate } from './evaluator';
|
|
2
|
+
import { loadApiProviders } from './providers';
|
|
3
|
+
import assertions from './assertions';
|
|
4
|
+
import providers from './providers';
|
|
5
5
|
|
|
6
|
-
import type { EvaluateOptions, TestSuite, TestSuiteConfig } from './types
|
|
7
|
-
import { readTests } from './util
|
|
6
|
+
import type { EvaluateOptions, TestSuite, TestSuiteConfig } from './types';
|
|
7
|
+
import { readTests } from './util';
|
|
8
8
|
|
|
9
|
-
export * from './types
|
|
9
|
+
export * from './types';
|
|
10
10
|
|
|
11
11
|
interface EvaluateTestSuite extends TestSuiteConfig {
|
|
12
12
|
prompts: string[];
|
|
@@ -15,9 +15,14 @@ interface EvaluateTestSuite extends TestSuiteConfig {
|
|
|
15
15
|
async function evaluate(testSuite: EvaluateTestSuite, options: EvaluateOptions = {}) {
|
|
16
16
|
const constructedTestSuite: TestSuite = {
|
|
17
17
|
...testSuite,
|
|
18
|
-
prompts: testSuite.prompts, // raw prompts expected
|
|
19
18
|
providers: await loadApiProviders(testSuite.providers),
|
|
20
19
|
tests: await readTests(testSuite.tests),
|
|
20
|
+
|
|
21
|
+
// Full prompts expected (not filepaths)
|
|
22
|
+
prompts: testSuite.prompts.map((promptContent) => ({
|
|
23
|
+
raw: promptContent,
|
|
24
|
+
display: promptContent,
|
|
25
|
+
})),
|
|
21
26
|
};
|
|
22
27
|
return doEvaluate(constructedTestSuite, options);
|
|
23
28
|
}
|
package/src/main.ts
CHANGED
|
@@ -6,9 +6,9 @@ import Table from 'cli-table3';
|
|
|
6
6
|
import chalk from 'chalk';
|
|
7
7
|
import { Command } from 'commander';
|
|
8
8
|
|
|
9
|
-
import logger, { setLogLevel } from './logger
|
|
10
|
-
import { loadApiProvider, loadApiProviders } from './providers
|
|
11
|
-
import { evaluate } from './evaluator
|
|
9
|
+
import logger, { setLogLevel } from './logger';
|
|
10
|
+
import { loadApiProvider, loadApiProviders } from './providers';
|
|
11
|
+
import { evaluate } from './evaluator';
|
|
12
12
|
import {
|
|
13
13
|
maybeReadConfig,
|
|
14
14
|
readConfig,
|
|
@@ -16,10 +16,10 @@ import {
|
|
|
16
16
|
readTests,
|
|
17
17
|
writeLatestResults,
|
|
18
18
|
writeOutput,
|
|
19
|
-
} from './util
|
|
20
|
-
import { getDirectory } from './esm
|
|
21
|
-
import { init } from './web/server
|
|
22
|
-
import { disableCache } from './cache
|
|
19
|
+
} from './util';
|
|
20
|
+
import { getDirectory } from './esm';
|
|
21
|
+
import { init } from './web/server';
|
|
22
|
+
import { disableCache } from './cache';
|
|
23
23
|
|
|
24
24
|
import type {
|
|
25
25
|
CommandLineOptions,
|
|
@@ -27,8 +27,8 @@ import type {
|
|
|
27
27
|
TestCase,
|
|
28
28
|
TestSuite,
|
|
29
29
|
UnifiedConfig,
|
|
30
|
-
} from './types
|
|
31
|
-
import { DEFAULT_README, DEFAULT_YAML_CONFIG, DEFAULT_PROMPTS } from './onboarding
|
|
30
|
+
} from './types';
|
|
31
|
+
import { DEFAULT_README, DEFAULT_YAML_CONFIG, DEFAULT_PROMPTS } from './onboarding';
|
|
32
32
|
|
|
33
33
|
function createDummyFiles(directory: string | null) {
|
|
34
34
|
if (directory) {
|
|
@@ -52,11 +52,9 @@ function createDummyFiles(directory: string | null) {
|
|
|
52
52
|
writeFileSync(pathJoin(process.cwd(), directory, 'README.md'), DEFAULT_README);
|
|
53
53
|
|
|
54
54
|
if (directory === '.') {
|
|
55
|
-
logger.info(
|
|
56
|
-
'Wrote prompts.txt, vars.csv, and promptfooconfig.js. Open README.md to get started!',
|
|
57
|
-
);
|
|
55
|
+
logger.info('Wrote prompts.txt and promptfooconfig.js. Open README.md to get started!');
|
|
58
56
|
} else {
|
|
59
|
-
logger.info(`Wrote prompts.txt
|
|
57
|
+
logger.info(`Wrote prompts.txt and promptfooconfig.js to ./${directory}`);
|
|
60
58
|
logger.info(`\`cd ${directory}\` and open README.md to get started!`);
|
|
61
59
|
}
|
|
62
60
|
}
|
|
@@ -128,11 +126,7 @@ async function main() {
|
|
|
128
126
|
'Path to CSV with test cases',
|
|
129
127
|
config?.commandLineOptions?.vars,
|
|
130
128
|
)
|
|
131
|
-
.option(
|
|
132
|
-
'-t, --tests <path>',
|
|
133
|
-
'Path to CSV with test cases',
|
|
134
|
-
config?.commandLineOptions?.tests,
|
|
135
|
-
)
|
|
129
|
+
.option('-t, --tests <path>', 'Path to CSV with test cases', config?.commandLineOptions?.tests)
|
|
136
130
|
.option('-o, --output <path>', 'Path to output file (csv, json, yaml, html)', config.outputPath)
|
|
137
131
|
.option(
|
|
138
132
|
'-j, --max-concurrency <number>',
|
|
@@ -184,6 +178,7 @@ async function main() {
|
|
|
184
178
|
prompts: cmdObj.prompts || config.prompts,
|
|
185
179
|
providers: cmdObj.providers || config.providers,
|
|
186
180
|
tests: cmdObj.tests || cmdObj.vars || config.tests,
|
|
181
|
+
defaultTest: config.defaultTest,
|
|
187
182
|
};
|
|
188
183
|
}
|
|
189
184
|
|
package/src/providers/localai.ts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
import logger from '../logger
|
|
2
|
-
import { fetchJsonWithCache } from '../cache
|
|
3
|
-
import { REQUEST_TIMEOUT_MS } from './shared
|
|
1
|
+
import logger from '../logger';
|
|
2
|
+
import { fetchJsonWithCache } from '../cache';
|
|
3
|
+
import { REQUEST_TIMEOUT_MS } from './shared';
|
|
4
4
|
|
|
5
5
|
import type { ApiProvider, ProviderResponse } from '../types.js';
|
|
6
6
|
|
package/src/providers/openai.ts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
import logger from '../logger
|
|
2
|
-
import { fetchJsonWithCache } from '../cache
|
|
3
|
-
import { REQUEST_TIMEOUT_MS } from './shared
|
|
1
|
+
import logger from '../logger';
|
|
2
|
+
import { fetchJsonWithCache } from '../cache';
|
|
3
|
+
import { REQUEST_TIMEOUT_MS } from './shared';
|
|
4
4
|
|
|
5
5
|
import type { ApiProvider, ProviderEmbeddingResponse, ProviderResponse } from '../types.js';
|
|
6
6
|
|
|
@@ -126,12 +126,20 @@ export class OpenAiCompletionProvider extends OpenAiGenericProvider {
|
|
|
126
126
|
);
|
|
127
127
|
}
|
|
128
128
|
|
|
129
|
+
let stop: string;
|
|
130
|
+
try {
|
|
131
|
+
stop = process.env.OPENAI_STOP
|
|
132
|
+
? JSON.parse(process.env.OPENAI_STOP)
|
|
133
|
+
: ['<|im_end|>', '<|endoftext|>'];
|
|
134
|
+
} catch (err) {
|
|
135
|
+
throw new Error(`OPENAI_STOP is not a valid JSON string: ${err}`);
|
|
136
|
+
}
|
|
129
137
|
const body = {
|
|
130
138
|
model: this.modelName,
|
|
131
139
|
prompt,
|
|
132
|
-
max_tokens: process.env.OPENAI_MAX_TOKENS || 1024,
|
|
133
|
-
temperature: options?.temperature ?? (process.env.
|
|
134
|
-
stop
|
|
140
|
+
max_tokens: parseInt(process.env.OPENAI_MAX_TOKENS || '1024'),
|
|
141
|
+
temperature: options?.temperature ?? parseFloat(process.env.OPENAI_TEMPERATURE || '0'),
|
|
142
|
+
stop,
|
|
135
143
|
};
|
|
136
144
|
logger.debug(`Calling OpenAI API: ${JSON.stringify(body)}`);
|
|
137
145
|
let data,
|
|
@@ -210,8 +218,8 @@ export class OpenAiChatCompletionProvider extends OpenAiGenericProvider {
|
|
|
210
218
|
const body = {
|
|
211
219
|
model: this.modelName,
|
|
212
220
|
messages: messages,
|
|
213
|
-
max_tokens: process.env.OPENAI_MAX_TOKENS || 1024,
|
|
214
|
-
temperature: options?.temperature ?? (process.env.
|
|
221
|
+
max_tokens: parseInt(process.env.OPENAI_MAX_TOKENS || '1024'),
|
|
222
|
+
temperature: options?.temperature ?? parseFloat(process.env.OPENAI_TEMPERATURE || '0'),
|
|
215
223
|
};
|
|
216
224
|
logger.debug(`Calling OpenAI API: ${JSON.stringify(body)}`);
|
|
217
225
|
|
package/src/providers.ts
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
import path from 'node:path';
|
|
2
2
|
|
|
3
|
-
import { ApiProvider } from './types
|
|
3
|
+
import { ApiProvider } from './types';
|
|
4
4
|
|
|
5
|
-
import { OpenAiCompletionProvider, OpenAiChatCompletionProvider } from './providers/openai
|
|
6
|
-
import { LocalAiCompletionProvider, LocalAiChatProvider } from './providers/localai
|
|
5
|
+
import { OpenAiCompletionProvider, OpenAiChatCompletionProvider } from './providers/openai';
|
|
6
|
+
import { LocalAiCompletionProvider, LocalAiChatProvider } from './providers/localai';
|
|
7
7
|
|
|
8
8
|
export async function loadApiProviders(providerPaths: string | string[]): Promise<ApiProvider[]> {
|
|
9
9
|
if (typeof providerPaths === 'string') {
|
package/src/types.ts
CHANGED
|
@@ -134,7 +134,7 @@ export interface TestCase {
|
|
|
134
134
|
description?: string;
|
|
135
135
|
|
|
136
136
|
// Key-value pairs to substitute in the prompt
|
|
137
|
-
vars?: Record<string, string>;
|
|
137
|
+
vars?: Record<string, string | string[]>;
|
|
138
138
|
|
|
139
139
|
// Optional list of automatic checks to run on the LLM output
|
|
140
140
|
assert?: Assertion[];
|
|
@@ -143,6 +143,11 @@ export interface TestCase {
|
|
|
143
143
|
options?: PromptConfig & GradingConfig;
|
|
144
144
|
}
|
|
145
145
|
|
|
146
|
+
// Same as a TestCase, except the `vars` object has been flattened into its final form.
|
|
147
|
+
export interface AtomicTestCase extends TestCase {
|
|
148
|
+
vars?: Record<string, string>;
|
|
149
|
+
}
|
|
150
|
+
|
|
146
151
|
// The test suite defines the "knobs" that we are tuning in prompt engineering: providers and prompts
|
|
147
152
|
export interface TestSuite {
|
|
148
153
|
// Optional description of what your LLM is trying to do
|
|
@@ -152,7 +157,7 @@ export interface TestSuite {
|
|
|
152
157
|
providers: ApiProvider[];
|
|
153
158
|
|
|
154
159
|
// One or more prompt strings
|
|
155
|
-
prompts:
|
|
160
|
+
prompts: Prompt[];
|
|
156
161
|
|
|
157
162
|
// Test cases
|
|
158
163
|
tests?: TestCase[];
|