promptfoo 0.5.1 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -248
- package/dist/__mocks__/esm.js +5 -1
- package/dist/__mocks__/esm.js.map +1 -1
- package/dist/assertions.d.ts +18 -0
- package/dist/assertions.d.ts.map +1 -0
- package/dist/assertions.js +128 -0
- package/dist/assertions.js.map +1 -0
- package/dist/esm.d.ts.map +1 -1
- package/dist/esm.js +10 -3
- package/dist/esm.js.map +1 -1
- package/dist/evaluator.d.ts.map +1 -1
- package/dist/evaluator.js +88 -117
- package/dist/evaluator.js.map +1 -1
- package/dist/index.d.ts +13 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +34 -5
- package/dist/index.js.map +1 -1
- package/dist/logger.js +18 -11
- package/dist/logger.js.map +1 -1
- package/dist/main.js +95 -53
- package/dist/main.js.map +1 -1
- package/dist/prompts.d.ts +4 -0
- package/dist/prompts.d.ts.map +1 -1
- package/dist/prompts.js +12 -1
- package/dist/prompts.js.map +1 -1
- package/dist/providers/localai.js +21 -13
- package/dist/providers/localai.js.map +1 -1
- package/dist/providers/openai.d.ts +9 -4
- package/dist/providers/openai.d.ts.map +1 -1
- package/dist/providers/openai.js +39 -29
- package/dist/providers/openai.js.map +1 -1
- package/dist/providers/shared.d.ts.map +1 -1
- package/dist/providers/shared.js +5 -2
- package/dist/providers/shared.js.map +1 -1
- package/dist/providers.d.ts +10 -0
- package/dist/providers.d.ts.map +1 -1
- package/dist/providers.js +51 -14
- package/dist/providers.js.map +1 -1
- package/dist/suggestions.d.ts +9 -0
- package/dist/suggestions.d.ts.map +1 -0
- package/dist/suggestions.js +54 -0
- package/dist/suggestions.js.map +1 -0
- package/dist/types.d.ts +11 -2
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js +2 -1
- package/dist/util.d.ts +1 -1
- package/dist/util.d.ts.map +1 -1
- package/dist/util.js +86 -31
- package/dist/util.js.map +1 -1
- package/dist/web/client/assets/index-207192fc.css +1 -0
- package/dist/web/client/assets/index-8751749f.js +172 -0
- package/dist/web/client/index.html +2 -2
- package/dist/web/server.js +38 -31
- package/dist/web/server.js.map +1 -1
- package/package.json +14 -4
- package/src/assertions.ts +154 -0
- package/src/esm.ts +5 -2
- package/src/evaluator.ts +61 -139
- package/src/index.ts +12 -0
- package/src/main.ts +28 -3
- package/src/prompts.ts +9 -0
- package/src/providers/openai.ts +16 -9
- package/src/providers/shared.ts +1 -1
- package/src/providers.ts +8 -0
- package/src/suggestions.ts +63 -0
- package/src/types.ts +14 -2
- package/src/util.ts +24 -3
- package/src/web/client/package.json +1 -0
- package/src/web/client/src/App.css +4 -0
- package/src/web/client/src/App.tsx +29 -5
- package/src/web/client/src/Logo.css +5 -0
- package/src/web/client/src/NavBar.css +18 -0
- package/src/web/client/src/NavBar.tsx +12 -1
- package/src/web/client/src/index.css +10 -0
- package/src/web/server.ts +2 -2
- package/dist/web/client/assets/index-710f1308.css +0 -1
- package/dist/web/client/assets/index-900b20c0.js +0 -172
|
@@ -5,8 +5,8 @@
|
|
|
5
5
|
<link rel="icon" type="image/svg+xml" href="favicon.ico" />
|
|
6
6
|
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
|
7
7
|
<title>promptfoo web viewer</title>
|
|
8
|
-
<script type="module" crossorigin src="/assets/index-900b20c0.js"></script>
|
|
9
|
-
<link rel="stylesheet" href="/assets/index-710f1308.css">
|
|
8
|
+
<script type="module" crossorigin src="/assets/index-8751749f.js"></script>
|
|
9
|
+
<link rel="stylesheet" href="/assets/index-207192fc.css">
|
|
10
10
|
</head>
|
|
11
11
|
<body>
|
|
12
12
|
<div id="root"></div>
|
package/dist/web/server.js
CHANGED
|
@@ -1,24 +1,30 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
const
|
|
1
|
+
"use strict";
|
|
2
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
+
};
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.init = void 0;
|
|
7
|
+
const fs_1 = __importDefault(require("fs"));
|
|
8
|
+
const node_path_1 = __importDefault(require("node:path"));
|
|
9
|
+
const node_readline_1 = __importDefault(require("node:readline"));
|
|
10
|
+
const node_http_1 = __importDefault(require("node:http"));
|
|
11
|
+
const debounce_1 = __importDefault(require("debounce"));
|
|
12
|
+
const express_1 = __importDefault(require("express"));
|
|
13
|
+
const cors_1 = __importDefault(require("cors"));
|
|
14
|
+
const opener_1 = __importDefault(require("opener"));
|
|
15
|
+
const socket_io_1 = require("socket.io");
|
|
16
|
+
const index_js_1 = __importDefault(require("../index.js"));
|
|
17
|
+
const logger_js_1 = __importDefault(require("../logger.js"));
|
|
18
|
+
const esm_js_1 = require("../esm.js");
|
|
19
|
+
const util_js_1 = require("../util.js");
|
|
20
|
+
function init(port = 15500) {
|
|
21
|
+
const app = (0, express_1.default)();
|
|
22
|
+
const staticDir = node_path_1.default.join((0, esm_js_1.getDirectory)(), 'web', 'client');
|
|
23
|
+
app.use((0, cors_1.default)());
|
|
24
|
+
app.use(express_1.default.json());
|
|
25
|
+
app.use(express_1.default.static(staticDir));
|
|
26
|
+
const httpServer = node_http_1.default.createServer(app);
|
|
27
|
+
const io = new socket_io_1.Server(httpServer, {
|
|
22
28
|
cors: {
|
|
23
29
|
origin: '*',
|
|
24
30
|
},
|
|
@@ -26,16 +32,16 @@ export function init(port = 15500) {
|
|
|
26
32
|
app.post('/evaluate', async (req, res) => {
|
|
27
33
|
try {
|
|
28
34
|
const { provider, options } = req.body;
|
|
29
|
-
const summary = await
|
|
35
|
+
const summary = await index_js_1.default.evaluate(provider, options);
|
|
30
36
|
res.json(summary);
|
|
31
37
|
}
|
|
32
38
|
catch (error) {
|
|
33
39
|
res.status(500).json({ message: 'Error evaluating prompts' });
|
|
34
40
|
}
|
|
35
41
|
});
|
|
36
|
-
const latestJsonPath = getLatestResultsPath();
|
|
42
|
+
const latestJsonPath = (0, util_js_1.getLatestResultsPath)();
|
|
37
43
|
const readLatestJson = () => {
|
|
38
|
-
const data =
|
|
44
|
+
const data = fs_1.default.readFileSync(latestJsonPath, 'utf8');
|
|
39
45
|
const jsonData = JSON.parse(data);
|
|
40
46
|
return jsonData.table;
|
|
41
47
|
};
|
|
@@ -43,7 +49,7 @@ export function init(port = 15500) {
|
|
|
43
49
|
// Send the initial table data when a client connects
|
|
44
50
|
socket.emit('init', { table: readLatestJson() });
|
|
45
51
|
// Watch for changes to latest.json and emit the update event
|
|
46
|
-
|
|
52
|
+
fs_1.default.watch(latestJsonPath, (0, debounce_1.default)((event) => {
|
|
47
53
|
if (event === 'change') {
|
|
48
54
|
socket.emit('update', { table: readLatestJson() });
|
|
49
55
|
}
|
|
@@ -51,24 +57,25 @@ export function init(port = 15500) {
|
|
|
51
57
|
});
|
|
52
58
|
httpServer.listen(port, () => {
|
|
53
59
|
const url = `http://localhost:${port}`;
|
|
54
|
-
|
|
55
|
-
const rl =
|
|
60
|
+
logger_js_1.default.info(`Server listening at ${url}`);
|
|
61
|
+
const rl = node_readline_1.default.createInterface({
|
|
56
62
|
input: process.stdin,
|
|
57
63
|
output: process.stdout,
|
|
58
64
|
});
|
|
59
65
|
rl.question('Do you want to open the browser to the URL? (y/N): ', async (answer) => {
|
|
60
66
|
if (answer.toLowerCase().startsWith('y')) {
|
|
61
67
|
try {
|
|
62
|
-
await
|
|
63
|
-
|
|
68
|
+
await (0, opener_1.default)(url);
|
|
69
|
+
logger_js_1.default.info(`Opening browser to: ${url}`);
|
|
64
70
|
}
|
|
65
71
|
catch (err) {
|
|
66
|
-
|
|
72
|
+
logger_js_1.default.error(`Failed to open browser: ${String(err)}`);
|
|
67
73
|
}
|
|
68
74
|
}
|
|
69
75
|
rl.close();
|
|
70
|
-
|
|
76
|
+
logger_js_1.default.info('Press Ctrl+C to stop the server');
|
|
71
77
|
});
|
|
72
78
|
});
|
|
73
79
|
}
|
|
80
|
+
exports.init = init;
|
|
74
81
|
//# sourceMappingURL=server.js.map
|
package/dist/web/server.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"server.js","sourceRoot":"","sources":["../../src/web/server.ts"],"names":[],"mappings":"AAAA,
|
|
1
|
+
{"version":3,"file":"server.js","sourceRoot":"","sources":["../../src/web/server.ts"],"names":[],"mappings":";;;;;;AAAA,4CAAoB;AACpB,0DAA6B;AAC7B,kEAAqC;AACrC,0DAA6B;AAE7B,wDAAgC;AAChC,sDAA8B;AAC9B,gDAAwB;AACxB,oDAA4B;AAC5B,yCAAqD;AAErD,2DAAoC;AACpC,6DAAkC;AAClC,sCAAyC;AACzC,wCAAkD;AAIlD,SAAgB,IAAI,CAAC,IAAI,GAAG,KAAK;IAC/B,MAAM,GAAG,GAAG,IAAA,iBAAO,GAAE,CAAC;IAEtB,MAAM,SAAS,GAAG,mBAAI,CAAC,IAAI,CAAC,IAAA,qBAAY,GAAE,EAAE,KAAK,EAAE,QAAQ,CAAC,CAAC;IAE7D,GAAG,CAAC,GAAG,CAAC,IAAA,cAAI,GAAE,CAAC,CAAC;IAChB,GAAG,CAAC,GAAG,CAAC,iBAAO,CAAC,IAAI,EAAE,CAAC,CAAC;IACxB,GAAG,CAAC,GAAG,CAAC,iBAAO,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,CAAC;IAEnC,MAAM,UAAU,GAAG,mBAAI,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;IAC1C,MAAM,EAAE,GAAG,IAAI,kBAAc,CAAC,UAAU,EAAE;QACxC,IAAI,EAAE;YACJ,MAAM,EAAE,GAAG;SACZ;KACF,CAAC,CAAC;IAUH,GAAG,CAAC,IAAI,CAAC,WAAW,EAAE,KAAK,EAAE,GAAY,EAAE,GAAa,EAAE,EAAE;QAC1D,IAAI;YACF,MAAM,EAAE,QAAQ,EAAE,OAAO,EAAE,GAAG,GAAG,CAAC,IAA2B,CAAC;YAC9D,MAAM,OAAO,GAAG,MAAM,kBAAS,CAAC,QAAQ,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;YAC5D,GAAG,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;SACnB;QAAC,OAAO,KAAK,EAAE;YACd,GAAG,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,EAAE,OAAO,EAAE,0BAA0B,EAAE,CAAC,CAAC;SAC/D;IACH,CAAC,CAAC,CAAC;IAEH,MAAM,cAAc,GAAG,IAAA,8BAAoB,GAAE,CAAC;IAC9C,MAAM,cAAc,GAAG,GAAG,EAAE;QAC1B,MAAM,IAAI,GAAG,YAAE,CAAC,YAAY,CAAC,cAAc,EAAE,MAAM,CAAC,CAAC;QACrD,MAAM,QAAQ,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAClC,OAAO,QAAQ,CAAC,KAAK,CAAC;IACxB,CAAC,CAAC;IAEF,EAAE,CAAC,EAAE,CAAC,YAAY,EAAE,CAAC,MAAM,EAAE,EAAE;QAC7B,qDAAqD;QACrD,MAAM,CAAC,IAAI,CAAC,MAAM,EAAE,EAAE,KAAK,EAAE,cAAc,EAAE,EAAE,CAAC,CAAC;QAEjD,6DAA6D;QAC7D,YAAE,CAAC,KAAK,CACN,cAAc,EACd,IAAA,kBAAQ,EAAC,CAAC,KAAa,EAAE,EAAE;YACzB,IAAI,KAAK,KAAK,QAAQ,EAAE;gBACtB,MAAM,CAAC,IAAI,CAAC,QAAQ,EAAE,EAAE,KAAK,EAAE,cAAc,EAAE,EAAE,CAAC,CAAC;aACpD;QACH,CAAC,EAAE,GAAG,CAAC,CACR,CAAC;IACJ,CAAC,CAAC,CAAC;IAEH,UAAU,CAAC,MAAM,CAAC,IAAI,EAAE,GAAG,EAAE;QAC3B,MAAM,GAAG,GAAG,oBAAoB,IAAI,EAAE,CAAC;QACvC,mBAAM,CAAC,IAAI,CAAC,uBAAuB,GAAG,EAAE,CAAC,CAAC;QAE1C,MAAM,EAAE,GAAG,uBAAQ,CAAC,eAAe,CA
AC;YAClC,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,MAAM,EAAE,OAAO,CAAC,MAAM;SACvB,CAAC,CAAC;QACH,EAAE,CAAC,QAAQ,CAAC,qDAAqD,EAAE,KAAK,EAAE,MAAM,EAAE,EAAE;YAClF,IAAI,MAAM,CAAC,WAAW,EAAE,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE;gBACxC,IAAI;oBACF,MAAM,IAAA,gBAAM,EAAC,GAAG,CAAC,CAAC;oBAClB,mBAAM,CAAC,IAAI,CAAC,uBAAuB,GAAG,EAAE,CAAC,CAAC;iBAC3C;gBAAC,OAAO,GAAG,EAAE;oBACZ,mBAAM,CAAC,KAAK,CAAC,2BAA2B,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;iBACxD;aACF;YACD,EAAE,CAAC,KAAK,EAAE,CAAC;YACX,mBAAM,CAAC,IAAI,CAAC,iCAAiC,CAAC,CAAC;QACjD,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC;AA7ED,oBA6EC"}
|
package/package.json
CHANGED
|
@@ -1,10 +1,16 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "promptfoo",
|
|
3
3
|
"author": "Ian Webster",
|
|
4
|
-
"version": "0.5.1",
|
|
4
|
+
"version": "0.6.0",
|
|
5
5
|
"license": "MIT",
|
|
6
|
-
"type": "module",
|
|
6
|
+
"type": "commonjs",
|
|
7
7
|
"main": "dist/index.js",
|
|
8
|
+
"exports": {
|
|
9
|
+
".": {
|
|
10
|
+
"import": "./dist/index.js",
|
|
11
|
+
"require": "./dist/index.js"
|
|
12
|
+
}
|
|
13
|
+
},
|
|
8
14
|
"types": "dist/index.d.ts",
|
|
9
15
|
"typings": "dist/index.d.ts",
|
|
10
16
|
"files": [
|
|
@@ -35,9 +41,12 @@
|
|
|
35
41
|
"@types/cors": "^2.8.13",
|
|
36
42
|
"@types/debounce": "^1.2.1",
|
|
37
43
|
"@types/express": "^4.17.17",
|
|
44
|
+
"@types/glob": "^8.1.0",
|
|
38
45
|
"@types/jest": "^29.5.1",
|
|
39
46
|
"@types/js-yaml": "^4.0.5",
|
|
47
|
+
"@types/node-fetch": "^2.6.4",
|
|
40
48
|
"@types/nunjucks": "^3.2.2",
|
|
49
|
+
"@types/opener": "^1.4.0",
|
|
41
50
|
"babel-jest": "^29.5.0",
|
|
42
51
|
"jest-watch-typeahead": "^2.2.2",
|
|
43
52
|
"prettier": "^2.8.8",
|
|
@@ -56,11 +65,12 @@
|
|
|
56
65
|
"csv-stringify": "^6.3.2",
|
|
57
66
|
"debounce": "^1.2.1",
|
|
58
67
|
"express": "^4.18.2",
|
|
68
|
+
"glob": "^10.2.6",
|
|
59
69
|
"js-yaml": "^4.1.0",
|
|
60
70
|
"lru-cache": "^9.1.1",
|
|
61
|
-
"node-fetch": "^
|
|
71
|
+
"node-fetch": "^2.6.7",
|
|
62
72
|
"nunjucks": "^3.2.4",
|
|
63
|
-
"
|
|
73
|
+
"opener": "^1.5.2",
|
|
64
74
|
"socket.io": "^4.6.1",
|
|
65
75
|
"winston": "^3.8.2"
|
|
66
76
|
}
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
import nunjucks from 'nunjucks';
|
|
2
|
+
|
|
3
|
+
import { DefaultEmbeddingProvider, DefaultGradingProvider } from './providers/openai.js';
|
|
4
|
+
import { cosineSimilarity } from './util.js';
|
|
5
|
+
import { loadApiProvider } from './providers.js';
|
|
6
|
+
import { DEFAULT_GRADING_PROMPT } from './prompts.js';
|
|
7
|
+
|
|
8
|
+
import type { EvaluateOptions, GradingConfig, TokenUsage } from './types.js';
|
|
9
|
+
|
|
10
|
+
interface GradingResult {
|
|
11
|
+
pass: boolean;
|
|
12
|
+
reason: string;
|
|
13
|
+
tokensUsed: TokenUsage;
|
|
14
|
+
}
|
|
15
|
+
|
|
16
|
+
const SIMILAR_REGEX = /similar(?::|\((\d+(\.\d+)?)\):)/;
|
|
17
|
+
|
|
18
|
+
const DEFAULT_SEMANTIC_SIMILARITY_THRESHOLD = 0.8;
|
|
19
|
+
|
|
20
|
+
export async function matchesExpectedValue(
|
|
21
|
+
expected: string,
|
|
22
|
+
output: string,
|
|
23
|
+
options: EvaluateOptions,
|
|
24
|
+
): Promise<{ pass: boolean; reason?: string }> {
|
|
25
|
+
const match = expected.match(SIMILAR_REGEX);
|
|
26
|
+
|
|
27
|
+
if (match) {
|
|
28
|
+
const threshold = parseFloat(match[1]) || DEFAULT_SEMANTIC_SIMILARITY_THRESHOLD;
|
|
29
|
+
const rest = expected.replace(SIMILAR_REGEX, '').trim();
|
|
30
|
+
return matchesSimilarity(rest, output, threshold);
|
|
31
|
+
} else if (expected.startsWith('fn:') || expected.startsWith('eval:')) {
|
|
32
|
+
// TODO(1.0): delete eval: legacy option
|
|
33
|
+
const sliceLength = expected.startsWith('fn:') ? 'fn:'.length : 'eval:'.length;
|
|
34
|
+
const functionBody = expected.slice(sliceLength);
|
|
35
|
+
|
|
36
|
+
const customFunction = new Function('output', `return ${functionBody}`);
|
|
37
|
+
return { pass: customFunction(output) };
|
|
38
|
+
} else if (expected.startsWith('grade:')) {
|
|
39
|
+
return matchesLlmRubric(expected.slice(6), output, options.grading);
|
|
40
|
+
} else {
|
|
41
|
+
const pass = expected === output;
|
|
42
|
+
return {
|
|
43
|
+
pass,
|
|
44
|
+
reason: pass ? undefined : `Expected: ${expected}, Output: ${output}`,
|
|
45
|
+
};
|
|
46
|
+
}
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
export async function matchesSimilarity(
|
|
50
|
+
expected: string,
|
|
51
|
+
output: string,
|
|
52
|
+
threshold: number,
|
|
53
|
+
): Promise<GradingResult> {
|
|
54
|
+
const expectedEmbedding = await DefaultEmbeddingProvider.callEmbeddingApi(expected);
|
|
55
|
+
const outputEmbedding = await DefaultEmbeddingProvider.callEmbeddingApi(output);
|
|
56
|
+
|
|
57
|
+
const tokensUsed = {
|
|
58
|
+
total: (expectedEmbedding.tokenUsage?.total || 0) + (outputEmbedding.tokenUsage?.total || 0),
|
|
59
|
+
prompt: (expectedEmbedding.tokenUsage?.prompt || 0) + (outputEmbedding.tokenUsage?.prompt || 0),
|
|
60
|
+
completion:
|
|
61
|
+
(expectedEmbedding.tokenUsage?.completion || 0) +
|
|
62
|
+
(outputEmbedding.tokenUsage?.completion || 0),
|
|
63
|
+
};
|
|
64
|
+
|
|
65
|
+
if (expectedEmbedding.error || outputEmbedding.error) {
|
|
66
|
+
return {
|
|
67
|
+
pass: false,
|
|
68
|
+
reason:
|
|
69
|
+
expectedEmbedding.error || outputEmbedding.error || 'Unknown error fetching embeddings',
|
|
70
|
+
tokensUsed,
|
|
71
|
+
};
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
if (!expectedEmbedding.embedding || !outputEmbedding.embedding) {
|
|
75
|
+
return {
|
|
76
|
+
pass: false,
|
|
77
|
+
reason: 'Embedding not found',
|
|
78
|
+
tokensUsed,
|
|
79
|
+
};
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
const similarity = cosineSimilarity(expectedEmbedding.embedding, outputEmbedding.embedding);
|
|
83
|
+
if (similarity < threshold) {
|
|
84
|
+
return {
|
|
85
|
+
pass: false,
|
|
86
|
+
reason: `Similarity ${similarity} is less than threshold ${threshold}`,
|
|
87
|
+
tokensUsed,
|
|
88
|
+
};
|
|
89
|
+
}
|
|
90
|
+
return {
|
|
91
|
+
pass: true,
|
|
92
|
+
reason: `Similarity ${similarity} is greater than threshold ${threshold}`,
|
|
93
|
+
tokensUsed,
|
|
94
|
+
};
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
export async function matchesLlmRubric(
|
|
98
|
+
expected: string,
|
|
99
|
+
output: string,
|
|
100
|
+
options?: GradingConfig,
|
|
101
|
+
): Promise<GradingResult> {
|
|
102
|
+
if (!options) {
|
|
103
|
+
throw new Error(
|
|
104
|
+
'Cannot grade output without grading config. Specify --grader option or grading config.',
|
|
105
|
+
);
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
const prompt = nunjucks.renderString(options.prompt || DEFAULT_GRADING_PROMPT, {
|
|
109
|
+
content: output,
|
|
110
|
+
rubric: expected,
|
|
111
|
+
});
|
|
112
|
+
|
|
113
|
+
let provider = options.provider || DefaultGradingProvider;
|
|
114
|
+
if (typeof provider === 'string') {
|
|
115
|
+
provider = await loadApiProvider(provider);
|
|
116
|
+
}
|
|
117
|
+
const resp = await provider.callApi(prompt);
|
|
118
|
+
if (resp.error || !resp.output) {
|
|
119
|
+
return {
|
|
120
|
+
pass: false,
|
|
121
|
+
reason: resp.error || 'No output',
|
|
122
|
+
tokensUsed: {
|
|
123
|
+
total: resp.tokenUsage?.total || 0,
|
|
124
|
+
prompt: resp.tokenUsage?.prompt || 0,
|
|
125
|
+
completion: resp.tokenUsage?.completion || 0,
|
|
126
|
+
},
|
|
127
|
+
};
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
try {
|
|
131
|
+
const parsed = JSON.parse(resp.output) as GradingResult;
|
|
132
|
+
parsed.tokensUsed = {
|
|
133
|
+
total: resp.tokenUsage?.total || 0,
|
|
134
|
+
prompt: resp.tokenUsage?.prompt || 0,
|
|
135
|
+
completion: resp.tokenUsage?.completion || 0,
|
|
136
|
+
};
|
|
137
|
+
return parsed;
|
|
138
|
+
} catch (err) {
|
|
139
|
+
return {
|
|
140
|
+
pass: false,
|
|
141
|
+
reason: `Output is not valid JSON: ${resp.output}`,
|
|
142
|
+
tokensUsed: {
|
|
143
|
+
total: resp.tokenUsage?.total || 0,
|
|
144
|
+
prompt: resp.tokenUsage?.prompt || 0,
|
|
145
|
+
completion: resp.tokenUsage?.completion || 0,
|
|
146
|
+
},
|
|
147
|
+
};
|
|
148
|
+
}
|
|
149
|
+
}
|
|
150
|
+
|
|
151
|
+
export default {
|
|
152
|
+
matchesSimilarity,
|
|
153
|
+
matchesLlmRubric,
|
|
154
|
+
};
|
package/src/esm.ts
CHANGED
|
@@ -1,10 +1,13 @@
|
|
|
1
1
|
// esm-specific crap that needs to get mocked out in tests
|
|
2
2
|
|
|
3
|
-
import path from 'path';
|
|
4
|
-
import { fileURLToPath } from 'url';
|
|
3
|
+
//import path from 'path';
|
|
4
|
+
//import { fileURLToPath } from 'url';
|
|
5
5
|
|
|
6
6
|
export function getDirectory(): string {
|
|
7
|
+
/*
|
|
7
8
|
// @ts-ignore: Jest chokes on this
|
|
8
9
|
const __filename = fileURLToPath(import.meta.url);
|
|
9
10
|
return path.dirname(__filename);
|
|
11
|
+
*/
|
|
12
|
+
return __dirname;
|
|
10
13
|
}
|
package/src/evaluator.ts
CHANGED
|
@@ -1,8 +1,11 @@
|
|
|
1
|
+
import readline from 'node:readline';
|
|
2
|
+
|
|
1
3
|
import async from 'async';
|
|
4
|
+
import chalk from 'chalk';
|
|
2
5
|
import nunjucks from 'nunjucks';
|
|
3
6
|
|
|
4
|
-
import
|
|
5
|
-
import {
|
|
7
|
+
import logger from './logger.js';
|
|
8
|
+
import { matchesExpectedValue } from './assertions.js';
|
|
6
9
|
|
|
7
10
|
import type { SingleBar } from 'cli-progress';
|
|
8
11
|
import type {
|
|
@@ -13,9 +16,8 @@ import type {
|
|
|
13
16
|
EvaluateSummary,
|
|
14
17
|
EvaluateTable,
|
|
15
18
|
Prompt,
|
|
16
|
-
TokenUsage,
|
|
17
19
|
} from './types.js';
|
|
18
|
-
import {
|
|
20
|
+
import { generatePrompts } from './suggestions.js';
|
|
19
21
|
|
|
20
22
|
interface RunEvalOptions {
|
|
21
23
|
provider: ApiProvider;
|
|
@@ -27,16 +29,8 @@ interface RunEvalOptions {
|
|
|
27
29
|
colIndex: number;
|
|
28
30
|
}
|
|
29
31
|
|
|
30
|
-
interface GradingResult {
|
|
31
|
-
pass: boolean;
|
|
32
|
-
reason: string;
|
|
33
|
-
tokensUsed: TokenUsage;
|
|
34
|
-
}
|
|
35
|
-
|
|
36
32
|
const DEFAULT_MAX_CONCURRENCY = 4;
|
|
37
33
|
|
|
38
|
-
const SIMILAR_REGEX = /similar(?::|\((\d+(\.\d+)?)\):)/;
|
|
39
|
-
|
|
40
34
|
class Evaluator {
|
|
41
35
|
options: EvaluateOptions;
|
|
42
36
|
stats: EvaluateStats;
|
|
@@ -54,128 +48,6 @@ class Evaluator {
|
|
|
54
48
|
};
|
|
55
49
|
}
|
|
56
50
|
|
|
57
|
-
async gradeOutput(expected: string, output: string): Promise<GradingResult> {
|
|
58
|
-
const { grading } = this.options;
|
|
59
|
-
|
|
60
|
-
if (!grading) {
|
|
61
|
-
throw new Error(
|
|
62
|
-
'Cannot grade output without grading config. Specify --grader option or grading config.',
|
|
63
|
-
);
|
|
64
|
-
}
|
|
65
|
-
|
|
66
|
-
const prompt = nunjucks.renderString(grading.prompt || DEFAULT_GRADING_PROMPT, {
|
|
67
|
-
content: output,
|
|
68
|
-
rubric: expected,
|
|
69
|
-
});
|
|
70
|
-
|
|
71
|
-
const resp = await grading.provider.callApi(prompt);
|
|
72
|
-
if (resp.error || !resp.output) {
|
|
73
|
-
return {
|
|
74
|
-
pass: false,
|
|
75
|
-
reason: resp.error || 'No output',
|
|
76
|
-
tokensUsed: {
|
|
77
|
-
total: resp.tokenUsage?.total || 0,
|
|
78
|
-
prompt: resp.tokenUsage?.prompt || 0,
|
|
79
|
-
completion: resp.tokenUsage?.completion || 0,
|
|
80
|
-
},
|
|
81
|
-
};
|
|
82
|
-
}
|
|
83
|
-
|
|
84
|
-
try {
|
|
85
|
-
const parsed = JSON.parse(resp.output) as GradingResult;
|
|
86
|
-
parsed.tokensUsed = {
|
|
87
|
-
total: resp.tokenUsage?.total || 0,
|
|
88
|
-
prompt: resp.tokenUsage?.prompt || 0,
|
|
89
|
-
completion: resp.tokenUsage?.completion || 0,
|
|
90
|
-
};
|
|
91
|
-
return parsed;
|
|
92
|
-
} catch (err) {
|
|
93
|
-
return {
|
|
94
|
-
pass: false,
|
|
95
|
-
reason: `Output is not valid JSON: ${resp.output}`,
|
|
96
|
-
tokensUsed: {
|
|
97
|
-
total: resp.tokenUsage?.total || 0,
|
|
98
|
-
prompt: resp.tokenUsage?.prompt || 0,
|
|
99
|
-
completion: resp.tokenUsage?.completion || 0,
|
|
100
|
-
},
|
|
101
|
-
};
|
|
102
|
-
}
|
|
103
|
-
}
|
|
104
|
-
|
|
105
|
-
async checkSimilarity(
|
|
106
|
-
expected: string,
|
|
107
|
-
output: string,
|
|
108
|
-
threshold: number,
|
|
109
|
-
): Promise<GradingResult> {
|
|
110
|
-
const expectedEmbedding = await DefaultEmbeddingProvider.callEmbeddingApi(expected);
|
|
111
|
-
const outputEmbedding = await DefaultEmbeddingProvider.callEmbeddingApi(output);
|
|
112
|
-
|
|
113
|
-
const tokensUsed = {
|
|
114
|
-
total: (expectedEmbedding.tokenUsage?.total || 0) + (outputEmbedding.tokenUsage?.total || 0),
|
|
115
|
-
prompt:
|
|
116
|
-
(expectedEmbedding.tokenUsage?.prompt || 0) + (outputEmbedding.tokenUsage?.prompt || 0),
|
|
117
|
-
completion:
|
|
118
|
-
(expectedEmbedding.tokenUsage?.completion || 0) +
|
|
119
|
-
(outputEmbedding.tokenUsage?.completion || 0),
|
|
120
|
-
};
|
|
121
|
-
|
|
122
|
-
if (expectedEmbedding.error || outputEmbedding.error) {
|
|
123
|
-
return {
|
|
124
|
-
pass: false,
|
|
125
|
-
reason:
|
|
126
|
-
expectedEmbedding.error || outputEmbedding.error || 'Unknown error fetching embeddings',
|
|
127
|
-
tokensUsed,
|
|
128
|
-
};
|
|
129
|
-
}
|
|
130
|
-
|
|
131
|
-
if (!expectedEmbedding.embedding || !outputEmbedding.embedding) {
|
|
132
|
-
return {
|
|
133
|
-
pass: false,
|
|
134
|
-
reason: 'Embedding not found',
|
|
135
|
-
tokensUsed,
|
|
136
|
-
};
|
|
137
|
-
}
|
|
138
|
-
|
|
139
|
-
const similarity = cosineSimilarity(expectedEmbedding.embedding, outputEmbedding.embedding);
|
|
140
|
-
if (similarity < threshold) {
|
|
141
|
-
return {
|
|
142
|
-
pass: false,
|
|
143
|
-
reason: `Similarity ${similarity} is less than threshold ${threshold}`,
|
|
144
|
-
tokensUsed,
|
|
145
|
-
};
|
|
146
|
-
}
|
|
147
|
-
return {
|
|
148
|
-
pass: true,
|
|
149
|
-
reason: `Similarity ${similarity} is greater than threshold ${threshold}`,
|
|
150
|
-
tokensUsed,
|
|
151
|
-
};
|
|
152
|
-
}
|
|
153
|
-
|
|
154
|
-
async checkExpectedValue(
|
|
155
|
-
expected: string,
|
|
156
|
-
output: string,
|
|
157
|
-
): Promise<{ pass: boolean; reason?: string }> {
|
|
158
|
-
const match = expected.match(SIMILAR_REGEX);
|
|
159
|
-
|
|
160
|
-
if (match) {
|
|
161
|
-
const threshold = parseFloat(match[1]) || 0.8;
|
|
162
|
-
const rest = expected.replace(SIMILAR_REGEX, '').trim();
|
|
163
|
-
return this.checkSimilarity(rest, output, threshold);
|
|
164
|
-
} else if (expected.startsWith('eval:')) {
|
|
165
|
-
const evalBody = expected.slice(5);
|
|
166
|
-
const evalFunction = new Function('output', `return ${evalBody}`);
|
|
167
|
-
return { pass: evalFunction(output) };
|
|
168
|
-
} else if (expected.startsWith('grade:')) {
|
|
169
|
-
return this.gradeOutput(expected.slice(6), output);
|
|
170
|
-
} else {
|
|
171
|
-
const pass = expected === output;
|
|
172
|
-
return {
|
|
173
|
-
pass,
|
|
174
|
-
reason: pass ? undefined : `Expected: ${expected}, Output: ${output}`,
|
|
175
|
-
};
|
|
176
|
-
}
|
|
177
|
-
}
|
|
178
|
-
|
|
179
51
|
async runEval({
|
|
180
52
|
provider,
|
|
181
53
|
prompt,
|
|
@@ -207,7 +79,7 @@ class Evaluator {
|
|
|
207
79
|
ret.error = response.error;
|
|
208
80
|
} else if (response.output) {
|
|
209
81
|
const checkResult = vars.__expected
|
|
210
|
-
? await this.checkExpectedValue(vars.__expected, response.output)
|
|
82
|
+
? await matchesExpectedValue(vars.__expected, response.output, this.options)
|
|
211
83
|
: { pass: true };
|
|
212
84
|
if (!checkResult.pass) {
|
|
213
85
|
ret.error = checkResult.reason || `Expected: ${vars.__expected}`;
|
|
@@ -243,6 +115,48 @@ class Evaluator {
|
|
|
243
115
|
const options = this.options;
|
|
244
116
|
const prompts: Prompt[] = [];
|
|
245
117
|
|
|
118
|
+
if (options.prompt?.generateSuggestions) {
|
|
119
|
+
logger.info(`Generating prompt variations...`);
|
|
120
|
+
const { prompts: newPrompts, error } = await generatePrompts(options.prompts[0], 1);
|
|
121
|
+
if (error || !newPrompts) {
|
|
122
|
+
throw new Error(`Failed to generate prompts: ${error}`);
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
logger.info(chalk.blue('Generated prompts:'));
|
|
126
|
+
let numAdded = 0;
|
|
127
|
+
for (const prompt of newPrompts) {
|
|
128
|
+
logger.info('--------------------------------------------------------');
|
|
129
|
+
logger.info(`${prompt}`);
|
|
130
|
+
logger.info('--------------------------------------------------------');
|
|
131
|
+
|
|
132
|
+
// Ask the user if they want to continue
|
|
133
|
+
await new Promise((resolve) => {
|
|
134
|
+
const rl = readline.createInterface({
|
|
135
|
+
input: process.stdin,
|
|
136
|
+
output: process.stdout,
|
|
137
|
+
});
|
|
138
|
+
rl.question(
|
|
139
|
+
`${chalk.blue('Do you want to test this prompt?')} (y/N): `,
|
|
140
|
+
async (answer) => {
|
|
141
|
+
rl.close();
|
|
142
|
+
if (answer.toLowerCase().startsWith('y')) {
|
|
143
|
+
options.prompts.push(prompt);
|
|
144
|
+
numAdded++;
|
|
145
|
+
} else {
|
|
146
|
+
logger.info('Skipping this prompt.');
|
|
147
|
+
}
|
|
148
|
+
resolve(true);
|
|
149
|
+
},
|
|
150
|
+
);
|
|
151
|
+
});
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
if (numAdded < 1) {
|
|
155
|
+
logger.info(chalk.red('No prompts selected. Aborting.'));
|
|
156
|
+
process.exit(1);
|
|
157
|
+
}
|
|
158
|
+
}
|
|
159
|
+
|
|
246
160
|
for (const promptContent of options.prompts) {
|
|
247
161
|
for (const provider of options.providers) {
|
|
248
162
|
const display =
|
|
@@ -255,16 +169,20 @@ class Evaluator {
|
|
|
255
169
|
}
|
|
256
170
|
|
|
257
171
|
const vars = options.vars && options.vars.length > 0 ? options.vars : [{}];
|
|
258
|
-
const
|
|
172
|
+
const varsWithSpecialColsRemoved = vars.map((v) => {
|
|
259
173
|
const ret = { ...v };
|
|
260
|
-
|
|
174
|
+
Object.keys(ret).forEach((key) => {
|
|
175
|
+
if (key.startsWith('__')) {
|
|
176
|
+
delete ret[key];
|
|
177
|
+
}
|
|
178
|
+
});
|
|
261
179
|
return ret;
|
|
262
180
|
});
|
|
263
181
|
const isTest = vars[0].__expected;
|
|
264
182
|
const table: EvaluateTable = {
|
|
265
183
|
head: {
|
|
266
184
|
prompts: prompts.map((p) => p.display),
|
|
267
|
-
vars: Object.keys(
|
|
185
|
+
vars: Object.keys(varsWithSpecialColsRemoved[0]),
|
|
268
186
|
},
|
|
269
187
|
body: [],
|
|
270
188
|
};
|
|
@@ -292,11 +210,15 @@ class Evaluator {
|
|
|
292
210
|
let rowIndex = 0;
|
|
293
211
|
for (const row of vars) {
|
|
294
212
|
let colIndex = 0;
|
|
213
|
+
|
|
214
|
+
const prependToPrompt = row.__prefix || options.prompt?.prefix || '';
|
|
215
|
+
const appendToPrompt = row.__suffix || options.prompt?.suffix || '';
|
|
216
|
+
|
|
295
217
|
for (const promptContent of options.prompts) {
|
|
296
218
|
for (const provider of options.providers) {
|
|
297
219
|
runEvalOptions.push({
|
|
298
220
|
provider,
|
|
299
|
-
prompt: promptContent,
|
|
221
|
+
prompt: prependToPrompt + promptContent + appendToPrompt,
|
|
300
222
|
vars: row,
|
|
301
223
|
includeProviderId: options.providers.length > 1,
|
|
302
224
|
rowIndex,
|
package/src/index.ts
CHANGED
|
@@ -1,8 +1,12 @@
|
|
|
1
1
|
import { evaluate as doEvaluate } from './evaluator.js';
|
|
2
2
|
import { loadApiProvider } from './providers.js';
|
|
3
|
+
import assertions from './assertions.js';
|
|
4
|
+
import providers from './providers.js';
|
|
3
5
|
|
|
4
6
|
import type { ApiProvider, EvaluateOptions, EvaluateSummary } from './types.js';
|
|
5
7
|
|
|
8
|
+
export * from './types.js';
|
|
9
|
+
|
|
6
10
|
async function evaluate(
|
|
7
11
|
providers: (string | ApiProvider)[] | (string | ApiProvider),
|
|
8
12
|
options: Omit<EvaluateOptions, 'providers'>,
|
|
@@ -30,6 +34,14 @@ async function evaluate(
|
|
|
30
34
|
});
|
|
31
35
|
}
|
|
32
36
|
|
|
37
|
+
module.exports = {
|
|
38
|
+
evaluate,
|
|
39
|
+
assertions,
|
|
40
|
+
providers,
|
|
41
|
+
};
|
|
42
|
+
|
|
33
43
|
export default {
|
|
34
44
|
evaluate,
|
|
45
|
+
assertions,
|
|
46
|
+
providers,
|
|
35
47
|
};
|