promptfoo 0.9.0 → 0.11.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +50 -40
- package/dist/assertions.d.ts +2 -2
- package/dist/assertions.d.ts.map +1 -1
- package/dist/assertions.js +186 -44
- package/dist/assertions.js.map +1 -1
- package/dist/cache.js +9 -9
- package/dist/cache.js.map +1 -1
- package/dist/evaluator.d.ts +1 -1
- package/dist/evaluator.d.ts.map +1 -1
- package/dist/evaluator.js +30 -23
- package/dist/evaluator.js.map +1 -1
- package/dist/index.d.ts +10 -10
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +18 -14
- package/dist/index.js.map +1 -1
- package/dist/main.js +49 -44
- package/dist/main.js.map +1 -1
- package/dist/providers/localai.js +11 -11
- package/dist/providers/localai.js.map +1 -1
- package/dist/providers/openai.d.ts.map +1 -1
- package/dist/providers/openai.js +30 -21
- package/dist/providers/openai.js.map +1 -1
- package/dist/providers.d.ts +3 -3
- package/dist/providers.d.ts.map +1 -1
- package/dist/providers.js +15 -15
- package/dist/providers.js.map +1 -1
- package/dist/types.d.ts +7 -3
- package/dist/types.d.ts.map +1 -1
- package/dist/util.d.ts +4 -4
- package/dist/util.d.ts.map +1 -1
- package/dist/util.js +49 -18
- package/dist/util.js.map +1 -1
- package/dist/web/client/assets/index-15dfcd18.js +172 -0
- package/dist/web/client/assets/index-87905193.css +1 -0
- package/dist/web/client/index.html +2 -2
- package/dist/web/server.js +9 -9
- package/dist/web/server.js.map +1 -1
- package/package.json +3 -1
- package/src/assertions.ts +249 -38
- package/src/cache.ts +2 -2
- package/src/evaluator.ts +25 -18
- package/src/index.ts +13 -8
- package/src/main.ts +28 -15
- package/src/providers/localai.ts +3 -3
- package/src/providers/openai.ts +16 -8
- package/src/providers.ts +3 -3
- package/src/types.ts +24 -3
- package/src/util.ts +48 -17
- package/src/web/client/package-lock.json +5729 -0
- package/src/web/client/src/ResultsTable.css +35 -4
- package/src/web/client/src/ResultsTable.tsx +150 -70
- package/src/web/client/src/ResultsView.tsx +83 -18
- package/src/web/client/src/index.css +6 -0
- package/src/web/client/src/types.ts +2 -0
- package/src/web/server.ts +3 -3
- package/dist/web/client/assets/index-207192fc.css +0 -1
- package/dist/web/client/assets/index-8751749f.js +0 -172
|
@@ -0,0 +1 @@
|
|
|
1
|
+
:root{font-family:system-ui,Avenir,Helvetica,Arial,sans-serif;font-synthesis:none;text-rendering:optimizeLegibility;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale;-webkit-text-size-adjust:100%;--background-color: #ffffff;--text-color: #404040;--border-color: lightgray;--table-border-color: lightgray;--pass-color: green;--fail-color: #ad0000;--smalltext-color: gray;--success-background-color: #d1ffd7;--variable-background-color: #f7f7f7;--header-background-color: #fffdf7}@media (prefers-color-scheme: dark){:root{--background-color: #1a1a1a;--text-color: #f0f0f0;--border-color: #444444;--table-border-color: #444444;--pass-color: #4caf50;--fail-color: #f44336;--smalltext-color: #888888}}[data-theme=dark]{--background-color: #1a1a1a;--text-color: #f0f0f0;--border-color: #444444;--table-border-color: #444444;--pass-color: #4caf50;--fail-color: #f44336;--smalltext-color: #888888;--success-background-color: #216d2b;--variable-background-color: #333;--header-background-color: #333}html{font-size:calc(14px + (18 - 14) * ((100vw - 300px) / (1600 - 300)))}*{box-sizing:border-box}html{font-family:-apple-system,BlinkMacSystemFont,Segoe UI,Helvetica,Arial,sans-serif,"Apple Color Emoji","Segoe UI Emoji",Segoe UI Symbol;font-size:16px;background-color:var(--background-color);color:var(--text-color)}table,.divTable{border:1px solid var(--table-border-color);border-collapse:collapse;width:100%;margin:1rem 0;box-shadow:0 2px 4px #0000001a}.tr{display:flex}tr,.tr{width:fit-content}tr:hover,.tr:hover{background-color:#0000000d}th,.th,td,.td{position:relative;box-shadow:inset 0 0 0 1px var(--border-color);vertical-align:top;padding:1.5rem}th.variable,.th.variable,td.variable,.td.variable{background-color:var(--variable-background-color)}tr.header{background-color:var(--header-background-color)}th,.th{padding:1rem;position:relative;text-align:center;vertical-align:bottom}tr 
.cell-rating{visibility:hidden;position:absolute;bottom:1.25rem;right:0;line-height:0;font-size:1.75rem}tr:hover .cell-rating{visibility:visible}tr .cell-rating .rating{cursor:pointer}tr .cell-rating .rating:first-child{margin-right:.5rem}th .smalltext{visibility:hidden;font-weight:400;font-size:.75rem;color:var(--smalltext-color)}th:hover .smalltext{visibility:visible}th .summary{font-weight:400;font-size:.8rem;padding:.25rem}th .summary.highlight{background-color:var(--success-background-color)}td .status{margin-bottom:.5rem;font-weight:700}td .pass{color:var(--pass-color)}td .fail{color:var(--fail-color)}.first-prompt-col{border-left:2px solid #888}.first-prompt-row{border-top:2px solid #888}.resizer{position:absolute;right:0;top:0;height:100%;width:5px;cursor:col-resize;user-select:none;touch-action:none;background:var(--text-color);opacity:.5}.resizer.isResizing{background:var(--text-color);opacity:1}@media (hover: hover){.resizer{opacity:0}*:hover>.resizer{opacity:1}}.logo{display:flex;align-items:center;gap:4px}.logo img{width:30px}.logo span{margin-bottom:6px;color:var(--text-color)}[data-theme=dark] .logo img{filter:invert(1)}nav{display:flex;justify-content:space-between;align-items:center;margin-bottom:1rem;color:var(--text-color)}.dark-mode-toggle{background-color:transparent;border:none;color:var(--text-color);cursor:pointer;font-size:16px;padding:8px;transition:color .3s}.dark-mode-toggle:hover{color:var(--pass-color)}body{background-color:var(--background-color);color:var(--text-color)}
|
|
@@ -5,8 +5,8 @@
|
|
|
5
5
|
<link rel="icon" type="image/svg+xml" href="favicon.ico" />
|
|
6
6
|
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
|
7
7
|
<title>promptfoo web viewer</title>
|
|
8
|
-
<script type="module" crossorigin src="/assets/index-8751749f.js"></script>
|
|
9
|
-
<link rel="stylesheet" href="/assets/index-207192fc.css">
|
|
8
|
+
<script type="module" crossorigin src="/assets/index-15dfcd18.js"></script>
|
|
9
|
+
<link rel="stylesheet" href="/assets/index-87905193.css">
|
|
10
10
|
</head>
|
|
11
11
|
<body>
|
|
12
12
|
<div id="root"></div>
|
package/dist/web/server.js
CHANGED
|
@@ -13,12 +13,12 @@ const express_1 = __importDefault(require("express"));
|
|
|
13
13
|
const cors_1 = __importDefault(require("cors"));
|
|
14
14
|
const opener_1 = __importDefault(require("opener"));
|
|
15
15
|
const socket_io_1 = require("socket.io");
|
|
16
|
-
const
|
|
17
|
-
const
|
|
18
|
-
const
|
|
16
|
+
const logger_1 = __importDefault(require("../logger"));
|
|
17
|
+
const esm_1 = require("../esm");
|
|
18
|
+
const util_1 = require("../util");
|
|
19
19
|
function init(port = 15500) {
|
|
20
20
|
const app = (0, express_1.default)();
|
|
21
|
-
const staticDir = node_path_1.default.join((0,
|
|
21
|
+
const staticDir = node_path_1.default.join((0, esm_1.getDirectory)(), 'web', 'client');
|
|
22
22
|
app.use((0, cors_1.default)());
|
|
23
23
|
app.use(express_1.default.json());
|
|
24
24
|
app.use(express_1.default.static(staticDir));
|
|
@@ -28,7 +28,7 @@ function init(port = 15500) {
|
|
|
28
28
|
origin: '*',
|
|
29
29
|
},
|
|
30
30
|
});
|
|
31
|
-
const latestJsonPath = (0,
|
|
31
|
+
const latestJsonPath = (0, util_1.getLatestResultsPath)();
|
|
32
32
|
const readLatestJson = () => {
|
|
33
33
|
const data = fs_1.default.readFileSync(latestJsonPath, 'utf8');
|
|
34
34
|
const jsonData = JSON.parse(data);
|
|
@@ -46,7 +46,7 @@ function init(port = 15500) {
|
|
|
46
46
|
});
|
|
47
47
|
httpServer.listen(port, () => {
|
|
48
48
|
const url = `http://localhost:${port}`;
|
|
49
|
-
|
|
49
|
+
logger_1.default.info(`Server listening at ${url}`);
|
|
50
50
|
const rl = node_readline_1.default.createInterface({
|
|
51
51
|
input: process.stdin,
|
|
52
52
|
output: process.stdout,
|
|
@@ -55,14 +55,14 @@ function init(port = 15500) {
|
|
|
55
55
|
if (answer.toLowerCase().startsWith('y')) {
|
|
56
56
|
try {
|
|
57
57
|
await (0, opener_1.default)(url);
|
|
58
|
-
|
|
58
|
+
logger_1.default.info(`Opening browser to: ${url}`);
|
|
59
59
|
}
|
|
60
60
|
catch (err) {
|
|
61
|
-
|
|
61
|
+
logger_1.default.error(`Failed to open browser: ${String(err)}`);
|
|
62
62
|
}
|
|
63
63
|
}
|
|
64
64
|
rl.close();
|
|
65
|
-
|
|
65
|
+
logger_1.default.info('Press Ctrl+C to stop the server');
|
|
66
66
|
});
|
|
67
67
|
});
|
|
68
68
|
}
|
package/dist/web/server.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"server.js","sourceRoot":"","sources":["../../src/web/server.ts"],"names":[],"mappings":";;;;;;AAAA,4CAAoB;AACpB,0DAA6B;AAC7B,kEAAqC;AACrC,0DAA6B;AAE7B,wDAAgC;AAChC,sDAA8B;AAC9B,gDAAwB;AACxB,oDAA4B;AAC5B,yCAAqD;AAGrD,
|
|
1
|
+
{"version":3,"file":"server.js","sourceRoot":"","sources":["../../src/web/server.ts"],"names":[],"mappings":";;;;;;AAAA,4CAAoB;AACpB,0DAA6B;AAC7B,kEAAqC;AACrC,0DAA6B;AAE7B,wDAAgC;AAChC,sDAA8B;AAC9B,gDAAwB;AACxB,oDAA4B;AAC5B,yCAAqD;AAGrD,uDAA+B;AAC/B,gCAAsC;AACtC,kCAA+C;AAI/C,SAAgB,IAAI,CAAC,IAAI,GAAG,KAAK;IAC/B,MAAM,GAAG,GAAG,IAAA,iBAAO,GAAE,CAAC;IAEtB,MAAM,SAAS,GAAG,mBAAI,CAAC,IAAI,CAAC,IAAA,kBAAY,GAAE,EAAE,KAAK,EAAE,QAAQ,CAAC,CAAC;IAE7D,GAAG,CAAC,GAAG,CAAC,IAAA,cAAI,GAAE,CAAC,CAAC;IAChB,GAAG,CAAC,GAAG,CAAC,iBAAO,CAAC,IAAI,EAAE,CAAC,CAAC;IACxB,GAAG,CAAC,GAAG,CAAC,iBAAO,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,CAAC;IAEnC,MAAM,UAAU,GAAG,mBAAI,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;IAC1C,MAAM,EAAE,GAAG,IAAI,kBAAc,CAAC,UAAU,EAAE;QACxC,IAAI,EAAE;YACJ,MAAM,EAAE,GAAG;SACZ;KACF,CAAC,CAAC;IAEH,MAAM,cAAc,GAAG,IAAA,2BAAoB,GAAE,CAAC;IAC9C,MAAM,cAAc,GAAG,GAAG,EAAE;QAC1B,MAAM,IAAI,GAAG,YAAE,CAAC,YAAY,CAAC,cAAc,EAAE,MAAM,CAAC,CAAC;QACrD,MAAM,QAAQ,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAClC,OAAO,QAAQ,CAAC,KAAK,CAAC;IACxB,CAAC,CAAC;IAEF,EAAE,CAAC,EAAE,CAAC,YAAY,EAAE,CAAC,MAAM,EAAE,EAAE;QAC7B,qDAAqD;QACrD,MAAM,CAAC,IAAI,CAAC,MAAM,EAAE,EAAE,KAAK,EAAE,cAAc,EAAE,EAAE,CAAC,CAAC;QAEjD,6DAA6D;QAC7D,YAAE,CAAC,KAAK,CACN,cAAc,EACd,IAAA,kBAAQ,EAAC,CAAC,KAAa,EAAE,EAAE;YACzB,IAAI,KAAK,KAAK,QAAQ,EAAE;gBACtB,MAAM,CAAC,IAAI,CAAC,QAAQ,EAAE,EAAE,KAAK,EAAE,cAAc,EAAE,EAAE,CAAC,CAAC;aACpD;QACH,CAAC,EAAE,GAAG,CAAC,CACR,CAAC;IACJ,CAAC,CAAC,CAAC;IAEH,UAAU,CAAC,MAAM,CAAC,IAAI,EAAE,GAAG,EAAE;QAC3B,MAAM,GAAG,GAAG,oBAAoB,IAAI,EAAE,CAAC;QACvC,gBAAM,CAAC,IAAI,CAAC,uBAAuB,GAAG,EAAE,CAAC,CAAC;QAE1C,MAAM,EAAE,GAAG,uBAAQ,CAAC,eAAe,CAAC;YAClC,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,MAAM,EAAE,OAAO,CAAC,MAAM;SACvB,CAAC,CAAC;QACH,EAAE,CAAC,QAAQ,CAAC,qDAAqD,EAAE,KAAK,EAAE,MAAM,EAAE,EAAE;YAClF,IAAI,MAAM,CAAC,WAAW,EAAE,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE;gBACxC,IAAI;oBACF,MAAM,IAAA,gBAAM,EAAC,GAAG,CAAC,CAAC;oBAClB,gBAAM,CAAC,IAAI,CAAC,uBAAuB,GAAG,EAAE,CAAC,CAAC;iBAC3C;gBAAC,OAAO,GAAG,EAAE;oBACZ,gBAAM,CAAC,KAAK,CAAC,2BAA2B,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CA
AC;iBACxD;aACF;YACD,EAAE,CAAC,KAAK,EAAE,CAAC;YACX,gBAAM,CAAC,IAAI,CAAC,iCAAiC,CAAC,CAAC;QACjD,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC;AA3DD,oBA2DC"}
|
package/package.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"name": "promptfoo",
|
|
3
3
|
"description": "Prompt engineering toolkit",
|
|
4
4
|
"author": "Ian Webster",
|
|
5
|
-
"version": "0.
|
|
5
|
+
"version": "0.11.0",
|
|
6
6
|
"license": "MIT",
|
|
7
7
|
"type": "commonjs",
|
|
8
8
|
"main": "dist/index.js",
|
|
@@ -59,6 +59,7 @@
|
|
|
59
59
|
"typescript": "^5.0.4"
|
|
60
60
|
},
|
|
61
61
|
"dependencies": {
|
|
62
|
+
"@apidevtools/json-schema-ref-parser": "^10.1.0",
|
|
62
63
|
"async": "^3.2.4",
|
|
63
64
|
"cache-manager": "^4.1.0",
|
|
64
65
|
"cache-manager-fs-hash": "^1.0.0",
|
|
@@ -76,6 +77,7 @@
|
|
|
76
77
|
"node-fetch": "^2.6.7",
|
|
77
78
|
"nunjucks": "^3.2.4",
|
|
78
79
|
"opener": "^1.5.2",
|
|
80
|
+
"rouge": "^1.0.3",
|
|
79
81
|
"socket.io": "^4.6.1",
|
|
80
82
|
"tiny-invariant": "^1.3.1",
|
|
81
83
|
"winston": "^3.8.2"
|
package/src/assertions.ts
CHANGED
|
@@ -1,17 +1,49 @@
|
|
|
1
|
+
import rouge from 'rouge';
|
|
1
2
|
import invariant from 'tiny-invariant';
|
|
2
3
|
import nunjucks from 'nunjucks';
|
|
3
4
|
|
|
4
|
-
import { DefaultEmbeddingProvider, DefaultGradingProvider } from './providers/openai
|
|
5
|
-
import { cosineSimilarity } from './util
|
|
6
|
-
import { loadApiProvider } from './providers
|
|
7
|
-
import { DEFAULT_GRADING_PROMPT } from './prompts
|
|
5
|
+
import { DefaultEmbeddingProvider, DefaultGradingProvider } from './providers/openai';
|
|
6
|
+
import { cosineSimilarity, fetchWithTimeout } from './util';
|
|
7
|
+
import { loadApiProvider } from './providers';
|
|
8
|
+
import { DEFAULT_GRADING_PROMPT } from './prompts';
|
|
8
9
|
|
|
9
|
-
import type {
|
|
10
|
+
import type {
|
|
11
|
+
Assertion,
|
|
12
|
+
AssertionType,
|
|
13
|
+
GradingConfig,
|
|
14
|
+
GradingResult,
|
|
15
|
+
AtomicTestCase,
|
|
16
|
+
} from './types';
|
|
10
17
|
|
|
11
18
|
const SIMILAR_REGEX = /similar(?::|\((\d+(\.\d+)?)\):)/;
|
|
12
19
|
|
|
13
20
|
const DEFAULT_SEMANTIC_SIMILARITY_THRESHOLD = 0.8;
|
|
14
21
|
|
|
22
|
+
function handleRougeScore(
|
|
23
|
+
baseType: 'rouge-n',
|
|
24
|
+
assertion: Assertion,
|
|
25
|
+
expected: string | string[],
|
|
26
|
+
output: string,
|
|
27
|
+
inverted: boolean,
|
|
28
|
+
): GradingResult {
|
|
29
|
+
const fnName = baseType[baseType.length - 1] as 'n' | 'l' | 's';
|
|
30
|
+
const rougeMethod = rouge[fnName];
|
|
31
|
+
const score = rougeMethod(output, expected);
|
|
32
|
+
console.log(output, expected, score);
|
|
33
|
+
const pass = score >= (assertion.threshold || 0.75) != inverted;
|
|
34
|
+
|
|
35
|
+
return {
|
|
36
|
+
pass,
|
|
37
|
+
reason: pass
|
|
38
|
+
? `${baseType.toUpperCase()} score ${score} is greater than or equal to threshold ${
|
|
39
|
+
assertion.threshold || 0.75
|
|
40
|
+
}`
|
|
41
|
+
: `${baseType.toUpperCase()} score ${score} is less than threshold ${
|
|
42
|
+
assertion.threshold || 0.75
|
|
43
|
+
}`,
|
|
44
|
+
};
|
|
45
|
+
}
|
|
46
|
+
|
|
15
47
|
export async function runAssertions(test: AtomicTestCase, output: string): Promise<GradingResult> {
|
|
16
48
|
const tokensUsed = {
|
|
17
49
|
total: 0,
|
|
@@ -46,7 +78,12 @@ export async function runAssertion(
|
|
|
46
78
|
): Promise<GradingResult> {
|
|
47
79
|
let pass: boolean = false;
|
|
48
80
|
|
|
49
|
-
|
|
81
|
+
invariant(assertion.type, `Assertion must have a type: ${JSON.stringify(assertion)}`);
|
|
82
|
+
|
|
83
|
+
const inverse = assertion.type.startsWith('not-');
|
|
84
|
+
const baseType = inverse ? assertion.type.slice(4) : assertion.type;
|
|
85
|
+
|
|
86
|
+
if (baseType === 'equals') {
|
|
50
87
|
pass = assertion.value === output;
|
|
51
88
|
return {
|
|
52
89
|
pass,
|
|
@@ -54,52 +91,194 @@ export async function runAssertion(
|
|
|
54
91
|
};
|
|
55
92
|
}
|
|
56
93
|
|
|
57
|
-
if (
|
|
94
|
+
if (baseType === 'is-json') {
|
|
58
95
|
try {
|
|
59
96
|
JSON.parse(output);
|
|
60
|
-
|
|
97
|
+
pass = !inverse;
|
|
61
98
|
} catch (err) {
|
|
62
|
-
|
|
63
|
-
pass: false,
|
|
64
|
-
reason: `Expected output to be valid JSON, but it isn't.\nError: ${err}`,
|
|
65
|
-
};
|
|
99
|
+
pass = inverse;
|
|
66
100
|
}
|
|
101
|
+
return { pass, reason: pass ? 'Assertion passed' : 'Expected output to be valid JSON' };
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
if (baseType === 'contains') {
|
|
105
|
+
invariant(assertion.value, '"contains" assertion type must have a string value');
|
|
106
|
+
invariant(
|
|
107
|
+
typeof assertion.value === 'string',
|
|
108
|
+
'"contains" assertion type must have a string value',
|
|
109
|
+
);
|
|
110
|
+
pass = output.includes(assertion.value) !== inverse;
|
|
111
|
+
return {
|
|
112
|
+
pass,
|
|
113
|
+
reason: pass
|
|
114
|
+
? 'Assertion passed'
|
|
115
|
+
: `Expected output to ${inverse ? 'not ' : ''}contain "${assertion.value}"`,
|
|
116
|
+
};
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
if (baseType === 'contains-any') {
|
|
120
|
+
invariant(assertion.value, '"contains-any" assertion type must have a value');
|
|
121
|
+
invariant(
|
|
122
|
+
Array.isArray(assertion.value),
|
|
123
|
+
'"contains-any" assertion type must have an array value',
|
|
124
|
+
);
|
|
125
|
+
pass = assertion.value.some((value) => output.includes(value)) !== inverse;
|
|
126
|
+
return {
|
|
127
|
+
pass,
|
|
128
|
+
reason: pass
|
|
129
|
+
? 'Assertion passed'
|
|
130
|
+
: `Expected output to ${inverse ? 'not ' : ''}contain one of "${assertion.value.join(
|
|
131
|
+
', ',
|
|
132
|
+
)}"`,
|
|
133
|
+
};
|
|
67
134
|
}
|
|
68
135
|
|
|
69
|
-
if (
|
|
70
|
-
|
|
136
|
+
if (baseType === 'contains-all') {
|
|
137
|
+
invariant(assertion.value, '"contains-all" assertion type must have a value');
|
|
138
|
+
invariant(
|
|
139
|
+
Array.isArray(assertion.value),
|
|
140
|
+
'"contains-all" assertion type must have an array value',
|
|
141
|
+
);
|
|
142
|
+
pass = assertion.value.every((value) => output.includes(value)) !== inverse;
|
|
143
|
+
return {
|
|
144
|
+
pass,
|
|
145
|
+
reason: pass
|
|
146
|
+
? 'Assertion passed'
|
|
147
|
+
: `Expected output to ${inverse ? 'not ' : ''}contain all of "${assertion.value.join(
|
|
148
|
+
', ',
|
|
149
|
+
)}"`,
|
|
150
|
+
};
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
if (baseType === 'regex') {
|
|
154
|
+
invariant(assertion.value, '"regex" assertion type must have a string value');
|
|
155
|
+
invariant(
|
|
156
|
+
typeof assertion.value === 'string',
|
|
157
|
+
'"contains" assertion type must have a string value',
|
|
158
|
+
);
|
|
159
|
+
const regex = new RegExp(assertion.value);
|
|
160
|
+
pass = regex.test(output) !== inverse;
|
|
161
|
+
return {
|
|
162
|
+
pass,
|
|
163
|
+
reason: pass
|
|
164
|
+
? 'Assertion passed'
|
|
165
|
+
: `Expected output to ${inverse ? 'not ' : ''}match regex "${assertion.value}"`,
|
|
166
|
+
};
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
if (baseType === 'icontains') {
|
|
170
|
+
invariant(assertion.value, '"icontains" assertion type must have a string value');
|
|
171
|
+
invariant(
|
|
172
|
+
typeof assertion.value === 'string',
|
|
173
|
+
'"icontains" assertion type must have a string value',
|
|
174
|
+
);
|
|
175
|
+
pass = output.toLowerCase().includes(assertion.value.toLowerCase()) !== inverse;
|
|
176
|
+
return {
|
|
177
|
+
pass,
|
|
178
|
+
reason: pass
|
|
179
|
+
? 'Assertion passed'
|
|
180
|
+
: `Expected output to ${inverse ? 'not ' : ''}contain "${assertion.value}"`,
|
|
181
|
+
};
|
|
182
|
+
}
|
|
183
|
+
|
|
184
|
+
if (baseType === 'contains-json') {
|
|
185
|
+
pass = containsJSON(output) !== inverse;
|
|
71
186
|
return {
|
|
72
187
|
pass,
|
|
73
|
-
reason: pass
|
|
188
|
+
reason: pass
|
|
189
|
+
? 'Assertion passed'
|
|
190
|
+
: `Expected output to ${inverse ? 'not ' : ''}contain valid JSON`,
|
|
74
191
|
};
|
|
75
192
|
}
|
|
76
193
|
|
|
77
|
-
if (
|
|
194
|
+
if (baseType === 'javascript') {
|
|
78
195
|
try {
|
|
79
|
-
const customFunction = new Function('output', `return ${assertion.value}`);
|
|
80
|
-
|
|
196
|
+
const customFunction = new Function('output', 'context', `return ${assertion.value}`);
|
|
197
|
+
const context = {
|
|
198
|
+
vars: test.vars || {},
|
|
199
|
+
};
|
|
200
|
+
pass = customFunction(output, context) !== inverse;
|
|
81
201
|
} catch (err) {
|
|
82
202
|
return {
|
|
83
203
|
pass: false,
|
|
84
|
-
reason: `Custom function threw error: ${(err as Error).message}
|
|
204
|
+
reason: `Custom function threw error: ${(err as Error).message}
|
|
205
|
+
${assertion.value}`,
|
|
85
206
|
};
|
|
86
207
|
}
|
|
87
208
|
return {
|
|
88
209
|
pass,
|
|
89
|
-
reason: pass
|
|
210
|
+
reason: pass
|
|
211
|
+
? 'Assertion passed'
|
|
212
|
+
: `Custom function returned ${inverse ? 'true' : 'false'}
|
|
213
|
+
${assertion.value}`,
|
|
90
214
|
};
|
|
91
215
|
}
|
|
92
216
|
|
|
93
|
-
if (
|
|
217
|
+
if (baseType === 'similar') {
|
|
94
218
|
invariant(assertion.value, 'Similarity assertion must have a string value');
|
|
95
|
-
|
|
219
|
+
invariant(
|
|
220
|
+
typeof assertion.value === 'string',
|
|
221
|
+
'"contains" assertion type must have a string value',
|
|
222
|
+
);
|
|
223
|
+
return matchesSimilarity(assertion.value, output, assertion.threshold || 0.75, inverse);
|
|
96
224
|
}
|
|
97
225
|
|
|
98
|
-
if (
|
|
226
|
+
if (baseType === 'llm-rubric') {
|
|
99
227
|
invariant(assertion.value, 'Similarity assertion must have a string value');
|
|
228
|
+
invariant(
|
|
229
|
+
typeof assertion.value === 'string',
|
|
230
|
+
'"contains" assertion type must have a string value',
|
|
231
|
+
);
|
|
100
232
|
return matchesLlmRubric(assertion.value, output, test.options);
|
|
101
233
|
}
|
|
102
234
|
|
|
235
|
+
if (baseType === 'webhook') {
|
|
236
|
+
invariant(assertion.value, '"webhook" assertion type must have a URL value');
|
|
237
|
+
invariant(
|
|
238
|
+
typeof assertion.value === 'string',
|
|
239
|
+
'"webhook" assertion type must have a URL value',
|
|
240
|
+
);
|
|
241
|
+
|
|
242
|
+
try {
|
|
243
|
+
const context = {
|
|
244
|
+
vars: test.vars || {},
|
|
245
|
+
};
|
|
246
|
+
const response = await fetchWithTimeout(
|
|
247
|
+
assertion.value,
|
|
248
|
+
{
|
|
249
|
+
method: 'POST',
|
|
250
|
+
headers: {
|
|
251
|
+
'Content-Type': 'application/json',
|
|
252
|
+
},
|
|
253
|
+
body: JSON.stringify({ output, context }),
|
|
254
|
+
},
|
|
255
|
+
process.env.WEBHOOK_TIMEOUT ? parseInt(process.env.WEBHOOK_TIMEOUT, 10) : 5000,
|
|
256
|
+
);
|
|
257
|
+
|
|
258
|
+
if (!response.ok) {
|
|
259
|
+
throw new Error(`Webhook response status: ${response.status}`);
|
|
260
|
+
}
|
|
261
|
+
|
|
262
|
+
const jsonResponse = await response.json();
|
|
263
|
+
pass = jsonResponse.pass !== inverse;
|
|
264
|
+
} catch (err) {
|
|
265
|
+
return {
|
|
266
|
+
pass: false,
|
|
267
|
+
reason: `Webhook error: ${(err as Error).message}`,
|
|
268
|
+
};
|
|
269
|
+
}
|
|
270
|
+
|
|
271
|
+
return {
|
|
272
|
+
pass,
|
|
273
|
+
reason: pass ? 'Assertion passed' : `Webhook returned ${inverse ? 'true' : 'false'}`,
|
|
274
|
+
};
|
|
275
|
+
}
|
|
276
|
+
|
|
277
|
+
if (baseType === 'rouge-n') {
|
|
278
|
+
invariant(assertion.value, '"rouge" assertion type must a value (string or string array)');
|
|
279
|
+
return handleRougeScore(baseType, assertion, assertion.value, output, inverse);
|
|
280
|
+
}
|
|
281
|
+
|
|
103
282
|
throw new Error('Unknown assertion type: ' + assertion.type);
|
|
104
283
|
}
|
|
105
284
|
|
|
@@ -125,6 +304,7 @@ export async function matchesSimilarity(
|
|
|
125
304
|
expected: string,
|
|
126
305
|
output: string,
|
|
127
306
|
threshold: number,
|
|
307
|
+
inverse: boolean = false,
|
|
128
308
|
): Promise<GradingResult> {
|
|
129
309
|
const expectedEmbedding = await DefaultEmbeddingProvider.callEmbeddingApi(expected);
|
|
130
310
|
const outputEmbedding = await DefaultEmbeddingProvider.callEmbeddingApi(output);
|
|
@@ -155,16 +335,19 @@ export async function matchesSimilarity(
|
|
|
155
335
|
}
|
|
156
336
|
|
|
157
337
|
const similarity = cosineSimilarity(expectedEmbedding.embedding, outputEmbedding.embedding);
|
|
158
|
-
|
|
338
|
+
const pass = inverse ? similarity <= threshold : similarity >= threshold;
|
|
339
|
+
const greaterThanReason = `Similarity ${similarity} is greater than threshold ${threshold}`;
|
|
340
|
+
const lessThanReason = `Similarity ${similarity} is less than threshold ${threshold}`;
|
|
341
|
+
if (pass) {
|
|
159
342
|
return {
|
|
160
|
-
pass:
|
|
161
|
-
reason:
|
|
343
|
+
pass: true,
|
|
344
|
+
reason: inverse ? lessThanReason : greaterThanReason,
|
|
162
345
|
tokensUsed,
|
|
163
346
|
};
|
|
164
347
|
}
|
|
165
348
|
return {
|
|
166
|
-
pass:
|
|
167
|
-
reason:
|
|
349
|
+
pass: false,
|
|
350
|
+
reason: inverse ? greaterThanReason : lessThanReason,
|
|
168
351
|
tokensUsed,
|
|
169
352
|
};
|
|
170
353
|
}
|
|
@@ -224,16 +407,7 @@ export async function matchesLlmRubric(
|
|
|
224
407
|
}
|
|
225
408
|
|
|
226
409
|
export function assertionFromString(expected: string): Assertion {
|
|
227
|
-
|
|
228
|
-
if (match) {
|
|
229
|
-
const threshold = parseFloat(match[1]) || DEFAULT_SEMANTIC_SIMILARITY_THRESHOLD;
|
|
230
|
-
const rest = expected.replace(SIMILAR_REGEX, '').trim();
|
|
231
|
-
return {
|
|
232
|
-
type: 'similar',
|
|
233
|
-
value: rest,
|
|
234
|
-
threshold,
|
|
235
|
-
};
|
|
236
|
-
}
|
|
410
|
+
// Legacy options
|
|
237
411
|
if (expected.startsWith('fn:') || expected.startsWith('eval:')) {
|
|
238
412
|
// TODO(1.0): delete eval: legacy option
|
|
239
413
|
const sliceLength = expected.startsWith('fn:') ? 'fn:'.length : 'eval:'.length;
|
|
@@ -249,11 +423,48 @@ export function assertionFromString(expected: string): Assertion {
|
|
|
249
423
|
value: expected.slice(6),
|
|
250
424
|
};
|
|
251
425
|
}
|
|
426
|
+
|
|
427
|
+
// New options
|
|
428
|
+
const assertionRegex =
|
|
429
|
+
/^(not-)?(equals|contains|contains-any|contains-all|regex|icontains):(.+)$/;
|
|
430
|
+
const regexMatch = expected.match(assertionRegex);
|
|
431
|
+
|
|
432
|
+
if (regexMatch) {
|
|
433
|
+
const [_, notPrefix, type, value] = regexMatch;
|
|
434
|
+
const fullType = notPrefix ? `not-${type}` : type;
|
|
435
|
+
|
|
436
|
+
if (type === 'contains-any' || type === 'contains-all') {
|
|
437
|
+
return {
|
|
438
|
+
type: fullType as AssertionType,
|
|
439
|
+
value: value.split(',').map((s) => s.trim()),
|
|
440
|
+
};
|
|
441
|
+
} else {
|
|
442
|
+
return {
|
|
443
|
+
type: fullType as AssertionType,
|
|
444
|
+
value,
|
|
445
|
+
};
|
|
446
|
+
}
|
|
447
|
+
}
|
|
448
|
+
|
|
449
|
+
// Options that require some special handling
|
|
450
|
+
const match = expected.match(SIMILAR_REGEX);
|
|
451
|
+
if (match) {
|
|
452
|
+
const threshold = parseFloat(match[1]) || DEFAULT_SEMANTIC_SIMILARITY_THRESHOLD;
|
|
453
|
+
const rest = expected.replace(SIMILAR_REGEX, '').trim();
|
|
454
|
+
return {
|
|
455
|
+
type: 'similar',
|
|
456
|
+
value: rest,
|
|
457
|
+
threshold,
|
|
458
|
+
};
|
|
459
|
+
}
|
|
460
|
+
|
|
252
461
|
if (expected === 'is-json' || expected === 'contains-json') {
|
|
253
462
|
return {
|
|
254
463
|
type: expected,
|
|
255
464
|
};
|
|
256
465
|
}
|
|
466
|
+
|
|
467
|
+
// Default to equality
|
|
257
468
|
return {
|
|
258
469
|
type: 'equals',
|
|
259
470
|
value: expected,
|
package/src/cache.ts
CHANGED
|
@@ -4,8 +4,8 @@ import path from 'node:path';
|
|
|
4
4
|
import cacheManager from 'cache-manager';
|
|
5
5
|
import fsStore from 'cache-manager-fs-hash';
|
|
6
6
|
|
|
7
|
-
import logger from './logger
|
|
8
|
-
import { getConfigDirectoryPath, fetchWithTimeout } from './util
|
|
7
|
+
import logger from './logger';
|
|
8
|
+
import { getConfigDirectoryPath, fetchWithTimeout } from './util';
|
|
9
9
|
|
|
10
10
|
import type { Cache } from 'cache-manager';
|
|
11
11
|
import type { RequestInfo, RequestInit } from 'node-fetch';
|
package/src/evaluator.ts
CHANGED
|
@@ -4,8 +4,8 @@ import async from 'async';
|
|
|
4
4
|
import chalk from 'chalk';
|
|
5
5
|
import nunjucks from 'nunjucks';
|
|
6
6
|
|
|
7
|
-
import logger from './logger
|
|
8
|
-
import { runAssertions } from './assertions
|
|
7
|
+
import logger from './logger';
|
|
8
|
+
import { runAssertions } from './assertions';
|
|
9
9
|
|
|
10
10
|
import type { SingleBar } from 'cli-progress';
|
|
11
11
|
import type {
|
|
@@ -19,12 +19,12 @@ import type {
|
|
|
19
19
|
Prompt,
|
|
20
20
|
TestCase,
|
|
21
21
|
AtomicTestCase,
|
|
22
|
-
} from './types
|
|
23
|
-
import { generatePrompts } from './suggestions
|
|
22
|
+
} from './types';
|
|
23
|
+
import { generatePrompts } from './suggestions';
|
|
24
24
|
|
|
25
25
|
interface RunEvalOptions {
|
|
26
26
|
provider: ApiProvider;
|
|
27
|
-
prompt:
|
|
27
|
+
prompt: Prompt;
|
|
28
28
|
|
|
29
29
|
test: AtomicTestCase;
|
|
30
30
|
|
|
@@ -86,10 +86,13 @@ class Evaluator {
|
|
|
86
86
|
includeProviderId,
|
|
87
87
|
}: RunEvalOptions): Promise<EvaluateResult> {
|
|
88
88
|
const vars = test.vars || {};
|
|
89
|
-
const renderedPrompt = nunjucks.renderString(prompt, vars);
|
|
89
|
+
const renderedPrompt = nunjucks.renderString(prompt.raw, vars);
|
|
90
90
|
|
|
91
91
|
// Note that we're using original prompt, not renderedPrompt
|
|
92
|
-
|
|
92
|
+
let promptDisplay = prompt.display;
|
|
93
|
+
if (includeProviderId) {
|
|
94
|
+
promptDisplay = `[${provider.id()}] ${promptDisplay}`;
|
|
95
|
+
}
|
|
93
96
|
|
|
94
97
|
const setup = {
|
|
95
98
|
prompt: {
|
|
@@ -142,7 +145,7 @@ class Evaluator {
|
|
|
142
145
|
} catch (err) {
|
|
143
146
|
return {
|
|
144
147
|
...setup,
|
|
145
|
-
error: String(err),
|
|
148
|
+
error: String(err) + '\n\n' + (err as Error).stack,
|
|
146
149
|
success: false,
|
|
147
150
|
};
|
|
148
151
|
}
|
|
@@ -155,7 +158,7 @@ class Evaluator {
|
|
|
155
158
|
if (options.generateSuggestions) {
|
|
156
159
|
// TODO(ian): Move this into its own command/file
|
|
157
160
|
logger.info(`Generating prompt variations...`);
|
|
158
|
-
const { prompts: newPrompts, error } = await generatePrompts(testSuite.prompts[0], 1);
|
|
161
|
+
const { prompts: newPrompts, error } = await generatePrompts(testSuite.prompts[0].raw, 1);
|
|
159
162
|
if (error || !newPrompts) {
|
|
160
163
|
throw new Error(`Failed to generate prompts: ${error}`);
|
|
161
164
|
}
|
|
@@ -178,7 +181,7 @@ class Evaluator {
|
|
|
178
181
|
async (answer) => {
|
|
179
182
|
rl.close();
|
|
180
183
|
if (answer.toLowerCase().startsWith('y')) {
|
|
181
|
-
testSuite.prompts.push(prompt);
|
|
184
|
+
testSuite.prompts.push({ raw: prompt, display: prompt });
|
|
182
185
|
numAdded++;
|
|
183
186
|
} else {
|
|
184
187
|
logger.info('Skipping this prompt.');
|
|
@@ -196,13 +199,13 @@ class Evaluator {
|
|
|
196
199
|
}
|
|
197
200
|
|
|
198
201
|
// Split prompts by provider
|
|
199
|
-
for (const
|
|
202
|
+
for (const prompt of testSuite.prompts) {
|
|
200
203
|
for (const provider of testSuite.providers) {
|
|
201
|
-
const
|
|
202
|
-
testSuite.providers.length > 1 ? `[${provider.id()}] ${
|
|
204
|
+
const updatedDisplay =
|
|
205
|
+
testSuite.providers.length > 1 ? `[${provider.id()}] ${prompt.display}` : prompt.display;
|
|
203
206
|
prompts.push({
|
|
204
|
-
|
|
205
|
-
display,
|
|
207
|
+
...prompt,
|
|
208
|
+
display: updatedDisplay,
|
|
206
209
|
});
|
|
207
210
|
}
|
|
208
211
|
}
|
|
@@ -248,6 +251,7 @@ class Evaluator {
|
|
|
248
251
|
// And progress bar...
|
|
249
252
|
let progressbar: SingleBar | undefined;
|
|
250
253
|
if (options.showProgressBar) {
|
|
254
|
+
// FIXME(ian): Add var combinations too
|
|
251
255
|
const totalNumRuns =
|
|
252
256
|
testSuite.prompts.length * testSuite.providers.length * (tests.length || 1);
|
|
253
257
|
const cliProgress = await import('cli-progress');
|
|
@@ -284,11 +288,14 @@ class Evaluator {
|
|
|
284
288
|
const varCombinations = generateVarCombinations(testCase.vars || {});
|
|
285
289
|
for (const vars of varCombinations) {
|
|
286
290
|
let colIndex = 0;
|
|
287
|
-
for (const
|
|
291
|
+
for (const prompt of testSuite.prompts) {
|
|
288
292
|
for (const provider of testSuite.providers) {
|
|
289
293
|
runEvalOptions.push({
|
|
290
294
|
provider,
|
|
291
|
-
prompt:
|
|
295
|
+
prompt: {
|
|
296
|
+
...prompt,
|
|
297
|
+
raw: prependToPrompt + prompt.raw + appendToPrompt,
|
|
298
|
+
},
|
|
292
299
|
test: { ...testCase, vars },
|
|
293
300
|
includeProviderId: testSuite.providers.length > 1,
|
|
294
301
|
rowIndex,
|
|
@@ -314,7 +321,7 @@ class Evaluator {
|
|
|
314
321
|
if (progressbar) {
|
|
315
322
|
progressbar.increment({
|
|
316
323
|
provider: options.provider.id(),
|
|
317
|
-
prompt: options.prompt.slice(0, 10),
|
|
324
|
+
prompt: options.prompt.raw.slice(0, 10),
|
|
318
325
|
vars: Object.entries(options.test.vars || {})
|
|
319
326
|
.map(([k, v]) => `${k}=${v}`)
|
|
320
327
|
.join(' ')
|
package/src/index.ts
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
|
-
import { evaluate as doEvaluate } from './evaluator
|
|
2
|
-
import { loadApiProviders } from './providers
|
|
3
|
-
import assertions from './assertions
|
|
4
|
-
import providers from './providers
|
|
1
|
+
import { evaluate as doEvaluate } from './evaluator';
|
|
2
|
+
import { loadApiProviders } from './providers';
|
|
3
|
+
import assertions from './assertions';
|
|
4
|
+
import providers from './providers';
|
|
5
5
|
|
|
6
|
-
import type { EvaluateOptions, TestSuite, TestSuiteConfig } from './types
|
|
7
|
-
import { readTests } from './util
|
|
6
|
+
import type { EvaluateOptions, TestSuite, TestSuiteConfig } from './types';
|
|
7
|
+
import { readTests } from './util';
|
|
8
8
|
|
|
9
|
-
export * from './types
|
|
9
|
+
export * from './types';
|
|
10
10
|
|
|
11
11
|
interface EvaluateTestSuite extends TestSuiteConfig {
|
|
12
12
|
prompts: string[];
|
|
@@ -15,9 +15,14 @@ interface EvaluateTestSuite extends TestSuiteConfig {
|
|
|
15
15
|
async function evaluate(testSuite: EvaluateTestSuite, options: EvaluateOptions = {}) {
|
|
16
16
|
const constructedTestSuite: TestSuite = {
|
|
17
17
|
...testSuite,
|
|
18
|
-
prompts: testSuite.prompts, // raw prompts expected
|
|
19
18
|
providers: await loadApiProviders(testSuite.providers),
|
|
20
19
|
tests: await readTests(testSuite.tests),
|
|
20
|
+
|
|
21
|
+
// Full prompts expected (not filepaths)
|
|
22
|
+
prompts: testSuite.prompts.map((promptContent) => ({
|
|
23
|
+
raw: promptContent,
|
|
24
|
+
display: promptContent,
|
|
25
|
+
})),
|
|
21
26
|
};
|
|
22
27
|
return doEvaluate(constructedTestSuite, options);
|
|
23
28
|
}
|