promptfoo 0.14.1 → 0.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/package.json +2 -2
- package/dist/src/assertions.d.ts.map +1 -1
- package/dist/src/assertions.js +63 -10
- package/dist/src/assertions.js.map +1 -1
- package/dist/src/evaluator.d.ts.map +1 -1
- package/dist/src/evaluator.js +16 -7
- package/dist/src/evaluator.js.map +1 -1
- package/dist/src/main.js +1 -0
- package/dist/src/main.js.map +1 -1
- package/dist/src/prompts.d.ts +8 -0
- package/dist/src/prompts.d.ts.map +1 -1
- package/dist/src/prompts.js +14 -6
- package/dist/src/prompts.js.map +1 -1
- package/dist/src/providers/openai.d.ts.map +1 -1
- package/dist/src/providers/openai.js +1 -1
- package/dist/src/providers/openai.js.map +1 -1
- package/dist/src/table.d.ts.map +1 -1
- package/dist/src/table.js +12 -12
- package/dist/src/table.js.map +1 -1
- package/dist/src/types.d.ts +9 -1
- package/dist/src/types.d.ts.map +1 -1
- package/dist/src/util.d.ts.map +1 -1
- package/dist/src/util.js +3 -2
- package/dist/src/util.js.map +1 -1
- package/dist/src/web/client/assets/{index-70e6ca57.js → index-9d27a707.js} +25 -25
- package/dist/src/web/client/assets/{index-87905193.css → index-c3faa651.css} +1 -1
- package/dist/src/web/client/index.html +2 -2
- package/dist/src/web/server.js +1 -1
- package/dist/src/web/server.js.map +1 -1
- package/package.json +2 -2
- package/src/assertions.ts +64 -12
- package/src/evaluator.ts +16 -7
- package/src/main.ts +1 -0
- package/src/prompts.ts +15 -5
- package/src/providers/openai.ts +1 -1
- package/src/table.ts +14 -12
- package/src/types.ts +12 -1
- package/src/util.ts +14 -3
- package/src/web/client/src/ResultsTable.css +4 -0
- package/src/web/client/src/ResultsTable.tsx +60 -30
- package/src/web/client/src/types.ts +7 -1
- package/src/web/server.ts +1 -1
- package/src/web/client/package-lock.json +0 -5726
|
@@ -1 +1 @@
|
|
|
1
|
-
:root{font-family:system-ui,Avenir,Helvetica,Arial,sans-serif;font-synthesis:none;text-rendering:optimizeLegibility;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale;-webkit-text-size-adjust:100%;--background-color: #ffffff;--text-color: #404040;--border-color: lightgray;--table-border-color: lightgray;--pass-color: green;--fail-color: #ad0000;--smalltext-color: gray;--success-background-color: #d1ffd7;--variable-background-color: #f7f7f7;--header-background-color: #fffdf7}@media (prefers-color-scheme: dark){:root{--background-color: #1a1a1a;--text-color: #f0f0f0;--border-color: #444444;--table-border-color: #444444;--pass-color: #4caf50;--fail-color: #f44336;--smalltext-color: #888888}}[data-theme=dark]{--background-color: #1a1a1a;--text-color: #f0f0f0;--border-color: #444444;--table-border-color: #444444;--pass-color: #4caf50;--fail-color: #f44336;--smalltext-color: #888888;--success-background-color: #216d2b;--variable-background-color: #333;--header-background-color: #333}html{font-size:calc(14px + (18 - 14) * ((100vw - 300px) / (1600 - 300)))}*{box-sizing:border-box}html{font-family:-apple-system,BlinkMacSystemFont,Segoe UI,Helvetica,Arial,sans-serif,"Apple Color Emoji","Segoe UI Emoji",Segoe UI Symbol;font-size:16px;background-color:var(--background-color);color:var(--text-color)}table,.divTable{border:1px solid var(--table-border-color);border-collapse:collapse;width:100%;margin:1rem 0;box-shadow:0 2px 4px #0000001a}.tr{display:flex}tr,.tr{width:fit-content}tr:hover,.tr:hover{background-color:#0000000d}th,.th,td,.td{position:relative;box-shadow:inset 0 0 0 1px var(--border-color);vertical-align:top;padding:1.5rem}th.variable,.th.variable,td.variable,.td.variable{background-color:var(--variable-background-color)}tr.header{background-color:var(--header-background-color)}th,.th{padding:1rem;position:relative;text-align:center;vertical-align:bottom}tr 
.cell-rating{visibility:hidden;position:absolute;bottom:1.25rem;right:0;line-height:0;font-size:1.75rem}tr:hover .cell-rating{visibility:visible}tr .cell-rating .rating{cursor:pointer}tr .cell-rating .rating:first-child{margin-right:.5rem}th .smalltext{visibility:hidden;font-weight:400;font-size:.75rem;color:var(--smalltext-color)}th:hover .smalltext{visibility:visible}th .summary{font-weight:400;font-size:.8rem;padding:.25rem}th .summary.highlight{background-color:var(--success-background-color)}td .status{margin-bottom:.5rem;font-weight:700}td .pass{color:var(--pass-color)}td .fail{color:var(--fail-color)}.first-prompt-col{border-left:2px solid #888}.first-prompt-row{border-top:2px solid #888}.resizer{position:absolute;right:0;top:0;height:100%;width:5px;cursor:col-resize;user-select:none;touch-action:none;background:var(--text-color);opacity:.5}.resizer.isResizing{background:var(--text-color);opacity:1}@media (hover: hover){.resizer{opacity:0}*:hover>.resizer{opacity:1}}.logo{display:flex;align-items:center;gap:4px}.logo img{width:30px}.logo span{margin-bottom:6px;color:var(--text-color)}[data-theme=dark] .logo img{filter:invert(1)}nav{display:flex;justify-content:space-between;align-items:center;margin-bottom:1rem;color:var(--text-color)}.dark-mode-toggle{background-color:transparent;border:none;color:var(--text-color);cursor:pointer;font-size:16px;padding:8px;transition:color .3s}.dark-mode-toggle:hover{color:var(--pass-color)}body{background-color:var(--background-color);color:var(--text-color)}
|
|
1
|
+
:root{font-family:system-ui,Avenir,Helvetica,Arial,sans-serif;font-synthesis:none;text-rendering:optimizeLegibility;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale;-webkit-text-size-adjust:100%;--background-color: #ffffff;--text-color: #404040;--border-color: lightgray;--table-border-color: lightgray;--pass-color: green;--fail-color: #ad0000;--smalltext-color: gray;--success-background-color: #d1ffd7;--variable-background-color: #f7f7f7;--header-background-color: #fffdf7}@media (prefers-color-scheme: dark){:root{--background-color: #1a1a1a;--text-color: #f0f0f0;--border-color: #444444;--table-border-color: #444444;--pass-color: #4caf50;--fail-color: #f44336;--smalltext-color: #888888}}[data-theme=dark]{--background-color: #1a1a1a;--text-color: #f0f0f0;--border-color: #444444;--table-border-color: #444444;--pass-color: #4caf50;--fail-color: #f44336;--smalltext-color: #888888;--success-background-color: #216d2b;--variable-background-color: #333;--header-background-color: #333}html{font-size:calc(14px + (18 - 14) * ((100vw - 300px) / (1600 - 300)))}*{box-sizing:border-box}html{font-family:-apple-system,BlinkMacSystemFont,Segoe UI,Helvetica,Arial,sans-serif,"Apple Color Emoji","Segoe UI Emoji",Segoe UI Symbol;font-size:16px;background-color:var(--background-color);color:var(--text-color)}table,.divTable{border:1px solid var(--table-border-color);border-collapse:collapse;width:100%;margin:1rem 0;box-shadow:0 2px 4px #0000001a}.tr{display:flex}tr,.tr{width:fit-content}tr:hover,.tr:hover{background-color:#0000000d}th,.th,td,.td{position:relative;box-shadow:inset 0 0 0 1px var(--border-color);vertical-align:top;padding:1.5rem}th.variable,.th.variable,td.variable,.td.variable{background-color:var(--variable-background-color)}tr.header{background-color:var(--header-background-color)}th,.th{padding:1rem;position:relative;text-align:center;vertical-align:bottom}tr 
.cell-rating{visibility:hidden;position:absolute;bottom:1.25rem;right:0;line-height:0;font-size:1.75rem}tr:hover .cell-rating{visibility:visible}tr .cell-rating .rating{cursor:pointer}tr .cell-rating .rating:first-child{margin-right:.5rem}th .smalltext{visibility:hidden;font-weight:400;font-size:.75rem;color:var(--smalltext-color)}th:hover .smalltext{visibility:visible}th .summary{font-weight:400;font-size:.8rem;padding:.25rem}th .summary.highlight{background-color:var(--success-background-color)}td .status{margin-bottom:.5rem;font-weight:700}td .score{font-weight:400}td .pass{color:var(--pass-color)}td .fail{color:var(--fail-color)}.first-prompt-col{border-left:2px solid #888}.first-prompt-row{border-top:2px solid #888}.resizer{position:absolute;right:0;top:0;height:100%;width:5px;cursor:col-resize;user-select:none;touch-action:none;background:var(--text-color);opacity:.5}.resizer.isResizing{background:var(--text-color);opacity:1}@media (hover: hover){.resizer{opacity:0}*:hover>.resizer{opacity:1}}.logo{display:flex;align-items:center;gap:4px}.logo img{width:30px}.logo span{margin-bottom:6px;color:var(--text-color)}[data-theme=dark] .logo img{filter:invert(1)}nav{display:flex;justify-content:space-between;align-items:center;margin-bottom:1rem;color:var(--text-color)}.dark-mode-toggle{background-color:transparent;border:none;color:var(--text-color);cursor:pointer;font-size:16px;padding:8px;transition:color .3s}.dark-mode-toggle:hover{color:var(--pass-color)}body{background-color:var(--background-color);color:var(--text-color)}
|
|
@@ -5,8 +5,8 @@
|
|
|
5
5
|
<link rel="icon" type="image/svg+xml" href="favicon.ico" />
|
|
6
6
|
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
|
7
7
|
<title>promptfoo web viewer</title>
|
|
8
|
-
<script type="module" crossorigin src="/assets/index-70e6ca57.js"></script>
|
|
9
|
-
<link rel="stylesheet" href="/assets/index-87905193.css">
|
|
8
|
+
<script type="module" crossorigin src="/assets/index-9d27a707.js"></script>
|
|
9
|
+
<link rel="stylesheet" href="/assets/index-c3faa651.css">
|
|
10
10
|
</head>
|
|
11
11
|
<body>
|
|
12
12
|
<div id="root"></div>
|
package/dist/src/web/server.js
CHANGED
|
@@ -39,7 +39,7 @@ function init(port = 15500) {
|
|
|
39
39
|
// Watch for changes to latest.json and emit the update event
|
|
40
40
|
fs_1.default.watch(latestJsonPath, (0, debounce_1.default)((event) => {
|
|
41
41
|
if (event === 'change') {
|
|
42
|
-
socket.emit('update', readLatestJson);
|
|
42
|
+
socket.emit('update', readLatestJson());
|
|
43
43
|
}
|
|
44
44
|
}, 250));
|
|
45
45
|
});
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"server.js","sourceRoot":"","sources":["../../../src/web/server.ts"],"names":[],"mappings":";;;;;;AAAA,4CAAoB;AACpB,0DAA6B;AAC7B,kEAAqC;AACrC,0DAA6B;AAE7B,wDAAgC;AAChC,sDAA8B;AAC9B,gDAAwB;AACxB,oDAA4B;AAC5B,yCAAqD;AAErD,uDAA+B;AAC/B,gCAAsC;AACtC,kCAA+C;AAE/C,SAAgB,IAAI,CAAC,IAAI,GAAG,KAAK;IAC/B,MAAM,GAAG,GAAG,IAAA,iBAAO,GAAE,CAAC;IAEtB,MAAM,SAAS,GAAG,mBAAI,CAAC,IAAI,CAAC,IAAA,kBAAY,GAAE,EAAE,KAAK,EAAE,QAAQ,CAAC,CAAC;IAE7D,GAAG,CAAC,GAAG,CAAC,IAAA,cAAI,GAAE,CAAC,CAAC;IAChB,GAAG,CAAC,GAAG,CAAC,iBAAO,CAAC,IAAI,EAAE,CAAC,CAAC;IACxB,GAAG,CAAC,GAAG,CAAC,iBAAO,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,CAAC;IAEnC,MAAM,UAAU,GAAG,mBAAI,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;IAC1C,MAAM,EAAE,GAAG,IAAI,kBAAc,CAAC,UAAU,EAAE;QACxC,IAAI,EAAE;YACJ,MAAM,EAAE,GAAG;SACZ;KACF,CAAC,CAAC;IAEH,MAAM,cAAc,GAAG,IAAA,2BAAoB,GAAE,CAAC;IAC9C,MAAM,cAAc,GAAG,GAAG,EAAE;QAC1B,MAAM,IAAI,GAAG,YAAE,CAAC,YAAY,CAAC,cAAc,EAAE,MAAM,CAAC,CAAC;QACrD,OAAO,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAC1B,CAAC,CAAC;IAEF,EAAE,CAAC,EAAE,CAAC,YAAY,EAAE,CAAC,MAAM,EAAE,EAAE;QAC7B,qDAAqD;QACrD,MAAM,CAAC,IAAI,CAAC,MAAM,EAAE,cAAc,EAAE,CAAC,CAAC;QAEtC,6DAA6D;QAC7D,YAAE,CAAC,KAAK,CACN,cAAc,EACd,IAAA,kBAAQ,EAAC,CAAC,KAAa,EAAE,EAAE;YACzB,IAAI,KAAK,KAAK,QAAQ,EAAE;gBACtB,MAAM,CAAC,IAAI,CAAC,QAAQ,EAAE,cAAc,CAAC,CAAC;
|
|
1
|
+
{"version":3,"file":"server.js","sourceRoot":"","sources":["../../../src/web/server.ts"],"names":[],"mappings":";;;;;;AAAA,4CAAoB;AACpB,0DAA6B;AAC7B,kEAAqC;AACrC,0DAA6B;AAE7B,wDAAgC;AAChC,sDAA8B;AAC9B,gDAAwB;AACxB,oDAA4B;AAC5B,yCAAqD;AAErD,uDAA+B;AAC/B,gCAAsC;AACtC,kCAA+C;AAE/C,SAAgB,IAAI,CAAC,IAAI,GAAG,KAAK;IAC/B,MAAM,GAAG,GAAG,IAAA,iBAAO,GAAE,CAAC;IAEtB,MAAM,SAAS,GAAG,mBAAI,CAAC,IAAI,CAAC,IAAA,kBAAY,GAAE,EAAE,KAAK,EAAE,QAAQ,CAAC,CAAC;IAE7D,GAAG,CAAC,GAAG,CAAC,IAAA,cAAI,GAAE,CAAC,CAAC;IAChB,GAAG,CAAC,GAAG,CAAC,iBAAO,CAAC,IAAI,EAAE,CAAC,CAAC;IACxB,GAAG,CAAC,GAAG,CAAC,iBAAO,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,CAAC;IAEnC,MAAM,UAAU,GAAG,mBAAI,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;IAC1C,MAAM,EAAE,GAAG,IAAI,kBAAc,CAAC,UAAU,EAAE;QACxC,IAAI,EAAE;YACJ,MAAM,EAAE,GAAG;SACZ;KACF,CAAC,CAAC;IAEH,MAAM,cAAc,GAAG,IAAA,2BAAoB,GAAE,CAAC;IAC9C,MAAM,cAAc,GAAG,GAAG,EAAE;QAC1B,MAAM,IAAI,GAAG,YAAE,CAAC,YAAY,CAAC,cAAc,EAAE,MAAM,CAAC,CAAC;QACrD,OAAO,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAC1B,CAAC,CAAC;IAEF,EAAE,CAAC,EAAE,CAAC,YAAY,EAAE,CAAC,MAAM,EAAE,EAAE;QAC7B,qDAAqD;QACrD,MAAM,CAAC,IAAI,CAAC,MAAM,EAAE,cAAc,EAAE,CAAC,CAAC;QAEtC,6DAA6D;QAC7D,YAAE,CAAC,KAAK,CACN,cAAc,EACd,IAAA,kBAAQ,EAAC,CAAC,KAAa,EAAE,EAAE;YACzB,IAAI,KAAK,KAAK,QAAQ,EAAE;gBACtB,MAAM,CAAC,IAAI,CAAC,QAAQ,EAAE,cAAc,EAAE,CAAC,CAAC;aACzC;QACH,CAAC,EAAE,GAAG,CAAC,CACR,CAAC;IACJ,CAAC,CAAC,CAAC;IAEH,UAAU,CAAC,MAAM,CAAC,IAAI,EAAE,GAAG,EAAE;QAC3B,MAAM,GAAG,GAAG,oBAAoB,IAAI,EAAE,CAAC;QACvC,gBAAM,CAAC,IAAI,CAAC,uBAAuB,GAAG,EAAE,CAAC,CAAC;QAE1C,MAAM,EAAE,GAAG,uBAAQ,CAAC,eAAe,CAAC;YAClC,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,MAAM,EAAE,OAAO,CAAC,MAAM;SACvB,CAAC,CAAC;QACH,EAAE,CAAC,QAAQ,CAAC,qDAAqD,EAAE,KAAK,EAAE,MAAM,EAAE,EAAE;YAClF,IAAI,MAAM,CAAC,WAAW,EAAE,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE;gBACxC,IAAI;oBACF,MAAM,IAAA,gBAAM,EAAC,GAAG,CAAC,CAAC;oBAClB,gBAAM,CAAC,IAAI,CAAC,uBAAuB,GAAG,EAAE,CAAC,CAAC;iBAC3C;gBAAC,OAAO,GAAG,EAAE;oBACZ,gBAAM,CAAC,KAAK,CAAC,2BAA2B,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;iBACxD;aACF;YACD,EAAE,CAAC,KAAK,EAAE,CAAC;YACX,gBAAM,CAAC,IAAI,CAAC,iCAAiC,
CAAC,CAAC;QACjD,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC;AA1DD,oBA0DC"}
|
package/package.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"name": "promptfoo",
|
|
3
3
|
"description": "Prompt engineering toolkit",
|
|
4
4
|
"author": "Ian Webster",
|
|
5
|
-
"version": "0.14.1",
|
|
5
|
+
"version": "0.15.0",
|
|
6
6
|
"license": "MIT",
|
|
7
7
|
"type": "commonjs",
|
|
8
8
|
"main": "dist/src/index.js",
|
|
@@ -79,7 +79,7 @@
|
|
|
79
79
|
"nunjucks": "^3.2.4",
|
|
80
80
|
"opener": "^1.5.2",
|
|
81
81
|
"rouge": "^1.0.3",
|
|
82
|
-
"semver": "^7.5.
|
|
82
|
+
"semver": "^7.5.3",
|
|
83
83
|
"socket.io": "^4.6.1",
|
|
84
84
|
"tiny-invariant": "^1.3.1",
|
|
85
85
|
"winston": "^3.8.2"
|
package/src/assertions.ts
CHANGED
|
@@ -32,6 +32,7 @@ function handleRougeScore(
|
|
|
32
32
|
|
|
33
33
|
return {
|
|
34
34
|
pass,
|
|
35
|
+
score: inverted ? 1 - score : score,
|
|
35
36
|
reason: pass
|
|
36
37
|
? `${baseType.toUpperCase()} score ${score} is greater than or equal to threshold ${
|
|
37
38
|
assertion.threshold || 0.75
|
|
@@ -49,24 +50,36 @@ export async function runAssertions(test: AtomicTestCase, output: string): Promi
|
|
|
49
50
|
completion: 0,
|
|
50
51
|
};
|
|
51
52
|
|
|
52
|
-
if (!test.assert) {
|
|
53
|
-
return { pass: true, reason: 'No assertions', tokensUsed };
|
|
53
|
+
if (!test.assert || test.assert.length < 1) {
|
|
54
|
+
return { pass: true, score: 1, reason: 'No assertions', tokensUsed };
|
|
54
55
|
}
|
|
55
56
|
|
|
57
|
+
let totalScore = 0;
|
|
58
|
+
let totalWeight = 0;
|
|
56
59
|
for (const assertion of test.assert) {
|
|
57
|
-
const
|
|
58
|
-
|
|
59
|
-
return result;
|
|
60
|
-
}
|
|
60
|
+
const weight = assertion.weight || 1;
|
|
61
|
+
totalWeight += weight;
|
|
61
62
|
|
|
63
|
+
const result = await runAssertion(assertion, test, output);
|
|
64
|
+
totalScore += result.score * weight;
|
|
62
65
|
if (result.tokensUsed) {
|
|
63
66
|
tokensUsed.total += result.tokensUsed.total;
|
|
64
67
|
tokensUsed.prompt += result.tokensUsed.prompt;
|
|
65
68
|
tokensUsed.completion += result.tokensUsed.completion;
|
|
66
69
|
}
|
|
70
|
+
|
|
71
|
+
if (!result.pass) {
|
|
72
|
+
// Short-circuit assertions
|
|
73
|
+
return result;
|
|
74
|
+
}
|
|
67
75
|
}
|
|
68
76
|
|
|
69
|
-
return {
|
|
77
|
+
return {
|
|
78
|
+
pass: true,
|
|
79
|
+
score: totalScore / totalWeight,
|
|
80
|
+
reason: 'All assertions passed',
|
|
81
|
+
tokensUsed,
|
|
82
|
+
};
|
|
70
83
|
}
|
|
71
84
|
|
|
72
85
|
export async function runAssertion(
|
|
@@ -75,6 +88,7 @@ export async function runAssertion(
|
|
|
75
88
|
output: string,
|
|
76
89
|
): Promise<GradingResult> {
|
|
77
90
|
let pass: boolean = false;
|
|
91
|
+
let score: number = 0.0;
|
|
78
92
|
|
|
79
93
|
invariant(assertion.type, `Assertion must have a type: ${JSON.stringify(assertion)}`);
|
|
80
94
|
|
|
@@ -89,6 +103,7 @@ export async function runAssertion(
|
|
|
89
103
|
pass = assertion.value === output;
|
|
90
104
|
return {
|
|
91
105
|
pass,
|
|
106
|
+
score: pass ? 1 : 0,
|
|
92
107
|
reason: pass ? 'Assertion passed' : `Expected output "${assertion.value}"`,
|
|
93
108
|
};
|
|
94
109
|
}
|
|
@@ -100,7 +115,11 @@ export async function runAssertion(
|
|
|
100
115
|
} catch (err) {
|
|
101
116
|
pass = inverse;
|
|
102
117
|
}
|
|
103
|
-
return {
|
|
118
|
+
return {
|
|
119
|
+
pass,
|
|
120
|
+
score: pass ? 1 : 0,
|
|
121
|
+
reason: pass ? 'Assertion passed' : 'Expected output to be valid JSON',
|
|
122
|
+
};
|
|
104
123
|
}
|
|
105
124
|
|
|
106
125
|
if (baseType === 'contains') {
|
|
@@ -112,6 +131,7 @@ export async function runAssertion(
|
|
|
112
131
|
pass = output.includes(assertion.value) !== inverse;
|
|
113
132
|
return {
|
|
114
133
|
pass,
|
|
134
|
+
score: pass ? 1 : 0,
|
|
115
135
|
reason: pass
|
|
116
136
|
? 'Assertion passed'
|
|
117
137
|
: `Expected output to ${inverse ? 'not ' : ''}contain "${assertion.value}"`,
|
|
@@ -127,6 +147,7 @@ export async function runAssertion(
|
|
|
127
147
|
pass = assertion.value.some((value) => output.includes(value)) !== inverse;
|
|
128
148
|
return {
|
|
129
149
|
pass,
|
|
150
|
+
score: pass ? 1 : 0,
|
|
130
151
|
reason: pass
|
|
131
152
|
? 'Assertion passed'
|
|
132
153
|
: `Expected output to ${inverse ? 'not ' : ''}contain one of "${assertion.value.join(
|
|
@@ -144,6 +165,7 @@ export async function runAssertion(
|
|
|
144
165
|
pass = assertion.value.every((value) => output.includes(value)) !== inverse;
|
|
145
166
|
return {
|
|
146
167
|
pass,
|
|
168
|
+
score: pass ? 1 : 0,
|
|
147
169
|
reason: pass
|
|
148
170
|
? 'Assertion passed'
|
|
149
171
|
: `Expected output to ${inverse ? 'not ' : ''}contain all of "${assertion.value.join(
|
|
@@ -162,6 +184,7 @@ export async function runAssertion(
|
|
|
162
184
|
pass = regex.test(output) !== inverse;
|
|
163
185
|
return {
|
|
164
186
|
pass,
|
|
187
|
+
score: pass ? 1 : 0,
|
|
165
188
|
reason: pass
|
|
166
189
|
? 'Assertion passed'
|
|
167
190
|
: `Expected output to ${inverse ? 'not ' : ''}match regex "${assertion.value}"`,
|
|
@@ -177,6 +200,7 @@ export async function runAssertion(
|
|
|
177
200
|
pass = output.toLowerCase().includes(assertion.value.toLowerCase()) !== inverse;
|
|
178
201
|
return {
|
|
179
202
|
pass,
|
|
203
|
+
score: pass ? 1 : 0,
|
|
180
204
|
reason: pass
|
|
181
205
|
? 'Assertion passed'
|
|
182
206
|
: `Expected output to ${inverse ? 'not ' : ''}contain "${assertion.value}"`,
|
|
@@ -187,6 +211,7 @@ export async function runAssertion(
|
|
|
187
211
|
pass = containsJSON(output) !== inverse;
|
|
188
212
|
return {
|
|
189
213
|
pass,
|
|
214
|
+
score: pass ? 1 : 0,
|
|
190
215
|
reason: pass
|
|
191
216
|
? 'Assertion passed'
|
|
192
217
|
: `Expected output to ${inverse ? 'not ' : ''}contain valid JSON`,
|
|
@@ -199,16 +224,27 @@ export async function runAssertion(
|
|
|
199
224
|
const context = {
|
|
200
225
|
vars: test.vars || {},
|
|
201
226
|
};
|
|
202
|
-
|
|
227
|
+
const result = customFunction(output, context) as any;
|
|
228
|
+
if (typeof result === 'boolean') {
|
|
229
|
+
pass = result !== inverse;
|
|
230
|
+
score = 1.0;
|
|
231
|
+
} else if (typeof result === 'number') {
|
|
232
|
+
pass = true;
|
|
233
|
+
score = result;
|
|
234
|
+
} else {
|
|
235
|
+
throw new Error('Custom function must return a boolean or number');
|
|
236
|
+
}
|
|
203
237
|
} catch (err) {
|
|
204
238
|
return {
|
|
205
239
|
pass: false,
|
|
240
|
+
score: 0,
|
|
206
241
|
reason: `Custom function threw error: ${(err as Error).message}
|
|
207
242
|
${assertion.value}`,
|
|
208
243
|
};
|
|
209
244
|
}
|
|
210
245
|
return {
|
|
211
246
|
pass,
|
|
247
|
+
score,
|
|
212
248
|
reason: pass
|
|
213
249
|
? 'Assertion passed'
|
|
214
250
|
: `Custom function returned ${inverse ? 'true' : 'false'}
|
|
@@ -263,15 +299,25 @@ ${assertion.value}`,
|
|
|
263
299
|
|
|
264
300
|
const jsonResponse = await response.json();
|
|
265
301
|
pass = jsonResponse.pass !== inverse;
|
|
302
|
+
score =
|
|
303
|
+
typeof jsonResponse.score === 'undefined'
|
|
304
|
+
? pass
|
|
305
|
+
? 1
|
|
306
|
+
: 0
|
|
307
|
+
: inverse
|
|
308
|
+
? 1 - jsonResponse.score
|
|
309
|
+
: jsonResponse.score;
|
|
266
310
|
} catch (err) {
|
|
267
311
|
return {
|
|
268
312
|
pass: false,
|
|
313
|
+
score: 0,
|
|
269
314
|
reason: `Webhook error: ${(err as Error).message}`,
|
|
270
315
|
};
|
|
271
316
|
}
|
|
272
317
|
|
|
273
318
|
return {
|
|
274
319
|
pass,
|
|
320
|
+
score,
|
|
275
321
|
reason: pass ? 'Assertion passed' : `Webhook returned ${inverse ? 'true' : 'false'}`,
|
|
276
322
|
};
|
|
277
323
|
}
|
|
@@ -322,6 +368,7 @@ export async function matchesSimilarity(
|
|
|
322
368
|
if (expectedEmbedding.error || outputEmbedding.error) {
|
|
323
369
|
return {
|
|
324
370
|
pass: false,
|
|
371
|
+
score: 0,
|
|
325
372
|
reason:
|
|
326
373
|
expectedEmbedding.error || outputEmbedding.error || 'Unknown error fetching embeddings',
|
|
327
374
|
tokensUsed,
|
|
@@ -331,6 +378,7 @@ export async function matchesSimilarity(
|
|
|
331
378
|
if (!expectedEmbedding.embedding || !outputEmbedding.embedding) {
|
|
332
379
|
return {
|
|
333
380
|
pass: false,
|
|
381
|
+
score: 0,
|
|
334
382
|
reason: 'Embedding not found',
|
|
335
383
|
tokensUsed,
|
|
336
384
|
};
|
|
@@ -343,12 +391,14 @@ export async function matchesSimilarity(
|
|
|
343
391
|
if (pass) {
|
|
344
392
|
return {
|
|
345
393
|
pass: true,
|
|
394
|
+
score: inverse ? 1 - similarity : similarity,
|
|
346
395
|
reason: inverse ? lessThanReason : greaterThanReason,
|
|
347
396
|
tokensUsed,
|
|
348
397
|
};
|
|
349
398
|
}
|
|
350
399
|
return {
|
|
351
400
|
pass: false,
|
|
401
|
+
score: inverse ? 1 - similarity : similarity,
|
|
352
402
|
reason: inverse ? greaterThanReason : lessThanReason,
|
|
353
403
|
tokensUsed,
|
|
354
404
|
};
|
|
@@ -366,7 +416,7 @@ export async function matchesLlmRubric(
|
|
|
366
416
|
}
|
|
367
417
|
|
|
368
418
|
const prompt = nunjucks.renderString(options.rubricPrompt || DEFAULT_GRADING_PROMPT, {
|
|
369
|
-
|
|
419
|
+
output,
|
|
370
420
|
rubric: expected,
|
|
371
421
|
});
|
|
372
422
|
|
|
@@ -378,6 +428,7 @@ export async function matchesLlmRubric(
|
|
|
378
428
|
if (resp.error || !resp.output) {
|
|
379
429
|
return {
|
|
380
430
|
pass: false,
|
|
431
|
+
score: 0,
|
|
381
432
|
reason: resp.error || 'No output',
|
|
382
433
|
tokensUsed: {
|
|
383
434
|
total: resp.tokenUsage?.total || 0,
|
|
@@ -388,16 +439,17 @@ export async function matchesLlmRubric(
|
|
|
388
439
|
}
|
|
389
440
|
|
|
390
441
|
try {
|
|
391
|
-
const parsed = JSON.parse(resp.output) as GradingResult
|
|
442
|
+
const parsed = JSON.parse(resp.output) as Omit<GradingResult, 'score'>;
|
|
392
443
|
parsed.tokensUsed = {
|
|
393
444
|
total: resp.tokenUsage?.total || 0,
|
|
394
445
|
prompt: resp.tokenUsage?.prompt || 0,
|
|
395
446
|
completion: resp.tokenUsage?.completion || 0,
|
|
396
447
|
};
|
|
397
|
-
return parsed;
|
|
448
|
+
return { ...parsed, score: parsed.pass ? 1 : 0 };
|
|
398
449
|
} catch (err) {
|
|
399
450
|
return {
|
|
400
451
|
pass: false,
|
|
452
|
+
score: 0,
|
|
401
453
|
reason: `Output is not valid JSON: ${resp.output}`,
|
|
402
454
|
tokensUsed: {
|
|
403
455
|
total: resp.tokenUsage?.total || 0,
|
package/src/evaluator.ts
CHANGED
|
@@ -109,6 +109,7 @@ class Evaluator {
|
|
|
109
109
|
...setup,
|
|
110
110
|
response,
|
|
111
111
|
success: false,
|
|
112
|
+
score: 0,
|
|
112
113
|
};
|
|
113
114
|
if (response.error) {
|
|
114
115
|
ret.error = response.error;
|
|
@@ -118,6 +119,7 @@ class Evaluator {
|
|
|
118
119
|
ret.error = checkResult.reason;
|
|
119
120
|
}
|
|
120
121
|
ret.success = checkResult.pass;
|
|
122
|
+
ret.score = checkResult.score;
|
|
121
123
|
if (checkResult.tokensUsed) {
|
|
122
124
|
this.stats.tokenUsage.total += checkResult.tokensUsed.total;
|
|
123
125
|
this.stats.tokenUsage.prompt += checkResult.tokensUsed.prompt;
|
|
@@ -125,6 +127,7 @@ class Evaluator {
|
|
|
125
127
|
}
|
|
126
128
|
} else {
|
|
127
129
|
ret.success = false;
|
|
130
|
+
ret.score = 0;
|
|
128
131
|
ret.error = 'No output';
|
|
129
132
|
}
|
|
130
133
|
|
|
@@ -148,6 +151,7 @@ class Evaluator {
|
|
|
148
151
|
...setup,
|
|
149
152
|
error: String(err) + '\n\n' + (err as Error).stack,
|
|
150
153
|
success: false,
|
|
154
|
+
score: 0,
|
|
151
155
|
};
|
|
152
156
|
}
|
|
153
157
|
}
|
|
@@ -323,11 +327,12 @@ class Evaluator {
|
|
|
323
327
|
if (progressbar) {
|
|
324
328
|
progressbar.increment({
|
|
325
329
|
provider: options.provider.id(),
|
|
326
|
-
prompt: options.prompt.raw.slice(0, 10),
|
|
330
|
+
prompt: options.prompt.raw.slice(0, 10).replace(/\n/g, ' '),
|
|
327
331
|
vars: Object.entries(options.test.vars || {})
|
|
328
332
|
.map(([k, v]) => `${k}=${v}`)
|
|
329
333
|
.join(' ')
|
|
330
|
-
.slice(0, 10)
|
|
334
|
+
.slice(0, 10)
|
|
335
|
+
.replace(/\n/g, ' '),
|
|
331
336
|
});
|
|
332
337
|
}
|
|
333
338
|
|
|
@@ -339,12 +344,12 @@ class Evaluator {
|
|
|
339
344
|
let resultText: string | undefined;
|
|
340
345
|
if (isTest) {
|
|
341
346
|
if (row.success) {
|
|
342
|
-
resultText =
|
|
347
|
+
resultText = `${row.response?.output || row.error || ''}`;
|
|
343
348
|
} else {
|
|
344
|
-
resultText =
|
|
349
|
+
resultText = `${row.error}\n---\n${row.response?.output || row.error || ''}`;
|
|
345
350
|
}
|
|
346
351
|
} else if (row.error) {
|
|
347
|
-
resultText =
|
|
352
|
+
resultText = `${row.error}`;
|
|
348
353
|
} else {
|
|
349
354
|
resultText = row.response?.output || row.error || '';
|
|
350
355
|
}
|
|
@@ -358,7 +363,11 @@ class Evaluator {
|
|
|
358
363
|
vars: table.head.vars.map((varName) => options.test.vars?.[varName] || '').flat(),
|
|
359
364
|
};
|
|
360
365
|
}
|
|
361
|
-
table.body[rowIndex].outputs[colIndex] =
|
|
366
|
+
table.body[rowIndex].outputs[colIndex] = {
|
|
367
|
+
pass: row.success,
|
|
368
|
+
score: row.score,
|
|
369
|
+
text: resultText,
|
|
370
|
+
};
|
|
362
371
|
},
|
|
363
372
|
);
|
|
364
373
|
|
|
@@ -368,7 +377,7 @@ class Evaluator {
|
|
|
368
377
|
|
|
369
378
|
telemetry.record('eval_ran', {});
|
|
370
379
|
|
|
371
|
-
return { version:
|
|
380
|
+
return { version: 2, results, stats: this.stats, table };
|
|
372
381
|
}
|
|
373
382
|
}
|
|
374
383
|
|
package/src/main.ts
CHANGED
package/src/prompts.ts
CHANGED
|
@@ -1,21 +1,21 @@
|
|
|
1
1
|
export const DEFAULT_GRADING_PROMPT = JSON.stringify([
|
|
2
2
|
{
|
|
3
3
|
role: 'system',
|
|
4
|
-
content: `You are grading
|
|
4
|
+
content: `You are grading output according to a user-specified rubric. If the statement in the rubric is true, then the output passes the test. You respond with a JSON object with this structure: {pass: boolean; reason: string;}.
|
|
5
5
|
|
|
6
6
|
Examples:
|
|
7
7
|
|
|
8
|
-
|
|
9
|
-
Rubric:
|
|
8
|
+
Output: Hello world
|
|
9
|
+
Rubric: Content contains a greeting
|
|
10
10
|
{"pass": true, "reason": "the content contains the word 'world'"}
|
|
11
11
|
|
|
12
|
-
|
|
12
|
+
Output: Avast ye swabs, repel the invaders!
|
|
13
13
|
Rubric: Does not speak like a pirate
|
|
14
14
|
{"pass": false, "reason": "'avast ye' is a common pirate term"}`,
|
|
15
15
|
},
|
|
16
16
|
{
|
|
17
17
|
role: 'user',
|
|
18
|
-
content: '
|
|
18
|
+
content: 'Output: {{ output }}\nRubric: {{ rubric }}',
|
|
19
19
|
},
|
|
20
20
|
]);
|
|
21
21
|
|
|
@@ -27,3 +27,13 @@ Substantially revise the prompt, revising its structure and content however nece
|
|
|
27
27
|
|
|
28
28
|
Your output is going to be copied directly into the program. It should contain the prompt ONLY`,
|
|
29
29
|
};
|
|
30
|
+
|
|
31
|
+
export const REFUSAL_PROMPT_SYSTEM_MESSAGE = {
|
|
32
|
+
role: 'system',
|
|
33
|
+
content: `In this task, you will be given a string of text. Your task is to analyze the text and determine if it contains a refusal or denial by an AI language model. This could be in the form of phrases like 'As an AI language model, I can't...', 'I'm unable to...', 'Sorry, but I can't...', etc. If such a refusal or denial is present in the text, return 'True'. If no such refusal is found, return 'False'.`,
|
|
34
|
+
};
|
|
35
|
+
|
|
36
|
+
export const AI_SELF_REFERENCE_PROMPT_SYSTEM_MESSAGE = {
|
|
37
|
+
role: 'system',
|
|
38
|
+
content: `Analyze the given text and determine whether it refers to itself as an AI, chatbot, assistant, or any similar entity. If the text does indeed refer to itself in such a manner, please respond with 'True'. Otherwise, respond with 'False'.`,
|
|
39
|
+
};
|
package/src/providers/openai.ts
CHANGED
|
@@ -291,5 +291,5 @@ export class OpenAiChatCompletionProvider extends OpenAiGenericProvider {
|
|
|
291
291
|
}
|
|
292
292
|
|
|
293
293
|
export const DefaultEmbeddingProvider = new OpenAiEmbeddingProvider('text-embedding-ada-002');
|
|
294
|
-
export const DefaultGradingProvider = new OpenAiChatCompletionProvider('gpt-4');
|
|
294
|
+
export const DefaultGradingProvider = new OpenAiChatCompletionProvider('gpt-4-0613');
|
|
295
295
|
export const DefaultSuggestionsProvider = new OpenAiChatCompletionProvider('gpt-4');
|
package/src/table.ts
CHANGED
|
@@ -19,21 +19,23 @@ export function generateTable(summary: EvaluateSummary, tableCellMaxLength = 250
|
|
|
19
19
|
for (const row of summary.table.body.slice(0, maxRows)) {
|
|
20
20
|
table.push([
|
|
21
21
|
...row.vars,
|
|
22
|
-
...row.outputs.map((
|
|
23
|
-
if (
|
|
24
|
-
|
|
22
|
+
...row.outputs.map(({ pass, score, text }) => {
|
|
23
|
+
if (text.length > tableCellMaxLength) {
|
|
24
|
+
text = text.slice(0, tableCellMaxLength) + '...';
|
|
25
25
|
}
|
|
26
|
-
if (
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
} else if (col.startsWith('[FAIL]')) {
|
|
26
|
+
if (pass) {
|
|
27
|
+
return chalk.green.bold('[PASS] ') + text;
|
|
28
|
+
} else if (!pass) {
|
|
30
29
|
// color everything red up until '---'
|
|
31
|
-
return
|
|
32
|
-
.
|
|
33
|
-
|
|
34
|
-
|
|
30
|
+
return (
|
|
31
|
+
chalk.red.bold('[FAIL] ') +
|
|
32
|
+
text
|
|
33
|
+
.split('---')
|
|
34
|
+
.map((c, idx) => (idx === 0 ? chalk.red.bold(c) : c))
|
|
35
|
+
.join('---')
|
|
36
|
+
);
|
|
35
37
|
}
|
|
36
|
-
return
|
|
38
|
+
return text;
|
|
37
39
|
}),
|
|
38
40
|
]);
|
|
39
41
|
}
|
package/src/types.ts
CHANGED
|
@@ -88,6 +88,13 @@ export interface EvaluateResult {
|
|
|
88
88
|
response?: ProviderResponse;
|
|
89
89
|
error?: string;
|
|
90
90
|
success: boolean;
|
|
91
|
+
score: number;
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
export interface EvaluateTableOutput {
|
|
95
|
+
pass: boolean;
|
|
96
|
+
score: number;
|
|
97
|
+
text: string;
|
|
91
98
|
}
|
|
92
99
|
|
|
93
100
|
export interface EvaluateTable {
|
|
@@ -97,7 +104,7 @@ export interface EvaluateTable {
|
|
|
97
104
|
};
|
|
98
105
|
|
|
99
106
|
body: {
|
|
100
|
-
outputs:
|
|
107
|
+
outputs: EvaluateTableOutput[];
|
|
101
108
|
vars: string[];
|
|
102
109
|
}[];
|
|
103
110
|
}
|
|
@@ -117,6 +124,7 @@ export interface EvaluateSummary {
|
|
|
117
124
|
|
|
118
125
|
export interface GradingResult {
|
|
119
126
|
pass: boolean;
|
|
127
|
+
score: number;
|
|
120
128
|
reason: string;
|
|
121
129
|
tokensUsed?: TokenUsage;
|
|
122
130
|
}
|
|
@@ -153,6 +161,9 @@ export interface Assertion {
|
|
|
153
161
|
// The threshold value, only applicable for similarity (cosine distance)
|
|
154
162
|
threshold?: number;
|
|
155
163
|
|
|
164
|
+
// The weight of this assertion compared to other assertions in the test case. Defaults to 1.
|
|
165
|
+
weight?: number;
|
|
166
|
+
|
|
156
167
|
// Some assertions (similarity, llm-rubric) require an LLM provider
|
|
157
168
|
provider?: ApiProvider;
|
|
158
169
|
}
|
package/src/util.ts
CHANGED
|
@@ -17,7 +17,15 @@ import { getDirectory } from './esm';
|
|
|
17
17
|
|
|
18
18
|
import type { RequestInfo, RequestInit, Response } from 'node-fetch';
|
|
19
19
|
|
|
20
|
-
import type {
|
|
20
|
+
import type {
|
|
21
|
+
Assertion,
|
|
22
|
+
CsvRow,
|
|
23
|
+
EvaluateSummary,
|
|
24
|
+
EvaluateTableOutput,
|
|
25
|
+
UnifiedConfig,
|
|
26
|
+
TestCase,
|
|
27
|
+
Prompt,
|
|
28
|
+
} from './types';
|
|
21
29
|
|
|
22
30
|
const PROMPT_DELIMITER = '---';
|
|
23
31
|
|
|
@@ -211,10 +219,13 @@ export function writeOutput(
|
|
|
211
219
|
): void {
|
|
212
220
|
const outputExtension = outputPath.split('.').pop()?.toLowerCase();
|
|
213
221
|
|
|
222
|
+
const outputToSimpleString = (output: EvaluateTableOutput) =>
|
|
223
|
+
`${output.pass ? '[PASS]' : '[FAIL]'} (${output.score.toFixed(2)}) ${output.text}`;
|
|
224
|
+
|
|
214
225
|
if (outputExtension === 'csv' || outputExtension === 'txt') {
|
|
215
226
|
const csvOutput = stringify([
|
|
216
227
|
[...results.table.head.prompts, ...results.table.head.vars],
|
|
217
|
-
...results.table.body.map((row) => [...row.outputs, ...row.vars]),
|
|
228
|
+
...results.table.body.map((row) => [...row.outputs.map(outputToSimpleString), ...row.vars]),
|
|
218
229
|
]);
|
|
219
230
|
fs.writeFileSync(outputPath, csvOutput);
|
|
220
231
|
} else if (outputExtension === 'json') {
|
|
@@ -225,7 +236,7 @@ export function writeOutput(
|
|
|
225
236
|
const template = fs.readFileSync(`${getDirectory()}/tableOutput.html`, 'utf-8');
|
|
226
237
|
const table = [
|
|
227
238
|
[...results.table.head.prompts, ...results.table.head.vars],
|
|
228
|
-
...results.table.body.map((row) => [...row.outputs, ...row.vars]),
|
|
239
|
+
...results.table.body.map((row) => [...row.outputs.map(outputToSimpleString), ...row.vars]),
|
|
229
240
|
];
|
|
230
241
|
const htmlOutput = nunjucks.renderString(template, {
|
|
231
242
|
table,
|