promptfoo 0.17.7 → 0.17.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -0
- package/dist/package.json +1 -1
- package/dist/src/evaluator.d.ts.map +1 -1
- package/dist/src/evaluator.js +31 -6
- package/dist/src/evaluator.js.map +1 -1
- package/dist/src/main.js +2 -0
- package/dist/src/main.js.map +1 -1
- package/dist/src/providers/azureopenai.d.ts +4 -0
- package/dist/src/providers/azureopenai.d.ts.map +1 -1
- package/dist/src/providers/azureopenai.js +15 -0
- package/dist/src/providers/azureopenai.js.map +1 -1
- package/dist/src/providers/openai.d.ts +4 -0
- package/dist/src/providers/openai.d.ts.map +1 -1
- package/dist/src/providers/openai.js +21 -2
- package/dist/src/providers/openai.js.map +1 -1
- package/dist/src/providers/replicate.d.ts.map +1 -1
- package/dist/src/providers/replicate.js +2 -1
- package/dist/src/providers/replicate.js.map +1 -1
- package/dist/src/providers/shared.d.ts.map +1 -1
- package/dist/src/providers/shared.js.map +1 -1
- package/dist/src/types.d.ts +9 -2
- package/dist/src/types.d.ts.map +1 -1
- package/dist/src/util.d.ts +10 -3
- package/dist/src/util.d.ts.map +1 -1
- package/dist/src/util.js +125 -40
- package/dist/src/util.js.map +1 -1
- package/dist/src/web/client/assets/{index-13198388.js → index-8388d689.js} +25 -25
- package/dist/src/web/client/assets/{index-f9b230d1.css → index-d2b6a160.css} +1 -1
- package/dist/src/web/client/index.html +2 -2
- package/dist/src/web/server.d.ts.map +1 -1
- package/dist/src/web/server.js +26 -3
- package/dist/src/web/server.js.map +1 -1
- package/package.json +1 -1
- package/src/evaluator.ts +37 -6
- package/src/main.ts +3 -0
- package/src/providers/azureopenai.ts +24 -0
- package/src/providers/openai.ts +32 -3
- package/src/providers/replicate.ts +7 -3
- package/src/providers/shared.ts +3 -1
- package/src/types.ts +12 -2
- package/src/util.ts +140 -42
- package/src/web/client/src/App.tsx +24 -1
- package/src/web/client/src/ResultsTable.css +11 -1
- package/src/web/client/src/ResultsTable.tsx +10 -0
- package/src/web/client/src/ResultsView.tsx +48 -3
- package/src/web/client/src/types.ts +4 -0
- package/src/web/server.ts +33 -10
|
@@ -1 +1 @@
|
|
|
1
|
-
:root{font-family:system-ui,Avenir,Helvetica,Arial,sans-serif;font-synthesis:none;text-rendering:optimizeLegibility;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale;-webkit-text-size-adjust:100%;--background-color: #ffffff;--text-color: #404040;--border-color: lightgray;--table-border-color: lightgray;--pass-color: green;--fail-color: #ad0000;--smalltext-color: gray;--success-background-color: #d1ffd7;--variable-background-color: #f7f7f7;--header-background-color: #fffdf7}[data-theme=dark]{--background-color: #1a1a1a;--text-color: #f0f0f0;--border-color: #444444;--table-border-color: #444444;--pass-color: #4caf50;--fail-color: #f44336;--smalltext-color: #888888;--success-background-color: #216d2b;--variable-background-color: #333;--header-background-color: #333}html{font-size:calc(14px + (18 - 14) * ((100vw - 300px) / (1600 - 300)))}*{box-sizing:border-box}html{font-family:-apple-system,BlinkMacSystemFont,Segoe UI,Helvetica,Arial,sans-serif,"Apple Color Emoji","Segoe UI Emoji",Segoe UI Symbol;font-size:16px;background-color:var(--background-color);color:var(--text-color)}table,.divTable{border:1px solid var(--table-border-color);border-collapse:collapse;width:100%;margin:1rem 0;box-shadow:0 2px 4px #0000001a}.tr{display:flex}tr,.tr{width:fit-content}tr:hover,.tr:hover{background-color:#0000000d}th,.th,td,.td{position:relative;box-shadow:inset 0 0 0 1px var(--border-color);vertical-align:top;padding:1.5rem}th.variable,.th.variable,td.variable,.td.variable{background-color:var(--variable-background-color)}tr.header{background-color:var(--header-background-color)}th,.th{padding:1rem;position:relative;text-align:center;vertical-align:bottom}th .action{cursor:pointer;margin-left:.5rem}tr .cell-actions{display:flex;gap:.5rem;visibility:hidden;position:absolute;bottom:1.25rem;right:0;line-height:0;font-size:1.75rem}tr:hover .cell-actions{visibility:visible}tr .cell-actions .action{cursor:pointer}th 
.smalltext{visibility:hidden;font-weight:400;font-size:.75rem;color:var(--smalltext-color)}th:hover .smalltext{visibility:visible}th .summary{font-weight:400;font-size:.8rem;padding:.25rem}th .summary.highlight{background-color:var(--success-background-color)}td .status{margin-bottom:.5rem;font-weight:700}td .score{font-weight:400}td .pass{color:var(--pass-color)}td .fail{color:var(--fail-color)}.first-prompt-col{border-left:2px solid #888}.first-prompt-row{border-top:2px solid #888}.resizer{position:absolute;right:0;top:0;height:100%;width:5px;cursor:col-resize;user-select:none;touch-action:none;background:var(--text-color);opacity:.5}.resizer.isResizing{background:var(--text-color);opacity:1}@media (hover: hover){.resizer{opacity:0}*:hover>.resizer{opacity:1}}.logo{display:flex;align-items:center;gap:4px}.logo img{width:30px}.logo span{margin-bottom:6px;color:var(--text-color)}[data-theme=dark] .logo img{filter:invert(1)}nav{display:flex;justify-content:space-between;align-items:center;margin-bottom:1rem;color:var(--text-color)}.dark-mode-toggle{background-color:transparent;border:none;color:var(--text-color);cursor:pointer;font-size:16px;padding:8px;transition:color .3s}.dark-mode-toggle:hover{color:var(--pass-color)}body{background-color:var(--background-color);color:var(--text-color)}
|
|
1
|
+
:root{font-family:system-ui,Avenir,Helvetica,Arial,sans-serif;font-synthesis:none;text-rendering:optimizeLegibility;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale;-webkit-text-size-adjust:100%;--background-color: #ffffff;--text-color: #404040;--border-color: lightgray;--table-border-color: lightgray;--pass-color: green;--fail-color: #ad0000;--smalltext-color: gray;--success-background-color: #d1ffd7;--variable-background-color: #f7f7f7;--header-background-color: #fffdf7}[data-theme=dark]{--background-color: #1a1a1a;--text-color: #f0f0f0;--border-color: #444444;--table-border-color: #444444;--pass-color: #4caf50;--fail-color: #f44336;--smalltext-color: #888888;--success-background-color: #216d2b;--variable-background-color: #333;--header-background-color: #333}html{font-size:calc(14px + (18 - 14) * ((100vw - 300px) / (1600 - 300)))}*{box-sizing:border-box}html{font-family:-apple-system,BlinkMacSystemFont,Segoe UI,Helvetica,Arial,sans-serif,"Apple Color Emoji","Segoe UI Emoji",Segoe UI Symbol;font-size:16px;background-color:var(--background-color);color:var(--text-color)}table,.divTable{border:1px solid var(--table-border-color);border-collapse:collapse;width:100%;margin:1rem 0;box-shadow:0 2px 4px #0000001a}.tr{display:flex}tr,.tr{width:fit-content}tr:hover,.tr:hover{background-color:#0000000d}th,.th,td,.td{position:relative;box-shadow:inset 0 0 0 1px var(--border-color);vertical-align:top;padding:1.5rem}th.variable,.th.variable,td.variable,.td.variable{background-color:var(--variable-background-color)}tr.header{background-color:var(--header-background-color)}th,.th{padding:1rem;position:relative;text-align:center;vertical-align:bottom}th .action{cursor:pointer;margin-left:.5rem}tr .cell-actions{display:flex;gap:.5rem;visibility:hidden;position:absolute;bottom:1.25rem;right:0;line-height:0;font-size:1.75rem}tr .cell-detail{visibility:hidden;position:absolute;bottom:.25rem;margin-top:1rem;font-size:.75rem;color:#888}tr:hover 
.cell-actions,tr:hover .cell-detail{visibility:visible}tr .cell-actions .action{cursor:pointer}th .smalltext{visibility:hidden;font-weight:400;font-size:.75rem;color:var(--smalltext-color)}th:hover .smalltext{visibility:visible}th .summary{font-weight:400;font-size:.8rem;padding:.25rem}th .summary.highlight{background-color:var(--success-background-color)}td .status{margin-bottom:.5rem;font-weight:700}td .score{font-weight:400}td .pass{color:var(--pass-color)}td .fail{color:var(--fail-color)}.first-prompt-col{border-left:2px solid #888}.first-prompt-row{border-top:2px solid #888}.resizer{position:absolute;right:0;top:0;height:100%;width:5px;cursor:col-resize;user-select:none;touch-action:none;background:var(--text-color);opacity:.5}.resizer.isResizing{background:var(--text-color);opacity:1}@media (hover: hover){.resizer{opacity:0}*:hover>.resizer{opacity:1}}.logo{display:flex;align-items:center;gap:4px}.logo img{width:30px}.logo span{margin-bottom:6px;color:var(--text-color)}[data-theme=dark] .logo img{filter:invert(1)}nav{display:flex;justify-content:space-between;align-items:center;margin-bottom:1rem;color:var(--text-color)}.dark-mode-toggle{background-color:transparent;border:none;color:var(--text-color);cursor:pointer;font-size:16px;padding:8px;transition:color .3s}.dark-mode-toggle:hover{color:var(--pass-color)}body{background-color:var(--background-color);color:var(--text-color)}
|
|
@@ -5,8 +5,8 @@
|
|
|
5
5
|
<link rel="icon" type="image/svg+xml" href="favicon.ico" />
|
|
6
6
|
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
|
7
7
|
<title>promptfoo web viewer</title>
|
|
8
|
-
<script type="module" crossorigin src="/assets/index-13198388.js"></script>
|
|
9
|
-
<link rel="stylesheet" href="/assets/index-f9b230d1.css">
|
|
8
|
+
<script type="module" crossorigin src="/assets/index-8388d689.js"></script>
|
|
9
|
+
<link rel="stylesheet" href="/assets/index-d2b6a160.css">
|
|
10
10
|
</head>
|
|
11
11
|
<body>
|
|
12
12
|
<div id="root"></div>
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"server.d.ts","sourceRoot":"","sources":["../../../src/web/server.ts"],"names":[],"mappings":"AAeA,wBAAgB,IAAI,CAAC,IAAI,SAAQ,
|
|
1
|
+
{"version":3,"file":"server.d.ts","sourceRoot":"","sources":["../../../src/web/server.ts"],"names":[],"mappings":"AAeA,wBAAgB,IAAI,CAAC,IAAI,SAAQ,QAiFhC"}
|
package/dist/src/web/server.js
CHANGED
|
@@ -37,11 +37,34 @@ function init(port = 15500) {
|
|
|
37
37
|
// Send the initial table data when a client connects
|
|
38
38
|
socket.emit('init', readLatestJson());
|
|
39
39
|
// Watch for changes to latest.json and emit the update event
|
|
40
|
-
|
|
41
|
-
if (
|
|
40
|
+
const watcher = (0, debounce_1.default)((curr, prev) => {
|
|
41
|
+
if (curr.mtime !== prev.mtime) {
|
|
42
42
|
socket.emit('update', readLatestJson());
|
|
43
43
|
}
|
|
44
|
-
}, 250)
|
|
44
|
+
}, 250);
|
|
45
|
+
fs_1.default.watchFile(latestJsonPath, watcher);
|
|
46
|
+
// Stop watching the file when the socket connection is closed
|
|
47
|
+
socket.on('disconnect', () => {
|
|
48
|
+
fs_1.default.unwatchFile(latestJsonPath, watcher);
|
|
49
|
+
});
|
|
50
|
+
});
|
|
51
|
+
app.get('/results', (req, res) => {
|
|
52
|
+
const previousResults = (0, util_1.listPreviousResults)();
|
|
53
|
+
res.json({ data: previousResults });
|
|
54
|
+
});
|
|
55
|
+
app.get('/results/:filename', (req, res) => {
|
|
56
|
+
const filename = req.params.filename;
|
|
57
|
+
const safeFilename = node_path_1.default.basename(filename);
|
|
58
|
+
if (safeFilename !== filename || !(0, util_1.listPreviousResults)().includes(safeFilename)) {
|
|
59
|
+
res.status(400).send('Invalid filename');
|
|
60
|
+
return;
|
|
61
|
+
}
|
|
62
|
+
const result = (0, util_1.readResult)(safeFilename);
|
|
63
|
+
if (!result) {
|
|
64
|
+
res.status(404).send('Result not found');
|
|
65
|
+
return;
|
|
66
|
+
}
|
|
67
|
+
res.json({ data: result });
|
|
45
68
|
});
|
|
46
69
|
httpServer.listen(port, () => {
|
|
47
70
|
const url = `http://localhost:${port}`;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"server.js","sourceRoot":"","sources":["../../../src/web/server.ts"],"names":[],"mappings":";;;;;;AAAA,
|
|
1
|
+
{"version":3,"file":"server.js","sourceRoot":"","sources":["../../../src/web/server.ts"],"names":[],"mappings":";;;;;;AAAA,4CAA+B;AAC/B,0DAA6B;AAC7B,kEAAqC;AACrC,0DAA6B;AAE7B,wDAAgC;AAChC,sDAA8B;AAC9B,gDAAwB;AACxB,oDAA4B;AAC5B,yCAAqD;AAErD,uDAA+B;AAC/B,gCAAsC;AACtC,kCAAgF;AAEhF,SAAgB,IAAI,CAAC,IAAI,GAAG,KAAK;IAC/B,MAAM,GAAG,GAAG,IAAA,iBAAO,GAAE,CAAC;IAEtB,MAAM,SAAS,GAAG,mBAAI,CAAC,IAAI,CAAC,IAAA,kBAAY,GAAE,EAAE,KAAK,EAAE,QAAQ,CAAC,CAAC;IAE7D,GAAG,CAAC,GAAG,CAAC,IAAA,cAAI,GAAE,CAAC,CAAC;IAChB,GAAG,CAAC,GAAG,CAAC,iBAAO,CAAC,IAAI,EAAE,CAAC,CAAC;IACxB,GAAG,CAAC,GAAG,CAAC,iBAAO,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,CAAC;IAEnC,MAAM,UAAU,GAAG,mBAAI,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;IAC1C,MAAM,EAAE,GAAG,IAAI,kBAAc,CAAC,UAAU,EAAE;QACxC,IAAI,EAAE;YACJ,MAAM,EAAE,GAAG;SACZ;KACF,CAAC,CAAC;IAEH,MAAM,cAAc,GAAG,IAAA,2BAAoB,GAAE,CAAC;IAC9C,MAAM,cAAc,GAAG,GAAG,EAAE;QAC1B,MAAM,IAAI,GAAG,YAAE,CAAC,YAAY,CAAC,cAAc,EAAE,MAAM,CAAC,CAAC;QACrD,OAAO,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAC1B,CAAC,CAAC;IAEF,EAAE,CAAC,EAAE,CAAC,YAAY,EAAE,CAAC,MAAM,EAAE,EAAE;QAC7B,qDAAqD;QACrD,MAAM,CAAC,IAAI,CAAC,MAAM,EAAE,cAAc,EAAE,CAAC,CAAC;QAEtC,6DAA6D;QAC7D,MAAM,OAAO,GAAG,IAAA,kBAAQ,EAAC,CAAC,IAAW,EAAE,IAAW,EAAE,EAAE;YACpD,IAAI,IAAI,CAAC,KAAK,KAAK,IAAI,CAAC,KAAK,EAAE;gBAC7B,MAAM,CAAC,IAAI,CAAC,QAAQ,EAAE,cAAc,EAAE,CAAC,CAAC;aACzC;QACH,CAAC,EAAE,GAAG,CAAC,CAAC;QACR,YAAE,CAAC,SAAS,CAAC,cAAc,EAAE,OAAO,CAAC,CAAC;QAEtC,8DAA8D;QAC9D,MAAM,CAAC,EAAE,CAAC,YAAY,EAAE,GAAG,EAAE;YAC3B,YAAE,CAAC,WAAW,CAAC,cAAc,EAAE,OAAO,CAAC,CAAC;QAC1C,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,GAAG,CAAC,GAAG,CAAC,UAAU,EAAE,CAAC,GAAG,EAAE,GAAG,EAAE,EAAE;QAC/B,MAAM,eAAe,GAAG,IAAA,0BAAmB,GAAE,CAAC;QAC9C,GAAG,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,eAAe,EAAE,CAAC,CAAC;IACtC,CAAC,CAAC,CAAC;IAEH,GAAG,CAAC,GAAG,CAAC,oBAAoB,EAAE,CAAC,GAAG,EAAE,GAAG,EAAE,EAAE;QACzC,MAAM,QAAQ,GAAG,GAAG,CAAC,MAAM,CAAC,QAAQ,CAAC;QACrC,MAAM,YAAY,GAAG,mBAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;QAC7C,IAAI,YAAY,KAAK,QAAQ,IAAI,CAAC,IAAA,0BAAmB,GAAE,CAAC,QAAQ,CAAC,YAAY,CAAC,EAAE;YAC9E,GAAG,CAAC,MAAM,CAAC,G
AAG,CAAC,CAAC,IAAI,CAAC,kBAAkB,CAAC,CAAC;YACzC,OAAO;SACR;QACD,MAAM,MAAM,GAAG,IAAA,iBAAU,EAAC,YAAY,CAAC,CAAC;QACxC,IAAI,CAAC,MAAM,EAAE;YACX,GAAG,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,kBAAkB,CAAC,CAAC;YACzC,OAAO;SACR;QACD,GAAG,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,CAAC,CAAC;IAC7B,CAAC,CAAC,CAAC;IAEH,UAAU,CAAC,MAAM,CAAC,IAAI,EAAE,GAAG,EAAE;QAC3B,MAAM,GAAG,GAAG,oBAAoB,IAAI,EAAE,CAAC;QACvC,gBAAM,CAAC,IAAI,CAAC,uBAAuB,GAAG,EAAE,CAAC,CAAC;QAE1C,MAAM,EAAE,GAAG,uBAAQ,CAAC,eAAe,CAAC;YAClC,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,MAAM,EAAE,OAAO,CAAC,MAAM;SACvB,CAAC,CAAC;QACH,EAAE,CAAC,QAAQ,CAAC,qDAAqD,EAAE,KAAK,EAAE,MAAM,EAAE,EAAE;YAClF,IAAI,MAAM,CAAC,WAAW,EAAE,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE;gBACxC,IAAI;oBACF,MAAM,IAAA,gBAAM,EAAC,GAAG,CAAC,CAAC;oBAClB,gBAAM,CAAC,IAAI,CAAC,uBAAuB,GAAG,EAAE,CAAC,CAAC;iBAC3C;gBAAC,OAAO,GAAG,EAAE;oBACZ,gBAAM,CAAC,KAAK,CAAC,2BAA2B,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;iBACxD;aACF;YACD,EAAE,CAAC,KAAK,EAAE,CAAC;YACX,gBAAM,CAAC,IAAI,CAAC,iCAAiC,CAAC,CAAC;QACjD,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC;AAjFD,oBAiFC"}
|
package/package.json
CHANGED
package/src/evaluator.ts
CHANGED
|
@@ -3,6 +3,7 @@ import readline from 'readline';
|
|
|
3
3
|
import async from 'async';
|
|
4
4
|
import chalk from 'chalk';
|
|
5
5
|
import nunjucks from 'nunjucks';
|
|
6
|
+
import invariant from 'tiny-invariant';
|
|
6
7
|
|
|
7
8
|
import logger from './logger';
|
|
8
9
|
import telemetry from './telemetry';
|
|
@@ -110,18 +111,40 @@ class Evaluator {
|
|
|
110
111
|
vars,
|
|
111
112
|
};
|
|
112
113
|
|
|
114
|
+
let latencyMs = 0;
|
|
113
115
|
try {
|
|
116
|
+
const startTime = Date.now();
|
|
114
117
|
const response = await provider.callApi(renderedPrompt);
|
|
118
|
+
const endTime = Date.now();
|
|
119
|
+
latencyMs = endTime - startTime;
|
|
120
|
+
|
|
115
121
|
const ret: EvaluateResult = {
|
|
116
122
|
...setup,
|
|
117
123
|
response,
|
|
118
124
|
success: false,
|
|
119
125
|
score: 0,
|
|
126
|
+
latencyMs,
|
|
120
127
|
};
|
|
121
128
|
if (response.error) {
|
|
122
129
|
ret.error = response.error;
|
|
123
130
|
} else if (response.output) {
|
|
124
|
-
|
|
131
|
+
// Create a copy of response so we can potentially mutate it.
|
|
132
|
+
let processedResponse = { ...response };
|
|
133
|
+
if (test.options?.postprocess) {
|
|
134
|
+
const { postprocess } = test.options;
|
|
135
|
+
const postprocessFn = new Function(
|
|
136
|
+
'output',
|
|
137
|
+
'context',
|
|
138
|
+
postprocess.includes('\n') ? postprocess : `return ${postprocess}`,
|
|
139
|
+
);
|
|
140
|
+
processedResponse.output = postprocessFn(processedResponse.output);
|
|
141
|
+
if (processedResponse.output == null) {
|
|
142
|
+
throw new Error('Postprocess function did not return a value');
|
|
143
|
+
}
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
invariant(processedResponse.output != null, 'Response output should not be null');
|
|
147
|
+
const checkResult = await runAssertions(test, processedResponse.output);
|
|
125
148
|
if (!checkResult.pass) {
|
|
126
149
|
ret.error = checkResult.reason;
|
|
127
150
|
}
|
|
@@ -132,6 +155,7 @@ class Evaluator {
|
|
|
132
155
|
this.stats.tokenUsage.prompt += checkResult.tokensUsed.prompt;
|
|
133
156
|
this.stats.tokenUsage.completion += checkResult.tokensUsed.completion;
|
|
134
157
|
}
|
|
158
|
+
ret.response = processedResponse;
|
|
135
159
|
} else {
|
|
136
160
|
ret.success = false;
|
|
137
161
|
ret.score = 0;
|
|
@@ -159,6 +183,7 @@ class Evaluator {
|
|
|
159
183
|
error: String(err) + '\n\n' + (err as Error).stack,
|
|
160
184
|
success: false,
|
|
161
185
|
score: 0,
|
|
186
|
+
latencyMs,
|
|
162
187
|
};
|
|
163
188
|
}
|
|
164
189
|
}
|
|
@@ -232,11 +257,13 @@ class Evaluator {
|
|
|
232
257
|
// Aggregate all vars across test cases
|
|
233
258
|
|
|
234
259
|
const tests = (
|
|
235
|
-
testSuite.tests
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
260
|
+
testSuite.tests && testSuite.tests.length > 0
|
|
261
|
+
? testSuite.tests
|
|
262
|
+
: [
|
|
263
|
+
{
|
|
264
|
+
// Dummy test for cases when we're only comparing raw prompts.
|
|
265
|
+
},
|
|
266
|
+
]
|
|
240
267
|
).map((test) => {
|
|
241
268
|
const finalTestCase: TestCase = Object.assign({}, testSuite.defaultTest);
|
|
242
269
|
return Object.assign(finalTestCase, test);
|
|
@@ -270,6 +297,8 @@ class Evaluator {
|
|
|
270
297
|
testCase.options?.prefix || testSuite.defaultTest?.options?.prefix || '';
|
|
271
298
|
const appendToPrompt =
|
|
272
299
|
testCase.options?.suffix || testSuite.defaultTest?.options?.suffix || '';
|
|
300
|
+
testCase.options.postprocess =
|
|
301
|
+
testCase.options.postprocess || testSuite.defaultTest?.options?.postprocess;
|
|
273
302
|
|
|
274
303
|
// Finalize test case eval
|
|
275
304
|
const varCombinations = generateVarCombinations(testCase.vars || {});
|
|
@@ -404,6 +433,8 @@ class Evaluator {
|
|
|
404
433
|
score: row.score,
|
|
405
434
|
text: resultText,
|
|
406
435
|
prompt: row.prompt.raw,
|
|
436
|
+
latencyMs: row.latencyMs,
|
|
437
|
+
tokenUsage: row.response?.tokenUsage,
|
|
407
438
|
};
|
|
408
439
|
},
|
|
409
440
|
);
|
package/src/main.ts
CHANGED
|
@@ -11,6 +11,7 @@ import logger, { getLogLevel, setLogLevel } from './logger';
|
|
|
11
11
|
import { loadApiProvider, loadApiProviders } from './providers';
|
|
12
12
|
import { evaluate } from './evaluator';
|
|
13
13
|
import {
|
|
14
|
+
cleanupOldResults,
|
|
14
15
|
maybeReadConfig,
|
|
15
16
|
readConfig,
|
|
16
17
|
readLatestResults,
|
|
@@ -181,6 +182,7 @@ async function main() {
|
|
|
181
182
|
.action(async () => {
|
|
182
183
|
telemetry.maybeShowNotice();
|
|
183
184
|
await clearCache();
|
|
185
|
+
cleanupOldResults(0);
|
|
184
186
|
telemetry.record('command_used', {
|
|
185
187
|
name: 'cache_clear',
|
|
186
188
|
});
|
|
@@ -321,6 +323,7 @@ async function main() {
|
|
|
321
323
|
suffix: cmdObj.promptSuffix,
|
|
322
324
|
provider: cmdObj.grader,
|
|
323
325
|
// rubricPrompt:
|
|
326
|
+
// postprocess
|
|
324
327
|
},
|
|
325
328
|
...config.defaultTest,
|
|
326
329
|
};
|
|
@@ -6,6 +6,10 @@ import type { ApiProvider, ProviderEmbeddingResponse, ProviderResponse } from '.
|
|
|
6
6
|
|
|
7
7
|
interface AzureOpenAiCompletionOptions {
|
|
8
8
|
temperature?: number;
|
|
9
|
+
top_p?: number;
|
|
10
|
+
frequency_penalty?: number;
|
|
11
|
+
presence_penalty?: number;
|
|
12
|
+
best_of?: number;
|
|
9
13
|
functions?: {
|
|
10
14
|
name: string;
|
|
11
15
|
description?: string;
|
|
@@ -144,6 +148,17 @@ export class AzureOpenAiCompletionProvider extends AzureOpenAiGenericProvider {
|
|
|
144
148
|
options?.temperature ??
|
|
145
149
|
this.options.temperature ??
|
|
146
150
|
parseFloat(process.env.OPENAI_TEMPERATURE || '0'),
|
|
151
|
+
top_p: options?.top_p ?? this.options.top_p ?? parseFloat(process.env.OPENAI_TOP_P || '1'),
|
|
152
|
+
presence_penalty:
|
|
153
|
+
options?.presence_penalty ??
|
|
154
|
+
this.options.presence_penalty ??
|
|
155
|
+
parseFloat(process.env.OPENAI_PRESENCE_PENALTY || '0'),
|
|
156
|
+
frequency_penalty:
|
|
157
|
+
options?.frequency_penalty ??
|
|
158
|
+
this.options.frequency_penalty ??
|
|
159
|
+
parseFloat(process.env.OPENAI_FREQUENCY_PENALTY || '0'),
|
|
160
|
+
best_of:
|
|
161
|
+
options?.best_of ?? this.options.best_of ?? parseInt(process.env.OPENAI_BEST_OF || '1'),
|
|
147
162
|
stop,
|
|
148
163
|
};
|
|
149
164
|
logger.debug(`Calling Azure OpenAI API: ${JSON.stringify(body)}`);
|
|
@@ -214,6 +229,15 @@ export class AzureOpenAiChatCompletionProvider extends AzureOpenAiGenericProvide
|
|
|
214
229
|
options?.temperature ??
|
|
215
230
|
this.options.temperature ??
|
|
216
231
|
parseFloat(process.env.OPENAI_TEMPERATURE || '0'),
|
|
232
|
+
top_p: options?.top_p ?? this.options.top_p ?? parseFloat(process.env.OPENAI_TOP_P || '1'),
|
|
233
|
+
presence_penalty:
|
|
234
|
+
options?.presence_penalty ??
|
|
235
|
+
this.options.presence_penalty ??
|
|
236
|
+
parseFloat(process.env.OPENAI_PRESENCE_PENALTY || '0'),
|
|
237
|
+
frequency_penalty:
|
|
238
|
+
options?.frequency_penalty ??
|
|
239
|
+
this.options.frequency_penalty ??
|
|
240
|
+
parseFloat(process.env.OPENAI_FREQUENCY_PENALTY || '0'),
|
|
217
241
|
functions: options?.functions || this.options.functions || undefined,
|
|
218
242
|
function_call: options?.function_call || this.options.function_call || undefined,
|
|
219
243
|
};
|
package/src/providers/openai.ts
CHANGED
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
|
|
2
1
|
import logger from '../logger';
|
|
3
2
|
import { fetchJsonWithCache } from '../cache';
|
|
4
3
|
import { REQUEST_TIMEOUT_MS, parseChatPrompt } from './shared';
|
|
@@ -10,6 +9,10 @@ const DEFAULT_OPENAI_HOST = 'api.openai.com';
|
|
|
10
9
|
interface OpenAiCompletionOptions {
|
|
11
10
|
temperature?: number;
|
|
12
11
|
max_tokens?: number;
|
|
12
|
+
top_p?: number;
|
|
13
|
+
frequency_penalty?: number;
|
|
14
|
+
presence_penalty?: number;
|
|
15
|
+
best_of?: number;
|
|
13
16
|
functions?: {
|
|
14
17
|
name: string;
|
|
15
18
|
description?: string;
|
|
@@ -148,11 +151,25 @@ export class OpenAiCompletionProvider extends OpenAiGenericProvider {
|
|
|
148
151
|
const body = {
|
|
149
152
|
model: this.modelName,
|
|
150
153
|
prompt,
|
|
151
|
-
max_tokens:
|
|
154
|
+
max_tokens:
|
|
155
|
+
options?.max_tokens ??
|
|
156
|
+
this.options.max_tokens ??
|
|
157
|
+
parseInt(process.env.OPENAI_MAX_TOKENS || '1024'),
|
|
152
158
|
temperature:
|
|
153
159
|
options?.temperature ??
|
|
154
160
|
this.options.temperature ??
|
|
155
161
|
parseFloat(process.env.OPENAI_TEMPERATURE || '0'),
|
|
162
|
+
top_p: options?.top_p ?? this.options.top_p ?? parseFloat(process.env.OPENAI_TOP_P || '1'),
|
|
163
|
+
presence_penalty:
|
|
164
|
+
options?.presence_penalty ??
|
|
165
|
+
this.options.presence_penalty ??
|
|
166
|
+
parseFloat(process.env.OPENAI_PRESENCE_PENALTY || '0'),
|
|
167
|
+
frequency_penalty:
|
|
168
|
+
options?.frequency_penalty ??
|
|
169
|
+
this.options.frequency_penalty ??
|
|
170
|
+
parseFloat(process.env.OPENAI_FREQUENCY_PENALTY || '0'),
|
|
171
|
+
best_of:
|
|
172
|
+
options?.best_of ?? this.options.best_of ?? parseInt(process.env.OPENAI_BEST_OF || '1'),
|
|
156
173
|
stop,
|
|
157
174
|
};
|
|
158
175
|
logger.debug(`Calling OpenAI API: ${JSON.stringify(body)}`);
|
|
@@ -231,11 +248,23 @@ export class OpenAiChatCompletionProvider extends OpenAiGenericProvider {
|
|
|
231
248
|
const body = {
|
|
232
249
|
model: this.modelName,
|
|
233
250
|
messages: messages,
|
|
234
|
-
max_tokens:
|
|
251
|
+
max_tokens:
|
|
252
|
+
options?.max_tokens ??
|
|
253
|
+
this.options.max_tokens ??
|
|
254
|
+
parseInt(process.env.OPENAI_MAX_TOKENS || '1024'),
|
|
235
255
|
temperature:
|
|
236
256
|
options?.temperature ??
|
|
237
257
|
this.options.temperature ??
|
|
238
258
|
parseFloat(process.env.OPENAI_TEMPERATURE || '0'),
|
|
259
|
+
top_p: options?.top_p ?? this.options.top_p ?? parseFloat(process.env.OPENAI_TOP_P || '1'),
|
|
260
|
+
presence_penalty:
|
|
261
|
+
options?.presence_penalty ??
|
|
262
|
+
this.options.presence_penalty ??
|
|
263
|
+
parseFloat(process.env.OPENAI_PRESENCE_PENALTY || '0'),
|
|
264
|
+
frequency_penalty:
|
|
265
|
+
options?.frequency_penalty ??
|
|
266
|
+
this.options.frequency_penalty ??
|
|
267
|
+
parseFloat(process.env.OPENAI_FREQUENCY_PENALTY || '0'),
|
|
239
268
|
functions: options?.functions || this.options.functions || undefined,
|
|
240
269
|
function_call: options?.function_call || this.options.function_call || undefined,
|
|
241
270
|
};
|
|
@@ -65,9 +65,13 @@ export class ReplicateProvider implements ApiProvider {
|
|
|
65
65
|
const data = {
|
|
66
66
|
input: {
|
|
67
67
|
prompt,
|
|
68
|
-
max_length:
|
|
69
|
-
|
|
70
|
-
|
|
68
|
+
max_length:
|
|
69
|
+
this.options.max_length || parseInt(process.env.REPLICATE_MAX_LENGTH || '2046', 10),
|
|
70
|
+
temperature:
|
|
71
|
+
this.options.temperature || parseFloat(process.env.REPLICATE_TEMPERATURE || '0.01'),
|
|
72
|
+
repetition_penalty:
|
|
73
|
+
this.options.repetition_penalty ||
|
|
74
|
+
parseFloat(process.env.REPLICATE_REPETITION_PENALTY || '1.0'),
|
|
71
75
|
},
|
|
72
76
|
};
|
|
73
77
|
response = await replicate.run(this.modelName as any, data);
|
package/src/providers/shared.ts
CHANGED
|
@@ -4,7 +4,9 @@ export const REQUEST_TIMEOUT_MS = process.env.REQUEST_TIMEOUT_MS
|
|
|
4
4
|
? parseInt(process.env.REQUEST_TIMEOUT_MS, 10)
|
|
5
5
|
: 300_000;
|
|
6
6
|
|
|
7
|
-
export function parseChatPrompt(
|
|
7
|
+
export function parseChatPrompt(
|
|
8
|
+
prompt: string,
|
|
9
|
+
): { role: string; content: string; name?: string }[] {
|
|
8
10
|
const trimmedPrompt = prompt.trim();
|
|
9
11
|
if (trimmedPrompt.startsWith('- role:')) {
|
|
10
12
|
try {
|
package/src/types.ts
CHANGED
|
@@ -73,6 +73,10 @@ export interface PromptConfig {
|
|
|
73
73
|
suffix?: string;
|
|
74
74
|
}
|
|
75
75
|
|
|
76
|
+
export interface OutputConfig {
|
|
77
|
+
postprocess?: string;
|
|
78
|
+
}
|
|
79
|
+
|
|
76
80
|
export interface EvaluateOptions {
|
|
77
81
|
maxConcurrency?: number;
|
|
78
82
|
showProgressBar?: boolean;
|
|
@@ -92,6 +96,7 @@ export interface EvaluateResult {
|
|
|
92
96
|
error?: string;
|
|
93
97
|
success: boolean;
|
|
94
98
|
score: number;
|
|
99
|
+
latencyMs: number;
|
|
95
100
|
}
|
|
96
101
|
|
|
97
102
|
export interface EvaluateTableOutput {
|
|
@@ -99,6 +104,8 @@ export interface EvaluateTableOutput {
|
|
|
99
104
|
score: number;
|
|
100
105
|
text: string;
|
|
101
106
|
prompt: string;
|
|
107
|
+
latencyMs: number;
|
|
108
|
+
tokenUsage?: Partial<TokenUsage>;
|
|
102
109
|
}
|
|
103
110
|
|
|
104
111
|
export interface EvaluateTable {
|
|
@@ -181,11 +188,14 @@ export interface TestCase {
|
|
|
181
188
|
// Key-value pairs to substitute in the prompt
|
|
182
189
|
vars?: Record<string, string | string[] | object>;
|
|
183
190
|
|
|
191
|
+
// Optional filepath or glob pattern to load vars from
|
|
192
|
+
loadVars?: string | string[];
|
|
193
|
+
|
|
184
194
|
// Optional list of automatic checks to run on the LLM output
|
|
185
195
|
assert?: Assertion[];
|
|
186
196
|
|
|
187
197
|
// Additional configuration settings for the prompt
|
|
188
|
-
options?: PromptConfig & GradingConfig;
|
|
198
|
+
options?: PromptConfig & OutputConfig & GradingConfig;
|
|
189
199
|
}
|
|
190
200
|
|
|
191
201
|
// Same as a TestCase, except the `vars` object has been flattened into its final form.
|
|
@@ -231,7 +241,7 @@ export interface TestSuiteConfig {
|
|
|
231
241
|
prompts: string | string[];
|
|
232
242
|
|
|
233
243
|
// Path to a test file, OR list of LLM prompt variations (aka "test case")
|
|
234
|
-
tests: string | TestCase[];
|
|
244
|
+
tests: string | string[] | TestCase[];
|
|
235
245
|
|
|
236
246
|
// Sets the default properties for each test case. Useful for setting an assertion, on all test cases, for example.
|
|
237
247
|
defaultTest?: Omit<TestCase, 'description'>;
|