promptfoo 0.17.7 → 0.17.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/package.json +1 -1
- package/dist/src/evaluator.d.ts.map +1 -1
- package/dist/src/evaluator.js +23 -6
- package/dist/src/evaluator.js.map +1 -1
- package/dist/src/main.js +2 -0
- package/dist/src/main.js.map +1 -1
- package/dist/src/providers/azureopenai.d.ts +4 -0
- package/dist/src/providers/azureopenai.d.ts.map +1 -1
- package/dist/src/providers/azureopenai.js +15 -0
- package/dist/src/providers/azureopenai.js.map +1 -1
- package/dist/src/providers/openai.d.ts +4 -0
- package/dist/src/providers/openai.d.ts.map +1 -1
- package/dist/src/providers/openai.js +21 -2
- package/dist/src/providers/openai.js.map +1 -1
- package/dist/src/providers/replicate.d.ts.map +1 -1
- package/dist/src/providers/replicate.js +2 -1
- package/dist/src/providers/replicate.js.map +1 -1
- package/dist/src/providers/shared.d.ts.map +1 -1
- package/dist/src/providers/shared.js.map +1 -1
- package/dist/src/types.d.ts +4 -1
- package/dist/src/types.d.ts.map +1 -1
- package/dist/src/util.d.ts +7 -1
- package/dist/src/util.d.ts.map +1 -1
- package/dist/src/util.js +62 -26
- package/dist/src/util.js.map +1 -1
- package/dist/src/web/client/assets/{index-13198388.js → index-0c6f887d.js} +25 -25
- package/dist/src/web/client/index.html +1 -1
- package/dist/src/web/server.d.ts.map +1 -1
- package/dist/src/web/server.js +26 -3
- package/dist/src/web/server.js.map +1 -1
- package/package.json +1 -1
- package/src/evaluator.ts +28 -6
- package/src/main.ts +3 -0
- package/src/providers/azureopenai.ts +24 -0
- package/src/providers/openai.ts +32 -3
- package/src/providers/replicate.ts +7 -3
- package/src/providers/shared.ts +3 -1
- package/src/types.ts +5 -1
- package/src/util.ts +71 -28
- package/src/web/client/src/App.tsx +24 -1
- package/src/web/client/src/ResultsView.tsx +42 -3
- package/src/web/server.ts +33 -10
- package/src/web/client/package-lock.json +0 -5726
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
<link rel="icon" type="image/svg+xml" href="favicon.ico" />
|
|
6
6
|
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
|
7
7
|
<title>promptfoo web viewer</title>
|
|
8
|
-
<script type="module" crossorigin src="/assets/index-
|
|
8
|
+
<script type="module" crossorigin src="/assets/index-0c6f887d.js"></script>
|
|
9
9
|
<link rel="stylesheet" href="/assets/index-f9b230d1.css">
|
|
10
10
|
</head>
|
|
11
11
|
<body>
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"server.d.ts","sourceRoot":"","sources":["../../../src/web/server.ts"],"names":[],"mappings":"AAeA,wBAAgB,IAAI,CAAC,IAAI,SAAQ,
|
|
1
|
+
{"version":3,"file":"server.d.ts","sourceRoot":"","sources":["../../../src/web/server.ts"],"names":[],"mappings":"AAeA,wBAAgB,IAAI,CAAC,IAAI,SAAQ,QAiFhC"}
|
package/dist/src/web/server.js
CHANGED
|
@@ -37,11 +37,34 @@ function init(port = 15500) {
|
|
|
37
37
|
// Send the initial table data when a client connects
|
|
38
38
|
socket.emit('init', readLatestJson());
|
|
39
39
|
// Watch for changes to latest.json and emit the update event
|
|
40
|
-
|
|
41
|
-
if (
|
|
40
|
+
const watcher = (0, debounce_1.default)((curr, prev) => {
|
|
41
|
+
if (curr.mtime !== prev.mtime) {
|
|
42
42
|
socket.emit('update', readLatestJson());
|
|
43
43
|
}
|
|
44
|
-
}, 250)
|
|
44
|
+
}, 250);
|
|
45
|
+
fs_1.default.watchFile(latestJsonPath, watcher);
|
|
46
|
+
// Stop watching the file when the socket connection is closed
|
|
47
|
+
socket.on('disconnect', () => {
|
|
48
|
+
fs_1.default.unwatchFile(latestJsonPath, watcher);
|
|
49
|
+
});
|
|
50
|
+
});
|
|
51
|
+
app.get('/results', (req, res) => {
|
|
52
|
+
const previousResults = (0, util_1.listPreviousResults)();
|
|
53
|
+
res.json({ data: previousResults });
|
|
54
|
+
});
|
|
55
|
+
app.get('/results/:filename', (req, res) => {
|
|
56
|
+
const filename = req.params.filename;
|
|
57
|
+
const safeFilename = node_path_1.default.basename(filename);
|
|
58
|
+
if (safeFilename !== filename || !(0, util_1.listPreviousResults)().includes(safeFilename)) {
|
|
59
|
+
res.status(400).send('Invalid filename');
|
|
60
|
+
return;
|
|
61
|
+
}
|
|
62
|
+
const result = (0, util_1.readResult)(safeFilename);
|
|
63
|
+
if (!result) {
|
|
64
|
+
res.status(404).send('Result not found');
|
|
65
|
+
return;
|
|
66
|
+
}
|
|
67
|
+
res.json({ data: result });
|
|
45
68
|
});
|
|
46
69
|
httpServer.listen(port, () => {
|
|
47
70
|
const url = `http://localhost:${port}`;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"server.js","sourceRoot":"","sources":["../../../src/web/server.ts"],"names":[],"mappings":";;;;;;AAAA,
|
|
1
|
+
{"version":3,"file":"server.js","sourceRoot":"","sources":["../../../src/web/server.ts"],"names":[],"mappings":";;;;;;AAAA,4CAA+B;AAC/B,0DAA6B;AAC7B,kEAAqC;AACrC,0DAA6B;AAE7B,wDAAgC;AAChC,sDAA8B;AAC9B,gDAAwB;AACxB,oDAA4B;AAC5B,yCAAqD;AAErD,uDAA+B;AAC/B,gCAAsC;AACtC,kCAAgF;AAEhF,SAAgB,IAAI,CAAC,IAAI,GAAG,KAAK;IAC/B,MAAM,GAAG,GAAG,IAAA,iBAAO,GAAE,CAAC;IAEtB,MAAM,SAAS,GAAG,mBAAI,CAAC,IAAI,CAAC,IAAA,kBAAY,GAAE,EAAE,KAAK,EAAE,QAAQ,CAAC,CAAC;IAE7D,GAAG,CAAC,GAAG,CAAC,IAAA,cAAI,GAAE,CAAC,CAAC;IAChB,GAAG,CAAC,GAAG,CAAC,iBAAO,CAAC,IAAI,EAAE,CAAC,CAAC;IACxB,GAAG,CAAC,GAAG,CAAC,iBAAO,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,CAAC;IAEnC,MAAM,UAAU,GAAG,mBAAI,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;IAC1C,MAAM,EAAE,GAAG,IAAI,kBAAc,CAAC,UAAU,EAAE;QACxC,IAAI,EAAE;YACJ,MAAM,EAAE,GAAG;SACZ;KACF,CAAC,CAAC;IAEH,MAAM,cAAc,GAAG,IAAA,2BAAoB,GAAE,CAAC;IAC9C,MAAM,cAAc,GAAG,GAAG,EAAE;QAC1B,MAAM,IAAI,GAAG,YAAE,CAAC,YAAY,CAAC,cAAc,EAAE,MAAM,CAAC,CAAC;QACrD,OAAO,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAC1B,CAAC,CAAC;IAEF,EAAE,CAAC,EAAE,CAAC,YAAY,EAAE,CAAC,MAAM,EAAE,EAAE;QAC7B,qDAAqD;QACrD,MAAM,CAAC,IAAI,CAAC,MAAM,EAAE,cAAc,EAAE,CAAC,CAAC;QAEtC,6DAA6D;QAC7D,MAAM,OAAO,GAAG,IAAA,kBAAQ,EAAC,CAAC,IAAW,EAAE,IAAW,EAAE,EAAE;YACpD,IAAI,IAAI,CAAC,KAAK,KAAK,IAAI,CAAC,KAAK,EAAE;gBAC7B,MAAM,CAAC,IAAI,CAAC,QAAQ,EAAE,cAAc,EAAE,CAAC,CAAC;aACzC;QACH,CAAC,EAAE,GAAG,CAAC,CAAC;QACR,YAAE,CAAC,SAAS,CAAC,cAAc,EAAE,OAAO,CAAC,CAAC;QAEtC,8DAA8D;QAC9D,MAAM,CAAC,EAAE,CAAC,YAAY,EAAE,GAAG,EAAE;YAC3B,YAAE,CAAC,WAAW,CAAC,cAAc,EAAE,OAAO,CAAC,CAAC;QAC1C,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,GAAG,CAAC,GAAG,CAAC,UAAU,EAAE,CAAC,GAAG,EAAE,GAAG,EAAE,EAAE;QAC/B,MAAM,eAAe,GAAG,IAAA,0BAAmB,GAAE,CAAC;QAC9C,GAAG,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,eAAe,EAAE,CAAC,CAAC;IACtC,CAAC,CAAC,CAAC;IAEH,GAAG,CAAC,GAAG,CAAC,oBAAoB,EAAE,CAAC,GAAG,EAAE,GAAG,EAAE,EAAE;QACzC,MAAM,QAAQ,GAAG,GAAG,CAAC,MAAM,CAAC,QAAQ,CAAC;QACrC,MAAM,YAAY,GAAG,mBAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;QAC7C,IAAI,YAAY,KAAK,QAAQ,IAAI,CAAC,IAAA,0BAAmB,GAAE,CAAC,QAAQ,CAAC,YAAY,CAAC,EAAE;YAC9E,GAAG,CAAC,MAAM,CAAC,G
AAG,CAAC,CAAC,IAAI,CAAC,kBAAkB,CAAC,CAAC;YACzC,OAAO;SACR;QACD,MAAM,MAAM,GAAG,IAAA,iBAAU,EAAC,YAAY,CAAC,CAAC;QACxC,IAAI,CAAC,MAAM,EAAE;YACX,GAAG,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,IAAI,CAAC,kBAAkB,CAAC,CAAC;YACzC,OAAO;SACR;QACD,GAAG,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,CAAC,CAAC;IAC7B,CAAC,CAAC,CAAC;IAEH,UAAU,CAAC,MAAM,CAAC,IAAI,EAAE,GAAG,EAAE;QAC3B,MAAM,GAAG,GAAG,oBAAoB,IAAI,EAAE,CAAC;QACvC,gBAAM,CAAC,IAAI,CAAC,uBAAuB,GAAG,EAAE,CAAC,CAAC;QAE1C,MAAM,EAAE,GAAG,uBAAQ,CAAC,eAAe,CAAC;YAClC,KAAK,EAAE,OAAO,CAAC,KAAK;YACpB,MAAM,EAAE,OAAO,CAAC,MAAM;SACvB,CAAC,CAAC;QACH,EAAE,CAAC,QAAQ,CAAC,qDAAqD,EAAE,KAAK,EAAE,MAAM,EAAE,EAAE;YAClF,IAAI,MAAM,CAAC,WAAW,EAAE,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE;gBACxC,IAAI;oBACF,MAAM,IAAA,gBAAM,EAAC,GAAG,CAAC,CAAC;oBAClB,gBAAM,CAAC,IAAI,CAAC,uBAAuB,GAAG,EAAE,CAAC,CAAC;iBAC3C;gBAAC,OAAO,GAAG,EAAE;oBACZ,gBAAM,CAAC,KAAK,CAAC,2BAA2B,MAAM,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC;iBACxD;aACF;YACD,EAAE,CAAC,KAAK,EAAE,CAAC;YACX,gBAAM,CAAC,IAAI,CAAC,iCAAiC,CAAC,CAAC;QACjD,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC;AAjFD,oBAiFC"}
|
package/package.json
CHANGED
package/src/evaluator.ts
CHANGED
|
@@ -3,6 +3,7 @@ import readline from 'readline';
|
|
|
3
3
|
import async from 'async';
|
|
4
4
|
import chalk from 'chalk';
|
|
5
5
|
import nunjucks from 'nunjucks';
|
|
6
|
+
import invariant from 'tiny-invariant';
|
|
6
7
|
|
|
7
8
|
import logger from './logger';
|
|
8
9
|
import telemetry from './telemetry';
|
|
@@ -121,7 +122,23 @@ class Evaluator {
|
|
|
121
122
|
if (response.error) {
|
|
122
123
|
ret.error = response.error;
|
|
123
124
|
} else if (response.output) {
|
|
124
|
-
|
|
125
|
+
// Create a copy of response so we can potentially mutate it.
|
|
126
|
+
let processedResponse = { ...response };
|
|
127
|
+
if (test.options?.postprocess) {
|
|
128
|
+
const { postprocess } = test.options;
|
|
129
|
+
const postprocessFn = new Function(
|
|
130
|
+
'output',
|
|
131
|
+
'context',
|
|
132
|
+
postprocess.includes('\n') ? postprocess : `return ${postprocess}`,
|
|
133
|
+
);
|
|
134
|
+
processedResponse.output = postprocessFn(processedResponse.output);
|
|
135
|
+
if (processedResponse.output == null) {
|
|
136
|
+
throw new Error('Postprocess function did not return a value');
|
|
137
|
+
}
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
invariant(processedResponse.output != null, 'Response output should not be null');
|
|
141
|
+
const checkResult = await runAssertions(test, processedResponse.output);
|
|
125
142
|
if (!checkResult.pass) {
|
|
126
143
|
ret.error = checkResult.reason;
|
|
127
144
|
}
|
|
@@ -132,6 +149,7 @@ class Evaluator {
|
|
|
132
149
|
this.stats.tokenUsage.prompt += checkResult.tokensUsed.prompt;
|
|
133
150
|
this.stats.tokenUsage.completion += checkResult.tokensUsed.completion;
|
|
134
151
|
}
|
|
152
|
+
ret.response = processedResponse;
|
|
135
153
|
} else {
|
|
136
154
|
ret.success = false;
|
|
137
155
|
ret.score = 0;
|
|
@@ -232,11 +250,13 @@ class Evaluator {
|
|
|
232
250
|
// Aggregate all vars across test cases
|
|
233
251
|
|
|
234
252
|
const tests = (
|
|
235
|
-
testSuite.tests
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
253
|
+
testSuite.tests && testSuite.tests.length > 0
|
|
254
|
+
? testSuite.tests
|
|
255
|
+
: [
|
|
256
|
+
{
|
|
257
|
+
// Dummy test for cases when we're only comparing raw prompts.
|
|
258
|
+
},
|
|
259
|
+
]
|
|
240
260
|
).map((test) => {
|
|
241
261
|
const finalTestCase: TestCase = Object.assign({}, testSuite.defaultTest);
|
|
242
262
|
return Object.assign(finalTestCase, test);
|
|
@@ -270,6 +290,8 @@ class Evaluator {
|
|
|
270
290
|
testCase.options?.prefix || testSuite.defaultTest?.options?.prefix || '';
|
|
271
291
|
const appendToPrompt =
|
|
272
292
|
testCase.options?.suffix || testSuite.defaultTest?.options?.suffix || '';
|
|
293
|
+
testCase.options.postprocess =
|
|
294
|
+
testCase.options.postprocess || testSuite.defaultTest?.options?.postprocess;
|
|
273
295
|
|
|
274
296
|
// Finalize test case eval
|
|
275
297
|
const varCombinations = generateVarCombinations(testCase.vars || {});
|
package/src/main.ts
CHANGED
|
@@ -11,6 +11,7 @@ import logger, { getLogLevel, setLogLevel } from './logger';
|
|
|
11
11
|
import { loadApiProvider, loadApiProviders } from './providers';
|
|
12
12
|
import { evaluate } from './evaluator';
|
|
13
13
|
import {
|
|
14
|
+
cleanupOldResults,
|
|
14
15
|
maybeReadConfig,
|
|
15
16
|
readConfig,
|
|
16
17
|
readLatestResults,
|
|
@@ -181,6 +182,7 @@ async function main() {
|
|
|
181
182
|
.action(async () => {
|
|
182
183
|
telemetry.maybeShowNotice();
|
|
183
184
|
await clearCache();
|
|
185
|
+
cleanupOldResults(0);
|
|
184
186
|
telemetry.record('command_used', {
|
|
185
187
|
name: 'cache_clear',
|
|
186
188
|
});
|
|
@@ -321,6 +323,7 @@ async function main() {
|
|
|
321
323
|
suffix: cmdObj.promptSuffix,
|
|
322
324
|
provider: cmdObj.grader,
|
|
323
325
|
// rubricPrompt:
|
|
326
|
+
// postprocess
|
|
324
327
|
},
|
|
325
328
|
...config.defaultTest,
|
|
326
329
|
};
|
|
@@ -6,6 +6,10 @@ import type { ApiProvider, ProviderEmbeddingResponse, ProviderResponse } from '.
|
|
|
6
6
|
|
|
7
7
|
interface AzureOpenAiCompletionOptions {
|
|
8
8
|
temperature?: number;
|
|
9
|
+
top_p?: number;
|
|
10
|
+
frequency_penalty?: number;
|
|
11
|
+
presence_penalty?: number;
|
|
12
|
+
best_of?: number;
|
|
9
13
|
functions?: {
|
|
10
14
|
name: string;
|
|
11
15
|
description?: string;
|
|
@@ -144,6 +148,17 @@ export class AzureOpenAiCompletionProvider extends AzureOpenAiGenericProvider {
|
|
|
144
148
|
options?.temperature ??
|
|
145
149
|
this.options.temperature ??
|
|
146
150
|
parseFloat(process.env.OPENAI_TEMPERATURE || '0'),
|
|
151
|
+
top_p: options?.top_p ?? this.options.top_p ?? parseFloat(process.env.OPENAI_TOP_P || '1'),
|
|
152
|
+
presence_penalty:
|
|
153
|
+
options?.presence_penalty ??
|
|
154
|
+
this.options.presence_penalty ??
|
|
155
|
+
parseFloat(process.env.OPENAI_PRESENCE_PENALTY || '0'),
|
|
156
|
+
frequency_penalty:
|
|
157
|
+
options?.frequency_penalty ??
|
|
158
|
+
this.options.frequency_penalty ??
|
|
159
|
+
parseFloat(process.env.OPENAI_FREQUENCY_PENALTY || '0'),
|
|
160
|
+
best_of:
|
|
161
|
+
options?.best_of ?? this.options.best_of ?? parseInt(process.env.OPENAI_BEST_OF || '1'),
|
|
147
162
|
stop,
|
|
148
163
|
};
|
|
149
164
|
logger.debug(`Calling Azure OpenAI API: ${JSON.stringify(body)}`);
|
|
@@ -214,6 +229,15 @@ export class AzureOpenAiChatCompletionProvider extends AzureOpenAiGenericProvide
|
|
|
214
229
|
options?.temperature ??
|
|
215
230
|
this.options.temperature ??
|
|
216
231
|
parseFloat(process.env.OPENAI_TEMPERATURE || '0'),
|
|
232
|
+
top_p: options?.top_p ?? this.options.top_p ?? parseFloat(process.env.OPENAI_TOP_P || '1'),
|
|
233
|
+
presence_penalty:
|
|
234
|
+
options?.presence_penalty ??
|
|
235
|
+
this.options.presence_penalty ??
|
|
236
|
+
parseFloat(process.env.OPENAI_PRESENCE_PENALTY || '0'),
|
|
237
|
+
frequency_penalty:
|
|
238
|
+
options?.frequency_penalty ??
|
|
239
|
+
this.options.frequency_penalty ??
|
|
240
|
+
parseFloat(process.env.OPENAI_FREQUENCY_PENALTY || '0'),
|
|
217
241
|
functions: options?.functions || this.options.functions || undefined,
|
|
218
242
|
function_call: options?.function_call || this.options.function_call || undefined,
|
|
219
243
|
};
|
package/src/providers/openai.ts
CHANGED
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
|
|
2
1
|
import logger from '../logger';
|
|
3
2
|
import { fetchJsonWithCache } from '../cache';
|
|
4
3
|
import { REQUEST_TIMEOUT_MS, parseChatPrompt } from './shared';
|
|
@@ -10,6 +9,10 @@ const DEFAULT_OPENAI_HOST = 'api.openai.com';
|
|
|
10
9
|
interface OpenAiCompletionOptions {
|
|
11
10
|
temperature?: number;
|
|
12
11
|
max_tokens?: number;
|
|
12
|
+
top_p?: number;
|
|
13
|
+
frequency_penalty?: number;
|
|
14
|
+
presence_penalty?: number;
|
|
15
|
+
best_of?: number;
|
|
13
16
|
functions?: {
|
|
14
17
|
name: string;
|
|
15
18
|
description?: string;
|
|
@@ -148,11 +151,25 @@ export class OpenAiCompletionProvider extends OpenAiGenericProvider {
|
|
|
148
151
|
const body = {
|
|
149
152
|
model: this.modelName,
|
|
150
153
|
prompt,
|
|
151
|
-
max_tokens:
|
|
154
|
+
max_tokens:
|
|
155
|
+
options?.max_tokens ??
|
|
156
|
+
this.options.max_tokens ??
|
|
157
|
+
parseInt(process.env.OPENAI_MAX_TOKENS || '1024'),
|
|
152
158
|
temperature:
|
|
153
159
|
options?.temperature ??
|
|
154
160
|
this.options.temperature ??
|
|
155
161
|
parseFloat(process.env.OPENAI_TEMPERATURE || '0'),
|
|
162
|
+
top_p: options?.top_p ?? this.options.top_p ?? parseFloat(process.env.OPENAI_TOP_P || '1'),
|
|
163
|
+
presence_penalty:
|
|
164
|
+
options?.presence_penalty ??
|
|
165
|
+
this.options.presence_penalty ??
|
|
166
|
+
parseFloat(process.env.OPENAI_PRESENCE_PENALTY || '0'),
|
|
167
|
+
frequency_penalty:
|
|
168
|
+
options?.frequency_penalty ??
|
|
169
|
+
this.options.frequency_penalty ??
|
|
170
|
+
parseFloat(process.env.OPENAI_FREQUENCY_PENALTY || '0'),
|
|
171
|
+
best_of:
|
|
172
|
+
options?.best_of ?? this.options.best_of ?? parseInt(process.env.OPENAI_BEST_OF || '1'),
|
|
156
173
|
stop,
|
|
157
174
|
};
|
|
158
175
|
logger.debug(`Calling OpenAI API: ${JSON.stringify(body)}`);
|
|
@@ -231,11 +248,23 @@ export class OpenAiChatCompletionProvider extends OpenAiGenericProvider {
|
|
|
231
248
|
const body = {
|
|
232
249
|
model: this.modelName,
|
|
233
250
|
messages: messages,
|
|
234
|
-
max_tokens:
|
|
251
|
+
max_tokens:
|
|
252
|
+
options?.max_tokens ??
|
|
253
|
+
this.options.max_tokens ??
|
|
254
|
+
parseInt(process.env.OPENAI_MAX_TOKENS || '1024'),
|
|
235
255
|
temperature:
|
|
236
256
|
options?.temperature ??
|
|
237
257
|
this.options.temperature ??
|
|
238
258
|
parseFloat(process.env.OPENAI_TEMPERATURE || '0'),
|
|
259
|
+
top_p: options?.top_p ?? this.options.top_p ?? parseFloat(process.env.OPENAI_TOP_P || '1'),
|
|
260
|
+
presence_penalty:
|
|
261
|
+
options?.presence_penalty ??
|
|
262
|
+
this.options.presence_penalty ??
|
|
263
|
+
parseFloat(process.env.OPENAI_PRESENCE_PENALTY || '0'),
|
|
264
|
+
frequency_penalty:
|
|
265
|
+
options?.frequency_penalty ??
|
|
266
|
+
this.options.frequency_penalty ??
|
|
267
|
+
parseFloat(process.env.OPENAI_FREQUENCY_PENALTY || '0'),
|
|
239
268
|
functions: options?.functions || this.options.functions || undefined,
|
|
240
269
|
function_call: options?.function_call || this.options.function_call || undefined,
|
|
241
270
|
};
|
|
@@ -65,9 +65,13 @@ export class ReplicateProvider implements ApiProvider {
|
|
|
65
65
|
const data = {
|
|
66
66
|
input: {
|
|
67
67
|
prompt,
|
|
68
|
-
max_length:
|
|
69
|
-
|
|
70
|
-
|
|
68
|
+
max_length:
|
|
69
|
+
this.options.max_length || parseInt(process.env.REPLICATE_MAX_LENGTH || '2046', 10),
|
|
70
|
+
temperature:
|
|
71
|
+
this.options.temperature || parseFloat(process.env.REPLICATE_TEMPERATURE || '0.01'),
|
|
72
|
+
repetition_penalty:
|
|
73
|
+
this.options.repetition_penalty ||
|
|
74
|
+
parseFloat(process.env.REPLICATE_REPETITION_PENALTY || '1.0'),
|
|
71
75
|
},
|
|
72
76
|
};
|
|
73
77
|
response = await replicate.run(this.modelName as any, data);
|
package/src/providers/shared.ts
CHANGED
|
@@ -4,7 +4,9 @@ export const REQUEST_TIMEOUT_MS = process.env.REQUEST_TIMEOUT_MS
|
|
|
4
4
|
? parseInt(process.env.REQUEST_TIMEOUT_MS, 10)
|
|
5
5
|
: 300_000;
|
|
6
6
|
|
|
7
|
-
export function parseChatPrompt(
|
|
7
|
+
export function parseChatPrompt(
|
|
8
|
+
prompt: string,
|
|
9
|
+
): { role: string; content: string; name?: string }[] {
|
|
8
10
|
const trimmedPrompt = prompt.trim();
|
|
9
11
|
if (trimmedPrompt.startsWith('- role:')) {
|
|
10
12
|
try {
|
package/src/types.ts
CHANGED
|
@@ -73,6 +73,10 @@ export interface PromptConfig {
|
|
|
73
73
|
suffix?: string;
|
|
74
74
|
}
|
|
75
75
|
|
|
76
|
+
export interface OutputConfig {
|
|
77
|
+
postprocess?: string;
|
|
78
|
+
}
|
|
79
|
+
|
|
76
80
|
export interface EvaluateOptions {
|
|
77
81
|
maxConcurrency?: number;
|
|
78
82
|
showProgressBar?: boolean;
|
|
@@ -185,7 +189,7 @@ export interface TestCase {
|
|
|
185
189
|
assert?: Assertion[];
|
|
186
190
|
|
|
187
191
|
// Additional configuration settings for the prompt
|
|
188
|
-
options?: PromptConfig & GradingConfig;
|
|
192
|
+
options?: PromptConfig & OutputConfig & GradingConfig;
|
|
189
193
|
}
|
|
190
194
|
|
|
191
195
|
// Same as a TestCase, except the `vars` object has been flattened into its final form.
|
package/src/util.ts
CHANGED
|
@@ -29,7 +29,10 @@ import type {
|
|
|
29
29
|
TestSuite,
|
|
30
30
|
} from './types';
|
|
31
31
|
|
|
32
|
-
export function readProviderPromptMap(
|
|
32
|
+
export function readProviderPromptMap(
|
|
33
|
+
config: Partial<UnifiedConfig>,
|
|
34
|
+
parsedPrompts: Prompt[],
|
|
35
|
+
): TestSuite['providerPromptMap'] {
|
|
33
36
|
const ret: Record<string, string[]> = {};
|
|
34
37
|
|
|
35
38
|
if (!config.providers) {
|
|
@@ -313,28 +316,31 @@ export function writeOutput(
|
|
|
313
316
|
}
|
|
314
317
|
}
|
|
315
318
|
|
|
316
|
-
export
|
|
319
|
+
/**
 * Performs a `fetch` that is aborted and rejected when it has not settled within `timeout` ms.
 *
 * The caller's `options` object is left untouched: the request is issued with a shallow copy
 * carrying an internal abort signal. If the caller supplied its own `options.signal`, aborting
 * it also aborts the request.
 *
 * @param url - Resource to fetch.
 * @param options - Standard fetch options; not modified by this function.
 * @param timeout - Maximum time to wait, in milliseconds.
 * @returns The response, or a rejection with `Request timed out after <timeout> ms` on timeout,
 *   or with the underlying fetch error otherwise.
 */
export function fetchWithTimeout(
  url: RequestInfo,
  options: RequestInit = {},
  timeout: number,
): Promise<Response> {
  return new Promise((resolve, reject) => {
    const controller = new AbortController();
    const callerSignal = options.signal;

    // Forward a caller-initiated abort to the internal controller instead of discarding it.
    const onCallerAbort = () => controller.abort();
    if (callerSignal) {
      if (callerSignal.aborted) {
        controller.abort();
      } else {
        callerSignal.addEventListener('abort', onCallerAbort, { once: true });
      }
    }

    const timeoutId = setTimeout(() => {
      controller.abort();
      reject(new Error(`Request timed out after ${timeout} ms`));
    }, timeout);

    // Release the timer and the listener once the request has settled either way.
    const cleanup = () => {
      clearTimeout(timeoutId);
      callerSignal?.removeEventListener('abort', onCallerAbort);
    };

    // Spread into a fresh object so the caller's `options` is never mutated.
    fetch(url, { ...options, signal: controller.signal })
      .then((response) => {
        cleanup();
        resolve(response);
      })
      .catch((error) => {
        cleanup();
        reject(error);
      });
  });
}
|
|
339
345
|
|
|
340
346
|
export async function fetchWithRetries(
|
|
@@ -356,6 +362,8 @@ export async function fetchWithRetries(
|
|
|
356
362
|
throw new Error(`Request failed after ${retries} retries: ${(lastError as Error).message}`);
|
|
357
363
|
}
|
|
358
364
|
|
|
365
|
+
const RESULT_HISTORY_LENGTH = 50;
|
|
366
|
+
|
|
359
367
|
export function getConfigDirectoryPath(): string {
|
|
360
368
|
return path.join(os.homedir(), '.promptfoo');
|
|
361
369
|
}
|
|
@@ -365,11 +373,14 @@ export function getLatestResultsPath(): string {
|
|
|
365
373
|
}
|
|
366
374
|
|
|
367
375
|
export function writeLatestResults(results: EvaluateSummary, config: Partial<UnifiedConfig>) {
|
|
376
|
+
const resultsDirectory = path.join(getConfigDirectoryPath(), 'output');
|
|
377
|
+
const timestamp = new Date().toISOString();
|
|
378
|
+
const newResultsPath = path.join(resultsDirectory, `eval-${timestamp}.json`);
|
|
368
379
|
const latestResultsPath = getLatestResultsPath();
|
|
369
380
|
try {
|
|
370
|
-
fs.mkdirSync(
|
|
381
|
+
fs.mkdirSync(resultsDirectory, { recursive: true });
|
|
371
382
|
fs.writeFileSync(
|
|
372
|
-
|
|
383
|
+
newResultsPath,
|
|
373
384
|
JSON.stringify(
|
|
374
385
|
{
|
|
375
386
|
version: 1,
|
|
@@ -380,8 +391,45 @@ export function writeLatestResults(results: EvaluateSummary, config: Partial<Uni
|
|
|
380
391
|
2,
|
|
381
392
|
),
|
|
382
393
|
);
|
|
394
|
+
if (fs.existsSync(latestResultsPath)) {
|
|
395
|
+
fs.unlinkSync(latestResultsPath);
|
|
396
|
+
}
|
|
397
|
+
fs.symlinkSync(newResultsPath, latestResultsPath);
|
|
398
|
+
cleanupOldResults();
|
|
383
399
|
} catch (err) {
|
|
384
|
-
logger.error(`Failed to write latest results to ${
|
|
400
|
+
logger.error(`Failed to write latest results to ${newResultsPath}:\n${err}`);
|
|
401
|
+
}
|
|
402
|
+
}
|
|
403
|
+
|
|
404
|
+
/**
 * Lists the stored evaluation result files (`eval-*.json`) found in the `output` directory
 * under the config directory, ordered newest first by file birth time.
 *
 * @returns File names only (no directory component). Returns an empty array when the output
 *   directory does not exist yet.
 */
export function listPreviousResults(): string[] {
  const directory = path.join(getConfigDirectoryPath(), 'output');
  if (!fs.existsSync(directory)) {
    // No results have been written yet; readdirSync would throw ENOENT here.
    return [];
  }

  return (
    fs
      .readdirSync(directory)
      .filter((file) => file.startsWith('eval-') && file.endsWith('.json'))
      // Stat each file once up front rather than on every comparator call.
      .map((file) => ({
        file,
        createdAt: fs.statSync(path.join(directory, file)).birthtime.getTime(),
      }))
      .sort((a, b) => b.createdAt - a.createdAt) // sort in descending order
      .map(({ file }) => file)
  );
}
|
|
415
|
+
|
|
416
|
+
/**
 * Deletes the oldest stored result files so that at most `remaining` of the newest are kept.
 *
 * @param remaining - Number of most recent result files to keep. Defaults to
 *   `RESULT_HISTORY_LENGTH`; `0` removes every stored result. Negative values are treated as `0`.
 */
export function cleanupOldResults(remaining = RESULT_HISTORY_LENGTH) {
  const outputDirectory = path.join(getConfigDirectoryPath(), 'output');
  // listPreviousResults() is ordered newest first, so the files to drop are the tail of the
  // list past the first `remaining` entries, not its head.
  const staleFiles = listPreviousResults().slice(Math.max(0, remaining));
  for (const file of staleFiles) {
    fs.unlinkSync(path.join(outputDirectory, file));
  }
}
|
|
422
|
+
|
|
423
|
+
/**
 * Reads and parses a stored evaluation result.
 *
 * @param name - Either a file name relative to the `output` directory under the config
 *   directory, or an absolute path. readLatestResults() passes the value of
 *   getLatestResultsPath() — presumably an absolute path; confirm against that helper.
 *   Callers handling untrusted input must validate `name` first; this function does not
 *   restrict it to the output directory.
 * @returns The parsed file contents, or `undefined` (after logging an error) when the file
 *   cannot be resolved, read, or parsed.
 */
export function readResult(
  name: string,
): { results: EvaluateSummary; config: Partial<UnifiedConfig> } | undefined {
  const resultsDirectory = path.join(getConfigDirectoryPath(), 'output');
  // path.resolve keeps an absolute `name` as-is, whereas path.join would nest it under
  // resultsDirectory and produce a path that does not exist.
  const resultsPath = path.resolve(resultsDirectory, name);
  try {
    // realpathSync follows symlinks to the underlying result file before reading.
    return JSON.parse(fs.readFileSync(fs.realpathSync(resultsPath), 'utf-8'));
  } catch (err) {
    logger.error(`Failed to read results from ${resultsPath}:\n${err}`);
    return undefined;
  }
}
|
|
387
435
|
|
|
@@ -389,12 +437,7 @@ export function readLatestResults():
|
|
|
389
437
|
| { results: EvaluateSummary; config: Partial<UnifiedConfig> }
|
|
390
438
|
| undefined {
|
|
391
439
|
const latestResultsPath = getLatestResultsPath();
|
|
392
|
-
|
|
393
|
-
const latestResults = JSON.parse(fs.readFileSync(latestResultsPath, 'utf-8'));
|
|
394
|
-
return latestResults;
|
|
395
|
-
} catch (err) {
|
|
396
|
-
logger.error(`Failed to read latest results from ${latestResultsPath}:\n${err}`);
|
|
397
|
-
}
|
|
440
|
+
return readResult(latestResultsPath);
|
|
398
441
|
}
|
|
399
442
|
|
|
400
443
|
export function cosineSimilarity(vecA: number[], vecB: number[]) {
|
|
@@ -14,6 +14,7 @@ function App() {
|
|
|
14
14
|
const { table, setTable, setConfig } = useStore();
|
|
15
15
|
const [loaded, setLoaded] = React.useState<boolean>(false);
|
|
16
16
|
const loadedFromApi = React.useRef(false);
|
|
17
|
+
const [recentFiles, setRecentFiles] = React.useState<string[]>([]);
|
|
17
18
|
|
|
18
19
|
const prefersDarkMode = useMediaQuery('(prefers-color-scheme: dark)');
|
|
19
20
|
const [darkMode, setDarkMode] = React.useState(prefersDarkMode);
|
|
@@ -43,6 +44,22 @@ function App() {
|
|
|
43
44
|
}
|
|
44
45
|
}, [prefersDarkMode]);
|
|
45
46
|
|
|
47
|
+
// Loads the list of stored result files from the local server into `recentFiles`.
// Does nothing unless the page URL contains "localhost". Failures are logged instead of thrown,
// because the call sites invoke this without awaiting or catching.
const fetchRecentFiles = async () => {
  if (!window.location.href.includes('localhost')) {
    return;
  }
  try {
    const resp = await fetch(`http://localhost:15500/results`);
    if (!resp.ok) {
      throw new Error(`Unexpected response status ${resp.status}`);
    }
    const body = await resp.json();
    // Only accept a well-formed list; otherwise keep whatever is currently shown.
    if (Array.isArray(body.data)) {
      setRecentFiles(body.data);
    }
  } catch (err) {
    console.error('Failed to fetch recent result files', err);
  }
};
|
|
55
|
+
|
|
56
|
+
// Loads a previously stored run from the local server and shows it in the table.
// Non-2xx replies and network/parse failures are logged and leave the current table as-is.
const handleRecentFileSelection = async (file: string) => {
  try {
    // Encode the name so it stays a single, intact URL path segment.
    const resp = await fetch(`http://localhost:15500/results/${encodeURIComponent(file)}`);
    if (!resp.ok) {
      throw new Error(`Unexpected response status ${resp.status}`);
    }
    const body = await resp.json();
    setTable(body.data.results.table);
    setConfig(body.data.config);
  } catch (err) {
    console.error(`Failed to load results for ${file}`, err);
  }
};
|
|
62
|
+
|
|
46
63
|
React.useEffect(() => {
|
|
47
64
|
const fetchEvalData = async (id: string) => {
|
|
48
65
|
if (loadedFromApi.current) {
|
|
@@ -72,12 +89,14 @@ function App() {
|
|
|
72
89
|
setLoaded(true);
|
|
73
90
|
setTable(data.results.table);
|
|
74
91
|
setConfig(data.config);
|
|
92
|
+
fetchRecentFiles();
|
|
75
93
|
});
|
|
76
94
|
|
|
77
95
|
socket.on('update', (data) => {
|
|
78
96
|
console.log('Received data update', data);
|
|
79
97
|
setTable(data.results.table);
|
|
80
98
|
setConfig(data.config);
|
|
99
|
+
fetchRecentFiles();
|
|
81
100
|
});
|
|
82
101
|
}
|
|
83
102
|
|
|
@@ -89,7 +108,11 @@ function App() {
|
|
|
89
108
|
return (
|
|
90
109
|
<ThemeProvider theme={theme}>
|
|
91
110
|
<NavBar darkMode={darkMode} onToggleDarkMode={toggleDarkMode} />
|
|
92
|
-
{loaded && table ?
|
|
111
|
+
{loaded && table ? (
|
|
112
|
+
<ResultsView recentFiles={recentFiles} onRecentFileSelected={handleRecentFileSelection} />
|
|
113
|
+
) : (
|
|
114
|
+
<div>Loading...</div>
|
|
115
|
+
)}
|
|
93
116
|
</ThemeProvider>
|
|
94
117
|
);
|
|
95
118
|
}
|
|
@@ -37,7 +37,26 @@ const ResponsiveStack = styled(Stack)(({ theme }) => ({
|
|
|
37
37
|
},
|
|
38
38
|
}));
|
|
39
39
|
|
|
40
|
-
|
|
40
|
+
/**
 * Turns a result file name of the form `eval-<date string>.json` into a human-readable
 * en-US date/time label.
 *
 * @param filename - Result file name; the text between the `eval-` prefix and the `.json`
 *   suffix is parsed with `new Date(...)`.
 * @returns The formatted date, or `filename` unchanged when the embedded text is not a
 *   parseable date (instead of the literal "Invalid Date").
 */
function filenameToDate(filename: string) {
  const dateString = filename.slice('eval-'.length, filename.length - '.json'.length);
  const date = new Date(dateString);
  if (Number.isNaN(date.getTime())) {
    return filename;
  }
  return date.toLocaleDateString('en-US', {
    year: 'numeric',
    month: 'long',
    day: 'numeric',
    hour: '2-digit',
    minute: '2-digit',
    second: '2-digit',
    timeZoneName: 'short',
  });
}
|
|
53
|
+
|
|
54
|
+
interface ResultsViewProps {
|
|
55
|
+
recentFiles: string[];
|
|
56
|
+
onRecentFileSelected: (file: string) => void;
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
export default function ResultsView({ recentFiles, onRecentFileSelected }: ResultsViewProps) {
|
|
41
60
|
const { table, config } = useStore();
|
|
42
61
|
const [maxTextLength, setMaxTextLength] = React.useState(250);
|
|
43
62
|
const [columnVisibility, setColumnVisibility] = React.useState<VisibilityState>({});
|
|
@@ -148,10 +167,30 @@ export default function ResultsView() {
|
|
|
148
167
|
return (
|
|
149
168
|
<div>
|
|
150
169
|
<Paper py="md">
|
|
151
|
-
<ResponsiveStack direction="row" spacing={
|
|
170
|
+
<ResponsiveStack direction="row" spacing={4} alignItems="center">
|
|
171
|
+
<Box>
|
|
172
|
+
{recentFiles && recentFiles.length > 0 && (
|
|
173
|
+
<FormControl sx={{ m: 1, minWidth: 200 }} size="small">
|
|
174
|
+
<InputLabel>View run</InputLabel>
|
|
175
|
+
<Select
|
|
176
|
+
key={recentFiles.join(',')}
|
|
177
|
+
className="recent-files"
|
|
178
|
+
label="Previous runs"
|
|
179
|
+
defaultValue={recentFiles[0]}
|
|
180
|
+
onChange={(e: SelectChangeEvent) => onRecentFileSelected(e.target.value)}
|
|
181
|
+
>
|
|
182
|
+
{recentFiles.map((file) => (
|
|
183
|
+
<MenuItem key={file} value={file}>
|
|
184
|
+
{filenameToDate(file)}
|
|
185
|
+
</MenuItem>
|
|
186
|
+
))}
|
|
187
|
+
</Select>
|
|
188
|
+
</FormControl>
|
|
189
|
+
)}
|
|
190
|
+
</Box>
|
|
152
191
|
<Box>
|
|
153
192
|
<FormControl sx={{ m: 1, minWidth: 200 }} size="small">
|
|
154
|
-
<InputLabel id="visible-columns-label">
|
|
193
|
+
<InputLabel id="visible-columns-label">Show columns</InputLabel>
|
|
155
194
|
<Select
|
|
156
195
|
labelId="visible-columns-label"
|
|
157
196
|
id="visible-columns"
|
package/src/web/server.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import fs from 'fs';
|
|
1
|
+
import fs, { Stats } from 'fs';
|
|
2
2
|
import path from 'node:path';
|
|
3
3
|
import readline from 'node:readline';
|
|
4
4
|
import http from 'node:http';
|
|
@@ -11,7 +11,7 @@ import { Server as SocketIOServer } from 'socket.io';
|
|
|
11
11
|
|
|
12
12
|
import logger from '../logger';
|
|
13
13
|
import { getDirectory } from '../esm';
|
|
14
|
-
import { getLatestResultsPath } from '../util';
|
|
14
|
+
import { getLatestResultsPath, listPreviousResults, readResult } from '../util';
|
|
15
15
|
|
|
16
16
|
export function init(port = 15500) {
|
|
17
17
|
const app = express();
|
|
@@ -40,14 +40,37 @@ export function init(port = 15500) {
|
|
|
40
40
|
socket.emit('init', readLatestJson());
|
|
41
41
|
|
|
42
42
|
// Watch for changes to latest.json and emit the update event
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
43
|
+
// Debounced (250 ms) fs.watchFile listener: pushes the latest results to this socket when the
// watched file's modification time has changed.
// Compare the numeric mtimeMs values; `curr.mtime !== prev.mtime` compares two distinct Date
// objects by reference and is therefore always true.
const watcher = debounce((curr: Stats, prev: Stats) => {
  if (curr.mtimeMs !== prev.mtimeMs) {
    socket.emit('update', readLatestJson());
  }
}, 250);
|
|
48
|
+
fs.watchFile(latestJsonPath, watcher);
|
|
49
|
+
|
|
50
|
+
// Stop watching the file when the socket connection is closed
|
|
51
|
+
socket.on('disconnect', () => {
|
|
52
|
+
fs.unwatchFile(latestJsonPath, watcher);
|
|
53
|
+
});
|
|
54
|
+
});
|
|
55
|
+
|
|
56
|
+
app.get('/results', (req, res) => {
|
|
57
|
+
const previousResults = listPreviousResults();
|
|
58
|
+
res.json({ data: previousResults });
|
|
59
|
+
});
|
|
60
|
+
|
|
61
|
+
app.get('/results/:filename', (req, res) => {
|
|
62
|
+
const filename = req.params.filename;
|
|
63
|
+
const safeFilename = path.basename(filename);
|
|
64
|
+
if (safeFilename !== filename || !listPreviousResults().includes(safeFilename)) {
|
|
65
|
+
res.status(400).send('Invalid filename');
|
|
66
|
+
return;
|
|
67
|
+
}
|
|
68
|
+
const result = readResult(safeFilename);
|
|
69
|
+
if (!result) {
|
|
70
|
+
res.status(404).send('Result not found');
|
|
71
|
+
return;
|
|
72
|
+
}
|
|
73
|
+
res.json({ data: result });
|
|
51
74
|
});
|
|
52
75
|
|
|
53
76
|
httpServer.listen(port, () => {
|