promptfoo 0.17.5 → 0.17.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/package.json +3 -2
- package/dist/src/assertions.js +2 -2
- package/dist/src/assertions.js.map +1 -1
- package/dist/src/cache.d.ts +3 -0
- package/dist/src/cache.d.ts.map +1 -1
- package/dist/src/cache.js +6 -1
- package/dist/src/cache.js.map +1 -1
- package/dist/src/evaluator.d.ts.map +1 -1
- package/dist/src/evaluator.js +35 -17
- package/dist/src/evaluator.js.map +1 -1
- package/dist/src/index.d.ts +1 -0
- package/dist/src/index.d.ts.map +1 -1
- package/dist/src/main.js +11 -5
- package/dist/src/main.js.map +1 -1
- package/dist/src/providers/azureopenai.d.ts.map +1 -1
- package/dist/src/providers/azureopenai.js +1 -13
- package/dist/src/providers/azureopenai.js.map +1 -1
- package/dist/src/providers/localai.d.ts.map +1 -1
- package/dist/src/providers/localai.js +2 -1
- package/dist/src/providers/localai.js.map +1 -1
- package/dist/src/providers/openai.d.ts +1 -0
- package/dist/src/providers/openai.d.ts.map +1 -1
- package/dist/src/providers/openai.js +3 -29
- package/dist/src/providers/openai.js.map +1 -1
- package/dist/src/providers/replicate.d.ts +18 -0
- package/dist/src/providers/replicate.d.ts.map +1 -0
- package/dist/src/providers/replicate.js +80 -0
- package/dist/src/providers/replicate.js.map +1 -0
- package/dist/src/providers/shared.d.ts +5 -0
- package/dist/src/providers/shared.d.ts.map +1 -1
- package/dist/src/providers/shared.js +33 -1
- package/dist/src/providers/shared.js.map +1 -1
- package/dist/src/providers.d.ts +2 -0
- package/dist/src/providers.d.ts.map +1 -1
- package/dist/src/providers.js +8 -0
- package/dist/src/providers.js.map +1 -1
- package/dist/src/types.d.ts +4 -0
- package/dist/src/types.d.ts.map +1 -1
- package/dist/src/util.d.ts +2 -1
- package/dist/src/util.d.ts.map +1 -1
- package/dist/src/util.js +20 -1
- package/dist/src/util.js.map +1 -1
- package/dist/src/web/client/assets/{index-c2756e5d.js → index-13198388.js} +23 -23
- package/dist/src/web/client/assets/index-f9b230d1.css +1 -0
- package/dist/src/web/client/index.html +2 -2
- package/package.json +3 -2
- package/src/assertions.ts +2 -2
- package/src/cache.ts +5 -1
- package/src/evaluator.ts +37 -17
- package/src/main.ts +16 -5
- package/src/providers/azureopenai.ts +2 -18
- package/src/providers/localai.ts +3 -2
- package/src/providers/openai.ts +5 -35
- package/src/providers/replicate.ts +95 -0
- package/src/providers/shared.ts +29 -0
- package/src/providers.ts +8 -0
- package/src/types.ts +7 -0
- package/src/util.ts +25 -0
- package/src/web/client/src/App.tsx +6 -0
- package/src/web/client/src/EvalOutputPromptDialog.tsx +6 -2
- package/src/web/client/src/ResultsTable.tsx +5 -0
- package/src/web/client/src/ResultsView.tsx +2 -1
- package/src/web/client/src/index.css +1 -12
- package/src/web/client/src/types.ts +1 -1
- package/dist/src/web/client/assets/index-b82d0138.css +0 -1
|
@@ -0,0 +1 @@
|
|
|
1
|
+
:root{font-family:system-ui,Avenir,Helvetica,Arial,sans-serif;font-synthesis:none;text-rendering:optimizeLegibility;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale;-webkit-text-size-adjust:100%;--background-color: #ffffff;--text-color: #404040;--border-color: lightgray;--table-border-color: lightgray;--pass-color: green;--fail-color: #ad0000;--smalltext-color: gray;--success-background-color: #d1ffd7;--variable-background-color: #f7f7f7;--header-background-color: #fffdf7}[data-theme=dark]{--background-color: #1a1a1a;--text-color: #f0f0f0;--border-color: #444444;--table-border-color: #444444;--pass-color: #4caf50;--fail-color: #f44336;--smalltext-color: #888888;--success-background-color: #216d2b;--variable-background-color: #333;--header-background-color: #333}html{font-size:calc(14px + (18 - 14) * ((100vw - 300px) / (1600 - 300)))}*{box-sizing:border-box}html{font-family:-apple-system,BlinkMacSystemFont,Segoe UI,Helvetica,Arial,sans-serif,"Apple Color Emoji","Segoe UI Emoji",Segoe UI Symbol;font-size:16px;background-color:var(--background-color);color:var(--text-color)}table,.divTable{border:1px solid var(--table-border-color);border-collapse:collapse;width:100%;margin:1rem 0;box-shadow:0 2px 4px #0000001a}.tr{display:flex}tr,.tr{width:fit-content}tr:hover,.tr:hover{background-color:#0000000d}th,.th,td,.td{position:relative;box-shadow:inset 0 0 0 1px var(--border-color);vertical-align:top;padding:1.5rem}th.variable,.th.variable,td.variable,.td.variable{background-color:var(--variable-background-color)}tr.header{background-color:var(--header-background-color)}th,.th{padding:1rem;position:relative;text-align:center;vertical-align:bottom}th .action{cursor:pointer;margin-left:.5rem}tr .cell-actions{display:flex;gap:.5rem;visibility:hidden;position:absolute;bottom:1.25rem;right:0;line-height:0;font-size:1.75rem}tr:hover .cell-actions{visibility:visible}tr .cell-actions .action{cursor:pointer}th 
.smalltext{visibility:hidden;font-weight:400;font-size:.75rem;color:var(--smalltext-color)}th:hover .smalltext{visibility:visible}th .summary{font-weight:400;font-size:.8rem;padding:.25rem}th .summary.highlight{background-color:var(--success-background-color)}td .status{margin-bottom:.5rem;font-weight:700}td .score{font-weight:400}td .pass{color:var(--pass-color)}td .fail{color:var(--fail-color)}.first-prompt-col{border-left:2px solid #888}.first-prompt-row{border-top:2px solid #888}.resizer{position:absolute;right:0;top:0;height:100%;width:5px;cursor:col-resize;user-select:none;touch-action:none;background:var(--text-color);opacity:.5}.resizer.isResizing{background:var(--text-color);opacity:1}@media (hover: hover){.resizer{opacity:0}*:hover>.resizer{opacity:1}}.logo{display:flex;align-items:center;gap:4px}.logo img{width:30px}.logo span{margin-bottom:6px;color:var(--text-color)}[data-theme=dark] .logo img{filter:invert(1)}nav{display:flex;justify-content:space-between;align-items:center;margin-bottom:1rem;color:var(--text-color)}.dark-mode-toggle{background-color:transparent;border:none;color:var(--text-color);cursor:pointer;font-size:16px;padding:8px;transition:color .3s}.dark-mode-toggle:hover{color:var(--pass-color)}body{background-color:var(--background-color);color:var(--text-color)}
|
|
@@ -5,8 +5,8 @@
|
|
|
5
5
|
<link rel="icon" type="image/svg+xml" href="favicon.ico" />
|
|
6
6
|
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
|
7
7
|
<title>promptfoo web viewer</title>
|
|
8
|
-
<script type="module" crossorigin src="/assets/index-c2756e5d.js"></script>
|
|
9
|
-
<link rel="stylesheet" href="/assets/index-b82d0138.css">
|
|
8
|
+
<script type="module" crossorigin src="/assets/index-13198388.js"></script>
|
|
9
|
+
<link rel="stylesheet" href="/assets/index-f9b230d1.css">
|
|
10
10
|
</head>
|
|
11
11
|
<body>
|
|
12
12
|
<div id="root"></div>
|
package/package.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"name": "promptfoo",
|
|
3
3
|
"description": "LLM eval & testing toolkit",
|
|
4
4
|
"author": "Ian Webster",
|
|
5
|
-
"version": "0.17.5",
|
|
5
|
+
"version": "0.17.7",
|
|
6
6
|
"license": "MIT",
|
|
7
7
|
"type": "commonjs",
|
|
8
8
|
"main": "dist/src/index.js",
|
|
@@ -19,7 +19,7 @@
|
|
|
19
19
|
"src"
|
|
20
20
|
],
|
|
21
21
|
"engines": {
|
|
22
|
-
"node": ">=
|
|
22
|
+
"node": ">=16"
|
|
23
23
|
},
|
|
24
24
|
"bin": {
|
|
25
25
|
"promptfoo": "dist/src/main.js"
|
|
@@ -79,6 +79,7 @@
|
|
|
79
79
|
"node-fetch": "^2.6.7",
|
|
80
80
|
"nunjucks": "^3.2.4",
|
|
81
81
|
"opener": "^1.5.2",
|
|
82
|
+
"replicate": "^0.12.3",
|
|
82
83
|
"rouge": "^1.0.3",
|
|
83
84
|
"semver": "^7.5.3",
|
|
84
85
|
"socket.io": "^4.6.1",
|
package/src/assertions.ts
CHANGED
|
@@ -432,8 +432,8 @@ export async function matchesLlmRubric(
|
|
|
432
432
|
}
|
|
433
433
|
|
|
434
434
|
const prompt = nunjucks.renderString(options.rubricPrompt || DEFAULT_GRADING_PROMPT, {
|
|
435
|
-
output,
|
|
436
|
-
rubric: expected,
|
|
435
|
+
output: output.replace(/\n/g, '\\n').replace(/"/g, '\\"'),
|
|
436
|
+
rubric: expected.replace(/\n/g, '\\n').replace(/"/g, '\\"'),
|
|
437
437
|
});
|
|
438
438
|
|
|
439
439
|
let provider = options.provider || DefaultGradingProvider;
|
package/src/cache.ts
CHANGED
|
@@ -20,7 +20,7 @@ let enabled =
|
|
|
20
20
|
const cacheType =
|
|
21
21
|
process.env.PROMPTFOO_CACHE_TYPE || (process.env.NODE_ENV === 'test' ? 'memory' : 'disk');
|
|
22
22
|
|
|
23
|
-
function getCache() {
|
|
23
|
+
export function getCache() {
|
|
24
24
|
if (!cacheInstance) {
|
|
25
25
|
const cachePath =
|
|
26
26
|
process.env.PROMPTFOO_CACHE_PATH || path.join(getConfigDirectoryPath(), 'cache');
|
|
@@ -102,3 +102,7 @@ export async function clearCache() {
|
|
|
102
102
|
logger.info('Clearing cache...');
|
|
103
103
|
return getCache().reset();
|
|
104
104
|
}
|
|
105
|
+
|
|
106
|
+
export function isCacheEnabled() {
|
|
107
|
+
return enabled;
|
|
108
|
+
}
|
package/src/evaluator.ts
CHANGED
|
@@ -33,6 +33,7 @@ interface RunEvalOptions {
|
|
|
33
33
|
|
|
34
34
|
rowIndex: number;
|
|
35
35
|
colIndex: number;
|
|
36
|
+
repeatIndex: number;
|
|
36
37
|
}
|
|
37
38
|
|
|
38
39
|
const DEFAULT_MAX_CONCURRENCY = 4;
|
|
@@ -212,6 +213,13 @@ class Evaluator {
|
|
|
212
213
|
// Split prompts by provider
|
|
213
214
|
for (const prompt of testSuite.prompts) {
|
|
214
215
|
for (const provider of testSuite.providers) {
|
|
216
|
+
// Check if providerPromptMap exists and if it contains the current prompt's display
|
|
217
|
+
if (testSuite.providerPromptMap) {
|
|
218
|
+
const allowedPrompts = testSuite.providerPromptMap[provider.id()];
|
|
219
|
+
if (allowedPrompts && !allowedPrompts.includes(prompt.display)) {
|
|
220
|
+
continue;
|
|
221
|
+
}
|
|
222
|
+
}
|
|
215
223
|
const updatedDisplay =
|
|
216
224
|
testSuite.providers.length > 1 ? `[${provider.id()}] ${prompt.display}` : prompt.display;
|
|
217
225
|
prompts.push({
|
|
@@ -266,25 +274,37 @@ class Evaluator {
|
|
|
266
274
|
// Finalize test case eval
|
|
267
275
|
const varCombinations = generateVarCombinations(testCase.vars || {});
|
|
268
276
|
totalVarCombinations += varCombinations.length;
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
277
|
+
|
|
278
|
+
const numRepeat = this.options.repeat || 1;
|
|
279
|
+
for (let repeatIndex = 0; repeatIndex < numRepeat; repeatIndex++) {
|
|
280
|
+
for (const vars of varCombinations) {
|
|
281
|
+
let colIndex = 0;
|
|
282
|
+
for (const prompt of testSuite.prompts) {
|
|
283
|
+
for (const provider of testSuite.providers) {
|
|
284
|
+
if (testSuite.providerPromptMap) {
|
|
285
|
+
const allowedPrompts = testSuite.providerPromptMap[provider.id()];
|
|
286
|
+
if (allowedPrompts && !allowedPrompts.includes(prompt.display)) {
|
|
287
|
+
// This prompt should not be used with this provider.
|
|
288
|
+
continue;
|
|
289
|
+
}
|
|
290
|
+
}
|
|
291
|
+
runEvalOptions.push({
|
|
292
|
+
provider,
|
|
293
|
+
prompt: {
|
|
294
|
+
...prompt,
|
|
295
|
+
raw: prependToPrompt + prompt.raw + appendToPrompt,
|
|
296
|
+
},
|
|
297
|
+
test: { ...testCase, vars },
|
|
298
|
+
includeProviderId: testSuite.providers.length > 1,
|
|
299
|
+
rowIndex,
|
|
300
|
+
colIndex,
|
|
301
|
+
repeatIndex,
|
|
302
|
+
});
|
|
303
|
+
colIndex++;
|
|
304
|
+
}
|
|
285
305
|
}
|
|
306
|
+
rowIndex++;
|
|
286
307
|
}
|
|
287
|
-
rowIndex++;
|
|
288
308
|
}
|
|
289
309
|
}
|
|
290
310
|
|
package/src/main.ts
CHANGED
|
@@ -15,6 +15,7 @@ import {
|
|
|
15
15
|
readConfig,
|
|
16
16
|
readLatestResults,
|
|
17
17
|
readPrompts,
|
|
18
|
+
readProviderPromptMap,
|
|
18
19
|
readTests,
|
|
19
20
|
writeLatestResults,
|
|
20
21
|
writeOutput,
|
|
@@ -130,7 +131,7 @@ async function main() {
|
|
|
130
131
|
|
|
131
132
|
program
|
|
132
133
|
.command('share')
|
|
133
|
-
.description('
|
|
134
|
+
.description('Create a shareable URL of your most recent eval')
|
|
134
135
|
.option('-y, --yes', 'Skip confirmation')
|
|
135
136
|
.action(async (cmdObj: { yes: boolean } & Command) => {
|
|
136
137
|
telemetry.maybeShowNotice();
|
|
@@ -158,10 +159,9 @@ async function main() {
|
|
|
158
159
|
});
|
|
159
160
|
|
|
160
161
|
reader.question(
|
|
161
|
-
'Are you sure you want to create a
|
|
162
|
+
'Are you sure you want to create a shareable URL of your most recent eval? Anyone you give this URL to will be able to view the results [Y/n] ',
|
|
162
163
|
async function (answer: string) {
|
|
163
|
-
if (answer.toLowerCase() !== 'yes' && answer.toLowerCase() !== 'y') {
|
|
164
|
-
logger.info('Did not create a public URL.');
|
|
164
|
+
if (answer.toLowerCase() !== 'yes' && answer.toLowerCase() !== 'y' && answer !== '') {
|
|
165
165
|
reader.close();
|
|
166
166
|
return;
|
|
167
167
|
}
|
|
@@ -218,6 +218,13 @@ async function main() {
|
|
|
218
218
|
? String(defaultConfig.evaluateOptions.maxConcurrency)
|
|
219
219
|
: undefined,
|
|
220
220
|
)
|
|
221
|
+
.option(
|
|
222
|
+
'--repeat <number>',
|
|
223
|
+
'Number of times to run each test',
|
|
224
|
+
defaultConfig.evaluateOptions?.repeat
|
|
225
|
+
? String(defaultConfig.evaluateOptions.repeat)
|
|
226
|
+
: undefined,
|
|
227
|
+
)
|
|
221
228
|
.option(
|
|
222
229
|
'--table-cell-max-length <number>',
|
|
223
230
|
'Truncate console table cells to this length',
|
|
@@ -263,7 +270,6 @@ async function main() {
|
|
|
263
270
|
}
|
|
264
271
|
|
|
265
272
|
// Config parsing
|
|
266
|
-
const maxConcurrency = parseInt(cmdObj.maxConcurrency || '', 10);
|
|
267
273
|
let fileConfig: Partial<UnifiedConfig> = {};
|
|
268
274
|
const configPath = cmdObj.config;
|
|
269
275
|
if (configPath) {
|
|
@@ -302,6 +308,7 @@ async function main() {
|
|
|
302
308
|
config.tests,
|
|
303
309
|
cmdObj.tests ? undefined : basePath,
|
|
304
310
|
);
|
|
311
|
+
const parsedProviderPromptMap = readProviderPromptMap(config, parsedPrompts);
|
|
305
312
|
|
|
306
313
|
if (parsedPrompts.length === 0) {
|
|
307
314
|
logger.error(chalk.red('No prompts found'));
|
|
@@ -322,16 +329,20 @@ async function main() {
|
|
|
322
329
|
description: config.description,
|
|
323
330
|
prompts: parsedPrompts,
|
|
324
331
|
providers: parsedProviders,
|
|
332
|
+
providerPromptMap: parsedProviderPromptMap,
|
|
325
333
|
tests: parsedTests,
|
|
326
334
|
defaultTest,
|
|
327
335
|
};
|
|
328
336
|
|
|
337
|
+
const maxConcurrency = parseInt(cmdObj.maxConcurrency || '', 10);
|
|
338
|
+
const iterations = parseInt(cmdObj.repeat || '', 10);
|
|
329
339
|
const options: EvaluateOptions = {
|
|
330
340
|
showProgressBar:
|
|
331
341
|
typeof cmdObj.progressBar === 'undefined'
|
|
332
342
|
? getLogLevel() !== 'debug'
|
|
333
343
|
: cmdObj.progressBar,
|
|
334
344
|
maxConcurrency: !isNaN(maxConcurrency) && maxConcurrency > 0 ? maxConcurrency : undefined,
|
|
345
|
+
repeat: !isNaN(iterations) && iterations > 0 ? iterations : 1,
|
|
335
346
|
...evaluateOptions,
|
|
336
347
|
};
|
|
337
348
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import logger from '../logger';
|
|
2
2
|
import { fetchJsonWithCache } from '../cache';
|
|
3
|
-
import { REQUEST_TIMEOUT_MS } from './shared';
|
|
3
|
+
import { REQUEST_TIMEOUT_MS, parseChatPrompt } from './shared';
|
|
4
4
|
|
|
5
5
|
import type { ApiProvider, ProviderEmbeddingResponse, ProviderResponse } from '../types.js';
|
|
6
6
|
|
|
@@ -205,23 +205,7 @@ export class AzureOpenAiChatCompletionProvider extends AzureOpenAiGenericProvide
|
|
|
205
205
|
throw new Error('Azure OpenAI API host must be set');
|
|
206
206
|
}
|
|
207
207
|
|
|
208
|
-
|
|
209
|
-
try {
|
|
210
|
-
messages = JSON.parse(prompt) as { role: string; content: string }[];
|
|
211
|
-
} catch (err) {
|
|
212
|
-
const trimmedPrompt = prompt.trim();
|
|
213
|
-
if (
|
|
214
|
-
process.env.PROMPTFOO_REQUIRE_JSON_PROMPTS ||
|
|
215
|
-
trimmedPrompt.startsWith('{') ||
|
|
216
|
-
trimmedPrompt.startsWith('[')
|
|
217
|
-
) {
|
|
218
|
-
throw new Error(
|
|
219
|
-
`Azure OpenAI Chat Completion prompt is not a valid JSON string: ${err}\n\n${prompt}`,
|
|
220
|
-
);
|
|
221
|
-
}
|
|
222
|
-
messages = [{ role: 'user', content: prompt }];
|
|
223
|
-
}
|
|
224
|
-
|
|
208
|
+
const messages = parseChatPrompt(prompt);
|
|
225
209
|
const body = {
|
|
226
210
|
model: this.deploymentName,
|
|
227
211
|
messages: messages,
|
package/src/providers/localai.ts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import logger from '../logger';
|
|
2
2
|
import { fetchJsonWithCache } from '../cache';
|
|
3
|
-
import { REQUEST_TIMEOUT_MS } from './shared';
|
|
3
|
+
import { REQUEST_TIMEOUT_MS, parseChatPrompt } from './shared';
|
|
4
4
|
|
|
5
5
|
import type { ApiProvider, ProviderResponse } from '../types.js';
|
|
6
6
|
|
|
@@ -29,9 +29,10 @@ class LocalAiGenericProvider implements ApiProvider {
|
|
|
29
29
|
|
|
30
30
|
export class LocalAiChatProvider extends LocalAiGenericProvider {
|
|
31
31
|
async callApi(prompt: string): Promise<ProviderResponse> {
|
|
32
|
+
const messages = parseChatPrompt(prompt);
|
|
32
33
|
const body = {
|
|
33
34
|
model: this.modelName,
|
|
34
|
-
|
|
35
|
+
messages: messages,
|
|
35
36
|
temperature: process.env.LOCALAI_TEMPERATURE || 0.7,
|
|
36
37
|
};
|
|
37
38
|
logger.debug(`Calling LocalAI API: ${JSON.stringify(body)}`);
|
package/src/providers/openai.ts
CHANGED
|
@@ -1,8 +1,7 @@
|
|
|
1
|
-
import yaml from 'js-yaml';
|
|
2
1
|
|
|
3
2
|
import logger from '../logger';
|
|
4
3
|
import { fetchJsonWithCache } from '../cache';
|
|
5
|
-
import { REQUEST_TIMEOUT_MS } from './shared';
|
|
4
|
+
import { REQUEST_TIMEOUT_MS, parseChatPrompt } from './shared';
|
|
6
5
|
|
|
7
6
|
import type { ApiProvider, ProviderEmbeddingResponse, ProviderResponse } from '../types.js';
|
|
8
7
|
|
|
@@ -10,6 +9,7 @@ const DEFAULT_OPENAI_HOST = 'api.openai.com';
|
|
|
10
9
|
|
|
11
10
|
interface OpenAiCompletionOptions {
|
|
12
11
|
temperature?: number;
|
|
12
|
+
max_tokens?: number;
|
|
13
13
|
functions?: {
|
|
14
14
|
name: string;
|
|
15
15
|
description?: string;
|
|
@@ -148,7 +148,7 @@ export class OpenAiCompletionProvider extends OpenAiGenericProvider {
|
|
|
148
148
|
const body = {
|
|
149
149
|
model: this.modelName,
|
|
150
150
|
prompt,
|
|
151
|
-
max_tokens: parseInt(process.env.OPENAI_MAX_TOKENS || '1024'),
|
|
151
|
+
max_tokens: options?.max_tokens ?? this.options.max_tokens ?? parseInt(process.env.OPENAI_MAX_TOKENS || '1024'),
|
|
152
152
|
temperature:
|
|
153
153
|
options?.temperature ??
|
|
154
154
|
this.options.temperature ??
|
|
@@ -227,41 +227,11 @@ export class OpenAiChatCompletionProvider extends OpenAiGenericProvider {
|
|
|
227
227
|
);
|
|
228
228
|
}
|
|
229
229
|
|
|
230
|
-
|
|
231
|
-
const trimmedPrompt = prompt.trim();
|
|
232
|
-
if (trimmedPrompt.startsWith('- role:')) {
|
|
233
|
-
try {
|
|
234
|
-
// Try YAML
|
|
235
|
-
messages = yaml.load(prompt) as { role: string; content: string }[];
|
|
236
|
-
} catch (err) {
|
|
237
|
-
throw new Error(
|
|
238
|
-
`OpenAI Chat Completion prompt is not a valid YAML string: ${err}\n\n${prompt}`,
|
|
239
|
-
);
|
|
240
|
-
}
|
|
241
|
-
} else {
|
|
242
|
-
try {
|
|
243
|
-
// Try JSON
|
|
244
|
-
messages = JSON.parse(prompt) as { role: string; content: string }[];
|
|
245
|
-
} catch (err) {
|
|
246
|
-
if (
|
|
247
|
-
process.env.PROMPTFOO_REQUIRE_JSON_PROMPTS ||
|
|
248
|
-
trimmedPrompt.startsWith('{') ||
|
|
249
|
-
trimmedPrompt.startsWith('[')
|
|
250
|
-
) {
|
|
251
|
-
throw new Error(
|
|
252
|
-
`OpenAI Chat Completion prompt is not a valid JSON string: ${err}\n\n${prompt}`,
|
|
253
|
-
);
|
|
254
|
-
}
|
|
255
|
-
|
|
256
|
-
// Fall back to wrapping the prompt in a user message
|
|
257
|
-
messages = [{ role: 'user', content: prompt }];
|
|
258
|
-
}
|
|
259
|
-
}
|
|
260
|
-
|
|
230
|
+
const messages = parseChatPrompt(prompt);
|
|
261
231
|
const body = {
|
|
262
232
|
model: this.modelName,
|
|
263
233
|
messages: messages,
|
|
264
|
-
max_tokens: parseInt(process.env.OPENAI_MAX_TOKENS || '1024'),
|
|
234
|
+
max_tokens: options?.max_tokens ?? this.options.max_tokens ?? parseInt(process.env.OPENAI_MAX_TOKENS || '1024'),
|
|
265
235
|
temperature:
|
|
266
236
|
options?.temperature ??
|
|
267
237
|
this.options.temperature ??
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
import Replicate from 'replicate';
|
|
2
|
+
|
|
3
|
+
import fetch from 'node-fetch';
|
|
4
|
+
import logger from '../logger';
|
|
5
|
+
import { getCache, isCacheEnabled } from '../cache';
|
|
6
|
+
|
|
7
|
+
import type { ApiProvider, ProviderResponse } from '../types.js';
|
|
8
|
+
|
|
9
|
+
interface ReplicateCompletionOptions {
|
|
10
|
+
temperature?: number;
|
|
11
|
+
max_length?: number;
|
|
12
|
+
repetition_penalty?: number;
|
|
13
|
+
}
|
|
14
|
+
|
|
15
|
+
export class ReplicateProvider implements ApiProvider {
|
|
16
|
+
modelName: string;
|
|
17
|
+
apiKey?: string;
|
|
18
|
+
replicate: any;
|
|
19
|
+
options: ReplicateCompletionOptions;
|
|
20
|
+
|
|
21
|
+
constructor(modelName: string, apiKey?: string, options?: ReplicateCompletionOptions) {
|
|
22
|
+
this.modelName = modelName;
|
|
23
|
+
this.apiKey = apiKey || process.env.REPLICATE_API_TOKEN || process.env.REPLICATE_API_KEY;
|
|
24
|
+
this.options = options || {};
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
id(): string {
|
|
28
|
+
return `replicate:${this.modelName}`;
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
toString(): string {
|
|
32
|
+
return `[Replicate Provider ${this.modelName}]`;
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
async callApi(prompt: string): Promise<ProviderResponse> {
|
|
36
|
+
if (!this.apiKey) {
|
|
37
|
+
throw new Error(
|
|
38
|
+
'Replicate API key is not set. Set REPLICATE_API_TOKEN environment variable or pass it as an argument to the constructor.',
|
|
39
|
+
);
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
let cache;
|
|
43
|
+
let cacheKey;
|
|
44
|
+
if (isCacheEnabled()) {
|
|
45
|
+
cache = await getCache();
|
|
46
|
+
cacheKey = `replicate:${this.modelName}:${prompt}`;
|
|
47
|
+
|
|
48
|
+
// Try to get the cached response
|
|
49
|
+
const cachedResponse = await cache.get(cacheKey);
|
|
50
|
+
|
|
51
|
+
if (cachedResponse) {
|
|
52
|
+
logger.debug(`Returning cached response for ${prompt}: ${cachedResponse}`);
|
|
53
|
+
return JSON.parse(cachedResponse as string);
|
|
54
|
+
}
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
const replicate = new Replicate({
|
|
58
|
+
auth: this.apiKey,
|
|
59
|
+
fetch,
|
|
60
|
+
});
|
|
61
|
+
|
|
62
|
+
logger.debug(`Calling Replicate: ${prompt}`);
|
|
63
|
+
let response;
|
|
64
|
+
try {
|
|
65
|
+
const data = {
|
|
66
|
+
input: {
|
|
67
|
+
prompt,
|
|
68
|
+
max_length: this.options.max_length || parseInt(process.env.REPLICATE_MAX_LENGTH || '2046', 10),
|
|
69
|
+
temperature: this.options.temperature || parseFloat(process.env.REPLICATE_TEMPERATURE || '0.01'),
|
|
70
|
+
repetition_penalty: this.options.repetition_penalty || parseFloat(process.env.REPLICATE_REPETITION_PENALTY || '1.0'),
|
|
71
|
+
},
|
|
72
|
+
};
|
|
73
|
+
response = await replicate.run(this.modelName as any, data);
|
|
74
|
+
} catch (err) {
|
|
75
|
+
return {
|
|
76
|
+
error: `API call error: ${String(err)}`,
|
|
77
|
+
};
|
|
78
|
+
}
|
|
79
|
+
logger.debug(`\tReplicate API response: ${JSON.stringify(response)}`);
|
|
80
|
+
try {
|
|
81
|
+
const result = {
|
|
82
|
+
output: (response as string[]).join(''),
|
|
83
|
+
tokenUsage: {}, // TODO: add token usage once Replicate API supports it
|
|
84
|
+
};
|
|
85
|
+
if (cache && cacheKey) {
|
|
86
|
+
await cache.set(cacheKey, JSON.stringify(result));
|
|
87
|
+
}
|
|
88
|
+
return result;
|
|
89
|
+
} catch (err) {
|
|
90
|
+
return {
|
|
91
|
+
error: `API response error: ${String(err)}: ${JSON.stringify(response)}`,
|
|
92
|
+
};
|
|
93
|
+
}
|
|
94
|
+
}
|
|
95
|
+
}
|
package/src/providers/shared.ts
CHANGED
|
@@ -1,3 +1,32 @@
|
|
|
1
|
+
import yaml from 'js-yaml';
|
|
2
|
+
|
|
1
3
|
export const REQUEST_TIMEOUT_MS = process.env.REQUEST_TIMEOUT_MS
|
|
2
4
|
? parseInt(process.env.REQUEST_TIMEOUT_MS, 10)
|
|
3
5
|
: 300_000;
|
|
6
|
+
|
|
7
|
+
export function parseChatPrompt(prompt: string): { role: string; content: string; name?: string }[] {
|
|
8
|
+
const trimmedPrompt = prompt.trim();
|
|
9
|
+
if (trimmedPrompt.startsWith('- role:')) {
|
|
10
|
+
try {
|
|
11
|
+
// Try YAML
|
|
12
|
+
return yaml.load(prompt) as { role: string; content: string }[];
|
|
13
|
+
} catch (err) {
|
|
14
|
+
throw new Error(`Chat Completion prompt is not a valid YAML string: ${err}\n\n${prompt}`);
|
|
15
|
+
}
|
|
16
|
+
} else {
|
|
17
|
+
try {
|
|
18
|
+
// Try JSON
|
|
19
|
+
return JSON.parse(prompt) as { role: string; content: string }[];
|
|
20
|
+
} catch (err) {
|
|
21
|
+
if (
|
|
22
|
+
process.env.PROMPTFOO_REQUIRE_JSON_PROMPTS ||
|
|
23
|
+
trimmedPrompt.startsWith('{') ||
|
|
24
|
+
trimmedPrompt.startsWith('[')
|
|
25
|
+
) {
|
|
26
|
+
throw new Error(`Chat Completion prompt is not a valid JSON string: ${err}\n\n${prompt}`);
|
|
27
|
+
}
|
|
28
|
+
// Fall back to wrapping the prompt in a user message
|
|
29
|
+
return [{ role: 'user', content: prompt }];
|
|
30
|
+
}
|
|
31
|
+
}
|
|
32
|
+
}
|
package/src/providers.ts
CHANGED
|
@@ -4,6 +4,7 @@ import { ApiProvider, ProviderConfig, ProviderId, RawProviderConfig } from './ty
|
|
|
4
4
|
|
|
5
5
|
import { OpenAiCompletionProvider, OpenAiChatCompletionProvider } from './providers/openai';
|
|
6
6
|
import { AnthropicCompletionProvider } from './providers/anthropic';
|
|
7
|
+
import { ReplicateProvider } from './providers/replicate';
|
|
7
8
|
import { LocalAiCompletionProvider, LocalAiChatProvider } from './providers/localai';
|
|
8
9
|
import { ScriptCompletionProvider } from './providers/scriptCompletion';
|
|
9
10
|
import {
|
|
@@ -106,6 +107,12 @@ export async function loadApiProvider(
|
|
|
106
107
|
`Unknown Anthropic model type: ${modelType}. Use one of the following providers: anthropic:completion:<model name>`,
|
|
107
108
|
);
|
|
108
109
|
}
|
|
110
|
+
} else if (providerPath?.startsWith('replicate:')) {
|
|
111
|
+
// Load Replicate module
|
|
112
|
+
const options = providerPath.split(':');
|
|
113
|
+
const modelName = options.slice(1).join(':');
|
|
114
|
+
|
|
115
|
+
return new ReplicateProvider(modelName, undefined, context?.config);
|
|
109
116
|
}
|
|
110
117
|
|
|
111
118
|
if (providerPath?.startsWith('localai:')) {
|
|
@@ -131,6 +138,7 @@ export default {
|
|
|
131
138
|
OpenAiCompletionProvider,
|
|
132
139
|
OpenAiChatCompletionProvider,
|
|
133
140
|
AnthropicCompletionProvider,
|
|
141
|
+
ReplicateProvider,
|
|
134
142
|
LocalAiCompletionProvider,
|
|
135
143
|
LocalAiChatProvider,
|
|
136
144
|
loadApiProvider,
|
package/src/types.ts
CHANGED
|
@@ -6,6 +6,7 @@ export interface CommandLineOptions {
|
|
|
6
6
|
|
|
7
7
|
// Shared with EvaluateOptions
|
|
8
8
|
maxConcurrency: string;
|
|
9
|
+
repeat: string;
|
|
9
10
|
|
|
10
11
|
// Command line only
|
|
11
12
|
vars?: string;
|
|
@@ -29,6 +30,7 @@ export interface CommandLineOptions {
|
|
|
29
30
|
export interface ProviderConfig {
|
|
30
31
|
id: ProviderId;
|
|
31
32
|
config?: any;
|
|
33
|
+
prompts?: string[]; // List of prompt display strings
|
|
32
34
|
}
|
|
33
35
|
|
|
34
36
|
export interface ApiProvider {
|
|
@@ -75,6 +77,7 @@ export interface EvaluateOptions {
|
|
|
75
77
|
maxConcurrency?: number;
|
|
76
78
|
showProgressBar?: boolean;
|
|
77
79
|
generateSuggestions?: boolean;
|
|
80
|
+
repeat?: number;
|
|
78
81
|
}
|
|
79
82
|
|
|
80
83
|
export interface Prompt {
|
|
@@ -201,6 +204,10 @@ export interface TestSuite {
|
|
|
201
204
|
// One or more prompt strings
|
|
202
205
|
prompts: Prompt[];
|
|
203
206
|
|
|
207
|
+
// Optional mapping of provider to prompt display strings. If not provided,
|
|
208
|
+
// all prompts are used for all providers.
|
|
209
|
+
providerPromptMap?: Record<string, string[]>;
|
|
210
|
+
|
|
204
211
|
// Test cases
|
|
205
212
|
tests?: TestCase[];
|
|
206
213
|
|
package/src/util.ts
CHANGED
|
@@ -25,8 +25,33 @@ import type {
|
|
|
25
25
|
UnifiedConfig,
|
|
26
26
|
TestCase,
|
|
27
27
|
Prompt,
|
|
28
|
+
RawProviderConfig,
|
|
29
|
+
TestSuite,
|
|
28
30
|
} from './types';
|
|
29
31
|
|
|
32
|
+
export function readProviderPromptMap(config: Partial<UnifiedConfig>, parsedPrompts: Prompt[]): TestSuite["providerPromptMap"] {
|
|
33
|
+
const ret: Record<string, string[]> = {};
|
|
34
|
+
|
|
35
|
+
if (!config.providers) {
|
|
36
|
+
return ret;
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
const allPrompts = [];
|
|
40
|
+
for (const prompt of parsedPrompts) {
|
|
41
|
+
allPrompts.push(prompt.display);
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
for (const provider of config.providers) {
|
|
45
|
+
if (typeof provider === 'object') {
|
|
46
|
+
const rawProvider = provider as RawProviderConfig;
|
|
47
|
+
const id = Object.keys(rawProvider)[0];
|
|
48
|
+
ret[id] = rawProvider[id].prompts || allPrompts;
|
|
49
|
+
}
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
return ret;
|
|
53
|
+
}
|
|
54
|
+
|
|
30
55
|
const PROMPT_DELIMITER = '---';
|
|
31
56
|
|
|
32
57
|
function parseJson(json: string): any | undefined {
|
|
@@ -37,6 +37,12 @@ function App() {
|
|
|
37
37
|
}
|
|
38
38
|
};
|
|
39
39
|
|
|
40
|
+
React.useEffect(() => {
|
|
41
|
+
if (prefersDarkMode) {
|
|
42
|
+
document.documentElement.setAttribute('data-theme', 'dark');
|
|
43
|
+
}
|
|
44
|
+
}, [prefersDarkMode]);
|
|
45
|
+
|
|
40
46
|
React.useEffect(() => {
|
|
41
47
|
const fetchEvalData = async (id: string) => {
|
|
42
48
|
if (loadedFromApi.current) {
|
|
@@ -37,7 +37,7 @@ export default function EvalOutputPromptDialog({
|
|
|
37
37
|
<Dialog open={open} onClose={onClose} fullWidth maxWidth="lg">
|
|
38
38
|
<DialogTitle>Prompt</DialogTitle>
|
|
39
39
|
<DialogContent>
|
|
40
|
-
<TextareaAutosize readOnly value={prompt} style={{ width: '100%' }} />
|
|
40
|
+
<TextareaAutosize readOnly value={prompt} style={{ width: '100%', padding: '0.75rem' }} />
|
|
41
41
|
<IconButton
|
|
42
42
|
onClick={() => copyToClipboard(prompt)}
|
|
43
43
|
style={{ position: 'absolute', right: '10px', top: '10px' }}
|
|
@@ -49,7 +49,11 @@ export default function EvalOutputPromptDialog({
|
|
|
49
49
|
<>
|
|
50
50
|
<DialogTitle>Output</DialogTitle>
|
|
51
51
|
<DialogContent>
|
|
52
|
-
<TextareaAutosize
|
|
52
|
+
<TextareaAutosize
|
|
53
|
+
readOnly
|
|
54
|
+
value={output}
|
|
55
|
+
style={{ width: '100%', padding: '0.75rem' }}
|
|
56
|
+
/>
|
|
53
57
|
</DialogContent>
|
|
54
58
|
</>
|
|
55
59
|
)}
|
|
@@ -334,6 +334,11 @@ export default function ResultsTable({
|
|
|
334
334
|
return failureFilter[columnId] && isFail;
|
|
335
335
|
});
|
|
336
336
|
});
|
|
337
|
+
} else if (filterMode === 'different') {
|
|
338
|
+
return body.filter((row) => {
|
|
339
|
+
// TODO(ian): This works for strings, but not objects.
|
|
340
|
+
return !row.outputs.every((output) => output.text === row.outputs[0].text);
|
|
341
|
+
});
|
|
337
342
|
}
|
|
338
343
|
return body;
|
|
339
344
|
}, [body, failureFilter, filterMode]);
|
|
@@ -181,7 +181,8 @@ export default function ResultsView() {
|
|
|
181
181
|
label="Filter"
|
|
182
182
|
>
|
|
183
183
|
<MenuItem value="all">Show all results</MenuItem>
|
|
184
|
-
<MenuItem value="failures">Show only
|
|
184
|
+
<MenuItem value="failures">Show failures only</MenuItem>
|
|
185
|
+
<MenuItem value="different">Show different only</MenuItem>
|
|
185
186
|
</Select>
|
|
186
187
|
</FormControl>
|
|
187
188
|
</Box>
|
|
@@ -21,19 +21,8 @@
|
|
|
21
21
|
}
|
|
22
22
|
|
|
23
23
|
/* Dark mode colors */
|
|
24
|
-
@media (prefers-color-scheme: dark) {
|
|
25
|
-
:root {
|
|
26
|
-
--background-color: #1a1a1a;
|
|
27
|
-
--text-color: #f0f0f0;
|
|
28
|
-
--border-color: #444444;
|
|
29
|
-
--table-border-color: #444444;
|
|
30
|
-
--pass-color: #4caf50;
|
|
31
|
-
--fail-color: #f44336;
|
|
32
|
-
--smalltext-color: #888888;
|
|
33
|
-
}
|
|
34
|
-
}
|
|
35
|
-
|
|
36
24
|
[data-theme='dark'] {
|
|
25
|
+
/* Keep synced with prefers-color-scheme above */
|
|
37
26
|
--background-color: #1a1a1a;
|
|
38
27
|
--text-color: #f0f0f0;
|
|
39
28
|
--border-color: #444444;
|