promptfoo 0.17.5 → 0.17.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/package.json +2 -1
- package/dist/src/cache.d.ts +3 -0
- package/dist/src/cache.d.ts.map +1 -1
- package/dist/src/cache.js +6 -1
- package/dist/src/cache.js.map +1 -1
- package/dist/src/evaluator.d.ts.map +1 -1
- package/dist/src/evaluator.js +21 -17
- package/dist/src/evaluator.js.map +1 -1
- package/dist/src/index.d.ts +1 -0
- package/dist/src/index.d.ts.map +1 -1
- package/dist/src/main.js +9 -5
- package/dist/src/main.js.map +1 -1
- package/dist/src/providers/azureopenai.d.ts.map +1 -1
- package/dist/src/providers/azureopenai.js +1 -13
- package/dist/src/providers/azureopenai.js.map +1 -1
- package/dist/src/providers/localai.d.ts.map +1 -1
- package/dist/src/providers/localai.js +2 -1
- package/dist/src/providers/localai.js.map +1 -1
- package/dist/src/providers/openai.d.ts.map +1 -1
- package/dist/src/providers/openai.js +1 -27
- package/dist/src/providers/openai.js.map +1 -1
- package/dist/src/providers/replicate.d.ts +11 -0
- package/dist/src/providers/replicate.d.ts.map +1 -0
- package/dist/src/providers/replicate.js +78 -0
- package/dist/src/providers/replicate.js.map +1 -0
- package/dist/src/providers/shared.d.ts +5 -0
- package/dist/src/providers/shared.d.ts.map +1 -1
- package/dist/src/providers/shared.js +33 -1
- package/dist/src/providers/shared.js.map +1 -1
- package/dist/src/providers.d.ts +2 -0
- package/dist/src/providers.d.ts.map +1 -1
- package/dist/src/providers.js +8 -0
- package/dist/src/providers.js.map +1 -1
- package/dist/src/types.d.ts +2 -0
- package/dist/src/types.d.ts.map +1 -1
- package/dist/src/web/client/assets/{index-c2756e5d.js → index-13198388.js} +23 -23
- package/dist/src/web/client/assets/index-f9b230d1.css +1 -0
- package/dist/src/web/client/index.html +2 -2
- package/package.json +2 -1
- package/src/cache.ts +5 -1
- package/src/evaluator.ts +23 -17
- package/src/main.ts +13 -5
- package/src/providers/azureopenai.ts +2 -18
- package/src/providers/localai.ts +3 -2
- package/src/providers/openai.ts +2 -33
- package/src/providers/replicate.ts +86 -0
- package/src/providers/shared.ts +29 -0
- package/src/providers.ts +8 -0
- package/src/types.ts +2 -0
- package/src/web/client/src/App.tsx +6 -0
- package/src/web/client/src/EvalOutputPromptDialog.tsx +6 -2
- package/src/web/client/src/ResultsTable.tsx +5 -0
- package/src/web/client/src/ResultsView.tsx +2 -1
- package/src/web/client/src/index.css +1 -12
- package/src/web/client/src/types.ts +1 -1
- package/dist/src/web/client/assets/index-b82d0138.css +0 -1
|
@@ -0,0 +1 @@
|
|
|
1
|
+
:root{font-family:system-ui,Avenir,Helvetica,Arial,sans-serif;font-synthesis:none;text-rendering:optimizeLegibility;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale;-webkit-text-size-adjust:100%;--background-color: #ffffff;--text-color: #404040;--border-color: lightgray;--table-border-color: lightgray;--pass-color: green;--fail-color: #ad0000;--smalltext-color: gray;--success-background-color: #d1ffd7;--variable-background-color: #f7f7f7;--header-background-color: #fffdf7}[data-theme=dark]{--background-color: #1a1a1a;--text-color: #f0f0f0;--border-color: #444444;--table-border-color: #444444;--pass-color: #4caf50;--fail-color: #f44336;--smalltext-color: #888888;--success-background-color: #216d2b;--variable-background-color: #333;--header-background-color: #333}html{font-size:calc(14px + (18 - 14) * ((100vw - 300px) / (1600 - 300)))}*{box-sizing:border-box}html{font-family:-apple-system,BlinkMacSystemFont,Segoe UI,Helvetica,Arial,sans-serif,"Apple Color Emoji","Segoe UI Emoji",Segoe UI Symbol;font-size:16px;background-color:var(--background-color);color:var(--text-color)}table,.divTable{border:1px solid var(--table-border-color);border-collapse:collapse;width:100%;margin:1rem 0;box-shadow:0 2px 4px #0000001a}.tr{display:flex}tr,.tr{width:fit-content}tr:hover,.tr:hover{background-color:#0000000d}th,.th,td,.td{position:relative;box-shadow:inset 0 0 0 1px var(--border-color);vertical-align:top;padding:1.5rem}th.variable,.th.variable,td.variable,.td.variable{background-color:var(--variable-background-color)}tr.header{background-color:var(--header-background-color)}th,.th{padding:1rem;position:relative;text-align:center;vertical-align:bottom}th .action{cursor:pointer;margin-left:.5rem}tr .cell-actions{display:flex;gap:.5rem;visibility:hidden;position:absolute;bottom:1.25rem;right:0;line-height:0;font-size:1.75rem}tr:hover .cell-actions{visibility:visible}tr .cell-actions .action{cursor:pointer}th 
.smalltext{visibility:hidden;font-weight:400;font-size:.75rem;color:var(--smalltext-color)}th:hover .smalltext{visibility:visible}th .summary{font-weight:400;font-size:.8rem;padding:.25rem}th .summary.highlight{background-color:var(--success-background-color)}td .status{margin-bottom:.5rem;font-weight:700}td .score{font-weight:400}td .pass{color:var(--pass-color)}td .fail{color:var(--fail-color)}.first-prompt-col{border-left:2px solid #888}.first-prompt-row{border-top:2px solid #888}.resizer{position:absolute;right:0;top:0;height:100%;width:5px;cursor:col-resize;user-select:none;touch-action:none;background:var(--text-color);opacity:.5}.resizer.isResizing{background:var(--text-color);opacity:1}@media (hover: hover){.resizer{opacity:0}*:hover>.resizer{opacity:1}}.logo{display:flex;align-items:center;gap:4px}.logo img{width:30px}.logo span{margin-bottom:6px;color:var(--text-color)}[data-theme=dark] .logo img{filter:invert(1)}nav{display:flex;justify-content:space-between;align-items:center;margin-bottom:1rem;color:var(--text-color)}.dark-mode-toggle{background-color:transparent;border:none;color:var(--text-color);cursor:pointer;font-size:16px;padding:8px;transition:color .3s}.dark-mode-toggle:hover{color:var(--pass-color)}body{background-color:var(--background-color);color:var(--text-color)}
|
package/dist/src/web/client/index.html
CHANGED
|
@@ -5,8 +5,8 @@
|
|
|
5
5
|
<link rel="icon" type="image/svg+xml" href="favicon.ico" />
|
|
6
6
|
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
|
7
7
|
<title>promptfoo web viewer</title>
|
|
8
|
-
<script type="module" crossorigin src="/assets/index-c2756e5d.js"></script>
|
|
9
|
-
<link rel="stylesheet" href="/assets/index-b82d0138.css">
|
|
8
|
+
<script type="module" crossorigin src="/assets/index-13198388.js"></script>
|
|
9
|
+
<link rel="stylesheet" href="/assets/index-f9b230d1.css">
|
|
10
10
|
</head>
|
|
11
11
|
<body>
|
|
12
12
|
<div id="root"></div>
|
package/package.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"name": "promptfoo",
|
|
3
3
|
"description": "LLM eval & testing toolkit",
|
|
4
4
|
"author": "Ian Webster",
|
|
5
|
-
"version": "0.17.5",
|
|
5
|
+
"version": "0.17.6",
|
|
6
6
|
"license": "MIT",
|
|
7
7
|
"type": "commonjs",
|
|
8
8
|
"main": "dist/src/index.js",
|
|
@@ -79,6 +79,7 @@
|
|
|
79
79
|
"node-fetch": "^2.6.7",
|
|
80
80
|
"nunjucks": "^3.2.4",
|
|
81
81
|
"opener": "^1.5.2",
|
|
82
|
+
"replicate": "^0.12.3",
|
|
82
83
|
"rouge": "^1.0.3",
|
|
83
84
|
"semver": "^7.5.3",
|
|
84
85
|
"socket.io": "^4.6.1",
|
package/src/cache.ts
CHANGED
|
@@ -20,7 +20,7 @@ let enabled =
|
|
|
20
20
|
const cacheType =
|
|
21
21
|
process.env.PROMPTFOO_CACHE_TYPE || (process.env.NODE_ENV === 'test' ? 'memory' : 'disk');
|
|
22
22
|
|
|
23
|
-
function getCache() {
|
|
23
|
+
export function getCache() {
|
|
24
24
|
if (!cacheInstance) {
|
|
25
25
|
const cachePath =
|
|
26
26
|
process.env.PROMPTFOO_CACHE_PATH || path.join(getConfigDirectoryPath(), 'cache');
|
|
@@ -102,3 +102,7 @@ export async function clearCache() {
|
|
|
102
102
|
logger.info('Clearing cache...');
|
|
103
103
|
return getCache().reset();
|
|
104
104
|
}
|
|
105
|
+
|
|
106
|
+
export function isCacheEnabled() {
|
|
107
|
+
return enabled;
|
|
108
|
+
}
|
package/src/evaluator.ts
CHANGED
|
@@ -33,6 +33,7 @@ interface RunEvalOptions {
|
|
|
33
33
|
|
|
34
34
|
rowIndex: number;
|
|
35
35
|
colIndex: number;
|
|
36
|
+
repeatIndex: number;
|
|
36
37
|
}
|
|
37
38
|
|
|
38
39
|
const DEFAULT_MAX_CONCURRENCY = 4;
|
|
@@ -266,25 +267,30 @@ class Evaluator {
|
|
|
266
267
|
// Finalize test case eval
|
|
267
268
|
const varCombinations = generateVarCombinations(testCase.vars || {});
|
|
268
269
|
totalVarCombinations += varCombinations.length;
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
270
|
+
|
|
271
|
+
const numRepeat = this.options.repeat || 1;
|
|
272
|
+
for (let repeatIndex = 0; repeatIndex < numRepeat; repeatIndex++) {
|
|
273
|
+
for (const vars of varCombinations) {
|
|
274
|
+
let colIndex = 0;
|
|
275
|
+
for (const prompt of testSuite.prompts) {
|
|
276
|
+
for (const provider of testSuite.providers) {
|
|
277
|
+
runEvalOptions.push({
|
|
278
|
+
provider,
|
|
279
|
+
prompt: {
|
|
280
|
+
...prompt,
|
|
281
|
+
raw: prependToPrompt + prompt.raw + appendToPrompt,
|
|
282
|
+
},
|
|
283
|
+
test: { ...testCase, vars },
|
|
284
|
+
includeProviderId: testSuite.providers.length > 1,
|
|
285
|
+
rowIndex,
|
|
286
|
+
colIndex,
|
|
287
|
+
repeatIndex,
|
|
288
|
+
});
|
|
289
|
+
colIndex++;
|
|
290
|
+
}
|
|
285
291
|
}
|
|
292
|
+
rowIndex++;
|
|
286
293
|
}
|
|
287
|
-
rowIndex++;
|
|
288
294
|
}
|
|
289
295
|
}
|
|
290
296
|
|
package/src/main.ts
CHANGED
|
@@ -130,7 +130,7 @@ async function main() {
|
|
|
130
130
|
|
|
131
131
|
program
|
|
132
132
|
.command('share')
|
|
133
|
-
.description('
|
|
133
|
+
.description('Create a shareable URL of your most recent eval')
|
|
134
134
|
.option('-y, --yes', 'Skip confirmation')
|
|
135
135
|
.action(async (cmdObj: { yes: boolean } & Command) => {
|
|
136
136
|
telemetry.maybeShowNotice();
|
|
@@ -158,10 +158,9 @@ async function main() {
|
|
|
158
158
|
});
|
|
159
159
|
|
|
160
160
|
reader.question(
|
|
161
|
-
'Are you sure you want to create a
|
|
161
|
+
'Are you sure you want to create a shareable URL of your most recent eval? Anyone you give this URL to will be able to view the results [Y/n] ',
|
|
162
162
|
async function (answer: string) {
|
|
163
|
-
if (answer.toLowerCase() !== 'yes' && answer.toLowerCase() !== 'y') {
|
|
164
|
-
logger.info('Did not create a public URL.');
|
|
163
|
+
if (answer.toLowerCase() !== 'yes' && answer.toLowerCase() !== 'y' && answer !== '') {
|
|
165
164
|
reader.close();
|
|
166
165
|
return;
|
|
167
166
|
}
|
|
@@ -218,6 +217,13 @@ async function main() {
|
|
|
218
217
|
? String(defaultConfig.evaluateOptions.maxConcurrency)
|
|
219
218
|
: undefined,
|
|
220
219
|
)
|
|
220
|
+
.option(
|
|
221
|
+
'--repeat <number>',
|
|
222
|
+
'Number of times to run each test',
|
|
223
|
+
defaultConfig.evaluateOptions?.repeat
|
|
224
|
+
? String(defaultConfig.evaluateOptions.repeat)
|
|
225
|
+
: undefined,
|
|
226
|
+
)
|
|
221
227
|
.option(
|
|
222
228
|
'--table-cell-max-length <number>',
|
|
223
229
|
'Truncate console table cells to this length',
|
|
@@ -263,7 +269,6 @@ async function main() {
|
|
|
263
269
|
}
|
|
264
270
|
|
|
265
271
|
// Config parsing
|
|
266
|
-
const maxConcurrency = parseInt(cmdObj.maxConcurrency || '', 10);
|
|
267
272
|
let fileConfig: Partial<UnifiedConfig> = {};
|
|
268
273
|
const configPath = cmdObj.config;
|
|
269
274
|
if (configPath) {
|
|
@@ -326,12 +331,15 @@ async function main() {
|
|
|
326
331
|
defaultTest,
|
|
327
332
|
};
|
|
328
333
|
|
|
334
|
+
const maxConcurrency = parseInt(cmdObj.maxConcurrency || '', 10);
|
|
335
|
+
const iterations = parseInt(cmdObj.repeat || '', 10);
|
|
329
336
|
const options: EvaluateOptions = {
|
|
330
337
|
showProgressBar:
|
|
331
338
|
typeof cmdObj.progressBar === 'undefined'
|
|
332
339
|
? getLogLevel() !== 'debug'
|
|
333
340
|
: cmdObj.progressBar,
|
|
334
341
|
maxConcurrency: !isNaN(maxConcurrency) && maxConcurrency > 0 ? maxConcurrency : undefined,
|
|
342
|
+
repeat: !isNaN(iterations) && iterations > 0 ? iterations : 1,
|
|
335
343
|
...evaluateOptions,
|
|
336
344
|
};
|
|
337
345
|
|
package/src/providers/azureopenai.ts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import logger from '../logger';
|
|
2
2
|
import { fetchJsonWithCache } from '../cache';
|
|
3
|
-
import { REQUEST_TIMEOUT_MS } from './shared';
|
|
3
|
+
import { REQUEST_TIMEOUT_MS, parseChatPrompt } from './shared';
|
|
4
4
|
|
|
5
5
|
import type { ApiProvider, ProviderEmbeddingResponse, ProviderResponse } from '../types.js';
|
|
6
6
|
|
|
@@ -205,23 +205,7 @@ export class AzureOpenAiChatCompletionProvider extends AzureOpenAiGenericProvide
|
|
|
205
205
|
throw new Error('Azure OpenAI API host must be set');
|
|
206
206
|
}
|
|
207
207
|
|
|
208
|
-
|
|
209
|
-
try {
|
|
210
|
-
messages = JSON.parse(prompt) as { role: string; content: string }[];
|
|
211
|
-
} catch (err) {
|
|
212
|
-
const trimmedPrompt = prompt.trim();
|
|
213
|
-
if (
|
|
214
|
-
process.env.PROMPTFOO_REQUIRE_JSON_PROMPTS ||
|
|
215
|
-
trimmedPrompt.startsWith('{') ||
|
|
216
|
-
trimmedPrompt.startsWith('[')
|
|
217
|
-
) {
|
|
218
|
-
throw new Error(
|
|
219
|
-
`Azure OpenAI Chat Completion prompt is not a valid JSON string: ${err}\n\n${prompt}`,
|
|
220
|
-
);
|
|
221
|
-
}
|
|
222
|
-
messages = [{ role: 'user', content: prompt }];
|
|
223
|
-
}
|
|
224
|
-
|
|
208
|
+
const messages = parseChatPrompt(prompt);
|
|
225
209
|
const body = {
|
|
226
210
|
model: this.deploymentName,
|
|
227
211
|
messages: messages,
|
package/src/providers/localai.ts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import logger from '../logger';
|
|
2
2
|
import { fetchJsonWithCache } from '../cache';
|
|
3
|
-
import { REQUEST_TIMEOUT_MS } from './shared';
|
|
3
|
+
import { REQUEST_TIMEOUT_MS, parseChatPrompt } from './shared';
|
|
4
4
|
|
|
5
5
|
import type { ApiProvider, ProviderResponse } from '../types.js';
|
|
6
6
|
|
|
@@ -29,9 +29,10 @@ class LocalAiGenericProvider implements ApiProvider {
|
|
|
29
29
|
|
|
30
30
|
export class LocalAiChatProvider extends LocalAiGenericProvider {
|
|
31
31
|
async callApi(prompt: string): Promise<ProviderResponse> {
|
|
32
|
+
const messages = parseChatPrompt(prompt);
|
|
32
33
|
const body = {
|
|
33
34
|
model: this.modelName,
|
|
34
|
-
|
|
35
|
+
messages: messages,
|
|
35
36
|
temperature: process.env.LOCALAI_TEMPERATURE || 0.7,
|
|
36
37
|
};
|
|
37
38
|
logger.debug(`Calling LocalAI API: ${JSON.stringify(body)}`);
|
package/src/providers/openai.ts
CHANGED
|
@@ -1,8 +1,7 @@
|
|
|
1
|
-
import yaml from 'js-yaml';
|
|
2
1
|
|
|
3
2
|
import logger from '../logger';
|
|
4
3
|
import { fetchJsonWithCache } from '../cache';
|
|
5
|
-
import { REQUEST_TIMEOUT_MS } from './shared';
|
|
4
|
+
import { REQUEST_TIMEOUT_MS, parseChatPrompt } from './shared';
|
|
6
5
|
|
|
7
6
|
import type { ApiProvider, ProviderEmbeddingResponse, ProviderResponse } from '../types.js';
|
|
8
7
|
|
|
@@ -227,37 +226,7 @@ export class OpenAiChatCompletionProvider extends OpenAiGenericProvider {
|
|
|
227
226
|
);
|
|
228
227
|
}
|
|
229
228
|
|
|
230
|
-
|
|
231
|
-
const trimmedPrompt = prompt.trim();
|
|
232
|
-
if (trimmedPrompt.startsWith('- role:')) {
|
|
233
|
-
try {
|
|
234
|
-
// Try YAML
|
|
235
|
-
messages = yaml.load(prompt) as { role: string; content: string }[];
|
|
236
|
-
} catch (err) {
|
|
237
|
-
throw new Error(
|
|
238
|
-
`OpenAI Chat Completion prompt is not a valid YAML string: ${err}\n\n${prompt}`,
|
|
239
|
-
);
|
|
240
|
-
}
|
|
241
|
-
} else {
|
|
242
|
-
try {
|
|
243
|
-
// Try JSON
|
|
244
|
-
messages = JSON.parse(prompt) as { role: string; content: string }[];
|
|
245
|
-
} catch (err) {
|
|
246
|
-
if (
|
|
247
|
-
process.env.PROMPTFOO_REQUIRE_JSON_PROMPTS ||
|
|
248
|
-
trimmedPrompt.startsWith('{') ||
|
|
249
|
-
trimmedPrompt.startsWith('[')
|
|
250
|
-
) {
|
|
251
|
-
throw new Error(
|
|
252
|
-
`OpenAI Chat Completion prompt is not a valid JSON string: ${err}\n\n${prompt}`,
|
|
253
|
-
);
|
|
254
|
-
}
|
|
255
|
-
|
|
256
|
-
// Fall back to wrapping the prompt in a user message
|
|
257
|
-
messages = [{ role: 'user', content: prompt }];
|
|
258
|
-
}
|
|
259
|
-
}
|
|
260
|
-
|
|
229
|
+
const messages = parseChatPrompt(prompt);
|
|
261
230
|
const body = {
|
|
262
231
|
model: this.modelName,
|
|
263
232
|
messages: messages,
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
import Replicate from 'replicate';
|
|
2
|
+
|
|
3
|
+
import fetch from 'node-fetch';
|
|
4
|
+
import logger from '../logger';
|
|
5
|
+
import { getCache, isCacheEnabled } from '../cache';
|
|
6
|
+
|
|
7
|
+
import type { ApiProvider, ProviderResponse } from '../types.js';
|
|
8
|
+
|
|
9
|
+
export class ReplicateProvider implements ApiProvider {
|
|
10
|
+
modelName: string;
|
|
11
|
+
apiKey?: string;
|
|
12
|
+
replicate: any;
|
|
13
|
+
|
|
14
|
+
constructor(modelName: string, apiKey?: string) {
|
|
15
|
+
this.modelName = modelName;
|
|
16
|
+
this.apiKey = apiKey || process.env.REPLICATE_API_TOKEN || process.env.REPLICATE_API_KEY;
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
id(): string {
|
|
20
|
+
return `replicate:${this.modelName}`;
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
toString(): string {
|
|
24
|
+
return `[Replicate Provider ${this.modelName}]`;
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
async callApi(prompt: string): Promise<ProviderResponse> {
|
|
28
|
+
if (!this.apiKey) {
|
|
29
|
+
throw new Error(
|
|
30
|
+
'Replicate API key is not set. Set REPLICATE_API_TOKEN environment variable or pass it as an argument to the constructor.',
|
|
31
|
+
);
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
let cache;
|
|
35
|
+
let cacheKey;
|
|
36
|
+
if (isCacheEnabled()) {
|
|
37
|
+
cache = await getCache();
|
|
38
|
+
cacheKey = `replicate:${this.modelName}:${prompt}`;
|
|
39
|
+
|
|
40
|
+
// Try to get the cached response
|
|
41
|
+
const cachedResponse = await cache.get(cacheKey);
|
|
42
|
+
|
|
43
|
+
if (cachedResponse) {
|
|
44
|
+
logger.debug(`Returning cached response for ${prompt}: ${cachedResponse}`);
|
|
45
|
+
return JSON.parse(cachedResponse as string);
|
|
46
|
+
}
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
const replicate = new Replicate({
|
|
50
|
+
auth: this.apiKey,
|
|
51
|
+
fetch,
|
|
52
|
+
});
|
|
53
|
+
|
|
54
|
+
logger.debug(`Calling Replicate: ${prompt}`);
|
|
55
|
+
let response;
|
|
56
|
+
try {
|
|
57
|
+
response = await replicate.run(this.modelName as any, {
|
|
58
|
+
input: {
|
|
59
|
+
prompt,
|
|
60
|
+
max_length: process.env.REPLICATE_MAX_LENGTH || 2046,
|
|
61
|
+
temperature: process.env.REPLICATE_TEMPERATURE || 0.5,
|
|
62
|
+
repetition_penalty: process.env.REPLICATE_REPETITION_PENALTY || 1.0,
|
|
63
|
+
},
|
|
64
|
+
});
|
|
65
|
+
} catch (err) {
|
|
66
|
+
return {
|
|
67
|
+
error: `API call error: ${String(err)}`,
|
|
68
|
+
};
|
|
69
|
+
}
|
|
70
|
+
logger.debug(`\tReplicate API response: ${JSON.stringify(response)}`);
|
|
71
|
+
try {
|
|
72
|
+
const result = {
|
|
73
|
+
output: (response as string[]).join(''),
|
|
74
|
+
tokenUsage: {}, // TODO: add token usage once Replicate API supports it
|
|
75
|
+
};
|
|
76
|
+
if (cache && cacheKey) {
|
|
77
|
+
await cache.set(cacheKey, JSON.stringify(result));
|
|
78
|
+
}
|
|
79
|
+
return result;
|
|
80
|
+
} catch (err) {
|
|
81
|
+
return {
|
|
82
|
+
error: `API response error: ${String(err)}: ${JSON.stringify(response)}`,
|
|
83
|
+
};
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
}
|
package/src/providers/shared.ts
CHANGED
|
@@ -1,3 +1,32 @@
|
|
|
1
|
+
import yaml from 'js-yaml';
|
|
2
|
+
|
|
1
3
|
export const REQUEST_TIMEOUT_MS = process.env.REQUEST_TIMEOUT_MS
|
|
2
4
|
? parseInt(process.env.REQUEST_TIMEOUT_MS, 10)
|
|
3
5
|
: 300_000;
|
|
6
|
+
|
|
7
|
+
export function parseChatPrompt(prompt: string): { role: string; content: string; name?: string }[] {
|
|
8
|
+
const trimmedPrompt = prompt.trim();
|
|
9
|
+
if (trimmedPrompt.startsWith('- role:')) {
|
|
10
|
+
try {
|
|
11
|
+
// Try YAML
|
|
12
|
+
return yaml.load(prompt) as { role: string; content: string }[];
|
|
13
|
+
} catch (err) {
|
|
14
|
+
throw new Error(`Chat Completion prompt is not a valid YAML string: ${err}\n\n${prompt}`);
|
|
15
|
+
}
|
|
16
|
+
} else {
|
|
17
|
+
try {
|
|
18
|
+
// Try JSON
|
|
19
|
+
return JSON.parse(prompt) as { role: string; content: string }[];
|
|
20
|
+
} catch (err) {
|
|
21
|
+
if (
|
|
22
|
+
process.env.PROMPTFOO_REQUIRE_JSON_PROMPTS ||
|
|
23
|
+
trimmedPrompt.startsWith('{') ||
|
|
24
|
+
trimmedPrompt.startsWith('[')
|
|
25
|
+
) {
|
|
26
|
+
throw new Error(`Chat Completion prompt is not a valid JSON string: ${err}\n\n${prompt}`);
|
|
27
|
+
}
|
|
28
|
+
// Fall back to wrapping the prompt in a user message
|
|
29
|
+
return [{ role: 'user', content: prompt }];
|
|
30
|
+
}
|
|
31
|
+
}
|
|
32
|
+
}
|
package/src/providers.ts
CHANGED
|
@@ -4,6 +4,7 @@ import { ApiProvider, ProviderConfig, ProviderId, RawProviderConfig } from './ty
|
|
|
4
4
|
|
|
5
5
|
import { OpenAiCompletionProvider, OpenAiChatCompletionProvider } from './providers/openai';
|
|
6
6
|
import { AnthropicCompletionProvider } from './providers/anthropic';
|
|
7
|
+
import { ReplicateProvider } from './providers/replicate';
|
|
7
8
|
import { LocalAiCompletionProvider, LocalAiChatProvider } from './providers/localai';
|
|
8
9
|
import { ScriptCompletionProvider } from './providers/scriptCompletion';
|
|
9
10
|
import {
|
|
@@ -106,6 +107,12 @@ export async function loadApiProvider(
|
|
|
106
107
|
`Unknown Anthropic model type: ${modelType}. Use one of the following providers: anthropic:completion:<model name>`,
|
|
107
108
|
);
|
|
108
109
|
}
|
|
110
|
+
} else if (providerPath?.startsWith('replicate:')) {
|
|
111
|
+
// Load Replicate module
|
|
112
|
+
const options = providerPath.split(':');
|
|
113
|
+
const modelName = options.slice(1).join(':');
|
|
114
|
+
|
|
115
|
+
return new ReplicateProvider(modelName, undefined);
|
|
109
116
|
}
|
|
110
117
|
|
|
111
118
|
if (providerPath?.startsWith('localai:')) {
|
|
@@ -131,6 +138,7 @@ export default {
|
|
|
131
138
|
OpenAiCompletionProvider,
|
|
132
139
|
OpenAiChatCompletionProvider,
|
|
133
140
|
AnthropicCompletionProvider,
|
|
141
|
+
ReplicateProvider,
|
|
134
142
|
LocalAiCompletionProvider,
|
|
135
143
|
LocalAiChatProvider,
|
|
136
144
|
loadApiProvider,
|
package/src/types.ts
CHANGED
|
@@ -6,6 +6,7 @@ export interface CommandLineOptions {
|
|
|
6
6
|
|
|
7
7
|
// Shared with EvaluateOptions
|
|
8
8
|
maxConcurrency: string;
|
|
9
|
+
repeat: string;
|
|
9
10
|
|
|
10
11
|
// Command line only
|
|
11
12
|
vars?: string;
|
|
@@ -75,6 +76,7 @@ export interface EvaluateOptions {
|
|
|
75
76
|
maxConcurrency?: number;
|
|
76
77
|
showProgressBar?: boolean;
|
|
77
78
|
generateSuggestions?: boolean;
|
|
79
|
+
repeat?: number;
|
|
78
80
|
}
|
|
79
81
|
|
|
80
82
|
export interface Prompt {
|
package/src/web/client/src/App.tsx
CHANGED
|
@@ -37,6 +37,12 @@ function App() {
|
|
|
37
37
|
}
|
|
38
38
|
};
|
|
39
39
|
|
|
40
|
+
React.useEffect(() => {
|
|
41
|
+
if (prefersDarkMode) {
|
|
42
|
+
document.documentElement.setAttribute('data-theme', 'dark');
|
|
43
|
+
}
|
|
44
|
+
}, [prefersDarkMode]);
|
|
45
|
+
|
|
40
46
|
React.useEffect(() => {
|
|
41
47
|
const fetchEvalData = async (id: string) => {
|
|
42
48
|
if (loadedFromApi.current) {
|
package/src/web/client/src/EvalOutputPromptDialog.tsx
CHANGED
|
@@ -37,7 +37,7 @@ export default function EvalOutputPromptDialog({
|
|
|
37
37
|
<Dialog open={open} onClose={onClose} fullWidth maxWidth="lg">
|
|
38
38
|
<DialogTitle>Prompt</DialogTitle>
|
|
39
39
|
<DialogContent>
|
|
40
|
-
<TextareaAutosize readOnly value={prompt} style={{ width: '100%' }} />
|
|
40
|
+
<TextareaAutosize readOnly value={prompt} style={{ width: '100%', padding: '0.75rem' }} />
|
|
41
41
|
<IconButton
|
|
42
42
|
onClick={() => copyToClipboard(prompt)}
|
|
43
43
|
style={{ position: 'absolute', right: '10px', top: '10px' }}
|
|
@@ -49,7 +49,11 @@ export default function EvalOutputPromptDialog({
|
|
|
49
49
|
<>
|
|
50
50
|
<DialogTitle>Output</DialogTitle>
|
|
51
51
|
<DialogContent>
|
|
52
|
-
<TextareaAutosize
|
|
52
|
+
<TextareaAutosize
|
|
53
|
+
readOnly
|
|
54
|
+
value={output}
|
|
55
|
+
style={{ width: '100%', padding: '0.75rem' }}
|
|
56
|
+
/>
|
|
53
57
|
</DialogContent>
|
|
54
58
|
</>
|
|
55
59
|
)}
|
package/src/web/client/src/ResultsTable.tsx
CHANGED
|
@@ -334,6 +334,11 @@ export default function ResultsTable({
|
|
|
334
334
|
return failureFilter[columnId] && isFail;
|
|
335
335
|
});
|
|
336
336
|
});
|
|
337
|
+
} else if (filterMode === 'different') {
|
|
338
|
+
return body.filter((row) => {
|
|
339
|
+
// TODO(ian): This works for strings, but not objects.
|
|
340
|
+
return !row.outputs.every((output) => output.text === row.outputs[0].text);
|
|
341
|
+
});
|
|
337
342
|
}
|
|
338
343
|
return body;
|
|
339
344
|
}, [body, failureFilter, filterMode]);
|
package/src/web/client/src/ResultsView.tsx
CHANGED
|
@@ -181,7 +181,8 @@ export default function ResultsView() {
|
|
|
181
181
|
label="Filter"
|
|
182
182
|
>
|
|
183
183
|
<MenuItem value="all">Show all results</MenuItem>
|
|
184
|
-
<MenuItem value="failures">Show only
|
|
184
|
+
<MenuItem value="failures">Show failures only</MenuItem>
|
|
185
|
+
<MenuItem value="different">Show different only</MenuItem>
|
|
185
186
|
</Select>
|
|
186
187
|
</FormControl>
|
|
187
188
|
</Box>
|
package/src/web/client/src/index.css
CHANGED
|
@@ -21,19 +21,8 @@
|
|
|
21
21
|
}
|
|
22
22
|
|
|
23
23
|
/* Dark mode colors */
|
|
24
|
-
@media (prefers-color-scheme: dark) {
|
|
25
|
-
:root {
|
|
26
|
-
--background-color: #1a1a1a;
|
|
27
|
-
--text-color: #f0f0f0;
|
|
28
|
-
--border-color: #444444;
|
|
29
|
-
--table-border-color: #444444;
|
|
30
|
-
--pass-color: #4caf50;
|
|
31
|
-
--fail-color: #f44336;
|
|
32
|
-
--smalltext-color: #888888;
|
|
33
|
-
}
|
|
34
|
-
}
|
|
35
|
-
|
|
36
24
|
[data-theme='dark'] {
|
|
25
|
+
/* Keep synced with prefers-color-scheme above */
|
|
37
26
|
--background-color: #1a1a1a;
|
|
38
27
|
--text-color: #f0f0f0;
|
|
39
28
|
--border-color: #444444;
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
:root{font-family:system-ui,Avenir,Helvetica,Arial,sans-serif;font-synthesis:none;text-rendering:optimizeLegibility;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale;-webkit-text-size-adjust:100%;--background-color: #ffffff;--text-color: #404040;--border-color: lightgray;--table-border-color: lightgray;--pass-color: green;--fail-color: #ad0000;--smalltext-color: gray;--success-background-color: #d1ffd7;--variable-background-color: #f7f7f7;--header-background-color: #fffdf7}@media (prefers-color-scheme: dark){:root{--background-color: #1a1a1a;--text-color: #f0f0f0;--border-color: #444444;--table-border-color: #444444;--pass-color: #4caf50;--fail-color: #f44336;--smalltext-color: #888888}}[data-theme=dark]{--background-color: #1a1a1a;--text-color: #f0f0f0;--border-color: #444444;--table-border-color: #444444;--pass-color: #4caf50;--fail-color: #f44336;--smalltext-color: #888888;--success-background-color: #216d2b;--variable-background-color: #333;--header-background-color: #333}html{font-size:calc(14px + (18 - 14) * ((100vw - 300px) / (1600 - 300)))}*{box-sizing:border-box}html{font-family:-apple-system,BlinkMacSystemFont,Segoe UI,Helvetica,Arial,sans-serif,"Apple Color Emoji","Segoe UI Emoji",Segoe UI Symbol;font-size:16px;background-color:var(--background-color);color:var(--text-color)}table,.divTable{border:1px solid var(--table-border-color);border-collapse:collapse;width:100%;margin:1rem 0;box-shadow:0 2px 4px #0000001a}.tr{display:flex}tr,.tr{width:fit-content}tr:hover,.tr:hover{background-color:#0000000d}th,.th,td,.td{position:relative;box-shadow:inset 0 0 0 1px var(--border-color);vertical-align:top;padding:1.5rem}th.variable,.th.variable,td.variable,.td.variable{background-color:var(--variable-background-color)}tr.header{background-color:var(--header-background-color)}th,.th{padding:1rem;position:relative;text-align:center;vertical-align:bottom}th .action{cursor:pointer;margin-left:.5rem}tr 
.cell-actions{display:flex;gap:.5rem;visibility:hidden;position:absolute;bottom:1.25rem;right:0;line-height:0;font-size:1.75rem}tr:hover .cell-actions{visibility:visible}tr .cell-actions .action{cursor:pointer}th .smalltext{visibility:hidden;font-weight:400;font-size:.75rem;color:var(--smalltext-color)}th:hover .smalltext{visibility:visible}th .summary{font-weight:400;font-size:.8rem;padding:.25rem}th .summary.highlight{background-color:var(--success-background-color)}td .status{margin-bottom:.5rem;font-weight:700}td .score{font-weight:400}td .pass{color:var(--pass-color)}td .fail{color:var(--fail-color)}.first-prompt-col{border-left:2px solid #888}.first-prompt-row{border-top:2px solid #888}.resizer{position:absolute;right:0;top:0;height:100%;width:5px;cursor:col-resize;user-select:none;touch-action:none;background:var(--text-color);opacity:.5}.resizer.isResizing{background:var(--text-color);opacity:1}@media (hover: hover){.resizer{opacity:0}*:hover>.resizer{opacity:1}}.logo{display:flex;align-items:center;gap:4px}.logo img{width:30px}.logo span{margin-bottom:6px;color:var(--text-color)}[data-theme=dark] .logo img{filter:invert(1)}nav{display:flex;justify-content:space-between;align-items:center;margin-bottom:1rem;color:var(--text-color)}.dark-mode-toggle{background-color:transparent;border:none;color:var(--text-color);cursor:pointer;font-size:16px;padding:8px;transition:color .3s}.dark-mode-toggle:hover{color:var(--pass-color)}body{background-color:var(--background-color);color:var(--text-color)}
|