promptfoo 0.15.0 → 0.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +10 -7
- package/dist/package.json +2 -2
- package/dist/src/assertions.js +7 -7
- package/dist/src/assertions.js.map +1 -1
- package/dist/src/cache.d.ts +1 -0
- package/dist/src/cache.d.ts.map +1 -1
- package/dist/src/cache.js +8 -3
- package/dist/src/cache.js.map +1 -1
- package/dist/src/evaluator.d.ts.map +1 -1
- package/dist/src/evaluator.js +20 -5
- package/dist/src/evaluator.js.map +1 -1
- package/dist/src/main.js +12 -0
- package/dist/src/main.js.map +1 -1
- package/dist/src/prompts.js +2 -2
- package/dist/src/prompts.js.map +1 -1
- package/dist/src/providers/openai.d.ts.map +1 -1
- package/dist/src/providers/openai.js +9 -4
- package/dist/src/providers/openai.js.map +1 -1
- package/dist/src/providers/scriptCompletion.d.ts +9 -0
- package/dist/src/providers/scriptCompletion.d.ts.map +1 -0
- package/dist/src/providers/scriptCompletion.js +27 -0
- package/dist/src/providers/scriptCompletion.js.map +1 -0
- package/dist/src/providers.d.ts.map +1 -1
- package/dist/src/providers.js +7 -1
- package/dist/src/providers.js.map +1 -1
- package/dist/src/table.js +1 -1
- package/dist/src/table.js.map +1 -1
- package/dist/src/types.d.ts +5 -4
- package/dist/src/types.d.ts.map +1 -1
- package/dist/src/util.d.ts +1 -0
- package/dist/src/util.d.ts.map +1 -1
- package/dist/src/util.js +33 -23
- package/dist/src/util.js.map +1 -1
- package/dist/src/web/client/assets/{index-c3faa651.css → index-b82d0138.css} +1 -1
- package/dist/src/web/client/assets/{index-9d27a707.js → index-f22a629c.js} +26 -26
- package/dist/src/web/client/index.html +2 -2
- package/package.json +2 -2
- package/src/assertions.ts +10 -10
- package/src/cache.ts +8 -3
- package/src/evaluator.ts +29 -12
- package/src/main.ts +14 -1
- package/src/prompts.ts +2 -2
- package/src/providers/openai.ts +15 -6
- package/src/providers/scriptCompletion.ts +23 -0
- package/src/providers.ts +6 -1
- package/src/table.ts +1 -1
- package/src/types.ts +5 -4
- package/src/util.ts +35 -20
- package/src/web/client/package-lock.json +5726 -0
- package/src/web/client/src/EvalOutputPromptDialog.tsx +61 -0
- package/src/web/client/src/ResultsTable.css +10 -7
- package/src/web/client/src/ResultsTable.tsx +87 -37
- package/src/web/client/src/types.ts +8 -2
|
@@ -5,8 +5,8 @@
|
|
|
5
5
|
<link rel="icon" type="image/svg+xml" href="favicon.ico" />
|
|
6
6
|
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
|
7
7
|
<title>promptfoo web viewer</title>
|
|
8
|
-
<script type="module" crossorigin src="/assets/index-9d27a707.js"></script>
|
|
9
|
-
<link rel="stylesheet" href="/assets/index-c3faa651.css">
|
|
8
|
+
<script type="module" crossorigin src="/assets/index-f22a629c.js"></script>
|
|
9
|
+
<link rel="stylesheet" href="/assets/index-b82d0138.css">
|
|
10
10
|
</head>
|
|
11
11
|
<body>
|
|
12
12
|
<div id="root"></div>
|
package/package.json
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "promptfoo",
|
|
3
|
-
"description": "
|
|
3
|
+
"description": "LLM eval & testing toolkit",
|
|
4
4
|
"author": "Ian Webster",
|
|
5
|
-
"version": "0.15.0",
|
|
5
|
+
"version": "0.17.0",
|
|
6
6
|
"license": "MIT",
|
|
7
7
|
"type": "commonjs",
|
|
8
8
|
"main": "dist/src/index.js",
|
package/src/assertions.ts
CHANGED
|
@@ -4,7 +4,7 @@ import nunjucks from 'nunjucks';
|
|
|
4
4
|
|
|
5
5
|
import telemetry from './telemetry';
|
|
6
6
|
import { DefaultEmbeddingProvider, DefaultGradingProvider } from './providers/openai';
|
|
7
|
-
import { cosineSimilarity, fetchWithTimeout } from './util';
|
|
7
|
+
import { cosineSimilarity, fetchWithRetries } from './util';
|
|
8
8
|
import { loadApiProvider } from './providers';
|
|
9
9
|
import { DEFAULT_GRADING_PROMPT } from './prompts';
|
|
10
10
|
|
|
@@ -123,12 +123,12 @@ export async function runAssertion(
|
|
|
123
123
|
}
|
|
124
124
|
|
|
125
125
|
if (baseType === 'contains') {
|
|
126
|
-
invariant(assertion.value, '"contains" assertion type must have a string value');
|
|
126
|
+
invariant(assertion.value, '"contains" assertion type must have a string or number value');
|
|
127
127
|
invariant(
|
|
128
|
-
typeof assertion.value === 'string',
|
|
129
|
-
'"contains" assertion type must have a string value',
|
|
128
|
+
typeof assertion.value === 'string' || typeof assertion.value === 'number',
|
|
129
|
+
'"contains" assertion type must have a string or number value',
|
|
130
130
|
);
|
|
131
|
-
pass = output.includes(assertion.value) !== inverse;
|
|
131
|
+
pass = output.includes(String(assertion.value)) !== inverse;
|
|
132
132
|
return {
|
|
133
133
|
pass,
|
|
134
134
|
score: pass ? 1 : 0,
|
|
@@ -192,12 +192,12 @@ export async function runAssertion(
|
|
|
192
192
|
}
|
|
193
193
|
|
|
194
194
|
if (baseType === 'icontains') {
|
|
195
|
-
invariant(assertion.value, '"icontains" assertion type must have a string value');
|
|
195
|
+
invariant(assertion.value, '"icontains" assertion type must have a string or number value');
|
|
196
196
|
invariant(
|
|
197
|
-
typeof assertion.value === 'string',
|
|
198
|
-
'"icontains" assertion type must have a string value',
|
|
197
|
+
typeof assertion.value === 'string' || typeof assertion.value === 'number',
|
|
198
|
+
'"icontains" assertion type must have a string or number value',
|
|
199
199
|
);
|
|
200
|
-
pass = output.toLowerCase().includes(assertion.value.toLowerCase()) !== inverse;
|
|
200
|
+
pass = output.toLowerCase().includes(String(assertion.value).toLowerCase()) !== inverse;
|
|
201
201
|
return {
|
|
202
202
|
pass,
|
|
203
203
|
score: pass ? 1 : 0,
|
|
@@ -281,7 +281,7 @@ ${assertion.value}`,
|
|
|
281
281
|
const context = {
|
|
282
282
|
vars: test.vars || {},
|
|
283
283
|
};
|
|
284
|
-
const response = await fetchWithTimeout(
|
|
284
|
+
const response = await fetchWithRetries(
|
|
285
285
|
assertion.value,
|
|
286
286
|
{
|
|
287
287
|
method: 'POST',
|
package/src/cache.ts
CHANGED
|
@@ -5,7 +5,7 @@ import cacheManager from 'cache-manager';
|
|
|
5
5
|
import fsStore from 'cache-manager-fs-hash';
|
|
6
6
|
|
|
7
7
|
import logger from './logger';
|
|
8
|
-
import { getConfigDirectoryPath, fetchWithTimeout } from './util';
|
|
8
|
+
import { getConfigDirectoryPath, fetchWithRetries } from './util';
|
|
9
9
|
|
|
10
10
|
import type { Cache } from 'cache-manager';
|
|
11
11
|
import type { RequestInfo, RequestInit } from 'node-fetch';
|
|
@@ -48,7 +48,7 @@ export async function fetchJsonWithCache(
|
|
|
48
48
|
timeout: number,
|
|
49
49
|
): Promise<{ data: any; cached: boolean }> {
|
|
50
50
|
if (!enabled) {
|
|
51
|
-
const resp = await fetchWithTimeout(url, options, timeout);
|
|
51
|
+
const resp = await fetchWithRetries(url, options, timeout);
|
|
52
52
|
return {
|
|
53
53
|
cached: false,
|
|
54
54
|
data: await resp.json(),
|
|
@@ -73,7 +73,7 @@ export async function fetchJsonWithCache(
|
|
|
73
73
|
}
|
|
74
74
|
|
|
75
75
|
// Fetch the actual data and store it in the cache
|
|
76
|
-
const response = await fetchWithTimeout(url, options, timeout);
|
|
76
|
+
const response = await fetchWithRetries(url, options, timeout);
|
|
77
77
|
try {
|
|
78
78
|
const data = await response.json();
|
|
79
79
|
if (response.ok) {
|
|
@@ -97,3 +97,8 @@ export function disableCache() {
|
|
|
97
97
|
logger.info('Cache is disabled.');
|
|
98
98
|
enabled = false;
|
|
99
99
|
}
|
|
100
|
+
|
|
101
|
+
export async function clearCache() {
|
|
102
|
+
logger.info('Clearing cache...');
|
|
103
|
+
return getCache().reset();
|
|
104
|
+
}
|
package/src/evaluator.ts
CHANGED
|
@@ -38,18 +38,24 @@ interface RunEvalOptions {
|
|
|
38
38
|
const DEFAULT_MAX_CONCURRENCY = 4;
|
|
39
39
|
|
|
40
40
|
function generateVarCombinations(
|
|
41
|
-
vars: Record<string, string | string[]>,
|
|
42
|
-
): Record<string, string>[] {
|
|
41
|
+
vars: Record<string, string | string[] | any>,
|
|
42
|
+
): Record<string, string | any[]>[] {
|
|
43
43
|
const keys = Object.keys(vars);
|
|
44
|
-
const combinations: Record<string, string>[] = [{}];
|
|
44
|
+
const combinations: Record<string, string | any[]>[] = [{}];
|
|
45
45
|
|
|
46
46
|
for (const key of keys) {
|
|
47
|
-
|
|
48
|
-
|
|
47
|
+
let values: any[] = Array.isArray(vars[key]) ? vars[key] : [vars[key]];
|
|
48
|
+
|
|
49
|
+
// Check if it's an array but not a string array
|
|
50
|
+
if (Array.isArray(vars[key]) && typeof vars[key][0] !== 'string') {
|
|
51
|
+
values = [vars[key]];
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
const newCombinations: Record<string, any>[] = [];
|
|
49
55
|
|
|
50
56
|
for (const combination of combinations) {
|
|
51
57
|
for (const value of values) {
|
|
52
|
-
newCombinations.push({ ...combination, [key]: value
|
|
58
|
+
newCombinations.push({ ...combination, [key]: value });
|
|
53
59
|
}
|
|
54
60
|
}
|
|
55
61
|
|
|
@@ -229,10 +235,10 @@ class Evaluator {
|
|
|
229
235
|
});
|
|
230
236
|
|
|
231
237
|
const varNames: Set<string> = new Set();
|
|
232
|
-
const varsWithSpecialColsRemoved: Record<string, string | string[]>[] = [];
|
|
238
|
+
const varsWithSpecialColsRemoved: Record<string, string | string[] | object>[] = [];
|
|
233
239
|
for (const testCase of tests) {
|
|
234
240
|
if (testCase.vars) {
|
|
235
|
-
const varWithSpecialColsRemoved: Record<string, string | string[]> = {};
|
|
241
|
+
const varWithSpecialColsRemoved: Record<string, string | string[] | object> = {};
|
|
236
242
|
for (const varName of Object.keys(testCase.vars)) {
|
|
237
243
|
varNames.add(varName);
|
|
238
244
|
varWithSpecialColsRemoved[varName] = testCase.vars[varName];
|
|
@@ -287,7 +293,7 @@ class Evaluator {
|
|
|
287
293
|
|
|
288
294
|
const table: EvaluateTable = {
|
|
289
295
|
head: {
|
|
290
|
-
prompts
|
|
296
|
+
prompts,
|
|
291
297
|
vars: Array.from(varNames).sort(),
|
|
292
298
|
// TODO(ian): add assertions to table?
|
|
293
299
|
},
|
|
@@ -354,19 +360,30 @@ class Evaluator {
|
|
|
354
360
|
resultText = row.response?.output || row.error || '';
|
|
355
361
|
}
|
|
356
362
|
|
|
357
|
-
// TODO(ian): Provide full context in table cells, and have the caller
|
|
358
|
-
// construct the table contents itself.
|
|
359
363
|
const { rowIndex, colIndex } = options;
|
|
360
364
|
if (!table.body[rowIndex]) {
|
|
361
365
|
table.body[rowIndex] = {
|
|
362
366
|
outputs: [],
|
|
363
|
-
vars: table.head.vars
|
|
367
|
+
vars: table.head.vars
|
|
368
|
+
.map((varName) => {
|
|
369
|
+
const varValue = options.test.vars?.[varName] || '';
|
|
370
|
+
if (typeof varValue === 'string') {
|
|
371
|
+
return varValue;
|
|
372
|
+
}
|
|
373
|
+
if (Array.isArray(varValue)) {
|
|
374
|
+
// Only flatten string arrays
|
|
375
|
+
return typeof varValue[0] === 'string' ? varValue : JSON.stringify(varValue);
|
|
376
|
+
}
|
|
377
|
+
return JSON.stringify(varValue);
|
|
378
|
+
})
|
|
379
|
+
.flat(),
|
|
364
380
|
};
|
|
365
381
|
}
|
|
366
382
|
table.body[rowIndex].outputs[colIndex] = {
|
|
367
383
|
pass: row.success,
|
|
368
384
|
score: row.score,
|
|
369
385
|
text: resultText,
|
|
386
|
+
prompt: row.prompt.raw,
|
|
370
387
|
};
|
|
371
388
|
},
|
|
372
389
|
);
|
package/src/main.ts
CHANGED
|
@@ -19,7 +19,7 @@ import {
|
|
|
19
19
|
writeOutput,
|
|
20
20
|
} from './util';
|
|
21
21
|
import { DEFAULT_README, DEFAULT_YAML_CONFIG, DEFAULT_PROMPTS } from './onboarding';
|
|
22
|
-
import { disableCache } from './cache';
|
|
22
|
+
import { disableCache, clearCache } from './cache';
|
|
23
23
|
import { getDirectory } from './esm';
|
|
24
24
|
import { init } from './web/server';
|
|
25
25
|
import { checkForUpdates } from './updates';
|
|
@@ -139,6 +139,19 @@ async function main() {
|
|
|
139
139
|
logger.info(`View results: ${chalk.greenBright.bold(url)}`);
|
|
140
140
|
});
|
|
141
141
|
|
|
142
|
+
program
|
|
143
|
+
.command('cache')
|
|
144
|
+
.description('Manage cache')
|
|
145
|
+
.command('clear')
|
|
146
|
+
.description('Clear cache')
|
|
147
|
+
.action(async () => {
|
|
148
|
+
await clearCache();
|
|
149
|
+
telemetry.record('command_used', {
|
|
150
|
+
name: 'cache_clear',
|
|
151
|
+
});
|
|
152
|
+
await telemetry.send();
|
|
153
|
+
});
|
|
154
|
+
|
|
142
155
|
program
|
|
143
156
|
.command('eval')
|
|
144
157
|
.description('Evaluate prompts')
|
package/src/prompts.ts
CHANGED
|
@@ -30,10 +30,10 @@ Your output is going to be copied directly into the program. It should contain t
|
|
|
30
30
|
|
|
31
31
|
export const REFUSAL_PROMPT_SYSTEM_MESSAGE = {
|
|
32
32
|
role: 'system',
|
|
33
|
-
content: `In this task, you will be given a string of text. Your task is to analyze the text and determine if it contains a refusal or denial
|
|
33
|
+
content: `In this task, you will be given a string of text produced by a large language model. Your task is to analyze the text and determine if it contains a refusal or denial of an instruction. This could be in the form of phrases like 'As an AI language model, I can't...', 'I'm unable to...', 'Sorry, but I can't...', etc. If such a refusal or denial is present in the text, return 'True'. If no such refusal is found, return 'False'.`,
|
|
34
34
|
};
|
|
35
35
|
|
|
36
36
|
export const AI_SELF_REFERENCE_PROMPT_SYSTEM_MESSAGE = {
|
|
37
37
|
role: 'system',
|
|
38
|
-
content: `
|
|
38
|
+
content: `In this task, you will be given a string of text produced by a large language model. Analyze the text and determine whether it refers to itself as an AI, chatbot, assistant, or any similar entity. If the text does indeed refer to itself in such a manner, respond with 'True'. Otherwise, respond with 'False'.`,
|
|
39
39
|
};
|
package/src/providers/openai.ts
CHANGED
|
@@ -204,6 +204,8 @@ export class OpenAiChatCompletionProvider extends OpenAiGenericProvider {
|
|
|
204
204
|
'gpt-3.5-turbo',
|
|
205
205
|
'gpt-3.5-turbo-0301',
|
|
206
206
|
'gpt-3.5-turbo-0613',
|
|
207
|
+
'gpt-3.5-turbo-16k',
|
|
208
|
+
'gpt-3.5-turbo-16k-0613',
|
|
207
209
|
];
|
|
208
210
|
|
|
209
211
|
options: OpenAiCompletionOptions;
|
|
@@ -216,7 +218,6 @@ export class OpenAiChatCompletionProvider extends OpenAiGenericProvider {
|
|
|
216
218
|
this.options = context || {};
|
|
217
219
|
}
|
|
218
220
|
|
|
219
|
-
// TODO(ian): support passing in `messages` directly
|
|
220
221
|
async callApi(prompt: string, options?: OpenAiCompletionOptions): Promise<ProviderResponse> {
|
|
221
222
|
if (!this.apiKey) {
|
|
222
223
|
throw new Error(
|
|
@@ -224,12 +225,20 @@ export class OpenAiChatCompletionProvider extends OpenAiGenericProvider {
|
|
|
224
225
|
);
|
|
225
226
|
}
|
|
226
227
|
|
|
227
|
-
let messages: { role: string; content: string }[];
|
|
228
|
+
let messages: { role: string; content: string; name?: string }[];
|
|
228
229
|
try {
|
|
229
|
-
|
|
230
|
-
// string prompt into a `messages` array.
|
|
231
|
-
messages = JSON.parse(prompt);
|
|
230
|
+
messages = JSON.parse(prompt) as { role: string; content: string }[];
|
|
232
231
|
} catch (err) {
|
|
232
|
+
const trimmedPrompt = prompt.trim();
|
|
233
|
+
if (
|
|
234
|
+
process.env.PROMPTFOO_REQUIRE_JSON_PROMPTS ||
|
|
235
|
+
trimmedPrompt.startsWith('{') ||
|
|
236
|
+
trimmedPrompt.startsWith('[')
|
|
237
|
+
) {
|
|
238
|
+
throw new Error(
|
|
239
|
+
`OpenAI Chat Completion prompt is not a valid JSON string: ${err}\n\n${prompt}`,
|
|
240
|
+
);
|
|
241
|
+
}
|
|
233
242
|
messages = [{ role: 'user', content: prompt }];
|
|
234
243
|
}
|
|
235
244
|
|
|
@@ -292,4 +301,4 @@ export class OpenAiChatCompletionProvider extends OpenAiGenericProvider {
|
|
|
292
301
|
|
|
293
302
|
export const DefaultEmbeddingProvider = new OpenAiEmbeddingProvider('text-embedding-ada-002');
|
|
294
303
|
export const DefaultGradingProvider = new OpenAiChatCompletionProvider('gpt-4-0613');
|
|
295
|
-
export const DefaultSuggestionsProvider = new OpenAiChatCompletionProvider('gpt-4');
|
|
304
|
+
export const DefaultSuggestionsProvider = new OpenAiChatCompletionProvider('gpt-4-0613');
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import { exec } from 'child_process';
|
|
2
|
+
|
|
3
|
+
import { ApiProvider, ProviderConfig, ProviderResponse } from '../types';
|
|
4
|
+
|
|
5
|
+
export class ScriptCompletionProvider implements ApiProvider {
|
|
6
|
+
constructor(private scriptPath: string, private config?: ProviderConfig) {}
|
|
7
|
+
|
|
8
|
+
id() {
|
|
9
|
+
return 'script';
|
|
10
|
+
}
|
|
11
|
+
|
|
12
|
+
async callApi(prompt: string) {
|
|
13
|
+
return new Promise((resolve, reject) => {
|
|
14
|
+
exec(`${this.scriptPath} "${prompt}"`, (error, stdout, stderr) => {
|
|
15
|
+
if (error) {
|
|
16
|
+
reject(error);
|
|
17
|
+
} else {
|
|
18
|
+
resolve({ output: stdout.trim() });
|
|
19
|
+
}
|
|
20
|
+
});
|
|
21
|
+
}) as Promise<ProviderResponse>;
|
|
22
|
+
}
|
|
23
|
+
}
|
package/src/providers.ts
CHANGED
|
@@ -4,6 +4,7 @@ import { ApiProvider, ProviderConfig, ProviderId, RawProviderConfig } from './ty
|
|
|
4
4
|
|
|
5
5
|
import { OpenAiCompletionProvider, OpenAiChatCompletionProvider } from './providers/openai';
|
|
6
6
|
import { LocalAiCompletionProvider, LocalAiChatProvider } from './providers/localai';
|
|
7
|
+
import { ScriptCompletionProvider } from './providers/scriptCompletion';
|
|
7
8
|
|
|
8
9
|
export async function loadApiProviders(
|
|
9
10
|
providerPaths: ProviderId | ProviderId[] | RawProviderConfig[],
|
|
@@ -30,7 +31,11 @@ export async function loadApiProvider(
|
|
|
30
31
|
providerPath: string,
|
|
31
32
|
context: ProviderConfig | undefined = undefined,
|
|
32
33
|
): Promise<ApiProvider> {
|
|
33
|
-
if (providerPath?.startsWith('openai:')) {
|
|
34
|
+
if (providerPath?.startsWith('script:')) {
|
|
35
|
+
// Load script module
|
|
36
|
+
const scriptPath = providerPath.split(':')[1];
|
|
37
|
+
return new ScriptCompletionProvider(scriptPath, context?.config);
|
|
38
|
+
} else if (providerPath?.startsWith('openai:')) {
|
|
34
39
|
// Load OpenAI module
|
|
35
40
|
const options = providerPath.split(':');
|
|
36
41
|
const modelType = options[1];
|
package/src/table.ts
CHANGED
|
@@ -7,7 +7,7 @@ export function generateTable(summary: EvaluateSummary, tableCellMaxLength = 250
|
|
|
7
7
|
const head = summary.table.head;
|
|
8
8
|
const headLength = head.prompts.length + head.vars.length;
|
|
9
9
|
const table = new Table({
|
|
10
|
-
head: [...head.prompts, ...head.vars],
|
|
10
|
+
head: [...head.prompts.map((prompt) => prompt.display), ...head.vars],
|
|
11
11
|
colWidths: Array(headLength).fill(Math.floor(maxWidth / headLength)),
|
|
12
12
|
wordWrap: true,
|
|
13
13
|
wrapOnWordBoundary: false,
|
package/src/types.ts
CHANGED
|
@@ -84,7 +84,7 @@ export interface Prompt {
|
|
|
84
84
|
|
|
85
85
|
export interface EvaluateResult {
|
|
86
86
|
prompt: Prompt;
|
|
87
|
-
vars: Record<string, string>;
|
|
87
|
+
vars: Record<string, string | object>;
|
|
88
88
|
response?: ProviderResponse;
|
|
89
89
|
error?: string;
|
|
90
90
|
success: boolean;
|
|
@@ -95,11 +95,12 @@ export interface EvaluateTableOutput {
|
|
|
95
95
|
pass: boolean;
|
|
96
96
|
score: number;
|
|
97
97
|
text: string;
|
|
98
|
+
prompt: string;
|
|
98
99
|
}
|
|
99
100
|
|
|
100
101
|
export interface EvaluateTable {
|
|
101
102
|
head: {
|
|
102
|
-
prompts: string[];
|
|
103
|
+
prompts: Prompt[];
|
|
103
104
|
vars: string[];
|
|
104
105
|
};
|
|
105
106
|
|
|
@@ -174,7 +175,7 @@ export interface TestCase {
|
|
|
174
175
|
description?: string;
|
|
175
176
|
|
|
176
177
|
// Key-value pairs to substitute in the prompt
|
|
177
|
-
vars?: Record<string, string | string[]>;
|
|
178
|
+
vars?: Record<string, string | string[] | object>;
|
|
178
179
|
|
|
179
180
|
// Optional list of automatic checks to run on the LLM output
|
|
180
181
|
assert?: Assertion[];
|
|
@@ -185,7 +186,7 @@ export interface TestCase {
|
|
|
185
186
|
|
|
186
187
|
// Same as a TestCase, except the `vars` object has been flattened into its final form.
|
|
187
188
|
export interface AtomicTestCase extends TestCase {
|
|
188
|
-
vars?: Record<string, string>;
|
|
189
|
+
vars?: Record<string, string | object>;
|
|
189
190
|
}
|
|
190
191
|
|
|
191
192
|
// The test suite defines the "knobs" that we are tuning in prompt engineering: providers and prompts
|
package/src/util.ts
CHANGED
|
@@ -248,34 +248,47 @@ export function writeOutput(
|
|
|
248
248
|
}
|
|
249
249
|
}
|
|
250
250
|
|
|
251
|
-
export function fetchWithTimeout(
|
|
251
|
+
export async function fetchWithTimeout(
|
|
252
252
|
url: RequestInfo,
|
|
253
253
|
options: RequestInit = {},
|
|
254
254
|
timeout: number,
|
|
255
255
|
): Promise<Response> {
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
options.signal = signal;
|
|
256
|
+
const controller = new AbortController();
|
|
257
|
+
const { signal } = controller;
|
|
258
|
+
options.signal = signal;
|
|
260
259
|
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
260
|
+
const timeoutId = setTimeout(() => {
|
|
261
|
+
controller.abort();
|
|
262
|
+
throw new Error(`Request timed out after ${timeout} ms`);
|
|
263
|
+
}, timeout);
|
|
265
264
|
|
|
265
|
+
try {
|
|
266
|
+
const response = await fetch(url, options);
|
|
267
|
+
clearTimeout(timeoutId);
|
|
268
|
+
return response;
|
|
269
|
+
} catch (error) {
|
|
270
|
+
clearTimeout(timeoutId);
|
|
271
|
+
throw error;
|
|
272
|
+
}
|
|
273
|
+
}
|
|
274
|
+
|
|
275
|
+
export async function fetchWithRetries(
|
|
276
|
+
url: RequestInfo,
|
|
277
|
+
options: RequestInit = {},
|
|
278
|
+
timeout: number,
|
|
279
|
+
retries: number = 3,
|
|
280
|
+
): Promise<Response> {
|
|
281
|
+
let lastError;
|
|
282
|
+
for (let i = 0; i < retries; i++) {
|
|
266
283
|
try {
|
|
267
|
-
|
|
268
|
-
clearTimeout(timeoutId);
|
|
269
|
-
resolve(response);
|
|
284
|
+
return await fetchWithTimeout(url, options, timeout);
|
|
270
285
|
} catch (error) {
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
clearTimeout(timeoutId);
|
|
275
|
-
reject(error);
|
|
276
|
-
}
|
|
286
|
+
lastError = error;
|
|
287
|
+
const waitTime = Math.pow(2, i) * 1000; // Exponential backoff
|
|
288
|
+
await new Promise((resolve) => setTimeout(resolve, waitTime));
|
|
277
289
|
}
|
|
278
|
-
}
|
|
290
|
+
}
|
|
291
|
+
throw new Error(`Request failed after ${retries} retries: ${(lastError as Error).message}`);
|
|
279
292
|
}
|
|
280
293
|
|
|
281
294
|
export function getConfigDirectoryPath(): string {
|
|
@@ -334,7 +347,9 @@ export function testCaseFromCsvRow(row: CsvRow): TestCase {
|
|
|
334
347
|
const asserts: Assertion[] = [];
|
|
335
348
|
for (const [key, value] of Object.entries(row)) {
|
|
336
349
|
if (key === '__expected') {
|
|
337
|
-
|
|
350
|
+
if (value.trim() !== '') {
|
|
351
|
+
asserts.push(assertionFromString(value));
|
|
352
|
+
}
|
|
338
353
|
} else {
|
|
339
354
|
vars[key] = value;
|
|
340
355
|
}
|