promptfoo 0.6.0 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +137 -74
- package/dist/assertions.d.ts +4 -10
- package/dist/assertions.d.ts.map +1 -1
- package/dist/assertions.js +126 -20
- package/dist/assertions.js.map +1 -1
- package/dist/cache.d.ts +8 -0
- package/dist/cache.d.ts.map +1 -0
- package/dist/cache.js +78 -0
- package/dist/cache.js.map +1 -0
- package/dist/evaluator.d.ts +2 -2
- package/dist/evaluator.d.ts.map +1 -1
- package/dist/evaluator.js +73 -40
- package/dist/evaluator.js.map +1 -1
- package/dist/index.d.ts +6 -4
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +8 -21
- package/dist/index.js.map +1 -1
- package/dist/main.js +92 -80
- package/dist/main.js.map +1 -1
- package/dist/onboarding.d.ts +4 -0
- package/dist/onboarding.d.ts.map +1 -0
- package/dist/onboarding.js +63 -0
- package/dist/onboarding.js.map +1 -0
- package/dist/providers/localai.d.ts.map +1 -1
- package/dist/providers/localai.js +7 -9
- package/dist/providers/localai.js.map +1 -1
- package/dist/providers/openai.d.ts.map +1 -1
- package/dist/providers/openai.js +31 -38
- package/dist/providers/openai.js.map +1 -1
- package/dist/providers.d.ts +1 -0
- package/dist/providers.d.ts.map +1 -1
- package/dist/providers.js +11 -1
- package/dist/providers.js.map +1 -1
- package/dist/types.d.ts +46 -13
- package/dist/types.d.ts.map +1 -1
- package/dist/util.d.ts +6 -3
- package/dist/util.d.ts.map +1 -1
- package/dist/util.js +73 -2
- package/dist/util.js.map +1 -1
- package/dist/web/server.d.ts.map +1 -1
- package/dist/web/server.js +0 -11
- package/dist/web/server.js.map +1 -1
- package/package.json +6 -2
- package/src/assertions.ts +141 -28
- package/src/cache.ts +90 -0
- package/src/evaluator.ts +89 -43
- package/src/index.ts +14 -26
- package/src/main.ts +117 -99
- package/src/onboarding.ts +61 -0
- package/src/providers/localai.ts +9 -11
- package/src/providers/openai.ts +34 -42
- package/src/providers.ts +9 -0
- package/src/types.ts +95 -16
- package/src/util.ts +90 -4
- package/src/web/server.ts +0 -18
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
/**
 * Example prompt file contents (presumably written to `prompts.txt` during
 * onboarding — the sample config and README in this module refer to that file).
 *
 * The text holds four sample prompts. Consecutive prompts are separated by a
 * line containing only `---`, and the whole text ends with a newline.
 */
export const DEFAULT_PROMPTS =
  [
    'Your first prompt goes here',
    'Next prompt goes here. You can substitute variables like this: {{var1}} {{var2}} {{var3}}',
    [
      'This is the next prompt.',
      '',
      'These prompts are nunjucks templates, so you can use logic like this:',
      '{% if var1 %}',
      '{{ var1 }}',
      '{% endif %}',
    ].join('\n'),
    'If you prefer, you can break prompts into multiple files (make sure to edit promptfooconfig.yaml accordingly)',
  ].join('\n---\n') + '\n';
|
|
14
|
+
|
|
15
|
+
/**
 * Example YAML configuration (the README in this module calls the file
 * `promptfooconfig.yaml`).
 *
 * It declares one prompt file, one provider and three sample test cases.
 * Every `type:` used under `assert:` must be a member of the `Assertion`
 * type union (`equals`, `is-json`, `contains-json`, `javascript`, `similar`,
 * `llm-rubric`); the sample previously used `equality`, `function` and
 * `similarity`, which are not valid assertion types.
 */
export const DEFAULT_YAML_CONFIG = `# This configuration runs each prompt through a series of example inputs and checks if they meet requirements.

prompts: [prompts.txt]
providers: [openai:gpt-3.5-turbo]
tests:
  - description: First test case - automatic review
    vars:
      var1: first variable's value
      var2: another value
      var3: some other value
    assert:
      - type: equals
        value: expected LLM output goes here
      - type: javascript
        value: output.includes('some text')

  - description: Second test case - manual review
    # Test cases don't need assertions if you prefer to manually review the output
    vars:
      var1: new value
      var2: another value
      var3: third value

  - description: Third test case - other types of automatic review
    vars:
      var1: yet another value
      var2: and another
      var3: dear llm, please output your response in json format
    assert:
      - type: contains-json
      - type: similar
        value: ensures that output is semantically similar to this text
      - type: llm-rubric
        value: ensure that output contains a reference to X
`;
|
|
50
|
+
|
|
51
|
+
/**
 * Getting-started text for a new project: set the API key, edit the prompt
 * and config files, run `promptfoo eval`, then `promptfoo view`.
 */
export const DEFAULT_README = [
  'To get started, set your OPENAI_API_KEY environment variable.',
  '',
  'Next, change a few of the prompts in prompts.txt and edit promptfooconfig.yaml.',
  '',
  'Then run:',
  '```',
  'promptfoo eval',
  '```',
  '',
  'Afterwards, you can view the results by running `promptfoo view`',
  '', // keeps the trailing newline
].join('\n');
|
package/src/providers/localai.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import logger from '../logger.js';
|
|
2
|
-
import {
|
|
2
|
+
import { fetchJsonWithCache } from '../cache.js';
|
|
3
3
|
import { REQUEST_TIMEOUT_MS } from './shared.js';
|
|
4
4
|
|
|
5
5
|
import type { ApiProvider, ProviderResponse } from '../types.js';
|
|
@@ -36,9 +36,10 @@ export class LocalAiChatProvider extends LocalAiGenericProvider {
|
|
|
36
36
|
};
|
|
37
37
|
logger.debug(`Calling LocalAI API: ${JSON.stringify(body)}`);
|
|
38
38
|
|
|
39
|
-
let
|
|
39
|
+
let data,
|
|
40
|
+
cached = false;
|
|
40
41
|
try {
|
|
41
|
-
|
|
42
|
+
({ data, cached } = (await fetchJsonWithCache(
|
|
42
43
|
`${this.apiBaseUrl}/chat/completions`,
|
|
43
44
|
{
|
|
44
45
|
method: 'POST',
|
|
@@ -48,9 +49,7 @@ export class LocalAiChatProvider extends LocalAiGenericProvider {
|
|
|
48
49
|
body: JSON.stringify(body),
|
|
49
50
|
},
|
|
50
51
|
REQUEST_TIMEOUT_MS,
|
|
51
|
-
);
|
|
52
|
-
|
|
53
|
-
data = (await response.json()) as unknown as any;
|
|
52
|
+
)) as unknown as any);
|
|
54
53
|
} catch (err) {
|
|
55
54
|
return {
|
|
56
55
|
error: `API call error: ${String(err)}`,
|
|
@@ -78,9 +77,10 @@ export class LocalAiCompletionProvider extends LocalAiGenericProvider {
|
|
|
78
77
|
};
|
|
79
78
|
logger.debug(`Calling LocalAI API: ${JSON.stringify(body)}`);
|
|
80
79
|
|
|
81
|
-
let
|
|
80
|
+
let data,
|
|
81
|
+
cached = false;
|
|
82
82
|
try {
|
|
83
|
-
|
|
83
|
+
({ data, cached } = (await fetchJsonWithCache(
|
|
84
84
|
`${this.apiBaseUrl}/completions`,
|
|
85
85
|
{
|
|
86
86
|
method: 'POST',
|
|
@@ -90,9 +90,7 @@ export class LocalAiCompletionProvider extends LocalAiGenericProvider {
|
|
|
90
90
|
body: JSON.stringify(body),
|
|
91
91
|
},
|
|
92
92
|
REQUEST_TIMEOUT_MS,
|
|
93
|
-
);
|
|
94
|
-
|
|
95
|
-
data = (await response.json()) as unknown as any;
|
|
93
|
+
)) as unknown as any);
|
|
96
94
|
} catch (err) {
|
|
97
95
|
return {
|
|
98
96
|
error: `API call error: ${String(err)}`,
|
package/src/providers/openai.ts
CHANGED
|
@@ -1,17 +1,11 @@
|
|
|
1
|
-
import { LRUCache } from 'lru-cache';
|
|
2
|
-
|
|
3
1
|
import logger from '../logger.js';
|
|
4
|
-
import {
|
|
2
|
+
import { fetchJsonWithCache } from '../cache.js';
|
|
5
3
|
import { REQUEST_TIMEOUT_MS } from './shared.js';
|
|
6
4
|
|
|
7
5
|
import type { ApiProvider, ProviderEmbeddingResponse, ProviderResponse } from '../types.js';
|
|
8
6
|
|
|
9
7
|
const DEFAULT_OPENAI_HOST = 'api.openai.com';
|
|
10
8
|
|
|
11
|
-
const embeddingsCache = new LRUCache<string, ProviderEmbeddingResponse>({
|
|
12
|
-
max: 1000,
|
|
13
|
-
});
|
|
14
|
-
|
|
15
9
|
interface OpenAiCompletionOptions {
|
|
16
10
|
temperature: number;
|
|
17
11
|
}
|
|
@@ -49,19 +43,14 @@ export class OpenAiEmbeddingProvider extends OpenAiGenericProvider {
|
|
|
49
43
|
throw new Error('OpenAI API key must be set for similarity comparison');
|
|
50
44
|
}
|
|
51
45
|
|
|
52
|
-
// TODO(ian): Improve cache
|
|
53
|
-
const cached = embeddingsCache.get(text);
|
|
54
|
-
if (cached) {
|
|
55
|
-
return cached;
|
|
56
|
-
}
|
|
57
|
-
|
|
58
46
|
const body = {
|
|
59
47
|
input: text,
|
|
60
48
|
model: this.modelName,
|
|
61
49
|
};
|
|
62
|
-
let
|
|
50
|
+
let data,
|
|
51
|
+
cached = false;
|
|
63
52
|
try {
|
|
64
|
-
|
|
53
|
+
({ data, cached } = (await fetchJsonWithCache(
|
|
65
54
|
`https://${this.apiHost}/v1/embeddings`,
|
|
66
55
|
{
|
|
67
56
|
method: 'POST',
|
|
@@ -72,8 +61,7 @@ export class OpenAiEmbeddingProvider extends OpenAiGenericProvider {
|
|
|
72
61
|
body: JSON.stringify(body),
|
|
73
62
|
},
|
|
74
63
|
REQUEST_TIMEOUT_MS,
|
|
75
|
-
);
|
|
76
|
-
data = (await response.json()) as unknown as any;
|
|
64
|
+
)) as unknown as any);
|
|
77
65
|
} catch (err) {
|
|
78
66
|
return {
|
|
79
67
|
error: `API call error: ${String(err)}`,
|
|
@@ -93,13 +81,14 @@ export class OpenAiEmbeddingProvider extends OpenAiGenericProvider {
|
|
|
93
81
|
}
|
|
94
82
|
const ret = {
|
|
95
83
|
embedding,
|
|
96
|
-
tokenUsage:
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
84
|
+
tokenUsage: cached
|
|
85
|
+
? { cached: data.usage.total_tokens }
|
|
86
|
+
: {
|
|
87
|
+
total: data.usage.total_tokens,
|
|
88
|
+
prompt: data.usage.prompt_tokens,
|
|
89
|
+
completion: data.usage.completion_tokens,
|
|
90
|
+
},
|
|
101
91
|
};
|
|
102
|
-
embeddingsCache.set(text, ret);
|
|
103
92
|
return ret;
|
|
104
93
|
} catch (err) {
|
|
105
94
|
return {
|
|
@@ -145,9 +134,10 @@ export class OpenAiCompletionProvider extends OpenAiGenericProvider {
|
|
|
145
134
|
stop: process.env.OPENAI_STOP ? JSON.parse(process.env.OPENAI_STOP) : undefined,
|
|
146
135
|
};
|
|
147
136
|
logger.debug(`Calling OpenAI API: ${JSON.stringify(body)}`);
|
|
148
|
-
let
|
|
137
|
+
let data,
|
|
138
|
+
cached = false;
|
|
149
139
|
try {
|
|
150
|
-
|
|
140
|
+
({ data, cached } = (await fetchJsonWithCache(
|
|
151
141
|
`https://${this.apiHost}/v1/completions`,
|
|
152
142
|
{
|
|
153
143
|
method: 'POST',
|
|
@@ -158,9 +148,7 @@ export class OpenAiCompletionProvider extends OpenAiGenericProvider {
|
|
|
158
148
|
body: JSON.stringify(body),
|
|
159
149
|
},
|
|
160
150
|
REQUEST_TIMEOUT_MS,
|
|
161
|
-
);
|
|
162
|
-
|
|
163
|
-
data = (await response.json()) as unknown as any;
|
|
151
|
+
)) as unknown as any);
|
|
164
152
|
} catch (err) {
|
|
165
153
|
return {
|
|
166
154
|
error: `API call error: ${String(err)}`,
|
|
@@ -170,11 +158,13 @@ export class OpenAiCompletionProvider extends OpenAiGenericProvider {
|
|
|
170
158
|
try {
|
|
171
159
|
return {
|
|
172
160
|
output: data.choices[0].text,
|
|
173
|
-
tokenUsage:
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
161
|
+
tokenUsage: cached
|
|
162
|
+
? { cached: data.usage.total_tokens }
|
|
163
|
+
: {
|
|
164
|
+
total: data.usage.total_tokens,
|
|
165
|
+
prompt: data.usage.prompt_tokens,
|
|
166
|
+
completion: data.usage.completion_tokens,
|
|
167
|
+
},
|
|
178
168
|
};
|
|
179
169
|
} catch (err) {
|
|
180
170
|
return {
|
|
@@ -225,9 +215,10 @@ export class OpenAiChatCompletionProvider extends OpenAiGenericProvider {
|
|
|
225
215
|
};
|
|
226
216
|
logger.debug(`Calling OpenAI API: ${JSON.stringify(body)}`);
|
|
227
217
|
|
|
228
|
-
let
|
|
218
|
+
let data,
|
|
219
|
+
cached = false;
|
|
229
220
|
try {
|
|
230
|
-
|
|
221
|
+
({ data, cached } = (await fetchJsonWithCache(
|
|
231
222
|
`https://${this.apiHost}/v1/chat/completions`,
|
|
232
223
|
{
|
|
233
224
|
method: 'POST',
|
|
@@ -238,8 +229,7 @@ export class OpenAiChatCompletionProvider extends OpenAiGenericProvider {
|
|
|
238
229
|
body: JSON.stringify(body),
|
|
239
230
|
},
|
|
240
231
|
REQUEST_TIMEOUT_MS,
|
|
241
|
-
);
|
|
242
|
-
data = (await response.json()) as unknown as any;
|
|
232
|
+
)) as unknown as any);
|
|
243
233
|
} catch (err) {
|
|
244
234
|
return {
|
|
245
235
|
error: `API call error: ${String(err)}`,
|
|
@@ -250,11 +240,13 @@ export class OpenAiChatCompletionProvider extends OpenAiGenericProvider {
|
|
|
250
240
|
try {
|
|
251
241
|
return {
|
|
252
242
|
output: data.choices[0].message.content,
|
|
253
|
-
tokenUsage:
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
243
|
+
tokenUsage: cached
|
|
244
|
+
? { cached: data.usage.total_tokens }
|
|
245
|
+
: {
|
|
246
|
+
total: data.usage.total_tokens,
|
|
247
|
+
prompt: data.usage.prompt_tokens,
|
|
248
|
+
completion: data.usage.completion_tokens,
|
|
249
|
+
},
|
|
258
250
|
};
|
|
259
251
|
} catch (err) {
|
|
260
252
|
return {
|
package/src/providers.ts
CHANGED
|
@@ -5,6 +5,15 @@ import { ApiProvider } from './types.js';
|
|
|
5
5
|
import { OpenAiCompletionProvider, OpenAiChatCompletionProvider } from './providers/openai.js';
|
|
6
6
|
import { LocalAiCompletionProvider, LocalAiChatProvider } from './providers/localai.js';
|
|
7
7
|
|
|
8
|
+
/**
 * Loads one or more API providers.
 *
 * @param providerPaths - A single provider path or a list of them; each one
 *   is passed to `loadApiProvider`.
 * @returns The loaded providers, in the same order as the input.
 * @throws Error('Invalid providers list') when the argument is neither a
 *   string nor an array.
 */
export async function loadApiProviders(providerPaths: string | string[]): Promise<ApiProvider[]> {
  // Treat a lone path as a one-element list so both shapes share one code path.
  const pathList = typeof providerPaths === 'string' ? [providerPaths] : providerPaths;
  if (!Array.isArray(pathList)) {
    throw new Error('Invalid providers list');
  }
  return Promise.all(pathList.map((providerPath) => loadApiProvider(providerPath)));
}
|
|
16
|
+
|
|
8
17
|
export async function loadApiProvider(providerPath: string): Promise<ApiProvider> {
|
|
9
18
|
if (providerPath?.startsWith('openai:')) {
|
|
10
19
|
// Load OpenAI module
|
package/src/types.ts
CHANGED
|
@@ -1,15 +1,21 @@
|
|
|
1
1
|
export interface CommandLineOptions {
|
|
2
|
+
// Shared with TestSuite
|
|
2
3
|
prompts: string[];
|
|
3
4
|
providers: string[];
|
|
4
|
-
output
|
|
5
|
+
output: string;
|
|
6
|
+
|
|
7
|
+
// Shared with EvaluateOptions
|
|
8
|
+
maxConcurrency: string;
|
|
9
|
+
|
|
10
|
+
// Command line only
|
|
5
11
|
vars?: string;
|
|
6
12
|
config?: string;
|
|
7
13
|
verbose?: boolean;
|
|
8
|
-
maxConcurrency?: string;
|
|
9
14
|
grader?: string;
|
|
10
15
|
view?: string;
|
|
11
|
-
noWrite?: boolean;
|
|
12
16
|
tableCellMaxLength?: string;
|
|
17
|
+
write?: boolean;
|
|
18
|
+
cache?: boolean;
|
|
13
19
|
|
|
14
20
|
generateSuggestions?: boolean;
|
|
15
21
|
promptPrefix?: string;
|
|
@@ -25,18 +31,19 @@ export interface TokenUsage {
|
|
|
25
31
|
total: number;
|
|
26
32
|
prompt: number;
|
|
27
33
|
completion: number;
|
|
34
|
+
cached?: number;
|
|
28
35
|
}
|
|
29
36
|
|
|
30
37
|
export interface ProviderResponse {
|
|
31
38
|
error?: string;
|
|
32
39
|
output?: string;
|
|
33
|
-
tokenUsage?: TokenUsage
|
|
40
|
+
tokenUsage?: Partial<TokenUsage>;
|
|
34
41
|
}
|
|
35
42
|
|
|
36
43
|
export interface ProviderEmbeddingResponse {
|
|
37
44
|
error?: string;
|
|
38
45
|
embedding?: number[];
|
|
39
|
-
tokenUsage?: TokenUsage
|
|
46
|
+
tokenUsage?: Partial<TokenUsage>;
|
|
40
47
|
}
|
|
41
48
|
|
|
42
49
|
export interface CsvRow {
|
|
@@ -46,27 +53,19 @@ export interface CsvRow {
|
|
|
46
53
|
export type VarMapping = Record<string, string>;
|
|
47
54
|
|
|
48
55
|
export interface GradingConfig {
|
|
49
|
-
|
|
56
|
+
rubricPrompt?: string;
|
|
50
57
|
provider?: string | ApiProvider;
|
|
51
58
|
}
|
|
52
59
|
|
|
53
60
|
export interface PromptConfig {
|
|
54
61
|
prefix?: string;
|
|
55
62
|
suffix?: string;
|
|
56
|
-
generateSuggestions?: boolean;
|
|
57
63
|
}
|
|
58
64
|
|
|
59
65
|
export interface EvaluateOptions {
|
|
60
|
-
providers: ApiProvider[];
|
|
61
|
-
prompts: string[];
|
|
62
|
-
vars?: VarMapping[];
|
|
63
|
-
|
|
64
66
|
maxConcurrency?: number;
|
|
65
67
|
showProgressBar?: boolean;
|
|
66
|
-
|
|
67
|
-
grading?: GradingConfig;
|
|
68
|
-
|
|
69
|
-
prompt?: PromptConfig;
|
|
68
|
+
generateSuggestions?: boolean;
|
|
70
69
|
}
|
|
71
70
|
|
|
72
71
|
export interface Prompt {
|
|
@@ -97,7 +96,7 @@ export interface EvaluateTable {
|
|
|
97
96
|
export interface EvaluateStats {
|
|
98
97
|
successes: number;
|
|
99
98
|
failures: number;
|
|
100
|
-
tokenUsage: TokenUsage
|
|
99
|
+
tokenUsage: Required<TokenUsage>;
|
|
101
100
|
}
|
|
102
101
|
|
|
103
102
|
export interface EvaluateSummary {
|
|
@@ -106,3 +105,83 @@ export interface EvaluateSummary {
|
|
|
106
105
|
table: EvaluateTable;
|
|
107
106
|
stats: EvaluateStats;
|
|
108
107
|
}
|
|
108
|
+
|
|
109
|
+
/** Result of grading one output. */
export interface GradingResult {
  /** True when the output passed. */
  pass: boolean;

  /** Explanation accompanying the verdict. */
  reason: string;

  /** Token usage attributed to grading, when reported. */
  tokensUsed?: TokenUsage;
}
|
|
114
|
+
|
|
115
|
+
// TODO(ian): maybe Assertion should support {type: config} to make the yaml cleaner
/** One automatic check to run against an LLM output. */
export interface Assertion {
  /** Type of assertion. */
  type:
    | 'equals'
    | 'is-json'
    | 'contains-json'
    | 'javascript'
    | 'similar'
    | 'llm-rubric';

  /** The expected value, if applicable. */
  value?: string;

  /**
   * The threshold value, only applicable for similarity.
   * NOTE(review): the original comment calls this "cosine distance" — confirm
   * whether the comparison is made on distance or on similarity.
   */
  threshold?: number;

  /** Some assertions (similarity, llm-rubric) require an LLM provider. */
  provider?: ApiProvider;
}
|
|
129
|
+
|
|
130
|
+
/**
 * A test case is graded pass/fail. It represents a unique input to the LLM
 * after substituting `vars` in the prompt.
 */
export interface TestCase {
  /** Optional description of what you're testing. */
  description?: string;

  /** Key-value pairs to substitute in the prompt. */
  vars?: Record<string, string>;

  /** Optional list of automatic checks to run on the LLM output. */
  assert?: Assertion[];

  /** Additional configuration settings for the prompt. */
  options?: PromptConfig & GradingConfig;
}
|
|
144
|
+
|
|
145
|
+
/**
 * The test suite defines the "knobs" that we are tuning in prompt
 * engineering: providers and prompts.
 */
export interface TestSuite {
  /** Optional description of what your LLM is trying to do. */
  description?: string;

  /** One or more LLM APIs to use. */
  providers: ApiProvider[];

  /** One or more prompt strings. */
  prompts: string[];

  /** Test cases. */
  tests?: TestCase[];

  /** Default test case config. */
  defaultTest?: Partial<TestCase>;
}
|
|
162
|
+
|
|
163
|
+
/**
 * TestSuiteConfig = TestSuite, but before everything is parsed and resolved.
 * Providers are just strings, prompts are filepaths, tests can be a filepath
 * or inline.
 */
export interface TestSuiteConfig {
  /** Optional description of what your LLM is trying to do. */
  description?: string;

  /**
   * One or more LLM APIs to use, for example: openai:gpt-3.5-turbo,
   * openai:gpt-4, localai:chat:vicuna.
   */
  providers: string | string[];

  /** One or more prompt files to load. */
  prompts: string | string[];

  /** Path to a test file, OR list of LLM prompt variations (aka "test case"). */
  tests: string | TestCase[];

  /**
   * Sets the default properties for each test case. Useful for setting an
   * assertion on all test cases, for example.
   */
  defaultTest?: Omit<TestCase, 'description'>;

  /** Path to write output. Writes to console/web viewer if not set. */
  outputPath?: string;
}
|
|
183
|
+
|
|
184
|
+
/**
 * A `TestSuiteConfig` combined with evaluation options and (a partial set of)
 * command-line options. `readConfig` returns this type.
 */
export type UnifiedConfig = TestSuiteConfig & {
  /** Options controlling how the evaluation runs. */
  evaluateOptions: EvaluateOptions;

  /** Any subset of the command-line options. */
  commandLineOptions: Partial<CommandLineOptions>;
};
|
package/src/util.ts
CHANGED
|
@@ -7,7 +7,6 @@ import yaml from 'js-yaml';
|
|
|
7
7
|
import nunjucks from 'nunjucks';
|
|
8
8
|
import { globSync } from 'glob';
|
|
9
9
|
import { parse as parsePath } from 'path';
|
|
10
|
-
import { CsvRow } from './types.js';
|
|
11
10
|
import { parse as parseCsv } from 'csv-parse/sync';
|
|
12
11
|
import { stringify } from 'csv-stringify/sync';
|
|
13
12
|
|
|
@@ -16,7 +15,16 @@ import { getDirectory } from './esm.js';
|
|
|
16
15
|
|
|
17
16
|
import type { RequestInfo, RequestInit, Response } from 'node-fetch';
|
|
18
17
|
|
|
19
|
-
import type {
|
|
18
|
+
import type {
|
|
19
|
+
Assertion,
|
|
20
|
+
CsvRow,
|
|
21
|
+
EvaluateSummary,
|
|
22
|
+
CommandLineOptions,
|
|
23
|
+
TestSuite,
|
|
24
|
+
UnifiedConfig,
|
|
25
|
+
TestCase,
|
|
26
|
+
} from './types.js';
|
|
27
|
+
import { assertionFromString } from './assertions.js';
|
|
20
28
|
|
|
21
29
|
const PROMPT_DELIMITER = '---';
|
|
22
30
|
|
|
@@ -28,7 +36,35 @@ function parseJson(json: string): any | undefined {
|
|
|
28
36
|
}
|
|
29
37
|
}
|
|
30
38
|
|
|
31
|
-
export function
|
|
39
|
+
/**
 * Best-effort variant of `readConfig`.
 *
 * @param configPath - Path of the configuration file to try.
 * @returns The parsed configuration, or `undefined` when `readConfig` throws
 *   for any reason (missing file, unsupported extension, parse error).
 */
export function maybeReadConfig(configPath: string): UnifiedConfig | undefined {
  try {
    return readConfig(configPath);
  } catch (err) {
    // Still best-effort, but record why the config was skipped so that a
    // malformed file is not ignored without a trace.
    logger.debug(`Could not read config from ${configPath}: ${String(err)}`);
    return undefined;
  }
}
|
|
46
|
+
|
|
47
|
+
/**
 * Reads a configuration file and returns its contents.
 *
 * The format is chosen from the file extension (case-insensitive):
 * `.json` is parsed with `JSON.parse`, `.yaml`/`.yml` with `yaml.load`, and
 * `.js` is loaded with `require`.
 *
 * @param configPath - Path to the configuration file.
 * @returns The loaded configuration. Only the top-level shape is checked
 *   (it must be an object for JSON/YAML); individual fields are not validated.
 * @throws Error if the file does not exist, has an unsupported extension, or
 *   (JSON/YAML) does not contain an object at the top level. Parser errors
 *   propagate unchanged.
 */
export function readConfig(configPath: string): UnifiedConfig {
  if (!fs.existsSync(configPath)) {
    throw new Error(`Config file not found: ${configPath}`);
  }

  const ext = path.parse(configPath).ext;
  let parsed: unknown;
  switch (ext.toLowerCase()) {
    case '.json':
      parsed = JSON.parse(fs.readFileSync(configPath, 'utf-8'));
      break;
    case '.js':
      // `require` resolves relative ids against this module (or node_modules),
      // not the working directory that `existsSync` checked above, so pass an
      // absolute path.
      return require(path.resolve(configPath)) as UnifiedConfig;
    case '.yaml':
    case '.yml':
      parsed = yaml.load(fs.readFileSync(configPath, 'utf-8'));
      break;
    default:
      throw new Error(`Unsupported configuration file format: ${ext}`);
  }

  // An empty YAML file loads as `undefined` and JSON may hold a bare scalar;
  // neither can be used as a config object.
  if (parsed === null || typeof parsed !== 'object') {
    throw new Error(`Config file must contain an object at the top level: ${configPath}`);
  }
  return parsed as UnifiedConfig;
}
|
|
64
|
+
|
|
65
|
+
export function readPrompts(promptPathsOrGlobs: string | string[]): string[] {
|
|
66
|
+
promptPathsOrGlobs =
|
|
67
|
+
typeof promptPathsOrGlobs === 'string' ? [promptPathsOrGlobs] : promptPathsOrGlobs;
|
|
32
68
|
const promptPaths = promptPathsOrGlobs.flatMap((pathOrGlob) => globSync(pathOrGlob));
|
|
33
69
|
let promptContents: string[] = [];
|
|
34
70
|
|
|
@@ -49,6 +85,9 @@ export function readPrompts(promptPathsOrGlobs: string[]): string[] {
|
|
|
49
85
|
if (promptContents.length === 1) {
|
|
50
86
|
promptContents = promptContents[0].split(PROMPT_DELIMITER).map((p) => p.trim());
|
|
51
87
|
}
|
|
88
|
+
if (promptContents.length === 0) {
|
|
89
|
+
throw new Error(`There are no prompts in ${promptPathsOrGlobs.join(', ')}`);
|
|
90
|
+
}
|
|
52
91
|
return promptContents;
|
|
53
92
|
}
|
|
54
93
|
|
|
@@ -67,6 +106,37 @@ export function readVars(varsPath: string): CsvRow[] {
|
|
|
67
106
|
return rows;
|
|
68
107
|
}
|
|
69
108
|
|
|
109
|
+
/**
 * Normalizes the `tests` setting into a list of test cases.
 *
 * @param tests - Either a file path (rows are loaded with `readVars` and each
 *   row becomes a test case described as `Row #N`), an inline list of test
 *   cases, or nothing.
 * @returns The test cases; an empty list when `tests` is falsy. An inline
 *   list is returned as-is after validation.
 * @throws Error when `tests` is neither a string nor an array, or when an
 *   inline entry has neither an `assert` nor a `vars` property.
 */
export function readTests(tests: string | TestCase[] | undefined): TestCase[] {
  if (!tests) {
    return [];
  }

  if (typeof tests === 'string') {
    // It's a filepath: load the rows and turn each into a test case.
    return readVars(tests).map((row, idx) => {
      const test = testCaseFromCsvRow(row);
      test.description = `Row #${idx + 1}`;
      return test;
    });
  }

  // Configs come from untyped YAML/JSON, so the declared type is not a
  // guarantee; fail with a readable message instead of "not iterable".
  if (!Array.isArray(tests)) {
    throw new Error(
      `"tests" must be a file path or a list of test cases. Instead got ${JSON.stringify(
        tests,
        null,
        2,
      )}`,
    );
  }

  // Some validation of the shape of tests
  for (const test of tests) {
    // Optional chaining lets a null entry (e.g. an empty YAML list item)
    // reach the descriptive error below rather than a TypeError.
    if (!test?.assert && !test?.vars) {
      throw new Error(
        `Test case must have either "assert" or "vars" property. Instead got ${JSON.stringify(
          test,
          null,
          2,
        )}`,
      );
    }
  }

  return tests;
}
|
|
139
|
+
|
|
70
140
|
export function writeOutput(outputPath: string, summary: EvaluateSummary): void {
|
|
71
141
|
const outputExtension = outputPath.split('.').pop()?.toLowerCase();
|
|
72
142
|
|
|
@@ -139,7 +209,6 @@ export function writeLatestResults(results: EvaluateSummary) {
|
|
|
139
209
|
try {
|
|
140
210
|
fs.mkdirSync(path.dirname(latestResultsPath), { recursive: true });
|
|
141
211
|
fs.writeFileSync(latestResultsPath, JSON.stringify(results, null, 2));
|
|
142
|
-
logger.info(`Wrote latest results to ${latestResultsPath}.`);
|
|
143
212
|
} catch (err) {
|
|
144
213
|
logger.error(`Failed to write latest results to ${latestResultsPath}:\n${err}`);
|
|
145
214
|
}
|
|
@@ -154,3 +223,20 @@ export function cosineSimilarity(vecA: number[], vecB: number[]) {
|
|
|
154
223
|
const vecBMagnitude = Math.sqrt(vecB.reduce((acc, val) => acc + val * val, 0));
|
|
155
224
|
return dotProduct / (vecAMagnitude * vecBMagnitude);
|
|
156
225
|
}
|
|
226
|
+
|
|
227
|
+
/**
 * Builds a test case from one row of tabular test data.
 *
 * @param row - Column name → cell value. The special `__expected` column is
 *   converted into an assertion via `assertionFromString`; every other
 *   column becomes a prompt variable.
 * @returns A test case with `vars` and `assert` populated. `assert` is an
 *   empty list when the row has no (non-blank) `__expected` value.
 */
export function testCaseFromCsvRow(row: CsvRow): TestCase {
  const vars: Record<string, string> = {};
  const asserts: Assertion[] = [];

  for (const [key, value] of Object.entries(row)) {
    if (key !== '__expected') {
      vars[key] = value;
      continue;
    }
    // A blank cell means "no expectation for this row"; without this check
    // it would be turned into an assertion on the empty string.
    if (typeof value === 'string' && value.trim() !== '') {
      asserts.push(assertionFromString(value));
    }
  }

  return {
    vars,
    assert: asserts,
  };
}
|
package/src/web/server.ts
CHANGED
|
@@ -32,24 +32,6 @@ export function init(port = 15500) {
|
|
|
32
32
|
},
|
|
33
33
|
});
|
|
34
34
|
|
|
35
|
-
interface EvaluateRequestBody {
|
|
36
|
-
provider: string;
|
|
37
|
-
options: {
|
|
38
|
-
prompts: string[];
|
|
39
|
-
vars: Record<string, string>[];
|
|
40
|
-
};
|
|
41
|
-
}
|
|
42
|
-
|
|
43
|
-
app.post('/evaluate', async (req: Request, res: Response) => {
|
|
44
|
-
try {
|
|
45
|
-
const { provider, options } = req.body as EvaluateRequestBody;
|
|
46
|
-
const summary = await promptfoo.evaluate(provider, options);
|
|
47
|
-
res.json(summary);
|
|
48
|
-
} catch (error) {
|
|
49
|
-
res.status(500).json({ message: 'Error evaluating prompts' });
|
|
50
|
-
}
|
|
51
|
-
});
|
|
52
|
-
|
|
53
35
|
const latestJsonPath = getLatestResultsPath();
|
|
54
36
|
const readLatestJson = () => {
|
|
55
37
|
const data = fs.readFileSync(latestJsonPath, 'utf8');
|