promptfoo 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +33 -14
- package/dist/evaluator.d.ts +1 -1
- package/dist/evaluator.d.ts.map +1 -1
- package/dist/evaluator.js +219 -148
- package/dist/evaluator.js.map +1 -1
- package/dist/main.js +22 -4
- package/dist/main.js.map +1 -1
- package/dist/prompts.d.ts +2 -0
- package/dist/prompts.d.ts.map +1 -0
- package/dist/prompts.js +21 -0
- package/dist/prompts.js.map +1 -0
- package/dist/providers.d.ts +1 -0
- package/dist/providers.d.ts.map +1 -1
- package/dist/providers.js +10 -5
- package/dist/providers.js.map +1 -1
- package/dist/tableOutput.html +5 -8
- package/dist/types.d.ts +13 -7
- package/dist/types.d.ts.map +1 -1
- package/dist/util.d.ts +2 -0
- package/dist/util.d.ts.map +1 -1
- package/dist/util.js +26 -0
- package/dist/util.js.map +1 -1
- package/package.json +1 -1
- package/src/evaluator.ts +278 -175
- package/src/main.ts +24 -4
- package/src/prompts.ts +20 -0
- package/src/providers.ts +32 -15
- package/src/tableOutput.html +5 -8
- package/src/types.ts +16 -6
- package/src/util.ts +33 -0
package/README.md
CHANGED
|
@@ -13,6 +13,15 @@ With promptfoo, you can:
|
|
|
13
13
|
- Use as a command line tool, or integrate into your workflow as a library
|
|
14
14
|
- Use OpenAI API models (built-in support), or integrate custom API providers for any LLM API
|
|
15
15
|
|
|
16
|
+
**» [View docs on website](https://promptfoo.dev/docs/intro) «**
|
|
17
|
+
|
|
18
|
+
promptfoo works by producing matrix views that allow you to quickly review prompt outputs across many inputs. The goal: tune prompts systematically across all relevant test cases, instead of testing prompts one-off.
|
|
19
|
+
|
|
20
|
+
Here's an example of a side-by-side comparison of multiple prompts and inputs. You can manually review outputs, or set up "expectations" that automatically flag bad outputs.
|
|
21
|
+
|
|
22
|
+

|
|
23
|
+
|
|
24
|
+
|
|
16
25
|
## Usage (command line)
|
|
17
26
|
|
|
18
27
|
To get started, run the following command:
|
|
@@ -32,15 +41,16 @@ npx promptfoo eval
|
|
|
32
41
|
If you're looking to customize your usage, you have the full set of parameters at your disposal:
|
|
33
42
|
|
|
34
43
|
```bash
|
|
35
|
-
npx promptfoo eval -p <prompt_paths...> -o <output_path> -r <providers> [-v <vars_path>] [-j <max_concurrency] [-c <config_path>]
|
|
44
|
+
npx promptfoo eval -p <prompt_paths...> -o <output_path> -r <providers> [-v <vars_path>] [-j <max_concurrency>] [-c <config_path>] [--grader <grading_provider>]
|
|
36
45
|
```
|
|
37
46
|
|
|
38
47
|
- `<prompt_paths...>`: Paths to prompt file(s)
|
|
39
48
|
- `<output_path>`: Path to output CSV, JSON, YAML, or HTML file. Defaults to terminal output
|
|
40
49
|
- `<providers>`: One or more of: `openai:<model_name>`, or filesystem path to custom API caller module
|
|
41
50
|
- `<vars_path>` (optional): Path to CSV, JSON, or YAML file with prompt variables
|
|
42
|
-
- `<max_concurrency>` (optional): Number of simultaneous API requests. Defaults to
|
|
51
|
+
- `<max_concurrency>` (optional): Number of simultaneous API requests. Defaults to 4
|
|
43
52
|
- `<config_path>` (optional): Path to configuration file
|
|
53
|
+
- `<grading_provider>` (optional): A provider that handles the grading process. Only needed if you are using [LLM grading](#expected-outputs)
|
|
44
54
|
|
|
45
55
|
### Examples
|
|
46
56
|
|
|
@@ -64,7 +74,9 @@ This command will evaluate the prompts in `prompts.txt`, substituing the variabl
|
|
|
64
74
|
|
|
65
75
|
Have a look at the setup and full output [here](https://github.com/typpo/promptfoo/tree/main/examples/assistant-cli).
|
|
66
76
|
|
|
67
|
-
You can
|
|
77
|
+
You can also output a nice [spreadsheet](https://docs.google.com/spreadsheets/d/1nanoj3_TniWrDl1Sj-qYqIMD6jwm5FBy15xPFdUTsmI/edit?usp=sharing), [JSON](https://github.com/typpo/promptfoo/blob/main/examples/simple-cli/output.json), YAML, or an HTML file:
|
|
78
|
+
|
|
79
|
+

|
|
68
80
|
|
|
69
81
|
#### Model quality
|
|
70
82
|
|
|
@@ -164,9 +176,9 @@ You can use [Nunjucks](https://mozilla.github.io/nunjucks/) templating syntax to
|
|
|
164
176
|
Example of a single prompt file with multiple prompts (`prompts.txt`):
|
|
165
177
|
|
|
166
178
|
```
|
|
167
|
-
Translate the following text to French: "{{text}}"
|
|
179
|
+
Translate the following text to French: "{{name}}: {{text}}"
|
|
168
180
|
---
|
|
169
|
-
Translate the following text to German: "{{text}}"
|
|
181
|
+
Translate the following text to German: "{{name}}: {{text}}"
|
|
170
182
|
```
|
|
171
183
|
|
|
172
184
|
Example of multiple prompt files:
|
|
@@ -174,13 +186,13 @@ Example of multiple prompt files:
|
|
|
174
186
|
- `prompt1.txt`:
|
|
175
187
|
|
|
176
188
|
```
|
|
177
|
-
Translate the following text to French: "{{text}}"
|
|
189
|
+
Translate the following text to French: "{{name}}: {{text}}"
|
|
178
190
|
```
|
|
179
191
|
|
|
180
192
|
- `prompt2.txt`:
|
|
181
193
|
|
|
182
194
|
```
|
|
183
|
-
Translate the following text to German: "{{text}}"
|
|
195
|
+
Translate the following text to German: "{{name}}: {{text}}"
|
|
184
196
|
```
|
|
185
197
|
|
|
186
198
|
### Vars File
|
|
@@ -192,24 +204,27 @@ Vars are substituted by [Nunjucks](https://mozilla.github.io/nunjucks/) templati
|
|
|
192
204
|
Example of a vars file (`vars.csv`):
|
|
193
205
|
|
|
194
206
|
```
|
|
195
|
-
text
|
|
196
|
-
"Hello, world!"
|
|
197
|
-
"Goodbye, everyone!"
|
|
207
|
+
"name","text"
|
|
208
|
+
"Bob","Hello, world!"
|
|
209
|
+
"Joe","Goodbye, everyone!"
|
|
198
210
|
```
|
|
199
211
|
|
|
200
212
|
Example of a vars file (`vars.json`):
|
|
201
213
|
|
|
202
214
|
```json
|
|
203
|
-
[
|
|
215
|
+
[
|
|
216
|
+
{ "name": "Bob", "text": "Hello, world!" },
|
|
217
|
+
{ "name": "Joe", "text": "Goodbye, everyone!" }
|
|
218
|
+
]
|
|
204
219
|
```
|
|
205
220
|
|
|
206
|
-
### Expected
|
|
221
|
+
### Expected Outputs
|
|
207
222
|
|
|
208
|
-
You can specify an expected value for each test case to evaluate the success or failure of the model's output. To do this, add a special field called `__expected` in the `vars` file. The `__expected` field supports
|
|
223
|
+
You can specify an expected value for each test case to evaluate the success or failure of the model's output. To do this, add a special field called `__expected` in the `vars` file. The `__expected` field supports these types of value comparisons:
|
|
209
224
|
|
|
210
225
|
1. If the expected value starts with `eval:`, it will evaluate the contents as a JavaScript expression inside a function defined like: `function(output) { return <eval> }`. The expression should evaluate to a boolean value, where `true` indicates success and `false` indicates failure.
|
|
211
226
|
|
|
212
|
-
2. If the expected value starts with `grade:`, it will
|
|
227
|
+
2. If the expected value starts with `grade:`, it will ask an LLM to evaluate whether the output meets the condition. For example: `grade: don't mention being an AI`. This option requires a provider name to be supplied to promptfoo via the `--grader` argument: `promptfoo eval --grader openai:gpt-4 ...`.
|
|
213
228
|
|
|
214
229
|
3. Otherwise, it attempts an exact string match comparison between the expected value and the model's output.
|
|
215
230
|
|
|
@@ -219,6 +234,7 @@ Example of a vars file with the `__expected` field (`vars.csv`):
|
|
|
219
234
|
text,__expected
|
|
220
235
|
"Hello, world!","Bonjour le monde"
|
|
221
236
|
"Goodbye, everyone!","eval:output.includes('Au revoir');"
|
|
237
|
+
"I am a pineapple","grade:doesn't reference any fruits besides pineapple"
|
|
222
238
|
```
|
|
223
239
|
|
|
224
240
|
Example of a vars file with the `__expected` field (`vars.json`):
|
|
@@ -227,6 +243,7 @@ Example of a vars file with the `__expected` field (`vars.json`):
|
|
|
227
243
|
[
|
|
228
244
|
{ "text": "Hello, world!", "__expected": "Bonjour le monde" },
|
|
229
245
|
{ "text": "Goodbye, everyone!", "__expected": "eval:output.includes('Au revoir');" }
|
|
246
|
+
{ "text": "I am a pineapple", "__expected": "grade:doesn't reference any fruits besides pineapple" }
|
|
230
247
|
]
|
|
231
248
|
```
|
|
232
249
|
|
|
@@ -297,6 +314,8 @@ Other OpenAI-related environment variables are supported:
|
|
|
297
314
|
|
|
298
315
|
- `OPENAI_TEMPERATURE` - temperature model parameter, defaults to 0
|
|
299
316
|
- `OPENAI_MAX_TOKENS` - max_tokens model parameter, defaults to 1024
|
|
317
|
+
- `OPENAI_API_HOST` - override the hostname for the API request. Useful for proxies like Helicone.
|
|
318
|
+
- `REQUEST_TIMEOUT_MS` - maximum request time, in milliseconds (defaults to 10000)
|
|
300
319
|
|
|
301
320
|
The OpenAI provider supports the following model formats:
|
|
302
321
|
|
package/dist/evaluator.d.ts
CHANGED
package/dist/evaluator.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"evaluator.d.ts","sourceRoot":"","sources":["../src/evaluator.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"evaluator.d.ts","sourceRoot":"","sources":["../src/evaluator.ts"],"names":[],"mappings":"AAMA,OAAO,KAAK,EAEV,eAAe,EAGf,eAAe,EAGhB,MAAM,YAAY,CAAC;AA2SpB,wBAAgB,QAAQ,CAAC,OAAO,EAAE,eAAe,4BAGhD"}
|
package/dist/evaluator.js
CHANGED
|
@@ -1,175 +1,246 @@
|
|
|
1
1
|
import async from 'async';
|
|
2
2
|
import nunjucks from 'nunjucks';
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
3
|
+
import { DEFAULT_GRADING_PROMPT } from './prompts.js';
|
|
4
|
+
const DEFAULT_MAX_CONCURRENCY = 4;
|
|
5
|
+
class Evaluator {
|
|
6
|
+
constructor(options) {
|
|
7
|
+
this.options = options;
|
|
8
|
+
this.stats = {
|
|
9
|
+
successes: 0,
|
|
10
|
+
failures: 0,
|
|
11
|
+
tokenUsage: {
|
|
12
|
+
total: 0,
|
|
13
|
+
prompt: 0,
|
|
14
|
+
completion: 0,
|
|
15
|
+
},
|
|
16
|
+
};
|
|
13
17
|
}
|
|
14
|
-
|
|
15
|
-
|
|
18
|
+
async gradeOutput(expected, output) {
|
|
19
|
+
const { grading } = this.options;
|
|
20
|
+
if (!grading) {
|
|
21
|
+
throw new Error('Cannot grade output without grading config. Specify --grader option or grading config.');
|
|
22
|
+
}
|
|
23
|
+
const prompt = nunjucks.renderString(grading.prompt || DEFAULT_GRADING_PROMPT, {
|
|
24
|
+
content: output,
|
|
25
|
+
rubric: expected,
|
|
26
|
+
});
|
|
27
|
+
const resp = await grading.provider.callApi(prompt);
|
|
28
|
+
if (resp.error || !resp.output) {
|
|
29
|
+
return {
|
|
30
|
+
pass: false,
|
|
31
|
+
reason: resp.error || 'No output',
|
|
32
|
+
tokensUsed: {
|
|
33
|
+
total: resp.tokenUsage?.total || 0,
|
|
34
|
+
prompt: resp.tokenUsage?.prompt || 0,
|
|
35
|
+
completion: resp.tokenUsage?.completion || 0,
|
|
36
|
+
},
|
|
37
|
+
};
|
|
38
|
+
}
|
|
39
|
+
try {
|
|
40
|
+
const parsed = JSON.parse(resp.output);
|
|
41
|
+
parsed.tokensUsed = {
|
|
42
|
+
total: resp.tokenUsage?.total || 0,
|
|
43
|
+
prompt: resp.tokenUsage?.prompt || 0,
|
|
44
|
+
completion: resp.tokenUsage?.completion || 0,
|
|
45
|
+
};
|
|
46
|
+
return parsed;
|
|
47
|
+
}
|
|
48
|
+
catch (err) {
|
|
49
|
+
return {
|
|
50
|
+
pass: false,
|
|
51
|
+
reason: `Output is not valid JSON: ${resp.output}`,
|
|
52
|
+
tokensUsed: {
|
|
53
|
+
total: resp.tokenUsage?.total || 0,
|
|
54
|
+
prompt: resp.tokenUsage?.prompt || 0,
|
|
55
|
+
completion: resp.tokenUsage?.completion || 0,
|
|
56
|
+
},
|
|
57
|
+
};
|
|
58
|
+
}
|
|
16
59
|
}
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
const promptDisplay = includeProviderId ? `[${provider.id()}] ${prompt}` : prompt;
|
|
23
|
-
const setup = {
|
|
24
|
-
prompt: {
|
|
25
|
-
raw: renderedPrompt,
|
|
26
|
-
display: promptDisplay,
|
|
27
|
-
},
|
|
28
|
-
vars,
|
|
29
|
-
};
|
|
30
|
-
try {
|
|
31
|
-
const response = await provider.callApi(renderedPrompt);
|
|
32
|
-
const ret = {
|
|
33
|
-
...setup,
|
|
34
|
-
response,
|
|
35
|
-
success: false,
|
|
36
|
-
};
|
|
37
|
-
if (response.error) {
|
|
38
|
-
ret.error = response.error;
|
|
60
|
+
async checkExpectedValue(expected, output) {
|
|
61
|
+
if (expected.startsWith('eval:')) {
|
|
62
|
+
const evalBody = expected.slice(5);
|
|
63
|
+
const evalFunction = new Function('output', `return ${evalBody}`);
|
|
64
|
+
return { pass: evalFunction(output) };
|
|
39
65
|
}
|
|
40
|
-
else if (
|
|
41
|
-
const
|
|
42
|
-
|
|
43
|
-
:
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
}
|
|
47
|
-
ret.success = matchesExpected;
|
|
66
|
+
else if (expected.startsWith('grade:')) {
|
|
67
|
+
const gradingResult = await this.gradeOutput(expected.slice(6), output);
|
|
68
|
+
return {
|
|
69
|
+
pass: gradingResult.pass,
|
|
70
|
+
reason: gradingResult.reason,
|
|
71
|
+
};
|
|
48
72
|
}
|
|
49
73
|
else {
|
|
50
|
-
|
|
51
|
-
|
|
74
|
+
const pass = expected === output;
|
|
75
|
+
return {
|
|
76
|
+
pass,
|
|
77
|
+
reason: pass ? undefined : `Expected: ${expected}, Output: ${output}`,
|
|
78
|
+
};
|
|
52
79
|
}
|
|
53
|
-
return ret;
|
|
54
80
|
}
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
81
|
+
async runEval({ provider, prompt, vars, includeProviderId, }) {
|
|
82
|
+
vars = vars || {};
|
|
83
|
+
const renderedPrompt = nunjucks.renderString(prompt, vars);
|
|
84
|
+
// Note that we're using original prompt, not renderedPrompt
|
|
85
|
+
const promptDisplay = includeProviderId ? `[${provider.id()}] ${prompt}` : prompt;
|
|
86
|
+
const setup = {
|
|
87
|
+
prompt: {
|
|
88
|
+
raw: renderedPrompt,
|
|
89
|
+
display: promptDisplay,
|
|
90
|
+
},
|
|
91
|
+
vars,
|
|
60
92
|
};
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
93
|
+
try {
|
|
94
|
+
const response = await provider.callApi(renderedPrompt);
|
|
95
|
+
const ret = {
|
|
96
|
+
...setup,
|
|
97
|
+
response,
|
|
98
|
+
success: false,
|
|
99
|
+
};
|
|
100
|
+
if (response.error) {
|
|
101
|
+
ret.error = response.error;
|
|
102
|
+
}
|
|
103
|
+
else if (response.output) {
|
|
104
|
+
const checkResult = vars.__expected
|
|
105
|
+
? await this.checkExpectedValue(vars.__expected, response.output)
|
|
106
|
+
: { pass: true };
|
|
107
|
+
if (!checkResult.pass) {
|
|
108
|
+
ret.error = checkResult.reason || `Expected: ${vars.__expected}`;
|
|
109
|
+
}
|
|
110
|
+
ret.success = checkResult.pass;
|
|
111
|
+
}
|
|
112
|
+
else {
|
|
113
|
+
ret.success = false;
|
|
114
|
+
ret.error = 'No output';
|
|
115
|
+
}
|
|
116
|
+
// Update token usage stats
|
|
117
|
+
this.stats.tokenUsage.total += response.tokenUsage?.total || 0;
|
|
118
|
+
this.stats.tokenUsage.prompt += response.tokenUsage?.prompt || 0;
|
|
119
|
+
this.stats.tokenUsage.completion += response.tokenUsage?.completion || 0;
|
|
120
|
+
if (ret.success) {
|
|
121
|
+
this.stats.successes++;
|
|
122
|
+
}
|
|
123
|
+
else {
|
|
124
|
+
this.stats.failures++;
|
|
125
|
+
}
|
|
126
|
+
return ret;
|
|
127
|
+
}
|
|
128
|
+
catch (err) {
|
|
129
|
+
return {
|
|
130
|
+
...setup,
|
|
131
|
+
error: String(err),
|
|
132
|
+
success: false,
|
|
133
|
+
};
|
|
72
134
|
}
|
|
73
135
|
}
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
const
|
|
77
|
-
delete ret.__expected;
|
|
78
|
-
return ret;
|
|
79
|
-
});
|
|
80
|
-
const isTest = vars[0].__expected;
|
|
81
|
-
const table = [
|
|
82
|
-
isTest
|
|
83
|
-
? [
|
|
84
|
-
'RESULT',
|
|
85
|
-
[...prompts.map((p) => p.display), ...Object.keys(varsWithExpectedKeyRemoved[0])],
|
|
86
|
-
].flat()
|
|
87
|
-
: [...prompts.map((p) => p.display), ...Object.keys(varsWithExpectedKeyRemoved[0])],
|
|
88
|
-
];
|
|
89
|
-
const stats = {
|
|
90
|
-
successes: 0,
|
|
91
|
-
failures: 0,
|
|
92
|
-
tokenUsage: {
|
|
93
|
-
total: 0,
|
|
94
|
-
prompt: 0,
|
|
95
|
-
completion: 0,
|
|
96
|
-
},
|
|
97
|
-
};
|
|
98
|
-
let progressbar;
|
|
99
|
-
if (options.showProgressBar) {
|
|
100
|
-
const totalNumRuns = options.prompts.length * options.providers.length * (options.vars?.length || 1);
|
|
101
|
-
const cliProgress = await import('cli-progress');
|
|
102
|
-
progressbar = new cliProgress.SingleBar({
|
|
103
|
-
format: 'Eval: [{bar}] {percentage}% | ETA: {eta}s | {value}/{total} | {provider} "{prompt}" {vars}',
|
|
104
|
-
}, cliProgress.Presets.shades_classic);
|
|
105
|
-
progressbar.start(totalNumRuns, 0, {
|
|
106
|
-
provider: '',
|
|
107
|
-
prompt: '',
|
|
108
|
-
vars: '',
|
|
109
|
-
});
|
|
110
|
-
}
|
|
111
|
-
const runEvalOptions = [];
|
|
112
|
-
for (const row of vars) {
|
|
136
|
+
async evaluate() {
|
|
137
|
+
const options = this.options;
|
|
138
|
+
const prompts = [];
|
|
113
139
|
for (const promptContent of options.prompts) {
|
|
114
140
|
for (const provider of options.providers) {
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
includeProviderId: options.providers.length > 1,
|
|
141
|
+
const display = options.providers.length > 1 ? `[${provider.id()}] ${promptContent}` : promptContent;
|
|
142
|
+
prompts.push({
|
|
143
|
+
raw: promptContent,
|
|
144
|
+
display,
|
|
120
145
|
});
|
|
121
146
|
}
|
|
122
147
|
}
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
148
|
+
const vars = options.vars && options.vars.length > 0 ? options.vars : [{}];
|
|
149
|
+
const varsWithExpectedKeyRemoved = vars.map((v) => {
|
|
150
|
+
const ret = { ...v };
|
|
151
|
+
delete ret.__expected;
|
|
152
|
+
return ret;
|
|
153
|
+
});
|
|
154
|
+
const isTest = vars[0].__expected;
|
|
155
|
+
const table = [
|
|
156
|
+
[...prompts.map((p) => p.display), ...Object.keys(varsWithExpectedKeyRemoved[0])],
|
|
157
|
+
];
|
|
158
|
+
let progressbar;
|
|
159
|
+
if (options.showProgressBar) {
|
|
160
|
+
const totalNumRuns = options.prompts.length * options.providers.length * (options.vars?.length || 1);
|
|
161
|
+
const cliProgress = await import('cli-progress');
|
|
162
|
+
progressbar = new cliProgress.SingleBar({
|
|
163
|
+
format: 'Eval: [{bar}] {percentage}% | ETA: {eta}s | {value}/{total} | {provider} "{prompt}" {vars}',
|
|
164
|
+
}, cliProgress.Presets.shades_classic);
|
|
165
|
+
progressbar.start(totalNumRuns, 0, {
|
|
166
|
+
provider: '',
|
|
167
|
+
prompt: '',
|
|
168
|
+
vars: '',
|
|
169
|
+
});
|
|
130
170
|
}
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
171
|
+
const runEvalOptions = [];
|
|
172
|
+
for (const row of vars) {
|
|
173
|
+
for (const promptContent of options.prompts) {
|
|
174
|
+
for (const provider of options.providers) {
|
|
175
|
+
runEvalOptions.push({
|
|
176
|
+
provider,
|
|
177
|
+
prompt: promptContent,
|
|
178
|
+
vars: row,
|
|
179
|
+
includeProviderId: options.providers.length > 1,
|
|
180
|
+
});
|
|
181
|
+
}
|
|
134
182
|
}
|
|
135
|
-
else {
|
|
136
|
-
stats.failures++;
|
|
137
|
-
}
|
|
138
|
-
stats.tokenUsage.total += row.response?.tokenUsage?.total || 0;
|
|
139
|
-
stats.tokenUsage.prompt += row.response?.tokenUsage?.prompt || 0;
|
|
140
|
-
stats.tokenUsage.completion += row.response?.tokenUsage?.completion || 0;
|
|
141
183
|
}
|
|
184
|
+
const tempResults = [];
|
|
185
|
+
const combinedOutputs = new Array(vars.length).fill(null).map(() => []);
|
|
186
|
+
await async.forEachOfLimit(runEvalOptions, options.maxConcurrency || DEFAULT_MAX_CONCURRENCY, async (options, index) => {
|
|
187
|
+
const row = await this.runEval(options);
|
|
188
|
+
//results[index as number] = row;
|
|
189
|
+
tempResults.push({ index: index, row });
|
|
190
|
+
if (progressbar) {
|
|
191
|
+
progressbar.increment({
|
|
192
|
+
provider: options.provider.id(),
|
|
193
|
+
prompt: options.prompt.slice(0, 10),
|
|
194
|
+
vars: Object.entries(options.vars || {})
|
|
195
|
+
.map(([k, v]) => `${k}=${v}`)
|
|
196
|
+
.join(' ')
|
|
197
|
+
.slice(0, 10),
|
|
198
|
+
});
|
|
199
|
+
}
|
|
200
|
+
// Bookkeeping for table
|
|
201
|
+
if (typeof index !== 'number') {
|
|
202
|
+
throw new Error('Expected index to be a number');
|
|
203
|
+
}
|
|
204
|
+
const combinedOutputIndex = Math.floor(index / prompts.length);
|
|
205
|
+
combinedOutputs[combinedOutputIndex].push(row.response?.output || row.error || '');
|
|
206
|
+
});
|
|
142
207
|
if (progressbar) {
|
|
143
|
-
progressbar.
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
208
|
+
progressbar.stop();
|
|
209
|
+
}
|
|
210
|
+
const results = [];
|
|
211
|
+
tempResults
|
|
212
|
+
.sort((a, b) => a.index - b.index)
|
|
213
|
+
.forEach(({ index, row }) => {
|
|
214
|
+
results[index] = row;
|
|
215
|
+
});
|
|
216
|
+
// TODO(ian): Provide full context in table cells, and have the caller
|
|
217
|
+
// construct the table contents itself.
|
|
218
|
+
if (isTest) {
|
|
219
|
+
// Iterate through each combined output
|
|
220
|
+
combinedOutputs.forEach((output, index) => {
|
|
221
|
+
// Create a new array to store the modified output with [PASS] or [FAIL] prepended
|
|
222
|
+
const modifiedOutput = [];
|
|
223
|
+
// Iterate through each output value and prepend [PASS] or [FAIL] based on the success status
|
|
224
|
+
output.forEach((o, outputIndex) => {
|
|
225
|
+
const resultIndex = index * prompts.length + outputIndex;
|
|
226
|
+
const result = results[resultIndex];
|
|
227
|
+
// TODO(ian): sometimes output and result.error can be identical (in the case of exception)
|
|
228
|
+
const resultStatus = result.success ? `[PASS] ${o}` : `[FAIL] ${result.error}\n---\n${o}`;
|
|
229
|
+
modifiedOutput.push(resultStatus);
|
|
230
|
+
});
|
|
231
|
+
// Add the modified output and the corresponding values from varsWithExpectedKeyRemoved to the table
|
|
232
|
+
const tableRow = [...modifiedOutput, ...Object.values(varsWithExpectedKeyRemoved[index])];
|
|
233
|
+
table.push(tableRow);
|
|
150
234
|
});
|
|
151
235
|
}
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
throw new Error('Expected index to be a number');
|
|
236
|
+
else {
|
|
237
|
+
table.push(...combinedOutputs.map((output, index) => [...output, ...Object.values(vars[index])]));
|
|
155
238
|
}
|
|
156
|
-
|
|
157
|
-
combinedOutputs[combinedOutputIndex].push(row.response?.output || row.error || '');
|
|
158
|
-
});
|
|
159
|
-
if (progressbar) {
|
|
160
|
-
progressbar.stop();
|
|
161
|
-
}
|
|
162
|
-
// TODO(ian): Display errors in table UI.
|
|
163
|
-
if (isTest) {
|
|
164
|
-
table.push(...combinedOutputs.map((output, index) => [
|
|
165
|
-
results[index].success ? 'PASS' : `FAIL: ${results[index].error}`,
|
|
166
|
-
...output,
|
|
167
|
-
...Object.values(varsWithExpectedKeyRemoved[index]),
|
|
168
|
-
]));
|
|
239
|
+
return { results, stats: this.stats, table };
|
|
169
240
|
}
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
return
|
|
241
|
+
}
|
|
242
|
+
export function evaluate(options) {
|
|
243
|
+
const ev = new Evaluator(options);
|
|
244
|
+
return ev.evaluate();
|
|
174
245
|
}
|
|
175
246
|
//# sourceMappingURL=evaluator.js.map
|
package/dist/evaluator.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"evaluator.js","sourceRoot":"","sources":["../src/evaluator.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,MAAM,OAAO,CAAC;AAC1B,OAAO,QAAQ,MAAM,UAAU,CAAC;
|
|
1
|
+
{"version":3,"file":"evaluator.js","sourceRoot":"","sources":["../src/evaluator.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,MAAM,OAAO,CAAC;AAC1B,OAAO,QAAQ,MAAM,UAAU,CAAC;AAEhC,OAAO,EAAE,sBAAsB,EAAE,MAAM,cAAc,CAAC;AA0BtD,MAAM,uBAAuB,GAAG,CAAC,CAAC;AAElC,MAAM,SAAS;IAIb,YAAY,OAAwB;QAClC,IAAI,CAAC,OAAO,GAAG,OAAO,CAAC;QACvB,IAAI,CAAC,KAAK,GAAG;YACX,SAAS,EAAE,CAAC;YACZ,QAAQ,EAAE,CAAC;YACX,UAAU,EAAE;gBACV,KAAK,EAAE,CAAC;gBACR,MAAM,EAAE,CAAC;gBACT,UAAU,EAAE,CAAC;aACd;SACF,CAAC;IACJ,CAAC;IAED,KAAK,CAAC,WAAW,CAAC,QAAgB,EAAE,MAAc;QAChD,MAAM,EAAE,OAAO,EAAE,GAAG,IAAI,CAAC,OAAO,CAAC;QAEjC,IAAI,CAAC,OAAO,EAAE;YACZ,MAAM,IAAI,KAAK,CACb,wFAAwF,CACzF,CAAC;SACH;QAED,MAAM,MAAM,GAAG,QAAQ,CAAC,YAAY,CAAC,OAAO,CAAC,MAAM,IAAI,sBAAsB,EAAE;YAC7E,OAAO,EAAE,MAAM;YACf,MAAM,EAAE,QAAQ;SACjB,CAAC,CAAC;QAEH,MAAM,IAAI,GAAG,MAAM,OAAO,CAAC,QAAQ,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC;QACpD,IAAI,IAAI,CAAC,KAAK,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE;YAC9B,OAAO;gBACL,IAAI,EAAE,KAAK;gBACX,MAAM,EAAE,IAAI,CAAC,KAAK,IAAI,WAAW;gBACjC,UAAU,EAAE;oBACV,KAAK,EAAE,IAAI,CAAC,UAAU,EAAE,KAAK,IAAI,CAAC;oBAClC,MAAM,EAAE,IAAI,CAAC,UAAU,EAAE,MAAM,IAAI,CAAC;oBACpC,UAAU,EAAE,IAAI,CAAC,UAAU,EAAE,UAAU,IAAI,CAAC;iBAC7C;aACF,CAAC;SACH;QAED,IAAI;YACF,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,CAAkB,CAAC;YACxD,MAAM,CAAC,UAAU,GAAG;gBAClB,KAAK,EAAE,IAAI,CAAC,UAAU,EAAE,KAAK,IAAI,CAAC;gBAClC,MAAM,EAAE,IAAI,CAAC,UAAU,EAAE,MAAM,IAAI,CAAC;gBACpC,UAAU,EAAE,IAAI,CAAC,UAAU,EAAE,UAAU,IAAI,CAAC;aAC7C,CAAC;YACF,OAAO,MAAM,CAAC;SACf;QAAC,OAAO,GAAG,EAAE;YACZ,OAAO;gBACL,IAAI,EAAE,KAAK;gBACX,MAAM,EAAE,6BAA6B,IAAI,CAAC,MAAM,EAAE;gBAClD,UAAU,EAAE;oBACV,KAAK,EAAE,IAAI,CAAC,UAAU,EAAE,KAAK,IAAI,CAAC;oBAClC,MAAM,EAAE,IAAI,CAAC,UAAU,EAAE,MAAM,IAAI,CAAC;oBACpC,UAAU,EAAE,IAAI,CAAC,UAAU,EAAE,UAAU,IAAI,CAAC;iBAC7C;aACF,CAAC;SACH;IACH,CAAC;IAED,KAAK,CAAC,kBAAkB,CACtB,QAAgB,EAChB,MAAc;QAEd,IAAI,QAAQ,CAAC,UAAU,CAAC,OAAO,CAAC,EAAE;YAChC,MAAM,QAAQ,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;YACnC,MAAM,YAAY,GAAG,IAAI,QAAQ,CAAC,QAAQ,EAAE,UAAU,QAAQ,EAAE,CAAC,CAAC;YAClE,OAAO,EAAE,IAAI
,EAAE,YAAY,CAAC,MAAM,CAAC,EAAE,CAAC;SACvC;aAAM,IAAI,QAAQ,CAAC,UAAU,CAAC,QAAQ,CAAC,EAAE;YACxC,MAAM,aAAa,GAAG,MAAM,IAAI,CAAC,WAAW,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,MAAM,CAAC,CAAC;YACxE,OAAO;gBACL,IAAI,EAAE,aAAa,CAAC,IAAI;gBACxB,MAAM,EAAE,aAAa,CAAC,MAAM;aAC7B,CAAC;SACH;aAAM;YACL,MAAM,IAAI,GAAG,QAAQ,KAAK,MAAM,CAAC;YACjC,OAAO;gBACL,IAAI;gBACJ,MAAM,EAAE,IAAI,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,aAAa,QAAQ,aAAa,MAAM,EAAE;aACtE,CAAC;SACH;IACH,CAAC;IAED,KAAK,CAAC,OAAO,CAAC,EACZ,QAAQ,EACR,MAAM,EACN,IAAI,EACJ,iBAAiB,GACF;QACf,IAAI,GAAG,IAAI,IAAI,EAAE,CAAC;QAClB,MAAM,cAAc,GAAG,QAAQ,CAAC,YAAY,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;QAE3D,4DAA4D;QAC5D,MAAM,aAAa,GAAG,iBAAiB,CAAC,CAAC,CAAC,IAAI,QAAQ,CAAC,EAAE,EAAE,KAAK,MAAM,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC;QAElF,MAAM,KAAK,GAAG;YACZ,MAAM,EAAE;gBACN,GAAG,EAAE,cAAc;gBACnB,OAAO,EAAE,aAAa;aACvB;YACD,IAAI;SACL,CAAC;QAEF,IAAI;YACF,MAAM,QAAQ,GAAG,MAAM,QAAQ,CAAC,OAAO,CAAC,cAAc,CAAC,CAAC;YACxD,MAAM,GAAG,GAAmB;gBAC1B,GAAG,KAAK;gBACR,QAAQ;gBACR,OAAO,EAAE,KAAK;aACf,CAAC;YACF,IAAI,QAAQ,CAAC,KAAK,EAAE;gBAClB,GAAG,CAAC,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC;aAC5B;iBAAM,IAAI,QAAQ,CAAC,MAAM,EAAE;gBAC1B,MAAM,WAAW,GAAG,IAAI,CAAC,UAAU;oBACjC,CAAC,CAAC,MAAM,IAAI,CAAC,kBAAkB,CAAC,IAAI,CAAC,UAAU,EAAE,QAAQ,CAAC,MAAM,CAAC;oBACjE,CAAC,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;gBACnB,IAAI,CAAC,WAAW,CAAC,IAAI,EAAE;oBACrB,GAAG,CAAC,KAAK,GAAG,WAAW,CAAC,MAAM,IAAI,aAAa,IAAI,CAAC,UAAU,EAAE,CAAC;iBAClE;gBACD,GAAG,CAAC,OAAO,GAAG,WAAW,CAAC,IAAI,CAAC;aAChC;iBAAM;gBACL,GAAG,CAAC,OAAO,GAAG,KAAK,CAAC;gBACpB,GAAG,CAAC,KAAK,GAAG,WAAW,CAAC;aACzB;YAED,2BAA2B;YAC3B,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,KAAK,IAAI,QAAQ,CAAC,UAAU,EAAE,KAAK,IAAI,CAAC,CAAC;YAC/D,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,MAAM,IAAI,QAAQ,CAAC,UAAU,EAAE,MAAM,IAAI,CAAC,CAAC;YACjE,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,UAAU,IAAI,QAAQ,CAAC,UAAU,EAAE,UAAU,IAAI,CAAC,CAAC;YAEzE,IAAI,GAAG,CAAC,OAAO,EAAE;gBACf,IAAI,CAAC,KAAK,CAAC,SAAS,EAAE,CAAC;aACxB;iBAAM;gBACL,IAAI,CAAC,KAAK,CAAC,QAAQ,EAAE,CAAC;aACvB;YAED,OAAO,GAAG,CAAC;SACZ;QAAC,OAAO,GAAG,EAAE;YACZ,OAAO;gBACL,G
AAG,KAAK;gBACR,KAAK,EAAE,MAAM,CAAC,GAAG,CAAC;gBAClB,OAAO,EAAE,KAAK;aACf,CAAC;SACH;IACH,CAAC;IAED,KAAK,CAAC,QAAQ;QACZ,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,CAAC;QAC7B,MAAM,OAAO,GAAa,EAAE,CAAC;QAE7B,KAAK,MAAM,aAAa,IAAI,OAAO,CAAC,OAAO,EAAE;YAC3C,KAAK,MAAM,QAAQ,IAAI,OAAO,CAAC,SAAS,EAAE;gBACxC,MAAM,OAAO,GACX,OAAO,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,QAAQ,CAAC,EAAE,EAAE,KAAK,aAAa,EAAE,CAAC,CAAC,CAAC,aAAa,CAAC;gBACvF,OAAO,CAAC,IAAI,CAAC;oBACX,GAAG,EAAE,aAAa;oBAClB,OAAO;iBACR,CAAC,CAAC;aACJ;SACF;QAED,MAAM,IAAI,GAAG,OAAO,CAAC,IAAI,IAAI,OAAO,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;QAC3E,MAAM,0BAA0B,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE;YAChD,MAAM,GAAG,GAAG,EAAE,GAAG,CAAC,EAAE,CAAC;YACrB,OAAO,GAAG,CAAC,UAAU,CAAC;YACtB,OAAO,GAAG,CAAC;QACb,CAAC,CAAC,CAAC;QACH,MAAM,MAAM,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC;QAClC,MAAM,KAAK,GAAe;YACxB,CAAC,GAAG,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,EAAE,GAAG,MAAM,CAAC,IAAI,CAAC,0BAA0B,CAAC,CAAC,CAAC,CAAC,CAAC;SAClF,CAAC;QAEF,IAAI,WAAkC,CAAC;QACvC,IAAI,OAAO,CAAC,eAAe,EAAE;YAC3B,MAAM,YAAY,GAChB,OAAO,CAAC,OAAO,CAAC,MAAM,GAAG,OAAO,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC,OAAO,CAAC,IAAI,EAAE,MAAM,IAAI,CAAC,CAAC,CAAC;YAClF,MAAM,WAAW,GAAG,MAAM,MAAM,CAAC,cAAc,CAAC,CAAC;YACjD,WAAW,GAAG,IAAI,WAAW,CAAC,SAAS,CACrC;gBACE,MAAM,EACJ,4FAA4F;aAC/F,EACD,WAAW,CAAC,OAAO,CAAC,cAAc,CACnC,CAAC;YACF,WAAW,CAAC,KAAK,CAAC,YAAY,EAAE,CAAC,EAAE;gBACjC,QAAQ,EAAE,EAAE;gBACZ,MAAM,EAAE,EAAE;gBACV,IAAI,EAAE,EAAE;aACT,CAAC,CAAC;SACJ;QAED,MAAM,cAAc,GAAqB,EAAE,CAAC;QAC5C,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE;YACtB,KAAK,MAAM,aAAa,IAAI,OAAO,CAAC,OAAO,EAAE;gBAC3C,KAAK,MAAM,QAAQ,IAAI,OAAO,CAAC,SAAS,EAAE;oBACxC,cAAc,CAAC,IAAI,CAAC;wBAClB,QAAQ;wBACR,MAAM,EAAE,aAAa;wBACrB,IAAI,EAAE,GAAG;wBACT,iBAAiB,EAAE,OAAO,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC;qBAChD,CAAC,CAAC;iBACJ;aACF;SACF;QAED,MAAM,WAAW,GAA6C,EAAE,CAAC;QACjE,MAAM,eAAe,GAAe,IAAI,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,EAAE,
CAAC,CAAC;QACpF,MAAM,KAAK,CAAC,cAAc,CACxB,cAAc,EACd,OAAO,CAAC,cAAc,IAAI,uBAAuB,EACjD,KAAK,EAAE,OAAuB,EAAE,KAAsB,EAAE,EAAE;YACxD,MAAM,GAAG,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;YACxC,iCAAiC;YACjC,WAAW,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,KAAe,EAAE,GAAG,EAAE,CAAC,CAAC;YAElD,IAAI,WAAW,EAAE;gBACf,WAAW,CAAC,SAAS,CAAC;oBACpB,QAAQ,EAAE,OAAO,CAAC,QAAQ,CAAC,EAAE,EAAE;oBAC/B,MAAM,EAAE,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC;oBACnC,IAAI,EAAE,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,IAAI,IAAI,EAAE,CAAC;yBACrC,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC;yBAC5B,IAAI,CAAC,GAAG,CAAC;yBACT,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC;iBAChB,CAAC,CAAC;aACJ;YAED,wBAAwB;YACxB,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE;gBAC7B,MAAM,IAAI,KAAK,CAAC,+BAA+B,CAAC,CAAC;aAClD;YACD,MAAM,mBAAmB,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;YAC/D,eAAe,CAAC,mBAAmB,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,QAAQ,EAAE,MAAM,IAAI,GAAG,CAAC,KAAK,IAAI,EAAE,CAAC,CAAC;QACrF,CAAC,CACF,CAAC;QAEF,IAAI,WAAW,EAAE;YACf,WAAW,CAAC,IAAI,EAAE,CAAC;SACpB;QAED,MAAM,OAAO,GAAqB,EAAE,CAAC;QACrC,WAAW;aACR,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC;aACjC,OAAO,CAAC,CAAC,EAAE,KAAK,EAAE,GAAG,EAAE,EAAE,EAAE;YAC1B,OAAO,CAAC,KAAK,CAAC,GAAG,GAAG,CAAC;QACvB,CAAC,CAAC,CAAC;QAEL,sEAAsE;QACtE,uCAAuC;QACvC,IAAI,MAAM,EAAE;YACV,uCAAuC;YACvC,eAAe,CAAC,OAAO,CAAC,CAAC,MAAM,EAAE,KAAK,EAAE,EAAE;gBACxC,kFAAkF;gBAClF,MAAM,cAAc,GAAa,EAAE,CAAC;gBAEpC,6FAA6F;gBAC7F,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,WAAW,EAAE,EAAE;oBAChC,MAAM,WAAW,GAAG,KAAK,GAAG,OAAO,CAAC,MAAM,GAAG,WAAW,CAAC;oBACzD,MAAM,MAAM,GAAG,OAAO,CAAC,WAAW,CAAC,CAAC;oBACpC,2FAA2F;oBAC3F,MAAM,YAAY,GAAG,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,UAAU,CAAC,EAAE,CAAC,CAAC,CAAC,UAAU,MAAM,CAAC,KAAK,UAAU,CAAC,EAAE,CAAC;oBAC1F,cAAc,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;gBACpC,CAAC,CAAC,CAAC;gBAEH,oGAAoG;gBACpG,MAAM,QAAQ,GAAG,CAAC,GAAG,cAAc,EAAE,GAAG,MAAM,CAAC,MAAM,CAAC,0BAA0B,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;gBAC1F,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;YACvB,CAAC,CAAC,CAA
C;SACJ;aAAM;YACL,KAAK,CAAC,IAAI,CACR,GAAG,eAAe,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,KAAK,EAAE,EAAE,CAAC,CAAC,GAAG,MAAM,EAAE,GAAG,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CACtF,CAAC;SACH;QAED,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,IAAI,CAAC,KAAK,EAAE,KAAK,EAAE,CAAC;IAC/C,CAAC;CACF;AAED,MAAM,UAAU,QAAQ,CAAC,OAAwB;IAC/C,MAAM,EAAE,GAAG,IAAI,SAAS,CAAC,OAAO,CAAC,CAAC;IAClC,OAAO,EAAE,CAAC,QAAQ,EAAE,CAAC;AACvB,CAAC"}
|
package/dist/main.js
CHANGED
|
@@ -31,7 +31,7 @@ These prompts are nunjucks templates, so you can use logic like this:
|
|
|
31
31
|
prompts: ['prompts.txt'],
|
|
32
32
|
providers: ['openai:gpt-3.5-turbo'],
|
|
33
33
|
vars: 'vars.csv',
|
|
34
|
-
maxConcurrency:
|
|
34
|
+
maxConcurrency: 4,
|
|
35
35
|
};`;
|
|
36
36
|
const readme = `To get started, set your OPENAI_API_KEY environment variable. Then run:
|
|
37
37
|
\`\`\`
|
|
@@ -89,6 +89,7 @@ async function main() {
|
|
|
89
89
|
.option('-v, --vars <path>', 'Path to file with prompt variables (csv, json, yaml)', defaultConfig.vars)
|
|
90
90
|
.option('-c, --config <path>', 'Path to configuration file. Automatically loads promptfooconfig.js', defaultConfig.config)
|
|
91
91
|
.option('-j, --max-concurrency <number>', 'Maximum number of concurrent API calls', String(defaultConfig.maxConcurrency))
|
|
92
|
+
.option('--grader', 'Model that will grade outputs', defaultConfig.grader)
|
|
92
93
|
.option('--verbose', 'Show debug logs', defaultConfig.verbose)
|
|
93
94
|
.action(async (cmdObj) => {
|
|
94
95
|
if (cmdObj.verbose) {
|
|
@@ -123,6 +124,11 @@ async function main() {
|
|
|
123
124
|
maxConcurrency: cmdObj.maxConcurrency && cmdObj.maxConcurrency > 0 ? cmdObj.maxConcurrency : undefined,
|
|
124
125
|
...config,
|
|
125
126
|
};
|
|
127
|
+
if (cmdObj.grader) {
|
|
128
|
+
options.grading = {
|
|
129
|
+
provider: await loadApiProvider(cmdObj.grader),
|
|
130
|
+
};
|
|
131
|
+
}
|
|
126
132
|
const summary = await evaluate(options);
|
|
127
133
|
if (cmdObj.output) {
|
|
128
134
|
logger.info(chalk.yellow(`Writing output to ${cmdObj.output}`));
|
|
@@ -141,10 +147,22 @@ async function main() {
|
|
|
141
147
|
head: ['blue', 'bold'],
|
|
142
148
|
},
|
|
143
149
|
});
|
|
144
|
-
// Skip first row (header) and add the rest. Color
|
|
150
|
+
// Skip first row (header) and add the rest. Color PASS/FAIL
|
|
145
151
|
for (const row of summary.table.slice(1)) {
|
|
146
|
-
|
|
147
|
-
|
|
152
|
+
table.push(row.map((col) => {
|
|
153
|
+
if (col.startsWith('[PASS]')) {
|
|
154
|
+
// color '[PASS]' green
|
|
155
|
+
return chalk.green.bold(col.slice(0, 6)) + col.slice(6);
|
|
156
|
+
}
|
|
157
|
+
else if (col.startsWith('[FAIL]')) {
|
|
158
|
+
// color everything red up until '---'
|
|
159
|
+
return col
|
|
160
|
+
.split('---')
|
|
161
|
+
.map((c, idx) => (idx === 0 ? chalk.red.bold(c) : c))
|
|
162
|
+
.join('---');
|
|
163
|
+
}
|
|
164
|
+
return col;
|
|
165
|
+
}));
|
|
148
166
|
}
|
|
149
167
|
logger.info('\n' + table.toString());
|
|
150
168
|
}
|