promptfoo 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +33 -14
- package/dist/evaluator.d.ts +1 -1
- package/dist/evaluator.d.ts.map +1 -1
- package/dist/evaluator.js +222 -139
- package/dist/evaluator.js.map +1 -1
- package/dist/main.js +28 -4
- package/dist/main.js.map +1 -1
- package/dist/prompts.d.ts +2 -0
- package/dist/prompts.d.ts.map +1 -0
- package/dist/prompts.js +21 -0
- package/dist/prompts.js.map +1 -0
- package/dist/providers.d.ts +1 -0
- package/dist/providers.d.ts.map +1 -1
- package/dist/providers.js +71 -36
- package/dist/providers.js.map +1 -1
- package/dist/tableOutput.html +5 -8
- package/dist/types.d.ts +15 -8
- package/dist/types.d.ts.map +1 -1
- package/dist/util.d.ts +2 -0
- package/dist/util.d.ts.map +1 -1
- package/dist/util.js +26 -0
- package/dist/util.js.map +1 -1
- package/package.json +1 -1
- package/src/evaluator.ts +278 -165
- package/src/main.ts +33 -4
- package/src/prompts.ts +20 -0
- package/src/providers.ts +82 -38
- package/src/tableOutput.html +5 -8
- package/src/types.ts +18 -7
- package/src/util.ts +33 -0
package/src/evaluator.ts
CHANGED
|
@@ -1,9 +1,18 @@
|
|
|
1
1
|
import async from 'async';
|
|
2
2
|
import nunjucks from 'nunjucks';
|
|
3
3
|
|
|
4
|
-
import
|
|
4
|
+
import { DEFAULT_GRADING_PROMPT } from './prompts.js';
|
|
5
5
|
|
|
6
|
-
import {
|
|
6
|
+
import type { SingleBar } from 'cli-progress';
|
|
7
|
+
import type {
|
|
8
|
+
ApiProvider,
|
|
9
|
+
EvaluateOptions,
|
|
10
|
+
EvaluateResult,
|
|
11
|
+
EvaluateStats,
|
|
12
|
+
EvaluateSummary,
|
|
13
|
+
Prompt,
|
|
14
|
+
TokenUsage,
|
|
15
|
+
} from './types.js';
|
|
7
16
|
|
|
8
17
|
interface RunEvalOptions {
|
|
9
18
|
provider: ApiProvider;
|
|
@@ -12,193 +21,297 @@ interface RunEvalOptions {
|
|
|
12
21
|
includeProviderId?: boolean;
|
|
13
22
|
}
|
|
14
23
|
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
const evalBody = expected.slice(5);
|
|
20
|
-
const evalFunction = new Function('output', `return ${evalBody}`);
|
|
21
|
-
return evalFunction(output);
|
|
22
|
-
} else if (expected.startsWith('grade:')) {
|
|
23
|
-
// NYI
|
|
24
|
-
return false;
|
|
25
|
-
} else {
|
|
26
|
-
return expected === output;
|
|
27
|
-
}
|
|
24
|
+
interface GradingResult {
|
|
25
|
+
pass: boolean;
|
|
26
|
+
reason: string;
|
|
27
|
+
tokensUsed: TokenUsage;
|
|
28
28
|
}
|
|
29
29
|
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
},
|
|
47
|
-
vars,
|
|
48
|
-
};
|
|
49
|
-
|
|
50
|
-
try {
|
|
51
|
-
const response = await provider.callApi(renderedPrompt);
|
|
52
|
-
const success = vars.__expected ? checkExpectedValue(vars.__expected, response.output) : true;
|
|
53
|
-
const ret: EvaluateResult = {
|
|
54
|
-
...setup,
|
|
55
|
-
response,
|
|
56
|
-
success,
|
|
57
|
-
};
|
|
58
|
-
if (!success) {
|
|
59
|
-
ret.error = `Expected ${vars.__expected}, got "${response.output}"`;
|
|
60
|
-
}
|
|
61
|
-
return ret;
|
|
62
|
-
} catch (err) {
|
|
63
|
-
return {
|
|
64
|
-
...setup,
|
|
65
|
-
error: String(err),
|
|
66
|
-
success: false,
|
|
30
|
+
const DEFAULT_MAX_CONCURRENCY = 4;
|
|
31
|
+
|
|
32
|
+
class Evaluator {
|
|
33
|
+
options: EvaluateOptions;
|
|
34
|
+
stats: EvaluateStats;
|
|
35
|
+
|
|
36
|
+
constructor(options: EvaluateOptions) {
|
|
37
|
+
this.options = options;
|
|
38
|
+
this.stats = {
|
|
39
|
+
successes: 0,
|
|
40
|
+
failures: 0,
|
|
41
|
+
tokenUsage: {
|
|
42
|
+
total: 0,
|
|
43
|
+
prompt: 0,
|
|
44
|
+
completion: 0,
|
|
45
|
+
},
|
|
67
46
|
};
|
|
68
47
|
}
|
|
69
|
-
}
|
|
70
48
|
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
const results: EvaluateResult[] = [];
|
|
49
|
+
async gradeOutput(expected: string, output: string): Promise<GradingResult> {
|
|
50
|
+
const { grading } = this.options;
|
|
74
51
|
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
display:
|
|
80
|
-
options.providers.length > 1 ? `[${provider.id()}] ${promptContent}` : promptContent,
|
|
81
|
-
});
|
|
52
|
+
if (!grading) {
|
|
53
|
+
throw new Error(
|
|
54
|
+
'Cannot grade output without grading config. Specify --grader option or grading config.',
|
|
55
|
+
);
|
|
82
56
|
}
|
|
83
|
-
}
|
|
84
57
|
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
delete ret.__expected;
|
|
89
|
-
return ret;
|
|
90
|
-
});
|
|
91
|
-
const isTest = vars[0].__expected;
|
|
92
|
-
const table: string[][] = [
|
|
93
|
-
isTest
|
|
94
|
-
? [
|
|
95
|
-
'RESULT',
|
|
96
|
-
[...prompts.map((p) => p.display), ...Object.keys(varsWithExpectedKeyRemoved[0])],
|
|
97
|
-
].flat()
|
|
98
|
-
: [...prompts.map((p) => p.display), ...Object.keys(varsWithExpectedKeyRemoved[0])],
|
|
99
|
-
];
|
|
100
|
-
|
|
101
|
-
const stats = {
|
|
102
|
-
successes: 0,
|
|
103
|
-
failures: 0,
|
|
104
|
-
tokenUsage: {
|
|
105
|
-
total: 0,
|
|
106
|
-
prompt: 0,
|
|
107
|
-
completion: 0,
|
|
108
|
-
},
|
|
109
|
-
};
|
|
110
|
-
|
|
111
|
-
let progressbar: SingleBar | undefined;
|
|
112
|
-
if (options.showProgressBar) {
|
|
113
|
-
const totalNumRuns =
|
|
114
|
-
options.prompts.length * options.providers.length * (options.vars?.length || 1);
|
|
115
|
-
const cliProgress = await import('cli-progress');
|
|
116
|
-
progressbar = new cliProgress.SingleBar(
|
|
117
|
-
{
|
|
118
|
-
format:
|
|
119
|
-
'Eval: [{bar}] {percentage}% | ETA: {eta}s | {value}/{total} | {provider} "{prompt}" {vars}',
|
|
120
|
-
},
|
|
121
|
-
cliProgress.Presets.shades_classic,
|
|
122
|
-
);
|
|
123
|
-
progressbar.start(totalNumRuns, 0, {
|
|
124
|
-
provider: '',
|
|
125
|
-
prompt: '',
|
|
126
|
-
vars: '',
|
|
58
|
+
const prompt = nunjucks.renderString(grading.prompt || DEFAULT_GRADING_PROMPT, {
|
|
59
|
+
content: output,
|
|
60
|
+
rubric: expected,
|
|
127
61
|
});
|
|
62
|
+
|
|
63
|
+
const resp = await grading.provider.callApi(prompt);
|
|
64
|
+
if (resp.error || !resp.output) {
|
|
65
|
+
return {
|
|
66
|
+
pass: false,
|
|
67
|
+
reason: resp.error || 'No output',
|
|
68
|
+
tokensUsed: {
|
|
69
|
+
total: resp.tokenUsage?.total || 0,
|
|
70
|
+
prompt: resp.tokenUsage?.prompt || 0,
|
|
71
|
+
completion: resp.tokenUsage?.completion || 0,
|
|
72
|
+
},
|
|
73
|
+
};
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
try {
|
|
77
|
+
const parsed = JSON.parse(resp.output) as GradingResult;
|
|
78
|
+
parsed.tokensUsed = {
|
|
79
|
+
total: resp.tokenUsage?.total || 0,
|
|
80
|
+
prompt: resp.tokenUsage?.prompt || 0,
|
|
81
|
+
completion: resp.tokenUsage?.completion || 0,
|
|
82
|
+
};
|
|
83
|
+
return parsed;
|
|
84
|
+
} catch (err) {
|
|
85
|
+
return {
|
|
86
|
+
pass: false,
|
|
87
|
+
reason: `Output is not valid JSON: ${resp.output}`,
|
|
88
|
+
tokensUsed: {
|
|
89
|
+
total: resp.tokenUsage?.total || 0,
|
|
90
|
+
prompt: resp.tokenUsage?.prompt || 0,
|
|
91
|
+
completion: resp.tokenUsage?.completion || 0,
|
|
92
|
+
},
|
|
93
|
+
};
|
|
94
|
+
}
|
|
128
95
|
}
|
|
129
96
|
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
97
|
+
async checkExpectedValue(
|
|
98
|
+
expected: string,
|
|
99
|
+
output: string,
|
|
100
|
+
): Promise<{ pass: boolean; reason?: string }> {
|
|
101
|
+
if (expected.startsWith('eval:')) {
|
|
102
|
+
const evalBody = expected.slice(5);
|
|
103
|
+
const evalFunction = new Function('output', `return ${evalBody}`);
|
|
104
|
+
return { pass: evalFunction(output) };
|
|
105
|
+
} else if (expected.startsWith('grade:')) {
|
|
106
|
+
const gradingResult = await this.gradeOutput(expected.slice(6), output);
|
|
107
|
+
return {
|
|
108
|
+
pass: gradingResult.pass,
|
|
109
|
+
reason: gradingResult.reason,
|
|
110
|
+
};
|
|
111
|
+
} else {
|
|
112
|
+
const pass = expected === output;
|
|
113
|
+
return {
|
|
114
|
+
pass,
|
|
115
|
+
reason: pass ? undefined : `Expected: ${expected}, Output: ${output}`,
|
|
116
|
+
};
|
|
141
117
|
}
|
|
142
118
|
}
|
|
143
119
|
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
120
|
+
async runEval({
|
|
121
|
+
provider,
|
|
122
|
+
prompt,
|
|
123
|
+
vars,
|
|
124
|
+
includeProviderId,
|
|
125
|
+
}: RunEvalOptions): Promise<EvaluateResult> {
|
|
126
|
+
vars = vars || {};
|
|
127
|
+
const renderedPrompt = nunjucks.renderString(prompt, vars);
|
|
128
|
+
|
|
129
|
+
// Note that we're using original prompt, not renderedPrompt
|
|
130
|
+
const promptDisplay = includeProviderId ? `[${provider.id()}] ${prompt}` : prompt;
|
|
131
|
+
|
|
132
|
+
const setup = {
|
|
133
|
+
prompt: {
|
|
134
|
+
raw: renderedPrompt,
|
|
135
|
+
display: promptDisplay,
|
|
136
|
+
},
|
|
137
|
+
vars,
|
|
138
|
+
};
|
|
139
|
+
|
|
140
|
+
try {
|
|
141
|
+
const response = await provider.callApi(renderedPrompt);
|
|
142
|
+
const ret: EvaluateResult = {
|
|
143
|
+
...setup,
|
|
144
|
+
response,
|
|
145
|
+
success: false,
|
|
146
|
+
};
|
|
147
|
+
if (response.error) {
|
|
148
|
+
ret.error = response.error;
|
|
149
|
+
} else if (response.output) {
|
|
150
|
+
const checkResult = vars.__expected
|
|
151
|
+
? await this.checkExpectedValue(vars.__expected, response.output)
|
|
152
|
+
: { pass: true };
|
|
153
|
+
if (!checkResult.pass) {
|
|
154
|
+
ret.error = checkResult.reason || `Expected: ${vars.__expected}`;
|
|
158
155
|
}
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
156
|
+
ret.success = checkResult.pass;
|
|
157
|
+
} else {
|
|
158
|
+
ret.success = false;
|
|
159
|
+
ret.error = 'No output';
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
// Update token usage stats
|
|
163
|
+
this.stats.tokenUsage.total += response.tokenUsage?.total || 0;
|
|
164
|
+
this.stats.tokenUsage.prompt += response.tokenUsage?.prompt || 0;
|
|
165
|
+
this.stats.tokenUsage.completion += response.tokenUsage?.completion || 0;
|
|
166
|
+
|
|
167
|
+
if (ret.success) {
|
|
168
|
+
this.stats.successes++;
|
|
169
|
+
} else {
|
|
170
|
+
this.stats.failures++;
|
|
162
171
|
}
|
|
163
172
|
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
173
|
+
return ret;
|
|
174
|
+
} catch (err) {
|
|
175
|
+
return {
|
|
176
|
+
...setup,
|
|
177
|
+
error: String(err),
|
|
178
|
+
success: false,
|
|
179
|
+
};
|
|
180
|
+
}
|
|
181
|
+
}
|
|
182
|
+
|
|
183
|
+
async evaluate(): Promise<EvaluateSummary> {
|
|
184
|
+
const options = this.options;
|
|
185
|
+
const prompts: Prompt[] = [];
|
|
186
|
+
|
|
187
|
+
for (const promptContent of options.prompts) {
|
|
188
|
+
for (const provider of options.providers) {
|
|
189
|
+
const display =
|
|
190
|
+
options.providers.length > 1 ? `[${provider.id()}] ${promptContent}` : promptContent;
|
|
191
|
+
prompts.push({
|
|
192
|
+
raw: promptContent,
|
|
193
|
+
display,
|
|
172
194
|
});
|
|
173
195
|
}
|
|
196
|
+
}
|
|
174
197
|
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
198
|
+
const vars = options.vars && options.vars.length > 0 ? options.vars : [{}];
|
|
199
|
+
const varsWithExpectedKeyRemoved = vars.map((v) => {
|
|
200
|
+
const ret = { ...v };
|
|
201
|
+
delete ret.__expected;
|
|
202
|
+
return ret;
|
|
203
|
+
});
|
|
204
|
+
const isTest = vars[0].__expected;
|
|
205
|
+
const table: string[][] = [
|
|
206
|
+
[...prompts.map((p) => p.display), ...Object.keys(varsWithExpectedKeyRemoved[0])],
|
|
207
|
+
];
|
|
208
|
+
|
|
209
|
+
let progressbar: SingleBar | undefined;
|
|
210
|
+
if (options.showProgressBar) {
|
|
211
|
+
const totalNumRuns =
|
|
212
|
+
options.prompts.length * options.providers.length * (options.vars?.length || 1);
|
|
213
|
+
const cliProgress = await import('cli-progress');
|
|
214
|
+
progressbar = new cliProgress.SingleBar(
|
|
215
|
+
{
|
|
216
|
+
format:
|
|
217
|
+
'Eval: [{bar}] {percentage}% | ETA: {eta}s | {value}/{total} | {provider} "{prompt}" {vars}',
|
|
218
|
+
},
|
|
219
|
+
cliProgress.Presets.shades_classic,
|
|
220
|
+
);
|
|
221
|
+
progressbar.start(totalNumRuns, 0, {
|
|
222
|
+
provider: '',
|
|
223
|
+
prompt: '',
|
|
224
|
+
vars: '',
|
|
225
|
+
});
|
|
226
|
+
}
|
|
227
|
+
|
|
228
|
+
const runEvalOptions: RunEvalOptions[] = [];
|
|
229
|
+
for (const row of vars) {
|
|
230
|
+
for (const promptContent of options.prompts) {
|
|
231
|
+
for (const provider of options.providers) {
|
|
232
|
+
runEvalOptions.push({
|
|
233
|
+
provider,
|
|
234
|
+
prompt: promptContent,
|
|
235
|
+
vars: row,
|
|
236
|
+
includeProviderId: options.providers.length > 1,
|
|
237
|
+
});
|
|
238
|
+
}
|
|
178
239
|
}
|
|
179
|
-
|
|
180
|
-
combinedOutputs[combinedOutputIndex].push(row.response?.output || '');
|
|
181
|
-
},
|
|
182
|
-
);
|
|
240
|
+
}
|
|
183
241
|
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
242
|
+
const tempResults: { index: number; row: EvaluateResult }[] = [];
|
|
243
|
+
const combinedOutputs: string[][] = new Array(vars.length).fill(null).map(() => []);
|
|
244
|
+
await async.forEachOfLimit(
|
|
245
|
+
runEvalOptions,
|
|
246
|
+
options.maxConcurrency || DEFAULT_MAX_CONCURRENCY,
|
|
247
|
+
async (options: RunEvalOptions, index: number | string) => {
|
|
248
|
+
const row = await this.runEval(options);
|
|
249
|
+
//results[index as number] = row;
|
|
250
|
+
tempResults.push({ index: index as number, row });
|
|
187
251
|
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
252
|
+
if (progressbar) {
|
|
253
|
+
progressbar.increment({
|
|
254
|
+
provider: options.provider.id(),
|
|
255
|
+
prompt: options.prompt.slice(0, 10),
|
|
256
|
+
vars: Object.entries(options.vars || {})
|
|
257
|
+
.map(([k, v]) => `${k}=${v}`)
|
|
258
|
+
.join(' ')
|
|
259
|
+
.slice(0, 10),
|
|
260
|
+
});
|
|
261
|
+
}
|
|
262
|
+
|
|
263
|
+
// Bookkeeping for table
|
|
264
|
+
if (typeof index !== 'number') {
|
|
265
|
+
throw new Error('Expected index to be a number');
|
|
266
|
+
}
|
|
267
|
+
const combinedOutputIndex = Math.floor(index / prompts.length);
|
|
268
|
+
combinedOutputs[combinedOutputIndex].push(row.response?.output || row.error || '');
|
|
269
|
+
},
|
|
200
270
|
);
|
|
271
|
+
|
|
272
|
+
if (progressbar) {
|
|
273
|
+
progressbar.stop();
|
|
274
|
+
}
|
|
275
|
+
|
|
276
|
+
const results: EvaluateResult[] = [];
|
|
277
|
+
tempResults
|
|
278
|
+
.sort((a, b) => a.index - b.index)
|
|
279
|
+
.forEach(({ index, row }) => {
|
|
280
|
+
results[index] = row;
|
|
281
|
+
});
|
|
282
|
+
|
|
283
|
+
// TODO(ian): Provide full context in table cells, and have the caller
|
|
284
|
+
// construct the table contents itself.
|
|
285
|
+
if (isTest) {
|
|
286
|
+
// Iterate through each combined output
|
|
287
|
+
combinedOutputs.forEach((output, index) => {
|
|
288
|
+
// Create a new array to store the modified output with [PASS] or [FAIL] prepended
|
|
289
|
+
const modifiedOutput: string[] = [];
|
|
290
|
+
|
|
291
|
+
// Iterate through each output value and prepend [PASS] or [FAIL] based on the success status
|
|
292
|
+
output.forEach((o, outputIndex) => {
|
|
293
|
+
const resultIndex = index * prompts.length + outputIndex;
|
|
294
|
+
const result = results[resultIndex];
|
|
295
|
+
// TODO(ian): sometimes output and result.error can be identical (in the case of exception)
|
|
296
|
+
const resultStatus = result.success ? `[PASS] ${o}` : `[FAIL] ${result.error}\n---\n${o}`;
|
|
297
|
+
modifiedOutput.push(resultStatus);
|
|
298
|
+
});
|
|
299
|
+
|
|
300
|
+
// Add the modified output and the corresponding values from varsWithExpectedKeyRemoved to the table
|
|
301
|
+
const tableRow = [...modifiedOutput, ...Object.values(varsWithExpectedKeyRemoved[index])];
|
|
302
|
+
table.push(tableRow);
|
|
303
|
+
});
|
|
304
|
+
} else {
|
|
305
|
+
table.push(
|
|
306
|
+
...combinedOutputs.map((output, index) => [...output, ...Object.values(vars[index])]),
|
|
307
|
+
);
|
|
308
|
+
}
|
|
309
|
+
|
|
310
|
+
return { results, stats: this.stats, table };
|
|
201
311
|
}
|
|
312
|
+
}
|
|
202
313
|
|
|
203
|
-
|
|
314
|
+
export function evaluate(options: EvaluateOptions) {
|
|
315
|
+
const ev = new Evaluator(options);
|
|
316
|
+
return ev.evaluate();
|
|
204
317
|
}
|
package/src/main.ts
CHANGED
|
@@ -10,6 +10,7 @@ import logger, { setLogLevel } from './logger.js';
|
|
|
10
10
|
import { loadApiProvider } from './providers.js';
|
|
11
11
|
import { evaluate } from './evaluator.js';
|
|
12
12
|
import { readPrompts, readVars, writeOutput } from './util.js';
|
|
13
|
+
import { getDirectory } from './esm.js';
|
|
13
14
|
|
|
14
15
|
import type { CommandLineOptions, EvaluateOptions, VarMapping } from './types.js';
|
|
15
16
|
|
|
@@ -36,7 +37,7 @@ These prompts are nunjucks templates, so you can use logic like this:
|
|
|
36
37
|
prompts: ['prompts.txt'],
|
|
37
38
|
providers: ['openai:gpt-3.5-turbo'],
|
|
38
39
|
vars: 'vars.csv',
|
|
39
|
-
maxConcurrency:
|
|
40
|
+
maxConcurrency: 4,
|
|
40
41
|
};`;
|
|
41
42
|
const readme = `To get started, set your OPENAI_API_KEY environment variable. Then run:
|
|
42
43
|
\`\`\`
|
|
@@ -80,6 +81,14 @@ async function main() {
|
|
|
80
81
|
|
|
81
82
|
const program = new Command();
|
|
82
83
|
|
|
84
|
+
program.option('--version', 'Print version', () => {
|
|
85
|
+
const packageJson = JSON.parse(
|
|
86
|
+
readFileSync(pathJoin(getDirectory(), '../package.json'), 'utf8'),
|
|
87
|
+
);
|
|
88
|
+
console.log(packageJson.version);
|
|
89
|
+
process.exit(0);
|
|
90
|
+
});
|
|
91
|
+
|
|
83
92
|
program
|
|
84
93
|
.command('init [directory]')
|
|
85
94
|
.description('Initialize project with dummy files')
|
|
@@ -120,6 +129,7 @@ async function main() {
|
|
|
120
129
|
'Maximum number of concurrent API calls',
|
|
121
130
|
String(defaultConfig.maxConcurrency),
|
|
122
131
|
)
|
|
132
|
+
.option('--grader', 'Model that will grade outputs', defaultConfig.grader)
|
|
123
133
|
.option('--verbose', 'Show debug logs', defaultConfig.verbose)
|
|
124
134
|
.action(async (cmdObj: CommandLineOptions & Command) => {
|
|
125
135
|
if (cmdObj.verbose) {
|
|
@@ -161,6 +171,12 @@ async function main() {
|
|
|
161
171
|
...config,
|
|
162
172
|
};
|
|
163
173
|
|
|
174
|
+
if (cmdObj.grader) {
|
|
175
|
+
options.grading = {
|
|
176
|
+
provider: await loadApiProvider(cmdObj.grader),
|
|
177
|
+
};
|
|
178
|
+
}
|
|
179
|
+
|
|
164
180
|
const summary = await evaluate(options);
|
|
165
181
|
|
|
166
182
|
if (cmdObj.output) {
|
|
@@ -179,10 +195,23 @@ async function main() {
|
|
|
179
195
|
head: ['blue', 'bold'],
|
|
180
196
|
},
|
|
181
197
|
});
|
|
182
|
-
// Skip first row (header) and add the rest. Color
|
|
198
|
+
// Skip first row (header) and add the rest. Color PASS/FAIL
|
|
183
199
|
for (const row of summary.table.slice(1)) {
|
|
184
|
-
|
|
185
|
-
|
|
200
|
+
table.push(
|
|
201
|
+
row.map((col) => {
|
|
202
|
+
if (col.startsWith('[PASS]')) {
|
|
203
|
+
// color '[PASS]' green
|
|
204
|
+
return chalk.green.bold(col.slice(0, 6)) + col.slice(6);
|
|
205
|
+
} else if (col.startsWith('[FAIL]')) {
|
|
206
|
+
// color everything red up until '---'
|
|
207
|
+
return col
|
|
208
|
+
.split('---')
|
|
209
|
+
.map((c, idx) => (idx === 0 ? chalk.red.bold(c) : c))
|
|
210
|
+
.join('---');
|
|
211
|
+
}
|
|
212
|
+
return col;
|
|
213
|
+
}),
|
|
214
|
+
);
|
|
186
215
|
}
|
|
187
216
|
|
|
188
217
|
logger.info('\n' + table.toString());
|
package/src/prompts.ts
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
export const DEFAULT_GRADING_PROMPT = JSON.stringify([
|
|
2
|
+
{
|
|
3
|
+
role: 'system',
|
|
4
|
+
content: `You are grading content according to a user-specified rubric. If the statement in the rubric is true, then the content passes the test. You respond with a JSON object with this structure: {pass: boolean; reason: string;}.
|
|
5
|
+
|
|
6
|
+
Examples:
|
|
7
|
+
|
|
8
|
+
Content: Hello world
|
|
9
|
+
Rubric: Contains a greeting
|
|
10
|
+
{"pass": true, "reason": "the content contains the word 'world'"}
|
|
11
|
+
|
|
12
|
+
Content: Avast ye swabs, repel the invaders!
|
|
13
|
+
Rubric: Does not speak like a pirate
|
|
14
|
+
{"pass": false, "reason": "'avast ye' is a common pirate term"}`,
|
|
15
|
+
},
|
|
16
|
+
{
|
|
17
|
+
role: 'user',
|
|
18
|
+
content: 'Content: {{ content }}\nRubric: {{ rubric }}',
|
|
19
|
+
},
|
|
20
|
+
]);
|