promptfoo 0.6.0 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +137 -74
- package/dist/assertions.d.ts +4 -10
- package/dist/assertions.d.ts.map +1 -1
- package/dist/assertions.js +126 -20
- package/dist/assertions.js.map +1 -1
- package/dist/cache.d.ts +8 -0
- package/dist/cache.d.ts.map +1 -0
- package/dist/cache.js +78 -0
- package/dist/cache.js.map +1 -0
- package/dist/evaluator.d.ts +2 -2
- package/dist/evaluator.d.ts.map +1 -1
- package/dist/evaluator.js +73 -40
- package/dist/evaluator.js.map +1 -1
- package/dist/index.d.ts +6 -4
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +8 -21
- package/dist/index.js.map +1 -1
- package/dist/main.js +92 -80
- package/dist/main.js.map +1 -1
- package/dist/onboarding.d.ts +4 -0
- package/dist/onboarding.d.ts.map +1 -0
- package/dist/onboarding.js +63 -0
- package/dist/onboarding.js.map +1 -0
- package/dist/providers/localai.d.ts.map +1 -1
- package/dist/providers/localai.js +7 -9
- package/dist/providers/localai.js.map +1 -1
- package/dist/providers/openai.d.ts.map +1 -1
- package/dist/providers/openai.js +31 -38
- package/dist/providers/openai.js.map +1 -1
- package/dist/providers.d.ts +1 -0
- package/dist/providers.d.ts.map +1 -1
- package/dist/providers.js +11 -1
- package/dist/providers.js.map +1 -1
- package/dist/types.d.ts +46 -13
- package/dist/types.d.ts.map +1 -1
- package/dist/util.d.ts +6 -3
- package/dist/util.d.ts.map +1 -1
- package/dist/util.js +73 -2
- package/dist/util.js.map +1 -1
- package/dist/web/server.d.ts.map +1 -1
- package/dist/web/server.js +0 -11
- package/dist/web/server.js.map +1 -1
- package/package.json +6 -2
- package/src/assertions.ts +141 -28
- package/src/cache.ts +90 -0
- package/src/evaluator.ts +89 -43
- package/src/index.ts +14 -26
- package/src/main.ts +117 -99
- package/src/onboarding.ts +61 -0
- package/src/providers/localai.ts +9 -11
- package/src/providers/openai.ts +34 -42
- package/src/providers.ts +9 -0
- package/src/types.ts +95 -16
- package/src/util.ts +90 -4
- package/src/web/server.ts +0 -18
package/dist/evaluator.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"evaluator.d.ts","sourceRoot":"","sources":["../src/evaluator.ts"],"names":[],"mappings":"AAUA,OAAO,KAAK,EAEV,eAAe,EAGf,eAAe,
|
|
1
|
+
{"version":3,"file":"evaluator.d.ts","sourceRoot":"","sources":["../src/evaluator.ts"],"names":[],"mappings":"AAUA,OAAO,KAAK,EAEV,eAAe,EAGf,eAAe,EAEf,SAAS,EAGV,MAAM,YAAY,CAAC;AA6TpB,wBAAgB,QAAQ,CAAC,SAAS,EAAE,SAAS,EAAE,OAAO,EAAE,eAAe,4BAGtE"}
|
package/dist/evaluator.js
CHANGED
|
@@ -36,7 +36,8 @@ const assertions_js_1 = require("./assertions.js");
|
|
|
36
36
|
const suggestions_js_1 = require("./suggestions.js");
|
|
37
37
|
const DEFAULT_MAX_CONCURRENCY = 4;
|
|
38
38
|
class Evaluator {
|
|
39
|
-
constructor(options) {
|
|
39
|
+
constructor(testSuite, options) {
|
|
40
|
+
this.testSuite = testSuite;
|
|
40
41
|
this.options = options;
|
|
41
42
|
this.stats = {
|
|
42
43
|
successes: 0,
|
|
@@ -45,11 +46,12 @@ class Evaluator {
|
|
|
45
46
|
total: 0,
|
|
46
47
|
prompt: 0,
|
|
47
48
|
completion: 0,
|
|
49
|
+
cached: 0,
|
|
48
50
|
},
|
|
49
51
|
};
|
|
50
52
|
}
|
|
51
|
-
async runEval({ provider, prompt,
|
|
52
|
-
vars = vars || {};
|
|
53
|
+
async runEval({ provider, prompt, test, includeProviderId, }) {
|
|
54
|
+
const vars = test.vars || {};
|
|
53
55
|
const renderedPrompt = nunjucks_1.default.renderString(prompt, vars);
|
|
54
56
|
// Note that we're using original prompt, not renderedPrompt
|
|
55
57
|
const promptDisplay = includeProviderId ? `[${provider.id()}] ${prompt}` : prompt;
|
|
@@ -71,22 +73,28 @@ class Evaluator {
|
|
|
71
73
|
ret.error = response.error;
|
|
72
74
|
}
|
|
73
75
|
else if (response.output) {
|
|
74
|
-
const checkResult =
|
|
75
|
-
? await (0, assertions_js_1.matchesExpectedValue)(vars.__expected, response.output, this.options)
|
|
76
|
-
: { pass: true };
|
|
76
|
+
const checkResult = await (0, assertions_js_1.runAssertions)(test, response.output);
|
|
77
77
|
if (!checkResult.pass) {
|
|
78
|
-
ret.error = checkResult.reason
|
|
78
|
+
ret.error = checkResult.reason;
|
|
79
79
|
}
|
|
80
80
|
ret.success = checkResult.pass;
|
|
81
|
+
if (checkResult.tokensUsed) {
|
|
82
|
+
this.stats.tokenUsage.total += checkResult.tokensUsed.total;
|
|
83
|
+
this.stats.tokenUsage.prompt += checkResult.tokensUsed.prompt;
|
|
84
|
+
this.stats.tokenUsage.completion += checkResult.tokensUsed.completion;
|
|
85
|
+
}
|
|
81
86
|
}
|
|
82
87
|
else {
|
|
83
88
|
ret.success = false;
|
|
84
89
|
ret.error = 'No output';
|
|
85
90
|
}
|
|
86
91
|
// Update token usage stats
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
92
|
+
if (response.tokenUsage) {
|
|
93
|
+
this.stats.tokenUsage.total += response.tokenUsage.total || 0;
|
|
94
|
+
this.stats.tokenUsage.prompt += response.tokenUsage.prompt || 0;
|
|
95
|
+
this.stats.tokenUsage.completion += response.tokenUsage.completion || 0;
|
|
96
|
+
this.stats.tokenUsage.cached += response.tokenUsage.cached || 0;
|
|
97
|
+
}
|
|
90
98
|
if (ret.success) {
|
|
91
99
|
this.stats.successes++;
|
|
92
100
|
}
|
|
@@ -104,11 +112,12 @@ class Evaluator {
|
|
|
104
112
|
}
|
|
105
113
|
}
|
|
106
114
|
async evaluate() {
|
|
107
|
-
const options = this
|
|
115
|
+
const { testSuite, options } = this;
|
|
108
116
|
const prompts = [];
|
|
109
|
-
if (options.
|
|
117
|
+
if (options.generateSuggestions) {
|
|
118
|
+
// TODO(ian): Move this into its own command/file
|
|
110
119
|
logger_js_1.default.info(`Generating prompt variations...`);
|
|
111
|
-
const { prompts: newPrompts, error } = await (0, suggestions_js_1.generatePrompts)(
|
|
120
|
+
const { prompts: newPrompts, error } = await (0, suggestions_js_1.generatePrompts)(testSuite.prompts[0], 1);
|
|
112
121
|
if (error || !newPrompts) {
|
|
113
122
|
throw new Error(`Failed to generate prompts: ${error}`);
|
|
114
123
|
}
|
|
@@ -127,7 +136,7 @@ class Evaluator {
|
|
|
127
136
|
rl.question(`${chalk_1.default.blue('Do you want to test this prompt?')} (y/N): `, async (answer) => {
|
|
128
137
|
rl.close();
|
|
129
138
|
if (answer.toLowerCase().startsWith('y')) {
|
|
130
|
-
|
|
139
|
+
testSuite.prompts.push(prompt);
|
|
131
140
|
numAdded++;
|
|
132
141
|
}
|
|
133
142
|
else {
|
|
@@ -142,36 +151,51 @@ class Evaluator {
|
|
|
142
151
|
process.exit(1);
|
|
143
152
|
}
|
|
144
153
|
}
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
154
|
+
// Split prompts by provider
|
|
155
|
+
for (const promptContent of testSuite.prompts) {
|
|
156
|
+
for (const provider of testSuite.providers) {
|
|
157
|
+
const display = testSuite.providers.length > 1 ? `[${provider.id()}] ${promptContent}` : promptContent;
|
|
148
158
|
prompts.push({
|
|
149
159
|
raw: promptContent,
|
|
150
160
|
display,
|
|
151
161
|
});
|
|
152
162
|
}
|
|
153
163
|
}
|
|
154
|
-
|
|
155
|
-
const
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
return ret;
|
|
164
|
+
// Aggregate all vars across test cases
|
|
165
|
+
const tests = (testSuite.tests || [
|
|
166
|
+
{
|
|
167
|
+
// Dummy test for cases when we're only comparing raw prompts.
|
|
168
|
+
},
|
|
169
|
+
]).map((test) => {
|
|
170
|
+
const finalTestCase = Object.assign({}, testSuite.defaultTest);
|
|
171
|
+
return Object.assign(finalTestCase, test);
|
|
163
172
|
});
|
|
164
|
-
const
|
|
173
|
+
const varNames = new Set();
|
|
174
|
+
const varsWithSpecialColsRemoved = [];
|
|
175
|
+
for (const testCase of tests) {
|
|
176
|
+
if (testCase.vars) {
|
|
177
|
+
const varWithSpecialColsRemoved = {};
|
|
178
|
+
for (const varName of Object.keys(testCase.vars)) {
|
|
179
|
+
varNames.add(varName);
|
|
180
|
+
varWithSpecialColsRemoved[varName] = testCase.vars[varName];
|
|
181
|
+
}
|
|
182
|
+
varsWithSpecialColsRemoved.push(varWithSpecialColsRemoved);
|
|
183
|
+
}
|
|
184
|
+
}
|
|
185
|
+
// Set up table...
|
|
186
|
+
const isTest = tests.some((t) => !!t.assert);
|
|
165
187
|
const table = {
|
|
166
188
|
head: {
|
|
167
189
|
prompts: prompts.map((p) => p.display),
|
|
168
|
-
vars:
|
|
190
|
+
vars: Array.from(varNames).sort(),
|
|
191
|
+
// TODO(ian): add assertions to table?
|
|
169
192
|
},
|
|
170
193
|
body: [],
|
|
171
194
|
};
|
|
195
|
+
// And progress bar...
|
|
172
196
|
let progressbar;
|
|
173
197
|
if (options.showProgressBar) {
|
|
174
|
-
const totalNumRuns =
|
|
198
|
+
const totalNumRuns = testSuite.prompts.length * testSuite.providers.length * (tests.length || 1);
|
|
175
199
|
const cliProgress = await Promise.resolve().then(() => __importStar(require('cli-progress')));
|
|
176
200
|
progressbar = new cliProgress.SingleBar({
|
|
177
201
|
format: 'Eval: [{bar}] {percentage}% | ETA: {eta}s | {value}/{total} | {provider} "{prompt}" {vars}',
|
|
@@ -182,19 +206,27 @@ class Evaluator {
|
|
|
182
206
|
vars: '',
|
|
183
207
|
});
|
|
184
208
|
}
|
|
209
|
+
// Set up eval cases
|
|
185
210
|
const runEvalOptions = [];
|
|
186
211
|
let rowIndex = 0;
|
|
187
|
-
for (const
|
|
212
|
+
for (const testCase of tests) {
|
|
188
213
|
let colIndex = 0;
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
214
|
+
// Handle default properties
|
|
215
|
+
testCase.vars = Object.assign({}, testSuite.defaultTest?.vars, testCase.vars);
|
|
216
|
+
testCase.assert = [...(testSuite.defaultTest?.assert || []), ...(testCase.assert || [])];
|
|
217
|
+
testCase.options = testCase.options || {};
|
|
218
|
+
testCase.options.provider =
|
|
219
|
+
testCase.options.provider || testSuite.defaultTest?.options?.provider;
|
|
220
|
+
const prependToPrompt = testCase.options?.prefix || testSuite.defaultTest?.options?.prefix || '';
|
|
221
|
+
const appendToPrompt = testCase.options?.suffix || testSuite.defaultTest?.options?.suffix || '';
|
|
222
|
+
// Finalize test case eval
|
|
223
|
+
for (const promptContent of testSuite.prompts) {
|
|
224
|
+
for (const provider of testSuite.providers) {
|
|
193
225
|
runEvalOptions.push({
|
|
194
226
|
provider,
|
|
195
227
|
prompt: prependToPrompt + promptContent + appendToPrompt,
|
|
196
|
-
|
|
197
|
-
includeProviderId:
|
|
228
|
+
test: testCase,
|
|
229
|
+
includeProviderId: testSuite.providers.length > 1,
|
|
198
230
|
rowIndex,
|
|
199
231
|
colIndex,
|
|
200
232
|
});
|
|
@@ -203,6 +235,7 @@ class Evaluator {
|
|
|
203
235
|
}
|
|
204
236
|
rowIndex++;
|
|
205
237
|
}
|
|
238
|
+
// Actually run the eval
|
|
206
239
|
const results = [];
|
|
207
240
|
await async_1.default.forEachOfLimit(runEvalOptions, options.maxConcurrency || DEFAULT_MAX_CONCURRENCY, async (options, index) => {
|
|
208
241
|
const row = await this.runEval(options);
|
|
@@ -211,7 +244,7 @@ class Evaluator {
|
|
|
211
244
|
progressbar.increment({
|
|
212
245
|
provider: options.provider.id(),
|
|
213
246
|
prompt: options.prompt.slice(0, 10),
|
|
214
|
-
vars: Object.entries(options.vars || {})
|
|
247
|
+
vars: Object.entries(options.test.vars || {})
|
|
215
248
|
.map(([k, v]) => `${k}=${v}`)
|
|
216
249
|
.join(' ')
|
|
217
250
|
.slice(0, 10),
|
|
@@ -242,7 +275,7 @@ class Evaluator {
|
|
|
242
275
|
if (!table.body[rowIndex]) {
|
|
243
276
|
table.body[rowIndex] = {
|
|
244
277
|
outputs: [],
|
|
245
|
-
vars:
|
|
278
|
+
vars: table.head.vars.map((varName) => options.test.vars?.[varName] || ''),
|
|
246
279
|
};
|
|
247
280
|
}
|
|
248
281
|
table.body[rowIndex].outputs[colIndex] = resultText;
|
|
@@ -253,8 +286,8 @@ class Evaluator {
|
|
|
253
286
|
return { version: 1, results, stats: this.stats, table };
|
|
254
287
|
}
|
|
255
288
|
}
|
|
256
|
-
function evaluate(options) {
|
|
257
|
-
const ev = new Evaluator(options);
|
|
289
|
+
function evaluate(testSuite, options) {
|
|
290
|
+
const ev = new Evaluator(testSuite, options);
|
|
258
291
|
return ev.evaluate();
|
|
259
292
|
}
|
|
260
293
|
exports.evaluate = evaluate;
|
package/dist/evaluator.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"evaluator.js","sourceRoot":"","sources":["../src/evaluator.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,kEAAqC;AAErC,kDAA0B;AAC1B,kDAA0B;AAC1B,wDAAgC;AAEhC,4DAAiC;AACjC,
|
|
1
|
+
{"version":3,"file":"evaluator.js","sourceRoot":"","sources":["../src/evaluator.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,kEAAqC;AAErC,kDAA0B;AAC1B,kDAA0B;AAC1B,wDAAgC;AAEhC,4DAAiC;AACjC,mDAAgD;AAchD,qDAAmD;AAcnD,MAAM,uBAAuB,GAAG,CAAC,CAAC;AAElC,MAAM,SAAS;IAKb,YAAY,SAAoB,EAAE,OAAwB;QACxD,IAAI,CAAC,SAAS,GAAG,SAAS,CAAC;QAC3B,IAAI,CAAC,OAAO,GAAG,OAAO,CAAC;QACvB,IAAI,CAAC,KAAK,GAAG;YACX,SAAS,EAAE,CAAC;YACZ,QAAQ,EAAE,CAAC;YACX,UAAU,EAAE;gBACV,KAAK,EAAE,CAAC;gBACR,MAAM,EAAE,CAAC;gBACT,UAAU,EAAE,CAAC;gBACb,MAAM,EAAE,CAAC;aACV;SACF,CAAC;IACJ,CAAC;IAED,KAAK,CAAC,OAAO,CAAC,EACZ,QAAQ,EACR,MAAM,EACN,IAAI,EACJ,iBAAiB,GACF;QACf,MAAM,IAAI,GAAG,IAAI,CAAC,IAAI,IAAI,EAAE,CAAC;QAC7B,MAAM,cAAc,GAAG,kBAAQ,CAAC,YAAY,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;QAE3D,4DAA4D;QAC5D,MAAM,aAAa,GAAG,iBAAiB,CAAC,CAAC,CAAC,IAAI,QAAQ,CAAC,EAAE,EAAE,KAAK,MAAM,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC;QAElF,MAAM,KAAK,GAAG;YACZ,MAAM,EAAE;gBACN,GAAG,EAAE,cAAc;gBACnB,OAAO,EAAE,aAAa;aACvB;YACD,IAAI;SACL,CAAC;QAEF,IAAI;YACF,MAAM,QAAQ,GAAG,MAAM,QAAQ,CAAC,OAAO,CAAC,cAAc,CAAC,CAAC;YACxD,MAAM,GAAG,GAAmB;gBAC1B,GAAG,KAAK;gBACR,QAAQ;gBACR,OAAO,EAAE,KAAK;aACf,CAAC;YACF,IAAI,QAAQ,CAAC,KAAK,EAAE;gBAClB,GAAG,CAAC,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC;aAC5B;iBAAM,IAAI,QAAQ,CAAC,MAAM,EAAE;gBAC1B,MAAM,WAAW,GAAG,MAAM,IAAA,6BAAa,EAAC,IAAI,EAAE,QAAQ,CAAC,MAAM,CAAC,CAAC;gBAC/D,IAAI,CAAC,WAAW,CAAC,IAAI,EAAE;oBACrB,GAAG,CAAC,KAAK,GAAG,WAAW,CAAC,MAAM,CAAC;iBAChC;gBACD,GAAG,CAAC,OAAO,GAAG,WAAW,CAAC,IAAI,CAAC;gBAC/B,IAAI,WAAW,CAAC,UAAU,EAAE;oBAC1B,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,KAAK,IAAI,WAAW,CAAC,UAAU,CAAC,KAAK,CAAC;oBAC5D,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,MAAM,IAAI,WAAW,CAAC,UAAU,CAAC,MAAM,CAAC;oBAC9D,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,UAAU,IAAI,WAAW,CAAC,UAAU,CAAC,UAAU,CAAC;iBACvE;aACF;iBAAM;gBACL,GAAG,CAAC,OAAO,GAAG,KAAK,CAAC;gBACpB,GAAG,CAAC,KAAK,GAAG,WAAW,CAAC;aACzB;YAED,2BAA2B;YAC3B,IAAI,QAAQ,CAAC,UAAU,EAAE;gBACvB,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,KAAK,IAAI,QAAQ,CAAC,UAAU,CAAC,KAAK,IAAI,CAAC,CAAC;gBAC9D,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,MAAM,IAAI,QAAQ,CAAC,UAAU,CAAC,MAAM,IAAI,CAAC,CAAC;gBAChE,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,UAAU,IAAI,QAAQ,CAAC,UAAU,CAAC,UAAU,IAAI,CAAC,CAAC;gBACxE,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,MAAM,IAAI,QAAQ,CAAC,UAAU,CAAC,MAAM,IAAI,CAAC,CAAC;aACjE;YAED,IAAI,GAAG,CAAC,OAAO,EAAE;gBACf,IAAI,CAAC,KAAK,CAAC,SAAS,EAAE,CAAC;aACxB;iBAAM;gBACL,IAAI,CAAC,KAAK,CAAC,QAAQ,EAAE,CAAC;aACvB;YAED,OAAO,GAAG,CAAC;SACZ;QAAC,OAAO,GAAG,EAAE;YACZ,OAAO;gBACL,GAAG,KAAK;gBACR,KAAK,EAAE,MAAM,CAAC,GAAG,CAAC;gBAClB,OAAO,EAAE,KAAK;aACf,CAAC;SACH;IACH,CAAC;IAED,KAAK,CAAC,QAAQ;QACZ,MAAM,EAAE,SAAS,EAAE,OAAO,EAAE,GAAG,IAAI,CAAC;QACpC,MAAM,OAAO,GAAa,EAAE,CAAC;QAE7B,IAAI,OAAO,CAAC,mBAAmB,EAAE;YAC/B,iDAAiD;YACjD,mBAAM,CAAC,IAAI,CAAC,iCAAiC,CAAC,CAAC;YAC/C,MAAM,EAAE,OAAO,EAAE,UAAU,EAAE,KAAK,EAAE,GAAG,MAAM,IAAA,gCAAe,EAAC,SAAS,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;YACtF,IAAI,KAAK,IAAI,CAAC,UAAU,EAAE;gBACxB,MAAM,IAAI,KAAK,CAAC,+BAA+B,KAAK,EAAE,CAAC,CAAC;aACzD;YAED,mBAAM,CAAC,IAAI,CAAC,eAAK,CAAC,IAAI,CAAC,oBAAoB,CAAC,CAAC,CAAC;YAC9C,IAAI,QAAQ,GAAG,CAAC,CAAC;YACjB,KAAK,MAAM,MAAM,IAAI,UAAU,EAAE;gBAC/B,mBAAM,CAAC,IAAI,CAAC,0DAA0D,CAAC,CAAC;gBACxE,mBAAM,CAAC,IAAI,CAAC,GAAG,MAAM,EAAE,CAAC,CAAC;gBACzB,mBAAM,CAAC,IAAI,CAAC,0DAA0D,CAAC,CAAC;gBAExE,wCAAwC;gBACxC,MAAM,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,EAAE;oBAC5B,MAAM,EAAE,GAAG,uBAAQ,CAAC,eAAe,CAAC;wBAClC,KAAK,EAAE,OAAO,CAAC,KAAK;wBACpB,MAAM,EAAE,OAAO,CAAC,MAAM;qBACvB,CAAC,CAAC;oBACH,EAAE,CAAC,QAAQ,CACT,GAAG,eAAK,CAAC,IAAI,CAAC,kCAAkC,CAAC,UAAU,EAC3D,KAAK,EAAE,MAAM,EAAE,EAAE;wBACf,EAAE,CAAC,KAAK,EAAE,CAAC;wBACX,IAAI,MAAM,CAAC,WAAW,EAAE,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE;4BACxC,SAAS,CAAC,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;4BAC/B,QAAQ,EAAE,CAAC;yBACZ;6BAAM;4BACL,mBAAM,CAAC,IAAI,CAAC,uBAAuB,CAAC,CAAC;yBACtC;wBACD,OAAO,CAAC,IAAI,CAAC,CAAC;oBAChB,CAAC,CACF,CAAC;gBACJ,CAAC,CAAC,CAAC;aACJ;YAED,IAAI,QAAQ,GAAG,CAAC,EAAE;gBAChB,mBAAM,CAAC,IAAI,CAAC,eAAK,CAAC,GAAG,CAAC,gCAAgC,CAAC,CAAC,CAAC;gBACzD,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;aACjB;SACF;QAED,4BAA4B;QAC5B,KAAK,MAAM,aAAa,IAAI,SAAS,CAAC,OAAO,EAAE;YAC7C,KAAK,MAAM,QAAQ,IAAI,SAAS,CAAC,SAAS,EAAE;gBAC1C,MAAM,OAAO,GACX,SAAS,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,QAAQ,CAAC,EAAE,EAAE,KAAK,aAAa,EAAE,CAAC,CAAC,CAAC,aAAa,CAAC;gBACzF,OAAO,CAAC,IAAI,CAAC;oBACX,GAAG,EAAE,aAAa;oBAClB,OAAO;iBACR,CAAC,CAAC;aACJ;SACF;QAED,uCAAuC;QAEvC,MAAM,KAAK,GAAG,CACZ,SAAS,CAAC,KAAK,IAAI;YACjB;YACE,8DAA8D;aAC/D;SACF,CACF,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE;YACb,MAAM,aAAa,GAAa,MAAM,CAAC,MAAM,CAAC,EAAE,EAAE,SAAS,CAAC,WAAW,CAAC,CAAC;YACzE,OAAO,MAAM,CAAC,MAAM,CAAC,aAAa,EAAE,IAAI,CAAC,CAAC;QAC5C,CAAC,CAAC,CAAC;QAEH,MAAM,QAAQ,GAAgB,IAAI,GAAG,EAAE,CAAC;QACxC,MAAM,0BAA0B,GAA6B,EAAE,CAAC;QAChE,KAAK,MAAM,QAAQ,IAAI,KAAK,EAAE;YAC5B,IAAI,QAAQ,CAAC,IAAI,EAAE;gBACjB,MAAM,yBAAyB,GAA2B,EAAE,CAAC;gBAC7D,KAAK,MAAM,OAAO,IAAI,MAAM,CAAC,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE;oBAChD,QAAQ,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;oBACtB,yBAAyB,CAAC,OAAO,CAAC,GAAG,QAAQ,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;iBAC7D;gBACD,0BAA0B,CAAC,IAAI,CAAC,yBAAyB,CAAC,CAAC;aAC5D;SACF;QAED,kBAAkB;QAClB,MAAM,MAAM,GAAG,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;QAE7C,MAAM,KAAK,GAAkB;YAC3B,IAAI,EAAE;gBACJ,OAAO,EAAE,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC;gBACtC,IAAI,EAAE,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,IAAI,EAAE;gBACjC,sCAAsC;aACvC;YACD,IAAI,EAAE,EAAE;SACT,CAAC;QAEF,sBAAsB;QACtB,IAAI,WAAkC,CAAC;QACvC,IAAI,OAAO,CAAC,eAAe,EAAE;YAC3B,MAAM,YAAY,GAChB,SAAS,CAAC,OAAO,CAAC,MAAM,GAAG,SAAS,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC,KAAK,CAAC,MAAM,IAAI,CAAC,CAAC,CAAC;YAC9E,MAAM,WAAW,GAAG,wDAAa,cAAc,GAAC,CAAC;YACjD,WAAW,GAAG,IAAI,WAAW,CAAC,SAAS,CACrC;gBACE,MAAM,EACJ,4FAA4F;aAC/F,EACD,WAAW,CAAC,OAAO,CAAC,cAAc,CACnC,CAAC;YACF,WAAW,CAAC,KAAK,CAAC,YAAY,EAAE,CAAC,EAAE;gBACjC,QAAQ,EAAE,EAAE;gBACZ,MAAM,EAAE,EAAE;gBACV,IAAI,EAAE,EAAE;aACT,CAAC,CAAC;SACJ;QAED,oBAAoB;QACpB,MAAM,cAAc,GAAqB,EAAE,CAAC;QAC5C,IAAI,QAAQ,GAAG,CAAC,CAAC;QACjB,KAAK,MAAM,QAAQ,IAAI,KAAK,EAAE;YAC5B,IAAI,QAAQ,GAAG,CAAC,CAAC;YAEjB,4BAA4B;YAC5B,QAAQ,CAAC,IAAI,GAAG,MAAM,CAAC,MAAM,CAAC,EAAE,EAAE,SAAS,CAAC,WAAW,EAAE,IAAI,EAAE,QAAQ,CAAC,IAAI,CAAC,CAAC;YAC9E,QAAQ,CAAC,MAAM,GAAG,CAAC,GAAG,CAAC,SAAS,CAAC,WAAW,EAAE,MAAM,IAAI,EAAE,CAAC,EAAE,GAAG,CAAC,QAAQ,CAAC,MAAM,IAAI,EAAE,CAAC,CAAC,CAAC;YACzF,QAAQ,CAAC,OAAO,GAAG,QAAQ,CAAC,OAAO,IAAI,EAAE,CAAC;YAC1C,QAAQ,CAAC,OAAO,CAAC,QAAQ;gBACvB,QAAQ,CAAC,OAAO,CAAC,QAAQ,IAAI,SAAS,CAAC,WAAW,EAAE,OAAO,EAAE,QAAQ,CAAC;YACxE,MAAM,eAAe,GACnB,QAAQ,CAAC,OAAO,EAAE,MAAM,IAAI,SAAS,CAAC,WAAW,EAAE,OAAO,EAAE,MAAM,IAAI,EAAE,CAAC;YAC3E,MAAM,cAAc,GAClB,QAAQ,CAAC,OAAO,EAAE,MAAM,IAAI,SAAS,CAAC,WAAW,EAAE,OAAO,EAAE,MAAM,IAAI,EAAE,CAAC;YAE3E,0BAA0B;YAC1B,KAAK,MAAM,aAAa,IAAI,SAAS,CAAC,OAAO,EAAE;gBAC7C,KAAK,MAAM,QAAQ,IAAI,SAAS,CAAC,SAAS,EAAE;oBAC1C,cAAc,CAAC,IAAI,CAAC;wBAClB,QAAQ;wBACR,MAAM,EAAE,eAAe,GAAG,aAAa,GAAG,cAAc;wBACxD,IAAI,EAAE,QAAQ;wBACd,iBAAiB,EAAE,SAAS,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC;wBACjD,QAAQ;wBACR,QAAQ;qBACT,CAAC,CAAC;oBACH,QAAQ,EAAE,CAAC;iBACZ;aACF;YACD,QAAQ,EAAE,CAAC;SACZ;QAED,wBAAwB;QACxB,MAAM,OAAO,GAAqB,EAAE,CAAC;QACrC,MAAM,eAAK,CAAC,cAAc,CACxB,cAAc,EACd,OAAO,CAAC,cAAc,IAAI,uBAAuB,EACjD,KAAK,EAAE,OAAuB,EAAE,KAAsB,EAAE,EAAE;YACxD,MAAM,GAAG,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;YAExC,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;YAElB,IAAI,WAAW,EAAE;gBACf,WAAW,CAAC,SAAS,CAAC;oBACpB,QAAQ,EAAE,OAAO,CAAC,QAAQ,CAAC,EAAE,EAAE;oBAC/B,MAAM,EAAE,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC;oBACnC,IAAI,EAAE,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,IAAI,CAAC,IAAI,IAAI,EAAE,CAAC;yBAC1C,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC;yBAC5B,IAAI,CAAC,GAAG,CAAC;yBACT,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC;iBAChB,CAAC,CAAC;aACJ;YAED,wBAAwB;YACxB,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE;gBAC7B,MAAM,IAAI,KAAK,CAAC,+BAA+B,CAAC,CAAC;aAClD;YAED,IAAI,UAA8B,CAAC;YACnC,IAAI,MAAM,EAAE;gBACV,IAAI,GAAG,CAAC,OAAO,EAAE;oBACf,UAAU,GAAG,UAAU,GAAG,CAAC,QAAQ,EAAE,MAAM,IAAI,GAAG,CAAC,KAAK,IAAI,EAAE,EAAE,CAAC;iBAClE;qBAAM;oBACL,UAAU,GAAG,UAAU,GAAG,CAAC,KAAK,UAAU,GAAG,CAAC,QAAQ,EAAE,MAAM,IAAI,GAAG,CAAC,KAAK,IAAI,EAAE,EAAE,CAAC;iBACrF;aACF;iBAAM,IAAI,GAAG,CAAC,KAAK,EAAE;gBACpB,UAAU,GAAG,UAAU,GAAG,CAAC,KAAK,EAAE,CAAC;aACpC;iBAAM;gBACL,UAAU,GAAG,GAAG,CAAC,QAAQ,EAAE,MAAM,IAAI,GAAG,CAAC,KAAK,IAAI,EAAE,CAAC;aACtD;YAED,sEAAsE;YACtE,uCAAuC;YACvC,MAAM,EAAE,QAAQ,EAAE,QAAQ,EAAE,GAAG,OAAO,CAAC;YACvC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,EAAE;gBACzB,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,GAAG;oBACrB,OAAO,EAAE,EAAE;oBACX,IAAI,EAAE,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,OAAO,EAAE,EAAE,CAAC,OAAO,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,OAAO,CAAC,IAAI,EAAE,CAAC;iBAC3E,CAAC;aACH;YACD,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,OAAO,CAAC,QAAQ,CAAC,GAAG,UAAU,CAAC;QACtD,CAAC,CACF,CAAC;QAEF,IAAI,WAAW,EAAE;YACf,WAAW,CAAC,IAAI,EAAE,CAAC;SACpB;QAED,OAAO,EAAE,OAAO,EAAE,CAAC,EAAE,OAAO,EAAE,KAAK,EAAE,IAAI,CAAC,KAAK,EAAE,KAAK,EAAE,CAAC;IAC3D,CAAC;CACF;AAED,SAAgB,QAAQ,CAAC,SAAoB,EAAE,OAAwB;IACrE,MAAM,EAAE,GAAG,IAAI,SAAS,CAAC,SAAS,EAAE,OAAO,CAAC,CAAC;IAC7C,OAAO,EAAE,CAAC,QAAQ,EAAE,CAAC;AACvB,CAAC;AAHD,4BAGC"}
|
package/dist/index.d.ts
CHANGED
|
@@ -1,7 +1,9 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import type { ApiProvider, EvaluateOptions, EvaluateSummary } from './types.js';
|
|
1
|
+
import type { EvaluateOptions, TestSuiteConfig } from './types.js';
|
|
3
2
|
export * from './types.js';
|
|
4
|
-
|
|
3
|
+
interface EvaluateTestSuite extends TestSuiteConfig {
|
|
4
|
+
prompts: string[];
|
|
5
|
+
}
|
|
6
|
+
declare function evaluate(testSuite: EvaluateTestSuite, options?: EvaluateOptions): Promise<import("./types.js").EvaluateSummary>;
|
|
5
7
|
declare const _default: {
|
|
6
8
|
evaluate: typeof evaluate;
|
|
7
9
|
assertions: {
|
|
@@ -13,7 +15,7 @@ declare const _default: {
|
|
|
13
15
|
OpenAiChatCompletionProvider: typeof import("./providers/openai.js").OpenAiChatCompletionProvider;
|
|
14
16
|
LocalAiCompletionProvider: typeof import("./providers/localai.js").LocalAiCompletionProvider;
|
|
15
17
|
LocalAiChatProvider: typeof import("./providers/localai.js").LocalAiChatProvider;
|
|
16
|
-
loadApiProvider: typeof loadApiProvider;
|
|
18
|
+
loadApiProvider: typeof import("./providers.js").loadApiProvider;
|
|
17
19
|
};
|
|
18
20
|
};
|
|
19
21
|
export default _default;
|
package/dist/index.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAKA,OAAO,KAAK,EAAE,eAAe,EAAa,eAAe,EAAE,MAAM,YAAY,CAAC;AAG9E,cAAc,YAAY,CAAC;AAE3B,UAAU,iBAAkB,SAAQ,eAAe;IACjD,OAAO,EAAE,MAAM,EAAE,CAAC;CACnB;AAED,iBAAe,QAAQ,CAAC,SAAS,EAAE,iBAAiB,EAAE,OAAO,GAAE,eAAoB,iDAQlF;;;;;;;;;;;;;;;AAQD,wBAIE"}
|
package/dist/index.js
CHANGED
|
@@ -21,29 +21,16 @@ const evaluator_js_1 = require("./evaluator.js");
|
|
|
21
21
|
const providers_js_1 = require("./providers.js");
|
|
22
22
|
const assertions_js_1 = __importDefault(require("./assertions.js"));
|
|
23
23
|
const providers_js_2 = __importDefault(require("./providers.js"));
|
|
24
|
+
const util_js_1 = require("./util.js");
|
|
24
25
|
__exportStar(require("./types.js"), exports);
|
|
25
|
-
async function evaluate(
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
else {
|
|
32
|
-
apiProviders.push(provider);
|
|
33
|
-
}
|
|
26
|
+
async function evaluate(testSuite, options = {}) {
|
|
27
|
+
const constructedTestSuite = {
|
|
28
|
+
...testSuite,
|
|
29
|
+
prompts: testSuite.prompts,
|
|
30
|
+
providers: await (0, providers_js_1.loadApiProviders)(testSuite.providers),
|
|
31
|
+
tests: (0, util_js_1.readTests)(testSuite.tests),
|
|
34
32
|
};
|
|
35
|
-
|
|
36
|
-
for (const provider of providers) {
|
|
37
|
-
await addProvider(provider);
|
|
38
|
-
}
|
|
39
|
-
}
|
|
40
|
-
else {
|
|
41
|
-
await addProvider(providers);
|
|
42
|
-
}
|
|
43
|
-
return (0, evaluator_js_1.evaluate)({
|
|
44
|
-
...options,
|
|
45
|
-
providers: apiProviders,
|
|
46
|
-
});
|
|
33
|
+
return (0, evaluator_js_1.evaluate)(constructedTestSuite, options);
|
|
47
34
|
}
|
|
48
35
|
module.exports = {
|
|
49
36
|
evaluate,
|
package/dist/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;AAAA,iDAAwD;AACxD,
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;AAAA,iDAAwD;AACxD,iDAAkD;AAClD,oEAAyC;AACzC,kEAAuC;AAGvC,uCAAsC;AAEtC,6CAA2B;AAM3B,KAAK,UAAU,QAAQ,CAAC,SAA4B,EAAE,UAA2B,EAAE;IACjF,MAAM,oBAAoB,GAAc;QACtC,GAAG,SAAS;QACZ,OAAO,EAAE,SAAS,CAAC,OAAO;QAC1B,SAAS,EAAE,MAAM,IAAA,+BAAgB,EAAC,SAAS,CAAC,SAAS,CAAC;QACtD,KAAK,EAAE,IAAA,mBAAS,EAAC,SAAS,CAAC,KAAK,CAAC;KAClC,CAAC;IACF,OAAO,IAAA,uBAAU,EAAC,oBAAoB,EAAE,OAAO,CAAC,CAAC;AACnD,CAAC;AAED,MAAM,CAAC,OAAO,GAAG;IACf,QAAQ;IACR,UAAU,EAAV,uBAAU;IACV,SAAS,EAAT,sBAAS;CACV,CAAC;AAEF,kBAAe;IACb,QAAQ;IACR,UAAU,EAAV,uBAAU;IACV,SAAS,EAAT,sBAAS;CACV,CAAC"}
|
package/dist/main.js
CHANGED
|
@@ -38,6 +38,8 @@ const evaluator_js_1 = require("./evaluator.js");
|
|
|
38
38
|
const util_js_1 = require("./util.js");
|
|
39
39
|
const esm_js_1 = require("./esm.js");
|
|
40
40
|
const server_js_1 = require("./web/server.js");
|
|
41
|
+
const cache_js_1 = require("./cache.js");
|
|
42
|
+
const onboarding_js_1 = require("./onboarding.js");
|
|
41
43
|
function createDummyFiles(directory) {
|
|
42
44
|
if (directory) {
|
|
43
45
|
// Make the directory if it doesn't exist
|
|
@@ -45,30 +47,6 @@ function createDummyFiles(directory) {
|
|
|
45
47
|
(0, fs_1.mkdirSync)(directory);
|
|
46
48
|
}
|
|
47
49
|
}
|
|
48
|
-
const dummyPrompts = `Your first prompt goes here
|
|
49
|
-
---
|
|
50
|
-
Next prompt goes here. You can substitute variables like this: {{var1}} {{var2}} {{var3}}
|
|
51
|
-
---
|
|
52
|
-
This is the next prompt.
|
|
53
|
-
|
|
54
|
-
These prompts are nunjucks templates, so you can use logic like this:
|
|
55
|
-
{% if var1 %}
|
|
56
|
-
{{ var1 }}
|
|
57
|
-
{% endif %}`;
|
|
58
|
-
const dummyVars = 'var1,var2,var3\nvalue1,value2,value3\nanother value1,another value2,another value3';
|
|
59
|
-
const dummyConfig = `module.exports = {
|
|
60
|
-
prompts: ['prompts.txt'],
|
|
61
|
-
providers: ['openai:gpt-3.5-turbo'],
|
|
62
|
-
vars: 'vars.csv',
|
|
63
|
-
maxConcurrency: 4,
|
|
64
|
-
};`;
|
|
65
|
-
const readme = `To get started, set your OPENAI_API_KEY environment variable. Then run:
|
|
66
|
-
\`\`\`
|
|
67
|
-
promptfoo eval
|
|
68
|
-
\`\`\`
|
|
69
|
-
|
|
70
|
-
You'll probably want to change a few of the prompts in prompts.txt and the variables in vars.csv before letting it rip.
|
|
71
|
-
`;
|
|
72
50
|
if (directory) {
|
|
73
51
|
if (!(0, fs_1.existsSync)(directory)) {
|
|
74
52
|
logger_js_1.default.info(`Creating directory ${directory} ...`);
|
|
@@ -78,10 +56,9 @@ You'll probably want to change a few of the prompts in prompts.txt and the varia
|
|
|
78
56
|
else {
|
|
79
57
|
directory = '.';
|
|
80
58
|
}
|
|
81
|
-
(0, fs_1.writeFileSync)((0, path_1.join)(process.cwd(), directory, 'prompts.txt'),
|
|
82
|
-
(0, fs_1.writeFileSync)((0, path_1.join)(process.cwd(), directory, '
|
|
83
|
-
(0, fs_1.writeFileSync)((0, path_1.join)(process.cwd(), directory, '
|
|
84
|
-
(0, fs_1.writeFileSync)((0, path_1.join)(process.cwd(), directory, 'README.md'), readme);
|
|
59
|
+
(0, fs_1.writeFileSync)((0, path_1.join)(process.cwd(), directory, 'prompts.txt'), onboarding_js_1.DEFAULT_PROMPTS);
|
|
60
|
+
(0, fs_1.writeFileSync)((0, path_1.join)(process.cwd(), directory, 'promptfooconfig.yaml'), onboarding_js_1.DEFAULT_YAML_CONFIG);
|
|
61
|
+
(0, fs_1.writeFileSync)((0, path_1.join)(process.cwd(), directory, 'README.md'), onboarding_js_1.DEFAULT_README);
|
|
85
62
|
if (directory === '.') {
|
|
86
63
|
logger_js_1.default.info('Wrote prompts.txt, vars.csv, and promptfooconfig.js. Open README.md to get started!');
|
|
87
64
|
}
|
|
@@ -91,15 +68,25 @@ You'll probably want to change a few of the prompts in prompts.txt and the varia
|
|
|
91
68
|
}
|
|
92
69
|
}
|
|
93
70
|
async function main() {
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
71
|
+
const pwd = process.cwd();
|
|
72
|
+
const potentialPaths = [
|
|
73
|
+
(0, path_1.join)(pwd, 'promptfooconfig.js'),
|
|
74
|
+
(0, path_1.join)(pwd, 'promptfooconfig.json'),
|
|
75
|
+
(0, path_1.join)(pwd, 'promptfooconfig.yaml'),
|
|
76
|
+
];
|
|
77
|
+
let config = {};
|
|
78
|
+
for (const path of potentialPaths) {
|
|
79
|
+
const maybeConfig = (0, util_js_1.maybeReadConfig)(path);
|
|
80
|
+
if (maybeConfig) {
|
|
81
|
+
config = maybeConfig;
|
|
82
|
+
break;
|
|
83
|
+
}
|
|
99
84
|
}
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
85
|
+
let evaluateOptions = {};
|
|
86
|
+
if (config.evaluateOptions) {
|
|
87
|
+
evaluateOptions.generateSuggestions = config.evaluateOptions.generateSuggestions;
|
|
88
|
+
evaluateOptions.maxConcurrency = config.evaluateOptions.maxConcurrency;
|
|
89
|
+
evaluateOptions.showProgressBar = config.evaluateOptions.showProgressBar;
|
|
103
90
|
}
|
|
104
91
|
const program = new commander_1.Command();
|
|
105
92
|
program.option('--version', 'Print version', () => {
|
|
@@ -123,67 +110,92 @@ async function main() {
|
|
|
123
110
|
program
|
|
124
111
|
.command('eval')
|
|
125
112
|
.description('Evaluate prompts')
|
|
126
|
-
.requiredOption('-p, --prompts <paths...>', 'Paths to prompt files (.txt)',
|
|
127
|
-
.requiredOption('-r, --providers <name or path...>', 'One of: openai:chat, openai:completion, openai:<model name>, or path to custom API caller module',
|
|
128
|
-
.option('-
|
|
129
|
-
.option(
|
|
130
|
-
|
|
131
|
-
|
|
113
|
+
.requiredOption('-p, --prompts <paths...>', 'Paths to prompt files (.txt)', config.prompts)
|
|
114
|
+
.requiredOption('-r, --providers <name or path...>', 'One of: openai:chat, openai:completion, openai:<model name>, or path to custom API caller module', config?.providers)
|
|
115
|
+
.option('-c, --config <path>', 'Path to configuration file. Automatically loads promptfooconfig.js/json/yaml')
|
|
116
|
+
.option(
|
|
117
|
+
// TODO(ian): Remove `vars` for v1
|
|
118
|
+
'-v, --vars, -t, --tests <path>', 'Path to CSV with test cases', config?.commandLineOptions?.vars)
|
|
119
|
+
.option('-o, --output <path>', 'Path to output file (csv, json, yaml, html)', config.outputPath)
|
|
120
|
+
.option('-j, --max-concurrency <number>', 'Maximum number of concurrent API calls', config.evaluateOptions?.maxConcurrency
|
|
121
|
+
? String(config.evaluateOptions.maxConcurrency)
|
|
122
|
+
: undefined)
|
|
132
123
|
.option('--table-cell-max-length <number>', 'Truncate console table cells to this length', '250')
|
|
133
124
|
.option('--suggest-prompts <number>', 'Generate N new prompts and append them to the prompt list')
|
|
134
|
-
.option('--prompt-prefix <path>', 'This prefix is prepended to every prompt',
|
|
135
|
-
.option('--prompt-suffix <path>', 'This suffix is append to every prompt',
|
|
125
|
+
.option('--prompt-prefix <path>', 'This prefix is prepended to every prompt', config.defaultTest?.options?.prefix)
|
|
126
|
+
.option('--prompt-suffix <path>', 'This suffix is append to every prompt', config.defaultTest?.options?.suffix)
|
|
136
127
|
.option('--no-write', 'Do not write results to promptfoo directory')
|
|
137
|
-
.option('--
|
|
138
|
-
.option('--
|
|
128
|
+
.option('--no-cache', 'Do not read or write results to disk cache')
|
|
129
|
+
.option('--grader', 'Model that will grade outputs', config?.commandLineOptions?.grader)
|
|
130
|
+
.option('--verbose', 'Show debug logs', config?.commandLineOptions?.verbose)
|
|
139
131
|
.option('--view [port]', 'View in browser ui')
|
|
140
132
|
.action(async (cmdObj) => {
|
|
133
|
+
// Misc settings
|
|
141
134
|
if (cmdObj.verbose) {
|
|
142
135
|
(0, logger_js_1.setLogLevel)('debug');
|
|
143
136
|
}
|
|
137
|
+
if (!cmdObj.cache) {
|
|
138
|
+
(0, cache_js_1.disableCache)();
|
|
139
|
+
}
|
|
140
|
+
// Config parsing
|
|
141
|
+
const maxConcurrency = parseInt(cmdObj.maxConcurrency || '', 10);
|
|
144
142
|
const configPath = cmdObj.config;
|
|
145
|
-
let config = {};
|
|
146
143
|
if (configPath) {
|
|
147
|
-
|
|
148
|
-
switch (ext) {
|
|
149
|
-
case '.json':
|
|
150
|
-
const content = (0, fs_1.readFileSync)(configPath, 'utf-8');
|
|
151
|
-
config = JSON.parse(content);
|
|
152
|
-
break;
|
|
153
|
-
case '.js':
|
|
154
|
-
config = require(configPath);
|
|
155
|
-
break;
|
|
156
|
-
default:
|
|
157
|
-
throw new Error(`Unsupported configuration file format: ${ext}`);
|
|
158
|
-
}
|
|
144
|
+
config = (0, util_js_1.readConfig)(configPath);
|
|
159
145
|
}
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
146
|
+
else {
|
|
147
|
+
config = {
|
|
148
|
+
prompts: cmdObj.prompts || config.prompts,
|
|
149
|
+
providers: cmdObj.providers || config.providers,
|
|
150
|
+
tests: cmdObj.vars || config.tests,
|
|
151
|
+
};
|
|
163
152
|
}
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
153
|
+
// Validation
|
|
154
|
+
if (!config.prompts || config.prompts.length === 0) {
|
|
155
|
+
logger_js_1.default.error(chalk_1.default.red('You must provide at least 1 prompt file'));
|
|
156
|
+
process.exit(1);
|
|
157
|
+
}
|
|
158
|
+
if (!config.providers || config.providers.length === 0) {
|
|
159
|
+
logger_js_1.default.error(chalk_1.default.red('You must specify at least 1 provider (for example, openai:gpt-3.5-turbo)'));
|
|
160
|
+
process.exit(1);
|
|
161
|
+
}
|
|
162
|
+
// Parse prompts, providers, and tests
|
|
163
|
+
const parsedPrompts = (0, util_js_1.readPrompts)(config.prompts);
|
|
164
|
+
const parsedProviders = await (0, providers_js_1.loadApiProviders)(config.providers);
|
|
165
|
+
const parsedTests = (0, util_js_1.readTests)(config.tests);
|
|
166
|
+
if (parsedPrompts.length === 0) {
|
|
167
|
+
logger_js_1.default.error(chalk_1.default.red('No prompts found'));
|
|
168
|
+
process.exit(1);
|
|
169
|
+
}
|
|
170
|
+
const defaultTest = {
|
|
171
|
+
options: {
|
|
173
172
|
prefix: cmdObj.promptPrefix,
|
|
174
173
|
suffix: cmdObj.promptSuffix,
|
|
174
|
+
provider: cmdObj.grader,
|
|
175
|
+
// rubricPrompt:
|
|
175
176
|
},
|
|
176
|
-
...config,
|
|
177
|
+
...config.defaultTest,
|
|
177
178
|
};
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
179
|
+
const testSuite = {
|
|
180
|
+
description: config.description,
|
|
181
|
+
prompts: parsedPrompts,
|
|
182
|
+
providers: parsedProviders,
|
|
183
|
+
tests: parsedTests,
|
|
184
|
+
defaultTest,
|
|
185
|
+
};
|
|
186
|
+
const options = {
|
|
187
|
+
showProgressBar: true,
|
|
188
|
+
maxConcurrency: !isNaN(maxConcurrency) && maxConcurrency > 0 ? maxConcurrency : undefined,
|
|
189
|
+
...evaluateOptions,
|
|
190
|
+
};
|
|
191
|
+
if (cmdObj.grader && testSuite.defaultTest) {
|
|
192
|
+
testSuite.defaultTest.options = testSuite.defaultTest.options || {};
|
|
193
|
+
testSuite.defaultTest.options.provider = await (0, providers_js_1.loadApiProvider)(cmdObj.grader);
|
|
182
194
|
}
|
|
183
195
|
if (cmdObj.generateSuggestions) {
|
|
184
|
-
options.
|
|
196
|
+
options.generateSuggestions = true;
|
|
185
197
|
}
|
|
186
|
-
const summary = await (0, evaluator_js_1.evaluate)(options);
|
|
198
|
+
const summary = await (0, evaluator_js_1.evaluate)(testSuite, options);
|
|
187
199
|
if (cmdObj.output) {
|
|
188
200
|
logger_js_1.default.info(chalk_1.default.yellow(`Writing output to ${cmdObj.output}`));
|
|
189
201
|
(0, util_js_1.writeOutput)(cmdObj.output, summary);
|
|
@@ -228,7 +240,7 @@ async function main() {
|
|
|
228
240
|
}
|
|
229
241
|
logger_js_1.default.info('\n' + table.toString());
|
|
230
242
|
}
|
|
231
|
-
if (cmdObj.
|
|
243
|
+
if (cmdObj.view || !cmdObj.write) {
|
|
232
244
|
logger_js_1.default.info('Evaluation complete');
|
|
233
245
|
}
|
|
234
246
|
else {
|
|
@@ -237,7 +249,7 @@ async function main() {
|
|
|
237
249
|
}
|
|
238
250
|
logger_js_1.default.info(chalk_1.default.green.bold(`Successes: ${summary.stats.successes}`));
|
|
239
251
|
logger_js_1.default.info(chalk_1.default.red.bold(`Failures: ${summary.stats.failures}`));
|
|
240
|
-
logger_js_1.default.info(`Token usage: Total ${summary.stats.tokenUsage.total} Prompt ${summary.stats.tokenUsage.prompt} Completion ${summary.stats.tokenUsage.completion}`);
|
|
252
|
+
logger_js_1.default.info(`Token usage: Total ${summary.stats.tokenUsage.total}, Prompt ${summary.stats.tokenUsage.prompt}, Completion ${summary.stats.tokenUsage.completion}, Cached ${summary.stats.tokenUsage.cached}`);
|
|
241
253
|
logger_js_1.default.info('Done.');
|
|
242
254
|
if (cmdObj.view) {
|
|
243
255
|
(0, server_js_1.init)(parseInt(cmdObj.view, 10) || 15500);
|