promptfoo 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. package/LICENSE +19 -0
  2. package/README.md +353 -0
  3. package/dist/__mocks__/esm.d.ts +2 -0
  4. package/dist/__mocks__/esm.d.ts.map +1 -0
  5. package/dist/__mocks__/esm.js +4 -0
  6. package/dist/__mocks__/esm.js.map +1 -0
  7. package/dist/esm.d.ts +2 -0
  8. package/dist/esm.d.ts.map +1 -0
  9. package/dist/esm.js +9 -0
  10. package/dist/esm.js.map +1 -0
  11. package/dist/evaluator.d.ts +3 -0
  12. package/dist/evaluator.d.ts.map +1 -0
  13. package/dist/evaluator.js +162 -0
  14. package/dist/evaluator.js.map +1 -0
  15. package/dist/index.d.ts +7 -0
  16. package/dist/index.d.ts.map +1 -0
  17. package/dist/index.js +29 -0
  18. package/dist/index.js.map +1 -0
  19. package/dist/logger.d.ts +11 -0
  20. package/dist/logger.d.ts.map +1 -0
  21. package/dist/logger.js +38 -0
  22. package/dist/logger.js.map +1 -0
  23. package/dist/main.d.ts +3 -0
  24. package/dist/main.d.ts.map +1 -0
  25. package/dist/main.js +90 -0
  26. package/dist/main.js.map +1 -0
  27. package/dist/providers.d.ts +21 -0
  28. package/dist/providers.d.ts.map +1 -0
  29. package/dist/providers.js +145 -0
  30. package/dist/providers.js.map +1 -0
  31. package/dist/tableOutput.html +55 -0
  32. package/dist/types.d.ts +55 -0
  33. package/dist/types.d.ts.map +1 -0
  34. package/dist/types.js +2 -0
  35. package/dist/types.js.map +1 -0
  36. package/dist/util.d.ts +6 -0
  37. package/dist/util.d.ts.map +1 -0
  38. package/dist/util.js +62 -0
  39. package/dist/util.js.map +1 -0
  40. package/package.json +55 -0
  41. package/src/__mocks__/esm.ts +3 -0
  42. package/src/esm.ts +10 -0
  43. package/src/evaluator.ts +203 -0
  44. package/src/index.ts +35 -0
  45. package/src/logger.ts +38 -0
  46. package/src/main.ts +108 -0
  47. package/src/providers.ts +170 -0
  48. package/src/tableOutput.html +55 -0
  49. package/src/types.ts +63 -0
  50. package/src/util.ts +67 -0
package/LICENSE ADDED
@@ -0,0 +1,19 @@
1
+ Copyright (c) Ian Webster 2023
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in all
11
+ copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,353 @@
1
+ # promptfoo
2
+
3
+ `promptfoo` is a library and command-line tool that helps you evaluate LLM prompt & model quality with a systematic approach to comparing model outputs.
4
+
5
+ With promptfoo, you can:
6
+
7
+ - **Test multiple prompts** against predefined test cases
8
+ - **Evaluate quality and catch regressions** by comparing LLM outputs side-by-side
9
+ - **Speed up evaluations** by running tests concurrently
10
+ - Use as a command line tool, or integrate into your workflow as a library
11
+ - Use OpenAI API models (built-in support), or integrate custom API providers for any LLM API
12
+
13
+ ## Usage (command line)
14
+
15
+ To evaluate prompts using `promptfoo`, use the following command:
16
+
17
+ ```bash
18
+ npx promptfoo eval -p <prompt_paths...> -o <output_path> -r <provider> [-v <vars_path>] [-j <max_concurrency>] [-c <config_path>]
19
+ ```
20
+
21
+ - `<prompt_paths...>`: Paths to prompt file(s)
22
+ - `<output_path>`: Path to output CSV, JSON, YAML, or HTML file. Defaults to terminal output
23
+ - `<provider>`: One or more of: `openai:<model_name>`, or filesystem path to custom API caller module
24
+ - `<vars_path>` (optional): Path to CSV, JSON, or YAML file with prompt variables
25
+ - `<max_concurrency>` (optional): Number of simultaneous API requests. Defaults to 3
26
+ - `<config_path>` (optional): Path to configuration file
27
+
28
+ ### Examples
29
+
30
+ #### Prompt quality
31
+
32
+ In this example, we evaluate whether adding adjectives to the personality of an assistant bot affects the responses:
33
+
34
+ ```bash
35
+ npx promptfoo eval -p prompts.txt -v vars.csv -r openai:gpt-3.5-turbo
36
+ ```
37
+
38
+ ![Peek 2023-05-01 13-53](https://user-images.githubusercontent.com/310310/235529431-f4d5c395-d569-448e-9697-cd637e0372a5.gif)
39
+
40
+ <!--
41
+ <img width="1362" alt="Side-by-side evaluation of LLM prompt quality, terminal output" src="https://user-images.githubusercontent.com/310310/235329207-e8c22459-5f51-4fee-9714-1b602ac3d7ca.png">
42
+
43
+ ![Side-by-side evaluation of LLM prompt quality, html output](https://user-images.githubusercontent.com/310310/235483444-4ddb832d-e103-4b9c-a862-b0d6cc11cdc0.png)
44
+ -->
45
+
46
+ This command will evaluate the prompts in `prompts.txt`, substituting the variable values from `vars.csv`, and output results in your terminal.
47
+
48
+ Have a look at the setup and full output [here](https://github.com/typpo/promptfoo/tree/main/examples/assistant-cli).
49
+
50
+ You can run the command without an `-o` option to output in your terminal ([example](https://user-images.githubusercontent.com/310310/235329207-e8c22459-5f51-4fee-9714-1b602ac3d7ca.png)), or use `-o` to specify an HTML ([example](https://user-images.githubusercontent.com/310310/235483444-4ddb832d-e103-4b9c-a862-b0d6cc11cdc0.png)), CSV ([example](https://docs.google.com/spreadsheets/d/1nanoj3_TniWrDl1Sj-qYqIMD6jwm5FBy15xPFdUTsmI/edit?usp=sharing)), JSON ([example](https://github.com/typpo/promptfoo/blob/main/examples/simple-cli/output.json)), or YAML output.
51
+
52
+ #### Model quality
53
+
54
+ In this example, we evaluate the difference between GPT 3.5 and GPT 4 outputs for a given prompt:
55
+
56
+ ```bash
57
+ npx promptfoo eval -p prompts.txt -r openai:gpt-3.5-turbo openai:gpt-4 -o output.html
58
+ ```
59
+
60
+ Produces this HTML table:
61
+
62
+ ![Side-by-side evaluation of LLM model quality, gpt3 vs gpt4, html output](https://user-images.githubusercontent.com/310310/235490527-e0c31f40-00a0-493a-8afc-8ed6322bb5ca.png)
63
+
64
+ Full setup and output [here](https://github.com/typpo/promptfoo/tree/main/examples/gpt-3.5-vs-4).
65
+
66
+ ## Usage (as a library)
67
+
68
+ You can also use `promptfoo` as a library in your project by importing the `evaluate` function. The function takes the following parameters:
69
+
70
+ - `providers`: a list of provider strings or `ApiProvider` objects, or just a single string or `ApiProvider`.
71
+ - `options`: the prompts and variables you want to test:
72
+
73
+ ```typescript
74
+ {
75
+ prompts: string[];
76
+ vars?: Record<string, string>[];
77
+ }
78
+ ```
79
+
80
+ ### Example
81
+
82
+ `promptfoo` exports an `evaluate` function that you can use to run prompt evaluations.
83
+
84
+ ```javascript
85
+ import promptfoo from 'promptfoo';
86
+
87
+ const options = {
88
+ prompts: ['Rephrase this in French: {{body}}', 'Rephrase this like a pirate: {{body}}'],
89
+ vars: [{ body: 'Hello world' }, { body: "I'm hungry" }],
90
+ };
91
+
92
+ (async () => {
93
+ const summary = await promptfoo.evaluate('openai:gpt-3.5-turbo', options);
94
+ console.log(summary);
95
+ })();
96
+ ```
97
+
98
+ This code imports the `promptfoo` library, defines the evaluation options, and then calls the `evaluate` function with these options. The results are logged to the console:
99
+
100
+ ```js
101
+ {
102
+ "results": [
103
+ {
104
+ "prompt": {
105
+ "raw": "Rephrase this in French: Hello world",
106
+ "display": "Rephrase this in French: {{body}}"
107
+ },
108
+ "vars": {
109
+ "body": "Hello world"
110
+ },
111
+ "response": {
112
+ "output": "Bonjour le monde",
113
+ "tokenUsage": {
114
+ "total": 19,
115
+ "prompt": 16,
116
+ "completion": 3
117
+ }
118
+ }
119
+ },
120
+ // ...
121
+ ],
122
+ "stats": {
123
+ "successes": 4,
124
+ "failures": 0,
125
+ "tokenUsage": {
126
+ "total": 120,
127
+ "prompt": 72,
128
+ "completion": 48
129
+ }
130
+ },
131
+ "table": [
132
+ // ...
133
+ ]
134
+ }
135
+ ```
136
+
137
+ [See full example here](https://github.com/typpo/promptfoo/tree/main/examples/simple-import)
138
+
139
+ ## Configuration
140
+
141
+ ### Prompt Files
142
+
143
+ Prompt files are plain text files that contain the prompts you want to test. If you have only one file, you can include multiple prompts in the file, separated by the delimiter `---`. If you have multiple files, each prompt should be in a separate file.
144
+
145
+ You can use [Nunjucks](https://mozilla.github.io/nunjucks/) templating syntax to include variables in your prompts, which will be replaced with actual values from the `vars` CSV file during evaluation.
146
+
147
+ Example of a single prompt file with multiple prompts (`prompts.txt`):
148
+
149
+ ```
150
+ Translate the following text to French: "{{text}}"
151
+ ---
152
+ Translate the following text to German: "{{text}}"
153
+ ```
154
+
155
+ Example of multiple prompt files:
156
+
157
+ - `prompt1.txt`:
158
+
159
+ ```
160
+ Translate the following text to French: "{{text}}"
161
+ ```
162
+
163
+ - `prompt2.txt`:
164
+
165
+ ```
166
+ Translate the following text to German: "{{text}}"
167
+ ```
168
+
169
+ ### Vars File
170
+
171
+ The Vars file is a CSV, JSON, or YAML file that contains the values for the variables used in the prompts. The first row of the CSV file should contain the variable names, and each subsequent row should contain the corresponding values for each test case.
172
+
173
+ Vars are substituted by [Nunjucks](https://mozilla.github.io/nunjucks/) templating syntax into prompts.
174
+
175
+ Example of a vars file (`vars.csv`):
176
+
177
+ ```
178
+ text
179
+ "Hello, world!"
180
+ "Goodbye, everyone!"
181
+ ```
182
+
183
+ Example of a vars file (`vars.json`):
184
+
185
+ ```json
186
+ [{ "text": "Hello, world!" }, { "text": "Goodbye, everyone!" }]
187
+ ```
188
+
189
+ ### Expected Value
190
+
191
+ You can specify an expected value for each test case to evaluate the success or failure of the model's output. To do this, add a special field called `__expected` in the `vars` file. The `__expected` field supports three types of value comparisons:
192
+
193
+ 1. If the expected value starts with `eval:`, the rest of the value is evaluated as a JavaScript expression inside a function defined like: `function(output) { return <eval> }`. Because `return` is added for you, do not begin the expression with `return`. The expression should produce a boolean value, where `true` indicates success and `false` indicates failure.
194
+
195
+ 2. If the expected value starts with `grade:`, it is intended to call a `gradeOutput(prompt, output)` function that returns a boolean value, where `true` indicates success and `false` indicates failure. Note that this mode is not yet implemented in this version: a `grade:` expectation currently always counts as a failure.
196
+
197
+ 3. Otherwise, it attempts an exact string match comparison between the expected value and the model's output.
198
+
199
+ Example of a vars file with the `__expected` field (`vars.csv`):
200
+
201
+ ```
202
+ text,__expected
202
+ "Hello, world!","Bonjour le monde"
203
+ "Goodbye, everyone!","eval:output.includes('Au revoir');"
205
+ ```
206
+
207
+ Example of a vars file with the `__expected` field (`vars.json`):
208
+
209
+ ```json
210
+ [
211
+ { "text": "Hello, world!", "__expected": "Bonjour le monde" },
212
+ { "text": "Goodbye, everyone!", "__expected": "eval:output.includes('Au revoir');" }
213
+ ]
214
+ ```
215
+
216
+ When the `__expected` field is provided, the success and failure statistics in the evaluation summary will be based on whether the expected criteria are met.
217
+
218
+ For more advanced test cases, we recommend using a testing framework like [Jest](https://jestjs.io/) or [Mocha](https://mochajs.org/) and using promptfoo as a library.
219
+
220
+ ### Output File
221
+
222
+ The results of the evaluation are written to this file. Each record in the output file corresponds to a test case and includes the original prompt, the output generated by the LLM, and the values of the variables used in the test case.
223
+
224
+ For example outputs, see the [examples/](https://github.com/typpo/promptfoo/tree/main/examples) directory.
225
+
226
+ ### Configuration File
227
+
228
+ You can specify any option in a configuration file (e.g., `.promptfoorc`, `promptfoo.config.json`). This can help you avoid repetitive command-line options and simplify the CLI invocation.
229
+
230
+ Example of a configuration file (`promptfoo.config.json`):
231
+
232
+ ```json
233
+ {
234
+ "provider": "openai:chat",
235
+ "vars": "/path/to/vars.csv"
236
+ }
237
+ ```
238
+
239
+ ## Installation
240
+
241
+ 1. Clone the repository:
242
+
243
+ ```bash
244
+ git clone https://github.com/typpo/promptfoo.git
245
+ ```
246
+
247
+ 2. Install the dependencies:
248
+
249
+ ```bash
250
+ npm install
251
+ ```
252
+
253
+ 3. Link the CLI tool:
254
+
255
+ ```bash
256
+ npm link
257
+ ```
258
+
259
+ ### Example
260
+
261
+ ```bash
262
+ promptfoo eval -p prompt1.txt prompt2.txt -o results.csv -r openai:chat -v vars.csv
263
+ ```
264
+
265
+ ## API Providers
266
+
267
+ `promptfoo` supports OpenAI API models out of the box. To use a custom API provider, create a custom module that implements the `ApiProvider` interface and pass the path to the module as the `provider` option.
268
+
269
+ ### OpenAI API
270
+
271
+ To use the OpenAI API, set the `OPENAI_API_KEY` environment variable or pass the API key as an argument to the constructor.
272
+
273
+ Example:
274
+
275
+ ```bash
276
+ export OPENAI_API_KEY=your_api_key_here
277
+ ```
278
+
279
+ Other OpenAI-related environment variables are supported:
280
+
281
+ - `OPENAI_TEMPERATURE` - temperature model parameter, defaults to 0
282
+ - `OPENAI_MAX_TOKENS` - max_tokens model parameter, defaults to 1024
283
+
284
+ The OpenAI provider supports the following model formats:
285
+
286
+ - `openai:chat` - defaults to gpt-3.5-turbo
287
+ - `openai:completion` - defaults to `text-davinci-003`
288
+ - `openai:<model name>` - uses a specific model name (mapped automatically to chat or completion endpoint)
289
+ - `openai:chat:<model name>` - uses any model name against the chat endpoint
290
+ - `openai:completion:<model name>` - uses any model name against the completion endpoint
291
+
292
+ The `openai:<endpoint>:<model>` construction is useful if OpenAI releases a new model, or if you have a custom model. For example, if OpenAI releases gpt-5 chat completion, you could begin using it immediately with `openai:chat:gpt-5`.
293
+
294
+ ### Custom API Provider
295
+
296
+ To create a custom API provider, implement the `ApiProvider` interface in a separate module. Here is the interface:
297
+
298
+ ```typescript
299
+ export interface ApiProvider {
300
+ id: () => string;
301
+ callApi: (prompt: string) => Promise<ProviderResult>;
302
+ }
303
+ ```
304
+
305
+ Below is an example of a custom API provider that returns a predefined output and token usage:
306
+
307
+ ```javascript
308
+ // customApiProvider.js
309
+ class CustomApiProvider {
310
+ id() {
311
+ return 'my-custom-api';
312
+ }
313
+
314
+ async callApi(prompt) {
315
+ // Add your custom API logic here
316
+
317
+ return {
318
+ // Required
319
+ output: 'Model output',
320
+
321
+ // Optional
322
+ tokenUsage: {
323
+ total: 10,
324
+ prompt: 5,
325
+ completion: 5,
326
+ },
327
+ };
328
+ }
329
+ }
330
+
331
+ module.exports.default = CustomApiProvider;
332
+ ```
333
+
334
+ To use the custom API provider with `promptfoo`, pass the path to the module as the `provider` option in the CLI invocation:
335
+
336
+ ```bash
337
+ promptfoo eval -p prompt1.txt prompt2.txt -o results.csv -v vars.csv -r ./customApiProvider.js
338
+ ```
339
+
340
+ This command will evaluate the prompts using the custom API provider and save the results to the specified CSV file.
341
+
342
+ ## Development
343
+
344
+ Contributions are welcome! Please feel free to submit a pull request or open an issue.
345
+
346
+ `promptfoo` includes several npm scripts to make development easier and more efficient. To use these scripts, run `npm run <script_name>` in the project directory.
347
+
348
+ Here are some of the available scripts:
349
+
350
+ - `build`: Transpile TypeScript files to JavaScript
351
+ - `watch`: Continuously watch and transpile TypeScript files on changes
352
+ - `test`: Run test suite
353
+ - `test:watch`: Continuously run test suite on changes
@@ -0,0 +1,2 @@
1
+ export declare function getDirectory(): string; // declaration for the __mocks__ stand-in of getDirectory; takes no arguments
2
+ //# sourceMappingURL=esm.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"esm.d.ts","sourceRoot":"","sources":["../../src/__mocks__/esm.ts"],"names":[],"mappings":"AAAA,wBAAgB,YAAY,WAE3B"}
@@ -0,0 +1,4 @@
1
+ export function getDirectory() { // stand-in for the real getDirectory, kept under __mocks__ for use in tests
2
+ return '/test/dir'; // fixed placeholder path; nothing is read from import.meta.url here
3
+ }
4
+ //# sourceMappingURL=esm.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"esm.js","sourceRoot":"","sources":["../../src/__mocks__/esm.ts"],"names":[],"mappings":"AAAA,MAAM,UAAU,YAAY;IAC1B,OAAO,WAAW,CAAC;AACrB,CAAC"}
package/dist/esm.d.ts ADDED
@@ -0,0 +1,2 @@
1
+ export declare function getDirectory(): string; // returns the directory containing the compiled esm module file
2
+ //# sourceMappingURL=esm.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"esm.d.ts","sourceRoot":"","sources":["../src/esm.ts"],"names":[],"mappings":"AAKA,wBAAgB,YAAY,IAAI,MAAM,CAIrC"}
package/dist/esm.js ADDED
@@ -0,0 +1,9 @@
1
+ // esm-specific crap that needs to get mocked out in tests
2
+ import path from 'path';
3
+ import { fileURLToPath } from 'url';
4
+ export function getDirectory() { // returns the directory that contains this module file
5
+ // @ts-ignore: Jest chokes on this
6
+ const __filename = fileURLToPath(import.meta.url); // turn this module's URL into a filesystem path
7
+ return path.dirname(__filename); // keep only the directory part
8
+ }
9
+ //# sourceMappingURL=esm.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"esm.js","sourceRoot":"","sources":["../src/esm.ts"],"names":[],"mappings":"AAAA,0DAA0D;AAE1D,OAAO,IAAI,MAAM,MAAM,CAAC;AACxB,OAAO,EAAE,aAAa,EAAE,MAAM,KAAK,CAAC;AAEpC,MAAM,UAAU,YAAY;IAC1B,kCAAkC;IAClC,MAAM,UAAU,GAAG,aAAa,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IAClD,OAAO,IAAI,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC;AAClC,CAAC"}
@@ -0,0 +1,3 @@
1
+ import { EvaluateOptions, EvaluateSummary } from './types.js';
2
+ export declare function evaluate(options: EvaluateOptions): Promise<EvaluateSummary>; // runs the evaluation described by options and resolves to its summary
3
+ //# sourceMappingURL=evaluator.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"evaluator.d.ts","sourceRoot":"","sources":["../src/evaluator.ts"],"names":[],"mappings":"AAKA,OAAO,EAAE,eAAe,EAAE,eAAe,EAAuC,MAAM,YAAY,CAAC;AAiEnG,wBAAsB,QAAQ,CAAC,OAAO,EAAE,eAAe,GAAG,OAAO,CAAC,eAAe,CAAC,CAoIjF"}
@@ -0,0 +1,162 @@
1
+ import async from 'async';
2
+ import nunjucks from 'nunjucks';
3
+ const DEFAULT_MAX_CONCURRENCY = 3;
4
+ function checkExpectedValue(expected, output) { // checks one model output against an __expected spec string
5
+ if (expected.startsWith('eval:')) { // "eval:" prefix: the remainder is JavaScript evaluated with "output" in scope
6
+ const evalBody = expected.slice(5); // drop the 5-character "eval:" prefix
7
+ const evalFunction = new Function('output', `return ${evalBody}`); // evalBody is placed after "return", so it must be an expression without its own "return"
8
+ return evalFunction(output); // NOTE(review): executes arbitrary code taken from the spec; the result is returned as-is, not coerced to boolean
9
+ }
10
+ else if (expected.startsWith('grade:')) {
11
+ // NYI
12
+ return false; // "grade:" specs are recognised but not implemented, so they always fail
13
+ }
14
+ else {
15
+ return expected === output; // any other spec: strict equality with the output
16
+ }
17
+ }
18
+ async function runEval({ provider, prompt, vars, includeProviderId, }) { // runs one prompt template against one provider for one vars row and returns a result object
19
+ vars = vars || {}; // treat missing vars as an empty object
20
+ const renderedPrompt = nunjucks.renderString(prompt, vars); // substitute vars into the template; this happens outside the try block below
21
+ // Note that we're using original prompt, not renderedPrompt
22
+ const promptDisplay = includeProviderId ? `[${provider.id()}] ${prompt}` : prompt;
23
+ const setup = { // fields shared by the success result and the error result
24
+ prompt: {
25
+ raw: renderedPrompt,
26
+ display: promptDisplay,
27
+ },
28
+ vars,
29
+ };
30
+ try {
31
+ const response = await provider.callApi(renderedPrompt);
32
+ const success = vars.__expected ? checkExpectedValue(vars.__expected, response.output) : true; // a row without a truthy __expected always counts as a success
33
+ const ret = {
34
+ ...setup,
35
+ response,
36
+ success,
37
+ };
38
+ if (!success) { // a failed expectation is also reported through the error field
39
+ ret.error = `Expected ${vars.__expected}, got "${response.output}"`;
40
+ }
41
+ return ret;
42
+ }
43
+ catch (err) { // anything thrown inside the try block becomes a failed result instead of propagating
44
+ return {
45
+ ...setup,
46
+ error: String(err),
47
+ success: false,
48
+ };
49
+ }
50
+ }
51
+ export async function evaluate(options) { // runs every vars row x prompt x provider combination and returns { results, stats, table }
52
+ const prompts = []; // one entry per prompt x provider pair; their display strings become table header columns
53
+ const results = [];
54
+ for (const promptContent of options.prompts) {
55
+ for (const provider of options.providers) {
56
+ prompts.push({
57
+ raw: promptContent,
58
+ display: options.providers.length > 1 ? `[${provider.id()}] ${promptContent}` : promptContent,
59
+ });
60
+ }
61
+ }
62
+ const vars = options.vars && options.vars.length > 0 ? options.vars : [{}]; // with no vars rows, run once with an empty vars object
63
+ const varsWithExpectedKeyRemoved = vars.map((v) => { // shallow copies of each row without the __expected key
64
+ const ret = { ...v };
65
+ delete ret.__expected;
66
+ return ret;
67
+ });
68
+ const isTest = vars[0].__expected; // only the first row decides whether the table gets a RESULT column
69
+ const table = [ // starts with the header row only; data rows are appended after all runs finish
70
+ isTest
71
+ ? [
72
+ 'RESULT',
73
+ [...prompts.map((p) => p.display), ...Object.keys(varsWithExpectedKeyRemoved[0])],
74
+ ].flat()
75
+ : [...prompts.map((p) => p.display), ...Object.keys(varsWithExpectedKeyRemoved[0])],
76
+ ];
77
+ const stats = {
78
+ successes: 0,
79
+ failures: 0,
80
+ tokenUsage: {
81
+ total: 0,
82
+ prompt: 0,
83
+ completion: 0,
84
+ },
85
+ };
86
+ let progressbar; // stays undefined unless options.showProgressBar is set
87
+ if (options.showProgressBar) {
88
+ const totalNumRuns = options.prompts.length * options.providers.length * (options.vars?.length || 1);
89
+ const cliProgress = await import('cli-progress'); // imported dynamically, only when a progress bar is requested
90
+ progressbar = new cliProgress.SingleBar({
91
+ format: 'Eval: [{bar}] {percentage}% | ETA: {eta}s | {value}/{total} | {provider} "{prompt}" {vars}',
92
+ }, cliProgress.Presets.shades_classic);
93
+ progressbar.start(totalNumRuns, 0, {
94
+ provider: '',
95
+ prompt: '',
96
+ vars: '',
97
+ });
98
+ }
99
+ const runEvalOptions = []; // one work item per vars row x prompt x provider, built in that nesting order
100
+ for (const row of vars) {
101
+ for (const promptContent of options.prompts) {
102
+ for (const provider of options.providers) {
103
+ runEvalOptions.push({
104
+ provider,
105
+ prompt: promptContent,
106
+ vars: row,
107
+ includeProviderId: options.providers.length > 1,
108
+ });
109
+ }
110
+ }
111
+ }
112
+ const combinedOutputs = new Array(vars.length).fill(null).map(() => []); // one separate output list per vars row
113
+ await async.forEachOfLimit(runEvalOptions, options.maxConcurrency || DEFAULT_MAX_CONCURRENCY, async (options, index) => { // the callback's "options" parameter shadows the outer options inside this callback
114
+ const row = await runEval(options);
115
+ results.push(row); // NOTE(review): pushed as each run finishes; presumably this can differ from runEvalOptions order when runs overlap - confirm
116
+ if (row.error) { // runEval sets error both for thrown errors and for failed expectations
117
+ stats.failures++;
118
+ }
119
+ else {
120
+ if (row.success) {
121
+ stats.successes++;
122
+ }
123
+ else {
124
+ stats.failures++;
125
+ }
126
+ stats.tokenUsage.total += row.response?.tokenUsage?.total || 0; // token usage is only accumulated for runs without an error
127
+ stats.tokenUsage.prompt += row.response?.tokenUsage?.prompt || 0;
128
+ stats.tokenUsage.completion += row.response?.tokenUsage?.completion || 0;
129
+ }
130
+ if (progressbar) {
131
+ progressbar.increment({
132
+ provider: options.provider.id(),
133
+ prompt: options.prompt.slice(0, 10), // payload text is cut to 10 characters
134
+ vars: Object.entries(options.vars || {})
135
+ .map(([k, v]) => `${k}=${v}`)
136
+ .join(' ')
137
+ .slice(0, 10),
138
+ });
139
+ }
140
+ // Bookkeeping for table
141
+ if (typeof index !== 'number') {
142
+ throw new Error('Expected index to be a number');
143
+ }
144
+ const combinedOutputIndex = Math.floor(index / prompts.length); // prompts.length is prompts x providers, so this is the vars row the run belongs to
145
+ combinedOutputs[combinedOutputIndex].push(row.response?.output || ''); // NOTE(review): appended in completion order - confirm the cells still line up with the header columns when runs overlap
146
+ });
147
+ if (progressbar) {
148
+ progressbar.stop();
149
+ }
150
+ if (isTest) {
151
+ table.push(...combinedOutputs.map((output, index) => [ // NOTE(review): index is a vars row index, but results holds one entry per run - the RESULT cell looks like it may describe a different run; confirm
152
+ results[index].success ? 'PASS' : `FAIL: ${results[index].error}`,
153
+ ...output,
154
+ ...Object.values(varsWithExpectedKeyRemoved[index]),
155
+ ]));
156
+ }
157
+ else {
158
+ table.push(...combinedOutputs.map((output, index) => [...output, ...Object.values(vars[index])])); // no RESULT column: outputs followed by the row's variable values
159
+ }
160
+ return { results, stats, table };
161
+ }
162
+ //# sourceMappingURL=evaluator.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"evaluator.js","sourceRoot":"","sources":["../src/evaluator.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,MAAM,OAAO,CAAC;AAC1B,OAAO,QAAQ,MAAM,UAAU,CAAC;AAahC,MAAM,uBAAuB,GAAG,CAAC,CAAC;AAElC,SAAS,kBAAkB,CAAC,QAAgB,EAAE,MAAc;IAC1D,IAAI,QAAQ,CAAC,UAAU,CAAC,OAAO,CAAC,EAAE;QAChC,MAAM,QAAQ,GAAG,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QACnC,MAAM,YAAY,GAAG,IAAI,QAAQ,CAAC,QAAQ,EAAE,UAAU,QAAQ,EAAE,CAAC,CAAC;QAClE,OAAO,YAAY,CAAC,MAAM,CAAC,CAAC;KAC7B;SAAM,IAAI,QAAQ,CAAC,UAAU,CAAC,QAAQ,CAAC,EAAE;QACxC,MAAM;QACN,OAAO,KAAK,CAAC;KACd;SAAM;QACL,OAAO,QAAQ,KAAK,MAAM,CAAC;KAC5B;AACH,CAAC;AAED,KAAK,UAAU,OAAO,CAAC,EACrB,QAAQ,EACR,MAAM,EACN,IAAI,EACJ,iBAAiB,GACF;IACf,IAAI,GAAG,IAAI,IAAI,EAAE,CAAC;IAClB,MAAM,cAAc,GAAG,QAAQ,CAAC,YAAY,CAAC,MAAM,EAAE,IAAI,CAAC,CAAC;IAE3D,4DAA4D;IAC5D,MAAM,aAAa,GAAG,iBAAiB,CAAC,CAAC,CAAC,IAAI,QAAQ,CAAC,EAAE,EAAE,KAAK,MAAM,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC;IAElF,MAAM,KAAK,GAAG;QACZ,MAAM,EAAE;YACN,GAAG,EAAE,cAAc;YACnB,OAAO,EAAE,aAAa;SACvB;QACD,IAAI;KACL,CAAC;IAEF,IAAI;QACF,MAAM,QAAQ,GAAG,MAAM,QAAQ,CAAC,OAAO,CAAC,cAAc,CAAC,CAAC;QACxD,MAAM,OAAO,GAAG,IAAI,CAAC,UAAU,CAAC,CAAC,CAAC,kBAAkB,CAAC,IAAI,CAAC,UAAU,EAAE,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;QAC9F,MAAM,GAAG,GAAmB;YAC1B,GAAG,KAAK;YACR,QAAQ;YACR,OAAO;SACR,CAAC;QACF,IAAI,CAAC,OAAO,EAAE;YACZ,GAAG,CAAC,KAAK,GAAG,YAAY,IAAI,CAAC,UAAU,UAAU,QAAQ,CAAC,MAAM,GAAG,CAAC;SACrE;QACD,OAAO,GAAG,CAAC;KACZ;IAAC,OAAO,GAAG,EAAE;QACZ,OAAO;YACL,GAAG,KAAK;YACR,KAAK,EAAE,MAAM,CAAC,GAAG,CAAC;YAClB,OAAO,EAAE,KAAK;SACf,CAAC;KACH;AACH,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,QAAQ,CAAC,OAAwB;IACrD,MAAM,OAAO,GAAa,EAAE,CAAC;IAC7B,MAAM,OAAO,GAAqB,EAAE,CAAC;IAErC,KAAK,MAAM,aAAa,IAAI,OAAO,CAAC,OAAO,EAAE;QAC3C,KAAK,MAAM,QAAQ,IAAI,OAAO,CAAC,SAAS,EAAE;YACxC,OAAO,CAAC,IAAI,CAAC;gBACX,GAAG,EAAE,aAAa;gBAClB,OAAO,EACL,OAAO,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,QAAQ,CAAC,EAAE,EAAE,KAAK,aAAa,EAAE,CAAC,CAAC,CAAC,aAAa;aACvF,CAAC,CAAC;SACJ;KACF;IAED,MAAM,IAAI,GAAG,OAAO,CAAC,IAAI,IAAI,OAAO,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,OAA
O,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;IAC3E,MAAM,0BAA0B,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE;QAChD,MAAM,GAAG,GAAG,EAAE,GAAG,CAAC,EAAE,CAAC;QACrB,OAAO,GAAG,CAAC,UAAU,CAAC;QACtB,OAAO,GAAG,CAAC;IACb,CAAC,CAAC,CAAC;IACH,MAAM,MAAM,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC;IAClC,MAAM,KAAK,GAAe;QACxB,MAAM;YACJ,CAAC,CAAC;gBACE,QAAQ;gBACR,CAAC,GAAG,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,EAAE,GAAG,MAAM,CAAC,IAAI,CAAC,0BAA0B,CAAC,CAAC,CAAC,CAAC,CAAC;aAClF,CAAC,IAAI,EAAE;YACV,CAAC,CAAC,CAAC,GAAG,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,EAAE,GAAG,MAAM,CAAC,IAAI,CAAC,0BAA0B,CAAC,CAAC,CAAC,CAAC,CAAC;KACtF,CAAC;IAEF,MAAM,KAAK,GAAG;QACZ,SAAS,EAAE,CAAC;QACZ,QAAQ,EAAE,CAAC;QACX,UAAU,EAAE;YACV,KAAK,EAAE,CAAC;YACR,MAAM,EAAE,CAAC;YACT,UAAU,EAAE,CAAC;SACd;KACF,CAAC;IAEF,IAAI,WAAkC,CAAC;IACvC,IAAI,OAAO,CAAC,eAAe,EAAE;QAC3B,MAAM,YAAY,GAChB,OAAO,CAAC,OAAO,CAAC,MAAM,GAAG,OAAO,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC,OAAO,CAAC,IAAI,EAAE,MAAM,IAAI,CAAC,CAAC,CAAC;QAClF,MAAM,WAAW,GAAG,MAAM,MAAM,CAAC,cAAc,CAAC,CAAC;QACjD,WAAW,GAAG,IAAI,WAAW,CAAC,SAAS,CACrC;YACE,MAAM,EACJ,4FAA4F;SAC/F,EACD,WAAW,CAAC,OAAO,CAAC,cAAc,CACnC,CAAC;QACF,WAAW,CAAC,KAAK,CAAC,YAAY,EAAE,CAAC,EAAE;YACjC,QAAQ,EAAE,EAAE;YACZ,MAAM,EAAE,EAAE;YACV,IAAI,EAAE,EAAE;SACT,CAAC,CAAC;KACJ;IAED,MAAM,cAAc,GAAqB,EAAE,CAAC;IAC5C,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE;QACtB,KAAK,MAAM,aAAa,IAAI,OAAO,CAAC,OAAO,EAAE;YAC3C,KAAK,MAAM,QAAQ,IAAI,OAAO,CAAC,SAAS,EAAE;gBACxC,cAAc,CAAC,IAAI,CAAC;oBAClB,QAAQ;oBACR,MAAM,EAAE,aAAa;oBACrB,IAAI,EAAE,GAAG;oBACT,iBAAiB,EAAE,OAAO,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC;iBAChD,CAAC,CAAC;aACJ;SACF;KACF;IAED,MAAM,eAAe,GAAe,IAAI,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,EAAE,CAAC,CAAC;IACpF,MAAM,KAAK,CAAC,cAAc,CACxB,cAAc,EACd,OAAO,CAAC,cAAc,IAAI,uBAAuB,EACjD,KAAK,EAAE,OAAuB,EAAE,KAAsB,EAAE,EAAE;QACxD,MAAM,GAAG,GAAG,MAAM,OAAO,CAAC,OAAO,CAAC,CAAC;QACnC,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAClB,IAAI,GAAG,CAAC,KAAK,EAAE;YACb,KAAK,CAAC,QAAQ,EAAE
,CAAC;SAClB;aAAM;YACL,IAAI,GAAG,CAAC,OAAO,EAAE;gBACf,KAAK,CAAC,SAAS,EAAE,CAAC;aACnB;iBAAM;gBACL,KAAK,CAAC,QAAQ,EAAE,CAAC;aAClB;YACD,KAAK,CAAC,UAAU,CAAC,KAAK,IAAI,GAAG,CAAC,QAAQ,EAAE,UAAU,EAAE,KAAK,IAAI,CAAC,CAAC;YAC/D,KAAK,CAAC,UAAU,CAAC,MAAM,IAAI,GAAG,CAAC,QAAQ,EAAE,UAAU,EAAE,MAAM,IAAI,CAAC,CAAC;YACjE,KAAK,CAAC,UAAU,CAAC,UAAU,IAAI,GAAG,CAAC,QAAQ,EAAE,UAAU,EAAE,UAAU,IAAI,CAAC,CAAC;SAC1E;QAED,IAAI,WAAW,EAAE;YACf,WAAW,CAAC,SAAS,CAAC;gBACpB,QAAQ,EAAE,OAAO,CAAC,QAAQ,CAAC,EAAE,EAAE;gBAC/B,MAAM,EAAE,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC;gBACnC,IAAI,EAAE,MAAM,CAAC,OAAO,CAAC,OAAO,CAAC,IAAI,IAAI,EAAE,CAAC;qBACrC,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC;qBAC5B,IAAI,CAAC,GAAG,CAAC;qBACT,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC;aAChB,CAAC,CAAC;SACJ;QAED,wBAAwB;QACxB,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE;YAC7B,MAAM,IAAI,KAAK,CAAC,+BAA+B,CAAC,CAAC;SAClD;QACD,MAAM,mBAAmB,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC;QAC/D,eAAe,CAAC,mBAAmB,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,QAAQ,EAAE,MAAM,IAAI,EAAE,CAAC,CAAC;IACxE,CAAC,CACF,CAAC;IAEF,IAAI,WAAW,EAAE;QACf,WAAW,CAAC,IAAI,EAAE,CAAC;KACpB;IAED,IAAI,MAAM,EAAE;QACV,KAAK,CAAC,IAAI,CACR,GAAG,eAAe,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,KAAK,EAAE,EAAE,CAAC;YACxC,OAAO,CAAC,KAAK,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,SAAS,OAAO,CAAC,KAAK,CAAC,CAAC,KAAK,EAAE;YACjE,GAAG,MAAM;YACT,GAAG,MAAM,CAAC,MAAM,CAAC,0BAA0B,CAAC,KAAK,CAAC,CAAC;SACpD,CAAC,CACH,CAAC;KACH;SAAM;QACL,KAAK,CAAC,IAAI,CACR,GAAG,eAAe,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,KAAK,EAAE,EAAE,CAAC,CAAC,GAAG,MAAM,EAAE,GAAG,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CACtF,CAAC;KACH;IAED,OAAO,EAAE,OAAO,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC;AACnC,CAAC"}
@@ -0,0 +1,7 @@
1
+ import type { ApiProvider, EvaluateOptions, EvaluateSummary } from './types.js';
2
+ declare function evaluate(providers: (string | ApiProvider)[] | (string | ApiProvider), options: Omit<EvaluateOptions, 'providers'>): Promise<EvaluateSummary>; // providers may be a single value or a list; options is EvaluateOptions without its providers field
3
+ declare const _default: { // shape of the module's default export
4
+ evaluate: typeof evaluate;
5
+ };
6
+ export default _default;
7
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAGA,OAAO,KAAK,EAAE,WAAW,EAAE,eAAe,EAAE,eAAe,EAAE,MAAM,YAAY,CAAC;AAEhF,iBAAe,QAAQ,CACrB,SAAS,EAAE,CAAC,MAAM,GAAG,WAAW,CAAC,EAAE,GAAG,CAAC,MAAM,GAAG,WAAW,CAAC,EAC5D,OAAO,EAAE,IAAI,CAAC,eAAe,EAAE,WAAW,CAAC,GAC1C,OAAO,CAAC,eAAe,CAAC,CAsB1B;;;;AAED,wBAEE"}
package/dist/index.js ADDED
@@ -0,0 +1,29 @@
1
+ import { evaluate as doEvaluate } from './evaluator.js';
2
+ import { loadApiProvider } from './providers.js';
3
+ async function evaluate(providers, options) { // library entry point: resolves the given provider(s), then delegates to the evaluator
4
+ let apiProviders = []; // resolved providers, in the order they were given
5
+ const addProvider = async (provider) => {
6
+ if (typeof provider === 'string') {
7
+ apiProviders.push(await loadApiProvider(provider)); // strings are resolved through loadApiProvider
8
+ }
9
+ else {
10
+ apiProviders.push(provider); // anything else is used as the provider as-is
11
+ }
12
+ };
13
+ if (Array.isArray(providers)) {
14
+ for (const provider of providers) {
15
+ await addProvider(provider); // awaited one at a time, so list order is preserved
16
+ }
17
+ }
18
+ else {
19
+ await addProvider(providers); // a single provider is accepted without wrapping it in an array
20
+ }
21
+ return doEvaluate({
22
+ ...options,
23
+ providers: apiProviders, // listed after the spread, so it overrides any providers key present in options
24
+ });
25
+ }
26
+ export default { // the default export is an object whose only member is evaluate
27
+ evaluate,
28
+ };
29
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,IAAI,UAAU,EAAE,MAAM,gBAAgB,CAAC;AACxD,OAAO,EAAE,eAAe,EAAE,MAAM,gBAAgB,CAAC;AAIjD,KAAK,UAAU,QAAQ,CACrB,SAA4D,EAC5D,OAA2C;IAE3C,IAAI,YAAY,GAAkB,EAAE,CAAC;IACrC,MAAM,WAAW,GAAG,KAAK,EAAE,QAA8B,EAAE,EAAE;QAC3D,IAAI,OAAO,QAAQ,KAAK,QAAQ,EAAE;YAChC,YAAY,CAAC,IAAI,CAAC,MAAM,eAAe,CAAC,QAAQ,CAAC,CAAC,CAAC;SACpD;aAAM;YACL,YAAY,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;SAC7B;IACH,CAAC,CAAC;IAEF,IAAI,KAAK,CAAC,OAAO,CAAC,SAAS,CAAC,EAAE;QAC5B,KAAK,MAAM,QAAQ,IAAI,SAAS,EAAE;YAChC,MAAM,WAAW,CAAC,QAAQ,CAAC,CAAC;SAC7B;KACF;SAAM;QACL,MAAM,WAAW,CAAC,SAAS,CAAC,CAAC;KAC9B;IAED,OAAO,UAAU,CAAC;QAChB,GAAG,OAAO;QACV,SAAS,EAAE,YAAY;KACxB,CAAC,CAAC;AACL,CAAC;AAED,eAAe;IACb,QAAQ;CACT,CAAC"}
@@ -0,0 +1,11 @@
1
+ import winston from 'winston';
2
+ declare const logLevels: { // level names mapped to numbers; the numeric values are not visible in this declaration
3
+ error: number;
4
+ warn: number;
5
+ info: number;
6
+ debug: number;
7
+ };
8
+ declare const logger: winston.Logger; // the winston logger instance that is this module's default export
9
+ export declare function setLogLevel(level: keyof typeof logLevels): void; // level must be one of the logLevels keys: error, warn, info or debug
10
+ export default logger;
11
+ //# sourceMappingURL=logger.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"logger.d.ts","sourceRoot":"","sources":["../src/logger.ts"],"names":[],"mappings":"AACA,OAAO,OAAO,MAAM,SAAS,CAAC;AAE9B,QAAA,MAAM,SAAS;;;;;CAKd,CAAC;AAeF,QAAA,MAAM,MAAM,gBAIV,CAAC;AAEH,wBAAgB,WAAW,CAAC,KAAK,EAAE,MAAM,OAAO,SAAS,QAMxD;AAED,eAAe,MAAM,CAAC"}