promptfoo 0.5.1 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -248
- package/dist/__mocks__/esm.js +5 -1
- package/dist/__mocks__/esm.js.map +1 -1
- package/dist/assertions.d.ts +18 -0
- package/dist/assertions.d.ts.map +1 -0
- package/dist/assertions.js +128 -0
- package/dist/assertions.js.map +1 -0
- package/dist/esm.d.ts.map +1 -1
- package/dist/esm.js +10 -3
- package/dist/esm.js.map +1 -1
- package/dist/evaluator.d.ts.map +1 -1
- package/dist/evaluator.js +88 -117
- package/dist/evaluator.js.map +1 -1
- package/dist/index.d.ts +13 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +34 -5
- package/dist/index.js.map +1 -1
- package/dist/logger.js +18 -11
- package/dist/logger.js.map +1 -1
- package/dist/main.js +95 -53
- package/dist/main.js.map +1 -1
- package/dist/prompts.d.ts +4 -0
- package/dist/prompts.d.ts.map +1 -1
- package/dist/prompts.js +12 -1
- package/dist/prompts.js.map +1 -1
- package/dist/providers/localai.js +21 -13
- package/dist/providers/localai.js.map +1 -1
- package/dist/providers/openai.d.ts +9 -4
- package/dist/providers/openai.d.ts.map +1 -1
- package/dist/providers/openai.js +39 -29
- package/dist/providers/openai.js.map +1 -1
- package/dist/providers/shared.d.ts.map +1 -1
- package/dist/providers/shared.js +5 -2
- package/dist/providers/shared.js.map +1 -1
- package/dist/providers.d.ts +10 -0
- package/dist/providers.d.ts.map +1 -1
- package/dist/providers.js +51 -14
- package/dist/providers.js.map +1 -1
- package/dist/suggestions.d.ts +9 -0
- package/dist/suggestions.d.ts.map +1 -0
- package/dist/suggestions.js +54 -0
- package/dist/suggestions.js.map +1 -0
- package/dist/types.d.ts +11 -2
- package/dist/types.d.ts.map +1 -1
- package/dist/types.js +2 -1
- package/dist/util.d.ts +1 -1
- package/dist/util.d.ts.map +1 -1
- package/dist/util.js +86 -31
- package/dist/util.js.map +1 -1
- package/dist/web/client/assets/index-207192fc.css +1 -0
- package/dist/web/client/assets/index-8751749f.js +172 -0
- package/dist/web/client/index.html +2 -2
- package/dist/web/server.js +38 -31
- package/dist/web/server.js.map +1 -1
- package/package.json +14 -4
- package/src/assertions.ts +154 -0
- package/src/esm.ts +5 -2
- package/src/evaluator.ts +61 -139
- package/src/index.ts +12 -0
- package/src/main.ts +28 -3
- package/src/prompts.ts +9 -0
- package/src/providers/openai.ts +16 -9
- package/src/providers/shared.ts +1 -1
- package/src/providers.ts +8 -0
- package/src/suggestions.ts +63 -0
- package/src/types.ts +14 -2
- package/src/util.ts +24 -3
- package/src/web/client/package.json +1 -0
- package/src/web/client/src/App.css +4 -0
- package/src/web/client/src/App.tsx +29 -5
- package/src/web/client/src/Logo.css +5 -0
- package/src/web/client/src/NavBar.css +18 -0
- package/src/web/client/src/NavBar.tsx +12 -1
- package/src/web/client/src/index.css +10 -0
- package/src/web/server.ts +2 -2
- package/dist/web/client/assets/index-710f1308.css +0 -1
- package/dist/web/client/assets/index-900b20c0.js +0 -172
package/README.md
CHANGED
|
@@ -14,7 +14,7 @@ With promptfoo, you can:
|
|
|
14
14
|
- Use as a command line tool, or integrate into your workflow as a library
|
|
15
15
|
- Use OpenAI models, open-source models like Llama and Vicuna, or integrate custom API providers for any LLM API
|
|
16
16
|
|
|
17
|
-
|
|
17
|
+
# [» View full documentation «](https://promptfoo.dev/docs/intro)
|
|
18
18
|
|
|
19
19
|
promptfoo produces matrix views that allow you to quickly review prompt outputs across many inputs. The goal: tune prompts systematically across all relevant test cases, instead of testing prompts by trial and error.
|
|
20
20
|
|
|
@@ -41,19 +41,20 @@ After editing the prompts and variables to your liking, run the eval command to
|
|
|
41
41
|
npx promptfoo eval
|
|
42
42
|
```
|
|
43
43
|
|
|
44
|
-
If you're looking to customize your usage, you have
|
|
44
|
+
If you're looking to customize your usage, you have a wide set of parameters at your disposal. See the [Configuration docs](https://www.promptfoo.dev/docs/configuration/parameters) for more detail:
|
|
45
45
|
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
-
|
|
54
|
-
-
|
|
55
|
-
-
|
|
56
|
-
-
|
|
46
|
+
| Option | Description |
|
|
47
|
+
| ----------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
|
48
|
+
| `-p, --prompts <paths...>` | Paths to prompt files, directory, or glob |
|
|
49
|
+
| `-r, --providers <name or path...>` | One of: openai:chat, openai:completion, openai:model-name, localai:chat:model-name, localai:completion:model-name. See [API providers](https://www.promptfoo.dev/docs/configuration/providers) |
|
|
50
|
+
| `-o, --output <path>` | Path to output file (csv, json, yaml, html) |
|
|
51
|
+
| `-v, --vars <path>` | Path to file with prompt variables (csv, json, yaml) |
|
|
52
|
+
| `-c, --config <path>` | Path to configuration file. `promptfooconfig.js[on]` is automatically loaded if present |
|
|
53
|
+
| `-j, --max-concurrency <number>` | Maximum number of concurrent API calls |
|
|
54
|
+
| `--table-cell-max-length <number>` | Truncate console table cells to this length |
|
|
55
|
+
| `--prompt-prefix <path>` | This prefix is prepended to every prompt |
|
|
56
|
+
| `--prompt-suffix <path>` | This suffix is appended to every prompt |
|
|
57
|
+
| `--grader` | Provider that will grade outputs, if you are using [LLM grading](https://www.promptfoo.dev/docs/configuration/expected-outputs) |
|
|
57
58
|
|
|
58
59
|
After running an eval, you may optionally use the `view` command to open the web viewer:
|
|
59
60
|
|
|
@@ -174,247 +175,16 @@ This code imports the `promptfoo` library, defines the evaluation options, and t
|
|
|
174
175
|
|
|
175
176
|
## Configuration
|
|
176
177
|
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
Prompt files are plain text files that contain the prompts you want to test. If you have only one file, you can include multiple prompts in the file, separated by the delimiter `---`. If you have multiple files, each prompt should be in a separate file.
|
|
180
|
-
|
|
181
|
-
You can use [Nunjucks](https://mozilla.github.io/nunjucks/) templating syntax to include variables in your prompts, which will be replaced with actual values from the `vars` CSV file during evaluation.
|
|
182
|
-
|
|
183
|
-
Example of a single prompt file with multiple prompts (`prompts.txt`):
|
|
184
|
-
|
|
185
|
-
```
|
|
186
|
-
Translate the following text to French: "{{name}}: {{text}}"
|
|
187
|
-
---
|
|
188
|
-
Translate the following text to German: "{{name}}: {{text}}"
|
|
189
|
-
```
|
|
190
|
-
|
|
191
|
-
Example of multiple prompt files:
|
|
192
|
-
|
|
193
|
-
- `prompt1.txt`:
|
|
194
|
-
|
|
195
|
-
```
|
|
196
|
-
Translate the following text to French: "{{name}}: {{text}}"
|
|
197
|
-
```
|
|
198
|
-
|
|
199
|
-
- `prompt2.txt`:
|
|
200
|
-
|
|
201
|
-
```
|
|
202
|
-
Translate the following text to German: "{{name}}: {{text}}"
|
|
203
|
-
```
|
|
204
|
-
|
|
205
|
-
### Vars File
|
|
206
|
-
|
|
207
|
-
The Vars file is a CSV, JSON, or YAML file that contains the values for the variables used in the prompts. The first row of the CSV file should contain the variable names, and each subsequent row should contain the corresponding values for each test case.
|
|
208
|
-
|
|
209
|
-
Vars are substituted by [Nunjucks](https://mozilla.github.io/nunjucks/) templating syntax into prompts.
|
|
210
|
-
|
|
211
|
-
Example of a vars file (`vars.csv`):
|
|
212
|
-
|
|
213
|
-
```
|
|
214
|
-
"name","text"
|
|
215
|
-
"Bob","Hello, world!"
|
|
216
|
-
"Joe","Goodbye, everyone!"
|
|
217
|
-
```
|
|
218
|
-
|
|
219
|
-
Example of a vars file (`vars.json`):
|
|
220
|
-
|
|
221
|
-
```json
|
|
222
|
-
[
|
|
223
|
-
{ "name": "Bob", "text": "Hello, world!" },
|
|
224
|
-
{ "name": "Joe", "text": "Goodbye, everyone!" }
|
|
225
|
-
]
|
|
226
|
-
```
|
|
227
|
-
|
|
228
|
-
### Expected Outputs
|
|
229
|
-
|
|
230
|
-
You can specify an expected value for each test case to evaluate the success or failure of the model's output. To do this, add a special field called `__expected` in the `vars` file. The `__expected` field supports these types of value comparisons:
|
|
231
|
-
|
|
232
|
-
1. If the expected value starts with `eval:`, it will evaluate the contents as the body of a JavaScript function defined like: `function(output) { <eval> }`. The function should return a boolean value, where `true` indicates success and `false` indicates failure.
|
|
233
|
-
|
|
234
|
-
1. If the expected value starts with `similar:`, it will compare the semantic similarity of the expected and output values. For example, `similar: greetings, world!` is semantically similar to "Hello world" even though it's not an exact match.
|
|
235
|
-
|
|
236
|
-
The `similar` directive uses cosine similarity, where 1.0 is the most similar and 0.0 is the least similar. Tune the similarity threshold by specifying `similar(0.8): ...` (passes only if similarity >= 0.8).
|
|
237
|
-
|
|
238
|
-
The embedding model currently supported is OpenAI's `text-embedding-ada-002`. As a result, the `similar` directive requires the OPENAI_API_KEY environment variable to be set.
|
|
239
|
-
|
|
240
|
-
1. If the expected value starts with `grade:`, it will ask an LLM to evaluate whether the output meets the condition. For example, `grade: don't mention being an AI`. This option requires a provider name to be supplied to promptfoo via the `--grader` argument: `promptfoo --grader openai:gpt-4 ...`.
|
|
241
|
-
|
|
242
|
-
1. Otherwise, it attempts an exact string match comparison between the expected value and the model's output.
|
|
243
|
-
|
|
244
|
-
Example of a vars file with the `__expected` field (`vars.csv`):
|
|
245
|
-
|
|
246
|
-
```
|
|
247
|
-
text,__expected
|
|
248
|
-
"Hello, world!","Bonjour le monde"
|
|
249
|
-
"Goodbye, everyone!","eval:output.includes('Au revoir');"
|
|
250
|
-
"I am a pineapple","grade:doesn't reference any fruits besides pineapple"
|
|
251
|
-
```
|
|
252
|
-
|
|
253
|
-
Example of a vars file with the `__expected` field (`vars.json`):
|
|
254
|
-
|
|
255
|
-
```json
|
|
256
|
-
[
|
|
257
|
-
{ "text": "Hello, world!", "__expected": "Bonjour le monde" },
|
|
258
|
-
{ "text": "Goodbye, everyone!", "__expected": "eval:output.includes('Au revoir');" }
|
|
259
|
-
{ "text": "I am a pineapple", "__expected": "grade:doesn't reference any fruits besides pineapple" }
|
|
260
|
-
]
|
|
261
|
-
```
|
|
262
|
-
|
|
263
|
-
When the `__expected` field is provided, the success and failure statistics in the evaluation summary will be based on whether the expected criteria are met.
|
|
264
|
-
|
|
265
|
-
For more advanced test cases, we recommend using a testing framework like [Jest](https://jestjs.io/) or [Mocha](https://mochajs.org/) and using promptfoo as a library.
|
|
266
|
-
|
|
267
|
-
### Output File
|
|
268
|
-
|
|
269
|
-
The results of the evaluation are written to this file. Each record in the output file corresponds to a test case and includes the original prompt, the output generated by the LLM, and the values of the variables used in the test case.
|
|
270
|
-
|
|
271
|
-
For example outputs, see the [examples/](https://github.com/typpo/promptfoo/tree/main/examples) directory.
|
|
272
|
-
|
|
273
|
-
### Configuration File
|
|
274
|
-
|
|
275
|
-
You can specify any option in a configuration file (e.g., `.promptfoorc`, `promptfoo.config.json`). This can help you avoid repetitive command-line options and simplify the CLI invocation.
|
|
276
|
-
|
|
277
|
-
Example of a configuration file (`promptfoo.config.json`):
|
|
278
|
-
|
|
279
|
-
```json
|
|
280
|
-
{
|
|
281
|
-
"provider": "openai:chat",
|
|
282
|
-
"vars": "/path/to/vars.csv"
|
|
283
|
-
}
|
|
284
|
-
```
|
|
178
|
+
- **[Setting up an eval](https://promptfoo.dev/docs/configuration/parameters)**: Learn more about how to set up prompt files, vars file, output, etc.
|
|
179
|
+
- **[Configuring test cases](https://promptfoo.dev/docs/configuration/expected-outputs)**: Learn more about how to configure expected outputs and test assertions.
|
|
285
180
|
|
|
286
181
|
## Installation
|
|
287
182
|
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
```bash
|
|
291
|
-
git clone https://github.com/typpo/promptfoo.git
|
|
292
|
-
```
|
|
293
|
-
|
|
294
|
-
2. Install the dependencies:
|
|
295
|
-
|
|
296
|
-
```bash
|
|
297
|
-
npm install
|
|
298
|
-
```
|
|
299
|
-
|
|
300
|
-
3. Link the CLI tool:
|
|
301
|
-
|
|
302
|
-
```bash
|
|
303
|
-
npm link
|
|
304
|
-
```
|
|
305
|
-
|
|
306
|
-
4. Build:
|
|
307
|
-
|
|
308
|
-
```bash
|
|
309
|
-
npm run build
|
|
310
|
-
```
|
|
311
|
-
|
|
312
|
-
5. Make the entrypoint executable:
|
|
313
|
-
|
|
314
|
-
```bash
|
|
315
|
-
chmod +x dist/main.js
|
|
316
|
-
```
|
|
183
|
+
See **[installation docs](https://promptfoo.dev/docs/installation)**
|
|
317
184
|
|
|
318
185
|
## API Providers
|
|
319
186
|
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
### OpenAI API
|
|
323
|
-
|
|
324
|
-
To use the OpenAI API, set the `OPENAI_API_KEY` environment variable or pass the API key as an argument to the constructor.
|
|
325
|
-
|
|
326
|
-
Example:
|
|
327
|
-
|
|
328
|
-
```bash
|
|
329
|
-
export OPENAI_API_KEY=your_api_key_here
|
|
330
|
-
```
|
|
331
|
-
|
|
332
|
-
Other OpenAI-related environment variables are supported:
|
|
333
|
-
|
|
334
|
-
- `OPENAI_TEMPERATURE` - temperature model parameter, defaults to 0
|
|
335
|
-
- `OPENAI_MAX_TOKENS` - max_tokens model parameter, defaults to 1024
|
|
336
|
-
- `OPENAI_STOP` - stopwords in JSON format, defaults to []
|
|
337
|
-
- `OPENAI_API_HOST` - override the hostname for the API request. Useful for proxies like Helicone.
|
|
338
|
-
- `REQUEST_TIMEOUT_MS` - maximum request time, in milliseconds. Defaults to 60000.
|
|
339
|
-
|
|
340
|
-
The OpenAI provider supports the following model formats:
|
|
341
|
-
|
|
342
|
-
- `openai:chat` - defaults to gpt-3.5-turbo
|
|
343
|
-
- `openai:completion` - defaults to `text-davinci-003`
|
|
344
|
-
- `openai:<model name>` - uses a specific model name (mapped automatically to chat or completion endpoint)
|
|
345
|
-
- `openai:chat:<model name>` - uses any model name against the chat endpoint
|
|
346
|
-
- `openai:completion:<model name>` - uses any model name against the completion endpoint
|
|
347
|
-
|
|
348
|
-
The `openai:<endpoint>:<model>` construction is useful if OpenAI releases a new model, or if you have a custom model. For example, if OpenAI releases gpt-5 chat completion, you could begin using it immediately with `openai:chat:gpt-5`.
|
|
349
|
-
|
|
350
|
-
### LocalAI
|
|
351
|
-
|
|
352
|
-
LocalAI is an API wrapper for open-source LLMs that is compatible with OpenAI. You can run LocalAI for compatibility with Llama, Alpaca, Vicuna, GPT4All, RedPajama, and many other models compatible with the ggml format.
|
|
353
|
-
|
|
354
|
-
View all compatible models [here](https://github.com/go-skynet/LocalAI#model-compatibility-table).
|
|
355
|
-
|
|
356
|
-
Once you have LocalAI up and running, specify one of the following based on the model you have selected:
|
|
357
|
-
|
|
358
|
-
- `localai:chat:<model name>`
|
|
359
|
-
- `localai:completion:<model name>`
|
|
360
|
-
- `localai:<model name>` - defaults to chat-type model
|
|
361
|
-
|
|
362
|
-
The model name is typically the filename of the .bin file that you downloaded to set up the model in LocalAI. For example, `ggml-vic13b-uncensored-q5_1.bin`
|
|
363
|
-
|
|
364
|
-
Supported environment variables:
|
|
365
|
-
|
|
366
|
-
- `LOCALAI_BASE_URL` - defaults to `http://localhost:8080/v1`
|
|
367
|
-
- `REQUEST_TIMEOUT_MS` - maximum request time, in milliseconds. Defaults to 60000.
|
|
368
|
-
|
|
369
|
-
### Custom API Provider
|
|
370
|
-
|
|
371
|
-
To create a custom API provider, implement the `ApiProvider` interface in a separate module. Here is the interface:
|
|
372
|
-
|
|
373
|
-
```js
|
|
374
|
-
export interface ApiProvider {
|
|
375
|
-
id: () => string;
|
|
376
|
-
callApi: (prompt: string) => Promise<ProviderResult>;
|
|
377
|
-
}
|
|
378
|
-
```
|
|
379
|
-
|
|
380
|
-
Below is an example of a custom API provider that returns a predefined output and token usage:
|
|
381
|
-
|
|
382
|
-
```javascript
|
|
383
|
-
// customApiProvider.js
|
|
384
|
-
import fetch from 'node-fetch';
|
|
385
|
-
|
|
386
|
-
class CustomApiProvider {
|
|
387
|
-
id() {
|
|
388
|
-
return 'my-custom-api';
|
|
389
|
-
}
|
|
390
|
-
|
|
391
|
-
async callApi(prompt) {
|
|
392
|
-
// Add your custom API logic here
|
|
393
|
-
|
|
394
|
-
return {
|
|
395
|
-
// Required
|
|
396
|
-
output: 'Model output',
|
|
397
|
-
|
|
398
|
-
// Optional
|
|
399
|
-
tokenUsage: {
|
|
400
|
-
total: 10,
|
|
401
|
-
prompt: 5,
|
|
402
|
-
completion: 5,
|
|
403
|
-
},
|
|
404
|
-
};
|
|
405
|
-
}
|
|
406
|
-
}
|
|
407
|
-
|
|
408
|
-
export default CustomApiProvider;
|
|
409
|
-
```
|
|
410
|
-
|
|
411
|
-
To use the custom API provider with `promptfoo`, pass the path to the module as the `provider` option in the CLI invocation:
|
|
412
|
-
|
|
413
|
-
```bash
|
|
414
|
-
promptfoo eval -p prompt1.txt prompt2.txt -o results.csv -v vars.csv -r ./customApiProvider.js
|
|
415
|
-
```
|
|
416
|
-
|
|
417
|
-
This command will evaluate the prompts using the custom API provider and save the results to the specified CSV file.
|
|
187
|
+
We support OpenAI's API as well as a number of open-source models. It's also possible to set up your own custom API provider. **[See Provider documentation](https://promptfoo.dev/docs/configuration/providers)** for more details.
|
|
418
188
|
|
|
419
189
|
## Development
|
|
420
190
|
|
|
@@ -428,3 +198,5 @@ Here are some of the available scripts:
|
|
|
428
198
|
- `build:watch`: Continuously watch and transpile TypeScript files on changes
|
|
429
199
|
- `test`: Run test suite
|
|
430
200
|
- `test:watch`: Continuously run test suite on changes
|
|
201
|
+
|
|
202
|
+
# [» View full documentation «](https://promptfoo.dev/docs/intro)
|
package/dist/__mocks__/esm.js
CHANGED
|
@@ -1,4 +1,8 @@
|
|
|
1
|
-
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.getDirectory = void 0;
|
|
4
|
+
function getDirectory() {
|
|
2
5
|
return '/test/dir';
|
|
3
6
|
}
|
|
7
|
+
exports.getDirectory = getDirectory;
|
|
4
8
|
//# sourceMappingURL=esm.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"esm.js","sourceRoot":"","sources":["../../src/__mocks__/esm.ts"],"names":[],"mappings":"AAAA,
|
|
1
|
+
{"version":3,"file":"esm.js","sourceRoot":"","sources":["../../src/__mocks__/esm.ts"],"names":[],"mappings":";;;AAAA,SAAgB,YAAY;IAC1B,OAAO,WAAW,CAAC;AACrB,CAAC;AAFD,oCAEC"}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import type { EvaluateOptions, GradingConfig, TokenUsage } from './types.js';
|
|
2
|
+
interface GradingResult {
|
|
3
|
+
pass: boolean;
|
|
4
|
+
reason: string;
|
|
5
|
+
tokensUsed: TokenUsage;
|
|
6
|
+
}
|
|
7
|
+
export declare function matchesExpectedValue(expected: string, output: string, options: EvaluateOptions): Promise<{
|
|
8
|
+
pass: boolean;
|
|
9
|
+
reason?: string;
|
|
10
|
+
}>;
|
|
11
|
+
export declare function matchesSimilarity(expected: string, output: string, threshold: number): Promise<GradingResult>;
|
|
12
|
+
export declare function matchesLlmRubric(expected: string, output: string, options?: GradingConfig): Promise<GradingResult>;
|
|
13
|
+
declare const _default: {
|
|
14
|
+
matchesSimilarity: typeof matchesSimilarity;
|
|
15
|
+
matchesLlmRubric: typeof matchesLlmRubric;
|
|
16
|
+
};
|
|
17
|
+
export default _default;
|
|
18
|
+
//# sourceMappingURL=assertions.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"assertions.d.ts","sourceRoot":"","sources":["../src/assertions.ts"],"names":[],"mappings":"AAOA,OAAO,KAAK,EAAE,eAAe,EAAE,aAAa,EAAE,UAAU,EAAE,MAAM,YAAY,CAAC;AAE7E,UAAU,aAAa;IACrB,IAAI,EAAE,OAAO,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;IACf,UAAU,EAAE,UAAU,CAAC;CACxB;AAMD,wBAAsB,oBAAoB,CACxC,QAAQ,EAAE,MAAM,EAChB,MAAM,EAAE,MAAM,EACd,OAAO,EAAE,eAAe,GACvB,OAAO,CAAC;IAAE,IAAI,EAAE,OAAO,CAAC;IAAC,MAAM,CAAC,EAAE,MAAM,CAAA;CAAE,CAAC,CAuB7C;AAED,wBAAsB,iBAAiB,CACrC,QAAQ,EAAE,MAAM,EAChB,MAAM,EAAE,MAAM,EACd,SAAS,EAAE,MAAM,GAChB,OAAO,CAAC,aAAa,CAAC,CA0CxB;AAED,wBAAsB,gBAAgB,CACpC,QAAQ,EAAE,MAAM,EAChB,MAAM,EAAE,MAAM,EACd,OAAO,CAAC,EAAE,aAAa,GACtB,OAAO,CAAC,aAAa,CAAC,CAgDxB;;;;;AAED,wBAGE"}
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
+
};
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.matchesLlmRubric = exports.matchesSimilarity = exports.matchesExpectedValue = void 0;
|
|
7
|
+
const nunjucks_1 = __importDefault(require("nunjucks"));
|
|
8
|
+
const openai_js_1 = require("./providers/openai.js");
|
|
9
|
+
const util_js_1 = require("./util.js");
|
|
10
|
+
const providers_js_1 = require("./providers.js");
|
|
11
|
+
const prompts_js_1 = require("./prompts.js");
|
|
12
|
+
const SIMILAR_REGEX = /similar(?::|\((\d+(\.\d+)?)\):)/;
|
|
13
|
+
const DEFAULT_SEMANTIC_SIMILARITY_THRESHOLD = 0.8;
|
|
14
|
+
async function matchesExpectedValue(expected, output, options) {
|
|
15
|
+
const match = expected.match(SIMILAR_REGEX);
|
|
16
|
+
if (match) {
|
|
17
|
+
const threshold = parseFloat(match[1]) || DEFAULT_SEMANTIC_SIMILARITY_THRESHOLD;
|
|
18
|
+
const rest = expected.replace(SIMILAR_REGEX, '').trim();
|
|
19
|
+
return matchesSimilarity(rest, output, threshold);
|
|
20
|
+
}
|
|
21
|
+
else if (expected.startsWith('fn:') || expected.startsWith('eval:')) {
|
|
22
|
+
// TODO(1.0): delete eval: legacy option
|
|
23
|
+
const sliceLength = expected.startsWith('fn:') ? 'fn:'.length : 'eval:'.length;
|
|
24
|
+
const functionBody = expected.slice(sliceLength);
|
|
25
|
+
const customFunction = new Function('output', `return ${functionBody}`);
|
|
26
|
+
return { pass: customFunction(output) };
|
|
27
|
+
}
|
|
28
|
+
else if (expected.startsWith('grade:')) {
|
|
29
|
+
return matchesLlmRubric(expected.slice(6), output, options.grading);
|
|
30
|
+
}
|
|
31
|
+
else {
|
|
32
|
+
const pass = expected === output;
|
|
33
|
+
return {
|
|
34
|
+
pass,
|
|
35
|
+
reason: pass ? undefined : `Expected: ${expected}, Output: ${output}`,
|
|
36
|
+
};
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
exports.matchesExpectedValue = matchesExpectedValue;
|
|
40
|
+
async function matchesSimilarity(expected, output, threshold) {
|
|
41
|
+
const expectedEmbedding = await openai_js_1.DefaultEmbeddingProvider.callEmbeddingApi(expected);
|
|
42
|
+
const outputEmbedding = await openai_js_1.DefaultEmbeddingProvider.callEmbeddingApi(output);
|
|
43
|
+
const tokensUsed = {
|
|
44
|
+
total: (expectedEmbedding.tokenUsage?.total || 0) + (outputEmbedding.tokenUsage?.total || 0),
|
|
45
|
+
prompt: (expectedEmbedding.tokenUsage?.prompt || 0) + (outputEmbedding.tokenUsage?.prompt || 0),
|
|
46
|
+
completion: (expectedEmbedding.tokenUsage?.completion || 0) +
|
|
47
|
+
(outputEmbedding.tokenUsage?.completion || 0),
|
|
48
|
+
};
|
|
49
|
+
if (expectedEmbedding.error || outputEmbedding.error) {
|
|
50
|
+
return {
|
|
51
|
+
pass: false,
|
|
52
|
+
reason: expectedEmbedding.error || outputEmbedding.error || 'Unknown error fetching embeddings',
|
|
53
|
+
tokensUsed,
|
|
54
|
+
};
|
|
55
|
+
}
|
|
56
|
+
if (!expectedEmbedding.embedding || !outputEmbedding.embedding) {
|
|
57
|
+
return {
|
|
58
|
+
pass: false,
|
|
59
|
+
reason: 'Embedding not found',
|
|
60
|
+
tokensUsed,
|
|
61
|
+
};
|
|
62
|
+
}
|
|
63
|
+
const similarity = (0, util_js_1.cosineSimilarity)(expectedEmbedding.embedding, outputEmbedding.embedding);
|
|
64
|
+
if (similarity < threshold) {
|
|
65
|
+
return {
|
|
66
|
+
pass: false,
|
|
67
|
+
reason: `Similarity ${similarity} is less than threshold ${threshold}`,
|
|
68
|
+
tokensUsed,
|
|
69
|
+
};
|
|
70
|
+
}
|
|
71
|
+
return {
|
|
72
|
+
pass: true,
|
|
73
|
+
reason: `Similarity ${similarity} is greater than threshold ${threshold}`,
|
|
74
|
+
tokensUsed,
|
|
75
|
+
};
|
|
76
|
+
}
|
|
77
|
+
exports.matchesSimilarity = matchesSimilarity;
|
|
78
|
+
async function matchesLlmRubric(expected, output, options) {
|
|
79
|
+
if (!options) {
|
|
80
|
+
throw new Error('Cannot grade output without grading config. Specify --grader option or grading config.');
|
|
81
|
+
}
|
|
82
|
+
const prompt = nunjucks_1.default.renderString(options.prompt || prompts_js_1.DEFAULT_GRADING_PROMPT, {
|
|
83
|
+
content: output,
|
|
84
|
+
rubric: expected,
|
|
85
|
+
});
|
|
86
|
+
let provider = options.provider || openai_js_1.DefaultGradingProvider;
|
|
87
|
+
if (typeof provider === 'string') {
|
|
88
|
+
provider = await (0, providers_js_1.loadApiProvider)(provider);
|
|
89
|
+
}
|
|
90
|
+
const resp = await provider.callApi(prompt);
|
|
91
|
+
if (resp.error || !resp.output) {
|
|
92
|
+
return {
|
|
93
|
+
pass: false,
|
|
94
|
+
reason: resp.error || 'No output',
|
|
95
|
+
tokensUsed: {
|
|
96
|
+
total: resp.tokenUsage?.total || 0,
|
|
97
|
+
prompt: resp.tokenUsage?.prompt || 0,
|
|
98
|
+
completion: resp.tokenUsage?.completion || 0,
|
|
99
|
+
},
|
|
100
|
+
};
|
|
101
|
+
}
|
|
102
|
+
try {
|
|
103
|
+
const parsed = JSON.parse(resp.output);
|
|
104
|
+
parsed.tokensUsed = {
|
|
105
|
+
total: resp.tokenUsage?.total || 0,
|
|
106
|
+
prompt: resp.tokenUsage?.prompt || 0,
|
|
107
|
+
completion: resp.tokenUsage?.completion || 0,
|
|
108
|
+
};
|
|
109
|
+
return parsed;
|
|
110
|
+
}
|
|
111
|
+
catch (err) {
|
|
112
|
+
return {
|
|
113
|
+
pass: false,
|
|
114
|
+
reason: `Output is not valid JSON: ${resp.output}`,
|
|
115
|
+
tokensUsed: {
|
|
116
|
+
total: resp.tokenUsage?.total || 0,
|
|
117
|
+
prompt: resp.tokenUsage?.prompt || 0,
|
|
118
|
+
completion: resp.tokenUsage?.completion || 0,
|
|
119
|
+
},
|
|
120
|
+
};
|
|
121
|
+
}
|
|
122
|
+
}
|
|
123
|
+
exports.matchesLlmRubric = matchesLlmRubric;
|
|
124
|
+
exports.default = {
|
|
125
|
+
matchesSimilarity,
|
|
126
|
+
matchesLlmRubric,
|
|
127
|
+
};
|
|
128
|
+
//# sourceMappingURL=assertions.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"assertions.js","sourceRoot":"","sources":["../src/assertions.ts"],"names":[],"mappings":";;;;;;AAAA,wDAAgC;AAEhC,qDAAyF;AACzF,uCAA6C;AAC7C,iDAAiD;AACjD,6CAAsD;AAUtD,MAAM,aAAa,GAAG,iCAAiC,CAAC;AAExD,MAAM,qCAAqC,GAAG,GAAG,CAAC;AAE3C,KAAK,UAAU,oBAAoB,CACxC,QAAgB,EAChB,MAAc,EACd,OAAwB;IAExB,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC;IAE5C,IAAI,KAAK,EAAE;QACT,MAAM,SAAS,GAAG,UAAU,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,qCAAqC,CAAC;QAChF,MAAM,IAAI,GAAG,QAAQ,CAAC,OAAO,CAAC,aAAa,EAAE,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;QACxD,OAAO,iBAAiB,CAAC,IAAI,EAAE,MAAM,EAAE,SAAS,CAAC,CAAC;KACnD;SAAM,IAAI,QAAQ,CAAC,UAAU,CAAC,KAAK,CAAC,IAAI,QAAQ,CAAC,UAAU,CAAC,OAAO,CAAC,EAAE;QACrE,wCAAwC;QACxC,MAAM,WAAW,GAAG,QAAQ,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,OAAO,CAAC,MAAM,CAAC;QAC/E,MAAM,YAAY,GAAG,QAAQ,CAAC,KAAK,CAAC,WAAW,CAAC,CAAC;QAEjD,MAAM,cAAc,GAAG,IAAI,QAAQ,CAAC,QAAQ,EAAE,UAAU,YAAY,EAAE,CAAC,CAAC;QACxE,OAAO,EAAE,IAAI,EAAE,cAAc,CAAC,MAAM,CAAC,EAAE,CAAC;KACzC;SAAM,IAAI,QAAQ,CAAC,UAAU,CAAC,QAAQ,CAAC,EAAE;QACxC,OAAO,gBAAgB,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,MAAM,EAAE,OAAO,CAAC,OAAO,CAAC,CAAC;KACrE;SAAM;QACL,MAAM,IAAI,GAAG,QAAQ,KAAK,MAAM,CAAC;QACjC,OAAO;YACL,IAAI;YACJ,MAAM,EAAE,IAAI,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,aAAa,QAAQ,aAAa,MAAM,EAAE;SACtE,CAAC;KACH;AACH,CAAC;AA3BD,oDA2BC;AAEM,KAAK,UAAU,iBAAiB,CACrC,QAAgB,EAChB,MAAc,EACd,SAAiB;IAEjB,MAAM,iBAAiB,GAAG,MAAM,oCAAwB,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAC;IACpF,MAAM,eAAe,GAAG,MAAM,oCAAwB,CAAC,gBAAgB,CAAC,MAAM,CAAC,CAAC;IAEhF,MAAM,UAAU,GAAG;QACjB,KAAK,EAAE,CAAC,iBAAiB,CAAC,UAAU,EAAE,KAAK,IAAI,CAAC,CAAC,GAAG,CAAC,eAAe,CAAC,UAAU,EAAE,KAAK,IAAI,CAAC,CAAC;QAC5F,MAAM,EAAE,CAAC,iBAAiB,CAAC,UAAU,EAAE,MAAM,IAAI,CAAC,CAAC,GAAG,CAAC,eAAe,CAAC,UAAU,EAAE,MAAM,IAAI,CAAC,CAAC;QAC/F,UAAU,EACR,CAAC,iBAAiB,CAAC,UAAU,EAAE,UAAU,IAAI,CAAC,CAAC;YAC/C,CAAC,eAAe,CAAC,UAAU,EAAE,UAAU,IAAI,CAAC,CAAC;KAChD,CAAC;IAEF,IAAI,iBAAiB,CAAC,KAAK,IAAI,eAAe,CAAC,KAAK,EAAE;QACpD,OAAO;YACL,IAAI,EAAE,KAAK;YACX,MAAM,EACJ,iBAAiB,CAAC,KAAK,I
AAI,eAAe,CAAC,KAAK,IAAI,mCAAmC;YACzF,UAAU;SACX,CAAC;KACH;IAED,IAAI,CAAC,iBAAiB,CAAC,SAAS,IAAI,CAAC,eAAe,CAAC,SAAS,EAAE;QAC9D,OAAO;YACL,IAAI,EAAE,KAAK;YACX,MAAM,EAAE,qBAAqB;YAC7B,UAAU;SACX,CAAC;KACH;IAED,MAAM,UAAU,GAAG,IAAA,0BAAgB,EAAC,iBAAiB,CAAC,SAAS,EAAE,eAAe,CAAC,SAAS,CAAC,CAAC;IAC5F,IAAI,UAAU,GAAG,SAAS,EAAE;QAC1B,OAAO;YACL,IAAI,EAAE,KAAK;YACX,MAAM,EAAE,cAAc,UAAU,2BAA2B,SAAS,EAAE;YACtE,UAAU;SACX,CAAC;KACH;IACD,OAAO;QACL,IAAI,EAAE,IAAI;QACV,MAAM,EAAE,cAAc,UAAU,8BAA8B,SAAS,EAAE;QACzE,UAAU;KACX,CAAC;AACJ,CAAC;AA9CD,8CA8CC;AAEM,KAAK,UAAU,gBAAgB,CACpC,QAAgB,EAChB,MAAc,EACd,OAAuB;IAEvB,IAAI,CAAC,OAAO,EAAE;QACZ,MAAM,IAAI,KAAK,CACb,wFAAwF,CACzF,CAAC;KACH;IAED,MAAM,MAAM,GAAG,kBAAQ,CAAC,YAAY,CAAC,OAAO,CAAC,MAAM,IAAI,mCAAsB,EAAE;QAC7E,OAAO,EAAE,MAAM;QACf,MAAM,EAAE,QAAQ;KACjB,CAAC,CAAC;IAEH,IAAI,QAAQ,GAAG,OAAO,CAAC,QAAQ,IAAI,kCAAsB,CAAC;IAC1D,IAAI,OAAO,QAAQ,KAAK,QAAQ,EAAE;QAChC,QAAQ,GAAG,MAAM,IAAA,8BAAe,EAAC,QAAQ,CAAC,CAAC;KAC5C;IACD,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC;IAC5C,IAAI,IAAI,CAAC,KAAK,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE;QAC9B,OAAO;YACL,IAAI,EAAE,KAAK;YACX,MAAM,EAAE,IAAI,CAAC,KAAK,IAAI,WAAW;YACjC,UAAU,EAAE;gBACV,KAAK,EAAE,IAAI,CAAC,UAAU,EAAE,KAAK,IAAI,CAAC;gBAClC,MAAM,EAAE,IAAI,CAAC,UAAU,EAAE,MAAM,IAAI,CAAC;gBACpC,UAAU,EAAE,IAAI,CAAC,UAAU,EAAE,UAAU,IAAI,CAAC;aAC7C;SACF,CAAC;KACH;IAED,IAAI;QACF,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,CAAkB,CAAC;QACxD,MAAM,CAAC,UAAU,GAAG;YAClB,KAAK,EAAE,IAAI,CAAC,UAAU,EAAE,KAAK,IAAI,CAAC;YAClC,MAAM,EAAE,IAAI,CAAC,UAAU,EAAE,MAAM,IAAI,CAAC;YACpC,UAAU,EAAE,IAAI,CAAC,UAAU,EAAE,UAAU,IAAI,CAAC;SAC7C,CAAC;QACF,OAAO,MAAM,CAAC;KACf;IAAC,OAAO,GAAG,EAAE;QACZ,OAAO;YACL,IAAI,EAAE,KAAK;YACX,MAAM,EAAE,6BAA6B,IAAI,CAAC,MAAM,EAAE;YAClD,UAAU,EAAE;gBACV,KAAK,EAAE,IAAI,CAAC,UAAU,EAAE,KAAK,IAAI,CAAC;gBAClC,MAAM,EAAE,IAAI,CAAC,UAAU,EAAE,MAAM,IAAI,CAAC;gBACpC,UAAU,EAAE,IAAI,CAAC,UAAU,EAAE,UAAU,IAAI,CAAC;aAC7C;SACF,CAAC;KACH;AACH,CAAC;AApDD,4CAoDC;AAED,kBAAe;IACb,iBAAiB;IACjB,gBAAgB;CACjB,CAAC"}
|
package/dist/esm.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"esm.d.ts","sourceRoot":"","sources":["../src/esm.ts"],"names":[],"mappings":"AAKA,wBAAgB,YAAY,IAAI,MAAM,
|
|
1
|
+
{"version":3,"file":"esm.d.ts","sourceRoot":"","sources":["../src/esm.ts"],"names":[],"mappings":"AAKA,wBAAgB,YAAY,IAAI,MAAM,CAOrC"}
|
package/dist/esm.js
CHANGED
|
@@ -1,9 +1,16 @@
|
|
|
1
|
+
"use strict";
|
|
1
2
|
// esm-specific crap that needs to get mocked out in tests
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
3
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
4
|
+
exports.getDirectory = void 0;
|
|
5
|
+
//import path from 'path';
|
|
6
|
+
//import { fileURLToPath } from 'url';
|
|
7
|
+
function getDirectory() {
|
|
8
|
+
/*
|
|
5
9
|
// @ts-ignore: Jest chokes on this
|
|
6
10
|
const __filename = fileURLToPath(import.meta.url);
|
|
7
11
|
return path.dirname(__filename);
|
|
12
|
+
*/
|
|
13
|
+
return __dirname;
|
|
8
14
|
}
|
|
15
|
+
exports.getDirectory = getDirectory;
|
|
9
16
|
//# sourceMappingURL=esm.js.map
|
package/dist/esm.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"esm.js","sourceRoot":"","sources":["../src/esm.ts"],"names":[],"mappings":"AAAA,0DAA0D
|
|
1
|
+
{"version":3,"file":"esm.js","sourceRoot":"","sources":["../src/esm.ts"],"names":[],"mappings":";AAAA,0DAA0D;;;AAE1D,0BAA0B;AAC1B,sCAAsC;AAEtC,SAAgB,YAAY;IAC1B;;;;KAIC;IACD,OAAO,SAAS,CAAC;AACnB,CAAC;AAPD,oCAOC"}
|
package/dist/evaluator.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"evaluator.d.ts","sourceRoot":"","sources":["../src/evaluator.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"evaluator.d.ts","sourceRoot":"","sources":["../src/evaluator.ts"],"names":[],"mappings":"AAUA,OAAO,KAAK,EAEV,eAAe,EAGf,eAAe,EAGhB,MAAM,YAAY,CAAC;AAiRpB,wBAAgB,QAAQ,CAAC,OAAO,EAAE,eAAe,4BAGhD"}
|