promptfoo 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77)
  1. package/README.md +20 -248
  2. package/dist/__mocks__/esm.js +5 -1
  3. package/dist/__mocks__/esm.js.map +1 -1
  4. package/dist/assertions.d.ts +18 -0
  5. package/dist/assertions.d.ts.map +1 -0
  6. package/dist/assertions.js +128 -0
  7. package/dist/assertions.js.map +1 -0
  8. package/dist/esm.d.ts.map +1 -1
  9. package/dist/esm.js +10 -3
  10. package/dist/esm.js.map +1 -1
  11. package/dist/evaluator.d.ts.map +1 -1
  12. package/dist/evaluator.js +88 -117
  13. package/dist/evaluator.js.map +1 -1
  14. package/dist/index.d.ts +13 -0
  15. package/dist/index.d.ts.map +1 -1
  16. package/dist/index.js +34 -5
  17. package/dist/index.js.map +1 -1
  18. package/dist/logger.js +18 -11
  19. package/dist/logger.js.map +1 -1
  20. package/dist/main.js +95 -53
  21. package/dist/main.js.map +1 -1
  22. package/dist/prompts.d.ts +4 -0
  23. package/dist/prompts.d.ts.map +1 -1
  24. package/dist/prompts.js +12 -1
  25. package/dist/prompts.js.map +1 -1
  26. package/dist/providers/localai.js +21 -13
  27. package/dist/providers/localai.js.map +1 -1
  28. package/dist/providers/openai.d.ts +10 -5
  29. package/dist/providers/openai.d.ts.map +1 -1
  30. package/dist/providers/openai.js +44 -32
  31. package/dist/providers/openai.js.map +1 -1
  32. package/dist/providers/shared.d.ts.map +1 -1
  33. package/dist/providers/shared.js +5 -2
  34. package/dist/providers/shared.js.map +1 -1
  35. package/dist/providers.d.ts +10 -0
  36. package/dist/providers.d.ts.map +1 -1
  37. package/dist/providers.js +51 -14
  38. package/dist/providers.js.map +1 -1
  39. package/dist/suggestions.d.ts +9 -0
  40. package/dist/suggestions.d.ts.map +1 -0
  41. package/dist/suggestions.js +54 -0
  42. package/dist/suggestions.js.map +1 -0
  43. package/dist/types.d.ts +11 -2
  44. package/dist/types.d.ts.map +1 -1
  45. package/dist/types.js +2 -1
  46. package/dist/util.d.ts +1 -1
  47. package/dist/util.d.ts.map +1 -1
  48. package/dist/util.js +86 -31
  49. package/dist/util.js.map +1 -1
  50. package/dist/web/client/assets/index-207192fc.css +1 -0
  51. package/dist/web/client/assets/index-8751749f.js +172 -0
  52. package/dist/web/client/index.html +2 -2
  53. package/dist/web/server.js +38 -31
  54. package/dist/web/server.js.map +1 -1
  55. package/package.json +14 -4
  56. package/src/assertions.ts +154 -0
  57. package/src/esm.ts +5 -2
  58. package/src/evaluator.ts +61 -139
  59. package/src/index.ts +12 -0
  60. package/src/main.ts +28 -3
  61. package/src/prompts.ts +9 -0
  62. package/src/providers/openai.ts +28 -15
  63. package/src/providers/shared.ts +1 -1
  64. package/src/providers.ts +8 -0
  65. package/src/suggestions.ts +63 -0
  66. package/src/types.ts +14 -2
  67. package/src/util.ts +24 -3
  68. package/src/web/client/package.json +1 -0
  69. package/src/web/client/src/App.css +4 -0
  70. package/src/web/client/src/App.tsx +29 -5
  71. package/src/web/client/src/Logo.css +5 -0
  72. package/src/web/client/src/NavBar.css +18 -0
  73. package/src/web/client/src/NavBar.tsx +12 -1
  74. package/src/web/client/src/index.css +10 -0
  75. package/src/web/server.ts +2 -2
  76. package/dist/web/client/assets/index-710f1308.css +0 -1
  77. package/dist/web/client/assets/index-900b20c0.js +0 -172
package/README.md CHANGED
@@ -14,7 +14,7 @@ With promptfoo, you can:
  - Use as a command line tool, or integrate into your workflow as a library
  - Use OpenAI models, open-source models like Llama and Vicuna, or integrate custom API providers for any LLM API
 
- **» [View docs on website](https://promptfoo.dev/docs/intro) «**
+ # [» View full documentation «](https://promptfoo.dev/docs/intro)
 
  promptfoo produces matrix views that allow you to quickly review prompt outputs across many inputs. The goal: tune prompts systematically across all relevant test cases, instead of testing prompts by trial and error.
 
@@ -41,19 +41,20 @@ After editing the prompts and variables to your liking, run the eval command to
  npx promptfoo eval
  ```
 
- If you're looking to customize your usage, you have the full set of parameters at your disposal:
+ If you're looking to customize your usage, you have a wide set of parameters at your disposal. See the [Configuration docs](https://www.promptfoo.dev/docs/configuration/parameters) for more detail:
 
- ```bash
- npx promptfoo eval -p <prompt_paths...> -o <output_path> -r <providers> [-v <vars_path>] [-j <max_concurrency] [-c <config_path>] [--grader <grading_provider>]
- ```
-
- - `<prompt_paths...>`: Paths to prompt file(s)
- - `<output_path>`: Path to output CSV, JSON, YAML, or HTML file. Defaults to terminal output
- - `<providers>`: One or more of: `openai:<model_name>`, or filesystem path to custom API caller module
- - `<vars_path>` (optional): Path to CSV, JSON, or YAML file with prompt variables
- - `<max_concurrency>` (optional): Number of simultaneous API requests. Defaults to 4
- - `<config_path>` (optional): Path to configuration file
- - `<grading_provider>`: A provider that handles the grading process, if you are using [LLM grading](#expected-outputs)
+ | Option | Description |
+ | ----------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+ | `-p, --prompts <paths...>` | Paths to prompt files, directory, or glob |
+ | `-r, --providers <name or path...>` | One of: openai:chat, openai:completion, openai:model-name, localai:chat:model-name, localai:completion:model-name. See [API providers](https://www.promptfoo.dev/docs/configuration/providers) |
+ | `-o, --output <path>` | Path to output file (csv, json, yaml, html) |
+ | `-v, --vars <path>` | Path to file with prompt variables (csv, json, yaml) |
+ | `-c, --config <path>` | Path to configuration file. `promptfooconfig.js[on]` is automatically loaded if present |
+ | `-j, --max-concurrency <number>` | Maximum number of concurrent API calls |
+ | `--table-cell-max-length <number>` | Truncate console table cells to this length |
+ | `--prompt-prefix <path>` | This prefix is prepended to every prompt |
+ | `--prompt-suffix <path>` | This suffix is appended to every prompt |
+ | `--grader` | Provider that will grade outputs, if you are using [LLM grading](https://www.promptfoo.dev/docs/configuration/expected-outputs) |
 
  After running an eval, you may optionally use the `view` command to open the web viewer:
 
@@ -174,247 +175,16 @@ This code imports the `promptfoo` library, defines the evaluation options, and t
 
  ## Configuration
 
- ### Prompt Files
-
- Prompt files are plain text files that contain the prompts you want to test. If you have only one file, you can include multiple prompts in the file, separated by the delimiter `---`. If you have multiple files, each prompt should be in a separate file.
-
- You can use [Nunjucks](https://mozilla.github.io/nunjucks/) templating syntax to include variables in your prompts, which will be replaced with actual values from the `vars` CSV file during evaluation.
-
- Example of a single prompt file with multiple prompts (`prompts.txt`):
-
- ```
- Translate the following text to French: "{{name}}: {{text}}"
- ---
- Translate the following text to German: "{{name}}: {{text}}"
- ```
-
- Example of multiple prompt files:
-
- - `prompt1.txt`:
-
- ```
- Translate the following text to French: "{{name}}: {{text}}"
- ```
-
- - `prompt2.txt`:
-
- ```
- Translate the following text to German: "{{name}}: {{text}}"
- ```
-
- ### Vars File
-
- The Vars file is a CSV, JSON, or YAML file that contains the values for the variables used in the prompts. The first row of the CSV file should contain the variable names, and each subsequent row should contain the corresponding values for each test case.
-
- Vars are substituted by [Nunjucks](https://mozilla.github.io/nunjucks/) templating syntax into prompts.
-
- Example of a vars file (`vars.csv`):
-
- ```
- "name","text"
- "Bob","Hello, world!"
- "Joe","Goodbye, everyone!"
- ```
-
- Example of a vars file (`vars.json`):
-
- ```json
- [
- { "name": "Bob", "text": "Hello, world!" },
- { "name": "Joe", "text": "Goodbye, everyone!" }
- ]
- ```
-
- ### Expected Outputs
-
- You can specify an expected value for each test case to evaluate the success or failure of the model's output. To do this, add a special field called `__expected` in the `vars` file. The `__expected` field supports these types of value comparisons:
-
- 1. If the expected value starts with `eval:`, it will evaluate the contents as the body of a JavaScript function defined like: `function(output) { <eval> }`. The function should return a boolean value, where `true` indicates success and `false` indicates failure.
-
- 1. If the expected value starts with `similar:`, it will compare the semantic similarity of the expected and output values. For example, `similar: greetings, world!` is semantically similar to "Hello world" even though it's not an exact match.
-
- The `similar` directive uses cosine similarity, where 1.0 is the most similar and 0.0 is the least similar. Tune the similarity threshold by specifying `similar(0.8): ...` (passes only if similarity >= 0.8).
-
- The embedding model currently supported is OpenAI's `text-embedding-ada-002`. As a result, the `similar` directive requires the OPENAI_API_KEY environment variable to be set.
-
- 1. If the expected value starts with `grade:`, it will ask an LLM to evaluate whether the output meets the condition. For example, `grade: don't mention being an AI`. This option requires a provider name to be supplied to promptfoo via the `--grader` argument: `promptfoo --grader openai:gpt-4 ...`.
-
- 1. Otherwise, it attempts an exact string match comparison between the expected value and the model's output.
-
- Example of a vars file with the `__expected` field (`vars.csv`):
-
- ```
- text,__expected
- "Hello, world!","Bonjour le monde"
- "Goodbye, everyone!","eval:output.includes('Au revoir');"
- "I am a pineapple","grade:doesn't reference any fruits besides pineapple"
- ```
-
- Example of a vars file with the `__expected` field (`vars.json`):
-
- ```json
- [
- { "text": "Hello, world!", "__expected": "Bonjour le monde" },
- { "text": "Goodbye, everyone!", "__expected": "eval:output.includes('Au revoir');" }
- { "text": "I am a pineapple", "__expected": "grade:doesn't reference any fruits besides pineapple" }
- ]
- ```
-
- When the `__expected` field is provided, the success and failure statistics in the evaluation summary will be based on whether the expected criteria are met.
-
- For more advanced test cases, we recommend using a testing framework like [Jest](https://jestjs.io/) or [Mocha](https://mochajs.org/) and using promptfoo as a library.
-
- ### Output File
-
- The results of the evaluation are written to this file. Each record in the output file corresponds to a test case and includes the original prompt, the output generated by the LLM, and the values of the variables used in the test case.
-
- For example outputs, see the [examples/](https://github.com/typpo/promptfoo/tree/main/examples) directory.
-
- ### Configuration File
-
- You can specify any option in a configuration file (e.g., `.promptfoorc`, `promptfoo.config.json`). This can help you avoid repetitive command-line options and simplify the CLI invocation.
-
- Example of a configuration file (`promptfoo.config.json`):
-
- ```json
- {
- "provider": "openai:chat",
- "vars": "/path/to/vars.csv"
- }
- ```
+ - **[Setting up an eval](https://promptfoo.dev/docs/configuration/parameters)**: Learn more about how to set up prompt files, vars file, output, etc.
+ - **[Configuring test cases](https://promptfoo.dev/docs/configuration/expected-outputs)**: Learn more about how to configure expected outputs and test assertions.
 
  ## Installation
 
- 1. Clone the repository:
-
- ```bash
- git clone https://github.com/typpo/promptfoo.git
- ```
-
- 2. Install the dependencies:
-
- ```bash
- npm install
- ```
-
- 3. Link the CLI tool:
-
- ```bash
- npm link
- ```
-
- 4. Build:
-
- ```bash
- npm run build
- ```
-
- 5. Make the entrypoint executable:
-
- ```bash
- chmod +x dist/main.js
- ```
+ See **[installation docs](https://promptfoo.dev/docs/installation)**
 
  ## API Providers
 
- `promptfoo` supports OpenAI API models out of the box. To use a custom API provider, create a custom module that implements the `ApiProvider` interface and pass the path to the module as the `provider` option.
-
- ### OpenAI API
-
- To use the OpenAI API, set the `OPENAI_API_KEY` environment variable or pass the API key as an argument to the constructor.
-
- Example:
-
- ```bash
- export OPENAI_API_KEY=your_api_key_here
- ```
-
- Other OpenAI-related environment variables are supported:
-
- - `OPENAI_TEMPERATURE` - temperature model parameter, defaults to 0
- - `OPENAI_MAX_TOKENS` - max_tokens model parameter, defaults to 1024
- - `OPENAI_STOP` - stopwords in JSON format, defaults to []
- - `OPENAI_API_HOST` - override the hostname for the API request. Useful for proxies like Helicone.
- - `REQUEST_TIMEOUT_MS` - maximum request time, in milliseconds. Defaults to 60000.
-
- The OpenAI provider supports the following model formats:
-
- - `openai:chat` - defaults to gpt-3.5-turbo
- - `openai:completion` - defaults to `text-davinci-003`
- - `openai:<model name>` - uses a specific model name (mapped automatically to chat or completion endpoint)
- - `openai:chat:<model name>` - uses any model name against the chat endpoint
- - `openai:completion:<model name>` - uses any model name against the completion endpoint
-
- The `openai:<endpoint>:<model>` construction is useful if OpenAI releases a new model, or if you have a custom model. For example, if OpenAI releases gpt-5 chat completion, you could begin using it immediately with `openai:chat:gpt-5`.
-
- ### LocalAI
-
- LocalAI is an API wrapper for open-source LLMs that is compatible with OpenAI. You can run LocalAI for compatibility with Llama, Alpaca, Vicuna, GPT4All, RedPajama, and many other models compatible with the ggml format.
-
- View all compatible models [here](https://github.com/go-skynet/LocalAI#model-compatibility-table).
-
- Once you have LocalAI up and running, specify one of the following based on the model you have selected:
-
- - `localai:chat:<model name>`
- - `localai:completion:<model name>`
- - `localai:<model name>` - defaults to chat-type model
-
- The model name is typically the filename of the .bin file that you downloaded to set up the model in LocalAI. For example, `ggml-vic13b-uncensored-q5_1.bin`
-
- Supported environment variables:
-
- - `LOCALAI_BASE_URL` - defaults to `http://localhost:8080/v1`
- - `REQUEST_TIMEOUT_MS` - maximum request time, in milliseconds. Defaults to 60000.
-
- ### Custom API Provider
-
- To create a custom API provider, implement the `ApiProvider` interface in a separate module. Here is the interface:
-
- ```js
- export interface ApiProvider {
- id: () => string;
- callApi: (prompt: string) => Promise<ProviderResult>;
- }
- ```
-
- Below is an example of a custom API provider that returns a predefined output and token usage:
-
- ```javascript
- // customApiProvider.js
- import fetch from 'node-fetch';
-
- class CustomApiProvider {
- id() {
- return 'my-custom-api';
- }
-
- async callApi(prompt) {
- // Add your custom API logic here
-
- return {
- // Required
- output: 'Model output',
-
- // Optional
- tokenUsage: {
- total: 10,
- prompt: 5,
- completion: 5,
- },
- };
- }
- }
-
- export default CustomApiProvider;
- ```
-
- To use the custom API provider with `promptfoo`, pass the path to the module as the `provider` option in the CLI invocation:
-
- ```bash
- promptfoo eval -p prompt1.txt prompt2.txt -o results.csv -v vars.csv -r ./customApiProvider.js
- ```
-
- This command will evaluate the prompts using the custom API provider and save the results to the specified CSV file.
+ We support OpenAI's API as well as a number of open-source models. It's also possible to set up your own custom API provider. **[See Provider documentation](https://promptfoo.dev/docs/configuration/providers)** for more details.
 
  ## Development
 
@@ -428,3 +198,5 @@ Here are some of the available scripts:
  - `build:watch`: Continuously watch and transpile TypeScript files on changes
  - `test`: Run test suite
  - `test:watch`: Continuously run test suite on changes
+
+ # [» View full documentation «](https://promptfoo.dev/docs/intro)
package/dist/__mocks__/esm.js CHANGED
@@ -1,4 +1,8 @@
- export function getDirectory() {
+ "use strict";
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.getDirectory = void 0;
+ function getDirectory() {
  return '/test/dir';
  }
+ exports.getDirectory = getDirectory;
  //# sourceMappingURL=esm.js.map
package/dist/__mocks__/esm.js.map CHANGED
@@ -1 +1 @@
- {"version":3,"file":"esm.js","sourceRoot":"","sources":["../../src/__mocks__/esm.ts"],"names":[],"mappings":"AAAA,MAAM,UAAU,YAAY;IAC1B,OAAO,WAAW,CAAC;AACrB,CAAC"}
+ {"version":3,"file":"esm.js","sourceRoot":"","sources":["../../src/__mocks__/esm.ts"],"names":[],"mappings":";;;AAAA,SAAgB,YAAY;IAC1B,OAAO,WAAW,CAAC;AACrB,CAAC;AAFD,oCAEC"}
package/dist/assertions.d.ts CHANGED
@@ -0,0 +1,18 @@
+ import type { EvaluateOptions, GradingConfig, TokenUsage } from './types.js';
+ interface GradingResult {
+ pass: boolean;
+ reason: string;
+ tokensUsed: TokenUsage;
+ }
+ export declare function matchesExpectedValue(expected: string, output: string, options: EvaluateOptions): Promise<{
+ pass: boolean;
+ reason?: string;
+ }>;
+ export declare function matchesSimilarity(expected: string, output: string, threshold: number): Promise<GradingResult>;
+ export declare function matchesLlmRubric(expected: string, output: string, options?: GradingConfig): Promise<GradingResult>;
+ declare const _default: {
+ matchesSimilarity: typeof matchesSimilarity;
+ matchesLlmRubric: typeof matchesLlmRubric;
+ };
+ export default _default;
+ //# sourceMappingURL=assertions.d.ts.map
package/dist/assertions.d.ts.map CHANGED
@@ -0,0 +1 @@
+ {"version":3,"file":"assertions.d.ts","sourceRoot":"","sources":["../src/assertions.ts"],"names":[],"mappings":"AAOA,OAAO,KAAK,EAAE,eAAe,EAAE,aAAa,EAAE,UAAU,EAAE,MAAM,YAAY,CAAC;AAE7E,UAAU,aAAa;IACrB,IAAI,EAAE,OAAO,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;IACf,UAAU,EAAE,UAAU,CAAC;CACxB;AAMD,wBAAsB,oBAAoB,CACxC,QAAQ,EAAE,MAAM,EAChB,MAAM,EAAE,MAAM,EACd,OAAO,EAAE,eAAe,GACvB,OAAO,CAAC;IAAE,IAAI,EAAE,OAAO,CAAC;IAAC,MAAM,CAAC,EAAE,MAAM,CAAA;CAAE,CAAC,CAuB7C;AAED,wBAAsB,iBAAiB,CACrC,QAAQ,EAAE,MAAM,EAChB,MAAM,EAAE,MAAM,EACd,SAAS,EAAE,MAAM,GAChB,OAAO,CAAC,aAAa,CAAC,CA0CxB;AAED,wBAAsB,gBAAgB,CACpC,QAAQ,EAAE,MAAM,EAChB,MAAM,EAAE,MAAM,EACd,OAAO,CAAC,EAAE,aAAa,GACtB,OAAO,CAAC,aAAa,CAAC,CAgDxB;;;;;AAED,wBAGE"}
package/dist/assertions.js CHANGED
@@ -0,0 +1,128 @@
+ "use strict";
+ var __importDefault = (this && this.__importDefault) || function (mod) {
+ return (mod && mod.__esModule) ? mod : { "default": mod };
+ };
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.matchesLlmRubric = exports.matchesSimilarity = exports.matchesExpectedValue = void 0;
+ const nunjucks_1 = __importDefault(require("nunjucks"));
+ const openai_js_1 = require("./providers/openai.js");
+ const util_js_1 = require("./util.js");
+ const providers_js_1 = require("./providers.js");
+ const prompts_js_1 = require("./prompts.js");
+ const SIMILAR_REGEX = /similar(?::|\((\d+(\.\d+)?)\):)/;
+ const DEFAULT_SEMANTIC_SIMILARITY_THRESHOLD = 0.8;
+ async function matchesExpectedValue(expected, output, options) {
+ const match = expected.match(SIMILAR_REGEX);
+ if (match) {
+ const threshold = parseFloat(match[1]) || DEFAULT_SEMANTIC_SIMILARITY_THRESHOLD;
+ const rest = expected.replace(SIMILAR_REGEX, '').trim();
+ return matchesSimilarity(rest, output, threshold);
+ }
+ else if (expected.startsWith('fn:') || expected.startsWith('eval:')) {
+ // TODO(1.0): delete eval: legacy option
+ const sliceLength = expected.startsWith('fn:') ? 'fn:'.length : 'eval:'.length;
+ const functionBody = expected.slice(sliceLength);
+ const customFunction = new Function('output', `return ${functionBody}`);
+ return { pass: customFunction(output) };
+ }
+ else if (expected.startsWith('grade:')) {
+ return matchesLlmRubric(expected.slice(6), output, options.grading);
+ }
+ else {
+ const pass = expected === output;
+ return {
+ pass,
+ reason: pass ? undefined : `Expected: ${expected}, Output: ${output}`,
+ };
+ }
+ }
+ exports.matchesExpectedValue = matchesExpectedValue;
+ async function matchesSimilarity(expected, output, threshold) {
+ const expectedEmbedding = await openai_js_1.DefaultEmbeddingProvider.callEmbeddingApi(expected);
+ const outputEmbedding = await openai_js_1.DefaultEmbeddingProvider.callEmbeddingApi(output);
+ const tokensUsed = {
+ total: (expectedEmbedding.tokenUsage?.total || 0) + (outputEmbedding.tokenUsage?.total || 0),
+ prompt: (expectedEmbedding.tokenUsage?.prompt || 0) + (outputEmbedding.tokenUsage?.prompt || 0),
+ completion: (expectedEmbedding.tokenUsage?.completion || 0) +
+ (outputEmbedding.tokenUsage?.completion || 0),
+ };
+ if (expectedEmbedding.error || outputEmbedding.error) {
+ return {
+ pass: false,
+ reason: expectedEmbedding.error || outputEmbedding.error || 'Unknown error fetching embeddings',
+ tokensUsed,
+ };
+ }
+ if (!expectedEmbedding.embedding || !outputEmbedding.embedding) {
+ return {
+ pass: false,
+ reason: 'Embedding not found',
+ tokensUsed,
+ };
+ }
+ const similarity = (0, util_js_1.cosineSimilarity)(expectedEmbedding.embedding, outputEmbedding.embedding);
+ if (similarity < threshold) {
+ return {
+ pass: false,
+ reason: `Similarity ${similarity} is less than threshold ${threshold}`,
+ tokensUsed,
+ };
+ }
+ return {
+ pass: true,
+ reason: `Similarity ${similarity} is greater than threshold ${threshold}`,
+ tokensUsed,
+ };
+ }
+ exports.matchesSimilarity = matchesSimilarity;
+ async function matchesLlmRubric(expected, output, options) {
+ if (!options) {
+ throw new Error('Cannot grade output without grading config. Specify --grader option or grading config.');
+ }
+ const prompt = nunjucks_1.default.renderString(options.prompt || prompts_js_1.DEFAULT_GRADING_PROMPT, {
+ content: output,
+ rubric: expected,
+ });
+ let provider = options.provider || openai_js_1.DefaultGradingProvider;
+ if (typeof provider === 'string') {
+ provider = await (0, providers_js_1.loadApiProvider)(provider);
+ }
+ const resp = await provider.callApi(prompt);
+ if (resp.error || !resp.output) {
+ return {
+ pass: false,
+ reason: resp.error || 'No output',
+ tokensUsed: {
+ total: resp.tokenUsage?.total || 0,
+ prompt: resp.tokenUsage?.prompt || 0,
+ completion: resp.tokenUsage?.completion || 0,
+ },
+ };
+ }
+ try {
+ const parsed = JSON.parse(resp.output);
+ parsed.tokensUsed = {
+ total: resp.tokenUsage?.total || 0,
+ prompt: resp.tokenUsage?.prompt || 0,
+ completion: resp.tokenUsage?.completion || 0,
+ };
+ return parsed;
+ }
+ catch (err) {
+ return {
+ pass: false,
+ reason: `Output is not valid JSON: ${resp.output}`,
+ tokensUsed: {
+ total: resp.tokenUsage?.total || 0,
+ prompt: resp.tokenUsage?.prompt || 0,
+ completion: resp.tokenUsage?.completion || 0,
+ },
+ };
+ }
+ }
+ exports.matchesLlmRubric = matchesLlmRubric;
+ exports.default = {
+ matchesSimilarity,
+ matchesLlmRubric,
+ };
+ //# sourceMappingURL=assertions.js.map
package/dist/assertions.js.map CHANGED
@@ -0,0 +1 @@
+ {"version":3,"file":"assertions.js","sourceRoot":"","sources":["../src/assertions.ts"],"names":[],"mappings":";;;;;;AAAA,wDAAgC;AAEhC,qDAAyF;AACzF,uCAA6C;AAC7C,iDAAiD;AACjD,6CAAsD;AAUtD,MAAM,aAAa,GAAG,iCAAiC,CAAC;AAExD,MAAM,qCAAqC,GAAG,GAAG,CAAC;AAE3C,KAAK,UAAU,oBAAoB,CACxC,QAAgB,EAChB,MAAc,EACd,OAAwB;IAExB,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC;IAE5C,IAAI,KAAK,EAAE;QACT,MAAM,SAAS,GAAG,UAAU,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,qCAAqC,CAAC;QAChF,MAAM,IAAI,GAAG,QAAQ,CAAC,OAAO,CAAC,aAAa,EAAE,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;QACxD,OAAO,iBAAiB,CAAC,IAAI,EAAE,MAAM,EAAE,SAAS,CAAC,CAAC;KACnD;SAAM,IAAI,QAAQ,CAAC,UAAU,CAAC,KAAK,CAAC,IAAI,QAAQ,CAAC,UAAU,CAAC,OAAO,CAAC,EAAE;QACrE,wCAAwC;QACxC,MAAM,WAAW,GAAG,QAAQ,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,OAAO,CAAC,MAAM,CAAC;QAC/E,MAAM,YAAY,GAAG,QAAQ,CAAC,KAAK,CAAC,WAAW,CAAC,CAAC;QAEjD,MAAM,cAAc,GAAG,IAAI,QAAQ,CAAC,QAAQ,EAAE,UAAU,YAAY,EAAE,CAAC,CAAC;QACxE,OAAO,EAAE,IAAI,EAAE,cAAc,CAAC,MAAM,CAAC,EAAE,CAAC;KACzC;SAAM,IAAI,QAAQ,CAAC,UAAU,CAAC,QAAQ,CAAC,EAAE;QACxC,OAAO,gBAAgB,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,MAAM,EAAE,OAAO,CAAC,OAAO,CAAC,CAAC;KACrE;SAAM;QACL,MAAM,IAAI,GAAG,QAAQ,KAAK,MAAM,CAAC;QACjC,OAAO;YACL,IAAI;YACJ,MAAM,EAAE,IAAI,CAAC,CAAC,CAAC,SAAS,CAAC,CAAC,CAAC,aAAa,QAAQ,aAAa,MAAM,EAAE;SACtE,CAAC;KACH;AACH,CAAC;AA3BD,oDA2BC;AAEM,KAAK,UAAU,iBAAiB,CACrC,QAAgB,EAChB,MAAc,EACd,SAAiB;IAEjB,MAAM,iBAAiB,GAAG,MAAM,oCAAwB,CAAC,gBAAgB,CAAC,QAAQ,CAAC,CAAC;IACpF,MAAM,eAAe,GAAG,MAAM,oCAAwB,CAAC,gBAAgB,CAAC,MAAM,CAAC,CAAC;IAEhF,MAAM,UAAU,GAAG;QACjB,KAAK,EAAE,CAAC,iBAAiB,CAAC,UAAU,EAAE,KAAK,IAAI,CAAC,CAAC,GAAG,CAAC,eAAe,CAAC,UAAU,EAAE,KAAK,IAAI,CAAC,CAAC;QAC5F,MAAM,EAAE,CAAC,iBAAiB,CAAC,UAAU,EAAE,MAAM,IAAI,CAAC,CAAC,GAAG,CAAC,eAAe,CAAC,UAAU,EAAE,MAAM,IAAI,CAAC,CAAC;QAC/F,UAAU,EACR,CAAC,iBAAiB,CAAC,UAAU,EAAE,UAAU,IAAI,CAAC,CAAC;YAC/C,CAAC,eAAe,CAAC,UAAU,EAAE,UAAU,IAAI,CAAC,CAAC;KAChD,CAAC;IAEF,IAAI,iBAAiB,CAAC,KAAK,IAAI,eAAe,CAAC,KAAK,EAAE;QACpD,OAAO;YACL,IAAI,EAAE,KAAK;YACX,MAAM,EACJ,iBAAiB,CAAC,KAAK,IAAI,eAAe,CAAC,KAAK,IAAI,mCAAmC;YACzF,UAAU;SACX,CAAC;KACH;IAED,IAAI,CAAC,iBAAiB,CAAC,SAAS,IAAI,CAAC,eAAe,CAAC,SAAS,EAAE;QAC9D,OAAO;YACL,IAAI,EAAE,KAAK;YACX,MAAM,EAAE,qBAAqB;YAC7B,UAAU;SACX,CAAC;KACH;IAED,MAAM,UAAU,GAAG,IAAA,0BAAgB,EAAC,iBAAiB,CAAC,SAAS,EAAE,eAAe,CAAC,SAAS,CAAC,CAAC;IAC5F,IAAI,UAAU,GAAG,SAAS,EAAE;QAC1B,OAAO;YACL,IAAI,EAAE,KAAK;YACX,MAAM,EAAE,cAAc,UAAU,2BAA2B,SAAS,EAAE;YACtE,UAAU;SACX,CAAC;KACH;IACD,OAAO;QACL,IAAI,EAAE,IAAI;QACV,MAAM,EAAE,cAAc,UAAU,8BAA8B,SAAS,EAAE;QACzE,UAAU;KACX,CAAC;AACJ,CAAC;AA9CD,8CA8CC;AAEM,KAAK,UAAU,gBAAgB,CACpC,QAAgB,EAChB,MAAc,EACd,OAAuB;IAEvB,IAAI,CAAC,OAAO,EAAE;QACZ,MAAM,IAAI,KAAK,CACb,wFAAwF,CACzF,CAAC;KACH;IAED,MAAM,MAAM,GAAG,kBAAQ,CAAC,YAAY,CAAC,OAAO,CAAC,MAAM,IAAI,mCAAsB,EAAE;QAC7E,OAAO,EAAE,MAAM;QACf,MAAM,EAAE,QAAQ;KACjB,CAAC,CAAC;IAEH,IAAI,QAAQ,GAAG,OAAO,CAAC,QAAQ,IAAI,kCAAsB,CAAC;IAC1D,IAAI,OAAO,QAAQ,KAAK,QAAQ,EAAE;QAChC,QAAQ,GAAG,MAAM,IAAA,8BAAe,EAAC,QAAQ,CAAC,CAAC;KAC5C;IACD,MAAM,IAAI,GAAG,MAAM,QAAQ,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC;IAC5C,IAAI,IAAI,CAAC,KAAK,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE;QAC9B,OAAO;YACL,IAAI,EAAE,KAAK;YACX,MAAM,EAAE,IAAI,CAAC,KAAK,IAAI,WAAW;YACjC,UAAU,EAAE;gBACV,KAAK,EAAE,IAAI,CAAC,UAAU,EAAE,KAAK,IAAI,CAAC;gBAClC,MAAM,EAAE,IAAI,CAAC,UAAU,EAAE,MAAM,IAAI,CAAC;gBACpC,UAAU,EAAE,IAAI,CAAC,UAAU,EAAE,UAAU,IAAI,CAAC;aAC7C;SACF,CAAC;KACH;IAED,IAAI;QACF,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,CAAkB,CAAC;QACxD,MAAM,CAAC,UAAU,GAAG;YAClB,KAAK,EAAE,IAAI,CAAC,UAAU,EAAE,KAAK,IAAI,CAAC;YAClC,MAAM,EAAE,IAAI,CAAC,UAAU,EAAE,MAAM,IAAI,CAAC;YACpC,UAAU,EAAE,IAAI,CAAC,UAAU,EAAE,
UAAU,IAAI,CAAC;SAC7C,CAAC;QACF,OAAO,MAAM,CAAC;KACf;IAAC,OAAO,GAAG,EAAE;QACZ,OAAO;YACL,IAAI,EAAE,KAAK;YACX,MAAM,EAAE,6BAA6B,IAAI,CAAC,MAAM,EAAE;YAClD,UAAU,EAAE;gBACV,KAAK,EAAE,IAAI,CAAC,UAAU,EAAE,KAAK,IAAI,CAAC;gBAClC,MAAM,EAAE,IAAI,CAAC,UAAU,EAAE,MAAM,IAAI,CAAC;gBACpC,UAAU,EAAE,IAAI,CAAC,UAAU,EAAE,UAAU,IAAI,CAAC;aAC7C;SACF,CAAC;KACH;AACH,CAAC;AApDD,4CAoDC;AAED,kBAAe;IACb,iBAAiB;IACjB,gBAAgB;CACjB,CAAC"}
package/dist/esm.d.ts.map CHANGED
@@ -1 +1 @@
- {"version":3,"file":"esm.d.ts","sourceRoot":"","sources":["../src/esm.ts"],"names":[],"mappings":"AAKA,wBAAgB,YAAY,IAAI,MAAM,CAIrC"}
+ {"version":3,"file":"esm.d.ts","sourceRoot":"","sources":["../src/esm.ts"],"names":[],"mappings":"AAKA,wBAAgB,YAAY,IAAI,MAAM,CAOrC"}
package/dist/esm.js CHANGED
@@ -1,9 +1,16 @@
+ "use strict";
  // esm-specific crap that needs to get mocked out in tests
- import path from 'path';
- import { fileURLToPath } from 'url';
- export function getDirectory() {
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.getDirectory = void 0;
+ //import path from 'path';
+ //import { fileURLToPath } from 'url';
+ function getDirectory() {
+ /*
  // @ts-ignore: Jest chokes on this
  const __filename = fileURLToPath(import.meta.url);
  return path.dirname(__filename);
+ */
+ return __dirname;
  }
+ exports.getDirectory = getDirectory;
  //# sourceMappingURL=esm.js.map
package/dist/esm.js.map CHANGED
@@ -1 +1 @@
- {"version":3,"file":"esm.js","sourceRoot":"","sources":["../src/esm.ts"],"names":[],"mappings":"AAAA,0DAA0D;AAE1D,OAAO,IAAI,MAAM,MAAM,CAAC;AACxB,OAAO,EAAE,aAAa,EAAE,MAAM,KAAK,CAAC;AAEpC,MAAM,UAAU,YAAY;IAC1B,kCAAkC;IAClC,MAAM,UAAU,GAAG,aAAa,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IAClD,OAAO,IAAI,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC;AAClC,CAAC"}
+ {"version":3,"file":"esm.js","sourceRoot":"","sources":["../src/esm.ts"],"names":[],"mappings":";AAAA,0DAA0D;;;AAE1D,0BAA0B;AAC1B,sCAAsC;AAEtC,SAAgB,YAAY;IAC1B;;;;KAIC;IACD,OAAO,SAAS,CAAC;AACnB,CAAC;AAPD,oCAOC"}
package/dist/evaluator.d.ts.map CHANGED
@@ -1 +1 @@
- {"version":3,"file":"evaluator.d.ts","sourceRoot":"","sources":["../src/evaluator.ts"],"names":[],"mappings":"AAOA,OAAO,KAAK,EAEV,eAAe,EAGf,eAAe,EAIhB,MAAM,YAAY,CAAC;AAiWpB,wBAAgB,QAAQ,CAAC,OAAO,EAAE,eAAe,4BAGhD"}
+ {"version":3,"file":"evaluator.d.ts","sourceRoot":"","sources":["../src/evaluator.ts"],"names":[],"mappings":"AAUA,OAAO,KAAK,EAEV,eAAe,EAGf,eAAe,EAGhB,MAAM,YAAY,CAAC;AAiRpB,wBAAgB,QAAQ,CAAC,OAAO,EAAE,eAAe,4BAGhD"}