promptfoo 0.103.0 → 0.103.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +36 -427
- package/dist/package.json +1 -1
- package/dist/src/app/assets/{index-Vk7H3M29.css → index-DdUNCsxz.css} +1 -1
- package/dist/src/app/assets/{index-hVGk-Oul.js → index-ziw_4_A9.js} +238 -238
- package/dist/src/app/assets/{index.es-CcK3JjZn.js → index.es-XehlSHxK.js} +1 -1
- package/dist/src/app/assets/{sync-BaigR5eq.js → sync-DDIaa9Ut.js} +1 -1
- package/dist/src/app/index.html +2 -2
- package/dist/src/cache.d.ts +1 -1
- package/dist/src/cache.d.ts.map +1 -1
- package/dist/src/cache.js +3 -3
- package/dist/src/cache.js.map +1 -1
- package/dist/src/envars.d.ts +6 -1
- package/dist/src/envars.d.ts.map +1 -1
- package/dist/src/envars.js.map +1 -1
- package/dist/src/esm.d.ts.map +1 -1
- package/dist/src/esm.js +4 -1
- package/dist/src/esm.js.map +1 -1
- package/dist/src/fetch.d.ts.map +1 -1
- package/dist/src/fetch.js +8 -5
- package/dist/src/fetch.js.map +1 -1
- package/dist/src/matchers.d.ts.map +1 -1
- package/dist/src/matchers.js +7 -0
- package/dist/src/matchers.js.map +1 -1
- package/dist/src/models/eval.d.ts.map +1 -1
- package/dist/src/models/eval.js +9 -1
- package/dist/src/models/eval.js.map +1 -1
- package/dist/src/models/evalResult.d.ts.map +1 -1
- package/dist/src/models/evalResult.js +30 -6
- package/dist/src/models/evalResult.js.map +1 -1
- package/dist/src/providers/bedrock.d.ts.map +1 -1
- package/dist/src/providers/bedrock.js +15 -22
- package/dist/src/providers/bedrock.js.map +1 -1
- package/dist/src/providers/http.d.ts +4 -1
- package/dist/src/providers/http.d.ts.map +1 -1
- package/dist/src/providers/http.js +4 -3
- package/dist/src/providers/http.js.map +1 -1
- package/dist/src/providers/promptfoo.d.ts.map +1 -1
- package/dist/src/providers/promptfoo.js +3 -3
- package/dist/src/providers/promptfoo.js.map +1 -1
- package/dist/src/redteam/commands/generate.d.ts.map +1 -1
- package/dist/src/redteam/commands/generate.js +2 -0
- package/dist/src/redteam/commands/generate.js.map +1 -1
- package/dist/src/redteam/commands/run.d.ts.map +1 -1
- package/dist/src/redteam/commands/run.js +6 -0
- package/dist/src/redteam/commands/run.js.map +1 -1
- package/dist/src/redteam/constants.d.ts +1 -1
- package/dist/src/redteam/constants.d.ts.map +1 -1
- package/dist/src/redteam/constants.js +7 -0
- package/dist/src/redteam/constants.js.map +1 -1
- package/dist/src/redteam/graders.d.ts.map +1 -1
- package/dist/src/redteam/graders.js +2 -0
- package/dist/src/redteam/graders.js.map +1 -1
- package/dist/src/redteam/plugins/base.d.ts.map +1 -1
- package/dist/src/redteam/plugins/base.js +7 -0
- package/dist/src/redteam/plugins/base.js.map +1 -1
- package/dist/src/redteam/plugins/divergentRepetition.d.ts +16 -0
- package/dist/src/redteam/plugins/divergentRepetition.d.ts.map +1 -0
- package/dist/src/redteam/plugins/divergentRepetition.js +133 -0
- package/dist/src/redteam/plugins/divergentRepetition.js.map +1 -0
- package/dist/src/redteam/plugins/index.d.ts.map +1 -1
- package/dist/src/redteam/plugins/index.js +2 -0
- package/dist/src/redteam/plugins/index.js.map +1 -1
- package/dist/src/redteam/providers/iterative.d.ts +1 -1
- package/dist/src/redteam/providers/iterative.d.ts.map +1 -1
- package/dist/src/redteam/providers/iterative.js +6 -6
- package/dist/src/redteam/providers/iterative.js.map +1 -1
- package/dist/src/redteam/types.d.ts +2 -0
- package/dist/src/redteam/types.d.ts.map +1 -1
- package/dist/src/redteam/util.d.ts +1 -0
- package/dist/src/redteam/util.d.ts.map +1 -1
- package/dist/src/redteam/util.js +24 -19
- package/dist/src/redteam/util.js.map +1 -1
- package/dist/src/util/cloud.d.ts +2 -0
- package/dist/src/util/cloud.d.ts.map +1 -0
- package/dist/src/util/cloud.js +34 -0
- package/dist/src/util/cloud.js.map +1 -0
- package/dist/test/fetch.test.js +27 -1
- package/dist/test/fetch.test.js.map +1 -1
- package/dist/test/integrations/huggingfaceDatasets.test.js +1 -0
- package/dist/test/integrations/huggingfaceDatasets.test.js.map +1 -1
- package/dist/test/matchers.test.js +15 -0
- package/dist/test/matchers.test.js.map +1 -1
- package/dist/test/providers/bedrock.test.js +21 -0
- package/dist/test/providers/bedrock.test.js.map +1 -1
- package/dist/test/providers/http.test.js +54 -25
- package/dist/test/providers/http.test.js.map +1 -1
- package/dist/test/redteam/plugins/base.test.js +69 -0
- package/dist/test/redteam/plugins/base.test.js.map +1 -1
- package/dist/test/redteam/plugins/intent.test.js +16 -1
- package/dist/test/redteam/plugins/intent.test.js.map +1 -1
- package/dist/test/redteam/providers/iterative.test.js +48 -3
- package/dist/test/redteam/providers/iterative.test.js.map +1 -1
- package/dist/tsconfig.tsbuildinfo +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
#
|
|
1
|
+
# Promptfoo: LLM evals & red teaming
|
|
2
2
|
|
|
3
3
|
[](https://npmjs.com/package/promptfoo)
|
|
4
4
|
[](https://npmjs.com/package/promptfoo)
|
|
@@ -6,451 +6,60 @@
|
|
|
6
6
|

|
|
7
7
|
[](https://discord.gg/gHPS9jjfbs)
|
|
8
8
|
|
|
9
|
-
`promptfoo` is a tool for testing
|
|
9
|
+
`promptfoo` is a developer-friendly local tool for testing LLM applications. Stop the trial-and-error approach - start shipping secure, reliable AI apps.
|
|
10
10
|
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
- **Build reliable prompts, models, and RAGs** with benchmarks specific to your use-case
|
|
14
|
-
- **Secure your apps** with automated [red teaming](https://www.promptfoo.dev/docs/red-team/) and pentesting
|
|
15
|
-
- **Speed up evaluations** with caching, concurrency, and live reloading
|
|
16
|
-
- **Score outputs automatically** by defining [metrics](https://www.promptfoo.dev/docs/configuration/expected-outputs)
|
|
17
|
-
- Use as a [CLI](https://www.promptfoo.dev/docs/usage/command-line), [library](https://www.promptfoo.dev/docs/usage/node-package), or in [CI/CD](https://www.promptfoo.dev/docs/integrations/github-action)
|
|
18
|
-
- Use OpenAI, Anthropic, Azure, Google, HuggingFace, open-source models like Llama, or integrate custom API providers for [any LLM API](https://www.promptfoo.dev/docs/providers)
|
|
19
|
-
|
|
20
|
-
The goal: **test-driven LLM development** instead of trial-and-error.
|
|
11
|
+
## Quick Start
|
|
21
12
|
|
|
22
13
|
```sh
|
|
14
|
+
# Install and initialize project
|
|
23
15
|
npx promptfoo@latest init
|
|
24
|
-
```
|
|
25
|
-
|
|
26
|
-
# [» View full documentation «](https://www.promptfoo.dev/docs/intro)
|
|
27
|
-
|
|
28
|
-
promptfoo produces matrix views that let you quickly evaluate outputs across many prompts and inputs:
|
|
29
|
-
|
|
30
|
-

|
|
31
|
-
|
|
32
|
-
It works on the command line too:
|
|
33
|
-
|
|
34
|
-

|
|
35
|
-
|
|
36
|
-
It also produces high-level vulnerability and risk reports:
|
|
37
|
-
|
|
38
|
-

|
|
39
|
-
|
|
40
|
-
## Why choose promptfoo?
|
|
41
|
-
|
|
42
|
-
There are many different ways to evaluate prompts. Here are some reasons to consider promptfoo:
|
|
43
|
-
|
|
44
|
-
- **Developer friendly**: promptfoo is fast, with quality-of-life features like live reloads and caching.
|
|
45
|
-
- **Battle-tested**: Originally built for LLM apps serving over 10 million users in production. Our tooling is flexible and can be adapted to many setups.
|
|
46
|
-
- **Simple, declarative test cases**: Define evals without writing code or working with heavy notebooks.
|
|
47
|
-
- **Language agnostic**: Use Python, Javascript, or any other language.
|
|
48
|
-
- **Share & collaborate**: Built-in share functionality & web viewer for working with teammates.
|
|
49
|
-
- **Open-source**: LLM evals are a commodity and should be served by 100% open-source projects with no strings attached.
|
|
50
|
-
- **Private**: This software runs completely locally. The evals run on your machine and talk directly with the LLM.
|
|
51
|
-
|
|
52
|
-
## Workflow
|
|
53
|
-
|
|
54
|
-
Start by establishing a handful of test cases - core use cases and failure cases that you want to ensure your prompt can handle.
|
|
55
|
-
|
|
56
|
-
As you explore modifications to the prompt, use `promptfoo eval` to rate all outputs. This ensures the prompt is actually improving overall.
|
|
57
|
-
|
|
58
|
-
As you collect more examples and establish a user feedback loop, continue to build the pool of test cases.
|
|
59
|
-
|
|
60
|
-
<img width="772" alt="LLM ops" src="https://github.com/promptfoo/promptfoo/assets/310310/cf0461a7-2832-4362-9fbb-4ebd911d06ff">
|
|
61
|
-
|
|
62
|
-
## Usage - evals
|
|
63
|
-
|
|
64
|
-
To get started, run this command:
|
|
65
|
-
|
|
66
|
-
```sh
|
|
67
|
-
npx promptfoo@latest init
|
|
68
|
-
```
|
|
69
|
-
|
|
70
|
-
This will create a `promptfooconfig.yaml` placeholder in your current directory.
|
|
71
|
-
|
|
72
|
-
After editing the prompts and variables to your liking, run the eval command to kick off an evaluation:
|
|
73
|
-
|
|
74
|
-
```
|
|
75
|
-
npx promptfoo@latest eval
|
|
76
|
-
```
|
|
77
|
-
|
|
78
|
-
## Usage - red teaming/pentesting
|
|
79
|
-
|
|
80
|
-
Run this command:
|
|
81
|
-
|
|
82
|
-
```sh
|
|
83
|
-
npx promptfoo@latest redteam init
|
|
84
|
-
```
|
|
85
|
-
|
|
86
|
-
This will ask you questions about what types of vulnerabilities you want to find and walk you through running your first scan.
|
|
87
16
|
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
The YAML configuration format runs each prompt through a series of example inputs (aka "test case") and checks if they meet requirements (aka "assert").
|
|
91
|
-
|
|
92
|
-
See the [Configuration docs](https://www.promptfoo.dev/docs/configuration/guide) for a detailed guide.
|
|
93
|
-
|
|
94
|
-
```yaml
|
|
95
|
-
prompts:
|
|
96
|
-
- file://prompt1.txt
|
|
97
|
-
- file://prompt2.txt
|
|
98
|
-
providers:
|
|
99
|
-
- openai:gpt-4o-mini
|
|
100
|
-
- ollama:llama3.1:70b
|
|
101
|
-
tests:
|
|
102
|
-
- description: 'Test translation to French'
|
|
103
|
-
vars:
|
|
104
|
-
language: French
|
|
105
|
-
input: Hello world
|
|
106
|
-
assert:
|
|
107
|
-
- type: contains-json
|
|
108
|
-
- type: javascript
|
|
109
|
-
value: output.length < 100
|
|
110
|
-
|
|
111
|
-
- description: 'Test translation to German'
|
|
112
|
-
vars:
|
|
113
|
-
language: German
|
|
114
|
-
input: How's it going?
|
|
115
|
-
assert:
|
|
116
|
-
- type: llm-rubric
|
|
117
|
-
value: does not describe self as an AI, model, or chatbot
|
|
118
|
-
- type: similar
|
|
119
|
-
value: was geht
|
|
120
|
-
threshold: 0.6 # cosine similarity
|
|
17
|
+
# Run your first evaluation
|
|
18
|
+
npx promptfoo eval
|
|
121
19
|
```
|
|
122
20
|
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
See [Test assertions](https://www.promptfoo.dev/docs/configuration/expected-outputs) for full details.
|
|
126
|
-
|
|
127
|
-
Deterministic eval metrics
|
|
128
|
-
|
|
129
|
-
| Assertion Type | Returns true if... |
|
|
130
|
-
| ------------------------------- | ----------------------------------------------------------------- |
|
|
131
|
-
| `equals` | output matches exactly |
|
|
132
|
-
| `contains` | output contains substring |
|
|
133
|
-
| `icontains` | output contains substring, case insensitive |
|
|
134
|
-
| `regex` | output matches regex |
|
|
135
|
-
| `starts-with` | output starts with string |
|
|
136
|
-
| `contains-any` | output contains any of the listed substrings |
|
|
137
|
-
| `contains-all` | output contains all of the listed substrings |
|
|
138
|
-
| `icontains-any` | output contains any of the listed substrings, case insensitive |
|
|
139
|
-
| `icontains-all` | output contains all of the listed substrings, case insensitive |
|
|
140
|
-
| `is-json` | output is valid json (optional json schema validation) |
|
|
141
|
-
| `contains-json` | output contains valid json (optional json schema validation) |
|
|
142
|
-
| `is-sql` | output is valid sql |
|
|
143
|
-
| `contains-sql` | output contains valid sql |
|
|
144
|
-
| `is-xml` | output is valid xml |
|
|
145
|
-
| `contains-xml` | output contains valid xml |
|
|
146
|
-
| `is-refusal` | output indicates the model refused to perform the task |
|
|
147
|
-
| `javascript` | provided Javascript function validates the output |
|
|
148
|
-
| `python` | provided Python function validates the output |
|
|
149
|
-
| `webhook` | provided webhook returns `{pass: true}` |
|
|
150
|
-
| `rouge-n` | Rouge-N score is above a given threshold (default 0.75) |
|
|
151
|
-
| `bleu` | BLEU score is above a given threshold (default 0.5) |
|
|
152
|
-
| `levenshtein` | Levenshtein distance is below a threshold |
|
|
153
|
-
| `latency` | Latency is below a threshold (milliseconds) |
|
|
154
|
-
| `perplexity` | Perplexity is below a threshold |
|
|
155
|
-
| `perplexity-score` | Normalized perplexity |
|
|
156
|
-
| `cost` | Cost is below a threshold (for models with cost info such as GPT) |
|
|
157
|
-
| `is-valid-openai-function-call` | Ensure that the function call matches the function's JSON schema |
|
|
158
|
-
| `is-valid-openai-tools-call` | Ensure that all tool calls match the tools JSON schema |
|
|
159
|
-
| `assert-set` | Group assertions together with optional threshold |
|
|
160
|
-
|
|
161
|
-
Model-assisted eval metrics
|
|
162
|
-
|
|
163
|
-
| Assertion Type | Method |
|
|
164
|
-
| --------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------- |
|
|
165
|
-
| [similar](https://www.promptfoo.dev/docs/configuration/expected-outputs/similar) | Embeddings and cosine similarity are above a threshold |
|
|
166
|
-
| [classifier](https://www.promptfoo.dev/docs/configuration/expected-outputs/classifier) | Run LLM output through a classifier |
|
|
167
|
-
| [llm-rubric](https://www.promptfoo.dev/docs/configuration/expected-outputs/model-graded) | LLM output matches a given rubric, using a Language Model to grade output |
|
|
168
|
-
| [answer-relevance](https://www.promptfoo.dev/docs/configuration/expected-outputs/model-graded) | Ensure that LLM output is related to original query |
|
|
169
|
-
| [context-faithfulness](https://www.promptfoo.dev/docs/configuration/expected-outputs/model-graded) | Ensure that LLM output uses the context |
|
|
170
|
-
| [context-recall](https://www.promptfoo.dev/docs/configuration/expected-outputs/model-graded) | Ensure that ground truth appears in context |
|
|
171
|
-
| [context-relevance](https://www.promptfoo.dev/docs/configuration/expected-outputs/model-graded) | Ensure that context is relevant to original query |
|
|
172
|
-
| [factuality](https://www.promptfoo.dev/docs/configuration/expected-outputs/model-graded) | LLM output adheres to the given facts, using Factuality method from OpenAI eval |
|
|
173
|
-
| [model-graded-closedqa](https://www.promptfoo.dev/docs/configuration/expected-outputs/model-graded) | LLM output adheres to given criteria, using Closed QA method from OpenAI eval |
|
|
174
|
-
| [moderation](https://www.promptfoo.dev/docs/configuration/expected-outputs/moderation) | Make sure outputs are safe |
|
|
175
|
-
| [select-best](https://www.promptfoo.dev/docs/configuration/expected-outputs/model-graded) | Compare multiple outputs for a test case and pick the best one |
|
|
176
|
-
|
|
177
|
-
Every test type can be negated by prepending `not-`. For example, `not-equals` or `not-regex`.
|
|
178
|
-
|
|
179
|
-
### Tests from spreadsheet
|
|
180
|
-
|
|
181
|
-
Some people prefer to configure their LLM tests in a CSV. In that case, the config is pretty simple:
|
|
182
|
-
|
|
183
|
-
```yaml
|
|
184
|
-
prompts:
|
|
185
|
-
- file://prompts.txt
|
|
186
|
-
providers:
|
|
187
|
-
- openai:gpt-4o-mini
|
|
188
|
-
tests: file://tests.csv
|
|
189
|
-
```
|
|
190
|
-
|
|
191
|
-
See [example CSV](https://github.com/promptfoo/promptfoo/blob/main/examples/simple-test/tests.csv).
|
|
192
|
-
|
|
193
|
-
### Command-line
|
|
194
|
-
|
|
195
|
-
If you're looking to customize your usage, you have a wide set of parameters at your disposal.
|
|
196
|
-
|
|
197
|
-
| Option | Description |
|
|
198
|
-
| ----------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
|
199
|
-
| `-p, --prompts <paths...>` | Paths to [prompt files](https://www.promptfoo.dev/docs/configuration/parameters#prompts), directory, or glob |
|
|
200
|
-
| `-r, --providers <name or path...>` | One of: openai:chat, openai:completion, openai:model-name, localai:chat:model-name, localai:completion:model-name. See [API providers][providers-docs] |
|
|
201
|
-
| `-o, --output <path>` | Path to [output file](https://www.promptfoo.dev/docs/configuration/parameters#output-file) (csv, json, yaml, html) |
|
|
202
|
-
| `--tests <path>` | Path to [external test file](https://www.promptfoo.dev/docs/configuration/expected-outputs/#load-assertions-from-external-file) |
|
|
203
|
-
| `-c, --config <paths>` | Path to one or more [configuration files](https://www.promptfoo.dev/docs/configuration/guide). `promptfooconfig.yaml` is automatically loaded if present |
|
|
204
|
-
| `-j, --max-concurrency <number>` | Maximum number of concurrent API calls |
|
|
205
|
-
| `--table-cell-max-length <number>` | Truncate console table cells to this length |
|
|
206
|
-
| `--prompt-prefix <path>` | This prefix is prepended to every prompt |
|
|
207
|
-
| `--prompt-suffix <path>` | This suffix is append to every prompt |
|
|
208
|
-
| `--grader` | [Provider][providers-docs] that will conduct the evaluation, if you are [using LLM to grade your output](https://www.promptfoo.dev/docs/configuration/expected-outputs#llm-evaluation) |
|
|
209
|
-
|
|
210
|
-
After running an eval, you may optionally use the `view` command to open the web viewer:
|
|
21
|
+
See [Getting Started](https://www.promptfoo.dev/docs/getting-started/) (evals) or [Red Teaming](https://www.promptfoo.dev/docs/red-team/) (vulnerability scanning) for more.
|
|
211
22
|
|
|
212
|
-
|
|
213
|
-
npx promptfoo view
|
|
214
|
-
```
|
|
215
|
-
|
|
216
|
-
### Examples
|
|
217
|
-
|
|
218
|
-
#### Prompt quality
|
|
219
|
-
|
|
220
|
-
In [this example](https://github.com/promptfoo/promptfoo/tree/main/examples/assistant-cli), we evaluate whether adding adjectives to the personality of an assistant bot affects the responses:
|
|
221
|
-
|
|
222
|
-
```
|
|
223
|
-
npx promptfoo eval -p prompts.txt -r openai:gpt-4o-mini -t tests.csv
|
|
224
|
-
```
|
|
225
|
-
|
|
226
|
-
<!--
|
|
227
|
-
<img width="1362" alt="Side-by-side evaluation of LLM prompt quality, terminal output" src="https://user-images.githubusercontent.com/310310/235329207-e8c22459-5f51-4fee-9714-1b602ac3d7ca.png">
|
|
228
|
-
|
|
229
|
-

|
|
230
|
-
-->
|
|
231
|
-
|
|
232
|
-
This command will evaluate the prompts in `prompts.txt`, substituting the variable values from `tests.csv`, and output results in your terminal.
|
|
233
|
-
|
|
234
|
-
You can also output a nice [spreadsheet](https://docs.google.com/spreadsheets/d/1nanoj3_TniWrDl1Sj-qYqIMD6jwm5FBy15xPFdUTsmI/edit?usp=sharing), [JSON](https://github.com/promptfoo/promptfoo/blob/main/examples/simple-cli/output.json), YAML, or an HTML file:
|
|
235
|
-
|
|
236
|
-

|
|
237
|
-
|
|
238
|
-
#### Model quality
|
|
239
|
-
|
|
240
|
-
In the [next example](https://github.com/promptfoo/promptfoo/tree/main/examples/gpt-4o-vs-4o-mini), we evaluate the difference between GPT-4o and GPT-4o mini outputs for a given prompt:
|
|
241
|
-
|
|
242
|
-
```
|
|
243
|
-
npx promptfoo eval -p prompts.txt -r openai:gpt-4o openai:gpt-4o-mini -o output.html
|
|
244
|
-
```
|
|
23
|
+
## What can you do with Promptfoo?
|
|
245
24
|
|
|
246
|
-
|
|
25
|
+
- **Test your prompts and models** with [automated evaluations](https://www.promptfoo.dev/docs/getting-started/)
|
|
26
|
+
- **Secure your LLM apps** with [red teaming](https://www.promptfoo.dev/docs/red-team/) and vulnerability scanning
|
|
27
|
+
- **Compare models** side-by-side (OpenAI, Anthropic, Azure, Bedrock, Ollama, and [more](https://www.promptfoo.dev/docs/providers/))
|
|
28
|
+
- **Automate checks** in [CI/CD](https://www.promptfoo.dev/docs/integrations/ci-cd/)
|
|
29
|
+
- **Share results** with your team
|
|
247
30
|
|
|
248
|
-
|
|
31
|
+
Here's what it looks like in action:
|
|
249
32
|
|
|
250
|
-
|
|
33
|
+

|
|
251
34
|
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
- `testSuite`: the Javascript equivalent of the promptfooconfig.yaml
|
|
255
|
-
|
|
256
|
-
```typescript
|
|
257
|
-
interface EvaluateTestSuite {
|
|
258
|
-
providers: string[]; // Valid provider name (e.g. openai:gpt-4o-mini)
|
|
259
|
-
prompts: string[]; // List of prompts
|
|
260
|
-
tests: string | TestCase[]; // Path to a CSV file, or list of test cases
|
|
261
|
-
|
|
262
|
-
defaultTest?: Omit<TestCase, 'description'>; // Optional: add default vars and assertions on test case
|
|
263
|
-
outputPath?: string | string[]; // Optional: write results to file
|
|
264
|
-
}
|
|
265
|
-
|
|
266
|
-
interface TestCase {
|
|
267
|
-
// Optional description of what you're testing
|
|
268
|
-
description?: string;
|
|
269
|
-
|
|
270
|
-
// Key-value pairs to substitute in the prompt
|
|
271
|
-
vars?: Record<string, string | string[] | object>;
|
|
272
|
-
|
|
273
|
-
// Optional list of automatic checks to run on the LLM output
|
|
274
|
-
assert?: Assertion[];
|
|
275
|
-
|
|
276
|
-
// Additional configuration settings for the prompt
|
|
277
|
-
options?: PromptConfig & OutputConfig & GradingConfig;
|
|
278
|
-
|
|
279
|
-
// The required score for this test case. If not provided, the test case is graded pass/fail.
|
|
280
|
-
threshold?: number;
|
|
281
|
-
|
|
282
|
-
// Override the provider for this test
|
|
283
|
-
provider?: string | ProviderOptions | ApiProvider;
|
|
284
|
-
}
|
|
285
|
-
|
|
286
|
-
interface Assertion {
|
|
287
|
-
type: string;
|
|
288
|
-
value?: string;
|
|
289
|
-
threshold?: number; // Required score for pass
|
|
290
|
-
weight?: number; // The weight of this assertion compared to other assertions in the test case. Defaults to 1.
|
|
291
|
-
provider?: ApiProvider; // For assertions that require an LLM provider
|
|
292
|
-
}
|
|
293
|
-
```
|
|
294
|
-
|
|
295
|
-
- `options`: misc options related to how the tests are run
|
|
296
|
-
|
|
297
|
-
```typescript
|
|
298
|
-
interface EvaluateOptions {
|
|
299
|
-
maxConcurrency?: number;
|
|
300
|
-
showProgressBar?: boolean;
|
|
301
|
-
generateSuggestions?: boolean;
|
|
302
|
-
}
|
|
303
|
-
```
|
|
304
|
-
|
|
305
|
-
### Example
|
|
306
|
-
|
|
307
|
-
`promptfoo` exports an `evaluate` function that you can use to run prompt evaluations.
|
|
308
|
-
|
|
309
|
-
```js
|
|
310
|
-
import promptfoo from 'promptfoo';
|
|
311
|
-
|
|
312
|
-
const results = await promptfoo.evaluate({
|
|
313
|
-
prompts: ['Rephrase this in French: {{body}}', 'Rephrase this like a pirate: {{body}}'],
|
|
314
|
-
providers: ['openai:gpt-4o-mini'],
|
|
315
|
-
tests: [
|
|
316
|
-
{
|
|
317
|
-
vars: {
|
|
318
|
-
body: 'Hello world',
|
|
319
|
-
},
|
|
320
|
-
},
|
|
321
|
-
{
|
|
322
|
-
vars: {
|
|
323
|
-
body: "I'm hungry",
|
|
324
|
-
},
|
|
325
|
-
},
|
|
326
|
-
],
|
|
327
|
-
});
|
|
328
|
-
```
|
|
329
|
-
|
|
330
|
-
This code imports the `promptfoo` library, defines the evaluation options, and then calls the `evaluate` function with these options.
|
|
331
|
-
|
|
332
|
-
See the full example [here](https://github.com/promptfoo/promptfoo/tree/main/examples/simple-import), which includes an example results object.
|
|
333
|
-
|
|
334
|
-
## Configuration
|
|
335
|
-
|
|
336
|
-
- **[Main guide](https://www.promptfoo.dev/docs/configuration/guide)**: Learn about how to configure your YAML file, setup prompt files, etc.
|
|
337
|
-
- **[Configuring test cases](https://www.promptfoo.dev/docs/configuration/expected-outputs)**: Learn more about how to configure assertions and metrics.
|
|
338
|
-
|
|
339
|
-
## Installation
|
|
340
|
-
|
|
341
|
-
Requires Node.js 18 or newer.
|
|
342
|
-
|
|
343
|
-
You can install promptfoo using npm, npx, Homebrew, or by cloning the repository.
|
|
344
|
-
|
|
345
|
-
### npm (recommended)
|
|
346
|
-
|
|
347
|
-
Install `promptfoo` globally:
|
|
348
|
-
|
|
349
|
-
```sh
|
|
350
|
-
npm install -g promptfoo
|
|
351
|
-
```
|
|
352
|
-
|
|
353
|
-
Or install it locally in your project:
|
|
354
|
-
|
|
355
|
-
```sh
|
|
356
|
-
npm install promptfoo
|
|
357
|
-
```
|
|
358
|
-
|
|
359
|
-
### npx
|
|
360
|
-
|
|
361
|
-
Run promptfoo without installing it:
|
|
362
|
-
|
|
363
|
-
```sh
|
|
364
|
-
npx promptfoo@latest init
|
|
365
|
-
```
|
|
366
|
-
|
|
367
|
-
This will create a `promptfooconfig.yaml` placeholder in your current directory.
|
|
368
|
-
|
|
369
|
-
### Homebrew
|
|
370
|
-
|
|
371
|
-
If you prefer using Homebrew, you can install promptfoo with:
|
|
372
|
-
|
|
373
|
-
```sh
|
|
374
|
-
brew install promptfoo
|
|
375
|
-
```
|
|
376
|
-
|
|
377
|
-
### From source
|
|
378
|
-
|
|
379
|
-
For the latest development version:
|
|
380
|
-
|
|
381
|
-
```sh
|
|
382
|
-
git clone https://github.com/promptfoo/promptfoo.git
|
|
383
|
-
cd promptfoo
|
|
384
|
-
npm install
|
|
385
|
-
npm run build
|
|
386
|
-
npm link
|
|
387
|
-
```
|
|
388
|
-
|
|
389
|
-
### Verify installation
|
|
390
|
-
|
|
391
|
-
To verify that promptfoo is installed correctly, run:
|
|
392
|
-
|
|
393
|
-
```sh
|
|
394
|
-
promptfoo --version
|
|
395
|
-
```
|
|
396
|
-
|
|
397
|
-
This should display the version number of promptfoo.
|
|
398
|
-
|
|
399
|
-
For more detailed installation instructions, including system requirements and troubleshooting, please visit our [installation guide](https://www.promptfoo.dev/docs/installation).
|
|
400
|
-
|
|
401
|
-
## API Providers
|
|
402
|
-
|
|
403
|
-
We support OpenAI's API as well as a number of open-source models. It's also possible to set up your own custom API provider. **[See Provider documentation][providers-docs]** for more details.
|
|
404
|
-
|
|
405
|
-
## Development
|
|
406
|
-
|
|
407
|
-
Here's how to build and run locally:
|
|
408
|
-
|
|
409
|
-
```sh
|
|
410
|
-
git clone https://github.com/promptfoo/promptfoo.git
|
|
411
|
-
cd promptfoo
|
|
412
|
-
|
|
413
|
-
# Optionally use the Node.js version specified in the .nvmrc file - make sure you are on node >= 18
|
|
414
|
-
nvm use
|
|
415
|
-
|
|
416
|
-
npm i
|
|
417
|
-
cd path/to/experiment-with-promptfoo # contains your promptfooconfig.yaml
|
|
418
|
-
npx path/to/promptfoo-source eval
|
|
419
|
-
```
|
|
420
|
-
|
|
421
|
-
The web UI is located in `src/app`. To run it in dev mode, run `npm run local:app`. This will host the web UI at http://localhost:3000. The web UI expects `promptfoo view` to be running separately.
|
|
422
|
-
|
|
423
|
-
Then run:
|
|
424
|
-
|
|
425
|
-
```sh
|
|
426
|
-
npm run build
|
|
427
|
-
```
|
|
35
|
+
It works on the command line too:
|
|
428
36
|
|
|
429
|
-
|
|
37
|
+

|
|
430
38
|
|
|
431
|
-
|
|
39
|
+
It also can generate [security vulnerability reports](https://www.promptfoo.dev/docs/red-team/):
|
|
432
40
|
|
|
433
|
-
|
|
41
|
+

|
|
434
42
|
|
|
435
|
-
|
|
43
|
+
## Why promptfoo?
|
|
436
44
|
|
|
437
|
-
-
|
|
438
|
-
-
|
|
439
|
-
-
|
|
440
|
-
-
|
|
441
|
-
-
|
|
442
|
-
-
|
|
45
|
+
- 🚀 **Developer-first**: Fast, with features like live reload and caching
|
|
46
|
+
- 🔒 **Private**: Runs 100% locally - your prompts never leave your machine
|
|
47
|
+
- 🔧 **Flexible**: Works with any LLM API or programming language
|
|
48
|
+
- 💪 **Battle-tested**: Powers LLM apps serving 10M+ users in production
|
|
49
|
+
- 📊 **Data-driven**: Make decisions based on metrics, not gut feel
|
|
50
|
+
- 🤝 **Open source**: MIT licensed, with an active community
|
|
443
51
|
|
|
444
|
-
|
|
52
|
+
## Learn More
|
|
445
53
|
|
|
446
|
-
|
|
54
|
+
- 📚 [Full Documentation](https://www.promptfoo.dev/docs/intro/)
|
|
55
|
+
- 🔐 [Red Teaming Guide](https://www.promptfoo.dev/docs/red-team/)
|
|
56
|
+
- 🎯 [Getting Started](https://www.promptfoo.dev/docs/getting-started/)
|
|
57
|
+
- 💻 [CLI Usage](https://www.promptfoo.dev/docs/usage/command-line/)
|
|
58
|
+
- 📦 [Node.js Package](https://www.promptfoo.dev/docs/usage/node-package/)
|
|
59
|
+
- 🤖 [Supported Models](https://www.promptfoo.dev/docs/providers/)
|
|
447
60
|
|
|
448
|
-
|
|
61
|
+
## Contributing
|
|
449
62
|
|
|
450
|
-
|
|
63
|
+
We welcome contributions! Check out our [contributing guide](https://www.promptfoo.dev/docs/contributing/) to get started.
|
|
451
64
|
|
|
452
|
-
|
|
453
|
-
2. Update `loadApiProvider` in `src/providers.ts` to load your provider via string
|
|
454
|
-
3. Add test cases in `test/providers.test.ts`
|
|
455
|
-
1. Test the actual provider implementation
|
|
456
|
-
2. Test loading the provider via a `loadApiProvider` test
|
|
65
|
+
Join our [Discord community](https://discord.gg/gHPS9jjfbs) for help and discussion.
|
package/dist/package.json
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
.nav{padding:.25rem 0 .25rem 1rem;gap:1rem;background-color:#333;margin-bottom:1rem;box-shadow:0 4px 6px #0000001a}[data-theme=dark] .nav{background-color:#121212}.nav a{text-decoration:none;align-self:center}@media (max-width: 760px){.nav{font-size:.75rem}}.nav .active{font-weight:700}.nav a{color:#f0f0f0}.nav .right-aligned{display:flex;align-items:center;gap:1rem;margin-left:auto;margin-right:.5rem}.nav a:hover{text-decoration:underline}.nav a:hover{color:#ddd}.prompt-var-highlight{background-color:var(--prompt-highlight-color);padding:.25rem;border-radius:4px}.glowing-border{border:1px solid #5cb3ff;box-shadow:0 0 8px 2px #5cb3ff}code[class*=language-],pre[class*=language-]{color:#000;background:none;text-shadow:0 1px white;font-family:Consolas,Monaco,Andale Mono,Ubuntu Mono,monospace;font-size:1em;text-align:left;white-space:pre;word-spacing:normal;word-break:normal;word-wrap:normal;line-height:1.5;-moz-tab-size:4;-o-tab-size:4;tab-size:4;-webkit-hyphens:none;-moz-hyphens:none;-ms-hyphens:none;hyphens:none}pre[class*=language-]::-moz-selection,pre[class*=language-] ::-moz-selection,code[class*=language-]::-moz-selection,code[class*=language-] ::-moz-selection{text-shadow:none;background:#b3d4fc}pre[class*=language-]::selection,pre[class*=language-] ::selection,code[class*=language-]::selection,code[class*=language-] ::selection{text-shadow:none;background:#b3d4fc}@media print{code[class*=language-],pre[class*=language-]{text-shadow:none}}pre[class*=language-]{padding:1em;margin:.5em 
0;overflow:auto}:not(pre)>code[class*=language-],pre[class*=language-]{background:#f5f2f0}:not(pre)>code[class*=language-]{padding:.1em;border-radius:.3em;white-space:normal}.token.comment,.token.prolog,.token.doctype,.token.cdata{color:#708090}.token.punctuation{color:#999}.token.namespace{opacity:.7}.token.property,.token.tag,.token.boolean,.token.number,.token.constant,.token.symbol,.token.deleted{color:#905}.token.selector,.token.attr-name,.token.string,.token.char,.token.builtin,.token.inserted{color:#690}.token.operator,.token.entity,.token.url,.language-css .token.string,.style .token.string{color:#9a6e3a;background:#ffffff80}.token.atrule,.token.attr-value,.token.keyword{color:#07a}.token.function,.token.class-name{color:#dd4a68}.token.regex,.token.important,.token.variable{color:#e90}.token.important,.token.bold{font-weight:700}.token.italic{font-style:italic}.token.entity{cursor:help}.yaml-config{font-size:10px}.custom-metric-container{display:inline-flex;flex-wrap:wrap;gap:.25rem;font-weight:400}.custom-metric-container>span{padding:.25rem .5rem;border-radius:4px;color:var(--text-color);border:1px solid var(--border-color);margin-right:.25rem}.custom-metric-container>span.clickable{cursor:pointer}.custom-metric-container>span.clickable:hover{border-color:var(--link-color)}.fail-reason-carousel-controls{float:right;color:var(--text-color);font-weight:400}pre{white-space:pre-wrap}.font-bold{font-weight:700}table.results-table,.divTable{border-collapse:collapse;width:100%;margin:1rem 0;background-color:#fff}[data-theme=dark] table.results-table,[data-theme=dark] .divTable{background-color:#1a1a1a}.results-table ins{background-color:var(--insert-highlight-color);text-decoration:none}.results-table del{background-color:var(--delete-highlight-color);text-decoration:strikethrough}.tr{display:flex}.results-table tr,.tr{width:fit-content}.results-table tr:hover,.tr:hover{background-color:#00000005}.results-table th,.th,.results-table 
td,.td{position:relative;border:1px solid var(--border-color);vertical-align:top}.compact.results-table th,.compact .th,.compact.results-table td,.compact .td{padding:.5rem}.results-table th.variable,.th.variable,.results-table td.variable,.td.variable{background-color:var(--variable-background-color)}.variable .cell{max-height:100%;overflow-y:auto}.results-table tr.header{background-color:var(--header-background-color)}.results-table th,.th{padding:1rem;position:relative;vertical-align:bottom;text-align:left;white-space:pre-wrap;font-weight:400}.results-table th .action{cursor:pointer;margin-left:.5rem}.results-table th .action svg{vertical-align:middle}.results-table td,.results-table th{height:1px}@-moz-document url-prefix(){.results-table td,.results-table th{height:100%}}.firefox-fix{display:table;height:100%}.results-table tr .cell{display:flex;flex-direction:column;white-space:pre-wrap;height:100%;padding:1rem}.results-table tr .cell img{max-width:var(--max-image-width, 256px);max-height:var(--max-image-height, 256px);cursor:zoom-in}.results-table tr .cell .prompt{background-color:var(--variable-background-color);border:1px solid var(--border-color);padding:1rem;border-radius:4px;margin-bottom:.5rem;font-family:Courier New,Courier,monospace;font-size:.8rem}.results-table tr .cell .prompt .pill{display:block;background-color:var(--neutral-background-color);border:1px solid var(--border-color);border-radius:4px;padding:.25rem;width:fit-content;margin-bottom:.5rem}.results-table tr .cell-actions{display:flex;gap:.5rem;visibility:hidden;position:absolute;top:1.25rem;right:.75rem;line-height:0;font-size:1.25rem}.results-table .first-prompt-col:hover .cell-actions,.results-table .second-prompt-column:hover .cell-actions{visibility:visible}.results-table tr .cell-detail{display:flex;flex-wrap:wrap;column-gap:.5rem;row-gap:.25rem;padding-top:1rem;margin-top:auto}.results-table tr .stat-item{font-weight:400;font-size:.75rem;color:#888}.results-table tr .cell-actions 
.action{cursor:pointer}.results-table tr .cell table{width:100%;border-collapse:collapse;margin:1rem 0}.results-table tr .cell table th,.results-table tr .cell table td{border:1px solid var(--border-color);padding:.5rem;text-align:left}.results-table tr .cell table th{background-color:var(--header-background-color);font-weight:700}.results-table tr .cell table tr:nth-child(2n){background-color:var(--row-background-color)}.results-table tr .cell table tr:hover{background-color:var(--hover-background-color)}.results-table th .output-header{display:flex;flex-direction:column;height:100%;align-items:flex-start}.results-table th .output-header .pills{display:flex;flex-wrap:wrap;gap:.25rem;align-items:center;margin-bottom:.5rem}.results-table th .prompt-container{font-weight:700;margin-bottom:.5rem}.results-table th .prompt-container>*{display:inline}.results-table th .provider{display:inline-block;padding:.25rem .5rem;background-color:var(--neutral-background-color);border:1px solid var(--border-color);border-radius:4px;margin-right:.25rem}.results-table th .summary{font-weight:400}.results-table th .prompt-detail{display:flex;flex-wrap:wrap;column-gap:.5rem;row-gap:.25rem;font-size:.75rem;color:#888;margin-top:auto}.results-table th .summary .highlight{padding:.25rem .5rem;border-radius:4px;background-color:var(--fail-background-color);border:1px solid var(--border-color)}.results-table th .summary .highlight.success{background-color:var(--success-background-color);border:1px solid var(--pass-color)}.results-table .status{display:flex;flex-direction:column;gap:.25rem;font-weight:700;margin-bottom:.5rem}.results-table .status .pill{display:inline-block;padding:.25rem .5rem;border-radius:4px;margin-right:.25rem;align-self:flex-start}.results-table .pass .pill{background-color:var(--success-background-color);color:var(--pass-color);border:1px solid var(--pass-color)}.results-table .fail .pill{border:1px solid var(--fail-color)}.results-table 
.fail{color:var(--fail-color)}.fail-reason{color:var(--fail-color);font-weight:700}.compact .fail-reason{display:inline}.results-table .fail .pill{background-color:var(--fail-background-color)}.results-table td .score{font-weight:400}.results-table .comment{margin-top:.5rem;padding:.25rem .5rem;border-radius:4px;background-color:var(--neutral-background-color);font-style:italic;cursor:pointer}.results-table td.first-prompt-col{border-left:2px solid #888}.results-table td.first-prompt-row{border-top:2px solid #888}.search-highlight{color:var(--search-text-color);background-color:var(--search-highlight-color)}.results-table tr .cell .lightbox{position:fixed;top:0;left:0;right:0;bottom:0;background-color:#000000b3;display:flex;align-items:center;justify-content:center;z-index:1000}.results-table tr .cell .lightbox img{max-width:90%;max-height:90%;cursor:zoom-out}.resizer{position:absolute;right:0;top:0;height:100%;width:5px;cursor:col-resize;-webkit-user-select:none;user-select:none;touch-action:none;background:var(--text-color);opacity:.5}.resizer.isResizing{background:var(--text-color);opacity:1}@media (hover: hover){.resizer{opacity:0}*:hover>.resizer{opacity:1}}.results-table thead.sticky{position:sticky;top:0;z-index:100;transition:transform .2s ease-out}[data-theme=dark] .results-table thead.sticky{background:#1a1a1a}.header-dismiss{position:absolute;top:8px;right:8px;cursor:pointer;z-index:101;padding:4px;border-radius:4px;background:#eee;display:none}[data-theme=dark] .header-dismiss{background:#333}.header-dismiss:hover{background:#ddd}[data-theme=dark] .header-dismiss:hover{background:#444}thead.collapsed .header-dismiss{display:block}.results-table thead.collapsed .prompt-detail,.results-table thead.collapsed .custom-metric-container{display:none}.results-table thead.collapsed .MuiFormControlLabel-root{margin-top:-8px}.results-table{border-collapse:separate;border-spacing:0}.results-table thead.collapsed th{padding-top:4px;padding-bottom:4px}.results-table 
thead th{transition:padding .2s ease-out}.results-table thead .output-header{transition:max-height .2s ease-out}.results-table thead.collapsed{border-bottom:2px solid var(--border-color);box-shadow:0 2px 4px #0000001a}.results-table th{background:#fff;box-shadow:0 1px 2px #0000001a}[data-theme=dark] .results-table th{background:#1a1a1a}.lightbox{position:fixed;top:0;left:0;width:100%;height:100%;background:#000c;display:flex;justify-content:center;align-items:center;z-index:1000;cursor:pointer}.lightbox img{max-width:90%;max-height:90%;object-fit:contain}.error-pill{cursor:pointer}.description{cursor:pointer;transition:background-color .3s ease}.description:hover{background-color:#f0f0f0}[data-theme=dark] .description:hover{background-color:#2a2a2a}.eval-header{display:flex;align-items:center;flex-grow:1}.eval-header strong{font-weight:500}body{background-color:var(--background-color);color:var(--text-color)}.notice{display:flex;flex-direction:column;gap:1.5rem;justify-content:center;align-items:center;height:9rem}.framework-compliance-card{transition:box-shadow .3s ease-in-out;border-radius:12px;overflow:hidden;background-color:#f5f5f5}.framework-compliance-card:hover{box-shadow:0 8px 16px #0000001a}.compliance-summary{text-align:center;margin-bottom:32px;padding:24px;background-color:#fff;border-radius:8px;box-shadow:0 2px 4px #0000000d}.framework-grid{margin-top:24px}.framework-item{height:100%;transition:transform .3s ease,box-shadow .3s ease}.framework-item:hover{box-shadow:0 4px 8px #0000001a}.framework-item.compliant{background-color:#e8f5e9}.framework-item.non-compliant{background-color:#ffebee}.icon-compliant{color:#4caf50;font-size:24px}.icon-non-compliant{color:#f44336;font-size:24px}[data-theme=dark] .framework-compliance-card{background-color:#1e1e1e}[data-theme=dark] .compliance-summary{background-color:#2a2a2a}[data-theme=dark] .framework-item.compliant{background-color:#4caf501a}[data-theme=dark] 
.framework-item.non-compliant{background-color:#f443361a}[data-theme=dark] .framework-compliance-card:hover,[data-theme=dark] .framework-item:hover{box-shadow:0 8px 16px #ffffff1a}.framework-item .MuiList-root,.framework-item .MuiListItem-root{padding-top:0;padding-bottom:0}.framework-item .MuiListItemIcon-root{min-width:24px}.framework-item .MuiListItemText-root{margin-top:0;margin-bottom:0}.severity-card{transition:transform .3s ease-in-out,box-shadow .3s ease-in-out}.severity-card:hover{transform:translateY(-5px);box-shadow:0 8px 16px #0000001a;cursor:pointer}[data-theme=dark] .severity-card:hover{box-shadow:none}.card-critical{border-left:5px solid #ff1744}.card-high{border-left:5px solid #ff9100}.card-medium{border-left:5px solid #ffc400}.card-low{border-left:5px solid #00e676}.risk-category-drawer .MuiListItem-root.failure-item{border:1px solid #e0e0e0;border-radius:8px;margin-bottom:16px;padding:16px;transition:all .3s ease;box-shadow:0 2px 4px #0000000d}.risk-category-drawer .MuiListItem-root.failure-item:hover{border-color:#bdbdbd;box-shadow:0 4px 8px #0000001a}.risk-category-drawer .prompt{font-weight:500;margin-bottom:8px;word-break:break-word;max-width:100%;overflow-wrap:break-all}.risk-category-drawer .output{color:#555;white-space:pre-wrap;word-break:break-word;max-height:100px;overflow-y:auto;overflow-wrap:break-all;background-color:#f5f5f5;padding:8px;border-radius:4px}.risk-category-drawer .failed-tests-header{margin-top:8px;margin-bottom:8px;font-weight:500;border-top:2px solid #e0e0e0;padding-top:1.5rem}[data-theme=dark] .risk-category-drawer .MuiListItem-root.failure-item{border-color:#424242;background-color:#2c2c2c;box-shadow:0 2px 4px #0003}[data-theme=dark] .risk-category-drawer .MuiListItem-root.failure-item:hover{border-color:#616161;box-shadow:0 4px 8px #0000004d}[data-theme=dark] .risk-category-drawer .prompt{color:#e0e0e0}[data-theme=dark] .risk-category-drawer .output{color:#b0b0b0;background-color:#1e1e1e}[data-theme=dark] 
.risk-category-drawer .failed-tests-header{color:#e0e0e0;border-top-color:#424242}.risk-card-container{padding-top:3rem;padding-bottom:3rem}.risk-card-title{font-weight:700}.risk-card-progress{margin-bottom:1rem}.risk-card-issues{color:red}.risk-card-list-item{padding:0;transition:background-color .3s,box-shadow .3s}.risk-card-list-item:hover{background-color:#f0f8ff;box-shadow:0 0 0 1px #add8e6;border-radius:4px}[data-theme=dark] .risk-card-list-item:hover{background-color:#2a2a2a;box-shadow:0 0 0 1px #555;border-radius:4px}.risk-card-icon-passed{color:green;font-size:16px}.risk-card-icon-failed{color:red;font-size:16px}.risk-card-icon-no-tests{color:#888;font-size:16px}.risk-card-percentage{margin-left:8px;font-weight:700}.risk-card-percentage-high{color:#4caf50}.risk-card-percentage-medium{color:#ff9800}.risk-card-percentage-low{color:#f44336}[data-theme=dark] .risk-card-percentage-high{color:#81c784}[data-theme=dark] .risk-card-percentage-medium{color:#ffb74d}[data-theme=dark] .risk-card-percentage-low{color:#e57373}.strategy-stats-card{border-radius:.75rem}div.strategy-stats-content{padding:2rem}.strategy-grid{display:grid;grid-template-columns:repeat(2,1fr);gap:1.5rem}.strategy-item{border:1px solid #eee;border-radius:.5rem;padding:1rem;height:100%;transition:background-color .2s ease;border-radius:4px;padding:16px}.strategy-item:hover{background-color:#0000000a}[data-theme=dark] .strategy-item{border:1px solid #424242}.strategy-name{font-weight:500;margin-bottom:.25rem}.strategy-description{margin-bottom:.75rem}.progress-container{margin-bottom:.5rem}.fail-rate{text-align:right}.attack-stats{display:block;text-align:right}.pass-high{color:green!important}.pass-medium{color:#cb8503!important}.pass-low{color:red!important}.vuln-critical{color:red!important;font-weight:700!important}.vuln-high{color:red!important}.vuln-medium{color:#cb8503!important}.vuln-low{color:green!important}.report-header{padding:24px;border-radius:12px;box-shadow:0 4px 6px 
#0000001a}[data-theme=dark] .report-header{box-shadow:none}.report-details{display:flex;flex-wrap:wrap;gap:12px}.report-details .MuiChip-root{font-size:.875rem}.page-content{padding:0;margin-top:-15px}:root{font-family:Inter,system-ui,Avenir,Helvetica,Arial,sans-serif;font-style:normal;font-weight-normal:400;font-weight-bold:700;font-synthesis:none;text-rendering:optimizeLegibility;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale;-webkit-text-size-adjust:100%;--background-color: #f5f5f5;--text-color: #404040;--link-color: #2e6585;--border-color: lightgray;--table-border-color: lightgray;--pass-color: green;--fail-color: #ad0000;--success-background-color: #d1ffd7;--fail-background-color: #ffd1d1;--neutral-background-color: #eee;--variable-background-color: #f7f7f7;--header-background-color: #fffdf7;--insert-highlight-color: #d4fcbc;--delete-highlight-color: #fbb6c2;--prompt-highlight-color: linen;--textarea-background-color: #ffffff;--textarea-color: #404040;--search-highlight-color: #ffff00;--search-text-color: inherit}[data-theme=dark]{--background-color: #1a1a1a;--text-color: #f0f0f0;--link-color: #6fcaff;--border-color: #444444;--table-border-color: #444444;--pass-color: #b1e9b3;--fail-color: #ee726a;--success-background-color: #216d2b;--fail-background-color: #6d2121;--neutral-background-color: #424242;--variable-background-color: #333;--header-background-color: #333;--insert-highlight-color: #4f8a34;--delete-highlight-color: #8a3434;--prompt-highlight-color: #67605a;--textarea-background-color: #2d2d2d;--textarea-color: #fff;--search-highlight-color: #ffff00;--search-text-color: #404040}html{font-size:16px;background-color:var(--background-color);color:var(--text-color)}body{margin:0}a{color:var(--link-color);text-decoration:none}a:hover{text-decoration:underline}textarea{background-color:var(--textarea-background-color);color:var(--textarea-color);padding:.5em;border:1px solid 
var(--border-color);border-radius:.25em;resize:vertical}*{box-sizing:border-box}
|
|
1
|
+
.nav{padding:.25rem 0 .25rem 1rem;gap:1rem;background-color:#333;margin-bottom:1rem;box-shadow:0 4px 6px #0000001a}[data-theme=dark] .nav{background-color:#121212}.nav a{text-decoration:none;align-self:center}@media (max-width: 760px){.nav{font-size:.75rem}}.nav .active{font-weight:700}.nav a{color:#f0f0f0}.nav .right-aligned{display:flex;align-items:center;gap:1rem;margin-left:auto;margin-right:.5rem}.nav a:hover{text-decoration:underline}.nav a:hover{color:#ddd}.prompt-var-highlight{background-color:var(--prompt-highlight-color);padding:.25rem;border-radius:4px}.glowing-border{border:1px solid #5cb3ff;box-shadow:0 0 8px 2px #5cb3ff}code[class*=language-],pre[class*=language-]{color:#000;background:none;text-shadow:0 1px white;font-family:Consolas,Monaco,Andale Mono,Ubuntu Mono,monospace;font-size:1em;text-align:left;white-space:pre;word-spacing:normal;word-break:normal;word-wrap:normal;line-height:1.5;-moz-tab-size:4;-o-tab-size:4;tab-size:4;-webkit-hyphens:none;-moz-hyphens:none;-ms-hyphens:none;hyphens:none}pre[class*=language-]::-moz-selection,pre[class*=language-] ::-moz-selection,code[class*=language-]::-moz-selection,code[class*=language-] ::-moz-selection{text-shadow:none;background:#b3d4fc}pre[class*=language-]::selection,pre[class*=language-] ::selection,code[class*=language-]::selection,code[class*=language-] ::selection{text-shadow:none;background:#b3d4fc}@media print{code[class*=language-],pre[class*=language-]{text-shadow:none}}pre[class*=language-]{padding:1em;margin:.5em 
0;overflow:auto}:not(pre)>code[class*=language-],pre[class*=language-]{background:#f5f2f0}:not(pre)>code[class*=language-]{padding:.1em;border-radius:.3em;white-space:normal}.token.comment,.token.prolog,.token.doctype,.token.cdata{color:#708090}.token.punctuation{color:#999}.token.namespace{opacity:.7}.token.property,.token.tag,.token.boolean,.token.number,.token.constant,.token.symbol,.token.deleted{color:#905}.token.selector,.token.attr-name,.token.string,.token.char,.token.builtin,.token.inserted{color:#690}.token.operator,.token.entity,.token.url,.language-css .token.string,.style .token.string{color:#9a6e3a;background:#ffffff80}.token.atrule,.token.attr-value,.token.keyword{color:#07a}.token.function,.token.class-name{color:#dd4a68}.token.regex,.token.important,.token.variable{color:#e90}.token.important,.token.bold{font-weight:700}.token.italic{font-style:italic}.token.entity{cursor:help}.yaml-config{font-size:10px}.custom-metric-container{display:inline-flex;flex-wrap:wrap;gap:.25rem;font-weight:400}.custom-metric-container>span{padding:.25rem .5rem;border-radius:4px;color:var(--text-color);border:1px solid var(--border-color);margin-right:.25rem}.custom-metric-container>span.clickable{cursor:pointer}.custom-metric-container>span.clickable:hover{border-color:var(--link-color)}.fail-reason-carousel-controls{float:right;color:var(--text-color);font-weight:400}pre{white-space:pre-wrap}.font-bold{font-weight:700}table.results-table,.divTable{border-collapse:collapse;width:100%;margin:1rem 0;background-color:#fff}[data-theme=dark] table.results-table,[data-theme=dark] .divTable{background-color:#1a1a1a}.results-table ins{background-color:var(--insert-highlight-color);text-decoration:none}.results-table del{background-color:var(--delete-highlight-color);text-decoration:strikethrough}.tr{display:flex}.results-table tr,.tr{width:fit-content}.results-table tr:hover,.tr:hover{background-color:#00000005}.results-table th,.th,.results-table 
td,.td{position:relative;border:1px solid var(--border-color);vertical-align:top}.compact.results-table th,.compact .th,.compact.results-table td,.compact .td{padding:.5rem}.results-table th.variable,.th.variable,.results-table td.variable,.td.variable{background-color:var(--variable-background-color)}.variable .cell{max-height:100%;overflow-y:auto}.results-table tr.header{background-color:var(--header-background-color)}.results-table th,.th{padding:1rem;position:relative;vertical-align:bottom;text-align:left;white-space:pre-wrap;font-weight:400}.results-table th .action{cursor:pointer;margin-left:.5rem}.results-table th .action svg{vertical-align:middle}.results-table td,.results-table th{height:1px}@-moz-document url-prefix(){.results-table td,.results-table th{height:100%}}.firefox-fix{display:table;height:100%}.results-table tr .cell{display:flex;flex-direction:column;white-space:pre-wrap;height:100%;padding:1rem}.results-table tr .cell img{max-width:var(--max-image-width, 256px);max-height:var(--max-image-height, 256px);cursor:zoom-in}.results-table tr .cell .prompt{background-color:var(--variable-background-color);border:1px solid var(--border-color);padding:1rem;border-radius:4px;margin-bottom:.5rem;font-family:Courier New,Courier,monospace;font-size:.8rem}.results-table tr .cell .prompt .pill{display:block;background-color:var(--neutral-background-color);border:1px solid var(--border-color);border-radius:4px;padding:.25rem;width:fit-content;margin-bottom:.5rem}.results-table tr .cell-actions{display:flex;gap:.5rem;visibility:hidden;position:absolute;top:1.25rem;right:.75rem;line-height:0;font-size:1.25rem}.results-table .first-prompt-col:hover .cell-actions,.results-table .second-prompt-column:hover .cell-actions{visibility:visible}.results-table tr .cell-detail{display:flex;flex-wrap:wrap;column-gap:.5rem;row-gap:.25rem;padding-top:1rem;margin-top:auto}.results-table tr .stat-item{font-weight:400;font-size:.75rem;color:#888}.results-table tr .cell-actions 
.action{cursor:pointer}.results-table tr .cell table{width:100%;border-collapse:collapse;margin:1rem 0}.results-table tr .cell table th,.results-table tr .cell table td{border:1px solid var(--border-color);padding:.5rem;text-align:left}.results-table tr .cell table th{background-color:var(--header-background-color);font-weight:700}.results-table tr .cell table tr:nth-child(2n){background-color:var(--row-background-color)}.results-table tr .cell table tr:hover{background-color:var(--hover-background-color)}.results-table th .output-header{display:flex;flex-direction:column;height:100%;align-items:flex-start}.results-table th .output-header .pills{display:flex;flex-wrap:wrap;gap:.25rem;align-items:center;margin-bottom:.5rem}.results-table th .prompt-container{font-weight:700;margin-bottom:.5rem}.results-table th .prompt-container>*{display:inline}.results-table th .provider{display:inline-block;padding:.25rem .5rem;background-color:var(--neutral-background-color);border:1px solid var(--border-color);border-radius:4px;margin-right:.25rem}.results-table th .summary{font-weight:400}.results-table th .prompt-detail{display:flex;flex-wrap:wrap;column-gap:.5rem;row-gap:.25rem;font-size:.75rem;color:#888;margin-top:auto}.results-table th .summary .highlight{padding:.25rem .5rem;border-radius:4px;background-color:var(--fail-background-color);border:1px solid var(--border-color)}.results-table th .summary .highlight.success{background-color:var(--success-background-color);border:1px solid var(--pass-color)}.results-table .status{display:flex;flex-direction:column;gap:.25rem;font-weight:700;margin-bottom:.5rem}.results-table .status .pill{display:inline-block;padding:.25rem .5rem;border-radius:4px;margin-right:.25rem;align-self:flex-start}.results-table .pass .pill{background-color:var(--success-background-color);color:var(--pass-color);border:1px solid var(--pass-color)}.results-table .fail .pill{border:1px solid var(--fail-color)}.results-table 
.fail{color:var(--fail-color)}.fail-reason{color:var(--fail-color);font-weight:700}.compact .fail-reason{display:inline}.results-table .fail .pill{background-color:var(--fail-background-color)}.results-table td .score{font-weight:400}.results-table .comment{margin-top:.5rem;padding:.25rem .5rem;border-radius:4px;background-color:var(--neutral-background-color);font-style:italic;cursor:pointer}.results-table td.first-prompt-col{border-left:2px solid #888}.results-table td.first-prompt-row{border-top:2px solid #888}.search-highlight{color:var(--search-text-color);background-color:var(--search-highlight-color)}.results-table tr .cell .lightbox{position:fixed;top:0;left:0;right:0;bottom:0;background-color:#000000b3;display:flex;align-items:center;justify-content:center;z-index:1000}.results-table tr .cell .lightbox img{max-width:90%;max-height:90%;cursor:zoom-out}.resizer{position:absolute;right:0;top:0;height:100%;width:5px;cursor:col-resize;-webkit-user-select:none;user-select:none;touch-action:none;background:var(--text-color);opacity:.5}.resizer.isResizing{background:var(--text-color);opacity:1}@media (hover: hover){.resizer{opacity:0}*:hover>.resizer{opacity:1}}.results-table thead.sticky{position:sticky;top:0;z-index:100;transition:transform .2s ease-out}[data-theme=dark] .results-table thead.sticky{background:#1a1a1a}.header-dismiss{position:absolute;top:8px;right:8px;cursor:pointer;z-index:101;padding:4px;border-radius:4px;background:#eee;display:none}[data-theme=dark] .header-dismiss{background:#333}.header-dismiss:hover{background:#ddd}[data-theme=dark] .header-dismiss:hover{background:#444}thead.collapsed .header-dismiss{display:block}.results-table thead.collapsed .prompt-detail,.results-table thead.collapsed .custom-metric-container{display:none}.results-table thead.collapsed .MuiFormControlLabel-root{margin-top:-8px}.results-table{border-collapse:separate;border-spacing:0}.results-table thead.collapsed th{padding-top:4px;padding-bottom:4px}.results-table 
thead th{transition:padding .2s ease-out}.results-table thead .output-header{transition:max-height .2s ease-out}.results-table thead.collapsed{border-bottom:2px solid var(--border-color);box-shadow:0 2px 4px #0000001a}.results-table th{background:#fff;box-shadow:0 1px 2px #0000001a}[data-theme=dark] .results-table th{background:#1a1a1a}.lightbox{position:fixed;top:0;left:0;width:100%;height:100%;background:#000c;display:flex;justify-content:center;align-items:center;z-index:1000;cursor:pointer}.lightbox img{max-width:90%;max-height:90%;object-fit:contain}.error-pill{cursor:pointer}.description{cursor:pointer;transition:background-color .3s ease}.description:hover{background-color:#f0f0f0}[data-theme=dark] .description:hover{background-color:#2a2a2a}.eval-header{display:flex;align-items:center;flex-grow:1}.eval-header strong{font-weight:500}body{background-color:var(--background-color);color:var(--text-color)}.notice{display:flex;flex-direction:column;gap:1.5rem;justify-content:center;align-items:center;height:9rem}.framework-compliance-card{transition:box-shadow .3s ease-in-out;border-radius:12px;overflow:hidden;background-color:#f5f5f5}.framework-compliance-card:hover{box-shadow:0 8px 16px #0000001a}.compliance-summary{text-align:center;margin-bottom:32px;padding:24px;background-color:#fff;border-radius:8px;box-shadow:0 2px 4px #0000000d}.framework-grid{margin-top:24px}.framework-item{height:100%;transition:transform .3s ease,box-shadow .3s ease}.framework-item:hover{box-shadow:0 4px 8px #0000001a}.framework-item.compliant{background-color:#e8f5e9}.framework-item.non-compliant{background-color:#ffebee}.icon-compliant{color:#4caf50;font-size:24px}.icon-non-compliant{color:#f44336;font-size:24px}[data-theme=dark] .framework-compliance-card{background-color:#1e1e1e}[data-theme=dark] .compliance-summary{background-color:#2a2a2a}[data-theme=dark] .framework-item.compliant{background-color:#4caf501a}[data-theme=dark] 
.framework-item.non-compliant{background-color:#f443361a}[data-theme=dark] .framework-compliance-card:hover,[data-theme=dark] .framework-item:hover{box-shadow:0 8px 16px #ffffff1a}.framework-item .MuiList-root,.framework-item .MuiListItem-root{padding-top:0;padding-bottom:0}.framework-item .MuiListItemIcon-root{min-width:24px}.framework-item .MuiListItemText-root{margin-top:0;margin-bottom:0}.severity-card{transition:transform .3s ease-in-out,box-shadow .3s ease-in-out}.severity-card:hover{transform:translateY(-5px);box-shadow:0 8px 16px #0000001a;cursor:pointer}[data-theme=dark] .severity-card:hover{box-shadow:none}.card-critical{border-left:5px solid #ff1744}.card-high{border-left:5px solid #ff9100}.card-medium{border-left:5px solid #ffc400}.card-low{border-left:5px solid #00e676}.risk-category-drawer .MuiListItem-root.failure-item{border:1px solid #e0e0e0;border-radius:8px;margin-bottom:16px;padding:16px;transition:all .3s ease;box-shadow:0 2px 4px #0000000d}.risk-category-drawer .MuiListItem-root.failure-item:hover{border-color:#bdbdbd;box-shadow:0 4px 8px #0000001a}.risk-category-drawer .prompt{font-weight:500;margin-bottom:8px;word-break:break-word;max-width:100%;overflow-wrap:break-all}.risk-category-drawer .output{color:#555;white-space:pre-wrap;word-break:break-word;max-height:100px;overflow-y:auto;overflow-wrap:break-all;background-color:#f5f5f5;padding:8px;border-radius:4px}.risk-category-drawer .failed-tests-header{margin-top:8px;margin-bottom:8px;font-weight:500;border-top:2px solid #e0e0e0;padding-top:1.5rem}[data-theme=dark] .risk-category-drawer .MuiListItem-root.failure-item{border-color:#424242;background-color:#2c2c2c;box-shadow:0 2px 4px #0003}[data-theme=dark] .risk-category-drawer .MuiListItem-root.failure-item:hover{border-color:#616161;box-shadow:0 4px 8px #0000004d}[data-theme=dark] .risk-category-drawer .prompt{color:#e0e0e0}[data-theme=dark] .risk-category-drawer .output{color:#b0b0b0;background-color:#1e1e1e}[data-theme=dark] 
.risk-category-drawer .failed-tests-header{color:#e0e0e0;border-top-color:#424242}.risk-card-container{padding-top:3rem;padding-bottom:3rem}.risk-card-title{font-weight:700}.risk-card-progress{margin-bottom:1rem}.risk-card-issues{color:red}.risk-card-list-item{padding:0;transition:background-color .3s,box-shadow .3s}.risk-card-list-item:hover{background-color:#f0f8ff;box-shadow:0 0 0 1px #add8e6;border-radius:4px}[data-theme=dark] .risk-card-list-item:hover{background-color:#2a2a2a;box-shadow:0 0 0 1px #555;border-radius:4px}.risk-card-icon-passed{color:green;font-size:16px}.risk-card-icon-failed{color:red;font-size:16px}.risk-card-icon-no-tests{color:#888;font-size:16px}.risk-card-percentage{margin-left:8px;font-weight:700}.risk-card-percentage-high{color:#4caf50}.risk-card-percentage-medium{color:#ff9800}.risk-card-percentage-low{color:#f44336}[data-theme=dark] .risk-card-percentage-high{color:#81c784}[data-theme=dark] .risk-card-percentage-medium{color:#ffb74d}[data-theme=dark] .risk-card-percentage-low{color:#e57373}.pass-high{color:green!important}.pass-medium{color:#cb8503!important}.pass-low{color:red!important}.vuln-critical{color:red!important;font-weight:700!important}.vuln-high{color:red!important}.vuln-medium{color:#cb8503!important}.vuln-low{color:green!important}.report-header{padding:24px;border-radius:12px;box-shadow:0 4px 6px #0000001a}[data-theme=dark] .report-header{box-shadow:none}.report-details{display:flex;flex-wrap:wrap;gap:12px}.report-details .MuiChip-root{font-size:.875rem}.page-content{padding:0;margin-top:-15px}:root{font-family:Inter,system-ui,Avenir,Helvetica,Arial,sans-serif;font-style:normal;font-weight-normal:400;font-weight-bold:700;font-synthesis:none;text-rendering:optimizeLegibility;-webkit-font-smoothing:antialiased;-moz-osx-font-smoothing:grayscale;-webkit-text-size-adjust:100%;--background-color: #f5f5f5;--text-color: #404040;--link-color: #2e6585;--border-color: lightgray;--table-border-color: lightgray;--pass-color: 
green;--fail-color: #ad0000;--success-background-color: #d1ffd7;--fail-background-color: #ffd1d1;--neutral-background-color: #eee;--variable-background-color: #f7f7f7;--header-background-color: #fffdf7;--insert-highlight-color: #d4fcbc;--delete-highlight-color: #fbb6c2;--prompt-highlight-color: linen;--textarea-background-color: #ffffff;--textarea-color: #404040;--search-highlight-color: #ffff00;--search-text-color: inherit}[data-theme=dark]{--background-color: #1a1a1a;--text-color: #f0f0f0;--link-color: #6fcaff;--border-color: #444444;--table-border-color: #444444;--pass-color: #b1e9b3;--fail-color: #ee726a;--success-background-color: #216d2b;--fail-background-color: #6d2121;--neutral-background-color: #424242;--variable-background-color: #333;--header-background-color: #333;--insert-highlight-color: #4f8a34;--delete-highlight-color: #8a3434;--prompt-highlight-color: #67605a;--textarea-background-color: #2d2d2d;--textarea-color: #fff;--search-highlight-color: #ffff00;--search-text-color: #404040}html{font-size:16px;background-color:var(--background-color);color:var(--text-color)}body{margin:0}a{color:var(--link-color);text-decoration:none}a:hover{text-decoration:underline}textarea{background-color:var(--textarea-background-color);color:var(--textarea-color);padding:.5em;border:1px solid var(--border-color);border-radius:.25em;resize:vertical}*{box-sizing:border-box}
|