run-gauntlet 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- run_gauntlet-0.1.0/PKG-INFO +424 -0
- run_gauntlet-0.1.0/README.md +391 -0
- run_gauntlet-0.1.0/gauntlet/__init__.py +3 -0
- run_gauntlet-0.1.0/gauntlet/cli.py +361 -0
- run_gauntlet-0.1.0/gauntlet/layer1/__init__.py +1 -0
- run_gauntlet-0.1.0/gauntlet/layer1/agent_tools.py +302 -0
- run_gauntlet-0.1.0/gauntlet/layer1/contextual_chunking.py +215 -0
- run_gauntlet-0.1.0/gauntlet/layer1/embeddings.py +213 -0
- run_gauntlet-0.1.0/gauntlet/layer1/gemini_harness.py +383 -0
- run_gauntlet-0.1.0/gauntlet/layer1/llms_docs.py +1558 -0
- run_gauntlet-0.1.0/gauntlet/layer1/llms_slim.py +129 -0
- run_gauntlet-0.1.0/gauntlet/layer1/query_rewriting.py +150 -0
- run_gauntlet-0.1.0/gauntlet/layer1/utility.py +642 -0
- run_gauntlet-0.1.0/gauntlet/layer2/__init__.py +1 -0
- run_gauntlet-0.1.0/gauntlet/layer2/agent_runner.py +2679 -0
- run_gauntlet-0.1.0/gauntlet/layer2/personas.py +274 -0
- run_gauntlet-0.1.0/gauntlet/layer3/__init__.py +2 -0
- run_gauntlet-0.1.0/gauntlet/layer3/aggregate_runs.py +308 -0
- run_gauntlet-0.1.0/gauntlet/layer3/build_batch_context.py +124 -0
- run_gauntlet-0.1.0/gauntlet/layer3/cli.py +157 -0
- run_gauntlet-0.1.0/gauntlet/layer3/contracts.py +309 -0
- run_gauntlet-0.1.0/gauntlet/layer3/extract_evidence.py +612 -0
- run_gauntlet-0.1.0/gauntlet/layer3/judge_run.py +108 -0
- run_gauntlet-0.1.0/gauntlet/layer3/normalize_run.py +40 -0
- run_gauntlet-0.1.0/gauntlet/layer3/reasoning_judge.py +977 -0
- run_gauntlet-0.1.0/gauntlet/layer3/render_report.py +507 -0
- run_gauntlet-0.1.0/gauntlet/layer3/retrieve_docs.py +177 -0
- run_gauntlet-0.1.0/gauntlet/layer3/validate_judgment.py +123 -0
- run_gauntlet-0.1.0/gauntlet/layer3/verify_task.py +463 -0
- run_gauntlet-0.1.0/gauntlet/providers.py +140 -0
- run_gauntlet-0.1.0/gauntlet/run_orchestrator.py +278 -0
- run_gauntlet-0.1.0/gauntlet/server/__init__.py +2 -0
- run_gauntlet-0.1.0/gauntlet/server/app.py +44 -0
- run_gauntlet-0.1.0/gauntlet/server/artifacts.py +45 -0
- run_gauntlet-0.1.0/gauntlet/server/config.py +25 -0
- run_gauntlet-0.1.0/gauntlet/server/db.py +28 -0
- run_gauntlet-0.1.0/gauntlet/server/embeddings.py +65 -0
- run_gauntlet-0.1.0/gauntlet/server/repositories.py +218 -0
- run_gauntlet-0.1.0/gauntlet/server/routes/__init__.py +2 -0
- run_gauntlet-0.1.0/gauntlet/server/routes/auth.py +22 -0
- run_gauntlet-0.1.0/gauntlet/server/routes/docs.py +40 -0
- run_gauntlet-0.1.0/gauntlet/server/routes/runs.py +169 -0
- run_gauntlet-0.1.0/gauntlet/server/services.py +217 -0
- run_gauntlet-0.1.0/gauntlet/server/storage.py +71 -0
- run_gauntlet-0.1.0/gauntlet/task_validation.py +93 -0
- run_gauntlet-0.1.0/pyproject.toml +53 -0
- run_gauntlet-0.1.0/run_gauntlet.egg-info/PKG-INFO +424 -0
- run_gauntlet-0.1.0/run_gauntlet.egg-info/SOURCES.txt +53 -0
- run_gauntlet-0.1.0/run_gauntlet.egg-info/dependency_links.txt +1 -0
- run_gauntlet-0.1.0/run_gauntlet.egg-info/entry_points.txt +2 -0
- run_gauntlet-0.1.0/run_gauntlet.egg-info/requires.txt +18 -0
- run_gauntlet-0.1.0/run_gauntlet.egg-info/top_level.txt +1 -0
- run_gauntlet-0.1.0/setup.cfg +4 -0
- run_gauntlet-0.1.0/tests/test_gauntlet_regressions.py +729 -0
- run_gauntlet-0.1.0/tests/test_llms_slim.py +86 -0
|
@@ -0,0 +1,424 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: run-gauntlet
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Multi-provider harness for testing whether AI agents can use product docs and tools.
|
|
5
|
+
Author: Gauntlet Contributors
|
|
6
|
+
Keywords: ai,agents,evaluation,cli,llms
|
|
7
|
+
Classifier: Development Status :: 3 - Alpha
|
|
8
|
+
Classifier: Environment :: Console
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Topic :: Software Development :: Testing
|
|
15
|
+
Requires-Python: >=3.10
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
Requires-Dist: httpx<1.0,>=0.27
|
|
18
|
+
Requires-Dist: python-dotenv<2.0,>=1.0
|
|
19
|
+
Requires-Dist: requests<3.0,>=2.31
|
|
20
|
+
Requires-Dist: beautifulsoup4<5.0,>=4.12
|
|
21
|
+
Requires-Dist: playwright<2.0,>=1.44
|
|
22
|
+
Requires-Dist: rank-bm25<1.0,>=0.2.2
|
|
23
|
+
Provides-Extra: server
|
|
24
|
+
Requires-Dist: fastapi<1.0,>=0.115; extra == "server"
|
|
25
|
+
Requires-Dist: uvicorn[standard]<1.0,>=0.30; extra == "server"
|
|
26
|
+
Requires-Dist: psycopg[binary]<4.0,>=3.2; extra == "server"
|
|
27
|
+
Requires-Dist: numpy<3.0,>=1.26; extra == "server"
|
|
28
|
+
Provides-Extra: dev
|
|
29
|
+
Requires-Dist: build<2.0,>=1.2; extra == "dev"
|
|
30
|
+
Requires-Dist: pytest<9.0,>=8.0; extra == "dev"
|
|
31
|
+
Requires-Dist: ruff<1.0,>=0.6; extra == "dev"
|
|
32
|
+
Requires-Dist: twine<7.0,>=5.0; extra == "dev"
|
|
33
|
+
|
|
34
|
+
# Gauntlet
|
|
35
|
+
|
|
36
|
+
Gauntlet is a harness for answering a practical question:
|
|
37
|
+
|
|
38
|
+
**Can real external agents learn and use your product from docs, specs, and tools alone and, if not, exactly where does adoption break?**
|
|
39
|
+
|
|
40
|
+
This project does not try to simulate an idealized agent in a perfect environment. It tries to simulate the messy reality:
|
|
41
|
+
|
|
42
|
+
- different model providers
|
|
43
|
+
- different behavior styles
|
|
44
|
+
- ambiguous docs
|
|
45
|
+
- brittle tool contracts
|
|
46
|
+
- recovery after failure
|
|
47
|
+
- noisy execution traces
|
|
48
|
+
|
|
49
|
+
The result is a pipeline that can:
|
|
50
|
+
|
|
51
|
+
1. run an agent against a product task using only the provided docs and tools
|
|
52
|
+
2. record the execution automatically
|
|
53
|
+
3. judge the run afterward
|
|
54
|
+
4. produce a report showing what happened, where it drifted, and what to fix
|
|
55
|
+
|
|
56
|
+
## What Gauntlet Is Actually Doing
|
|
57
|
+
|
|
58
|
+
Gauntlet is organized as three layers.
|
|
59
|
+
|
|
60
|
+
### Layer 1: Product Context And Execution Primitives
|
|
61
|
+
|
|
62
|
+
Layer 1 is the substrate.
|
|
63
|
+
|
|
64
|
+
It knows how to:
|
|
65
|
+
|
|
66
|
+
- load `llms.txt` and `llms-full.txt`
|
|
67
|
+
- shape docs into a manifest and retrievable chunks
|
|
68
|
+
- run generic execution tools like:
|
|
69
|
+
- HTTP
|
|
70
|
+
- CLI
|
|
71
|
+
- Python code
|
|
72
|
+
- Python SDK invocation
|
|
73
|
+
|
|
74
|
+
This is where the product-facing context lives.
|
|
75
|
+
|
|
76
|
+
### Layer 2: Agent Execution
|
|
77
|
+
|
|
78
|
+
Layer 2 is the agent loop.
|
|
79
|
+
|
|
80
|
+
It takes:
|
|
81
|
+
|
|
82
|
+
- a task
|
|
83
|
+
- a provider/model
|
|
84
|
+
- an optional persona
|
|
85
|
+
- docs sources
|
|
86
|
+
|
|
87
|
+
Then it:
|
|
88
|
+
|
|
89
|
+
- prompts the model
|
|
90
|
+
- lets it call tools
|
|
91
|
+
- captures step-by-step execution
|
|
92
|
+
- returns a final answer or failure
|
|
93
|
+
- writes a structured Layer 2 run record
|
|
94
|
+
|
|
95
|
+
This is the “can the agent do it?” layer.
|
|
96
|
+
|
|
97
|
+
### Layer 3: Judgment
|
|
98
|
+
|
|
99
|
+
Layer 3 is the post-run judge.
|
|
100
|
+
|
|
101
|
+
It consumes Layer 2 run records and produces:
|
|
102
|
+
|
|
103
|
+
- a normalized view of the run
|
|
104
|
+
- extracted evidence
|
|
105
|
+
- retrieved docs context
|
|
106
|
+
- batch-level cross-persona context
|
|
107
|
+
- a judged result
|
|
108
|
+
- a single HTML + JSON report
|
|
109
|
+
|
|
110
|
+
This is the “what happened, why, and how do we improve adoption?” layer.
|
|
111
|
+
|
|
112
|
+
## Why This Exists
|
|
113
|
+
|
|
114
|
+
Most product teams ask some version of:
|
|
115
|
+
|
|
116
|
+
- “Will agents be able to use our product?”
|
|
117
|
+
- “Are our docs good enough for AI agents?”
|
|
118
|
+
- “If an agent fails, is it our product, our docs, or just the model?”
|
|
119
|
+
|
|
120
|
+
Gauntlet is built to make those failures attributable.
|
|
121
|
+
|
|
122
|
+
The goal is not merely pass/fail.
|
|
123
|
+
|
|
124
|
+
The goal is to separate:
|
|
125
|
+
|
|
126
|
+
- product problems
|
|
127
|
+
- docs problems
|
|
128
|
+
- runtime/tooling problems
|
|
129
|
+
- harness problems
|
|
130
|
+
- model capability limits
|
|
131
|
+
|
|
132
|
+
## Core Ideas
|
|
133
|
+
|
|
134
|
+
### Personas Matter
|
|
135
|
+
|
|
136
|
+
A single clean run is not enough.
|
|
137
|
+
|
|
138
|
+
Gauntlet can run multiple built-in personas that stress different failure modes:
|
|
139
|
+
|
|
140
|
+
- `methodical`: follows docs literally
|
|
141
|
+
- `impatient`: optimizes for speed
|
|
142
|
+
- `chaotic`: reorders and perturbs flows
|
|
143
|
+
- `confused`: exposes clarity gaps
|
|
144
|
+
- `long-running`: stresses session continuity
|
|
145
|
+
- `adversarial`: pushes boundaries and validation
|
|
146
|
+
- `parallel`: stresses concurrency and state isolation
|
|
147
|
+
- `recovery`: intentionally fails, then tries to recover
|
|
148
|
+
|
|
149
|
+
These are not cosmetic personas. They change how the agent behaves and what kinds of product adoption failures become visible.
|
|
150
|
+
|
|
151
|
+
### Docs Are First-Class Inputs
|
|
152
|
+
|
|
153
|
+
Gauntlet is designed around product docs, especially:
|
|
154
|
+
|
|
155
|
+
- `llms.txt`
|
|
156
|
+
- `llms-full.txt`
|
|
157
|
+
|
|
158
|
+
The agent is expected to use those as its operating context, and Layer 3 uses them again to judge whether the run:
|
|
159
|
+
|
|
160
|
+
- had enough documentation
|
|
161
|
+
- consulted the docs
|
|
162
|
+
- followed the docs
|
|
163
|
+
- or drifted away from the intended path
|
|
164
|
+
|
|
165
|
+
### Every Run Becomes An Artifact
|
|
166
|
+
|
|
167
|
+
By default, `gauntlet chat` creates a batch folder automatically:
|
|
168
|
+
|
|
169
|
+
```text
|
|
170
|
+
artifacts/gauntlet_runs/gauntlet_###/
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
That folder contains:
|
|
174
|
+
|
|
175
|
+
- one Layer 2 run record per persona
|
|
176
|
+
- one Layer 3 JSON report
|
|
177
|
+
- one Layer 3 HTML report
|
|
178
|
+
|
|
179
|
+
So one command maps to one investigation bundle.
|
|
180
|
+
|
|
181
|
+
## Installation
|
|
182
|
+
|
|
183
|
+
Gauntlet is a Python package.
|
|
184
|
+
|
|
185
|
+
```bash
|
|
186
|
+
python -m pip install -e .
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
Or install the dependencies and run the CLI as a module with `python -m gauntlet.cli`.
|
|
190
|
+
|
|
191
|
+
Current package metadata is in [pyproject.toml](pyproject.toml).
|
|
192
|
+
|
|
193
|
+
## Environment
|
|
194
|
+
|
|
195
|
+
Gauntlet reads its configuration from environment variables, which are loaded by Layer 1’s environment loader.
|
|
196
|
+
|
|
197
|
+
Depending on what you run, you may need some of:
|
|
198
|
+
|
|
199
|
+
- `GEMINI_API_KEY`
|
|
200
|
+
- `OPENAI_API_KEY`
|
|
201
|
+
- `OLLAMA_BASE_URL`
|
|
202
|
+
- product-specific keys such as `STEEL_API_KEY`
|
|
203
|
+
|
|
204
|
+
Examples:
|
|
205
|
+
|
|
206
|
+
- Layer 2 provider calls use the provider API keys
|
|
207
|
+
- product workflows may need the product key, such as Steel auth
|
|
208
|
+
|
|
209
|
+
## Quick Start
|
|
210
|
+
|
|
211
|
+
Run a single task with a single persona:
|
|
212
|
+
|
|
213
|
+
```bash
|
|
214
|
+
python -m gauntlet.cli chat \
|
|
215
|
+
--provider gemini \
|
|
216
|
+
--model gemini-2.5-flash \
|
|
217
|
+
--docs https://docs.steel.dev/llms.txt \
|
|
218
|
+
--docs-full https://docs.steel.dev/llms-full.txt \
|
|
219
|
+
"Go to ycombinator.com and take a screenshot"
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
Run all personas:
|
|
223
|
+
|
|
224
|
+
```bash
|
|
225
|
+
python -m gauntlet.cli chat \
|
|
226
|
+
--provider gemini \
|
|
227
|
+
--model gemini-2.5-flash \
|
|
228
|
+
--docs https://docs.steel.dev/llms.txt \
|
|
229
|
+
--docs-full https://docs.steel.dev/llms-full.txt \
|
|
230
|
+
--persona all \
|
|
231
|
+
"Go to espncricinfo and tell me latest IPL game's score"
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
After completion, Gauntlet prints the batch folder and Layer 3 report paths.
|
|
235
|
+
|
|
236
|
+
## Common Commands
|
|
237
|
+
|
|
238
|
+
### List Personas
|
|
239
|
+
|
|
240
|
+
```bash
|
|
241
|
+
python -m gauntlet.cli personas
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
Show one persona in detail:
|
|
245
|
+
|
|
246
|
+
```bash
|
|
247
|
+
python -m gauntlet.cli personas recovery
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
### Run With Debug Logs
|
|
251
|
+
|
|
252
|
+
```bash
|
|
253
|
+
python -m gauntlet.cli chat \
|
|
254
|
+
--provider gemini \
|
|
255
|
+
--model gemini-2.5-flash \
|
|
256
|
+
--docs https://docs.steel.dev/llms.txt \
|
|
257
|
+
--docs-full https://docs.steel.dev/llms-full.txt \
|
|
258
|
+
--debug \
|
|
259
|
+
"Go to espncricinfo.com and scrape the page"
|
|
260
|
+
```
|
|
261
|
+
|
|
262
|
+
### Override The Output Directory
|
|
263
|
+
|
|
264
|
+
By default, Gauntlet creates `artifacts/gauntlet_runs/gauntlet_<n>/`.
|
|
265
|
+
|
|
266
|
+
If you want a different batch folder:
|
|
267
|
+
|
|
268
|
+
```bash
|
|
269
|
+
python -m gauntlet.cli chat \
|
|
270
|
+
--provider gemini \
|
|
271
|
+
--model gemini-2.5-flash \
|
|
272
|
+
--docs https://docs.steel.dev/llms.txt \
|
|
273
|
+
--docs-full https://docs.steel.dev/llms-full.txt \
|
|
274
|
+
--run-record artifacts/my_custom_batch \
|
|
275
|
+
"Go to espncricinfo and tell me latest IPL game's score"
|
|
276
|
+
```
|
|
277
|
+
|
|
278
|
+
### Run Layer 3 Directly
|
|
279
|
+
|
|
280
|
+
If you already have Layer 2 run records, you can judge them directly:
|
|
281
|
+
|
|
282
|
+
```bash
|
|
283
|
+
python -m gauntlet.layer3.cli \
|
|
284
|
+
'artifacts/gauntlet_runs/gauntlet_003/layer2_run_*.json' \
|
|
285
|
+
--output-dir artifacts/judge_runs \
|
|
286
|
+
--name replay_batch
|
|
287
|
+
```
|
|
288
|
+
|
|
289
|
+
### Use An Optional Layer 3 Judge Model
|
|
290
|
+
|
|
291
|
+
Layer 3 defaults to a deterministic judge path. You can optionally configure a model-backed judge:
|
|
292
|
+
|
|
293
|
+
```bash
|
|
294
|
+
python -m gauntlet.layer3.cli \
|
|
295
|
+
'artifacts/gauntlet_runs/gauntlet_003/layer2_run_*.json' \
|
|
296
|
+
--output-dir artifacts/judge_runs \
|
|
297
|
+
--name llm_judged_batch \
|
|
298
|
+
--judge-provider gemini \
|
|
299
|
+
--judge-model gemini-2.5-pro \
|
|
300
|
+
--judge-fallback-deterministic
|
|
301
|
+
```
|
|
302
|
+
|
|
303
|
+
The same judge flags are available through `gauntlet chat`, and they will be used automatically when Layer 3 runs after Layer 2.
|
|
304
|
+
|
|
305
|
+
## Output Structure
|
|
306
|
+
|
|
307
|
+
Typical batch folder:
|
|
308
|
+
|
|
309
|
+
```text
|
|
310
|
+
artifacts/gauntlet_runs/gauntlet_003/
|
|
311
|
+
layer2_run_default.json
|
|
312
|
+
layer2_run_methodical.json
|
|
313
|
+
layer2_run_impatient.json
|
|
314
|
+
...
|
|
315
|
+
gauntlet_003.json
|
|
316
|
+
gauntlet_003.html
|
|
317
|
+
```
|
|
318
|
+
|
|
319
|
+
### Layer 2 Run Records
|
|
320
|
+
|
|
321
|
+
These contain:
|
|
322
|
+
|
|
323
|
+
- provider/model/persona metadata
|
|
324
|
+
- docs sources
|
|
325
|
+
- step-by-step tool execution
|
|
326
|
+
- tool results
|
|
327
|
+
- final response
|
|
328
|
+
- terminal failure info if present
|
|
329
|
+
|
|
330
|
+
### Layer 3 Report
|
|
331
|
+
|
|
332
|
+
The Layer 3 report is the interesting part.
|
|
333
|
+
|
|
334
|
+
It includes:
|
|
335
|
+
|
|
336
|
+
- batch summary
|
|
337
|
+
- issue breakdowns
|
|
338
|
+
- top recommendations
|
|
339
|
+
- per-persona outcomes
|
|
340
|
+
- execution timeline
|
|
341
|
+
- documentation evidence
|
|
342
|
+
- trace evidence
|
|
343
|
+
- successful path
|
|
344
|
+
- reproduction path
|
|
345
|
+
- cross-agent comparison
|
|
346
|
+
|
|
347
|
+
The HTML report is intended to be human-readable. The JSON report is intended to be machine-readable.
|
|
348
|
+
|
|
349
|
+
## How To Think About The Reports
|
|
350
|
+
|
|
351
|
+
A “completed” run is not automatically a good run.
|
|
352
|
+
|
|
353
|
+
Gauntlet tries to surface:
|
|
354
|
+
|
|
355
|
+
- clean success
|
|
356
|
+
- recovered success
|
|
357
|
+
- suspect success
|
|
358
|
+
- hard failure
|
|
359
|
+
|
|
360
|
+
What matters is whether the agent:
|
|
361
|
+
|
|
362
|
+
- used the product correctly
|
|
363
|
+
- used the docs correctly
|
|
364
|
+
- produced an answer supported by evidence
|
|
365
|
+
|
|
366
|
+
The most valuable output is often not “it failed,” but:
|
|
367
|
+
|
|
368
|
+
> it succeeded only after a noisy recovery path that a real external agent might never find
|
|
369
|
+
|
|
370
|
+
That is adoption signal.
|
|
371
|
+
|
|
372
|
+
## Project Layout
|
|
373
|
+
|
|
374
|
+
```text
|
|
375
|
+
gauntlet/
|
|
376
|
+
cli.py # top-level CLI
|
|
377
|
+
providers.py # provider adapters
|
|
378
|
+
layer1/ # docs + tool primitives
|
|
379
|
+
layer2/ # agent execution loop
|
|
380
|
+
layer3/ # judgment and reporting
|
|
381
|
+
artifacts/
|
|
382
|
+
gauntlet_runs/ # automatic end-to-end run bundles
|
|
383
|
+
```
|
|
384
|
+
|
|
385
|
+
Key files:
|
|
386
|
+
|
|
387
|
+
- [gauntlet/cli.py](gauntlet/cli.py)
|
|
388
|
+
- [gauntlet/providers.py](gauntlet/providers.py)
|
|
389
|
+
- [gauntlet/layer2/agent_runner.py](gauntlet/layer2/agent_runner.py)
|
|
390
|
+
- [gauntlet/layer2/personas.py](gauntlet/layer2/personas.py)
|
|
391
|
+
- [gauntlet/layer3/cli.py](gauntlet/layer3/cli.py)
|
|
392
|
+
- [gauntlet/layer3/reasoning_judge.py](gauntlet/layer3/reasoning_judge.py)
|
|
393
|
+
|
|
394
|
+
## Current Status
|
|
395
|
+
|
|
396
|
+
Gauntlet is still evolving. The pipeline is real and usable, but the work is experimental, and this project does not try to hide that.
|
|
397
|
+
|
|
398
|
+
What is already real:
|
|
399
|
+
|
|
400
|
+
- multi-provider execution
|
|
401
|
+
- multi-persona runs
|
|
402
|
+
- automatic Layer 2 recording
|
|
403
|
+
- automatic Layer 3 reporting
|
|
404
|
+
- deterministic and optional model-backed judgment paths
|
|
405
|
+
|
|
406
|
+
What still needs continued refinement:
|
|
407
|
+
|
|
408
|
+
- docs retrieval quality
|
|
409
|
+
- citation precision
|
|
410
|
+
- judge quality
|
|
411
|
+
- attribution quality
|
|
412
|
+
- richer product-specific mission libraries
|
|
413
|
+
|
|
414
|
+
## The Spirit Of The Project
|
|
415
|
+
|
|
416
|
+
Most agent evals ask:
|
|
417
|
+
|
|
418
|
+
> “Did the model solve the task?”
|
|
419
|
+
|
|
420
|
+
Gauntlet asks a more useful question:
|
|
421
|
+
|
|
422
|
+
> “If a serious external agent tried to adopt this product from the docs alone, where would it break, how would it break, and what should we fix first?”
|
|
423
|
+
|
|
424
|
+
That is what this repo is for.
|