run-gauntlet 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. run_gauntlet-0.1.0/PKG-INFO +424 -0
  2. run_gauntlet-0.1.0/README.md +391 -0
  3. run_gauntlet-0.1.0/gauntlet/__init__.py +3 -0
  4. run_gauntlet-0.1.0/gauntlet/cli.py +361 -0
  5. run_gauntlet-0.1.0/gauntlet/layer1/__init__.py +1 -0
  6. run_gauntlet-0.1.0/gauntlet/layer1/agent_tools.py +302 -0
  7. run_gauntlet-0.1.0/gauntlet/layer1/contextual_chunking.py +215 -0
  8. run_gauntlet-0.1.0/gauntlet/layer1/embeddings.py +213 -0
  9. run_gauntlet-0.1.0/gauntlet/layer1/gemini_harness.py +383 -0
  10. run_gauntlet-0.1.0/gauntlet/layer1/llms_docs.py +1558 -0
  11. run_gauntlet-0.1.0/gauntlet/layer1/llms_slim.py +129 -0
  12. run_gauntlet-0.1.0/gauntlet/layer1/query_rewriting.py +150 -0
  13. run_gauntlet-0.1.0/gauntlet/layer1/utility.py +642 -0
  14. run_gauntlet-0.1.0/gauntlet/layer2/__init__.py +1 -0
  15. run_gauntlet-0.1.0/gauntlet/layer2/agent_runner.py +2679 -0
  16. run_gauntlet-0.1.0/gauntlet/layer2/personas.py +274 -0
  17. run_gauntlet-0.1.0/gauntlet/layer3/__init__.py +2 -0
  18. run_gauntlet-0.1.0/gauntlet/layer3/aggregate_runs.py +308 -0
  19. run_gauntlet-0.1.0/gauntlet/layer3/build_batch_context.py +124 -0
  20. run_gauntlet-0.1.0/gauntlet/layer3/cli.py +157 -0
  21. run_gauntlet-0.1.0/gauntlet/layer3/contracts.py +309 -0
  22. run_gauntlet-0.1.0/gauntlet/layer3/extract_evidence.py +612 -0
  23. run_gauntlet-0.1.0/gauntlet/layer3/judge_run.py +108 -0
  24. run_gauntlet-0.1.0/gauntlet/layer3/normalize_run.py +40 -0
  25. run_gauntlet-0.1.0/gauntlet/layer3/reasoning_judge.py +977 -0
  26. run_gauntlet-0.1.0/gauntlet/layer3/render_report.py +507 -0
  27. run_gauntlet-0.1.0/gauntlet/layer3/retrieve_docs.py +177 -0
  28. run_gauntlet-0.1.0/gauntlet/layer3/validate_judgment.py +123 -0
  29. run_gauntlet-0.1.0/gauntlet/layer3/verify_task.py +463 -0
  30. run_gauntlet-0.1.0/gauntlet/providers.py +140 -0
  31. run_gauntlet-0.1.0/gauntlet/run_orchestrator.py +278 -0
  32. run_gauntlet-0.1.0/gauntlet/server/__init__.py +2 -0
  33. run_gauntlet-0.1.0/gauntlet/server/app.py +44 -0
  34. run_gauntlet-0.1.0/gauntlet/server/artifacts.py +45 -0
  35. run_gauntlet-0.1.0/gauntlet/server/config.py +25 -0
  36. run_gauntlet-0.1.0/gauntlet/server/db.py +28 -0
  37. run_gauntlet-0.1.0/gauntlet/server/embeddings.py +65 -0
  38. run_gauntlet-0.1.0/gauntlet/server/repositories.py +218 -0
  39. run_gauntlet-0.1.0/gauntlet/server/routes/__init__.py +2 -0
  40. run_gauntlet-0.1.0/gauntlet/server/routes/auth.py +22 -0
  41. run_gauntlet-0.1.0/gauntlet/server/routes/docs.py +40 -0
  42. run_gauntlet-0.1.0/gauntlet/server/routes/runs.py +169 -0
  43. run_gauntlet-0.1.0/gauntlet/server/services.py +217 -0
  44. run_gauntlet-0.1.0/gauntlet/server/storage.py +71 -0
  45. run_gauntlet-0.1.0/gauntlet/task_validation.py +93 -0
  46. run_gauntlet-0.1.0/pyproject.toml +53 -0
  47. run_gauntlet-0.1.0/run_gauntlet.egg-info/PKG-INFO +424 -0
  48. run_gauntlet-0.1.0/run_gauntlet.egg-info/SOURCES.txt +53 -0
  49. run_gauntlet-0.1.0/run_gauntlet.egg-info/dependency_links.txt +1 -0
  50. run_gauntlet-0.1.0/run_gauntlet.egg-info/entry_points.txt +2 -0
  51. run_gauntlet-0.1.0/run_gauntlet.egg-info/requires.txt +18 -0
  52. run_gauntlet-0.1.0/run_gauntlet.egg-info/top_level.txt +1 -0
  53. run_gauntlet-0.1.0/setup.cfg +4 -0
  54. run_gauntlet-0.1.0/tests/test_gauntlet_regressions.py +729 -0
  55. run_gauntlet-0.1.0/tests/test_llms_slim.py +86 -0
@@ -0,0 +1,424 @@
1
+ Metadata-Version: 2.4
2
+ Name: run-gauntlet
3
+ Version: 0.1.0
4
+ Summary: Multi-provider harness for testing whether AI agents can use product docs and tools.
5
+ Author: Gauntlet Contributors
6
+ Keywords: ai,agents,evaluation,cli,llms
7
+ Classifier: Development Status :: 3 - Alpha
8
+ Classifier: Environment :: Console
9
+ Classifier: Intended Audience :: Developers
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.10
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Topic :: Software Development :: Testing
15
+ Requires-Python: >=3.10
16
+ Description-Content-Type: text/markdown
17
+ Requires-Dist: httpx<1.0,>=0.27
18
+ Requires-Dist: python-dotenv<2.0,>=1.0
19
+ Requires-Dist: requests<3.0,>=2.31
20
+ Requires-Dist: beautifulsoup4<5.0,>=4.12
21
+ Requires-Dist: playwright<2.0,>=1.44
22
+ Requires-Dist: rank-bm25<1.0,>=0.2.2
23
+ Provides-Extra: server
24
+ Requires-Dist: fastapi<1.0,>=0.115; extra == "server"
25
+ Requires-Dist: uvicorn[standard]<1.0,>=0.30; extra == "server"
26
+ Requires-Dist: psycopg[binary]<4.0,>=3.2; extra == "server"
27
+ Requires-Dist: numpy<3.0,>=1.26; extra == "server"
28
+ Provides-Extra: dev
29
+ Requires-Dist: build<2.0,>=1.2; extra == "dev"
30
+ Requires-Dist: pytest<9.0,>=8.0; extra == "dev"
31
+ Requires-Dist: ruff<1.0,>=0.6; extra == "dev"
32
+ Requires-Dist: twine<7.0,>=5.0; extra == "dev"
33
+
34
+ # Gauntlet
35
+
36
+ Gauntlet is a harness for answering a practical question:
37
+
38
+ **Can real external agents learn and use your product from docs, specs, and tools alone — and, if not, exactly where does adoption break?**
39
+
40
+ This project does not try to simulate an idealized agent in a perfect environment. It tries to simulate the messy reality:
41
+
42
+ - different model providers
43
+ - different behavior styles
44
+ - ambiguous docs
45
+ - brittle tool contracts
46
+ - recovery after failure
47
+ - noisy execution traces
48
+
49
+ The result is a pipeline that can:
50
+
51
+ 1. run an agent against a product task using only the provided docs and tools
52
+ 2. record the execution automatically
53
+ 3. judge the run afterward
54
+ 4. produce a report showing what happened, where it drifted, and what to fix
55
+
56
+ ## What Gauntlet Is Actually Doing
57
+
58
+ Gauntlet is organized as three layers.
59
+
60
+ ### Layer 1: Product Context And Execution Primitives
61
+
62
+ Layer 1 is the substrate.
63
+
64
+ It knows how to:
65
+
66
+ - load `llms.txt` and `llms-full.txt`
67
+ - shape docs into a manifest and retrievable chunks
68
+ - run generic execution tools like:
69
+   - HTTP
70
+   - CLI
71
+   - Python code
72
+   - Python SDK invocation
73
+
74
+ This is where the product-facing context lives.
75
+
76
+ ### Layer 2: Agent Execution
77
+
78
+ Layer 2 is the agent loop.
79
+
80
+ It takes:
81
+
82
+ - a task
83
+ - a provider/model
84
+ - an optional persona
85
+ - docs sources
86
+
87
+ Then it:
88
+
89
+ - prompts the model
90
+ - lets it call tools
91
+ - captures step-by-step execution
92
+ - returns a final answer or failure
93
+ - writes a structured Layer 2 run record
94
+
95
+ This is the “can the agent do it?” layer.
96
+
97
+ ### Layer 3: Judgment
98
+
99
+ Layer 3 is the post-run judge.
100
+
101
+ It consumes Layer 2 run records and produces:
102
+
103
+ - a normalized view of the run
104
+ - extracted evidence
105
+ - retrieved docs context
106
+ - batch-level cross-persona context
107
+ - a judged result
108
+ - a single HTML + JSON report
109
+
110
+ This is the “what happened, why, and how do we improve adoption?” layer.
111
+
112
+ ## Why This Exists
113
+
114
+ Most product teams ask some version of:
115
+
116
+ - “Will agents be able to use our product?”
117
+ - “Are our docs good enough for AI agents?”
118
+ - “If an agent fails, is it our product, our docs, or just the model?”
119
+
120
+ Gauntlet is built to make those failures attributable.
121
+
122
+ The goal is not merely pass/fail.
123
+
124
+ The goal is to separate:
125
+
126
+ - product problems
127
+ - docs problems
128
+ - runtime/tooling problems
129
+ - harness problems
130
+ - model capability limits
131
+
132
+ ## Core Ideas
133
+
134
+ ### Personas Matter
135
+
136
+ A single clean run is not enough.
137
+
138
+ Gauntlet can run multiple built-in personas that stress different failure modes:
139
+
140
+ - `methodical`: follows docs literally
141
+ - `impatient`: optimizes for speed
142
+ - `chaotic`: reorders and perturbs flows
143
+ - `confused`: exposes clarity gaps
144
+ - `long-running`: stresses session continuity
145
+ - `adversarial`: pushes boundaries and validation
146
+ - `parallel`: stresses concurrency and state isolation
147
+ - `recovery`: intentionally fails, then tries to recover
148
+
149
+ These are not cosmetic personas. They change how the agent behaves and what kinds of product adoption failures become visible.
150
+
151
+ ### Docs Are First-Class Inputs
152
+
153
+ Gauntlet is designed around product docs, especially:
154
+
155
+ - `llms.txt`
156
+ - `llms-full.txt`
157
+
158
+ The agent is expected to use those as its operating context, and Layer 3 uses them again to judge whether the run:
159
+
160
+ - had enough documentation
161
+ - consulted the docs
162
+ - followed the docs
163
+ - or drifted away from the intended path
164
+
165
+ ### Every Run Becomes An Artifact
166
+
167
+ By default, `gauntlet chat` creates a batch folder automatically:
168
+
169
+ ```text
170
+ artifacts/gauntlet_runs/gauntlet_###/
171
+ ```
172
+
173
+ That folder contains:
174
+
175
+ - one Layer 2 run record per persona
176
+ - one Layer 3 JSON report
177
+ - one Layer 3 HTML report
178
+
179
+ So one command maps to one investigation bundle.
180
+
181
+ ## Installation
182
+
183
+ Gauntlet is a Python package.
184
+
185
+ ```bash
186
+ python -m pip install -e .
187
+ ```
188
+
189
+ Or install only the dependencies and run the modules directly with `python -m` (for example, `python -m gauntlet.cli`).
190
+
191
+ Current package metadata is in `pyproject.toml` at the repository root.
192
+
193
+ ## Environment
194
+
195
+ Gauntlet reads its configuration from environment variables, which Layer 1 loads.
196
+
197
+ Depending on what you run, you may need some of:
198
+
199
+ - `GEMINI_API_KEY`
200
+ - `OPENAI_API_KEY`
201
+ - `OLLAMA_BASE_URL`
202
+ - product-specific keys such as `STEEL_API_KEY`
203
+
204
+ Examples:
205
+
206
+ - Layer 2 provider calls use the provider API keys
207
+ - product workflows may need the product key, such as Steel auth
208
+
209
+ ## Quick Start
210
+
211
+ Run a single task with a single persona:
212
+
213
+ ```bash
214
+ python -m gauntlet.cli chat \
215
+ --provider gemini \
216
+ --model gemini-2.5-flash \
217
+ --docs https://docs.steel.dev/llms.txt \
218
+ --docs-full https://docs.steel.dev/llms-full.txt \
219
+ "Go to ycombinator.com and take a screenshot"
220
+ ```
221
+
222
+ Run all personas:
223
+
224
+ ```bash
225
+ python -m gauntlet.cli chat \
226
+ --provider gemini \
227
+ --model gemini-2.5-flash \
228
+ --docs https://docs.steel.dev/llms.txt \
229
+ --docs-full https://docs.steel.dev/llms-full.txt \
230
+ --persona all \
231
+ "Go to espncricinfo and tell me latest IPL game's score"
232
+ ```
233
+
234
+ After completion, Gauntlet prints the batch folder and Layer 3 report paths.
235
+
236
+ ## Common Commands
237
+
238
+ ### List Personas
239
+
240
+ ```bash
241
+ python -m gauntlet.cli personas
242
+ ```
243
+
244
+ Show one persona in detail:
245
+
246
+ ```bash
247
+ python -m gauntlet.cli personas recovery
248
+ ```
249
+
250
+ ### Run With Debug Logs
251
+
252
+ ```bash
253
+ python -m gauntlet.cli chat \
254
+ --provider gemini \
255
+ --model gemini-2.5-flash \
256
+ --docs https://docs.steel.dev/llms.txt \
257
+ --docs-full https://docs.steel.dev/llms-full.txt \
258
+ --debug \
259
+ "Go to espncricinfo.com and scrape the page"
260
+ ```
261
+
262
+ ### Override The Output Directory
263
+
264
+ By default, Gauntlet creates `artifacts/gauntlet_runs/gauntlet_<n>/`.
265
+
266
+ If you want a different batch folder:
267
+
268
+ ```bash
269
+ python -m gauntlet.cli chat \
270
+ --provider gemini \
271
+ --model gemini-2.5-flash \
272
+ --docs https://docs.steel.dev/llms.txt \
273
+ --docs-full https://docs.steel.dev/llms-full.txt \
274
+ --run-record artifacts/my_custom_batch \
275
+ "Go to espncricinfo and tell me latest IPL game's score"
276
+ ```
277
+
278
+ ### Run Layer 3 Directly
279
+
280
+ If you already have Layer 2 run records, you can judge them directly:
281
+
282
+ ```bash
283
+ python -m gauntlet.layer3.cli \
284
+ 'artifacts/gauntlet_runs/gauntlet_003/layer2_run_*.json' \
285
+ --output-dir artifacts/judge_runs \
286
+ --name replay_batch
287
+ ```
288
+
289
+ ### Use An Optional Layer 3 Judge Model
290
+
291
+ Layer 3 defaults to a deterministic judge path. You can optionally configure a model-backed judge:
292
+
293
+ ```bash
294
+ python -m gauntlet.layer3.cli \
295
+ 'artifacts/gauntlet_runs/gauntlet_003/layer2_run_*.json' \
296
+ --output-dir artifacts/judge_runs \
297
+ --name llm_judged_batch \
298
+ --judge-provider gemini \
299
+ --judge-model gemini-2.5-pro \
300
+ --judge-fallback-deterministic
301
+ ```
302
+
303
+ The same judge flags are available through `gauntlet chat`, and they will be used automatically when Layer 3 runs after Layer 2.
304
+
305
+ ## Output Structure
306
+
307
+ Typical batch folder:
308
+
309
+ ```text
310
+ artifacts/gauntlet_runs/gauntlet_003/
311
+   layer2_run_default.json
312
+   layer2_run_methodical.json
313
+   layer2_run_impatient.json
314
+   ...
315
+   gauntlet_003.json
316
+   gauntlet_003.html
317
+ ```
318
+
319
+ ### Layer 2 Run Records
320
+
321
+ These contain:
322
+
323
+ - provider/model/persona metadata
324
+ - docs sources
325
+ - step-by-step tool execution
326
+ - tool results
327
+ - final response
328
+ - terminal failure info if present
329
+
330
+ ### Layer 3 Report
331
+
332
+ The Layer 3 report is the interesting part.
333
+
334
+ It includes:
335
+
336
+ - batch summary
337
+ - issue breakdowns
338
+ - top recommendations
339
+ - per-persona outcomes
340
+ - execution timeline
341
+ - documentation evidence
342
+ - trace evidence
343
+ - successful path
344
+ - reproduction path
345
+ - cross-agent comparison
346
+
347
+ The HTML report is intended to be human-readable. The JSON report is intended to be machine-readable.
348
+
349
+ ## How To Think About The Reports
350
+
351
+ A “completed” run is not automatically a good run.
352
+
353
+ Gauntlet tries to surface:
354
+
355
+ - clean success
356
+ - recovered success
357
+ - suspect success
358
+ - hard failure
359
+
360
+ What matters is whether the agent:
361
+
362
+ - used the product correctly
363
+ - used the docs correctly
364
+ - produced an answer supported by evidence
365
+
366
+ The most valuable output is often not “it failed,” but:
367
+
368
+ > it succeeded only after a noisy recovery path that a real external agent might never find
369
+
370
+ That is adoption signal.
371
+
372
+ ## Project Layout
373
+
374
+ ```text
375
+ gauntlet/
376
+   cli.py           # top-level CLI
377
+   providers.py     # provider adapters
378
+   layer1/          # docs + tool primitives
379
+   layer2/          # agent execution loop
380
+   layer3/          # judgment and reporting
381
+ artifacts/
382
+   gauntlet_runs/   # automatic end-to-end run bundles
383
+ ```
384
+
385
+ Key files:
386
+
387
+ - `gauntlet/cli.py`
388
+ - `gauntlet/providers.py`
389
+ - `gauntlet/layer2/agent_runner.py`
390
+ - `gauntlet/layer2/personas.py`
391
+ - `gauntlet/layer3/cli.py`
392
+ - `gauntlet/layer3/reasoning_judge.py`
393
+
394
+ ## Current Status
395
+
396
+ Gauntlet is still evolving. The pipeline is real and usable, but the project makes no attempt to hide the experimental nature of the work.
397
+
398
+ What is already real:
399
+
400
+ - multi-provider execution
401
+ - multi-persona runs
402
+ - automatic Layer 2 recording
403
+ - automatic Layer 3 reporting
404
+ - deterministic and optional model-backed judgment paths
405
+
406
+ What still needs continued refinement:
407
+
408
+ - docs retrieval quality
409
+ - citation precision
410
+ - judge quality
411
+ - attribution quality
412
+ - richer product-specific mission libraries
413
+
414
+ ## The Spirit Of The Project
415
+
416
+ Most agent evals ask:
417
+
418
+ > “Did the model solve the task?”
419
+
420
+ Gauntlet asks a more useful question:
421
+
422
+ > “If a serious external agent tried to adopt this product from the docs alone, where would it break, how would it break, and what should we fix first?”
423
+
424
+ That is what this repo is for.