pytest-aitest 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pytest_aitest-0.1.0/.gitignore +57 -0
- pytest_aitest-0.1.0/LICENSE +21 -0
- pytest_aitest-0.1.0/PKG-INFO +320 -0
- pytest_aitest-0.1.0/README.md +280 -0
- pytest_aitest-0.1.0/pyproject.toml +117 -0
- pytest_aitest-0.1.0/src/pytest_aitest/__init__.py +77 -0
- pytest_aitest-0.1.0/src/pytest_aitest/config.py +43 -0
- pytest_aitest-0.1.0/src/pytest_aitest/core/__init__.py +36 -0
- pytest_aitest-0.1.0/src/pytest_aitest/core/agent.py +208 -0
- pytest_aitest-0.1.0/src/pytest_aitest/core/errors.py +48 -0
- pytest_aitest-0.1.0/src/pytest_aitest/core/prompt.py +128 -0
- pytest_aitest-0.1.0/src/pytest_aitest/core/result.py +127 -0
- pytest_aitest-0.1.0/src/pytest_aitest/core/skill.py +313 -0
- pytest_aitest-0.1.0/src/pytest_aitest/engine.py +12 -0
- pytest_aitest-0.1.0/src/pytest_aitest/execution/__init__.py +18 -0
- pytest_aitest-0.1.0/src/pytest_aitest/execution/engine.py +300 -0
- pytest_aitest-0.1.0/src/pytest_aitest/execution/retry.py +93 -0
- pytest_aitest-0.1.0/src/pytest_aitest/execution/servers.py +494 -0
- pytest_aitest-0.1.0/src/pytest_aitest/execution/skill_tools.py +117 -0
- pytest_aitest-0.1.0/src/pytest_aitest/fixtures/__init__.py +22 -0
- pytest_aitest-0.1.0/src/pytest_aitest/fixtures/factories.py +173 -0
- pytest_aitest-0.1.0/src/pytest_aitest/fixtures/judge.py +46 -0
- pytest_aitest-0.1.0/src/pytest_aitest/fixtures/run.py +102 -0
- pytest_aitest-0.1.0/src/pytest_aitest/fixtures.py +24 -0
- pytest_aitest-0.1.0/src/pytest_aitest/hooks.py +71 -0
- pytest_aitest-0.1.0/src/pytest_aitest/plugin.py +322 -0
- pytest_aitest-0.1.0/src/pytest_aitest/prompts/__init__.py +26 -0
- pytest_aitest-0.1.0/src/pytest_aitest/prompts/ai_summary.md +76 -0
- pytest_aitest-0.1.0/src/pytest_aitest/reporting/__init__.py +31 -0
- pytest_aitest-0.1.0/src/pytest_aitest/reporting/aggregator.py +415 -0
- pytest_aitest-0.1.0/src/pytest_aitest/reporting/collector.py +223 -0
- pytest_aitest-0.1.0/src/pytest_aitest/reporting/generator.py +307 -0
- pytest_aitest-0.1.0/src/pytest_aitest/reporting.py +22 -0
- pytest_aitest-0.1.0/src/pytest_aitest/result.py +14 -0
- pytest_aitest-0.1.0/src/pytest_aitest/servers.py +18 -0
- pytest_aitest-0.1.0/src/pytest_aitest/testing/__init__.py +15 -0
- pytest_aitest-0.1.0/src/pytest_aitest/testing/cli_server.py +165 -0
- pytest_aitest-0.1.0/src/pytest_aitest/testing/mcp_server.py +111 -0
- pytest_aitest-0.1.0/src/pytest_aitest/testing/store.py +314 -0
- pytest_aitest-0.1.0/src/pytest_aitest/testing/todo.py +380 -0
- pytest_aitest-0.1.0/src/pytest_aitest/testing/todo_mcp.py +108 -0
- pytest_aitest-0.1.0/src/pytest_aitest/testing/weather.py +272 -0
- pytest_aitest-0.1.0/src/pytest_aitest/testing/weather_mcp.py +108 -0
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.so
|
|
6
|
+
.Python
|
|
7
|
+
build/
|
|
8
|
+
develop-eggs/
|
|
9
|
+
dist/
|
|
10
|
+
downloads/
|
|
11
|
+
eggs/
|
|
12
|
+
.eggs/
|
|
13
|
+
lib/
|
|
14
|
+
lib64/
|
|
15
|
+
parts/
|
|
16
|
+
sdist/
|
|
17
|
+
var/
|
|
18
|
+
wheels/
|
|
19
|
+
*.egg-info/
|
|
20
|
+
.installed.cfg
|
|
21
|
+
*.egg
|
|
22
|
+
|
|
23
|
+
# Virtual environments
|
|
24
|
+
.venv/
|
|
25
|
+
venv/
|
|
26
|
+
ENV/
|
|
27
|
+
env/
|
|
28
|
+
|
|
29
|
+
# IDE
|
|
30
|
+
.idea/
|
|
31
|
+
.vscode/
|
|
32
|
+
*.swp
|
|
33
|
+
*.swo
|
|
34
|
+
*~
|
|
35
|
+
|
|
36
|
+
# Testing
|
|
37
|
+
.pytest_cache/
|
|
38
|
+
.coverage
|
|
39
|
+
htmlcov/
|
|
40
|
+
.tox/
|
|
41
|
+
.nox/
|
|
42
|
+
|
|
43
|
+
# Type checking
|
|
44
|
+
.mypy_cache/
|
|
45
|
+
|
|
46
|
+
# Test outputs
|
|
47
|
+
test_results/
|
|
48
|
+
*.html
|
|
49
|
+
*.json
|
|
50
|
+
|
|
51
|
+
# Environment
|
|
52
|
+
.env
|
|
53
|
+
.env.local
|
|
54
|
+
|
|
55
|
+
# OS
|
|
56
|
+
.DS_Store
|
|
57
|
+
Thumbs.db
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Stefan Brunner
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,320 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pytest-aitest
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Pytest plugin for testing AI agents with MCP and CLI servers
|
|
5
|
+
Project-URL: Homepage, https://github.com/sbroenne/pytest-aitest
|
|
6
|
+
Project-URL: Repository, https://github.com/sbroenne/pytest-aitest
|
|
7
|
+
Author: Stefan Brunner
|
|
8
|
+
License: MIT
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Keywords: agents,ai,llm,mcp,pytest,testing
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Framework :: Pytest
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
20
|
+
Classifier: Topic :: Software Development :: Testing
|
|
21
|
+
Requires-Python: >=3.11
|
|
22
|
+
Requires-Dist: jinja2>=3.1
|
|
23
|
+
Requires-Dist: litellm>=1.81
|
|
24
|
+
Requires-Dist: markdown>=3.5
|
|
25
|
+
Requires-Dist: mcp>=1.0.0
|
|
26
|
+
Requires-Dist: pydantic>=2.0
|
|
27
|
+
Requires-Dist: pytest-llm-assert>=0.2.0
|
|
28
|
+
Requires-Dist: pytest>=9.0
|
|
29
|
+
Provides-Extra: azure
|
|
30
|
+
Requires-Dist: azure-identity>=1.25; extra == 'azure'
|
|
31
|
+
Provides-Extra: dev
|
|
32
|
+
Requires-Dist: pre-commit>=4.5; extra == 'dev'
|
|
33
|
+
Requires-Dist: pyright>=1.1.408; extra == 'dev'
|
|
34
|
+
Requires-Dist: pytest-asyncio>=0.24; extra == 'dev'
|
|
35
|
+
Requires-Dist: pytest-cov>=6.0; extra == 'dev'
|
|
36
|
+
Requires-Dist: pytest>=9.0; extra == 'dev'
|
|
37
|
+
Requires-Dist: python-dotenv>=1.2; extra == 'dev'
|
|
38
|
+
Requires-Dist: ruff>=0.14; extra == 'dev'
|
|
39
|
+
Description-Content-Type: text/markdown
|
|
40
|
+
|
|
41
|
+
# LLM Interface Testing
|
|
42
|
+
|
|
43
|
+
[PyPI version](https://pypi.org/project/pytest-aitest/)
|
|
44
|
+
[Python versions](https://pypi.org/project/pytest-aitest/)
|
|
45
|
+
[CI status](https://github.com/sbroenne/pytest-aitest/actions/workflows/ci.yml)
|
|
46
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
47
|
+
|
|
48
|
+
### Agent Contract Testing for MCP Servers and Tools
|
|
49
|
+
|
|
50
|
+
**Behavioural testing for LLM-operated systems.**
|
|
51
|
+
|
|
52
|
+
A pytest plugin for validating whether language models can actually understand and operate your interfaces: MCP servers, agents, prompts, and tools.
|
|
53
|
+
|
|
54
|
+
It tests the *LLM-facing contract* — not just the underlying code.
|
|
55
|
+
|
|
56
|
+
---
|
|
57
|
+
|
|
58
|
+
## What Problem This Solves
|
|
59
|
+
|
|
60
|
+
Traditional tests validate deterministic code paths.
|
|
61
|
+
LLM-driven systems fail differently.
|
|
62
|
+
|
|
63
|
+
Your implementation can be correct, fully tested, and deployed — and still fail because the model:
|
|
64
|
+
|
|
65
|
+
- Chooses the wrong tool
|
|
66
|
+
- Supplies incorrect parameters
|
|
67
|
+
- Can't recover from errors
|
|
68
|
+
- Changes behaviour after a prompt or model update
|
|
69
|
+
|
|
70
|
+
These failures don't show up in unit tests, and manual testing doesn't scale.
|
|
71
|
+
|
|
72
|
+
**The root cause:**
|
|
73
|
+
Your real API is no longer just functions and endpoints.
|
|
74
|
+
It is the **LLM-facing interface** — descriptions, schemas, prompts, and error semantics.
|
|
75
|
+
|
|
76
|
+
---
|
|
77
|
+
|
|
78
|
+
## Core Idea
|
|
79
|
+
|
|
80
|
+
### Your test is the prompt.
|
|
81
|
+
|
|
82
|
+
Instead of scripting expected tool calls, you write what a user would say.
|
|
83
|
+
|
|
84
|
+
The model decides:
|
|
85
|
+
- Whether to act
|
|
86
|
+
- Which tool to use
|
|
87
|
+
- How to supply parameters
|
|
88
|
+
- How to respond
|
|
89
|
+
|
|
90
|
+
Your test asserts on the *observed behaviour*.
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
@pytest.mark.asyncio
|
|
94
|
+
async def test_trip_planning(aitest_run, weather_agent_factory):
|
|
95
|
+
"""User asks for trip advice → LLM should compare forecasts."""
|
|
96
|
+
agent = weather_agent_factory("gpt-5-mini", max_turns=10)
|
|
97
|
+
|
|
98
|
+
# The test IS the prompt
|
|
99
|
+
result = await aitest_run(
|
|
100
|
+
agent,
|
|
101
|
+
"I'm planning a trip and can't decide between Paris and Sydney. "
|
|
102
|
+
"Get me a 3-day forecast for both and recommend which has better "
|
|
103
|
+
"weather for sightseeing. I prefer sunny weather.",
|
|
104
|
+
)
|
|
105
|
+
|
|
106
|
+
assert result.success
|
|
107
|
+
assert result.tool_call_count("get_forecast") >= 2 # Called for both cities
|
|
108
|
+
assert "paris" in result.final_response.lower()
|
|
109
|
+
assert "sydney" in result.final_response.lower()
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
No mocking. No forced tool calls.
|
|
113
|
+
The model infers everything from the interface you expose.
|
|
114
|
+
|
|
115
|
+
---
|
|
116
|
+
|
|
117
|
+
## Features
|
|
118
|
+
|
|
119
|
+
### Test MCP Servers
|
|
120
|
+
|
|
121
|
+
Run real models against real interfaces:
|
|
122
|
+
|
|
123
|
+
- Tool discovery and selection
|
|
124
|
+
- Parameter inference
|
|
125
|
+
- Multi-step workflows
|
|
126
|
+
- Error handling and recovery
|
|
127
|
+
|
|
128
|
+
```python
|
|
129
|
+
@pytest.fixture(scope="module")
|
|
130
|
+
def weather_server():
|
|
131
|
+
return MCPServer(
|
|
132
|
+
command=[sys.executable, "-m", "my_weather_mcp"],
|
|
133
|
+
wait=Wait.for_tools(["get_weather", "get_forecast"]),
|
|
134
|
+
)
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
### Benchmark Models
|
|
138
|
+
|
|
139
|
+
Compare models using native pytest parametrize:
|
|
140
|
+
|
|
141
|
+
```python
|
|
142
|
+
@pytest.mark.parametrize("model", ["gpt-5-mini", "gpt-4.1"])
|
|
143
|
+
@pytest.mark.asyncio
|
|
144
|
+
async def test_tool_selection(aitest_run, weather_server, model):
|
|
145
|
+
agent = Agent(
|
|
146
|
+
provider=Provider(model=f"azure/{model}"),
|
|
147
|
+
mcp_servers=[weather_server],
|
|
148
|
+
system_prompt="You are a helpful weather assistant.",
|
|
149
|
+
max_turns=5,
|
|
150
|
+
)
|
|
151
|
+
result = await aitest_run(agent, "What's the weather in Paris?")
|
|
152
|
+
assert result.success
|
|
153
|
+
assert result.tool_was_called("get_weather")
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
Reports show pass rate, token usage, and cost per model.
|
|
157
|
+
|
|
158
|
+
### Prompt Arena
|
|
159
|
+
|
|
160
|
+
Compare system prompts head-to-head:
|
|
161
|
+
|
|
162
|
+
```python
|
|
163
|
+
PROMPTS = load_prompts(Path("tests/integration/prompts/"))
|
|
164
|
+
|
|
165
|
+
@pytest.mark.parametrize("prompt", PROMPTS, ids=lambda p: p.name)
|
|
166
|
+
@pytest.mark.asyncio
|
|
167
|
+
async def test_prompt_effectiveness(aitest_run, weather_server, prompt):
|
|
168
|
+
agent = Agent(
|
|
169
|
+
provider=Provider(model="azure/gpt-5-mini"),
|
|
170
|
+
mcp_servers=[weather_server],
|
|
171
|
+
system_prompt=prompt.system_prompt,
|
|
172
|
+
max_turns=5,
|
|
173
|
+
)
|
|
174
|
+
result = await aitest_run(agent, "What's the weather in Paris?")
|
|
175
|
+
assert result.success
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
### Matrix Testing
|
|
179
|
+
|
|
180
|
+
Test every model × prompt combination:
|
|
181
|
+
|
|
182
|
+
```python
|
|
183
|
+
@pytest.mark.parametrize("model", ["gpt-5-mini", "gpt-4.1"])
|
|
184
|
+
@pytest.mark.parametrize("prompt", PROMPTS, ids=lambda p: p.name)
|
|
185
|
+
@pytest.mark.asyncio
|
|
186
|
+
async def test_matrix(aitest_run, weather_server, model, prompt):
|
|
187
|
+
# Full grid: surface brittle pairings
|
|
188
|
+
...
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
### AI Judge
|
|
192
|
+
|
|
193
|
+
Semantic assertions using LLM evaluation — validate response quality, not just tool usage:
|
|
194
|
+
|
|
195
|
+
```python
|
|
196
|
+
@pytest.mark.asyncio
|
|
197
|
+
async def test_recommendation_quality(aitest_run, weather_agent_factory, judge):
|
|
198
|
+
agent = weather_agent_factory("gpt-5-mini", max_turns=10)
|
|
199
|
+
|
|
200
|
+
result = await aitest_run(
|
|
201
|
+
agent,
|
|
202
|
+
"Compare weather in Paris and Sydney. Which is better for sightseeing?",
|
|
203
|
+
)
|
|
204
|
+
|
|
205
|
+
assert result.success
|
|
206
|
+
assert judge(result.final_response, """
|
|
207
|
+
- Mentions weather for both Paris and Sydney
|
|
208
|
+
- Makes a recommendation for one city
|
|
209
|
+
- Provides reasoning based on weather data
|
|
210
|
+
""")
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
Uses [pytest-llm-assert](https://github.com/sbroenne/pytest-llm-assert) under the hood.
|
|
214
|
+
|
|
215
|
+
### CLI Server
|
|
216
|
+
|
|
217
|
+
Test command-line tools as if they were MCP servers:
|
|
218
|
+
|
|
219
|
+
```python
|
|
220
|
+
@pytest.fixture(scope="module")
|
|
221
|
+
def git_server():
|
|
222
|
+
return CLIServer(
|
|
223
|
+
name="git",
|
|
224
|
+
command="git",
|
|
225
|
+
tool_prefix="git",
|
|
226
|
+
)
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
Help is discovered automatically — CLIServer runs `--help` at startup and includes the output in the tool description. Customize with `help_flag="-h"` for different CLIs, or provide a `description` directly for full control.
|
|
230
|
+
|
|
231
|
+
See [CLI Server Guide](docs/cli-server.md) for shell selection, help discovery, and assertions.
|
|
232
|
+
|
|
233
|
+
---
|
|
234
|
+
|
|
235
|
+
## Why pytest?
|
|
236
|
+
|
|
237
|
+
This is a **pytest plugin**, not a standalone tool.
|
|
238
|
+
|
|
239
|
+
- Use existing fixtures, markers, and parametrize
|
|
240
|
+
- Works with your CI/CD pipeline
|
|
241
|
+
- No new syntax to learn
|
|
242
|
+
- Combine with other pytest plugins
|
|
243
|
+
|
|
244
|
+
---
|
|
245
|
+
|
|
246
|
+
## What This Is Not
|
|
247
|
+
|
|
248
|
+
- A replacement for unit tests
|
|
249
|
+
- A mock-based simulator
|
|
250
|
+
- A guarantee of perfect model behaviour
|
|
251
|
+
|
|
252
|
+
This tool complements traditional testing by covering LLM behaviour, which conventional tests cannot observe.
|
|
253
|
+
|
|
254
|
+
---
|
|
255
|
+
|
|
256
|
+
## Who This Is For
|
|
257
|
+
|
|
258
|
+
- MCP server authors
|
|
259
|
+
- Agent and tool builders
|
|
260
|
+
- Teams exposing APIs to LLMs
|
|
261
|
+
- Anyone shipping systems where models operate tools autonomously
|
|
262
|
+
|
|
263
|
+
---
|
|
264
|
+
|
|
265
|
+
## Installation
|
|
266
|
+
|
|
267
|
+
```bash
|
|
268
|
+
pip install pytest-aitest
|
|
269
|
+
```
|
|
270
|
+
|
|
271
|
+
## Setup
|
|
272
|
+
|
|
273
|
+
Works out of the box with cloud identity (Azure Entra ID) or a provider API key:
|
|
274
|
+
|
|
275
|
+
```bash
|
|
276
|
+
# Azure (Entra ID)
|
|
277
|
+
export AZURE_API_BASE=https://your-resource.openai.azure.com/
|
|
278
|
+
az login
|
|
279
|
+
|
|
280
|
+
# OpenAI
|
|
281
|
+
export OPENAI_API_KEY=sk-...
|
|
282
|
+
```
|
|
283
|
+
|
|
284
|
+
Supports 100+ providers via [LiteLLM](https://docs.litellm.ai/docs/providers).
|
|
285
|
+
|
|
286
|
+
---
|
|
287
|
+
|
|
288
|
+
## Documentation
|
|
289
|
+
|
|
290
|
+
- **[Configuration](docs/configuration.md)** — Providers, agents, fixtures
|
|
291
|
+
- **[CLI Server](docs/cli-server.md)** — Test CLI tools with help discovery
|
|
292
|
+
- **[MCP Server](docs/mcp-server.md)** — MCP server configuration and wait strategies
|
|
293
|
+
- **[Assertions](docs/assertions.md)** — AgentResult API and AI judge patterns
|
|
294
|
+
- **[Reporting](docs/reporting.md)** — HTML reports and AI summaries
|
|
295
|
+
- **[API Reference](docs/api-reference.md)** — Full API documentation
|
|
296
|
+
- **[Design](docs/DESIGN.md)** — Architecture and design decisions
|
|
297
|
+
|
|
298
|
+
---
|
|
299
|
+
|
|
300
|
+
## Coming Soon
|
|
301
|
+
|
|
302
|
+
- **Multi-turn Conversations** — `continue_from()` for stateful sessions
|
|
303
|
+
- **Prompt Templates** — YAML-based prompt management
|
|
304
|
+
|
|
305
|
+
---
|
|
306
|
+
|
|
307
|
+
## Related
|
|
308
|
+
|
|
309
|
+
- **[pytest-llm-assert](https://github.com/sbroenne/pytest-llm-assert)** — Semantic assertions for pytest
|
|
310
|
+
- **[Contributing](CONTRIBUTING.md)** — Development setup and guidelines
|
|
311
|
+
|
|
312
|
+
## Requirements
|
|
313
|
+
|
|
314
|
+
- Python 3.11+
|
|
315
|
+
- pytest 9.0+
|
|
316
|
+
- An LLM provider (Azure, OpenAI, Anthropic, etc.)
|
|
317
|
+
|
|
318
|
+
## License
|
|
319
|
+
|
|
320
|
+
MIT
|
|
@@ -0,0 +1,280 @@
|
|
|
1
|
+
# LLM Interface Testing
|
|
2
|
+
|
|
3
|
+
[PyPI version](https://pypi.org/project/pytest-aitest/)
|
|
4
|
+
[Python versions](https://pypi.org/project/pytest-aitest/)
|
|
5
|
+
[CI status](https://github.com/sbroenne/pytest-aitest/actions/workflows/ci.yml)
|
|
6
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
7
|
+
|
|
8
|
+
### Agent Contract Testing for MCP Servers and Tools
|
|
9
|
+
|
|
10
|
+
**Behavioural testing for LLM-operated systems.**
|
|
11
|
+
|
|
12
|
+
A pytest plugin for validating whether language models can actually understand and operate your interfaces: MCP servers, agents, prompts, and tools.
|
|
13
|
+
|
|
14
|
+
It tests the *LLM-facing contract* — not just the underlying code.
|
|
15
|
+
|
|
16
|
+
---
|
|
17
|
+
|
|
18
|
+
## What Problem This Solves
|
|
19
|
+
|
|
20
|
+
Traditional tests validate deterministic code paths.
|
|
21
|
+
LLM-driven systems fail differently.
|
|
22
|
+
|
|
23
|
+
Your implementation can be correct, fully tested, and deployed — and still fail because the model:
|
|
24
|
+
|
|
25
|
+
- Chooses the wrong tool
|
|
26
|
+
- Supplies incorrect parameters
|
|
27
|
+
- Can't recover from errors
|
|
28
|
+
- Changes behaviour after a prompt or model update
|
|
29
|
+
|
|
30
|
+
These failures don't show up in unit tests, and manual testing doesn't scale.
|
|
31
|
+
|
|
32
|
+
**The root cause:**
|
|
33
|
+
Your real API is no longer just functions and endpoints.
|
|
34
|
+
It is the **LLM-facing interface** — descriptions, schemas, prompts, and error semantics.
|
|
35
|
+
|
|
36
|
+
---
|
|
37
|
+
|
|
38
|
+
## Core Idea
|
|
39
|
+
|
|
40
|
+
### Your test is the prompt.
|
|
41
|
+
|
|
42
|
+
Instead of scripting expected tool calls, you write what a user would say.
|
|
43
|
+
|
|
44
|
+
The model decides:
|
|
45
|
+
- Whether to act
|
|
46
|
+
- Which tool to use
|
|
47
|
+
- How to supply parameters
|
|
48
|
+
- How to respond
|
|
49
|
+
|
|
50
|
+
Your test asserts on the *observed behaviour*.
|
|
51
|
+
|
|
52
|
+
```python
|
|
53
|
+
@pytest.mark.asyncio
|
|
54
|
+
async def test_trip_planning(aitest_run, weather_agent_factory):
|
|
55
|
+
"""User asks for trip advice → LLM should compare forecasts."""
|
|
56
|
+
agent = weather_agent_factory("gpt-5-mini", max_turns=10)
|
|
57
|
+
|
|
58
|
+
# The test IS the prompt
|
|
59
|
+
result = await aitest_run(
|
|
60
|
+
agent,
|
|
61
|
+
"I'm planning a trip and can't decide between Paris and Sydney. "
|
|
62
|
+
"Get me a 3-day forecast for both and recommend which has better "
|
|
63
|
+
"weather for sightseeing. I prefer sunny weather.",
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
assert result.success
|
|
67
|
+
assert result.tool_call_count("get_forecast") >= 2 # Called for both cities
|
|
68
|
+
assert "paris" in result.final_response.lower()
|
|
69
|
+
assert "sydney" in result.final_response.lower()
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
No mocking. No forced tool calls.
|
|
73
|
+
The model infers everything from the interface you expose.
|
|
74
|
+
|
|
75
|
+
---
|
|
76
|
+
|
|
77
|
+
## Features
|
|
78
|
+
|
|
79
|
+
### Test MCP Servers
|
|
80
|
+
|
|
81
|
+
Run real models against real interfaces:
|
|
82
|
+
|
|
83
|
+
- Tool discovery and selection
|
|
84
|
+
- Parameter inference
|
|
85
|
+
- Multi-step workflows
|
|
86
|
+
- Error handling and recovery
|
|
87
|
+
|
|
88
|
+
```python
|
|
89
|
+
@pytest.fixture(scope="module")
|
|
90
|
+
def weather_server():
|
|
91
|
+
return MCPServer(
|
|
92
|
+
command=[sys.executable, "-m", "my_weather_mcp"],
|
|
93
|
+
wait=Wait.for_tools(["get_weather", "get_forecast"]),
|
|
94
|
+
)
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### Benchmark Models
|
|
98
|
+
|
|
99
|
+
Compare models using native pytest parametrize:
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
@pytest.mark.parametrize("model", ["gpt-5-mini", "gpt-4.1"])
|
|
103
|
+
@pytest.mark.asyncio
|
|
104
|
+
async def test_tool_selection(aitest_run, weather_server, model):
|
|
105
|
+
agent = Agent(
|
|
106
|
+
provider=Provider(model=f"azure/{model}"),
|
|
107
|
+
mcp_servers=[weather_server],
|
|
108
|
+
system_prompt="You are a helpful weather assistant.",
|
|
109
|
+
max_turns=5,
|
|
110
|
+
)
|
|
111
|
+
result = await aitest_run(agent, "What's the weather in Paris?")
|
|
112
|
+
assert result.success
|
|
113
|
+
assert result.tool_was_called("get_weather")
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
Reports show pass rate, token usage, and cost per model.
|
|
117
|
+
|
|
118
|
+
### Prompt Arena
|
|
119
|
+
|
|
120
|
+
Compare system prompts head-to-head:
|
|
121
|
+
|
|
122
|
+
```python
|
|
123
|
+
PROMPTS = load_prompts(Path("tests/integration/prompts/"))
|
|
124
|
+
|
|
125
|
+
@pytest.mark.parametrize("prompt", PROMPTS, ids=lambda p: p.name)
|
|
126
|
+
@pytest.mark.asyncio
|
|
127
|
+
async def test_prompt_effectiveness(aitest_run, weather_server, prompt):
|
|
128
|
+
agent = Agent(
|
|
129
|
+
provider=Provider(model="azure/gpt-5-mini"),
|
|
130
|
+
mcp_servers=[weather_server],
|
|
131
|
+
system_prompt=prompt.system_prompt,
|
|
132
|
+
max_turns=5,
|
|
133
|
+
)
|
|
134
|
+
result = await aitest_run(agent, "What's the weather in Paris?")
|
|
135
|
+
assert result.success
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
### Matrix Testing
|
|
139
|
+
|
|
140
|
+
Test every model × prompt combination:
|
|
141
|
+
|
|
142
|
+
```python
|
|
143
|
+
@pytest.mark.parametrize("model", ["gpt-5-mini", "gpt-4.1"])
|
|
144
|
+
@pytest.mark.parametrize("prompt", PROMPTS, ids=lambda p: p.name)
|
|
145
|
+
@pytest.mark.asyncio
|
|
146
|
+
async def test_matrix(aitest_run, weather_server, model, prompt):
|
|
147
|
+
# Full grid: surface brittle pairings
|
|
148
|
+
...
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
### AI Judge
|
|
152
|
+
|
|
153
|
+
Semantic assertions using LLM evaluation — validate response quality, not just tool usage:
|
|
154
|
+
|
|
155
|
+
```python
|
|
156
|
+
@pytest.mark.asyncio
|
|
157
|
+
async def test_recommendation_quality(aitest_run, weather_agent_factory, judge):
|
|
158
|
+
agent = weather_agent_factory("gpt-5-mini", max_turns=10)
|
|
159
|
+
|
|
160
|
+
result = await aitest_run(
|
|
161
|
+
agent,
|
|
162
|
+
"Compare weather in Paris and Sydney. Which is better for sightseeing?",
|
|
163
|
+
)
|
|
164
|
+
|
|
165
|
+
assert result.success
|
|
166
|
+
assert judge(result.final_response, """
|
|
167
|
+
- Mentions weather for both Paris and Sydney
|
|
168
|
+
- Makes a recommendation for one city
|
|
169
|
+
- Provides reasoning based on weather data
|
|
170
|
+
""")
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
Uses [pytest-llm-assert](https://github.com/sbroenne/pytest-llm-assert) under the hood.
|
|
174
|
+
|
|
175
|
+
### CLI Server
|
|
176
|
+
|
|
177
|
+
Test command-line tools as if they were MCP servers:
|
|
178
|
+
|
|
179
|
+
```python
|
|
180
|
+
@pytest.fixture(scope="module")
|
|
181
|
+
def git_server():
|
|
182
|
+
return CLIServer(
|
|
183
|
+
name="git",
|
|
184
|
+
command="git",
|
|
185
|
+
tool_prefix="git",
|
|
186
|
+
)
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
Help is discovered automatically — CLIServer runs `--help` at startup and includes the output in the tool description. Customize with `help_flag="-h"` for different CLIs, or provide a `description` directly for full control.
|
|
190
|
+
|
|
191
|
+
See [CLI Server Guide](docs/cli-server.md) for shell selection, help discovery, and assertions.
|
|
192
|
+
|
|
193
|
+
---
|
|
194
|
+
|
|
195
|
+
## Why pytest?
|
|
196
|
+
|
|
197
|
+
This is a **pytest plugin**, not a standalone tool.
|
|
198
|
+
|
|
199
|
+
- Use existing fixtures, markers, and parametrize
|
|
200
|
+
- Works with your CI/CD pipeline
|
|
201
|
+
- No new syntax to learn
|
|
202
|
+
- Combine with other pytest plugins
|
|
203
|
+
|
|
204
|
+
---
|
|
205
|
+
|
|
206
|
+
## What This Is Not
|
|
207
|
+
|
|
208
|
+
- A replacement for unit tests
|
|
209
|
+
- A mock-based simulator
|
|
210
|
+
- A guarantee of perfect model behaviour
|
|
211
|
+
|
|
212
|
+
This tool complements traditional testing by covering LLM behaviour, which conventional tests cannot observe.
|
|
213
|
+
|
|
214
|
+
---
|
|
215
|
+
|
|
216
|
+
## Who This Is For
|
|
217
|
+
|
|
218
|
+
- MCP server authors
|
|
219
|
+
- Agent and tool builders
|
|
220
|
+
- Teams exposing APIs to LLMs
|
|
221
|
+
- Anyone shipping systems where models operate tools autonomously
|
|
222
|
+
|
|
223
|
+
---
|
|
224
|
+
|
|
225
|
+
## Installation
|
|
226
|
+
|
|
227
|
+
```bash
|
|
228
|
+
pip install pytest-aitest
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
## Setup
|
|
232
|
+
|
|
233
|
+
Works out of the box with cloud identity (Azure Entra ID) or a provider API key:
|
|
234
|
+
|
|
235
|
+
```bash
|
|
236
|
+
# Azure (Entra ID)
|
|
237
|
+
export AZURE_API_BASE=https://your-resource.openai.azure.com/
|
|
238
|
+
az login
|
|
239
|
+
|
|
240
|
+
# OpenAI
|
|
241
|
+
export OPENAI_API_KEY=sk-...
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
Supports 100+ providers via [LiteLLM](https://docs.litellm.ai/docs/providers).
|
|
245
|
+
|
|
246
|
+
---
|
|
247
|
+
|
|
248
|
+
## Documentation
|
|
249
|
+
|
|
250
|
+
- **[Configuration](docs/configuration.md)** — Providers, agents, fixtures
|
|
251
|
+
- **[CLI Server](docs/cli-server.md)** — Test CLI tools with help discovery
|
|
252
|
+
- **[MCP Server](docs/mcp-server.md)** — MCP server configuration and wait strategies
|
|
253
|
+
- **[Assertions](docs/assertions.md)** — AgentResult API and AI judge patterns
|
|
254
|
+
- **[Reporting](docs/reporting.md)** — HTML reports and AI summaries
|
|
255
|
+
- **[API Reference](docs/api-reference.md)** — Full API documentation
|
|
256
|
+
- **[Design](docs/DESIGN.md)** — Architecture and design decisions
|
|
257
|
+
|
|
258
|
+
---
|
|
259
|
+
|
|
260
|
+
## Coming Soon
|
|
261
|
+
|
|
262
|
+
- **Multi-turn Conversations** — `continue_from()` for stateful sessions
|
|
263
|
+
- **Prompt Templates** — YAML-based prompt management
|
|
264
|
+
|
|
265
|
+
---
|
|
266
|
+
|
|
267
|
+
## Related
|
|
268
|
+
|
|
269
|
+
- **[pytest-llm-assert](https://github.com/sbroenne/pytest-llm-assert)** — Semantic assertions for pytest
|
|
270
|
+
- **[Contributing](CONTRIBUTING.md)** — Development setup and guidelines
|
|
271
|
+
|
|
272
|
+
## Requirements
|
|
273
|
+
|
|
274
|
+
- Python 3.11+
|
|
275
|
+
- pytest 9.0+
|
|
276
|
+
- An LLM provider (Azure, OpenAI, Anthropic, etc.)
|
|
277
|
+
|
|
278
|
+
## License
|
|
279
|
+
|
|
280
|
+
MIT
|