pytest-aitest 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. pytest_aitest-0.1.0/.gitignore +57 -0
  2. pytest_aitest-0.1.0/LICENSE +21 -0
  3. pytest_aitest-0.1.0/PKG-INFO +320 -0
  4. pytest_aitest-0.1.0/README.md +280 -0
  5. pytest_aitest-0.1.0/pyproject.toml +117 -0
  6. pytest_aitest-0.1.0/src/pytest_aitest/__init__.py +77 -0
  7. pytest_aitest-0.1.0/src/pytest_aitest/config.py +43 -0
  8. pytest_aitest-0.1.0/src/pytest_aitest/core/__init__.py +36 -0
  9. pytest_aitest-0.1.0/src/pytest_aitest/core/agent.py +208 -0
  10. pytest_aitest-0.1.0/src/pytest_aitest/core/errors.py +48 -0
  11. pytest_aitest-0.1.0/src/pytest_aitest/core/prompt.py +128 -0
  12. pytest_aitest-0.1.0/src/pytest_aitest/core/result.py +127 -0
  13. pytest_aitest-0.1.0/src/pytest_aitest/core/skill.py +313 -0
  14. pytest_aitest-0.1.0/src/pytest_aitest/engine.py +12 -0
  15. pytest_aitest-0.1.0/src/pytest_aitest/execution/__init__.py +18 -0
  16. pytest_aitest-0.1.0/src/pytest_aitest/execution/engine.py +300 -0
  17. pytest_aitest-0.1.0/src/pytest_aitest/execution/retry.py +93 -0
  18. pytest_aitest-0.1.0/src/pytest_aitest/execution/servers.py +494 -0
  19. pytest_aitest-0.1.0/src/pytest_aitest/execution/skill_tools.py +117 -0
  20. pytest_aitest-0.1.0/src/pytest_aitest/fixtures/__init__.py +22 -0
  21. pytest_aitest-0.1.0/src/pytest_aitest/fixtures/factories.py +173 -0
  22. pytest_aitest-0.1.0/src/pytest_aitest/fixtures/judge.py +46 -0
  23. pytest_aitest-0.1.0/src/pytest_aitest/fixtures/run.py +102 -0
  24. pytest_aitest-0.1.0/src/pytest_aitest/fixtures.py +24 -0
  25. pytest_aitest-0.1.0/src/pytest_aitest/hooks.py +71 -0
  26. pytest_aitest-0.1.0/src/pytest_aitest/plugin.py +322 -0
  27. pytest_aitest-0.1.0/src/pytest_aitest/prompts/__init__.py +26 -0
  28. pytest_aitest-0.1.0/src/pytest_aitest/prompts/ai_summary.md +76 -0
  29. pytest_aitest-0.1.0/src/pytest_aitest/reporting/__init__.py +31 -0
  30. pytest_aitest-0.1.0/src/pytest_aitest/reporting/aggregator.py +415 -0
  31. pytest_aitest-0.1.0/src/pytest_aitest/reporting/collector.py +223 -0
  32. pytest_aitest-0.1.0/src/pytest_aitest/reporting/generator.py +307 -0
  33. pytest_aitest-0.1.0/src/pytest_aitest/reporting.py +22 -0
  34. pytest_aitest-0.1.0/src/pytest_aitest/result.py +14 -0
  35. pytest_aitest-0.1.0/src/pytest_aitest/servers.py +18 -0
  36. pytest_aitest-0.1.0/src/pytest_aitest/testing/__init__.py +15 -0
  37. pytest_aitest-0.1.0/src/pytest_aitest/testing/cli_server.py +165 -0
  38. pytest_aitest-0.1.0/src/pytest_aitest/testing/mcp_server.py +111 -0
  39. pytest_aitest-0.1.0/src/pytest_aitest/testing/store.py +314 -0
  40. pytest_aitest-0.1.0/src/pytest_aitest/testing/todo.py +380 -0
  41. pytest_aitest-0.1.0/src/pytest_aitest/testing/todo_mcp.py +108 -0
  42. pytest_aitest-0.1.0/src/pytest_aitest/testing/weather.py +272 -0
  43. pytest_aitest-0.1.0/src/pytest_aitest/testing/weather_mcp.py +108 -0
@@ -0,0 +1,57 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+
23
+ # Virtual environments
24
+ .venv/
25
+ venv/
26
+ ENV/
27
+ env/
28
+
29
+ # IDE
30
+ .idea/
31
+ .vscode/
32
+ *.swp
33
+ *.swo
34
+ *~
35
+
36
+ # Testing
37
+ .pytest_cache/
38
+ .coverage
39
+ htmlcov/
40
+ .tox/
41
+ .nox/
42
+
43
+ # Type checking
44
+ .mypy_cache/
45
+
46
+ # Test outputs
47
+ test_results/
48
+ *.html
49
+ *.json
50
+
51
+ # Environment
52
+ .env
53
+ .env.local
54
+
55
+ # OS
56
+ .DS_Store
57
+ Thumbs.db
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Stefan Brunner
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,320 @@
1
+ Metadata-Version: 2.4
2
+ Name: pytest-aitest
3
+ Version: 0.1.0
4
+ Summary: Pytest plugin for testing AI agents with MCP and CLI servers
5
+ Project-URL: Homepage, https://github.com/sbroenne/pytest-aitest
6
+ Project-URL: Repository, https://github.com/sbroenne/pytest-aitest
7
+ Author: Stefan Brunner
8
+ License: MIT
9
+ License-File: LICENSE
10
+ Keywords: agents,ai,llm,mcp,pytest,testing
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Framework :: Pytest
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
20
+ Classifier: Topic :: Software Development :: Testing
21
+ Requires-Python: >=3.11
22
+ Requires-Dist: jinja2>=3.1
23
+ Requires-Dist: litellm>=1.81
24
+ Requires-Dist: markdown>=3.5
25
+ Requires-Dist: mcp>=1.0.0
26
+ Requires-Dist: pydantic>=2.0
27
+ Requires-Dist: pytest-llm-assert>=0.2.0
28
+ Requires-Dist: pytest>=9.0
29
+ Provides-Extra: azure
30
+ Requires-Dist: azure-identity>=1.25; extra == 'azure'
31
+ Provides-Extra: dev
32
+ Requires-Dist: pre-commit>=4.5; extra == 'dev'
33
+ Requires-Dist: pyright>=1.1.408; extra == 'dev'
34
+ Requires-Dist: pytest-asyncio>=0.24; extra == 'dev'
35
+ Requires-Dist: pytest-cov>=6.0; extra == 'dev'
36
+ Requires-Dist: pytest>=9.0; extra == 'dev'
37
+ Requires-Dist: python-dotenv>=1.2; extra == 'dev'
38
+ Requires-Dist: ruff>=0.14; extra == 'dev'
39
+ Description-Content-Type: text/markdown
40
+
41
+ # LLM Interface Testing
42
+
43
+ [![PyPI version](https://img.shields.io/pypi/v/pytest-aitest)](https://pypi.org/project/pytest-aitest/)
44
+ [![Python versions](https://img.shields.io/pypi/pyversions/pytest-aitest)](https://pypi.org/project/pytest-aitest/)
45
+ [![CI](https://github.com/sbroenne/pytest-aitest/actions/workflows/ci.yml/badge.svg)](https://github.com/sbroenne/pytest-aitest/actions/workflows/ci.yml)
46
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
47
+
48
+ ### Agent Contract Testing for MCP Servers and Tools
49
+
50
+ **Behavioural testing for LLM-operated systems.**
51
+
52
+ A pytest plugin for validating whether language models can actually understand and operate your interfaces: MCP servers, agents, prompts, and tools.
53
+
54
+ It tests the *LLM-facing contract* — not just the underlying code.
55
+
56
+ ---
57
+
58
+ ## What Problem This Solves
59
+
60
+ Traditional tests validate deterministic code paths.
61
+ LLM-driven systems fail differently.
62
+
63
+ Your implementation can be correct, fully tested, and deployed — and still fail because the model:
64
+
65
+ - Chooses the wrong tool
66
+ - Supplies incorrect parameters
67
+ - Can't recover from errors
68
+ - Changes behaviour after a prompt or model update
69
+
70
+ These failures don't show up in unit tests, and manual testing doesn't scale.
71
+
72
+ **The root cause:**
73
+ Your real API is no longer just functions and endpoints.
74
+ It is the **LLM-facing interface** — descriptions, schemas, prompts, and error semantics.
75
+
76
+ ---
77
+
78
+ ## Core Idea
79
+
80
+ ### Your test is the prompt.
81
+
82
+ Instead of scripting expected tool calls, you write what a user would say.
83
+
84
+ The model decides:
85
+ - Whether to act
86
+ - Which tool to use
87
+ - How to supply parameters
88
+ - How to respond
89
+
90
+ Your test asserts on the *observed behaviour*.
91
+
92
+ ```python
93
+ @pytest.mark.asyncio
94
+ async def test_trip_planning(aitest_run, weather_agent_factory):
95
+ """User asks for trip advice → LLM should compare forecasts."""
96
+ agent = weather_agent_factory("gpt-5-mini", max_turns=10)
97
+
98
+ # The test IS the prompt
99
+ result = await aitest_run(
100
+ agent,
101
+ "I'm planning a trip and can't decide between Paris and Sydney. "
102
+ "Get me a 3-day forecast for both and recommend which has better "
103
+ "weather for sightseeing. I prefer sunny weather.",
104
+ )
105
+
106
+ assert result.success
107
+ assert result.tool_call_count("get_forecast") >= 2 # Called for both cities
108
+ assert "paris" in result.final_response.lower()
109
+ assert "sydney" in result.final_response.lower()
110
+ ```
111
+
112
+ No mocking. No forced tool calls.
113
+ The model infers everything from the interface you expose.
114
+
115
+ ---
116
+
117
+ ## Features
118
+
119
+ ### Test MCP Servers
120
+
121
+ Run real models against real interfaces:
122
+
123
+ - Tool discovery and selection
124
+ - Parameter inference
125
+ - Multi-step workflows
126
+ - Error handling and recovery
127
+
128
+ ```python
129
+ @pytest.fixture(scope="module")
130
+ def weather_server():
131
+ return MCPServer(
132
+ command=[sys.executable, "-m", "my_weather_mcp"],
133
+ wait=Wait.for_tools(["get_weather", "get_forecast"]),
134
+ )
135
+ ```
136
+
137
+ ### Benchmark Models
138
+
139
+ Compare models using native pytest parametrize:
140
+
141
+ ```python
142
+ @pytest.mark.parametrize("model", ["gpt-5-mini", "gpt-4.1"])
143
+ @pytest.mark.asyncio
144
+ async def test_tool_selection(aitest_run, weather_server, model):
145
+ agent = Agent(
146
+ provider=Provider(model=f"azure/{model}"),
147
+ mcp_servers=[weather_server],
148
+ system_prompt="You are a helpful weather assistant.",
149
+ max_turns=5,
150
+ )
151
+ result = await aitest_run(agent, "What's the weather in Paris?")
152
+ assert result.success
153
+ assert result.tool_was_called("get_weather")
154
+ ```
155
+
156
+ Reports show pass rate, token usage, and cost per model.
157
+
158
+ ### Prompt Arena
159
+
160
+ Compare system prompts head-to-head:
161
+
162
+ ```python
163
+ PROMPTS = load_prompts(Path("tests/integration/prompts/"))
164
+
165
+ @pytest.mark.parametrize("prompt", PROMPTS, ids=lambda p: p.name)
166
+ @pytest.mark.asyncio
167
+ async def test_prompt_effectiveness(aitest_run, weather_server, prompt):
168
+ agent = Agent(
169
+ provider=Provider(model="azure/gpt-5-mini"),
170
+ mcp_servers=[weather_server],
171
+ system_prompt=prompt.system_prompt,
172
+ max_turns=5,
173
+ )
174
+ result = await aitest_run(agent, "What's the weather in Paris?")
175
+ assert result.success
176
+ ```
177
+
178
+ ### Matrix Testing
179
+
180
+ Test every model × prompt combination:
181
+
182
+ ```python
183
+ @pytest.mark.parametrize("model", ["gpt-5-mini", "gpt-4.1"])
184
+ @pytest.mark.parametrize("prompt", PROMPTS, ids=lambda p: p.name)
185
+ @pytest.mark.asyncio
186
+ async def test_matrix(aitest_run, weather_server, model, prompt):
187
+ # Full grid: surface brittle pairings
188
+ ...
189
+ ```
190
+
191
+ ### AI Judge
192
+
193
+ Semantic assertions using LLM evaluation — validate response quality, not just tool usage:
194
+
195
+ ```python
196
+ @pytest.mark.asyncio
197
+ async def test_recommendation_quality(aitest_run, weather_agent_factory, judge):
198
+ agent = weather_agent_factory("gpt-5-mini", max_turns=10)
199
+
200
+ result = await aitest_run(
201
+ agent,
202
+ "Compare weather in Paris and Sydney. Which is better for sightseeing?",
203
+ )
204
+
205
+ assert result.success
206
+ assert judge(result.final_response, """
207
+ - Mentions weather for both Paris and Sydney
208
+ - Makes a recommendation for one city
209
+ - Provides reasoning based on weather data
210
+ """)
211
+ ```
212
+
213
+ Uses [pytest-llm-assert](https://github.com/sbroenne/pytest-llm-assert) under the hood.
214
+
215
+ ### CLI Server
216
+
217
+ Test command-line tools as if they were MCP servers:
218
+
219
+ ```python
220
+ @pytest.fixture(scope="module")
221
+ def git_server():
222
+ return CLIServer(
223
+ name="git",
224
+ command="git",
225
+ tool_prefix="git",
226
+ )
227
+ ```
228
+
229
+ Help is discovered automatically — CLIServer runs the command with `--help` at startup and includes the output in the tool description. Customize with `help_flag="-h"` for CLIs that use a different help flag, or provide a `description` directly for full control.
230
+
231
+ See [CLI Server Guide](docs/cli-server.md) for shell selection, help discovery, and assertions.
232
+
233
+ ---
234
+
235
+ ## Why pytest?
236
+
237
+ This is a **pytest plugin**, not a standalone tool.
238
+
239
+ - Use existing fixtures, markers, and parametrize
240
+ - Works with your CI/CD pipeline
241
+ - No new syntax to learn
242
+ - Combine with other pytest plugins
243
+
244
+ ---
245
+
246
+ ## What This Is Not
247
+
248
+ - A replacement for unit tests
249
+ - A mock-based simulator
250
+ - A guarantee of perfect model behaviour
251
+
252
+ This tool complements traditional testing by covering LLM behaviour, which conventional tests cannot observe.
253
+
254
+ ---
255
+
256
+ ## Who This Is For
257
+
258
+ - MCP server authors
259
+ - Agent and tool builders
260
+ - Teams exposing APIs to LLMs
261
+ - Anyone shipping systems where models operate tools autonomously
262
+
263
+ ---
264
+
265
+ ## Installation
266
+
267
+ ```bash
268
+ pip install pytest-aitest
269
+ ```
270
+
271
+ ## Setup
272
+
273
+ Works out of the box with cloud identity (Azure) or an API key (OpenAI):
274
+
275
+ ```bash
276
+ # Azure (Entra ID)
277
+ export AZURE_API_BASE=https://your-resource.openai.azure.com/
278
+ az login
279
+
280
+ # OpenAI
281
+ export OPENAI_API_KEY=sk-...
282
+ ```
283
+
284
+ Supports 100+ providers via [LiteLLM](https://docs.litellm.ai/docs/providers).
285
+
286
+ ---
287
+
288
+ ## Documentation
289
+
290
+ - **[Configuration](docs/configuration.md)** — Providers, agents, fixtures
291
+ - **[CLI Server](docs/cli-server.md)** — Test CLI tools with help discovery
292
+ - **[MCP Server](docs/mcp-server.md)** — MCP server configuration and wait strategies
293
+ - **[Assertions](docs/assertions.md)** — AgentResult API and AI judge patterns
294
+ - **[Reporting](docs/reporting.md)** — HTML reports and AI summaries
295
+ - **[API Reference](docs/api-reference.md)** — Full API documentation
296
+ - **[Design](docs/DESIGN.md)** — Architecture and design decisions
297
+
298
+ ---
299
+
300
+ ## Coming Soon
301
+
302
+ - **Multi-turn Conversations** — `continue_from()` for stateful sessions
303
+ - **Prompt Templates** — YAML-based prompt management
304
+
305
+ ---
306
+
307
+ ## Related
308
+
309
+ - **[pytest-llm-assert](https://github.com/sbroenne/pytest-llm-assert)** — Semantic assertions for pytest
310
+ - **[Contributing](CONTRIBUTING.md)** — Development setup and guidelines
311
+
312
+ ## Requirements
313
+
314
+ - Python 3.11+
315
+ - pytest 9.0+
316
+ - An LLM provider (Azure, OpenAI, Anthropic, etc.)
317
+
318
+ ## License
319
+
320
+ MIT
@@ -0,0 +1,280 @@
1
+ # LLM Interface Testing
2
+
3
+ [![PyPI version](https://img.shields.io/pypi/v/pytest-aitest)](https://pypi.org/project/pytest-aitest/)
4
+ [![Python versions](https://img.shields.io/pypi/pyversions/pytest-aitest)](https://pypi.org/project/pytest-aitest/)
5
+ [![CI](https://github.com/sbroenne/pytest-aitest/actions/workflows/ci.yml/badge.svg)](https://github.com/sbroenne/pytest-aitest/actions/workflows/ci.yml)
6
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
7
+
8
+ ### Agent Contract Testing for MCP Servers and Tools
9
+
10
+ **Behavioural testing for LLM-operated systems.**
11
+
12
+ A pytest plugin for validating whether language models can actually understand and operate your interfaces: MCP servers, agents, prompts, and tools.
13
+
14
+ It tests the *LLM-facing contract* — not just the underlying code.
15
+
16
+ ---
17
+
18
+ ## What Problem This Solves
19
+
20
+ Traditional tests validate deterministic code paths.
21
+ LLM-driven systems fail differently.
22
+
23
+ Your implementation can be correct, fully tested, and deployed — and still fail because the model:
24
+
25
+ - Chooses the wrong tool
26
+ - Supplies incorrect parameters
27
+ - Can't recover from errors
28
+ - Changes behaviour after a prompt or model update
29
+
30
+ These failures don't show up in unit tests, and manual testing doesn't scale.
31
+
32
+ **The root cause:**
33
+ Your real API is no longer just functions and endpoints.
34
+ It is the **LLM-facing interface** — descriptions, schemas, prompts, and error semantics.
35
+
36
+ ---
37
+
38
+ ## Core Idea
39
+
40
+ ### Your test is the prompt.
41
+
42
+ Instead of scripting expected tool calls, you write what a user would say.
43
+
44
+ The model decides:
45
+ - Whether to act
46
+ - Which tool to use
47
+ - How to supply parameters
48
+ - How to respond
49
+
50
+ Your test asserts on the *observed behaviour*.
51
+
52
+ ```python
53
+ @pytest.mark.asyncio
54
+ async def test_trip_planning(aitest_run, weather_agent_factory):
55
+ """User asks for trip advice → LLM should compare forecasts."""
56
+ agent = weather_agent_factory("gpt-5-mini", max_turns=10)
57
+
58
+ # The test IS the prompt
59
+ result = await aitest_run(
60
+ agent,
61
+ "I'm planning a trip and can't decide between Paris and Sydney. "
62
+ "Get me a 3-day forecast for both and recommend which has better "
63
+ "weather for sightseeing. I prefer sunny weather.",
64
+ )
65
+
66
+ assert result.success
67
+ assert result.tool_call_count("get_forecast") >= 2 # Called for both cities
68
+ assert "paris" in result.final_response.lower()
69
+ assert "sydney" in result.final_response.lower()
70
+ ```
71
+
72
+ No mocking. No forced tool calls.
73
+ The model infers everything from the interface you expose.
74
+
75
+ ---
76
+
77
+ ## Features
78
+
79
+ ### Test MCP Servers
80
+
81
+ Run real models against real interfaces:
82
+
83
+ - Tool discovery and selection
84
+ - Parameter inference
85
+ - Multi-step workflows
86
+ - Error handling and recovery
87
+
88
+ ```python
89
+ @pytest.fixture(scope="module")
90
+ def weather_server():
91
+ return MCPServer(
92
+ command=[sys.executable, "-m", "my_weather_mcp"],
93
+ wait=Wait.for_tools(["get_weather", "get_forecast"]),
94
+ )
95
+ ```
96
+
97
+ ### Benchmark Models
98
+
99
+ Compare models using native pytest parametrize:
100
+
101
+ ```python
102
+ @pytest.mark.parametrize("model", ["gpt-5-mini", "gpt-4.1"])
103
+ @pytest.mark.asyncio
104
+ async def test_tool_selection(aitest_run, weather_server, model):
105
+ agent = Agent(
106
+ provider=Provider(model=f"azure/{model}"),
107
+ mcp_servers=[weather_server],
108
+ system_prompt="You are a helpful weather assistant.",
109
+ max_turns=5,
110
+ )
111
+ result = await aitest_run(agent, "What's the weather in Paris?")
112
+ assert result.success
113
+ assert result.tool_was_called("get_weather")
114
+ ```
115
+
116
+ Reports show pass rate, token usage, and cost per model.
117
+
118
+ ### Prompt Arena
119
+
120
+ Compare system prompts head-to-head:
121
+
122
+ ```python
123
+ PROMPTS = load_prompts(Path("tests/integration/prompts/"))
124
+
125
+ @pytest.mark.parametrize("prompt", PROMPTS, ids=lambda p: p.name)
126
+ @pytest.mark.asyncio
127
+ async def test_prompt_effectiveness(aitest_run, weather_server, prompt):
128
+ agent = Agent(
129
+ provider=Provider(model="azure/gpt-5-mini"),
130
+ mcp_servers=[weather_server],
131
+ system_prompt=prompt.system_prompt,
132
+ max_turns=5,
133
+ )
134
+ result = await aitest_run(agent, "What's the weather in Paris?")
135
+ assert result.success
136
+ ```
137
+
138
+ ### Matrix Testing
139
+
140
+ Test every model × prompt combination:
141
+
142
+ ```python
143
+ @pytest.mark.parametrize("model", ["gpt-5-mini", "gpt-4.1"])
144
+ @pytest.mark.parametrize("prompt", PROMPTS, ids=lambda p: p.name)
145
+ @pytest.mark.asyncio
146
+ async def test_matrix(aitest_run, weather_server, model, prompt):
147
+ # Full grid: surface brittle pairings
148
+ ...
149
+ ```
150
+
151
+ ### AI Judge
152
+
153
+ Semantic assertions using LLM evaluation — validate response quality, not just tool usage:
154
+
155
+ ```python
156
+ @pytest.mark.asyncio
157
+ async def test_recommendation_quality(aitest_run, weather_agent_factory, judge):
158
+ agent = weather_agent_factory("gpt-5-mini", max_turns=10)
159
+
160
+ result = await aitest_run(
161
+ agent,
162
+ "Compare weather in Paris and Sydney. Which is better for sightseeing?",
163
+ )
164
+
165
+ assert result.success
166
+ assert judge(result.final_response, """
167
+ - Mentions weather for both Paris and Sydney
168
+ - Makes a recommendation for one city
169
+ - Provides reasoning based on weather data
170
+ """)
171
+ ```
172
+
173
+ Uses [pytest-llm-assert](https://github.com/sbroenne/pytest-llm-assert) under the hood.
174
+
175
+ ### CLI Server
176
+
177
+ Test command-line tools as if they were MCP servers:
178
+
179
+ ```python
180
+ @pytest.fixture(scope="module")
181
+ def git_server():
182
+ return CLIServer(
183
+ name="git",
184
+ command="git",
185
+ tool_prefix="git",
186
+ )
187
+ ```
188
+
189
+ Help is discovered automatically — CLIServer runs the command with `--help` at startup and includes the output in the tool description. Customize with `help_flag="-h"` for CLIs that use a different help flag, or provide a `description` directly for full control.
190
+
191
+ See [CLI Server Guide](docs/cli-server.md) for shell selection, help discovery, and assertions.
192
+
193
+ ---
194
+
195
+ ## Why pytest?
196
+
197
+ This is a **pytest plugin**, not a standalone tool.
198
+
199
+ - Use existing fixtures, markers, and parametrize
200
+ - Works with your CI/CD pipeline
201
+ - No new syntax to learn
202
+ - Combine with other pytest plugins
203
+
204
+ ---
205
+
206
+ ## What This Is Not
207
+
208
+ - A replacement for unit tests
209
+ - A mock-based simulator
210
+ - A guarantee of perfect model behaviour
211
+
212
+ This tool complements traditional testing by covering LLM behaviour, which conventional tests cannot observe.
213
+
214
+ ---
215
+
216
+ ## Who This Is For
217
+
218
+ - MCP server authors
219
+ - Agent and tool builders
220
+ - Teams exposing APIs to LLMs
221
+ - Anyone shipping systems where models operate tools autonomously
222
+
223
+ ---
224
+
225
+ ## Installation
226
+
227
+ ```bash
228
+ pip install pytest-aitest
229
+ ```
230
+
231
+ ## Setup
232
+
233
+ Works out of the box with cloud identity (Azure) or an API key (OpenAI):
234
+
235
+ ```bash
236
+ # Azure (Entra ID)
237
+ export AZURE_API_BASE=https://your-resource.openai.azure.com/
238
+ az login
239
+
240
+ # OpenAI
241
+ export OPENAI_API_KEY=sk-...
242
+ ```
243
+
244
+ Supports 100+ providers via [LiteLLM](https://docs.litellm.ai/docs/providers).
245
+
246
+ ---
247
+
248
+ ## Documentation
249
+
250
+ - **[Configuration](docs/configuration.md)** — Providers, agents, fixtures
251
+ - **[CLI Server](docs/cli-server.md)** — Test CLI tools with help discovery
252
+ - **[MCP Server](docs/mcp-server.md)** — MCP server configuration and wait strategies
253
+ - **[Assertions](docs/assertions.md)** — AgentResult API and AI judge patterns
254
+ - **[Reporting](docs/reporting.md)** — HTML reports and AI summaries
255
+ - **[API Reference](docs/api-reference.md)** — Full API documentation
256
+ - **[Design](docs/DESIGN.md)** — Architecture and design decisions
257
+
258
+ ---
259
+
260
+ ## Coming Soon
261
+
262
+ - **Multi-turn Conversations** — `continue_from()` for stateful sessions
263
+ - **Prompt Templates** — YAML-based prompt management
264
+
265
+ ---
266
+
267
+ ## Related
268
+
269
+ - **[pytest-llm-assert](https://github.com/sbroenne/pytest-llm-assert)** — Semantic assertions for pytest
270
+ - **[Contributing](CONTRIBUTING.md)** — Development setup and guidelines
271
+
272
+ ## Requirements
273
+
274
+ - Python 3.11+
275
+ - pytest 9.0+
276
+ - An LLM provider (Azure, OpenAI, Anthropic, etc.)
277
+
278
+ ## License
279
+
280
+ MIT