pytest-llm-assert 0.1.0__py3-none-any.whl → 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pytest_llm_assert/__init__.py +2 -2
- pytest_llm_assert/core.py +87 -35
- pytest_llm_assert/prompts/system_prompt.md +4 -0
- pytest_llm_assert-0.1.1.dist-info/METADATA +135 -0
- pytest_llm_assert-0.1.1.dist-info/RECORD +9 -0
- pytest_llm_assert-0.1.0.dist-info/METADATA +0 -246
- pytest_llm_assert-0.1.0.dist-info/RECORD +0 -8
- {pytest_llm_assert-0.1.0.dist-info → pytest_llm_assert-0.1.1.dist-info}/WHEEL +0 -0
- {pytest_llm_assert-0.1.0.dist-info → pytest_llm_assert-0.1.1.dist-info}/entry_points.txt +0 -0
- {pytest_llm_assert-0.1.0.dist-info → pytest_llm_assert-0.1.1.dist-info}/licenses/LICENSE +0 -0
pytest_llm_assert/__init__.py CHANGED
@@ -1,6 +1,6 @@
 """pytest-llm-assert: Simple LLM-powered assertions for any pytest test."""

-from pytest_llm_assert.core import LLMAssert
+from pytest_llm_assert.core import AssertionResult, LLMAssert, LLMResponse

-__all__ = ["LLMAssert"]
+__all__ = ["LLMAssert", "AssertionResult", "LLMResponse"]
 __version__ = "0.1.0"
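In 0.1.1 the package root re-exports the result and response types alongside `LLMAssert`. A minimal sketch of how a test suite might use those exports; the fixture mirrors the README's Quick Start, and the type annotations are optional:

```python
import pytest

from pytest_llm_assert import AssertionResult, LLMAssert, LLMResponse


@pytest.fixture
def llm() -> LLMAssert:
    # Any LiteLLM model string works; this one is taken from the README examples.
    return LLMAssert(model="openai/gpt-5-mini")


def test_status_message(llm: LLMAssert) -> None:
    result: AssertionResult = llm("Operation completed successfully", "Does this indicate success?")
    # AssertionResult is truthy when the criterion is met; .reasoning holds the judge's explanation.
    assert result, result.reasoning

    # Metadata about the last LLM call (None before the first call).
    details: LLMResponse | None = llm.response
    assert details is not None
```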
pytest_llm_assert/core.py CHANGED
@@ -2,18 +2,61 @@

 from __future__ import annotations

+import functools
 import json
 import os
 import re
 from dataclasses import dataclass
+from pathlib import Path
 from typing import TYPE_CHECKING, Callable

 import litellm

+# Load default system prompt from file
+_PROMPTS_DIR = Path(__file__).parent / "prompts"
+_DEFAULT_SYSTEM_PROMPT = (_PROMPTS_DIR / "system_prompt.md").read_text().strip()
+
 if TYPE_CHECKING:
     from typing import Any


+@functools.cache
+def _get_azure_ad_token_provider() -> Callable[[], str] | None:
+    """Get Azure AD token provider for Entra ID authentication.
+
+    Uses LiteLLM's built-in helper which leverages DefaultAzureCredential.
+    Cached at module level to avoid recreating credentials on each call.
+    """
+    try:
+        from litellm.secret_managers.get_azure_ad_token_provider import (
+            get_azure_ad_token_provider,
+        )
+
+        return get_azure_ad_token_provider()
+    except ImportError:
+        # azure-identity not installed
+        return None
+    except Exception:
+        # Credential not available
+        return None
+
+
+@dataclass(slots=True)
+class LLMResponse:
+    """Response details from the last LLM call.
+
+    Access via `llm.response` after making an assertion call.
+    """
+
+    model: str | None = None
+    prompt_tokens: int | None = None
+    completion_tokens: int | None = None
+    total_tokens: int | None = None
+    cost: float | None = None
+    response_id: str | None = None
+    created: int | None = None
+
+
 @dataclass(slots=True)
 class AssertionResult:
     """Result of an LLM assertion with rich repr for pytest."""
@@ -57,7 +100,8 @@ class LLMAssert:

         Args:
             model: LiteLLM model string (e.g., "openai/gpt-5-mini", "azure/gpt-4o")
-            api_key: API key (supports ${ENV_VAR} expansion).
+            api_key: API key (supports ${ENV_VAR} expansion).
+                For Azure, leave empty to use Entra ID.
             api_base: Custom API base URL (required for Azure)
             **kwargs: Additional parameters passed to LiteLLM
         """
@@ -66,10 +110,12 @@ class LLMAssert:
         self.api_base = api_base
         self.kwargs = kwargs
         self._azure_ad_token_provider: Callable[[], str] | None = None
+        self._system_prompt: str = _DEFAULT_SYSTEM_PROMPT
+        self.response: LLMResponse | None = None

         # Auto-configure Azure Entra ID when no API key is provided
         if self._is_azure_model() and not self._has_azure_api_key():
-            self._azure_ad_token_provider =
+            self._azure_ad_token_provider = _get_azure_ad_token_provider()

     def _is_azure_model(self) -> bool:
         """Check if the model is an Azure OpenAI model."""
@@ -79,28 +125,19 @@ class LLMAssert:
         """Check if an Azure API key is available."""
         return bool(self.api_key or os.environ.get("AZURE_API_KEY"))

-    @
-    def
-    """Get
-
-
-
-
-
-
+    @property
+    def system_prompt(self) -> str:
+        """Get the system prompt used for LLM assertions."""
+        return self._system_prompt
+
+    @system_prompt.setter
+    def system_prompt(self, value: str) -> None:
+        """Set a custom system prompt for LLM assertions.
+
+        The prompt should instruct the LLM to evaluate content against a criterion
+        and respond in JSON format with 'result' (PASS/FAIL) and 'reasoning' keys.
         """
-
-            from litellm.secret_managers.get_azure_ad_token_provider import (
-                get_azure_ad_token_provider,
-            )
-
-            return get_azure_ad_token_provider()
-        except ImportError:
-            # azure-identity not installed
-            return None
-        except Exception:
-            # Credential not available
-            return None
+        self._system_prompt = value

     @staticmethod
     def _expand_env(value: str) -> str:
@@ -116,7 +153,7 @@ class LLMAssert:
         return text[: max_len - 3] + "..."

     def _call_llm(self, messages: list[dict[str, str]]) -> str:
-        """Call the LLM and return response content."""
+        """Call the LLM and return response content. Updates self.response."""
         kwargs = {**self.kwargs}

         # Use Azure AD token provider if configured (Entra ID auth)
@@ -130,7 +167,27 @@ class LLMAssert:
             api_base=self.api_base,
             **kwargs,
         )
-
+        content = response.choices[0].message.content or ""  # type: ignore[union-attr]
+
+        # Store response details on instance
+        self.response = LLMResponse(
+            model=getattr(response, "model", None),
+            response_id=getattr(response, "id", None),
+            created=getattr(response, "created", None),
+        )
+
+        # Extract usage info
+        usage = getattr(response, "usage", None)
+        if usage:
+            self.response.prompt_tokens = getattr(usage, "prompt_tokens", None)
+            self.response.completion_tokens = getattr(usage, "completion_tokens", None)
+            self.response.total_tokens = getattr(usage, "total_tokens", None)
+
+        # Extract cost from hidden params (litellm calculates this)
+        if hasattr(response, "_hidden_params"):
+            self.response.cost = response._hidden_params.get("response_cost")
+
+        return content

     def __call__(self, content: str, criterion: str) -> AssertionResult:
         """Evaluate if content meets the given criterion.
@@ -145,12 +202,7 @@ class LLMAssert:
         messages = [
             {
                 "role": "system",
-                "content":
-                    "You are an assertion evaluator. "
-                    "Evaluate if the given content meets the specified criterion.\n\n"
-                    "Respond in JSON format:\n"
-                    '{"result": "PASS" or "FAIL", "reasoning": "brief explanation"}'
-                ),
+                "content": self._system_prompt,
             },
             {
                 "role": "user",
@@ -158,12 +210,12 @@ class LLMAssert:
             },
         ]

-
+        response_text = self._call_llm(messages)

         # Parse JSON response
         try:
             # Handle potential markdown code blocks
-            text =
+            text = response_text.strip()
             if text.startswith("```"):
                 text = text.split("```")[1]
             if text.startswith("json"):
@@ -173,10 +225,10 @@ class LLMAssert:
             reasoning = data.get("reasoning", "")
         except (json.JSONDecodeError, AttributeError):
             # Fallback to line-based parsing
-            lines =
+            lines = response_text.strip().split("\n", 1)
             first_line = lines[0].strip().upper()
             passed = first_line in ("PASS", "YES", "TRUE", "PASSED")
-            reasoning = lines[1].strip() if len(lines) > 1 else
+            reasoning = lines[1].strip() if len(lines) > 1 else response_text

         return AssertionResult(
             passed=passed,
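The core.py changes above add two user-facing pieces: a `system_prompt` property backed by the bundled `prompts/system_prompt.md`, and an `LLMResponse` snapshot stored on the instance after every call. A hedged sketch of both, assuming an `LLMAssert` instance configured as in the README; the custom prompt text is illustrative, and the token and cost fields may be `None` when the provider does not report usage:

```python
from pytest_llm_assert import LLMAssert

llm = LLMAssert(model="openai/gpt-5-mini")

# Optionally replace the bundled default prompt. Per the setter's docstring, the
# prompt should ask for JSON with "result" (PASS/FAIL) and "reasoning" keys.
llm.system_prompt = (
    "You are an assertion evaluator for API error messages. "
    "Evaluate if the given content meets the specified criterion.\n\n"
    "Respond in JSON format:\n"
    '{"result": "PASS" or "FAIL", "reasoning": "brief explanation"}'
)

result = llm("Error: connection refused", "Does this indicate a failure?")
assert result, result.reasoning

# After any assertion call, llm.response carries the last call's metadata.
if llm.response is not None:
    print(llm.response.model, llm.response.response_id)
    print(llm.response.prompt_tokens, llm.response.completion_tokens, llm.response.total_tokens)
    print(llm.response.cost)  # litellm's estimated cost for the call, when available
```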
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pytest-llm-assert
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: Simple LLM-powered assertions for any pytest test
|
|
5
|
+
Project-URL: Homepage, https://github.com/sbroenne/pytest-llm-assert
|
|
6
|
+
Project-URL: Documentation, https://github.com/sbroenne/pytest-llm-assert#readme
|
|
7
|
+
Project-URL: Repository, https://github.com/sbroenne/pytest-llm-assert
|
|
8
|
+
Author: Stefan Broenner
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: ai,assertions,llm,pytest,testing
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Framework :: Pytest
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
20
|
+
Requires-Python: >=3.11
|
|
21
|
+
Requires-Dist: azure-identity>=1.25
|
|
22
|
+
Requires-Dist: litellm>=1.81
|
|
23
|
+
Requires-Dist: pytest>=9.0
|
|
24
|
+
Provides-Extra: dev
|
|
25
|
+
Requires-Dist: pre-commit>=4.5; extra == 'dev'
|
|
26
|
+
Requires-Dist: pyright>=1.1.408; extra == 'dev'
|
|
27
|
+
Requires-Dist: pytest-cov>=6.0; extra == 'dev'
|
|
28
|
+
Requires-Dist: pytest>=9.0; extra == 'dev'
|
|
29
|
+
Requires-Dist: python-dotenv>=1.2; extra == 'dev'
|
|
30
|
+
Requires-Dist: ruff>=0.14; extra == 'dev'
|
|
31
|
+
Description-Content-Type: text/markdown
|
|
32
|
+
|
|
33
|
+
# pytest-llm-assert
|
|
34
|
+
|
|
35
|
+
[](https://pypi.org/project/pytest-llm-assert/)
|
|
36
|
+
[](https://pypi.org/project/pytest-llm-assert/)
|
|
37
|
+
[](https://github.com/sbroenne/pytest-llm-assert/actions/workflows/ci.yml)
|
|
38
|
+
[](https://opensource.org/licenses/MIT)
|
|
39
|
+
|
|
40
|
+
**Natural language assertions for pytest.**
|
|
41
|
+
|
|
42
|
+
Testing a text-to-SQL agent? Validating LLM-generated code? Checking if error messages are helpful? Now you can:
|
|
43
|
+
|
|
44
|
+
```python
|
|
45
|
+
def test_sql_agent_output(llm):
|
|
46
|
+
sql = my_agent.generate("Get names of users over 21")
|
|
47
|
+
|
|
48
|
+
assert llm(sql, "Is this a valid SQL query that selects user names filtered by age > 21?")
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
The LLM evaluates your criterion and returns pass/fail — no regex, no parsing, no exact string matching.
|
|
52
|
+
|
|
53
|
+
## Features
|
|
54
|
+
|
|
55
|
+
- **Semantic assertions** — Assert meaning, not exact strings
|
|
56
|
+
- **100+ LLM providers** — OpenAI, Azure, Anthropic, Ollama, Vertex AI, Bedrock via [LiteLLM](https://docs.litellm.ai/)
|
|
57
|
+
- **pytest native** — Works as a standard pytest plugin/fixture
|
|
58
|
+
- **Response introspection** — Access tokens, cost, and reasoning via `llm.response`
|
|
59
|
+
|
|
60
|
+
## Installation
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
pip install pytest-llm-assert
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## Quick Start
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
# conftest.py
|
|
70
|
+
import pytest
|
|
71
|
+
from pytest_llm_assert import LLMAssert
|
|
72
|
+
|
|
73
|
+
@pytest.fixture
|
|
74
|
+
def llm():
|
|
75
|
+
return LLMAssert(model="openai/gpt-5-mini")
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
# test_my_agent.py
|
|
80
|
+
def test_generated_sql_is_correct(llm):
|
|
81
|
+
sql = "SELECT name FROM users WHERE age > 21 ORDER BY name"
|
|
82
|
+
assert llm(sql, "Is this a valid SELECT query that returns names of users over 21?")
|
|
83
|
+
|
|
84
|
+
def test_error_message_is_helpful(llm):
|
|
85
|
+
error = "ValidationError: 'port' must be an integer, got 'abc'"
|
|
86
|
+
assert llm(error, "Does this explain what went wrong and how to fix it?")
|
|
87
|
+
|
|
88
|
+
def test_summary_captures_key_points(llm):
|
|
89
|
+
summary = generate_summary(document)
|
|
90
|
+
assert llm(summary, "Does this mention the contract duration and parties involved?")
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
## Setup
|
|
94
|
+
|
|
95
|
+
Works out of the box with cloud identity — no API keys to manage:
|
|
96
|
+
|
|
97
|
+
```bash
|
|
98
|
+
# Azure (Entra ID)
|
|
99
|
+
export AZURE_API_BASE=https://your-resource.openai.azure.com
|
|
100
|
+
az login
|
|
101
|
+
|
|
102
|
+
# Google Cloud (Vertex AI)
|
|
103
|
+
gcloud auth application-default login
|
|
104
|
+
|
|
105
|
+
# AWS (Bedrock)
|
|
106
|
+
aws configure # Uses IAM credentials
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
Supports 100+ providers via [LiteLLM](https://docs.litellm.ai/docs/providers) — including API key auth for OpenAI, Anthropic, Ollama, and more.
|
|
110
|
+
|
|
111
|
+
## Documentation
|
|
112
|
+
|
|
113
|
+
- **[Configuration](docs/configuration.md)** — All providers, CLI options, environment variables
|
|
114
|
+
- **[API Reference](docs/api-reference.md)** — Full API documentation
|
|
115
|
+
- **[Comparing Judge Models](docs/comparing-models.md)** — Evaluate which LLM works best for your assertions
|
|
116
|
+
- **[Examples](examples/)** — Working pytest examples
|
|
117
|
+
|
|
118
|
+
## Related
|
|
119
|
+
|
|
120
|
+
- **[pytest-aitest](https://github.com/sbroenne/pytest-aitest)** — Full framework for testing MCP servers, CLIs, and AI agents
|
|
121
|
+
- **[Contributing](CONTRIBUTING.md)** — Development setup and guidelines
|
|
122
|
+
|
|
123
|
+
## Requirements
|
|
124
|
+
|
|
125
|
+
- Python 3.11+
|
|
126
|
+
- pytest 8.0+
|
|
127
|
+
- An LLM (OpenAI, Azure, Anthropic, etc.) or local [Ollama](https://ollama.ai/)
|
|
128
|
+
|
|
129
|
+
## Security
|
|
130
|
+
|
|
131
|
+
- **Sensitive data**: Test content is sent to LLM providers — consider data policies
|
|
132
|
+
|
|
133
|
+
## License
|
|
134
|
+
|
|
135
|
+
MIT
|
|
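The Setup section above leans on the 0.1.1 behaviour of falling back to Entra ID when no Azure API key is configured. A sketch of a matching `conftest.py`, assuming an Azure OpenAI deployment named `gpt-4o` and a prior `az login`; the deployment name and endpoint are placeholders for your own resource:

```python
# conftest.py (illustrative): Azure OpenAI via Entra ID, no API key.
import pytest

from pytest_llm_assert import LLMAssert


@pytest.fixture
def llm() -> LLMAssert:
    # With an azure/* model and no api_key or AZURE_API_KEY set, 0.1.1 wires up
    # a DefaultAzureCredential-based token provider automatically.
    return LLMAssert(
        model="azure/gpt-4o",
        api_base="https://your-resource.openai.azure.com",
    )
```

Exporting `AZURE_API_BASE` in the environment, as in the bash block above, can be used instead of passing `api_base` explicitly.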
pytest_llm_assert-0.1.1.dist-info/RECORD ADDED
@@ -0,0 +1,9 @@
+pytest_llm_assert/__init__.py,sha256=OcOVlsvqZBBxMzrQssLbaWVkc4qKSvdOMtLVibzDfFQ,233
+pytest_llm_assert/core.py,sha256=0JOttHcZrJF2rA3xwG8BaCBIiEWvQCxFvT5HsYROygg,7983
+pytest_llm_assert/plugin.py,sha256=g3sotHAeUXMuOsFQdaoIbn0CY24i-1CPv0EglrC5qtE,1327
+pytest_llm_assert/prompts/system_prompt.md,sha256=RhSaYrpOjVcVwuG_af_Q50kHFhqXGOCKzubSYBXFzTA,181
+pytest_llm_assert-0.1.1.dist-info/METADATA,sha256=U6PqKG5y_YOvErHyoehbseZyLL91Lr6lcGmS1HUkuDg,4723
+pytest_llm_assert-0.1.1.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+pytest_llm_assert-0.1.1.dist-info/entry_points.txt,sha256=YEYg83TT6znVYdvFvZHJEOJ8XsZbcrqV9pY8uM-ThQE,49
+pytest_llm_assert-0.1.1.dist-info/licenses/LICENSE,sha256=wHrdHpzRm4rdlyMdj-sQw7aou6kHPujW0VmRBEhInJ8,1072
+pytest_llm_assert-0.1.1.dist-info/RECORD,,
pytest_llm_assert-0.1.0.dist-info/METADATA DELETED
@@ -1,246 +0,0 @@
-Metadata-Version: 2.4
-Name: pytest-llm-assert
-Version: 0.1.0
-Summary: Simple LLM-powered assertions for any pytest test
-Project-URL: Homepage, https://github.com/sbroenne/pytest-llm-assert
-Project-URL: Documentation, https://github.com/sbroenne/pytest-llm-assert#readme
-Project-URL: Repository, https://github.com/sbroenne/pytest-llm-assert
-Author: Stefan Broenner
-License-Expression: MIT
-License-File: LICENSE
-Keywords: ai,assertions,llm,pytest,testing
-Classifier: Development Status :: 3 - Alpha
-Classifier: Framework :: Pytest
-Classifier: Intended Audience :: Developers
-Classifier: License :: OSI Approved :: MIT License
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.11
-Classifier: Programming Language :: Python :: 3.12
-Classifier: Programming Language :: Python :: 3.13
-Requires-Python: >=3.11
-Requires-Dist: azure-identity>=1.15
-Requires-Dist: litellm>=1.55
-Requires-Dist: pytest>=8.0
-Provides-Extra: dev
-Requires-Dist: pyright>=1.1; extra == 'dev'
-Requires-Dist: pytest>=8.0; extra == 'dev'
-Requires-Dist: python-dotenv>=1.0; extra == 'dev'
-Requires-Dist: ruff>=0.8; extra == 'dev'
-Description-Content-Type: text/markdown
-
-# pytest-llm-assert
-
-**Natural language assertions for pytest.**
-
-A pytest plugin that lets you write semantic assertions using LLMs. Stop writing brittle string checks — let an LLM understand what you actually mean.
-
-## The Problem
-
-```python
-# ❌ These all fail even though they mean "success":
-assert "success" in response  # Fails on "Succeeded", "successful", "It worked!"
-assert response == "Operation completed successfully"  # Exact match? Really?
-assert re.match(r"success|succeeded|worked", response, re.I)  # Regex hell
-```
-
-```python
-# You're testing a text-to-SQL agent. How do you validate the output?
-
-# ❌ Exact match? There are many valid ways to write the same query:
-assert sql == "SELECT name FROM users WHERE age > 21"
-
-# ❌ Regex? Good luck covering all valid SQL syntax:
-assert re.match(r"SELECT\s+name\s+FROM\s+users", sql, re.I)
-
-# ❌ Parse it? Now you need a SQL parser as a test dependency:
-assert sqlparse.parse(sql)[0].get_type() == "SELECT"
-```
-
-## The Solution
-
-```python
-# ✅ Just say what you mean:
-assert llm(response, "Does this indicate the operation succeeded?")
-assert llm(sql, "Is this a valid SELECT query that returns user names for users over 21?")
-```
-
-## Why This Works
-
-The LLM evaluates your criterion against the content and returns a judgment. It understands:
-
-- **Synonyms**: "success", "succeeded", "worked", "completed" all mean the same thing
-- **Semantics**: Two SQL queries can be equivalent even with different syntax
-- **Context**: "The operation failed successfully" is actually a failure
-- **Intent**: Generated code can be correct even if it's not identical to a reference
-
-
-## Installation
-
-```bash
-pip install pytest-llm-assert
-```
-
-## Setup
-
-This library uses [LiteLLM](https://docs.litellm.ai/) under the hood, giving you access to **100+ LLM providers** with a unified API.
-
-```bash
-# OpenAI
-export OPENAI_API_KEY=sk-...
-
-# Azure OpenAI with Entra ID (no API keys)
-export AZURE_API_BASE=https://your-resource.openai.azure.com
-export AZURE_API_VERSION=2024-02-15-preview
-# Uses DefaultAzureCredential: az login, managed identity, etc.
-
-# Ollama (local)
-# Just run: ollama serve
-```
-
-See [LiteLLM docs](https://docs.litellm.ai/docs/providers) for all providers including Vertex AI, Bedrock, Anthropic, and more.
-
-## Quick Start
-
-```python
-from pytest_llm_assert import LLMAssert
-
-llm = LLMAssert(model="openai/gpt-5-mini")  # Uses OPENAI_API_KEY from env
-
-# Semantic assertions - returns True/False
-assert llm("Operation completed successfully", "Does this indicate success?")
-assert llm("Error: connection refused", "Does this indicate a failure?")
-assert not llm("All tests passed", "Does this indicate a failure?")
-```
-
-## Real Examples
-
-First, create a fixture in `conftest.py`:
-
-```python
-# conftest.py
-import pytest
-from pytest_llm_assert import LLMAssert
-
-@pytest.fixture
-def llm():
-    return LLMAssert(model="openai/gpt-5-mini")
-```
-
-Then use it in your tests:
-
-### Testing Error Messages
-
-```python
-def test_validation_error_is_helpful(llm):
-    """Error messages should explain the problem clearly."""
-    error_msg = "ValidationError: 'port' must be an integer, got 'not-a-number'"
-
-    assert llm(error_msg, "Does this explain that port must be a number?")
-    assert llm(error_msg, "Does this indicate which field failed validation?")
-```
-
-### Testing Generated SQL
-
-```python
-def test_query_builder_generates_valid_sql(llm):
-    """Query builder should produce semantically correct SQL."""
-    query = "SELECT name FROM users WHERE age > 21 ORDER BY name"
-
-    assert llm(query, "Is this a valid SELECT query that returns names of users over 21?")
-```
-
-### Testing LLM Output
-
-```python
-def test_summary_is_comprehensive(llm):
-    """Generated summaries should capture key points."""
-    summary = "The contract establishes a 2-year service agreement between..."
-
-    assert llm(summary, "Does this summarize a legal contract?")
-    assert llm(summary, "Does this mention the contract duration?")
-```
-
-## Comparing Judge Models
-
-Not sure which LLM to use as your assertion judge? Run the same tests against multiple models to find the best one for your use case:
-
-```python
-import pytest
-from pytest_llm_assert import LLMAssert
-
-MODELS = ["openai/gpt-5-mini", "anthropic/claude-sonnet-4-20250514", "ollama/llama3.1:8b"]
-
-@pytest.fixture(params=MODELS)
-def llm(request):
-    return LLMAssert(model=request.param)
-
-def test_validates_sql_equivalence(llm):
-    """Test which models can judge SQL semantic equivalence."""
-    sql = "SELECT u.name FROM users AS u WHERE u.age >= 22"
-    assert llm(sql, "Is this equivalent to selecting names of users over 21?")
-```
-
-Output shows which judge models correctly evaluate your criterion:
-```
-test_validates_sql_equivalence[openai/gpt-5-mini] PASSED
-test_validates_sql_equivalence[anthropic/claude-sonnet-4-20250514] PASSED
-test_validates_sql_equivalence[ollama/llama3.1:8b] FAILED
-```
-
-> **Note:** This tests which LLM makes a good *judge* for your assertions. To test AI agents themselves (e.g., "does my coding agent produce working code?"), see [pytest-aitest](https://github.com/sbroenne/pytest-aitest).
-
-## Configuration
-
-### Programmatic
-
-```python
-from pytest_llm_assert import LLMAssert
-
-llm = LLMAssert(
-    model="openai/gpt-5-mini",
-    api_key="sk-...",  # Or use env var
-    api_base="https://...",  # Custom endpoint
-)
-```
-
-### CLI Options
-
-```bash
-pytest --llm-model=openai/gpt-5-mini
-pytest --llm-api-key='${OPENAI_API_KEY}'  # Env var expansion
-pytest --llm-api-base=http://localhost:8080
-```
-
-### Environment Variables
-
-```bash
-export OPENAI_API_KEY=sk-...
-export LLM_MODEL=openai/gpt-5-mini
-```
-
-## API Reference
-
-### `LLMAssert(model, api_key=None, api_base=None, **kwargs)`
-
-Create an LLM assertion helper.
-
-- `model`: LiteLLM model string (e.g., `"openai/gpt-5-mini"`, `"azure/gpt-4o"`)
-- `api_key`: Optional API key (or use environment variables)
-- `api_base`: Optional custom endpoint
-- `**kwargs`: Additional parameters passed to LiteLLM
-
-### `llm(content, criterion) -> AssertionResult`
-
-Evaluate if content meets the criterion.
-
-- Returns `AssertionResult` which is truthy if criterion is met
-- Access `.reasoning` for the LLM's explanation
-
-## See Also
-
-- **[Examples](examples/)** — Example pytest tests showing basic usage, model comparison, and fixture patterns
-- **[pytest-aitest](https://github.com/sbroenne/pytest-aitest)** — Full framework for testing MCP servers, CLIs, and AI agents. Uses pytest-llm-assert for the judge.
-
-## License
-
-MIT
pytest_llm_assert-0.1.0.dist-info/RECORD DELETED
@@ -1,8 +0,0 @@
-pytest_llm_assert/__init__.py,sha256=gp_z4g6Yf9SnjwEyZc6kPSqEWw2Nyb5er84HRuUaXCA,169
-pytest_llm_assert/core.py,sha256=sDQvcus5EqHQ-_iQyLH2XB9nL4UhLpiWGTXnGhO7YyE,6351
-pytest_llm_assert/plugin.py,sha256=g3sotHAeUXMuOsFQdaoIbn0CY24i-1CPv0EglrC5qtE,1327
-pytest_llm_assert-0.1.0.dist-info/METADATA,sha256=cGK3fmb5T0ZKOBtM0PkmnRkAaGnLZ1aEDhBD5U8-1UQ,7713
-pytest_llm_assert-0.1.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-pytest_llm_assert-0.1.0.dist-info/entry_points.txt,sha256=YEYg83TT6znVYdvFvZHJEOJ8XsZbcrqV9pY8uM-ThQE,49
-pytest_llm_assert-0.1.0.dist-info/licenses/LICENSE,sha256=wHrdHpzRm4rdlyMdj-sQw7aou6kHPujW0VmRBEhInJ8,1072
-pytest_llm_assert-0.1.0.dist-info/RECORD,,
{pytest_llm_assert-0.1.0.dist-info → pytest_llm_assert-0.1.1.dist-info}/WHEEL: File without changes
{pytest_llm_assert-0.1.0.dist-info → pytest_llm_assert-0.1.1.dist-info}/entry_points.txt: File without changes
{pytest_llm_assert-0.1.0.dist-info → pytest_llm_assert-0.1.1.dist-info}/licenses/LICENSE: File without changes