genassert 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- genassert/__init__.py +43 -0
- genassert/_embed.py +113 -0
- genassert/assertions/__init__.py +1 -0
- genassert/assertions/budget.py +62 -0
- genassert/assertions/hallucination.py +89 -0
- genassert/assertions/intent.py +62 -0
- genassert/assertions/language.py +145 -0
- genassert/assertions/pii.py +107 -0
- genassert/assertions/readability.py +135 -0
- genassert/assertions/schema.py +100 -0
- genassert/assertions/sentiment.py +160 -0
- genassert/assertions/similarity.py +57 -0
- genassert/assertions/tone.py +93 -0
- genassert/baseline.py +142 -0
- genassert/judge.py +185 -0
- genassert/plugin.py +74 -0
- genassert-0.2.0.dist-info/METADATA +452 -0
- genassert-0.2.0.dist-info/RECORD +21 -0
- genassert-0.2.0.dist-info/WHEEL +4 -0
- genassert-0.2.0.dist-info/entry_points.txt +2 -0
- genassert-0.2.0.dist-info/licenses/LICENSE +21 -0
genassert/judge.py
ADDED
@@ -0,0 +1,185 @@
"""
LocalJudge: A lightweight local LLM judge for semantic evaluation.

Uses a small local model (via transformers or llama-cpp-python) so that
CI runs have zero API cost. Falls back to embedding-based scoring if
no local model is available.
"""

from __future__ import annotations
import os
from dataclasses import dataclass


@dataclass
class JudgeResult:
    passed: bool
    score: float  # 0.0–1.0
    reasoning: str
    method: str  # "local_model" | "embeddings" | "fallback"


class LocalJudge:
    """
    A reusable judge that evaluates LLM outputs against criteria.

    Parameters
    ----------
    model:
        Local model identifier. Defaults to GENASSERT_JUDGE_MODEL env var
        or "Qwen/Qwen2.5-0.5B-Instruct" (tiny, fast, free).
    threshold:
        Default pass threshold (0–1). Default 0.7.
    backend:
        "transformers" | "llama_cpp" | "embeddings" (auto-detected).

    Examples
    --------
    >>> judge = LocalJudge()
    >>> result = judge.evaluate(
    ...     response="Paris is the capital of France.",
    ...     criterion="The response correctly answers a geography question.",
    ... )
    >>> assert result.passed
    """

    def __init__(
        self,
        model: str | None = None,
        threshold: float = 0.7,
        backend: str = "auto",
    ) -> None:
        self.model = model or os.environ.get(
            "GENASSERT_JUDGE_MODEL", "Qwen/Qwen2.5-0.5B-Instruct"
        )
        self.threshold = threshold
        self.backend = backend
        self._pipeline = None

    def evaluate(
        self,
        response: str,
        criterion: str,
        threshold: float | None = None,
    ) -> JudgeResult:
        """
        Evaluate whether `response` meets the `criterion`.

        Parameters
        ----------
        response:
            The LLM output to evaluate.
        criterion:
            A plain-English description of what constitutes a passing response.
        threshold:
            Override the instance default threshold.

        Returns
        -------
        JudgeResult
            Contains passed (bool), score (float), reasoning (str), method (str).
        """
        cutoff = threshold if threshold is not None else self.threshold

        if self.backend in ("transformers", "auto"):
            try:
                return self._evaluate_transformers(response, criterion, cutoff)
            except ImportError:
                pass

        if self.backend in ("llama_cpp", "auto"):
            try:
                return self._evaluate_llama_cpp(response, criterion, cutoff)
            except ImportError:
                pass

        # Embedding-based fallback
        return self._evaluate_embeddings(response, criterion, cutoff)

    def _evaluate_transformers(
        self, response: str, criterion: str, threshold: float
    ) -> JudgeResult:
        from transformers import pipeline as hf_pipeline

        if self._pipeline is None:
            self._pipeline = hf_pipeline(
                "text-generation",
                model=self.model,
                max_new_tokens=64,
                do_sample=False,
            )

        prompt = _build_judge_prompt(response, criterion)
        output = self._pipeline(prompt)[0]["generated_text"]
        score, reasoning = _parse_judge_output(output, prompt)
        return JudgeResult(
            passed=score >= threshold,
            score=score,
            reasoning=reasoning,
            method="local_model",
        )

    def _evaluate_llama_cpp(
        self, response: str, criterion: str, threshold: float
    ) -> JudgeResult:
        from llama_cpp import Llama

        if self._pipeline is None:
            self._pipeline = Llama.from_pretrained(repo_id=self.model)

        prompt = _build_judge_prompt(response, criterion)
        output = self._pipeline(prompt, max_tokens=64)["choices"][0]["text"]
        score, reasoning = _parse_judge_output(output, "")
        return JudgeResult(
            passed=score >= threshold,
            score=score,
            reasoning=reasoning,
            method="local_model",
        )

    def _evaluate_embeddings(
        self, response: str, criterion: str, threshold: float
    ) -> JudgeResult:
        from genassert._embed import embed_text
        from genassert.assertions.intent import _cosine_similarity

        score = _cosine_similarity(embed_text(response), embed_text(criterion))
        return JudgeResult(
            passed=score >= threshold,
            score=score,
            reasoning=f"Embedding cosine similarity: {score:.3f}",
            method="embeddings",
        )


def _build_judge_prompt(response: str, criterion: str) -> str:
    return (
        f"You are an objective evaluator. Score the following response.\n\n"
        f"CRITERION: {criterion}\n\n"
        f"RESPONSE: {response}\n\n"
        f"Rate on a scale of 0.0 to 1.0 and give one-sentence reasoning.\n"
        f"Format: SCORE: <float> | REASON: <text>\n"
        f"SCORE:"
    )


def _parse_judge_output(output: str, prompt: str) -> tuple[float, str]:
    """Parse 'SCORE: 0.8 | REASON: ...' from judge output."""
    # Remove prompt prefix if present
    text = output.replace(prompt, "").strip()
    if "SCORE:" in text:
        text = text.split("SCORE:")[-1].strip()

    score = 0.5
    reasoning = text

    try:
        parts = text.split("|", 1)
        score_str = parts[0].strip()
        score = max(0.0, min(1.0, float(score_str)))
        if len(parts) > 1:
            reasoning = parts[1].replace("REASON:", "").strip()
    except (ValueError, IndexError):
        pass

    return score, reasoning
genassert/plugin.py
ADDED
@@ -0,0 +1,74 @@
"""
pytest plugin for genassert.

Auto-registered via entry_points in pyproject.toml.
Provides fixtures and marks for LLM tests.
"""

from __future__ import annotations
import pytest


def pytest_configure(config):
    config.addinivalue_line(
        "markers",
        "llm: mark test as an LLM semantic test (may call embedding APIs)",
    )
    config.addinivalue_line(
        "markers",
        "llm_slow: mark test as slow — uses local judge model inference",
    )


def pytest_addoption(parser):
    group = parser.getgroup("genassert")
    group.addoption(
        "--record-baselines",
        action="store_true",
        default=False,
        help="Record (or overwrite) golden baselines for all LLM tests.",
    )
    group.addoption(
        "--llm-threshold",
        type=float,
        default=None,
        help="Override default similarity threshold for all assertions.",
    )
    group.addoption(
        "--skip-llm",
        action="store_true",
        default=False,
        help="Skip all tests marked with @pytest.mark.llm.",
    )


def pytest_collection_modifyitems(config, items):
    if config.getoption("--skip-llm"):
        skip_llm = pytest.mark.skip(reason="--skip-llm flag set")
        for item in items:
            if "llm" in item.keywords:
                item.add_marker(skip_llm)


@pytest.fixture
def llm_record(request):
    """
    Fixture: returns True if --record-baselines flag is set.

    Usage:
        def test_something(llm_record):
            response = my_llm_call()
            if llm_record:
                record_baseline("my_test", response)
            else:
                compare_baseline("my_test", response)
    """
    return request.config.getoption("--record-baselines", default=False)


@pytest.fixture
def llm_threshold(request):
    """
    Fixture: returns the global threshold override (or None).
    """
    return request.config.getoption("--llm-threshold", default=None)
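
A minimal sketch of a test that consumes the `llm_threshold` fixture above, assuming the `assert_intent` helper and its `threshold` keyword documented in the package README; `my_bot` is a hypothetical application object used purely for illustration:

```python
import pytest
from genassert import assert_intent

@pytest.mark.llm
def test_refund_reply(llm_threshold):
    # Hypothetical application call, stands in for the code under test.
    response = my_bot.reply("I want a refund")
    assert_intent(
        response,
        "an offer to process the customer's refund",
        threshold=llm_threshold or 0.72,  # --llm-threshold wins when set
    )
```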
genassert-0.2.0.dist-info/METADATA
ADDED
@@ -0,0 +1,452 @@
Metadata-Version: 2.4
Name: genassert
Version: 0.2.0
Summary: pytest-native semantic testing for LLM and generative AI applications. No servers. No SaaS. Works with OpenAI, Anthropic, LiteLLM and any LLM client.
Project-URL: Homepage, https://github.com/genassert/genassert
Project-URL: Documentation, https://genassert.readthedocs.io
Project-URL: Repository, https://github.com/genassert/genassert
Project-URL: Bug Tracker, https://github.com/genassert/genassert/issues
Project-URL: Changelog, https://github.com/genassert/genassert/blob/main/CHANGELOG.md
Author: genassert contributors
License: MIT
License-File: LICENSE
Keywords: agent testing,ai quality assurance,ai testing,anthropic,claude testing,gen ai,genai testing,generative ai,generative ai testing,golden baseline,gpt testing,hallucination detection,langchain,llm,llm assertions,llm evaluation,llm quality,llm testing,machine learning testing,openai,prompt testing,pytest,pytest plugin,rag testing,regression testing,semantic assertions,semantic testing,testing
Classifier: Development Status :: 4 - Beta
Classifier: Framework :: Pytest
Classifier: Intended Audience :: Developers
Classifier: Intended Audience :: Science/Research
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Classifier: Topic :: Software Development :: Testing
Classifier: Typing :: Typed
Requires-Python: >=3.9
Provides-Extra: all
Requires-Dist: jsonschema>=4.0.0; extra == 'all'
Requires-Dist: openai>=1.0.0; extra == 'all'
Requires-Dist: pydantic>=2.0.0; extra == 'all'
Requires-Dist: sentence-transformers>=2.7.0; extra == 'all'
Requires-Dist: tiktoken>=0.5.0; extra == 'all'
Provides-Extra: dev
Requires-Dist: mypy; extra == 'dev'
Requires-Dist: openai>=1.0.0; extra == 'dev'
Requires-Dist: pydantic>=2.0.0; extra == 'dev'
Requires-Dist: pytest-cov; extra == 'dev'
Requires-Dist: pytest>=7.0.0; extra == 'dev'
Requires-Dist: ruff; extra == 'dev'
Requires-Dist: sentence-transformers>=2.7.0; extra == 'dev'
Provides-Extra: jsonschema
Requires-Dist: jsonschema>=4.0.0; extra == 'jsonschema'
Provides-Extra: judge
Requires-Dist: torch>=2.0.0; extra == 'judge'
Requires-Dist: transformers>=4.40.0; extra == 'judge'
Provides-Extra: local
Requires-Dist: sentence-transformers>=2.7.0; extra == 'local'
Provides-Extra: openai
Requires-Dist: openai>=1.0.0; extra == 'openai'
Provides-Extra: pydantic
Requires-Dist: pydantic>=2.0.0; extra == 'pydantic'
Provides-Extra: tiktoken
Requires-Dist: tiktoken>=0.5.0; extra == 'tiktoken'
Description-Content-Type: text/markdown

# genassert

**pytest-native semantic testing for LLM applications.**
No servers. No SaaS. No config. Works with OpenAI, Anthropic, LiteLLM, and any LLM client.

[PyPI](https://pypi.org/project/genassert/)
[Python 3.9+](https://www.python.org/downloads/)
[License: MIT](https://opensource.org/licenses/MIT)
[pytest](https://docs.pytest.org/)

---

## Why genassert?

Traditional `assert response == expected` breaks the moment your LLM changes a word.
`genassert` gives you **semantic assertions** — tests that check *meaning*, not strings.

| Problem | Traditional testing | genassert |
|---------|---------------------|-----------|
| LLM changes wording | Test breaks | Test passes (same meaning) |
| Response drifts over time | No detection | Baseline regression alert |
| Wrong tone shipped | No check | `assert_tone(response, "professional")` |
| Hallucination in response | No check | `assert_no_hallucination(response, facts)` |
| Response too long | Manual count | `assert_token_budget(response, 200)` |
| Schema mismatch | Try/except JSON | `assert_schema(response, MyPydanticModel)` |

---

## Install

```bash
# Minimal install (uses hash-based fallback embedder)
pip install genassert

# Recommended: local embeddings — no API cost, runs in CI for free
pip install "genassert[local]"

# OpenAI embeddings backend
pip install "genassert[openai]"

# Everything
pip install "genassert[all]"
```

---

## Quick Start

```python
# test_my_llm.py
import pytest
from genassert import (
    assert_intent,
    assert_tone,
    assert_no_hallucination,
    assert_token_budget,
    assert_schema,
)

@pytest.mark.llm
def test_summarizer():
    response = my_summarize_function("Long article about climate change...")

    # Check the response is actually a summary
    assert_intent(response, "a concise summary of the article")

    # Check it's neutral — no opinion
    assert_tone(response, "neutral")

    # Check it doesn't hallucinate key facts
    assert_no_hallucination(response, known_facts=[
        "The article is about climate change",
        "CO2 levels are rising",
    ])

    # Check it's not too long
    assert_token_budget(response, max_tokens=250)
```

Run it:
```bash
pytest test_my_llm.py -v
```

That's it. No config files. No API keys needed (with `[local]` install).

---

## All Assertions

### `assert_intent(response, expected_intent, threshold=0.72)`

Checks that the response semantically addresses the expected intent.

```python
assert_intent(response, "a polite refusal to the user's request")
assert_intent(response, "Python code that reads a CSV file", threshold=0.80)
assert_intent(response, "step-by-step instructions for setting up Docker")
```

### `assert_tone(response, expected_tone, threshold=0.65)`

Checks the tone/style of the response.

**Built-in tones:** `professional`, `casual`, `friendly`, `formal`, `neutral`, `empathetic`, `assertive`, `humorous`, `concise`

```python
assert_tone(response, "professional")
assert_tone(response, "friendly and concise")   # custom description
assert_tone(response, "formal but empathetic")  # combine tones
```

### `assert_no_hallucination(response, known_facts)`

Checks that the response does NOT contradict known facts.

```python
assert_no_hallucination(response, known_facts=[
    "The product costs $49 per month",
    "The free trial lasts 14 days",
    "Python was created by Guido van Rossum",
])
```

### `assert_token_budget(response, max_tokens, tokenizer="approx")`

Checks the response doesn't exceed a token budget.

```python
assert_token_budget(response, max_tokens=200)                        # fast approx
assert_token_budget(response, max_tokens=200, tokenizer="tiktoken")  # exact (pip install tiktoken)
assert_token_budget(response, max_tokens=800, tokenizer="chars")     # character-based
```

### `assert_schema(response, schema)`

Checks that the response (JSON string) matches a Pydantic model or JSON schema.

```python
from pydantic import BaseModel

class Summary(BaseModel):
    title: str
    body: str
    word_count: int

result = assert_schema(response, Summary)
print(result.title)  # validated Pydantic instance
```

```python
# Or a raw JSON Schema dict
schema = {
    "type": "object",
    "properties": {"title": {"type": "string"}},
    "required": ["title"],
}
assert_schema(response, schema)
```

### `assert_similar_to(response, reference, threshold=0.80)`

Checks that the response is semantically close to a reference string.
Useful for golden-baseline regression.

```python
score = assert_similar_to(response, golden_response, threshold=0.85)
print(f"Similarity: {score:.3f}")
```

---

## Golden Baseline Regression Testing

Record a known-good response once, then detect regression on every CI run.

```python
from genassert import record_baseline, compare_baseline

# Step 1: record (run once, commit the .genassert_baselines/ directory)
record_baseline("summarizer_v1", response)

# Step 2: compare on every subsequent run
def test_summarizer_no_regression():
    response = my_summarize("article...")
    compare_baseline("summarizer_v1", response, threshold=0.85)
```

Or use the pytest fixture for `--record-baselines` flag integration:

```python
def test_summarizer_baseline(llm_record):
    response = my_summarize("article...")
    if llm_record:
        record_baseline("summarizer", response, overwrite=True)
    else:
        compare_baseline("summarizer", response)
```

```bash
# First run — record
pytest --record-baselines

# Every subsequent run — compare
pytest
```

---

## Local Judge (Zero API Cost)

Use `LocalJudge` for complex, nuanced evaluations that go beyond embedding similarity:

```python
from genassert import LocalJudge

judge = LocalJudge()  # uses a tiny local model (auto-downloaded)

result = judge.evaluate(
    response="Paris is the capital of France.",
    criterion="The response correctly answers a geography question.",
)

assert result.passed
print(f"Score: {result.score:.2f}")
print(f"Reasoning: {result.reasoning}")
```

Install the local judge backend:
```bash
pip install "genassert[judge]"  # installs transformers + torch
```
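
The judge model and backend can also be pinned explicitly. A sketch based on the `LocalJudge` constructor in `genassert/judge.py` (the model id shown is simply the package default; `GENASSERT_JUDGE_MODEL` sets the same default via the environment):

```python
from genassert import LocalJudge

judge = LocalJudge(
    model="Qwen/Qwen2.5-0.5B-Instruct",  # any local Hugging Face model id
    threshold=0.8,                        # default pass cutoff for evaluate()
    backend="transformers",               # or "llama_cpp" / "embeddings" / "auto"
)
result = judge.evaluate(
    response="The trial lasts 14 days.",
    criterion="States the free-trial length.",
)
print(result.method, result.score)
```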

---

## pytest CLI Options

```bash
# Skip all LLM tests (useful in fast unit-test runs)
pytest --skip-llm

# Override similarity threshold globally
pytest --llm-threshold=0.75

# Record golden baselines
pytest --record-baselines
```

---

## Configuration

All settings via environment variables — no config files needed:

| Variable | Default | Description |
|----------|---------|-------------|
| `GENASSERT_EMBED_BACKEND` | `auto` | `local`, `openai`, `fallback` |
| `GENASSERT_EMBED_MODEL` | `all-MiniLM-L6-v2` | Embedding model name |
| `GENASSERT_JUDGE_MODEL` | `Qwen/Qwen2.5-0.5B-Instruct` | Local judge model |
| `GENASSERT_BASELINE_DIR` | `.genassert_baselines` | Baseline storage directory |
| `OPENAI_API_KEY` | — | Required for `openai` backend |
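
For example, a shell that pins every setting explicitly before running the suite might look like this (the values shown are just the defaults from the table above):

```bash
export GENASSERT_EMBED_BACKEND=local
export GENASSERT_EMBED_MODEL=all-MiniLM-L6-v2
export GENASSERT_JUDGE_MODEL=Qwen/Qwen2.5-0.5B-Instruct
export GENASSERT_BASELINE_DIR=.genassert_baselines
pytest -m llm -v
```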

---

## Embedding Backends

| Backend | Speed | Cost | Accuracy | Install |
|---------|-------|------|----------|---------|
| `local` (sentence-transformers) | Fast | Free | High | `pip install "genassert[local]"` |
| `openai` | Moderate | ~$0.0001/test | Very high | `pip install "genassert[openai]"` |
| `fallback` (hash-based) | Instant | Free | Smoke test only | Built-in |

Set backend:
```bash
export GENASSERT_EMBED_BACKEND=local     # recommended for CI
export GENASSERT_EMBED_BACKEND=openai    # highest accuracy
export GENASSERT_EMBED_BACKEND=fallback  # no deps, structural tests only
```

---

## Framework Compatibility

genassert is **framework-agnostic**. Use it with any LLM client:

```python
# OpenAI
import openai
client = openai.OpenAI()
response = client.chat.completions.create(...).choices[0].message.content

# Anthropic
import anthropic
client = anthropic.Anthropic()
response = client.messages.create(...).content[0].text

# LiteLLM
import litellm
response = litellm.completion(...).choices[0].message.content

# LangChain
from langchain_openai import ChatOpenAI
response = ChatOpenAI().invoke("...").content

# Any string output — genassert only needs the final response string
assert_intent(response, "your expected intent here")
```

---

## Real-World Example: Testing a RAG Chatbot

```python
import pytest
from genassert import assert_intent, assert_no_hallucination, assert_token_budget, assert_schema

PRODUCT_FACTS = [
    "The product is called DataFlow Pro",
    "The price is $99 per month",
    "There is a 30-day free trial",
    "It supports Python, JavaScript, and Go",
]

@pytest.mark.llm
class TestChatbot:
    def test_pricing_question(self, chatbot):
        response = chatbot.ask("How much does it cost?")
        assert_intent(response, "information about pricing or cost")
        assert_no_hallucination(response, PRODUCT_FACTS)
        assert_token_budget(response, max_tokens=150)

    def test_technical_question(self, chatbot):
        response = chatbot.ask("What languages are supported?")
        assert_intent(response, "list of supported programming languages")
        assert_no_hallucination(response, PRODUCT_FACTS)

    def test_structured_response(self, chatbot):
        from pydantic import BaseModel
        class PricingInfo(BaseModel):
            price: str
            trial_days: int

        response = chatbot.ask_structured("Return pricing as JSON")
        assert_schema(response, PricingInfo)
```

---

## CI Integration

```yaml
# .github/workflows/llm-tests.yml
name: LLM Tests

on: [push, pull_request]

jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"

      - name: Install genassert
        run: pip install "genassert[local]" pytest

      - name: Run LLM tests
        run: pytest tests/ -m llm -v
        env:
          GENASSERT_EMBED_BACKEND: local  # free, no API key needed
```

---

## License

MIT © genassert contributors

---

## Related Projects

- [pytest](https://docs.pytest.org/) — the test framework genassert is built on
- [sentence-transformers](https://www.sbert.net/) — local embedding models
- [Pydantic](https://docs.pydantic.dev/) — data validation
- [LiteLLM](https://github.com/BerriAI/litellm) — unified LLM client

---

*genassert is the missing pytest plugin for the LLM era.*
*Stop shipping broken AI features. Start testing them.*