genassert 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- genassert-0.2.0/CHANGELOG.md +32 -0
- genassert-0.2.0/LICENSE +21 -0
- genassert-0.2.0/PKG-INFO +452 -0
- genassert-0.2.0/README.md +393 -0
- genassert-0.2.0/genassert/__init__.py +43 -0
- genassert-0.2.0/genassert/_embed.py +113 -0
- genassert-0.2.0/genassert/assertions/__init__.py +1 -0
- genassert-0.2.0/genassert/assertions/budget.py +62 -0
- genassert-0.2.0/genassert/assertions/hallucination.py +89 -0
- genassert-0.2.0/genassert/assertions/intent.py +62 -0
- genassert-0.2.0/genassert/assertions/language.py +145 -0
- genassert-0.2.0/genassert/assertions/pii.py +107 -0
- genassert-0.2.0/genassert/assertions/readability.py +135 -0
- genassert-0.2.0/genassert/assertions/schema.py +100 -0
- genassert-0.2.0/genassert/assertions/sentiment.py +160 -0
- genassert-0.2.0/genassert/assertions/similarity.py +57 -0
- genassert-0.2.0/genassert/assertions/tone.py +93 -0
- genassert-0.2.0/genassert/baseline.py +142 -0
- genassert-0.2.0/genassert/judge.py +185 -0
- genassert-0.2.0/genassert/plugin.py +74 -0
- genassert-0.2.0/pyproject.toml +177 -0
- genassert-0.2.0/tests/__init__.py +0 -0
- genassert-0.2.0/tests/test_assertions.py +391 -0
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## [0.2.0] — 2026-04-06
|
|
4
|
+
|
|
5
|
+
### Added
|
|
6
|
+
- `assert_language` — detect and assert response language (20+ languages, ISO 639-1 codes, zero deps)
|
|
7
|
+
- `assert_no_pii` — detect PII leakage: emails, phones, SSNs, credit cards, IPs, IBANs, passports
|
|
8
|
+
- `assert_readability` — Flesch Reading Ease and Flesch-Kincaid Grade Level assertions
|
|
9
|
+
- `assert_sentiment` — lexicon-based positive/negative/neutral sentiment detection with negation handling
|
|
10
|
+
- `PIIMatch` dataclass — structured PII match result with type, value, and position
|
|
11
|
+
- Package renamed from `assertllm` to `genassert` for broader generative AI coverage
|
|
12
|
+
|
|
13
|
+
### Changed
|
|
14
|
+
- Version bumped to 0.2.0
|
|
15
|
+
- All env vars renamed: `ASSERTLLM_*` → `GENASSERT_*`
|
|
16
|
+
- Baseline directory default: `.assertllm_baselines` → `.genassert_baselines`
|
|
17
|
+
|
|
18
|
+
## [0.1.0] — 2026-04-06
|
|
19
|
+
|
|
20
|
+
### Added
|
|
21
|
+
- `assert_intent` — semantic intent matching via cosine similarity
|
|
22
|
+
- `assert_tone` — tone/style detection (professional, casual, friendly, etc.)
|
|
23
|
+
- `assert_no_hallucination` — contradiction detection against known facts
|
|
24
|
+
- `assert_token_budget` — token count budget assertion (approx/tiktoken/chars)
|
|
25
|
+
- `assert_schema` — Pydantic model and JSON Schema validation
|
|
26
|
+
- `assert_similar_to` — golden baseline similarity comparison
|
|
27
|
+
- `record_baseline` / `compare_baseline` — baseline regression testing
|
|
28
|
+
- `LocalJudge` — zero-cost local LLM judge (transformers / llama-cpp / embeddings)
|
|
29
|
+
- pytest plugin with `--record-baselines`, `--llm-threshold`, `--skip-llm` flags
|
|
30
|
+
- `llm_record` and `llm_threshold` pytest fixtures
|
|
31
|
+
- Three embedding backends: sentence-transformers, OpenAI, hash fallback
|
|
32
|
+
- Full test suite with offline tests (no external deps required)
|
genassert-0.2.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 genassert contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
genassert-0.2.0/PKG-INFO
ADDED
|
@@ -0,0 +1,452 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: genassert
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: pytest-native semantic testing for LLM and generative AI applications. No servers. No SaaS. Works with OpenAI, Anthropic, LiteLLM and any LLM client.
|
|
5
|
+
Project-URL: Homepage, https://github.com/genassert/genassert
|
|
6
|
+
Project-URL: Documentation, https://genassert.readthedocs.io
|
|
7
|
+
Project-URL: Repository, https://github.com/genassert/genassert
|
|
8
|
+
Project-URL: Bug Tracker, https://github.com/genassert/genassert/issues
|
|
9
|
+
Project-URL: Changelog, https://github.com/genassert/genassert/blob/main/CHANGELOG.md
|
|
10
|
+
Author: genassert contributors
|
|
11
|
+
License: MIT
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Keywords: agent testing,ai quality assurance,ai testing,anthropic,claude testing,gen ai,genai testing,generative ai,generative ai testing,golden baseline,gpt testing,hallucination detection,langchain,llm,llm assertions,llm evaluation,llm quality,llm testing,machine learning testing,openai,prompt testing,pytest,pytest plugin,rag testing,regression testing,semantic assertions,semantic testing,testing
|
|
14
|
+
Classifier: Development Status :: 4 - Beta
|
|
15
|
+
Classifier: Framework :: Pytest
|
|
16
|
+
Classifier: Intended Audience :: Developers
|
|
17
|
+
Classifier: Intended Audience :: Science/Research
|
|
18
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
19
|
+
Classifier: Operating System :: OS Independent
|
|
20
|
+
Classifier: Programming Language :: Python :: 3
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
24
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
25
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
26
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
27
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
28
|
+
Classifier: Topic :: Software Development :: Testing
|
|
29
|
+
Classifier: Typing :: Typed
|
|
30
|
+
Requires-Python: >=3.9
|
|
31
|
+
Provides-Extra: all
|
|
32
|
+
Requires-Dist: jsonschema>=4.0.0; extra == 'all'
|
|
33
|
+
Requires-Dist: openai>=1.0.0; extra == 'all'
|
|
34
|
+
Requires-Dist: pydantic>=2.0.0; extra == 'all'
|
|
35
|
+
Requires-Dist: sentence-transformers>=2.7.0; extra == 'all'
|
|
36
|
+
Requires-Dist: tiktoken>=0.5.0; extra == 'all'
|
|
37
|
+
Provides-Extra: dev
|
|
38
|
+
Requires-Dist: mypy; extra == 'dev'
|
|
39
|
+
Requires-Dist: openai>=1.0.0; extra == 'dev'
|
|
40
|
+
Requires-Dist: pydantic>=2.0.0; extra == 'dev'
|
|
41
|
+
Requires-Dist: pytest-cov; extra == 'dev'
|
|
42
|
+
Requires-Dist: pytest>=7.0.0; extra == 'dev'
|
|
43
|
+
Requires-Dist: ruff; extra == 'dev'
|
|
44
|
+
Requires-Dist: sentence-transformers>=2.7.0; extra == 'dev'
|
|
45
|
+
Provides-Extra: jsonschema
|
|
46
|
+
Requires-Dist: jsonschema>=4.0.0; extra == 'jsonschema'
|
|
47
|
+
Provides-Extra: judge
|
|
48
|
+
Requires-Dist: torch>=2.0.0; extra == 'judge'
|
|
49
|
+
Requires-Dist: transformers>=4.40.0; extra == 'judge'
|
|
50
|
+
Provides-Extra: local
|
|
51
|
+
Requires-Dist: sentence-transformers>=2.7.0; extra == 'local'
|
|
52
|
+
Provides-Extra: openai
|
|
53
|
+
Requires-Dist: openai>=1.0.0; extra == 'openai'
|
|
54
|
+
Provides-Extra: pydantic
|
|
55
|
+
Requires-Dist: pydantic>=2.0.0; extra == 'pydantic'
|
|
56
|
+
Provides-Extra: tiktoken
|
|
57
|
+
Requires-Dist: tiktoken>=0.5.0; extra == 'tiktoken'
|
|
58
|
+
Description-Content-Type: text/markdown
|
|
59
|
+
|
|
60
|
+
# genassert
|
|
61
|
+
|
|
62
|
+
**pytest-native semantic testing for LLM applications.**
|
|
63
|
+
No servers. No SaaS. No config. Works with OpenAI, Anthropic, LiteLLM, and any LLM client.
|
|
64
|
+
|
|
65
|
+
[PyPI](https://pypi.org/project/genassert/)
|
|
66
|
+
[Python 3.9+](https://www.python.org/downloads/)
|
|
67
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
68
|
+
[pytest](https://docs.pytest.org/)
|
|
69
|
+
|
|
70
|
+
---
|
|
71
|
+
|
|
72
|
+
## Why genassert?
|
|
73
|
+
|
|
74
|
+
Traditional `assert response == expected` breaks the moment your LLM changes a word.
|
|
75
|
+
`genassert` gives you **semantic assertions** — tests that check *meaning*, not strings.
|
|
76
|
+
|
|
77
|
+
| Problem | Traditional testing | genassert |
|
|
78
|
+
|---------|-------------------|-----------|
|
|
79
|
+
| LLM changes wording | Test breaks | Test passes (same meaning) |
|
|
80
|
+
| Response drifts over time | No detection | Baseline regression alert |
|
|
81
|
+
| Wrong tone shipped | No check | `assert_tone(response, "professional")` |
|
|
82
|
+
| Hallucination in response | No check | `assert_no_hallucination(response, facts)` |
|
|
83
|
+
| Response too long | Manual count | `assert_token_budget(response, 200)` |
|
|
84
|
+
| Schema mismatch | Try/except JSON | `assert_schema(response, MyPydanticModel)` |
|
|
85
|
+
|
|
86
|
+
---
|
|
87
|
+
|
|
88
|
+
## Install
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
# Minimal install (uses hash-based fallback embedder)
|
|
92
|
+
pip install genassert
|
|
93
|
+
|
|
94
|
+
# Recommended: local embeddings — no API cost, runs in CI for free
|
|
95
|
+
pip install "genassert[local]"
|
|
96
|
+
|
|
97
|
+
# OpenAI embeddings backend
|
|
98
|
+
pip install "genassert[openai]"
|
|
99
|
+
|
|
100
|
+
# Everything
|
|
101
|
+
pip install "genassert[all]"
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
---
|
|
105
|
+
|
|
106
|
+
## Quick Start
|
|
107
|
+
|
|
108
|
+
```python
|
|
109
|
+
# test_my_llm.py
|
|
110
|
+
import pytest
|
|
111
|
+
from genassert import (
|
|
112
|
+
assert_intent,
|
|
113
|
+
assert_tone,
|
|
114
|
+
assert_no_hallucination,
|
|
115
|
+
assert_token_budget,
|
|
116
|
+
assert_schema,
|
|
117
|
+
)
|
|
118
|
+
|
|
119
|
+
@pytest.mark.llm
|
|
120
|
+
def test_summarizer():
|
|
121
|
+
response = my_summarize_function("Long article about climate change...")
|
|
122
|
+
|
|
123
|
+
# Check the response is actually a summary
|
|
124
|
+
assert_intent(response, "a concise summary of the article")
|
|
125
|
+
|
|
126
|
+
# Check it's neutral — no opinion
|
|
127
|
+
assert_tone(response, "neutral")
|
|
128
|
+
|
|
129
|
+
# Check it doesn't hallucinate key facts
|
|
130
|
+
assert_no_hallucination(response, known_facts=[
|
|
131
|
+
"The article is about climate change",
|
|
132
|
+
"CO2 levels are rising",
|
|
133
|
+
])
|
|
134
|
+
|
|
135
|
+
# Check it's not too long
|
|
136
|
+
assert_token_budget(response, max_tokens=250)
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
Run it:
|
|
140
|
+
```bash
|
|
141
|
+
pytest test_my_llm.py -v
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
That's it. No config files. No API keys needed (with `[local]` install).
|
|
145
|
+
|
|
146
|
+
---
|
|
147
|
+
|
|
148
|
+
## All Assertions
|
|
149
|
+
|
|
150
|
+
### `assert_intent(response, expected_intent, threshold=0.72)`
|
|
151
|
+
|
|
152
|
+
Checks that the response semantically addresses the expected intent.
|
|
153
|
+
|
|
154
|
+
```python
|
|
155
|
+
assert_intent(response, "a polite refusal to the user's request")
|
|
156
|
+
assert_intent(response, "Python code that reads a CSV file", threshold=0.80)
|
|
157
|
+
assert_intent(response, "step-by-step instructions for setting up Docker")
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
### `assert_tone(response, expected_tone, threshold=0.65)`
|
|
161
|
+
|
|
162
|
+
Checks the tone/style of the response.
|
|
163
|
+
|
|
164
|
+
**Built-in tones:** `professional`, `casual`, `friendly`, `formal`, `neutral`, `empathetic`, `assertive`, `humorous`, `concise`
|
|
165
|
+
|
|
166
|
+
```python
|
|
167
|
+
assert_tone(response, "professional")
|
|
168
|
+
assert_tone(response, "friendly and concise") # custom description
|
|
169
|
+
assert_tone(response, "formal but empathetic") # combine tones
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
### `assert_no_hallucination(response, known_facts)`
|
|
173
|
+
|
|
174
|
+
Checks that the response does NOT contradict known facts.
|
|
175
|
+
|
|
176
|
+
```python
|
|
177
|
+
assert_no_hallucination(response, known_facts=[
|
|
178
|
+
"The product costs $49 per month",
|
|
179
|
+
"The free trial lasts 14 days",
|
|
180
|
+
"Python was created by Guido van Rossum",
|
|
181
|
+
])
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
### `assert_token_budget(response, max_tokens, tokenizer="approx")`
|
|
185
|
+
|
|
186
|
+
Checks the response doesn't exceed a token budget.
|
|
187
|
+
|
|
188
|
+
```python
|
|
189
|
+
assert_token_budget(response, max_tokens=200) # fast approx
|
|
190
|
+
assert_token_budget(response, max_tokens=200, tokenizer="tiktoken") # exact (pip install tiktoken)
|
|
191
|
+
assert_token_budget(response, max_tokens=800, tokenizer="chars") # character-based
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
### `assert_schema(response, schema)`
|
|
195
|
+
|
|
196
|
+
Checks that the response (JSON string) matches a Pydantic model or JSON schema.
|
|
197
|
+
|
|
198
|
+
```python
|
|
199
|
+
from pydantic import BaseModel
|
|
200
|
+
|
|
201
|
+
class Summary(BaseModel):
|
|
202
|
+
title: str
|
|
203
|
+
body: str
|
|
204
|
+
word_count: int
|
|
205
|
+
|
|
206
|
+
result = assert_schema(response, Summary)
|
|
207
|
+
print(result.title) # validated Pydantic instance
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
```python
|
|
211
|
+
# Or a raw JSON Schema dict
|
|
212
|
+
schema = {
|
|
213
|
+
"type": "object",
|
|
214
|
+
"properties": {"title": {"type": "string"}},
|
|
215
|
+
"required": ["title"],
|
|
216
|
+
}
|
|
217
|
+
assert_schema(response, schema)
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
### `assert_similar_to(response, reference, threshold=0.80)`
|
|
221
|
+
|
|
222
|
+
Checks that the response is semantically close to a reference string.
|
|
223
|
+
Useful for golden-baseline regression.
|
|
224
|
+
|
|
225
|
+
```python
|
|
226
|
+
score = assert_similar_to(response, golden_response, threshold=0.85)
|
|
227
|
+
print(f"Similarity: {score:.3f}")
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
---
|
|
231
|
+
|
|
232
|
+
## Golden Baseline Regression Testing
|
|
233
|
+
|
|
234
|
+
Record a known-good response once, then detect regression on every CI run.
|
|
235
|
+
|
|
236
|
+
```python
|
|
237
|
+
from genassert import record_baseline, compare_baseline
|
|
238
|
+
|
|
239
|
+
# Step 1: record (run once, commit the .genassert_baselines/ directory)
|
|
240
|
+
record_baseline("summarizer_v1", response)
|
|
241
|
+
|
|
242
|
+
# Step 2: compare on every subsequent run
|
|
243
|
+
def test_summarizer_no_regression():
|
|
244
|
+
response = my_summarize("article...")
|
|
245
|
+
compare_baseline("summarizer_v1", response, threshold=0.85)
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
Or use the pytest fixture for `--record-baselines` flag integration:
|
|
249
|
+
|
|
250
|
+
```python
|
|
251
|
+
def test_summarizer_baseline(llm_record):
|
|
252
|
+
response = my_summarize("article...")
|
|
253
|
+
if llm_record:
|
|
254
|
+
record_baseline("summarizer", response, overwrite=True)
|
|
255
|
+
else:
|
|
256
|
+
compare_baseline("summarizer", response)
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
```bash
|
|
260
|
+
# First run — record
|
|
261
|
+
pytest --record-baselines
|
|
262
|
+
|
|
263
|
+
# Every subsequent run — compare
|
|
264
|
+
pytest
|
|
265
|
+
```
|
|
266
|
+
|
|
267
|
+
---
|
|
268
|
+
|
|
269
|
+
## Local Judge (Zero API Cost)
|
|
270
|
+
|
|
271
|
+
Use `LocalJudge` for complex, nuanced evaluations that go beyond embedding similarity:
|
|
272
|
+
|
|
273
|
+
```python
|
|
274
|
+
from genassert import LocalJudge
|
|
275
|
+
|
|
276
|
+
judge = LocalJudge() # uses a tiny local model (auto-downloaded)
|
|
277
|
+
|
|
278
|
+
result = judge.evaluate(
|
|
279
|
+
response="Paris is the capital of France.",
|
|
280
|
+
criterion="The response correctly answers a geography question.",
|
|
281
|
+
)
|
|
282
|
+
|
|
283
|
+
assert result.passed
|
|
284
|
+
print(f"Score: {result.score:.2f}")
|
|
285
|
+
print(f"Reasoning: {result.reasoning}")
|
|
286
|
+
```
|
|
287
|
+
|
|
288
|
+
Install the local judge backend:
|
|
289
|
+
```bash
|
|
290
|
+
pip install "genassert[judge]" # installs transformers + torch
|
|
291
|
+
```
|
|
292
|
+
|
|
293
|
+
---
|
|
294
|
+
|
|
295
|
+
## pytest CLI Options
|
|
296
|
+
|
|
297
|
+
```bash
|
|
298
|
+
# Skip all LLM tests (useful in fast unit-test runs)
|
|
299
|
+
pytest --skip-llm
|
|
300
|
+
|
|
301
|
+
# Override similarity threshold globally
|
|
302
|
+
pytest --llm-threshold=0.75
|
|
303
|
+
|
|
304
|
+
# Record golden baselines
|
|
305
|
+
pytest --record-baselines
|
|
306
|
+
```
|
|
307
|
+
|
|
308
|
+
---
|
|
309
|
+
|
|
310
|
+
## Configuration
|
|
311
|
+
|
|
312
|
+
All settings are configured via environment variables — no config files needed:
|
|
313
|
+
|
|
314
|
+
| Variable | Default | Description |
|
|
315
|
+
|----------|---------|-------------|
|
|
316
|
+
| `GENASSERT_EMBED_BACKEND` | `auto` | `local`, `openai`, `fallback` |
|
|
317
|
+
| `GENASSERT_EMBED_MODEL` | `all-MiniLM-L6-v2` | Embedding model name |
|
|
318
|
+
| `GENASSERT_JUDGE_MODEL` | `Qwen/Qwen2.5-0.5B-Instruct` | Local judge model |
|
|
319
|
+
| `GENASSERT_BASELINE_DIR` | `.genassert_baselines` | Baseline storage directory |
|
|
320
|
+
| `OPENAI_API_KEY` | — | Required for `openai` backend |
|
|
321
|
+
|
|
322
|
+
---
|
|
323
|
+
|
|
324
|
+
## Embedding Backends
|
|
325
|
+
|
|
326
|
+
| Backend | Speed | Cost | Accuracy | Install |
|
|
327
|
+
|---------|-------|------|----------|---------|
|
|
328
|
+
| `local` (sentence-transformers) | Fast | Free | High | `pip install "genassert[local]"` |
|
|
329
|
+
| `openai` | Moderate | ~$0.0001/test | Very high | `pip install "genassert[openai]"` |
|
|
330
|
+
| `fallback` (hash-based) | Instant | Free | Smoke test only | Built-in |
|
|
331
|
+
|
|
332
|
+
Set backend:
|
|
333
|
+
```bash
|
|
334
|
+
export GENASSERT_EMBED_BACKEND=local # recommended for CI
|
|
335
|
+
export GENASSERT_EMBED_BACKEND=openai # highest accuracy
|
|
336
|
+
export GENASSERT_EMBED_BACKEND=fallback # no deps, structural tests only
|
|
337
|
+
```
|
|
338
|
+
|
|
339
|
+
---
|
|
340
|
+
|
|
341
|
+
## Framework Compatibility
|
|
342
|
+
|
|
343
|
+
genassert is **framework-agnostic**. Use it with any LLM client:
|
|
344
|
+
|
|
345
|
+
```python
|
|
346
|
+
# OpenAI
|
|
347
|
+
import openai
|
|
348
|
+
client = openai.OpenAI()
|
|
349
|
+
response = client.chat.completions.create(...).choices[0].message.content
|
|
350
|
+
|
|
351
|
+
# Anthropic
|
|
352
|
+
import anthropic
|
|
353
|
+
client = anthropic.Anthropic()
|
|
354
|
+
response = client.messages.create(...).content[0].text
|
|
355
|
+
|
|
356
|
+
# LiteLLM
|
|
357
|
+
import litellm
|
|
358
|
+
response = litellm.completion(...).choices[0].message.content
|
|
359
|
+
|
|
360
|
+
# LangChain
|
|
361
|
+
from langchain_openai import ChatOpenAI
|
|
362
|
+
response = ChatOpenAI().invoke("...").content
|
|
363
|
+
|
|
364
|
+
# Any string output — genassert only needs the final response string
|
|
365
|
+
assert_intent(response, "your expected intent here")
|
|
366
|
+
```
|
|
367
|
+
|
|
368
|
+
---
|
|
369
|
+
|
|
370
|
+
## Real-World Example: Testing a RAG Chatbot
|
|
371
|
+
|
|
372
|
+
```python
|
|
373
|
+
import pytest
|
|
374
|
+
from genassert import assert_intent, assert_no_hallucination, assert_token_budget, assert_schema
|
|
375
|
+
|
|
376
|
+
PRODUCT_FACTS = [
|
|
377
|
+
"The product is called DataFlow Pro",
|
|
378
|
+
"The price is $99 per month",
|
|
379
|
+
"There is a 30-day free trial",
|
|
380
|
+
"It supports Python, JavaScript, and Go",
|
|
381
|
+
]
|
|
382
|
+
|
|
383
|
+
@pytest.mark.llm
|
|
384
|
+
class TestChatbot:
|
|
385
|
+
def test_pricing_question(self, chatbot):
|
|
386
|
+
response = chatbot.ask("How much does it cost?")
|
|
387
|
+
assert_intent(response, "information about pricing or cost")
|
|
388
|
+
assert_no_hallucination(response, PRODUCT_FACTS)
|
|
389
|
+
assert_token_budget(response, max_tokens=150)
|
|
390
|
+
|
|
391
|
+
def test_technical_question(self, chatbot):
|
|
392
|
+
response = chatbot.ask("What languages are supported?")
|
|
393
|
+
assert_intent(response, "list of supported programming languages")
|
|
394
|
+
assert_no_hallucination(response, PRODUCT_FACTS)
|
|
395
|
+
|
|
396
|
+
def test_structured_response(self, chatbot):
|
|
397
|
+
from pydantic import BaseModel
|
|
398
|
+
class PricingInfo(BaseModel):
|
|
399
|
+
price: str
|
|
400
|
+
trial_days: int
|
|
401
|
+
|
|
402
|
+
response = chatbot.ask_structured("Return pricing as JSON")
|
|
403
|
+
assert_schema(response, PricingInfo)
|
|
404
|
+
```
|
|
405
|
+
|
|
406
|
+
---
|
|
407
|
+
|
|
408
|
+
## CI Integration
|
|
409
|
+
|
|
410
|
+
```yaml
|
|
411
|
+
# .github/workflows/llm-tests.yml
|
|
412
|
+
name: LLM Tests
|
|
413
|
+
|
|
414
|
+
on: [push, pull_request]
|
|
415
|
+
|
|
416
|
+
jobs:
|
|
417
|
+
test:
|
|
418
|
+
runs-on: ubuntu-latest
|
|
419
|
+
steps:
|
|
420
|
+
- uses: actions/checkout@v4
|
|
421
|
+
- uses: actions/setup-python@v5
|
|
422
|
+
with:
|
|
423
|
+
python-version: "3.11"
|
|
424
|
+
|
|
425
|
+
- name: Install genassert
|
|
426
|
+
run: pip install "genassert[local]" pytest
|
|
427
|
+
|
|
428
|
+
- name: Run LLM tests
|
|
429
|
+
run: pytest tests/ -m llm -v
|
|
430
|
+
env:
|
|
431
|
+
GENASSERT_EMBED_BACKEND: local # free, no API key needed
|
|
432
|
+
```
|
|
433
|
+
|
|
434
|
+
---
|
|
435
|
+
|
|
436
|
+
## License
|
|
437
|
+
|
|
438
|
+
MIT © genassert contributors
|
|
439
|
+
|
|
440
|
+
---
|
|
441
|
+
|
|
442
|
+
## Related Projects
|
|
443
|
+
|
|
444
|
+
- [pytest](https://docs.pytest.org/) — the test framework genassert is built on
|
|
445
|
+
- [sentence-transformers](https://www.sbert.net/) — local embedding models
|
|
446
|
+
- [Pydantic](https://docs.pydantic.dev/) — data validation
|
|
447
|
+
- [LiteLLM](https://github.com/BerriAI/litellm) — unified LLM client
|
|
448
|
+
|
|
449
|
+
---
|
|
450
|
+
|
|
451
|
+
*genassert is the missing pytest plugin for the LLM era.*
|
|
452
|
+
*Stop shipping broken AI features. Start testing them.*
|