genassert-0.2.0-py3-none-any.whl

genassert/judge.py ADDED
@@ -0,0 +1,185 @@
+ """
+ LocalJudge: A lightweight local LLM judge for semantic evaluation.
+
+ Uses a small local model (via transformers or llama-cpp-python) so that
+ CI runs have zero API cost. Falls back to embedding-based scoring if
+ no local model is available.
+ """
+
+ from __future__ import annotations
+ import os
+ from dataclasses import dataclass
+
+
+ @dataclass
+ class JudgeResult:
+     passed: bool
+     score: float  # 0.0–1.0
+     reasoning: str
+     method: str  # "local_model" | "embeddings" | "fallback"
+
+
+ class LocalJudge:
+     """
+     A reusable judge that evaluates LLM outputs against criteria.
+
+     Parameters
+     ----------
+     model:
+         Local model identifier. Defaults to GENASSERT_JUDGE_MODEL env var
+         or "Qwen/Qwen2.5-0.5B-Instruct" (tiny, fast, free).
+     threshold:
+         Default pass threshold (0–1). Default 0.7.
+     backend:
+         "transformers" | "llama_cpp" | "embeddings" (auto-detected).
+
+     Examples
+     --------
+     >>> judge = LocalJudge()
+     >>> result = judge.evaluate(
+     ...     response="Paris is the capital of France.",
+     ...     criterion="The response correctly answers a geography question.",
+     ... )
+     >>> assert result.passed
+     """
+
+     def __init__(
+         self,
+         model: str | None = None,
+         threshold: float = 0.7,
+         backend: str = "auto",
+     ) -> None:
+         self.model = model or os.environ.get(
+             "GENASSERT_JUDGE_MODEL", "Qwen/Qwen2.5-0.5B-Instruct"
+         )
+         self.threshold = threshold
+         self.backend = backend
+         self._pipeline = None
+
+     def evaluate(
+         self,
+         response: str,
+         criterion: str,
+         threshold: float | None = None,
+     ) -> JudgeResult:
+         """
+         Evaluate whether `response` meets the `criterion`.
+
+         Parameters
+         ----------
+         response:
+             The LLM output to evaluate.
+         criterion:
+             A plain-English description of what constitutes a passing response.
+         threshold:
+             Override the instance default threshold.
+
+         Returns
+         -------
+         JudgeResult
+             Contains passed (bool), score (float), reasoning (str), method (str).
+         """
+         cutoff = threshold if threshold is not None else self.threshold
+
+         if self.backend in ("transformers", "auto"):
+             try:
+                 return self._evaluate_transformers(response, criterion, cutoff)
+             except ImportError:
+                 pass
+
+         if self.backend in ("llama_cpp", "auto"):
+             try:
+                 return self._evaluate_llama_cpp(response, criterion, cutoff)
+             except ImportError:
+                 pass
+
+         # Embedding-based fallback
+         return self._evaluate_embeddings(response, criterion, cutoff)
+
+     def _evaluate_transformers(
+         self, response: str, criterion: str, threshold: float
+     ) -> JudgeResult:
+         from transformers import pipeline as hf_pipeline
+
+         if self._pipeline is None:
+             self._pipeline = hf_pipeline(
+                 "text-generation",
+                 model=self.model,
+                 max_new_tokens=64,
+                 do_sample=False,
+             )
+
+         prompt = _build_judge_prompt(response, criterion)
+         output = self._pipeline(prompt)[0]["generated_text"]
+         score, reasoning = _parse_judge_output(output, prompt)
+         return JudgeResult(
+             passed=score >= threshold,
+             score=score,
+             reasoning=reasoning,
+             method="local_model",
+         )
+
+     def _evaluate_llama_cpp(
+         self, response: str, criterion: str, threshold: float
+     ) -> JudgeResult:
+         from llama_cpp import Llama
+
+         if self._pipeline is None:
+             self._pipeline = Llama.from_pretrained(repo_id=self.model)
+
+         prompt = _build_judge_prompt(response, criterion)
+         output = self._pipeline(prompt, max_tokens=64)["choices"][0]["text"]
+         score, reasoning = _parse_judge_output(output, "")
+         return JudgeResult(
+             passed=score >= threshold,
+             score=score,
+             reasoning=reasoning,
+             method="local_model",
+         )
+
+     def _evaluate_embeddings(
+         self, response: str, criterion: str, threshold: float
+     ) -> JudgeResult:
+         from genassert._embed import embed_text
+         from genassert.assertions.intent import _cosine_similarity
+
+         score = _cosine_similarity(embed_text(response), embed_text(criterion))
+         return JudgeResult(
+             passed=score >= threshold,
+             score=score,
+             reasoning=f"Embedding cosine similarity: {score:.3f}",
+             method="embeddings",
+         )
+
+
+ def _build_judge_prompt(response: str, criterion: str) -> str:
+     return (
+         f"You are an objective evaluator. Score the following response.\n\n"
+         f"CRITERION: {criterion}\n\n"
+         f"RESPONSE: {response}\n\n"
+         f"Rate on a scale of 0.0 to 1.0 and give one-sentence reasoning.\n"
+         f"Format: SCORE: <float> | REASON: <text>\n"
+         f"SCORE:"
+     )
+
+
+ def _parse_judge_output(output: str, prompt: str) -> tuple[float, str]:
+     """Parse 'SCORE: 0.8 | REASON: ...' from judge output."""
+     # Remove prompt prefix if present
+     text = output.replace(prompt, "").strip()
+     if "SCORE:" in text:
+         text = text.split("SCORE:")[-1].strip()
+
+     score = 0.5
+     reasoning = text
+
+     try:
+         parts = text.split("|", 1)
+         score_str = parts[0].strip()
+         score = max(0.0, min(1.0, float(score_str)))
+         if len(parts) > 1:
+             reasoning = parts[1].replace("REASON:", "").strip()
+     except (ValueError, IndexError):
+         pass
+
+     return score, reasoning
genassert/plugin.py ADDED
@@ -0,0 +1,74 @@
+ """
+ pytest plugin for genassert.
+
+ Auto-registered via entry_points in pyproject.toml.
+ Provides fixtures and marks for LLM tests.
+ """
+
+ from __future__ import annotations
+ import pytest
+
+
+ def pytest_configure(config):
+     config.addinivalue_line(
+         "markers",
+         "llm: mark test as an LLM semantic test (may call embedding APIs)",
+     )
+     config.addinivalue_line(
+         "markers",
+         "llm_slow: mark test as slow — uses local judge model inference",
+     )
+
+
+ def pytest_addoption(parser):
+     group = parser.getgroup("genassert")
+     group.addoption(
+         "--record-baselines",
+         action="store_true",
+         default=False,
+         help="Record (or overwrite) golden baselines for all LLM tests.",
+     )
+     group.addoption(
+         "--llm-threshold",
+         type=float,
+         default=None,
+         help="Override default similarity threshold for all assertions.",
+     )
+     group.addoption(
+         "--skip-llm",
+         action="store_true",
+         default=False,
+         help="Skip all tests marked with @pytest.mark.llm.",
+     )
+
+
+ def pytest_collection_modifyitems(config, items):
+     if config.getoption("--skip-llm"):
+         skip_llm = pytest.mark.skip(reason="--skip-llm flag set")
+         for item in items:
+             if "llm" in item.keywords:
+                 item.add_marker(skip_llm)
+
+
+ @pytest.fixture
+ def llm_record(request):
+     """
+     Fixture: returns True if --record-baselines flag is set.
+
+     Usage:
+         def test_something(llm_record):
+             response = my_llm_call()
+             if llm_record:
+                 record_baseline("my_test", response)
+             else:
+                 compare_baseline("my_test", response)
+     """
+     return request.config.getoption("--record-baselines", default=False)
+
+
+ @pytest.fixture
+ def llm_threshold(request):
+     """
+     Fixture: returns the global threshold override (or None).
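+
+     Usage (an illustrative sketch; `my_llm_call` stands in for your code):
+
+         def test_summary_intent(llm_threshold):
+             response = my_llm_call()
+             threshold = llm_threshold if llm_threshold is not None else 0.72
+             assert_intent(response, "a concise summary", threshold=threshold)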
+     """
+     return request.config.getoption("--llm-threshold", default=None)
genassert-0.2.0.dist-info/METADATA ADDED
@@ -0,0 +1,452 @@
+ Metadata-Version: 2.4
+ Name: genassert
+ Version: 0.2.0
+ Summary: pytest-native semantic testing for LLM and generative AI applications. No servers. No SaaS. Works with OpenAI, Anthropic, LiteLLM and any LLM client.
+ Project-URL: Homepage, https://github.com/genassert/genassert
+ Project-URL: Documentation, https://genassert.readthedocs.io
+ Project-URL: Repository, https://github.com/genassert/genassert
+ Project-URL: Bug Tracker, https://github.com/genassert/genassert/issues
+ Project-URL: Changelog, https://github.com/genassert/genassert/blob/main/CHANGELOG.md
+ Author: genassert contributors
+ License: MIT
+ License-File: LICENSE
+ Keywords: agent testing,ai quality assurance,ai testing,anthropic,claude testing,gen ai,genai testing,generative ai,generative ai testing,golden baseline,gpt testing,hallucination detection,langchain,llm,llm assertions,llm evaluation,llm quality,llm testing,machine learning testing,openai,prompt testing,pytest,pytest plugin,rag testing,regression testing,semantic assertions,semantic testing,testing
+ Classifier: Development Status :: 4 - Beta
+ Classifier: Framework :: Pytest
+ Classifier: Intended Audience :: Developers
+ Classifier: Intended Audience :: Science/Research
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
+ Classifier: Topic :: Software Development :: Testing
+ Classifier: Typing :: Typed
+ Requires-Python: >=3.9
+ Provides-Extra: all
+ Requires-Dist: jsonschema>=4.0.0; extra == 'all'
+ Requires-Dist: openai>=1.0.0; extra == 'all'
+ Requires-Dist: pydantic>=2.0.0; extra == 'all'
+ Requires-Dist: sentence-transformers>=2.7.0; extra == 'all'
+ Requires-Dist: tiktoken>=0.5.0; extra == 'all'
+ Provides-Extra: dev
+ Requires-Dist: mypy; extra == 'dev'
+ Requires-Dist: openai>=1.0.0; extra == 'dev'
+ Requires-Dist: pydantic>=2.0.0; extra == 'dev'
+ Requires-Dist: pytest-cov; extra == 'dev'
+ Requires-Dist: pytest>=7.0.0; extra == 'dev'
+ Requires-Dist: ruff; extra == 'dev'
+ Requires-Dist: sentence-transformers>=2.7.0; extra == 'dev'
+ Provides-Extra: jsonschema
+ Requires-Dist: jsonschema>=4.0.0; extra == 'jsonschema'
+ Provides-Extra: judge
+ Requires-Dist: torch>=2.0.0; extra == 'judge'
+ Requires-Dist: transformers>=4.40.0; extra == 'judge'
+ Provides-Extra: local
+ Requires-Dist: sentence-transformers>=2.7.0; extra == 'local'
+ Provides-Extra: openai
+ Requires-Dist: openai>=1.0.0; extra == 'openai'
+ Provides-Extra: pydantic
+ Requires-Dist: pydantic>=2.0.0; extra == 'pydantic'
+ Provides-Extra: tiktoken
+ Requires-Dist: tiktoken>=0.5.0; extra == 'tiktoken'
+ Description-Content-Type: text/markdown
+
+ # genassert
+
+ **pytest-native semantic testing for LLM applications.**
+ No servers. No SaaS. No config. Works with OpenAI, Anthropic, LiteLLM, and any LLM client.
+
+ [![PyPI version](https://badge.fury.io/py/genassert.svg)](https://pypi.org/project/genassert/)
+ [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
+ [![pytest](https://img.shields.io/badge/framework-pytest-orange)](https://docs.pytest.org/)
+
+ ---
+
+ ## Why genassert?
+
+ Traditional `assert response == expected` breaks the moment your LLM changes a word.
+ `genassert` gives you **semantic assertions** — tests that check *meaning*, not strings.
+
+ | Problem | Traditional testing | genassert |
+ |---------|-------------------|-----------|
+ | LLM changes wording | Test breaks | Test passes (same meaning) |
+ | Response drifts over time | No detection | Baseline regression alert |
+ | Wrong tone shipped | No check | `assert_tone(response, "professional")` |
+ | Hallucination in response | No check | `assert_no_hallucination(response, facts)` |
+ | Response too long | Manual count | `assert_token_budget(response, 200)` |
+ | Schema mismatch | Try/except JSON | `assert_schema(response, MyPydanticModel)` |
+
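+ A minimal sketch of the difference (the response string and the exact-match
+ target here are hypothetical):
+
+ ```python
+ response = "France's capital city is Paris."
+
+ # Brittle: an exact string match fails on any rewording
+ # assert response == "Paris is the capital of France."
+
+ # Semantic: passes for any phrasing that carries the same meaning
+ from genassert import assert_intent
+ assert_intent(response, "states that Paris is the capital of France")
+ ```
+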
+ ---
+
+ ## Install
+
+ ```bash
+ # Minimal install (uses hash-based fallback embedder)
+ pip install genassert
+
+ # Recommended: local embeddings — no API cost, runs in CI for free
+ pip install "genassert[local]"
+
+ # OpenAI embeddings backend
+ pip install "genassert[openai]"
+
+ # Everything
+ pip install "genassert[all]"
+ ```
+
+ ---
+
+ ## Quick Start
+
+ ```python
+ # test_my_llm.py
+ import pytest
+ from genassert import (
+     assert_intent,
+     assert_tone,
+     assert_no_hallucination,
+     assert_token_budget,
+     assert_schema,
+ )
+
+ @pytest.mark.llm
+ def test_summarizer():
+     response = my_summarize_function("Long article about climate change...")
+
+     # Check the response is actually a summary
+     assert_intent(response, "a concise summary of the article")
+
+     # Check it's neutral — no opinion
+     assert_tone(response, "neutral")
+
+     # Check it doesn't hallucinate key facts
+     assert_no_hallucination(response, known_facts=[
+         "The article is about climate change",
+         "CO2 levels are rising",
+     ])
+
+     # Check it's not too long
+     assert_token_budget(response, max_tokens=250)
+ ```
+
+ Run it:
+ ```bash
+ pytest test_my_llm.py -v
+ ```
+
+ That's it. No config files. No API keys needed (with `[local]` install).
+
+ ---
+
+ ## All Assertions
+
+ ### `assert_intent(response, expected_intent, threshold=0.72)`
+
+ Checks that the response semantically addresses the expected intent.
+
+ ```python
+ assert_intent(response, "a polite refusal to the user's request")
+ assert_intent(response, "Python code that reads a CSV file", threshold=0.80)
+ assert_intent(response, "step-by-step instructions for setting up Docker")
+ ```
+
+ ### `assert_tone(response, expected_tone, threshold=0.65)`
+
+ Checks the tone/style of the response.
+
+ **Built-in tones:** `professional`, `casual`, `friendly`, `formal`, `neutral`, `empathetic`, `assertive`, `humorous`, `concise`
+
+ ```python
+ assert_tone(response, "professional")
+ assert_tone(response, "friendly and concise")  # custom description
+ assert_tone(response, "formal but empathetic")  # combine tones
+ ```
+
+ ### `assert_no_hallucination(response, known_facts)`
+
+ Checks that the response does NOT contradict known facts.
+
+ ```python
+ assert_no_hallucination(response, known_facts=[
+     "The product costs $49 per month",
+     "The free trial lasts 14 days",
+     "Python was created by Guido van Rossum",
+ ])
+ ```
+
+ ### `assert_token_budget(response, max_tokens, tokenizer="approx")`
+
+ Checks the response doesn't exceed a token budget.
+
+ ```python
+ assert_token_budget(response, max_tokens=200)  # fast approx
+ assert_token_budget(response, max_tokens=200, tokenizer="tiktoken")  # exact (pip install tiktoken)
+ assert_token_budget(response, max_tokens=800, tokenizer="chars")  # character-based
+ ```
+
+ ### `assert_schema(response, schema)`
+
+ Checks that the response (JSON string) matches a Pydantic model or JSON schema.
+
+ ```python
+ from pydantic import BaseModel
+
+ class Summary(BaseModel):
+     title: str
+     body: str
+     word_count: int
+
+ result = assert_schema(response, Summary)
+ print(result.title)  # validated Pydantic instance
+ ```
+
+ ```python
+ # Or a raw JSON Schema dict
+ schema = {
+     "type": "object",
+     "properties": {"title": {"type": "string"}},
+     "required": ["title"],
+ }
+ assert_schema(response, schema)
+ ```
+
+ ### `assert_similar_to(response, reference, threshold=0.80)`
+
+ Checks that the response is semantically close to a reference string.
+ Useful for golden-baseline regression.
+
+ ```python
+ score = assert_similar_to(response, golden_response, threshold=0.85)
+ print(f"Similarity: {score:.3f}")
+ ```
+
+ ---
+
+ ## Golden Baseline Regression Testing
+
+ Record a known-good response once, then detect regression on every CI run.
+
+ ```python
+ from genassert import record_baseline, compare_baseline
+
+ # Step 1: record (run once, commit the .genassert_baselines/ directory)
+ record_baseline("summarizer_v1", response)
+
+ # Step 2: compare on every subsequent run
+ def test_summarizer_no_regression():
+     response = my_summarize("article...")
+     compare_baseline("summarizer_v1", response, threshold=0.85)
+ ```
+
+ Or use the pytest fixture for `--record-baselines` flag integration:
+
+ ```python
+ def test_summarizer_baseline(llm_record):
+     response = my_summarize("article...")
+     if llm_record:
+         record_baseline("summarizer", response, overwrite=True)
+     else:
+         compare_baseline("summarizer", response)
+ ```
+
+ ```bash
+ # First run — record
+ pytest --record-baselines
+
+ # Every subsequent run — compare
+ pytest
+ ```
+
+ ---
+
+ ## Local Judge (Zero API Cost)
+
+ Use `LocalJudge` for complex, nuanced evaluations that go beyond embedding similarity:
+
+ ```python
+ from genassert import LocalJudge
+
+ judge = LocalJudge()  # uses a tiny local model (auto-downloaded)
+
+ result = judge.evaluate(
+     response="Paris is the capital of France.",
+     criterion="The response correctly answers a geography question.",
+ )
+
+ assert result.passed
+ print(f"Score: {result.score:.2f}")
+ print(f"Reasoning: {result.reasoning}")
+ ```
+
+ Install the local judge backend:
+ ```bash
+ pip install "genassert[judge]"  # installs transformers + torch
+ ```
+
+ ---
+
+ ## pytest CLI Options
+
+ ```bash
+ # Skip all LLM tests (useful in fast unit-test runs)
+ pytest --skip-llm
+
+ # Override similarity threshold globally
+ pytest --llm-threshold=0.75
+
+ # Record golden baselines
+ pytest --record-baselines
+ ```
+
+ ---
+
+ ## Configuration
+
+ All settings via environment variables — no config files needed:
+
+ | Variable | Default | Description |
+ |----------|---------|-------------|
+ | `GENASSERT_EMBED_BACKEND` | `auto` | `local`, `openai`, `fallback` |
+ | `GENASSERT_EMBED_MODEL` | `all-MiniLM-L6-v2` | Embedding model name |
+ | `GENASSERT_JUDGE_MODEL` | `Qwen/Qwen2.5-0.5B-Instruct` | Local judge model |
+ | `GENASSERT_BASELINE_DIR` | `.genassert_baselines` | Baseline storage directory |
+ | `OPENAI_API_KEY` | — | Required for `openai` backend |
+
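+ For example, a zero-API-cost CI run might export (values here are illustrative;
+ they mirror the documented defaults):
+
+ ```bash
+ export GENASSERT_EMBED_BACKEND=local
+ export GENASSERT_JUDGE_MODEL=Qwen/Qwen2.5-0.5B-Instruct
+ export GENASSERT_BASELINE_DIR=.genassert_baselines
+ pytest -m llm
+ ```
+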
+ ---
+
+ ## Embedding Backends
+
+ | Backend | Speed | Cost | Accuracy | Install |
+ |---------|-------|------|----------|---------|
+ | `local` (sentence-transformers) | Fast | Free | High | `pip install "genassert[local]"` |
+ | `openai` | Moderate | ~$0.0001/test | Very high | `pip install "genassert[openai]"` |
+ | `fallback` (hash-based) | Instant | Free | Smoke test only | Built-in |
+
+ Set backend:
+ ```bash
+ export GENASSERT_EMBED_BACKEND=local     # recommended for CI
+ export GENASSERT_EMBED_BACKEND=openai    # highest accuracy
+ export GENASSERT_EMBED_BACKEND=fallback  # no deps, structural tests only
+ ```
+
+ ---
+
+ ## Framework Compatibility
+
+ genassert is **framework-agnostic**. Use it with any LLM client:
+
+ ```python
+ # OpenAI
+ import openai
+ client = openai.OpenAI()
+ response = client.chat.completions.create(...).choices[0].message.content
+
+ # Anthropic
+ import anthropic
+ client = anthropic.Anthropic()
+ response = client.messages.create(...).content[0].text
+
+ # LiteLLM
+ import litellm
+ response = litellm.completion(...).choices[0].message.content
+
+ # LangChain
+ from langchain_openai import ChatOpenAI
+ response = ChatOpenAI().invoke("...").content
+
+ # Any string output — genassert only needs the final response string
+ assert_intent(response, "your expected intent here")
+ ```
+
+ ---
+
+ ## Real-World Example: Testing a RAG Chatbot
+
+ ```python
+ import pytest
+ from genassert import assert_intent, assert_no_hallucination, assert_token_budget, assert_schema
+
+ PRODUCT_FACTS = [
+     "The product is called DataFlow Pro",
+     "The price is $99 per month",
+     "There is a 30-day free trial",
+     "It supports Python, JavaScript, and Go",
+ ]
+
+ @pytest.mark.llm
+ class TestChatbot:
+     def test_pricing_question(self, chatbot):
+         response = chatbot.ask("How much does it cost?")
+         assert_intent(response, "information about pricing or cost")
+         assert_no_hallucination(response, PRODUCT_FACTS)
+         assert_token_budget(response, max_tokens=150)
+
+     def test_technical_question(self, chatbot):
+         response = chatbot.ask("What languages are supported?")
+         assert_intent(response, "list of supported programming languages")
+         assert_no_hallucination(response, PRODUCT_FACTS)
+
+     def test_structured_response(self, chatbot):
+         from pydantic import BaseModel
+         class PricingInfo(BaseModel):
+             price: str
+             trial_days: int
+
+         response = chatbot.ask_structured("Return pricing as JSON")
+         assert_schema(response, PricingInfo)
+ ```
+
+ ---
+
+ ## CI Integration
+
+ ```yaml
+ # .github/workflows/llm-tests.yml
+ name: LLM Tests
+
+ on: [push, pull_request]
+
+ jobs:
+   test:
+     runs-on: ubuntu-latest
+     steps:
+       - uses: actions/checkout@v4
+       - uses: actions/setup-python@v5
+         with:
+           python-version: "3.11"
+
+       - name: Install genassert
+         run: pip install "genassert[local]" pytest
+
+       - name: Run LLM tests
+         run: pytest tests/ -m llm -v
+         env:
+           GENASSERT_EMBED_BACKEND: local  # free, no API key needed
+ ```
+
+ ---
+
+ ## License
+
+ MIT © genassert contributors
+
+ ---
+
+ ## Related Projects
+
+ - [pytest](https://docs.pytest.org/) — the test framework genassert is built on
+ - [sentence-transformers](https://www.sbert.net/) — local embedding models
+ - [Pydantic](https://docs.pydantic.dev/) — data validation
+ - [LiteLLM](https://github.com/BerriAI/litellm) — unified LLM client
+
+ ---
+
+ *genassert is the missing pytest plugin for the LLM era.*
+ *Stop shipping broken AI features. Start testing them.*