checkllm 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checkllm-0.1.0/.github/workflows/ci.yml +49 -0
- checkllm-0.1.0/.github/workflows/publish.yml +30 -0
- checkllm-0.1.0/.gitignore +9 -0
- checkllm-0.1.0/CHANGELOG.md +21 -0
- checkllm-0.1.0/LICENSE +21 -0
- checkllm-0.1.0/PKG-INFO +404 -0
- checkllm-0.1.0/README.md +367 -0
- checkllm-0.1.0/examples/README.md +29 -0
- checkllm-0.1.0/examples/qa_dataset.yaml +9 -0
- checkllm-0.1.0/examples/test_basic.py +65 -0
- checkllm-0.1.0/examples/test_custom_metrics.py +72 -0
- checkllm-0.1.0/examples/test_dataset_driven.py +67 -0
- checkllm-0.1.0/examples/test_llm_judge.py +54 -0
- checkllm-0.1.0/examples/test_regression_workflow.py +63 -0
- checkllm-0.1.0/pyproject.toml +71 -0
- checkllm-0.1.0/src/checkllm/__init__.py +23 -0
- checkllm-0.1.0/src/checkllm/check.py +264 -0
- checkllm-0.1.0/src/checkllm/cli.py +476 -0
- checkllm-0.1.0/src/checkllm/config.py +62 -0
- checkllm-0.1.0/src/checkllm/datasets/__init__.py +4 -0
- checkllm-0.1.0/src/checkllm/datasets/case.py +21 -0
- checkllm-0.1.0/src/checkllm/datasets/loader.py +33 -0
- checkllm-0.1.0/src/checkllm/deterministic.py +139 -0
- checkllm-0.1.0/src/checkllm/judge.py +244 -0
- checkllm-0.1.0/src/checkllm/metrics/__init__.py +52 -0
- checkllm-0.1.0/src/checkllm/metrics/hallucination.py +44 -0
- checkllm-0.1.0/src/checkllm/metrics/relevance.py +44 -0
- checkllm-0.1.0/src/checkllm/metrics/rubric.py +45 -0
- checkllm-0.1.0/src/checkllm/metrics/toxicity.py +45 -0
- checkllm-0.1.0/src/checkllm/models.py +41 -0
- checkllm-0.1.0/src/checkllm/py.typed +0 -0
- checkllm-0.1.0/src/checkllm/pytest_plugin.py +238 -0
- checkllm-0.1.0/src/checkllm/regression/__init__.py +14 -0
- checkllm-0.1.0/src/checkllm/regression/compare.py +68 -0
- checkllm-0.1.0/src/checkllm/regression/snapshot.py +58 -0
- checkllm-0.1.0/src/checkllm/regression/stats.py +97 -0
- checkllm-0.1.0/src/checkllm/reporting/__init__.py +10 -0
- checkllm-0.1.0/src/checkllm/reporting/html.py +33 -0
- checkllm-0.1.0/src/checkllm/reporting/junit.py +63 -0
- checkllm-0.1.0/src/checkllm/reporting/terminal.py +105 -0
- checkllm-0.1.0/src/checkllm/runner.py +20 -0
- checkllm-0.1.0/src/checkllm/templates/report.html.j2 +54 -0
- checkllm-0.1.0/tests/__init__.py +0 -0
- checkllm-0.1.0/tests/conftest.py +1 -0
- checkllm-0.1.0/tests/fixtures/sample_dataset.yaml +7 -0
- checkllm-0.1.0/tests/fixtures/sample_snapshot.json +20 -0
- checkllm-0.1.0/tests/test_check.py +122 -0
- checkllm-0.1.0/tests/test_cli.py +55 -0
- checkllm-0.1.0/tests/test_cli_integration.py +182 -0
- checkllm-0.1.0/tests/test_config.py +64 -0
- checkllm-0.1.0/tests/test_datasets.py +72 -0
- checkllm-0.1.0/tests/test_deterministic.py +126 -0
- checkllm-0.1.0/tests/test_e2e.py +132 -0
- checkllm-0.1.0/tests/test_judge.py +106 -0
- checkllm-0.1.0/tests/test_metrics/__init__.py +0 -0
- checkllm-0.1.0/tests/test_metrics/test_hallucination.py +61 -0
- checkllm-0.1.0/tests/test_metrics/test_relevance.py +49 -0
- checkllm-0.1.0/tests/test_metrics/test_rubric.py +67 -0
- checkllm-0.1.0/tests/test_metrics/test_toxicity.py +42 -0
- checkllm-0.1.0/tests/test_models.py +93 -0
- checkllm-0.1.0/tests/test_new_features.py +172 -0
- checkllm-0.1.0/tests/test_plugin_system.py +81 -0
- checkllm-0.1.0/tests/test_pytest_plugin.py +79 -0
- checkllm-0.1.0/tests/test_regression/__init__.py +0 -0
- checkllm-0.1.0/tests/test_regression/test_compare.py +68 -0
- checkllm-0.1.0/tests/test_regression/test_snapshot.py +74 -0
- checkllm-0.1.0/tests/test_regression/test_stats.py +79 -0
- checkllm-0.1.0/tests/test_reporting/__init__.py +0 -0
- checkllm-0.1.0/tests/test_reporting/test_html.py +55 -0
- checkllm-0.1.0/tests/test_reporting/test_junit.py +73 -0
- checkllm-0.1.0/tests/test_reporting/test_terminal.py +68 -0
- checkllm-0.1.0/tests/test_session_collection.py +102 -0
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
runs-on: ${{ matrix.os }}
|
|
12
|
+
strategy:
|
|
13
|
+
matrix:
|
|
14
|
+
os: [ubuntu-latest, windows-latest, macos-latest]
|
|
15
|
+
python-version: ["3.10", "3.11", "3.12", "3.13"]
|
|
16
|
+
fail-fast: false
|
|
17
|
+
|
|
18
|
+
steps:
|
|
19
|
+
- uses: actions/checkout@v4
|
|
20
|
+
|
|
21
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
22
|
+
uses: actions/setup-python@v5
|
|
23
|
+
with:
|
|
24
|
+
python-version: ${{ matrix.python-version }}
|
|
25
|
+
|
|
26
|
+
- name: Install dependencies
|
|
27
|
+
run: pip install -e ".[dev]"
|
|
28
|
+
|
|
29
|
+
- name: Run tests (deterministic only, no API key needed)
|
|
30
|
+
run: pytest tests/ -v -m "not llm" --tb=short
|
|
31
|
+
|
|
32
|
+
- name: Check CLI works
|
|
33
|
+
run: checkllm --version
|
|
34
|
+
|
|
35
|
+
lint:
|
|
36
|
+
runs-on: ubuntu-latest
|
|
37
|
+
steps:
|
|
38
|
+
- uses: actions/checkout@v4
|
|
39
|
+
|
|
40
|
+
- name: Set up Python
|
|
41
|
+
uses: actions/setup-python@v5
|
|
42
|
+
with:
|
|
43
|
+
python-version: "3.12"
|
|
44
|
+
|
|
45
|
+
- name: Install dependencies
|
|
46
|
+
run: pip install -e ".[dev]"
|
|
47
|
+
|
|
48
|
+
- name: Syntax check (py_compile)
|
|
49
|
+
run: python -m py_compile src/checkllm/__init__.py
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
|
|
7
|
+
permissions:
|
|
8
|
+
contents: read
|
|
9
|
+
id-token: write
|
|
10
|
+
|
|
11
|
+
jobs:
|
|
12
|
+
publish:
|
|
13
|
+
runs-on: ubuntu-latest
|
|
14
|
+
environment: pypi
|
|
15
|
+
steps:
|
|
16
|
+
- uses: actions/checkout@v4
|
|
17
|
+
|
|
18
|
+
- name: Set up Python
|
|
19
|
+
uses: actions/setup-python@v5
|
|
20
|
+
with:
|
|
21
|
+
python-version: "3.12"
|
|
22
|
+
|
|
23
|
+
- name: Install build tools
|
|
24
|
+
run: pip install build
|
|
25
|
+
|
|
26
|
+
- name: Build package
|
|
27
|
+
run: python -m build
|
|
28
|
+
|
|
29
|
+
- name: Publish to PyPI
|
|
30
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
## 0.1.0 (2026-03-28)
|
|
4
|
+
|
|
5
|
+
Initial release.
|
|
6
|
+
|
|
7
|
+
### Features
|
|
8
|
+
|
|
9
|
+
- **pytest plugin** with `check` fixture for LLM testing in pytest
|
|
10
|
+
- **Deterministic checks**: `contains`, `not_contains`, `exact_match`, `starts_with`, `ends_with`, `regex`, `json_schema`, `max_tokens`, `latency`, `cost`
|
|
11
|
+
- **LLM-as-judge metrics**: `hallucination`, `relevance`, `toxicity`, `rubric`
|
|
12
|
+
- **Custom metrics** via `@metric` decorator and plugin entry points
|
|
13
|
+
- **Dataset system**: YAML loading, generator functions, `@dataset` decorator for parametrized tests
|
|
14
|
+
- **Regression detection**: Welch's t-test with configurable p-value threshold
|
|
15
|
+
- **Snapshot system**: save/load/compare test result baselines
|
|
16
|
+
- **Reporting**: Rich terminal output, HTML reports, JUnit XML
|
|
17
|
+
- **CLI**: `checkllm run`, `snapshot`, `report`, `eval`, `diff`, `init`, `list-metrics`
|
|
18
|
+
- **Multiple judge backends**: OpenAI and Anthropic
|
|
19
|
+
- **Retry logic** with exponential backoff for transient API failures
|
|
20
|
+
- **Cost tracking** from OpenAI/Anthropic token usage
|
|
21
|
+
- **Configuration** via `pyproject.toml [tool.checkllm]` and environment variables
|
checkllm-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 checkllm contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
checkllm-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,404 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: checkllm
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Test LLM-powered applications with the same rigor as traditional software.
|
|
5
|
+
Author: checkllm contributors
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Framework :: Pytest
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
17
|
+
Classifier: Topic :: Software Development :: Testing
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
|
+
Requires-Dist: jinja2>=3.1.0
|
|
20
|
+
Requires-Dist: openai>=1.0.0
|
|
21
|
+
Requires-Dist: pydantic>=2.0.0
|
|
22
|
+
Requires-Dist: pyyaml>=6.0.0
|
|
23
|
+
Requires-Dist: rich>=13.0.0
|
|
24
|
+
Requires-Dist: scipy>=1.11.0
|
|
25
|
+
Requires-Dist: tenacity>=8.0.0
|
|
26
|
+
Requires-Dist: tiktoken>=0.5.0
|
|
27
|
+
Requires-Dist: typer>=0.9.0
|
|
28
|
+
Provides-Extra: all
|
|
29
|
+
Requires-Dist: anthropic>=0.20.0; extra == 'all'
|
|
30
|
+
Provides-Extra: anthropic
|
|
31
|
+
Requires-Dist: anthropic>=0.20.0; extra == 'anthropic'
|
|
32
|
+
Provides-Extra: dev
|
|
33
|
+
Requires-Dist: coverage>=7.0.0; extra == 'dev'
|
|
34
|
+
Requires-Dist: pytest-asyncio>=0.21.0; extra == 'dev'
|
|
35
|
+
Requires-Dist: pytest>=7.0.0; extra == 'dev'
|
|
36
|
+
Description-Content-Type: text/markdown
|
|
37
|
+
|
|
38
|
+
# checkllm
|
|
39
|
+
|
|
40
|
+
Test LLM-powered applications with the same rigor as traditional software.
|
|
41
|
+
|
|
42
|
+
checkllm is a pytest plugin and CLI that lets you write assertions for LLM outputs using deterministic checks, LLM-as-judge evaluation, and statistical regression detection.
|
|
43
|
+
|
|
44
|
+
## Why checkllm?
|
|
45
|
+
|
|
46
|
+
- **Works with pytest** - no new test runner to learn, just add a `check` fixture
|
|
47
|
+
- **Free deterministic checks** run instantly with zero API calls
|
|
48
|
+
- **LLM-as-judge** for subjective quality (hallucination, relevance, toxicity, custom rubrics)
|
|
49
|
+
- **Statistical regression detection** using Welch's t-test, not just "did it change?"
|
|
50
|
+
- **Multiple judge backends** - OpenAI and Anthropic, or bring your own
|
|
51
|
+
- **One command** to snapshot, report, or diff your test results
|
|
52
|
+
|
|
53
|
+
## Installation
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
pip install checkllm
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
For Anthropic Claude support:
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
pip install checkllm[anthropic]
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## Quick Start
|
|
66
|
+
|
|
67
|
+
### 1. Write a test
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
# tests/test_my_agent.py
|
|
71
|
+
|
|
72
|
+
def test_output_quality(check):
|
|
73
|
+
output = my_agent("What is Python?")
|
|
74
|
+
|
|
75
|
+
# Deterministic checks (free, instant)
|
|
76
|
+
check.contains(output, "programming language")
|
|
77
|
+
check.not_contains(output, "JavaScript")
|
|
78
|
+
check.max_tokens(output, limit=200)
|
|
79
|
+
|
|
80
|
+
# LLM-as-judge checks (requires OPENAI_API_KEY)
|
|
81
|
+
check.hallucination(output, context="Python is a high-level programming language.")
|
|
82
|
+
check.relevance(output, query="What is Python?")
|
|
83
|
+
check.toxicity(output)
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
### 2. Run it
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
export OPENAI_API_KEY=sk-...
|
|
90
|
+
|
|
91
|
+
pytest tests/test_my_agent.py -v
|
|
92
|
+
|
|
93
|
+
# Or use the CLI
|
|
94
|
+
checkllm run tests/test_my_agent.py
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### 3. Track regressions
|
|
98
|
+
|
|
99
|
+
```bash
|
|
100
|
+
checkllm snapshot tests/ --output .checkllm/snapshots/baseline.json
|
|
101
|
+
|
|
102
|
+
# After changes, compare
|
|
103
|
+
checkllm snapshot tests/ --output .checkllm/snapshots/current.json
|
|
104
|
+
checkllm diff --baseline .checkllm/snapshots/baseline.json \
|
|
105
|
+
--current .checkllm/snapshots/current.json
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
## Deterministic Checks
|
|
109
|
+
|
|
110
|
+
Zero-cost, zero-latency checks that run locally:
|
|
111
|
+
|
|
112
|
+
```python
|
|
113
|
+
def test_deterministic(check):
|
|
114
|
+
output = my_agent("...")
|
|
115
|
+
|
|
116
|
+
check.contains(output, "expected substring")
|
|
117
|
+
check.not_contains(output, "forbidden text")
|
|
118
|
+
check.exact_match(output, "exact expected output")
|
|
119
|
+
check.exact_match(output, "EXPECTED", ignore_case=True)
|
|
120
|
+
check.starts_with(output, "Python")
|
|
121
|
+
check.ends_with(output, "language.")
|
|
122
|
+
check.regex(output, pattern=r"\d{3}-\d{4}")
|
|
123
|
+
check.max_tokens(output, limit=500)
|
|
124
|
+
check.latency(response_time_ms, max_ms=2000)
|
|
125
|
+
check.cost(api_cost_usd, max_usd=0.05)
|
|
126
|
+
|
|
127
|
+
# Validate JSON structure
|
|
128
|
+
from pydantic import BaseModel
|
|
129
|
+
|
|
130
|
+
class Response(BaseModel):
|
|
131
|
+
answer: str
|
|
132
|
+
confidence: float
|
|
133
|
+
|
|
134
|
+
check.json_schema(output, schema=Response)
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
## LLM-as-Judge Metrics
|
|
138
|
+
|
|
139
|
+
Use GPT-4o (or Claude) as an automated judge:
|
|
140
|
+
|
|
141
|
+
```python
|
|
142
|
+
def test_llm_quality(check):
|
|
143
|
+
output = my_agent("Summarize this article about climate change.")
|
|
144
|
+
article = "..."
|
|
145
|
+
|
|
146
|
+
check.hallucination(output, context=article)
|
|
147
|
+
check.relevance(output, query="Summarize the article")
|
|
148
|
+
check.toxicity(output)
|
|
149
|
+
check.rubric(output, criteria="concise, under 3 sentences, mentions key findings")
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
Each check records a score (0.0-1.0), pass/fail status, reasoning, cost, and latency.
|
|
153
|
+
|
|
154
|
+
### Custom Thresholds
|
|
155
|
+
|
|
156
|
+
```python
|
|
157
|
+
check.hallucination(output, context=ctx, threshold=0.9) # stricter
|
|
158
|
+
check.relevance(output, query=q, threshold=0.6) # more lenient
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
### Multiple Runs
|
|
162
|
+
|
|
163
|
+
```python
|
|
164
|
+
check.hallucination(output, context=ctx, runs=5)
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
Or set it globally in `pyproject.toml`:
|
|
168
|
+
|
|
169
|
+
```toml
|
|
170
|
+
[tool.checkllm]
|
|
171
|
+
runs_per_test = 3
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
## Dataset-Driven Testing
|
|
175
|
+
|
|
176
|
+
```yaml
|
|
177
|
+
# tests/fixtures/cases.yaml
|
|
178
|
+
- input: "What is Python?"
|
|
179
|
+
expected: "Python is a programming language"
|
|
180
|
+
query: "Explain Python"
|
|
181
|
+
context: "Python was created by Guido van Rossum in 1991."
|
|
182
|
+
criteria: "accurate, mentions creator"
|
|
183
|
+
|
|
184
|
+
- input: "What is 2+2?"
|
|
185
|
+
expected: "4"
|
|
186
|
+
criteria: "correct, concise"
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
```python
|
|
190
|
+
from checkllm import dataset
|
|
191
|
+
|
|
192
|
+
@dataset("tests/fixtures/cases.yaml")
|
|
193
|
+
def test_across_cases(check, case):
|
|
194
|
+
output = my_agent(case.input)
|
|
195
|
+
check.contains(output, case.expected)
|
|
196
|
+
if case.context:
|
|
197
|
+
check.hallucination(output, context=case.context)
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
Or use a Python generator:
|
|
201
|
+
|
|
202
|
+
```python
|
|
203
|
+
from checkllm import Case, dataset
|
|
204
|
+
|
|
205
|
+
def my_cases():
|
|
206
|
+
yield Case(input="Hello", expected="greeting", criteria="friendly")
|
|
207
|
+
yield Case(input="Goodbye", expected="farewell", criteria="polite")
|
|
208
|
+
|
|
209
|
+
@dataset(my_cases)
|
|
210
|
+
def test_generated(check, case):
|
|
211
|
+
output = my_agent(case.input)
|
|
212
|
+
check.rubric(output, criteria=case.criteria)
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
## Custom Metrics
|
|
216
|
+
|
|
217
|
+
```python
|
|
218
|
+
import checkllm
|
|
219
|
+
from checkllm import CheckResult
|
|
220
|
+
|
|
221
|
+
@checkllm.metric("brevity")
|
|
222
|
+
def brevity_check(output: str, max_words: int = 50, **kwargs) -> CheckResult:
|
|
223
|
+
word_count = len(output.split())
|
|
224
|
+
return CheckResult(
|
|
225
|
+
passed=word_count <= max_words,
|
|
226
|
+
score=min(1.0, max_words / max(word_count, 1)),
|
|
227
|
+
reasoning=f"{word_count} words (limit: {max_words})",
|
|
228
|
+
cost=0.0,
|
|
229
|
+
latency_ms=0,
|
|
230
|
+
metric_name="brevity",
|
|
231
|
+
)
|
|
232
|
+
|
|
233
|
+
def test_brevity(check):
|
|
234
|
+
output = my_agent("Explain quantum physics")
|
|
235
|
+
check.run_metric("brevity", output=output, max_words=100)
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
## Async Tests
|
|
239
|
+
|
|
240
|
+
```python
|
|
241
|
+
import pytest
|
|
242
|
+
|
|
243
|
+
@pytest.mark.asyncio
|
|
244
|
+
async def test_async_quality(check):
|
|
245
|
+
output = await my_async_agent("What is Python?")
|
|
246
|
+
|
|
247
|
+
await check.ahallucination(output, context="...")
|
|
248
|
+
await check.arelevance(output, query="What is Python?")
|
|
249
|
+
await check.atoxicity(output)
|
|
250
|
+
await check.arubric(output, criteria="concise and accurate")
|
|
251
|
+
|
|
252
|
+
# Deterministic checks are always sync (instant, no I/O)
|
|
253
|
+
check.contains(output, "Python")
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
## Separating Fast and Slow Tests
|
|
257
|
+
|
|
258
|
+
Mark LLM tests so you can skip them in fast CI runs:
|
|
259
|
+
|
|
260
|
+
```python
|
|
261
|
+
import pytest
|
|
262
|
+
|
|
263
|
+
@pytest.mark.llm
|
|
264
|
+
def test_with_llm(check):
|
|
265
|
+
check.hallucination(output, context=ctx)
|
|
266
|
+
|
|
267
|
+
def test_fast(check):
|
|
268
|
+
check.contains(output, "Python")
|
|
269
|
+
```
|
|
270
|
+
|
|
271
|
+
```bash
|
|
272
|
+
# Run only fast deterministic tests
|
|
273
|
+
pytest -m "not llm"
|
|
274
|
+
|
|
275
|
+
# Run only LLM tests
|
|
276
|
+
pytest -m llm
|
|
277
|
+
```
|
|
278
|
+
|
|
279
|
+
If `OPENAI_API_KEY` is not set, LLM checks automatically skip instead of crashing.
|
|
280
|
+
|
|
281
|
+
## Regression Detection
|
|
282
|
+
|
|
283
|
+
checkllm uses Welch's t-test to detect statistically significant score regressions.
|
|
284
|
+
|
|
285
|
+
```bash
|
|
286
|
+
checkllm snapshot tests/ --output .checkllm/snapshots/v1.json
|
|
287
|
+
# ... make changes ...
|
|
288
|
+
checkllm snapshot tests/ --output .checkllm/snapshots/v2.json
|
|
289
|
+
checkllm diff -b .checkllm/snapshots/v1.json -c .checkllm/snapshots/v2.json
|
|
290
|
+
|
|
291
|
+
# Fail CI on regression
|
|
292
|
+
checkllm diff -b v1.json -c v2.json --fail-on-regression
|
|
293
|
+
```
|
|
294
|
+
|
|
295
|
+
## Reporting
|
|
296
|
+
|
|
297
|
+
```bash
|
|
298
|
+
# HTML report
|
|
299
|
+
checkllm report tests/ --output report.html
|
|
300
|
+
|
|
301
|
+
# JUnit XML for CI/CD
|
|
302
|
+
checkllm run tests/ --junit-xml results.xml
|
|
303
|
+
|
|
304
|
+
# pytest flags work directly
|
|
305
|
+
pytest tests/ --checkllm-snapshot=snap.json --checkllm-report=report.html
|
|
306
|
+
```
|
|
307
|
+
|
|
308
|
+
## CLI Reference
|
|
309
|
+
|
|
310
|
+
| Command | Description |
|
|
311
|
+
|---------|-------------|
|
|
312
|
+
| `checkllm run <path>` | Run tests with `--snapshot`, `--html-report`, `--junit-xml`, `--compare`, `--fail-on-regression` |
|
|
313
|
+
| `checkllm snapshot <path>` | Save test results as baseline (`--output PATH`) |
|
|
314
|
+
| `checkllm report <path>` | Generate HTML report (`--output PATH`, `--junit-xml PATH`) |
|
|
315
|
+
| `checkllm diff` | Compare snapshots (`--baseline`, `--current`, `--fail-on-regression`) |
|
|
316
|
+
| `checkllm eval` | Evaluate prompt template (`--prompt`, `--dataset`, `--metric`, `--threshold`) |
|
|
317
|
+
| `checkllm init [path]` | Scaffold a new project |
|
|
318
|
+
| `checkllm list-metrics` | List available metrics |
|
|
319
|
+
| `checkllm --version` | Show version |
|
|
320
|
+
|
|
321
|
+
## Configuration
|
|
322
|
+
|
|
323
|
+
```toml
|
|
324
|
+
[tool.checkllm]
|
|
325
|
+
judge_backend = "openai" # "openai" or "anthropic"
|
|
326
|
+
judge_model = "gpt-4o" # Model for LLM-as-judge
|
|
327
|
+
default_threshold = 0.8 # Pass/fail threshold (0.0-1.0)
|
|
328
|
+
runs_per_test = 1 # Repeat LLM checks N times
|
|
329
|
+
snapshot_dir = ".checkllm/snapshots"
|
|
330
|
+
confidence_level = 0.95
|
|
331
|
+
p_value_threshold = 0.05
|
|
332
|
+
```
|
|
333
|
+
|
|
334
|
+
Environment variable overrides: `CHECKLLM_JUDGE_BACKEND`, `CHECKLLM_JUDGE_MODEL`, `CHECKLLM_DEFAULT_THRESHOLD`, `CHECKLLM_RUNS_PER_TEST`.
|
|
335
|
+
|
|
336
|
+
## Custom Judge Backends
|
|
337
|
+
|
|
338
|
+
### Anthropic Claude
|
|
339
|
+
|
|
340
|
+
```toml
|
|
341
|
+
[tool.checkllm]
|
|
342
|
+
judge_backend = "anthropic"
|
|
343
|
+
judge_model = "claude-sonnet-4-6"
|
|
344
|
+
```
|
|
345
|
+
|
|
346
|
+
### Your Own Backend
|
|
347
|
+
|
|
348
|
+
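Implement the `JudgeBackend` protocol (an async `evaluate` method that returns a `JudgeResponse`) and pass an instance to `CheckCollector`: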
|
|
349
|
+
|
|
350
|
+
```python
|
|
351
|
+
from checkllm import JudgeBackend, JudgeResponse
|
|
352
|
+
from checkllm.check import CheckCollector
|
|
353
|
+
from checkllm.config import CheckllmConfig
|
|
354
|
+
|
|
355
|
+
class MyJudge:
|
|
356
|
+
async def evaluate(self, prompt: str, system_prompt: str | None = None) -> JudgeResponse:
|
|
357
|
+
return JudgeResponse(score=0.9, reasoning="Looks good", cost=0.0)
|
|
358
|
+
|
|
359
|
+
config = CheckllmConfig()
|
|
360
|
+
collector = CheckCollector(config=config, judge=MyJudge())
|
|
361
|
+
```
|
|
362
|
+
|
|
363
|
+
## Configuring the Judge in conftest.py
|
|
364
|
+
|
|
365
|
+
To use a cheaper model or a custom backend for all tests, override the `check` fixture:
|
|
366
|
+
|
|
367
|
+
```python
|
|
368
|
+
# tests/conftest.py
|
|
369
|
+
import pytest
|
|
370
|
+
from checkllm.check import CheckCollector
|
|
371
|
+
from checkllm.config import load_config
|
|
372
|
+
from checkllm.judge import OpenAIJudge
|
|
373
|
+
from checkllm.pytest_plugin import _CHECKLLM_KEY
|
|
374
|
+
|
|
375
|
+
@pytest.fixture
|
|
376
|
+
def check(request):
|
|
377
|
+
config = load_config()
|
|
378
|
+
judge = OpenAIJudge(model="gpt-4o-mini") # cheaper model for dev
|
|
379
|
+
collector = CheckCollector(config=config, judge=judge)
|
|
380
|
+
request.node.stash[_CHECKLLM_KEY] = collector
|
|
381
|
+
return collector
|
|
382
|
+
```
|
|
383
|
+
|
|
384
|
+
## Project Setup
|
|
385
|
+
|
|
386
|
+
```bash
|
|
387
|
+
checkllm init
|
|
388
|
+
```
|
|
389
|
+
|
|
390
|
+
Creates `pyproject.toml`, `tests/conftest.py`, a sample test file, a sample dataset, and the `.checkllm/snapshots/` directory.
|
|
391
|
+
|
|
392
|
+
## Examples
|
|
393
|
+
|
|
394
|
+
See the [examples/](examples/) directory for working code:
|
|
395
|
+
|
|
396
|
+
- [test_basic.py](examples/test_basic.py) - Deterministic checks (no API key needed)
|
|
397
|
+
- [test_dataset_driven.py](examples/test_dataset_driven.py) - YAML and generator datasets
|
|
398
|
+
- [test_custom_metrics.py](examples/test_custom_metrics.py) - Register domain-specific metrics
|
|
399
|
+
- [test_llm_judge.py](examples/test_llm_judge.py) - LLM-as-judge evaluation
|
|
400
|
+
- [test_regression_workflow.py](examples/test_regression_workflow.py) - Snapshot and regression detection
|
|
401
|
+
|
|
402
|
+
## License
|
|
403
|
+
|
|
404
|
+
MIT
|