checkllm 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. checkllm-0.1.0/.github/workflows/ci.yml +49 -0
  2. checkllm-0.1.0/.github/workflows/publish.yml +30 -0
  3. checkllm-0.1.0/.gitignore +9 -0
  4. checkllm-0.1.0/CHANGELOG.md +21 -0
  5. checkllm-0.1.0/LICENSE +21 -0
  6. checkllm-0.1.0/PKG-INFO +404 -0
  7. checkllm-0.1.0/README.md +367 -0
  8. checkllm-0.1.0/examples/README.md +29 -0
  9. checkllm-0.1.0/examples/qa_dataset.yaml +9 -0
  10. checkllm-0.1.0/examples/test_basic.py +65 -0
  11. checkllm-0.1.0/examples/test_custom_metrics.py +72 -0
  12. checkllm-0.1.0/examples/test_dataset_driven.py +67 -0
  13. checkllm-0.1.0/examples/test_llm_judge.py +54 -0
  14. checkllm-0.1.0/examples/test_regression_workflow.py +63 -0
  15. checkllm-0.1.0/pyproject.toml +71 -0
  16. checkllm-0.1.0/src/checkllm/__init__.py +23 -0
  17. checkllm-0.1.0/src/checkllm/check.py +264 -0
  18. checkllm-0.1.0/src/checkllm/cli.py +476 -0
  19. checkllm-0.1.0/src/checkllm/config.py +62 -0
  20. checkllm-0.1.0/src/checkllm/datasets/__init__.py +4 -0
  21. checkllm-0.1.0/src/checkllm/datasets/case.py +21 -0
  22. checkllm-0.1.0/src/checkllm/datasets/loader.py +33 -0
  23. checkllm-0.1.0/src/checkllm/deterministic.py +139 -0
  24. checkllm-0.1.0/src/checkllm/judge.py +244 -0
  25. checkllm-0.1.0/src/checkllm/metrics/__init__.py +52 -0
  26. checkllm-0.1.0/src/checkllm/metrics/hallucination.py +44 -0
  27. checkllm-0.1.0/src/checkllm/metrics/relevance.py +44 -0
  28. checkllm-0.1.0/src/checkllm/metrics/rubric.py +45 -0
  29. checkllm-0.1.0/src/checkllm/metrics/toxicity.py +45 -0
  30. checkllm-0.1.0/src/checkllm/models.py +41 -0
  31. checkllm-0.1.0/src/checkllm/py.typed +0 -0
  32. checkllm-0.1.0/src/checkllm/pytest_plugin.py +238 -0
  33. checkllm-0.1.0/src/checkllm/regression/__init__.py +14 -0
  34. checkllm-0.1.0/src/checkllm/regression/compare.py +68 -0
  35. checkllm-0.1.0/src/checkllm/regression/snapshot.py +58 -0
  36. checkllm-0.1.0/src/checkllm/regression/stats.py +97 -0
  37. checkllm-0.1.0/src/checkllm/reporting/__init__.py +10 -0
  38. checkllm-0.1.0/src/checkllm/reporting/html.py +33 -0
  39. checkllm-0.1.0/src/checkllm/reporting/junit.py +63 -0
  40. checkllm-0.1.0/src/checkllm/reporting/terminal.py +105 -0
  41. checkllm-0.1.0/src/checkllm/runner.py +20 -0
  42. checkllm-0.1.0/src/checkllm/templates/report.html.j2 +54 -0
  43. checkllm-0.1.0/tests/__init__.py +0 -0
  44. checkllm-0.1.0/tests/conftest.py +1 -0
  45. checkllm-0.1.0/tests/fixtures/sample_dataset.yaml +7 -0
  46. checkllm-0.1.0/tests/fixtures/sample_snapshot.json +20 -0
  47. checkllm-0.1.0/tests/test_check.py +122 -0
  48. checkllm-0.1.0/tests/test_cli.py +55 -0
  49. checkllm-0.1.0/tests/test_cli_integration.py +182 -0
  50. checkllm-0.1.0/tests/test_config.py +64 -0
  51. checkllm-0.1.0/tests/test_datasets.py +72 -0
  52. checkllm-0.1.0/tests/test_deterministic.py +126 -0
  53. checkllm-0.1.0/tests/test_e2e.py +132 -0
  54. checkllm-0.1.0/tests/test_judge.py +106 -0
  55. checkllm-0.1.0/tests/test_metrics/__init__.py +0 -0
  56. checkllm-0.1.0/tests/test_metrics/test_hallucination.py +61 -0
  57. checkllm-0.1.0/tests/test_metrics/test_relevance.py +49 -0
  58. checkllm-0.1.0/tests/test_metrics/test_rubric.py +67 -0
  59. checkllm-0.1.0/tests/test_metrics/test_toxicity.py +42 -0
  60. checkllm-0.1.0/tests/test_models.py +93 -0
  61. checkllm-0.1.0/tests/test_new_features.py +172 -0
  62. checkllm-0.1.0/tests/test_plugin_system.py +81 -0
  63. checkllm-0.1.0/tests/test_pytest_plugin.py +79 -0
  64. checkllm-0.1.0/tests/test_regression/__init__.py +0 -0
  65. checkllm-0.1.0/tests/test_regression/test_compare.py +68 -0
  66. checkllm-0.1.0/tests/test_regression/test_snapshot.py +74 -0
  67. checkllm-0.1.0/tests/test_regression/test_stats.py +79 -0
  68. checkllm-0.1.0/tests/test_reporting/__init__.py +0 -0
  69. checkllm-0.1.0/tests/test_reporting/test_html.py +55 -0
  70. checkllm-0.1.0/tests/test_reporting/test_junit.py +73 -0
  71. checkllm-0.1.0/tests/test_reporting/test_terminal.py +68 -0
  72. checkllm-0.1.0/tests/test_session_collection.py +102 -0
@@ -0,0 +1,49 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ${{ matrix.os }}
12
+ strategy:
13
+ matrix:
14
+ os: [ubuntu-latest, windows-latest, macos-latest]
15
+ python-version: ["3.10", "3.11", "3.12", "3.13"]
16
+ fail-fast: false
17
+
18
+ steps:
19
+ - uses: actions/checkout@v4
20
+
21
+ - name: Set up Python ${{ matrix.python-version }}
22
+ uses: actions/setup-python@v5
23
+ with:
24
+ python-version: ${{ matrix.python-version }}
25
+
26
+ - name: Install dependencies
27
+ run: pip install -e ".[dev]"
28
+
29
+ - name: Run tests (deterministic only, no API key needed)
30
+ run: pytest tests/ -v -m "not llm" --tb=short
31
+
32
+ - name: Check CLI works
33
+ run: checkllm --version
34
+
35
+ lint:
36
+ runs-on: ubuntu-latest
37
+ steps:
38
+ - uses: actions/checkout@v4
39
+
40
+ - name: Set up Python
41
+ uses: actions/setup-python@v5
42
+ with:
43
+ python-version: "3.12"
44
+
45
+ - name: Install dependencies
46
+ run: pip install -e ".[dev]"
47
+
48
+      - name: Syntax check (py_compile)
49
+ run: python -m py_compile src/checkllm/__init__.py
@@ -0,0 +1,30 @@
1
+ name: Publish to PyPI
2
+
3
+ on:
4
+ release:
5
+ types: [published]
6
+
7
+ permissions:
8
+ contents: read
9
+ id-token: write
10
+
11
+ jobs:
12
+ publish:
13
+ runs-on: ubuntu-latest
14
+ environment: pypi
15
+ steps:
16
+ - uses: actions/checkout@v4
17
+
18
+ - name: Set up Python
19
+ uses: actions/setup-python@v5
20
+ with:
21
+ python-version: "3.12"
22
+
23
+ - name: Install build tools
24
+ run: pip install build
25
+
26
+ - name: Build package
27
+ run: python -m build
28
+
29
+ - name: Publish to PyPI
30
+ uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,9 @@
1
+ .worktrees/
2
+ __pycache__/
3
+ *.pyc
4
+ *.egg-info/
5
+ dist/
6
+ build/
7
+ .checkllm/
8
+ *.egg
9
+ .venv/
@@ -0,0 +1,21 @@
1
+ # Changelog
2
+
3
+ ## 0.1.0 (2026-03-28)
4
+
5
+ Initial release.
6
+
7
+ ### Features
8
+
9
+ - **pytest plugin** with `check` fixture for LLM testing in pytest
10
+ - **Deterministic checks**: `contains`, `not_contains`, `regex`, `json_schema`, `max_tokens`, `latency`, `cost`
11
+ - **LLM-as-judge metrics**: `hallucination`, `relevance`, `toxicity`, `rubric`
12
+ - **Custom metrics** via `@metric` decorator and plugin entry points
13
+ - **Dataset system**: YAML loading, generator functions, `@dataset` decorator for parametrized tests
14
+ - **Regression detection**: Welch's t-test with configurable p-value threshold
15
+ - **Snapshot system**: save/load/compare test result baselines
16
+ - **Reporting**: Rich terminal output, HTML reports, JUnit XML
17
+ - **CLI**: `checkllm run`, `snapshot`, `report`, `eval`, `diff`, `init`
18
+ - **Multiple judge backends**: OpenAI and Anthropic
19
+ - **Retry logic** with exponential backoff for transient API failures
20
+ - **Cost tracking** from OpenAI/Anthropic token usage
21
+ - **Configuration** via `pyproject.toml [tool.checkllm]` and environment variables
checkllm-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 checkllm contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,404 @@
1
+ Metadata-Version: 2.4
2
+ Name: checkllm
3
+ Version: 0.1.0
4
+ Summary: Test LLM-powered applications with the same rigor as traditional software.
5
+ Author: checkllm contributors
6
+ License-Expression: MIT
7
+ License-File: LICENSE
8
+ Classifier: Development Status :: 3 - Alpha
9
+ Classifier: Framework :: Pytest
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Programming Language :: Python :: 3.13
17
+ Classifier: Topic :: Software Development :: Testing
18
+ Requires-Python: >=3.10
19
+ Requires-Dist: jinja2>=3.1.0
20
+ Requires-Dist: openai>=1.0.0
21
+ Requires-Dist: pydantic>=2.0.0
22
+ Requires-Dist: pyyaml>=6.0.0
23
+ Requires-Dist: rich>=13.0.0
24
+ Requires-Dist: scipy>=1.11.0
25
+ Requires-Dist: tenacity>=8.0.0
26
+ Requires-Dist: tiktoken>=0.5.0
27
+ Requires-Dist: typer>=0.9.0
28
+ Provides-Extra: all
29
+ Requires-Dist: anthropic>=0.20.0; extra == 'all'
30
+ Provides-Extra: anthropic
31
+ Requires-Dist: anthropic>=0.20.0; extra == 'anthropic'
32
+ Provides-Extra: dev
33
+ Requires-Dist: coverage>=7.0.0; extra == 'dev'
34
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == 'dev'
35
+ Requires-Dist: pytest>=7.0.0; extra == 'dev'
36
+ Description-Content-Type: text/markdown
37
+
38
+ # checkllm
39
+
40
+ Test LLM-powered applications with the same rigor as traditional software.
41
+
42
+ checkllm is a pytest plugin and CLI that lets you write assertions for LLM outputs using deterministic checks, LLM-as-judge evaluation, and statistical regression detection.
43
+
44
+ ## Why checkllm?
45
+
46
+ - **Works with pytest** - no new test runner to learn, just add a `check` fixture
47
+ - **Free deterministic checks** run instantly with zero API calls
48
+ - **LLM-as-judge** for subjective quality (hallucination, relevance, toxicity, custom rubrics)
49
+ - **Statistical regression detection** using Welch's t-test, not just "did it change?"
50
+ - **Multiple judge backends** - OpenAI and Anthropic, or bring your own
51
+ - **One command** to snapshot, report, or diff your test results
52
+
53
+ ## Installation
54
+
55
+ ```bash
56
+ pip install checkllm
57
+ ```
58
+
59
+ For Anthropic Claude support:
60
+
61
+ ```bash
62
+ pip install checkllm[anthropic]
63
+ ```
64
+
65
+ ## Quick Start
66
+
67
+ ### 1. Write a test
68
+
69
+ ```python
70
+ # tests/test_my_agent.py
71
+
72
+ def test_output_quality(check):
73
+ output = my_agent("What is Python?")
74
+
75
+ # Deterministic checks (free, instant)
76
+ check.contains(output, "programming language")
77
+ check.not_contains(output, "JavaScript")
78
+ check.max_tokens(output, limit=200)
79
+
80
+ # LLM-as-judge checks (requires OPENAI_API_KEY)
81
+ check.hallucination(output, context="Python is a high-level programming language.")
82
+ check.relevance(output, query="What is Python?")
83
+ check.toxicity(output)
84
+ ```
85
+
86
+ ### 2. Run it
87
+
88
+ ```bash
89
+ export OPENAI_API_KEY=sk-...
90
+
91
+ pytest tests/test_my_agent.py -v
92
+
93
+ # Or use the CLI
94
+ checkllm run tests/test_my_agent.py
95
+ ```
96
+
97
+ ### 3. Track regressions
98
+
99
+ ```bash
100
+ checkllm snapshot tests/ --output .checkllm/snapshots/baseline.json
101
+
102
+ # After changes, compare
103
+ checkllm snapshot tests/ --output .checkllm/snapshots/current.json
104
+ checkllm diff --baseline .checkllm/snapshots/baseline.json \
105
+ --current .checkllm/snapshots/current.json
106
+ ```
107
+
108
+ ## Deterministic Checks
109
+
110
+ Zero-cost, zero-latency checks that run locally:
111
+
112
+ ```python
113
+ def test_deterministic(check):
114
+ output = my_agent("...")
115
+
116
+ check.contains(output, "expected substring")
117
+ check.not_contains(output, "forbidden text")
118
+ check.exact_match(output, "exact expected output")
119
+ check.exact_match(output, "EXPECTED", ignore_case=True)
120
+ check.starts_with(output, "Python")
121
+ check.ends_with(output, "language.")
122
+ check.regex(output, pattern=r"\d{3}-\d{4}")
123
+ check.max_tokens(output, limit=500)
124
+ check.latency(response_time_ms, max_ms=2000)
125
+ check.cost(api_cost_usd, max_usd=0.05)
126
+
127
+ # Validate JSON structure
128
+ from pydantic import BaseModel
129
+
130
+ class Response(BaseModel):
131
+ answer: str
132
+ confidence: float
133
+
134
+ check.json_schema(output, schema=Response)
135
+ ```
136
+
137
+ ## LLM-as-Judge Metrics
138
+
139
+ Use GPT-4o (or Claude) as an automated judge:
140
+
141
+ ```python
142
+ def test_llm_quality(check):
143
+ output = my_agent("Summarize this article about climate change.")
144
+ article = "..."
145
+
146
+ check.hallucination(output, context=article)
147
+ check.relevance(output, query="Summarize the article")
148
+ check.toxicity(output)
149
+ check.rubric(output, criteria="concise, under 3 sentences, mentions key findings")
150
+ ```
151
+
152
+ Each check records a score (0.0-1.0), pass/fail status, reasoning, cost, and latency.
153
+
154
+ ### Custom Thresholds
155
+
156
+ ```python
157
+ check.hallucination(output, context=ctx, threshold=0.9) # stricter
158
+ check.relevance(output, query=q, threshold=0.6) # more lenient
159
+ ```
160
+
161
+ ### Multiple Runs
162
+
163
+ ```python
164
+ check.hallucination(output, context=ctx, runs=5)
165
+ ```
166
+
167
+ Or set globally:
168
+
169
+ ```toml
170
+ [tool.checkllm]
171
+ runs_per_test = 3
172
+ ```
173
+
174
+ ## Dataset-Driven Testing
175
+
176
+ ```yaml
177
+ # tests/fixtures/cases.yaml
178
+ - input: "What is Python?"
179
+ expected: "Python is a programming language"
180
+ query: "Explain Python"
181
+ context: "Python was created by Guido van Rossum in 1991."
182
+ criteria: "accurate, mentions creator"
183
+
184
+ - input: "What is 2+2?"
185
+ expected: "4"
186
+ criteria: "correct, concise"
187
+ ```
188
+
189
+ ```python
190
+ from checkllm import dataset
191
+
192
+ @dataset("tests/fixtures/cases.yaml")
193
+ def test_across_cases(check, case):
194
+ output = my_agent(case.input)
195
+ check.contains(output, case.expected)
196
+ if case.context:
197
+ check.hallucination(output, context=case.context)
198
+ ```
199
+
200
+ Or use a Python generator:
201
+
202
+ ```python
203
+ from checkllm import Case, dataset
204
+
205
+ def my_cases():
206
+ yield Case(input="Hello", expected="greeting", criteria="friendly")
207
+ yield Case(input="Goodbye", expected="farewell", criteria="polite")
208
+
209
+ @dataset(my_cases)
210
+ def test_generated(check, case):
211
+ output = my_agent(case.input)
212
+ check.rubric(output, criteria=case.criteria)
213
+ ```
214
+
215
+ ## Custom Metrics
216
+
217
+ ```python
218
+ import checkllm
219
+ from checkllm import CheckResult
220
+
221
+ @checkllm.metric("brevity")
222
+ def brevity_check(output: str, max_words: int = 50, **kwargs) -> CheckResult:
223
+ word_count = len(output.split())
224
+ return CheckResult(
225
+ passed=word_count <= max_words,
226
+ score=min(1.0, max_words / max(word_count, 1)),
227
+ reasoning=f"{word_count} words (limit: {max_words})",
228
+ cost=0.0,
229
+ latency_ms=0,
230
+ metric_name="brevity",
231
+ )
232
+
233
+ def test_brevity(check):
234
+ output = my_agent("Explain quantum physics")
235
+ check.run_metric("brevity", output=output, max_words=100)
236
+ ```
237
+
238
+ ## Async Tests
239
+
240
+ ```python
241
+ import pytest
242
+
243
+ @pytest.mark.asyncio
244
+ async def test_async_quality(check):
245
+ output = await my_async_agent("What is Python?")
246
+
247
+ await check.ahallucination(output, context="...")
248
+ await check.arelevance(output, query="What is Python?")
249
+ await check.atoxicity(output)
250
+ await check.arubric(output, criteria="concise and accurate")
251
+
252
+ # Deterministic checks are always sync (instant, no I/O)
253
+ check.contains(output, "Python")
254
+ ```
255
+
256
+ ## Separating Fast and Slow Tests
257
+
258
+ Mark LLM tests so you can skip them in fast CI runs:
259
+
260
+ ```python
261
+ import pytest
262
+
263
+ @pytest.mark.llm
264
+ def test_with_llm(check):
265
+ check.hallucination(output, context=ctx)
266
+
267
+ def test_fast(check):
268
+ check.contains(output, "Python")
269
+ ```
270
+
271
+ ```bash
272
+ # Run only fast deterministic tests
273
+ pytest -m "not llm"
274
+
275
+ # Run only LLM tests
276
+ pytest -m llm
277
+ ```
278
+
279
+ If `OPENAI_API_KEY` is not set, LLM checks automatically skip instead of crashing.
280
+
281
+ ## Regression Detection
282
+
283
+ checkllm uses Welch's t-test to detect statistically significant score regressions.
284
+
285
+ ```bash
286
+ checkllm snapshot tests/ --output .checkllm/snapshots/v1.json
287
+ # ... make changes ...
288
+ checkllm snapshot tests/ --output .checkllm/snapshots/v2.json
289
+ checkllm diff -b .checkllm/snapshots/v1.json -c .checkllm/snapshots/v2.json
290
+
291
+ # Fail CI on regression
292
+ checkllm diff -b v1.json -c v2.json --fail-on-regression
293
+ ```
294
+
295
+ ## Reporting
296
+
297
+ ```bash
298
+ # HTML report
299
+ checkllm report tests/ --output report.html
300
+
301
+ # JUnit XML for CI/CD
302
+ checkllm run tests/ --junit-xml results.xml
303
+
304
+ # pytest flags work directly
305
+ pytest tests/ --checkllm-snapshot=snap.json --checkllm-report=report.html
306
+ ```
307
+
308
+ ## CLI Reference
309
+
310
+ | Command | Description |
311
+ |---------|-------------|
312
+ | `checkllm run <path>` | Run tests with `--snapshot`, `--html-report`, `--junit-xml`, `--compare`, `--fail-on-regression` |
313
+ | `checkllm snapshot <path>` | Save test results as baseline (`--output PATH`) |
314
+ | `checkllm report <path>` | Generate HTML report (`--output PATH`, `--junit-xml PATH`) |
315
+ | `checkllm diff` | Compare snapshots (`--baseline`, `--current`, `--fail-on-regression`) |
316
+ | `checkllm eval` | Evaluate prompt template (`--prompt`, `--dataset`, `--metric`, `--threshold`) |
317
+ | `checkllm init [path]` | Scaffold a new project |
318
+ | `checkllm list-metrics` | List available metrics |
319
+ | `checkllm --version` | Show version |
320
+
321
+ ## Configuration
322
+
323
+ ```toml
324
+ [tool.checkllm]
325
+ judge_backend = "openai" # "openai" or "anthropic"
326
+ judge_model = "gpt-4o" # Model for LLM-as-judge
327
+ default_threshold = 0.8 # Pass/fail threshold (0.0-1.0)
328
+ runs_per_test = 1 # Repeat LLM checks N times
329
+ snapshot_dir = ".checkllm/snapshots"
330
+ confidence_level = 0.95
331
+ p_value_threshold = 0.05
332
+ ```
333
+
334
+ Environment variable overrides: `CHECKLLM_JUDGE_BACKEND`, `CHECKLLM_JUDGE_MODEL`, `CHECKLLM_DEFAULT_THRESHOLD`, `CHECKLLM_RUNS_PER_TEST`.
335
+
336
+ ## Custom Judge Backends
337
+
338
+ ### Anthropic Claude
339
+
340
+ ```toml
341
+ [tool.checkllm]
342
+ judge_backend = "anthropic"
343
+ judge_model = "claude-sonnet-4-6"
344
+ ```
345
+
346
+ ### Your Own Backend
347
+
348
+ Implement the `JudgeBackend` protocol:
349
+
350
+ ```python
351
+ from checkllm import JudgeBackend, JudgeResponse
352
+ from checkllm.check import CheckCollector
353
+ from checkllm.config import CheckllmConfig
354
+
355
+ class MyJudge:
356
+ async def evaluate(self, prompt: str, system_prompt: str | None = None) -> JudgeResponse:
357
+ return JudgeResponse(score=0.9, reasoning="Looks good", cost=0.0)
358
+
359
+ config = CheckllmConfig()
360
+ collector = CheckCollector(config=config, judge=MyJudge())
361
+ ```
362
+
363
+ ## Configuring the Judge in conftest.py
364
+
365
+ To use a cheaper model or a custom backend for all tests:
366
+
367
+ ```python
368
+ # tests/conftest.py
369
+ import pytest
370
+ from checkllm.check import CheckCollector
371
+ from checkllm.config import load_config
372
+ from checkllm.judge import OpenAIJudge
373
+ from checkllm.pytest_plugin import _CHECKLLM_KEY
374
+
375
+ @pytest.fixture
376
+ def check(request):
377
+ config = load_config()
378
+ judge = OpenAIJudge(model="gpt-4o-mini") # cheaper model for dev
379
+ collector = CheckCollector(config=config, judge=judge)
380
+ request.node.stash[_CHECKLLM_KEY] = collector
381
+ return collector
382
+ ```
383
+
384
+ ## Project Setup
385
+
386
+ ```bash
387
+ checkllm init
388
+ ```
389
+
390
+ Creates `pyproject.toml`, `tests/conftest.py`, sample test file, sample dataset, and `.checkllm/snapshots/` directory.
391
+
392
+ ## Examples
393
+
394
+ See the [examples/](examples/) directory for working code:
395
+
396
+ - [test_basic.py](examples/test_basic.py) - Deterministic checks (no API key needed)
397
+ - [test_dataset_driven.py](examples/test_dataset_driven.py) - YAML and generator datasets
398
+ - [test_custom_metrics.py](examples/test_custom_metrics.py) - Register domain-specific metrics
399
+ - [test_llm_judge.py](examples/test_llm_judge.py) - LLM-as-judge evaluation
400
+ - [test_regression_workflow.py](examples/test_regression_workflow.py) - Snapshot and regression detection
401
+
402
+ ## License
403
+
404
+ MIT