signaltest 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. signaltest-0.1.0/.github/workflows/ci.yml +25 -0
  2. signaltest-0.1.0/.github/workflows/release.yml +19 -0
  3. signaltest-0.1.0/.gitignore +13 -0
  4. signaltest-0.1.0/.pre-commit-config.yaml +14 -0
  5. signaltest-0.1.0/CHANGELOG.md +29 -0
  6. signaltest-0.1.0/CONTRIBUTING.md +31 -0
  7. signaltest-0.1.0/LICENSE +21 -0
  8. signaltest-0.1.0/PKG-INFO +277 -0
  9. signaltest-0.1.0/README.md +259 -0
  10. signaltest-0.1.0/docs/architecture.md +96 -0
  11. signaltest-0.1.0/examples/demo.py +33 -0
  12. signaltest-0.1.0/examples/tool_agent.py +70 -0
  13. signaltest-0.1.0/pyproject.toml +46 -0
  14. signaltest-0.1.0/src/signaltest/__init__.py +35 -0
  15. signaltest-0.1.0/src/signaltest/baseline/__init__.py +0 -0
  16. signaltest-0.1.0/src/signaltest/baseline/record.py +21 -0
  17. signaltest-0.1.0/src/signaltest/baseline/store.py +25 -0
  18. signaltest-0.1.0/src/signaltest/cli.py +39 -0
  19. signaltest-0.1.0/src/signaltest/metrics/__init__.py +0 -0
  20. signaltest-0.1.0/src/signaltest/metrics/base.py +17 -0
  21. signaltest-0.1.0/src/signaltest/metrics/contains.py +12 -0
  22. signaltest-0.1.0/src/signaltest/metrics/exact.py +12 -0
  23. signaltest-0.1.0/src/signaltest/metrics/judge.py +15 -0
  24. signaltest-0.1.0/src/signaltest/metrics/numeric.py +14 -0
  25. signaltest-0.1.0/src/signaltest/metrics/trajectory.py +17 -0
  26. signaltest-0.1.0/src/signaltest/plugin.py +5 -0
  27. signaltest-0.1.0/src/signaltest/py.typed +0 -0
  28. signaltest-0.1.0/src/signaltest/report.py +32 -0
  29. signaltest-0.1.0/src/signaltest/runner.py +136 -0
  30. signaltest-0.1.0/src/signaltest/stats/__init__.py +0 -0
  31. signaltest-0.1.0/src/signaltest/stats/correction.py +10 -0
  32. signaltest-0.1.0/src/signaltest/stats/effect.py +28 -0
  33. signaltest-0.1.0/src/signaltest/stats/gate.py +45 -0
  34. signaltest-0.1.0/src/signaltest/stats/significance.py +35 -0
  35. signaltest-0.1.0/src/signaltest/trajectory/__init__.py +0 -0
  36. signaltest-0.1.0/src/signaltest/trajectory/diff.py +22 -0
  37. signaltest-0.1.0/src/signaltest/trajectory/match.py +21 -0
  38. signaltest-0.1.0/src/signaltest/trajectory/model.py +8 -0
  39. signaltest-0.1.0/tests/test_api.py +16 -0
  40. signaltest-0.1.0/tests/test_baseline.py +38 -0
  41. signaltest-0.1.0/tests/test_boolean_significance.py +20 -0
  42. signaltest-0.1.0/tests/test_cli.py +44 -0
  43. signaltest-0.1.0/tests/test_correction.py +21 -0
  44. signaltest-0.1.0/tests/test_diff.py +34 -0
  45. signaltest-0.1.0/tests/test_e2e.py +28 -0
  46. signaltest-0.1.0/tests/test_effect_ci.py +28 -0
  47. signaltest-0.1.0/tests/test_gate.py +44 -0
  48. signaltest-0.1.0/tests/test_integration.py +51 -0
  49. signaltest-0.1.0/tests/test_judge.py +30 -0
  50. signaltest-0.1.0/tests/test_metrics.py +16 -0
  51. signaltest-0.1.0/tests/test_model.py +31 -0
  52. signaltest-0.1.0/tests/test_more_metrics.py +21 -0
  53. signaltest-0.1.0/tests/test_plugin.py +20 -0
  54. signaltest-0.1.0/tests/test_record.py +25 -0
  55. signaltest-0.1.0/tests/test_report.py +21 -0
  56. signaltest-0.1.0/tests/test_report_detail.py +21 -0
  57. signaltest-0.1.0/tests/test_runner.py +64 -0
  58. signaltest-0.1.0/tests/test_significance.py +26 -0
  59. signaltest-0.1.0/tests/test_suite.py +36 -0
  60. signaltest-0.1.0/tests/test_trajectory.py +33 -0
  61. signaltest-0.1.0/tests/test_trajectory_metric.py +26 -0
  62. signaltest-0.1.0/tests/test_version.py +5 -0
@@ -0,0 +1,25 @@
1
+ name: ci
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+
8
+ jobs:
9
+ test:
10
+ runs-on: ubuntu-latest
11
+ strategy:
12
+ matrix:
13
+ python-version: ["3.10", "3.11", "3.12"]
14
+ steps:
15
+ - uses: actions/checkout@v4
16
+ - uses: astral-sh/setup-uv@v5
17
+ with:
18
+ python-version: ${{ matrix.python-version }}
19
+ - run: uv sync --extra dev
20
+ - run: uv run ruff check src tests examples
21
+ - run: uv run ruff format --check src tests examples
22
+ - run: uv run mypy
23
+ if: matrix.python-version == '3.12'
24
+ - run: uv run coverage run -m pytest -q
25
+ - run: uv run coverage report
@@ -0,0 +1,19 @@
1
+ name: release
2
+
3
+ on:
4
+ push:
5
+ tags: ["v*"]
6
+
7
+ jobs:
8
+ publish:
9
+ runs-on: ubuntu-latest
10
+ permissions:
11
+ id-token: write
12
+ steps:
13
+ - uses: actions/checkout@v4
14
+ - uses: actions/setup-python@v5
15
+ with:
16
+ python-version: "3.12"
17
+ - run: pip install build
18
+ - run: python -m build
19
+ - uses: pypa/gh-action-pypi-publish@release/v1
@@ -0,0 +1,13 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ .venv/
4
+ venv/
5
+ *.egg-info/
6
+ dist/
7
+ build/
8
+ .pytest_cache/
9
+ .ruff_cache/
10
+ .coverage
11
+ .env
12
+ .env.local
13
+ uv.lock
@@ -0,0 +1,14 @@
1
+ repos:
2
+ - repo: https://github.com/pre-commit/pre-commit-hooks
3
+ rev: v5.0.0
4
+ hooks:
5
+ - id: trailing-whitespace
6
+ - id: end-of-file-fixer
7
+ - id: check-yaml
8
+ - id: check-toml
9
+ - repo: https://github.com/astral-sh/ruff-pre-commit
10
+ rev: v0.15.20
11
+ hooks:
12
+ - id: ruff
13
+ args: [--fix]
14
+ - id: ruff-format
@@ -0,0 +1,29 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project are documented here. The format is based on
4
+ [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
5
+
6
+ ## Unreleased
7
+
8
+ ## 0.1.0 - 2026-06-30
9
+
10
+ ### Added
11
+ - Statistical gate: permutation and Fisher significance, bootstrap effect-size
12
+ interval, Benjamini-Hochberg correction, and a decision that blocks only on a
13
+ significant regression past a minimum effect size.
14
+ - Underpowered detection so cases with too few samples are flagged, not passed.
15
+ - Metrics: exact match, contains, numeric (configurable polarity), trajectory
16
+ match, and an LLM-judge metric that wraps any scoring callable.
17
+ - `Metric` protocol so custom metrics need only `name`, `kind`, `polarity`, and
18
+ `score`.
19
+ - Tool-trajectory model, match score, and a git-style diff renderer.
20
+ - Baseline JSON store with cold-start record-only and corrupt-file detection.
21
+ - Model versioning: baselines re-record on a model change instead of reporting it
22
+ as a regression.
23
+ - pytest plugin and `assert_no_regression` for single cases.
24
+ - `run_suite` with suite-level correction, plus a text report and CI exit code.
25
+ - Reports show the measured effect size and p-value on every case.
26
+ - `signaltest` CLI for inspecting baselines.
27
+ - Type hints across the package, checked with mypy `strict`.
28
+ - Offline demos (a minimal case and a tool-using agent) and an end-to-end test
29
+ that run with no API key.
@@ -0,0 +1,31 @@
1
+ # Contributing
2
+
3
+ Thanks for your interest in signaltest.
4
+
5
+ ## Setup
6
+
7
+ ```sh
8
+ python -m venv .venv && . .venv/bin/activate
9
+ pip install -e ".[dev]"
10
+ pre-commit install # optional: run the linters on every commit
11
+ ```
12
+
13
+ ## Before opening a PR
14
+
15
+ - Add a test for anything you change.
16
+ - Run the checks (the same ones CI runs):
17
+
18
+ ```sh
19
+ ruff check src tests examples
20
+ ruff format --check src tests examples
21
+ mypy
22
+ coverage run -m pytest && coverage report
23
+ ```
24
+
25
+ - Keep changes small and focused — one idea per PR.
26
+ - Match the existing style: simple, direct code, few comments.
27
+
28
+ ## Reporting bugs
29
+
30
+ Open an issue with a minimal reproduction: the metric, the inputs, and what you
31
+ expected versus what happened.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Hatim
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,277 @@
1
+ Metadata-Version: 2.4
2
+ Name: signaltest
3
+ Version: 0.1.0
4
+ Summary: Flake-proof regression testing for LLM agents
5
+ Project-URL: Homepage, https://github.com/Falcon305/signaltest
6
+ Author: Hatim
7
+ License-Expression: MIT
8
+ License-File: LICENSE
9
+ Requires-Python: >=3.10
10
+ Requires-Dist: scipy>=1.11
11
+ Provides-Extra: dev
12
+ Requires-Dist: coverage>=7.0; extra == 'dev'
13
+ Requires-Dist: mypy>=1.11; extra == 'dev'
14
+ Requires-Dist: pre-commit>=3.0; extra == 'dev'
15
+ Requires-Dist: pytest>=8.0; extra == 'dev'
16
+ Requires-Dist: ruff>=0.6; extra == 'dev'
17
+ Description-Content-Type: text/markdown
18
+
19
+ # signaltest
20
+
21
+ [![ci](https://github.com/Falcon305/signaltest/actions/workflows/ci.yml/badge.svg)](https://github.com/Falcon305/signaltest/actions/workflows/ci.yml)
22
+
23
+ Regression tests for LLM agents that don't fail your CI on noise.
24
+
25
+ LLMs are non-deterministic, so naive eval checks flake: a score drifts a point on
26
+ randomness, CI goes red, the team stops trusting it, and the check gets deleted.
27
+ signaltest runs each case several times and blocks a PR only when a regression is
28
+ **statistically real and large enough to matter** — then shows a diff of what
29
+ actually changed in the agent's run.
30
+
31
+ Local-first. No account, no service, no data leaves your repo.
32
+
33
+ Status: v0.1.0.
34
+
35
+ ## Contents
36
+
37
+ - [Why](#why)
38
+ - [Install](#install)
39
+ - [Quick start](#quick-start)
40
+ - [Testing a whole suite](#testing-a-whole-suite)
41
+ - [Metrics](#metrics)
42
+ - [How it works](#how-it-works)
43
+ - [Configuration](#configuration)
44
+ - [Baselines](#baselines)
45
+ - [Using it in CI](#using-it-in-ci)
46
+ - [CLI](#cli)
47
+ - [Development](#development)
48
+ - [FAQ](#faq)
49
+ - [Contributing](#contributing)
50
+ - [License](#license)
51
+
52
+ ## Why
53
+
54
+ Most eval tools score an agent once and compare against a fixed threshold. With a
55
+ stochastic model, that threshold flakes: the same prompt scores 0.84 one run and
56
+ 0.81 the next. CI fails on the bad draw, people stop believing it, and the safety
57
+ net is gone.
58
+
59
+ signaltest treats the score as a distribution, not a number. It samples the agent
60
+ `n` times for the candidate, compares against `n` recorded baseline samples, and
61
+ only fails when the difference is **statistically significant** *and* clears a
62
+ **minimum effect size**. Noise stays green. Real regressions go red.
63
+
64
+ ## Install
65
+
66
+ ```sh
67
+ pip install signaltest
68
+ ```
69
+
70
+ Or with [uv](https://docs.astral.sh/uv/):
71
+
72
+ ```sh
73
+ uv pip install signaltest # into the active environment
74
+ uv add signaltest # into a uv-managed project
75
+ ```
76
+
77
+ ## Quick start
78
+
79
+ Write a normal pytest test. Give signaltest a way to run your agent, the expected
80
+ output, and a metric.
81
+
82
+ ```python
83
+ from signaltest import Case, assert_no_regression, ExactMatch
84
+
85
+
86
+ def test_math_agent():
87
+ case = Case(
88
+ case_id="math_qa",
89
+ run=lambda: my_agent("what is 2 + 2?"),
90
+ expected="4",
91
+ metric=ExactMatch(),
92
+ )
93
+ assert_no_regression(case, "baselines/math_agent.json", n=10)
94
+ ```
95
+
96
+ The first run records a baseline (committed as JSON in your repo). Later runs
97
+ compare against it and fail the test only on a real regression.
98
+
99
+ ## Testing a whole suite
100
+
101
+ `run_suite` runs many cases and applies a multiple-comparison correction across
102
+ them, so a suite of 50 cases doesn't go red just because one flaked.
103
+
104
+ ```python
105
+ from signaltest import Case, run_suite, format_report, exit_code, ExactMatch
106
+
107
+ cases = [
108
+ Case("math", run=lambda: my_agent("2 + 2?"), expected="4", metric=ExactMatch()),
109
+ Case("geo", run=lambda: my_agent("capital of France?"), expected="Paris", metric=ExactMatch()),
110
+ ]
111
+
112
+ results = run_suite(cases, "baselines/agent.json", n=10)
113
+ print(format_report(results))
114
+ raise SystemExit(exit_code(results))
115
+ ```
116
+
117
+ `format_report` returns a printable per-case summary; `exit_code` returns `1` if any case
118
+ regressed, `0` otherwise — drop it straight into a CI step.
119
+
120
+ A failing case reports the measured effect size and p-value, so you see *how
121
+ big* the regression is, not just that one happened:
122
+
123
+ ```
124
+ PASS geo: no significant regression
125
+ FAIL math: significant regression past the effect floor (effect=-0.180, p=0.004)
126
+ 1 passed, 1 failed, 0 inconclusive
127
+ ```
128
+
129
+ ## Metrics
130
+
131
+ A metric declares its `kind` (numeric or boolean, which picks the significance
132
+ test) and its `polarity` (whether higher or lower is better).
133
+
134
+ | Metric | Kind | Polarity | Scores |
135
+ |--------|------|----------|--------|
136
+ | `ExactMatch()` | boolean | higher better | `output == expected` |
137
+ | `Contains()` | boolean | higher better | `expected in output` |
138
+ | `Numeric(name, polarity)` | numeric | configurable | the raw value (latency, cost, judge score) |
139
+ | `TrajectoryMatch(ignore_keys=...)` | numeric | higher better | fraction of matching agent tool-calls |
140
+
141
+ `Numeric` with `polarity="lower_better"` is how you gate latency or cost — a real
142
+ *increase* becomes the regression.
143
+
144
+ ```python
145
+ from signaltest import Numeric
146
+ from signaltest.metrics.base import LOWER_BETTER
147
+
148
+ latency = Numeric(name="latency_ms", polarity=LOWER_BETTER)
149
+ ```
150
+
151
+ `TrajectoryMatch` compares the agent's tool-call path against a reference path and
152
+ ignores volatile keys (timestamps, ids):
153
+
154
+ ```python
155
+ from signaltest import TrajectoryMatch, Step
156
+
157
+ expected_path = [Step("search", {"q": "weather"}), Step("answer", {})]
158
+ metric = TrajectoryMatch(ignore_keys=("request_id",))
159
+ ```
160
+
161
+ ## How it works
162
+
163
+ ```
164
+ candidate runs n times ─┐
165
+ ├─> significance test ─┐
166
+ stored baseline samples ┘ ├─> block only if
167
+ │ significant AND
168
+ effect size ─────────┘ past the floor
169
+ ```
170
+
171
+ - **Significance** — a permutation test for numeric metrics, Fisher's exact test
172
+ for boolean metrics. Both are deterministic (the permutation test is seeded), so the same inputs always give the same
173
+ result. The gate that kills flakiness is not itself flaky.
174
+ - **Effect floor** — a regression must also clear a minimum effect size, so a
175
+ statistically significant but meaningless 0.1% drift never blocks the build.
176
+ - **Multiple comparisons** — across a suite, p-values are adjusted with the
177
+ Benjamini-Hochberg procedure, so flakiness doesn't reappear at the suite level.
178
+ - **Power** — cases with too few samples to detect a real change are flagged
179
+ `inconclusive`, never passed silently.
180
+ - **Model versioning** — a baseline records the model it was captured under. If you
181
+ pass a `model=` that differs from the recorded one, the baseline is re-recorded instead of
182
+ reported as a regression, so a provider model swap can't masquerade as one.
183
+
184
+ ## Configuration
185
+
186
+ Every `assert_no_regression` / `check_case` / `run_suite` call accepts:
187
+
188
+ | Argument | Default | Meaning |
189
+ |----------|---------|---------|
190
+ | `n` | `10` | samples per run (boolean metrics usually want more) |
191
+ | `alpha` | `0.05` | significance threshold |
192
+ | `min_effect` | `0.03` numeric / `0.10` boolean | minimum effect size to count |
193
+ | `min_valid` | `2` | fewer valid samples than this → `inconclusive` |
194
+ | `model` | `None` | model id recorded with the baseline |
195
+
196
+ ## Baselines
197
+
198
+ A baseline is a JSON file committed to your repo. Each entry is keyed by
199
+ `case_id::metric_name` and stores the recorded scores and the model.
200
+
201
+ - **Cold start** — the first run records the baseline and passes.
202
+ - **Updating** — to accept a new baseline on purpose, delete the case's entry and
203
+ re-run, or edit the JSON. The change is a reviewable diff in the same PR.
204
+ - **Inspecting** — use the CLI (below).
205
+
206
+ ## Using it in CI
207
+
208
+ Because cases are plain pytest tests, your existing `pytest` step gates them:
209
+
210
+ ```yaml
211
+ - run: pip install -e ".[dev]"
212
+ - run: pytest
213
+ ```
214
+
215
+ A failed case fails the build. Baselines live in the repo, so CI needs no secrets
216
+ and nothing leaves your infrastructure.
217
+
218
+ ## CLI
219
+
220
+ ```sh
221
+ signaltest version
222
+ signaltest baselines baselines/agent.json # list recorded cases
223
+ signaltest show baselines/agent.json math::exact_match
224
+ ```
225
+
226
+ ## Development
227
+
228
+ ```sh
229
+ git clone https://github.com/Falcon305/signaltest
230
+ cd signaltest
231
+ python -m venv .venv && . .venv/bin/activate
232
+ pip install -e ".[dev]"
233
+ pytest
234
+ ruff check src tests examples
235
+ ```
236
+
237
+ With uv the setup is a single command (it creates the environment for you):
238
+
239
+ ```sh
240
+ uv sync --extra dev
241
+ uv run pytest
242
+ ```
243
+
244
+ Try the offline examples (cached responses, no API key):
245
+
246
+ ```sh
247
+ python examples/demo.py # smallest possible case
248
+ python examples/tool_agent.py # tool-using agent: trajectory + answer checks
249
+ ```
250
+
251
+ See [docs/architecture.md](https://github.com/Falcon305/signaltest/blob/main/docs/architecture.md) for how the pieces fit together
252
+ and how to add your own metric.
253
+
254
+ ## FAQ
255
+
256
+ **Does it call my LLM?** Only through the `run` function you provide. signaltest
257
+ never talks to a provider itself.
258
+
259
+ **How many samples do I need?** `n=10` is a sane default for numeric metrics.
260
+ Boolean metrics resolve in coarser steps, so they need more — bump `n` and watch
261
+ for `inconclusive`, which means the test can't yet detect a change of the size you
262
+ care about.
263
+
264
+ **Why did a case come back `inconclusive`?** Too few valid samples to be
265
+ trustworthy. Increase `n`, or fix whatever made runs error out.
266
+
267
+ **Does my data leave my machine?** No. Baselines are local JSON; there is no
268
+ service.
269
+
270
+ ## Contributing
271
+
272
+ Issues and pull requests are welcome. Keep changes small and focused, and add a
273
+ test for anything you change. See `CONTRIBUTING.md`.
274
+
275
+ ## License
276
+
277
+ MIT
@@ -0,0 +1,259 @@
1
+ # signaltest
2
+
3
+ [![ci](https://github.com/Falcon305/signaltest/actions/workflows/ci.yml/badge.svg)](https://github.com/Falcon305/signaltest/actions/workflows/ci.yml)
4
+
5
+ Regression tests for LLM agents that don't fail your CI on noise.
6
+
7
+ LLMs are non-deterministic, so naive eval checks flake: a score drifts a point on
8
+ randomness, CI goes red, the team stops trusting it, and the check gets deleted.
9
+ signaltest runs each case several times and blocks a PR only when a regression is
10
+ **statistically real and large enough to matter** — then shows a diff of what
11
+ actually changed in the agent's run.
12
+
13
+ Local-first. No account, no service, no data leaves your repo.
14
+
15
+ Status: v0.1.0.
16
+
17
+ ## Contents
18
+
19
+ - [Why](#why)
20
+ - [Install](#install)
21
+ - [Quick start](#quick-start)
22
+ - [Testing a whole suite](#testing-a-whole-suite)
23
+ - [Metrics](#metrics)
24
+ - [How it works](#how-it-works)
25
+ - [Configuration](#configuration)
26
+ - [Baselines](#baselines)
27
+ - [Using it in CI](#using-it-in-ci)
28
+ - [CLI](#cli)
29
+ - [Development](#development)
30
+ - [FAQ](#faq)
31
+ - [Contributing](#contributing)
32
+ - [License](#license)
33
+
34
+ ## Why
35
+
36
+ Most eval tools score an agent once and compare against a fixed threshold. With a
37
+ stochastic model, that threshold flakes: the same prompt scores 0.84 one run and
38
+ 0.81 the next. CI fails on the bad draw, people stop believing it, and the safety
39
+ net is gone.
40
+
41
+ signaltest treats the score as a distribution, not a number. It samples the agent
42
+ `n` times for the candidate, compares against `n` recorded baseline samples, and
43
+ only fails when the difference is **statistically significant** *and* clears a
44
+ **minimum effect size**. Noise stays green. Real regressions go red.
45
+
46
+ ## Install
47
+
48
+ ```sh
49
+ pip install signaltest
50
+ ```
51
+
52
+ Or with [uv](https://docs.astral.sh/uv/):
53
+
54
+ ```sh
55
+ uv pip install signaltest # into the active environment
56
+ uv add signaltest # into a uv-managed project
57
+ ```
58
+
59
+ ## Quick start
60
+
61
+ Write a normal pytest test. Give signaltest a way to run your agent, the expected
62
+ output, and a metric.
63
+
64
+ ```python
65
+ from signaltest import Case, assert_no_regression, ExactMatch
66
+
67
+
68
+ def test_math_agent():
69
+ case = Case(
70
+ case_id="math_qa",
71
+ run=lambda: my_agent("what is 2 + 2?"),
72
+ expected="4",
73
+ metric=ExactMatch(),
74
+ )
75
+ assert_no_regression(case, "baselines/math_agent.json", n=10)
76
+ ```
77
+
78
+ The first run records a baseline (committed as JSON in your repo). Later runs
79
+ compare against it and fail the test only on a real regression.
80
+
81
+ ## Testing a whole suite
82
+
83
+ `run_suite` runs many cases and applies a multiple-comparison correction across
84
+ them, so a suite of 50 cases doesn't go red just because one flaked.
85
+
86
+ ```python
87
+ from signaltest import Case, run_suite, format_report, exit_code, ExactMatch
88
+
89
+ cases = [
90
+ Case("math", run=lambda: my_agent("2 + 2?"), expected="4", metric=ExactMatch()),
91
+ Case("geo", run=lambda: my_agent("capital of France?"), expected="Paris", metric=ExactMatch()),
92
+ ]
93
+
94
+ results = run_suite(cases, "baselines/agent.json", n=10)
95
+ print(format_report(results))
96
+ raise SystemExit(exit_code(results))
97
+ ```
98
+
99
+ `format_report` returns a printable per-case summary; `exit_code` returns `1` if any case
100
+ regressed, `0` otherwise — drop it straight into a CI step.
101
+
102
+ A failing case reports the measured effect size and p-value, so you see *how
103
+ big* the regression is, not just that one happened:
104
+
105
+ ```
106
+ PASS geo: no significant regression
107
+ FAIL math: significant regression past the effect floor (effect=-0.180, p=0.004)
108
+ 1 passed, 1 failed, 0 inconclusive
109
+ ```
110
+
111
+ ## Metrics
112
+
113
+ A metric declares its `kind` (numeric or boolean, which picks the significance
114
+ test) and its `polarity` (whether higher or lower is better).
115
+
116
+ | Metric | Kind | Polarity | Scores |
117
+ |--------|------|----------|--------|
118
+ | `ExactMatch()` | boolean | higher better | `output == expected` |
119
+ | `Contains()` | boolean | higher better | `expected in output` |
120
+ | `Numeric(name, polarity)` | numeric | configurable | the raw value (latency, cost, judge score) |
121
+ | `TrajectoryMatch(ignore_keys=...)` | numeric | higher better | fraction of matching agent tool-calls |
122
+
123
+ `Numeric` with `polarity="lower_better"` is how you gate latency or cost — a real
124
+ *increase* becomes the regression.
125
+
126
+ ```python
127
+ from signaltest import Numeric
128
+ from signaltest.metrics.base import LOWER_BETTER
129
+
130
+ latency = Numeric(name="latency_ms", polarity=LOWER_BETTER)
131
+ ```
132
+
133
+ `TrajectoryMatch` compares the agent's tool-call path against a reference path and
134
+ ignores volatile keys (timestamps, ids):
135
+
136
+ ```python
137
+ from signaltest import TrajectoryMatch, Step
138
+
139
+ expected_path = [Step("search", {"q": "weather"}), Step("answer", {})]
140
+ metric = TrajectoryMatch(ignore_keys=("request_id",))
141
+ ```
142
+
143
+ ## How it works
144
+
145
+ ```
146
+ candidate runs n times ─┐
147
+ ├─> significance test ─┐
148
+ stored baseline samples ┘ ├─> block only if
149
+ │ significant AND
150
+ effect size ─────────┘ past the floor
151
+ ```
152
+
153
+ - **Significance** — a permutation test for numeric metrics, Fisher's exact test
154
+ for boolean metrics. Both are deterministic (the permutation test is seeded), so the same inputs always give the same
155
+ result. The gate that kills flakiness is not itself flaky.
156
+ - **Effect floor** — a regression must also clear a minimum effect size, so a
157
+ statistically significant but meaningless 0.1% drift never blocks the build.
158
+ - **Multiple comparisons** — across a suite, p-values are adjusted with the
159
+ Benjamini-Hochberg procedure, so flakiness doesn't reappear at the suite level.
160
+ - **Power** — cases with too few samples to detect a real change are flagged
161
+ `inconclusive`, never passed silently.
162
+ - **Model versioning** — a baseline records the model it was captured under. If you
163
+ pass a `model=` that differs from the recorded one, the baseline is re-recorded instead of
164
+ reported as a regression, so a provider model swap can't masquerade as one.
165
+
166
+ ## Configuration
167
+
168
+ Every `assert_no_regression` / `check_case` / `run_suite` call accepts:
169
+
170
+ | Argument | Default | Meaning |
171
+ |----------|---------|---------|
172
+ | `n` | `10` | samples per run (boolean metrics usually want more) |
173
+ | `alpha` | `0.05` | significance threshold |
174
+ | `min_effect` | `0.03` numeric / `0.10` boolean | minimum effect size to count |
175
+ | `min_valid` | `2` | fewer valid samples than this → `inconclusive` |
176
+ | `model` | `None` | model id recorded with the baseline |
177
+
178
+ ## Baselines
179
+
180
+ A baseline is a JSON file committed to your repo. Each entry is keyed by
181
+ `case_id::metric_name` and stores the recorded scores and the model.
182
+
183
+ - **Cold start** — the first run records the baseline and passes.
184
+ - **Updating** — to accept a new baseline on purpose, delete the case's entry and
185
+ re-run, or edit the JSON. The change is a reviewable diff in the same PR.
186
+ - **Inspecting** — use the CLI (below).
187
+
188
+ ## Using it in CI
189
+
190
+ Because cases are plain pytest tests, your existing `pytest` step gates them:
191
+
192
+ ```yaml
193
+ - run: pip install -e ".[dev]"
194
+ - run: pytest
195
+ ```
196
+
197
+ A failed case fails the build. Baselines live in the repo, so CI needs no secrets
198
+ and nothing leaves your infrastructure.
199
+
200
+ ## CLI
201
+
202
+ ```sh
203
+ signaltest version
204
+ signaltest baselines baselines/agent.json # list recorded cases
205
+ signaltest show baselines/agent.json math::exact_match
206
+ ```
207
+
208
+ ## Development
209
+
210
+ ```sh
211
+ git clone https://github.com/Falcon305/signaltest
212
+ cd signaltest
213
+ python -m venv .venv && . .venv/bin/activate
214
+ pip install -e ".[dev]"
215
+ pytest
216
+ ruff check src tests examples
217
+ ```
218
+
219
+ With uv the setup is a single command (it creates the environment for you):
220
+
221
+ ```sh
222
+ uv sync --extra dev
223
+ uv run pytest
224
+ ```
225
+
226
+ Try the offline examples (cached responses, no API key):
227
+
228
+ ```sh
229
+ python examples/demo.py # smallest possible case
230
+ python examples/tool_agent.py # tool-using agent: trajectory + answer checks
231
+ ```
232
+
233
+ See [docs/architecture.md](https://github.com/Falcon305/signaltest/blob/main/docs/architecture.md) for how the pieces fit together
234
+ and how to add your own metric.
235
+
236
+ ## FAQ
237
+
238
+ **Does it call my LLM?** Only through the `run` function you provide. signaltest
239
+ never talks to a provider itself.
240
+
241
+ **How many samples do I need?** `n=10` is a sane default for numeric metrics.
242
+ Boolean metrics resolve in coarser steps, so they need more — bump `n` and watch
243
+ for `inconclusive`, which means the test can't yet detect a change of the size you
244
+ care about.
245
+
246
+ **Why did a case come back `inconclusive`?** Too few valid samples to be
247
+ trustworthy. Increase `n`, or fix whatever made runs error out.
248
+
249
+ **Does my data leave my machine?** No. Baselines are local JSON; there is no
250
+ service.
251
+
252
+ ## Contributing
253
+
254
+ Issues and pull requests are welcome. Keep changes small and focused, and add a
255
+ test for anything you change. See `CONTRIBUTING.md`.
256
+
257
+ ## License
258
+
259
+ MIT