signaltest 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- signaltest-0.1.0/.github/workflows/ci.yml +25 -0
- signaltest-0.1.0/.github/workflows/release.yml +19 -0
- signaltest-0.1.0/.gitignore +13 -0
- signaltest-0.1.0/.pre-commit-config.yaml +14 -0
- signaltest-0.1.0/CHANGELOG.md +29 -0
- signaltest-0.1.0/CONTRIBUTING.md +31 -0
- signaltest-0.1.0/LICENSE +21 -0
- signaltest-0.1.0/PKG-INFO +277 -0
- signaltest-0.1.0/README.md +259 -0
- signaltest-0.1.0/docs/architecture.md +96 -0
- signaltest-0.1.0/examples/demo.py +33 -0
- signaltest-0.1.0/examples/tool_agent.py +70 -0
- signaltest-0.1.0/pyproject.toml +46 -0
- signaltest-0.1.0/src/signaltest/__init__.py +35 -0
- signaltest-0.1.0/src/signaltest/baseline/__init__.py +0 -0
- signaltest-0.1.0/src/signaltest/baseline/record.py +21 -0
- signaltest-0.1.0/src/signaltest/baseline/store.py +25 -0
- signaltest-0.1.0/src/signaltest/cli.py +39 -0
- signaltest-0.1.0/src/signaltest/metrics/__init__.py +0 -0
- signaltest-0.1.0/src/signaltest/metrics/base.py +17 -0
- signaltest-0.1.0/src/signaltest/metrics/contains.py +12 -0
- signaltest-0.1.0/src/signaltest/metrics/exact.py +12 -0
- signaltest-0.1.0/src/signaltest/metrics/judge.py +15 -0
- signaltest-0.1.0/src/signaltest/metrics/numeric.py +14 -0
- signaltest-0.1.0/src/signaltest/metrics/trajectory.py +17 -0
- signaltest-0.1.0/src/signaltest/plugin.py +5 -0
- signaltest-0.1.0/src/signaltest/py.typed +0 -0
- signaltest-0.1.0/src/signaltest/report.py +32 -0
- signaltest-0.1.0/src/signaltest/runner.py +136 -0
- signaltest-0.1.0/src/signaltest/stats/__init__.py +0 -0
- signaltest-0.1.0/src/signaltest/stats/correction.py +10 -0
- signaltest-0.1.0/src/signaltest/stats/effect.py +28 -0
- signaltest-0.1.0/src/signaltest/stats/gate.py +45 -0
- signaltest-0.1.0/src/signaltest/stats/significance.py +35 -0
- signaltest-0.1.0/src/signaltest/trajectory/__init__.py +0 -0
- signaltest-0.1.0/src/signaltest/trajectory/diff.py +22 -0
- signaltest-0.1.0/src/signaltest/trajectory/match.py +21 -0
- signaltest-0.1.0/src/signaltest/trajectory/model.py +8 -0
- signaltest-0.1.0/tests/test_api.py +16 -0
- signaltest-0.1.0/tests/test_baseline.py +38 -0
- signaltest-0.1.0/tests/test_boolean_significance.py +20 -0
- signaltest-0.1.0/tests/test_cli.py +44 -0
- signaltest-0.1.0/tests/test_correction.py +21 -0
- signaltest-0.1.0/tests/test_diff.py +34 -0
- signaltest-0.1.0/tests/test_e2e.py +28 -0
- signaltest-0.1.0/tests/test_effect_ci.py +28 -0
- signaltest-0.1.0/tests/test_gate.py +44 -0
- signaltest-0.1.0/tests/test_integration.py +51 -0
- signaltest-0.1.0/tests/test_judge.py +30 -0
- signaltest-0.1.0/tests/test_metrics.py +16 -0
- signaltest-0.1.0/tests/test_model.py +31 -0
- signaltest-0.1.0/tests/test_more_metrics.py +21 -0
- signaltest-0.1.0/tests/test_plugin.py +20 -0
- signaltest-0.1.0/tests/test_record.py +25 -0
- signaltest-0.1.0/tests/test_report.py +21 -0
- signaltest-0.1.0/tests/test_report_detail.py +21 -0
- signaltest-0.1.0/tests/test_runner.py +64 -0
- signaltest-0.1.0/tests/test_significance.py +26 -0
- signaltest-0.1.0/tests/test_suite.py +36 -0
- signaltest-0.1.0/tests/test_trajectory.py +33 -0
- signaltest-0.1.0/tests/test_trajectory_metric.py +26 -0
- signaltest-0.1.0/tests/test_version.py +5 -0
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
name: ci
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
test:
|
|
10
|
+
runs-on: ubuntu-latest
|
|
11
|
+
strategy:
|
|
12
|
+
matrix:
|
|
13
|
+
python-version: ["3.10", "3.11", "3.12"]
|
|
14
|
+
steps:
|
|
15
|
+
- uses: actions/checkout@v4
|
|
16
|
+
- uses: astral-sh/setup-uv@v5
|
|
17
|
+
with:
|
|
18
|
+
python-version: ${{ matrix.python-version }}
|
|
19
|
+
- run: uv sync --extra dev
|
|
20
|
+
- run: uv run ruff check src tests examples
|
|
21
|
+
- run: uv run ruff format --check src tests examples
|
|
22
|
+
- run: uv run mypy
|
|
23
|
+
if: matrix.python-version == '3.12'
|
|
24
|
+
- run: uv run coverage run -m pytest -q
|
|
25
|
+
- run: uv run coverage report
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
name: release
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags: ["v*"]
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
publish:
|
|
9
|
+
runs-on: ubuntu-latest
|
|
10
|
+
permissions:
|
|
11
|
+
id-token: write
|
|
12
|
+
steps:
|
|
13
|
+
- uses: actions/checkout@v4
|
|
14
|
+
- uses: actions/setup-python@v5
|
|
15
|
+
with:
|
|
16
|
+
python-version: "3.12"
|
|
17
|
+
- run: pip install build
|
|
18
|
+
- run: python -m build
|
|
19
|
+
- uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
repos:
|
|
2
|
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
|
3
|
+
rev: v5.0.0
|
|
4
|
+
hooks:
|
|
5
|
+
- id: trailing-whitespace
|
|
6
|
+
- id: end-of-file-fixer
|
|
7
|
+
- id: check-yaml
|
|
8
|
+
- id: check-toml
|
|
9
|
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
10
|
+
rev: v0.15.20
|
|
11
|
+
hooks:
|
|
12
|
+
- id: ruff
|
|
13
|
+
args: [--fix]
|
|
14
|
+
- id: ruff-format
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project are documented here. The format is based on
|
|
4
|
+
[Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
|
|
5
|
+
|
|
6
|
+
## Unreleased
|
|
7
|
+
|
|
8
|
+
## 0.1.0 - 2026-06-30
|
|
9
|
+
|
|
10
|
+
### Added
|
|
11
|
+
- Statistical gate: permutation and Fisher significance, bootstrap effect-size
|
|
12
|
+
interval, Benjamini-Hochberg correction, and a decision that blocks only on a
|
|
13
|
+
significant regression past a minimum effect size.
|
|
14
|
+
- Underpowered detection so cases with too few samples are flagged, not passed.
|
|
15
|
+
- Metrics: exact match, contains, numeric (configurable polarity), trajectory
|
|
16
|
+
match, and an LLM-judge metric that wraps any scoring callable.
|
|
17
|
+
- `Metric` protocol so custom metrics need only `name`, `kind`, `polarity`, and
|
|
18
|
+
`score`.
|
|
19
|
+
- Tool-trajectory model, match score, and a git-style diff renderer.
|
|
20
|
+
- Baseline JSON store with cold-start record-only and corrupt-file detection.
|
|
21
|
+
- Model versioning: baselines re-record on a model change instead of reporting it
|
|
22
|
+
as a regression.
|
|
23
|
+
- pytest plugin and `assert_no_regression` for single cases.
|
|
24
|
+
- `run_suite` with suite-level correction, plus a text report and CI exit code.
|
|
25
|
+
- Reports show the measured effect size and p-value on every case.
|
|
26
|
+
- `signaltest` CLI for inspecting baselines.
|
|
27
|
+
- Type hints across the package, checked with mypy `strict`.
|
|
28
|
+
- Offline demos (a minimal case and a tool-using agent) and an end-to-end test
|
|
29
|
+
that run with no API key.
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# Contributing
|
|
2
|
+
|
|
3
|
+
Thanks for your interest in signaltest.
|
|
4
|
+
|
|
5
|
+
## Setup
|
|
6
|
+
|
|
7
|
+
```sh
|
|
8
|
+
python -m venv .venv && . .venv/bin/activate
|
|
9
|
+
pip install -e ".[dev]"
|
|
10
|
+
pre-commit install # optional: run the linters on every commit
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
## Before opening a PR
|
|
14
|
+
|
|
15
|
+
- Add a test for anything you change.
|
|
16
|
+
- Run the checks (the same ones CI runs):
|
|
17
|
+
|
|
18
|
+
```sh
|
|
19
|
+
ruff check src tests examples
|
|
20
|
+
ruff format --check src tests examples
|
|
21
|
+
mypy
|
|
22
|
+
coverage run -m pytest && coverage report
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
- Keep changes small and focused — one idea per PR.
|
|
26
|
+
- Match the existing style: simple, direct code, few comments.
|
|
27
|
+
|
|
28
|
+
## Reporting bugs
|
|
29
|
+
|
|
30
|
+
Open an issue with a minimal reproduction: the metric, the inputs, and what you
|
|
31
|
+
expected versus what happened.
|
signaltest-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Hatim
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,277 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: signaltest
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Flake-proof regression testing for LLM agents
|
|
5
|
+
Project-URL: Homepage, https://github.com/Falcon305/signaltest
|
|
6
|
+
Author: Hatim
|
|
7
|
+
License-Expression: MIT
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Requires-Python: >=3.10
|
|
10
|
+
Requires-Dist: scipy>=1.11
|
|
11
|
+
Provides-Extra: dev
|
|
12
|
+
Requires-Dist: coverage>=7.0; extra == 'dev'
|
|
13
|
+
Requires-Dist: mypy>=1.11; extra == 'dev'
|
|
14
|
+
Requires-Dist: pre-commit>=3.0; extra == 'dev'
|
|
15
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
16
|
+
Requires-Dist: ruff>=0.6; extra == 'dev'
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
|
|
19
|
+
# signaltest
|
|
20
|
+
|
|
21
|
+
[![CI](https://github.com/Falcon305/signaltest/actions/workflows/ci.yml/badge.svg)](https://github.com/Falcon305/signaltest/actions/workflows/ci.yml)
|
|
22
|
+
|
|
23
|
+
Regression tests for LLM agents that don't fail your CI on noise.
|
|
24
|
+
|
|
25
|
+
LLMs are non-deterministic, so naive eval checks flake: a score drifts a point on
|
|
26
|
+
randomness, CI goes red, the team stops trusting it, and the check gets deleted.
|
|
27
|
+
signaltest runs each case several times and blocks a PR only when a regression is
|
|
28
|
+
**statistically real and large enough to matter** — then shows a diff of what
|
|
29
|
+
actually changed in the agent's run.
|
|
30
|
+
|
|
31
|
+
Local-first. No account, no service, no data leaves your repo.
|
|
32
|
+
|
|
33
|
+
Status: v0.1.0.
|
|
34
|
+
|
|
35
|
+
## Contents
|
|
36
|
+
|
|
37
|
+
- [Why](#why)
|
|
38
|
+
- [Install](#install)
|
|
39
|
+
- [Quick start](#quick-start)
|
|
40
|
+
- [Testing a whole suite](#testing-a-whole-suite)
|
|
41
|
+
- [Metrics](#metrics)
|
|
42
|
+
- [How it works](#how-it-works)
|
|
43
|
+
- [Configuration](#configuration)
|
|
44
|
+
- [Baselines](#baselines)
|
|
45
|
+
- [Using it in CI](#using-it-in-ci)
|
|
46
|
+
- [CLI](#cli)
|
|
47
|
+
- [Development](#development)
|
|
48
|
+
- [FAQ](#faq)
|
|
49
|
+
- [Contributing](#contributing)
|
|
50
|
+
- [License](#license)
|
|
51
|
+
|
|
52
|
+
## Why
|
|
53
|
+
|
|
54
|
+
Most eval tools score an agent once and compare against a fixed threshold. With a
|
|
55
|
+
stochastic model, that threshold flakes: the same prompt scores 0.84 one run and
|
|
56
|
+
0.81 the next. CI fails on the bad draw, people stop believing it, and the safety
|
|
57
|
+
net is gone.
|
|
58
|
+
|
|
59
|
+
signaltest treats the score as a distribution, not a number. It samples the agent
|
|
60
|
+
`n` times for the candidate, compares against `n` recorded baseline samples, and
|
|
61
|
+
only fails when the difference is **statistically significant** *and* clears a
|
|
62
|
+
**minimum effect size**. Noise stays green. Real regressions go red.
|
|
63
|
+
|
|
64
|
+
## Install
|
|
65
|
+
|
|
66
|
+
```sh
|
|
67
|
+
pip install signaltest
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
Or with [uv](https://docs.astral.sh/uv/):
|
|
71
|
+
|
|
72
|
+
```sh
|
|
73
|
+
uv pip install signaltest # into the active environment
|
|
74
|
+
uv add signaltest # into a uv-managed project
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## Quick start
|
|
78
|
+
|
|
79
|
+
Write a normal pytest test. Give signaltest a way to run your agent, the expected
|
|
80
|
+
output, and a metric.
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
from signaltest import Case, assert_no_regression, ExactMatch
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def test_math_agent():
|
|
87
|
+
case = Case(
|
|
88
|
+
case_id="math_qa",
|
|
89
|
+
run=lambda: my_agent("what is 2 + 2?"),
|
|
90
|
+
expected="4",
|
|
91
|
+
metric=ExactMatch(),
|
|
92
|
+
)
|
|
93
|
+
assert_no_regression(case, "baselines/math_agent.json", n=10)
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
The first run records a baseline (committed as JSON in your repo). Later runs
|
|
97
|
+
compare against it and fail the test only on a real regression.
|
|
98
|
+
|
|
99
|
+
## Testing a whole suite
|
|
100
|
+
|
|
101
|
+
`run_suite` runs many cases and applies a multiple-comparison correction across
|
|
102
|
+
them, so a suite of 50 cases doesn't go red just because one flaked.
|
|
103
|
+
|
|
104
|
+
```python
|
|
105
|
+
from signaltest import Case, run_suite, format_report, exit_code, ExactMatch
|
|
106
|
+
|
|
107
|
+
cases = [
|
|
108
|
+
Case("math", run=lambda: my_agent("2 + 2?"), expected="4", metric=ExactMatch()),
|
|
109
|
+
Case("geo", run=lambda: my_agent("capital of France?"), expected="Paris", metric=ExactMatch()),
|
|
110
|
+
]
|
|
111
|
+
|
|
112
|
+
results = run_suite(cases, "baselines/agent.json", n=10)
|
|
113
|
+
print(format_report(results))
|
|
114
|
+
raise SystemExit(exit_code(results))
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
`format_report` prints a per-case summary; `exit_code` returns `1` if any case
|
|
118
|
+
regressed, `0` otherwise — drop it straight into a CI step.
|
|
119
|
+
|
|
120
|
+
A failing case reports the measured effect size and p-value, so you see *how
|
|
121
|
+
big* the regression is, not just that one happened:
|
|
122
|
+
|
|
123
|
+
```
|
|
124
|
+
PASS geo: no significant regression
|
|
125
|
+
FAIL math: significant regression past the effect floor (effect=-0.180, p=0.004)
|
|
126
|
+
1 passed, 1 failed, 0 inconclusive
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
## Metrics
|
|
130
|
+
|
|
131
|
+
A metric declares its `kind` (numeric or boolean, which picks the significance
|
|
132
|
+
test) and its `polarity` (whether higher or lower is better).
|
|
133
|
+
|
|
134
|
+
| Metric | Kind | Polarity | Scores |
|
|
135
|
+
|--------|------|----------|--------|
|
|
136
|
+
| `ExactMatch()` | boolean | higher better | `output == expected` |
|
|
137
|
+
| `Contains()` | boolean | higher better | `expected in output` |
|
|
138
|
+
| `Numeric(name, polarity)` | numeric | configurable | the raw value (latency, cost, judge score) |
|
|
139
|
+
| `TrajectoryMatch(ignore_keys=...)` | numeric | higher better | fraction of matching agent tool-calls |
|
|
140
|
+
|
|
141
|
+
`Numeric` with `polarity="lower_better"` is how you gate latency or cost — a real
|
|
142
|
+
*increase* becomes the regression.
|
|
143
|
+
|
|
144
|
+
```python
|
|
145
|
+
from signaltest import Numeric
|
|
146
|
+
from signaltest.metrics.base import LOWER_BETTER
|
|
147
|
+
|
|
148
|
+
latency = Numeric(name="latency_ms", polarity=LOWER_BETTER)
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
`TrajectoryMatch` compares the agent's tool-call path against a reference path and
|
|
152
|
+
ignores volatile keys (timestamps, ids):
|
|
153
|
+
|
|
154
|
+
```python
|
|
155
|
+
from signaltest import TrajectoryMatch, Step
|
|
156
|
+
|
|
157
|
+
expected_path = [Step("search", {"q": "weather"}), Step("answer", {})]
|
|
158
|
+
metric = TrajectoryMatch(ignore_keys=("request_id",))
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
## How it works
|
|
162
|
+
|
|
163
|
+
```
|
|
164
|
+
candidate runs n times ─┐
|
|
165
|
+
├─> significance test ─┐
|
|
166
|
+
stored baseline samples ┘ ├─> block only if
|
|
167
|
+
│ significant AND
|
|
168
|
+
effect size ─────────┘ past the floor
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
- **Significance** — a permutation test for numeric metrics, Fisher's exact test
|
|
172
|
+
for boolean metrics. Both are seeded, so the same inputs always give the same
|
|
173
|
+
result. The gate that kills flakiness is not itself flaky.
|
|
174
|
+
- **Effect floor** — a regression must also clear a minimum effect size, so a
|
|
175
|
+
statistically significant but meaningless 0.1% drift never blocks the build.
|
|
176
|
+
- **Multiple comparisons** — across a suite, p-values are adjusted with the
|
|
177
|
+
Benjamini-Hochberg procedure, so flakiness doesn't reappear at the suite level.
|
|
178
|
+
- **Power** — cases with too few samples to detect a real change are flagged
|
|
179
|
+
`inconclusive`, never passed silently.
|
|
180
|
+
- **Model versioning** — a baseline records the model it was captured under. If you
|
|
181
|
+
pass a new `model=` and it differs, the baseline is re-recorded instead of
|
|
182
|
+
reported as a regression, so a provider model swap can't masquerade as one.
|
|
183
|
+
|
|
184
|
+
## Configuration
|
|
185
|
+
|
|
186
|
+
Every `assert_no_regression` / `check_case` / `run_suite` call accepts:
|
|
187
|
+
|
|
188
|
+
| Argument | Default | Meaning |
|
|
189
|
+
|----------|---------|---------|
|
|
190
|
+
| `n` | `10` | samples per run (boolean metrics usually want more) |
|
|
191
|
+
| `alpha` | `0.05` | significance threshold |
|
|
192
|
+
| `min_effect` | `0.03` numeric / `0.10` boolean | minimum effect size to count |
|
|
193
|
+
| `min_valid` | `2` | fewer valid samples than this → `inconclusive` |
|
|
194
|
+
| `model` | `None` | model id recorded with the baseline |
|
|
195
|
+
|
|
196
|
+
## Baselines
|
|
197
|
+
|
|
198
|
+
A baseline is a JSON file committed to your repo. Each entry is keyed by
|
|
199
|
+
`case_id::metric_name` and stores the recorded scores and the model.
|
|
200
|
+
|
|
201
|
+
- **Cold start** — the first run records the baseline and passes.
|
|
202
|
+
- **Updating** — to accept a new baseline on purpose, delete the case's entry and
|
|
203
|
+
re-run, or edit the JSON. The change is a reviewable diff in the same PR.
|
|
204
|
+
- **Inspecting** — use the CLI (below).
|
|
205
|
+
|
|
206
|
+
## Using it in CI
|
|
207
|
+
|
|
208
|
+
Because cases are plain pytest tests, your existing `pytest` step gates them:
|
|
209
|
+
|
|
210
|
+
```yaml
|
|
211
|
+
- run: pip install -e ".[dev]"
|
|
212
|
+
- run: pytest
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
A failed case fails the build. Baselines live in the repo, so CI needs no secrets
|
|
216
|
+
and nothing leaves your infrastructure.
|
|
217
|
+
|
|
218
|
+
## CLI
|
|
219
|
+
|
|
220
|
+
```sh
|
|
221
|
+
signaltest version
|
|
222
|
+
signaltest baselines baselines/agent.json # list recorded cases
|
|
223
|
+
signaltest show baselines/agent.json math::exact_match
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
## Development
|
|
227
|
+
|
|
228
|
+
```sh
|
|
229
|
+
git clone https://github.com/Falcon305/signaltest
|
|
230
|
+
cd signaltest
|
|
231
|
+
python -m venv .venv && . .venv/bin/activate
|
|
232
|
+
pip install -e ".[dev]"
|
|
233
|
+
pytest
|
|
234
|
+
ruff check src tests examples
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
With uv the setup is a single command (it creates the environment for you):
|
|
238
|
+
|
|
239
|
+
```sh
|
|
240
|
+
uv sync --extra dev
|
|
241
|
+
uv run pytest
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
Try the offline examples (cached responses, no API key):
|
|
245
|
+
|
|
246
|
+
```sh
|
|
247
|
+
python examples/demo.py # smallest possible case
|
|
248
|
+
python examples/tool_agent.py # tool-using agent: trajectory + answer checks
|
|
249
|
+
```
|
|
250
|
+
|
|
251
|
+
See [docs/architecture.md](docs/architecture.md) for how the pieces fit together
|
|
252
|
+
and how to add your own metric.
|
|
253
|
+
|
|
254
|
+
## FAQ
|
|
255
|
+
|
|
256
|
+
**Does it call my LLM?** Only through the `run` function you provide. signaltest
|
|
257
|
+
never talks to a provider itself.
|
|
258
|
+
|
|
259
|
+
**How many samples do I need?** `n=10` is a sane default for numeric metrics.
|
|
260
|
+
Boolean metrics resolve in coarser steps, so they need more — bump `n` and watch
|
|
261
|
+
for `inconclusive`, which means the test can't yet detect a change of the size you
|
|
262
|
+
care about.
|
|
263
|
+
|
|
264
|
+
**Why did a case come back `inconclusive`?** Too few valid samples to be
|
|
265
|
+
trustworthy. Increase `n`, or fix whatever made runs error out.
|
|
266
|
+
|
|
267
|
+
**Does my data leave my machine?** No. Baselines are local JSON; there is no
|
|
268
|
+
service.
|
|
269
|
+
|
|
270
|
+
## Contributing
|
|
271
|
+
|
|
272
|
+
Issues and pull requests are welcome. Keep changes small and focused, and add a
|
|
273
|
+
test for anything you change. See `CONTRIBUTING.md`.
|
|
274
|
+
|
|
275
|
+
## License
|
|
276
|
+
|
|
277
|
+
MIT
|
|
@@ -0,0 +1,259 @@
|
|
|
1
|
+
# signaltest
|
|
2
|
+
|
|
3
|
+
[![CI](https://github.com/Falcon305/signaltest/actions/workflows/ci.yml/badge.svg)](https://github.com/Falcon305/signaltest/actions/workflows/ci.yml)
|
|
4
|
+
|
|
5
|
+
Regression tests for LLM agents that don't fail your CI on noise.
|
|
6
|
+
|
|
7
|
+
LLMs are non-deterministic, so naive eval checks flake: a score drifts a point on
|
|
8
|
+
randomness, CI goes red, the team stops trusting it, and the check gets deleted.
|
|
9
|
+
signaltest runs each case several times and blocks a PR only when a regression is
|
|
10
|
+
**statistically real and large enough to matter** — then shows a diff of what
|
|
11
|
+
actually changed in the agent's run.
|
|
12
|
+
|
|
13
|
+
Local-first. No account, no service, no data leaves your repo.
|
|
14
|
+
|
|
15
|
+
Status: v0.1.0.
|
|
16
|
+
|
|
17
|
+
## Contents
|
|
18
|
+
|
|
19
|
+
- [Why](#why)
|
|
20
|
+
- [Install](#install)
|
|
21
|
+
- [Quick start](#quick-start)
|
|
22
|
+
- [Testing a whole suite](#testing-a-whole-suite)
|
|
23
|
+
- [Metrics](#metrics)
|
|
24
|
+
- [How it works](#how-it-works)
|
|
25
|
+
- [Configuration](#configuration)
|
|
26
|
+
- [Baselines](#baselines)
|
|
27
|
+
- [Using it in CI](#using-it-in-ci)
|
|
28
|
+
- [CLI](#cli)
|
|
29
|
+
- [Development](#development)
|
|
30
|
+
- [FAQ](#faq)
|
|
31
|
+
- [Contributing](#contributing)
|
|
32
|
+
- [License](#license)
|
|
33
|
+
|
|
34
|
+
## Why
|
|
35
|
+
|
|
36
|
+
Most eval tools score an agent once and compare against a fixed threshold. With a
|
|
37
|
+
stochastic model, that threshold flakes: the same prompt scores 0.84 one run and
|
|
38
|
+
0.81 the next. CI fails on the bad draw, people stop believing it, and the safety
|
|
39
|
+
net is gone.
|
|
40
|
+
|
|
41
|
+
signaltest treats the score as a distribution, not a number. It samples the agent
|
|
42
|
+
`n` times for the candidate, compares against `n` recorded baseline samples, and
|
|
43
|
+
only fails when the difference is **statistically significant** *and* clears a
|
|
44
|
+
**minimum effect size**. Noise stays green. Real regressions go red.
|
|
45
|
+
|
|
46
|
+
## Install
|
|
47
|
+
|
|
48
|
+
```sh
|
|
49
|
+
pip install signaltest
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
Or with [uv](https://docs.astral.sh/uv/):
|
|
53
|
+
|
|
54
|
+
```sh
|
|
55
|
+
uv pip install signaltest # into the active environment
|
|
56
|
+
uv add signaltest # into a uv-managed project
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
## Quick start
|
|
60
|
+
|
|
61
|
+
Write a normal pytest test. Give signaltest a way to run your agent, the expected
|
|
62
|
+
output, and a metric.
|
|
63
|
+
|
|
64
|
+
```python
|
|
65
|
+
from signaltest import Case, assert_no_regression, ExactMatch
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def test_math_agent():
|
|
69
|
+
case = Case(
|
|
70
|
+
case_id="math_qa",
|
|
71
|
+
run=lambda: my_agent("what is 2 + 2?"),
|
|
72
|
+
expected="4",
|
|
73
|
+
metric=ExactMatch(),
|
|
74
|
+
)
|
|
75
|
+
assert_no_regression(case, "baselines/math_agent.json", n=10)
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
The first run records a baseline (committed as JSON in your repo). Later runs
|
|
79
|
+
compare against it and fail the test only on a real regression.
|
|
80
|
+
|
|
81
|
+
## Testing a whole suite
|
|
82
|
+
|
|
83
|
+
`run_suite` runs many cases and applies a multiple-comparison correction across
|
|
84
|
+
them, so a suite of 50 cases doesn't go red just because one flaked.
|
|
85
|
+
|
|
86
|
+
```python
|
|
87
|
+
from signaltest import Case, run_suite, format_report, exit_code, ExactMatch
|
|
88
|
+
|
|
89
|
+
cases = [
|
|
90
|
+
Case("math", run=lambda: my_agent("2 + 2?"), expected="4", metric=ExactMatch()),
|
|
91
|
+
Case("geo", run=lambda: my_agent("capital of France?"), expected="Paris", metric=ExactMatch()),
|
|
92
|
+
]
|
|
93
|
+
|
|
94
|
+
results = run_suite(cases, "baselines/agent.json", n=10)
|
|
95
|
+
print(format_report(results))
|
|
96
|
+
raise SystemExit(exit_code(results))
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
`format_report` prints a per-case summary; `exit_code` returns `1` if any case
|
|
100
|
+
regressed, `0` otherwise — drop it straight into a CI step.
|
|
101
|
+
|
|
102
|
+
A failing case reports the measured effect size and p-value, so you see *how
|
|
103
|
+
big* the regression is, not just that one happened:
|
|
104
|
+
|
|
105
|
+
```
|
|
106
|
+
PASS geo: no significant regression
|
|
107
|
+
FAIL math: significant regression past the effect floor (effect=-0.180, p=0.004)
|
|
108
|
+
1 passed, 1 failed, 0 inconclusive
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
## Metrics
|
|
112
|
+
|
|
113
|
+
A metric declares its `kind` (numeric or boolean, which picks the significance
|
|
114
|
+
test) and its `polarity` (whether higher or lower is better).
|
|
115
|
+
|
|
116
|
+
| Metric | Kind | Polarity | Scores |
|
|
117
|
+
|--------|------|----------|--------|
|
|
118
|
+
| `ExactMatch()` | boolean | higher better | `output == expected` |
|
|
119
|
+
| `Contains()` | boolean | higher better | `expected in output` |
|
|
120
|
+
| `Numeric(name, polarity)` | numeric | configurable | the raw value (latency, cost, judge score) |
|
|
121
|
+
| `TrajectoryMatch(ignore_keys=...)` | numeric | higher better | fraction of matching agent tool-calls |
|
|
122
|
+
|
|
123
|
+
`Numeric` with `polarity="lower_better"` is how you gate latency or cost — a real
|
|
124
|
+
*increase* becomes the regression.
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
from signaltest import Numeric
|
|
128
|
+
from signaltest.metrics.base import LOWER_BETTER
|
|
129
|
+
|
|
130
|
+
latency = Numeric(name="latency_ms", polarity=LOWER_BETTER)
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
`TrajectoryMatch` compares the agent's tool-call path against a reference path and
|
|
134
|
+
ignores volatile keys (timestamps, ids):
|
|
135
|
+
|
|
136
|
+
```python
|
|
137
|
+
from signaltest import TrajectoryMatch, Step
|
|
138
|
+
|
|
139
|
+
expected_path = [Step("search", {"q": "weather"}), Step("answer", {})]
|
|
140
|
+
metric = TrajectoryMatch(ignore_keys=("request_id",))
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
## How it works
|
|
144
|
+
|
|
145
|
+
```
|
|
146
|
+
candidate runs n times ─┐
|
|
147
|
+
├─> significance test ─┐
|
|
148
|
+
stored baseline samples ┘ ├─> block only if
|
|
149
|
+
│ significant AND
|
|
150
|
+
effect size ─────────┘ past the floor
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
- **Significance** — a permutation test for numeric metrics, Fisher's exact test
|
|
154
|
+
for boolean metrics. Both are seeded, so the same inputs always give the same
|
|
155
|
+
result. The gate that kills flakiness is not itself flaky.
|
|
156
|
+
- **Effect floor** — a regression must also clear a minimum effect size, so a
|
|
157
|
+
statistically significant but meaningless 0.1% drift never blocks the build.
|
|
158
|
+
- **Multiple comparisons** — across a suite, p-values are adjusted with the
|
|
159
|
+
Benjamini-Hochberg procedure, so flakiness doesn't reappear at the suite level.
|
|
160
|
+
- **Power** — cases with too few samples to detect a real change are flagged
|
|
161
|
+
`inconclusive`, never passed silently.
|
|
162
|
+
- **Model versioning** — a baseline records the model it was captured under. If you
|
|
163
|
+
pass a new `model=` and it differs, the baseline is re-recorded instead of
|
|
164
|
+
reported as a regression, so a provider model swap can't masquerade as one.
|
|
165
|
+
|
|
166
|
+
## Configuration
|
|
167
|
+
|
|
168
|
+
Every `assert_no_regression` / `check_case` / `run_suite` call accepts:
|
|
169
|
+
|
|
170
|
+
| Argument | Default | Meaning |
|
|
171
|
+
|----------|---------|---------|
|
|
172
|
+
| `n` | `10` | samples per run (boolean metrics usually want more) |
|
|
173
|
+
| `alpha` | `0.05` | significance threshold |
|
|
174
|
+
| `min_effect` | `0.03` numeric / `0.10` boolean | minimum effect size to count |
|
|
175
|
+
| `min_valid` | `2` | fewer valid samples than this → `inconclusive` |
|
|
176
|
+
| `model` | `None` | model id recorded with the baseline |
|
|
177
|
+
|
|
178
|
+
## Baselines
|
|
179
|
+
|
|
180
|
+
A baseline is a JSON file committed to your repo. Each entry is keyed by
|
|
181
|
+
`case_id::metric_name` and stores the recorded scores and the model.
|
|
182
|
+
|
|
183
|
+
- **Cold start** — the first run records the baseline and passes.
|
|
184
|
+
- **Updating** — to accept a new baseline on purpose, delete the case's entry and
|
|
185
|
+
re-run, or edit the JSON. The change is a reviewable diff in the same PR.
|
|
186
|
+
- **Inspecting** — use the CLI (below).
|
|
187
|
+
|
|
188
|
+
## Using it in CI
|
|
189
|
+
|
|
190
|
+
Because cases are plain pytest tests, your existing `pytest` step gates them:
|
|
191
|
+
|
|
192
|
+
```yaml
|
|
193
|
+
- run: pip install -e ".[dev]"
|
|
194
|
+
- run: pytest
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
A failed case fails the build. Baselines live in the repo, so CI needs no secrets
|
|
198
|
+
and nothing leaves your infrastructure.
|
|
199
|
+
|
|
200
|
+
## CLI
|
|
201
|
+
|
|
202
|
+
```sh
|
|
203
|
+
signaltest version
|
|
204
|
+
signaltest baselines baselines/agent.json # list recorded cases
|
|
205
|
+
signaltest show baselines/agent.json math::exact_match
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
## Development
|
|
209
|
+
|
|
210
|
+
```sh
|
|
211
|
+
git clone https://github.com/Falcon305/signaltest
|
|
212
|
+
cd signaltest
|
|
213
|
+
python -m venv .venv && . .venv/bin/activate
|
|
214
|
+
pip install -e ".[dev]"
|
|
215
|
+
pytest
|
|
216
|
+
ruff check src tests examples
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
With uv the setup is a single command (it creates the environment for you):
|
|
220
|
+
|
|
221
|
+
```sh
|
|
222
|
+
uv sync --extra dev
|
|
223
|
+
uv run pytest
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
Try the offline examples (cached responses, no API key):
|
|
227
|
+
|
|
228
|
+
```sh
|
|
229
|
+
python examples/demo.py # smallest possible case
|
|
230
|
+
python examples/tool_agent.py # tool-using agent: trajectory + answer checks
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
See [docs/architecture.md](docs/architecture.md) for how the pieces fit together
|
|
234
|
+
and how to add your own metric.
|
|
235
|
+
|
|
236
|
+
## FAQ
|
|
237
|
+
|
|
238
|
+
**Does it call my LLM?** Only through the `run` function you provide. signaltest
|
|
239
|
+
never talks to a provider itself.
|
|
240
|
+
|
|
241
|
+
**How many samples do I need?** `n=10` is a sane default for numeric metrics.
|
|
242
|
+
Boolean metrics resolve in coarser steps, so they need more — bump `n` and watch
|
|
243
|
+
for `inconclusive`, which means the test can't yet detect a change of the size you
|
|
244
|
+
care about.
|
|
245
|
+
|
|
246
|
+
**Why did a case come back `inconclusive`?** Too few valid samples to be
|
|
247
|
+
trustworthy. Increase `n`, or fix whatever made runs error out.
|
|
248
|
+
|
|
249
|
+
**Does my data leave my machine?** No. Baselines are local JSON; there is no
|
|
250
|
+
service.
|
|
251
|
+
|
|
252
|
+
## Contributing
|
|
253
|
+
|
|
254
|
+
Issues and pull requests are welcome. Keep changes small and focused, and add a
|
|
255
|
+
test for anything you change. See `CONTRIBUTING.md`.
|
|
256
|
+
|
|
257
|
+
## License
|
|
258
|
+
|
|
259
|
+
MIT
|