pytest-flakehunter 1.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pytest_flakehunter-1.0.1/PKG-INFO +238 -0
- pytest_flakehunter-1.0.1/README.md +213 -0
- pytest_flakehunter-1.0.1/pyproject.toml +36 -0
- pytest_flakehunter-1.0.1/src/pytest_flakehunter/__init__.py +1 -0
- pytest_flakehunter-1.0.1/src/pytest_flakehunter/ai_analysis.py +134 -0
- pytest_flakehunter-1.0.1/src/pytest_flakehunter/history.py +419 -0
- pytest_flakehunter-1.0.1/src/pytest_flakehunter/plugin.py +307 -0
- pytest_flakehunter-1.0.1/src/pytest_flakehunter/reporter.py +1216 -0
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pytest-flakehunter
|
|
3
|
+
Version: 1.0.1
|
|
4
|
+
Summary: Re-run tests N times, visualize failure heatmaps, and get AI root cause hypotheses
|
|
5
|
+
Keywords: pytest,flaky,testing,qa,heatmap,ai
|
|
6
|
+
Author: jkasser
|
|
7
|
+
License-Expression: MIT
|
|
8
|
+
Classifier: Framework :: Pytest
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Topic :: Software Development :: Testing
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Requires-Dist: pytest>=7.0
|
|
17
|
+
Requires-Dist: pytest-xdist ; extra == 'xdist'
|
|
18
|
+
Requires-Python: >=3.9
|
|
19
|
+
Project-URL: Homepage, https://github.com/jkasser/pytest-flakehunter
|
|
20
|
+
Project-URL: Repository, https://github.com/jkasser/pytest-flakehunter
|
|
21
|
+
Project-URL: Bug Tracker, https://github.com/jkasser/pytest-flakehunter/issues
|
|
22
|
+
Provides-Extra: ai
|
|
23
|
+
Provides-Extra: xdist
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
|
|
26
|
+
# pytest-flakehunter 🎯
|
|
27
|
+
|
|
28
|
+
**Re-run tests N times, visualize failure heatmaps, and get AI-powered root cause hypotheses.**
|
|
29
|
+
|
|
30
|
+
Flaky tests are expensive. They erode trust in CI, waste engineer time, and mask real bugs.
|
|
31
|
+
`pytest-flakehunter` helps you stop guessing and start diagnosing.
|
|
32
|
+
|
|
33
|
+
---
|
|
34
|
+
|
|
35
|
+
## What it does
|
|
36
|
+
|
|
37
|
+
Run any test suite with `--fh` and get a self-contained HTML report with:
|
|
38
|
+
|
|
39
|
+
- **Flake rate badges** per test — instantly see your worst offenders
|
|
40
|
+
- **Duration scatter plots** — spot timeout-related flakes vs. race conditions at a glance
|
|
41
|
+
- **Failure heatmaps** — shows *exactly* which line of code fails, and in which run attempts
|
|
42
|
+
- **Stack trace clustering** — groups failures by fingerprint so you know if it's one bug or three
|
|
43
|
+
- **AI root cause hypotheses** — Claude analyzes the clusters and tells you *why* it's probably flaking
|
|
44
|
+
|
|
45
|
+

|
|
46
|
+
|
|
47
|
+
---
|
|
48
|
+
|
|
49
|
+
## Installation
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
pip install pytest-flakehunter
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## Usage
|
|
56
|
+
|
|
57
|
+
If you use `uv`, pass your `.env` file explicitly:
|
|
58
|
+
```bash
|
|
59
|
+
uv run --env-file=.env pytest tests/ --fh --fh-ai
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
# Basic: run each test 10 times, generate report
|
|
64
|
+
pytest tests/ --fh
|
|
65
|
+
|
|
66
|
+
# Custom run count
|
|
67
|
+
pytest tests/ --fh --fh-runs=20
|
|
68
|
+
|
|
69
|
+
# With AI analysis (requires ANTHROPIC_API_KEY env var)
|
|
70
|
+
pytest tests/ --fh --fh-runs=15 --fh-ai
|
|
71
|
+
|
|
72
|
+
# Custom report path
|
|
73
|
+
pytest tests/ --fh --fh-report=reports/flake_$(date +%Y%m%d).html
|
|
74
|
+
|
|
75
|
+
# Full fixture isolation between attempts (slower but catches session-state flakes)
|
|
76
|
+
pytest tests/ --fh --fh-isolate
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
## CLI options
|
|
80
|
+
|
|
81
|
+
| Option | Default | Description |
|
|
82
|
+
|--------|---------|-------------|
|
|
83
|
+
| `--fh` | off | Enable flake hunter mode |
|
|
84
|
+
| `--fh-runs N` | 10 | Re-run count per test |
|
|
85
|
+
| `--fh-report PATH` | `flakehunter_report.html` | Output report path |
|
|
86
|
+
| `--fh-ai` | off | AI root cause analysis via Claude |
|
|
87
|
+
| `--fh-history-dir PATH` | `.flakehunter/history` | Directory for persistent run history |
|
|
88
|
+
| `--fh-no-history` | off | Skip writing to history this run |
|
|
89
|
+
| `--fh-isolate` | off | Full fixture teardown between every attempt (see below) |
|
|
90
|
+
|
|
91
|
+
---
|
|
92
|
+
|
|
93
|
+
## Reading the report
|
|
94
|
+
|
|
95
|
+
### Duration scatter
|
|
96
|
+
Each dot is one run attempt. **Red = failed, green = passed.**
|
|
97
|
+
- Failures clustering at a consistent high duration → likely a **timeout**
|
|
98
|
+
- Failures scattered at random durations → likely a **race condition or state issue**
|
|
99
|
+
- Single outlier failure at low duration → likely **environment or setup flake**
|
|
100
|
+
|
|
101
|
+
### Failure commonality heatmap
|
|
102
|
+
A multi-dimensional view of what your failures have in common.
|
|
103
|
+
|
|
104
|
+
**Current run** — columns are failed attempts, rows are:
|
|
105
|
+
- **Duration** — was this attempt slow, medium, or fast relative to the run? Helps spot timeout-related flakes.
|
|
106
|
+
- **Traceback** — which frames appeared in the stack trace, and in what % of failures.
|
|
107
|
+
|
|
108
|
+
**Historical** (when history is enabled) — columns are past runs, rows are:
|
|
109
|
+
- **Environment** — which host had failures, and how often.
|
|
110
|
+
- **Branch** — which git branch failures occurred on.
|
|
111
|
+
- **Date** — which week failures clustered in.
|
|
112
|
+
|
|
113
|
+
Cell brightness = how common that value is across failures. Bright = strong pattern.
|
|
114
|
+
|
|
115
|
+
### Stack trace clusters
|
|
116
|
+
Failures are grouped by a fingerprint of their innermost stack frames.
|
|
117
|
+
Multiple clusters = multiple distinct failure modes (e.g., two different bugs).
|
|
118
|
+
One cluster = one root cause, just intermittently triggered.
|
|
119
|
+
|
|
120
|
+
---
|
|
121
|
+
|
|
122
|
+
## Historical tracking
|
|
123
|
+
|
|
124
|
+
By default, flakehunter appends each run's results to per-test CSV files in `.flakehunter/history/`. These accumulate across runs, giving the report progressively richer data:
|
|
125
|
+
|
|
126
|
+
- **Flake rate trend** — see if a test is getting more or less flaky over time
|
|
127
|
+
- **Environment breakdown** — which CI host/branch has the most failures
|
|
128
|
+
- **Argument correlation** — for parametrized tests, which parameter combos fail most
|
|
129
|
+
|
|
130
|
+
To opt out for a single run:
|
|
131
|
+
```bash
|
|
132
|
+
pytest tests/ --fh --fh-no-history
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
To store history in a non-default location (e.g., shared network path):
|
|
136
|
+
```bash
|
|
137
|
+
pytest tests/ --fh --fh-history-dir=.cache/flakehunter
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
The CSV files are safe to commit — they're small, append-only, and give you a free audit trail.
|
|
141
|
+
|
|
142
|
+
---
|
|
143
|
+
|
|
144
|
+
## Fixture isolation
|
|
145
|
+
|
|
146
|
+
By default, flakehunter preserves session- and module-scoped fixtures across re-runs. This means a Playwright browser instance, database connection pool, or asyncio event loop is only created once per test — making repeated runs fast.
|
|
147
|
+
|
|
148
|
+
**When to use `--fh-isolate`:**
|
|
149
|
+
|
|
150
|
+
```bash
|
|
151
|
+
pytest tests/ --fh --fh-isolate
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
This passes a full teardown signal after every attempt, so every re-run starts with a completely clean fixture state. It is slower but will surface a class of flakiness that the default mode can miss — things like:
|
|
155
|
+
|
|
156
|
+
- Connection pools that degrade after a certain number of operations
|
|
157
|
+
- Session-scoped caches or queues that accumulate state
|
|
158
|
+
- Event loop pollution from uncancelled async tasks
|
|
159
|
+
|
|
160
|
+
If your tests are fast and stateless, the default is fine. If you're seeing flakiness that only appears after many runs in a long session, try `--fh-isolate`.
|
|
161
|
+
|
|
162
|
+
---
|
|
163
|
+
|
|
164
|
+
## AI analysis
|
|
165
|
+
|
|
166
|
+
Enable with `--fh-ai`. Requires an `ANTHROPIC_API_KEY` — set it using whichever method fits your workflow:
|
|
167
|
+
|
|
168
|
+
**Shell / CI environment variable** (recommended for CI):
|
|
169
|
+
```bash
|
|
170
|
+
# Linux / macOS
|
|
171
|
+
export ANTHROPIC_API_KEY=sk-ant-...
|
|
172
|
+
pytest tests/ --fh --fh-ai
|
|
173
|
+
|
|
174
|
+
# Windows CMD
|
|
175
|
+
set ANTHROPIC_API_KEY=sk-ant-...
|
|
176
|
+
pytest tests/ --fh --fh-ai
|
|
177
|
+
|
|
178
|
+
# Windows PowerShell
|
|
179
|
+
$env:ANTHROPIC_API_KEY = "sk-ant-..."
|
|
180
|
+
pytest tests/ --fh --fh-ai
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
**`.env` file** (recommended for local development — never commit this file):
|
|
184
|
+
```
|
|
185
|
+
# .env
|
|
186
|
+
ANTHROPIC_API_KEY=sk-ant-...
|
|
187
|
+
```
|
|
188
|
+
If you use `uv`, pass the file explicitly:
|
|
189
|
+
```bash
|
|
190
|
+
uv run --env-file=.env pytest --fh --fh-ai
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
> **Note:** Do not put your API key in `pytest.ini` or `pyproject.toml` — those files are typically committed to version control.
|
|
194
|
+
|
|
195
|
+
```
|
|
196
|
+
⚡ AI Root Cause Hypothesis
|
|
197
|
+
|
|
198
|
+
This test fails most often in setup_db_connection() after ~2.3s, which is
|
|
199
|
+
consistent with a connection pool exhaustion pattern under parallel test
|
|
200
|
+
execution. Consider increasing the test pool size or adding explicit
|
|
201
|
+
teardown that returns connections to the pool between retry attempts.
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
The AI receives the clustered stack traces, failure rates, and timing data —
|
|
205
|
+
not raw code — so it works on any codebase without needing source access.
|
|
206
|
+
|
|
207
|
+
---
|
|
208
|
+
|
|
209
|
+
## Architecture
|
|
210
|
+
|
|
211
|
+
```
|
|
212
|
+
pytest_flakehunter/
|
|
213
|
+
├── plugin.py # pytest hooks: --fh option, re-run protocol, xdist support
|
|
214
|
+
├── reporter.py # Self-contained HTML report with inline SVG charts
|
|
215
|
+
├── history.py # Append-only CSV history per test, cross-run trend analysis
|
|
216
|
+
└── ai_analysis.py # Anthropic API integration for root cause hypotheses
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
**No external dependencies** beyond pytest itself. The HTML report is fully
|
|
220
|
+
self-contained (no CDN calls, no JS frameworks) — safe to commit to repos or
|
|
221
|
+
attach to Jira tickets.
|
|
222
|
+
|
|
223
|
+
---
|
|
224
|
+
|
|
225
|
+
## Contributing
|
|
226
|
+
|
|
227
|
+
```bash
|
|
228
|
+
git clone https://github.com/jkasser/pytest-flakehunter
|
|
229
|
+
cd pytest-flakehunter
|
|
230
|
+
pip install -e ".[dev]"
|
|
231
|
+
pytest demo/sample_tests.py --fh --fh-runs=15
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
---
|
|
235
|
+
|
|
236
|
+
## License
|
|
237
|
+
|
|
238
|
+
MIT
|
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
# pytest-flakehunter 🎯
|
|
2
|
+
|
|
3
|
+
**Re-run tests N times, visualize failure heatmaps, and get AI-powered root cause hypotheses.**
|
|
4
|
+
|
|
5
|
+
Flaky tests are expensive. They erode trust in CI, waste engineer time, and mask real bugs.
|
|
6
|
+
`pytest-flakehunter` helps you stop guessing and start diagnosing.
|
|
7
|
+
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
## What it does
|
|
11
|
+
|
|
12
|
+
Run any test suite with `--fh` and get a self-contained HTML report with:
|
|
13
|
+
|
|
14
|
+
- **Flake rate badges** per test — instantly see your worst offenders
|
|
15
|
+
- **Duration scatter plots** — spot timeout-related flakes vs. race conditions at a glance
|
|
16
|
+
- **Failure heatmaps** — shows *exactly* which line of code fails, and in which run attempts
|
|
17
|
+
- **Stack trace clustering** — groups failures by fingerprint so you know if it's one bug or three
|
|
18
|
+
- **AI root cause hypotheses** — Claude analyzes the clusters and tells you *why* it's probably flaking
|
|
19
|
+
|
|
20
|
+

|
|
21
|
+
|
|
22
|
+
---
|
|
23
|
+
|
|
24
|
+
## Installation
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
pip install pytest-flakehunter
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Usage
|
|
31
|
+
|
|
32
|
+
If you use `uv` with AI analysis, pass your `.env` file (which holds your `ANTHROPIC_API_KEY` — see the AI analysis section below) explicitly:
|
|
33
|
+
```bash
|
|
34
|
+
uv run --env-file=.env pytest tests/ --fh --fh-ai
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
# Basic: run each test 10 times, generate report
|
|
39
|
+
pytest tests/ --fh
|
|
40
|
+
|
|
41
|
+
# Custom run count
|
|
42
|
+
pytest tests/ --fh --fh-runs=20
|
|
43
|
+
|
|
44
|
+
# With AI analysis (requires ANTHROPIC_API_KEY env var)
|
|
45
|
+
pytest tests/ --fh --fh-runs=15 --fh-ai
|
|
46
|
+
|
|
47
|
+
# Custom report path
|
|
48
|
+
pytest tests/ --fh --fh-report=reports/flake_$(date +%Y%m%d).html
|
|
49
|
+
|
|
50
|
+
# Full fixture isolation between attempts (slower but catches session-state flakes)
|
|
51
|
+
pytest tests/ --fh --fh-isolate
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## CLI options
|
|
55
|
+
|
|
56
|
+
| Option | Default | Description |
|
|
57
|
+
|--------|---------|-------------|
|
|
58
|
+
| `--fh` | off | Enable flake hunter mode |
|
|
59
|
+
| `--fh-runs N` | 10 | Re-run count per test |
|
|
60
|
+
| `--fh-report PATH` | `flakehunter_report.html` | Output report path |
|
|
61
|
+
| `--fh-ai` | off | AI root cause analysis via Claude |
|
|
62
|
+
| `--fh-history-dir PATH` | `.flakehunter/history` | Directory for persistent run history |
|
|
63
|
+
| `--fh-no-history` | off | Skip writing to history this run |
|
|
64
|
+
| `--fh-isolate` | off | Full fixture teardown between every attempt (see below) |
|
|
65
|
+
|
|
66
|
+
---
|
|
67
|
+
|
|
68
|
+
## Reading the report
|
|
69
|
+
|
|
70
|
+
### Duration scatter
|
|
71
|
+
Each dot is one run attempt. **Red = failed, green = passed.**
|
|
72
|
+
- Failures clustering at a consistent high duration → likely a **timeout**
|
|
73
|
+
- Failures scattered at random durations → likely a **race condition or state issue**
|
|
74
|
+
- Single outlier failure at low duration → likely **environment or setup flake**
|
|
75
|
+
|
|
76
|
+
### Failure commonality heatmap
|
|
77
|
+
A multi-dimensional view of what your failures have in common.
|
|
78
|
+
|
|
79
|
+
**Current run** — columns are failed attempts, rows are:
|
|
80
|
+
- **Duration** — was this attempt slow, medium, or fast relative to the run? Helps spot timeout-related flakes.
|
|
81
|
+
- **Traceback** — which frames appeared in the stack trace, and in what % of failures.
|
|
82
|
+
|
|
83
|
+
**Historical** (when history is enabled) — columns are past runs, rows are:
|
|
84
|
+
- **Environment** — which host had failures, and how often.
|
|
85
|
+
- **Branch** — which git branch failures occurred on.
|
|
86
|
+
- **Date** — which week failures clustered in.
|
|
87
|
+
|
|
88
|
+
Cell brightness = how common that value is across failures. Bright = strong pattern.
|
|
89
|
+
|
|
90
|
+
### Stack trace clusters
|
|
91
|
+
Failures are grouped by a fingerprint of their innermost stack frames.
|
|
92
|
+
Multiple clusters = multiple distinct failure modes (e.g., two different bugs).
|
|
93
|
+
One cluster = one root cause, just intermittently triggered.
|
|
94
|
+
|
|
95
|
+
---
|
|
96
|
+
|
|
97
|
+
## Historical tracking
|
|
98
|
+
|
|
99
|
+
By default, flakehunter appends each run's results to per-test CSV files in `.flakehunter/history/`. These accumulate across runs, giving the report progressively richer data:
|
|
100
|
+
|
|
101
|
+
- **Flake rate trend** — see if a test is getting more or less flaky over time
|
|
102
|
+
- **Environment breakdown** — which CI host/branch has the most failures
|
|
103
|
+
- **Argument correlation** — for parametrized tests, which parameter combos fail most
|
|
104
|
+
|
|
105
|
+
To opt out for a single run:
|
|
106
|
+
```bash
|
|
107
|
+
pytest tests/ --fh --fh-no-history
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
To store history in a non-default location (e.g., shared network path):
|
|
111
|
+
```bash
|
|
112
|
+
pytest tests/ --fh --fh-history-dir=.cache/flakehunter
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
The CSV files are safe to commit — they're small, append-only, and give you a free audit trail.
|
|
116
|
+
|
|
117
|
+
---
|
|
118
|
+
|
|
119
|
+
## Fixture isolation
|
|
120
|
+
|
|
121
|
+
By default, flakehunter preserves session- and module-scoped fixtures across re-runs. This means a Playwright browser instance, database connection pool, or asyncio event loop is only created once per test — making repeated runs fast.
|
|
122
|
+
|
|
123
|
+
**When to use `--fh-isolate`:**
|
|
124
|
+
|
|
125
|
+
```bash
|
|
126
|
+
pytest tests/ --fh --fh-isolate
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
This passes a full teardown signal after every attempt, so every re-run starts with a completely clean fixture state. It is slower but will surface a class of flakiness that the default mode can miss — things like:
|
|
130
|
+
|
|
131
|
+
- Connection pools that degrade after a certain number of operations
|
|
132
|
+
- Session-scoped caches or queues that accumulate state
|
|
133
|
+
- Event loop pollution from uncancelled async tasks
|
|
134
|
+
|
|
135
|
+
If your tests are fast and stateless, the default is fine. If you're seeing flakiness that only appears after many runs in a long session, try `--fh-isolate`.
|
|
136
|
+
|
|
137
|
+
---
|
|
138
|
+
|
|
139
|
+
## AI analysis
|
|
140
|
+
|
|
141
|
+
Enable with `--fh-ai`. Requires an `ANTHROPIC_API_KEY` — set it using whichever method fits your workflow:
|
|
142
|
+
|
|
143
|
+
**Shell / CI environment variable** (recommended for CI):
|
|
144
|
+
```bash
|
|
145
|
+
# Linux / macOS
|
|
146
|
+
export ANTHROPIC_API_KEY=sk-ant-...
|
|
147
|
+
pytest tests/ --fh --fh-ai
|
|
148
|
+
|
|
149
|
+
# Windows CMD
|
|
150
|
+
set ANTHROPIC_API_KEY=sk-ant-...
|
|
151
|
+
pytest tests/ --fh --fh-ai
|
|
152
|
+
|
|
153
|
+
# Windows PowerShell
|
|
154
|
+
$env:ANTHROPIC_API_KEY = "sk-ant-..."
|
|
155
|
+
pytest tests/ --fh --fh-ai
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
**`.env` file** (recommended for local development — never commit this file):
|
|
159
|
+
```
|
|
160
|
+
# .env
|
|
161
|
+
ANTHROPIC_API_KEY=sk-ant-...
|
|
162
|
+
```
|
|
163
|
+
If you use `uv`, pass the file explicitly:
|
|
164
|
+
```bash
|
|
165
|
+
uv run --env-file=.env pytest --fh --fh-ai
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
> **Note:** Do not put your API key in `pytest.ini` or `pyproject.toml` — those files are typically committed to version control.
|
|
169
|
+
|
|
170
|
+
```
|
|
171
|
+
⚡ AI Root Cause Hypothesis
|
|
172
|
+
|
|
173
|
+
This test fails most often in setup_db_connection() after ~2.3s, which is
|
|
174
|
+
consistent with a connection pool exhaustion pattern under parallel test
|
|
175
|
+
execution. Consider increasing the test pool size or adding explicit
|
|
176
|
+
teardown that returns connections to the pool between retry attempts.
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
The AI receives the clustered stack traces, failure rates, and timing data —
|
|
180
|
+
not raw code — so it works on any codebase without needing source access.
|
|
181
|
+
|
|
182
|
+
---
|
|
183
|
+
|
|
184
|
+
## Architecture
|
|
185
|
+
|
|
186
|
+
```
|
|
187
|
+
pytest_flakehunter/
|
|
188
|
+
├── plugin.py # pytest hooks: --fh option, re-run protocol, xdist support
|
|
189
|
+
├── reporter.py # Self-contained HTML report with inline SVG charts
|
|
190
|
+
├── history.py # Append-only CSV history per test, cross-run trend analysis
|
|
191
|
+
└── ai_analysis.py # Anthropic API integration for root cause hypotheses
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
**No external dependencies** beyond pytest itself. The HTML report is fully
|
|
195
|
+
self-contained (no CDN calls, no JS frameworks) — safe to commit to repos or
|
|
196
|
+
attach to Jira tickets.
|
|
197
|
+
|
|
198
|
+
---
|
|
199
|
+
|
|
200
|
+
## Contributing
|
|
201
|
+
|
|
202
|
+
```bash
|
|
203
|
+
git clone https://github.com/jkasser/pytest-flakehunter
|
|
204
|
+
cd pytest-flakehunter
|
|
205
|
+
pip install -e ".[dev]"
|
|
206
|
+
pytest demo/sample_tests.py --fh --fh-runs=15
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
---
|
|
210
|
+
|
|
211
|
+
## License
|
|
212
|
+
|
|
213
|
+
MIT
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["uv_build >= 0.10.10, <0.11.0"]
|
|
3
|
+
build-backend = "uv_build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "pytest-flakehunter"
|
|
7
|
+
version = "1.0.1"
|
|
8
|
+
description = "Re-run tests N times, visualize failure heatmaps, and get AI root cause hypotheses"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
license = "MIT"
|
|
12
|
+
authors = [{ name = "jkasser" }]
|
|
13
|
+
keywords = ["pytest", "flaky", "testing", "qa", "heatmap", "ai"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Framework :: Pytest",
|
|
16
|
+
"Programming Language :: Python :: 3",
|
|
17
|
+
"Programming Language :: Python :: 3.9",
|
|
18
|
+
"Programming Language :: Python :: 3.10",
|
|
19
|
+
"Programming Language :: Python :: 3.11",
|
|
20
|
+
"Programming Language :: Python :: 3.12",
|
|
21
|
+
"Topic :: Software Development :: Testing",
|
|
22
|
+
"Operating System :: OS Independent",
|
|
23
|
+
]
|
|
24
|
+
dependencies = ["pytest>=7.0"]
|
|
25
|
+
|
|
26
|
+
[project.urls]
|
|
27
|
+
Homepage = "https://github.com/jkasser/pytest-flakehunter"
|
|
28
|
+
Repository = "https://github.com/jkasser/pytest-flakehunter"
|
|
29
|
+
"Bug Tracker" = "https://github.com/jkasser/pytest-flakehunter/issues"
|
|
30
|
+
|
|
31
|
+
[project.optional-dependencies]
|
|
32
|
+
xdist = ["pytest-xdist"]
|
|
33
|
+
ai = [] # Anthropic API is called via stdlib urllib; supply ANTHROPIC_API_KEY via the shell environment or `uv run --env-file=.env`
|
|
34
|
+
|
|
35
|
+
[project.entry-points."pytest11"]
|
|
36
|
+
flakehunter = "pytest_flakehunter.plugin"
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.1.1"
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
"""
|
|
2
|
+
AI-powered root cause analysis for flaky tests.
|
|
3
|
+
Calls the Anthropic API with clustered stack traces AND historical trend data
|
|
4
|
+
to generate richer, more specific hypotheses.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
import os
|
|
9
|
+
import json
|
|
10
|
+
from typing import TYPE_CHECKING, Optional
|
|
11
|
+
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from pytest_flakehunter.plugin import TestFlakeRecord
|
|
14
|
+
|
|
15
|
+
from pytest_flakehunter.plugin import extract_frames, extract_error, short_path
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def analyze_flaky_test(
    record: "TestFlakeRecord",
    history_summary: Optional[dict] = None,
) -> str:
    """Return an AI-generated root-cause hypothesis for a flaky test.

    Builds a prompt from the test's failure clusters (stack fingerprint,
    representative error, average duration) plus an optional cross-run
    ``history_summary`` dict, POSTs it to the Anthropic Messages API using
    only stdlib ``urllib``, and returns the model's text.

    Best-effort by design: returns ``""`` (never raises) when the
    ``ANTHROPIC_API_KEY`` env var is unset, when there is nothing to
    analyze, or on any API/network error.

    Args:
        record: Per-test flake data for the current run (clusters, attempts).
        history_summary: Optional aggregate stats from prior runs; expected
            keys include ``total_attempts``, ``date_range``,
            ``overall_flake_rate``, ``run_flake_rates``, ``env_breakdown``,
            ``arg_correlation``, ``git_branches_seen`` — TODO confirm against
            history.py, which is outside this view.
    """
    api_key = os.environ.get("ANTHROPIC_API_KEY", "")
    if not api_key:
        # No key -> silently skip AI analysis rather than fail the test run.
        return ""

    try:
        # Imported lazily so the plugin loads even in minimal environments.
        import urllib.request
        import urllib.error

        # Summarize each failure cluster: occurrence count, mean duration,
        # and one representative error + innermost stack frames.
        clusters = record.failure_clusters()
        cluster_summaries = []
        for fp, attempts in clusters.items():
            fr = attempts[0].failed_report
            if not fr:
                continue
            error_type, error_msg = extract_error(fr.longrepr)
            frames = extract_frames(fr.longrepr)
            # Keep only the 5 innermost frames to bound prompt size.
            frames_text = "\n".join(
                f"  {short_path(f[0])}:{f[1]} in {f[2]}() -- {f[3]}"
                for f in frames[-5:]
            )
            durations = [a.total_duration for a in attempts]
            cluster_summaries.append(
                f"Cluster {fp} ({len(attempts)} occurrences, "
                f"avg {sum(durations)/len(durations):.2f}s):\n"
                f"Error: {error_type}: {error_msg}\n"
                f"Stack (innermost last):\n{frames_text}"
            )

        # Nothing failed this run and no history either -> nothing to say.
        if not cluster_summaries and not history_summary:
            return ""

        history_section = ""
        if history_summary and history_summary.get("total_attempts", 0) > 0:
            hs = history_summary
            lines = [
                f"\nHISTORICAL DATA ({hs.get('date_range', ('?','?'))[0]} -> {hs.get('date_range', ('?','?'))[1]}):",
                f"  Total historical attempts: {hs.get('total_attempts')}",
                f"  Overall historical flake rate: {hs.get('overall_flake_rate', 0):.1%}",
                f"  Avg duration: {hs.get('avg_duration_s', 0):.3f}s  P95: {hs.get('p95_duration_s', 0):.3f}s",
            ]
            run_rates = hs.get("run_flake_rates", [])
            if len(run_rates) >= 3:
                # Compare the 3 most recent runs against the 3 earliest;
                # a delta beyond +/-5 percentage points counts as a trend.
                recent_3 = [r["flake_rate"] for r in run_rates[-3:]]
                early_3 = [r["flake_rate"] for r in run_rates[:3]]
                trend = sum(recent_3)/3 - sum(early_3)/3
                direction = "INCREASING" if trend > 0.05 else "DECREASING" if trend < -0.05 else "STABLE"
                lines.append(f"  Flake rate trend: {direction} (delta {trend:+.1%} vs earliest runs)")
            env = hs.get("env_breakdown", {})
            if len(env) > 1:
                # Only worth reporting when more than one host has data.
                lines.append("  Per-host flake rates:")
                for host, v in env.items():
                    rate = v["failed"] / v["total"] if v["total"] else 0
                    lines.append(f"    {host}: {rate:.0%} ({v['failed']}/{v['total']})")
            arg_corr = hs.get("arg_correlation", {})
            if arg_corr:
                # Surface only parameter combos with >30% failure rate and
                # at least 3 samples; cap at the top 3 worst offenders.
                high_fail = {k: v for k, v in arg_corr.items() if v["rate"] > 0.3 and v["total"] >= 3}
                if high_fail:
                    lines.append("  High-failure parameter combinations:")
                    for combo, v in sorted(high_fail.items(), key=lambda x: -x[1]["rate"])[:3]:
                        lines.append(f"    {combo}: {v['rate']:.0%} fail rate ({v['failed']}/{v['total']})")
            branches = hs.get("git_branches_seen", [])
            if branches:
                lines.append(f"  Git branches with failures: {', '.join(branches[:5])}")
            history_section = "\n".join(lines)

        # chr(10) is "\n" -- backslashes are not allowed inside f-string
        # expressions on the Python versions this package supports (>=3.9).
        prompt = f"""You are an expert software engineer analyzing a flaky pytest test.

CURRENT RUN:
Test: {record.nodeid}
Flake rate this run: {record.flake_rate:.0%} ({record.fail_count} failures in {len(record.attempts)} runs)
Avg duration on failure: {_avg_fail_duration(record):.2f}s

Failure clusters:
{chr(10).join(cluster_summaries) if cluster_summaries else "(no failures this run -- see historical data)"}
{history_section}

In 3-4 sentences, give a concise hypothesis about WHY this test is flaky.
Be specific: timing races, state pollution, resource exhaustion, external deps, parameter-specific bugs.
If historical data shows a trend, host correlation, or parameter correlation, call it out explicitly.
Do not repeat the stack trace. Be direct and actionable."""

        payload = json.dumps({
            "model": "claude-sonnet-4-6",
            "max_tokens": 400,
            "messages": [{"role": "user", "content": prompt}]
        }).encode()

        req = urllib.request.Request(
            "https://api.anthropic.com/v1/messages",
            data=payload,
            headers={
                "x-api-key": api_key,
                "anthropic-version": "2023-06-01",
                "content-type": "application/json",
            }
        )
        try:
            with urllib.request.urlopen(req, timeout=20) as resp:
                data = json.loads(resp.read())
                return data["content"][0]["text"].strip()
        except urllib.error.HTTPError as exc:
            # Include the response body so auth/quota errors are diagnosable.
            body = exc.read().decode("utf-8", errors="replace")
            print(f"\n[flakehunter] AI analysis failed for {record.nodeid}: {exc} — {body}")
            return ""

    # Broad catch is deliberate: AI analysis is an optional extra and must
    # never break report generation (covers URLError, bad JSON, key errors).
    except Exception as exc:
        print(f"\n[flakehunter] AI analysis failed for {record.nodeid}: {exc}")
        return ""
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def _avg_fail_duration(record: "TestFlakeRecord") -> float:
|
|
133
|
+
durations = [a.total_duration for a in record.attempts if a.outcome == "failed"]
|
|
134
|
+
return sum(durations) / len(durations) if durations else 0.0
|