pytest-self-healer 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pytest_self_healer-0.1.0/.gitignore +32 -0
- pytest_self_healer-0.1.0/Dockerfile +25 -0
- pytest_self_healer-0.1.0/LICENSE +21 -0
- pytest_self_healer-0.1.0/PKG-INFO +326 -0
- pytest_self_healer-0.1.0/README.md +297 -0
- pytest_self_healer-0.1.0/docker/docker-compose.yml +43 -0
- pytest_self_healer-0.1.0/docs/PROJECT_WRITEUP.md +259 -0
- pytest_self_healer-0.1.0/docs/architecture.svg +125 -0
- pytest_self_healer-0.1.0/pyproject.toml +75 -0
- pytest_self_healer-0.1.0/requirements.txt +20 -0
- pytest_self_healer-0.1.0/src/evals/compare_models.py +87 -0
- pytest_self_healer-0.1.0/src/evals/run_eval.py +234 -0
- pytest_self_healer-0.1.0/src/evals/selector_evalset.json +114 -0
- pytest_self_healer-0.1.0/src/pytest_self_healer/__init__.py +25 -0
- pytest_self_healer-0.1.0/src/pytest_self_healer/healing_engine.py +371 -0
- pytest_self_healer-0.1.0/src/pytest_self_healer/page_wrapper.py +104 -0
- pytest_self_healer-0.1.0/src/pytest_self_healer/plugin.py +156 -0
- pytest_self_healer-0.1.0/src/tests/test_accuracy.py +329 -0
- pytest_self_healer-0.1.0/src/tests/test_evalset.py +74 -0
- pytest_self_healer-0.1.0/src/tests/test_healing_examples.py +127 -0
- pytest_self_healer-0.1.0/src/tests/test_unit.py +277 -0
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*.pyo
|
|
5
|
+
.pytest_cache/
|
|
6
|
+
*.egg-info/
|
|
7
|
+
dist/
|
|
8
|
+
build/
|
|
9
|
+
.venv/
|
|
10
|
+
venv/
|
|
11
|
+
env/
|
|
12
|
+
|
|
13
|
+
# Reports & screenshots (generated at runtime)
|
|
14
|
+
reports/
|
|
15
|
+
|
|
16
|
+
# Env / secrets
|
|
17
|
+
.env
|
|
18
|
+
.env.*
|
|
19
|
+
|
|
20
|
+
# macOS
|
|
21
|
+
.DS_Store
|
|
22
|
+
|
|
23
|
+
# Claude Code
|
|
24
|
+
.claude/
|
|
25
|
+
|
|
26
|
+
# Personal interview-prep material (local only, not part of the package)
|
|
27
|
+
INTERVIEW_PREP.md
|
|
28
|
+
INTERVIEW_PREP.pdf
|
|
29
|
+
MASTER_INTERVIEW_PREP.md
|
|
30
|
+
MASTER_INTERVIEW_PREP.pdf
|
|
31
|
+
generate_pdf.py
|
|
32
|
+
generate_master_pdf.py
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# ── Stage 1: Python + Playwright ────────────────────────────────────────────
|
|
2
|
+
FROM mcr.microsoft.com/playwright/python:v1.44.0-jammy
|
|
3
|
+
|
|
4
|
+
WORKDIR /app
|
|
5
|
+
|
|
6
|
+
# System deps
|
|
7
|
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
|
8
|
+
curl \
|
|
9
|
+
&& rm -rf /var/lib/apt/lists/*
|
|
10
|
+
|
|
11
|
+
# Python deps
|
|
12
|
+
COPY requirements.txt .
|
|
13
|
+
RUN pip install --no-cache-dir -r requirements.txt
|
|
14
|
+
|
|
15
|
+
# Install Playwright browsers
|
|
16
|
+
RUN playwright install chromium --with-deps
|
|
17
|
+
|
|
18
|
+
# Copy source
|
|
19
|
+
COPY src/ ./src/
|
|
20
|
+
COPY pytest.ini .
|
|
21
|
+
|
|
22
|
+
ENV PYTHONPATH=/app/src
|
|
23
|
+
ENV PYTHONUNBUFFERED=1
|
|
24
|
+
|
|
25
|
+
CMD ["pytest", "src/tests/", "-v", "--tb=short"]
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Atharva Rane
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,326 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pytest-self-healer
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Auto-heal broken Playwright selectors using a local or cloud LLM
|
|
5
|
+
Project-URL: Homepage, https://github.com/athrvrne/Self-healing-Playwright-Tests
|
|
6
|
+
Project-URL: Issues, https://github.com/athrvrne/Self-healing-Playwright-Tests/issues
|
|
7
|
+
License: MIT
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Keywords: llm,ollama,playwright,pytest,self-healing,test-automation
|
|
10
|
+
Classifier: Framework :: Pytest
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Software Development :: Testing
|
|
19
|
+
Requires-Python: >=3.9
|
|
20
|
+
Requires-Dist: beautifulsoup4>=4.12
|
|
21
|
+
Requires-Dist: httpx>=0.27
|
|
22
|
+
Requires-Dist: lxml>=5.0
|
|
23
|
+
Requires-Dist: playwright>=1.44
|
|
24
|
+
Requires-Dist: pytest-asyncio>=0.23
|
|
25
|
+
Requires-Dist: pytest>=7.0
|
|
26
|
+
Provides-Extra: anthropic
|
|
27
|
+
Requires-Dist: anthropic>=0.28; extra == 'anthropic'
|
|
28
|
+
Description-Content-Type: text/markdown
|
|
29
|
+
|
|
30
|
+
# 🛠 Self-Healing Test Automation Framework
|
|
31
|
+
|
|
32
|
+
> A Playwright wrapper that uses a **local or cloud LLM** to automatically fix broken CSS selectors — no flaky CI pipelines, no manual triaging.
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
## The Problem
|
|
37
|
+
|
|
38
|
+
UI changes break test selectors constantly:
|
|
39
|
+
|
|
40
|
+
```
|
|
41
|
+
TimeoutError: page.click: Timeout 30000ms exceeded.
|
|
42
|
+
waiting for selector "#submit-btn"
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
The button still exists — it's just `[data-testid="login-submit"]` now. A human would fix it in 10 seconds. But at 3 AM in CI, it blocks your entire pipeline.
|
|
46
|
+
|
|
47
|
+
---
|
|
48
|
+
|
|
49
|
+
## How It Works
|
|
50
|
+
|
|
51
|
+
```
|
|
52
|
+
Test runs selector → TimeoutError → DOM snapshot captured
|
|
53
|
+
↓
|
|
54
|
+
DOM compressed (scripts/styles stripped, ~8KB)
|
|
55
|
+
↓
|
|
56
|
+
Prompt sent to LLM (local Ollama or Anthropic Claude)
|
|
57
|
+
↓
|
|
58
|
+
LLM returns: { "selector": "#new-id", "confidence": "high" }
|
|
59
|
+
↓
|
|
60
|
+
New selector validated in Playwright
|
|
61
|
+
↓
|
|
62
|
+
Test continues ✅ + Result cached for reuse
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
---
|
|
66
|
+
|
|
67
|
+
## Project Structure
|
|
68
|
+
|
|
69
|
+
```
|
|
70
|
+
pytest-self-healer/
|
|
71
|
+
├── src/
|
|
72
|
+
│ ├── pytest_self_healer/ # Installable package (pip install pytest-self-healer)
|
|
73
|
+
│ │ ├── __init__.py
|
|
74
|
+
│ │ ├── plugin.py # pytest entry point (fixtures + CLI options)
|
|
75
|
+
│ │ ├── healing_engine.py # Core: LLM clients, DOM compression, healing logic
|
|
76
|
+
│ │ └── page_wrapper.py # SelfHealingPage: drop-in Playwright Page replacement
|
|
77
|
+
│ ├── evals/
|
|
78
|
+
│ │ ├── selector_evalset.json # Ground-truth dataset for LLM accuracy benchmarking
|
|
79
|
+
│ │ ├── run_eval.py # Standalone eval runner (scores + saves report)
|
|
80
|
+
│ │ └── compare_models.py # Diff two eval reports side by side
|
|
81
|
+
│ ├── tests/
|
|
82
|
+
│ │ ├── test_healing_examples.py # Integration tests with intentionally stale selectors
|
|
83
|
+
│ │ ├── test_evalset.py # pytest integration for the evalset
|
|
84
|
+
│ │ ├── test_accuracy.py # LLM accuracy benchmarks (3 tiers)
|
|
85
|
+
│ │ └── test_unit.py # Unit tests (no browser/LLM required)
|
|
86
|
+
│ └── conftest.py # pytest fixtures, CLI options, report hook
|
|
87
|
+
├── docker/
|
|
88
|
+
│ ├── Dockerfile # Test runner image (Playwright + Python)
|
|
89
|
+
│ └── docker-compose.yml # Ollama + test runner, health-checked
|
|
90
|
+
├── reports/
|
|
91
|
+
│ ├── healing_report_<ts>.json # Per-run healing reports
|
|
92
|
+
│ └── evals/
|
|
93
|
+
│ └── eval_<provider>_<ts>.json # Per-run eval reports
|
|
94
|
+
├── requirements.txt
|
|
95
|
+
├── pytest.ini
|
|
96
|
+
└── README.md
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
---
|
|
100
|
+
|
|
101
|
+
## Quickstart
|
|
102
|
+
|
|
103
|
+
### Option 1: Unit tests only (no browser or LLM needed)
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
pip install -r requirements.txt
|
|
107
|
+
playwright install chromium
|
|
108
|
+
PYTHONPATH=src pytest src/tests/test_unit.py -v
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
### Option 2: Full integration tests (requires Ollama running locally)
|
|
112
|
+
|
|
113
|
+
```bash
|
|
114
|
+
# Install and start Ollama
|
|
115
|
+
brew install ollama # or: curl -fsSL https://ollama.com/install.sh | sh
|
|
116
|
+
ollama pull qwen2.5-coder:3b
|
|
117
|
+
|
|
118
|
+
# Run the tests
|
|
119
|
+
PYTHONPATH=src pytest src/tests/ -v \
|
|
120
|
+
--ollama-url=http://localhost:11434 \
|
|
121
|
+
--ollama-model=qwen2.5-coder:3b
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
### Option 3: Use Anthropic Claude instead of Ollama
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
export ANTHROPIC_API_KEY=sk-ant-...
|
|
128
|
+
|
|
129
|
+
PYTHONPATH=src pytest src/tests/ -v \
|
|
130
|
+
--llm-provider=anthropic \
|
|
131
|
+
--anthropic-model=claude-haiku-4-5-20251001
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
### Option 4: Docker (everything bundled)
|
|
135
|
+
|
|
136
|
+
```bash
|
|
137
|
+
docker compose -f docker/docker-compose.yml up --build
|
|
138
|
+
# Reports land in ./reports/healing_report_<timestamp>.json
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
---
|
|
142
|
+
|
|
143
|
+
## Writing Your Own Healing Tests
|
|
144
|
+
|
|
145
|
+
Replace Playwright's `page` with the `healing_page` fixture (which provides a `SelfHealingPage`) and add a `purpose` string to every interaction:
|
|
146
|
+
|
|
147
|
+
```python
|
|
148
|
+
# After: pip install pytest-self-healer
|
|
149
|
+
# No import needed — healing_page fixture is auto-available
|
|
150
|
+
|
|
151
|
+
async def test_checkout(healing_page):
|
|
152
|
+
await healing_page.goto("https://myapp.com/checkout")
|
|
153
|
+
|
|
154
|
+
# Selector is stale — LLM will find the real one
|
|
155
|
+
await healing_page.click(
|
|
156
|
+
selector="button#old-checkout-id",
|
|
157
|
+
purpose="checkout submit button in the cart summary",
|
|
158
|
+
)
|
|
159
|
+
|
|
160
|
+
await healing_page.fill(
|
|
161
|
+
selector="input.card-num",
|
|
162
|
+
value="4242424242424242",
|
|
163
|
+
purpose="credit card number input in payment form",
|
|
164
|
+
)
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
**Tips for better healing:**
|
|
168
|
+
- Be specific in `purpose`: *"blue submit button in the login modal"* > *"button"*
|
|
169
|
+
- Use `data-testid` attributes in your app for stable baseline selectors
|
|
170
|
+
- The LLM favors `data-testid` > `aria-label` > `id` > semantic CSS
|
|
171
|
+
|
|
172
|
+
---
|
|
173
|
+
|
|
174
|
+
## CLI Options
|
|
175
|
+
|
|
176
|
+
| Flag | Default | Description |
|
|
177
|
+
|------|---------|-------------|
|
|
178
|
+
| `--llm-provider` | `ollama` | `ollama` \| `anthropic` \| `auto` |
|
|
179
|
+
| `--ollama-url` | `http://localhost:11434` | Ollama server endpoint |
|
|
180
|
+
| `--ollama-model` | `qwen2.5-coder:3b` | Model name (also works with `llama3`, `mistral`) |
|
|
181
|
+
| `--anthropic-model` | `claude-haiku-4-5-20251001` | Any Claude model ID |
|
|
182
|
+
| `--anthropic-api-key` | `None` | Falls back to `ANTHROPIC_API_KEY` env var |
|
|
183
|
+
| `--healing-report-dir` | `reports` | Where to write JSON healing reports |
|
|
184
|
+
| `--screenshot-dir` | `reports/screenshots` | Where to write BEFORE/AFTER screenshots |
|
|
185
|
+
| `--headless` | `true` | Run browser headless |
|
|
186
|
+
|
|
187
|
+
---
|
|
188
|
+
|
|
189
|
+
## Healing Report
|
|
190
|
+
|
|
191
|
+
After each run, a JSON report is written to `reports/`:
|
|
192
|
+
|
|
193
|
+
```json
|
|
194
|
+
{
|
|
195
|
+
"total_healings_attempted": 3,
|
|
196
|
+
"successful_healings": 3,
|
|
197
|
+
"failed_healings": 0,
|
|
198
|
+
"attempts": [
|
|
199
|
+
{
|
|
200
|
+
"original_selector": "#user-name",
|
|
201
|
+
"element_purpose": "username input field on login form",
|
|
202
|
+
"suggested_selector": "#username",
|
|
203
|
+
"success": true,
|
|
204
|
+
"timestamp": "2024-01-15T10:23:45.123456",
|
|
205
|
+
"model_response_time_ms": 1840.5,
|
|
206
|
+
"dom_size_chars": 4231,
|
|
207
|
+
"provider": "ollama"
|
|
208
|
+
}
|
|
209
|
+
]
|
|
210
|
+
}
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
---
|
|
214
|
+
|
|
215
|
+
## Evalset — Benchmarking LLM Accuracy
|
|
216
|
+
|
|
217
|
+
The evalset is a structured ground-truth dataset (`src/evals/selector_evalset.json`) used to measure how accurately the LLM finds correct selectors. It is independent of the healing tests — no browser required.
|
|
218
|
+
|
|
219
|
+
### What's in the evalset
|
|
220
|
+
|
|
221
|
+
12 cases across 7 categories and 3 difficulty levels:
|
|
222
|
+
|
|
223
|
+
| Category | Cases | Difficulty |
|
|
224
|
+
|----------|-------|------------|
|
|
225
|
+
| login | 3 | easy |
|
|
226
|
+
| checkout | 2 | medium |
|
|
227
|
+
| search | 2 | easy |
|
|
228
|
+
| navigation | 1 | easy |
|
|
229
|
+
| modal | 2 | medium |
|
|
230
|
+
| profile | 1 | hard |
|
|
231
|
+
| data-table | 1 | hard |
|
|
232
|
+
|
|
233
|
+
Each case contains a stale selector, a purpose string, a minimal HTML snippet, and a list of acceptable correct selectors.
|
|
234
|
+
|
|
235
|
+
### Running the evalset
|
|
236
|
+
|
|
237
|
+
**Standalone runner** (fastest, no pytest overhead):
|
|
238
|
+
|
|
239
|
+
```bash
|
|
240
|
+
# Against local Ollama
|
|
241
|
+
PYTHONPATH=src python src/evals/run_eval.py
|
|
242
|
+
|
|
243
|
+
# Against Anthropic Claude
|
|
244
|
+
PYTHONPATH=src python src/evals/run_eval.py \
|
|
245
|
+
--provider anthropic \
|
|
246
|
+
--anthropic-model claude-haiku-4-5-20251001
|
|
247
|
+
|
|
248
|
+
# Filter to a category or difficulty
|
|
249
|
+
PYTHONPATH=src python src/evals/run_eval.py --category login
|
|
250
|
+
PYTHONPATH=src python src/evals/run_eval.py --difficulty hard
|
|
251
|
+
```
|
|
252
|
+
|
|
253
|
+
**Via pytest** (integrates with your existing test flags):
|
|
254
|
+
|
|
255
|
+
```bash
|
|
256
|
+
PYTHONPATH=src pytest src/tests/test_evalset.py -v
|
|
257
|
+
PYTHONPATH=src pytest src/tests/test_evalset.py -v -k "login"
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
### Comparing two models
|
|
261
|
+
|
|
262
|
+
Each eval run saves a timestamped report to `reports/evals/`. Use `compare_models.py` to diff two runs:
|
|
263
|
+
|
|
264
|
+
```bash
|
|
265
|
+
# Run against model A
|
|
266
|
+
PYTHONPATH=src python src/evals/run_eval.py --ollama-model qwen2.5-coder:3b
|
|
267
|
+
|
|
268
|
+
# Run against model B
|
|
269
|
+
PYTHONPATH=src python src/evals/run_eval.py --ollama-model llama3
|
|
270
|
+
|
|
271
|
+
# Compare
|
|
272
|
+
python src/evals/compare_models.py \
|
|
273
|
+
reports/evals/eval_ollama_20260601_120000.json \
|
|
274
|
+
reports/evals/eval_ollama_20260601_120500.json
|
|
275
|
+
```
|
|
276
|
+
|
|
277
|
+
Output:
|
|
278
|
+
```
|
|
279
|
+
Metric A B Delta
|
|
280
|
+
-------------------------------------------------------
|
|
281
|
+
Accuracy 75.0% 91.7% +16.7%
|
|
282
|
+
Avg response (ms) 2340 1820 -520.0
|
|
283
|
+
```
|
|
284
|
+
|
|
285
|
+
### Adding new evalset cases
|
|
286
|
+
|
|
287
|
+
Open `src/evals/selector_evalset.json` and append to the `cases` array. Each case needs:
|
|
288
|
+
|
|
289
|
+
```json
|
|
290
|
+
{
|
|
291
|
+
"id": "unique-slug",
|
|
292
|
+
"category": "login",
|
|
293
|
+
"difficulty": "easy",
|
|
294
|
+
"stale_selector": "#old-btn",
|
|
295
|
+
"purpose": "login submit button",
|
|
296
|
+
"expected_selectors": ["[data-testid='login-btn']", "button[type='submit']"],
|
|
297
|
+
"html": "<minimal HTML snippet containing the target element>"
|
|
298
|
+
}
|
|
299
|
+
```
|
|
300
|
+
|
|
301
|
+
No code changes needed — the runner and pytest integration pick up new cases automatically.
|
|
302
|
+
|
|
303
|
+
---
|
|
304
|
+
|
|
305
|
+
## Architecture Decisions
|
|
306
|
+
|
|
307
|
+
| Decision | Rationale |
|
|
308
|
+
|----------|-----------|
|
|
309
|
+
| **Local LLM first (Ollama)** | No API keys, no data leakage, works offline in CI |
|
|
310
|
+
| **Anthropic as opt-in cloud backend** | Higher accuracy on complex DOMs; useful when RAM is limited |
|
|
311
|
+
| **`auto` provider mode** | Uses Claude if `ANTHROPIC_API_KEY` is set, otherwise Ollama — same command works locally and in CI |
|
|
312
|
+
| **DOM compression** | Strips scripts/styles, keeps semantic attrs. Fits in small model context (~8KB) |
|
|
313
|
+
| **Selector caching** | Avoids repeated LLM calls for the same broken selector in one run |
|
|
314
|
+
| **Confidence scores** | LLM self-reports certainty; useful for alerting on `low` confidence heals |
|
|
315
|
+
| **`purpose` string** | Natural language > brittle heuristics. Tells LLM *why* you want the element |
|
|
316
|
+
| **Evalset separate from tests** | Ground-truth data lives in JSON, not test code — easy to grow and compare across models |
|
|
317
|
+
|
|
318
|
+
---
|
|
319
|
+
|
|
320
|
+
## Extending
|
|
321
|
+
|
|
322
|
+
- **Swap the LLM**: Change `--ollama-model=mistral` or use `--llm-provider=anthropic` for Claude
|
|
323
|
+
- **Persist the cache**: Serialize `engine._cache` to `reports/selector_cache.json` between runs
|
|
324
|
+
- **Alert on low confidence**: Check `attempt["confidence"] == "low"` in the report and open a GitHub issue automatically
|
|
325
|
+
- **Grow the evalset**: Add cases to `selector_evalset.json` to cover your app's specific UI patterns
|
|
326
|
+
- **CI accuracy gate**: Run `run_eval.py` in CI and fail the build if accuracy drops below a threshold
|
|
@@ -0,0 +1,297 @@
|
|
|
1
|
+
# 🛠 Self-Healing Test Automation Framework
|
|
2
|
+
|
|
3
|
+
> A Playwright wrapper that uses a **local or cloud LLM** to automatically fix broken CSS selectors — no flaky CI pipelines, no manual triaging.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## The Problem
|
|
8
|
+
|
|
9
|
+
UI changes break test selectors constantly:
|
|
10
|
+
|
|
11
|
+
```
|
|
12
|
+
TimeoutError: page.click: Timeout 30000ms exceeded.
|
|
13
|
+
waiting for selector "#submit-btn"
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
The button still exists — it's just `[data-testid="login-submit"]` now. A human would fix it in 10 seconds. But at 3 AM in CI, it blocks your entire pipeline.
|
|
17
|
+
|
|
18
|
+
---
|
|
19
|
+
|
|
20
|
+
## How It Works
|
|
21
|
+
|
|
22
|
+
```
|
|
23
|
+
Test runs selector → TimeoutError → DOM snapshot captured
|
|
24
|
+
↓
|
|
25
|
+
DOM compressed (scripts/styles stripped, ~8KB)
|
|
26
|
+
↓
|
|
27
|
+
Prompt sent to LLM (local Ollama or Anthropic Claude)
|
|
28
|
+
↓
|
|
29
|
+
LLM returns: { "selector": "#new-id", "confidence": "high" }
|
|
30
|
+
↓
|
|
31
|
+
New selector validated in Playwright
|
|
32
|
+
↓
|
|
33
|
+
Test continues ✅ + Result cached for reuse
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
---
|
|
37
|
+
|
|
38
|
+
## Project Structure
|
|
39
|
+
|
|
40
|
+
```
|
|
41
|
+
pytest-self-healer/
|
|
42
|
+
├── src/
|
|
43
|
+
│ ├── pytest_self_healer/ # Installable package (pip install pytest-self-healer)
|
|
44
|
+
│ │ ├── __init__.py
|
|
45
|
+
│ │ ├── plugin.py # pytest entry point (fixtures + CLI options)
|
|
46
|
+
│ │ ├── healing_engine.py # Core: LLM clients, DOM compression, healing logic
|
|
47
|
+
│ │ └── page_wrapper.py # SelfHealingPage: drop-in Playwright Page replacement
|
|
48
|
+
│ ├── evals/
|
|
49
|
+
│ │ ├── selector_evalset.json # Ground-truth dataset for LLM accuracy benchmarking
|
|
50
|
+
│ │ ├── run_eval.py # Standalone eval runner (scores + saves report)
|
|
51
|
+
│ │ └── compare_models.py # Diff two eval reports side by side
|
|
52
|
+
│ ├── tests/
|
|
53
|
+
│ │ ├── test_healing_examples.py # Integration tests with intentionally stale selectors
|
|
54
|
+
│ │ ├── test_evalset.py # pytest integration for the evalset
|
|
55
|
+
│ │ ├── test_accuracy.py # LLM accuracy benchmarks (3 tiers)
|
|
56
|
+
│ │ └── test_unit.py # Unit tests (no browser/LLM required)
|
|
57
|
+
│ └── conftest.py # pytest fixtures, CLI options, report hook
|
|
58
|
+
├── docker/
|
|
59
|
+
│ ├── Dockerfile # Test runner image (Playwright + Python)
|
|
60
|
+
│ └── docker-compose.yml # Ollama + test runner, health-checked
|
|
61
|
+
├── reports/
|
|
62
|
+
│ ├── healing_report_<ts>.json # Per-run healing reports
|
|
63
|
+
│ └── evals/
|
|
64
|
+
│ └── eval_<provider>_<ts>.json # Per-run eval reports
|
|
65
|
+
├── requirements.txt
|
|
66
|
+
├── pytest.ini
|
|
67
|
+
└── README.md
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
---
|
|
71
|
+
|
|
72
|
+
## Quickstart
|
|
73
|
+
|
|
74
|
+
### Option 1: Unit tests only (no browser or LLM needed)
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
pip install -r requirements.txt
|
|
78
|
+
playwright install chromium
|
|
79
|
+
PYTHONPATH=src pytest src/tests/test_unit.py -v
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
### Option 2: Full integration tests (requires Ollama running locally)
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
# Install and start Ollama
|
|
86
|
+
brew install ollama # or: curl -fsSL https://ollama.com/install.sh | sh
|
|
87
|
+
ollama pull qwen2.5-coder:3b
|
|
88
|
+
|
|
89
|
+
# Run the tests
|
|
90
|
+
PYTHONPATH=src pytest src/tests/ -v \
|
|
91
|
+
--ollama-url=http://localhost:11434 \
|
|
92
|
+
--ollama-model=qwen2.5-coder:3b
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
### Option 3: Use Anthropic Claude instead of Ollama
|
|
96
|
+
|
|
97
|
+
```bash
|
|
98
|
+
export ANTHROPIC_API_KEY=sk-ant-...
|
|
99
|
+
|
|
100
|
+
PYTHONPATH=src pytest src/tests/ -v \
|
|
101
|
+
--llm-provider=anthropic \
|
|
102
|
+
--anthropic-model=claude-haiku-4-5-20251001
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
### Option 4: Docker (everything bundled)
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
docker compose -f docker/docker-compose.yml up --build
|
|
109
|
+
# Reports land in ./reports/healing_report_<timestamp>.json
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
---
|
|
113
|
+
|
|
114
|
+
## Writing Your Own Healing Tests
|
|
115
|
+
|
|
116
|
+
Replace Playwright's `page` with the `healing_page` fixture (which provides a `SelfHealingPage`) and add a `purpose` string to every interaction:
|
|
117
|
+
|
|
118
|
+
```python
|
|
119
|
+
# After: pip install pytest-self-healer
|
|
120
|
+
# No import needed — healing_page fixture is auto-available
|
|
121
|
+
|
|
122
|
+
async def test_checkout(healing_page):
|
|
123
|
+
await healing_page.goto("https://myapp.com/checkout")
|
|
124
|
+
|
|
125
|
+
# Selector is stale — LLM will find the real one
|
|
126
|
+
await healing_page.click(
|
|
127
|
+
selector="button#old-checkout-id",
|
|
128
|
+
purpose="checkout submit button in the cart summary",
|
|
129
|
+
)
|
|
130
|
+
|
|
131
|
+
await healing_page.fill(
|
|
132
|
+
selector="input.card-num",
|
|
133
|
+
value="4242424242424242",
|
|
134
|
+
purpose="credit card number input in payment form",
|
|
135
|
+
)
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
**Tips for better healing:**
|
|
139
|
+
- Be specific in `purpose`: *"blue submit button in the login modal"* > *"button"*
|
|
140
|
+
- Use `data-testid` attributes in your app for stable baseline selectors
|
|
141
|
+
- The LLM favors `data-testid` > `aria-label` > `id` > semantic CSS
|
|
142
|
+
|
|
143
|
+
---
|
|
144
|
+
|
|
145
|
+
## CLI Options
|
|
146
|
+
|
|
147
|
+
| Flag | Default | Description |
|
|
148
|
+
|------|---------|-------------|
|
|
149
|
+
| `--llm-provider` | `ollama` | `ollama` \| `anthropic` \| `auto` |
|
|
150
|
+
| `--ollama-url` | `http://localhost:11434` | Ollama server endpoint |
|
|
151
|
+
| `--ollama-model` | `qwen2.5-coder:3b` | Model name (also works with `llama3`, `mistral`) |
|
|
152
|
+
| `--anthropic-model` | `claude-haiku-4-5-20251001` | Any Claude model ID |
|
|
153
|
+
| `--anthropic-api-key` | `None` | Falls back to `ANTHROPIC_API_KEY` env var |
|
|
154
|
+
| `--healing-report-dir` | `reports` | Where to write JSON healing reports |
|
|
155
|
+
| `--screenshot-dir` | `reports/screenshots` | Where to write BEFORE/AFTER screenshots |
|
|
156
|
+
| `--headless` | `true` | Run browser headless |
|
|
157
|
+
|
|
158
|
+
---
|
|
159
|
+
|
|
160
|
+
## Healing Report
|
|
161
|
+
|
|
162
|
+
After each run, a JSON report is written to `reports/`:
|
|
163
|
+
|
|
164
|
+
```json
|
|
165
|
+
{
|
|
166
|
+
"total_healings_attempted": 3,
|
|
167
|
+
"successful_healings": 3,
|
|
168
|
+
"failed_healings": 0,
|
|
169
|
+
"attempts": [
|
|
170
|
+
{
|
|
171
|
+
"original_selector": "#user-name",
|
|
172
|
+
"element_purpose": "username input field on login form",
|
|
173
|
+
"suggested_selector": "#username",
|
|
174
|
+
"success": true,
|
|
175
|
+
"timestamp": "2024-01-15T10:23:45.123456",
|
|
176
|
+
"model_response_time_ms": 1840.5,
|
|
177
|
+
"dom_size_chars": 4231,
|
|
178
|
+
"provider": "ollama"
|
|
179
|
+
}
|
|
180
|
+
]
|
|
181
|
+
}
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
---
|
|
185
|
+
|
|
186
|
+
## Evalset — Benchmarking LLM Accuracy
|
|
187
|
+
|
|
188
|
+
The evalset is a structured ground-truth dataset (`src/evals/selector_evalset.json`) used to measure how accurately the LLM finds correct selectors. It is independent of the healing tests — no browser required.
|
|
189
|
+
|
|
190
|
+
### What's in the evalset
|
|
191
|
+
|
|
192
|
+
12 cases across 7 categories and 3 difficulty levels:
|
|
193
|
+
|
|
194
|
+
| Category | Cases | Difficulty |
|
|
195
|
+
|----------|-------|------------|
|
|
196
|
+
| login | 3 | easy |
|
|
197
|
+
| checkout | 2 | medium |
|
|
198
|
+
| search | 2 | easy |
|
|
199
|
+
| navigation | 1 | easy |
|
|
200
|
+
| modal | 2 | medium |
|
|
201
|
+
| profile | 1 | hard |
|
|
202
|
+
| data-table | 1 | hard |
|
|
203
|
+
|
|
204
|
+
Each case contains a stale selector, a purpose string, a minimal HTML snippet, and a list of acceptable correct selectors.
|
|
205
|
+
|
|
206
|
+
### Running the evalset
|
|
207
|
+
|
|
208
|
+
**Standalone runner** (fastest, no pytest overhead):
|
|
209
|
+
|
|
210
|
+
```bash
|
|
211
|
+
# Against local Ollama
|
|
212
|
+
PYTHONPATH=src python src/evals/run_eval.py
|
|
213
|
+
|
|
214
|
+
# Against Anthropic Claude
|
|
215
|
+
PYTHONPATH=src python src/evals/run_eval.py \
|
|
216
|
+
--provider anthropic \
|
|
217
|
+
--anthropic-model claude-haiku-4-5-20251001
|
|
218
|
+
|
|
219
|
+
# Filter to a category or difficulty
|
|
220
|
+
PYTHONPATH=src python src/evals/run_eval.py --category login
|
|
221
|
+
PYTHONPATH=src python src/evals/run_eval.py --difficulty hard
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
**Via pytest** (integrates with your existing test flags):
|
|
225
|
+
|
|
226
|
+
```bash
|
|
227
|
+
PYTHONPATH=src pytest src/tests/test_evalset.py -v
|
|
228
|
+
PYTHONPATH=src pytest src/tests/test_evalset.py -v -k "login"
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
### Comparing two models
|
|
232
|
+
|
|
233
|
+
Each eval run saves a timestamped report to `reports/evals/`. Use `compare_models.py` to diff two runs:
|
|
234
|
+
|
|
235
|
+
```bash
|
|
236
|
+
# Run against model A
|
|
237
|
+
PYTHONPATH=src python src/evals/run_eval.py --ollama-model qwen2.5-coder:3b
|
|
238
|
+
|
|
239
|
+
# Run against model B
|
|
240
|
+
PYTHONPATH=src python src/evals/run_eval.py --ollama-model llama3
|
|
241
|
+
|
|
242
|
+
# Compare
|
|
243
|
+
python src/evals/compare_models.py \
|
|
244
|
+
reports/evals/eval_ollama_20260601_120000.json \
|
|
245
|
+
reports/evals/eval_ollama_20260601_120500.json
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
Output:
|
|
249
|
+
```
|
|
250
|
+
Metric A B Delta
|
|
251
|
+
-------------------------------------------------------
|
|
252
|
+
Accuracy 75.0% 91.7% +16.7%
|
|
253
|
+
Avg response (ms) 2340 1820 -520.0
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
### Adding new evalset cases
|
|
257
|
+
|
|
258
|
+
Open `src/evals/selector_evalset.json` and append to the `cases` array. Each case needs:
|
|
259
|
+
|
|
260
|
+
```json
|
|
261
|
+
{
|
|
262
|
+
"id": "unique-slug",
|
|
263
|
+
"category": "login",
|
|
264
|
+
"difficulty": "easy",
|
|
265
|
+
"stale_selector": "#old-btn",
|
|
266
|
+
"purpose": "login submit button",
|
|
267
|
+
"expected_selectors": ["[data-testid='login-btn']", "button[type='submit']"],
|
|
268
|
+
"html": "<minimal HTML snippet containing the target element>"
|
|
269
|
+
}
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
No code changes needed — the runner and pytest integration pick up new cases automatically.
|
|
273
|
+
|
|
274
|
+
---
|
|
275
|
+
|
|
276
|
+
## Architecture Decisions
|
|
277
|
+
|
|
278
|
+
| Decision | Rationale |
|
|
279
|
+
|----------|-----------|
|
|
280
|
+
| **Local LLM first (Ollama)** | No API keys, no data leakage, works offline in CI |
|
|
281
|
+
| **Anthropic as opt-in cloud backend** | Higher accuracy on complex DOMs; useful when RAM is limited |
|
|
282
|
+
| **`auto` provider mode** | Uses Claude if `ANTHROPIC_API_KEY` is set, otherwise Ollama — same command works locally and in CI |
|
|
283
|
+
| **DOM compression** | Strips scripts/styles, keeps semantic attrs. Fits in small model context (~8KB) |
|
|
284
|
+
| **Selector caching** | Avoids repeated LLM calls for the same broken selector in one run |
|
|
285
|
+
| **Confidence scores** | LLM self-reports certainty; useful for alerting on `low` confidence heals |
|
|
286
|
+
| **`purpose` string** | Natural language > brittle heuristics. Tells LLM *why* you want the element |
|
|
287
|
+
| **Evalset separate from tests** | Ground-truth data lives in JSON, not test code — easy to grow and compare across models |
|
|
288
|
+
|
|
289
|
+
---
|
|
290
|
+
|
|
291
|
+
## Extending
|
|
292
|
+
|
|
293
|
+
- **Swap the LLM**: Change `--ollama-model=mistral` or use `--llm-provider=anthropic` for Claude
|
|
294
|
+
- **Persist the cache**: Serialize `engine._cache` to `reports/selector_cache.json` between runs
|
|
295
|
+
- **Alert on low confidence**: Check `attempt["confidence"] == "low"` in the report and open a GitHub issue automatically
|
|
296
|
+
- **Grow the evalset**: Add cases to `selector_evalset.json` to cover your app's specific UI patterns
|
|
297
|
+
- **CI accuracy gate**: Run `run_eval.py` in CI and fail the build if accuracy drops below a threshold
|