perceptorguard 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- perceptorguard-0.1.0/LICENSE +21 -0
- perceptorguard-0.1.0/PKG-INFO +340 -0
- perceptorguard-0.1.0/README.md +305 -0
- perceptorguard-0.1.0/gates/__init__.py +0 -0
- perceptorguard-0.1.0/gates/comparator.py +183 -0
- perceptorguard-0.1.0/gates/gate_runner.py +67 -0
- perceptorguard-0.1.0/gates/thresholds.py +44 -0
- perceptorguard-0.1.0/ingestion/__init__.py +0 -0
- perceptorguard-0.1.0/ingestion/class_map.py +185 -0
- perceptorguard-0.1.0/ingestion/coco_gt.py +123 -0
- perceptorguard-0.1.0/ingestion/coco_predictions.py +126 -0
- perceptorguard-0.1.0/ingestion/metadata_csv.py +138 -0
- perceptorguard-0.1.0/ingestion/slice_inferrer.py +188 -0
- perceptorguard-0.1.0/metrics/__init__.py +0 -0
- perceptorguard-0.1.0/metrics/cluster_analyzer.py +95 -0
- perceptorguard-0.1.0/metrics/engine.py +131 -0
- perceptorguard-0.1.0/metrics/failure_classifier.py +55 -0
- perceptorguard-0.1.0/metrics/reporter.py +160 -0
- perceptorguard-0.1.0/metrics/triage_reporter.py +94 -0
- perceptorguard-0.1.0/perceptorguard/__init__.py +0 -0
- perceptorguard-0.1.0/perceptorguard/cli.py +169 -0
- perceptorguard-0.1.0/perceptorguard.egg-info/PKG-INFO +340 -0
- perceptorguard-0.1.0/perceptorguard.egg-info/SOURCES.txt +61 -0
- perceptorguard-0.1.0/perceptorguard.egg-info/dependency_links.txt +1 -0
- perceptorguard-0.1.0/perceptorguard.egg-info/entry_points.txt +2 -0
- perceptorguard-0.1.0/perceptorguard.egg-info/requires.txt +16 -0
- perceptorguard-0.1.0/perceptorguard.egg-info/top_level.txt +14 -0
- perceptorguard-0.1.0/pyproject.toml +53 -0
- perceptorguard-0.1.0/reports/__init__.py +0 -0
- perceptorguard-0.1.0/reports/annotator.py +173 -0
- perceptorguard-0.1.0/reports/renderer.py +185 -0
- perceptorguard-0.1.0/reports/tracker.py +111 -0
- perceptorguard-0.1.0/runner/__init__.py +0 -0
- perceptorguard-0.1.0/runner/eval_runner.py +114 -0
- perceptorguard-0.1.0/runner/gt_extractor.py +49 -0
- perceptorguard-0.1.0/runner/matcher.py +106 -0
- perceptorguard-0.1.0/runner/scene_runner.py +119 -0
- perceptorguard-0.1.0/scenarios/__init__.py +3 -0
- perceptorguard-0.1.0/scenarios/generator.py +230 -0
- perceptorguard-0.1.0/scenarios/schemas.py +54 -0
- perceptorguard-0.1.0/scripts/__init__.py +0 -0
- perceptorguard-0.1.0/scripts/demo_regression.py +122 -0
- perceptorguard-0.1.0/scripts/generate.py +134 -0
- perceptorguard-0.1.0/scripts/generate_report.py +125 -0
- perceptorguard-0.1.0/scripts/run_coco_eval.py +159 -0
- perceptorguard-0.1.0/scripts/run_eval.py +51 -0
- perceptorguard-0.1.0/scripts/run_gate.py +47 -0
- perceptorguard-0.1.0/scripts/save_baseline.py +59 -0
- perceptorguard-0.1.0/scripts/triage.py +66 -0
- perceptorguard-0.1.0/setup.cfg +4 -0
- perceptorguard-0.1.0/tests/__init__.py +0 -0
- perceptorguard-0.1.0/tests/test_class_map.py +283 -0
- perceptorguard-0.1.0/tests/test_coco_ingestion.py +382 -0
- perceptorguard-0.1.0/tests/test_gates.py +226 -0
- perceptorguard-0.1.0/tests/test_generator.py +79 -0
- perceptorguard-0.1.0/tests/test_matcher.py +102 -0
- perceptorguard-0.1.0/tests/test_metadata_csv.py +208 -0
- perceptorguard-0.1.0/tests/test_metrics.py +153 -0
- perceptorguard-0.1.0/tests/test_scene_runner.py +109 -0
- perceptorguard-0.1.0/tests/test_schemas.py +28 -0
- perceptorguard-0.1.0/tests/test_slice_inferrer.py +238 -0
- perceptorguard-0.1.0/tests/test_triage.py +131 -0
- perceptorguard-0.1.0/tests/verify_chunk2.py +418 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 PerceptorGuard
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,340 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: perceptorguard
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Perception model evaluation harness — sliced metrics, failure triage, and CI regression gating for detection models
|
|
5
|
+
License: MIT
|
|
6
|
+
Project-URL: Homepage, https://github.com/your-org/perceptorguard
|
|
7
|
+
Project-URL: Bug Tracker, https://github.com/your-org/perceptorguard/issues
|
|
8
|
+
Keywords: object-detection,evaluation,metrics,yolo,coco,perception,robotics
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Requires-Python: >=3.11
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
License-File: LICENSE
|
|
20
|
+
Requires-Dist: pandas>=2.2.0
|
|
21
|
+
Requires-Dist: pydantic>=2.7.0
|
|
22
|
+
Requires-Dist: numpy>=1.26.0
|
|
23
|
+
Requires-Dist: Pillow>=10.3.0
|
|
24
|
+
Requires-Dist: scikit-learn>=1.3.0
|
|
25
|
+
Requires-Dist: jinja2>=3.1.0
|
|
26
|
+
Requires-Dist: pyyaml>=6.0
|
|
27
|
+
Provides-Extra: synthetic
|
|
28
|
+
Requires-Dist: pybullet>=3.2.6; extra == "synthetic"
|
|
29
|
+
Requires-Dist: torch>=2.2.0; extra == "synthetic"
|
|
30
|
+
Requires-Dist: torchvision>=0.17.0; extra == "synthetic"
|
|
31
|
+
Requires-Dist: ultralytics>=8.2.0; extra == "synthetic"
|
|
32
|
+
Provides-Extra: dev
|
|
33
|
+
Requires-Dist: pytest>=8.2.0; extra == "dev"
|
|
34
|
+
Dynamic: license-file
|
|
35
|
+
|
|
36
|
+
# PerceptorGuard
|
|
37
|
+
|
|
38
|
+
A **perception evaluation harness** for YOLO detection models — slice-based metrics, CI regression gating, failure triage, and reproducible synthetic fixtures. Built to demonstrate that evaluation is an engineering discipline, not a one-liner.
|
|
39
|
+
|
|
40
|
+
---
|
|
41
|
+
|
|
42
|
+
## Quick start
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
# 1. Install
|
|
46
|
+
python -m venv .venv && source .venv/bin/activate
|
|
47
|
+
pip install -e ".[dev]" && pip install scikit-learn pyyaml jinja2
|
|
48
|
+
|
|
49
|
+
# 2. Generate a 20-scene dataset and run eval
|
|
50
|
+
python scripts/generate.py --count 20 --seed 42 --out artifacts/dataset
|
|
51
|
+
python scripts/run_eval.py --dataset artifacts/dataset --out artifacts/eval
|
|
52
|
+
|
|
53
|
+
# 3. Triage failures and generate report
|
|
54
|
+
python scripts/triage.py --matches artifacts/eval/matches.csv
|
|
55
|
+
python scripts/generate_report.py \
|
|
56
|
+
--dataset artifacts/dataset \
|
|
57
|
+
--eval artifacts/eval \
|
|
58
|
+
--baseline artifacts/baseline \
|
|
59
|
+
--triage artifacts/triage
|
|
60
|
+
# → open artifacts/report/report.html
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
**Planted-regression demo** (gate goes red, then green):
|
|
64
|
+
```bash
|
|
65
|
+
python scripts/save_baseline.py # promote current eval to baseline
|
|
66
|
+
python scripts/demo_regression.py # PASS → FAIL → PASS
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
---
|
|
70
|
+
|
|
71
|
+
## System overview
|
|
72
|
+
|
|
73
|
+
```
|
|
74
|
+
┌───────────────────────────────────────────────────────────────────┐
|
|
75
|
+
│ Scenario generator (PyBullet DIRECT) │
|
|
76
|
+
│ 6 challenge profiles × 2-tier object catalog (easy + hard) │
|
|
77
|
+
│ → manifest.csv + per-scene frame.png + ground_truth.json │
|
|
78
|
+
└──────────────────────────┬────────────────────────────────────────┘
|
|
79
|
+
│
|
|
80
|
+
┌──────────────────────────▼────────────────────────────────────────┐
|
|
81
|
+
│ Eval runner │
|
|
82
|
+
│ YOLO inference at conf=0.01 (full distribution for AP) │
|
|
83
|
+
│ Greedy class-aware bipartite matching (IoU ≥ 0.5) │
|
|
84
|
+
│ → matches.csv (tidy: one row per TP / FP / FN) │
|
|
85
|
+
└──────────────────────────┬────────────────────────────────────────┘
|
|
86
|
+
│
|
|
87
|
+
┌────────────────┴──────────────────┐
|
|
88
|
+
│ │
|
|
89
|
+
┌─────────▼──────────┐ ┌──────────▼──────────────────┐
|
|
90
|
+
│ Metrics engine │ │ Failure triage │
|
|
91
|
+
│ mAP (11-pt VOC) │ │ missed / localization / │
|
|
92
|
+
│ P/R/F1 @ 0.25 │ │ wrong_class / false_pos │
|
|
93
|
+
│ slice tables: │ │ KMeans cluster analysis │
|
|
94
|
+
│ profile / dist / │ │ → failures_classified.csv │
|
|
95
|
+
│ lighting / clutter│ │ → cluster_summary.csv │
|
|
96
|
+
│ tier / class │ └──────────────────────────────┘
|
|
97
|
+
└─────────┬──────────┘
|
|
98
|
+
│
|
|
99
|
+
┌─────────▼──────────────────────────────────────────────────────┐
|
|
100
|
+
│ Regression gate │
|
|
101
|
+
│ Compare current metrics vs baseline (artifacts/baseline/) │
|
|
102
|
+
│ 46 checks: mAP, AP/recall per class, recall/FP per profile │
|
|
103
|
+
│ Exit 1 on any regression beyond configured slack │
|
|
104
|
+
└─────────┬──────────────────────────────────────────────────────┘
|
|
105
|
+
│
|
|
106
|
+
┌─────────▼──────────────────────────────────────────────────────┐
|
|
107
|
+
│ Report renderer (Jinja2) │
|
|
108
|
+
│ Self-contained HTML + Markdown │
|
|
109
|
+
│ Slice tables, failure gallery, gate diff table │
|
|
110
|
+
│ Optional: W&B / MLflow experiment tracking │
|
|
111
|
+
└────────────────────────────────────────────────────────────────┘
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
**CI split** — deliberate engineering decision:
|
|
115
|
+
- **PR gate** (`.github/workflows/ci_gate.yml`): 20 scenes, seed=42, ~2 min. Catches regressions before they land in `main`.
|
|
116
|
+
- **Nightly** (`.github/workflows/nightly.yml`): 100 scenes, auto-promotes baseline on success. Authoritative quality bar.
|
|
117
|
+
|
|
118
|
+
---
|
|
119
|
+
|
|
120
|
+
## Key design decisions
|
|
121
|
+
|
|
122
|
+
### 1. Evaluation as a first-class engineering subsystem
|
|
123
|
+
|
|
124
|
+
Most perception teams treat evaluation as an afterthought: run a metric script, log the number, move on. The script is throw-away code; the number is a spreadsheet cell; there is no regression gate.
|
|
125
|
+
|
|
126
|
+
PerceptorGuard treats the eval pipeline with the same engineering discipline as the model itself:
|
|
127
|
+
|
|
128
|
+
| Property | Ad-hoc eval script | PerceptorGuard |
|
|
129
|
+
|----------|-------------------|----------------|
|
|
130
|
+
| Dataset | "whatever images we had" | Versioned, reproducible, committed |
|
|
131
|
+
| Metrics | Overall mAP only | Per-slice: profile, distance, lighting, clutter, tier, class |
|
|
132
|
+
| Failures | "recall is low" | Named failure modes with KMeans-clustered conditions |
|
|
133
|
+
| Regressions | Discovered in staging | Caught at PR time, 46 checks, named slice + delta |
|
|
134
|
+
| Reproducibility | "I think we used these settings" | Seed-fixed fixtures, LFS-tracked weights |
|
|
135
|
+
|
|
136
|
+
The eval harness is the **durable investment**. Models come and go; the harness lets you compare them honestly.
|
|
137
|
+
|
|
138
|
+
### 2. Model-agnostic interface
|
|
139
|
+
|
|
140
|
+
The harness has exactly one coupling point to YOLO: `EvalRunner._predict()` (12 lines in `runner/eval_runner.py`). The rest of the pipeline — matching, metrics, gating, triage, reporting — operates on `Detection` and `GroundTruth` Pydantic schemas.
|
|
141
|
+
|
|
142
|
+
Swapping YOLOv8n for YOLOv8x, an RT-DETR, or a custom model requires changing exactly that one function. You can A/B test models through the same harness and compare them on the same reproducible dataset without any eval-code changes.
|
|
143
|
+
|
|
144
|
+
This is the same principle I apply to RAG and agentic systems: the evaluation framework must be agnostic to the implementation choice, or you end up with evaluation that only works for the thing you already have.
|
|
145
|
+
|
|
146
|
+
### 3. CI gate blast-radius argument
|
|
147
|
+
|
|
148
|
+
Catching a regression has very different costs depending on where it surfaces:
|
|
149
|
+
|
|
150
|
+
| Where caught | Cost |
|
|
151
|
+
|-------------|------|
|
|
152
|
+
| At PR (CI gate) | 2 min CI compute |
|
|
153
|
+
| In staging after merge | Deploy + rollback + engineer-hours |
|
|
154
|
+
| In production | Incident response + user-trust loss |
|
|
155
|
+
|
|
156
|
+
The gate runs 46 checks per PR. Each check is a named (slice, metric) pair with an explicit floor: `floor = baseline_value − slack`. If any check fires, the gate exits 1 and names the regressed slice and delta. Engineers know exactly what broke and by how much — not "mAP went down a bit."
|
|
157
|
+
|
|
158
|
+
The 20-scene CI fast path is a **deliberate tradeoff**: 20 scenes is noisy enough that you'll miss subtle regressions, but it catches real structural breaks (IoU threshold bug, conf threshold change, class mapping error) in 2 minutes. The nightly full 100-scene run is the authoritative measurement. This split is explained in both workflow files.
|
|
159
|
+
|
|
160
|
+
### 4. Two-tier object catalog
|
|
161
|
+
|
|
162
|
+
```
|
|
163
|
+
Easy (COCO-recognizable): cup, bottle, bowl, teddy bear, sports ball
|
|
164
|
+
Hard (off-vocabulary): cube, duck, lego, domino
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
The split gives you two signals simultaneously:
|
|
168
|
+
- **Easy tier** measures how much signal you can extract from a COCO-pretrained backbone. Any recall at all indicates some domain-transfer.
|
|
169
|
+
- **Hard tier** measures true zero-shot generalisation. Near-zero AP here is expected and honest — it's the documented result, not a bug.
|
|
170
|
+
|
|
171
|
+
Running only easy classes would give you false confidence. Running only hard classes would give you nothing to gate on. The mix is deliberate.
|
|
172
|
+
|
|
173
|
+
### 5. Sub-threshold IoU enrichment for FN rows
|
|
174
|
+
|
|
175
|
+
The matcher records `best_iou_any_class` and `best_pred_class_at_overlap` for every FN row. This enables the failure classifier to distinguish:
|
|
176
|
+
|
|
177
|
+
- `missed_detection` — no prediction overlapped the GT (IoU < 0.1 from any box)
|
|
178
|
+
- `localization_error` — right class, right location, IoU just below threshold
|
|
179
|
+
- `wrong_class` — something overlaps (IoU ≥ 0.1) but with the wrong class label
|
|
180
|
+
|
|
181
|
+
These are **different engineering problems** requiring different interventions. Knowing you have 123 `wrong_class` failures in near-distance crowded scenes (the actual finding) is actionable. Knowing "FN = 355" is not.
|
|
182
|
+
|
|
183
|
+
---
|
|
184
|
+
|
|
185
|
+
## Findings from the pilot run (YOLOv8n, 100 scenes)
|
|
186
|
+
|
|
187
|
+
```
|
|
188
|
+
Overall: mAP=0.1% Precision=0% Recall=0%
|
|
189
|
+
TP=0 FP=27 FN=355 (at op-point conf≥0.25)
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
The headline finding is a **complete domain gap**: YOLOv8n, pretrained on COCO, achieves 0% recall on PyBullet synthetic renders at the conf≥0.25 operating point (TP=0). This is expected and honest.
|
|
193
|
+
|
|
194
|
+
**The only live signal: sports ball, AP=0.9%** — the soccerball URDF has a realistic texture that partially overlaps the COCO training distribution. Two sub-threshold predictions match GT at IoU≥0.5 during the full-distribution AP sweep.
|
|
195
|
+
|
|
196
|
+
**Failure mode breakdown (771 failures):**
|
|
197
|
+
1. `false_positive` (54%) — 416 hallucinations. The model confidently detects objects that aren't there (chairs, people, cars) because PyBullet renders look like partial-context frames from real images.
|
|
198
|
+
2. `missed_detection` (30%) — 232 GTs with zero model signal. Worst in crowded + far profiles; domino and lego lead by class.
|
|
199
|
+
3. `wrong_class` (16%) — 123 GTs where a prediction overlaps but carries the wrong label. Cube accounts for 34% of wrong-class failures — the model recognises a rectangular object but can't resolve the class.
|
|
200
|
+
|
|
201
|
+
**Cluster insight**: KMeans (k=5) on [camera_distance, ambient_light, num_objects, failure_mode, tier, profile] identifies two pure FP clusters (100% false positives at mid-distance and near-distance), a mixed missed+wrong-class cluster under crowded conditions, and a distinct dark-scene cluster with different FP characteristics.
|
|
202
|
+
|
|
203
|
+
**What this means for next steps:**
|
|
204
|
+
- The gap is at the distribution level, not the architecture level. Domain randomization of textures + sim-to-real transfer is the right intervention, not model scaling.
|
|
205
|
+
- The FP flood is a calibration problem. A classification head fine-tuned on synthetic negatives would reduce it dramatically.
|
|
206
|
+
- The wrong-class failures on cube/domino suggest that geometric shape features are present but class-label resolution requires task-specific training.
|
|
207
|
+
|
|
208
|
+
---
|
|
209
|
+
|
|
210
|
+
## Planted-regression demo
|
|
211
|
+
|
|
212
|
+
The gate verifiably catches regression and names the failing slice:
|
|
213
|
+
|
|
214
|
+
```
|
|
215
|
+
$ python scripts/demo_regression.py
|
|
216
|
+
|
|
217
|
+
DEMO STEP 1 — Gate on current (good) metrics
|
|
218
|
+
→ PASSED — all 46 checks within threshold
|
|
219
|
+
|
|
220
|
+
DEMO STEP 2 — Plant regression: zero sports-ball AP
|
|
221
|
+
(simulates iou_threshold cranked to 0.9)
|
|
222
|
+
→ FAILED — 1 regression(s) detected
|
|
223
|
+
✗ class:sports ball / ap
|
|
224
|
+
baseline=0.0091 current=0.0000 floor=0.0041 delta=-0.0091
|
|
225
|
+
|
|
226
|
+
DEMO STEP 3 — Restore
|
|
227
|
+
→ PASSED — all 46 checks within threshold
|
|
228
|
+
|
|
229
|
+
Result: PASS → FAIL → PASS ✓ (gate behaves correctly)
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
To trigger this in real CI, open a PR that sets `--iou 0.9` for the eval runner. The CI workflow runs, the sports-ball AP drops from 0.9% to 0%, the gate exits 1, and the PR check fails — naming the regressed slice and delta. Revert → green.
|
|
233
|
+
|
|
234
|
+
---
|
|
235
|
+
|
|
236
|
+
## Cross-domain principle: eval as infrastructure
|
|
237
|
+
|
|
238
|
+
The architectural insight that generalises across ML domains:
|
|
239
|
+
|
|
240
|
+
> **Evaluation should be a first-class subsystem, not an afterthought. It must be model-agnostic, slice-aware, and wired into CI.**
|
|
241
|
+
|
|
242
|
+
I've applied the same principle in three domains:
|
|
243
|
+
|
|
244
|
+
| Domain | What gets evaluated | The harness checks |
|
|
245
|
+
|--------|--------------------|--------------------|
|
|
246
|
+
| **Perception (this project)** | YOLO detector | Per-slice mAP, FP rate, localization quality |
|
|
247
|
+
| **RAG systems** | Retriever + LLM | Retrieval recall, answer faithfulness, citation precision |
|
|
248
|
+
| **Agentic systems** | Tool-use agent | Task completion rate, tool selection accuracy, latency |
|
|
249
|
+
|
|
250
|
+
In each case:
|
|
251
|
+
- The harness is decoupled from the implementation (model-agnostic interface)
|
|
252
|
+
- Metrics are sliced by the conditions that matter (difficulty, distance, topic domain, query type)
|
|
253
|
+
- A baseline is stored and regressions are caught before they reach production
|
|
254
|
+
- Failures are named, not just counted
|
|
255
|
+
|
|
256
|
+
The model changes; the eval discipline doesn't. This is the engineering investment that compounds.
|
|
257
|
+
|
|
258
|
+
---
|
|
259
|
+
|
|
260
|
+
## What I'd do differently
|
|
261
|
+
|
|
262
|
+
**1. Domain randomization before domain gap is "interesting"**
|
|
263
|
+
PyBullet renders are too synthetic. Before drawing any production conclusions, I'd add texture randomization (PBR materials, HDRI backgrounds), noise augmentation, and random object scales. The 0% recall result is expected — but a more realistic synthetic distribution would push that to a meaningful non-zero baseline worth gating on.
|
|
264
|
+
|
|
265
|
+
**2. Real-image holdout**
|
|
266
|
+
The synthetic-to-real gap is documented but not measured. A small (50-100 image) real-world validation set, with the same object categories, would quantify the gap. This is the honest thing to do before claiming the eval harness is production-relevant.
|
|
267
|
+
|
|
268
|
+
**3. Active learning loop**
|
|
269
|
+
The triage output (failure mode distribution, KMeans clusters) should feed back into the scenario generator: generate more scenes matching the hardest cluster conditions. Right now triage is a report; it should be a signal that drives the next data generation run.
|
|
270
|
+
|
|
271
|
+
**4. Temporal and latency eval**
|
|
272
|
+
`match_scene` operates on single frames. Real robot perception needs tracking across frames, trajectory prediction, and FPS budgeting. None of that is here.
|
|
273
|
+
|
|
274
|
+
**5. Confidence calibration**
|
|
275
|
+
The op-point threshold is set at conf≥0.25 by convention. A calibration sweep (precision-recall curve analysis by condition) would let you set per-slice thresholds that reflect the actual operating point you need — not a fixed global number.
|
|
276
|
+
|
|
277
|
+
---
|
|
278
|
+
|
|
279
|
+
## Repository structure
|
|
280
|
+
|
|
281
|
+
```
|
|
282
|
+
perceptorguard/
|
|
283
|
+
├── scenarios/ Pydantic schemas, parameterized scenario generator
|
|
284
|
+
│ ├── schemas.py BoundingBox, Detection, GroundTruth, Scenario, ObjectSpec
|
|
285
|
+
│ └── generator.py 6 profiles × 2-tier catalog, occluder placement
|
|
286
|
+
├── runner/ Inference + matching + GT extraction
|
|
287
|
+
│ ├── scene_runner.py PyBullet DIRECT renderer, AABB→screen projection
|
|
288
|
+
│ ├── eval_runner.py YOLO inference loop, bin assignment
|
|
289
|
+
│ └── matcher.py Greedy bipartite match; FN rows carry sub-threshold IoU
|
|
290
|
+
├── metrics/ Metrics, triage, reporting
|
|
291
|
+
│ ├── engine.py 11-pt VOC AP, operating-point P/R/F1, sliced tables
|
|
292
|
+
│ ├── failure_classifier.py missed / localization / wrong_class / fp
|
|
293
|
+
│ ├── cluster_analyzer.py KMeans on scenario feature vector
|
|
294
|
+
│ ├── triage_reporter.py Ranked failure-mode summary
|
|
295
|
+
│ └── reporter.py ASCII console report
|
|
296
|
+
├── gates/ Regression gate
|
|
297
|
+
│ ├── thresholds.py GateThresholds dataclass, YAML-backed
|
|
298
|
+
│ ├── comparator.py 46-check comparison: mAP, AP, recall, FP per slice
|
|
299
|
+
│ └── gate_runner.py Print report, return bool, exit 1 on failure
|
|
300
|
+
├── reports/ Report rendering
|
|
301
|
+
│ ├── annotator.py Annotate failure scenes (GT boxes, missed/detected)
|
|
302
|
+
│ ├── renderer.py Jinja2 → HTML + Markdown
|
|
303
|
+
│ ├── tracker.py W&B + MLflow optional integration
|
|
304
|
+
│ └── templates/ report.html.j2, report.md.j2
|
|
305
|
+
├── scripts/
|
|
306
|
+
│ ├── generate.py Dataset generation CLI
|
|
307
|
+
│ ├── run_eval.py Eval CLI (--model, --iou, --imgsz)
|
|
308
|
+
│ ├── triage.py Failure triage CLI
|
|
309
|
+
│ ├── run_gate.py Gate CLI (exit 0/1)
|
|
310
|
+
│ ├── save_baseline.py Promote eval → baseline
|
|
311
|
+
│ ├── generate_report.py Report generation CLI
|
|
312
|
+
│ └── demo_regression.py Planted-regression demo
|
|
313
|
+
├── tests/ 74 unit tests (all pass)
|
|
314
|
+
│ ├── test_matcher.py, test_metrics.py
|
|
315
|
+
│ ├── test_triage.py, test_gates.py
|
|
316
|
+
│ └── verify_chunk2.py (55 GT-pipeline invariant tests)
|
|
317
|
+
├── assets/ Custom URDFs (bottle.urdf, bowl.urdf)
|
|
318
|
+
├── configs/
|
|
319
|
+
│ └── gate_thresholds.yml Tunable slack per metric
|
|
320
|
+
├── artifacts/
|
|
321
|
+
│ ├── baseline/ 100-scene golden reference (committed)
|
|
322
|
+
│ ├── ci_baseline/ 20-scene CI reference (committed, seed=42)
|
|
323
|
+
│ ├── ci_dataset/ Reproducible 20-scene fixture (committed, LFS)
|
|
324
|
+
│ ├── eval/ Full 100-scene eval output
|
|
325
|
+
│ └── triage/ Failure classification + cluster summary
|
|
326
|
+
└── .github/
|
|
327
|
+
├── workflows/ci_gate.yml PR: 20 scenes, ~2 min
|
|
328
|
+
└── workflows/nightly.yml Scheduled: 100 scenes, auto-promote baseline
|
|
329
|
+
```
|
|
330
|
+
|
|
331
|
+
---
|
|
332
|
+
|
|
333
|
+
## Running the full test suite
|
|
334
|
+
|
|
335
|
+
```bash
|
|
336
|
+
pytest tests/ -v
|
|
337
|
+
# → 74 passed
|
|
338
|
+
```
|
|
339
|
+
|
|
340
|
+
Tests are hermetic — no model inference, no disk artifacts required. The 55 `verify_chunk2.py` tests validate the GT pipeline (AABB projection, occlusion geometry, multi-object placement) against hardcoded fixtures. The 12 `test_matcher.py` tests cover IoU edge cases and greedy matching invariants. The 15 `test_gates.py` tests cover threshold loading, regression detection (mAP drop, FP spike, recall drop), and gate-runner pass/fail return values.
|
|
@@ -0,0 +1,305 @@
|
|
|
1
|
+
# PerceptorGuard
|
|
2
|
+
|
|
3
|
+
A **perception evaluation harness** for YOLO detection models — slice-based metrics, CI regression gating, failure triage, and reproducible synthetic fixtures. Built to demonstrate that evaluation is an engineering discipline, not a one-liner.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## Quick start
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
# 1. Install
|
|
11
|
+
python -m venv .venv && source .venv/bin/activate
|
|
12
|
+
pip install -e ".[dev]" && pip install scikit-learn pyyaml jinja2
|
|
13
|
+
|
|
14
|
+
# 2. Generate a 20-scene dataset and run eval
|
|
15
|
+
python scripts/generate.py --count 20 --seed 42 --out artifacts/dataset
|
|
16
|
+
python scripts/run_eval.py --dataset artifacts/dataset --out artifacts/eval
|
|
17
|
+
|
|
18
|
+
# 3. Triage failures and generate report
|
|
19
|
+
python scripts/triage.py --matches artifacts/eval/matches.csv
|
|
20
|
+
python scripts/generate_report.py \
|
|
21
|
+
--dataset artifacts/dataset \
|
|
22
|
+
--eval artifacts/eval \
|
|
23
|
+
--baseline artifacts/baseline \
|
|
24
|
+
--triage artifacts/triage
|
|
25
|
+
# → open artifacts/report/report.html
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
**Planted-regression demo** (gate goes red, then green):
|
|
29
|
+
```bash
|
|
30
|
+
python scripts/save_baseline.py # promote current eval to baseline
|
|
31
|
+
python scripts/demo_regression.py # PASS → FAIL → PASS
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
## System overview
|
|
37
|
+
|
|
38
|
+
```
|
|
39
|
+
┌───────────────────────────────────────────────────────────────────┐
|
|
40
|
+
│ Scenario generator (PyBullet DIRECT) │
|
|
41
|
+
│ 6 challenge profiles × 2-tier object catalog (easy + hard) │
|
|
42
|
+
│ → manifest.csv + per-scene frame.png + ground_truth.json │
|
|
43
|
+
└──────────────────────────┬────────────────────────────────────────┘
|
|
44
|
+
│
|
|
45
|
+
┌──────────────────────────▼────────────────────────────────────────┐
|
|
46
|
+
│ Eval runner │
|
|
47
|
+
│ YOLO inference at conf=0.01 (full distribution for AP) │
|
|
48
|
+
│ Greedy class-aware bipartite matching (IoU ≥ 0.5) │
|
|
49
|
+
│ → matches.csv (tidy: one row per TP / FP / FN) │
|
|
50
|
+
└──────────────────────────┬────────────────────────────────────────┘
|
|
51
|
+
│
|
|
52
|
+
┌────────────────┴──────────────────┐
|
|
53
|
+
│ │
|
|
54
|
+
┌─────────▼──────────┐ ┌──────────▼──────────────────┐
|
|
55
|
+
│ Metrics engine │ │ Failure triage │
|
|
56
|
+
│ mAP (11-pt VOC) │ │ missed / localization / │
|
|
57
|
+
│ P/R/F1 @ 0.25 │ │ wrong_class / false_pos │
|
|
58
|
+
│ slice tables: │ │ KMeans cluster analysis │
|
|
59
|
+
│ profile / dist / │ │ → failures_classified.csv │
|
|
60
|
+
│ lighting / clutter│ │ → cluster_summary.csv │
|
|
61
|
+
│ tier / class │ └──────────────────────────────┘
|
|
62
|
+
└─────────┬──────────┘
|
|
63
|
+
│
|
|
64
|
+
┌─────────▼──────────────────────────────────────────────────────┐
|
|
65
|
+
│ Regression gate │
|
|
66
|
+
│ Compare current metrics vs baseline (artifacts/baseline/) │
|
|
67
|
+
│ 46 checks: mAP, AP/recall per class, recall/FP per profile │
|
|
68
|
+
│ Exit 1 on any regression beyond configured slack │
|
|
69
|
+
└─────────┬──────────────────────────────────────────────────────┘
|
|
70
|
+
│
|
|
71
|
+
┌─────────▼──────────────────────────────────────────────────────┐
|
|
72
|
+
│ Report renderer (Jinja2) │
|
|
73
|
+
│ Self-contained HTML + Markdown │
|
|
74
|
+
│ Slice tables, failure gallery, gate diff table │
|
|
75
|
+
│ Optional: W&B / MLflow experiment tracking │
|
|
76
|
+
└────────────────────────────────────────────────────────────────┘
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
**CI split** — deliberate engineering decision:
|
|
80
|
+
- **PR gate** (`.github/workflows/ci_gate.yml`): 20 scenes, seed=42, ~2 min. Catches regressions before they land in `main`.
|
|
81
|
+
- **Nightly** (`.github/workflows/nightly.yml`): 100 scenes, auto-promotes baseline on success. Authoritative quality bar.
|
|
82
|
+
|
|
83
|
+
---
|
|
84
|
+
|
|
85
|
+
## Key design decisions
|
|
86
|
+
|
|
87
|
+
### 1. Evaluation as a first-class engineering subsystem
|
|
88
|
+
|
|
89
|
+
Most perception teams treat evaluation as an afterthought: run a metric script, log the number, move on. The script is throw-away code; the number is a spreadsheet cell; there is no regression gate.
|
|
90
|
+
|
|
91
|
+
PerceptorGuard treats the eval pipeline with the same engineering discipline as the model itself:
|
|
92
|
+
|
|
93
|
+
| Property | Ad-hoc eval script | PerceptorGuard |
|
|
94
|
+
|----------|-------------------|----------------|
|
|
95
|
+
| Dataset | "whatever images we had" | Versioned, reproducible, committed |
|
|
96
|
+
| Metrics | Overall mAP only | Per-slice: profile, distance, lighting, clutter, tier, class |
|
|
97
|
+
| Failures | "recall is low" | Named failure modes with KMeans-clustered conditions |
|
|
98
|
+
| Regressions | Discovered in staging | Caught at PR time, 46 checks, named slice + delta |
|
|
99
|
+
| Reproducibility | "I think we used these settings" | Seed-fixed fixtures, LFS-tracked weights |
|
|
100
|
+
|
|
101
|
+
The eval harness is the **durable investment**. Models come and go; the harness lets you compare them honestly.
|
|
102
|
+
|
|
103
|
+
### 2. Model-agnostic interface
|
|
104
|
+
|
|
105
|
+
The harness has exactly one coupling point to YOLO: `EvalRunner._predict()` (12 lines in `runner/eval_runner.py`). The rest of the pipeline — matching, metrics, gating, triage, reporting — operates on `Detection` and `GroundTruth` Pydantic schemas.
|
|
106
|
+
|
|
107
|
+
Swapping YOLOv8n for YOLOv8x, an RT-DETR, or a custom model requires changing exactly that one function. You can A/B test models through the same harness and compare them on the same reproducible dataset without any eval-code changes.
|
|
108
|
+
|
|
109
|
+
This is the same principle I apply to RAG and agentic systems: the evaluation framework must be agnostic to the implementation choice, or you end up with evaluation that only works for the thing you already have.
|
|
110
|
+
|
|
111
|
+
### 3. CI gate blast-radius argument
|
|
112
|
+
|
|
113
|
+
Catching a regression has very different costs depending on where it surfaces:
|
|
114
|
+
|
|
115
|
+
| Where caught | Cost |
|
|
116
|
+
|-------------|------|
|
|
117
|
+
| At PR (CI gate) | 2 min CI compute |
|
|
118
|
+
| In staging after merge | Deploy + rollback + engineer-hours |
|
|
119
|
+
| In production | Incident response + user-trust loss |
|
|
120
|
+
|
|
121
|
+
The gate runs 46 checks per PR. Each check is a named (slice, metric) pair with an explicit floor: `floor = baseline_value − slack`. If any check fires, the gate exits 1 and names the regressed slice and delta. Engineers know exactly what broke and by how much — not "mAP went down a bit."
|
|
122
|
+
|
|
123
|
+
The 20-scene CI fast path is a **deliberate tradeoff**: 20 scenes is noisy enough that you'll miss subtle regressions, but it catches real structural breaks (IoU threshold bug, conf threshold change, class mapping error) in 2 minutes. The nightly full 100-scene run is the authoritative measurement. This split is explained in both workflow files.
|
|
124
|
+
|
|
125
|
+
### 4. Two-tier object catalog
|
|
126
|
+
|
|
127
|
+
```
|
|
128
|
+
Easy (COCO-recognizable): cup, bottle, bowl, teddy bear, sports ball
|
|
129
|
+
Hard (off-vocabulary): cube, duck, lego, domino
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
The split gives you two signals simultaneously:
|
|
133
|
+
- **Easy tier** measures how much signal you can extract from a COCO-pretrained backbone. Any recall at all indicates some domain transfer.
|
|
134
|
+
- **Hard tier** measures true zero-shot generalisation. Near-zero AP here is expected and honest — it's the documented result, not a bug.
|
|
135
|
+
|
|
136
|
+
Running only easy classes would give you false confidence. Running only hard classes would give you nothing to gate on. The mix is deliberate.
|
|
137
|
+
|
|
138
|
+
### 5. Sub-threshold IoU enrichment for FN rows
|
|
139
|
+
|
|
140
|
+
The matcher records `best_iou_any_class` and `best_pred_class_at_overlap` for every FN row. This enables the failure classifier to distinguish:
|
|
141
|
+
|
|
142
|
+
- `missed_detection` — no prediction overlapped the GT (IoU < 0.1 from any box)
|
|
143
|
+
- `localization_error` — right class, right location, IoU just below threshold
|
|
144
|
+
- `wrong_class` — something overlaps (IoU ≥ 0.1) but with the wrong class label
|
|
145
|
+
|
|
146
|
+
These are **different engineering problems** requiring different interventions. Knowing you have 123 `wrong_class` failures in near-distance crowded scenes (the actual finding) is actionable. Knowing "FN = 355" is not.
|
|
147
|
+
|
|
148
|
+
---
|
|
149
|
+
|
|
150
|
+
## Findings from the pilot run (YOLOv8n, 100 scenes)
|
|
151
|
+
|
|
152
|
+
```
|
|
153
|
+
Overall: mAP=0.1% Precision=0% Recall=0%
|
|
154
|
+
TP=0 FP=27 FN=355 (at op-point conf≥0.25)
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
The headline finding is a **complete domain gap**: YOLOv8n, pretrained on COCO, achieves 0% recall on PyBullet synthetic renders at the conf≥0.25 operating point. This is expected and honest.
|
|
158
|
+
|
|
159
|
+
**The only live signal: sports ball, AP=0.9%** — the soccerball URDF has a realistic texture that partially overlaps the COCO training distribution. Two sub-threshold predictions match GT at IoU≥0.5 during the full-distribution AP sweep.
|
|
160
|
+
|
|
161
|
+
**Failure mode breakdown (771 failures):**
|
|
162
|
+
1. `false_positive` (54%) — 416 hallucinations. The model confidently detects objects that aren't there (chairs, people, cars) because PyBullet renders look like partial-context frames from real images.
|
|
163
|
+
2. `missed_detection` (30%) — 232 GTs with zero model signal. Worst in crowded + far profiles; domino and lego lead by class.
|
|
164
|
+
3. `wrong_class` (16%) — 123 GTs where a prediction overlaps but carries the wrong label. Cube accounts for 34% of wrong-class failures — the model recognises a rectangular object but can't resolve the class.
|
|
165
|
+
|
|
166
|
+
**Cluster insight**: KMeans (k=5) on [camera_distance, ambient_light, num_objects, failure_mode, tier, profile] identifies two pure FP clusters (100% false positives at mid-distance and near-distance), a mixed missed+wrong-class cluster under crowded conditions, and a distinct dark-scene cluster with different FP characteristics.
|
|
167
|
+
|
|
168
|
+
**What this means for next steps:**
|
|
169
|
+
- The gap is at the distribution level, not the architecture level. Domain randomization of textures + sim-to-real transfer is the right intervention, not model scaling.
|
|
170
|
+
- The FP flood is a calibration problem. A classification head fine-tuned on synthetic negatives would reduce it dramatically.
|
|
171
|
+
- The wrong-class failures on cube/domino suggest that geometric shape features are present but class-label resolution requires task-specific training.
|
|
172
|
+
|
|
173
|
+
---
|
|
174
|
+
|
|
175
|
+
## Planted-regression demo
|
|
176
|
+
|
|
177
|
+
The gate verifiably catches a regression and names the failing slice:
|
|
178
|
+
|
|
179
|
+
```
|
|
180
|
+
$ python scripts/demo_regression.py
|
|
181
|
+
|
|
182
|
+
DEMO STEP 1 — Gate on current (good) metrics
|
|
183
|
+
→ PASSED — all 46 checks within threshold
|
|
184
|
+
|
|
185
|
+
DEMO STEP 2 — Plant regression: zero sports-ball AP
|
|
186
|
+
(simulates iou_threshold cranked to 0.9)
|
|
187
|
+
→ FAILED — 1 regression(s) detected
|
|
188
|
+
✗ class:sports ball / ap
|
|
189
|
+
baseline=0.0091 current=0.0000 floor=0.0041 delta=-0.0091
|
|
190
|
+
|
|
191
|
+
DEMO STEP 3 — Restore
|
|
192
|
+
→ PASSED — all 46 checks within threshold
|
|
193
|
+
|
|
194
|
+
Result: PASS → FAIL → PASS ✓ (gate behaves correctly)
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
To trigger this in real CI, open a PR that sets `--iou 0.9` in the eval runner. The CI workflow runs, the sports-ball AP drops from 0.9% to 0%, the gate exits 1, and the PR check fails — naming the regressed slice and delta. Revert → green.
|
|
198
|
+
|
|
199
|
+
---
|
|
200
|
+
|
|
201
|
+
## Cross-domain principle: eval as infrastructure
|
|
202
|
+
|
|
203
|
+
The architectural insight that generalises across ML domains:
|
|
204
|
+
|
|
205
|
+
> **Evaluation should be a first-class subsystem, not an afterthought. It must be model-agnostic, slice-aware, and wired into CI.**
|
|
206
|
+
|
|
207
|
+
I've applied the same principle in three domains:
|
|
208
|
+
|
|
209
|
+
| Domain | What gets evaluated | The harness checks |
|
|
210
|
+
|--------|--------------------|--------------------|
|
|
211
|
+
| **Perception (this project)** | YOLO detector | Per-slice mAP, FP rate, localization quality |
|
|
212
|
+
| **RAG systems** | Retriever + LLM | Retrieval recall, answer faithfulness, citation precision |
|
|
213
|
+
| **Agentic systems** | Tool-use agent | Task completion rate, tool selection accuracy, latency |
|
|
214
|
+
|
|
215
|
+
In each case:
|
|
216
|
+
- The harness is decoupled from the implementation (model-agnostic interface)
|
|
217
|
+
- Metrics are sliced by the conditions that matter (difficulty, distance, topic domain, query type)
|
|
218
|
+
- A baseline is stored and regressions are caught before they reach production
|
|
219
|
+
- Failures are named, not just counted
|
|
220
|
+
|
|
221
|
+
The model changes; the eval discipline doesn't. This is the engineering investment that compounds.
|
|
222
|
+
|
|
223
|
+
---
|
|
224
|
+
|
|
225
|
+
## What I'd do differently
|
|
226
|
+
|
|
227
|
+
**1. Domain randomization before domain gap is "interesting"**
|
|
228
|
+
PyBullet renders are too synthetic. Before drawing any production conclusions, I'd add texture randomization (PBR materials, HDRI backgrounds), noise augmentation, and random object scales. The 0% recall result is expected — but a more realistic synthetic distribution would push that to a meaningful non-zero baseline worth gating on.
|
|
229
|
+
|
|
230
|
+
**2. Real-image holdout**
|
|
231
|
+
The synthetic-to-real gap is documented but not measured. A small (50-100 image) real-world validation set, with the same object categories, would quantify the gap. This is the honest thing to do before claiming the eval harness is production-relevant.
|
|
232
|
+
|
|
233
|
+
**3. Active learning loop**
|
|
234
|
+
The triage output (failure mode distribution, KMeans clusters) should feed back into the scenario generator: generate more scenes matching the hardest cluster conditions. Right now triage is a report; it should be a signal that drives the next data generation run.
|
|
235
|
+
|
|
236
|
+
**4. Temporal and latency eval**
|
|
237
|
+
`match_scene` operates on single frames. Real robot perception needs tracking across frames, trajectory prediction, and FPS budgeting. None of that is here.
|
|
238
|
+
|
|
239
|
+
**5. Confidence calibration**
|
|
240
|
+
The op-point threshold is set at conf≥0.25 by convention. A calibration sweep (precision-recall curve analysis by condition) would let you set per-slice thresholds that reflect the actual operating point you need — not a fixed global number.
|
|
241
|
+
|
|
242
|
+
---
|
|
243
|
+
|
|
244
|
+
## Repository structure
|
|
245
|
+
|
|
246
|
+
```
|
|
247
|
+
perceptorguard/
|
|
248
|
+
├── scenarios/ Pydantic schemas, parameterized scenario generator
|
|
249
|
+
│ ├── schemas.py BoundingBox, Detection, GroundTruth, Scenario, ObjectSpec
|
|
250
|
+
│ └── generator.py 6 profiles × 2-tier catalog, occluder placement
|
|
251
|
+
├── runner/ Inference + matching + GT extraction
|
|
252
|
+
│ ├── scene_runner.py PyBullet DIRECT renderer, AABB→screen projection
|
|
253
|
+
│ ├── eval_runner.py YOLO inference loop, bin assignment
|
|
254
|
+
│ └── matcher.py Greedy bipartite match; FN rows carry sub-threshold IoU
|
|
255
|
+
├── metrics/ Metrics, triage, reporting
|
|
256
|
+
│ ├── engine.py 11-pt VOC AP, operating-point P/R/F1, sliced tables
|
|
257
|
+
│ ├── failure_classifier.py missed / localization / wrong_class / fp
|
|
258
|
+
│ ├── cluster_analyzer.py KMeans on scenario feature vector
|
|
259
|
+
│ ├── triage_reporter.py Ranked failure-mode summary
|
|
260
|
+
│ └── reporter.py ASCII console report
|
|
261
|
+
├── gates/ Regression gate
|
|
262
|
+
│ ├── thresholds.py GateThresholds dataclass, YAML-backed
|
|
263
|
+
│ ├── comparator.py 46-check comparison: mAP, AP, recall, FP per slice
|
|
264
|
+
│ └── gate_runner.py Print report, return bool, exit 1 on failure
|
|
265
|
+
├── reports/ Report rendering
|
|
266
|
+
│ ├── annotator.py Annotate failure scenes (GT boxes, missed/detected)
|
|
267
|
+
│ ├── renderer.py Jinja2 → HTML + Markdown
|
|
268
|
+
│ ├── tracker.py W&B + MLflow optional integration
|
|
269
|
+
│ └── templates/ report.html.j2, report.md.j2
|
|
270
|
+
├── scripts/
|
|
271
|
+
│ ├── generate.py Dataset generation CLI
|
|
272
|
+
│ ├── run_eval.py Eval CLI (--model, --iou, --imgsz)
|
|
273
|
+
│ ├── triage.py Failure triage CLI
|
|
274
|
+
│ ├── run_gate.py Gate CLI (exit 0/1)
|
|
275
|
+
│ ├── save_baseline.py Promote eval → baseline
|
|
276
|
+
│ ├── generate_report.py Report generation CLI
|
|
277
|
+
│ └── demo_regression.py Planted-regression demo
|
|
278
|
+
├── tests/ 74 unit tests (all pass)
|
|
279
|
+
│ ├── test_matcher.py, test_metrics.py
|
|
280
|
+
│ ├── test_triage.py, test_gates.py
|
|
281
|
+
│ └── verify_chunk2.py (55 GT-pipeline invariant tests)
|
|
282
|
+
├── assets/ Custom URDFs (bottle.urdf, bowl.urdf)
|
|
283
|
+
├── configs/
|
|
284
|
+
│ └── gate_thresholds.yml Tunable slack per metric
|
|
285
|
+
├── artifacts/
|
|
286
|
+
│ ├── baseline/ 100-scene golden reference (committed)
|
|
287
|
+
│ ├── ci_baseline/ 20-scene CI reference (committed, seed=42)
|
|
288
|
+
│ ├── ci_dataset/ Reproducible 20-scene fixture (committed, LFS)
|
|
289
|
+
│ ├── eval/ Full 100-scene eval output
|
|
290
|
+
│ └── triage/ Failure classification + cluster summary
|
|
291
|
+
└── .github/
|
|
292
|
+
├── workflows/ci_gate.yml PR: 20 scenes, ~2 min
|
|
293
|
+
└── workflows/nightly.yml Scheduled: 100 scenes, auto-promote baseline
|
|
294
|
+
```
|
|
295
|
+
|
|
296
|
+
---
|
|
297
|
+
|
|
298
|
+
## Running the full test suite
|
|
299
|
+
|
|
300
|
+
```bash
|
|
301
|
+
pytest tests/ -v
|
|
302
|
+
# → 74 passed
|
|
303
|
+
```
|
|
304
|
+
|
|
305
|
+
Tests are hermetic — no model inference, no disk artifacts required. The 55 `verify_chunk2.py` tests validate the GT pipeline (AABB projection, occlusion geometry, multi-object placement) against hardcoded fixtures. The 12 `test_matcher.py` tests cover IoU edge cases and greedy matching invariants. The 15 `test_gates.py` tests cover threshold loading, regression detection (mAP drop, FP spike, recall drop), and gate-runner pass/fail return values.
|