perceptorguard 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. perceptorguard-0.1.0/LICENSE +21 -0
  2. perceptorguard-0.1.0/PKG-INFO +340 -0
  3. perceptorguard-0.1.0/README.md +305 -0
  4. perceptorguard-0.1.0/gates/__init__.py +0 -0
  5. perceptorguard-0.1.0/gates/comparator.py +183 -0
  6. perceptorguard-0.1.0/gates/gate_runner.py +67 -0
  7. perceptorguard-0.1.0/gates/thresholds.py +44 -0
  8. perceptorguard-0.1.0/ingestion/__init__.py +0 -0
  9. perceptorguard-0.1.0/ingestion/class_map.py +185 -0
  10. perceptorguard-0.1.0/ingestion/coco_gt.py +123 -0
  11. perceptorguard-0.1.0/ingestion/coco_predictions.py +126 -0
  12. perceptorguard-0.1.0/ingestion/metadata_csv.py +138 -0
  13. perceptorguard-0.1.0/ingestion/slice_inferrer.py +188 -0
  14. perceptorguard-0.1.0/metrics/__init__.py +0 -0
  15. perceptorguard-0.1.0/metrics/cluster_analyzer.py +95 -0
  16. perceptorguard-0.1.0/metrics/engine.py +131 -0
  17. perceptorguard-0.1.0/metrics/failure_classifier.py +55 -0
  18. perceptorguard-0.1.0/metrics/reporter.py +160 -0
  19. perceptorguard-0.1.0/metrics/triage_reporter.py +94 -0
  20. perceptorguard-0.1.0/perceptorguard/__init__.py +0 -0
  21. perceptorguard-0.1.0/perceptorguard/cli.py +169 -0
  22. perceptorguard-0.1.0/perceptorguard.egg-info/PKG-INFO +340 -0
  23. perceptorguard-0.1.0/perceptorguard.egg-info/SOURCES.txt +61 -0
  24. perceptorguard-0.1.0/perceptorguard.egg-info/dependency_links.txt +1 -0
  25. perceptorguard-0.1.0/perceptorguard.egg-info/entry_points.txt +2 -0
  26. perceptorguard-0.1.0/perceptorguard.egg-info/requires.txt +16 -0
  27. perceptorguard-0.1.0/perceptorguard.egg-info/top_level.txt +14 -0
  28. perceptorguard-0.1.0/pyproject.toml +53 -0
  29. perceptorguard-0.1.0/reports/__init__.py +0 -0
  30. perceptorguard-0.1.0/reports/annotator.py +173 -0
  31. perceptorguard-0.1.0/reports/renderer.py +185 -0
  32. perceptorguard-0.1.0/reports/tracker.py +111 -0
  33. perceptorguard-0.1.0/runner/__init__.py +0 -0
  34. perceptorguard-0.1.0/runner/eval_runner.py +114 -0
  35. perceptorguard-0.1.0/runner/gt_extractor.py +49 -0
  36. perceptorguard-0.1.0/runner/matcher.py +106 -0
  37. perceptorguard-0.1.0/runner/scene_runner.py +119 -0
  38. perceptorguard-0.1.0/scenarios/__init__.py +3 -0
  39. perceptorguard-0.1.0/scenarios/generator.py +230 -0
  40. perceptorguard-0.1.0/scenarios/schemas.py +54 -0
  41. perceptorguard-0.1.0/scripts/__init__.py +0 -0
  42. perceptorguard-0.1.0/scripts/demo_regression.py +122 -0
  43. perceptorguard-0.1.0/scripts/generate.py +134 -0
  44. perceptorguard-0.1.0/scripts/generate_report.py +125 -0
  45. perceptorguard-0.1.0/scripts/run_coco_eval.py +159 -0
  46. perceptorguard-0.1.0/scripts/run_eval.py +51 -0
  47. perceptorguard-0.1.0/scripts/run_gate.py +47 -0
  48. perceptorguard-0.1.0/scripts/save_baseline.py +59 -0
  49. perceptorguard-0.1.0/scripts/triage.py +66 -0
  50. perceptorguard-0.1.0/setup.cfg +4 -0
  51. perceptorguard-0.1.0/tests/__init__.py +0 -0
  52. perceptorguard-0.1.0/tests/test_class_map.py +283 -0
  53. perceptorguard-0.1.0/tests/test_coco_ingestion.py +382 -0
  54. perceptorguard-0.1.0/tests/test_gates.py +226 -0
  55. perceptorguard-0.1.0/tests/test_generator.py +79 -0
  56. perceptorguard-0.1.0/tests/test_matcher.py +102 -0
  57. perceptorguard-0.1.0/tests/test_metadata_csv.py +208 -0
  58. perceptorguard-0.1.0/tests/test_metrics.py +153 -0
  59. perceptorguard-0.1.0/tests/test_scene_runner.py +109 -0
  60. perceptorguard-0.1.0/tests/test_schemas.py +28 -0
  61. perceptorguard-0.1.0/tests/test_slice_inferrer.py +238 -0
  62. perceptorguard-0.1.0/tests/test_triage.py +131 -0
  63. perceptorguard-0.1.0/tests/verify_chunk2.py +418 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 PerceptorGuard
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,340 @@
1
+ Metadata-Version: 2.4
2
+ Name: perceptorguard
3
+ Version: 0.1.0
4
+ Summary: Perception model evaluation harness — sliced metrics, failure triage, and CI regression gating for detection models
5
+ License: MIT
6
+ Project-URL: Homepage, https://github.com/your-org/perceptorguard
7
+ Project-URL: Bug Tracker, https://github.com/your-org/perceptorguard/issues
8
+ Keywords: object-detection,evaluation,metrics,yolo,coco,perception,robotics
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Requires-Python: >=3.11
18
+ Description-Content-Type: text/markdown
19
+ License-File: LICENSE
20
+ Requires-Dist: pandas>=2.2.0
21
+ Requires-Dist: pydantic>=2.7.0
22
+ Requires-Dist: numpy>=1.26.0
23
+ Requires-Dist: Pillow>=10.3.0
24
+ Requires-Dist: scikit-learn>=1.3.0
25
+ Requires-Dist: jinja2>=3.1.0
26
+ Requires-Dist: pyyaml>=6.0
27
+ Provides-Extra: synthetic
28
+ Requires-Dist: pybullet>=3.2.6; extra == "synthetic"
29
+ Requires-Dist: torch>=2.2.0; extra == "synthetic"
30
+ Requires-Dist: torchvision>=0.17.0; extra == "synthetic"
31
+ Requires-Dist: ultralytics>=8.2.0; extra == "synthetic"
32
+ Provides-Extra: dev
33
+ Requires-Dist: pytest>=8.2.0; extra == "dev"
34
+ Dynamic: license-file
35
+
36
+ # PerceptorGuard
37
+
38
+ A **perception evaluation harness** for YOLO detection models — slice-based metrics, CI regression gating, failure triage, and reproducible synthetic fixtures. Built to demonstrate that evaluation is an engineering discipline, not a one-liner.
39
+
40
+ ---
41
+
42
+ ## Quick start
43
+
44
+ ```bash
45
+ # 1. Install
46
+ python -m venv .venv && source .venv/bin/activate
47
+ pip install -e ".[dev,synthetic]"  # "synthetic" extra provides PyBullet + YOLO (ultralytics), required by steps 2-3
48
+
49
+ # 2. Generate a 20-scene dataset and run eval
50
+ python scripts/generate.py --count 20 --seed 42 --out artifacts/dataset
51
+ python scripts/run_eval.py --dataset artifacts/dataset --out artifacts/eval
52
+
53
+ # 3. Triage failures and generate report
54
+ python scripts/triage.py --matches artifacts/eval/matches.csv
55
+ python scripts/generate_report.py \
56
+ --dataset artifacts/dataset \
57
+ --eval artifacts/eval \
58
+ --baseline artifacts/baseline \
59
+ --triage artifacts/triage
60
+ # → open artifacts/report/report.html
61
+ ```
62
+
63
+ **Planted-regression demo** (gate goes red, then green):
64
+ ```bash
65
+ python scripts/save_baseline.py # promote current eval to baseline
66
+ python scripts/demo_regression.py # PASS → FAIL → PASS
67
+ ```
68
+
69
+ ---
70
+
71
+ ## System overview
72
+
73
+ ```
74
+ ┌───────────────────────────────────────────────────────────────────┐
75
+ │ Scenario generator (PyBullet DIRECT) │
76
+ │ 6 challenge profiles × 2-tier object catalog (easy + hard) │
77
+ │ → manifest.csv + per-scene frame.png + ground_truth.json │
78
+ └──────────────────────────┬────────────────────────────────────────┘
79
+
80
+ ┌──────────────────────────▼────────────────────────────────────────┐
81
+ │ Eval runner │
82
+ │ YOLO inference at conf=0.01 (full distribution for AP) │
83
+ │ Greedy class-aware bipartite matching (IoU ≥ 0.5) │
84
+ │ → matches.csv (tidy: one row per TP / FP / FN) │
85
+ └──────────────────────────┬────────────────────────────────────────┘
86
+
87
+ ┌────────────────┴──────────────────┐
88
+ │ │
89
+ ┌─────────▼──────────┐ ┌──────────▼──────────────────┐
90
+ │ Metrics engine │ │ Failure triage │
91
+ │ mAP (11-pt VOC) │ │ missed / localization / │
92
+ │ P/R/F1 @ 0.25 │ │ wrong_class / false_pos │
93
+ │ slice tables: │ │ KMeans cluster analysis │
94
+ │ profile / dist / │ │ → failures_classified.csv │
95
+ │ lighting / clutter│ │ → cluster_summary.csv │
96
+ │ tier / class │ └──────────────────────────────┘
97
+ └─────────┬──────────┘
98
+
99
+ ┌─────────▼──────────────────────────────────────────────────────┐
100
+ │ Regression gate │
101
+ │ Compare current metrics vs baseline (artifacts/baseline/) │
102
+ │ 46 checks: mAP, AP/recall per class, recall/FP per profile │
103
+ │ Exit 1 on any regression beyond configured slack │
104
+ └─────────┬──────────────────────────────────────────────────────┘
105
+
106
+ ┌─────────▼──────────────────────────────────────────────────────┐
107
+ │ Report renderer (Jinja2) │
108
+ │ Self-contained HTML + Markdown │
109
+ │ Slice tables, failure gallery, gate diff table │
110
+ │ Optional: W&B / MLflow experiment tracking │
111
+ └────────────────────────────────────────────────────────────────┘
112
+ ```
113
+
114
+ **CI split** — deliberate engineering decision:
115
+ - **PR gate** (`.github/workflows/ci_gate.yml`): 20 scenes, seed=42, ~2 min. Catches regressions before they land in `main`.
116
+ - **Nightly** (`.github/workflows/nightly.yml`): 100 scenes, auto-promotes baseline on success. Authoritative quality bar.
117
+
118
+ ---
119
+
120
+ ## Key design decisions
121
+
122
+ ### 1. Evaluation as a first-class engineering subsystem
123
+
124
+ Most perception teams treat evaluation as an afterthought: run a metric script, log the number, move on. The script is throw-away code; the number is a spreadsheet cell; there is no regression gate.
125
+
126
+ PerceptorGuard treats the eval pipeline with the same engineering discipline as the model itself:
127
+
128
+ | Property | Ad-hoc eval script | PerceptorGuard |
129
+ |----------|-------------------|----------------|
130
+ | Dataset | "whatever images we had" | Versioned, reproducible, committed |
131
+ | Metrics | Overall mAP only | Per-slice: profile, distance, lighting, clutter, tier, class |
132
+ | Failures | "recall is low" | Named failure modes with KMeans-clustered conditions |
133
+ | Regressions | Discovered in staging | Caught at PR time, 46 checks, named slice + delta |
134
+ | Reproducibility | "I think we used these settings" | Seed-fixed fixtures, LFS-tracked weights |
135
+
136
+ The eval harness is the **durable investment**. Models come and go; the harness lets you compare them honestly.
137
+
138
+ ### 2. Model-agnostic interface
139
+
140
+ The harness has exactly one coupling point to YOLO: `EvalRunner._predict()` (12 lines in `runner/eval_runner.py`). The rest of the pipeline — matching, metrics, gating, triage, reporting — operates on `Detection` and `GroundTruth` Pydantic schemas.
141
+
142
+ Swapping YOLOv8n for YOLOv8x, an RT-DETR, or a custom model requires changing exactly that one function. You can A/B test models through the same harness and compare them on the same reproducible dataset without any eval-code changes.
143
+
144
+ This is the same principle I apply to RAG and agentic systems: the evaluation framework must be agnostic to the implementation choice, or you end up with evaluation that only works for the thing you already have.
145
+
146
+ ### 3. CI gate blast-radius argument
147
+
148
+ Catching a regression has very different costs depending on where it surfaces:
149
+
150
+ | Where caught | Cost |
151
+ |-------------|------|
152
+ | At PR (CI gate) | 2 min CI compute |
153
+ | In staging after merge | Deploy + rollback + engineer-hours |
154
+ | In production | Incident response + user-trust loss |
155
+
156
+ The gate runs 46 checks per PR. Each check is a named (slice, metric) pair with an explicit floor: `floor = baseline_value − slack`. If any check fires, the gate exits 1 and names the regressed slice and delta. Engineers know exactly what broke and by how much — not "mAP went down a bit."
157
+
158
+ The 20-scene CI fast path is a **deliberate tradeoff**: 20 scenes is noisy enough that you'll miss subtle regressions, but it catches real structural breaks (IoU threshold bug, conf threshold change, class mapping error) in 2 minutes. The nightly full 100-scene run is the authoritative measurement. This split is explained in both workflow files.
159
+
160
+ ### 4. Two-tier object catalog
161
+
162
+ ```
163
+ Easy (COCO-recognizable): cup, bottle, bowl, teddy bear, sports ball
164
+ Hard (off-vocabulary): cube, duck, lego, domino
165
+ ```
166
+
167
+ The split gives you two signals simultaneously:
168
+ - **Easy tier** measures how much signal you can extract from a COCO-pretrained backbone. Any recall at all indicates some domain-transfer.
169
+ - **Hard tier** measures true zero-shot generalisation. Near-zero AP here is expected and honest — it's the documented result, not a bug.
170
+
171
+ Running only easy classes would give you false confidence. Running only hard classes would give you nothing to gate on. The mix is deliberate.
172
+
173
+ ### 5. Sub-threshold IoU enrichment for FN rows
174
+
175
+ The matcher records `best_iou_any_class` and `best_pred_class_at_overlap` for every FN row. This enables the failure classifier to distinguish:
176
+
177
+ - `missed_detection` — no prediction overlapped the GT (IoU < 0.1 from any box)
178
+ - `localization_error` — right class, right location, IoU just below threshold
179
+ - `wrong_class` — something overlaps (IoU ≥ 0.1) but with the wrong class label
180
+
181
+ These are **different engineering problems** requiring different interventions. Knowing you have 123 `wrong_class` failures in near-distance crowded scenes (the actual finding) is actionable. Knowing "FN = 355" is not.
182
+
183
+ ---
184
+
185
+ ## Findings from the pilot run (YOLOv8n, 100 scenes)
186
+
187
+ ```
188
+ Overall: mAP=0.1% Precision=0% Recall=0%
189
+ TP=0 FP=27 FN=355 (at op-point conf≥0.25)
190
+ ```
191
+
192
+ The headline finding is a **complete domain gap**: YOLOv8n, pretrained on COCO, achieves 0% recall on PyBullet synthetic renders at the conf≥0.25 operating point. This is expected and honest.
193
+
194
+ **The only live signal: sports ball, AP=0.9%** — the soccerball URDF has a realistic texture that partially overlaps the COCO training distribution. Two sub-threshold predictions match GT at IoU≥0.5 during the full-distribution AP sweep.
195
+
196
+ **Failure mode breakdown (771 failures):**
197
+ 1. `false_positive` (54%) — 416 hallucinations. The model confidently detects objects that aren't there (chairs, people, cars) because PyBullet renders look like partial-context frames from real images.
198
+ 2. `missed_detection` (30%) — 232 GTs with zero model signal. Worst in crowded + far profiles; domino and lego lead by class.
199
+ 3. `wrong_class` (16%) — 123 GTs where a prediction overlaps but carries the wrong label. Cube accounts for 34% of wrong-class failures — the model recognises a rectangular object but can't resolve the class.
200
+
201
+ **Cluster insight**: KMeans (k=5) on [camera_distance, ambient_light, num_objects, failure_mode, tier, profile] identifies two pure FP clusters (100% false positives at mid-distance and near-distance), a mixed missed+wrong-class cluster under crowded conditions, and a distinct dark-scene cluster with different FP characteristics.
202
+
203
+ **What this means for next steps:**
204
+ - The gap is at the distribution level, not the architecture level. Domain randomization of textures + sim-to-real transfer is the right intervention, not model scaling.
205
+ - The FP flood is a calibration problem. A classification head fine-tuned on synthetic negatives would reduce it dramatically.
206
+ - The wrong-class failures on cube/domino suggest that geometric shape features are present but class-label resolution requires task-specific training.
207
+
208
+ ---
209
+
210
+ ## Planted-regression demo
211
+
212
+ The gate verifiably catches regression and names the failing slice:
213
+
214
+ ```
215
+ $ python scripts/demo_regression.py
216
+
217
+ DEMO STEP 1 — Gate on current (good) metrics
218
+ → PASSED — all 46 checks within threshold
219
+
220
+ DEMO STEP 2 — Plant regression: zero sports-ball AP
221
+ (simulates iou_threshold cranked to 0.9)
222
+ → FAILED — 1 regression(s) detected
223
+ ✗ class:sports ball / ap
224
+ baseline=0.0091 current=0.0000 floor=0.0041 delta=-0.0091
225
+
226
+ DEMO STEP 3 — Restore
227
+ → PASSED — all 46 checks within threshold
228
+
229
+ Result: PASS → FAIL → PASS ✓ (gate behaves correctly)
230
+ ```
231
+
232
+ To trigger this in real CI, open a PR that raises the eval runner's IoU threshold to 0.9 (`--iou 0.9`). The CI workflow runs, the sports-ball AP drops from 0.9% to 0%, the gate exits 1, and the PR check fails — naming the regressed slice and delta. Revert → green.
233
+
234
+ ---
235
+
236
+ ## Cross-domain principle: eval as infrastructure
237
+
238
+ The architectural insight that generalises across ML domains:
239
+
240
+ > **Evaluation should be a first-class subsystem, not an afterthought. It must be model-agnostic, slice-aware, and wired into CI.**
241
+
242
+ I've applied the same principle in three domains:
243
+
244
+ | Domain | What gets evaluated | The harness checks |
245
+ |--------|--------------------|--------------------|
246
+ | **Perception (this project)** | YOLO detector | Per-slice mAP, FP rate, localization quality |
247
+ | **RAG systems** | Retriever + LLM | Retrieval recall, answer faithfulness, citation precision |
248
+ | **Agentic systems** | Tool-use agent | Task completion rate, tool selection accuracy, latency |
249
+
250
+ In each case:
251
+ - The harness is decoupled from the implementation (model-agnostic interface)
252
+ - Metrics are sliced by the conditions that matter (difficulty, distance, topic domain, query type)
253
+ - A baseline is stored and regressions are caught before they reach production
254
+ - Failures are named, not just counted
255
+
256
+ The model changes; the eval discipline doesn't. This is the engineering investment that compounds.
257
+
258
+ ---
259
+
260
+ ## What I'd do differently
261
+
262
+ **1. Domain randomization before domain gap is "interesting"**
263
+ PyBullet renders are too synthetic. Before drawing any production conclusions, I'd add texture randomization (PBR materials, HDRI backgrounds), noise augmentation, and random object scales. The 0% recall result is expected — but a more realistic synthetic distribution would push that to a meaningful non-zero baseline worth gating on.
264
+
265
+ **2. Real-image holdout**
266
+ The synthetic-to-real gap is documented but not measured. A small (50-100 image) real-world validation set, with the same object categories, would quantify the gap. This is the honest thing to do before claiming the eval harness is production-relevant.
267
+
268
+ **3. Active learning loop**
269
+ The triage output (failure mode distribution, KMeans clusters) should feed back into the scenario generator: generate more scenes matching the hardest cluster conditions. Right now triage is a report; it should be a signal that drives the next data generation run.
270
+
271
+ **4. Temporal and latency eval**
272
+ `match_scene` operates on single frames. Real robot perception needs tracking across frames, trajectory prediction, and FPS budgeting. None of that is here.
273
+
274
+ **5. Confidence calibration**
275
+ The op-point threshold is set at conf≥0.25 by convention. A calibration sweep (precision-recall curve analysis by condition) would let you set per-slice thresholds that reflect the actual operating point you need — not a fixed global number.
276
+
277
+ ---
278
+
279
+ ## Repository structure
280
+
281
+ ```
282
+ perceptorguard/
283
+ ├── scenarios/ Pydantic schemas, parameterized scenario generator
284
+ │ ├── schemas.py BoundingBox, Detection, GroundTruth, Scenario, ObjectSpec
285
+ │ └── generator.py 6 profiles × 2-tier catalog, occluder placement
286
+ ├── runner/ Inference + matching + GT extraction
287
+ │ ├── scene_runner.py PyBullet DIRECT renderer, AABB→screen projection
288
+ │ ├── eval_runner.py YOLO inference loop, bin assignment
289
+ │ └── matcher.py Greedy bipartite match; FN rows carry sub-threshold IoU
290
+ ├── metrics/ Metrics, triage, reporting
291
+ │ ├── engine.py 11-pt VOC AP, operating-point P/R/F1, sliced tables
292
+ │ ├── failure_classifier.py missed / localization / wrong_class / fp
293
+ │ ├── cluster_analyzer.py KMeans on scenario feature vector
294
+ │ ├── triage_reporter.py Ranked failure-mode summary
295
+ │ └── reporter.py ASCII console report
296
+ ├── gates/ Regression gate
297
+ │ ├── thresholds.py GateThresholds dataclass, YAML-backed
298
+ │ ├── comparator.py 46-check comparison: mAP, AP, recall, FP per slice
299
+ │ └── gate_runner.py Print report, return bool, exit 1 on failure
300
+ ├── reports/ Report rendering
301
+ │ ├── annotator.py Annotate failure scenes (GT boxes, missed/detected)
302
+ │ ├── renderer.py Jinja2 → HTML + Markdown
303
+ │ ├── tracker.py W&B + MLflow optional integration
304
+ │ └── templates/ report.html.j2, report.md.j2
305
+ ├── scripts/
306
+ │ ├── generate.py Dataset generation CLI
307
+ │ ├── run_eval.py Eval CLI (--model, --iou, --imgsz)
308
+ │ ├── triage.py Failure triage CLI
309
+ │ ├── run_gate.py Gate CLI (exit 0/1)
310
+ │ ├── save_baseline.py Promote eval → baseline
311
+ │ ├── generate_report.py Report generation CLI
312
+ │ └── demo_regression.py Planted-regression demo
313
+ ├── tests/ 74 unit tests (all pass)
314
+ │ ├── test_matcher.py, test_metrics.py
315
+ │ ├── test_triage.py, test_gates.py
316
+ │ └── verify_chunk2.py (55 GT-pipeline invariant tests)
317
+ ├── assets/ Custom URDFs (bottle.urdf, bowl.urdf)
318
+ ├── configs/
319
+ │ └── gate_thresholds.yml Tunable slack per metric
320
+ ├── artifacts/
321
+ │ ├── baseline/ 100-scene golden reference (committed)
322
+ │ ├── ci_baseline/ 20-scene CI reference (committed, seed=42)
323
+ │ ├── ci_dataset/ Reproducible 20-scene fixture (committed, LFS)
324
+ │ ├── eval/ Full 100-scene eval output
325
+ │ └── triage/ Failure classification + cluster summary
326
+ └── .github/
327
+ ├── workflows/ci_gate.yml PR: 20 scenes, ~2 min
328
+ └── workflows/nightly.yml Scheduled: 100 scenes, auto-promote baseline
329
+ ```
330
+
331
+ ---
332
+
333
+ ## Running the full test suite
334
+
335
+ ```bash
336
+ pytest tests/ -v
337
+ # → 74 passed
338
+ ```
339
+
340
+ Tests are hermetic — no model inference, no disk artifacts required. The 55 `verify_chunk2.py` tests validate the GT pipeline (AABB projection, occlusion geometry, multi-object placement) against hardcoded fixtures. The 12 `test_matcher.py` tests cover IoU edge cases and greedy matching invariants. The 15 `test_gates.py` tests cover threshold loading, regression detection (mAP drop, FP spike, recall drop), and gate-runner pass/fail return values.
@@ -0,0 +1,305 @@
1
+ # PerceptorGuard
2
+
3
+ A **perception evaluation harness** for YOLO detection models — slice-based metrics, CI regression gating, failure triage, and reproducible synthetic fixtures. Built to demonstrate that evaluation is an engineering discipline, not a one-liner.
4
+
5
+ ---
6
+
7
+ ## Quick start
8
+
9
+ ```bash
10
+ # 1. Install
11
+ python -m venv .venv && source .venv/bin/activate
12
+ pip install -e ".[dev,synthetic]"  # "synthetic" extra provides PyBullet + YOLO (ultralytics), required by steps 2-3
13
+
14
+ # 2. Generate a 20-scene dataset and run eval
15
+ python scripts/generate.py --count 20 --seed 42 --out artifacts/dataset
16
+ python scripts/run_eval.py --dataset artifacts/dataset --out artifacts/eval
17
+
18
+ # 3. Triage failures and generate report
19
+ python scripts/triage.py --matches artifacts/eval/matches.csv
20
+ python scripts/generate_report.py \
21
+ --dataset artifacts/dataset \
22
+ --eval artifacts/eval \
23
+ --baseline artifacts/baseline \
24
+ --triage artifacts/triage
25
+ # → open artifacts/report/report.html
26
+ ```
27
+
28
+ **Planted-regression demo** (gate goes red, then green):
29
+ ```bash
30
+ python scripts/save_baseline.py # promote current eval to baseline
31
+ python scripts/demo_regression.py # PASS → FAIL → PASS
32
+ ```
33
+
34
+ ---
35
+
36
+ ## System overview
37
+
38
+ ```
39
+ ┌───────────────────────────────────────────────────────────────────┐
40
+ │ Scenario generator (PyBullet DIRECT) │
41
+ │ 6 challenge profiles × 2-tier object catalog (easy + hard) │
42
+ │ → manifest.csv + per-scene frame.png + ground_truth.json │
43
+ └──────────────────────────┬────────────────────────────────────────┘
44
+
45
+ ┌──────────────────────────▼────────────────────────────────────────┐
46
+ │ Eval runner │
47
+ │ YOLO inference at conf=0.01 (full distribution for AP) │
48
+ │ Greedy class-aware bipartite matching (IoU ≥ 0.5) │
49
+ │ → matches.csv (tidy: one row per TP / FP / FN) │
50
+ └──────────────────────────┬────────────────────────────────────────┘
51
+
52
+ ┌────────────────┴──────────────────┐
53
+ │ │
54
+ ┌─────────▼──────────┐ ┌──────────▼──────────────────┐
55
+ │ Metrics engine │ │ Failure triage │
56
+ │ mAP (11-pt VOC) │ │ missed / localization / │
57
+ │ P/R/F1 @ 0.25 │ │ wrong_class / false_pos │
58
+ │ slice tables: │ │ KMeans cluster analysis │
59
+ │ profile / dist / │ │ → failures_classified.csv │
60
+ │ lighting / clutter│ │ → cluster_summary.csv │
61
+ │ tier / class │ └──────────────────────────────┘
62
+ └─────────┬──────────┘
63
+
64
+ ┌─────────▼──────────────────────────────────────────────────────┐
65
+ │ Regression gate │
66
+ │ Compare current metrics vs baseline (artifacts/baseline/) │
67
+ │ 46 checks: mAP, AP/recall per class, recall/FP per profile │
68
+ │ Exit 1 on any regression beyond configured slack │
69
+ └─────────┬──────────────────────────────────────────────────────┘
70
+
71
+ ┌─────────▼──────────────────────────────────────────────────────┐
72
+ │ Report renderer (Jinja2) │
73
+ │ Self-contained HTML + Markdown │
74
+ │ Slice tables, failure gallery, gate diff table │
75
+ │ Optional: W&B / MLflow experiment tracking │
76
+ └────────────────────────────────────────────────────────────────┘
77
+ ```
78
+
79
+ **CI split** — deliberate engineering decision:
80
+ - **PR gate** (`.github/workflows/ci_gate.yml`): 20 scenes, seed=42, ~2 min. Catches regressions before they land in `main`.
81
+ - **Nightly** (`.github/workflows/nightly.yml`): 100 scenes, auto-promotes baseline on success. Authoritative quality bar.
82
+
83
+ ---
84
+
85
+ ## Key design decisions
86
+
87
+ ### 1. Evaluation as a first-class engineering subsystem
88
+
89
+ Most perception teams treat evaluation as an afterthought: run a metric script, log the number, move on. The script is throw-away code; the number is a spreadsheet cell; there is no regression gate.
90
+
91
+ PerceptorGuard treats the eval pipeline with the same engineering discipline as the model itself:
92
+
93
+ | Property | Ad-hoc eval script | PerceptorGuard |
94
+ |----------|-------------------|----------------|
95
+ | Dataset | "whatever images we had" | Versioned, reproducible, committed |
96
+ | Metrics | Overall mAP only | Per-slice: profile, distance, lighting, clutter, tier, class |
97
+ | Failures | "recall is low" | Named failure modes with KMeans-clustered conditions |
98
+ | Regressions | Discovered in staging | Caught at PR time, 46 checks, named slice + delta |
99
+ | Reproducibility | "I think we used these settings" | Seed-fixed fixtures, LFS-tracked weights |
100
+
101
+ The eval harness is the **durable investment**. Models come and go; the harness lets you compare them honestly.
102
+
103
+ ### 2. Model-agnostic interface
104
+
105
+ The harness has exactly one coupling point to YOLO: `EvalRunner._predict()` (12 lines in `runner/eval_runner.py`). The rest of the pipeline — matching, metrics, gating, triage, reporting — operates on `Detection` and `GroundTruth` Pydantic schemas.
106
+
107
+ Swapping YOLOv8n for YOLOv8x, an RT-DETR, or a custom model requires changing exactly that one function. You can A/B test models through the same harness and compare them on the same reproducible dataset without any eval-code changes.
108
+
109
+ This is the same principle I apply to RAG and agentic systems: the evaluation framework must be agnostic to the implementation choice, or you end up with evaluation that only works for the thing you already have.
110
+
111
+ ### 3. CI gate blast-radius argument
112
+
113
+ Catching a regression has very different costs depending on where it surfaces:
114
+
115
+ | Where caught | Cost |
116
+ |-------------|------|
117
+ | At PR (CI gate) | 2 min CI compute |
118
+ | In staging after merge | Deploy + rollback + engineer-hours |
119
+ | In production | Incident response + user-trust loss |
120
+
121
+ The gate runs 46 checks per PR. Each check is a named (slice, metric) pair with an explicit floor: `floor = baseline_value − slack`. If any check fires, the gate exits 1 and names the regressed slice and delta. Engineers know exactly what broke and by how much — not "mAP went down a bit."
122
+
123
+ The 20-scene CI fast path is a **deliberate tradeoff**: 20 scenes is noisy enough that you'll miss subtle regressions, but it catches real structural breaks (IoU threshold bug, conf threshold change, class mapping error) in 2 minutes. The nightly full 100-scene run is the authoritative measurement. This split is explained in both workflow files.
124
+
125
+ ### 4. Two-tier object catalog
126
+
127
+ ```
128
+ Easy (COCO-recognizable): cup, bottle, bowl, teddy bear, sports ball
129
+ Hard (off-vocabulary): cube, duck, lego, domino
130
+ ```
131
+
132
+ The split gives you two signals simultaneously:
133
+ - **Easy tier** measures how much signal you can extract from a COCO-pretrained backbone. Any recall at all indicates some domain-transfer.
134
+ - **Hard tier** measures true zero-shot generalisation. Near-zero AP here is expected and honest — it's the documented result, not a bug.
135
+
136
+ Running only easy classes would give you false confidence. Running only hard classes would give you nothing to gate on. The mix is deliberate.
137
+
138
+ ### 5. Sub-threshold IoU enrichment for FN rows
139
+
140
+ The matcher records `best_iou_any_class` and `best_pred_class_at_overlap` for every FN row. This enables the failure classifier to distinguish:
141
+
142
+ - `missed_detection` — no prediction overlapped the GT (IoU < 0.1 from any box)
143
+ - `localization_error` — right class, right location, IoU just below threshold
144
+ - `wrong_class` — something overlaps (IoU ≥ 0.1) but with the wrong class label
145
+
146
+ These are **different engineering problems** requiring different interventions. Knowing you have 123 `wrong_class` failures in near-distance crowded scenes (the actual finding) is actionable. Knowing "FN = 355" is not.
147
+
148
+ ---
149
+
150
+ ## Findings from the pilot run (YOLOv8n, 100 scenes)
151
+
152
+ ```
153
+ Overall: mAP=0.1% Precision=0% Recall=0%
154
+ TP=0 FP=27 FN=355 (at op-point conf≥0.25)
155
+ ```
156
+
157
+ The headline finding is a **complete domain gap**: YOLOv8n, pretrained on COCO, achieves 0% recall on PyBullet synthetic renders at the conf≥0.25 operating point. This is expected and honest.
158
+
159
+ **The only live signal: sports ball, AP=0.9%** — the soccerball URDF has a realistic texture that partially overlaps the COCO training distribution. Two sub-threshold predictions match GT at IoU≥0.5 during the full-distribution AP sweep.
160
+
161
+ **Failure mode breakdown (771 failures):**
162
+ 1. `false_positive` (54%) — 416 hallucinations. The model confidently detects objects that aren't there (chairs, people, cars) because PyBullet renders look like partial-context frames from real images.
163
+ 2. `missed_detection` (30%) — 232 GTs with zero model signal. Worst in crowded + far profiles; domino and lego lead by class.
164
+ 3. `wrong_class` (16%) — 123 GTs where a prediction overlaps but carries the wrong label. Cube accounts for 34% of wrong-class failures — the model recognises a rectangular object but can't resolve the class.
165
+
166
+ **Cluster insight**: KMeans (k=5) on [camera_distance, ambient_light, num_objects, failure_mode, tier, profile] identifies two pure FP clusters (100% false positives at mid-distance and near-distance), a mixed missed+wrong-class cluster under crowded conditions, and a distinct dark-scene cluster with different FP characteristics.
167
+
168
+ **What this means for next steps:**
169
+ - The gap is at the distribution level, not the architecture level. Domain randomization of textures + sim-to-real transfer is the right intervention, not model scaling.
170
+ - The FP flood is a calibration problem. A classification head fine-tuned on synthetic negatives would reduce it dramatically.
171
+ - The wrong-class failures on cube/domino suggest that geometric shape features are present but class-label resolution requires task-specific training.
172
+
173
+ ---
174
+
175
+ ## Planted-regression demo
176
+
177
+ The gate verifiably catches a regression and names the failing slice:
178
+
179
+ ```
180
+ $ python scripts/demo_regression.py
181
+
182
+ DEMO STEP 1 — Gate on current (good) metrics
183
+ → PASSED — all 46 checks within threshold
184
+
185
+ DEMO STEP 2 — Plant regression: zero sports-ball AP
186
+ (simulates iou_threshold cranked to 0.9)
187
+ → FAILED — 1 regression(s) detected
188
+ ✗ class:sports ball / ap
189
+ baseline=0.0091 current=0.0000 floor=0.0041 delta=-0.0091
190
+
191
+ DEMO STEP 3 — Restore
192
+ → PASSED — all 46 checks within threshold
193
+
194
+ Result: PASS → FAIL → PASS ✓ (gate behaves correctly)
195
+ ```
196
+
197
+ To trigger this in real CI, open a PR that sets `--iou 0.9` in the eval runner. The CI workflow runs, the sports-ball AP drops from 0.9% to 0%, the gate exits 1, and the PR check fails — naming the regressed slice and delta. Revert → green.
198
+
199
+ ---
200
+
201
+ ## Cross-domain principle: eval as infrastructure
202
+
203
+ The architectural insight that generalises across ML domains:
204
+
205
+ > **Evaluation should be a first-class subsystem, not an afterthought. It must be model-agnostic, slice-aware, and wired into CI.**
206
+
207
+ I've applied the same principle in three domains:
208
+
209
+ | Domain | What gets evaluated | The harness checks |
210
+ |--------|--------------------|--------------------|
211
+ | **Perception (this project)** | YOLO detector | Per-slice mAP, FP rate, localization quality |
212
+ | **RAG systems** | Retriever + LLM | Retrieval recall, answer faithfulness, citation precision |
213
+ | **Agentic systems** | Tool-use agent | Task completion rate, tool selection accuracy, latency |
214
+
215
+ In each case:
216
+ - The harness is decoupled from the implementation (model-agnostic interface)
217
+ - Metrics are sliced by the conditions that matter (difficulty, distance, topic domain, query type)
218
+ - A baseline is stored and regressions are caught before they reach production
219
+ - Failures are named, not just counted
220
+
221
+ The model changes; the eval discipline doesn't. This is the engineering investment that compounds.
222
+
223
+ ---
224
+
225
+ ## What I'd do differently
226
+
227
+ **1. Domain randomization before domain gap is "interesting"**
228
+ PyBullet renders are too synthetic. Before drawing any production conclusions, I'd add texture randomization (PBR materials, HDRI backgrounds), noise augmentation, and random object scales. The 0% recall result is expected — but a more realistic synthetic distribution would push that to a meaningful non-zero baseline worth gating on.
229
+
230
+ **2. Real-image holdout**
231
+ The synthetic-to-real gap is documented but not measured. A small (50-100 image) real-world validation set, with the same object categories, would quantify the gap. This is the honest thing to do before claiming the eval harness is production-relevant.
232
+
233
+ **3. Active learning loop**
234
+ The triage output (failure mode distribution, KMeans clusters) should feed back into the scenario generator: generate more scenes matching the hardest cluster conditions. Right now triage is a report; it should be a signal that drives the next data generation run.
235
+
236
+ **4. Temporal and latency eval**
237
+ `match_scene` operates on single frames. Real robot perception needs tracking across frames, trajectory prediction, and FPS budgeting. None of that is here.
238
+
239
+ **5. Confidence calibration**
240
+ The op-point threshold is set at conf≥0.25 by convention. A calibration sweep (precision-recall curve analysis by condition) would let you set per-slice thresholds that reflect the actual operating point you need — not a fixed global number.
241
+
242
+ ---
243
+
244
+ ## Repository structure
245
+
246
+ ```
247
+ perceptorguard/
248
+ ├── scenarios/ Pydantic schemas, parameterized scenario generator
249
+ │ ├── schemas.py BoundingBox, Detection, GroundTruth, Scenario, ObjectSpec
250
+ │ └── generator.py 6 profiles × 2-tier catalog, occluder placement
251
+ ├── runner/ Inference + matching + GT extraction
252
+ │ ├── scene_runner.py PyBullet DIRECT renderer, AABB→screen projection
253
+ │ ├── eval_runner.py YOLO inference loop, bin assignment
254
+ │ └── matcher.py Greedy bipartite match; FN rows carry sub-threshold IoU
255
+ ├── metrics/ Metrics, triage, reporting
256
+ │ ├── engine.py 11-pt VOC AP, operating-point P/R/F1, sliced tables
257
+ │ ├── failure_classifier.py missed / localization / wrong_class / fp
258
+ │ ├── cluster_analyzer.py KMeans on scenario feature vector
259
+ │ ├── triage_reporter.py Ranked failure-mode summary
260
+ │ └── reporter.py ASCII console report
261
+ ├── gates/ Regression gate
262
+ │ ├── thresholds.py GateThresholds dataclass, YAML-backed
263
+ │ ├── comparator.py 46-check comparison: mAP, AP, recall, FP per slice
264
+ │ └── gate_runner.py Print report, return bool, exit 1 on failure
265
+ ├── reports/ Report rendering
266
+ │ ├── annotator.py Annotate failure scenes (GT boxes, missed/detected)
267
+ │ ├── renderer.py Jinja2 → HTML + Markdown
268
+ │ ├── tracker.py W&B + MLflow optional integration
269
+ │ └── templates/ report.html.j2, report.md.j2
270
+ ├── scripts/
271
+ │ ├── generate.py Dataset generation CLI
272
+ │ ├── run_eval.py Eval CLI (--model, --iou, --imgsz)
273
+ │ ├── triage.py Failure triage CLI
274
+ │ ├── run_gate.py Gate CLI (exit 0/1)
275
+ │ ├── save_baseline.py Promote eval → baseline
276
+ │ ├── generate_report.py Report generation CLI
277
+ │ └── demo_regression.py Planted-regression demo
278
+ ├── tests/ 74 unit tests (all pass)
279
+ │ ├── test_matcher.py, test_metrics.py
280
+ │ ├── test_triage.py, test_gates.py
281
+ │ └── verify_chunk2.py (55 GT-pipeline invariant tests)
282
+ ├── assets/ Custom URDFs (bottle.urdf, bowl.urdf)
283
+ ├── configs/
284
+ │ └── gate_thresholds.yml Tunable slack per metric
285
+ ├── artifacts/
286
+ │ ├── baseline/ 100-scene golden reference (committed)
287
+ │ ├── ci_baseline/ 20-scene CI reference (committed, seed=42)
288
+ │ ├── ci_dataset/ Reproducible 20-scene fixture (committed, LFS)
289
+ │ ├── eval/ Full 100-scene eval output
290
+ │ └── triage/ Failure classification + cluster summary
291
+ └── .github/
292
+ ├── workflows/ci_gate.yml PR: 20 scenes, ~2 min
293
+ └── workflows/nightly.yml Scheduled: 100 scenes, auto-promote baseline
294
+ ```
295
+
296
+ ---
297
+
298
+ ## Running the full test suite
299
+
300
+ ```bash
301
+ pytest tests/ -v
302
+ # → 74 passed
303
+ ```
304
+
305
+ Tests are hermetic — no model inference, no disk artifacts required. The 55 `verify_chunk2.py` tests validate the GT pipeline (AABB projection, occlusion geometry, multi-object placement) against hardcoded fixtures. The 12 `test_matcher.py` tests cover IoU edge cases and greedy matching invariants. The 15 `test_gates.py` tests cover threshold loading, regression detection (mAP drop, FP spike, recall drop), and gate-runner pass/fail return values.