occlude 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- occlude/__init__.py +3 -0
- occlude/__main__.py +5 -0
- occlude/cli.py +195 -0
- occlude/pipeline/__init__.py +21 -0
- occlude/pipeline/perception.py +301 -0
- occlude/pipeline/rules.py +288 -0
- occlude/pipeline/video.py +638 -0
- occlude/ui/__init__.py +0 -0
- occlude/ui/ascii_art.py +45 -0
- occlude-1.0.0.dist-info/METADATA +84 -0
- occlude-1.0.0.dist-info/RECORD +15 -0
- occlude-1.0.0.dist-info/WHEEL +5 -0
- occlude-1.0.0.dist-info/entry_points.txt +2 -0
- occlude-1.0.0.dist-info/licenses/LICENSE +21 -0
- occlude-1.0.0.dist-info/top_level.txt +1 -0
occlude/__init__.py
ADDED
occlude/__main__.py
ADDED
occlude/cli.py
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
"""OCCLUDE — CLI entry point.
|
|
2
|
+
|
|
3
|
+
Usage:
|
|
4
|
+
occlude --input <video> [--output <path>] [--blur-strength N] [--frame-stride N]
|
|
5
|
+
|
|
6
|
+
Detects immodestly dressed people frame-by-frame and writes a clean
|
|
7
|
+
video with the original audio preserved. See OCCLUDE_SPEC.md.
|
|
8
|
+
"""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import os
|
|
12
|
+
|
|
13
|
+
# Diagnostic instrumentation: traps os._exit + sys.exit + fatal signals and
|
|
14
|
+
# dumps all-thread stacks to /tmp so a silent death pinpoints its origin.
|
|
15
|
+
# Activated only when OCCLUDE_DEBUG=1 to keep `--help` side-effect-free.
|
|
16
|
+
if os.getenv("OCCLUDE_DEBUG") == "1":
    # Opt-in crash diagnostics. Everything below runs at import time, but only
    # when OCCLUDE_DEBUG=1, so a normal run (including `--help`) installs no
    # hooks and writes no files.
    import atexit as _diag_atexit
    import faulthandler as _diag_faulthandler
    import signal as _diag_signal
    import sys as _diag_sys
    import tempfile as _diag_tempfile
    import threading as _diag_threading
    import time as _diag_time
    import traceback as _diag_tb

    # Logs stay in /tmp wherever it exists (unchanged location on
    # macOS/Linux). On platforms without /tmp, fall back to the platform temp
    # directory instead of failing on the first open().
    _DIAG_DIR = "/tmp" if os.path.isdir("/tmp") else _diag_tempfile.gettempdir()
    _DIAG_LOG = os.path.join(_DIAG_DIR, "occlude_diag.log")
    _FAULT_LOG = os.path.join(_DIAG_DIR, "occlude_faulthandler.log")

    def _diag_log(msg: str) -> None:
        """Append one timestamped line to the diagnostic log."""
        with open(_DIAG_LOG, "a") as f:
            f.write(f"[{_diag_time.strftime('%H:%M:%S')}] {msg}\n")
            f.flush()

    _real_os_exit = os._exit

    def _traced_os_exit(code):  # noqa: ANN001
        """Record who called ``os._exit`` and every thread's stack, then exit.

        The real ``os._exit`` runs in ``finally`` so a failed log write can
        never turn a requested hard exit into an exception.
        """
        try:
            with open(_DIAG_LOG, "a") as f:
                f.write(
                    f"\n========== os._exit({code}) at "
                    f"{_diag_time.strftime('%H:%M:%S')} ==========\n"
                )
                f.write(f"main thread: {_diag_threading.main_thread().ident}\n")
                f.write(
                    f"current thread: {_diag_threading.current_thread().ident} "
                    f"({_diag_threading.current_thread().name})\n"
                )
                f.write("--- python stack at exit ---\n")
                _diag_tb.print_stack(file=f)
                f.write("--- all threads ---\n")
                for tid, frame in _diag_sys._current_frames().items():
                    f.write(f"\n>>> thread {tid}\n")
                    _diag_tb.print_stack(frame, file=f)
                f.flush()
        finally:
            _real_os_exit(code)

    os._exit = _traced_os_exit

    _real_sys_exit = _diag_sys.exit

    def _traced_sys_exit(code=0):  # noqa: ANN001
        """Record the caller's stack, then delegate to the real ``sys.exit``.

        As with ``_traced_os_exit``, the real exit is in ``finally`` so the
        requested ``SystemExit`` is raised even if logging fails.
        """
        try:
            with open(_DIAG_LOG, "a") as f:
                f.write(
                    f"[{_diag_time.strftime('%H:%M:%S')}] sys.exit({code}) "
                    "called from:\n"
                )
                _diag_tb.print_stack(file=f)
                f.flush()
        finally:
            _real_sys_exit(code)

    _diag_sys.exit = _traced_sys_exit

    # Deliberately never closed: faulthandler writes to this file object's
    # descriptor when a fault or registered signal arrives, so it must stay
    # open for the life of the process.
    _diag_fault_fh = open(_FAULT_LOG, "w")
    _diag_faulthandler.enable(file=_diag_fault_fh, all_threads=True)

    # Signals are looked up by name because several of them (SIGHUP, SIGUSR1,
    # SIGUSR2, SIGPIPE, SIGQUIT) are not defined on every platform; touching
    # a missing attribute directly would abort the import.
    for _diag_sig_name in (
        "SIGTERM",
        "SIGINT",
        "SIGHUP",
        "SIGUSR1",
        "SIGUSR2",
        "SIGPIPE",
        "SIGQUIT",
    ):
        _diag_sig = getattr(_diag_signal, _diag_sig_name, None)
        if _diag_sig is None:
            continue
        try:
            # chain=True keeps any previously installed handler running
            # after the traceback dump.
            _diag_faulthandler.register(
                _diag_sig, file=_diag_fault_fh, all_threads=True, chain=True
            )
        except Exception as _e:  # noqa: BLE001
            _diag_log(f"faulthandler.register({_diag_sig}) failed: {_e}")

    def _diag_on_atexit() -> None:
        """Mark an orderly interpreter shutdown in the diagnostic log."""
        _diag_log("atexit fired (orderly python shutdown)")

    _diag_atexit.register(_diag_on_atexit)
    _diag_log(
        f"=== occlude started, pid={os.getpid()}, "
        f"argv={_diag_sys.argv} ==="
    )
|
|
97
|
+
|
|
98
|
+
import argparse
|
|
99
|
+
import shutil
|
|
100
|
+
import sys
|
|
101
|
+
from pathlib import Path
|
|
102
|
+
|
|
103
|
+
from rich.console import Console
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Build the ``occlude`` command-line parser and parse *argv*.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv[1:]``.

    Returns:
        Namespace with ``input``, ``output``, ``blur_strength`` and
        ``frame_stride``. Range checks on the numeric options are left to
        the caller.
    """
    # The default kernel size comes from video.DEFAULT_BLUR_KERNEL so a
    # single constant controls it — the historical 51 was too soft on
    # 1280×720 footage (subjects still recognizable on multi-person video).
    # Currently 199.
    # NOTE(review): this import runs on every invocation, `--help` included.
    from occlude.pipeline.video import DEFAULT_BLUR_KERNEL as default_kernel

    # (flag, add_argument keyword arguments), in the order they appear in
    # the generated help text.
    option_specs = (
        (
            "--input",
            {
                "required": True,
                "type": Path,
                "help": "path to the input video file",
            },
        ),
        (
            "--output",
            {
                "type": Path,
                "default": None,
                "help": "output video path (default: <input_stem>_occluded.mp4 next to input)",
            },
        ),
        (
            "--blur-strength",
            {
                "type": int,
                "default": default_kernel,
                "help": f"Gaussian blur kernel size (must be odd, default {default_kernel})",
            },
        ),
        (
            "--frame-stride",
            {
                "type": int,
                "default": 1,
                "help": "run perception every Nth frame; skipped frames re-render the previous frame's blur (default 1 = every frame)",
            },
        ),
    )

    cli = argparse.ArgumentParser(
        prog="occlude",
        description="Blur immodestly dressed people in a video.",
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    return cli.parse_args(argv)
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def main(argv: list[str] | None = None) -> int:
    """Run the occlude CLI and return a process exit status.

    Args:
        argv: Argument list forwarded to ``_parse_args``; ``None`` means
            the real command line.

    Returns:
        ``0`` on success, ``1`` for a rejected argument, a missing
        ``ffmpeg`` or a processing error, and ``130`` when the user
        interrupts processing.
    """

    def fail(message: str) -> int:
        # Every validation failure reports on stderr and exits with 1.
        print(message, file=sys.stderr)
        return 1

    opts = _parse_args(argv)

    from occlude.ui.ascii_art import get_header_panel

    Console().print(get_header_panel())

    source: Path = opts.input
    if not (source.exists() and source.is_file()):
        return fail(f"error: input file not found: {source}")

    kernel = opts.blur_strength
    if not (kernel > 0 and kernel % 2 != 0):
        return fail(
            f"error: --blur-strength must be a positive odd integer, got {kernel}"
        )

    stride = opts.frame_stride
    if stride < 1:
        return fail(f"error: --frame-stride must be >= 1, got {stride}")

    if shutil.which("ffmpeg") is None:
        return fail(
            "error: ffmpeg not found on PATH. Install via: brew install ffmpeg"
        )

    destination: Path = opts.output
    if not destination:
        # No --output given: write next to the input file.
        destination = source.parent / f"{source.stem}_occluded.mp4"
    destination.parent.mkdir(parents=True, exist_ok=True)

    # Imported here so `--help` and arg validation don't pay the
    # multi-second model-loading cost.
    from occlude.pipeline.video import VideoProcessor

    try:
        VideoProcessor(
            blur_kernel=kernel,
            frame_stride=stride,
        ).process(source, destination)
    except KeyboardInterrupt:
        print("\ncancelled by user.", file=sys.stderr)
        return 130
    except Exception as e:
        # Top-level boundary: report any processing failure as a one-line
        # error instead of a traceback.
        return fail(f"error: {e}")

    print(f"done. output: {destination}")
    return 0


if __name__ == "__main__":
    # Goes through sys.exit (not a bare SystemExit) so the optional
    # OCCLUDE_DEBUG exit tracer sees the call.
    sys.exit(main())
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Pipeline package — perception, rules, blur, video.
|
|
2
|
+
|
|
3
|
+
Stage 4 lives in :mod:`occlude.pipeline.perception` and produces the data
|
|
4
|
+
structure the Stage 5 rule layer consumes.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from occlude.pipeline.perception import (
|
|
8
|
+
SEG_LABELS,
|
|
9
|
+
TARGET_LABELS,
|
|
10
|
+
Perception,
|
|
11
|
+
Perceiver,
|
|
12
|
+
Person,
|
|
13
|
+
)
|
|
14
|
+
from occlude.pipeline.rules import Decision, RuleEngine
|
|
15
|
+
from occlude.pipeline.video import VideoProcessor, Tracker, blur_region
|
|
16
|
+
|
|
17
|
+
__all__ = [
|
|
18
|
+
"SEG_LABELS", "TARGET_LABELS", "Perception", "Perceiver", "Person",
|
|
19
|
+
"Decision", "RuleEngine",
|
|
20
|
+
"VideoProcessor", "Tracker", "blur_region",
|
|
21
|
+
]
|
|
@@ -0,0 +1,301 @@
|
|
|
1
|
+
"""Stage 4 — Perception.
|
|
2
|
+
|
|
3
|
+
Wraps the YOLO person detector, the SegFormer body-part segmenter, and
|
|
4
|
+
the InsightFace gender classifier as a single callable that maps an
|
|
5
|
+
image to a list of per-person observations. This is the data structure
|
|
6
|
+
the Stage 5 rule layer consumes.
|
|
7
|
+
|
|
8
|
+
The class loads all three models once on construction; calling the
|
|
9
|
+
instance on an image runs detection, then segments + classifies each
|
|
10
|
+
person crop.
|
|
11
|
+
"""
|
|
12
|
+
import gc
|
|
13
|
+
from contextlib import contextmanager
|
|
14
|
+
from dataclasses import dataclass
|
|
15
|
+
from typing import Generator, Protocol, runtime_checkable
|
|
16
|
+
|
|
17
|
+
import cv2
|
|
18
|
+
import numpy as np
|
|
19
|
+
import onnxruntime
|
|
20
|
+
import torch
|
|
21
|
+
from insightface.app import FaceAnalysis
|
|
22
|
+
from insightface.model_zoo import model_zoo as _imz
|
|
23
|
+
from PIL import Image
|
|
24
|
+
from transformers import AutoModelForSemanticSegmentation, SegformerImageProcessor
|
|
25
|
+
from ultralytics import YOLO
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class _NoArenaPickableSession(_imz.PickableInferenceSession):
    """InsightFace ``PickableInferenceSession`` that defaults to ONNX Runtime
    session options with the CPU memory arena and memory-pattern cache off.

    ORT's per-session arena keeps freed buffers around for reuse.
    Empirically, on multi-person video, each ``face_app.get(bgr)`` call
    leaked ~45 MB/person into compressed-VM pages that macOS's Activity
    Monitor tallies into the Jetsam footprint metric (vmmap MALLOC_LARGE
    virtual = 4.6 GB at frame 25 of laughing_people.mp4 with the arena on,
    ~250 MB total with it off). Turning the arena off trades a small
    inference slowdown for bounded steady-state memory.

    Caller-supplied ``sess_options`` are passed through untouched.
    """

    @staticmethod
    def _bounded_memory_options():
        """Return fresh ``SessionOptions`` with arena and mem-pattern disabled."""
        session_options = onnxruntime.SessionOptions()
        session_options.enable_cpu_mem_arena = False
        # mem_pattern caches per-input-shape allocation plans. InsightFace
        # receives variable-sized BGR crops per detection call, so each
        # unique shape grew the cache. Disabling it forces fresh
        # allocations and bounds the heap.
        session_options.enable_mem_pattern = False
        return session_options

    def __init__(self, model_path, **kwargs):
        if "sess_options" in kwargs:
            # Respect options the caller chose explicitly.
            super().__init__(model_path, **kwargs)
            return
        kwargs["sess_options"] = self._bounded_memory_options()
        super().__init__(model_path, **kwargs)
|
|
53
|
+
|
|
54
|
+
# Model identifiers handed to the respective loaders.
YOLO_MODEL_ID = "yolov8n.pt"
SEG_MODEL_ID = "mattmdjaga/segformer_b2_clothes"
INSIGHT_MODEL_ID = "buffalo_l"

# Detector settings: the class index used to restrict YOLO to people, and
# the default minimum detection confidence.
PERSON_CLASS_ID = 0
DEFAULT_PERSON_CONF = 0.40

# Detection input size passed to InsightFace's `prepare`.
INSIGHT_DET_SIZE = (640, 640)

# The 18 classes of `mattmdjaga/segformer_b2_clothes`. A label's position
# in this list is the integer the predicted mask uses for it.
SEG_LABELS = [
    "Background",      # 0
    "Hat",             # 1
    "Hair",            # 2
    "Sunglasses",      # 3
    "Upper-clothes",   # 4
    "Skirt",           # 5
    "Pants",           # 6
    "Dress",           # 7
    "Belt",            # 8
    "Left-shoe",       # 9
    "Right-shoe",      # 10
    "Face",            # 11
    "Left-leg",        # 12
    "Right-leg",       # 13
    "Left-arm",        # 14
    "Right-arm",       # 15
    "Bag",             # 16
    "Scarf",           # 17
]

# The labels the modesty rule layer cares about. Defined here (rather than
# in test scripts) so Stage 5 can import the same set.
TARGET_LABELS = frozenset(
    (
        "Hat",
        "Hair",
        "Upper-clothes",
        "Skirt",
        "Pants",
        "Dress",
        "Face",
        "Left-leg",
        "Right-leg",
        "Left-arm",
        "Right-arm",
        "Scarf",
    )
)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
@dataclass
class Person:
    """Per-person observation handed from perception to the rule layer.

    Attributes:
        bbox: YOLO bounding box ``(x1, y1, x2, y2)`` in source-image pixel
            coordinates.
        det_conf: YOLO person-class confidence.
        crop: RGB image cropped at ``bbox``.
        seg_mask: SegFormer pixel-wise label map with the same H×W as
            ``crop``; values index into ``SEG_LABELS``.
        gender: InsightFace gender, ``'M'`` or ``'F'``, or ``None`` when no
            face was found.
        face_det_score: InsightFace *face detection* confidence. It answers
            "is there a face here?", not "is the gender prediction
            reliable" — Stage 3 (``docs/04-gender-classifier.md``,
            Finding 2) is explicit about this distinction. ``0.0`` when no
            face was detected.
        label_masks: One boolean mask per ``SEG_LABELS`` entry, same H×W as
            ``seg_mask``, keyed by label name. Filled in by
            ``Perception.detect_and_segment`` so the rule layer works with
            string names only and never needs integer label indices.
    """

    bbox: tuple[int, int, int, int]
    det_conf: float
    crop: Image.Image
    seg_mask: np.ndarray
    gender: str | None
    face_det_score: float
    label_masks: dict[str, np.ndarray]
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
@runtime_checkable
class Perceiver(Protocol):
    """Structural interface the video pipeline depends on.

    :class:`Perception` is the one production implementation. Any fake
    that provides these two methods can be injected into
    :class:`VideoProcessor` to exercise the tracking / temporal-smoothing
    logic without loading model weights.
    """

    def detect_and_segment(self, image: Image.Image) -> list[Person]:
        """Return one :class:`Person` per person found in *image*."""

    def classify(self, crop: Image.Image) -> tuple[str | None, float]:
        """Return ``(gender, face_det_score)`` for a single person crop."""
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def _pick_device() -> torch.device:
|
|
119
|
+
if torch.backends.mps.is_available():
|
|
120
|
+
return torch.device("mps")
|
|
121
|
+
if torch.cuda.is_available():
|
|
122
|
+
return torch.device("cuda")
|
|
123
|
+
return torch.device("cpu")
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
class Perception:
    """Production :class:`Perceiver`: YOLO person detection, SegFormer
    body-part segmentation and InsightFace gender classification.

    All three models are loaded once in ``__init__``. Calling the instance
    runs detect → segment → classify on one image. The video pipeline
    calls :meth:`detect_and_segment` and :meth:`classify` separately so it
    can cache gender per tracked person.
    """

    def __init__(self, person_conf: float = DEFAULT_PERSON_CONF) -> None:
        """Load the detector, the segmenter and the face analyser.

        Args:
            person_conf: Minimum YOLO confidence for a person detection.
        """
        self.person_conf = person_conf
        self.device = _pick_device()

        self.detector = YOLO(YOLO_MODEL_ID)

        self.seg_processor = SegformerImageProcessor.from_pretrained(SEG_MODEL_ID)
        self.seg_model = (
            AutoModelForSemanticSegmentation.from_pretrained(SEG_MODEL_ID)
            .to(self.device)
            .eval()
        )

        if self.device.type in ("mps", "cuda"):
            # fp16: logits are immediately argmax'd so precision loss is
            # benign. CPU fp16 support in PyTorch is incomplete; restrict
            # to MPS/CUDA.
            self.seg_model = self.seg_model.half()

            # torch.compile fuses transformer attention kernels on top of
            # fp16. dynamic=True treats the batch dim as symbolic so frames
            # with different person counts (1 person, then 3, then 2 …)
            # don't each trigger a recompile. First batch incurs one-time
            # JIT compilation (~10-30 frames of wall-time warm-up at 1 fps
            # baseline); all subsequent batches run fused. Compilation is a
            # best-effort optimisation: if the backend raises (e.g. an
            # inductor limitation on this torch build) we stay in eager
            # mode, but log why instead of hiding the failure.
            try:
                self.seg_model = torch.compile(self.seg_model, dynamic=True)
            except Exception as exc:  # noqa: BLE001
                import logging

                logging.getLogger(__name__).warning(
                    "torch.compile failed for the segmentation model; "
                    "continuing in eager mode: %s",
                    exc,
                )

        # Two scoped tweaks for memory bounding:
        #
        # 1. `allowed_modules` filters the buffalo_l bundle from 5
        #    sub-models down to the 2 we actually consume. We only read
        #    `face.gender` and `face.det_score`, so landmarks (2d_106,
        #    3d_68) and recognition embeddings are pure overhead — fewer
        #    models per call = less per-call CPU allocation.
        # 2. Swap insightface's PickableInferenceSession for our
        #    arena/mem_pattern-disabled subclass during construction, then
        #    restore — keeps the patch scoped to this instance.
        _orig = _imz.PickableInferenceSession
        _imz.PickableInferenceSession = _NoArenaPickableSession
        try:
            self.face_app = FaceAnalysis(
                name=INSIGHT_MODEL_ID,
                providers=["CPUExecutionProvider"],
                allowed_modules=["detection", "genderage"],
            )
            self.face_app.prepare(ctx_id=0, det_size=INSIGHT_DET_SIZE)
        finally:
            # Always undo the module-level swap, even if loading fails.
            _imz.PickableInferenceSession = _orig

    @staticmethod
    def make_label_masks(seg_mask: np.ndarray) -> dict[str, np.ndarray]:
        """Convert an integer seg_mask to named boolean arrays.

        Isolates the model's integer label convention here so the rule
        layer only deals with string keys and never imports SEG_LABELS.

        Args:
            seg_mask: Label map whose values index into ``SEG_LABELS``.

        Returns:
            One boolean mask per ``SEG_LABELS`` name, each the same shape
            as ``seg_mask``.
        """
        return {name: (seg_mask == i) for i, name in enumerate(SEG_LABELS)}

    def __call__(self, image: Image.Image) -> list[Person]:
        """Single-image entry point: detect, segment, classify all in one.

        Used by test scripts. The video pipeline uses
        :meth:`detect_and_segment` + :meth:`classify` separately so it
        can cache gender per IoU-tracked person and avoid the
        ~200 MB/frame heap growth that running InsightFace ONNX every
        frame produces (see docs/07-video-pipeline.md Finding 6).
        """
        people = self.detect_and_segment(image)
        for person in people:
            person.gender, person.face_det_score = self.classify(person.crop)
        return people

    def detect_and_segment(self, image: Image.Image) -> list[Person]:
        """Detect persons, segment each, return Person list with
        ``gender=None`` and ``face_det_score=0.0``.

        The video pipeline calls this once per frame, then only invokes
        :meth:`classify` for newly-detected tracks.

        Detections whose box collapses to zero width or height after
        integer truncation are dropped, because an empty crop cannot be
        segmented.
        """
        det = self.detector.predict(
            source=image,
            classes=[PERSON_CLASS_ID],
            conf=self.person_conf,
            verbose=False,
        )[0]
        boxes = det.boxes
        if boxes is None or len(boxes) == 0:
            return []

        xyxy = boxes.xyxy.cpu().numpy()
        confs = boxes.conf.cpu().numpy()

        crops: list[Image.Image] = []
        meta: list[tuple[int, int, int, int, float]] = []
        for (x1f, y1f, x2f, y2f), conf in zip(xyxy, confs):
            x1, y1, x2, y2 = int(x1f), int(y1f), int(x2f), int(y2f)
            if x2 <= x1 or y2 <= y1:
                # int() truncation can collapse a sub-pixel box to zero
                # area; skip it rather than fail the whole frame.
                continue
            crops.append(image.crop((x1, y1, x2, y2)))
            meta.append((x1, y1, x2, y2, float(conf)))

        # Batched segmentation: one forward pass over all person crops
        # in this frame instead of N sequential calls. The processor
        # always resizes inputs to 512×512 so the batch tensor is
        # (N, 3, 512, 512) regardless of crop sizes — same per-call
        # path as the single-image version, just amortizes GPU pipeline
        # overhead. On a 6-person 1280×720 frame this is ~2× the
        # throughput of sequential calls on Apple MPS.
        seg_masks = self._segment_batch(crops)

        people: list[Person] = []
        for (x1, y1, x2, y2, conf), crop, seg_mask in zip(meta, crops, seg_masks):
            people.append(Person(
                bbox=(x1, y1, x2, y2),
                det_conf=conf,
                crop=crop,
                seg_mask=seg_mask,
                gender=None,
                face_det_score=0.0,
                label_masks=Perception.make_label_masks(seg_mask),
            ))
        return people

    def classify(self, crop: Image.Image) -> tuple[str | None, float]:
        """Run face detection + gender classification on a person crop.

        Returns:
            ``(gender, face_det_score)``; ``(None, 0.0)`` when no face is
            found.
        """
        return self._classify(crop)

    @contextmanager
    def _cleanup_device_memory(self) -> Generator[None, None, None]:
        """On exit, run a GC pass and empty the MPS/CUDA cache (no cache
        call on CPU), whether or not the wrapped block raised.
        """
        try:
            yield
        finally:
            gc.collect()
            if self.device.type == "mps":
                torch.mps.empty_cache()
            elif self.device.type == "cuda":
                torch.cuda.empty_cache()

    def _segment_batch(self, crops: list[Image.Image]) -> list[np.ndarray]:
        """One forward pass for all person crops in a frame. Argmax on
        device, INTER_NEAREST upsample on CPU, per-batch empty_cache.

        Returns:
            One int32 label map per crop, resized to that crop's size;
            an empty list when ``crops`` is empty.
        """
        if not crops:
            return []
        with self._cleanup_device_memory():
            inputs = self.seg_processor(images=list(crops), return_tensors="pt").to(self.device)
            # Match the model's parameter dtype (fp16 on MPS/CUDA).
            inputs["pixel_values"] = inputs["pixel_values"].to(
                dtype=next(self.seg_model.parameters()).dtype
            )
            with torch.no_grad():
                outputs = self.seg_model(**inputs)
            # (B, 128, 128) int64 on device — argmax before CPU move keeps
            # the 18-channel logits tensor off CPU heap (Finding 7).
            pred_small = outputs.logits.argmax(dim=1)
            del outputs, inputs
            pred_small_cpu = pred_small.detach().to("cpu").numpy().astype(np.int32)
            del pred_small
            # Nearest-neighbour keeps label values intact while scaling
            # each map back to its crop's (width, height).
            return [
                cv2.resize(small, crop.size, interpolation=cv2.INTER_NEAREST)
                for small, crop in zip(pred_small_cpu, crops)
            ]

    def _classify(self, crop: Image.Image) -> tuple[str | None, float]:
        """Return the gender and detection score of the highest-scoring
        face in ``crop``, or ``(None, 0.0)`` if there is none.
        """
        # Reverse the channel axis (RGB → BGR) before face analysis.
        bgr = np.array(crop)[:, :, ::-1]
        faces = self.face_app.get(bgr)
        if not faces:
            return None, 0.0
        face = max(faces, key=lambda f: f.det_score)
        gender = "M" if int(face.gender) == 1 else "F"
        return gender, float(face.det_score)
|