occlude 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
occlude/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """OCCLUDE — blur immodestly dressed people in videos."""
2
+
3
+ __version__ = "1.0.0"
occlude/__main__.py ADDED
@@ -0,0 +1,5 @@
1
+ """Allow `python -m occlude` invocation."""
2
import sys

from occlude.cli import main

if __name__ == "__main__":
    # main() returns the CLI status code (0 / 1 / 130). Hand it to sys.exit so
    # `python -m occlude` reports failures through its process exit status
    # instead of always exiting 0.
    sys.exit(main())
occlude/cli.py ADDED
@@ -0,0 +1,195 @@
1
+ """OCCLUDE — CLI entry point.
2
+
3
+ Usage:
4
+ occlude --input <video> [--output <path>] [--blur-strength N] [--frame-stride N]
5
+
6
+ Detects immodestly dressed people frame-by-frame and writes a clean
7
+ video with the original audio preserved. See OCCLUDE_SPEC.md.
8
+ """
9
+ from __future__ import annotations
10
+
11
+ import os
12
+
13
# Diagnostic instrumentation: traps os._exit + sys.exit + fatal signals and
# dumps all-thread stacks to /tmp so a silent death pinpoints its origin.
# Activated only when OCCLUDE_DEBUG=1 to keep `--help` side-effect-free.
if os.getenv("OCCLUDE_DEBUG") == "1":
    import sys as _diag_sys
    import time as _diag_time
    import traceback as _diag_tb
    import threading as _diag_threading
    import faulthandler as _diag_faulthandler
    import signal as _diag_signal
    import atexit as _diag_atexit

    _DIAG_LOG = "/tmp/occlude_diag.log"
    _FAULT_LOG = "/tmp/occlude_faulthandler.log"

    def _diag_log(msg: str) -> None:
        """Append one timestamped line to the diagnostic log."""
        with open(_DIAG_LOG, "a") as f:
            f.write(f"[{_diag_time.strftime('%H:%M:%S')}] {msg}\n")
            f.flush()

    _real_os_exit = os._exit

    def _traced_os_exit(code):  # noqa: ANN001
        """Record who called os._exit (plus every thread's stack), then exit."""
        with open(_DIAG_LOG, "a") as f:
            f.write(
                f"\n========== os._exit({code}) at "
                f"{_diag_time.strftime('%H:%M:%S')} ==========\n"
            )
            f.write(f"main thread: {_diag_threading.main_thread().ident}\n")
            f.write(
                f"current thread: {_diag_threading.current_thread().ident} "
                f"({_diag_threading.current_thread().name})\n"
            )
            f.write("--- python stack at exit ---\n")
            _diag_tb.print_stack(file=f)
            f.write("--- all threads ---\n")
            for tid, frame in _diag_sys._current_frames().items():
                f.write(f"\n>>> thread {tid}\n")
                _diag_tb.print_stack(frame, file=f)
            f.flush()
        # Delegate only after the log file is closed so the dump is on disk.
        _real_os_exit(code)

    os._exit = _traced_os_exit

    _real_sys_exit = _diag_sys.exit

    # Default mirrors the real sys.exit (None, not 0) so a bare `sys.exit()`
    # still raises SystemExit with code None while being traced.
    def _traced_sys_exit(code=None):  # noqa: ANN001
        """Record the call site of sys.exit, then delegate to the real one."""
        with open(_DIAG_LOG, "a") as f:
            f.write(
                f"[{_diag_time.strftime('%H:%M:%S')}] sys.exit({code}) "
                "called from:\n"
            )
            _diag_tb.print_stack(file=f)
            f.flush()
        _real_sys_exit(code)

    _diag_sys.exit = _traced_sys_exit

    # Deliberately left open for the life of the process: faulthandler writes
    # to this file when a fault or registered signal arrives.
    _diag_fault_fh = open(_FAULT_LOG, "w")
    _diag_faulthandler.enable(file=_diag_fault_fh, all_threads=True)
    # Signals are looked up by name: not every platform defines all of them,
    # and a direct attribute access would raise AttributeError at import time
    # and take the whole CLI down with it.
    for _diag_signame in (
        "SIGTERM",
        "SIGINT",
        "SIGHUP",
        "SIGUSR1",
        "SIGUSR2",
        "SIGPIPE",
        "SIGQUIT",
    ):
        _diag_sig = getattr(_diag_signal, _diag_signame, None)
        if _diag_sig is None:
            _diag_log(f"signal {_diag_signame} unavailable on this platform; skipped")
            continue
        try:
            _diag_faulthandler.register(
                _diag_sig, file=_diag_fault_fh, all_threads=True, chain=True
            )
        except Exception as _e:  # noqa: BLE001
            # Best effort: diagnostics must never prevent the tool from running.
            _diag_log(f"faulthandler.register({_diag_sig}) failed: {_e}")

    def _diag_on_atexit() -> None:
        """Mark an orderly interpreter shutdown in the diagnostic log."""
        _diag_log("atexit fired (orderly python shutdown)")

    _diag_atexit.register(_diag_on_atexit)
    _diag_log(
        f"=== occlude started, pid={os.getpid()}, "
        f"argv={_diag_sys.argv} ==="
    )
97
+
98
+ import argparse
99
+ import shutil
100
+ import sys
101
+ from pathlib import Path
102
+
103
+ from rich.console import Console
104
+
105
+
106
def _parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Declare the ``occlude`` command-line options and parse *argv*.

    *argv* is handed to ``ArgumentParser.parse_args`` unchanged. Only
    argparse's own type/required checks happen here; range checks on the
    numeric options are performed by the caller.
    """
    # The blur default comes from video.DEFAULT_BLUR_KERNEL so one constant
    # controls the kernel size (the historical 51 was too soft on 1280×720
    # multi-person footage; currently 199).
    from occlude.pipeline.video import DEFAULT_BLUR_KERNEL

    cli = argparse.ArgumentParser(
        prog="occlude",
        description="Blur immodestly dressed people in a video.",
    )
    # Declared in help-output order.
    option_specs = (
        ("--input", {
            "required": True,
            "type": Path,
            "help": "path to the input video file",
        }),
        ("--output", {
            "type": Path,
            "default": None,
            "help": "output video path (default: <input_stem>_occluded.mp4 next to input)",
        }),
        ("--blur-strength", {
            "type": int,
            "default": DEFAULT_BLUR_KERNEL,
            "help": f"Gaussian blur kernel size (must be odd, default {DEFAULT_BLUR_KERNEL})",
        }),
        ("--frame-stride", {
            "type": int,
            "default": 1,
            "help": "run perception every Nth frame; skipped frames re-render the previous frame's blur (default 1 = every frame)",
        }),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)
    return cli.parse_args(argv)
133
+
134
+
135
def main(argv: list[str] | None = None) -> int:
    """Run the OCCLUDE command line and return a process exit status.

    Args:
        argv: Argument list forwarded to ``_parse_args``; ``None`` lets
            argparse fall back to its default.

    Returns:
        ``0`` on success, ``1`` on a validation or processing error, and
        ``130`` when the user interrupts processing with Ctrl-C.
    """
    args = _parse_args(argv)

    from occlude.ui.ascii_art import get_header_panel
    console = Console()
    console.print(get_header_panel())

    input_path: Path = args.input
    # is_file() is already False for a missing path, so one check suffices.
    if not input_path.is_file():
        print(f"error: input file not found: {input_path}", file=sys.stderr)
        return 1

    if args.blur_strength <= 0 or args.blur_strength % 2 == 0:
        print(
            f"error: --blur-strength must be a positive odd integer, got {args.blur_strength}",
            file=sys.stderr,
        )
        return 1

    if args.frame_stride < 1:
        print(
            f"error: --frame-stride must be >= 1, got {args.frame_stride}",
            file=sys.stderr,
        )
        return 1

    if shutil.which("ffmpeg") is None:
        print(
            "error: ffmpeg not found on PATH. Install via: brew install ffmpeg",
            file=sys.stderr,
        )
        return 1

    output_path: Path = args.output or (
        input_path.parent / f"{input_path.stem}_occluded.mp4"
    )

    # Writing the result onto the source file would destroy the original,
    # so refuse an --output that resolves to the input.
    if output_path.resolve() == input_path.resolve():
        print(
            f"error: output path must differ from input: {output_path}",
            file=sys.stderr,
        )
        return 1

    # Report an unusable output directory in the same `error:` style as the
    # other validation failures rather than letting a traceback escape.
    try:
        output_path.parent.mkdir(parents=True, exist_ok=True)
    except OSError as e:
        print(
            f"error: cannot create output directory {output_path.parent}: {e}",
            file=sys.stderr,
        )
        return 1

    # Imported here so `--help` and arg validation don't pay the
    # multi-second model-loading cost.
    from occlude.pipeline.video import VideoProcessor

    try:
        processor = VideoProcessor(
            blur_kernel=args.blur_strength,
            frame_stride=args.frame_stride,
        )
        processor.process(input_path, output_path)
    except KeyboardInterrupt:
        print("\ncancelled by user.", file=sys.stderr)
        return 130
    except Exception as e:
        # Top-level boundary: turn any pipeline failure into a one-line
        # message and a non-zero status.
        print(f"error: {e}", file=sys.stderr)
        return 1

    print(f"done. output: {output_path}")
    return 0


if __name__ == "__main__":
    sys.exit(main())
occlude/pipeline/__init__.py ADDED
@@ -0,0 +1,21 @@
1
+ """Pipeline package — perception, rules, blur, video.
2
+
3
+ Stage 4 lives in :mod:`occlude.pipeline.perception` and produces the data
4
+ structure the Stage 5 rule layer consumes.
5
+ """
6
+
7
+ from occlude.pipeline.perception import (
8
+ SEG_LABELS,
9
+ TARGET_LABELS,
10
+ Perception,
11
+ Perceiver,
12
+ Person,
13
+ )
14
+ from occlude.pipeline.rules import Decision, RuleEngine
15
+ from occlude.pipeline.video import VideoProcessor, Tracker, blur_region
16
+
17
+ __all__ = [
18
+ "SEG_LABELS", "TARGET_LABELS", "Perception", "Perceiver", "Person",
19
+ "Decision", "RuleEngine",
20
+ "VideoProcessor", "Tracker", "blur_region",
21
+ ]
occlude/pipeline/perception.py ADDED
@@ -0,0 +1,301 @@
1
+ """Stage 4 — Perception.
2
+
3
+ Wraps the YOLO person detector, the SegFormer body-part segmenter, and
4
+ the InsightFace gender classifier as a single callable that maps an
5
+ image to a list of per-person observations. This is the data structure
6
+ the Stage 5 rule layer consumes.
7
+
8
+ The class loads all three models once on construction; calling the
9
+ instance on an image runs detection, then segments + classifies each
10
+ person crop.
11
+ """
12
+ import gc
13
+ from contextlib import contextmanager
14
+ from dataclasses import dataclass
15
+ from typing import Generator, Protocol, runtime_checkable
16
+
17
+ import cv2
18
+ import numpy as np
19
+ import onnxruntime
20
+ import torch
21
+ from insightface.app import FaceAnalysis
22
+ from insightface.model_zoo import model_zoo as _imz
23
+ from PIL import Image
24
+ from transformers import AutoModelForSemanticSegmentation, SegformerImageProcessor
25
+ from ultralytics import YOLO
26
+
27
+
28
class _NoArenaPickableSession(_imz.PickableInferenceSession):
    """InsightFace ``PickableInferenceSession`` that turns off the ONNX
    Runtime CPU memory arena and memory-pattern cache.

    ORT's per-session arena keeps freed buffers around for reuse.
    Measured on multi-person video, every `face_app.get(bgr)` call leaked
    ~45 MB/person into compressed-VM pages that macOS counts toward the
    Jetsam footprint (vmmap MALLOC_LARGE virtual = 4.6 GB at frame 25 of
    laughing_people.mp4 with the arena on, ~250 MB total with it off).
    Turning the arena off costs a little inference speed and buys bounded
    steady-state memory.
    """

    def __init__(self, model_path, **kwargs):
        # A caller-supplied sess_options is passed through untouched.
        if "sess_options" in kwargs:
            super().__init__(model_path, **kwargs)
            return

        session_options = onnxruntime.SessionOptions()
        session_options.enable_cpu_mem_arena = False
        # mem_pattern caches an allocation plan per input shape. InsightFace
        # is fed variable-sized BGR crops, so every new shape grew that
        # cache; disabling it forces fresh allocations and bounds the heap.
        session_options.enable_mem_pattern = False
        super().__init__(model_path, sess_options=session_options, **kwargs)
53
+
54
# Model identifiers and detection settings shared across this module.
YOLO_MODEL_ID = "yolov8n.pt"
SEG_MODEL_ID = "mattmdjaga/segformer_b2_clothes"
INSIGHT_MODEL_ID = "buffalo_l"
PERSON_CLASS_ID = 0
DEFAULT_PERSON_CONF = 0.40
INSIGHT_DET_SIZE = (640, 640)

# The 18 classes of `mattmdjaga/segformer_b2_clothes`. A label's position
# in this list is the integer value it has in the predicted mask.
SEG_LABELS = [
    "Background",     # 0
    "Hat",            # 1
    "Hair",           # 2
    "Sunglasses",     # 3
    "Upper-clothes",  # 4
    "Skirt",          # 5
    "Pants",          # 6
    "Dress",          # 7
    "Belt",           # 8
    "Left-shoe",      # 9
    "Right-shoe",     # 10
    "Face",           # 11
    "Left-leg",       # 12
    "Right-leg",      # 13
    "Left-arm",       # 14
    "Right-arm",      # 15
    "Bag",            # 16
    "Scarf",          # 17
]

# The labels the modesty rule layer looks at. Defined here (rather than in
# test scripts) so Stage 5 can import the very same set.
TARGET_LABELS = frozenset((
    "Hair",
    "Hat",
    "Scarf",
    "Upper-clothes",
    "Pants",
    "Skirt",
    "Dress",
    "Face",
    "Left-leg",
    "Right-leg",
    "Left-arm",
    "Right-arm",
))
76
+
77
+
78
@dataclass
class Person:
    """One detected person, as handed to the rule layer.

    Attributes:
        bbox: YOLO bounding box in source-image pixel coordinates.
        det_conf: YOLO confidence for the person class.
        crop: RGB image cropped at ``bbox``.
        seg_mask: SegFormer per-pixel label map with the same H×W as
            ``crop``; values are indices into ``SEG_LABELS``.
        gender: InsightFace gender, ``'M'`` or ``'F'``, or ``None`` when no
            face was found.
        face_det_score: InsightFace *face detection* confidence. It says
            "is there a face here?", not "is the gender prediction
            reliable" — Stage 3 (`docs/04-gender-classifier.md`,
            Finding 2) is explicit about that distinction. ``0.0`` when no
            face was detected.
        label_masks: One boolean mask per ``SEG_LABELS`` entry, same H×W as
            ``seg_mask``. Filled in by ``Perception.detect_and_segment`` so
            the rule layer works with string names only and never needs
            the integer label indices.
    """

    bbox: tuple[int, int, int, int]
    det_conf: float
    crop: Image.Image
    seg_mask: np.ndarray
    gender: str | None
    face_det_score: float
    label_masks: dict[str, np.ndarray]
102
+
103
+
104
@runtime_checkable
class Perceiver(Protocol):
    """Structural interface the video pipeline is written against.

    :class:`Perception` is the single production implementation. Any fake
    that satisfies this protocol can be injected into
    :class:`VideoProcessor` to exercise the tracking / temporal-smoothing
    logic without loading model weights.
    """

    def detect_and_segment(self, image: Image.Image) -> list[Person]:
        """Return one ``Person`` per person found in *image*."""
        ...

    def classify(self, crop: Image.Image) -> tuple[str | None, float]:
        """Return ``(gender, face_det_score)`` for one person crop."""
        ...
116
+
117
+
118
+ def _pick_device() -> torch.device:
119
+ if torch.backends.mps.is_available():
120
+ return torch.device("mps")
121
+ if torch.cuda.is_available():
122
+ return torch.device("cuda")
123
+ return torch.device("cpu")
124
+
125
+
126
class Perception:
    """Production perceiver: YOLO person detection, SegFormer body-part
    segmentation and InsightFace gender classification.

    All three models are loaded once by the constructor. Calling the
    instance on an image runs the full detect → segment → classify chain;
    :meth:`detect_and_segment` and :meth:`classify` expose the two halves
    separately.
    """

    def __init__(self, person_conf: float = DEFAULT_PERSON_CONF) -> None:
        """Load the detector, segmenter and face models.

        Args:
            person_conf: Confidence threshold passed to the YOLO detector
                as ``conf``.
        """
        self.person_conf = person_conf
        self.device = _pick_device()

        self.detector = YOLO(YOLO_MODEL_ID)

        self.seg_processor = SegformerImageProcessor.from_pretrained(SEG_MODEL_ID)
        self.seg_model = (
            AutoModelForSemanticSegmentation.from_pretrained(SEG_MODEL_ID)
            .to(self.device)
            .eval()
        )
        # fp16: logits are immediately argmax'd so precision loss is benign.
        # CPU fp16 support in PyTorch is incomplete; restrict to MPS/CUDA.
        if self.device.type in ("mps", "cuda"):
            self.seg_model = self.seg_model.half()

        # torch.compile fuses transformer attention kernels on top of fp16.
        # dynamic=True treats the batch dim as symbolic so frames with
        # different person counts (1 person, then 3, then 2 …) don't each
        # trigger a recompile. First batch incurs one-time JIT compilation
        # (~10-30 frames of wall-time warm-up at 1 fps baseline); all
        # subsequent batches run fused. On MPS, unsupported ops fall back to
        # eager silently — no correctness risk. Silently skip if the backend
        # raises (e.g. an inductor limitation on this torch build).
        # NOTE(review): compilation is described above as happening on the
        # first batch, so a backend failure may surface in _segment_batch
        # rather than here — confirm this guard covers the intended case.
        if self.device.type in ("mps", "cuda"):
            try:
                self.seg_model = torch.compile(self.seg_model, dynamic=True)
            except Exception:
                # Deliberate best effort: keep the uncompiled model.
                pass

        # Two scoped tweaks for memory bounding:
        #
        # 1. `allowed_modules` filters the buffalo_l bundle from 5
        #    sub-models down to the 2 we actually consume. We only
        #    read `face.gender` and `face.det_score`, so landmarks
        #    (2d_106, 3d_68) and recognition embeddings are pure
        #    overhead — fewer models per call = less per-call CPU
        #    allocation.
        # 2. Swap insightface's PickableInferenceSession for our
        #    arena/mem_pattern-disabled subclass during construction,
        #    then restore — keeps the patch scoped to this instance.
        _orig = _imz.PickableInferenceSession
        _imz.PickableInferenceSession = _NoArenaPickableSession
        try:
            self.face_app = FaceAnalysis(
                name=INSIGHT_MODEL_ID,
                providers=["CPUExecutionProvider"],
                allowed_modules=["detection", "genderage"],
            )
            self.face_app.prepare(ctx_id=0, det_size=INSIGHT_DET_SIZE)
        finally:
            # Always undo the module-level swap, even if loading fails.
            _imz.PickableInferenceSession = _orig

    @staticmethod
    def make_label_masks(seg_mask: np.ndarray) -> dict[str, np.ndarray]:
        """Convert an integer seg_mask to named boolean arrays.

        Isolates the model's integer label convention here so the rule
        layer only deals with string keys and never imports SEG_LABELS.

        Returns:
            A dict with one entry per ``SEG_LABELS`` name; each value is
            ``seg_mask == index`` for that name's index.
        """
        return {name: (seg_mask == i) for i, name in enumerate(SEG_LABELS)}

    def __call__(self, image: Image.Image) -> list[Person]:
        """Single-image entry point: detect, segment, classify all in one.

        Used by test scripts. The video pipeline uses
        :meth:`detect_and_segment` + :meth:`classify` separately so it
        can cache gender per IoU-tracked person and avoid the
        ~200 MB/frame heap growth that running InsightFace ONNX every
        frame produces (see docs/07-video-pipeline.md Finding 6).
        """
        people = self.detect_and_segment(image)
        for person in people:
            person.gender, person.face_det_score = self.classify(person.crop)
        return people

    def detect_and_segment(self, image: Image.Image) -> list[Person]:
        """Detect persons, segment each, return Person list with
        ``gender=None`` and ``face_det_score=0.0``.

        The video pipeline calls this once per frame, then only invokes
        :meth:`classify` for newly-detected tracks.

        Detections whose box has no area once its corners are truncated to
        integers are dropped, so every returned ``Person`` has a non-empty
        crop.
        """
        det = self.detector.predict(
            source=image,
            classes=[PERSON_CLASS_ID],
            conf=self.person_conf,
            verbose=False,
        )[0]
        boxes = det.boxes
        if boxes is None or len(boxes) == 0:
            return []

        xyxy = boxes.xyxy.cpu().numpy()
        confs = boxes.conf.cpu().numpy()

        crops: list[Image.Image] = []
        meta: list[tuple[int, int, int, int, float]] = []
        for (x1f, y1f, x2f, y2f), conf in zip(xyxy, confs):
            x1, y1, x2, y2 = int(x1f), int(y1f), int(x2f), int(y2f)
            # int() truncation can collapse a sliver-thin box to zero width
            # or height. An empty crop cannot be segmented or resized, so
            # skip it instead of failing the whole frame.
            if x2 <= x1 or y2 <= y1:
                continue
            crops.append(image.crop((x1, y1, x2, y2)))
            meta.append((x1, y1, x2, y2, float(conf)))

        # Batched segmentation: one forward pass over all person crops
        # in this frame instead of N sequential calls. The processor
        # always resizes inputs to 512×512 so the batch tensor is
        # (N, 3, 512, 512) regardless of crop sizes — same per-call
        # path as the single-image version, just amortizes GPU pipeline
        # overhead. On a 6-person 1280×720 frame this is ~2× the
        # throughput of sequential calls on Apple MPS.
        seg_masks = self._segment_batch(crops)

        return [
            Person(
                bbox=(x1, y1, x2, y2),
                det_conf=conf,
                crop=crop,
                seg_mask=seg_mask,
                gender=None,
                face_det_score=0.0,
                label_masks=Perception.make_label_masks(seg_mask),
            )
            for (x1, y1, x2, y2, conf), crop, seg_mask in zip(meta, crops, seg_masks)
        ]

    def classify(self, crop: Image.Image) -> tuple[str | None, float]:
        """Run face detection + gender classification on a person crop.

        Returns:
            ``(gender, face_det_score)``; ``(None, 0.0)`` when no face is
            found.
        """
        return self._classify(crop)

    @contextmanager
    def _cleanup_device_memory(self) -> Generator[None, None, None]:
        """Context manager that, on exit, runs the garbage collector and
        empties the MPS/CUDA cache for the active device (nothing extra on
        CPU). The cleanup runs even if the body raises.
        """
        try:
            yield
        finally:
            gc.collect()
            if self.device.type == "mps":
                torch.mps.empty_cache()
            elif self.device.type == "cuda":
                torch.cuda.empty_cache()

    def _segment_batch(self, crops: list[Image.Image]) -> list[np.ndarray]:
        """One forward pass for all person crops in a frame. Argmax on
        device, INTER_NEAREST upsample on CPU, per-batch empty_cache.

        Returns:
            One int32 label map per crop, resized to that crop's size; an
            empty list when *crops* is empty.
        """
        if not crops:
            return []
        with self._cleanup_device_memory():
            inputs = self.seg_processor(images=list(crops), return_tensors="pt").to(self.device)
            # Match the model's parameter dtype (fp16 on MPS/CUDA).
            inputs["pixel_values"] = inputs["pixel_values"].to(
                dtype=next(self.seg_model.parameters()).dtype
            )
            with torch.no_grad():
                outputs = self.seg_model(**inputs)
                # (B, 128, 128) int64 on device — argmax before CPU move keeps
                # the 18-channel logits tensor off CPU heap (Finding 7).
                pred_small = outputs.logits.argmax(dim=1)
            del outputs, inputs
            pred_small_cpu = pred_small.detach().to("cpu").numpy().astype(np.int32)
            del pred_small
            # crop.size is (width, height), the order cv2.resize takes for
            # dsize; nearest-neighbour keeps the values valid label indices.
            return [
                cv2.resize(pred_small_cpu[i], crop.size, interpolation=cv2.INTER_NEAREST)
                for i, crop in enumerate(crops)
            ]

    def _classify(self, crop: Image.Image) -> tuple[str | None, float]:
        """Return the gender and detection score of the most confident
        face in *crop*, or ``(None, 0.0)`` when no face is detected.
        """
        # The channel flip below is only valid for a 3-channel RGB image:
        # a grayscale crop has no channel axis and RGBA would reverse to
        # four channels. Normalise the mode first.
        rgb = crop if crop.mode == "RGB" else crop.convert("RGB")
        # Reversing the channel axis yields a negative-stride view; hand
        # the face model an ordinary contiguous BGR array instead.
        bgr = np.ascontiguousarray(np.asarray(rgb)[:, :, ::-1])
        faces = self.face_app.get(bgr)
        if not faces:
            return None, 0.0
        face = max(faces, key=lambda f: f.det_score)
        gender = "M" if int(face.gender) == 1 else "F"
        return gender, float(face.det_score)