media-engine 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. cli/clip.py +79 -0
  2. cli/faces.py +91 -0
  3. cli/metadata.py +68 -0
  4. cli/motion.py +77 -0
  5. cli/objects.py +94 -0
  6. cli/ocr.py +93 -0
  7. cli/scenes.py +57 -0
  8. cli/telemetry.py +65 -0
  9. cli/transcript.py +76 -0
  10. media_engine/__init__.py +7 -0
  11. media_engine/_version.py +34 -0
  12. media_engine/app.py +80 -0
  13. media_engine/batch/__init__.py +56 -0
  14. media_engine/batch/models.py +99 -0
  15. media_engine/batch/processor.py +1131 -0
  16. media_engine/batch/queue.py +232 -0
  17. media_engine/batch/state.py +30 -0
  18. media_engine/batch/timing.py +321 -0
  19. media_engine/cli.py +17 -0
  20. media_engine/config.py +674 -0
  21. media_engine/extractors/__init__.py +75 -0
  22. media_engine/extractors/clip.py +401 -0
  23. media_engine/extractors/faces.py +459 -0
  24. media_engine/extractors/frame_buffer.py +351 -0
  25. media_engine/extractors/frames.py +402 -0
  26. media_engine/extractors/metadata/__init__.py +127 -0
  27. media_engine/extractors/metadata/apple.py +169 -0
  28. media_engine/extractors/metadata/arri.py +118 -0
  29. media_engine/extractors/metadata/avchd.py +208 -0
  30. media_engine/extractors/metadata/avchd_gps.py +270 -0
  31. media_engine/extractors/metadata/base.py +688 -0
  32. media_engine/extractors/metadata/blackmagic.py +139 -0
  33. media_engine/extractors/metadata/camera_360.py +276 -0
  34. media_engine/extractors/metadata/canon.py +290 -0
  35. media_engine/extractors/metadata/dji.py +371 -0
  36. media_engine/extractors/metadata/dv.py +121 -0
  37. media_engine/extractors/metadata/ffmpeg.py +76 -0
  38. media_engine/extractors/metadata/generic.py +119 -0
  39. media_engine/extractors/metadata/gopro.py +256 -0
  40. media_engine/extractors/metadata/red.py +305 -0
  41. media_engine/extractors/metadata/registry.py +114 -0
  42. media_engine/extractors/metadata/sony.py +442 -0
  43. media_engine/extractors/metadata/tesla.py +157 -0
  44. media_engine/extractors/motion.py +765 -0
  45. media_engine/extractors/objects.py +245 -0
  46. media_engine/extractors/objects_qwen.py +754 -0
  47. media_engine/extractors/ocr.py +268 -0
  48. media_engine/extractors/scenes.py +82 -0
  49. media_engine/extractors/shot_type.py +217 -0
  50. media_engine/extractors/telemetry.py +262 -0
  51. media_engine/extractors/transcribe.py +579 -0
  52. media_engine/extractors/translate.py +121 -0
  53. media_engine/extractors/vad.py +263 -0
  54. media_engine/main.py +68 -0
  55. media_engine/py.typed +0 -0
  56. media_engine/routers/__init__.py +15 -0
  57. media_engine/routers/batch.py +78 -0
  58. media_engine/routers/health.py +93 -0
  59. media_engine/routers/models.py +211 -0
  60. media_engine/routers/settings.py +87 -0
  61. media_engine/routers/utils.py +135 -0
  62. media_engine/schemas.py +581 -0
  63. media_engine/utils/__init__.py +5 -0
  64. media_engine/utils/logging.py +54 -0
  65. media_engine/utils/memory.py +49 -0
  66. media_engine-0.1.0.dist-info/METADATA +276 -0
  67. media_engine-0.1.0.dist-info/RECORD +70 -0
  68. media_engine-0.1.0.dist-info/WHEEL +4 -0
  69. media_engine-0.1.0.dist-info/entry_points.txt +11 -0
  70. media_engine-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,459 @@
1
+ """Face detection using DeepFace with Facenet."""
2
+
3
+ import base64
4
+ import gc
5
+ import io
6
+ import logging
7
+ import shutil
8
+ import tempfile
9
+ from pathlib import Path
10
+ from typing import TypeAlias
11
+
12
+ import numpy as np
13
+ from PIL import Image
14
+
15
+ from media_engine.extractors.frame_buffer import SharedFrameBuffer
16
+ from media_engine.schemas import (
17
+ BoundingBox,
18
+ FaceDetection,
19
+ FacesResult,
20
+ )
21
+
22
+ Embedding: TypeAlias = list[float]
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
def unload_face_model() -> None:
    """Release DeepFace's internally cached detection/embedding models.

    DeepFace keeps loaded models in module-level caches; this wipes those
    caches and then nudges the garbage collector and the GPU allocator so
    the memory is actually returned. Does nothing if DeepFace was never
    imported in this process.
    """
    import sys

    # Importing deepface just to unload it (e.g. during shutdown) would be
    # wasteful, so bail out unless it is already loaded.
    if "deepface" not in sys.modules:
        return

    logger.info("Unloading face detection models to free memory")

    try:
        import torch

        def _wipe_cache(module: object) -> None:
            # Both historical cache locations expose a `model_obj` dict.
            if hasattr(module, "model_obj"):
                setattr(module, "model_obj", {})

        # Newer DeepFace versions cache models in deepface.modules.modeling.
        try:
            from deepface.modules import modeling  # type: ignore[import-not-found]

            _wipe_cache(modeling)
        except (ImportError, AttributeError):
            pass

        # Older releases kept the cache in DeepFace.commons.functions.
        try:
            from deepface.commons import functions  # type: ignore[import-not-found]

            _wipe_cache(functions)
        except (ImportError, AttributeError):
            pass

        gc.collect()

        # Flush whichever accelerator backend is active.
        if torch.cuda.is_available():
            torch.cuda.synchronize()
            torch.cuda.empty_cache()
        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
            if hasattr(torch.mps, "synchronize"):
                torch.mps.synchronize()
            if hasattr(torch.mps, "empty_cache"):
                torch.mps.empty_cache()

        gc.collect()
        logger.info("Face detection models unloaded")
    except Exception as e:
        logger.warning(f"Error unloading face models: {e}")
77
+
78
+
79
def extract_faces(
    file_path: str,
    frame_buffer: SharedFrameBuffer,
    min_face_size: int = 80,
    min_confidence: float = 0.5,  # Lowered from 0.9 - user can discard false positives
    extract_images: bool = True,
    face_image_size: int = 160,  # Output face thumbnail size
) -> FacesResult:
    """Extract faces from video frames using DeepFace.

    Detection uses the RetinaFace backend on pre-decoded frames; each
    accepted face gets a Facenet512 embedding computed from a padded,
    in-memory crop. Raw detections are then deduplicated so roughly one
    face per person is returned.

    Args:
        file_path: Path to video file (existence check and logging only;
            pixel data comes from frame_buffer)
        frame_buffer: Pre-decoded frames from SharedFrameBuffer
        min_face_size: Minimum face size in pixels
        min_confidence: Minimum detection confidence
        extract_images: Whether to extract face thumbnail images
        face_image_size: Size of output face thumbnails (square)

    Returns:
        FacesResult with detected faces, embeddings, and optional images

    Raises:
        FileNotFoundError: If file_path does not exist
    """
    from deepface import DeepFace  # type: ignore[import-not-found]

    path = Path(file_path)
    if not path.exists():
        raise FileNotFoundError(f"Video file not found: {file_path}")

    # NOTE: embeddings are generated directly from in-memory crops, so no
    # scratch directory is needed (an earlier revision created an unused
    # tempfile.mkdtemp here).
    detections: list[FaceDetection] = []
    all_embeddings: list[Embedding] = []
    frame_size: tuple[int, int] | None = None

    def process_frame(frame_rgb: np.ndarray, frame_pil: Image.Image, timestamp: float) -> None:
        """Detect faces in one frame, appending results to the outer lists."""
        nonlocal frame_size

        if frame_size is None:
            frame_size = (frame_rgb.shape[1], frame_rgb.shape[0])  # (width, height)

        try:
            # Detect faces in frame (DeepFace accepts numpy arrays)
            faces = DeepFace.extract_faces(
                img_path=frame_rgb,
                detector_backend="retinaface",
                enforce_detection=False,
                align=True,
            )

            if faces:
                logger.debug(f"RetinaFace found {len(faces)} raw detections at {timestamp}s")

            for face in faces:
                # Skip low confidence detections
                confidence = face.get("confidence", 0)
                if confidence < min_confidence:
                    logger.debug(f"Skipping face at {timestamp}s: confidence {confidence:.2f} < {min_confidence}")
                    continue

                # Get bounding box
                region: dict[str, int] = face.get("facial_area", {})
                x, y = region.get("x", 0), region.get("y", 0)
                w, h = region.get("w", 0), region.get("h", 0)

                # Skip small faces
                if w < min_face_size or h < min_face_size:
                    logger.debug(f"Skipping face at {timestamp}s: size {w}x{h} < {min_face_size}px")
                    continue

                # Crop face with padding for better embedding context
                face_crop = _crop_face_with_padding(frame_pil, x, y, w, h, padding=0.3)

                # Generate embedding from the cropped face (in memory)
                embedding: Embedding = []
                try:
                    crop_array = np.array(face_crop)

                    embedding_result = DeepFace.represent(
                        img_path=crop_array,
                        model_name="Facenet512",
                        detector_backend="skip",  # Already cropped
                        enforce_detection=False,
                    )

                    if embedding_result and len(embedding_result) > 0:
                        first_result = embedding_result[0]
                        if isinstance(first_result, dict):
                            embedding = first_result.get("embedding", [])
                except Exception as e:
                    logger.warning(f"Failed to generate embedding: {e}")

                # Create face thumbnail
                image_base64: str | None = None
                if extract_images:
                    image_base64 = _encode_face_image(face_crop, face_image_size)

                detection = FaceDetection(
                    timestamp=round(float(timestamp), 2),
                    bbox=BoundingBox(x=x, y=y, width=w, height=h),
                    confidence=round(float(confidence), 3),
                    embedding=embedding,
                    image_base64=image_base64,
                )
                detections.append(detection)

                if embedding:
                    all_embeddings.append(embedding)

        except Exception as e:
            logger.warning(f"Failed to process frame at {timestamp}s: {e}")

    # Process frames from the shared buffer in timestamp order
    logger.info(f"Processing {len(frame_buffer.frames)} frames for face detection")
    for ts in sorted(frame_buffer.frames.keys()):
        shared_frame = frame_buffer.frames[ts]
        process_frame(shared_frame.rgb, shared_frame.pil, ts)

    # Cluster faces and keep the best detection per person
    unique_faces, unique_estimate = _deduplicate_faces(detections, all_embeddings, frame_size=frame_size)

    needs_review = sum(1 for f in unique_faces if f.needs_review)
    logger.info(f"Detected {len(detections)} faces, {unique_estimate} unique, {needs_review} need review")

    return FacesResult(
        count=len(detections),
        unique_estimate=unique_estimate,
        detections=unique_faces,  # Only return deduplicated faces
    )
214
+
215
+
216
+ def _crop_face_with_padding(img: Image.Image, x: int, y: int, w: int, h: int, padding: float = 0.3) -> Image.Image:
217
+ """Crop face region with padding for better context.
218
+
219
+ Args:
220
+ img: Source image
221
+ x, y, w, h: Face bounding box
222
+ padding: Padding as fraction of face size (0.3 = 30%)
223
+
224
+ Returns:
225
+ Cropped face image
226
+ """
227
+ img_w, img_h = img.size
228
+
229
+ # Add padding
230
+ pad_w = int(w * padding)
231
+ pad_h = int(h * padding)
232
+
233
+ x1 = max(0, x - pad_w)
234
+ y1 = max(0, y - pad_h)
235
+ x2 = min(img_w, x + w + pad_w)
236
+ y2 = min(img_h, y + h + pad_h)
237
+
238
+ return img.crop((x1, y1, x2, y2))
239
+
240
+
241
def _encode_face_image(face_img: Image.Image, size: int) -> str:
    """Return *face_img* as a base64-encoded JPEG thumbnail of size x size."""
    # Square thumbnail, high-quality resampling
    thumbnail = face_img.resize((size, size), Image.Resampling.LANCZOS)

    # Serialize to JPEG in memory, then base64-encode
    jpeg_buffer = io.BytesIO()
    thumbnail.save(jpeg_buffer, format="JPEG", quality=85)

    return base64.b64encode(jpeg_buffer.getvalue()).decode("utf-8")
252
+
253
+
254
+ def _bbox_iou(box1: BoundingBox, box2: BoundingBox) -> float:
255
+ """Calculate Intersection over Union of two bounding boxes."""
256
+ x1 = max(box1.x, box2.x)
257
+ y1 = max(box1.y, box2.y)
258
+ x2 = min(box1.x + box1.width, box2.x + box2.width)
259
+ y2 = min(box1.y + box1.height, box2.y + box2.height)
260
+
261
+ if x2 <= x1 or y2 <= y1:
262
+ return 0.0
263
+
264
+ intersection = (x2 - x1) * (y2 - y1)
265
+ area1 = box1.width * box1.height
266
+ area2 = box2.width * box2.height
267
+ union = area1 + area2 - intersection
268
+
269
+ return intersection / union if union > 0 else 0.0
270
+
271
+
272
+ def _embedding_distance(emb1: Embedding, emb2: Embedding) -> float:
273
+ """Compute cosine distance between two embeddings."""
274
+ a = np.array(emb1)
275
+ b = np.array(emb2)
276
+ return 1.0 - float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
277
+
278
+
279
def check_faces_are_known(
    new_faces: FacesResult,
    known_embeddings: list[Embedding],
    threshold: float = 0.5,
) -> tuple[bool, list[Embedding]]:
    """Compare detected faces against a gallery of known embeddings.

    Args:
        new_faces: Newly detected faces
        known_embeddings: List of known face embeddings
        threshold: Maximum cosine distance to count as a match

    Returns:
        Tuple of (all_known, new_embeddings):
        - all_known: True when every face with an embedding matched the gallery
          (vacuously True if there are no detections)
        - new_embeddings: embeddings of faces that matched nothing
    """
    if not new_faces.detections:
        return True, []

    unmatched: list[Embedding] = []

    for detection in new_faces.detections:
        emb = detection.embedding
        if not emb:
            # Faces without embeddings can't be compared; skip them.
            continue

        # Short-circuits on the first gallery entry within the threshold;
        # empty gallery entries are skipped by the `known and` guard.
        matched = any(
            known and _embedding_distance(emb, known) < threshold
            for known in known_embeddings
        )
        if not matched:
            unmatched.append(emb)

    return not unmatched, unmatched
321
+
322
+
323
def _find_position_match(
    det: FaceDetection,
    persons: list[list[tuple[int, Embedding]]],
    detections: list[FaceDetection],
    max_time_gap: float,
    min_iou: float,
) -> int | None:
    """Return the index of a person whose recent bbox overlaps *det*, else None.

    Each person's detections are scanned newest-first; once an entry falls
    outside *max_time_gap*, older entries in that track are skipped too.
    """
    for person_idx, track in enumerate(persons):
        for earlier_idx, _ in reversed(track):
            earlier = detections[earlier_idx]
            if det.timestamp - earlier.timestamp > max_time_gap:
                break  # Everything before this is even older.
            if _bbox_iou(det.bbox, earlier.bbox) >= min_iou:
                return person_idx
    return None
343
+
344
+
345
def _find_embedding_match(
    emb: Embedding,
    persons: list[list[tuple[int, Embedding]]],
    threshold: float,
) -> int | None:
    """Return the person index whose stored embedding is closest to *emb*.

    Only matches closer than *threshold* count; returns None when *emb* is
    empty or nothing qualifies.
    """
    if not emb:
        return None

    closest: int | None = None
    closest_dist = float("inf")

    for person_idx, track in enumerate(persons):
        for _, stored in track:
            if not stored:
                continue  # No embedding recorded for this detection.
            dist = _embedding_distance(emb, stored)
            if dist < threshold and dist < closest_dist:
                closest_dist = dist
                closest = person_idx

    return closest
366
+
367
+
368
+ def _is_near_edge(bbox: BoundingBox, frame_width: int, frame_height: int, margin: float = 0.05) -> bool:
369
+ """Check if bbox is near frame edge (partially out of frame)."""
370
+ margin_x = int(frame_width * margin)
371
+ margin_y = int(frame_height * margin)
372
+
373
+ return bbox.x < margin_x or bbox.y < margin_y or bbox.x + bbox.width > frame_width - margin_x or bbox.y + bbox.height > frame_height - margin_y
374
+
375
+
376
+ def _select_best_faces(
377
+ persons: list[list[tuple[int, Embedding]]],
378
+ detections: list[FaceDetection],
379
+ frame_size: tuple[int, int] | None = None,
380
+ ) -> list[FaceDetection]:
381
+ """Select best face (highest confidence) per person and flag uncertain ones."""
382
+ result: list[FaceDetection] = []
383
+
384
+ for person_dets in persons:
385
+ det_indices = [idx for idx, _ in person_dets]
386
+ best_idx = max(det_indices, key=lambda i: detections[i].confidence)
387
+ face = detections[best_idx].model_copy()
388
+
389
+ # Flag for review if uncertain
390
+ reasons: list[str] = []
391
+
392
+ # Check if near frame edge
393
+ if frame_size:
394
+ if _is_near_edge(face.bbox, frame_size[0], frame_size[1]):
395
+ reasons.append("near_edge")
396
+
397
+ # Check if low confidence
398
+ if face.confidence < 0.95:
399
+ reasons.append("low_confidence")
400
+
401
+ # Check if only one detection for this person (no tracking confirmation)
402
+ if len(person_dets) == 1:
403
+ reasons.append("single_detection")
404
+
405
+ if reasons:
406
+ face.needs_review = True
407
+ face.review_reason = ", ".join(reasons)
408
+
409
+ result.append(face)
410
+
411
+ result.sort(key=lambda d: d.timestamp)
412
+ return result
413
+
414
+
415
+ def _deduplicate_faces(
416
+ detections: list[FaceDetection],
417
+ embeddings: list[Embedding],
418
+ frame_size: tuple[int, int] | None = None,
419
+ max_time_gap: float = 5.0,
420
+ min_iou: float = 0.2,
421
+ embedding_threshold: float = 0.5,
422
+ ) -> tuple[list[FaceDetection], int]:
423
+ """Deduplicate faces using position tracking + embedding fallback.
424
+
425
+ Strategy:
426
+ 1. Try position-based matching (bbox overlap within time window)
427
+ 2. Fall back to embedding similarity for non-adjacent detections
428
+ """
429
+ if not detections:
430
+ return [], 0
431
+
432
+ if len(detections) == 1:
433
+ # Single detection - flag for review
434
+ face = detections[0].model_copy()
435
+ face.needs_review = True
436
+ face.review_reason = "single_detection"
437
+ return [face], 1
438
+
439
+ # Sort by timestamp
440
+ sorted_dets = sorted(enumerate(detections), key=lambda x: x[1].timestamp)
441
+
442
+ # Track persons: list of (detection_idx, embedding) per person
443
+ persons: list[list[tuple[int, Embedding]]] = []
444
+
445
+ for det_idx, det in sorted_dets:
446
+ emb = embeddings[det_idx] if det_idx < len(embeddings) else []
447
+
448
+ # Try position match first, then embedding match
449
+ match = _find_position_match(det, persons, detections, max_time_gap, min_iou)
450
+ if match is None:
451
+ match = _find_embedding_match(emb, persons, embedding_threshold)
452
+
453
+ if match is not None:
454
+ persons[match].append((det_idx, emb))
455
+ else:
456
+ persons.append([(det_idx, emb)])
457
+
458
+ result = _select_best_faces(persons, detections, frame_size)
459
+ return result, len(persons)