media_engine-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.
Files changed (70)
  1. cli/clip.py +79 -0
  2. cli/faces.py +91 -0
  3. cli/metadata.py +68 -0
  4. cli/motion.py +77 -0
  5. cli/objects.py +94 -0
  6. cli/ocr.py +93 -0
  7. cli/scenes.py +57 -0
  8. cli/telemetry.py +65 -0
  9. cli/transcript.py +76 -0
  10. media_engine/__init__.py +7 -0
  11. media_engine/_version.py +34 -0
  12. media_engine/app.py +80 -0
  13. media_engine/batch/__init__.py +56 -0
  14. media_engine/batch/models.py +99 -0
  15. media_engine/batch/processor.py +1131 -0
  16. media_engine/batch/queue.py +232 -0
  17. media_engine/batch/state.py +30 -0
  18. media_engine/batch/timing.py +321 -0
  19. media_engine/cli.py +17 -0
  20. media_engine/config.py +674 -0
  21. media_engine/extractors/__init__.py +75 -0
  22. media_engine/extractors/clip.py +401 -0
  23. media_engine/extractors/faces.py +459 -0
  24. media_engine/extractors/frame_buffer.py +351 -0
  25. media_engine/extractors/frames.py +402 -0
  26. media_engine/extractors/metadata/__init__.py +127 -0
  27. media_engine/extractors/metadata/apple.py +169 -0
  28. media_engine/extractors/metadata/arri.py +118 -0
  29. media_engine/extractors/metadata/avchd.py +208 -0
  30. media_engine/extractors/metadata/avchd_gps.py +270 -0
  31. media_engine/extractors/metadata/base.py +688 -0
  32. media_engine/extractors/metadata/blackmagic.py +139 -0
  33. media_engine/extractors/metadata/camera_360.py +276 -0
  34. media_engine/extractors/metadata/canon.py +290 -0
  35. media_engine/extractors/metadata/dji.py +371 -0
  36. media_engine/extractors/metadata/dv.py +121 -0
  37. media_engine/extractors/metadata/ffmpeg.py +76 -0
  38. media_engine/extractors/metadata/generic.py +119 -0
  39. media_engine/extractors/metadata/gopro.py +256 -0
  40. media_engine/extractors/metadata/red.py +305 -0
  41. media_engine/extractors/metadata/registry.py +114 -0
  42. media_engine/extractors/metadata/sony.py +442 -0
  43. media_engine/extractors/metadata/tesla.py +157 -0
  44. media_engine/extractors/motion.py +765 -0
  45. media_engine/extractors/objects.py +245 -0
  46. media_engine/extractors/objects_qwen.py +754 -0
  47. media_engine/extractors/ocr.py +268 -0
  48. media_engine/extractors/scenes.py +82 -0
  49. media_engine/extractors/shot_type.py +217 -0
  50. media_engine/extractors/telemetry.py +262 -0
  51. media_engine/extractors/transcribe.py +579 -0
  52. media_engine/extractors/translate.py +121 -0
  53. media_engine/extractors/vad.py +263 -0
  54. media_engine/main.py +68 -0
  55. media_engine/py.typed +0 -0
  56. media_engine/routers/__init__.py +15 -0
  57. media_engine/routers/batch.py +78 -0
  58. media_engine/routers/health.py +93 -0
  59. media_engine/routers/models.py +211 -0
  60. media_engine/routers/settings.py +87 -0
  61. media_engine/routers/utils.py +135 -0
  62. media_engine/schemas.py +581 -0
  63. media_engine/utils/__init__.py +5 -0
  64. media_engine/utils/logging.py +54 -0
  65. media_engine/utils/memory.py +49 -0
  66. media_engine-0.1.0.dist-info/METADATA +276 -0
  67. media_engine-0.1.0.dist-info/RECORD +70 -0
  68. media_engine-0.1.0.dist-info/WHEEL +4 -0
  69. media_engine-0.1.0.dist-info/entry_points.txt +11 -0
  70. media_engine-0.1.0.dist-info/licenses/LICENSE +21 -0
media_engine/extractors/__init__.py
@@ -0,0 +1,75 @@
+ """Video feature extractors."""
+
+ from .clip import extract_clip, extract_clip_image, unload_clip_model
+ from .faces import check_faces_are_known, extract_faces, unload_face_model
+ from .frame_buffer import (
+     SharedFrame,
+     SharedFrameBuffer,
+     decode_frames,
+     get_extractor_timestamps,
+ )
+ from .frames import FrameExtractor, extract_frames_batch, get_video_duration
+ from .metadata import (
+     FFPROBE_WORKERS,
+     extract_metadata,
+     list_extractors,
+     run_ffprobe_batch,
+     shutdown_ffprobe_pool,
+ )
+ from .motion import (
+     MotionAnalysis,
+     MotionType,
+     analyze_motion,
+     get_adaptive_timestamps,
+     get_sample_timestamps,
+ )
+ from .objects import extract_objects, unload_yolo_model
+ from .objects_qwen import extract_objects_qwen, unload_qwen_model
+ from .ocr import extract_ocr, unload_ocr_model
+ from .scenes import extract_scenes
+ from .telemetry import extract_telemetry
+ from .transcribe import extract_transcript, unload_whisper_model
+ from .vad import AudioContent, detect_voice_activity, unload_vad_model
+
+ __all__ = [
+     "extract_metadata",
+     "run_ffprobe_batch",
+     "list_extractors",
+     "FFPROBE_WORKERS",
+     "shutdown_ffprobe_pool",
+     "extract_transcript",
+     "extract_faces",
+     "check_faces_are_known",
+     "extract_scenes",
+     "extract_objects",
+     "extract_objects_qwen",
+     "extract_clip",
+     "extract_clip_image",
+     "extract_ocr",
+     "extract_telemetry",
+     "analyze_motion",
+     "get_sample_timestamps",
+     "get_adaptive_timestamps",
+     "MotionAnalysis",
+     "MotionType",
+     # Model unload functions
+     "unload_qwen_model",
+     "unload_whisper_model",
+     "unload_yolo_model",
+     "unload_clip_model",
+     "unload_ocr_model",
+     "unload_face_model",
+     "unload_vad_model",
+     # Voice activity detection
+     "detect_voice_activity",
+     "AudioContent",
+     # Frame extraction utilities
+     "FrameExtractor",
+     "extract_frames_batch",
+     "get_video_duration",
+     # Shared frame buffer
+     "SharedFrame",
+     "SharedFrameBuffer",
+     "decode_frames",
+     "get_extractor_timestamps",
+ ]
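
For reference, the extractors package above exposes a flat API: extract_* functions plus unload_*_model() helpers for releasing model memory. A minimal usage sketch (editorial, not part of the published wheel; the image path is hypothetical):

# Sketch only, assuming the wheel is installed; "photo.jpg" is a hypothetical path.
from media_engine.extractors import extract_clip_image, unload_clip_model

result = extract_clip_image("photo.jpg", model_name="ViT-B-32")
embedding = result.segments[0].embedding  # embedding vector (list[float])
print(result.model, len(embedding))

# Unload helpers exist so callers can free model memory between batches.
unload_clip_model()
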
media_engine/extractors/clip.py
@@ -0,0 +1,401 @@
+ """CLIP embedding extraction with platform-specific backends."""
+
+ from __future__ import annotations
+
+ import gc
+ import logging
+ from abc import ABC, abstractmethod
+ from pathlib import Path
+ from typing import Any, TypeAlias
+
+ import cv2  # type: ignore[import-not-found]
+ import numpy as np
+ from numpy.typing import NDArray
+
+ from media_engine.config import has_cuda, is_apple_silicon
+ from media_engine.extractors.frame_buffer import SharedFrameBuffer
+ from media_engine.schemas import ClipResult, ClipSegment
+
+ Embedding: TypeAlias = list[float]
+
+ logger = logging.getLogger(__name__)
+
+ # Map OpenCLIP-style model names to HuggingFace model names for MLX-CLIP
+ OPENCLIP_TO_HF_MODEL_MAP: dict[str, str] = {
+     "ViT-B-16": "openai/clip-vit-base-patch16",
+     "ViT-B-32": "openai/clip-vit-base-patch32",
+     "ViT-L-14": "openai/clip-vit-large-patch14",
+ }
+
+
+ def _load_image_rgb(image_path: str) -> NDArray[np.uint8]:
+     """Load image using OpenCV and convert to RGB."""
+     img = cv2.imread(image_path)
+     if img is None:
+         raise ValueError(f"Failed to load image: {image_path}")
+     return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # type: ignore[return-value]
+
+
+ class CLIPBackend(ABC):
+     """Abstract base class for CLIP backends."""
+
+     @abstractmethod
+     def encode_image(self, image_path: str) -> Embedding:
+         """Encode image to embedding vector.
+
+         Args:
+             image_path: Path to image file
+
+         Returns:
+             List of floats representing the embedding
+         """
+         pass
+
+     @abstractmethod
+     def encode_image_from_array(self, rgb_array: NDArray[np.uint8]) -> Embedding:
+         """Encode image from RGB numpy array to embedding vector.
+
+         Args:
+             rgb_array: RGB image as numpy array (H, W, 3)
+
+         Returns:
+             List of floats representing the embedding
+         """
+         pass
+
+     @abstractmethod
+     def get_model_name(self) -> str:
+         """Get the model name."""
+         pass
+
+     @abstractmethod
+     def encode_text(self, text: str) -> Embedding:
+         """Encode text to embedding vector.
+
+         Args:
+             text: Text string to encode
+
+         Returns:
+             List of floats representing the embedding (normalized)
+         """
+         pass
+
+
+ class OpenCLIPBackend(CLIPBackend):
+     """OpenCLIP backend for CUDA/CPU."""
+
+     def __init__(self, model_name: str = "ViT-B-32", pretrained: str = "openai"):
+         import open_clip  # type: ignore[import-not-found]
+
+         self.device = "cuda" if has_cuda() else "cpu"
+         self.model, _, self.preprocess = open_clip.create_model_and_transforms(model_name, pretrained=pretrained)
+         self.model = self.model.to(self.device)
+         self.model.eval()
+         self._model_name = model_name
+         logger.info(f"Loaded OpenCLIP model: {model_name} on {self.device}")
+
+     def encode_image(self, image_path: str) -> Embedding:
+         rgb_array = _load_image_rgb(image_path)
+         return self.encode_image_from_array(rgb_array)
+
+     def encode_image_from_array(self, rgb_array: NDArray[np.uint8]) -> Embedding:
+         import torch
+         from PIL import Image
+
+         image = Image.fromarray(rgb_array)
+         image_tensor = self.preprocess(image).unsqueeze(0).to(self.device)
+
+         with torch.no_grad():
+             embedding = self.model.encode_image(image_tensor)
+             embedding = embedding / embedding.norm(dim=-1, keepdim=True)
+
+         return embedding.cpu().numpy().flatten().tolist()
+
+     def get_model_name(self) -> str:
+         return self._model_name
+
+     def encode_text(self, text: str) -> Embedding:
+         import open_clip  # type: ignore[import-not-found]
+         import torch
+
+         # Tokenize the text
+         tokenized = open_clip.tokenize([text]).to(self.device)
+
+         with torch.no_grad():
+             embedding = self.model.encode_text(tokenized)
+             embedding = embedding / embedding.norm(dim=-1, keepdim=True)
+
+         return embedding.cpu().numpy().flatten().tolist()
+
+
+ class MLXCLIPBackend(CLIPBackend):
+     """MLX-CLIP backend for Apple Silicon."""
+
+     def __init__(self, model_name: str = "openai/clip-vit-base-patch32") -> None:
+         # Lazy import for Apple Silicon only
+         self._model: Any = None
+         self._processor: Any = None
+         self._model_name = model_name
+
+     def _load_model(self) -> None:
+         if self._model is None:
+             try:
+                 # Try mlx-clip first
+                 import mlx_clip  # type: ignore[import-not-found]
+
+                 self._model = mlx_clip.load(self._model_name)
+                 self._processor = None  # MLX-CLIP handles preprocessing
+                 logger.info(f"Loaded MLX-CLIP model: {self._model_name}")
+             except ImportError:
+                 # Fall back to transformers + mlx
+                 from transformers import CLIPModel, CLIPProcessor  # type: ignore[import-not-found]
+
+                 self._processor = CLIPProcessor.from_pretrained(self._model_name)
+                 self._model = CLIPModel.from_pretrained(self._model_name)
+                 logger.info(f"Loaded transformers CLIP model: {self._model_name}")
+
+     def encode_image(self, image_path: str) -> Embedding:
+         rgb_array = _load_image_rgb(image_path)
+         return self.encode_image_from_array(rgb_array)
+
+     def encode_image_from_array(self, rgb_array: NDArray[np.uint8]) -> Embedding:
+         from PIL import Image
+
+         self._load_model()
+
+         image = Image.fromarray(rgb_array)
+
+         if self._processor is not None:
+             # Using transformers
+             inputs = self._processor(images=image, return_tensors="pt")
+             outputs = self._model.get_image_features(**inputs)
+             embedding = outputs / outputs.norm(dim=-1, keepdim=True)
+             return embedding.detach().numpy().flatten().tolist()
+         else:
+             # Using mlx-clip
+             embedding = self._model.encode_image(image)
+             return embedding.tolist()
+
+     def get_model_name(self) -> str:
+         return self._model_name
+
+     def encode_text(self, text: str) -> Embedding:
+         self._load_model()
+
+         if self._processor is not None:
+             # Using transformers
+             inputs = self._processor(text=text, return_tensors="pt")
+             outputs = self._model.get_text_features(**inputs)
+             embedding = outputs / outputs.norm(dim=-1, keepdim=True)
+             return embedding.detach().numpy().flatten().tolist()
+         else:
+             # Using mlx-clip
+             embedding = self._model.encode_text(text)
+             return embedding.tolist()
+
+
+ # Singleton backend instance
+ _backend: CLIPBackend | None = None
+ _backend_model_name: str | None = None
+
+
+ def unload_clip_model() -> None:
+     """Unload the CLIP model to free memory."""
+     global _backend, _backend_model_name
+
+     if _backend is None:
+         return
+
+     logger.info("Unloading CLIP model to free memory")
+
+     try:
+         import torch
+
+         # Clear internal model references
+         if isinstance(_backend, MLXCLIPBackend):
+             if _backend._model is not None:
+                 del _backend._model
+                 _backend._model = None
+             if _backend._processor is not None:
+                 del _backend._processor
+                 _backend._processor = None
+         elif isinstance(_backend, OpenCLIPBackend):
+             if hasattr(_backend, "model"):
+                 del _backend.model
+
+         del _backend
+         _backend = None
+         _backend_model_name = None
+
+         gc.collect()
+
+         if torch.cuda.is_available():
+             torch.cuda.synchronize()
+             torch.cuda.empty_cache()
+         elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+             if hasattr(torch.mps, "synchronize"):
+                 torch.mps.synchronize()
+             if hasattr(torch.mps, "empty_cache"):
+                 torch.mps.empty_cache()
+
+         gc.collect()
+         logger.info("CLIP model unloaded")
+     except Exception as e:
+         logger.warning(f"Error unloading CLIP model: {e}")
+         _backend = None
+         _backend_model_name = None
+
+
+ def get_clip_backend(model_name: str | None = None) -> CLIPBackend:
+     """Get the appropriate CLIP backend for the current platform.
+
+     Args:
+         model_name: CLIP model name. For OpenCLIP: "ViT-B-16", "ViT-B-32", "ViT-L-14".
+             For MLX-CLIP: "openai/clip-vit-base-patch32", etc.
+             If None, uses defaults.
+     """
+     global _backend, _backend_model_name
+
+     # If model name changed, unload the old model
+     if _backend is not None and model_name is not None and _backend_model_name != model_name:
+         logger.info(f"Switching CLIP model from {_backend_model_name} to {model_name}")
+         unload_clip_model()
+
+     if _backend is not None:
+         return _backend
+
+     if is_apple_silicon():
+         try:
+             # MLX-CLIP uses HuggingFace model names - translate if needed
+             if model_name and model_name in OPENCLIP_TO_HF_MODEL_MAP:
+                 mlx_model = OPENCLIP_TO_HF_MODEL_MAP[model_name]
+                 logger.info(f"Translated CLIP model name: {model_name} -> {mlx_model}")
+             else:
+                 mlx_model = model_name or "openai/clip-vit-base-patch32"
+             _backend = MLXCLIPBackend(model_name=mlx_model)
+             _backend_model_name = model_name or mlx_model  # Store original name for comparison
+             logger.info(f"Using MLX-CLIP backend (Apple Silicon): {mlx_model}")
+             return _backend
+         except Exception as e:
+             logger.warning(f"MLX-CLIP not available: {e}, falling back to OpenCLIP")
+
+     # OpenCLIP uses short model names like "ViT-B-32"
+     openclip_model = model_name or "ViT-B-32"
+     _backend = OpenCLIPBackend(model_name=openclip_model)
+     _backend_model_name = openclip_model
+     logger.info(f"Using OpenCLIP backend: {openclip_model}")
+     return _backend
+
+
+ IMAGE_EXTENSIONS = {
+     ".jpg",
+     ".jpeg",
+     ".png",
+     ".webp",
+     ".bmp",
+     ".tiff",
+     ".tif",
+     ".heic",
+     ".heif",
+ }
+
+
+ def extract_clip(
+     file_path: str,
+     frame_buffer: SharedFrameBuffer,
+     model_name: str | None = None,  # CLIP model name (e.g., "ViT-B-32", "ViT-L-14")
+ ) -> ClipResult:
+     """Extract CLIP embeddings from video frames.
+
+     For images, use extract_clip_image() instead.
+
+     Args:
+         file_path: Path to video file (used for logging)
+         frame_buffer: Pre-decoded frames from SharedFrameBuffer
+         model_name: CLIP model name (e.g., "ViT-B-32", "ViT-L-14"). If None, uses default.
+
+     Returns:
+         ClipResult with embeddings per segment
+     """
+     backend = get_clip_backend(model_name)
+
+     # Process frames from shared buffer
+     logger.info(f"Extracting CLIP embeddings from {len(frame_buffer.frames)} frames")
+     segments: list[ClipSegment] = []
+
+     for i, ts in enumerate(sorted(frame_buffer.frames.keys())):
+         shared_frame = frame_buffer.frames[ts]
+         try:
+             embedding = backend.encode_image_from_array(shared_frame.rgb)
+             segments.append(
+                 ClipSegment(
+                     start=ts,
+                     end=ts,
+                     scene_index=i,
+                     embedding=embedding,
+                 )
+             )
+         except Exception as e:
+             logger.warning(f"Failed to encode frame at {ts}s: {e}")
+
+     logger.info(f"Generated {len(segments)} CLIP embeddings")
+
+     return ClipResult(
+         model=backend.get_model_name(),
+         segments=segments,
+     )
+
+
+ def extract_clip_image(
+     file_path: str,
+     model_name: str | None = None,
+ ) -> ClipResult:
+     """Extract CLIP embedding from a single image file.
+
+     Args:
+         file_path: Path to image file
+         model_name: CLIP model name (e.g., "ViT-B-32", "ViT-L-14"). If None, uses default.
+
+     Returns:
+         ClipResult with single embedding
+     """
+     path = Path(file_path)
+     if not path.exists():
+         raise FileNotFoundError(f"Image not found: {file_path}")
+
+     backend = get_clip_backend(model_name)
+     logger.info(f"Encoding image directly: {file_path}")
+
+     embedding = backend.encode_image(file_path)
+     return ClipResult(
+         model=backend.get_model_name(),
+         segments=[
+             ClipSegment(
+                 start=0.0,
+                 end=0.0,
+                 scene_index=0,
+                 embedding=embedding,
+             )
+         ],
+     )
+
+
+ def encode_text_query(
+     text: str,
+     model_name: str | None = None,
+ ) -> Embedding:
+     """Encode a text query to a CLIP embedding.
+
+     The resulting embedding can be used for text-to-image search by comparing
+     against image embeddings using cosine similarity.
+
+     Args:
+         text: Text query to encode
+         model_name: CLIP model name (e.g., "ViT-B-32", "ViT-L-14"). If None, uses default.
+
+     Returns:
+         List of floats representing the normalized embedding
+     """
+     backend = get_clip_backend(model_name)
+     logger.info(f"Encoding text query: {text[:50]}...")
+     return backend.encode_text(text)
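
As the encode_text_query() docstring notes, its normalized embedding can be compared against image embeddings with cosine similarity for text-to-image search. A minimal sketch of that comparison step (editorial, not part of the published wheel; the image paths are hypothetical). Note that encode_text_query is defined in media_engine.extractors.clip but is not re-exported from media_engine.extractors:

# Sketch only; assumes the wheel is installed and the listed (hypothetical) images exist.
# With L2-normalized vectors, cosine similarity reduces to a dot product.
import numpy as np

from media_engine.extractors.clip import encode_text_query, extract_clip_image

query = np.asarray(encode_text_query("a red car at night"))

scored = []
for path in ["a.jpg", "b.jpg"]:  # hypothetical files
    image_emb = np.asarray(extract_clip_image(path).segments[0].embedding)
    scored.append((float(query @ image_emb), path))

# Higher score = closer match to the text query.
for score, path in sorted(scored, reverse=True):
    print(f"{score:.3f}  {path}")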