media_engine-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cli/clip.py +79 -0
- cli/faces.py +91 -0
- cli/metadata.py +68 -0
- cli/motion.py +77 -0
- cli/objects.py +94 -0
- cli/ocr.py +93 -0
- cli/scenes.py +57 -0
- cli/telemetry.py +65 -0
- cli/transcript.py +76 -0
- media_engine/__init__.py +7 -0
- media_engine/_version.py +34 -0
- media_engine/app.py +80 -0
- media_engine/batch/__init__.py +56 -0
- media_engine/batch/models.py +99 -0
- media_engine/batch/processor.py +1131 -0
- media_engine/batch/queue.py +232 -0
- media_engine/batch/state.py +30 -0
- media_engine/batch/timing.py +321 -0
- media_engine/cli.py +17 -0
- media_engine/config.py +674 -0
- media_engine/extractors/__init__.py +75 -0
- media_engine/extractors/clip.py +401 -0
- media_engine/extractors/faces.py +459 -0
- media_engine/extractors/frame_buffer.py +351 -0
- media_engine/extractors/frames.py +402 -0
- media_engine/extractors/metadata/__init__.py +127 -0
- media_engine/extractors/metadata/apple.py +169 -0
- media_engine/extractors/metadata/arri.py +118 -0
- media_engine/extractors/metadata/avchd.py +208 -0
- media_engine/extractors/metadata/avchd_gps.py +270 -0
- media_engine/extractors/metadata/base.py +688 -0
- media_engine/extractors/metadata/blackmagic.py +139 -0
- media_engine/extractors/metadata/camera_360.py +276 -0
- media_engine/extractors/metadata/canon.py +290 -0
- media_engine/extractors/metadata/dji.py +371 -0
- media_engine/extractors/metadata/dv.py +121 -0
- media_engine/extractors/metadata/ffmpeg.py +76 -0
- media_engine/extractors/metadata/generic.py +119 -0
- media_engine/extractors/metadata/gopro.py +256 -0
- media_engine/extractors/metadata/red.py +305 -0
- media_engine/extractors/metadata/registry.py +114 -0
- media_engine/extractors/metadata/sony.py +442 -0
- media_engine/extractors/metadata/tesla.py +157 -0
- media_engine/extractors/motion.py +765 -0
- media_engine/extractors/objects.py +245 -0
- media_engine/extractors/objects_qwen.py +754 -0
- media_engine/extractors/ocr.py +268 -0
- media_engine/extractors/scenes.py +82 -0
- media_engine/extractors/shot_type.py +217 -0
- media_engine/extractors/telemetry.py +262 -0
- media_engine/extractors/transcribe.py +579 -0
- media_engine/extractors/translate.py +121 -0
- media_engine/extractors/vad.py +263 -0
- media_engine/main.py +68 -0
- media_engine/py.typed +0 -0
- media_engine/routers/__init__.py +15 -0
- media_engine/routers/batch.py +78 -0
- media_engine/routers/health.py +93 -0
- media_engine/routers/models.py +211 -0
- media_engine/routers/settings.py +87 -0
- media_engine/routers/utils.py +135 -0
- media_engine/schemas.py +581 -0
- media_engine/utils/__init__.py +5 -0
- media_engine/utils/logging.py +54 -0
- media_engine/utils/memory.py +49 -0
- media_engine-0.1.0.dist-info/METADATA +276 -0
- media_engine-0.1.0.dist-info/RECORD +70 -0
- media_engine-0.1.0.dist-info/WHEEL +4 -0
- media_engine-0.1.0.dist-info/entry_points.txt +11 -0
- media_engine-0.1.0.dist-info/licenses/LICENSE +21 -0
media_engine/extractors/__init__.py
@@ -0,0 +1,75 @@
+"""Video feature extractors."""
+
+from .clip import extract_clip, extract_clip_image, unload_clip_model
+from .faces import check_faces_are_known, extract_faces, unload_face_model
+from .frame_buffer import (
+    SharedFrame,
+    SharedFrameBuffer,
+    decode_frames,
+    get_extractor_timestamps,
+)
+from .frames import FrameExtractor, extract_frames_batch, get_video_duration
+from .metadata import (
+    FFPROBE_WORKERS,
+    extract_metadata,
+    list_extractors,
+    run_ffprobe_batch,
+    shutdown_ffprobe_pool,
+)
+from .motion import (
+    MotionAnalysis,
+    MotionType,
+    analyze_motion,
+    get_adaptive_timestamps,
+    get_sample_timestamps,
+)
+from .objects import extract_objects, unload_yolo_model
+from .objects_qwen import extract_objects_qwen, unload_qwen_model
+from .ocr import extract_ocr, unload_ocr_model
+from .scenes import extract_scenes
+from .telemetry import extract_telemetry
+from .transcribe import extract_transcript, unload_whisper_model
+from .vad import AudioContent, detect_voice_activity, unload_vad_model
+
+__all__ = [
+    "extract_metadata",
+    "run_ffprobe_batch",
+    "list_extractors",
+    "FFPROBE_WORKERS",
+    "shutdown_ffprobe_pool",
+    "extract_transcript",
+    "extract_faces",
+    "check_faces_are_known",
+    "extract_scenes",
+    "extract_objects",
+    "extract_objects_qwen",
+    "extract_clip",
+    "extract_clip_image",
+    "extract_ocr",
+    "extract_telemetry",
+    "analyze_motion",
+    "get_sample_timestamps",
+    "get_adaptive_timestamps",
+    "MotionAnalysis",
+    "MotionType",
+    # Model unload functions
+    "unload_qwen_model",
+    "unload_whisper_model",
+    "unload_yolo_model",
+    "unload_clip_model",
+    "unload_ocr_model",
+    "unload_face_model",
+    "unload_vad_model",
+    # Voice activity detection
+    "detect_voice_activity",
+    "AudioContent",
+    # Frame extraction utilities
+    "FrameExtractor",
+    "extract_frames_batch",
+    "get_video_duration",
+    # Shared frame buffer
+    "SharedFrame",
+    "SharedFrameBuffer",
+    "decode_frames",
+    "get_extractor_timestamps",
+]
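The `extractors/__init__.py` above flattens each extractor module into a single public namespace, so callers import everything from `media_engine.extractors`. As a rough sketch of how that surface might be consumed (the input file is hypothetical, and only names and `ClipResult` fields visible in this diff are used):

from media_engine.extractors import extract_clip_image, unload_clip_model

result = extract_clip_image("sample.jpg", model_name="ViT-B-32")  # hypothetical input image
print(result.model)                        # CLIP model name used by the backend
print(len(result.segments[0].embedding))   # embedding dimensionality
unload_clip_model()                        # free GPU/MPS memory when finished
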
media_engine/extractors/clip.py
@@ -0,0 +1,401 @@
+"""CLIP embedding extraction with platform-specific backends."""
+
+from __future__ import annotations
+
+import gc
+import logging
+from abc import ABC, abstractmethod
+from pathlib import Path
+from typing import Any, TypeAlias
+
+import cv2  # type: ignore[import-not-found]
+import numpy as np
+from numpy.typing import NDArray
+
+from media_engine.config import has_cuda, is_apple_silicon
+from media_engine.extractors.frame_buffer import SharedFrameBuffer
+from media_engine.schemas import ClipResult, ClipSegment
+
+Embedding: TypeAlias = list[float]
+
+logger = logging.getLogger(__name__)
+
+# Map OpenCLIP-style model names to HuggingFace model names for MLX-CLIP
+OPENCLIP_TO_HF_MODEL_MAP: dict[str, str] = {
+    "ViT-B-16": "openai/clip-vit-base-patch16",
+    "ViT-B-32": "openai/clip-vit-base-patch32",
+    "ViT-L-14": "openai/clip-vit-large-patch14",
+}
+
+
+def _load_image_rgb(image_path: str) -> NDArray[np.uint8]:
+    """Load image using OpenCV and convert to RGB."""
+    img = cv2.imread(image_path)
+    if img is None:
+        raise ValueError(f"Failed to load image: {image_path}")
+    return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # type: ignore[return-value]
+
+
+class CLIPBackend(ABC):
+    """Abstract base class for CLIP backends."""
+
+    @abstractmethod
+    def encode_image(self, image_path: str) -> Embedding:
+        """Encode image to embedding vector.
+
+        Args:
+            image_path: Path to image file
+
+        Returns:
+            List of floats representing the embedding
+        """
+        pass
+
+    @abstractmethod
+    def encode_image_from_array(self, rgb_array: NDArray[np.uint8]) -> Embedding:
+        """Encode image from RGB numpy array to embedding vector.
+
+        Args:
+            rgb_array: RGB image as numpy array (H, W, 3)
+
+        Returns:
+            List of floats representing the embedding
+        """
+        pass
+
+    @abstractmethod
+    def get_model_name(self) -> str:
+        """Get the model name."""
+        pass
+
+    @abstractmethod
+    def encode_text(self, text: str) -> Embedding:
+        """Encode text to embedding vector.
+
+        Args:
+            text: Text string to encode
+
+        Returns:
+            List of floats representing the embedding (normalized)
+        """
+        pass
+
+
+class OpenCLIPBackend(CLIPBackend):
+    """OpenCLIP backend for CUDA/CPU."""
+
+    def __init__(self, model_name: str = "ViT-B-32", pretrained: str = "openai"):
+        import open_clip  # type: ignore[import-not-found]
+
+        self.device = "cuda" if has_cuda() else "cpu"
+        self.model, _, self.preprocess = open_clip.create_model_and_transforms(model_name, pretrained=pretrained)
+        self.model = self.model.to(self.device)
+        self.model.eval()
+        self._model_name = model_name
+        logger.info(f"Loaded OpenCLIP model: {model_name} on {self.device}")
+
+    def encode_image(self, image_path: str) -> Embedding:
+        rgb_array = _load_image_rgb(image_path)
+        return self.encode_image_from_array(rgb_array)
+
+    def encode_image_from_array(self, rgb_array: NDArray[np.uint8]) -> Embedding:
+        import torch
+        from PIL import Image
+
+        image = Image.fromarray(rgb_array)
+        image_tensor = self.preprocess(image).unsqueeze(0).to(self.device)
+
+        with torch.no_grad():
+            embedding = self.model.encode_image(image_tensor)
+            embedding = embedding / embedding.norm(dim=-1, keepdim=True)
+
+        return embedding.cpu().numpy().flatten().tolist()
+
+    def get_model_name(self) -> str:
+        return self._model_name
+
+    def encode_text(self, text: str) -> Embedding:
+        import open_clip  # type: ignore[import-not-found]
+        import torch
+
+        # Tokenize the text
+        tokenized = open_clip.tokenize([text]).to(self.device)
+
+        with torch.no_grad():
+            embedding = self.model.encode_text(tokenized)
+            embedding = embedding / embedding.norm(dim=-1, keepdim=True)
+
+        return embedding.cpu().numpy().flatten().tolist()
+
+
+class MLXCLIPBackend(CLIPBackend):
+    """MLX-CLIP backend for Apple Silicon."""
+
+    def __init__(self, model_name: str = "openai/clip-vit-base-patch32") -> None:
+        # Lazy import for Apple Silicon only
+        self._model: Any = None
+        self._processor: Any = None
+        self._model_name = model_name
+
+    def _load_model(self) -> None:
+        if self._model is None:
+            try:
+                # Try mlx-clip first
+                import mlx_clip  # type: ignore[import-not-found]
+
+                self._model = mlx_clip.load(self._model_name)
+                self._processor = None  # MLX-CLIP handles preprocessing
+                logger.info(f"Loaded MLX-CLIP model: {self._model_name}")
+            except ImportError:
+                # Fall back to transformers + mlx
+                from transformers import CLIPModel, CLIPProcessor  # type: ignore[import-not-found]
+
+                self._processor = CLIPProcessor.from_pretrained(self._model_name)
+                self._model = CLIPModel.from_pretrained(self._model_name)
+                logger.info(f"Loaded transformers CLIP model: {self._model_name}")
+
+    def encode_image(self, image_path: str) -> Embedding:
+        rgb_array = _load_image_rgb(image_path)
+        return self.encode_image_from_array(rgb_array)
+
+    def encode_image_from_array(self, rgb_array: NDArray[np.uint8]) -> Embedding:
+        from PIL import Image
+
+        self._load_model()
+
+        image = Image.fromarray(rgb_array)
+
+        if self._processor is not None:
+            # Using transformers
+            inputs = self._processor(images=image, return_tensors="pt")
+            outputs = self._model.get_image_features(**inputs)
+            embedding = outputs / outputs.norm(dim=-1, keepdim=True)
+            return embedding.detach().numpy().flatten().tolist()
+        else:
+            # Using mlx-clip
+            embedding = self._model.encode_image(image)
+            return embedding.tolist()
+
+    def get_model_name(self) -> str:
+        return self._model_name
+
+    def encode_text(self, text: str) -> Embedding:
+        self._load_model()
+
+        if self._processor is not None:
+            # Using transformers
+            inputs = self._processor(text=text, return_tensors="pt")
+            outputs = self._model.get_text_features(**inputs)
+            embedding = outputs / outputs.norm(dim=-1, keepdim=True)
+            return embedding.detach().numpy().flatten().tolist()
+        else:
+            # Using mlx-clip
+            embedding = self._model.encode_text(text)
+            return embedding.tolist()
+
+
+# Singleton backend instance
+_backend: CLIPBackend | None = None
+_backend_model_name: str | None = None
+
+
+def unload_clip_model() -> None:
+    """Unload the CLIP model to free memory."""
+    global _backend, _backend_model_name
+
+    if _backend is None:
+        return
+
+    logger.info("Unloading CLIP model to free memory")
+
+    try:
+        import torch
+
+        # Clear internal model references
+        if isinstance(_backend, MLXCLIPBackend):
+            if _backend._model is not None:
+                del _backend._model
+                _backend._model = None
+            if _backend._processor is not None:
+                del _backend._processor
+                _backend._processor = None
+        elif isinstance(_backend, OpenCLIPBackend):
+            if hasattr(_backend, "model"):
+                del _backend.model
+
+        del _backend
+        _backend = None
+        _backend_model_name = None
+
+        gc.collect()
+
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
+            torch.cuda.empty_cache()
+        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
+            if hasattr(torch.mps, "synchronize"):
+                torch.mps.synchronize()
+            if hasattr(torch.mps, "empty_cache"):
+                torch.mps.empty_cache()
+
+        gc.collect()
+        logger.info("CLIP model unloaded")
+    except Exception as e:
+        logger.warning(f"Error unloading CLIP model: {e}")
+        _backend = None
+        _backend_model_name = None
+
+
+def get_clip_backend(model_name: str | None = None) -> CLIPBackend:
+    """Get the appropriate CLIP backend for the current platform.
+
+    Args:
+        model_name: CLIP model name. For OpenCLIP: "ViT-B-16", "ViT-B-32", "ViT-L-14".
+            For MLX-CLIP: "openai/clip-vit-base-patch32", etc.
+            If None, uses defaults.
+    """
+    global _backend, _backend_model_name
+
+    # If model name changed, unload the old model
+    if _backend is not None and model_name is not None and _backend_model_name != model_name:
+        logger.info(f"Switching CLIP model from {_backend_model_name} to {model_name}")
+        unload_clip_model()
+
+    if _backend is not None:
+        return _backend
+
+    if is_apple_silicon():
+        try:
+            # MLX-CLIP uses HuggingFace model names - translate if needed
+            if model_name and model_name in OPENCLIP_TO_HF_MODEL_MAP:
+                mlx_model = OPENCLIP_TO_HF_MODEL_MAP[model_name]
+                logger.info(f"Translated CLIP model name: {model_name} -> {mlx_model}")
+            else:
+                mlx_model = model_name or "openai/clip-vit-base-patch32"
+            _backend = MLXCLIPBackend(model_name=mlx_model)
+            _backend_model_name = model_name or mlx_model  # Store original name for comparison
+            logger.info(f"Using MLX-CLIP backend (Apple Silicon): {mlx_model}")
+            return _backend
+        except Exception as e:
+            logger.warning(f"MLX-CLIP not available: {e}, falling back to OpenCLIP")
+
+    # OpenCLIP uses short model names like "ViT-B-32"
+    openclip_model = model_name or "ViT-B-32"
+    _backend = OpenCLIPBackend(model_name=openclip_model)
+    _backend_model_name = openclip_model
+    logger.info(f"Using OpenCLIP backend: {openclip_model}")
+    return _backend
+
+
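Taken together, `get_clip_backend` and `unload_clip_model` behave as a process-wide singleton: MLX-CLIP is chosen on Apple Silicon (with OpenCLIP-style names mapped through OPENCLIP_TO_HF_MODEL_MAP), OpenCLIP on CUDA/CPU hosts, and a changed model name triggers an unload before a new backend is created. A minimal usage sketch, illustrative only since the branch taken depends on the host platform:

from media_engine.extractors.clip import get_clip_backend, unload_clip_model

backend = get_clip_backend("ViT-B-32")   # MLX-CLIP on Apple Silicon (mapped to
                                         # "openai/clip-vit-base-patch32"), OpenCLIP elsewhere
assert get_clip_backend() is backend     # cached singleton is reused on later calls

get_clip_backend("ViT-L-14")             # different name: previous model is unloaded first
unload_clip_model()                      # explicit release (gc plus CUDA/MPS cache clearing)
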
+IMAGE_EXTENSIONS = {
+    ".jpg",
+    ".jpeg",
+    ".png",
+    ".webp",
+    ".bmp",
+    ".tiff",
+    ".tif",
+    ".heic",
+    ".heif",
+}
+
+
+def extract_clip(
+    file_path: str,
+    frame_buffer: SharedFrameBuffer,
+    model_name: str | None = None,  # CLIP model name (e.g., "ViT-B-32", "ViT-L-14")
+) -> ClipResult:
+    """Extract CLIP embeddings from video frames.
+
+    For images, use extract_clip_image() instead.
+
+    Args:
+        file_path: Path to video file (used for logging)
+        frame_buffer: Pre-decoded frames from SharedFrameBuffer
+        model_name: CLIP model name (e.g., "ViT-B-32", "ViT-L-14"). If None, uses default.
+
+    Returns:
+        ClipResult with embeddings per segment
+    """
+    backend = get_clip_backend(model_name)
+
+    # Process frames from shared buffer
+    logger.info(f"Extracting CLIP embeddings from {len(frame_buffer.frames)} frames")
+    segments: list[ClipSegment] = []
+
+    for i, ts in enumerate(sorted(frame_buffer.frames.keys())):
+        shared_frame = frame_buffer.frames[ts]
+        try:
+            embedding = backend.encode_image_from_array(shared_frame.rgb)
+            segments.append(
+                ClipSegment(
+                    start=ts,
+                    end=ts,
+                    scene_index=i,
+                    embedding=embedding,
+                )
+            )
+        except Exception as e:
+            logger.warning(f"Failed to encode frame at {ts}s: {e}")
+
+    logger.info(f"Generated {len(segments)} CLIP embeddings")
+
+    return ClipResult(
+        model=backend.get_model_name(),
+        segments=segments,
+    )
+
+
+def extract_clip_image(
+    file_path: str,
+    model_name: str | None = None,
+) -> ClipResult:
+    """Extract CLIP embedding from a single image file.
+
+    Args:
+        file_path: Path to image file
+        model_name: CLIP model name (e.g., "ViT-B-32", "ViT-L-14"). If None, uses default.
+
+    Returns:
+        ClipResult with single embedding
+    """
+    path = Path(file_path)
+    if not path.exists():
+        raise FileNotFoundError(f"Image not found: {file_path}")
+
+    backend = get_clip_backend(model_name)
+    logger.info(f"Encoding image directly: {file_path}")
+
+    embedding = backend.encode_image(file_path)
+    return ClipResult(
+        model=backend.get_model_name(),
+        segments=[
+            ClipSegment(
+                start=0.0,
+                end=0.0,
+                scene_index=0,
+                embedding=embedding,
+            )
+        ],
+    )
+
+
+def encode_text_query(
+    text: str,
+    model_name: str | None = None,
+) -> Embedding:
+    """Encode a text query to a CLIP embedding.
+
+    The resulting embedding can be used for text-to-image search by comparing
+    against image embeddings using cosine similarity.
+
+    Args:
+        text: Text query to encode
+        model_name: CLIP model name (e.g., "ViT-B-32", "ViT-L-14"). If None, uses default.
+
+    Returns:
+        List of floats representing the normalized embedding
+    """
+    backend = get_clip_backend(model_name)
+    logger.info(f"Encoding text query: {text[:50]}...")
+    return backend.encode_text(text)
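As the `encode_text_query` docstring notes, its embedding is intended for text-to-image search against image embeddings via cosine similarity. A minimal sketch of that comparison (the image paths and query are hypothetical; `encode_text_query` is not re-exported by `extractors/__init__.py`, so it is imported from the module directly):

import numpy as np

from media_engine.extractors.clip import encode_text_query, extract_clip_image

paths = ["beach.jpg", "office.jpg", "forest.jpg"]  # hypothetical image files
image_vecs = np.array(
    [extract_clip_image(p, model_name="ViT-B-32").segments[0].embedding for p in paths]
)
query_vec = np.array(encode_text_query("a sunny beach", model_name="ViT-B-32"))

# The OpenCLIP/transformers paths return unit-norm vectors, so cosine similarity
# reduces to a dot product; re-normalize first if a backend does not guarantee this.
scores = image_vecs @ query_vec
for path, score in sorted(zip(paths, scores), key=lambda item: -item[1]):
    print(f"{score:.3f}  {path}")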