media-engine 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cli/clip.py +79 -0
- cli/faces.py +91 -0
- cli/metadata.py +68 -0
- cli/motion.py +77 -0
- cli/objects.py +94 -0
- cli/ocr.py +93 -0
- cli/scenes.py +57 -0
- cli/telemetry.py +65 -0
- cli/transcript.py +76 -0
- media_engine/__init__.py +7 -0
- media_engine/_version.py +34 -0
- media_engine/app.py +80 -0
- media_engine/batch/__init__.py +56 -0
- media_engine/batch/models.py +99 -0
- media_engine/batch/processor.py +1131 -0
- media_engine/batch/queue.py +232 -0
- media_engine/batch/state.py +30 -0
- media_engine/batch/timing.py +321 -0
- media_engine/cli.py +17 -0
- media_engine/config.py +674 -0
- media_engine/extractors/__init__.py +75 -0
- media_engine/extractors/clip.py +401 -0
- media_engine/extractors/faces.py +459 -0
- media_engine/extractors/frame_buffer.py +351 -0
- media_engine/extractors/frames.py +402 -0
- media_engine/extractors/metadata/__init__.py +127 -0
- media_engine/extractors/metadata/apple.py +169 -0
- media_engine/extractors/metadata/arri.py +118 -0
- media_engine/extractors/metadata/avchd.py +208 -0
- media_engine/extractors/metadata/avchd_gps.py +270 -0
- media_engine/extractors/metadata/base.py +688 -0
- media_engine/extractors/metadata/blackmagic.py +139 -0
- media_engine/extractors/metadata/camera_360.py +276 -0
- media_engine/extractors/metadata/canon.py +290 -0
- media_engine/extractors/metadata/dji.py +371 -0
- media_engine/extractors/metadata/dv.py +121 -0
- media_engine/extractors/metadata/ffmpeg.py +76 -0
- media_engine/extractors/metadata/generic.py +119 -0
- media_engine/extractors/metadata/gopro.py +256 -0
- media_engine/extractors/metadata/red.py +305 -0
- media_engine/extractors/metadata/registry.py +114 -0
- media_engine/extractors/metadata/sony.py +442 -0
- media_engine/extractors/metadata/tesla.py +157 -0
- media_engine/extractors/motion.py +765 -0
- media_engine/extractors/objects.py +245 -0
- media_engine/extractors/objects_qwen.py +754 -0
- media_engine/extractors/ocr.py +268 -0
- media_engine/extractors/scenes.py +82 -0
- media_engine/extractors/shot_type.py +217 -0
- media_engine/extractors/telemetry.py +262 -0
- media_engine/extractors/transcribe.py +579 -0
- media_engine/extractors/translate.py +121 -0
- media_engine/extractors/vad.py +263 -0
- media_engine/main.py +68 -0
- media_engine/py.typed +0 -0
- media_engine/routers/__init__.py +15 -0
- media_engine/routers/batch.py +78 -0
- media_engine/routers/health.py +93 -0
- media_engine/routers/models.py +211 -0
- media_engine/routers/settings.py +87 -0
- media_engine/routers/utils.py +135 -0
- media_engine/schemas.py +581 -0
- media_engine/utils/__init__.py +5 -0
- media_engine/utils/logging.py +54 -0
- media_engine/utils/memory.py +49 -0
- media_engine-0.1.0.dist-info/METADATA +276 -0
- media_engine-0.1.0.dist-info/RECORD +70 -0
- media_engine-0.1.0.dist-info/WHEEL +4 -0
- media_engine-0.1.0.dist-info/entry_points.txt +11 -0
- media_engine-0.1.0.dist-info/licenses/LICENSE +21 -0
media_engine/config.py
ADDED
|
@@ -0,0 +1,674 @@
|
|
|
1
|
+
"""Configuration settings for Polybos Media Engine."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
import platform
|
|
6
|
+
from enum import StrEnum
|
|
7
|
+
from functools import lru_cache
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from pydantic import BaseModel
|
|
12
|
+
|
|
13
|
+
# Module-level logger (one logger per module, stdlib convention).
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
# =============================================================================
# Default Constants
# =============================================================================

# Config file location
DEFAULT_CONFIG_PATH = Path.home() / ".config" / "polybos" / "config.json"

# API
DEFAULT_API_VERSION = "1.0"
DEFAULT_LOG_LEVEL = "INFO"

# Whisper speech-to-text
DEFAULT_WHISPER_MODEL = "auto"  # Auto-select based on VRAM
# Language used when Whisper cannot detect one.
DEFAULT_FALLBACK_LANGUAGE = "en"

# Speaker diarization
DEFAULT_DIARIZATION_MODEL = "pyannote/speaker-diarization-3.1"

# Processing
# Sampling rates in frames per second for the respective extractors.
DEFAULT_FACE_SAMPLE_FPS = 1.0
DEFAULT_OBJECT_SAMPLE_FPS = 2.0
# Minimum face bounding-box size (presumably pixels — TODO confirm against faces extractor).
DEFAULT_MIN_FACE_SIZE = 80

# Object detection (Qwen VLM)
DEFAULT_QWEN_MODEL = "auto"  # Auto-select based on VRAM
DEFAULT_QWEN_FRAMES_PER_SCENE = 1
DEFAULT_OBJECT_DETECTOR = "auto"  # Auto-select based on VRAM

# OCR - Latin script languages (see https://www.jaided.ai/easyocr/)
# For CJK: ch_sim, ch_tra, ja, ko
# Note: Finnish (fi) not supported by EasyOCR
DEFAULT_OCR_LANGUAGES: list[str] = [
    "en",  # English
    "no",  # Norwegian
    "de",  # German
    "fr",  # French
    "es",  # Spanish
    "it",  # Italian
    "pt",  # Portuguese
    "nl",  # Dutch
    "sv",  # Swedish
    "da",  # Danish
    "pl",  # Polish
]

# Temp directory
DEFAULT_TEMP_DIR = "/tmp/polybos"
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
# =============================================================================
|
|
65
|
+
# Enums
|
|
66
|
+
# =============================================================================
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class DeviceType(StrEnum):
    """Compute device types.

    Values match the device strings accepted by torch (e.g. torch.device("mps")).
    """

    MPS = "mps"  # Apple Silicon GPU (Metal)
    CUDA = "cuda"  # NVIDIA GPU
    CPU = "cpu"  # Fallback when no GPU is available
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
class ObjectDetector(StrEnum):
    """Object detection backend.

    YOLO is lightweight; Qwen is a VLM needing >= 8GB VRAM (see
    get_auto_object_detector).
    """

    YOLO = "yolo"
    QWEN = "qwen"
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
# =============================================================================
|
|
85
|
+
# Settings (loaded from JSON config file)
|
|
86
|
+
# =============================================================================
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
class Settings(BaseModel):
    """Application settings loaded from JSON config file.

    Config file location: ~/.config/polybos/config.json

    Model settings support "auto" to automatically select based on VRAM:
    - whisper_model: "auto" | "tiny" | "small" | "medium" | "large-v3"
    - qwen_model: "auto" | "Qwen/Qwen2-VL-2B-Instruct" | "Qwen/Qwen2-VL-7B-Instruct"
    - yolo_model: "auto" | "yolov8n.pt" | "yolov8s.pt" | "yolov8m.pt" | "yolov8l.pt" | "yolov8x.pt"
    - clip_model: "auto" | "ViT-B-16" | "ViT-B-32" | "ViT-L-14"
    - object_detector: "auto" | "yolo" | "qwen"

    The get_* helpers below resolve "auto" into a concrete value at call
    time, so a stored config file stays hardware-independent.
    """

    # API settings
    api_version: str = DEFAULT_API_VERSION
    log_level: str = DEFAULT_LOG_LEVEL

    # Whisper settings ("auto" = select based on VRAM)
    whisper_model: str = DEFAULT_WHISPER_MODEL
    # Language used when automatic language detection fails.
    fallback_language: str = DEFAULT_FALLBACK_LANGUAGE

    # Speaker diarization settings
    hf_token: str | None = None  # HuggingFace token for pyannote models
    diarization_model: str = DEFAULT_DIARIZATION_MODEL

    # Processing settings (sampling rates in frames per second)
    face_sample_fps: float = DEFAULT_FACE_SAMPLE_FPS
    object_sample_fps: float = DEFAULT_OBJECT_SAMPLE_FPS
    min_face_size: int = DEFAULT_MIN_FACE_SIZE

    # Object detection settings ("auto" = select based on VRAM)
    object_detector: str = DEFAULT_OBJECT_DETECTOR  # "auto", "yolo", or "qwen"
    qwen_model: str = DEFAULT_QWEN_MODEL
    qwen_frames_per_scene: int = DEFAULT_QWEN_FRAMES_PER_SCENE

    # YOLO model ("auto" = select based on VRAM)
    yolo_model: str = "auto"

    # CLIP model ("auto" = select based on VRAM)
    clip_model: str = "auto"

    # OCR settings
    # .copy() keeps instances from aliasing the module-level default list.
    ocr_languages: list[str] = DEFAULT_OCR_LANGUAGES.copy()

    # Temp directory for processing
    temp_dir: str = DEFAULT_TEMP_DIR

    def get_whisper_model(self) -> str:
        """Get resolved Whisper model (handles 'auto')."""
        if self.whisper_model == "auto":
            return get_auto_whisper_model()
        return self.whisper_model

    def get_qwen_model(self) -> str:
        """Get resolved Qwen model (handles 'auto')."""
        if self.qwen_model == "auto":
            return get_auto_qwen_model()
        return self.qwen_model

    def get_yolo_model(self) -> str:
        """Get resolved YOLO model (handles 'auto')."""
        if self.yolo_model == "auto":
            return get_auto_yolo_model()
        return self.yolo_model

    def get_clip_model(self) -> str:
        """Get resolved CLIP model (handles 'auto')."""
        if self.clip_model == "auto":
            return get_auto_clip_model()
        return self.clip_model

    def get_object_detector(self) -> ObjectDetector:
        """Get resolved object detector (handles 'auto').

        Raises:
            ValueError: if object_detector is not a valid ObjectDetector value.
        """
        if self.object_detector == "auto":
            return get_auto_object_detector()
        return ObjectDetector(self.object_detector)
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def get_config_path() -> Path:
    """Return the location of the JSON configuration file on disk."""
    return DEFAULT_CONFIG_PATH
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def load_config_from_file(config_path: Path | None = None) -> dict[str, Any]:
|
|
173
|
+
"""Load configuration from JSON file.
|
|
174
|
+
|
|
175
|
+
Args:
|
|
176
|
+
config_path: Optional path to config file. Defaults to ~/.config/polybos/config.json
|
|
177
|
+
|
|
178
|
+
Returns:
|
|
179
|
+
Dictionary of settings (empty if file doesn't exist)
|
|
180
|
+
"""
|
|
181
|
+
path = config_path or DEFAULT_CONFIG_PATH
|
|
182
|
+
|
|
183
|
+
if not path.exists():
|
|
184
|
+
return {}
|
|
185
|
+
|
|
186
|
+
try:
|
|
187
|
+
with open(path) as f:
|
|
188
|
+
return json.load(f)
|
|
189
|
+
except (json.JSONDecodeError, OSError) as e:
|
|
190
|
+
logger.warning(f"Failed to load config from {path}: {e}")
|
|
191
|
+
return {}
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def save_config_to_file(settings: Settings, config_path: Path | None = None) -> None:
    """Save configuration to JSON file.

    Args:
        settings: Settings instance to save
        config_path: Optional path to config file. Defaults to ~/.config/polybos/config.json
    """
    path = config_path or DEFAULT_CONFIG_PATH

    # Create directory if needed
    path.parent.mkdir(parents=True, exist_ok=True)

    # Explicit UTF-8 so non-ASCII values (tokens, paths) round-trip regardless
    # of the platform's default locale encoding; load_config_from_file reads
    # the file back with the same encoding.
    with open(path, "w", encoding="utf-8") as f:
        json.dump(settings.model_dump(), f, indent=2)

    logger.info(f"Saved config to {path}")
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
# Cached settings instance
# Lazily populated by get_settings(); reset to None by reload_settings() to
# force a re-read of the config file.
_settings: Settings | None = None
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def get_settings() -> Settings:
    """Return the cached Settings instance, loading the config file on first use."""
    global _settings

    # Fast path: already loaded.
    if _settings is not None:
        return _settings

    loaded = load_config_from_file()
    _settings = Settings(**loaded)
    if loaded:
        logger.info(f"Loaded settings from {DEFAULT_CONFIG_PATH}")
    else:
        logger.info("Using default settings")
    return _settings
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
def reload_settings() -> Settings:
    """Drop the cached instance and re-read settings from the config file."""
    global _settings
    _settings = None  # invalidate the cache so get_settings() reloads
    return get_settings()
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def is_apple_silicon() -> bool:
    """Return True when running on an arm64 macOS (Apple Silicon) machine."""
    system = platform.system()
    machine = platform.machine()
    return system == "Darwin" and machine == "arm64"
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def has_cuda() -> bool:
    """Return True when PyTorch is importable and reports a CUDA device."""
    try:
        import torch
    except ImportError:
        # No torch installed -> no CUDA support either way.
        return False
    return torch.cuda.is_available()
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def get_device() -> DeviceType:
    """Pick the best compute device available on this machine.

    Preference order: Apple Silicon MPS, then CUDA, then CPU.
    """
    if is_apple_silicon():
        return DeviceType.MPS
    if has_cuda():
        return DeviceType.CUDA
    return DeviceType.CPU
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
# =============================================================================
|
|
264
|
+
# VRAM Detection and Auto Model Selection
|
|
265
|
+
# =============================================================================
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
def get_gpu_name() -> str | None:
    """Get the GPU name/model.

    Returns:
        GPU name string (e.g., "NVIDIA GeForce RTX 4090", "Apple M2 Max")
        or None if no GPU is available.
    """
    if has_cuda():
        try:
            import torch

            return torch.cuda.get_device_name(0)
        except Exception:
            return None
    elif is_apple_silicon():
        try:
            import subprocess

            # Get chip name from sysctl; timeout keeps a wedged helper tool
            # from hanging the caller (matches the 5s used elsewhere here).
            result = subprocess.run(
                ["sysctl", "-n", "machdep.cpu.brand_string"],
                capture_output=True,
                text=True,
                timeout=5,
            )
            cpu_brand = result.stdout.strip()
            # Extract Apple chip name if present
            if "Apple" in cpu_brand:
                return cpu_brand
            # Fallback: try to get from SPHardwareDataType
            result = subprocess.run(
                ["system_profiler", "SPHardwareDataType"],
                capture_output=True,
                text=True,
                timeout=5,
            )
            for line in result.stdout.split("\n"):
                if "Chip:" in line:
                    return line.split(":")[-1].strip()
            return "Apple Silicon"
        except Exception:
            # Includes subprocess.TimeoutExpired: degrade to a generic label.
            return "Apple Silicon"
    return None
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
@lru_cache(maxsize=1)
def get_available_vram_gb() -> float:
    """Get available GPU memory in GB.

    Cached after the first call (lru_cache), so this reflects the total at
    first use rather than a live reading; use get_free_memory_gb() for a
    current figure.

    Returns:
        Available VRAM in GB. For Apple Silicon, returns unified memory.
        Returns 0.0 if no GPU is available.
    """
    if has_cuda():
        try:
            import torch

            props = torch.cuda.get_device_properties(0)
            total_gb = props.total_memory / (1024**3)
            logger.info(f"CUDA GPU: {props.name}, {total_gb:.1f}GB VRAM")
            return total_gb
        except Exception as e:
            logger.warning(f"Failed to get CUDA memory: {e}")
            return 0.0  # float, matching the annotated return type

    elif is_apple_silicon():
        try:
            import subprocess

            # Get total system memory (unified memory on Apple Silicon);
            # timeout matches the other sysctl calls in this module.
            result = subprocess.run(
                ["sysctl", "-n", "hw.memsize"],
                capture_output=True,
                text=True,
                timeout=5,
            )
            total_bytes = int(result.stdout.strip())
            total_gb = total_bytes / (1024**3)
            logger.info(f"Apple Silicon: {total_gb:.0f}GB unified memory")
            return total_gb
        except Exception as e:
            logger.warning(f"Failed to get Apple Silicon memory: {e}")
            return 8.0  # Conservative default for M1/M2

    return 0.0
|
|
350
|
+
|
|
351
|
+
|
|
352
|
+
def get_free_memory_gb() -> float:
    """Get memory available for ML models without causing excessive swapping.

    Uses psutil for cross-platform memory detection, which correctly accounts
    for free, inactive, and purgeable memory on macOS.

    Returns:
        Estimated GB available for loading models.
    """
    try:
        import psutil

        mem = psutil.virtual_memory()
        # psutil.available is the memory that can be given to processes
        # without swapping - this is what we want
        available_gb = mem.available / (1024**3)

        # Leave a 1GB buffer for system processes
        available_for_models = max(0.0, available_gb - 1.0)

        logger.info(f"Memory: {mem.total / (1024**3):.0f}GB total, " f"{mem.available / (1024**3):.1f}GB available, " f"{available_for_models:.1f}GB for models")
        return available_for_models

    except ImportError:
        logger.warning("psutil not installed, using fallback memory detection")

        # Fallback without psutil
        try:
            import sys

            if sys.platform == "darwin":
                import subprocess

                # macOS fallback: use total memory * 0.5 as conservative estimate
                result = subprocess.run(
                    ["sysctl", "-n", "hw.memsize"],
                    capture_output=True,
                    text=True,
                    timeout=5,
                )
                total_bytes = int(result.stdout.strip())
                total_gb = total_bytes / (1024**3)
                return total_gb * 0.5  # Conservative: assume 50% available
            else:
                # Linux: try /proc/meminfo
                with open("/proc/meminfo") as f:
                    for line in f:
                        if line.startswith("MemAvailable:"):
                            kb = int(line.split()[1])
                            return kb / (1024**2)
                # NOTE: if MemAvailable is absent we deliberately fall
                # through to the 8.0 GB default below.

        except Exception as e:
            logger.warning(f"Failed to get free memory: {e}")

    return 8.0  # Conservative fallback
|
|
407
|
+
|
|
408
|
+
|
|
409
|
+
def get_auto_whisper_model() -> str:
    """Select Whisper model based on available VRAM.

    | VRAM     | Model    | Size   | Quality |
    |----------|----------|--------|---------|
    | <4GB     | tiny     | 75MB   | Basic   |
    | 4-6GB    | small    | 488MB  | Good    |
    | 6-10GB   | medium   | 1.5GB  | Better  |
    | 10GB+    | large-v3 | 3GB    | Best    |
    """
    vram = get_available_vram_gb()

    # Walk tiers from the largest requirement down; first tier we clear wins.
    tiers = (
        (10, "large-v3"),
        (6, "medium"),
        (4, "small"),
    )
    model = "tiny"
    for floor, candidate in tiers:
        if vram >= floor:
            model = candidate
            break

    logger.info(f"Auto-selected Whisper model: {model} (VRAM: {vram:.1f}GB)")
    return model
|
|
432
|
+
|
|
433
|
+
|
|
434
|
+
def get_auto_qwen_model() -> str:
    """Select Qwen2-VL model based on available VRAM.

    | VRAM     | Model          | Size  | Quality |
    |----------|----------------|-------|---------|
    | <8GB     | (use YOLO)     | -     | Basic   |
    | 8-16GB   | Qwen2-VL-2B    | ~5GB  | Good    |
    | 16GB+    | Qwen2-VL-7B    | ~15GB | Best    |
    """
    vram = get_available_vram_gb()

    if vram < 8:
        # Not enough VRAM for Qwen, should use YOLO instead
        model = "Qwen/Qwen2-VL-2B-Instruct"
        logger.warning(f"Low VRAM ({vram:.1f}GB) - consider using YOLO instead of Qwen")
    else:
        model = "Qwen/Qwen2-VL-7B-Instruct" if vram >= 16 else "Qwen/Qwen2-VL-2B-Instruct"

    logger.info(f"Auto-selected Qwen model: {model} (VRAM: {vram:.1f}GB)")
    return model
|
|
456
|
+
|
|
457
|
+
|
|
458
|
+
def get_auto_object_detector() -> ObjectDetector:
    """Select object detector based on available VRAM.

    YOLO is faster and uses less memory.
    Qwen provides better scene understanding but needs more VRAM.
    """
    vram = get_available_vram_gb()
    # Qwen needs at least 8GB; otherwise fall back to the lightweight YOLO.
    detector = ObjectDetector.QWEN if vram >= 8 else ObjectDetector.YOLO

    logger.info(f"Auto-selected object detector: {detector} (VRAM: {vram:.1f}GB)")
    return detector
|
|
473
|
+
|
|
474
|
+
|
|
475
|
+
def get_auto_yolo_model() -> str:
    """Select YOLO model based on available VRAM.

    | VRAM     | Model     | Size   | Speed   |
    |----------|-----------|--------|---------|
    | <2GB     | yolov8n   | 6MB    | Fastest |
    | 2-4GB    | yolov8s   | 22MB   | Fast    |
    | 4-8GB    | yolov8m   | 52MB   | Medium  |
    | 8-16GB   | yolov8l   | 87MB   | Slow    |
    | 16GB+    | yolov8x   | 136MB  | Slowest |
    """
    vram = get_available_vram_gb()

    # Largest variant whose VRAM floor we meet; nano is the catch-all.
    tiers = (
        (16, "yolov8x.pt"),
        (8, "yolov8l.pt"),
        (4, "yolov8m.pt"),
        (2, "yolov8s.pt"),
    )
    model = next((name for floor, name in tiers if vram >= floor), "yolov8n.pt")

    logger.info(f"Auto-selected YOLO model: {model} (VRAM: {vram:.1f}GB)")
    return model
|
|
501
|
+
|
|
502
|
+
|
|
503
|
+
def get_auto_clip_model() -> str:
    """Select CLIP model based on available VRAM.

    | VRAM     | Model     | Size    | Quality |
    |----------|-----------|---------|---------|
    | <2GB     | ViT-B-16  | 335MB   | Good    |
    | 2-4GB    | ViT-B-32  | 338MB   | Good    |
    | 4GB+     | ViT-L-14  | 933MB   | Best    |

    Note: ViT-B-16 and ViT-B-32 are similar size but different patch sizes.
    ViT-B-32 is slightly faster, ViT-B-16 has better detail recognition.
    """
    vram = get_available_vram_gb()

    if vram >= 4:
        model = "ViT-L-14"
    else:
        model = "ViT-B-32" if vram >= 2 else "ViT-B-16"

    logger.info(f"Auto-selected CLIP model: {model} (VRAM: {vram:.1f}GB)")
    return model
|
|
526
|
+
|
|
527
|
+
|
|
528
|
+
def get_vram_summary() -> dict[str, Any]:
    """Get a summary of VRAM and auto-selected models.

    Useful for frontend to display hardware capabilities.
    """
    vram = get_available_vram_gb()  # Total VRAM (cached)
    free_mem = get_free_memory_gb()  # Currently available (not cached)
    device = get_device()

    # Model memory requirements
    # NOTE(review): these thresholds roughly duplicate values in
    # MODEL_MEMORY_REQUIREMENTS below — consider unifying.
    qwen_2b_needs = 5.0
    qwen_7b_needs = 15.0
    whisper_large_needs = 6.0

    return {
        "device": str(device),
        "gpu_name": get_gpu_name(),
        "vram_gb": round(vram, 1),
        "free_memory_gb": round(free_mem, 1),
        "auto_whisper_model": get_auto_whisper_model(),
        "auto_qwen_model": get_auto_qwen_model() if vram >= 8 else None,
        "auto_yolo_model": get_auto_yolo_model(),
        "auto_clip_model": get_auto_clip_model(),
        "auto_object_detector": str(get_auto_object_detector()),
        # What the hardware CAN support (based on total VRAM)
        "recommendations": {
            "can_use_large_whisper": vram >= 10,
            "can_use_qwen": vram >= 8,
            "can_use_qwen_7b": vram >= 16,
            "can_use_clip_l14": vram >= 4,
            "can_use_yolo_xlarge": vram >= 16,
        },
        # What can load RIGHT NOW (based on free memory)
        "available_now": {
            "qwen_2b": free_mem >= qwen_2b_needs,
            "qwen_7b": free_mem >= qwen_7b_needs,
            "whisper_large": free_mem >= whisper_large_needs,
            "whisper_medium": free_mem >= 4.0,
            "whisper_small": free_mem >= 2.0,
            "yolo": True,  # YOLO is always available (~100MB)
            "clip": free_mem >= 1.0,
        },
    }
|
|
571
|
+
|
|
572
|
+
|
|
573
|
+
# =============================================================================
# Memory Monitoring
# =============================================================================

# Approximate memory requirements for models (in GB)
# Used by check_memory_before_load(); unknown model names default to 2GB there.
MODEL_MEMORY_REQUIREMENTS: dict[str, float] = {
    # Whisper models
    "tiny": 1.0,
    "small": 2.0,
    "medium": 4.0,
    "large-v3": 6.0,
    # YOLO models
    "yolov8n.pt": 0.2,
    "yolov8s.pt": 0.3,
    "yolov8m.pt": 0.5,
    "yolov8l.pt": 0.8,
    "yolov8x.pt": 1.2,
    # Qwen VLM
    "Qwen/Qwen2-VL-2B-Instruct": 6.0,
    "Qwen/Qwen2-VL-7B-Instruct": 16.0,
    # CLIP
    "ViT-B-16": 0.4,
    "ViT-B-32": 0.4,
    "ViT-L-14": 1.0,
    # OCR
    "easyocr": 3.0,
    # Face detection
    "deepface": 0.5,
}
|
|
602
|
+
|
|
603
|
+
|
|
604
|
+
def get_available_memory_gb() -> tuple[float, float]:
    """Get available system RAM and GPU memory in GB.

    Returns:
        (available_ram_gb, available_vram_gb)
    """
    gib = 1024**3

    try:
        import psutil  # type: ignore[import-not-found]
    except ImportError:
        logger.warning("psutil not installed, cannot monitor RAM")
        available_ram = 8.0  # Conservative default
    else:
        available_ram = psutil.virtual_memory().available / gib

    # GPU memory
    available_vram = 0.0
    if has_cuda():
        try:
            import torch

            props = torch.cuda.get_device_properties(0)
            in_use = torch.cuda.memory_allocated(0)
            available_vram = (props.total_memory - in_use) / gib
        except Exception:
            # Fall back to the cached total-VRAM figure.
            available_vram = get_available_vram_gb()
    elif is_apple_silicon():
        # Unified memory - estimate from system available
        available_vram = available_ram * 0.75

    return available_ram, available_vram
|
|
635
|
+
|
|
636
|
+
|
|
637
|
+
def check_memory_before_load(model_name: str, clear_memory_func: Any | None = None) -> bool:
    """Check if enough memory is available before loading a model.

    If memory is low, attempts to free memory by calling the clear function.

    Args:
        model_name: Name of the model to load (must be in MODEL_MEMORY_REQUIREMENTS;
            unknown names default to a 2GB requirement)
        clear_memory_func: Optional function to call to free memory (e.g., gc.collect)

    Returns:
        False only when memory is insufficient, a clear_memory_func was
        provided, and the re-check after cleanup still shows a shortfall.
        In every other case — including low memory with no cleanup function —
        the result is True: callers get a logged warning, not a refusal.
    """
    required_gb = MODEL_MEMORY_REQUIREMENTS.get(model_name, 2.0)  # Default 2GB
    ram, vram = get_available_memory_gb()

    # Use VRAM for GPU models, RAM for CPU
    device = get_device()
    available = vram if device != DeviceType.CPU else ram

    if available < required_gb:
        logger.warning(f"Low memory ({available:.1f}GB available) for {model_name} " f"({required_gb:.1f}GB required)")

        # Try to free memory
        if clear_memory_func is not None:
            logger.info("Attempting to free memory...")
            clear_memory_func()

            # Re-check
            ram, vram = get_available_memory_gb()
            available = vram if device != DeviceType.CPU else ram

            if available < required_gb:
                logger.error(f"Still insufficient memory ({available:.1f}GB) for {model_name}")
                return False
            else:
                logger.info(f"Memory freed, now {available:.1f}GB available")

    return True
|