media_engine-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cli/clip.py +79 -0
- cli/faces.py +91 -0
- cli/metadata.py +68 -0
- cli/motion.py +77 -0
- cli/objects.py +94 -0
- cli/ocr.py +93 -0
- cli/scenes.py +57 -0
- cli/telemetry.py +65 -0
- cli/transcript.py +76 -0
- media_engine/__init__.py +7 -0
- media_engine/_version.py +34 -0
- media_engine/app.py +80 -0
- media_engine/batch/__init__.py +56 -0
- media_engine/batch/models.py +99 -0
- media_engine/batch/processor.py +1131 -0
- media_engine/batch/queue.py +232 -0
- media_engine/batch/state.py +30 -0
- media_engine/batch/timing.py +321 -0
- media_engine/cli.py +17 -0
- media_engine/config.py +674 -0
- media_engine/extractors/__init__.py +75 -0
- media_engine/extractors/clip.py +401 -0
- media_engine/extractors/faces.py +459 -0
- media_engine/extractors/frame_buffer.py +351 -0
- media_engine/extractors/frames.py +402 -0
- media_engine/extractors/metadata/__init__.py +127 -0
- media_engine/extractors/metadata/apple.py +169 -0
- media_engine/extractors/metadata/arri.py +118 -0
- media_engine/extractors/metadata/avchd.py +208 -0
- media_engine/extractors/metadata/avchd_gps.py +270 -0
- media_engine/extractors/metadata/base.py +688 -0
- media_engine/extractors/metadata/blackmagic.py +139 -0
- media_engine/extractors/metadata/camera_360.py +276 -0
- media_engine/extractors/metadata/canon.py +290 -0
- media_engine/extractors/metadata/dji.py +371 -0
- media_engine/extractors/metadata/dv.py +121 -0
- media_engine/extractors/metadata/ffmpeg.py +76 -0
- media_engine/extractors/metadata/generic.py +119 -0
- media_engine/extractors/metadata/gopro.py +256 -0
- media_engine/extractors/metadata/red.py +305 -0
- media_engine/extractors/metadata/registry.py +114 -0
- media_engine/extractors/metadata/sony.py +442 -0
- media_engine/extractors/metadata/tesla.py +157 -0
- media_engine/extractors/motion.py +765 -0
- media_engine/extractors/objects.py +245 -0
- media_engine/extractors/objects_qwen.py +754 -0
- media_engine/extractors/ocr.py +268 -0
- media_engine/extractors/scenes.py +82 -0
- media_engine/extractors/shot_type.py +217 -0
- media_engine/extractors/telemetry.py +262 -0
- media_engine/extractors/transcribe.py +579 -0
- media_engine/extractors/translate.py +121 -0
- media_engine/extractors/vad.py +263 -0
- media_engine/main.py +68 -0
- media_engine/py.typed +0 -0
- media_engine/routers/__init__.py +15 -0
- media_engine/routers/batch.py +78 -0
- media_engine/routers/health.py +93 -0
- media_engine/routers/models.py +211 -0
- media_engine/routers/settings.py +87 -0
- media_engine/routers/utils.py +135 -0
- media_engine/schemas.py +581 -0
- media_engine/utils/__init__.py +5 -0
- media_engine/utils/logging.py +54 -0
- media_engine/utils/memory.py +49 -0
- media_engine-0.1.0.dist-info/METADATA +276 -0
- media_engine-0.1.0.dist-info/RECORD +70 -0
- media_engine-0.1.0.dist-info/WHEEL +4 -0
- media_engine-0.1.0.dist-info/entry_points.txt +11 -0
- media_engine-0.1.0.dist-info/licenses/LICENSE +21 -0
media_engine/extractors/transcribe.py
@@ -0,0 +1,579 @@
"""Audio transcription using Whisper with platform-specific backends."""

import logging
import os
import subprocess
import tempfile
from abc import ABC, abstractmethod
from collections.abc import Callable
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any

from media_engine.config import get_settings, has_cuda, is_apple_silicon
from media_engine.schemas import Transcript, TranscriptHints, TranscriptSegment

# Progress callback type: (message, current, total) -> None
ProgressCallback = Callable[[str, int | None, int | None], None]


@dataclass(slots=True)
class TranscriptionSegment:
    """A single segment from transcription.

    Uses slots=True to reduce memory overhead per instance.
    """

    start: float
    end: float
    text: str


@dataclass(slots=True)
class TranscriptionResult:
    """Result from a transcription backend.

    Uses slots=True to reduce memory overhead per instance.
    """

    language: str
    language_probability: float
    segments: list[TranscriptionSegment] = field(default_factory=list)


@dataclass(slots=True)
class SpeakerSegment:
    """A speaker segment from diarization.

    Uses slots=True to reduce memory overhead per instance.
    """

    start: float
    end: float
    speaker: str


@dataclass(slots=True)
class DiarizationResult:
    """Result from speaker diarization.

    Uses slots=True to reduce memory overhead per instance.
    """

    segments: list[SpeakerSegment] = field(default_factory=list)
    speaker_count: int = 0


logger = logging.getLogger(__name__)


class TranscriptionBackend(ABC):
    """Abstract base class for transcription backends."""

    @abstractmethod
    def transcribe(
        self,
        audio_path: str,
        model: str = "large-v3",
        language: str | None = None,
        initial_prompt: str | None = None,
    ) -> TranscriptionResult:
        """Transcribe audio file.

        Args:
            audio_path: Path to audio file
            model: Whisper model name
            language: Force language (None for auto-detect)
            initial_prompt: Context prompt for better accuracy

        Returns:
            TranscriptionResult with language, segments, and confidence
        """
        pass


class WhisperMLX(TranscriptionBackend):
    """Apple Silicon backend using mlx-whisper."""

    def __init__(self) -> None:
        self._model: Any = None
        self._model_name: str | None = None

    def _load_model(self, model: str) -> None:
        if self._model is None or self._model_name != model:
            import mlx_whisper  # type: ignore[import-not-found]

            self._model = mlx_whisper
            self._model_name = model
            logger.info(f"Loaded mlx-whisper model: {model}")

    def transcribe(
        self,
        audio_path: str,
        model: str = "large-v3",
        language: str | None = None,
        initial_prompt: str | None = None,
    ) -> TranscriptionResult:
        self._load_model(model)

        result: dict[str, Any] = self._model.transcribe(
            audio_path,
            path_or_hf_repo=f"mlx-community/whisper-{model}-mlx",
            language=language,
            initial_prompt=initial_prompt,
            word_timestamps=False,
            task="transcribe",  # Explicitly prevent translation to English
        )

        return TranscriptionResult(
            language=result.get("language", "unknown"),
            language_probability=result.get("language_probability", 0.0),
            segments=[TranscriptionSegment(start=s["start"], end=s["end"], text=s["text"].strip()) for s in result.get("segments", [])],
        )


class WhisperCUDA(TranscriptionBackend):
    """NVIDIA GPU backend using faster-whisper."""

    def __init__(self) -> None:
        self._model: Any = None
        self._model_name: str | None = None

    def _load_model(self, model: str) -> None:
        if self._model is None or self._model_name != model:
            from faster_whisper import WhisperModel  # type: ignore[import-not-found]

            self._model = WhisperModel(model, device="cuda", compute_type="float16")
            self._model_name = model
            logger.info(f"Loaded faster-whisper model: {model}")

    def transcribe(
        self,
        audio_path: str,
        model: str = "large-v3",
        language: str | None = None,
        initial_prompt: str | None = None,
    ) -> TranscriptionResult:
        self._load_model(model)

        segments, info = self._model.transcribe(
            audio_path,
            language=language,
            initial_prompt=initial_prompt,
            word_timestamps=False,
            task="transcribe",  # Explicitly prevent translation to English
        )

        return TranscriptionResult(
            language=info.language,
            language_probability=info.language_probability,
            segments=[TranscriptionSegment(start=s.start, end=s.end, text=s.text.strip()) for s in segments],
        )


class WhisperCPU(TranscriptionBackend):
    """CPU fallback using openai-whisper."""

    def __init__(self) -> None:
        self._model: Any = None
        self._model_name: str | None = None

    def _load_model(self, model: str) -> None:
        # Use smaller model for CPU
        actual_model = "medium" if model == "large-v3" else model

        if self._model is None or self._model_name != actual_model:
            import whisper  # type: ignore[import-not-found]

            self._model = whisper.load_model(actual_model)
            self._model_name = actual_model
            logger.info(f"Loaded openai-whisper model: {actual_model}")

    def transcribe(
        self,
        audio_path: str,
        model: str = "large-v3",
        language: str | None = None,
        initial_prompt: str | None = None,
    ) -> TranscriptionResult:
        self._load_model(model)

        result: dict[str, Any] = self._model.transcribe(
            audio_path,
            language=language,
            initial_prompt=initial_prompt,
            word_timestamps=False,
            task="transcribe",  # Explicitly prevent translation to English
        )

        return TranscriptionResult(
            language=result.get("language", "unknown"),
            language_probability=0.0,  # Not provided by openai-whisper
            segments=[TranscriptionSegment(start=s["start"], end=s["end"], text=s["text"].strip()) for s in result.get("segments", [])],
        )


# Singleton backend instance
_backend: TranscriptionBackend | None = None


def get_transcription_backend() -> TranscriptionBackend:
    """Get the appropriate transcription backend for the current platform."""
    global _backend

    if _backend is not None:
        return _backend

    if is_apple_silicon():
        try:
            import mlx_whisper  # type: ignore[import-not-found]  # noqa: F401

            _backend = WhisperMLX()
            logger.info("Using mlx-whisper backend (Apple Silicon)")
            return _backend
        except ImportError:
            logger.warning("mlx-whisper not available, falling back")

    if has_cuda():
        try:
            from faster_whisper import WhisperModel  # type: ignore[import-not-found]  # noqa: F401

            _backend = WhisperCUDA()
            logger.info("Using faster-whisper backend (CUDA)")
            return _backend
        except ImportError:
            logger.warning("faster-whisper not available, falling back")

    try:
        import whisper  # type: ignore[import-not-found]  # noqa: F401

        _backend = WhisperCPU()
        logger.info("Using openai-whisper backend (CPU)")
        return _backend
    except ImportError:
        raise RuntimeError("No Whisper backend available. Install one of: mlx-whisper, faster-whisper, openai-whisper")


def unload_whisper_model() -> None:
    """Unload Whisper model from memory to free GPU/MPS memory."""
    global _backend

    if _backend is not None:
        logger.info("Unloading Whisper model from memory")
        # Clear internal model references
        if hasattr(_backend, "_model"):
            del _backend._model  # type: ignore[attr-defined]
            _backend._model = None  # type: ignore[attr-defined]
        _backend = None

    import gc

    gc.collect()

    # Free GPU memory with sync
    try:
        import torch

        if torch.cuda.is_available():
            torch.cuda.synchronize()
            torch.cuda.empty_cache()
        if hasattr(torch, "mps"):
            if hasattr(torch.mps, "synchronize"):
                torch.mps.synchronize()
            if hasattr(torch.mps, "empty_cache"):
                torch.mps.empty_cache()
    except ImportError:
        pass

    gc.collect()


# Singleton diarization pipeline
_diarization_pipeline: Any = None


def get_diarization_pipeline() -> Any:
    """Get the pyannote diarization pipeline (lazy loaded)."""
    global _diarization_pipeline

    if _diarization_pipeline is not None:
        return _diarization_pipeline

    settings = get_settings()
    if not settings.hf_token:
        return None

    try:
        from pyannote.audio import Pipeline  # type: ignore[import-not-found]

        logger.info(f"Loading diarization model: {settings.diarization_model}")
        _diarization_pipeline = Pipeline.from_pretrained(
            settings.diarization_model,
            use_auth_token=settings.hf_token,
        )

        # Move to appropriate device
        if has_cuda():
            import torch

            _diarization_pipeline.to(torch.device("cuda"))
            logger.info("Diarization pipeline moved to CUDA")

        logger.info("Diarization pipeline loaded successfully")
        return _diarization_pipeline

    except ImportError:
        logger.warning("pyannote-audio not installed, diarization disabled")
        return None
    except Exception as e:
        logger.warning(f"Failed to load diarization pipeline: {e}")
        return None


def run_diarization(audio_path: str) -> DiarizationResult | None:
    """Run speaker diarization on audio file.

    Args:
        audio_path: Path to audio file (WAV format)

    Returns:
        DiarizationResult with speaker segments, or None if diarization unavailable
    """
    pipeline = get_diarization_pipeline()
    if pipeline is None:
        return None

    try:
        logger.info("Running speaker diarization...")
        diarization = pipeline(audio_path)

        segments: list[SpeakerSegment] = []
        speakers: set[str] = set()

        for turn, _, speaker in diarization.itertracks(yield_label=True):
            segments.append(
                SpeakerSegment(
                    start=turn.start,
                    end=turn.end,
                    speaker=speaker,
                )
            )
            speakers.add(speaker)

        logger.info(f"Diarization complete: {len(speakers)} speakers, {len(segments)} segments")

        return DiarizationResult(
            segments=segments,
            speaker_count=len(speakers),
        )

    except Exception as e:
        logger.warning(f"Diarization failed: {e}")
        return None


def assign_speakers_to_segments(
    transcript_segments: list[TranscriptionSegment],
    diarization: DiarizationResult,
) -> list[tuple[TranscriptionSegment, str | None]]:
    """Assign speaker labels to transcript segments based on overlap.

    Args:
        transcript_segments: List of transcript segments with timestamps
        diarization: Diarization result with speaker segments

    Returns:
        List of (segment, speaker) tuples
    """
    result: list[tuple[TranscriptionSegment, str | None]] = []

    for seg in transcript_segments:
        # Find the speaker segment with maximum overlap
        best_speaker: str | None = None
        best_overlap = 0.0

        for spk_seg in diarization.segments:
            # Calculate overlap
            overlap_start = max(seg.start, spk_seg.start)
            overlap_end = min(seg.end, spk_seg.end)
            overlap = max(0.0, overlap_end - overlap_start)

            if overlap > best_overlap:
                best_overlap = overlap
                best_speaker = spk_seg.speaker

        result.append((seg, best_speaker))

    return result


def extract_audio(video_path: str, output_path: str | None = None) -> str:
    """Extract audio from video file as 16kHz mono WAV.

    Args:
        video_path: Path to video file
        output_path: Output path for audio file (optional)

    Returns:
        Path to extracted audio file
    """
    if output_path is None:
        fd, output_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)

    cmd = [
        "ffmpeg",
        "-y",
        "-i",
        video_path,
        "-vn",
        "-ar",
        "16000",
        "-ac",
        "1",
        "-c:a",
        "pcm_s16le",
        output_path,
    ]

    try:
        subprocess.run(cmd, capture_output=True, check=True)
        return output_path
    except subprocess.CalledProcessError as e:
        logger.error(f"Failed to extract audio: {e.stderr}")
        raise RuntimeError(f"Failed to extract audio: {e.stderr}")


def extract_transcript(
    file_path: str,
    model: str = "auto",
    language: str | None = None,
    fallback_language: str = "en",
    language_hints: list[str] | None = None,
    context_hint: str | None = None,
    progress_callback: ProgressCallback | None = None,
) -> Transcript:
    """Extract transcript from video file.

    Args:
        file_path: Path to video file
        model: Whisper model name ("auto" = select based on VRAM)
        language: Force language (skip detection)
        fallback_language: Fallback for short clips with low confidence
        language_hints: Language hints (not directly used by Whisper, but logged)
        context_hint: Context hint used as initial_prompt

    Returns:
        Transcript object with segments and metadata
    """
    # Resolve "auto" model based on available VRAM
    if model == "auto":
        from media_engine.config import get_auto_whisper_model

        model = get_auto_whisper_model()
        logger.info(f"Auto-selected Whisper model: {model}")

    path = Path(file_path)
    if not path.exists():
        raise FileNotFoundError(f"Video file not found: {file_path}")

    # Extract audio
    if progress_callback:
        progress_callback("Extracting audio...", None, None)
    logger.info(f"Extracting audio from {file_path}")
    audio_path = extract_audio(file_path)

    try:
        # Get audio duration
        audio_duration = _get_audio_duration(audio_path)

        # Get backend
        backend = get_transcription_backend()

        # Determine language
        use_language = language
        fallback_applied = False

        if use_language is None:
            # First pass: detect language
            if progress_callback:
                progress_callback("Detecting language...", None, None)
            logger.info("Detecting language...")
            detect_result = backend.transcribe(audio_path, model=model, language=None, initial_prompt=None)

            detected_lang = detect_result.language
            confidence = detect_result.language_probability

            logger.info(f"Detected language: {detected_lang} (confidence: {confidence:.2f})")

            # Apply fallback for short clips with low confidence
            if confidence < 0.7 and audio_duration < 15:
                use_language = fallback_language
                fallback_applied = True
                logger.info(f"Low confidence on short clip, using fallback: {use_language}")
            else:
                use_language = detected_lang

        # Main transcription
        if progress_callback:
            progress_callback("Transcribing audio...", None, None)
        logger.info(f"Transcribing with language={use_language}, model={model}")
        result = backend.transcribe(
            audio_path,
            model=model,
            language=use_language,
            initial_prompt=context_hint,
        )

        # Run diarization if available
        if progress_callback:
            progress_callback("Speaker diarization...", None, None)
        diarization = run_diarization(audio_path)
        speaker_count: int | None = None

        if diarization is not None:
            # Assign speakers to segments
            segments_with_speakers = assign_speakers_to_segments(result.segments, diarization)
            segments = [TranscriptSegment(start=s.start, end=s.end, text=s.text, speaker=speaker) for s, speaker in segments_with_speakers]
            speaker_count = diarization.speaker_count
            logger.info(f"Diarization complete: {speaker_count} speakers detected")
        else:
            segments = [TranscriptSegment(start=s.start, end=s.end, text=s.text) for s in result.segments]

        return Transcript(
            language=result.language,
            confidence=result.language_probability,
            duration=audio_duration,
            speaker_count=speaker_count,
            hints_used=TranscriptHints(
                language_hints=language_hints or [],
                context_hint=context_hint,
                fallback_applied=fallback_applied,
            ),
            segments=segments,
        )

    finally:
        # Clean up temp audio file
        if os.path.exists(audio_path):
            os.remove(audio_path)


def _get_audio_duration(audio_path: str) -> float:
    """Get duration of audio file in seconds."""
    cmd = [
        "ffprobe",
        "-v",
        "quiet",
        "-show_entries",
        "format=duration",
        "-of",
        "default=noprint_wrappers=1:nokey=1",
        audio_path,
    ]

    try:
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        return float(result.stdout.strip())
    except (subprocess.CalledProcessError, ValueError):
        return 0.0
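For orientation only (this is not part of the wheel contents above): a minimal sketch of how the transcription entry point shipped in transcribe.py might be called, assuming a hypothetical input file `interview.mp4` and a simple progress printer that matches the ProgressCallback signature.

# Illustrative usage sketch -- not part of the packaged files.
from media_engine.extractors.transcribe import extract_transcript


def print_progress(message: str, current: int | None, total: int | None) -> None:
    # Matches ProgressCallback: (message, current, total) -> None
    print(message if current is None else f"{message} ({current}/{total})")


transcript = extract_transcript(
    "interview.mp4",        # hypothetical input file
    model="auto",           # module picks a Whisper model based on available VRAM
    fallback_language="en", # applied to short clips with low detection confidence
    context_hint="Tech conference interview",  # forwarded as Whisper's initial_prompt
    progress_callback=print_progress,
)
print(transcript.language, transcript.speaker_count, len(transcript.segments))

The progress callback is optional; speaker_count is None when diarization is unavailable (no pyannote-audio or no HF token configured).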
media_engine/extractors/translate.py
@@ -0,0 +1,121 @@
"""Query translation for CLIP search.

Translates non-English search queries to English for better CLIP performance,
since CLIP models are primarily trained on English text.
"""

import logging
from typing import Optional

import httpx
from langdetect import LangDetectException, detect

logger = logging.getLogger(__name__)

# Languages supported for translation (ISO 639-1 codes)
SUPPORTED_LANGUAGES = {"de", "fr", "es", "it", "pt", "nl", "sv", "da", "no", "fi", "pl", "ru", "ja", "zh-cn", "zh-tw", "ko"}


def detect_language(text: str) -> Optional[str]:
    """Detect the language of the given text.

    Args:
        text: Text to detect language of

    Returns:
        ISO 639-1 language code (e.g., 'en', 'de', 'fr') or None if detection fails
    """
    if not text or len(text.strip()) < 3:
        return None

    try:
        lang = detect(text)
        return lang
    except LangDetectException as e:
        logger.debug(f"Language detection failed: {e}")
        return None


def translate_to_english(text: str, source_lang: Optional[str] = None, ollama_base_url: str = "http://localhost:11434") -> str:
    """Translate text to English using Ollama.

    Args:
        text: Text to translate
        source_lang: Source language code (auto-detected if not provided)
        ollama_base_url: Base URL for Ollama API

    Returns:
        Translated text in English, or original text if translation fails
    """
    if not text:
        return text

    # Detect language if not provided
    if source_lang is None:
        source_lang = detect_language(text)

    # If already English or detection failed, return original
    if source_lang is None or source_lang == "en":
        return text

    # Try translation with Ollama
    try:
        prompt = f"""Translate the following text to English. Only output the translation, nothing else.

Text: {text}

English translation:"""

        response = httpx.post(
            f"{ollama_base_url}/api/generate",
            json={
                "model": "qwen2.5:3b",  # Small, fast model for translation
                "prompt": prompt,
                "stream": False,
                "options": {
                    "temperature": 0.1,  # Low temperature for deterministic translation
                    "num_predict": 100,  # Short output expected
                },
            },
            timeout=30.0,
        )

        if response.status_code == 200:
            result = response.json()
            translated = result.get("response", "").strip()
            if translated:
                logger.info(f"Translated query from {source_lang}: '{text}' -> '{translated}'")
                return translated
    except httpx.TimeoutException:
        logger.warning("Translation timed out, using original text")
    except httpx.ConnectError:
        logger.debug("Ollama not available for translation")
    except Exception as e:
        logger.warning(f"Translation failed: {e}")

    return text


def translate_query_for_clip(text: str, enable_translation: bool = True, ollama_base_url: str = "http://localhost:11434") -> tuple[str, Optional[str], bool]:
    """Translate a CLIP search query to English if needed.

    Args:
        text: Search query text
        enable_translation: Whether to enable translation
        ollama_base_url: Base URL for Ollama API

    Returns:
        Tuple of (processed_text, detected_language, was_translated)
    """
    if not enable_translation:
        return text, None, False

    source_lang = detect_language(text)

    if source_lang is None or source_lang == "en":
        return text, source_lang, False

    translated = translate_to_english(text, source_lang, ollama_base_url)
    was_translated = translated != text

    return translated, source_lang, was_translated
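Again for orientation only (not part of the wheel contents): a minimal sketch of the query-translation helper above, assuming a hypothetical German query and the module's default Ollama endpoint. If Ollama is unreachable, detection fails, or the query is already English, the original text comes back unchanged.

# Illustrative usage sketch -- not part of the packaged files.
from media_engine.extractors.translate import translate_query_for_clip

query, detected_lang, was_translated = translate_query_for_clip(
    "Hund spielt am Strand",  # hypothetical German query ("dog playing on the beach")
    enable_translation=True,
    ollama_base_url="http://localhost:11434",  # default endpoint used by the module
)
# e.g. ("dog playing on the beach", "de", True) when Ollama is running,
# or ("Hund spielt am Strand", "de", False) when translation is unavailable.
print(query, detected_lang, was_translated)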