media_engine-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. cli/clip.py +79 -0
  2. cli/faces.py +91 -0
  3. cli/metadata.py +68 -0
  4. cli/motion.py +77 -0
  5. cli/objects.py +94 -0
  6. cli/ocr.py +93 -0
  7. cli/scenes.py +57 -0
  8. cli/telemetry.py +65 -0
  9. cli/transcript.py +76 -0
  10. media_engine/__init__.py +7 -0
  11. media_engine/_version.py +34 -0
  12. media_engine/app.py +80 -0
  13. media_engine/batch/__init__.py +56 -0
  14. media_engine/batch/models.py +99 -0
  15. media_engine/batch/processor.py +1131 -0
  16. media_engine/batch/queue.py +232 -0
  17. media_engine/batch/state.py +30 -0
  18. media_engine/batch/timing.py +321 -0
  19. media_engine/cli.py +17 -0
  20. media_engine/config.py +674 -0
  21. media_engine/extractors/__init__.py +75 -0
  22. media_engine/extractors/clip.py +401 -0
  23. media_engine/extractors/faces.py +459 -0
  24. media_engine/extractors/frame_buffer.py +351 -0
  25. media_engine/extractors/frames.py +402 -0
  26. media_engine/extractors/metadata/__init__.py +127 -0
  27. media_engine/extractors/metadata/apple.py +169 -0
  28. media_engine/extractors/metadata/arri.py +118 -0
  29. media_engine/extractors/metadata/avchd.py +208 -0
  30. media_engine/extractors/metadata/avchd_gps.py +270 -0
  31. media_engine/extractors/metadata/base.py +688 -0
  32. media_engine/extractors/metadata/blackmagic.py +139 -0
  33. media_engine/extractors/metadata/camera_360.py +276 -0
  34. media_engine/extractors/metadata/canon.py +290 -0
  35. media_engine/extractors/metadata/dji.py +371 -0
  36. media_engine/extractors/metadata/dv.py +121 -0
  37. media_engine/extractors/metadata/ffmpeg.py +76 -0
  38. media_engine/extractors/metadata/generic.py +119 -0
  39. media_engine/extractors/metadata/gopro.py +256 -0
  40. media_engine/extractors/metadata/red.py +305 -0
  41. media_engine/extractors/metadata/registry.py +114 -0
  42. media_engine/extractors/metadata/sony.py +442 -0
  43. media_engine/extractors/metadata/tesla.py +157 -0
  44. media_engine/extractors/motion.py +765 -0
  45. media_engine/extractors/objects.py +245 -0
  46. media_engine/extractors/objects_qwen.py +754 -0
  47. media_engine/extractors/ocr.py +268 -0
  48. media_engine/extractors/scenes.py +82 -0
  49. media_engine/extractors/shot_type.py +217 -0
  50. media_engine/extractors/telemetry.py +262 -0
  51. media_engine/extractors/transcribe.py +579 -0
  52. media_engine/extractors/translate.py +121 -0
  53. media_engine/extractors/vad.py +263 -0
  54. media_engine/main.py +68 -0
  55. media_engine/py.typed +0 -0
  56. media_engine/routers/__init__.py +15 -0
  57. media_engine/routers/batch.py +78 -0
  58. media_engine/routers/health.py +93 -0
  59. media_engine/routers/models.py +211 -0
  60. media_engine/routers/settings.py +87 -0
  61. media_engine/routers/utils.py +135 -0
  62. media_engine/schemas.py +581 -0
  63. media_engine/utils/__init__.py +5 -0
  64. media_engine/utils/logging.py +54 -0
  65. media_engine/utils/memory.py +49 -0
  66. media_engine-0.1.0.dist-info/METADATA +276 -0
  67. media_engine-0.1.0.dist-info/RECORD +70 -0
  68. media_engine-0.1.0.dist-info/WHEEL +4 -0
  69. media_engine-0.1.0.dist-info/entry_points.txt +11 -0
  70. media_engine-0.1.0.dist-info/licenses/LICENSE +21 -0
media_engine/extractors/transcribe.py
@@ -0,0 +1,579 @@
+ """Audio transcription using Whisper with platform-specific backends."""
+
+ import logging
+ import os
+ import subprocess
+ import tempfile
+ from abc import ABC, abstractmethod
+ from collections.abc import Callable
+ from dataclasses import dataclass, field
+ from pathlib import Path
+ from typing import Any
+
+ from media_engine.config import get_settings, has_cuda, is_apple_silicon
+ from media_engine.schemas import Transcript, TranscriptHints, TranscriptSegment
+
+ # Progress callback type: (message, current, total) -> None
+ ProgressCallback = Callable[[str, int | None, int | None], None]
+
+
+ @dataclass(slots=True)
+ class TranscriptionSegment:
+     """A single segment from transcription.
+
+     Uses slots=True to reduce memory overhead per instance.
+     """
+
+     start: float
+     end: float
+     text: str
+
+
+ @dataclass(slots=True)
+ class TranscriptionResult:
+     """Result from a transcription backend.
+
+     Uses slots=True to reduce memory overhead per instance.
+     """
+
+     language: str
+     language_probability: float
+     segments: list[TranscriptionSegment] = field(default_factory=list)
+
+
+ @dataclass(slots=True)
+ class SpeakerSegment:
+     """A speaker segment from diarization.
+
+     Uses slots=True to reduce memory overhead per instance.
+     """
+
+     start: float
+     end: float
+     speaker: str
+
+
+ @dataclass(slots=True)
+ class DiarizationResult:
+     """Result from speaker diarization.
+
+     Uses slots=True to reduce memory overhead per instance.
+     """
+
+     segments: list[SpeakerSegment] = field(default_factory=list)
+     speaker_count: int = 0
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class TranscriptionBackend(ABC):
+     """Abstract base class for transcription backends."""
+
+     @abstractmethod
+     def transcribe(
+         self,
+         audio_path: str,
+         model: str = "large-v3",
+         language: str | None = None,
+         initial_prompt: str | None = None,
+     ) -> TranscriptionResult:
+         """Transcribe audio file.
+
+         Args:
+             audio_path: Path to audio file
+             model: Whisper model name
+             language: Force language (None for auto-detect)
+             initial_prompt: Context prompt for better accuracy
+
+         Returns:
+             TranscriptionResult with language, segments, and confidence
+         """
+         pass
+
+
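The three platform backends that follow all implement this interface. As illustration only, here is a minimal sketch of a stub backend satisfying the contract (DummyBackend is hypothetical and not part of the package); such a stub can stand in for a real model in tests:

    # A minimal sketch of a backend that satisfies the TranscriptionBackend
    # contract above; the canned result is purely illustrative.
    from media_engine.extractors.transcribe import (
        TranscriptionBackend,
        TranscriptionResult,
        TranscriptionSegment,
    )


    class DummyBackend(TranscriptionBackend):
        """Returns a fixed segment instead of running a Whisper model."""

        def transcribe(
            self,
            audio_path: str,
            model: str = "large-v3",
            language: str | None = None,
            initial_prompt: str | None = None,
        ) -> TranscriptionResult:
            # Real backends load a model here; this stub just echoes a result
            # so callers can be exercised without any model installed.
            return TranscriptionResult(
                language=language or "en",
                language_probability=1.0,
                segments=[TranscriptionSegment(start=0.0, end=1.0, text="hello")],
            )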
+ class WhisperMLX(TranscriptionBackend):
+     """Apple Silicon backend using mlx-whisper."""
+
+     def __init__(self) -> None:
+         self._model: Any = None
+         self._model_name: str | None = None
+
+     def _load_model(self, model: str) -> None:
+         if self._model is None or self._model_name != model:
+             import mlx_whisper  # type: ignore[import-not-found]
+
+             self._model = mlx_whisper
+             self._model_name = model
+             logger.info(f"Loaded mlx-whisper model: {model}")
+
+     def transcribe(
+         self,
+         audio_path: str,
+         model: str = "large-v3",
+         language: str | None = None,
+         initial_prompt: str | None = None,
+     ) -> TranscriptionResult:
+         self._load_model(model)
+
+         result: dict[str, Any] = self._model.transcribe(
+             audio_path,
+             path_or_hf_repo=f"mlx-community/whisper-{model}-mlx",
+             language=language,
+             initial_prompt=initial_prompt,
+             word_timestamps=False,
+             task="transcribe",  # Explicitly prevent translation to English
+         )
+
+         return TranscriptionResult(
+             language=result.get("language", "unknown"),
+             language_probability=result.get("language_probability", 0.0),
+             segments=[TranscriptionSegment(start=s["start"], end=s["end"], text=s["text"].strip()) for s in result.get("segments", [])],
+         )
+
+
+ class WhisperCUDA(TranscriptionBackend):
+     """NVIDIA GPU backend using faster-whisper."""
+
+     def __init__(self) -> None:
+         self._model: Any = None
+         self._model_name: str | None = None
+
+     def _load_model(self, model: str) -> None:
+         if self._model is None or self._model_name != model:
+             from faster_whisper import WhisperModel  # type: ignore[import-not-found]
+
+             self._model = WhisperModel(model, device="cuda", compute_type="float16")
+             self._model_name = model
+             logger.info(f"Loaded faster-whisper model: {model}")
+
+     def transcribe(
+         self,
+         audio_path: str,
+         model: str = "large-v3",
+         language: str | None = None,
+         initial_prompt: str | None = None,
+     ) -> TranscriptionResult:
+         self._load_model(model)
+
+         segments, info = self._model.transcribe(
+             audio_path,
+             language=language,
+             initial_prompt=initial_prompt,
+             word_timestamps=False,
+             task="transcribe",  # Explicitly prevent translation to English
+         )
+
+         return TranscriptionResult(
+             language=info.language,
+             language_probability=info.language_probability,
+             segments=[TranscriptionSegment(start=s.start, end=s.end, text=s.text.strip()) for s in segments],
+         )
+
+
+ class WhisperCPU(TranscriptionBackend):
+     """CPU fallback using openai-whisper."""
+
+     def __init__(self) -> None:
+         self._model: Any = None
+         self._model_name: str | None = None
+
+     def _load_model(self, model: str) -> None:
+         # Use smaller model for CPU
+         actual_model = "medium" if model == "large-v3" else model
+
+         if self._model is None or self._model_name != actual_model:
+             import whisper  # type: ignore[import-not-found]
+
+             self._model = whisper.load_model(actual_model)
+             self._model_name = actual_model
+             logger.info(f"Loaded openai-whisper model: {actual_model}")
+
+     def transcribe(
+         self,
+         audio_path: str,
+         model: str = "large-v3",
+         language: str | None = None,
+         initial_prompt: str | None = None,
+     ) -> TranscriptionResult:
+         self._load_model(model)
+
+         result: dict[str, Any] = self._model.transcribe(
+             audio_path,
+             language=language,
+             initial_prompt=initial_prompt,
+             word_timestamps=False,
+             task="transcribe",  # Explicitly prevent translation to English
+         )
+
+         return TranscriptionResult(
+             language=result.get("language", "unknown"),
+             language_probability=0.0,  # Not provided by openai-whisper
+             segments=[TranscriptionSegment(start=s["start"], end=s["end"], text=s["text"].strip()) for s in result.get("segments", [])],
+         )
+
+
+ # Singleton backend instance
+ _backend: TranscriptionBackend | None = None
+
+
+ def get_transcription_backend() -> TranscriptionBackend:
+     """Get the appropriate transcription backend for the current platform."""
+     global _backend
+
+     if _backend is not None:
+         return _backend
+
+     if is_apple_silicon():
+         try:
+             import mlx_whisper  # type: ignore[import-not-found]  # noqa: F401
+
+             _backend = WhisperMLX()
+             logger.info("Using mlx-whisper backend (Apple Silicon)")
+             return _backend
+         except ImportError:
+             logger.warning("mlx-whisper not available, falling back")
+
+     if has_cuda():
+         try:
+             from faster_whisper import WhisperModel  # type: ignore[import-not-found]  # noqa: F401
+
+             _backend = WhisperCUDA()
+             logger.info("Using faster-whisper backend (CUDA)")
+             return _backend
+         except ImportError:
+             logger.warning("faster-whisper not available, falling back")
+
+     try:
+         import whisper  # type: ignore[import-not-found]  # noqa: F401
+
+         _backend = WhisperCPU()
+         logger.info("Using openai-whisper backend (CPU)")
+         return _backend
+     except ImportError:
+         raise RuntimeError("No Whisper backend available. Install one of: mlx-whisper, faster-whisper, openai-whisper")
+
+
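A usage sketch of the selection chain above: which backend comes back depends on the platform and on which of mlx-whisper, faster-whisper, or openai-whisper is installed. The audio path is hypothetical, and unload_whisper_model (defined just below) releases the model afterwards:

    # Usage sketch only; "speech.wav" is a hypothetical, pre-extracted 16 kHz mono WAV.
    from media_engine.extractors.transcribe import (
        get_transcription_backend,
        unload_whisper_model,
    )

    backend = get_transcription_backend()  # MLX on Apple Silicon, then CUDA, then CPU fallback
    result = backend.transcribe("speech.wav")  # defaults to the "large-v3" checkpoint
    print(result.language, result.language_probability)
    for seg in result.segments:
        print(f"[{seg.start:.1f}-{seg.end:.1f}] {seg.text}")

    unload_whisper_model()  # free GPU/MPS memory once transcription is done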
+ def unload_whisper_model() -> None:
+     """Unload Whisper model from memory to free GPU/MPS memory."""
+     global _backend
+
+     if _backend is not None:
+         logger.info("Unloading Whisper model from memory")
+         # Clear internal model references
+         if hasattr(_backend, "_model"):
+             del _backend._model  # type: ignore[attr-defined]
+             _backend._model = None  # type: ignore[attr-defined]
+         _backend = None
+
+     import gc
+
+     gc.collect()
+
+     # Free GPU memory with sync
+     try:
+         import torch
+
+         if torch.cuda.is_available():
+             torch.cuda.synchronize()
+             torch.cuda.empty_cache()
+         if hasattr(torch, "mps"):
+             if hasattr(torch.mps, "synchronize"):
+                 torch.mps.synchronize()
+             if hasattr(torch.mps, "empty_cache"):
+                 torch.mps.empty_cache()
+     except ImportError:
+         pass
+
+     gc.collect()
+
+
+ # Singleton diarization pipeline
+ _diarization_pipeline: Any = None
+
+
+ def get_diarization_pipeline() -> Any:
+     """Get the pyannote diarization pipeline (lazy loaded)."""
+     global _diarization_pipeline
+
+     if _diarization_pipeline is not None:
+         return _diarization_pipeline
+
+     settings = get_settings()
+     if not settings.hf_token:
+         return None
+
+     try:
+         from pyannote.audio import Pipeline  # type: ignore[import-not-found]
+
+         logger.info(f"Loading diarization model: {settings.diarization_model}")
+         _diarization_pipeline = Pipeline.from_pretrained(
+             settings.diarization_model,
+             use_auth_token=settings.hf_token,
+         )
+
+         # Move to appropriate device
+         if has_cuda():
+             import torch
+
+             _diarization_pipeline.to(torch.device("cuda"))
+             logger.info("Diarization pipeline moved to CUDA")
+
+         logger.info("Diarization pipeline loaded successfully")
+         return _diarization_pipeline
+
+     except ImportError:
+         logger.warning("pyannote-audio not installed, diarization disabled")
+         return None
+     except Exception as e:
+         logger.warning(f"Failed to load diarization pipeline: {e}")
+         return None
+
+
+ def run_diarization(audio_path: str) -> DiarizationResult | None:
+     """Run speaker diarization on audio file.
+
+     Args:
+         audio_path: Path to audio file (WAV format)
+
+     Returns:
+         DiarizationResult with speaker segments, or None if diarization unavailable
+     """
+     pipeline = get_diarization_pipeline()
+     if pipeline is None:
+         return None
+
+     try:
+         logger.info("Running speaker diarization...")
+         diarization = pipeline(audio_path)
+
+         segments: list[SpeakerSegment] = []
+         speakers: set[str] = set()
+
+         for turn, _, speaker in diarization.itertracks(yield_label=True):
+             segments.append(
+                 SpeakerSegment(
+                     start=turn.start,
+                     end=turn.end,
+                     speaker=speaker,
+                 )
+             )
+             speakers.add(speaker)
+
+         logger.info(f"Diarization complete: {len(speakers)} speakers, {len(segments)} segments")
+
+         return DiarizationResult(
+             segments=segments,
+             speaker_count=len(speakers),
+         )
+
+     except Exception as e:
+         logger.warning(f"Diarization failed: {e}")
+         return None
+
+
+ def assign_speakers_to_segments(
+     transcript_segments: list[TranscriptionSegment],
+     diarization: DiarizationResult,
+ ) -> list[tuple[TranscriptionSegment, str | None]]:
+     """Assign speaker labels to transcript segments based on overlap.
+
+     Args:
+         transcript_segments: List of transcript segments with timestamps
+         diarization: Diarization result with speaker segments
+
+     Returns:
+         List of (segment, speaker) tuples
+     """
+     result: list[tuple[TranscriptionSegment, str | None]] = []
+
+     for seg in transcript_segments:
+         # Find the speaker segment with maximum overlap
+         best_speaker: str | None = None
+         best_overlap = 0.0
+
+         for spk_seg in diarization.segments:
+             # Calculate overlap
+             overlap_start = max(seg.start, spk_seg.start)
+             overlap_end = min(seg.end, spk_seg.end)
+             overlap = max(0.0, overlap_end - overlap_start)
+
+             if overlap > best_overlap:
+                 best_overlap = overlap
+                 best_speaker = spk_seg.speaker
+
+         result.append((seg, best_speaker))
+
+     return result
+
+
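To make the max-overlap rule concrete, a small worked example with made-up timestamps (not taken from the package's tests): each transcript segment gets the speaker whose turn covers the most of it.

    # Toy illustration of assign_speakers_to_segments; all values are invented.
    from media_engine.extractors.transcribe import (
        DiarizationResult,
        SpeakerSegment,
        TranscriptionSegment,
        assign_speakers_to_segments,
    )

    transcript = [
        TranscriptionSegment(start=0.0, end=4.0, text="Hi, welcome back."),
        TranscriptionSegment(start=4.0, end=7.0, text="Thanks for having me."),
    ]
    diarization = DiarizationResult(
        segments=[
            SpeakerSegment(start=0.0, end=3.5, speaker="SPEAKER_00"),
            SpeakerSegment(start=3.5, end=7.0, speaker="SPEAKER_01"),
        ],
        speaker_count=2,
    )

    for seg, speaker in assign_speakers_to_segments(transcript, diarization):
        # First segment overlaps SPEAKER_00 for 3.5 s vs 0.5 s; the second
        # overlaps SPEAKER_01 for 3.0 s, so each gets that label.
        print(speaker, seg.text)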
+ def extract_audio(video_path: str, output_path: str | None = None) -> str:
+     """Extract audio from video file as 16kHz mono WAV.
+
+     Args:
+         video_path: Path to video file
+         output_path: Output path for audio file (optional)
+
+     Returns:
+         Path to extracted audio file
+     """
+     if output_path is None:
+         fd, output_path = tempfile.mkstemp(suffix=".wav")
+         os.close(fd)
+
+     cmd = [
+         "ffmpeg",
+         "-y",
+         "-i",
+         video_path,
+         "-vn",
+         "-ar",
+         "16000",
+         "-ac",
+         "1",
+         "-c:a",
+         "pcm_s16le",
+         output_path,
+     ]
+
+     try:
+         subprocess.run(cmd, capture_output=True, text=True, check=True)
+         return output_path
+     except subprocess.CalledProcessError as e:
+         logger.error(f"Failed to extract audio: {e.stderr}")
+         raise RuntimeError(f"Failed to extract audio: {e.stderr}")
+
+
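The 16 kHz mono PCM settings above match the sample rate Whisper models operate on. A short usage sketch with a hypothetical input; the caller is responsible for deleting the temp file:

    # Usage sketch; "clip.mp4" is a hypothetical input video.
    import os

    from media_engine.extractors.transcribe import extract_audio

    wav_path = extract_audio("clip.mp4")  # writes a temp .wav unless output_path is given
    try:
        print(f"Extracted audio at {wav_path}")
    finally:
        os.remove(wav_path)  # clean up the temp file when done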
+ def extract_transcript(
+     file_path: str,
+     model: str = "auto",
+     language: str | None = None,
+     fallback_language: str = "en",
+     language_hints: list[str] | None = None,
+     context_hint: str | None = None,
+     progress_callback: ProgressCallback | None = None,
+ ) -> Transcript:
+     """Extract transcript from video file.
+
+     Args:
+         file_path: Path to video file
+         model: Whisper model name ("auto" = select based on VRAM)
+         language: Force language (skip detection)
+         fallback_language: Fallback for short clips with low confidence
+         language_hints: Language hints (not directly used by Whisper, but logged)
+         context_hint: Context hint used as initial_prompt
+         progress_callback: Optional callback for progress updates
+
+     Returns:
+         Transcript object with segments and metadata
+     """
+     # Resolve "auto" model based on available VRAM
+     if model == "auto":
+         from media_engine.config import get_auto_whisper_model
+
+         model = get_auto_whisper_model()
+         logger.info(f"Auto-selected Whisper model: {model}")
+
+     path = Path(file_path)
+     if not path.exists():
+         raise FileNotFoundError(f"Video file not found: {file_path}")
+
+     # Extract audio
+     if progress_callback:
+         progress_callback("Extracting audio...", None, None)
+     logger.info(f"Extracting audio from {file_path}")
+     audio_path = extract_audio(file_path)
+
+     try:
+         # Get audio duration
+         audio_duration = _get_audio_duration(audio_path)
+
+         # Get backend
+         backend = get_transcription_backend()
+
+         # Determine language
+         use_language = language
+         fallback_applied = False
+
+         if use_language is None:
+             # First pass: detect language
+             if progress_callback:
+                 progress_callback("Detecting language...", None, None)
+             logger.info("Detecting language...")
+             detect_result = backend.transcribe(audio_path, model=model, language=None, initial_prompt=None)
+
+             detected_lang = detect_result.language
+             confidence = detect_result.language_probability
+
+             logger.info(f"Detected language: {detected_lang} (confidence: {confidence:.2f})")
+
+             # Apply fallback for short clips with low confidence
+             if confidence < 0.7 and audio_duration < 15:
+                 use_language = fallback_language
+                 fallback_applied = True
+                 logger.info(f"Low confidence on short clip, using fallback: {use_language}")
+             else:
+                 use_language = detected_lang
+
+         # Main transcription
+         if progress_callback:
+             progress_callback("Transcribing audio...", None, None)
+         logger.info(f"Transcribing with language={use_language}, model={model}")
+         result = backend.transcribe(
+             audio_path,
+             model=model,
+             language=use_language,
+             initial_prompt=context_hint,
+         )
+
+         # Run diarization if available
+         if progress_callback:
+             progress_callback("Speaker diarization...", None, None)
+         diarization = run_diarization(audio_path)
+         speaker_count: int | None = None
+
+         if diarization is not None:
+             # Assign speakers to segments
+             segments_with_speakers = assign_speakers_to_segments(result.segments, diarization)
+             segments = [TranscriptSegment(start=s.start, end=s.end, text=s.text, speaker=speaker) for s, speaker in segments_with_speakers]
+             speaker_count = diarization.speaker_count
+             logger.info(f"Diarization complete: {speaker_count} speakers detected")
+         else:
+             segments = [TranscriptSegment(start=s.start, end=s.end, text=s.text) for s in result.segments]
+
+         return Transcript(
+             language=result.language,
+             confidence=result.language_probability,
+             duration=audio_duration,
+             speaker_count=speaker_count,
+             hints_used=TranscriptHints(
+                 language_hints=language_hints or [],
+                 context_hint=context_hint,
+                 fallback_applied=fallback_applied,
+             ),
+             segments=segments,
+         )
+
+     finally:
+         # Clean up temp audio file
+         if os.path.exists(audio_path):
+             os.remove(audio_path)
+
+
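An end-to-end usage sketch of the pipeline above (auto model selection, language detection with short-clip fallback, optional diarization). The file path is hypothetical, and speaker labels only appear when pyannote-audio and an HF token are configured:

    # Usage sketch; "interview.mov" is a hypothetical input video.
    from media_engine.extractors.transcribe import extract_transcript


    def report(message: str, current: int | None, total: int | None) -> None:
        # Matches the ProgressCallback signature: (message, current, total).
        print(message)


    transcript = extract_transcript(
        "interview.mov",
        model="auto",            # resolved to a concrete Whisper model based on VRAM
        language=None,           # auto-detect; falls back to "en" on short, low-confidence clips
        fallback_language="en",
        context_hint="Tech interview about video pipelines.",
        progress_callback=report,
    )
    print(transcript.language, transcript.confidence, transcript.speaker_count)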
+ def _get_audio_duration(audio_path: str) -> float:
+     """Get duration of audio file in seconds."""
+     cmd = [
+         "ffprobe",
+         "-v",
+         "quiet",
+         "-show_entries",
+         "format=duration",
+         "-of",
+         "default=noprint_wrappers=1:nokey=1",
+         audio_path,
+     ]
+
+     try:
+         result = subprocess.run(cmd, capture_output=True, text=True, check=True)
+         return float(result.stdout.strip())
+     except (subprocess.CalledProcessError, ValueError):
+         return 0.0
media_engine/extractors/translate.py
@@ -0,0 +1,121 @@
+ """Query translation for CLIP search.
+
+ Translates non-English search queries to English for better CLIP performance,
+ since CLIP models are primarily trained on English text.
+ """
+
+ import logging
+ from typing import Optional
+
+ import httpx
+ from langdetect import LangDetectException, detect
+
+ logger = logging.getLogger(__name__)
+
+ # Languages supported for translation (language codes as returned by langdetect)
+ SUPPORTED_LANGUAGES = {"de", "fr", "es", "it", "pt", "nl", "sv", "da", "no", "fi", "pl", "ru", "ja", "zh-cn", "zh-tw", "ko"}
+
+
+ def detect_language(text: str) -> Optional[str]:
+     """Detect the language of the given text.
+
+     Args:
+         text: Text to detect language of
+
+     Returns:
+         ISO 639-1 language code (e.g., 'en', 'de', 'fr') or None if detection fails
+     """
+     if not text or len(text.strip()) < 3:
+         return None
+
+     try:
+         lang = detect(text)
+         return lang
+     except LangDetectException as e:
+         logger.debug(f"Language detection failed: {e}")
+         return None
+
+
+ def translate_to_english(text: str, source_lang: Optional[str] = None, ollama_base_url: str = "http://localhost:11434") -> str:
+     """Translate text to English using Ollama.
+
+     Args:
+         text: Text to translate
+         source_lang: Source language code (auto-detected if not provided)
+         ollama_base_url: Base URL for Ollama API
+
+     Returns:
+         Translated text in English, or original text if translation fails
+     """
+     if not text:
+         return text
+
+     # Detect language if not provided
+     if source_lang is None:
+         source_lang = detect_language(text)
+
+     # If already English or detection failed, return original
+     if source_lang is None or source_lang == "en":
+         return text
+
+     # Try translation with Ollama
+     try:
+         prompt = f"""Translate the following text to English. Only output the translation, nothing else.
+
+ Text: {text}
+
+ English translation:"""
+
+         response = httpx.post(
+             f"{ollama_base_url}/api/generate",
+             json={
+                 "model": "qwen2.5:3b",  # Small, fast model for translation
+                 "prompt": prompt,
+                 "stream": False,
+                 "options": {
+                     "temperature": 0.1,  # Low temperature for deterministic translation
+                     "num_predict": 100,  # Short output expected
+                 },
+             },
+             timeout=30.0,
+         )
+
+         if response.status_code == 200:
+             result = response.json()
+             translated = result.get("response", "").strip()
+             if translated:
+                 logger.info(f"Translated query from {source_lang}: '{text}' -> '{translated}'")
+                 return translated
+     except httpx.TimeoutException:
+         logger.warning("Translation timed out, using original text")
+     except httpx.ConnectError:
+         logger.debug("Ollama not available for translation")
+     except Exception as e:
+         logger.warning(f"Translation failed: {e}")
+
+     return text
+
+
+ def translate_query_for_clip(text: str, enable_translation: bool = True, ollama_base_url: str = "http://localhost:11434") -> tuple[str, Optional[str], bool]:
+     """Translate a CLIP search query to English if needed.
+
+     Args:
+         text: Search query text
+         enable_translation: Whether to enable translation
+         ollama_base_url: Base URL for Ollama API
+
+     Returns:
+         Tuple of (processed_text, detected_language, was_translated)
+     """
+     if not enable_translation:
+         return text, None, False
+
+     source_lang = detect_language(text)
+
+     if source_lang is None or source_lang == "en":
+         return text, source_lang, False
+
+     translated = translate_to_english(text, source_lang, ollama_base_url)
+     was_translated = translated != text
+
+     return translated, source_lang, was_translated
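A usage sketch of the helper above: it returns the query unchanged for English input or when translation is disabled, and only rewrites it when an Ollama server is reachable at the given URL (the German query below is just an example):

    # Usage sketch; requires a local Ollama server for actual translation.
    from media_engine.extractors.translate import translate_query_for_clip

    query, detected, translated = translate_query_for_clip("Sonnenuntergang am Strand")
    if translated:
        print(f"Detected {detected}, searching CLIP with: {query}")
    else:
        print(f"Using original query: {query}")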