videopython 0.28.0__tar.gz → 0.28.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. {videopython-0.28.0 → videopython-0.28.1}/PKG-INFO +2 -1
  2. {videopython-0.28.0 → videopython-0.28.1}/pyproject.toml +6 -1
  3. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/dubbing/__init__.py +2 -0
  4. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/dubbing/dubber.py +12 -2
  5. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/dubbing/models.py +6 -0
  6. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/dubbing/pipeline.py +76 -6
  7. videopython-0.28.1/src/videopython/ai/generation/qwen3.py +394 -0
  8. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/generation/translation.py +109 -5
  9. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/understanding/audio.py +40 -1
  10. {videopython-0.28.0 → videopython-0.28.1}/.gitignore +0 -0
  11. {videopython-0.28.0 → videopython-0.28.1}/LICENSE +0 -0
  12. {videopython-0.28.0 → videopython-0.28.1}/README.md +0 -0
  13. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/__init__.py +0 -0
  14. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/__init__.py +0 -0
  15. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/_device.py +0 -0
  16. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/dubbing/quality.py +0 -0
  17. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/dubbing/remux.py +0 -0
  18. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/dubbing/timing.py +0 -0
  19. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/generation/__init__.py +0 -0
  20. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/generation/audio.py +0 -0
  21. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/generation/image.py +0 -0
  22. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/generation/video.py +0 -0
  23. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/registry.py +0 -0
  24. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/swapping/__init__.py +0 -0
  25. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/swapping/inpainter.py +0 -0
  26. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/swapping/models.py +0 -0
  27. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/swapping/segmenter.py +0 -0
  28. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/swapping/swapper.py +0 -0
  29. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/transforms.py +0 -0
  30. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/understanding/__init__.py +0 -0
  31. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/understanding/image.py +0 -0
  32. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/understanding/separation.py +0 -0
  33. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/understanding/temporal.py +0 -0
  34. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/video_analysis.py +0 -0
  35. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/base/__init__.py +0 -0
  36. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/base/audio/__init__.py +0 -0
  37. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/base/audio/analysis.py +0 -0
  38. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/base/audio/audio.py +0 -0
  39. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/base/combine.py +0 -0
  40. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/base/description.py +0 -0
  41. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/base/effects.py +0 -0
  42. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/base/exceptions.py +0 -0
  43. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/base/progress.py +0 -0
  44. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/base/registry.py +0 -0
  45. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/base/scene.py +0 -0
  46. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/base/streaming.py +0 -0
  47. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/base/text/__init__.py +0 -0
  48. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/base/text/overlay.py +0 -0
  49. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/base/text/transcription.py +0 -0
  50. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/base/transforms.py +0 -0
  51. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/base/transitions.py +0 -0
  52. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/base/utils.py +0 -0
  53. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/base/video.py +0 -0
  54. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/editing/__init__.py +0 -0
  55. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/editing/multicam.py +0 -0
  56. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/editing/premiere_xml.py +0 -0
  57. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/editing/video_edit.py +0 -0
  58. {videopython-0.28.0 → videopython-0.28.1}/src/videopython/py.typed +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: videopython
3
- Version: 0.28.0
3
+ Version: 0.28.1
4
4
  Summary: Minimal video generation and processing library.
5
5
  Project-URL: Homepage, https://videopython.com
6
6
  Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
@@ -29,6 +29,7 @@ Requires-Dist: demucs>=4.0.0; extra == 'ai'
29
29
  Requires-Dist: diffusers>=0.30.0; extra == 'ai'
30
30
  Requires-Dist: easyocr>=1.7.0; extra == 'ai'
31
31
  Requires-Dist: hf-transfer>=0.1.9; extra == 'ai'
32
+ Requires-Dist: llama-cpp-python>=0.3.0; extra == 'ai'
32
33
  Requires-Dist: numba>=0.61.0; extra == 'ai'
33
34
  Requires-Dist: ollama>=0.4.5; extra == 'ai'
34
35
  Requires-Dist: openai-whisper>=20240930; extra == 'ai'
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "videopython"
3
- version = "0.28.0"
3
+ version = "0.28.1"
4
4
  description = "Minimal video generation and processing library."
5
5
  authors = [
6
6
  { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
@@ -80,6 +80,8 @@ ai = [
80
80
  "sentencepiece>=0.1.99",
81
81
  # Audio source separation
82
82
  "demucs>=4.0.0",
83
+ # Translation backend: Qwen3 GGUF inference (M2)
84
+ "llama-cpp-python>=0.3.0",
83
85
  ]
84
86
 
85
87
  # Required for pip install videopython[ai] - pip uses optional-dependencies, not dependency-groups
@@ -111,6 +113,8 @@ ai = [
111
113
  "sentencepiece>=0.1.99",
112
114
  # Audio source separation
113
115
  "demucs>=4.0.0",
116
+ # Translation backend: Qwen3 GGUF inference (M2)
117
+ "llama-cpp-python>=0.3.0",
114
118
  ]
115
119
 
116
120
  [project.urls]
@@ -136,6 +140,7 @@ module = [
136
140
  "pyannote", "pyannote.*",
137
141
  "silero_vad", "silero_vad.*",
138
142
  "cv2", "cv2.*",
143
+ "llama_cpp", "llama_cpp.*",
139
144
  ]
140
145
  ignore_missing_imports = true
141
146
 
@@ -5,6 +5,7 @@ from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, Separate
5
5
  from videopython.ai.dubbing.pipeline import LocalDubbingPipeline
6
6
  from videopython.ai.dubbing.quality import GarbageTranscriptError, TranscriptQuality, assess_transcript
7
7
  from videopython.ai.dubbing.timing import TimingSynchronizer
8
+ from videopython.ai.generation.translation import UnsupportedLanguageError
8
9
 
9
10
  __all__ = [
10
11
  "VideoDubber",
@@ -17,4 +18,5 @@ __all__ = [
17
18
  "GarbageTranscriptError",
18
19
  "TranscriptQuality",
19
20
  "assess_transcript",
21
+ "UnsupportedLanguageError",
20
22
  ]
@@ -7,7 +7,7 @@ from pathlib import Path
7
7
  from typing import TYPE_CHECKING, Any, Callable
8
8
 
9
9
  from videopython.ai.dubbing.models import DubbingResult, RevoiceResult
10
- from videopython.ai.dubbing.pipeline import WhisperModel
10
+ from videopython.ai.dubbing.pipeline import TranslatorChoice, WhisperModel
11
11
 
12
12
  if TYPE_CHECKING:
13
13
  from videopython.base.video import Video
@@ -44,6 +44,12 @@ class VideoDubber:
44
44
  but processing continues. Either way the
45
45
  :class:`TranscriptQuality` is exposed on ``DubbingResult`` for
46
46
  inspection.
47
+ translator: Translation backend to use. ``"auto"`` (default)
48
+ picks Qwen3 on GPU, MarianMT on CPU; ``"marian"`` and
49
+ ``"qwen3"`` force the named backend regardless of device.
50
+ See :class:`videopython.ai.generation.qwen3.Qwen3Translator`
51
+ for tradeoffs (Qwen3 is slower on CPU but produces
52
+ context-aware, length-budgeted output).
47
53
  """
48
54
 
49
55
  def __init__(
@@ -55,6 +61,7 @@ class VideoDubber:
55
61
  no_speech_threshold: float = 0.6,
56
62
  logprob_threshold: float | None = -1.0,
57
63
  strict_quality: bool = False,
64
+ translator: TranslatorChoice = "auto",
58
65
  ):
59
66
  self.device = device
60
67
  self.low_memory = low_memory
@@ -63,13 +70,15 @@ class VideoDubber:
63
70
  self.no_speech_threshold = no_speech_threshold
64
71
  self.logprob_threshold = logprob_threshold
65
72
  self.strict_quality = strict_quality
73
+ self.translator = translator
66
74
  self._local_pipeline: Any = None
67
75
  requested = device.lower() if isinstance(device, str) else "auto"
68
76
  logger.info(
69
- "VideoDubber initialized with device=%s low_memory=%s whisper_model=%s",
77
+ "VideoDubber initialized with device=%s low_memory=%s whisper_model=%s translator=%s",
70
78
  requested,
71
79
  low_memory,
72
80
  whisper_model,
81
+ translator,
73
82
  )
74
83
 
75
84
  def _init_local_pipeline(self) -> None:
@@ -83,6 +92,7 @@ class VideoDubber:
83
92
  no_speech_threshold=self.no_speech_threshold,
84
93
  logprob_threshold=self.logprob_threshold,
85
94
  strict_quality=self.strict_quality,
95
+ translator=self.translator,
86
96
  )
87
97
 
88
98
  def dub(
@@ -180,6 +180,11 @@ class DubbingResult:
180
180
  timing_summary: Aggregate stats over per-segment timing adjustments.
181
181
  transcript_quality: Heuristic quality assessment of the transcription
182
182
  (None when the pipeline returned early on an empty transcription).
183
+ translation_failures: Indices of segments where translation failed
184
+ entirely. Used by Qwen3Translator when both the primary call and
185
+ the per-segment Marian fallback fail; those segments are dubbed
186
+ with empty text. Empty list under MarianTranslator (Marian has
187
+ no failure mode that drops segments).
183
188
  """
184
189
 
185
190
  dubbed_audio: Audio
@@ -191,6 +196,7 @@ class DubbingResult:
191
196
  voice_samples: dict[str, Audio] = field(default_factory=dict)
192
197
  timing_summary: TimingSummary | None = None
193
198
  transcript_quality: TranscriptQuality | None = None
199
+ translation_failures: list[int] = field(default_factory=list)
194
200
 
195
201
  @property
196
202
  def num_segments(self) -> int:
@@ -9,14 +9,24 @@ from typing import TYPE_CHECKING, Any, Callable, Literal
9
9
 
10
10
  import numpy as np
11
11
 
12
+ from videopython.ai._device import select_device
12
13
  from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, SeparatedAudio, TimingSummary
13
14
  from videopython.ai.dubbing.quality import GarbageTranscriptError, assess_transcript
14
15
  from videopython.ai.dubbing.timing import TimingSynchronizer
16
+ from videopython.ai.generation.qwen3 import Qwen3Translator
17
+ from videopython.ai.generation.translation import (
18
+ MarianTranslator,
19
+ TranslationBackend,
20
+ UnsupportedLanguageError,
21
+ )
15
22
 
16
23
  if TYPE_CHECKING:
17
24
  from videopython.base.audio import Audio
18
25
 
19
26
 
27
+ TranslatorChoice = Literal["auto", "marian", "qwen3"]
28
+
29
+
20
30
  def _peak_match(target: Audio, reference: Audio) -> Audio:
21
31
  """Scale ``target`` so its peak amplitude matches ``reference``.
22
32
 
@@ -74,6 +84,7 @@ class LocalDubbingPipeline:
74
84
  no_speech_threshold: float = 0.6,
75
85
  logprob_threshold: float | None = -1.0,
76
86
  strict_quality: bool = False,
87
+ translator: TranslatorChoice = "auto",
77
88
  ):
78
89
  self.device = device
79
90
  self.low_memory = low_memory
@@ -82,12 +93,14 @@ class LocalDubbingPipeline:
82
93
  self.no_speech_threshold = no_speech_threshold
83
94
  self.logprob_threshold = logprob_threshold
84
95
  self.strict_quality = strict_quality
96
+ self.translator = translator
85
97
  requested = device.lower() if isinstance(device, str) else "auto"
86
98
  logger.info(
87
- "LocalDubbingPipeline initialized with device=%s low_memory=%s whisper_model=%s",
99
+ "LocalDubbingPipeline initialized with device=%s low_memory=%s whisper_model=%s translator=%s",
88
100
  requested,
89
101
  low_memory,
90
102
  whisper_model,
103
+ translator,
91
104
  )
92
105
 
93
106
  self._transcriber: Any = None
@@ -128,11 +141,64 @@ class LocalDubbingPipeline:
128
141
  logprob_threshold=self.logprob_threshold,
129
142
  )
130
143
 
131
- def _init_translator(self) -> None:
132
- """Initialize the translation model."""
133
- from videopython.ai.generation.translation import TextTranslator
144
+ def _init_translator(self, source_lang: str, target_lang: str) -> None:
145
+ """Initialize the translation backend.
146
+
147
+ Resolves the configured ``self.translator`` choice into a concrete
148
+ backend. ``"auto"`` uses :meth:`_resolve_translator_auto`; explicit
149
+ choices instantiate the named backend directly. Every call builds
150
+ a fresh backend; call sites skip the call when ``self._translator``
151
+ is already set (the existing ``self._translator is None`` gate),
152
+ which does not compare the language pair.
153
+ """
154
+ if self.translator == "marian":
155
+ self._translator = MarianTranslator(device=self.device)
156
+ elif self.translator == "qwen3":
157
+ self._translator = Qwen3Translator(device=self.device)
158
+ else: # "auto"
159
+ self._translator = self._resolve_translator_auto(source_lang, target_lang)
160
+
161
+ def _resolve_translator_auto(self, source_lang: str, target_lang: str) -> TranslationBackend:
162
+ """Pick a backend based on language coverage AND device.
163
+
164
+ Qwen3-4B Q4_K_M on CPU is roughly 10-15x slower than MarianMT (M2.1
165
+ spike on dreams_15min.mp4). The resolver picks Marian on CPU
166
+ whenever it covers the language pair and only escalates to Qwen
167
+ when a GPU is available or Marian doesn't cover the pair.
168
+ """
169
+ device = select_device(self.device, mps_allowed=True)
170
+ has_gpu = device in ("cuda", "mps")
171
+
172
+ # 1. GPU + Qwen covers the pair → Qwen wins (best quality).
173
+ if has_gpu and Qwen3Translator.supports(source_lang, target_lang):
174
+ logger.info(
175
+ "translator: auto-selected qwen3 (device=%s, supports %s->%s)",
176
+ device,
177
+ source_lang,
178
+ target_lang,
179
+ )
180
+ return Qwen3Translator(device=self.device)
181
+
182
+ # 2. Marian covers the pair → Marian (fast).
183
+ if MarianTranslator.has_model_for(source_lang, target_lang):
184
+ if has_gpu:
185
+ reason = f"Qwen does not cover {source_lang}->{target_lang}"
186
+ else:
187
+ reason = f"device={device} (Qwen would be ~10-15x slower; pass translator='qwen3' to override)"
188
+ logger.info("translator: auto-selected marian (%s)", reason)
189
+ return MarianTranslator(device=self.device)
190
+
191
+ # 3. CPU + only Qwen covers it: warn loudly and use Qwen anyway.
192
+ if Qwen3Translator.supports(source_lang, target_lang):
193
+ logger.warning(
194
+ "translator: auto-selected qwen3 on CPU (%s->%s not in Marian); "
195
+ "translation will be slow (~10-15x MarianMT). Consider GPU.",
196
+ source_lang,
197
+ target_lang,
198
+ )
199
+ return Qwen3Translator(device=self.device)
134
200
 
135
- self._translator = TextTranslator(device=self.device)
201
+ raise UnsupportedLanguageError(source_lang, target_lang)
136
202
 
137
203
  def _init_tts(self, language: str = "en") -> None:
138
204
  """Initialize the text-to-speech model."""
@@ -430,7 +496,7 @@ class LocalDubbingPipeline:
430
496
 
431
497
  report_progress("Translating text", 0.35)
432
498
  if self._translator is None:
433
- self._init_translator()
499
+ self._init_translator(source_lang=detected_lang, target_lang=target_lang)
434
500
 
435
501
  # Translation stage spans 0.35 → 0.50 of overall pipeline progress.
436
502
  # MarianMT runs sequentially over 8-segment batches; on a 15-min
@@ -446,6 +512,9 @@ class LocalDubbingPipeline:
446
512
  source_lang=detected_lang,
447
513
  progress_callback=_on_translation_progress,
448
514
  )
515
+ # Capture per-segment failures (always empty for Marian) before
516
+ # _maybe_unload nukes the backend in low_memory mode.
517
+ translation_failures = list(self._translator.translation_failures)
449
518
  self._maybe_unload("_translator")
450
519
 
451
520
  report_progress("Generating dubbed speech", 0.50)
@@ -559,6 +628,7 @@ class LocalDubbingPipeline:
559
628
  voice_samples=voice_samples,
560
629
  timing_summary=timing_summary,
561
630
  transcript_quality=transcript_quality,
631
+ translation_failures=translation_failures,
562
632
  )
563
633
 
564
634
  def revoice(
@@ -0,0 +1,394 @@
1
+ """Qwen3-Instruct translation backend (M2).
2
+
3
+ GGUF inference via ``llama-cpp-python``. One model for now —
4
+ ``Qwen3-4B-Instruct-2507`` (Apache-2.0, ~2.4 GB Q4_K_M). The original M2
5
+ plan called for low/medium/high tiers (4B / 8B / 30B-A3B); we deferred
6
+ that complexity until M2.4 eval data shows the larger models actually
7
+ deliver a quality lift worth the VRAM cost.
8
+
9
+ Latency note: on CPU the 4B model is roughly 10-15× slower than
10
+ :class:`MarianTranslator` per the M2.1 spike. On GPU it lands within ~2×
11
+ of Marian. Translation quality is decisively higher than Marian on
12
+ context-dependent and idiomatic content. The pipeline's
13
+ :class:`LocalDubbingPipeline` chooses based on ``device`` + the
14
+ ``translator`` kwarg.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import json
20
+ import logging
21
+ from pathlib import Path
22
+ from typing import TYPE_CHECKING, Any, Callable
23
+
24
+ from videopython.ai._device import release_device_memory, select_device
25
+ from videopython.ai.generation.translation import (
26
+ LANGUAGE_NAMES,
27
+ MarianTranslator,
28
+ _is_translatable_text,
29
+ )
30
+ from videopython.base.text.transcription import TranscriptionSegment
31
+
32
+ # Imported under TYPE_CHECKING only — qwen3 sits below videopython.ai.dubbing
33
+ # in the import order (pipeline.py imports Qwen3Translator), so a top-level
34
+ # import would create a cycle. The runtime constructor reaches for it via a
35
+ # lazy local import inside translate_segments.
36
+ if TYPE_CHECKING:
37
+ from videopython.ai.dubbing.models import TranslatedSegment
38
+
39
+ logger = logging.getLogger(__name__)
40
+
41
+
42
+ # Default model. Constants are module-level so an eval harness or future
43
+ # tier pick can override at the call site without forking the class.
44
+ DEFAULT_REPO_ID = "unsloth/Qwen3-4B-Instruct-2507-GGUF"
45
+ DEFAULT_FILENAME = "Qwen3-4B-Instruct-2507-Q4_K_M.gguf"
46
+
47
+
48
+ # Average characters per second of natural speech, used to derive the
49
+ # per-segment ``target_chars`` budget. Rough field measurements; the prompt
50
+ # tells Qwen this is a target ±15%, not a hard cap.
51
+ _SPEECH_CHARS_PER_SEC: dict[str, float] = {
52
+ "en": 14.0,
53
+ "es": 14.0,
54
+ "pt": 13.5,
55
+ "it": 13.5,
56
+ "fr": 13.0,
57
+ "de": 12.0,
58
+ "pl": 12.5,
59
+ "nl": 12.5,
60
+ "ru": 12.0,
61
+ "uk": 12.0,
62
+ "cs": 12.0,
63
+ "sk": 12.0,
64
+ "ro": 13.0,
65
+ "hu": 12.0,
66
+ "fi": 11.0,
67
+ "sv": 12.5,
68
+ "da": 13.0,
69
+ "nb": 13.0,
70
+ "no": 13.0,
71
+ "ja": 8.0,
72
+ "ko": 9.0,
73
+ "zh": 7.0,
74
+ "zh-CN": 7.0,
75
+ "zh-TW": 7.0,
76
+ "th": 9.0,
77
+ "vi": 11.0,
78
+ "ar": 10.0,
79
+ "he": 10.0,
80
+ "hi": 11.0,
81
+ "ta": 10.0,
82
+ "id": 12.0,
83
+ "ms": 12.0,
84
+ "tr": 12.0,
85
+ "el": 12.0,
86
+ }
87
+ _SPEECH_CHARS_DEFAULT = 12.0
88
+
89
+
90
+ # A transcription segment's avg_logprob is in [-inf, 0]. Values below this threshold mark a
91
+ # transcription window we don't trust — Qwen gets a hint not to over-anchor.
92
+ _LOW_LOGPROB_HINT_THRESHOLD = -1.0
93
+
94
+
95
+ def _target_chars_for(duration_seconds: float, target_lang: str) -> int:
96
+ """Character-count budget for a segment of ``duration_seconds`` in ``target_lang``."""
97
+ rate = _SPEECH_CHARS_PER_SEC.get(target_lang, _SPEECH_CHARS_DEFAULT)
98
+ return max(1, int(duration_seconds * rate * 1.15))
99
+
100
+
101
+ def _build_system_prompt(source_lang: str, target_lang: str) -> str:
102
+ """Stable system + format spec. The few-shot example uses generic
103
+ phrases (no fixture-specific content) so the prompt generalizes.
104
+ """
105
+ src_name = LANGUAGE_NAMES.get(source_lang, source_lang)
106
+ tgt_name = LANGUAGE_NAMES.get(target_lang, target_lang)
107
+ return (
108
+ f"You are a professional dub translator. Translate from {src_name} to {tgt_name}.\n"
109
+ "Preserve register and proper nouns. Match each segment's syllable count so the\n"
110
+ "dub fits the original timing — translation is for spoken audio, not subtitles.\n"
111
+ "Aim for ``target_chars`` characters per segment (±15%).\n"
112
+ "If a segment is non-speech filler (grunts, laughter, music cues) keep it as filler in\n"
113
+ "the target language; do not invent content.\n"
114
+ "If a segment carries ``low_confidence``, the source transcription may be wrong;\n"
115
+ "translate conservatively rather than committing to a specific phrase.\n"
116
+ "\n"
117
+ "Output one JSON object per line, no preamble, no commentary, no markdown:\n"
118
+ '{"i": <segment_index>, "translated": "<text>"}\n'
119
+ )
120
+
121
+
122
+ def _build_user_prompt(segments: list[TranscriptionSegment], target_lang: str) -> str:
123
+ """Per-call body — the segments to translate."""
124
+ lines: list[str] = []
125
+ for idx, seg in enumerate(segments):
126
+ budget = _target_chars_for(seg.end - seg.start, target_lang)
127
+ entry: dict[str, Any] = {
128
+ "i": idx,
129
+ "text": seg.text,
130
+ "target_chars": budget,
131
+ }
132
+ if seg.avg_logprob is not None and seg.avg_logprob < _LOW_LOGPROB_HINT_THRESHOLD:
133
+ entry["low_confidence"] = True
134
+ lines.append(json.dumps(entry, ensure_ascii=False))
135
+ request_block = "\n".join(lines)
136
+ return (
137
+ f"Input segments:\n{request_block}\nTranslations (one JSON object per line, exactly {len(segments)} lines):\n"
138
+ )
139
+
140
+
141
+ def _parse_jsonl_response(raw: str) -> dict[int, str]:
142
+ """Extract ``{i: translated_text}`` from Qwen output. Permissive — tolerates
143
+ markdown fences and preamble lines that the model occasionally adds."""
144
+ parsed: dict[int, str] = {}
145
+ for line in raw.splitlines():
146
+ line = line.strip()
147
+ if not line or line.startswith("```"):
148
+ continue
149
+ try:
150
+ obj = json.loads(line)
151
+ except json.JSONDecodeError:
152
+ continue
153
+ if isinstance(obj, dict) and "i" in obj and "translated" in obj:
154
+ try:
155
+ parsed[int(obj["i"])] = str(obj["translated"])
156
+ except (TypeError, ValueError):
157
+ continue
158
+ return parsed
159
+
160
+
161
+ class Qwen3Translator:
162
+ """Qwen3-Instruct translation via llama-cpp-python (GGUF).
163
+
164
+ Args:
165
+ device: ``"cuda"``, ``"mps"``, ``"cpu"``, or ``None`` for auto.
166
+ marian_fallback: If True (default), fall back to Marian for any
167
+ segment that fails Qwen's parse retry. Set False to disable
168
+ (failures land in ``translation_failures`` instead).
169
+ repo_id: HuggingFace repo for the GGUF weights. Defaults to
170
+ ``DEFAULT_REPO_ID``; override for eval harnesses.
171
+ filename: GGUF filename within ``repo_id``. Defaults to
172
+ ``DEFAULT_FILENAME``.
173
+ n_ctx: llama.cpp context window. 8192 is plenty for a 15-min source;
174
+ raise for very long sources. Hard cap is the model's training
175
+ context (262K for Qwen3-4B-Instruct-2507).
176
+ max_tokens: Generation cap per call. 4× the input character count
177
+ is a safe upper bound for translation output.
178
+ temperature: Decoding temperature. 0.1 keeps output structurally
179
+ consistent (high JSON parse rate) without being deterministic.
180
+ """
181
+
182
+ def __init__(
183
+ self,
184
+ device: str | None = None,
185
+ marian_fallback: bool = True,
186
+ repo_id: str = DEFAULT_REPO_ID,
187
+ filename: str = DEFAULT_FILENAME,
188
+ n_ctx: int = 8192,
189
+ max_tokens: int = 4096,
190
+ temperature: float = 0.1,
191
+ ):
192
+ self.device = device
193
+ self.marian_fallback = marian_fallback
194
+ self.repo_id = repo_id
195
+ self.filename = filename
196
+ self.n_ctx = n_ctx
197
+ self.max_tokens = max_tokens
198
+ self.temperature = temperature
199
+
200
+ # Lazily initialized.
201
+ self._llm: Any = None
202
+ self._marian: MarianTranslator | None = None
203
+ # Tracks which segment indices both Qwen and Marian failed on. The
204
+ # pipeline reads this to populate DubbingResult.translation_failures.
205
+ self._failures_last_call: list[int] = []
206
+
207
+ def _init_local(self) -> None:
208
+ """Download (if needed) and load the GGUF weights."""
209
+ from huggingface_hub import hf_hub_download
210
+ from llama_cpp import Llama
211
+
212
+ # Warn about CPU latency at load time (not __init__) — the warning is
213
+ # about runtime cost, which only applies once the model is actually
214
+ # loaded. Construction is cheap; tests instantiate Qwen3Translator
215
+ # without intending to run inference, so __init__ shouldn't shout.
216
+ resolved = select_device(self.device, mps_allowed=True)
217
+ if resolved == "cpu":
218
+ logger.warning(
219
+ "Qwen3Translator on CPU is ~10-15x slower than MarianTranslator. "
220
+ "Consider translator='marian' for development or pass device='cuda'/'mps'.",
221
+ )
222
+
223
+ logger.info("Qwen3Translator: loading %s", self.filename)
224
+ model_path = Path(hf_hub_download(repo_id=self.repo_id, filename=self.filename))
225
+
226
+ # n_gpu_layers=-1 offloads everything to GPU when one is available;
227
+ # 0 forces CPU. llama-cpp-python's Metal/CUDA support detects and
228
+ # uses whatever the build was compiled against.
229
+ n_gpu_layers = 0 if resolved == "cpu" else -1
230
+ # n_threads omitted on purpose — llama-cpp-python defaults to a
231
+ # sensible per-host value (min(physical cores, 4)). Hardcoding 8
232
+ # under-utilizes a 16-core box and over-subscribes a 4-core CI.
233
+ self._llm = Llama(
234
+ model_path=str(model_path),
235
+ n_ctx=self.n_ctx,
236
+ n_gpu_layers=n_gpu_layers,
237
+ verbose=False,
238
+ )
239
+
240
+ def _qwen_translate(
241
+ self, segments: list[TranscriptionSegment], target_lang: str, source_lang: str
242
+ ) -> dict[int, str]:
243
+ """One Qwen call to translate all segments. Empty result on parse failure."""
244
+ if self._llm is None:
245
+ self._init_local()
246
+
247
+ system = _build_system_prompt(source_lang, target_lang)
248
+ user = _build_user_prompt(segments, target_lang)
249
+ prompt = system + user
250
+
251
+ response = self._llm(
252
+ prompt,
253
+ max_tokens=self.max_tokens,
254
+ temperature=self.temperature,
255
+ stop=None,
256
+ )
257
+ raw = response["choices"][0]["text"]
258
+ return _parse_jsonl_response(raw)
259
+
260
+ def translate_segments(
261
+ self,
262
+ segments: list[TranscriptionSegment],
263
+ target_lang: str,
264
+ source_lang: str | None = None,
265
+ progress_callback: Callable[[float], None] | None = None,
266
+ ) -> list[TranslatedSegment]:
267
+ """Translate segments via Qwen with parse-retry + optional Marian fallback.
268
+
269
+ The progress_callback fires three times: 0.5 after the first
270
+ Qwen call, 0.9 after the optional retry/fallback, 1.0 at the
271
+ end. M2.1 phase 2 confirmed smaller batches don't help on CPU,
272
+ so finer-grained progress isn't possible without fake ticks.
273
+ """
274
+ effective_source = source_lang or "en"
275
+ self._failures_last_call = []
276
+
277
+ translatable_indices = [i for i, seg in enumerate(segments) if _is_translatable_text(seg.text)]
278
+ translatable_segments = [segments[i] for i in translatable_indices]
279
+
280
+ # First attempt.
281
+ if translatable_segments:
282
+ qwen_results = self._qwen_translate(translatable_segments, target_lang, effective_source)
283
+ else:
284
+ qwen_results = {}
285
+ if progress_callback is not None:
286
+ progress_callback(0.5)
287
+
288
+ # Identify segments Qwen failed (unparseable or missing index).
289
+ # Indices in qwen_results / translatable_segments are 0-based positions
290
+ # within translatable_segments, NOT positions in the full ``segments``
291
+ # list. Map back at the end.
292
+ missing_local_indices = [li for li in range(len(translatable_segments)) if li not in qwen_results]
293
+
294
+ # Retry once on the missing subset with stricter instructions.
295
+ if missing_local_indices:
296
+ retry_segments = [translatable_segments[li] for li in missing_local_indices]
297
+ logger.info(
298
+ "Qwen3Translator: retrying %d/%d segments after first parse",
299
+ len(retry_segments),
300
+ len(translatable_segments),
301
+ )
302
+ retry_results = self._qwen_translate(retry_segments, target_lang, effective_source)
303
+ # retry_results uses 0..len(retry_segments)-1 as keys; map back.
304
+ for retry_local, original_local in enumerate(missing_local_indices):
305
+ if retry_local in retry_results:
306
+ qwen_results[original_local] = retry_results[retry_local]
307
+ if progress_callback is not None:
308
+ progress_callback(0.9)
309
+
310
+ # Anything still missing → Marian fallback (or surface as failure).
311
+ still_missing_local = [li for li in range(len(translatable_segments)) if li not in qwen_results]
312
+ if still_missing_local and self.marian_fallback:
313
+ fallback_segments = [translatable_segments[li] for li in still_missing_local]
314
+ logger.warning(
315
+ "Qwen3Translator: falling back to Marian for %d segments after retry",
316
+ len(fallback_segments),
317
+ )
318
+ if self._marian is None:
319
+ self._marian = MarianTranslator(device=self.device)
320
+ try:
321
+ fallback_translated = self._marian.translate_segments(
322
+ fallback_segments, target_lang=target_lang, source_lang=effective_source
323
+ )
324
+ for li, ts in zip(still_missing_local, fallback_translated):
325
+ qwen_results[li] = ts.translated_text
326
+ except Exception as exc:
327
+ logger.warning("Qwen3Translator: Marian fallback failed (%s)", exc)
328
+ # Leave them missing; they'll be recorded as failures below.
329
+
330
+ # Whatever's still missing is a hard failure. Record original-segment
331
+ # indices (positions in the full ``segments`` list) so the caller
332
+ # can reconcile against translated_segments.
333
+ for li in range(len(translatable_segments)):
334
+ if li not in qwen_results:
335
+ self._failures_last_call.append(translatable_indices[li])
336
+
337
+ # Lazy import to avoid a circular dep through videopython.ai.dubbing
338
+ # (see TYPE_CHECKING import at the top of the module).
339
+ from videopython.ai.dubbing.models import TranslatedSegment
340
+
341
+ # Materialize TranslatedSegments parallel to the input list.
342
+ translated_segments: list[TranslatedSegment] = []
343
+ local_translation_for_orig: dict[int, str] = {}
344
+ for li, original_idx in enumerate(translatable_indices):
345
+ if li in qwen_results:
346
+ local_translation_for_orig[original_idx] = qwen_results[li]
347
+
348
+ for i, segment in enumerate(segments):
349
+ translated_text = local_translation_for_orig.get(i, "")
350
+ translated_segments.append(
351
+ TranslatedSegment(
352
+ original_segment=segment,
353
+ translated_text=translated_text,
354
+ source_lang=effective_source,
355
+ target_lang=target_lang,
356
+ speaker=segment.speaker,
357
+ start=segment.start,
358
+ end=segment.end,
359
+ )
360
+ )
361
+
362
+ if progress_callback is not None:
363
+ progress_callback(1.0)
364
+ return translated_segments
365
+
366
+ @property
367
+ def translation_failures(self) -> list[int]:
368
+ """Indices (in the most recent ``segments`` input) where translation
369
+ failed entirely. Empty if all segments translated.
370
+ """
371
+ return list(self._failures_last_call)
372
+
373
+ def unload(self) -> None:
374
+ """Release the model so the next call re-initializes. Used by
375
+ :class:`LocalDubbingPipeline` in ``low_memory`` mode."""
376
+ self._llm = None
377
+ if self._marian is not None:
378
+ self._marian.unload()
379
+ self._marian = None
380
+ release_device_memory(self.device)
381
+
382
+ @staticmethod
383
+ def get_supported_languages() -> dict[str, str]:
384
+ """Qwen handles all of Marian's language set plus more; we expose the
385
+ Marian set for now and let M2.4 eval add anything Qwen-only.
386
+ """
387
+ return LANGUAGE_NAMES.copy()
388
+
389
+ @classmethod
390
+ def supports(cls, source_lang: str, target_lang: str) -> bool:
391
+ """Coverage hint for the M2.3 ``auto`` resolver."""
392
+ if source_lang == target_lang:
393
+ return True
394
+ return source_lang in LANGUAGE_NAMES and target_lang in LANGUAGE_NAMES
@@ -1,13 +1,50 @@
1
- """Text translation using local Helsinki-NLP models."""
1
+ """Text translation backends.
2
+
3
+ Two backends share the :class:`TranslationBackend` protocol:
4
+
5
+ - :class:`MarianTranslator` (HuggingFace Helsinki-NLP MarianMT) — fast,
6
+ segment-isolated, available for ~30 language pairs. Default on CPU.
7
+ - :class:`Qwen3Translator` (Qwen3-4B/8B/14B-Instruct via llama-cpp-python) —
8
+ slower but produces context-aware, length-budgeted translations. Default
9
+ on GPU.
10
+
11
+ The dubbing pipeline (:mod:`videopython.ai.dubbing.pipeline`) selects a backend
12
+ from its ``translator`` kwarg (``"auto"`` resolves at runtime).
13
+ """
2
14
 
3
15
  from __future__ import annotations
4
16
 
5
- from typing import Any, Callable
17
+ from typing import TYPE_CHECKING, Any, Callable, Protocol, runtime_checkable
6
18
 
7
19
  from videopython.ai._device import log_device_initialization, release_device_memory, select_device
8
- from videopython.ai.dubbing.models import TranslatedSegment
9
20
  from videopython.base.text.transcription import TranscriptionSegment
10
21
 
22
+ # Imported under TYPE_CHECKING to avoid a circular dep through
23
+ # videopython.ai.dubbing (the dubbing pipeline imports both
24
+ # MarianTranslator and Qwen3Translator, which both import
25
+ # TranslatedSegment from dubbing.models). Runtime users do a lazy
26
+ # local import inside translate_segments.
27
+ if TYPE_CHECKING:
28
+ from videopython.ai.dubbing.models import TranslatedSegment
29
+
30
+
31
class UnsupportedLanguageError(ValueError):
    """Raised when no available translation backend supports a given
    ``(source, target)`` language pair.

    Carries the requested pair so callers can introspect:

        try:
            dubber.dub(video, target_lang="xh")
        except UnsupportedLanguageError as e:
            print(f"No backend covers {e.source_lang}->{e.target_lang}")

    Args:
        source_lang: Language code of the input text.
        target_lang: Language code that was requested.
        message: Optional custom message. When omitted (or empty) a default
            ``"No translation backend supports <src>-><tgt>"`` is used.

    Instances survive ``pickle`` and ``copy`` round-trips, so the error can
    cross process boundaries (e.g. worker pools) without being replaced by a
    ``TypeError`` during reconstruction.
    """

    def __init__(self, source_lang: str, target_lang: str, message: str | None = None):
        self.source_lang = source_lang
        self.target_lang = target_lang
        super().__init__(message or f"No translation backend supports {source_lang}->{target_lang}")

    def __reduce__(self):
        # BaseException's default reduction rebuilds via ``cls(*self.args)``.
        # ``args`` holds only the message here, so that call would miss the
        # required ``target_lang`` argument. Rebuild with the full signature.
        return (type(self), (self.source_lang, self.target_lang, str(self)))
47
+
11
48
 
12
49
  def _is_translatable_text(text: str) -> bool:
13
50
  """Return True if text has enough content to be worth translating.
@@ -19,6 +56,36 @@ def _is_translatable_text(text: str) -> bool:
19
56
  return sum(1 for c in text if c.isalnum()) >= 2
20
57
 
21
58
 
59
@runtime_checkable
class TranslationBackend(Protocol):
    """Structural contract the dubbing pipeline expects from a translator.

    :class:`MarianTranslator` and :class:`Qwen3Translator` both conform.
    The pipeline relies on nothing beyond the members declared here.
    """

    @staticmethod
    def get_supported_languages() -> dict[str, str]:
        """Return the languages the backend advertises, as a ``str -> str`` mapping."""
        ...

    @property
    def translation_failures(self) -> list[int]:
        """Indices into the most recent ``segments`` input where the backend
        could not produce a translation.

        Empty for backends that never fail per-segment (e.g.
        MarianTranslator). The dubbing pipeline copies this onto
        :class:`DubbingResult.translation_failures`.
        """
        ...

    def unload(self) -> None:
        """Release whatever model resources the backend is holding."""
        ...

    def translate_segments(
        self,
        segments: list[TranscriptionSegment],
        target_lang: str,
        source_lang: str | None = None,
        progress_callback: Callable[[float], None] | None = None,
    ) -> list[TranslatedSegment]:
        """Translate ``segments`` into ``target_lang``.

        Args:
            segments: Transcription segments to translate.
            target_lang: Language code to translate into.
            source_lang: Language code of the input, or ``None`` to let the
                backend choose its own default.
            progress_callback: Optional callable that receives a float
                progress value.

        Returns:
            The translated segments.
        """
        ...
87
+
88
+
22
89
  LANGUAGE_NAMES = {
23
90
  "en": "English",
24
91
  "es": "Spanish",
@@ -56,8 +123,8 @@ LANGUAGE_NAMES = {
56
123
  }
57
124
 
58
125
 
59
- class TextTranslator:
60
- """Translates text between languages using local seq2seq models."""
126
+ class MarianTranslator:
127
+ """Translates text between languages using local Helsinki-NLP MarianMT models."""
61
128
 
62
129
  # Languages without a direct opus-mt-{src}-{tgt} model. Maps (source, target)
63
130
  # to an alternative HuggingFace model identifier.
@@ -68,6 +135,25 @@ class TextTranslator:
68
135
  ("en", "pl"): "Helsinki-NLP/opus-mt-en-zlw",
69
136
  }
70
137
 
138
@classmethod
def has_model_for(cls, source_lang: str, target_lang: str) -> bool:
    """Tell whether Marian has (or is likely to have) a model for the pair.

    This is a *coverage hint* for the M2.3 ``auto`` resolver, not a
    guarantee. The real existence check happens at first-use download time.

    Args:
        source_lang: Language code of the input text.
        target_lang: Language code to translate into.

    Returns:
        ``True`` when any of the following holds, ``False`` otherwise:

        - both codes are equal (translation is the identity);
        - ``(source_lang, target_lang)`` has an entry in ``_MODEL_OVERRIDES``;
        - both codes are in :data:`LANGUAGE_NAMES`. This last rule is a
          permissive proxy: Marian publishes ``opus-mt-{src}-{tgt}`` for
          most ISO-639-1 pairs we expose, but not all (e.g. some
          Asian-to-Asian pairs route through English).
    """
    pair = (source_lang, target_lang)
    if source_lang == target_lang or pair in cls._MODEL_OVERRIDES:
        return True
    return all(code in LANGUAGE_NAMES for code in pair)
156
+
71
157
  def __init__(self, model_name: str | None = None, device: str | None = None):
72
158
  self.model_name = model_name
73
159
  self.device = device
@@ -194,6 +280,10 @@ class TextTranslator:
194
280
  callers can render translation-stage progress without knowing the
195
281
  batch size.
196
282
  """
283
+ # Lazy import to avoid a circular dep through videopython.ai.dubbing
284
+ # (see TYPE_CHECKING import at the top of the module).
285
+ from videopython.ai.dubbing.models import TranslatedSegment
286
+
197
287
  effective_source = source_lang or "en"
198
288
 
199
289
  translatable_indices = [i for i, segment in enumerate(segments) if _is_translatable_text(segment.text)]
@@ -230,6 +320,20 @@ class TextTranslator:
230
320
  self._current_lang_pair = None
231
321
  release_device_memory(self.device)
232
322
 
323
@property
def translation_failures(self) -> list[int]:
    """Per-segment failure indices, which are always empty for Marian.

    Marian never fails per-segment (worst case it produces poor output,
    not no output). The property exists so this class satisfies the
    :class:`TranslationBackend` protocol.

    Returns:
        A new empty list on every access.
    """
    no_failures: list[int] = []
    return no_failures
329
+
233
330
@staticmethod
def get_supported_languages() -> dict[str, str]:
    """Return a new dict copied from ``LANGUAGE_NAMES``.

    Callers get their own copy, so mutating it does not affect the shared
    module-level table.
    """
    return dict(LANGUAGE_NAMES)
333
+
334
+
335
+ # Back-compat alias. ``TextTranslator`` was the class name through 0.28.0;
336
+ # 0.28.1 renames it to ``MarianTranslator`` to make room for ``Qwen3Translator``
337
+ # behind a shared :class:`TranslationBackend` protocol. The alias will be
338
+ # removed in 0.30.0.
339
+ TextTranslator = MarianTranslator
@@ -11,6 +11,36 @@ from videopython.base.text.transcription import Transcription, TranscriptionSegm
11
11
  from videopython.base.video import Video
12
12
 
13
13
 
14
+ def _attach_confidence_by_overlap(
15
+ target_segments: list[TranscriptionSegment],
16
+ source_segments: list[TranscriptionSegment],
17
+ ) -> None:
18
+ """Stamp Whisper confidence (avg_logprob, no_speech_prob, compression_ratio)
19
+ onto ``target_segments`` from the ``source_segments`` they overlap most with.
20
+
21
+ Used to re-attach per-segment confidence after diarization rebuilds segments
22
+ from words and drops the original Whisper-segment metadata. Whisper's
23
+ confidence is window-level, not phoneme-level, so overlap-by-time is the
24
+ right granularity — re-deriving per-word and re-aggregating wouldn't be
25
+ more accurate.
26
+
27
+ Mutates ``target_segments`` in place. Segments with no overlap to any
28
+ source segment are left untouched (their confidence stays None).
29
+ """
30
+ for tgt in target_segments:
31
+ best_overlap = 0.0
32
+ best_src: TranscriptionSegment | None = None
33
+ for src in source_segments:
34
+ overlap = max(0.0, min(tgt.end, src.end) - max(tgt.start, src.start))
35
+ if overlap > best_overlap:
36
+ best_overlap = overlap
37
+ best_src = src
38
+ if best_src is not None:
39
+ tgt.avg_logprob = best_src.avg_logprob
40
+ tgt.no_speech_prob = best_src.no_speech_prob
41
+ tgt.compression_ratio = best_src.compression_ratio
42
+
43
+
14
44
  class AudioToText:
15
45
  """Transcription service for audio and video using local Whisper models.
16
46
 
@@ -295,6 +325,13 @@ class AudioToText:
295
325
 
296
326
  transcription = self._process_transcription_result(transcription_result)
297
327
 
328
+ # Capture original Whisper segments before flattening to words. The
329
+ # diarization rebuild via Transcription(words=...) regroups by speaker,
330
+ # which loses the per-segment confidence M1.3 plumbed through. We
331
+ # re-attach by max-overlap match below so M2's confidence-aware
332
+ # translation prompts have signal on the diarized path too.
333
+ whisper_segments = transcription.segments
334
+
298
335
  all_words: list[TranscriptionWord] = []
299
336
  for seg in transcription.segments:
300
337
  all_words.extend(seg.words)
@@ -302,7 +339,9 @@ class AudioToText:
302
339
  if all_words:
303
340
  all_words = self._assign_speakers_to_words(all_words, diarization_result)
304
341
 
305
- return Transcription(words=all_words, language=transcription.language)
342
+ rebuilt = Transcription(words=all_words, language=transcription.language)
343
+ _attach_confidence_by_overlap(rebuilt.segments, whisper_segments)
344
+ return rebuilt
306
345
 
307
346
  def _transcribe_local(self, audio: Audio) -> Transcription:
308
347
  """Transcribe using local Whisper model.
File without changes
File without changes
File without changes