videopython 0.28.0__tar.gz → 0.28.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {videopython-0.28.0 → videopython-0.28.1}/PKG-INFO +2 -1
- {videopython-0.28.0 → videopython-0.28.1}/pyproject.toml +6 -1
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/dubbing/__init__.py +2 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/dubbing/dubber.py +12 -2
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/dubbing/models.py +6 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/dubbing/pipeline.py +76 -6
- videopython-0.28.1/src/videopython/ai/generation/qwen3.py +394 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/generation/translation.py +109 -5
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/understanding/audio.py +40 -1
- {videopython-0.28.0 → videopython-0.28.1}/.gitignore +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/LICENSE +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/README.md +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/__init__.py +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/__init__.py +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/_device.py +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/dubbing/quality.py +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/dubbing/remux.py +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/dubbing/timing.py +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/generation/__init__.py +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/generation/audio.py +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/generation/image.py +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/generation/video.py +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/registry.py +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/swapping/__init__.py +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/swapping/inpainter.py +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/swapping/models.py +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/swapping/segmenter.py +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/swapping/swapper.py +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/transforms.py +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/understanding/__init__.py +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/understanding/image.py +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/understanding/separation.py +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/understanding/temporal.py +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/ai/video_analysis.py +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/base/__init__.py +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/base/audio/__init__.py +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/base/audio/analysis.py +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/base/audio/audio.py +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/base/combine.py +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/base/description.py +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/base/effects.py +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/base/exceptions.py +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/base/progress.py +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/base/registry.py +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/base/scene.py +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/base/streaming.py +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/base/text/__init__.py +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/base/text/overlay.py +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/base/text/transcription.py +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/base/transforms.py +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/base/transitions.py +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/base/utils.py +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/base/video.py +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/editing/__init__.py +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/editing/multicam.py +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/editing/premiere_xml.py +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/editing/video_edit.py +0 -0
- {videopython-0.28.0 → videopython-0.28.1}/src/videopython/py.typed +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: videopython
|
|
3
|
-
Version: 0.28.0
|
|
3
|
+
Version: 0.28.1
|
|
4
4
|
Summary: Minimal video generation and processing library.
|
|
5
5
|
Project-URL: Homepage, https://videopython.com
|
|
6
6
|
Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
|
|
@@ -29,6 +29,7 @@ Requires-Dist: demucs>=4.0.0; extra == 'ai'
|
|
|
29
29
|
Requires-Dist: diffusers>=0.30.0; extra == 'ai'
|
|
30
30
|
Requires-Dist: easyocr>=1.7.0; extra == 'ai'
|
|
31
31
|
Requires-Dist: hf-transfer>=0.1.9; extra == 'ai'
|
|
32
|
+
Requires-Dist: llama-cpp-python>=0.3.0; extra == 'ai'
|
|
32
33
|
Requires-Dist: numba>=0.61.0; extra == 'ai'
|
|
33
34
|
Requires-Dist: ollama>=0.4.5; extra == 'ai'
|
|
34
35
|
Requires-Dist: openai-whisper>=20240930; extra == 'ai'
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "videopython"
|
|
3
|
-
version = "0.28.0"
|
|
3
|
+
version = "0.28.1"
|
|
4
4
|
description = "Minimal video generation and processing library."
|
|
5
5
|
authors = [
|
|
6
6
|
{ name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
|
|
@@ -80,6 +80,8 @@ ai = [
|
|
|
80
80
|
"sentencepiece>=0.1.99",
|
|
81
81
|
# Audio source separation
|
|
82
82
|
"demucs>=4.0.0",
|
|
83
|
+
# Translation backend: Qwen3 GGUF inference (M2)
|
|
84
|
+
"llama-cpp-python>=0.3.0",
|
|
83
85
|
]
|
|
84
86
|
|
|
85
87
|
# Required for pip install videopython[ai] - pip uses optional-dependencies, not dependency-groups
|
|
@@ -111,6 +113,8 @@ ai = [
|
|
|
111
113
|
"sentencepiece>=0.1.99",
|
|
112
114
|
# Audio source separation
|
|
113
115
|
"demucs>=4.0.0",
|
|
116
|
+
# Translation backend: Qwen3 GGUF inference (M2)
|
|
117
|
+
"llama-cpp-python>=0.3.0",
|
|
114
118
|
]
|
|
115
119
|
|
|
116
120
|
[project.urls]
|
|
@@ -136,6 +140,7 @@ module = [
|
|
|
136
140
|
"pyannote", "pyannote.*",
|
|
137
141
|
"silero_vad", "silero_vad.*",
|
|
138
142
|
"cv2", "cv2.*",
|
|
143
|
+
"llama_cpp", "llama_cpp.*",
|
|
139
144
|
]
|
|
140
145
|
ignore_missing_imports = true
|
|
141
146
|
|
|
@@ -5,6 +5,7 @@ from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, Separate
|
|
|
5
5
|
from videopython.ai.dubbing.pipeline import LocalDubbingPipeline
|
|
6
6
|
from videopython.ai.dubbing.quality import GarbageTranscriptError, TranscriptQuality, assess_transcript
|
|
7
7
|
from videopython.ai.dubbing.timing import TimingSynchronizer
|
|
8
|
+
from videopython.ai.generation.translation import UnsupportedLanguageError
|
|
8
9
|
|
|
9
10
|
__all__ = [
|
|
10
11
|
"VideoDubber",
|
|
@@ -17,4 +18,5 @@ __all__ = [
|
|
|
17
18
|
"GarbageTranscriptError",
|
|
18
19
|
"TranscriptQuality",
|
|
19
20
|
"assess_transcript",
|
|
21
|
+
"UnsupportedLanguageError",
|
|
20
22
|
]
|
|
@@ -7,7 +7,7 @@ from pathlib import Path
|
|
|
7
7
|
from typing import TYPE_CHECKING, Any, Callable
|
|
8
8
|
|
|
9
9
|
from videopython.ai.dubbing.models import DubbingResult, RevoiceResult
|
|
10
|
-
from videopython.ai.dubbing.pipeline import WhisperModel
|
|
10
|
+
from videopython.ai.dubbing.pipeline import TranslatorChoice, WhisperModel
|
|
11
11
|
|
|
12
12
|
if TYPE_CHECKING:
|
|
13
13
|
from videopython.base.video import Video
|
|
@@ -44,6 +44,12 @@ class VideoDubber:
|
|
|
44
44
|
but processing continues. Either way the
|
|
45
45
|
:class:`TranscriptQuality` is exposed on ``DubbingResult`` for
|
|
46
46
|
inspection.
|
|
47
|
+
translator: Translation backend to use. ``"auto"`` (default)
|
|
48
|
+
picks Qwen3 on GPU, MarianMT on CPU; ``"marian"`` and
|
|
49
|
+
``"qwen3"`` force the named backend regardless of device.
|
|
50
|
+
See :class:`videopython.ai.generation.qwen3.Qwen3Translator`
|
|
51
|
+
for tradeoffs (Qwen3 is slower on CPU but produces
|
|
52
|
+
context-aware, length-budgeted output).
|
|
47
53
|
"""
|
|
48
54
|
|
|
49
55
|
def __init__(
|
|
@@ -55,6 +61,7 @@ class VideoDubber:
|
|
|
55
61
|
no_speech_threshold: float = 0.6,
|
|
56
62
|
logprob_threshold: float | None = -1.0,
|
|
57
63
|
strict_quality: bool = False,
|
|
64
|
+
translator: TranslatorChoice = "auto",
|
|
58
65
|
):
|
|
59
66
|
self.device = device
|
|
60
67
|
self.low_memory = low_memory
|
|
@@ -63,13 +70,15 @@ class VideoDubber:
|
|
|
63
70
|
self.no_speech_threshold = no_speech_threshold
|
|
64
71
|
self.logprob_threshold = logprob_threshold
|
|
65
72
|
self.strict_quality = strict_quality
|
|
73
|
+
self.translator = translator
|
|
66
74
|
self._local_pipeline: Any = None
|
|
67
75
|
requested = device.lower() if isinstance(device, str) else "auto"
|
|
68
76
|
logger.info(
|
|
69
|
-
"VideoDubber initialized with device=%s low_memory=%s whisper_model=%s",
|
|
77
|
+
"VideoDubber initialized with device=%s low_memory=%s whisper_model=%s translator=%s",
|
|
70
78
|
requested,
|
|
71
79
|
low_memory,
|
|
72
80
|
whisper_model,
|
|
81
|
+
translator,
|
|
73
82
|
)
|
|
74
83
|
|
|
75
84
|
def _init_local_pipeline(self) -> None:
|
|
@@ -83,6 +92,7 @@ class VideoDubber:
|
|
|
83
92
|
no_speech_threshold=self.no_speech_threshold,
|
|
84
93
|
logprob_threshold=self.logprob_threshold,
|
|
85
94
|
strict_quality=self.strict_quality,
|
|
95
|
+
translator=self.translator,
|
|
86
96
|
)
|
|
87
97
|
|
|
88
98
|
def dub(
|
|
@@ -180,6 +180,11 @@ class DubbingResult:
|
|
|
180
180
|
timing_summary: Aggregate stats over per-segment timing adjustments.
|
|
181
181
|
transcript_quality: Heuristic quality assessment of the transcription
|
|
182
182
|
(None when the pipeline returned early on an empty transcription).
|
|
183
|
+
translation_failures: Indices of segments where translation failed
|
|
184
|
+
entirely. Used by Qwen3Translator when both the primary call and
|
|
185
|
+
the per-segment Marian fallback fail; those segments are dubbed
|
|
186
|
+
with empty text. Empty list under MarianTranslator (Marian has
|
|
187
|
+
no failure mode that drops segments).
|
|
183
188
|
"""
|
|
184
189
|
|
|
185
190
|
dubbed_audio: Audio
|
|
@@ -191,6 +196,7 @@ class DubbingResult:
|
|
|
191
196
|
voice_samples: dict[str, Audio] = field(default_factory=dict)
|
|
192
197
|
timing_summary: TimingSummary | None = None
|
|
193
198
|
transcript_quality: TranscriptQuality | None = None
|
|
199
|
+
translation_failures: list[int] = field(default_factory=list)
|
|
194
200
|
|
|
195
201
|
@property
|
|
196
202
|
def num_segments(self) -> int:
|
|
@@ -9,14 +9,24 @@ from typing import TYPE_CHECKING, Any, Callable, Literal
|
|
|
9
9
|
|
|
10
10
|
import numpy as np
|
|
11
11
|
|
|
12
|
+
from videopython.ai._device import select_device
|
|
12
13
|
from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, SeparatedAudio, TimingSummary
|
|
13
14
|
from videopython.ai.dubbing.quality import GarbageTranscriptError, assess_transcript
|
|
14
15
|
from videopython.ai.dubbing.timing import TimingSynchronizer
|
|
16
|
+
from videopython.ai.generation.qwen3 import Qwen3Translator
|
|
17
|
+
from videopython.ai.generation.translation import (
|
|
18
|
+
MarianTranslator,
|
|
19
|
+
TranslationBackend,
|
|
20
|
+
UnsupportedLanguageError,
|
|
21
|
+
)
|
|
15
22
|
|
|
16
23
|
if TYPE_CHECKING:
|
|
17
24
|
from videopython.base.audio import Audio
|
|
18
25
|
|
|
19
26
|
|
|
27
|
+
TranslatorChoice = Literal["auto", "marian", "qwen3"]
|
|
28
|
+
|
|
29
|
+
|
|
20
30
|
def _peak_match(target: Audio, reference: Audio) -> Audio:
|
|
21
31
|
"""Scale ``target`` so its peak amplitude matches ``reference``.
|
|
22
32
|
|
|
@@ -74,6 +84,7 @@ class LocalDubbingPipeline:
|
|
|
74
84
|
no_speech_threshold: float = 0.6,
|
|
75
85
|
logprob_threshold: float | None = -1.0,
|
|
76
86
|
strict_quality: bool = False,
|
|
87
|
+
translator: TranslatorChoice = "auto",
|
|
77
88
|
):
|
|
78
89
|
self.device = device
|
|
79
90
|
self.low_memory = low_memory
|
|
@@ -82,12 +93,14 @@ class LocalDubbingPipeline:
|
|
|
82
93
|
self.no_speech_threshold = no_speech_threshold
|
|
83
94
|
self.logprob_threshold = logprob_threshold
|
|
84
95
|
self.strict_quality = strict_quality
|
|
96
|
+
self.translator = translator
|
|
85
97
|
requested = device.lower() if isinstance(device, str) else "auto"
|
|
86
98
|
logger.info(
|
|
87
|
-
"LocalDubbingPipeline initialized with device=%s low_memory=%s whisper_model=%s",
|
|
99
|
+
"LocalDubbingPipeline initialized with device=%s low_memory=%s whisper_model=%s translator=%s",
|
|
88
100
|
requested,
|
|
89
101
|
low_memory,
|
|
90
102
|
whisper_model,
|
|
103
|
+
translator,
|
|
91
104
|
)
|
|
92
105
|
|
|
93
106
|
self._transcriber: Any = None
|
|
@@ -128,11 +141,64 @@ class LocalDubbingPipeline:
|
|
|
128
141
|
logprob_threshold=self.logprob_threshold,
|
|
129
142
|
)
|
|
130
143
|
|
|
131
|
-
def _init_translator(self) -> None:
|
|
132
|
-
"""Initialize the translation
|
|
133
|
-
|
|
144
|
+
def _init_translator(self, source_lang: str, target_lang: str) -> None:
|
|
145
|
+
"""Initialize the translation backend.
|
|
146
|
+
|
|
147
|
+
Resolves the configured ``self.translator`` choice into a concrete
|
|
148
|
+
backend. ``"auto"`` uses :meth:`_resolve_translator_auto`; explicit
|
|
149
|
+
choices instantiate the named backend directly. Re-initialization
|
|
150
|
+
is a no-op when ``self._translator`` is already a matching instance
|
|
151
|
+
for the same language pair (handled at call sites via the existing
|
|
152
|
+
``self._translator is None`` gate).
|
|
153
|
+
"""
|
|
154
|
+
if self.translator == "marian":
|
|
155
|
+
self._translator = MarianTranslator(device=self.device)
|
|
156
|
+
elif self.translator == "qwen3":
|
|
157
|
+
self._translator = Qwen3Translator(device=self.device)
|
|
158
|
+
else: # "auto"
|
|
159
|
+
self._translator = self._resolve_translator_auto(source_lang, target_lang)
|
|
160
|
+
|
|
161
|
+
def _resolve_translator_auto(self, source_lang: str, target_lang: str) -> TranslationBackend:
|
|
162
|
+
"""Pick a backend based on language coverage AND device.
|
|
163
|
+
|
|
164
|
+
Qwen3-4B Q4_K_M on CPU is roughly 10-15x slower than MarianMT (M2.1
|
|
165
|
+
spike on dreams_15min.mp4). The resolver picks Marian on CPU
|
|
166
|
+
whenever it covers the language pair and only escalates to Qwen
|
|
167
|
+
when a GPU is available or Marian doesn't cover the pair.
|
|
168
|
+
"""
|
|
169
|
+
device = select_device(self.device, mps_allowed=True)
|
|
170
|
+
has_gpu = device in ("cuda", "mps")
|
|
171
|
+
|
|
172
|
+
# 1. GPU + Qwen covers the pair → Qwen wins (best quality).
|
|
173
|
+
if has_gpu and Qwen3Translator.supports(source_lang, target_lang):
|
|
174
|
+
logger.info(
|
|
175
|
+
"translator: auto-selected qwen3 (device=%s, supports %s->%s)",
|
|
176
|
+
device,
|
|
177
|
+
source_lang,
|
|
178
|
+
target_lang,
|
|
179
|
+
)
|
|
180
|
+
return Qwen3Translator(device=self.device)
|
|
181
|
+
|
|
182
|
+
# 2. Marian covers the pair → Marian (fast).
|
|
183
|
+
if MarianTranslator.has_model_for(source_lang, target_lang):
|
|
184
|
+
if has_gpu:
|
|
185
|
+
reason = f"Qwen does not cover {source_lang}->{target_lang}"
|
|
186
|
+
else:
|
|
187
|
+
reason = f"device={device} (Qwen would be ~10-15x slower; pass translator='qwen3' to override)"
|
|
188
|
+
logger.info("translator: auto-selected marian (%s)", reason)
|
|
189
|
+
return MarianTranslator(device=self.device)
|
|
190
|
+
|
|
191
|
+
# 3. CPU + only Qwen covers it: warn loudly and use Qwen anyway.
|
|
192
|
+
if Qwen3Translator.supports(source_lang, target_lang):
|
|
193
|
+
logger.warning(
|
|
194
|
+
"translator: auto-selected qwen3 on CPU (%s->%s not in Marian); "
|
|
195
|
+
"translation will be slow (~10-15x MarianMT). Consider GPU.",
|
|
196
|
+
source_lang,
|
|
197
|
+
target_lang,
|
|
198
|
+
)
|
|
199
|
+
return Qwen3Translator(device=self.device)
|
|
134
200
|
|
|
135
|
-
|
|
201
|
+
raise UnsupportedLanguageError(source_lang, target_lang)
|
|
136
202
|
|
|
137
203
|
def _init_tts(self, language: str = "en") -> None:
|
|
138
204
|
"""Initialize the text-to-speech model."""
|
|
@@ -430,7 +496,7 @@ class LocalDubbingPipeline:
|
|
|
430
496
|
|
|
431
497
|
report_progress("Translating text", 0.35)
|
|
432
498
|
if self._translator is None:
|
|
433
|
-
self._init_translator()
|
|
499
|
+
self._init_translator(source_lang=detected_lang, target_lang=target_lang)
|
|
434
500
|
|
|
435
501
|
# Translation stage spans 0.35 → 0.50 of overall pipeline progress.
|
|
436
502
|
# MarianMT runs sequentially over 8-segment batches; on a 15-min
|
|
@@ -446,6 +512,9 @@ class LocalDubbingPipeline:
|
|
|
446
512
|
source_lang=detected_lang,
|
|
447
513
|
progress_callback=_on_translation_progress,
|
|
448
514
|
)
|
|
515
|
+
# Capture per-segment failures (always empty for Marian) before
|
|
516
|
+
# _maybe_unload nukes the backend in low_memory mode.
|
|
517
|
+
translation_failures = list(self._translator.translation_failures)
|
|
449
518
|
self._maybe_unload("_translator")
|
|
450
519
|
|
|
451
520
|
report_progress("Generating dubbed speech", 0.50)
|
|
@@ -559,6 +628,7 @@ class LocalDubbingPipeline:
|
|
|
559
628
|
voice_samples=voice_samples,
|
|
560
629
|
timing_summary=timing_summary,
|
|
561
630
|
transcript_quality=transcript_quality,
|
|
631
|
+
translation_failures=translation_failures,
|
|
562
632
|
)
|
|
563
633
|
|
|
564
634
|
def revoice(
|
|
@@ -0,0 +1,394 @@
|
|
|
1
|
+
"""Qwen3-Instruct translation backend (M2).
|
|
2
|
+
|
|
3
|
+
GGUF inference via ``llama-cpp-python``. One model for now —
|
|
4
|
+
``Qwen3-4B-Instruct-2507`` (Apache-2.0, ~2.4 GB Q4_K_M). The original M2
|
|
5
|
+
plan called for low/medium/high tiers (4B / 8B / 30B-A3B); we deferred
|
|
6
|
+
that complexity until M2.4 eval data shows the larger models actually
|
|
7
|
+
deliver a quality lift worth the VRAM cost.
|
|
8
|
+
|
|
9
|
+
Latency note: on CPU the 4B model is roughly 10-15× slower than
|
|
10
|
+
:class:`MarianTranslator` per the M2.1 spike. On GPU it lands within ~2×
|
|
11
|
+
of Marian. Translation quality is decisively higher than Marian on
|
|
12
|
+
context-dependent and idiomatic content. The pipeline's
|
|
13
|
+
:class:`LocalDubbingPipeline` chooses based on ``device`` + the
|
|
14
|
+
``translator`` kwarg.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import json
|
|
20
|
+
import logging
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
from typing import TYPE_CHECKING, Any, Callable
|
|
23
|
+
|
|
24
|
+
from videopython.ai._device import release_device_memory, select_device
|
|
25
|
+
from videopython.ai.generation.translation import (
|
|
26
|
+
LANGUAGE_NAMES,
|
|
27
|
+
MarianTranslator,
|
|
28
|
+
_is_translatable_text,
|
|
29
|
+
)
|
|
30
|
+
from videopython.base.text.transcription import TranscriptionSegment
|
|
31
|
+
|
|
32
|
+
# Imported under TYPE_CHECKING only — qwen3 sits below videopython.ai.dubbing
|
|
33
|
+
# in the import order (pipeline.py imports Qwen3Translator), so a top-level
|
|
34
|
+
# import would create a cycle. The runtime constructor reaches for it via a
|
|
35
|
+
# lazy local import inside translate_segments.
|
|
36
|
+
if TYPE_CHECKING:
|
|
37
|
+
from videopython.ai.dubbing.models import TranslatedSegment
|
|
38
|
+
|
|
39
|
+
logger = logging.getLogger(__name__)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# Default model. Constants are module-level so an eval harness or future
|
|
43
|
+
# tier pick can override at the call site without forking the class.
|
|
44
|
+
DEFAULT_REPO_ID = "unsloth/Qwen3-4B-Instruct-2507-GGUF"
|
|
45
|
+
DEFAULT_FILENAME = "Qwen3-4B-Instruct-2507-Q4_K_M.gguf"
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# Average characters per second of natural speech, used to derive the
|
|
49
|
+
# per-segment ``target_chars`` budget. Rough field measurements; the prompt
|
|
50
|
+
# tells Qwen this is a target ±15%, not a hard cap.
|
|
51
|
+
_SPEECH_CHARS_PER_SEC: dict[str, float] = {
|
|
52
|
+
"en": 14.0,
|
|
53
|
+
"es": 14.0,
|
|
54
|
+
"pt": 13.5,
|
|
55
|
+
"it": 13.5,
|
|
56
|
+
"fr": 13.0,
|
|
57
|
+
"de": 12.0,
|
|
58
|
+
"pl": 12.5,
|
|
59
|
+
"nl": 12.5,
|
|
60
|
+
"ru": 12.0,
|
|
61
|
+
"uk": 12.0,
|
|
62
|
+
"cs": 12.0,
|
|
63
|
+
"sk": 12.0,
|
|
64
|
+
"ro": 13.0,
|
|
65
|
+
"hu": 12.0,
|
|
66
|
+
"fi": 11.0,
|
|
67
|
+
"sv": 12.5,
|
|
68
|
+
"da": 13.0,
|
|
69
|
+
"nb": 13.0,
|
|
70
|
+
"no": 13.0,
|
|
71
|
+
"ja": 8.0,
|
|
72
|
+
"ko": 9.0,
|
|
73
|
+
"zh": 7.0,
|
|
74
|
+
"zh-CN": 7.0,
|
|
75
|
+
"zh-TW": 7.0,
|
|
76
|
+
"th": 9.0,
|
|
77
|
+
"vi": 11.0,
|
|
78
|
+
"ar": 10.0,
|
|
79
|
+
"he": 10.0,
|
|
80
|
+
"hi": 11.0,
|
|
81
|
+
"ta": 10.0,
|
|
82
|
+
"id": 12.0,
|
|
83
|
+
"ms": 12.0,
|
|
84
|
+
"tr": 12.0,
|
|
85
|
+
"el": 12.0,
|
|
86
|
+
}
|
|
87
|
+
_SPEECH_CHARS_DEFAULT = 12.0
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
# The transcription segment's avg_logprob is in [-inf, 0]. Values below this threshold mark a
|
|
91
|
+
# transcription window we don't trust — Qwen gets a hint not to over-anchor.
|
|
92
|
+
_LOW_LOGPROB_HINT_THRESHOLD = -1.0
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _target_chars_for(duration_seconds: float, target_lang: str) -> int:
|
|
96
|
+
"""Character-count budget for a segment of ``duration_seconds`` in ``target_lang``."""
|
|
97
|
+
rate = _SPEECH_CHARS_PER_SEC.get(target_lang, _SPEECH_CHARS_DEFAULT)
|
|
98
|
+
return max(1, int(duration_seconds * rate * 1.15))
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _build_system_prompt(source_lang: str, target_lang: str) -> str:
|
|
102
|
+
"""Stable system + format spec. The few-shot example uses generic
|
|
103
|
+
phrases (no fixture-specific content) so the prompt generalizes.
|
|
104
|
+
"""
|
|
105
|
+
src_name = LANGUAGE_NAMES.get(source_lang, source_lang)
|
|
106
|
+
tgt_name = LANGUAGE_NAMES.get(target_lang, target_lang)
|
|
107
|
+
return (
|
|
108
|
+
f"You are a professional dub translator. Translate from {src_name} to {tgt_name}.\n"
|
|
109
|
+
"Preserve register and proper nouns. Match each segment's syllable count so the\n"
|
|
110
|
+
"dub fits the original timing — translation is for spoken audio, not subtitles.\n"
|
|
111
|
+
"Aim for ``target_chars`` characters per segment (±15%).\n"
|
|
112
|
+
"If a segment is non-speech filler (grunts, laughter, music cues) keep it as filler in\n"
|
|
113
|
+
"the target language; do not invent content.\n"
|
|
114
|
+
"If a segment carries ``low_confidence``, the source transcription may be wrong;\n"
|
|
115
|
+
"translate conservatively rather than committing to a specific phrase.\n"
|
|
116
|
+
"\n"
|
|
117
|
+
"Output one JSON object per line, no preamble, no commentary, no markdown:\n"
|
|
118
|
+
'{"i": <segment_index>, "translated": "<text>"}\n'
|
|
119
|
+
)
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _build_user_prompt(segments: list[TranscriptionSegment], target_lang: str) -> str:
|
|
123
|
+
"""Per-call body — the segments to translate."""
|
|
124
|
+
lines: list[str] = []
|
|
125
|
+
for idx, seg in enumerate(segments):
|
|
126
|
+
budget = _target_chars_for(seg.end - seg.start, target_lang)
|
|
127
|
+
entry: dict[str, Any] = {
|
|
128
|
+
"i": idx,
|
|
129
|
+
"text": seg.text,
|
|
130
|
+
"target_chars": budget,
|
|
131
|
+
}
|
|
132
|
+
if seg.avg_logprob is not None and seg.avg_logprob < _LOW_LOGPROB_HINT_THRESHOLD:
|
|
133
|
+
entry["low_confidence"] = True
|
|
134
|
+
lines.append(json.dumps(entry, ensure_ascii=False))
|
|
135
|
+
request_block = "\n".join(lines)
|
|
136
|
+
return (
|
|
137
|
+
f"Input segments:\n{request_block}\nTranslations (one JSON object per line, exactly {len(segments)} lines):\n"
|
|
138
|
+
)
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def _parse_jsonl_response(raw: str) -> dict[int, str]:
|
|
142
|
+
"""Extract ``{i: translated_text}`` from Qwen output. Permissive — tolerates
|
|
143
|
+
markdown fences and preamble lines that the model occasionally adds."""
|
|
144
|
+
parsed: dict[int, str] = {}
|
|
145
|
+
for line in raw.splitlines():
|
|
146
|
+
line = line.strip()
|
|
147
|
+
if not line or line.startswith("```"):
|
|
148
|
+
continue
|
|
149
|
+
try:
|
|
150
|
+
obj = json.loads(line)
|
|
151
|
+
except json.JSONDecodeError:
|
|
152
|
+
continue
|
|
153
|
+
if isinstance(obj, dict) and "i" in obj and "translated" in obj:
|
|
154
|
+
try:
|
|
155
|
+
parsed[int(obj["i"])] = str(obj["translated"])
|
|
156
|
+
except (TypeError, ValueError):
|
|
157
|
+
continue
|
|
158
|
+
return parsed
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
class Qwen3Translator:
|
|
162
|
+
"""Qwen3-Instruct translation via llama-cpp-python (GGUF).
|
|
163
|
+
|
|
164
|
+
Args:
|
|
165
|
+
device: ``"cuda"``, ``"mps"``, ``"cpu"``, or ``None`` for auto.
|
|
166
|
+
marian_fallback: If True (default), fall back to Marian for any
|
|
167
|
+
segment that fails Qwen's parse retry. Set False to disable
|
|
168
|
+
(failures land in ``translation_failures`` instead).
|
|
169
|
+
repo_id: HuggingFace repo for the GGUF weights. Defaults to
|
|
170
|
+
``DEFAULT_REPO_ID``; override for eval harnesses.
|
|
171
|
+
filename: GGUF filename within ``repo_id``. Defaults to
|
|
172
|
+
``DEFAULT_FILENAME``.
|
|
173
|
+
n_ctx: llama.cpp context window. 8192 is plenty for a 15-min source;
|
|
174
|
+
raise for very long sources. Hard cap is the model's training
|
|
175
|
+
context (262K for Qwen3-4B-Instruct-2507).
|
|
176
|
+
max_tokens: Generation cap per call. 4× the input character count
|
|
177
|
+
is a safe upper bound for translation output.
|
|
178
|
+
temperature: Decoding temperature. 0.1 keeps output structurally
|
|
179
|
+
consistent (high JSON parse rate) without being deterministic.
|
|
180
|
+
"""
|
|
181
|
+
|
|
182
|
+
def __init__(
|
|
183
|
+
self,
|
|
184
|
+
device: str | None = None,
|
|
185
|
+
marian_fallback: bool = True,
|
|
186
|
+
repo_id: str = DEFAULT_REPO_ID,
|
|
187
|
+
filename: str = DEFAULT_FILENAME,
|
|
188
|
+
n_ctx: int = 8192,
|
|
189
|
+
max_tokens: int = 4096,
|
|
190
|
+
temperature: float = 0.1,
|
|
191
|
+
):
|
|
192
|
+
self.device = device
|
|
193
|
+
self.marian_fallback = marian_fallback
|
|
194
|
+
self.repo_id = repo_id
|
|
195
|
+
self.filename = filename
|
|
196
|
+
self.n_ctx = n_ctx
|
|
197
|
+
self.max_tokens = max_tokens
|
|
198
|
+
self.temperature = temperature
|
|
199
|
+
|
|
200
|
+
# Lazily initialized.
|
|
201
|
+
self._llm: Any = None
|
|
202
|
+
self._marian: MarianTranslator | None = None
|
|
203
|
+
# Tracks which segment indices both Qwen and Marian failed on. The
|
|
204
|
+
# pipeline reads this to populate DubbingResult.translation_failures.
|
|
205
|
+
self._failures_last_call: list[int] = []
|
|
206
|
+
|
|
207
|
+
def _init_local(self) -> None:
|
|
208
|
+
"""Download (if needed) and load the GGUF weights."""
|
|
209
|
+
from huggingface_hub import hf_hub_download
|
|
210
|
+
from llama_cpp import Llama
|
|
211
|
+
|
|
212
|
+
# Warn about CPU latency at load time (not __init__) — the warning is
|
|
213
|
+
# about runtime cost, which only applies once the model is actually
|
|
214
|
+
# loaded. Construction is cheap; tests instantiate Qwen3Translator
|
|
215
|
+
# without intending to run inference, so __init__ shouldn't shout.
|
|
216
|
+
resolved = select_device(self.device, mps_allowed=True)
|
|
217
|
+
if resolved == "cpu":
|
|
218
|
+
logger.warning(
|
|
219
|
+
"Qwen3Translator on CPU is ~10-15x slower than MarianTranslator. "
|
|
220
|
+
"Consider translator='marian' for development or pass device='cuda'/'mps'.",
|
|
221
|
+
)
|
|
222
|
+
|
|
223
|
+
logger.info("Qwen3Translator: loading %s", self.filename)
|
|
224
|
+
model_path = Path(hf_hub_download(repo_id=self.repo_id, filename=self.filename))
|
|
225
|
+
|
|
226
|
+
# n_gpu_layers=-1 offloads everything to GPU when one is available;
|
|
227
|
+
# 0 forces CPU. llama-cpp-python's Metal/CUDA support detects and
|
|
228
|
+
# uses whatever the build was compiled against.
|
|
229
|
+
n_gpu_layers = 0 if resolved == "cpu" else -1
|
|
230
|
+
# n_threads omitted on purpose — llama-cpp-python defaults to a
|
|
231
|
+
# sensible per-host value (min(physical cores, 4)). Hardcoding 8
|
|
232
|
+
# under-utilizes a 16-core box and over-subscribes a 4-core CI.
|
|
233
|
+
self._llm = Llama(
|
|
234
|
+
model_path=str(model_path),
|
|
235
|
+
n_ctx=self.n_ctx,
|
|
236
|
+
n_gpu_layers=n_gpu_layers,
|
|
237
|
+
verbose=False,
|
|
238
|
+
)
|
|
239
|
+
|
|
240
|
+
def _qwen_translate(
|
|
241
|
+
self, segments: list[TranscriptionSegment], target_lang: str, source_lang: str
|
|
242
|
+
) -> dict[int, str]:
|
|
243
|
+
"""One Qwen call to translate all segments. Empty result on parse failure."""
|
|
244
|
+
if self._llm is None:
|
|
245
|
+
self._init_local()
|
|
246
|
+
|
|
247
|
+
system = _build_system_prompt(source_lang, target_lang)
|
|
248
|
+
user = _build_user_prompt(segments, target_lang)
|
|
249
|
+
prompt = system + user
|
|
250
|
+
|
|
251
|
+
response = self._llm(
|
|
252
|
+
prompt,
|
|
253
|
+
max_tokens=self.max_tokens,
|
|
254
|
+
temperature=self.temperature,
|
|
255
|
+
stop=None,
|
|
256
|
+
)
|
|
257
|
+
raw = response["choices"][0]["text"]
|
|
258
|
+
return _parse_jsonl_response(raw)
|
|
259
|
+
|
|
260
|
+
def translate_segments(
    self,
    segments: list[TranscriptionSegment],
    target_lang: str,
    source_lang: str | None = None,
    progress_callback: Callable[[float], None] | None = None,
) -> list[TranslatedSegment]:
    """Translate segments via Qwen with parse-retry + optional Marian fallback.

    Flow: (1) one Qwen call over every translatable segment, (2) one Qwen
    retry restricted to the segments missing from the first result,
    (3) if ``self.marian_fallback`` is set, a MarianTranslator pass over
    whatever is still missing, (4) anything left is recorded in
    ``self._failures_last_call`` (exposed as ``translation_failures``).

    The progress_callback fires three times: 0.5 after the first
    Qwen call, 0.9 after the optional Qwen retry (i.e. *before* any
    Marian fallback runs), 1.0 at the end. M2.1 phase 2 confirmed
    smaller batches don't help on CPU, so finer-grained progress isn't
    possible without fake ticks.

    Args:
        segments: Source segments; the result list is parallel to this.
        target_lang: Language code to translate into.
        source_lang: Language code of the input. Falls back to ``"en"``
            when ``None`` or empty.
        progress_callback: Optional callable receiving the progress
            fractions described above.

    Returns:
        One ``TranslatedSegment`` per input segment, in input order.
        Segments that were skipped as non-translatable, or for which
        every attempt failed, carry ``translated_text == ""``.
    """
    effective_source = source_lang or "en"
    # Reset per call so ``translation_failures`` only reflects this input.
    self._failures_last_call = []

    # Positions (in ``segments``) worth sending to the model, and the
    # corresponding segment objects in the same order.
    translatable_indices = [i for i, seg in enumerate(segments) if _is_translatable_text(seg.text)]
    translatable_segments = [segments[i] for i in translatable_indices]

    # First attempt. Skipped entirely when nothing is translatable, so an
    # all-filtered input never triggers a model call from this branch.
    if translatable_segments:
        qwen_results = self._qwen_translate(translatable_segments, target_lang, effective_source)
    else:
        qwen_results = {}
    if progress_callback is not None:
        progress_callback(0.5)

    # Identify segments Qwen failed (unparseable or missing index).
    # Indices in qwen_results / translatable_segments are 0-based positions
    # within translatable_segments, NOT positions in the full ``segments``
    # list. Map back at the end.
    missing_local_indices = [li for li in range(len(translatable_segments)) if li not in qwen_results]

    # Retry once on the missing subset with stricter instructions.
    # NOTE(review): the retry goes through the same ``_qwen_translate``
    # call as the first attempt; confirm where the "stricter" prompt lives.
    if missing_local_indices:
        retry_segments = [translatable_segments[li] for li in missing_local_indices]
        logger.info(
            "Qwen3Translator: retrying %d/%d segments after first parse",
            len(retry_segments),
            len(translatable_segments),
        )
        retry_results = self._qwen_translate(retry_segments, target_lang, effective_source)
        # retry_results uses 0..len(retry_segments)-1 as keys; map back.
        for retry_local, original_local in enumerate(missing_local_indices):
            if retry_local in retry_results:
                qwen_results[original_local] = retry_results[retry_local]
    if progress_callback is not None:
        progress_callback(0.9)

    # Anything still missing → Marian fallback (or surface as failure).
    still_missing_local = [li for li in range(len(translatable_segments)) if li not in qwen_results]
    if still_missing_local and self.marian_fallback:
        fallback_segments = [translatable_segments[li] for li in still_missing_local]
        logger.warning(
            "Qwen3Translator: falling back to Marian for %d segments after retry",
            len(fallback_segments),
        )
        # The Marian backend is created on first need and reused afterwards.
        if self._marian is None:
            self._marian = MarianTranslator(device=self.device)
        try:
            fallback_translated = self._marian.translate_segments(
                fallback_segments, target_lang=target_lang, source_lang=effective_source
            )
            # Marian's output is paired positionally with the segments sent.
            for li, ts in zip(still_missing_local, fallback_translated):
                qwen_results[li] = ts.translated_text
        except Exception as exc:
            # Deliberately best-effort: a broken fallback must not abort the
            # whole translation stage.
            logger.warning("Qwen3Translator: Marian fallback failed (%s)", exc)
            # Leave them missing; they'll be recorded as failures below.

    # Whatever's still missing is a hard failure. Record original-segment
    # indices (positions in the full ``segments`` list) so the caller
    # can reconcile against translated_segments.
    for li in range(len(translatable_segments)):
        if li not in qwen_results:
            self._failures_last_call.append(translatable_indices[li])

    # Lazy import to avoid a circular dep through videopython.ai.dubbing
    # (see TYPE_CHECKING import at the top of the module).
    from videopython.ai.dubbing.models import TranslatedSegment

    # Materialize TranslatedSegments parallel to the input list.
    translated_segments: list[TranslatedSegment] = []
    # Re-key results from "position within translatable_segments" to
    # "position within segments".
    local_translation_for_orig: dict[int, str] = {}
    for li, original_idx in enumerate(translatable_indices):
        if li in qwen_results:
            local_translation_for_orig[original_idx] = qwen_results[li]

    for i, segment in enumerate(segments):
        # Non-translatable and failed segments fall through to "".
        translated_text = local_translation_for_orig.get(i, "")
        translated_segments.append(
            TranslatedSegment(
                original_segment=segment,
                translated_text=translated_text,
                source_lang=effective_source,
                target_lang=target_lang,
                speaker=segment.speaker,
                start=segment.start,
                end=segment.end,
            )
        )

    if progress_callback is not None:
        progress_callback(1.0)
    return translated_segments
|
|
365
|
+
|
|
366
|
+
@property
def translation_failures(self) -> list[int]:
    """Positions in the latest ``segments`` input that ended up untranslated.

    A new list is built on every access, so mutating the returned value
    does not touch the translator's own record. Empty when every segment
    of the last call was translated.
    """
    return [*self._failures_last_call]
|
|
372
|
+
|
|
373
|
+
def unload(self) -> None:
    """Drop the loaded model(s) so the next translation re-initializes them.

    Used by :class:`LocalDubbingPipeline` in ``low_memory`` mode. The Qwen
    handle is cleared first, then the Marian fallback (if one was created)
    is unloaded and forgotten, and finally device memory is released.
    """
    self._llm = None

    fallback = self._marian
    if fallback is not None:
        fallback.unload()
        self._marian = None

    release_device_memory(self.device)
|
|
381
|
+
|
|
382
|
+
@staticmethod
def get_supported_languages() -> dict[str, str]:
    """Return a fresh copy of the language table advertised for this backend.

    Qwen handles all of Marian's language set plus more; for now only the
    Marian set (``LANGUAGE_NAMES``) is exposed, and the M2.4 eval is
    expected to add anything Qwen-only.
    """
    return dict(LANGUAGE_NAMES)
|
|
388
|
+
|
|
389
|
+
@classmethod
def supports(cls, source_lang: str, target_lang: str) -> bool:
    """Coverage hint for the M2.3 ``auto`` resolver.

    Identical source and target always count as supported; any other
    pair is supported only when both codes appear in ``LANGUAGE_NAMES``.
    """
    if source_lang != target_lang:
        return source_lang in LANGUAGE_NAMES and target_lang in LANGUAGE_NAMES
    return True
|
|
@@ -1,13 +1,50 @@
|
|
|
1
|
-
"""Text translation
|
|
1
|
+
"""Text translation backends.
|
|
2
|
+
|
|
3
|
+
Two backends share the :class:`TranslationBackend` protocol:
|
|
4
|
+
|
|
5
|
+
- :class:`MarianTranslator` (HuggingFace Helsinki-NLP MarianMT) — fast,
|
|
6
|
+
segment-isolated, available for ~30 language pairs. Default on CPU.
|
|
7
|
+
- :class:`Qwen3Translator` (Qwen3-4B/8B/14B-Instruct via llama-cpp-python) —
|
|
8
|
+
slower but produces context-aware, length-budgeted translations. Default
|
|
9
|
+
on GPU.
|
|
10
|
+
|
|
11
|
+
The pipeline picks via :mod:`videopython.ai.dubbing.pipeline` based on a
|
|
12
|
+
``translator`` kwarg (``"auto"`` resolves at runtime).
|
|
13
|
+
"""
|
|
2
14
|
|
|
3
15
|
from __future__ import annotations
|
|
4
16
|
|
|
5
|
-
from typing import Any, Callable
|
|
17
|
+
from typing import TYPE_CHECKING, Any, Callable, Protocol, runtime_checkable
|
|
6
18
|
|
|
7
19
|
from videopython.ai._device import log_device_initialization, release_device_memory, select_device
|
|
8
|
-
from videopython.ai.dubbing.models import TranslatedSegment
|
|
9
20
|
from videopython.base.text.transcription import TranscriptionSegment
|
|
10
21
|
|
|
22
|
+
# Imported under TYPE_CHECKING to avoid a circular dep through
|
|
23
|
+
# videopython.ai.dubbing (the dubbing pipeline imports both
|
|
24
|
+
# MarianTranslator and Qwen3Translator, which both import
|
|
25
|
+
# TranslatedSegment from dubbing.models). Runtime users do a lazy
|
|
26
|
+
# local import inside translate_segments.
|
|
27
|
+
if TYPE_CHECKING:
|
|
28
|
+
from videopython.ai.dubbing.models import TranslatedSegment
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class UnsupportedLanguageError(ValueError):
    """Raised when no available translation backend supports a given
    ``(source, target)`` language pair.

    Carries the requested pair so callers can introspect:

        try:
            dubber.dub(video, target_lang="xh")
        except UnsupportedLanguageError as e:
            print(f"No backend covers {e.source_lang}->{e.target_lang}")

    Attributes:
        source_lang: Language code that was requested as the source.
        target_lang: Language code that was requested as the target.
    """

    def __init__(self, source_lang: str, target_lang: str, message: str | None = None):
        """Store the requested pair and build the exception message.

        Args:
            source_lang: Requested source language code.
            target_lang: Requested target language code.
            message: Optional custom message. When ``None`` or empty, a
                default ``"No translation backend supports src->tgt"``
                message is used.
        """
        self.source_lang = source_lang
        self.target_lang = target_lang
        super().__init__(message or f"No translation backend supports {source_lang}->{target_lang}")

    def __reduce__(self):
        """Make the exception survive ``copy`` and ``pickle`` round-trips.

        The default ``BaseException`` reduction re-invokes the class with
        ``self.args`` only, which here is just the message. That call is
        missing the required ``target_lang`` argument and raises
        ``TypeError``, e.g. when the error crosses a process boundary.
        Replaying the full constructor signature avoids that; the instance
        ``__dict__`` is passed along so extra attributes are preserved.
        """
        return (
            type(self),
            (self.source_lang, self.target_lang, str(self)),
            dict(self.__dict__),
        )
|
|
47
|
+
|
|
11
48
|
|
|
12
49
|
def _is_translatable_text(text: str) -> bool:
|
|
13
50
|
"""Return True if text has enough content to be worth translating.
|
|
@@ -19,6 +56,36 @@ def _is_translatable_text(text: str) -> bool:
|
|
|
19
56
|
return sum(1 for c in text if c.isalnum()) >= 2
|
|
20
57
|
|
|
21
58
|
|
|
59
|
+
@runtime_checkable
class TranslationBackend(Protocol):
    """Pipeline-facing translation interface.

    Both :class:`MarianTranslator` and :class:`Qwen3Translator` satisfy
    this. The pipeline only depends on these methods.
    """

    # Translate ``segments`` into ``target_lang`` and return the translated
    # segments. ``source_lang`` may be omitted; the implementations visible
    # in this package fall back to ``"en"`` when it is falsy.
    # ``progress_callback``, when provided, receives float progress values.
    def translate_segments(
        self,
        segments: list[TranscriptionSegment],
        target_lang: str,
        source_lang: str | None = None,
        progress_callback: Callable[[float], None] | None = None,
    ) -> list[TranslatedSegment]: ...

    # Release whatever model state the backend holds.
    def unload(self) -> None: ...

    @property
    def translation_failures(self) -> list[int]:
        """Indices into the most recent ``segments`` input where the backend
        could not produce a translation. Empty for backends that never fail
        per-segment (e.g. MarianTranslator). The dubbing pipeline copies
        this onto :class:`DubbingResult.translation_failures`."""
        ...

    # Mapping of language code to human-readable name (e.g. "en" -> "English")
    # for the languages the backend advertises.
    @staticmethod
    def get_supported_languages() -> dict[str, str]: ...
|
|
87
|
+
|
|
88
|
+
|
|
22
89
|
LANGUAGE_NAMES = {
|
|
23
90
|
"en": "English",
|
|
24
91
|
"es": "Spanish",
|
|
@@ -56,8 +123,8 @@ LANGUAGE_NAMES = {
|
|
|
56
123
|
}
|
|
57
124
|
|
|
58
125
|
|
|
59
|
-
class
|
|
60
|
-
"""Translates text between languages using local
|
|
126
|
+
class MarianTranslator:
|
|
127
|
+
"""Translates text between languages using local Helsinki-NLP MarianMT models."""
|
|
61
128
|
|
|
62
129
|
# Languages without a direct opus-mt-{src}-{tgt} model. Maps (source, target)
|
|
63
130
|
# to an alternative HuggingFace model identifier.
|
|
@@ -68,6 +135,25 @@ class TextTranslator:
|
|
|
68
135
|
("en", "pl"): "Helsinki-NLP/opus-mt-en-zlw",
|
|
69
136
|
}
|
|
70
137
|
|
|
138
|
+
@classmethod
def has_model_for(cls, source_lang: str, target_lang: str) -> bool:
    """Tell whether Marian has (or probably has) a model for the given pair.

    The answer is True when:

    - source and target are the same language (translation is the identity), or
    - ``_MODEL_OVERRIDES`` holds an entry for ``(source_lang, target_lang)``, or
    - both codes are present in :data:`LANGUAGE_NAMES`.

    The last rule is a permissive proxy: Marian publishes
    ``opus-mt-{src}-{tgt}`` for most of the ISO-639-1 pairs exposed here,
    but not all (e.g. some Asian-to-Asian pairs route through English).
    The M2.3 ``auto`` resolver uses this as a *coverage hint*; whether the
    model really exists is only established at first-use download time.
    """
    same_language = source_lang == target_lang
    if same_language or (source_lang, target_lang) in cls._MODEL_OVERRIDES:
        return True
    return all(code in LANGUAGE_NAMES for code in (source_lang, target_lang))
|
|
156
|
+
|
|
71
157
|
def __init__(self, model_name: str | None = None, device: str | None = None):
|
|
72
158
|
self.model_name = model_name
|
|
73
159
|
self.device = device
|
|
@@ -194,6 +280,10 @@ class TextTranslator:
|
|
|
194
280
|
callers can render translation-stage progress without knowing the
|
|
195
281
|
batch size.
|
|
196
282
|
"""
|
|
283
|
+
# Lazy import to avoid a circular dep through videopython.ai.dubbing
|
|
284
|
+
# (see TYPE_CHECKING import at the top of the module).
|
|
285
|
+
from videopython.ai.dubbing.models import TranslatedSegment
|
|
286
|
+
|
|
197
287
|
effective_source = source_lang or "en"
|
|
198
288
|
|
|
199
289
|
translatable_indices = [i for i, segment in enumerate(segments) if _is_translatable_text(segment.text)]
|
|
@@ -230,6 +320,20 @@ class TextTranslator:
|
|
|
230
320
|
self._current_lang_pair = None
|
|
231
321
|
release_device_memory(self.device)
|
|
232
322
|
|
|
323
|
+
@property
def translation_failures(self) -> list[int]:
    """Always an empty list.

    Marian never fails on an individual segment: in the worst case it
    yields poor output rather than no output. The property exists so the
    class satisfies the :class:`TranslationBackend` protocol.
    """
    no_failures: list[int] = []
    return no_failures
|
|
329
|
+
|
|
233
330
|
@staticmethod
def get_supported_languages() -> dict[str, str]:
    """Return a fresh copy of ``LANGUAGE_NAMES`` (language code -> name)."""
    return dict(LANGUAGE_NAMES)
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
# Back-compat alias. ``TextTranslator`` was the class name through 0.28.x;
|
|
336
|
+
# 0.29.0 renames to ``MarianTranslator`` to make room for ``Qwen3Translator``
|
|
337
|
+
# behind a shared :class:`TranslationBackend` protocol. The alias will be
|
|
338
|
+
# removed in 0.30.0.
|
|
339
|
+
TextTranslator = MarianTranslator
|
|
@@ -11,6 +11,36 @@ from videopython.base.text.transcription import Transcription, TranscriptionSegm
|
|
|
11
11
|
from videopython.base.video import Video
|
|
12
12
|
|
|
13
13
|
|
|
14
|
+
def _attach_confidence_by_overlap(
|
|
15
|
+
target_segments: list[TranscriptionSegment],
|
|
16
|
+
source_segments: list[TranscriptionSegment],
|
|
17
|
+
) -> None:
|
|
18
|
+
"""Stamp Whisper confidence (avg_logprob, no_speech_prob, compression_ratio)
|
|
19
|
+
onto ``target_segments`` from the ``source_segments`` they overlap most with.
|
|
20
|
+
|
|
21
|
+
Used to re-attach per-segment confidence after diarization rebuilds segments
|
|
22
|
+
from words and drops the original Whisper-segment metadata. Whisper's
|
|
23
|
+
confidence is window-level, not phoneme-level, so overlap-by-time is the
|
|
24
|
+
right granularity — re-deriving per-word and re-aggregating wouldn't be
|
|
25
|
+
more accurate.
|
|
26
|
+
|
|
27
|
+
Mutates ``target_segments`` in place. Segments with no overlap to any
|
|
28
|
+
source segment are left untouched (their confidence stays None).
|
|
29
|
+
"""
|
|
30
|
+
for tgt in target_segments:
|
|
31
|
+
best_overlap = 0.0
|
|
32
|
+
best_src: TranscriptionSegment | None = None
|
|
33
|
+
for src in source_segments:
|
|
34
|
+
overlap = max(0.0, min(tgt.end, src.end) - max(tgt.start, src.start))
|
|
35
|
+
if overlap > best_overlap:
|
|
36
|
+
best_overlap = overlap
|
|
37
|
+
best_src = src
|
|
38
|
+
if best_src is not None:
|
|
39
|
+
tgt.avg_logprob = best_src.avg_logprob
|
|
40
|
+
tgt.no_speech_prob = best_src.no_speech_prob
|
|
41
|
+
tgt.compression_ratio = best_src.compression_ratio
|
|
42
|
+
|
|
43
|
+
|
|
14
44
|
class AudioToText:
|
|
15
45
|
"""Transcription service for audio and video using local Whisper models.
|
|
16
46
|
|
|
@@ -295,6 +325,13 @@ class AudioToText:
|
|
|
295
325
|
|
|
296
326
|
transcription = self._process_transcription_result(transcription_result)
|
|
297
327
|
|
|
328
|
+
# Capture original Whisper segments before flattening to words. The
|
|
329
|
+
# diarization rebuild via Transcription(words=...) regroups by speaker,
|
|
330
|
+
# which loses the per-segment confidence M1.3 plumbed through. We
|
|
331
|
+
# re-attach by max-overlap match below so M2's confidence-aware
|
|
332
|
+
# translation prompts have signal on the diarized path too.
|
|
333
|
+
whisper_segments = transcription.segments
|
|
334
|
+
|
|
298
335
|
all_words: list[TranscriptionWord] = []
|
|
299
336
|
for seg in transcription.segments:
|
|
300
337
|
all_words.extend(seg.words)
|
|
@@ -302,7 +339,9 @@ class AudioToText:
|
|
|
302
339
|
if all_words:
|
|
303
340
|
all_words = self._assign_speakers_to_words(all_words, diarization_result)
|
|
304
341
|
|
|
305
|
-
|
|
342
|
+
rebuilt = Transcription(words=all_words, language=transcription.language)
|
|
343
|
+
_attach_confidence_by_overlap(rebuilt.segments, whisper_segments)
|
|
344
|
+
return rebuilt
|
|
306
345
|
|
|
307
346
|
def _transcribe_local(self, audio: Audio) -> Transcription:
|
|
308
347
|
"""Transcribe using local Whisper model.
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|