videopython 0.32.0__tar.gz → 0.33.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. {videopython-0.32.0 → videopython-0.33.0}/PKG-INFO +1 -1
  2. {videopython-0.32.0 → videopython-0.33.0}/pyproject.toml +1 -1
  3. {videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/dubbing/__init__.py +2 -0
  4. videopython-0.33.0/src/videopython/ai/dubbing/config.py +80 -0
  5. {videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/dubbing/dubber.py +12 -76
  6. videopython-0.33.0/src/videopython/ai/dubbing/expressiveness.py +47 -0
  7. videopython-0.33.0/src/videopython/ai/dubbing/loudness.py +86 -0
  8. {videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/dubbing/models.py +50 -69
  9. {videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/dubbing/pipeline.py +69 -339
  10. {videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/dubbing/quality.py +5 -26
  11. videopython-0.33.0/src/videopython/ai/dubbing/voice_sample.py +152 -0
  12. videopython-0.33.0/src/videopython/ai/video_analysis/__init__.py +39 -0
  13. videopython-0.33.0/src/videopython/ai/video_analysis/analyzer.py +490 -0
  14. videopython-0.33.0/src/videopython/ai/video_analysis/models.py +228 -0
  15. videopython-0.33.0/src/videopython/ai/video_analysis/sampling.py +113 -0
  16. videopython-0.33.0/src/videopython/ai/video_analysis/stages.py +354 -0
  17. videopython-0.32.0/src/videopython/ai/video_analysis.py +0 -1181
  18. {videopython-0.32.0 → videopython-0.33.0}/.gitignore +0 -0
  19. {videopython-0.32.0 → videopython-0.33.0}/LICENSE +0 -0
  20. {videopython-0.32.0 → videopython-0.33.0}/README.md +0 -0
  21. {videopython-0.32.0 → videopython-0.33.0}/src/videopython/__init__.py +0 -0
  22. {videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/__init__.py +0 -0
  23. {videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/_device.py +0 -0
  24. {videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/dubbing/remux.py +0 -0
  25. {videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/dubbing/timing.py +0 -0
  26. {videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/generation/__init__.py +0 -0
  27. {videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/generation/audio.py +0 -0
  28. {videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/generation/image.py +0 -0
  29. {videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/generation/qwen3.py +0 -0
  30. {videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/generation/translation.py +0 -0
  31. {videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/generation/video.py +0 -0
  32. {videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/transforms.py +0 -0
  33. {videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/understanding/__init__.py +0 -0
  34. {videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/understanding/audio.py +0 -0
  35. {videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/understanding/faces.py +0 -0
  36. {videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/understanding/image.py +0 -0
  37. {videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/understanding/separation.py +0 -0
  38. {videopython-0.32.0 → videopython-0.33.0}/src/videopython/ai/understanding/temporal.py +0 -0
  39. {videopython-0.32.0 → videopython-0.33.0}/src/videopython/audio/__init__.py +0 -0
  40. {videopython-0.32.0 → videopython-0.33.0}/src/videopython/audio/analysis.py +0 -0
  41. {videopython-0.32.0 → videopython-0.33.0}/src/videopython/audio/audio.py +0 -0
  42. {videopython-0.32.0 → videopython-0.33.0}/src/videopython/base/__init__.py +0 -0
  43. {videopython-0.32.0 → videopython-0.33.0}/src/videopython/base/_dimensions.py +0 -0
  44. {videopython-0.32.0 → videopython-0.33.0}/src/videopython/base/_ffmpeg.py +0 -0
  45. {videopython-0.32.0 → videopython-0.33.0}/src/videopython/base/_video_io.py +0 -0
  46. {videopython-0.32.0 → videopython-0.33.0}/src/videopython/base/description.py +0 -0
  47. {videopython-0.32.0 → videopython-0.33.0}/src/videopython/base/exceptions.py +0 -0
  48. {videopython-0.32.0 → videopython-0.33.0}/src/videopython/base/image_text.py +0 -0
  49. {videopython-0.32.0 → videopython-0.33.0}/src/videopython/base/transcription.py +0 -0
  50. {videopython-0.32.0 → videopython-0.33.0}/src/videopython/base/video.py +0 -0
  51. {videopython-0.32.0 → videopython-0.33.0}/src/videopython/editing/__init__.py +0 -0
  52. {videopython-0.32.0 → videopython-0.33.0}/src/videopython/editing/effects.py +0 -0
  53. {videopython-0.32.0 → videopython-0.33.0}/src/videopython/editing/operation.py +0 -0
  54. {videopython-0.32.0 → videopython-0.33.0}/src/videopython/editing/streaming.py +0 -0
  55. {videopython-0.32.0 → videopython-0.33.0}/src/videopython/editing/transcription_overlay.py +0 -0
  56. {videopython-0.32.0 → videopython-0.33.0}/src/videopython/editing/transforms.py +0 -0
  57. {videopython-0.32.0 → videopython-0.33.0}/src/videopython/editing/video_edit.py +0 -0
  58. {videopython-0.32.0 → videopython-0.33.0}/src/videopython/py.typed +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: videopython
3
- Version: 0.32.0
3
+ Version: 0.33.0
4
4
  Summary: Minimal video generation and processing library.
5
5
  Project-URL: Homepage, https://videopython.com
6
6
  Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "videopython"
3
- version = "0.32.0"
3
+ version = "0.33.0"
4
4
  description = "Minimal video generation and processing library."
5
5
  authors = [
6
6
  { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
@@ -1,5 +1,6 @@
1
1
  """Local video dubbing functionality."""
2
2
 
3
+ from videopython.ai.dubbing.config import DubbingConfig
3
4
  from videopython.ai.dubbing.dubber import VideoDubber
4
5
  from videopython.ai.dubbing.models import (
5
6
  DubbingResult,
@@ -15,6 +16,7 @@ from videopython.ai.generation.translation import UnsupportedLanguageError
15
16
 
16
17
  __all__ = [
17
18
  "VideoDubber",
19
+ "DubbingConfig",
18
20
  "DubbingResult",
19
21
  "RevoiceResult",
20
22
  "TranslatedSegment",
@@ -0,0 +1,80 @@
1
+ """Configuration model for the dubbing pipeline."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Literal
6
+
7
+ from pydantic import BaseModel, ConfigDict
8
+
9
+ TranslatorChoice = Literal["auto", "marian", "qwen3"]
10
+ WhisperModel = Literal["tiny", "base", "small", "medium", "large", "turbo"]
11
+
12
+
13
+ class DubbingConfig(BaseModel):
14
+ """Knobs shared by :class:`VideoDubber` and :class:`LocalDubbingPipeline`.
15
+
16
+ Accepted as either ``config=DubbingConfig(...)`` or flat kwargs on the
17
+ two constructors; the flat path builds a ``DubbingConfig`` internally.
18
+
19
+ Attributes:
20
+ device: Execution device (``cpu``, ``cuda``, ``mps``, or ``None`` for auto).
21
+ low_memory: When True, each pipeline stage (Whisper, Demucs, MarianMT,
22
+ Chatterbox TTS) is unloaded from memory after it runs, so only one
23
+ model is resident at a time. Trades per-run latency (~10-30s of
24
+ extra model loads) for a much lower memory ceiling. Recommended
25
+ for GPUs with <=12GB VRAM or hosts with <32GB RAM. Default False.
26
+ whisper_model: Whisper model size used for transcription. Larger
27
+ models give better accuracy at the cost of VRAM and latency. One
28
+ of ``tiny``, ``base``, ``small``, ``medium``, ``large``, ``turbo``.
29
+ Default ``turbo``.
30
+ condition_on_previous_text: Forwarded to ``AudioToText``. Defaults to
31
+ ``False`` (Whisper's own default is ``True``). With conditioning
32
+ on, a single hallucinated filler phrase cascades through the rest
33
+ of the file. See ``AudioToText`` for the full rationale.
34
+ no_speech_threshold: Forwarded to ``AudioToText``. Whisper's
35
+ no-speech gate; raise to drop more low-confidence windows.
36
+ logprob_threshold: Forwarded to ``AudioToText``. Whisper's average
37
+ log-probability gate.
38
+ vocabulary: Forwarded to ``AudioToText``. Optional list of brand
39
+ names, product names, or proper nouns to bias Whisper's
40
+ first-window decoder via ``initial_prompt``. Recovers
41
+ near-mishears (e.g. Klarna -> "carna") on brand-monitoring
42
+ inputs without new model deps.
43
+ strict_quality: When True, the pipeline raises
44
+ :class:`GarbageTranscriptError` before Demucs/translation/TTS
45
+ run if the transcript-quality heuristic returns ``"reject"``.
46
+ When False (default), low-quality transcripts are logged at
47
+ WARNING but processing continues. Either way the
48
+ :class:`TranscriptQuality` is exposed on ``DubbingResult`` for
49
+ inspection.
50
+ translator: Translation backend to use. ``"auto"`` (default) picks
51
+ Qwen3 on GPU, MarianMT on CPU; ``"marian"`` and ``"qwen3"`` force
52
+ the named backend regardless of device. See
53
+ :class:`videopython.ai.generation.qwen3.Qwen3Translator` for
54
+ tradeoffs (Qwen3 is slower on CPU but produces context-aware,
55
+ length-budgeted output).
56
+ """
57
+
58
+ model_config = ConfigDict(frozen=True)
59
+
60
+ device: str | None = None
61
+ low_memory: bool = False
62
+ whisper_model: WhisperModel = "turbo"
63
+ condition_on_previous_text: bool = False
64
+ no_speech_threshold: float = 0.6
65
+ logprob_threshold: float | None = -1.0
66
+ vocabulary: list[str] | None = None
67
+ strict_quality: bool = False
68
+ translator: TranslatorChoice = "auto"
69
+
70
+ def init_log_fields(self) -> dict[str, object]:
71
+ """Subset of fields surfaced in the init-log line.
72
+
73
+ Hand-picked so log noise stays bounded as the config grows.
74
+ """
75
+ return {
76
+ "device": self.device.lower() if isinstance(self.device, str) else "auto",
77
+ "low_memory": self.low_memory,
78
+ "whisper_model": self.whisper_model,
79
+ "translator": self.translator,
80
+ }
@@ -6,8 +6,8 @@ import logging
6
6
  from pathlib import Path
7
7
  from typing import TYPE_CHECKING, Any, Callable
8
8
 
9
+ from videopython.ai.dubbing.config import DubbingConfig
9
10
  from videopython.ai.dubbing.models import DubbingResult, RevoiceResult
10
- from videopython.ai.dubbing.pipeline import TranslatorChoice, WhisperModel
11
11
 
12
12
  if TYPE_CHECKING:
13
13
  from videopython.base.video import Video
@@ -18,90 +18,26 @@ logger = logging.getLogger(__name__)
18
18
  class VideoDubber:
19
19
  """Dubs videos into different languages using the local pipeline.
20
20
 
21
- Args:
22
- device: Execution device (``cpu``, ``cuda``, ``mps``, or ``None`` for auto).
23
- low_memory: When True, each pipeline stage (Whisper, Demucs, MarianMT,
24
- Chatterbox TTS) is unloaded from memory after it runs, so only one
25
- model is resident at a time. Trades per-run latency (~10-30s of
26
- extra model loads) for a much lower memory ceiling. Recommended for
27
- GPUs with <=12GB VRAM or hosts with <32GB RAM. Default False.
28
- whisper_model: Whisper model size used for transcription. Larger models
29
- give better accuracy at the cost of VRAM and latency. One of
30
- ``tiny``, ``base``, ``small``, ``medium``, ``large``, ``turbo``.
31
- Default ``turbo``.
32
- condition_on_previous_text: Forwarded to ``AudioToText``. Defaults to
33
- ``False`` (Whisper's own default is ``True``). With conditioning on,
34
- a single hallucinated filler phrase cascades through the rest of
35
- the file. See ``AudioToText`` for the full rationale.
36
- no_speech_threshold: Forwarded to ``AudioToText``. Whisper's no-speech
37
- gate; raise to drop more low-confidence windows.
38
- logprob_threshold: Forwarded to ``AudioToText``. Whisper's average
39
- log-probability gate.
40
- vocabulary: Forwarded to ``AudioToText``. Optional list of brand
41
- names, product names, or proper nouns to bias Whisper's first-
42
- window decoder via ``initial_prompt``. Recovers near-mishears
43
- (e.g. Klarna → "carna") on brand-monitoring inputs without new
44
- model deps.
45
- strict_quality: When True, the pipeline raises
46
- :class:`GarbageTranscriptError` before Demucs/translation/TTS run
47
- if the transcript-quality heuristic returns ``"reject"``. When
48
- False (default), low-quality transcripts are logged at WARNING
49
- but processing continues. Either way the
50
- :class:`TranscriptQuality` is exposed on ``DubbingResult`` for
51
- inspection.
52
- translator: Translation backend to use. ``"auto"`` (default)
53
- picks Qwen3 on GPU, MarianMT on CPU; ``"marian"`` and
54
- ``"qwen3"`` force the named backend regardless of device.
55
- See :class:`videopython.ai.generation.qwen3.Qwen3Translator`
56
- for tradeoffs (Qwen3 is slower on CPU but produces
57
- context-aware, length-budgeted output).
21
+ Accepts either a :class:`DubbingConfig` or the same knobs as flat kwargs
22
+ (``device``, ``low_memory``, ``whisper_model``, ``translator``, etc.) --
23
+ the flat path builds a ``DubbingConfig`` internally. See
24
+ :class:`DubbingConfig` for the full knob list and defaults.
58
25
  """
59
26
 
60
- def __init__(
61
- self,
62
- device: str | None = None,
63
- low_memory: bool = False,
64
- whisper_model: WhisperModel = "turbo",
65
- condition_on_previous_text: bool = False,
66
- no_speech_threshold: float = 0.6,
67
- logprob_threshold: float | None = -1.0,
68
- vocabulary: list[str] | None = None,
69
- strict_quality: bool = False,
70
- translator: TranslatorChoice = "auto",
71
- ):
72
- self.device = device
73
- self.low_memory = low_memory
74
- self.whisper_model = whisper_model
75
- self.condition_on_previous_text = condition_on_previous_text
76
- self.no_speech_threshold = no_speech_threshold
77
- self.logprob_threshold = logprob_threshold
78
- self.vocabulary = vocabulary
79
- self.strict_quality = strict_quality
80
- self.translator = translator
27
+ def __init__(self, config: DubbingConfig | None = None, **kwargs: Any):
28
+ if config is not None and kwargs:
29
+ raise TypeError("Pass either `config=` or knob kwargs, not both")
30
+ self.config = config or DubbingConfig(**kwargs)
81
31
  self._local_pipeline: Any = None
82
- requested = device.lower() if isinstance(device, str) else "auto"
83
32
  logger.info(
84
- "VideoDubber initialized with device=%s low_memory=%s whisper_model=%s translator=%s",
85
- requested,
86
- low_memory,
87
- whisper_model,
88
- translator,
33
+ "VideoDubber initialized with %s",
34
+ " ".join(f"{k}={v}" for k, v in self.config.init_log_fields().items()),
89
35
  )
90
36
 
91
37
  def _init_local_pipeline(self) -> None:
92
38
  from videopython.ai.dubbing.pipeline import LocalDubbingPipeline
93
39
 
94
- self._local_pipeline = LocalDubbingPipeline(
95
- device=self.device,
96
- low_memory=self.low_memory,
97
- whisper_model=self.whisper_model,
98
- condition_on_previous_text=self.condition_on_previous_text,
99
- no_speech_threshold=self.no_speech_threshold,
100
- logprob_threshold=self.logprob_threshold,
101
- vocabulary=self.vocabulary,
102
- strict_quality=self.strict_quality,
103
- translator=self.translator,
104
- )
40
+ self._local_pipeline = LocalDubbingPipeline(config=self.config)
105
41
 
106
42
  def dub(
107
43
  self,
@@ -0,0 +1,47 @@
1
+ """Source-prosody-driven expressiveness knobs for Chatterbox TTS."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import TYPE_CHECKING
6
+
7
+ import numpy as np
8
+
9
+ from videopython.ai.dubbing.models import Expressiveness
10
+
11
+ if TYPE_CHECKING:
12
+ from videopython.audio import Audio
13
+
14
+
15
+ # Prosody-conditioning thresholds. Source-segment RMS / whole-vocals RMS
16
+ # below CALM lands in the calm bucket; above DRAMATIC in the dramatic
17
+ # bucket; in between gets Chatterbox's defaults. Knob values picked
18
+ # by-ear on cam1_1min.mp4 -- see RELEASE_NOTES 0.29.0.
19
+ CALM_RATIO_THRESHOLD = 0.7
20
+ DRAMATIC_RATIO_THRESHOLD = 1.3
21
+ _CALM = Expressiveness(exaggeration=0.3, cfg_weight=0.7)
22
+ _DRAMATIC = Expressiveness(exaggeration=0.85, cfg_weight=0.35)
23
+
24
+
25
+ def rms(data: np.ndarray) -> float:
26
+ """RMS over samples; ``0.0`` for empty input. float64 reduction so a
27
+ long slice can't overflow the squared accumulator."""
28
+ if data.size == 0:
29
+ return 0.0
30
+ return float(np.sqrt(np.mean(np.square(data, dtype=np.float64))))
31
+
32
+
33
+ def expressiveness_for(source_slice: Audio, baseline_rms: float) -> Expressiveness:
34
+ """Map a source vocals slice to a Chatterbox expressiveness profile
35
+ by RMS ratio. Falls back to the no-knobs default for empty or silent
36
+ inputs."""
37
+ if baseline_rms <= 0.0:
38
+ return Expressiveness()
39
+ segment_rms = rms(source_slice.data)
40
+ if segment_rms <= 0.0:
41
+ return Expressiveness()
42
+ ratio = segment_rms / baseline_rms
43
+ if ratio < CALM_RATIO_THRESHOLD:
44
+ return _CALM
45
+ if ratio > DRAMATIC_RATIO_THRESHOLD:
46
+ return _DRAMATIC
47
+ return Expressiveness()
@@ -0,0 +1,86 @@
1
+ """LUFS / peak loudness matching for dubbed audio."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import TYPE_CHECKING
6
+
7
+ import numpy as np
8
+
9
+ if TYPE_CHECKING:
10
+ from videopython.audio import Audio
11
+
12
+
13
+ # BS.1770 integrated-loudness measurement requires at least 400 ms of audio
14
+ # (one gating block). Below this, fall back to peak match -- pyloudnorm
15
+ # returns -inf or warns, neither of which gives a usable gain.
16
+ _LUFS_MIN_DURATION_SECONDS = 0.4
17
+
18
+
19
+ def peak_match(target: Audio, reference: Audio) -> Audio:
20
+ """Scale ``target`` so its peak amplitude matches ``reference``.
21
+
22
+ Used as the fallback when LUFS measurement isn't viable (clip < 0.4s
23
+ or silent input). The new ``Audio`` shares no buffer with ``target``.
24
+ """
25
+ from videopython.audio import Audio as _Audio
26
+
27
+ target_peak = float(np.max(np.abs(target.data))) if target.data.size else 0.0
28
+ reference_peak = float(np.max(np.abs(reference.data))) if reference.data.size else 0.0
29
+
30
+ if target_peak <= 0.0 or reference_peak <= 0.0:
31
+ return target
32
+
33
+ scale = reference_peak / target_peak
34
+ if abs(scale - 1.0) < 1e-3:
35
+ return target
36
+
37
+ return _Audio(target.data * scale, target.metadata)
38
+
39
+
40
+ def loudness_match(target: Audio, reference: Audio) -> Audio:
41
+ """Scale ``target`` so its integrated loudness (BS.1770 / LUFS) matches ``reference``.
42
+
43
+ Demucs background normalization and the timing-assembler peak guard
44
+ each clamp at 1.0 instead of restoring perceived loudness, so a
45
+ dubbed mix lands perceptually "thinner" than the source even after
46
+ peak match. LUFS captures the ear-weighted envelope that peak ratio
47
+ misses on dialogue-heavy material.
48
+
49
+ Falls back to :func:`peak_match` when either clip is shorter than
50
+ the BS.1770 gating block (400 ms) or when measurement returns -inf
51
+ (silent or near-silent gated content). After gain is applied, peaks
52
+ are clamped to 0.99 -- BS.1770 has no peak ceiling and a sufficiently
53
+ quiet source can demand gain that would otherwise clip.
54
+ """
55
+ from videopython.audio import Audio as _Audio
56
+
57
+ target_dur = target.metadata.duration_seconds
58
+ ref_dur = reference.metadata.duration_seconds
59
+ if target_dur < _LUFS_MIN_DURATION_SECONDS or ref_dur < _LUFS_MIN_DURATION_SECONDS:
60
+ return peak_match(target, reference)
61
+
62
+ if not target.data.size or not reference.data.size:
63
+ return target
64
+
65
+ import pyloudnorm
66
+
67
+ target_lufs = pyloudnorm.Meter(target.metadata.sample_rate).integrated_loudness(target.data)
68
+ reference_lufs = pyloudnorm.Meter(reference.metadata.sample_rate).integrated_loudness(reference.data)
69
+
70
+ # Either clip's gated content was below -70 LUFS (effectively silent
71
+ # under BS.1770). Gain would be undefined -- fall back to peak match,
72
+ # which has its own silent-input no-op.
73
+ if not np.isfinite(target_lufs) or not np.isfinite(reference_lufs):
74
+ return peak_match(target, reference)
75
+
76
+ gain_db = reference_lufs - target_lufs
77
+ if abs(gain_db) < 0.1:
78
+ return target
79
+ scale = float(10 ** (gain_db / 20.0))
80
+
81
+ scaled = target.data * scale
82
+ peak = float(np.max(np.abs(scaled)))
83
+ if peak > 0.99:
84
+ scaled = scaled * (0.99 / peak)
85
+
86
+ return _Audio(scaled, target.metadata)
@@ -2,14 +2,15 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
- from dataclasses import dataclass, field
6
- from typing import TYPE_CHECKING, Any
5
+ from typing import TYPE_CHECKING, Annotated, Any
7
6
 
7
+ from pydantic import BaseModel, BeforeValidator, ConfigDict, Field, PlainSerializer, model_validator
8
+
9
+ from videopython.ai.dubbing.quality import TranscriptQuality
8
10
  from videopython.audio import Audio
9
11
  from videopython.base.transcription import Transcription, TranscriptionSegment
10
12
 
11
13
  if TYPE_CHECKING:
12
- from videopython.ai.dubbing.quality import TranscriptQuality
13
14
  from videopython.ai.dubbing.timing import TimingAdjustment
14
15
 
15
16
 
@@ -19,11 +20,30 @@ if TYPE_CHECKING:
19
20
  CLEAN_SPEED_TOLERANCE = 0.01
20
21
 
21
22
 
22
- @dataclass(frozen=True)
23
- class Expressiveness:
23
+ # TranscriptionSegment and Transcription still live in videopython.base as
24
+ # plain dataclasses with hand-rolled to_dict/from_dict. Bridge them at
25
+ # the field boundary so the dubbing cache wire format stays identical.
26
+ def _validate_transcription_segment(value: Any) -> Any:
27
+ if value is None or isinstance(value, TranscriptionSegment):
28
+ return value
29
+ return TranscriptionSegment.from_dict(value)
30
+
31
+
32
+ def _serialize_with_to_dict(value: Any) -> Any:
33
+ return value.to_dict() if value is not None else None
34
+
35
+
36
+ _TranscriptionSegmentField = Annotated[
37
+ TranscriptionSegment,
38
+ BeforeValidator(_validate_transcription_segment),
39
+ PlainSerializer(_serialize_with_to_dict, return_type=dict, when_used="always"),
40
+ ]
41
+
42
+
43
+ class Expressiveness(BaseModel):
24
44
  """Chatterbox ``generate()`` knobs derived from source-segment prosody.
25
45
 
26
- ``None`` on any field means "let Chatterbox use its own default"
46
+ ``None`` on any field means "let Chatterbox use its own default" --
27
47
  avoids pinning the dub against future Chatterbox default changes.
28
48
 
29
49
  Attributes:
@@ -34,6 +54,8 @@ class Expressiveness:
34
54
  temperature: Sampling temperature. Chatterbox default ``0.8``.
35
55
  """
36
56
 
57
+ model_config = ConfigDict(frozen=True)
58
+
37
59
  exaggeration: float | None = None
38
60
  cfg_weight: float | None = None
39
61
  temperature: float | None = None
@@ -54,8 +76,7 @@ class Expressiveness:
54
76
  }
55
77
 
56
78
 
57
- @dataclass
58
- class TranslatedSegment:
79
+ class TranslatedSegment(BaseModel):
59
80
  """A segment of translated text with timing information.
60
81
 
61
82
  Attributes:
@@ -68,7 +89,9 @@ class TranslatedSegment:
68
89
  end: End time in seconds.
69
90
  """
70
91
 
71
- original_segment: TranscriptionSegment
92
+ model_config = ConfigDict(arbitrary_types_allowed=True)
93
+
94
+ original_segment: _TranscriptionSegmentField
72
95
  translated_text: str
73
96
  source_lang: str
74
97
  target_lang: str
@@ -76,13 +99,17 @@ class TranslatedSegment:
76
99
  start: float = 0.0
77
100
  end: float = 0.0
78
101
 
79
- def __post_init__(self) -> None:
80
- """Set timing from original segment if not provided."""
102
+ @model_validator(mode="after")
103
+ def _default_timing_from_segment(self) -> TranslatedSegment:
104
+ # ``start == end == 0.0`` is the dataclass-era sentinel for "use the
105
+ # original segment's timing." Preserved so legacy callers (and the
106
+ # dub cache wire format) keep working.
81
107
  if self.start == 0.0 and self.end == 0.0:
82
108
  self.start = self.original_segment.start
83
109
  self.end = self.original_segment.end
84
110
  if self.speaker is None:
85
111
  self.speaker = self.original_segment.speaker
112
+ return self
86
113
 
87
114
  @property
88
115
  def original_text(self) -> str:
@@ -94,34 +121,8 @@ class TranslatedSegment:
94
121
  """Duration of the segment in seconds."""
95
122
  return self.end - self.start
96
123
 
97
- def to_dict(self) -> dict[str, Any]:
98
- """Convert to dictionary for JSON serialization (used by the dub cache)."""
99
- return {
100
- "original_segment": self.original_segment.to_dict(),
101
- "translated_text": self.translated_text,
102
- "source_lang": self.source_lang,
103
- "target_lang": self.target_lang,
104
- "speaker": self.speaker,
105
- "start": self.start,
106
- "end": self.end,
107
- }
108
-
109
- @classmethod
110
- def from_dict(cls, data: dict[str, Any]) -> TranslatedSegment:
111
- """Reconstruct from a dict produced by :meth:`to_dict`."""
112
- return cls(
113
- original_segment=TranscriptionSegment.from_dict(data["original_segment"]),
114
- translated_text=data["translated_text"],
115
- source_lang=data["source_lang"],
116
- target_lang=data["target_lang"],
117
- speaker=data.get("speaker"),
118
- start=data.get("start", 0.0),
119
- end=data.get("end", 0.0),
120
- )
121
-
122
124
 
123
- @dataclass
124
- class SeparatedAudio:
125
+ class SeparatedAudio(BaseModel):
125
126
  """Audio separated into different components.
126
127
 
127
128
  Attributes:
@@ -132,6 +133,8 @@ class SeparatedAudio:
132
133
  original: The original unseparated audio.
133
134
  """
134
135
 
136
+ model_config = ConfigDict(arbitrary_types_allowed=True)
137
+
135
138
  vocals: Audio
136
139
  background: Audio
137
140
  original: Audio
@@ -144,8 +147,7 @@ class SeparatedAudio:
144
147
  return self.music is not None and self.effects is not None
145
148
 
146
149
 
147
- @dataclass
148
- class TimingSummary:
150
+ class TimingSummary(BaseModel):
149
151
  """Aggregate stats over per-segment timing adjustments.
150
152
 
151
153
  Surfaces how aggressively the timing synchronizer had to compress or
@@ -201,32 +203,8 @@ class TimingSummary:
201
203
  max_truncation_seconds=max_truncation,
202
204
  )
203
205
 
204
- def to_dict(self) -> dict[str, Any]:
205
- """Convert to dictionary for JSON serialization."""
206
- return {
207
- "total_segments": self.total_segments,
208
- "clean_count": self.clean_count,
209
- "stretched_count": self.stretched_count,
210
- "truncated_count": self.truncated_count,
211
- "mean_speed_factor": self.mean_speed_factor,
212
- "max_truncation_seconds": self.max_truncation_seconds,
213
- }
214
206
 
215
- @classmethod
216
- def from_dict(cls, data: dict[str, Any]) -> TimingSummary:
217
- """Create TimingSummary from dictionary."""
218
- return cls(
219
- total_segments=data["total_segments"],
220
- clean_count=data["clean_count"],
221
- stretched_count=data["stretched_count"],
222
- truncated_count=data["truncated_count"],
223
- mean_speed_factor=data["mean_speed_factor"],
224
- max_truncation_seconds=data["max_truncation_seconds"],
225
- )
226
-
227
-
228
- @dataclass
229
- class DubbingResult:
207
+ class DubbingResult(BaseModel):
230
208
  """Result of a video dubbing operation.
231
209
 
232
210
  Attributes:
@@ -247,16 +225,18 @@ class DubbingResult:
247
225
  no failure mode that drops segments).
248
226
  """
249
227
 
228
+ model_config = ConfigDict(arbitrary_types_allowed=True)
229
+
250
230
  dubbed_audio: Audio
251
231
  translated_segments: list[TranslatedSegment]
252
232
  source_transcription: Transcription
253
233
  source_lang: str
254
234
  target_lang: str
255
235
  separated_audio: SeparatedAudio | None = None
256
- voice_samples: dict[str, Audio] = field(default_factory=dict)
236
+ voice_samples: dict[str, Audio] = Field(default_factory=dict)
257
237
  timing_summary: TimingSummary | None = None
258
238
  transcript_quality: TranscriptQuality | None = None
259
- translation_failures: list[int] = field(default_factory=list)
239
+ translation_failures: list[int] = Field(default_factory=list)
260
240
 
261
241
  @property
262
242
  def num_segments(self) -> int:
@@ -283,8 +263,7 @@ class DubbingResult:
283
263
  return segments_by_speaker
284
264
 
285
265
 
286
- @dataclass
287
- class RevoiceResult:
266
+ class RevoiceResult(BaseModel):
288
267
  """Result of a voice replacement operation.
289
268
 
290
269
  Attributes:
@@ -296,6 +275,8 @@ class RevoiceResult:
296
275
  speech_duration: Duration of the generated speech.
297
276
  """
298
277
 
278
+ model_config = ConfigDict(arbitrary_types_allowed=True)
279
+
299
280
  revoiced_audio: Audio
300
281
  text: str
301
282
  separated_audio: SeparatedAudio | None = None