PyPI - videopython - Versions diffs - 0.26.5.tar.gz → 0.26.7.tar.gz - Mend

videopython 0.26.5.tar.gz → 0.26.7.tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (57)

{videopython-0.26.5 → videopython-0.26.7}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: videopython
-Version: 0.26.5
+Version: 0.26.7
 Summary: Minimal video generation and processing library.
 Project-URL: Homepage, https://videopython.com
 Project-URL: Repository, https://github.com/bartwojtowicz/videopython/

{videopython-0.26.5 → videopython-0.26.7}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "videopython"
-version = "0.26.5"
+version = "0.26.7"
 description = "Minimal video generation and processing library."
 authors = [
     { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },

{videopython-0.26.5 → videopython-0.26.7}/src/videopython/ai/dubbing/dubber.py RENAMED Viewed

@@ -3,7 +3,6 @@
 from __future__ import annotations
 import logging
-import tempfile
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Callable
@@ -74,9 +73,14 @@ class VideoDubber:
         Args:
             enable_diarization: Enable speaker diarization to clone each speaker's
-                voice separately. Requires additional VRAM for the diarization model.
-            transcription: Optional pre-computed Transcription object. When provided,
-                the internal Whisper transcription step is skipped.
+                voice separately. With ``transcription=None``, runs alongside Whisper.
+                With a supplied ``transcription`` that has no speakers, runs pyannote
+                standalone and overlays speakers onto the supplied words. Ignored when
+                the supplied transcription already has speaker labels.
+            transcription: Optional pre-computed ``Transcription`` to skip the Whisper
+                step. Speaker labels on the supplied transcription drive per-speaker
+                voice cloning. If it has no speakers, pass ``enable_diarization=True``
+                to add them via pyannote (requires word-level timings).
         """
         if self._local_pipeline is None:
             self._init_local_pipeline()
@@ -106,8 +110,10 @@ class VideoDubber:
         """Dub a video and return a new video with the dubbed audio.
         Args:
-            transcription: Optional pre-computed Transcription object. When provided,
-                the internal Whisper transcription step is skipped.
+            transcription: Optional pre-computed ``Transcription`` to skip the Whisper
+                step. Speaker labels on the supplied transcription drive per-speaker
+                voice cloning. See ``dub()`` for the interaction with
+                ``enable_diarization``.
         """
         result = self.dub(
             video=video,
@@ -152,14 +158,18 @@ class VideoDubber:
             preserve_background: Preserve background music/effects via source separation.
             voice_clone: Clone the source speaker's voice for the dubbed track.
             enable_diarization: Enable speaker diarization for per-speaker voice cloning.
+                See ``dub()`` for the interaction with ``transcription``.
             progress_callback: Optional callback ``(stage: str, progress: float) -> None``.
-            transcription: Optional pre-computed ``Transcription`` to skip the Whisper step.
+            transcription: Optional pre-computed ``Transcription`` to skip the Whisper
+                step. Speaker labels on the supplied transcription drive per-speaker
+                voice cloning. If it has no speakers, pass ``enable_diarization=True``
+                to add them via pyannote (requires word-level timings).
         Returns:
             ``DubbingResult`` with the dubbed audio, translated segments, and
             source transcription. The output video is written to ``output_path``.
         """
-        from videopython.ai.dubbing.remux import replace_audio_stream
+        from videopython.ai.dubbing.remux import replace_audio_stream_from_audio
         from videopython.base.audio import Audio
         input_path = Path(input_path)
@@ -185,17 +195,14 @@ class VideoDubber:
             transcription=transcription,
         )
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
-            dubbed_audio_path = Path(tmp.name)
-        try:
-            result.dubbed_audio.save(dubbed_audio_path)
-            replace_audio_stream(
-                video_path=input_path,
-                audio_path=dubbed_audio_path,
-                output_path=output_path,
-            )
-        finally:
-            dubbed_audio_path.unlink(missing_ok=True)
+        # Stream the dubbed Audio directly into ffmpeg via stdin instead of
+        # going through a temp WAV on disk. For a 2h dub the temp file would
+        # be ~10 GB written-then-read; the streaming path drops both copies.
+        replace_audio_stream_from_audio(
+            video_path=input_path,
+            audio=result.dubbed_audio,
+            output_path=output_path,
+        )
         return result

{videopython-0.26.5 → videopython-0.26.7}/src/videopython/ai/dubbing/pipeline.py RENAMED Viewed

@@ -3,6 +3,8 @@
 from __future__ import annotations
 import logging
+import tempfile
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, Callable, Literal
 from videopython.ai.dubbing.models import DubbingResult, RevoiceResult, SeparatedAudio
@@ -162,7 +164,16 @@ class LocalDubbingPipeline:
             transcription: Optional pre-computed Transcription object. When provided,
                 the internal Whisper transcription step is skipped (saving time and VRAM).
                 Must be a ``videopython.base.text.transcription.Transcription`` instance
-                with populated ``segments``.
+                with populated ``segments``. Speaker labels on the supplied transcription
+                drive per-speaker voice cloning. If the supplied transcription has no
+                speakers and ``enable_diarization=True``, pyannote is run standalone on
+                ``source_audio`` and speakers are attached to the supplied words
+                (requires word-level timings).
+            enable_diarization: When True, run speaker diarization to enable per-speaker
+                voice cloning. With ``transcription=None``, runs alongside Whisper. With
+                a supplied ``transcription`` that has no speakers, runs pyannote
+                standalone and overlays speakers onto the supplied words. Ignored when
+                the supplied transcription already has speaker labels.
         """
         def report_progress(stage: str, progress: float) -> None:
@@ -171,6 +182,34 @@ class LocalDubbingPipeline:
         if transcription is not None:
             report_progress("Using provided transcription", 0.05)
+            if transcription.speakers:
+                logger.info(
+                    "Using provided transcription: %d segment(s), %d speaker(s)",
+                    len(transcription.segments),
+                    len(transcription.speakers),
+                )
+                if enable_diarization:
+                    logger.info("enable_diarization=True ignored: supplied transcription already has speaker labels.")
+            elif enable_diarization:
+                report_progress("Diarizing supplied transcription", 0.10)
+                if self._transcriber is None or self._transcriber_diarization is not True:
+                    self._init_transcriber(enable_diarization=True)
+                    self._transcriber_diarization = True
+                transcription = self._transcriber.diarize_transcription(source_audio, transcription)
+                self._maybe_unload("_transcriber")
+                logger.info(
+                    "Diarized supplied transcription: %d segment(s), %d speaker(s)",
+                    len(transcription.segments),
+                    len(transcription.speakers),
+                )
+            else:
+                logger.info(
+                    "Using provided transcription: %d segment(s), no speaker labels. "
+                    "All segments will share a single voice clone. Pass "
+                    "enable_diarization=True to add per-speaker labels, or "
+                    "voice_clone=False to use the default TTS voice.",
+                    len(transcription.segments),
+                )
         else:
             report_progress("Transcribing audio", 0.05)
             if self._transcriber is None or self._transcriber_diarization != enable_diarization:
@@ -241,24 +280,40 @@ class LocalDubbingPipeline:
         target_durations: list[float] = []
         start_times: list[float] = []
-        for i, segment in enumerate(translated_segments):
-            if segment.duration < 0.1:
-                continue
-            progress = 0.50 + (0.30 * (i / len(translated_segments)))
-            report_progress(f"Generating speech ({i + 1}/{len(translated_segments)})", progress)
-            speaker = segment.speaker or "speaker_0"
-            voice_sample = voice_samples.get(speaker)
-            if voice_clone and voice_sample is not None:
-                dubbed_audio = self._tts.generate_audio(segment.translated_text, voice_sample=voice_sample)
-            else:
-                dubbed_audio = self._tts.generate_audio(segment.translated_text)
-            dubbed_segments.append(dubbed_audio)
-            target_durations.append(segment.duration)
-            start_times.append(segment.start)
+        # Encode each speaker's voice sample to a temp WAV exactly once and
+        # reuse the path across every segment for that speaker. Without this
+        # cache, TextToSpeech.generate_audio re-encodes the same voice sample
+        # on every call (one temp WAV write + delete per segment), which is
+        # pure overhead for long dubs with many segments per speaker.
+        speaker_wav_paths: dict[str, Path] = {}
+        try:
+            if voice_clone:
+                for speaker, sample in voice_samples.items():
+                    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
+                        sample.save(f.name)
+                        speaker_wav_paths[speaker] = Path(f.name)
+            for i, segment in enumerate(translated_segments):
+                if segment.duration < 0.1:
+                    continue
+                progress = 0.50 + (0.30 * (i / len(translated_segments)))
+                report_progress(f"Generating speech ({i + 1}/{len(translated_segments)})", progress)
+                speaker = segment.speaker or "speaker_0"
+                cached_path = speaker_wav_paths.get(speaker) if voice_clone else None
+                if cached_path is not None:
+                    dubbed_audio = self._tts.generate_audio(segment.translated_text, voice_sample_path=cached_path)
+                else:
+                    dubbed_audio = self._tts.generate_audio(segment.translated_text)
+                dubbed_segments.append(dubbed_audio)
+                target_durations.append(segment.duration)
+                start_times.append(segment.start)
+        finally:
+            for path in speaker_wav_paths.values():
+                path.unlink(missing_ok=True)
         self._maybe_unload("_tts")

videopython-0.26.7/src/videopython/ai/dubbing/remux.py ADDED Viewed

@@ -0,0 +1,159 @@
+"""ffmpeg helper for replacing a video file's audio track without re-encoding video."""
+from __future__ import annotations
+import io
+import logging
+import subprocess
+import wave
+from pathlib import Path
+from typing import TYPE_CHECKING
+import numpy as np
+if TYPE_CHECKING:
+    from videopython.base.audio import Audio
+logger = logging.getLogger(__name__)
+class RemuxError(RuntimeError):
+    """ffmpeg failed while replacing an audio stream."""
+def replace_audio_stream(
+    video_path: str | Path,
+    audio_path: str | Path,
+    output_path: str | Path,
+    audio_codec: str = "aac",
+    audio_bitrate: str = "192k",
+) -> None:
+    """Copy ``video_path``'s video stream and mux in ``audio_path`` as the audio track.
+    Uses ffmpeg stream-copy for video (no re-encode) and encodes audio to AAC.
+    ``-shortest`` trims to the shorter of the two streams so the output duration
+    matches the source video when the dubbed audio is slightly longer.
+    Args:
+        video_path: Source video file (video stream is copied unchanged).
+        audio_path: Audio file to use as the new audio track.
+        output_path: Destination file. Overwritten if it exists.
+        audio_codec: ffmpeg audio codec name. Defaults to ``aac`` (MP4-compatible).
+        audio_bitrate: Audio bitrate passed to ffmpeg (``-b:a``).
+    Raises:
+        FileNotFoundError: If ``video_path`` or ``audio_path`` does not exist.
+        RemuxError: If ffmpeg returns a non-zero exit code.
+    """
+    video_path = Path(video_path)
+    audio_path = Path(audio_path)
+    output_path = Path(output_path)
+    if not video_path.exists():
+        raise FileNotFoundError(f"Video file not found: {video_path}")
+    if not audio_path.exists():
+        raise FileNotFoundError(f"Audio file not found: {audio_path}")
+    cmd = [
+        "ffmpeg",
+        "-y",
+        "-i",
+        str(video_path),
+        "-i",
+        str(audio_path),
+        "-map",
+        "0:v:0",
+        "-map",
+        "1:a:0",
+        "-c:v",
+        "copy",
+        "-c:a",
+        audio_codec,
+        "-b:a",
+        audio_bitrate,
+        "-shortest",
+        str(output_path),
+    ]
+    logger.info("replace_audio_stream: %s + %s -> %s", video_path, audio_path, output_path)
+    result = subprocess.run(cmd, capture_output=True)
+    if result.returncode != 0:
+        raise RemuxError(f"ffmpeg failed (exit {result.returncode}): {result.stderr.decode(errors='replace')}")
+def replace_audio_stream_from_audio(
+    video_path: str | Path,
+    audio: Audio,
+    output_path: str | Path,
+    audio_codec: str = "aac",
+    audio_bitrate: str = "192k",
+) -> None:
+    """Like ``replace_audio_stream`` but takes an in-memory ``Audio`` and pipes WAV to ffmpeg.
+    Avoids the ``Audio.save -> read-from-disk -> ffmpeg`` round-trip used by
+    the path-based variant: we serialize the WAV in memory and feed it to
+    ffmpeg via stdin. For long dubs this saves a full WAV write+read of the
+    output audio (~10 GB for a 2h source).
+    Args:
+        video_path: Source video file (video stream is copied unchanged).
+        audio: ``Audio`` instance to mux in as the new audio track.
+        output_path: Destination file. Overwritten if it exists.
+        audio_codec: ffmpeg audio codec name. Defaults to ``aac``.
+        audio_bitrate: Audio bitrate passed to ffmpeg (``-b:a``).
+    Raises:
+        FileNotFoundError: If ``video_path`` does not exist.
+        RemuxError: If ffmpeg returns a non-zero exit code.
+    """
+    video_path = Path(video_path)
+    output_path = Path(output_path)
+    if not video_path.exists():
+        raise FileNotFoundError(f"Video file not found: {video_path}")
+    # Serialize Audio to WAV bytes in memory. Mirrors Audio.save's WAV writer:
+    # int16 samples, header from metadata. We stream these bytes to ffmpeg's
+    # stdin as the second input (the first is the video file on disk).
+    int_data = (audio.data * np.iinfo(np.int16).max).astype(np.int16)
+    wav_io = io.BytesIO()
+    with wave.open(wav_io, "wb") as wav_file:
+        wav_file.setnchannels(audio.metadata.channels)
+        wav_file.setsampwidth(audio.metadata.sample_width)
+        wav_file.setframerate(audio.metadata.sample_rate)
+        wav_file.writeframes(int_data.tobytes())
+    wav_bytes = wav_io.getvalue()
+    cmd = [
+        "ffmpeg",
+        "-y",
+        "-i",
+        str(video_path),
+        "-f",
+        "wav",
+        "-i",
+        "-",
+        "-map",
+        "0:v:0",
+        "-map",
+        "1:a:0",
+        "-c:v",
+        "copy",
+        "-c:a",
+        audio_codec,
+        "-b:a",
+        audio_bitrate,
+        "-shortest",
+        str(output_path),
+    ]
+    logger.info(
+        "replace_audio_stream_from_audio: %s + <stdin wav %d bytes> -> %s",
+        video_path,
+        len(wav_bytes),
+        output_path,
+    )
+    process = subprocess.Popen(cmd, stdin=subprocess.PIPE, stderr=subprocess.PIPE)
+    _, stderr = process.communicate(wav_bytes)
+    if process.returncode != 0:
+        raise RemuxError(f"ffmpeg failed (exit {process.returncode}): {stderr.decode(errors='replace')}")

{videopython-0.26.5 → videopython-0.26.7}/src/videopython/ai/dubbing/timing.py RENAMED Viewed

@@ -4,7 +4,9 @@ from __future__ import annotations
 from dataclasses import dataclass
-from videopython.base.audio import Audio
+import numpy as np
+from videopython.base.audio import Audio, AudioMetadata
 @dataclass
@@ -181,32 +183,58 @@ class TimingSynchronizer:
         if len(audio_segments) != len(start_times):
             raise ValueError(f"Length mismatch: {len(audio_segments)} segments vs {len(start_times)} start times")
+        for start_time in start_times:
+            if start_time < 0:
+                raise ValueError(f"Invalid start time: {start_time}")
         if not audio_segments:
             return Audio.create_silent(total_duration, stereo=False)
-        # Determine sample rate from first segment
+        # Single-pass assembler: allocate one mono float32 buffer and add each
+        # segment in place at its start sample. The previous implementation
+        # called Audio.overlay() per segment, which allocates np.zeros and
+        # copies the full track on every call — O(N * total_samples) memory
+        # traffic. For long dubs (thousands of segments) this loop dominated
+        # wall time and peak RAM.
         sample_rate = audio_segments[0].metadata.sample_rate
-        # Create base silent track
-        output = Audio.create_silent(total_duration, stereo=False, sample_rate=sample_rate)
-        # Overlay each segment at its start time
+        base_samples = max(int(total_duration * sample_rate), 0)
+        # Pre-normalize each segment to (mono, target sample rate) and compute
+        # placement bounds so the output buffer is sized to fit any segment
+        # that runs past total_duration (mirrors Audio.overlay's extend-on-OOB
+        # behavior so we don't silently truncate speech).
+        normalized: list[tuple[int, np.ndarray]] = []
+        end_sample = base_samples
         for audio, start_time in zip(audio_segments, start_times):
-            if start_time < 0:
-                raise ValueError(f"Invalid start time: {start_time}")
-            # Resample if needed
             if audio.metadata.sample_rate != sample_rate:
                 audio = audio.resample(sample_rate)
-            # Convert to mono if needed
             if audio.metadata.channels > 1:
                 audio = audio.to_mono()
-            # Overlay at position
-            output = output.overlay(audio, position=start_time)
-        return output
+            start_sample = int(np.ceil(start_time * sample_rate))
+            seg_data = audio.data
+            normalized.append((start_sample, seg_data))
+            end_sample = max(end_sample, start_sample + len(seg_data))
+        output = np.zeros(end_sample, dtype=np.float32)
+        for start_sample, seg_data in normalized:
+            stop = start_sample + len(seg_data)
+            output[start_sample:stop] += seg_data
+        # Single post-mix peak guard, equivalent to Audio.overlay's per-call
+        # rescale collapsed into one pass. For non-overlapping dub segments
+        # this is a no-op; only the rare overlap case touches it.
+        max_amplitude = float(np.max(np.abs(output))) if output.size else 0.0
+        if max_amplitude > 1.0:
+            output /= max_amplitude
+        metadata = AudioMetadata(
+            sample_rate=sample_rate,
+            channels=1,
+            sample_width=audio_segments[0].metadata.sample_width,
+            duration_seconds=end_sample / sample_rate,
+            frame_count=end_sample,
+        )
+        return Audio(output, metadata)
     def check_overlaps(
         self,

{videopython-0.26.5 → videopython-0.26.7}/src/videopython/ai/generation/audio.py RENAMED Viewed

@@ -2,11 +2,14 @@
 from __future__ import annotations
-from typing import Any
+from typing import TYPE_CHECKING, Any
 from videopython.ai._device import log_device_initialization, release_device_memory, select_device
 from videopython.base.audio import Audio, AudioMetadata
+if TYPE_CHECKING:
+    from pathlib import Path
 class TextToSpeech:
     """Generates speech audio from text using Chatterbox Multilingual.
@@ -47,6 +50,7 @@ class TextToSpeech:
         self,
         text: str,
         voice_sample: Audio | None = None,
+        voice_sample_path: str | Path | None = None,
     ) -> Audio:
         """Generate speech audio from text.
@@ -54,6 +58,12 @@ class TextToSpeech:
             text: Text to synthesize.
             voice_sample: Optional voice sample to clone. Falls back to the
                 instance's ``voice`` and then to Chatterbox's default speaker.
+            voice_sample_path: Optional pre-encoded WAV path to use directly as
+                the speaker prompt. Skips the per-call temp-WAV encode that
+                ``voice_sample`` would otherwise trigger. When set, takes
+                precedence over ``voice_sample`` and ``self.voice``. Used by
+                the dubbing pipeline to encode each speaker's sample once and
+                reuse it across all of that speaker's segments.
         """
         import tempfile
         from pathlib import Path
@@ -63,13 +73,18 @@ class TextToSpeech:
         if self._model is None:
             self._init_model()
-        effective_sample = voice_sample or self.voice
         speaker_wav_path: Path | None = None
-        if effective_sample is not None:
-            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
-                effective_sample.save(f.name)
-                speaker_wav_path = Path(f.name)
+        cleanup_path = False
+        if voice_sample_path is not None:
+            speaker_wav_path = Path(voice_sample_path)
+        else:
+            effective_sample = voice_sample or self.voice
+            if effective_sample is not None:
+                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
+                    effective_sample.save(f.name)
+                    speaker_wav_path = Path(f.name)
+                    cleanup_path = True
         try:
             wav = self._model.generate(
@@ -91,8 +106,8 @@ class TextToSpeech:
             )
             return Audio(audio_data, metadata)
         finally:
-            if speaker_wav_path is not None:
-                speaker_wav_path.unlink()
+            if cleanup_path and speaker_wav_path is not None:
+                speaker_wav_path.unlink(missing_ok=True)
     def unload(self) -> None:
         """Release the TTS model so the next generate_audio() re-initializes.

{videopython-0.26.5 → videopython-0.26.7}/src/videopython/ai/understanding/audio.py RENAMED Viewed

@@ -130,6 +130,48 @@ class AudioToText:
             )
         return result
+    def diarize_transcription(self, audio: Audio, transcription: Transcription) -> Transcription:
+        """Attach speaker labels to a pre-computed transcription using pyannote.
+        Useful when callers have a transcription (e.g. pre-computed and edited)
+        but no speakers, and want per-speaker voice cloning in dubbing without
+        re-running Whisper. Runs pyannote standalone on ``audio`` and overlays
+        speakers onto the supplied transcription's words.
+        Requires word-level timings: at least one segment must contain more
+        than one word. Transcriptions loaded from SRT (one synthetic word per
+        segment) will not produce useful speakers and are rejected.
+        """
+        import numpy as np
+        import torch
+        all_words: list[TranscriptionWord] = list(transcription.words)
+        if not all_words:
+            raise ValueError("Cannot diarize a transcription with no words.")
+        if not any(len(seg.words) > 1 for seg in transcription.segments):
+            raise ValueError(
+                "Cannot diarize a transcription without word-level timings. "
+                "Supplied transcription has at most one word per segment "
+                "(e.g. loaded from SRT). Provide a transcription with "
+                "word-level timings, or omit `transcription` to let the "
+                "pipeline transcribe and diarize from scratch."
+            )
+        if self._diarization_pipeline is None:
+            self._init_diarization()
+        import whisper
+        audio_mono = audio.to_mono().resample(whisper.audio.SAMPLE_RATE)
+        waveform = torch.from_numpy(audio_mono.data.astype(np.float32)).unsqueeze(0)
+        diarization_result = self._diarization_pipeline(
+            {"waveform": waveform, "sample_rate": audio_mono.metadata.sample_rate}
+        )
+        all_words = self._assign_speakers_to_words(all_words, diarization_result)
+        return Transcription(words=all_words, language=transcription.language)
     def _transcribe_with_diarization(self, audio_mono: Audio) -> Transcription:
         """Transcribe with word timestamps and assign speakers via pyannote."""
         import numpy as np

videopython-0.26.5/src/videopython/ai/dubbing/remux.py DELETED Viewed

@@ -1,73 +0,0 @@
-"""ffmpeg helper for replacing a video file's audio track without re-encoding video."""
-from __future__ import annotations
-import logging
-import subprocess
-from pathlib import Path
-logger = logging.getLogger(__name__)
-class RemuxError(RuntimeError):
-    """ffmpeg failed while replacing an audio stream."""
-def replace_audio_stream(
-    video_path: str | Path,
-    audio_path: str | Path,
-    output_path: str | Path,
-    audio_codec: str = "aac",
-    audio_bitrate: str = "192k",
-) -> None:
-    """Copy ``video_path``'s video stream and mux in ``audio_path`` as the audio track.
-    Uses ffmpeg stream-copy for video (no re-encode) and encodes audio to AAC.
-    ``-shortest`` trims to the shorter of the two streams so the output duration
-    matches the source video when the dubbed audio is slightly longer.
-    Args:
-        video_path: Source video file (video stream is copied unchanged).
-        audio_path: Audio file to use as the new audio track.
-        output_path: Destination file. Overwritten if it exists.
-        audio_codec: ffmpeg audio codec name. Defaults to ``aac`` (MP4-compatible).
-        audio_bitrate: Audio bitrate passed to ffmpeg (``-b:a``).
-    Raises:
-        FileNotFoundError: If ``video_path`` or ``audio_path`` does not exist.
-        RemuxError: If ffmpeg returns a non-zero exit code.
-    """
-    video_path = Path(video_path)
-    audio_path = Path(audio_path)
-    output_path = Path(output_path)
-    if not video_path.exists():
-        raise FileNotFoundError(f"Video file not found: {video_path}")
-    if not audio_path.exists():
-        raise FileNotFoundError(f"Audio file not found: {audio_path}")
-    cmd = [
-        "ffmpeg",
-        "-y",
-        "-i",
-        str(video_path),
-        "-i",
-        str(audio_path),
-        "-map",
-        "0:v:0",
-        "-map",
-        "1:a:0",
-        "-c:v",
-        "copy",
-        "-c:a",
-        audio_codec,
-        "-b:a",
-        audio_bitrate,
-        "-shortest",
-        str(output_path),
-    ]
-    logger.info("replace_audio_stream: %s + %s -> %s", video_path, audio_path, output_path)
-    result = subprocess.run(cmd, capture_output=True)
-    if result.returncode != 0:
-        raise RemuxError(f"ffmpeg failed (exit {result.returncode}): {result.stderr.decode(errors='replace')}")