srt2speech 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,9 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 nbr23
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6
+
7
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8
+
9
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,108 @@
1
+ Metadata-Version: 2.4
2
+ Name: srt2speech
3
+ Version: 1.0.0
4
+ Summary: Synthesize a timestamp-synced speech track from a subtitle file and mux it into video
5
+ Author: nbr23
6
+ Author-email: nbr23 <max@23.tf>
7
+ License-Expression: MIT
8
+ License-File: LICENSE
9
+ Requires-Dist: pysubs2>=1.7
10
+ Requires-Dist: httpx>=0.27
11
+ Requires-Dist: typer>=0.12
12
+ Requires-Dist: pydub>=0.25
13
+ Requires-Dist: rich>=13.7
14
+ Requires-Dist: audioop-lts>=0.2 ; python_full_version >= '3.13'
15
+ Requires-Python: >=3.11
16
+ Project-URL: Homepage, https://github.com/nbr23/srt2speech
17
+ Project-URL: Repository, https://github.com/nbr23/srt2speech
18
+ Description-Content-Type: text/markdown
19
+
20
+ # srt2speech
21
+
22
+ Turn a subtitle file into a **timestamp-synced speech track** and mux it into a video.
23
+
24
+ Give it a video + an `.srt` (or `.vtt`/`.ass`); it synthesizes audio where each subtitle is spoken
25
+ at its timestamp, then optionally muxes the track back in with ffmpeg. Useful for restoring lost
26
+ audio, rough translation dubs, narrating silent videos, or adding **audio description** by reading
27
+ only the descriptive/SDH cues.
28
+
29
+ It does the SRT→audio part well and nothing else: **no translation, no transcription** — bring an
30
+ already-final subtitle file.
31
+
32
+ ## Requirements
33
+
34
+ - Python ≥ 3.11, [uv](https://docs.astral.sh/uv/)
35
+ - `ffmpeg` / `ffprobe` on `PATH`
36
+ - A TTS backend:
37
+ - **piper** — a local [gopipertts](https://github.com/nbr23/gopipertts) server (free, default;
38
+ set `SRT2SPEECH_PIPER_URL` if not on `http://localhost:8080`)
39
+ - **openai** — `gpt-4o-mini-tts` (set `OPENAI_API_KEY`)
40
+ - **elevenlabs** — `eleven_multilingual_v2` (set `ELEVENLABS_API_KEY`)
41
+
42
+ ## Install
43
+
44
+ ```sh
45
+ uv sync
46
+ ```
47
+
48
+ ## Usage
49
+
50
+ ```sh
51
+ # generate a synced track with the local piper backend, sized to the video
52
+ uv run srt2speech generate subs.srt --video clip.mp4 -o track.wav
53
+
54
+ # generate + mux into the video in one step
55
+ uv run srt2speech run clip.mp4 subs.srt -o dubbed.mp4
56
+
57
+ # paid backend with delivery guidance
58
+ OPENAI_API_KEY=... uv run srt2speech generate subs.srt \
59
+ --backend openai --voice coral --instructions "calm documentary narration" -o track.wav
60
+
61
+ # audio description: only descriptive/SDH cues, mixed over the existing audio
62
+ uv run srt2speech run movie.mkv subs.srt --mode descriptive --mux-mode mix -o described.mkv
63
+
64
+ # mux an existing track yourself
65
+ uv run srt2speech mux clip.mp4 track.wav -o dubbed.mp4
66
+
67
+ # list a backend's voices
68
+ uv run srt2speech voices --backend openai
69
+ ```
70
+
71
+ ### Docker Compose
72
+
73
+ Runs a local piper server plus an on-demand CLI; no host Python or ffmpeg needed. Put your video
74
+ and subtitles in `./data` (mounted at `/data`); pulled voices are cached in `./voices`.
75
+
76
+ ```sh
77
+ # 1. start the piper TTS server (preloads the default voice)
78
+ docker compose up -d gopipertts
79
+
80
+ # 2. run the CLI against files in ./data
81
+ docker compose run --rm srt2speech run /data/clip.mp4 /data/subs.srt -o /data/dubbed.mp4
82
+
83
+ # 3. tear down when done
84
+ docker compose down
85
+ ```
86
+
87
+ For the OpenAI backend, put `OPENAI_API_KEY=sk-...` in a `.env` file (gitignored) — Compose loads it
88
+ automatically and passes it through to the CLI container.
89
+
90
+ ### Sync strategies (`--strategy`)
91
+
92
+ Speech rarely fits a cue's window exactly. The fit engine offers:
93
+
94
+ - `hybrid` *(default)* — fit into the cue window plus the silent gap before the next cue; only if
95
+ the speech is still too long is it sped up, capped by `--max-speedup` (default `1.15`).
96
+ - `overflow` — never speed up; let speech run into following silence (best quality, can drift).
97
+ - `precise` — fit the exact cue window, speeding up to the cap.
98
+
99
+ ### Modes (`--mode`)
100
+
101
+ `all` (default) · `descriptive` (SDH/audio-description only) · `dialogue` (drop sound cues).
102
+
103
+ ## Development
104
+
105
+ ```sh
106
+ uv run pytest
107
+ uv run ruff check
108
+ ```
@@ -0,0 +1,89 @@
1
+ # srt2speech
2
+
3
+ Turn a subtitle file into a **timestamp-synced speech track** and mux it into a video.
4
+
5
+ Give it a video + an `.srt` (or `.vtt`/`.ass`); it synthesizes audio where each subtitle is spoken
6
+ at its timestamp, then optionally muxes the track back in with ffmpeg. Useful for restoring lost
7
+ audio, rough translation dubs, narrating silent videos, or adding **audio description** by reading
8
+ only the descriptive/SDH cues.
9
+
10
+ It does the SRT→audio part well and nothing else: **no translation, no transcription** — bring an
11
+ already-final subtitle file.
12
+
13
+ ## Requirements
14
+
15
+ - Python ≥ 3.11, [uv](https://docs.astral.sh/uv/)
16
+ - `ffmpeg` / `ffprobe` on `PATH`
17
+ - A TTS backend:
18
+ - **piper** — a local [gopipertts](https://github.com/nbr23/gopipertts) server (free, default;
19
+ set `SRT2SPEECH_PIPER_URL` if not on `http://localhost:8080`)
20
+ - **openai** — `gpt-4o-mini-tts` (set `OPENAI_API_KEY`)
21
+ - **elevenlabs** — `eleven_multilingual_v2` (set `ELEVENLABS_API_KEY`)
22
+
23
+ ## Install
24
+
25
+ ```sh
26
+ uv sync
27
+ ```
28
+
29
+ ## Usage
30
+
31
+ ```sh
32
+ # generate a synced track with the local piper backend, sized to the video
33
+ uv run srt2speech generate subs.srt --video clip.mp4 -o track.wav
34
+
35
+ # generate + mux into the video in one step
36
+ uv run srt2speech run clip.mp4 subs.srt -o dubbed.mp4
37
+
38
+ # paid backend with delivery guidance
39
+ OPENAI_API_KEY=... uv run srt2speech generate subs.srt \
40
+ --backend openai --voice coral --instructions "calm documentary narration" -o track.wav
41
+
42
+ # audio description: only descriptive/SDH cues, mixed over the existing audio
43
+ uv run srt2speech run movie.mkv subs.srt --mode descriptive --mux-mode mix -o described.mkv
44
+
45
+ # mux an existing track yourself
46
+ uv run srt2speech mux clip.mp4 track.wav -o dubbed.mp4
47
+
48
+ # list a backend's voices
49
+ uv run srt2speech voices --backend openai
50
+ ```
51
+
52
+ ### Docker Compose
53
+
54
+ Runs a local piper server plus an on-demand CLI; no host Python or ffmpeg needed. Put your video
55
+ and subtitles in `./data` (mounted at `/data`); pulled voices are cached in `./voices`.
56
+
57
+ ```sh
58
+ # 1. start the piper TTS server (preloads the default voice)
59
+ docker compose up -d gopipertts
60
+
61
+ # 2. run the CLI against files in ./data
62
+ docker compose run --rm srt2speech run /data/clip.mp4 /data/subs.srt -o /data/dubbed.mp4
63
+
64
+ # 3. tear down when done
65
+ docker compose down
66
+ ```
67
+
68
+ For the OpenAI backend, put `OPENAI_API_KEY=sk-...` in a `.env` file (gitignored) — Compose loads it
69
+ automatically and passes it through to the CLI container.
70
+
71
+ ### Sync strategies (`--strategy`)
72
+
73
+ Speech rarely fits a cue's window exactly. The fit engine offers:
74
+
75
+ - `hybrid` *(default)* — fit into the cue window plus the silent gap before the next cue; only if
76
+ the speech is still too long is it sped up, capped by `--max-speedup` (default `1.15`).
77
+ - `overflow` — never speed up; let speech run into following silence (best quality, can drift).
78
+ - `precise` — fit the exact cue window, speeding up to the cap.
79
+
80
+ ### Modes (`--mode`)
81
+
82
+ `all` (default) · `descriptive` (SDH/audio-description only) · `dialogue` (drop sound cues).
83
+
84
+ ## Development
85
+
86
+ ```sh
87
+ uv run pytest
88
+ uv run ruff check
89
+ ```
@@ -0,0 +1,48 @@
1
+ [project]
2
+ name = "srt2speech"
3
+ version = "1.0.0"
4
+ description = "Synthesize a timestamp-synced speech track from a subtitle file and mux it into video"
5
+ readme = "README.md"
6
+ authors = [{ name = "nbr23", email = "max@23.tf" }]
7
+ license = "MIT"
8
+ license-files = ["LICENSE"]
9
+ requires-python = ">=3.11"
10
+ dependencies = [
11
+ "pysubs2>=1.7",
12
+ "httpx>=0.27",
13
+ "typer>=0.12",
14
+ "pydub>=0.25",
15
+ "rich>=13.7",
16
+ "audioop-lts>=0.2; python_version>='3.13'",
17
+ ]
18
+
19
+ [project.urls]
20
+ Homepage = "https://github.com/nbr23/srt2speech"
21
+ Repository = "https://github.com/nbr23/srt2speech"
22
+
23
+ [project.scripts]
24
+ srt2speech = "srt2speech.cli:main"
25
+
26
+ [dependency-groups]
27
+ dev = [
28
+ "pytest>=8.0",
29
+ "ruff>=0.6",
30
+ ]
31
+
32
+ [build-system]
33
+ requires = ["uv_build>=0.11.23,<0.12.0"]
34
+ build-backend = "uv_build"
35
+
36
+ [tool.ruff]
37
+ line-length = 100
38
+ src = ["src", "tests"]
39
+
40
+ [tool.ruff.lint]
41
+ select = ["E", "F", "I", "UP", "B"]
42
+
43
+ [tool.ruff.lint.per-file-ignores]
44
+ # typer requires Option()/Argument() calls in parameter defaults.
45
+ "src/srt2speech/cli.py" = ["B008"]
46
+
47
+ [tool.pytest.ini_options]
48
+ testpaths = ["tests"]
@@ -0,0 +1,3 @@
1
+ """srt2speech: synthesize a timestamp-synced speech track from subtitles."""
2
+
3
# Keep in sync with `version` in pyproject.toml (the released distribution is 1.0.0).
__version__ = "1.0.0"
@@ -0,0 +1,3 @@
1
+ from .cli import main
2
+
3
+ main()
@@ -0,0 +1,31 @@
1
+ """Lay rendered segments onto a silent timeline at their exact start offsets.
2
+
3
+ Overlapping audio (from overflow that exceeds even the gap) is mixed rather than ducked or pushed;
4
+ collision handling is a deliberate v2 refinement. The hybrid strategy keeps such overlaps rare.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from pydub import AudioSegment
10
+
11
+ from .models import Segment
12
+
13
# Each entry pairs a segment (which supplies the start offset) with its synthesized audio.
Rendered = list[tuple[Segment, AudioSegment]]
14
+
15
+
16
def assemble(
    rendered: Rendered,
    *,
    total_ms: int | None = None,
    sample_rate: int = 24000,
    channels: int = 1,
) -> AudioSegment:
    """Mix rendered segments onto a silent base sized to `total_ms` (or the content extent).

    Args:
        rendered: ``(segment, audio)`` pairs; each clip is placed at ``segment.start_ms``
            (negative starts are clamped to 0).
        total_ms: Minimum length of the result in milliseconds. ``None`` (or 0) means
            "just long enough to hold the content".
        sample_rate: Frame rate every clip and the base are converted to.
        channels: Channel count every clip and the base are converted to.

    Returns:
        One AudioSegment at least ``total_ms`` long with all clips overlaid; overlapping clips
        are mixed together.
    """
    # Clamp the offsets once so the length computation and the overlay agree on where each
    # clip lands. Sizing from the raw (possibly negative) start would make the base too short
    # for a clip that is actually placed at 0, and the overlay would cut its tail off.
    placed = [(max(0, seg.start_ms), audio) for seg, audio in rendered]
    content_end = max((start + len(audio) for start, audio in placed), default=0)
    length = max(total_ms or 0, content_end)

    base = AudioSegment.silent(duration=length, frame_rate=sample_rate).set_channels(channels)
    for start, audio in placed:
        audio = audio.set_frame_rate(sample_rate).set_channels(channels)
        base = base.overlay(audio, position=start)
    return base
@@ -0,0 +1,156 @@
1
+ """Command-line interface."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import tempfile
6
+ from pathlib import Path
7
+
8
+ import typer
9
+ from rich.console import Console
10
+ from rich.progress import Progress
11
+
12
+ from .mux import mux as mux_audio
13
+ from .pipeline import generate_track
14
+ from .probe import media_duration_ms
15
+ from .tts import BACKENDS, get_backend
16
+
17
# Root Typer application that the commands below register on; completion options are disabled.
app = typer.Typer(
    add_completion=False,
    help="Synthesize a timestamp-synced speech track from subtitles and mux it into video.",
)
# Shared rich console used for status messages and the progress display.
console = Console()
22
+
23
+
24
def _build_backend(name: str, voice: str | None, piper_url: str | None, speed: float | None):
    """Instantiate the TTS backend `name`, forwarding only the options that were supplied.

    `piper_url` is passed on (as ``base_url``) only for the ``piper`` backend, and `speed` only
    when it is not ``None``, so each backend's own defaults apply otherwise.
    """
    options: dict[str, object] = {}
    wants_custom_url = name == "piper" and piper_url
    if wants_custom_url:
        options["base_url"] = piper_url
    if speed is not None:
        options["speed"] = speed
    return get_backend(name, voice=voice, **options)
31
+
32
+
33
def _render_track(
    srt: Path,
    output: Path,
    *,
    video: Path | None,
    backend: str,
    voice: str | None,
    strategy: str,
    max_speedup: float,
    merge_threshold: int,
    mode: str,
    instructions: str | None,
    piper_url: str | None,
    speed: float | None,
) -> Path:
    """Synthesize the speech track for `srt`, export it to `output` and return that path.

    Backend construction errors (``ValueError``/``RuntimeError``) are reported as
    ``typer.BadParameter``. When `video` is given, its duration becomes the minimum track length.
    The export format is taken from `output`'s suffix, falling back to ``wav``.
    """
    try:
        engine = _build_backend(backend, voice, piper_url, speed)
    except (ValueError, RuntimeError) as exc:
        raise typer.BadParameter(str(exc)) from exc

    if video:
        total_ms = media_duration_ms(video)
    else:
        total_ms = None

    with Progress(console=console, transient=True) as bar:
        # The total is unknown until the pipeline has segmented the subtitles.
        task_id = bar.add_task("Synthesizing", total=None)

        def report(done: int, total: int) -> None:
            bar.update(task_id, completed=done, total=total)

        track = generate_track(
            srt,
            engine,
            mode=mode,
            strategy=strategy,
            max_speedup=max_speedup,
            merge_threshold_ms=merge_threshold,
            voice=voice,
            instructions=instructions,
            total_ms=total_ms,
            progress=report,
        )

    suffix = output.suffix.lstrip(".")
    track.export(output, format=suffix if suffix else "wav")
    console.print(f"[green]Wrote[/] {output} ({len(track) / 1000:.1f}s)")
    return output
72
+
73
+
74
@app.command()
def generate(
    srt: Path = typer.Argument(..., exists=True, dir_okay=False, help="Subtitle file"),
    output: Path = typer.Option(Path("out.wav"), "-o", "--output", help="Output audio file"),
    video: Path | None = typer.Option(None, "--video", help="Match this media's duration"),
    backend: str = typer.Option("piper", "--backend", help=f"One of: {', '.join(BACKENDS)}"),
    voice: str | None = typer.Option(None, "--voice"),
    strategy: str = typer.Option("hybrid", "--strategy", help="hybrid | overflow | precise"),
    max_speedup: float = typer.Option(1.15, "--max-speedup", help="Cap on speech speed-up"),
    merge_threshold: int = typer.Option(250, "--merge-threshold", help="Cue merge gap (ms)"),
    mode: str = typer.Option("all", "--mode", help="all | descriptive | dialogue"),
    instructions: str | None = typer.Option(None, "--instructions", help="OpenAI delivery hint"),
    speed: float | None = typer.Option(None, "--speed", help="Base rate (piper default 1.2)"),
    piper_url: str | None = typer.Option(None, "--piper-url", help="gopipertts base URL"),
) -> None:
    """Generate a synced speech track from a subtitle file."""
    # Every synthesis option is forwarded unchanged; the shared helper does the actual work.
    render_options = dict(
        video=video,
        backend=backend,
        voice=voice,
        strategy=strategy,
        max_speedup=max_speedup,
        merge_threshold=merge_threshold,
        mode=mode,
        instructions=instructions,
        piper_url=piper_url,
        speed=speed,
    )
    _render_track(srt, output, **render_options)
95
+
96
+
97
@app.command()
def mux(
    video: Path = typer.Argument(..., exists=True, dir_okay=False),
    audio: Path = typer.Argument(..., exists=True, dir_okay=False),
    output: Path = typer.Option(..., "-o", "--output"),
    mode: str = typer.Option("replace", "--mode", help="replace | mix"),
) -> None:
    """Mux an audio track into a video."""
    # `mux_audio` is the library function from `.mux`, imported under an alias because this
    # command itself is named `mux`.
    mux_audio(video, audio, output, mode=mode)
    console.print(f"[green]Wrote[/] {output}")
107
+
108
+
109
@app.command()
def run(
    video: Path = typer.Argument(..., exists=True, dir_okay=False),
    srt: Path = typer.Argument(..., exists=True, dir_okay=False),
    output: Path = typer.Option(..., "-o", "--output", help="Output video file"),
    backend: str = typer.Option("piper", "--backend", help=f"One of: {', '.join(BACKENDS)}"),
    voice: str | None = typer.Option(None, "--voice"),
    strategy: str = typer.Option("hybrid", "--strategy", help="hybrid | overflow | precise"),
    max_speedup: float = typer.Option(1.15, "--max-speedup"),
    merge_threshold: int = typer.Option(250, "--merge-threshold"),
    mode: str = typer.Option("all", "--mode", help="all | descriptive | dialogue"),
    instructions: str | None = typer.Option(None, "--instructions"),
    speed: float | None = typer.Option(None, "--speed", help="Base rate (piper default 1.2)"),
    mux_mode: str = typer.Option("replace", "--mux-mode", help="replace | mix"),
    piper_url: str | None = typer.Option(None, "--piper-url"),
) -> None:
    """Generate the track and mux it into the video in one step."""
    render_options = dict(
        video=video,
        backend=backend,
        voice=voice,
        strategy=strategy,
        max_speedup=max_speedup,
        merge_threshold=merge_threshold,
        mode=mode,
        instructions=instructions,
        piper_url=piper_url,
        speed=speed,
    )
    # The intermediate WAV lives in a scratch directory, so muxing has to finish before the
    # `with` block exits and removes it.
    with tempfile.TemporaryDirectory() as scratch:
        track = Path(scratch).joinpath("track.wav")
        _render_track(srt, track, **render_options)
        mux_audio(video, track, output, mode=mux_mode)
    console.print(f"[green]Wrote[/] {output}")
135
+
136
+
137
@app.command()
def voices(
    backend: str = typer.Option("piper", "--backend", help=f"One of: {', '.join(BACKENDS)}"),
    piper_url: str | None = typer.Option(None, "--piper-url"),
) -> None:
    """List available voices for a backend."""
    # Both building the backend and listing its voices stay inside the `try`, so a
    # ValueError/RuntimeError from either is surfaced as a CLI usage error.
    try:
        engine = _build_backend(backend, None, piper_url, None)
        available = engine.list_voices()
        for voice_name in available:
            console.print(voice_name)
    except (ValueError, RuntimeError) as exc:
        raise typer.BadParameter(str(exc)) from exc
149
+
150
+
151
def main() -> None:
    """Run the Typer application (the target of the `srt2speech` console script)."""
    app()


# Also allow running this module directly as a script.
if __name__ == "__main__":
    main()
@@ -0,0 +1,50 @@
1
+ """Cue filtering for accessibility modes.
2
+
3
+ SDH (subtitles for the deaf and hard-of-hearing) mark non-dialogue sound information as
4
+ bracketed/parenthesized cues (``[door creaks]``, ``(ominous music)``) or all-caps sound effects
5
+ (``THUNDER RUMBLES``). Keeping only those yields an audio-description-style track; dropping them
6
+ yields a clean dialogue dub.
7
+
8
+ The heuristics are whole-cue and intentionally simple; partial cues ("He runs. [gunshot]") are
9
+ treated by their dominant form. Tune the regexes here if a corpus needs it.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import re
15
+ from collections.abc import Iterable
16
+
17
+ from .models import Cue
18
+
19
# A cue that, ignoring surrounding whitespace, opens with "[" or "(" and closes with "]" or ")".
# DOTALL lets the wrapped text span several lines.
_WRAPPED_RE = re.compile(r"^\s*[\[(].*[\])]\s*$", re.DOTALL)


def is_descriptive(text: str) -> bool:
    """Report whether a cue looks like a sound description rather than dialogue.

    A cue is descriptive when it is wrapped in brackets/parentheses, or when it contains at
    least one letter and every letter is uppercase. Blank cues are never descriptive.
    """
    body = text.strip()
    if not body:
        return False
    if _WRAPPED_RE.match(body) is not None:
        return True

    # All-caps check: any non-uppercase letter disqualifies the cue; a cue with no letters at
    # all (digits, punctuation) does not count either.
    saw_letter = False
    for ch in body:
        if ch.isalpha():
            if not ch.isupper():
                return False
            saw_letter = True
    return saw_letter
31
+
32
+
33
def keep_descriptive(cues: Iterable[Cue]) -> list[Cue]:
    """Return, in order, only the cues whose text is classified as descriptive."""
    kept: list[Cue] = []
    for cue in cues:
        if is_descriptive(cue.text):
            kept.append(cue)
    return kept
35
+
36
+
37
def keep_dialogue(cues: Iterable[Cue]) -> list[Cue]:
    """Return, in order, only the cues whose text is NOT classified as descriptive."""
    kept: list[Cue] = []
    for cue in cues:
        if is_descriptive(cue.text):
            continue
        kept.append(cue)
    return kept
39
+
40
+
41
+ def apply_mode(cues: Iterable[Cue], mode: str) -> list[Cue]:
42
+ """Select cues for a CLI ``--mode``: ``all`` | ``descriptive`` | ``dialogue``."""
43
+ cues = list(cues)
44
+ if mode == "all":
45
+ return cues
46
+ if mode == "descriptive":
47
+ return keep_descriptive(cues)
48
+ if mode == "dialogue":
49
+ return keep_dialogue(cues)
50
+ raise ValueError(f"unknown mode: {mode!r}")
@@ -0,0 +1,106 @@
1
+ """The fit engine: make synthesized speech land inside a subtitle's time window.
2
+
3
+ This is the heart of the tool. Each strategy is a different answer to "the speech is longer than
4
+ the cue window":
5
+
6
+ - ``overflow`` : never speed up; let speech run past the window into following silence. Best
7
+ voice quality, timing can drift on dense subtitles.
8
+ - ``hybrid`` : (default) allow natural overflow into the window plus the silent gap before the
9
+ next segment; only when still too long, speed up, capped at ``max_speedup``.
10
+ - ``precise`` : fit to the exact cue window (ignores the gap), speeding up to ``max_speedup``.
11
+
12
+ Speed-up is achieved by re-synthesizing at a faster rate when the backend reports reliable speed
13
+ control, otherwise by deterministic ffmpeg ``atempo`` time-stretching of the natural audio.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import subprocess
19
+ import tempfile
20
+ from pathlib import Path
21
+
22
+ from pydub import AudioSegment
23
+
24
+ from .models import Segment
25
+ from .probe import require_tool
26
+ from .tts.base import TTSBackend
27
+
28
# Strategy names accepted by `fit_segment`; anything else is rejected with ValueError.
STRATEGIES = ("hybrid", "overflow", "precise")
29
+
30
+
31
def _atempo_chain(factor: float) -> str:
    """Build an ffmpeg ``atempo`` filter chain for an arbitrary tempo factor.

    A single ``atempo`` stage only accepts 0.5-2.0, so a factor outside that range is split
    into a comma-joined chain of stages whose product equals `factor`.

    Args:
        factor: Tempo multiplier; values above 1 speed the audio up.

    Returns:
        The filter string, e.g. ``"atempo=2.0,atempo=1.500000"`` for a factor of 3.

    Raises:
        ValueError: if `factor` is not a positive finite number.
    """
    # Halving/doubling never brings zero, a negative value or infinity into range, so those
    # would spin forever below. The chained comparison is also False for NaN.
    if not 0 < factor < float("inf"):
        raise ValueError(f"tempo factor must be a positive finite number, got {factor!r}")

    parts: list[str] = []
    f = factor
    while f > 2.0:
        parts.append("atempo=2.0")
        f /= 2.0
    while f < 0.5:
        parts.append("atempo=0.5")
        f /= 0.5
    parts.append(f"atempo={f:.6f}")
    return ",".join(parts)
43
+
44
+
45
def time_stretch(audio: AudioSegment, factor: float) -> AudioSegment:
    """Return `audio` with its tempo scaled by `factor` (>1 is faster) using ffmpeg ``atempo``.

    A factor within 0.001 of 1.0 returns the input untouched without invoking ffmpeg.
    Requires ``ffmpeg`` on PATH; a failing ffmpeg run raises ``subprocess.CalledProcessError``.
    """
    if abs(factor - 1.0) < 1e-3:
        return audio

    ffmpeg = require_tool("ffmpeg")
    # Round-trip through WAV files in a scratch directory that is removed afterwards.
    with tempfile.TemporaryDirectory() as scratch:
        workdir = Path(scratch)
        src, dst = workdir / "in.wav", workdir / "out.wav"
        audio.export(src, format="wav")
        command = [ffmpeg, "-y", "-i", str(src), "-filter:a", _atempo_chain(factor), str(dst)]
        subprocess.run(command, check=True, capture_output=True)
        stretched = AudioSegment.from_file(dst, format="wav")
    return stretched
59
+
60
+
61
def _compress_to(
    backend: TTSBackend,
    segment: Segment,
    natural: AudioSegment,
    target_ms: int,
    *,
    voice: str | None,
    instructions: str | None,
    max_speedup: float,
) -> AudioSegment:
    """Shorten `natural` towards `target_ms`, speeding up by at most `max_speedup`.

    Audio that already fits (or a cap that allows no speed-up) is returned unchanged. Otherwise
    the segment text is re-synthesized at the required speed when the backend reports reliable
    speed control, and the natural audio is time-stretched when it does not.
    """
    budget_ms = max(1, target_ms)  # guards the division below against a zero/negative window
    natural_ms = len(natural)
    if natural_ms <= budget_ms:
        return natural

    speedup = min(natural_ms / budget_ms, max_speedup)
    if speedup <= 1.0:
        return natural

    if not backend.supports_reliable_speed:
        return time_stretch(natural, speedup)
    return backend.synthesize(
        segment.text, voice=voice, speed=speedup, instructions=instructions
    )
82
+
83
+
84
def fit_segment(
    backend: TTSBackend,
    segment: Segment,
    *,
    voice: str | None = None,
    strategy: str = "hybrid",
    max_speedup: float = 1.15,
    instructions: str | None = None,
) -> AudioSegment:
    """Synthesize `segment` and shape the audio to its time window according to `strategy`.

    ``overflow`` returns the natural audio as-is; ``precise`` targets the cue window alone;
    ``hybrid`` targets the cue window plus the silent gap that follows it.

    Raises:
        ValueError: if `strategy` is not one of ``STRATEGIES``.
    """
    if strategy not in STRATEGIES:
        raise ValueError(f"unknown strategy {strategy!r}; choose from {', '.join(STRATEGIES)}")

    natural = backend.synthesize(segment.text, voice=voice, instructions=instructions)
    if strategy == "overflow":
        return natural

    if strategy == "precise":
        target = segment.duration_ms
    else:
        target = segment.duration_ms + segment.gap_after_ms
    return _compress_to(
        backend,
        segment,
        natural,
        target,
        voice=voice,
        instructions=instructions,
        max_speedup=max_speedup,
    )
@@ -0,0 +1,38 @@
1
+ """Core data structures: subtitle cues and the synthesis segments derived from them."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+
7
+
8
@dataclass
class Cue:
    """One subtitle entry; all timing is expressed in milliseconds."""

    index: int  # position of the cue in the loaded list
    start_ms: int
    end_ms: int
    text: str

    @property
    def duration_ms(self) -> int:
        """Length of the cue's display window (end minus start)."""
        start, end = self.start_ms, self.end_ms
        return end - start
20
+
21
+
22
@dataclass
class Segment:
    """A chunk of text synthesized in one go, built from one or more merged cues.

    The audio is meant to fit the window `start_ms`..`end_ms`; `gap_after_ms` is the silence
    between this segment's end and the next segment's start, which overflow may use.
    """

    text: str
    start_ms: int
    end_ms: int
    cues: list[Cue] = field(default_factory=list)  # source cues; a fresh list per instance
    gap_after_ms: int = 0

    @property
    def duration_ms(self) -> int:
        """Length of the target window (end minus start)."""
        start, end = self.start_ms, self.end_ms
        return end - start
@@ -0,0 +1,43 @@
1
+ """Mux a generated audio track into a video with ffmpeg.
2
+
3
+ - ``replace`` : drop any existing audio and use the generated track (restore/narrate use cases).
4
+ - ``mix`` : blend the generated track over the existing audio (audio-description use case;
5
+ requires the source to already have an audio stream).
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import subprocess
11
+ from pathlib import Path
12
+
13
+ from .probe import require_tool
14
+
15
# Mux modes accepted by `mux`.
MODES = ("replace", "mix")


def mux(
    video: str | Path,
    audio: str | Path,
    out: str | Path,
    *,
    mode: str = "replace",
) -> None:
    """Write `out` containing `video`'s picture (stream-copied) and `audio` encoded as AAC.

    ``replace`` maps only the new track and stops at the shorter input; ``mix`` blends the new
    track with the video's own first audio stream for the length of that stream.

    Raises:
        ValueError: if `mode` is not in ``MODES``.
        RuntimeError: if ffmpeg is not on PATH.
        subprocess.CalledProcessError: if ffmpeg exits with an error.
    """
    if mode not in MODES:
        raise ValueError(f"unknown mux mode {mode!r}; choose from {', '.join(MODES)}")
    ffmpeg = require_tool("ffmpeg")

    # Both modes share the inputs and codecs; they differ only in stream routing.
    inputs = [ffmpeg, "-y", "-i", str(video), "-i", str(audio)]
    codecs = ["-c:v", "copy", "-c:a", "aac"]
    if mode == "replace":
        routing = ["-map", "0:v:0", "-map", "1:a:0"]
        tail = ["-shortest", str(out)]
    else:
        routing = [
            "-filter_complex", "[0:a][1:a]amix=inputs=2:duration=first[a]",
            "-map", "0:v:0", "-map", "[a]",
        ]
        tail = [str(out)]
    subprocess.run([*inputs, *routing, *codecs, *tail], check=True, capture_output=True)
@@ -0,0 +1,36 @@
1
+ """Subtitle loading. Delegates format handling (SRT/VTT/ASS/SSA) to pysubs2."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from pathlib import Path
7
+
8
+ import pysubs2
9
+
10
+ from .models import Cue
11
+
12
# pysubs2 keeps override tags out of `plaintext`, but leftover brace blocks and HTML-style
# markup can remain, so both are stripped here.
_TAG_RE = re.compile(r"\{[^}]*\}|<[^>]+>")
_WS_RE = re.compile(r"\s+")


def _clean(text: str) -> str:
    """Strip markup and line-break escapes from `text` and collapse whitespace to single spaces."""
    without_tags = _TAG_RE.sub("", text)
    # Literal backslash-N / backslash-n escapes are line breaks; speak them as a space.
    for marker in ("\\N", "\\n"):
        without_tags = without_tags.replace(marker, " ")
    collapsed = _WS_RE.sub(" ", without_tags)
    return collapsed.strip()
21
+
22
+
23
def load_subtitles(path: str | Path) -> list[Cue]:
    """Read a subtitle file and return its cues ordered by start time.

    Comment events and entries whose cleaned text is empty are skipped; `index` numbers the
    surviving cues consecutively from 0.
    """
    subs = pysubs2.load(str(path))
    ordered = sorted(subs, key=lambda e: e.start)

    cues: list[Cue] = []
    for event in ordered:
        if event.is_comment:
            continue
        text = _clean(event.plaintext or event.text)
        if text:
            cue = Cue(
                index=len(cues),
                start_ms=int(event.start),
                end_ms=int(event.end),
                text=text,
            )
            cues.append(cue)
    return cues
@@ -0,0 +1,47 @@
1
+ """Orchestrates parse -> filter -> segment -> fit -> assemble into a single audio track."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import Callable
6
+ from pathlib import Path
7
+
8
+ from pydub import AudioSegment
9
+
10
+ from .assemble import assemble
11
+ from .filters import apply_mode
12
+ from .fit import fit_segment
13
+ from .parse import load_subtitles
14
+ from .segment import merge_cues
15
+ from .tts.base import TTSBackend
16
+
17
# Progress callback: called as (segments_done, segments_total) after each rendered segment.
ProgressFn = Callable[[int, int], None]
18
+
19
+
20
def generate_track(
    subtitles: str | Path,
    backend: TTSBackend,
    *,
    mode: str = "all",
    strategy: str = "hybrid",
    max_speedup: float = 1.15,
    merge_threshold_ms: int = 250,
    voice: str | None = None,
    instructions: str | None = None,
    total_ms: int | None = None,
    progress: ProgressFn | None = None,
) -> AudioSegment:
    """Build the complete speech track for a subtitle file.

    Loads the cues, filters them by `mode`, merges them into segments, synthesizes and fits
    each segment with `backend`, then lays the clips on one timeline at the backend's sample
    rate. `progress`, when given, is called with ``(done, total)`` after every segment.
    """
    selected = apply_mode(load_subtitles(subtitles), mode)
    segments = merge_cues(selected, merge_threshold_ms=merge_threshold_ms)
    total = len(segments)

    rendered = []
    for done, seg in enumerate(segments, start=1):
        clip = fit_segment(
            backend,
            seg,
            voice=voice,
            strategy=strategy,
            max_speedup=max_speedup,
            instructions=instructions,
        )
        rendered.append((seg, clip))
        if progress:
            progress(done, total)

    return assemble(rendered, total_ms=total_ms, sample_rate=backend.sample_rate)
@@ -0,0 +1,27 @@
1
+ """Thin ffprobe wrapper for media duration."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import shutil
7
+ import subprocess
8
+ from pathlib import Path
9
+
10
+
11
def require_tool(tool: str) -> str:
    """Return the full path of the executable `tool` found on PATH.

    Raises:
        RuntimeError: if no such executable is found.
    """
    located = shutil.which(tool)
    if located is not None:
        return located
    raise RuntimeError(f"{tool} not found on PATH; install ffmpeg")
16
+
17
+
18
def media_duration_ms(path: str | Path) -> int:
    """Return the container duration in milliseconds via ffprobe.

    Args:
        path: Media file to inspect.

    Returns:
        The ``format.duration`` value reported by ffprobe, rounded to the nearest millisecond.

    Raises:
        RuntimeError: if ffprobe is not on PATH.
        subprocess.CalledProcessError: if ffprobe exits with an error.
        KeyError: if ffprobe's JSON output carries no ``format.duration`` entry.
    """
    ffprobe = require_tool("ffprobe")
    out = subprocess.run(
        [ffprobe, "-v", "error", "-show_entries", "format=duration",
         "-of", "json", str(path)],
        capture_output=True, text=True, check=True,
    )
    duration = json.loads(out.stdout)["format"]["duration"]
    # Round rather than truncate: seconds * 1000 in binary floating point can land just below
    # the intended integer (e.g. 1.015 * 1000 -> 1014.99...), and int() would drop a millisecond.
    return round(float(duration) * 1000)
@@ -0,0 +1,45 @@
1
+ """Merge cues into sentence-sized synthesis segments.
2
+
3
+ SRT often splits one sentence across several cues. Synthesizing each cue independently produces
4
+ choppy prosody, so we merge consecutive cues into a single segment unless they are separated by a
5
+ real pause or the earlier cue ends a sentence. The merged segment's window spans the union of its
6
+ cues; per-cue timing is intentionally collapsed into one target window.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import re
12
+
13
+ from .models import Cue, Segment
14
+
15
# Text ends with ., !, ? or an ellipsis, optionally followed by one closing quote, parenthesis
# or bracket and trailing whitespace.
_SENTENCE_END_RE = re.compile(r"[.!?…][\"')\]]?\s*$")
16
+
17
+
18
def _make_segment(buf: list[Cue]) -> Segment:
    """Collapse a non-empty run of cues into one synthesis segment.

    The text is the cues' text joined by single spaces. The window starts at the first cue's
    start (callers pass cues ordered by start time) and ends at the latest end among them.
    """
    return Segment(
        text=" ".join(c.text for c in buf),
        start_ms=buf[0].start_ms,
        # Latest end, not the last cue's end: with overlapping cues an earlier cue can outlast
        # the last one, and the window is meant to span the union of all merged cues.
        end_ms=max(c.end_ms for c in buf),
        cues=list(buf),
    )
25
+
26
+
27
def merge_cues(cues: list[Cue], merge_threshold_ms: int = 250) -> list[Segment]:
    """Group consecutive cues into segments and record the silence after each one.

    A new segment starts whenever the pause before a cue exceeds `merge_threshold_ms` or the
    previous cue's text ends a sentence. Each segment's `gap_after_ms` is the non-negative
    distance to the next segment's start; the last segment gets 0.
    """
    segments: list[Segment] = []
    pending: list[Cue] = []
    for cue in cues:
        if pending:
            last = pending[-1]
            long_pause = cue.start_ms - last.end_ms > merge_threshold_ms
            if long_pause or _SENTENCE_END_RE.search(last.text):
                segments.append(_make_segment(pending))
                pending = []
        pending.append(cue)
    if pending:
        segments.append(_make_segment(pending))

    # Pair every segment with its successor to measure the gap between them.
    for current, following in zip(segments, segments[1:]):
        current.gap_after_ms = max(0, following.start_ms - current.end_ms)
    if segments:
        segments[-1].gap_after_ms = 0
    return segments
@@ -0,0 +1,28 @@
1
+ """Backend registry. Add a backend by subclassing TTSBackend and registering it here."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .base import TTSBackend
6
+ from .elevenlabs import ElevenLabsBackend
7
+ from .openai import OpenAIBackend
8
+ from .piper import PiperBackend
9
+
10
# Maps the backend name (as looked up by `get_backend`) to the class implementing it.
BACKENDS: dict[str, type[TTSBackend]] = {
    "piper": PiperBackend,
    "openai": OpenAIBackend,
    "elevenlabs": ElevenLabsBackend,
}
15
+
16
+
17
def get_backend(name: str, *, voice: str | None = None, **extra: object) -> TTSBackend:
    """Instantiate the backend registered under `name`.

    `extra` is passed to the backend constructor as keyword arguments; `voice` is added only
    when it is not ``None``.

    Raises:
        ValueError: if `name` is not a registered backend.
    """
    if name not in BACKENDS:
        available = ", ".join(sorted(BACKENDS))
        raise ValueError(f"unknown backend {name!r}; available: {available}")

    backend_cls = BACKENDS[name]
    ctor_kwargs = {**extra, "voice": voice} if voice is not None else dict(extra)
    return backend_cls(**ctor_kwargs)
26
+
27
+
28
# Names exported by `from ... import *`.
__all__ = ["TTSBackend", "BACKENDS", "get_backend"]
@@ -0,0 +1,35 @@
1
+ """The backend contract. The fit engine depends only on this interface."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from abc import ABC, abstractmethod
6
+
7
+ from pydub import AudioSegment
8
+
9
+
10
class TTSBackend(ABC):
    """Abstract interface implemented by every text-to-speech engine.

    Subclasses must implement `synthesize`; `list_voices` is optional and defaults to an empty
    list.

    `supports_reliable_speed` tells the fit engine whether re-synthesizing at a requested `speed`
    can be trusted. When True, the fit engine prefers a second-pass re-synthesis to hit a target
    duration (higher quality); when False, it falls back to deterministic ffmpeg time-stretching
    of the natural audio.
    """

    # Presumably the output sample rate in Hz; subclasses override it.
    sample_rate: int = 24000
    # Conservative default: speed control is not trusted unless a subclass opts in.
    supports_reliable_speed: bool = False
    # Short identifier of the backend; subclasses override it.
    name: str = "base"

    @abstractmethod
    def synthesize(
        self,
        text: str,
        *,
        voice: str | None = None,
        speed: float = 1.0,
        instructions: str | None = None,
    ) -> AudioSegment:
        """Render `text` as speech and return it as a pydub AudioSegment.

        `voice`, `speed` and `instructions` are keyword-only; how each one is honored is up to
        the concrete backend.
        """

    def list_voices(self) -> list[str]:
        """Return the voice names this backend offers; the base implementation knows none."""
        return []
@@ -0,0 +1,84 @@
1
+ """Paid TTS via ElevenLabs (/v1/text-to-speech).
2
+
3
+ ElevenLabs leads on naturalness and is purpose-built for dubbing. Its delivery is shaped by the
4
+ voice and ``voice_settings`` rather than natural-language instructions, and its ``speed`` knob is
5
+ range-limited (0.7-1.2) and quality-affecting, so we set ``supports_reliable_speed = False`` and
6
+ let the fit engine handle timing via ffmpeg time-stretch. ``--voice`` accepts a voice name (resolved
7
+ against the account's library) or a raw voice id.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import io
13
+ import os
14
+
15
+ import httpx
16
+ from pydub import AudioSegment
17
+
18
+ from .base import TTSBackend
19
+
20
+ API_BASE = "https://api.elevenlabs.io/v1"
21
+ MODEL = "eleven_multilingual_v2"
22
+ OUTPUT_FORMAT = "mp3_44100_128"
23
+ DEFAULT_VOICE = "21m00Tcm4TlvDq8ikWAM" # Rachel, a stock voice present on every account.
24
+ SPEED_RANGE = (0.7, 1.2)
25
+
26
+
27
class ElevenLabsBackend(TTSBackend):
    """TTS backend that calls the ElevenLabs ``/text-to-speech/{voice_id}`` endpoint.

    A voice may be given either as a name (looked up in the account's voice list, which is
    fetched once and cached on the instance) or as a raw voice id, which is passed through
    unchanged when no voice of that name exists.
    """

    name = "elevenlabs"
    supports_reliable_speed = False
    sample_rate = 44100

    def __init__(
        self,
        voice: str = DEFAULT_VOICE,
        model: str = MODEL,
        speed: float = 1.0,
        api_key: str | None = None,
        timeout: float = 120.0,
    ) -> None:
        """Configure the backend and open the HTTP client.

        Raises:
            RuntimeError: If no key is passed and ``ELEVENLABS_API_KEY`` is unset or empty.
        """
        key = api_key if api_key else os.environ.get("ELEVENLABS_API_KEY")
        if not key:
            raise RuntimeError("ELEVENLABS_API_KEY is not set")
        self.default_voice = voice
        self.model = model
        self.base_speed = speed
        # Name -> voice id mapping; stays None until the first lookup fetches it.
        self._voice_index: dict[str, str] | None = None
        auth_headers = {"xi-api-key": key}
        self._client = httpx.Client(timeout=timeout, headers=auth_headers)

    def synthesize(
        self,
        text: str,
        *,
        voice: str | None = None,
        speed: float = 1.0,
        instructions: str | None = None,
    ) -> AudioSegment:
        """Synthesize ``text`` and return the decoded MP3 response as an AudioSegment.

        ``speed`` multiplies the instance's base speed; the product is clamped to
        ``SPEED_RANGE`` and sent only when it differs from 1.0. ``instructions`` is accepted
        for interface compatibility and is not used here.
        """
        requested_voice = voice if voice else self.default_voice
        voice_id = self._resolve_voice(requested_voice)

        body: dict[str, object] = {"text": text, "model_id": self.model}
        rate = self.base_speed * speed
        if rate != 1.0:
            floor, ceiling = SPEED_RANGE
            clamped = max(rate, floor)
            clamped = min(clamped, ceiling)
            body["voice_settings"] = {"speed": clamped}

        endpoint = f"{API_BASE}/text-to-speech/{voice_id}"
        resp = self._client.post(endpoint, params={"output_format": OUTPUT_FORMAT}, json=body)
        resp.raise_for_status()
        audio_bytes = io.BytesIO(resp.content)
        return AudioSegment.from_file(audio_bytes, format="mp3")

    def list_voices(self) -> list[str]:
        """Return the account's voice names in sorted order."""
        return sorted(self._voice_map().keys())

    def _resolve_voice(self, voice: str) -> str:
        """Translate a voice name to its id; anything that is not a known name is returned as is."""
        known = self._voice_map()
        return known[voice] if voice in known else voice

    def _voice_map(self) -> dict[str, str]:
        """Return the name -> voice id mapping, fetching it from ``/voices`` on first use."""
        if self._voice_index is not None:
            return self._voice_index
        resp = self._client.get(f"{API_BASE}/voices")
        resp.raise_for_status()
        index: dict[str, str] = {}
        for entry in resp.json().get("voices", []):
            voice_name = entry["name"]
            index[voice_name] = entry["voice_id"]
        # Cache only after the whole response parsed, so a failure leaves the cache unset.
        self._voice_index = index
        return index
@@ -0,0 +1,77 @@
1
+ """Paid TTS via OpenAI's /v1/audio/speech (gpt-4o-mini-tts).
2
+
3
+ gpt-4o-mini-tts steers delivery through the natural-language ``instructions`` parameter rather than
4
+ a numeric rate; its ``speed`` parameter is reported as unreliable/quality-degrading. We therefore
5
+ set ``supports_reliable_speed = False`` so the fit engine handles timing via ffmpeg time-stretch
6
+ instead of trusting backend re-pacing. Input is capped around 2000 tokens, which sentence-sized
7
+ segments stay well under.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import io
13
+ import os
14
+
15
+ import httpx
16
+ from pydub import AudioSegment
17
+
18
+ from .base import TTSBackend
19
+
20
+ API_URL = "https://api.openai.com/v1/audio/speech"
21
+ MODEL = "gpt-4o-mini-tts"
22
+ VOICES = [
23
+ "alloy", "ash", "ballad", "coral", "echo",
24
+ "fable", "nova", "onyx", "sage", "shimmer",
25
+ ]
26
+ DEFAULT_VOICE = "coral"
27
+
28
+
29
class OpenAIBackend(TTSBackend):
    """TTS backend that calls OpenAI's ``/v1/audio/speech`` endpoint.

    Delivery is steered through the natural-language ``instructions`` parameter. Speed control
    is not advertised as reliable, so the fit engine is expected to handle timing itself.
    """

    name = "openai"
    supports_reliable_speed = False
    sample_rate = 24000
    # Inclusive bounds documented for the endpoint's ``speed`` parameter.
    speed_range = (0.25, 4.0)

    def __init__(
        self,
        voice: str = DEFAULT_VOICE,
        model: str = MODEL,
        speed: float = 1.0,
        api_key: str | None = None,
        timeout: float = 120.0,
    ) -> None:
        """Configure the backend and open the HTTP client.

        Args:
            voice: Voice used when ``synthesize`` is called without one.
            model: Model name sent with every request.
            speed: Base speed that each per-call ``speed`` is multiplied by.
            api_key: API key; falls back to the ``OPENAI_API_KEY`` environment variable.
            timeout: Timeout passed to the HTTP client.

        Raises:
            RuntimeError: If no key is passed and ``OPENAI_API_KEY`` is unset or empty.
        """
        key = api_key or os.environ.get("OPENAI_API_KEY")
        if not key:
            raise RuntimeError("OPENAI_API_KEY is not set")
        self.default_voice = voice
        self.model = model
        self.base_speed = speed
        self._client = httpx.Client(
            timeout=timeout,
            headers={"Authorization": f"Bearer {key}"},
        )

    def synthesize(
        self,
        text: str,
        *,
        voice: str | None = None,
        speed: float = 1.0,
        instructions: str | None = None,
    ) -> AudioSegment:
        """Synthesize ``text`` and return the decoded WAV response as an AudioSegment.

        Args:
            text: Text to speak.
            voice: Voice for this call; defaults to the instance's voice.
            speed: Multiplier applied to the instance's base speed. The product is clamped to
                ``speed_range`` and sent only when it differs from 1.0.
            instructions: Optional delivery instructions; sent only when non-empty.

        Raises:
            httpx.HTTPStatusError: If the API responds with an error status.
        """
        payload: dict[str, object] = {
            "model": self.model,
            "input": text,
            "voice": voice or self.default_voice,
            "response_format": "wav",
        }
        if instructions:
            payload["instructions"] = instructions
        effective_speed = self.base_speed * speed
        if effective_speed != 1.0:
            # Clamp rather than let an out-of-range product turn into a rejected request.
            lo, hi = self.speed_range
            payload["speed"] = min(max(effective_speed, lo), hi)
        resp = self._client.post(API_URL, json=payload)
        resp.raise_for_status()
        return AudioSegment.from_file(io.BytesIO(resp.content), format="wav")

    def list_voices(self) -> list[str]:
        """Return a copy of the built-in voice names."""
        return list(VOICES)
@@ -0,0 +1,70 @@
1
+ """Local TTS via a gopipertts server (https://github.com/nbr23/gopipertts).
2
+
3
+ gopiper exposes ``GET/POST /api/tts`` with ``text``, ``voice``, ``speed`` (a reliable rate
4
+ multiplier) and ``outputFormat``, plus ``GET /api/voices``. Its ``speed`` is a genuine re-pacing,
5
+ so this backend advertises reliable speed control.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import io
11
+ import os
12
+
13
+ import httpx
14
+ from pydub import AudioSegment
15
+
16
+ from .base import TTSBackend
17
+
18
+ DEFAULT_URL = "http://localhost:8080"
19
+ DEFAULT_VOICE = "en_US-ryan-high"
20
+ # Piper at 1.0 is slow and flat; a small baseline lift makes narration livelier.
21
+ DEFAULT_SPEED = 1.2
22
+
23
+
24
+ class PiperBackend(TTSBackend):
25
+ name = "piper"
26
+ supports_reliable_speed = True
27
+ sample_rate = 22050
28
+
29
+ def __init__(
30
+ self,
31
+ base_url: str | None = None,
32
+ voice: str = DEFAULT_VOICE,
33
+ speed: float = DEFAULT_SPEED,
34
+ timeout: float = 120.0,
35
+ ) -> None:
36
+ url = base_url or os.environ.get("SRT2SPEECH_PIPER_URL", DEFAULT_URL)
37
+ self.base_url = url.rstrip("/")
38
+ self.default_voice = voice
39
+ self.base_speed = speed
40
+ self._client = httpx.Client(timeout=timeout)
41
+
42
+ def synthesize(
43
+ self,
44
+ text: str,
45
+ *,
46
+ voice: str | None = None,
47
+ speed: float = 1.0,
48
+ instructions: str | None = None,
49
+ ) -> AudioSegment:
50
+ params = {
51
+ "text": text,
52
+ "voice": voice or self.default_voice,
53
+ "speed": self.base_speed * speed,
54
+ "outputFormat": "wav",
55
+ }
56
+ resp = self._client.get(f"{self.base_url}/api/tts", params=params)
57
+ resp.raise_for_status()
58
+ return AudioSegment.from_file(io.BytesIO(resp.content), format="wav")
59
+
60
+ def list_voices(self) -> list[str]:
61
+ resp = self._client.get(f"{self.base_url}/api/voices")
62
+ resp.raise_for_status()
63
+ data = resp.json()
64
+ voices: list[str] = []
65
+ for item in data:
66
+ if isinstance(item, str):
67
+ voices.append(item)
68
+ elif isinstance(item, dict):
69
+ voices.append(str(item.get("key") or item.get("name") or item.get("id") or item))
70
+ return voices