srt2speech 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- srt2speech/__init__.py +3 -0
- srt2speech/__main__.py +3 -0
- srt2speech/assemble.py +31 -0
- srt2speech/cli.py +156 -0
- srt2speech/filters.py +50 -0
- srt2speech/fit.py +106 -0
- srt2speech/models.py +38 -0
- srt2speech/mux.py +43 -0
- srt2speech/parse.py +36 -0
- srt2speech/pipeline.py +47 -0
- srt2speech/probe.py +27 -0
- srt2speech/segment.py +45 -0
- srt2speech/tts/__init__.py +28 -0
- srt2speech/tts/base.py +35 -0
- srt2speech/tts/elevenlabs.py +84 -0
- srt2speech/tts/openai.py +77 -0
- srt2speech/tts/piper.py +70 -0
- srt2speech-1.0.0.dist-info/METADATA +108 -0
- srt2speech-1.0.0.dist-info/RECORD +22 -0
- srt2speech-1.0.0.dist-info/WHEEL +4 -0
- srt2speech-1.0.0.dist-info/entry_points.txt +3 -0
- srt2speech-1.0.0.dist-info/licenses/LICENSE +9 -0
srt2speech/__init__.py
ADDED
srt2speech/__main__.py
ADDED
srt2speech/assemble.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""Lay rendered segments onto a silent timeline at their exact start offsets.
|
|
2
|
+
|
|
3
|
+
Overlapping audio (from overflow that exceeds even the gap) is mixed rather than ducked or pushed;
|
|
4
|
+
collision handling is a deliberate v2 refinement. The hybrid strategy keeps such overlaps rare.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from pydub import AudioSegment
|
|
10
|
+
|
|
11
|
+
from .models import Segment
|
|
12
|
+
|
|
13
|
+
# One entry per synthesized segment: the Segment (carrying its start offset) and its audio.
Rendered = list[tuple[Segment, AudioSegment]]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def assemble(
    rendered: Rendered,
    *,
    total_ms: int | None = None,
    sample_rate: int = 24000,
    channels: int = 1,
) -> AudioSegment:
    """Mix rendered segments onto a silent base sized to `total_ms` (or the content extent).

    Args:
        rendered: ``(segment, audio)`` pairs; each clip is overlaid at its segment's
            ``start_ms`` (negative starts are clamped to 0).
        total_ms: Desired minimum track length in milliseconds; ``None`` or 0 means
            "just long enough for the content".
        sample_rate: Frame rate of the silent base; every clip is converted to it.
        channels: Channel count of the silent base; every clip is converted to it.

    Returns:
        A single AudioSegment at least ``total_ms`` long and long enough for every clip.
    """
    # Clamp once, up front, so the length computation and the overlay agree on where
    # each clip sits. Sizing the base from the raw (possibly negative) start would leave
    # it too short for a clip that is actually placed at 0, cutting off that clip's tail.
    placed = [(max(0, seg.start_ms), audio) for seg, audio in rendered]
    content_end = max((start + len(audio) for start, audio in placed), default=0)
    length = max(total_ms or 0, content_end)

    base = AudioSegment.silent(duration=length, frame_rate=sample_rate).set_channels(channels)
    for start, audio in placed:
        # Bring each clip to the base's rate/channel layout before mixing.
        audio = audio.set_frame_rate(sample_rate).set_channels(channels)
        base = base.overlay(audio, position=start)
    return base
|
srt2speech/cli.py
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
"""Command-line interface."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import tempfile
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import typer
|
|
9
|
+
from rich.console import Console
|
|
10
|
+
from rich.progress import Progress
|
|
11
|
+
|
|
12
|
+
from .mux import mux as mux_audio
|
|
13
|
+
from .pipeline import generate_track
|
|
14
|
+
from .probe import media_duration_ms
|
|
15
|
+
from .tts import BACKENDS, get_backend
|
|
16
|
+
|
|
17
|
+
# Root Typer application; the commands below are attached with @app.command().
app = typer.Typer(
    add_completion=False,
    help="Synthesize a timestamp-synced speech track from subtitles and mux it into video.",
)
# Shared Rich console used for the progress display and the "Wrote ..." status lines.
console = Console()
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _build_backend(name: str, voice: str | None, piper_url: str | None, speed: float | None):
    """Create the named TTS backend, forwarding only the options that were actually given.

    `piper_url` is passed on (as ``base_url``) only for the ``piper`` backend; `speed` is
    passed on whenever it is not ``None``.
    """
    options: dict[str, object] = {}
    wants_piper_url = name == "piper" and bool(piper_url)
    if wants_piper_url:
        options["base_url"] = piper_url
    if speed is not None:
        options["speed"] = speed
    return get_backend(name, voice=voice, **options)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _render_track(
    srt: Path,
    output: Path,
    *,
    video: Path | None,
    backend: str,
    voice: str | None,
    strategy: str,
    max_speedup: float,
    merge_threshold: int,
    mode: str,
    instructions: str | None,
    piper_url: str | None,
    speed: float | None,
) -> Path:
    """Synthesize `srt` into an audio file at `output` and return `output`.

    Backend construction errors (ValueError / RuntimeError) are re-raised as
    ``typer.BadParameter``. When `video` is given, its duration is passed to the
    pipeline as the target track length. The export format is taken from the
    output suffix, falling back to ``wav``.
    """
    try:
        engine = _build_backend(backend, voice, piper_url, speed)
    except (ValueError, RuntimeError) as exc:
        raise typer.BadParameter(str(exc)) from exc

    total_ms = None
    if video:
        total_ms = media_duration_ms(video)

    with Progress(console=console, transient=True) as bar:
        job = bar.add_task("Synthesizing", total=None)

        def report(done: int, total: int) -> None:
            # The total is unknown until the pipeline has built its segments.
            bar.update(job, completed=done, total=total)

        track = generate_track(
            srt,
            engine,
            mode=mode,
            strategy=strategy,
            max_speedup=max_speedup,
            merge_threshold_ms=merge_threshold,
            voice=voice,
            instructions=instructions,
            total_ms=total_ms,
            progress=report,
        )

    extension = output.suffix.lstrip(".")
    track.export(output, format=extension or "wav")
    seconds = len(track) / 1000
    console.print(f"[green]Wrote[/] {output} ({seconds:.1f}s)")
    return output
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
@app.command()
def generate(
    srt: Path = typer.Argument(..., exists=True, dir_okay=False, help="Subtitle file"),
    output: Path = typer.Option(Path("out.wav"), "-o", "--output", help="Output audio file"),
    video: Path | None = typer.Option(None, "--video", help="Match this media's duration"),
    backend: str = typer.Option("piper", "--backend", help=f"One of: {', '.join(BACKENDS)}"),
    voice: str | None = typer.Option(None, "--voice"),
    strategy: str = typer.Option("hybrid", "--strategy", help="hybrid | overflow | precise"),
    max_speedup: float = typer.Option(1.15, "--max-speedup", help="Cap on speech speed-up"),
    merge_threshold: int = typer.Option(250, "--merge-threshold", help="Cue merge gap (ms)"),
    mode: str = typer.Option("all", "--mode", help="all | descriptive | dialogue"),
    instructions: str | None = typer.Option(None, "--instructions", help="OpenAI delivery hint"),
    speed: float | None = typer.Option(None, "--speed", help="Base rate (piper default 1.2)"),
    piper_url: str | None = typer.Option(None, "--piper-url", help="gopipertts base URL"),
) -> None:
    """Generate a synced speech track from a subtitle file."""
    # Every CLI option is forwarded unchanged to the shared renderer.
    render_options = {
        "video": video,
        "backend": backend,
        "voice": voice,
        "strategy": strategy,
        "max_speedup": max_speedup,
        "merge_threshold": merge_threshold,
        "mode": mode,
        "instructions": instructions,
        "piper_url": piper_url,
        "speed": speed,
    }
    _render_track(srt, output, **render_options)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
@app.command()
def mux(
    video: Path = typer.Argument(..., exists=True, dir_okay=False),
    audio: Path = typer.Argument(..., exists=True, dir_okay=False),
    output: Path = typer.Option(..., "-o", "--output"),
    mode: str = typer.Option("replace", "--mode", help="replace | mix"),
) -> None:
    """Mux an audio track into a video."""
    # mux_audio raises ValueError for an unknown --mode, and RuntimeError when ffmpeg
    # is not on PATH. Report both as a CLI parameter error, the same way the track
    # rendering and `voices` commands report backend errors, instead of a traceback.
    try:
        mux_audio(video, audio, output, mode=mode)
    except (ValueError, RuntimeError) as exc:
        raise typer.BadParameter(str(exc)) from exc
    console.print(f"[green]Wrote[/] {output}")
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
@app.command()
def run(
    video: Path = typer.Argument(..., exists=True, dir_okay=False),
    srt: Path = typer.Argument(..., exists=True, dir_okay=False),
    output: Path = typer.Option(..., "-o", "--output", help="Output video file"),
    backend: str = typer.Option("piper", "--backend", help=f"One of: {', '.join(BACKENDS)}"),
    voice: str | None = typer.Option(None, "--voice"),
    strategy: str = typer.Option("hybrid", "--strategy", help="hybrid | overflow | precise"),
    max_speedup: float = typer.Option(1.15, "--max-speedup"),
    merge_threshold: int = typer.Option(250, "--merge-threshold"),
    mode: str = typer.Option("all", "--mode", help="all | descriptive | dialogue"),
    instructions: str | None = typer.Option(None, "--instructions"),
    speed: float | None = typer.Option(None, "--speed", help="Base rate (piper default 1.2)"),
    mux_mode: str = typer.Option("replace", "--mux-mode", help="replace | mix"),
    piper_url: str | None = typer.Option(None, "--piper-url"),
) -> None:
    """Generate the track and mux it into the video in one step."""
    render_options = {
        "video": video,
        "backend": backend,
        "voice": voice,
        "strategy": strategy,
        "max_speedup": max_speedup,
        "merge_threshold": merge_threshold,
        "mode": mode,
        "instructions": instructions,
        "piper_url": piper_url,
        "speed": speed,
    }
    # The intermediate track only needs to exist until it has been muxed.
    with tempfile.TemporaryDirectory() as scratch:
        track = Path(scratch, "track.wav")
        _render_track(srt, track, **render_options)
        mux_audio(video, track, output, mode=mux_mode)
    console.print(f"[green]Wrote[/] {output}")
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
@app.command()
def voices(
    backend: str = typer.Option("piper", "--backend", help=f"One of: {', '.join(BACKENDS)}"),
    piper_url: str | None = typer.Option(None, "--piper-url"),
) -> None:
    """List available voices for a backend."""
    # Both building the backend and listing its voices stay inside the try, so a
    # failure in either is reported as a parameter error.
    try:
        engine = _build_backend(backend, None, piper_url, None)
        for voice_name in engine.list_voices():
            console.print(voice_name)
    except (ValueError, RuntimeError) as exc:
        raise typer.BadParameter(str(exc)) from exc
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def main() -> None:
    """Run the Typer application."""
    app()
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
if __name__ == "__main__":
|
|
156
|
+
main()
|
srt2speech/filters.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""Cue filtering for accessibility modes.
|
|
2
|
+
|
|
3
|
+
SDH (subtitles for the deaf and hard-of-hearing) mark non-dialogue sound information as
|
|
4
|
+
bracketed/parenthesized cues (``[door creaks]``, ``(ominous music)``) or all-caps sound effects
|
|
5
|
+
(``THUNDER RUMBLES``). Keeping only those yields an audio-description-style track; dropping them
|
|
6
|
+
yields a clean dialogue dub.
|
|
7
|
+
|
|
8
|
+
The heuristics are whole-cue and intentionally simple; partial cues ("He runs. [gunshot]") are
|
|
9
|
+
treated by their dominant form. Tune the regexes here if a corpus needs it.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import re
|
|
15
|
+
from collections.abc import Iterable
|
|
16
|
+
|
|
17
|
+
from .models import Cue
|
|
18
|
+
|
|
19
|
+
# Whole-text match: opens with "[" or "(" and closes with "]" or ")", ignoring surrounding
# whitespace. DOTALL lets the wrapped content span line breaks.
_WRAPPED_RE = re.compile(r"^\s*[\[(].*[\])]\s*$", re.DOTALL)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def is_descriptive(text: str) -> bool:
    """Report whether `text` looks like a sound description rather than dialogue.

    True when the stripped text is wrapped in brackets/parentheses, or when it has at
    least one letter and every letter is upper-case. Blank text is never descriptive.
    """
    body = text.strip()
    if not body:
        return False
    if _WRAPPED_RE.match(body) is not None:
        return True

    # All-caps check: any letter that is not upper-case disqualifies the cue.
    saw_letter = False
    for ch in body:
        if not ch.isalpha():
            continue
        if not ch.isupper():
            return False
        saw_letter = True
    return saw_letter
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def keep_descriptive(cues: Iterable[Cue]) -> list[Cue]:
    """Return only the cues whose text is descriptive, in their original order."""
    kept: list[Cue] = []
    for cue in cues:
        if is_descriptive(cue.text):
            kept.append(cue)
    return kept
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def keep_dialogue(cues: Iterable[Cue]) -> list[Cue]:
    """Return only the cues whose text is not descriptive, in their original order."""
    kept: list[Cue] = []
    for cue in cues:
        if is_descriptive(cue.text):
            continue
        kept.append(cue)
    return kept
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def apply_mode(cues: Iterable[Cue], mode: str) -> list[Cue]:
    """Select cues for a CLI ``--mode``: ``all`` | ``descriptive`` | ``dialogue``.

    The input is materialized into a list first. Raises ValueError for any other mode.
    """
    materialized = list(cues)
    if mode not in ("all", "descriptive", "dialogue"):
        raise ValueError(f"unknown mode: {mode!r}")
    if mode == "descriptive":
        return keep_descriptive(materialized)
    if mode == "dialogue":
        return keep_dialogue(materialized)
    return materialized
|
srt2speech/fit.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
"""The fit engine: make synthesized speech land inside a subtitle's time window.
|
|
2
|
+
|
|
3
|
+
This is the heart of the tool. Each strategy is a different answer to "the speech is longer than
|
|
4
|
+
the cue window":
|
|
5
|
+
|
|
6
|
+
- ``overflow`` : never speed up; let speech run past the window into following silence. Best
|
|
7
|
+
voice quality, timing can drift on dense subtitles.
|
|
8
|
+
- ``hybrid`` : (default) allow natural overflow into the window plus the silent gap before the
|
|
9
|
+
next segment; only when still too long, speed up, capped at ``max_speedup``.
|
|
10
|
+
- ``precise`` : fit to the exact cue window (ignores the gap), speeding up to ``max_speedup``.
|
|
11
|
+
|
|
12
|
+
Speed-up is achieved by re-synthesizing at a faster rate when the backend reports reliable speed
|
|
13
|
+
control, otherwise by deterministic ffmpeg ``atempo`` time-stretching of the natural audio.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import subprocess
|
|
19
|
+
import tempfile
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
|
|
22
|
+
from pydub import AudioSegment
|
|
23
|
+
|
|
24
|
+
from .models import Segment
|
|
25
|
+
from .probe import require_tool
|
|
26
|
+
from .tts.base import TTSBackend
|
|
27
|
+
|
|
28
|
+
# Strategy names accepted by fit_segment; any other value raises ValueError there.
STRATEGIES = ("hybrid", "overflow", "precise")
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _atempo_chain(factor: float) -> str:
|
|
32
|
+
"""Build an atempo filter chain; a single atempo only accepts 0.5-2.0."""
|
|
33
|
+
parts: list[str] = []
|
|
34
|
+
f = factor
|
|
35
|
+
while f > 2.0:
|
|
36
|
+
parts.append("atempo=2.0")
|
|
37
|
+
f /= 2.0
|
|
38
|
+
while f < 0.5:
|
|
39
|
+
parts.append("atempo=0.5")
|
|
40
|
+
f /= 0.5
|
|
41
|
+
parts.append(f"atempo={f:.6f}")
|
|
42
|
+
return ",".join(parts)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def time_stretch(audio: AudioSegment, factor: float) -> AudioSegment:
    """Change tempo by `factor` (>1 faster) without altering pitch, via ffmpeg.

    A factor within 1e-3 of 1.0 returns `audio` untouched without invoking ffmpeg.
    Otherwise the audio is written to a temporary WAV, run through an ``atempo``
    filter chain, and the result is loaded back.
    """
    is_noop = abs(factor - 1.0) < 1e-3
    if is_noop:
        return audio

    ffmpeg = require_tool("ffmpeg")
    with tempfile.TemporaryDirectory() as scratch:
        workdir = Path(scratch)
        source = workdir / "in.wav"
        result = workdir / "out.wav"
        audio.export(source, format="wav")
        command = [
            ffmpeg,
            "-y",
            "-i",
            str(source),
            "-filter:a",
            _atempo_chain(factor),
            str(result),
        ]
        subprocess.run(command, check=True, capture_output=True)
        # Load before the temporary directory is removed on exit.
        return AudioSegment.from_file(result, format="wav")
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _compress_to(
|
|
62
|
+
backend: TTSBackend,
|
|
63
|
+
segment: Segment,
|
|
64
|
+
natural: AudioSegment,
|
|
65
|
+
target_ms: int,
|
|
66
|
+
*,
|
|
67
|
+
voice: str | None,
|
|
68
|
+
instructions: str | None,
|
|
69
|
+
max_speedup: float,
|
|
70
|
+
) -> AudioSegment:
|
|
71
|
+
target_ms = max(1, target_ms)
|
|
72
|
+
if len(natural) <= target_ms:
|
|
73
|
+
return natural
|
|
74
|
+
factor = min(len(natural) / target_ms, max_speedup)
|
|
75
|
+
if factor <= 1.0:
|
|
76
|
+
return natural
|
|
77
|
+
if backend.supports_reliable_speed:
|
|
78
|
+
return backend.synthesize(
|
|
79
|
+
segment.text, voice=voice, speed=factor, instructions=instructions
|
|
80
|
+
)
|
|
81
|
+
return time_stretch(natural, factor)
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def fit_segment(
    backend: TTSBackend,
    segment: Segment,
    *,
    voice: str | None = None,
    strategy: str = "hybrid",
    max_speedup: float = 1.15,
    instructions: str | None = None,
) -> AudioSegment:
    """Synthesize `segment` and shape it to its window per `strategy`.

    ``overflow`` returns the natural synthesis as-is. ``precise`` targets the segment's
    own duration; ``hybrid`` targets the duration plus the gap that follows it.
    Raises ValueError for an unknown strategy, before anything is synthesized.
    """
    if strategy not in STRATEGIES:
        raise ValueError(f"unknown strategy {strategy!r}; choose from {', '.join(STRATEGIES)}")

    natural = backend.synthesize(segment.text, voice=voice, instructions=instructions)
    if strategy == "overflow":
        return natural

    budget_ms = segment.duration_ms
    if strategy != "precise":
        # hybrid: the silence before the next segment is usable too.
        budget_ms = budget_ms + segment.gap_after_ms
    return _compress_to(
        backend,
        segment,
        natural,
        budget_ms,
        voice=voice,
        instructions=instructions,
        max_speedup=max_speedup,
    )
|
srt2speech/models.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""Core data structures: subtitle cues and the synthesis segments derived from them."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass
class Cue:
    """One subtitle entry; all times are in milliseconds."""

    index: int
    start_ms: int
    end_ms: int
    text: str

    @property
    def duration_ms(self) -> int:
        """Length of the cue: end minus start."""
        span = self.end_ms - self.start_ms
        return span
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class Segment:
    """A unit of speech to synthesize, built from one or more merged cues.

    `start_ms`/`end_ms` bound the target window the synthesized audio should fit into.
    `gap_after_ms` is the silence before the next segment starts, usable for overflow.
    """

    text: str
    start_ms: int
    end_ms: int
    cues: list[Cue] = field(default_factory=list)
    gap_after_ms: int = 0

    @property
    def duration_ms(self) -> int:
        """Length of the target window: end minus start."""
        span = self.end_ms - self.start_ms
        return span
|
srt2speech/mux.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""Mux a generated audio track into a video with ffmpeg.
|
|
2
|
+
|
|
3
|
+
- ``replace`` : drop any existing audio and use the generated track (restore/narrate use cases).
|
|
4
|
+
- ``mix`` : blend the generated track over the existing audio (audio-description use case;
|
|
5
|
+
requires the source to already have an audio stream).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import subprocess
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
from .probe import require_tool
|
|
14
|
+
|
|
15
|
+
# Mux modes accepted by mux(); any other value raises ValueError there.
MODES = ("replace", "mix")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def mux(
    video: str | Path,
    audio: str | Path,
    out: str | Path,
    *,
    mode: str = "replace",
) -> None:
    """Combine `video` and `audio` into `out` with ffmpeg, copying the video stream.

    ``replace`` maps only the new audio track (and stops at the shorter input);
    ``mix`` blends the new track with the video's first audio stream via ``amix``.
    Raises ValueError for an unknown mode.
    """
    if mode not in MODES:
        raise ValueError(f"unknown mux mode {mode!r}; choose from {', '.join(MODES)}")
    ffmpeg = require_tool("ffmpeg")

    inputs = [ffmpeg, "-y", "-i", str(video), "-i", str(audio)]
    codecs = ["-c:v", "copy", "-c:a", "aac"]
    if mode == "replace":
        routing = ["-map", "0:v:0", "-map", "1:a:0"]
        tail = ["-shortest", str(out)]
    else:
        routing = [
            "-filter_complex", "[0:a][1:a]amix=inputs=2:duration=first[a]",
            "-map", "0:v:0", "-map", "[a]",
        ]
        tail = [str(out)]
    cmd = inputs + routing + codecs + tail
    subprocess.run(cmd, check=True, capture_output=True)
|
srt2speech/parse.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""Subtitle loading. Delegates format handling (SRT/VTT/ASS/SSA) to pysubs2."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import pysubs2
|
|
9
|
+
|
|
10
|
+
from .models import Cue
|
|
11
|
+
|
|
12
|
+
# pysubs2 keeps override tags out of `plaintext`, but ASS draws and stray markup can remain.
# Matches a `{...}` block or a `<...>` tag.
_TAG_RE = re.compile(r"\{[^}]*\}|<[^>]+>")
# Matches any run of whitespace.
_WS_RE = re.compile(r"\s+")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _clean(text: str) -> str:
    """Strip markup tags and line-break escapes, then collapse whitespace."""
    without_tags = _TAG_RE.sub("", text)
    # Literal backslash-N / backslash-n break markers become spaces.
    flattened = without_tags.replace("\\N", " ").replace("\\n", " ")
    collapsed = _WS_RE.sub(" ", flattened)
    return collapsed.strip()
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def load_subtitles(path: str | Path) -> list[Cue]:
    """Load a subtitle file into cues ordered by start time.

    Comment events and events whose cleaned text is empty are dropped; `index`
    counts only the cues that are kept.
    """
    subs = pysubs2.load(str(path))
    ordered = sorted(subs, key=lambda e: e.start)

    cues: list[Cue] = []
    for event in ordered:
        if event.is_comment:
            continue
        text = _clean(event.plaintext or event.text)
        if text:
            cue = Cue(
                index=len(cues),
                start_ms=int(event.start),
                end_ms=int(event.end),
                text=text,
            )
            cues.append(cue)
    return cues
|
srt2speech/pipeline.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""Orchestrates parse -> filter -> segment -> fit -> assemble into a single audio track."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections.abc import Callable
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from pydub import AudioSegment
|
|
9
|
+
|
|
10
|
+
from .assemble import assemble
|
|
11
|
+
from .filters import apply_mode
|
|
12
|
+
from .fit import fit_segment
|
|
13
|
+
from .parse import load_subtitles
|
|
14
|
+
from .segment import merge_cues
|
|
15
|
+
from .tts.base import TTSBackend
|
|
16
|
+
|
|
17
|
+
# Progress callback, invoked as progress(segments_done, segments_total) after each segment.
ProgressFn = Callable[[int, int], None]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def generate_track(
    subtitles: str | Path,
    backend: TTSBackend,
    *,
    mode: str = "all",
    strategy: str = "hybrid",
    max_speedup: float = 1.15,
    merge_threshold_ms: int = 250,
    voice: str | None = None,
    instructions: str | None = None,
    total_ms: int | None = None,
    progress: ProgressFn | None = None,
) -> AudioSegment:
    """Turn a subtitle file into one assembled audio track.

    Loads the cues, filters them by `mode`, merges them into segments, fits each
    segment's speech per `strategy`, and assembles the clips at the backend's sample
    rate. `progress`, when given, is called after each segment with (done, total).
    """
    loaded = load_subtitles(subtitles)
    selected = apply_mode(loaded, mode)
    segments = merge_cues(selected, merge_threshold_ms=merge_threshold_ms)

    rendered = []
    for done, seg in enumerate(segments, start=1):
        clip = fit_segment(
            backend,
            seg,
            voice=voice,
            strategy=strategy,
            max_speedup=max_speedup,
            instructions=instructions,
        )
        rendered.append((seg, clip))
        if progress:
            progress(done, len(segments))

    return assemble(rendered, total_ms=total_ms, sample_rate=backend.sample_rate)
|
srt2speech/probe.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""Thin ffprobe wrapper for media duration."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import shutil
|
|
7
|
+
import subprocess
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def require_tool(tool: str) -> str:
    """Return the path of `tool` as found on PATH, or raise RuntimeError if absent."""
    located = shutil.which(tool)
    if located is not None:
        return located
    raise RuntimeError(f"{tool} not found on PATH; install ffmpeg")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def media_duration_ms(path: str | Path) -> int:
    """Return the container duration in milliseconds via ffprobe.

    Args:
        path: Media file to probe.

    Returns:
        The ``format.duration`` reported by ffprobe, rounded to the nearest millisecond.

    Raises:
        RuntimeError: If ffprobe is not on PATH, or its output carries no usable
            duration (key missing, or value not numeric).
        subprocess.CalledProcessError: If ffprobe exits with a non-zero status.
    """
    ffprobe = require_tool("ffprobe")
    out = subprocess.run(
        [ffprobe, "-v", "error", "-show_entries", "format=duration",
         "-of", "json", str(path)],
        capture_output=True, text=True, check=True,
    )
    try:
        seconds = float(json.loads(out.stdout)["format"]["duration"])
    except (KeyError, TypeError, ValueError) as exc:
        # Name the file in the error rather than surfacing a bare KeyError/ValueError.
        raise RuntimeError(f"ffprobe reported no usable duration for {path}") from exc
    # Round instead of truncating: 1.001 * 1000 is 1000.9999999999999 in binary
    # floating point, which int() would turn into 1000.
    return round(seconds * 1000)
|
srt2speech/segment.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""Merge cues into sentence-sized synthesis segments.
|
|
2
|
+
|
|
3
|
+
SRT often splits one sentence across several cues. Synthesizing each cue independently produces
|
|
4
|
+
choppy prosody, so we merge consecutive cues into a single segment unless they are separated by a
|
|
5
|
+
real pause or the earlier cue ends a sentence. The merged segment's window spans the union of its
|
|
6
|
+
cues; per-cue timing is intentionally collapsed into one target window.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import re
|
|
12
|
+
|
|
13
|
+
from .models import Cue, Segment
|
|
14
|
+
|
|
15
|
+
# Text ending in ".", "!", "?" or "…", optionally followed by one closing quote/bracket
# character and trailing whitespace.
_SENTENCE_END_RE = re.compile(r"[.!?…][\"')\]]?\s*$")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _make_segment(buf: list[Cue]) -> Segment:
    """Collapse a non-empty run of cues into a single Segment.

    The text is the cues' text joined with spaces. The window starts at the first
    cue's start and ends at the latest end among the cues, so it covers every cue
    even when an earlier cue outlasts the final one (overlapping subtitles).

    Args:
        buf: Cues to merge, in time order; must not be empty.

    Returns:
        A Segment spanning the cues, holding a copy of the cue list.
    """
    return Segment(
        text=" ".join(c.text for c in buf),
        start_ms=buf[0].start_ms,
        # Not buf[-1].end_ms: cues are ordered by start, so with overlaps the last cue
        # is not necessarily the one that ends last.
        end_ms=max(c.end_ms for c in buf),
        cues=list(buf),
    )
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def merge_cues(cues: list[Cue], merge_threshold_ms: int = 250) -> list[Segment]:
    """Group consecutive cues into segments and record the gap after each segment.

    A cue joins the current group unless the pause since the previous cue exceeds
    `merge_threshold_ms` or the previous cue's text ends a sentence. Each segment's
    `gap_after_ms` is the non-negative distance to the next segment's start; the
    last segment gets 0.
    """
    groups: list[list[Cue]] = []
    for cue in cues:
        if groups:
            last = groups[-1][-1]
            pause = cue.start_ms - last.end_ms
            if pause <= merge_threshold_ms and not _SENTENCE_END_RE.search(last.text):
                groups[-1].append(cue)
                continue
        groups.append([cue])

    segments = [_make_segment(group) for group in groups]
    for seg, following in zip(segments, segments[1:]):
        seg.gap_after_ms = max(0, following.start_ms - seg.end_ms)
    if segments:
        segments[-1].gap_after_ms = 0
    return segments
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""Backend registry. Add a backend by subclassing TTSBackend and registering it here."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from .base import TTSBackend
|
|
6
|
+
from .elevenlabs import ElevenLabsBackend
|
|
7
|
+
from .openai import OpenAIBackend
|
|
8
|
+
from .piper import PiperBackend
|
|
9
|
+
|
|
10
|
+
# Registry mapping the backend name accepted by get_backend to its implementing class.
BACKENDS: dict[str, type[TTSBackend]] = {
    "piper": PiperBackend,
    "openai": OpenAIBackend,
    "elevenlabs": ElevenLabsBackend,
}
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def get_backend(name: str, *, voice: str | None = None, **extra: object) -> TTSBackend:
    """Instantiate the backend registered under `name`.

    `extra` is passed to the backend constructor as keyword arguments; `voice` is
    added only when it is not ``None``. Raises ValueError for an unregistered
    name, listing the available ones.
    """
    backend_cls = BACKENDS.get(name)
    if backend_cls is not None:
        init_kwargs = {**extra}
        if voice is not None:
            init_kwargs["voice"] = voice
        return backend_cls(**init_kwargs)

    available = ", ".join(sorted(BACKENDS))
    raise ValueError(f"unknown backend {name!r}; available: {available}")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
__all__ = ["TTSBackend", "BACKENDS", "get_backend"]
|
srt2speech/tts/base.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""The backend contract. The fit engine depends only on this interface."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
|
|
7
|
+
from pydub import AudioSegment
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class TTSBackend(ABC):
    """Interface every text-to-speech engine implements.

    `supports_reliable_speed` tells the fit engine whether a re-synthesis at a given
    `speed` can be trusted. When True, the fit engine prefers a second-pass
    re-synthesis to hit a target duration; when False, it falls back to ffmpeg
    time-stretching of the natural audio.
    """

    # Class-level defaults; concrete backends override these.
    name: str = "base"
    supports_reliable_speed: bool = False
    sample_rate: int = 24000

    @abstractmethod
    def synthesize(
        self,
        text: str,
        *,
        voice: str | None = None,
        speed: float = 1.0,
        instructions: str | None = None,
    ) -> AudioSegment:
        """Render `text` as speech and return it as a pydub AudioSegment."""

    def list_voices(self) -> list[str]:
        """Return the voice names this backend offers; the base class offers none."""
        no_voices: list[str] = []
        return no_voices
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"""Paid TTS via ElevenLabs (/v1/text-to-speech).
|
|
2
|
+
|
|
3
|
+
ElevenLabs leads on naturalness and is purpose-built for dubbing. Its delivery is shaped by the
|
|
4
|
+
voice and ``voice_settings`` rather than natural-language instructions, and its ``speed`` knob is
|
|
5
|
+
range-limited (0.7-1.2) and quality-affecting, so we set ``supports_reliable_speed = False`` and
|
|
6
|
+
let the fit engine handle timing via ffmpeg time-stretch. ``--voice`` accepts a voice name (resolved
|
|
7
|
+
against the account's library) or a raw voice id.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import io
|
|
13
|
+
import os
|
|
14
|
+
|
|
15
|
+
import httpx
|
|
16
|
+
from pydub import AudioSegment
|
|
17
|
+
|
|
18
|
+
from .base import TTSBackend
|
|
19
|
+
|
|
20
|
+
# Base URL for every ElevenLabs request made by this module.
API_BASE = "https://api.elevenlabs.io/v1"
# Default model id sent as `model_id` in synthesis requests.
MODEL = "eleven_multilingual_v2"
# Requested via the `output_format` query parameter; the response is decoded as MP3.
OUTPUT_FORMAT = "mp3_44100_128"
DEFAULT_VOICE = "21m00Tcm4TlvDq8ikWAM"  # Rachel, a stock voice present on every account.
# (low, high) bounds the requested speed is clamped to before it is sent.
SPEED_RANGE = (0.7, 1.2)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class ElevenLabsBackend(TTSBackend):
    """TTS backend that calls the ElevenLabs HTTP API.

    Speed control is declared unreliable, so the fit engine time-stretches instead of
    asking this backend to re-synthesize faster.
    """

    name = "elevenlabs"
    supports_reliable_speed = False
    sample_rate = 44100

    def __init__(
        self,
        voice: str = DEFAULT_VOICE,
        model: str = MODEL,
        speed: float = 1.0,
        api_key: str | None = None,
        timeout: float = 120.0,
    ) -> None:
        """Store the defaults and open an HTTP client authenticated with the API key.

        The key comes from `api_key` or, failing that, the ``ELEVENLABS_API_KEY``
        environment variable; RuntimeError is raised when neither is set.
        """
        key = api_key or os.environ.get("ELEVENLABS_API_KEY")
        if not key:
            raise RuntimeError("ELEVENLABS_API_KEY is not set")
        self.default_voice = voice
        self.model = model
        self.base_speed = speed
        # Name -> voice id map, fetched on first use.
        self._voice_index: dict[str, str] | None = None
        self._client = httpx.Client(timeout=timeout, headers={"xi-api-key": key})

    def synthesize(
        self,
        text: str,
        *,
        voice: str | None = None,
        speed: float = 1.0,
        instructions: str | None = None,
    ) -> AudioSegment:
        """Synthesize `text` and return the decoded MP3 response.

        The effective speed is the constructor speed times `speed`; it is sent only
        when it differs from 1.0, clamped to SPEED_RANGE. `instructions` is not used.
        """
        voice_id = self._resolve_voice(voice or self.default_voice)
        payload: dict[str, object] = {"text": text, "model_id": self.model}

        requested = self.base_speed * speed
        if requested != 1.0:
            low, high = SPEED_RANGE
            clamped = requested
            if clamped < low:
                clamped = low
            elif clamped > high:
                clamped = high
            payload["voice_settings"] = {"speed": clamped}

        url = f"{API_BASE}/text-to-speech/{voice_id}"
        resp = self._client.post(url, params={"output_format": OUTPUT_FORMAT}, json=payload)
        resp.raise_for_status()
        return AudioSegment.from_file(io.BytesIO(resp.content), format="mp3")

    def list_voices(self) -> list[str]:
        """Return the account's voice names in sorted order."""
        return sorted(self._voice_map())

    def _resolve_voice(self, voice: str) -> str:
        """Map a known voice name to its id; anything else is returned as given."""
        index = self._voice_map()
        return index.get(voice, voice)

    def _voice_map(self) -> dict[str, str]:
        """Return the name -> voice id map, fetching it from the API on first call."""
        if self._voice_index is None:
            resp = self._client.get(f"{API_BASE}/voices")
            resp.raise_for_status()
            index: dict[str, str] = {}
            for entry in resp.json().get("voices", []):
                index[entry["name"]] = entry["voice_id"]
            self._voice_index = index
        return self._voice_index
|
srt2speech/tts/openai.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
"""Paid TTS via OpenAI's /v1/audio/speech (gpt-4o-mini-tts).
|
|
2
|
+
|
|
3
|
+
gpt-4o-mini-tts steers delivery through the natural-language ``instructions`` parameter rather than
|
|
4
|
+
a numeric rate; its ``speed`` parameter is reported as unreliable/quality-degrading. We therefore
|
|
5
|
+
set ``supports_reliable_speed = False`` so the fit engine handles timing via ffmpeg time-stretch
|
|
6
|
+
instead of trusting backend re-pacing. Input is capped around 2000 tokens, which sentence-sized
|
|
7
|
+
segments stay well under.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import io
|
|
13
|
+
import os
|
|
14
|
+
|
|
15
|
+
import httpx
|
|
16
|
+
from pydub import AudioSegment
|
|
17
|
+
|
|
18
|
+
from .base import TTSBackend
|
|
19
|
+
|
|
20
|
+
# Endpoint and default model for OpenAI speech synthesis.
API_URL = "https://api.openai.com/v1/audio/speech"
MODEL = "gpt-4o-mini-tts"

# Built-in voice names offered by ``list_voices()``. "verse" was missing from
# the original list even though the speech endpoint accepts it.
# NOTE(review): OpenAI extends this set over time — re-check against the
# current API reference when bumping MODEL.
VOICES = [
    "alloy", "ash", "ballad", "coral", "echo",
    "fable", "nova", "onyx", "sage", "shimmer",
    "verse",
]
DEFAULT_VOICE = "coral"
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class OpenAIBackend(TTSBackend):
    """TTS backend that calls OpenAI's ``/v1/audio/speech`` endpoint.

    ``supports_reliable_speed`` is ``False`` because the model's ``speed``
    parameter is reported as unreliable; timing is expected to be handled by
    the fit engine rather than by backend re-pacing. A ``speed`` value is
    still forwarded when the caller explicitly asks for one.
    """

    name = "openai"
    supports_reliable_speed = False
    sample_rate = 24000

    # Range the speech endpoint accepts for ``speed``; values outside it are
    # rejected by the API, so they are clamped before sending.
    _SPEED_RANGE = (0.25, 4.0)

    def __init__(
        self,
        voice: str = DEFAULT_VOICE,
        model: str = MODEL,
        speed: float = 1.0,
        api_key: str | None = None,
        timeout: float = 120.0,
    ) -> None:
        """Configure the backend and open an authenticated HTTP client.

        Args:
            voice: Voice used when ``synthesize`` is called without one.
            model: Model name sent with every request.
            speed: Baseline rate multiplier, combined with the per-call
                ``speed`` in ``synthesize``.
            api_key: API key; falls back to the ``OPENAI_API_KEY``
                environment variable when omitted or empty.
            timeout: Timeout passed to ``httpx.Client``.

        Raises:
            RuntimeError: If no API key is supplied or found in the
                environment.
        """
        key = api_key or os.environ.get("OPENAI_API_KEY")
        if not key:
            raise RuntimeError("OPENAI_API_KEY is not set")
        self.default_voice = voice
        self.model = model
        self.base_speed = speed
        self._client = httpx.Client(
            timeout=timeout,
            headers={"Authorization": f"Bearer {key}"},
        )

    def synthesize(
        self,
        text: str,
        *,
        voice: str | None = None,
        speed: float = 1.0,
        instructions: str | None = None,
    ) -> AudioSegment:
        """Synthesize ``text`` and return the decoded WAV audio.

        Args:
            text: Text to speak.
            voice: Voice name; defaults to the backend's configured voice.
            speed: Per-call rate multiplier, multiplied with the baseline
                speed. The product is clamped to the range the API accepts
                and only sent when it differs from ``1.0``.
            instructions: Optional natural-language delivery guidance; only
                included in the request when non-empty.

        Returns:
            The response body parsed as a WAV ``AudioSegment``.

        Raises:
            httpx.HTTPStatusError: If the API answers with an error status.
        """
        payload: dict[str, object] = {
            "model": self.model,
            "input": text,
            "voice": voice or self.default_voice,
            "response_format": "wav",
        }
        if instructions:
            payload["instructions"] = instructions
        effective_speed = self.base_speed * speed
        if effective_speed != 1.0:
            # Clamp instead of letting an out-of-range product turn into an
            # HTTP 400; mirrors how the ElevenLabs backend bounds its speed.
            lo, hi = self._SPEED_RANGE
            payload["speed"] = min(max(effective_speed, lo), hi)
        resp = self._client.post(API_URL, json=payload)
        resp.raise_for_status()
        return AudioSegment.from_file(io.BytesIO(resp.content), format="wav")

    def list_voices(self) -> list[str]:
        """Return a fresh copy of the built-in voice names."""
        return list(VOICES)
|
srt2speech/tts/piper.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""Local TTS via a gopipertts server (https://github.com/nbr23/gopipertts).
|
|
2
|
+
|
|
3
|
+
gopiper exposes ``GET/POST /api/tts`` with ``text``, ``voice``, ``speed`` (a reliable rate
|
|
4
|
+
multiplier) and ``outputFormat``, plus ``GET /api/voices``. Its ``speed`` is a genuine re-pacing,
|
|
5
|
+
so this backend advertises reliable speed control.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import io
|
|
11
|
+
import os
|
|
12
|
+
|
|
13
|
+
import httpx
|
|
14
|
+
from pydub import AudioSegment
|
|
15
|
+
|
|
16
|
+
from .base import TTSBackend
|
|
17
|
+
|
|
18
|
+
# Server address used when neither an explicit URL nor the
# SRT2SPEECH_PIPER_URL environment variable is provided.
DEFAULT_URL = "http://localhost:8080"

# Voice requested when the caller does not name one.
DEFAULT_VOICE = "en_US-ryan-high"

# Baseline rate multiplier: Piper at 1.0 sounds slow and flat, so narration
# gets a modest lift by default.
DEFAULT_SPEED = 1.2
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class PiperBackend(TTSBackend):
    """TTS backend that talks to a gopipertts HTTP server.

    The server's ``speed`` parameter is treated as a genuine rate
    multiplier, so this backend advertises reliable speed control.
    """

    name = "piper"
    supports_reliable_speed = True
    sample_rate = 22050

    def __init__(
        self,
        base_url: str | None = None,
        voice: str = DEFAULT_VOICE,
        speed: float = DEFAULT_SPEED,
        timeout: float = 120.0,
    ) -> None:
        """Store connection settings and create the HTTP client.

        Args:
            base_url: Server root URL. When omitted or empty, the
                ``SRT2SPEECH_PIPER_URL`` environment variable is used, then
                ``DEFAULT_URL``. Trailing slashes are stripped.
            voice: Voice used when ``synthesize`` is called without one.
            speed: Baseline rate multiplier applied to every request.
            timeout: Timeout passed to ``httpx.Client``.
        """
        if not base_url:
            base_url = os.environ.get("SRT2SPEECH_PIPER_URL", DEFAULT_URL)
        self.base_url = base_url.rstrip("/")
        self.default_voice = voice
        self.base_speed = speed
        self._client = httpx.Client(timeout=timeout)

    def synthesize(
        self,
        text: str,
        *,
        voice: str | None = None,
        speed: float = 1.0,
        instructions: str | None = None,
    ) -> AudioSegment:
        """Request speech for ``text`` and return it as decoded WAV audio.

        Args:
            text: Text to speak.
            voice: Voice name; defaults to the backend's configured voice.
            speed: Per-call rate multiplier, multiplied with the baseline
                speed before being sent.
            instructions: Accepted for signature compatibility; not used by
                this backend.

        Raises:
            httpx.HTTPStatusError: If the server answers with an error
                status.
        """
        chosen_voice = voice if voice else self.default_voice
        query = {
            "text": text,
            "voice": chosen_voice,
            "speed": self.base_speed * speed,
            "outputFormat": "wav",
        }
        endpoint = f"{self.base_url}/api/tts"
        response = self._client.get(endpoint, params=query)
        response.raise_for_status()
        wav_bytes = io.BytesIO(response.content)
        return AudioSegment.from_file(wav_bytes, format="wav")

    def list_voices(self) -> list[str]:
        """Return the voice names reported by ``/api/voices``.

        Each element of the JSON response may be a plain string or a dict.
        For a dict, the first truthy value among ``key``, ``name`` and
        ``id`` is used, falling back to the dict's own string form.
        Elements of any other type are ignored.
        """
        response = self._client.get(f"{self.base_url}/api/voices")
        response.raise_for_status()
        names: list[str] = []
        for entry in response.json():
            if isinstance(entry, str):
                names.append(entry)
                continue
            if isinstance(entry, dict):
                label = entry.get("key") or entry.get("name") or entry.get("id") or entry
                names.append(str(label))
        return names
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: srt2speech
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Synthesize a timestamp-synced speech track from a subtitle file and mux it into video
|
|
5
|
+
Author: nbr23
|
|
6
|
+
Author-email: nbr23 <max@23.tf>
|
|
7
|
+
License-Expression: MIT
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Requires-Dist: pysubs2>=1.7
|
|
10
|
+
Requires-Dist: httpx>=0.27
|
|
11
|
+
Requires-Dist: typer>=0.12
|
|
12
|
+
Requires-Dist: pydub>=0.25
|
|
13
|
+
Requires-Dist: rich>=13.7
|
|
14
|
+
Requires-Dist: audioop-lts>=0.2 ; python_full_version >= '3.13'
|
|
15
|
+
Requires-Python: >=3.11
|
|
16
|
+
Project-URL: Homepage, https://github.com/nbr23/srt2speech
|
|
17
|
+
Project-URL: Repository, https://github.com/nbr23/srt2speech
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
|
|
20
|
+
# srt2speech
|
|
21
|
+
|
|
22
|
+
Turn a subtitle file into a **timestamp-synced speech track** and mux it into a video.
|
|
23
|
+
|
|
24
|
+
Give it a video + an `.srt` (or `.vtt`/`.ass`); it synthesizes audio where each subtitle is spoken
|
|
25
|
+
at its timestamp, then optionally muxes the track back in with ffmpeg. Useful for restoring lost
|
|
26
|
+
audio, rough translation dubs, narrating silent videos, or adding **audio description** by reading
|
|
27
|
+
only the descriptive/SDH cues.
|
|
28
|
+
|
|
29
|
+
It does the SRT→audio part well and nothing else: **no translation, no transcription** — bring an
|
|
30
|
+
already-final subtitle file.
|
|
31
|
+
|
|
32
|
+
## Requirements
|
|
33
|
+
|
|
34
|
+
- Python ≥ 3.11, [uv](https://docs.astral.sh/uv/)
|
|
35
|
+
- `ffmpeg` / `ffprobe` on `PATH`
|
|
36
|
+
- A TTS backend:
|
|
37
|
+
- **piper** — a local [gopipertts](https://github.com/nbr23/gopipertts) server (free, default;
|
|
38
|
+
set `SRT2SPEECH_PIPER_URL` if not on `http://localhost:8080`)
|
|
39
|
+
- **openai** — `gpt-4o-mini-tts` (set `OPENAI_API_KEY`)
|
|
40
|
+
- **elevenlabs** — `eleven_multilingual_v2` (set `ELEVENLABS_API_KEY`)
|
|
41
|
+
|
|
42
|
+
## Install
|
|
43
|
+
|
|
44
|
+
```sh
|
|
45
|
+
uv sync
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## Usage
|
|
49
|
+
|
|
50
|
+
```sh
|
|
51
|
+
# generate a synced track with the local piper backend, sized to the video
|
|
52
|
+
uv run srt2speech generate subs.srt --video clip.mp4 -o track.wav
|
|
53
|
+
|
|
54
|
+
# generate + mux into the video in one step
|
|
55
|
+
uv run srt2speech run clip.mp4 subs.srt -o dubbed.mp4
|
|
56
|
+
|
|
57
|
+
# paid backend with delivery guidance
|
|
58
|
+
OPENAI_API_KEY=... uv run srt2speech generate subs.srt \
|
|
59
|
+
--backend openai --voice coral --instructions "calm documentary narration" -o track.wav
|
|
60
|
+
|
|
61
|
+
# audio description: only descriptive/SDH cues, mixed over the existing audio
|
|
62
|
+
uv run srt2speech run movie.mkv subs.srt --mode descriptive --mux-mode mix -o described.mkv
|
|
63
|
+
|
|
64
|
+
# mux an existing track yourself
|
|
65
|
+
uv run srt2speech mux clip.mp4 track.wav -o dubbed.mp4
|
|
66
|
+
|
|
67
|
+
# list a backend's voices
|
|
68
|
+
uv run srt2speech voices --backend openai
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
### Docker Compose
|
|
72
|
+
|
|
73
|
+
Runs a local piper server plus an on-demand CLI; no host Python or ffmpeg needed. Put your video
|
|
74
|
+
and subtitles in `./data` (mounted at `/data`); pulled voices are cached in `./voices`.
|
|
75
|
+
|
|
76
|
+
```sh
|
|
77
|
+
# 1. start the piper TTS server (preloads the default voice)
|
|
78
|
+
docker compose up -d gopipertts
|
|
79
|
+
|
|
80
|
+
# 2. run the CLI against files in ./data
|
|
81
|
+
docker compose run --rm srt2speech run /data/clip.mp4 /data/subs.srt -o /data/dubbed.mp4
|
|
82
|
+
|
|
83
|
+
# 3. tear down when done
|
|
84
|
+
docker compose down
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
For the OpenAI backend, put `OPENAI_API_KEY=sk-...` in a `.env` file (gitignored) — Compose loads it
|
|
88
|
+
automatically and passes it through to the CLI container.
|
|
89
|
+
|
|
90
|
+
### Sync strategies (`--strategy`)
|
|
91
|
+
|
|
92
|
+
Speech rarely fits a cue's window exactly. The fit engine offers:
|
|
93
|
+
|
|
94
|
+
- `hybrid` *(default)* — fit into the cue window plus the silent gap before the next cue; only then
|
|
95
|
+
speed up, capped by `--max-speedup` (default `1.15`).
|
|
96
|
+
- `overflow` — never speed up; let speech run into following silence (best quality, can drift).
|
|
97
|
+
- `precise` — fit the exact cue window, speeding up to the cap.
|
|
98
|
+
|
|
99
|
+
### Modes (`--mode`)
|
|
100
|
+
|
|
101
|
+
`all` (default) · `descriptive` (SDH/audio-description only) · `dialogue` (drop sound cues).
|
|
102
|
+
|
|
103
|
+
## Development
|
|
104
|
+
|
|
105
|
+
```sh
|
|
106
|
+
uv run pytest
|
|
107
|
+
uv run ruff check
|
|
108
|
+
```
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
srt2speech/__init__.py,sha256=HTg_rsnEfyzDBcVpd0mpvrKsDYOSct7Gm8-iDKokLc0,100
|
|
2
|
+
srt2speech/__main__.py,sha256=bYt9eEaoRQWdejEHFD8REx9jxVEdZptECFsV7F49Ink,30
|
|
3
|
+
srt2speech/assemble.py,sha256=QcgCOa9o-4PM81GJZAVLJ68zc0Uc7kxgiDt82jsEQfs,1087
|
|
4
|
+
srt2speech/cli.py,sha256=bOvBMBNzXCfaP7MhFeDXSspDZD0l2Wu20kIeGWyzzyQ,6018
|
|
5
|
+
srt2speech/filters.py,sha256=kqeczIcHwBlc1495qk-SjldYkDtC4vk0MvnVv6drSx4,1674
|
|
6
|
+
srt2speech/fit.py,sha256=9tgvG3QKd8dBKcDG0tqj6aRUu6YsTQnkS8rYXsMAqe8,3553
|
|
7
|
+
srt2speech/models.py,sha256=PxJT2b5xeiEzCFNLRiOg-J_sDLVIz7jgaKzhf7jjzlQ,906
|
|
8
|
+
srt2speech/mux.py,sha256=BXeLNVbzcVmYiBsY3WZS_vcXJrts6G7KMnx9C-mhV58,1319
|
|
9
|
+
srt2speech/parse.py,sha256=eiGABaq1eEEnUhN1XtRD1shITMeRTdJyWTf3rYwo4u4,1055
|
|
10
|
+
srt2speech/pipeline.py,sha256=LecCdeqVhMcHZEOTs_oV60xh01-lrECcuBLYjFPJ2NI,1357
|
|
11
|
+
srt2speech/probe.py,sha256=DSGKTxZQXfzuhhG08Nw4m4IdoRV6yNss5yBmjR2MSzA,775
|
|
12
|
+
srt2speech/segment.py,sha256=SbXFUCTVcDzFhvcceydQ3rzlg84LrqAs4ogBVBXX5AY,1533
|
|
13
|
+
srt2speech/tts/__init__.py,sha256=AWAjQdqJ1o5BDeqAFSQIDbkbT3NRUStUxUzhLNK1pQw,828
|
|
14
|
+
srt2speech/tts/base.py,sha256=fPHBxrBqiHXFYjqwRNGyCR9QDlWfLjPZAsbtPj1nnFc,1002
|
|
15
|
+
srt2speech/tts/elevenlabs.py,sha256=AxU36efaEmjKrsvvHgY0g4nCkFUGwLGszV4gCLd7ZyY,2905
|
|
16
|
+
srt2speech/tts/openai.py,sha256=2A6oN-_rGHjTcqSUPfIga7ysDYCiDPkvxna_0sv8dHs,2343
|
|
17
|
+
srt2speech/tts/piper.py,sha256=db5ztjLYaqHbgVjXTAtETh9ot7lJZCzmP-6vulzwwRc,2185
|
|
18
|
+
srt2speech-1.0.0.dist-info/licenses/LICENSE,sha256=E0EHU1wtxgICvZQv79uslz85y93XLe4rb3-NZa_4mzs,1062
|
|
19
|
+
srt2speech-1.0.0.dist-info/WHEEL,sha256=s49dN1sxqzkgPplo4QuUaKomil-_cbDzeLK4-pZKD-A,81
|
|
20
|
+
srt2speech-1.0.0.dist-info/entry_points.txt,sha256=sHpZgIrfBsl-y4oP5BF930YFirJTPNaj_ghCuyH9Eag,52
|
|
21
|
+
srt2speech-1.0.0.dist-info/METADATA,sha256=0GwxpXJ8GYE0iVtw8oFwsgfIlG-6IeC8WKL1eJQW3kg,3666
|
|
22
|
+
srt2speech-1.0.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 nbr23
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
6
|
+
|
|
7
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
8
|
+
|
|
9
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|