subtatix 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,129 @@
1
+ Metadata-Version: 2.4
2
+ Name: subtatix
3
+ Version: 0.1.0
4
+ Summary: CLI for generating and translating SRT subtitles with WhisperX
5
+ Keywords: cli,subtitles,srt,whisperx,transcription,translation
6
+ Author: Chris Paganon
7
+ Author-email: Chris Paganon <info@chrispaganon.com>
8
+ License-Expression: BSD-2-Clause
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Environment :: Console
11
+ Classifier: Intended Audience :: End Users/Desktop
12
+ Classifier: License :: OSI Approved :: BSD License
13
+ Classifier: Operating System :: OS Independent
14
+ Classifier: Programming Language :: Python
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
18
+ Classifier: Topic :: Text Processing
19
+ Classifier: Topic :: Utilities
20
+ Requires-Dist: accelerate>=1.13.0
21
+ Requires-Dist: sentencepiece>=0.2.1
22
+ Requires-Dist: torch>=2.8.0
23
+ Requires-Dist: transformers>=4.57.6
24
+ Requires-Dist: typer>=0.25.1
25
+ Requires-Dist: whisperx>=3.8.5
26
+ Requires-Python: >=3.12
27
+ Project-URL: Homepage, https://github.com/chris-paganon/subtatix
28
+ Project-URL: Repository, https://github.com/chris-paganon/subtatix
29
+ Project-URL: Issues, https://github.com/chris-paganon/subtatix/issues
30
+ Description-Content-Type: text/markdown
31
+
32
+ # Subtatix
33
+
34
+ `Subtatix` is a small CLI for generating `.srt` subtitles from audio or video files with [WhisperX](https://github.com/m-bain/whisperX), with optional subtitle translation.
35
+
36
+ It transcribes the input, aligns subtitle timings with WhisperX, and can then translate the resulting subtitle lines into another language.
37
+
38
+ ## Requirements
39
+
40
+ - Python 3.12+
41
+ - `ffmpeg` installed separately and available on your `PATH`
42
+ - Enough disk space for model downloads and caching
43
+
44
+ The first run will be slower because WhisperX and translation models need to be downloaded. Subsequent runs reuse the cached models and do not need to download them again unless the cache is cleared.
45
+
46
+ `ffmpeg` is an external system dependency. It is not installed by `pip`, `uvx`, or `uv tool install`.
47
+
48
+ ## Installation
49
+
50
+ Run without installing:
51
+
52
+ ```bash
53
+ uvx subtatix --help
54
+ ```
55
+
56
+ Install as a tool with `uv`:
57
+
58
+ ```bash
59
+ uv tool install subtatix
60
+ ```
61
+
62
+ Install with `pip`:
63
+
64
+ ```bash
65
+ pip install subtatix
66
+ ```
67
+
68
+ ## Usage
69
+
70
+ Run the CLI:
71
+
72
+ ```bash
73
+ subtatix input.mp4
74
+ ```
75
+
76
+ Transcribe to a specific output path:
77
+
78
+ ```bash
79
+ subtatix input.mp4 --output some-path/some-file-name
80
+ ```
81
+
82
+ `--output` is a base path, not a full `.srt` filename. This writes `some-path/some-file-name.srt`. If you also translate to Spanish, it writes `some-path/some-file-name.es.srt`.
83
+
84
+ Set the source language explicitly:
85
+
86
+ ```bash
87
+ subtatix input.mp4 --source-language en
88
+ ```
89
+
90
+ Translate after transcription:
91
+
92
+ ```bash
93
+ subtatix input.mp4 --to es
94
+ ```
95
+
96
+ This writes both the original transcription SRT and the translated SRT by default.
97
+
98
+ If CUDA runs out of memory on larger files, reduce the batch size or force CPU mode:
99
+
100
+ ```bash
101
+ subtatix input.mp4 --batch-size 4
102
+ subtatix input.mp4 --device cpu
103
+ ```
104
+
105
+ To discard the original transcription and only keep the translated output:
106
+
107
+ ```bash
108
+ subtatix input.mp4 --to es --discard-transcription
109
+ ```
110
+
111
+ Passing an `--output` value that ends in `.srt` is rejected. Use a base path such as `--output subtitles` instead.
112
+
113
+ List supported language codes:
114
+
115
+ ```bash
116
+ subtatix --list-languages
117
+ subtatix --list-target-languages
118
+ ```
119
+
120
+ ## Models
121
+
122
+ By default, transcription uses WhisperX with the Whisper model `large-v2`. This is a good general default when you want higher transcription quality and aligned subtitle timings, but it is heavier and slower than smaller Whisper models.
123
+
124
+ Translation uses `facebook/nllb-200-1.3B`. The CLI accepts simple target codes such as `en`, `es`, `fr`, `de`, `pt`, `ja`, `ko`, `zh`, and also raw NLLB codes such as `spa_Latn`.
125
+
126
+ Other model options can also be used:
127
+
128
+ - For transcription, you can pass another Whisper model with `--model`, such as `small`, `medium`, or `large-v3`, depending on your speed and accuracy needs.
129
+ - For translation, the code currently defaults to the NLLB model above, but the translation layer is built around Hugging Face seq2seq models and could be adapted to use a different multilingual translation model if needed.
@@ -0,0 +1,98 @@
1
+ # Subtatix
2
+
3
+ `Subtatix` is a small CLI for generating `.srt` subtitles from audio or video files with [WhisperX](https://github.com/m-bain/whisperX), with optional subtitle translation.
4
+
5
+ It transcribes the input, aligns subtitle timings with WhisperX, and can then translate the resulting subtitle lines into another language.
6
+
7
+ ## Requirements
8
+
9
+ - Python 3.12+
10
+ - `ffmpeg` installed separately and available on your `PATH`
11
+ - Enough disk space for model downloads and caching
12
+
13
+ The first run will be slower because WhisperX and translation models need to be downloaded. Subsequent runs reuse the cached models and do not need to download them again unless the cache is cleared.
14
+
15
+ `ffmpeg` is an external system dependency. It is not installed by `pip`, `uvx`, or `uv tool install`.
16
+
17
+ ## Installation
18
+
19
+ Run without installing:
20
+
21
+ ```bash
22
+ uvx subtatix --help
23
+ ```
24
+
25
+ Install as a tool with `uv`:
26
+
27
+ ```bash
28
+ uv tool install subtatix
29
+ ```
30
+
31
+ Install with `pip`:
32
+
33
+ ```bash
34
+ pip install subtatix
35
+ ```
36
+
37
+ ## Usage
38
+
39
+ Run the CLI:
40
+
41
+ ```bash
42
+ subtatix input.mp4
43
+ ```
44
+
45
+ Transcribe to a specific output path:
46
+
47
+ ```bash
48
+ subtatix input.mp4 --output some-path/some-file-name
49
+ ```
50
+
51
+ `--output` is a base path, not a full `.srt` filename. This writes `some-path/some-file-name.srt`. If you also translate to Spanish, it writes `some-path/some-file-name.es.srt`.
52
+
53
+ Set the source language explicitly:
54
+
55
+ ```bash
56
+ subtatix input.mp4 --source-language en
57
+ ```
58
+
59
+ Translate after transcription:
60
+
61
+ ```bash
62
+ subtatix input.mp4 --to es
63
+ ```
64
+
65
+ This writes both the original transcription SRT and the translated SRT by default.
66
+
67
+ If CUDA runs out of memory on larger files, reduce the batch size or force CPU mode:
68
+
69
+ ```bash
70
+ subtatix input.mp4 --batch-size 4
71
+ subtatix input.mp4 --device cpu
72
+ ```
73
+
74
+ To discard the original transcription and only keep the translated output:
75
+
76
+ ```bash
77
+ subtatix input.mp4 --to es --discard-transcription
78
+ ```
79
+
80
+ Passing an `--output` value that ends in `.srt` is rejected. Use a base path such as `--output subtitles` instead.
81
+
82
+ List supported language codes:
83
+
84
+ ```bash
85
+ subtatix --list-languages
86
+ subtatix --list-target-languages
87
+ ```
88
+
89
+ ## Models
90
+
91
+ By default, transcription uses WhisperX with the Whisper model `large-v2`. This is a good general default when you want higher transcription quality and aligned subtitle timings, but it is heavier and slower than smaller Whisper models.
92
+
93
+ Translation uses `facebook/nllb-200-1.3B`. The CLI accepts simple target codes such as `en`, `es`, `fr`, `de`, `pt`, `ja`, `ko`, `zh`, and also raw NLLB codes such as `spa_Latn`.
94
+
95
+ Other model options can also be used:
96
+
97
+ - For transcription, you can pass another Whisper model with `--model`, such as `small`, `medium`, or `large-v3`, depending on your speed and accuracy needs.
98
+ - For translation, the code currently defaults to the NLLB model above, but the translation layer is built around Hugging Face seq2seq models and could be adapted to use a different multilingual translation model if needed.
@@ -0,0 +1,44 @@
1
+ [project]
2
+ name = "subtatix"
3
+ version = "0.1.0"
4
+ description = "CLI for generating and translating SRT subtitles with WhisperX"
5
+ readme = "README.md"
6
+ license = "BSD-2-Clause"
7
+ authors = [
8
+ { name = "Chris Paganon", email = "info@chrispaganon.com" }
9
+ ]
10
+ requires-python = ">=3.12"
11
+ keywords = ["cli", "subtitles", "srt", "whisperx", "transcription", "translation"]
12
+ classifiers = [
13
+ "Development Status :: 3 - Alpha",
14
+ "Environment :: Console",
15
+ "Intended Audience :: End Users/Desktop",
16
+ "License :: OSI Approved :: BSD License",
17
+ "Operating System :: OS Independent",
18
+ "Programming Language :: Python",
19
+ "Programming Language :: Python :: 3",
20
+ "Programming Language :: Python :: 3.12",
21
+ "Topic :: Multimedia :: Sound/Audio :: Speech",
22
+ "Topic :: Text Processing",
23
+ "Topic :: Utilities",
24
+ ]
25
+ dependencies = [
26
+ "accelerate>=1.13.0",
27
+ "sentencepiece>=0.2.1",
28
+ "torch>=2.8.0",
29
+ "transformers>=4.57.6",
30
+ "typer>=0.25.1",
31
+ "whisperx>=3.8.5",
32
+ ]
33
+
34
+ [project.urls]
35
+ Homepage = "https://github.com/chris-paganon/subtatix"
36
+ Repository = "https://github.com/chris-paganon/subtatix"
37
+ Issues = "https://github.com/chris-paganon/subtatix/issues"
38
+
39
+ [project.scripts]
40
+ subtatix = "subtatix.__main__:main"
41
+
42
+ [build-system]
43
+ requires = ["uv_build>=0.11.13,<0.12.0"]
44
+ build-backend = "uv_build"
@@ -0,0 +1 @@
1
+ __all__ = []
@@ -0,0 +1,5 @@
1
+ from subtatix.cli import main
2
+
3
+
4
+ if __name__ == "__main__":
5
+ raise SystemExit(main())
@@ -0,0 +1,273 @@
1
+ from __future__ import annotations
2
+
3
+ import sys
4
+ from pathlib import Path
5
+ from typing import Annotated
6
+
7
+ import typer
8
+ from click.exceptions import ClickException
9
+
10
+ from subtatix.errors import SubtatixError
11
+
12
+ from subtatix.subtitles import (
13
+ DEFAULT_MODEL,
14
+ SUPPORTED_SOURCE_LANGUAGE_CODES,
15
+ require_ffmpeg,
16
+ transcribe_to_srt,
17
+ )
18
+ from subtatix.runtime import configure_runtime_noise
19
+ from subtatix.translation import (
20
+ DEFAULT_TRANSLATION_BATCH_SIZE,
21
+ SUPPORTED_TARGET_LANGUAGE_CODES,
22
+ get_available_nllb_languages,
23
+ translate_subtitles,
24
+ )
25
+
26
+ app = typer.Typer(
27
+ add_completion=False,
28
+ context_settings={"help_option_names": ["-h", "--help"]},
29
+ help=(
30
+ "Transcribe an audio or video file to SRT with WhisperX. "
31
+ "Without --to, the tool only transcribes. Passing --to also translates the "
32
+ "subtitles and keeps the original transcribed SRT unless "
33
+ "--discard-transcription is used. Output names are generated as "
34
+ "'.srt' and translated variants like '.es.srt'. "
35
+ "Use the language listing flags to inspect supported source, mapped target, "
36
+ "and raw NLLB codes."
37
+ ),
38
+ )
39
+
40
class ProgressBar:
    """Single-line text progress bar rendered on stderr.

    Intended to be used as a context manager: on a clean exit the bar is
    completed to 100% and, if anything was drawn, the line is terminated
    with a newline.
    """

    def __init__(self, label: str, total: int, width: int = 28) -> None:
        self._label = label
        # A total below 1 is clamped so the ratio maths never divides by zero.
        self._total = max(1, total)
        self._width = width
        self._current = 0
        # True once the bar has been drawn on the current terminal line.
        self._visible = False

    def __enter__(self) -> "ProgressBar":
        return self

    def __exit__(self, exc_type: object, exc: object, tb: object) -> None:
        finished_cleanly = exc_type is None
        if finished_cleanly and self._current < self._total:
            # Snap to 100% so a successful run never ends on a partial bar.
            self._current = self._total
            self._render()
        if not self._visible:
            return
        sys.stderr.write("\n")
        sys.stderr.flush()

    def update_to(self, value: int) -> None:
        """Move the bar to ``value`` (clamped to ``0..total``) and redraw on change."""
        clamped = max(0, min(value, self._total))
        if clamped == self._current:
            return
        self._current = clamped
        self._render()

    def reset(self) -> None:
        """Return to 0 and erase the drawn line if any progress was shown."""
        if self._current != 0:
            self._clear_line()
        self._current = 0
        self._visible = False

    def write_message(self, message: str) -> None:
        """Print ``message`` on its own line, then redraw an unfinished bar."""
        if self._visible:
            # Step off the bar's line so the message does not overwrite it.
            sys.stderr.write("\n")
        sys.stderr.write(f"{message}\n")
        sys.stderr.flush()
        still_running = self._current < self._total
        if self._visible and still_running:
            self._render()

    def _render(self) -> None:
        fraction = self._current / self._total
        filled = round(fraction * self._width)
        percent = round(fraction * 100)
        bar = "#" * filled + "." * (self._width - filled)
        self._visible = True
        sys.stderr.write(f"\r{self._label} [{bar}] {percent:>3}%")
        sys.stderr.flush()

    def _clear_line(self) -> None:
        # Overwrite the bar with spaces, then return the cursor to column 0.
        blank = " " * (self._width + len(self._label) + 10)
        sys.stderr.write("\r" + blank + "\r")
        sys.stderr.flush()
92
+
93
+
94
# CLI entry command: either prints the supported language codes, or
# transcribes INPUT_FILE to an SRT file and optionally translates it.
#
# NOTE: this function deliberately has no docstring. Typer can use a command
# callback's docstring as CLI help text, so adding one could change what
# `--help` prints.
@app.command()
def run(
    input_file: Annotated[
        Path | None,
        typer.Argument(help="Path to the input audio or video file."),
    ] = None,
    target_language: Annotated[
        str | None,
        typer.Option(
            "--to",
            "--target-language",
            "-t",
            help=(
                "Translate to this language. Use one of the mapped Whisper target "
                "codes or a raw NLLB code like 'spa_Latn'. If omitted, the tool only "
                "transcribes. Use --list-target-languages to inspect supported values."
            ),
        ),
    ] = None,
    source_language: Annotated[
        str | None,
        typer.Option(
            "--source-language",
            "-s",
            help=(
                "Optional Whisper source language code to skip language detection, "
                "for example 'en', 'es', or 'fr'."
            ),
        ),
    ] = None,
    output: Annotated[
        Path | None,
        typer.Option(
            "--output",
            "-o",
            help=(
                "Base output path without a .srt suffix. Subtatix writes "
                "'.srt' and translated variants like '.es.srt'. If a directory is "
                "provided, the default filename based on the input file is used inside it."
            ),
        ),
    ] = None,
    model: Annotated[
        str,
        typer.Option(help=f"Whisper model name to use. Default: {DEFAULT_MODEL}."),
    ] = DEFAULT_MODEL,
    batch_size: Annotated[
        int,
        typer.Option(
            "--batch-size",
            help="Batch size for Whisper inference. Reduce this if you run out of GPU memory.",
        ),
    ] = 8,
    device: Annotated[
        str,
        typer.Option(
            "--device",
            help=(
                "Execution device for WhisperX: 'auto', 'cuda', or 'cpu'. "
                "Default: auto."
            ),
        ),
    ] = "auto",
    discard_transcription: Annotated[
        bool,
        typer.Option(
            "--discard-transcription",
            help=(
                "When used with --to, do not save the original untranslated SRT file. "
                "Without --to, the transcribed SRT is still written."
            ),
            is_flag=True,
        ),
    ] = False,
    list_languages: Annotated[
        bool,
        typer.Option(
            "--list-languages",
            help=(
                "List source Whisper codes and the convenience target codes. "
                "Use --list-target-languages for the full raw NLLB target list."
            ),
            is_flag=True,
        ),
    ] = False,
    list_source_languages: Annotated[
        bool,
        typer.Option(
            "--list-source-languages",
            help="List supported Whisper source language codes.",
            is_flag=True,
        ),
    ] = False,
    list_target_languages: Annotated[
        bool,
        typer.Option(
            "--list-target-languages",
            help=(
                "List the convenience target codes for --to, followed by the full raw "
                "NLLB target language codes accepted by --to."
            ),
            is_flag=True,
        ),
    ] = False,
) -> None:
    # Local import: only this command needs it, and it keeps the edit
    # self-contained without touching the module's import block.
    from subtatix.translation import resolve_language

    # --- Listing mode: print codes and exit without touching any media. ---
    if list_languages or list_source_languages or list_target_languages:
        if input_file is not None:
            raise typer.BadParameter(
                "INPUT_FILE cannot be used with language listing options."
            )
        if list_languages or list_source_languages:
            typer.echo("Source languages (Whisper codes):")
            typer.echo(", ".join(SUPPORTED_SOURCE_LANGUAGE_CODES))
            typer.echo()
        if list_languages or list_target_languages:
            typer.echo(
                "Convenience target languages (--to Whisper-style codes mapped to NLLB):"
            )
            typer.echo(", ".join(SUPPORTED_TARGET_LANGUAGE_CODES))
            typer.echo()
        if list_languages:
            typer.echo(
                "Use --list-target-languages to also show the full raw NLLB target list."
            )
        if list_target_languages:
            typer.echo("Raw NLLB target languages (--to NLLB codes):")
            typer.echo(", ".join(get_available_nllb_languages()))
        return

    if input_file is None:
        raise typer.BadParameter("Missing argument 'INPUT_FILE'.")

    # One flag decides both "keep the original SRT" and "run translation".
    # Mixing `is None` with truthiness let `--to "" --discard-transcription`
    # transcribe a file and then write no output at all.
    wants_translation = target_language is not None
    if wants_translation:
        # Fail fast: reject a malformed --to value before the (potentially
        # very long) transcription instead of after it.
        resolve_language(target_language)

    configure_runtime_noise()
    require_ffmpeg()
    write_original_srt = not (wants_translation and discard_transcription)
    with ProgressBar("Transcription", 100) as transcription_progress:
        document = transcribe_to_srt(
            input_file=input_file,
            model_name=model,
            batch_size=batch_size,
            output_file=output,
            write_output=write_original_srt,
            source_language=source_language,
            device_preference=device,
            log=transcription_progress.write_message,
            progress_callback=lambda percent: transcription_progress.update_to(
                round(percent)
            ),
            progress_reset=transcription_progress.reset,
        )
        if write_original_srt:
            typer.echo(document.subtitle_path)

    if wants_translation:
        # Ceiling division; at least one step so the bar has a valid total.
        total_batches = max(
            1,
            (len(document.cues) + DEFAULT_TRANSLATION_BATCH_SIZE - 1)
            // DEFAULT_TRANSLATION_BATCH_SIZE,
        )
        with ProgressBar("Translation", total_batches) as translation_progress:
            translated_path = translate_subtitles(
                document=document,
                target_language=target_language,
                progress_callback=lambda batch_index, total: translation_progress.update_to(
                    batch_index
                ),
            )
            typer.echo(translated_path)
262
+
263
+
264
def main() -> int:
    """Run the Typer app and translate its outcome into a process exit code.

    The app is invoked with ``standalone_mode=False`` so that errors reach
    this function instead of Click calling ``sys.exit`` itself.

    Returns:
        ``0`` on success, the Click error's own exit code for usage errors,
        the code carried by an explicit Click/Typer exit, and ``1`` for
        ``SubtatixError`` or a user abort (Ctrl-C).
    """
    # Local import keeps this edit independent of the module's import block.
    from click.exceptions import Abort

    try:
        # In non-standalone mode Click returns the exit code of an `Exit`
        # (e.g. after --help) instead of raising; keep it rather than
        # discarding it.
        result = app(standalone_mode=False)
    except SubtatixError as error:
        # Render expected user-facing failures like any Click error: no traceback.
        ClickException(str(error)).show()
        return 1
    except ClickException as error:
        error.show()
        return error.exit_code
    except Abort:
        # Click converts Ctrl-C/EOF into Abort and re-raises it when not in
        # standalone mode; without this handler it surfaces as a traceback.
        typer.echo("Aborted!", err=True)
        return 1
    return result if isinstance(result, int) else 0
@@ -0,0 +1,5 @@
1
+ from __future__ import annotations
2
+
3
+
4
class SubtatixError(Exception):
    """Expected, user-facing failure.

    Raised for problems the user can act on; it is meant to be rendered as a
    plain error message rather than a traceback.
    """
@@ -0,0 +1,65 @@
1
+ from __future__ import annotations
2
+
3
+ import gc
4
+ import logging
5
+ import warnings
6
+
7
+ import torch
8
+
9
+ from subtatix.errors import SubtatixError
10
+
11
+
12
def get_device(preferred: str = "auto") -> str:
    """Resolve a device preference to a concrete torch device name.

    Args:
        preferred: ``"auto"``, ``"cpu"`` or ``"cuda"`` (case-insensitive,
            surrounding whitespace ignored).

    Returns:
        ``"cuda"`` or ``"cpu"``. ``"auto"`` picks CUDA when torch reports it
        as available and falls back to CPU otherwise.

    Raises:
        SubtatixError: if the value is not recognised, or CUDA was requested
            explicitly but is unavailable.
    """
    choice = preferred.strip().lower()
    if choice == "cpu":
        return "cpu"
    if choice not in ("auto", "cuda"):
        raise SubtatixError(
            f"Unsupported device '{preferred}'. Use 'auto', 'cpu', or 'cuda'."
        )

    cuda_ready = torch.cuda.is_available()
    if choice == "cuda" and not cuda_ready:
        raise SubtatixError(
            "CUDA was requested but is not available on this system."
        )
    return "cuda" if cuda_ready else "cpu"
29
+
30
+
31
def get_whisperx_runtime(preferred_device: str = "auto") -> tuple[str, str]:
    """Return the ``(device, compute_type)`` pair used to load WhisperX.

    The device comes from ``get_device``; the compute type is ``"float16"``
    on CUDA and ``"float32"`` otherwise.
    """
    device = get_device(preferred_device)
    compute_type = "float16" if device == "cuda" else "float32"
    return device, compute_type
36
+
37
+
38
def configure_runtime_noise() -> None:
    """Quieten chatty third-party loggers and one known warning.

    Sets a fixed list of WhisperX / Lightning loggers to ``ERROR`` level and
    installs a warnings filter for the TF32 notice emitted from
    ``pyannote.audio.utils.reproducibility``.
    """
    noisy_loggers = (
        "whisperx",
        "whisperx.asr",
        "whisperx.vads",
        "whisperx.vads.pyannote",
        "lightning",
        "lightning.pytorch",
        "lightning.pytorch.utilities.migration",
        "lightning.pytorch.utilities.migration.utils",
        "pytorch_lightning",
        "pytorch_lightning.utilities.migration",
        "pytorch_lightning.utilities.migration.utils",
    )
    for name in noisy_loggers:
        logging.getLogger(name).setLevel(logging.ERROR)

    warnings.filterwarnings(
        "ignore",
        message=r"TensorFloat-32 \(TF32\) has been disabled.*",
        module=r"pyannote\.audio\.utils\.reproducibility",
    )
59
+
60
+
61
def release_memory() -> None:
    """Run the garbage collector and, when CUDA is available, free its caches."""
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()
@@ -0,0 +1,256 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ import shutil
5
+ from pathlib import Path
6
+ from typing import Callable
7
+
8
+ from faster_whisper.tokenizer import _LANGUAGE_CODES
9
+ import whisperx
10
+
11
+ from subtatix.errors import SubtatixError
12
+ from subtatix.runtime import get_whisperx_runtime, release_memory
13
+
14
+
15
+ DEFAULT_MODEL = "large-v2"
16
+ SUPPORTED_SOURCE_LANGUAGE_CODES = tuple(sorted(_LANGUAGE_CODES))
17
+
18
+
19
@dataclass(frozen=True)
class SubtitleCue:
    """One immutable subtitle entry: a time span and the text shown during it."""

    # Start and end are passed to format_srt_timestamp as seconds.
    start: float
    end: float
    text: str
24
+
25
+
26
@dataclass(frozen=True)
class SubtitleDocument:
    """Result of a transcription: language, target SRT path and the cues."""

    # Language code reported by (or passed to) the transcription step.
    source_language: str
    # Path of the original-language SRT file (it may not have been written).
    subtitle_path: Path
    cues: list[SubtitleCue]
31
+
32
+
33
def require_ffmpeg() -> None:
    """Ensure an ``ffmpeg`` executable can be found on ``PATH``.

    Raises:
        SubtatixError: if ``shutil.which`` cannot locate ``ffmpeg``.
    """
    ffmpeg_path = shutil.which("ffmpeg")
    if ffmpeg_path is not None:
        return
    raise SubtatixError(
        "ffmpeg is required but was not found on PATH. Install ffmpeg and try again."
    )
38
+
39
+
40
def normalize_source_language(language: str) -> str:
    """Return ``language`` trimmed and lower-cased if it is a supported code.

    Raises:
        SubtatixError: if the normalised code is not listed in
            ``SUPPORTED_SOURCE_LANGUAGE_CODES``.
    """
    candidate = language.strip().lower()
    if candidate not in SUPPORTED_SOURCE_LANGUAGE_CODES:
        raise SubtatixError(
            f"Unsupported source language '{language}'. Use a Whisper language code like "
            "'en', 'es', or 'fr'."
        )
    return candidate
49
+
50
+
51
+ def resolve_output_path(input_file: Path, output_file: Path | None) -> Path:
52
+ default_output_path = input_file.with_suffix(".srt")
53
+ if output_file is None:
54
+ return default_output_path
55
+
56
+ output_file = output_file.expanduser()
57
+ if output_file.exists() and output_file.is_dir():
58
+ return (output_file / default_output_path.name).resolve()
59
+ if output_file.suffix == ".srt":
60
+ raise SubtatixError(
61
+ "Output paths must not end in '.srt'. Pass a base output path like "
62
+ "'some-path/some-file-name' so Subtatix can write '.srt' and translated variants like "
63
+ "'.es.srt'."
64
+ )
65
+ return output_file.with_suffix(".srt").resolve()
66
+
67
+
68
def format_srt_timestamp(seconds: float) -> str:
    """Format a time in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded to whole milliseconds; negative inputs clamp to
    zero. Hours are not capped at two digits.
    """
    total_ms = round(seconds * 1000)
    if total_ms < 0:
        total_ms = 0
    whole_seconds, millis = divmod(total_ms, 1000)
    whole_minutes, secs = divmod(whole_seconds, 60)
    hours, minutes = divmod(whole_minutes, 60)
    return f"{hours:02}:{minutes:02}:{secs:02},{millis:03}"
74
+
75
+
76
def build_cues(aligned_transcription: dict) -> list[SubtitleCue]:
    """Convert the ``"segments"`` of a transcription into subtitle cues.

    Each segment's ``"text"`` is stripped; segments whose text is empty after
    stripping are dropped. ``"start"`` and ``"end"`` are converted to float.
    """
    return [
        SubtitleCue(
            start=float(segment["start"]),
            end=float(segment["end"]),
            text=cleaned,
        )
        for segment in aligned_transcription["segments"]
        if (cleaned := segment["text"].strip())
    ]
90
+
91
+
92
def write_srt(subtitle_path: Path, cues: list[SubtitleCue]) -> None:
    """Write ``cues`` to ``subtitle_path`` as a UTF-8 SRT file.

    Parent directories are created as needed. Cues are numbered from 1,
    separated by a blank line, and the file ends with a single newline.
    """
    subtitle_path.parent.mkdir(parents=True, exist_ok=True)

    def render(number: int, cue: SubtitleCue) -> str:
        # One SRT block: index line, time-range line, text.
        time_range = (
            f"{format_srt_timestamp(cue.start)} --> "
            f"{format_srt_timestamp(cue.end)}"
        )
        return "\n".join([str(number), time_range, cue.text])

    blocks = [render(number, cue) for number, cue in enumerate(cues, start=1)]
    subtitle_path.write_text("\n\n".join(blocks) + "\n", encoding="utf-8")
109
+
110
+
111
def is_cuda_oom(error: RuntimeError) -> bool:
    """Return True when the error message mentions both CUDA and out-of-memory.

    Matching is a case-insensitive substring test on ``str(error)``.
    """
    lowered = str(error).lower()
    return all(marker in lowered for marker in ("cuda", "out of memory"))
114
+
115
+
116
def iter_retry_batch_sizes(batch_size: int) -> list[int]:
    """Return the batch sizes to try, halving from ``batch_size`` down to 1.

    Values below 1 are treated as 1, so the result always ends with ``1`` and
    is never empty (e.g. ``8 -> [8, 4, 2, 1]``, ``5 -> [5, 2, 1]``).
    """
    size = batch_size if batch_size > 1 else 1
    schedule = [size]
    while size > 1:
        size //= 2
        schedule.append(size)
    return schedule
125
+
126
+
127
def transcribe_with_backoff(
    whisper_model: object,
    audio: object,
    batch_size: int,
    source_language: str | None,
    device: str,
    log: Callable[[str], None] | None = None,
    progress_callback: Callable[[float], None] | None = None,
    progress_reset: Callable[[], None] | None = None,
) -> tuple[dict, int]:
    """Transcribe ``audio``, retrying with smaller batches on CUDA OOM.

    Batch sizes come from ``iter_retry_batch_sizes``. A ``RuntimeError`` is
    retried only when running on ``"cuda"`` and ``is_cuda_oom`` recognises
    it; any other error propagates unchanged.

    Args:
        whisper_model: Object exposing ``transcribe(audio, batch_size=...,
            language=..., progress_callback=...)``.
        audio: Audio payload handed straight to ``transcribe``.
        batch_size: First batch size to try.
        source_language: Language passed to ``transcribe`` (``None`` is
            forwarded as-is).
        device: Device name; retries only happen for ``"cuda"``.
        log: Optional sink for status messages.
        progress_callback: Forwarded to ``transcribe``.
        progress_reset: Called before each retry so progress restarts.

    Returns:
        ``(transcription, batch_size_that_succeeded)``.

    Raises:
        SubtatixError: if every batch size ran out of CUDA memory.
    """
    attempts = iter_retry_batch_sizes(batch_size)
    last_error: RuntimeError | None = None
    for position, attempt_batch_size in enumerate(attempts):
        if log is not None:
            log(f"Starting transcription with batch size {attempt_batch_size}.")
        try:
            transcription = whisper_model.transcribe(
                audio,
                batch_size=attempt_batch_size,
                language=source_language,
                progress_callback=progress_callback,
            )
            return transcription, attempt_batch_size
        except RuntimeError as error:
            retryable = device == "cuda" and is_cuda_oom(error)
            if not retryable:
                raise
            last_error = error
            # Sizes in `attempts` are unique, so `position` identifies the
            # current attempt and `position + 1` the next one, if any.
            upcoming = position + 1
            will_retry = upcoming < len(attempts)
            if will_retry and log is not None:
                log(
                    "CUDA out of memory at batch size "
                    f"{attempt_batch_size}; retrying with batch size "
                    f"{attempts[upcoming]}."
                )
            if will_retry and progress_reset is not None:
                progress_reset()
            release_memory()
    assert last_error is not None
    raise SubtatixError(
        "CUDA ran out of memory during transcription even after retrying with "
        f"smaller batch sizes {attempts}. Retry with --device cpu, a smaller "
        "--model, or a lower --batch-size."
    ) from last_error
170
+
171
+
172
def transcribe_to_srt(
    input_file: Path,
    model_name: str = DEFAULT_MODEL,
    batch_size: int = 8,
    output_file: Path | None = None,
    write_output: bool = True,
    source_language: str | None = None,
    device_preference: str = "auto",
    log: Callable[[str], None] | None = None,
    progress_callback: Callable[[float], None] | None = None,
    progress_reset: Callable[[], None] | None = None,
) -> SubtitleDocument:
    """Transcribe a media file with WhisperX and build aligned subtitle cues.

    Steps: validate the input/output paths and source language, load the
    Whisper model, transcribe (with CUDA out-of-memory back-off), load the
    alignment model for the resulting language, align, then build the cues
    and optionally write them as SRT.

    Args:
        input_file: Audio or video file to transcribe.
        model_name: Whisper model name passed to ``whisperx.load_model``.
        batch_size: Initial transcription batch size.
        output_file: Optional base path or directory; see ``resolve_output_path``.
        write_output: When False the SRT file is not written, but the
            returned document still carries the path it would have used.
        source_language: Optional language code; ``None`` lets the model's
            reported language be used.
        device_preference: ``"auto"``, ``"cpu"`` or ``"cuda"``.
        log: Optional sink for status messages.
        progress_callback: Forwarded to the transcription call.
        progress_reset: Invoked before a back-off retry.

    Returns:
        A ``SubtitleDocument`` with the language, output path and cues.

    Raises:
        SubtatixError: for a missing input file, or as raised by the path,
            language, device and back-off helpers.
    """
    input_file = input_file.expanduser().resolve()
    if not input_file.is_file():
        raise SubtatixError(f"Input file not found: {input_file}")
    output_path = resolve_output_path(input_file, output_file)

    normalized_source_language = None
    if source_language is not None:
        normalized_source_language = normalize_source_language(source_language)

    device, compute_type = get_whisperx_runtime(device_preference)
    if log is not None:
        log(
            f"Using device: {device} (compute_type={compute_type}, "
            f"requested={device_preference}, initial_batch_size={batch_size})."
        )

    # Pre-bind every heavyweight local so the `finally` block can `del` them
    # no matter how far the pipeline got before failing.
    whisper_model = None
    align_model = None
    audio = None
    transcription = None
    aligned_transcription = None

    try:
        whisper_model = whisperx.load_model(
            model_name,
            device,
            compute_type=compute_type,
            language=normalized_source_language,
        )
        audio = whisperx.load_audio(str(input_file))

        transcription, _ = transcribe_with_backoff(
            whisper_model=whisper_model,
            audio=audio,
            batch_size=batch_size,
            source_language=normalized_source_language,
            device=device,
            log=log,
            progress_callback=progress_callback,
            progress_reset=progress_reset,
        )
        language = transcription["language"]
        language_was_detected = normalized_source_language is None
        if language_was_detected and log is not None:
            log(f"Auto-detected source language: {language}.")

        align_model, align_metadata = whisperx.load_align_model(
            language_code=language,
            device=device,
        )
        aligned_transcription = whisperx.align(
            transcription["segments"],
            align_model,
            align_metadata,
            audio,
            device,
            return_char_alignments=False,
        )
        aligned_transcription["language"] = language

        cues = build_cues(aligned_transcription)
        if write_output:
            write_srt(output_path, cues)
        return SubtitleDocument(
            source_language=language,
            subtitle_path=output_path,
            cues=cues,
        )
    finally:
        # Drop the local references before asking the runtime to reclaim memory.
        del whisper_model
        del align_model
        del audio
        del transcription
        del aligned_transcription
        release_memory()
@@ -0,0 +1,151 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ import math
5
+ from pathlib import Path
6
+ from typing import Callable
7
+
8
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
9
+
10
+ from subtatix.errors import SubtatixError
11
+ from subtatix.runtime import get_device, release_memory
12
+ from subtatix.subtitles import SubtitleCue, SubtitleDocument, write_srt
13
+
14
+
15
+ DEFAULT_TRANSLATION_MODEL = "facebook/nllb-200-1.3B"
16
+ DEFAULT_TRANSLATION_BATCH_SIZE = 16
17
+
18
+
19
@dataclass(frozen=True)
class LanguageSpec:
    """Immutable pairing of an NLLB language code with a filename suffix."""

    # Code handed to the translation tokenizer/model, e.g. "spa_Latn".
    nllb_code: str
    # Short tag inserted before ".srt" in translated filenames, e.g. "es".
    suffix: str
23
+
24
+
25
# Convenience target codes accepted by --to, mapped to their NLLB codes.
# The short code doubles as the filename suffix of the translated SRT.
LANGUAGE_SPECS = {
    short_code: LanguageSpec(nllb_code, short_code)
    for short_code, nllb_code in (
        ("ca", "cat_Latn"),
        ("de", "deu_Latn"),
        ("en", "eng_Latn"),
        ("es", "spa_Latn"),
        ("fr", "fra_Latn"),
        ("it", "ita_Latn"),
        ("ja", "jpn_Jpan"),
        ("ko", "kor_Hang"),
        ("nl", "nld_Latn"),
        ("pt", "por_Latn"),
        ("ru", "rus_Cyrl"),
        ("zh", "zho_Hans"),
    )
}
SUPPORTED_TARGET_LANGUAGE_CODES = tuple(sorted(LANGUAGE_SPECS))

# Process-wide caches: loaded (tokenizer, model, device) per model name, and
# the sorted NLLB language codes read from the tokenizer.
_TRANSLATION_MODELS: dict[str, tuple[object, object, str]] = {}
_NLLB_LANGUAGE_CODES: tuple[str, ...] | None = None
43
+
44
+
45
def resolve_language(language: str) -> LanguageSpec:
    """Map a user-supplied language value to a ``LanguageSpec``.

    A value whose lower-cased form is a key of ``LANGUAGE_SPECS`` returns the
    mapped spec. Otherwise a value shaped like ``xxx_Something`` (three
    lower-case letters before the first underscore) is accepted as a raw NLLB
    code, with the three-letter prefix used as the filename suffix.

    Raises:
        SubtatixError: if the value matches neither form.
    """
    raw_language = language.strip()
    known = LANGUAGE_SPECS.get(raw_language.lower())
    if known is not None:
        return known

    prefix, separator, _script = raw_language.partition("_")
    looks_like_nllb = bool(separator) and len(prefix) == 3 and prefix.islower()
    if looks_like_nllb:
        return LanguageSpec(raw_language, prefix)

    raise SubtatixError(
        f"Unsupported language '{language}'. Use a Whisper language code like 'es' "
        "or a full NLLB language code like 'spa_Latn'."
    )
59
+
60
+
61
def resolve_translation_output_path(
    subtitle_path: Path,
    target_language: LanguageSpec,
) -> Path:
    """Derive the translated SRT path, e.g. ``clip.srt`` -> ``clip.es.srt``.

    Raises:
        SubtatixError: if ``subtitle_path`` does not end in ``.srt``.
    """
    extension = subtitle_path.suffix
    if extension != ".srt":
        raise SubtatixError(f"Expected an .srt subtitle file, got: {subtitle_path}")
    # Insert the language tag between the stem and the ".srt" extension.
    translated_name = ".".join([subtitle_path.stem, target_language.suffix]) + extension
    return subtitle_path.with_name(translated_name)
70
+
71
+
72
def get_translation_backend(
    model_name: str = DEFAULT_TRANSLATION_MODEL,
) -> tuple[object, object, str]:
    """Return the cached ``(tokenizer, model, device)`` for ``model_name``.

    On first use for a given name, memory is released, the tokenizer and
    seq2seq model are loaded, and the model is moved to the device chosen by
    ``get_device()``; the triple is then cached in ``_TRANSLATION_MODELS``.
    """
    backend = _TRANSLATION_MODELS.get(model_name)
    if backend is None:
        # Free whatever earlier stages left behind before loading the model.
        release_memory()
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
        device = get_device()
        backend = (tokenizer, model.to(device), device)
        _TRANSLATION_MODELS[model_name] = backend
    return backend
82
+
83
+
84
def get_available_nllb_languages(
    model_name: str = DEFAULT_TRANSLATION_MODEL,
) -> tuple[str, ...]:
    """Return the sorted language codes exposed by a model's tokenizer.

    The codes are read from the tokenizer's ``additional_special_tokens``.

    The module-level cache has a single slot, so it is used only for the
    default model. Previously the cache ignored ``model_name``: after one
    call, every later call returned the first model's codes regardless of
    which model was asked for. Non-default models are now read from their
    own tokenizer on each call.

    Args:
        model_name: Hugging Face model identifier whose tokenizer is loaded.

    Returns:
        A sorted tuple of language-code strings.
    """
    global _NLLB_LANGUAGE_CODES
    if model_name != DEFAULT_TRANSLATION_MODEL:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        return tuple(sorted(tokenizer.additional_special_tokens))
    if _NLLB_LANGUAGE_CODES is None:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        _NLLB_LANGUAGE_CODES = tuple(sorted(tokenizer.additional_special_tokens))
    return _NLLB_LANGUAGE_CODES
92
+
93
+
94
def translate_batch(
    texts: list[str],
    src_lang: str,
    tgt_lang: str,
    model_name: str = DEFAULT_TRANSLATION_MODEL,
    max_length: int = 400,
) -> list[str]:
    """Translate a batch of strings with the cached seq2seq model.

    Args:
        texts: Strings to translate; an empty list returns ``[]`` without
            loading the model.
        src_lang: Source language code assigned to the tokenizer.
        tgt_lang: Target language code; its token id is forced as the first
            generated token.
        model_name: Hugging Face model identifier.
        max_length: Maximum generated sequence length.

    Returns:
        The decoded translations, in the same order as ``texts``.

    Raises:
        SubtatixError: if ``tgt_lang`` is not a token the tokenizer knows.
    """
    if not texts:
        return []

    tokenizer, model, device = get_translation_backend(model_name)

    # convert_tokens_to_ids() maps an unknown token to the tokenizer's
    # unknown-token id instead of raising, so a mistyped raw code such as
    # "xyz_Latn" would silently be used as the forced first token. Reject it.
    target_token_id = tokenizer.convert_tokens_to_ids(tgt_lang)
    unknown_token_id = getattr(tokenizer, "unk_token_id", None)
    if unknown_token_id is not None and target_token_id == unknown_token_id:
        raise SubtatixError(
            f"Unsupported target language code '{tgt_lang}' for translation model "
            f"'{model_name}'. Use --list-target-languages to inspect supported values."
        )

    tokenizer.src_lang = src_lang
    inputs = tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
    ).to(device)
    translated_tokens = model.generate(
        **inputs,
        forced_bos_token_id=target_token_id,
        max_length=max_length,
    )
    return tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
115
+
116
+
117
def translate_subtitles(
    document: SubtitleDocument,
    target_language: str,
    model_name: str = DEFAULT_TRANSLATION_MODEL,
    batch_size: int = DEFAULT_TRANSLATION_BATCH_SIZE,
    max_length: int = 400,
    progress_callback: Callable[[int, int], None] | None = None,
) -> Path:
    """Translate every cue of ``document`` and write the translated SRT file.

    Cue timings are kept; only the text is replaced. Cues are translated in
    slices of ``batch_size`` and, after each slice, ``progress_callback`` is
    called with ``(batch_number, total_batches)`` (batch numbers start at 1).

    Returns:
        Path of the translated SRT file, derived from the document's
        subtitle path and the target language suffix.

    Raises:
        SubtatixError: if the source or target language cannot be resolved,
            or the document's subtitle path does not end in ``.srt``.
    """
    source = resolve_language(document.source_language)
    target = resolve_language(target_language)
    output_path = resolve_translation_output_path(document.subtitle_path, target)

    cues = document.cues
    # Reported to the callback; at least 1 even when there are no cues.
    total_batches = max(1, math.ceil(len(cues) / batch_size))

    translated_texts: list[str] = []
    batch_starts = range(0, len(cues), batch_size)
    for batch_number, offset in enumerate(batch_starts, start=1):
        chunk_texts = [cue.text for cue in cues[offset : offset + batch_size]]
        translated_texts += translate_batch(
            chunk_texts,
            src_lang=source.nllb_code,
            tgt_lang=target.nllb_code,
            model_name=model_name,
            max_length=max_length,
        )
        if progress_callback is not None:
            progress_callback(batch_number, total_batches)

    # strict=True turns a cue/translation count mismatch into an error
    # instead of silently dropping cues.
    translated_cues = [
        SubtitleCue(start=original.start, end=original.end, text=translated)
        for original, translated in zip(cues, translated_texts, strict=True)
    ]
    write_srt(output_path, translated_cues)
    return output_path