subtatix 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- subtatix-0.1.0/PKG-INFO +129 -0
- subtatix-0.1.0/README.md +98 -0
- subtatix-0.1.0/pyproject.toml +44 -0
- subtatix-0.1.0/src/subtatix/__init__.py +1 -0
- subtatix-0.1.0/src/subtatix/__main__.py +5 -0
- subtatix-0.1.0/src/subtatix/cli.py +273 -0
- subtatix-0.1.0/src/subtatix/errors.py +5 -0
- subtatix-0.1.0/src/subtatix/runtime.py +65 -0
- subtatix-0.1.0/src/subtatix/subtitles.py +256 -0
- subtatix-0.1.0/src/subtatix/translation.py +151 -0
subtatix-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: subtatix
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: CLI for generating and translating SRT subtitles with WhisperX
|
|
5
|
+
Keywords: cli,subtitles,srt,whisperx,transcription,translation
|
|
6
|
+
Author: Chris Paganon
|
|
7
|
+
Author-email: Chris Paganon <info@chrispaganon.com>
|
|
8
|
+
License-Expression: BSD-2-Clause
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Environment :: Console
|
|
11
|
+
Classifier: Intended Audience :: End Users/Desktop
|
|
12
|
+
Classifier: License :: OSI Approved :: BSD License
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Programming Language :: Python
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
|
|
18
|
+
Classifier: Topic :: Text Processing
|
|
19
|
+
Classifier: Topic :: Utilities
|
|
20
|
+
Requires-Dist: accelerate>=1.13.0
|
|
21
|
+
Requires-Dist: sentencepiece>=0.2.1
|
|
22
|
+
Requires-Dist: torch>=2.8.0
|
|
23
|
+
Requires-Dist: transformers>=4.57.6
|
|
24
|
+
Requires-Dist: typer>=0.25.1
|
|
25
|
+
Requires-Dist: whisperx>=3.8.5
|
|
26
|
+
Requires-Python: >=3.12
|
|
27
|
+
Project-URL: Homepage, https://github.com/chris-paganon/subtatix
|
|
28
|
+
Project-URL: Repository, https://github.com/chris-paganon/subtatix
|
|
29
|
+
Project-URL: Issues, https://github.com/chris-paganon/subtatix/issues
|
|
30
|
+
Description-Content-Type: text/markdown
|
|
31
|
+
|
|
32
|
+
# Subtatix
|
|
33
|
+
|
|
34
|
+
`Subtatix` is a small CLI for generating `.srt` subtitles from audio or video files with [WhisperX](https://github.com/m-bain/whisperX), with optional subtitle translation.
|
|
35
|
+
|
|
36
|
+
It transcribes the input, aligns subtitle timings with WhisperX, and can then translate the resulting subtitle lines into another language.
|
|
37
|
+
|
|
38
|
+
## Requirements
|
|
39
|
+
|
|
40
|
+
- Python 3.12+
|
|
41
|
+
- `ffmpeg` installed separately and available on your `PATH`
|
|
42
|
+
- Enough disk space for model downloads and caching
|
|
43
|
+
|
|
44
|
+
The first run will be slower because WhisperX and translation models need to be downloaded. Subsequent runs reuse the cached models and do not need to download them again unless the cache is cleared.
|
|
45
|
+
|
|
46
|
+
`ffmpeg` is an external system dependency. It is not installed by `pip`, `uvx`, or `uv tool install`.
|
|
47
|
+
|
|
48
|
+
## Installation
|
|
49
|
+
|
|
50
|
+
Run without installing:
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
uvx subtatix --help
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
Install as a tool with `uv`:
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
uv tool install subtatix
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
Install with `pip`:
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
pip install subtatix
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
## Usage
|
|
69
|
+
|
|
70
|
+
Run the CLI:
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
subtatix input.mp4
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
Transcribe to a specific output path:
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
subtatix input.mp4 --output some-path/some-file-name
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
`--output` is a base path, not a full `.srt` filename. This writes `some-path/some-file-name.srt`. If you also translate to Spanish, it writes `some-path/some-file-name.es.srt`.
|
|
83
|
+
|
|
84
|
+
Set the source language explicitly:
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
subtatix input.mp4 --source-language en
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
Translate after transcription:
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
subtatix input.mp4 --to es
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
This writes both the original transcription SRT and the translated SRT by default.
|
|
97
|
+
|
|
98
|
+
If CUDA runs out of memory on larger files, reduce the batch size or force CPU mode:
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
subtatix input.mp4 --batch-size 4
|
|
102
|
+
subtatix input.mp4 --device cpu
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
To discard the original transcription and only keep the translated output:
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
subtatix input.mp4 --to es --discard-transcription
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
Passing an `--output` value that ends in `.srt` is rejected. Use a base path such as `--output subtitles` instead.
|
|
112
|
+
|
|
113
|
+
List supported language codes:
|
|
114
|
+
|
|
115
|
+
```bash
|
|
116
|
+
subtatix --list-languages
|
|
117
|
+
subtatix --list-target-languages
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
## Models
|
|
121
|
+
|
|
122
|
+
By default, transcription uses WhisperX with the Whisper model `large-v2`. This is a good general default when you want higher transcription quality and aligned subtitle timings, but it is heavier and slower than smaller Whisper models.
|
|
123
|
+
|
|
124
|
+
Translation uses `facebook/nllb-200-1.3B`. The CLI accepts simple target codes such as `en`, `es`, `fr`, `de`, `pt`, `ja`, `ko`, `zh`, and also raw NLLB codes such as `spa_Latn`.
|
|
125
|
+
|
|
126
|
+
Other model options can also be used:
|
|
127
|
+
|
|
128
|
+
- For transcription, you can pass another Whisper model with `--model`, such as `small`, `medium`, or `large-v3`, depending on your speed and accuracy needs.
|
|
129
|
+
- For translation, the code currently defaults to the NLLB model above, but the translation layer is built around Hugging Face seq2seq models and could be adapted to use a different multilingual translation model if needed.
|
subtatix-0.1.0/README.md
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
# Subtatix
|
|
2
|
+
|
|
3
|
+
`Subtatix` is a small CLI for generating `.srt` subtitles from audio or video files with [WhisperX](https://github.com/m-bain/whisperX), with optional subtitle translation.
|
|
4
|
+
|
|
5
|
+
It transcribes the input, aligns subtitle timings with WhisperX, and can then translate the resulting subtitle lines into another language.
|
|
6
|
+
|
|
7
|
+
## Requirements
|
|
8
|
+
|
|
9
|
+
- Python 3.12+
|
|
10
|
+
- `ffmpeg` installed separately and available on your `PATH`
|
|
11
|
+
- Enough disk space for model downloads and caching
|
|
12
|
+
|
|
13
|
+
The first run will be slower because WhisperX and translation models need to be downloaded. Subsequent runs reuse the cached models and do not need to download them again unless the cache is cleared.
|
|
14
|
+
|
|
15
|
+
`ffmpeg` is an external system dependency. It is not installed by `pip`, `uvx`, or `uv tool install`.
|
|
16
|
+
|
|
17
|
+
## Installation
|
|
18
|
+
|
|
19
|
+
Run without installing:
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
uvx subtatix --help
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
Install as a tool with `uv`:
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
uv tool install subtatix
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
Install with `pip`:
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
pip install subtatix
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Usage
|
|
38
|
+
|
|
39
|
+
Run the CLI:
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
subtatix input.mp4
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
Transcribe to a specific output path:
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
subtatix input.mp4 --output some-path/some-file-name
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
`--output` is a base path, not a full `.srt` filename. This writes `some-path/some-file-name.srt`. If you also translate to Spanish, it writes `some-path/some-file-name.es.srt`.
|
|
52
|
+
|
|
53
|
+
Set the source language explicitly:
|
|
54
|
+
|
|
55
|
+
```bash
|
|
56
|
+
subtatix input.mp4 --source-language en
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
Translate after transcription:
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
subtatix input.mp4 --to es
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
This writes both the original transcription SRT and the translated SRT by default.
|
|
66
|
+
|
|
67
|
+
If CUDA runs out of memory on larger files, reduce the batch size or force CPU mode:
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
subtatix input.mp4 --batch-size 4
|
|
71
|
+
subtatix input.mp4 --device cpu
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
To discard the original transcription and only keep the translated output:
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
subtatix input.mp4 --to es --discard-transcription
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
Passing an `--output` value that ends in `.srt` is rejected. Use a base path such as `--output subtitles` instead.
|
|
81
|
+
|
|
82
|
+
List supported language codes:
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
subtatix --list-languages
|
|
86
|
+
subtatix --list-target-languages
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
## Models
|
|
90
|
+
|
|
91
|
+
By default, transcription uses WhisperX with the Whisper model `large-v2`. This is a good general default when you want higher transcription quality and aligned subtitle timings, but it is heavier and slower than smaller Whisper models.
|
|
92
|
+
|
|
93
|
+
Translation uses `facebook/nllb-200-1.3B`. The CLI accepts simple target codes such as `en`, `es`, `fr`, `de`, `pt`, `ja`, `ko`, `zh`, and also raw NLLB codes such as `spa_Latn`.
|
|
94
|
+
|
|
95
|
+
Other model options can also be used:
|
|
96
|
+
|
|
97
|
+
- For transcription, you can pass another Whisper model with `--model`, such as `small`, `medium`, or `large-v3`, depending on your speed and accuracy needs.
|
|
98
|
+
- For translation, the code currently defaults to the NLLB model above, but the translation layer is built around Hugging Face seq2seq models and could be adapted to use a different multilingual translation model if needed.
|
subtatix-0.1.0/pyproject.toml
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "subtatix"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "CLI for generating and translating SRT subtitles with WhisperX"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
license = "BSD-2-Clause"
|
|
7
|
+
authors = [
|
|
8
|
+
{ name = "Chris Paganon", email = "info@chrispaganon.com" }
|
|
9
|
+
]
|
|
10
|
+
requires-python = ">=3.12"
|
|
11
|
+
keywords = ["cli", "subtitles", "srt", "whisperx", "transcription", "translation"]
|
|
12
|
+
classifiers = [
|
|
13
|
+
"Development Status :: 3 - Alpha",
|
|
14
|
+
"Environment :: Console",
|
|
15
|
+
"Intended Audience :: End Users/Desktop",
|
|
16
|
+
"License :: OSI Approved :: BSD License",
|
|
17
|
+
"Operating System :: OS Independent",
|
|
18
|
+
"Programming Language :: Python",
|
|
19
|
+
"Programming Language :: Python :: 3",
|
|
20
|
+
"Programming Language :: Python :: 3.12",
|
|
21
|
+
"Topic :: Multimedia :: Sound/Audio :: Speech",
|
|
22
|
+
"Topic :: Text Processing",
|
|
23
|
+
"Topic :: Utilities",
|
|
24
|
+
]
|
|
25
|
+
dependencies = [
|
|
26
|
+
"accelerate>=1.13.0",
|
|
27
|
+
"sentencepiece>=0.2.1",
|
|
28
|
+
"torch>=2.8.0",
|
|
29
|
+
"transformers>=4.57.6",
|
|
30
|
+
"typer>=0.25.1",
|
|
31
|
+
"whisperx>=3.8.5",
|
|
32
|
+
]
|
|
33
|
+
|
|
34
|
+
[project.urls]
|
|
35
|
+
Homepage = "https://github.com/chris-paganon/subtatix"
|
|
36
|
+
Repository = "https://github.com/chris-paganon/subtatix"
|
|
37
|
+
Issues = "https://github.com/chris-paganon/subtatix/issues"
|
|
38
|
+
|
|
39
|
+
[project.scripts]
|
|
40
|
+
subtatix = "subtatix.__main__:main"
|
|
41
|
+
|
|
42
|
+
[build-system]
|
|
43
|
+
requires = ["uv_build>=0.11.13,<0.12.0"]
|
|
44
|
+
build-backend = "uv_build"
|
subtatix-0.1.0/src/subtatix/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__all__ = []
|
subtatix-0.1.0/src/subtatix/cli.py
ADDED
|
@@ -0,0 +1,273 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Annotated
|
|
6
|
+
|
|
7
|
+
import typer
|
|
8
|
+
from click.exceptions import ClickException
|
|
9
|
+
|
|
10
|
+
from subtatix.errors import SubtatixError
|
|
11
|
+
|
|
12
|
+
from subtatix.subtitles import (
|
|
13
|
+
DEFAULT_MODEL,
|
|
14
|
+
SUPPORTED_SOURCE_LANGUAGE_CODES,
|
|
15
|
+
require_ffmpeg,
|
|
16
|
+
transcribe_to_srt,
|
|
17
|
+
)
|
|
18
|
+
from subtatix.runtime import configure_runtime_noise
|
|
19
|
+
from subtatix.translation import (
|
|
20
|
+
DEFAULT_TRANSLATION_BATCH_SIZE,
|
|
21
|
+
SUPPORTED_TARGET_LANGUAGE_CODES,
|
|
22
|
+
get_available_nllb_languages,
|
|
23
|
+
translate_subtitles,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
# Description passed to the Typer application object, assembled sentence by
# sentence so each statement about the tool can be edited on its own line.
_APP_HELP = " ".join(
    (
        "Transcribe an audio or video file to SRT with WhisperX.",
        "Without --to, the tool only transcribes.",
        "Passing --to also translates the subtitles and keeps the original "
        "transcribed SRT unless --discard-transcription is used.",
        "Output names are generated as '.srt' and translated variants like "
        "'.es.srt'.",
        "Use the language listing flags to inspect supported source, mapped "
        "target, and raw NLLB codes.",
    )
)

# Shell-completion options are disabled and "-h" is accepted next to "--help".
app = typer.Typer(
    add_completion=False,
    context_settings={"help_option_names": ["-h", "--help"]},
    help=_APP_HELP,
)
|
|
39
|
+
|
|
40
|
+
class ProgressBar:
    """Single-line text progress bar written to ``sys.stderr``.

    The bar is redrawn in place using a carriage return. It is meant to be
    used as a context manager: leaving the ``with`` block without an error
    completes the bar, and a trailing newline is emitted if anything was drawn.
    """

    def __init__(self, label: str, total: int, width: int = 28) -> None:
        """Store the label, the step count (at least 1) and the bar width."""
        self._label = label
        # Clamp to 1 so the ratio computed in _render never divides by zero.
        self._total = max(1, total)
        self._width = width
        self._current = 0
        # True once the bar has been drawn on the current terminal line.
        self._visible = False

    def __enter__(self) -> "ProgressBar":
        return self

    def __exit__(self, exc_type: object, exc: object, tb: object) -> None:
        """Finish the bar on a clean exit and terminate the bar's line."""
        finished_cleanly = exc_type is None
        if finished_cleanly and self._current < self._total:
            self._current = self._total
            self._render()
        if not self._visible:
            return
        sys.stderr.write("\n")
        sys.stderr.flush()

    def update_to(self, value: int) -> None:
        """Move the bar to ``value`` (clamped to ``0..total``) and redraw on change."""
        clamped = max(0, min(value, self._total))
        if clamped == self._current:
            return
        self._current = clamped
        self._render()

    def reset(self) -> None:
        """Blank the bar's line if progress was made, then start over from zero."""
        had_progress = self._current != 0
        if had_progress:
            self._clear_line()
        self._current, self._visible = 0, False

    def write_message(self, message: str) -> None:
        """Print ``message`` on its own line, redrawing an unfinished bar after it."""
        bar_on_screen = self._visible
        if bar_on_screen:
            # Move off the bar's line so the message does not overwrite it.
            sys.stderr.write("\n")
        sys.stderr.write(f"{message}\n")
        sys.stderr.flush()
        if bar_on_screen and self._current < self._total:
            self._render()

    def _render(self) -> None:
        """Draw the bar and right-aligned percentage at the start of the line."""
        ratio = self._current / self._total
        filled = round(ratio * self._width)
        bar = "#" * filled + "." * (self._width - filled)
        percent = str(round(ratio * 100)).rjust(3)
        self._visible = True
        sys.stderr.write(f"\r{self._label} [{bar}] {percent}%")
        sys.stderr.flush()

    def _clear_line(self) -> None:
        """Overwrite the bar's line with spaces and return the cursor to column 0."""
        blank_width = self._width + len(self._label) + 10
        sys.stderr.write(f"\r{' ' * blank_width}\r")
        sys.stderr.flush()
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
@app.command()
def run(
    input_file: Annotated[
        Path | None,
        typer.Argument(help="Path to the input audio or video file."),
    ] = None,
    target_language: Annotated[
        str | None,
        typer.Option(
            "--to",
            "--target-language",
            "-t",
            help=(
                "Translate to this language. Use one of the mapped Whisper target "
                "codes or a raw NLLB code like 'spa_Latn'. If omitted, the tool only "
                "transcribes. Use --list-target-languages to inspect supported values."
            ),
        ),
    ] = None,
    source_language: Annotated[
        str | None,
        typer.Option(
            "--source-language",
            "-s",
            help=(
                "Optional Whisper source language code to skip language detection, "
                "for example 'en', 'es', or 'fr'."
            ),
        ),
    ] = None,
    output: Annotated[
        Path | None,
        typer.Option(
            "--output",
            "-o",
            help=(
                "Base output path without a .srt suffix. Subtatix writes "
                "'.srt' and translated variants like '.es.srt'. If a directory is "
                "provided, the default filename based on the input file is used inside it."
            ),
        ),
    ] = None,
    model: Annotated[
        str,
        typer.Option(help=f"Whisper model name to use. Default: {DEFAULT_MODEL}."),
    ] = DEFAULT_MODEL,
    batch_size: Annotated[
        int,
        typer.Option(
            "--batch-size",
            help="Batch size for Whisper inference. Reduce this if you run out of GPU memory.",
        ),
    ] = 8,
    device: Annotated[
        str,
        typer.Option(
            "--device",
            help=(
                "Execution device for WhisperX: 'auto', 'cuda', or 'cpu'. "
                "Default: auto."
            ),
        ),
    ] = "auto",
    discard_transcription: Annotated[
        bool,
        typer.Option(
            "--discard-transcription",
            help=(
                "When used with --to, do not save the original untranslated SRT file. "
                "Without --to, the transcribed SRT is still written."
            ),
            is_flag=True,
        ),
    ] = False,
    list_languages: Annotated[
        bool,
        typer.Option(
            "--list-languages",
            help=(
                "List source Whisper codes and the convenience target codes. "
                "Use --list-target-languages for the full raw NLLB target list."
            ),
            is_flag=True,
        ),
    ] = False,
    list_source_languages: Annotated[
        bool,
        typer.Option(
            "--list-source-languages",
            help="List supported Whisper source language codes.",
            is_flag=True,
        ),
    ] = False,
    list_target_languages: Annotated[
        bool,
        typer.Option(
            "--list-target-languages",
            help=(
                "List the convenience target codes for --to, followed by the full raw "
                "NLLB target language codes accepted by --to."
            ),
            is_flag=True,
        ),
    ] = False,
) -> None:
    # CLI entry point: either print language listings, or transcribe
    # `input_file` to SRT and optionally translate the result.
    #
    # NOTE(review): documented with comments rather than a docstring on
    # purpose; Typer can surface a command's docstring as CLI help text, so
    # adding one could change `--help` output. Confirm before adding one.

    # --- Listing mode: print the requested code lists and stop. -------------
    if list_languages or list_source_languages or list_target_languages:
        if input_file is not None:
            raise typer.BadParameter(
                "INPUT_FILE cannot be used with language listing options."
            )
        if list_languages or list_source_languages:
            typer.echo("Source languages (Whisper codes):")
            typer.echo(", ".join(SUPPORTED_SOURCE_LANGUAGE_CODES))
            typer.echo()
        if list_languages or list_target_languages:
            typer.echo(
                "Convenience target languages (--to Whisper-style codes mapped to NLLB):"
            )
            typer.echo(", ".join(SUPPORTED_TARGET_LANGUAGE_CODES))
            typer.echo()
        if list_languages:
            typer.echo(
                "Use --list-target-languages to also show the full raw NLLB target list."
            )
        if list_target_languages:
            typer.echo("Raw NLLB target languages (--to NLLB codes):")
            typer.echo(", ".join(get_available_nllb_languages()))
        return

    # INPUT_FILE is declared optional only so the listing flags work alone.
    if input_file is None:
        raise typer.BadParameter("Missing argument 'INPUT_FILE'.")

    configure_runtime_noise()
    require_ffmpeg()

    # Decide once whether a translation will run and derive both the
    # "write the original SRT" choice and the translation step from that
    # single flag. Previously the first tested `is None` and the second
    # tested truthiness, so `--to "" --discard-transcription` transcribed the
    # whole file and then wrote nothing at all.
    translation_requested = bool(target_language)
    write_original_srt = not (translation_requested and discard_transcription)

    with ProgressBar("Transcription", 100) as transcription_progress:
        document = transcribe_to_srt(
            input_file=input_file,
            model_name=model,
            batch_size=batch_size,
            output_file=output,
            write_output=write_original_srt,
            source_language=source_language,
            device_preference=device,
            log=transcription_progress.write_message,
            progress_callback=lambda percent: transcription_progress.update_to(
                round(percent)
            ),
            progress_reset=transcription_progress.reset,
        )
    if write_original_srt:
        typer.echo(document.subtitle_path)

    if translation_requested:
        # Ceiling division: one progress step per translation batch, at least 1.
        total_batches = max(
            1,
            (len(document.cues) + DEFAULT_TRANSLATION_BATCH_SIZE - 1)
            // DEFAULT_TRANSLATION_BATCH_SIZE,
        )
        with ProgressBar("Translation", total_batches) as translation_progress:
            translated_path = translate_subtitles(
                document=document,
                target_language=target_language,
                progress_callback=lambda batch_index, total: translation_progress.update_to(
                    batch_index
                ),
            )
        typer.echo(translated_path)
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def main() -> int:
    """Run the CLI and translate its outcome into a process exit code.

    The Typer app is invoked with ``standalone_mode=False`` so that errors
    reach this function instead of terminating the interpreter.

    Returns:
        ``0`` on success, ``1`` for a ``SubtatixError`` or a user abort, the
        exception's own ``exit_code`` for any other ``ClickException``, or
        the integer code the app returned for an explicit exit.
    """
    # Imported locally so this block does not depend on a new file-level import.
    from click.exceptions import Abort

    try:
        # In non-standalone mode Click returns the exit code of an Exit
        # exception (for example after --help) instead of calling sys.exit().
        result = app(standalone_mode=False)
    except SubtatixError as error:
        # Present domain errors in the same format as Click's own errors.
        ClickException(str(error)).show()
        return 1
    except ClickException as error:
        error.show()
        return error.exit_code
    except Abort:
        # Click re-raises Abort (Ctrl-C / EOF) in non-standalone mode; report
        # it here so the user does not get a traceback.
        typer.echo("Aborted!", err=True)
        return 1
    # Propagate an integer exit code from the app; a command that returns
    # None (the normal case) maps to success.
    if isinstance(result, int) and not isinstance(result, bool):
        return result
    return 0
|
subtatix-0.1.0/src/subtatix/runtime.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import gc
|
|
4
|
+
import logging
|
|
5
|
+
import warnings
|
|
6
|
+
|
|
7
|
+
import torch
|
|
8
|
+
|
|
9
|
+
from subtatix.errors import SubtatixError
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def get_device(preferred: str = "auto") -> str:
    """Resolve a device preference to ``"cpu"`` or ``"cuda"``.

    Args:
        preferred: ``"auto"``, ``"cpu"`` or ``"cuda"``; surrounding whitespace
            and letter case are ignored.

    Returns:
        ``"cpu"`` or ``"cuda"``. ``"auto"`` picks CUDA when torch reports it
        as available and falls back to the CPU otherwise.

    Raises:
        SubtatixError: If the value is not one of the three accepted names,
            or if ``"cuda"`` is requested while CUDA is unavailable.
    """
    choice = preferred.strip().lower()

    if choice == "cpu":
        return "cpu"

    if choice == "auto":
        return "cuda" if torch.cuda.is_available() else "cpu"

    if choice == "cuda":
        if torch.cuda.is_available():
            return "cuda"
        raise SubtatixError(
            "CUDA was requested but is not available on this system."
        )

    raise SubtatixError(
        f"Unsupported device '{preferred}'. Use 'auto', 'cpu', or 'cuda'."
    )
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def get_whisperx_runtime(preferred_device: str = "auto") -> tuple[str, str]:
    """Return the ``(device, compute_type)`` pair for a device preference.

    The device comes from ``get_device``; the compute type is ``"float16"``
    on CUDA and ``"float32"`` on any other device.
    """
    device = get_device(preferred_device)
    compute_type = "float16" if device == "cuda" else "float32"
    return device, compute_type
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def configure_runtime_noise() -> None:
    """Raise selected third-party loggers to ERROR and hide one TF32 warning."""
    noisy_logger_names = (
        "whisperx",
        "whisperx.asr",
        "whisperx.vads",
        "whisperx.vads.pyannote",
        "lightning",
        "lightning.pytorch",
        "lightning.pytorch.utilities.migration",
        "lightning.pytorch.utilities.migration.utils",
        "pytorch_lightning",
        "pytorch_lightning.utilities.migration",
        "pytorch_lightning.utilities.migration.utils",
    )
    for name in noisy_logger_names:
        noisy_logger = logging.getLogger(name)
        noisy_logger.setLevel(logging.ERROR)

    # Only the TF32 notice issued from pyannote's reproducibility module is
    # suppressed; every other warning is left untouched.
    warnings.filterwarnings(
        action="ignore",
        message=r"TensorFloat-32 \(TF32\) has been disabled.*",
        module=r"pyannote\.audio\.utils\.reproducibility",
    )
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def release_memory() -> None:
    """Run a garbage collection pass and, when CUDA is available, flush its caches."""
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()
|
subtatix-0.1.0/src/subtatix/subtitles.py
ADDED
|
@@ -0,0 +1,256 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
import shutil
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Callable
|
|
7
|
+
|
|
8
|
+
from faster_whisper.tokenizer import _LANGUAGE_CODES
|
|
9
|
+
import whisperx
|
|
10
|
+
|
|
11
|
+
from subtatix.errors import SubtatixError
|
|
12
|
+
from subtatix.runtime import get_whisperx_runtime, release_memory
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
DEFAULT_MODEL = "large-v2"
|
|
16
|
+
SUPPORTED_SOURCE_LANGUAGE_CODES = tuple(sorted(_LANGUAGE_CODES))
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass(frozen=True)
class SubtitleCue:
    """One immutable subtitle entry: a time span and the text shown during it."""

    # Start and end are passed to format_srt_timestamp as seconds when written.
    start: float
    end: float
    # Text of the cue; build_cues stores it with surrounding whitespace stripped.
    text: str
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass(frozen=True)
class SubtitleDocument:
    """Immutable record tying a list of cues to their language and SRT path."""

    # Language code of the cue text.
    source_language: str
    # Path of the SRT file associated with these cues.
    subtitle_path: Path
    # The cues themselves. The dataclass is frozen, but this list object is
    # still mutable in place.
    cues: list[SubtitleCue]
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def require_ffmpeg() -> None:
    """Ensure an ``ffmpeg`` executable can be found on ``PATH``.

    Raises:
        SubtatixError: If ``shutil.which`` cannot locate ``ffmpeg``.
    """
    ffmpeg_location = shutil.which("ffmpeg")
    if ffmpeg_location is not None:
        return
    raise SubtatixError(
        "ffmpeg is required but was not found on PATH. Install ffmpeg and try again."
    )
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def normalize_source_language(language: str) -> str:
    """Return ``language`` trimmed and lower-cased if it is a supported code.

    Raises:
        SubtatixError: If the normalized value is not present in
            ``SUPPORTED_SOURCE_LANGUAGE_CODES``.
    """
    candidate = language.strip().lower()
    if candidate not in SUPPORTED_SOURCE_LANGUAGE_CODES:
        # The message echoes the caller's original spelling, not the normalized one.
        raise SubtatixError(
            f"Unsupported source language '{language}'. Use a Whisper language code like "
            "'en', 'es', or 'fr'."
        )
    return candidate
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def resolve_output_path(input_file: Path, output_file: Path | None) -> Path:
|
|
52
|
+
default_output_path = input_file.with_suffix(".srt")
|
|
53
|
+
if output_file is None:
|
|
54
|
+
return default_output_path
|
|
55
|
+
|
|
56
|
+
output_file = output_file.expanduser()
|
|
57
|
+
if output_file.exists() and output_file.is_dir():
|
|
58
|
+
return (output_file / default_output_path.name).resolve()
|
|
59
|
+
if output_file.suffix == ".srt":
|
|
60
|
+
raise SubtatixError(
|
|
61
|
+
"Output paths must not end in '.srt'. Pass a base output path like "
|
|
62
|
+
"'some-path/some-file-name' so Subtatix can write '.srt' and translated variants like "
|
|
63
|
+
"'.es.srt'."
|
|
64
|
+
)
|
|
65
|
+
return output_file.with_suffix(".srt").resolve()
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def format_srt_timestamp(seconds: float) -> str:
    """Format a time in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded to whole milliseconds and negative values are
    clamped to zero.
    """
    millis = max(0, round(seconds * 1000))
    # Peel units off from the smallest upwards.
    whole_seconds, ms = divmod(millis, 1000)
    whole_minutes, ss = divmod(whole_seconds, 60)
    hh, mm = divmod(whole_minutes, 60)
    return "{:02}:{:02}:{:02},{:03}".format(hh, mm, ss, ms)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def build_cues(aligned_transcription: dict) -> list[SubtitleCue]:
    """Turn the ``"segments"`` of a transcription result into subtitle cues.

    Each segment's ``"text"`` is stripped of surrounding whitespace; segments
    whose text is then empty are skipped. ``"start"`` and ``"end"`` are
    converted to ``float``.
    """
    stripped_segments = (
        (segment, segment["text"].strip())
        for segment in aligned_transcription["segments"]
    )
    return [
        SubtitleCue(
            start=float(segment["start"]),
            end=float(segment["end"]),
            text=cue_text,
        )
        for segment, cue_text in stripped_segments
        if cue_text
    ]
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def write_srt(subtitle_path: Path, cues: list[SubtitleCue]) -> None:
    """Write ``cues`` to ``subtitle_path`` as a UTF-8 SRT file.

    Missing parent directories are created. Cues are numbered from 1, blocks
    are separated by one blank line, and the file ends with a newline.
    """
    subtitle_path.parent.mkdir(parents=True, exist_ok=True)

    def render_block(number: int, cue: SubtitleCue) -> str:
        # One SRT block: index line, timing line, text.
        timing = (
            f"{format_srt_timestamp(cue.start)} --> "
            f"{format_srt_timestamp(cue.end)}"
        )
        return "\n".join((str(number), timing, cue.text))

    rendered = [
        render_block(number, cue) for number, cue in enumerate(cues, start=1)
    ]
    subtitle_path.write_text("\n\n".join(rendered) + "\n", encoding="utf-8")
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def is_cuda_oom(error: RuntimeError) -> bool:
    """Tell whether ``error``'s message mentions both CUDA and running out of memory.

    The check is a case-insensitive substring match on ``str(error)``.
    """
    lowered = str(error).lower()
    return all(marker in lowered for marker in ("cuda", "out of memory"))
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def iter_retry_batch_sizes(batch_size: int) -> list[int]:
    """List the batch sizes to try: the requested size, then repeated halvings down to 1.

    A requested size below 1 is treated as 1, so the result always has at
    least one entry and always ends with 1.
    """
    size = max(1, batch_size)
    schedule = [size]
    while size != 1:
        size = max(1, size // 2)
        schedule.append(size)
    return schedule
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def transcribe_with_backoff(
    whisper_model: object,
    audio: object,
    batch_size: int,
    source_language: str | None,
    device: str,
    log: Callable[[str], None] | None = None,
    progress_callback: Callable[[float], None] | None = None,
    progress_reset: Callable[[], None] | None = None,
) -> tuple[dict, int]:
    """Transcribe ``audio``, retrying with smaller batches after CUDA OOM errors.

    The batch sizes tried are those produced by ``iter_retry_batch_sizes``.

    Args:
        whisper_model: Object exposing ``transcribe(audio, batch_size=...,
            language=..., progress_callback=...)``.
        audio: Audio payload handed to ``whisper_model.transcribe`` unchanged.
        batch_size: Batch size for the first attempt.
        source_language: Language passed to ``transcribe`` (may be None).
        device: Device in use; retries only happen when it is ``"cuda"``.
        log: Optional sink for progress and retry messages.
        progress_callback: Optional callback forwarded to ``transcribe``.
        progress_reset: Optional hook called before each retry.

    Returns:
        ``(transcription, batch_size_used)`` from the first successful attempt.

    Raises:
        RuntimeError: Re-raised unchanged when the failure is not a CUDA
            out-of-memory error or the device is not ``"cuda"``.
        SubtatixError: When every batch size ran out of CUDA memory; the last
            out-of-memory error is attached as the cause.
    """
    attempts = iter_retry_batch_sizes(batch_size)
    last_index = len(attempts) - 1
    final_error: RuntimeError | None = None

    for index, attempt_batch_size in enumerate(attempts):
        if log is not None:
            log(f"Starting transcription with batch size {attempt_batch_size}.")
        try:
            transcription = whisper_model.transcribe(
                audio,
                batch_size=attempt_batch_size,
                language=source_language,
                progress_callback=progress_callback,
            )
        except RuntimeError as error:
            if device != "cuda" or not is_cuda_oom(error):
                raise
            # Keep only the final failure. An exception references the stack
            # frames (and their locals) of the failed call, so holding on to
            # earlier ones would keep that memory alive across retries.
            if index == last_index:
                final_error = error
        else:
            return transcription, attempt_batch_size

        # Recovery runs here, after the except clause has ended and released
        # the exception, so the failed attempt's frames can be collected
        # before memory is freed.
        if index < last_index:
            if log is not None:
                log(
                    "CUDA out of memory at batch size "
                    f"{attempt_batch_size}; retrying with batch size "
                    f"{attempts[index + 1]}."
                )
            if progress_reset is not None:
                progress_reset()
        release_memory()

    raise SubtatixError(
        "CUDA ran out of memory during transcription even after retrying with "
        f"smaller batch sizes {attempts}. Retry with --device cpu, a smaller "
        "--model, or a lower --batch-size."
    ) from final_error
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def transcribe_to_srt(
    input_file: Path,
    model_name: str = DEFAULT_MODEL,
    batch_size: int = 8,
    output_file: Path | None = None,
    write_output: bool = True,
    source_language: str | None = None,
    device_preference: str = "auto",
    log: Callable[[str], None] | None = None,
    progress_callback: Callable[[float], None] | None = None,
    progress_reset: Callable[[], None] | None = None,
) -> SubtitleDocument:
    """Transcribe a media file with WhisperX and build aligned subtitle cues.

    The pipeline is: load the Whisper model, load the audio, transcribe (with
    batch-size backoff), load the alignment model for the resulting language,
    align, build cues and optionally write them as SRT.

    Args:
        input_file: Media file to transcribe; expanded and resolved first.
        model_name: Model name passed to ``whisperx.load_model``.
        batch_size: Initial transcription batch size.
        output_file: Optional explicit output path for ``resolve_output_path``.
        write_output: When true, the cues are written with ``write_srt``.
        source_language: Optional language hint; normalized when provided.
        device_preference: Device request passed to ``get_whisperx_runtime``.
        log: Optional sink for status messages.
        progress_callback: Optional progress hook forwarded to transcription.
        progress_reset: Optional hook forwarded to the backoff helper.

    Returns:
        A ``SubtitleDocument`` holding the language, output path and cues.

    Raises:
        SubtatixError: If ``input_file`` is not an existing file.
    """
    input_file = input_file.expanduser().resolve()
    if not input_file.is_file():
        raise SubtatixError(f"Input file not found: {input_file}")
    output_path = resolve_output_path(input_file, output_file)

    normalized_source_language = None
    if source_language is not None:
        normalized_source_language = normalize_source_language(source_language)

    device, compute_type = get_whisperx_runtime(device_preference)
    if log is not None:
        runtime_message = (
            f"Using device: {device} (compute_type={compute_type}, "
            f"requested={device_preference}, initial_batch_size={batch_size})."
        )
        log(runtime_message)

    # Bind every heavy local up front so the ``finally`` clause can always
    # delete them, even when a step fails part-way through.
    asr_model = aligner = waveform = raw_result = aligned_result = None

    try:
        asr_model = whisperx.load_model(
            model_name,
            device,
            compute_type=compute_type,
            language=normalized_source_language,
        )

        waveform = whisperx.load_audio(str(input_file))
        raw_result, _ = transcribe_with_backoff(
            whisper_model=asr_model,
            audio=waveform,
            batch_size=batch_size,
            source_language=normalized_source_language,
            device=device,
            log=log,
            progress_callback=progress_callback,
            progress_reset=progress_reset,
        )
        language = raw_result["language"]
        if log is not None and normalized_source_language is None:
            log(f"Auto-detected source language: {language}.")

        aligner, align_metadata = whisperx.load_align_model(
            language_code=language,
            device=device,
        )
        aligned_result = whisperx.align(
            raw_result["segments"],
            aligner,
            align_metadata,
            waveform,
            device,
            return_char_alignments=False,
        )
        aligned_result["language"] = language

        cues = build_cues(aligned_result)
        if write_output:
            write_srt(output_path, cues)
        return SubtitleDocument(
            source_language=language,
            subtitle_path=output_path,
            cues=cues,
        )
    finally:
        # Drop the local references before asking the runtime to free memory.
        del asr_model, aligner, waveform, raw_result, aligned_result
        release_memory()
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
import math
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Callable
|
|
7
|
+
|
|
8
|
+
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
|
9
|
+
|
|
10
|
+
from subtatix.errors import SubtatixError
|
|
11
|
+
from subtatix.runtime import get_device, release_memory
|
|
12
|
+
from subtatix.subtitles import SubtitleCue, SubtitleDocument, write_srt
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# Model identifier loaded via ``from_pretrained`` when the caller names no model.
DEFAULT_TRANSLATION_MODEL = "facebook/nllb-200-1.3B"
# Default number of subtitle cues translated per model call.
DEFAULT_TRANSLATION_BATCH_SIZE = 16
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass(frozen=True)
class LanguageSpec:
    """Pair an NLLB language code with the suffix used in output file names."""

    # Language code handed to the translation tokenizer/model (e.g. "spa_Latn").
    nllb_code: str
    # Tag inserted before the ".srt" extension of translated files (e.g. "es").
    suffix: str
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# Short language codes mapped to their NLLB code and output-file suffix.
LANGUAGE_SPECS = {
    "ca": LanguageSpec("cat_Latn", "ca"),
    "de": LanguageSpec("deu_Latn", "de"),
    "en": LanguageSpec("eng_Latn", "en"),
    "es": LanguageSpec("spa_Latn", "es"),
    "fr": LanguageSpec("fra_Latn", "fr"),
    "it": LanguageSpec("ita_Latn", "it"),
    "ja": LanguageSpec("jpn_Jpan", "ja"),
    "ko": LanguageSpec("kor_Hang", "ko"),
    "nl": LanguageSpec("nld_Latn", "nl"),
    "pt": LanguageSpec("por_Latn", "pt"),
    "ru": LanguageSpec("rus_Cyrl", "ru"),
    "zh": LanguageSpec("zho_Hans", "zh"),
}
# Sorted tuple of the short codes defined in LANGUAGE_SPECS.
SUPPORTED_TARGET_LANGUAGE_CODES = tuple(sorted(LANGUAGE_SPECS))

# Loaded translation backends keyed by model name: (tokenizer, model, device).
_TRANSLATION_MODELS: dict[str, tuple[object, object, str]] = {}
# Language codes cached by get_available_nllb_languages(); None until populated.
_NLLB_LANGUAGE_CODES: tuple[str, ...] | None = None
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def resolve_language(language: str) -> LanguageSpec:
    """Resolve a user-supplied language string to a ``LanguageSpec``.

    Surrounding whitespace is ignored. A short code present in
    ``LANGUAGE_SPECS`` (matched case-insensitively) wins. Otherwise the value
    must be a full NLLB code of the form ``xxx_Yyyy``: three lowercase ASCII
    letters, an underscore, and a four-letter title-case script tag.

    Args:
        language: Short code such as ``"es"`` or NLLB code such as
            ``"spa_Latn"``.

    Returns:
        The matching table entry, or a new ``LanguageSpec`` whose suffix is the
        three-letter language part of the NLLB code.

    Raises:
        SubtatixError: If the value is neither a known short code nor a
            well-formed NLLB code.
    """
    raw_language = language.strip()
    known_spec = LANGUAGE_SPECS.get(raw_language.lower())
    if known_spec is not None:
        return known_spec

    language_part, separator, script_part = raw_language.partition("_")
    # Check both halves so malformed input such as "spa_" or "abc_whatever"
    # is rejected here instead of silently mapping to the unknown token later.
    language_ok = (
        len(language_part) == 3
        and language_part.isascii()
        and language_part.isalpha()
        and language_part.islower()
    )
    script_ok = (
        len(script_part) == 4
        and script_part.isascii()
        and script_part.isalpha()
        and script_part.istitle()
    )
    if separator and language_ok and script_ok:
        return LanguageSpec(raw_language, language_part)

    raise SubtatixError(
        f"Unsupported language '{language}'. Use a Whisper language code like 'es' "
        "or a full NLLB language code like 'spa_Latn'."
    )
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def resolve_translation_output_path(
    subtitle_path: Path,
    target_language: LanguageSpec,
) -> Path:
    """Return the sibling path for the translated copy of ``subtitle_path``.

    The language suffix is inserted between the stem and the ``.srt``
    extension, e.g. ``movie.srt`` becomes ``movie.es.srt``.

    Args:
        subtitle_path: Path of the source subtitle file; must end in ``.srt``.
        target_language: Spec whose ``suffix`` tags the translated file.

    Returns:
        A path in the same directory as ``subtitle_path``.

    Raises:
        SubtatixError: If ``subtitle_path`` does not have the ``.srt`` suffix.
    """
    extension = subtitle_path.suffix
    if extension == ".srt":
        translated_name = f"{subtitle_path.stem}.{target_language.suffix}{extension}"
        return subtitle_path.with_name(translated_name)
    raise SubtatixError(f"Expected an .srt subtitle file, got: {subtitle_path}")
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def get_translation_backend(
    model_name: str = DEFAULT_TRANSLATION_MODEL,
) -> tuple[object, object, str]:
    """Return the cached ``(tokenizer, model, device)`` triple for a model.

    On first use for a given ``model_name`` the tokenizer and seq2seq model are
    loaded with ``from_pretrained``, the model is moved to the device reported
    by ``get_device()``, and the triple is stored in ``_TRANSLATION_MODELS``.

    Args:
        model_name: Model identifier passed to ``from_pretrained``.

    Returns:
        The tokenizer, the model placed on its device, and the device name.
    """
    cached_backend = _TRANSLATION_MODELS.get(model_name)
    if cached_backend is not None:
        return cached_backend

    # Free whatever can be freed before loading a new model.
    release_memory()
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    device = get_device()
    backend = (tokenizer, model.to(device), device)
    _TRANSLATION_MODELS[model_name] = backend
    return backend
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
# Per-model cache so a second model never receives the first model's codes.
_NLLB_LANGUAGE_CODES_BY_MODEL: dict[str, tuple[str, ...]] = {}


def get_available_nllb_languages(
    model_name: str = DEFAULT_TRANSLATION_MODEL,
) -> tuple[str, ...]:
    """Return the sorted language codes exposed by a model's tokenizer.

    The codes are read from the tokenizer's ``additional_special_tokens`` and
    cached per ``model_name``, so each tokenizer is loaded at most once.

    Args:
        model_name: Model identifier passed to ``AutoTokenizer.from_pretrained``.

    Returns:
        A sorted tuple of the tokenizer's additional special tokens.
    """
    global _NLLB_LANGUAGE_CODES
    codes = _NLLB_LANGUAGE_CODES_BY_MODEL.get(model_name)
    if codes is None:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        codes = tuple(sorted(tokenizer.additional_special_tokens))
        _NLLB_LANGUAGE_CODES_BY_MODEL[model_name] = codes
    if model_name == DEFAULT_TRANSLATION_MODEL:
        # Keep the legacy module-level cache populated for the default model.
        _NLLB_LANGUAGE_CODES = codes
    return codes
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def translate_batch(
    texts: list[str],
    src_lang: str,
    tgt_lang: str,
    model_name: str = DEFAULT_TRANSLATION_MODEL,
    max_length: int = 400,
) -> list[str]:
    """Translate a batch of strings with the cached translation backend.

    Args:
        texts: Strings to translate; an empty list yields an empty result
            without loading the model.
        src_lang: Source language code assigned to ``tokenizer.src_lang``.
        tgt_lang: Target language code; its token id is forced as the first
            generated token.
        model_name: Model identifier passed to ``get_translation_backend``.
        max_length: ``max_length`` forwarded to ``model.generate``.

    Returns:
        The decoded translations, one per input string, in input order.

    Raises:
        SubtatixError: If the tokenizer does not know ``tgt_lang``.
    """
    if not texts:
        return []

    tokenizer, model, device = get_translation_backend(model_name)

    # An unknown token resolves to the tokenizer's unknown-token id, which
    # would silently steer generation to garbage; fail loudly instead.
    target_token_id = tokenizer.convert_tokens_to_ids(tgt_lang)
    if target_token_id is None or target_token_id == tokenizer.unk_token_id:
        raise SubtatixError(
            f"Translation model '{model_name}' does not recognize target "
            f"language code '{tgt_lang}'."
        )

    tokenizer.src_lang = src_lang
    inputs = tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
    ).to(device)
    translated_tokens = model.generate(
        **inputs,
        forced_bos_token_id=target_token_id,
        max_length=max_length,
    )
    return tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def translate_subtitles(
    document: SubtitleDocument,
    target_language: str,
    model_name: str = DEFAULT_TRANSLATION_MODEL,
    batch_size: int = DEFAULT_TRANSLATION_BATCH_SIZE,
    max_length: int = 400,
    progress_callback: Callable[[int, int], None] | None = None,
) -> Path:
    """Translate every cue of ``document`` and write the result as SRT.

    Cue texts are translated in slices of ``batch_size``; timings are copied
    unchanged onto the translated cues.

    Args:
        document: Source subtitles (language, path and cues).
        target_language: Short code or NLLB code of the target language.
        model_name: Translation model identifier.
        batch_size: Number of cues per translation call; must be at least 1.
        max_length: ``max_length`` forwarded to ``translate_batch``.
        progress_callback: Optional hook called after each batch with
            ``(batch_index, total_batches)``, where ``batch_index`` starts at 1.

    Returns:
        The path of the written translated subtitle file.

    Raises:
        SubtatixError: If ``batch_size`` is below 1, a language cannot be
            resolved, or the subtitle path is not an ``.srt`` file.
    """
    if batch_size < 1:
        # Zero would divide by zero below; a negative step yields no batches
        # and a confusing length-mismatch error from zip(strict=True).
        raise SubtatixError(
            f"Translation batch size must be at least 1, got {batch_size}."
        )

    source = resolve_language(document.source_language)
    target = resolve_language(target_language)
    output_path = resolve_translation_output_path(document.subtitle_path, target)
    cues = document.cues
    total_batches = max(1, math.ceil(len(cues) / batch_size))

    translated_texts: list[str] = []
    for batch_index, start in enumerate(range(0, len(cues), batch_size), start=1):
        batch = cues[start : start + batch_size]
        translated_texts.extend(
            translate_batch(
                [cue.text for cue in batch],
                src_lang=source.nllb_code,
                tgt_lang=target.nllb_code,
                model_name=model_name,
                max_length=max_length,
            )
        )
        if progress_callback is not None:
            progress_callback(batch_index, total_batches)

    # strict=True guards against the model returning a different number of
    # translations than cues.
    translated_cues = [
        SubtitleCue(start=cue.start, end=cue.end, text=text)
        for cue, text in zip(cues, translated_texts, strict=True)
    ]
    write_srt(output_path, translated_cues)
    return output_path
|