subtitle-engine 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- subtitle_engine/__init__.py +3 -0
- subtitle_engine/captioner.py +78 -0
- subtitle_engine/cli.py +170 -0
- subtitle_engine/srt_writer.py +45 -0
- subtitle_engine/transcriber.py +129 -0
- subtitle_engine/utils.py +39 -0
- subtitle_engine-0.1.0.dist-info/METADATA +97 -0
- subtitle_engine-0.1.0.dist-info/RECORD +12 -0
- subtitle_engine-0.1.0.dist-info/WHEEL +5 -0
- subtitle_engine-0.1.0.dist-info/entry_points.txt +2 -0
- subtitle_engine-0.1.0.dist-info/licenses/LICENSE +21 -0
- subtitle_engine-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"""Generate captions from transcripts using a local Ollama instance."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
import requests
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _default_prompt(transcript: str) -> str:
|
|
10
|
+
"""Build the prompt sent to the LLM."""
|
|
11
|
+
return (
|
|
12
|
+
"Create a short, engaging caption (1-2 sentences) for a video based on the following transcript. "
|
|
13
|
+
"Write the caption in the same language as the transcript. "
|
|
14
|
+
"Answer directly with the caption only, without any thinking or explanation.\n\n"
|
|
15
|
+
f"Transcript:\n{transcript}"
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def generate_caption(
    transcript: str,
    *,
    model: str,
    host: str = "http://localhost:11434",
    prompt: Optional[str] = None,
) -> str:
    """Ask an Ollama server to write a caption for a transcript.

    Parameters
    ----------
    transcript:
        Transcript text the caption is based on. Must contain at least one
        non-whitespace character.
    model:
        Name of the Ollama model that handles the request.
    host:
        Base URL of the Ollama API; trailing slashes are ignored.
    prompt:
        Full prompt to send instead of the built-in one. ``None`` or an
        empty string selects the default prompt built from ``transcript``.

    Returns
    -------
    The ``response`` field of the JSON reply, stripped of surrounding
    whitespace.

    Raises
    ------
    ValueError
        If the transcript is blank, or the reply contains no caption text.
    ConnectionError
        If the HTTP connection to ``host`` cannot be established.
    requests.HTTPError
        If the server answers with an error status code.
    """
    if not transcript.strip():
        raise ValueError("Cannot generate a caption from an empty transcript")

    base_url = host.rstrip("/")
    endpoint = f"{base_url}/api/generate"
    final_prompt = prompt if prompt else _default_prompt(transcript)
    # "stream": False requests a single JSON document instead of a stream.
    request_body = {"model": model, "prompt": final_prompt, "stream": False}

    try:
        reply = requests.post(endpoint, json=request_body, timeout=300)
    except requests.ConnectionError as exc:
        message = f"Could not connect to Ollama at {host}. Is Ollama running?"
        raise ConnectionError(message) from exc

    reply.raise_for_status()
    body = reply.json()
    text = body.get("response", "").strip()
    if text:
        return text

    raise ValueError(
        "Ollama returned an empty caption. "
        "This can happen with some models or languages — try a different --ollama-model."
    )
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def list_models(host: str = "http://localhost:11434") -> list[str]:
    """List the model names reported by an Ollama server.

    Sends a GET request to ``/api/tags`` under ``host`` and collects the
    ``name`` of every entry in the reply's ``models`` list. A reply without
    a ``models`` key yields an empty list.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    """
    endpoint = host.rstrip("/") + "/api/tags"
    reply = requests.get(endpoint, timeout=30)
    reply.raise_for_status()

    names = []
    for entry in reply.json().get("models", []):
        names.append(entry["name"])
    return names
|
subtitle_engine/cli.py
ADDED
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
"""Command-line interface for subtitle-engine."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Annotated, Optional
|
|
5
|
+
|
|
6
|
+
import typer
|
|
7
|
+
from rich.console import Console
|
|
8
|
+
|
|
9
|
+
from subtitle_engine.captioner import generate_caption
|
|
10
|
+
from subtitle_engine.srt_writer import write_srt
|
|
11
|
+
from subtitle_engine.transcriber import transcribe
|
|
12
|
+
from subtitle_engine.utils import resolve_output_path, validate_media_file
|
|
13
|
+
|
|
14
|
+
# Typer application object; the ``main`` command is registered on it via
# ``@app.command()``. Running it without arguments prints the help text.
app = typer.Typer(
    help="Generate SRT subtitles from audio/video files using WhisperX",
    no_args_is_help=True,
)
# Rich console that the command uses for progress and error messages.
console = Console()
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@app.command()
def main(
    input_file: Annotated[
        Path,
        typer.Argument(
            help="Audio or video file to transcribe",
            exists=True,
            file_okay=True,
            dir_okay=False,
            readable=True,
        ),
    ],
    output: Annotated[
        Optional[Path],
        typer.Option(
            "--output",
            "-o",
            help="Output SRT file (default: <input>.srt)",
            file_okay=True,
            dir_okay=False,
        ),
    ] = None,
    model: Annotated[
        str,
        typer.Option(
            "--model",
            "-m",
            help="WhisperX model: tiny, base, small, medium, large-v2, large-v3",
        ),
    ] = "small",
    language: Annotated[
        Optional[str],
        typer.Option(
            "--language",
            "-l",
            help="Language code, e.g. en, fi. Auto-detected if omitted.",
        ),
    ] = None,
    device: Annotated[
        Optional[str],
        typer.Option(
            "--device",
            "-d",
            help="Device: cpu, cuda or mps. Auto-detected if omitted.",
        ),
    ] = None,
    batch_size: Annotated[
        int,
        typer.Option(
            "--batch-size",
            "-b",
            help="WhisperX inference batch size",
            min=1,
        ),
    ] = 16,
    compute_type: Annotated[
        Optional[str],
        typer.Option(
            "--compute-type",
            "-c",
            help="Compute type: int8 or float16. Auto-selected if omitted.",
        ),
    ] = None,
    diarize: Annotated[
        bool,
        typer.Option(
            "--diarize",
            help="Run speaker diarization (requires --hf-token)",
        ),
    ] = False,
    hf_token: Annotated[
        Optional[str],
        typer.Option(
            "--hf-token",
            help="Hugging Face token for diarization",
            envvar="HF_TOKEN",
        ),
    ] = None,
    caption: Annotated[
        bool,
        typer.Option(
            "--caption",
            help="Generate a caption from the transcript using Ollama",
        ),
    ] = False,
    ollama_model: Annotated[
        Optional[str],
        typer.Option(
            "--ollama-model",
            help="Ollama model for caption generation. Required if --caption is set.",
        ),
    ] = None,
    ollama_host: Annotated[
        str,
        typer.Option(
            "--ollama-host",
            help="Ollama API host",
            envvar="OLLAMA_HOST",
        ),
    ] = "http://localhost:11434",
) -> None:
    """Generate SRT subtitles from a media file."""
    # NOTE: the docstring above is kept short on purpose; Typer derives the
    # command's help text from it.
    #
    # Flow: validate the input, transcribe it, write the SRT file and, when
    # --caption is given, ask Ollama for a caption and store it beside the
    # SRT file with a ".caption.txt" suffix. Every failure is printed to the
    # console and turned into exit code 1.
    try:
        # Cheap checks first, so bad arguments fail before transcription.
        validate_media_file(input_file)
        output_path = resolve_output_path(input_file, output)

        missing_ollama_model = caption and not ollama_model
        if missing_ollama_model:
            raise ValueError("--ollama-model is required when using --caption")

        banner = [
            f"[bold]Transcribing:[/bold] {input_file}",
            f"[bold]Model:[/bold] {model}",
        ]
        if language:
            banner.append(f"[bold]Language:[/bold] {language}")
        if device:
            banner.append(f"[bold]Device:[/bold] {device}")
        for line in banner:
            console.print(line)

        transcribe_options = {
            "model_name": model,
            "language": language,
            "device": device,
            "batch_size": batch_size,
            "compute_type": compute_type,
            "diarize": diarize,
            "hf_token": hf_token,
        }
        segments = transcribe(input_file, **transcribe_options)

        write_srt(segments, output_path)
        console.print(f"[green]Wrote subtitles to:[/green] {output_path}")

        if caption:
            # The caption prompt is built from all segment texts joined by spaces.
            texts = [str(segment.get("text", "")).strip() for segment in segments]
            transcript = " ".join(texts)
            caption_text = generate_caption(
                transcript,
                model=ollama_model,
                host=ollama_host,
            )
            caption_path = output_path.with_suffix(".caption.txt")
            caption_path.write_text(caption_text, encoding="utf-8")
            console.print(f"[green]Wrote caption to:[/green] {caption_path}")
    except (ValueError, FileNotFoundError, ConnectionError) as exc:
        # Expected, user-correctable problems.
        console.print(f"[red]Error:[/red] {exc}")
        raise typer.Exit(code=1) from exc
    except Exception as exc:  # noqa: BLE001
        # Top-level boundary: report anything else instead of a traceback.
        console.print(f"[red]Transcription failed:[/red] {exc}")
        raise typer.Exit(code=1) from exc
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
# Run the Typer app when this module is executed as a script.
if __name__ == "__main__":
    app()
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""Convert transcription segments to SRT format."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Iterable
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def _format_time(seconds: float) -> str:
|
|
8
|
+
"""Convert seconds to SRT time format HH:MM:SS,mmm."""
|
|
9
|
+
total_millis = int(round(seconds * 1000))
|
|
10
|
+
hours = total_millis // 3_600_000
|
|
11
|
+
minutes = (total_millis % 3_600_000) // 60_000
|
|
12
|
+
secs = (total_millis % 60_000) // 1_000
|
|
13
|
+
millis = total_millis % 1_000
|
|
14
|
+
|
|
15
|
+
return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _format_segment(index: int, start: float, end: float, text: str) -> str:
    """Render one subtitle cue as an SRT block.

    The block consists of the cue number, the ``start --> end`` timing line
    and the cue text, each terminated by a newline. Text that is empty after
    stripping whitespace is replaced by ``...``.
    """
    body = text.strip() or "..."
    timing = f"{_format_time(start)} --> {_format_time(end)}"
    lines = [f"{index}", timing, body]
    return "\n".join(lines) + "\n"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def segments_to_srt(segments: Iterable[dict]) -> str:
    """Render WhisperX-style segments as the text of an SRT file.

    Every segment must be a mapping with ``start``, ``end`` and ``text``
    keys; the first two are converted with ``float`` and the last with
    ``str``. Cues are numbered from 1 in iteration order and separated by a
    blank line. An empty iterable produces an empty string.
    """
    rendered = [
        _format_segment(
            number,
            float(item["start"]),
            float(item["end"]),
            str(item["text"]),
        )
        for number, item in enumerate(segments, start=1)
    ]
    # Each block already ends with a newline, so joining with one more
    # newline leaves exactly one blank line between cues.
    return "\n".join(rendered)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def write_srt(segments: Iterable[dict], output_path: Path) -> None:
    """Write segments to ``output_path`` as a UTF-8 encoded SRT file.

    Missing parent directories are created first. An existing file at the
    target path is overwritten.
    """
    target = Path(output_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    srt_text = segments_to_srt(segments)
    target.write_text(srt_text, encoding="utf-8")
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
"""WhisperX transcription wrapper."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
import torch
|
|
7
|
+
import whisperx
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# Model names accepted by ``_validate_model``.
VALID_MODELS = {"tiny", "base", "small", "medium", "large-v2", "large-v3"}
# Device names accepted by ``_validate_device``.
VALID_DEVICES = {"cpu", "cuda", "mps"}
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _detect_device(device: Optional[str]) -> str:
|
|
15
|
+
"""Pick a device if none was specified."""
|
|
16
|
+
if device:
|
|
17
|
+
return device
|
|
18
|
+
if torch.cuda.is_available():
|
|
19
|
+
return "cuda"
|
|
20
|
+
if torch.backends.mps.is_available():
|
|
21
|
+
return "mps"
|
|
22
|
+
return "cpu"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _default_compute_type(device: str) -> str:
|
|
26
|
+
"""Pick a safe compute type for the device."""
|
|
27
|
+
if device == "cpu":
|
|
28
|
+
return "int8"
|
|
29
|
+
return "float16"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _validate_model(model_name: str) -> None:
    """Check that ``model_name`` is one of ``VALID_MODELS``.

    Raises
    ------
    ValueError
        If the name is not recognised. The message lists the valid names in
        sorted order.
    """
    if model_name in VALID_MODELS:
        return

    choices = ", ".join(sorted(VALID_MODELS))
    raise ValueError(f"Unknown model '{model_name}'. Choose from: {choices}")
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _validate_device(device: str) -> None:
    """Check that ``device`` is one of ``VALID_DEVICES``.

    Raises
    ------
    ValueError
        If the name is not recognised. The message lists the valid names in
        sorted order.
    """
    if device in VALID_DEVICES:
        return

    choices = ", ".join(sorted(VALID_DEVICES))
    raise ValueError(f"Unknown device '{device}'. Choose from: {choices}")
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def transcribe(
    audio_path: Path,
    *,
    model_name: str = "small",
    language: Optional[str] = None,
    device: Optional[str] = None,
    batch_size: int = 16,
    compute_type: Optional[str] = None,
    diarize: bool = False,
    hf_token: Optional[str] = None,
) -> list[dict]:
    """Transcribe a media file with WhisperX and return its segments.

    Steps: validate the arguments, load the audio, run the transcription
    model, align the result when it carries a language code, and optionally
    assign speakers.

    Parameters
    ----------
    audio_path:
        Path to the media file to transcribe.
    model_name:
        WhisperX model size. One of: tiny, base, small, medium, large-v2,
        large-v3.
    language:
        Language code such as ``en`` or ``fi``, passed to the transcription
        call. ``None`` leaves the choice to WhisperX.
    device:
        ``cpu``, ``cuda`` or ``mps``. Chosen automatically if omitted.
    batch_size:
        Batch size passed to the transcription call.
    compute_type:
        Compute type passed to the model loader. When ``None``, a default is
        picked for the device.
    diarize:
        Whether to run speaker diarization.
    hf_token:
        Hugging Face token; required when ``diarize`` is true.

    Returns
    -------
    The ``segments`` list of the final WhisperX result.

    Raises
    ------
    ValueError
        If the model or device name is unknown, or ``diarize`` is requested
        without ``hf_token``.
    FileNotFoundError
        If ``audio_path`` does not exist.
    """
    _validate_model(model_name)

    media_path = Path(audio_path)
    if not media_path.exists():
        raise FileNotFoundError(f"Audio file not found: {media_path}")

    resolved_device = _detect_device(device)
    _validate_device(resolved_device)

    if compute_type is None:
        resolved_compute_type = _default_compute_type(resolved_device)
    else:
        resolved_compute_type = compute_type

    # Fail before any model is loaded if diarization cannot run.
    if diarize and not hf_token:
        raise ValueError("--hf-token is required when using --diarize")

    audio = whisperx.load_audio(str(media_path))

    asr_model = whisperx.load_model(
        model_name, resolved_device, compute_type=resolved_compute_type
    )
    result = asr_model.transcribe(audio, batch_size=batch_size, language=language)

    # Drop the reference to the transcription model before the alignment
    # model is loaded.
    del asr_model

    language_code = result.get("language")
    if language_code:
        aligner, aligner_metadata = whisperx.load_align_model(
            language_code=language_code, device=resolved_device
        )
        result = whisperx.align(
            result["segments"],
            aligner,
            aligner_metadata,
            audio,
            resolved_device,
            return_char_alignments=False,
        )
        del aligner

    if diarize:
        pipeline = whisperx.DiarizationPipeline(
            model_name="pyannote/speaker-diarization-3.1",
            use_auth_token=hf_token,
            device=resolved_device,
        )
        speaker_turns = pipeline(audio)
        result = whisperx.assign_word_speakers(speaker_turns, result)

    return result["segments"]
|
subtitle_engine/utils.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""CLI helpers and path utilities."""
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
# Lower-case file suffixes (leading dot included) that
# ``validate_media_file`` accepts as audio/video input.
SUPPORTED_EXTENSIONS = {
    ".mp3",
    ".wav",
    ".flac",
    ".aac",
    ".ogg",
    ".m4a",
    ".mp4",
    ".mov",
    ".mkv",
    ".avi",
    ".webm",
}
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def resolve_output_path(input_path: Path, output: Optional[Path] = None) -> Path:
    """Decide where the SRT file should be written.

    An explicitly given ``output`` is returned as is. Otherwise the result
    is ``input_path`` with its final extension replaced by ``.srt`` (for
    example ``video.mp4`` becomes ``video.srt``), which places the subtitle
    file in the same directory as the input.
    """
    return output if output else input_path.with_suffix(".srt")
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def validate_media_file(path: Path) -> None:
    """Check that ``path`` has a supported media file extension.

    Only the suffix is inspected, case-insensitively; the file itself is not
    opened.

    Raises
    ------
    ValueError
        If the suffix is not in ``SUPPORTED_EXTENSIONS``. The message lists
        the supported suffixes in sorted order.
    """
    extension = path.suffix
    if extension.lower() in SUPPORTED_EXTENSIONS:
        return

    supported = ", ".join(sorted(SUPPORTED_EXTENSIONS))
    raise ValueError(f"Unsupported file type '{extension}'. Supported: {supported}")
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: subtitle-engine
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Generate SRT subtitles from audio/video files using WhisperX
|
|
5
|
+
Author: Leevi Puntanen
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/leevipuntanen/subtitle-engine
|
|
8
|
+
Project-URL: Issues, https://github.com/leevipuntanen/subtitle-engine/issues
|
|
9
|
+
Keywords: subtitles,srt,whisperx,transcription,asr
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: End Users/Desktop
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
15
|
+
Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
|
|
16
|
+
Classifier: Topic :: Utilities
|
|
17
|
+
Requires-Python: >=3.12
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
License-File: LICENSE
|
|
20
|
+
Requires-Dist: typer>=0.12.0
|
|
21
|
+
Requires-Dist: whisperx>=3.8.0
|
|
22
|
+
Requires-Dist: requests>=2.32.0
|
|
23
|
+
Provides-Extra: dev
|
|
24
|
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
|
25
|
+
Dynamic: license-file
|
|
26
|
+
|
|
27
|
+
# subtitle-engine
|
|
28
|
+
|
|
29
|
+
Generate `.srt` subtitle files from audio or video files using [WhisperX](https://github.com/m-bain/whisperX). Optionally generate a caption from the transcript with a local [Ollama](https://ollama.com/) LLM.
|
|
30
|
+
|
|
31
|
+
## Installation
|
|
32
|
+
|
|
33
|
+
Requires Python 3.12 or newer.
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
pip install subtitle-engine
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
Or install from source:
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
git clone https://github.com/leevipuntanen/subtitle-engine.git
|
|
43
|
+
cd subtitle-engine
|
|
44
|
+
python -m venv .venv
|
|
45
|
+
source .venv/bin/activate
|
|
46
|
+
pip install -e ".[dev]"
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
## Usage
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
# Basic usage — writes <input>.srt next to the source file
|
|
53
|
+
subeng video.mp4
|
|
54
|
+
|
|
55
|
+
# Specify output file
|
|
56
|
+
subeng video.mp4 --output subtitles.srt
|
|
57
|
+
|
|
58
|
+
# Use a different model or language
|
|
59
|
+
subeng video.mp4 --model medium --language fi
|
|
60
|
+
|
|
61
|
+
# Force CPU / CUDA / MPS
|
|
62
|
+
subeng video.mp4 --device cpu
|
|
63
|
+
|
|
64
|
+
# Speaker diarization (requires a Hugging Face token)
|
|
65
|
+
subeng video.mp4 --diarize --hf-token $HF_TOKEN
|
|
66
|
+
|
|
67
|
+
# Generate a caption from the transcript using Ollama
|
|
68
|
+
subeng video.mp4 --caption --ollama-model qwen3.5:0.8b
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## Options
|
|
72
|
+
|
|
73
|
+
| Option | Description |
|
|
74
|
+
|--------|-------------|
|
|
75
|
+
| `--output`, `-o` | Output SRT file path |
|
|
76
|
+
| `--model`, `-m` | WhisperX model: `tiny`, `base`, `small` (default), `medium`, `large-v2`, `large-v3` |
|
|
77
|
+
| `--language`, `-l` | ISO language code, e.g. `en`, `fi`. Auto-detected if omitted. |
|
|
78
|
+
| `--device`, `-d` | `cpu`, `cuda` or `mps`. Auto-detected if omitted. |
|
|
79
|
+
| `--batch-size`, `-b` | Inference batch size (default: 16) |
|
|
80
|
+
| `--compute-type`, `-c` | `int8` or `float16`. Auto-selected if omitted. |
|
|
81
|
+
| `--diarize` | Enable speaker diarization |
|
|
82
|
+
| `--hf-token` | Hugging Face token for diarization (or set `HF_TOKEN` env var) |
|
|
83
|
+
| `--caption` | Generate a caption from the transcript via Ollama |
|
|
84
|
+
| `--ollama-model` | Ollama model name (required with `--caption`) |
|
|
85
|
+
| `--ollama-host` | Ollama API host (default: `http://localhost:11434`) |
|
|
86
|
+
|
|
87
|
+
## Development
|
|
88
|
+
|
|
89
|
+
Run the test suite:
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
pytest
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
## License
|
|
96
|
+
|
|
97
|
+
MIT
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
subtitle_engine/__init__.py,sha256=hQMR44TH4Q2t3giLJX1w4C48X7I1_BbLYvWv6UE193U,83
|
|
2
|
+
subtitle_engine/captioner.py,sha256=YhddyIGe2Q05OHe9_VWjRgmnfu_z0fqHTeZGGuOVVYY,2273
|
|
3
|
+
subtitle_engine/cli.py,sha256=I6mSqJ3CPMqmDYg0SXLnd8wVScajMKijUS5KmlnJOac,4923
|
|
4
|
+
subtitle_engine/srt_writer.py,sha256=mCfYz5f_vezyKntJZknCDBfbr_wqN6tezOg3tdymTh4,1578
|
|
5
|
+
subtitle_engine/transcriber.py,sha256=A9-iv0C69ZhwV2JeCAqOnBFVyymsVyXituMLCa0cbrw,3834
|
|
6
|
+
subtitle_engine/utils.py,sha256=zhpyusQowoD2V0SGEt6CqgXiPzYfFSgrbdYc2tC5rcg,907
|
|
7
|
+
subtitle_engine-0.1.0.dist-info/licenses/LICENSE,sha256=o1T82nY2oKB-YHPfmfFxSaovPjMXAO6zhl9gTwB7pxw,1071
|
|
8
|
+
subtitle_engine-0.1.0.dist-info/METADATA,sha256=KGXISZd_1o-W6nbQGGtflIfOpALPrA0sXoEJ4Y9jCoc,2922
|
|
9
|
+
subtitle_engine-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
10
|
+
subtitle_engine-0.1.0.dist-info/entry_points.txt,sha256=DPgVIpA0GoIWbk4QkXbP_Ez4H7QYpFwqtC8QwPKV9v0,51
|
|
11
|
+
subtitle_engine-0.1.0.dist-info/top_level.txt,sha256=VDgcRWJUpgMgxj4aQ_Xcby1C9FKQGEdVaC5GgRR4lYQ,16
|
|
12
|
+
subtitle_engine-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Leevi Puntanen
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
subtitle_engine
|