subtitle-engine 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Leevi Puntanen
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,97 @@
1
+ Metadata-Version: 2.4
2
+ Name: subtitle-engine
3
+ Version: 0.1.0
4
+ Summary: Generate SRT subtitles from audio/video files using WhisperX
5
+ Author: Leevi Puntanen
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/leevipuntanen/subtitle-engine
8
+ Project-URL: Issues, https://github.com/leevipuntanen/subtitle-engine/issues
9
+ Keywords: subtitles,srt,whisperx,transcription,asr
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: End Users/Desktop
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Programming Language :: Python :: 3.13
15
+ Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
16
+ Classifier: Topic :: Utilities
17
+ Requires-Python: >=3.12
18
+ Description-Content-Type: text/markdown
19
+ License-File: LICENSE
20
+ Requires-Dist: typer>=0.12.0
21
+ Requires-Dist: whisperx>=3.8.0
22
+ Requires-Dist: requests>=2.32.0
23
+ Provides-Extra: dev
24
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
25
+ Dynamic: license-file
26
+
27
+ # subtitle-engine
28
+
29
+ Generate `.srt` subtitle files from audio or video files using [WhisperX](https://github.com/m-bain/whisperX). Optionally generate a caption from the transcript with a local [Ollama](https://ollama.com/) LLM.
30
+
31
+ ## Installation
32
+
33
+ Requires Python 3.12 or newer.
34
+
35
+ ```bash
36
+ pip install subtitle-engine
37
+ ```
38
+
39
+ Or install from source:
40
+
41
+ ```bash
42
+ git clone https://github.com/leevipuntanen/subtitle-engine.git
43
+ cd subtitle-engine
44
+ python -m venv .venv
45
+ source .venv/bin/activate
46
+ pip install -e ".[dev]"
47
+ ```
48
+
49
+ ## Usage
50
+
51
+ ```bash
52
+ # Basic usage — writes <input>.srt next to the source file
53
+ subeng video.mp4
54
+
55
+ # Specify output file
56
+ subeng video.mp4 --output subtitles.srt
57
+
58
+ # Use a different model or language
59
+ subeng video.mp4 --model medium --language fi
60
+
61
+ # Force CPU / CUDA / MPS
62
+ subeng video.mp4 --device cpu
63
+
64
+ # Speaker diarization (requires a Hugging Face token)
65
+ subeng video.mp4 --diarize --hf-token $HF_TOKEN
66
+
67
+ # Generate a caption from the transcript using Ollama
68
+ subeng video.mp4 --caption --ollama-model qwen3.5:0.8b
69
+ ```
70
+
71
+ ## Options
72
+
73
+ | Option | Description |
74
+ |--------|-------------|
75
+ | `--output`, `-o` | Output SRT file path |
76
+ | `--model`, `-m` | WhisperX model: `tiny`, `base`, `small` (default), `medium`, `large-v2`, `large-v3` |
77
+ | `--language`, `-l` | ISO language code, e.g. `en`, `fi`. Auto-detected if omitted. |
78
+ | `--device`, `-d` | `cpu`, `cuda` or `mps`. Auto-detected if omitted. |
79
+ | `--batch-size`, `-b` | Inference batch size (default: 16) |
80
+ | `--compute-type`, `-c` | `int8` or `float16`. Auto-selected if omitted. |
81
+ | `--diarize` | Enable speaker diarization |
82
+ | `--hf-token` | Hugging Face token for diarization (or set `HF_TOKEN` env var) |
83
+ | `--caption` | Generate a caption from the transcript via Ollama |
84
+ | `--ollama-model` | Ollama model name (required with `--caption`) |
85
+ | `--ollama-host` | Ollama API host (default: `http://localhost:11434`) |
86
+
87
+ ## Development
88
+
89
+ Run the test suite:
90
+
91
+ ```bash
92
+ pytest
93
+ ```
94
+
95
+ ## License
96
+
97
+ MIT
@@ -0,0 +1,71 @@
1
+ # subtitle-engine
2
+
3
+ Generate `.srt` subtitle files from audio or video files using [WhisperX](https://github.com/m-bain/whisperX). Optionally generate a caption from the transcript with a local [Ollama](https://ollama.com/) LLM.
4
+
5
+ ## Installation
6
+
7
+ Requires Python 3.12 or newer.
8
+
9
+ ```bash
10
+ pip install subtitle-engine
11
+ ```
12
+
13
+ Or install from source:
14
+
15
+ ```bash
16
+ git clone https://github.com/leevipuntanen/subtitle-engine.git
17
+ cd subtitle-engine
18
+ python -m venv .venv
19
+ source .venv/bin/activate
20
+ pip install -e ".[dev]"
21
+ ```
22
+
23
+ ## Usage
24
+
25
+ ```bash
26
+ # Basic usage — writes <input>.srt next to the source file
27
+ subeng video.mp4
28
+
29
+ # Specify output file
30
+ subeng video.mp4 --output subtitles.srt
31
+
32
+ # Use a different model or language
33
+ subeng video.mp4 --model medium --language fi
34
+
35
+ # Force CPU / CUDA / MPS
36
+ subeng video.mp4 --device cpu
37
+
38
+ # Speaker diarization (requires a Hugging Face token)
39
+ subeng video.mp4 --diarize --hf-token $HF_TOKEN
40
+
41
+ # Generate a caption from the transcript using Ollama
42
+ subeng video.mp4 --caption --ollama-model qwen3.5:0.8b
43
+ ```
44
+
45
+ ## Options
46
+
47
+ | Option | Description |
48
+ |--------|-------------|
49
+ | `--output`, `-o` | Output SRT file path |
50
+ | `--model`, `-m` | WhisperX model: `tiny`, `base`, `small` (default), `medium`, `large-v2`, `large-v3` |
51
+ | `--language`, `-l` | ISO language code, e.g. `en`, `fi`. Auto-detected if omitted. |
52
+ | `--device`, `-d` | `cpu`, `cuda` or `mps`. Auto-detected if omitted. |
53
+ | `--batch-size`, `-b` | Inference batch size (default: 16) |
54
+ | `--compute-type`, `-c` | `int8` or `float16`. Auto-selected if omitted. |
55
+ | `--diarize` | Enable speaker diarization |
56
+ | `--hf-token` | Hugging Face token for diarization (or set `HF_TOKEN` env var) |
57
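+ | `--caption` | Generate a caption from the transcript via Ollama. The caption is saved at the SRT output path with its suffix replaced by `.caption.txt` (e.g. `video.caption.txt`). |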
58
+ | `--ollama-model` | Ollama model name (required with `--caption`) |
59
+ | `--ollama-host` | Ollama API host (default: `http://localhost:11434`) |
60
+
61
+ ## Development
62
+
63
+ Run the test suite:
64
+
65
+ ```bash
66
+ pytest
67
+ ```
68
+
69
+ ## License
70
+
71
+ MIT
@@ -0,0 +1,48 @@
1
+ [build-system]
2
+ requires = ["setuptools>=77.0.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "subtitle-engine"
7
+ version = "0.1.0"
8
+ description = "Generate SRT subtitles from audio/video files using WhisperX"
9
+ readme = "README.md"
10
+ license = "MIT"
11
+ license-files = ["LICENSE"]
12
+ authors = [
13
+ {name = "Leevi Puntanen"},
14
+ ]
15
+ requires-python = ">=3.12"
16
+ classifiers = [
17
+ "Development Status :: 4 - Beta",
18
+ "Intended Audience :: End Users/Desktop",
19
+ "Programming Language :: Python :: 3",
20
+ "Programming Language :: Python :: 3.12",
21
+ "Programming Language :: Python :: 3.13",
22
+ "Topic :: Multimedia :: Sound/Audio :: Speech",
23
+ "Topic :: Utilities",
24
+ ]
25
+ keywords = ["subtitles", "srt", "whisperx", "transcription", "asr"]
26
+ dependencies = [
27
+ "typer>=0.12.0",
28
+ "whisperx>=3.8.0",
29
+ "requests>=2.32.0",
30
+ ]
31
+
32
+ [project.optional-dependencies]
33
+ dev = [
34
+ "pytest>=8.0.0",
35
+ ]
36
+
37
+ [project.scripts]
38
+ subeng = "subtitle_engine.cli:app"
39
+
40
+ [project.urls]
41
+ Homepage = "https://github.com/leevipuntanen/subtitle-engine"
42
+ Issues = "https://github.com/leevipuntanen/subtitle-engine/issues"
43
+
44
+ [tool.setuptools.packages.find]
45
+ where = ["src"]
46
+
47
+ [tool.pytest.ini_options]
48
+ testpaths = ["tests"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,3 @@
1
+ """Subtitle Engine — generate SRT files with WhisperX."""
2
+
3
+ __version__ = "0.1.0"
@@ -0,0 +1,78 @@
1
+ """Generate captions from transcripts using a local Ollama instance."""
2
+
3
+ import json
4
+ from typing import Optional
5
+
6
+ import requests
7
+
8
+
9
def _default_prompt(transcript: str) -> str:
    """Return the default captioning prompt with *transcript* appended.

    The instructions ask for a 1-2 sentence caption in the transcript's own
    language, answered directly without thinking or explanation.
    """
    instructions = " ".join(
        (
            "Create a short, engaging caption (1-2 sentences) for a video based on the following transcript.",
            "Write the caption in the same language as the transcript.",
            "Answer directly with the caption only, without any thinking or explanation.",
        )
    )
    return f"{instructions}\n\nTranscript:\n{transcript}"
17
+
18
+
19
def generate_caption(
    transcript: str,
    *,
    model: str,
    host: str = "http://localhost:11434",
    prompt: Optional[str] = None,
) -> str:
    """Generate a caption from a transcript via Ollama.

    Parameters
    ----------
    transcript:
        The transcript text to summarize.
    model:
        Name of the Ollama model to use.
    host:
        Base URL of the Ollama API. A value without a scheme (for example
        ``127.0.0.1:11434``) is treated as ``http://``.
    prompt:
        Custom prompt. A default prompt is used if omitted.

    Returns
    -------
    The generated caption string, stripped of surrounding whitespace.

    Raises
    ------
    ValueError
        If the transcript is blank or Ollama returns an empty caption.
    ConnectionError
        If the Ollama server cannot be reached.
    requests.HTTPError
        If Ollama answers with an error status; the message includes the
        ``error`` text from the response body when one is present.
    """
    if not transcript.strip():
        raise ValueError("Cannot generate a caption from an empty transcript")

    # The host may arrive without a scheme (the CLI reads it from OLLAMA_HOST),
    # and requests refuses URLs that lack one.
    base_url = host.strip().rstrip("/")
    if "://" not in base_url:
        base_url = f"http://{base_url}"

    url = f"{base_url}/api/generate"
    payload = {
        "model": model,
        "prompt": prompt or _default_prompt(transcript),
        "stream": False,
    }

    try:
        response = requests.post(url, json=payload, timeout=300)
    except requests.ConnectionError as exc:
        raise ConnectionError(
            f"Could not connect to Ollama at {host}. Is Ollama running?"
        ) from exc

    try:
        response.raise_for_status()
    except requests.HTTPError as exc:
        # Surface the explanation from the response body (e.g. an unknown
        # model) instead of only the bare status line.
        try:
            error_body = response.json()
        except ValueError:
            error_body = None
        detail = error_body.get("error") if isinstance(error_body, dict) else None
        if not detail:
            raise
        raise requests.HTTPError(
            f"Ollama request failed ({response.status_code}): {detail}",
            response=response,
        ) from exc

    data = response.json()
    # ``or ""`` also covers an explicit null in the response field.
    caption = (data.get("response") or "").strip()

    if not caption:
        raise ValueError(
            "Ollama returned an empty caption. "
            "This can happen with some models or languages — try a different --ollama-model."
        )

    return caption
71
+
72
+
73
def list_models(host: str = "http://localhost:11434") -> list[str]:
    """Return the names of models available in the local Ollama instance.

    Parameters
    ----------
    host:
        Base URL of the Ollama API. A value without a scheme (for example
        ``127.0.0.1:11434``) is treated as ``http://``.

    Returns
    -------
    The ``name`` of every entry in the ``models`` list of the ``/api/tags``
    response; an empty list when that field is missing or null.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    # requests refuses URLs without a scheme, so default to plain HTTP.
    base_url = host.strip().rstrip("/")
    if "://" not in base_url:
        base_url = f"http://{base_url}"

    response = requests.get(f"{base_url}/api/tags", timeout=30)
    response.raise_for_status()
    return [entry["name"] for entry in response.json().get("models") or []]
@@ -0,0 +1,170 @@
1
+ """Command-line interface for subtitle-engine."""
2
+
3
+ from pathlib import Path
4
+ from typing import Annotated, Optional
5
+
6
+ import typer
7
+ from rich.console import Console
8
+
9
+ from subtitle_engine.captioner import generate_caption
10
+ from subtitle_engine.srt_writer import write_srt
11
+ from subtitle_engine.transcriber import transcribe
12
+ from subtitle_engine.utils import resolve_output_path, validate_media_file
13
+
14
# Shared Rich console used for all CLI output.
console = Console()

# Typer application object that the command below is registered on.
app = typer.Typer(
    no_args_is_help=True,
    help="Generate SRT subtitles from audio/video files using WhisperX",
)
19
+
20
+
21
# NOTE: the docstring of ``main`` is shown by Typer as the command help text,
# so it is user-facing; implementation notes live in comments instead.
@app.command()
def main(
    input_file: Annotated[
        Path,
        typer.Argument(
            help="Audio or video file to transcribe",
            exists=True,
            file_okay=True,
            dir_okay=False,
            readable=True,
        ),
    ],
    output: Annotated[
        Optional[Path],
        typer.Option(
            "--output",
            "-o",
            help="Output SRT file (default: <input>.srt)",
            file_okay=True,
            dir_okay=False,
        ),
    ] = None,
    model: Annotated[
        str,
        typer.Option(
            "--model",
            "-m",
            help="WhisperX model: tiny, base, small, medium, large-v2, large-v3",
        ),
    ] = "small",
    language: Annotated[
        Optional[str],
        typer.Option(
            "--language",
            "-l",
            help="Language code, e.g. en, fi. Auto-detected if omitted.",
        ),
    ] = None,
    device: Annotated[
        Optional[str],
        typer.Option(
            "--device",
            "-d",
            help="Device: cpu, cuda or mps. Auto-detected if omitted.",
        ),
    ] = None,
    batch_size: Annotated[
        int,
        typer.Option(
            "--batch-size",
            "-b",
            help="WhisperX inference batch size",
            min=1,
        ),
    ] = 16,
    compute_type: Annotated[
        Optional[str],
        typer.Option(
            "--compute-type",
            "-c",
            help="Compute type: int8 or float16. Auto-selected if omitted.",
        ),
    ] = None,
    diarize: Annotated[
        bool,
        typer.Option(
            "--diarize",
            help="Run speaker diarization (requires --hf-token)",
        ),
    ] = False,
    hf_token: Annotated[
        Optional[str],
        typer.Option(
            "--hf-token",
            help="Hugging Face token for diarization",
            envvar="HF_TOKEN",
        ),
    ] = None,
    caption: Annotated[
        bool,
        typer.Option(
            "--caption",
            help="Generate a caption from the transcript using Ollama",
        ),
    ] = False,
    ollama_model: Annotated[
        Optional[str],
        typer.Option(
            "--ollama-model",
            help="Ollama model for caption generation. Required if --caption is set.",
        ),
    ] = None,
    ollama_host: Annotated[
        str,
        typer.Option(
            "--ollama-host",
            help="Ollama API host",
            envvar="OLLAMA_HOST",
        ),
    ] = "http://localhost:11434",
) -> None:
    """Generate SRT subtitles from a media file."""
    # Rich parses "[...]" in printed strings as markup. Paths and exception
    # messages are arbitrary text, so they are escaped before interpolation.
    from rich.markup import escape

    # Names the step in progress so the catch-all handler reports the step
    # that actually failed rather than always blaming transcription.
    stage = "Transcription"
    try:
        validate_media_file(input_file)
        output_path = resolve_output_path(input_file, output)

        # Checked before transcribing so a missing option fails fast.
        if caption and not ollama_model:
            raise ValueError("--ollama-model is required when using --caption")

        console.print(f"[bold]Transcribing:[/bold] {escape(str(input_file))}")
        console.print(f"[bold]Model:[/bold] {escape(model)}")
        if language:
            console.print(f"[bold]Language:[/bold] {escape(language)}")
        if device:
            console.print(f"[bold]Device:[/bold] {escape(device)}")

        segments = transcribe(
            input_file,
            model_name=model,
            language=language,
            device=device,
            batch_size=batch_size,
            compute_type=compute_type,
            diarize=diarize,
            hf_token=hf_token,
        )

        stage = "Writing subtitles"
        write_srt(segments, output_path)
        console.print(f"[green]Wrote subtitles to:[/green] {escape(str(output_path))}")

        if caption:
            stage = "Caption generation"
            transcript = " ".join(str(segment.get("text", "")).strip() for segment in segments)
            caption_text = generate_caption(
                transcript,
                model=ollama_model,
                host=ollama_host,
            )
            # Replaces the output file's suffix: subs.srt -> subs.caption.txt
            caption_path = output_path.with_suffix(".caption.txt")
            caption_path.write_text(caption_text, encoding="utf-8")
            console.print(f"[green]Wrote caption to:[/green] {escape(str(caption_path))}")
    except (ValueError, FileNotFoundError, ConnectionError) as exc:
        # Expected, user-correctable problems.
        console.print(f"[red]Error:[/red] {escape(str(exc))}")
        raise typer.Exit(code=1) from exc
    except Exception as exc:  # noqa: BLE001
        # Top-level boundary: report any other failure and exit non-zero.
        console.print(f"[red]{stage} failed:[/red] {escape(str(exc))}")
        raise typer.Exit(code=1) from exc
167
+
168
+
169
+ if __name__ == "__main__":
170
+ app()
@@ -0,0 +1,45 @@
1
+ """Convert transcription segments to SRT format."""
2
+
3
+ from pathlib import Path
4
+ from typing import Iterable
5
+
6
+
7
def _format_time(seconds: float) -> str:
    """Convert seconds to SRT time format HH:MM:SS,mmm.

    The value is rounded to the nearest millisecond. Negative input is
    clamped to zero, because floor division on a negative millisecond count
    would otherwise yield a malformed stamp such as ``-1:59:59,500``.
    Hours are zero-padded to two digits and grow beyond that as needed.
    """
    total_millis = max(0, int(round(seconds * 1000)))
    total_secs, millis = divmod(total_millis, 1_000)
    total_minutes, secs = divmod(total_secs, 60)
    hours, minutes = divmod(total_minutes, 60)

    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"
16
+
17
+
18
def _format_segment(index: int, start: float, end: float, text: str) -> str:
    """Render one SRT block: index line, timing line, then the text.

    Text that is empty after stripping is replaced by ``...``. The block
    ends with a single newline.
    """
    body = text.strip() or "..."
    timing = f"{_format_time(start)} --> {_format_time(end)}"
    return "\n".join((str(index), timing, body)) + "\n"
24
+
25
+
26
def segments_to_srt(segments: Iterable[dict]) -> str:
    """Build an SRT string from WhisperX-style segments.

    Each segment is expected to be a dict with keys:
    ``start`` (float), ``end`` (float), and ``text`` (str).

    A segment may also carry a ``speaker`` key (the transcriber's diarization
    step passes segments through ``assign_word_speakers``). When it is
    present and the text is not blank, the text is prefixed with
    ``[<speaker>]: `` so the speaker label reaches the subtitle file instead
    of being dropped.

    Blocks are numbered from 1 and separated by a blank line. An empty input
    yields an empty string.
    """
    blocks = []
    for index, segment in enumerate(segments, start=1):
        start = float(segment["start"])
        end = float(segment["end"])
        text = str(segment["text"])

        speaker = segment.get("speaker")
        # Blank text is left alone so the "..." placeholder still applies.
        if speaker and text.strip():
            text = f"[{speaker}]: {text.strip()}"

        blocks.append(_format_segment(index, start, end, text))
    return "\n".join(blocks)
39
+
40
+
41
def write_srt(segments: Iterable[dict], output_path: Path) -> None:
    """Serialize *segments* as SRT and save them to *output_path* (UTF-8).

    Missing parent directories are created first.
    """
    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    srt_text = segments_to_srt(segments)
    destination.write_text(srt_text, encoding="utf-8")
@@ -0,0 +1,129 @@
1
+ """WhisperX transcription wrapper."""
2
+
3
+ from pathlib import Path
4
+ from typing import Optional
5
+
6
+ import torch
7
+ import whisperx
8
+
9
+
10
# Model names accepted by _validate_model().
VALID_MODELS = {
    "tiny",
    "base",
    "small",
    "medium",
    "large-v2",
    "large-v3",
}

# Device names accepted by _validate_device().
VALID_DEVICES = {
    "cpu",
    "cuda",
    "mps",
}
12
+
13
+
14
def _detect_device(device: Optional[str]) -> str:
    """Pick a device if none was specified.

    An explicitly requested device is returned unchanged. Otherwise ``cuda``
    is chosen when available and ``cpu`` in every other case.

    ``mps`` is deliberately never auto-selected: WhisperX transcribes through
    faster-whisper/CTranslate2, which accepts only CPU and CUDA devices, so
    auto-picking ``mps`` on Apple Silicon made the default invocation fail.
    """
    if device:
        return device
    return "cuda" if torch.cuda.is_available() else "cpu"
23
+
24
+
25
def _default_compute_type(device: str) -> str:
    """Return the default compute type: ``int8`` on CPU, ``float16`` otherwise."""
    return "int8" if device == "cpu" else "float16"
30
+
31
+
32
def _validate_model(model_name: str) -> None:
    """Reject model names that are not listed in ``VALID_MODELS``.

    Raises
    ------
    ValueError
        Naming the rejected model and the sorted list of accepted ones.
    """
    if model_name in VALID_MODELS:
        return
    choices = ", ".join(sorted(VALID_MODELS))
    raise ValueError(f"Unknown model '{model_name}'. Choose from: {choices}")
37
+
38
+
39
def _validate_device(device: str) -> None:
    """Reject device names that are not listed in ``VALID_DEVICES``.

    Raises
    ------
    ValueError
        Naming the rejected device and the sorted list of accepted ones.
    """
    if device in VALID_DEVICES:
        return
    choices = ", ".join(sorted(VALID_DEVICES))
    raise ValueError(f"Unknown device '{device}'. Choose from: {choices}")
44
+
45
+
46
def _release_memory() -> None:
    """Collect garbage and, when CUDA is available, empty its allocator cache."""
    import gc

    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


def transcribe(
    audio_path: Path,
    *,
    model_name: str = "small",
    language: Optional[str] = None,
    device: Optional[str] = None,
    batch_size: int = 16,
    compute_type: Optional[str] = None,
    diarize: bool = False,
    hf_token: Optional[str] = None,
) -> list[dict]:
    """Transcribe an audio/video file and return SRT-ready segments.

    Parameters
    ----------
    audio_path:
        Path to the media file to transcribe.
    model_name:
        WhisperX model size. One of: tiny, base, small, medium, large-v2, large-v3.
    language:
        ISO language code, e.g. ``en`` or ``fi``. If ``None``, WhisperX auto-detects.
    device:
        ``cpu``, ``cuda`` or ``mps``. Auto-detected if omitted.
    batch_size:
        WhisperX batch size for transcription.
    compute_type:
        ``int8`` or ``float16``. Auto-selected per device if omitted.
    diarize:
        Whether to run speaker diarization.
    hf_token:
        Hugging Face token required for diarization.

    Returns
    -------
    A list of segment dicts with ``start``, ``end`` and ``text`` keys.

    Raises
    ------
    ValueError
        If the model or device name is unknown, or ``diarize`` is set
        without ``hf_token``.
    FileNotFoundError
        If ``audio_path`` does not exist.
    """
    _validate_model(model_name)

    audio_path = Path(audio_path)
    if not audio_path.exists():
        raise FileNotFoundError(f"Audio file not found: {audio_path}")

    device = _detect_device(device)
    _validate_device(device)

    if compute_type is None:
        compute_type = _default_compute_type(device)

    # Checked before any model is loaded so the mistake is reported quickly.
    if diarize and not hf_token:
        raise ValueError("--hf-token is required when using --diarize")

    audio = whisperx.load_audio(str(audio_path))

    model = whisperx.load_model(model_name, device, compute_type=compute_type)
    result = model.transcribe(audio, batch_size=batch_size, language=language)

    # Free transcription model memory before alignment. Dropping the name
    # alone does not return memory, so collect and flush the cache as well.
    del model
    _release_memory()

    # Alignment is skipped when the result reports no language.
    detected_language = result.get("language")
    if detected_language:
        align_model, align_metadata = whisperx.load_align_model(
            language_code=detected_language, device=device
        )
        result = whisperx.align(
            result["segments"],
            align_model,
            align_metadata,
            audio,
            device,
            return_char_alignments=False,
        )
        del align_model
        _release_memory()

    if diarize:
        import inspect

        # NOTE(review): recent WhisperX releases appear to expose the pipeline
        # only as ``whisperx.diarize.DiarizationPipeline`` and to name the
        # auth keyword ``token`` rather than ``use_auth_token`` — confirm
        # against the pinned version. Importing from the submodule and
        # picking the keyword from the signature works with either spelling.
        from whisperx.diarize import DiarizationPipeline

        init_params = inspect.signature(DiarizationPipeline.__init__).parameters
        token_kwarg = "token" if "token" in init_params else "use_auth_token"

        diarize_model = DiarizationPipeline(
            model_name="pyannote/speaker-diarization-3.1",
            device=device,
            **{token_kwarg: hf_token},
        )
        diarize_segments = diarize_model(audio)
        result = whisperx.assign_word_speakers(diarize_segments, result)

    return result["segments"]
@@ -0,0 +1,39 @@
1
+ """CLI helpers and path utilities."""
2
+
3
+ from pathlib import Path
4
+ from typing import Optional
5
+
6
+
7
# Lower-case file suffixes accepted by validate_media_file():
# audio containers first, then video containers.
SUPPORTED_EXTENSIONS = {".mp3", ".wav", ".flac", ".aac", ".ogg", ".m4a"} | {
    ".mp4",
    ".mov",
    ".mkv",
    ".avi",
    ".webm",
}
20
+
21
+
22
def resolve_output_path(input_path: Path, output: Optional[Path] = None) -> Path:
    """Choose where the SRT file is written.

    An explicit ``output`` wins. Without one, the result is the input path
    with its final suffix replaced by ``.srt``, i.e. next to the input file.
    """
    return output or input_path.with_suffix(".srt")
31
+
32
+
33
def validate_media_file(path: Path) -> None:
    """Check that *path* has a supported media suffix (case-insensitive).

    Only the file name's suffix is inspected; the file is not opened.

    Raises
    ------
    ValueError
        Naming the offending suffix and the sorted list of supported ones.
    """
    if path.suffix.lower() in SUPPORTED_EXTENSIONS:
        return
    supported = ", ".join(sorted(SUPPORTED_EXTENSIONS))
    raise ValueError(f"Unsupported file type '{path.suffix}'. Supported: {supported}")
@@ -0,0 +1,97 @@
1
+ Metadata-Version: 2.4
2
+ Name: subtitle-engine
3
+ Version: 0.1.0
4
+ Summary: Generate SRT subtitles from audio/video files using WhisperX
5
+ Author: Leevi Puntanen
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/leevipuntanen/subtitle-engine
8
+ Project-URL: Issues, https://github.com/leevipuntanen/subtitle-engine/issues
9
+ Keywords: subtitles,srt,whisperx,transcription,asr
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: End Users/Desktop
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Programming Language :: Python :: 3.13
15
+ Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
16
+ Classifier: Topic :: Utilities
17
+ Requires-Python: >=3.12
18
+ Description-Content-Type: text/markdown
19
+ License-File: LICENSE
20
+ Requires-Dist: typer>=0.12.0
21
+ Requires-Dist: whisperx>=3.8.0
22
+ Requires-Dist: requests>=2.32.0
23
+ Provides-Extra: dev
24
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
25
+ Dynamic: license-file
26
+
27
+ # subtitle-engine
28
+
29
+ Generate `.srt` subtitle files from audio or video files using [WhisperX](https://github.com/m-bain/whisperX). Optionally generate a caption from the transcript with a local [Ollama](https://ollama.com/) LLM.
30
+
31
+ ## Installation
32
+
33
+ Requires Python 3.12 or newer.
34
+
35
+ ```bash
36
+ pip install subtitle-engine
37
+ ```
38
+
39
+ Or install from source:
40
+
41
+ ```bash
42
+ git clone https://github.com/leevipuntanen/subtitle-engine.git
43
+ cd subtitle-engine
44
+ python -m venv .venv
45
+ source .venv/bin/activate
46
+ pip install -e ".[dev]"
47
+ ```
48
+
49
+ ## Usage
50
+
51
+ ```bash
52
+ # Basic usage — writes <input>.srt next to the source file
53
+ subeng video.mp4
54
+
55
+ # Specify output file
56
+ subeng video.mp4 --output subtitles.srt
57
+
58
+ # Use a different model or language
59
+ subeng video.mp4 --model medium --language fi
60
+
61
+ # Force CPU / CUDA / MPS
62
+ subeng video.mp4 --device cpu
63
+
64
+ # Speaker diarization (requires a Hugging Face token)
65
+ subeng video.mp4 --diarize --hf-token $HF_TOKEN
66
+
67
+ # Generate a caption from the transcript using Ollama
68
+ subeng video.mp4 --caption --ollama-model qwen3.5:0.8b
69
+ ```
70
+
71
+ ## Options
72
+
73
+ | Option | Description |
74
+ |--------|-------------|
75
+ | `--output`, `-o` | Output SRT file path |
76
+ | `--model`, `-m` | WhisperX model: `tiny`, `base`, `small` (default), `medium`, `large-v2`, `large-v3` |
77
+ | `--language`, `-l` | ISO language code, e.g. `en`, `fi`. Auto-detected if omitted. |
78
+ | `--device`, `-d` | `cpu`, `cuda` or `mps`. Auto-detected if omitted. |
79
+ | `--batch-size`, `-b` | Inference batch size (default: 16) |
80
+ | `--compute-type`, `-c` | `int8` or `float16`. Auto-selected if omitted. |
81
+ | `--diarize` | Enable speaker diarization |
82
+ | `--hf-token` | Hugging Face token for diarization (or set `HF_TOKEN` env var) |
83
+ | `--caption` | Generate a caption from the transcript via Ollama |
84
+ | `--ollama-model` | Ollama model name (required with `--caption`) |
85
+ | `--ollama-host` | Ollama API host (default: `http://localhost:11434`) |
86
+
87
+ ## Development
88
+
89
+ Run the test suite:
90
+
91
+ ```bash
92
+ pytest
93
+ ```
94
+
95
+ ## License
96
+
97
+ MIT
@@ -0,0 +1,18 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ src/subtitle_engine/__init__.py
5
+ src/subtitle_engine/captioner.py
6
+ src/subtitle_engine/cli.py
7
+ src/subtitle_engine/srt_writer.py
8
+ src/subtitle_engine/transcriber.py
9
+ src/subtitle_engine/utils.py
10
+ src/subtitle_engine.egg-info/PKG-INFO
11
+ src/subtitle_engine.egg-info/SOURCES.txt
12
+ src/subtitle_engine.egg-info/dependency_links.txt
13
+ src/subtitle_engine.egg-info/entry_points.txt
14
+ src/subtitle_engine.egg-info/requires.txt
15
+ src/subtitle_engine.egg-info/top_level.txt
16
+ tests/test_captioner.py
17
+ tests/test_cli.py
18
+ tests/test_srt_writer.py
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ subeng = subtitle_engine.cli:app
@@ -0,0 +1,6 @@
1
+ typer>=0.12.0
2
+ whisperx>=3.8.0
3
+ requests>=2.32.0
4
+
5
+ [dev]
6
+ pytest>=8.0.0
@@ -0,0 +1 @@
1
+ subtitle_engine
@@ -0,0 +1,67 @@
1
+ """Tests for the Ollama captioner module."""
2
+
3
+ from unittest.mock import Mock, patch
4
+
5
+ import pytest
6
+ import requests
7
+
8
+ from subtitle_engine.captioner import generate_caption, list_models
9
+
10
+
11
@patch("subtitle_engine.captioner.requests.post")
def test_generate_caption_success(mock_post):
    """A successful reply is stripped; the request carries model and transcript."""
    fake_response = Mock(
        status_code=200,
        json=lambda: {"response": " A short caption. "},
        raise_for_status=lambda: None,
    )
    mock_post.return_value = fake_response

    result = generate_caption(
        "hello world",
        model="qwen3.5:0.8b",
        host="http://localhost:11434",
    )

    assert result == "A short caption."
    mock_post.assert_called_once()
    sent_payload = mock_post.call_args.kwargs["json"]
    assert sent_payload["model"] == "qwen3.5:0.8b"
    assert "hello world" in sent_payload["prompt"]
30
+
31
+
32
@patch("subtitle_engine.captioner.requests.post")
def test_generate_caption_empty_response(mock_post):
    """A whitespace-only reply from Ollama is reported as an empty caption."""
    blank_reply = Mock(
        status_code=200,
        json=lambda: {"response": " "},
        raise_for_status=lambda: None,
    )
    mock_post.return_value = blank_reply

    with pytest.raises(ValueError, match="empty caption"):
        generate_caption("hello", model="qwen3.5:0.8b")
42
+
43
+
44
def test_generate_caption_empty_transcript():
    """A whitespace-only transcript is rejected with a ValueError."""
    blank_transcript = " "
    with pytest.raises(ValueError, match="empty transcript"):
        generate_caption(blank_transcript, model="qwen3.5:0.8b")
47
+
48
+
49
@patch(
    "subtitle_engine.captioner.requests.post",
    side_effect=requests.ConnectionError("connection refused"),
)
def test_generate_caption_connection_error(mock_post):
    """A requests connection failure is re-raised as the built-in ConnectionError."""
    expected_error = ConnectionError
    with pytest.raises(expected_error):
        generate_caption("hello", model="qwen3.5:0.8b")
56
+
57
+
58
@patch("subtitle_engine.captioner.requests.get")
def test_list_models(mock_get):
    """Model names are returned in the order the tags response lists them."""
    tags_payload = {"models": [{"name": "qwen3.5:0.8b"}, {"name": "llama3.2"}]}
    mock_get.return_value = Mock(
        status_code=200,
        json=lambda: tags_payload,
        raise_for_status=lambda: None,
    )

    assert list_models() == ["qwen3.5:0.8b", "llama3.2"]
@@ -0,0 +1,51 @@
1
+ """Tests for CLI helpers and argument parsing."""
2
+
3
+ from pathlib import Path
4
+
5
+ import pytest
6
+ from typer.testing import CliRunner
7
+
8
+ from subtitle_engine.cli import app
9
+ from subtitle_engine.utils import resolve_output_path, validate_media_file
10
+
11
+ runner = CliRunner()
12
+
13
+
14
def test_resolve_output_path_default():
    """Without an explicit output, the input's suffix is swapped for .srt."""
    resolved = resolve_output_path(Path("movie.mp4"))
    assert resolved == Path("movie.srt")
17
+
18
+
19
def test_resolve_output_path_explicit():
    """An explicit output path is returned as given."""
    explicit = Path("custom.srt")
    resolved = resolve_output_path(Path("movie.mp4"), explicit)
    assert resolved == explicit
23
+
24
+
25
def test_validate_media_file_supported():
    """A supported suffix passes validation without raising."""
    supported_file = Path("video.mp4")
    validate_media_file(supported_file)
27
+
28
+
29
def test_validate_media_file_unsupported():
    """An unsupported suffix raises a ValueError naming the problem."""
    unsupported_file = Path("file.txt")
    with pytest.raises(ValueError, match="Unsupported file type"):
        validate_media_file(unsupported_file)
32
+
33
+
34
def test_cli_help():
    """--help exits cleanly and shows the command description."""
    outcome = runner.invoke(app, ["--help"])
    assert outcome.exit_code == 0
    assert "Generate SRT subtitles" in outcome.output
38
+
39
+
40
def test_cli_no_args():
    """Running with no arguments prints usage and exits non-zero."""
    outcome = runner.invoke(app)
    assert outcome.exit_code != 0
    assert "Usage:" in outcome.output
44
+
45
+
46
def test_caption_requires_ollama_model(tmp_path: Path):
    """--caption without --ollama-model fails with an explanatory message."""
    media_file = tmp_path / "video.mp4"
    media_file.write_bytes(b"fake")

    cli_args = [str(media_file), "--caption"]
    outcome = runner.invoke(app, cli_args)

    assert outcome.exit_code != 0
    assert "--ollama-model is required" in outcome.output
@@ -0,0 +1,66 @@
1
+ """Tests for the SRT writer module."""
2
+
3
+ from pathlib import Path
4
+
5
+ from subtitle_engine.srt_writer import (
6
+ _format_segment,
7
+ _format_time,
8
+ segments_to_srt,
9
+ write_srt,
10
+ )
11
+
12
+
13
def test_format_time_zero():
    """Zero seconds renders as the all-zero timestamp."""
    rendered = _format_time(0.0)
    assert rendered == "00:00:00,000"
15
+
16
+
17
def test_format_time_with_hours():
    """Hours, minutes, seconds and milliseconds are all populated."""
    rendered = _format_time(3661.123)
    assert rendered == "01:01:01,123"
19
+
20
+
21
def test_format_time_milliseconds_rounding():
    """Sub-millisecond input rounds up into the next whole second."""
    rendered = _format_time(0.9996)
    assert rendered == "00:00:01,000"
23
+
24
+
25
def test_format_time_millis_ceiling_guard():
    """Rounding must carry into the seconds field, never emit 1000 ms."""
    # 1.9999 rounds to 2.000 -> should not produce 1000 ms
    rendered = _format_time(1.9999)
    assert rendered == "00:00:02,000"
28
+
29
+
30
def test_format_segment():
    """One segment renders as index line, timing line, text and a newline."""
    expected_block = "1\n00:00:01,500 --> 00:00:04,250\nHello world\n"
    assert _format_segment(1, 1.5, 4.25, "Hello world") == expected_block
33
+
34
+
35
def test_segments_to_srt():
    """Segments are numbered from 1 and rendered with their timings and text."""
    first = {"start": 0.0, "end": 2.0, "text": "First line"}
    second = {"start": 3.5, "end": 5.5, "text": "Second line"}

    rendered = segments_to_srt([first, second])

    assert "1\n00:00:00,000 --> 00:00:02,000\nFirst line" in rendered
    assert "2\n00:00:03,500 --> 00:00:05,500\nSecond line" in rendered
43
+
44
+
45
def test_segments_to_srt_empty_text_falls_back():
    """Blank segment text is replaced by the '...' placeholder."""
    blank_segment = {"start": 0.0, "end": 1.0, "text": " "}
    rendered = segments_to_srt([blank_segment])
    assert "..." in rendered
48
+
49
+
50
def test_segments_to_srt_empty():
    """No segments produce an empty string."""
    rendered = segments_to_srt([])
    assert rendered == ""
52
+
53
+
54
def test_write_srt(tmp_path: Path):
    """write_srt creates the file and its content includes the timing line."""
    target = tmp_path / "subs.srt"

    write_srt([{"start": 0.0, "end": 1.0, "text": "Hello"}], target)

    assert target.exists()
    written = target.read_text(encoding="utf-8")
    assert "00:00:00,000 --> 00:00:01,000" in written
60
+
61
+
62
def test_write_srt_creates_parent_dirs(tmp_path: Path):
    """Missing parent directories of the output path are created."""
    target = tmp_path / "nested" / "dir" / "subs.srt"

    write_srt([{"start": 0.0, "end": 1.0, "text": "Hello"}], target)

    assert target.exists()
66
+ assert output.exists()