subtitle-engine 0.1.2__tar.gz → 0.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (25) hide show
  1. {subtitle_engine-0.1.2/src/subtitle_engine.egg-info → subtitle_engine-0.1.3}/PKG-INFO +9 -2
  2. {subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/README.md +8 -1
  3. {subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/pyproject.toml +1 -1
  4. {subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/src/subtitle_engine/__init__.py +1 -1
  5. {subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/src/subtitle_engine/cli.py +15 -0
  6. subtitle_engine-0.1.3/src/subtitle_engine/segmenter.py +165 -0
  7. {subtitle_engine-0.1.2 → subtitle_engine-0.1.3/src/subtitle_engine.egg-info}/PKG-INFO +9 -2
  8. {subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/src/subtitle_engine.egg-info/SOURCES.txt +2 -0
  9. {subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/tests/test_cli.py +27 -2
  10. subtitle_engine-0.1.3/tests/test_segmenter.py +84 -0
  11. {subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/LICENSE +0 -0
  12. {subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/setup.cfg +0 -0
  13. {subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/src/subtitle_engine/captioner.py +0 -0
  14. {subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/src/subtitle_engine/srt_writer.py +0 -0
  15. {subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/src/subtitle_engine/transcriber.py +0 -0
  16. {subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/src/subtitle_engine/updater.py +0 -0
  17. {subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/src/subtitle_engine/utils.py +0 -0
  18. {subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/src/subtitle_engine.egg-info/dependency_links.txt +0 -0
  19. {subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/src/subtitle_engine.egg-info/entry_points.txt +0 -0
  20. {subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/src/subtitle_engine.egg-info/requires.txt +0 -0
  21. {subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/src/subtitle_engine.egg-info/top_level.txt +0 -0
  22. {subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/tests/test_captioner.py +0 -0
  23. {subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/tests/test_srt_writer.py +0 -0
  24. {subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/tests/test_transcriber.py +0 -0
  25. {subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/tests/test_updater.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: subtitle-engine
3
- Version: 0.1.2
3
+ Version: 0.1.3
4
4
  Summary: Generate SRT subtitles from audio/video files using WhisperX
5
5
  Author: Leevi Puntanen
6
6
  License-Expression: MIT
@@ -65,7 +65,13 @@ subeng video.mp4 --device cpu
65
65
  subeng video.mp4 --diarize --hf-token $HF_TOKEN
66
66
 
67
67
  # Generate a caption from the transcript using Ollama
68
- subeng video.mp4 --caption --ollama-model qwen3.5:0.8b
68
+ subeng video.mp4 --caption --ollama-model qwen3.5:0.6b
69
+
70
+ # Short-form subtitles (2-5 words per line, default)
71
+ subeng video.mp4 --preset shortform
72
+
73
+ # Long-form subtitles (10-14 words per line)
74
+ subeng video.mp4 --preset longform
69
75
  ```
70
76
 
71
77
  ## Options
@@ -83,6 +89,7 @@ subeng video.mp4 --caption --ollama-model qwen3.5:0.8b
83
89
  | `--caption` | Generate a caption from the transcript via Ollama |
84
90
  | `--ollama-model` | Ollama model name (required with `--caption`) |
85
91
  | `--ollama-host` | Ollama API host (default: `http://localhost:11434`) |
92
+ | `--preset`, `-p` | Subtitle style: `shortform` (2-5 words, default) or `longform` (10-14 words) |
86
93
 
87
94
  ## Development
88
95
 
@@ -39,7 +39,13 @@ subeng video.mp4 --device cpu
39
39
  subeng video.mp4 --diarize --hf-token $HF_TOKEN
40
40
 
41
41
  # Generate a caption from the transcript using Ollama
42
- subeng video.mp4 --caption --ollama-model qwen3.5:0.8b
42
+ subeng video.mp4 --caption --ollama-model qwen3.5:0.6b
43
+
44
+ # Short-form subtitles (2-5 words per line, default)
45
+ subeng video.mp4 --preset shortform
46
+
47
+ # Long-form subtitles (10-14 words per line)
48
+ subeng video.mp4 --preset longform
43
49
  ```
44
50
 
45
51
  ## Options
@@ -57,6 +63,7 @@ subeng video.mp4 --caption --ollama-model qwen3.5:0.8b
57
63
  | `--caption` | Generate a caption from the transcript via Ollama |
58
64
  | `--ollama-model` | Ollama model name (required with `--caption`) |
59
65
  | `--ollama-host` | Ollama API host (default: `http://localhost:11434`) |
66
+ | `--preset`, `-p` | Subtitle style: `shortform` (2-5 words, default) or `longform` (10-14 words) |
60
67
 
61
68
  ## Development
62
69
 
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "subtitle-engine"
7
- version = "0.1.2"
7
+ version = "0.1.3"
8
8
  description = "Generate SRT subtitles from audio/video files using WhisperX"
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -1,3 +1,3 @@
1
1
  """Subtitle Engine — generate SRT files with WhisperX."""
2
2
 
3
- __version__ = "0.1.1"
3
+ __version__ = "0.1.3"
@@ -9,6 +9,7 @@ from rich.console import Console
9
9
 
10
10
  from subtitle_engine import __version__
11
11
  from subtitle_engine.captioner import generate_caption
12
+ from subtitle_engine.segmenter import VALID_PRESETS, split_segments
12
13
  from subtitle_engine.srt_writer import write_srt
13
14
  from subtitle_engine.transcriber import transcribe
14
15
  from subtitle_engine.updater import UpdateCheckError, check_for_update, update_package
@@ -162,6 +163,14 @@ def main(
162
163
  envvar="OLLAMA_HOST",
163
164
  ),
164
165
  ] = "http://localhost:11434",
166
+ preset: Annotated[
167
+ str,
168
+ typer.Option(
169
+ "--preset",
170
+ "-p",
171
+ help="Subtitle style: shortform (2-5 words) or longform (10-14 words).",
172
+ ),
173
+ ] = "shortform",
165
174
  quiet: Annotated[
166
175
  bool,
167
176
  typer.Option(
@@ -190,6 +199,10 @@ def main(
190
199
  ) -> None:
191
200
  """Generate SRT subtitles from a media file."""
192
201
  try:
202
+ if preset not in VALID_PRESETS:
203
+ valid = ", ".join(sorted(VALID_PRESETS))
204
+ raise ValueError(f"Unknown preset '{preset}'. Choose from: {valid}")
205
+
193
206
  validate_media_file(input_file)
194
207
  output_path = resolve_output_path(input_file, output)
195
208
 
@@ -208,6 +221,7 @@ def main(
208
221
  if not quiet:
209
222
  console.print(f"[bold]Transcribing:[/bold] {input_file}")
210
223
  console.print(f"[bold]Model:[/bold] {model}")
224
+ console.print(f"[bold]Preset:[/bold] {preset}")
211
225
  if language:
212
226
  console.print(f"[bold]Language:[/bold] {language}")
213
227
  if device:
@@ -225,6 +239,7 @@ def main(
225
239
  verbose=verbose,
226
240
  )
227
241
 
242
+ segments = split_segments(segments, preset=preset)
228
243
  write_srt(segments, output_path)
229
244
  if not quiet:
230
245
  console.print(f"[green]Wrote subtitles to:[/green] {output_path}")
@@ -0,0 +1,165 @@
1
+ """Split WhisperX segments into shorter or longer subtitle chunks."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections import Counter
6
+ from typing import Iterable
7
+
8
+
9
PRESET_SHORTFORM = "shortform"
PRESET_LONGFORM = "longform"

# Word-count targets per subtitle block, stored as (min_words, max_words).
PRESET_TARGETS = {
    PRESET_SHORTFORM: (2, 5),
    PRESET_LONGFORM: (10, 14),
}

# Every preset name that has a target entry above is a valid choice.
VALID_PRESETS = set(PRESET_TARGETS)
18
+
19
+
20
+ def _sanitize_text(text: str) -> str:
21
+ """Return a cleaned version of the text for display."""
22
+ return " ".join(text.split())
23
+
24
+
25
+ def _words_from_segment(segment: dict) -> list[dict]:
26
+ """Extract a clean list of word dicts from a WhisperX segment.
27
+
28
+ Each word dict should have ``word`` and optionally ``start``/``end``.
29
+ """
30
+ raw_words = segment.get("words", [])
31
+ words = []
32
+ for word_entry in raw_words:
33
+ if isinstance(word_entry, dict):
34
+ word_text = word_entry.get("word", "").strip()
35
+ else:
36
+ word_text = str(word_entry).strip()
37
+ if word_text:
38
+ words.append({"word": word_text, **word_entry} if isinstance(word_entry, dict) else {"word": word_text})
39
+ return words
40
+
41
+
42
+ def _split_text_evenly(text: str, chunk_count: int) -> list[str]:
43
+ """Split text into ``chunk_count`` roughly equal word groups."""
44
+ tokens = text.split()
45
+ if chunk_count <= 1 or len(tokens) <= chunk_count:
46
+ return [text]
47
+
48
+ base_size, remainder = divmod(len(tokens), chunk_count)
49
+ chunks = []
50
+ index = 0
51
+ for i in range(chunk_count):
52
+ size = base_size + (1 if i < remainder else 0)
53
+ chunks.append(" ".join(tokens[index : index + size]))
54
+ index += size
55
+ return chunks
56
+
57
+
58
+ def _dominant_speaker(words: list[dict]) -> str | None:
59
+ """Return the most common speaker label among the given words, if any."""
60
+ speakers = [w.get("speaker") for w in words if w.get("speaker")]
61
+ if not speakers:
62
+ return None
63
+ return Counter(speakers).most_common(1)[0][0]
64
+
65
+
66
+ def _prefix_speaker(text: str, speaker: str | None) -> str:
67
+ """Prefix a speaker label to text when one is known."""
68
+ if not speaker:
69
+ return text
70
+ return f"[{speaker}] {text}"
71
+
72
+
73
def _split_segment(
    segment: dict,
    min_words: int,
    max_words: int,
) -> list[dict]:
    """Split a single WhisperX segment into subtitle-sized chunks.

    Words are grouped into consecutive runs of at most ``max_words``. When
    the final run would be shorter than ``min_words`` and a previous run
    exists, the last two runs are rebalanced into two near-equal halves, so
    no chunk ever exceeds ``max_words``.

    Word-level timings are used when a chunk has any word with both ``start``
    and ``end``. Otherwise the chunk receives the slice of the segment's
    duration that is proportional to its word position within the segment.
    A segment without usable ``words`` is split on the whitespace tokens of
    its ``text`` using the same rules.

    Parameters
    ----------
    segment:
        Segment dict with ``start``, ``end``, ``text`` and optionally
        ``words``. A missing or ``None`` ``start`` is treated as ``0.0``;
        a missing or ``None`` ``end`` is treated as equal to ``start``.
    min_words:
        Smallest preferred number of words in the trailing chunk.
    max_words:
        Largest number of words per chunk (values below 1 are treated as 1).

    Returns
    -------
    A list of dicts with ``start``, ``end`` and ``text`` keys, in order.
    Empty when the segment has neither words nor text.
    """
    segment_start = float(segment.get("start") or 0.0)
    raw_end = segment.get("end")
    segment_end = segment_start if raw_end is None else float(raw_end)

    words = _words_from_segment(segment)
    if not words:
        # No word-level data: chunk the plain text tokens instead, so long
        # segments are still split; they use the proportional timing below.
        words = [{"word": token} for token in str(segment.get("text") or "").split()]
    if not words:
        return []

    # Group words into runs of at most ``max_words``.
    step = max(1, max_words)
    chunks = [words[i : i + step] for i in range(0, len(words), step)]

    # Rebalance a too-short trailing chunk with its predecessor rather than
    # merging them, because a merge would exceed ``max_words``.
    if len(chunks) > 1 and len(chunks[-1]) < min_words:
        tail = chunks.pop()
        combined = chunks.pop() + tail
        half = (len(combined) + 1) // 2
        chunks.extend([combined[:half], combined[half:]])

    total_words = len(words)
    duration = segment_end - segment_start

    result: list[dict] = []
    consumed = 0  # number of words placed in earlier chunks
    for chunk in chunks:
        first_index = consumed
        consumed += len(chunk)

        text = _sanitize_text(" ".join(w["word"] for w in chunk))
        if not text:
            continue

        timed_words = [
            w for w in chunk if w.get("start") is not None and w.get("end") is not None
        ]
        if timed_words:
            start = float(timed_words[0]["start"])
            end = float(timed_words[-1]["end"])
        else:
            # Fallback: share the segment duration in proportion to the
            # chunk's word span. Tracking the running offset (instead of
            # looking the chunk up by equality) keeps identical chunks from
            # receiving the same time window.
            start = segment_start + duration * first_index / total_words
            end = segment_start + duration * consumed / total_words

        speaker = _dominant_speaker(chunk)
        result.append({"start": start, "end": end, "text": _prefix_speaker(text, speaker)})

    return result
135
+
136
+
137
def split_segments(
    segments: Iterable[dict],
    preset: str = PRESET_SHORTFORM,
) -> list[dict]:
    """Re-chunk WhisperX segments to match the word-count range of a preset.

    Parameters
    ----------
    segments:
        WhisperX segments with ``start``, ``end``, ``text`` and optionally
        per-word timings.
    preset:
        ``shortform`` or ``longform``.

    Returns
    -------
    A flat list of segment dicts suitable for writing to SRT.

    Raises
    ------
    ValueError
        If ``preset`` is not one of the known preset names.
    """
    if preset not in VALID_PRESETS:
        valid = ", ".join(sorted(VALID_PRESETS))
        raise ValueError(f"Unknown preset '{preset}'. Choose from: {valid}")

    lower, upper = PRESET_TARGETS[preset]
    return [
        piece
        for segment in segments
        for piece in _split_segment(segment, lower, upper)
    ]
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: subtitle-engine
3
- Version: 0.1.2
3
+ Version: 0.1.3
4
4
  Summary: Generate SRT subtitles from audio/video files using WhisperX
5
5
  Author: Leevi Puntanen
6
6
  License-Expression: MIT
@@ -65,7 +65,13 @@ subeng video.mp4 --device cpu
65
65
  subeng video.mp4 --diarize --hf-token $HF_TOKEN
66
66
 
67
67
  # Generate a caption from the transcript using Ollama
68
- subeng video.mp4 --caption --ollama-model qwen3.5:0.8b
68
+ subeng video.mp4 --caption --ollama-model qwen3.5:0.6b
69
+
70
+ # Short-form subtitles (2-5 words per line, default)
71
+ subeng video.mp4 --preset shortform
72
+
73
+ # Long-form subtitles (10-14 words per line)
74
+ subeng video.mp4 --preset longform
69
75
  ```
70
76
 
71
77
  ## Options
@@ -83,6 +89,7 @@ subeng video.mp4 --caption --ollama-model qwen3.5:0.8b
83
89
  | `--caption` | Generate a caption from the transcript via Ollama |
84
90
  | `--ollama-model` | Ollama model name (required with `--caption`) |
85
91
  | `--ollama-host` | Ollama API host (default: `http://localhost:11434`) |
92
+ | `--preset`, `-p` | Subtitle style: `shortform` (2-5 words, default) or `longform` (10-14 words) |
86
93
 
87
94
  ## Development
88
95
 
@@ -4,6 +4,7 @@ pyproject.toml
4
4
  src/subtitle_engine/__init__.py
5
5
  src/subtitle_engine/captioner.py
6
6
  src/subtitle_engine/cli.py
7
+ src/subtitle_engine/segmenter.py
7
8
  src/subtitle_engine/srt_writer.py
8
9
  src/subtitle_engine/transcriber.py
9
10
  src/subtitle_engine/updater.py
@@ -16,6 +17,7 @@ src/subtitle_engine.egg-info/requires.txt
16
17
  src/subtitle_engine.egg-info/top_level.txt
17
18
  tests/test_captioner.py
18
19
  tests/test_cli.py
20
+ tests/test_segmenter.py
19
21
  tests/test_srt_writer.py
20
22
  tests/test_transcriber.py
21
23
  tests/test_updater.py
@@ -67,7 +67,7 @@ def test_cli_version_long():
67
67
  result = runner.invoke(app, ["--version"])
68
68
  assert result.exit_code == 0
69
69
  assert "subeng" in result.output
70
- assert "0.1.1" in result.output
70
+ assert "0.1.2" in result.output
71
71
 
72
72
 
73
73
  def test_cli_version_short():
@@ -79,7 +79,7 @@ def test_cli_version_short():
79
79
def test_cli_version_no_extra_output():
    """``--version`` prints exactly ``subeng <version>`` and nothing else."""
    result = runner.invoke(app, ["--version"])
    assert result.exit_code == 0
    # Must match ``subtitle_engine.__version__``, which this release sets to
    # 0.1.3; the previous expectation (0.1.2) no longer matches the package.
    assert result.output.strip() == "subeng 0.1.3"
83
83
 
84
84
 
85
85
  def test_cli_quiet_hides_status_but_keeps_errors(tmp_path: Path):
@@ -145,3 +145,28 @@ def test_main_entry_runs_typer_app_for_transcription():
145
145
  with patch.object(sys, "argv", ["subeng", "video.mp4"]):
146
146
  main_entry()
147
147
  mock_app.assert_called_once()
148
+
149
+
150
def test_cli_preset_shortform_accepted(tmp_path: Path):
    """The ``shortform`` preset passes validation and is echoed in the status output."""
    fake_media = tmp_path / "video.mp4"
    fake_media.write_bytes(b"fake")

    outcome = runner.invoke(app, [str(fake_media), "--preset", "shortform"])

    # The preset is accepted; the run still fails later because the media
    # file is not real audio/video.
    assert outcome.exit_code != 0
    assert "Preset: shortform" in outcome.output
157
+
158
+
159
def test_cli_preset_longform_accepted(tmp_path: Path):
    """The ``longform`` preset passes validation and is echoed in the status output."""
    fake_media = tmp_path / "video.mp4"
    fake_media.write_bytes(b"fake")

    outcome = runner.invoke(app, [str(fake_media), "--preset", "longform"])

    assert outcome.exit_code != 0
    assert "Preset: longform" in outcome.output
165
+
166
+
167
def test_cli_invalid_preset_rejected(tmp_path: Path):
    """An unrecognised preset name makes the CLI exit non-zero with an explanatory message."""
    fake_media = tmp_path / "video.mp4"
    fake_media.write_bytes(b"fake")

    outcome = runner.invoke(app, [str(fake_media), "--preset", "invalid"])

    assert outcome.exit_code != 0
    assert "Unknown preset" in outcome.output
@@ -0,0 +1,84 @@
1
+ """Tests for the segmenter/preset splitter."""
2
+
3
+ import pytest
4
+
5
+ from subtitle_engine.segmenter import (
6
+ PRESET_LONGFORM,
7
+ PRESET_SHORTFORM,
8
+ split_segments,
9
+ )
10
+
11
+
12
+ def _segment(text: str, words: list[dict] | None = None, start: float = 0.0, end: float = 1.0) -> dict:
13
+ return {
14
+ "start": start,
15
+ "end": end,
16
+ "text": text,
17
+ "words": words if words is not None else [{"word": w} for w in text.split()],
18
+ }
19
+
20
+
21
def test_shortform_splits_to_small_chunks():
    """Eight untimed words yield at least two chunks of one to five words each."""
    source = _segment("one two three four five six seven eight", start=0.0, end=8.0)
    chunks = split_segments([source], preset=PRESET_SHORTFORM)

    assert len(chunks) >= 2
    sizes = [len(chunk["text"].split()) for chunk in chunks]
    assert all(1 <= size <= 5 for size in sizes)
29
+
30
+
31
def test_longform_allows_larger_chunks():
    """Twenty-five untimed words yield chunks of one to fourteen words each."""
    source = _segment(" ".join(map(str, range(25))), start=0.0, end=25.0)
    chunks = split_segments([source], preset=PRESET_LONGFORM)

    assert len(chunks) >= 1
    sizes = [len(chunk["text"].split()) for chunk in chunks]
    assert all(1 <= size <= 14 for size in sizes)
39
+
40
+
41
def test_unknown_preset_raises():
    """An unrecognised preset name is rejected even when there are no segments."""
    no_segments: list = []
    with pytest.raises(ValueError, match="Unknown preset"):
        split_segments(no_segments, preset="invalid")
44
+
45
+
46
def test_word_timings_are_used():
    """Chunk boundaries come from the first and last per-word timestamps."""
    timeline = [("hello", 0.0, 0.5), ("world", 0.5, 1.0), ("today", 1.0, 1.5)]
    timed_words = [
        {"word": token, "start": begin, "end": finish}
        for token, begin, finish in timeline
    ]
    source = _segment("hello world today", words=timed_words, start=0.0, end=1.5)

    chunks = split_segments([source], preset=PRESET_SHORTFORM)

    first, last = chunks[0], chunks[-1]
    assert first["start"] == pytest.approx(0.0)
    assert last["end"] == pytest.approx(1.5)
57
+
58
+
59
def test_speaker_label_is_preserved():
    """The majority speaker of a chunk shows up as a bracketed tag in its text."""
    labelled = [
        ("hello", "SPEAKER_01"),
        ("world", "SPEAKER_01"),
        ("today", "SPEAKER_02"),
    ]
    spoken_words = [{"word": token, "speaker": label} for token, label in labelled]
    source = _segment("hello world today", words=spoken_words, start=0.0, end=3.0)

    chunks = split_segments([source], preset=PRESET_SHORTFORM)

    tagged = [chunk for chunk in chunks if "[SPEAKER_01]" in chunk["text"]]
    assert tagged
69
+
70
+
71
def test_empty_segment_is_ignored():
    """A segment with blank text and no words produces no subtitle entries."""
    blank = dict(start=0.0, end=1.0, text=" ", words=[])
    assert split_segments([blank], preset=PRESET_SHORTFORM) == []
75
+
76
+
77
def test_segment_without_words_splits_by_text():
    """A segment lacking ``words`` keeps all nine text words and the full time span."""
    source = dict(start=0.0, end=9.0, text="one two three four five six seven eight nine")
    chunks = split_segments([source], preset=PRESET_SHORTFORM)

    word_total = sum(len(chunk["text"].split()) for chunk in chunks)
    assert word_total == 9

    first, last = chunks[0], chunks[-1]
    assert first["start"] == pytest.approx(0.0)
    assert last["end"] == pytest.approx(9.0)
File without changes