PyPI - subtitle-engine - Versions diffs - 0.1.2__tar.gz → 0.1.3__tar.gz - Mend

subtitle-engine 0.1.2 (tar.gz) → 0.1.3 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

{subtitle_engine-0.1.2/src/subtitle_engine.egg-info → subtitle_engine-0.1.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: subtitle-engine
-Version: 0.1.2
+Version: 0.1.3
 Summary: Generate SRT subtitles from audio/video files using WhisperX
 Author: Leevi Puntanen
 License-Expression: MIT
@@ -65,7 +65,13 @@ subeng video.mp4 --device cpu
 subeng video.mp4 --diarize --hf-token $HF_TOKEN
 # Generate a caption from the transcript using Ollama
-subeng video.mp4 --caption --ollama-model qwen3.5:0.8b
+subeng video.mp4 --caption --ollama-model qwen3.5:0.6b
+# Short-form subtitles (2-5 words per line, default)
+subeng video.mp4 --preset shortform
+# Long-form subtitles (10-14 words per line)
+subeng video.mp4 --preset longform
 ```
 ## Options
@@ -83,6 +89,7 @@ subeng video.mp4 --caption --ollama-model qwen3.5:0.8b
 | `--caption` | Generate a caption from the transcript via Ollama |
 | `--ollama-model` | Ollama model name (required with `--caption`) |
 | `--ollama-host` | Ollama API host (default: `http://localhost:11434`) |
+| `--preset`, `-p` | Subtitle style: `shortform` (2-5 words, default) or `longform` (10-14 words) |
 ## Development

{subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/README.md RENAMED Viewed

@@ -39,7 +39,13 @@ subeng video.mp4 --device cpu
 subeng video.mp4 --diarize --hf-token $HF_TOKEN
 # Generate a caption from the transcript using Ollama
-subeng video.mp4 --caption --ollama-model qwen3.5:0.8b
+subeng video.mp4 --caption --ollama-model qwen3.5:0.6b
+# Short-form subtitles (2-5 words per line, default)
+subeng video.mp4 --preset shortform
+# Long-form subtitles (10-14 words per line)
+subeng video.mp4 --preset longform
 ```
 ## Options
@@ -57,6 +63,7 @@ subeng video.mp4 --caption --ollama-model qwen3.5:0.8b
 | `--caption` | Generate a caption from the transcript via Ollama |
 | `--ollama-model` | Ollama model name (required with `--caption`) |
 | `--ollama-host` | Ollama API host (default: `http://localhost:11434`) |
+| `--preset`, `-p` | Subtitle style: `shortform` (2-5 words, default) or `longform` (10-14 words) |
 ## Development

{subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "subtitle-engine"
-version = "0.1.2"
+version = "0.1.3"
 description = "Generate SRT subtitles from audio/video files using WhisperX"
 readme = "README.md"
 license = "MIT"

{subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/src/subtitle_engine/__init__.py RENAMED Viewed

@@ -1,3 +1,3 @@
 """Subtitle Engine — generate SRT files with WhisperX."""
-__version__ = "0.1.1"
+__version__ = "0.1.3"

{subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/src/subtitle_engine/cli.py RENAMED Viewed

@@ -9,6 +9,7 @@ from rich.console import Console
 from subtitle_engine import __version__
 from subtitle_engine.captioner import generate_caption
+from subtitle_engine.segmenter import VALID_PRESETS, split_segments
 from subtitle_engine.srt_writer import write_srt
 from subtitle_engine.transcriber import transcribe
 from subtitle_engine.updater import UpdateCheckError, check_for_update, update_package
@@ -162,6 +163,14 @@ def main(
             envvar="OLLAMA_HOST",
         ),
     ] = "http://localhost:11434",
+    preset: Annotated[
+        str,
+        typer.Option(
+            "--preset",
+            "-p",
+            help="Subtitle style: shortform (2-5 words) or longform (10-14 words).",
+        ),
+    ] = "shortform",
     quiet: Annotated[
         bool,
         typer.Option(
@@ -190,6 +199,10 @@ def main(
 ) -> None:
     """Generate SRT subtitles from a media file."""
     try:
+        if preset not in VALID_PRESETS:
+            valid = ", ".join(sorted(VALID_PRESETS))
+            raise ValueError(f"Unknown preset '{preset}'. Choose from: {valid}")
         validate_media_file(input_file)
         output_path = resolve_output_path(input_file, output)
@@ -208,6 +221,7 @@ def main(
         if not quiet:
             console.print(f"[bold]Transcribing:[/bold] {input_file}")
             console.print(f"[bold]Model:[/bold] {model}")
+            console.print(f"[bold]Preset:[/bold] {preset}")
             if language:
                 console.print(f"[bold]Language:[/bold] {language}")
             if device:
@@ -225,6 +239,7 @@ def main(
             verbose=verbose,
         )
+        segments = split_segments(segments, preset=preset)
         write_srt(segments, output_path)
         if not quiet:
             console.print(f"[green]Wrote subtitles to:[/green] {output_path}")

subtitle_engine-0.1.3/src/subtitle_engine/segmenter.py ADDED Viewed

@@ -0,0 +1,165 @@
+"""Split WhisperX segments into shorter or longer subtitle chunks."""
+from __future__ import annotations
+from collections import Counter
+from typing import Iterable
+# Names of the supported subtitle presets.
+PRESET_SHORTFORM = "shortform"
+PRESET_LONGFORM = "longform"
+# Word-count targets per subtitle block, stored as (min_words, max_words).
+PRESET_TARGETS = {
+    PRESET_SHORTFORM: (2, 5),
+    PRESET_LONGFORM: (10, 14),
+}
+# Every preset name that has a word-count target.
+VALID_PRESETS = set(PRESET_TARGETS)
+def _sanitize_text(text: str) -> str:
+    """Collapse each run of whitespace in ``text`` to one space and trim both ends."""
+    tokens = text.split()
+    return " ".join(tokens)
+def _words_from_segment(segment: dict) -> list[dict]:
+    """Extract a clean list of word dicts from a WhisperX segment.
+    Reads ``segment["words"]``; a missing or ``None`` value counts as empty.
+    Dict entries keep all of their keys (for example ``start``/``end``/``speaker``)
+    with ``word`` replaced by its whitespace-stripped text. Any other entry is
+    converted with ``str()`` and wrapped as ``{"word": text}``. Entries whose
+    text is empty after stripping are dropped.
+    """
+    words: list[dict] = []
+    for entry in segment.get("words") or []:
+        if isinstance(entry, dict):
+            # ``or ""`` also covers an explicit ``"word": None``.
+            word_text = str(entry.get("word") or "").strip()
+            # Spread the entry first so the stripped text wins over the raw value.
+            cleaned = {**entry, "word": word_text}
+        else:
+            word_text = str(entry).strip()
+            cleaned = {"word": word_text}
+        if word_text:
+            words.append(cleaned)
+    return words
+def _split_text_evenly(text: str, chunk_count: int) -> list[str]:
+    """Split ``text`` into ``chunk_count`` word groups of near-equal size.
+    Earlier groups receive the extra word when the words do not divide evenly.
+    The text is returned unchanged as a single-item list when ``chunk_count``
+    is 1 or less, or when there are no more words than requested chunks.
+    """
+    tokens = text.split()
+    if chunk_count <= 1 or len(tokens) <= chunk_count:
+        return [text]
+    quotient, extra = divmod(len(tokens), chunk_count)
+    # The first ``extra`` groups take one additional word each.
+    sizes = [quotient + 1] * extra + [quotient] * (chunk_count - extra)
+    groups = []
+    cursor = 0
+    for size in sizes:
+        groups.append(" ".join(tokens[cursor : cursor + size]))
+        cursor += size
+    return groups
+def _dominant_speaker(words: list[dict]) -> str | None:
+    """Return the speaker label that occurs most often in ``words``.
+    Words with a missing or empty ``speaker`` value are not counted; ``None``
+    is returned when no word carries a label.
+    """
+    tally = Counter(label for label in (w.get("speaker") for w in words) if label)
+    if not tally:
+        return None
+    (label, _count), = tally.most_common(1)
+    return label
+def _prefix_speaker(text: str, speaker: str | None) -> str:
+    """Return ``text`` with a ``[speaker]`` tag in front, or unchanged without a speaker."""
+    return f"[{speaker}] {text}" if speaker else text
+def _split_segment(
+    segment: dict,
+    min_words: int,
+    max_words: int,
+) -> list[dict]:
+    """Split a single WhisperX segment into subtitle-sized chunks.
+    Words are grouped into runs of ``max_words``. A trailing run shorter than
+    ``min_words`` is combined with the run before it; the combined run is split
+    into two near-equal halves when each half still has at least ``min_words``
+    words, and is otherwise kept as one block (which then exceeds ``max_words``).
+    A chunk starts at its first timed word and ends at its last timed word.
+    A chunk with no timed words gets the slice of the segment's duration that
+    matches its word positions within the segment.
+    A segment with no usable words is returned as one block holding its
+    cleaned ``text``, or dropped when that text is empty.
+    Returns a list of ``{"start", "end", "text"}`` dicts; ``text`` carries a
+    ``[speaker]`` prefix when the chunk's words have speaker labels.
+    """
+    words = _words_from_segment(segment)
+    segment_start = float(segment.get("start", 0.0))
+    segment_end = float(segment.get("end", segment_start))
+    if not words:
+        cleaned = _sanitize_text(str(segment.get("text", "")))
+        if cleaned:
+            return [{"start": segment_start, "end": segment_end, "text": cleaned}]
+        return []
+    # Cut the word list into runs of ``max_words`` (at least one word per run).
+    step = max(1, max_words)
+    chunks = [words[i : i + step] for i in range(0, len(words), step)]
+    # Rebalance a short trailing run instead of gluing it onto a full one.
+    if len(chunks) > 1 and len(chunks[-1]) < min(step, min_words):
+        merged = chunks[-2] + chunks[-1]
+        tail_size = len(merged) // 2
+        if tail_size >= min_words:
+            chunks[-2:] = [merged[:-tail_size], merged[-tail_size:]]
+        else:
+            # Too few words for two valid blocks; keep one oversized block.
+            chunks[-2:] = [merged]
+    total_words = len(words)
+    duration = segment_end - segment_start
+    result = []
+    words_before = 0
+    for chunk in chunks:
+        # Track positions by running count; looking the chunk up by value
+        # would return the first *equal* chunk and repeat its time slot.
+        chunk_offset = words_before
+        words_before += len(chunk)
+        text = _sanitize_text(" ".join(w["word"] for w in chunk))
+        if not text:
+            continue
+        timed_words = [
+            w for w in chunk if w.get("start") is not None and w.get("end") is not None
+        ]
+        if timed_words:
+            start = float(timed_words[0]["start"])
+            end = float(timed_words[-1]["end"])
+        else:
+            # Fallback: share the segment duration in proportion to word counts.
+            start = segment_start + duration * chunk_offset / total_words
+            end = segment_start + duration * words_before / total_words
+        text = _prefix_speaker(text, _dominant_speaker(chunk))
+        result.append({"start": start, "end": end, "text": text})
+    return result
+def split_segments(
+    segments: Iterable[dict],
+    preset: str = PRESET_SHORTFORM,
+) -> list[dict]:
+    """Re-chunk WhisperX segments to the word-count range of ``preset``.
+    Parameters
+    ----------
+    segments:
+        WhisperX segments with ``start``, ``end``, ``text`` and optionally
+        per-word timings.
+    preset:
+        ``shortform`` or ``longform``.
+    Returns
+    -------
+    A flat list of segment dicts, in input order, suitable for writing to SRT.
+    Raises
+    ------
+    ValueError
+        If ``preset`` is not one of the valid preset names.
+    """
+    if preset not in VALID_PRESETS:
+        choices = ", ".join(sorted(VALID_PRESETS))
+        raise ValueError(f"Unknown preset '{preset}'. Choose from: {choices}")
+    lower, upper = PRESET_TARGETS[preset]
+    return [
+        piece
+        for segment in segments
+        for piece in _split_segment(segment, lower, upper)
+    ]

{subtitle_engine-0.1.2 → subtitle_engine-0.1.3/src/subtitle_engine.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: subtitle-engine
-Version: 0.1.2
+Version: 0.1.3
 Summary: Generate SRT subtitles from audio/video files using WhisperX
 Author: Leevi Puntanen
 License-Expression: MIT
@@ -65,7 +65,13 @@ subeng video.mp4 --device cpu
 subeng video.mp4 --diarize --hf-token $HF_TOKEN
 # Generate a caption from the transcript using Ollama
-subeng video.mp4 --caption --ollama-model qwen3.5:0.8b
+subeng video.mp4 --caption --ollama-model qwen3.5:0.6b
+# Short-form subtitles (2-5 words per line, default)
+subeng video.mp4 --preset shortform
+# Long-form subtitles (10-14 words per line)
+subeng video.mp4 --preset longform
 ```
 ## Options
@@ -83,6 +89,7 @@ subeng video.mp4 --caption --ollama-model qwen3.5:0.8b
 | `--caption` | Generate a caption from the transcript via Ollama |
 | `--ollama-model` | Ollama model name (required with `--caption`) |
 | `--ollama-host` | Ollama API host (default: `http://localhost:11434`) |
+| `--preset`, `-p` | Subtitle style: `shortform` (2-5 words, default) or `longform` (10-14 words) |
 ## Development

{subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/src/subtitle_engine.egg-info/SOURCES.txt RENAMED Viewed

@@ -4,6 +4,7 @@ pyproject.toml
 src/subtitle_engine/__init__.py
 src/subtitle_engine/captioner.py
 src/subtitle_engine/cli.py
+src/subtitle_engine/segmenter.py
 src/subtitle_engine/srt_writer.py
 src/subtitle_engine/transcriber.py
 src/subtitle_engine/updater.py
@@ -16,6 +17,7 @@ src/subtitle_engine.egg-info/requires.txt
 src/subtitle_engine.egg-info/top_level.txt
 tests/test_captioner.py
 tests/test_cli.py
+tests/test_segmenter.py
 tests/test_srt_writer.py
 tests/test_transcriber.py
 tests/test_updater.py

{subtitle_engine-0.1.2 → subtitle_engine-0.1.3}/tests/test_cli.py RENAMED Viewed

@@ -67,7 +67,7 @@ def test_cli_version_long():
     result = runner.invoke(app, ["--version"])
     assert result.exit_code == 0
     assert "subeng" in result.output
-    assert "0.1.1" in result.output
+    assert "0.1.2" in result.output
 def test_cli_version_short():
@@ -79,7 +79,7 @@ def test_cli_version_short():
 def test_cli_version_no_extra_output():
     result = runner.invoke(app, ["--version"])
     assert result.exit_code == 0
-    assert result.output.strip() == "subeng 0.1.1"
+    assert result.output.strip() == "subeng 0.1.2"
 def test_cli_quiet_hides_status_but_keeps_errors(tmp_path: Path):
@@ -145,3 +145,28 @@ def test_main_entry_runs_typer_app_for_transcription():
         with patch.object(sys, "argv", ["subeng", "video.mp4"]):
             main_entry()
         mock_app.assert_called_once()
+def test_cli_preset_shortform_accepted(tmp_path: Path):
+    """``--preset shortform`` gets far enough for the preset line to be printed."""
+    media = tmp_path / "video.mp4"
+    media.write_bytes(b"fake")
+    args = [str(media), "--preset", "shortform"]
+    result = runner.invoke(app, args)
+    # The run still exits non-zero; presumably transcription fails because
+    # the media file is only placeholder bytes.
+    assert result.exit_code != 0
+    assert "Preset: shortform" in result.output
+def test_cli_preset_longform_accepted(tmp_path: Path):
+    """``--preset longform`` gets far enough for the preset line to be printed."""
+    media = tmp_path / "video.mp4"
+    media.write_bytes(b"fake")
+    args = [str(media), "--preset", "longform"]
+    result = runner.invoke(app, args)
+    # A non-zero exit is still expected with the placeholder media file.
+    assert result.exit_code != 0
+    assert "Preset: longform" in result.output
+def test_cli_invalid_preset_rejected(tmp_path: Path):
+    """An unsupported ``--preset`` value exits non-zero and reports the problem."""
+    media = tmp_path / "video.mp4"
+    media.write_bytes(b"fake")
+    args = [str(media), "--preset", "invalid"]
+    result = runner.invoke(app, args)
+    assert result.exit_code != 0
+    assert "Unknown preset" in result.output

subtitle_engine-0.1.3/tests/test_segmenter.py ADDED Viewed

@@ -0,0 +1,84 @@
+"""Tests for the segmenter/preset splitter."""
+import pytest
+from subtitle_engine.segmenter import (
+    PRESET_LONGFORM,
+    PRESET_SHORTFORM,
+    split_segments,
+)
+def _segment(text: str, words: list[dict] | None = None, start: float = 0.0, end: float = 1.0) -> dict:
+    """Build a WhisperX-style segment dict for the tests.
+    When ``words`` is omitted, one untimed ``{"word": ...}`` entry is created
+    per whitespace-separated token of ``text``.
+    """
+    if words is None:
+        words = [{"word": token} for token in text.split()]
+    return {"start": start, "end": end, "text": text, "words": words}
+def test_shortform_splits_to_small_chunks():
+    """Eight untimed words under ``shortform`` yield at least two blocks of 1-5 words."""
+    blocks = split_segments(
+        [_segment("one two three four five six seven eight", start=0.0, end=8.0)],
+        preset=PRESET_SHORTFORM,
+    )
+    assert len(blocks) >= 2
+    sizes = [len(block["text"].split()) for block in blocks]
+    assert all(1 <= size <= 5 for size in sizes)
+def test_longform_allows_larger_chunks():
+    """Twenty-five untimed words under ``longform`` yield blocks of at most 14 words."""
+    text = " ".join(str(number) for number in range(25))
+    blocks = split_segments([_segment(text, start=0.0, end=25.0)], preset=PRESET_LONGFORM)
+    assert len(blocks) >= 1
+    sizes = [len(block["text"].split()) for block in blocks]
+    assert all(1 <= size <= 14 for size in sizes)
+def test_unknown_preset_raises():
+    """A preset name outside the supported set is rejected with ``ValueError``."""
+    no_segments = []
+    with pytest.raises(ValueError, match="Unknown preset"):
+        split_segments(no_segments, preset="invalid")
+def test_word_timings_are_used():
+    """Output blocks span from the first word's start to the last word's end."""
+    timings = [("hello", 0.0, 0.5), ("world", 0.5, 1.0), ("today", 1.0, 1.5)]
+    words = [{"word": word, "start": start, "end": end} for word, start, end in timings]
+    segment = _segment("hello world today", words=words, start=0.0, end=1.5)
+    blocks = split_segments([segment], preset=PRESET_SHORTFORM)
+    first, last = blocks[0], blocks[-1]
+    assert first["start"] == pytest.approx(0.0)
+    assert last["end"] == pytest.approx(1.5)
+def test_speaker_label_is_preserved():
+    """The majority speaker of the words appears as a ``[label]`` tag in the output."""
+    labelled = [("hello", "SPEAKER_01"), ("world", "SPEAKER_01"), ("today", "SPEAKER_02")]
+    words = [{"word": word, "speaker": speaker} for word, speaker in labelled]
+    segment = _segment("hello world today", words=words, start=0.0, end=3.0)
+    blocks = split_segments([segment], preset=PRESET_SHORTFORM)
+    texts = [block["text"] for block in blocks]
+    assert any("[SPEAKER_01]" in text for text in texts)
+def test_empty_segment_is_ignored():
+    """A segment with whitespace-only text and no words produces no output."""
+    blank = {"start": 0.0, "end": 1.0, "text": "   ", "words": []}
+    assert split_segments([blank], preset=PRESET_SHORTFORM) == []
+def test_segment_without_words_splits_by_text():
+    """A segment with no ``words`` key keeps all nine words and its full time span.
+    The assertions hold whether the text comes back as one block or several.
+    """
+    segment = {"start": 0.0, "end": 9.0, "text": "one two three four five six seven eight nine"}
+    blocks = split_segments([segment], preset=PRESET_SHORTFORM)
+    word_total = sum(len(block["text"].split()) for block in blocks)
+    assert word_total == 9
+    first, last = blocks[0], blocks[-1]
+    assert first["start"] == pytest.approx(0.0)
+    assert last["end"] == pytest.approx(9.0)