PyPI - videopython - Versions diffs - 0.33.3__tar.gz → 0.33.4__tar.gz - Mend

videopython 0.33.3.tar.gz → 0.33.4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (60)

{videopython-0.33.3 → videopython-0.33.4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: videopython
-Version: 0.33.3
+Version: 0.33.4
 Summary: Minimal video generation and processing library.
 Project-URL: Homepage, https://videopython.com
 Project-URL: Repository, https://github.com/bartwojtowicz/videopython/

{videopython-0.33.3 → videopython-0.33.4}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "videopython"
-version = "0.33.3"
+version = "0.33.4"
 description = "Minimal video generation and processing library."
 authors = [
     { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },

{videopython-0.33.3 → videopython-0.33.4}/src/videopython/base/transcription.py RENAMED Viewed

@@ -6,6 +6,13 @@ from typing import Any
 __all__ = ["Transcription", "TranscriptionSegment", "TranscriptionWord"]
+# Sentence-ending punctuation used by ``Transcription.capitalize_sentences``.
+_SENTENCE_TERMINATORS = (".", "!", "?", "…")
+# Trailing characters stripped before checking for a sentence terminator
+# (closing quotes/brackets and whitespace), so ``end."`` still ends a sentence.
+_TRAILING_WRAPPERS = "\"')]}»”’ "
 @dataclass
 class TranscriptionWord:
@@ -279,6 +286,92 @@ class Transcription:
         return Transcription(segments=standardized_segments, language=self.language)
+    def capitalize_sentences(self) -> Transcription:
+        """Return a new Transcription with sentence-start capitalization.
+        The first letter of the first spoken word and of every word that
+        follows sentence-ending punctuation (``.``, ``!``, ``?``, ``…``) is
+        upper-cased. Remaining characters are left untouched, so acronyms and
+        proper nouns from the source transcription are preserved. Timing,
+        speaker, and language are carried through unchanged.
+        Abbreviation detection is intentionally not attempted: a token like
+        ``"U.S."`` is treated as a sentence end. This heuristic is adequate
+        for burned-in subtitles and avoids a brittle abbreviation list.
+        """
+        capitalized_segments: list[TranscriptionSegment] = []
+        start_of_sentence = True
+        for segment in self.segments:
+            new_words: list[TranscriptionWord] = []
+            for word in segment.words:
+                token = word.word
+                if start_of_sentence:
+                    idx = next((i for i, ch in enumerate(token) if ch.isalpha()), None)
+                    if idx is not None:
+                        token = token[:idx] + token[idx].upper() + token[idx + 1 :]
+                        start_of_sentence = False
+                if token.rstrip(_TRAILING_WRAPPERS).endswith(_SENTENCE_TERMINATORS):
+                    start_of_sentence = True
+                new_words.append(TranscriptionWord(start=word.start, end=word.end, word=token, speaker=word.speaker))
+            capitalized_segments.append(
+                TranscriptionSegment(
+                    start=segment.start,
+                    end=segment.end,
+                    text=" ".join(w.word for w in new_words),
+                    words=new_words,
+                    speaker=segment.speaker,
+                    avg_logprob=segment.avg_logprob,
+                    no_speech_prob=segment.no_speech_prob,
+                    compression_ratio=segment.compression_ratio,
+                )
+            )
+        return Transcription(segments=capitalized_segments, language=self.language)
+    def chunk_segments(self, max_words: int) -> Transcription:
+        """Return a new Transcription splitting each segment into smaller cues.
+        Each segment is split into consecutive groups of at most ``max_words``
+        words, using that group's own first/last word timings. Unlike
+        :meth:`standardize_segments`, words are never merged across the
+        original segments, so silence gaps between segments are preserved and
+        subtitles do not linger over pauses. Speaker, confidence, and language
+        metadata are carried through unchanged.
+        Args:
+            max_words: Maximum number of words per output segment.
+        Raises:
+            ValueError: If ``max_words`` is not positive.
+        """
+        if max_words <= 0:
+            raise ValueError("max_words must be positive")
+        chunked_segments: list[TranscriptionSegment] = []
+        for segment in self.segments:
+            words = segment.words
+            if not words:
+                chunked_segments.append(segment)
+                continue
+            for i in range(0, len(words), max_words):
+                group = words[i : i + max_words]
+                chunked_segments.append(
+                    TranscriptionSegment(
+                        start=group[0].start,
+                        end=group[-1].end,
+                        text=" ".join(w.word for w in group),
+                        words=list(group),
+                        speaker=segment.speaker,
+                        avg_logprob=segment.avg_logprob,
+                        no_speech_prob=segment.no_speech_prob,
+                        compression_ratio=segment.compression_ratio,
+                    )
+                )
+        return Transcription(segments=chunked_segments, language=self.language)
     def slice(self, start: float, end: float) -> Transcription | None:
         """Return a new Transcription containing only words within the time range.

{videopython-0.33.3 → videopython-0.33.4}/src/videopython/editing/transcription_overlay.py RENAMED Viewed

@@ -78,6 +78,24 @@ class TranscriptionOverlay(Effect):
     highlight_bold_font: str | None = Field(
         None, description="Path to a bold .ttf font for the highlighted word, or None to use the regular font."
     )
+    max_words_per_cue: int | None = Field(
+        5,
+        ge=1,
+        description=(
+            "Maximum words shown on screen at once. Each transcription segment is re-chunked into "
+            "cues of at most this many words, without bridging the silence gaps between segments, so "
+            "subtitles stay readable and don't linger over pauses. None preserves the source "
+            "transcription's segmentation."
+        ),
+    )
+    capitalize: bool = Field(
+        True,
+        description=(
+            "Capitalize the first letter of each sentence (first word, and words after '.', '!', '?'). "
+            "Fixes lowercase sentence starts from word-level speech-to-text. Set False to render text "
+            "exactly as transcribed."
+        ),
+    )
     _overlay_cache: dict[tuple[str, int | None], np.ndarray] = PrivateAttr(default_factory=dict)
@@ -140,6 +158,11 @@ class TranscriptionOverlay(Effect):
                 "Pass it via VideoEdit.run(context={'transcription': ...}) or directly to apply()."
             )
+        if self.max_words_per_cue is not None:
+            transcription = transcription.chunk_segments(self.max_words_per_cue)
+        if self.capitalize:
+            transcription = transcription.capitalize_sentences()
         logger.info("Applying transcription overlay...")
         new_frames = []
         for frame_index, frame in enumerate(tqdm(video.frames, desc="Transcription overlay")):