PyPI - videopython - Versions diffs - 0.33.3__tar.gz → 0.33.5__tar.gz - Mend

videopython 0.33.3__tar.gz → 0.33.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (60) hide show

{videopython-0.33.3 → videopython-0.33.5}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: videopython
-Version: 0.33.3
+Version: 0.33.5
 Summary: Minimal video generation and processing library.
 Project-URL: Homepage, https://videopython.com
 Project-URL: Repository, https://github.com/bartwojtowicz/videopython/

{videopython-0.33.3 → videopython-0.33.5}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "videopython"
-version = "0.33.3"
+version = "0.33.5"
 description = "Minimal video generation and processing library."
 authors = [
     { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },

{videopython-0.33.3 → videopython-0.33.5}/src/videopython/base/transcription.py RENAMED Viewed

@@ -1,11 +1,18 @@
 from __future__ import annotations
-from dataclasses import dataclass
+from dataclasses import dataclass, replace
 from pathlib import Path
 from typing import Any
 __all__ = ["Transcription", "TranscriptionSegment", "TranscriptionWord"]
+# Sentence-ending punctuation used by ``Transcription.capitalize_sentences``.
+_SENTENCE_TERMINATORS = (".", "!", "?", "…")
+# Trailing characters stripped before checking for a sentence terminator
+# (closing quotes/brackets and whitespace), so ``end."`` still ends a sentence.
+_TRAILING_WRAPPERS = "\"')]}»”’ "
 @dataclass
 class TranscriptionWord:
@@ -72,6 +79,38 @@ class TranscriptionSegment:
             compression_ratio=data.get("compression_ratio"),
         )
+    @classmethod
+    def from_words(
+        cls,
+        words: list[TranscriptionWord],
+        *,
+        speaker: str | None = None,
+        avg_logprob: float | None = None,
+        no_speech_prob: float | None = None,
+        compression_ratio: float | None = None,
+    ) -> TranscriptionSegment:
+        """Build a segment spanning ``words``, deriving start/end/text from them.
+        ``words`` must be non-empty: ``start``/``end`` come from the first/last
+        word and ``text`` is the words joined by single spaces. Speaker and the
+        confidence fields are passed through so callers re-segmenting *within* a
+        known source segment can preserve them; callers regrouping words across
+        segments (where these are ambiguous) simply omit them, leaving ``None``.
+        The ``words`` list is copied, so the result never aliases the caller's.
+        """
+        if not words:
+            raise ValueError("from_words requires a non-empty word list")
+        return cls(
+            start=words[0].start,
+            end=words[-1].end,
+            text=" ".join(w.word for w in words),
+            words=list(words),
+            speaker=speaker,
+            avg_logprob=avg_logprob,
+            no_speech_prob=no_speech_prob,
+            compression_ratio=compression_ratio,
+        )
 class Transcription:
     def __init__(
@@ -117,39 +156,19 @@ class Transcription:
             return []
         current_speaker = words[0].speaker
-        current_words = []
-        segment_start = words[0].start
+        current_words: list[TranscriptionWord] = []
         segments = []
         for word in words:
             if current_speaker == word.speaker:
                 current_words.append(word)
             else:
-                segment_text = " ".join(w.word for w in current_words)
-                segments.append(
-                    TranscriptionSegment(
-                        start=segment_start,
-                        end=current_words[-1].end,
-                        text=segment_text.strip(),
-                        words=current_words.copy(),
-                        speaker=current_speaker,
-                    )
-                )
+                segments.append(TranscriptionSegment.from_words(current_words, speaker=current_speaker))
                 current_speaker = word.speaker
                 current_words = [word]
-                segment_start = word.start
         if current_words:
-            segment_text = " ".join(w.word for w in current_words)
-            segments.append(
-                TranscriptionSegment(
-                    start=segment_start,
-                    end=current_words[-1].end,
-                    text=segment_text.strip(),
-                    words=current_words.copy(),
-                    speaker=current_speaker,
-                )
-            )
+            segments.append(TranscriptionSegment.from_words(current_words, speaker=current_speaker))
         return segments
@@ -183,22 +202,14 @@ class Transcription:
         offset_segments = []
         for segment in self.segments:
-            offset_words = []
-            for word in segment.words:
-                offset_words.append(
-                    TranscriptionWord(
-                        start=word.start + time, end=word.end + time, word=word.word, speaker=word.speaker
-                    )
-                )
+            offset_words = [
+                TranscriptionWord(start=w.start + time, end=w.end + time, word=w.word, speaker=w.speaker)
+                for w in segment.words
+            ]
+            # ``replace`` carries text, speaker, and confidence fields through a
+            # pure timing shift unchanged -- only timestamps move.
             offset_segments.append(
-                TranscriptionSegment(
-                    start=segment.start + time,
-                    end=segment.end + time,
-                    text=segment.text,
-                    words=offset_words,
-                    speaker=segment.speaker,
-                )
+                replace(segment, start=segment.start + time, end=segment.end + time, words=offset_words)
             )
         return Transcription(segments=offset_segments, language=self.language)
@@ -238,16 +249,9 @@ class Transcription:
         def _flush(words: list[TranscriptionWord]) -> None:
             if not words:
                 return
-            segment_text = " ".join(w.word for w in words)
-            standardized_segments.append(
-                TranscriptionSegment(
-                    start=words[0].start,
-                    end=words[-1].end,
-                    text=segment_text,
-                    words=words.copy(),
-                    speaker=words[0].speaker,
-                )
-            )
+            # Words here are regrouped across original segments, so the source
+            # segments' confidence fields no longer apply -- left as None.
+            standardized_segments.append(TranscriptionSegment.from_words(words, speaker=words[0].speaker))
         if time is not None:
             current_words: list[TranscriptionWord] = []
@@ -279,6 +283,84 @@ class Transcription:
         return Transcription(segments=standardized_segments, language=self.language)
+    def capitalize_sentences(self) -> Transcription:
+        """Return a new Transcription with sentence-start capitalization.
+        The first letter of the first spoken word and of every word that
+        follows sentence-ending punctuation (``.``, ``!``, ``?``, ``…``) is
+        upper-cased. Remaining characters are left untouched, so acronyms and
+        proper nouns from the source transcription are preserved. Timing,
+        speaker, and language are carried through unchanged.
+        Abbreviation detection is intentionally not attempted: a token like
+        ``"U.S."`` is treated as a sentence end. This heuristic is adequate
+        for burned-in subtitles and avoids a brittle abbreviation list.
+        """
+        capitalized_segments: list[TranscriptionSegment] = []
+        start_of_sentence = True
+        for segment in self.segments:
+            new_words: list[TranscriptionWord] = []
+            for word in segment.words:
+                token = word.word
+                if start_of_sentence:
+                    idx = next((i for i, ch in enumerate(token) if ch.isalpha()), None)
+                    if idx is not None:
+                        token = token[:idx] + token[idx].upper() + token[idx + 1 :]
+                        start_of_sentence = False
+                if token.rstrip(_TRAILING_WRAPPERS).endswith(_SENTENCE_TERMINATORS):
+                    start_of_sentence = True
+                new_words.append(TranscriptionWord(start=word.start, end=word.end, word=token, speaker=word.speaker))
+            # Casing-only rewrite: segment boundaries, speaker, and confidence
+            # are unchanged; only the tokens (and joined text) differ.
+            capitalized_segments.append(replace(segment, text=" ".join(w.word for w in new_words), words=new_words))
+        return Transcription(segments=capitalized_segments, language=self.language)
+    def chunk_segments(self, max_words: int) -> Transcription:
+        """Return a new Transcription splitting each segment into smaller cues.
+        Each segment is split into consecutive groups of at most ``max_words``
+        words, using that group's own first/last word timings. Unlike
+        :meth:`standardize_segments`, words are never merged across the
+        original segments, so silence gaps between segments are preserved and
+        subtitles do not linger over pauses. Speaker, confidence, and language
+        metadata are carried through unchanged.
+        Args:
+            max_words: Maximum number of words per output segment.
+        Raises:
+            ValueError: If ``max_words`` is not positive.
+        """
+        if max_words <= 0:
+            raise ValueError("max_words must be positive")
+        chunked_segments: list[TranscriptionSegment] = []
+        for segment in self.segments:
+            words = segment.words
+            if not words:
+                # Nothing to split; emit a fresh copy so the result never
+                # aliases the source segment.
+                chunked_segments.append(replace(segment, words=list(segment.words)))
+                continue
+            for i in range(0, len(words), max_words):
+                group = words[i : i + max_words]
+                # Splitting *within* one source segment -- its confidence
+                # fields still apply, so carry them through.
+                chunked_segments.append(
+                    TranscriptionSegment.from_words(
+                        group,
+                        speaker=segment.speaker,
+                        avg_logprob=segment.avg_logprob,
+                        no_speech_prob=segment.no_speech_prob,
+                        compression_ratio=segment.compression_ratio,
+                    )
+                )
+        return Transcription(segments=chunked_segments, language=self.language)
     def slice(self, start: float, end: float) -> Transcription | None:
         """Return a new Transcription containing only words within the time range.
@@ -316,34 +398,17 @@ class Transcription:
             if word.speaker == current_speaker:
                 current_words.append(word)
             else:
-                # Finish current segment
+                # Finish current segment (speaker is ambiguous across the
+                # original segments these words came from -- confidence omitted)
                 if current_words:
-                    segment_text = " ".join(w.word for w in current_words)
-                    sliced_segments.append(
-                        TranscriptionSegment(
-                            start=current_words[0].start,
-                            end=current_words[-1].end,
-                            text=segment_text,
-                            words=current_words.copy(),
-                            speaker=current_speaker,
-                        )
-                    )
+                    sliced_segments.append(TranscriptionSegment.from_words(current_words, speaker=current_speaker))
                 # Start new segment
                 current_speaker = word.speaker
                 current_words = [word]
         # Add final segment
         if current_words:
-            segment_text = " ".join(w.word for w in current_words)
-            sliced_segments.append(
-                TranscriptionSegment(
-                    start=current_words[0].start,
-                    end=current_words[-1].end,
-                    text=segment_text,
-                    words=current_words.copy(),
-                    speaker=current_speaker,
-                )
-            )
+            sliced_segments.append(TranscriptionSegment.from_words(current_words, speaker=current_speaker))
         return Transcription(segments=sliced_segments, language=self.language)

{videopython-0.33.3 → videopython-0.33.5}/src/videopython/editing/transcription_overlay.py RENAMED Viewed

@@ -78,6 +78,24 @@ class TranscriptionOverlay(Effect):
     highlight_bold_font: str | None = Field(
         None, description="Path to a bold .ttf font for the highlighted word, or None to use the regular font."
     )
+    max_words_per_cue: int | None = Field(
+        5,
+        ge=1,
+        description=(
+            "Maximum words shown on screen at once. Each transcription segment is re-chunked into "
+            "cues of at most this many words, without bridging the silence gaps between segments, so "
+            "subtitles stay readable and don't linger over pauses. None preserves the source "
+            "transcription's segmentation."
+        ),
+    )
+    capitalize: bool = Field(
+        True,
+        description=(
+            "Capitalize the first letter of each sentence (first word, and words after '.', '!', '?'). "
+            "Fixes lowercase sentence starts from word-level speech-to-text. Set False to render text "
+            "exactly as transcribed."
+        ),
+    )
     _overlay_cache: dict[tuple[str, int | None], np.ndarray] = PrivateAttr(default_factory=dict)
@@ -140,6 +158,11 @@ class TranscriptionOverlay(Effect):
                 "Pass it via VideoEdit.run(context={'transcription': ...}) or directly to apply()."
             )
+        if self.max_words_per_cue is not None:
+            transcription = transcription.chunk_segments(self.max_words_per_cue)
+        if self.capitalize:
+            transcription = transcription.capitalize_sentences()
         logger.info("Applying transcription overlay...")
         new_frames = []
         for frame_index, frame in enumerate(tqdm(video.frames, desc="Transcription overlay")):

{videopython-0.33.3 → videopython-0.33.5}/src/videopython/editing/video_edit.py RENAMED Viewed

@@ -24,7 +24,7 @@ import subprocess
 import tempfile
 import warnings
 from pathlib import Path
-from typing import Annotated, Any
+from typing import Annotated, Any, Protocol, runtime_checkable
 from pydantic import BaseModel, BeforeValidator, ConfigDict, Field, SerializeAsAny, model_validator
@@ -65,6 +65,72 @@ def _resolve_operation(value: Any) -> Operation:
 OperationInput = Annotated[SerializeAsAny[Operation], BeforeValidator(_resolve_operation)]
+@runtime_checkable
+class SegmentRebaseable(Protocol):
+    """A runtime-context value carrying a source-absolute timeline.
+    Any context entry implementing both ``slice(start, end)`` and
+    ``offset(delta)`` -- e.g. :class:`videopython.base.transcription.Transcription`
+    -- is automatically re-based onto each segment's 0-based local timeline by
+    the runner, with no per-type wiring. Keying off structure rather than a
+    concrete class keeps the context mechanism generic for future time-based
+    context (beat maps, scene markers, ...) and avoids a layering dependency
+    from the editing layer onto every such type.
+    """
+    def slice(self, start: float, end: float) -> SegmentRebaseable | None: ...
+    def offset(self, delta: float) -> SegmentRebaseable: ...
+def _rebaseable_keys(context: dict[str, Any] | None) -> set[str]:
+    """Context keys whose value carries a re-baseable source-absolute timeline."""
+    if not context:
+        return set()
+    return {k for k, v in context.items() if isinstance(v, SegmentRebaseable)}
+def _segment_context(
+    context: dict[str, Any] | None,
+    start: float,
+    end: float,
+) -> dict[str, Any] | None:
+    """Re-base time-based context entries onto a cut segment's local timeline.
+    A cut segment is decoded 0-based -- its first frame is ``t=0`` -- but
+    context values may carry source-absolute timestamps. Every value
+    implementing :class:`SegmentRebaseable` (e.g. a ``Transcription``) is
+    sliced to ``[start, end)`` and shifted by ``-start`` so segment operations
+    (``add_subtitles``, ``silence_removal``) see segment-local time. Without
+    this, subtitles on a segment cut from the middle of a video render blank.
+    Values that don't implement the protocol pass through untouched.
+    Slicing always runs (even for ``start == 0``) so out-of-range entries do
+    not bleed in. When ``slice`` yields nothing the key is dropped rather than
+    passed empty, so the consuming operation raises its own clear "requires
+    ..." error instead of silently doing nothing.
+    Scope: per-segment only. ``post_operations`` run on the assembled,
+    concatenated timeline; re-basing time-based context across a multi-segment
+    concat is unsupported and rejected up front by
+    :meth:`VideoEdit._assert_post_ops_supported` (single-segment plans are
+    unaffected).
+    """
+    if not context:
+        return context
+    rebaseable = {k: v for k, v in context.items() if isinstance(v, SegmentRebaseable)}
+    if not rebaseable:
+        return context
+    rebased = dict(context)
+    for key, value in rebaseable.items():
+        sliced = value.slice(start, end)
+        if sliced is None:
+            del rebased[key]
+        else:
+            rebased[key] = sliced.offset(-start)
+    return rebased
 def _apply_with_context(op: Operation, video: Video, context: dict[str, Any] | None) -> Video:
     """Apply ``op`` to ``video``, threading ``op.requires`` keys from ``context``."""
     if op.requires and context:
@@ -139,9 +205,14 @@ class SegmentConfig(BaseModel):
         )
     def process(self, video: Video, context: dict[str, Any] | None = None) -> Video:
-        """Apply every operation in this segment to ``video`` in order."""
+        """Apply every operation in this segment to ``video`` in order.
+        Time-based context (e.g. ``transcription``) is re-based onto this
+        segment's 0-based local timeline before any operation sees it.
+        """
+        seg_context = _segment_context(context, self.start, self.end)
         for op in self.operations:
-            video = _apply_with_context(op, video, context)
+            video = _apply_with_context(op, video, seg_context)
         return video
@@ -288,11 +359,38 @@ class VideoEdit(BaseModel):
                 metas.append(source_metadata[key])
         return self._validate(metas, context)
+    def _assert_post_ops_supported(self, context: dict[str, Any] | None) -> None:
+        """Reject post_operations needing time-based context on a multi-segment plan.
+        ``post_operations`` run on the assembled, concatenated timeline. A
+        source-absolute context value (e.g. a ``Transcription``) cannot be
+        re-based across a multi-segment concat, and passing the raw value would
+        silently mis-time the op (subtitles/silence-removal against the wrong
+        timeline). Fail fast with an actionable message instead of producing a
+        wrong render. Single-segment plans are unaffected -- their concatenated
+        timeline is just the one segment's, handled by ``_segment_context``.
+        """
+        if len(self.segments) <= 1 or not self.post_operations:
+            return
+        rebaseable = _rebaseable_keys(context)
+        if not rebaseable:
+            return
+        for op in self.post_operations:
+            clash = sorted(set(op.requires) & rebaseable)
+            if clash:
+                raise ValueError(
+                    f"post_operation '{op.op}' requires time-based context {clash}, but the plan "
+                    f"has {len(self.segments)} segments. post_operations run on the concatenated "
+                    "timeline and time-based context is not re-based across a multi-segment concat. "
+                    f"Move '{op.op}' into a segment, or use a single-segment plan."
+                )
     def _validate(
         self,
         source_metas: list[VideoMetadata],
         context: dict[str, Any] | None,
     ) -> VideoMetadata:
+        self._assert_post_ops_supported(context)
         cut_metas: list[VideoMetadata] = []
         for i, (seg, meta) in enumerate(zip(self.segments, source_metas)):
             if seg.end > meta.total_seconds + 1e-3:
@@ -325,10 +423,11 @@ class VideoEdit(BaseModel):
         meta: VideoMetadata,
         context: dict[str, Any] | None,
     ) -> VideoMetadata:
+        seg_context = _segment_context(context, segment.start, segment.end)
         for op in segment.operations:
             _validate_effect_window(op, meta.total_seconds)
             try:
-                meta = _predict_with_context(op, meta, context)
+                meta = _predict_with_context(op, meta, seg_context)
             except (ValueError, TypeError) as e:
                 raise ValueError(f"Segment {index}: metadata prediction failed for '{op.op}': {e}") from e
         return meta
@@ -367,6 +466,7 @@ class VideoEdit(BaseModel):
     def run(self, context: dict[str, Any] | None = None) -> Video:
         """Execute the plan in memory and return the final ``Video``."""
+        self._assert_post_ops_supported(context)
         target_fps, target_w, target_h = self._matching_targets_from_disk()
         videos = [
             segment.process(segment.load(fps=target_fps, width=target_w, height=target_h), context)
@@ -393,6 +493,7 @@ class VideoEdit(BaseModel):
         isn't streamable. Memory usage is O(1) w.r.t. video length for fully
         streamable pipelines.
         """
+        self._assert_post_ops_supported(context)
         output_path = Path(output_path).with_suffix(f".{format}")
         output_path.parent.mkdir(parents=True, exist_ok=True)
@@ -412,6 +513,11 @@ class VideoEdit(BaseModel):
             plan = plans[0]
             total_frames = round((plan.end_second - plan.start_second) * plan.output_fps)
             for op in self.post_operations:
+                if op.requires:
+                    # Same reason as the per-segment guard: no runtime context
+                    # in the streaming path. (Multi-segment + requires already
+                    # raised by _assert_post_ops_supported.)
+                    return self._run_to_file_eager(output_path, format, preset, crf, context)
                 if not isinstance(op, Effect) or not op.streamable:
                     return self._run_to_file_eager(output_path, format, preset, crf, context)
                 start_f, end_f = _effect_frame_range(op, plan.output_fps, total_frames)
@@ -477,6 +583,12 @@ class VideoEdit(BaseModel):
         effect_schedule: list[EffectScheduleEntry] = []
         for op in segment.operations:
+            if op.requires:
+                # Streaming schedules effects by frame range with no runtime
+                # context, so it can't supply -- let alone re-base onto the
+                # segment's local timeline -- anything an op `requires`. Defer
+                # to the eager path, where _segment_context handles re-basing.
+                return None
             if isinstance(op, Effect):
                 if not op.streamable:
                     return None