PyPI - videopython - Versions diffs - 0.33.4__tar.gz → 0.34.0__tar.gz - Mend

videopython 0.33.4__tar.gz → 0.34.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (61) hide show

{videopython-0.33.4 → videopython-0.34.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: videopython
-Version: 0.33.4
+Version: 0.34.0
 Summary: Minimal video generation and processing library.
 Project-URL: Homepage, https://videopython.com
 Project-URL: Repository, https://github.com/bartwojtowicz/videopython/

{videopython-0.33.4 → videopython-0.34.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "videopython"
-version = "0.33.4"
+version = "0.34.0"
 description = "Minimal video generation and processing library."
 authors = [
     { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },

{videopython-0.33.4 → videopython-0.34.0}/src/videopython/ai/transforms.py RENAMED Viewed

@@ -12,7 +12,7 @@ from tqdm import tqdm
 from videopython.ai.understanding.faces import FaceTracker
 from videopython.base._dimensions import floor_to_even
-from videopython.base.video import Video
+from videopython.base.video import Video, VideoMetadata
 from videopython.editing.operation import OpCategory, Operation
 logger = logging.getLogger(__name__)
@@ -76,6 +76,28 @@ class FaceTrackingCrop(Operation):
         # "dynamic" — placeholder until motion/look-direction framing is implemented.
         return (face_cx, face_cy - self.headroom)
+    def _resolved_output_dims(self, w: int, h: int) -> tuple[int, int]:
+        """Output ``(width, height)`` after the crop + resize.
+        Every frame is resized to this size regardless of the per-frame face
+        position, so it is a pure function of the input dimensions and
+        ``target_aspect``. Single source of truth shared by :meth:`apply` and
+        :meth:`predict_metadata` (mirrors ``Resize._resolve_dims`` /
+        ``Crop._resolve_box``), so the dry-run cannot disagree with the render.
+        """
+        target_ratio = self.target_aspect[0] / self.target_aspect[1]
+        if target_ratio < w / h:
+            out_h = floor_to_even(h)
+            out_w = floor_to_even(int(out_h * target_ratio))
+        else:
+            out_w = floor_to_even(w)
+            out_h = floor_to_even(int(out_w / target_ratio))
+        return out_w, out_h
+    def predict_metadata(self, meta: VideoMetadata) -> VideoMetadata:
+        out_w, out_h = self._resolved_output_dims(meta.width, meta.height)
+        return meta.with_dimensions(out_w, out_h)
     def _clamp_speed(self, current: tuple[float, float], target: tuple[float, float]) -> tuple[float, float]:
         if self.max_speed is None:
             return target
@@ -135,13 +157,7 @@ class FaceTrackingCrop(Operation):
         )
         h, w = video.frame_shape[:2]
-        target_ratio = self.target_aspect[0] / self.target_aspect[1]
-        if target_ratio < w / h:
-            out_h = floor_to_even(h)
-            out_w = floor_to_even(int(out_h * target_ratio))
-        else:
-            out_w = floor_to_even(w)
-            out_h = floor_to_even(int(out_w / target_ratio))
+        out_w, out_h = self._resolved_output_dims(w, h)
         default_x = (w - out_w) // 2
         default_y = (h - out_h) // 2

{videopython-0.33.4 → videopython-0.34.0}/src/videopython/base/image_text.py RENAMED Viewed

@@ -9,6 +9,7 @@ generation helpers (``ai/understanding/image.py``).
 from __future__ import annotations
+from dataclasses import dataclass
 from enum import Enum
 from typing import TypeAlias
@@ -18,7 +19,7 @@ from PIL import Image, ImageDraw, ImageFont
 from videopython.base.exceptions import OutOfBoundsError
 from videopython.base.fonts import load_font
-__all__ = ["ImageText", "TextAlign", "AnchorPoint"]
+__all__ = ["ImageText", "TextAlign", "AnchorPoint", "TextBoxRect"]
 # Type aliases for clarity
 MarginType: TypeAlias = int | tuple[int, int, int, int]
@@ -79,6 +80,32 @@ class AnchorPoint(str, Enum):
         return (cls.BOTTOM_LEFT, cls.BOTTOM_CENTER, cls.BOTTOM_RIGHT)
+@dataclass(frozen=True)
+class TextBoxRect:
+    """Resolved geometry of a wrapped text box, without rendering it.
+    Returned by :meth:`ImageText.measure_text_box` — the single source of
+    truth for box measure/wrap/anchor/bounds, shared by the renderer
+    (:meth:`ImageText.write_text_box`) and dry-run validators so they can
+    never disagree on whether text fits.
+    For a non-degenerate box ``(x, y)`` is the anchor-adjusted top-left
+    corner and ``width``/``height`` span the wrapped lines. For a degenerate
+    box (whitespace-only text → no renderable lines) ``height == 0``,
+    ``(x, y)`` is the *unadjusted* insertion point, and ``fits`` is ``True``;
+    callers short-circuit such boxes (nothing to draw). ``width`` mirrors the
+    resolved ``box_width`` and may be a float when an absolute >1 value was
+    passed, matching legacy behaviour.
+    """
+    x: float
+    y: float
+    width: int | float
+    height: int
+    fits: bool
+    lines: tuple[str, ...]
 class ImageText:
     def __init__(
         self,
@@ -566,6 +593,97 @@ class ImageText:
         lines = [" ".join(line) for line in split_lines]
         return lines
+    def available_region(self, margin: MarginType = 0) -> tuple[int, int, int, int]:
+        """The drawable area inside ``margin`` as ``(left, top, width, height)``.
+        Single source of truth for margin-inset geometry: used by
+        :meth:`measure_text_box` and by callers that need to clamp a box
+        within the margins without re-deriving the margin math.
+        """
+        margin_top, margin_right, margin_bottom, margin_left = self._process_margin(margin)
+        available_width = self.image_size[1] - margin_left - margin_right
+        available_height = self.image_size[0] - margin_top - margin_bottom
+        return margin_left, margin_top, available_width, available_height
+    def measure_text_box(
+        self,
+        text: str,
+        font_filename: str | None,
+        xy: PositionType,
+        box_width: int | float | None = None,
+        font_size: int = 11,
+        anchor: AnchorPoint = AnchorPoint.TOP_LEFT,
+        margin: MarginType = 0,
+    ) -> TextBoxRect:
+        """Measure where a wrapped text box would land, without drawing it.
+        Pure: resolves margins/box-width/position, wraps the text, applies the
+        anchor, and bounds-checks against the image — the exact math
+        :meth:`write_text_box` used to do inline. Highlighting and per-line
+        alignment (``place``) do not change the box envelope, so they are not
+        parameters here; this intentionally preserves the pre-existing
+        behaviour that an enlarged highlighted word is *not* accounted for in
+        the fit check.
+        Returns:
+            A :class:`TextBoxRect`. ``fits`` is ``False`` when the box would
+            fall outside the image bounds (the condition that makes
+            :meth:`write_text_box` raise :class:`OutOfBoundsError`).
+        Raises:
+            ValueError: If ``text`` is empty, ``font_size`` is not positive,
+                or an absolute ``box_width`` is not positive.
+        """
+        if not text:
+            raise ValueError("Text cannot be empty")
+        if font_size <= 0:
+            raise ValueError("Font size must be positive")
+        # Process margins to determine available area (shared with callers
+        # that clamp boxes inside the margins -- see ``available_region``).
+        margin_left, margin_top, available_width, available_height = self.available_region(margin)
+        # Handle relative box width
+        if box_width is None:
+            box_width = available_width
+        elif isinstance(box_width, float) and 0 < box_width <= 1:
+            box_width = int(available_width * box_width)
+        elif isinstance(box_width, int) and box_width <= 0:
+            raise ValueError("Box width must be positive")
+        # Calculate initial position based on margin and anchor before splitting text
+        x_pos, y_pos = self._convert_position(xy, margin_top, margin_left, available_width, available_height)
+        # Split text into lines that fit within box_width
+        lines = self._split_lines_by_width(text, font_filename, font_size, int(box_width))
+        # Calculate total height of all lines
+        lines_height = sum(self.get_text_dimensions(font_filename, font_size, line)[1] for line in lines)
+        if lines_height == 0:
+            # No renderable lines (e.g. whitespace-only text); position is the
+            # unadjusted insertion point and the box trivially "fits".
+            return TextBoxRect(x=x_pos, y=y_pos, width=box_width, height=0, fits=True, lines=tuple(lines))
+        # Final position calculation based on anchor point
+        if anchor in AnchorPoint.center_anchors():
+            x_pos -= box_width // 2
+        elif anchor in AnchorPoint.right_anchors():
+            x_pos -= box_width
+        if anchor in AnchorPoint.middle_anchors():
+            y_pos -= lines_height // 2
+        elif anchor in AnchorPoint.bottom_anchors():
+            y_pos -= lines_height
+        fits = not (
+            x_pos < 0
+            or y_pos < 0
+            or x_pos + box_width > self.image_size[1]
+            or y_pos + lines_height > self.image_size[0]
+        )
+        return TextBoxRect(x=x_pos, y=y_pos, width=box_width, height=lines_height, fits=fits, lines=tuple(lines))
     def write_text_box(
         self,
         text: str,
@@ -643,49 +761,24 @@ class ImageText:
         if highlight_word_index is not None and highlight_color is None:
             highlight_color = text_color
-        # Process margins to determine available area
-        margin_top, margin_right, margin_bottom, margin_left = self._process_margin(margin)
-        available_width = self.image_size[1] - margin_left - margin_right
-        available_height = self.image_size[0] - margin_top - margin_bottom
-        # Handle relative box width
-        if box_width is None:
-            box_width = available_width
-        elif isinstance(box_width, float) and 0 < box_width <= 1:
-            box_width = int(available_width * box_width)
-        elif isinstance(box_width, int) and box_width <= 0:
-            raise ValueError("Box width must be positive")
-        # Calculate initial position based on margin and anchor before splitting text
-        x_pos, y_pos = self._convert_position(xy, margin_top, margin_left, available_width, available_height)
-        # Split text into lines that fit within box_width
-        lines = self._split_lines_by_width(text, font_filename, font_size, int(box_width))
-        # Calculate total height of all lines
-        lines_height = sum([self.get_text_dimensions(font_filename, font_size, line)[1] for line in lines])
-        if lines_height == 0:
-            # If we have no valid lines or zero height, return the position
-            return (int(x_pos), int(y_pos))
-        # Final position calculation based on anchor point
-        if anchor in AnchorPoint.center_anchors():
-            x_pos -= box_width // 2
-        elif anchor in AnchorPoint.right_anchors():
-            x_pos -= box_width
-        if anchor in AnchorPoint.middle_anchors():
-            y_pos -= lines_height // 2
-        elif anchor in AnchorPoint.bottom_anchors():
-            y_pos -= lines_height
-        # Verify box will fit within bounds
-        if (
-            x_pos < 0
-            or y_pos < 0
-            or x_pos + box_width > self.image_size[1]
-            or y_pos + lines_height > self.image_size[0]
-        ):
+        # Measure (single source of truth for box geometry), then render.
+        rect = self.measure_text_box(
+            text=text,
+            font_filename=font_filename,
+            xy=xy,
+            box_width=box_width,
+            font_size=font_size,
+            anchor=anchor,
+            margin=margin,
+        )
+        lines = list(rect.lines)
+        if rect.height == 0:
+            # No renderable lines (e.g. whitespace-only text); nothing to draw.
+            return (int(rect.x), int(rect.y))
+        box_width = rect.width
+        x_pos, y_pos = rect.x, rect.y
+        lines_height = rect.height
+        if not rect.fits:
             raise OutOfBoundsError(
                 f"Text box with size ({box_width}x{lines_height}) at position ({x_pos}, {y_pos}) is out of bounds!"
             )

{videopython-0.33.4 → videopython-0.34.0}/src/videopython/base/transcription.py RENAMED Viewed

@@ -1,6 +1,6 @@
 from __future__ import annotations
-from dataclasses import dataclass
+from dataclasses import dataclass, replace
 from pathlib import Path
 from typing import Any
@@ -79,6 +79,38 @@ class TranscriptionSegment:
             compression_ratio=data.get("compression_ratio"),
         )
+    @classmethod
+    def from_words(
+        cls,
+        words: list[TranscriptionWord],
+        *,
+        speaker: str | None = None,
+        avg_logprob: float | None = None,
+        no_speech_prob: float | None = None,
+        compression_ratio: float | None = None,
+    ) -> TranscriptionSegment:
+        """Build a segment spanning ``words``, deriving start/end/text from them.
+        ``words`` must be non-empty: ``start``/``end`` come from the first/last
+        word and ``text`` is the words joined by single spaces. Speaker and the
+        confidence fields are passed through so callers re-segmenting *within* a
+        known source segment can preserve them; callers regrouping words across
+        segments (where these are ambiguous) simply omit them, leaving ``None``.
+        The ``words`` list is copied, so the result never aliases the caller's.
+        """
+        if not words:
+            raise ValueError("from_words requires a non-empty word list")
+        return cls(
+            start=words[0].start,
+            end=words[-1].end,
+            text=" ".join(w.word for w in words),
+            words=list(words),
+            speaker=speaker,
+            avg_logprob=avg_logprob,
+            no_speech_prob=no_speech_prob,
+            compression_ratio=compression_ratio,
+        )
 class Transcription:
     def __init__(
@@ -124,39 +156,19 @@ class Transcription:
             return []
         current_speaker = words[0].speaker
-        current_words = []
-        segment_start = words[0].start
+        current_words: list[TranscriptionWord] = []
         segments = []
         for word in words:
             if current_speaker == word.speaker:
                 current_words.append(word)
             else:
-                segment_text = " ".join(w.word for w in current_words)
-                segments.append(
-                    TranscriptionSegment(
-                        start=segment_start,
-                        end=current_words[-1].end,
-                        text=segment_text.strip(),
-                        words=current_words.copy(),
-                        speaker=current_speaker,
-                    )
-                )
+                segments.append(TranscriptionSegment.from_words(current_words, speaker=current_speaker))
                 current_speaker = word.speaker
                 current_words = [word]
-                segment_start = word.start
         if current_words:
-            segment_text = " ".join(w.word for w in current_words)
-            segments.append(
-                TranscriptionSegment(
-                    start=segment_start,
-                    end=current_words[-1].end,
-                    text=segment_text.strip(),
-                    words=current_words.copy(),
-                    speaker=current_speaker,
-                )
-            )
+            segments.append(TranscriptionSegment.from_words(current_words, speaker=current_speaker))
         return segments
@@ -190,22 +202,14 @@ class Transcription:
         offset_segments = []
         for segment in self.segments:
-            offset_words = []
-            for word in segment.words:
-                offset_words.append(
-                    TranscriptionWord(
-                        start=word.start + time, end=word.end + time, word=word.word, speaker=word.speaker
-                    )
-                )
+            offset_words = [
+                TranscriptionWord(start=w.start + time, end=w.end + time, word=w.word, speaker=w.speaker)
+                for w in segment.words
+            ]
+            # ``replace`` carries text, speaker, and confidence fields through a
+            # pure timing shift unchanged -- only timestamps move.
             offset_segments.append(
-                TranscriptionSegment(
-                    start=segment.start + time,
-                    end=segment.end + time,
-                    text=segment.text,
-                    words=offset_words,
-                    speaker=segment.speaker,
-                )
+                replace(segment, start=segment.start + time, end=segment.end + time, words=offset_words)
             )
         return Transcription(segments=offset_segments, language=self.language)
@@ -245,16 +249,9 @@ class Transcription:
         def _flush(words: list[TranscriptionWord]) -> None:
             if not words:
                 return
-            segment_text = " ".join(w.word for w in words)
-            standardized_segments.append(
-                TranscriptionSegment(
-                    start=words[0].start,
-                    end=words[-1].end,
-                    text=segment_text,
-                    words=words.copy(),
-                    speaker=words[0].speaker,
-                )
-            )
+            # Words here are regrouped across original segments, so the source
+            # segments' confidence fields no longer apply -- left as None.
+            standardized_segments.append(TranscriptionSegment.from_words(words, speaker=words[0].speaker))
         if time is not None:
             current_words: list[TranscriptionWord] = []
@@ -315,18 +312,9 @@ class Transcription:
                     start_of_sentence = True
                 new_words.append(TranscriptionWord(start=word.start, end=word.end, word=token, speaker=word.speaker))
-            capitalized_segments.append(
-                TranscriptionSegment(
-                    start=segment.start,
-                    end=segment.end,
-                    text=" ".join(w.word for w in new_words),
-                    words=new_words,
-                    speaker=segment.speaker,
-                    avg_logprob=segment.avg_logprob,
-                    no_speech_prob=segment.no_speech_prob,
-                    compression_ratio=segment.compression_ratio,
-                )
-            )
+            # Casing-only rewrite: segment boundaries, speaker, and confidence
+            # are unchanged; only the tokens (and joined text) differ.
+            capitalized_segments.append(replace(segment, text=" ".join(w.word for w in new_words), words=new_words))
         return Transcription(segments=capitalized_segments, language=self.language)
@@ -353,16 +341,17 @@ class Transcription:
         for segment in self.segments:
             words = segment.words
             if not words:
-                chunked_segments.append(segment)
+                # Nothing to split; emit a fresh copy so the result never
+                # aliases the source segment.
+                chunked_segments.append(replace(segment, words=list(segment.words)))
                 continue
             for i in range(0, len(words), max_words):
                 group = words[i : i + max_words]
+                # Splitting *within* one source segment -- its confidence
+                # fields still apply, so carry them through.
                 chunked_segments.append(
-                    TranscriptionSegment(
-                        start=group[0].start,
-                        end=group[-1].end,
-                        text=" ".join(w.word for w in group),
-                        words=list(group),
+                    TranscriptionSegment.from_words(
+                        group,
                         speaker=segment.speaker,
                         avg_logprob=segment.avg_logprob,
                         no_speech_prob=segment.no_speech_prob,
@@ -409,34 +398,17 @@ class Transcription:
             if word.speaker == current_speaker:
                 current_words.append(word)
             else:
-                # Finish current segment
+                # Finish current segment (speaker is ambiguous across the
+                # original segments these words came from -- confidence omitted)
                 if current_words:
-                    segment_text = " ".join(w.word for w in current_words)
-                    sliced_segments.append(
-                        TranscriptionSegment(
-                            start=current_words[0].start,
-                            end=current_words[-1].end,
-                            text=segment_text,
-                            words=current_words.copy(),
-                            speaker=current_speaker,
-                        )
-                    )
+                    sliced_segments.append(TranscriptionSegment.from_words(current_words, speaker=current_speaker))
                 # Start new segment
                 current_speaker = word.speaker
                 current_words = [word]
         # Add final segment
         if current_words:
-            segment_text = " ".join(w.word for w in current_words)
-            sliced_segments.append(
-                TranscriptionSegment(
-                    start=current_words[0].start,
-                    end=current_words[-1].end,
-                    text=segment_text,
-                    words=current_words.copy(),
-                    speaker=current_speaker,
-                )
-            )
+            sliced_segments.append(TranscriptionSegment.from_words(current_words, speaker=current_speaker))
         return Transcription(segments=sliced_segments, language=self.language)

{videopython-0.33.4 → videopython-0.34.0}/src/videopython/editing/__init__.py RENAMED Viewed

@@ -21,7 +21,7 @@ from .effects import (
     Zoom,
 )
 from .operation import FilterCtx, OpCategory, Operation, TimeRange
-from .transcription_overlay import TranscriptionOverlay
+from .transcription_overlay import SubtitleRegion, SubtitleStyle, TranscriptionOverlay
 from .transforms import (
     Crop,
     CropMode,
@@ -65,6 +65,8 @@ __all__ = [
     "VolumeAdjust",
     "TextOverlay",
     "TranscriptionOverlay",
+    "SubtitleStyle",
+    "SubtitleRegion",
     "Shake",
     "PunchIn",
     "Flash",

videopython 0.33.4__tar.gz → 0.34.0__tar.gz

videopython 0.33.4__tar.gz → 0.34.0__tar.gz