videopython 0.33.5__tar.gz → 0.34.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {videopython-0.33.5 → videopython-0.34.0}/PKG-INFO +1 -1
- {videopython-0.33.5 → videopython-0.34.0}/pyproject.toml +1 -1
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/transforms.py +24 -8
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/base/image_text.py +137 -44
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/editing/__init__.py +3 -1
- videopython-0.34.0/src/videopython/editing/transcription_overlay.py +516 -0
- videopython-0.33.5/src/videopython/editing/transcription_overlay.py +0 -186
- {videopython-0.33.5 → videopython-0.34.0}/.gitignore +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/LICENSE +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/README.md +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/__init__.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/__init__.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/_device.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/dubbing/__init__.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/dubbing/config.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/dubbing/dubber.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/dubbing/expressiveness.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/dubbing/loudness.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/dubbing/models.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/dubbing/pipeline.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/dubbing/quality.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/dubbing/remux.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/dubbing/timing.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/dubbing/voice_sample.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/generation/__init__.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/generation/audio.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/generation/image.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/generation/qwen3.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/generation/translation.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/generation/video.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/understanding/__init__.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/understanding/audio.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/understanding/faces.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/understanding/image.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/understanding/separation.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/understanding/temporal.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/video_analysis/__init__.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/video_analysis/analyzer.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/video_analysis/models.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/video_analysis/sampling.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/video_analysis/stages.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/audio/__init__.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/audio/analysis.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/audio/audio.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/base/__init__.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/base/_dimensions.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/base/_ffmpeg.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/base/_video_io.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/base/description.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/base/exceptions.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/base/fonts/DejaVuSans.ttf +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/base/fonts/LICENSE_DEJAVU +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/base/fonts/__init__.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/base/transcription.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/base/video.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/editing/effects.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/editing/operation.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/editing/streaming.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/editing/transforms.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/editing/video_edit.py +0 -0
- {videopython-0.33.5 → videopython-0.34.0}/src/videopython/py.typed +0 -0
|
@@ -12,7 +12,7 @@ from tqdm import tqdm
|
|
|
12
12
|
|
|
13
13
|
from videopython.ai.understanding.faces import FaceTracker
|
|
14
14
|
from videopython.base._dimensions import floor_to_even
|
|
15
|
-
from videopython.base.video import Video
|
|
15
|
+
from videopython.base.video import Video, VideoMetadata
|
|
16
16
|
from videopython.editing.operation import OpCategory, Operation
|
|
17
17
|
|
|
18
18
|
logger = logging.getLogger(__name__)
|
|
@@ -76,6 +76,28 @@ class FaceTrackingCrop(Operation):
|
|
|
76
76
|
# "dynamic" — placeholder until motion/look-direction framing is implemented.
|
|
77
77
|
return (face_cx, face_cy - self.headroom)
|
|
78
78
|
|
|
79
|
+
def _resolved_output_dims(self, w: int, h: int) -> tuple[int, int]:
|
|
80
|
+
"""Output ``(width, height)`` after the crop + resize.
|
|
81
|
+
|
|
82
|
+
Every frame is resized to this size regardless of the per-frame face
|
|
83
|
+
position, so it is a pure function of the input dimensions and
|
|
84
|
+
``target_aspect``. Single source of truth shared by :meth:`apply` and
|
|
85
|
+
:meth:`predict_metadata` (mirrors ``Resize._resolve_dims`` /
|
|
86
|
+
``Crop._resolve_box``), so the dry-run cannot disagree with the render.
|
|
87
|
+
"""
|
|
88
|
+
target_ratio = self.target_aspect[0] / self.target_aspect[1]
|
|
89
|
+
if target_ratio < w / h:
|
|
90
|
+
out_h = floor_to_even(h)
|
|
91
|
+
out_w = floor_to_even(int(out_h * target_ratio))
|
|
92
|
+
else:
|
|
93
|
+
out_w = floor_to_even(w)
|
|
94
|
+
out_h = floor_to_even(int(out_w / target_ratio))
|
|
95
|
+
return out_w, out_h
|
|
96
|
+
|
|
97
|
+
def predict_metadata(self, meta: VideoMetadata) -> VideoMetadata:
|
|
98
|
+
out_w, out_h = self._resolved_output_dims(meta.width, meta.height)
|
|
99
|
+
return meta.with_dimensions(out_w, out_h)
|
|
100
|
+
|
|
79
101
|
def _clamp_speed(self, current: tuple[float, float], target: tuple[float, float]) -> tuple[float, float]:
|
|
80
102
|
if self.max_speed is None:
|
|
81
103
|
return target
|
|
@@ -135,13 +157,7 @@ class FaceTrackingCrop(Operation):
|
|
|
135
157
|
)
|
|
136
158
|
|
|
137
159
|
h, w = video.frame_shape[:2]
|
|
138
|
-
|
|
139
|
-
if target_ratio < w / h:
|
|
140
|
-
out_h = floor_to_even(h)
|
|
141
|
-
out_w = floor_to_even(int(out_h * target_ratio))
|
|
142
|
-
else:
|
|
143
|
-
out_w = floor_to_even(w)
|
|
144
|
-
out_h = floor_to_even(int(out_w / target_ratio))
|
|
160
|
+
out_w, out_h = self._resolved_output_dims(w, h)
|
|
145
161
|
|
|
146
162
|
default_x = (w - out_w) // 2
|
|
147
163
|
default_y = (h - out_h) // 2
|
|
@@ -9,6 +9,7 @@ generation helpers (``ai/understanding/image.py``).
|
|
|
9
9
|
|
|
10
10
|
from __future__ import annotations
|
|
11
11
|
|
|
12
|
+
from dataclasses import dataclass
|
|
12
13
|
from enum import Enum
|
|
13
14
|
from typing import TypeAlias
|
|
14
15
|
|
|
@@ -18,7 +19,7 @@ from PIL import Image, ImageDraw, ImageFont
|
|
|
18
19
|
from videopython.base.exceptions import OutOfBoundsError
|
|
19
20
|
from videopython.base.fonts import load_font
|
|
20
21
|
|
|
21
|
-
__all__ = ["ImageText", "TextAlign", "AnchorPoint"]
|
|
22
|
+
__all__ = ["ImageText", "TextAlign", "AnchorPoint", "TextBoxRect"]
|
|
22
23
|
|
|
23
24
|
# Type aliases for clarity
|
|
24
25
|
MarginType: TypeAlias = int | tuple[int, int, int, int]
|
|
@@ -79,6 +80,32 @@ class AnchorPoint(str, Enum):
|
|
|
79
80
|
return (cls.BOTTOM_LEFT, cls.BOTTOM_CENTER, cls.BOTTOM_RIGHT)
|
|
80
81
|
|
|
81
82
|
|
|
83
|
+
@dataclass(frozen=True)
|
|
84
|
+
class TextBoxRect:
|
|
85
|
+
"""Resolved geometry of a wrapped text box, without rendering it.
|
|
86
|
+
|
|
87
|
+
Returned by :meth:`ImageText.measure_text_box` — the single source of
|
|
88
|
+
truth for box measure/wrap/anchor/bounds, shared by the renderer
|
|
89
|
+
(:meth:`ImageText.write_text_box`) and dry-run validators so they can
|
|
90
|
+
never disagree on whether text fits.
|
|
91
|
+
|
|
92
|
+
For a non-degenerate box ``(x, y)`` is the anchor-adjusted top-left
|
|
93
|
+
corner and ``width``/``height`` span the wrapped lines. For a degenerate
|
|
94
|
+
box (whitespace-only text → no renderable lines) ``height == 0``,
|
|
95
|
+
``(x, y)`` is the *unadjusted* insertion point, and ``fits`` is ``True``;
|
|
96
|
+
callers short-circuit such boxes (nothing to draw). ``width`` mirrors the
|
|
97
|
+
resolved ``box_width`` and may be a float when an absolute >1 value was
|
|
98
|
+
passed, matching legacy behaviour.
|
|
99
|
+
"""
|
|
100
|
+
|
|
101
|
+
x: float
|
|
102
|
+
y: float
|
|
103
|
+
width: int | float
|
|
104
|
+
height: int
|
|
105
|
+
fits: bool
|
|
106
|
+
lines: tuple[str, ...]
|
|
107
|
+
|
|
108
|
+
|
|
82
109
|
class ImageText:
|
|
83
110
|
def __init__(
|
|
84
111
|
self,
|
|
@@ -566,6 +593,97 @@ class ImageText:
|
|
|
566
593
|
lines = [" ".join(line) for line in split_lines]
|
|
567
594
|
return lines
|
|
568
595
|
|
|
596
|
+
def available_region(self, margin: MarginType = 0) -> tuple[int, int, int, int]:
|
|
597
|
+
"""The drawable area inside ``margin`` as ``(left, top, width, height)``.
|
|
598
|
+
|
|
599
|
+
Single source of truth for margin-inset geometry: used by
|
|
600
|
+
:meth:`measure_text_box` and by callers that need to clamp a box
|
|
601
|
+
within the margins without re-deriving the margin math.
|
|
602
|
+
"""
|
|
603
|
+
margin_top, margin_right, margin_bottom, margin_left = self._process_margin(margin)
|
|
604
|
+
available_width = self.image_size[1] - margin_left - margin_right
|
|
605
|
+
available_height = self.image_size[0] - margin_top - margin_bottom
|
|
606
|
+
return margin_left, margin_top, available_width, available_height
|
|
607
|
+
|
|
608
|
+
def measure_text_box(
|
|
609
|
+
self,
|
|
610
|
+
text: str,
|
|
611
|
+
font_filename: str | None,
|
|
612
|
+
xy: PositionType,
|
|
613
|
+
box_width: int | float | None = None,
|
|
614
|
+
font_size: int = 11,
|
|
615
|
+
anchor: AnchorPoint = AnchorPoint.TOP_LEFT,
|
|
616
|
+
margin: MarginType = 0,
|
|
617
|
+
) -> TextBoxRect:
|
|
618
|
+
"""Measure where a wrapped text box would land, without drawing it.
|
|
619
|
+
|
|
620
|
+
Pure: resolves margins/box-width/position, wraps the text, applies the
|
|
621
|
+
anchor, and bounds-checks against the image — the exact math
|
|
622
|
+
:meth:`write_text_box` used to do inline. Highlighting and per-line
|
|
623
|
+
alignment (``place``) do not change the box envelope, so they are not
|
|
624
|
+
parameters here; this intentionally preserves the pre-existing
|
|
625
|
+
behaviour that an enlarged highlighted word is *not* accounted for in
|
|
626
|
+
the fit check.
|
|
627
|
+
|
|
628
|
+
Returns:
|
|
629
|
+
A :class:`TextBoxRect`. ``fits`` is ``False`` when the box would
|
|
630
|
+
fall outside the image bounds (the condition that makes
|
|
631
|
+
:meth:`write_text_box` raise :class:`OutOfBoundsError`).
|
|
632
|
+
|
|
633
|
+
Raises:
|
|
634
|
+
ValueError: If ``text`` is empty, ``font_size`` is not positive,
|
|
635
|
+
or an absolute ``box_width`` is not positive.
|
|
636
|
+
"""
|
|
637
|
+
if not text:
|
|
638
|
+
raise ValueError("Text cannot be empty")
|
|
639
|
+
|
|
640
|
+
if font_size <= 0:
|
|
641
|
+
raise ValueError("Font size must be positive")
|
|
642
|
+
|
|
643
|
+
# Process margins to determine available area (shared with callers
|
|
644
|
+
# that clamp boxes inside the margins -- see ``available_region``).
|
|
645
|
+
margin_left, margin_top, available_width, available_height = self.available_region(margin)
|
|
646
|
+
|
|
647
|
+
# Handle relative box width
|
|
648
|
+
if box_width is None:
|
|
649
|
+
box_width = available_width
|
|
650
|
+
elif isinstance(box_width, float) and 0 < box_width <= 1:
|
|
651
|
+
box_width = int(available_width * box_width)
|
|
652
|
+
elif isinstance(box_width, int) and box_width <= 0:
|
|
653
|
+
raise ValueError("Box width must be positive")
|
|
654
|
+
|
|
655
|
+
# Calculate initial position based on margin and anchor before splitting text
|
|
656
|
+
x_pos, y_pos = self._convert_position(xy, margin_top, margin_left, available_width, available_height)
|
|
657
|
+
|
|
658
|
+
# Split text into lines that fit within box_width
|
|
659
|
+
lines = self._split_lines_by_width(text, font_filename, font_size, int(box_width))
|
|
660
|
+
|
|
661
|
+
# Calculate total height of all lines
|
|
662
|
+
lines_height = sum(self.get_text_dimensions(font_filename, font_size, line)[1] for line in lines)
|
|
663
|
+
if lines_height == 0:
|
|
664
|
+
# No renderable lines (e.g. whitespace-only text); position is the
|
|
665
|
+
# unadjusted insertion point and the box trivially "fits".
|
|
666
|
+
return TextBoxRect(x=x_pos, y=y_pos, width=box_width, height=0, fits=True, lines=tuple(lines))
|
|
667
|
+
|
|
668
|
+
# Final position calculation based on anchor point
|
|
669
|
+
if anchor in AnchorPoint.center_anchors():
|
|
670
|
+
x_pos -= box_width // 2
|
|
671
|
+
elif anchor in AnchorPoint.right_anchors():
|
|
672
|
+
x_pos -= box_width
|
|
673
|
+
|
|
674
|
+
if anchor in AnchorPoint.middle_anchors():
|
|
675
|
+
y_pos -= lines_height // 2
|
|
676
|
+
elif anchor in AnchorPoint.bottom_anchors():
|
|
677
|
+
y_pos -= lines_height
|
|
678
|
+
|
|
679
|
+
fits = not (
|
|
680
|
+
x_pos < 0
|
|
681
|
+
or y_pos < 0
|
|
682
|
+
or x_pos + box_width > self.image_size[1]
|
|
683
|
+
or y_pos + lines_height > self.image_size[0]
|
|
684
|
+
)
|
|
685
|
+
return TextBoxRect(x=x_pos, y=y_pos, width=box_width, height=lines_height, fits=fits, lines=tuple(lines))
|
|
686
|
+
|
|
569
687
|
def write_text_box(
|
|
570
688
|
self,
|
|
571
689
|
text: str,
|
|
@@ -643,49 +761,24 @@ class ImageText:
|
|
|
643
761
|
if highlight_word_index is not None and highlight_color is None:
|
|
644
762
|
highlight_color = text_color
|
|
645
763
|
|
|
646
|
-
#
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
# Calculate total height of all lines
|
|
666
|
-
lines_height = sum([self.get_text_dimensions(font_filename, font_size, line)[1] for line in lines])
|
|
667
|
-
if lines_height == 0:
|
|
668
|
-
# If we have no valid lines or zero height, return the position
|
|
669
|
-
return (int(x_pos), int(y_pos))
|
|
670
|
-
|
|
671
|
-
# Final position calculation based on anchor point
|
|
672
|
-
if anchor in AnchorPoint.center_anchors():
|
|
673
|
-
x_pos -= box_width // 2
|
|
674
|
-
elif anchor in AnchorPoint.right_anchors():
|
|
675
|
-
x_pos -= box_width
|
|
676
|
-
|
|
677
|
-
if anchor in AnchorPoint.middle_anchors():
|
|
678
|
-
y_pos -= lines_height // 2
|
|
679
|
-
elif anchor in AnchorPoint.bottom_anchors():
|
|
680
|
-
y_pos -= lines_height
|
|
681
|
-
|
|
682
|
-
# Verify box will fit within bounds
|
|
683
|
-
if (
|
|
684
|
-
x_pos < 0
|
|
685
|
-
or y_pos < 0
|
|
686
|
-
or x_pos + box_width > self.image_size[1]
|
|
687
|
-
or y_pos + lines_height > self.image_size[0]
|
|
688
|
-
):
|
|
764
|
+
# Measure (single source of truth for box geometry), then render.
|
|
765
|
+
rect = self.measure_text_box(
|
|
766
|
+
text=text,
|
|
767
|
+
font_filename=font_filename,
|
|
768
|
+
xy=xy,
|
|
769
|
+
box_width=box_width,
|
|
770
|
+
font_size=font_size,
|
|
771
|
+
anchor=anchor,
|
|
772
|
+
margin=margin,
|
|
773
|
+
)
|
|
774
|
+
lines = list(rect.lines)
|
|
775
|
+
if rect.height == 0:
|
|
776
|
+
# No renderable lines (e.g. whitespace-only text); nothing to draw.
|
|
777
|
+
return (int(rect.x), int(rect.y))
|
|
778
|
+
box_width = rect.width
|
|
779
|
+
x_pos, y_pos = rect.x, rect.y
|
|
780
|
+
lines_height = rect.height
|
|
781
|
+
if not rect.fits:
|
|
689
782
|
raise OutOfBoundsError(
|
|
690
783
|
f"Text box with size ({box_width}x{lines_height}) at position ({x_pos}, {y_pos}) is out of bounds!"
|
|
691
784
|
)
|
|
@@ -21,7 +21,7 @@ from .effects import (
|
|
|
21
21
|
Zoom,
|
|
22
22
|
)
|
|
23
23
|
from .operation import FilterCtx, OpCategory, Operation, TimeRange
|
|
24
|
-
from .transcription_overlay import TranscriptionOverlay
|
|
24
|
+
from .transcription_overlay import SubtitleRegion, SubtitleStyle, TranscriptionOverlay
|
|
25
25
|
from .transforms import (
|
|
26
26
|
Crop,
|
|
27
27
|
CropMode,
|
|
@@ -65,6 +65,8 @@ __all__ = [
|
|
|
65
65
|
"VolumeAdjust",
|
|
66
66
|
"TextOverlay",
|
|
67
67
|
"TranscriptionOverlay",
|
|
68
|
+
"SubtitleStyle",
|
|
69
|
+
"SubtitleRegion",
|
|
68
70
|
"Shake",
|
|
69
71
|
"PunchIn",
|
|
70
72
|
"Flash",
|
|
@@ -0,0 +1,516 @@
|
|
|
1
|
+
"""Subtitle overlay effect.
|
|
2
|
+
|
|
3
|
+
``TranscriptionOverlay`` is an :class:`Effect` that renders animated
|
|
4
|
+
word-by-word subtitles onto a :class:`Video` using a word-level
|
|
5
|
+
:class:`Transcription`. Rendering is delegated to ``ImageText`` from
|
|
6
|
+
the sibling module.
|
|
7
|
+
|
|
8
|
+
The public surface is intentionally small and *resolution-independent*:
|
|
9
|
+
pick a ``style`` preset, a ``region``, and a ``font_scale`` (fraction of
|
|
10
|
+
frame height). The legacy absolute fields (``font_size``, explicit colors,
|
|
11
|
+
``position``/``anchor``/``box_width``/``margin``) remain as optional advanced
|
|
12
|
+
overrides for back-compat -- left unset they are derived from the presets, so
|
|
13
|
+
an authored plan cannot encode a resolution-specific value that overflows the
|
|
14
|
+
real (post-transform) frame.
|
|
15
|
+
|
|
16
|
+
Fit safety is layered:
|
|
17
|
+
|
|
18
|
+
* one routine (:meth:`_resolve_layout`) decides the final font size for both
|
|
19
|
+
the dry-run and the render, so they can never disagree;
|
|
20
|
+
* it auto-shrinks the font within a legible band and clamps the box inside
|
|
21
|
+
the frame (graceful render fit);
|
|
22
|
+
* only a cue that cannot fit even at the minimum legible size is an error,
|
|
23
|
+
and that error is raised by :meth:`predict_metadata` at
|
|
24
|
+
``VideoEdit.validate()`` time -- before any frame/GPU work -- never
|
|
25
|
+
mid-render.
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
from __future__ import annotations
|
|
29
|
+
|
|
30
|
+
import logging
|
|
31
|
+
from dataclasses import dataclass
|
|
32
|
+
from enum import Enum
|
|
33
|
+
from typing import Any, ClassVar, Literal
|
|
34
|
+
|
|
35
|
+
import numpy as np
|
|
36
|
+
from PIL import Image
|
|
37
|
+
from pydantic import Field
|
|
38
|
+
from tqdm import tqdm
|
|
39
|
+
|
|
40
|
+
from videopython.base.image_text import AnchorPoint, ImageText, TextAlign
|
|
41
|
+
from videopython.base.transcription import Transcription, TranscriptionSegment
|
|
42
|
+
from videopython.base.video import Video, VideoMetadata
|
|
43
|
+
from videopython.editing.operation import Effect
|
|
44
|
+
|
|
45
|
+
__all__ = ["TranscriptionOverlay", "SubtitleStyle", "SubtitleRegion"]
|
|
46
|
+
|
|
47
|
+
logger = logging.getLogger(__name__)
|
|
48
|
+
|
|
49
|
+
# Sentinel for ``background_color``: ``None`` already means "no background",
|
|
50
|
+
# so it cannot double as "derive from the style preset".
|
|
51
|
+
_AUTO: Literal["auto"] = "auto"
|
|
52
|
+
|
|
53
|
+
RGBColor = tuple[int, int, int]
|
|
54
|
+
RGBAColor = tuple[int, int, int, int]
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class SubtitleStyle(str, Enum):
|
|
58
|
+
"""Named look bundling colors / border / background / highlight.
|
|
59
|
+
|
|
60
|
+
Lets a caller express intent ("boxed", "outline", ...) instead of a
|
|
61
|
+
dozen individual numbers. ``BOXED`` reproduces the historical defaults
|
|
62
|
+
exactly, so upgrading without changing fields is visually a no-op except
|
|
63
|
+
for the now resolution-relative font size.
|
|
64
|
+
"""
|
|
65
|
+
|
|
66
|
+
BOXED = "boxed"
|
|
67
|
+
OUTLINE = "outline"
|
|
68
|
+
CLEAN = "clean"
|
|
69
|
+
KARAOKE = "karaoke"
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class SubtitleRegion(str, Enum):
|
|
73
|
+
"""Vertical safe-area band the subtitle box is centered in."""
|
|
74
|
+
|
|
75
|
+
TOP = "top"
|
|
76
|
+
CENTER = "center"
|
|
77
|
+
BOTTOM = "bottom"
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
@dataclass(frozen=True)
|
|
81
|
+
class _StyleParams:
|
|
82
|
+
text_color: RGBColor
|
|
83
|
+
highlight_color: RGBColor
|
|
84
|
+
border: int
|
|
85
|
+
background_color: RGBAColor | None
|
|
86
|
+
background_padding: int
|
|
87
|
+
highlight_size_multiplier: float
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
_STYLE_PRESETS: dict[SubtitleStyle, _StyleParams] = {
|
|
91
|
+
# Exactly the pre-redesign defaults.
|
|
92
|
+
SubtitleStyle.BOXED: _StyleParams((255, 235, 59), (76, 175, 80), 2, (0, 0, 0, 100), 15, 1.2),
|
|
93
|
+
SubtitleStyle.OUTLINE: _StyleParams((255, 255, 255), (255, 235, 59), 4, None, 0, 1.15),
|
|
94
|
+
SubtitleStyle.CLEAN: _StyleParams((255, 255, 255), (76, 175, 80), 2, None, 0, 1.1),
|
|
95
|
+
SubtitleStyle.KARAOKE: _StyleParams((255, 255, 255), (255, 90, 95), 3, (0, 0, 0, 120), 18, 1.25),
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
# region -> normalized (x, y) center of the box; anchor stays CENTER so the
|
|
99
|
+
# box is centered on this point. Chosen to sit inside a conventional safe area.
|
|
100
|
+
_REGION_POSITION: dict[SubtitleRegion, tuple[float, float]] = {
|
|
101
|
+
SubtitleRegion.TOP: (0.5, 0.18),
|
|
102
|
+
SubtitleRegion.CENTER: (0.5, 0.5),
|
|
103
|
+
SubtitleRegion.BOTTOM: (0.5, 0.82),
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
@dataclass(frozen=True)
|
|
108
|
+
class _CueBox:
|
|
109
|
+
"""Absolute, frame-clamped placement of one cue's text box."""
|
|
110
|
+
|
|
111
|
+
x: int
|
|
112
|
+
y: int
|
|
113
|
+
box_w: int
|
|
114
|
+
height: int
|
|
115
|
+
fits: bool
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
@dataclass(frozen=True)
|
|
119
|
+
class _ResolvedConfig:
|
|
120
|
+
"""Every override-or-preset field resolved to a concrete value once.
|
|
121
|
+
|
|
122
|
+
Deterministic from the model fields, so the dry-run and the render
|
|
123
|
+
derive identical geometry and look (parity).
|
|
124
|
+
"""
|
|
125
|
+
|
|
126
|
+
position: tuple[float, float]
|
|
127
|
+
anchor: AnchorPoint
|
|
128
|
+
box_width: float
|
|
129
|
+
text_align: TextAlign
|
|
130
|
+
margin: int | tuple[int, int, int, int]
|
|
131
|
+
style: _StyleParams
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
@dataclass(frozen=True)
|
|
135
|
+
class _SubtitleLayout:
|
|
136
|
+
"""Outcome of resolving the overlay against a concrete frame size.
|
|
137
|
+
|
|
138
|
+
``segments`` are post-transform cues (``chunk`` + ``capitalize``) and
|
|
139
|
+
``config`` is the resolved geometry/look -- both shared by render and
|
|
140
|
+
dry-run so they measure and draw identical boxes. ``font_px`` is the
|
|
141
|
+
single font size both paths use. ``fits`` is False with a populated
|
|
142
|
+
``error`` only when a cue cannot fit even at the minimum legible size.
|
|
143
|
+
"""
|
|
144
|
+
|
|
145
|
+
segments: list[TranscriptionSegment]
|
|
146
|
+
config: _ResolvedConfig
|
|
147
|
+
font_px: int
|
|
148
|
+
fits: bool
|
|
149
|
+
error: str | None
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
class TranscriptionOverlay(Effect):
|
|
153
|
+
"""Renders animated word-by-word subtitles with the current word highlighted.
|
|
154
|
+
|
|
155
|
+
Each word lights up in the highlight color as it is spoken, based on
|
|
156
|
+
transcription timestamps. Requires a word-level transcription, which the
|
|
157
|
+
runner supplies via the ``requires=("transcription",)`` declaration.
|
|
158
|
+
|
|
159
|
+
Geometry is resolution-relative by default (``font_scale``/``region``), so
|
|
160
|
+
a plan validated by ``VideoEdit.validate()`` that passes will also render;
|
|
161
|
+
a plan that cannot fit fails fast in :meth:`predict_metadata` instead of
|
|
162
|
+
crashing mid-render after expensive upstream ops.
|
|
163
|
+
"""
|
|
164
|
+
|
|
165
|
+
op: Literal["add_subtitles"] = "add_subtitles"
|
|
166
|
+
streamable: ClassVar[bool] = False
|
|
167
|
+
requires: ClassVar[tuple[str, ...]] = ("transcription",)
|
|
168
|
+
|
|
169
|
+
# ---- primary, resolution-independent surface ----
|
|
170
|
+
style: SubtitleStyle = Field(
|
|
171
|
+
SubtitleStyle.BOXED,
|
|
172
|
+
description='Look preset bundling colors/border/background/highlight: "boxed", "outline", "clean", "karaoke".',
|
|
173
|
+
)
|
|
174
|
+
region: SubtitleRegion = Field(
|
|
175
|
+
SubtitleRegion.BOTTOM,
|
|
176
|
+
description='Vertical placement band: "top", "center", or "bottom" of the frame.',
|
|
177
|
+
)
|
|
178
|
+
font_scale: float = Field(
|
|
179
|
+
0.055,
|
|
180
|
+
gt=0.0,
|
|
181
|
+
le=0.5,
|
|
182
|
+
description=(
|
|
183
|
+
"Base font height as a fraction of frame height (resolution-independent; the recommended "
|
|
184
|
+
"way to size subtitles). Auto-shrinks toward `min_font_scale` if a cue would overflow."
|
|
185
|
+
),
|
|
186
|
+
)
|
|
187
|
+
min_font_scale: float = Field(
|
|
188
|
+
0.030,
|
|
189
|
+
gt=0.0,
|
|
190
|
+
le=0.5,
|
|
191
|
+
description=(
|
|
192
|
+
"Lower bound for auto-fit shrinking, as a fraction of frame height. A cue that cannot fit "
|
|
193
|
+
"even at this size is a validation error rather than an illegible render."
|
|
194
|
+
),
|
|
195
|
+
)
|
|
196
|
+
max_words_per_cue: int | None = Field(
|
|
197
|
+
5,
|
|
198
|
+
ge=1,
|
|
199
|
+
description=(
|
|
200
|
+
"Maximum words shown on screen at once. Each transcription segment is re-chunked into "
|
|
201
|
+
"cues of at most this many words, without bridging the silence gaps between segments, so "
|
|
202
|
+
"subtitles stay readable and don't linger over pauses. None preserves the source "
|
|
203
|
+
"transcription's segmentation."
|
|
204
|
+
),
|
|
205
|
+
)
|
|
206
|
+
capitalize: bool = Field(
|
|
207
|
+
True,
|
|
208
|
+
description=(
|
|
209
|
+
"Capitalize the first letter of each sentence (first word, and words after '.', '!', '?'). "
|
|
210
|
+
"Fixes lowercase sentence starts from word-level speech-to-text. Set False to render text "
|
|
211
|
+
"exactly as transcribed."
|
|
212
|
+
),
|
|
213
|
+
)
|
|
214
|
+
font_filename: str | None = Field(
|
|
215
|
+
None,
|
|
216
|
+
description="Path to a .ttf font file for rendering subtitle text, or None for the bundled default font.",
|
|
217
|
+
)
|
|
218
|
+
highlight_bold_font: str | None = Field(
|
|
219
|
+
None, description="Path to a bold .ttf font for the highlighted word, or None to use the regular font."
|
|
220
|
+
)
|
|
221
|
+
|
|
222
|
+
# ---- advanced overrides: None => derive from style/region/font_scale ----
|
|
223
|
+
font_size: int | None = Field(
|
|
224
|
+
None,
|
|
225
|
+
ge=1,
|
|
226
|
+
description=(
|
|
227
|
+
"Advanced override: absolute base font size in pixels. Leave None to derive from "
|
|
228
|
+
"`font_scale` (recommended -- resolution-independent and overflow-safe)."
|
|
229
|
+
),
|
|
230
|
+
)
|
|
231
|
+
font_border_size: int | None = Field(
|
|
232
|
+
None, ge=0, description="Advanced override for outline thickness in px. None takes it from `style`."
|
|
233
|
+
)
|
|
234
|
+
text_color: RGBColor | None = Field(
|
|
235
|
+
None, description="Advanced override for default text color [R, G, B] (0-255). None takes it from `style`."
|
|
236
|
+
)
|
|
237
|
+
background_color: RGBAColor | None | Literal["auto"] = Field(
|
|
238
|
+
_AUTO,
|
|
239
|
+
description=(
|
|
240
|
+
'Advanced override for the box background [R, G, B, A] (0-255). "auto" takes it from `style`; '
|
|
241
|
+
"null explicitly disables the background."
|
|
242
|
+
),
|
|
243
|
+
)
|
|
244
|
+
background_padding: int | None = Field(
|
|
245
|
+
None, ge=0, description="Advanced override: px between text and background edge. None takes it from `style`."
|
|
246
|
+
)
|
|
247
|
+
highlight_color: RGBColor | None = Field(
|
|
248
|
+
None, description="Advanced override for the spoken-word color [R, G, B]. None takes it from `style`."
|
|
249
|
+
)
|
|
250
|
+
highlight_size_multiplier: float | None = Field(
|
|
251
|
+
None, gt=0, description="Advanced override: scale factor for the highlighted word. None takes it from `style`."
|
|
252
|
+
)
|
|
253
|
+
position: tuple[float, float] | None = Field(
|
|
254
|
+
None,
|
|
255
|
+
description="Advanced override: box center as normalized (x, y). None derives it from `region`.",
|
|
256
|
+
)
|
|
257
|
+
box_width: float | None = Field(
|
|
258
|
+
None,
|
|
259
|
+
gt=0.0,
|
|
260
|
+
le=1.0,
|
|
261
|
+
description="Advanced override: box width as a fraction of frame width in (0, 1]. None uses 0.6.",
|
|
262
|
+
)
|
|
263
|
+
text_align: TextAlign | None = Field(
|
|
264
|
+
None, description='Advanced override: text alignment within the box. None uses "center".'
|
|
265
|
+
)
|
|
266
|
+
anchor: AnchorPoint | None = Field(
|
|
267
|
+
None, description="Advanced override: which point of the box sits at the position. None uses center."
|
|
268
|
+
)
|
|
269
|
+
margin: int | tuple[int, int, int, int] | None = Field(
|
|
270
|
+
None,
|
|
271
|
+
description="Advanced override: space around the box in px (or [top, right, bottom, left]). None uses 20.",
|
|
272
|
+
)
|
|
273
|
+
|
|
274
|
+
# ------------------------------------------------------------- resolution
|
|
275
|
+
|
|
276
|
+
def _style_params(self) -> _StyleParams:
|
|
277
|
+
"""Effective look: the ``style`` preset overlaid by any explicit overrides."""
|
|
278
|
+
p = _STYLE_PRESETS[self.style]
|
|
279
|
+
bg = p.background_color if self.background_color == _AUTO else self.background_color
|
|
280
|
+
return _StyleParams(
|
|
281
|
+
text_color=self.text_color or p.text_color,
|
|
282
|
+
highlight_color=self.highlight_color or p.highlight_color,
|
|
283
|
+
border=self.font_border_size if self.font_border_size is not None else p.border,
|
|
284
|
+
background_color=bg,
|
|
285
|
+
background_padding=(
|
|
286
|
+
self.background_padding if self.background_padding is not None else p.background_padding
|
|
287
|
+
),
|
|
288
|
+
highlight_size_multiplier=(
|
|
289
|
+
self.highlight_size_multiplier
|
|
290
|
+
if self.highlight_size_multiplier is not None
|
|
291
|
+
else p.highlight_size_multiplier
|
|
292
|
+
),
|
|
293
|
+
)
|
|
294
|
+
|
|
295
|
+
def _resolve_config(self) -> _ResolvedConfig:
|
|
296
|
+
"""Resolve every override-or-preset field to a concrete value once."""
|
|
297
|
+
return _ResolvedConfig(
|
|
298
|
+
position=self.position if self.position is not None else _REGION_POSITION[self.region],
|
|
299
|
+
anchor=self.anchor if self.anchor is not None else AnchorPoint.CENTER,
|
|
300
|
+
box_width=self.box_width if self.box_width is not None else 0.6,
|
|
301
|
+
text_align=self.text_align if self.text_align is not None else TextAlign.CENTER,
|
|
302
|
+
margin=self.margin if self.margin is not None else 20,
|
|
303
|
+
style=self._style_params(),
|
|
304
|
+
)
|
|
305
|
+
|
|
306
|
+
def _transform(self, transcription: Transcription) -> Transcription:
|
|
307
|
+
"""Apply the cue transforms render and dry-run MUST share."""
|
|
308
|
+
if self.max_words_per_cue is not None:
|
|
309
|
+
transcription = transcription.chunk_segments(self.max_words_per_cue)
|
|
310
|
+
if self.capitalize:
|
|
311
|
+
transcription = transcription.capitalize_sentences()
|
|
312
|
+
return transcription
|
|
313
|
+
|
|
314
|
+
def _place_cue(self, img_text: ImageText, text: str, font_px: int, cfg: _ResolvedConfig) -> _CueBox | None:
|
|
315
|
+
"""Measure ``text`` at ``font_px`` and clamp its box inside the margins.
|
|
316
|
+
|
|
317
|
+
Returns ``None`` for a degenerate (whitespace-only) cue. ``fits`` is
|
|
318
|
+
False when the box is larger than the drawable area even after
|
|
319
|
+
clamping -- i.e. shrinking the font is the only remedy. Used by both
|
|
320
|
+
the fit search and the renderer, so they never diverge. Margin math
|
|
321
|
+
comes from ``ImageText.available_region`` (one source of truth with
|
|
322
|
+
``measure_text_box``).
|
|
323
|
+
"""
|
|
324
|
+
rect = img_text.measure_text_box(
|
|
325
|
+
text=text,
|
|
326
|
+
font_filename=self.font_filename,
|
|
327
|
+
xy=cfg.position,
|
|
328
|
+
box_width=cfg.box_width,
|
|
329
|
+
font_size=font_px,
|
|
330
|
+
anchor=cfg.anchor,
|
|
331
|
+
margin=cfg.margin,
|
|
332
|
+
)
|
|
333
|
+
if rect.height == 0:
|
|
334
|
+
return None
|
|
335
|
+
box_w = int(rect.width)
|
|
336
|
+
box_h = rect.height
|
|
337
|
+
left, top, avail_w, avail_h = img_text.available_region(cfg.margin)
|
|
338
|
+
fits = box_w <= avail_w and box_h <= avail_h
|
|
339
|
+
x = min(max(int(round(rect.x)), left), left + avail_w - box_w)
|
|
340
|
+
y = min(max(int(round(rect.y)), top), top + avail_h - box_h)
|
|
341
|
+
return _CueBox(x=x, y=y, box_w=box_w, height=box_h, fits=fits)
|
|
342
|
+
|
|
343
|
+
def _resolve_layout(self, width: int, height: int, transcription: Transcription) -> _SubtitleLayout:
    """Resolve config, font size and fit in one place for render and dry-run.

    The desired size is ``font_size`` when pinned, else ``font_scale * height``
    (at least 1px). If every cue fits at that size it is used as-is. Otherwise
    the largest fitting size between the floor and the desired size is found
    by binary search. When even the floor does not fit, the returned layout
    has ``fits`` False and carries an error message naming the offending cue.
    """
    segments = self._transform(transcription).segments
    visible_cues = [seg for seg in segments if seg.text.strip()]
    cfg = self._resolve_config()

    if self.font_size is None:
        desired = max(1, round(self.font_scale * height))
    else:
        desired = self.font_size
    floor = max(1, round(self.min_font_scale * height))
    # The search never goes above the desired size nor below the legible
    # floor, except that a pinned font_size below the floor is honored.
    lo = min(desired, floor)

    img_text = ImageText(image_size=(height, width), background=(0, 0, 0, 0))

    def first_unfit(font_px: int) -> TranscriptionSegment | None:
        # Lazily place cues in order and stop at the first one that does not fit.
        placed = ((cue, self._place_cue(img_text, cue.text, font_px, cfg)) for cue in visible_cues)
        return next((cue for cue, box in placed if box is not None and not box.fits), None)

    if first_unfit(desired) is None:
        return _SubtitleLayout(segments, cfg, desired, True, None)

    offender = first_unfit(lo)
    if offender is not None:
        error = (
            f"Subtitle cue {offender.text!r} cannot fit in a {width}x{height} frame even at the "
            f"minimum font size ({lo}px, min_font_scale={self.min_font_scale}). Lower min_font_scale, "
            f"reduce max_words_per_cue, widen box_width, or render at a larger resolution."
        )
        return _SubtitleLayout(segments, cfg, lo, False, error)

    # Fit is treated as monotonic in font size (box width does not depend on
    # the font and box height does not shrink as the font grows), so a binary
    # search finds the largest fitting size.
    # Invariant: everything fits at ``fit_px`` and something fails at ``unfit_px``.
    fit_px, unfit_px = lo, desired
    while unfit_px - fit_px > 1:
        probe = (fit_px + unfit_px) // 2
        if first_unfit(probe) is not None:
            unfit_px = probe
        else:
            fit_px = probe
    return _SubtitleLayout(segments, cfg, fit_px, True, None)
|
|
385
|
+
|
|
386
|
+
# ------------------------------------------------------------- timeline
|
|
387
|
+
|
|
388
|
+
def _get_active_segment(self, transcription: Transcription, timestamp: float) -> TranscriptionSegment | None:
|
|
389
|
+
for segment in transcription.segments:
|
|
390
|
+
if segment.start <= timestamp <= segment.end:
|
|
391
|
+
return segment
|
|
392
|
+
return None
|
|
393
|
+
|
|
394
|
+
def _get_active_word_index(self, segment: TranscriptionSegment, timestamp: float) -> int | None:
|
|
395
|
+
for i, word in enumerate(segment.words):
|
|
396
|
+
if word.start <= timestamp <= word.end:
|
|
397
|
+
return i
|
|
398
|
+
return None
|
|
399
|
+
|
|
400
|
+
def _create_text_overlay(
|
|
401
|
+
self,
|
|
402
|
+
video_shape: tuple[int, int, int],
|
|
403
|
+
segment: TranscriptionSegment,
|
|
404
|
+
highlight_word_index: int | None,
|
|
405
|
+
layout: _SubtitleLayout,
|
|
406
|
+
cache: dict[tuple[str, int | None], np.ndarray],
|
|
407
|
+
) -> np.ndarray:
|
|
408
|
+
height, width = video_shape[:2]
|
|
409
|
+
cache_key = (segment.text, highlight_word_index)
|
|
410
|
+
if cache_key in cache:
|
|
411
|
+
return cache[cache_key]
|
|
412
|
+
|
|
413
|
+
cfg = layout.config
|
|
414
|
+
img_text = ImageText(image_size=(height, width), background=(0, 0, 0, 0))
|
|
415
|
+
box = self._place_cue(img_text, segment.text, layout.font_px, cfg)
|
|
416
|
+
if box is not None:
|
|
417
|
+
sp = cfg.style
|
|
418
|
+
# Absolute, pre-clamped placement (anchor=TOP_LEFT, explicit px box,
|
|
419
|
+
# margin already applied) -- the same numbers _resolve_layout used,
|
|
420
|
+
# so a layout that validated cannot raise OutOfBoundsError here.
|
|
421
|
+
img_text.write_text_box(
|
|
422
|
+
text=segment.text,
|
|
423
|
+
font_filename=self.font_filename,
|
|
424
|
+
xy=(box.x, box.y),
|
|
425
|
+
box_width=box.box_w,
|
|
426
|
+
font_size=layout.font_px,
|
|
427
|
+
font_border_size=sp.border,
|
|
428
|
+
text_color=sp.text_color,
|
|
429
|
+
background_color=sp.background_color,
|
|
430
|
+
background_padding=sp.background_padding,
|
|
431
|
+
place=cfg.text_align,
|
|
432
|
+
anchor=AnchorPoint.TOP_LEFT,
|
|
433
|
+
margin=0,
|
|
434
|
+
words=[w.word for w in segment.words],
|
|
435
|
+
highlight_word_index=highlight_word_index,
|
|
436
|
+
highlight_color=sp.highlight_color,
|
|
437
|
+
highlight_size_multiplier=sp.highlight_size_multiplier,
|
|
438
|
+
highlight_bold_font=self.highlight_bold_font,
|
|
439
|
+
)
|
|
440
|
+
|
|
441
|
+
overlay_image = img_text.img_array
|
|
442
|
+
cache[cache_key] = overlay_image
|
|
443
|
+
return overlay_image
|
|
444
|
+
|
|
445
|
+
def apply(  # type: ignore[override]
    self,
    video: Video,
    transcription: Transcription | None = None,
) -> Video:
    """Burn the subtitles into every frame of ``video`` and return a new video.

    Args:
        video: Source video; its frames, fps, frame shape and audio are read.
        transcription: Transcription to draw. Required despite the default.

    Returns:
        A new ``Video`` built from the composited frames at the same fps, with
        the source audio attached.

    Raises:
        ValueError: If ``transcription`` is ``None``, or if the resolved layout
            does not fit the frame.
    """
    if transcription is None:
        raise ValueError(
            "TranscriptionOverlay requires transcription data. "
            "Pass it via VideoEdit.run(context={'transcription': ...}) or directly to apply()."
        )

    frame_h, frame_w = video.frame_shape[:2]
    layout = self._resolve_layout(frame_w, frame_h, transcription)
    if not layout.fits:
        # Defense in depth for direct apply() calls: fail with the layout's
        # own message instead of crashing mid-render inside ImageText.
        raise ValueError(layout.error)

    # The memo is local to this call rather than stored on the instance, so a
    # reused instance never serves an overlay sized for a different video.
    overlay_cache: dict[tuple[str, int | None], np.ndarray] = {}
    cues = Transcription(segments=layout.segments, language=transcription.language)

    logger.info("Applying transcription overlay (font %dpx)...", layout.font_px)
    composited = []
    for index, frame in enumerate(tqdm(video.frames, desc="Transcription overlay")):
        moment = index / video.fps
        cue = self._get_active_segment(cues, moment)
        if cue is not None:
            word_index = self._get_active_word_index(cue, moment)
            overlay = self._create_text_overlay(video.frame_shape, cue, word_index, layout, overlay_cache)
            frame = self._apply_overlay_to_frame(frame, overlay)
        composited.append(frame)

    result = Video.from_frames(np.array(composited), fps=video.fps)
    result.audio = video.audio
    return result
|
|
488
|
+
|
|
489
|
+
def predict_metadata(
    self,
    meta: VideoMetadata,
    transcription: Transcription | None = None,
    **_context: Any,
) -> VideoMetadata:
    """Return ``meta`` unchanged, failing fast if the subtitles cannot fit.

    The metadata itself is never modified. When a ``transcription`` is
    supplied, the layout is resolved against ``meta.width`` x ``meta.height``
    and an un-fittable plan is rejected here, before any frame work. Without
    a ``transcription`` the layout cannot be checked, so the call does
    nothing and returns ``meta``.

    Raises:
        ValueError: If the resolved layout does not fit the predicted frame.
    """
    if transcription is not None:
        layout = self._resolve_layout(meta.width, meta.height, transcription)
        if not layout.fits:
            raise ValueError(layout.error)
    return meta
|
|
511
|
+
|
|
512
|
+
def _apply_overlay_to_frame(self, frame: np.ndarray, overlay: np.ndarray) -> np.ndarray:
    """Composite ``overlay`` onto ``frame`` and return the result as a new array.

    The overlay is pasted at the top-left corner and is also passed as the
    paste mask.
    """
    base = Image.fromarray(frame)
    layer = Image.fromarray(overlay)
    base.paste(layer, box=(0, 0), mask=layer)
    return np.array(base)
|
|
@@ -1,186 +0,0 @@
|
|
|
1
|
-
"""Subtitle overlay effect.
|
|
2
|
-
|
|
3
|
-
``TranscriptionOverlay`` is an :class:`Effect` that renders animated
|
|
4
|
-
word-by-word subtitles onto a :class:`Video` using a word-level
|
|
5
|
-
:class:`Transcription`. Rendering is delegated to ``ImageText`` from
|
|
6
|
-
the sibling module.
|
|
7
|
-
"""
|
|
8
|
-
|
|
9
|
-
from __future__ import annotations
|
|
10
|
-
|
|
11
|
-
import logging
|
|
12
|
-
from typing import ClassVar, Literal
|
|
13
|
-
|
|
14
|
-
import numpy as np
|
|
15
|
-
from PIL import Image
|
|
16
|
-
from pydantic import Field, PrivateAttr
|
|
17
|
-
from tqdm import tqdm
|
|
18
|
-
|
|
19
|
-
from videopython.base.image_text import AnchorPoint, ImageText, TextAlign
|
|
20
|
-
from videopython.base.transcription import Transcription, TranscriptionSegment
|
|
21
|
-
from videopython.base.video import Video
|
|
22
|
-
from videopython.editing.operation import Effect
|
|
23
|
-
|
|
24
|
-
__all__ = ["TranscriptionOverlay"]
|
|
25
|
-
|
|
26
|
-
logger = logging.getLogger(__name__)
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
class TranscriptionOverlay(Effect):
    """Renders animated word-by-word subtitles with the current word highlighted.

    Each word lights up in the highlight color as it is spoken, based on
    transcription timestamps. Requires a word-level transcription, which the
    runner supplies via the ``requires=("transcription",)`` declaration.
    """

    op: Literal["add_subtitles"] = "add_subtitles"
    streamable: ClassVar[bool] = False
    requires: ClassVar[tuple[str, ...]] = ("transcription",)

    font_filename: str | None = Field(
        None,
        description="Path to a .ttf font file for rendering subtitle text, or None for the bundled default font.",
    )
    font_size: int = Field(40, ge=1, description="Base font size in pixels.")
    font_border_size: int = Field(
        2, ge=0, description="Outline thickness around each character in pixels. 0 = no outline."
    )
    text_color: tuple[int, int, int] = Field((255, 235, 59), description="Default text color as [R, G, B], each 0-255.")
    background_color: tuple[int, int, int, int] | None = Field(
        (0, 0, 0, 100),
        description="Subtitle box background as [R, G, B, A] (0-255), or None to disable the background.",
    )
    background_padding: int = Field(15, ge=0, description="Pixels of space between text and background edge.")
    position: tuple[float, float] = Field(
        (0.5, 0.7),
        description="Text box center as normalized (x, y). (0, 0) = top-left, (1, 1) = bottom-right.",
    )
    box_width: float = Field(
        0.6, gt=0.0, le=1.0, description="Width of the text box as a fraction of frame width, in (0, 1]."
    )
    text_align: TextAlign = Field(
        TextAlign.CENTER, description='Text alignment within the box: "left", "right", or "center".'
    )
    anchor: AnchorPoint = Field(
        AnchorPoint.CENTER, description="Which point of the text box sits at the position coordinate."
    )
    margin: int | tuple[int, int, int, int] = Field(
        20,
        description="Space around the text box in pixels, or a [top, right, bottom, left] tuple of per-side values.",
    )
    highlight_color: tuple[int, int, int] = Field(
        (76, 175, 80), description="Color for the currently spoken word as [R, G, B]."
    )
    highlight_size_multiplier: float = Field(
        1.2, gt=0, description="Scale factor for the highlighted word. 1.0 = same size, 1.2 = 20% larger."
    )
    highlight_bold_font: str | None = Field(
        None, description="Path to a bold .ttf font for the highlighted word, or None to use the regular font."
    )
    max_words_per_cue: int | None = Field(
        5,
        ge=1,
        description=(
            "Maximum words shown on screen at once. Each transcription segment is re-chunked into "
            "cues of at most this many words, without bridging the silence gaps between segments, so "
            "subtitles stay readable and don't linger over pauses. None preserves the source "
            "transcription's segmentation."
        ),
    )
    capitalize: bool = Field(
        True,
        description=(
            "Capitalize the first letter of each sentence (first word, and words after '.', '!', '?'). "
            "Fixes lowercase sentence starts from word-level speech-to-text. Set False to render text "
            "exactly as transcribed."
        ),
    )

    # Rendered overlays keyed by (frame height, frame width, cue text,
    # highlighted word index). The cache lives on the instance, so the frame
    # size must be part of the key to stay correct across videos.
    _overlay_cache: dict[tuple[int, int, str, int | None], np.ndarray] = PrivateAttr(default_factory=dict)

    def _get_active_segment(self, transcription: Transcription, timestamp: float) -> TranscriptionSegment | None:
        """Return the first segment whose inclusive ``[start, end]`` span contains ``timestamp``, else ``None``."""
        for segment in transcription.segments:
            if segment.start <= timestamp <= segment.end:
                return segment
        return None

    def _get_active_word_index(self, segment: TranscriptionSegment, timestamp: float) -> int | None:
        """Return the index of the first word whose inclusive ``[start, end]`` span contains ``timestamp``, else ``None``."""
        for i, word in enumerate(segment.words):
            if word.start <= timestamp <= word.end:
                return i
        return None

    def _create_text_overlay(
        self,
        video_shape: tuple[int, int, int],
        segment: TranscriptionSegment,
        highlight_word_index: int | None,
    ) -> np.ndarray:
        """Render (or fetch from the instance cache) the overlay for one cue and highlight state.

        Args:
            video_shape: Frame shape; its first two entries are read as height and width.
            segment: Cue to draw; its ``text`` and ``words`` are used.
            highlight_word_index: Index of the word to highlight, or ``None``.

        Returns:
            The overlay image array produced by ``ImageText``.
        """
        height, width = video_shape[:2]
        # Frame size is part of the key: without it, an instance reused on a
        # differently sized video would get back an overlay rendered for the
        # previous resolution.
        cache_key = (height, width, segment.text, highlight_word_index)
        if cache_key in self._overlay_cache:
            return self._overlay_cache[cache_key]

        img_text = ImageText(image_size=(height, width), background=(0, 0, 0, 0))
        img_text.write_text_box(
            text=segment.text,
            font_filename=self.font_filename,
            xy=self.position,
            box_width=self.box_width,
            font_size=self.font_size,
            font_border_size=self.font_border_size,
            text_color=self.text_color,
            background_color=self.background_color,
            background_padding=self.background_padding,
            place=self.text_align,
            anchor=self.anchor,
            margin=self.margin,
            words=[w.word for w in segment.words],
            highlight_word_index=highlight_word_index,
            highlight_color=self.highlight_color,
            highlight_size_multiplier=self.highlight_size_multiplier,
            highlight_bold_font=self.highlight_bold_font,
        )

        overlay_image = img_text.img_array
        self._overlay_cache[cache_key] = overlay_image
        return overlay_image

    def apply(  # type: ignore[override]
        self,
        video: Video,
        transcription: Transcription | None = None,
    ) -> Video:
        """Burn the subtitles into every frame of ``video`` and return a new video.

        The transcription is re-chunked to ``max_words_per_cue`` (when set) and
        sentence-capitalized (when ``capitalize`` is True) before rendering.

        Raises:
            ValueError: If ``transcription`` is ``None``.
        """
        if transcription is None:
            raise ValueError(
                "TranscriptionOverlay requires transcription data. "
                "Pass it via VideoEdit.run(context={'transcription': ...}) or directly to apply()."
            )

        if self.max_words_per_cue is not None:
            transcription = transcription.chunk_segments(self.max_words_per_cue)
        if self.capitalize:
            transcription = transcription.capitalize_sentences()

        logger.info("Applying transcription overlay...")
        new_frames = []
        for frame_index, frame in enumerate(tqdm(video.frames, desc="Transcription overlay")):
            timestamp = frame_index / video.fps
            active_segment = self._get_active_segment(transcription, timestamp)
            if active_segment is None:
                # No cue is active at this time: keep the frame untouched.
                new_frames.append(frame)
                continue
            highlight_word_index = self._get_active_word_index(active_segment, timestamp)
            text_overlay = self._create_text_overlay(video.frame_shape, active_segment, highlight_word_index)
            new_frames.append(self._apply_overlay_to_frame(frame, text_overlay))

        new_video = Video.from_frames(np.array(new_frames), fps=video.fps)
        new_video.audio = video.audio
        return new_video

    def _apply_overlay_to_frame(self, frame: np.ndarray, overlay: np.ndarray) -> np.ndarray:
        """Paste ``overlay`` onto ``frame`` at the top-left corner, using the overlay itself as the mask."""
        frame_pil = Image.fromarray(frame)
        overlay_pil = Image.fromarray(overlay)
        frame_pil.paste(overlay_pil, (0, 0), overlay_pil)
        return np.array(frame_pil)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|