videopython 0.33.5__tar.gz → 0.34.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. {videopython-0.33.5 → videopython-0.34.0}/PKG-INFO +1 -1
  2. {videopython-0.33.5 → videopython-0.34.0}/pyproject.toml +1 -1
  3. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/transforms.py +24 -8
  4. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/base/image_text.py +137 -44
  5. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/editing/__init__.py +3 -1
  6. videopython-0.34.0/src/videopython/editing/transcription_overlay.py +516 -0
  7. videopython-0.33.5/src/videopython/editing/transcription_overlay.py +0 -186
  8. {videopython-0.33.5 → videopython-0.34.0}/.gitignore +0 -0
  9. {videopython-0.33.5 → videopython-0.34.0}/LICENSE +0 -0
  10. {videopython-0.33.5 → videopython-0.34.0}/README.md +0 -0
  11. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/__init__.py +0 -0
  12. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/__init__.py +0 -0
  13. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/_device.py +0 -0
  14. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/dubbing/__init__.py +0 -0
  15. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/dubbing/config.py +0 -0
  16. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/dubbing/dubber.py +0 -0
  17. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/dubbing/expressiveness.py +0 -0
  18. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/dubbing/loudness.py +0 -0
  19. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/dubbing/models.py +0 -0
  20. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/dubbing/pipeline.py +0 -0
  21. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/dubbing/quality.py +0 -0
  22. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/dubbing/remux.py +0 -0
  23. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/dubbing/timing.py +0 -0
  24. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/dubbing/voice_sample.py +0 -0
  25. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/generation/__init__.py +0 -0
  26. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/generation/audio.py +0 -0
  27. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/generation/image.py +0 -0
  28. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/generation/qwen3.py +0 -0
  29. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/generation/translation.py +0 -0
  30. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/generation/video.py +0 -0
  31. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/understanding/__init__.py +0 -0
  32. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/understanding/audio.py +0 -0
  33. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/understanding/faces.py +0 -0
  34. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/understanding/image.py +0 -0
  35. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/understanding/separation.py +0 -0
  36. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/understanding/temporal.py +0 -0
  37. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/video_analysis/__init__.py +0 -0
  38. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/video_analysis/analyzer.py +0 -0
  39. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/video_analysis/models.py +0 -0
  40. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/video_analysis/sampling.py +0 -0
  41. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/ai/video_analysis/stages.py +0 -0
  42. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/audio/__init__.py +0 -0
  43. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/audio/analysis.py +0 -0
  44. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/audio/audio.py +0 -0
  45. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/base/__init__.py +0 -0
  46. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/base/_dimensions.py +0 -0
  47. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/base/_ffmpeg.py +0 -0
  48. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/base/_video_io.py +0 -0
  49. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/base/description.py +0 -0
  50. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/base/exceptions.py +0 -0
  51. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/base/fonts/DejaVuSans.ttf +0 -0
  52. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/base/fonts/LICENSE_DEJAVU +0 -0
  53. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/base/fonts/__init__.py +0 -0
  54. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/base/transcription.py +0 -0
  55. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/base/video.py +0 -0
  56. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/editing/effects.py +0 -0
  57. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/editing/operation.py +0 -0
  58. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/editing/streaming.py +0 -0
  59. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/editing/transforms.py +0 -0
  60. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/editing/video_edit.py +0 -0
  61. {videopython-0.33.5 → videopython-0.34.0}/src/videopython/py.typed +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: videopython
3
- Version: 0.33.5
3
+ Version: 0.34.0
4
4
  Summary: Minimal video generation and processing library.
5
5
  Project-URL: Homepage, https://videopython.com
6
6
  Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "videopython"
3
- version = "0.33.5"
3
+ version = "0.34.0"
4
4
  description = "Minimal video generation and processing library."
5
5
  authors = [
6
6
  { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
@@ -12,7 +12,7 @@ from tqdm import tqdm
12
12
 
13
13
  from videopython.ai.understanding.faces import FaceTracker
14
14
  from videopython.base._dimensions import floor_to_even
15
- from videopython.base.video import Video
15
+ from videopython.base.video import Video, VideoMetadata
16
16
  from videopython.editing.operation import OpCategory, Operation
17
17
 
18
18
  logger = logging.getLogger(__name__)
@@ -76,6 +76,28 @@ class FaceTrackingCrop(Operation):
76
76
  # "dynamic" — placeholder until motion/look-direction framing is implemented.
77
77
  return (face_cx, face_cy - self.headroom)
78
78
 
79
+ def _resolved_output_dims(self, w: int, h: int) -> tuple[int, int]:
80
+ """Output ``(width, height)`` after the crop + resize.
81
+
82
+ Every frame is resized to this size regardless of the per-frame face
83
+ position, so it is a pure function of the input dimensions and
84
+ ``target_aspect``. Single source of truth shared by :meth:`apply` and
85
+ :meth:`predict_metadata` (mirrors ``Resize._resolve_dims`` /
86
+ ``Crop._resolve_box``), so the dry-run cannot disagree with the render.
87
+ """
88
+ target_ratio = self.target_aspect[0] / self.target_aspect[1]
89
+ if target_ratio < w / h:
90
+ out_h = floor_to_even(h)
91
+ out_w = floor_to_even(int(out_h * target_ratio))
92
+ else:
93
+ out_w = floor_to_even(w)
94
+ out_h = floor_to_even(int(out_w / target_ratio))
95
+ return out_w, out_h
96
+
97
+ def predict_metadata(self, meta: VideoMetadata) -> VideoMetadata:
98
+ out_w, out_h = self._resolved_output_dims(meta.width, meta.height)
99
+ return meta.with_dimensions(out_w, out_h)
100
+
79
101
  def _clamp_speed(self, current: tuple[float, float], target: tuple[float, float]) -> tuple[float, float]:
80
102
  if self.max_speed is None:
81
103
  return target
@@ -135,13 +157,7 @@ class FaceTrackingCrop(Operation):
135
157
  )
136
158
 
137
159
  h, w = video.frame_shape[:2]
138
- target_ratio = self.target_aspect[0] / self.target_aspect[1]
139
- if target_ratio < w / h:
140
- out_h = floor_to_even(h)
141
- out_w = floor_to_even(int(out_h * target_ratio))
142
- else:
143
- out_w = floor_to_even(w)
144
- out_h = floor_to_even(int(out_w / target_ratio))
160
+ out_w, out_h = self._resolved_output_dims(w, h)
145
161
 
146
162
  default_x = (w - out_w) // 2
147
163
  default_y = (h - out_h) // 2
@@ -9,6 +9,7 @@ generation helpers (``ai/understanding/image.py``).
9
9
 
10
10
  from __future__ import annotations
11
11
 
12
+ from dataclasses import dataclass
12
13
  from enum import Enum
13
14
  from typing import TypeAlias
14
15
 
@@ -18,7 +19,7 @@ from PIL import Image, ImageDraw, ImageFont
18
19
  from videopython.base.exceptions import OutOfBoundsError
19
20
  from videopython.base.fonts import load_font
20
21
 
21
- __all__ = ["ImageText", "TextAlign", "AnchorPoint"]
22
+ __all__ = ["ImageText", "TextAlign", "AnchorPoint", "TextBoxRect"]
22
23
 
23
24
  # Type aliases for clarity
24
25
  MarginType: TypeAlias = int | tuple[int, int, int, int]
@@ -79,6 +80,32 @@ class AnchorPoint(str, Enum):
79
80
  return (cls.BOTTOM_LEFT, cls.BOTTOM_CENTER, cls.BOTTOM_RIGHT)
80
81
 
81
82
 
83
+ @dataclass(frozen=True)
84
+ class TextBoxRect:
85
+ """Resolved geometry of a wrapped text box, without rendering it.
86
+
87
+ Returned by :meth:`ImageText.measure_text_box` — the single source of
88
+ truth for box measure/wrap/anchor/bounds, shared by the renderer
89
+ (:meth:`ImageText.write_text_box`) and dry-run validators so they can
90
+ never disagree on whether text fits.
91
+
92
+ For a non-degenerate box ``(x, y)`` is the anchor-adjusted top-left
93
+ corner and ``width``/``height`` span the wrapped lines. For a degenerate
94
+ box (whitespace-only text → no renderable lines) ``height == 0``,
95
+ ``(x, y)`` is the *unadjusted* insertion point, and ``fits`` is ``True``;
96
+ callers short-circuit such boxes (nothing to draw). ``width`` mirrors the
97
+ resolved ``box_width`` and may be a float when an absolute >1 value was
98
+ passed, matching legacy behaviour.
99
+ """
100
+
101
+ x: float
102
+ y: float
103
+ width: int | float
104
+ height: int
105
+ fits: bool
106
+ lines: tuple[str, ...]
107
+
108
+
82
109
  class ImageText:
83
110
  def __init__(
84
111
  self,
@@ -566,6 +593,97 @@ class ImageText:
566
593
  lines = [" ".join(line) for line in split_lines]
567
594
  return lines
568
595
 
596
+ def available_region(self, margin: MarginType = 0) -> tuple[int, int, int, int]:
597
+ """The drawable area inside ``margin`` as ``(left, top, width, height)``.
598
+
599
+ Single source of truth for margin-inset geometry: used by
600
+ :meth:`measure_text_box` and by callers that need to clamp a box
601
+ within the margins without re-deriving the margin math.
602
+ """
603
+ margin_top, margin_right, margin_bottom, margin_left = self._process_margin(margin)
604
+ available_width = self.image_size[1] - margin_left - margin_right
605
+ available_height = self.image_size[0] - margin_top - margin_bottom
606
+ return margin_left, margin_top, available_width, available_height
607
+
608
+ def measure_text_box(
609
+ self,
610
+ text: str,
611
+ font_filename: str | None,
612
+ xy: PositionType,
613
+ box_width: int | float | None = None,
614
+ font_size: int = 11,
615
+ anchor: AnchorPoint = AnchorPoint.TOP_LEFT,
616
+ margin: MarginType = 0,
617
+ ) -> TextBoxRect:
618
+ """Measure where a wrapped text box would land, without drawing it.
619
+
620
+ Pure: resolves margins/box-width/position, wraps the text, applies the
621
+ anchor, and bounds-checks against the image — the exact math
622
+ :meth:`write_text_box` used to do inline. Highlighting and per-line
623
+ alignment (``place``) do not change the box envelope, so they are not
624
+ parameters here; this intentionally preserves the pre-existing
625
+ behaviour that an enlarged highlighted word is *not* accounted for in
626
+ the fit check.
627
+
628
+ Returns:
629
+ A :class:`TextBoxRect`. ``fits`` is ``False`` when the box would
630
+ fall outside the image bounds (the condition that makes
631
+ :meth:`write_text_box` raise :class:`OutOfBoundsError`).
632
+
633
+ Raises:
634
+ ValueError: If ``text`` is empty, ``font_size`` is not positive,
635
+ or an absolute ``box_width`` is not positive.
636
+ """
637
+ if not text:
638
+ raise ValueError("Text cannot be empty")
639
+
640
+ if font_size <= 0:
641
+ raise ValueError("Font size must be positive")
642
+
643
+ # Process margins to determine available area (shared with callers
644
+ # that clamp boxes inside the margins -- see ``available_region``).
645
+ margin_left, margin_top, available_width, available_height = self.available_region(margin)
646
+
647
+ # Handle relative box width
648
+ if box_width is None:
649
+ box_width = available_width
650
+ elif isinstance(box_width, float) and 0 < box_width <= 1:
651
+ box_width = int(available_width * box_width)
652
+ elif isinstance(box_width, int) and box_width <= 0:
653
+ raise ValueError("Box width must be positive")
654
+
655
+ # Calculate initial position based on margin and anchor before splitting text
656
+ x_pos, y_pos = self._convert_position(xy, margin_top, margin_left, available_width, available_height)
657
+
658
+ # Split text into lines that fit within box_width
659
+ lines = self._split_lines_by_width(text, font_filename, font_size, int(box_width))
660
+
661
+ # Calculate total height of all lines
662
+ lines_height = sum(self.get_text_dimensions(font_filename, font_size, line)[1] for line in lines)
663
+ if lines_height == 0:
664
+ # No renderable lines (e.g. whitespace-only text); position is the
665
+ # unadjusted insertion point and the box trivially "fits".
666
+ return TextBoxRect(x=x_pos, y=y_pos, width=box_width, height=0, fits=True, lines=tuple(lines))
667
+
668
+ # Final position calculation based on anchor point
669
+ if anchor in AnchorPoint.center_anchors():
670
+ x_pos -= box_width // 2
671
+ elif anchor in AnchorPoint.right_anchors():
672
+ x_pos -= box_width
673
+
674
+ if anchor in AnchorPoint.middle_anchors():
675
+ y_pos -= lines_height // 2
676
+ elif anchor in AnchorPoint.bottom_anchors():
677
+ y_pos -= lines_height
678
+
679
+ fits = not (
680
+ x_pos < 0
681
+ or y_pos < 0
682
+ or x_pos + box_width > self.image_size[1]
683
+ or y_pos + lines_height > self.image_size[0]
684
+ )
685
+ return TextBoxRect(x=x_pos, y=y_pos, width=box_width, height=lines_height, fits=fits, lines=tuple(lines))
686
+
569
687
  def write_text_box(
570
688
  self,
571
689
  text: str,
@@ -643,49 +761,24 @@ class ImageText:
643
761
  if highlight_word_index is not None and highlight_color is None:
644
762
  highlight_color = text_color
645
763
 
646
- # Process margins to determine available area
647
- margin_top, margin_right, margin_bottom, margin_left = self._process_margin(margin)
648
- available_width = self.image_size[1] - margin_left - margin_right
649
- available_height = self.image_size[0] - margin_top - margin_bottom
650
-
651
- # Handle relative box width
652
- if box_width is None:
653
- box_width = available_width
654
- elif isinstance(box_width, float) and 0 < box_width <= 1:
655
- box_width = int(available_width * box_width)
656
- elif isinstance(box_width, int) and box_width <= 0:
657
- raise ValueError("Box width must be positive")
658
-
659
- # Calculate initial position based on margin and anchor before splitting text
660
- x_pos, y_pos = self._convert_position(xy, margin_top, margin_left, available_width, available_height)
661
-
662
- # Split text into lines that fit within box_width
663
- lines = self._split_lines_by_width(text, font_filename, font_size, int(box_width))
664
-
665
- # Calculate total height of all lines
666
- lines_height = sum([self.get_text_dimensions(font_filename, font_size, line)[1] for line in lines])
667
- if lines_height == 0:
668
- # If we have no valid lines or zero height, return the position
669
- return (int(x_pos), int(y_pos))
670
-
671
- # Final position calculation based on anchor point
672
- if anchor in AnchorPoint.center_anchors():
673
- x_pos -= box_width // 2
674
- elif anchor in AnchorPoint.right_anchors():
675
- x_pos -= box_width
676
-
677
- if anchor in AnchorPoint.middle_anchors():
678
- y_pos -= lines_height // 2
679
- elif anchor in AnchorPoint.bottom_anchors():
680
- y_pos -= lines_height
681
-
682
- # Verify box will fit within bounds
683
- if (
684
- x_pos < 0
685
- or y_pos < 0
686
- or x_pos + box_width > self.image_size[1]
687
- or y_pos + lines_height > self.image_size[0]
688
- ):
764
+ # Measure (single source of truth for box geometry), then render.
765
+ rect = self.measure_text_box(
766
+ text=text,
767
+ font_filename=font_filename,
768
+ xy=xy,
769
+ box_width=box_width,
770
+ font_size=font_size,
771
+ anchor=anchor,
772
+ margin=margin,
773
+ )
774
+ lines = list(rect.lines)
775
+ if rect.height == 0:
776
+ # No renderable lines (e.g. whitespace-only text); nothing to draw.
777
+ return (int(rect.x), int(rect.y))
778
+ box_width = rect.width
779
+ x_pos, y_pos = rect.x, rect.y
780
+ lines_height = rect.height
781
+ if not rect.fits:
689
782
  raise OutOfBoundsError(
690
783
  f"Text box with size ({box_width}x{lines_height}) at position ({x_pos}, {y_pos}) is out of bounds!"
691
784
  )
@@ -21,7 +21,7 @@ from .effects import (
21
21
  Zoom,
22
22
  )
23
23
  from .operation import FilterCtx, OpCategory, Operation, TimeRange
24
- from .transcription_overlay import TranscriptionOverlay
24
+ from .transcription_overlay import SubtitleRegion, SubtitleStyle, TranscriptionOverlay
25
25
  from .transforms import (
26
26
  Crop,
27
27
  CropMode,
@@ -65,6 +65,8 @@ __all__ = [
65
65
  "VolumeAdjust",
66
66
  "TextOverlay",
67
67
  "TranscriptionOverlay",
68
+ "SubtitleStyle",
69
+ "SubtitleRegion",
68
70
  "Shake",
69
71
  "PunchIn",
70
72
  "Flash",
@@ -0,0 +1,516 @@
1
+ """Subtitle overlay effect.
2
+
3
+ ``TranscriptionOverlay`` is an :class:`Effect` that renders animated
4
+ word-by-word subtitles onto a :class:`Video` using a word-level
5
+ :class:`Transcription`. Rendering is delegated to ``ImageText`` from
6
+ the sibling module.
7
+
8
+ The public surface is intentionally small and *resolution-independent*:
9
+ pick a ``style`` preset, a ``region``, and a ``font_scale`` (fraction of
10
+ frame height). The legacy absolute fields (``font_size``, explicit colors,
11
+ ``position``/``anchor``/``box_width``/``margin``) remain as optional advanced
12
+ overrides for back-compat -- left unset they are derived from the presets, so
13
+ an authored plan cannot encode a resolution-specific value that overflows the
14
+ real (post-transform) frame.
15
+
16
+ Fit safety is layered:
17
+
18
+ * one routine (:meth:`_resolve_layout`) decides the final font size for both
19
+ the dry-run and the render, so they can never disagree;
20
+ * it auto-shrinks the font within a legible band and clamps the box inside
21
+ the frame (graceful render fit);
22
+ * only a cue that cannot fit even at the minimum legible size is an error,
23
+ and that error is raised by :meth:`predict_metadata` at
24
+ ``VideoEdit.validate()`` time -- before any frame/GPU work -- never
25
+ mid-render.
26
+ """
27
+
28
+ from __future__ import annotations
29
+
30
+ import logging
31
+ from dataclasses import dataclass
32
+ from enum import Enum
33
+ from typing import Any, ClassVar, Literal
34
+
35
+ import numpy as np
36
+ from PIL import Image
37
+ from pydantic import Field
38
+ from tqdm import tqdm
39
+
40
+ from videopython.base.image_text import AnchorPoint, ImageText, TextAlign
41
+ from videopython.base.transcription import Transcription, TranscriptionSegment
42
+ from videopython.base.video import Video, VideoMetadata
43
+ from videopython.editing.operation import Effect
44
+
45
+ __all__ = ["TranscriptionOverlay", "SubtitleStyle", "SubtitleRegion"]
46
+
47
+ logger = logging.getLogger(__name__)
48
+
49
+ # Sentinel for ``background_color``: ``None`` already means "no background",
50
+ # so it cannot double as "derive from the style preset".
51
+ _AUTO: Literal["auto"] = "auto"
52
+
53
+ RGBColor = tuple[int, int, int]
54
+ RGBAColor = tuple[int, int, int, int]
55
+
56
+
57
+ class SubtitleStyle(str, Enum):
58
+ """Named look bundling colors / border / background / highlight.
59
+
60
+ Lets a caller express intent ("boxed", "outline", ...) instead of a
61
+ dozen individual numbers. ``BOXED`` reproduces the historical defaults
62
+ exactly, so upgrading without changing fields is visually a no-op except
63
+ for the now resolution-relative font size.
64
+ """
65
+
66
+ BOXED = "boxed"
67
+ OUTLINE = "outline"
68
+ CLEAN = "clean"
69
+ KARAOKE = "karaoke"
70
+
71
+
72
+ class SubtitleRegion(str, Enum):
73
+ """Vertical safe-area band the subtitle box is centered in."""
74
+
75
+ TOP = "top"
76
+ CENTER = "center"
77
+ BOTTOM = "bottom"
78
+
79
+
80
+ @dataclass(frozen=True)
81
+ class _StyleParams:
82
+ text_color: RGBColor
83
+ highlight_color: RGBColor
84
+ border: int
85
+ background_color: RGBAColor | None
86
+ background_padding: int
87
+ highlight_size_multiplier: float
88
+
89
+
90
+ _STYLE_PRESETS: dict[SubtitleStyle, _StyleParams] = {
91
+ # Exactly the pre-redesign defaults.
92
+ SubtitleStyle.BOXED: _StyleParams((255, 235, 59), (76, 175, 80), 2, (0, 0, 0, 100), 15, 1.2),
93
+ SubtitleStyle.OUTLINE: _StyleParams((255, 255, 255), (255, 235, 59), 4, None, 0, 1.15),
94
+ SubtitleStyle.CLEAN: _StyleParams((255, 255, 255), (76, 175, 80), 2, None, 0, 1.1),
95
+ SubtitleStyle.KARAOKE: _StyleParams((255, 255, 255), (255, 90, 95), 3, (0, 0, 0, 120), 18, 1.25),
96
+ }
97
+
98
+ # region -> normalized (x, y) center of the box; anchor stays CENTER so the
99
+ # box is centered on this point. Chosen to sit inside a conventional safe area.
100
+ _REGION_POSITION: dict[SubtitleRegion, tuple[float, float]] = {
101
+ SubtitleRegion.TOP: (0.5, 0.18),
102
+ SubtitleRegion.CENTER: (0.5, 0.5),
103
+ SubtitleRegion.BOTTOM: (0.5, 0.82),
104
+ }
105
+
106
+
107
+ @dataclass(frozen=True)
108
+ class _CueBox:
109
+ """Absolute, frame-clamped placement of one cue's text box."""
110
+
111
+ x: int
112
+ y: int
113
+ box_w: int
114
+ height: int
115
+ fits: bool
116
+
117
+
118
+ @dataclass(frozen=True)
119
+ class _ResolvedConfig:
120
+ """Every override-or-preset field resolved to a concrete value once.
121
+
122
+ Deterministic from the model fields, so the dry-run and the render
123
+ derive identical geometry and look (parity).
124
+ """
125
+
126
+ position: tuple[float, float]
127
+ anchor: AnchorPoint
128
+ box_width: float
129
+ text_align: TextAlign
130
+ margin: int | tuple[int, int, int, int]
131
+ style: _StyleParams
132
+
133
+
134
+ @dataclass(frozen=True)
135
+ class _SubtitleLayout:
136
+ """Outcome of resolving the overlay against a concrete frame size.
137
+
138
+ ``segments`` are post-transform cues (``chunk`` + ``capitalize``) and
139
+ ``config`` is the resolved geometry/look -- both shared by render and
140
+ dry-run so they measure and draw identical boxes. ``font_px`` is the
141
+ single font size both paths use. ``fits`` is False with a populated
142
+ ``error`` only when a cue cannot fit even at the minimum legible size.
143
+ """
144
+
145
+ segments: list[TranscriptionSegment]
146
+ config: _ResolvedConfig
147
+ font_px: int
148
+ fits: bool
149
+ error: str | None
150
+
151
+
152
+ class TranscriptionOverlay(Effect):
153
+ """Renders animated word-by-word subtitles with the current word highlighted.
154
+
155
+ Each word lights up in the highlight color as it is spoken, based on
156
+ transcription timestamps. Requires a word-level transcription, which the
157
+ runner supplies via the ``requires=("transcription",)`` declaration.
158
+
159
+ Geometry is resolution-relative by default (``font_scale``/``region``), so
160
+ a plan validated by ``VideoEdit.validate()`` that passes will also render;
161
+ a plan that cannot fit fails fast in :meth:`predict_metadata` instead of
162
+ crashing mid-render after expensive upstream ops.
163
+ """
164
+
165
+ op: Literal["add_subtitles"] = "add_subtitles"
166
+ streamable: ClassVar[bool] = False
167
+ requires: ClassVar[tuple[str, ...]] = ("transcription",)
168
+
169
+ # ---- primary, resolution-independent surface ----
170
+ style: SubtitleStyle = Field(
171
+ SubtitleStyle.BOXED,
172
+ description='Look preset bundling colors/border/background/highlight: "boxed", "outline", "clean", "karaoke".',
173
+ )
174
+ region: SubtitleRegion = Field(
175
+ SubtitleRegion.BOTTOM,
176
+ description='Vertical placement band: "top", "center", or "bottom" of the frame.',
177
+ )
178
+ font_scale: float = Field(
179
+ 0.055,
180
+ gt=0.0,
181
+ le=0.5,
182
+ description=(
183
+ "Base font height as a fraction of frame height (resolution-independent; the recommended "
184
+ "way to size subtitles). Auto-shrinks toward `min_font_scale` if a cue would overflow."
185
+ ),
186
+ )
187
+ min_font_scale: float = Field(
188
+ 0.030,
189
+ gt=0.0,
190
+ le=0.5,
191
+ description=(
192
+ "Lower bound for auto-fit shrinking, as a fraction of frame height. A cue that cannot fit "
193
+ "even at this size is a validation error rather than an illegible render."
194
+ ),
195
+ )
196
+ max_words_per_cue: int | None = Field(
197
+ 5,
198
+ ge=1,
199
+ description=(
200
+ "Maximum words shown on screen at once. Each transcription segment is re-chunked into "
201
+ "cues of at most this many words, without bridging the silence gaps between segments, so "
202
+ "subtitles stay readable and don't linger over pauses. None preserves the source "
203
+ "transcription's segmentation."
204
+ ),
205
+ )
206
+ capitalize: bool = Field(
207
+ True,
208
+ description=(
209
+ "Capitalize the first letter of each sentence (first word, and words after '.', '!', '?'). "
210
+ "Fixes lowercase sentence starts from word-level speech-to-text. Set False to render text "
211
+ "exactly as transcribed."
212
+ ),
213
+ )
214
+ font_filename: str | None = Field(
215
+ None,
216
+ description="Path to a .ttf font file for rendering subtitle text, or None for the bundled default font.",
217
+ )
218
+ highlight_bold_font: str | None = Field(
219
+ None, description="Path to a bold .ttf font for the highlighted word, or None to use the regular font."
220
+ )
221
+
222
+ # ---- advanced overrides: None => derive from style/region/font_scale ----
223
+ font_size: int | None = Field(
224
+ None,
225
+ ge=1,
226
+ description=(
227
+ "Advanced override: absolute base font size in pixels. Leave None to derive from "
228
+ "`font_scale` (recommended -- resolution-independent and overflow-safe)."
229
+ ),
230
+ )
231
+ font_border_size: int | None = Field(
232
+ None, ge=0, description="Advanced override for outline thickness in px. None takes it from `style`."
233
+ )
234
+ text_color: RGBColor | None = Field(
235
+ None, description="Advanced override for default text color [R, G, B] (0-255). None takes it from `style`."
236
+ )
237
+ background_color: RGBAColor | None | Literal["auto"] = Field(
238
+ _AUTO,
239
+ description=(
240
+ 'Advanced override for the box background [R, G, B, A] (0-255). "auto" takes it from `style`; '
241
+ "null explicitly disables the background."
242
+ ),
243
+ )
244
+ background_padding: int | None = Field(
245
+ None, ge=0, description="Advanced override: px between text and background edge. None takes it from `style`."
246
+ )
247
+ highlight_color: RGBColor | None = Field(
248
+ None, description="Advanced override for the spoken-word color [R, G, B]. None takes it from `style`."
249
+ )
250
+ highlight_size_multiplier: float | None = Field(
251
+ None, gt=0, description="Advanced override: scale factor for the highlighted word. None takes it from `style`."
252
+ )
253
+ position: tuple[float, float] | None = Field(
254
+ None,
255
+ description="Advanced override: box center as normalized (x, y). None derives it from `region`.",
256
+ )
257
+ box_width: float | None = Field(
258
+ None,
259
+ gt=0.0,
260
+ le=1.0,
261
+ description="Advanced override: box width as a fraction of frame width in (0, 1]. None uses 0.6.",
262
+ )
263
+ text_align: TextAlign | None = Field(
264
+ None, description='Advanced override: text alignment within the box. None uses "center".'
265
+ )
266
+ anchor: AnchorPoint | None = Field(
267
+ None, description="Advanced override: which point of the box sits at the position. None uses center."
268
+ )
269
+ margin: int | tuple[int, int, int, int] | None = Field(
270
+ None,
271
+ description="Advanced override: space around the box in px (or [top, right, bottom, left]). None uses 20.",
272
+ )
273
+
274
+ # ------------------------------------------------------------- resolution
275
+
276
+ def _style_params(self) -> _StyleParams:
277
+ """Effective look: the ``style`` preset overlaid by any explicit overrides."""
278
+ p = _STYLE_PRESETS[self.style]
279
+ bg = p.background_color if self.background_color == _AUTO else self.background_color
280
+ return _StyleParams(
281
+ text_color=self.text_color or p.text_color,
282
+ highlight_color=self.highlight_color or p.highlight_color,
283
+ border=self.font_border_size if self.font_border_size is not None else p.border,
284
+ background_color=bg,
285
+ background_padding=(
286
+ self.background_padding if self.background_padding is not None else p.background_padding
287
+ ),
288
+ highlight_size_multiplier=(
289
+ self.highlight_size_multiplier
290
+ if self.highlight_size_multiplier is not None
291
+ else p.highlight_size_multiplier
292
+ ),
293
+ )
294
+
295
+ def _resolve_config(self) -> _ResolvedConfig:
296
+ """Resolve every override-or-preset field to a concrete value once."""
297
+ return _ResolvedConfig(
298
+ position=self.position if self.position is not None else _REGION_POSITION[self.region],
299
+ anchor=self.anchor if self.anchor is not None else AnchorPoint.CENTER,
300
+ box_width=self.box_width if self.box_width is not None else 0.6,
301
+ text_align=self.text_align if self.text_align is not None else TextAlign.CENTER,
302
+ margin=self.margin if self.margin is not None else 20,
303
+ style=self._style_params(),
304
+ )
305
+
306
+ def _transform(self, transcription: Transcription) -> Transcription:
307
+ """Apply the cue transforms render and dry-run MUST share."""
308
+ if self.max_words_per_cue is not None:
309
+ transcription = transcription.chunk_segments(self.max_words_per_cue)
310
+ if self.capitalize:
311
+ transcription = transcription.capitalize_sentences()
312
+ return transcription
313
+
314
def _place_cue(self, img_text: ImageText, text: str, font_px: int, cfg: _ResolvedConfig) -> _CueBox | None:
    """Measure one cue at ``font_px`` and clamp its box to the margin region.

    The text is measured with ``img_text.measure_text_box`` using the
    resolved position, box width, anchor and margin from ``cfg``. The
    drawable region comes from ``img_text.available_region(cfg.margin)``.

    Returns:
        ``None`` when the measured height is 0 (a degenerate, e.g.
        whitespace-only, cue). Otherwise a ``_CueBox`` holding the clamped
        top-left corner, the box size, and ``fits``. ``fits`` is ``False``
        when the box is wider or taller than the drawable region, i.e.
        clamping alone cannot bring it inside.
    """
    measured = img_text.measure_text_box(
        text=text,
        font_filename=self.font_filename,
        xy=cfg.position,
        box_width=cfg.box_width,
        font_size=font_px,
        anchor=cfg.anchor,
        margin=cfg.margin,
    )
    if measured.height == 0:
        return None

    cue_w = int(measured.width)
    cue_h = measured.height
    region_left, region_top, region_w, region_h = img_text.available_region(cfg.margin)

    # Largest top-left corner that still keeps the box inside the region.
    # When the box is oversized this falls before the region's own origin.
    max_x = region_left + region_w - cue_w
    max_y = region_top + region_h - cue_h
    wanted_x = int(round(measured.x))
    wanted_y = int(round(measured.y))

    return _CueBox(
        x=min(max(wanted_x, region_left), max_x),
        y=min(max(wanted_y, region_top), max_y),
        box_w=cue_w,
        height=cue_h,
        fits=cue_w <= region_w and cue_h <= region_h,
    )
342
+
343
def _resolve_layout(self, width: int, height: int, transcription: Transcription) -> _SubtitleLayout:
    """Resolve config, font size and fit for a ``width`` x ``height`` frame.

    Both ``apply`` and ``predict_metadata`` call this, so rendering and the
    dry-run agree on the outcome.

    The desired size is ``font_size`` when pinned, otherwise
    ``font_scale * height``. If every non-blank cue fits at that size it is
    used as-is. Otherwise the largest fitting size between the minimum and
    the desired size is found by binary search. If some cue does not fit
    even at the minimum, the returned layout has ``fits=False`` and an
    explanatory error message.
    """
    all_segments = self._transform(transcription).segments
    visible_cues = [seg for seg in all_segments if seg.text.strip()]
    resolved = self._resolve_config()

    if self.font_size is not None:
        target_px = self.font_size
    else:
        target_px = max(1, round(self.font_scale * height))
    legible_px = max(1, round(self.min_font_scale * height))
    # The search never goes above the desired size nor below the legible
    # floor; a font_size pinned below the floor is honored as the minimum.
    smallest_px = min(target_px, legible_px)

    probe = ImageText(image_size=(height, width), background=(0, 0, 0, 0))

    def overflowing_cue(px: int) -> TranscriptionSegment | None:
        """Return the first cue that does not fit at ``px``, or ``None``."""
        for cue in visible_cues:
            placed = self._place_cue(probe, cue.text, px, resolved)
            if placed is None or placed.fits:
                continue
            return cue
        return None

    if overflowing_cue(target_px) is None:
        return _SubtitleLayout(all_segments, resolved, target_px, True, None)

    offender = overflowing_cue(smallest_px)
    if offender is not None:
        error = (
            f"Subtitle cue {offender.text!r} cannot fit in a {width}x{height} frame even at the "
            f"minimum font size ({smallest_px}px, min_font_scale={self.min_font_scale}). Lower min_font_scale, "
            f"reduce max_words_per_cue, widen box_width, or render at a larger resolution."
        )
        return _SubtitleLayout(all_segments, resolved, smallest_px, False, error)

    # Fit is treated as monotonic in font size (a larger font never fits
    # where a smaller one did not), which is what makes bisection valid.
    # Invariant: every cue fits at fits_px, some cue overflows at overflow_px.
    fits_px, overflow_px = smallest_px, target_px
    while overflow_px - fits_px > 1:
        candidate = (fits_px + overflow_px) // 2
        if overflowing_cue(candidate) is None:
            fits_px = candidate
        else:
            overflow_px = candidate
    return _SubtitleLayout(all_segments, resolved, fits_px, True, None)
385
+
386
+ # ------------------------------------------------------------- timeline
387
+
388
def _get_active_segment(self, transcription: Transcription, timestamp: float) -> TranscriptionSegment | None:
    """Return the first segment whose ``[start, end]`` span contains ``timestamp``.

    Both ends of the span are inclusive. Returns ``None`` when no segment
    covers the timestamp.
    """
    covering = (seg for seg in transcription.segments if seg.start <= timestamp <= seg.end)
    return next(covering, None)
393
+
394
def _get_active_word_index(self, segment: TranscriptionSegment, timestamp: float) -> int | None:
    """Return the index of the first word whose ``[start, end]`` span contains ``timestamp``.

    Both ends of the span are inclusive. Returns ``None`` when the timestamp
    falls on no word of the segment.
    """
    matches = (pos for pos, spoken in enumerate(segment.words) if spoken.start <= timestamp <= spoken.end)
    return next(matches, None)
399
+
400
def _create_text_overlay(
    self,
    video_shape: tuple[int, int, int],
    segment: TranscriptionSegment,
    highlight_word_index: int | None,
    layout: _SubtitleLayout,
    cache: dict[tuple[str, int | None], np.ndarray],
) -> np.ndarray:
    """Render (or fetch from ``cache``) the overlay image for one cue state.

    The memo key is ``(segment.text, highlight_word_index)``. On a miss a
    transparent ``ImageText`` canvas of the frame size is created, the cue
    is placed with ``_place_cue`` at ``layout.font_px``, and the text is
    drawn when a box was returned. A degenerate cue leaves the canvas
    blank. The resulting array is stored in ``cache`` before it is returned.
    """
    frame_h, frame_w = video_shape[:2]
    memo_key = (segment.text, highlight_word_index)
    try:
        return cache[memo_key]
    except KeyError:
        pass

    resolved = layout.config
    canvas = ImageText(image_size=(frame_h, frame_w), background=(0, 0, 0, 0))
    placed = self._place_cue(canvas, segment.text, layout.font_px, resolved)
    if placed is not None:
        style = resolved.style
        # Draw at the already-clamped absolute position: top-left anchor,
        # explicit pixel box and zero margin, i.e. the same numbers that
        # _place_cue produced during layout resolution.
        canvas.write_text_box(
            text=segment.text,
            font_filename=self.font_filename,
            xy=(placed.x, placed.y),
            box_width=placed.box_w,
            font_size=layout.font_px,
            font_border_size=style.border,
            text_color=style.text_color,
            background_color=style.background_color,
            background_padding=style.background_padding,
            place=resolved.text_align,
            anchor=AnchorPoint.TOP_LEFT,
            margin=0,
            words=[entry.word for entry in segment.words],
            highlight_word_index=highlight_word_index,
            highlight_color=style.highlight_color,
            highlight_size_multiplier=style.highlight_size_multiplier,
            highlight_bold_font=self.highlight_bold_font,
        )

    rendered = canvas.img_array
    cache[memo_key] = rendered
    return rendered
444
+
445
def apply(  # type: ignore[override]
    self,
    video: Video,
    transcription: Transcription | None = None,
) -> Video:
    """Burn the subtitles into every frame of ``video`` and return a new video.

    Args:
        video: Source video; its frames, fps and audio are read.
        transcription: Transcription to render. Required despite the
            ``None`` default.

    Returns:
        A new ``Video`` built from the composited frames at the source fps,
        carrying the source audio.

    Raises:
        ValueError: If ``transcription`` is ``None``, or if the resolved
            layout reports that the cues cannot fit the frame.
    """
    if transcription is None:
        raise ValueError(
            "TranscriptionOverlay requires transcription data. "
            "Pass it via VideoEdit.run(context={'transcription': ...}) or directly to apply()."
        )

    frame_h, frame_w = video.frame_shape[:2]
    layout = self._resolve_layout(frame_w, frame_h, transcription)
    if not layout.fits:
        # Fail with the layout's own message before any frame is touched,
        # instead of hitting an error part-way through rendering.
        raise ValueError(layout.error)

    # Overlay memo scoped to this call (keyed by cue text + highlighted
    # word), so nothing rendered for one video is reused for another.
    overlays: dict[tuple[str, int | None], np.ndarray] = {}
    cues = Transcription(segments=layout.segments, language=transcription.language)

    logger.info("Applying transcription overlay (font %dpx)...", layout.font_px)
    composited = []
    for index, frame in enumerate(tqdm(video.frames, desc="Transcription overlay")):
        moment = index / video.fps
        cue = self._get_active_segment(cues, moment)
        if cue is not None:
            spoken_index = self._get_active_word_index(cue, moment)
            layer = self._create_text_overlay(video.frame_shape, cue, spoken_index, layout, overlays)
            frame = self._apply_overlay_to_frame(frame, layer)
        composited.append(frame)

    result = Video.from_frames(np.array(composited), fps=video.fps)
    result.audio = video.audio
    return result
488
+
489
def predict_metadata(
    self,
    meta: VideoMetadata,
    transcription: Transcription | None = None,
    **_context: Any,
) -> VideoMetadata:
    """Return ``meta`` unchanged, after checking that the subtitles can fit.

    The overlay does not alter the video's metadata, so the same object is
    always handed back. When a ``transcription`` is supplied, the layout is
    resolved for ``meta.width`` x ``meta.height`` and an un-fittable plan is
    rejected here, before any frame is rendered. Without a transcription
    the layout cannot be checked and the call is a plain identity.

    Raises:
        ValueError: If the resolved layout does not fit; the message is the
            layout's ``error`` text.
    """
    if transcription is not None:
        layout = self._resolve_layout(meta.width, meta.height, transcription)
        if not layout.fits:
            raise ValueError(layout.error)
    return meta
511
+
512
def _apply_overlay_to_frame(self, frame: np.ndarray, overlay: np.ndarray) -> np.ndarray:
    """Composite ``overlay`` onto ``frame`` and return the result as a new array.

    Both arrays are converted to PIL images. The overlay is pasted at the
    origin and is also passed as the paste mask.
    """
    canvas = Image.fromarray(frame)
    layer = Image.fromarray(overlay)
    canvas.paste(layer, box=(0, 0), mask=layer)
    return np.array(canvas)
@@ -1,186 +0,0 @@
1
- """Subtitle overlay effect.
2
-
3
- ``TranscriptionOverlay`` is an :class:`Effect` that renders animated
4
- word-by-word subtitles onto a :class:`Video` using a word-level
5
- :class:`Transcription`. Rendering is delegated to ``ImageText`` from
6
- the sibling module.
7
- """
8
-
9
- from __future__ import annotations
10
-
11
- import logging
12
- from typing import ClassVar, Literal
13
-
14
- import numpy as np
15
- from PIL import Image
16
- from pydantic import Field, PrivateAttr
17
- from tqdm import tqdm
18
-
19
- from videopython.base.image_text import AnchorPoint, ImageText, TextAlign
20
- from videopython.base.transcription import Transcription, TranscriptionSegment
21
- from videopython.base.video import Video
22
- from videopython.editing.operation import Effect
23
-
24
- __all__ = ["TranscriptionOverlay"]
25
-
26
- logger = logging.getLogger(__name__)
27
-
28
-
29
- class TranscriptionOverlay(Effect):
30
- """Renders animated word-by-word subtitles with the current word highlighted.
31
-
32
- Each word lights up in the highlight color as it is spoken, based on
33
- transcription timestamps. Requires a word-level transcription, which the
34
- runner supplies via the ``requires=("transcription",)`` declaration.
35
- """
36
-
37
- op: Literal["add_subtitles"] = "add_subtitles"
38
- streamable: ClassVar[bool] = False
39
- requires: ClassVar[tuple[str, ...]] = ("transcription",)
40
-
41
- font_filename: str | None = Field(
42
- None,
43
- description="Path to a .ttf font file for rendering subtitle text, or None for the bundled default font.",
44
- )
45
- font_size: int = Field(40, ge=1, description="Base font size in pixels.")
46
- font_border_size: int = Field(
47
- 2, ge=0, description="Outline thickness around each character in pixels. 0 = no outline."
48
- )
49
- text_color: tuple[int, int, int] = Field((255, 235, 59), description="Default text color as [R, G, B], each 0-255.")
50
- background_color: tuple[int, int, int, int] | None = Field(
51
- (0, 0, 0, 100),
52
- description="Subtitle box background as [R, G, B, A] (0-255), or None to disable the background.",
53
- )
54
- background_padding: int = Field(15, ge=0, description="Pixels of space between text and background edge.")
55
- position: tuple[float, float] = Field(
56
- (0.5, 0.7),
57
- description="Text box center as normalized (x, y). (0, 0) = top-left, (1, 1) = bottom-right.",
58
- )
59
- box_width: float = Field(
60
- 0.6, gt=0.0, le=1.0, description="Width of the text box as a fraction of frame width, in (0, 1]."
61
- )
62
- text_align: TextAlign = Field(
63
- TextAlign.CENTER, description='Text alignment within the box: "left", "right", or "center".'
64
- )
65
- anchor: AnchorPoint = Field(
66
- AnchorPoint.CENTER, description="Which point of the text box sits at the position coordinate."
67
- )
68
- margin: int | tuple[int, int, int, int] = Field(
69
- 20,
70
- description="Space around the text box in pixels, or a [top, right, bottom, left] tuple of per-side values.",
71
- )
72
- highlight_color: tuple[int, int, int] = Field(
73
- (76, 175, 80), description="Color for the currently spoken word as [R, G, B]."
74
- )
75
- highlight_size_multiplier: float = Field(
76
- 1.2, gt=0, description="Scale factor for the highlighted word. 1.0 = same size, 1.2 = 20% larger."
77
- )
78
- highlight_bold_font: str | None = Field(
79
- None, description="Path to a bold .ttf font for the highlighted word, or None to use the regular font."
80
- )
81
- max_words_per_cue: int | None = Field(
82
- 5,
83
- ge=1,
84
- description=(
85
- "Maximum words shown on screen at once. Each transcription segment is re-chunked into "
86
- "cues of at most this many words, without bridging the silence gaps between segments, so "
87
- "subtitles stay readable and don't linger over pauses. None preserves the source "
88
- "transcription's segmentation."
89
- ),
90
- )
91
- capitalize: bool = Field(
92
- True,
93
- description=(
94
- "Capitalize the first letter of each sentence (first word, and words after '.', '!', '?'). "
95
- "Fixes lowercase sentence starts from word-level speech-to-text. Set False to render text "
96
- "exactly as transcribed."
97
- ),
98
- )
99
-
100
- _overlay_cache: dict[tuple[str, int | None], np.ndarray] = PrivateAttr(default_factory=dict)
101
-
102
- def _get_active_segment(self, transcription: Transcription, timestamp: float) -> TranscriptionSegment | None:
103
- for segment in transcription.segments:
104
- if segment.start <= timestamp <= segment.end:
105
- return segment
106
- return None
107
-
108
- def _get_active_word_index(self, segment: TranscriptionSegment, timestamp: float) -> int | None:
109
- for i, word in enumerate(segment.words):
110
- if word.start <= timestamp <= word.end:
111
- return i
112
- return None
113
-
114
- def _create_text_overlay(
115
- self,
116
- video_shape: tuple[int, int, int],
117
- segment: TranscriptionSegment,
118
- highlight_word_index: int | None,
119
- ) -> np.ndarray:
120
- height, width = video_shape[:2]
121
- cache_key = (segment.text, highlight_word_index)
122
- if cache_key in self._overlay_cache:
123
- return self._overlay_cache[cache_key]
124
-
125
- img_text = ImageText(image_size=(height, width), background=(0, 0, 0, 0))
126
- img_text.write_text_box(
127
- text=segment.text,
128
- font_filename=self.font_filename,
129
- xy=self.position,
130
- box_width=self.box_width,
131
- font_size=self.font_size,
132
- font_border_size=self.font_border_size,
133
- text_color=self.text_color,
134
- background_color=self.background_color,
135
- background_padding=self.background_padding,
136
- place=self.text_align,
137
- anchor=self.anchor,
138
- margin=self.margin,
139
- words=[w.word for w in segment.words],
140
- highlight_word_index=highlight_word_index,
141
- highlight_color=self.highlight_color,
142
- highlight_size_multiplier=self.highlight_size_multiplier,
143
- highlight_bold_font=self.highlight_bold_font,
144
- )
145
-
146
- overlay_image = img_text.img_array
147
- self._overlay_cache[cache_key] = overlay_image
148
- return overlay_image
149
-
150
- def apply( # type: ignore[override]
151
- self,
152
- video: Video,
153
- transcription: Transcription | None = None,
154
- ) -> Video:
155
- if transcription is None:
156
- raise ValueError(
157
- "TranscriptionOverlay requires transcription data. "
158
- "Pass it via VideoEdit.run(context={'transcription': ...}) or directly to apply()."
159
- )
160
-
161
- if self.max_words_per_cue is not None:
162
- transcription = transcription.chunk_segments(self.max_words_per_cue)
163
- if self.capitalize:
164
- transcription = transcription.capitalize_sentences()
165
-
166
- logger.info("Applying transcription overlay...")
167
- new_frames = []
168
- for frame_index, frame in enumerate(tqdm(video.frames, desc="Transcription overlay")):
169
- timestamp = frame_index / video.fps
170
- active_segment = self._get_active_segment(transcription, timestamp)
171
- if active_segment is None:
172
- new_frames.append(frame)
173
- continue
174
- highlight_word_index = self._get_active_word_index(active_segment, timestamp)
175
- text_overlay = self._create_text_overlay(video.frame_shape, active_segment, highlight_word_index)
176
- new_frames.append(self._apply_overlay_to_frame(frame, text_overlay))
177
-
178
- new_video = Video.from_frames(np.array(new_frames), fps=video.fps)
179
- new_video.audio = video.audio
180
- return new_video
181
-
182
- def _apply_overlay_to_frame(self, frame: np.ndarray, overlay: np.ndarray) -> np.ndarray:
183
- frame_pil = Image.fromarray(frame)
184
- overlay_pil = Image.fromarray(overlay)
185
- frame_pil.paste(overlay_pil, (0, 0), overlay_pil)
186
- return np.array(frame_pil)
File without changes
File without changes
File without changes