videopython 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

@@ -1,37 +1,66 @@
-from typing import Literal
+from typing import Literal, Union
 
 import whisper
+from soundpython import Audio
 
-from videopython.base.transcription import Transcription, TranscriptionSegment
+from videopython.base.text.transcription import Transcription, TranscriptionSegment, TranscriptionWord
 from videopython.base.video import Video
 
 
-class VideoTranscription:
+class CreateTranscription:
+    """Unified transcription service for both audio and video."""
+
     def __init__(self, model_name: Literal["tiny", "base", "small", "medium", "large", "turbo"] = "small") -> None:
         self.model = whisper.load_model(name=model_name)
 
-    def transcribe_video(self, video: Video) -> Transcription:
-        """Transcribes video to text.
+    def _process_transcription_result(self, transcription_result: dict) -> Transcription:
+        """Process raw transcription result into Transcription object.
 
         Args:
-            video: Video to transcribe.
+            transcription_result: Raw result from whisper model
 
         Returns:
-            List of dictionaries with segments of text and their start and end times.
+            Processed Transcription object
         """
-        if video.audio.is_silent:
-            return Transcription(segments=[])
+        transcription_segments = []
+        for segment in transcription_result["segments"]:
+            transcription_words = [
+                TranscriptionWord(word=word["word"], start=float(word["start"]), end=float(word["end"]))
+                for word in segment["words"]
+            ]
+            transcription_segment = TranscriptionSegment(
+                start=segment["start"], end=segment["end"], text=segment["text"], words=transcription_words
+            )
+            transcription_segments.append(transcription_segment)
+
+        return Transcription(segments=transcription_segments)
+
+    def transcribe(self, media: Union[Audio, Video]) -> Transcription:
+        """Transcribe audio or video to text.
+
+        Args:
+            media: Audio or Video to transcribe.
+
+        Returns:
+            Transcription object with segments of text and their timestamps.
+        """
+        if isinstance(media, Video):
+            # Handle video transcription
+            if media.audio.is_silent:
+                return Transcription(segments=[])
+
+            audio = media.audio.to_mono().resample(whisper.audio.SAMPLE_RATE)
+            transcription_result = self.model.transcribe(audio=audio.data, word_timestamps=True)
 
-        audio = video.audio.to_mono()
-        audio = audio.resample(whisper.audio.SAMPLE_RATE)
-        audio_data = audio.data
+        elif isinstance(media, Audio):
+            # Handle audio transcription
+            if media.is_silent:
+                return Transcription(segments=[])
 
-        transcription = self.model.transcribe(audio=audio_data, word_timestamps=True)
+            audio = media.to_mono().resample(whisper.audio.SAMPLE_RATE)
+            transcription_result = self.model.transcribe(audio=audio.data, word_timestamps=True)
 
-        transcription_segments = [
-            TranscriptionSegment(start=segment["start"], end=segment["end"], text=segment["text"])
-            for segment in transcription["segments"]
-        ]
-        result = Transcription(segments=transcription_segments)
+        else:
+            raise TypeError(f"Unsupported media type: {type(media)}. Expected Audio or Video.")
 
-        return result
+        return self._process_transcription_result(transcription_result)
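Taken together, the changes above replace `VideoTranscription.transcribe_video` with a `CreateTranscription` class whose single `transcribe` entry point accepts either a `soundpython` `Audio` or a videopython `Video`, and whose segments now carry word-level timestamps. A minimal usage sketch, assuming `CreateTranscription` is importable from the package root and that a `Video.from_path` loader exists (neither the import path nor the loader name appears in this diff):

    from videopython import CreateTranscription  # assumed import path
    from videopython.base.video import Video

    video = Video.from_path("clip.mp4")  # hypothetical loader name
    transcriber = CreateTranscription(model_name="small")
    transcription = transcriber.transcribe(video)  # a soundpython Audio works too

    for segment in transcription.segments:
        print(f"[{segment.start:.2f}-{segment.end:.2f}] {segment.text}")
        for word in segment.words:
            print(f"  {word.word}: {word.start:.2f}-{word.end:.2f}")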
@@ -0,0 +1,45 @@
+from typing import Literal
+
+import numpy as np
+
+from videopython.base.transforms import ResampleFPS, Resize
+from videopython.base.video import Video
+
+
+class StackVideos:
+    def __init__(self, mode: Literal["horizontal", "vertical"]) -> None:
+        self.mode = mode
+
+    def _validate(self, video1: Video, video2: Video) -> tuple[Video, Video]:
+        video1, video2 = self._align_shapes(video1, video2)
+        video1, video2 = self._align_fps(video1, video2)
+        video1, video2 = self._align_duration(video1, video2)
+        return video1, video2
+
+    def _align_fps(self, video1: Video, video2: Video) -> tuple[Video, Video]:
+        if video1.fps > video2.fps:
+            video1 = ResampleFPS(fps=video2.fps).apply(video1)
+        elif video1.fps < video2.fps:
+            video2 = ResampleFPS(fps=video1.fps).apply(video2)
+        return (video1, video2)
+
+    def _align_shapes(self, video1: Video, video2: Video) -> tuple[Video, Video]:
+        if self.mode == "horizontal":
+            video2 = Resize(height=video1.metadata.height).apply(video2)
+        elif self.mode == "vertical":
+            video2 = Resize(width=video1.metadata.width).apply(video2)
+        return (video1, video2)
+
+    def _align_duration(self, video1: Video, video2: Video) -> tuple[Video, Video]:
+        if len(video1.frames) > len(video2.frames):
+            video1 = video1[: len(video2.frames)]
+        elif len(video1.frames) < len(video2.frames):
+            video2 = video2[: len(video1.frames)]
+        return (video1, video2)
+
+    def apply(self, videos: tuple[Video, Video]) -> Video:
+        videos = self._validate(*videos)
+        axis = 1 if self.mode == "vertical" else 2
+        new_frames = np.concatenate((videos[0].frames, videos[1].frames), axis=axis)
+        new_audio = videos[0].audio.overlay(videos[1].audio)
+        return Video(frames=new_frames, fps=videos[0].fps, audio=new_audio)
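The new `StackVideos` transform normalizes the second clip to the first, matching height (or width), then FPS, then trimming both to the shorter frame count, before concatenating frames along the chosen axis and mixing the audio tracks. A hedged sketch of its use (the module path and loader name are assumptions, not shown in this diff):

    from videopython.base.video import Video
    # StackVideos is defined in the new file above; its module path is assumed here.
    from videopython.base.compose import StackVideos

    left = Video.from_path("a.mp4")   # hypothetical loader name
    right = Video.from_path("b.mp4")

    # "horizontal" joins frames along axis 2 (width); "vertical" along axis 1 (height).
    stacked = StackVideos(mode="horizontal").apply((left, right))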
@@ -1,10 +1,21 @@
+"""
+Beware, the code below was heavily "vibe-coded".
+
+The main purpose of this file is to provide 2 classes:
+1. `ImageText` class for creating RGBA image with rendered subtitles
+2. `TranscriptionOverlay` class, which takes the `Transcription` and `Video` objects and overlays subtitles on `Video`.
+"""
+
 from enum import Enum
 from typing import TypeAlias, Union
 
 import numpy as np
 from PIL import Image, ImageDraw, ImageFont
+from tqdm import tqdm
 
 from videopython.base.exceptions import OutOfBoundsError
+from videopython.base.text.transcription import Transcription, TranscriptionSegment
+from videopython.base.video import Video
 
 # Type aliases for clarity
 MarginType: TypeAlias = Union[int, tuple[int, int, int, int]]
@@ -319,6 +330,7 @@ class ImageText:
         font_filename: str,
         xy: PositionType,
         font_size: int | None = 11,
+        font_border_size: int = 0,
         color: RGBColor = (0, 0, 0),
         max_width: int | None = None,
         max_height: int | None = None,
@@ -333,6 +345,7 @@ class ImageText:
             font_filename: Path to the font file
             xy: Position (x,y) either as absolute pixels (int) or relative to frame (float 0-1)
             font_size: Size of the font in points, or None to auto-calculate
+            font_border_size: Size of border around text in pixels (0 for no border)
             color: RGB color of the text
             max_width: Maximum width for auto font sizing
             max_height: Maximum height for auto font sizing
@@ -355,6 +368,9 @@ class ImageText:
         if font_size is not None and font_size <= 0:
             raise ValueError("Font size must be positive")
 
+        if font_border_size < 0:
+            raise ValueError("Font border size cannot be negative")
+
         if font_size is None and (max_width is None or max_height is None):
             raise ValueError("Must set either `font_size`, or both `max_width` and `max_height`!")
         elif font_size is None:
@@ -371,6 +387,15 @@ class ImageText:
         if x < 0 or y < 0 or x + text_dimensions[0] > self.image_size[0] or y + text_dimensions[1] > self.image_size[1]:
             raise OutOfBoundsError(f"Text with size {text_dimensions} at position ({x}, {y}) is out of bounds!")
 
+        # Draw border if requested
+        if font_border_size > 0:
+            # Draw text border by drawing text in multiple positions around the main text
+            for border_x in range(-font_border_size, font_border_size + 1):
+                for border_y in range(-font_border_size, font_border_size + 1):
+                    if border_x != 0 or border_y != 0:  # Skip the center position
+                        self._draw.text((x + border_x, y + border_y), text, font=font, fill=(0, 0, 0))
+
+        # Draw the main text on top
         self._draw.text((x, y), text, font=font, fill=color)
         return text_dimensions
 
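The border added here is a stamped outline: the text is redrawn in black at every offset of a (2 * font_border_size + 1) squared grid except the center, and the colored fill is then drawn once on top. The same technique in plain Pillow, as a self-contained sketch (Pillow's built-in `stroke_width` argument to `ImageDraw.text` is an alternative):

    from PIL import Image, ImageDraw, ImageFont

    img = Image.new("RGBA", (360, 100), (0, 0, 0, 0))
    draw = ImageDraw.Draw(img)
    font = ImageFont.load_default()  # stand-in; videopython loads a TTF by path
    border = 2

    # Stamp the outline: every offset in the grid except (0, 0).
    for dx in range(-border, border + 1):
        for dy in range(-border, border + 1):
            if dx or dy:
                draw.text((40 + dx, 40 + dy), "Subtitle", font=font, fill=(0, 0, 0))

    # Draw the fill color once, on top of the stamped outline.
    draw.text((40, 40), "Subtitle", font=font, fill=(255, 235, 59))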
@@ -423,6 +448,46 @@ class ImageText:
         except Exception as e:
             raise ValueError(f"Error measuring text: {str(e)}")
 
+    def _get_font_baseline_offset(
+        self, base_font_filename: str, base_font_size: int, highlight_font_filename: str, highlight_font_size: int
+    ) -> int:
+        """
+        Calculate the vertical offset needed to align baselines of different fonts and sizes.
+
+        Args:
+            base_font_filename: Path to the base font file
+            base_font_size: Font size of normal text
+            highlight_font_filename: Path to the highlight font file
+            highlight_font_size: Font size of highlighted text
+
+        Returns:
+            Vertical offset in pixels to align highlighted text baseline with normal text baseline
+        """
+        base_font = self._get_font(base_font_filename, base_font_size)
+        highlight_font = self._get_font(highlight_font_filename, highlight_font_size)
+
+        # Use a reference character to get baseline metrics
+        # We use 'A' as it's a good reference for ascender height
+        ref_char = "A"
+
+        # Get bounding boxes for the reference character
+        base_bbox = base_font.getbbox(ref_char)
+        highlight_bbox = highlight_font.getbbox(ref_char)
+
+        if base_bbox is None or highlight_bbox is None:
+            return 0  # Fallback if bbox calculation fails
+
+        # The baseline offset is the difference in the top of the bounding box
+        # since getbbox returns (left, top, right, bottom) where top is negative for ascenders
+        base_ascent = -base_bbox[1]  # Distance from baseline to top of character
+        highlight_ascent = -highlight_bbox[1]  # Distance from baseline to top of character
+
+        # Calculate the offset needed to align baselines
+        # If highlighted text has a larger ascent, we need to move it down
+        baseline_offset = highlight_ascent - base_ascent
+
+        return baseline_offset
+
     def _split_lines_by_width(
         self,
         text: str,
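The helper above compares the top edge of each font's `getbbox("A")` and shifts the highlighted word by the difference so that both renderings share a baseline. A quick standalone check of the same arithmetic (the font path is a placeholder):

    from PIL import ImageFont

    base = ImageFont.truetype("DejaVuSans.ttf", 40)       # placeholder font path
    highlight = ImageFont.truetype("DejaVuSans.ttf", 48)  # e.g. a 1.2x highlight size

    # Same arithmetic as _get_font_baseline_offset: a positive result means the
    # larger highlighted word is nudged down to line up with the normal text.
    base_ascent = -base.getbbox("A")[1]
    highlight_ascent = -highlight.getbbox("A")[1]
    print(highlight_ascent - base_ascent)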
@@ -499,12 +564,18 @@ class ImageText:
         xy: PositionType,
         box_width: Union[int, float] | None = None,
         font_size: int = 11,
+        font_border_size: int = 0,
         text_color: RGBColor = (0, 0, 0),
         background_color: RGBAColor | None = None,
         background_padding: int = 0,
         place: TextAlign = TextAlign.LEFT,
         anchor: AnchorPoint = AnchorPoint.TOP_LEFT,
         margin: MarginType = 0,
+        words: list[str] | None = None,
+        highlight_word_index: int | None = None,
+        highlight_color: RGBColor | None = None,
+        highlight_size_multiplier: float = 1.5,
+        highlight_bold_font: str | None = None,
     ) -> tuple[int, int]:
         """
         Write text in a box with advanced positioning and alignment options.
@@ -515,12 +586,18 @@ class ImageText:
             xy: Position (x,y) either as absolute pixels (int) or relative to frame (float 0-1)
             box_width: Width of the box in pixels (int) or relative to frame width (float 0-1)
             font_size: Font size in points
+            font_border_size: Size of border around text in pixels (0 for no border)
             text_color: RGB color of the text
             background_color: If set, adds background color to the text box. Expects RGBA values.
             background_padding: Number of padding pixels to add when adding text background color
             place: Text alignment within the box (TextAlign.LEFT, TextAlign.RIGHT, TextAlign.CENTER)
             anchor: Which part of the text box to anchor at the position
             margin: Margin in pixels (single value or [top, right, bottom, left])
+            words: All words occurring in text, helpful for highlighting.
+            highlight_word_index: Index of word to highlight (0-based, None to disable highlighting)
+            highlight_color: RGB color for the highlighted word (defaults to text_color if None)
+            highlight_size_multiplier: Font size multiplier for highlighted word
+            highlight_bold_font: Path to bold font file for highlighted word (defaults to font_filename if None)
 
         Returns:
             Coordinates of the lower-right corner of the written text box (x, y)
@@ -541,6 +618,25 @@ class ImageText:
         if background_padding < 0:
             raise ValueError("Background padding cannot be negative")
 
+        if font_border_size < 0:
+            raise ValueError("Font border size cannot be negative")
+
+        # Validate highlighting parameters
+        if highlight_word_index is not None:
+            if not words:
+                words = text.split()
+            if highlight_word_index < 0 or highlight_word_index >= len(words):
+                raise ValueError(
+                    f"highlight_word_index {highlight_word_index} out of range for text with {len(words)} words"
+                )
+
+        if highlight_size_multiplier <= 0:
+            raise ValueError("highlight_size_multiplier must be positive")
+
+        # Set default highlight color if not provided
+        if highlight_word_index is not None and highlight_color is None:
+            highlight_color = text_color
+
         # Process margins to determine available area
         margin_top, margin_right, margin_bottom, margin_left = self._process_margin(margin)
         available_width = self.image_size[0] - margin_left - margin_right
@@ -590,6 +686,7 @@ class ImageText:
 
         # Write lines
         current_text_height = y_pos
+        word_index_offset = 0  # Track global word index across lines
        for line in lines:
             line_dimensions = self.get_text_dimensions(font_filename, font_size, line)
 
@@ -604,14 +701,49 @@ class ImageText:
                 valid_places = [e.value for e in TextAlign]
                 raise ValueError(f"Place '{place}' is not supported. Must be one of: {', '.join(valid_places)}")
 
-            # Write the line
-            self.write_text(
-                text=line,
-                font_filename=font_filename,
-                xy=(x_left, current_text_height),
-                font_size=font_size,
-                color=text_color,
-            )
+            # Check if highlighting is needed for this line
+            if highlight_word_index is not None:
+                line_words = line.split()
+                line_start_word_index = word_index_offset
+                line_end_word_index = word_index_offset + len(line_words) - 1
+
+                # Check if the highlighted word is in this line
+                if line_start_word_index <= highlight_word_index <= line_end_word_index:
+                    self._write_line_with_highlight(
+                        line=line,
+                        font_filename=font_filename,
+                        font_size=font_size,
+                        font_border_size=font_border_size,
+                        text_color=text_color,
+                        highlight_color=highlight_color or (255, 255, 255),
+                        highlight_size_multiplier=highlight_size_multiplier,
+                        highlight_word_local_index=highlight_word_index - line_start_word_index,
+                        highlight_bold_font=highlight_bold_font,
+                        x_left=int(x_left),
+                        y_top=int(current_text_height),
+                    )
+                else:
+                    # Write normal line without highlighting
+                    self.write_text(
+                        text=line,
+                        font_filename=font_filename,
+                        xy=(x_left, current_text_height),
+                        font_size=font_size,
+                        font_border_size=font_border_size,
+                        color=text_color,
+                    )
+
+                word_index_offset += len(line_words)
+            else:
+                # Write normal line without highlighting
+                self.write_text(
+                    text=line,
+                    font_filename=font_filename,
+                    xy=(x_left, current_text_height),
+                    font_size=font_size,
+                    font_border_size=font_border_size,
+                    color=text_color,
+                )
 
             # Increment vertical position for next line
             current_text_height += line_dimensions[1]
@@ -690,6 +822,88 @@ class ImageText:
 
         return (int(x_pos + box_width), int(current_text_height))
 
+    def _write_line_with_highlight(
+        self,
+        line: str,
+        font_filename: str,
+        font_size: int,
+        font_border_size: int,
+        text_color: RGBColor,
+        highlight_color: RGBColor,
+        highlight_size_multiplier: float,
+        highlight_word_local_index: int,
+        highlight_bold_font: str | None,
+        x_left: int,
+        y_top: int,
+    ) -> None:
+        """
+        Write a line of text with one word highlighted using word-by-word rendering with baseline alignment.
+
+        Args:
+            line: The text line to render
+            font_filename: Path to the font file
+            font_size: Base font size in points
+            font_border_size: Size of border around text in pixels (0 for no border)
+            text_color: RGB color for normal text
+            highlight_color: RGB color for highlighted word
+            highlight_size_multiplier: Font size multiplier for highlighted word
+            highlight_word_local_index: Index of word to highlight within this line (0-based)
+            highlight_bold_font: Path to bold font file for highlighted word (defaults to font_filename if None)
+            x_left: Left x position for the line
+            y_top: Top y position for the line
+        """
+        # Split line into words
+        words = line.split()
+        if highlight_word_local_index >= len(words):
+            return  # Safety check
+
+        # Calculate highlighted font size and determine font files
+        highlight_font_size = int(font_size * highlight_size_multiplier)
+        highlight_font_file = highlight_bold_font if highlight_bold_font is not None else font_filename
+
+        # Calculate baseline offset for highlighted words (using the appropriate font files)
+        baseline_offset = self._get_font_baseline_offset(
+            font_filename, font_size, highlight_font_file, highlight_font_size
+        )
+
+        # Render words one by one with proper spacing
+        current_x = x_left
+
+        for i, word in enumerate(words):
+            # Determine if this is the highlighted word
+            is_highlighted = i == highlight_word_local_index
+
+            # Choose font file, size, and color based on highlighting
+            word_font_file = highlight_font_file if is_highlighted else font_filename
+            word_font_size = highlight_font_size if is_highlighted else font_size
+            word_color = highlight_color if is_highlighted else text_color
+
+            # Calculate y position with baseline alignment
+            word_y = y_top
+            if is_highlighted:
+                word_y += baseline_offset
+
+            # Render the word
+            self.write_text(
+                text=word,
+                font_filename=word_font_file,
+                xy=(current_x, word_y),
+                font_size=word_font_size,
+                font_border_size=font_border_size,
+                color=word_color,
+            )
+
+            # Calculate the width of this word for spacing
+            word_width = self.get_text_dimensions(word_font_file, word_font_size, word)[0]
+
+            # Update current_x for next word (add word width plus space)
+            current_x += word_width
+
+            # Add space between words (except after the last word)
+            if i < len(words) - 1:
+                space_width = self.get_text_dimensions(font_filename, font_size, " ")[0]
+                current_x += space_width
+
     def _find_smallest_bounding_rect(self, mask: np.ndarray) -> tuple[int, int, int, int]:
         """
         Find the smallest bounding rectangle containing non-zero values in the mask.
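With `_write_line_with_highlight` in place, `write_text_box` can render one word of a subtitle line in a larger size and different color while the rest of the line keeps the base style. A sketch using only names defined in this file (the font path is a placeholder):

    img_text = ImageText(image_size=(1280, 720), background=(0, 0, 0, 0))
    img_text.write_text_box(
        text="the quick brown fox",
        font_filename="DejaVuSans.ttf",  # placeholder path
        xy=(0.5, 0.7),
        box_width=0.6,
        font_size=40,
        font_border_size=2,
        text_color=(255, 235, 59),
        place=TextAlign.CENTER,
        anchor=AnchorPoint.CENTER,
        highlight_word_index=2,          # "brown" is drawn larger and in green
        highlight_color=(76, 175, 80),
        highlight_size_multiplier=1.2,
    )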
@@ -725,3 +939,164 @@ class ImageText:
         xmin, xmax = col_indices[[0, -1]]
 
         return xmin, xmax, ymin, ymax
+
+
+class TranscriptionOverlay:
+    def __init__(
+        self,
+        font_filename: str,
+        font_size: int = 40,
+        font_border_size: int = 2,
+        text_color: RGBColor = (255, 235, 59),
+        background_color: RGBAColor | None = (0, 0, 0, 100),
+        background_padding: int = 15,
+        position: PositionType = (0.5, 0.7),
+        box_width: Union[int, float] = 0.6,
+        text_align: TextAlign = TextAlign.CENTER,
+        anchor: AnchorPoint = AnchorPoint.CENTER,
+        margin: MarginType = 20,
+        highlight_color: RGBColor = (76, 175, 80),
+        highlight_size_multiplier: float = 1.2,
+        highlight_bold_font: str | None = None,
+    ):
+        """
+        Initialize TranscriptionOverlay effect.
+
+        Args:
+            font_filename: Path to font file for text rendering
+            font_size: Base font size for text
+            text_color: RGB color for normal text
+            font_border_size: Size of border around text in pixels (0 for no border)
+            background_color: RGBA background color (None for no background)
+            background_padding: Padding around text background
+            position: Position of text box (relative 0-1 or absolute pixels)
+            box_width: Width of text box (relative 0-1 or absolute pixels)
+            text_align: Text alignment within box
+            anchor: Anchor point for text positioning
+            margin: Margin around text box
+            highlight_color: RGB color for highlighted words
+            highlight_size_multiplier: Size multiplier for highlighted words
+            highlight_bold_font: Optional bold font for highlighting
+        """
+        self.font_filename = font_filename
+        self.font_size = font_size
+        self.text_color = text_color
+        self.font_border_size = font_border_size
+        self.background_color = background_color
+        self.background_padding = background_padding
+        self.position = position
+        self.box_width = box_width
+        self.text_align = text_align
+        self.anchor = anchor
+        self.margin = margin
+        self.highlight_color = highlight_color
+        self.highlight_size_multiplier = highlight_size_multiplier
+        self.highlight_bold_font = highlight_bold_font
+
+        # Cache for text overlays to avoid regenerating identical frames
+        self._overlay_cache: dict[tuple[str, int | None], np.ndarray] = {}
+
+    def _get_active_segment(self, transcription: Transcription, timestamp: float) -> TranscriptionSegment | None:
+        """Get the transcription segment active at the given timestamp."""
+        for segment in transcription.segments:
+            if segment.start <= timestamp <= segment.end:
+                return segment
+        return None
+
+    def _get_active_word_index(self, segment: TranscriptionSegment, timestamp: float) -> int | None:
+        """Get the index of the word being spoken at the given timestamp within a segment."""
+        for i, word in enumerate(segment.words):
+            if word.start <= timestamp <= word.end:
+                return i
+        return None
+
+    def _create_text_overlay(
+        self, video_shape: tuple[int, int, int], segment: TranscriptionSegment, highlight_word_index: int | None
+    ) -> np.ndarray:
+        """Create a text overlay image for the given segment and highlight."""
+        # Use video frame dimensions for overlay
+        height, width = video_shape[:2]
+
+        # Create cache key based on segment text and highlight
+        cache_key = (segment.text, highlight_word_index)
+        if cache_key in self._overlay_cache:
+            return self._overlay_cache[cache_key]
+
+        # Create ImageText with video dimensions
+        img_text = ImageText(image_size=(width, height), background=(0, 0, 0, 0))
+
+        # Write text with highlighting
+        img_text.write_text_box(
+            text=segment.text,
+            font_filename=self.font_filename,
+            xy=self.position,
+            box_width=self.box_width,
+            font_size=self.font_size,
+            font_border_size=self.font_border_size,
+            text_color=self.text_color,
+            background_color=self.background_color,
+            background_padding=self.background_padding,
+            place=self.text_align,
+            anchor=self.anchor,
+            margin=self.margin,
+            words=[w.word for w in segment.words],
+            highlight_word_index=highlight_word_index,
+            highlight_color=self.highlight_color,
+            highlight_size_multiplier=self.highlight_size_multiplier,
+            highlight_bold_font=self.highlight_bold_font,
+        )
+
+        overlay_image = img_text.img_array
+
+        # Cache the overlay
+        self._overlay_cache[cache_key] = overlay_image
+
+        return overlay_image
+
+    def apply(self, video: Video, transcription: Transcription) -> Video:
+        """Apply transcription overlay to video frames."""
+        print("Applying transcription overlay...")
+
+        new_frames = []
+
+        for frame_idx, frame in enumerate(tqdm(video.frames)):
+            # Calculate timestamp for this frame
+            timestamp = frame_idx / video.fps
+
+            # Get active segment at this timestamp
+            active_segment = self._get_active_segment(transcription, timestamp)
+
+            if active_segment is None:
+                # No active transcription, keep original frame
+                new_frames.append(frame)
+                continue
+
+            # Get active word index for highlighting
+            highlight_word_index = self._get_active_word_index(active_segment, timestamp)
+
+            # Create text overlay
+            text_overlay = self._create_text_overlay(video.frame_shape, active_segment, highlight_word_index)
+
+            # Apply overlay to frame
+            overlaid_frame = self._apply_overlay_to_frame(frame, text_overlay)
+            new_frames.append(overlaid_frame)
+
+        # Create new video with overlaid frames
+        new_video = Video.from_frames(np.array(new_frames), fps=video.fps)
+        new_video.audio = video.audio  # Preserve audio
+
+        return new_video
+
+    def _apply_overlay_to_frame(self, frame: np.ndarray, overlay: np.ndarray) -> np.ndarray:
+        """Apply a text overlay to a single frame."""
+
+        # Convert frame to PIL Image
+        frame_pil = Image.fromarray(frame)
+
+        # Convert overlay to PIL Image
+        overlay_pil = Image.fromarray(overlay)
+
+        # Paste overlay onto frame using alpha channel
+        frame_pil.paste(overlay_pil, (0, 0), overlay_pil)
+
+        return np.array(frame_pil)
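Putting the release together: a `Transcription` produced by `CreateTranscription` feeds directly into `TranscriptionOverlay`, which burns word-highlighted subtitles into the clip frame by frame while preserving the original audio. An end-to-end sketch (loader name, import paths, and font path are assumptions; only the class names appear in this diff):

    video = Video.from_path("talk.mp4")  # hypothetical loader name
    transcription = CreateTranscription(model_name="small").transcribe(video)

    overlay = TranscriptionOverlay(font_filename="DejaVuSans-Bold.ttf")  # placeholder font
    subtitled = overlay.apply(video, transcription)  # audio is carried over unchanged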