videopython 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- videopython/ai/understanding/transcribe.py +48 -19
- videopython/base/combine.py +45 -0
- videopython/base/text/__init__.py +0 -0
- videopython/{utils/text.py → base/text/overlay.py} +383 -8
- videopython/base/text/transcription.py +121 -0
- videopython/base/utils.py +6 -0
- videopython/base/video.py +164 -77
- videopython-0.5.0.dist-info/METADATA +194 -0
- {videopython-0.4.0.dist-info → videopython-0.5.0.dist-info}/RECORD +11 -12
- videopython/base/compose.py +0 -55
- videopython/base/transcription.py +0 -13
- videopython/utils/__init__.py +0 -3
- videopython/utils/common.py +0 -31
- videopython/utils/image.py +0 -47
- videopython-0.4.0.dist-info/METADATA +0 -118
- {videopython-0.4.0.dist-info → videopython-0.5.0.dist-info}/WHEEL +0 -0
- {videopython-0.4.0.dist-info → videopython-0.5.0.dist-info}/licenses/LICENSE +0 -0
videopython/ai/understanding/transcribe.py (+48 -19)
@@ -1,37 +1,66 @@
-from typing import Literal
+from typing import Literal, Union
 
 import whisper
+from soundpython import Audio
 
-from videopython.base.transcription import Transcription, TranscriptionSegment
+from videopython.base.text.transcription import Transcription, TranscriptionSegment, TranscriptionWord
 from videopython.base.video import Video
 
 
-class
+class CreateTranscription:
+    """Unified transcription service for both audio and video."""
+
     def __init__(self, model_name: Literal["tiny", "base", "small", "medium", "large", "turbo"] = "small") -> None:
         self.model = whisper.load_model(name=model_name)
 
-    def
-        """
+    def _process_transcription_result(self, transcription_result: dict) -> Transcription:
+        """Process raw transcription result into Transcription object.
 
         Args:
-
+            transcription_result: Raw result from whisper model
 
         Returns:
-
+            Processed Transcription object
         """
-
-
+        transcription_segments = []
+        for segment in transcription_result["segments"]:
+            transcription_words = [
+                TranscriptionWord(word=word["word"], start=float(word["start"]), end=float(word["end"]))
+                for word in segment["words"]
+            ]
+            transcription_segment = TranscriptionSegment(
+                start=segment["start"], end=segment["end"], text=segment["text"], words=transcription_words
+            )
+            transcription_segments.append(transcription_segment)
+
+        return Transcription(segments=transcription_segments)
+
+    def transcribe(self, media: Union[Audio, Video]) -> Transcription:
+        """Transcribe audio or video to text.
+
+        Args:
+            media: Audio or Video to transcribe.
+
+        Returns:
+            Transcription object with segments of text and their timestamps.
+        """
+        if isinstance(media, Video):
+            # Handle video transcription
+            if media.audio.is_silent:
+                return Transcription(segments=[])
+
+            audio = media.audio.to_mono().resample(whisper.audio.SAMPLE_RATE)
+            transcription_result = self.model.transcribe(audio=audio.data, word_timestamps=True)
 
-
-
-
+        elif isinstance(media, Audio):
+            # Handle audio transcription
+            if media.is_silent:
+                return Transcription(segments=[])
 
-
+            audio = media.to_mono().resample(whisper.audio.SAMPLE_RATE)
+            transcription_result = self.model.transcribe(audio=audio.data, word_timestamps=True)
 
-
-
-            for segment in transcription["segments"]
-        ]
-        result = Transcription(segments=transcription_segments)
+        else:
+            raise TypeError(f"Unsupported media type: {type(media)}. Expected Audio or Video.")
 
-        return
+        return self._process_transcription_result(transcription_result)
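For context, a minimal usage sketch of the new unified API. The loader names `Video.from_path` and `Audio.from_file` are assumptions for illustration, not confirmed by this diff:

from soundpython import Audio

from videopython.ai.understanding.transcribe import CreateTranscription
from videopython.base.video import Video

transcriber = CreateTranscription(model_name="small")

# Video input: the audio track is taken from the video, mixed down to mono,
# and resampled to whisper's expected sample rate internally.
video = Video.from_path("clip.mp4")  # assumed loader name
transcription = transcriber.transcribe(video)

# Audio input goes through the same path without the extraction step.
audio = Audio.from_file("voiceover.mp3")  # assumed loader name
transcription = transcriber.transcribe(audio)

for segment in transcription.segments:
    print(f"[{segment.start:.2f}-{segment.end:.2f}]{segment.text}")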
videopython/base/combine.py (new file, +45 -0)
@@ -0,0 +1,45 @@
+from typing import Literal
+
+import numpy as np
+
+from videopython.base.transforms import ResampleFPS, Resize
+from videopython.base.video import Video
+
+
+class StackVideos:
+    def __init__(self, mode: Literal["horizontal", "vertical"]) -> None:
+        self.mode = mode
+
+    def _validate(self, video1: Video, video2: Video) -> tuple[Video, Video]:
+        video1, video2 = self._align_shapes(video1, video2)
+        video1, video2 = self._align_fps(video1, video2)
+        video1, video2 = self._align_duration(video1, video2)
+        return video1, video2
+
+    def _align_fps(self, video1: Video, video2: Video) -> tuple[Video, Video]:
+        if video1.fps > video2.fps:
+            video1 = ResampleFPS(fps=video2.fps).apply(video1)
+        elif video1.fps < video2.fps:
+            video2 = ResampleFPS(fps=video1.fps).apply(video2)
+        return (video1, video2)
+
+    def _align_shapes(self, video1: Video, video2: Video) -> tuple[Video, Video]:
+        if self.mode == "horizontal":
+            video2 = Resize(height=video1.metadata.height).apply(video2)
+        elif self.mode == "vertical":
+            video2 = Resize(width=video1.metadata.width).apply(video2)
+        return (video1, video2)
+
+    def _align_duration(self, video1: Video, video2: Video) -> tuple[Video, Video]:
+        if len(video1.frames) > len(video2.frames):
+            video1 = video1[: len(video2.frames)]
+        elif len(video1.frames) < len(video2.frames):
+            video2 = video2[: len(video1.frames)]
+        return (video1, video2)
+
+    def apply(self, videos: tuple[Video, Video]) -> Video:
+        videos = self._validate(*videos)
+        axis = 1 if self.mode == "vertical" else 2
+        new_frames = np.concatenate((videos[0].frames, videos[1].frames), axis=axis)
+        new_audio = videos[0].audio.overlay(videos[1].audio)
+        return Video(frames=new_frames, fps=videos[0].fps, audio=new_audio)
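A usage sketch for the new `StackVideos` combiner. Before stacking, the second clip is resized so the shared edge matches, both clips are resampled to the lower FPS, both are truncated to the shorter frame count, and the audio tracks are overlaid. `Video.from_path` is an assumed loader name:

from videopython.base.combine import StackVideos
from videopython.base.video import Video

left = Video.from_path("left.mp4")    # assumed loader name
right = Video.from_path("right.mp4")  # assumed loader name

# "horizontal" matches heights and concatenates along width (axis=2);
# "vertical" matches widths and concatenates along height (axis=1).
side_by_side = StackVideos(mode="horizontal").apply((left, right))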
videopython/base/text/__init__.py (+0 -0)
File without changes.
videopython/{utils/text.py → base/text/overlay.py} (+383 -8)
@@ -1,10 +1,21 @@
+"""
+Beware, the code below was heavily "vibe-coded".
+
+The main purpose of this file are 2 classes:
+1. `ImageText` class for creating RGBA image with rendered subtitles
+2. `TranscriptionOverlay` class, which takes the `Transcription` and `Video` objects and overlays subtitles on `Video`.
+"""
+
 from enum import Enum
 from typing import TypeAlias, Union
 
 import numpy as np
 from PIL import Image, ImageDraw, ImageFont
+from tqdm import tqdm
 
 from videopython.base.exceptions import OutOfBoundsError
+from videopython.base.text.transcription import Transcription, TranscriptionSegment
+from videopython.base.video import Video
 
 # Type aliases for clarity
 MarginType: TypeAlias = Union[int, tuple[int, int, int, int]]
@@ -319,6 +330,7 @@ class ImageText:
         font_filename: str,
         xy: PositionType,
         font_size: int | None = 11,
+        font_border_size: int = 0,
         color: RGBColor = (0, 0, 0),
         max_width: int | None = None,
         max_height: int | None = None,
@@ -333,6 +345,7 @@ class ImageText:
             font_filename: Path to the font file
             xy: Position (x,y) either as absolute pixels (int) or relative to frame (float 0-1)
             font_size: Size of the font in points, or None to auto-calculate
+            font_border_size: Size of border around text in pixels (0 for no border)
             color: RGB color of the text
             max_width: Maximum width for auto font sizing
             max_height: Maximum height for auto font sizing
@@ -355,6 +368,9 @@ class ImageText:
         if font_size is not None and font_size <= 0:
             raise ValueError("Font size must be positive")
 
+        if font_border_size < 0:
+            raise ValueError("Font border size cannot be negative")
+
         if font_size is None and (max_width is None or max_height is None):
             raise ValueError("Must set either `font_size`, or both `max_width` and `max_height`!")
         elif font_size is None:
@@ -371,6 +387,15 @@ class ImageText:
         if x < 0 or y < 0 or x + text_dimensions[0] > self.image_size[0] or y + text_dimensions[1] > self.image_size[1]:
             raise OutOfBoundsError(f"Text with size {text_dimensions} at position ({x}, {y}) is out of bounds!")
 
+        # Draw border if requested
+        if font_border_size > 0:
+            # Draw text border by drawing text in multiple positions around the main text
+            for border_x in range(-font_border_size, font_border_size + 1):
+                for border_y in range(-font_border_size, font_border_size + 1):
+                    if border_x != 0 or border_y != 0:  # Skip the center position
+                        self._draw.text((x + border_x, y + border_y), text, font=font, fill=(0, 0, 0))
+
+        # Draw the main text on top
         self._draw.text((x, y), text, font=font, fill=color)
         return text_dimensions
 
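The border here is a dilation-style outline: the text is re-stamped at every integer offset in a (2k+1)×(2k+1) neighborhood around the target position (center skipped), always in black, before the colored text is drawn on top. Note the loop hard-codes the border color to black regardless of `color`. Pillow's `ImageDraw.text` can produce a similar outline natively via `stroke_width`/`stroke_fill` (available since Pillow 6.2); a comparison sketch, with an assumed font path:

from PIL import Image, ImageDraw, ImageFont

img = Image.new("RGBA", (400, 100), (0, 0, 0, 0))
draw = ImageDraw.Draw(img)
font = ImageFont.truetype("DejaVuSans.ttf", 36)  # assumed font path
# stroke_width plays the role of font_border_size, stroke_fill the border color
draw.text((10, 30), "Subtitle", font=font, fill=(255, 235, 59),
          stroke_width=2, stroke_fill=(0, 0, 0))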
@@ -423,6 +448,46 @@ class ImageText:
         except Exception as e:
             raise ValueError(f"Error measuring text: {str(e)}")
 
+    def _get_font_baseline_offset(
+        self, base_font_filename: str, base_font_size: int, highlight_font_filename: str, highlight_font_size: int
+    ) -> int:
+        """
+        Calculate the vertical offset needed to align baselines of different fonts and sizes.
+
+        Args:
+            base_font_filename: Path to the base font file
+            base_font_size: Font size of normal text
+            highlight_font_filename: Path to the highlight font file
+            highlight_font_size: Font size of highlighted text
+
+        Returns:
+            Vertical offset in pixels to align highlighted text baseline with normal text baseline
+        """
+        base_font = self._get_font(base_font_filename, base_font_size)
+        highlight_font = self._get_font(highlight_font_filename, highlight_font_size)
+
+        # Use a reference character to get baseline metrics
+        # We use 'A' as it's a good reference for ascender height
+        ref_char = "A"
+
+        # Get bounding boxes for the reference character
+        base_bbox = base_font.getbbox(ref_char)
+        highlight_bbox = highlight_font.getbbox(ref_char)
+
+        if base_bbox is None or highlight_bbox is None:
+            return 0  # Fallback if bbox calculation fails
+
+        # The baseline offset is the difference in the top of the bounding box
+        # since getbbox returns (left, top, right, bottom) where top is negative for ascenders
+        base_ascent = -base_bbox[1]  # Distance from baseline to top of character
+        highlight_ascent = -highlight_bbox[1]  # Distance from baseline to top of character
+
+        # Calculate the offset needed to align baselines
+        # If highlighted text has a larger ascent, we need to move it down
+        baseline_offset = highlight_ascent - base_ascent
+
+        return baseline_offset
+
     def _split_lines_by_width(
         self,
         text: str,
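A worked instance of the offset arithmetic above, with made-up metrics (illustration only, not measured values):

# Hypothetical ascents read off font.getbbox("A"), per the method above
base_ascent = 30       # normal font at size 40
highlight_ascent = 36  # highlight font at size 48 (1.2x multiplier)
baseline_offset = highlight_ascent - base_ascent  # = 6 px
# _write_line_with_highlight (later in this diff) adds this offset to the
# highlighted word's y position so both fonts share one baseline.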
@@ -499,12 +564,18 @@ class ImageText:
         xy: PositionType,
         box_width: Union[int, float] | None = None,
         font_size: int = 11,
+        font_border_size: int = 0,
         text_color: RGBColor = (0, 0, 0),
         background_color: RGBAColor | None = None,
         background_padding: int = 0,
         place: TextAlign = TextAlign.LEFT,
         anchor: AnchorPoint = AnchorPoint.TOP_LEFT,
         margin: MarginType = 0,
+        words: list[str] | None = None,
+        highlight_word_index: int | None = None,
+        highlight_color: RGBColor | None = None,
+        highlight_size_multiplier: float = 1.5,
+        highlight_bold_font: str | None = None,
     ) -> tuple[int, int]:
         """
         Write text in a box with advanced positioning and alignment options.
@@ -515,12 +586,18 @@ class ImageText:
             xy: Position (x,y) either as absolute pixels (int) or relative to frame (float 0-1)
             box_width: Width of the box in pixels (int) or relative to frame width (float 0-1)
             font_size: Font size in points
+            font_border_size: Size of border around text in pixels (0 for no border)
             text_color: RGB color of the text
             background_color: If set, adds background color to the text box. Expects RGBA values.
             background_padding: Number of padding pixels to add when adding text background color
             place: Text alignment within the box (TextAlign.LEFT, TextAlign.RIGHT, TextAlign.CENTER)
             anchor: Which part of the text box to anchor at the position
             margin: Margin in pixels (single value or [top, right, bottom, left])
+            words: All words occurring in text, helpful for highlighting.
+            highlight_word_index: Index of word to highlight (0-based, None to disable highlighting)
+            highlight_color: RGB color for the highlighted word (defaults to text_color if None)
+            highlight_size_multiplier: Font size multiplier for highlighted word
+            highlight_bold_font: Path to bold font file for highlighted word (defaults to font_filename if None)
 
         Returns:
             Coordinates of the lower-right corner of the written text box (x, y)
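Putting the new parameters together, a hedged sketch of a caption box with one emphasized word. The font path is an assumption, and the sketch assumes `TextAlign` and `AnchorPoint` are exported from the same module, as the signatures suggest:

from videopython.base.text.overlay import AnchorPoint, ImageText, TextAlign

img = ImageText(image_size=(1280, 720), background=(0, 0, 0, 0))
img.write_text_box(
    text="never gonna give you up",
    font_filename="DejaVuSans.ttf",  # assumed font path
    xy=(0.5, 0.7),
    box_width=0.6,
    font_size=40,
    font_border_size=2,
    text_color=(255, 235, 59),
    place=TextAlign.CENTER,
    anchor=AnchorPoint.CENTER,
    highlight_word_index=2,  # third word, "give"
    highlight_color=(76, 175, 80),
    highlight_size_multiplier=1.2,
)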
@@ -541,6 +618,25 @@ class ImageText:
         if background_padding < 0:
             raise ValueError("Background padding cannot be negative")
 
+        if font_border_size < 0:
+            raise ValueError("Font border size cannot be negative")
+
+        # Validate highlighting parameters
+        if highlight_word_index is not None:
+            if not words:
+                words = text.split()
+            if highlight_word_index < 0 or highlight_word_index >= len(words):
+                raise ValueError(
+                    f"highlight_word_index {highlight_word_index} out of range for text with {len(words)} words"
+                )
+
+            if highlight_size_multiplier <= 0:
+                raise ValueError("highlight_size_multiplier must be positive")
+
+        # Set default highlight color if not provided
+        if highlight_word_index is not None and highlight_color is None:
+            highlight_color = text_color
+
         # Process margins to determine available area
         margin_top, margin_right, margin_bottom, margin_left = self._process_margin(margin)
         available_width = self.image_size[0] - margin_left - margin_right
@@ -590,6 +686,7 @@ class ImageText:
 
         # Write lines
         current_text_height = y_pos
+        word_index_offset = 0  # Track global word index across lines
        for line in lines:
             line_dimensions = self.get_text_dimensions(font_filename, font_size, line)
 
@@ -604,14 +701,49 @@ class ImageText:
                 valid_places = [e.value for e in TextAlign]
                 raise ValueError(f"Place '{place}' is not supported. Must be one of: {', '.join(valid_places)}")
 
-            #
-
-
-
-
-
-
-
+            # Check if highlighting is needed for this line
+            if highlight_word_index is not None:
+                line_words = line.split()
+                line_start_word_index = word_index_offset
+                line_end_word_index = word_index_offset + len(line_words) - 1
+
+                # Check if the highlighted word is in this line
+                if line_start_word_index <= highlight_word_index <= line_end_word_index:
+                    self._write_line_with_highlight(
+                        line=line,
+                        font_filename=font_filename,
+                        font_size=font_size,
+                        font_border_size=font_border_size,
+                        text_color=text_color,
+                        highlight_color=highlight_color or (255, 255, 255),
+                        highlight_size_multiplier=highlight_size_multiplier,
+                        highlight_word_local_index=highlight_word_index - line_start_word_index,
+                        highlight_bold_font=highlight_bold_font,
+                        x_left=int(x_left),
+                        y_top=int(current_text_height),
+                    )
+                else:
+                    # Write normal line without highlighting
+                    self.write_text(
+                        text=line,
+                        font_filename=font_filename,
+                        xy=(x_left, current_text_height),
+                        font_size=font_size,
+                        font_border_size=font_border_size,
+                        color=text_color,
+                    )
+
+                word_index_offset += len(line_words)
+            else:
+                # Write normal line without highlighting
+                self.write_text(
+                    text=line,
+                    font_filename=font_filename,
+                    xy=(x_left, current_text_height),
+                    font_size=font_size,
+                    font_border_size=font_border_size,
+                    color=text_color,
+                )
 
             # Increment vertical position for next line
             current_text_height += line_dimensions[1]
@@ -690,6 +822,88 @@ class ImageText:
 
         return (int(x_pos + box_width), int(current_text_height))
 
+    def _write_line_with_highlight(
+        self,
+        line: str,
+        font_filename: str,
+        font_size: int,
+        font_border_size: int,
+        text_color: RGBColor,
+        highlight_color: RGBColor,
+        highlight_size_multiplier: float,
+        highlight_word_local_index: int,
+        highlight_bold_font: str | None,
+        x_left: int,
+        y_top: int,
+    ) -> None:
+        """
+        Write a line of text with one word highlighted using word-by-word rendering with baseline alignment.
+
+        Args:
+            line: The text line to render
+            font_filename: Path to the font file
+            font_size: Base font size in points
+            font_border_size: Size of border around text in pixels (0 for no border)
+            text_color: RGB color for normal text
+            highlight_color: RGB color for highlighted word
+            highlight_size_multiplier: Font size multiplier for highlighted word
+            highlight_word_local_index: Index of word to highlight within this line (0-based)
+            highlight_bold_font: Path to bold font file for highlighted word (defaults to font_filename if None)
+            x_left: Left x position for the line
+            y_top: Top y position for the line
+        """
+        # Split line into words
+        words = line.split()
+        if highlight_word_local_index >= len(words):
+            return  # Safety check
+
+        # Calculate highlighted font size and determine font files
+        highlight_font_size = int(font_size * highlight_size_multiplier)
+        highlight_font_file = highlight_bold_font if highlight_bold_font is not None else font_filename
+
+        # Calculate baseline offset for highlighted words (using the appropriate font files)
+        baseline_offset = self._get_font_baseline_offset(
+            font_filename, font_size, highlight_font_file, highlight_font_size
+        )
+
+        # Render words one by one with proper spacing
+        current_x = x_left
+
+        for i, word in enumerate(words):
+            # Determine if this is the highlighted word
+            is_highlighted = i == highlight_word_local_index
+
+            # Choose font file, size, and color based on highlighting
+            word_font_file = highlight_font_file if is_highlighted else font_filename
+            word_font_size = highlight_font_size if is_highlighted else font_size
+            word_color = highlight_color if is_highlighted else text_color
+
+            # Calculate y position with baseline alignment
+            word_y = y_top
+            if is_highlighted:
+                word_y += baseline_offset
+
+            # Render the word
+            self.write_text(
+                text=word,
+                font_filename=word_font_file,
+                xy=(current_x, word_y),
+                font_size=word_font_size,
+                font_border_size=font_border_size,
+                color=word_color,
+            )
+
+            # Calculate the width of this word for spacing
+            word_width = self.get_text_dimensions(word_font_file, word_font_size, word)[0]
+
+            # Update current_x for next word (add word width plus space)
+            current_x += word_width
+
+            # Add space between words (except after the last word)
+            if i < len(words) - 1:
+                space_width = self.get_text_dimensions(font_filename, font_size, " ")[0]
+                current_x += space_width
+
     def _find_smallest_bounding_rect(self, mask: np.ndarray) -> tuple[int, int, int, int]:
         """
         Find the smallest bounding rectangle containing non-zero values in the mask.
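The word-by-word cursor above advances by each word's own rendered width, while inter-word gaps are always measured with the base font, so an enlarged highlight widens the line without distorting the spacing around it. A toy illustration of that arithmetic, with made-up widths:

# Fake pixel widths for illustration; get_text_dimensions supplies real ones
widths = {"never": 80, "gonna": 84, "give": 120, "you": 52, "up": 40}
space_width = 14  # measured once, in the base font
words = ["never", "gonna", "give", "you", "up"]

current_x = 0
for i, word in enumerate(words):
    print(f"{word!r} starts at x={current_x}")
    current_x += widths[word]
    if i < len(words) - 1:
        current_x += space_width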
@@ -725,3 +939,164 @@ class ImageText:
         xmin, xmax = col_indices[[0, -1]]
 
         return xmin, xmax, ymin, ymax
+
+
+class TranscriptionOverlay:
+    def __init__(
+        self,
+        font_filename: str,
+        font_size: int = 40,
+        font_border_size: int = 2,
+        text_color: RGBColor = (255, 235, 59),
+        background_color: RGBAColor | None = (0, 0, 0, 100),
+        background_padding: int = 15,
+        position: PositionType = (0.5, 0.7),
+        box_width: Union[int, float] = 0.6,
+        text_align: TextAlign = TextAlign.CENTER,
+        anchor: AnchorPoint = AnchorPoint.CENTER,
+        margin: MarginType = 20,
+        highlight_color: RGBColor = (76, 175, 80),
+        highlight_size_multiplier: float = 1.2,
+        highlight_bold_font: str | None = None,
+    ):
+        """
+        Initialize TranscriptionOverlay effect.
+
+        Args:
+            font_filename: Path to font file for text rendering
+            font_size: Base font size for text
+            text_color: RGB color for normal text
+            font_border_size: Size of border around text in pixels (0 for no border)
+            background_color: RGBA background color (None for no background)
+            background_padding: Padding around text background
+            position: Position of text box (relative 0-1 or absolute pixels)
+            box_width: Width of text box (relative 0-1 or absolute pixels)
+            text_align: Text alignment within box
+            anchor: Anchor point for text positioning
+            margin: Margin around text box
+            highlight_color: RGB color for highlighted words
+            highlight_size_multiplier: Size multiplier for highlighted words
+            highlight_bold_font: Optional bold font for highlighting
+        """
+        self.font_filename = font_filename
+        self.font_size = font_size
+        self.text_color = text_color
+        self.font_border_size = font_border_size
+        self.background_color = background_color
+        self.background_padding = background_padding
+        self.position = position
+        self.box_width = box_width
+        self.text_align = text_align
+        self.anchor = anchor
+        self.margin = margin
+        self.highlight_color = highlight_color
+        self.highlight_size_multiplier = highlight_size_multiplier
+        self.highlight_bold_font = highlight_bold_font
+
+        # Cache for text overlays to avoid regenerating identical frames
+        self._overlay_cache: dict[tuple[str, int | None], np.ndarray] = {}
+
+    def _get_active_segment(self, transcription: Transcription, timestamp: float) -> TranscriptionSegment | None:
+        """Get the transcription segment active at the given timestamp."""
+        for segment in transcription.segments:
+            if segment.start <= timestamp <= segment.end:
+                return segment
+        return None
+
+    def _get_active_word_index(self, segment: TranscriptionSegment, timestamp: float) -> int | None:
+        """Get the index of the word being spoken at the given timestamp within a segment."""
+        for i, word in enumerate(segment.words):
+            if word.start <= timestamp <= word.end:
+                return i
+        return None
+
+    def _create_text_overlay(
+        self, video_shape: tuple[int, int, int], segment: TranscriptionSegment, highlight_word_index: int | None
+    ) -> np.ndarray:
+        """Create a text overlay image for the given segment and highlight."""
+        # Use video frame dimensions for overlay
+        height, width = video_shape[:2]
+
+        # Create cache key based on segment text and highlight
+        cache_key = (segment.text, highlight_word_index)
+        if cache_key in self._overlay_cache:
+            return self._overlay_cache[cache_key]
+
+        # Create ImageText with video dimensions
+        img_text = ImageText(image_size=(width, height), background=(0, 0, 0, 0))
+
+        # Write text with highlighting
+        img_text.write_text_box(
+            text=segment.text,
+            font_filename=self.font_filename,
+            xy=self.position,
+            box_width=self.box_width,
+            font_size=self.font_size,
+            font_border_size=self.font_border_size,
+            text_color=self.text_color,
+            background_color=self.background_color,
+            background_padding=self.background_padding,
+            place=self.text_align,
+            anchor=self.anchor,
+            margin=self.margin,
+            words=[w.word for w in segment.words],
+            highlight_word_index=highlight_word_index,
+            highlight_color=self.highlight_color,
+            highlight_size_multiplier=self.highlight_size_multiplier,
+            highlight_bold_font=self.highlight_bold_font,
+        )
+
+        overlay_image = img_text.img_array
+
+        # Cache the overlay
+        self._overlay_cache[cache_key] = overlay_image
+
+        return overlay_image
+
+    def apply(self, video: Video, transcription: Transcription) -> Video:
+        """Apply transcription overlay to video frames."""
+        print("Applying transcription overlay...")
+
+        new_frames = []
+
+        for frame_idx, frame in enumerate(tqdm(video.frames)):
+            # Calculate timestamp for this frame
+            timestamp = frame_idx / video.fps
+
+            # Get active segment at this timestamp
+            active_segment = self._get_active_segment(transcription, timestamp)
+
+            if active_segment is None:
+                # No active transcription, keep original frame
+                new_frames.append(frame)
+                continue
+
+            # Get active word index for highlighting
+            highlight_word_index = self._get_active_word_index(active_segment, timestamp)
+
+            # Create text overlay
+            text_overlay = self._create_text_overlay(video.frame_shape, active_segment, highlight_word_index)
+
+            # Apply overlay to frame
+            overlaid_frame = self._apply_overlay_to_frame(frame, text_overlay)
+            new_frames.append(overlaid_frame)
+
+        # Create new video with overlaid frames
+        new_video = Video.from_frames(np.array(new_frames), fps=video.fps)
+        new_video.audio = video.audio  # Preserve audio
+
+        return new_video
+
+    def _apply_overlay_to_frame(self, frame: np.ndarray, overlay: np.ndarray) -> np.ndarray:
+        """Apply a text overlay to a single frame."""
+        # Convert frame to PIL Image
+        frame_pil = Image.fromarray(frame)
+
+        # Convert overlay to PIL Image
+        overlay_pil = Image.fromarray(overlay)
+
+        # Paste overlay onto frame using alpha channel
+        frame_pil.paste(overlay_pil, (0, 0), overlay_pil)
+
+        return np.array(frame_pil)