videopython 0.4.1__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff shows the contents of two publicly released versions of the package, exactly as they appear in their public registry, and is provided for informational purposes only.
- videopython/ai/understanding/transcribe.py +48 -19
- videopython/base/text/__init__.py +0 -0
- videopython/{utils/text.py → base/text/overlay.py} +383 -8
- videopython/base/text/transcription.py +121 -0
- videopython/base/utils.py +6 -0
- videopython/base/video.py +100 -58
- {videopython-0.4.1.dist-info → videopython-0.5.0.dist-info}/METADATA +91 -28
- {videopython-0.4.1.dist-info → videopython-0.5.0.dist-info}/RECORD +10 -12
- videopython/base/compose.py +0 -55
- videopython/base/transcription.py +0 -13
- videopython/utils/__init__.py +0 -3
- videopython/utils/common.py +0 -31
- videopython/utils/image.py +0 -47
- {videopython-0.4.1.dist-info → videopython-0.5.0.dist-info}/WHEEL +0 -0
- {videopython-0.4.1.dist-info → videopython-0.5.0.dist-info}/licenses/LICENSE +0 -0
videopython/ai/understanding/transcribe.py CHANGED

@@ -1,37 +1,66 @@
-from typing import Literal
+from typing import Literal, Union
 
 import whisper
+from soundpython import Audio
 
-from videopython.base.transcription import Transcription, TranscriptionSegment
+from videopython.base.text.transcription import Transcription, TranscriptionSegment, TranscriptionWord
 from videopython.base.video import Video
 
 
-class
+class CreateTranscription:
+    """Unified transcription service for both audio and video."""
+
     def __init__(self, model_name: Literal["tiny", "base", "small", "medium", "large", "turbo"] = "small") -> None:
         self.model = whisper.load_model(name=model_name)
 
-    def
-    """
+    def _process_transcription_result(self, transcription_result: dict) -> Transcription:
+        """Process raw transcription result into Transcription object.
 
         Args:
-
+            transcription_result: Raw result from whisper model
 
         Returns:
-
+            Processed Transcription object
         """
-
-
+        transcription_segments = []
+        for segment in transcription_result["segments"]:
+            transcription_words = [
+                TranscriptionWord(word=word["word"], start=float(word["start"]), end=float(word["end"]))
+                for word in segment["words"]
+            ]
+            transcription_segment = TranscriptionSegment(
+                start=segment["start"], end=segment["end"], text=segment["text"], words=transcription_words
+            )
+            transcription_segments.append(transcription_segment)
+
+        return Transcription(segments=transcription_segments)
+
+    def transcribe(self, media: Union[Audio, Video]) -> Transcription:
+        """Transcribe audio or video to text.
+
+        Args:
+            media: Audio or Video to transcribe.
+
+        Returns:
+            Transcription object with segments of text and their timestamps.
+        """
+        if isinstance(media, Video):
+            # Handle video transcription
+            if media.audio.is_silent:
+                return Transcription(segments=[])
+
+            audio = media.audio.to_mono().resample(whisper.audio.SAMPLE_RATE)
+            transcription_result = self.model.transcribe(audio=audio.data, word_timestamps=True)
 
-
-
-
+        elif isinstance(media, Audio):
+            # Handle audio transcription
+            if media.is_silent:
+                return Transcription(segments=[])
 
-
+            audio = media.to_mono().resample(whisper.audio.SAMPLE_RATE)
+            transcription_result = self.model.transcribe(audio=audio.data, word_timestamps=True)
 
-
-
-            for segment in transcription["segments"]
-        ]
-        result = Transcription(segments=transcription_segments)
+        else:
+            raise TypeError(f"Unsupported media type: {type(media)}. Expected Audio or Video.")
 
-        return
+        return self._process_transcription_result(transcription_result)

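The new `CreateTranscription` class accepts either a `soundpython` `Audio` object or a `Video`, returns word-level timestamps, and yields an empty `Transcription` for silent media. A minimal usage sketch under those assumptions (the video path is a placeholder):

```python
from videopython.ai.understanding.transcribe import CreateTranscription
from videopython.base.video import Video

video = Video.from_path("<PATH_TO_VIDEO>")  # placeholder path

# "base" trades accuracy for speed; "small" is the class default.
transcriber = CreateTranscription(model_name="base")
transcription = transcriber.transcribe(video)  # a soundpython Audio object would also work

for segment in transcription.segments:
    print(f"[{segment.start:6.2f}s - {segment.end:6.2f}s] {segment.text}")
    for word in segment.words:
        print(f"    {word.word!r}: {word.start:.2f}-{word.end:.2f}")
```
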
videopython/base/text/__init__.py — File without changes

videopython/{utils/text.py → base/text/overlay.py} RENAMED

@@ -1,10 +1,21 @@
+"""
+Beware, the code below was heavily "vibe-coded".
+
+The main purpose of this file are 2 classes:
+1. `ImageText` class for creating RGBA image with rendered subtitles
+2. `TranscriptionOverlay` class, which takes the `Transcription` and `Video` objects and overlays subtitles on `Video`.
+"""
+
 from enum import Enum
 from typing import TypeAlias, Union
 
 import numpy as np
 from PIL import Image, ImageDraw, ImageFont
+from tqdm import tqdm
 
 from videopython.base.exceptions import OutOfBoundsError
+from videopython.base.text.transcription import Transcription, TranscriptionSegment
+from videopython.base.video import Video
 
 # Type aliases for clarity
 MarginType: TypeAlias = Union[int, tuple[int, int, int, int]]
@@ -319,6 +330,7 @@ class ImageText:
         font_filename: str,
         xy: PositionType,
         font_size: int | None = 11,
+        font_border_size: int = 0,
         color: RGBColor = (0, 0, 0),
         max_width: int | None = None,
         max_height: int | None = None,
@@ -333,6 +345,7 @@ class ImageText:
             font_filename: Path to the font file
             xy: Position (x,y) either as absolute pixels (int) or relative to frame (float 0-1)
            font_size: Size of the font in points, or None to auto-calculate
+            font_border_size: Size of border around text in pixels (0 for no border)
            color: RGB color of the text
            max_width: Maximum width for auto font sizing
            max_height: Maximum height for auto font sizing
@@ -355,6 +368,9 @@ class ImageText:
         if font_size is not None and font_size <= 0:
             raise ValueError("Font size must be positive")
 
+        if font_border_size < 0:
+            raise ValueError("Font border size cannot be negative")
+
         if font_size is None and (max_width is None or max_height is None):
             raise ValueError("Must set either `font_size`, or both `max_width` and `max_height`!")
         elif font_size is None:
@@ -371,6 +387,15 @@ class ImageText:
         if x < 0 or y < 0 or x + text_dimensions[0] > self.image_size[0] or y + text_dimensions[1] > self.image_size[1]:
             raise OutOfBoundsError(f"Text with size {text_dimensions} at position ({x}, {y}) is out of bounds!")
 
+        # Draw border if requested
+        if font_border_size > 0:
+            # Draw text border by drawing text in multiple positions around the main text
+            for border_x in range(-font_border_size, font_border_size + 1):
+                for border_y in range(-font_border_size, font_border_size + 1):
+                    if border_x != 0 or border_y != 0:  # Skip the center position
+                        self._draw.text((x + border_x, y + border_y), text, font=font, fill=(0, 0, 0))
+
+        # Draw the main text on top
         self._draw.text((x, y), text, font=font, fill=color)
         return text_dimensions
 
@@ -423,6 +448,46 @@ class ImageText:
         except Exception as e:
             raise ValueError(f"Error measuring text: {str(e)}")
 
+    def _get_font_baseline_offset(
+        self, base_font_filename: str, base_font_size: int, highlight_font_filename: str, highlight_font_size: int
+    ) -> int:
+        """
+        Calculate the vertical offset needed to align baselines of different fonts and sizes.
+
+        Args:
+            base_font_filename: Path to the base font file
+            base_font_size: Font size of normal text
+            highlight_font_filename: Path to the highlight font file
+            highlight_font_size: Font size of highlighted text
+
+        Returns:
+            Vertical offset in pixels to align highlighted text baseline with normal text baseline
+        """
+        base_font = self._get_font(base_font_filename, base_font_size)
+        highlight_font = self._get_font(highlight_font_filename, highlight_font_size)
+
+        # Use a reference character to get baseline metrics
+        # We use 'A' as it's a good reference for ascender height
+        ref_char = "A"
+
+        # Get bounding boxes for the reference character
+        base_bbox = base_font.getbbox(ref_char)
+        highlight_bbox = highlight_font.getbbox(ref_char)
+
+        if base_bbox is None or highlight_bbox is None:
+            return 0  # Fallback if bbox calculation fails
+
+        # The baseline offset is the difference in the top of the bounding box
+        # since getbbox returns (left, top, right, bottom) where top is negative for ascenders
+        base_ascent = -base_bbox[1]  # Distance from baseline to top of character
+        highlight_ascent = -highlight_bbox[1]  # Distance from baseline to top of character
+
+        # Calculate the offset needed to align baselines
+        # If highlighted text has a larger ascent, we need to move it down
+        baseline_offset = highlight_ascent - base_ascent
+
+        return baseline_offset
+
     def _split_lines_by_width(
         self,
         text: str,
@@ -499,12 +564,18 @@ class ImageText:
         xy: PositionType,
         box_width: Union[int, float] | None = None,
         font_size: int = 11,
+        font_border_size: int = 0,
         text_color: RGBColor = (0, 0, 0),
         background_color: RGBAColor | None = None,
         background_padding: int = 0,
         place: TextAlign = TextAlign.LEFT,
         anchor: AnchorPoint = AnchorPoint.TOP_LEFT,
         margin: MarginType = 0,
+        words: list[str] | None = None,
+        highlight_word_index: int | None = None,
+        highlight_color: RGBColor | None = None,
+        highlight_size_multiplier: float = 1.5,
+        highlight_bold_font: str | None = None,
     ) -> tuple[int, int]:
         """
         Write text in a box with advanced positioning and alignment options.
@@ -515,12 +586,18 @@ class ImageText:
             xy: Position (x,y) either as absolute pixels (int) or relative to frame (float 0-1)
             box_width: Width of the box in pixels (int) or relative to frame width (float 0-1)
             font_size: Font size in points
+            font_border_size: Size of border around text in pixels (0 for no border)
             text_color: RGB color of the text
             background_color: If set, adds background color to the text box. Expects RGBA values.
             background_padding: Number of padding pixels to add when adding text background color
             place: Text alignment within the box (TextAlign.LEFT, TextAlign.RIGHT, TextAlign.CENTER)
             anchor: Which part of the text box to anchor at the position
             margin: Margin in pixels (single value or [top, right, bottom, left])
+            words: All words occuring in text, helpful for highlighting.
+            highlight_word_index: Index of word to highlight (0-based, None to disable highlighting)
+            highlight_color: RGB color for the highlighted word (defaults to text_color if None)
+            highlight_size_multiplier: Font size multiplier for highlighted word
+            highlight_bold_font: Path to bold font file for highlighted word (defaults to font_filename if None)
 
         Returns:
             Coordinates of the lower-right corner of the written text box (x, y)
@@ -541,6 +618,25 @@ class ImageText:
         if background_padding < 0:
             raise ValueError("Background padding cannot be negative")
 
+        if font_border_size < 0:
+            raise ValueError("Font border size cannot be negative")
+
+        # Validate highlighting parameters
+        if highlight_word_index is not None:
+            if not words:
+                words = text.split()
+            if highlight_word_index < 0 or highlight_word_index >= len(words):
+                raise ValueError(
+                    f"highlight_word_index {highlight_word_index} out of range for text with {len(words)} words"
+                )
+
+            if highlight_size_multiplier <= 0:
+                raise ValueError("highlight_size_multiplier must be positive")
+
+        # Set default highlight color if not provided
+        if highlight_word_index is not None and highlight_color is None:
+            highlight_color = text_color
+
         # Process margins to determine available area
         margin_top, margin_right, margin_bottom, margin_left = self._process_margin(margin)
         available_width = self.image_size[0] - margin_left - margin_right
@@ -590,6 +686,7 @@ class ImageText:
 
         # Write lines
         current_text_height = y_pos
+        word_index_offset = 0  # Track global word index across lines
         for line in lines:
             line_dimensions = self.get_text_dimensions(font_filename, font_size, line)
 
@@ -604,14 +701,49 @@ class ImageText:
                 valid_places = [e.value for e in TextAlign]
                 raise ValueError(f"Place '{place}' is not supported. Must be one of: {', '.join(valid_places)}")
 
-            #
-
-
-
-
-
-
-
+            # Check if highlighting is needed for this line
+            if highlight_word_index is not None:
+                line_words = line.split()
+                line_start_word_index = word_index_offset
+                line_end_word_index = word_index_offset + len(line_words) - 1
+
+                # Check if the highlighted word is in this line
+                if line_start_word_index <= highlight_word_index <= line_end_word_index:
+                    self._write_line_with_highlight(
+                        line=line,
+                        font_filename=font_filename,
+                        font_size=font_size,
+                        font_border_size=font_border_size,
+                        text_color=text_color,
+                        highlight_color=highlight_color or (255, 255, 255),
+                        highlight_size_multiplier=highlight_size_multiplier,
+                        highlight_word_local_index=highlight_word_index - line_start_word_index,
+                        highlight_bold_font=highlight_bold_font,
+                        x_left=int(x_left),
+                        y_top=int(current_text_height),
+                    )
+                else:
+                    # Write normal line without highlighting
+                    self.write_text(
+                        text=line,
+                        font_filename=font_filename,
+                        xy=(x_left, current_text_height),
+                        font_size=font_size,
+                        font_border_size=font_border_size,
+                        color=text_color,
+                    )
+
+                word_index_offset += len(line_words)
+            else:
+                # Write normal line without highlighting
+                self.write_text(
+                    text=line,
+                    font_filename=font_filename,
+                    xy=(x_left, current_text_height),
+                    font_size=font_size,
+                    font_border_size=font_border_size,
+                    color=text_color,
+                )
 
             # Increment vertical position for next line
             current_text_height += line_dimensions[1]
@@ -690,6 +822,88 @@ class ImageText:
 
         return (int(x_pos + box_width), int(current_text_height))
 
+    def _write_line_with_highlight(
+        self,
+        line: str,
+        font_filename: str,
+        font_size: int,
+        font_border_size: int,
+        text_color: RGBColor,
+        highlight_color: RGBColor,
+        highlight_size_multiplier: float,
+        highlight_word_local_index: int,
+        highlight_bold_font: str | None,
+        x_left: int,
+        y_top: int,
+    ) -> None:
+        """
+        Write a line of text with one word highlighted using word-by-word rendering with baseline alignment.
+
+        Args:
+            line: The text line to render
+            font_filename: Path to the font file
+            font_size: Base font size in points
+            font_border_size: Size of border around text in pixels (0 for no border)
+            text_color: RGB color for normal text
+            highlight_color: RGB color for highlighted word
+            highlight_size_multiplier: Font size multiplier for highlighted word
+            highlight_word_local_index: Index of word to highlight within this line (0-based)
+            highlight_bold_font: Path to bold font file for highlighted word (defaults to font_filename if None)
+            x_left: Left x position for the line
+            y_top: Top y position for the line
+        """
+        # Split line into words
+        words = line.split()
+        if highlight_word_local_index >= len(words):
+            return  # Safety check
+
+        # Calculate highlighted font size and determine font files
+        highlight_font_size = int(font_size * highlight_size_multiplier)
+        highlight_font_file = highlight_bold_font if highlight_bold_font is not None else font_filename
+
+        # Calculate baseline offset for highlighted words (using the appropriate font files)
+        baseline_offset = self._get_font_baseline_offset(
+            font_filename, font_size, highlight_font_file, highlight_font_size
+        )
+
+        # Render words one by one with proper spacing
+        current_x = x_left
+
+        for i, word in enumerate(words):
+            # Determine if this is the highlighted word
+            is_highlighted = i == highlight_word_local_index
+
+            # Choose font file, size, and color based on highlighting
+            word_font_file = highlight_font_file if is_highlighted else font_filename
+            word_font_size = highlight_font_size if is_highlighted else font_size
+            word_color = highlight_color if is_highlighted else text_color
+
+            # Calculate y position with baseline alignment
+            word_y = y_top
+            if is_highlighted:
+                word_y += baseline_offset
+
+            # Render the word
+            self.write_text(
+                text=word,
+                font_filename=word_font_file,
+                xy=(current_x, word_y),
+                font_size=word_font_size,
+                font_border_size=font_border_size,
+                color=word_color,
+            )
+
+            # Calculate the width of this word for spacing
+            word_width = self.get_text_dimensions(word_font_file, word_font_size, word)[0]
+
+            # Update current_x for next word (add word width plus space)
+            current_x += word_width
+
+            # Add space between words (except after the last word)
+            if i < len(words) - 1:
+                space_width = self.get_text_dimensions(font_filename, font_size, " ")[0]
+                current_x += space_width
+
     def _find_smallest_bounding_rect(self, mask: np.ndarray) -> tuple[int, int, int, int]:
         """
         Find the smallest bounding rectangle containing non-zero values in the mask.
@@ -725,3 +939,164 @@ class ImageText:
         xmin, xmax = col_indices[[0, -1]]
 
         return xmin, xmax, ymin, ymax
+
+
+class TranscriptionOverlay:
+    def __init__(
+        self,
+        font_filename: str,
+        font_size: int = 40,
+        font_border_size: int = 2,
+        text_color: RGBColor = (255, 235, 59),
+        background_color: RGBAColor | None = (0, 0, 0, 100),
+        background_padding: int = 15,
+        position: PositionType = (0.5, 0.7),
+        box_width: Union[int, float] = 0.6,
+        text_align: TextAlign = TextAlign.CENTER,
+        anchor: AnchorPoint = AnchorPoint.CENTER,
+        margin: MarginType = 20,
+        highlight_color: RGBColor = (76, 175, 80),
+        highlight_size_multiplier: float = 1.2,
+        highlight_bold_font: str | None = None,
+    ):
+        """
+        Initialize TranscriptionOverlay effect.
+
+        Args:
+            font_filename: Path to font file for text rendering
+            font_size: Base font size for text
+            text_color: RGB color for normal text
+            font_border_size: Size of border around text in pixels (0 for no border)
+            background_color: RGBA background color (None for no background)
+            background_padding: Padding around text background
+            position: Position of text box (relative 0-1 or absolute pixels)
+            box_width: Width of text box (relative 0-1 or absolute pixels)
+            text_align: Text alignment within box
+            anchor: Anchor point for text positioning
+            margin: Margin around text box
+            highlight_color: RGB color for highlighted words
+            highlight_size_multiplier: Size multiplier for highlighted words
+            highlight_bold_font: Optional bold font for highlighting
+        """
+        self.font_filename = font_filename
+        self.font_size = font_size
+        self.text_color = text_color
+        self.font_border_size = font_border_size
+        self.background_color = background_color
+        self.background_padding = background_padding
+        self.position = position
+        self.box_width = box_width
+        self.text_align = text_align
+        self.anchor = anchor
+        self.margin = margin
+        self.highlight_color = highlight_color
+        self.highlight_size_multiplier = highlight_size_multiplier
+        self.highlight_bold_font = highlight_bold_font
+
+        # Cache for text overlays to avoid regenerating identical frames
+        self._overlay_cache: dict[tuple[str, int | None], np.ndarray] = {}
+
+    def _get_active_segment(self, transcription: Transcription, timestamp: float) -> TranscriptionSegment | None:
+        """Get the transcription segment active at the given timestamp."""
+        for segment in transcription.segments:
+            if segment.start <= timestamp <= segment.end:
+                return segment
+        return None
+
+    def _get_active_word_index(self, segment: TranscriptionSegment, timestamp: float) -> int | None:
+        """Get the index of the word being spoken at the given timestamp within a segment."""
+        for i, word in enumerate(segment.words):
+            if word.start <= timestamp <= word.end:
+                return i
+        return None
+
+    def _create_text_overlay(
+        self, video_shape: tuple[int, int, int], segment: TranscriptionSegment, highlight_word_index: int | None
+    ) -> np.ndarray:
+        """Create a text overlay image for the given segment and highlight."""
+        # Use video frame dimensions for overlay
+        height, width = video_shape[:2]
+
+        # Create cache key based on segment text and highlight
+        cache_key = (segment.text, highlight_word_index)
+        if cache_key in self._overlay_cache:
+            return self._overlay_cache[cache_key]
+
+        # Create ImageText with video dimensions
+        img_text = ImageText(image_size=(width, height), background=(0, 0, 0, 0))
+
+        # Write text with highlighting
+        img_text.write_text_box(
+            text=segment.text,
+            font_filename=self.font_filename,
+            xy=self.position,
+            box_width=self.box_width,
+            font_size=self.font_size,
+            font_border_size=self.font_border_size,
+            text_color=self.text_color,
+            background_color=self.background_color,
+            background_padding=self.background_padding,
+            place=self.text_align,
+            anchor=self.anchor,
+            margin=self.margin,
+            words=[w.word for w in segment.words],
+            highlight_word_index=highlight_word_index,
+            highlight_color=self.highlight_color,
+            highlight_size_multiplier=self.highlight_size_multiplier,
+            highlight_bold_font=self.highlight_bold_font,
+        )
+
+        overlay_image = img_text.img_array
+
+        # Cache the overlay
+        self._overlay_cache[cache_key] = overlay_image
+
+        return overlay_image
+
+    def apply(self, video: Video, transcription: Transcription) -> Video:
+        """Apply transcription overlay to video frames."""
+        print("Applying transcription overlay...")
+
+        new_frames = []
+
+        for frame_idx, frame in enumerate(tqdm(video.frames)):
+            # Calculate timestamp for this frame
+            timestamp = frame_idx / video.fps
+
+            # Get active segment at this timestamp
+            active_segment = self._get_active_segment(transcription, timestamp)
+
+            if active_segment is None:
+                # No active transcription, keep original frame
+                new_frames.append(frame)
+                continue
+
+            # Get active word index for highlighting
+            highlight_word_index = self._get_active_word_index(active_segment, timestamp)
+
+            # Create text overlay
+            text_overlay = self._create_text_overlay(video.frame_shape, active_segment, highlight_word_index)
+
+            # Apply overlay to frame
+            overlaid_frame = self._apply_overlay_to_frame(frame, text_overlay)
+            new_frames.append(overlaid_frame)
+
+        # Create new video with overlaid frames
+        new_video = Video.from_frames(np.array(new_frames), fps=video.fps)
+        new_video.audio = video.audio  # Preserve audio
+
+        return new_video
+
+    def _apply_overlay_to_frame(self, frame: np.ndarray, overlay: np.ndarray) -> np.ndarray:
+        """Apply a text overlay to a single frame."""
+
+        # Convert frame to PIL Image
+        frame_pil = Image.fromarray(frame)
+
+        # Convert overlay to PIL Image
+        overlay_pil = Image.fromarray(overlay)
+
+        # Paste overlay onto frame using alpha channel
+        frame_pil.paste(overlay_pil, (0, 0), overlay_pil)
+
+        return np.array(frame_pil)

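The `font_border_size` option implements the outline as a manual stroke: the text is stamped in black at every offset within the border radius, then the colored text is drawn on top. A standalone Pillow sketch of the same idea (font path, sizes, and colors are illustrative assumptions, not values from the package; recent Pillow releases can also do this natively via the `stroke_width` argument of `ImageDraw.text`):

```python
from PIL import Image, ImageDraw, ImageFont

image = Image.new("RGBA", (640, 160), (0, 0, 0, 0))  # transparent canvas, like the overlay frames
draw = ImageDraw.Draw(image)
font = ImageFont.truetype("DejaVuSans.ttf", 48)  # assumed font file

text, x, y, border = "Subtitles", 20, 40, 2
for dx in range(-border, border + 1):
    for dy in range(-border, border + 1):
        if dx or dy:  # skip the center position
            draw.text((x + dx, y + dy), text, font=font, fill=(0, 0, 0))  # black border pass
draw.text((x, y), text, font=font, fill=(255, 235, 59))  # main text on top

image.save("bordered_text.png")
```
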
videopython/base/text/transcription.py ADDED

@@ -0,0 +1,121 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+
+@dataclass
+class TranscriptionWord:
+    start: float
+    end: float
+    word: str
+
+
+@dataclass
+class TranscriptionSegment:
+    start: float
+    end: float
+    text: str
+    words: list[TranscriptionWord]
+
+
+@dataclass
+class Transcription:
+    segments: list[TranscriptionSegment]
+
+    def offset(self, time: float) -> Transcription:
+        """Return a new Transcription with all timings offset by the provided time value."""
+        offset_segments = []
+
+        for segment in self.segments:
+            offset_words = []
+            for word in segment.words:
+                offset_words.append(TranscriptionWord(start=word.start + time, end=word.end + time, word=word.word))
+
+            offset_segments.append(
+                TranscriptionSegment(
+                    start=segment.start + time, end=segment.end + time, text=segment.text, words=offset_words
+                )
+            )
+
+        return Transcription(segments=offset_segments)
+
+    def standardize_segments(self, *, time: float | None = None, num_words: int | None = None) -> Transcription:
+        """Return a new Transcription with standardized segments.
+
+        Args:
+            time: Maximum duration in seconds for each segment
+            num_words: Exact number of words per segment
+
+        Raises:
+            ValueError: If both time and num_words are provided or if neither is provided
+        """
+        if (time is None) == (num_words is None):
+            raise ValueError("Exactly one of 'time' or 'num_words' must be provided")
+
+        if time is not None and time <= 0:
+            raise ValueError("Time must be positive")
+
+        if num_words is not None and num_words <= 0:
+            raise ValueError("Number of words must be positive")
+
+        # Collect all words from all segments
+        all_words = []
+        for segment in self.segments:
+            all_words.extend(segment.words)
+
+        if not all_words:
+            return Transcription(segments=[])
+
+        standardized_segments = []
+
+        if time is not None:
+            # Group words by time constraint
+            current_words = []
+            current_start = None
+
+            for word in all_words:
+                if current_start is None:
+                    current_start = word.start
+                    current_words = [word]
+                elif word.end - current_start <= time:
+                    current_words.append(word)
+                else:
+                    # Create segment from current words
+                    if current_words:
+                        segment_text = " ".join(w.word for w in current_words)
+                        standardized_segments.append(
+                            TranscriptionSegment(
+                                start=current_start,
+                                end=current_words[-1].end,
+                                text=segment_text,
+                                words=current_words.copy(),
+                            )
+                        )
+
+                    # Start new segment
+                    current_start = word.start
+                    current_words = [word]
+
+            # Add final segment
+            if current_words:
+                segment_text = " ".join(w.word for w in current_words)
+                standardized_segments.append(
+                    TranscriptionSegment(
+                        start=current_start,  # type: ignore
+                        end=current_words[-1].end,
+                        text=segment_text,
+                        words=current_words.copy(),
+                    )
+                )
+        elif num_words is not None:
+            # Group words by word count constraint
+            for i in range(0, len(all_words), num_words):
+                segment_words = all_words[i : i + num_words]
+                segment_text = " ".join(w.word for w in segment_words)
+                standardized_segments.append(
+                    TranscriptionSegment(
+                        start=segment_words[0].start, end=segment_words[-1].end, text=segment_text, words=segment_words
+                    )
+                )
+
+        return Transcription(segments=standardized_segments)

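Since `Transcription`, `TranscriptionSegment`, and `TranscriptionWord` are plain dataclasses, transcriptions can also be built or reshaped by hand before overlaying. A small sketch with made-up timings:

```python
from videopython.base.text.transcription import Transcription, TranscriptionSegment, TranscriptionWord

words = [
    TranscriptionWord(start=0.0, end=0.4, word="Dogs"),
    TranscriptionWord(start=0.4, end=0.8, word="playing"),
    TranscriptionWord(start=0.8, end=1.1, word="in"),
    TranscriptionWord(start=1.1, end=1.3, word="the"),
    TranscriptionWord(start=1.3, end=1.8, word="park"),
]
transcription = Transcription(
    segments=[TranscriptionSegment(start=0.0, end=1.8, text="Dogs playing in the park", words=words)]
)

# Re-chunk into two-word segments, e.g. for snappier subtitles.
short_segments = transcription.standardize_segments(num_words=2)

# Shift every timestamp by 5 seconds, e.g. after prepending a 5-second intro clip.
shifted = transcription.offset(5.0)
```
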
videopython/base/video.py CHANGED

@@ -11,7 +11,7 @@ from typing import Literal, get_args
 import numpy as np
 from soundpython import Audio
 
-from videopython.utils
+from videopython.base.utils import generate_random_name
 
 ALLOWED_VIDEO_FORMATS = Literal["mp4", "avi", "mov", "mkv", "webm"]
 
@@ -155,7 +155,6 @@ class Video:
         width = metadata.width
         height = metadata.height
         fps = metadata.fps
-        total_frames = metadata.frame_count
         total_duration = metadata.total_seconds
 
         # Validate time bounds
@@ -166,99 +165,128 @@ class Video:
         if start_second is not None and end_second is not None and start_second >= end_second:
             raise ValueError("start_second must be less than end_second")
 
-        #
-
-        end_frame = int(end_second * fps) if end_second is not None else total_frames
+        # Build FFmpeg command with improved segment handling
+        ffmpeg_cmd = ["ffmpeg"]
 
-        #
-        start_frame = max(0, start_frame)
-        end_frame = min(total_frames, end_frame)
-        segment_frames = end_frame - start_frame
-
-        # Set up FFmpeg command for raw video extraction with time bounds
-        ffmpeg_cmd = [
-            "ffmpeg",
-            "-i",
-            path,
-        ]
-
-        # Add seek and duration options if specified
+        # Add seek option BEFORE input for more efficient seeking
         if start_second is not None:
             ffmpeg_cmd.extend(["-ss", str(start_second)])
+
+        ffmpeg_cmd.extend(["-i", path])
+
+        # Add duration AFTER input for more precise timing
         if end_second is not None and start_second is not None:
             duration = end_second - start_second
             ffmpeg_cmd.extend(["-t", str(duration)])
         elif end_second is not None:
             ffmpeg_cmd.extend(["-t", str(end_second)])
 
+        # Output format settings - removed problematic -vsync 0
         ffmpeg_cmd.extend(
             [
                 "-f",
                 "rawvideo",
                 "-pix_fmt",
                 "rgb24",
-                "-vsync",
-                "0",
                 "-vcodec",
                 "rawvideo",
+                "-avoid_negative_ts",
+                "make_zero",  # Handle timing issues
                 "-y",
                 "pipe:1",
             ]
         )
 
-        # Start FFmpeg process
+        # Start FFmpeg process with stderr redirected to avoid deadlock
        process = subprocess.Popen(
            ffmpeg_cmd,
            stdout=subprocess.PIPE,
-            stderr=subprocess.
-            bufsize=10**8,  # Use large buffer
+            stderr=subprocess.DEVNULL,  # Redirect stderr to avoid deadlock
+            bufsize=10**8,  # Use large buffer for efficient I/O
        )
 
         # Calculate frame size in bytes
         frame_size = width * height * 3  # 3 bytes per pixel for RGB
 
-        #
-
+        # Estimate frame count for pre-allocation
+        if start_second is not None and end_second is not None:
+            estimated_duration = end_second - start_second
+        elif end_second is not None:
+            estimated_duration = end_second
+        elif start_second is not None:
+            estimated_duration = total_duration - start_second
+        else:
+            estimated_duration = total_duration
+
+        # Add 10% buffer to handle frame rate variations and rounding
+        estimated_frames = int(estimated_duration * fps * 1.1) + 10
 
-        #
+        # Pre-allocate numpy array
+        frames = np.empty((estimated_frames, height, width, 3), dtype=np.uint8)
         frames_read = 0
-        for frame_idx in range(0, segment_frames, read_batch_size):
-            batch_end = min(frame_idx + read_batch_size, segment_frames)
-            batch_size = batch_end - frame_idx
 
-
-
-
-
+        try:
+            while frames_read < estimated_frames:
+                # Calculate remaining frames to read
+                remaining_frames = estimated_frames - frames_read
+                batch_size = min(read_batch_size, remaining_frames)
 
-
-
+                # Read batch of data
+                batch_data = process.stdout.read(frame_size * batch_size)  # type: ignore
 
-
-
-            if actual_frames > 0:
-                batch_frames = batch_frames[: actual_frames * height * width * 3]
-                batch_frames = batch_frames.reshape(-1, height, width, 3)
+                if not batch_data:
+                    break
 
-                #
-
-                frames[frame_idx:end_idx] = batch_frames
-                frames_read += actual_frames
-            else:
-                break
+                # Convert to numpy array
+                batch_frames = np.frombuffer(batch_data, dtype=np.uint8)
 
-
-
-        process.stderr.close()  # type: ignore
-        process.wait()
+                # Calculate how many complete frames we got
+                complete_frames = len(batch_frames) // (height * width * 3)
 
-
-
-            raise ValueError(f"FFmpeg error: {stderr_output}")
+                if complete_frames == 0:
+                    break
 
-
-
-
+                # Only keep complete frames
+                complete_data = batch_frames[: complete_frames * height * width * 3]
+                batch_frames_array = complete_data.reshape(complete_frames, height, width, 3)
+
+                # Check if we have room in pre-allocated array
+                if frames_read + complete_frames > estimated_frames:
+                    # Need to expand array - this should be rare with our buffer
+                    new_size = max(estimated_frames * 2, frames_read + complete_frames + 100)
+                    new_frames = np.empty((new_size, height, width, 3), dtype=np.uint8)
+                    new_frames[:frames_read] = frames[:frames_read]
+                    frames = new_frames
+                    estimated_frames = new_size
+
+                # Store batch in pre-allocated array
+                end_idx = frames_read + complete_frames
+                frames[frames_read:end_idx] = batch_frames_array
+                frames_read += complete_frames
+
+        finally:
+            # Ensure process is properly terminated
+            if process.poll() is None:
+                process.terminate()
+                try:
+                    process.wait(timeout=5)
+                except subprocess.TimeoutExpired:
+                    process.kill()
+                    process.wait()
+
+            # Clean up pipes
+            if process.stdout:
+                process.stdout.close()
+
+        # Check if FFmpeg had an error (non-zero return code)
+        if process.returncode not in (0, None) and frames_read == 0:
+            raise ValueError(f"FFmpeg failed to process video (return code: {process.returncode})")
+
+        if frames_read == 0:
+            raise ValueError("No frames were read from the video")
+
+        # Trim the pre-allocated array to actual frames read
+        frames = frames[:frames_read]  # type: ignore
 
         # Load audio for the specified segment
         try:
@@ -270,8 +298,8 @@ class Video:
             audio = audio.slice(start_seconds=audio_start, end_seconds=audio_end)
         except Exception:
             print(f"No audio found for `{path}`, adding silent track!")
-            # Create silent audio
-            segment_duration =
+            # Create silent audio based on actual frames read
+            segment_duration = frames_read / fps
             audio = Audio.create_silent(duration_seconds=round(segment_duration, 2), stereo=True, sample_rate=44100)
 
         return cls(frames=frames, fps=fps, audio=audio)
@@ -421,6 +449,20 @@ class Video:
             raise
 
     def add_audio(self, audio: Audio, overlay: bool = True) -> None:
+        video_duration = self.total_seconds
+        audio_duration = audio.metadata.duration_seconds
+
+        if audio_duration > video_duration:
+            audio = audio.slice(start_seconds=0, end_seconds=video_duration)
+        elif audio_duration < video_duration:
+            silence_duration = video_duration - audio_duration
+            silence = Audio.create_silent(
+                duration_seconds=silence_duration,
+                stereo=audio.metadata.channels == 2,
+                sample_rate=audio.metadata.sample_rate,
+            )
+            audio = audio.concat(silence)
+
         if self.audio.is_silent:
             self.audio = audio
         elif overlay:

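The rewritten `from_path` decoding loop seeks with `-ss` before the input, pipes raw RGB24 frames out of FFmpeg, and keeps only complete frames from each read. A simplified standalone sketch of that decoding approach, without the segment bounds or array pre-allocation (the input path and dimensions are assumptions):

```python
import subprocess

import numpy as np

path, width, height = "input.mp4", 1280, 720  # assumed file and frame dimensions
frame_size = width * height * 3  # bytes per RGB24 frame
batch_frames = 64  # frames requested per read

cmd = ["ffmpeg", "-i", path, "-f", "rawvideo", "-pix_fmt", "rgb24", "-vcodec", "rawvideo", "-y", "pipe:1"]
process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, bufsize=10**8)

chunks = []
try:
    while True:
        data = process.stdout.read(frame_size * batch_frames)
        if not data:
            break
        complete = len(data) // frame_size  # only keep whole frames
        if complete == 0:
            break
        batch = np.frombuffer(data, dtype=np.uint8)[: complete * frame_size]
        chunks.append(batch.reshape(complete, height, width, 3))
finally:
    # Mirror the diff's cleanup: terminate a still-running process and close the pipe.
    if process.poll() is None:
        process.terminate()
        process.wait()
    if process.stdout:
        process.stdout.close()

frames = np.concatenate(chunks) if chunks else np.empty((0, height, width, 3), dtype=np.uint8)
print(f"Decoded {len(frames)} frames")
```
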
{videopython-0.4.1.dist-info → videopython-0.5.0.dist-info}/METADATA CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: videopython
-Version: 0.4.1
+Version: 0.5.0
 Summary: Minimal video generation and processing library.
 Project-URL: Homepage, https://github.com/bartwojtowicz/videopython/
 Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
@@ -8,12 +8,13 @@ Project-URL: Documentation, https://github.com/bartwojtowicz/videopython/
 Author-email: Bartosz Wójtowicz <bartoszwojtowicz@outlook.com>, Bartosz Rudnikowicz <bartoszrudnikowicz840@gmail.com>, Piotr Pukisz <piotr.pukisz@gmail.com>
 License: Apache-2.0
 License-File: LICENSE
-Keywords: editing,generation,movie,opencv,python,video,videopython
+Keywords: ai,editing,generation,movie,opencv,python,shorts,video,videopython
 Classifier: License :: OSI Approved :: Apache Software License
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
 Requires-Python: <3.13,>=3.10
 Requires-Dist: numpy>=1.25.2
 Requires-Dist: opencv-python>=4.9.0.80
@@ -38,11 +39,11 @@ Description-Content-Type: text/markdown
 
 # About
 
-
+Videopython is a minimal video generation and processing library designed with short-form videos in mind, with focus on simplicity and ease of use for both humans and AI agents.
 
-
+# Setup
 
-
+## Install ffmpeg
 ```bash
 # Install with brew for MacOS:
 brew install ffmpeg
@@ -50,16 +51,22 @@ brew install ffmpeg
 sudo apt-get install ffmpeg
 ```
 
-
+## Install library
+
 ```bash
+# Install with your favourite package manager
+uv add videopython --extra ai
+
+# pip install works as well :)
 pip install videopython[ai]
 ```
-> You can install without `[ai]` dependencies for basic video handling and processing.
-> The funcionalities found in `videopython.ai` won't work.
 
-
+> You can install without `[ai]` dependencies for basic video handling and processing.
+> The functionalities found in `videopython.ai` won't work.
+
+# Usage examples
 
-
+## Basic video editing
 
 ```python
 from videopython.base.video import Video
@@ -90,6 +97,8 @@ video.add_audio_from_file("tests/test_data/test_audio.mp3")
 savepath = video.save()
 ```
 
+## AI powered examples
+
 ### Video Generation
 
 > Using Nvidia A40 or better is recommended for the `videopython.ai` module.
@@ -97,7 +106,6 @@ savepath = video.save()
 # Generate image and animate it
 from videopython.ai.generation import ImageToVideo
 from videopython.ai.generation import TextToImage
-from videopython.ai.generation import TextToMusic
 
 image = TextToImage().generate_image(prompt="Golden Retriever playing in the park")
 video = ImageToVideo().generate_video(image=image, fps=24)
@@ -105,27 +113,82 @@ video = ImageToVideo().generate_video(image=image, fps=24)
 # Video generation directly from prompt
 from videopython.ai.generation import TextToVideo
 video_gen = TextToVideo()
-video = video_gen.generate_video("Dogs playing in the
+video = video_gen.generate_video("Dogs playing in the park")
 for _ in range(10):
-    video += video_gen.generate_video("Dogs playing in the
-
-# Cut the first 2 seconds
-from videopython.base.transforms import CutSeconds
-transformed_video = CutSeconds(start_second=0, end_second=2).apply(video.copy())
-
-# Upsample to 30 FPS
-from videopython.base.transforms import ResampleFPS
-transformed_video = ResampleFPS(new_fps=30).apply(transformed_video)
+    video += video_gen.generate_video("Dogs playing in the park")
+```
 
-
-
-
+### Audio generation
+```python
+from videopython.base.video import Video
+video = Video.from_path("<PATH_TO_VIDEO>")
 
-#
-
+# Generate music on top of video
+from videopython.ai.generation import TextToMusic
 text_to_music = TextToMusic()
 audio = text_to_music.generate_audio("Happy dogs playing together in a park", max_new_tokens=256)
-
+video.add_audio(audio=audio)
+
+# Add TTS on top of video
+from videopython.ai.generation import TextToSpeech
+text_to_speech = TextToSpeech()
+audio = text_to_speech.generate_audio("Woof woof woof! Woooooof!")
+video.add_audio(audio=audio)
+```
+
+### Generate and overlay subtitles
+```python
+from videopython.base.video import Video
+video = Video.from_path("<PATH_TO_VIDEO>")
+
+# Generate transcription with timestamps
+from videopython.ai.understanding.transcribe import CreateTranscription
+transcription = CreateTranscription("base").transcribe(video)
+# Initialise object for overlaying. See `TranscriptionOverlay` to see detailed configuration options.
+from videopython.base.text.overlay import TranscriptionOverlay
+transcription_overlay = TranscriptionOverlay(font_filename="src/tests/test_data/test_font.ttf")
 
-
+video = transcription_overlay.apply(video, transcription)
+video.save()
+```
+
+# Development notes
+
+## Project structure
+
+Source code of the project can be found under `src/` directory, along with separate directories for unit tests and mypy stubs.
+```
+.
+└── src
+    ├── stubs # Contains stubs for mypy
+    ├── tests # Unit tests
+    └── videopython # Library code
+```
+
+----
+
+The `videopython` library is divided into 2 separate high-level modules:
+* `videopython.base`: Contains base classes for handling videos and for basic video editing. There are no imports from `videopython.ai` within the `base` module, which allows users to install light-weight base dependencies to do simple video operations.
+* `videopython.ai`: Contains AI-powered functionalities for video generation. It has its own `ai` dependency group, which contains all dependencies required to run AI models.
+
+## Running locally
+
+We are using [uv](https://docs.astral.sh/uv/) as project and package manager. Once you clone the repo and install uv locally, you can use it to sync the dependencies.
+```bash
+uv sync --all-extras
+```
+
+To run the unit tests, you can simply run:
+```bash
+uv run pytest
+```
+
+We also use [Ruff](https://docs.astral.sh/ruff/) for linting/formatting and [mypy](https://github.com/python/mypy) as type checker.
+```bash
+# Run formatting
+uv run ruff format
+# Run linting and apply fixes
+uv run ruff check --fix
+# Run type checks
+uv run mypy src/
 ```

{videopython-0.4.1.dist-info → videopython-0.5.0.dist-info}/RECORD CHANGED

@@ -6,21 +6,19 @@ videopython/ai/generation/audio.py,sha256=CNf6ZeV3iU4CU0Kq8HtDLwLPP2ABq9AGQD1TBO
 videopython/ai/generation/image.py,sha256=gS0zqzyIoCvjTjfks31ApG8lX0nUKXWRRgFGGLN4RjM,654
 videopython/ai/generation/video.py,sha256=206YON_XjPTYyjIJ3j5uBgd_yHmCDg7SqbkIU9GzEgw,1831
 videopython/ai/understanding/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-videopython/ai/understanding/transcribe.py,sha256=
+videopython/ai/understanding/transcribe.py,sha256=hm2f5Fm1O_tMrSmUlcUdl_rQRhc5Sz_kaV4tnJ4IxbQ,2557
 videopython/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 videopython/base/combine.py,sha256=XC_pzyhbIh6h0fmxX1LhhhtlmOBbUQX9Y4EtDJqQn8g,1900
-videopython/base/compose.py,sha256=pti12VY3Yg7TZZiENPF6veM8POWssfsK8ePDdGlhAhA,1968
 videopython/base/effects.py,sha256=1RbRLTQD0V26DBc4jbRCDI4eGr6-TyBdX-Ia2USKxmc,7554
 videopython/base/exceptions.py,sha256=68_16lUPOR9_zhWdeBGS8_NFI32VbrcoDbN5KHHg0_w,44
-videopython/base/transcription.py,sha256=FloqvY-OlBQPOCkPnSx6R7azn4smD5-JYd-pMNssuYw,196
 videopython/base/transforms.py,sha256=FDh-8EgQoZxB6Gv-T15kZGctcu9_4XHsTy_n7kgxlQw,5828
 videopython/base/transitions.py,sha256=P1bBsxugf5i0JEtx7MoRgxWSIDcBli-0QucRwBIFGqs,3687
-videopython/base/
-videopython/
-videopython/
-videopython/
-videopython/
-videopython-0.
-videopython-0.
-videopython-0.
-videopython-0.
+videopython/base/utils.py,sha256=bAwIagHvd1NWu8UYAsS-pDm38E4R8qRfeHvWk-O2__0,125
+videopython/base/video.py,sha256=RxKHmR39EEvBa5m2xFDNj4_mq213RUG3NQ_lhk5U-PA,20462
+videopython/base/text/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+videopython/base/text/overlay.py,sha256=EiBDSsnn2pSGeWGajblUxovcP_IdA6gk2zZ5rsjhdI8,44434
+videopython/base/text/transcription.py,sha256=9c3FRBr7RkialHhdfSwEX303QnIt1sCSiXoId9_DRkk,4246
+videopython-0.5.0.dist-info/METADATA,sha256=FTo8Bo3YLhp9bGTrctiehMMksQwecH1DN84JO5RydyU,6574
+videopython-0.5.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+videopython-0.5.0.dist-info/licenses/LICENSE,sha256=nJL9jVOt2MSW7swNDq4Y6oD_n9bLI0B0afr8ougtZ6s,10832
+videopython-0.5.0.dist-info/RECORD,,

videopython/base/compose.py DELETED

@@ -1,55 +0,0 @@
-from itertools import repeat
-from multiprocessing import Pool
-
-from videopython.base.transforms import TransformationPipeline
-from videopython.base.transitions import InstantTransition, Transition
-from videopython.base.video import Video
-
-
-class VideoComposer:
-    """
-    Composes multiple Videos into single video using selected transformations
-    on each video and applies transitions.
-    """
-
-    def __init__(
-        self,
-        transformation_pipeline: TransformationPipeline | None = None,
-        transition: Transition = InstantTransition(),
-    ):
-        """Initializes VideoComposer.
-
-        Args:
-            transformation_pipeline: Pipeline of transformations to apply on each video.
-            transition: Transition to apply between videos
-        """
-        self.transition = transition
-        self.transformation_pipeline = transformation_pipeline
-
-    def _apply_transformation(self, video: Video, transformation_pipeline: TransformationPipeline) -> Video:
-        return transformation_pipeline(video)
-
-    def compose(self, videos: list[Video]) -> Video:
-        # Apply transformation on each video using multiprocessing pool:
-        if self.transformation_pipeline:
-            transformed_videos = []
-            with Pool() as pool:
-                transformed_videos = pool.starmap(
-                    self._apply_transformation,
-                    zip(videos, repeat(self.transformation_pipeline)),
-                )
-            videos = transformed_videos
-
-        # Check if videos are compatible:
-        self._compatibility_check(videos)
-
-        # Apply transition:
-        final_video = videos.pop(0)
-        for _ in range(len(videos)):
-            final_video = self.transition.apply((final_video, videos.pop(0)))
-
-        return final_video
-
-    @staticmethod
-    def _compatibility_check(videos: list[Video]):
-        assert all([videos[0].metadata.can_be_merged_with(other_video.metadata) for other_video in videos])

videopython/utils/__init__.py DELETED
videopython/utils/common.py DELETED

@@ -1,31 +0,0 @@
-import time
-import uuid
-from pathlib import Path
-from typing import Callable
-
-
-def generate_random_name(suffix=".mp4"):
-    """Generates random name."""
-    return f"{uuid.uuid4()}{suffix}"
-
-
-def timeit(func: Callable):
-    """Decorator to measure execution time of a function."""
-
-    def timed(*args, **kwargs):
-        start = time.time()
-        result = func(*args, **kwargs)
-        end = time.time()
-        print(f"Execution time: {end - start:.3f} seconds.")
-        return result
-
-    return timed
-
-
-def check_path(path: str, dir_exists: bool = True, suffix: str | None = None) -> str:
-    fullpath = Path(path).resolve()
-    if dir_exists and not fullpath.parent.exists():
-        raise ValueError(f"Directory `{fullpath.parent}` does not exist!")
-    if suffix and suffix != fullpath.suffix:
-        raise ValueError(f"Required suffix `{suffix}` does not match the file suffix `{fullpath.suffix}`")
-    return str(fullpath)

videopython/utils/image.py DELETED

@@ -1,47 +0,0 @@
-from typing import Literal
-
-import cv2
-import numpy as np
-
-from videopython.base.video import Video
-
-
-class SlideOverImage:
-    def __init__(
-        self,
-        direction: Literal["left", "right"],
-        video_shape: tuple[int, int] = (1080, 1920),
-        fps: float = 24.0,
-        length_seconds: float = 1.0,
-    ) -> None:
-        self.direction = direction
-        self.video_width, self.video_height = video_shape
-        self.fps = fps
-        self.length_seconds = length_seconds
-
-    def apply(self, image: np.ndarray) -> Video:
-        image = self._resize(image)
-        max_offset = image.shape[1] - self.video_width
-        frame_count = round(self.fps * self.length_seconds)
-
-        deltas = np.linspace(0, max_offset, frame_count)
-        frames = []
-
-        for delta in deltas:
-            if self.direction == "right":
-                frame = image[:, round(delta) : round(delta) + self.video_width]
-            elif self.direction == "left":
-                frame = image[:, image.shape[1] - round(delta) - self.video_width : image.shape[1] - round(delta)]
-            frames.append(frame)
-
-        return Video.from_frames(frames=np.stack(frames, axis=0), fps=self.fps)
-
-    def _resize(self, image: np.ndarray) -> np.ndarray:
-        resize_factor = image.shape[0] / self.video_height
-        resize_dims = (round(image.shape[1] / resize_factor), round(image.shape[0] / resize_factor))  # width, height
-        image = cv2.resize(image, resize_dims)
-        if self.video_height > image.shape[0] or self.video_width > image.shape[1]:
-            raise ValueError(
-                f"Image `{image.shape}` is too small for the video frame `({self.video_width}, {self.video_height})`!"
-            )
-        return image

{videopython-0.4.1.dist-info → videopython-0.5.0.dist-info}/WHEEL — File without changes
{videopython-0.4.1.dist-info → videopython-0.5.0.dist-info}/licenses/LICENSE — File without changes