videopython 0.4.1__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of videopython might be problematic.

@@ -1,37 +1,66 @@
- from typing import Literal
+ from typing import Literal, Union

  import whisper
+ from soundpython import Audio

- from videopython.base.transcription import Transcription, TranscriptionSegment
+ from videopython.base.text.transcription import Transcription, TranscriptionSegment, TranscriptionWord
  from videopython.base.video import Video


- class VideoTranscription:
+ class CreateTranscription:
+ """Unified transcription service for both audio and video."""
+
  def __init__(self, model_name: Literal["tiny", "base", "small", "medium", "large", "turbo"] = "small") -> None:
  self.model = whisper.load_model(name=model_name)

- def transcribe_video(self, video: Video) -> Transcription:
- """Transcribes video to text.
+ def _process_transcription_result(self, transcription_result: dict) -> Transcription:
+ """Process raw transcription result into Transcription object.

  Args:
- video: Video to transcribe.
+ transcription_result: Raw result from whisper model

  Returns:
- List of dictionaries with segments of text and their start and end times.
+ Processed Transcription object
  """
- if video.audio.is_silent:
- return Transcription(segments=[])
+ transcription_segments = []
+ for segment in transcription_result["segments"]:
+ transcription_words = [
+ TranscriptionWord(word=word["word"], start=float(word["start"]), end=float(word["end"]))
+ for word in segment["words"]
+ ]
+ transcription_segment = TranscriptionSegment(
+ start=segment["start"], end=segment["end"], text=segment["text"], words=transcription_words
+ )
+ transcription_segments.append(transcription_segment)
+
+ return Transcription(segments=transcription_segments)
+
+ def transcribe(self, media: Union[Audio, Video]) -> Transcription:
+ """Transcribe audio or video to text.
+
+ Args:
+ media: Audio or Video to transcribe.
+
+ Returns:
+ Transcription object with segments of text and their timestamps.
+ """
+ if isinstance(media, Video):
+ # Handle video transcription
+ if media.audio.is_silent:
+ return Transcription(segments=[])
+
+ audio = media.audio.to_mono().resample(whisper.audio.SAMPLE_RATE)
+ transcription_result = self.model.transcribe(audio=audio.data, word_timestamps=True)

- audio = video.audio.to_mono()
- audio = audio.resample(whisper.audio.SAMPLE_RATE)
- audio_data = audio.data
+ elif isinstance(media, Audio):
+ # Handle audio transcription
+ if media.is_silent:
+ return Transcription(segments=[])

- transcription = self.model.transcribe(audio=audio_data, word_timestamps=True)
+ audio = media.to_mono().resample(whisper.audio.SAMPLE_RATE)
+ transcription_result = self.model.transcribe(audio=audio.data, word_timestamps=True)

- transcription_segments = [
- TranscriptionSegment(start=segment["start"], end=segment["end"], text=segment["text"])
- for segment in transcription["segments"]
- ]
- result = Transcription(segments=transcription_segments)
+ else:
+ raise TypeError(f"Unsupported media type: {type(media)}. Expected Audio or Video.")

- return result
+ return self._process_transcription_result(transcription_result)
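
For orientation, here is a minimal sketch of how the unified API introduced above might be called (import paths taken from the README and RECORD further down in this diff; the video path is a placeholder):

```python
from videopython.base.video import Video
from videopython.ai.understanding.transcribe import CreateTranscription

# The same transcriber now accepts either a Video or a soundpython Audio object.
video = Video.from_path("clip.mp4")  # placeholder path
transcription = CreateTranscription(model_name="base").transcribe(video)

for segment in transcription.segments:
    print(f"[{segment.start:.2f}s - {segment.end:.2f}s] {segment.text}")
```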
@@ -1,10 +1,21 @@
+ """
+ Beware, the code below was heavily "vibe-coded".
+
+ The main purpose of this file are 2 classes:
+ 1. `ImageText` class for creating RGBA image with rendered subtitles
+ 2. `TranscriptionOverlay` class, which takes the `Transcription` and `Video` objects and overlays subtitles on `Video`.
+ """
+
  from enum import Enum
  from typing import TypeAlias, Union

  import numpy as np
  from PIL import Image, ImageDraw, ImageFont
+ from tqdm import tqdm

  from videopython.base.exceptions import OutOfBoundsError
+ from videopython.base.text.transcription import Transcription, TranscriptionSegment
+ from videopython.base.video import Video

  # Type aliases for clarity
  MarginType: TypeAlias = Union[int, tuple[int, int, int, int]]
@@ -319,6 +330,7 @@ class ImageText:
  font_filename: str,
  xy: PositionType,
  font_size: int | None = 11,
+ font_border_size: int = 0,
  color: RGBColor = (0, 0, 0),
  max_width: int | None = None,
  max_height: int | None = None,
@@ -333,6 +345,7 @@ class ImageText:
  font_filename: Path to the font file
  xy: Position (x,y) either as absolute pixels (int) or relative to frame (float 0-1)
  font_size: Size of the font in points, or None to auto-calculate
+ font_border_size: Size of border around text in pixels (0 for no border)
  color: RGB color of the text
  max_width: Maximum width for auto font sizing
  max_height: Maximum height for auto font sizing
@@ -355,6 +368,9 @@ class ImageText:
  if font_size is not None and font_size <= 0:
  raise ValueError("Font size must be positive")

+ if font_border_size < 0:
+ raise ValueError("Font border size cannot be negative")
+
  if font_size is None and (max_width is None or max_height is None):
  raise ValueError("Must set either `font_size`, or both `max_width` and `max_height`!")
  elif font_size is None:
@@ -371,6 +387,15 @@ class ImageText:
  if x < 0 or y < 0 or x + text_dimensions[0] > self.image_size[0] or y + text_dimensions[1] > self.image_size[1]:
  raise OutOfBoundsError(f"Text with size {text_dimensions} at position ({x}, {y}) is out of bounds!")

+ # Draw border if requested
+ if font_border_size > 0:
+ # Draw text border by drawing text in multiple positions around the main text
+ for border_x in range(-font_border_size, font_border_size + 1):
+ for border_y in range(-font_border_size, font_border_size + 1):
+ if border_x != 0 or border_y != 0: # Skip the center position
+ self._draw.text((x + border_x, y + border_y), text, font=font, fill=(0, 0, 0))
+
+ # Draw the main text on top
  self._draw.text((x, y), text, font=font, fill=color)
  return text_dimensions

@@ -423,6 +448,46 @@ class ImageText:
  except Exception as e:
  raise ValueError(f"Error measuring text: {str(e)}")

+ def _get_font_baseline_offset(
+ self, base_font_filename: str, base_font_size: int, highlight_font_filename: str, highlight_font_size: int
+ ) -> int:
+ """
+ Calculate the vertical offset needed to align baselines of different fonts and sizes.
+
+ Args:
+ base_font_filename: Path to the base font file
+ base_font_size: Font size of normal text
+ highlight_font_filename: Path to the highlight font file
+ highlight_font_size: Font size of highlighted text
+
+ Returns:
+ Vertical offset in pixels to align highlighted text baseline with normal text baseline
+ """
+ base_font = self._get_font(base_font_filename, base_font_size)
+ highlight_font = self._get_font(highlight_font_filename, highlight_font_size)
+
+ # Use a reference character to get baseline metrics
+ # We use 'A' as it's a good reference for ascender height
+ ref_char = "A"
+
+ # Get bounding boxes for the reference character
+ base_bbox = base_font.getbbox(ref_char)
+ highlight_bbox = highlight_font.getbbox(ref_char)
+
+ if base_bbox is None or highlight_bbox is None:
+ return 0 # Fallback if bbox calculation fails
+
+ # The baseline offset is the difference in the top of the bounding box
+ # since getbbox returns (left, top, right, bottom) where top is negative for ascenders
+ base_ascent = -base_bbox[1] # Distance from baseline to top of character
+ highlight_ascent = -highlight_bbox[1] # Distance from baseline to top of character
+
+ # Calculate the offset needed to align baselines
+ # If highlighted text has a larger ascent, we need to move it down
+ baseline_offset = highlight_ascent - base_ascent
+
+ return baseline_offset
+
  def _split_lines_by_width(
  self,
  text: str,
@@ -499,12 +564,18 @@ class ImageText:
  xy: PositionType,
  box_width: Union[int, float] | None = None,
  font_size: int = 11,
+ font_border_size: int = 0,
  text_color: RGBColor = (0, 0, 0),
  background_color: RGBAColor | None = None,
  background_padding: int = 0,
  place: TextAlign = TextAlign.LEFT,
  anchor: AnchorPoint = AnchorPoint.TOP_LEFT,
  margin: MarginType = 0,
+ words: list[str] | None = None,
+ highlight_word_index: int | None = None,
+ highlight_color: RGBColor | None = None,
+ highlight_size_multiplier: float = 1.5,
+ highlight_bold_font: str | None = None,
  ) -> tuple[int, int]:
  """
  Write text in a box with advanced positioning and alignment options.
@@ -515,12 +586,18 @@ class ImageText:
  xy: Position (x,y) either as absolute pixels (int) or relative to frame (float 0-1)
  box_width: Width of the box in pixels (int) or relative to frame width (float 0-1)
  font_size: Font size in points
+ font_border_size: Size of border around text in pixels (0 for no border)
  text_color: RGB color of the text
  background_color: If set, adds background color to the text box. Expects RGBA values.
  background_padding: Number of padding pixels to add when adding text background color
  place: Text alignment within the box (TextAlign.LEFT, TextAlign.RIGHT, TextAlign.CENTER)
  anchor: Which part of the text box to anchor at the position
  margin: Margin in pixels (single value or [top, right, bottom, left])
+ words: All words occuring in text, helpful for highlighting.
+ highlight_word_index: Index of word to highlight (0-based, None to disable highlighting)
+ highlight_color: RGB color for the highlighted word (defaults to text_color if None)
+ highlight_size_multiplier: Font size multiplier for highlighted word
+ highlight_bold_font: Path to bold font file for highlighted word (defaults to font_filename if None)

  Returns:
  Coordinates of the lower-right corner of the written text box (x, y)
@@ -541,6 +618,25 @@ class ImageText:
  if background_padding < 0:
  raise ValueError("Background padding cannot be negative")

+ if font_border_size < 0:
+ raise ValueError("Font border size cannot be negative")
+
+ # Validate highlighting parameters
+ if highlight_word_index is not None:
+ if not words:
+ words = text.split()
+ if highlight_word_index < 0 or highlight_word_index >= len(words):
+ raise ValueError(
+ f"highlight_word_index {highlight_word_index} out of range for text with {len(words)} words"
+ )
+
+ if highlight_size_multiplier <= 0:
+ raise ValueError("highlight_size_multiplier must be positive")
+
+ # Set default highlight color if not provided
+ if highlight_word_index is not None and highlight_color is None:
+ highlight_color = text_color
+
  # Process margins to determine available area
  margin_top, margin_right, margin_bottom, margin_left = self._process_margin(margin)
  available_width = self.image_size[0] - margin_left - margin_right
@@ -590,6 +686,7 @@ class ImageText:

  # Write lines
  current_text_height = y_pos
+ word_index_offset = 0 # Track global word index across lines
  for line in lines:
  line_dimensions = self.get_text_dimensions(font_filename, font_size, line)

@@ -604,14 +701,49 @@ class ImageText:
  valid_places = [e.value for e in TextAlign]
  raise ValueError(f"Place '{place}' is not supported. Must be one of: {', '.join(valid_places)}")

- # Write the line
- self.write_text(
- text=line,
- font_filename=font_filename,
- xy=(x_left, current_text_height),
- font_size=font_size,
- color=text_color,
- )
+ # Check if highlighting is needed for this line
+ if highlight_word_index is not None:
+ line_words = line.split()
+ line_start_word_index = word_index_offset
+ line_end_word_index = word_index_offset + len(line_words) - 1
+
+ # Check if the highlighted word is in this line
+ if line_start_word_index <= highlight_word_index <= line_end_word_index:
+ self._write_line_with_highlight(
+ line=line,
+ font_filename=font_filename,
+ font_size=font_size,
+ font_border_size=font_border_size,
+ text_color=text_color,
+ highlight_color=highlight_color or (255, 255, 255),
+ highlight_size_multiplier=highlight_size_multiplier,
+ highlight_word_local_index=highlight_word_index - line_start_word_index,
+ highlight_bold_font=highlight_bold_font,
+ x_left=int(x_left),
+ y_top=int(current_text_height),
+ )
+ else:
+ # Write normal line without highlighting
+ self.write_text(
+ text=line,
+ font_filename=font_filename,
+ xy=(x_left, current_text_height),
+ font_size=font_size,
+ font_border_size=font_border_size,
+ color=text_color,
+ )
+
+ word_index_offset += len(line_words)
+ else:
+ # Write normal line without highlighting
+ self.write_text(
+ text=line,
+ font_filename=font_filename,
+ xy=(x_left, current_text_height),
+ font_size=font_size,
+ font_border_size=font_border_size,
+ color=text_color,
+ )

  # Increment vertical position for next line
  current_text_height += line_dimensions[1]
@@ -690,6 +822,88 @@ class ImageText:

  return (int(x_pos + box_width), int(current_text_height))

+ def _write_line_with_highlight(
+ self,
+ line: str,
+ font_filename: str,
+ font_size: int,
+ font_border_size: int,
+ text_color: RGBColor,
+ highlight_color: RGBColor,
+ highlight_size_multiplier: float,
+ highlight_word_local_index: int,
+ highlight_bold_font: str | None,
+ x_left: int,
+ y_top: int,
+ ) -> None:
+ """
+ Write a line of text with one word highlighted using word-by-word rendering with baseline alignment.
+
+ Args:
+ line: The text line to render
+ font_filename: Path to the font file
+ font_size: Base font size in points
+ font_border_size: Size of border around text in pixels (0 for no border)
+ text_color: RGB color for normal text
+ highlight_color: RGB color for highlighted word
+ highlight_size_multiplier: Font size multiplier for highlighted word
+ highlight_word_local_index: Index of word to highlight within this line (0-based)
+ highlight_bold_font: Path to bold font file for highlighted word (defaults to font_filename if None)
+ x_left: Left x position for the line
+ y_top: Top y position for the line
+ """
+ # Split line into words
+ words = line.split()
+ if highlight_word_local_index >= len(words):
+ return # Safety check
+
+ # Calculate highlighted font size and determine font files
+ highlight_font_size = int(font_size * highlight_size_multiplier)
+ highlight_font_file = highlight_bold_font if highlight_bold_font is not None else font_filename
+
+ # Calculate baseline offset for highlighted words (using the appropriate font files)
+ baseline_offset = self._get_font_baseline_offset(
+ font_filename, font_size, highlight_font_file, highlight_font_size
+ )
+
+ # Render words one by one with proper spacing
+ current_x = x_left
+
+ for i, word in enumerate(words):
+ # Determine if this is the highlighted word
+ is_highlighted = i == highlight_word_local_index
+
+ # Choose font file, size, and color based on highlighting
+ word_font_file = highlight_font_file if is_highlighted else font_filename
+ word_font_size = highlight_font_size if is_highlighted else font_size
+ word_color = highlight_color if is_highlighted else text_color
+
+ # Calculate y position with baseline alignment
+ word_y = y_top
+ if is_highlighted:
+ word_y += baseline_offset
+
+ # Render the word
+ self.write_text(
+ text=word,
+ font_filename=word_font_file,
+ xy=(current_x, word_y),
+ font_size=word_font_size,
+ font_border_size=font_border_size,
+ color=word_color,
+ )
+
+ # Calculate the width of this word for spacing
+ word_width = self.get_text_dimensions(word_font_file, word_font_size, word)[0]
+
+ # Update current_x for next word (add word width plus space)
+ current_x += word_width
+
+ # Add space between words (except after the last word)
+ if i < len(words) - 1:
+ space_width = self.get_text_dimensions(font_filename, font_size, " ")[0]
+ current_x += space_width
+
  def _find_smallest_bounding_rect(self, mask: np.ndarray) -> tuple[int, int, int, int]:
  """
  Find the smallest bounding rectangle containing non-zero values in the mask.
@@ -725,3 +939,164 @@ class ImageText:
  xmin, xmax = col_indices[[0, -1]]

  return xmin, xmax, ymin, ymax
+
+
+ class TranscriptionOverlay:
+ def __init__(
+ self,
+ font_filename: str,
+ font_size: int = 40,
+ font_border_size: int = 2,
+ text_color: RGBColor = (255, 235, 59),
+ background_color: RGBAColor | None = (0, 0, 0, 100),
+ background_padding: int = 15,
+ position: PositionType = (0.5, 0.7),
+ box_width: Union[int, float] = 0.6,
+ text_align: TextAlign = TextAlign.CENTER,
+ anchor: AnchorPoint = AnchorPoint.CENTER,
+ margin: MarginType = 20,
+ highlight_color: RGBColor = (76, 175, 80),
+ highlight_size_multiplier: float = 1.2,
+ highlight_bold_font: str | None = None,
+ ):
+ """
+ Initialize TranscriptionOverlay effect.
+
+ Args:
+ font_filename: Path to font file for text rendering
+ font_size: Base font size for text
+ text_color: RGB color for normal text
+ font_border_size: Size of border around text in pixels (0 for no border)
+ background_color: RGBA background color (None for no background)
+ background_padding: Padding around text background
+ position: Position of text box (relative 0-1 or absolute pixels)
+ box_width: Width of text box (relative 0-1 or absolute pixels)
+ text_align: Text alignment within box
+ anchor: Anchor point for text positioning
+ margin: Margin around text box
+ highlight_color: RGB color for highlighted words
+ highlight_size_multiplier: Size multiplier for highlighted words
+ highlight_bold_font: Optional bold font for highlighting
+ """
+ self.font_filename = font_filename
+ self.font_size = font_size
+ self.text_color = text_color
+ self.font_border_size = font_border_size
+ self.background_color = background_color
+ self.background_padding = background_padding
+ self.position = position
+ self.box_width = box_width
+ self.text_align = text_align
+ self.anchor = anchor
+ self.margin = margin
+ self.highlight_color = highlight_color
+ self.highlight_size_multiplier = highlight_size_multiplier
+ self.highlight_bold_font = highlight_bold_font
+
+ # Cache for text overlays to avoid regenerating identical frames
+ self._overlay_cache: dict[tuple[str, int | None], np.ndarray] = {}
+
+ def _get_active_segment(self, transcription: Transcription, timestamp: float) -> TranscriptionSegment | None:
+ """Get the transcription segment active at the given timestamp."""
+ for segment in transcription.segments:
+ if segment.start <= timestamp <= segment.end:
+ return segment
+ return None
+
+ def _get_active_word_index(self, segment: TranscriptionSegment, timestamp: float) -> int | None:
+ """Get the index of the word being spoken at the given timestamp within a segment."""
+ for i, word in enumerate(segment.words):
+ if word.start <= timestamp <= word.end:
+ return i
+ return None
+
+ def _create_text_overlay(
+ self, video_shape: tuple[int, int, int], segment: TranscriptionSegment, highlight_word_index: int | None
+ ) -> np.ndarray:
+ """Create a text overlay image for the given segment and highlight."""
+ # Use video frame dimensions for overlay
+ height, width = video_shape[:2]
+
+ # Create cache key based on segment text and highlight
+ cache_key = (segment.text, highlight_word_index)
+ if cache_key in self._overlay_cache:
+ return self._overlay_cache[cache_key]
+
+ # Create ImageText with video dimensions
+ img_text = ImageText(image_size=(width, height), background=(0, 0, 0, 0))
+
+ # Write text with highlighting
+ img_text.write_text_box(
+ text=segment.text,
+ font_filename=self.font_filename,
+ xy=self.position,
+ box_width=self.box_width,
+ font_size=self.font_size,
+ font_border_size=self.font_border_size,
+ text_color=self.text_color,
+ background_color=self.background_color,
+ background_padding=self.background_padding,
+ place=self.text_align,
+ anchor=self.anchor,
+ margin=self.margin,
+ words=[w.word for w in segment.words],
+ highlight_word_index=highlight_word_index,
+ highlight_color=self.highlight_color,
+ highlight_size_multiplier=self.highlight_size_multiplier,
+ highlight_bold_font=self.highlight_bold_font,
+ )
+
+ overlay_image = img_text.img_array
+
+ # Cache the overlay
+ self._overlay_cache[cache_key] = overlay_image
+
+ return overlay_image
+
+ def apply(self, video: Video, transcription: Transcription) -> Video:
+ """Apply transcription overlay to video frames."""
+ print("Applying transcription overlay...")
+
+ new_frames = []
+
+ for frame_idx, frame in enumerate(tqdm(video.frames)):
+ # Calculate timestamp for this frame
+ timestamp = frame_idx / video.fps
+
+ # Get active segment at this timestamp
+ active_segment = self._get_active_segment(transcription, timestamp)
+
+ if active_segment is None:
+ # No active transcription, keep original frame
+ new_frames.append(frame)
+ continue
+
+ # Get active word index for highlighting
+ highlight_word_index = self._get_active_word_index(active_segment, timestamp)
+
+ # Create text overlay
+ text_overlay = self._create_text_overlay(video.frame_shape, active_segment, highlight_word_index)
+
+ # Apply overlay to frame
+ overlaid_frame = self._apply_overlay_to_frame(frame, text_overlay)
+ new_frames.append(overlaid_frame)
+
+ # Create new video with overlaid frames
+ new_video = Video.from_frames(np.array(new_frames), fps=video.fps)
+ new_video.audio = video.audio # Preserve audio
+
+ return new_video
+
+ def _apply_overlay_to_frame(self, frame: np.ndarray, overlay: np.ndarray) -> np.ndarray:
+ """Apply a text overlay to a single frame."""
+
+ # Convert frame to PIL Image
+ frame_pil = Image.fromarray(frame)
+
+ # Convert overlay to PIL Image
+ overlay_pil = Image.fromarray(overlay)
+
+ # Paste overlay onto frame using alpha channel
+ frame_pil.paste(overlay_pil, (0, 0), overlay_pil)
+
+ return np.array(frame_pil)
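
A rough sketch of the new per-word highlighting parameters on `ImageText.write_text_box`, assuming `ImageText`, `TextAlign`, and `AnchorPoint` are importable from `videopython.base.text.overlay` as the RECORD below suggests (the font path, colors, and sizes are placeholder values):

```python
from videopython.base.text.overlay import AnchorPoint, ImageText, TextAlign

# Transparent RGBA canvas sized like a vertical 1080x1920 frame.
img = ImageText(image_size=(1080, 1920), background=(0, 0, 0, 0))

img.write_text_box(
    text="Dogs playing in the park",
    font_filename="assets/font.ttf",  # placeholder font path
    xy=(0.5, 0.7),
    box_width=0.6,
    font_size=40,
    font_border_size=2,
    text_color=(255, 235, 59),
    place=TextAlign.CENTER,
    anchor=AnchorPoint.CENTER,
    highlight_word_index=1,            # enlarge and recolor the word "playing"
    highlight_color=(76, 175, 80),
    highlight_size_multiplier=1.2,
)
```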
@@ -0,0 +1,121 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+
+
+ @dataclass
+ class TranscriptionWord:
+ start: float
+ end: float
+ word: str
+
+
+ @dataclass
+ class TranscriptionSegment:
+ start: float
+ end: float
+ text: str
+ words: list[TranscriptionWord]
+
+
+ @dataclass
+ class Transcription:
+ segments: list[TranscriptionSegment]
+
+ def offset(self, time: float) -> Transcription:
+ """Return a new Transcription with all timings offset by the provided time value."""
+ offset_segments = []
+
+ for segment in self.segments:
+ offset_words = []
+ for word in segment.words:
+ offset_words.append(TranscriptionWord(start=word.start + time, end=word.end + time, word=word.word))
+
+ offset_segments.append(
+ TranscriptionSegment(
+ start=segment.start + time, end=segment.end + time, text=segment.text, words=offset_words
+ )
+ )
+
+ return Transcription(segments=offset_segments)
+
+ def standardize_segments(self, *, time: float | None = None, num_words: int | None = None) -> Transcription:
+ """Return a new Transcription with standardized segments.
+
+ Args:
+ time: Maximum duration in seconds for each segment
+ num_words: Exact number of words per segment
+
+ Raises:
+ ValueError: If both time and num_words are provided or if neither is provided
+ """
+ if (time is None) == (num_words is None):
+ raise ValueError("Exactly one of 'time' or 'num_words' must be provided")
+
+ if time is not None and time <= 0:
+ raise ValueError("Time must be positive")
+
+ if num_words is not None and num_words <= 0:
+ raise ValueError("Number of words must be positive")
+
+ # Collect all words from all segments
+ all_words = []
+ for segment in self.segments:
+ all_words.extend(segment.words)
+
+ if not all_words:
+ return Transcription(segments=[])
+
+ standardized_segments = []
+
+ if time is not None:
+ # Group words by time constraint
+ current_words = []
+ current_start = None
+
+ for word in all_words:
+ if current_start is None:
+ current_start = word.start
+ current_words = [word]
+ elif word.end - current_start <= time:
+ current_words.append(word)
+ else:
+ # Create segment from current words
+ if current_words:
+ segment_text = " ".join(w.word for w in current_words)
+ standardized_segments.append(
+ TranscriptionSegment(
+ start=current_start,
+ end=current_words[-1].end,
+ text=segment_text,
+ words=current_words.copy(),
+ )
+ )
+
+ # Start new segment
+ current_start = word.start
+ current_words = [word]
+
+ # Add final segment
+ if current_words:
+ segment_text = " ".join(w.word for w in current_words)
+ standardized_segments.append(
+ TranscriptionSegment(
+ start=current_start, # type: ignore
+ end=current_words[-1].end,
+ text=segment_text,
+ words=current_words.copy(),
+ )
+ )
+ elif num_words is not None:
+ # Group words by word count constraint
+ for i in range(0, len(all_words), num_words):
+ segment_words = all_words[i : i + num_words]
+ segment_text = " ".join(w.word for w in segment_words)
+ standardized_segments.append(
+ TranscriptionSegment(
+ start=segment_words[0].start, end=segment_words[-1].end, text=segment_text, words=segment_words
+ )
+ )
+
+ return Transcription(segments=standardized_segments)
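
A small usage sketch of the new transcription dataclasses added above (the word timings are made up for illustration):

```python
from videopython.base.text.transcription import (
    Transcription,
    TranscriptionSegment,
    TranscriptionWord,
)

words = [
    TranscriptionWord(start=0.0, end=0.4, word="Dogs"),
    TranscriptionWord(start=0.4, end=0.9, word="playing"),
    TranscriptionWord(start=0.9, end=1.2, word="in"),
    TranscriptionWord(start=1.2, end=1.5, word="the"),
    TranscriptionWord(start=1.5, end=2.0, word="park"),
]
transcription = Transcription(
    segments=[TranscriptionSegment(start=0.0, end=2.0, text="Dogs playing in the park", words=words)]
)

# Re-chunk into two-word segments, then shift all timings 5 seconds later.
shifted = transcription.standardize_segments(num_words=2).offset(5.0)
print([(round(s.start, 1), s.text) for s in shifted.segments])
# [(5.0, 'Dogs playing'), (5.9, 'in the'), (6.5, 'park')]
```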
@@ -0,0 +1,6 @@
+ import uuid
+
+
+ def generate_random_name(suffix=".mp4"):
+ """Generates random name."""
+ return f"{uuid.uuid4()}{suffix}"
videopython/base/video.py CHANGED
@@ -11,7 +11,7 @@ from typing import Literal, get_args
  import numpy as np
  from soundpython import Audio

- from videopython.utils.common import generate_random_name
+ from videopython.base.utils import generate_random_name

  ALLOWED_VIDEO_FORMATS = Literal["mp4", "avi", "mov", "mkv", "webm"]

@@ -155,7 +155,6 @@ class Video:
  width = metadata.width
  height = metadata.height
  fps = metadata.fps
- total_frames = metadata.frame_count
  total_duration = metadata.total_seconds

  # Validate time bounds
@@ -166,99 +165,128 @@ class Video:
  if start_second is not None and end_second is not None and start_second >= end_second:
  raise ValueError("start_second must be less than end_second")

- # Calculate frame indices for the desired segment
- start_frame = int(start_second * fps) if start_second is not None else 0
- end_frame = int(end_second * fps) if end_second is not None else total_frames
+ # Build FFmpeg command with improved segment handling
+ ffmpeg_cmd = ["ffmpeg"]

- # Ensure we don't exceed bounds
- start_frame = max(0, start_frame)
- end_frame = min(total_frames, end_frame)
- segment_frames = end_frame - start_frame
-
- # Set up FFmpeg command for raw video extraction with time bounds
- ffmpeg_cmd = [
- "ffmpeg",
- "-i",
- path,
- ]
-
- # Add seek and duration options if specified
+ # Add seek option BEFORE input for more efficient seeking
  if start_second is not None:
  ffmpeg_cmd.extend(["-ss", str(start_second)])
+
+ ffmpeg_cmd.extend(["-i", path])
+
+ # Add duration AFTER input for more precise timing
  if end_second is not None and start_second is not None:
  duration = end_second - start_second
  ffmpeg_cmd.extend(["-t", str(duration)])
  elif end_second is not None:
  ffmpeg_cmd.extend(["-t", str(end_second)])

+ # Output format settings - removed problematic -vsync 0
  ffmpeg_cmd.extend(
  [
  "-f",
  "rawvideo",
  "-pix_fmt",
  "rgb24",
- "-vsync",
- "0",
  "-vcodec",
  "rawvideo",
+ "-avoid_negative_ts",
+ "make_zero", # Handle timing issues
  "-y",
  "pipe:1",
  ]
  )

- # Start FFmpeg process
+ # Start FFmpeg process with stderr redirected to avoid deadlock
  process = subprocess.Popen(
  ffmpeg_cmd,
  stdout=subprocess.PIPE,
- stderr=subprocess.PIPE,
- bufsize=10**8, # Use large buffer
+ stderr=subprocess.DEVNULL, # Redirect stderr to avoid deadlock
+ bufsize=10**8, # Use large buffer for efficient I/O
  )

  # Calculate frame size in bytes
  frame_size = width * height * 3 # 3 bytes per pixel for RGB

- # Pre-allocate numpy array for segment frames
- frames = np.empty((segment_frames, height, width, 3), dtype=np.uint8)
+ # Estimate frame count for pre-allocation
+ if start_second is not None and end_second is not None:
+ estimated_duration = end_second - start_second
+ elif end_second is not None:
+ estimated_duration = end_second
+ elif start_second is not None:
+ estimated_duration = total_duration - start_second
+ else:
+ estimated_duration = total_duration
+
+ # Add 10% buffer to handle frame rate variations and rounding
+ estimated_frames = int(estimated_duration * fps * 1.1) + 10

- # Read frames in batches
+ # Pre-allocate numpy array
+ frames = np.empty((estimated_frames, height, width, 3), dtype=np.uint8)
  frames_read = 0
- for frame_idx in range(0, segment_frames, read_batch_size):
- batch_end = min(frame_idx + read_batch_size, segment_frames)
- batch_size = batch_end - frame_idx

- # Read batch of frames
- raw_data = process.stdout.read(frame_size * batch_size) # type: ignore
- if not raw_data:
- break
+ try:
+ while frames_read < estimated_frames:
+ # Calculate remaining frames to read
+ remaining_frames = estimated_frames - frames_read
+ batch_size = min(read_batch_size, remaining_frames)

- # Convert raw bytes to numpy array and reshape
- batch_frames = np.frombuffer(raw_data, dtype=np.uint8)
+ # Read batch of data
+ batch_data = process.stdout.read(frame_size * batch_size) # type: ignore

- # Handle case where we might get fewer frames than expected
- actual_frames = len(batch_frames) // (height * width * 3)
- if actual_frames > 0:
- batch_frames = batch_frames[: actual_frames * height * width * 3]
- batch_frames = batch_frames.reshape(-1, height, width, 3)
+ if not batch_data:
+ break

- # Store batch in pre-allocated array
- end_idx = frame_idx + actual_frames
- frames[frame_idx:end_idx] = batch_frames
- frames_read += actual_frames
- else:
- break
+ # Convert to numpy array
+ batch_frames = np.frombuffer(batch_data, dtype=np.uint8)

- # Clean up FFmpeg process
- process.stdout.close() # type: ignore
- process.stderr.close() # type: ignore
- process.wait()
+ # Calculate how many complete frames we got
+ complete_frames = len(batch_frames) // (height * width * 3)

- if process.returncode != 0:
- stderr_output = process.stderr.read().decode() if process.stderr else "Unknown error"
- raise ValueError(f"FFmpeg error: {stderr_output}")
+ if complete_frames == 0:
+ break

- # Trim frames array if we read fewer frames than expected
- if frames_read < segment_frames:
- frames = frames[:frames_read] # type: ignore[assignment]
+ # Only keep complete frames
+ complete_data = batch_frames[: complete_frames * height * width * 3]
+ batch_frames_array = complete_data.reshape(complete_frames, height, width, 3)
+
+ # Check if we have room in pre-allocated array
+ if frames_read + complete_frames > estimated_frames:
+ # Need to expand array - this should be rare with our buffer
+ new_size = max(estimated_frames * 2, frames_read + complete_frames + 100)
+ new_frames = np.empty((new_size, height, width, 3), dtype=np.uint8)
+ new_frames[:frames_read] = frames[:frames_read]
+ frames = new_frames
+ estimated_frames = new_size
+
+ # Store batch in pre-allocated array
+ end_idx = frames_read + complete_frames
+ frames[frames_read:end_idx] = batch_frames_array
+ frames_read += complete_frames
+
+ finally:
+ # Ensure process is properly terminated
+ if process.poll() is None:
+ process.terminate()
+ try:
+ process.wait(timeout=5)
+ except subprocess.TimeoutExpired:
+ process.kill()
+ process.wait()
+
+ # Clean up pipes
+ if process.stdout:
+ process.stdout.close()
+
+ # Check if FFmpeg had an error (non-zero return code)
+ if process.returncode not in (0, None) and frames_read == 0:
+ raise ValueError(f"FFmpeg failed to process video (return code: {process.returncode})")
+
+ if frames_read == 0:
+ raise ValueError("No frames were read from the video")
+
+ # Trim the pre-allocated array to actual frames read
+ frames = frames[:frames_read] # type: ignore

  # Load audio for the specified segment
  try:
@@ -270,8 +298,8 @@ class Video:
  audio = audio.slice(start_seconds=audio_start, end_seconds=audio_end)
  except Exception:
  print(f"No audio found for `{path}`, adding silent track!")
- # Create silent audio for the segment duration
- segment_duration = len(frames) / fps
+ # Create silent audio based on actual frames read
+ segment_duration = frames_read / fps
  audio = Audio.create_silent(duration_seconds=round(segment_duration, 2), stereo=True, sample_rate=44100)

  return cls(frames=frames, fps=fps, audio=audio)
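
The hunk above reworks segment loading so FFmpeg seeks before the input and frames are read until the pipe is exhausted. A sketch of the intended call pattern, assuming `start_second` and `end_second` are exposed as keyword arguments of `Video.from_path` as the validation above suggests (the path is a placeholder):

```python
from videopython.base.video import Video

# Decode only the 10s-20s window: the -ss seek happens before -i, so the whole
# file is not decoded, and the frame buffer is sized from the estimated
# duration instead of a possibly unreliable container frame count.
video = Video.from_path("long_clip.mp4", start_second=10.0, end_second=20.0)
print(video.fps, video.frames.shape)
```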
@@ -421,6 +449,20 @@ class Video:
  raise

  def add_audio(self, audio: Audio, overlay: bool = True) -> None:
+ video_duration = self.total_seconds
+ audio_duration = audio.metadata.duration_seconds
+
+ if audio_duration > video_duration:
+ audio = audio.slice(start_seconds=0, end_seconds=video_duration)
+ elif audio_duration < video_duration:
+ silence_duration = video_duration - audio_duration
+ silence = Audio.create_silent(
+ duration_seconds=silence_duration,
+ stereo=audio.metadata.channels == 2,
+ sample_rate=audio.metadata.sample_rate,
+ )
+ audio = audio.concat(silence)
+
  if self.audio.is_silent:
  self.audio = audio
  elif overlay:
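
A short sketch of the effect of the new `add_audio` logic, reusing the TTS example from the README below (the video path is a placeholder):

```python
from videopython.base.video import Video
from videopython.ai.generation import TextToSpeech

video = Video.from_path("clip.mp4")  # placeholder path

# Generated speech rarely matches the clip length exactly; as of 0.5.0,
# add_audio trims audio that runs past the video and pads shorter audio
# with silence before overlaying it onto the existing track.
audio = TextToSpeech().generate_audio("Woof woof woof! Woooooof!")
video.add_audio(audio=audio, overlay=True)
```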
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: videopython
- Version: 0.4.1
+ Version: 0.5.0
  Summary: Minimal video generation and processing library.
  Project-URL: Homepage, https://github.com/bartwojtowicz/videopython/
  Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
@@ -8,12 +8,13 @@ Project-URL: Documentation, https://github.com/bartwojtowicz/videopython/
  Author-email: Bartosz Wójtowicz <bartoszwojtowicz@outlook.com>, Bartosz Rudnikowicz <bartoszrudnikowicz840@gmail.com>, Piotr Pukisz <piotr.pukisz@gmail.com>
  License: Apache-2.0
  License-File: LICENSE
- Keywords: editing,generation,movie,opencv,python,video,videopython
+ Keywords: ai,editing,generation,movie,opencv,python,shorts,video,videopython
  Classifier: License :: OSI Approved :: Apache Software License
  Classifier: Operating System :: OS Independent
  Classifier: Programming Language :: Python :: 3
  Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
  Requires-Python: <3.13,>=3.10
  Requires-Dist: numpy>=1.25.2
  Requires-Dist: opencv-python>=4.9.0.80
@@ -38,11 +39,11 @@ Description-Content-Type: text/markdown

  # About

- Minimal video generation and processing library.
+ Videopython is a minimal video generation and processing library designed with short-form videos in mind, with focus on simplicity and ease of use for both humans and AI agents.

- ## Setup
+ # Setup

- ### Install ffmpeg
+ ## Install ffmpeg
  ```bash
  # Install with brew for MacOS:
  brew install ffmpeg
@@ -50,16 +51,22 @@ brew install ffmpeg
  sudo apt-get install ffmpeg
  ```

- ### Install with pip
+ ## Install library
+
  ```bash
+ # Install with your favourite package manager
+ uv add videopython --extra ai
+
+ # pip install works as well :)
  pip install videopython[ai]
  ```
- > You can install without `[ai]` dependencies for basic video handling and processing.
- > The funcionalities found in `videopython.ai` won't work.

- ## Basic Usage
+ > You can install without `[ai]` dependencies for basic video handling and processing.
+ > The functionalities found in `videopython.ai` won't work.
+
+ # Usage examples

- ### Video handling
+ ## Basic video editing

  ```python
  from videopython.base.video import Video
@@ -90,6 +97,8 @@ video.add_audio_from_file("tests/test_data/test_audio.mp3")
  savepath = video.save()
  ```

+ ## AI powered examples
+
  ### Video Generation

  > Using Nvidia A40 or better is recommended for the `videopython.ai` module.
@@ -97,7 +106,6 @@ savepath = video.save()
  # Generate image and animate it
  from videopython.ai.generation import ImageToVideo
  from videopython.ai.generation import TextToImage
- from videopython.ai.generation import TextToMusic

  image = TextToImage().generate_image(prompt="Golden Retriever playing in the park")
  video = ImageToVideo().generate_video(image=image, fps=24)
@@ -105,27 +113,82 @@ video = ImageToVideo().generate_video(image=image, fps=24)
  # Video generation directly from prompt
  from videopython.ai.generation import TextToVideo
  video_gen = TextToVideo()
- video = video_gen.generate_video("Dogs playing in the snow")
+ video = video_gen.generate_video("Dogs playing in the park")
  for _ in range(10):
- video += video_gen.generate_video("Dogs playing in the snow")
-
- # Cut the first 2 seconds
- from videopython.base.transforms import CutSeconds
- transformed_video = CutSeconds(start_second=0, end_second=2).apply(video.copy())
-
- # Upsample to 30 FPS
- from videopython.base.transforms import ResampleFPS
- transformed_video = ResampleFPS(new_fps=30).apply(transformed_video)
+ video += video_gen.generate_video("Dogs playing in the park")
+ ```

- # Resize to 1000x1000
- from videopython.base.transforms import Resize
- transformed_video = Resize(width=1000, height=1000).apply(transformed_video)
+ ### Audio generation
+ ```python
+ from videopython.base.video import Video
+ video = Video.from_path("<PATH_TO_VIDEO>")

- # Add generated music
- # MusicGen cannot generate more than 1503 tokens (~30seconds of audio)
+ # Generate music on top of video
+ from videopython.ai.generation import TextToMusic
  text_to_music = TextToMusic()
  audio = text_to_music.generate_audio("Happy dogs playing together in a park", max_new_tokens=256)
- transformed_video.add_audio(audio=audio)
+ video.add_audio(audio=audio)
+
+ # Add TTS on top of video
+ from videopython.ai.generation import TextToSpeech
+ text_to_speech = TextToSpeech()
+ audio = text_to_speech.generate_audio("Woof woof woof! Woooooof!")
+ video.add_audio(audio=audio)
+ ```
+
+ ### Generate and overlay subtitles
+ ```python
+ from videopython.base.video import Video
+ video = Video.from_path("<PATH_TO_VIDEO>")
+
+ # Generate transcription with timestamps
+ from videopython.ai.understanding.transcribe import CreateTranscription
+ transcription = CreateTranscription("base").transcribe(video)
+ # Initialise object for overlaying. See `TranscriptionOverlay` to see detailed configuration options.
+ from videopython.base.text.overlay import TranscriptionOverlay
+ transcription_overlay = TranscriptionOverlay(font_filename="src/tests/test_data/test_font.ttf")

- filepath = transformed_video.save()
+ video = transcription_overlay.apply(video, transcription)
+ video.save()
+ ```
+
+ # Development notes
+
+ ## Project structure
+
+ Source code of the project can be found under `src/` directory, along with separate directories for unit tests and mypy stubs.
+ ```
+ .
+ └── src
+     ├── stubs # Contains stubs for mypy
+     ├── tests # Unit tests
+     └── videopython # Library code
+ ```
+
+ ----
+
+ The `videopython` library is divided into 2 separate high-level modules:
+ * `videopython.base`: Contains base classes for handling videos and for basic video editing. There are no imports from `videopython.ai` within the `base` module, which allows users to install light-weight base dependencies to do simple video operations.
+ * `videopython.ai`: Contains AI-powered functionalities for video generation. It has its own `ai` dependency group, which contains all dependencies required to run AI models.
+
+ ## Running locally
+
+ We are using [uv](https://docs.astral.sh/uv/) as project and package manager. Once you clone the repo and install uv locally, you can use it to sync the dependencies.
+ ```bash
+ uv sync --all-extras
+ ```
+
+ To run the unit tests, you can simply run:
+ ```bash
+ uv run pytest
+ ```
+
+ We also use [Ruff](https://docs.astral.sh/ruff/) for linting/formatting and [mypy](https://github.com/python/mypy) as type checker.
+ ```bash
+ # Run formatting
+ uv run ruff format
+ # Run linting and apply fixes
+ uv run ruff check --fix
+ # Run type checks
+ uv run mypy src/
  ```
@@ -6,21 +6,19 @@ videopython/ai/generation/audio.py,sha256=CNf6ZeV3iU4CU0Kq8HtDLwLPP2ABq9AGQD1TBO
  videopython/ai/generation/image.py,sha256=gS0zqzyIoCvjTjfks31ApG8lX0nUKXWRRgFGGLN4RjM,654
  videopython/ai/generation/video.py,sha256=206YON_XjPTYyjIJ3j5uBgd_yHmCDg7SqbkIU9GzEgw,1831
  videopython/ai/understanding/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- videopython/ai/understanding/transcribe.py,sha256=VNgXnzbTH0NHDKHjanj6CjUnl-XwT-nsOkd5zqn9a_E,1219
+ videopython/ai/understanding/transcribe.py,sha256=hm2f5Fm1O_tMrSmUlcUdl_rQRhc5Sz_kaV4tnJ4IxbQ,2557
  videopython/base/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  videopython/base/combine.py,sha256=XC_pzyhbIh6h0fmxX1LhhhtlmOBbUQX9Y4EtDJqQn8g,1900
- videopython/base/compose.py,sha256=pti12VY3Yg7TZZiENPF6veM8POWssfsK8ePDdGlhAhA,1968
  videopython/base/effects.py,sha256=1RbRLTQD0V26DBc4jbRCDI4eGr6-TyBdX-Ia2USKxmc,7554
  videopython/base/exceptions.py,sha256=68_16lUPOR9_zhWdeBGS8_NFI32VbrcoDbN5KHHg0_w,44
- videopython/base/transcription.py,sha256=FloqvY-OlBQPOCkPnSx6R7azn4smD5-JYd-pMNssuYw,196
  videopython/base/transforms.py,sha256=FDh-8EgQoZxB6Gv-T15kZGctcu9_4XHsTy_n7kgxlQw,5828
  videopython/base/transitions.py,sha256=P1bBsxugf5i0JEtx7MoRgxWSIDcBli-0QucRwBIFGqs,3687
- videopython/base/video.py,sha256=m_AzlUVvZYIkLih7EbJS7TSC2FIm6q06I1Zp9UHadl0,18444
- videopython/utils/__init__.py,sha256=uhFG_cnw6zZUWxpfs_I3-82mh-NBLqivbPDnsdOEUmI,122
- videopython/utils/common.py,sha256=F-30YoKUwWDI7HiJUWw0gRFUguhShSVaxT0aFfvpifg,936
- videopython/utils/image.py,sha256=zR5_WnSBXGgyE9gNpXnNXmPtfdmnlY7kdOsgkZUGOds,1747
- videopython/utils/text.py,sha256=T0W6VgpLfLczMMdUXEhkEftUQmuNzuQusO9I7-HU8Zg,27962
- videopython-0.4.1.dist-info/METADATA,sha256=xd-a02H1P_mq8nNGBIkuPfGYhO950MY3kJFm18rpAHs,4453
- videopython-0.4.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- videopython-0.4.1.dist-info/licenses/LICENSE,sha256=nJL9jVOt2MSW7swNDq4Y6oD_n9bLI0B0afr8ougtZ6s,10832
- videopython-0.4.1.dist-info/RECORD,,
+ videopython/base/utils.py,sha256=bAwIagHvd1NWu8UYAsS-pDm38E4R8qRfeHvWk-O2__0,125
+ videopython/base/video.py,sha256=RxKHmR39EEvBa5m2xFDNj4_mq213RUG3NQ_lhk5U-PA,20462
+ videopython/base/text/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ videopython/base/text/overlay.py,sha256=EiBDSsnn2pSGeWGajblUxovcP_IdA6gk2zZ5rsjhdI8,44434
+ videopython/base/text/transcription.py,sha256=9c3FRBr7RkialHhdfSwEX303QnIt1sCSiXoId9_DRkk,4246
+ videopython-0.5.0.dist-info/METADATA,sha256=FTo8Bo3YLhp9bGTrctiehMMksQwecH1DN84JO5RydyU,6574
+ videopython-0.5.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ videopython-0.5.0.dist-info/licenses/LICENSE,sha256=nJL9jVOt2MSW7swNDq4Y6oD_n9bLI0B0afr8ougtZ6s,10832
+ videopython-0.5.0.dist-info/RECORD,,
@@ -1,55 +0,0 @@
- from itertools import repeat
- from multiprocessing import Pool
-
- from videopython.base.transforms import TransformationPipeline
- from videopython.base.transitions import InstantTransition, Transition
- from videopython.base.video import Video
-
-
- class VideoComposer:
- """
- Composes multiple Videos into single video using selected transformations
- on each video and applies transitions.
- """
-
- def __init__(
- self,
- transformation_pipeline: TransformationPipeline | None = None,
- transition: Transition = InstantTransition(),
- ):
- """Initializes VideoComposer.
-
- Args:
- transformation_pipeline: Pipeline of transformations to apply on each video.
- transition: Transition to apply between videos
- """
- self.transition = transition
- self.transformation_pipeline = transformation_pipeline
-
- def _apply_transformation(self, video: Video, transformation_pipeline: TransformationPipeline) -> Video:
- return transformation_pipeline(video)
-
- def compose(self, videos: list[Video]) -> Video:
- # Apply transformation on each video using multiprocessing pool:
- if self.transformation_pipeline:
- transformed_videos = []
- with Pool() as pool:
- transformed_videos = pool.starmap(
- self._apply_transformation,
- zip(videos, repeat(self.transformation_pipeline)),
- )
- videos = transformed_videos
-
- # Check if videos are compatible:
- self._compatibility_check(videos)
-
- # Apply transition:
- final_video = videos.pop(0)
- for _ in range(len(videos)):
- final_video = self.transition.apply((final_video, videos.pop(0)))
-
- return final_video
-
- @staticmethod
- def _compatibility_check(videos: list[Video]):
- assert all([videos[0].metadata.can_be_merged_with(other_video.metadata) for other_video in videos])
@@ -1,13 +0,0 @@
- from dataclasses import dataclass
-
-
- @dataclass
- class TranscriptionSegment:
- start: float
- end: float
- text: str
-
-
- @dataclass
- class Transcription:
- segments: list[TranscriptionSegment]
@@ -1,3 +0,0 @@
- from videopython.utils.text import AnchorPoint, ImageText, TextAlign
-
- __all__ = ["AnchorPoint", "ImageText", "TextAlign"]
@@ -1,31 +0,0 @@
- import time
- import uuid
- from pathlib import Path
- from typing import Callable
-
-
- def generate_random_name(suffix=".mp4"):
- """Generates random name."""
- return f"{uuid.uuid4()}{suffix}"
-
-
- def timeit(func: Callable):
- """Decorator to measure execution time of a function."""
-
- def timed(*args, **kwargs):
- start = time.time()
- result = func(*args, **kwargs)
- end = time.time()
- print(f"Execution time: {end - start:.3f} seconds.")
- return result
-
- return timed
-
-
- def check_path(path: str, dir_exists: bool = True, suffix: str | None = None) -> str:
- fullpath = Path(path).resolve()
- if dir_exists and not fullpath.parent.exists():
- raise ValueError(f"Directory `{fullpath.parent}` does not exist!")
- if suffix and suffix != fullpath.suffix:
- raise ValueError(f"Required suffix `{suffix}` does not match the file suffix `{fullpath.suffix}`")
- return str(fullpath)
@@ -1,47 +0,0 @@
- from typing import Literal
-
- import cv2
- import numpy as np
-
- from videopython.base.video import Video
-
-
- class SlideOverImage:
- def __init__(
- self,
- direction: Literal["left", "right"],
- video_shape: tuple[int, int] = (1080, 1920),
- fps: float = 24.0,
- length_seconds: float = 1.0,
- ) -> None:
- self.direction = direction
- self.video_width, self.video_height = video_shape
- self.fps = fps
- self.length_seconds = length_seconds
-
- def apply(self, image: np.ndarray) -> Video:
- image = self._resize(image)
- max_offset = image.shape[1] - self.video_width
- frame_count = round(self.fps * self.length_seconds)
-
- deltas = np.linspace(0, max_offset, frame_count)
- frames = []
-
- for delta in deltas:
- if self.direction == "right":
- frame = image[:, round(delta) : round(delta) + self.video_width]
- elif self.direction == "left":
- frame = image[:, image.shape[1] - round(delta) - self.video_width : image.shape[1] - round(delta)]
- frames.append(frame)
-
- return Video.from_frames(frames=np.stack(frames, axis=0), fps=self.fps)
-
- def _resize(self, image: np.ndarray) -> np.ndarray:
- resize_factor = image.shape[0] / self.video_height
- resize_dims = (round(image.shape[1] / resize_factor), round(image.shape[0] / resize_factor)) # width, height
- image = cv2.resize(image, resize_dims)
- if self.video_height > image.shape[0] or self.video_width > image.shape[1]:
- raise ValueError(
- f"Image `{image.shape}` is too small for the video frame `({self.video_width}, {self.video_height})`!"
- )
- return image