videopython 0.3.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


@@ -1,6 +1,3 @@
- import io
- import os
-
  import torch
  from diffusers import DiffusionPipeline
  from PIL import Image
@@ -0,0 +1,37 @@
+ from typing import Literal
+
+ import whisper
+
+ from videopython.base.transcription import Transcription, TranscriptionSegment
+ from videopython.base.video import Video
+
+
+ class VideoTranscription:
+     def __init__(self, model_name: Literal["tiny", "base", "small", "medium", "large", "turbo"] = "small") -> None:
+         self.model = whisper.load_model(name=model_name)
+
+     def transcribe_video(self, video: Video) -> Transcription:
+         """Transcribes video to text.
+
+         Args:
+             video: Video to transcribe.
+
+         Returns:
+             Transcription with segments of text and their start and end times.
+         """
+         if video.audio.is_silent:
+             return Transcription(segments=[])
+
+         audio = video.audio.to_mono()
+         audio = audio.resample(whisper.audio.SAMPLE_RATE)
+         audio_data = audio.data
+
+         transcription = self.model.transcribe(audio=audio_data, word_timestamps=True)
+
+         transcription_segments = [
+             TranscriptionSegment(start=segment["start"], end=segment["end"], text=segment["text"])
+             for segment in transcription["segments"]
+         ]
+         result = Transcription(segments=transcription_segments)
+
+         return result
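
The new VideoTranscription class wraps OpenAI Whisper: it downmixes the video's audio to mono, resamples it to Whisper's expected sample rate, and maps Whisper's segments onto the new Transcription dataclasses. A minimal usage sketch follows; the file path is a placeholder, and Video.from_path is assumed to be the existing loader in videopython.base.video (the import path of VideoTranscription itself is not shown in this diff).

from videopython.base.video import Video

video = Video.from_path("input.mp4")  # placeholder path; assumes the existing Video.from_path loader
transcription = VideoTranscription(model_name="small").transcribe_video(video)
for segment in transcription.segments:
    print(f"[{segment.start:.2f}s-{segment.end:.2f}s] {segment.text}")
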
@@ -0,0 +1,45 @@
+ from typing import Literal
+
+ import numpy as np
+
+ from videopython.base.transforms import ResampleFPS, Resize
+ from videopython.base.video import Video
+
+
+ class StackVideos:
+     def __init__(self, mode: Literal["horizontal", "vertical"]) -> None:
+         self.mode = mode
+
+     def _validate(self, video1: Video, video2: Video) -> tuple[Video, Video]:
+         video1, video2 = self._align_shapes(video1, video2)
+         video1, video2 = self._align_fps(video1, video2)
+         video1, video2 = self._align_duration(video1, video2)
+         return video1, video2
+
+     def _align_fps(self, video1: Video, video2: Video) -> tuple[Video, Video]:
+         if video1.fps > video2.fps:
+             video1 = ResampleFPS(fps=video2.fps).apply(video1)
+         elif video1.fps < video2.fps:
+             video2 = ResampleFPS(fps=video1.fps).apply(video2)
+         return (video1, video2)
+
+     def _align_shapes(self, video1: Video, video2: Video) -> tuple[Video, Video]:
+         if self.mode == "horizontal":
+             video2 = Resize(height=video1.metadata.height).apply(video2)
+         elif self.mode == "vertical":
+             video2 = Resize(width=video1.metadata.width).apply(video2)
+         return (video1, video2)
+
+     def _align_duration(self, video1: Video, video2: Video) -> tuple[Video, Video]:
+         if len(video1.frames) > len(video2.frames):
+             video1 = video1[: len(video2.frames)]
+         elif len(video1.frames) < len(video2.frames):
+             video2 = video2[: len(video1.frames)]
+         return (video1, video2)
+
+     def apply(self, videos: tuple[Video, Video]) -> Video:
+         videos = self._validate(*videos)
+         axis = 1 if self.mode == "vertical" else 2
+         new_frames = np.concatenate((videos[0].frames, videos[1].frames), axis=axis)
+         new_audio = videos[0].audio.overlay(videos[1].audio)
+         return Video(frames=new_frames, fps=videos[0].fps, audio=new_audio)
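
StackVideos concatenates two clips along the width axis (axis 2, "horizontal") or the height axis (axis 1, "vertical") after resizing the second clip to match, resampling both to the lower FPS, trimming both to the shorter duration, and overlaying the audio tracks. A minimal usage sketch; the paths are placeholders and Video.from_path is assumed to be the existing loader (StackVideos' own import path is not shown in this diff).

from videopython.base.video import Video

left = Video.from_path("left.mp4")    # placeholder inputs
right = Video.from_path("right.mp4")
stacked = StackVideos(mode="horizontal").apply((left, right))  # right is resized to left's height before stacking
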
@@ -156,13 +156,13 @@ class Zoom(Effect):

          width = video.metadata.width
          height = video.metadata.height
-         crop_sizes_w, crop_sizes_h = np.linspace(width // self.zoom_factor, width, n_frames), np.linspace(
-             height // self.zoom_factor, height, n_frames
+         crop_sizes_w, crop_sizes_h = (
+             np.linspace(width // self.zoom_factor, width, n_frames),
+             np.linspace(height // self.zoom_factor, height, n_frames),
          )

          if self.mode == "in":
              for frame, w, h in tqdm(zip(video.frames, reversed(crop_sizes_w), reversed(crop_sizes_h))):
-
                  x = width / 2 - w / 2
                  y = height / 2 - h / 2

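The Zoom change is purely cosmetic: both the old and the new form build the same per-frame crop-size schedules with np.linspace, running from width // zoom_factor up to the full width (and likewise for height). For illustration, with made-up values:

import numpy as np

width, zoom_factor, n_frames = 1920, 2.0, 5
crop_sizes_w = np.linspace(width // zoom_factor, width, n_frames)
# array([ 960., 1200., 1440., 1680., 1920.])
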
@@ -0,0 +1,13 @@
+ from dataclasses import dataclass
+
+
+ @dataclass
+ class TranscriptionSegment:
+     start: float
+     end: float
+     text: str
+
+
+ @dataclass
+ class Transcription:
+     segments: list[TranscriptionSegment]
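
The transcription result types are plain dataclasses, so they can be constructed and inspected directly; for example (values are made up):

from videopython.base.transcription import Transcription, TranscriptionSegment

segment = TranscriptionSegment(start=0.0, end=1.5, text="Hello world")  # made-up values
transcription = Transcription(segments=[segment])
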
@@ -1,7 +1,6 @@
  from abc import ABC, abstractmethod
  from enum import Enum
  from multiprocessing import Pool
- from typing import Literal

  import cv2
  import numpy as np
@@ -154,7 +153,6 @@ class CropMode(Enum):


  class Crop(Transformation):
-
      def __init__(self, width: int, height: int, mode: CropMode = CropMode.CENTER):
          self.width = width
          self.height = height