videopython 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of videopython might be problematic. Click here for more details.

File without changes
@@ -1,6 +1,3 @@
1
- import io
2
- import os
3
-
4
1
  import torch
5
2
  from diffusers import DiffusionPipeline
6
3
  from PIL import Image
File without changes
@@ -0,0 +1,37 @@
1
+ from typing import Literal
2
+
3
+ import whisper
4
+
5
+ from videopython.base.transcription import Transcription, TranscriptionSegment
6
+ from videopython.base.video import Video
7
+
8
+
9
class VideoTranscription:
    """Turns the audio track of a ``Video`` into a ``Transcription`` using a Whisper model."""

    def __init__(self, model_name: Literal["tiny", "base", "small", "medium", "large", "turbo"] = "small") -> None:
        """Load the Whisper model that later transcription calls will use.

        Args:
            model_name: Name of the Whisper model passed to ``whisper.load_model``.
        """
        self.model = whisper.load_model(name=model_name)

    def transcribe_video(self, video: Video) -> Transcription:
        """Transcribe the audio of a video into text segments.

        Args:
            video: Video whose audio track should be transcribed.

        Returns:
            Transcription with one TranscriptionSegment (start, end, text) per
            segment reported by the model; it has no segments when the audio
            is silent.
        """
        # Silent audio is never handed to the model.
        if video.audio.is_silent:
            return Transcription(segments=[])

        # The audio is converted to mono and resampled to Whisper's sample rate first.
        mono_track = video.audio.to_mono()
        mono_track = mono_track.resample(whisper.audio.SAMPLE_RATE)

        model_output = self.model.transcribe(audio=mono_track.data, word_timestamps=True)

        segments = []
        for raw_segment in model_output["segments"]:
            segments.append(
                TranscriptionSegment(
                    start=raw_segment["start"],
                    end=raw_segment["end"],
                    text=raw_segment["text"],
                )
            )

        return Transcription(segments=segments)
@@ -156,13 +156,13 @@ class Zoom(Effect):
156
156
 
157
157
  width = video.metadata.width
158
158
  height = video.metadata.height
159
- crop_sizes_w, crop_sizes_h = np.linspace(width // self.zoom_factor, width, n_frames), np.linspace(
160
- height // self.zoom_factor, height, n_frames
159
+ crop_sizes_w, crop_sizes_h = (
160
+ np.linspace(width // self.zoom_factor, width, n_frames),
161
+ np.linspace(height // self.zoom_factor, height, n_frames),
161
162
  )
162
163
 
163
164
  if self.mode == "in":
164
165
  for frame, w, h in tqdm(zip(video.frames, reversed(crop_sizes_w), reversed(crop_sizes_h))):
165
-
166
166
  x = width / 2 - w / 2
167
167
  y = height / 2 - h / 2
168
168
 
@@ -0,0 +1,13 @@
1
+ from dataclasses import dataclass
2
+
3
+
4
@dataclass
class TranscriptionSegment:
    """One piece of transcribed text together with its start and end positions."""

    # Where the segment begins (presumably seconds from the start of the audio - TODO confirm).
    start: float
    # Where the segment ends (same unit as ``start``).
    end: float
    # Text recognised for this segment.
    text: str
9
+
10
+
11
@dataclass
class Transcription:
    """Container for the segments that make up a transcription."""

    # Transcribed segments, kept in the order they were supplied.
    segments: list[TranscriptionSegment]
@@ -1,7 +1,6 @@
1
1
  from abc import ABC, abstractmethod
2
2
  from enum import Enum
3
3
  from multiprocessing import Pool
4
- from typing import Literal
5
4
 
6
5
  import cv2
7
6
  import numpy as np
@@ -154,7 +153,6 @@ class CropMode(Enum):
154
153
 
155
154
 
156
155
  class Crop(Transformation):
157
-
158
156
  def __init__(self, width: int, height: int, mode: CropMode = CropMode.CENTER):
159
157
  self.width = width
160
158
  self.height = height
videopython/base/video.py CHANGED
@@ -1,12 +1,13 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import json
3
4
  import subprocess
4
5
  import tempfile
5
6
  from dataclasses import dataclass
7
+ from fractions import Fraction
6
8
  from pathlib import Path
7
9
  from typing import Literal, get_args
8
10
 
9
- import cv2
10
11
  import numpy as np
11
12
  from soundpython import Audio
12
13
 
@@ -15,6 +16,12 @@ from videopython.utils.common import generate_random_name
15
16
  ALLOWED_VIDEO_FORMATS = Literal["mp4", "avi", "mov", "mkv", "webm"]
16
17
 
17
18
 
19
class VideoMetadataError(Exception):
    """Signals that metadata for a video could not be obtained."""
23
+
24
+
18
25
  @dataclass
19
26
  class VideoMetadata:
20
27
  """Class to store video metadata."""
@@ -25,37 +32,80 @@ class VideoMetadata:
25
32
  frame_count: int
26
33
  total_seconds: float
27
34
 
28
- def __str__(self):
35
+ def __str__(self) -> str:
29
36
  return f"{self.width}x{self.height} @ {self.fps}fps, {self.total_seconds} seconds"
30
37
 
31
38
  def __repr__(self) -> str:
32
39
  return self.__str__()
33
40
 
34
- def get_frame_shape(self):
41
+ def get_frame_shape(self) -> np.ndarray:
35
42
  """Returns frame shape."""
36
43
  return np.array((self.height, self.width, 3))
37
44
 
38
- def get_video_shape(self):
45
+ def get_video_shape(self) -> np.ndarray:
39
46
  """Returns video shape."""
40
47
  return np.array((self.frame_count, self.height, self.width, 3))
41
48
 
49
@staticmethod
def _run_ffprobe(video_path: str | Path) -> dict:
    """Run ffprobe on a video file and return its parsed JSON output.

    The probe is limited to the first video stream (``v:0``) and asks for
    the stream's width, height, ``r_frame_rate`` and ``nb_frames`` plus the
    container's duration.

    Args:
        video_path: Path of the video file to inspect.

    Returns:
        Dictionary decoded from ffprobe's JSON output.

    Raises:
        VideoMetadataError: If ffprobe exits with a non-zero status or its
            output is not valid JSON. The original exception is attached
            as ``__cause__``.
    """
    cmd = [
        "ffprobe",
        "-v",
        "error",
        "-select_streams",
        "v:0",
        "-show_entries",
        "stream=width,height,r_frame_rate,nb_frames",
        "-show_entries",
        "format=duration",
        "-print_format",
        "json",
        str(video_path),
    ]

    # NOTE(review): subprocess.run raises FileNotFoundError when the ffprobe
    # executable itself cannot be found; that error is left to propagate
    # unchanged so existing callers see the same exception type.
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        return json.loads(result.stdout)
    except subprocess.CalledProcessError as e:
        # Chain the cause so the failing command and exit status stay in the traceback.
        raise VideoMetadataError(f"FFprobe error: {e.stderr}") from e
    except json.JSONDecodeError as e:
        raise VideoMetadataError(f"Error parsing FFprobe output: {e}") from e
74
+
42
75
  @classmethod
43
- def from_path(cls, video_path: str) -> VideoMetadata:
44
- """Creates VideoMetadata object from video file."""
45
- video = cv2.VideoCapture(video_path)
46
- frame_count = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
47
- fps = round(video.get(cv2.CAP_PROP_FPS), 2)
48
- height = round(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
49
- width = round(video.get(cv2.CAP_PROP_FRAME_WIDTH))
50
- total_seconds = round(frame_count / fps, 2)
51
-
52
- return cls(
53
- height=height,
54
- width=width,
55
- fps=fps,
56
- frame_count=frame_count,
57
- total_seconds=total_seconds,
58
- )
76
def from_path(cls, video_path: str | Path) -> VideoMetadata:
    """Create a VideoMetadata object for a video file using ffprobe.

    Args:
        video_path: Path of the video file to inspect.

    Returns:
        VideoMetadata built from the first video stream reported by ffprobe.
        The frame count comes from ``nb_frames`` when it is a plain number,
        otherwise it is estimated as ``duration * fps``.

    Raises:
        FileNotFoundError: If ``video_path`` does not exist.
        VideoMetadataError: If ffprobe fails, the file has no video stream,
            a required field is missing, or the frame rate is invalid.
    """
    if not Path(video_path).exists():
        raise FileNotFoundError(f"Video file not found: {video_path}")

    probe_data = cls._run_ffprobe(video_path)

    try:
        stream_info = probe_data["streams"][0]

        width = int(stream_info["width"])
        height = int(stream_info["height"])

        # r_frame_rate is a fraction string such as "30000/1001".
        try:
            fps = float(Fraction(stream_info["r_frame_rate"]))
        except (ValueError, ZeroDivisionError) as e:
            raise VideoMetadataError(f"Invalid frame rate: {stream_info['r_frame_rate']}") from e
        # A rate like "0/1" parses fine but would divide by zero below.
        if fps <= 0:
            raise VideoMetadataError(f"Invalid frame rate: {stream_info['r_frame_rate']}")

        nb_frames = str(stream_info.get("nb_frames", ""))
        if nb_frames.isdigit():
            frame_count = int(nb_frames)
        else:
            # Fall back to an estimate when the stream does not report a numeric frame count.
            duration = float(probe_data["format"]["duration"])
            frame_count = int(round(duration * fps))

        total_seconds = round(frame_count / fps, 2)

        return cls(height=height, width=width, fps=fps, frame_count=frame_count, total_seconds=total_seconds)

    except VideoMetadataError:
        # Already a precise error; do not let the generic handler re-wrap it.
        raise
    except KeyError as e:
        raise VideoMetadataError(f"Missing required metadata field: {e}") from e
    except IndexError as e:
        # An empty "streams" list means ffprobe found no video stream.
        raise VideoMetadataError(f"No video stream found in: {video_path}") from e
    except Exception as e:
        raise VideoMetadataError(f"Error extracting video metadata: {e}") from e
59
109
 
60
110
  @classmethod
61
111
  def from_video(cls, video: Video) -> VideoMetadata:
@@ -63,15 +113,10 @@ class VideoMetadata:
63
113
  frame_count, height, width, _ = video.frames.shape
64
114
  total_seconds = round(frame_count / video.fps, 2)
65
115
 
66
- return cls(
67
- height=height,
68
- width=width,
69
- fps=video.fps,
70
- frame_count=frame_count,
71
- total_seconds=total_seconds,
72
- )
116
+ return cls(height=height, width=width, fps=video.fps, frame_count=frame_count, total_seconds=total_seconds)
73
117
 
74
118
  def can_be_merged_with(self, other_format: VideoMetadata) -> bool:
119
+ """Check if videos can be merged."""
75
120
  return (
76
121
  self.height == other_format.height
77
122
  and self.width == other_format.width
@@ -79,14 +124,7 @@ class VideoMetadata:
79
124
  )
80
125
 
81
126
  def can_be_downsampled_to(self, target_format: VideoMetadata) -> bool:
82
- """Checks if video can be downsampled to `target_format`.
83
-
84
- Args:
85
- target_format: Desired video format.
86
-
87
- Returns:
88
- True if video can be downsampled to `target_format`, False otherwise.
89
- """
127
+ """Checks if video can be downsampled to target_format."""
90
128
  return (
91
129
  self.height >= target_format.height
92
130
  and self.width >= target_format.width
@@ -102,18 +140,94 @@ class Video:
102
140
  self.audio = None
103
141
 
104
142
  @classmethod
105
- def from_path(cls, path: str) -> Video:
143
def from_path(cls, path: str, read_batch_size: int = 100) -> Video:
    """Load a video file into memory by decoding it with FFmpeg.

    Width, height, frame rate and the expected frame count are taken from
    ``VideoMetadata.from_path``. Frames are then read from an ``ffmpeg``
    subprocess as raw RGB24 bytes, ``read_batch_size`` frames at a time.

    Args:
        path: Path of the video file to load.
        read_batch_size: Maximum number of frames requested from the FFmpeg
            pipe per read. Must be at least 1.

    Returns:
        Video holding the decoded frames, the probed fps and the audio
        loaded from ``path`` (a silent stereo track when audio cannot be
        loaded).

    Raises:
        ValueError: If metadata cannot be obtained, FFmpeg fails, no frame
            can be decoded, or any other error occurs while loading.
    """
    new_vid = cls()

    try:
        if read_batch_size < 1:
            raise ValueError(f"read_batch_size must be at least 1, got {read_batch_size}")

        # Get video metadata using VideoMetadata.from_path
        metadata = VideoMetadata.from_path(path)

        width = metadata.width
        height = metadata.height
        fps = metadata.fps
        total_frames = metadata.frame_count

        # Set up FFmpeg command for raw video extraction
        ffmpeg_cmd = [
            "ffmpeg",
            "-i",
            path,
            "-f",
            "rawvideo",
            "-pix_fmt",
            "rgb24",
            "-vsync",
            "0",
            "-vcodec",
            "rawvideo",
            "-y",
            "pipe:1",
        ]

        # Calculate frame size in bytes
        frame_size = width * height * 3  # 3 bytes per pixel for RGB

        # Pre-allocate numpy array for all frames
        frames = np.empty((total_frames, height, width, 3), dtype=np.uint8)
        frames_read = 0

        # FFmpeg's stderr goes to a temporary file instead of a pipe: an
        # undrained pipe can fill up and block FFmpeg, and the file can
        # still be read after the process has exited.
        with tempfile.TemporaryFile() as ffmpeg_log:
            # The context manager closes stdout and waits for the process,
            # also when reading fails part-way through.
            with subprocess.Popen(
                ffmpeg_cmd,
                stdout=subprocess.PIPE,
                stderr=ffmpeg_log,
                bufsize=10**8,  # Use large buffer
            ) as process:
                # Read frames in batches
                while frames_read < total_frames:
                    batch_size = min(read_batch_size, total_frames - frames_read)
                    raw_data = process.stdout.read(frame_size * batch_size)  # type: ignore

                    # Only whole frames are usable; a trailing partial frame is dropped.
                    complete_frames = len(raw_data) // frame_size
                    if complete_frames == 0:
                        break

                    # Convert raw bytes to numpy array and reshape
                    batch_frames = np.frombuffer(raw_data, dtype=np.uint8, count=complete_frames * frame_size)
                    frames[frames_read : frames_read + complete_frames] = batch_frames.reshape(
                        complete_frames, height, width, 3
                    )
                    frames_read += complete_frames

                    if complete_frames < batch_size:
                        # Short read: FFmpeg has no more data.
                        break

            if process.returncode != 0:
                ffmpeg_log.seek(0)
                raise ValueError(f"FFmpeg error: {ffmpeg_log.read().decode(errors='replace')}")

        if frames_read == 0:
            raise ValueError(f"No frames could be read from the video file: {path}")

        # Metadata may over-report the frame count; drop the never-written
        # (uninitialised) tail of the pre-allocated buffer.
        if frames_read < total_frames:
            frames = frames[:frames_read]

        new_vid.frames = frames
        new_vid.fps = fps

        # Load audio
        try:
            new_vid.audio = Audio.from_file(path)
        except Exception:
            print(f"No audio found for `{path}`, adding silent track!")
            new_vid.audio = Audio.create_silent(
                duration_seconds=round(new_vid.total_seconds, 2), stereo=True, sample_rate=44100
            )

        return new_vid

    except VideoMetadataError as e:
        raise ValueError(f"Error getting video metadata: {e}") from e
    except subprocess.CalledProcessError as e:
        raise ValueError(f"Error processing video file: {e}") from e
    except Exception as e:
        raise ValueError(f"Error loading video: {e}") from e
117
231
 
118
232
  @classmethod
119
233
  def from_frames(cls, frames: np.ndarray, fps: float) -> Video:
@@ -168,6 +282,19 @@ class Video:
168
282
  return split_videos
169
283
 
170
284
  def save(self, filename: str | Path | None = None, format: ALLOWED_VIDEO_FORMATS = "mp4") -> Path:
285
+ """Save video to file with optimized performance.
286
+
287
+ Args:
288
+ filename: Output filename. If None, generates random name
289
+ format: Output format (mp4, avi, mov, mkv, webm)
290
+
291
+ Returns:
292
+ Path to saved video file
293
+
294
+ Raises:
295
+ RuntimeError: If video is not loaded
296
+ ValueError: If format is not supported
297
+ """
171
298
  if not self.is_loaded():
172
299
  raise RuntimeError("Video is not loaded, cannot save!")
173
300
 
@@ -182,80 +309,71 @@ class Video:
182
309
  filename = Path(filename).with_suffix(f".{format}")
183
310
  filename.parent.mkdir(parents=True, exist_ok=True)
184
311
 
185
- with tempfile.TemporaryDirectory() as temp_dir:
186
- temp_dir_path = Path(temp_dir)
187
-
188
- # Save frames as images
189
- for i, frame in enumerate(self.frames):
190
- frame_path = temp_dir_path / f"frame_{i:04d}.png"
191
- cv2.imwrite(str(frame_path), cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
192
-
193
- # Calculate exact video duration
194
- video_duration = len(self.frames) / self.fps
195
-
196
- # Ensure audio duration matches video duration
197
- if (
198
- abs(self.audio.metadata.duration_seconds - video_duration) > 0.001
199
- ): # Small threshold for float comparison
200
- if self.audio.metadata.duration_seconds < video_duration:
201
- # Create silent audio for the remaining duration
202
- remaining_duration = video_duration - self.audio.metadata.duration_seconds
203
- silent_audio = Audio.create_silent(
204
- duration_seconds=remaining_duration,
205
- stereo=(self.audio.metadata.channels == 2),
206
- sample_rate=self.audio.metadata.sample_rate,
207
- sample_width=self.audio.metadata.sample_width,
208
- )
209
- # Concatenate original audio with silent padding
210
- padded_audio = self.audio.concat(silent_audio)
211
- else:
212
- # Trim audio to match video duration
213
- padded_audio = self.audio.slice(end_seconds=video_duration)
214
- else:
215
- padded_audio = self.audio
312
+ # Create a temporary raw video file
313
+ with tempfile.NamedTemporaryFile(suffix=".raw") as raw_video:
314
+ # Convert frames to raw video data
315
+ raw_data = self.frames.astype(np.uint8).tobytes()
316
+ raw_video.write(raw_data)
317
+ raw_video.flush()
216
318
 
217
319
  # Save audio to temporary WAV file
218
- temp_audio = temp_dir_path / "temp_audio.wav"
219
- padded_audio.save(str(temp_audio), format="wav")
220
-
221
- # Construct FFmpeg command with explicit duration
222
- ffmpeg_command = [
223
- "ffmpeg",
224
- "-y",
225
- "-framerate",
226
- str(self.fps), # Use -framerate instead of -r for input
227
- "-i",
228
- str(temp_dir_path / "frame_%04d.png"),
229
- "-i",
230
- str(temp_audio),
231
- "-c:v",
232
- "libx264",
233
- "-preset",
234
- "medium",
235
- "-crf",
236
- "23",
237
- "-c:a",
238
- "aac", # Use AAC instead of copy for more reliable audio
239
- "-b:a",
240
- "192k",
241
- "-pix_fmt",
242
- "yuv420p",
243
- "-map",
244
- "0:v:0", # Map video from first input
245
- "-map",
246
- "1:a:0", # Map audio from second input
247
- "-vsync",
248
- "cfr", # Force constant frame rate
249
- str(filename),
250
- ]
251
-
252
- try:
253
- subprocess.run(ffmpeg_command, check=True, capture_output=True, text=True)
254
- return filename
255
- except subprocess.CalledProcessError as e:
256
- print(f"Error saving video: {e}")
257
- print(f"FFmpeg stderr: {e.stderr}")
258
- raise
320
+ with tempfile.NamedTemporaryFile(suffix=".wav") as temp_audio:
321
+ self.audio.save(temp_audio.name, format="wav")
322
+
323
+ # Calculate exact duration
324
+ duration = len(self.frames) / self.fps
325
+
326
+ # Construct FFmpeg command for maximum performance
327
+ ffmpeg_command = [
328
+ "ffmpeg",
329
+ "-y",
330
+ # Raw video input settings
331
+ "-f",
332
+ "rawvideo",
333
+ "-pixel_format",
334
+ "rgb24",
335
+ "-video_size",
336
+ f"{self.frame_shape[1]}x{self.frame_shape[0]}",
337
+ "-framerate",
338
+ str(self.fps),
339
+ "-i",
340
+ raw_video.name,
341
+ # Audio input
342
+ "-i",
343
+ temp_audio.name,
344
+ # Video encoding settings
345
+ "-c:v",
346
+ "libx264",
347
+ "-preset",
348
+ "ultrafast", # Fastest encoding
349
+ "-tune",
350
+ "zerolatency", # Reduce encoding latency
351
+ "-crf",
352
+ "23", # Reasonable quality/size tradeoff
353
+ # Audio settings
354
+ "-c:a",
355
+ "aac",
356
+ "-b:a",
357
+ "192k",
358
+ # Output settings
359
+ "-pix_fmt",
360
+ "yuv420p",
361
+ "-movflags",
362
+ "+faststart", # Enable fast start for web playback
363
+ "-t",
364
+ str(duration),
365
+ "-vsync",
366
+ "cfr",
367
+ str(filename),
368
+ ]
369
+
370
+ try:
371
+ subprocess.run(ffmpeg_command, check=True, capture_output=True, text=True)
372
+ return filename
373
+ except subprocess.CalledProcessError as e:
374
+ print(f"Error saving video: {e}")
375
+ print(f"FFmpeg stderr: {e.stderr}")
376
+ raise
259
377
 
260
378
  def add_audio(self, audio: Audio, overlay: bool = True) -> None:
261
379
  if self.audio.is_silent:
@@ -269,7 +387,7 @@ class Video:
269
387
  try:
270
388
  new_audio = Audio.from_file(path)
271
389
  self.add_audio(new_audio, overlay)
272
- except Exception as e:
390
+ except Exception:
273
391
  print(f"Audio file `{path}` not found or invalid, skipping!")
274
392
 
275
393
  def __add__(self, other: Video) -> Video:
@@ -305,29 +423,6 @@ class Video:
305
423
  sliced.audio = self.audio.slice(start_seconds=audio_start, end_seconds=audio_end)
306
424
  return sliced
307
425
 
308
- @staticmethod
309
- def _load_video_from_path(path: str) -> tuple[np.ndarray, float]:
310
- cap = cv2.VideoCapture(path)
311
- if not cap.isOpened():
312
- raise ValueError(f"Unable to open video file: {path}")
313
-
314
- fps = cap.get(cv2.CAP_PROP_FPS)
315
- frames = []
316
-
317
- while True:
318
- ret, frame = cap.read()
319
- if not ret:
320
- break
321
- frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
322
- frames.append(frame)
323
-
324
- cap.release()
325
-
326
- if not frames:
327
- raise ValueError(f"No frames could be read from the video file: {path}")
328
-
329
- return np.array(frames), fps
330
-
331
426
  @property
332
427
  def video_shape(self) -> tuple[int, int, int, int]:
333
428
  return self.frames.shape
@@ -0,0 +1,3 @@
1
+ from videopython.utils.text import AnchorPoint, ImageText, TextAlign
2
+
3
+ __all__ = ["AnchorPoint", "ImageText", "TextAlign"]