videopython 0.2.1__tar.gz → 0.3.0__tar.gz
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as published in their public registries.
Potentially problematic release: this version of videopython has been flagged as possibly problematic.
- {videopython-0.2.1 → videopython-0.3.0}/.gitignore +4 -1
- {videopython-0.2.1 → videopython-0.3.0}/PKG-INFO +3 -15
- {videopython-0.2.1 → videopython-0.3.0}/pyproject.toml +12 -31
- {videopython-0.2.1 → videopython-0.3.0}/src/videopython/base/transitions.py +2 -2
- {videopython-0.2.1 → videopython-0.3.0}/src/videopython/base/video.py +88 -101
- {videopython-0.2.1 → videopython-0.3.0}/src/videopython/generation/audio.py +25 -13
- {videopython-0.2.1 → videopython-0.3.0}/src/videopython/utils/image.py +4 -0
- {videopython-0.2.1 → videopython-0.3.0}/LICENSE +0 -0
- {videopython-0.2.1 → videopython-0.3.0}/README.md +0 -0
- {videopython-0.2.1 → videopython-0.3.0}/src/videopython/__init__.py +0 -0
- {videopython-0.2.1 → videopython-0.3.0}/src/videopython/base/__init__.py +0 -0
- {videopython-0.2.1 → videopython-0.3.0}/src/videopython/base/compose.py +0 -0
- {videopython-0.2.1 → videopython-0.3.0}/src/videopython/base/effects.py +0 -0
- {videopython-0.2.1 → videopython-0.3.0}/src/videopython/base/exceptions.py +0 -0
- {videopython-0.2.1 → videopython-0.3.0}/src/videopython/base/transforms.py +0 -0
- {videopython-0.2.1 → videopython-0.3.0}/src/videopython/generation/__init__.py +0 -0
- {videopython-0.2.1 → videopython-0.3.0}/src/videopython/generation/image.py +0 -0
- {videopython-0.2.1 → videopython-0.3.0}/src/videopython/generation/video.py +0 -0
- {videopython-0.2.1 → videopython-0.3.0}/src/videopython/py.typed +0 -0
- {videopython-0.2.1 → videopython-0.3.0}/src/videopython/utils/__init__.py +0 -0
- {videopython-0.2.1 → videopython-0.3.0}/src/videopython/utils/common.py +0 -0
{videopython-0.2.1 → videopython-0.3.0}/PKG-INFO

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: videopython
-Version: 0.2.1
+Version: 0.3.0
 Summary: Minimal video generation and processing library.
 Project-URL: Homepage, https://github.com/bartwojtowicz/videopython/
 Project-URL: Repository, https://github.com/bartwojtowicz/videopython/
@@ -19,20 +19,8 @@ Requires-Dist: numpy>=1.25.2
 Requires-Dist: opencv-python>=4.9.0.80
 Requires-Dist: pillow>=10.3.0
 Requires-Dist: pydub>=0.25.1
+Requires-Dist: soundpython>=0.1.9
 Requires-Dist: tqdm>=4.66.3
-Provides-Extra: dev
-Requires-Dist: black==24.3.0; extra == 'dev'
-Requires-Dist: isort==5.12.0; extra == 'dev'
-Requires-Dist: mypy==1.8.0; extra == 'dev'
-Requires-Dist: pydub-stubs==0.25.1.1; extra == 'dev'
-Requires-Dist: pytest==7.4.0; extra == 'dev'
-Requires-Dist: types-pillow==10.2.0.20240213; extra == 'dev'
-Requires-Dist: types-tqdm==4.66.0.20240106; extra == 'dev'
-Provides-Extra: generation
-Requires-Dist: accelerate>=0.29.2; extra == 'generation'
-Requires-Dist: diffusers>=0.26.3; extra == 'generation'
-Requires-Dist: torch>=2.1.0; extra == 'generation'
-Requires-Dist: transformers>=4.38.1; extra == 'generation'
 Description-Content-Type: text/markdown
 
 # About
{videopython-0.2.1 → videopython-0.3.0}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "videopython"
-version = "0.2.1"
+version = "0.3.0"
 description = "Minimal video generation and processing library."
 authors = [
     { name = "Bartosz Wójtowicz", email = "bartoszwojtowicz@outlook.com" },
@@ -24,18 +24,18 @@ dependencies = [
     "opencv-python>=4.9.0.80",
     "pillow>=10.3.0",
     "pydub>=0.25.1",
+    "soundpython>=0.1.9",
     "tqdm>=4.66.3",
 ]
 
-[project.optional-dependencies]
+[dependency-groups]
 dev = [
-    "black==24.3.0",
-    "isort==5.12.0",
-    "mypy==1.8.0",
-    "pytest==7.4.0",
-    "types-Pillow==10.2.0.20240213",
-    "types-tqdm==4.66.0.20240106",
-    "pydub-stubs==0.25.1.1",
+    "black>=24.3.0",
+    "isort>=5.12.0",
+    "mypy>=1.8.0",
+    "pytest>=7.4.0",
+    "types-Pillow>=10.2.0.20240213",
+    "types-tqdm>=4.66.0.20240106",
 ]
 generation = [
     "accelerate>=0.29.2",
@@ -49,25 +49,8 @@ Homepage = "https://github.com/bartwojtowicz/videopython/"
 Repository = "https://github.com/bartwojtowicz/videopython/"
 Documentation = "https://github.com/bartwojtowicz/videopython/"
 
-[tool.rye]
-
-dev-dependencies = [
-    "black==24.3.0",
-    "isort==5.12.0",
-    "mypy==1.8.0",
-    "pytest==7.4.0",
-    "types-Pillow==10.2.0.20240213",
-    "types-tqdm==4.66.0.20240106",
-    "pydub-stubs==0.25.1.1",
-]
-
-[tool.rye.scripts]
-test-unit = "pytest"
-test-type = "mypy src"
-test-static = { chain = [
-    "black src -l 120 --check",
-    "isort src --profile black --check"
-]}
+[tool.mypy]
+mypy_path = "stubs"
 
 [build-system]
 requires = ["hatchling"]
@@ -79,10 +62,8 @@ packages = ["src/videopython"]
 [tool.hatch.build.targets.sdist]
 include = ["src/videopython", "src/videopython/py.typed"]
 
-[tool.mypy]
-mypy_path = "stubs"
-
 [tool.pytest]
+pythonpath = [".src/"]
 testpaths = ["src/tests"]
 python_files = ["test_*.py"]
 addopts = "-v --tb=short"
{videopython-0.2.1 → videopython-0.3.0}/src/videopython/base/transitions.py

@@ -67,7 +67,7 @@ class FadeTransition(Transition):
             ],
             fps=video_fps,
         )
-        faded_videos.audio = videos[0].audio.
+        faded_videos.audio = videos[0].audio.concat(videos[1].audio, crossfade=(effect_time_fps / video_fps))
         return faded_videos
 
 
@@ -102,5 +102,5 @@ class BlurTransition(Transition):
             ],
             fps=video_fps,
         )
-        blurred_videos.audio = videos[0].audio.
+        blurred_videos.audio = videos[0].audio.concat(videos[1].audio)
         return blurred_videos
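Both transitions now join audio through soundpython's seconds-based concat instead of pydub's millisecond arithmetic. A minimal sketch of the calls the new code relies on; the file names are placeholders, and only concat and its crossfade keyword are confirmed by the diff itself:

    from soundpython import Audio

    a = Audio.from_file("clip_a.wav")  # placeholder input
    b = Audio.from_file("clip_b.wav")  # placeholder input

    joined = a.concat(b)                # plain join, as in BlurTransition
    faded = a.concat(b, crossfade=0.5)  # 0.5 s crossfade, as in FadeTransition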
{videopython-0.2.1 → videopython-0.3.0}/src/videopython/base/video.py

@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import shlex
 import subprocess
 import tempfile
 from dataclasses import dataclass
@@ -9,7 +8,7 @@ from typing import Literal, get_args
 
 import cv2
 import numpy as np
-from pydub import AudioSegment
+from soundpython import Audio
 
 from videopython.utils.common import generate_random_name
 
|
|
|
42
41
|
|
|
43
42
|
@classmethod
|
|
44
43
|
def from_path(cls, video_path: str) -> VideoMetadata:
|
|
45
|
-
"""Creates VideoMetadata object from video file.
|
|
46
|
-
|
|
47
|
-
Args:
|
|
48
|
-
video_path: Path to video file.
|
|
49
|
-
"""
|
|
44
|
+
"""Creates VideoMetadata object from video file."""
|
|
50
45
|
video = cv2.VideoCapture(video_path)
|
|
51
46
|
frame_count = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
|
|
52
47
|
fps = round(video.get(cv2.CAP_PROP_FPS), 2)
|
|
@@ -64,13 +59,7 @@ class VideoMetadata:
 
     @classmethod
     def from_video(cls, video: Video) -> VideoMetadata:
-        """Creates VideoMetadata object from
-
-        Args:
-            frames: Frames of the video.
-            fps: Frames per second of the video.
-        """
-
+        """Creates VideoMetadata object from Video instance."""
         frame_count, height, width, _ = video.frames.shape
         total_seconds = round(frame_count / video.fps, 2)
 
@@ -116,11 +105,14 @@ class Video:
     def from_path(cls, path: str) -> Video:
         new_vid = cls()
         new_vid.frames, new_vid.fps = cls._load_video_from_path(path)
-
-
+
+        try:
+            new_vid.audio = Audio.from_file(path)
+        except Exception as e:
             print(f"No audio found for `{path}`, adding silent track!")
-            audio =
-
+            new_vid.audio = Audio.create_silent(
+                duration_seconds=round(new_vid.total_seconds, 2), stereo=True, sample_rate=44100
+            )
         return new_vid
 
     @classmethod
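The practical upshot: Video.from_path now always leaves a usable soundpython track on the instance, falling back to silence when the container has no audio. A quick sketch assuming the 0.3.0 API shown above (input.mp4 is a placeholder):

    from videopython.base.video import Video

    video = Video.from_path("input.mp4")
    # A silent 44.1 kHz stereo track is substituted if the file had no audio.
    print(video.audio.metadata.duration_seconds)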
@@ -134,7 +126,9 @@ class Video:
             raise ValueError(f"Unsupported number of dimensions: {frames.shape}!")
         new_vid.frames = frames
         new_vid.fps = fps
-        new_vid.audio =
+        new_vid.audio = Audio.create_silent(
+            duration_seconds=round(new_vid.total_seconds, 2), stereo=True, sample_rate=44100
+        )
         return new_vid
 
     @classmethod
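from_frames gets the same guarantee: the silent track's length is derived from the frame count and fps. A small illustration; the signature is inferred from copy(), which calls from_frames(self.frames.copy(), self.fps):

    import numpy as np

    frames = np.zeros((48, 720, 1280, 3), dtype=np.uint8)  # 48 black frames
    video = Video.from_frames(frames, fps=24.0)
    # 48 frames at 24 fps -> 2.0 s of video, matched by a 2.0 s silent track.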
@@ -144,12 +138,12 @@ class Video:
         image = np.expand_dims(image, axis=0)
         new_vid.frames = np.repeat(image, round(length_seconds * fps), axis=0)
         new_vid.fps = fps
-        new_vid.audio =
+        new_vid.audio = Audio.create_silent(duration_seconds=length_seconds, stereo=True, sample_rate=44100)
         return new_vid
 
     def copy(self) -> Video:
         copied = Video().from_frames(self.frames.copy(), self.fps)
-        copied.audio = self.audio
+        copied.audio = self.audio  # Audio objects are immutable, no need to copy
         return copied
 
     def is_loaded(self) -> bool:
@@ -165,25 +159,18 @@ class Video:
             self.from_frames(self.frames[:frame_idx], self.fps),
             self.from_frames(self.frames[frame_idx:], self.fps),
         )
-        audio_midpoint = (frame_idx / self.fps) * 1000
-        split_videos[0].audio = self.audio[:audio_midpoint]
-        split_videos[1].audio = self.audio[audio_midpoint:]
-        return split_videos
 
-
-
+        # Split audio at the corresponding time point
+        split_time = frame_idx / self.fps
+        split_videos[0].audio = self.audio.slice(start_seconds=0, end_seconds=split_time)
+        split_videos[1].audio = self.audio.slice(start_seconds=split_time)
 
-
-            filename: Name of the output video file. Generates random name if not provided.
-            format: Output format (default is 'mp4').
+        return split_videos
 
-
-            Path to the saved video file.
-        """
+    def save(self, filename: str | Path | None = None, format: ALLOWED_VIDEO_FORMATS = "mp4") -> Path:
         if not self.is_loaded():
             raise RuntimeError("Video is not loaded, cannot save!")
 
-        # Check if the format is allowed
         if format.lower() not in get_args(ALLOWED_VIDEO_FORMATS):
             raise ValueError(
                 f"Unsupported format: {format}. Allowed formats are: {', '.join(get_args(ALLOWED_VIDEO_FORMATS))}"
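The split logic now slices audio in seconds, keeping both halves aligned with the cut frame. A sketch; the method name and signature are an assumption inferred from the diff's split_videos and frame_idx, since the hunk starts mid-method:

    video = Video.from_path("input.mp4")  # placeholder
    first, second = video.split(frame_idx=120)  # hypothetical signature
    # frame_idx / fps seconds of audio go to `first`, the remainder to `second`.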
@@ -203,79 +190,95 @@ class Video:
                 frame_path = temp_dir_path / f"frame_{i:04d}.png"
                 cv2.imwrite(str(frame_path), cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
 
-            #
+            # Calculate exact video duration
+            video_duration = len(self.frames) / self.fps
+
+            # Ensure audio duration matches video duration
+            if (
+                abs(self.audio.metadata.duration_seconds - video_duration) > 0.001
+            ):  # Small threshold for float comparison
+                if self.audio.metadata.duration_seconds < video_duration:
+                    # Create silent audio for the remaining duration
+                    remaining_duration = video_duration - self.audio.metadata.duration_seconds
+                    silent_audio = Audio.create_silent(
+                        duration_seconds=remaining_duration,
+                        stereo=(self.audio.metadata.channels == 2),
+                        sample_rate=self.audio.metadata.sample_rate,
+                        sample_width=self.audio.metadata.sample_width,
+                    )
+                    # Concatenate original audio with silent padding
+                    padded_audio = self.audio.concat(silent_audio)
+                else:
+                    # Trim audio to match video duration
+                    padded_audio = self.audio.slice(end_seconds=video_duration)
+            else:
+                padded_audio = self.audio
+
+            # Save audio to temporary WAV file
             temp_audio = temp_dir_path / "temp_audio.wav"
-
+            padded_audio.save(str(temp_audio), format="wav")
 
-            # Construct FFmpeg command
+            # Construct FFmpeg command with explicit duration
             ffmpeg_command = [
                 "ffmpeg",
-                "-y",
-                "-
-                str(self.fps),  #
+                "-y",
+                "-framerate",
+                str(self.fps),  # Use -framerate instead of -r for input
                 "-i",
-                str(temp_dir_path / "frame_%04d.png"),
+                str(temp_dir_path / "frame_%04d.png"),
                 "-i",
-                str(temp_audio),
+                str(temp_audio),
                 "-c:v",
-                "libx264",
+                "libx264",
                 "-preset",
-                "medium",
+                "medium",
                 "-crf",
-                "23",
+                "23",
                 "-c:a",
-                "
+                "aac",  # Use AAC instead of copy for more reliable audio
                 "-b:a",
-                "192k",
+                "192k",
                 "-pix_fmt",
-                "yuv420p",
-                "-
+                "yuv420p",
+                "-map",
+                "0:v:0",  # Map video from first input
+                "-map",
+                "1:a:0",  # Map audio from second input
+                "-vsync",
+                "cfr",  # Force constant frame rate
                 str(filename),
             ]
 
             try:
                 subprocess.run(ffmpeg_command, check=True, capture_output=True, text=True)
-                print(f"Video saved successfully to: {filename}")
                 return filename
             except subprocess.CalledProcessError as e:
                 print(f"Error saving video: {e}")
                 print(f"FFmpeg stderr: {e.stderr}")
                 raise
 
-    def add_audio(self, audio:
-        self.audio
-
-
-
-
-
-        return
-
-        self.audio = self._process_audio(audio=new_audio, overlay=overlay, overlay_gain=overlay_gain, loop=loop)
-
-    def _process_audio(
-        self, audio: AudioSegment, overlay: bool = True, overlay_gain: int = 0, loop: bool = False
-    ) -> AudioSegment:
-        if (duration_diff := round(self.total_seconds - audio.duration_seconds)) > 0 and not loop:
-            audio = audio + AudioSegment.silent(duration_diff * 1000)
-        elif audio.duration_seconds > self.total_seconds:
-            audio = audio[: round(self.total_seconds * 1000)]
+    def add_audio(self, audio: Audio, overlay: bool = True) -> None:
+        if self.audio.is_silent:
+            self.audio = audio
+        elif overlay:
+            self.audio = self.audio.overlay(audio, position=0.0)
+        else:
+            self.audio = audio
 
-
-
-
+    def add_audio_from_file(self, path: str, overlay: bool = True) -> None:
+        try:
+            new_audio = Audio.from_file(path)
+            self.add_audio(new_audio, overlay)
+        except Exception as e:
+            print(f"Audio file `{path}` not found or invalid, skipping!")
 
     def __add__(self, other: Video) -> Video:
-        # TODO: Should it be class method? How to make it work with sum()?
         if self.fps != other.fps:
             raise ValueError("FPS of videos do not match!")
         elif self.frame_shape != other.frame_shape:
-            raise ValueError(
-                "Resolutions of the images do not match: "
-                f"{self.frame_shape} not compatible with {other.frame_shape}."
-            )
+            raise ValueError(f"Resolutions do not match: {self.frame_shape} vs {other.frame_shape}")
         new_video = self.from_frames(np.r_["0,2", self.frames, other.frames], fps=self.fps)
-        new_video.audio = self.audio
+        new_video.audio = self.audio.concat(other.audio)
        return new_video
 
     def __str__(self) -> str:
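The net effect of the save rewrite is that audio is always padded with silence or trimmed to the exact video duration before FFmpeg muxes it, and the new add_audio/add_audio_from_file pair replaces the pydub-based _process_audio. A rough usage sketch under the 0.3.0 API (file names are placeholders):

    from videopython.base.video import Video

    video = Video.from_path("input.mp4")
    video.add_audio_from_file("speech.wav")  # overlays onto the existing track by default
    saved_path = video.save("output.mp4")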
@@ -285,37 +288,25 @@ class Video:
         if not isinstance(val, slice):
             raise ValueError("Only slices are supported for video indexing!")
 
-        # Sub-slice video
+        # Sub-slice video frames
         sliced = self.from_frames(self.frames[val], fps=self.fps)
-
+
+        # Handle slicing bounds for audio
         start = val.start if val.start else 0
         stop = val.stop if val.stop else len(self.frames)
-        # Handle negative values for audio slices
         if start < 0:
             start = len(self.frames) + start
         if stop < 0:
             stop = len(self.frames) + stop
-        # Append audio to the slice
-        audio_start = round(start / self.fps) * 1000
-        audio_end = round(stop / self.fps) * 1000
-        sliced.audio = self.audio[audio_start:audio_end]
-        return sliced
 
-
-
-
-
-
-        except IndexError:
-            return None
+        # Slice audio to match video duration
+        audio_start = start / self.fps
+        audio_end = stop / self.fps
+        sliced.audio = self.audio.slice(start_seconds=audio_start, end_seconds=audio_end)
+        return sliced
 
     @staticmethod
     def _load_video_from_path(path: str) -> tuple[np.ndarray, float]:
-        """Loads frames and fps information from video file.
-
-        Args:
-            path: Path to video file.
-        """
         cap = cv2.VideoCapture(path)
         if not cap.isOpened():
             raise ValueError(f"Unable to open video file: {path}")
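With audio now sliced in seconds rather than pydub milliseconds, video[a:b] keeps frames and sound in lockstep. A small illustration under the same assumptions as above:

    video = Video.from_path("input.mp4")  # placeholder
    clip = video[30:90]  # frames 30..89; audio sliced from 30/fps s to 90/fps s
    print(clip.total_seconds, clip.audio.metadata.duration_seconds)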
@@ -339,20 +330,16 @@ class Video:
 
     @property
     def video_shape(self) -> tuple[int, int, int, int]:
-        """Returns 4D video shape."""
         return self.frames.shape
 
     @property
     def frame_shape(self) -> tuple[int, int, int]:
-        """Returns 3D frame shape."""
         return self.frames.shape[1:]
 
     @property
     def total_seconds(self) -> float:
-        """Returns total seconds of the video."""
         return round(self.frames.shape[0] / self.fps, 4)
 
     @property
     def metadata(self) -> VideoMetadata:
-        """Returns VideoMetadata object."""
         return VideoMetadata.from_video(self)
{videopython-0.2.1 → videopython-0.3.0}/src/videopython/generation/audio.py

@@ -1,6 +1,5 @@
-import numpy as np
 import torch
-from pydub import AudioSegment
+from soundpython import Audio, AudioMetadata
 from transformers import (
     AutoProcessor,
     AutoTokenizer,
@@ -17,15 +16,24 @@ class TextToSpeech:
         self.pipeline = VitsModel.from_pretrained(TEXT_TO_SPEECH_MODEL)
         self.tokenizer = AutoTokenizer.from_pretrained(TEXT_TO_SPEECH_MODEL)
 
-    def generate_audio(self, text: str) ->
+    def generate_audio(self, text: str) -> Audio:
         tokenized = self.tokenizer(text, return_tensors="pt")
 
         with torch.no_grad():
             output = self.pipeline(**tokenized).waveform
 
-
-
-
+        # Convert to float32 and normalize to [-1, 1]
+        audio_data = output.T.float().numpy()
+
+        metadata = AudioMetadata(
+            sample_rate=self.pipeline.config.sampling_rate,
+            channels=1,
+            sample_width=4,
+            duration_seconds=len(audio_data) / self.pipeline.config.sampling_rate,
+            frame_count=len(audio_data),
+        )
+
+        return Audio(audio_data, metadata)
 
 
 class TextToMusic:
@@ -37,7 +45,7 @@ class TextToMusic:
         self.processor = AutoProcessor.from_pretrained(MUSIC_GENERATION_MODEL_SMALL)
         self.model = MusicgenForConditionalGeneration.from_pretrained(MUSIC_GENERATION_MODEL_SMALL)
 
-    def generate_audio(self, text: str, max_new_tokens: int) ->
+    def generate_audio(self, text: str, max_new_tokens: int) -> Audio:
         inputs = self.processor(
             text=[text],
             padding=True,
@@ -45,12 +53,16 @@ class TextToMusic:
         )
         audio_values = self.model.generate(**inputs, max_new_tokens=max_new_tokens)
         sampling_rate = self.model.config.audio_encoder.sampling_rate
-        output = (audio_values[0, 0].float().numpy() * (2**31 - 1)).astype(np.int32)
 
-
-
-
+        # Convert to float32 and normalize to [-1, 1]
+        audio_data = audio_values[0, 0].float().numpy()
+
+        metadata = AudioMetadata(
+            sample_rate=sampling_rate,
             channels=1,
+            sample_width=4,
+            duration_seconds=len(audio_data) / sampling_rate,
+            frame_count=len(audio_data),
         )
-
+
+        return Audio(audio_data, metadata)
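Both generators now return soundpython Audio objects, so their output can be attached to a Video directly. A sketch under the 0.3.0 API in this diff; model weights download on first use and the file names are placeholders:

    from videopython.base.video import Video
    from videopython.generation.audio import TextToSpeech

    tts = TextToSpeech()
    speech = tts.generate_audio("Hello from videopython!")

    video = Video.from_path("input.mp4")
    video.add_audio(speech)
    video.save("narrated.mp4")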
{videopython-0.2.1 → videopython-0.3.0}/src/videopython/utils/image.py

@@ -197,6 +197,10 @@ class ImageText:
         # Find bounding rectangle for written text
         box_slice = img[y:current_text_height, x : x + box_width]
         text_mask = np.any(box_slice != 0, axis=2).astype(np.uint8)
+        if not isinstance(text_mask, np.ndarray):
+            raise TypeError(
+                f"The returned text mask is of type {type(text_mask)}, " "but it should be numpy array!"
+            )
         xmin, xmax, ymin, ymax = self._find_smallest_bounding_rect(text_mask)
         # Get global bounding box position
         xmin += x - background_padding