torchcodec-0.7.0-cp310-cp310-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- torchcodec/__init__.py +16 -0
- torchcodec/_core/AVIOContextHolder.cpp +60 -0
- torchcodec/_core/AVIOContextHolder.h +64 -0
- torchcodec/_core/AVIOFileLikeContext.cpp +98 -0
- torchcodec/_core/AVIOFileLikeContext.h +55 -0
- torchcodec/_core/AVIOTensorContext.cpp +123 -0
- torchcodec/_core/AVIOTensorContext.h +43 -0
- torchcodec/_core/CMakeLists.txt +292 -0
- torchcodec/_core/Cache.h +138 -0
- torchcodec/_core/CpuDeviceInterface.cpp +266 -0
- torchcodec/_core/CpuDeviceInterface.h +70 -0
- torchcodec/_core/CudaDeviceInterface.cpp +514 -0
- torchcodec/_core/CudaDeviceInterface.h +37 -0
- torchcodec/_core/DeviceInterface.cpp +79 -0
- torchcodec/_core/DeviceInterface.h +67 -0
- torchcodec/_core/Encoder.cpp +514 -0
- torchcodec/_core/Encoder.h +123 -0
- torchcodec/_core/FFMPEGCommon.cpp +421 -0
- torchcodec/_core/FFMPEGCommon.h +227 -0
- torchcodec/_core/FilterGraph.cpp +142 -0
- torchcodec/_core/FilterGraph.h +45 -0
- torchcodec/_core/Frame.cpp +32 -0
- torchcodec/_core/Frame.h +118 -0
- torchcodec/_core/Metadata.h +72 -0
- torchcodec/_core/SingleStreamDecoder.cpp +1715 -0
- torchcodec/_core/SingleStreamDecoder.h +380 -0
- torchcodec/_core/StreamOptions.h +53 -0
- torchcodec/_core/ValidationUtils.cpp +35 -0
- torchcodec/_core/ValidationUtils.h +21 -0
- torchcodec/_core/__init__.py +40 -0
- torchcodec/_core/_metadata.py +317 -0
- torchcodec/_core/custom_ops.cpp +727 -0
- torchcodec/_core/fetch_and_expose_non_gpl_ffmpeg_libs.cmake +300 -0
- torchcodec/_core/ops.py +455 -0
- torchcodec/_core/pybind_ops.cpp +87 -0
- torchcodec/_frame.py +145 -0
- torchcodec/_internally_replaced_utils.py +67 -0
- torchcodec/_samplers/__init__.py +7 -0
- torchcodec/_samplers/video_clip_sampler.py +430 -0
- torchcodec/decoders/__init__.py +11 -0
- torchcodec/decoders/_audio_decoder.py +177 -0
- torchcodec/decoders/_decoder_utils.py +52 -0
- torchcodec/decoders/_video_decoder.py +464 -0
- torchcodec/encoders/__init__.py +1 -0
- torchcodec/encoders/_audio_encoder.py +150 -0
- torchcodec/libtorchcodec_core4.dll +0 -0
- torchcodec/libtorchcodec_core5.dll +0 -0
- torchcodec/libtorchcodec_core6.dll +0 -0
- torchcodec/libtorchcodec_core7.dll +0 -0
- torchcodec/libtorchcodec_custom_ops4.dll +0 -0
- torchcodec/libtorchcodec_custom_ops5.dll +0 -0
- torchcodec/libtorchcodec_custom_ops6.dll +0 -0
- torchcodec/libtorchcodec_custom_ops7.dll +0 -0
- torchcodec/libtorchcodec_pybind_ops4.pyd +0 -0
- torchcodec/libtorchcodec_pybind_ops5.pyd +0 -0
- torchcodec/libtorchcodec_pybind_ops6.pyd +0 -0
- torchcodec/libtorchcodec_pybind_ops7.pyd +0 -0
- torchcodec/samplers/__init__.py +2 -0
- torchcodec/samplers/_common.py +84 -0
- torchcodec/samplers/_index_based.py +287 -0
- torchcodec/samplers/_time_based.py +350 -0
- torchcodec/version.py +2 -0
- torchcodec-0.7.0.dist-info/METADATA +242 -0
- torchcodec-0.7.0.dist-info/RECORD +67 -0
- torchcodec-0.7.0.dist-info/WHEEL +5 -0
- torchcodec-0.7.0.dist-info/licenses/LICENSE +28 -0
- torchcodec-0.7.0.dist-info/top_level.txt +2 -0
torchcodec/_internally_replaced_utils.py
@@ -0,0 +1,67 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import importlib
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+# Copy pasted from torchvision
+# https://github.com/pytorch/vision/blob/947ae1dc71867f28021d5bc0ff3a19c249236e2a/torchvision/_internally_replaced_utils.py#L25
+def _get_extension_path(lib_name: str) -> str:
+    extension_suffixes = []
+    if sys.platform == "linux":
+        extension_suffixes = importlib.machinery.EXTENSION_SUFFIXES
+    elif sys.platform == "darwin":
+        extension_suffixes = importlib.machinery.EXTENSION_SUFFIXES + [".dylib"]
+    elif sys.platform in ("win32", "cygwin"):
+        extension_suffixes = importlib.machinery.EXTENSION_SUFFIXES + [".dll", ".pyd"]
+    else:
+        raise NotImplementedError(f"{sys.platform = } is not supported")
+    loader_details = (
+        importlib.machinery.ExtensionFileLoader,
+        extension_suffixes,
+    )
+
+    extfinder = importlib.machinery.FileFinder(
+        str(Path(__file__).parent), loader_details
+    )
+    ext_specs = extfinder.find_spec(lib_name)
+    if ext_specs is None:
+        raise ImportError(f"No spec found for {lib_name}")
+
+    if ext_specs.origin is None:
+        raise ImportError(f"Existing spec found for {lib_name} does not have an origin")
+
+    return ext_specs.origin
+
+
+def _load_pybind11_module(module_name: str, library_path: str) -> ModuleType:
+    spec = importlib.util.spec_from_file_location(
+        module_name,
+        library_path,
+    )
+    if spec is None or spec.loader is None:
+        raise ImportError(
+            f"Unable to load spec or spec.loader for module {module_name} from path {library_path}"
+        )
+
+    mod = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(mod)
+
+    return mod
+
+
+# Note that the return value from this function must match the value used as
+# PYBIND_OPS_MODULE_NAME when we compile _core/pybind_ops.cpp. If the values
+# do not match, we will not be able to import the C++ shared library as a
+# Python module at runtime.
+#
+# The parameter ffmpeg_major_version is unused externally, but used
+# internally.
+def _get_pybind_ops_module_name(ffmpeg_major_version: int) -> str:
+    return "core_pybind_ops"
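The three helpers above are enough to locate and import one of the versioned pybind libraries shipped in this wheel (libtorchcodec_pybind_ops4.pyd through libtorchcodec_pybind_ops7.pyd in the file list). A minimal sketch of how they might be combined follows; the fallback loop and the _load_core_pybind_ops name are assumptions for illustration, not the package's actual loading logic.

# Hypothetical sketch: try each bundled FFmpeg major version until one of
# the pybind ops libraries can be located and loaded.
def _load_core_pybind_ops() -> ModuleType:
    for ffmpeg_major_version in (7, 6, 5, 4):
        lib_name = f"libtorchcodec_pybind_ops{ffmpeg_major_version}"
        try:
            library_path = _get_extension_path(lib_name)
        except ImportError:
            continue  # this FFmpeg flavor is not present/usable on this machine
        module_name = _get_pybind_ops_module_name(ffmpeg_major_version)
        return _load_pybind11_module(module_name, library_path)
    raise ImportError("Could not load any libtorchcodec_pybind_ops library")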
torchcodec/_samplers/video_clip_sampler.py
@@ -0,0 +1,430 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import abc
+import json
+import sys
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Tuple, Union
+
+import torch
+from torch import nn, Tensor
+
+from torchcodec._core import (
+    add_video_stream,
+    create_from_tensor,
+    get_frames_at_indices,
+    get_json_metadata,
+    get_next_frame,
+    scan_all_streams_to_update_metadata,
+    seek_to_pts,
+)
+
+
+class VideoTooShortException(Exception):
+    pass
+
+
+@dataclass
+class DecoderArgs:
+    num_threads: int = 0
+
+
+@dataclass
+class VideoArgs:
+    """
+    VideoArgs contains video related information. Video width/height cannot coexist with video min/max dimension.
+    Args:
+        desired_width (`int`): Target width of the video
+        desired_height (`int`): Target height of the video
+        desired_max_dimension (`int`): Target maximum dimension of the video
+        desired_min_dimension (`int`): Target minimum dimension of the video
+    """
+
+    desired_width: int = 0
+    desired_height: int = 0
+    desired_max_dimension: int = 0
+    desired_min_dimension: int = 0
+
+
+@dataclass
+class SamplerArgs(abc.ABC):
+    """
+    Abstract class of sampler args, extended by TimeBasedSamplerArgs and IndexBasedSamplerArgs.
+    Frame refers to a video/audio frame, and clip is a list of frames which may be non-consecutive.
+    Args:
+        sampler_type (`str`): Sampler type, can be random, uniform, periodic, target
+        clips_per_video (`int`): Number of clips per video; this applies to random and uniform sampling
+        frames_per_clip (`int`): Number of frames per clip
+    """
+
+    sampler_type: str
+    clips_per_video: int
+    frames_per_clip: int
+
+
+@dataclass
+class TimeBasedSamplerArgs(SamplerArgs):
+    """
+    TimeBasedSamplerArgs inherits from SamplerArgs and describes the time-based sampling behavior.
+    Args:
+        video_frame_dilation (`int`): Frame dilation of the video; if frame dilation is 2, we will sample every other frame within a clip.
+        sample_start_second (`float`): Start second of the sampler range, applies to all sampler types
+        sample_end_second (`float`): End second of the sampler range, applies to all sampler types
+        sample_per_second (`float`): Samples per second of the sampler range, applies to periodic sampling
+        target_sample_start_second (`List[float]`): Start seconds of the target sampling range, applies to target sampling
+    """
+
+    video_frame_dilation: int = 1
+    sample_start_second: float = 0.0
+    sample_end_second: float = float("inf")
+    sample_per_second: float = 0.0
+    target_sample_start_second: List[float] = field(default_factory=lambda: [])
+
+
+@dataclass
+class IndexBasedSamplerArgs(SamplerArgs):
+    """
+    IndexBasedSamplerArgs inherits from SamplerArgs and describes the index-based sampling behavior.
+    sample_start_index and sample_end_index together decide the range of the sampling.
+    sample_step decides the step between clips.
+    video_frame_dilation decides the step between frames within a clip.
+    Args:
+        video_frame_dilation (`int`): Frame dilation of the video; if frame dilation is 2, we will sample every other frame within a clip; applies to all sampler types
+        sample_start_index (`int`): Start index of the sampler range, applies to all sampler types
+        sample_end_index (`int`): End index of the sampler range; this is the last possible frame you want to sample; applies to all sampler types
+        sample_step (`int`): Step of the sampler range; if step is 10, the interval between start frames of each clip will be 10; applies to periodic sampling only.
+    """
+
+    video_frame_dilation: int = 1
+    sample_start_index: int = 0
+    sample_end_index: int = sys.maxsize
+    sample_step: int = 1
+
+
+class VideoClipSampler(nn.Module):
+    """
+    VideoClipSampler samples video clips with the given video args and sampler args.
+    The video args contain video-related information: frames_per_clip, dimensions, etc.
+    The sampler args can be either time-based or index-based; they are used to decide each clip's start time pts or index.
+    Clip sampling supports random, uniform, periodic, target, keyframe sampling, etc.
+
+    Args:
+        video_args (`VideoArgs`): The video args
+        sampler_args (`SamplerArgs`): The sampler args. Can be TimeBasedSamplerArgs or IndexBasedSamplerArgs
+        decoder_args (`DecoderArgs`): Decoder args contain values needed by the decoder, for example, thread count
+
+    Example:
+        >>> video_args = VideoArgs(desired_width=224, desired_height=224)
+        >>> time_based_sampler_args = TimeBasedSamplerArgs(sampler_type="random", clips_per_video=1, frames_per_clip=4)
+        >>> decoder_args = DecoderArgs(num_threads=1)
+        >>> video_clip_sampler = VideoClipSampler(video_args, time_based_sampler_args, decoder_args)
+        >>> clips = video_clip_sampler(video_data)
+        clips now contains a list of clips, where each clip is a list of frame tensors; each tensor represents a frame image.
+    """
+
+    def __init__(
+        self,
+        video_args: VideoArgs,
+        sampler_args: SamplerArgs,
+        decoder_args: Union[None, DecoderArgs] = None,
+    ) -> None:
+        super().__init__()
+        self.video_args = video_args
+        self.sampler_args = sampler_args
+        self.decoder_args = DecoderArgs() if decoder_args is None else decoder_args
+
+    def forward(self, video_data: Tensor) -> Union[List[Any]]:
+        """Sample video clips from the video data
+
+        Args:
+            video_data (`Tensor`): The video data
+
+        Returns:
+            clips (`List[List[Tensor]]`): List of clips, where each clip is a list of Tensors; each tensor represents a frame image.
+
+        """
+
+        video_decoder = create_from_tensor(video_data)
+        scan_all_streams_to_update_metadata(video_decoder)
+        add_video_stream(video_decoder)
+        metadata_json = json.loads(get_json_metadata(video_decoder))
+        target_width, target_height = self._compute_frame_width_height(
+            metadata_json["width"], metadata_json["height"]
+        )
+
+        video_decoder = create_from_tensor(video_data)
+        scan_all_streams_to_update_metadata(video_decoder)
+        add_video_stream(
+            video_decoder,
+            width=target_width,
+            height=target_height,
+            num_threads=self.decoder_args.num_threads,
+        )
+
+        clips: List[Any] = []
+        # Cast sampler args to be time based or index based
+        if isinstance(self.sampler_args, TimeBasedSamplerArgs):
+            time_based_sampler_args = self.sampler_args
+            clip_starts_in_seconds = self._get_start_seconds(
+                metadata_json, time_based_sampler_args
+            )
+            for start_ts in clip_starts_in_seconds:
+                clip = self._get_clip_with_start_second(
+                    start_ts,
+                    video_decoder,
+                    time_based_sampler_args.video_frame_dilation,
+                )
+                clips.append(clip)
+        elif isinstance(self.sampler_args, IndexBasedSamplerArgs):
+            index_based_sampler_args = self.sampler_args
+            clips = self._get_clips_for_index_based_sampling(
+                video_decoder,
+                index_based_sampler_args,
+                metadata_json,
+            )
+
+        return clips
+
+    def _get_clips_for_index_based_sampling(
+        self,
+        video_decoder: Tensor,
+        index_based_sampler_args: IndexBasedSamplerArgs,
+        metadata_json: Dict[str, Any],
+    ) -> List[Tensor]:
+        """Get clips for index based sampling; the sampling is done in 3 steps:
+        1. Compute clip_start_idxs based on the sampler type and the sampler args;
+        2. For each clip, given clip_start_idx, video_frame_dilation, frames_per_clip, get indexes for all frames;
+        3. With the given indexes, fetch the frames and group them into clips.
+
+        Args:
+            video_decoder (`Tensor`): The video decoder
+            index_based_sampler_args (`IndexBasedSamplerArgs`): The index based sampler args
+            metadata_json (`Dict[str, Any]`): The metadata of the video in json format
+
+        Returns:
+            clips (`List[Tensor]`): List of clips, where each clip is a Tensor representing a list of frames; the default Tensor shape is NCHW.
+        """
+
+        sample_start_index = max(0, index_based_sampler_args.sample_start_index)
+        sample_end_index = (
+            min(
+                index_based_sampler_args.sample_end_index + 1,
+                metadata_json["numFramesFromHeader"],
+            )
+            - index_based_sampler_args.video_frame_dilation
+            * index_based_sampler_args.frames_per_clip
+        )
+        sampler_type = index_based_sampler_args.sampler_type
+
+        if sampler_type == "random":
+            clip_start_idxs = torch.randint(
+                sample_start_index,
+                sample_end_index,
+                (index_based_sampler_args.clips_per_video,),
+            )
+        elif sampler_type == "uniform":
+            clip_start_idxs = torch.linspace(
+                sample_start_index,
+                sample_end_index,
+                index_based_sampler_args.clips_per_video,
+                dtype=torch.int32,
+            )
+
+        clips = []
+        for clip_start_idx in clip_start_idxs:
+            batch_indexes = [
+                clip_start_idx + i * index_based_sampler_args.video_frame_dilation
+                for i in range(index_based_sampler_args.frames_per_clip)
+            ]
+            frames, *_ = get_frames_at_indices(
+                video_decoder,
+                frame_indices=batch_indexes,
+            )
+            clips.append(frames)
+
+        return clips
+
+    def _get_start_seconds(
+        self,
+        metadata_json: Dict[str, Any],
+        time_based_sampler_args: TimeBasedSamplerArgs,
+    ) -> List[float]:
+        """Get start seconds for each clip.
+        Given different sampler types, the API returns different clip start seconds.
+
+        Args:
+            metadata_json (`Dict[str, Any]`): The metadata of the video in json format
+            time_based_sampler_args (`TimeBasedSamplerArgs`): The time based sampler args
+
+        Returns:
+            (`List[float]`): List of the sampled clip start positions in seconds
+        """
+        video_duration_in_seconds = metadata_json["durationSecondsFromHeader"]
+
+        clip_duration_in_seconds = (
+            time_based_sampler_args.frames_per_clip
+            * time_based_sampler_args.video_frame_dilation
+            + 1
+        ) / metadata_json["averageFpsFromHeader"]
+
+        beginStreamSecondsFromContent = (
+            metadata_json["beginStreamSecondsFromContent"]
+            if metadata_json["beginStreamSecondsFromContent"]
+            else 0
+        )
+        endStreamSecondsFromContent = (
+            metadata_json["endStreamSecondsFromContent"]
+            if metadata_json["endStreamSecondsFromContent"] > 0
+            else video_duration_in_seconds
+        )
+        last_possible_clip_start_in_seconds = (
+            endStreamSecondsFromContent - clip_duration_in_seconds
+        )
+        if last_possible_clip_start_in_seconds < 0:
+            raise VideoTooShortException(
+                "Cannot get clips because video duration is shorter than the clip duration!"
+            )
+        sampler_type = time_based_sampler_args.sampler_type
+        clip_starts_in_seconds: List[float] = []
+        sample_start_second = max(
+            time_based_sampler_args.sample_start_second,
+            beginStreamSecondsFromContent,
+        )
+        sample_end_second = min(
+            last_possible_clip_start_in_seconds,
+            time_based_sampler_args.sample_end_second,
+        )
+        if sampler_type == "random":
+            clip_starts_in_seconds = (
+                torch.rand(time_based_sampler_args.clips_per_video)
+                * (sample_end_second - sample_start_second)
+                + sample_start_second
+            ).tolist()
+            clip_starts_in_seconds.sort()
+        elif sampler_type == "uniform":
+            clip_starts_in_seconds = torch.linspace(
+                sample_start_second,
+                sample_end_second,
+                time_based_sampler_args.clips_per_video,
+            ).tolist()
+        else:
+            raise NotImplementedError
+
+        return clip_starts_in_seconds
+
+    def _get_clip_with_start_second(
+        self, start_second: float, video_decoder: Tensor, video_frame_dilation: int
+    ) -> List[Tensor]:
+        """Get clip with start second.
+
+        Args:
+            `start_second` (`float`): The start second of the clip
+            `video_decoder` (`Tensor`): The video decoder
+            `video_frame_dilation` (`int`): The video frame dilation; by default it's 1.
+
+        Returns:
+            `clip` (`List[Tensor]`): clip is a list of frame tensors. The dimension of each frame tensor is user specified; by default it's HWC.
+        """
+        seek_to_pts(video_decoder, start_second)
+        frames_needed_per_clip = (
+            self.sampler_args.frames_per_clip - 1
+        ) * video_frame_dilation + 1
+        clip = []
+        for _ in range(frames_needed_per_clip):
+            frame, _, _ = get_next_frame(video_decoder)
+            clip.append(frame)
+
+        # slice the list of tensors with the frame dilation
+        clip = clip[::video_frame_dilation]
+        return clip
+
+    def _compute_frame_width_height(
+        self, ori_width: int, ori_height: int
+    ) -> Tuple[int, int]:
+        """Compute output frame width and height
+        desired_width, desired_height, desired_min_dimension, desired_max_dimension (`int`): Together decide the size of the decoded video clips. (Default: `0`).
+        Note that the desired_width/desired_height parameters are mutually exclusive with the desired_min_dimension/desired_max_dimension parameters.
+        - When desired_width = 0, desired_height = 0, desired_min_dimension = 0,
+          and desired_max_dimension = 0, keep the original frame resolution
+        - When desired_width = 0, desired_height != 0, desired_min_dimension = 0,
+          and desired_max_dimension = 0, keep the aspect ratio and resize
+          the frame so that frame target_height is $desired_height
+        - When desired_width != 0, desired_height == 0, desired_min_dimension = 0,
+          and desired_max_dimension = 0, keep the aspect ratio and resize
+          the frame so that frame target_width is $desired_width
+        - When desired_width != 0, desired_height != 0, desired_min_dimension = 0,
+          and desired_max_dimension = 0, resize the frame so that frame
+          target_width and target_height are set to $desired_width and
+          $desired_height, respectively
+        - When desired_width = 0, desired_height = 0, desired_min_dimension != 0,
+          and desired_max_dimension = 0, keep the aspect ratio and resize the
+          frame so that shorter edge size is desired_min_dimension
+        - When desired_width = 0, desired_height = 0, desired_min_dimension = 0,
+          and desired_max_dimension != 0, keep the aspect ratio and resize
+          the frame so that longer edge size is desired_max_dimension
+        - When desired_width = 0, desired_height = 0, desired_min_dimension != 0,
+          and desired_max_dimension != 0, resize the frame so that shorter
+          edge size is desired_min_dimension, and longer edge size is
+          desired_max_dimension. The aspect ratio may not be preserved
+
+        Args:
+            ori_width (`int`): Original width of the video
+            ori_height (`int`): Original height of the video
+
+        Returns:
+            (`Tuple[int, int]`): output frame width and height
+        """
+        width_height_ratio = ori_width / ori_height
+        height_width_ratio = ori_height / ori_width
+
+        target_width, target_height = ori_width, ori_height
+
+        # desired_height and/or desired_width is non zero
+        if self.video_args.desired_width == 0 and self.video_args.desired_height != 0:
+            target_height = self.video_args.desired_height
+            target_width = int(width_height_ratio * target_height)
+        elif self.video_args.desired_width != 0 and self.video_args.desired_height == 0:
+            target_width = self.video_args.desired_width
+            target_height = int(height_width_ratio * target_width)
+        elif self.video_args.desired_width != 0 and self.video_args.desired_height != 0:
+            target_width, target_height = (
+                self.video_args.desired_width,
+                self.video_args.desired_height,
+            )
+        # desired_min_dimension and/or desired_max_dimension is non zero
+        elif (
+            self.video_args.desired_min_dimension != 0
+            and self.video_args.desired_max_dimension == 0
+        ):
+            if ori_width > ori_height:
+                target_height = self.video_args.desired_min_dimension
+                target_width = int(width_height_ratio * target_height)
+            else:
+                target_width = self.video_args.desired_min_dimension
+                target_height = int(height_width_ratio * target_width)
+        elif (
+            self.video_args.desired_min_dimension == 0
+            and self.video_args.desired_max_dimension != 0
+        ):
+            if ori_width > ori_height:
+                target_width = self.video_args.desired_max_dimension
+                target_height = int(height_width_ratio * target_width)
+            else:
+                target_height = self.video_args.desired_max_dimension
+                target_width = int(width_height_ratio * target_height)
+        elif (
+            self.video_args.desired_min_dimension != 0
+            and self.video_args.desired_max_dimension != 0
+        ):
+            if ori_width > ori_height:
+                target_width = self.video_args.desired_max_dimension
+                target_height = self.video_args.desired_min_dimension
+            else:
+                target_height = self.video_args.desired_max_dimension
+                target_width = self.video_args.desired_min_dimension

        return target_width, target_height
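A short usage sketch for the index-based path above. Reading a file into a uint8 tensor is an assumption for illustration ("video.mp4" is a placeholder path); forward only requires the raw encoded bytes as a tensor, which it passes to create_from_tensor.

import torch
from torchcodec._samplers.video_clip_sampler import (
    IndexBasedSamplerArgs,
    VideoArgs,
    VideoClipSampler,
)

# Raw encoded bytes of the container, as a uint8 tensor.
with open("video.mp4", "rb") as f:
    video_data = torch.frombuffer(f.read(), dtype=torch.uint8)

sampler = VideoClipSampler(
    VideoArgs(desired_width=224, desired_height=224),
    IndexBasedSamplerArgs(
        sampler_type="uniform",
        clips_per_video=2,
        frames_per_clip=4,
        video_frame_dilation=2,  # sample every other frame within a clip
    ),
)
clips = sampler(video_data)  # 2 clips, each a batch of 4 decoded frames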
torchcodec/decoders/__init__.py
@@ -0,0 +1,11 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from .._core import AudioStreamMetadata, VideoStreamMetadata
+from ._audio_decoder import AudioDecoder  # noqa
+from ._video_decoder import VideoDecoder  # noqa
+
+SimpleVideoDecoder = VideoDecoder
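Given these exports, the public decoding entry points are torchcodec.decoders.VideoDecoder and torchcodec.decoders.AudioDecoder, with SimpleVideoDecoder kept as a backward-compatible alias. A minimal sketch, assuming a local file at the placeholder path "video.mp4":

from torchcodec.decoders import VideoDecoder

decoder = VideoDecoder("video.mp4")
print(decoder.metadata)  # VideoStreamMetadata of the selected stream
frame = decoder[0]       # decoded frames are indexable as tensors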
torchcodec/decoders/_audio_decoder.py
@@ -0,0 +1,177 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import io
+from pathlib import Path
+from typing import Optional, Union
+
+import torch
+from torch import Tensor
+
+from torchcodec import _core as core, AudioSamples
+from torchcodec.decoders._decoder_utils import (
+    create_decoder,
+    ERROR_REPORTING_INSTRUCTIONS,
+)
+
+
+class AudioDecoder:
+    """A single-stream audio decoder.
+
+    This can be used to decode audio from pure audio files (e.g. mp3, wav,
+    etc.), or from videos that contain audio streams (e.g. mp4 videos).
+
+    Returned samples are float samples normalized in [-1, 1].
+
+    Args:
+        source (str, ``pathlib.Path``, bytes, ``torch.Tensor`` or file-like
+            object): The source of the video or audio:
+
+            - If ``str``: a local path or a URL to a video or audio file.
+            - If ``pathlib.Path``: a path to a local video or audio file.
+            - If ``bytes`` object or ``torch.Tensor``: the raw encoded audio data.
+            - If file-like object: we read video data from the object on demand. The object must
+              expose the methods `read(self, size: int) -> bytes` and
+              `seek(self, offset: int, whence: int) -> int`. Read more in:
+              :ref:`sphx_glr_generated_examples_decoding_file_like.py`.
+        stream_index (int, optional): Specifies which stream in the file to decode samples from.
+            Note that this index is absolute across all media types. If left unspecified, then
+            the :term:`best stream` is used.
+        sample_rate (int, optional): The desired output sample rate of the decoded samples.
+            By default, the sample rate of the source is used.
+        num_channels (int, optional): The desired number of channels of the decoded samples.
+            By default, the number of channels of the source is used.
+
+    Attributes:
+        metadata (AudioStreamMetadata): Metadata of the audio stream.
+        stream_index (int): The stream index that this decoder is retrieving samples from. If a
+            stream index was provided at initialization, this is the same value. If it was left
+            unspecified, this is the :term:`best stream`.
+    """
+
+    def __init__(
+        self,
+        source: Union[str, Path, io.RawIOBase, io.BufferedReader, bytes, Tensor],
+        *,
+        stream_index: Optional[int] = None,
+        sample_rate: Optional[int] = None,
+        num_channels: Optional[int] = None,
+    ):
+        torch._C._log_api_usage_once("torchcodec.decoders.AudioDecoder")
+        self._decoder = create_decoder(source=source, seek_mode="approximate")
+
+        core.add_audio_stream(
+            self._decoder,
+            stream_index=stream_index,
+            sample_rate=sample_rate,
+            num_channels=num_channels,
+        )
+
+        container_metadata = core.get_container_metadata(self._decoder)
+        self.stream_index = (
+            container_metadata.best_audio_stream_index
+            if stream_index is None
+            else stream_index
+        )
+        if self.stream_index is None:
+            raise ValueError(
+                "The best audio stream is unknown and there is no specified stream. "
+                + ERROR_REPORTING_INSTRUCTIONS
+            )
+        self.metadata = container_metadata.streams[self.stream_index]
+        assert isinstance(self.metadata, core.AudioStreamMetadata)  # mypy
+
+        self._desired_sample_rate = (
+            sample_rate if sample_rate is not None else self.metadata.sample_rate
+        )
+
+    def get_all_samples(self) -> AudioSamples:
+        """Returns all the audio samples from the source.
+
+        To decode samples in a specific range, use
+        :meth:`~torchcodec.decoders.AudioDecoder.get_samples_played_in_range`.
+
+        Returns:
+            AudioSamples: The samples within the file.
+        """
+        return self.get_samples_played_in_range()
+
+    def get_samples_played_in_range(
+        self, start_seconds: float = 0.0, stop_seconds: Optional[float] = None
+    ) -> AudioSamples:
+        """Returns audio samples in the given range.
+
+        Samples are in the half open range [start_seconds, stop_seconds).
+
+        To decode all the samples from beginning to end, you can call this
+        method while leaving ``start_seconds`` and ``stop_seconds`` to their
+        default values, or use
+        :meth:`~torchcodec.decoders.AudioDecoder.get_all_samples` as a more
+        convenient alias.
+
+        Args:
+            start_seconds (float): Time, in seconds, of the start of the
+                range. Default: 0.
+            stop_seconds (float or None): Time, in seconds, of the end of the
+                range. As a half open range, the end is excluded. Default: None,
+                which decodes samples until the end.
+
+        Returns:
+            AudioSamples: The samples within the specified range.
+        """
+        if stop_seconds is not None and not start_seconds <= stop_seconds:
+            raise ValueError(
+                f"Invalid start seconds: {start_seconds}. It must be less than or equal to stop seconds ({stop_seconds})."
+            )
+        frames, first_pts = core.get_frames_by_pts_in_range_audio(
+            self._decoder,
+            start_seconds=start_seconds,
+            stop_seconds=stop_seconds,
+        )
+        first_pts = first_pts.item()
+
+        # x = frame boundaries
+        #
+        #            first_pts                                    last_pts
+        #                v                                            v
+        # ....x..........x..........x...........x..........x..........x.....
+        #                     ^                                 ^
+        #                start_seconds                     stop_seconds
+        #
+        # We want to return the samples in [start_seconds, stop_seconds). But
+        # because the core API is based on frames, the `frames` tensor contains
+        # the samples in [first_pts, last_pts).
+        # So we do some basic math to figure out the position of the view that
+        # we'll return.
+
+        sample_rate = self._desired_sample_rate
+        # TODO: metadata's sample_rate should probably not be Optional
+        assert sample_rate is not None  # mypy.
+
+        if first_pts < start_seconds:
+            offset_beginning = round((start_seconds - first_pts) * sample_rate)
+            output_pts_seconds = start_seconds
+        else:
+            # In normal cases we'll have first_pts <= start_seconds, but in some
+            # edge cases it's possible to have first_pts > start_seconds,
+            # typically if the stream's first frame's pts isn't exactly 0.
+            offset_beginning = 0
+            output_pts_seconds = first_pts
+
+        num_samples = frames.shape[1]
+        last_pts = first_pts + num_samples / sample_rate
+        if stop_seconds is not None and stop_seconds < last_pts:
+            offset_end = num_samples - round((last_pts - stop_seconds) * sample_rate)
+        else:
+            offset_end = num_samples
+
+        data = frames[:, offset_beginning:offset_end]
+        return AudioSamples(
+            data=data,
+            pts_seconds=output_pts_seconds,
+            duration_seconds=data.shape[1] / sample_rate,
+            sample_rate=sample_rate,
+        )
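A usage sketch of the decoder above, assuming a local file at the placeholder path "audio.mp3":

from torchcodec.decoders import AudioDecoder

decoder = AudioDecoder("audio.mp3", sample_rate=16000)
samples = decoder.get_samples_played_in_range(start_seconds=1.0, stop_seconds=3.0)
print(samples.data.shape)   # (num_channels, num_samples), float in [-1, 1]
print(samples.pts_seconds)  # pts of the first returned sample (1.0 here, barring edge cases)
print(samples.sample_rate)  # 16000, the requested output rate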