torchcodec 0.10.0__cp312-cp312-manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- torchcodec/__init__.py +27 -0
- torchcodec/_core/AVIOContextHolder.cpp +60 -0
- torchcodec/_core/AVIOContextHolder.h +64 -0
- torchcodec/_core/AVIOFileLikeContext.cpp +98 -0
- torchcodec/_core/AVIOFileLikeContext.h +55 -0
- torchcodec/_core/AVIOTensorContext.cpp +130 -0
- torchcodec/_core/AVIOTensorContext.h +44 -0
- torchcodec/_core/BetaCudaDeviceInterface.cpp +849 -0
- torchcodec/_core/BetaCudaDeviceInterface.h +196 -0
- torchcodec/_core/CMakeLists.txt +295 -0
- torchcodec/_core/CUDACommon.cpp +330 -0
- torchcodec/_core/CUDACommon.h +51 -0
- torchcodec/_core/Cache.h +124 -0
- torchcodec/_core/CpuDeviceInterface.cpp +509 -0
- torchcodec/_core/CpuDeviceInterface.h +141 -0
- torchcodec/_core/CudaDeviceInterface.cpp +602 -0
- torchcodec/_core/CudaDeviceInterface.h +79 -0
- torchcodec/_core/DeviceInterface.cpp +117 -0
- torchcodec/_core/DeviceInterface.h +191 -0
- torchcodec/_core/Encoder.cpp +1054 -0
- torchcodec/_core/Encoder.h +192 -0
- torchcodec/_core/FFMPEGCommon.cpp +684 -0
- torchcodec/_core/FFMPEGCommon.h +314 -0
- torchcodec/_core/FilterGraph.cpp +159 -0
- torchcodec/_core/FilterGraph.h +59 -0
- torchcodec/_core/Frame.cpp +47 -0
- torchcodec/_core/Frame.h +72 -0
- torchcodec/_core/Metadata.cpp +124 -0
- torchcodec/_core/Metadata.h +92 -0
- torchcodec/_core/NVCUVIDRuntimeLoader.cpp +320 -0
- torchcodec/_core/NVCUVIDRuntimeLoader.h +14 -0
- torchcodec/_core/NVDECCache.cpp +60 -0
- torchcodec/_core/NVDECCache.h +102 -0
- torchcodec/_core/SingleStreamDecoder.cpp +1586 -0
- torchcodec/_core/SingleStreamDecoder.h +391 -0
- torchcodec/_core/StreamOptions.h +70 -0
- torchcodec/_core/Transform.cpp +128 -0
- torchcodec/_core/Transform.h +86 -0
- torchcodec/_core/ValidationUtils.cpp +35 -0
- torchcodec/_core/ValidationUtils.h +21 -0
- torchcodec/_core/__init__.py +46 -0
- torchcodec/_core/_metadata.py +262 -0
- torchcodec/_core/custom_ops.cpp +1090 -0
- torchcodec/_core/fetch_and_expose_non_gpl_ffmpeg_libs.cmake +169 -0
- torchcodec/_core/nvcuvid_include/cuviddec.h +1374 -0
- torchcodec/_core/nvcuvid_include/nvcuvid.h +610 -0
- torchcodec/_core/ops.py +605 -0
- torchcodec/_core/pybind_ops.cpp +50 -0
- torchcodec/_frame.py +146 -0
- torchcodec/_internally_replaced_utils.py +68 -0
- torchcodec/_samplers/__init__.py +7 -0
- torchcodec/_samplers/video_clip_sampler.py +419 -0
- torchcodec/decoders/__init__.py +12 -0
- torchcodec/decoders/_audio_decoder.py +185 -0
- torchcodec/decoders/_decoder_utils.py +113 -0
- torchcodec/decoders/_video_decoder.py +601 -0
- torchcodec/encoders/__init__.py +2 -0
- torchcodec/encoders/_audio_encoder.py +149 -0
- torchcodec/encoders/_video_encoder.py +196 -0
- torchcodec/libtorchcodec_core4.so +0 -0
- torchcodec/libtorchcodec_core5.so +0 -0
- torchcodec/libtorchcodec_core6.so +0 -0
- torchcodec/libtorchcodec_core7.so +0 -0
- torchcodec/libtorchcodec_core8.so +0 -0
- torchcodec/libtorchcodec_custom_ops4.so +0 -0
- torchcodec/libtorchcodec_custom_ops5.so +0 -0
- torchcodec/libtorchcodec_custom_ops6.so +0 -0
- torchcodec/libtorchcodec_custom_ops7.so +0 -0
- torchcodec/libtorchcodec_custom_ops8.so +0 -0
- torchcodec/libtorchcodec_pybind_ops4.so +0 -0
- torchcodec/libtorchcodec_pybind_ops5.so +0 -0
- torchcodec/libtorchcodec_pybind_ops6.so +0 -0
- torchcodec/libtorchcodec_pybind_ops7.so +0 -0
- torchcodec/libtorchcodec_pybind_ops8.so +0 -0
- torchcodec/samplers/__init__.py +2 -0
- torchcodec/samplers/_common.py +84 -0
- torchcodec/samplers/_index_based.py +287 -0
- torchcodec/samplers/_time_based.py +358 -0
- torchcodec/share/cmake/TorchCodec/TorchCodecConfig.cmake +76 -0
- torchcodec/share/cmake/TorchCodec/ffmpeg_versions.cmake +122 -0
- torchcodec/transforms/__init__.py +12 -0
- torchcodec/transforms/_decoder_transforms.py +375 -0
- torchcodec/version.py +2 -0
- torchcodec-0.10.0.dist-info/METADATA +286 -0
- torchcodec-0.10.0.dist-info/RECORD +88 -0
- torchcodec-0.10.0.dist-info/WHEEL +5 -0
- torchcodec-0.10.0.dist-info/licenses/LICENSE +28 -0
- torchcodec-0.10.0.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,601 @@
|
|
|
1
|
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
#
|
|
4
|
+
# This source code is licensed under the BSD-style license found in the
|
|
5
|
+
# LICENSE file in the root directory of this source tree.
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
import io
|
|
9
|
+
import json
|
|
10
|
+
import numbers
|
|
11
|
+
from collections.abc import Sequence
|
|
12
|
+
from dataclasses import dataclass, field
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import Literal
|
|
15
|
+
|
|
16
|
+
import torch
|
|
17
|
+
from torch import device as torch_device, nn, Tensor
|
|
18
|
+
|
|
19
|
+
from torchcodec import _core as core, Frame, FrameBatch
|
|
20
|
+
from torchcodec.decoders._decoder_utils import (
|
|
21
|
+
_get_cuda_backend,
|
|
22
|
+
create_decoder,
|
|
23
|
+
ERROR_REPORTING_INSTRUCTIONS,
|
|
24
|
+
)
|
|
25
|
+
from torchcodec.transforms import DecoderTransform
|
|
26
|
+
from torchcodec.transforms._decoder_transforms import _make_transform_specs
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
class CpuFallbackStatus:
    """Information about CPU fallback status.

    This class tracks whether the decoder fell back to CPU decoding.
    Users should not instantiate this class directly; instead, access it
    via the :attr:`VideoDecoder.cpu_fallback` attribute.

    Usage:

    - Use ``str(cpu_fallback_status)`` or ``print(cpu_fallback_status)`` to see the cpu fallback status
    - Use ``if cpu_fallback_status:`` to check if any fallback occurred
    """

    # Whether the fallback status has been determined. For the Beta CUDA
    # backend (see set_cuda_backend), this is True right after decoder
    # creation; for the FFmpeg CUDA backend it only becomes True after the
    # first frame has been decoded.
    status_known: bool = False
    # Internal flags set by VideoDecoder.cpu_fallback; not part of __init__.
    _nvcuvid_unavailable: bool = field(default=False, init=False)
    _video_not_supported: bool = field(default=False, init=False)
    _is_fallback: bool = field(default=False, init=False)
    _backend: str = field(default="", init=False)

    def __bool__(self):
        """True when a CPU fallback is known to have occurred."""
        return self.status_known and self._is_fallback

    def __str__(self):
        """Human-readable description of the CPU fallback status."""
        prefix = f"[{self._backend}] Fallback status: "
        if not self.status_known:
            return prefix + "Unknown"

        # At most one reason applies; the checks are mutually exclusive.
        if self._nvcuvid_unavailable:
            reason = "NVcuvid unavailable"
        elif self._video_not_supported:
            reason = "Video not supported"
        elif self._is_fallback:
            reason = "Unknown reason - try the Beta interface to know more!"
        else:
            reason = None

        if reason is None:
            return prefix + "No fallback required"
        return prefix + "Falling back due to: " + reason
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class VideoDecoder:
    """A single-stream video decoder.

    Args:
        source (str, ``Pathlib.path``, bytes, ``torch.Tensor`` or file-like object): The source of the video:

            - If ``str``: a local path or a URL to a video file.
            - If ``Pathlib.path``: a path to a local video file.
            - If ``bytes`` object or ``torch.Tensor``: the raw encoded video data.
            - If file-like object: we read video data from the object on demand. The object must
              expose the methods `read(self, size: int) -> bytes` and
              `seek(self, offset: int, whence: int) -> int`. Read more in:
              :ref:`sphx_glr_generated_examples_decoding_file_like.py`.
        stream_index (int, optional): Specifies which stream in the video to decode frames from.
            Note that this index is absolute across all media types. If left unspecified, then
            the :term:`best stream` is used.
        dimension_order(str, optional): The dimension order of the decoded frames.
            This can be either "NCHW" (default) or "NHWC", where N is the batch
            size, C is the number of channels, H is the height, and W is the
            width of the frames.

            .. note::

                Frames are natively decoded in NHWC format by the underlying
                FFmpeg implementation. Converting those into NCHW format is a
                cheap no-copy operation that allows these frames to be
                transformed using the `torchvision transforms
                <https://pytorch.org/vision/stable/transforms.html>`_.
        num_ffmpeg_threads (int, optional): The number of threads to use for decoding.
            Use 1 for single-threaded decoding which may be best if you are running multiple
            instances of ``VideoDecoder`` in parallel. Use a higher number for multi-threaded
            decoding which is best if you are running a single instance of ``VideoDecoder``.
            Passing 0 lets FFmpeg decide on the number of threads.
            Default: 1.
        device (str or torch.device, optional): The device to use for decoding.
            If ``None`` (default), uses the current default device.
            If you pass a CUDA device, we recommend trying the "beta" CUDA
            backend which is faster! See :func:`~torchcodec.decoders.set_cuda_backend`.
        seek_mode (str, optional): Determines if frame access will be "exact" or
            "approximate". Exact guarantees that requesting frame i will always
            return frame i, but doing so requires an initial :term:`scan` of the
            file. Approximate is faster as it avoids scanning the file, but less
            accurate as it uses the file's metadata to calculate where i
            probably is. Default: "exact".
            Read more about this parameter in:
            :ref:`sphx_glr_generated_examples_decoding_approximate_mode.py`
        transforms (sequence of transform objects, optional): Sequence of transforms to be
            applied to the decoded frames by the decoder itself, in order. Accepts both
            :class:`~torchcodec.transforms.DecoderTransform` and
            :class:`~torchvision.transforms.v2.Transform`
            objects. Read more about this parameter in: TODO_DECODER_TRANSFORMS_TUTORIAL.
        custom_frame_mappings (str, bytes, or file-like object, optional):
            Mapping of frames to their metadata, typically generated via ffprobe.
            This enables accurate frame seeking without requiring a full video scan.
            Do not set seek_mode when custom_frame_mappings is provided.
            Expected JSON format:

            .. code-block:: json

                {
                    "frames": [
                        {
                            "pts": 0,
                            "duration": 1001,
                            "key_frame": 1
                        }
                    ]
                }

            Alternative field names "pkt_pts" and "pkt_duration" are also supported.
            Read more about this parameter in:
            :ref:`sphx_glr_generated_examples_decoding_custom_frame_mappings.py`

    Attributes:
        metadata (VideoStreamMetadata): Metadata of the video stream.
        stream_index (int): The stream index that this decoder is retrieving frames from. If a
            stream index was provided at initialization, this is the same value. If it was left
            unspecified, this is the :term:`best stream`.
        cpu_fallback (CpuFallbackStatus): Information about whether the decoder fell back to CPU
            decoding. Use ``bool(cpu_fallback)`` to check if fallback occurred, or
            ``str(cpu_fallback)`` to get a human-readable status message. The status is only
            determined after at least one frame has been decoded.
    """

    def __init__(
        self,
        source: str | Path | io.RawIOBase | io.BufferedReader | bytes | Tensor,
        *,
        stream_index: int | None = None,
        dimension_order: Literal["NCHW", "NHWC"] = "NCHW",
        num_ffmpeg_threads: int = 1,
        device: str | torch_device | None = None,
        seek_mode: Literal["exact", "approximate"] = "exact",
        transforms: Sequence[DecoderTransform | nn.Module] | None = None,
        custom_frame_mappings: (
            str | bytes | io.RawIOBase | io.BufferedReader | None
        ) = None,
    ):
        # Internal PyTorch API-usage telemetry; logged at most once per process.
        torch._C._log_api_usage_once("torchcodec.decoders.VideoDecoder")
        allowed_seek_modes = ("exact", "approximate")
        if seek_mode not in allowed_seek_modes:
            raise ValueError(
                f"Invalid seek mode ({seek_mode}). "
                f"Supported values are {', '.join(allowed_seek_modes)}."
            )

        # Validate seek_mode and custom_frame_mappings are not mismatched.
        # "exact" (the default) is fine: it is silently upgraded below.
        if custom_frame_mappings is not None and seek_mode == "approximate":
            raise ValueError(
                "custom_frame_mappings is incompatible with seek_mode='approximate'. "
                "Use seek_mode='custom_frame_mappings' or leave it unspecified to automatically use custom frame mappings."
            )

        # Auto-select custom_frame_mappings seek_mode and process data when mappings are provided
        custom_frame_mappings_data = None
        if custom_frame_mappings is not None:
            # "custom_frame_mappings" is an internal-only seek mode, hence not
            # part of the public Literal annotation.
            seek_mode = "custom_frame_mappings"  # type: ignore[assignment]
            custom_frame_mappings_data = _read_custom_frame_mappings(
                custom_frame_mappings
            )

        self._decoder = create_decoder(source=source, seek_mode=seek_mode)

        (
            self.metadata,
            self.stream_index,
            self._begin_stream_seconds,
            self._end_stream_seconds,
            self._num_frames,
        ) = _get_and_validate_stream_metadata(
            decoder=self._decoder, stream_index=stream_index
        )

        allowed_dimension_orders = ("NCHW", "NHWC")
        if dimension_order not in allowed_dimension_orders:
            raise ValueError(
                f"Invalid dimension order ({dimension_order}). "
                f"Supported values are {', '.join(allowed_dimension_orders)}."
            )

        # Guard against the common mistake of passing None ("let the library
        # decide"); 0 is the value that delegates the choice to FFmpeg.
        if num_ffmpeg_threads is None:
            raise ValueError(f"{num_ffmpeg_threads = } should be an int.")

        # The core ops take the device as a string; normalize both the
        # default-device case and torch.device instances.
        if device is None:
            device = str(torch.get_default_device())
        elif isinstance(device, torch_device):
            device = str(device)

        device_variant = _get_cuda_backend()
        transform_specs = _make_transform_specs(
            transforms,
            input_dims=(self.metadata.height, self.metadata.width),
        )

        core.add_video_stream(
            self._decoder,
            stream_index=self.stream_index,
            dimension_order=dimension_order,
            num_threads=num_ffmpeg_threads,
            device=device,
            device_variant=device_variant,
            transform_specs=transform_specs,
            custom_frame_mappings=custom_frame_mappings_data,
        )

        # Record which backend is in use so the cpu_fallback property can
        # report a meaningful status later.
        self._cpu_fallback = CpuFallbackStatus()
        if device.startswith("cuda"):
            if device_variant == "beta":
                self._cpu_fallback._backend = "Beta CUDA"
            else:
                self._cpu_fallback._backend = "FFmpeg CUDA"
        else:
            self._cpu_fallback._backend = "CPU"

    def __len__(self) -> int:
        """Number of frames in the stream, as validated at construction."""
        return self._num_frames

    @property
    def cpu_fallback(self) -> CpuFallbackStatus:
        # We only query the CPU fallback info if status is unknown. That happens
        # either when:
        # - this @property has never been called before
        # - no frame has been decoded yet on the FFmpeg interface.
        # Note that for the beta interface, we're able to know the fallback status
        # right when the VideoDecoder is instantiated, but the status_known
        # attribute is initialized to False.
        if not self._cpu_fallback.status_known:
            # backend_details is a free-form string produced by the core; the
            # substring checks below are the agreed-upon protocol with it.
            backend_details = core._get_backend_details(self._decoder)

            if "status unknown" not in backend_details:
                self._cpu_fallback.status_known = True

                if "CPU fallback" in backend_details:
                    self._cpu_fallback._is_fallback = True
                    if self._cpu_fallback._backend == "Beta CUDA":
                        # Only the beta interface can provide details.
                        # if it's not that nvcuvid is missing, it must be video-specific
                        if "NVCUVID not available" in backend_details:
                            self._cpu_fallback._nvcuvid_unavailable = True
                        else:
                            self._cpu_fallback._video_not_supported = True

        return self._cpu_fallback

    def _getitem_int(self, key: int) -> Tensor:
        # Internal helper for __getitem__: single-frame tensor access.
        assert isinstance(key, int)

        # Only the data tensor is needed; pts/duration are discarded.
        frame_data, *_ = core.get_frame_at_index(self._decoder, frame_index=key)
        return frame_data

    def _getitem_slice(self, key: slice) -> Tensor:
        # Internal helper for __getitem__: slice access with Python slice
        # semantics (negative indices, clamping) via slice.indices().
        assert isinstance(key, slice)

        start, stop, step = key.indices(len(self))
        frame_data, *_ = core.get_frames_in_range(
            self._decoder,
            start=start,
            stop=stop,
            step=step,
        )
        return frame_data

    def __getitem__(self, key: numbers.Integral | slice) -> Tensor:
        """Return frame or frames as tensors, at the given index or range.

        .. note::

            If you need to decode multiple frames, we recommend using the batch
            methods instead, since they are faster:
            :meth:`~torchcodec.decoders.VideoDecoder.get_frames_at`,
            :meth:`~torchcodec.decoders.VideoDecoder.get_frames_in_range`,
            :meth:`~torchcodec.decoders.VideoDecoder.get_frames_played_at`, and
            :meth:`~torchcodec.decoders.VideoDecoder.get_frames_played_in_range`.

        Args:
            key(int or slice): The index or range of frame(s) to retrieve.

        Returns:
            torch.Tensor: The frame or frames at the given index or range.
        """
        # numbers.Integral also accepts numpy integer scalars etc.; normalize
        # to a plain int before dispatching.
        if isinstance(key, numbers.Integral):
            return self._getitem_int(int(key))
        elif isinstance(key, slice):
            return self._getitem_slice(key)

        raise TypeError(
            f"Unsupported key type: {type(key)}. Supported types are int and slice."
        )

    def _get_key_frame_indices(self) -> list[int]:
        # Internal: indices of key frames, as reported by the core decoder.
        return core._get_key_frame_indices(self._decoder)

    def get_frame_at(self, index: int) -> Frame:
        """Return a single frame at the given index.

        .. note::

            If you need to decode multiple frames, we recommend using the batch
            methods instead, since they are faster:
            :meth:`~torchcodec.decoders.VideoDecoder.get_frames_at`,
            :meth:`~torchcodec.decoders.VideoDecoder.get_frames_in_range`,
            :meth:`~torchcodec.decoders.VideoDecoder.get_frames_played_at`,
            :meth:`~torchcodec.decoders.VideoDecoder.get_frames_played_in_range`.

        Args:
            index (int): The index of the frame to retrieve.

        Returns:
            Frame: The frame at the given index.
        """
        data, pts_seconds, duration_seconds = core.get_frame_at_index(
            self._decoder, frame_index=index
        )
        # pts/duration come back as 0-dim tensors; .item() converts to float.
        return Frame(
            data=data,
            pts_seconds=pts_seconds.item(),
            duration_seconds=duration_seconds.item(),
        )

    def get_frames_at(self, indices: torch.Tensor | list[int]) -> FrameBatch:
        """Return frames at the given indices.

        Args:
            indices (torch.Tensor or list of int): The indices of the frames to retrieve.

        Returns:
            FrameBatch: The frames at the given indices.
        """

        data, pts_seconds, duration_seconds = core.get_frames_at_indices(
            self._decoder, frame_indices=indices
        )

        return FrameBatch(
            data=data,
            pts_seconds=pts_seconds,
            duration_seconds=duration_seconds,
        )

    def get_frames_in_range(self, start: int, stop: int, step: int = 1) -> FrameBatch:
        """Return multiple frames at the given index range.

        Frames are in [start, stop).

        Args:
            start (int): Index of the first frame to retrieve.
            stop (int): End of indexing range (exclusive, as per Python
                conventions).
            step (int, optional): Step size between frames. Default: 1.

        Returns:
            FrameBatch: The frames within the specified range.
        """
        # Adjust start / stop indices to enable indexing semantics, ex. [-10, 1000] returns the last 10 frames
        start, stop, step = slice(start, stop, step).indices(self._num_frames)
        frames = core.get_frames_in_range(
            self._decoder,
            start=start,
            stop=stop,
            step=step,
        )
        # core returns (data, pts, duration) in FrameBatch field order.
        return FrameBatch(*frames)

    def get_frame_played_at(self, seconds: float) -> Frame:
        """Return a single frame played at the given timestamp in seconds.

        .. note::

            If you need to decode multiple frames, we recommend using the batch
            methods instead, since they are faster:
            :meth:`~torchcodec.decoders.VideoDecoder.get_frames_at`,
            :meth:`~torchcodec.decoders.VideoDecoder.get_frames_in_range`,
            :meth:`~torchcodec.decoders.VideoDecoder.get_frames_played_at`,
            :meth:`~torchcodec.decoders.VideoDecoder.get_frames_played_in_range`.

        Args:
            seconds (float): The time stamp in seconds when the frame is played.

        Returns:
            Frame: The frame that is played at ``seconds``.

        Raises:
            IndexError: If ``seconds`` is outside the half-open range
                [begin_stream_seconds, end_stream_seconds).
        """
        if not self._begin_stream_seconds <= seconds < self._end_stream_seconds:
            raise IndexError(
                f"Invalid pts in seconds: {seconds}. "
                f"It must be greater than or equal to {self._begin_stream_seconds} "
                f"and less than {self._end_stream_seconds}."
            )
        data, pts_seconds, duration_seconds = core.get_frame_at_pts(
            self._decoder, seconds
        )
        return Frame(
            data=data,
            pts_seconds=pts_seconds.item(),
            duration_seconds=duration_seconds.item(),
        )

    def get_frames_played_at(self, seconds: torch.Tensor | list[float]) -> FrameBatch:
        """Return frames played at the given timestamps in seconds.

        Args:
            seconds (torch.Tensor or list of float): The timestamps in seconds when the frames are played.

        Returns:
            FrameBatch: The frames that are played at ``seconds``.
        """

        data, pts_seconds, duration_seconds = core.get_frames_by_pts(
            self._decoder, timestamps=seconds
        )
        return FrameBatch(
            data=data,
            pts_seconds=pts_seconds,
            duration_seconds=duration_seconds,
        )

    def get_frames_played_in_range(
        self, start_seconds: float, stop_seconds: float
    ) -> FrameBatch:
        """Returns multiple frames in the given range.

        Frames are in the half open range [start_seconds, stop_seconds). Each
        returned frame's :term:`pts`, in seconds, is inside of the half open
        range.

        Args:
            start_seconds (float): Time, in seconds, of the start of the
                range.
            stop_seconds (float): Time, in seconds, of the end of the
                range. As a half open range, the end is excluded.

        Returns:
            FrameBatch: The frames within the specified range.

        Raises:
            ValueError: If the range is inverted or falls outside the stream
                bounds.
        """
        if not start_seconds <= stop_seconds:
            raise ValueError(
                f"Invalid start seconds: {start_seconds}. It must be less than or equal to stop seconds ({stop_seconds})."
            )
        # NOTE(review): the check below is strict (`< end_stream_seconds`) but
        # the message says "less than or equal to" — consider aligning them.
        if not self._begin_stream_seconds <= start_seconds < self._end_stream_seconds:
            raise ValueError(
                f"Invalid start seconds: {start_seconds}. "
                f"It must be greater than or equal to {self._begin_stream_seconds} "
                f"and less than or equal to {self._end_stream_seconds}."
            )
        if not stop_seconds <= self._end_stream_seconds:
            raise ValueError(
                f"Invalid stop seconds: {stop_seconds}. "
                f"It must be less than or equal to {self._end_stream_seconds}."
            )
        frames = core.get_frames_by_pts_in_range(
            self._decoder,
            start_seconds=start_seconds,
            stop_seconds=stop_seconds,
        )
        return FrameBatch(*frames)
|
|
493
|
+
|
|
494
|
+
|
|
495
|
+
def _get_and_validate_stream_metadata(
    *,
    decoder: Tensor,
    stream_index: int | None = None,
) -> tuple[core._metadata.VideoStreamMetadata, int, float, float, int]:
    """Resolve the target video stream and validate its required metadata.

    Args:
        decoder: The core decoder tensor handle.
        stream_index: Explicit stream index, or None to use the container's
            best video stream.

    Returns:
        (metadata, stream_index, begin_stream_seconds, end_stream_seconds,
        num_frames).

    Raises:
        ValueError: If the stream cannot be resolved, is not a video stream,
            or any required metadata field is missing.
    """
    container_metadata = core.get_container_metadata(decoder)

    if stream_index is None:
        stream_index = container_metadata.best_video_stream_index
        if stream_index is None:
            raise ValueError(
                "The best video stream is unknown and there is no specified stream. "
                + ERROR_REPORTING_INSTRUCTIONS
            )

    if stream_index >= len(container_metadata.streams):
        raise ValueError(f"The stream index {stream_index} is not a valid stream.")

    metadata = container_metadata.streams[stream_index]
    if not isinstance(metadata, core._metadata.VideoStreamMetadata):
        raise ValueError(f"The stream at index {stream_index} is not a video stream. ")

    def _require(value, message):
        # These fields are Optional in the metadata model but are mandatory
        # for the decoder to operate; fail loudly when missing.
        if value is None:
            raise ValueError(message + ERROR_REPORTING_INSTRUCTIONS)
        return value

    begin_stream_seconds = _require(
        metadata.begin_stream_seconds,
        "The minimum pts value in seconds is unknown. ",
    )
    end_stream_seconds = _require(
        metadata.end_stream_seconds,
        "The maximum pts value in seconds is unknown. ",
    )
    num_frames = _require(
        metadata.num_frames,
        "The number of frames is unknown. ",
    )

    return (
        metadata,
        stream_index,
        begin_stream_seconds,
        end_stream_seconds,
        num_frames,
    )
|
|
544
|
+
|
|
545
|
+
|
|
546
|
+
def _read_custom_frame_mappings(
|
|
547
|
+
custom_frame_mappings: str | bytes | io.RawIOBase | io.BufferedReader,
|
|
548
|
+
) -> tuple[Tensor, Tensor, Tensor]:
|
|
549
|
+
"""Parse custom frame mappings from JSON data and extract frame metadata.
|
|
550
|
+
|
|
551
|
+
Args:
|
|
552
|
+
custom_frame_mappings: JSON data containing frame metadata, provided as:
|
|
553
|
+
- A JSON string (str, bytes)
|
|
554
|
+
- A file-like object with a read() method
|
|
555
|
+
|
|
556
|
+
Returns:
|
|
557
|
+
A tuple of three tensors:
|
|
558
|
+
- all_frames (Tensor): Presentation timestamps (PTS) for each frame
|
|
559
|
+
- is_key_frame (Tensor): Boolean tensor indicating which frames are key frames
|
|
560
|
+
- duration (Tensor): Duration of each frame
|
|
561
|
+
"""
|
|
562
|
+
try:
|
|
563
|
+
input_data = (
|
|
564
|
+
json.load(custom_frame_mappings)
|
|
565
|
+
if hasattr(custom_frame_mappings, "read")
|
|
566
|
+
else json.loads(custom_frame_mappings)
|
|
567
|
+
)
|
|
568
|
+
except json.JSONDecodeError as e:
|
|
569
|
+
raise ValueError(
|
|
570
|
+
f"Invalid custom frame mappings: {e}. It should be a valid JSON string or a file-like object."
|
|
571
|
+
) from e
|
|
572
|
+
|
|
573
|
+
if not input_data or "frames" not in input_data:
|
|
574
|
+
raise ValueError(
|
|
575
|
+
"Invalid custom frame mappings. The input is empty or missing the required 'frames' key."
|
|
576
|
+
)
|
|
577
|
+
|
|
578
|
+
first_frame = input_data["frames"][0]
|
|
579
|
+
pts_key = next((key for key in ("pts", "pkt_pts") if key in first_frame), None)
|
|
580
|
+
duration_key = next(
|
|
581
|
+
(key for key in ("duration", "pkt_duration") if key in first_frame), None
|
|
582
|
+
)
|
|
583
|
+
key_frame_present = "key_frame" in first_frame
|
|
584
|
+
|
|
585
|
+
if not pts_key or not duration_key or not key_frame_present:
|
|
586
|
+
raise ValueError(
|
|
587
|
+
"Invalid custom frame mappings. The 'pts'/'pkt_pts', 'duration'/'pkt_duration', and 'key_frame' keys are required in the frame metadata."
|
|
588
|
+
)
|
|
589
|
+
|
|
590
|
+
all_frames = torch.tensor(
|
|
591
|
+
[int(frame[pts_key]) for frame in input_data["frames"]], dtype=torch.int64
|
|
592
|
+
)
|
|
593
|
+
is_key_frame = torch.tensor(
|
|
594
|
+
[int(frame["key_frame"]) for frame in input_data["frames"]], dtype=torch.bool
|
|
595
|
+
)
|
|
596
|
+
duration = torch.tensor(
|
|
597
|
+
[int(frame[duration_key]) for frame in input_data["frames"]], dtype=torch.int64
|
|
598
|
+
)
|
|
599
|
+
if not (len(all_frames) == len(is_key_frame) == len(duration)):
|
|
600
|
+
raise ValueError("Mismatched lengths in frame index data")
|
|
601
|
+
return all_frames, is_key_frame, duration
|