torchcodec 0.10.0__cp312-cp312-manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- torchcodec/__init__.py +27 -0
- torchcodec/_core/AVIOContextHolder.cpp +60 -0
- torchcodec/_core/AVIOContextHolder.h +64 -0
- torchcodec/_core/AVIOFileLikeContext.cpp +98 -0
- torchcodec/_core/AVIOFileLikeContext.h +55 -0
- torchcodec/_core/AVIOTensorContext.cpp +130 -0
- torchcodec/_core/AVIOTensorContext.h +44 -0
- torchcodec/_core/BetaCudaDeviceInterface.cpp +849 -0
- torchcodec/_core/BetaCudaDeviceInterface.h +196 -0
- torchcodec/_core/CMakeLists.txt +295 -0
- torchcodec/_core/CUDACommon.cpp +330 -0
- torchcodec/_core/CUDACommon.h +51 -0
- torchcodec/_core/Cache.h +124 -0
- torchcodec/_core/CpuDeviceInterface.cpp +509 -0
- torchcodec/_core/CpuDeviceInterface.h +141 -0
- torchcodec/_core/CudaDeviceInterface.cpp +602 -0
- torchcodec/_core/CudaDeviceInterface.h +79 -0
- torchcodec/_core/DeviceInterface.cpp +117 -0
- torchcodec/_core/DeviceInterface.h +191 -0
- torchcodec/_core/Encoder.cpp +1054 -0
- torchcodec/_core/Encoder.h +192 -0
- torchcodec/_core/FFMPEGCommon.cpp +684 -0
- torchcodec/_core/FFMPEGCommon.h +314 -0
- torchcodec/_core/FilterGraph.cpp +159 -0
- torchcodec/_core/FilterGraph.h +59 -0
- torchcodec/_core/Frame.cpp +47 -0
- torchcodec/_core/Frame.h +72 -0
- torchcodec/_core/Metadata.cpp +124 -0
- torchcodec/_core/Metadata.h +92 -0
- torchcodec/_core/NVCUVIDRuntimeLoader.cpp +320 -0
- torchcodec/_core/NVCUVIDRuntimeLoader.h +14 -0
- torchcodec/_core/NVDECCache.cpp +60 -0
- torchcodec/_core/NVDECCache.h +102 -0
- torchcodec/_core/SingleStreamDecoder.cpp +1586 -0
- torchcodec/_core/SingleStreamDecoder.h +391 -0
- torchcodec/_core/StreamOptions.h +70 -0
- torchcodec/_core/Transform.cpp +128 -0
- torchcodec/_core/Transform.h +86 -0
- torchcodec/_core/ValidationUtils.cpp +35 -0
- torchcodec/_core/ValidationUtils.h +21 -0
- torchcodec/_core/__init__.py +46 -0
- torchcodec/_core/_metadata.py +262 -0
- torchcodec/_core/custom_ops.cpp +1090 -0
- torchcodec/_core/fetch_and_expose_non_gpl_ffmpeg_libs.cmake +169 -0
- torchcodec/_core/nvcuvid_include/cuviddec.h +1374 -0
- torchcodec/_core/nvcuvid_include/nvcuvid.h +610 -0
- torchcodec/_core/ops.py +605 -0
- torchcodec/_core/pybind_ops.cpp +50 -0
- torchcodec/_frame.py +146 -0
- torchcodec/_internally_replaced_utils.py +68 -0
- torchcodec/_samplers/__init__.py +7 -0
- torchcodec/_samplers/video_clip_sampler.py +419 -0
- torchcodec/decoders/__init__.py +12 -0
- torchcodec/decoders/_audio_decoder.py +185 -0
- torchcodec/decoders/_decoder_utils.py +113 -0
- torchcodec/decoders/_video_decoder.py +601 -0
- torchcodec/encoders/__init__.py +2 -0
- torchcodec/encoders/_audio_encoder.py +149 -0
- torchcodec/encoders/_video_encoder.py +196 -0
- torchcodec/libtorchcodec_core4.so +0 -0
- torchcodec/libtorchcodec_core5.so +0 -0
- torchcodec/libtorchcodec_core6.so +0 -0
- torchcodec/libtorchcodec_core7.so +0 -0
- torchcodec/libtorchcodec_core8.so +0 -0
- torchcodec/libtorchcodec_custom_ops4.so +0 -0
- torchcodec/libtorchcodec_custom_ops5.so +0 -0
- torchcodec/libtorchcodec_custom_ops6.so +0 -0
- torchcodec/libtorchcodec_custom_ops7.so +0 -0
- torchcodec/libtorchcodec_custom_ops8.so +0 -0
- torchcodec/libtorchcodec_pybind_ops4.so +0 -0
- torchcodec/libtorchcodec_pybind_ops5.so +0 -0
- torchcodec/libtorchcodec_pybind_ops6.so +0 -0
- torchcodec/libtorchcodec_pybind_ops7.so +0 -0
- torchcodec/libtorchcodec_pybind_ops8.so +0 -0
- torchcodec/samplers/__init__.py +2 -0
- torchcodec/samplers/_common.py +84 -0
- torchcodec/samplers/_index_based.py +287 -0
- torchcodec/samplers/_time_based.py +358 -0
- torchcodec/share/cmake/TorchCodec/TorchCodecConfig.cmake +76 -0
- torchcodec/share/cmake/TorchCodec/ffmpeg_versions.cmake +122 -0
- torchcodec/transforms/__init__.py +12 -0
- torchcodec/transforms/_decoder_transforms.py +375 -0
- torchcodec/version.py +2 -0
- torchcodec-0.10.0.dist-info/METADATA +286 -0
- torchcodec-0.10.0.dist-info/RECORD +88 -0
- torchcodec-0.10.0.dist-info/WHEEL +5 -0
- torchcodec-0.10.0.dist-info/licenses/LICENSE +28 -0
- torchcodec-0.10.0.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
#
|
|
4
|
+
# This source code is licensed under the BSD-style license found in the
|
|
5
|
+
# LICENSE file in the root directory of this source tree.
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
import io
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
import torch
|
|
12
|
+
from torch import Tensor
|
|
13
|
+
|
|
14
|
+
from torchcodec import _core as core, AudioSamples
|
|
15
|
+
from torchcodec.decoders._decoder_utils import (
|
|
16
|
+
create_decoder,
|
|
17
|
+
ERROR_REPORTING_INSTRUCTIONS,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class AudioDecoder:
    """A single-stream audio decoder.

    This can be used to decode audio from pure audio files (e.g. mp3, wav,
    etc.), or from videos that contain audio streams (e.g. mp4 videos).

    Returned samples are float samples normalized in [-1, 1].

    Args:
        source (str, ``pathlib.Path``, bytes, ``torch.Tensor`` or file-like
            object): The source of the video or audio:

            - If ``str``: a local path or a URL to a video or audio file.
            - If ``pathlib.Path``: a path to a local video or audio file.
            - If ``bytes`` object or ``torch.Tensor``: the raw encoded audio data.
            - If file-like object: we read the encoded data from the object on
              demand. The object must expose the methods
              `read(self, size: int) -> bytes` and
              `seek(self, offset: int, whence: int) -> int`. Read more in:
              :ref:`sphx_glr_generated_examples_decoding_file_like.py`.
        stream_index (int, optional): Specifies which stream in the file to decode samples from.
            Note that this index is absolute across all media types. If left unspecified, then
            the :term:`best stream` is used. Must be non-negative.
        sample_rate (int, optional): The desired output sample rate of the decoded samples.
            By default, the sample rate of the source is used.
        num_channels (int, optional): The desired number of channels of the decoded samples.
            By default, the number of channels of the source is used.

    Attributes:
        metadata (AudioStreamMetadata): Metadata of the audio stream.
        stream_index (int): The stream index that this decoder is retrieving samples from. If a
            stream index was provided at initialization, this is the same value. If it was left
            unspecified, this is the :term:`best stream`.

    Raises:
        ValueError: If no audio stream can be determined, if the stream index
            is out of range, or if the selected stream is not an audio stream.
    """

    def __init__(
        self,
        source: str | Path | io.RawIOBase | io.BufferedReader | bytes | Tensor,
        *,
        stream_index: int | None = None,
        sample_rate: int | None = None,
        num_channels: int | None = None,
    ):
        torch._C._log_api_usage_once("torchcodec.decoders.AudioDecoder")
        self._decoder = create_decoder(source=source, seek_mode="approximate")

        container_metadata = core.get_container_metadata(self._decoder)
        self.stream_index = (
            container_metadata.best_audio_stream_index
            if stream_index is None
            else stream_index
        )
        if self.stream_index is None:
            raise ValueError(
                "The best audio stream is unknown and there is no specified stream. "
                + ERROR_REPORTING_INSTRUCTIONS
            )
        # Check both bounds: a negative index would otherwise wrap around when
        # indexing `streams` below and select metadata for a stream the caller
        # never asked for.
        if not 0 <= self.stream_index < len(container_metadata.streams):
            raise ValueError(
                f"The stream at index {self.stream_index} is not a valid stream."
            )

        self.metadata = container_metadata.streams[self.stream_index]
        if not isinstance(self.metadata, core._metadata.AudioStreamMetadata):
            raise ValueError(
                f"The stream at index {self.stream_index} is not an audio stream. "
            )

        # Sample rate of the tensors we hand back: the requested one if any,
        # otherwise the source's native rate.
        self._desired_sample_rate = (
            sample_rate if sample_rate is not None else self.metadata.sample_rate
        )

        core.add_audio_stream(
            self._decoder,
            stream_index=stream_index,
            sample_rate=sample_rate,
            num_channels=num_channels,
        )

    def get_all_samples(self) -> AudioSamples:
        """Returns all the audio samples from the source.

        To decode samples in a specific range, use
        :meth:`~torchcodec.decoders.AudioDecoder.get_samples_played_in_range`.

        Returns:
            AudioSamples: The samples within the file.
        """
        return self.get_samples_played_in_range()

    def get_samples_played_in_range(
        self, start_seconds: float = 0.0, stop_seconds: float | None = None
    ) -> AudioSamples:
        """Returns audio samples in the given range.

        Samples are in the half open range [start_seconds, stop_seconds).

        To decode all the samples from beginning to end, you can call this
        method while leaving ``start_seconds`` and ``stop_seconds`` to their
        default values, or use
        :meth:`~torchcodec.decoders.AudioDecoder.get_all_samples` as a more
        convenient alias.

        Args:
            start_seconds (float): Time, in seconds, of the start of the
                range. Default: 0.
            stop_seconds (float or None): Time, in seconds, of the end of the
                range. As a half open range, the end is excluded. Default: None,
                which decodes samples until the end.

        Returns:
            AudioSamples: The samples within the specified range.

        Raises:
            ValueError: If ``start_seconds`` is greater than ``stop_seconds``.
        """
        # `not a <= b` (rather than `a > b`) also rejects NaN inputs.
        if stop_seconds is not None and not start_seconds <= stop_seconds:
            raise ValueError(
                f"Invalid start seconds: {start_seconds}. It must be less than or equal to stop seconds ({stop_seconds})."
            )
        frames, first_pts = core.get_frames_by_pts_in_range_audio(
            self._decoder,
            start_seconds=start_seconds,
            stop_seconds=stop_seconds,
        )
        first_pts = first_pts.item()

        #       x = frame boundaries
        #
        #            first_pts                                    last_pts
        #                v                                            v
        # ....x..........x..........x...........x..........x..........x.....
        #                    ^                                 ^
        #               start_seconds                      stop_seconds
        #
        # We want to return the samples in [start_seconds, stop_seconds). But
        # because the core API is based on frames, the `frames` tensor contains
        # the samples in [first_pts, last_pts)
        # So we do some basic math to figure out the position of the view that
        # we'll return.

        sample_rate = self._desired_sample_rate
        # TODO: metadata's sample_rate should probably not be Optional
        assert sample_rate is not None  # mypy.

        if first_pts < start_seconds:
            offset_beginning = round((start_seconds - first_pts) * sample_rate)
            output_pts_seconds = start_seconds
        else:
            # In normal cases we'll have first_pts <= start_pts, but in some
            # edge cases it's possible to have first_pts > start_seconds,
            # typically if the stream's first frame's pts isn't exactly 0.
            offset_beginning = 0
            output_pts_seconds = first_pts

        num_samples = frames.shape[1]
        last_pts = first_pts + num_samples / sample_rate
        if stop_seconds is not None and stop_seconds < last_pts:
            offset_end = num_samples - round((last_pts - stop_seconds) * sample_rate)
            # If stop_seconds lies before first_pts the value above is
            # negative; a negative slice bound would wrap around and return
            # unrelated samples, so clamp to get an empty view instead.
            offset_end = max(offset_end, 0)
        else:
            offset_end = num_samples

        data = frames[:, offset_beginning:offset_end]
        return AudioSamples(
            data=data,
            pts_seconds=output_pts_seconds,
            duration_seconds=data.shape[1] / sample_rate,
            sample_rate=sample_rate,
        )
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
2
|
+
# All rights reserved.
|
|
3
|
+
#
|
|
4
|
+
# This source code is licensed under the BSD-style license found in the
|
|
5
|
+
# LICENSE file in the root directory of this source tree.
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
import contextvars
|
|
9
|
+
import io
|
|
10
|
+
|
|
11
|
+
from collections.abc import Generator
|
|
12
|
+
from contextlib import contextmanager
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
from torch import Tensor
|
|
16
|
+
from torchcodec import _core as core
|
|
17
|
+
|
|
18
|
+
ERROR_REPORTING_INSTRUCTIONS = """
|
|
19
|
+
This should never happen. Please report an issue following the steps in
|
|
20
|
+
https://github.com/pytorch/torchcodec/issues/new?assignees=&labels=&projects=&template=bug-report.yml.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def create_decoder(
|
|
25
|
+
*,
|
|
26
|
+
source: str | Path | io.RawIOBase | io.BufferedReader | bytes | Tensor,
|
|
27
|
+
seek_mode: str,
|
|
28
|
+
) -> Tensor:
|
|
29
|
+
if isinstance(source, str):
|
|
30
|
+
return core.create_from_file(source, seek_mode)
|
|
31
|
+
elif isinstance(source, Path):
|
|
32
|
+
return core.create_from_file(str(source), seek_mode)
|
|
33
|
+
elif isinstance(source, io.RawIOBase) or isinstance(source, io.BufferedReader):
|
|
34
|
+
return core.create_from_file_like(source, seek_mode)
|
|
35
|
+
elif isinstance(source, bytes):
|
|
36
|
+
return core.create_from_bytes(source, seek_mode)
|
|
37
|
+
elif isinstance(source, Tensor):
|
|
38
|
+
return core.create_from_tensor(source, seek_mode)
|
|
39
|
+
elif isinstance(source, io.TextIOBase):
|
|
40
|
+
raise TypeError(
|
|
41
|
+
"source is for reading text, likely from open(..., 'r'). Try with 'rb' for binary reading?"
|
|
42
|
+
)
|
|
43
|
+
elif hasattr(source, "read") and hasattr(source, "seek"):
|
|
44
|
+
# This check must be after checking for text-based reading. Also placing
|
|
45
|
+
# it last in general to be defensive: hasattr is a blunt instrument. We
|
|
46
|
+
# could use the inspect module to check for methods with the right
|
|
47
|
+
# signature.
|
|
48
|
+
return core.create_from_file_like(source, seek_mode)
|
|
49
|
+
|
|
50
|
+
raise TypeError(
|
|
51
|
+
f"Unknown source type: {type(source)}. "
|
|
52
|
+
"Supported types are str, Path, bytes, Tensor and file-like objects with "
|
|
53
|
+
"read(self, size: int) -> bytes and "
|
|
54
|
+
"seek(self, offset: int, whence: int) -> int methods."
|
|
55
|
+
)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
# Context-local storage for the currently selected CUDA backend. Using a
# ContextVar (rather than a plain module global) keeps the value safe to use
# from multiple threads and from async code. Defaults to "ffmpeg".
_CUDA_BACKEND: contextvars.ContextVar[str] = contextvars.ContextVar(
    "_CUDA_BACKEND", default="ffmpeg"
)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
@contextmanager
def set_cuda_backend(backend: str) -> Generator[None, None, None]:
    """Context Manager to set the CUDA backend for :class:`~torchcodec.decoders.VideoDecoder`.

    Use it to choose which CUDA backend implementation is picked up by
    :class:`~torchcodec.decoders.VideoDecoder` instances that are created on
    a CUDA device while the context is active.

    .. note::
        **We recommend trying the "beta" backend instead of the default "ffmpeg"
        backend!** The beta backend is faster, and will eventually become the
        default in future versions. It may have rough edges that we'll polish
        over time, but it's already quite stable and ready for adoption. Let us
        know what you think!

    Only the creation of the decoder needs to happen inside the context
    manager; decoding methods can be called after leaving it. You still need
    to pass ``device="cuda"`` when creating the
    :class:`~torchcodec.decoders.VideoDecoder` instance. If a CUDA device isn't
    specified, this context manager has no effect. See example below.

    This is thread-safe and async-safe.

    Args:
        backend (str): The CUDA backend to use. Can be "ffmpeg" (default) or
            "beta". Matching is case-insensitive. We recommend trying "beta"
            as it's faster!

    Raises:
        ValueError: If ``backend`` is neither "ffmpeg" nor "beta".

    Example:
        >>> with set_cuda_backend("beta"):
        ...     decoder = VideoDecoder("video.mp4", device="cuda")
        ...
        ... # Only the decoder creation needs to be part of the context manager.
        ... # Decoder will now use the beta CUDA implementation:
        ... decoder.get_frame_at(0)
    """
    backend = backend.lower()
    supported_backends = {"ffmpeg", "beta"}
    if backend not in supported_backends:
        raise ValueError(
            f"Invalid CUDA backend ({backend}). Supported values are 'ffmpeg' and 'beta'."
        )

    # Keep the token so the previous value is restored on exit, even if the
    # body of the `with` block raises.
    restore_token = _CUDA_BACKEND.set(backend)
    try:
        yield
    finally:
        _CUDA_BACKEND.reset(restore_token)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _get_cuda_backend() -> str:
    """Return the CUDA backend name stored in ``_CUDA_BACKEND`` for the current context."""
    current_backend = _CUDA_BACKEND.get()
    return current_backend
|