torchcodec 0.3.0__cp313-cp313-manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of torchcodec might be problematic.
- torchcodec/__init__.py +16 -0
- torchcodec/_core/AVIOBytesContext.cpp +70 -0
- torchcodec/_core/AVIOBytesContext.h +32 -0
- torchcodec/_core/AVIOContextHolder.cpp +50 -0
- torchcodec/_core/AVIOContextHolder.h +65 -0
- torchcodec/_core/AVIOFileLikeContext.cpp +80 -0
- torchcodec/_core/AVIOFileLikeContext.h +54 -0
- torchcodec/_core/CMakeLists.txt +237 -0
- torchcodec/_core/CudaDeviceInterface.cpp +289 -0
- torchcodec/_core/CudaDeviceInterface.h +34 -0
- torchcodec/_core/DeviceInterface.cpp +88 -0
- torchcodec/_core/DeviceInterface.h +66 -0
- torchcodec/_core/Encoder.cpp +319 -0
- torchcodec/_core/Encoder.h +39 -0
- torchcodec/_core/FFMPEGCommon.cpp +264 -0
- torchcodec/_core/FFMPEGCommon.h +180 -0
- torchcodec/_core/Frame.h +47 -0
- torchcodec/_core/Metadata.h +70 -0
- torchcodec/_core/SingleStreamDecoder.cpp +1947 -0
- torchcodec/_core/SingleStreamDecoder.h +462 -0
- torchcodec/_core/StreamOptions.h +49 -0
- torchcodec/_core/__init__.py +39 -0
- torchcodec/_core/_metadata.py +277 -0
- torchcodec/_core/custom_ops.cpp +681 -0
- torchcodec/_core/fetch_and_expose_non_gpl_ffmpeg_libs.cmake +226 -0
- torchcodec/_core/ops.py +381 -0
- torchcodec/_core/pybind_ops.cpp +45 -0
- torchcodec/_frame.py +145 -0
- torchcodec/_internally_replaced_utils.py +53 -0
- torchcodec/_samplers/__init__.py +7 -0
- torchcodec/_samplers/video_clip_sampler.py +430 -0
- torchcodec/decoders/__init__.py +11 -0
- torchcodec/decoders/_audio_decoder.py +168 -0
- torchcodec/decoders/_decoder_utils.py +52 -0
- torchcodec/decoders/_video_decoder.py +399 -0
- torchcodec/libtorchcodec_custom_ops4.so +0 -0
- torchcodec/libtorchcodec_custom_ops5.so +0 -0
- torchcodec/libtorchcodec_custom_ops6.so +0 -0
- torchcodec/libtorchcodec_custom_ops7.so +0 -0
- torchcodec/libtorchcodec_decoder4.so +0 -0
- torchcodec/libtorchcodec_decoder5.so +0 -0
- torchcodec/libtorchcodec_decoder6.so +0 -0
- torchcodec/libtorchcodec_decoder7.so +0 -0
- torchcodec/libtorchcodec_pybind_ops4.so +0 -0
- torchcodec/libtorchcodec_pybind_ops5.so +0 -0
- torchcodec/libtorchcodec_pybind_ops6.so +0 -0
- torchcodec/libtorchcodec_pybind_ops7.so +0 -0
- torchcodec/samplers/__init__.py +2 -0
- torchcodec/samplers/_common.py +84 -0
- torchcodec/samplers/_index_based.py +285 -0
- torchcodec/samplers/_time_based.py +348 -0
- torchcodec/version.py +2 -0
- torchcodec-0.3.0.dist-info/LICENSE +28 -0
- torchcodec-0.3.0.dist-info/METADATA +280 -0
- torchcodec-0.3.0.dist-info/RECORD +57 -0
- torchcodec-0.3.0.dist-info/WHEEL +5 -0
- torchcodec-0.3.0.dist-info/top_level.txt +2 -0
torchcodec/samplers/_time_based.py
ADDED

@@ -0,0 +1,348 @@
from typing import Literal, Optional

import torch

from torchcodec import FrameBatch
from torchcodec.samplers._common import (
    _FRAMEBATCH_RETURN_DOCS,
    _POLICY_FUNCTION_TYPE,
    _POLICY_FUNCTIONS,
    _reshape_4d_framebatch_into_5d,
    _validate_common_params,
)


def _validate_params_time_based(
    *,
    decoder,
    num_clips,
    seconds_between_clip_starts,
    seconds_between_frames,
):

    if (num_clips is None and seconds_between_clip_starts is None) or (
        num_clips is not None and seconds_between_clip_starts is not None
    ):
        raise ValueError("This is internal only and should never happen.")

    if seconds_between_clip_starts is not None and seconds_between_clip_starts <= 0:
        raise ValueError(
            f"seconds_between_clip_starts ({seconds_between_clip_starts}) must be > 0"
        )

    if num_clips is not None and num_clips <= 0:
        raise ValueError(f"num_clips ({num_clips}) must be > 0")

    if decoder.metadata.average_fps is None:
        raise ValueError(
            "Could not infer average fps from video metadata. "
            "Try using an index-based sampler instead."
        )

    # Note that metadata.begin_stream_seconds is a property that will always yield a valid
    # value; if it is not present in the actual metadata, the metadata object will return 0.
    # Hence, we do not test for it here and only test metadata.end_stream_seconds.
    if decoder.metadata.end_stream_seconds is None:
        raise ValueError(
            "Could not infer stream end from video metadata. "
            "Try using an index-based sampler instead."
        )

    average_frame_duration_seconds = 1 / decoder.metadata.average_fps
    if seconds_between_frames is None:
        seconds_between_frames = average_frame_duration_seconds
    elif seconds_between_frames <= 0:
        raise ValueError(
            f"seconds_between_frames ({seconds_between_frames}) must be > 0"
        )

    return seconds_between_frames


def _validate_sampling_range_time_based(
    *,
    num_frames_per_clip,
    seconds_between_frames,
    sampling_range_start,
    sampling_range_end,
    begin_stream_seconds,
    end_stream_seconds,
):

    if sampling_range_start is None:
        sampling_range_start = begin_stream_seconds
    else:
        if sampling_range_start < begin_stream_seconds:
            raise ValueError(
                f"sampling_range_start ({sampling_range_start}) must be at least {begin_stream_seconds}"
            )
        if sampling_range_start >= end_stream_seconds:
            raise ValueError(
                f"sampling_range_start ({sampling_range_start}) must be smaller than {end_stream_seconds}"
            )

    if sampling_range_end is None:
        # We allow a clip to start anywhere within
        # [sampling_range_start, sampling_range_end)
        # When sampling_range_end is None, we want to automatically set it to
        # the largest possible value such that the sampled frames in any clip
        # are within the bounds of the video duration (in other words, we don't
        # want to have to resort to the `policy`).
        # I.e. we want to guarantee that for all frames in any clip we have
        # pts < end_stream_seconds.
        #
        # The frames of a clip will be sampled at the following pts:
        # clip_timestamps = [
        #     clip_start + 0 * seconds_between_frames,
        #     clip_start + 1 * seconds_between_frames,
        #     clip_start + 2 * seconds_between_frames,
        #     ...
        #     clip_start + (num_frames_per_clip - 1) * seconds_between_frames,
        # ]
        # To guarantee that any such value is < end_stream_seconds, we only need
        # to guarantee that
        # clip_start < end_stream_seconds - (num_frames_per_clip - 1) * seconds_between_frames
        #
        # So that's the value of sampling_range_end we want to use.
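        # For example (hypothetical numbers): with end_stream_seconds=10.0,
        # num_frames_per_clip=4 and seconds_between_frames=0.25, every clip
        # must start strictly before 10.0 - 3 * 0.25 = 9.25, so
        # sampling_range_end becomes 9.25.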
        sampling_range_end = (
            end_stream_seconds - (num_frames_per_clip - 1) * seconds_between_frames
        )
    elif sampling_range_end <= begin_stream_seconds:
        raise ValueError(
            f"sampling_range_end ({sampling_range_end}) must be at least {begin_stream_seconds}"
        )

    if sampling_range_start >= sampling_range_end:
        raise ValueError(
            f"sampling_range_start ({sampling_range_start}) must be smaller than sampling_range_end ({sampling_range_end})"
        )

    sampling_range_end = min(sampling_range_end, end_stream_seconds)

    return sampling_range_start, sampling_range_end


def _build_all_clips_timestamps(
    *,
    clip_start_seconds: torch.Tensor,  # 1D float tensor
    num_frames_per_clip: int,
    seconds_between_frames: float,
    end_stream_seconds: float,
    policy_fun: _POLICY_FUNCTION_TYPE,
) -> list[float]:

    all_clips_timestamps: list[float] = []
    for start_seconds in clip_start_seconds:
        clip_timestamps = [
            timestamp
            for i in range(num_frames_per_clip)
            if (timestamp := start_seconds + i * seconds_between_frames)
            < end_stream_seconds
        ]

        if len(clip_timestamps) < num_frames_per_clip:
            clip_timestamps = policy_fun(clip_timestamps, num_frames_per_clip)
        all_clips_timestamps += clip_timestamps

    return all_clips_timestamps
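
# Illustration of _build_all_clips_timestamps (hypothetical numbers): with
# clip_start_seconds=tensor([0.0, 1.5]), num_frames_per_clip=2,
# seconds_between_frames=0.1 and a long enough video, the result is
# conceptually the flat list [0.0, 0.1, 1.5, 1.6]; clips that would cross
# end_stream_seconds are first truncated and then completed by policy_fun.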


def _generic_time_based_sampler(
    kind: Literal["random", "regular"],
    decoder,
    *,
    num_clips: Optional[int],  # mutually exclusive with seconds_between_clip_starts
    seconds_between_clip_starts: Optional[float],
    num_frames_per_clip: int,
    seconds_between_frames: Optional[float],
    # None means "beginning", which may not always be 0
    sampling_range_start: Optional[float],
    sampling_range_end: Optional[float],  # interval is [start, end).
    policy: Literal["repeat_last", "wrap", "error"] = "repeat_last",
) -> FrameBatch:
    # Note: *everywhere*, sampling_range_end denotes the upper bound of where a
    # clip can start. This is an *open* upper bound, i.e. we will make sure no
    # clip starts exactly at (or above) sampling_range_end.

    _validate_common_params(
        decoder=decoder,
        num_frames_per_clip=num_frames_per_clip,
        policy=policy,
    )

    seconds_between_frames = _validate_params_time_based(
        decoder=decoder,
        num_clips=num_clips,
        seconds_between_clip_starts=seconds_between_clip_starts,
        seconds_between_frames=seconds_between_frames,
    )

    sampling_range_start, sampling_range_end = _validate_sampling_range_time_based(
        num_frames_per_clip=num_frames_per_clip,
        seconds_between_frames=seconds_between_frames,
        sampling_range_start=sampling_range_start,
        sampling_range_end=sampling_range_end,
        begin_stream_seconds=decoder.metadata.begin_stream_seconds,
        end_stream_seconds=decoder.metadata.end_stream_seconds,
    )

    if kind == "random":
        assert num_clips is not None  # appease type-checker
        sampling_range_width = sampling_range_end - sampling_range_start
        # torch.rand() returns in [0, 1)
        # which ensures all clip starts are < sampling_range_end
        clip_start_seconds = (
            torch.rand(num_clips) * sampling_range_width + sampling_range_start
        )
    else:
        assert seconds_between_clip_starts is not None  # appease type-checker
        clip_start_seconds = torch.arange(
            sampling_range_start,
            sampling_range_end,  # excluded
            seconds_between_clip_starts,
        )
        num_clips = len(clip_start_seconds)
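        # The number of clips is implied by the range and spacing here:
        # torch.arange yields ceil((sampling_range_end - sampling_range_start)
        # / seconds_between_clip_starts) clip starts.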

    all_clips_timestamps = _build_all_clips_timestamps(
        clip_start_seconds=clip_start_seconds,
        num_frames_per_clip=num_frames_per_clip,
        seconds_between_frames=seconds_between_frames,
        end_stream_seconds=decoder.metadata.end_stream_seconds,
        policy_fun=_POLICY_FUNCTIONS[policy],
    )

    frames = decoder.get_frames_played_at(seconds=all_clips_timestamps)
    return _reshape_4d_framebatch_into_5d(
        frames=frames,
        num_clips=num_clips,
        num_frames_per_clip=num_frames_per_clip,
    )


def clips_at_random_timestamps(
    decoder,
    *,
    num_clips: int = 1,
    num_frames_per_clip: int = 1,
    seconds_between_frames: Optional[float] = None,
    # None means "beginning", which may not always be 0
    sampling_range_start: Optional[float] = None,
    sampling_range_end: Optional[float] = None,  # interval is [start, end).
    policy: Literal["repeat_last", "wrap", "error"] = "repeat_last",
) -> FrameBatch:
    # See docstring below
    return _generic_time_based_sampler(
        kind="random",
        decoder=decoder,
        num_clips=num_clips,
        seconds_between_clip_starts=None,
        num_frames_per_clip=num_frames_per_clip,
        seconds_between_frames=seconds_between_frames,
        sampling_range_start=sampling_range_start,
        sampling_range_end=sampling_range_end,
        policy=policy,
    )


def clips_at_regular_timestamps(
    decoder,
    *,
    seconds_between_clip_starts: float,
    num_frames_per_clip: int = 1,
    seconds_between_frames: Optional[float] = None,
    # None means "beginning", which may not always be 0
    sampling_range_start: Optional[float] = None,
    sampling_range_end: Optional[float] = None,  # interval is [start, end).
    policy: Literal["repeat_last", "wrap", "error"] = "repeat_last",
) -> FrameBatch:
    # See docstring below
    return _generic_time_based_sampler(
        kind="regular",
        decoder=decoder,
        num_clips=None,
        seconds_between_clip_starts=seconds_between_clip_starts,
        num_frames_per_clip=num_frames_per_clip,
        seconds_between_frames=seconds_between_frames,
        sampling_range_start=sampling_range_start,
        sampling_range_end=sampling_range_end,
        policy=policy,
    )


_COMMON_DOCS = """
{maybe_note}

Args:
    decoder (VideoDecoder): The :class:`~torchcodec.decoders.VideoDecoder`
        instance to sample clips from.
    {num_clips_or_seconds_between_clip_starts}
    num_frames_per_clip (int, optional): The number of frames per clip. Default: 1.
    seconds_between_frames (float or None, optional): The time (in seconds)
        between each frame within a clip. More accurately, this defines the
        time between the *frame sampling point*, i.e. the timestamps at
        which we sample the frames. Because frames span intervals in time,
        the resulting start of frames within a clip may not be exactly
        spaced by ``seconds_between_frames`` - but on average, they will be.
        Default is None, which is set to the average frame duration
        (``1/average_fps``).
    sampling_range_start (float or None, optional): The start of the
        sampling range, which defines the first timestamp (in seconds) that
        a clip may *start* at. Default: None, which corresponds to the start
        of the video. (Note: some videos start at negative values, which is
        why the default is not 0).
    sampling_range_end (float or None, optional): The end of the sampling
        range, which defines the last timestamp (in seconds) that a clip may
        *start* at. This value is exclusive, i.e. a clip may only start within
        [``sampling_range_start``, ``sampling_range_end``). If None
        (default), the value is set automatically such that the clips never
        span beyond the end of the video, i.e. it is set to
        ``end_video_seconds - (num_frames_per_clip - 1) *
        seconds_between_frames``. When a clip spans beyond the end of the
        video, the ``policy`` parameter defines how to construct such a clip.
    policy (str, optional): Defines how to construct clips that span beyond
        the end of the video. This is best described with an example:
        assuming the last valid (seekable) timestamp in a video is 10.9, and
        a clip was sampled to start at timestamp 10.5, with
        ``num_frames_per_clip=5`` and ``seconds_between_frames=0.2``, the
        sampling timestamps of the frames in the clip are supposed to be
        [10.5, 10.7, 10.9, 11.1, 11.3]. But 11.1 and 11.3 are invalid
        timestamps, so the ``policy`` parameter defines how to replace those
        frames, with valid sampling timestamps:

        - "repeat_last": repeats the last valid frame of the clip. We would
          get frames sampled at timestamps [10.5, 10.7, 10.9, 10.9, 10.9].
        - "wrap": wraps around to the beginning of the clip. We would get
          frames sampled at timestamps [10.5, 10.7, 10.9, 10.5, 10.7].
        - "error": raises an error.

        Default is "repeat_last". Note that when ``sampling_range_end=None``
        (default), this policy parameter is unlikely to be relevant.

{return_docs}
"""


_NUM_CLIPS_DOCS = """
    num_clips (int, optional): The number of clips to return. Default: 1.
"""
clips_at_random_timestamps.__doc__ = f"""Sample :term:`clips` at random timestamps.
{_COMMON_DOCS.format(maybe_note="", num_clips_or_seconds_between_clip_starts=_NUM_CLIPS_DOCS, return_docs=_FRAMEBATCH_RETURN_DOCS)}
"""


_SECONDS_BETWEEN_CLIP_STARTS = """
    seconds_between_clip_starts (float): The space (in seconds) between each
        clip start.
"""

_NOTE_DOCS = """
.. note::
    For consistency with existing sampling APIs (such as torchvision), this
    sampler takes a ``seconds_between_clip_starts`` parameter instead of
    ``num_clips``. If you find that supporting ``num_clips`` would be
    useful, please let us know by `opening a feature request
    <https://github.com/pytorch/torchcodec/issues?q=is:open+is:issue>`_.
"""
clips_at_regular_timestamps.__doc__ = f"""Sample :term:`clips` at regular (equally-spaced) timestamps.
{_COMMON_DOCS.format(maybe_note=_NOTE_DOCS, num_clips_or_seconds_between_clip_starts=_SECONDS_BETWEEN_CLIP_STARTS, return_docs=_FRAMEBATCH_RETURN_DOCS)}
"""
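
# Example usage (a sketch, assuming the samplers are exposed through
# torchcodec.samplers as in the package README):
#
#     from torchcodec.decoders import VideoDecoder
#     from torchcodec.samplers import clips_at_random_timestamps
#
#     decoder = VideoDecoder("video.mp4")
#     clips = clips_at_random_timestamps(decoder, num_clips=5, num_frames_per_clip=4)
#     clips.data.shape  # (num_clips, num_frames_per_clip, C, H, W) == (5, 4, C, H, W)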
torchcodec-0.3.0.dist-info/LICENSE
ADDED

@@ -0,0 +1,28 @@
BSD 3-Clause License

Copyright 2024 Meta

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice,this list
of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its contributors may
be used to endorse or promote products derived from this software without specific
prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
DAMAGE.
torchcodec-0.3.0.dist-info/METADATA
ADDED

@@ -0,0 +1,280 @@
Metadata-Version: 2.2
Name: torchcodec
Version: 0.3.0
Summary: A video decoder for PyTorch
Author-email: PyTorch Team <packages@pytorch.org>
License: BSD 3-Clause License

Copyright 2024 Meta

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice,this list
of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice, this
list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its contributors may
be used to endorse or promote products derived from this software without specific
prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY
EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
DAMAGE.

Project-URL: GitHub, https://github.com/pytorch/torchcodec
Project-URL: Documentation, https://pytorch.org/torchcodec/stable/index.html
Requires-Python: >=3.8
Description-Content-Type: text/markdown
License-File: LICENSE
Provides-Extra: dev
Requires-Dist: numpy; extra == "dev"
Requires-Dist: pytest; extra == "dev"
Requires-Dist: pillow; extra == "dev"

[**Installation**](#installing-torchcodec) | [**Simple Example**](#using-torchcodec) | [**Detailed Example**](https://pytorch.org/torchcodec/stable/generated_examples/) | [**Documentation**](https://pytorch.org/torchcodec) | [**Contributing**](CONTRIBUTING.md) | [**License**](#license)

# TorchCodec

TorchCodec is a Python library for decoding video and audio data into PyTorch
tensors, on CPU and CUDA GPU. It aims to be fast, easy to use, and well
integrated into the PyTorch ecosystem. If you want to use PyTorch to train ML
models on videos and audio, TorchCodec is how you turn these into data.

We achieve these capabilities through:

* Pythonic APIs that mirror Python and PyTorch conventions.
* Relying on [FFmpeg](https://www.ffmpeg.org/) to do the decoding. TorchCodec
  uses the version of FFmpeg you already have installed. FFmpeg is a mature
  library with broad coverage available on most systems. It is, however, not
  easy to use. TorchCodec abstracts FFmpeg's complexity to ensure it is used
  correctly and efficiently.
* Returning data as PyTorch tensors, ready to be fed into PyTorch transforms
  or used directly to train models.

> [!NOTE]
> ⚠️ TorchCodec is still in development stage and some APIs may be updated
> in future versions, depending on user feedback.
> If you have any suggestions or issues, please let us know by
> [opening an issue](https://github.com/pytorch/torchcodec/issues/new/choose)!

## Using TorchCodec

Here's a condensed summary of what you can do with TorchCodec. For more detailed
examples, [check out our
documentation](https://pytorch.org/torchcodec/stable/generated_examples/)!

#### Decoding

```python
from torchcodec.decoders import VideoDecoder

device = "cpu"  # or e.g. "cuda" !
decoder = VideoDecoder("path/to/video.mp4", device=device)

decoder.metadata
# VideoStreamMetadata:
#   num_frames: 250
#   duration_seconds: 10.0
#   bit_rate: 31315.0
#   codec: h264
#   average_fps: 25.0
#   ... (truncated output)

# Simple Indexing API
decoder[0]  # uint8 tensor of shape [C, H, W]
decoder[0 : -1 : 20]  # uint8 stacked tensor of shape [N, C, H, W]

# Indexing, with PTS and duration info:
decoder.get_frames_at(indices=[2, 100])
# FrameBatch:
#   data (shape): torch.Size([2, 3, 270, 480])
#   pts_seconds: tensor([0.0667, 3.3367], dtype=torch.float64)
#   duration_seconds: tensor([0.0334, 0.0334], dtype=torch.float64)

# Time-based indexing with PTS and duration info
decoder.get_frames_played_at(seconds=[0.5, 10.4])
# FrameBatch:
#   data (shape): torch.Size([2, 3, 270, 480])
#   pts_seconds: tensor([ 0.4671, 10.3770], dtype=torch.float64)
#   duration_seconds: tensor([0.0334, 0.0334], dtype=torch.float64)
```

#### Clip sampling

```python

from torchcodec.samplers import clips_at_regular_timestamps

clips_at_regular_timestamps(
    decoder,
    seconds_between_clip_starts=1.5,
    num_frames_per_clip=4,
    seconds_between_frames=0.1
)
# FrameBatch:
#   data (shape): torch.Size([9, 4, 3, 270, 480])
#   pts_seconds: tensor([[ 0.0000,  0.0667,  0.1668,  0.2669],
#       [ 1.4681,  1.5682,  1.6683,  1.7684],
#       [ 2.9696,  3.0697,  3.1698,  3.2699],
#       ... (truncated), dtype=torch.float64)
#   duration_seconds: tensor([[0.0334, 0.0334, 0.0334, 0.0334],
#       [0.0334, 0.0334, 0.0334, 0.0334],
#       [0.0334, 0.0334, 0.0334, 0.0334],
#       ... (truncated), dtype=torch.float64)
```

You can use the following snippet to generate a video with FFmpeg and try out
TorchCodec:

```bash
fontfile=/usr/share/fonts/dejavu-sans-mono-fonts/DejaVuSansMono-Bold.ttf
output_video_file=/tmp/output_video.mp4

ffmpeg -f lavfi -i \
    color=size=640x400:duration=10:rate=25:color=blue \
    -vf "drawtext=fontfile=${fontfile}:fontsize=30:fontcolor=white:x=(w-text_w)/2:y=(h-text_h)/2:text='Frame %{frame_num}'" \
    ${output_video_file}
```
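
You can then sanity-check the generated file with TorchCodec (a minimal
sketch; the path matches the `output_video_file` defined above):

```python
from torchcodec.decoders import VideoDecoder

decoder = VideoDecoder("/tmp/output_video.mp4")
print(decoder.metadata.num_frames)  # expect 250 frames for 10s at 25 fps
frame = decoder[0]                  # uint8 tensor of shape [C, H, W]
```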

## Installing TorchCodec
### Installing CPU-only TorchCodec

1. Install the latest stable version of PyTorch following the
   [official instructions](https://pytorch.org/get-started/locally/). For other
   versions, refer to the table below for compatibility between versions of
   `torch` and `torchcodec`.

2. Install FFmpeg, if it's not already installed. Linux distributions usually
   come with FFmpeg pre-installed. TorchCodec supports all major FFmpeg versions
   in [4, 7].

   If FFmpeg is not already installed, or you need a more recent version, an
   easy way to install it is to use `conda`:

   ```bash
   conda install ffmpeg
   # or
   conda install ffmpeg -c conda-forge
   ```

3. Install TorchCodec:

   ```bash
   pip install torchcodec
   ```

The following table indicates the compatibility between versions of
`torchcodec`, `torch` and Python.

| `torchcodec`       | `torch`            | Python              |
| ------------------ | ------------------ | ------------------- |
| `main` / `nightly` | `main` / `nightly` | `>=3.9`, `<=3.13`   |
| `0.2`              | `2.6`              | `>=3.9`, `<=3.13`   |
| `0.1`              | `2.5`              | `>=3.9`, `<=3.12`   |
| `0.0.3`            | `2.4`              | `>=3.8`, `<=3.12`   |

### Installing CUDA-enabled TorchCodec

First, make sure you have a GPU that has NVDEC hardware that can decode the
format you want. Refer to Nvidia's GPU support matrix for more details
[here](https://developer.nvidia.com/video-encode-and-decode-gpu-support-matrix-new).

1. Install PyTorch corresponding to your CUDA Toolkit using the
   [official instructions](https://pytorch.org/get-started/locally/). You'll
   need the `libnpp` and `libnvrtc` CUDA libraries, which are usually part of
   the CUDA Toolkit.

2. Install or compile FFmpeg with NVDEC support.
   TorchCodec with CUDA should work with FFmpeg versions in [4, 7].

   If FFmpeg is not already installed, or you need a more recent version, an
   easy way to install it is to use `conda`:

   ```bash
   conda install ffmpeg
   # or
   conda install ffmpeg -c conda-forge
   ```

   If you are building FFmpeg from source you can follow Nvidia's guide to
   configuring and installing FFmpeg with NVDEC support
   [here](https://docs.nvidia.com/video-technologies/video-codec-sdk/12.0/ffmpeg-with-nvidia-gpu/index.html).

   After installing FFmpeg make sure it has NVDEC support when you list the supported
   decoders:

   ```bash
   ffmpeg -decoders | grep -i nvidia
   # This should show a line like this:
   # V..... h264_cuvid            Nvidia CUVID H264 decoder (codec h264)
   ```

   To check that FFmpeg libraries work with NVDEC correctly you can decode a sample video:

   ```bash
   ffmpeg -hwaccel cuda -hwaccel_output_format cuda -i test/resources/nasa_13013.mp4 -f null -
   ```
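
   You can run a similar check through TorchCodec itself (a minimal sketch;
   it assumes the CUDA-enabled build from step 3 below and the same sample
   video):

   ```python
   from torchcodec.decoders import VideoDecoder

   decoder = VideoDecoder("test/resources/nasa_13013.mp4", device="cuda")
   print(decoder[0].device)  # expect a CUDA device such as cuda:0
   ```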

3. Install TorchCodec by passing in an `--index-url` parameter that corresponds
   to your CUDA Toolkit version, for example:

   ```bash
   # This corresponds to CUDA Toolkit version 12.6. It should be the same one
   # you used when you installed PyTorch (if you installed PyTorch with pip).
   pip install torchcodec --index-url=https://download.pytorch.org/whl/cu126
   ```

   Note that without passing in the `--index-url` parameter, `pip` installs
   the CPU-only version of TorchCodec.

## Benchmark Results

The following was generated by running [our benchmark script](./benchmarks/decoders/generate_readme_data.py) on a lightly loaded 22-core machine with an Nvidia A100 with
5 [NVDEC decoders](https://docs.nvidia.com/video-technologies/video-codec-sdk/12.1/nvdec-application-note/index.html#).



The top row is a [Mandelbrot](https://ffmpeg.org/ffmpeg-filters.html#mandelbrot) video
generated from FFmpeg that has a resolution of 1280x720 at 60 fps and is 120 seconds long.
The bottom row is a [promotional video from NASA](https://download.pytorch.org/torchaudio/tutorial-assets/stream-api/NASAs_Most_Scientifically_Complex_Space_Observatory_Requires_Precision-MP4_small.mp4)
that has a resolution of 960x540 at 29.7 fps and is 206 seconds long. Both videos were
encoded with libx264 and yuv420p pixel format. All decoders, except for TorchVision, used FFmpeg 6.1.2. TorchVision used FFmpeg 4.2.2.

For TorchCodec, the "approx" label means that it was using [approximate mode](https://pytorch.org/torchcodec/stable/generated_examples/approximate_mode.html)
for seeking.

## Planned future work

We are actively working on the following features:

- [Audio decoding](https://github.com/pytorch/torchcodec/issues/85)

Let us know if you have any feature requests by [opening an
issue](https://github.com/pytorch/torchcodec/issues/new?assignees=&labels=&projects=&template=feature-request.yml)!

## Contributing

We welcome contributions to TorchCodec! Please see our [contributing
guide](CONTRIBUTING.md) for more details.

## License

TorchCodec is released under the [BSD 3 license](./LICENSE).

However, TorchCodec may be used with code not written by Meta which may be
distributed under different licenses.

For example, if you build TorchCodec with ENABLE_CUDA=1 or use the CUDA-enabled
release of torchcodec, please review CUDA's license here:
[Nvidia licenses](https://docs.nvidia.com/cuda/eula/index.html).