torchcodec 0.8.0__cp313-cp313-macosx_12_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of torchcodec might be problematic. Click here for more details.

Files changed (82) hide show
  1. torchcodec/.dylibs/libc++.1.0.dylib +0 -0
  2. torchcodec/.dylibs/libpython3.13.dylib +0 -0
  3. torchcodec/__init__.py +16 -0
  4. torchcodec/_core/AVIOContextHolder.cpp +60 -0
  5. torchcodec/_core/AVIOContextHolder.h +64 -0
  6. torchcodec/_core/AVIOFileLikeContext.cpp +98 -0
  7. torchcodec/_core/AVIOFileLikeContext.h +55 -0
  8. torchcodec/_core/AVIOTensorContext.cpp +123 -0
  9. torchcodec/_core/AVIOTensorContext.h +43 -0
  10. torchcodec/_core/BetaCudaDeviceInterface.cpp +636 -0
  11. torchcodec/_core/BetaCudaDeviceInterface.h +191 -0
  12. torchcodec/_core/CMakeLists.txt +325 -0
  13. torchcodec/_core/CUDACommon.cpp +315 -0
  14. torchcodec/_core/CUDACommon.h +46 -0
  15. torchcodec/_core/Cache.h +138 -0
  16. torchcodec/_core/CpuDeviceInterface.cpp +347 -0
  17. torchcodec/_core/CpuDeviceInterface.h +132 -0
  18. torchcodec/_core/CudaDeviceInterface.cpp +357 -0
  19. torchcodec/_core/CudaDeviceInterface.h +64 -0
  20. torchcodec/_core/DeviceInterface.cpp +117 -0
  21. torchcodec/_core/DeviceInterface.h +148 -0
  22. torchcodec/_core/Encoder.cpp +807 -0
  23. torchcodec/_core/Encoder.h +173 -0
  24. torchcodec/_core/FFMPEGCommon.cpp +608 -0
  25. torchcodec/_core/FFMPEGCommon.h +245 -0
  26. torchcodec/_core/FilterGraph.cpp +149 -0
  27. torchcodec/_core/FilterGraph.h +59 -0
  28. torchcodec/_core/Frame.cpp +42 -0
  29. torchcodec/_core/Frame.h +72 -0
  30. torchcodec/_core/Metadata.h +72 -0
  31. torchcodec/_core/NVDECCache.cpp +70 -0
  32. torchcodec/_core/NVDECCache.h +104 -0
  33. torchcodec/_core/SingleStreamDecoder.cpp +1719 -0
  34. torchcodec/_core/SingleStreamDecoder.h +405 -0
  35. torchcodec/_core/StreamOptions.h +63 -0
  36. torchcodec/_core/Transform.cpp +60 -0
  37. torchcodec/_core/Transform.h +59 -0
  38. torchcodec/_core/ValidationUtils.cpp +35 -0
  39. torchcodec/_core/ValidationUtils.h +21 -0
  40. torchcodec/_core/__init__.py +41 -0
  41. torchcodec/_core/_metadata.py +317 -0
  42. torchcodec/_core/custom_ops.cpp +875 -0
  43. torchcodec/_core/fetch_and_expose_non_gpl_ffmpeg_libs.cmake +360 -0
  44. torchcodec/_core/nvcuvid_include/cuviddec.h +1374 -0
  45. torchcodec/_core/nvcuvid_include/nvcuvid.h +610 -0
  46. torchcodec/_core/ops.py +498 -0
  47. torchcodec/_core/pybind_ops.cpp +50 -0
  48. torchcodec/_frame.py +145 -0
  49. torchcodec/_internally_replaced_utils.py +67 -0
  50. torchcodec/_samplers/__init__.py +7 -0
  51. torchcodec/_samplers/video_clip_sampler.py +418 -0
  52. torchcodec/decoders/__init__.py +12 -0
  53. torchcodec/decoders/_audio_decoder.py +177 -0
  54. torchcodec/decoders/_decoder_utils.py +112 -0
  55. torchcodec/decoders/_video_decoder.py +500 -0
  56. torchcodec/encoders/__init__.py +1 -0
  57. torchcodec/encoders/_audio_encoder.py +150 -0
  58. torchcodec/libtorchcodec_core4.dylib +0 -0
  59. torchcodec/libtorchcodec_core5.dylib +0 -0
  60. torchcodec/libtorchcodec_core6.dylib +0 -0
  61. torchcodec/libtorchcodec_core7.dylib +0 -0
  62. torchcodec/libtorchcodec_core8.dylib +0 -0
  63. torchcodec/libtorchcodec_custom_ops4.dylib +0 -0
  64. torchcodec/libtorchcodec_custom_ops5.dylib +0 -0
  65. torchcodec/libtorchcodec_custom_ops6.dylib +0 -0
  66. torchcodec/libtorchcodec_custom_ops7.dylib +0 -0
  67. torchcodec/libtorchcodec_custom_ops8.dylib +0 -0
  68. torchcodec/libtorchcodec_pybind_ops4.so +0 -0
  69. torchcodec/libtorchcodec_pybind_ops5.so +0 -0
  70. torchcodec/libtorchcodec_pybind_ops6.so +0 -0
  71. torchcodec/libtorchcodec_pybind_ops7.so +0 -0
  72. torchcodec/libtorchcodec_pybind_ops8.so +0 -0
  73. torchcodec/samplers/__init__.py +2 -0
  74. torchcodec/samplers/_common.py +84 -0
  75. torchcodec/samplers/_index_based.py +287 -0
  76. torchcodec/samplers/_time_based.py +358 -0
  77. torchcodec/version.py +2 -0
  78. torchcodec-0.8.0.dist-info/METADATA +253 -0
  79. torchcodec-0.8.0.dist-info/RECORD +82 -0
  80. torchcodec-0.8.0.dist-info/WHEEL +5 -0
  81. torchcodec-0.8.0.dist-info/licenses/LICENSE +28 -0
  82. torchcodec-0.8.0.dist-info/top_level.txt +2 -0
@@ -0,0 +1,315 @@
1
+ // Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ // All rights reserved.
3
+ //
4
+ // This source code is licensed under the BSD-style license found in the
5
+ // LICENSE file in the root directory of this source tree.
6
+
7
+ #include "src/torchcodec/_core/CUDACommon.h"
8
+
9
+ namespace facebook::torchcodec {
10
+
11
+ namespace {
12
+
13
+ // Pytorch can only handle up to 128 GPUs.
14
+ // https://github.com/pytorch/pytorch/blob/e30c55ee527b40d67555464b9e402b4b7ce03737/c10/cuda/CUDAMacros.h#L44
15
+ const int MAX_CUDA_GPUS = 128;
16
+ // Set to -1 to have an infinitely sized cache. Set it to 0 to disable caching.
17
+ // Set to a positive number to have a cache of that size.
18
+ const int MAX_CONTEXTS_PER_GPU_IN_CACHE = -1;
19
+
20
+ PerGpuCache<NppStreamContext> g_cached_npp_ctxs(
21
+ MAX_CUDA_GPUS,
22
+ MAX_CONTEXTS_PER_GPU_IN_CACHE);
23
+
24
+ } // namespace
25
+
26
+ void initializeCudaContextWithPytorch(const torch::Device& device) {
27
+ // It is important for pytorch itself to create the cuda context. If ffmpeg
28
+ // creates the context it may not be compatible with pytorch.
29
+ // This is a dummy tensor to initialize the cuda context.
30
+ torch::Tensor dummyTensorForCudaInitialization = torch::zeros(
31
+ {1}, torch::TensorOptions().dtype(torch::kUInt8).device(device));
32
+ }
33
+
34
+ /* clang-format off */
35
+ // Note: [YUV -> RGB Color Conversion, color space and color range]
36
+ //
37
+ // The frames we get from the decoder (FFmpeg decoder, or NVCUVID) are in YUV
38
+ // format. We need to convert them to RGB. This note attempts to describe this
39
+ // process. There may be some inaccuracies and approximations that experts will
40
+ // notice, but our goal is only to provide a good enough understanding of the
41
+ // process for torchcodec developers to implement and maintain it.
42
+ // On CPU, filtergraph and swscale handle everything for us. With CUDA, we have
43
+ // to do a lot of the heavy lifting ourselves.
44
+ //
45
+ // Color space and color range
46
+ // ---------------------------
47
+ // Two main characteristics of a frame will affect the conversion process:
48
+ // 1. Color space: This basically defines what YUV values correspond to which
49
+ // physical wavelength. No need to go into details here, the point is that
50
+ // videos can come in different color spaces, the most common ones being
51
+ // BT.601 and BT.709, but there are others.
52
+ // In FFmpeg this is represented with AVColorSpace:
53
+ // https://ffmpeg.org/doxygen/4.0/pixfmt_8h.html#aff71a069509a1ad3ff54d53a1c894c85
54
+ // 2. Color range: This defines the range of YUV values. There is:
55
+ // - full range, also called PC range: AVCOL_RANGE_JPEG
56
+ // - and the "limited" range, also called studio or TV range: AVCOL_RANGE_MPEG
57
+ // https://ffmpeg.org/doxygen/4.0/pixfmt_8h.html#a3da0bf691418bc22c4bcbe6583ad589a
58
+ //
59
+ // Color space and color range are independent concepts, so we can have a BT.709
60
+ // with full range, and another one with limited range. Same for BT.601.
61
+ //
62
+ // In the first version of this note we'll focus on the full color range. It
63
+ // will later be updated to account for the limited range.
64
+ //
65
+ // Color conversion matrix
66
+ // -----------------------
67
+ // YUV -> RGB conversion is defined as the reverse process of RGB -> YUV,
68
+ // so this is where we'll start.
69
+ // At the core of a RGB -> YUV conversion are the "luma coefficients", which are
70
+ // specific to a given color space and defined by the color space standard. In
71
+ // FFmpeg they can be found here:
72
+ // https://github.com/FFmpeg/FFmpeg/blob/7d606ef0ccf2946a4a21ab1ec23486cadc21864b/libavutil/csp.c#L46-L56
73
+ //
74
+ // For example, the BT.709 coefficients are: kr=0.2126, kg=0.7152, kb=0.0722
75
+ // Coefficients must sum to 1.
76
+ //
77
+ // Conventionally Y is in [0, 1] range, and U and V are in [-0.5, 0.5] range
78
+ // (that's mathematically, in practice they are represented in integer range).
79
+ // The conversion is defined as:
80
+ // https://en.wikipedia.org/wiki/YCbCr#R'G'B'_to_Y%E2%80%B2PbPr
81
+ // Y = kr*R + kg*G + kb*B
82
+ // U = (B - Y) * 0.5 / (1 - kb) = (B - Y) / u_scale where u_scale = 2 * (1 - kb)
83
+ // V = (R - Y) * 0.5 / (1 - kr) = (R - Y) / v_scale where v_scale = 2 * (1 - kr)
84
+ //
85
+ // Putting all this into matrix form, we get:
86
+ // [Y] = [kr kg kb ] [R]
87
+ // [U] [-kr/u_scale -kg/u_scale (1-kb)/u_scale] [G]
88
+ // [V] [(1-kr)/v_scale -kg/v_scale -kb/v_scale ] [B]
89
+ //
90
+ //
91
+ // Now, to convert YUV to RGB, we just need to invert this matrix:
92
+ // ```py
93
+ // import torch
94
+ // kr, kg, kb = 0.2126, 0.7152, 0.0722 # BT.709 luma coefficients
95
+ // u_scale = 2 * (1 - kb)
96
+ // v_scale = 2 * (1 - kr)
97
+ //
98
+ // rgb_to_yuv = torch.tensor([
99
+ // [kr, kg, kb],
100
+ // [-kr/u_scale, -kg/u_scale, (1-kb)/u_scale],
101
+ // [(1-kr)/v_scale, -kg/v_scale, -kb/v_scale]
102
+ // ])
103
+ //
104
+ // yuv_to_rgb_full = torch.linalg.inv(rgb_to_yuv)
105
+ // print("YUV->RGB matrix (Full Range):")
106
+ // print(yuv_to_rgb_full)
107
+ // ```
108
+ // And we get:
109
+ // tensor([[ 1.0000e+00, -3.3142e-09, 1.5748e+00],
110
+ // [ 1.0000e+00, -1.8732e-01, -4.6812e-01],
111
+ // [ 1.0000e+00, 1.8556e+00, 4.6231e-09]])
112
+ //
113
+ // Which matches https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.709_conversion
114
+ //
115
+ // Color conversion in NPP
116
+ // -----------------------
117
+ // https://docs.nvidia.com/cuda/npp/image_color_conversion.html.
118
+ //
119
+ // NPP provides different ways to convert YUV to RGB:
120
+ // - pre-defined color conversion functions like
121
+ // nppiNV12ToRGB_709CSC_8u_P2C3R_Ctx and nppiNV12ToRGB_709HDTV_8u_P2C3R_Ctx
122
+ // which are for BT.709 limited and full range, respectively.
123
+ // - generic color conversion functions that accept a custom color conversion
124
+ // matrix, called ColorTwist, like nppiNV12ToRGB_8u_ColorTwist32f_P2C3R_Ctx
125
+ //
126
+ // We use the pre-defined functions or the color twist functions depending on
127
+ // which one we find to be closer to the CPU results.
128
+ //
129
+ // The color twist functionality is *partially* described in a section named
130
+ // "YUVToRGBColorTwist". Importantly:
131
+ //
132
+ // - The `nppiNV12ToRGB_8u_ColorTwist32f_P2C3R_Ctx` function takes the YUV data
133
+ // and the color-conversion matrix as input. The function itself and the
134
+ // matrix assume different ranges for YUV values:
135
+ // - The **matrix coefficients** must assume that Y is in [0, 1] and U, V are in
136
+ // [-0.5, 0.5]. That's how we defined our matrix above.
137
+ // - The function `nppiNV12ToRGB_8u_ColorTwist32f_P2C3R_Ctx` however expects all
138
+ // of the input Y, U, V to be in [0, 255]. That's how the data comes out of
139
+ // the decoder.
140
+ // - But *internally*, `nppiNV12ToRGB_8u_ColorTwist32f_P2C3R_Ctx` needs U and V to
141
+ // be centered around 0, i.e. in [-128, 127]. So we need to apply a -128
142
+ // offset to U and V. Y doesn't need to be offset. The offset can be applied
143
+ // by adding a 4th column to the matrix.
144
+ //
145
+ //
146
+ // So our conversion matrix becomes the following, with new offset column:
147
+ // tensor([[ 1.0000e+00, -3.3142e-09, 1.5748e+00, 0],
148
+ // [ 1.0000e+00, -1.8732e-01, -4.6812e-01, -128],
149
+ // [ 1.0000e+00, 1.8556e+00, 4.6231e-09, -128]])
150
+ //
151
+ // And that's what we need to pass for BT.709, full range.
152
+ /* clang-format on */
153
+
154
+ // BT.709 full range color conversion matrix for YUV to RGB conversion.
155
+ // See Note [YUV -> RGB Color Conversion, color space and color range]
156
+ const Npp32f bt709FullRangeColorTwist[3][4] = {
157
+ {1.0f, 0.0f, 1.5748f, 0.0f},
158
+ {1.0f, -0.187324273f, -0.468124273f, -128.0f},
159
+ {1.0f, 1.8556f, 0.0f, -128.0f}};
160
+
161
+ torch::Tensor convertNV12FrameToRGB(
162
+ UniqueAVFrame& avFrame,
163
+ const torch::Device& device,
164
+ const UniqueNppContext& nppCtx,
165
+ at::cuda::CUDAStream nvdecStream,
166
+ std::optional<torch::Tensor> preAllocatedOutputTensor) {
167
+ auto frameDims = FrameDims(avFrame->height, avFrame->width);
168
+ torch::Tensor dst;
169
+ if (preAllocatedOutputTensor.has_value()) {
170
+ dst = preAllocatedOutputTensor.value();
171
+ } else {
172
+ dst = allocateEmptyHWCTensor(frameDims, device);
173
+ }
174
+
175
+ // We need to make sure NVDEC has finished decoding a frame before
176
+ // color-converting it with NPP.
177
+ // So we make the NPP stream wait for NVDEC to finish.
178
+ at::cuda::CUDAStream nppStream =
179
+ at::cuda::getCurrentCUDAStream(device.index());
180
+ at::cuda::CUDAEvent nvdecDoneEvent;
181
+ nvdecDoneEvent.record(nvdecStream);
182
+ nvdecDoneEvent.block(nppStream);
183
+
184
+ nppCtx->hStream = nppStream.stream();
185
+ cudaError_t err = cudaStreamGetFlags(nppCtx->hStream, &nppCtx->nStreamFlags);
186
+ TORCH_CHECK(
187
+ err == cudaSuccess,
188
+ "cudaStreamGetFlags failed: ",
189
+ cudaGetErrorString(err));
190
+
191
+ NppiSize oSizeROI = {frameDims.width, frameDims.height};
192
+ Npp8u* yuvData[2] = {avFrame->data[0], avFrame->data[1]};
193
+
194
+ NppStatus status;
195
+
196
+ // For background, see
197
+ // Note [YUV -> RGB Color Conversion, color space and color range]
198
+ if (avFrame->colorspace == AVColorSpace::AVCOL_SPC_BT709) {
199
+ if (avFrame->color_range == AVColorRange::AVCOL_RANGE_JPEG) {
200
+ // NPP provides a pre-defined color conversion function for BT.709 full
201
+ // range: nppiNV12ToRGB_709HDTV_8u_P2C3R_Ctx. But it's not closely
202
+ // matching the results we have on CPU. So we're using a custom color
203
+ // conversion matrix, which provides more accurate results. See the note
204
+ // mentioned above for details, and headaches.
205
+
206
+ int srcStep[2] = {avFrame->linesize[0], avFrame->linesize[1]};
207
+
208
+ status = nppiNV12ToRGB_8u_ColorTwist32f_P2C3R_Ctx(
209
+ yuvData,
210
+ srcStep,
211
+ static_cast<Npp8u*>(dst.data_ptr()),
212
+ dst.stride(0),
213
+ oSizeROI,
214
+ bt709FullRangeColorTwist,
215
+ *nppCtx);
216
+ } else {
217
+ // If not full range, we assume studio limited range.
218
+ // The color conversion matrix for BT.709 limited range should be:
219
+ // static const Npp32f bt709LimitedRangeColorTwist[3][4] = {
220
+ // {1.16438356f, 0.0f, 1.79274107f, -16.0f},
221
+ // {1.16438356f, -0.213248614f, -0.5329093290f, -128.0f},
222
+ // {1.16438356f, 2.11240179f, 0.0f, -128.0f}
223
+ // };
224
+ // We get very close results to CPU with that, but using the pre-defined
225
+ // nppiNV12ToRGB_709CSC_8u_P2C3R_Ctx seems to be even more accurate.
226
+ status = nppiNV12ToRGB_709CSC_8u_P2C3R_Ctx(
227
+ yuvData,
228
+ avFrame->linesize[0],
229
+ static_cast<Npp8u*>(dst.data_ptr()),
230
+ dst.stride(0),
231
+ oSizeROI,
232
+ *nppCtx);
233
+ }
234
+ } else {
235
+ // TODO we're assuming BT.601 color space (and probably limited range) by
236
+ // calling nppiNV12ToRGB_8u_P2C3R_Ctx. We should handle BT.601 full range,
237
+ // and other color-spaces like 2020.
238
+ status = nppiNV12ToRGB_8u_P2C3R_Ctx(
239
+ yuvData,
240
+ avFrame->linesize[0],
241
+ static_cast<Npp8u*>(dst.data_ptr()),
242
+ dst.stride(0),
243
+ oSizeROI,
244
+ *nppCtx);
245
+ }
246
+ TORCH_CHECK(status == NPP_SUCCESS, "Failed to convert NV12 frame.");
247
+
248
+ return dst;
249
+ }
250
+
251
+ UniqueNppContext getNppStreamContext(const torch::Device& device) {
252
+ torch::DeviceIndex nonNegativeDeviceIndex = getNonNegativeDeviceIndex(device);
253
+
254
+ UniqueNppContext nppCtx = g_cached_npp_ctxs.get(device);
255
+ if (nppCtx) {
256
+ return nppCtx;
257
+ }
258
+
259
+ // From 12.9, NPP recommends using a user-created NppStreamContext and using
260
+ // the `_Ctx()` calls:
261
+ // https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#npp-release-12-9-update-1
262
+ // And the nppGetStreamContext() helper is deprecated. We are explicitly
263
+ // supposed to create the NppStreamContext manually from the CUDA device
264
+ // properties:
265
+ // https://github.com/NVIDIA/CUDALibrarySamples/blob/d97803a40fab83c058bb3d68b6c38bd6eebfff43/NPP/README.md?plain=1#L54-L72
266
+
267
+ nppCtx = std::make_unique<NppStreamContext>();
268
+ cudaDeviceProp prop{};
269
+ cudaError_t err = cudaGetDeviceProperties(&prop, nonNegativeDeviceIndex);
270
+ TORCH_CHECK(
271
+ err == cudaSuccess,
272
+ "cudaGetDeviceProperties failed: ",
273
+ cudaGetErrorString(err));
274
+
275
+ nppCtx->nCudaDeviceId = nonNegativeDeviceIndex;
276
+ nppCtx->nMultiProcessorCount = prop.multiProcessorCount;
277
+ nppCtx->nMaxThreadsPerMultiProcessor = prop.maxThreadsPerMultiProcessor;
278
+ nppCtx->nMaxThreadsPerBlock = prop.maxThreadsPerBlock;
279
+ nppCtx->nSharedMemPerBlock = prop.sharedMemPerBlock;
280
+ nppCtx->nCudaDevAttrComputeCapabilityMajor = prop.major;
281
+ nppCtx->nCudaDevAttrComputeCapabilityMinor = prop.minor;
282
+
283
+ return nppCtx;
284
+ }
285
+
286
+ void returnNppStreamContextToCache(
287
+ const torch::Device& device,
288
+ UniqueNppContext nppCtx) {
289
+ if (nppCtx) {
290
+ g_cached_npp_ctxs.addIfCacheHasCapacity(device, std::move(nppCtx));
291
+ }
292
+ }
293
+
294
+ void validatePreAllocatedTensorShape(
295
+ const std::optional<torch::Tensor>& preAllocatedOutputTensor,
296
+ const UniqueAVFrame& avFrame) {
297
+ // Note that CUDA does not yet support transforms, so the only possible
298
+ // frame dimensions are the raw decoded frame's dimensions.
299
+ auto frameDims = FrameDims(avFrame->height, avFrame->width);
300
+
301
+ if (preAllocatedOutputTensor.has_value()) {
302
+ auto shape = preAllocatedOutputTensor.value().sizes();
303
+ TORCH_CHECK(
304
+ (shape.size() == 3) && (shape[0] == frameDims.height) &&
305
+ (shape[1] == frameDims.width) && (shape[2] == 3),
306
+ "Expected tensor of shape ",
307
+ frameDims.height,
308
+ "x",
309
+ frameDims.width,
310
+ "x3, got ",
311
+ shape);
312
+ }
313
+ }
314
+
315
+ } // namespace facebook::torchcodec
@@ -0,0 +1,46 @@
1
+ // Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ // All rights reserved.
3
+ //
4
+ // This source code is licensed under the BSD-style license found in the
5
+ // LICENSE file in the root directory of this source tree.
6
+
7
+ #pragma once
8
+
9
+ #include <ATen/cuda/CUDAEvent.h>
10
+ #include <c10/cuda/CUDAStream.h>
11
+ #include <npp.h>
12
+ #include <torch/types.h>
13
+
14
+ #include "src/torchcodec/_core/Cache.h"
15
+ #include "src/torchcodec/_core/FFMPEGCommon.h"
16
+ #include "src/torchcodec/_core/Frame.h"
17
+
18
+ extern "C" {
19
+ #include <libavutil/hwcontext_cuda.h>
20
+ #include <libavutil/pixdesc.h>
21
+ }
22
+
23
+ namespace facebook::torchcodec {
24
+
25
+ void initializeCudaContextWithPytorch(const torch::Device& device);
26
+
27
+ // Unique pointer type for NPP stream context
28
+ using UniqueNppContext = std::unique_ptr<NppStreamContext>;
29
+
30
+ torch::Tensor convertNV12FrameToRGB(
31
+ UniqueAVFrame& avFrame,
32
+ const torch::Device& device,
33
+ const UniqueNppContext& nppCtx,
34
+ at::cuda::CUDAStream nvdecStream,
35
+ std::optional<torch::Tensor> preAllocatedOutputTensor = std::nullopt);
36
+
37
+ UniqueNppContext getNppStreamContext(const torch::Device& device);
38
+ void returnNppStreamContextToCache(
39
+ const torch::Device& device,
40
+ UniqueNppContext nppCtx);
41
+
42
+ void validatePreAllocatedTensorShape(
43
+ const std::optional<torch::Tensor>& preAllocatedOutputTensor,
44
+ const UniqueAVFrame& avFrame);
45
+
46
+ } // namespace facebook::torchcodec
@@ -0,0 +1,138 @@
1
+ // Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ // All rights reserved.
3
+ //
4
+ // This source code is licensed under the BSD-style license found in the
5
+ // LICENSE file in the root directory of this source tree.
6
+
7
+ #pragma once
8
+
9
+ #include <torch/types.h>
10
+ #include <memory>
11
+ #include <mutex>
12
+
13
+ namespace facebook::torchcodec {
14
+
15
+ // This header defines simple cache class primitives to store reusable objects
16
+ // across TorchCodec stream instances. The intended usage is to store hardware
17
+ // contexts, which are expensive to create. The cache mechanism is as follows:
18
+ // 1. 'PerGpuCache' provides a dynamic cache with the specified maximum capacity
19
+ // for the given number of GPUs.
20
+ // 2. When a stream object (e.g. SingleStreamDecoder) is destroyed, its cacheable
21
+ // object must be released to the cache. The cache will accept the object if it
22
+ // is not full.
23
+ // 3. When a stream object (e.g. SingleStreamDecoder) is created, a cacheable
24
+ // object must first be queried from the cache. If the cache is empty then a new
25
+ // object must be created.
26
+
27
+ template <typename T, typename D = std::default_delete<T>>
28
+ class Cache {
29
+ public:
30
+ using element_type = std::unique_ptr<T, D>;
31
+
32
+ explicit Cache(int capacity) : capacity_(capacity) {}
33
+
34
+ // Adds an object to the cache if the cache has capacity. Returns true
35
+ // if object was added and false otherwise.
36
+ bool addIfCacheHasCapacity(element_type&& obj);
37
+
38
+ // Returns an object from the cache. Cache does not hold a reference
39
+ // to the object after this call.
40
+ element_type get();
41
+
42
+ private:
43
+ int capacity_;
44
+ std::mutex mutex_;
45
+ std::vector<element_type> cache_;
46
+ };
47
+
48
+ template <typename T, typename D>
49
+ bool Cache<T, D>::addIfCacheHasCapacity(element_type&& obj) {
50
+ std::scoped_lock lock(mutex_);
51
+ if (capacity_ >= 0 && cache_.size() >= static_cast<size_t>(capacity_)) {
52
+ return false;
53
+ }
54
+ cache_.push_back(std::move(obj));
55
+ return true;
56
+ }
57
+
58
+ template <typename T, typename D>
59
+ typename Cache<T, D>::element_type Cache<T, D>::get() {
60
+ std::scoped_lock lock(mutex_);
61
+ if (cache_.empty()) {
62
+ return nullptr;
63
+ }
64
+
65
+ element_type obj = std::move(cache_.back());
66
+ cache_.pop_back();
67
+ return obj;
68
+ }
69
+
70
+ template <typename T, typename D = std::default_delete<T>>
71
+ class PerGpuCache {
72
+ public:
73
+ using element_type = typename Cache<T, D>::element_type;
74
+
75
+ // Initializes 'maxGpus' number of caches. Each cache can hold no
76
+ // more than 'capacity' items. If 'capacity' <0 cache size is unlimited.
77
+ PerGpuCache(int maxGpus, int capacity) {
78
+ TORCH_CHECK(maxGpus > 0, "maxGpus for PerGpuCache must be >0");
79
+ for (int i = 0; i < maxGpus; ++i) {
80
+ cache_.emplace_back(std::make_unique<Cache<T, D>>(capacity));
81
+ }
82
+ }
83
+
84
+ // Adds an object to the specified device cache if the cache has
85
+ // capacity. Returns true if object was added and false otherwise.
86
+ bool addIfCacheHasCapacity(const torch::Device& device, element_type&& obj);
87
+
88
+ // Returns an object from the cache of the specified device. Cache
89
+ // does not hold a reference to the object after this call.
90
+ element_type get(const torch::Device& device);
91
+
92
+ private:
93
+ // 'Cache' class implementation contains mutex which makes it non-movable
94
+ // and non-copyable, so we need to wrap it in std::unique_ptr.
95
+ std::vector<std::unique_ptr<Cache<T, D>>> cache_;
96
+ };
97
+
98
+ // Note: this function is inline for convenience, not performance. Because the
99
+ // rest of this file is template functions, they must all be defined in this
100
+ // header. This function is not a template function, and should, in principle,
101
+ // be defined in a .cpp file to preserve the One Definition Rule. That's
102
+ // annoying for such a small amount of code, so we just inline it. If this file
103
+ // grows, and there are more such functions, we should break them out into a
104
+ // .cpp file.
105
+ inline torch::DeviceIndex getNonNegativeDeviceIndex(
106
+ const torch::Device& device) {
107
+ torch::DeviceIndex deviceIndex = device.index();
108
+ // For single GPU machines libtorch returns -1 for the device index. So for
109
+ // that case we set the device index to 0. That's used in per-gpu cache
110
+ // implementation and during initialization of CUDA and FFmpeg contexts
111
+ // which require non negative indices.
112
+ deviceIndex = std::max<at::DeviceIndex>(deviceIndex, 0);
113
+ TORCH_CHECK(deviceIndex >= 0, "Device index out of range");
114
+ return deviceIndex;
115
+ }
116
+
117
+ template <typename T, typename D>
118
+ bool PerGpuCache<T, D>::addIfCacheHasCapacity(
119
+ const torch::Device& device,
120
+ element_type&& obj) {
121
+ torch::DeviceIndex deviceIndex = getNonNegativeDeviceIndex(device);
122
+ TORCH_CHECK(
123
+ static_cast<size_t>(deviceIndex) < cache_.size(),
124
+ "Device index out of range");
125
+ return cache_[deviceIndex]->addIfCacheHasCapacity(std::move(obj));
126
+ }
127
+
128
+ template <typename T, typename D>
129
+ typename PerGpuCache<T, D>::element_type PerGpuCache<T, D>::get(
130
+ const torch::Device& device) {
131
+ torch::DeviceIndex deviceIndex = getNonNegativeDeviceIndex(device);
132
+ TORCH_CHECK(
133
+ static_cast<size_t>(deviceIndex) < cache_.size(),
134
+ "Device index out of range");
135
+ return cache_[deviceIndex]->get();
136
+ }
137
+
138
+ } // namespace facebook::torchcodec