torchcodec 0.7.0__cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of torchcodec might be problematic. Click here for more details.

Files changed (67) hide show
  1. torchcodec/__init__.py +16 -0
  2. torchcodec/_core/AVIOContextHolder.cpp +60 -0
  3. torchcodec/_core/AVIOContextHolder.h +64 -0
  4. torchcodec/_core/AVIOFileLikeContext.cpp +98 -0
  5. torchcodec/_core/AVIOFileLikeContext.h +55 -0
  6. torchcodec/_core/AVIOTensorContext.cpp +123 -0
  7. torchcodec/_core/AVIOTensorContext.h +43 -0
  8. torchcodec/_core/CMakeLists.txt +292 -0
  9. torchcodec/_core/Cache.h +138 -0
  10. torchcodec/_core/CpuDeviceInterface.cpp +266 -0
  11. torchcodec/_core/CpuDeviceInterface.h +70 -0
  12. torchcodec/_core/CudaDeviceInterface.cpp +514 -0
  13. torchcodec/_core/CudaDeviceInterface.h +37 -0
  14. torchcodec/_core/DeviceInterface.cpp +79 -0
  15. torchcodec/_core/DeviceInterface.h +67 -0
  16. torchcodec/_core/Encoder.cpp +514 -0
  17. torchcodec/_core/Encoder.h +123 -0
  18. torchcodec/_core/FFMPEGCommon.cpp +421 -0
  19. torchcodec/_core/FFMPEGCommon.h +227 -0
  20. torchcodec/_core/FilterGraph.cpp +142 -0
  21. torchcodec/_core/FilterGraph.h +45 -0
  22. torchcodec/_core/Frame.cpp +32 -0
  23. torchcodec/_core/Frame.h +118 -0
  24. torchcodec/_core/Metadata.h +72 -0
  25. torchcodec/_core/SingleStreamDecoder.cpp +1715 -0
  26. torchcodec/_core/SingleStreamDecoder.h +380 -0
  27. torchcodec/_core/StreamOptions.h +53 -0
  28. torchcodec/_core/ValidationUtils.cpp +35 -0
  29. torchcodec/_core/ValidationUtils.h +21 -0
  30. torchcodec/_core/__init__.py +40 -0
  31. torchcodec/_core/_metadata.py +317 -0
  32. torchcodec/_core/custom_ops.cpp +727 -0
  33. torchcodec/_core/fetch_and_expose_non_gpl_ffmpeg_libs.cmake +300 -0
  34. torchcodec/_core/ops.py +455 -0
  35. torchcodec/_core/pybind_ops.cpp +87 -0
  36. torchcodec/_frame.py +145 -0
  37. torchcodec/_internally_replaced_utils.py +67 -0
  38. torchcodec/_samplers/__init__.py +7 -0
  39. torchcodec/_samplers/video_clip_sampler.py +430 -0
  40. torchcodec/decoders/__init__.py +11 -0
  41. torchcodec/decoders/_audio_decoder.py +177 -0
  42. torchcodec/decoders/_decoder_utils.py +52 -0
  43. torchcodec/decoders/_video_decoder.py +464 -0
  44. torchcodec/encoders/__init__.py +1 -0
  45. torchcodec/encoders/_audio_encoder.py +150 -0
  46. torchcodec/libtorchcodec_core4.dll +0 -0
  47. torchcodec/libtorchcodec_core5.dll +0 -0
  48. torchcodec/libtorchcodec_core6.dll +0 -0
  49. torchcodec/libtorchcodec_core7.dll +0 -0
  50. torchcodec/libtorchcodec_custom_ops4.dll +0 -0
  51. torchcodec/libtorchcodec_custom_ops5.dll +0 -0
  52. torchcodec/libtorchcodec_custom_ops6.dll +0 -0
  53. torchcodec/libtorchcodec_custom_ops7.dll +0 -0
  54. torchcodec/libtorchcodec_pybind_ops4.pyd +0 -0
  55. torchcodec/libtorchcodec_pybind_ops5.pyd +0 -0
  56. torchcodec/libtorchcodec_pybind_ops6.pyd +0 -0
  57. torchcodec/libtorchcodec_pybind_ops7.pyd +0 -0
  58. torchcodec/samplers/__init__.py +2 -0
  59. torchcodec/samplers/_common.py +84 -0
  60. torchcodec/samplers/_index_based.py +287 -0
  61. torchcodec/samplers/_time_based.py +350 -0
  62. torchcodec/version.py +2 -0
  63. torchcodec-0.7.0.dist-info/METADATA +242 -0
  64. torchcodec-0.7.0.dist-info/RECORD +67 -0
  65. torchcodec-0.7.0.dist-info/WHEEL +5 -0
  66. torchcodec-0.7.0.dist-info/licenses/LICENSE +28 -0
  67. torchcodec-0.7.0.dist-info/top_level.txt +2 -0
@@ -0,0 +1,70 @@
1
+ // Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ // All rights reserved.
3
+ //
4
+ // This source code is licensed under the BSD-style license found in the
5
+ // LICENSE file in the root directory of this source tree.
6
+
7
+ #pragma once
8
+
9
+ #include "src/torchcodec/_core/DeviceInterface.h"
10
+ #include "src/torchcodec/_core/FFMPEGCommon.h"
11
+ #include "src/torchcodec/_core/FilterGraph.h"
12
+
13
+ namespace facebook::torchcodec {
14
+
15
+ class CpuDeviceInterface : public DeviceInterface {
16
+ public:
17
+ CpuDeviceInterface(const torch::Device& device);
18
+
19
+ virtual ~CpuDeviceInterface() {}
20
+
21
+ std::optional<const AVCodec*> findCodec(
22
+ [[maybe_unused]] const AVCodecID& codecId) override {
23
+ return std::nullopt;
24
+ }
25
+
26
+ void initializeContext(
27
+ [[maybe_unused]] AVCodecContext* codecContext) override {}
28
+
29
+ void convertAVFrameToFrameOutput(
30
+ const VideoStreamOptions& videoStreamOptions,
31
+ const AVRational& timeBase,
32
+ UniqueAVFrame& avFrame,
33
+ FrameOutput& frameOutput,
34
+ std::optional<torch::Tensor> preAllocatedOutputTensor =
35
+ std::nullopt) override;
36
+
37
+ private:
38
+ int convertAVFrameToTensorUsingSwsScale(
39
+ const UniqueAVFrame& avFrame,
40
+ torch::Tensor& outputTensor);
41
+
42
+ torch::Tensor convertAVFrameToTensorUsingFilterGraph(
43
+ const UniqueAVFrame& avFrame);
44
+
45
+ struct SwsFrameContext {
46
+ int inputWidth;
47
+ int inputHeight;
48
+ AVPixelFormat inputFormat;
49
+ int outputWidth;
50
+ int outputHeight;
51
+ bool operator==(const SwsFrameContext&) const;
52
+ bool operator!=(const SwsFrameContext&) const;
53
+ };
54
+
55
+ void createSwsContext(
56
+ const SwsFrameContext& swsFrameContext,
57
+ const enum AVColorSpace colorspace);
58
+
59
+ // color-conversion fields. Only one of FilterGraphContext and
60
+ // UniqueSwsContext should be non-null.
61
+ std::unique_ptr<FilterGraph> filterGraphContext_;
62
+ UniqueSwsContext swsContext_;
63
+
64
+ // Used to know whether a new FilterGraphContext or UniqueSwsContext should
65
+ // be created before decoding a new frame.
66
+ SwsFrameContext prevSwsFrameContext_;
67
+ FiltersContext prevFiltersContext_;
68
+ };
69
+
70
+ } // namespace facebook::torchcodec
@@ -0,0 +1,514 @@
1
+ #include <ATen/cuda/CUDAEvent.h>
2
+ #include <c10/cuda/CUDAStream.h>
3
+ #include <npp.h>
4
+ #include <torch/types.h>
5
+ #include <mutex>
6
+
7
+ #include "src/torchcodec/_core/Cache.h"
8
+ #include "src/torchcodec/_core/CudaDeviceInterface.h"
9
+ #include "src/torchcodec/_core/FFMPEGCommon.h"
10
+
11
+ extern "C" {
12
+ #include <libavutil/hwcontext_cuda.h>
13
+ #include <libavutil/pixdesc.h>
14
+ }
15
+
16
+ namespace facebook::torchcodec {
17
+ namespace {
18
+
19
+ static bool g_cuda =
20
+ registerDeviceInterface(torch::kCUDA, [](const torch::Device& device) {
21
+ return new CudaDeviceInterface(device);
22
+ });
23
+
24
+ // BT.709 full range color conversion matrix for YUV to RGB conversion.
25
+ // See Note [YUV -> RGB Color Conversion, color space and color range] below.
26
+ constexpr Npp32f bt709FullRangeColorTwist[3][4] = {
27
+ {1.0f, 0.0f, 1.5748f, 0.0f},
28
+ {1.0f, -0.187324273f, -0.468124273f, -128.0f},
29
+ {1.0f, 1.8556f, 0.0f, -128.0f}};
30
+
31
+ // We reuse cuda contexts across VideoDeoder instances. This is because
32
+ // creating a cuda context is expensive. The cache mechanism is as follows:
33
+ // 1. There is a cache of size MAX_CONTEXTS_PER_GPU_IN_CACHE cuda contexts for
34
+ // each GPU.
35
+ // 2. When we destroy a SingleStreamDecoder instance we release the cuda context
36
+ // to
37
+ // the cache if the cache is not full.
38
+ // 3. When we create a SingleStreamDecoder instance we try to get a cuda context
39
+ // from
40
+ // the cache. If the cache is empty we create a new cuda context.
41
+
42
+ // Pytorch can only handle up to 128 GPUs.
43
+ // https://github.com/pytorch/pytorch/blob/e30c55ee527b40d67555464b9e402b4b7ce03737/c10/cuda/CUDAMacros.h#L44
44
+ const int MAX_CUDA_GPUS = 128;
45
+ // Set to -1 to have an infinitely sized cache. Set it to 0 to disable caching.
46
+ // Set to a positive number to have a cache of that size.
47
+ const int MAX_CONTEXTS_PER_GPU_IN_CACHE = -1;
48
+ PerGpuCache<AVBufferRef, Deleterp<AVBufferRef, void, av_buffer_unref>>
49
+ g_cached_hw_device_ctxs(MAX_CUDA_GPUS, MAX_CONTEXTS_PER_GPU_IN_CACHE);
50
+ PerGpuCache<NppStreamContext> g_cached_npp_ctxs(
51
+ MAX_CUDA_GPUS,
52
+ MAX_CONTEXTS_PER_GPU_IN_CACHE);
53
+
54
+ #if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(58, 26, 100)
55
+
56
+ AVBufferRef* getFFMPEGContextFromExistingCudaContext(
57
+ const torch::Device& device,
58
+ torch::DeviceIndex nonNegativeDeviceIndex,
59
+ enum AVHWDeviceType type) {
60
+ c10::cuda::CUDAGuard deviceGuard(device);
61
+ // Valid values for the argument to cudaSetDevice are 0 to maxDevices - 1:
62
+ // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html#group__CUDART__DEVICE_1g159587909ffa0791bbe4b40187a4c6bb
63
+ // So we ensure the deviceIndex is not negative.
64
+ // We set the device because we may be called from a different thread than
65
+ // the one that initialized the cuda context.
66
+ cudaSetDevice(nonNegativeDeviceIndex);
67
+ AVBufferRef* hw_device_ctx = nullptr;
68
+ std::string deviceOrdinal = std::to_string(nonNegativeDeviceIndex);
69
+ int err = av_hwdevice_ctx_create(
70
+ &hw_device_ctx,
71
+ type,
72
+ deviceOrdinal.c_str(),
73
+ nullptr,
74
+ AV_CUDA_USE_CURRENT_CONTEXT);
75
+ if (err < 0) {
76
+ /* clang-format off */
77
+ TORCH_CHECK(
78
+ false,
79
+ "Failed to create specified HW device. This typically happens when ",
80
+ "your installed FFmpeg doesn't support CUDA (see ",
81
+ "https://github.com/pytorch/torchcodec#installing-cuda-enabled-torchcodec",
82
+ "). FFmpeg error: ", getFFMPEGErrorStringFromErrorCode(err));
83
+ /* clang-format on */
84
+ }
85
+ return hw_device_ctx;
86
+ }
87
+
88
+ #else
89
+
90
+ AVBufferRef* getFFMPEGContextFromNewCudaContext(
91
+ [[maybe_unused]] const torch::Device& device,
92
+ torch::DeviceIndex nonNegativeDeviceIndex,
93
+ enum AVHWDeviceType type) {
94
+ AVBufferRef* hw_device_ctx = nullptr;
95
+ std::string deviceOrdinal = std::to_string(nonNegativeDeviceIndex);
96
+ int err = av_hwdevice_ctx_create(
97
+ &hw_device_ctx, type, deviceOrdinal.c_str(), nullptr, 0);
98
+ if (err < 0) {
99
+ TORCH_CHECK(
100
+ false,
101
+ "Failed to create specified HW device",
102
+ getFFMPEGErrorStringFromErrorCode(err));
103
+ }
104
+ return hw_device_ctx;
105
+ }
106
+
107
+ #endif
108
+
109
+ UniqueAVBufferRef getCudaContext(const torch::Device& device) {
110
+ enum AVHWDeviceType type = av_hwdevice_find_type_by_name("cuda");
111
+ TORCH_CHECK(type != AV_HWDEVICE_TYPE_NONE, "Failed to find cuda device");
112
+ torch::DeviceIndex nonNegativeDeviceIndex = getNonNegativeDeviceIndex(device);
113
+
114
+ UniqueAVBufferRef hw_device_ctx = g_cached_hw_device_ctxs.get(device);
115
+ if (hw_device_ctx) {
116
+ return hw_device_ctx;
117
+ }
118
+
119
+ // 58.26.100 introduced the concept of reusing the existing cuda context
120
+ // which is much faster and lower memory than creating a new cuda context.
121
+ // So we try to use that if it is available.
122
+ // FFMPEG 6.1.2 appears to be the earliest release that contains version
123
+ // 58.26.100 of avutil.
124
+ // https://github.com/FFmpeg/FFmpeg/blob/4acb9b7d1046944345ae506165fb55883d04d8a6/doc/APIchanges#L265
125
+ #if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(58, 26, 100)
126
+ return UniqueAVBufferRef(getFFMPEGContextFromExistingCudaContext(
127
+ device, nonNegativeDeviceIndex, type));
128
+ #else
129
+ return UniqueAVBufferRef(
130
+ getFFMPEGContextFromNewCudaContext(device, nonNegativeDeviceIndex, type));
131
+ #endif
132
+ }
133
+
134
+ std::unique_ptr<NppStreamContext> getNppStreamContext(
135
+ const torch::Device& device) {
136
+ torch::DeviceIndex nonNegativeDeviceIndex = getNonNegativeDeviceIndex(device);
137
+
138
+ std::unique_ptr<NppStreamContext> nppCtx = g_cached_npp_ctxs.get(device);
139
+ if (nppCtx) {
140
+ return nppCtx;
141
+ }
142
+
143
+ // From 12.9, NPP recommends using a user-created NppStreamContext and using
144
+ // the `_Ctx()` calls:
145
+ // https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#npp-release-12-9-update-1
146
+ // And the nppGetStreamContext() helper is deprecated. We are explicitly
147
+ // supposed to create the NppStreamContext manually from the CUDA device
148
+ // properties:
149
+ // https://github.com/NVIDIA/CUDALibrarySamples/blob/d97803a40fab83c058bb3d68b6c38bd6eebfff43/NPP/README.md?plain=1#L54-L72
150
+
151
+ nppCtx = std::make_unique<NppStreamContext>();
152
+ cudaDeviceProp prop{};
153
+ cudaError_t err = cudaGetDeviceProperties(&prop, nonNegativeDeviceIndex);
154
+ TORCH_CHECK(
155
+ err == cudaSuccess,
156
+ "cudaGetDeviceProperties failed: ",
157
+ cudaGetErrorString(err));
158
+
159
+ nppCtx->nCudaDeviceId = nonNegativeDeviceIndex;
160
+ nppCtx->nMultiProcessorCount = prop.multiProcessorCount;
161
+ nppCtx->nMaxThreadsPerMultiProcessor = prop.maxThreadsPerMultiProcessor;
162
+ nppCtx->nMaxThreadsPerBlock = prop.maxThreadsPerBlock;
163
+ nppCtx->nSharedMemPerBlock = prop.sharedMemPerBlock;
164
+ nppCtx->nCudaDevAttrComputeCapabilityMajor = prop.major;
165
+ nppCtx->nCudaDevAttrComputeCapabilityMinor = prop.minor;
166
+
167
+ return nppCtx;
168
+ }
169
+
170
+ } // namespace
171
+
172
+ CudaDeviceInterface::CudaDeviceInterface(const torch::Device& device)
173
+ : DeviceInterface(device) {
174
+ TORCH_CHECK(g_cuda, "CudaDeviceInterface was not registered!");
175
+ TORCH_CHECK(
176
+ device_.type() == torch::kCUDA, "Unsupported device: ", device_.str());
177
+ }
178
+
179
+ CudaDeviceInterface::~CudaDeviceInterface() {
180
+ if (ctx_) {
181
+ g_cached_hw_device_ctxs.addIfCacheHasCapacity(device_, std::move(ctx_));
182
+ }
183
+ if (nppCtx_) {
184
+ g_cached_npp_ctxs.addIfCacheHasCapacity(device_, std::move(nppCtx_));
185
+ }
186
+ }
187
+
188
+ void CudaDeviceInterface::initializeContext(AVCodecContext* codecContext) {
189
+ TORCH_CHECK(!ctx_, "FFmpeg HW device context already initialized");
190
+
191
+ // It is important for pytorch itself to create the cuda context. If ffmpeg
192
+ // creates the context it may not be compatible with pytorch.
193
+ // This is a dummy tensor to initialize the cuda context.
194
+ torch::Tensor dummyTensorForCudaInitialization = torch::empty(
195
+ {1}, torch::TensorOptions().dtype(torch::kUInt8).device(device_));
196
+ ctx_ = getCudaContext(device_);
197
+ nppCtx_ = getNppStreamContext(device_);
198
+ codecContext->hw_device_ctx = av_buffer_ref(ctx_.get());
199
+ return;
200
+ }
201
+
202
+ void CudaDeviceInterface::convertAVFrameToFrameOutput(
203
+ const VideoStreamOptions& videoStreamOptions,
204
+ [[maybe_unused]] const AVRational& timeBase,
205
+ UniqueAVFrame& avFrame,
206
+ FrameOutput& frameOutput,
207
+ std::optional<torch::Tensor> preAllocatedOutputTensor) {
208
+ if (avFrame->format != AV_PIX_FMT_CUDA) {
209
+ // The frame's format is AV_PIX_FMT_CUDA if and only if its content is on
210
+ // the GPU. In this branch, the frame is on the CPU: this is what NVDEC
211
+ // gives us if it wasn't able to decode a frame, for whatever reason.
212
+ // Typically that happens if the video's encoder isn't supported by NVDEC.
213
+ // Below, we choose to convert the frame's color-space using the CPU
214
+ // codepath, and send it back to the GPU at the very end.
215
+ // TODO: A possibly better solution would be to send the frame to the GPU
216
+ // first, and do the color conversion there.
217
+ auto cpuDevice = torch::Device(torch::kCPU);
218
+ auto cpuInterface = createDeviceInterface(cpuDevice);
219
+
220
+ FrameOutput cpuFrameOutput;
221
+ cpuInterface->convertAVFrameToFrameOutput(
222
+ videoStreamOptions,
223
+ timeBase,
224
+ avFrame,
225
+ cpuFrameOutput,
226
+ preAllocatedOutputTensor);
227
+
228
+ frameOutput.data = cpuFrameOutput.data.to(device_);
229
+ return;
230
+ }
231
+
232
+ // Above we checked that the AVFrame was on GPU, but that's not enough, we
233
+ // also need to check that the AVFrame is in AV_PIX_FMT_NV12 format (8 bits),
234
+ // because this is what the NPP color conversion routines expect.
235
+ // TODO: we should investigate how to can perform color conversion for
236
+ // non-8bit videos. This is supported on CPU.
237
+ TORCH_CHECK(
238
+ avFrame->hw_frames_ctx != nullptr,
239
+ "The AVFrame does not have a hw_frames_ctx. "
240
+ "That's unexpected, please report this to the TorchCodec repo.");
241
+
242
+ auto hwFramesCtx =
243
+ reinterpret_cast<AVHWFramesContext*>(avFrame->hw_frames_ctx->data);
244
+ AVPixelFormat actualFormat = hwFramesCtx->sw_format;
245
+ TORCH_CHECK(
246
+ actualFormat == AV_PIX_FMT_NV12,
247
+ "The AVFrame is ",
248
+ (av_get_pix_fmt_name(actualFormat) ? av_get_pix_fmt_name(actualFormat)
249
+ : "unknown"),
250
+ ", but we expected AV_PIX_FMT_NV12. This typically happens when "
251
+ "the video isn't 8bit, which is not supported on CUDA at the moment. "
252
+ "Try using the CPU device instead. "
253
+ "If the video is 10bit, we are tracking 10bit support in "
254
+ "https://github.com/pytorch/torchcodec/issues/776");
255
+
256
+ auto frameDims =
257
+ getHeightAndWidthFromOptionsOrAVFrame(videoStreamOptions, avFrame);
258
+ int height = frameDims.height;
259
+ int width = frameDims.width;
260
+ torch::Tensor& dst = frameOutput.data;
261
+ if (preAllocatedOutputTensor.has_value()) {
262
+ dst = preAllocatedOutputTensor.value();
263
+ auto shape = dst.sizes();
264
+ TORCH_CHECK(
265
+ (shape.size() == 3) && (shape[0] == height) && (shape[1] == width) &&
266
+ (shape[2] == 3),
267
+ "Expected tensor of shape ",
268
+ height,
269
+ "x",
270
+ width,
271
+ "x3, got ",
272
+ shape);
273
+ } else {
274
+ dst = allocateEmptyHWCTensor(height, width, device_);
275
+ }
276
+
277
+ torch::DeviceIndex deviceIndex = getNonNegativeDeviceIndex(device_);
278
+
279
+ // Create a CUDA event and attach it to the AVFrame's CUDA stream. That's the
280
+ // NVDEC stream, i.e. the CUDA stream that the frame was decoded on.
281
+ // We will be waiting for this event to complete before calling the NPP
282
+ // functions, to ensure NVDEC has finished decoding the frame before running
283
+ // the NPP color-conversion.
284
+ // Note that our code is generic and assumes that the NVDEC's stream can be
285
+ // arbitrary, but unfortunately we know it's hardcoded to be the default
286
+ // stream by FFmpeg:
287
+ // https://github.com/FFmpeg/FFmpeg/blob/66e40840d15b514f275ce3ce2a4bf72ec68c7311/libavutil/hwcontext_cuda.c#L387-L388
288
+ TORCH_CHECK(
289
+ hwFramesCtx->device_ctx != nullptr,
290
+ "The AVFrame's hw_frames_ctx does not have a device_ctx. ");
291
+ auto cudaDeviceCtx =
292
+ static_cast<AVCUDADeviceContext*>(hwFramesCtx->device_ctx->hwctx);
293
+ at::cuda::CUDAEvent nvdecDoneEvent;
294
+ at::cuda::CUDAStream nvdecStream = // That's always the default stream. Sad.
295
+ c10::cuda::getStreamFromExternal(cudaDeviceCtx->stream, deviceIndex);
296
+ nvdecDoneEvent.record(nvdecStream);
297
+
298
+ // Don't start NPP work before NVDEC is done decoding the frame!
299
+ at::cuda::CUDAStream nppStream = at::cuda::getCurrentCUDAStream(deviceIndex);
300
+ nvdecDoneEvent.block(nppStream);
301
+
302
+ // Create the NPP context if we haven't yet.
303
+ nppCtx_->hStream = nppStream.stream();
304
+ cudaError_t err =
305
+ cudaStreamGetFlags(nppCtx_->hStream, &nppCtx_->nStreamFlags);
306
+ TORCH_CHECK(
307
+ err == cudaSuccess,
308
+ "cudaStreamGetFlags failed: ",
309
+ cudaGetErrorString(err));
310
+
311
+ NppiSize oSizeROI = {width, height};
312
+ Npp8u* yuvData[2] = {avFrame->data[0], avFrame->data[1]};
313
+
314
+ NppStatus status;
315
+
316
+ // For background, see
317
+ // Note [YUV -> RGB Color Conversion, color space and color range]
318
+ if (avFrame->colorspace == AVColorSpace::AVCOL_SPC_BT709) {
319
+ if (avFrame->color_range == AVColorRange::AVCOL_RANGE_JPEG) {
320
+ // NPP provides a pre-defined color conversion function for BT.709 full
321
+ // range: nppiNV12ToRGB_709HDTV_8u_P2C3R_Ctx. But it's not closely
322
+ // matching the results we have on CPU. So we're using a custom color
323
+ // conversion matrix, which provides more accurate results. See the note
324
+ // mentioned above for details, and headaches.
325
+
326
+ int srcStep[2] = {avFrame->linesize[0], avFrame->linesize[1]};
327
+
328
+ status = nppiNV12ToRGB_8u_ColorTwist32f_P2C3R_Ctx(
329
+ yuvData,
330
+ srcStep,
331
+ static_cast<Npp8u*>(dst.data_ptr()),
332
+ dst.stride(0),
333
+ oSizeROI,
334
+ bt709FullRangeColorTwist,
335
+ *nppCtx_);
336
+ } else {
337
+ // If not full range, we assume studio limited range.
338
+ // The color conversion matrix for BT.709 limited range should be:
339
+ // static const Npp32f bt709LimitedRangeColorTwist[3][4] = {
340
+ // {1.16438356f, 0.0f, 1.79274107f, -16.0f},
341
+ // {1.16438356f, -0.213248614f, -0.5329093290f, -128.0f},
342
+ // {1.16438356f, 2.11240179f, 0.0f, -128.0f}
343
+ // };
344
+ // We get very close results to CPU with that, but using the pre-defined
345
+ // nppiNV12ToRGB_709CSC_8u_P2C3R_Ctx seems to be even more accurate.
346
+ status = nppiNV12ToRGB_709CSC_8u_P2C3R_Ctx(
347
+ yuvData,
348
+ avFrame->linesize[0],
349
+ static_cast<Npp8u*>(dst.data_ptr()),
350
+ dst.stride(0),
351
+ oSizeROI,
352
+ *nppCtx_);
353
+ }
354
+ } else {
355
+ // TODO we're assuming BT.601 color space (and probably limited range) by
356
+ // calling nppiNV12ToRGB_8u_P2C3R_Ctx. We should handle BT.601 full range,
357
+ // and other color-spaces like 2020.
358
+ status = nppiNV12ToRGB_8u_P2C3R_Ctx(
359
+ yuvData,
360
+ avFrame->linesize[0],
361
+ static_cast<Npp8u*>(dst.data_ptr()),
362
+ dst.stride(0),
363
+ oSizeROI,
364
+ *nppCtx_);
365
+ }
366
+ TORCH_CHECK(status == NPP_SUCCESS, "Failed to convert NV12 frame.");
367
+ }
368
+
369
+ // inspired by https://github.com/FFmpeg/FFmpeg/commit/ad67ea9
370
+ // we have to do this because of an FFmpeg bug where hardware decoding is not
371
+ // appropriately set, so we just go off and find the matching codec for the CUDA
372
+ // device
373
+ std::optional<const AVCodec*> CudaDeviceInterface::findCodec(
374
+ const AVCodecID& codecId) {
375
+ void* i = nullptr;
376
+ const AVCodec* codec = nullptr;
377
+ while ((codec = av_codec_iterate(&i)) != nullptr) {
378
+ if (codec->id != codecId || !av_codec_is_decoder(codec)) {
379
+ continue;
380
+ }
381
+
382
+ const AVCodecHWConfig* config = nullptr;
383
+ for (int j = 0; (config = avcodec_get_hw_config(codec, j)) != nullptr;
384
+ ++j) {
385
+ if (config->device_type == AV_HWDEVICE_TYPE_CUDA) {
386
+ return codec;
387
+ }
388
+ }
389
+ }
390
+
391
+ return std::nullopt;
392
+ }
393
+
394
+ } // namespace facebook::torchcodec
395
+
396
+ /* clang-format off */
397
+ // Note: [YUV -> RGB Color Conversion, color space and color range]
398
+ //
399
+ // The frames we get from the decoder (FFmpeg decoder, or NVCUVID) are in YUV
400
+ // format. We need to convert them to RGB. This note attempts to describe this
401
+ // process. There may be some inaccuracies and approximations that experts will
402
+ // notice, but our goal is only to provide a good enough understanding of the
403
+ // process for torchcodec developers to implement and maintain it.
404
+ // On CPU, filtergraph and swscale handle everything for us. With CUDA, we have
405
+ // to do a lot of the heavy lifting ourselves.
406
+ //
407
+ // Color space and color range
408
+ // ---------------------------
409
+ // Two main characteristics of a frame will affect the conversion process:
410
+ // 1. Color space: This basically defines what YUV values correspond to which
411
+ // physical wavelength. No need to go into details here,the point is that
412
+ // videos can come in different color spaces, the most common ones being
413
+ // BT.601 and BT.709, but there are others.
414
+ // In FFmpeg this is represented with AVColorSpace:
415
+ // https://ffmpeg.org/doxygen/4.0/pixfmt_8h.html#aff71a069509a1ad3ff54d53a1c894c85
416
+ // 2. Color range: This defines the range of YUV values. There is:
417
+ // - full range, also called PC range: AVCOL_RANGE_JPEG
418
+ // - and the "limited" range, also called studio or TV range: AVCOL_RANGE_MPEG
419
+ // https://ffmpeg.org/doxygen/4.0/pixfmt_8h.html#a3da0bf691418bc22c4bcbe6583ad589a
420
+ //
421
+ // Color space and color range are independent concepts, so we can have a BT.709
422
+ // with full range, and another one with limited range. Same for BT.601.
423
+ //
424
+ // In the first version of this note we'll focus on the full color range. It
425
+ // will later be updated to account for the limited range.
426
+ //
427
+ // Color conversion matrix
428
+ // -----------------------
429
+ // YUV -> RGB conversion is defined as the reverse process of the RGB -> YUV,
430
+ // So this is where we'll start.
431
+ // At the core of a RGB -> YUV conversion are the "luma coefficients", which are
432
+ // specific to a given color space and defined by the color space standard. In
433
+ // FFmpeg they can be found here:
434
+ // https://github.com/FFmpeg/FFmpeg/blob/7d606ef0ccf2946a4a21ab1ec23486cadc21864b/libavutil/csp.c#L46-L56
435
+ //
436
+ // For example, the BT.709 coefficients are: kr=0.2126, kg=0.7152, kb=0.0722
437
+ // Coefficients must sum to 1.
438
+ //
439
+ // Conventionally Y is in [0, 1] range, and U and V are in [-0.5, 0.5] range
440
+ // (that's mathematically, in practice they are represented in integer range).
441
+ // The conversion is defined as:
442
+ // https://en.wikipedia.org/wiki/YCbCr#R'G'B'_to_Y%E2%80%B2PbPr
443
+ // Y = kr*R + kg*G + kb*B
444
+ // U = (B - Y) * 0.5 / (1 - kb) = (B - Y) / u_scale where u_scale = 2 * (1 - kb)
445
+ // V = (R - Y) * 0.5 / (1 - kr) = (R - Y) / v_scale where v_scale = 2 * (1 - kr)
446
+ //
447
+ // Putting all this into matrix form, we get:
448
+ // [Y] = [kr kg kb ] [R]
449
+ // [U] [-kr/u_scale -kg/u_scale (1-kb)/u_scale] [G]
450
+ // [V] [(1-kr)/v_scale -kg/v_scale -kb)/v_scale ] [B]
451
+ //
452
+ //
453
+ // Now, to convert YUV to RGB, we just need to invert this matrix:
454
+ // ```py
455
+ // import torch
456
+ // kr, kg, kb = 0.2126, 0.7152, 0.0722 # BT.709 luma coefficients
457
+ // u_scale = 2 * (1 - kb)
458
+ // v_scale = 2 * (1 - kr)
459
+ //
460
+ // rgb_to_yuv = torch.tensor([
461
+ // [kr, kg, kb],
462
+ // [-kr/u_scale, -kg/u_scale, (1-kb)/u_scale],
463
+ // [(1-kr)/v_scale, -kg/v_scale, -kb/v_scale]
464
+ // ])
465
+ //
466
+ // yuv_to_rgb_full = torch.linalg.inv(rgb_to_yuv)
467
+ // print("YUV->RGB matrix (Full Range):")
468
+ // print(yuv_to_rgb_full)
469
+ // ```
470
+ // And we get:
471
+ // tensor([[ 1.0000e+00, -3.3142e-09, 1.5748e+00],
472
+ // [ 1.0000e+00, -1.8732e-01, -4.6812e-01],
473
+ // [ 1.0000e+00, 1.8556e+00, 4.6231e-09]])
474
+ //
475
+ // Which matches https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.709_conversion
476
+ //
477
+ // Color conversion in NPP
478
+ // -----------------------
479
+ // https://docs.nvidia.com/cuda/npp/image_color_conversion.html.
480
+ //
481
+ // NPP provides different ways to convert YUV to RGB:
482
+ // - pre-defined color conversion functions like
483
+ // nppiNV12ToRGB_709CSC_8u_P2C3R_Ctx and nppiNV12ToRGB_709HDTV_8u_P2C3R_Ctx
484
+ // which are for BT.709 limited and full range, respectively.
485
+ // - generic color conversion functions that accept a custom color conversion
486
+ // matrix, called ColorTwist, like nppiNV12ToRGB_8u_ColorTwist32f_P2C3R_Ctx
487
+ //
488
+ // We use the pre-defined functions or the color twist functions depending on
489
+ // which one we find to be closer to the CPU results.
490
+ //
491
+ // The color twist functionality is *partially* described in a section named
492
+ // "YUVToRGBColorTwist". Importantly:
493
+ //
494
+ // - The `nppiNV12ToRGB_8u_ColorTwist32f_P2C3R_Ctx` function takes the YUV data
495
+ // and the color-conversion matrix as input. The function itself and the
496
+ // matrix assume different ranges for YUV values:
497
+ // - The **matrix coefficient** must assume that Y is in [0, 1] and U,V are in
498
+ // [-0.5, 0.5]. That's how we defined our matrix above.
499
+ // - The function `nppiNV12ToRGB_8u_ColorTwist32f_P2C3R_Ctx` however expects all
500
+ // of the input Y, U, V to be in [0, 255]. That's how the data comes out of
501
+ // the decoder.
502
+ // - But *internally*, `nppiNV12ToRGB_8u_ColorTwist32f_P2C3R_Ctx` needs U and V to
503
+ // be centered around 0, i.e. in [-128, 127]. So we need to apply a -128
504
+ // offset to U and V. Y doesn't need to be offset. The offset can be applied
505
+ // by adding a 4th column to the matrix.
506
+ //
507
+ //
508
+ // So our conversion matrix becomes the following, with new offset column:
509
+ // tensor([[ 1.0000e+00, -3.3142e-09, 1.5748e+00, 0]
510
+ // [ 1.0000e+00, -1.8732e-01, -4.6812e-01, -128]
511
+ // [ 1.0000e+00, 1.8556e+00, 4.6231e-09 , -128]])
512
+ //
513
+ // And that's what we need to pass for BT701, full range.
514
+ /* clang-format on */
@@ -0,0 +1,37 @@
1
+ // Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ // All rights reserved.
3
+ //
4
+ // This source code is licensed under the BSD-style license found in the
5
+ // LICENSE file in the root directory of this source tree.
6
+
7
+ #pragma once
8
+
9
+ #include <npp.h>
10
+ #include "src/torchcodec/_core/DeviceInterface.h"
11
+
12
+ namespace facebook::torchcodec {
13
+
14
+ class CudaDeviceInterface : public DeviceInterface {
15
+ public:
16
+ CudaDeviceInterface(const torch::Device& device);
17
+
18
+ virtual ~CudaDeviceInterface();
19
+
20
+ std::optional<const AVCodec*> findCodec(const AVCodecID& codecId) override;
21
+
22
+ void initializeContext(AVCodecContext* codecContext) override;
23
+
24
+ void convertAVFrameToFrameOutput(
25
+ const VideoStreamOptions& videoStreamOptions,
26
+ const AVRational& timeBase,
27
+ UniqueAVFrame& avFrame,
28
+ FrameOutput& frameOutput,
29
+ std::optional<torch::Tensor> preAllocatedOutputTensor =
30
+ std::nullopt) override;
31
+
32
+ private:
33
+ UniqueAVBufferRef ctx_;
34
+ std::unique_ptr<NppStreamContext> nppCtx_;
35
+ };
36
+
37
+ } // namespace facebook::torchcodec