torchcodec 0.10.0__cp312-cp312-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. torchcodec/__init__.py +27 -0
  2. torchcodec/_core/AVIOContextHolder.cpp +60 -0
  3. torchcodec/_core/AVIOContextHolder.h +64 -0
  4. torchcodec/_core/AVIOFileLikeContext.cpp +98 -0
  5. torchcodec/_core/AVIOFileLikeContext.h +55 -0
  6. torchcodec/_core/AVIOTensorContext.cpp +130 -0
  7. torchcodec/_core/AVIOTensorContext.h +44 -0
  8. torchcodec/_core/BetaCudaDeviceInterface.cpp +849 -0
  9. torchcodec/_core/BetaCudaDeviceInterface.h +196 -0
  10. torchcodec/_core/CMakeLists.txt +295 -0
  11. torchcodec/_core/CUDACommon.cpp +330 -0
  12. torchcodec/_core/CUDACommon.h +51 -0
  13. torchcodec/_core/Cache.h +124 -0
  14. torchcodec/_core/CpuDeviceInterface.cpp +509 -0
  15. torchcodec/_core/CpuDeviceInterface.h +141 -0
  16. torchcodec/_core/CudaDeviceInterface.cpp +602 -0
  17. torchcodec/_core/CudaDeviceInterface.h +79 -0
  18. torchcodec/_core/DeviceInterface.cpp +117 -0
  19. torchcodec/_core/DeviceInterface.h +191 -0
  20. torchcodec/_core/Encoder.cpp +1054 -0
  21. torchcodec/_core/Encoder.h +192 -0
  22. torchcodec/_core/FFMPEGCommon.cpp +684 -0
  23. torchcodec/_core/FFMPEGCommon.h +314 -0
  24. torchcodec/_core/FilterGraph.cpp +159 -0
  25. torchcodec/_core/FilterGraph.h +59 -0
  26. torchcodec/_core/Frame.cpp +47 -0
  27. torchcodec/_core/Frame.h +72 -0
  28. torchcodec/_core/Metadata.cpp +124 -0
  29. torchcodec/_core/Metadata.h +92 -0
  30. torchcodec/_core/NVCUVIDRuntimeLoader.cpp +320 -0
  31. torchcodec/_core/NVCUVIDRuntimeLoader.h +14 -0
  32. torchcodec/_core/NVDECCache.cpp +60 -0
  33. torchcodec/_core/NVDECCache.h +102 -0
  34. torchcodec/_core/SingleStreamDecoder.cpp +1586 -0
  35. torchcodec/_core/SingleStreamDecoder.h +391 -0
  36. torchcodec/_core/StreamOptions.h +70 -0
  37. torchcodec/_core/Transform.cpp +128 -0
  38. torchcodec/_core/Transform.h +86 -0
  39. torchcodec/_core/ValidationUtils.cpp +35 -0
  40. torchcodec/_core/ValidationUtils.h +21 -0
  41. torchcodec/_core/__init__.py +46 -0
  42. torchcodec/_core/_metadata.py +262 -0
  43. torchcodec/_core/custom_ops.cpp +1090 -0
  44. torchcodec/_core/fetch_and_expose_non_gpl_ffmpeg_libs.cmake +169 -0
  45. torchcodec/_core/nvcuvid_include/cuviddec.h +1374 -0
  46. torchcodec/_core/nvcuvid_include/nvcuvid.h +610 -0
  47. torchcodec/_core/ops.py +605 -0
  48. torchcodec/_core/pybind_ops.cpp +50 -0
  49. torchcodec/_frame.py +146 -0
  50. torchcodec/_internally_replaced_utils.py +68 -0
  51. torchcodec/_samplers/__init__.py +7 -0
  52. torchcodec/_samplers/video_clip_sampler.py +419 -0
  53. torchcodec/decoders/__init__.py +12 -0
  54. torchcodec/decoders/_audio_decoder.py +185 -0
  55. torchcodec/decoders/_decoder_utils.py +113 -0
  56. torchcodec/decoders/_video_decoder.py +601 -0
  57. torchcodec/encoders/__init__.py +2 -0
  58. torchcodec/encoders/_audio_encoder.py +149 -0
  59. torchcodec/encoders/_video_encoder.py +196 -0
  60. torchcodec/libtorchcodec_core4.so +0 -0
  61. torchcodec/libtorchcodec_core5.so +0 -0
  62. torchcodec/libtorchcodec_core6.so +0 -0
  63. torchcodec/libtorchcodec_core7.so +0 -0
  64. torchcodec/libtorchcodec_core8.so +0 -0
  65. torchcodec/libtorchcodec_custom_ops4.so +0 -0
  66. torchcodec/libtorchcodec_custom_ops5.so +0 -0
  67. torchcodec/libtorchcodec_custom_ops6.so +0 -0
  68. torchcodec/libtorchcodec_custom_ops7.so +0 -0
  69. torchcodec/libtorchcodec_custom_ops8.so +0 -0
  70. torchcodec/libtorchcodec_pybind_ops4.so +0 -0
  71. torchcodec/libtorchcodec_pybind_ops5.so +0 -0
  72. torchcodec/libtorchcodec_pybind_ops6.so +0 -0
  73. torchcodec/libtorchcodec_pybind_ops7.so +0 -0
  74. torchcodec/libtorchcodec_pybind_ops8.so +0 -0
  75. torchcodec/samplers/__init__.py +2 -0
  76. torchcodec/samplers/_common.py +84 -0
  77. torchcodec/samplers/_index_based.py +287 -0
  78. torchcodec/samplers/_time_based.py +358 -0
  79. torchcodec/share/cmake/TorchCodec/TorchCodecConfig.cmake +76 -0
  80. torchcodec/share/cmake/TorchCodec/ffmpeg_versions.cmake +122 -0
  81. torchcodec/transforms/__init__.py +12 -0
  82. torchcodec/transforms/_decoder_transforms.py +375 -0
  83. torchcodec/version.py +2 -0
  84. torchcodec-0.10.0.dist-info/METADATA +286 -0
  85. torchcodec-0.10.0.dist-info/RECORD +88 -0
  86. torchcodec-0.10.0.dist-info/WHEEL +5 -0
  87. torchcodec-0.10.0.dist-info/licenses/LICENSE +28 -0
  88. torchcodec-0.10.0.dist-info/top_level.txt +2 -0
@@ -0,0 +1,602 @@
1
+ #include <ATen/cuda/CUDAEvent.h>
2
+ #include <c10/cuda/CUDAStream.h>
3
+ #include <torch/types.h>
4
+ #include <mutex>
5
+
6
+ #include "Cache.h"
7
+ #include "CudaDeviceInterface.h"
8
+ #include "FFMPEGCommon.h"
9
+ #include "ValidationUtils.h"
10
+
11
+ extern "C" {
12
+ #include <libavutil/hwcontext_cuda.h>
13
+ #include <libavutil/pixdesc.h>
14
+ }
15
+
16
+ namespace facebook::torchcodec {
17
+ namespace {
18
+
19
+ static bool g_cuda = registerDeviceInterface(
20
+ DeviceInterfaceKey(torch::kCUDA),
21
+ [](const torch::Device& device) {
22
+ return new CudaDeviceInterface(device);
23
+ });
24
+
25
+ // We reuse cuda contexts across VideoDecoder instances. This is because
26
+ // creating a cuda context is expensive. The cache mechanism is as follows:
27
+ // 1. There is a cache of size MAX_CONTEXTS_PER_GPU_IN_CACHE cuda contexts for
28
+ // each GPU.
29
+ // 2. When we destroy a SingleStreamDecoder instance we release the cuda context
30
+ // to
31
+ // the cache if the cache is not full.
32
+ // 3. When we create a SingleStreamDecoder instance we try to get a cuda context
33
+ // from
34
+ // the cache. If the cache is empty we create a new cuda context.
35
+
36
+ // Set to -1 to have an infinitely sized cache. Set it to 0 to disable caching.
37
+ // Set to a positive number to have a cache of that size.
38
+ const int MAX_CONTEXTS_PER_GPU_IN_CACHE = -1;
39
+ PerGpuCache<AVBufferRef, Deleterp<AVBufferRef, void, av_buffer_unref>>
40
+ g_cached_hw_device_ctxs(MAX_CUDA_GPUS, MAX_CONTEXTS_PER_GPU_IN_CACHE);
41
+
42
+ int getFlagsAVHardwareDeviceContextCreate() {
43
+ // 58.26.100 introduced the concept of reusing the existing cuda context
44
+ // which is much faster and lower memory than creating a new cuda context.
45
+ #if LIBAVUTIL_VERSION_INT >= AV_VERSION_INT(58, 26, 100)
46
+ return AV_CUDA_USE_CURRENT_CONTEXT;
47
+ #else
48
+ return 0;
49
+ #endif
50
+ }
51
+
52
+ UniqueAVBufferRef getHardwareDeviceContext(const torch::Device& device) {
53
+ enum AVHWDeviceType type = av_hwdevice_find_type_by_name("cuda");
54
+ TORCH_CHECK(type != AV_HWDEVICE_TYPE_NONE, "Failed to find cuda device");
55
+ int deviceIndex = getDeviceIndex(device);
56
+
57
+ UniqueAVBufferRef hardwareDeviceCtx = g_cached_hw_device_ctxs.get(device);
58
+ if (hardwareDeviceCtx) {
59
+ return hardwareDeviceCtx;
60
+ }
61
+
62
+ // Create hardware device context
63
+ c10::cuda::CUDAGuard deviceGuard(device);
64
+ // We set the device because we may be called from a different thread than
65
+ // the one that initialized the cuda context.
66
+ TORCH_CHECK(
67
+ cudaSetDevice(deviceIndex) == cudaSuccess, "Failed to set CUDA device");
68
+ AVBufferRef* hardwareDeviceCtxRaw = nullptr;
69
+ std::string deviceOrdinal = std::to_string(deviceIndex);
70
+
71
+ int err = av_hwdevice_ctx_create(
72
+ &hardwareDeviceCtxRaw,
73
+ type,
74
+ deviceOrdinal.c_str(),
75
+ nullptr,
76
+ getFlagsAVHardwareDeviceContextCreate());
77
+
78
+ if (err < 0) {
79
+ /* clang-format off */
80
+ TORCH_CHECK(
81
+ false,
82
+ "Failed to create specified HW device. This typically happens when ",
83
+ "your installed FFmpeg doesn't support CUDA (see ",
84
+ "https://github.com/pytorch/torchcodec#installing-cuda-enabled-torchcodec",
85
+ "). FFmpeg error: ", getFFMPEGErrorStringFromErrorCode(err));
86
+ /* clang-format on */
87
+ }
88
+
89
+ return UniqueAVBufferRef(hardwareDeviceCtxRaw);
90
+ }
91
+
92
+ } // namespace
93
+
94
+ CudaDeviceInterface::CudaDeviceInterface(const torch::Device& device)
95
+ : DeviceInterface(device) {
96
+ TORCH_CHECK(g_cuda, "CudaDeviceInterface was not registered!");
97
+ TORCH_CHECK(
98
+ device_.type() == torch::kCUDA, "Unsupported device: ", device_.str());
99
+
100
+ initializeCudaContextWithPytorch(device_);
101
+
102
+ hardwareDeviceCtx_ = getHardwareDeviceContext(device_);
103
+ nppCtx_ = getNppStreamContext(device_);
104
+ }
105
+
106
+ CudaDeviceInterface::~CudaDeviceInterface() {
107
+ if (hardwareDeviceCtx_) {
108
+ g_cached_hw_device_ctxs.addIfCacheHasCapacity(
109
+ device_, std::move(hardwareDeviceCtx_));
110
+ }
111
+ returnNppStreamContextToCache(device_, std::move(nppCtx_));
112
+ }
113
+
114
+ void CudaDeviceInterface::initialize(
115
+ const AVStream* avStream,
116
+ const UniqueDecodingAVFormatContext& avFormatCtx,
117
+ const SharedAVCodecContext& codecContext) {
118
+ TORCH_CHECK(avStream != nullptr, "avStream is null");
119
+ codecContext_ = codecContext;
120
+ timeBase_ = avStream->time_base;
121
+
122
+ // TODO: Ideally, we should keep all interface implementations independent.
123
+ cpuInterface_ = createDeviceInterface(torch::kCPU);
124
+ TORCH_CHECK(
125
+ cpuInterface_ != nullptr, "Failed to create CPU device interface");
126
+ cpuInterface_->initialize(avStream, avFormatCtx, codecContext);
127
+ cpuInterface_->initializeVideo(
128
+ VideoStreamOptions(),
129
+ {},
130
+ /*resizedOutputDims=*/std::nullopt);
131
+ }
132
+
133
+ void CudaDeviceInterface::initializeVideo(
134
+ const VideoStreamOptions& videoStreamOptions,
135
+ [[maybe_unused]] const std::vector<std::unique_ptr<Transform>>& transforms,
136
+ [[maybe_unused]] const std::optional<FrameDims>& resizedOutputDims) {
137
+ videoStreamOptions_ = videoStreamOptions;
138
+ }
139
+
140
+ void CudaDeviceInterface::registerHardwareDeviceWithCodec(
141
+ AVCodecContext* codecContext) {
142
+ TORCH_CHECK(
143
+ hardwareDeviceCtx_, "Hardware device context has not been initialized");
144
+ TORCH_CHECK(codecContext != nullptr, "codecContext is null");
145
+ codecContext->hw_device_ctx = av_buffer_ref(hardwareDeviceCtx_.get());
146
+ }
147
+
148
+ UniqueAVFrame CudaDeviceInterface::maybeConvertAVFrameToNV12OrRGB24(
149
+ UniqueAVFrame& avFrame) {
150
+ // We need FFmpeg filters to handle those conversion cases which are not
151
+ // directly implemented in CUDA or CPU device interface (in case of a
152
+ // fallback).
153
+
154
+ // Input frame is on CPU, we will just pass it to CPU device interface, so
155
+ // skipping filters context as CPU device interface will handle everything for
156
+ // us.
157
+ if (avFrame->format != AV_PIX_FMT_CUDA) {
158
+ return std::move(avFrame);
159
+ }
160
+
161
+ auto hwFramesCtx =
162
+ reinterpret_cast<AVHWFramesContext*>(avFrame->hw_frames_ctx->data);
163
+ TORCH_CHECK(
164
+ hwFramesCtx != nullptr,
165
+ "The AVFrame does not have a hw_frames_ctx. "
166
+ "That's unexpected, please report this to the TorchCodec repo.");
167
+
168
+ AVPixelFormat actualFormat = hwFramesCtx->sw_format;
169
+
170
+ // If the frame is already in NV12 format, we don't need to do anything.
171
+ if (actualFormat == AV_PIX_FMT_NV12) {
172
+ return std::move(avFrame);
173
+ }
174
+
175
+ AVPixelFormat outputFormat;
176
+ std::stringstream filters;
177
+
178
+ unsigned version_int = avfilter_version();
179
+ if (version_int < AV_VERSION_INT(8, 0, 103)) {
180
+ // Color conversion support ('format=' option) was added to scale_cuda from
181
+ // n5.0. With the earlier version of ffmpeg we have no choice but use CPU
182
+ // filters. See:
183
+ // https://github.com/FFmpeg/FFmpeg/commit/62dc5df941f5e196164c151691e4274195523e95
184
+ outputFormat = AV_PIX_FMT_RGB24;
185
+
186
+ auto actualFormatName = av_get_pix_fmt_name(actualFormat);
187
+ TORCH_CHECK(
188
+ actualFormatName != nullptr,
189
+ "The actual format of a frame is unknown to FFmpeg. "
190
+ "That's unexpected, please report this to the TorchCodec repo.");
191
+
192
+ filters << "hwdownload,format=" << actualFormatName;
193
+ } else {
194
+ // Actual output color format will be set via filter options
195
+ outputFormat = AV_PIX_FMT_CUDA;
196
+
197
+ filters << "scale_cuda=format=nv12:interp_algo=bilinear";
198
+ }
199
+
200
+ enum AVPixelFormat frameFormat =
201
+ static_cast<enum AVPixelFormat>(avFrame->format);
202
+
203
+ auto newContext = std::make_unique<FiltersContext>(
204
+ avFrame->width,
205
+ avFrame->height,
206
+ frameFormat,
207
+ avFrame->sample_aspect_ratio,
208
+ avFrame->width,
209
+ avFrame->height,
210
+ outputFormat,
211
+ filters.str(),
212
+ timeBase_,
213
+ av_buffer_ref(avFrame->hw_frames_ctx));
214
+
215
+ if (!nv12Conversion_ || *nv12ConversionContext_ != *newContext) {
216
+ nv12Conversion_ =
217
+ std::make_unique<FilterGraph>(*newContext, videoStreamOptions_);
218
+ nv12ConversionContext_ = std::move(newContext);
219
+ }
220
+ auto filteredAVFrame = nv12Conversion_->convert(avFrame);
221
+
222
+ // If this check fails it means the frame wasn't
223
+ // reshaped to its expected dimensions by filtergraph.
224
+ TORCH_CHECK(
225
+ (filteredAVFrame->width == nv12ConversionContext_->outputWidth) &&
226
+ (filteredAVFrame->height == nv12ConversionContext_->outputHeight),
227
+ "Expected frame from filter graph of ",
228
+ nv12ConversionContext_->outputWidth,
229
+ "x",
230
+ nv12ConversionContext_->outputHeight,
231
+ ", got ",
232
+ filteredAVFrame->width,
233
+ "x",
234
+ filteredAVFrame->height);
235
+
236
+ return filteredAVFrame;
237
+ }
238
+
239
+ void CudaDeviceInterface::convertAVFrameToFrameOutput(
240
+ UniqueAVFrame& avFrame,
241
+ FrameOutput& frameOutput,
242
+ std::optional<torch::Tensor> preAllocatedOutputTensor) {
243
+ validatePreAllocatedTensorShape(preAllocatedOutputTensor, avFrame);
244
+
245
+ hasDecodedFrame_ = true;
246
+
247
+ // All of our CUDA decoding assumes NV12 format. We handle non-NV12 formats by
248
+ // converting them to NV12.
249
+ avFrame = maybeConvertAVFrameToNV12OrRGB24(avFrame);
250
+
251
+ if (avFrame->format != AV_PIX_FMT_CUDA) {
252
+ // The frame's format is AV_PIX_FMT_CUDA if and only if its content is on
253
+ // the GPU. In this branch, the frame is on the CPU. There are two possible
254
+ // reasons:
255
+ //
256
+ // 1. During maybeConvertAVFrameToNV12OrRGB24(), we had a non-NV12 format
257
+ // frame and we're on FFmpeg 4.4 or earlier. In such cases, we had to
258
+ // use CPU filters and we just converted the frame to RGB24.
259
+ // 2. This is what NVDEC gave us if it wasn't able to decode a frame, for
260
+ // whatever reason. Typically that happens if the video's encoder isn't
261
+ // supported by NVDEC.
262
+ //
263
+ // In both cases, we have a frame on the CPU. We send the frame back to the
264
+ // CUDA device when we're done.
265
+
266
+ enum AVPixelFormat frameFormat =
267
+ static_cast<enum AVPixelFormat>(avFrame->format);
268
+
269
+ FrameOutput cpuFrameOutput;
270
+ if (frameFormat == AV_PIX_FMT_RGB24) {
271
+ // Reason 1 above. The frame is already in RGB24, we just need to convert
272
+ // it to a tensor.
273
+ cpuFrameOutput.data = rgbAVFrameToTensor(avFrame);
274
+ } else {
275
+ // Reason 2 above. We need to do a full conversion which requires an
276
+ // actual CPU device.
277
+ cpuInterface_->convertAVFrameToFrameOutput(avFrame, cpuFrameOutput);
278
+ }
279
+
280
+ // Finally, we need to send the frame back to the GPU. Note that the
281
+ // pre-allocated tensor is on the GPU, so we can't send that to the CPU
282
+ // device interface. We copy it over here.
283
+ if (preAllocatedOutputTensor.has_value()) {
284
+ preAllocatedOutputTensor.value().copy_(cpuFrameOutput.data);
285
+ frameOutput.data = preAllocatedOutputTensor.value();
286
+ } else {
287
+ frameOutput.data = cpuFrameOutput.data.to(device_);
288
+ }
289
+
290
+ usingCPUFallback_ = true;
291
+ return;
292
+ }
293
+
294
+ usingCPUFallback_ = false;
295
+
296
+ // Above we checked that the AVFrame was on GPU, but that's not enough, we
297
+ // also need to check that the AVFrame is in AV_PIX_FMT_NV12 format (8 bits),
298
+ // because this is what the NPP color conversion routines expect. This SHOULD
299
+ // be enforced by our call to maybeConvertAVFrameToNV12OrRGB24() above.
300
+ TORCH_CHECK(
301
+ avFrame->hw_frames_ctx != nullptr,
302
+ "The AVFrame does not have a hw_frames_ctx. This should never happen");
303
+ AVHWFramesContext* hwFramesCtx =
304
+ reinterpret_cast<AVHWFramesContext*>(avFrame->hw_frames_ctx->data);
305
+ TORCH_CHECK(
306
+ hwFramesCtx != nullptr,
307
+ "The AVFrame does not have a valid hw_frames_ctx. This should never happen");
308
+
309
+ AVPixelFormat actualFormat = hwFramesCtx->sw_format;
310
+ TORCH_CHECK(
311
+ actualFormat == AV_PIX_FMT_NV12,
312
+ "The AVFrame is ",
313
+ (av_get_pix_fmt_name(actualFormat) ? av_get_pix_fmt_name(actualFormat)
314
+ : "unknown"),
315
+ ", but we expected AV_PIX_FMT_NV12. "
316
+ "That's unexpected, please report this to the TorchCodec repo.");
317
+
318
+ // Figure out the NVDEC stream from the avFrame's hardware context.
319
+ // In reality, we know that this stream is hardcoded to be the default stream
320
+ // by FFmpeg:
321
+ // https://github.com/FFmpeg/FFmpeg/blob/66e40840d15b514f275ce3ce2a4bf72ec68c7311/libavutil/hwcontext_cuda.c#L387-L388
322
+ TORCH_CHECK(
323
+ hwFramesCtx->device_ctx != nullptr,
324
+ "The AVFrame's hw_frames_ctx does not have a device_ctx. ");
325
+ auto cudaDeviceCtx =
326
+ static_cast<AVCUDADeviceContext*>(hwFramesCtx->device_ctx->hwctx);
327
+ TORCH_CHECK(cudaDeviceCtx != nullptr, "The hardware context is null");
328
+ at::cuda::CUDAStream nvdecStream = // That's always the default stream. Sad.
329
+ c10::cuda::getStreamFromExternal(cudaDeviceCtx->stream, device_.index());
330
+
331
+ frameOutput.data = convertNV12FrameToRGB(
332
+ avFrame, device_, nppCtx_, nvdecStream, preAllocatedOutputTensor);
333
+ }
334
+
335
+ // inspired by https://github.com/FFmpeg/FFmpeg/commit/ad67ea9
336
+ // we have to do this because of an FFmpeg bug where hardware decoding is not
337
+ // appropriately set, so we just go off and find the matching codec for the CUDA
338
+ // device
339
+ std::optional<const AVCodec*> CudaDeviceInterface::findCodec(
340
+ const AVCodecID& codecId,
341
+ bool isDecoder) {
342
+ void* i = nullptr;
343
+ const AVCodec* codec = nullptr;
344
+ while ((codec = av_codec_iterate(&i)) != nullptr) {
345
+ TORCH_CHECK(
346
+ codec != nullptr,
347
+ "codec returned by av_codec_iterate should not be null");
348
+ if (isDecoder) {
349
+ if (codec->id != codecId || !av_codec_is_decoder(codec)) {
350
+ continue;
351
+ }
352
+ } else {
353
+ if (codec->id != codecId || !av_codec_is_encoder(codec)) {
354
+ continue;
355
+ }
356
+ }
357
+
358
+ const AVCodecHWConfig* config = nullptr;
359
+ for (int j = 0; (config = avcodec_get_hw_config(codec, j)) != nullptr;
360
+ ++j) {
361
+ if (config->device_type == AV_HWDEVICE_TYPE_CUDA) {
362
+ return codec;
363
+ }
364
+ }
365
+ }
366
+
367
+ return std::nullopt;
368
+ }
369
+
370
+ std::string CudaDeviceInterface::getDetails() {
371
+ // Note: for this interface specifically the fallback is only known after a
372
+ // frame has been decoded, not before: that's when FFmpeg decides to fallback,
373
+ // so we can't know earlier.
374
+ if (!hasDecodedFrame_) {
375
+ return std::string(
376
+ "FFmpeg CUDA Device Interface. Fallback status unknown (no frames decoded).");
377
+ }
378
+ return std::string("FFmpeg CUDA Device Interface. Using ") +
379
+ (usingCPUFallback_ ? "CPU fallback." : "NVDEC.");
380
+ }
381
+
382
+ // --------------------------------------------------------------------------
383
+ // Below are methods exclusive to video encoding:
384
+ // --------------------------------------------------------------------------
385
+ namespace {
386
+ // Note: [RGB -> YUV Color Conversion, limited color range]
387
+ //
388
+ // For context on this subject, first read the note:
389
+ // [YUV -> RGB Color Conversion, color space and color range]
390
+ // https://github.com/meta-pytorch/torchcodec/blob/main/src/torchcodec/_core/CUDACommon.cpp#L63-L65
391
+ //
392
+ // Lets encode RGB -> YUV in the limited color range for BT.601 color space.
393
+ // In limited range, the [0, 255] range is mapped into [16-235] for Y, and into
394
+ // [16-240] for U,V.
395
+ // To implement, we get the full range conversion matrix as before, then scale:
396
+ // - Y channel: scale by (235-16)/255 = 219/255
397
+ // - U,V channels: scale by (240-16)/255 = 224/255
398
+ // https://en.wikipedia.org/wiki/YCbCr#Y%E2%80%B2PbPr_to_Y%E2%80%B2CbCr
399
+ //
400
+ // ```py
401
+ // import torch
402
+ // kr, kg, kb = 0.299, 0.587, 0.114 # BT.601 luma coefficients
403
+ // u_scale = 2 * (1 - kb)
404
+ // v_scale = 2 * (1 - kr)
405
+ //
406
+ // rgb_to_yuv_full = torch.tensor([
407
+ // [kr, kg, kb],
408
+ // [-kr/u_scale, -kg/u_scale, (1-kb)/u_scale],
409
+ // [(1-kr)/v_scale, -kg/v_scale, -kb/v_scale]
410
+ // ])
411
+ //
412
+ // full_to_limited_y_scale = 219.0 / 255.0
413
+ // full_to_limited_uv_scale = 224.0 / 255.0
414
+ //
415
+ // rgb_to_yuv_limited = rgb_to_yuv_full * torch.tensor([
416
+ // [full_to_limited_y_scale],
417
+ // [full_to_limited_uv_scale],
418
+ // [full_to_limited_uv_scale]
419
+ // ])
420
+ //
421
+ // print("RGB->YUV matrix (Limited Range BT.601):")
422
+ // print(rgb_to_yuv_limited)
423
+ // ```
424
+ //
425
+ // This yields:
426
+ // tensor([[ 0.2568, 0.5041, 0.0979],
427
+ // [-0.1482, -0.2910, 0.4392],
428
+ // [ 0.4392, -0.3678, -0.0714]])
429
+ //
430
+ // Which matches https://fourcc.org/fccyvrgb.php
431
+ //
432
+ // To perform color conversion in NPP, we are required to provide these color
433
+ // conversion matrices to ColorTwist functions, for example,
434
+ // `nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx`.
435
+ // https://docs.nvidia.com/cuda/npp/image_color_conversion.html
436
+ //
437
+ // These offsets are added in the 4th column of each conversion matrix below.
438
+ // - In limited range, Y is offset by 16 to add the lower margin.
439
+ // - In both color ranges, U,V are offset by 128 to be centered around 0.
440
+ //
441
+ // RGB to YUV conversion matrices to use in NPP color conversion functions
442
+ struct ColorConversionMatrices {
443
+ static constexpr Npp32f BT601_LIMITED[3][4] = {
444
+ {0.2568f, 0.5041f, 0.0979f, 16.0f},
445
+ {-0.1482f, -0.2910f, 0.4392f, 128.0f},
446
+ {0.4392f, -0.3678f, -0.0714f, 128.0f}};
447
+
448
+ static constexpr Npp32f BT601_FULL[3][4] = {
449
+ {0.2990f, 0.5870f, 0.1140f, 0.0f},
450
+ {-0.1687f, -0.3313f, 0.5000f, 128.0f},
451
+ {0.5000f, -0.4187f, -0.0813f, 128.0f}};
452
+
453
+ static constexpr Npp32f BT709_LIMITED[3][4] = {
454
+ {0.1826f, 0.6142f, 0.0620f, 16.0f},
455
+ {-0.1006f, -0.3386f, 0.4392f, 128.0f},
456
+ {0.4392f, -0.3989f, -0.0403f, 128.0f}};
457
+
458
+ static constexpr Npp32f BT709_FULL[3][4] = {
459
+ {0.2126f, 0.7152f, 0.0722f, 0.0f},
460
+ {-0.1146f, -0.3854f, 0.5000f, 128.0f},
461
+ {0.5000f, -0.4542f, -0.0458f, 128.0f}};
462
+
463
+ static constexpr Npp32f BT2020_LIMITED[3][4] = {
464
+ {0.2256f, 0.5823f, 0.0509f, 16.0f},
465
+ {-0.1227f, -0.3166f, 0.4392f, 128.0f},
466
+ {0.4392f, -0.4039f, -0.0353f, 128.0f}};
467
+
468
+ static constexpr Npp32f BT2020_FULL[3][4] = {
469
+ {0.2627f, 0.6780f, 0.0593f, 0.0f},
470
+ {-0.139630f, -0.360370f, 0.5000f, 128.0f},
471
+ {0.5000f, -0.459786f, -0.040214f, 128.0f}};
472
+ };
473
+
474
+ // Returns conversion matrix based on codec context color space and range
475
+ const Npp32f (*getConversionMatrix(AVCodecContext* codecContext))[4] {
476
+ if (codecContext->color_range == AVCOL_RANGE_MPEG || // limited range
477
+ codecContext->color_range == AVCOL_RANGE_UNSPECIFIED) {
478
+ if (codecContext->colorspace == AVCOL_SPC_BT470BG) {
479
+ return ColorConversionMatrices::BT601_LIMITED;
480
+ } else if (codecContext->colorspace == AVCOL_SPC_BT709) {
481
+ return ColorConversionMatrices::BT709_LIMITED;
482
+ } else if (codecContext->colorspace == AVCOL_SPC_BT2020_NCL) {
483
+ return ColorConversionMatrices::BT2020_LIMITED;
484
+ } else { // default to BT.601
485
+ return ColorConversionMatrices::BT601_LIMITED;
486
+ }
487
+ } else if (codecContext->color_range == AVCOL_RANGE_JPEG) { // full range
488
+ if (codecContext->colorspace == AVCOL_SPC_BT470BG) {
489
+ return ColorConversionMatrices::BT601_FULL;
490
+ } else if (codecContext->colorspace == AVCOL_SPC_BT709) {
491
+ return ColorConversionMatrices::BT709_FULL;
492
+ } else if (codecContext->colorspace == AVCOL_SPC_BT2020_NCL) {
493
+ return ColorConversionMatrices::BT2020_FULL;
494
+ } else { // default to BT.601
495
+ return ColorConversionMatrices::BT601_FULL;
496
+ }
497
+ }
498
+ return ColorConversionMatrices::BT601_LIMITED;
499
+ }
500
+ } // namespace
501
+
502
+ UniqueAVFrame CudaDeviceInterface::convertCUDATensorToAVFrameForEncoding(
503
+ const torch::Tensor& tensor,
504
+ int frameIndex,
505
+ AVCodecContext* codecContext) {
506
+ TORCH_CHECK(
507
+ tensor.dim() == 3 && tensor.size(0) == 3,
508
+ "Expected 3D RGB tensor (CHW format), got shape: ",
509
+ tensor.sizes());
510
+ TORCH_CHECK(
511
+ tensor.device().type() == torch::kCUDA,
512
+ "Expected tensor on CUDA device, got: ",
513
+ tensor.device().str());
514
+
515
+ UniqueAVFrame avFrame(av_frame_alloc());
516
+ TORCH_CHECK(avFrame != nullptr, "Failed to allocate AVFrame");
517
+ int height = static_cast<int>(tensor.size(1));
518
+ int width = static_cast<int>(tensor.size(2));
519
+
520
+ // TODO-VideoEncoder: Unify AVFrame creation with CPU version of this method
521
+ avFrame->format = AV_PIX_FMT_CUDA;
522
+ avFrame->height = height;
523
+ avFrame->width = width;
524
+ avFrame->pts = frameIndex;
525
+
526
+ // FFmpeg's av_hwframe_get_buffer is used to allocate memory on CUDA device.
527
+ // TODO-VideoEncoder: Consider using pytorch to allocate CUDA memory for
528
+ // efficiency
529
+ int ret =
530
+ av_hwframe_get_buffer(codecContext->hw_frames_ctx, avFrame.get(), 0);
531
+ TORCH_CHECK(
532
+ ret >= 0,
533
+ "Failed to allocate hardware frame: ",
534
+ getFFMPEGErrorStringFromErrorCode(ret));
535
+
536
+ TORCH_CHECK(
537
+ avFrame != nullptr && avFrame->data[0] != nullptr,
538
+ "avFrame must be pre-allocated with CUDA memory");
539
+
540
+ // TODO VideoEncoder: Investigate ways to avoid this copy
541
+ torch::Tensor hwcFrame = tensor.permute({1, 2, 0}).contiguous();
542
+
543
+ NppiSize oSizeROI = {width, height};
544
+ NppStatus status;
545
+ // Convert to NV12, as CUDA_ENCODING_PIXEL_FORMAT is always NV12 currently
546
+ status = nppiRGBToNV12_8u_ColorTwist32f_C3P2R_Ctx(
547
+ static_cast<const Npp8u*>(hwcFrame.data_ptr()),
548
+ hwcFrame.stride(0) * hwcFrame.element_size(),
549
+ avFrame->data,
550
+ avFrame->linesize,
551
+ oSizeROI,
552
+ getConversionMatrix(codecContext),
553
+ *nppCtx_);
554
+
555
+ TORCH_CHECK(
556
+ status == NPP_SUCCESS,
557
+ "Failed to convert RGB to ",
558
+ av_get_pix_fmt_name(DeviceInterface::CUDA_ENCODING_PIXEL_FORMAT),
559
+ ": NPP error code ",
560
+ status);
561
+
562
+ avFrame->colorspace = codecContext->colorspace;
563
+ avFrame->color_range = codecContext->color_range;
564
+ return avFrame;
565
+ }
566
+
567
+ // Allocates and initializes AVHWFramesContext, and sets pixel format fields
568
+ // to enable encoding with CUDA device. The hw_frames_ctx field is needed by
569
+ // FFmpeg to allocate frames on GPU's memory.
570
+ void CudaDeviceInterface::setupHardwareFrameContextForEncoding(
571
+ AVCodecContext* codecContext) {
572
+ TORCH_CHECK(codecContext != nullptr, "codecContext is null");
573
+ TORCH_CHECK(
574
+ hardwareDeviceCtx_, "Hardware device context has not been initialized");
575
+
576
+ AVBufferRef* hwFramesCtxRef = av_hwframe_ctx_alloc(hardwareDeviceCtx_.get());
577
+ TORCH_CHECK(
578
+ hwFramesCtxRef != nullptr,
579
+ "Failed to allocate hardware frames context for codec");
580
+
581
+ codecContext->sw_pix_fmt = DeviceInterface::CUDA_ENCODING_PIXEL_FORMAT;
582
+ // Always set pixel format to support CUDA encoding.
583
+ codecContext->pix_fmt = AV_PIX_FMT_CUDA;
584
+
585
+ AVHWFramesContext* hwFramesCtx =
586
+ reinterpret_cast<AVHWFramesContext*>(hwFramesCtxRef->data);
587
+ hwFramesCtx->format = codecContext->pix_fmt;
588
+ hwFramesCtx->sw_format = codecContext->sw_pix_fmt;
589
+ hwFramesCtx->width = codecContext->width;
590
+ hwFramesCtx->height = codecContext->height;
591
+
592
+ int ret = av_hwframe_ctx_init(hwFramesCtxRef);
593
+ if (ret < 0) {
594
+ av_buffer_unref(&hwFramesCtxRef);
595
+ TORCH_CHECK(
596
+ false,
597
+ "Failed to initialize CUDA frames context for codec: ",
598
+ getFFMPEGErrorStringFromErrorCode(ret));
599
+ }
600
+ codecContext->hw_frames_ctx = hwFramesCtxRef;
601
+ }
602
+ } // namespace facebook::torchcodec
@@ -0,0 +1,79 @@
1
+ // Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ // All rights reserved.
3
+ //
4
+ // This source code is licensed under the BSD-style license found in the
5
+ // LICENSE file in the root directory of this source tree.
6
+
7
+ #pragma once
8
+
9
+ #include "CUDACommon.h"
10
+ #include "DeviceInterface.h"
11
+ #include "FilterGraph.h"
12
+
13
+ namespace facebook::torchcodec {
14
+
15
+ class CudaDeviceInterface : public DeviceInterface {
16
+ public:
17
+ CudaDeviceInterface(const torch::Device& device);
18
+
19
+ virtual ~CudaDeviceInterface();
20
+
21
+ std::optional<const AVCodec*> findCodec(
22
+ const AVCodecID& codecId,
23
+ bool isDecoder = true) override;
24
+
25
+ void initialize(
26
+ const AVStream* avStream,
27
+ const UniqueDecodingAVFormatContext& avFormatCtx,
28
+ const SharedAVCodecContext& codecContext) override;
29
+
30
+ void initializeVideo(
31
+ const VideoStreamOptions& videoStreamOptions,
32
+ [[maybe_unused]] const std::vector<std::unique_ptr<Transform>>&
33
+ transforms,
34
+ [[maybe_unused]] const std::optional<FrameDims>& resizedOutputDims)
35
+ override;
36
+
37
+ void registerHardwareDeviceWithCodec(AVCodecContext* codecContext) override;
38
+
39
+ void convertAVFrameToFrameOutput(
40
+ UniqueAVFrame& avFrame,
41
+ FrameOutput& frameOutput,
42
+ std::optional<torch::Tensor> preAllocatedOutputTensor) override;
43
+
44
+ std::string getDetails() override;
45
+
46
+ UniqueAVFrame convertCUDATensorToAVFrameForEncoding(
47
+ const torch::Tensor& tensor,
48
+ int frameIndex,
49
+ AVCodecContext* codecContext) override;
50
+
51
+ void setupHardwareFrameContextForEncoding(
52
+ AVCodecContext* codecContext) override;
53
+
54
+ private:
55
+ // Our CUDA decoding code assumes NV12 format. In order to handle other
56
+ // kinds of input, we need to convert them to NV12. Our current implementation
57
+ // does this using filtergraph.
58
+ UniqueAVFrame maybeConvertAVFrameToNV12OrRGB24(UniqueAVFrame& avFrame);
59
+
60
+ // We sometimes encounter frames that cannot be decoded on the CUDA device.
61
+ // Rather than erroring out, we decode them on the CPU.
62
+ std::unique_ptr<DeviceInterface> cpuInterface_;
63
+
64
+ VideoStreamOptions videoStreamOptions_;
65
+ AVRational timeBase_;
66
+
67
+ UniqueAVBufferRef hardwareDeviceCtx_;
68
+ UniqueNppContext nppCtx_;
69
+
70
+ // This filtergraph instance is only used for NV12 format conversion in
71
+ // maybeConvertAVFrameToNV12().
72
+ std::unique_ptr<FiltersContext> nv12ConversionContext_;
73
+ std::unique_ptr<FilterGraph> nv12Conversion_;
74
+
75
+ bool usingCPUFallback_ = false;
76
+ bool hasDecodedFrame_ = false;
77
+ };
78
+
79
+ } // namespace facebook::torchcodec