torchcodec-0.10.0-cp312-cp312-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. torchcodec/__init__.py +27 -0
  2. torchcodec/_core/AVIOContextHolder.cpp +60 -0
  3. torchcodec/_core/AVIOContextHolder.h +64 -0
  4. torchcodec/_core/AVIOFileLikeContext.cpp +98 -0
  5. torchcodec/_core/AVIOFileLikeContext.h +55 -0
  6. torchcodec/_core/AVIOTensorContext.cpp +130 -0
  7. torchcodec/_core/AVIOTensorContext.h +44 -0
  8. torchcodec/_core/BetaCudaDeviceInterface.cpp +849 -0
  9. torchcodec/_core/BetaCudaDeviceInterface.h +196 -0
  10. torchcodec/_core/CMakeLists.txt +295 -0
  11. torchcodec/_core/CUDACommon.cpp +330 -0
  12. torchcodec/_core/CUDACommon.h +51 -0
  13. torchcodec/_core/Cache.h +124 -0
  14. torchcodec/_core/CpuDeviceInterface.cpp +509 -0
  15. torchcodec/_core/CpuDeviceInterface.h +141 -0
  16. torchcodec/_core/CudaDeviceInterface.cpp +602 -0
  17. torchcodec/_core/CudaDeviceInterface.h +79 -0
  18. torchcodec/_core/DeviceInterface.cpp +117 -0
  19. torchcodec/_core/DeviceInterface.h +191 -0
  20. torchcodec/_core/Encoder.cpp +1054 -0
  21. torchcodec/_core/Encoder.h +192 -0
  22. torchcodec/_core/FFMPEGCommon.cpp +684 -0
  23. torchcodec/_core/FFMPEGCommon.h +314 -0
  24. torchcodec/_core/FilterGraph.cpp +159 -0
  25. torchcodec/_core/FilterGraph.h +59 -0
  26. torchcodec/_core/Frame.cpp +47 -0
  27. torchcodec/_core/Frame.h +72 -0
  28. torchcodec/_core/Metadata.cpp +124 -0
  29. torchcodec/_core/Metadata.h +92 -0
  30. torchcodec/_core/NVCUVIDRuntimeLoader.cpp +320 -0
  31. torchcodec/_core/NVCUVIDRuntimeLoader.h +14 -0
  32. torchcodec/_core/NVDECCache.cpp +60 -0
  33. torchcodec/_core/NVDECCache.h +102 -0
  34. torchcodec/_core/SingleStreamDecoder.cpp +1586 -0
  35. torchcodec/_core/SingleStreamDecoder.h +391 -0
  36. torchcodec/_core/StreamOptions.h +70 -0
  37. torchcodec/_core/Transform.cpp +128 -0
  38. torchcodec/_core/Transform.h +86 -0
  39. torchcodec/_core/ValidationUtils.cpp +35 -0
  40. torchcodec/_core/ValidationUtils.h +21 -0
  41. torchcodec/_core/__init__.py +46 -0
  42. torchcodec/_core/_metadata.py +262 -0
  43. torchcodec/_core/custom_ops.cpp +1090 -0
  44. torchcodec/_core/fetch_and_expose_non_gpl_ffmpeg_libs.cmake +169 -0
  45. torchcodec/_core/nvcuvid_include/cuviddec.h +1374 -0
  46. torchcodec/_core/nvcuvid_include/nvcuvid.h +610 -0
  47. torchcodec/_core/ops.py +605 -0
  48. torchcodec/_core/pybind_ops.cpp +50 -0
  49. torchcodec/_frame.py +146 -0
  50. torchcodec/_internally_replaced_utils.py +68 -0
  51. torchcodec/_samplers/__init__.py +7 -0
  52. torchcodec/_samplers/video_clip_sampler.py +419 -0
  53. torchcodec/decoders/__init__.py +12 -0
  54. torchcodec/decoders/_audio_decoder.py +185 -0
  55. torchcodec/decoders/_decoder_utils.py +113 -0
  56. torchcodec/decoders/_video_decoder.py +601 -0
  57. torchcodec/encoders/__init__.py +2 -0
  58. torchcodec/encoders/_audio_encoder.py +149 -0
  59. torchcodec/encoders/_video_encoder.py +196 -0
  60. torchcodec/libtorchcodec_core4.so +0 -0
  61. torchcodec/libtorchcodec_core5.so +0 -0
  62. torchcodec/libtorchcodec_core6.so +0 -0
  63. torchcodec/libtorchcodec_core7.so +0 -0
  64. torchcodec/libtorchcodec_core8.so +0 -0
  65. torchcodec/libtorchcodec_custom_ops4.so +0 -0
  66. torchcodec/libtorchcodec_custom_ops5.so +0 -0
  67. torchcodec/libtorchcodec_custom_ops6.so +0 -0
  68. torchcodec/libtorchcodec_custom_ops7.so +0 -0
  69. torchcodec/libtorchcodec_custom_ops8.so +0 -0
  70. torchcodec/libtorchcodec_pybind_ops4.so +0 -0
  71. torchcodec/libtorchcodec_pybind_ops5.so +0 -0
  72. torchcodec/libtorchcodec_pybind_ops6.so +0 -0
  73. torchcodec/libtorchcodec_pybind_ops7.so +0 -0
  74. torchcodec/libtorchcodec_pybind_ops8.so +0 -0
  75. torchcodec/samplers/__init__.py +2 -0
  76. torchcodec/samplers/_common.py +84 -0
  77. torchcodec/samplers/_index_based.py +287 -0
  78. torchcodec/samplers/_time_based.py +358 -0
  79. torchcodec/share/cmake/TorchCodec/TorchCodecConfig.cmake +76 -0
  80. torchcodec/share/cmake/TorchCodec/ffmpeg_versions.cmake +122 -0
  81. torchcodec/transforms/__init__.py +12 -0
  82. torchcodec/transforms/_decoder_transforms.py +375 -0
  83. torchcodec/version.py +2 -0
  84. torchcodec-0.10.0.dist-info/METADATA +286 -0
  85. torchcodec-0.10.0.dist-info/RECORD +88 -0
  86. torchcodec-0.10.0.dist-info/WHEEL +5 -0
  87. torchcodec-0.10.0.dist-info/licenses/LICENSE +28 -0
  88. torchcodec-0.10.0.dist-info/top_level.txt +2 -0
torchcodec/_core/BetaCudaDeviceInterface.cpp
@@ -0,0 +1,849 @@
+ // Copyright (c) Meta Platforms, Inc. and affiliates.
+ // All rights reserved.
+ //
+ // This source code is licensed under the BSD-style license found in the
+ // LICENSE file in the root directory of this source tree.
+
+ #include <c10/cuda/CUDAStream.h>
+ #include <torch/types.h>
+ #include <mutex>
+ #include <vector>
+
+ #include "BetaCudaDeviceInterface.h"
+
+ #include "DeviceInterface.h"
+ #include "FFMPEGCommon.h"
+ #include "NVDECCache.h"
+
+ #include "NVCUVIDRuntimeLoader.h"
+ #include "nvcuvid_include/cuviddec.h"
+ #include "nvcuvid_include/nvcuvid.h"
+
+ extern "C" {
+ #include <libavutil/hwcontext_cuda.h>
+ #include <libavutil/pixdesc.h>
+ }
+
+ namespace facebook::torchcodec {
+
+ namespace {
+
+ static bool g_cuda_beta = registerDeviceInterface(
+     DeviceInterfaceKey(torch::kCUDA, /*variant=*/"beta"),
+     [](const torch::Device& device) {
+       return new BetaCudaDeviceInterface(device);
+     });
+
+ static int CUDAAPI
+ pfnSequenceCallback(void* pUserData, CUVIDEOFORMAT* videoFormat) {
+   auto decoder = static_cast<BetaCudaDeviceInterface*>(pUserData);
+   return decoder->streamPropertyChange(videoFormat);
+ }
+
+ static int CUDAAPI
+ pfnDecodePictureCallback(void* pUserData, CUVIDPICPARAMS* picParams) {
+   auto decoder = static_cast<BetaCudaDeviceInterface*>(pUserData);
+   return decoder->frameReadyForDecoding(picParams);
+ }
+
+ static int CUDAAPI
+ pfnDisplayPictureCallback(void* pUserData, CUVIDPARSERDISPINFO* dispInfo) {
+   auto decoder = static_cast<BetaCudaDeviceInterface*>(pUserData);
+   return decoder->frameReadyInDisplayOrder(dispInfo);
+ }
+
+ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
+   // Decoder creation parameters, most are taken from DALI
+   CUVIDDECODECREATEINFO decoderParams = {};
+   decoderParams.bitDepthMinus8 = videoFormat->bit_depth_luma_minus8;
+   decoderParams.ChromaFormat = videoFormat->chroma_format;
+   // We explicitly request NV12 format, which means 10bit videos will be
+   // automatically converted to 8bits by NVDEC itself. That is, the raw frames
+   // we get back from cuvidMapVideoFrame will already be in 8bit format. We
+   // won't need to do the conversion ourselves, so that's a lot easier.
+   // In the ffmpeg CUDA interface, we have to do the 10 -> 8bits conversion
+   // ourselves later in convertAVFrameToFrameOutput(), because FFmpeg explicitly
+   // requests 10 or 16bits output formats for >8-bit videos!
+   // https://github.com/FFmpeg/FFmpeg/blob/e05f8acabff468c1382277c1f31fa8e9d90c3202/libavcodec/nvdec.c#L376-L403
+   decoderParams.OutputFormat = cudaVideoSurfaceFormat_NV12;
+   decoderParams.ulCreationFlags = cudaVideoCreate_Default;
+   decoderParams.CodecType = videoFormat->codec;
+   decoderParams.ulHeight = videoFormat->coded_height;
+   decoderParams.ulWidth = videoFormat->coded_width;
+   decoderParams.ulMaxHeight = videoFormat->coded_height;
+   decoderParams.ulMaxWidth = videoFormat->coded_width;
+   decoderParams.ulTargetHeight =
+       videoFormat->display_area.bottom - videoFormat->display_area.top;
+   decoderParams.ulTargetWidth =
+       videoFormat->display_area.right - videoFormat->display_area.left;
+   decoderParams.ulNumDecodeSurfaces = videoFormat->min_num_decode_surfaces;
+   // We should only ever need 1 output surface, since we process frames
+   // sequentially, and we always unmap the previous frame before mapping a new
+   // one.
+   // TODONVDEC P3: set this to 2, allow for 2 frames to be mapped at a time, and
+   // benchmark to see if this makes any difference.
+   decoderParams.ulNumOutputSurfaces = 1;
+   decoderParams.display_area.left = videoFormat->display_area.left;
+   decoderParams.display_area.right = videoFormat->display_area.right;
+   decoderParams.display_area.top = videoFormat->display_area.top;
+   decoderParams.display_area.bottom = videoFormat->display_area.bottom;
+
+   CUvideodecoder* decoder = new CUvideodecoder();
+   CUresult result = cuvidCreateDecoder(decoder, &decoderParams);
+   TORCH_CHECK(
+       result == CUDA_SUCCESS, "Failed to create NVDEC decoder: ", result);
+   return UniqueCUvideodecoder(decoder, CUvideoDecoderDeleter{});
+ }
+
+ std::optional<cudaVideoChromaFormat> validateChromaSupport(
+     const AVPixFmtDescriptor* desc) {
+   // Return the corresponding cudaVideoChromaFormat if supported, std::nullopt
+   // otherwise.
+   TORCH_CHECK(desc != nullptr, "desc can't be null");
+
+   if (desc->nb_components == 1) {
+     return cudaVideoChromaFormat_Monochrome;
+   } else if (desc->nb_components >= 3 && !(desc->flags & AV_PIX_FMT_FLAG_RGB)) {
+     // Make sure it's YUV: has chroma planes and isn't RGB
+     if (desc->log2_chroma_w == 0 && desc->log2_chroma_h == 0) {
+       return cudaVideoChromaFormat_444; // 1x1 subsampling = 4:4:4
+     } else if (desc->log2_chroma_w == 1 && desc->log2_chroma_h == 1) {
+       return cudaVideoChromaFormat_420; // 2x2 subsampling = 4:2:0
+     } else if (desc->log2_chroma_w == 1 && desc->log2_chroma_h == 0) {
+       return cudaVideoChromaFormat_422; // 2x1 subsampling = 4:2:2
+     }
+   }
+
+   return std::nullopt;
+ }
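
For a concrete sense of the log2_chroma mapping above: FFmpeg's pixel-format descriptors express chroma subsampling as log2 shift factors, so yuv420p reports log2_chroma_w == 1 and log2_chroma_h == 1, while yuv444p reports 0 and 0. A minimal standalone check against the plain FFmpeg API (a sketch, not code from this diff):

    extern "C" {
    #include <libavutil/pixdesc.h>
    }
    #include <cassert>

    int main() {
      // yuv420p: chroma subsampled 2x in both directions -> 4:2:0 above.
      const AVPixFmtDescriptor* desc = av_pix_fmt_desc_get(AV_PIX_FMT_YUV420P);
      assert(desc->log2_chroma_w == 1 && desc->log2_chroma_h == 1);

      // yuv444p: no subsampling -> 4:4:4 above.
      desc = av_pix_fmt_desc_get(AV_PIX_FMT_YUV444P);
      assert(desc->log2_chroma_w == 0 && desc->log2_chroma_h == 0);
      return 0;
    }
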
+
+ std::optional<cudaVideoCodec> validateCodecSupport(AVCodecID codecId) {
+   // Return the corresponding cudaVideoCodec if supported, std::nullopt
+   // otherwise.
+   // Note that we currently return nullopt (and thus fall back to CPU) for some
+   // codecs that are technically supported by NVDEC, see comment below.
+   switch (codecId) {
+     case AV_CODEC_ID_H264:
+       return cudaVideoCodec_H264;
+     case AV_CODEC_ID_HEVC:
+       return cudaVideoCodec_HEVC;
+     case AV_CODEC_ID_AV1:
+       return cudaVideoCodec_AV1;
+     case AV_CODEC_ID_VP9:
+       return cudaVideoCodec_VP9;
+     case AV_CODEC_ID_VP8:
+       return cudaVideoCodec_VP8;
+     case AV_CODEC_ID_MPEG4:
+       return cudaVideoCodec_MPEG4;
+     // Formats below are currently not tested, but they should "mostly" work.
+     // MPEG1 was briefly locally tested and it was ok-ish despite duration being
+     // off. Since they're far less popular, we keep them disabled by default but
+     // we can consider enabling them upon user requests.
+     // case AV_CODEC_ID_MPEG1VIDEO:
+     //   return cudaVideoCodec_MPEG1;
+     // case AV_CODEC_ID_MPEG2VIDEO:
+     //   return cudaVideoCodec_MPEG2;
+     // case AV_CODEC_ID_MJPEG:
+     //   return cudaVideoCodec_JPEG;
+     // case AV_CODEC_ID_VC1:
+     //   return cudaVideoCodec_VC1;
+     default:
+       return std::nullopt;
+   }
+ }
+
+ bool nativeNVDECSupport(const SharedAVCodecContext& codecContext) {
+   // Return true iff the input video stream is supported by our NVDEC
+   // implementation.
+
+   auto codecType = validateCodecSupport(codecContext->codec_id);
+   if (!codecType.has_value()) {
+     return false;
+   }
+
+   const AVPixFmtDescriptor* desc = av_pix_fmt_desc_get(codecContext->pix_fmt);
+   if (!desc) {
+     return false;
+   }
+
+   auto chromaFormat = validateChromaSupport(desc);
+   if (!chromaFormat.has_value()) {
+     return false;
+   }
+
+   auto caps = CUVIDDECODECAPS{};
+   caps.eCodecType = codecType.value();
+   caps.eChromaFormat = chromaFormat.value();
+   caps.nBitDepthMinus8 = desc->comp[0].depth - 8;
+
+   CUresult result = cuvidGetDecoderCaps(&caps);
+   if (result != CUDA_SUCCESS) {
+     return false;
+   }
+
+   if (!caps.bIsSupported) {
+     return false;
+   }
+
+   auto coded_width = static_cast<unsigned int>(codecContext->coded_width);
+   auto coded_height = static_cast<unsigned int>(codecContext->coded_height);
+   if (coded_width < static_cast<unsigned int>(caps.nMinWidth) ||
+       coded_height < static_cast<unsigned int>(caps.nMinHeight) ||
+       coded_width > caps.nMaxWidth || coded_height > caps.nMaxHeight) {
+     return false;
+   }
+
+   // See nMaxMBCount in cuviddec.h
+   constexpr unsigned int macroblockConstant = 256;
+   if (coded_width * coded_height / macroblockConstant > caps.nMaxMBCount) {
+     return false;
+   }
+
+   // We'll set the decoderParams.OutputFormat to NV12, so we need to make
+   // sure it's actually supported.
+   // TODO: If this fails, we could consider decoding to something other than
+   // NV12 (like cudaVideoSurfaceFormat_P016) instead of falling back to CPU.
+   // This is what FFmpeg does.
+   bool supportsNV12Output =
+       (caps.nOutputFormatMask >> cudaVideoSurfaceFormat_NV12) & 1;
+   if (!supportsNV12Output) {
+     return false;
+   }
+
+   return true;
+ }
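
The nMaxMBCount check above is expressed in 16x16 macroblocks, which is where the constant 256 (= 16 * 16 pixels) comes from. As a worked example, a 3840x2160 stream occupies 3840 * 2160 / 256 = 32400 macroblocks, and decoding is rejected if that exceeds the cap reported by cuvidGetDecoderCaps():

    // Worked example of the macroblock arithmetic used above.
    static_assert(
        3840u * 2160u / 256u == 32400u,
        "a 3840x2160 frame is 32400 16x16 macroblocks");
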
+
+ // Callback for freeing CUDA memory associated with an AVFrame; see where
+ // it's used for more details.
+ void cudaBufferFreeCallback(void* opaque, [[maybe_unused]] uint8_t* data) {
+   cudaFree(opaque);
+ }
+
+ } // namespace
+
+ BetaCudaDeviceInterface::BetaCudaDeviceInterface(const torch::Device& device)
+     : DeviceInterface(device) {
+   TORCH_CHECK(g_cuda_beta, "BetaCudaDeviceInterface was not registered!");
+   TORCH_CHECK(
+       device_.type() == torch::kCUDA, "Unsupported device: ", device_.str());
+
+   initializeCudaContextWithPytorch(device_);
+   nppCtx_ = getNppStreamContext(device_);
+
+   nvcuvidAvailable_ = loadNVCUVIDLibrary();
+ }
+
+ BetaCudaDeviceInterface::~BetaCudaDeviceInterface() {
+   if (decoder_) {
+     // DALI doesn't seem to do any particular cleanup of the decoder before
+     // sending it to the cache, so we probably don't need to do anything
+     // either. Just to be safe, we flush.
+     // What happens to those decode surfaces that haven't yet been mapped is
+     // unclear.
+     flush();
+     unmapPreviousFrame();
+     NVDECCache::getCache(device_).returnDecoder(
+         &videoFormat_, std::move(decoder_));
+   }
+
+   if (videoParser_) {
+     cuvidDestroyVideoParser(videoParser_);
+     videoParser_ = nullptr;
+   }
+
+   returnNppStreamContextToCache(device_, std::move(nppCtx_));
+ }
+
+ void BetaCudaDeviceInterface::initialize(
+     const AVStream* avStream,
+     const UniqueDecodingAVFormatContext& avFormatCtx,
+     const SharedAVCodecContext& codecContext) {
+   if (!nvcuvidAvailable_ || !nativeNVDECSupport(codecContext)) {
+     cpuFallback_ = createDeviceInterface(torch::kCPU);
+     TORCH_CHECK(
+         cpuFallback_ != nullptr, "Failed to create CPU device interface");
+     cpuFallback_->initialize(avStream, avFormatCtx, codecContext);
+     cpuFallback_->initializeVideo(
+         VideoStreamOptions(),
+         {},
+         /*resizedOutputDims=*/std::nullopt);
+     // We'll always use the CPU fallback from now on, so we can return early.
+     return;
+   }
+
+   TORCH_CHECK(avStream != nullptr, "AVStream cannot be null");
+   timeBase_ = avStream->time_base;
+   frameRateAvgFromFFmpeg_ = avStream->r_frame_rate;
+
+   const AVCodecParameters* codecPar = avStream->codecpar;
+   TORCH_CHECK(codecPar != nullptr, "CodecParameters cannot be null");
+
+   initializeBSF(codecPar, avFormatCtx);
+
+   // Create parser. Default values that aren't obvious are taken from DALI.
+   CUVIDPARSERPARAMS parserParams = {};
+   auto codecType = validateCodecSupport(codecPar->codec_id);
+   TORCH_CHECK(
+       codecType.has_value(),
+       "This should never happen, we should be using the CPU fallback by now. Please report a bug.");
+   parserParams.CodecType = codecType.value();
+   parserParams.ulMaxNumDecodeSurfaces = 8;
+   parserParams.ulMaxDisplayDelay = 0;
+   // Callback setup, all are triggered by the parser within a call
+   // to cuvidParseVideoData
+   parserParams.pUserData = this;
+   parserParams.pfnSequenceCallback = pfnSequenceCallback;
+   parserParams.pfnDecodePicture = pfnDecodePictureCallback;
+   parserParams.pfnDisplayPicture = pfnDisplayPictureCallback;
+
+   CUresult result = cuvidCreateVideoParser(&videoParser_, &parserParams);
+   TORCH_CHECK(
+       result == CUDA_SUCCESS, "Failed to create video parser: ", result);
+ }
+
+ void BetaCudaDeviceInterface::initializeBSF(
+     const AVCodecParameters* codecPar,
+     const UniqueDecodingAVFormatContext& avFormatCtx) {
+   // Setup bit stream filters (BSF):
+   // https://ffmpeg.org/doxygen/7.0/group__lavc__bsf.html
+   // This is only needed for some formats, like H264 or HEVC.
+
+   TORCH_CHECK(codecPar != nullptr, "codecPar cannot be null");
+   TORCH_CHECK(avFormatCtx != nullptr, "AVFormatContext cannot be null");
+   TORCH_CHECK(
+       avFormatCtx->iformat != nullptr,
+       "AVFormatContext->iformat cannot be null");
+   std::string filterName;
+
+   // Matching logic is taken from DALI
+   switch (codecPar->codec_id) {
+     case AV_CODEC_ID_H264: {
+       const std::string formatName = avFormatCtx->iformat->long_name
+           ? avFormatCtx->iformat->long_name
+           : "";
+
+       if (formatName == "QuickTime / MOV" ||
+           formatName == "FLV (Flash Video)" ||
+           formatName == "Matroska / WebM" || formatName == "raw H.264 video") {
+         filterName = "h264_mp4toannexb";
+       }
+       break;
+     }
+
+     case AV_CODEC_ID_HEVC: {
+       const std::string formatName = avFormatCtx->iformat->long_name
+           ? avFormatCtx->iformat->long_name
+           : "";
+
+       if (formatName == "QuickTime / MOV" ||
+           formatName == "FLV (Flash Video)" ||
+           formatName == "Matroska / WebM" || formatName == "raw HEVC video") {
+         filterName = "hevc_mp4toannexb";
+       }
+       break;
+     }
+
+     case AV_CODEC_ID_MPEG4: {
+       const std::string formatName =
+           avFormatCtx->iformat->name ? avFormatCtx->iformat->name : "";
+       if (formatName == "avi") {
+         filterName = "mpeg4_unpack_bframes";
+       }
+       break;
+     }
+
+     default:
+       // No bitstream filter needed for other codecs
+       break;
+   }
+
+   if (filterName.empty()) {
+     // Only initialize BSF if we actually need one
+     return;
+   }
+
+   const AVBitStreamFilter* avBSF = av_bsf_get_by_name(filterName.c_str());
+   TORCH_CHECK(
+       avBSF != nullptr, "Failed to find bitstream filter: ", filterName);
+
+   AVBSFContext* avBSFContext = nullptr;
+   int retVal = av_bsf_alloc(avBSF, &avBSFContext);
+   TORCH_CHECK(
+       retVal >= AVSUCCESS,
+       "Failed to allocate bitstream filter: ",
+       getFFMPEGErrorStringFromErrorCode(retVal));
+
+   bitstreamFilter_.reset(avBSFContext);
+
+   retVal = avcodec_parameters_copy(bitstreamFilter_->par_in, codecPar);
+   TORCH_CHECK(
+       retVal >= AVSUCCESS,
+       "Failed to copy codec parameters: ",
+       getFFMPEGErrorStringFromErrorCode(retVal));
+
+   retVal = av_bsf_init(bitstreamFilter_.get());
+   TORCH_CHECK(
+       retVal == AVSUCCESS,
+       "Failed to initialize bitstream filter: ",
+       getFFMPEGErrorStringFromErrorCode(retVal));
+ }
+
+ // This callback is called by the parser within cuvidParseVideoData when there
+ // is a change in the stream's properties (like resolution change), as specified
+ // by CUVIDEOFORMAT. Particularly (but not just!), this is called at the very
+ // start of the stream.
+ // TODONVDEC P1: Code below mostly assumes this is called only once at the
+ // start; we should handle the case of multiple calls. Probably need to flush
+ // buffers, etc.
+ int BetaCudaDeviceInterface::streamPropertyChange(CUVIDEOFORMAT* videoFormat) {
+   TORCH_CHECK(videoFormat != nullptr, "Invalid video format");
+
+   videoFormat_ = *videoFormat;
+
+   if (videoFormat_.min_num_decode_surfaces == 0) {
+     // Same as DALI's fallback
+     videoFormat_.min_num_decode_surfaces = 20;
+   }
+
+   if (!decoder_) {
+     decoder_ = NVDECCache::getCache(device_).getDecoder(videoFormat);
+
+     if (!decoder_) {
+       // TODONVDEC P2: consider re-configuring an existing decoder instead of
+       // re-creating one. See docs, see DALI. Re-configuration doesn't seem to
+       // be enabled in DALI by default.
+       decoder_ = createDecoder(videoFormat);
+     }
+
+     TORCH_CHECK(decoder_, "Failed to get or create decoder");
+   }
+
+   // DALI also returns min_num_decode_surfaces from this function. This
+   // instructs the parser to reset its ulMaxNumDecodeSurfaces field to this
+   // value.
+   return static_cast<int>(videoFormat_.min_num_decode_surfaces);
+ }
+
+ // Moral equivalent of avcodec_send_packet(). Here, we pass the AVPacket down
+ // to the NVCUVID parser.
+ int BetaCudaDeviceInterface::sendPacket(ReferenceAVPacket& packet) {
+   if (cpuFallback_) {
+     return cpuFallback_->sendPacket(packet);
+   }
+
+   TORCH_CHECK(
+       packet.get() && packet->data && packet->size > 0,
+       "sendPacket received an empty packet, this is unexpected, please report.");
+
+   // Apply BSF if needed. We want applyBSF to return a *new* filtered packet,
+   // or the original one if no BSF is needed. This new filtered packet must be
+   // allocated outside of applyBSF: if it were allocated inside applyBSF, it
+   // would be destroyed at the end of the function, leaving us with a dangling
+   // reference.
+   AutoAVPacket filteredAutoPacket;
+   ReferenceAVPacket filteredPacket(filteredAutoPacket);
+   ReferenceAVPacket& packetToSend = applyBSF(packet, filteredPacket);
+
+   CUVIDSOURCEDATAPACKET cuvidPacket = {};
+   cuvidPacket.payload = packetToSend->data;
+   cuvidPacket.payload_size = packetToSend->size;
+   cuvidPacket.flags = CUVID_PKT_TIMESTAMP;
+   cuvidPacket.timestamp = packetToSend->pts;
+
+   return sendCuvidPacket(cuvidPacket);
+ }
+
+ int BetaCudaDeviceInterface::sendEOFPacket() {
+   if (cpuFallback_) {
+     return cpuFallback_->sendEOFPacket();
+   }
+
+   CUVIDSOURCEDATAPACKET cuvidPacket = {};
+   cuvidPacket.flags = CUVID_PKT_ENDOFSTREAM;
+   eofSent_ = true;
+
+   return sendCuvidPacket(cuvidPacket);
+ }
+
+ int BetaCudaDeviceInterface::sendCuvidPacket(
+     CUVIDSOURCEDATAPACKET& cuvidPacket) {
+   CUresult result = cuvidParseVideoData(videoParser_, &cuvidPacket);
+   return result == CUDA_SUCCESS ? AVSUCCESS : AVERROR_EXTERNAL;
+ }
+
+ ReferenceAVPacket& BetaCudaDeviceInterface::applyBSF(
+     ReferenceAVPacket& packet,
+     ReferenceAVPacket& filteredPacket) {
+   if (!bitstreamFilter_) {
+     return packet;
+   }
+
+   int retVal = av_bsf_send_packet(bitstreamFilter_.get(), packet.get());
+   TORCH_CHECK(
+       retVal >= AVSUCCESS,
+       "Failed to send packet to bitstream filter: ",
+       getFFMPEGErrorStringFromErrorCode(retVal));
+
+   // TODO P1: the docs mention there can theoretically be multiple output
+   // packets for a single input, i.e. we may need to call av_bsf_receive_packet
+   // more than once. We should figure out whether that applies to the BSF we're
+   // using.
+   retVal = av_bsf_receive_packet(bitstreamFilter_.get(), filteredPacket.get());
+   TORCH_CHECK(
+       retVal >= AVSUCCESS,
+       "Failed to receive packet from bitstream filter: ",
+       getFFMPEGErrorStringFromErrorCode(retVal));
+
+   return filteredPacket;
+ }
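
On the TODO above: FFmpeg's BSF API is a send/receive pair in which one input packet may, in principle, yield several output packets, and the generic pattern drains av_bsf_receive_packet() until it returns AVERROR(EAGAIN). A sketch of that generic loop (plain FFmpeg API, not code from this diff; the mp4toannexb-style filters used here are one-in/one-out in practice, which is what applyBSF() relies on):

    // Generic BSF drain loop: send one packet, then receive until the filter
    // asks for more input (EAGAIN) or reports an error.
    int filterPacket(AVBSFContext* bsf, AVPacket* in, AVPacket* out) {
      int ret = av_bsf_send_packet(bsf, in);
      if (ret < 0) {
        return ret;
      }
      while ((ret = av_bsf_receive_packet(bsf, out)) >= 0) {
        // Consume or queue `out` here, then keep draining.
        av_packet_unref(out);
      }
      return ret == AVERROR(EAGAIN) ? 0 : ret; // EAGAIN just means "feed more"
    }
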
+
+ // Parser triggers this callback within cuvidParseVideoData when a frame is
+ // ready to be decoded, i.e. the parser received all the necessary packets for
+ // a given frame. It means we can send that frame to be decoded by the
+ // hardware NVDEC decoder by calling cuvidDecodePicture, which is non-blocking.
+ int BetaCudaDeviceInterface::frameReadyForDecoding(CUVIDPICPARAMS* picParams) {
+   TORCH_CHECK(picParams != nullptr, "Invalid picture parameters");
+   TORCH_CHECK(decoder_, "Decoder not initialized before picture decode");
+   // Send frame to be decoded by NVDEC - non-blocking call.
+   CUresult result = cuvidDecodePicture(*decoder_.get(), picParams);
+
+   // Yes, you're reading that right: 0 means error, 1 means success.
+   return (result == CUDA_SUCCESS);
+ }
+
+ int BetaCudaDeviceInterface::frameReadyInDisplayOrder(
+     CUVIDPARSERDISPINFO* dispInfo) {
+   readyFrames_.push(*dispInfo);
+   return 1; // success
+ }
+
+ // Moral equivalent of avcodec_receive_frame().
+ int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) {
+   if (cpuFallback_) {
+     return cpuFallback_->receiveFrame(avFrame);
+   }
+
+   if (readyFrames_.empty()) {
+     // No frame found, instruct caller to try again later after sending more
+     // packets, or to stop if EOF was already sent.
+     return eofSent_ ? AVERROR_EOF : AVERROR(EAGAIN);
+   }
+
+   CUVIDPARSERDISPINFO dispInfo = readyFrames_.front();
+   readyFrames_.pop();
+
+   CUVIDPROCPARAMS procParams = {};
+   procParams.progressive_frame = dispInfo.progressive_frame;
+   procParams.top_field_first = dispInfo.top_field_first;
+   procParams.unpaired_field = dispInfo.repeat_first_field < 0;
+   // We set the NVDEC stream to the current stream. It will be waited upon by
+   // the NPP stream before any color conversion.
+   // Re types: we get a cudaStream_t from PyTorch, but it's interchangeable
+   // with CUstream.
+   procParams.output_stream = reinterpret_cast<CUstream>(
+       at::cuda::getCurrentCUDAStream(device_.index()).stream());
+
+   CUdeviceptr framePtr = 0;
+   unsigned int pitch = 0;
+
+   // We know the frame we want was sent to the hardware decoder, but now we
+   // need to "map" it to an "output surface" before we can use its data. This
+   // is a blocking call that waits until the frame is fully decoded and ready
+   // to be used.
+   // When a frame is mapped to an output surface, it needs to be unmapped
+   // eventually, so that the decoder can re-use the output surface. Failing to
+   // unmap will cause map to eventually fail. DALI unmaps frames almost
+   // immediately after mapping them: they do the color-conversion in-between,
+   // which involves a copy of the data, so that works.
+   // We, on the other hand, do the color-conversion later, outside of
+   // receiveFrame(). So we unmap here: just before mapping a new frame. At
+   // that point we know that the previously-mapped frame is no longer needed:
+   // it was either color-converted (with a copy), or it was a frame that was
+   // discarded in SingleStreamDecoder. Either way, the underlying output
+   // surface can be safely re-used.
+   unmapPreviousFrame();
+   CUresult result = cuvidMapVideoFrame(
+       *decoder_.get(), dispInfo.picture_index, &framePtr, &pitch, &procParams);
+   if (result != CUDA_SUCCESS) {
+     return AVERROR_EXTERNAL;
+   }
+   previouslyMappedFrame_ = framePtr;
+
+   avFrame = convertCudaFrameToAVFrame(framePtr, pitch, dispInfo);
+
+   return AVSUCCESS;
+ }
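
sendPacket() and receiveFrame() above follow the avcodec_send_packet()/avcodec_receive_frame() contract: AVERROR(EAGAIN) means more input is needed, and AVERROR_EOF means the queue is drained after EOF was sent. A caller-side sketch of that contract (demuxNextPacket() is a hypothetical stand-in for the demuxer, not an API from this diff):

    // Keep feeding packets until a frame pops out; send EOF when the demuxer
    // runs dry, after which receiveFrame() drains queued frames and then
    // returns AVERROR_EOF.
    int getNextFrame(BetaCudaDeviceInterface& iface, UniqueAVFrame& frame) {
      int status;
      while ((status = iface.receiveFrame(frame)) == AVERROR(EAGAIN)) {
        AutoAVPacket autoPacket;
        ReferenceAVPacket packet(autoPacket);
        if (demuxNextPacket(packet)) { // hypothetical demuxing helper
          iface.sendPacket(packet);
        } else {
          iface.sendEOFPacket();
        }
      }
      return status; // AVSUCCESS with a frame, or AVERROR_EOF
    }
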
+
+ void BetaCudaDeviceInterface::unmapPreviousFrame() {
+   if (previouslyMappedFrame_ == 0) {
+     return;
+   }
+   CUresult result =
+       cuvidUnmapVideoFrame(*decoder_.get(), previouslyMappedFrame_);
+   TORCH_CHECK(
+       result == CUDA_SUCCESS, "Failed to unmap previous frame: ", result);
+   previouslyMappedFrame_ = 0;
+ }
+
+ UniqueAVFrame BetaCudaDeviceInterface::convertCudaFrameToAVFrame(
+     CUdeviceptr framePtr,
+     unsigned int pitch,
+     const CUVIDPARSERDISPINFO& dispInfo) {
+   TORCH_CHECK(framePtr != 0, "Invalid CUDA frame pointer");
+
+   // Get frame dimensions from the video format display area (not coded
+   // dimensions). This matches DALI's approach and avoids padding issues.
+   int width = videoFormat_.display_area.right - videoFormat_.display_area.left;
+   int height = videoFormat_.display_area.bottom - videoFormat_.display_area.top;
+
+   TORCH_CHECK(width > 0 && height > 0, "Invalid frame dimensions");
+   TORCH_CHECK(
+       pitch >= static_cast<unsigned int>(width), "Pitch must be >= width");
+
+   UniqueAVFrame avFrame(av_frame_alloc());
+   TORCH_CHECK(avFrame.get() != nullptr, "Failed to allocate AVFrame");
+
+   avFrame->width = width;
+   avFrame->height = height;
+   avFrame->format = AV_PIX_FMT_CUDA;
+   avFrame->pts = dispInfo.timestamp;
+
+   // TODONVDEC P2: We compute the duration based on average frame rate info,
+   // so if the video has variable frame rate, the durations may be off. We
+   // should try to see if we can set the duration more accurately.
+   // Unfortunately it's not given by dispInfo. One option would be to set it
+   // based on the pts difference between consecutive frames, if the next frame
+   // is already available.
+   // Note that we used to rely on videoFormat_.frame_rate for this, but that
+   // proved less accurate than FFmpeg.
+   setDuration(avFrame, computeSafeDuration(frameRateAvgFromFFmpeg_, timeBase_));
+
+   // We need to assign the frame colorspace. This is crucial for proper color
+   // conversion. NVCUVID stores that in the matrix_coefficients field, but
+   // doesn't document the semantics of the values. Claude code generated this,
+   // which seems to work. Reassuringly, the values seem to match the
+   // corresponding indices in the FFmpeg enum for colorspace conversion
+   // (ff_yuv2rgb_coeffs):
+   // https://ffmpeg.org/doxygen/trunk/yuv2rgb_8c_source.html#l00047
+   switch (videoFormat_.video_signal_description.matrix_coefficients) {
+     case 1:
+       avFrame->colorspace = AVCOL_SPC_BT709;
+       break;
+     case 6:
+       avFrame->colorspace = AVCOL_SPC_SMPTE170M; // BT.601
+       break;
+     default:
+       // Default to BT.601
+       avFrame->colorspace = AVCOL_SPC_SMPTE170M;
+       break;
+   }
+
+   avFrame->color_range =
+       videoFormat_.video_signal_description.video_full_range_flag
+       ? AVCOL_RANGE_JPEG
+       : AVCOL_RANGE_MPEG;
+
+   // NV12 layout: the Y plane starts at framePtr and spans `height` rows of
+   // `pitch` bytes; the interleaved UV plane immediately follows it and uses
+   // the same pitch. There are only two planes, so data[2] and data[3] are
+   // null.
+   avFrame->data[0] = reinterpret_cast<uint8_t*>(framePtr);
+   avFrame->data[1] = reinterpret_cast<uint8_t*>(framePtr + (pitch * height));
+   avFrame->data[2] = nullptr;
+   avFrame->data[3] = nullptr;
+   avFrame->linesize[0] = pitch;
+   avFrame->linesize[1] = pitch;
+   avFrame->linesize[2] = 0;
+   avFrame->linesize[3] = 0;
+
+   return avFrame;
+ }
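
To make the plane arithmetic above concrete, here is the NV12 addressing it implies (illustrative helpers, not from this diff): the Y sample of pixel (x, y) lives at base + y * pitch + x, and its interleaved chroma pair lives in the UV plane that starts at base + pitch * height:

    #include <cstddef>
    #include <cstdint>

    // Illustrative NV12 addressing matching data[0]/data[1] above.
    inline uint8_t* nv12Y(uint8_t* base, size_t pitch, int x, int y) {
      return base + static_cast<size_t>(y) * pitch + x;
    }

    inline uint8_t* nv12UV(uint8_t* base, size_t pitch, int height, int x, int y) {
      uint8_t* uvPlane = base + pitch * static_cast<size_t>(height);
      // One interleaved U,V byte-pair covers a 2x2 block of luma pixels.
      return uvPlane + static_cast<size_t>(y / 2) * pitch + (x / 2) * 2;
    }
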
+
+ void BetaCudaDeviceInterface::flush() {
+   if (cpuFallback_) {
+     cpuFallback_->flush();
+     return;
+   }
+
+   // The NVCUVID docs mention that after seeking, i.e. when flush() is called,
+   // we should send a packet with the CUVID_PKT_DISCONTINUITY flag. The docs
+   // don't say whether this should be an empty packet, or whether it should be
+   // a flag on the next non-empty packet. It doesn't matter: neither works :)
+   // Sending an EOF packet, however, does work. So we do that. And we re-set
+   // the eofSent_ flag to false because that's not a true EOF notification.
+   sendEOFPacket();
+   eofSent_ = false;
+
+   std::queue<CUVIDPARSERDISPINFO> emptyQueue;
+   std::swap(readyFrames_, emptyQueue);
+ }
+
+ UniqueAVFrame BetaCudaDeviceInterface::transferCpuFrameToGpuNV12(
+     UniqueAVFrame& cpuFrame) {
+   // This is called in the context of the CPU fallback: the frame was decoded
+   // on the CPU, and in this function we convert that frame into NV12 format
+   // and send it to the GPU.
+   // We do that in 2 steps:
+   // - First we convert the input CPU frame into an intermediate NV12 CPU
+   //   frame using sws_scale.
+   // - Then we allocate GPU memory and copy the NV12 CPU frame to the GPU.
+   //   This is what we return.
+
+   TORCH_CHECK(cpuFrame != nullptr, "CPU frame cannot be null");
+
+   int width = cpuFrame->width;
+   int height = cpuFrame->height;
+
+   // Intermediate NV12 CPU frame. It's not on the GPU yet.
+   UniqueAVFrame nv12CpuFrame(av_frame_alloc());
+   TORCH_CHECK(nv12CpuFrame != nullptr, "Failed to allocate NV12 CPU frame");
+
+   nv12CpuFrame->format = AV_PIX_FMT_NV12;
+   nv12CpuFrame->width = width;
+   nv12CpuFrame->height = height;
+
+   int ret = av_frame_get_buffer(nv12CpuFrame.get(), 0);
+   TORCH_CHECK(
+       ret >= 0,
+       "Failed to allocate NV12 CPU frame buffer: ",
+       getFFMPEGErrorStringFromErrorCode(ret));
+
+   SwsFrameContext swsFrameContext(
+       width,
+       height,
+       static_cast<AVPixelFormat>(cpuFrame->format),
+       width,
+       height);
+
+   if (!swsContext_ || prevSwsFrameContext_ != swsFrameContext) {
+     swsContext_ = createSwsContext(
+         swsFrameContext, cpuFrame->colorspace, AV_PIX_FMT_NV12, SWS_BILINEAR);
+     prevSwsFrameContext_ = swsFrameContext;
+   }
+
+   int convertedHeight = sws_scale(
+       swsContext_.get(),
+       cpuFrame->data,
+       cpuFrame->linesize,
+       0,
+       height,
+       nv12CpuFrame->data,
+       nv12CpuFrame->linesize);
+   TORCH_CHECK(
+       convertedHeight == height, "sws_scale failed for CPU->NV12 conversion");
+
+   int ySize = width * height;
+   TORCH_CHECK(
+       ySize % 2 == 0,
+       "Y plane size must be even. Please report on TorchCodec repo.");
+   int uvSize = ySize / 2; // NV12: UV plane is half the size of Y plane
+   size_t totalSize = static_cast<size_t>(ySize + uvSize);
+
+   uint8_t* cudaBuffer = nullptr;
+   cudaError_t err =
+       cudaMalloc(reinterpret_cast<void**>(&cudaBuffer), totalSize);
+   TORCH_CHECK(
+       err == cudaSuccess,
+       "Failed to allocate CUDA memory: ",
+       cudaGetErrorString(err));
+
+   UniqueAVFrame gpuFrame(av_frame_alloc());
+   TORCH_CHECK(gpuFrame != nullptr, "Failed to allocate GPU AVFrame");
+
+   gpuFrame->format = AV_PIX_FMT_CUDA;
+   gpuFrame->width = width;
+   gpuFrame->height = height;
+   gpuFrame->data[0] = cudaBuffer;
+   gpuFrame->data[1] = cudaBuffer + ySize;
+   gpuFrame->linesize[0] = width;
+   gpuFrame->linesize[1] = width;
+
+   // Note that we use cudaMemcpy2D here instead of cudaMemcpy because the
+   // linesizes (strides) may be different from the widths for the input CPU
+   // frame. That's precisely what cudaMemcpy2D is for.
+   err = cudaMemcpy2D(
+       gpuFrame->data[0],
+       gpuFrame->linesize[0],
+       nv12CpuFrame->data[0],
+       nv12CpuFrame->linesize[0],
+       width,
+       height,
+       cudaMemcpyHostToDevice);
+   TORCH_CHECK(
+       err == cudaSuccess,
+       "Failed to copy Y plane to GPU: ",
+       cudaGetErrorString(err));
+
+   TORCH_CHECK(
+       height % 2 == 0,
+       "height must be even. Please report on TorchCodec repo.");
+   err = cudaMemcpy2D(
+       gpuFrame->data[1],
+       gpuFrame->linesize[1],
+       nv12CpuFrame->data[1],
+       nv12CpuFrame->linesize[1],
+       width,
+       height / 2,
+       cudaMemcpyHostToDevice);
+   TORCH_CHECK(
+       err == cudaSuccess,
+       "Failed to copy UV plane to GPU: ",
+       cudaGetErrorString(err));
+
+   ret = av_frame_copy_props(gpuFrame.get(), cpuFrame.get());
+   TORCH_CHECK(
+       ret >= 0,
+       "Failed to copy frame properties: ",
+       getFFMPEGErrorStringFromErrorCode(ret));
+
+   // We're almost done, but we need to make sure the CUDA memory is freed
+   // properly. Usually, AVFrame data is freed when av_frame_free() is called
+   // (upon UniqueAVFrame destruction), but since we allocated the CUDA memory
+   // ourselves, FFmpeg doesn't know how to free it. The recommended way to
+   // deal with this is to associate the opaque_ref field of the AVFrame with a
+   // `free` callback that will then be called by av_frame_free().
+   gpuFrame->opaque_ref = av_buffer_create(
+       nullptr, // data - we don't need any
+       0, // data size
+       cudaBufferFreeCallback, // callback triggered by av_frame_free()
+       cudaBuffer, // parameter to callback
+       0); // flags
+   TORCH_CHECK(
+       gpuFrame->opaque_ref != nullptr,
+       "Failed to create GPU memory cleanup reference");
+
+   return gpuFrame;
+ }
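
To make the pitch distinction above concrete: av_frame_get_buffer() may pad each row for alignment, so a frame's linesize can exceed its width, while the destination buffer here is tightly packed (linesize == width). cudaMemcpy2D() handles exactly that: it copies `width` valid bytes per row while advancing source and destination by their own strides. A minimal sketch of that semantic (hypothetical helper, plain CUDA runtime API):

    #include <cuda_runtime.h>
    #include <cstdint>

    // Copy `rows` rows of `widthBytes` valid bytes each, where source and
    // destination may use different row strides (pitches).
    cudaError_t copyPlaneToDevice(
        uint8_t* dst, size_t dstPitch,
        const uint8_t* src, size_t srcPitch,
        size_t widthBytes, size_t rows) {
      return cudaMemcpy2D(
          dst, dstPitch, src, srcPitch, widthBytes, rows,
          cudaMemcpyHostToDevice);
    }
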
+
+ void BetaCudaDeviceInterface::convertAVFrameToFrameOutput(
+     UniqueAVFrame& avFrame,
+     FrameOutput& frameOutput,
+     std::optional<torch::Tensor> preAllocatedOutputTensor) {
+   UniqueAVFrame gpuFrame =
+       cpuFallback_ ? transferCpuFrameToGpuNV12(avFrame) : std::move(avFrame);
+
+   // TODONVDEC P2: we may need to handle 10bit videos the same way the CUDA
+   // ffmpeg interface does it with maybeConvertAVFrameToNV12OrRGB24().
+   TORCH_CHECK(
+       gpuFrame->format == AV_PIX_FMT_CUDA,
+       "Expected CUDA format frame from BETA CUDA interface");
+
+   validatePreAllocatedTensorShape(preAllocatedOutputTensor, gpuFrame);
+
+   at::cuda::CUDAStream nvdecStream =
+       at::cuda::getCurrentCUDAStream(device_.index());
+
+   frameOutput.data = convertNV12FrameToRGB(
+       gpuFrame, device_, nppCtx_, nvdecStream, preAllocatedOutputTensor);
+ }
+
+ std::string BetaCudaDeviceInterface::getDetails() {
+   std::string details = "Beta CUDA Device Interface.";
+   if (cpuFallback_) {
+     details += " Using CPU fallback.";
+     if (!nvcuvidAvailable_) {
+       details += " NVCUVID not available!";
+     }
+   } else {
+     details += " Using NVDEC.";
+   }
+   return details;
+ }
+
+ } // namespace facebook::torchcodec