torchcodec 0.7.0__cp313-cp313-win_amd64.whl → 0.8.1__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of torchcodec has been flagged as potentially problematic; see the package registry's advisory page for more details.

Files changed (66)
  1. torchcodec/_core/AVIOTensorContext.cpp +23 -16
  2. torchcodec/_core/AVIOTensorContext.h +2 -1
  3. torchcodec/_core/BetaCudaDeviceInterface.cpp +718 -0
  4. torchcodec/_core/BetaCudaDeviceInterface.h +193 -0
  5. torchcodec/_core/CMakeLists.txt +18 -3
  6. torchcodec/_core/CUDACommon.cpp +330 -0
  7. torchcodec/_core/CUDACommon.h +51 -0
  8. torchcodec/_core/Cache.h +6 -20
  9. torchcodec/_core/CpuDeviceInterface.cpp +195 -108
  10. torchcodec/_core/CpuDeviceInterface.h +84 -19
  11. torchcodec/_core/CudaDeviceInterface.cpp +227 -376
  12. torchcodec/_core/CudaDeviceInterface.h +38 -6
  13. torchcodec/_core/DeviceInterface.cpp +57 -19
  14. torchcodec/_core/DeviceInterface.h +97 -16
  15. torchcodec/_core/Encoder.cpp +346 -9
  16. torchcodec/_core/Encoder.h +62 -1
  17. torchcodec/_core/FFMPEGCommon.cpp +190 -3
  18. torchcodec/_core/FFMPEGCommon.h +27 -1
  19. torchcodec/_core/FilterGraph.cpp +30 -22
  20. torchcodec/_core/FilterGraph.h +15 -1
  21. torchcodec/_core/Frame.cpp +22 -7
  22. torchcodec/_core/Frame.h +15 -61
  23. torchcodec/_core/Metadata.h +2 -2
  24. torchcodec/_core/NVCUVIDRuntimeLoader.cpp +320 -0
  25. torchcodec/_core/NVCUVIDRuntimeLoader.h +14 -0
  26. torchcodec/_core/NVDECCache.cpp +60 -0
  27. torchcodec/_core/NVDECCache.h +102 -0
  28. torchcodec/_core/SingleStreamDecoder.cpp +196 -201
  29. torchcodec/_core/SingleStreamDecoder.h +42 -15
  30. torchcodec/_core/StreamOptions.h +16 -6
  31. torchcodec/_core/Transform.cpp +87 -0
  32. torchcodec/_core/Transform.h +84 -0
  33. torchcodec/_core/__init__.py +4 -0
  34. torchcodec/_core/custom_ops.cpp +257 -32
  35. torchcodec/_core/fetch_and_expose_non_gpl_ffmpeg_libs.cmake +61 -1
  36. torchcodec/_core/nvcuvid_include/cuviddec.h +1374 -0
  37. torchcodec/_core/nvcuvid_include/nvcuvid.h +610 -0
  38. torchcodec/_core/ops.py +147 -44
  39. torchcodec/_core/pybind_ops.cpp +22 -59
  40. torchcodec/_samplers/video_clip_sampler.py +7 -19
  41. torchcodec/decoders/__init__.py +1 -0
  42. torchcodec/decoders/_decoder_utils.py +61 -1
  43. torchcodec/decoders/_video_decoder.py +46 -20
  44. torchcodec/libtorchcodec_core4.dll +0 -0
  45. torchcodec/libtorchcodec_core5.dll +0 -0
  46. torchcodec/libtorchcodec_core6.dll +0 -0
  47. torchcodec/libtorchcodec_core7.dll +0 -0
  48. torchcodec/libtorchcodec_core8.dll +0 -0
  49. torchcodec/libtorchcodec_custom_ops4.dll +0 -0
  50. torchcodec/libtorchcodec_custom_ops5.dll +0 -0
  51. torchcodec/libtorchcodec_custom_ops6.dll +0 -0
  52. torchcodec/libtorchcodec_custom_ops7.dll +0 -0
  53. torchcodec/libtorchcodec_custom_ops8.dll +0 -0
  54. torchcodec/libtorchcodec_pybind_ops4.pyd +0 -0
  55. torchcodec/libtorchcodec_pybind_ops5.pyd +0 -0
  56. torchcodec/libtorchcodec_pybind_ops6.pyd +0 -0
  57. torchcodec/libtorchcodec_pybind_ops7.pyd +0 -0
  58. torchcodec/libtorchcodec_pybind_ops8.pyd +0 -0
  59. torchcodec/samplers/_time_based.py +8 -0
  60. torchcodec/version.py +1 -1
  61. {torchcodec-0.7.0.dist-info → torchcodec-0.8.1.dist-info}/METADATA +29 -16
  62. torchcodec-0.8.1.dist-info/RECORD +82 -0
  63. {torchcodec-0.7.0.dist-info → torchcodec-0.8.1.dist-info}/WHEEL +1 -1
  64. torchcodec-0.7.0.dist-info/RECORD +0 -67
  65. {torchcodec-0.7.0.dist-info → torchcodec-0.8.1.dist-info}/licenses/LICENSE +0 -0
  66. {torchcodec-0.7.0.dist-info → torchcodec-0.8.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,718 @@
1
+ // Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ // All rights reserved.
3
+ //
4
+ // This source code is licensed under the BSD-style license found in the
5
+ // LICENSE file in the root directory of this source tree.
6
+
7
#include <c10/cuda/CUDAStream.h>
#include <torch/types.h>
#include <memory>
#include <mutex>
#include <vector>

12
+ #include "src/torchcodec/_core/BetaCudaDeviceInterface.h"
13
+
14
+ #include "src/torchcodec/_core/DeviceInterface.h"
15
+ #include "src/torchcodec/_core/FFMPEGCommon.h"
16
+ #include "src/torchcodec/_core/NVDECCache.h"
17
+
18
+ #include "src/torchcodec/_core/NVCUVIDRuntimeLoader.h"
19
+ #include "src/torchcodec/_core/nvcuvid_include/cuviddec.h"
20
+ #include "src/torchcodec/_core/nvcuvid_include/nvcuvid.h"
21
+
22
+ extern "C" {
23
+ #include <libavutil/hwcontext_cuda.h>
24
+ #include <libavutil/pixdesc.h>
25
+ }
26
+
27
+ namespace facebook::torchcodec {
28
+
29
+ namespace {
30
+
31
// Self-registration of the "beta" CUDA device interface: runs at static
// initialization time so the factory is available before any decoder is
// constructed. The ctor asserts on this flag to catch registration failures.
// NOTE(review): the factory returns a raw `new`-ed pointer — presumably the
// registry takes ownership; confirm against registerDeviceInterface's contract.
static bool g_cuda_beta = registerDeviceInterface(
    DeviceInterfaceKey(torch::kCUDA, /*variant=*/"beta"),
    [](const torch::Device& device) {
      return new BetaCudaDeviceInterface(device);
    });
36
+
37
+ static int CUDAAPI
38
+ pfnSequenceCallback(void* pUserData, CUVIDEOFORMAT* videoFormat) {
39
+ auto decoder = static_cast<BetaCudaDeviceInterface*>(pUserData);
40
+ return decoder->streamPropertyChange(videoFormat);
41
+ }
42
+
43
+ static int CUDAAPI
44
+ pfnDecodePictureCallback(void* pUserData, CUVIDPICPARAMS* picParams) {
45
+ auto decoder = static_cast<BetaCudaDeviceInterface*>(pUserData);
46
+ return decoder->frameReadyForDecoding(picParams);
47
+ }
48
+
49
+ static int CUDAAPI
50
+ pfnDisplayPictureCallback(void* pUserData, CUVIDPARSERDISPINFO* dispInfo) {
51
+ auto decoder = static_cast<BetaCudaDeviceInterface*>(pUserData);
52
+ return decoder->frameReadyInDisplayOrder(dispInfo);
53
+ }
54
+
55
+ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
56
+ // Decoder creation parameters, most are taken from DALI
57
+ CUVIDDECODECREATEINFO decoderParams = {};
58
+ decoderParams.bitDepthMinus8 = videoFormat->bit_depth_luma_minus8;
59
+ decoderParams.ChromaFormat = videoFormat->chroma_format;
60
+ // We explicitly request NV12 format, which means 10bit videos will be
61
+ // automatically converted to 8bits by NVDEC itself. That is, the raw frames
62
+ // we get back from cuvidMapVideoFrame will already be in 8bit format. We
63
+ // won't need to do the conversion ourselves, so that's a lot easier.
64
+ // In the ffmpeg CUDA interface, we have to do the 10 -> 8bits conversion
65
+ // ourselves later in convertAVFrameToFrameOutput(), because FFmpeg explicitly
66
+ // requests 10 or 16bits output formats for >8-bit videos!
67
+ // https://github.com/FFmpeg/FFmpeg/blob/e05f8acabff468c1382277c1f31fa8e9d90c3202/libavcodec/nvdec.c#L376-L403
68
+ decoderParams.OutputFormat = cudaVideoSurfaceFormat_NV12;
69
+ decoderParams.ulCreationFlags = cudaVideoCreate_Default;
70
+ decoderParams.CodecType = videoFormat->codec;
71
+ decoderParams.ulHeight = videoFormat->coded_height;
72
+ decoderParams.ulWidth = videoFormat->coded_width;
73
+ decoderParams.ulMaxHeight = videoFormat->coded_height;
74
+ decoderParams.ulMaxWidth = videoFormat->coded_width;
75
+ decoderParams.ulTargetHeight =
76
+ videoFormat->display_area.bottom - videoFormat->display_area.top;
77
+ decoderParams.ulTargetWidth =
78
+ videoFormat->display_area.right - videoFormat->display_area.left;
79
+ decoderParams.ulNumDecodeSurfaces = videoFormat->min_num_decode_surfaces;
80
+ // We should only ever need 1 output surface, since we process frames
81
+ // sequentially, and we always unmap the previous frame before mapping a new
82
+ // one.
83
+ // TODONVDEC P3: set this to 2, allow for 2 frames to be mapped at a time, and
84
+ // benchmark to see if this makes any difference.
85
+ decoderParams.ulNumOutputSurfaces = 1;
86
+ decoderParams.display_area.left = videoFormat->display_area.left;
87
+ decoderParams.display_area.right = videoFormat->display_area.right;
88
+ decoderParams.display_area.top = videoFormat->display_area.top;
89
+ decoderParams.display_area.bottom = videoFormat->display_area.bottom;
90
+
91
+ CUvideodecoder* decoder = new CUvideodecoder();
92
+ CUresult result = cuvidCreateDecoder(decoder, &decoderParams);
93
+ TORCH_CHECK(
94
+ result == CUDA_SUCCESS, "Failed to create NVDEC decoder: ", result);
95
+ return UniqueCUvideodecoder(decoder, CUvideoDecoderDeleter{});
96
+ }
97
+
98
+ std::optional<cudaVideoChromaFormat> validateChromaSupport(
99
+ const AVPixFmtDescriptor* desc) {
100
+ // Return the corresponding cudaVideoChromaFormat if supported, std::nullopt
101
+ // otherwise.
102
+ TORCH_CHECK(desc != nullptr, "desc can't be null");
103
+
104
+ if (desc->nb_components == 1) {
105
+ return cudaVideoChromaFormat_Monochrome;
106
+ } else if (desc->nb_components >= 3 && !(desc->flags & AV_PIX_FMT_FLAG_RGB)) {
107
+ // Make sure it's YUV: has chroma planes and isn't RGB
108
+ if (desc->log2_chroma_w == 0 && desc->log2_chroma_h == 0) {
109
+ return cudaVideoChromaFormat_444; // 1x1 subsampling = 4:4:4
110
+ } else if (desc->log2_chroma_w == 1 && desc->log2_chroma_h == 1) {
111
+ return cudaVideoChromaFormat_420; // 2x2 subsampling = 4:2:0
112
+ } else if (desc->log2_chroma_w == 1 && desc->log2_chroma_h == 0) {
113
+ return cudaVideoChromaFormat_422; // 2x1 subsampling = 4:2:2
114
+ }
115
+ }
116
+
117
+ return std::nullopt;
118
+ }
119
+
120
// Maps an FFmpeg codec id onto the matching cudaVideoCodec. Used both when
// probing stream support (nativeNVDECSupport) and when creating the parser
// (initialize).
std::optional<cudaVideoCodec> validateCodecSupport(AVCodecID codecId) {
  // Return the corresponding cudaVideoCodec if supported, std::nullopt
  // otherwise
  // Note that we currently return nullopt (and thus fallback to CPU) for some
  // codecs that are technically supported by NVDEC, see comment below.
  switch (codecId) {
    case AV_CODEC_ID_H264:
      return cudaVideoCodec_H264;
    case AV_CODEC_ID_HEVC:
      return cudaVideoCodec_HEVC;
    case AV_CODEC_ID_AV1:
      return cudaVideoCodec_AV1;
    case AV_CODEC_ID_VP9:
      return cudaVideoCodec_VP9;
    case AV_CODEC_ID_VP8:
      return cudaVideoCodec_VP8;
    case AV_CODEC_ID_MPEG4:
      return cudaVideoCodec_MPEG4;
    // Formats below are currently not tested, but they should "mostly" work.
    // MPEG1 was briefly locally tested and it was ok-ish despite duration being
    // off. Since they're far less popular, we keep them disabled by default but
    // we can consider enabling them upon user requests.
    // case AV_CODEC_ID_MPEG1VIDEO:
    //   return cudaVideoCodec_MPEG1;
    // case AV_CODEC_ID_MPEG2VIDEO:
    //   return cudaVideoCodec_MPEG2;
    // case AV_CODEC_ID_MJPEG:
    //   return cudaVideoCodec_JPEG;
    // case AV_CODEC_ID_VC1:
    //   return cudaVideoCodec_VC1;
    default:
      return std::nullopt;
  }
}
154
+
155
bool nativeNVDECSupport(const SharedAVCodecContext& codecContext) {
  // Return true iff the input video stream is supported by our NVDEC
  // implementation. Every failed check below returns false, which makes the
  // caller (initialize) switch to the CPU fallback.

  auto codecType = validateCodecSupport(codecContext->codec_id);
  if (!codecType.has_value()) {
    return false;
  }

  const AVPixFmtDescriptor* desc = av_pix_fmt_desc_get(codecContext->pix_fmt);
  if (!desc) {
    return false;
  }

  auto chromaFormat = validateChromaSupport(desc);
  if (!chromaFormat.has_value()) {
    return false;
  }

  // Query the device's decoder capabilities for this codec / chroma format /
  // bit-depth combination.
  auto caps = CUVIDDECODECAPS{};
  caps.eCodecType = codecType.value();
  caps.eChromaFormat = chromaFormat.value();
  caps.nBitDepthMinus8 = desc->comp[0].depth - 8;

  CUresult result = cuvidGetDecoderCaps(&caps);
  if (result != CUDA_SUCCESS) {
    return false;
  }

  if (!caps.bIsSupported) {
    return false;
  }

  // Coded dimensions must fall within the device's supported range.
  auto coded_width = static_cast<unsigned int>(codecContext->coded_width);
  auto coded_height = static_cast<unsigned int>(codecContext->coded_height);
  if (coded_width < static_cast<unsigned int>(caps.nMinWidth) ||
      coded_height < static_cast<unsigned int>(caps.nMinHeight) ||
      coded_width > caps.nMaxWidth || coded_height > caps.nMaxHeight) {
    return false;
  }

  // See nMaxMBCount in cuviddec.h
  constexpr unsigned int macroblockConstant = 256;
  if (coded_width * coded_height / macroblockConstant > caps.nMaxMBCount) {
    return false;
  }

  // We'll set the decoderParams.OutputFormat to NV12, so we need to make
  // sure it's actually supported.
  // TODO: If this fail, we could consider decoding to something else than NV12
  // (like cudaVideoSurfaceFormat_P016) instead of falling back to CPU. This is
  // what FFmpeg does.
  bool supportsNV12Output =
      (caps.nOutputFormatMask >> cudaVideoSurfaceFormat_NV12) & 1;
  if (!supportsNV12Output) {
    return false;
  }

  return true;
}
215
+
216
+ } // namespace
217
+
218
// Constructs the interface for the given CUDA device: primes the CUDA context
// through PyTorch, acquires an NPP stream context (used later for NV12->RGB
// conversion), and attempts to load the NVCUVID runtime. Availability of
// NVCUVID is only acted upon in initialize(), where it decides whether the
// native NVDEC path or the CPU fallback is used.
BetaCudaDeviceInterface::BetaCudaDeviceInterface(const torch::Device& device)
    : DeviceInterface(device) {
  TORCH_CHECK(g_cuda_beta, "BetaCudaDeviceInterface was not registered!");
  TORCH_CHECK(
      device_.type() == torch::kCUDA, "Unsupported device: ", device_.str());

  initializeCudaContextWithPytorch(device_);
  nppCtx_ = getNppStreamContext(device_);

  nvcuvidAvailable_ = loadNVCUVIDLibrary();
}
229
+
230
// Tears down the NVDEC pipeline: the decoder is flushed, its last mapped
// output surface is released, and the decoder itself is returned to the
// per-device cache for reuse. The parser is destroyed outright.
BetaCudaDeviceInterface::~BetaCudaDeviceInterface() {
  if (decoder_) {
    // DALI doesn't seem to do any particular cleanup of the decoder before
    // sending it to the cache, so we probably don't need to do anything either.
    // Just to be safe, we flush.
    // What happens to those decode surfaces that haven't yet been mapped is
    // unclear.
    flush();
    // Release the last mapped output surface before handing the decoder back,
    // so the cached decoder starts with all output surfaces available.
    unmapPreviousFrame();
    NVDECCache::getCache(device_).returnDecoder(
        &videoFormat_, std::move(decoder_));
  }

  if (videoParser_) {
    cuvidDestroyVideoParser(videoParser_);
    videoParser_ = nullptr;
  }

  // Return the NPP context to its cache so other interfaces on this device
  // can reuse it.
  returnNppStreamContextToCache(device_, std::move(nppCtx_));
}
250
+
251
+ void BetaCudaDeviceInterface::initialize(
252
+ const AVStream* avStream,
253
+ const UniqueDecodingAVFormatContext& avFormatCtx,
254
+ [[maybe_unused]] const SharedAVCodecContext& codecContext) {
255
+ if (!nvcuvidAvailable_ || !nativeNVDECSupport(codecContext)) {
256
+ cpuFallback_ = createDeviceInterface(torch::kCPU);
257
+ TORCH_CHECK(
258
+ cpuFallback_ != nullptr, "Failed to create CPU device interface");
259
+ cpuFallback_->initialize(avStream, avFormatCtx, codecContext);
260
+ cpuFallback_->initializeVideo(
261
+ VideoStreamOptions(),
262
+ {},
263
+ /*resizedOutputDims=*/std::nullopt);
264
+ // We'll always use the CPU fallback from now on, so we can return early.
265
+ return;
266
+ }
267
+
268
+ TORCH_CHECK(avStream != nullptr, "AVStream cannot be null");
269
+ timeBase_ = avStream->time_base;
270
+ frameRateAvgFromFFmpeg_ = avStream->r_frame_rate;
271
+
272
+ const AVCodecParameters* codecPar = avStream->codecpar;
273
+ TORCH_CHECK(codecPar != nullptr, "CodecParameters cannot be null");
274
+
275
+ initializeBSF(codecPar, avFormatCtx);
276
+
277
+ // Create parser. Default values that aren't obvious are taken from DALI.
278
+ CUVIDPARSERPARAMS parserParams = {};
279
+ auto codecType = validateCodecSupport(codecPar->codec_id);
280
+ TORCH_CHECK(
281
+ codecType.has_value(),
282
+ "This should never happen, we should be using the CPU fallback by now. Please report a bug.");
283
+ parserParams.CodecType = codecType.value();
284
+ parserParams.ulMaxNumDecodeSurfaces = 8;
285
+ parserParams.ulMaxDisplayDelay = 0;
286
+ // Callback setup, all are triggered by the parser within a call
287
+ // to cuvidParseVideoData
288
+ parserParams.pUserData = this;
289
+ parserParams.pfnSequenceCallback = pfnSequenceCallback;
290
+ parserParams.pfnDecodePicture = pfnDecodePictureCallback;
291
+ parserParams.pfnDisplayPicture = pfnDisplayPictureCallback;
292
+
293
+ CUresult result = cuvidCreateVideoParser(&videoParser_, &parserParams);
294
+ TORCH_CHECK(
295
+ result == CUDA_SUCCESS, "Failed to create video parser: ", result);
296
+ }
297
+
298
+ void BetaCudaDeviceInterface::initializeBSF(
299
+ const AVCodecParameters* codecPar,
300
+ const UniqueDecodingAVFormatContext& avFormatCtx) {
301
+ // Setup bit stream filters (BSF):
302
+ // https://ffmpeg.org/doxygen/7.0/group__lavc__bsf.html
303
+ // This is only needed for some formats, like H264 or HEVC.
304
+
305
+ TORCH_CHECK(codecPar != nullptr, "codecPar cannot be null");
306
+ TORCH_CHECK(avFormatCtx != nullptr, "AVFormatContext cannot be null");
307
+ TORCH_CHECK(
308
+ avFormatCtx->iformat != nullptr,
309
+ "AVFormatContext->iformat cannot be null");
310
+ std::string filterName;
311
+
312
+ // Matching logic is taken from DALI
313
+ switch (codecPar->codec_id) {
314
+ case AV_CODEC_ID_H264: {
315
+ const std::string formatName = avFormatCtx->iformat->long_name
316
+ ? avFormatCtx->iformat->long_name
317
+ : "";
318
+
319
+ if (formatName == "QuickTime / MOV" ||
320
+ formatName == "FLV (Flash Video)" ||
321
+ formatName == "Matroska / WebM" || formatName == "raw H.264 video") {
322
+ filterName = "h264_mp4toannexb";
323
+ }
324
+ break;
325
+ }
326
+
327
+ case AV_CODEC_ID_HEVC: {
328
+ const std::string formatName = avFormatCtx->iformat->long_name
329
+ ? avFormatCtx->iformat->long_name
330
+ : "";
331
+
332
+ if (formatName == "QuickTime / MOV" ||
333
+ formatName == "FLV (Flash Video)" ||
334
+ formatName == "Matroska / WebM" || formatName == "raw HEVC video") {
335
+ filterName = "hevc_mp4toannexb";
336
+ }
337
+ break;
338
+ }
339
+ case AV_CODEC_ID_MPEG4: {
340
+ const std::string formatName =
341
+ avFormatCtx->iformat->name ? avFormatCtx->iformat->name : "";
342
+ if (formatName == "avi") {
343
+ filterName = "mpeg4_unpack_bframes";
344
+ }
345
+ break;
346
+ }
347
+
348
+ default:
349
+ // No bitstream filter needed for other codecs
350
+ break;
351
+ }
352
+
353
+ if (filterName.empty()) {
354
+ // Only initialize BSF if we actually need one
355
+ return;
356
+ }
357
+
358
+ const AVBitStreamFilter* avBSF = av_bsf_get_by_name(filterName.c_str());
359
+ TORCH_CHECK(
360
+ avBSF != nullptr, "Failed to find bitstream filter: ", filterName);
361
+
362
+ AVBSFContext* avBSFContext = nullptr;
363
+ int retVal = av_bsf_alloc(avBSF, &avBSFContext);
364
+ TORCH_CHECK(
365
+ retVal >= AVSUCCESS,
366
+ "Failed to allocate bitstream filter: ",
367
+ getFFMPEGErrorStringFromErrorCode(retVal));
368
+
369
+ bitstreamFilter_.reset(avBSFContext);
370
+
371
+ retVal = avcodec_parameters_copy(bitstreamFilter_->par_in, codecPar);
372
+ TORCH_CHECK(
373
+ retVal >= AVSUCCESS,
374
+ "Failed to copy codec parameters: ",
375
+ getFFMPEGErrorStringFromErrorCode(retVal));
376
+
377
+ retVal = av_bsf_init(bitstreamFilter_.get());
378
+ TORCH_CHECK(
379
+ retVal == AVSUCCESS,
380
+ "Failed to initialize bitstream filter: ",
381
+ getFFMPEGErrorStringFromErrorCode(retVal));
382
+ }
383
+
384
// This callback is called by the parser within cuvidParseVideoData when there
// is a change in the stream's properties (like resolution change), as specified
// by CUVIDEOFORMAT. Particularly (but not just!), this is called at the very
// start of the stream.
// TODONVDEC P1: Code below mostly assume this is called only once at the start,
// we should handle the case of multiple calls. Probably need to flush buffers,
// etc.
int BetaCudaDeviceInterface::streamPropertyChange(CUVIDEOFORMAT* videoFormat) {
  TORCH_CHECK(videoFormat != nullptr, "Invalid video format");

  // Keep a copy: frame dimensions, colorspace info and the decoder cache key
  // are all derived from videoFormat_ later on.
  videoFormat_ = *videoFormat;

  if (videoFormat_.min_num_decode_surfaces == 0) {
    // Same as DALI's fallback
    videoFormat_.min_num_decode_surfaces = 20;
  }

  if (!decoder_) {
    // Prefer a cached decoder for this format over creating a new one.
    decoder_ = NVDECCache::getCache(device_).getDecoder(videoFormat);

    if (!decoder_) {
      // TODONVDEC P2: consider re-configuring an existing decoder instead of
      // re-creating one. See docs, see DALI. Re-configuration doesn't seem to
      // be enabled in DALI by default.
      decoder_ = createDecoder(videoFormat);
    }

    TORCH_CHECK(decoder_, "Failed to get or create decoder");
  }

  // DALI also returns min_num_decode_surfaces from this function. This
  // instructs the parser to reset its ulMaxNumDecodeSurfaces field to this
  // value.
  return static_cast<int>(videoFormat_.min_num_decode_surfaces);
}
419
+
420
// Moral equivalent of avcodec_send_packet(). Here, we pass the AVPacket down to
// the NVCUVID parser. Returns AVSUCCESS on success, a negative AVERROR code
// otherwise.
int BetaCudaDeviceInterface::sendPacket(ReferenceAVPacket& packet) {
  if (cpuFallback_) {
    return cpuFallback_->sendPacket(packet);
  }

  TORCH_CHECK(
      packet.get() && packet->data && packet->size > 0,
      "sendPacket received an empty packet, this is unexpected, please report.");

  // Apply BSF if needed. We want applyBSF to return a *new* filtered packet, or
  // the original one if no BSF is needed. This new filtered packet must be
  // allocated outside of applyBSF: if it were allocated inside applyBSF, it
  // would be destroyed at the end of the function, leaving us with a dangling
  // reference.
  AutoAVPacket filteredAutoPacket;
  ReferenceAVPacket filteredPacket(filteredAutoPacket);
  ReferenceAVPacket& packetToSend = applyBSF(packet, filteredPacket);

  CUVIDSOURCEDATAPACKET cuvidPacket = {};
  cuvidPacket.payload = packetToSend->data;
  cuvidPacket.payload_size = packetToSend->size;
  // Forward the packet's pts: the parser reports it back through the display
  // callback, and convertCudaFrameToAVFrame() uses it as the frame pts.
  cuvidPacket.flags = CUVID_PKT_TIMESTAMP;
  cuvidPacket.timestamp = packetToSend->pts;

  return sendCuvidPacket(cuvidPacket);
}
448
+
449
// Signals end-of-stream to the parser so it drains any buffered frames.
int BetaCudaDeviceInterface::sendEOFPacket() {
  if (cpuFallback_) {
    return cpuFallback_->sendEOFPacket();
  }

  CUVIDSOURCEDATAPACKET cuvidPacket = {};
  cuvidPacket.flags = CUVID_PKT_ENDOFSTREAM;
  // Record EOF before parsing: receiveFrame() reads this flag to decide
  // between AVERROR(EAGAIN) and AVERROR_EOF when the ready queue is empty.
  eofSent_ = true;

  return sendCuvidPacket(cuvidPacket);
}
460
+
461
+ int BetaCudaDeviceInterface::sendCuvidPacket(
462
+ CUVIDSOURCEDATAPACKET& cuvidPacket) {
463
+ CUresult result = cuvidParseVideoData(videoParser_, &cuvidPacket);
464
+ return result == CUDA_SUCCESS ? AVSUCCESS : AVERROR_EXTERNAL;
465
+ }
466
+
467
// Runs the configured bitstream filter on `packet`, writing the result into
// `filteredPacket` and returning a reference to it. When no filter is
// configured, returns `packet` itself untouched.
ReferenceAVPacket& BetaCudaDeviceInterface::applyBSF(
    ReferenceAVPacket& packet,
    ReferenceAVPacket& filteredPacket) {
  if (!bitstreamFilter_) {
    return packet;
  }

  int retVal = av_bsf_send_packet(bitstreamFilter_.get(), packet.get());
  TORCH_CHECK(
      retVal >= AVSUCCESS,
      "Failed to send packet to bitstream filter: ",
      getFFMPEGErrorStringFromErrorCode(retVal));

  // TODO P1: the docs mention there can theoretically be multiple output
  // packets for a single input, i.e. we may need to call av_bsf_receive_packet
  // more than once. We should figure out whether that applies to the BSF we're
  // using.
  retVal = av_bsf_receive_packet(bitstreamFilter_.get(), filteredPacket.get());
  TORCH_CHECK(
      retVal >= AVSUCCESS,
      "Failed to receive packet from bitstream filter: ",
      getFFMPEGErrorStringFromErrorCode(retVal));

  return filteredPacket;
}
492
+
493
// Parser triggers this callback within cuvidParseVideoData when a frame is
// ready to be decoded, i.e. the parser received all the necessary packets for a
// given frame. It means we can send that frame to be decoded by the hardware
// NVDEC decoder by calling cuvidDecodePicture which is non-blocking.
int BetaCudaDeviceInterface::frameReadyForDecoding(CUVIDPICPARAMS* picParams) {
  TORCH_CHECK(picParams != nullptr, "Invalid picture parameters");
  TORCH_CHECK(decoder_, "Decoder not initialized before picture decode");
  // Send frame to be decoded by NVDEC - non-blocking call.
  CUresult result = cuvidDecodePicture(*decoder_.get(), picParams);

  // Yes, you're reading that right, 0 means error, 1 means success
  return (result == CUDA_SUCCESS);
}
506
+
507
// Parser callback: a decoded frame is available in display order. We only
// queue it here; receiveFrame() consumes the queue later.
int BetaCudaDeviceInterface::frameReadyInDisplayOrder(
    CUVIDPARSERDISPINFO* dispInfo) {
  readyFrames_.push(*dispInfo);
  return 1; // success
}
512
+
513
// Moral equivalent of avcodec_receive_frame(). Pops the next frame in display
// order, maps it to an output surface, and wraps it into an AVFrame. Returns
// AVSUCCESS, AVERROR(EAGAIN) (need more packets), AVERROR_EOF, or
// AVERROR_EXTERNAL on mapping failure.
int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) {
  if (cpuFallback_) {
    return cpuFallback_->receiveFrame(avFrame);
  }

  if (readyFrames_.empty()) {
    // No frame found, instruct caller to try again later after sending more
    // packets, or to stop if EOF was already sent.
    return eofSent_ ? AVERROR_EOF : AVERROR(EAGAIN);
  }

  CUVIDPARSERDISPINFO dispInfo = readyFrames_.front();
  readyFrames_.pop();

  CUVIDPROCPARAMS procParams = {};
  procParams.progressive_frame = dispInfo.progressive_frame;
  procParams.top_field_first = dispInfo.top_field_first;
  procParams.unpaired_field = dispInfo.repeat_first_field < 0;
  // We set the NVDEC stream to the current stream. It will be waited upon by
  // the NPP stream before any color conversion.
  // Re types: we get a cudaStream_t from PyTorch but it's interchangeable with
  // CUstream
  procParams.output_stream = reinterpret_cast<CUstream>(
      at::cuda::getCurrentCUDAStream(device_.index()).stream());

  CUdeviceptr framePtr = 0;
  unsigned int pitch = 0;

  // We know the frame we want was sent to the hardware decoder, but now we need
  // to "map" it to an "output surface" before we can use its data. This is a
  // blocking calls that waits until the frame is fully decoded and ready to be
  // used.
  // When a frame is mapped to an output surface, it needs to be unmapped
  // eventually, so that the decoder can re-use the output surface. Failing to
  // unmap will cause map to eventually fail. DALI unmaps frames almost
  // immediately after mapping them: they do the color-conversion in-between,
  // which involves a copy of the data, so that works.
  // We, OTOH, will do the color-conversion later, outside of ReceiveFrame(). So
  // we unmap here: just before mapping a new frame. At that point we know that
  // the previously-mapped frame is no longer needed: it was either
  // color-converted (with a copy), or that's a frame that was discarded in
  // SingleStreamDecoder. Either way, the underlying output surface can be
  // safely re-used.
  unmapPreviousFrame();
  CUresult result = cuvidMapVideoFrame(
      *decoder_.get(), dispInfo.picture_index, &framePtr, &pitch, &procParams);
  if (result != CUDA_SUCCESS) {
    return AVERROR_EXTERNAL;
  }
  // Remember the mapping so the next receiveFrame() call (or the destructor)
  // can release it.
  previouslyMappedFrame_ = framePtr;

  avFrame = convertCudaFrameToAVFrame(framePtr, pitch, dispInfo);

  return AVSUCCESS;
}
569
+
570
+ void BetaCudaDeviceInterface::unmapPreviousFrame() {
571
+ if (previouslyMappedFrame_ == 0) {
572
+ return;
573
+ }
574
+ CUresult result =
575
+ cuvidUnmapVideoFrame(*decoder_.get(), previouslyMappedFrame_);
576
+ TORCH_CHECK(
577
+ result == CUDA_SUCCESS, "Failed to unmap previous frame: ", result);
578
+ previouslyMappedFrame_ = 0;
579
+ }
580
+
581
// Wraps a mapped NVDEC output surface (NV12, device memory) into an AVFrame
// without copying: the AVFrame's data pointers alias the mapped surface, so
// the frame is only valid until the surface is unmapped.
UniqueAVFrame BetaCudaDeviceInterface::convertCudaFrameToAVFrame(
    CUdeviceptr framePtr,
    unsigned int pitch,
    const CUVIDPARSERDISPINFO& dispInfo) {
  TORCH_CHECK(framePtr != 0, "Invalid CUDA frame pointer");

  // Get frame dimensions from video format display area (not coded dimensions)
  // This matches DALI's approach and avoids padding issues
  int width = videoFormat_.display_area.right - videoFormat_.display_area.left;
  int height = videoFormat_.display_area.bottom - videoFormat_.display_area.top;

  TORCH_CHECK(width > 0 && height > 0, "Invalid frame dimensions");
  TORCH_CHECK(
      pitch >= static_cast<unsigned int>(width), "Pitch must be >= width");

  UniqueAVFrame avFrame(av_frame_alloc());
  TORCH_CHECK(avFrame.get() != nullptr, "Failed to allocate AVFrame");

  avFrame->width = width;
  avFrame->height = height;
  avFrame->format = AV_PIX_FMT_CUDA;
  avFrame->pts = dispInfo.timestamp;

  // TODONVDEC P2: We compute the duration based on average frame rate info, so
  // so if the video has variable frame rate, the durations may be off. We
  // should try to see if we can set the duration more accurately. Unfortunately
  // it's not given by dispInfo. One option would be to set it based on the pts
  // difference between consecutive frames, if the next frame is already
  // available.
  // Note that we used to rely on videoFormat_.frame_rate for this, but that
  // proved less accurate than FFmpeg.
  setDuration(avFrame, computeSafeDuration(frameRateAvgFromFFmpeg_, timeBase_));

  // We need to assign the frame colorspace. This is crucial for proper color
  // conversion. NVCUVID stores that in the matrix_coefficients field, but
  // doesn't document the semantics of the values. Claude code generated this,
  // which seems to work. Reassuringly, the values seem to match the
  // corresponding indices in the FFmpeg enum for colorspace conversion
  // (ff_yuv2rgb_coeffs):
  // https://ffmpeg.org/doxygen/trunk/yuv2rgb_8c_source.html#l00047
  switch (videoFormat_.video_signal_description.matrix_coefficients) {
    case 1:
      avFrame->colorspace = AVCOL_SPC_BT709;
      break;
    case 6:
      avFrame->colorspace = AVCOL_SPC_SMPTE170M; // BT.601
      break;
    default:
      // Default to BT.601
      avFrame->colorspace = AVCOL_SPC_SMPTE170M;
      break;
  }

  avFrame->color_range =
      videoFormat_.video_signal_description.video_full_range_flag
      ? AVCOL_RANGE_JPEG
      : AVCOL_RANGE_MPEG;

  // NV12 layout (requested in createDecoder): plane 0 is the Y plane (height
  // rows of `pitch` bytes), plane 1 is the interleaved UV plane starting right
  // after the Y plane at an offset of pitch * height bytes. Both planes share
  // the same row pitch; planes 2/3 are unused for NV12.
  avFrame->data[0] = reinterpret_cast<uint8_t*>(framePtr);
  avFrame->data[1] = reinterpret_cast<uint8_t*>(framePtr + (pitch * height));
  avFrame->data[2] = nullptr;
  avFrame->data[3] = nullptr;
  avFrame->linesize[0] = pitch;
  avFrame->linesize[1] = pitch;
  avFrame->linesize[2] = 0;
  avFrame->linesize[3] = 0;

  return avFrame;
}
651
+
652
// Discards all in-flight decoding state, typically after a seek.
void BetaCudaDeviceInterface::flush() {
  if (cpuFallback_) {
    cpuFallback_->flush();
    return;
  }

  // The NVCUVID docs mention that after seeking, i.e. when flush() is called,
  // we should send a packet with the CUVID_PKT_DISCONTINUITY flag. The docs
  // don't say whether this should be an empty packet, or whether it should be a
  // flag on the next non-empty packet. It doesn't matter: neither work :)
  // Sending an EOF packet, however, does work. So we do that. And we re-set the
  // eofSent_ flag to false because that's not a true EOF notification.
  sendEOFPacket();
  eofSent_ = false;

  // Drop any frames already queued in display order: after a seek they no
  // longer correspond to the frames the caller wants.
  std::queue<CUVIDPARSERDISPINFO> emptyQueue;
  std::swap(readyFrames_, emptyQueue);
}
670
+
671
// Produces the final RGB tensor for a decoded frame. On the native NVDEC path
// the NV12 frame is color-converted on the GPU; on the CPU-fallback path the
// CPU interface converts, and the result is copied to the CUDA device.
void BetaCudaDeviceInterface::convertAVFrameToFrameOutput(
    UniqueAVFrame& avFrame,
    FrameOutput& frameOutput,
    std::optional<torch::Tensor> preAllocatedOutputTensor) {
  if (cpuFallback_) {
    // CPU decoded frame - need to do CPU color conversion then transfer to GPU
    FrameOutput cpuFrameOutput;
    cpuFallback_->convertAVFrameToFrameOutput(avFrame, cpuFrameOutput);

    // Transfer CPU frame to GPU
    if (preAllocatedOutputTensor.has_value()) {
      preAllocatedOutputTensor.value().copy_(cpuFrameOutput.data);
      frameOutput.data = preAllocatedOutputTensor.value();
    } else {
      frameOutput.data = cpuFrameOutput.data.to(device_);
    }
    return;
  }

  // TODONVDEC P2: we may need to handle 10bit videos the same way the CUDA
  // ffmpeg interface does it with maybeConvertAVFrameToNV12OrRGB24().
  TORCH_CHECK(
      avFrame->format == AV_PIX_FMT_CUDA,
      "Expected CUDA format frame from BETA CUDA interface");

  validatePreAllocatedTensorShape(preAllocatedOutputTensor, avFrame);

  at::cuda::CUDAStream nvdecStream =
      at::cuda::getCurrentCUDAStream(device_.index());

  // NV12 -> RGB conversion via NPP. The NVDEC stream is passed along so the
  // conversion can be ordered after decoding (see the comment in
  // receiveFrame() about stream synchronization).
  frameOutput.data = convertNV12FrameToRGB(
      avFrame, device_, nppCtx_, nvdecStream, preAllocatedOutputTensor);
}
704
+
705
+ std::string BetaCudaDeviceInterface::getDetails() {
706
+ std::string details = "Beta CUDA Device Interface.";
707
+ if (cpuFallback_) {
708
+ details += " Using CPU fallback.";
709
+ if (!nvcuvidAvailable_) {
710
+ details += " NVCUVID not available!";
711
+ }
712
+ } else {
713
+ details += " Using NVDEC.";
714
+ }
715
+ return details;
716
+ }
717
+
718
+ } // namespace facebook::torchcodec