torchcodec 0.7.0__cp312-cp312-win_amd64.whl → 0.8.0__cp312-cp312-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of torchcodec has been flagged by the registry. See the registry's advisory page for more details.

Files changed (61)
  1. torchcodec/_core/BetaCudaDeviceInterface.cpp +636 -0
  2. torchcodec/_core/BetaCudaDeviceInterface.h +191 -0
  3. torchcodec/_core/CMakeLists.txt +36 -3
  4. torchcodec/_core/CUDACommon.cpp +315 -0
  5. torchcodec/_core/CUDACommon.h +46 -0
  6. torchcodec/_core/CpuDeviceInterface.cpp +189 -108
  7. torchcodec/_core/CpuDeviceInterface.h +81 -19
  8. torchcodec/_core/CudaDeviceInterface.cpp +211 -368
  9. torchcodec/_core/CudaDeviceInterface.h +33 -6
  10. torchcodec/_core/DeviceInterface.cpp +57 -19
  11. torchcodec/_core/DeviceInterface.h +97 -16
  12. torchcodec/_core/Encoder.cpp +302 -9
  13. torchcodec/_core/Encoder.h +51 -1
  14. torchcodec/_core/FFMPEGCommon.cpp +189 -2
  15. torchcodec/_core/FFMPEGCommon.h +18 -0
  16. torchcodec/_core/FilterGraph.cpp +28 -21
  17. torchcodec/_core/FilterGraph.h +15 -1
  18. torchcodec/_core/Frame.cpp +17 -7
  19. torchcodec/_core/Frame.h +15 -61
  20. torchcodec/_core/Metadata.h +2 -2
  21. torchcodec/_core/NVDECCache.cpp +70 -0
  22. torchcodec/_core/NVDECCache.h +104 -0
  23. torchcodec/_core/SingleStreamDecoder.cpp +202 -198
  24. torchcodec/_core/SingleStreamDecoder.h +39 -14
  25. torchcodec/_core/StreamOptions.h +16 -6
  26. torchcodec/_core/Transform.cpp +60 -0
  27. torchcodec/_core/Transform.h +59 -0
  28. torchcodec/_core/__init__.py +1 -0
  29. torchcodec/_core/custom_ops.cpp +180 -32
  30. torchcodec/_core/fetch_and_expose_non_gpl_ffmpeg_libs.cmake +61 -1
  31. torchcodec/_core/nvcuvid_include/cuviddec.h +1374 -0
  32. torchcodec/_core/nvcuvid_include/nvcuvid.h +610 -0
  33. torchcodec/_core/ops.py +86 -43
  34. torchcodec/_core/pybind_ops.cpp +22 -59
  35. torchcodec/_samplers/video_clip_sampler.py +7 -19
  36. torchcodec/decoders/__init__.py +1 -0
  37. torchcodec/decoders/_decoder_utils.py +61 -1
  38. torchcodec/decoders/_video_decoder.py +56 -20
  39. torchcodec/libtorchcodec_core4.dll +0 -0
  40. torchcodec/libtorchcodec_core5.dll +0 -0
  41. torchcodec/libtorchcodec_core6.dll +0 -0
  42. torchcodec/libtorchcodec_core7.dll +0 -0
  43. torchcodec/libtorchcodec_core8.dll +0 -0
  44. torchcodec/libtorchcodec_custom_ops4.dll +0 -0
  45. torchcodec/libtorchcodec_custom_ops5.dll +0 -0
  46. torchcodec/libtorchcodec_custom_ops6.dll +0 -0
  47. torchcodec/libtorchcodec_custom_ops7.dll +0 -0
  48. torchcodec/libtorchcodec_custom_ops8.dll +0 -0
  49. torchcodec/libtorchcodec_pybind_ops4.pyd +0 -0
  50. torchcodec/libtorchcodec_pybind_ops5.pyd +0 -0
  51. torchcodec/libtorchcodec_pybind_ops6.pyd +0 -0
  52. torchcodec/libtorchcodec_pybind_ops7.pyd +0 -0
  53. torchcodec/libtorchcodec_pybind_ops8.pyd +0 -0
  54. torchcodec/samplers/_time_based.py +8 -0
  55. torchcodec/version.py +1 -1
  56. {torchcodec-0.7.0.dist-info → torchcodec-0.8.0.dist-info}/METADATA +24 -13
  57. torchcodec-0.8.0.dist-info/RECORD +80 -0
  58. {torchcodec-0.7.0.dist-info → torchcodec-0.8.0.dist-info}/WHEEL +1 -1
  59. torchcodec-0.7.0.dist-info/RECORD +0 -67
  60. {torchcodec-0.7.0.dist-info → torchcodec-0.8.0.dist-info}/licenses/LICENSE +0 -0
  61. {torchcodec-0.7.0.dist-info → torchcodec-0.8.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,636 @@
1
+ // Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ // All rights reserved.
3
+ //
4
+ // This source code is licensed under the BSD-style license found in the
5
+ // LICENSE file in the root directory of this source tree.
6
+
7
+ #include <c10/cuda/CUDAStream.h>
8
+ #include <torch/types.h>
9
+ #include <mutex>
10
+ #include <vector>
11
+
12
+ #include "src/torchcodec/_core/BetaCudaDeviceInterface.h"
13
+
14
+ #include "src/torchcodec/_core/DeviceInterface.h"
15
+ #include "src/torchcodec/_core/FFMPEGCommon.h"
16
+ #include "src/torchcodec/_core/NVDECCache.h"
17
+
18
+ // #include <cuda_runtime.h> // For cudaStreamSynchronize
19
+ #include "src/torchcodec/_core/nvcuvid_include/cuviddec.h"
20
+ #include "src/torchcodec/_core/nvcuvid_include/nvcuvid.h"
21
+
22
+ extern "C" {
23
+ #include <libavutil/hwcontext_cuda.h>
24
+ #include <libavutil/pixdesc.h>
25
+ }
26
+
27
+ namespace facebook::torchcodec {
28
+
29
+ namespace {
30
+
31
// Self-registration of the "beta" CUDA device interface: constructing this
// static runs at library load time and makes the interface discoverable via
// the DeviceInterface registry under the (kCUDA, "beta") key.
// NOTE(review): the factory returns a raw owning pointer; presumably the
// registry takes ownership — confirm against DeviceInterface.h.
static bool g_cuda_beta = registerDeviceInterface(
    DeviceInterfaceKey(torch::kCUDA, /*variant=*/"beta"),
    [](const torch::Device& device) {
      return new BetaCudaDeviceInterface(device);
    });
36
+
37
+ static int CUDAAPI
38
+ pfnSequenceCallback(void* pUserData, CUVIDEOFORMAT* videoFormat) {
39
+ auto decoder = static_cast<BetaCudaDeviceInterface*>(pUserData);
40
+ return decoder->streamPropertyChange(videoFormat);
41
+ }
42
+
43
+ static int CUDAAPI
44
+ pfnDecodePictureCallback(void* pUserData, CUVIDPICPARAMS* picParams) {
45
+ auto decoder = static_cast<BetaCudaDeviceInterface*>(pUserData);
46
+ return decoder->frameReadyForDecoding(picParams);
47
+ }
48
+
49
+ static int CUDAAPI
50
+ pfnDisplayPictureCallback(void* pUserData, CUVIDPARSERDISPINFO* dispInfo) {
51
+ auto decoder = static_cast<BetaCudaDeviceInterface*>(pUserData);
52
+ return decoder->frameReadyInDisplayOrder(dispInfo);
53
+ }
54
+
55
+ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
56
+ // Check decoder capabilities - same checks as DALI
57
+ auto caps = CUVIDDECODECAPS{};
58
+ caps.eCodecType = videoFormat->codec;
59
+ caps.eChromaFormat = videoFormat->chroma_format;
60
+ caps.nBitDepthMinus8 = videoFormat->bit_depth_luma_minus8;
61
+ CUresult result = cuvidGetDecoderCaps(&caps);
62
+ TORCH_CHECK(result == CUDA_SUCCESS, "Failed to get decoder caps: ", result);
63
+
64
+ TORCH_CHECK(
65
+ caps.bIsSupported,
66
+ "Codec configuration not supported on this GPU. "
67
+ "Codec: ",
68
+ static_cast<int>(videoFormat->codec),
69
+ ", chroma format: ",
70
+ static_cast<int>(videoFormat->chroma_format),
71
+ ", bit depth: ",
72
+ videoFormat->bit_depth_luma_minus8 + 8);
73
+
74
+ TORCH_CHECK(
75
+ videoFormat->coded_width >= caps.nMinWidth &&
76
+ videoFormat->coded_height >= caps.nMinHeight,
77
+ "Video is too small in at least one dimension. Provided: ",
78
+ videoFormat->coded_width,
79
+ "x",
80
+ videoFormat->coded_height,
81
+ " vs supported:",
82
+ caps.nMinWidth,
83
+ "x",
84
+ caps.nMinHeight);
85
+
86
+ TORCH_CHECK(
87
+ videoFormat->coded_width <= caps.nMaxWidth &&
88
+ videoFormat->coded_height <= caps.nMaxHeight,
89
+ "Video is too large in at least one dimension. Provided: ",
90
+ videoFormat->coded_width,
91
+ "x",
92
+ videoFormat->coded_height,
93
+ " vs supported:",
94
+ caps.nMaxWidth,
95
+ "x",
96
+ caps.nMaxHeight);
97
+
98
+ // See nMaxMBCount in cuviddec.h
99
+ constexpr unsigned int macroblockConstant = 256;
100
+ TORCH_CHECK(
101
+ videoFormat->coded_width * videoFormat->coded_height /
102
+ macroblockConstant <=
103
+ caps.nMaxMBCount,
104
+ "Video is too large (too many macroblocks). "
105
+ "Provided (width * height / ",
106
+ macroblockConstant,
107
+ "): ",
108
+ videoFormat->coded_width * videoFormat->coded_height / macroblockConstant,
109
+ " vs supported:",
110
+ caps.nMaxMBCount);
111
+
112
+ // Below we'll set the decoderParams.OutputFormat to NV12, so we need to make
113
+ // sure it's actually supported.
114
+ TORCH_CHECK(
115
+ (caps.nOutputFormatMask >> cudaVideoSurfaceFormat_NV12) & 1,
116
+ "NV12 output format is not supported for this configuration. ",
117
+ "Codec: ",
118
+ static_cast<int>(videoFormat->codec),
119
+ ", chroma format: ",
120
+ static_cast<int>(videoFormat->chroma_format),
121
+ ", bit depth: ",
122
+ videoFormat->bit_depth_luma_minus8 + 8);
123
+
124
+ // Decoder creation parameters, most are taken from DALI
125
+ CUVIDDECODECREATEINFO decoderParams = {};
126
+ decoderParams.bitDepthMinus8 = videoFormat->bit_depth_luma_minus8;
127
+ decoderParams.ChromaFormat = videoFormat->chroma_format;
128
+ // We explicitly request NV12 format, which means 10bit videos will be
129
+ // automatically converted to 8bits by NVDEC itself. That is, the raw frames
130
+ // we get back from cuvidMapVideoFrame will already be in 8bit format. We
131
+ // won't need to do the conversion ourselves, so that's a lot easier.
132
+ // In the default interface, we have to do the 10 -> 8bits conversion
133
+ // ourselves later in convertAVFrameToFrameOutput(), because FFmpeg explicitly
134
+ // requests 10 or 16bits output formats for >8-bit videos!
135
+ // https://github.com/FFmpeg/FFmpeg/blob/e05f8acabff468c1382277c1f31fa8e9d90c3202/libavcodec/nvdec.c#L376-L403
136
+ decoderParams.OutputFormat = cudaVideoSurfaceFormat_NV12;
137
+ decoderParams.ulCreationFlags = cudaVideoCreate_Default;
138
+ decoderParams.CodecType = videoFormat->codec;
139
+ decoderParams.ulHeight = videoFormat->coded_height;
140
+ decoderParams.ulWidth = videoFormat->coded_width;
141
+ decoderParams.ulMaxHeight = videoFormat->coded_height;
142
+ decoderParams.ulMaxWidth = videoFormat->coded_width;
143
+ decoderParams.ulTargetHeight =
144
+ videoFormat->display_area.bottom - videoFormat->display_area.top;
145
+ decoderParams.ulTargetWidth =
146
+ videoFormat->display_area.right - videoFormat->display_area.left;
147
+ decoderParams.ulNumDecodeSurfaces = videoFormat->min_num_decode_surfaces;
148
+ // We should only ever need 1 output surface, since we process frames
149
+ // sequentially, and we always unmap the previous frame before mapping a new
150
+ // one.
151
+ // TODONVDEC P3: set this to 2, allow for 2 frames to be mapped at a time, and
152
+ // benchmark to see if this makes any difference.
153
+ decoderParams.ulNumOutputSurfaces = 1;
154
+ decoderParams.display_area.left = videoFormat->display_area.left;
155
+ decoderParams.display_area.right = videoFormat->display_area.right;
156
+ decoderParams.display_area.top = videoFormat->display_area.top;
157
+ decoderParams.display_area.bottom = videoFormat->display_area.bottom;
158
+
159
+ CUvideodecoder* decoder = new CUvideodecoder();
160
+ result = cuvidCreateDecoder(decoder, &decoderParams);
161
+ TORCH_CHECK(
162
+ result == CUDA_SUCCESS, "Failed to create NVDEC decoder: ", result);
163
+ return UniqueCUvideodecoder(decoder, CUvideoDecoderDeleter{});
164
+ }
165
+
166
+ cudaVideoCodec validateCodecSupport(AVCodecID codecId) {
167
+ switch (codecId) {
168
+ case AV_CODEC_ID_H264:
169
+ return cudaVideoCodec_H264;
170
+ case AV_CODEC_ID_HEVC:
171
+ return cudaVideoCodec_HEVC;
172
+ case AV_CODEC_ID_AV1:
173
+ return cudaVideoCodec_AV1;
174
+ case AV_CODEC_ID_VP9:
175
+ return cudaVideoCodec_VP9;
176
+ case AV_CODEC_ID_VP8:
177
+ return cudaVideoCodec_VP8;
178
+ case AV_CODEC_ID_MPEG4:
179
+ return cudaVideoCodec_MPEG4;
180
+ // Formats below are currently not tested, but they should "mostly" work.
181
+ // MPEG1 was briefly locally tested and it was ok-ish despite duration being
182
+ // off. Since they're far less popular, we keep them disabled by default but
183
+ // we can consider enabling them upon user requests.
184
+ // case AV_CODEC_ID_MPEG1VIDEO:
185
+ // return cudaVideoCodec_MPEG1;
186
+ // case AV_CODEC_ID_MPEG2VIDEO:
187
+ // return cudaVideoCodec_MPEG2;
188
+ // case AV_CODEC_ID_MJPEG:
189
+ // return cudaVideoCodec_JPEG;
190
+ // case AV_CODEC_ID_VC1:
191
+ // return cudaVideoCodec_VC1;
192
+ default: {
193
+ TORCH_CHECK(false, "Unsupported codec type: ", avcodec_get_name(codecId));
194
+ }
195
+ }
196
+ }
197
+
198
+ } // namespace
199
+
200
// Constructs the beta CUDA interface for `device`. Ensures the CUDA context
// is initialized through PyTorch, and acquires an NPP stream context (from a
// per-device cache) used later for NV12 -> RGB color conversion.
BetaCudaDeviceInterface::BetaCudaDeviceInterface(const torch::Device& device)
    : DeviceInterface(device) {
  TORCH_CHECK(g_cuda_beta, "BetaCudaDeviceInterface was not registered!");
  TORCH_CHECK(
      device_.type() == torch::kCUDA, "Unsupported device: ", device_.str());

  initializeCudaContextWithPytorch(device_);
  nppCtx_ = getNppStreamContext(device_);
}
209
+
210
// Tears down decoding state: the decoder handle is returned to the per-device
// cache (not destroyed), the parser is destroyed, and the NPP context goes
// back to its cache.
BetaCudaDeviceInterface::~BetaCudaDeviceInterface() {
  if (decoder_) {
    // DALI doesn't seem to do any particular cleanup of the decoder before
    // sending it to the cache, so we probably don't need to do anything either.
    // Just to be safe, we flush.
    // What happens to those decode surfaces that haven't yet been mapped is
    // unclear.
    flush();
    // Release the last mapped output surface before handing the decoder back,
    // so the cached decoder starts with all surfaces available.
    unmapPreviousFrame();
    NVDECCache::getCache(device_.index())
        .returnDecoder(&videoFormat_, std::move(decoder_));
  }

  if (videoParser_) {
    // TODONVDEC P2: consider caching this? Does DALI do that?
    cuvidDestroyVideoParser(videoParser_);
    videoParser_ = nullptr;
  }

  returnNppStreamContextToCache(device_, std::move(nppCtx_));
}
231
+
232
// One-time setup for a stream: records the stream's time base and average
// frame rate (used later to stamp AVFrame durations), sets up a bitstream
// filter if the container/codec combination requires one, and creates the
// NVCUVID parser wired to this instance's callbacks.
void BetaCudaDeviceInterface::initialize(
    const AVStream* avStream,
    const UniqueDecodingAVFormatContext& avFormatCtx) {
  TORCH_CHECK(avStream != nullptr, "AVStream cannot be null");
  timeBase_ = avStream->time_base;
  frameRateAvgFromFFmpeg_ = avStream->r_frame_rate;

  const AVCodecParameters* codecPar = avStream->codecpar;
  TORCH_CHECK(codecPar != nullptr, "CodecParameters cannot be null");

  initializeBSF(codecPar, avFormatCtx);

  // Create parser. Default values that aren't obvious are taken from DALI.
  CUVIDPARSERPARAMS parserParams = {};
  parserParams.CodecType = validateCodecSupport(codecPar->codec_id);
  parserParams.ulMaxNumDecodeSurfaces = 8;
  parserParams.ulMaxDisplayDelay = 0;
  // Callback setup, all are triggered by the parser within a call
  // to cuvidParseVideoData
  parserParams.pUserData = this;
  parserParams.pfnSequenceCallback = pfnSequenceCallback;
  parserParams.pfnDecodePicture = pfnDecodePictureCallback;
  parserParams.pfnDisplayPicture = pfnDisplayPictureCallback;

  CUresult result = cuvidCreateVideoParser(&videoParser_, &parserParams);
  TORCH_CHECK(
      result == CUDA_SUCCESS, "Failed to create video parser: ", result);
}
260
+
261
+ void BetaCudaDeviceInterface::initializeBSF(
262
+ const AVCodecParameters* codecPar,
263
+ const UniqueDecodingAVFormatContext& avFormatCtx) {
264
+ // Setup bit stream filters (BSF):
265
+ // https://ffmpeg.org/doxygen/7.0/group__lavc__bsf.html
266
+ // This is only needed for some formats, like H264 or HEVC.
267
+
268
+ TORCH_CHECK(codecPar != nullptr, "codecPar cannot be null");
269
+ TORCH_CHECK(avFormatCtx != nullptr, "AVFormatContext cannot be null");
270
+ TORCH_CHECK(
271
+ avFormatCtx->iformat != nullptr,
272
+ "AVFormatContext->iformat cannot be null");
273
+ std::string filterName;
274
+
275
+ // Matching logic is taken from DALI
276
+ switch (codecPar->codec_id) {
277
+ case AV_CODEC_ID_H264: {
278
+ const std::string formatName = avFormatCtx->iformat->long_name
279
+ ? avFormatCtx->iformat->long_name
280
+ : "";
281
+
282
+ if (formatName == "QuickTime / MOV" ||
283
+ formatName == "FLV (Flash Video)" ||
284
+ formatName == "Matroska / WebM" || formatName == "raw H.264 video") {
285
+ filterName = "h264_mp4toannexb";
286
+ }
287
+ break;
288
+ }
289
+
290
+ case AV_CODEC_ID_HEVC: {
291
+ const std::string formatName = avFormatCtx->iformat->long_name
292
+ ? avFormatCtx->iformat->long_name
293
+ : "";
294
+
295
+ if (formatName == "QuickTime / MOV" ||
296
+ formatName == "FLV (Flash Video)" ||
297
+ formatName == "Matroska / WebM" || formatName == "raw HEVC video") {
298
+ filterName = "hevc_mp4toannexb";
299
+ }
300
+ break;
301
+ }
302
+ case AV_CODEC_ID_MPEG4: {
303
+ const std::string formatName =
304
+ avFormatCtx->iformat->name ? avFormatCtx->iformat->name : "";
305
+ if (formatName == "avi") {
306
+ filterName = "mpeg4_unpack_bframes";
307
+ }
308
+ break;
309
+ }
310
+
311
+ default:
312
+ // No bitstream filter needed for other codecs
313
+ break;
314
+ }
315
+
316
+ if (filterName.empty()) {
317
+ // Only initialize BSF if we actually need one
318
+ return;
319
+ }
320
+
321
+ const AVBitStreamFilter* avBSF = av_bsf_get_by_name(filterName.c_str());
322
+ TORCH_CHECK(
323
+ avBSF != nullptr, "Failed to find bitstream filter: ", filterName);
324
+
325
+ AVBSFContext* avBSFContext = nullptr;
326
+ int retVal = av_bsf_alloc(avBSF, &avBSFContext);
327
+ TORCH_CHECK(
328
+ retVal >= AVSUCCESS,
329
+ "Failed to allocate bitstream filter: ",
330
+ getFFMPEGErrorStringFromErrorCode(retVal));
331
+
332
+ bitstreamFilter_.reset(avBSFContext);
333
+
334
+ retVal = avcodec_parameters_copy(bitstreamFilter_->par_in, codecPar);
335
+ TORCH_CHECK(
336
+ retVal >= AVSUCCESS,
337
+ "Failed to copy codec parameters: ",
338
+ getFFMPEGErrorStringFromErrorCode(retVal));
339
+
340
+ retVal = av_bsf_init(bitstreamFilter_.get());
341
+ TORCH_CHECK(
342
+ retVal == AVSUCCESS,
343
+ "Failed to initialize bitstream filter: ",
344
+ getFFMPEGErrorStringFromErrorCode(retVal));
345
+ }
346
+
347
// This callback is called by the parser within cuvidParseVideoData when there
// is a change in the stream's properties (like resolution change), as specified
// by CUVIDEOFORMAT. Particularly (but not just!), this is called at the very
// start of the stream.
// Returns the number of decode surfaces the parser should use (see bottom).
// TODONVDEC P1: Code below mostly assume this is called only once at the start,
// we should handle the case of multiple calls. Probably need to flush buffers,
// etc.
int BetaCudaDeviceInterface::streamPropertyChange(CUVIDEOFORMAT* videoFormat) {
  TORCH_CHECK(videoFormat != nullptr, "Invalid video format");

  // Keep a copy: decoder caching/creation below and the later AVFrame
  // conversion both read from videoFormat_.
  videoFormat_ = *videoFormat;

  if (videoFormat_.min_num_decode_surfaces == 0) {
    // Same as DALI's fallback
    videoFormat_.min_num_decode_surfaces = 20;
  }

  if (!decoder_) {
    // Prefer a cached decoder matching this format over creating a new one.
    decoder_ = NVDECCache::getCache(device_.index()).getDecoder(videoFormat);

    if (!decoder_) {
      // TODONVDEC P2: consider re-configuring an existing decoder instead of
      // re-creating one. See docs, see DALI.
      decoder_ = createDecoder(videoFormat);
    }

    TORCH_CHECK(decoder_, "Failed to get or create decoder");
  }

  // DALI also returns min_num_decode_surfaces from this function. This
  // instructs the parser to reset its ulMaxNumDecodeSurfaces field to this
  // value.
  return static_cast<int>(videoFormat_.min_num_decode_surfaces);
}
381
+
382
// Moral equivalent of avcodec_send_packet(). Here, we pass the AVPacket down to
// the NVCUVID parser. Returns AVSUCCESS, or AVERROR_EXTERNAL if the parser
// rejects the data.
int BetaCudaDeviceInterface::sendPacket(ReferenceAVPacket& packet) {
  TORCH_CHECK(
      packet.get() && packet->data && packet->size > 0,
      "sendPacket received an empty packet, this is unexpected, please report.");

  // Apply BSF if needed. We want applyBSF to return a *new* filtered packet, or
  // the original one if no BSF is needed. This new filtered packet must be
  // allocated outside of applyBSF: if it were allocated inside applyBSF, it
  // would be destroyed at the end of the function, leaving us with a dangling
  // reference.
  AutoAVPacket filteredAutoPacket;
  ReferenceAVPacket filteredPacket(filteredAutoPacket);
  ReferenceAVPacket& packetToSend = applyBSF(packet, filteredPacket);

  // Wrap the (possibly filtered) payload in an NVCUVID packet; the pts is
  // forwarded so it comes back with the decoded frame's display info.
  CUVIDSOURCEDATAPACKET cuvidPacket = {};
  cuvidPacket.payload = packetToSend->data;
  cuvidPacket.payload_size = packetToSend->size;
  cuvidPacket.flags = CUVID_PKT_TIMESTAMP;
  cuvidPacket.timestamp = packetToSend->pts;

  return sendCuvidPacket(cuvidPacket);
}
406
+
407
+ int BetaCudaDeviceInterface::sendEOFPacket() {
408
+ CUVIDSOURCEDATAPACKET cuvidPacket = {};
409
+ cuvidPacket.flags = CUVID_PKT_ENDOFSTREAM;
410
+ eofSent_ = true;
411
+
412
+ return sendCuvidPacket(cuvidPacket);
413
+ }
414
+
415
+ int BetaCudaDeviceInterface::sendCuvidPacket(
416
+ CUVIDSOURCEDATAPACKET& cuvidPacket) {
417
+ CUresult result = cuvidParseVideoData(videoParser_, &cuvidPacket);
418
+ return result == CUDA_SUCCESS ? AVSUCCESS : AVERROR_EXTERNAL;
419
+ }
420
+
421
// Runs the packet through the bitstream filter configured in initializeBSF(),
// writing the result into filteredPacket (caller-allocated, see sendPacket for
// why). Returns filteredPacket, or the original packet untouched when no BSF
// is configured for this stream.
ReferenceAVPacket& BetaCudaDeviceInterface::applyBSF(
    ReferenceAVPacket& packet,
    ReferenceAVPacket& filteredPacket) {
  if (!bitstreamFilter_) {
    return packet;
  }

  int retVal = av_bsf_send_packet(bitstreamFilter_.get(), packet.get());
  TORCH_CHECK(
      retVal >= AVSUCCESS,
      "Failed to send packet to bitstream filter: ",
      getFFMPEGErrorStringFromErrorCode(retVal));

  // TODO P1: the docs mention there can theoretically be multiple output
  // packets for a single input, i.e. we may need to call av_bsf_receive_packet
  // more than once. We should figure out whether that applies to the BSF we're
  // using.
  retVal = av_bsf_receive_packet(bitstreamFilter_.get(), filteredPacket.get());
  TORCH_CHECK(
      retVal >= AVSUCCESS,
      "Failed to receive packet from bitstream filter: ",
      getFFMPEGErrorStringFromErrorCode(retVal));

  return filteredPacket;
}
446
+
447
+ // Parser triggers this callback within cuvidParseVideoData when a frame is
448
+ // ready to be decoded, i.e. the parser received all the necessary packets for a
449
+ // given frame. It means we can send that frame to be decoded by the hardware
450
+ // NVDEC decoder by calling cuvidDecodePicture which is non-blocking.
451
+ int BetaCudaDeviceInterface::frameReadyForDecoding(CUVIDPICPARAMS* picParams) {
452
+ TORCH_CHECK(picParams != nullptr, "Invalid picture parameters");
453
+ TORCH_CHECK(decoder_, "Decoder not initialized before picture decode");
454
+ // Send frame to be decoded by NVDEC - non-blocking call.
455
+ CUresult result = cuvidDecodePicture(*decoder_.get(), picParams);
456
+
457
+ // Yes, you're reading that right, 0 means error, 1 means success
458
+ return (result == CUDA_SUCCESS);
459
+ }
460
+
461
+ int BetaCudaDeviceInterface::frameReadyInDisplayOrder(
462
+ CUVIDPARSERDISPINFO* dispInfo) {
463
+ readyFrames_.push(*dispInfo);
464
+ return 1; // success
465
+ }
466
+
467
// Moral equivalent of avcodec_receive_frame(). Pops the next display-order
// frame, maps its decoded pixels to an output surface, and wraps them in an
// AVFrame. Returns AVSUCCESS, AVERROR(EAGAIN) when more packets are needed,
// AVERROR_EOF once drained after EOF, or AVERROR_EXTERNAL on a map failure.
int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) {
  if (readyFrames_.empty()) {
    // No frame found, instruct caller to try again later after sending more
    // packets, or to stop if EOF was already sent.
    return eofSent_ ? AVERROR_EOF : AVERROR(EAGAIN);
  }

  CUVIDPARSERDISPINFO dispInfo = readyFrames_.front();
  readyFrames_.pop();

  CUVIDPROCPARAMS procParams = {};
  procParams.progressive_frame = dispInfo.progressive_frame;
  procParams.top_field_first = dispInfo.top_field_first;
  procParams.unpaired_field = dispInfo.repeat_first_field < 0;
  // We set the NVDEC stream to the current stream. It will be waited upon by
  // the NPP stream before any color conversion. Currently, that syncing logic
  // is in the default interface.
  // Re types: we get a cudaStream_t from PyTorch but it's interchangeable with
  // CUstream
  procParams.output_stream = reinterpret_cast<CUstream>(
      at::cuda::getCurrentCUDAStream(device_.index()).stream());

  CUdeviceptr framePtr = 0;
  unsigned int pitch = 0;

  // We know the frame we want was sent to the hardware decoder, but now we need
  // to "map" it to an "output surface" before we can use its data. This is a
  // blocking call that waits until the frame is fully decoded and ready to be
  // used.
  // When a frame is mapped to an output surface, it needs to be unmapped
  // eventually, so that the decoder can re-use the output surface. Failing to
  // unmap will cause map to eventually fail. DALI unmaps frames almost
  // immediately after mapping them: they do the color-conversion in-between,
  // which involves a copy of the data, so that works.
  // We, OTOH, will do the color-conversion later, outside of ReceiveFrame(). So
  // we unmap here: just before mapping a new frame. At that point we know that
  // the previously-mapped frame is no longer needed: it was either
  // color-converted (with a copy), or that's a frame that was discarded in
  // SingleStreamDecoder. Either way, the underlying output surface can be
  // safely re-used.
  unmapPreviousFrame();
  CUresult result = cuvidMapVideoFrame(
      *decoder_.get(), dispInfo.picture_index, &framePtr, &pitch, &procParams);
  if (result != CUDA_SUCCESS) {
    return AVERROR_EXTERNAL;
  }
  // Remember the mapping so the *next* receiveFrame() (or the destructor)
  // releases this output surface.
  previouslyMappedFrame_ = framePtr;

  avFrame = convertCudaFrameToAVFrame(framePtr, pitch, dispInfo);

  return AVSUCCESS;
}
520
+
521
+ void BetaCudaDeviceInterface::unmapPreviousFrame() {
522
+ if (previouslyMappedFrame_ == 0) {
523
+ return;
524
+ }
525
+ CUresult result =
526
+ cuvidUnmapVideoFrame(*decoder_.get(), previouslyMappedFrame_);
527
+ TORCH_CHECK(
528
+ result == CUDA_SUCCESS, "Failed to unmap previous frame: ", result);
529
+ previouslyMappedFrame_ = 0;
530
+ }
531
+
532
// Wraps a mapped NVDEC output surface (NV12, device memory) in an AVFrame
// without copying: the AVFrame's data pointers alias the surface, so the
// frame is only valid until the surface is unmapped (see receiveFrame()).
// Also stamps pts/duration and the colorspace/range needed for conversion.
UniqueAVFrame BetaCudaDeviceInterface::convertCudaFrameToAVFrame(
    CUdeviceptr framePtr,
    unsigned int pitch,
    const CUVIDPARSERDISPINFO& dispInfo) {
  TORCH_CHECK(framePtr != 0, "Invalid CUDA frame pointer");

  // Get frame dimensions from video format display area (not coded dimensions)
  // This matches DALI's approach and avoids padding issues
  int width = videoFormat_.display_area.right - videoFormat_.display_area.left;
  int height = videoFormat_.display_area.bottom - videoFormat_.display_area.top;

  TORCH_CHECK(width > 0 && height > 0, "Invalid frame dimensions");
  TORCH_CHECK(
      pitch >= static_cast<unsigned int>(width), "Pitch must be >= width");

  UniqueAVFrame avFrame(av_frame_alloc());
  TORCH_CHECK(avFrame.get() != nullptr, "Failed to allocate AVFrame");

  avFrame->width = width;
  avFrame->height = height;
  avFrame->format = AV_PIX_FMT_CUDA;
  avFrame->pts = dispInfo.timestamp;

  // TODONVDEC P2: We compute the duration based on average frame rate info,
  // so if the video has variable frame rate, the durations may be off. We
  // should try to see if we can set the duration more accurately. Unfortunately
  // it's not given by dispInfo. One option would be to set it based on the pts
  // difference between consecutive frames, if the next frame is already
  // available.
  // Note that we used to rely on videoFormat_.frame_rate for this, but that
  // proved less accurate than FFmpeg.
  setDuration(avFrame, computeSafeDuration(frameRateAvgFromFFmpeg_, timeBase_));

  // We need to assign the frame colorspace. This is crucial for proper color
  // conversion. NVCUVID stores that in the matrix_coefficients field, but
  // doesn't document the semantics of the values. Claude code generated this,
  // which seems to work. Reassuringly, the values seem to match the
  // corresponding indices in the FFmpeg enum for colorspace conversion
  // (ff_yuv2rgb_coeffs):
  // https://ffmpeg.org/doxygen/trunk/yuv2rgb_8c_source.html#l00047
  switch (videoFormat_.video_signal_description.matrix_coefficients) {
    case 1:
      avFrame->colorspace = AVCOL_SPC_BT709;
      break;
    case 6:
      avFrame->colorspace = AVCOL_SPC_SMPTE170M; // BT.601
      break;
    default:
      // Default to BT.601
      avFrame->colorspace = AVCOL_SPC_SMPTE170M;
      break;
  }

  avFrame->color_range =
      videoFormat_.video_signal_description.video_full_range_flag
      ? AVCOL_RANGE_JPEG
      : AVCOL_RANGE_MPEG;

  // NV12 plane layout: data[0] is the luma (Y) plane at the surface base;
  // data[1] is the interleaved UV plane, which starts pitch * height bytes in
  // (right after the Y plane). Both planes share the same row pitch; NV12 has
  // no third/fourth plane.
  avFrame->data[0] = reinterpret_cast<uint8_t*>(framePtr);
  avFrame->data[1] = reinterpret_cast<uint8_t*>(framePtr + (pitch * height));
  avFrame->data[2] = nullptr;
  avFrame->data[3] = nullptr;
  avFrame->linesize[0] = pitch;
  avFrame->linesize[1] = pitch;
  avFrame->linesize[2] = 0;
  avFrame->linesize[3] = 0;

  return avFrame;
}
602
+
603
// Resets decoding state after a seek: drains the parser and clears any frames
// already queued in display order.
void BetaCudaDeviceInterface::flush() {
  // The NVCUVID docs mention that after seeking, i.e. when flush() is called,
  // we should send a packet with the CUVID_PKT_DISCONTINUITY flag. The docs
  // don't say whether this should be an empty packet, or whether it should be a
  // flag on the next non-empty packet. It doesn't matter: neither work :)
  // Sending an EOF packet, however, does work. So we do that. And we re-set the
  // eofSent_ flag to false because that's not a true EOF notification.
  sendEOFPacket();
  eofSent_ = false;

  // std::queue has no clear(); swapping with an empty queue empties it.
  std::queue<CUVIDPARSERDISPINFO> emptyQueue;
  std::swap(readyFrames_, emptyQueue);
}
616
+
617
// Converts a mapped NV12 CUDA frame (produced by receiveFrame()) into the RGB
// tensor stored in frameOutput.data, optionally writing into
// preAllocatedOutputTensor. The conversion runs via NPP on the current
// stream's context.
void BetaCudaDeviceInterface::convertAVFrameToFrameOutput(
    UniqueAVFrame& avFrame,
    FrameOutput& frameOutput,
    std::optional<torch::Tensor> preAllocatedOutputTensor) {
  // TODONVDEC P2: we may need to handle 10bit videos the same way the default
  // interface does it with maybeConvertAVFrameToNV12OrRGB24().
  TORCH_CHECK(
      avFrame->format == AV_PIX_FMT_CUDA,
      "Expected CUDA format frame from BETA CUDA interface");

  validatePreAllocatedTensorShape(preAllocatedOutputTensor, avFrame);

  // The NVDEC stream is the current stream (set as procParams.output_stream in
  // receiveFrame()); the conversion helper synchronizes against it.
  at::cuda::CUDAStream nvdecStream =
      at::cuda::getCurrentCUDAStream(device_.index());

  frameOutput.data = convertNV12FrameToRGB(
      avFrame, device_, nppCtx_, nvdecStream, preAllocatedOutputTensor);
}
635
+
636
+ } // namespace facebook::torchcodec