torchcodec 0.8.0__cp311-cp311-macosx_11_0_arm64.whl → 0.8.1__cp311-cp311-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of torchcodec might be problematic.

Files changed (57)
  1. torchcodec/.dylibs/libc++.1.0.dylib +0 -0
  2. torchcodec/.dylibs/libpython3.11.dylib +0 -0
  3. torchcodec/_core/AVIOTensorContext.cpp +23 -16
  4. torchcodec/_core/AVIOTensorContext.h +2 -1
  5. torchcodec/_core/BetaCudaDeviceInterface.cpp +168 -86
  6. torchcodec/_core/BetaCudaDeviceInterface.h +7 -5
  7. torchcodec/_core/CMakeLists.txt +1 -19
  8. torchcodec/_core/CUDACommon.cpp +21 -6
  9. torchcodec/_core/CUDACommon.h +6 -1
  10. torchcodec/_core/Cache.h +6 -20
  11. torchcodec/_core/CpuDeviceInterface.cpp +7 -1
  12. torchcodec/_core/CpuDeviceInterface.h +4 -1
  13. torchcodec/_core/CudaDeviceInterface.cpp +19 -11
  14. torchcodec/_core/CudaDeviceInterface.h +6 -1
  15. torchcodec/_core/DeviceInterface.h +27 -27
  16. torchcodec/_core/Encoder.cpp +51 -7
  17. torchcodec/_core/Encoder.h +12 -1
  18. torchcodec/_core/FFMPEGCommon.cpp +1 -1
  19. torchcodec/_core/FFMPEGCommon.h +9 -1
  20. torchcodec/_core/FilterGraph.cpp +2 -1
  21. torchcodec/_core/Frame.cpp +5 -0
  22. torchcodec/_core/Frame.h +1 -1
  23. torchcodec/_core/NVCUVIDRuntimeLoader.cpp +320 -0
  24. torchcodec/_core/NVCUVIDRuntimeLoader.h +14 -0
  25. torchcodec/_core/NVDECCache.cpp +3 -13
  26. torchcodec/_core/NVDECCache.h +4 -6
  27. torchcodec/_core/SingleStreamDecoder.cpp +22 -31
  28. torchcodec/_core/SingleStreamDecoder.h +4 -2
  29. torchcodec/_core/StreamOptions.h +2 -2
  30. torchcodec/_core/Transform.cpp +27 -0
  31. torchcodec/_core/Transform.h +25 -0
  32. torchcodec/_core/__init__.py +3 -0
  33. torchcodec/_core/custom_ops.cpp +99 -22
  34. torchcodec/_core/ops.py +76 -16
  35. torchcodec/decoders/_video_decoder.py +0 -10
  36. torchcodec/libtorchcodec_core4.dylib +0 -0
  37. torchcodec/libtorchcodec_core5.dylib +0 -0
  38. torchcodec/libtorchcodec_core6.dylib +0 -0
  39. torchcodec/libtorchcodec_core7.dylib +0 -0
  40. torchcodec/libtorchcodec_core8.dylib +0 -0
  41. torchcodec/libtorchcodec_custom_ops4.dylib +0 -0
  42. torchcodec/libtorchcodec_custom_ops5.dylib +0 -0
  43. torchcodec/libtorchcodec_custom_ops6.dylib +0 -0
  44. torchcodec/libtorchcodec_custom_ops7.dylib +0 -0
  45. torchcodec/libtorchcodec_custom_ops8.dylib +0 -0
  46. torchcodec/libtorchcodec_pybind_ops4.so +0 -0
  47. torchcodec/libtorchcodec_pybind_ops5.so +0 -0
  48. torchcodec/libtorchcodec_pybind_ops6.so +0 -0
  49. torchcodec/libtorchcodec_pybind_ops7.so +0 -0
  50. torchcodec/libtorchcodec_pybind_ops8.so +0 -0
  51. torchcodec/version.py +1 -1
  52. {torchcodec-0.8.0.dist-info → torchcodec-0.8.1.dist-info}/METADATA +6 -4
  53. torchcodec-0.8.1.dist-info/RECORD +84 -0
  54. torchcodec-0.8.0.dist-info/RECORD +0 -82
  55. {torchcodec-0.8.0.dist-info → torchcodec-0.8.1.dist-info}/WHEEL +0 -0
  56. {torchcodec-0.8.0.dist-info → torchcodec-0.8.1.dist-info}/licenses/LICENSE +0 -0
  57. {torchcodec-0.8.0.dist-info → torchcodec-0.8.1.dist-info}/top_level.txt +0 -0
torchcodec/.dylibs/libc++.1.0.dylib: Binary file
torchcodec/.dylibs/libpython3.11.dylib: Binary file
torchcodec/_core/AVIOTensorContext.cpp
@@ -18,15 +18,15 @@ constexpr int64_t MAX_TENSOR_SIZE = 320'000'000; // 320 MB
  int read(void* opaque, uint8_t* buf, int buf_size) {
  auto tensorContext = static_cast<detail::TensorContext*>(opaque);
  TORCH_CHECK(
- tensorContext->current <= tensorContext->data.numel(),
- "Tried to read outside of the buffer: current=",
- tensorContext->current,
+ tensorContext->current_pos <= tensorContext->data.numel(),
+ "Tried to read outside of the buffer: current_pos=",
+ tensorContext->current_pos,
  ", size=",
  tensorContext->data.numel());

  int64_t numBytesRead = std::min(
  static_cast<int64_t>(buf_size),
- tensorContext->data.numel() - tensorContext->current);
+ tensorContext->data.numel() - tensorContext->current_pos);

  TORCH_CHECK(
  numBytesRead >= 0,
@@ -34,8 +34,8 @@ int read(void* opaque, uint8_t* buf, int buf_size) {
  numBytesRead,
  ", size=",
  tensorContext->data.numel(),
- ", current=",
- tensorContext->current);
+ ", current_pos=",
+ tensorContext->current_pos);

  if (numBytesRead == 0) {
  return AVERROR_EOF;
@@ -43,9 +43,9 @@ int read(void* opaque, uint8_t* buf, int buf_size) {

  std::memcpy(
  buf,
- tensorContext->data.data_ptr<uint8_t>() + tensorContext->current,
+ tensorContext->data.data_ptr<uint8_t>() + tensorContext->current_pos,
  numBytesRead);
- tensorContext->current += numBytesRead;
+ tensorContext->current_pos += numBytesRead;
  return numBytesRead;
  }

@@ -54,7 +54,7 @@ int write(void* opaque, const uint8_t* buf, int buf_size) {
  auto tensorContext = static_cast<detail::TensorContext*>(opaque);

  int64_t bufSize = static_cast<int64_t>(buf_size);
- if (tensorContext->current + bufSize > tensorContext->data.numel()) {
+ if (tensorContext->current_pos + bufSize > tensorContext->data.numel()) {
  TORCH_CHECK(
  tensorContext->data.numel() * 2 <= MAX_TENSOR_SIZE,
  "We tried to allocate an output encoded tensor larger than ",
@@ -68,13 +68,17 @@ int write(void* opaque, const uint8_t* buf, int buf_size) {
  }

  TORCH_CHECK(
- tensorContext->current + bufSize <= tensorContext->data.numel(),
+ tensorContext->current_pos + bufSize <= tensorContext->data.numel(),
  "Re-allocation of the output tensor didn't work. ",
  "This should not happen, please report on TorchCodec bug tracker");

  uint8_t* outputTensorData = tensorContext->data.data_ptr<uint8_t>();
- std::memcpy(outputTensorData + tensorContext->current, buf, bufSize);
- tensorContext->current += bufSize;
+ std::memcpy(outputTensorData + tensorContext->current_pos, buf, bufSize);
+ tensorContext->current_pos += bufSize;
+ // Track the maximum position written so getOutputTensor's narrow() does not
+ // truncate the file if final seek was backwards
+ tensorContext->max_pos =
+ std::max(tensorContext->current_pos, tensorContext->max_pos);
  return buf_size;
  }

@@ -88,7 +92,7 @@ int64_t seek(void* opaque, int64_t offset, int whence) {
  ret = tensorContext->data.numel();
  break;
  case SEEK_SET:
- tensorContext->current = offset;
+ tensorContext->current_pos = offset;
  ret = offset;
  break;
  default:
@@ -101,7 +105,7 @@ int64_t seek(void* opaque, int64_t offset, int whence) {
  } // namespace

  AVIOFromTensorContext::AVIOFromTensorContext(torch::Tensor data)
- : tensorContext_{data, 0} {
+ : tensorContext_{data, 0, 0} {
  TORCH_CHECK(data.numel() > 0, "data must not be empty");
  TORCH_CHECK(data.is_contiguous(), "data must be contiguous");
  TORCH_CHECK(data.scalar_type() == torch::kUInt8, "data must be kUInt8");
@@ -110,14 +114,17 @@ AVIOFromTensorContext::AVIOFromTensorContext(torch::Tensor data)
  }

  AVIOToTensorContext::AVIOToTensorContext()
- : tensorContext_{torch::empty({INITIAL_TENSOR_SIZE}, {torch::kUInt8}), 0} {
+ : tensorContext_{
+ torch::empty({INITIAL_TENSOR_SIZE}, {torch::kUInt8}),
+ 0,
+ 0} {
  createAVIOContext(
  nullptr, &write, &seek, &tensorContext_, /*isForWriting=*/true);
  }

  torch::Tensor AVIOToTensorContext::getOutputTensor() {
  return tensorContext_.data.narrow(
- /*dim=*/0, /*start=*/0, /*length=*/tensorContext_.current);
+ /*dim=*/0, /*start=*/0, /*length=*/tensorContext_.max_pos);
  }

  } // namespace facebook::torchcodec
torchcodec/_core/AVIOTensorContext.h
@@ -15,7 +15,8 @@ namespace detail {

  struct TensorContext {
  torch::Tensor data;
- int64_t current;
+ int64_t current_pos;
+ int64_t max_pos;
  };

  } // namespace detail
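For context on the max_pos addition above: an FFmpeg muxer can seek backwards through the AVIO callbacks (for example to patch a header it wrote earlier) and then write a few bytes, which leaves current_pos pointing into the middle of the encoded data; narrowing on current_pos would then truncate the output. The standalone sketch below illustrates that invariant only; OutputBuffer and its methods are hypothetical stand-ins, not the real TensorContext or AVIO plumbing.

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

// Hypothetical stand-in for the tensor-backed output buffer: it tracks both
// the current write position and the furthest position ever written.
struct OutputBuffer {
  std::vector<uint8_t> data;
  int64_t current_pos = 0;
  int64_t max_pos = 0;

  void write(const uint8_t* buf, int64_t n) {
    if (current_pos + n > static_cast<int64_t>(data.size())) {
      data.resize(current_pos + n);
    }
    std::memcpy(data.data() + current_pos, buf, n);
    current_pos += n;
    max_pos = std::max(max_pos, current_pos);  // same idea as the fix above
  }

  void seek(int64_t offset) {  // SEEK_SET semantics
    current_pos = offset;
  }

  // Equivalent of getOutputTensor(): the valid output length is max_pos,
  // not current_pos, which may point into the middle of the file.
  int64_t outputSize() const { return max_pos; }
};

int main() {
  OutputBuffer out;
  uint8_t header[8] = {};
  uint8_t payload[64] = {};
  out.write(header, sizeof(header));
  out.write(payload, sizeof(payload));
  out.seek(0);                     // muxer goes back to patch the header...
  out.write(header, sizeof(header));
  assert(out.outputSize() == 72);  // ...but the file is still 72 bytes long
  return 0;
}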
torchcodec/_core/BetaCudaDeviceInterface.cpp
@@ -15,7 +15,7 @@
  #include "src/torchcodec/_core/FFMPEGCommon.h"
  #include "src/torchcodec/_core/NVDECCache.h"

- // #include <cuda_runtime.h> // For cudaStreamSynchronize
+ #include "src/torchcodec/_core/NVCUVIDRuntimeLoader.h"
  #include "src/torchcodec/_core/nvcuvid_include/cuviddec.h"
  #include "src/torchcodec/_core/nvcuvid_include/nvcuvid.h"

@@ -53,74 +53,6 @@ pfnDisplayPictureCallback(void* pUserData, CUVIDPARSERDISPINFO* dispInfo) {
  }

  static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
- // Check decoder capabilities - same checks as DALI
- auto caps = CUVIDDECODECAPS{};
- caps.eCodecType = videoFormat->codec;
- caps.eChromaFormat = videoFormat->chroma_format;
- caps.nBitDepthMinus8 = videoFormat->bit_depth_luma_minus8;
- CUresult result = cuvidGetDecoderCaps(&caps);
- TORCH_CHECK(result == CUDA_SUCCESS, "Failed to get decoder caps: ", result);
-
- TORCH_CHECK(
- caps.bIsSupported,
- "Codec configuration not supported on this GPU. "
- "Codec: ",
- static_cast<int>(videoFormat->codec),
- ", chroma format: ",
- static_cast<int>(videoFormat->chroma_format),
- ", bit depth: ",
- videoFormat->bit_depth_luma_minus8 + 8);
-
- TORCH_CHECK(
- videoFormat->coded_width >= caps.nMinWidth &&
- videoFormat->coded_height >= caps.nMinHeight,
- "Video is too small in at least one dimension. Provided: ",
- videoFormat->coded_width,
- "x",
- videoFormat->coded_height,
- " vs supported:",
- caps.nMinWidth,
- "x",
- caps.nMinHeight);
-
- TORCH_CHECK(
- videoFormat->coded_width <= caps.nMaxWidth &&
- videoFormat->coded_height <= caps.nMaxHeight,
- "Video is too large in at least one dimension. Provided: ",
- videoFormat->coded_width,
- "x",
- videoFormat->coded_height,
- " vs supported:",
- caps.nMaxWidth,
- "x",
- caps.nMaxHeight);
-
- // See nMaxMBCount in cuviddec.h
- constexpr unsigned int macroblockConstant = 256;
- TORCH_CHECK(
- videoFormat->coded_width * videoFormat->coded_height /
- macroblockConstant <=
- caps.nMaxMBCount,
- "Video is too large (too many macroblocks). "
- "Provided (width * height / ",
- macroblockConstant,
- "): ",
- videoFormat->coded_width * videoFormat->coded_height / macroblockConstant,
- " vs supported:",
- caps.nMaxMBCount);
-
- // Below we'll set the decoderParams.OutputFormat to NV12, so we need to make
- // sure it's actually supported.
- TORCH_CHECK(
- (caps.nOutputFormatMask >> cudaVideoSurfaceFormat_NV12) & 1,
- "NV12 output format is not supported for this configuration. ",
- "Codec: ",
- static_cast<int>(videoFormat->codec),
- ", chroma format: ",
- static_cast<int>(videoFormat->chroma_format),
- ", bit depth: ",
- videoFormat->bit_depth_luma_minus8 + 8);
-
  // Decoder creation parameters, most are taken from DALI
  CUVIDDECODECREATEINFO decoderParams = {};
  decoderParams.bitDepthMinus8 = videoFormat->bit_depth_luma_minus8;
@@ -129,7 +61,7 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
  // automatically converted to 8bits by NVDEC itself. That is, the raw frames
  // we get back from cuvidMapVideoFrame will already be in 8bit format. We
  // won't need to do the conversion ourselves, so that's a lot easier.
- // In the default interface, we have to do the 10 -> 8bits conversion
+ // In the ffmpeg CUDA interface, we have to do the 10 -> 8bits conversion
  // ourselves later in convertAVFrameToFrameOutput(), because FFmpeg explicitly
  // requests 10 or 16bits output formats for >8-bit videos!
  // https://github.com/FFmpeg/FFmpeg/blob/e05f8acabff468c1382277c1f31fa8e9d90c3202/libavcodec/nvdec.c#L376-L403
@@ -157,13 +89,39 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
  decoderParams.display_area.bottom = videoFormat->display_area.bottom;

  CUvideodecoder* decoder = new CUvideodecoder();
- result = cuvidCreateDecoder(decoder, &decoderParams);
+ CUresult result = cuvidCreateDecoder(decoder, &decoderParams);
  TORCH_CHECK(
  result == CUDA_SUCCESS, "Failed to create NVDEC decoder: ", result);
  return UniqueCUvideodecoder(decoder, CUvideoDecoderDeleter{});
  }

- cudaVideoCodec validateCodecSupport(AVCodecID codecId) {
+ std::optional<cudaVideoChromaFormat> validateChromaSupport(
+ const AVPixFmtDescriptor* desc) {
+ // Return the corresponding cudaVideoChromaFormat if supported, std::nullopt
+ // otherwise.
+ TORCH_CHECK(desc != nullptr, "desc can't be null");
+
+ if (desc->nb_components == 1) {
+ return cudaVideoChromaFormat_Monochrome;
+ } else if (desc->nb_components >= 3 && !(desc->flags & AV_PIX_FMT_FLAG_RGB)) {
+ // Make sure it's YUV: has chroma planes and isn't RGB
+ if (desc->log2_chroma_w == 0 && desc->log2_chroma_h == 0) {
+ return cudaVideoChromaFormat_444; // 1x1 subsampling = 4:4:4
+ } else if (desc->log2_chroma_w == 1 && desc->log2_chroma_h == 1) {
+ return cudaVideoChromaFormat_420; // 2x2 subsampling = 4:2:0
+ } else if (desc->log2_chroma_w == 1 && desc->log2_chroma_h == 0) {
+ return cudaVideoChromaFormat_422; // 2x1 subsampling = 4:2:2
+ }
+ }
+
+ return std::nullopt;
+ }
+
+ std::optional<cudaVideoCodec> validateCodecSupport(AVCodecID codecId) {
+ // Return the corresponding cudaVideoCodec if supported, std::nullopt
+ // otherwise
+ // Note that we currently return nullopt (and thus fallback to CPU) for some
+ // codecs that are technically supported by NVDEC, see comment below.
  switch (codecId) {
  case AV_CODEC_ID_H264:
  return cudaVideoCodec_H264;
@@ -189,12 +147,72 @@ cudaVideoCodec validateCodecSupport(AVCodecID codecId) {
  // return cudaVideoCodec_JPEG;
  // case AV_CODEC_ID_VC1:
  // return cudaVideoCodec_VC1;
- default: {
- TORCH_CHECK(false, "Unsupported codec type: ", avcodec_get_name(codecId));
- }
+ default:
+ return std::nullopt;
  }
  }

+ bool nativeNVDECSupport(const SharedAVCodecContext& codecContext) {
+ // Return true iff the input video stream is supported by our NVDEC
+ // implementation.
+
+ auto codecType = validateCodecSupport(codecContext->codec_id);
+ if (!codecType.has_value()) {
+ return false;
+ }
+
+ const AVPixFmtDescriptor* desc = av_pix_fmt_desc_get(codecContext->pix_fmt);
+ if (!desc) {
+ return false;
+ }
+
+ auto chromaFormat = validateChromaSupport(desc);
+ if (!chromaFormat.has_value()) {
+ return false;
+ }
+
+ auto caps = CUVIDDECODECAPS{};
+ caps.eCodecType = codecType.value();
+ caps.eChromaFormat = chromaFormat.value();
+ caps.nBitDepthMinus8 = desc->comp[0].depth - 8;
+
+ CUresult result = cuvidGetDecoderCaps(&caps);
+ if (result != CUDA_SUCCESS) {
+ return false;
+ }
+
+ if (!caps.bIsSupported) {
+ return false;
+ }
+
+ auto coded_width = static_cast<unsigned int>(codecContext->coded_width);
+ auto coded_height = static_cast<unsigned int>(codecContext->coded_height);
+ if (coded_width < static_cast<unsigned int>(caps.nMinWidth) ||
+ coded_height < static_cast<unsigned int>(caps.nMinHeight) ||
+ coded_width > caps.nMaxWidth || coded_height > caps.nMaxHeight) {
+ return false;
+ }
+
+ // See nMaxMBCount in cuviddec.h
+ constexpr unsigned int macroblockConstant = 256;
+ if (coded_width * coded_height / macroblockConstant > caps.nMaxMBCount) {
+ return false;
+ }
+
+ // We'll set the decoderParams.OutputFormat to NV12, so we need to make
+ // sure it's actually supported.
+ // TODO: If this fail, we could consider decoding to something else than NV12
+ // (like cudaVideoSurfaceFormat_P016) instead of falling back to CPU. This is
+ // what FFmpeg does.
+ bool supportsNV12Output =
+ (caps.nOutputFormatMask >> cudaVideoSurfaceFormat_NV12) & 1;
+ if (!supportsNV12Output) {
+ return false;
+ }
+
+ return true;
+ }
+
  } // namespace

  BetaCudaDeviceInterface::BetaCudaDeviceInterface(const torch::Device& device)
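The log2_chroma_w / log2_chroma_h checks in validateChromaSupport above rely on FFmpeg's pixel format descriptors, where each field gives the chroma plane subsampling as a power of two. The small standalone program below is not part of this diff; it only assumes the libavutil development headers, and prints those fields for a few common formats to show how they map onto the cudaVideoChromaFormat values.

// Requires the FFmpeg development headers (libavutil). Prints the subsampling
// factors that validateChromaSupport() keys off of for a few common formats.
extern "C" {
#include <libavutil/pixdesc.h>
}
#include <cstdio>

static void show(AVPixelFormat fmt) {
  const AVPixFmtDescriptor* desc = av_pix_fmt_desc_get(fmt);
  if (!desc) {
    return;
  }
  // log2_chroma_w/h == 1 means the chroma planes are halved in that dimension.
  std::printf(
      "%-12s components=%d log2_chroma_w=%d log2_chroma_h=%d\n",
      desc->name,
      desc->nb_components,
      desc->log2_chroma_w,
      desc->log2_chroma_h);
}

int main() {
  show(AV_PIX_FMT_YUV420P);  // 1,1 -> cudaVideoChromaFormat_420
  show(AV_PIX_FMT_YUV422P);  // 1,0 -> cudaVideoChromaFormat_422
  show(AV_PIX_FMT_YUV444P);  // 0,0 -> cudaVideoChromaFormat_444
  show(AV_PIX_FMT_GRAY8);    // single component -> Monochrome
  return 0;
}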
@@ -205,6 +223,8 @@ BetaCudaDeviceInterface::BetaCudaDeviceInterface(const torch::Device& device)

  initializeCudaContextWithPytorch(device_);
  nppCtx_ = getNppStreamContext(device_);
+
+ nvcuvidAvailable_ = loadNVCUVIDLibrary();
  }

  BetaCudaDeviceInterface::~BetaCudaDeviceInterface() {
@@ -216,12 +236,11 @@ BetaCudaDeviceInterface::~BetaCudaDeviceInterface() {
  // unclear.
  flush();
  unmapPreviousFrame();
- NVDECCache::getCache(device_.index())
- .returnDecoder(&videoFormat_, std::move(decoder_));
+ NVDECCache::getCache(device_).returnDecoder(
+ &videoFormat_, std::move(decoder_));
  }

  if (videoParser_) {
- // TODONVDEC P2: consider caching this? Does DALI do that?
  cuvidDestroyVideoParser(videoParser_);
  videoParser_ = nullptr;
  }
@@ -231,7 +250,21 @@ BetaCudaDeviceInterface::~BetaCudaDeviceInterface() {

  void BetaCudaDeviceInterface::initialize(
  const AVStream* avStream,
- const UniqueDecodingAVFormatContext& avFormatCtx) {
+ const UniqueDecodingAVFormatContext& avFormatCtx,
+ [[maybe_unused]] const SharedAVCodecContext& codecContext) {
+ if (!nvcuvidAvailable_ || !nativeNVDECSupport(codecContext)) {
+ cpuFallback_ = createDeviceInterface(torch::kCPU);
+ TORCH_CHECK(
+ cpuFallback_ != nullptr, "Failed to create CPU device interface");
+ cpuFallback_->initialize(avStream, avFormatCtx, codecContext);
+ cpuFallback_->initializeVideo(
+ VideoStreamOptions(),
+ {},
+ /*resizedOutputDims=*/std::nullopt);
+ // We'll always use the CPU fallback from now on, so we can return early.
+ return;
+ }
+
  TORCH_CHECK(avStream != nullptr, "AVStream cannot be null");
  timeBase_ = avStream->time_base;
  frameRateAvgFromFFmpeg_ = avStream->r_frame_rate;
@@ -243,7 +276,11 @@ void BetaCudaDeviceInterface::initialize(

  // Create parser. Default values that aren't obvious are taken from DALI.
  CUVIDPARSERPARAMS parserParams = {};
- parserParams.CodecType = validateCodecSupport(codecPar->codec_id);
+ auto codecType = validateCodecSupport(codecPar->codec_id);
+ TORCH_CHECK(
+ codecType.has_value(),
+ "This should never happen, we should be using the CPU fallback by now. Please report a bug.");
+ parserParams.CodecType = codecType.value();
  parserParams.ulMaxNumDecodeSurfaces = 8;
  parserParams.ulMaxDisplayDelay = 0;
  // Callback setup, all are triggered by the parser within a call
@@ -362,11 +399,12 @@ int BetaCudaDeviceInterface::streamPropertyChange(CUVIDEOFORMAT* videoFormat) {
  }

  if (!decoder_) {
- decoder_ = NVDECCache::getCache(device_.index()).getDecoder(videoFormat);
+ decoder_ = NVDECCache::getCache(device_).getDecoder(videoFormat);

  if (!decoder_) {
  // TODONVDEC P2: consider re-configuring an existing decoder instead of
- // re-creating one. See docs, see DALI.
+ // re-creating one. See docs, see DALI. Re-configuration doesn't seem to
+ // be enabled in DALI by default.
  decoder_ = createDecoder(videoFormat);
  }

@@ -382,6 +420,10 @@ int BetaCudaDeviceInterface::streamPropertyChange(CUVIDEOFORMAT* videoFormat) {
  // Moral equivalent of avcodec_send_packet(). Here, we pass the AVPacket down to
  // the NVCUVID parser.
  int BetaCudaDeviceInterface::sendPacket(ReferenceAVPacket& packet) {
+ if (cpuFallback_) {
+ return cpuFallback_->sendPacket(packet);
+ }
+
  TORCH_CHECK(
  packet.get() && packet->data && packet->size > 0,
  "sendPacket received an empty packet, this is unexpected, please report.");
@@ -405,6 +447,10 @@ int BetaCudaDeviceInterface::sendPacket(ReferenceAVPacket& packet) {
  }

  int BetaCudaDeviceInterface::sendEOFPacket() {
+ if (cpuFallback_) {
+ return cpuFallback_->sendEOFPacket();
+ }
+
  CUVIDSOURCEDATAPACKET cuvidPacket = {};
  cuvidPacket.flags = CUVID_PKT_ENDOFSTREAM;
  eofSent_ = true;
@@ -466,6 +512,10 @@ int BetaCudaDeviceInterface::frameReadyInDisplayOrder(

  // Moral equivalent of avcodec_receive_frame().
  int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) {
+ if (cpuFallback_) {
+ return cpuFallback_->receiveFrame(avFrame);
+ }
+
  if (readyFrames_.empty()) {
  // No frame found, instruct caller to try again later after sending more
  // packets, or to stop if EOF was already sent.
@@ -480,8 +530,7 @@ int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) {
  procParams.top_field_first = dispInfo.top_field_first;
  procParams.unpaired_field = dispInfo.repeat_first_field < 0;
  // We set the NVDEC stream to the current stream. It will be waited upon by
- // the NPP stream before any color conversion. Currently, that syncing logic
- // is in the default interface.
+ // the NPP stream before any color conversion.
  // Re types: we get a cudaStream_t from PyTorch but it's interchangeable with
  // CUstream
  procParams.output_stream = reinterpret_cast<CUstream>(
@@ -601,6 +650,11 @@ UniqueAVFrame BetaCudaDeviceInterface::convertCudaFrameToAVFrame(
  }

  void BetaCudaDeviceInterface::flush() {
+ if (cpuFallback_) {
+ cpuFallback_->flush();
+ return;
+ }
+
  // The NVCUVID docs mention that after seeking, i.e. when flush() is called,
  // we should send a packet with the CUVID_PKT_DISCONTINUITY flag. The docs
  // don't say whether this should be an empty packet, or whether it should be a
@@ -618,8 +672,23 @@ void BetaCudaDeviceInterface::convertAVFrameToFrameOutput(
  UniqueAVFrame& avFrame,
  FrameOutput& frameOutput,
  std::optional<torch::Tensor> preAllocatedOutputTensor) {
- // TODONVDEC P2: we may need to handle 10bit videos the same way the default
- // interface does it with maybeConvertAVFrameToNV12OrRGB24().
+ if (cpuFallback_) {
+ // CPU decoded frame - need to do CPU color conversion then transfer to GPU
+ FrameOutput cpuFrameOutput;
+ cpuFallback_->convertAVFrameToFrameOutput(avFrame, cpuFrameOutput);
+
+ // Transfer CPU frame to GPU
+ if (preAllocatedOutputTensor.has_value()) {
+ preAllocatedOutputTensor.value().copy_(cpuFrameOutput.data);
+ frameOutput.data = preAllocatedOutputTensor.value();
+ } else {
+ frameOutput.data = cpuFrameOutput.data.to(device_);
+ }
+ return;
+ }
+
+ // TODONVDEC P2: we may need to handle 10bit videos the same way the CUDA
+ // ffmpeg interface does it with maybeConvertAVFrameToNV12OrRGB24().
  TORCH_CHECK(
  avFrame->format == AV_PIX_FMT_CUDA,
  "Expected CUDA format frame from BETA CUDA interface");
@@ -633,4 +702,17 @@ void BetaCudaDeviceInterface::convertAVFrameToFrameOutput(
  avFrame, device_, nppCtx_, nvdecStream, preAllocatedOutputTensor);
  }

+ std::string BetaCudaDeviceInterface::getDetails() {
+ std::string details = "Beta CUDA Device Interface.";
+ if (cpuFallback_) {
+ details += " Using CPU fallback.";
+ if (!nvcuvidAvailable_) {
+ details += " NVCUVID not available!";
+ }
+ } else {
+ details += " Using NVDEC.";
+ }
+ return details;
+ }
+
  } // namespace facebook::torchcodec
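The CPU-fallback branch of convertAVFrameToFrameOutput above transfers the CPU-decoded frame to the GPU in one of two ways: it copies into the caller's pre-allocated CUDA tensor when one is provided, and otherwise allocates a new tensor on the device with .to(). A minimal sketch of just that transfer step, using plain tensors rather than the real FrameOutput type (illustrative only):

// Assumes libtorch; transferToDevice() is a hypothetical helper, not torchcodec API.
#include <optional>
#include <torch/torch.h>

torch::Tensor transferToDevice(
    const torch::Tensor& cpuFrame,
    const torch::Device& device,
    std::optional<torch::Tensor> preAllocatedOutput) {
  if (preAllocatedOutput.has_value()) {
    // Reuses the caller's GPU memory; shapes must already match.
    preAllocatedOutput.value().copy_(cpuFrame);
    return preAllocatedOutput.value();
  }
  // Allocates a new tensor on the target device.
  return cpuFrame.to(device);
}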
torchcodec/_core/BetaCudaDeviceInterface.h
@@ -40,7 +40,8 @@ class BetaCudaDeviceInterface : public DeviceInterface {

  void initialize(
  const AVStream* avStream,
- const UniqueDecodingAVFormatContext& avFormatCtx) override;
+ const UniqueDecodingAVFormatContext& avFormatCtx,
+ const SharedAVCodecContext& codecContext) override;

  void convertAVFrameToFrameOutput(
  UniqueAVFrame& avFrame,
@@ -48,10 +49,6 @@ class BetaCudaDeviceInterface : public DeviceInterface {
  std::optional<torch::Tensor> preAllocatedOutputTensor =
  std::nullopt) override;

- bool canDecodePacketDirectly() const override {
- return true;
- }
-
  int sendPacket(ReferenceAVPacket& packet) override;
  int sendEOFPacket() override;
  int receiveFrame(UniqueAVFrame& avFrame) override;
@@ -62,6 +59,8 @@ class BetaCudaDeviceInterface : public DeviceInterface {
  int frameReadyForDecoding(CUVIDPICPARAMS* picParams);
  int frameReadyInDisplayOrder(CUVIDPARSERDISPINFO* dispInfo);

+ std::string getDetails() override;
+
  private:
  int sendCuvidPacket(CUVIDSOURCEDATAPACKET& cuvidPacket);

@@ -97,6 +96,9 @@ class BetaCudaDeviceInterface : public DeviceInterface {

  // NPP context for color conversion
  UniqueNppContext nppCtx_;
+
+ std::unique_ptr<DeviceInterface> cpuFallback_;
+ bool nvcuvidAvailable_ = false;
  };

  } // namespace facebook::torchcodec
torchcodec/_core/CMakeLists.txt
@@ -99,7 +99,7 @@ function(make_torchcodec_libraries
  )

  if(ENABLE_CUDA)
- list(APPEND core_sources CudaDeviceInterface.cpp BetaCudaDeviceInterface.cpp NVDECCache.cpp CUDACommon.cpp)
+ list(APPEND core_sources CudaDeviceInterface.cpp BetaCudaDeviceInterface.cpp NVDECCache.cpp CUDACommon.cpp NVCUVIDRuntimeLoader.cpp)
  endif()

  set(core_library_dependencies
@@ -108,27 +108,9 @@ function(make_torchcodec_libraries
  )

  if(ENABLE_CUDA)
- # Try to find NVCUVID. Try the normal way first. This should work locally.
- find_library(NVCUVID_LIBRARY NAMES nvcuvid)
- # If not found, try with version suffix, or hardcoded path. Appears
- # to be necessary on the CI.
- if(NOT NVCUVID_LIBRARY)
- find_library(NVCUVID_LIBRARY NAMES nvcuvid.1 PATHS /usr/lib64 /usr/lib)
- endif()
- if(NOT NVCUVID_LIBRARY)
- set(NVCUVID_LIBRARY "/usr/lib64/libnvcuvid.so.1")
- endif()
-
- if(NVCUVID_LIBRARY)
- message(STATUS "Found NVCUVID: ${NVCUVID_LIBRARY}")
- else()
- message(FATAL_ERROR "Could not find NVCUVID library")
- endif()
-
  list(APPEND core_library_dependencies
  ${CUDA_nppi_LIBRARY}
  ${CUDA_nppicc_LIBRARY}
- ${NVCUVID_LIBRARY}
  )
  endif()
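With the find_library(nvcuvid) logic removed above, the wheel no longer links against libnvcuvid at build time; the new NVCUVIDRuntimeLoader.cpp (not shown in this diff) resolves it at runtime instead, which is what lets the constructor's loadNVCUVIDLibrary() call fail gracefully and trigger the CPU fallback. Below is a minimal sketch of that runtime-loading pattern under the assumption that it uses dlopen; the function name and candidate library names are illustrative, not the actual loader's contents.

// Linux-style runtime loading sketch (illustrative only, not NVCUVIDRuntimeLoader.cpp).
#include <dlfcn.h>
#include <initializer_list>

namespace {
void* nvcuvidHandle = nullptr;
}

bool loadNVCUVIDLibrarySketch() {
  if (nvcuvidHandle != nullptr) {
    return true;  // already loaded
  }
  // Try the unversioned name first, then the SONAME installed by the driver.
  for (const char* name : {"libnvcuvid.so", "libnvcuvid.so.1"}) {
    nvcuvidHandle = dlopen(name, RTLD_NOW | RTLD_GLOBAL);
    if (nvcuvidHandle != nullptr) {
      return true;
    }
  }
  // Caller is expected to fall back to CPU decoding when this returns false.
  return false;
}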
torchcodec/_core/CUDACommon.cpp
@@ -5,14 +5,12 @@
  // LICENSE file in the root directory of this source tree.

  #include "src/torchcodec/_core/CUDACommon.h"
+ #include "src/torchcodec/_core/Cache.h" // for PerGpuCache

  namespace facebook::torchcodec {

  namespace {

- // Pytorch can only handle up to 128 GPUs.
- // https://github.com/pytorch/pytorch/blob/e30c55ee527b40d67555464b9e402b4b7ce03737/c10/cuda/CUDAMacros.h#L44
- const int MAX_CUDA_GPUS = 128;
  // Set to -1 to have an infinitely sized cache. Set it to 0 to disable caching.
  // Set to a positive number to have a cache of that size.
  const int MAX_CONTEXTS_PER_GPU_IN_CACHE = -1;
@@ -249,7 +247,7 @@ torch::Tensor convertNV12FrameToRGB(
  }

  UniqueNppContext getNppStreamContext(const torch::Device& device) {
- torch::DeviceIndex nonNegativeDeviceIndex = getNonNegativeDeviceIndex(device);
+ int deviceIndex = getDeviceIndex(device);

  UniqueNppContext nppCtx = g_cached_npp_ctxs.get(device);
  if (nppCtx) {
@@ -266,13 +264,13 @@ UniqueNppContext getNppStreamContext(const torch::Device& device) {

  nppCtx = std::make_unique<NppStreamContext>();
  cudaDeviceProp prop{};
- cudaError_t err = cudaGetDeviceProperties(&prop, nonNegativeDeviceIndex);
+ cudaError_t err = cudaGetDeviceProperties(&prop, deviceIndex);
  TORCH_CHECK(
  err == cudaSuccess,
  "cudaGetDeviceProperties failed: ",
  cudaGetErrorString(err));

- nppCtx->nCudaDeviceId = nonNegativeDeviceIndex;
+ nppCtx->nCudaDeviceId = deviceIndex;
  nppCtx->nMultiProcessorCount = prop.multiProcessorCount;
  nppCtx->nMaxThreadsPerMultiProcessor = prop.maxThreadsPerMultiProcessor;
  nppCtx->nMaxThreadsPerBlock = prop.maxThreadsPerBlock;
@@ -312,4 +310,21 @@ void validatePreAllocatedTensorShape(
  }
  }

+ int getDeviceIndex(const torch::Device& device) {
+ // PyTorch uses int8_t as its torch::DeviceIndex, but FFmpeg and CUDA
+ // libraries use int. So we use int, too.
+ int deviceIndex = static_cast<int>(device.index());
+ TORCH_CHECK(
+ deviceIndex >= -1 && deviceIndex < MAX_CUDA_GPUS,
+ "Invalid device index = ",
+ deviceIndex);
+
+ if (deviceIndex == -1) {
+ TORCH_CHECK(
+ cudaGetDevice(&deviceIndex) == cudaSuccess,
+ "Failed to get current CUDA device.");
+ }
+ return deviceIndex;
+ }
+
  } // namespace facebook::torchcodec
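A short usage sketch for the new getDeviceIndex() helper above, assuming a CUDA-enabled torchcodec build and linkage against its core library (the example itself is not part of the diff): a torch::Device constructed as plain "cuda" carries index -1, which the helper resolves to the current device via cudaGetDevice(), while an explicit "cuda:1" simply yields 1.

#include <torch/types.h>
#include <iostream>

// Declaration as added to CUDACommon.h in this release.
namespace facebook::torchcodec {
int getDeviceIndex(const torch::Device& device);
}

int main() {
  torch::Device unspecified("cuda");     // index() == -1
  torch::Device explicitDevice("cuda:1");  // index() == 1
  // -1 is resolved to the currently active CUDA device; 1 is returned as-is.
  std::cout << facebook::torchcodec::getDeviceIndex(unspecified) << "\n";
  std::cout << facebook::torchcodec::getDeviceIndex(explicitDevice) << "\n";
  return 0;
}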
torchcodec/_core/CUDACommon.h
@@ -11,7 +11,6 @@
  #include <npp.h>
  #include <torch/types.h>

- #include "src/torchcodec/_core/Cache.h"
  #include "src/torchcodec/_core/FFMPEGCommon.h"
  #include "src/torchcodec/_core/Frame.h"

@@ -22,6 +21,10 @@ extern "C" {

  namespace facebook::torchcodec {

+ // Pytorch can only handle up to 128 GPUs.
+ // https://github.com/pytorch/pytorch/blob/e30c55ee527b40d67555464b9e402b4b7ce03737/c10/cuda/CUDAMacros.h#L44
+ constexpr int MAX_CUDA_GPUS = 128;
+
  void initializeCudaContextWithPytorch(const torch::Device& device);

  // Unique pointer type for NPP stream context
@@ -43,4 +46,6 @@ void validatePreAllocatedTensorShape(
  const std::optional<torch::Tensor>& preAllocatedOutputTensor,
  const UniqueAVFrame& avFrame);

+ int getDeviceIndex(const torch::Device& device);
+
  } // namespace facebook::torchcodec