torchcodec-0.8.0-cp311-cp311-win_amd64.whl → torchcodec-0.8.1-cp311-cp311-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of torchcodec might be problematic.
- torchcodec/_core/AVIOTensorContext.cpp +23 -16
- torchcodec/_core/AVIOTensorContext.h +2 -1
- torchcodec/_core/BetaCudaDeviceInterface.cpp +168 -86
- torchcodec/_core/BetaCudaDeviceInterface.h +7 -5
- torchcodec/_core/CMakeLists.txt +1 -19
- torchcodec/_core/CUDACommon.cpp +21 -6
- torchcodec/_core/CUDACommon.h +6 -1
- torchcodec/_core/Cache.h +6 -20
- torchcodec/_core/CpuDeviceInterface.cpp +7 -1
- torchcodec/_core/CpuDeviceInterface.h +4 -1
- torchcodec/_core/CudaDeviceInterface.cpp +19 -11
- torchcodec/_core/CudaDeviceInterface.h +6 -1
- torchcodec/_core/DeviceInterface.h +27 -27
- torchcodec/_core/Encoder.cpp +51 -7
- torchcodec/_core/Encoder.h +12 -1
- torchcodec/_core/FFMPEGCommon.cpp +1 -1
- torchcodec/_core/FFMPEGCommon.h +9 -1
- torchcodec/_core/FilterGraph.cpp +2 -1
- torchcodec/_core/Frame.cpp +5 -0
- torchcodec/_core/Frame.h +1 -1
- torchcodec/_core/NVCUVIDRuntimeLoader.cpp +320 -0
- torchcodec/_core/NVCUVIDRuntimeLoader.h +14 -0
- torchcodec/_core/NVDECCache.cpp +3 -13
- torchcodec/_core/NVDECCache.h +4 -6
- torchcodec/_core/SingleStreamDecoder.cpp +22 -31
- torchcodec/_core/SingleStreamDecoder.h +4 -2
- torchcodec/_core/StreamOptions.h +2 -2
- torchcodec/_core/Transform.cpp +27 -0
- torchcodec/_core/Transform.h +25 -0
- torchcodec/_core/__init__.py +3 -0
- torchcodec/_core/custom_ops.cpp +99 -22
- torchcodec/_core/ops.py +76 -16
- torchcodec/decoders/_video_decoder.py +0 -10
- torchcodec/libtorchcodec_core4.dll +0 -0
- torchcodec/libtorchcodec_core5.dll +0 -0
- torchcodec/libtorchcodec_core6.dll +0 -0
- torchcodec/libtorchcodec_core7.dll +0 -0
- torchcodec/libtorchcodec_core8.dll +0 -0
- torchcodec/libtorchcodec_custom_ops4.dll +0 -0
- torchcodec/libtorchcodec_custom_ops5.dll +0 -0
- torchcodec/libtorchcodec_custom_ops6.dll +0 -0
- torchcodec/libtorchcodec_custom_ops7.dll +0 -0
- torchcodec/libtorchcodec_custom_ops8.dll +0 -0
- torchcodec/libtorchcodec_pybind_ops4.pyd +0 -0
- torchcodec/libtorchcodec_pybind_ops5.pyd +0 -0
- torchcodec/libtorchcodec_pybind_ops6.pyd +0 -0
- torchcodec/libtorchcodec_pybind_ops7.pyd +0 -0
- torchcodec/libtorchcodec_pybind_ops8.pyd +0 -0
- torchcodec/version.py +1 -1
- {torchcodec-0.8.0.dist-info → torchcodec-0.8.1.dist-info}/METADATA +6 -4
- torchcodec-0.8.1.dist-info/RECORD +82 -0
- torchcodec-0.8.0.dist-info/RECORD +0 -80
- {torchcodec-0.8.0.dist-info → torchcodec-0.8.1.dist-info}/WHEEL +0 -0
- {torchcodec-0.8.0.dist-info → torchcodec-0.8.1.dist-info}/licenses/LICENSE +0 -0
- {torchcodec-0.8.0.dist-info → torchcodec-0.8.1.dist-info}/top_level.txt +0 -0
torchcodec/_core/AVIOTensorContext.cpp
CHANGED

@@ -18,15 +18,15 @@ constexpr int64_t MAX_TENSOR_SIZE = 320'000'000; // 320 MB
 int read(void* opaque, uint8_t* buf, int buf_size) {
   auto tensorContext = static_cast<detail::TensorContext*>(opaque);
   TORCH_CHECK(
-      tensorContext->
-      "Tried to read outside of the buffer:
-      tensorContext->
+      tensorContext->current_pos <= tensorContext->data.numel(),
+      "Tried to read outside of the buffer: current_pos=",
+      tensorContext->current_pos,
       ", size=",
       tensorContext->data.numel());

   int64_t numBytesRead = std::min(
       static_cast<int64_t>(buf_size),
-      tensorContext->data.numel() - tensorContext->
+      tensorContext->data.numel() - tensorContext->current_pos);

   TORCH_CHECK(
       numBytesRead >= 0,
@@ -34,8 +34,8 @@ int read(void* opaque, uint8_t* buf, int buf_size) {
       numBytesRead,
       ", size=",
       tensorContext->data.numel(),
-      ",
-      tensorContext->
+      ", current_pos=",
+      tensorContext->current_pos);

   if (numBytesRead == 0) {
     return AVERROR_EOF;
@@ -43,9 +43,9 @@ int read(void* opaque, uint8_t* buf, int buf_size) {

   std::memcpy(
       buf,
-      tensorContext->data.data_ptr<uint8_t>() + tensorContext->
+      tensorContext->data.data_ptr<uint8_t>() + tensorContext->current_pos,
       numBytesRead);
-  tensorContext->
+  tensorContext->current_pos += numBytesRead;
   return numBytesRead;
 }

@@ -54,7 +54,7 @@ int write(void* opaque, const uint8_t* buf, int buf_size) {
   auto tensorContext = static_cast<detail::TensorContext*>(opaque);

   int64_t bufSize = static_cast<int64_t>(buf_size);
-  if (tensorContext->
+  if (tensorContext->current_pos + bufSize > tensorContext->data.numel()) {
     TORCH_CHECK(
         tensorContext->data.numel() * 2 <= MAX_TENSOR_SIZE,
         "We tried to allocate an output encoded tensor larger than ",
@@ -68,13 +68,17 @@ int write(void* opaque, const uint8_t* buf, int buf_size) {
   }

   TORCH_CHECK(
-      tensorContext->
+      tensorContext->current_pos + bufSize <= tensorContext->data.numel(),
       "Re-allocation of the output tensor didn't work. ",
       "This should not happen, please report on TorchCodec bug tracker");

   uint8_t* outputTensorData = tensorContext->data.data_ptr<uint8_t>();
-  std::memcpy(outputTensorData + tensorContext->
-  tensorContext->
+  std::memcpy(outputTensorData + tensorContext->current_pos, buf, bufSize);
+  tensorContext->current_pos += bufSize;
+  // Track the maximum position written so getOutputTensor's narrow() does not
+  // truncate the file if final seek was backwards
+  tensorContext->max_pos =
+      std::max(tensorContext->current_pos, tensorContext->max_pos);
   return buf_size;
 }

@@ -88,7 +92,7 @@ int64_t seek(void* opaque, int64_t offset, int whence) {
       ret = tensorContext->data.numel();
       break;
     case SEEK_SET:
-      tensorContext->
+      tensorContext->current_pos = offset;
       ret = offset;
       break;
     default:
@@ -101,7 +105,7 @@ int64_t seek(void* opaque, int64_t offset, int whence) {
 } // namespace

 AVIOFromTensorContext::AVIOFromTensorContext(torch::Tensor data)
-    : tensorContext_{data, 0} {
+    : tensorContext_{data, 0, 0} {
   TORCH_CHECK(data.numel() > 0, "data must not be empty");
   TORCH_CHECK(data.is_contiguous(), "data must be contiguous");
   TORCH_CHECK(data.scalar_type() == torch::kUInt8, "data must be kUInt8");
@@ -110,14 +114,17 @@ AVIOFromTensorContext::AVIOFromTensorContext(torch::Tensor data)
 }

 AVIOToTensorContext::AVIOToTensorContext()
-    : tensorContext_{
+    : tensorContext_{
+          torch::empty({INITIAL_TENSOR_SIZE}, {torch::kUInt8}),
+          0,
+          0} {
   createAVIOContext(
       nullptr, &write, &seek, &tensorContext_, /*isForWriting=*/true);
 }

 torch::Tensor AVIOToTensorContext::getOutputTensor() {
   return tensorContext_.data.narrow(
-      /*dim=*/0, /*start=*/0, /*length=*/tensorContext_.
+      /*dim=*/0, /*start=*/0, /*length=*/tensorContext_.max_pos);
 }

 } // namespace facebook::torchcodec
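The max_pos bookkeeping added above exists because FFmpeg muxers commonly seek backwards at the end of encoding (for example to patch a header), which rewinds current_pos; sizing the output by the furthest byte ever written, rather than by the final cursor position, keeps getOutputTensor() from truncating the encoded data. Below is a minimal standalone sketch of the same write/seek/size pattern; it uses a plain std::vector sink with illustrative names, not TorchCodec's actual TensorContext.

#include <algorithm>
#include <cstdint>
#include <cstring>
#include <vector>

// Toy in-memory sink: the furthest write position, not the final cursor
// position, determines the output size.
struct MemorySink {
  std::vector<uint8_t> data;
  int64_t current_pos = 0;
  int64_t max_pos = 0;

  void write(const uint8_t* buf, int64_t size) {
    if (current_pos + size > static_cast<int64_t>(data.size())) {
      data.resize(current_pos + size);
    }
    std::memcpy(data.data() + current_pos, buf, size);
    current_pos += size;
    max_pos = std::max(max_pos, current_pos);  // track furthest byte written
  }

  void seekSet(int64_t offset) {
    current_pos = offset;  // a backwards seek does NOT shrink the output
  }

  std::vector<uint8_t> output() const {
    // Equivalent of narrow(0, 0, max_pos): using current_pos here would
    // drop everything written after the last backwards seek.
    return {data.begin(), data.begin() + max_pos};
  }
};

int main() {
  MemorySink sink;
  uint8_t body[64] = {0}, header[4] = {1, 2, 3, 4};
  sink.write(body, sizeof(body));  // write the bulk of the "file"
  sink.seekSet(0);                 // muxer-style rewind to patch the header
  sink.write(header, sizeof(header));
  return sink.output().size() == 64 ? 0 : 1;  // 64 bytes, not 4
}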
torchcodec/_core/BetaCudaDeviceInterface.cpp
CHANGED

@@ -15,7 +15,7 @@
 #include "src/torchcodec/_core/FFMPEGCommon.h"
 #include "src/torchcodec/_core/NVDECCache.h"

-
+#include "src/torchcodec/_core/NVCUVIDRuntimeLoader.h"
 #include "src/torchcodec/_core/nvcuvid_include/cuviddec.h"
 #include "src/torchcodec/_core/nvcuvid_include/nvcuvid.h"

@@ -53,74 +53,6 @@ pfnDisplayPictureCallback(void* pUserData, CUVIDPARSERDISPINFO* dispInfo) {
 }

 static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
-  // Check decoder capabilities - same checks as DALI
-  auto caps = CUVIDDECODECAPS{};
-  caps.eCodecType = videoFormat->codec;
-  caps.eChromaFormat = videoFormat->chroma_format;
-  caps.nBitDepthMinus8 = videoFormat->bit_depth_luma_minus8;
-  CUresult result = cuvidGetDecoderCaps(&caps);
-  TORCH_CHECK(result == CUDA_SUCCESS, "Failed to get decoder caps: ", result);
-
-  TORCH_CHECK(
-      caps.bIsSupported,
-      "Codec configuration not supported on this GPU. "
-      "Codec: ",
-      static_cast<int>(videoFormat->codec),
-      ", chroma format: ",
-      static_cast<int>(videoFormat->chroma_format),
-      ", bit depth: ",
-      videoFormat->bit_depth_luma_minus8 + 8);
-
-  TORCH_CHECK(
-      videoFormat->coded_width >= caps.nMinWidth &&
-          videoFormat->coded_height >= caps.nMinHeight,
-      "Video is too small in at least one dimension. Provided: ",
-      videoFormat->coded_width,
-      "x",
-      videoFormat->coded_height,
-      " vs supported:",
-      caps.nMinWidth,
-      "x",
-      caps.nMinHeight);
-
-  TORCH_CHECK(
-      videoFormat->coded_width <= caps.nMaxWidth &&
-          videoFormat->coded_height <= caps.nMaxHeight,
-      "Video is too large in at least one dimension. Provided: ",
-      videoFormat->coded_width,
-      "x",
-      videoFormat->coded_height,
-      " vs supported:",
-      caps.nMaxWidth,
-      "x",
-      caps.nMaxHeight);
-
-  // See nMaxMBCount in cuviddec.h
-  constexpr unsigned int macroblockConstant = 256;
-  TORCH_CHECK(
-      videoFormat->coded_width * videoFormat->coded_height /
-              macroblockConstant <=
-          caps.nMaxMBCount,
-      "Video is too large (too many macroblocks). "
-      "Provided (width * height / ",
-      macroblockConstant,
-      "): ",
-      videoFormat->coded_width * videoFormat->coded_height / macroblockConstant,
-      " vs supported:",
-      caps.nMaxMBCount);
-
-  // Below we'll set the decoderParams.OutputFormat to NV12, so we need to make
-  // sure it's actually supported.
-  TORCH_CHECK(
-      (caps.nOutputFormatMask >> cudaVideoSurfaceFormat_NV12) & 1,
-      "NV12 output format is not supported for this configuration. ",
-      "Codec: ",
-      static_cast<int>(videoFormat->codec),
-      ", chroma format: ",
-      static_cast<int>(videoFormat->chroma_format),
-      ", bit depth: ",
-      videoFormat->bit_depth_luma_minus8 + 8);
-
   // Decoder creation parameters, most are taken from DALI
   CUVIDDECODECREATEINFO decoderParams = {};
   decoderParams.bitDepthMinus8 = videoFormat->bit_depth_luma_minus8;
@@ -129,7 +61,7 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
   // automatically converted to 8bits by NVDEC itself. That is, the raw frames
   // we get back from cuvidMapVideoFrame will already be in 8bit format. We
   // won't need to do the conversion ourselves, so that's a lot easier.
-  // In the
+  // In the ffmpeg CUDA interface, we have to do the 10 -> 8bits conversion
   // ourselves later in convertAVFrameToFrameOutput(), because FFmpeg explicitly
   // requests 10 or 16bits output formats for >8-bit videos!
   // https://github.com/FFmpeg/FFmpeg/blob/e05f8acabff468c1382277c1f31fa8e9d90c3202/libavcodec/nvdec.c#L376-L403
@@ -157,13 +89,39 @@ static UniqueCUvideodecoder createDecoder(CUVIDEOFORMAT* videoFormat) {
   decoderParams.display_area.bottom = videoFormat->display_area.bottom;

   CUvideodecoder* decoder = new CUvideodecoder();
-  result = cuvidCreateDecoder(decoder, &decoderParams);
+  CUresult result = cuvidCreateDecoder(decoder, &decoderParams);
   TORCH_CHECK(
       result == CUDA_SUCCESS, "Failed to create NVDEC decoder: ", result);
   return UniqueCUvideodecoder(decoder, CUvideoDecoderDeleter{});
 }

-
+std::optional<cudaVideoChromaFormat> validateChromaSupport(
+    const AVPixFmtDescriptor* desc) {
+  // Return the corresponding cudaVideoChromaFormat if supported, std::nullopt
+  // otherwise.
+  TORCH_CHECK(desc != nullptr, "desc can't be null");
+
+  if (desc->nb_components == 1) {
+    return cudaVideoChromaFormat_Monochrome;
+  } else if (desc->nb_components >= 3 && !(desc->flags & AV_PIX_FMT_FLAG_RGB)) {
+    // Make sure it's YUV: has chroma planes and isn't RGB
+    if (desc->log2_chroma_w == 0 && desc->log2_chroma_h == 0) {
+      return cudaVideoChromaFormat_444; // 1x1 subsampling = 4:4:4
+    } else if (desc->log2_chroma_w == 1 && desc->log2_chroma_h == 1) {
+      return cudaVideoChromaFormat_420; // 2x2 subsampling = 4:2:0
+    } else if (desc->log2_chroma_w == 1 && desc->log2_chroma_h == 0) {
+      return cudaVideoChromaFormat_422; // 2x1 subsampling = 4:2:2
+    }
+  }
+
+  return std::nullopt;
+}
+
+std::optional<cudaVideoCodec> validateCodecSupport(AVCodecID codecId) {
+  // Return the corresponding cudaVideoCodec if supported, std::nullopt
+  // otherwise
+  // Note that we currently return nullopt (and thus fallback to CPU) for some
+  // codecs that are technically supported by NVDEC, see comment below.
   switch (codecId) {
     case AV_CODEC_ID_H264:
       return cudaVideoCodec_H264;
@@ -189,12 +147,72 @@ cudaVideoCodec validateCodecSupport(AVCodecID codecId) {
     //   return cudaVideoCodec_JPEG;
     // case AV_CODEC_ID_VC1:
     //   return cudaVideoCodec_VC1;
-    default:
-
-  }
+    default:
+      return std::nullopt;
   }
 }

+bool nativeNVDECSupport(const SharedAVCodecContext& codecContext) {
+  // Return true iff the input video stream is supported by our NVDEC
+  // implementation.
+
+  auto codecType = validateCodecSupport(codecContext->codec_id);
+  if (!codecType.has_value()) {
+    return false;
+  }
+
+  const AVPixFmtDescriptor* desc = av_pix_fmt_desc_get(codecContext->pix_fmt);
+  if (!desc) {
+    return false;
+  }
+
+  auto chromaFormat = validateChromaSupport(desc);
+  if (!chromaFormat.has_value()) {
+    return false;
+  }
+
+  auto caps = CUVIDDECODECAPS{};
+  caps.eCodecType = codecType.value();
+  caps.eChromaFormat = chromaFormat.value();
+  caps.nBitDepthMinus8 = desc->comp[0].depth - 8;
+
+  CUresult result = cuvidGetDecoderCaps(&caps);
+  if (result != CUDA_SUCCESS) {
+    return false;
+  }
+
+  if (!caps.bIsSupported) {
+    return false;
+  }
+
+  auto coded_width = static_cast<unsigned int>(codecContext->coded_width);
+  auto coded_height = static_cast<unsigned int>(codecContext->coded_height);
+  if (coded_width < static_cast<unsigned int>(caps.nMinWidth) ||
+      coded_height < static_cast<unsigned int>(caps.nMinHeight) ||
+      coded_width > caps.nMaxWidth || coded_height > caps.nMaxHeight) {
+    return false;
+  }
+
+  // See nMaxMBCount in cuviddec.h
+  constexpr unsigned int macroblockConstant = 256;
+  if (coded_width * coded_height / macroblockConstant > caps.nMaxMBCount) {
+    return false;
+  }
+
+  // We'll set the decoderParams.OutputFormat to NV12, so we need to make
+  // sure it's actually supported.
+  // TODO: If this fail, we could consider decoding to something else than NV12
+  // (like cudaVideoSurfaceFormat_P016) instead of falling back to CPU. This is
+  // what FFmpeg does.
+  bool supportsNV12Output =
+      (caps.nOutputFormatMask >> cudaVideoSurfaceFormat_NV12) & 1;
+  if (!supportsNV12Output) {
+    return false;
+  }
+
+  return true;
+}
+
 } // namespace

 BetaCudaDeviceInterface::BetaCudaDeviceInterface(const torch::Device& device)
@@ -205,6 +223,8 @@ BetaCudaDeviceInterface::BetaCudaDeviceInterface(const torch::Device& device)

   initializeCudaContextWithPytorch(device_);
   nppCtx_ = getNppStreamContext(device_);
+
+  nvcuvidAvailable_ = loadNVCUVIDLibrary();
 }

 BetaCudaDeviceInterface::~BetaCudaDeviceInterface() {
@@ -216,12 +236,11 @@ BetaCudaDeviceInterface::~BetaCudaDeviceInterface() {
   // unclear.
   flush();
   unmapPreviousFrame();
-    NVDECCache::getCache(device_.
-
+    NVDECCache::getCache(device_).returnDecoder(
+        &videoFormat_, std::move(decoder_));
   }

   if (videoParser_) {
-    // TODONVDEC P2: consider caching this? Does DALI do that?
     cuvidDestroyVideoParser(videoParser_);
     videoParser_ = nullptr;
   }
@@ -231,7 +250,21 @@ BetaCudaDeviceInterface::~BetaCudaDeviceInterface() {

 void BetaCudaDeviceInterface::initialize(
     const AVStream* avStream,
-    const UniqueDecodingAVFormatContext& avFormatCtx
+    const UniqueDecodingAVFormatContext& avFormatCtx,
+    [[maybe_unused]] const SharedAVCodecContext& codecContext) {
+  if (!nvcuvidAvailable_ || !nativeNVDECSupport(codecContext)) {
+    cpuFallback_ = createDeviceInterface(torch::kCPU);
+    TORCH_CHECK(
+        cpuFallback_ != nullptr, "Failed to create CPU device interface");
+    cpuFallback_->initialize(avStream, avFormatCtx, codecContext);
+    cpuFallback_->initializeVideo(
+        VideoStreamOptions(),
+        {},
+        /*resizedOutputDims=*/std::nullopt);
+    // We'll always use the CPU fallback from now on, so we can return early.
+    return;
+  }
+
   TORCH_CHECK(avStream != nullptr, "AVStream cannot be null");
   timeBase_ = avStream->time_base;
   frameRateAvgFromFFmpeg_ = avStream->r_frame_rate;
@@ -243,7 +276,11 @@ void BetaCudaDeviceInterface::initialize(

   // Create parser. Default values that aren't obvious are taken from DALI.
   CUVIDPARSERPARAMS parserParams = {};
-
+  auto codecType = validateCodecSupport(codecPar->codec_id);
+  TORCH_CHECK(
+      codecType.has_value(),
+      "This should never happen, we should be using the CPU fallback by now. Please report a bug.");
+  parserParams.CodecType = codecType.value();
   parserParams.ulMaxNumDecodeSurfaces = 8;
   parserParams.ulMaxDisplayDelay = 0;
   // Callback setup, all are triggered by the parser within a call
@@ -362,11 +399,12 @@ int BetaCudaDeviceInterface::streamPropertyChange(CUVIDEOFORMAT* videoFormat) {
   }

   if (!decoder_) {
-    decoder_ = NVDECCache::getCache(device_
+    decoder_ = NVDECCache::getCache(device_).getDecoder(videoFormat);

     if (!decoder_) {
       // TODONVDEC P2: consider re-configuring an existing decoder instead of
-      // re-creating one. See docs, see DALI.
+      // re-creating one. See docs, see DALI. Re-configuration doesn't seem to
+      // be enabled in DALI by default.
       decoder_ = createDecoder(videoFormat);
     }

@@ -382,6 +420,10 @@ int BetaCudaDeviceInterface::streamPropertyChange(CUVIDEOFORMAT* videoFormat) {
 // Moral equivalent of avcodec_send_packet(). Here, we pass the AVPacket down to
 // the NVCUVID parser.
 int BetaCudaDeviceInterface::sendPacket(ReferenceAVPacket& packet) {
+  if (cpuFallback_) {
+    return cpuFallback_->sendPacket(packet);
+  }
+
   TORCH_CHECK(
       packet.get() && packet->data && packet->size > 0,
       "sendPacket received an empty packet, this is unexpected, please report.");
@@ -405,6 +447,10 @@ int BetaCudaDeviceInterface::sendPacket(ReferenceAVPacket& packet) {
 }

 int BetaCudaDeviceInterface::sendEOFPacket() {
+  if (cpuFallback_) {
+    return cpuFallback_->sendEOFPacket();
+  }
+
   CUVIDSOURCEDATAPACKET cuvidPacket = {};
   cuvidPacket.flags = CUVID_PKT_ENDOFSTREAM;
   eofSent_ = true;
@@ -466,6 +512,10 @@ int BetaCudaDeviceInterface::frameReadyInDisplayOrder(

 // Moral equivalent of avcodec_receive_frame().
 int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) {
+  if (cpuFallback_) {
+    return cpuFallback_->receiveFrame(avFrame);
+  }
+
   if (readyFrames_.empty()) {
     // No frame found, instruct caller to try again later after sending more
     // packets, or to stop if EOF was already sent.
@@ -480,8 +530,7 @@ int BetaCudaDeviceInterface::receiveFrame(UniqueAVFrame& avFrame) {
   procParams.top_field_first = dispInfo.top_field_first;
   procParams.unpaired_field = dispInfo.repeat_first_field < 0;
   // We set the NVDEC stream to the current stream. It will be waited upon by
-  // the NPP stream before any color conversion.
-  // is in the default interface.
+  // the NPP stream before any color conversion.
   // Re types: we get a cudaStream_t from PyTorch but it's interchangeable with
   // CUstream
   procParams.output_stream = reinterpret_cast<CUstream>(
@@ -601,6 +650,11 @@ UniqueAVFrame BetaCudaDeviceInterface::convertCudaFrameToAVFrame(
 }

 void BetaCudaDeviceInterface::flush() {
+  if (cpuFallback_) {
+    cpuFallback_->flush();
+    return;
+  }
+
   // The NVCUVID docs mention that after seeking, i.e. when flush() is called,
   // we should send a packet with the CUVID_PKT_DISCONTINUITY flag. The docs
   // don't say whether this should be an empty packet, or whether it should be a
@@ -618,8 +672,23 @@ void BetaCudaDeviceInterface::convertAVFrameToFrameOutput(
     UniqueAVFrame& avFrame,
     FrameOutput& frameOutput,
     std::optional<torch::Tensor> preAllocatedOutputTensor) {
-
-
+  if (cpuFallback_) {
+    // CPU decoded frame - need to do CPU color conversion then transfer to GPU
+    FrameOutput cpuFrameOutput;
+    cpuFallback_->convertAVFrameToFrameOutput(avFrame, cpuFrameOutput);
+
+    // Transfer CPU frame to GPU
+    if (preAllocatedOutputTensor.has_value()) {
+      preAllocatedOutputTensor.value().copy_(cpuFrameOutput.data);
+      frameOutput.data = preAllocatedOutputTensor.value();
+    } else {
+      frameOutput.data = cpuFrameOutput.data.to(device_);
+    }
+    return;
+  }
+
+  // TODONVDEC P2: we may need to handle 10bit videos the same way the CUDA
+  // ffmpeg interface does it with maybeConvertAVFrameToNV12OrRGB24().
   TORCH_CHECK(
       avFrame->format == AV_PIX_FMT_CUDA,
       "Expected CUDA format frame from BETA CUDA interface");
@@ -633,4 +702,17 @@ void BetaCudaDeviceInterface::convertAVFrameToFrameOutput(
       avFrame, device_, nppCtx_, nvdecStream, preAllocatedOutputTensor);
 }

+std::string BetaCudaDeviceInterface::getDetails() {
+  std::string details = "Beta CUDA Device Interface.";
+  if (cpuFallback_) {
+    details += " Using CPU fallback.";
+    if (!nvcuvidAvailable_) {
+      details += " NVCUVID not available!";
+    }
+  } else {
+    details += " Using NVDEC.";
+  }
+  return details;
+}
+
 } // namespace facebook::torchcodec
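Taken together, the changes above implement a probe-once, delegate-forever pattern: initialize() decides between native NVDEC and a CPU interface exactly once, and every subsequent entry point (sendPacket, sendEOFPacket, receiveFrame, flush, frame conversion) forwards to cpuFallback_ when it is set. A rough sketch of that dispatch shape, using hypothetical class and method names rather than TorchCodec's real DeviceInterface API:

#include <memory>
#include <string>

// Hypothetical, simplified decoder interface; not TorchCodec's real one.
struct IDecoder {
  virtual ~IDecoder() = default;
  virtual int sendPacket(const void* pkt) = 0;
  virtual int receiveFrame(void* frame) = 0;
  virtual std::string details() const { return "base"; }
};

struct CpuDecoder : IDecoder {
  int sendPacket(const void*) override { return 0; }
  int receiveFrame(void*) override { return 0; }
  std::string details() const override { return "CPU software decode"; }
};

// GPU decoder that probes hardware support once, then either uses the native
// path or forwards every call to a CPU fallback for the stream's lifetime.
class GpuDecoder : public IDecoder {
 public:
  void initialize(bool driverLoaded, bool codecSupported) {
    if (!driverLoaded || !codecSupported) {
      cpuFallback_ = std::make_unique<CpuDecoder>();  // decided once, up front
    }
  }
  int sendPacket(const void* pkt) override {
    if (cpuFallback_) {
      return cpuFallback_->sendPacket(pkt);  // delegate, never mix paths
    }
    return sendToHardwareParser(pkt);
  }
  int receiveFrame(void* frame) override {
    if (cpuFallback_) {
      return cpuFallback_->receiveFrame(frame);
    }
    return popDecodedSurface(frame);
  }
  std::string details() const override {
    return cpuFallback_ ? "GPU interface, CPU fallback"
                        : "GPU interface, native decode";
  }

 private:
  int sendToHardwareParser(const void*) { return 0; }  // stand-in for NVDEC calls
  int popDecodedSurface(void*) { return 0; }
  std::unique_ptr<IDecoder> cpuFallback_;
};

int main() {
  GpuDecoder dec;
  dec.initialize(/*driverLoaded=*/false, /*codecSupported=*/true);
  return dec.details() == "GPU interface, CPU fallback" ? 0 : 1;
}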
torchcodec/_core/BetaCudaDeviceInterface.h
CHANGED

@@ -40,7 +40,8 @@ class BetaCudaDeviceInterface : public DeviceInterface {

   void initialize(
       const AVStream* avStream,
-      const UniqueDecodingAVFormatContext& avFormatCtx
+      const UniqueDecodingAVFormatContext& avFormatCtx,
+      const SharedAVCodecContext& codecContext) override;

   void convertAVFrameToFrameOutput(
       UniqueAVFrame& avFrame,
@@ -48,10 +49,6 @@ class BetaCudaDeviceInterface : public DeviceInterface {
       std::optional<torch::Tensor> preAllocatedOutputTensor =
           std::nullopt) override;

-  bool canDecodePacketDirectly() const override {
-    return true;
-  }
-
   int sendPacket(ReferenceAVPacket& packet) override;
   int sendEOFPacket() override;
   int receiveFrame(UniqueAVFrame& avFrame) override;
@@ -62,6 +59,8 @@ class BetaCudaDeviceInterface : public DeviceInterface {
   int frameReadyForDecoding(CUVIDPICPARAMS* picParams);
   int frameReadyInDisplayOrder(CUVIDPARSERDISPINFO* dispInfo);

+  std::string getDetails() override;
+
  private:
   int sendCuvidPacket(CUVIDSOURCEDATAPACKET& cuvidPacket);

@@ -97,6 +96,9 @@ class BetaCudaDeviceInterface : public DeviceInterface {

   // NPP context for color conversion
   UniqueNppContext nppCtx_;
+
+  std::unique_ptr<DeviceInterface> cpuFallback_;
+  bool nvcuvidAvailable_ = false;
 };

 } // namespace facebook::torchcodec
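The nvcuvidAvailable_ flag declared above is set from loadNVCUVIDLibrary(), and the CMakeLists.txt change below drops the link-time dependency on libnvcuvid entirely; the new NVCUVIDRuntimeLoader.cpp (its body is not shown in this diff view) presumably resolves the driver's cuvid* entry points at runtime, so a missing driver degrades to the CPU fallback instead of a load-time failure. A generic sketch of that style of runtime loading, using only dlopen/LoadLibrary; the library names and the single probed symbol are illustrative, and this is not the actual NVCUVIDRuntimeLoader implementation:

#include <cstdio>

#ifdef _WIN32
#include <windows.h>
#else
#include <dlfcn.h>
#endif

// Returns an opaque handle to the NVCUVID driver library, or nullptr if the
// driver is not installed. Library names are the conventional ones.
static void* openNvcuvid() {
#ifdef _WIN32
  return reinterpret_cast<void*>(LoadLibraryA("nvcuvid.dll"));
#else
  void* handle = dlopen("libnvcuvid.so.1", RTLD_NOW | RTLD_GLOBAL);
  if (!handle) {
    handle = dlopen("libnvcuvid.so", RTLD_NOW | RTLD_GLOBAL);
  }
  return handle;
#endif
}

// Resolves one symbol from the handle; a real loader would do this for every
// cuvid* function it calls and keep the pointers in a dispatch table.
static void* resolveSymbol(void* handle, const char* name) {
#ifdef _WIN32
  return reinterpret_cast<void*>(
      GetProcAddress(reinterpret_cast<HMODULE>(handle), name));
#else
  return dlsym(handle, name);
#endif
}

bool loadNvcuvidOrFallback() {
  void* handle = openNvcuvid();
  if (!handle) {
    std::fprintf(stderr, "nvcuvid not found, falling back to CPU decode\n");
    return false;
  }
  // Example probe: the decoder-creation entry point must exist.
  return resolveSymbol(handle, "cuvidCreateDecoder") != nullptr;
}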
torchcodec/_core/CMakeLists.txt
CHANGED

@@ -99,7 +99,7 @@ function(make_torchcodec_libraries
   )

   if(ENABLE_CUDA)
-    list(APPEND core_sources CudaDeviceInterface.cpp BetaCudaDeviceInterface.cpp NVDECCache.cpp CUDACommon.cpp)
+    list(APPEND core_sources CudaDeviceInterface.cpp BetaCudaDeviceInterface.cpp NVDECCache.cpp CUDACommon.cpp NVCUVIDRuntimeLoader.cpp)
   endif()

   set(core_library_dependencies
@@ -108,27 +108,9 @@ function(make_torchcodec_libraries
   )

   if(ENABLE_CUDA)
-    # Try to find NVCUVID. Try the normal way first. This should work locally.
-    find_library(NVCUVID_LIBRARY NAMES nvcuvid)
-    # If not found, try with version suffix, or hardcoded path. Appears
-    # to be necessary on the CI.
-    if(NOT NVCUVID_LIBRARY)
-      find_library(NVCUVID_LIBRARY NAMES nvcuvid.1 PATHS /usr/lib64 /usr/lib)
-    endif()
-    if(NOT NVCUVID_LIBRARY)
-      set(NVCUVID_LIBRARY "/usr/lib64/libnvcuvid.so.1")
-    endif()
-
-    if(NVCUVID_LIBRARY)
-      message(STATUS "Found NVCUVID: ${NVCUVID_LIBRARY}")
-    else()
-      message(FATAL_ERROR "Could not find NVCUVID library")
-    endif()
-
     list(APPEND core_library_dependencies
         ${CUDA_nppi_LIBRARY}
         ${CUDA_nppicc_LIBRARY}
-        ${NVCUVID_LIBRARY}
     )
   endif()

torchcodec/_core/CUDACommon.cpp
CHANGED

@@ -5,14 +5,12 @@
 // LICENSE file in the root directory of this source tree.

 #include "src/torchcodec/_core/CUDACommon.h"
+#include "src/torchcodec/_core/Cache.h" // for PerGpuCache

 namespace facebook::torchcodec {

 namespace {

-// Pytorch can only handle up to 128 GPUs.
-// https://github.com/pytorch/pytorch/blob/e30c55ee527b40d67555464b9e402b4b7ce03737/c10/cuda/CUDAMacros.h#L44
-const int MAX_CUDA_GPUS = 128;
 // Set to -1 to have an infinitely sized cache. Set it to 0 to disable caching.
 // Set to a positive number to have a cache of that size.
 const int MAX_CONTEXTS_PER_GPU_IN_CACHE = -1;
@@ -249,7 +247,7 @@ torch::Tensor convertNV12FrameToRGB(
 }

 UniqueNppContext getNppStreamContext(const torch::Device& device) {
-
+  int deviceIndex = getDeviceIndex(device);

   UniqueNppContext nppCtx = g_cached_npp_ctxs.get(device);
   if (nppCtx) {
@@ -266,13 +264,13 @@ UniqueNppContext getNppStreamContext(const torch::Device& device) {

   nppCtx = std::make_unique<NppStreamContext>();
   cudaDeviceProp prop{};
-  cudaError_t err = cudaGetDeviceProperties(&prop,
+  cudaError_t err = cudaGetDeviceProperties(&prop, deviceIndex);
   TORCH_CHECK(
       err == cudaSuccess,
       "cudaGetDeviceProperties failed: ",
       cudaGetErrorString(err));

-  nppCtx->nCudaDeviceId =
+  nppCtx->nCudaDeviceId = deviceIndex;
   nppCtx->nMultiProcessorCount = prop.multiProcessorCount;
   nppCtx->nMaxThreadsPerMultiProcessor = prop.maxThreadsPerMultiProcessor;
   nppCtx->nMaxThreadsPerBlock = prop.maxThreadsPerBlock;
@@ -312,4 +310,21 @@ void validatePreAllocatedTensorShape(
   }
 }

+int getDeviceIndex(const torch::Device& device) {
+  // PyTorch uses int8_t as its torch::DeviceIndex, but FFmpeg and CUDA
+  // libraries use int. So we use int, too.
+  int deviceIndex = static_cast<int>(device.index());
+  TORCH_CHECK(
+      deviceIndex >= -1 && deviceIndex < MAX_CUDA_GPUS,
+      "Invalid device index = ",
+      deviceIndex);
+
+  if (deviceIndex == -1) {
+    TORCH_CHECK(
+        cudaGetDevice(&deviceIndex) == cudaSuccess,
+        "Failed to get current CUDA device.");
+  }
+  return deviceIndex;
+}
+
 } // namespace facebook::torchcodec
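The new getDeviceIndex() helper normalizes one awkward case: a torch::Device constructed as plain "cuda" reports index -1, which the helper resolves to the currently active device via cudaGetDevice() before the index reaches NPP and cudaGetDeviceProperties(). A small usage sketch of that resolution, assuming a CUDA-enabled libtorch build; the helper below restates the logic locally and omits the MAX_CUDA_GPUS bounds check:

#include <cuda_runtime.h>
#include <torch/types.h>

#include <cstdio>

// Local re-statement of the resolution logic, for illustration only.
static int resolveDeviceIndex(const torch::Device& device) {
  int index = static_cast<int>(device.index());
  if (index == -1) {
    // "cuda" with no index means "whatever device is current on this thread".
    if (cudaGetDevice(&index) != cudaSuccess) {
      return -1;  // no usable CUDA device
    }
  }
  return index;
}

int main() {
  torch::Device implicit(torch::kCUDA);        // index() == -1 (unspecified)
  torch::Device explicitDev(torch::kCUDA, 0);  // index() == 0

  std::printf("implicit -> %d\n", resolveDeviceIndex(implicit));
  std::printf("explicit -> %d\n", resolveDeviceIndex(explicitDev));
  return 0;
}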
torchcodec/_core/CUDACommon.h
CHANGED

@@ -11,7 +11,6 @@
 #include <npp.h>
 #include <torch/types.h>

-#include "src/torchcodec/_core/Cache.h"
 #include "src/torchcodec/_core/FFMPEGCommon.h"
 #include "src/torchcodec/_core/Frame.h"

@@ -22,6 +21,10 @@ extern "C" {

 namespace facebook::torchcodec {

+// Pytorch can only handle up to 128 GPUs.
+// https://github.com/pytorch/pytorch/blob/e30c55ee527b40d67555464b9e402b4b7ce03737/c10/cuda/CUDAMacros.h#L44
+constexpr int MAX_CUDA_GPUS = 128;
+
 void initializeCudaContextWithPytorch(const torch::Device& device);

 // Unique pointer type for NPP stream context
@@ -43,4 +46,6 @@ void validatePreAllocatedTensorShape(
     const std::optional<torch::Tensor>& preAllocatedOutputTensor,
     const UniqueAVFrame& avFrame);

+int getDeviceIndex(const torch::Device& device);
+
 } // namespace facebook::torchcodec