PyPI - torchcodec - Versions diffs - 0.7.0__cp313-cp313-win_amd64.whl → 0.8.1__cp313-cp313-win_amd64.whl - Mend

torchcodec 0.7.0__cp313-cp313-win_amd64.whl → 0.8.1__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of torchcodec might be problematic. Click here for more details.

Files changed (66) hide show

torchcodec/_core/AVIOTensorContext.cpp +23 -16
torchcodec/_core/AVIOTensorContext.h +2 -1
torchcodec/_core/BetaCudaDeviceInterface.cpp +718 -0
torchcodec/_core/BetaCudaDeviceInterface.h +193 -0
torchcodec/_core/CMakeLists.txt +18 -3
torchcodec/_core/CUDACommon.cpp +330 -0
torchcodec/_core/CUDACommon.h +51 -0
torchcodec/_core/Cache.h +6 -20
torchcodec/_core/CpuDeviceInterface.cpp +195 -108
torchcodec/_core/CpuDeviceInterface.h +84 -19
torchcodec/_core/CudaDeviceInterface.cpp +227 -376
torchcodec/_core/CudaDeviceInterface.h +38 -6
torchcodec/_core/DeviceInterface.cpp +57 -19
torchcodec/_core/DeviceInterface.h +97 -16
torchcodec/_core/Encoder.cpp +346 -9
torchcodec/_core/Encoder.h +62 -1
torchcodec/_core/FFMPEGCommon.cpp +190 -3
torchcodec/_core/FFMPEGCommon.h +27 -1
torchcodec/_core/FilterGraph.cpp +30 -22
torchcodec/_core/FilterGraph.h +15 -1
torchcodec/_core/Frame.cpp +22 -7
torchcodec/_core/Frame.h +15 -61
torchcodec/_core/Metadata.h +2 -2
torchcodec/_core/NVCUVIDRuntimeLoader.cpp +320 -0
torchcodec/_core/NVCUVIDRuntimeLoader.h +14 -0
torchcodec/_core/NVDECCache.cpp +60 -0
torchcodec/_core/NVDECCache.h +102 -0
torchcodec/_core/SingleStreamDecoder.cpp +196 -201
torchcodec/_core/SingleStreamDecoder.h +42 -15
torchcodec/_core/StreamOptions.h +16 -6
torchcodec/_core/Transform.cpp +87 -0
torchcodec/_core/Transform.h +84 -0
torchcodec/_core/__init__.py +4 -0
torchcodec/_core/custom_ops.cpp +257 -32
torchcodec/_core/fetch_and_expose_non_gpl_ffmpeg_libs.cmake +61 -1
torchcodec/_core/nvcuvid_include/cuviddec.h +1374 -0
torchcodec/_core/nvcuvid_include/nvcuvid.h +610 -0
torchcodec/_core/ops.py +147 -44
torchcodec/_core/pybind_ops.cpp +22 -59
torchcodec/_samplers/video_clip_sampler.py +7 -19
torchcodec/decoders/__init__.py +1 -0
torchcodec/decoders/_decoder_utils.py +61 -1
torchcodec/decoders/_video_decoder.py +46 -20
torchcodec/libtorchcodec_core4.dll +0 -0
torchcodec/libtorchcodec_core5.dll +0 -0
torchcodec/libtorchcodec_core6.dll +0 -0
torchcodec/libtorchcodec_core7.dll +0 -0
torchcodec/libtorchcodec_core8.dll +0 -0
torchcodec/libtorchcodec_custom_ops4.dll +0 -0
torchcodec/libtorchcodec_custom_ops5.dll +0 -0
torchcodec/libtorchcodec_custom_ops6.dll +0 -0
torchcodec/libtorchcodec_custom_ops7.dll +0 -0
torchcodec/libtorchcodec_custom_ops8.dll +0 -0
torchcodec/libtorchcodec_pybind_ops4.pyd +0 -0
torchcodec/libtorchcodec_pybind_ops5.pyd +0 -0
torchcodec/libtorchcodec_pybind_ops6.pyd +0 -0
torchcodec/libtorchcodec_pybind_ops7.pyd +0 -0
torchcodec/libtorchcodec_pybind_ops8.pyd +0 -0
torchcodec/samplers/_time_based.py +8 -0
torchcodec/version.py +1 -1
{torchcodec-0.7.0.dist-info → torchcodec-0.8.1.dist-info}/METADATA +29 -16
torchcodec-0.8.1.dist-info/RECORD +82 -0
{torchcodec-0.7.0.dist-info → torchcodec-0.8.1.dist-info}/WHEEL +1 -1
torchcodec-0.7.0.dist-info/RECORD +0 -67
{torchcodec-0.7.0.dist-info → torchcodec-0.8.1.dist-info}/licenses/LICENSE +0 -0
{torchcodec-0.7.0.dist-info → torchcodec-0.8.1.dist-info}/top_level.txt +0 -0

torchcodec/_core/ops.py CHANGED Viewed

@@ -41,7 +41,7 @@ def load_torchcodec_shared_libraries():
     #      libraries do not meet those conditions.
     exceptions = []
-    for ffmpeg_major_version in (7, 6, 5, 4):
+    for ffmpeg_major_version in (8, 7, 6, 5, 4):
         pybind_ops_module_name = _get_pybind_ops_module_name(ffmpeg_major_version)
         decoder_library_name = f"libtorchcodec_core{ffmpeg_major_version}"
         custom_ops_library_name = f"libtorchcodec_custom_ops{ffmpeg_major_version}"
@@ -69,7 +69,7 @@ def load_torchcodec_shared_libraries():
     raise RuntimeError(
         f"""Could not load libtorchcodec. Likely causes:
           1. FFmpeg is not properly installed in your environment. We support
-             versions 4, 5, 6 and 7.
+             versions 4, 5, 6, and 7 on all platforms, and 8 on Mac and Linux.
           2. The PyTorch version ({torch.__version__}) is not compatible with
              this version of TorchCodec. Refer to the version compatibility
              table:
@@ -95,11 +95,23 @@ encode_audio_to_file = torch._dynamo.disallow_in_graph(
 encode_audio_to_tensor = torch._dynamo.disallow_in_graph(
     torch.ops.torchcodec_ns.encode_audio_to_tensor.default
 )
+_encode_audio_to_file_like = torch._dynamo.disallow_in_graph(
+    torch.ops.torchcodec_ns._encode_audio_to_file_like.default
+)
+encode_video_to_file = torch._dynamo.disallow_in_graph(
+    torch.ops.torchcodec_ns.encode_video_to_file.default
+)
+encode_video_to_tensor = torch._dynamo.disallow_in_graph(
+    torch.ops.torchcodec_ns.encode_video_to_tensor.default
+)
+_encode_video_to_file_like = torch._dynamo.disallow_in_graph(
+    torch.ops.torchcodec_ns._encode_video_to_file_like.default
+)
 create_from_tensor = torch._dynamo.disallow_in_graph(
     torch.ops.torchcodec_ns.create_from_tensor.default
 )
-_convert_to_tensor = torch._dynamo.disallow_in_graph(
-    torch.ops.torchcodec_ns._convert_to_tensor.default
+_create_from_file_like = torch._dynamo.disallow_in_graph(
+    torch.ops.torchcodec_ns._create_from_file_like.default
 )
 add_video_stream = torch.ops.torchcodec_ns.add_video_stream.default
 _add_video_stream = torch.ops.torchcodec_ns._add_video_stream.default
@@ -108,8 +120,10 @@ seek_to_pts = torch.ops.torchcodec_ns.seek_to_pts.default
 get_next_frame = torch.ops.torchcodec_ns.get_next_frame.default
 get_frame_at_pts = torch.ops.torchcodec_ns.get_frame_at_pts.default
 get_frame_at_index = torch.ops.torchcodec_ns.get_frame_at_index.default
-get_frames_at_indices = torch.ops.torchcodec_ns.get_frames_at_indices.default
-get_frames_by_pts = torch.ops.torchcodec_ns.get_frames_by_pts.default
+_get_frames_at_indices_tensor_input = (
+    torch.ops.torchcodec_ns.get_frames_at_indices.default
+)
+_get_frames_by_pts_tensor_input = torch.ops.torchcodec_ns.get_frames_by_pts.default
 get_frames_in_range = torch.ops.torchcodec_ns.get_frames_in_range.default
 get_frames_by_pts_in_range = torch.ops.torchcodec_ns.get_frames_by_pts_in_range.default
 get_frames_by_pts_in_range_audio = (
@@ -128,6 +142,7 @@ _get_stream_json_metadata = torch.ops.torchcodec_ns.get_stream_json_metadata.def
 _get_json_ffmpeg_library_versions = (
     torch.ops.torchcodec_ns._get_json_ffmpeg_library_versions.default
 )
+_get_backend_details = torch.ops.torchcodec_ns._get_backend_details.default
 # =============================
@@ -148,7 +163,12 @@ def create_from_file_like(
     file_like: Union[io.RawIOBase, io.BufferedReader], seek_mode: Optional[str] = None
 ) -> torch.Tensor:
     assert _pybind_ops is not None
-    return _convert_to_tensor(_pybind_ops.create_from_file_like(file_like, seek_mode))
+    return _create_from_file_like(
+        _pybind_ops.create_file_like_context(
+            file_like, False  # False means not for writing
+        ),
+        seek_mode,
+    )
 def encode_audio_to_file_like(
@@ -176,35 +196,69 @@ def encode_audio_to_file_like(
     if samples.dtype != torch.float32:
         raise ValueError(f"samples must have dtype torch.float32, got {samples.dtype}")
-    # We're having the same problem as with the decoder's create_from_file_like:
-    # We should be able to pass a tensor directly, but this leads to a pybind
-    # error. In order to work around this, we pass the pointer to the tensor's
-    # data, and its shape, in order to re-construct it in C++. For this to work:
-    # - the tensor must be float32
-    # - the tensor  must be contiguous, which is why we call contiguous().
-    #   In theory we could avoid this restriction by also passing the strides?
-    # - IMPORTANT: the input samples tensor and its underlying data must be
-    #   alive during the call.
-    #
-    # A more elegant solution would be to cast the tensor into a py::object, but
-    # casting the py::object backk to a tensor in C++ seems to lead to the same
-    # pybing error.
-    samples = samples.contiguous()
-    _pybind_ops.encode_audio_to_file_like(
-        samples.data_ptr(),
-        list(samples.shape),
+    _encode_audio_to_file_like(
+        samples,
         sample_rate,
         format,
-        file_like,
+        _pybind_ops.create_file_like_context(file_like, True),  # True means for writing
         bit_rate,
         num_channels,
         desired_sample_rate,
     )
-    # This check is useless but it's critical to keep it to ensures that samples
-    # is still alive during the call to encode_audio_to_file_like.
-    assert samples.is_contiguous()
+def encode_video_to_file_like(
+    frames: torch.Tensor,
+    frame_rate: int,
+    format: str,
+    file_like: Union[io.RawIOBase, io.BufferedIOBase],
+    crf: Optional[int] = None,
+) -> None:
+    """Encode video frames to a file-like object.
+    Args:
+        frames: Video frames tensor
+        frame_rate: Frame rate in frames per second
+        format: Video format (e.g., "mp4", "mov", "mkv")
+        file_like: File-like object that supports write() and seek() methods
+        crf: Optional constant rate factor for encoding quality
+    """
+    assert _pybind_ops is not None
+    _encode_video_to_file_like(
+        frames,
+        frame_rate,
+        format,
+        _pybind_ops.create_file_like_context(file_like, True),  # True means for writing
+        crf,
+    )
+def get_frames_at_indices(
+    decoder: torch.Tensor, *, frame_indices: Union[torch.Tensor, list[int]]
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    if isinstance(frame_indices, torch.Tensor):
+        # Ensure indices is the correct dtype (int64)
+        frame_indices = frame_indices.to(torch.int64)
+    else:
+        # Convert list to tensor for dispatch
+        frame_indices = torch.tensor(frame_indices)
+    return _get_frames_at_indices_tensor_input(decoder, frame_indices=frame_indices)
+def get_frames_by_pts(
+    decoder: torch.Tensor, *, timestamps: Union[torch.Tensor, list[float]]
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    if isinstance(timestamps, torch.Tensor):
+        # Ensure timestamps is the correct dtype (float64)
+        timestamps = timestamps.to(torch.float64)
+    else:
+        # Convert list to tensor for dispatch
+        try:
+            timestamps = torch.tensor(timestamps, dtype=torch.float64)
+        except Exception as e:
+            raise ValueError("Couldn't convert timestamps input to a tensor") from e
+    return _get_frames_by_pts_tensor_input(decoder, timestamps=timestamps)
 # ==============================
@@ -215,6 +269,13 @@ def create_from_file_abstract(filename: str, seek_mode: Optional[str]) -> torch.
     return torch.empty([], dtype=torch.long)
+@register_fake("torchcodec_ns::_create_from_file_like")
+def _create_from_file_like_abstract(
+    file_like: int, seek_mode: Optional[str]
+) -> torch.Tensor:
+    return torch.empty([], dtype=torch.long)
 @register_fake("torchcodec_ns::encode_audio_to_file")
 def encode_audio_to_file_abstract(
     samples: torch.Tensor,
@@ -239,15 +300,54 @@ def encode_audio_to_tensor_abstract(
     return torch.empty([], dtype=torch.long)
-@register_fake("torchcodec_ns::create_from_tensor")
-def create_from_tensor_abstract(
-    video_tensor: torch.Tensor, seek_mode: Optional[str]
+@register_fake("torchcodec_ns::_encode_audio_to_file_like")
+def _encode_audio_to_file_like_abstract(
+    samples: torch.Tensor,
+    sample_rate: int,
+    format: str,
+    file_like_context: int,
+    bit_rate: Optional[int] = None,
+    num_channels: Optional[int] = None,
+    desired_sample_rate: Optional[int] = None,
+) -> None:
+    return
+@register_fake("torchcodec_ns::encode_video_to_file")
+def encode_video_to_file_abstract(
+    frames: torch.Tensor,
+    frame_rate: int,
+    filename: str,
+    crf: Optional[int],
+) -> None:
+    return
+@register_fake("torchcodec_ns::encode_video_to_tensor")
+def encode_video_to_tensor_abstract(
+    frames: torch.Tensor,
+    frame_rate: int,
+    format: str,
+    crf: Optional[int],
 ) -> torch.Tensor:
     return torch.empty([], dtype=torch.long)
-@register_fake("torchcodec_ns::_convert_to_tensor")
-def _convert_to_tensor_abstract(decoder_ptr: int) -> torch.Tensor:
+@register_fake("torchcodec_ns::_encode_video_to_file_like")
+def _encode_video_to_file_like_abstract(
+    frames: torch.Tensor,
+    frame_rate: int,
+    format: str,
+    file_like_context: int,
+    crf: Optional[int] = None,
+) -> None:
+    return
+@register_fake("torchcodec_ns::create_from_tensor")
+def create_from_tensor_abstract(
+    video_tensor: torch.Tensor, seek_mode: Optional[str]
+) -> torch.Tensor:
     return torch.empty([], dtype=torch.long)
@@ -255,12 +355,12 @@ def _convert_to_tensor_abstract(decoder_ptr: int) -> torch.Tensor:
 def _add_video_stream_abstract(
     decoder: torch.Tensor,
     *,
-    width: Optional[int] = None,
-    height: Optional[int] = None,
     num_threads: Optional[int] = None,
     dimension_order: Optional[str] = None,
     stream_index: Optional[int] = None,
-    device: Optional[str] = None,
+    device: str = "cpu",
+    device_variant: str = "ffmpeg",
+    transform_specs: str = "",
     custom_frame_mappings: Optional[
         tuple[torch.Tensor, torch.Tensor, torch.Tensor]
     ] = None,
@@ -273,12 +373,12 @@ def _add_video_stream_abstract(
 def add_video_stream_abstract(
     decoder: torch.Tensor,
     *,
-    width: Optional[int] = None,
-    height: Optional[int] = None,
     num_threads: Optional[int] = None,
     dimension_order: Optional[str] = None,
     stream_index: Optional[int] = None,
-    device: Optional[str] = None,
+    device: str = "cpu",
+    device_variant: str = "ffmpeg",
+    transform_specs: str = "",
     custom_frame_mappings: Optional[
         tuple[torch.Tensor, torch.Tensor, torch.Tensor]
     ] = None,
@@ -332,7 +432,7 @@ def get_frame_at_pts_abstract(
 def get_frames_by_pts_abstract(
     decoder: torch.Tensor,
     *,
-    timestamps: List[float],
+    timestamps: Union[torch.Tensor, List[float]],
 ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     image_size = [get_ctx().new_dynamic_size() for _ in range(4)]
     return (
@@ -356,9 +456,7 @@ def get_frame_at_index_abstract(
 @register_fake("torchcodec_ns::get_frames_at_indices")
 def get_frames_at_indices_abstract(
-    decoder: torch.Tensor,
-    *,
-    frame_indices: List[int],
+    decoder: torch.Tensor, *, frame_indices: Union[torch.Tensor, List[int]]
 ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     image_size = [get_ctx().new_dynamic_size() for _ in range(4)]
     return (
@@ -453,3 +551,8 @@ def scan_all_streams_to_update_metadata_abstract(decoder: torch.Tensor) -> None:
 def get_ffmpeg_library_versions():
     versions_json = _get_json_ffmpeg_library_versions()
     return json.loads(versions_json)
+@register_fake("torchcodec_ns::_get_backend_details")
+def _get_backend_details_abstract(decoder: torch.Tensor) -> str:
+    return ""

torchcodec/_core/pybind_ops.cpp CHANGED Viewed

@@ -7,72 +7,36 @@
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 #include <cstdint>
-#include <string>
 #include "src/torchcodec/_core/AVIOFileLikeContext.h"
-#include "src/torchcodec/_core/Encoder.h"
-#include "src/torchcodec/_core/SingleStreamDecoder.h"
-#include "src/torchcodec/_core/StreamOptions.h"
-#include "src/torchcodec/_core/ValidationUtils.h"
 namespace py = pybind11;
 namespace facebook::torchcodec {
-// In principle, this should be able to return a tensor. But when we try that,
-// we run into the bug reported here:
+// Note: It's not immediately obvious why we need both custom_ops.cpp and
+//       pybind_ops.cpp. We do all other Python to C++ bridging in
+//       custom_ops.cpp, and that even depends on pybind11, so why have an
+//       explicit pybind-only file?
 //
-//   https://github.com/pytorch/pytorch/issues/136664
+//       The reason is that we want to accept OWNERSHIP of a file-like object
+//       from the Python side. In order to do that, we need a proper
+//       py::object. For raw bytes, we can launder that through a tensor on the
+//       custom_ops.cpp side, but we can't launder a proper Python object
+//       through a tensor. Custom ops can't accept a proper Python object
+//       through py::object, so we have to do direct pybind11 here.
 //
-// So we instead launder the pointer through an int, and then use a conversion
-// function on the custom ops side to launder that int into a tensor.
-int64_t create_from_file_like(
-    py::object file_like,
-    std::optional<std::string_view> seek_mode) {
-  SingleStreamDecoder::SeekMode realSeek = SingleStreamDecoder::SeekMode::exact;
-  if (seek_mode.has_value()) {
-    realSeek = seekModeFromString(seek_mode.value());
-  }
-  auto avioContextHolder =
-      std::make_unique<AVIOFileLikeContext>(file_like, /*isForWriting=*/false);
-  SingleStreamDecoder* decoder =
-      new SingleStreamDecoder(std::move(avioContextHolder), realSeek);
-  return reinterpret_cast<int64_t>(decoder);
-}
-void encode_audio_to_file_like(
-    int64_t data_ptr,
-    const std::vector<int64_t>& shape,
-    int64_t sample_rate,
-    std::string_view format,
-    py::object file_like,
-    std::optional<int64_t> bit_rate = std::nullopt,
-    std::optional<int64_t> num_channels = std::nullopt,
-    std::optional<int64_t> desired_sample_rate = std::nullopt) {
-  // We assume float32 *and* contiguity, this must be enforced by the caller.
-  auto tensor_options = torch::TensorOptions().dtype(torch::kFloat32);
-  auto samples = torch::from_blob(
-      reinterpret_cast<void*>(data_ptr), shape, tensor_options);
-  AudioStreamOptions audioStreamOptions;
-  audioStreamOptions.bitRate = validateOptionalInt64ToInt(bit_rate, "bit_rate");
-  audioStreamOptions.numChannels =
-      validateOptionalInt64ToInt(num_channels, "num_channels");
-  audioStreamOptions.sampleRate =
-      validateOptionalInt64ToInt(desired_sample_rate, "desired_sample_rate");
-  auto avioContextHolder =
-      std::make_unique<AVIOFileLikeContext>(file_like, /*isForWriting=*/true);
-  AudioEncoder encoder(
-      samples,
-      validateInt64ToInt(sample_rate, "sample_rate"),
-      format,
-      std::move(avioContextHolder),
-      audioStreamOptions);
-  encoder.encode();
+// TODO: Investigate if we can do something better here. See:
+//         https://github.com/pytorch/torchcodec/issues/896
+//       Short version is that we're laundering a pointer through an int, the
+//       Python side forwards that to decoder creation functions in
+//       custom_ops.cpp and we do another cast on that side to get a pointer
+//       again. We want to investigate if we can do something cleaner by
+//       defining proper pybind objects.
+int64_t create_file_like_context(py::object file_like, bool is_for_writing) {
+  AVIOFileLikeContext* context =
+      new AVIOFileLikeContext(file_like, is_for_writing);
+  return reinterpret_cast<int64_t>(context);
 }
 #ifndef PYBIND_OPS_MODULE_NAME
@@ -80,8 +44,7 @@ void encode_audio_to_file_like(
 #endif
 PYBIND11_MODULE(PYBIND_OPS_MODULE_NAME, m) {
-  m.def("create_from_file_like", &create_from_file_like);
-  m.def("encode_audio_to_file_like", &encode_audio_to_file_like);
+  m.def("create_file_like_context", &create_file_like_context);
 }
 } // namespace facebook::torchcodec

torchcodec/_samplers/video_clip_sampler.py CHANGED Viewed

@@ -105,25 +105,12 @@ class IndexBasedSamplerArgs(SamplerArgs):
     sample_step: int = 1
-class VideoClipSampler(nn.Module):
+class DEPRECATED_VideoClipSampler(nn.Module):
     """
-    VideoClipSampler will do video clip sampling with given video args and sampler args.
-    The video args contains video related information, frames_per_clip, dimensions etc.
-    The sampler args can be either time-based or index-based, it will be used to decide clip start time pts or index.
-    ClipSampling support, random, uniform, periodic, target, keyframe sampling etc.
+    DEPRECATED: Do not use. The supported samplers are in `torchcodec.samplers`. See:
-    Args:
-        video_args (`VideoArgs`): The video args
-        sampler_args (`SamplerArgs`): The sampler args. Can be TimeBasedSamplerArgs or IndexBasedSamplerArgs
-        decoder_args (`DecoderArgs`): Decoder args contain value needs for decoder, for example, thread count
-    Example:
-        >>> video_args = VideoArgs(desired_width=224, desired_height=224)
-        >>> time_based_sampler_args = TimeBasedSamplerArgs(sampler_type="random", clips_per_video=1, frames_per_clip=4)
-        >>> video_decoder_args = DecoderArgs(num_threads=1)
-        >>> video_clip_sampler = VideoClipSampler(video_args, time_based_sampler_args, decoder_args)
-        >>> clips = video_clip_sampler(video_data)
-        clips now contains a list of clip, where clip is a list of frame tensors, each tensor represents a frame image.
+      * https://docs.pytorch.org/torchcodec/stable/api_ref_torchcodec.html
+      * https://docs.pytorch.org/torchcodec/stable/generated_examples/decoding/sampling.html
     """
     def __init__(
@@ -160,8 +147,7 @@ class VideoClipSampler(nn.Module):
         scan_all_streams_to_update_metadata(video_decoder)
         add_video_stream(
             video_decoder,
-            width=target_width,
-            height=target_height,
+            transform_specs=f"resize, {target_height}, {target_width}",
             num_threads=self.decoder_args.num_threads,
         )
@@ -240,6 +226,8 @@ class VideoClipSampler(nn.Module):
                 clip_start_idx + i * index_based_sampler_args.video_frame_dilation
                 for i in range(index_based_sampler_args.frames_per_clip)
             ]
+            # Need torch.stack to convert List[Tensor[int]] into 1D Tensor[int]
+            batch_indexes = torch.stack(batch_indexes)
             frames, *_ = get_frames_at_indices(
                 video_decoder,
                 frame_indices=batch_indexes,

torchcodec/decoders/__init__.py CHANGED Viewed

@@ -6,6 +6,7 @@
 from .._core import AudioStreamMetadata, VideoStreamMetadata
 from ._audio_decoder import AudioDecoder  # noqa
+from ._decoder_utils import set_cuda_backend  # noqa
 from ._video_decoder import VideoDecoder  # noqa
 SimpleVideoDecoder = VideoDecoder

torchcodec/decoders/_decoder_utils.py CHANGED Viewed

@@ -4,10 +4,12 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
+import contextvars
 import io
+from contextlib import contextmanager
 from pathlib import Path
-from typing import Union
+from typing import Generator, Union
 from torch import Tensor
 from torchcodec import _core as core
@@ -50,3 +52,61 @@ def create_decoder(
         "read(self, size: int) -> bytes and "
         "seek(self, offset: int, whence: int) -> int methods."
     )
+# Thread-local and async-safe storage for the current CUDA backend
+_CUDA_BACKEND: contextvars.ContextVar[str] = contextvars.ContextVar(
+    "_CUDA_BACKEND", default="ffmpeg"
+)
+@contextmanager
+def set_cuda_backend(backend: str) -> Generator[None, None, None]:
+    """Context Manager to set the CUDA backend for :class:`~torchcodec.decoders.VideoDecoder`.
+    This context manager allows you to specify which CUDA backend implementation
+    to use when creating :class:`~torchcodec.decoders.VideoDecoder` instances
+    with CUDA devices.
+    .. note::
+        **We recommend trying the "beta" backend instead of the default "ffmpeg"
+        backend!** The beta backend is faster, and will eventually become the
+        default in future versions. It may have rough edges that we'll polish
+        over time, but it's already quite stable and ready for adoption. Let us
+        know what you think!
+    Only the creation of the decoder needs to be inside the context manager, the
+    decoding methods can be called outside of it. You still need to pass
+    ``device="cuda"`` when creating the
+    :class:`~torchcodec.decoders.VideoDecoder` instance. If a CUDA device isn't
+    specified, this context manager will have no effect. See example below.
+    This is thread-safe and async-safe.
+    Args:
+        backend (str): The CUDA backend to use. Can be "ffmpeg" (default) or
+            "beta". We recommend trying "beta" as it's faster!
+    Example:
+        >>> with set_cuda_backend("beta"):
+        ...     decoder = VideoDecoder("video.mp4", device="cuda")
+        ...
+        ... # Only the decoder creation needs to be part of the context manager.
+        ... # Decoder will now use the beta CUDA implementation:
+        ... decoder.get_frame_at(0)
+    """
+    backend = backend.lower()
+    if backend not in ("ffmpeg", "beta"):
+        raise ValueError(
+            f"Invalid CUDA backend ({backend}). Supported values are 'ffmpeg' and 'beta'."
+        )
+    previous_state = _CUDA_BACKEND.set(backend)
+    try:
+        yield
+    finally:
+        _CUDA_BACKEND.reset(previous_state)
+def _get_cuda_backend() -> str:
+    return _CUDA_BACKEND.get()