torchcodec-0.7.0-cp310-cp310-win_amd64.whl → torchcodec-0.8.0-cp310-cp310-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of torchcodec might be problematic. See the release advisory linked from the registry page for more details.

Files changed (61)
  1. torchcodec/_core/BetaCudaDeviceInterface.cpp +636 -0
  2. torchcodec/_core/BetaCudaDeviceInterface.h +191 -0
  3. torchcodec/_core/CMakeLists.txt +36 -3
  4. torchcodec/_core/CUDACommon.cpp +315 -0
  5. torchcodec/_core/CUDACommon.h +46 -0
  6. torchcodec/_core/CpuDeviceInterface.cpp +189 -108
  7. torchcodec/_core/CpuDeviceInterface.h +81 -19
  8. torchcodec/_core/CudaDeviceInterface.cpp +211 -368
  9. torchcodec/_core/CudaDeviceInterface.h +33 -6
  10. torchcodec/_core/DeviceInterface.cpp +57 -19
  11. torchcodec/_core/DeviceInterface.h +97 -16
  12. torchcodec/_core/Encoder.cpp +302 -9
  13. torchcodec/_core/Encoder.h +51 -1
  14. torchcodec/_core/FFMPEGCommon.cpp +189 -2
  15. torchcodec/_core/FFMPEGCommon.h +18 -0
  16. torchcodec/_core/FilterGraph.cpp +28 -21
  17. torchcodec/_core/FilterGraph.h +15 -1
  18. torchcodec/_core/Frame.cpp +17 -7
  19. torchcodec/_core/Frame.h +15 -61
  20. torchcodec/_core/Metadata.h +2 -2
  21. torchcodec/_core/NVDECCache.cpp +70 -0
  22. torchcodec/_core/NVDECCache.h +104 -0
  23. torchcodec/_core/SingleStreamDecoder.cpp +202 -198
  24. torchcodec/_core/SingleStreamDecoder.h +39 -14
  25. torchcodec/_core/StreamOptions.h +16 -6
  26. torchcodec/_core/Transform.cpp +60 -0
  27. torchcodec/_core/Transform.h +59 -0
  28. torchcodec/_core/__init__.py +1 -0
  29. torchcodec/_core/custom_ops.cpp +180 -32
  30. torchcodec/_core/fetch_and_expose_non_gpl_ffmpeg_libs.cmake +61 -1
  31. torchcodec/_core/nvcuvid_include/cuviddec.h +1374 -0
  32. torchcodec/_core/nvcuvid_include/nvcuvid.h +610 -0
  33. torchcodec/_core/ops.py +86 -43
  34. torchcodec/_core/pybind_ops.cpp +22 -59
  35. torchcodec/_samplers/video_clip_sampler.py +7 -19
  36. torchcodec/decoders/__init__.py +1 -0
  37. torchcodec/decoders/_decoder_utils.py +61 -1
  38. torchcodec/decoders/_video_decoder.py +56 -20
  39. torchcodec/libtorchcodec_core4.dll +0 -0
  40. torchcodec/libtorchcodec_core5.dll +0 -0
  41. torchcodec/libtorchcodec_core6.dll +0 -0
  42. torchcodec/libtorchcodec_core7.dll +0 -0
  43. torchcodec/libtorchcodec_core8.dll +0 -0
  44. torchcodec/libtorchcodec_custom_ops4.dll +0 -0
  45. torchcodec/libtorchcodec_custom_ops5.dll +0 -0
  46. torchcodec/libtorchcodec_custom_ops6.dll +0 -0
  47. torchcodec/libtorchcodec_custom_ops7.dll +0 -0
  48. torchcodec/libtorchcodec_custom_ops8.dll +0 -0
  49. torchcodec/libtorchcodec_pybind_ops4.pyd +0 -0
  50. torchcodec/libtorchcodec_pybind_ops5.pyd +0 -0
  51. torchcodec/libtorchcodec_pybind_ops6.pyd +0 -0
  52. torchcodec/libtorchcodec_pybind_ops7.pyd +0 -0
  53. torchcodec/libtorchcodec_pybind_ops8.pyd +0 -0
  54. torchcodec/samplers/_time_based.py +8 -0
  55. torchcodec/version.py +1 -1
  56. {torchcodec-0.7.0.dist-info → torchcodec-0.8.0.dist-info}/METADATA +24 -13
  57. torchcodec-0.8.0.dist-info/RECORD +80 -0
  58. {torchcodec-0.7.0.dist-info → torchcodec-0.8.0.dist-info}/WHEEL +1 -1
  59. torchcodec-0.7.0.dist-info/RECORD +0 -67
  60. {torchcodec-0.7.0.dist-info → torchcodec-0.8.0.dist-info}/licenses/LICENSE +0 -0
  61. {torchcodec-0.7.0.dist-info → torchcodec-0.8.0.dist-info}/top_level.txt +0 -0
@@ -17,6 +17,7 @@
17
17
  #include "src/torchcodec/_core/FFMPEGCommon.h"
18
18
  #include "src/torchcodec/_core/Frame.h"
19
19
  #include "src/torchcodec/_core/StreamOptions.h"
20
+ #include "src/torchcodec/_core/Transform.h"
20
21
 
21
22
  namespace facebook::torchcodec {
22
23
 
@@ -83,6 +84,7 @@ class SingleStreamDecoder {
83
84
 
84
85
  void addVideoStream(
85
86
  int streamIndex,
87
+ std::vector<Transform*>& transforms,
86
88
  const VideoStreamOptions& videoStreamOptions = VideoStreamOptions(),
87
89
  std::optional<FrameMappings> customFrameMappings = std::nullopt);
88
90
  void addAudioStream(
@@ -106,7 +108,7 @@ class SingleStreamDecoder {
106
108
 
107
109
  // Returns frames at the given indices for a given stream as a single stacked
108
110
  // Tensor.
109
- FrameBatchOutput getFramesAtIndices(const std::vector<int64_t>& frameIndices);
111
+ FrameBatchOutput getFramesAtIndices(const torch::Tensor& frameIndices);
110
112
 
111
113
  // Returns frames within a given range. The range is defined by [start, stop).
112
114
  // The values retrieved from the range are: [start, start+step,
@@ -121,7 +123,7 @@ class SingleStreamDecoder {
121
123
  // seconds=5.999, etc.
122
124
  FrameOutput getFramePlayedAt(double seconds);
123
125
 
124
- FrameBatchOutput getFramesPlayedAt(const std::vector<double>& timestamps);
126
+ FrameBatchOutput getFramesPlayedAt(const torch::Tensor& timestamps);
125
127
 
126
128
  // Returns frames within a given pts range. The range is defined by
127
129
  // [startSeconds, stopSeconds) with respect to the pts values for frames. The
@@ -226,17 +228,8 @@ class SingleStreamDecoder {
226
228
  std::vector<FrameInfo> keyFrames;
227
229
  std::vector<FrameInfo> allFrames;
228
230
 
229
- // TODO since the decoder is single-stream, these should be decoder fields,
230
- // not streamInfo fields. And they should be defined right next to
231
- // `cursor_`, with joint documentation.
232
- int64_t lastDecodedAvFramePts = 0;
233
- int64_t lastDecodedAvFrameDuration = 0;
234
231
  VideoStreamOptions videoStreamOptions;
235
232
  AudioStreamOptions audioStreamOptions;
236
-
237
- // color-conversion fields. Only one of FilterGraphContext and
238
- // UniqueSwsContext should be non-null.
239
- UniqueSwrContext swrContext;
240
233
  };
241
234
 
242
235
  // --------------------------------------------------------------------------
@@ -318,6 +311,7 @@ class SingleStreamDecoder {
318
311
  int streamIndex,
319
312
  AVMediaType mediaType,
320
313
  const torch::Device& device = torch::kCPU,
314
+ const std::string_view deviceVariant = "default",
321
315
  std::optional<int> ffmpegThreadCount = std::nullopt);
322
316
 
323
317
  // Returns the "best" stream index for a given media type. The "best" is
@@ -356,16 +350,49 @@ class SingleStreamDecoder {
356
350
  const int NO_ACTIVE_STREAM = -2;
357
351
  int activeStreamIndex_ = NO_ACTIVE_STREAM;
358
352
 
359
- bool cursorWasJustSet_ = false;
360
353
  // The desired position of the cursor in the stream. We send frames >= this
361
354
  // pts to the user when they request a frame.
362
355
  int64_t cursor_ = INT64_MIN;
356
+ bool cursorWasJustSet_ = false;
357
+ int64_t lastDecodedAvFramePts_ = 0;
358
+ int64_t lastDecodedAvFrameDuration_ = 0;
359
+
360
+ // Audio only. We cache it for performance. The video equivalents live in
361
+ // deviceInterface_. We store swrContext_ here because we only handle audio
362
+ // on the CPU.
363
+ UniqueSwrContext swrContext_;
364
+
363
365
  // Stores various internal decoding stats.
364
366
  DecodeStats decodeStats_;
367
+
365
368
  // Stores the AVIOContext for the input buffer.
366
369
  std::unique_ptr<AVIOContextHolder> avioContextHolder_;
370
+
371
+ // We will receive a vector of transforms upon adding a stream and store it
372
+ // here. However, we need to know if any of those operations change the
373
+ // dimensions of the output frame. If they do, we need to figure out what are
374
+ // the final dimensions of the output frame after ALL transformations. We
375
+ // figure this out as soon as we receive the transforms. If any of the
376
+ // transforms change the final output frame dimensions, we store that in
377
+ // resizedOutputDims_. If resizedOutputDims_ has no value, that means there
378
+ // are no transforms that change the output frame dimensions.
379
+ //
380
+ // The priority order for output frame dimension is:
381
+ //
382
+ // 1. resizedOutputDims_; the resize requested by the user always takes
383
+ // priority.
384
+ // 2. The dimensions of the actual decoded AVFrame. This can change
385
+ // per-decoded frame, and is unknown in SingleStreamDecoder. Only the
386
+ // DeviceInterface learns it immediately after decoding a raw frame but
387
+ // before the color transformation.
388
+ // 3. metadataDims_; the dimensions we learned from the metadata.
389
+ std::vector<std::unique_ptr<Transform>> transforms_;
390
+ std::optional<FrameDims> resizedOutputDims_;
391
+ FrameDims metadataDims_;
392
+
367
393
  // Whether or not we have already scanned all streams to update the metadata.
368
394
  bool scannedAllStreams_ = false;
395
+
369
396
  // Tracks that we've already been initialized.
370
397
  bool initialized_ = false;
371
398
  };
@@ -375,6 +402,4 @@ std::ostream& operator<<(
375
402
  std::ostream& os,
376
403
  const SingleStreamDecoder::DecodeStats& stats);
377
404
 
378
- SingleStreamDecoder::SeekMode seekModeFromString(std::string_view seekMode);
379
-
380
405
  } // namespace facebook::torchcodec
@@ -9,11 +9,11 @@
9
9
  #include <torch/types.h>
10
10
  #include <optional>
11
11
  #include <string>
12
+ #include <string_view>
12
13
 
13
14
  namespace facebook::torchcodec {
14
15
 
15
16
  enum ColorConversionLibrary {
16
- // TODO: Add an AUTO option later.
17
17
  // Use the libavfilter library for color conversion.
18
18
  FILTERGRAPH,
19
19
  // Use the libswscale library for color conversion.
@@ -28,16 +28,26 @@ struct VideoStreamOptions {
28
28
  // utilize all cores. If not set, it will be the default FFMPEG behavior for
29
29
  // the given codec.
30
30
  std::optional<int> ffmpegThreadCount;
31
+
31
32
  // Currently the dimension order can be either NHWC or NCHW.
32
33
  // H=height, W=width, C=channel.
33
34
  std::string dimensionOrder = "NCHW";
34
- // The output height and width of the frame. If not specified, the output
35
- // is the same as the original video.
36
- std::optional<int> width;
37
- std::optional<int> height;
38
- std::optional<ColorConversionLibrary> colorConversionLibrary;
35
+
36
+ // By default we have to use filtergraph, as it is more general. We can only
37
+ // use swscale when we have met strict requirements. See
38
+ // CpuDeviceInterface::initialize() for the logic.
39
+ ColorConversionLibrary colorConversionLibrary =
40
+ ColorConversionLibrary::FILTERGRAPH;
41
+
39
42
  // By default we use CPU for decoding for both C++ and python users.
40
43
  torch::Device device = torch::kCPU;
44
+ // Device variant (e.g., "default", "beta", etc.)
45
+ std::string_view deviceVariant = "default";
46
+
47
+ // Encoding options
48
+ // TODO-VideoEncoder: Consider adding other optional fields here
49
+ // (bit rate, gop size, max b frames, preset)
50
+ std::optional<int> crf;
41
51
  };
42
52
 
43
53
  struct AudioStreamOptions {
@@ -0,0 +1,60 @@
1
+ // Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ // All rights reserved.
3
+ //
4
+ // This source code is licensed under the BSD-style license found in the
5
+ // LICENSE file in the root directory of this source tree.
6
+
7
+ #include "src/torchcodec/_core/Transform.h"
8
+ #include <torch/types.h>
9
+ #include "src/torchcodec/_core/FFMPEGCommon.h"
10
+
11
+ namespace facebook::torchcodec {
12
+
13
+ namespace {
14
+
15
+ std::string toFilterGraphInterpolation(
16
+ ResizeTransform::InterpolationMode mode) {
17
+ switch (mode) {
18
+ case ResizeTransform::InterpolationMode::BILINEAR:
19
+ return "bilinear";
20
+ default:
21
+ TORCH_CHECK(
22
+ false,
23
+ "Unknown interpolation mode: " +
24
+ std::to_string(static_cast<int>(mode)));
25
+ }
26
+ }
27
+
28
+ int toSwsInterpolation(ResizeTransform::InterpolationMode mode) {
29
+ switch (mode) {
30
+ case ResizeTransform::InterpolationMode::BILINEAR:
31
+ return SWS_BILINEAR;
32
+ default:
33
+ TORCH_CHECK(
34
+ false,
35
+ "Unknown interpolation mode: " +
36
+ std::to_string(static_cast<int>(mode)));
37
+ }
38
+ }
39
+
40
+ } // namespace
41
+
42
+ std::string ResizeTransform::getFilterGraphCpu() const {
43
+ return "scale=" + std::to_string(outputDims_.width) + ":" +
44
+ std::to_string(outputDims_.height) +
45
+ ":sws_flags=" + toFilterGraphInterpolation(interpolationMode_);
46
+ }
47
+
48
+ std::optional<FrameDims> ResizeTransform::getOutputFrameDims() const {
49
+ return outputDims_;
50
+ }
51
+
52
+ bool ResizeTransform::isResize() const {
53
+ return true;
54
+ }
55
+
56
+ int ResizeTransform::getSwsFlags() const {
57
+ return toSwsInterpolation(interpolationMode_);
58
+ }
59
+
60
+ } // namespace facebook::torchcodec
@@ -0,0 +1,59 @@
1
+ // Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ // All rights reserved.
3
+ //
4
+ // This source code is licensed under the BSD-style license found in the
5
+ // LICENSE file in the root directory of this source tree.
6
+
7
+ #pragma once
8
+
9
+ #include <optional>
10
+ #include <string>
11
+ #include "src/torchcodec/_core/Frame.h"
12
+
13
+ namespace facebook::torchcodec {
14
+
15
+ class Transform {
16
+ public:
17
+ virtual std::string getFilterGraphCpu() const = 0;
18
+ virtual ~Transform() = default;
19
+
20
+ // If the transformation does not change the output frame dimensions, then
21
+ // there is no need to override this member function. The default
22
+ // implementation returns an empty optional, indicating that the output frame
23
+ // has the same dimensions as the input frame.
24
+ //
25
+ // If the transformation does change the output frame dimensions, then it
26
+ // must override this member function and return the output frame dimensions.
27
+ virtual std::optional<FrameDims> getOutputFrameDims() const {
28
+ return std::nullopt;
29
+ }
30
+
31
+ // The ResizeTransform is special, because it is the only transform that
32
+ // swscale can handle.
33
+ virtual bool isResize() const {
34
+ return false;
35
+ }
36
+ };
37
+
38
+ class ResizeTransform : public Transform {
39
+ public:
40
+ enum class InterpolationMode { BILINEAR };
41
+
42
+ ResizeTransform(const FrameDims& dims)
43
+ : outputDims_(dims), interpolationMode_(InterpolationMode::BILINEAR) {}
44
+
45
+ ResizeTransform(const FrameDims& dims, InterpolationMode interpolationMode)
46
+ : outputDims_(dims), interpolationMode_(interpolationMode) {}
47
+
48
+ std::string getFilterGraphCpu() const override;
49
+ std::optional<FrameDims> getOutputFrameDims() const override;
50
+ bool isResize() const override;
51
+
52
+ int getSwsFlags() const;
53
+
54
+ private:
55
+ FrameDims outputDims_;
56
+ InterpolationMode interpolationMode_;
57
+ };
58
+
59
+ } // namespace facebook::torchcodec
@@ -25,6 +25,7 @@ from .ops import (
25
25
  encode_audio_to_file,
26
26
  encode_audio_to_file_like,
27
27
  encode_audio_to_tensor,
28
+ encode_video_to_file,
28
29
  get_ffmpeg_library_versions,
29
30
  get_frame_at_index,
30
31
  get_frame_at_pts,
@@ -10,6 +10,7 @@
10
10
  #include <string>
11
11
  #include "c10/core/SymIntArrayRef.h"
12
12
  #include "c10/util/Exception.h"
13
+ #include "src/torchcodec/_core/AVIOFileLikeContext.h"
13
14
  #include "src/torchcodec/_core/AVIOTensorContext.h"
14
15
  #include "src/torchcodec/_core/Encoder.h"
15
16
  #include "src/torchcodec/_core/SingleStreamDecoder.h"
@@ -31,15 +32,20 @@ TORCH_LIBRARY(torchcodec_ns, m) {
31
32
  m.def("create_from_file(str filename, str? seek_mode=None) -> Tensor");
32
33
  m.def(
33
34
  "encode_audio_to_file(Tensor samples, int sample_rate, str filename, int? bit_rate=None, int? num_channels=None, int? desired_sample_rate=None) -> ()");
35
+ m.def(
36
+ "encode_video_to_file(Tensor frames, int frame_rate, str filename, int? crf=None) -> ()");
34
37
  m.def(
35
38
  "encode_audio_to_tensor(Tensor samples, int sample_rate, str format, int? bit_rate=None, int? num_channels=None, int? desired_sample_rate=None) -> Tensor");
39
+ m.def(
40
+ "_encode_audio_to_file_like(Tensor samples, int sample_rate, str format, int file_like_context, int? bit_rate=None, int? num_channels=None, int? desired_sample_rate=None) -> ()");
36
41
  m.def(
37
42
  "create_from_tensor(Tensor video_tensor, str? seek_mode=None) -> Tensor");
38
- m.def("_convert_to_tensor(int decoder_ptr) -> Tensor");
39
43
  m.def(
40
- "_add_video_stream(Tensor(a!) decoder, *, int? width=None, int? height=None, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str? device=None, (Tensor, Tensor, Tensor)? custom_frame_mappings=None, str? color_conversion_library=None) -> ()");
44
+ "_create_from_file_like(int file_like_context, str? seek_mode=None) -> Tensor");
41
45
  m.def(
42
- "add_video_stream(Tensor(a!) decoder, *, int? width=None, int? height=None, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str? device=None, (Tensor, Tensor, Tensor)? custom_frame_mappings=None) -> ()");
46
+ "_add_video_stream(Tensor(a!) decoder, *, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str device=\"cpu\", str device_variant=\"default\", str transform_specs=\"\", (Tensor, Tensor, Tensor)? custom_frame_mappings=None, str? color_conversion_library=None) -> ()");
47
+ m.def(
48
+ "add_video_stream(Tensor(a!) decoder, *, int? num_threads=None, str? dimension_order=None, int? stream_index=None, str device=\"cpu\", str device_variant=\"default\", str transform_specs=\"\", (Tensor, Tensor, Tensor)? custom_frame_mappings=None) -> ()");
43
49
  m.def(
44
50
  "add_audio_stream(Tensor(a!) decoder, *, int? stream_index=None, int? sample_rate=None, int? num_channels=None) -> ()");
45
51
  m.def("seek_to_pts(Tensor(a!) decoder, float seconds) -> ()");
@@ -49,7 +55,7 @@ TORCH_LIBRARY(torchcodec_ns, m) {
49
55
  m.def(
50
56
  "get_frame_at_index(Tensor(a!) decoder, *, int frame_index) -> (Tensor, Tensor, Tensor)");
51
57
  m.def(
52
- "get_frames_at_indices(Tensor(a!) decoder, *, int[] frame_indices) -> (Tensor, Tensor, Tensor)");
58
+ "get_frames_at_indices(Tensor(a!) decoder, *, Tensor frame_indices) -> (Tensor, Tensor, Tensor)");
53
59
  m.def(
54
60
  "get_frames_in_range(Tensor(a!) decoder, *, int start, int stop, int? step=None) -> (Tensor, Tensor, Tensor)");
55
61
  m.def(
@@ -57,7 +63,7 @@ TORCH_LIBRARY(torchcodec_ns, m) {
57
63
  m.def(
58
64
  "get_frames_by_pts_in_range_audio(Tensor(a!) decoder, *, float start_seconds, float? stop_seconds) -> (Tensor, Tensor)");
59
65
  m.def(
60
- "get_frames_by_pts(Tensor(a!) decoder, *, float[] timestamps) -> (Tensor, Tensor, Tensor)");
66
+ "get_frames_by_pts(Tensor(a!) decoder, *, Tensor timestamps) -> (Tensor, Tensor, Tensor)");
61
67
  m.def("_get_key_frame_indices(Tensor(a!) decoder) -> Tensor");
62
68
  m.def("get_json_metadata(Tensor(a!) decoder) -> str");
63
69
  m.def("get_container_json_metadata(Tensor(a!) decoder) -> str");
@@ -165,6 +171,81 @@ std::string mapToJson(const std::map<std::string, std::string>& metadataMap) {
165
171
  return ss.str();
166
172
  }
167
173
 
174
+ SingleStreamDecoder::SeekMode seekModeFromString(std::string_view seekMode) {
175
+ if (seekMode == "exact") {
176
+ return SingleStreamDecoder::SeekMode::exact;
177
+ } else if (seekMode == "approximate") {
178
+ return SingleStreamDecoder::SeekMode::approximate;
179
+ } else if (seekMode == "custom_frame_mappings") {
180
+ return SingleStreamDecoder::SeekMode::custom_frame_mappings;
181
+ } else {
182
+ TORCH_CHECK(false, "Invalid seek mode: " + std::string(seekMode));
183
+ }
184
+ }
185
+
186
+ int checkedToPositiveInt(const std::string& str) {
187
+ int ret = 0;
188
+ try {
189
+ ret = std::stoi(str);
190
+ } catch (const std::invalid_argument&) {
191
+ TORCH_CHECK(false, "String cannot be converted to an int:" + str);
192
+ } catch (const std::out_of_range&) {
193
+ TORCH_CHECK(false, "String would become integer out of range:" + str);
194
+ }
195
+ TORCH_CHECK(ret > 0, "String must be a positive integer:" + str);
196
+ return ret;
197
+ }
198
+
199
+ // Resize transform specs take the form:
200
+ //
201
+ // "resize, <height>, <width>"
202
+ //
203
+ // Where "resize" is the string literal and <height> and <width> are positive
204
+ // integers.
205
+ Transform* makeResizeTransform(
206
+ const std::vector<std::string>& resizeTransformSpec) {
207
+ TORCH_CHECK(
208
+ resizeTransformSpec.size() == 3,
209
+ "resizeTransformSpec must have 3 elements including its name");
210
+ int height = checkedToPositiveInt(resizeTransformSpec[1]);
211
+ int width = checkedToPositiveInt(resizeTransformSpec[2]);
212
+ return new ResizeTransform(FrameDims(height, width));
213
+ }
214
+
215
+ std::vector<std::string> split(const std::string& str, char delimiter) {
216
+ std::vector<std::string> tokens;
217
+ std::string token;
218
+ std::istringstream tokenStream(str);
219
+ while (std::getline(tokenStream, token, delimiter)) {
220
+ tokens.push_back(token);
221
+ }
222
+ return tokens;
223
+ }
224
+
225
+ // The transformSpecsRaw string is always in the format:
226
+ //
227
+ // "name1, param1, param2, ...; name2, param1, param2, ...; ..."
228
+ //
229
+ // Where "nameX" is the name of the transform, and "paramX" are the parameters.
230
+ std::vector<Transform*> makeTransforms(const std::string& transformSpecsRaw) {
231
+ std::vector<Transform*> transforms;
232
+ std::vector<std::string> transformSpecs = split(transformSpecsRaw, ';');
233
+ for (const std::string& transformSpecRaw : transformSpecs) {
234
+ std::vector<std::string> transformSpec = split(transformSpecRaw, ',');
235
+ TORCH_CHECK(
236
+ transformSpec.size() >= 1,
237
+ "Invalid transform spec: " + transformSpecRaw);
238
+
239
+ auto name = transformSpec[0];
240
+ if (name == "resize") {
241
+ transforms.push_back(makeResizeTransform(transformSpec));
242
+ } else {
243
+ TORCH_CHECK(false, "Invalid transform name: " + name);
244
+ }
245
+ }
246
+ return transforms;
247
+ }
248
+
168
249
  } // namespace
169
250
 
170
251
  // ==============================
@@ -203,33 +284,47 @@ at::Tensor create_from_tensor(
203
284
  realSeek = seekModeFromString(seek_mode.value());
204
285
  }
205
286
 
206
- auto contextHolder = std::make_unique<AVIOFromTensorContext>(video_tensor);
287
+ auto avioContextHolder =
288
+ std::make_unique<AVIOFromTensorContext>(video_tensor);
207
289
 
208
290
  std::unique_ptr<SingleStreamDecoder> uniqueDecoder =
209
- std::make_unique<SingleStreamDecoder>(std::move(contextHolder), realSeek);
291
+ std::make_unique<SingleStreamDecoder>(
292
+ std::move(avioContextHolder), realSeek);
210
293
  return wrapDecoderPointerToTensor(std::move(uniqueDecoder));
211
294
  }
212
295
 
213
- at::Tensor _convert_to_tensor(int64_t decoder_ptr) {
214
- auto decoder = reinterpret_cast<SingleStreamDecoder*>(decoder_ptr);
215
- std::unique_ptr<SingleStreamDecoder> uniqueDecoder(decoder);
296
+ at::Tensor _create_from_file_like(
297
+ int64_t file_like_context,
298
+ std::optional<std::string_view> seek_mode) {
299
+ auto fileLikeContext =
300
+ reinterpret_cast<AVIOFileLikeContext*>(file_like_context);
301
+ TORCH_CHECK(
302
+ fileLikeContext != nullptr, "file_like_context must be a valid pointer");
303
+ std::unique_ptr<AVIOFileLikeContext> avioContextHolder(fileLikeContext);
304
+
305
+ SingleStreamDecoder::SeekMode realSeek = SingleStreamDecoder::SeekMode::exact;
306
+ if (seek_mode.has_value()) {
307
+ realSeek = seekModeFromString(seek_mode.value());
308
+ }
309
+
310
+ std::unique_ptr<SingleStreamDecoder> uniqueDecoder =
311
+ std::make_unique<SingleStreamDecoder>(
312
+ std::move(avioContextHolder), realSeek);
216
313
  return wrapDecoderPointerToTensor(std::move(uniqueDecoder));
217
314
  }
218
315
 
219
316
  void _add_video_stream(
220
317
  at::Tensor& decoder,
221
- std::optional<int64_t> width = std::nullopt,
222
- std::optional<int64_t> height = std::nullopt,
223
318
  std::optional<int64_t> num_threads = std::nullopt,
224
319
  std::optional<std::string_view> dimension_order = std::nullopt,
225
320
  std::optional<int64_t> stream_index = std::nullopt,
226
- std::optional<std::string_view> device = std::nullopt,
321
+ std::string_view device = "cpu",
322
+ std::string_view device_variant = "default",
323
+ std::string_view transform_specs = "",
227
324
  std::optional<std::tuple<at::Tensor, at::Tensor, at::Tensor>>
228
325
  custom_frame_mappings = std::nullopt,
229
326
  std::optional<std::string_view> color_conversion_library = std::nullopt) {
230
327
  VideoStreamOptions videoStreamOptions;
231
- videoStreamOptions.width = width;
232
- videoStreamOptions.height = height;
233
328
  videoStreamOptions.ffmpegThreadCount = num_threads;
234
329
 
235
330
  if (dimension_order.has_value()) {
@@ -253,37 +348,46 @@ void _add_video_stream(
253
348
  ". color_conversion_library must be either filtergraph or swscale.");
254
349
  }
255
350
  }
256
- if (device.has_value()) {
257
- videoStreamOptions.device = createTorchDevice(std::string(device.value()));
258
- }
351
+
352
+ validateDeviceInterface(std::string(device), std::string(device_variant));
353
+
354
+ videoStreamOptions.device = torch::Device(std::string(device));
355
+ videoStreamOptions.deviceVariant = device_variant;
356
+
357
+ std::vector<Transform*> transforms =
358
+ makeTransforms(std::string(transform_specs));
359
+
259
360
  std::optional<SingleStreamDecoder::FrameMappings> converted_mappings =
260
361
  custom_frame_mappings.has_value()
261
362
  ? std::make_optional(makeFrameMappings(custom_frame_mappings.value()))
262
363
  : std::nullopt;
263
364
  auto videoDecoder = unwrapTensorToGetDecoder(decoder);
264
365
  videoDecoder->addVideoStream(
265
- stream_index.value_or(-1), videoStreamOptions, converted_mappings);
366
+ stream_index.value_or(-1),
367
+ transforms,
368
+ videoStreamOptions,
369
+ converted_mappings);
266
370
  }
267
371
 
268
372
  // Add a new video stream at `stream_index` using the provided options.
269
373
  void add_video_stream(
270
374
  at::Tensor& decoder,
271
- std::optional<int64_t> width = std::nullopt,
272
- std::optional<int64_t> height = std::nullopt,
273
375
  std::optional<int64_t> num_threads = std::nullopt,
274
376
  std::optional<std::string_view> dimension_order = std::nullopt,
275
377
  std::optional<int64_t> stream_index = std::nullopt,
276
- std::optional<std::string_view> device = std::nullopt,
378
+ std::string_view device = "cpu",
379
+ std::string_view device_variant = "default",
380
+ std::string_view transform_specs = "",
277
381
  const std::optional<std::tuple<at::Tensor, at::Tensor, at::Tensor>>&
278
382
  custom_frame_mappings = std::nullopt) {
279
383
  _add_video_stream(
280
384
  decoder,
281
- width,
282
- height,
283
385
  num_threads,
284
386
  dimension_order,
285
387
  stream_index,
286
388
  device,
389
+ device_variant,
390
+ transform_specs,
287
391
  custom_frame_mappings);
288
392
  }
289
393
 
@@ -344,11 +448,9 @@ OpsFrameOutput get_frame_at_index(at::Tensor& decoder, int64_t frame_index) {
344
448
  // Return the frames at given indices for a given stream
345
449
  OpsFrameBatchOutput get_frames_at_indices(
346
450
  at::Tensor& decoder,
347
- at::IntArrayRef frame_indices) {
451
+ const at::Tensor& frame_indices) {
348
452
  auto videoDecoder = unwrapTensorToGetDecoder(decoder);
349
- std::vector<int64_t> frameIndicesVec(
350
- frame_indices.begin(), frame_indices.end());
351
- auto result = videoDecoder->getFramesAtIndices(frameIndicesVec);
453
+ auto result = videoDecoder->getFramesAtIndices(frame_indices);
352
454
  return makeOpsFrameBatchOutput(result);
353
455
  }
354
456
 
@@ -367,10 +469,9 @@ OpsFrameBatchOutput get_frames_in_range(
367
469
  // Return the frames at given ptss for a given stream
368
470
  OpsFrameBatchOutput get_frames_by_pts(
369
471
  at::Tensor& decoder,
370
- at::ArrayRef<double> timestamps) {
472
+ const at::Tensor& timestamps) {
371
473
  auto videoDecoder = unwrapTensorToGetDecoder(decoder);
372
- std::vector<double> timestampsVec(timestamps.begin(), timestamps.end());
373
- auto result = videoDecoder->getFramesPlayedAt(timestampsVec);
474
+ auto result = videoDecoder->getFramesPlayedAt(timestamps);
374
475
  return makeOpsFrameBatchOutput(result);
375
476
  }
376
477
 
@@ -397,6 +498,21 @@ OpsAudioFramesOutput get_frames_by_pts_in_range_audio(
397
498
  return makeOpsAudioFramesOutput(result);
398
499
  }
399
500
 
501
+ void encode_video_to_file(
502
+ const at::Tensor& frames,
503
+ int64_t frame_rate,
504
+ std::string_view file_name,
505
+ std::optional<int64_t> crf = std::nullopt) {
506
+ VideoStreamOptions videoStreamOptions;
507
+ videoStreamOptions.crf = crf;
508
+ VideoEncoder(
509
+ frames,
510
+ validateInt64ToInt(frame_rate, "frame_rate"),
511
+ file_name,
512
+ videoStreamOptions)
513
+ .encode();
514
+ }
515
+
400
516
  void encode_audio_to_file(
401
517
  const at::Tensor& samples,
402
518
  int64_t sample_rate,
@@ -441,6 +557,36 @@ at::Tensor encode_audio_to_tensor(
441
557
  .encodeToTensor();
442
558
  }
443
559
 
560
+ void _encode_audio_to_file_like(
561
+ const at::Tensor& samples,
562
+ int64_t sample_rate,
563
+ std::string_view format,
564
+ int64_t file_like_context,
565
+ std::optional<int64_t> bit_rate = std::nullopt,
566
+ std::optional<int64_t> num_channels = std::nullopt,
567
+ std::optional<int64_t> desired_sample_rate = std::nullopt) {
568
+ auto fileLikeContext =
569
+ reinterpret_cast<AVIOFileLikeContext*>(file_like_context);
570
+ TORCH_CHECK(
571
+ fileLikeContext != nullptr, "file_like_context must be a valid pointer");
572
+ std::unique_ptr<AVIOFileLikeContext> avioContextHolder(fileLikeContext);
573
+
574
+ AudioStreamOptions audioStreamOptions;
575
+ audioStreamOptions.bitRate = validateOptionalInt64ToInt(bit_rate, "bit_rate");
576
+ audioStreamOptions.numChannels =
577
+ validateOptionalInt64ToInt(num_channels, "num_channels");
578
+ audioStreamOptions.sampleRate =
579
+ validateOptionalInt64ToInt(desired_sample_rate, "desired_sample_rate");
580
+
581
+ AudioEncoder encoder(
582
+ samples,
583
+ validateInt64ToInt(sample_rate, "sample_rate"),
584
+ format,
585
+ std::move(avioContextHolder),
586
+ audioStreamOptions);
587
+ encoder.encode();
588
+ }
589
+
444
590
  // For testing only. We need to implement this operation as a core library
445
591
  // function because what we're testing is round-tripping pts values as
446
592
  // double-precision floating point numbers from C++ to Python and back to C++.
@@ -694,14 +840,16 @@ void scan_all_streams_to_update_metadata(at::Tensor& decoder) {
694
840
  TORCH_LIBRARY_IMPL(torchcodec_ns, BackendSelect, m) {
695
841
  m.impl("create_from_file", &create_from_file);
696
842
  m.impl("create_from_tensor", &create_from_tensor);
697
- m.impl("_convert_to_tensor", &_convert_to_tensor);
843
+ m.impl("_create_from_file_like", &_create_from_file_like);
698
844
  m.impl(
699
845
  "_get_json_ffmpeg_library_versions", &_get_json_ffmpeg_library_versions);
700
846
  }
701
847
 
702
848
  TORCH_LIBRARY_IMPL(torchcodec_ns, CPU, m) {
703
849
  m.impl("encode_audio_to_file", &encode_audio_to_file);
850
+ m.impl("encode_video_to_file", &encode_video_to_file);
704
851
  m.impl("encode_audio_to_tensor", &encode_audio_to_tensor);
852
+ m.impl("_encode_audio_to_file_like", &_encode_audio_to_file_like);
705
853
  m.impl("seek_to_pts", &seek_to_pts);
706
854
  m.impl("add_video_stream", &add_video_stream);
707
855
  m.impl("_add_video_stream", &_add_video_stream);