torchcodec-0.6.0-cp310-cp310-macosx_11_0_arm64.whl → torchcodec-0.7.0-cp310-cp310-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- torchcodec/.dylibs/libc++.1.0.dylib +0 -0
- torchcodec/.dylibs/libpython3.10.dylib +0 -0
- torchcodec/_core/AVIOContextHolder.cpp +10 -5
- torchcodec/_core/AVIOContextHolder.h +1 -0
- torchcodec/_core/AVIOFileLikeContext.cpp +23 -5
- torchcodec/_core/AVIOFileLikeContext.h +2 -1
- torchcodec/_core/AVIOTensorContext.cpp +4 -2
- torchcodec/_core/CMakeLists.txt +57 -18
- torchcodec/_core/Cache.h +138 -0
- torchcodec/_core/CpuDeviceInterface.cpp +55 -149
- torchcodec/_core/CpuDeviceInterface.h +13 -23
- torchcodec/_core/CudaDeviceInterface.cpp +310 -78
- torchcodec/_core/CudaDeviceInterface.h +3 -1
- torchcodec/_core/Encoder.cpp +13 -5
- torchcodec/_core/Encoder.h +6 -4
- torchcodec/_core/FFMPEGCommon.cpp +9 -1
- torchcodec/_core/FFMPEGCommon.h +15 -0
- torchcodec/_core/FilterGraph.cpp +142 -0
- torchcodec/_core/FilterGraph.h +45 -0
- torchcodec/_core/SingleStreamDecoder.cpp +32 -32
- torchcodec/_core/ValidationUtils.cpp +35 -0
- torchcodec/_core/ValidationUtils.h +21 -0
- torchcodec/_core/__init__.py +1 -0
- torchcodec/_core/custom_ops.cpp +23 -23
- torchcodec/_core/fetch_and_expose_non_gpl_ffmpeg_libs.cmake +81 -7
- torchcodec/_core/ops.py +56 -0
- torchcodec/_core/pybind_ops.cpp +39 -1
- torchcodec/_internally_replaced_utils.py +9 -6
- torchcodec/decoders/_audio_decoder.py +3 -1
- torchcodec/decoders/_decoder_utils.py +1 -1
- torchcodec/decoders/_video_decoder.py +88 -29
- torchcodec/encoders/_audio_encoder.py +41 -1
- torchcodec/libtorchcodec_core4.dylib +0 -0
- torchcodec/libtorchcodec_core5.dylib +0 -0
- torchcodec/libtorchcodec_core6.dylib +0 -0
- torchcodec/libtorchcodec_core7.dylib +0 -0
- torchcodec/libtorchcodec_custom_ops4.dylib +0 -0
- torchcodec/libtorchcodec_custom_ops5.dylib +0 -0
- torchcodec/libtorchcodec_custom_ops6.dylib +0 -0
- torchcodec/libtorchcodec_custom_ops7.dylib +0 -0
- torchcodec/libtorchcodec_pybind_ops4.so +0 -0
- torchcodec/libtorchcodec_pybind_ops5.so +0 -0
- torchcodec/libtorchcodec_pybind_ops6.so +0 -0
- torchcodec/libtorchcodec_pybind_ops7.so +0 -0
- torchcodec/samplers/_index_based.py +2 -0
- torchcodec/samplers/_time_based.py +2 -0
- torchcodec/version.py +1 -1
- {torchcodec-0.6.0.dist-info → torchcodec-0.7.0.dist-info}/METADATA +8 -35
- torchcodec-0.7.0.dist-info/RECORD +69 -0
- torchcodec-0.6.0.dist-info/RECORD +0 -64
- {torchcodec-0.6.0.dist-info → torchcodec-0.7.0.dist-info}/WHEEL +0 -0
- {torchcodec-0.6.0.dist-info → torchcodec-0.7.0.dist-info}/licenses/LICENSE +0 -0
- {torchcodec-0.6.0.dist-info → torchcodec-0.7.0.dist-info}/top_level.txt +0 -0
torchcodec/.dylibs/libc++.1.0.dylib
CHANGED
Binary file

torchcodec/.dylibs/libpython3.10.dylib
CHANGED
Binary file
torchcodec/_core/AVIOContextHolder.cpp
CHANGED
@@ -14,6 +14,7 @@ void AVIOContextHolder::createAVIOContext(
     AVIOWriteFunction write,
     AVIOSeekFunction seek,
     void* heldData,
+    bool isForWriting,
     int bufferSize) {
   TORCH_CHECK(
       bufferSize > 0,
@@ -23,14 +24,18 @@ void AVIOContextHolder::createAVIOContext(
       buffer != nullptr,
       "Failed to allocate buffer of size " + std::to_string(bufferSize));

-  TORCH_CHECK(
-
-
-
+  TORCH_CHECK(seek != nullptr, "seek method must be defined");
+
+  if (isForWriting) {
+    TORCH_CHECK(write != nullptr, "write method must be defined for writing");
+  } else {
+    TORCH_CHECK(read != nullptr, "read method must be defined for reading");
+  }
+
   avioContext_.reset(avioAllocContext(
       buffer,
       bufferSize,
-      /*write_flag=*/
+      /*write_flag=*/isForWriting,
       heldData,
       read,
       write,
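The new isForWriting flag makes the holder validate only the callback it will actually use, and it is forwarded to FFmpeg as the AVIO write_flag. As a minimal sketch of how a read-only subclass might wire this up (the MyReader type and callback bodies are hypothetical; the real subclasses are AVIOFileLikeContext and the tensor contexts below, and the header path follows the include convention seen in this diff):

    #include <cstdint>

    #include "src/torchcodec/_core/AVIOContextHolder.h"

    // Hypothetical state object for a read-only AVIO holder.
    struct MyReader {
      // ... source of bytes ...
    };

    static int myRead(void* opaque, uint8_t* buf, int buf_size) {
      (void)opaque; (void)buf; (void)buf_size;
      return 0;  // sketch: return bytes produced, or AVERROR_EOF at end of stream
    }

    static int64_t mySeek(void* opaque, int64_t offset, int whence) {
      (void)opaque; (void)whence;
      return offset;  // sketch: return the new stream position
    }

    class MyReadOnlyContext : public facebook::torchcodec::AVIOContextHolder {
     public:
      explicit MyReadOnlyContext(MyReader* reader) {
        // Read mode: no write callback is needed, so pass nullptr and
        // isForWriting=false; the holder then checks that read and seek are set.
        createAVIOContext(&myRead, nullptr, &mySeek, reader,
                          /*isForWriting=*/false);
      }
    };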
torchcodec/_core/AVIOFileLikeContext.cpp
CHANGED
@@ -9,21 +9,31 @@

 namespace facebook::torchcodec {

-AVIOFileLikeContext::AVIOFileLikeContext(py::object fileLike)
+AVIOFileLikeContext::AVIOFileLikeContext(
+    const py::object& fileLike,
+    bool isForWriting)
     : fileLike_{UniquePyObject(new py::object(fileLike))} {
   {
     // TODO: Is it necessary to acquire the GIL here? Is it maybe even
     // harmful? At the moment, this is only called from within a pybind
     // function, and pybind guarantees we have the GIL.
     py::gil_scoped_acquire gil;
-
-
-
+
+    if (isForWriting) {
+      TORCH_CHECK(
+          py::hasattr(fileLike, "write"),
+          "File like object must implement a write method for writing.");
+    } else {
+      TORCH_CHECK(
+          py::hasattr(fileLike, "read"),
+          "File like object must implement a read method for reading.");
+    }
+
     TORCH_CHECK(
         py::hasattr(fileLike, "seek"),
         "File like object must implement a seek method.");
   }
-  createAVIOContext(&read,
+  createAVIOContext(&read, &write, &seek, &fileLike_, isForWriting);
 }

 int AVIOFileLikeContext::read(void* opaque, uint8_t* buf, int buf_size) {
@@ -77,4 +87,12 @@ int64_t AVIOFileLikeContext::seek(void* opaque, int64_t offset, int whence) {
   return py::cast<int64_t>((*fileLike)->attr("seek")(offset, whence));
 }

+int AVIOFileLikeContext::write(void* opaque, const uint8_t* buf, int buf_size) {
+  auto fileLike = static_cast<UniquePyObject*>(opaque);
+  py::gil_scoped_acquire gil;
+  py::bytes bytes_obj(reinterpret_cast<const char*>(buf), buf_size);
+
+  return py::cast<int>((*fileLike)->attr("write")(bytes_obj));
+}
+
 } // namespace facebook::torchcodec
torchcodec/_core/AVIOFileLikeContext.h
CHANGED
@@ -19,11 +19,12 @@ namespace facebook::torchcodec {
 // and seek calls back up to the methods on the Python object.
 class AVIOFileLikeContext : public AVIOContextHolder {
  public:
-  explicit AVIOFileLikeContext(py::object fileLike);
+  explicit AVIOFileLikeContext(const py::object& fileLike, bool isForWriting);

  private:
   static int read(void* opaque, uint8_t* buf, int buf_size);
   static int64_t seek(void* opaque, int64_t offset, int whence);
+  static int write(void* opaque, const uint8_t* buf, int buf_size);

   // Note that we dynamically allocate the Python object because we need to
   // strictly control when its destructor is called. We must hold the GIL
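Together, the .cpp and .h changes expose a read path and a write path behind one constructor flag. A minimal sketch of constructing the context from C++ for each mode (assuming the torchcodec headers are on the include path and a live Python interpreter, since py::object methods are called under the GIL):

    #include <pybind11/pybind11.h>

    #include "src/torchcodec/_core/AVIOFileLikeContext.h"

    namespace py = pybind11;
    using facebook::torchcodec::AVIOFileLikeContext;

    // Sketch: pick the mode at construction time. For decoding, the object
    // must provide read() and seek(); for encoding, write() and seek(). The
    // TORCH_CHECKs in the constructor above enforce exactly this.
    void sketchFileLikeContexts(const py::object& readable,
                                const py::object& writable) {
      AVIOFileLikeContext decodeCtx(readable, /*isForWriting=*/false);
      AVIOFileLikeContext encodeCtx(writable, /*isForWriting=*/true);
    }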
torchcodec/_core/AVIOTensorContext.cpp
CHANGED
@@ -105,12 +105,14 @@ AVIOFromTensorContext::AVIOFromTensorContext(torch::Tensor data)
   TORCH_CHECK(data.numel() > 0, "data must not be empty");
   TORCH_CHECK(data.is_contiguous(), "data must be contiguous");
   TORCH_CHECK(data.scalar_type() == torch::kUInt8, "data must be kUInt8");
-  createAVIOContext(
+  createAVIOContext(
+      &read, nullptr, &seek, &tensorContext_, /*isForWriting=*/false);
 }

 AVIOToTensorContext::AVIOToTensorContext()
     : tensorContext_{torch::empty({INITIAL_TENSOR_SIZE}, {torch::kUInt8}), 0} {
-  createAVIOContext(
+  createAVIOContext(
+      nullptr, &write, &seek, &tensorContext_, /*isForWriting=*/true);
 }

 torch::Tensor AVIOToTensorContext::getOutputTensor() {
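For context, these callbacks follow FFmpeg's avio_alloc_context contract: the read callback returns the number of bytes produced or AVERROR_EOF, and the seek callback returns the new position. A minimal sketch of a conforming read callback over an in-memory buffer, using hypothetical names rather than the diff's actual implementation:

    extern "C" {
    #include <libavformat/avio.h>
    }
    #include <algorithm>
    #include <cstring>
    #include <vector>

    struct BufferState {
      std::vector<uint8_t> data;
      int64_t pos = 0;
    };

    // Read callback per the avio_alloc_context contract: copy up to buf_size
    // bytes, return the count, or AVERROR_EOF when the buffer is exhausted.
    static int readPacket(void* opaque, uint8_t* buf, int buf_size) {
      auto* state = static_cast<BufferState*>(opaque);
      int64_t remaining = static_cast<int64_t>(state->data.size()) - state->pos;
      if (remaining <= 0) {
        return AVERROR_EOF;
      }
      int n = static_cast<int>(std::min<int64_t>(buf_size, remaining));
      std::memcpy(buf, state->data.data() + state->pos, n);
      state->pos += n;
      return n;
    }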
torchcodec/_core/CMakeLists.txt
CHANGED
@@ -11,10 +11,29 @@ find_package(Python3 ${PYTHON_VERSION} EXACT COMPONENTS Development)
 if(DEFINED TORCHCODEC_DISABLE_COMPILE_WARNING_AS_ERROR AND TORCHCODEC_DISABLE_COMPILE_WARNING_AS_ERROR)
   set(TORCHCODEC_WERROR_OPTION "")
 else()
-
+  if (WIN32)
+    # TODO set warnings as errors on Windows as well.
+    # set(TORCHCODEC_WERROR_OPTION "/WX")
+  else()
+    set(TORCHCODEC_WERROR_OPTION "-Werror")
+  endif()
+endif()
+
+if (WIN32)
+  # Avoid warnings about non-ASCII characters in source files.
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4819")
+  # Important for when we add Windows CUDA: exporting all symbols is limited to
+  # 65535 symbols, which (apparently) will not work for CUDA.
+  # https://github.com/pytorch/pytorch/pull/3650
+  set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
+endif()
+
+if (WIN32)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4 ${TORCHCODEC_WERROR_OPTION} ${TORCH_CXX_FLAGS}")
+else()
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -pedantic ${TORCHCODEC_WERROR_OPTION} ${TORCH_CXX_FLAGS}")
 endif()

-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -pedantic ${TORCHCODEC_WERROR_OPTION} ${TORCH_CXX_FLAGS}")

 function(make_torchcodec_sublibrary
   library_name
@@ -39,6 +58,7 @@ function(make_torchcodec_sublibrary
     PUBLIC
     ${library_dependencies}
   )
+
 endfunction()

 function(make_torchcodec_libraries
@@ -50,16 +70,17 @@ function(make_torchcodec_libraries
   #
   # 1. libtorchcodec_coreN.{ext}: Base library which contains the
   #    implementation of VideoDecoder and everything VideoDecoder needs. On
-  #    Linux, {ext} is so. On Mac, it is dylib.
+  #    Linux, {ext} is so. On Mac, it is dylib. On Windows it's dll.
   #
   # 2. libtorchcodec_custom_opsN.{ext}: Implementation of the PyTorch custom
   #    ops. Depends on libtorchcodec_coreN.{ext}. On Linux, {ext} is so.
-  #    On Mac, it is dylib.
+  #    On Mac, it is dylib. On Windows it's dll.
   #
   # 3. libtorchcodec_pybind_opsN.{ext}: Implementation of the pybind11 ops. We
   #    keep these separate from the PyTorch custom ops because we have to
   #    load these libraries separately on the Python side. Depends on
-  #    libtorchcodec_coreN.{ext}. On BOTH Linux and Mac {ext} is so.
+  #    libtorchcodec_coreN.{ext}. On BOTH Linux and Mac {ext} is so. On
+  #    Windows, it's pyd.

   # 1. Create libtorchcodec_coreN.{ext}.
   set(core_library_name "libtorchcodec_core${ffmpeg_major_version}")
@@ -67,11 +88,13 @@ function(make_torchcodec_libraries
       AVIOContextHolder.cpp
       AVIOTensorContext.cpp
       FFMPEGCommon.cpp
+      FilterGraph.cpp
       Frame.cpp
       DeviceInterface.cpp
       CpuDeviceInterface.cpp
       SingleStreamDecoder.cpp
       Encoder.cpp
+      ValidationUtils.cpp
   )

   if(ENABLE_CUDA)
@@ -140,15 +163,26 @@ function(make_torchcodec_libraries
       "${pybind_ops_sources}"
       "${pybind_ops_dependencies}"
   )
+
+  if(WIN32)
+    # On Windows, we need to set the suffix to .pyd so that Python can
+    # import the shared library as a module. Just setting the MODULE type
+    # isn't enough.
+    set_target_properties(${pybind_ops_library_name} PROPERTIES SUFFIX ".pyd")
+  endif()
+
   # pybind11 limits the visibility of symbols in the shared library to prevent
   # stray initialization of py::objects. The rest of the object code must
   # match. See:
   # https://pybind11.readthedocs.io/en/stable/faq.html#someclass-declared-with-greater-visibility-than-the-type-of-its-field-someclass-member-wattributes
-
-
-
-
-
+  if(NOT WIN32)
+    target_compile_options(
+      ${pybind_ops_library_name}
+      PUBLIC
+      "-fvisibility=hidden"
+    )
+  endif()
+
   # The value we use here must match the value we return from
   # _get_pybind_ops_module_name() on the Python side. If the values do not
   # match, then we will be unable to import the C++ shared library as a
@@ -158,14 +192,17 @@ function(make_torchcodec_libraries
     PRIVATE
     PYBIND_OPS_MODULE_NAME=core_pybind_ops
   )
-
-
-
-
-
-
-
-
+
+  if(APPLE)
+    # If we don't make sure this flag is set, we run into segfaults at import
+    # time on Mac. See:
+    # https://github.com/pybind/pybind11/issues/3907#issuecomment-1170412764
+    target_link_options(
+      ${pybind_ops_library_name}
+      PUBLIC
+      "LINKER:-undefined,dynamic_lookup"
+    )
+  endif()

   # Install all libraries.
   set(
@@ -183,7 +220,9 @@ function(make_torchcodec_libraries
   install(
     TARGETS ${all_libraries}
     LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}
+    RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX} # For Windows
   )
+
 endfunction()

 if(DEFINED ENV{BUILD_AGAINST_ALL_FFMPEG_FROM_S3})
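The PYBIND_OPS_MODULE_NAME define above must agree with the module name Python tries to import. A sketch of how such a compile-time define is typically consumed in a pybind11 entry point; whether pybind_ops.cpp uses exactly this pattern is an assumption:

    #include <pybind11/pybind11.h>

    // Assumption: the build passes -DPYBIND_OPS_MODULE_NAME=core_pybind_ops,
    // as the CMake above does; default it here so the sketch is self-contained.
    #ifndef PYBIND_OPS_MODULE_NAME
    #define PYBIND_OPS_MODULE_NAME core_pybind_ops
    #endif

    // PYBIND11_MODULE expands the macro argument, so the resulting module is
    // named "core_pybind_ops" -- the name that _get_pybind_ops_module_name()
    // must return on the Python side for the import to succeed.
    PYBIND11_MODULE(PYBIND_OPS_MODULE_NAME, m) {
      m.def("ping", [] { return true; });  // placeholder binding for the sketch
    }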
torchcodec/_core/Cache.h
ADDED
@@ -0,0 +1,138 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#pragma once
+
+#include <torch/types.h>
+#include <memory>
+#include <mutex>
+
+namespace facebook::torchcodec {
+
+// This header defines simple cache class primitives to store reusable objects
+// across TorchCodec stream instances. Intended usage is to store hardware
+// contexts whose creation is expensive. The cache mechanism is as follows:
+// 1. 'PerGpuCache' provides a dynamic cache with the specified maximum capacity
+//    for the given number of GPUs.
+// 2. When a stream object (e.g. SingleStreamDecoder) is destroyed, the cachable
+//    object must be released to the cache. The cache will accept the object if
+//    it is not full.
+// 3. When a stream object (e.g. SingleStreamDecoder) is created, the cachable
+//    object must first be queried from the cache. If the cache is empty then a
+//    new object must be created.
+
+template <typename T, typename D = std::default_delete<T>>
+class Cache {
+ public:
+  using element_type = std::unique_ptr<T, D>;
+
+  explicit Cache(int capacity) : capacity_(capacity) {}
+
+  // Adds an object to the cache if the cache has capacity. Returns true
+  // if the object was added and false otherwise.
+  bool addIfCacheHasCapacity(element_type&& obj);
+
+  // Returns an object from the cache. The cache does not hold a reference
+  // to the object after this call.
+  element_type get();
+
+ private:
+  int capacity_;
+  std::mutex mutex_;
+  std::vector<element_type> cache_;
+};
+
+template <typename T, typename D>
+bool Cache<T, D>::addIfCacheHasCapacity(element_type&& obj) {
+  std::scoped_lock lock(mutex_);
+  if (capacity_ >= 0 && cache_.size() >= static_cast<size_t>(capacity_)) {
+    return false;
+  }
+  cache_.push_back(std::move(obj));
+  return true;
+}
+
+template <typename T, typename D>
+typename Cache<T, D>::element_type Cache<T, D>::get() {
+  std::scoped_lock lock(mutex_);
+  if (cache_.empty()) {
+    return nullptr;
+  }
+
+  element_type obj = std::move(cache_.back());
+  cache_.pop_back();
+  return obj;
+}
+
+template <typename T, typename D = std::default_delete<T>>
+class PerGpuCache {
+ public:
+  using element_type = typename Cache<T, D>::element_type;
+
+  // Initializes 'maxGpus' number of caches. Each cache can hold no
+  // more than 'capacity' items. If 'capacity' < 0, the cache size is
+  // unlimited.
+  PerGpuCache(int maxGpus, int capacity) {
+    TORCH_CHECK(maxGpus > 0, "maxGpus for PerGpuCache must be >0");
+    for (int i = 0; i < maxGpus; ++i) {
+      cache_.emplace_back(std::make_unique<Cache<T, D>>(capacity));
+    }
+  }
+
+  // Adds an object to the specified device cache if the cache has
+  // capacity. Returns true if the object was added and false otherwise.
+  bool addIfCacheHasCapacity(const torch::Device& device, element_type&& obj);
+
+  // Returns an object from the cache of the specified device. The cache
+  // does not hold a reference to the object after this call.
+  element_type get(const torch::Device& device);
+
+ private:
+  // The 'Cache' class contains a mutex, which makes it non-movable and
+  // non-copyable, so we need to wrap it in std::unique_ptr.
+  std::vector<std::unique_ptr<Cache<T, D>>> cache_;
+};
+
+// Note: this function is inline for convenience, not performance. Because the
+// rest of this file is template functions, they must all be defined in this
+// header. This function is not a template function, and should, in principle,
+// be defined in a .cpp file to preserve the One Definition Rule. That's
+// annoying for such a small amount of code, so we just inline it. If this file
+// grows, and there are more such functions, we should break them out into a
+// .cpp file.
+inline torch::DeviceIndex getNonNegativeDeviceIndex(
+    const torch::Device& device) {
+  torch::DeviceIndex deviceIndex = device.index();
+  // For single GPU machines libtorch returns -1 for the device index. So for
+  // that case we set the device index to 0. That's used in the per-GPU cache
+  // implementation and during initialization of CUDA and FFmpeg contexts,
+  // which require non-negative indices.
+  deviceIndex = std::max<at::DeviceIndex>(deviceIndex, 0);
+  TORCH_CHECK(deviceIndex >= 0, "Device index out of range");
+  return deviceIndex;
+}
+
+template <typename T, typename D>
+bool PerGpuCache<T, D>::addIfCacheHasCapacity(
+    const torch::Device& device,
+    element_type&& obj) {
+  torch::DeviceIndex deviceIndex = getNonNegativeDeviceIndex(device);
+  TORCH_CHECK(
+      static_cast<size_t>(deviceIndex) < cache_.size(),
+      "Device index out of range");
+  return cache_[deviceIndex]->addIfCacheHasCapacity(std::move(obj));
+}
+
+template <typename T, typename D>
+typename PerGpuCache<T, D>::element_type PerGpuCache<T, D>::get(
+    const torch::Device& device) {
+  torch::DeviceIndex deviceIndex = getNonNegativeDeviceIndex(device);
+  TORCH_CHECK(
+      static_cast<size_t>(deviceIndex) < cache_.size(),
+      "Device index out of range");
+  return cache_[deviceIndex]->get();
+}
+
+} // namespace facebook::torchcodec
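A minimal sketch of how these primitives compose, using a hypothetical ExpensiveContext stand-in; in this release the real call sites are the CUDA device interface's cached hardware contexts:

    #include <memory>
    #include <torch/types.h>

    // Hypothetical stand-in for an expensive-to-create hardware context.
    struct ExpensiveContext {
      int id = 0;
    };

    void sketchPerGpuCacheUsage() {
      using facebook::torchcodec::PerGpuCache;
      // Up to 8 GPUs, at most 2 cached contexts per GPU.
      PerGpuCache<ExpensiveContext> cache(/*maxGpus=*/8, /*capacity=*/2);
      torch::Device device("cuda:0");

      // On stream creation: try the cache first, create only on a miss.
      auto ctx = cache.get(device);
      if (!ctx) {
        ctx = std::make_unique<ExpensiveContext>();
      }

      // ... use ctx for decoding ...

      // On stream destruction: offer the context back; it is dropped if the
      // per-device cache is already full.
      cache.addIfCacheHasCapacity(device, std::move(ctx));
    }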
torchcodec/_core/CpuDeviceInterface.cpp
CHANGED
@@ -6,11 +6,6 @@

 #include "src/torchcodec/_core/CpuDeviceInterface.h"

-extern "C" {
-#include <libavfilter/buffersink.h>
-#include <libavfilter/buffersrc.h>
-}
-
 namespace facebook::torchcodec {
 namespace {

@@ -20,17 +15,15 @@ static bool g_cpu = registerDeviceInterface(

 } // namespace

-bool CpuDeviceInterface::DecodedFrameContext::operator==(
-    const CpuDeviceInterface::DecodedFrameContext& other) const {
-  return
-
-
-      expectedWidth == other.expectedWidth &&
-      expectedHeight == other.expectedHeight;
+bool CpuDeviceInterface::SwsFrameContext::operator==(
+    const CpuDeviceInterface::SwsFrameContext& other) const {
+  return inputWidth == other.inputWidth && inputHeight == other.inputHeight &&
+      inputFormat == other.inputFormat && outputWidth == other.outputWidth &&
+      outputHeight == other.outputHeight;
 }

-bool CpuDeviceInterface::DecodedFrameContext::operator!=(
-    const CpuDeviceInterface::DecodedFrameContext& other) const {
+bool CpuDeviceInterface::SwsFrameContext::operator!=(
+    const CpuDeviceInterface::SwsFrameContext& other) const {
   return !(*this == other);
 }

@@ -75,22 +68,8 @@ void CpuDeviceInterface::convertAVFrameToFrameOutput(
   }

   torch::Tensor outputTensor;
-  // We need to compare the current frame context with our previous frame
-  // context. If they are different, then we need to re-create our colorspace
-  // conversion objects. We create our colorspace conversion objects late so
-  // that we don't have to depend on the unreliable metadata in the header.
-  // And we sometimes re-create them because it's possible for frame
-  // resolution to change mid-stream. Finally, we want to reuse the colorspace
-  // conversion objects as much as possible for performance reasons.
   enum AVPixelFormat frameFormat =
       static_cast<enum AVPixelFormat>(avFrame->format);
-  auto frameContext = DecodedFrameContext{
-      avFrame->width,
-      avFrame->height,
-      frameFormat,
-      avFrame->sample_aspect_ratio,
-      expectedOutputWidth,
-      expectedOutputHeight};

   // By default, we want to use swscale for color conversion because it is
   // faster. However, it has width requirements, so we may need to fall back
@@ -111,12 +90,27 @@ void CpuDeviceInterface::convertAVFrameToFrameOutput(
       videoStreamOptions.colorConversionLibrary.value_or(defaultLibrary);

   if (colorConversionLibrary == ColorConversionLibrary::SWSCALE) {
+    // We need to compare the current frame context with our previous frame
+    // context. If they are different, then we need to re-create our colorspace
+    // conversion objects. We create our colorspace conversion objects late so
+    // that we don't have to depend on the unreliable metadata in the header.
+    // And we sometimes re-create them because it's possible for frame
+    // resolution to change mid-stream. Finally, we want to reuse the colorspace
+    // conversion objects as much as possible for performance reasons.
+    SwsFrameContext swsFrameContext;
+
+    swsFrameContext.inputWidth = avFrame->width;
+    swsFrameContext.inputHeight = avFrame->height;
+    swsFrameContext.inputFormat = frameFormat;
+    swsFrameContext.outputWidth = expectedOutputWidth;
+    swsFrameContext.outputHeight = expectedOutputHeight;
+
     outputTensor = preAllocatedOutputTensor.value_or(allocateEmptyHWCTensor(
         expectedOutputHeight, expectedOutputWidth, torch::kCPU));

-    if (!swsContext_ ||
-        createSwsContext(
-
+    if (!swsContext_ || prevSwsFrameContext_ != swsFrameContext) {
+      createSwsContext(swsFrameContext, avFrame->colorspace);
+      prevSwsFrameContext_ = swsFrameContext;
     }
     int resultHeight =
         convertAVFrameToTensorUsingSwsScale(avFrame, outputTensor);
@@ -132,9 +126,29 @@ void CpuDeviceInterface::convertAVFrameToFrameOutput(

     frameOutput.data = outputTensor;
   } else if (colorConversionLibrary == ColorConversionLibrary::FILTERGRAPH) {
-
-
-
+    // See comment above in the swscale branch about the filterGraphContext_
+    // creation.
+    FiltersContext filtersContext;
+
+    filtersContext.inputWidth = avFrame->width;
+    filtersContext.inputHeight = avFrame->height;
+    filtersContext.inputFormat = frameFormat;
+    filtersContext.inputAspectRatio = avFrame->sample_aspect_ratio;
+    filtersContext.outputWidth = expectedOutputWidth;
+    filtersContext.outputHeight = expectedOutputHeight;
+    filtersContext.outputFormat = AV_PIX_FMT_RGB24;
+    filtersContext.timeBase = timeBase;
+
+    std::stringstream filters;
+    filters << "scale=" << expectedOutputWidth << ":" << expectedOutputHeight;
+    filters << ":sws_flags=bilinear";
+
+    filtersContext.filtergraphStr = filters.str();
+
+    if (!filterGraphContext_ || prevFiltersContext_ != filtersContext) {
+      filterGraphContext_ =
+          std::make_unique<FilterGraph>(filtersContext, videoStreamOptions);
+      prevFiltersContext_ = std::move(filtersContext);
     }
     outputTensor = convertAVFrameToTensorUsingFilterGraph(avFrame);

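Both branches above follow the same caching idiom: build a cheap, comparable context object per frame, and rebuild the expensive conversion state only when that context changes. A minimal standalone sketch of the idiom, with hypothetical names standing in for SwsFrameContext/FiltersContext and their converters:

    #include <memory>

    // Hypothetical comparable key describing a conversion configuration.
    struct ConversionKey {
      int inputWidth = 0, inputHeight = 0;
      int outputWidth = 0, outputHeight = 0;
      bool operator==(const ConversionKey& o) const {
        return inputWidth == o.inputWidth && inputHeight == o.inputHeight &&
            outputWidth == o.outputWidth && outputHeight == o.outputHeight;
      }
      bool operator!=(const ConversionKey& o) const { return !(*this == o); }
    };

    struct ExpensiveConverter {  // stand-in for SwsContext / FilterGraph
      explicit ExpensiveConverter(const ConversionKey&) {}
    };

    // Rebuild the converter only when the per-frame key changes, mirroring
    // the prevSwsFrameContext_ / prevFiltersContext_ checks in the diff.
    void convertFrame(const ConversionKey& key,
                      std::unique_ptr<ExpensiveConverter>& converter,
                      ConversionKey& prevKey) {
      if (!converter || prevKey != key) {
        converter = std::make_unique<ExpensiveConverter>(key);
        prevKey = key;
      }
      // ... run the conversion with *converter ...
    }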
@@ -187,14 +201,8 @@ int CpuDeviceInterface::convertAVFrameToTensorUsingSwsScale(

 torch::Tensor CpuDeviceInterface::convertAVFrameToTensorUsingFilterGraph(
     const UniqueAVFrame& avFrame) {
-
-      filterGraphContext_.sourceContext, avFrame.get());
-  TORCH_CHECK(
-      status >= AVSUCCESS, "Failed to add frame to buffer source context");
+  UniqueAVFrame filteredAVFrame = filterGraphContext_->convert(avFrame);

-  UniqueAVFrame filteredAVFrame(av_frame_alloc());
-  status = av_buffersink_get_frame(
-      filterGraphContext_.sinkContext, filteredAVFrame.get());
   TORCH_CHECK_EQ(filteredAVFrame->format, AV_PIX_FMT_RGB24);

   auto frameDims = getHeightAndWidthFromResizedAVFrame(*filteredAVFrame.get());
@@ -210,117 +218,15 @@ torch::Tensor CpuDeviceInterface::convertAVFrameToTensorUsingFilterGraph(
       filteredAVFramePtr->data[0], shape, strides, deleter, {torch::kUInt8});
 }

-void CpuDeviceInterface::createFilterGraph(
-    const DecodedFrameContext& frameContext,
-    const VideoStreamOptions& videoStreamOptions,
-    const AVRational& timeBase) {
-  filterGraphContext_.filterGraph.reset(avfilter_graph_alloc());
-  TORCH_CHECK(filterGraphContext_.filterGraph.get() != nullptr);
-
-  if (videoStreamOptions.ffmpegThreadCount.has_value()) {
-    filterGraphContext_.filterGraph->nb_threads =
-        videoStreamOptions.ffmpegThreadCount.value();
-  }
-
-  const AVFilter* buffersrc = avfilter_get_by_name("buffer");
-  const AVFilter* buffersink = avfilter_get_by_name("buffersink");
-
-  std::stringstream filterArgs;
-  filterArgs << "video_size=" << frameContext.decodedWidth << "x"
-             << frameContext.decodedHeight;
-  filterArgs << ":pix_fmt=" << frameContext.decodedFormat;
-  filterArgs << ":time_base=" << timeBase.num << "/" << timeBase.den;
-  filterArgs << ":pixel_aspect=" << frameContext.decodedAspectRatio.num << "/"
-             << frameContext.decodedAspectRatio.den;
-
-  int status = avfilter_graph_create_filter(
-      &filterGraphContext_.sourceContext,
-      buffersrc,
-      "in",
-      filterArgs.str().c_str(),
-      nullptr,
-      filterGraphContext_.filterGraph.get());
-  TORCH_CHECK(
-      status >= 0,
-      "Failed to create filter graph: ",
-      filterArgs.str(),
-      ": ",
-      getFFMPEGErrorStringFromErrorCode(status));
-
-  status = avfilter_graph_create_filter(
-      &filterGraphContext_.sinkContext,
-      buffersink,
-      "out",
-      nullptr,
-      nullptr,
-      filterGraphContext_.filterGraph.get());
-  TORCH_CHECK(
-      status >= 0,
-      "Failed to create filter graph: ",
-      getFFMPEGErrorStringFromErrorCode(status));
-
-  enum AVPixelFormat pix_fmts[] = {AV_PIX_FMT_RGB24, AV_PIX_FMT_NONE};
-
-  status = av_opt_set_int_list(
-      filterGraphContext_.sinkContext,
-      "pix_fmts",
-      pix_fmts,
-      AV_PIX_FMT_NONE,
-      AV_OPT_SEARCH_CHILDREN);
-  TORCH_CHECK(
-      status >= 0,
-      "Failed to set output pixel formats: ",
-      getFFMPEGErrorStringFromErrorCode(status));
-
-  UniqueAVFilterInOut outputs(avfilter_inout_alloc());
-  UniqueAVFilterInOut inputs(avfilter_inout_alloc());
-
-  outputs->name = av_strdup("in");
-  outputs->filter_ctx = filterGraphContext_.sourceContext;
-  outputs->pad_idx = 0;
-  outputs->next = nullptr;
-  inputs->name = av_strdup("out");
-  inputs->filter_ctx = filterGraphContext_.sinkContext;
-  inputs->pad_idx = 0;
-  inputs->next = nullptr;
-
-  std::stringstream description;
-  description << "scale=" << frameContext.expectedWidth << ":"
-              << frameContext.expectedHeight;
-  description << ":sws_flags=bilinear";
-
-  AVFilterInOut* outputsTmp = outputs.release();
-  AVFilterInOut* inputsTmp = inputs.release();
-  status = avfilter_graph_parse_ptr(
-      filterGraphContext_.filterGraph.get(),
-      description.str().c_str(),
-      &inputsTmp,
-      &outputsTmp,
-      nullptr);
-  outputs.reset(outputsTmp);
-  inputs.reset(inputsTmp);
-  TORCH_CHECK(
-      status >= 0,
-      "Failed to parse filter description: ",
-      getFFMPEGErrorStringFromErrorCode(status));
-
-  status =
-      avfilter_graph_config(filterGraphContext_.filterGraph.get(), nullptr);
-  TORCH_CHECK(
-      status >= 0,
-      "Failed to configure filter graph: ",
-      getFFMPEGErrorStringFromErrorCode(status));
-}
-
 void CpuDeviceInterface::createSwsContext(
-    const DecodedFrameContext& frameContext,
+    const SwsFrameContext& swsFrameContext,
     const enum AVColorSpace colorspace) {
   SwsContext* swsContext = sws_getContext(
-
-
-
-
-
+      swsFrameContext.inputWidth,
+      swsFrameContext.inputHeight,
+      swsFrameContext.inputFormat,
+      swsFrameContext.outputWidth,
+      swsFrameContext.outputHeight,
       AV_PIX_FMT_RGB24,
       SWS_BILINEAR,
       nullptr,