PyPI - torchcodec - Versions diffs - 0.8.0__cp313-cp313-macosx_12_0_arm64.whl - Mend

torchcodec 0.8.0__cp313-cp313-macosx_12_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of torchcodec might be problematic. Click here for more details.

Files changed (82)

torchcodec/.dylibs/libc++.1.0.dylib +0 -0
torchcodec/.dylibs/libpython3.13.dylib +0 -0
torchcodec/__init__.py +16 -0
torchcodec/_core/AVIOContextHolder.cpp +60 -0
torchcodec/_core/AVIOContextHolder.h +64 -0
torchcodec/_core/AVIOFileLikeContext.cpp +98 -0
torchcodec/_core/AVIOFileLikeContext.h +55 -0
torchcodec/_core/AVIOTensorContext.cpp +123 -0
torchcodec/_core/AVIOTensorContext.h +43 -0
torchcodec/_core/BetaCudaDeviceInterface.cpp +636 -0
torchcodec/_core/BetaCudaDeviceInterface.h +191 -0
torchcodec/_core/CMakeLists.txt +325 -0
torchcodec/_core/CUDACommon.cpp +315 -0
torchcodec/_core/CUDACommon.h +46 -0
torchcodec/_core/Cache.h +138 -0
torchcodec/_core/CpuDeviceInterface.cpp +347 -0
torchcodec/_core/CpuDeviceInterface.h +132 -0
torchcodec/_core/CudaDeviceInterface.cpp +357 -0
torchcodec/_core/CudaDeviceInterface.h +64 -0
torchcodec/_core/DeviceInterface.cpp +117 -0
torchcodec/_core/DeviceInterface.h +148 -0
torchcodec/_core/Encoder.cpp +807 -0
torchcodec/_core/Encoder.h +173 -0
torchcodec/_core/FFMPEGCommon.cpp +608 -0
torchcodec/_core/FFMPEGCommon.h +245 -0
torchcodec/_core/FilterGraph.cpp +149 -0
torchcodec/_core/FilterGraph.h +59 -0
torchcodec/_core/Frame.cpp +42 -0
torchcodec/_core/Frame.h +72 -0
torchcodec/_core/Metadata.h +72 -0
torchcodec/_core/NVDECCache.cpp +70 -0
torchcodec/_core/NVDECCache.h +104 -0
torchcodec/_core/SingleStreamDecoder.cpp +1719 -0
torchcodec/_core/SingleStreamDecoder.h +405 -0
torchcodec/_core/StreamOptions.h +63 -0
torchcodec/_core/Transform.cpp +60 -0
torchcodec/_core/Transform.h +59 -0
torchcodec/_core/ValidationUtils.cpp +35 -0
torchcodec/_core/ValidationUtils.h +21 -0
torchcodec/_core/__init__.py +41 -0
torchcodec/_core/_metadata.py +317 -0
torchcodec/_core/custom_ops.cpp +875 -0
torchcodec/_core/fetch_and_expose_non_gpl_ffmpeg_libs.cmake +360 -0
torchcodec/_core/nvcuvid_include/cuviddec.h +1374 -0
torchcodec/_core/nvcuvid_include/nvcuvid.h +610 -0
torchcodec/_core/ops.py +498 -0
torchcodec/_core/pybind_ops.cpp +50 -0
torchcodec/_frame.py +145 -0
torchcodec/_internally_replaced_utils.py +67 -0
torchcodec/_samplers/__init__.py +7 -0
torchcodec/_samplers/video_clip_sampler.py +418 -0
torchcodec/decoders/__init__.py +12 -0
torchcodec/decoders/_audio_decoder.py +177 -0
torchcodec/decoders/_decoder_utils.py +112 -0
torchcodec/decoders/_video_decoder.py +500 -0
torchcodec/encoders/__init__.py +1 -0
torchcodec/encoders/_audio_encoder.py +150 -0
torchcodec/libtorchcodec_core4.dylib +0 -0
torchcodec/libtorchcodec_core5.dylib +0 -0
torchcodec/libtorchcodec_core6.dylib +0 -0
torchcodec/libtorchcodec_core7.dylib +0 -0
torchcodec/libtorchcodec_core8.dylib +0 -0
torchcodec/libtorchcodec_custom_ops4.dylib +0 -0
torchcodec/libtorchcodec_custom_ops5.dylib +0 -0
torchcodec/libtorchcodec_custom_ops6.dylib +0 -0
torchcodec/libtorchcodec_custom_ops7.dylib +0 -0
torchcodec/libtorchcodec_custom_ops8.dylib +0 -0
torchcodec/libtorchcodec_pybind_ops4.so +0 -0
torchcodec/libtorchcodec_pybind_ops5.so +0 -0
torchcodec/libtorchcodec_pybind_ops6.so +0 -0
torchcodec/libtorchcodec_pybind_ops7.so +0 -0
torchcodec/libtorchcodec_pybind_ops8.so +0 -0
torchcodec/samplers/__init__.py +2 -0
torchcodec/samplers/_common.py +84 -0
torchcodec/samplers/_index_based.py +287 -0
torchcodec/samplers/_time_based.py +358 -0
torchcodec/version.py +2 -0
torchcodec-0.8.0.dist-info/METADATA +253 -0
torchcodec-0.8.0.dist-info/RECORD +82 -0
torchcodec-0.8.0.dist-info/WHEEL +5 -0
torchcodec-0.8.0.dist-info/licenses/LICENSE +28 -0
torchcodec-0.8.0.dist-info/top_level.txt +2 -0

torchcodec/_core/FFMPEGCommon.h ADDED Viewed

@@ -0,0 +1,245 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+#pragma once
+#include <memory>
+#include <stdexcept>
+#include <string>
+extern "C" {
+#include <libavcodec/avcodec.h>
+#include <libavcodec/bsf.h>
+#include <libavfilter/avfilter.h>
+#include <libavfilter/buffersrc.h>
+#include <libavformat/avformat.h>
+#include <libavformat/avio.h>
+#include <libavutil/audio_fifo.h>
+#include <libavutil/avutil.h>
+#include <libavutil/dict.h>
+#include <libavutil/display.h>
+#include <libavutil/file.h>
+#include <libavutil/opt.h>
+#include <libavutil/pixfmt.h>
+#include <libavutil/version.h>
+#include <libswresample/swresample.h>
+#include <libswscale/swscale.h>
+}
+namespace facebook::torchcodec {
+// FFMPEG uses special delete functions for some structures. These template
+// functions are used to pass into unique_ptr as custom deleters so we can
+// wrap FFMPEG structs with unique_ptrs for ease of use.
+// Deleter for free functions that take the address of the pointer (`T**`).
+// The return value of `Fn`, if any, is discarded.
+template <typename T, typename R, R (*Fn)(T**)>
+struct Deleterp {
+  inline void operator()(T* p) const {
+    // Nothing to release for a null pointer.
+    if (p == nullptr) {
+      return;
+    }
+    Fn(&p);
+  }
+};
+// Deleter for free functions declared as taking `void*` that are nevertheless
+// handed the address of the pointer (the call below passes `&p`).
+template <typename T, typename R, R (*Fn)(void*)>
+struct Deleterv {
+  inline void operator()(T* p) const {
+    // Null pointers are skipped; `Fn` is never invoked for them.
+    if (p != nullptr) {
+      Fn(&p);
+    }
+  }
+};
+// Deleter for free functions that take the pointer itself (`T*`).
+// The return value of `Fn`, if any, is discarded.
+template <typename T, typename R, R (*Fn)(T*)>
+struct Deleter {
+  inline void operator()(T* p) const {
+    // Nothing to release for a null pointer.
+    if (!p) {
+      return;
+    }
+    Fn(p);
+  }
+};
+// Unique pointers for FFMPEG structures; each deleter names its free function.
+using UniqueDecodingAVFormatContext = std::unique_ptr<
+    AVFormatContext,
+    Deleterp<AVFormatContext, void, avformat_close_input>>; // released via avformat_close_input(&ctx)
+using UniqueEncodingAVFormatContext = std::unique_ptr<
+    AVFormatContext,
+    Deleter<AVFormatContext, void, avformat_free_context>>; // released via avformat_free_context(ctx)
+using UniqueAVCodecContext = std::unique_ptr<
+    AVCodecContext,
+    Deleterp<AVCodecContext, void, avcodec_free_context>>;
+using UniqueAVFrame =
+    std::unique_ptr<AVFrame, Deleterp<AVFrame, void, av_frame_free>>;
+using UniqueAVFilterGraph = std::unique_ptr<
+    AVFilterGraph,
+    Deleterp<AVFilterGraph, void, avfilter_graph_free>>;
+using UniqueAVFilterInOut = std::unique_ptr<
+    AVFilterInOut,
+    Deleterp<AVFilterInOut, void, avfilter_inout_free>>;
+using UniqueAVIOContext = std::
+    unique_ptr<AVIOContext, Deleterp<AVIOContext, void, avio_context_free>>;
+using UniqueSwsContext =
+    std::unique_ptr<SwsContext, Deleter<SwsContext, void, sws_freeContext>>;
+using UniqueSwrContext =
+    std::unique_ptr<SwrContext, Deleterp<SwrContext, void, swr_free>>;
+using UniqueAVAudioFifo = std::
+    unique_ptr<AVAudioFifo, Deleter<AVAudioFifo, void, av_audio_fifo_free>>;
+using UniqueAVBSFContext =
+    std::unique_ptr<AVBSFContext, Deleterp<AVBSFContext, void, av_bsf_free>>;
+using UniqueAVBufferRef =
+    std::unique_ptr<AVBufferRef, Deleterp<AVBufferRef, void, av_buffer_unref>>; // destroying it drops one buffer reference
+using UniqueAVBufferSrcParameters = std::unique_ptr<
+    AVBufferSrcParameters,
+    Deleterv<AVBufferSrcParameters, void, av_freep>>; // av_freep receives the address of the pointer
+// These 2 classes share the same underlying AVPacket object. They are meant to
+// be used in tandem, like so:
+//
+// AutoAVPacket autoAVPacket; // <-- malloc for AVPacket happens here
+// while(...){
+//   ReferenceAVPacket packet(autoAVPacket);
+//   av_read_frame(..., packet.get());  <-- av_packet_ref() called by FFmpeg
+// } <-- av_packet_unref() called here
+//
+// This achieves a few desirable things:
+// - Memory allocation of the underlying AVPacket happens only once, when
+//   autoAVPacket is created.
+// - av_packet_free() is called when autoAVPacket gets out of scope
+// - av_packet_unref() is automatically called when needed, i.e. at the end of
+//   each loop iteration (or when hitting break / continue). This prevents the
+//   risk of us forgetting to call it.
+class AutoAVPacket { // owning half of the AutoAVPacket / ReferenceAVPacket pair
+  friend class ReferenceAVPacket; // grants ReferenceAVPacket access to avPacket_
+ private:
+  AVPacket* avPacket_; // per the usage note preceding this class, allocated once in the constructor
+ public:
+  AutoAVPacket();
+  AutoAVPacket(const AutoAVPacket& other) = delete; // non-copyable
+  AutoAVPacket& operator=(const AutoAVPacket& other) = delete;
+  ~AutoAVPacket(); // per the usage note preceding this class, calls av_packet_free()
+};
+class ReferenceAVPacket { // scoped, non-owning view of an AutoAVPacket's packet
+ private:
+  AVPacket* avPacket_; // presumably the packet of the AutoAVPacket passed to the constructor - confirm in the .cpp
+ public:
+  explicit ReferenceAVPacket(AutoAVPacket& shared);
+  ReferenceAVPacket(const ReferenceAVPacket& other) = delete; // non-copyable
+  ReferenceAVPacket& operator=(const ReferenceAVPacket& other) = delete;
+  ~ReferenceAVPacket(); // per the usage note preceding AutoAVPacket, calls av_packet_unref()
+  AVPacket* get(); // raw packet pointer, e.g. for av_read_frame()
+  AVPacket* operator->();
+};
+// av_find_best_stream is not const-correct before commit:
+// https://github.com/FFmpeg/FFmpeg/commit/46dac8cf3d250184ab4247809bc03f60e14f4c0c
+// which was released in FFMPEG version=5.0.3
+// with libavcodec's version=59.18.100
+// (https://www.ffmpeg.org/olddownload.html).
+// Note that the alias is so-named so that it is only used when interacting with
+// av_find_best_stream(). It is not needed elsewhere.
+#if LIBAVCODEC_VERSION_INT < AV_VERSION_INT(59, 18, 100) // non-const-correct API
+using AVCodecOnlyUseForCallingAVFindBestStream = AVCodec*;
+#else // libavcodec >= 59.18.100
+using AVCodecOnlyUseForCallingAVFindBestStream = const AVCodec*;
+#endif // LIBAVCODEC_VERSION_INT < AV_VERSION_INT(59, 18, 100)
+AVCodecOnlyUseForCallingAVFindBestStream
+makeAVCodecOnlyUseForCallingAVFindBestStream(const AVCodec* codec); // yields the active alias type; presumably drops const on old libavcodec - confirm in the .cpp
+// Success code from FFMPEG is just a 0. We define it to make the code more
+// readable. Declared `inline constexpr` so that every translation unit that
+// includes this header shares one definition instead of getting its own
+// internal-linkage copy.
+inline constexpr int AVSUCCESS = 0;
+// Returns the FFMPEG error as a string using the provided `errorCode`.
+std::string getFFMPEGErrorStringFromErrorCode(int errorCode);
+// Returns duration from the frame. Abstracted into a function because the
+// struct member representing duration has changed across the versions we
+// support.
+int64_t getDuration(const UniqueAVFrame& frame);
+void setDuration(const UniqueAVFrame& frame, int64_t duration); // setter counterpart of getDuration
+const int* getSupportedSampleRates(const AVCodec& avCodec); // NOTE(review): raw array pointer; termination and nullability are not visible here - confirm in the .cpp
+const AVSampleFormat* getSupportedOutputSampleFormats(const AVCodec& avCodec); // NOTE(review): same caveat as getSupportedSampleRates
+const AVPixelFormat* getSupportedPixelFormats(const AVCodec& avCodec); // NOTE(review): same caveat as getSupportedSampleRates
+int getNumChannels(const UniqueAVFrame& avFrame);
+int getNumChannels(const UniqueAVCodecContext& avCodecContext);
+void setDefaultChannelLayout(
+    UniqueAVCodecContext& avCodecContext,
+    int numChannels);
+void setDefaultChannelLayout(UniqueAVFrame& avFrame, int numChannels);
+void validateNumChannels(const AVCodec& avCodec, int numChannels); // NOTE(review): returns void, so presumably reports failure by throwing - confirm
+void setChannelLayout(
+    UniqueAVFrame& dstAVFrame,
+    const UniqueAVFrame& srcAVFrame,
+    int desiredNumChannels);
+UniqueAVFrame allocateAVFrame(
+    int numSamples,
+    int sampleRate,
+    int numChannels,
+    AVSampleFormat sampleFormat);
+SwrContext* createSwrContext( // NOTE(review): returns a raw pointer; presumably meant to be wrapped in UniqueSwrContext by the caller - confirm
+    AVSampleFormat srcSampleFormat,
+    AVSampleFormat desiredSampleFormat,
+    int srcSampleRate,
+    int desiredSampleRate,
+    const UniqueAVFrame& srcAVFrame,
+    int desiredNumChannels);
+// Converts, if needed:
+// - sample format
+// - sample rate
+// - number of channels.
+// createSwrContext must have been previously called with matching parameters.
+UniqueAVFrame convertAudioAVFrameSamples(
+    const UniqueSwrContext& swrContext, // the context obtained from createSwrContext
+    const UniqueAVFrame& srcAVFrame,
+    AVSampleFormat desiredSampleFormat,
+    int desiredSampleRate,
+    int desiredNumChannels);
+// Returns true if sws_scale can handle unaligned data.
+bool canSwsScaleHandleUnalignedData();
+void setFFmpegLogLevel(); // NOTE(review): takes no argument, so the level must come from elsewhere - confirm the source in the .cpp
+// These signatures are defined by FFmpeg.
+using AVIOReadFunction = int (*)(void*, uint8_t*, int);
+using AVIOWriteFunction = int (*)(void*, const uint8_t*, int); // FFmpeg >= 7
+using AVIOWriteFunctionOld = int (*)(void*, uint8_t*, int); // FFmpeg < 7
+using AVIOSeekFunction = int64_t (*)(void*, int64_t, int);
+AVIOContext* avioAllocContext( // NOTE(review): returns a raw pointer; presumably meant for UniqueAVIOContext - confirm
+    uint8_t* buffer,
+    int buffer_size,
+    int write_flag,
+    void* opaque,
+    AVIOReadFunction read_packet,
+    AVIOWriteFunction write_packet, // always the FFmpeg >= 7 signature; presumably adapted for older FFmpeg in the .cpp - confirm
+    AVIOSeekFunction seek);
+double ptsToSeconds(int64_t pts, const AVRational& timeBase);
+int64_t secondsToClosestPts(double seconds, const AVRational& timeBase);
+int64_t computeSafeDuration(
+    const AVRational& frameRate,
+    const AVRational& timeBase);
+AVFilterContext* createBuffersinkFilter( // FilterGraph's constructor treats a nullptr result as failure
+    AVFilterGraph* filterGraph,
+    enum AVPixelFormat outputFormat);
+} // namespace facebook::torchcodec

torchcodec/_core/FilterGraph.cpp ADDED Viewed

@@ -0,0 +1,149 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+#include "src/torchcodec/_core/FilterGraph.h"
+#include "src/torchcodec/_core/FFMPEGCommon.h"
+extern "C" {
+#include <libavfilter/buffersink.h>
+#include <libavfilter/buffersrc.h>
+}
+namespace facebook::torchcodec {
+FiltersContext::FiltersContext(
+    int inputWidth,
+    int inputHeight,
+    AVPixelFormat inputFormat,
+    AVRational inputAspectRatio,
+    int outputWidth,
+    int outputHeight,
+    AVPixelFormat outputFormat,
+    const std::string& filtergraphStr,
+    AVRational timeBase,
+    AVBufferRef* hwFramesCtx)
+    : inputWidth(inputWidth),
+      inputHeight(inputHeight),
+      inputFormat(inputFormat),
+      inputAspectRatio(inputAspectRatio),
+      outputWidth(outputWidth),
+      outputHeight(outputHeight),
+      outputFormat(outputFormat),
+      filtergraphStr(filtergraphStr),
+      timeBase(timeBase),
+      hwFramesCtx(hwFramesCtx) {}
+// Field-wise equality for AVRational. No reduction is performed, so 1/2 and
+// 2/4 compare unequal.
+bool operator==(const AVRational& lhs, const AVRational& rhs) {
+  const bool sameNumerator = lhs.num == rhs.num;
+  const bool sameDenominator = lhs.den == rhs.den;
+  return sameNumerator && sameDenominator;
+}
+// Two contexts are equal when every setting handed to the filter graph
+// matches. `inputAspectRatio` is part of the comparison because the buffer
+// source is configured with it; leaving it out would let a graph built for a
+// different sample aspect ratio be treated as reusable. `hwFramesCtx` is
+// compared by pointer identity, not by content.
+bool FiltersContext::operator==(const FiltersContext& other) const {
+  return inputWidth == other.inputWidth && inputHeight == other.inputHeight &&
+      inputFormat == other.inputFormat &&
+      inputAspectRatio == other.inputAspectRatio &&
+      outputWidth == other.outputWidth &&
+      outputHeight == other.outputHeight &&
+      outputFormat == other.outputFormat &&
+      filtergraphStr == other.filtergraphStr && timeBase == other.timeBase &&
+      hwFramesCtx.get() == other.hwFramesCtx.get();
+}
+// Exact negation of FiltersContext::operator==.
+bool FiltersContext::operator!=(const FiltersContext& other) const {
+  const bool equal = operator==(other);
+  return !equal;
+}
+// Builds and configures a libavfilter graph of the form
+//   buffer source ("in") -> filtersContext.filtergraphStr -> buffersink ("out")
+// The source is configured from the input fields of `filtersContext`, the sink
+// from `filtersContext.outputFormat`. When set,
+// `videoStreamOptions.ffmpegThreadCount` becomes the graph's thread count.
+// Any allocation or FFmpeg failure is reported through TORCH_CHECK.
+FilterGraph::FilterGraph(
+    const FiltersContext& filtersContext,
+    const VideoStreamOptions& videoStreamOptions) {
+  filterGraph_.reset(avfilter_graph_alloc());
+  TORCH_CHECK(filterGraph_.get() != nullptr);
+  if (videoStreamOptions.ffmpegThreadCount.has_value()) {
+    filterGraph_->nb_threads = videoStreamOptions.ffmpegThreadCount.value();
+  }
+  const AVFilter* buffersrc = avfilter_get_by_name("buffer");
+  TORCH_CHECK(buffersrc != nullptr, "Failed to find the buffer source filter");
+  UniqueAVBufferSrcParameters srcParams(av_buffersrc_parameters_alloc());
+  TORCH_CHECK(srcParams, "Failed to allocate buffersrc params");
+  srcParams->format = filtersContext.inputFormat;
+  srcParams->width = filtersContext.inputWidth;
+  srcParams->height = filtersContext.inputHeight;
+  srcParams->sample_aspect_ratio = filtersContext.inputAspectRatio;
+  srcParams->time_base = filtersContext.timeBase;
+  if (filtersContext.hwFramesCtx) {
+    // Lend the pointer instead of creating a new reference:
+    // av_buffersrc_parameters_set() takes its own reference to hw_frames_ctx,
+    // and srcParams is released with av_freep(), which does not unreference
+    // the buffer. An av_buffer_ref() here would therefore never be released.
+    srcParams->hw_frames_ctx = filtersContext.hwFramesCtx.get();
+  }
+  sourceContext_ =
+      avfilter_graph_alloc_filter(filterGraph_.get(), buffersrc, "in");
+  TORCH_CHECK(sourceContext_, "Failed to allocate filter graph");
+  int status = av_buffersrc_parameters_set(sourceContext_, srcParams.get());
+  TORCH_CHECK(
+      status >= 0,
+      "Failed to create filter graph: ",
+      getFFMPEGErrorStringFromErrorCode(status));
+  status = avfilter_init_str(sourceContext_, nullptr);
+  TORCH_CHECK(
+      status >= 0,
+      "Failed to create filter graph : ",
+      getFFMPEGErrorStringFromErrorCode(status));
+  sinkContext_ =
+      createBuffersinkFilter(filterGraph_.get(), filtersContext.outputFormat);
+  TORCH_CHECK(
+      sinkContext_ != nullptr, "Failed to create and configure buffersink");
+  // `outputs` describes the open output pad of our source ("in"); `inputs`
+  // describes the open input pad of our sink ("out"). The parsed filter string
+  // is linked between the two.
+  UniqueAVFilterInOut outputs(avfilter_inout_alloc());
+  UniqueAVFilterInOut inputs(avfilter_inout_alloc());
+  TORCH_CHECK(
+      outputs && inputs, "Failed to allocate filter graph inputs/outputs");
+  outputs->name = av_strdup("in");
+  outputs->filter_ctx = sourceContext_;
+  outputs->pad_idx = 0;
+  outputs->next = nullptr;
+  inputs->name = av_strdup("out");
+  inputs->filter_ctx = sinkContext_;
+  inputs->pad_idx = 0;
+  inputs->next = nullptr;
+  TORCH_CHECK(
+      outputs->name != nullptr && inputs->name != nullptr,
+      "Failed to allocate filter graph pad names");
+  // avfilter_graph_parse_ptr() may replace the lists it is given, so hand it
+  // raw pointers and re-adopt whatever it leaves behind.
+  AVFilterInOut* outputsTmp = outputs.release();
+  AVFilterInOut* inputsTmp = inputs.release();
+  status = avfilter_graph_parse_ptr(
+      filterGraph_.get(),
+      filtersContext.filtergraphStr.c_str(),
+      &inputsTmp,
+      &outputsTmp,
+      nullptr);
+  outputs.reset(outputsTmp);
+  inputs.reset(inputsTmp);
+  TORCH_CHECK(
+      status >= 0,
+      "Failed to parse filter description: ",
+      getFFMPEGErrorStringFromErrorCode(status),
+      ", provided filters: " + filtersContext.filtergraphStr);
+  status = avfilter_graph_config(filterGraph_.get(), nullptr);
+  TORCH_CHECK(
+      status >= 0,
+      "Failed to configure filter graph: ",
+      getFFMPEGErrorStringFromErrorCode(status));
+}
+// Pushes `avFrame` into the graph's buffer source and pulls one filtered
+// frame from the buffersink. Returns the filtered frame; reports failures
+// through TORCH_CHECK, including the FFmpeg error string.
+UniqueAVFrame FilterGraph::convert(const UniqueAVFrame& avFrame) {
+  int status = av_buffersrc_write_frame(sourceContext_, avFrame.get());
+  TORCH_CHECK(
+      status >= AVSUCCESS,
+      "Failed to add frame to buffer source context: ",
+      getFFMPEGErrorStringFromErrorCode(status));
+  UniqueAVFrame filteredAVFrame(av_frame_alloc());
+  // av_frame_alloc() returns nullptr on allocation failure; do not pass that
+  // on to the buffersink.
+  TORCH_CHECK(
+      filteredAVFrame != nullptr, "Failed to allocate filtered AVFrame");
+  status = av_buffersink_get_frame(sinkContext_, filteredAVFrame.get());
+  TORCH_CHECK(
+      status >= AVSUCCESS,
+      "Failed to get frame from buffer sink context: ",
+      getFFMPEGErrorStringFromErrorCode(status));
+  return filteredAVFrame;
+}
+} // namespace facebook::torchcodec

torchcodec/_core/FilterGraph.h ADDED Viewed

@@ -0,0 +1,59 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+#pragma once
+#include "src/torchcodec/_core/FFMPEGCommon.h"
+#include "src/torchcodec/_core/StreamOptions.h"
+namespace facebook::torchcodec {
+struct FiltersContext { // settings from which FilterGraph builds its graph
+  int inputWidth = 0;
+  int inputHeight = 0;
+  AVPixelFormat inputFormat = AV_PIX_FMT_NONE;
+  AVRational inputAspectRatio = {0, 0};
+  int outputWidth = 0;
+  int outputHeight = 0;
+  AVPixelFormat outputFormat = AV_PIX_FMT_NONE;
+  std::string filtergraphStr; // filter description parsed by FilterGraph's constructor
+  AVRational timeBase = {0, 0};
+  UniqueAVBufferRef hwFramesCtx; // owning smart pointer; makes this struct move-only
+  FiltersContext() = default;
+  FiltersContext(FiltersContext&&) = default;
+  FiltersContext& operator=(FiltersContext&&) = default;
+  FiltersContext(
+      int inputWidth,
+      int inputHeight,
+      AVPixelFormat inputFormat,
+      AVRational inputAspectRatio,
+      int outputWidth,
+      int outputHeight,
+      AVPixelFormat outputFormat,
+      const std::string& filtergraphStr,
+      AVRational timeBase,
+      AVBufferRef* hwFramesCtx = nullptr); // stored in the owning hwFramesCtx member, so the passed reference is taken over
+  bool operator==(const FiltersContext&) const;
+  bool operator!=(const FiltersContext&) const;
+};
+class FilterGraph { // wraps one configured libavfilter graph
+ public:
+  FilterGraph(
+      const FiltersContext& filtersContext,
+      const VideoStreamOptions& videoStreamOptions);
+  UniqueAVFrame convert(const UniqueAVFrame& avFrame); // runs one frame through the graph and returns the filtered frame
+ private:
+  UniqueAVFilterGraph filterGraph_; // owns the graph
+  AVFilterContext* sourceContext_ = nullptr; // buffer source, allocated inside filterGraph_; never freed directly here
+  AVFilterContext* sinkContext_ = nullptr; // buffersink obtained from createBuffersinkFilter(); never freed directly here
+};
+} // namespace facebook::torchcodec

torchcodec/_core/Frame.cpp ADDED Viewed

@@ -0,0 +1,42 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+#include "src/torchcodec/_core/Frame.h"
+namespace facebook::torchcodec {
+// Allocates uninitialized storage for a batch of `numFrames` frames: two 1D
+// float64 tensors of length `numFrames` for pts and duration, plus the frame
+// data tensor obtained from allocateEmptyHWCTensor().
+// NOTE(review): allocateEmptyHWCTensor() takes std::optional<int>, so the
+// int64_t `numFrames` is implicitly narrowed to int in the call below.
+FrameBatchOutput::FrameBatchOutput(
+    int64_t numFrames,
+    const FrameDims& outputDims,
+    const torch::Device& device)
+    : ptsSeconds(torch::empty(
+          {numFrames},
+          torch::TensorOptions().dtype(torch::kFloat64))),
+      durationSeconds(torch::empty(
+          {numFrames},
+          torch::TensorOptions().dtype(torch::kFloat64))) {
+  data = allocateEmptyHWCTensor(outputDims, device, numFrames);
+}
+// Returns an uninitialized uint8 strided tensor on `device`, shaped
+// (height, width, 3), or (numFrames, height, width, 3) when `numFrames` is
+// set. Fails through TORCH_CHECK when height or width is not positive, or
+// when `numFrames` is negative.
+torch::Tensor allocateEmptyHWCTensor(
+    const FrameDims& frameDims,
+    const torch::Device& device,
+    std::optional<int> numFrames) {
+  const auto options = torch::TensorOptions()
+                           .dtype(torch::kUInt8)
+                           .layout(torch::kStrided)
+                           .device(device);
+  const int height = frameDims.height;
+  const int width = frameDims.width;
+  TORCH_CHECK(height > 0, "height must be > 0, got: ", height);
+  TORCH_CHECK(width > 0, "width must be > 0, got: ", width);
+  // Single frame: no leading batch dimension.
+  if (!numFrames.has_value()) {
+    return torch::empty({height, width, 3}, options);
+  }
+  const int batchSize = *numFrames;
+  TORCH_CHECK(batchSize >= 0, "numFrames must be >= 0, got: ", batchSize);
+  return torch::empty({batchSize, height, width, 3}, options);
+}
+} // namespace facebook::torchcodec

torchcodec/_core/Frame.h ADDED Viewed

@@ -0,0 +1,72 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+#pragma once
+#include <torch/types.h>
+#include "src/torchcodec/_core/FFMPEGCommon.h"
+#include "src/torchcodec/_core/Metadata.h"
+#include "src/torchcodec/_core/StreamOptions.h"
+namespace facebook::torchcodec {
+// Height/width pair describing frame dimensions. Both default to 0.
+struct FrameDims {
+  int height{0};
+  int width{0};
+  FrameDims() = default;
+  // Note the argument order: height first, then width.
+  FrameDims(int h, int w) : height{h}, width{w} {}
+};
+// All public video decoding entry points return either a FrameOutput or a
+// FrameBatchOutput.
+// They are the equivalent of the user-facing Frame and FrameBatch classes in
+// Python. They contain RGB decoded frames along with some associated data
+// like PTS and duration.
+// FrameOutput is also relevant for audio decoding, typically as the output of
+// getNextFrame(), or as a temporary output variable.
+struct FrameOutput {
+  // data shape is:
+  // - 3D (C, H, W) or (H, W, C) for videos
+  // - 2D (numChannels, numSamples) for audio
+  torch::Tensor data;
+  // Zero-initialized so that a default-constructed FrameOutput never exposes
+  // indeterminate values. The struct remains an aggregate.
+  double ptsSeconds = 0.0;
+  double durationSeconds = 0.0;
+};
+struct FrameBatchOutput { // batch counterpart of FrameOutput
+  torch::Tensor data; // 4D: of shape NCHW or NHWC.
+  torch::Tensor ptsSeconds; // 1D of shape (N,)
+  torch::Tensor durationSeconds; // 1D of shape (N,)
+  FrameBatchOutput( // no default constructor is declared, so all three arguments are required
+      int64_t numFrames,
+      const FrameDims& outputDims,
+      const torch::Device& device);
+};
+struct AudioFramesOutput {
+  torch::Tensor data; // shape is (numChannels, numSamples)
+  // Zero-initialized so that a default-constructed AudioFramesOutput never
+  // exposes an indeterminate value. The struct remains an aggregate.
+  double ptsSeconds = 0.0;
+};
+// --------------------------------------------------------------------------
+// FRAME TENSOR ALLOCATION APIs
+// --------------------------------------------------------------------------
+// Note [Frame Tensor allocation]
+//
+// We always allocate [N]HWC tensors. The low-level decoding functions all
+// assume HWC tensors, since this is what FFmpeg natively handles. It's up to
+// the high-level decoding entry-points to permute that back to CHW, by calling
+// maybePermuteHWC2CHW(). The optional N dimension is controlled by numFrames.
+torch::Tensor allocateEmptyHWCTensor(
+    const FrameDims& frameDims,
+    const torch::Device& device,
+    std::optional<int> numFrames = std::nullopt); // unset: single HWC frame; set: NHWC batch of that many frames
+} // namespace facebook::torchcodec

torchcodec/_core/Metadata.h ADDED Viewed

@@ -0,0 +1,72 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// All rights reserved.
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+#pragma once
+#include <optional>
+#include <string>
+#include <vector>
+extern "C" {
+#include <libavcodec/avcodec.h>
+#include <libavutil/avutil.h>
+#include <libavutil/rational.h>
+}
+namespace facebook::torchcodec {
+// Per-stream metadata. Every field except `streamIndex` and `mediaType` is
+// optional and stays empty until a value is known.
+struct StreamMetadata {
+  // Common (video and audio) fields derived from the AVStream.
+  // -1 marks "not set yet": 0 is a valid stream index, so it cannot serve as
+  // the default.
+  int streamIndex = -1;
+  // See this link for what various values are available:
+  // https://ffmpeg.org/doxygen/trunk/group__lavu__misc.html#ga9a84bba4713dfced21a1a56163be1f48
+  // Defaults to AVMEDIA_TYPE_UNKNOWN instead of being left uninitialized.
+  AVMediaType mediaType = AVMEDIA_TYPE_UNKNOWN;
+  std::optional<AVCodecID> codecId;
+  std::optional<std::string> codecName;
+  std::optional<double> durationSecondsFromHeader;
+  std::optional<double> beginStreamSecondsFromHeader;
+  std::optional<int64_t> numFramesFromHeader;
+  std::optional<int64_t> numKeyFrames;
+  std::optional<double> averageFpsFromHeader;
+  std::optional<double> bitRate;
+  // More accurate duration, obtained by scanning the file.
+  // These presentation timestamps are in time base.
+  std::optional<int64_t> beginStreamPtsFromContent;
+  std::optional<int64_t> endStreamPtsFromContent;
+  // These presentation timestamps are in seconds.
+  std::optional<double> beginStreamPtsSecondsFromContent;
+  std::optional<double> endStreamPtsSecondsFromContent;
+  // This can be useful for index-based seeking.
+  std::optional<int64_t> numFramesFromContent;
+  // Video-only fields derived from the AVCodecContext.
+  std::optional<int> width;
+  std::optional<int> height;
+  std::optional<AVRational> sampleAspectRatio;
+  // Audio-only fields
+  std::optional<int64_t> sampleRate;
+  std::optional<int64_t> numChannels;
+  std::optional<std::string> sampleFormat;
+};
+struct ContainerMetadata { // container-level metadata plus the metadata of each stream
+  std::vector<StreamMetadata> allStreamMetadata; // NOTE(review): presumably one entry per stream, indexed by stream index - confirm
+  int numAudioStreams = 0;
+  int numVideoStreams = 0;
+  // Note that this is the container-level duration, which is usually the max
+  // of all stream durations available in the container.
+  std::optional<double> durationSecondsFromHeader;
+  // Total BitRate level information at the container level in bit/s
+  std::optional<double> bitRate;
+  // If set, this is the index to the default audio stream.
+  std::optional<int> bestAudioStreamIndex;
+  // If set, this is the index to the default video stream.
+  std::optional<int> bestVideoStreamIndex;
+};
+} // namespace facebook::torchcodec