torchcodec 0.7.0__cp312-cp312-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of torchcodec has been flagged as potentially problematic; consult the registry's advisory page for details.
- torchcodec/__init__.py +16 -0
- torchcodec/_core/AVIOContextHolder.cpp +60 -0
- torchcodec/_core/AVIOContextHolder.h +64 -0
- torchcodec/_core/AVIOFileLikeContext.cpp +98 -0
- torchcodec/_core/AVIOFileLikeContext.h +55 -0
- torchcodec/_core/AVIOTensorContext.cpp +123 -0
- torchcodec/_core/AVIOTensorContext.h +43 -0
- torchcodec/_core/CMakeLists.txt +292 -0
- torchcodec/_core/Cache.h +138 -0
- torchcodec/_core/CpuDeviceInterface.cpp +266 -0
- torchcodec/_core/CpuDeviceInterface.h +70 -0
- torchcodec/_core/CudaDeviceInterface.cpp +514 -0
- torchcodec/_core/CudaDeviceInterface.h +37 -0
- torchcodec/_core/DeviceInterface.cpp +79 -0
- torchcodec/_core/DeviceInterface.h +67 -0
- torchcodec/_core/Encoder.cpp +514 -0
- torchcodec/_core/Encoder.h +123 -0
- torchcodec/_core/FFMPEGCommon.cpp +421 -0
- torchcodec/_core/FFMPEGCommon.h +227 -0
- torchcodec/_core/FilterGraph.cpp +142 -0
- torchcodec/_core/FilterGraph.h +45 -0
- torchcodec/_core/Frame.cpp +32 -0
- torchcodec/_core/Frame.h +118 -0
- torchcodec/_core/Metadata.h +72 -0
- torchcodec/_core/SingleStreamDecoder.cpp +1715 -0
- torchcodec/_core/SingleStreamDecoder.h +380 -0
- torchcodec/_core/StreamOptions.h +53 -0
- torchcodec/_core/ValidationUtils.cpp +35 -0
- torchcodec/_core/ValidationUtils.h +21 -0
- torchcodec/_core/__init__.py +40 -0
- torchcodec/_core/_metadata.py +317 -0
- torchcodec/_core/custom_ops.cpp +727 -0
- torchcodec/_core/fetch_and_expose_non_gpl_ffmpeg_libs.cmake +300 -0
- torchcodec/_core/ops.py +455 -0
- torchcodec/_core/pybind_ops.cpp +87 -0
- torchcodec/_frame.py +145 -0
- torchcodec/_internally_replaced_utils.py +67 -0
- torchcodec/_samplers/__init__.py +7 -0
- torchcodec/_samplers/video_clip_sampler.py +430 -0
- torchcodec/decoders/__init__.py +11 -0
- torchcodec/decoders/_audio_decoder.py +177 -0
- torchcodec/decoders/_decoder_utils.py +52 -0
- torchcodec/decoders/_video_decoder.py +464 -0
- torchcodec/encoders/__init__.py +1 -0
- torchcodec/encoders/_audio_encoder.py +150 -0
- torchcodec/libtorchcodec_core4.dll +0 -0
- torchcodec/libtorchcodec_core5.dll +0 -0
- torchcodec/libtorchcodec_core6.dll +0 -0
- torchcodec/libtorchcodec_core7.dll +0 -0
- torchcodec/libtorchcodec_custom_ops4.dll +0 -0
- torchcodec/libtorchcodec_custom_ops5.dll +0 -0
- torchcodec/libtorchcodec_custom_ops6.dll +0 -0
- torchcodec/libtorchcodec_custom_ops7.dll +0 -0
- torchcodec/libtorchcodec_pybind_ops4.pyd +0 -0
- torchcodec/libtorchcodec_pybind_ops5.pyd +0 -0
- torchcodec/libtorchcodec_pybind_ops6.pyd +0 -0
- torchcodec/libtorchcodec_pybind_ops7.pyd +0 -0
- torchcodec/samplers/__init__.py +2 -0
- torchcodec/samplers/_common.py +84 -0
- torchcodec/samplers/_index_based.py +287 -0
- torchcodec/samplers/_time_based.py +350 -0
- torchcodec/version.py +2 -0
- torchcodec-0.7.0.dist-info/METADATA +242 -0
- torchcodec-0.7.0.dist-info/RECORD +67 -0
- torchcodec-0.7.0.dist-info/WHEEL +5 -0
- torchcodec-0.7.0.dist-info/licenses/LICENSE +28 -0
- torchcodec-0.7.0.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
// Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
2
|
+
// All rights reserved.
|
|
3
|
+
//
|
|
4
|
+
// This source code is licensed under the BSD-style license found in the
|
|
5
|
+
// LICENSE file in the root directory of this source tree.
|
|
6
|
+
|
|
7
|
+
#pragma once
|
|
8
|
+
|
|
9
|
+
#include <memory>
|
|
10
|
+
#include <stdexcept>
|
|
11
|
+
#include <string>
|
|
12
|
+
|
|
13
|
+
extern "C" {
|
|
14
|
+
#include <libavcodec/avcodec.h>
|
|
15
|
+
#include <libavfilter/avfilter.h>
|
|
16
|
+
#include <libavfilter/buffersrc.h>
|
|
17
|
+
#include <libavformat/avformat.h>
|
|
18
|
+
#include <libavformat/avio.h>
|
|
19
|
+
#include <libavutil/audio_fifo.h>
|
|
20
|
+
#include <libavutil/avutil.h>
|
|
21
|
+
#include <libavutil/dict.h>
|
|
22
|
+
#include <libavutil/display.h>
|
|
23
|
+
#include <libavutil/file.h>
|
|
24
|
+
#include <libavutil/opt.h>
|
|
25
|
+
#include <libavutil/pixfmt.h>
|
|
26
|
+
#include <libavutil/version.h>
|
|
27
|
+
#include <libswresample/swresample.h>
|
|
28
|
+
#include <libswscale/swscale.h>
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
namespace facebook::torchcodec {
|
|
32
|
+
|
|
33
|
+
// FFMPEG uses special delete functions for some structures. These template
|
|
34
|
+
// functions are used to pass into unique_ptr as custom deleters so we can
|
|
35
|
+
// wrap FFMPEG structs with unique_ptrs for ease of use.
|
|
36
|
+
// Deleter for FFmpeg free functions of the form `void foo_free(T**)`:
// Fn is called with the address of the pointer so FFmpeg can both release
// the object and null out the pointer (e.g. av_frame_free).
template <typename T, typename R, R (*Fn)(T**)>
struct Deleterp {
  inline void operator()(T* raw) const {
    if (raw != nullptr) {
      Fn(&raw);
    }
  }
};
|
|
44
|
+
|
|
45
|
+
// Deleter for FFmpeg free functions declared as taking `void*` but which
// semantically expect the address of the pointer (e.g. av_freep): we pass
// &raw, which implicitly converts T** to void*.
template <typename T, typename R, R (*Fn)(void*)>
struct Deleterv {
  inline void operator()(T* raw) const {
    if (raw != nullptr) {
      Fn(&raw);
    }
  }
};
|
|
53
|
+
|
|
54
|
+
// Deleter for FFmpeg free functions of the form `R foo_free(T*)` that take
// the object pointer directly (e.g. avformat_free_context). The caller's
// pointer is not reset, which is fine inside a unique_ptr.
template <typename T, typename R, R (*Fn)(T*)>
struct Deleter {
  inline void operator()(T* raw) const {
    if (raw != nullptr) {
      Fn(raw);
    }
  }
};
|
|
62
|
+
|
|
63
|
+
// Unique pointers for FFMPEG structures.
// Each alias pairs an FFmpeg struct with the matching deleter template above,
// based on the signature of the FFmpeg free function.
// Note: decoding and encoding AVFormatContexts are freed differently
// (avformat_close_input vs avformat_free_context), hence two aliases.
using UniqueDecodingAVFormatContext = std::unique_ptr<
    AVFormatContext,
    Deleterp<AVFormatContext, void, avformat_close_input>>;
using UniqueEncodingAVFormatContext = std::unique_ptr<
    AVFormatContext,
    Deleter<AVFormatContext, void, avformat_free_context>>;
using UniqueAVCodecContext = std::unique_ptr<
    AVCodecContext,
    Deleterp<AVCodecContext, void, avcodec_free_context>>;
using UniqueAVFrame =
    std::unique_ptr<AVFrame, Deleterp<AVFrame, void, av_frame_free>>;
using UniqueAVFilterGraph = std::unique_ptr<
    AVFilterGraph,
    Deleterp<AVFilterGraph, void, avfilter_graph_free>>;
using UniqueAVFilterInOut = std::unique_ptr<
    AVFilterInOut,
    Deleterp<AVFilterInOut, void, avfilter_inout_free>>;
using UniqueAVIOContext = std::
    unique_ptr<AVIOContext, Deleterp<AVIOContext, void, avio_context_free>>;
using UniqueSwsContext =
    std::unique_ptr<SwsContext, Deleter<SwsContext, void, sws_freeContext>>;
using UniqueSwrContext =
    std::unique_ptr<SwrContext, Deleterp<SwrContext, void, swr_free>>;
using UniqueAVAudioFifo = std::
    unique_ptr<AVAudioFifo, Deleter<AVAudioFifo, void, av_audio_fifo_free>>;
using UniqueAVBufferRef =
    std::unique_ptr<AVBufferRef, Deleterp<AVBufferRef, void, av_buffer_unref>>;
// av_buffersrc_parameters_alloc() is documented to be freed with av_free/
// av_freep; Deleterv passes the pointer's address as void*.
using UniqueAVBufferSrcParameters = std::unique_ptr<
    AVBufferSrcParameters,
    Deleterv<AVBufferSrcParameters, void, av_freep>>;
|
|
94
|
+
|
|
95
|
+
// These 2 classes share the same underlying AVPacket object. They are meant to
|
|
96
|
+
// be used in tandem, like so:
|
|
97
|
+
//
|
|
98
|
+
// AutoAVPacket autoAVPacket; // <-- malloc for AVPacket happens here
|
|
99
|
+
// while(...){
|
|
100
|
+
// ReferenceAVPacket packet(autoAVPacket);
|
|
101
|
+
// av_read_frame(..., packet.get()); <-- av_packet_ref() called by FFmpeg
|
|
102
|
+
// } <-- av_packet_unref() called here
|
|
103
|
+
//
|
|
104
|
+
// This achieves a few desirable things:
|
|
105
|
+
// - Memory allocation of the underlying AVPacket happens only once, when
|
|
106
|
+
// autoAVPacket is created.
|
|
107
|
+
// - av_packet_free() is called when autoAVPacket gets out of scope
|
|
108
|
+
// - av_packet_unref() is automatically called when needed, i.e. at the end of
|
|
109
|
+
// each loop iteration (or when hitting break / continue). This prevents the
|
|
110
|
+
// risk of us forgetting to call it.
|
|
111
|
+
// Owns a heap-allocated AVPacket for its whole lifetime. Non-copyable.
// Only ReferenceAVPacket (a friend) may access the underlying packet; see the
// usage pattern described in the comment block above.
class AutoAVPacket {
  friend class ReferenceAVPacket;

 private:
  // Allocated once in the constructor, freed (av_packet_free) in the
  // destructor.
  AVPacket* avPacket_;

 public:
  AutoAVPacket();
  AutoAVPacket(const AutoAVPacket& other) = delete;
  AutoAVPacket& operator=(const AutoAVPacket& other) = delete;
  ~AutoAVPacket();
};
|
|
123
|
+
|
|
124
|
+
// Scoped, non-owning view over an AutoAVPacket's AVPacket. Intended to live
// for one loop iteration: its destructor calls av_packet_unref() on the shared
// packet, so the packet's payload is always released even on break/continue.
// Non-copyable.
class ReferenceAVPacket {
 private:
  // Points at the AVPacket owned by the AutoAVPacket passed to the
  // constructor; never freed here, only unref'd.
  AVPacket* avPacket_;

 public:
  explicit ReferenceAVPacket(AutoAVPacket& shared);
  ReferenceAVPacket(const ReferenceAVPacket& other) = delete;
  ReferenceAVPacket& operator=(const ReferenceAVPacket& other) = delete;
  ~ReferenceAVPacket();
  // Raw access for FFmpeg APIs that take an AVPacket*.
  AVPacket* get();
  AVPacket* operator->();
};
|
|
136
|
+
|
|
137
|
+
// av_find_best_stream is not const-correct before commit:
|
|
138
|
+
// https://github.com/FFmpeg/FFmpeg/commit/46dac8cf3d250184ab4247809bc03f60e14f4c0c
|
|
139
|
+
// which was released in FFMPEG version=5.0.3
|
|
140
|
+
// with libavcodec's version=59.18.100
|
|
141
|
+
// (https://www.ffmpeg.org/olddownload.html).
|
|
142
|
+
// Note that the alias is so-named so that it is only used when interacting with
|
|
143
|
+
// av_find_best_stream(). It is not needed elsewhere.
|
|
144
|
+
#if LIBAVCODEC_VERSION_INT < AV_VERSION_INT(59, 18, 100)
using AVCodecOnlyUseForCallingAVFindBestStream = AVCodec*;
#else
using AVCodecOnlyUseForCallingAVFindBestStream = const AVCodec*;
#endif

// Casts away constness on pre-5.0.3 FFmpeg where av_find_best_stream is not
// const-correct (see the version note above); a no-op otherwise.
AVCodecOnlyUseForCallingAVFindBestStream
makeAVCodecOnlyUseForCallingAVFindBestStream(const AVCodec* codec);

// Success code from FFMPEG is just a 0. We define it to make the code more
// readable.
const int AVSUCCESS = 0;

// Returns the FFMPEG error as a string using the provided `errorCode`.
std::string getFFMPEGErrorStringFromErrorCode(int errorCode);

// Returns duration from the frame. Abstracted into a function because the
// struct member representing duration has changed across the versions we
// support.
int64_t getDuration(const UniqueAVFrame& frame);

// Channel-count accessors; abstracted because the channel-layout API changed
// across supported FFmpeg versions.
int getNumChannels(const UniqueAVFrame& avFrame);
int getNumChannels(const UniqueAVCodecContext& avCodecContext);

void setDefaultChannelLayout(
    UniqueAVCodecContext& avCodecContext,
    int numChannels);

void setDefaultChannelLayout(UniqueAVFrame& avFrame, int numChannels);

// Throws if `numChannels` is not supported by the given codec.
void validateNumChannels(const AVCodec& avCodec, int numChannels);

// Copies/derives the channel layout of srcAVFrame onto dstAVFrame, honoring
// desiredNumChannels.
void setChannelLayout(
    UniqueAVFrame& dstAVFrame,
    const UniqueAVFrame& srcAVFrame,
    int desiredNumChannels);

// Allocates an audio AVFrame with its sample buffer.
UniqueAVFrame allocateAVFrame(
    int numSamples,
    int sampleRate,
    int numChannels,
    AVSampleFormat sampleFormat);

// Returns a raw SwrContext; callers are expected to wrap it in a
// UniqueSwrContext.
SwrContext* createSwrContext(
    AVSampleFormat srcSampleFormat,
    AVSampleFormat desiredSampleFormat,
    int srcSampleRate,
    int desiredSampleRate,
    const UniqueAVFrame& srcAVFrame,
    int desiredNumChannels);

// Converts, if needed:
// - sample format
// - sample rate
// - number of channels.
// createSwrContext must have been previously called with matching parameters.
UniqueAVFrame convertAudioAVFrameSamples(
    const UniqueSwrContext& swrContext,
    const UniqueAVFrame& srcAVFrame,
    AVSampleFormat desiredSampleFormat,
    int desiredSampleRate,
    int desiredNumChannels);

// Returns true if sws_scale can handle unaligned data.
bool canSwsScaleHandleUnalignedData();

void setFFmpegLogLevel();

// These signatures are defined by FFmpeg.
using AVIOReadFunction = int (*)(void*, uint8_t*, int);
using AVIOWriteFunction = int (*)(void*, const uint8_t*, int); // FFmpeg >= 7
using AVIOWriteFunctionOld = int (*)(void*, uint8_t*, int); // FFmpeg < 7
using AVIOSeekFunction = int64_t (*)(void*, int64_t, int);

// Thin wrapper over avio_alloc_context() that papers over the write-callback
// signature change between FFmpeg major versions (see aliases above).
AVIOContext* avioAllocContext(
    uint8_t* buffer,
    int buffer_size,
    int write_flag,
    void* opaque,
    AVIOReadFunction read_packet,
    AVIOWriteFunction write_packet,
    AVIOSeekFunction seek);
|
|
226
|
+
|
|
227
|
+
} // namespace facebook::torchcodec
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
// Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
2
|
+
// All rights reserved.
|
|
3
|
+
//
|
|
4
|
+
// This source code is licensed under the BSD-style license found in the
|
|
5
|
+
// LICENSE file in the root directory of this source tree.
|
|
6
|
+
|
|
7
|
+
#include "src/torchcodec/_core/FilterGraph.h"
|
|
8
|
+
|
|
9
|
+
extern "C" {
|
|
10
|
+
#include <libavfilter/buffersink.h>
|
|
11
|
+
#include <libavfilter/buffersrc.h>
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
namespace facebook::torchcodec {
|
|
15
|
+
|
|
16
|
+
// Field-wise equality for AVRational. No reduction to lowest terms is
// attempted: {1, 2} and {2, 4} compare unequal.
bool operator==(const AVRational& lhs, const AVRational& rhs) {
  if (lhs.num != rhs.num) {
    return false;
  }
  return lhs.den == rhs.den;
}
|
|
19
|
+
|
|
20
|
+
// Two contexts are equal iff every configuration field matches. Note that
// hwFramesCtx is compared by raw pointer identity, not by contents.
bool FiltersContext::operator==(const FiltersContext& other) const {
  if (inputWidth != other.inputWidth) {
    return false;
  }
  if (inputHeight != other.inputHeight) {
    return false;
  }
  if (inputFormat != other.inputFormat) {
    return false;
  }
  if (outputWidth != other.outputWidth) {
    return false;
  }
  if (outputHeight != other.outputHeight) {
    return false;
  }
  if (outputFormat != other.outputFormat) {
    return false;
  }
  if (filtergraphStr != other.filtergraphStr) {
    return false;
  }
  // AVRational only has operator== defined (above), not operator!=.
  if (!(timeBase == other.timeBase)) {
    return false;
  }
  return hwFramesCtx.get() == other.hwFramesCtx.get();
}
|
|
28
|
+
|
|
29
|
+
// Defined as the negation of operator==.
bool FiltersContext::operator!=(const FiltersContext& other) const {
  const bool equal = (*this == other);
  return !equal;
}
|
|
32
|
+
|
|
33
|
+
// Builds and configures an FFmpeg filtergraph:
//   buffer (source, fed with decoded frames) -> <filtergraphStr> -> buffersink
// Throws (via TORCH_CHECK) on any FFmpeg failure.
FilterGraph::FilterGraph(
    const FiltersContext& filtersContext,
    const VideoStreamOptions& videoStreamOptions) {
  filterGraph_.reset(avfilter_graph_alloc());
  TORCH_CHECK(filterGraph_.get() != nullptr);

  // 0 means "let FFmpeg decide"; only override when the user asked for a
  // specific thread count.
  if (videoStreamOptions.ffmpegThreadCount.has_value()) {
    filterGraph_->nb_threads = videoStreamOptions.ffmpegThreadCount.value();
  }

  const AVFilter* buffersrc = avfilter_get_by_name("buffer");
  const AVFilter* buffersink = avfilter_get_by_name("buffersink");

  // Describe the input frames (size, pixel format, time base, ...) to the
  // buffersrc filter through an AVBufferSrcParameters struct.
  UniqueAVBufferSrcParameters srcParams(av_buffersrc_parameters_alloc());
  TORCH_CHECK(srcParams, "Failed to allocate buffersrc params");

  srcParams->format = filtersContext.inputFormat;
  srcParams->width = filtersContext.inputWidth;
  srcParams->height = filtersContext.inputHeight;
  srcParams->sample_aspect_ratio = filtersContext.inputAspectRatio;
  srcParams->time_base = filtersContext.timeBase;
  if (filtersContext.hwFramesCtx) {
    // NOTE(review): av_buffersrc_parameters_set() appears to take its own
    // reference to hw_frames_ctx, while srcParams is freed with av_freep,
    // which does not unref this one — possible AVBufferRef leak; TODO confirm
    // against the FFmpeg buffersrc documentation.
    srcParams->hw_frames_ctx = av_buffer_ref(filtersContext.hwFramesCtx.get());
  }

  sourceContext_ =
      avfilter_graph_alloc_filter(filterGraph_.get(), buffersrc, "in");
  TORCH_CHECK(sourceContext_, "Failed to allocate filter graph");

  int status = av_buffersrc_parameters_set(sourceContext_, srcParams.get());
  TORCH_CHECK(
      status >= 0,
      "Failed to create filter graph: ",
      getFFMPEGErrorStringFromErrorCode(status));

  // Finalize the buffersrc filter (no extra options beyond srcParams).
  status = avfilter_init_str(sourceContext_, nullptr);
  TORCH_CHECK(
      status >= 0,
      "Failed to create filter graph : ",
      getFFMPEGErrorStringFromErrorCode(status));

  status = avfilter_graph_create_filter(
      &sinkContext_, buffersink, "out", nullptr, nullptr, filterGraph_.get());
  TORCH_CHECK(
      status >= 0,
      "Failed to create filter graph: ",
      getFFMPEGErrorStringFromErrorCode(status));

  // Restrict the sink's accepted pixel formats to the single desired output
  // format; the AV_PIX_FMT_NONE sentinel terminates the list.
  enum AVPixelFormat pix_fmts[] = {
      filtersContext.outputFormat, AV_PIX_FMT_NONE};

  status = av_opt_set_int_list(
      sinkContext_,
      "pix_fmts",
      pix_fmts,
      AV_PIX_FMT_NONE,
      AV_OPT_SEARCH_CHILDREN);
  TORCH_CHECK(
      status >= 0,
      "Failed to set output pixel formats: ",
      getFFMPEGErrorStringFromErrorCode(status));

  // Wire the user-provided filter string between our "in" (buffersrc) and
  // "out" (buffersink) endpoints. Naming follows FFmpeg's convention:
  // `outputs` describes the graph's input side and vice versa.
  UniqueAVFilterInOut outputs(avfilter_inout_alloc());
  UniqueAVFilterInOut inputs(avfilter_inout_alloc());

  outputs->name = av_strdup("in");
  outputs->filter_ctx = sourceContext_;
  outputs->pad_idx = 0;
  outputs->next = nullptr;
  inputs->name = av_strdup("out");
  inputs->filter_ctx = sinkContext_;
  inputs->pad_idx = 0;
  inputs->next = nullptr;

  // avfilter_graph_parse_ptr may reassign/consume the in/out lists, so we
  // temporarily release ownership and re-wrap whatever pointers come back.
  AVFilterInOut* outputsTmp = outputs.release();
  AVFilterInOut* inputsTmp = inputs.release();
  status = avfilter_graph_parse_ptr(
      filterGraph_.get(),
      filtersContext.filtergraphStr.c_str(),
      &inputsTmp,
      &outputsTmp,
      nullptr);
  outputs.reset(outputsTmp);
  inputs.reset(inputsTmp);
  TORCH_CHECK(
      status >= 0,
      "Failed to parse filter description: ",
      getFFMPEGErrorStringFromErrorCode(status));

  // Validate links and formats across the whole graph.
  status = avfilter_graph_config(filterGraph_.get(), nullptr);
  TORCH_CHECK(
      status >= 0,
      "Failed to configure filter graph: ",
      getFFMPEGErrorStringFromErrorCode(status));
}
|
|
128
|
+
|
|
129
|
+
// Pushes avFrame through the configured filtergraph and returns the filtered
// frame. Throws (via TORCH_CHECK) on any FFmpeg failure.
//
// Assumes the graph produces exactly one output frame per input frame (true
// for the scaling / format-conversion graphs built by the constructor); a
// graph that buffers frames internally would make av_buffersink_get_frame()
// return AVERROR(EAGAIN), which is surfaced as a hard error here.
UniqueAVFrame FilterGraph::convert(const UniqueAVFrame& avFrame) {
  int status = av_buffersrc_write_frame(sourceContext_, avFrame.get());
  // Include the FFmpeg error string for actionable diagnostics, consistent
  // with the checks in the constructor.
  TORCH_CHECK(
      status >= AVSUCCESS,
      "Failed to add frame to buffer source context: ",
      getFFMPEGErrorStringFromErrorCode(status));

  UniqueAVFrame filteredAVFrame(av_frame_alloc());
  // av_frame_alloc() can fail; passing nullptr to av_buffersink_get_frame()
  // would be undefined behavior.
  TORCH_CHECK(
      filteredAVFrame.get() != nullptr, "Failed to allocate output AVFrame");
  status = av_buffersink_get_frame(sinkContext_, filteredAVFrame.get());
  TORCH_CHECK(
      status >= AVSUCCESS,
      "Failed to get frame from buffer sink context: ",
      getFFMPEGErrorStringFromErrorCode(status));

  return filteredAVFrame;
}
|
|
141
|
+
|
|
142
|
+
} // namespace facebook::torchcodec
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
// Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
2
|
+
// All rights reserved.
|
|
3
|
+
//
|
|
4
|
+
// This source code is licensed under the BSD-style license found in the
|
|
5
|
+
// LICENSE file in the root directory of this source tree.
|
|
6
|
+
|
|
7
|
+
#pragma once
|
|
8
|
+
|
|
9
|
+
#include "src/torchcodec/_core/FFMPEGCommon.h"
|
|
10
|
+
#include "src/torchcodec/_core/StreamOptions.h"
|
|
11
|
+
|
|
12
|
+
namespace facebook::torchcodec {
|
|
13
|
+
|
|
14
|
+
// Bundle of every parameter that determines a filtergraph's configuration.
// Equality comparison is used to decide whether a cached FilterGraph can be
// reused or must be rebuilt.
struct FiltersContext {
  // Properties of the frames fed into the graph.
  int inputWidth = 0;
  int inputHeight = 0;
  AVPixelFormat inputFormat = AV_PIX_FMT_NONE;
  AVRational inputAspectRatio = {0, 0};
  // Desired properties of the frames produced by the graph.
  int outputWidth = 0;
  int outputHeight = 0;
  AVPixelFormat outputFormat = AV_PIX_FMT_NONE;

  // FFmpeg filter description string, e.g. "scale=320:240".
  std::string filtergraphStr;
  AVRational timeBase = {0, 0};
  // Hardware frames context for GPU decoding; null for CPU.
  // Compared by pointer identity in operator==.
  UniqueAVBufferRef hwFramesCtx;

  bool operator==(const FiltersContext&) const;
  bool operator!=(const FiltersContext&) const;
};
|
|
30
|
+
|
|
31
|
+
// RAII wrapper around an FFmpeg filtergraph (buffersrc -> filters ->
// buffersink), configured once at construction from a FiltersContext and then
// used to convert decoded frames one at a time.
class FilterGraph {
 public:
  FilterGraph(
      const FiltersContext& filtersContext,
      const VideoStreamOptions& videoStreamOptions);

  // Runs avFrame through the graph and returns the filtered frame.
  UniqueAVFrame convert(const UniqueAVFrame& avFrame);

 private:
  UniqueAVFilterGraph filterGraph_;
  // Source/sink filter contexts; owned by filterGraph_, hence raw pointers.
  AVFilterContext* sourceContext_ = nullptr;
  AVFilterContext* sinkContext_ = nullptr;
};
|
|
44
|
+
|
|
45
|
+
} // namespace facebook::torchcodec
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
// Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
2
|
+
// All rights reserved.
|
|
3
|
+
//
|
|
4
|
+
// This source code is licensed under the BSD-style license found in the
|
|
5
|
+
// LICENSE file in the root directory of this source tree.
|
|
6
|
+
|
|
7
|
+
#include "src/torchcodec/_core/Frame.h"
|
|
8
|
+
|
|
9
|
+
namespace facebook::torchcodec {
|
|
10
|
+
|
|
11
|
+
// Allocates an uninitialized uint8 tensor in HWC layout on the given device:
// (height, width, 3) when numFrames is absent, otherwise
// (numFrames, height, width, 3). Throws (via TORCH_CHECK) on invalid
// dimensions.
torch::Tensor allocateEmptyHWCTensor(
    int height,
    int width,
    torch::Device device,
    std::optional<int> numFrames) {
  // Validate dimensions up front.
  TORCH_CHECK(height > 0, "height must be > 0, got: ", height);
  TORCH_CHECK(width > 0, "width must be > 0, got: ", width);

  const auto opts = torch::TensorOptions()
                        .dtype(torch::kUInt8)
                        .layout(torch::kStrided)
                        .device(device);

  // Un-batched case: a single (H, W, C) frame.
  if (!numFrames.has_value()) {
    return torch::empty({height, width, 3}, opts);
  }

  // Batched case: numFrames may legitimately be 0 (empty batch).
  const int numFramesValue = numFrames.value();
  TORCH_CHECK(
      numFramesValue >= 0, "numFrames must be >= 0, got: ", numFramesValue);
  return torch::empty({numFramesValue, height, width, 3}, opts);
}
|
|
31
|
+
|
|
32
|
+
} // namespace facebook::torchcodec
|
torchcodec/_core/Frame.h
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
// Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
2
|
+
// All rights reserved.
|
|
3
|
+
//
|
|
4
|
+
// This source code is licensed under the BSD-style license found in the
|
|
5
|
+
// LICENSE file in the root directory of this source tree.
|
|
6
|
+
|
|
7
|
+
#pragma once
|
|
8
|
+
|
|
9
|
+
#include <torch/types.h>
|
|
10
|
+
#include "src/torchcodec/_core/FFMPEGCommon.h"
|
|
11
|
+
#include "src/torchcodec/_core/Metadata.h"
|
|
12
|
+
#include "src/torchcodec/_core/StreamOptions.h"
|
|
13
|
+
|
|
14
|
+
namespace facebook::torchcodec {
|
|
15
|
+
|
|
16
|
+
// All public video decoding entry points return either a FrameOutput or a
|
|
17
|
+
// FrameBatchOutput.
|
|
18
|
+
// They are the equivalent of the user-facing Frame and FrameBatch classes in
|
|
19
|
+
// Python. They contain RGB decoded frames along with some associated data
|
|
20
|
+
// like PTS and duration.
|
|
21
|
+
// FrameOutput is also relevant for audio decoding, typically as the output of
|
|
22
|
+
// getNextFrame(), or as a temporary output variable.
|
|
23
|
+
// Single decoded frame plus its timing information.
struct FrameOutput {
  // data shape is:
  // - 3D (C, H, W) or (H, W, C) for videos
  // - 2D (numChannels, numSamples) for audio
  torch::Tensor data;
  // Presentation timestamp, in seconds.
  double ptsSeconds;
  // Frame duration, in seconds.
  double durationSeconds;
};
|
|
31
|
+
|
|
32
|
+
// Batch of decoded frames plus per-frame timing information.
struct FrameBatchOutput {
  torch::Tensor data; // 4D: of shape NCHW or NHWC.
  torch::Tensor ptsSeconds; // 1D of shape (N,)
  torch::Tensor durationSeconds; // 1D of shape (N,)

  // Pre-allocates the tensors for numFrames frames; height/width come from
  // the options or the stream metadata (see Note [Frame Tensor allocation and
  // height and width]).
  explicit FrameBatchOutput(
      int64_t numFrames,
      const VideoStreamOptions& videoStreamOptions,
      const StreamMetadata& streamMetadata);
};
|
|
42
|
+
|
|
43
|
+
// Decoded audio samples plus the presentation timestamp of the first sample.
struct AudioFramesOutput {
  torch::Tensor data; // shape is (numChannels, numSamples)
  // Presentation timestamp of the first sample, in seconds.
  double ptsSeconds;
};
|
|
47
|
+
|
|
48
|
+
// --------------------------------------------------------------------------
|
|
49
|
+
// FRAME TENSOR ALLOCATION APIs
|
|
50
|
+
// --------------------------------------------------------------------------
|
|
51
|
+
|
|
52
|
+
// Note [Frame Tensor allocation and height and width]
|
|
53
|
+
//
|
|
54
|
+
// We always allocate [N]HWC tensors. The low-level decoding functions all
|
|
55
|
+
// assume HWC tensors, since this is what FFmpeg natively handles. It's up to
|
|
56
|
+
// the high-level decoding entry-points to permute that back to CHW, by calling
|
|
57
|
+
// maybePermuteHWC2CHW().
|
|
58
|
+
//
|
|
59
|
+
// Also, importantly, the way we figure out the height and width of the
|
|
60
|
+
// output frame tensor varies, and depends on the decoding entry-point. In
|
|
61
|
+
// *decreasing order of accuracy*, we use the following sources for determining
|
|
62
|
+
// height and width:
|
|
63
|
+
// - getHeightAndWidthFromResizedAVFrame(). This is the height and width of the
|
|
64
|
+
// AVframe, *post*-resizing. This is only used for single-frame decoding APIs,
|
|
65
|
+
// on CPU, with filtergraph.
|
|
66
|
+
// - getHeightAndWidthFromOptionsOrAVFrame(). This is the height and width from
|
|
67
|
+
// the user-specified options if they exist, or the height and width of the
|
|
68
|
+
// AVFrame *before* it is resized. In theory, i.e. if there are no bugs within
|
|
69
|
+
// our code or within FFmpeg code, this should be exactly the same as
|
|
70
|
+
// getHeightAndWidthFromResizedAVFrame(). This is used by single-frame
|
|
71
|
+
// decoding APIs, on CPU with swscale, and on GPU.
|
|
72
|
+
// - getHeightAndWidthFromOptionsOrMetadata(). This is the height and width from
|
|
73
|
+
// the user-specified options if they exist, or the height and width from the
|
|
74
|
+
// stream metadata, which itself got its value from the CodecContext, when the
|
|
75
|
+
// stream was added. This is used by batch decoding APIs, for both GPU and
|
|
76
|
+
// CPU.
|
|
77
|
+
//
|
|
78
|
+
// The source of truth for height and width really is the (resized) AVFrame: it
|
|
79
|
+
// comes from the decoded output of FFmpeg. The info from the metadata (i.e.
|
|
80
|
+
// from the CodecContext) may not be as accurate. However, the AVFrame is only
|
|
81
|
+
// available late in the call stack, when the frame is decoded, while the
|
|
82
|
+
// CodecContext is available early when a stream is added. This is why we use
|
|
83
|
+
// the CodecContext for pre-allocating batched output tensors (we could
|
|
84
|
+
// pre-allocate those only once we decode the first frame to get the info frame
|
|
85
|
+
// the AVFrame, but that's a more complex logic).
|
|
86
|
+
//
|
|
87
|
+
// Because the sources for height and width may disagree, we may end up with
|
|
88
|
+
// conflicts: e.g. if we pre-allocate a batch output tensor based on the
|
|
89
|
+
// metadata info, but the decoded AVFrame has a different height and width.
|
|
90
|
+
// it is very important to check the height and width assumptions where the
|
|
91
|
+
// tensors memory is used/filled in order to avoid segfaults.
|
|
92
|
+
|
|
93
|
+
// Simple value type holding a frame's spatial dimensions, in pixels.
struct FrameDims {
  int height;
  int width;

  FrameDims(int frameHeight, int frameWidth)
      : height(frameHeight), width(frameWidth) {}
};
|
|
99
|
+
|
|
100
|
+
// There's nothing preventing you from calling this on a non-resized frame, but
// please don't.
// Reads the dimensions directly off the (post-resize) AVFrame; the most
// accurate source — see the note above.
FrameDims getHeightAndWidthFromResizedAVFrame(const AVFrame& resizedAVFrame);

// Dimensions from user options when present, otherwise from stream metadata
// (i.e. from the CodecContext). Used for pre-allocating batch outputs.
FrameDims getHeightAndWidthFromOptionsOrMetadata(
    const VideoStreamOptions& videoStreamOptions,
    const StreamMetadata& streamMetadata);

// Dimensions from user options when present, otherwise from the (pre-resize)
// AVFrame.
FrameDims getHeightAndWidthFromOptionsOrAVFrame(
    const VideoStreamOptions& videoStreamOptions,
    const UniqueAVFrame& avFrame);

// Allocates an uninitialized uint8 HWC tensor: (H, W, 3), or (N, H, W, 3)
// when numFrames is provided.
torch::Tensor allocateEmptyHWCTensor(
    int height,
    int width,
    torch::Device device,
    std::optional<int> numFrames = std::nullopt);
|
|
117
|
+
|
|
118
|
+
} // namespace facebook::torchcodec
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
// Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
2
|
+
// All rights reserved.
|
|
3
|
+
//
|
|
4
|
+
// This source code is licensed under the BSD-style license found in the
|
|
5
|
+
// LICENSE file in the root directory of this source tree.
|
|
6
|
+
|
|
7
|
+
#pragma once
|
|
8
|
+
|
|
9
|
+
#include <optional>
|
|
10
|
+
#include <string>
|
|
11
|
+
#include <vector>
|
|
12
|
+
|
|
13
|
+
extern "C" {
|
|
14
|
+
#include <libavcodec/avcodec.h>
|
|
15
|
+
#include <libavutil/avutil.h>
|
|
16
|
+
#include <libavutil/rational.h>
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
namespace facebook::torchcodec {
|
|
20
|
+
|
|
21
|
+
// Per-stream metadata. Fields are optional when the corresponding information
// may be absent from the container header or has not been computed. Fields
// suffixed "FromHeader" come from the container header (cheap but possibly
// inaccurate); fields suffixed "FromContent" come from scanning the file.
struct StreamMetadata {
  // Common (video and audio) fields derived from the AVStream.
  int streamIndex;
  // See this link for what various values are available:
  // https://ffmpeg.org/doxygen/trunk/group__lavu__misc.html#ga9a84bba4713dfced21a1a56163be1f48
  AVMediaType mediaType;
  std::optional<AVCodecID> codecId;
  std::optional<std::string> codecName;
  std::optional<double> durationSecondsFromHeader;
  std::optional<double> beginStreamSecondsFromHeader;
  std::optional<int64_t> numFramesFromHeader;
  std::optional<int64_t> numKeyFrames;
  std::optional<double> averageFpsFromHeader;
  std::optional<double> bitRate;

  // More accurate duration, obtained by scanning the file.
  // These presentation timestamps are in time base.
  std::optional<int64_t> beginStreamPtsFromContent;
  std::optional<int64_t> endStreamPtsFromContent;
  // These presentation timestamps are in seconds.
  std::optional<double> beginStreamPtsSecondsFromContent;
  std::optional<double> endStreamPtsSecondsFromContent;
  // This can be useful for index-based seeking.
  std::optional<int64_t> numFramesFromContent;

  // Video-only fields derived from the AVCodecContext.
  std::optional<int64_t> width;
  std::optional<int64_t> height;
  std::optional<AVRational> sampleAspectRatio;

  // Audio-only fields
  std::optional<int64_t> sampleRate;
  std::optional<int64_t> numChannels;
  std::optional<std::string> sampleFormat;
};
|
|
56
|
+
|
|
57
|
+
// Container-level metadata: all per-stream metadata plus aggregate info.
struct ContainerMetadata {
  std::vector<StreamMetadata> allStreamMetadata;
  int numAudioStreams = 0;
  int numVideoStreams = 0;
  // Note that this is the container-level duration, which is usually the max
  // of all stream durations available in the container.
  std::optional<double> durationSecondsFromHeader;
  // Total BitRate level information at the container level in bit/s
  std::optional<double> bitRate;
  // If set, this is the index to the default audio stream.
  std::optional<int> bestAudioStreamIndex;
  // If set, this is the index to the default video stream.
  std::optional<int> bestVideoStreamIndex;
};
|
|
71
|
+
|
|
72
|
+
} // namespace facebook::torchcodec
|