torchcodec 0.7.0__cp312-cp312-win_amd64.whl → 0.8.1__cp312-cp312-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of torchcodec might be problematic. Click here for more details.

Files changed (66) hide show
  1. torchcodec/_core/AVIOTensorContext.cpp +23 -16
  2. torchcodec/_core/AVIOTensorContext.h +2 -1
  3. torchcodec/_core/BetaCudaDeviceInterface.cpp +718 -0
  4. torchcodec/_core/BetaCudaDeviceInterface.h +193 -0
  5. torchcodec/_core/CMakeLists.txt +18 -3
  6. torchcodec/_core/CUDACommon.cpp +330 -0
  7. torchcodec/_core/CUDACommon.h +51 -0
  8. torchcodec/_core/Cache.h +6 -20
  9. torchcodec/_core/CpuDeviceInterface.cpp +195 -108
  10. torchcodec/_core/CpuDeviceInterface.h +84 -19
  11. torchcodec/_core/CudaDeviceInterface.cpp +227 -376
  12. torchcodec/_core/CudaDeviceInterface.h +38 -6
  13. torchcodec/_core/DeviceInterface.cpp +57 -19
  14. torchcodec/_core/DeviceInterface.h +97 -16
  15. torchcodec/_core/Encoder.cpp +346 -9
  16. torchcodec/_core/Encoder.h +62 -1
  17. torchcodec/_core/FFMPEGCommon.cpp +190 -3
  18. torchcodec/_core/FFMPEGCommon.h +27 -1
  19. torchcodec/_core/FilterGraph.cpp +30 -22
  20. torchcodec/_core/FilterGraph.h +15 -1
  21. torchcodec/_core/Frame.cpp +22 -7
  22. torchcodec/_core/Frame.h +15 -61
  23. torchcodec/_core/Metadata.h +2 -2
  24. torchcodec/_core/NVCUVIDRuntimeLoader.cpp +320 -0
  25. torchcodec/_core/NVCUVIDRuntimeLoader.h +14 -0
  26. torchcodec/_core/NVDECCache.cpp +60 -0
  27. torchcodec/_core/NVDECCache.h +102 -0
  28. torchcodec/_core/SingleStreamDecoder.cpp +196 -201
  29. torchcodec/_core/SingleStreamDecoder.h +42 -15
  30. torchcodec/_core/StreamOptions.h +16 -6
  31. torchcodec/_core/Transform.cpp +87 -0
  32. torchcodec/_core/Transform.h +84 -0
  33. torchcodec/_core/__init__.py +4 -0
  34. torchcodec/_core/custom_ops.cpp +257 -32
  35. torchcodec/_core/fetch_and_expose_non_gpl_ffmpeg_libs.cmake +61 -1
  36. torchcodec/_core/nvcuvid_include/cuviddec.h +1374 -0
  37. torchcodec/_core/nvcuvid_include/nvcuvid.h +610 -0
  38. torchcodec/_core/ops.py +147 -44
  39. torchcodec/_core/pybind_ops.cpp +22 -59
  40. torchcodec/_samplers/video_clip_sampler.py +7 -19
  41. torchcodec/decoders/__init__.py +1 -0
  42. torchcodec/decoders/_decoder_utils.py +61 -1
  43. torchcodec/decoders/_video_decoder.py +46 -20
  44. torchcodec/libtorchcodec_core4.dll +0 -0
  45. torchcodec/libtorchcodec_core5.dll +0 -0
  46. torchcodec/libtorchcodec_core6.dll +0 -0
  47. torchcodec/libtorchcodec_core7.dll +0 -0
  48. torchcodec/libtorchcodec_core8.dll +0 -0
  49. torchcodec/libtorchcodec_custom_ops4.dll +0 -0
  50. torchcodec/libtorchcodec_custom_ops5.dll +0 -0
  51. torchcodec/libtorchcodec_custom_ops6.dll +0 -0
  52. torchcodec/libtorchcodec_custom_ops7.dll +0 -0
  53. torchcodec/libtorchcodec_custom_ops8.dll +0 -0
  54. torchcodec/libtorchcodec_pybind_ops4.pyd +0 -0
  55. torchcodec/libtorchcodec_pybind_ops5.pyd +0 -0
  56. torchcodec/libtorchcodec_pybind_ops6.pyd +0 -0
  57. torchcodec/libtorchcodec_pybind_ops7.pyd +0 -0
  58. torchcodec/libtorchcodec_pybind_ops8.pyd +0 -0
  59. torchcodec/samplers/_time_based.py +8 -0
  60. torchcodec/version.py +1 -1
  61. {torchcodec-0.7.0.dist-info → torchcodec-0.8.1.dist-info}/METADATA +29 -16
  62. torchcodec-0.8.1.dist-info/RECORD +82 -0
  63. {torchcodec-0.7.0.dist-info → torchcodec-0.8.1.dist-info}/WHEEL +1 -1
  64. torchcodec-0.7.0.dist-info/RECORD +0 -67
  65. {torchcodec-0.7.0.dist-info → torchcodec-0.8.1.dist-info}/licenses/LICENSE +0 -0
  66. {torchcodec-0.7.0.dist-info → torchcodec-0.8.1.dist-info}/top_level.txt +0 -0
@@ -17,6 +17,7 @@
17
17
  #include "src/torchcodec/_core/FFMPEGCommon.h"
18
18
  #include "src/torchcodec/_core/Frame.h"
19
19
  #include "src/torchcodec/_core/StreamOptions.h"
20
+ #include "src/torchcodec/_core/Transform.h"
20
21
 
21
22
  namespace facebook::torchcodec {
22
23
 
@@ -83,6 +84,7 @@ class SingleStreamDecoder {
83
84
 
84
85
  void addVideoStream(
85
86
  int streamIndex,
87
+ std::vector<Transform*>& transforms,
86
88
  const VideoStreamOptions& videoStreamOptions = VideoStreamOptions(),
87
89
  std::optional<FrameMappings> customFrameMappings = std::nullopt);
88
90
  void addAudioStream(
@@ -106,7 +108,7 @@ class SingleStreamDecoder {
106
108
 
107
109
  // Returns frames at the given indices for a given stream as a single stacked
108
110
  // Tensor.
109
- FrameBatchOutput getFramesAtIndices(const std::vector<int64_t>& frameIndices);
111
+ FrameBatchOutput getFramesAtIndices(const torch::Tensor& frameIndices);
110
112
 
111
113
  // Returns frames within a given range. The range is defined by [start, stop).
112
114
  // The values retrieved from the range are: [start, start+step,
@@ -121,7 +123,7 @@ class SingleStreamDecoder {
121
123
  // seconds=5.999, etc.
122
124
  FrameOutput getFramePlayedAt(double seconds);
123
125
 
124
- FrameBatchOutput getFramesPlayedAt(const std::vector<double>& timestamps);
126
+ FrameBatchOutput getFramesPlayedAt(const torch::Tensor& timestamps);
125
127
 
126
128
  // Returns frames within a given pts range. The range is defined by
127
129
  // [startSeconds, stopSeconds) with respect to the pts values for frames. The
@@ -184,6 +186,8 @@ class SingleStreamDecoder {
184
186
  DecodeStats getDecodeStats() const;
185
187
  void resetDecodeStats();
186
188
 
189
+ std::string getDeviceInterfaceDetails() const;
190
+
187
191
  private:
188
192
  // --------------------------------------------------------------------------
189
193
  // STREAMINFO AND ASSOCIATED STRUCTS
@@ -219,24 +223,15 @@ class SingleStreamDecoder {
219
223
  AVMediaType avMediaType = AVMEDIA_TYPE_UNKNOWN;
220
224
 
221
225
  AVRational timeBase = {};
222
- UniqueAVCodecContext codecContext;
226
+ SharedAVCodecContext codecContext;
223
227
 
224
228
  // The FrameInfo indices we built when scanFileAndUpdateMetadataAndIndex was
225
229
  // called.
226
230
  std::vector<FrameInfo> keyFrames;
227
231
  std::vector<FrameInfo> allFrames;
228
232
 
229
- // TODO since the decoder is single-stream, these should be decoder fields,
230
- // not streamInfo fields. And they should be defined right next to
231
- // `cursor_`, with joint documentation.
232
- int64_t lastDecodedAvFramePts = 0;
233
- int64_t lastDecodedAvFrameDuration = 0;
234
233
  VideoStreamOptions videoStreamOptions;
235
234
  AudioStreamOptions audioStreamOptions;
236
-
237
- // color-conversion fields. Only one of FilterGraphContext and
238
- // UniqueSwsContext should be non-null.
239
- UniqueSwrContext swrContext;
240
235
  };
241
236
 
242
237
  // --------------------------------------------------------------------------
@@ -318,6 +313,7 @@ class SingleStreamDecoder {
318
313
  int streamIndex,
319
314
  AVMediaType mediaType,
320
315
  const torch::Device& device = torch::kCPU,
316
+ const std::string_view deviceVariant = "ffmpeg",
321
317
  std::optional<int> ffmpegThreadCount = std::nullopt);
322
318
 
323
319
  // Returns the "best" stream index for a given media type. The "best" is
@@ -356,16 +352,49 @@ class SingleStreamDecoder {
356
352
  const int NO_ACTIVE_STREAM = -2;
357
353
  int activeStreamIndex_ = NO_ACTIVE_STREAM;
358
354
 
359
- bool cursorWasJustSet_ = false;
360
355
  // The desired position of the cursor in the stream. We send frames >= this
361
356
  // pts to the user when they request a frame.
362
357
  int64_t cursor_ = INT64_MIN;
358
+ bool cursorWasJustSet_ = false;
359
+ int64_t lastDecodedAvFramePts_ = 0;
360
+ int64_t lastDecodedAvFrameDuration_ = 0;
361
+
362
+ // Audio only. We cache it for performance. The video equivalents live in
363
+ // deviceInterface_. We store swrContext_ here because we only handle audio
364
+ // on the CPU.
365
+ UniqueSwrContext swrContext_;
366
+
363
367
  // Stores various internal decoding stats.
364
368
  DecodeStats decodeStats_;
369
+
365
370
  // Stores the AVIOContext for the input buffer.
366
371
  std::unique_ptr<AVIOContextHolder> avioContextHolder_;
372
+
373
+ // We will receive a vector of transforms upon adding a stream and store it
374
+ // here. However, we need to know if any of those operations change the
375
+ // dimensions of the output frame. If they do, we need to figure out what are
376
+ // the final dimensions of the output frame after ALL transformations. We
377
+ // figure this out as soon as we receive the transforms. If any of the
378
+ // transforms change the final output frame dimensions, we store that in
379
+ // resizedOutputDims_. If resizedOutputDims_ has no value, that means there
380
+ // are no transforms that change the output frame dimensions.
381
+ //
382
+ // The priority order for output frame dimension is:
383
+ //
384
+ // 1. resizedOutputDims_; the resize requested by the user always takes
385
+ // priority.
386
+ // 2. The dimensions of the actual decoded AVFrame. This can change
387
+ // per-decoded frame, and is unknown in SingleStreamDecoder. Only the
388
+ // DeviceInterface learns it immediately after decoding a raw frame but
389
+ // before the color transformation.
390
+ // 3. metadataDims_; the dimensions we learned from the metadata.
391
+ std::vector<std::unique_ptr<Transform>> transforms_;
392
+ std::optional<FrameDims> resizedOutputDims_;
393
+ FrameDims metadataDims_;
394
+
367
395
  // Whether or not we have already scanned all streams to update the metadata.
368
396
  bool scannedAllStreams_ = false;
397
+
369
398
  // Tracks that we've already been initialized.
370
399
  bool initialized_ = false;
371
400
  };
@@ -375,6 +404,4 @@ std::ostream& operator<<(
375
404
  std::ostream& os,
376
405
  const SingleStreamDecoder::DecodeStats& stats);
377
406
 
378
- SingleStreamDecoder::SeekMode seekModeFromString(std::string_view seekMode);
379
-
380
407
  } // namespace facebook::torchcodec
@@ -9,11 +9,11 @@
9
9
  #include <torch/types.h>
10
10
  #include <optional>
11
11
  #include <string>
12
+ #include <string_view>
12
13
 
13
14
  namespace facebook::torchcodec {
14
15
 
15
16
  enum ColorConversionLibrary {
16
- // TODO: Add an AUTO option later.
17
17
  // Use the libavfilter library for color conversion.
18
18
  FILTERGRAPH,
19
19
  // Use the libswscale library for color conversion.
@@ -28,16 +28,26 @@ struct VideoStreamOptions {
28
28
  // utilize all cores. If not set, it will be the default FFMPEG behavior for
29
29
  // the given codec.
30
30
  std::optional<int> ffmpegThreadCount;
31
+
31
32
  // Currently the dimension order can be either NHWC or NCHW.
32
33
  // H=height, W=width, C=channel.
33
34
  std::string dimensionOrder = "NCHW";
34
- // The output height and width of the frame. If not specified, the output
35
- // is the same as the original video.
36
- std::optional<int> width;
37
- std::optional<int> height;
38
- std::optional<ColorConversionLibrary> colorConversionLibrary;
35
+
36
+ // By default we have to use filtergraph, as it is more general. We can only
37
+ // use swscale when we have met strict requirements. See
38
+ // CpuDeviceInterface::initialize() for the logic.
39
+ ColorConversionLibrary colorConversionLibrary =
40
+ ColorConversionLibrary::FILTERGRAPH;
41
+
39
42
  // By default we use CPU for decoding for both C++ and python users.
40
43
  torch::Device device = torch::kCPU;
44
+ // Device variant (e.g., "ffmpeg", "beta", etc.)
45
+ std::string_view deviceVariant = "ffmpeg";
46
+
47
+ // Encoding options
48
+ // TODO-VideoEncoder: Consider adding other optional fields here
49
+ // (bit rate, gop size, max b frames, preset)
50
+ std::optional<int> crf;
41
51
  };
42
52
 
43
53
  struct AudioStreamOptions {
@@ -0,0 +1,87 @@
1
+ // Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ // All rights reserved.
3
+ //
4
+ // This source code is licensed under the BSD-style license found in the
5
+ // LICENSE file in the root directory of this source tree.
6
+
7
+ #include "src/torchcodec/_core/Transform.h"
8
+ #include <torch/types.h>
9
+ #include "src/torchcodec/_core/FFMPEGCommon.h"
10
+
11
+ namespace facebook::torchcodec {
12
+
13
+ namespace {
14
+
15
+ std::string toFilterGraphInterpolation(
16
+ ResizeTransform::InterpolationMode mode) {
17
+ switch (mode) {
18
+ case ResizeTransform::InterpolationMode::BILINEAR:
19
+ return "bilinear";
20
+ default:
21
+ TORCH_CHECK(
22
+ false,
23
+ "Unknown interpolation mode: " +
24
+ std::to_string(static_cast<int>(mode)));
25
+ }
26
+ }
27
+
28
+ int toSwsInterpolation(ResizeTransform::InterpolationMode mode) {
29
+ switch (mode) {
30
+ case ResizeTransform::InterpolationMode::BILINEAR:
31
+ return SWS_BILINEAR;
32
+ default:
33
+ TORCH_CHECK(
34
+ false,
35
+ "Unknown interpolation mode: " +
36
+ std::to_string(static_cast<int>(mode)));
37
+ }
38
+ }
39
+
40
+ } // namespace
41
+
42
+ std::string ResizeTransform::getFilterGraphCpu() const {
43
+ return "scale=" + std::to_string(outputDims_.width) + ":" +
44
+ std::to_string(outputDims_.height) +
45
+ ":sws_flags=" + toFilterGraphInterpolation(interpolationMode_);
46
+ }
47
+
48
+ std::optional<FrameDims> ResizeTransform::getOutputFrameDims() const {
49
+ return outputDims_;
50
+ }
51
+
52
+ bool ResizeTransform::isResize() const {
53
+ return true;
54
+ }
55
+
56
+ int ResizeTransform::getSwsFlags() const {
57
+ return toSwsInterpolation(interpolationMode_);
58
+ }
59
+
60
+ CropTransform::CropTransform(const FrameDims& dims, int x, int y)
61
+ : outputDims_(dims), x_(x), y_(y) {
62
+ TORCH_CHECK(x_ >= 0, "Crop x position must be >= 0, got: ", x_);
63
+ TORCH_CHECK(y_ >= 0, "Crop y position must be >= 0, got: ", y_);
64
+ }
65
+
66
+ std::string CropTransform::getFilterGraphCpu() const {
67
+ return "crop=" + std::to_string(outputDims_.width) + ":" +
68
+ std::to_string(outputDims_.height) + ":" + std::to_string(x_) + ":" +
69
+ std::to_string(y_) + ":exact=1";
70
+ }
71
+
72
+ std::optional<FrameDims> CropTransform::getOutputFrameDims() const {
73
+ return outputDims_;
74
+ }
75
+
76
+ void CropTransform::validate(const StreamMetadata& streamMetadata) const {
77
+ TORCH_CHECK(x_ <= streamMetadata.width, "Crop x position out of bounds");
78
+ TORCH_CHECK(
79
+ x_ + outputDims_.width <= streamMetadata.width,
80
+ "Crop x position out of bounds")
81
+ TORCH_CHECK(y_ <= streamMetadata.height, "Crop y position out of bounds");
82
+ TORCH_CHECK(
83
+ y_ + outputDims_.height <= streamMetadata.height,
84
+ "Crop y position out of bounds");
85
+ }
86
+
87
+ } // namespace facebook::torchcodec
@@ -0,0 +1,84 @@
1
+ // Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ // All rights reserved.
3
+ //
4
+ // This source code is licensed under the BSD-style license found in the
5
+ // LICENSE file in the root directory of this source tree.
6
+
7
+ #pragma once
8
+
9
+ #include <optional>
10
+ #include <string>
11
+ #include "src/torchcodec/_core/Frame.h"
12
+ #include "src/torchcodec/_core/Metadata.h"
13
+
14
+ namespace facebook::torchcodec {
15
+
16
+ class Transform {
17
+ public:
18
+ virtual std::string getFilterGraphCpu() const = 0;
19
+ virtual ~Transform() = default;
20
+
21
+ // If the transformation does not change the output frame dimensions, then
22
+ // there is no need to override this member function. The default
23
+ // implementation returns an empty optional, indicating that the output frame
24
+ // has the same dimensions as the input frame.
25
+ //
26
+ // If the transformation does change the output frame dimensions, then it
27
+ // must override this member function and return the output frame dimensions.
28
+ virtual std::optional<FrameDims> getOutputFrameDims() const {
29
+ return std::nullopt;
30
+ }
31
+
32
+ // The ResizeTransform is special, because it is the only transform that
33
+ // swscale can handle.
34
+ virtual bool isResize() const {
35
+ return false;
36
+ }
37
+
38
+ // The validity of some transforms depends on the characteristics of the
39
+ // AVStream they're being applied to. For example, some transforms will
40
+ // specify coordinates inside a frame; we need to validate that those are
41
+ // within the frame's bounds.
42
+ //
43
+ // Note that the validation function does not return anything. We expect
44
+ // invalid configurations to throw an exception.
45
+ virtual void validate(
46
+ [[maybe_unused]] const StreamMetadata& streamMetadata) const {}
47
+ };
48
+
49
+ class ResizeTransform : public Transform {
50
+ public:
51
+ enum class InterpolationMode { BILINEAR };
52
+
53
+ ResizeTransform(const FrameDims& dims)
54
+ : outputDims_(dims), interpolationMode_(InterpolationMode::BILINEAR) {}
55
+
56
+ ResizeTransform(const FrameDims& dims, InterpolationMode interpolationMode)
57
+ : outputDims_(dims), interpolationMode_(interpolationMode) {}
58
+
59
+ std::string getFilterGraphCpu() const override;
60
+ std::optional<FrameDims> getOutputFrameDims() const override;
61
+ bool isResize() const override;
62
+
63
+ int getSwsFlags() const;
64
+
65
+ private:
66
+ FrameDims outputDims_;
67
+ InterpolationMode interpolationMode_;
68
+ };
69
+
70
+ class CropTransform : public Transform {
71
+ public:
72
+ CropTransform(const FrameDims& dims, int x, int y);
73
+
74
+ std::string getFilterGraphCpu() const override;
75
+ std::optional<FrameDims> getOutputFrameDims() const override;
76
+ void validate(const StreamMetadata& streamMetadata) const override;
77
+
78
+ private:
79
+ FrameDims outputDims_;
80
+ int x_;
81
+ int y_;
82
+ };
83
+
84
+ } // namespace facebook::torchcodec
@@ -14,6 +14,7 @@ from ._metadata import (
14
14
  )
15
15
  from .ops import (
16
16
  _add_video_stream,
17
+ _get_backend_details,
17
18
  _get_key_frame_indices,
18
19
  _test_frame_pts_equality,
19
20
  add_audio_stream,
@@ -25,6 +26,9 @@ from .ops import (
25
26
  encode_audio_to_file,
26
27
  encode_audio_to_file_like,
27
28
  encode_audio_to_tensor,
29
+ encode_video_to_file,
30
+ encode_video_to_file_like,
31
+ encode_video_to_tensor,
28
32
  get_ffmpeg_library_versions,
29
33
  get_frame_at_index,
30
34
  get_frame_at_pts,