torchcodec 0.3.0__cp313-cp313t-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of torchcodec might be problematic. Click here for more details.

Files changed (59) hide show
  1. torchcodec/.dylibs/libc++.1.0.dylib +0 -0
  2. torchcodec/.dylibs/libpython3.13t.dylib +0 -0
  3. torchcodec/__init__.py +16 -0
  4. torchcodec/_core/AVIOBytesContext.cpp +70 -0
  5. torchcodec/_core/AVIOBytesContext.h +32 -0
  6. torchcodec/_core/AVIOContextHolder.cpp +50 -0
  7. torchcodec/_core/AVIOContextHolder.h +65 -0
  8. torchcodec/_core/AVIOFileLikeContext.cpp +80 -0
  9. torchcodec/_core/AVIOFileLikeContext.h +54 -0
  10. torchcodec/_core/CMakeLists.txt +237 -0
  11. torchcodec/_core/CudaDeviceInterface.cpp +289 -0
  12. torchcodec/_core/CudaDeviceInterface.h +34 -0
  13. torchcodec/_core/DeviceInterface.cpp +88 -0
  14. torchcodec/_core/DeviceInterface.h +66 -0
  15. torchcodec/_core/Encoder.cpp +319 -0
  16. torchcodec/_core/Encoder.h +39 -0
  17. torchcodec/_core/FFMPEGCommon.cpp +264 -0
  18. torchcodec/_core/FFMPEGCommon.h +180 -0
  19. torchcodec/_core/Frame.h +47 -0
  20. torchcodec/_core/Metadata.h +70 -0
  21. torchcodec/_core/SingleStreamDecoder.cpp +1947 -0
  22. torchcodec/_core/SingleStreamDecoder.h +462 -0
  23. torchcodec/_core/StreamOptions.h +49 -0
  24. torchcodec/_core/__init__.py +39 -0
  25. torchcodec/_core/_metadata.py +277 -0
  26. torchcodec/_core/custom_ops.cpp +681 -0
  27. torchcodec/_core/fetch_and_expose_non_gpl_ffmpeg_libs.cmake +226 -0
  28. torchcodec/_core/ops.py +381 -0
  29. torchcodec/_core/pybind_ops.cpp +45 -0
  30. torchcodec/_frame.py +145 -0
  31. torchcodec/_internally_replaced_utils.py +53 -0
  32. torchcodec/_samplers/__init__.py +7 -0
  33. torchcodec/_samplers/video_clip_sampler.py +430 -0
  34. torchcodec/decoders/__init__.py +11 -0
  35. torchcodec/decoders/_audio_decoder.py +168 -0
  36. torchcodec/decoders/_decoder_utils.py +52 -0
  37. torchcodec/decoders/_video_decoder.py +399 -0
  38. torchcodec/libtorchcodec_custom_ops4.dylib +0 -0
  39. torchcodec/libtorchcodec_custom_ops5.dylib +0 -0
  40. torchcodec/libtorchcodec_custom_ops6.dylib +0 -0
  41. torchcodec/libtorchcodec_custom_ops7.dylib +0 -0
  42. torchcodec/libtorchcodec_decoder4.dylib +0 -0
  43. torchcodec/libtorchcodec_decoder5.dylib +0 -0
  44. torchcodec/libtorchcodec_decoder6.dylib +0 -0
  45. torchcodec/libtorchcodec_decoder7.dylib +0 -0
  46. torchcodec/libtorchcodec_pybind_ops4.so +0 -0
  47. torchcodec/libtorchcodec_pybind_ops5.so +0 -0
  48. torchcodec/libtorchcodec_pybind_ops6.so +0 -0
  49. torchcodec/libtorchcodec_pybind_ops7.so +0 -0
  50. torchcodec/samplers/__init__.py +2 -0
  51. torchcodec/samplers/_common.py +84 -0
  52. torchcodec/samplers/_index_based.py +285 -0
  53. torchcodec/samplers/_time_based.py +348 -0
  54. torchcodec/version.py +2 -0
  55. torchcodec-0.3.0.dist-info/LICENSE +28 -0
  56. torchcodec-0.3.0.dist-info/METADATA +280 -0
  57. torchcodec-0.3.0.dist-info/RECORD +59 -0
  58. torchcodec-0.3.0.dist-info/WHEEL +5 -0
  59. torchcodec-0.3.0.dist-info/top_level.txt +2 -0
@@ -0,0 +1,319 @@
1
+ #include <sstream>
2
+
3
+ #include "src/torchcodec/_core/Encoder.h"
4
+ #include "torch/types.h"
5
+
6
+ namespace facebook::torchcodec {
7
+
8
+ namespace {
9
+
10
+ void validateSampleRate(const AVCodec& avCodec, int sampleRate) {
11
+ if (avCodec.supported_samplerates == nullptr) {
12
+ return;
13
+ }
14
+
15
+ for (auto i = 0; avCodec.supported_samplerates[i] != 0; ++i) {
16
+ if (sampleRate == avCodec.supported_samplerates[i]) {
17
+ return;
18
+ }
19
+ }
20
+ std::stringstream supportedRates;
21
+ for (auto i = 0; avCodec.supported_samplerates[i] != 0; ++i) {
22
+ if (i > 0) {
23
+ supportedRates << ", ";
24
+ }
25
+ supportedRates << avCodec.supported_samplerates[i];
26
+ }
27
+
28
+ TORCH_CHECK(
29
+ false,
30
+ "invalid sample rate=",
31
+ sampleRate,
32
+ ". Supported sample rate values are: ",
33
+ supportedRates.str());
34
+ }
35
+
36
+ static const std::vector<AVSampleFormat> preferredFormatsOrder = {
37
+ AV_SAMPLE_FMT_FLTP,
38
+ AV_SAMPLE_FMT_FLT,
39
+ AV_SAMPLE_FMT_DBLP,
40
+ AV_SAMPLE_FMT_DBL,
41
+ AV_SAMPLE_FMT_S64P,
42
+ AV_SAMPLE_FMT_S64,
43
+ AV_SAMPLE_FMT_S32P,
44
+ AV_SAMPLE_FMT_S32,
45
+ AV_SAMPLE_FMT_S16P,
46
+ AV_SAMPLE_FMT_S16,
47
+ AV_SAMPLE_FMT_U8P,
48
+ AV_SAMPLE_FMT_U8};
49
+
50
+ AVSampleFormat findBestOutputSampleFormat(const AVCodec& avCodec) {
51
+ // Find a sample format that the encoder supports. We prefer using FLT[P],
52
+ // since this is the format of the input waveform. If FLTP isn't supported
53
+ // then we'll need to convert the AVFrame's format. Our heuristic is to encode
54
+ // into the format with the highest resolution.
55
+ if (avCodec.sample_fmts == nullptr) {
56
+ // Can't really validate anything in this case, best we can do is hope that
57
+ // FLTP is supported by the encoder. If not, FFmpeg will raise.
58
+ return AV_SAMPLE_FMT_FLTP;
59
+ }
60
+
61
+ for (AVSampleFormat preferredFormat : preferredFormatsOrder) {
62
+ for (int i = 0; avCodec.sample_fmts[i] != -1; ++i) {
63
+ if (avCodec.sample_fmts[i] == preferredFormat) {
64
+ return preferredFormat;
65
+ }
66
+ }
67
+ }
68
+ // We should always find a match in preferredFormatsOrder, so we should always
69
+ // return earlier. But in the event that a future FFmpeg version defines an
70
+ // additional sample format that isn't in preferredFormatsOrder, we fallback:
71
+ return avCodec.sample_fmts[0];
72
+ }
73
+
74
+ } // namespace
75
+
76
+ AudioEncoder::~AudioEncoder() {}
77
+
78
+ AudioEncoder::AudioEncoder(
79
+ const torch::Tensor wf,
80
+ int sampleRate,
81
+ std::string_view fileName,
82
+ std::optional<int64_t> bitRate)
83
+ : wf_(wf) {
84
+ TORCH_CHECK(
85
+ wf_.dtype() == torch::kFloat32,
86
+ "waveform must have float32 dtype, got ",
87
+ wf_.dtype());
88
+ // TODO-ENCODING check contiguity of the input wf to ensure that it is indeed
89
+ // planar (fltp).
90
+ TORCH_CHECK(
91
+ wf_.dim() == 2, "waveform must have 2 dimensions, got ", wf_.dim());
92
+
93
+ setFFmpegLogLevel();
94
+ AVFormatContext* avFormatContext = nullptr;
95
+ auto status = avformat_alloc_output_context2(
96
+ &avFormatContext, nullptr, nullptr, fileName.data());
97
+ TORCH_CHECK(
98
+ avFormatContext != nullptr,
99
+ "Couldn't allocate AVFormatContext. ",
100
+ "Check the desired extension? ",
101
+ getFFMPEGErrorStringFromErrorCode(status));
102
+ avFormatContext_.reset(avFormatContext);
103
+
104
+ // TODO-ENCODING: Should also support encoding into bytes (use
105
+ // AVIOBytesContext)
106
+ TORCH_CHECK(
107
+ !(avFormatContext->oformat->flags & AVFMT_NOFILE),
108
+ "AVFMT_NOFILE is set. We only support writing to a file.");
109
+ status = avio_open(&avFormatContext_->pb, fileName.data(), AVIO_FLAG_WRITE);
110
+ TORCH_CHECK(
111
+ status >= 0,
112
+ "avio_open failed: ",
113
+ getFFMPEGErrorStringFromErrorCode(status));
114
+
115
+ // We use the AVFormatContext's default codec for that
116
+ // specific format/container.
117
+ const AVCodec* avCodec =
118
+ avcodec_find_encoder(avFormatContext_->oformat->audio_codec);
119
+ TORCH_CHECK(avCodec != nullptr, "Codec not found");
120
+
121
+ AVCodecContext* avCodecContext = avcodec_alloc_context3(avCodec);
122
+ TORCH_CHECK(avCodecContext != nullptr, "Couldn't allocate codec context.");
123
+ avCodecContext_.reset(avCodecContext);
124
+
125
+ if (bitRate.has_value()) {
126
+ TORCH_CHECK(*bitRate >= 0, "bit_rate=", *bitRate, " must be >= 0.");
127
+ }
128
+ // bit_rate=None defaults to 0, which is what the FFmpeg CLI seems to use as
129
+ // well when "-b:a" isn't specified.
130
+ avCodecContext_->bit_rate = bitRate.value_or(0);
131
+
132
+ validateSampleRate(*avCodec, sampleRate);
133
+ avCodecContext_->sample_rate = sampleRate;
134
+
135
+ // Input waveform is expected to be FLTP. Not all encoders support FLTP, so we
136
+ // may need to convert the wf into a supported output sample format, which is
137
+ // what the `.sample_fmt` defines.
138
+ avCodecContext_->sample_fmt = findBestOutputSampleFormat(*avCodec);
139
+
140
+ int numChannels = static_cast<int>(wf_.sizes()[0]);
141
+ TORCH_CHECK(
142
+ // TODO-ENCODING is this even true / needed? We can probably support more
143
+ // with non-planar data?
144
+ numChannels <= AV_NUM_DATA_POINTERS,
145
+ "Trying to encode ",
146
+ numChannels,
147
+ " channels, but FFmpeg only supports ",
148
+ AV_NUM_DATA_POINTERS,
149
+ " channels per frame.");
150
+
151
+ setDefaultChannelLayout(avCodecContext_, numChannels);
152
+
153
+ status = avcodec_open2(avCodecContext_.get(), avCodec, nullptr);
154
+ TORCH_CHECK(
155
+ status == AVSUCCESS,
156
+ "avcodec_open2 failed: ",
157
+ getFFMPEGErrorStringFromErrorCode(status));
158
+
159
+ // We're allocating the stream here. Streams are meant to be freed by
160
+ // avformat_free_context(avFormatContext), which we call in the
161
+ // avFormatContext_'s destructor.
162
+ AVStream* avStream = avformat_new_stream(avFormatContext_.get(), nullptr);
163
+ TORCH_CHECK(avStream != nullptr, "Couldn't create new stream.");
164
+ status = avcodec_parameters_from_context(
165
+ avStream->codecpar, avCodecContext_.get());
166
+ TORCH_CHECK(
167
+ status == AVSUCCESS,
168
+ "avcodec_parameters_from_context failed: ",
169
+ getFFMPEGErrorStringFromErrorCode(status));
170
+ streamIndex_ = avStream->index;
171
+ }
172
+
173
+ void AudioEncoder::encode() {
174
+ UniqueAVFrame avFrame(av_frame_alloc());
175
+ TORCH_CHECK(avFrame != nullptr, "Couldn't allocate AVFrame.");
176
+ // Default to 256 like in torchaudio
177
+ int numSamplesAllocatedPerFrame =
178
+ avCodecContext_->frame_size > 0 ? avCodecContext_->frame_size : 256;
179
+ avFrame->nb_samples = numSamplesAllocatedPerFrame;
180
+ avFrame->format = AV_SAMPLE_FMT_FLTP;
181
+ avFrame->sample_rate = avCodecContext_->sample_rate;
182
+ avFrame->pts = 0;
183
+ setChannelLayout(avFrame, avCodecContext_);
184
+
185
+ auto status = av_frame_get_buffer(avFrame.get(), 0);
186
+ TORCH_CHECK(
187
+ status == AVSUCCESS,
188
+ "Couldn't allocate avFrame's buffers: ",
189
+ getFFMPEGErrorStringFromErrorCode(status));
190
+
191
+ AutoAVPacket autoAVPacket;
192
+
193
+ uint8_t* pwf = static_cast<uint8_t*>(wf_.data_ptr());
194
+ int numSamples = static_cast<int>(wf_.sizes()[1]); // per channel
195
+ int numEncodedSamples = 0; // per channel
196
+ int numBytesPerSample = static_cast<int>(wf_.element_size());
197
+ int numBytesPerChannel = numSamples * numBytesPerSample;
198
+
199
+ status = avformat_write_header(avFormatContext_.get(), nullptr);
200
+ TORCH_CHECK(
201
+ status == AVSUCCESS,
202
+ "Error in avformat_write_header: ",
203
+ getFFMPEGErrorStringFromErrorCode(status));
204
+
205
+ while (numEncodedSamples < numSamples) {
206
+ status = av_frame_make_writable(avFrame.get());
207
+ TORCH_CHECK(
208
+ status == AVSUCCESS,
209
+ "Couldn't make AVFrame writable: ",
210
+ getFFMPEGErrorStringFromErrorCode(status));
211
+
212
+ int numSamplesToEncode =
213
+ std::min(numSamplesAllocatedPerFrame, numSamples - numEncodedSamples);
214
+ int numBytesToEncode = numSamplesToEncode * numBytesPerSample;
215
+
216
+ for (int ch = 0; ch < wf_.sizes()[0]; ch++) {
217
+ std::memcpy(
218
+ avFrame->data[ch], pwf + ch * numBytesPerChannel, numBytesToEncode);
219
+ }
220
+ pwf += numBytesToEncode;
221
+
222
+ // Above, we set the AVFrame's .nb_samples to AVCodecContext.frame_size so
223
+ // that the frame buffers are allocated to a big enough size. Here, we reset
224
+ // it to the exact number of samples that need to be encoded, otherwise the
225
+ // encoded frame would contain more samples than necessary and our results
226
+ // wouldn't match the ffmpeg CLI.
227
+ avFrame->nb_samples = numSamplesToEncode;
228
+ encodeInnerLoop(autoAVPacket, avFrame);
229
+
230
+ avFrame->pts += static_cast<int64_t>(numSamplesToEncode);
231
+ numEncodedSamples += numSamplesToEncode;
232
+ }
233
+ TORCH_CHECK(numEncodedSamples == numSamples, "Hmmmmmm something went wrong.");
234
+
235
+ flushBuffers();
236
+
237
+ status = av_write_trailer(avFormatContext_.get());
238
+ TORCH_CHECK(
239
+ status == AVSUCCESS,
240
+ "Error in: av_write_trailer",
241
+ getFFMPEGErrorStringFromErrorCode(status));
242
+ }
243
+
244
+ void AudioEncoder::encodeInnerLoop(
245
+ AutoAVPacket& autoAVPacket,
246
+ const UniqueAVFrame& srcAVFrame) {
247
+ bool mustConvert =
248
+ (avCodecContext_->sample_fmt != AV_SAMPLE_FMT_FLTP &&
249
+ srcAVFrame != nullptr);
250
+ UniqueAVFrame convertedAVFrame;
251
+ if (mustConvert) {
252
+ if (!swrContext_) {
253
+ swrContext_.reset(createSwrContext(
254
+ avCodecContext_,
255
+ AV_SAMPLE_FMT_FLTP,
256
+ avCodecContext_->sample_fmt,
257
+ srcAVFrame->sample_rate, // No sample rate conversion
258
+ srcAVFrame->sample_rate));
259
+ }
260
+ convertedAVFrame = convertAudioAVFrameSampleFormatAndSampleRate(
261
+ swrContext_,
262
+ srcAVFrame,
263
+ avCodecContext_->sample_fmt,
264
+ srcAVFrame->sample_rate, // No sample rate conversion
265
+ srcAVFrame->sample_rate);
266
+ TORCH_CHECK(
267
+ convertedAVFrame->nb_samples == srcAVFrame->nb_samples,
268
+ "convertedAVFrame->nb_samples=",
269
+ convertedAVFrame->nb_samples,
270
+ " differs from ",
271
+ "srcAVFrame->nb_samples=",
272
+ srcAVFrame->nb_samples,
273
+ "This is unexpected, please report on the TorchCodec bug tracker.");
274
+ }
275
+ const UniqueAVFrame& avFrame = mustConvert ? convertedAVFrame : srcAVFrame;
276
+
277
+ auto status = avcodec_send_frame(avCodecContext_.get(), avFrame.get());
278
+ TORCH_CHECK(
279
+ status == AVSUCCESS,
280
+ "Error while sending frame: ",
281
+ getFFMPEGErrorStringFromErrorCode(status));
282
+
283
+ while (status >= 0) {
284
+ ReferenceAVPacket packet(autoAVPacket);
285
+ status = avcodec_receive_packet(avCodecContext_.get(), packet.get());
286
+ if (status == AVERROR(EAGAIN) || status == AVERROR_EOF) {
287
+ // TODO-ENCODING this is from TorchAudio, probably needed, but not sure.
288
+ // if (status == AVERROR_EOF) {
289
+ // status = av_interleaved_write_frame(avFormatContext_.get(),
290
+ // nullptr); TORCH_CHECK(
291
+ // status == AVSUCCESS,
292
+ // "Failed to flush packet ",
293
+ // getFFMPEGErrorStringFromErrorCode(status));
294
+ // }
295
+ return;
296
+ }
297
+ TORCH_CHECK(
298
+ status >= 0,
299
+ "Error receiving packet: ",
300
+ getFFMPEGErrorStringFromErrorCode(status));
301
+
302
+ packet->stream_index = streamIndex_;
303
+
304
+ status = av_interleaved_write_frame(avFormatContext_.get(), packet.get());
305
+ TORCH_CHECK(
306
+ status == AVSUCCESS,
307
+ "Error in av_interleaved_write_frame: ",
308
+ getFFMPEGErrorStringFromErrorCode(status));
309
+ }
310
+ }
311
+
312
+ void AudioEncoder::flushBuffers() {
313
+ // We flush the main FFmpeg buffers, but not swresample buffers. Flushing
314
+ // swresample is only necessary when converting sample rates, which we don't
315
+ // do for encoding.
316
+ AutoAVPacket autoAVPacket;
317
+ encodeInnerLoop(autoAVPacket, UniqueAVFrame(nullptr));
318
+ }
319
+ } // namespace facebook::torchcodec
@@ -0,0 +1,39 @@
1
+ #pragma once
2
+ #include <torch/types.h>
3
+ #include "src/torchcodec/_core/FFMPEGCommon.h"
4
+
5
+ namespace facebook::torchcodec {
6
+ class AudioEncoder {
7
+ public:
8
+ ~AudioEncoder();
9
+
10
+ // TODO-ENCODING: document in public docs that bit_rate value is only
11
+ // best-effort, matching to the closest supported bit_rate. I.e. passing 1 is
12
+ // like passing 0, which results in choosing the minimum supported bit rate.
13
+ // Passing 44_100 could result in output being 44000 if only 44000 is
14
+ // supported.
15
+ AudioEncoder(
16
+ const torch::Tensor wf,
17
+ // The *output* sample rate. We can't really decide for the user what it
18
+ // should be. Particularly, the sample rate of the input waveform should
19
+ // match this, and that's up to the user. If sample rates don't match,
20
+ // encoding will still work but audio will be distorted.
21
+ int sampleRate,
22
+ std::string_view fileName,
23
+ std::optional<int64_t> bitRate = std::nullopt);
24
+ void encode();
25
+
26
+ private:
27
+ void encodeInnerLoop(
28
+ AutoAVPacket& autoAVPacket,
29
+ const UniqueAVFrame& srcAVFrame);
30
+ void flushBuffers();
31
+
32
+ UniqueEncodingAVFormatContext avFormatContext_;
33
+ UniqueAVCodecContext avCodecContext_;
34
+ int streamIndex_;
35
+ UniqueSwrContext swrContext_;
36
+
37
+ const torch::Tensor wf_;
38
+ };
39
+ } // namespace facebook::torchcodec
@@ -0,0 +1,264 @@
1
+ // Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ // All rights reserved.
3
+ //
4
+ // This source code is licensed under the BSD-style license found in the
5
+ // LICENSE file in the root directory of this source tree.
6
+
7
+ #include "src/torchcodec/_core/FFMPEGCommon.h"
8
+
9
+ #include <c10/util/Exception.h>
10
+
11
+ namespace facebook::torchcodec {
12
+
13
+ AutoAVPacket::AutoAVPacket() : avPacket_(av_packet_alloc()) {
14
+ TORCH_CHECK(avPacket_ != nullptr, "Couldn't allocate avPacket.");
15
+ }
16
+
17
+ AutoAVPacket::~AutoAVPacket() {
18
+ av_packet_free(&avPacket_);
19
+ }
20
+
21
+ ReferenceAVPacket::ReferenceAVPacket(AutoAVPacket& shared)
22
+ : avPacket_(shared.avPacket_) {}
23
+
24
+ ReferenceAVPacket::~ReferenceAVPacket() {
25
+ av_packet_unref(avPacket_);
26
+ }
27
+
28
+ AVPacket* ReferenceAVPacket::get() {
29
+ return avPacket_;
30
+ }
31
+
32
+ AVPacket* ReferenceAVPacket::operator->() {
33
+ return avPacket_;
34
+ }
35
+
36
+ AVCodecOnlyUseForCallingAVFindBestStream
37
+ makeAVCodecOnlyUseForCallingAVFindBestStream(const AVCodec* codec) {
38
+ #if LIBAVCODEC_VERSION_INT < AV_VERSION_INT(59, 18, 100)
39
+ return const_cast<AVCodec*>(codec);
40
+ #else
41
+ return codec;
42
+ #endif
43
+ }
44
+
45
+ std::string getFFMPEGErrorStringFromErrorCode(int errorCode) {
46
+ char errorBuffer[AV_ERROR_MAX_STRING_SIZE] = {0};
47
+ av_strerror(errorCode, errorBuffer, AV_ERROR_MAX_STRING_SIZE);
48
+ return std::string(errorBuffer);
49
+ }
50
+
51
+ int64_t getDuration(const UniqueAVFrame& avFrame) {
52
+ #if LIBAVUTIL_VERSION_MAJOR < 58
53
+ return avFrame->pkt_duration;
54
+ #else
55
+ return avFrame->duration;
56
+ #endif
57
+ }
58
+
59
+ int getNumChannels(const UniqueAVFrame& avFrame) {
60
+ #if LIBAVFILTER_VERSION_MAJOR > 8 || \
61
+ (LIBAVFILTER_VERSION_MAJOR == 8 && LIBAVFILTER_VERSION_MINOR >= 44)
62
+ return avFrame->ch_layout.nb_channels;
63
+ #else
64
+ return av_get_channel_layout_nb_channels(avFrame->channel_layout);
65
+ #endif
66
+ }
67
+
68
+ int getNumChannels(const UniqueAVCodecContext& avCodecContext) {
69
+ #if LIBAVFILTER_VERSION_MAJOR > 8 || \
70
+ (LIBAVFILTER_VERSION_MAJOR == 8 && LIBAVFILTER_VERSION_MINOR >= 44)
71
+ return avCodecContext->ch_layout.nb_channels;
72
+ #else
73
+ return avCodecContext->channels;
74
+ #endif
75
+ }
76
+
77
+ void setDefaultChannelLayout(
78
+ UniqueAVCodecContext& avCodecContext,
79
+ int numChannels) {
80
+ #if LIBAVFILTER_VERSION_MAJOR > 7 // FFmpeg > 4
81
+ AVChannelLayout channel_layout;
82
+ av_channel_layout_default(&channel_layout, numChannels);
83
+ avCodecContext->ch_layout = channel_layout;
84
+
85
+ #else
86
+ uint64_t channel_layout = av_get_default_channel_layout(numChannels);
87
+ avCodecContext->channel_layout = channel_layout;
88
+ avCodecContext->channels = numChannels;
89
+ #endif
90
+ }
91
+
92
+ void setChannelLayout(
93
+ UniqueAVFrame& dstAVFrame,
94
+ const UniqueAVCodecContext& avCodecContext) {
95
+ #if LIBAVFILTER_VERSION_MAJOR > 7 // FFmpeg > 4
96
+ auto status = av_channel_layout_copy(
97
+ &dstAVFrame->ch_layout, &avCodecContext->ch_layout);
98
+ TORCH_CHECK(
99
+ status == AVSUCCESS,
100
+ "Couldn't copy channel layout to avFrame: ",
101
+ getFFMPEGErrorStringFromErrorCode(status));
102
+ #else
103
+ dstAVFrame->channel_layout = avCodecContext->channel_layout;
104
+ dstAVFrame->channels = avCodecContext->channels;
105
+
106
+ #endif
107
+ }
108
+
109
+ void setChannelLayout(
110
+ UniqueAVFrame& dstAVFrame,
111
+ const UniqueAVFrame& srcAVFrame) {
112
+ #if LIBAVFILTER_VERSION_MAJOR > 7 // FFmpeg > 4
113
+ dstAVFrame->ch_layout = srcAVFrame->ch_layout;
114
+ #else
115
+ dstAVFrame->channel_layout = srcAVFrame->channel_layout;
116
+ #endif
117
+ }
118
+
119
+ SwrContext* createSwrContext(
120
+ UniqueAVCodecContext& avCodecContext,
121
+ AVSampleFormat sourceSampleFormat,
122
+ AVSampleFormat desiredSampleFormat,
123
+ int sourceSampleRate,
124
+ int desiredSampleRate) {
125
+ SwrContext* swrContext = nullptr;
126
+ int status = AVSUCCESS;
127
+ #if LIBAVFILTER_VERSION_MAJOR > 7 // FFmpeg > 4
128
+ AVChannelLayout layout = avCodecContext->ch_layout;
129
+ status = swr_alloc_set_opts2(
130
+ &swrContext,
131
+ &layout,
132
+ desiredSampleFormat,
133
+ desiredSampleRate,
134
+ &layout,
135
+ sourceSampleFormat,
136
+ sourceSampleRate,
137
+ 0,
138
+ nullptr);
139
+
140
+ TORCH_CHECK(
141
+ status == AVSUCCESS,
142
+ "Couldn't create SwrContext: ",
143
+ getFFMPEGErrorStringFromErrorCode(status));
144
+ #else
145
+ int64_t layout = static_cast<int64_t>(avCodecContext->channel_layout);
146
+ swrContext = swr_alloc_set_opts(
147
+ nullptr,
148
+ layout,
149
+ desiredSampleFormat,
150
+ desiredSampleRate,
151
+ layout,
152
+ sourceSampleFormat,
153
+ sourceSampleRate,
154
+ 0,
155
+ nullptr);
156
+ #endif
157
+
158
+ TORCH_CHECK(swrContext != nullptr, "Couldn't create swrContext");
159
+ status = swr_init(swrContext);
160
+ TORCH_CHECK(
161
+ status == AVSUCCESS,
162
+ "Couldn't initialize SwrContext: ",
163
+ getFFMPEGErrorStringFromErrorCode(status),
164
+ ". If the error says 'Invalid argument', it's likely that you are using "
165
+ "a buggy FFmpeg version. FFmpeg4 is known to fail here in some "
166
+ "valid scenarios. Try to upgrade FFmpeg?");
167
+ return swrContext;
168
+ }
169
+
170
+ UniqueAVFrame convertAudioAVFrameSampleFormatAndSampleRate(
171
+ const UniqueSwrContext& swrContext,
172
+ const UniqueAVFrame& srcAVFrame,
173
+ AVSampleFormat desiredSampleFormat,
174
+ int sourceSampleRate,
175
+ int desiredSampleRate) {
176
+ UniqueAVFrame convertedAVFrame(av_frame_alloc());
177
+ TORCH_CHECK(
178
+ convertedAVFrame,
179
+ "Could not allocate frame for sample format conversion.");
180
+
181
+ setChannelLayout(convertedAVFrame, srcAVFrame);
182
+ convertedAVFrame->format = static_cast<int>(desiredSampleFormat);
183
+ convertedAVFrame->sample_rate = desiredSampleRate;
184
+ if (sourceSampleRate != desiredSampleRate) {
185
+ // Note that this is an upper bound on the number of output samples.
186
+ // `swr_convert()` will likely not fill convertedAVFrame with that many
187
+ // samples if sample rate conversion is needed. It will buffer the last few
188
+ // ones because those require future samples. That's also why we reset
189
+ // nb_samples after the call to `swr_convert()`.
190
+ // We could also use `swr_get_out_samples()` to determine the number of
191
+ // output samples, but empirically `av_rescale_rnd()` seems to provide a
192
+ // tighter bound.
193
+ convertedAVFrame->nb_samples = av_rescale_rnd(
194
+ swr_get_delay(swrContext.get(), sourceSampleRate) +
195
+ srcAVFrame->nb_samples,
196
+ desiredSampleRate,
197
+ sourceSampleRate,
198
+ AV_ROUND_UP);
199
+ } else {
200
+ convertedAVFrame->nb_samples = srcAVFrame->nb_samples;
201
+ }
202
+
203
+ auto status = av_frame_get_buffer(convertedAVFrame.get(), 0);
204
+ TORCH_CHECK(
205
+ status == AVSUCCESS,
206
+ "Could not allocate frame buffers for sample format conversion: ",
207
+ getFFMPEGErrorStringFromErrorCode(status));
208
+
209
+ auto numConvertedSamples = swr_convert(
210
+ swrContext.get(),
211
+ convertedAVFrame->data,
212
+ convertedAVFrame->nb_samples,
213
+ static_cast<const uint8_t**>(
214
+ const_cast<const uint8_t**>(srcAVFrame->data)),
215
+ srcAVFrame->nb_samples);
216
+ // numConvertedSamples can be 0 if we're downsampling by a great factor and
217
+ // the first frame doesn't contain a lot of samples. It should be handled
218
+ // properly by the caller.
219
+ TORCH_CHECK(
220
+ numConvertedSamples >= 0,
221
+ "Error in swr_convert: ",
222
+ getFFMPEGErrorStringFromErrorCode(numConvertedSamples));
223
+
224
+ // See comment above about nb_samples
225
+ convertedAVFrame->nb_samples = numConvertedSamples;
226
+
227
+ return convertedAVFrame;
228
+ }
229
+
230
+ void setFFmpegLogLevel() {
231
+ auto logLevel = AV_LOG_QUIET;
232
+ const char* logLevelEnvPtr = std::getenv("TORCHCODEC_FFMPEG_LOG_LEVEL");
233
+ if (logLevelEnvPtr != nullptr) {
234
+ std::string logLevelEnv(logLevelEnvPtr);
235
+ if (logLevelEnv == "QUIET") {
236
+ logLevel = AV_LOG_QUIET;
237
+ } else if (logLevelEnv == "PANIC") {
238
+ logLevel = AV_LOG_PANIC;
239
+ } else if (logLevelEnv == "FATAL") {
240
+ logLevel = AV_LOG_FATAL;
241
+ } else if (logLevelEnv == "ERROR") {
242
+ logLevel = AV_LOG_ERROR;
243
+ } else if (logLevelEnv == "WARNING") {
244
+ logLevel = AV_LOG_WARNING;
245
+ } else if (logLevelEnv == "INFO") {
246
+ logLevel = AV_LOG_INFO;
247
+ } else if (logLevelEnv == "VERBOSE") {
248
+ logLevel = AV_LOG_VERBOSE;
249
+ } else if (logLevelEnv == "DEBUG") {
250
+ logLevel = AV_LOG_DEBUG;
251
+ } else if (logLevelEnv == "TRACE") {
252
+ logLevel = AV_LOG_TRACE;
253
+ } else {
254
+ TORCH_CHECK(
255
+ false,
256
+ "Invalid TORCHCODEC_FFMPEG_LOG_LEVEL: ",
257
+ logLevelEnv,
258
+ ". Use e.g. 'QUIET', 'PANIC', 'VERBOSE', etc.");
259
+ }
260
+ }
261
+ av_log_set_level(logLevel);
262
+ }
263
+
264
+ } // namespace facebook::torchcodec