torchcodec 0.3.0__cp310-cp310-manylinux_2_28_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of torchcodec might be problematic. Click here for more details.
- torchcodec/__init__.py +16 -0
- torchcodec/_core/AVIOBytesContext.cpp +70 -0
- torchcodec/_core/AVIOBytesContext.h +32 -0
- torchcodec/_core/AVIOContextHolder.cpp +50 -0
- torchcodec/_core/AVIOContextHolder.h +65 -0
- torchcodec/_core/AVIOFileLikeContext.cpp +80 -0
- torchcodec/_core/AVIOFileLikeContext.h +54 -0
- torchcodec/_core/CMakeLists.txt +237 -0
- torchcodec/_core/CudaDeviceInterface.cpp +289 -0
- torchcodec/_core/CudaDeviceInterface.h +34 -0
- torchcodec/_core/DeviceInterface.cpp +88 -0
- torchcodec/_core/DeviceInterface.h +66 -0
- torchcodec/_core/Encoder.cpp +319 -0
- torchcodec/_core/Encoder.h +39 -0
- torchcodec/_core/FFMPEGCommon.cpp +264 -0
- torchcodec/_core/FFMPEGCommon.h +180 -0
- torchcodec/_core/Frame.h +47 -0
- torchcodec/_core/Metadata.h +70 -0
- torchcodec/_core/SingleStreamDecoder.cpp +1947 -0
- torchcodec/_core/SingleStreamDecoder.h +462 -0
- torchcodec/_core/StreamOptions.h +49 -0
- torchcodec/_core/__init__.py +39 -0
- torchcodec/_core/_metadata.py +277 -0
- torchcodec/_core/custom_ops.cpp +681 -0
- torchcodec/_core/fetch_and_expose_non_gpl_ffmpeg_libs.cmake +226 -0
- torchcodec/_core/ops.py +381 -0
- torchcodec/_core/pybind_ops.cpp +45 -0
- torchcodec/_frame.py +145 -0
- torchcodec/_internally_replaced_utils.py +53 -0
- torchcodec/_samplers/__init__.py +7 -0
- torchcodec/_samplers/video_clip_sampler.py +430 -0
- torchcodec/decoders/__init__.py +11 -0
- torchcodec/decoders/_audio_decoder.py +168 -0
- torchcodec/decoders/_decoder_utils.py +52 -0
- torchcodec/decoders/_video_decoder.py +399 -0
- torchcodec/libtorchcodec_custom_ops4.so +0 -0
- torchcodec/libtorchcodec_custom_ops5.so +0 -0
- torchcodec/libtorchcodec_custom_ops6.so +0 -0
- torchcodec/libtorchcodec_custom_ops7.so +0 -0
- torchcodec/libtorchcodec_decoder4.so +0 -0
- torchcodec/libtorchcodec_decoder5.so +0 -0
- torchcodec/libtorchcodec_decoder6.so +0 -0
- torchcodec/libtorchcodec_decoder7.so +0 -0
- torchcodec/libtorchcodec_pybind_ops4.so +0 -0
- torchcodec/libtorchcodec_pybind_ops5.so +0 -0
- torchcodec/libtorchcodec_pybind_ops6.so +0 -0
- torchcodec/libtorchcodec_pybind_ops7.so +0 -0
- torchcodec/samplers/__init__.py +2 -0
- torchcodec/samplers/_common.py +84 -0
- torchcodec/samplers/_index_based.py +285 -0
- torchcodec/samplers/_time_based.py +348 -0
- torchcodec/version.py +2 -0
- torchcodec-0.3.0.dist-info/LICENSE +28 -0
- torchcodec-0.3.0.dist-info/METADATA +280 -0
- torchcodec-0.3.0.dist-info/RECORD +57 -0
- torchcodec-0.3.0.dist-info/WHEEL +5 -0
- torchcodec-0.3.0.dist-info/top_level.txt +2 -0
|
@@ -0,0 +1,319 @@
|
|
|
1
|
+
#include <algorithm>
#include <cstring>
#include <sstream>
#include <string>

#include "src/torchcodec/_core/Encoder.h"
#include "torch/types.h"
|
|
5
|
+
|
|
6
|
+
namespace facebook::torchcodec {
|
|
7
|
+
|
|
8
|
+
namespace {
|
|
9
|
+
|
|
10
|
+
// Raises if `sampleRate` is not one of the rates the codec advertises.
// A codec that advertises no list (null pointer) is accepted as-is. The
// advertised list is zero-terminated.
void validateSampleRate(const AVCodec& avCodec, int sampleRate) {
  const int* const advertisedRates = avCodec.supported_samplerates;
  if (advertisedRates == nullptr) {
    return;
  }

  for (const int* rate = advertisedRates; *rate != 0; ++rate) {
    if (*rate == sampleRate) {
      return;
    }
  }

  // No match: build a comma-separated listing for the error message.
  std::stringstream listing;
  const char* separator = "";
  for (const int* rate = advertisedRates; *rate != 0; ++rate) {
    listing << separator << *rate;
    separator = ", ";
  }

  TORCH_CHECK(
      false,
      "invalid sample rate=",
      sampleRate,
      ". Supported sample rate values are: ",
      listing.str());
}
|
|
35
|
+
|
|
36
|
+
// Output sample formats, most preferred first. Used by
// findBestOutputSampleFormat() to pick the first format the encoder supports.
const std::vector<AVSampleFormat> preferredFormatsOrder{
    AV_SAMPLE_FMT_FLTP,
    AV_SAMPLE_FMT_FLT,
    AV_SAMPLE_FMT_DBLP,
    AV_SAMPLE_FMT_DBL,
    AV_SAMPLE_FMT_S64P,
    AV_SAMPLE_FMT_S64,
    AV_SAMPLE_FMT_S32P,
    AV_SAMPLE_FMT_S32,
    AV_SAMPLE_FMT_S16P,
    AV_SAMPLE_FMT_S16,
    AV_SAMPLE_FMT_U8P,
    AV_SAMPLE_FMT_U8,
};
|
|
49
|
+
|
|
50
|
+
// Picks the sample format to encode into. FLT[P] is preferred because it is
// the format of the input waveform; otherwise the AVFrame's format has to be
// converted, and the heuristic is to take the highest-resolution format the
// encoder supports (see preferredFormatsOrder).
AVSampleFormat findBestOutputSampleFormat(const AVCodec& avCodec) {
  const AVSampleFormat* const supportedFormats = avCodec.sample_fmts;
  if (supportedFormats == nullptr) {
    // Nothing to validate against: hope that FLTP is supported by the
    // encoder. If it is not, FFmpeg will raise.
    return AV_SAMPLE_FMT_FLTP;
  }

  for (const AVSampleFormat candidate : preferredFormatsOrder) {
    // The codec's list is terminated by AV_SAMPLE_FMT_NONE (-1).
    for (const AVSampleFormat* fmt = supportedFormats;
         *fmt != AV_SAMPLE_FMT_NONE;
         ++fmt) {
      if (*fmt == candidate) {
        return candidate;
      }
    }
  }

  // A match in preferredFormatsOrder is expected, so this should be
  // unreachable. It is kept as a fallback in case a future FFmpeg version
  // defines a sample format that preferredFormatsOrder does not list.
  return supportedFormats[0];
}
|
|
73
|
+
|
|
74
|
+
} // namespace
|
|
75
|
+
|
|
76
|
+
// Closes the output AVIOContext that the constructor opened with avio_open().
// Freeing the AVFormatContext does not close `pb`, so without this the
// underlying file handle would leak. The return value is ignored because a
// destructor must not throw.
AudioEncoder::~AudioEncoder() {
  if (avFormatContext_.get() != nullptr && avFormatContext_->pb != nullptr) {
    avio_closep(&avFormatContext_->pb);
  }
}
|
|
77
|
+
|
|
78
|
+
// Sets up everything needed to encode `wf` into the file `fileName`:
// allocates the output format context (the container is guessed from the
// file name), opens the file for writing, opens the container's default audio
// encoder, and creates the output stream.
//
// wf: float32 waveform with 2 dimensions; dimension 0 is the channel count.
// sampleRate: output sample rate, validated against the codec's supported
//   rates when the codec advertises any.
// fileName: destination path.
// bitRate: optional, must be >= 0. Defaults to 0 when not provided.
//
// Raises (TORCH_CHECK) on invalid input or on any FFmpeg failure.
AudioEncoder::AudioEncoder(
    const torch::Tensor wf,
    int sampleRate,
    std::string_view fileName,
    std::optional<int64_t> bitRate)
    // encode() copies each channel out of the raw data pointer assuming a
    // contiguous (planar) layout, so store a contiguous version of the
    // waveform. This is a no-op for inputs that are already contiguous.
    : wf_(wf.contiguous()) {
  TORCH_CHECK(
      wf_.dtype() == torch::kFloat32,
      "waveform must have float32 dtype, got ",
      wf_.dtype());
  TORCH_CHECK(
      wf_.dim() == 2, "waveform must have 2 dimensions, got ", wf_.dim());

  // FFmpeg expects NUL-terminated C strings, which std::string_view::data()
  // does not guarantee. Make an owned, terminated copy.
  const std::string fileNameStr(fileName);

  setFFmpegLogLevel();
  AVFormatContext* avFormatContext = nullptr;
  auto status = avformat_alloc_output_context2(
      &avFormatContext, nullptr, nullptr, fileNameStr.c_str());
  TORCH_CHECK(
      avFormatContext != nullptr,
      "Couldn't allocate AVFormatContext. ",
      "Check the desired extension? ",
      getFFMPEGErrorStringFromErrorCode(status));
  avFormatContext_.reset(avFormatContext);

  // TODO-ENCODING: Should also support encoding into bytes (use
  // AVIOBytesContext)
  TORCH_CHECK(
      !(avFormatContext->oformat->flags & AVFMT_NOFILE),
      "AVFMT_NOFILE is set. We only support writing to a file.");
  status =
      avio_open(&avFormatContext_->pb, fileNameStr.c_str(), AVIO_FLAG_WRITE);
  TORCH_CHECK(
      status >= 0,
      "avio_open failed: ",
      getFFMPEGErrorStringFromErrorCode(status));

  // We use the AVFormatContext's default codec for that
  // specific format/container.
  const AVCodec* avCodec =
      avcodec_find_encoder(avFormatContext_->oformat->audio_codec);
  TORCH_CHECK(avCodec != nullptr, "Codec not found");

  AVCodecContext* avCodecContext = avcodec_alloc_context3(avCodec);
  TORCH_CHECK(avCodecContext != nullptr, "Couldn't allocate codec context.");
  avCodecContext_.reset(avCodecContext);

  if (bitRate.has_value()) {
    TORCH_CHECK(*bitRate >= 0, "bit_rate=", *bitRate, " must be >= 0.");
  }
  // bit_rate=None defaults to 0, which is what the FFmpeg CLI seems to use as
  // well when "-b:a" isn't specified.
  avCodecContext_->bit_rate = bitRate.value_or(0);

  validateSampleRate(*avCodec, sampleRate);
  avCodecContext_->sample_rate = sampleRate;

  // Input waveform is expected to be FLTP. Not all encoders support FLTP, so we
  // may need to convert the wf into a supported output sample format, which is
  // what the `.sample_fmt` defines.
  avCodecContext_->sample_fmt = findBestOutputSampleFormat(*avCodec);

  int numChannels = static_cast<int>(wf_.sizes()[0]);
  TORCH_CHECK(
      // TODO-ENCODING is this even true / needed? We can probably support more
      // with non-planar data?
      numChannels <= AV_NUM_DATA_POINTERS,
      "Trying to encode ",
      numChannels,
      " channels, but FFmpeg only supports ",
      AV_NUM_DATA_POINTERS,
      " channels per frame.");

  setDefaultChannelLayout(avCodecContext_, numChannels);

  status = avcodec_open2(avCodecContext_.get(), avCodec, nullptr);
  TORCH_CHECK(
      status == AVSUCCESS,
      "avcodec_open2 failed: ",
      getFFMPEGErrorStringFromErrorCode(status));

  // We're allocating the stream here. Streams are meant to be freed by
  // avformat_free_context(avFormatContext), which we call in the
  // avFormatContext_'s destructor.
  AVStream* avStream = avformat_new_stream(avFormatContext_.get(), nullptr);
  TORCH_CHECK(avStream != nullptr, "Couldn't create new stream.");
  status = avcodec_parameters_from_context(
      avStream->codecpar, avCodecContext_.get());
  TORCH_CHECK(
      status == AVSUCCESS,
      "avcodec_parameters_from_context failed: ",
      getFFMPEGErrorStringFromErrorCode(status));
  streamIndex_ = avStream->index;
}
|
|
172
|
+
|
|
173
|
+
void AudioEncoder::encode() {
|
|
174
|
+
UniqueAVFrame avFrame(av_frame_alloc());
|
|
175
|
+
TORCH_CHECK(avFrame != nullptr, "Couldn't allocate AVFrame.");
|
|
176
|
+
// Default to 256 like in torchaudio
|
|
177
|
+
int numSamplesAllocatedPerFrame =
|
|
178
|
+
avCodecContext_->frame_size > 0 ? avCodecContext_->frame_size : 256;
|
|
179
|
+
avFrame->nb_samples = numSamplesAllocatedPerFrame;
|
|
180
|
+
avFrame->format = AV_SAMPLE_FMT_FLTP;
|
|
181
|
+
avFrame->sample_rate = avCodecContext_->sample_rate;
|
|
182
|
+
avFrame->pts = 0;
|
|
183
|
+
setChannelLayout(avFrame, avCodecContext_);
|
|
184
|
+
|
|
185
|
+
auto status = av_frame_get_buffer(avFrame.get(), 0);
|
|
186
|
+
TORCH_CHECK(
|
|
187
|
+
status == AVSUCCESS,
|
|
188
|
+
"Couldn't allocate avFrame's buffers: ",
|
|
189
|
+
getFFMPEGErrorStringFromErrorCode(status));
|
|
190
|
+
|
|
191
|
+
AutoAVPacket autoAVPacket;
|
|
192
|
+
|
|
193
|
+
uint8_t* pwf = static_cast<uint8_t*>(wf_.data_ptr());
|
|
194
|
+
int numSamples = static_cast<int>(wf_.sizes()[1]); // per channel
|
|
195
|
+
int numEncodedSamples = 0; // per channel
|
|
196
|
+
int numBytesPerSample = static_cast<int>(wf_.element_size());
|
|
197
|
+
int numBytesPerChannel = numSamples * numBytesPerSample;
|
|
198
|
+
|
|
199
|
+
status = avformat_write_header(avFormatContext_.get(), nullptr);
|
|
200
|
+
TORCH_CHECK(
|
|
201
|
+
status == AVSUCCESS,
|
|
202
|
+
"Error in avformat_write_header: ",
|
|
203
|
+
getFFMPEGErrorStringFromErrorCode(status));
|
|
204
|
+
|
|
205
|
+
while (numEncodedSamples < numSamples) {
|
|
206
|
+
status = av_frame_make_writable(avFrame.get());
|
|
207
|
+
TORCH_CHECK(
|
|
208
|
+
status == AVSUCCESS,
|
|
209
|
+
"Couldn't make AVFrame writable: ",
|
|
210
|
+
getFFMPEGErrorStringFromErrorCode(status));
|
|
211
|
+
|
|
212
|
+
int numSamplesToEncode =
|
|
213
|
+
std::min(numSamplesAllocatedPerFrame, numSamples - numEncodedSamples);
|
|
214
|
+
int numBytesToEncode = numSamplesToEncode * numBytesPerSample;
|
|
215
|
+
|
|
216
|
+
for (int ch = 0; ch < wf_.sizes()[0]; ch++) {
|
|
217
|
+
std::memcpy(
|
|
218
|
+
avFrame->data[ch], pwf + ch * numBytesPerChannel, numBytesToEncode);
|
|
219
|
+
}
|
|
220
|
+
pwf += numBytesToEncode;
|
|
221
|
+
|
|
222
|
+
// Above, we set the AVFrame's .nb_samples to AVCodecContext.frame_size so
|
|
223
|
+
// that the frame buffers are allocated to a big enough size. Here, we reset
|
|
224
|
+
// it to the exact number of samples that need to be encoded, otherwise the
|
|
225
|
+
// encoded frame would contain more samples than necessary and our results
|
|
226
|
+
// wouldn't match the ffmpeg CLI.
|
|
227
|
+
avFrame->nb_samples = numSamplesToEncode;
|
|
228
|
+
encodeInnerLoop(autoAVPacket, avFrame);
|
|
229
|
+
|
|
230
|
+
avFrame->pts += static_cast<int64_t>(numSamplesToEncode);
|
|
231
|
+
numEncodedSamples += numSamplesToEncode;
|
|
232
|
+
}
|
|
233
|
+
TORCH_CHECK(numEncodedSamples == numSamples, "Hmmmmmm something went wrong.");
|
|
234
|
+
|
|
235
|
+
flushBuffers();
|
|
236
|
+
|
|
237
|
+
status = av_write_trailer(avFormatContext_.get());
|
|
238
|
+
TORCH_CHECK(
|
|
239
|
+
status == AVSUCCESS,
|
|
240
|
+
"Error in: av_write_trailer",
|
|
241
|
+
getFFMPEGErrorStringFromErrorCode(status));
|
|
242
|
+
}
|
|
243
|
+
|
|
244
|
+
// Sends one frame to the encoder and writes every packet the encoder produces
// in response to the output container.
//
// autoAVPacket: packet storage reused for each received packet.
// srcAVFrame: FLTP frame to encode, or a null frame to flush the encoder.
//
// When the encoder's sample format is not FLTP, a non-null frame is first
// converted to the encoder's format (same sample rate). The SwrContext used
// for that is created on first use and then reused.
//
// Raises (TORCH_CHECK) on any FFmpeg failure.
void AudioEncoder::encodeInnerLoop(
    AutoAVPacket& autoAVPacket,
    const UniqueAVFrame& srcAVFrame) {
  bool mustConvert =
      (avCodecContext_->sample_fmt != AV_SAMPLE_FMT_FLTP &&
       srcAVFrame != nullptr);
  UniqueAVFrame convertedAVFrame;
  if (mustConvert) {
    if (!swrContext_) {
      swrContext_.reset(createSwrContext(
          avCodecContext_,
          AV_SAMPLE_FMT_FLTP,
          avCodecContext_->sample_fmt,
          srcAVFrame->sample_rate, // No sample rate conversion
          srcAVFrame->sample_rate));
    }
    convertedAVFrame = convertAudioAVFrameSampleFormatAndSampleRate(
        swrContext_,
        srcAVFrame,
        avCodecContext_->sample_fmt,
        srcAVFrame->sample_rate, // No sample rate conversion
        srcAVFrame->sample_rate);
    TORCH_CHECK(
        convertedAVFrame->nb_samples == srcAVFrame->nb_samples,
        "convertedAVFrame->nb_samples=",
        convertedAVFrame->nb_samples,
        " differs from ",
        "srcAVFrame->nb_samples=",
        srcAVFrame->nb_samples,
        ". This is unexpected, please report on the TorchCodec bug tracker.");
  }
  const UniqueAVFrame& avFrame = mustConvert ? convertedAVFrame : srcAVFrame;

  auto status = avcodec_send_frame(avCodecContext_.get(), avFrame.get());
  TORCH_CHECK(
      status == AVSUCCESS,
      "Error while sending frame: ",
      getFFMPEGErrorStringFromErrorCode(status));

  while (status >= 0) {
    ReferenceAVPacket packet(autoAVPacket);
    status = avcodec_receive_packet(avCodecContext_.get(), packet.get());
    if (status == AVERROR(EAGAIN) || status == AVERROR_EOF) {
      // The encoder needs more input (EAGAIN) or is fully drained (EOF).
      // TODO-ENCODING this is from TorchAudio, probably needed, but not sure.
      //   if (status == AVERROR_EOF) {
      //     status = av_interleaved_write_frame(avFormatContext_.get(),
      //     nullptr); TORCH_CHECK(
      //         status == AVSUCCESS,
      //         "Failed to flush packet ",
      //         getFFMPEGErrorStringFromErrorCode(status));
      //   }
      return;
    }
    TORCH_CHECK(
        status >= 0,
        "Error receiving packet: ",
        getFFMPEGErrorStringFromErrorCode(status));

    packet->stream_index = streamIndex_;

    status = av_interleaved_write_frame(avFormatContext_.get(), packet.get());
    TORCH_CHECK(
        status == AVSUCCESS,
        "Error in av_interleaved_write_frame: ",
        getFFMPEGErrorStringFromErrorCode(status));
  }
}
|
|
311
|
+
|
|
312
|
+
void AudioEncoder::flushBuffers() {
|
|
313
|
+
// We flush the main FFmpeg buffers, but not swresample buffers. Flushing
|
|
314
|
+
// swresample is only necessary when converting sample rates, which we don't
|
|
315
|
+
// do for encoding.
|
|
316
|
+
AutoAVPacket autoAVPacket;
|
|
317
|
+
encodeInnerLoop(autoAVPacket, UniqueAVFrame(nullptr));
|
|
318
|
+
}
|
|
319
|
+
} // namespace facebook::torchcodec
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
#pragma once
|
|
2
|
+
#include <torch/types.h>
|
|
3
|
+
#include "src/torchcodec/_core/FFMPEGCommon.h"
|
|
4
|
+
|
|
5
|
+
namespace facebook::torchcodec {
|
|
6
|
+
class AudioEncoder {
|
|
7
|
+
public:
|
|
8
|
+
~AudioEncoder();
|
|
9
|
+
|
|
10
|
+
// TODO-ENCODING: document in public docs that bit_rate value is only
|
|
11
|
+
// best-effort, matching to the closest supported bit_rate. I.e. passing 1 is
|
|
12
|
+
// like passing 0, which results in choosing the minimum supported bit rate.
|
|
13
|
+
// Passing 44_100 could result in output being 44000 if only 44000 is
|
|
14
|
+
// supported.
|
|
15
|
+
AudioEncoder(
|
|
16
|
+
const torch::Tensor wf,
|
|
17
|
+
// The *output* sample rate. We can't really decide for the user what it
|
|
18
|
+
// should be. Particularly, the sample rate of the input waveform should
|
|
19
|
+
// match this, and that's up to the user. If sample rates don't match,
|
|
20
|
+
// encoding will still work but audio will be distorted.
|
|
21
|
+
int sampleRate,
|
|
22
|
+
std::string_view fileName,
|
|
23
|
+
std::optional<int64_t> bitRate = std::nullopt);
|
|
24
|
+
void encode();
|
|
25
|
+
|
|
26
|
+
private:
|
|
27
|
+
void encodeInnerLoop(
|
|
28
|
+
AutoAVPacket& autoAVPacket,
|
|
29
|
+
const UniqueAVFrame& srcAVFrame);
|
|
30
|
+
void flushBuffers();
|
|
31
|
+
|
|
32
|
+
UniqueEncodingAVFormatContext avFormatContext_;
|
|
33
|
+
UniqueAVCodecContext avCodecContext_;
|
|
34
|
+
int streamIndex_;
|
|
35
|
+
UniqueSwrContext swrContext_;
|
|
36
|
+
|
|
37
|
+
const torch::Tensor wf_;
|
|
38
|
+
};
|
|
39
|
+
} // namespace facebook::torchcodec
|
|
@@ -0,0 +1,264 @@
|
|
|
1
|
+
// Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
2
|
+
// All rights reserved.
|
|
3
|
+
//
|
|
4
|
+
// This source code is licensed under the BSD-style license found in the
|
|
5
|
+
// LICENSE file in the root directory of this source tree.
|
|
6
|
+
|
|
7
|
+
#include "src/torchcodec/_core/FFMPEGCommon.h"
|
|
8
|
+
|
|
9
|
+
#include <c10/util/Exception.h>
|
|
10
|
+
|
|
11
|
+
namespace facebook::torchcodec {
|
|
12
|
+
|
|
13
|
+
// Allocates the AVPacket owned by this object; raises if allocation fails.
AutoAVPacket::AutoAVPacket() : avPacket_(av_packet_alloc()) {
  const bool allocated = (avPacket_ != nullptr);
  TORCH_CHECK(allocated, "Couldn't allocate avPacket.");
}

// Frees the owned AVPacket.
AutoAVPacket::~AutoAVPacket() {
  av_packet_free(&avPacket_);
}
|
|
20
|
+
|
|
21
|
+
ReferenceAVPacket::ReferenceAVPacket(AutoAVPacket& shared)
|
|
22
|
+
: avPacket_(shared.avPacket_) {}
|
|
23
|
+
|
|
24
|
+
ReferenceAVPacket::~ReferenceAVPacket() {
|
|
25
|
+
av_packet_unref(avPacket_);
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
AVPacket* ReferenceAVPacket::get() {
|
|
29
|
+
return avPacket_;
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
AVPacket* ReferenceAVPacket::operator->() {
|
|
33
|
+
return avPacket_;
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
// Converts a `const AVCodec*` into the AVCodecOnlyUseForCallingAVFindBestStream
// type. With libavcodec >= 59.18.100 the pointer is returned unchanged; with
// older versions its constness has to be cast away.
AVCodecOnlyUseForCallingAVFindBestStream
makeAVCodecOnlyUseForCallingAVFindBestStream(const AVCodec* codec) {
#if LIBAVCODEC_VERSION_INT >= AV_VERSION_INT(59, 18, 100)
  return codec;
#else
  return const_cast<AVCodec*>(codec);
#endif
}
|
|
44
|
+
|
|
45
|
+
std::string getFFMPEGErrorStringFromErrorCode(int errorCode) {
|
|
46
|
+
char errorBuffer[AV_ERROR_MAX_STRING_SIZE] = {0};
|
|
47
|
+
av_strerror(errorCode, errorBuffer, AV_ERROR_MAX_STRING_SIZE);
|
|
48
|
+
return std::string(errorBuffer);
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
int64_t getDuration(const UniqueAVFrame& avFrame) {
|
|
52
|
+
#if LIBAVUTIL_VERSION_MAJOR < 58
|
|
53
|
+
return avFrame->pkt_duration;
|
|
54
|
+
#else
|
|
55
|
+
return avFrame->duration;
|
|
56
|
+
#endif
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
int getNumChannels(const UniqueAVFrame& avFrame) {
|
|
60
|
+
#if LIBAVFILTER_VERSION_MAJOR > 8 || \
|
|
61
|
+
(LIBAVFILTER_VERSION_MAJOR == 8 && LIBAVFILTER_VERSION_MINOR >= 44)
|
|
62
|
+
return avFrame->ch_layout.nb_channels;
|
|
63
|
+
#else
|
|
64
|
+
return av_get_channel_layout_nb_channels(avFrame->channel_layout);
|
|
65
|
+
#endif
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
int getNumChannels(const UniqueAVCodecContext& avCodecContext) {
|
|
69
|
+
#if LIBAVFILTER_VERSION_MAJOR > 8 || \
|
|
70
|
+
(LIBAVFILTER_VERSION_MAJOR == 8 && LIBAVFILTER_VERSION_MINOR >= 44)
|
|
71
|
+
return avCodecContext->ch_layout.nb_channels;
|
|
72
|
+
#else
|
|
73
|
+
return avCodecContext->channels;
|
|
74
|
+
#endif
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
// Sets the codec context's channel layout to FFmpeg's default layout for
// `numChannels` channels. With FFmpeg 4 the channel count is set as well.
void setDefaultChannelLayout(
    UniqueAVCodecContext& avCodecContext,
    int numChannels) {
#if LIBAVFILTER_VERSION_MAJOR > 7 // FFmpeg > 4
  // Value-initialized so that no field is left indeterminate if
  // av_channel_layout_default() only fills in part of the struct.
  AVChannelLayout channel_layout = {};
  av_channel_layout_default(&channel_layout, numChannels);
  avCodecContext->ch_layout = channel_layout;

#else
  uint64_t channel_layout = av_get_default_channel_layout(numChannels);
  avCodecContext->channel_layout = channel_layout;
  avCodecContext->channels = numChannels;
#endif
}
|
|
91
|
+
|
|
92
|
+
// Gives `dstAVFrame` the same channel layout as the codec context. With
// FFmpeg > 4 the layout is copied with av_channel_layout_copy(), raising on
// failure; with FFmpeg 4 the layout mask and the channel count are assigned.
void setChannelLayout(
    UniqueAVFrame& dstAVFrame,
    const UniqueAVCodecContext& avCodecContext) {
#if LIBAVFILTER_VERSION_MAJOR > 7 // FFmpeg > 4
  const int copyStatus = av_channel_layout_copy(
      &dstAVFrame->ch_layout, &avCodecContext->ch_layout);
  TORCH_CHECK(
      copyStatus == AVSUCCESS,
      "Couldn't copy channel layout to avFrame: ",
      getFFMPEGErrorStringFromErrorCode(copyStatus));
#else
  dstAVFrame->channels = avCodecContext->channels;
  dstAVFrame->channel_layout = avCodecContext->channel_layout;
#endif
}
|
|
108
|
+
|
|
109
|
+
// Gives `dstAVFrame` the same channel layout as `srcAVFrame`, mirroring the
// overload that takes a codec context: with FFmpeg > 4 the layout is copied
// with av_channel_layout_copy() rather than a plain struct assignment, and
// with FFmpeg 4 both the layout mask and the channel count are carried over.
//
// Raises (TORCH_CHECK) if the layout cannot be copied.
void setChannelLayout(
    UniqueAVFrame& dstAVFrame,
    const UniqueAVFrame& srcAVFrame) {
#if LIBAVFILTER_VERSION_MAJOR > 7 // FFmpeg > 4
  auto status =
      av_channel_layout_copy(&dstAVFrame->ch_layout, &srcAVFrame->ch_layout);
  TORCH_CHECK(
      status == AVSUCCESS,
      "Couldn't copy channel layout to avFrame: ",
      getFFMPEGErrorStringFromErrorCode(status));
#else
  dstAVFrame->channel_layout = srcAVFrame->channel_layout;
  dstAVFrame->channels = srcAVFrame->channels;
#endif
}
|
|
118
|
+
|
|
119
|
+
// Allocates and initializes a SwrContext that converts audio from
// (sourceSampleFormat, sourceSampleRate) to (desiredSampleFormat,
// desiredSampleRate). The codec context's channel layout is used for both
// input and output.
//
// Returns a raw pointer that the caller owns. Raises (TORCH_CHECK) if the
// context cannot be created or initialized; nothing is leaked in that case.
SwrContext* createSwrContext(
    UniqueAVCodecContext& avCodecContext,
    AVSampleFormat sourceSampleFormat,
    AVSampleFormat desiredSampleFormat,
    int sourceSampleRate,
    int desiredSampleRate) {
  SwrContext* swrContext = nullptr;
  int status = AVSUCCESS;
#if LIBAVFILTER_VERSION_MAJOR > 7 // FFmpeg > 4
  AVChannelLayout layout = avCodecContext->ch_layout;
  status = swr_alloc_set_opts2(
      &swrContext,
      &layout,
      desiredSampleFormat,
      desiredSampleRate,
      &layout,
      sourceSampleFormat,
      sourceSampleRate,
      0,
      nullptr);

  TORCH_CHECK(
      status == AVSUCCESS,
      "Couldn't create SwrContext: ",
      getFFMPEGErrorStringFromErrorCode(status));
#else
  int64_t layout = static_cast<int64_t>(avCodecContext->channel_layout);
  swrContext = swr_alloc_set_opts(
      nullptr,
      layout,
      desiredSampleFormat,
      desiredSampleRate,
      layout,
      sourceSampleFormat,
      sourceSampleRate,
      0,
      nullptr);
#endif

  TORCH_CHECK(swrContext != nullptr, "Couldn't create swrContext");
  status = swr_init(swrContext);
  if (status != AVSUCCESS) {
    // The context is still held by a raw pointer here, so it must be freed
    // before TORCH_CHECK throws.
    swr_free(&swrContext);
  }
  TORCH_CHECK(
      status == AVSUCCESS,
      "Couldn't initialize SwrContext: ",
      getFFMPEGErrorStringFromErrorCode(status),
      ". If the error says 'Invalid argument', it's likely that you are using "
      "a buggy FFmpeg version. FFmpeg4 is known to fail here in some "
      "valid scenarios. Try to upgrade FFmpeg?");
  return swrContext;
}
|
|
169
|
+
|
|
170
|
+
// Converts `srcAVFrame` to `desiredSampleFormat` / `desiredSampleRate` with
// swr_convert() and returns the result as a newly allocated frame carrying
// the source frame's channel layout.
//
// The returned frame's nb_samples is the number of samples swr_convert()
// actually produced. It can be 0, e.g. when downsampling by a great factor
// and the first frame doesn't contain a lot of samples; the caller should
// handle that properly.
//
// Raises (TORCH_CHECK) if allocation or conversion fails.
UniqueAVFrame convertAudioAVFrameSampleFormatAndSampleRate(
    const UniqueSwrContext& swrContext,
    const UniqueAVFrame& srcAVFrame,
    AVSampleFormat desiredSampleFormat,
    int sourceSampleRate,
    int desiredSampleRate) {
  UniqueAVFrame dstAVFrame(av_frame_alloc());
  TORCH_CHECK(
      dstAVFrame, "Could not allocate frame for sample format conversion.");

  setChannelLayout(dstAVFrame, srcAVFrame);
  dstAVFrame->format = static_cast<int>(desiredSampleFormat);
  dstAVFrame->sample_rate = desiredSampleRate;

  const bool sameSampleRate = (sourceSampleRate == desiredSampleRate);
  if (sameSampleRate) {
    dstAVFrame->nb_samples = srcAVFrame->nb_samples;
  } else {
    // This is only an upper bound on the number of output samples. When the
    // sample rate changes, `swr_convert()` will likely produce fewer samples:
    // it buffers the last few ones because those require future samples.
    // That is why nb_samples is overwritten after the `swr_convert()` call
    // below. `swr_get_out_samples()` could be used for the bound too, but
    // empirically `av_rescale_rnd()` seems to provide a tighter one.
    dstAVFrame->nb_samples = av_rescale_rnd(
        swr_get_delay(swrContext.get(), sourceSampleRate) +
            srcAVFrame->nb_samples,
        desiredSampleRate,
        sourceSampleRate,
        AV_ROUND_UP);
  }

  const auto allocStatus = av_frame_get_buffer(dstAVFrame.get(), 0);
  TORCH_CHECK(
      allocStatus == AVSUCCESS,
      "Could not allocate frame buffers for sample format conversion: ",
      getFFMPEGErrorStringFromErrorCode(allocStatus));

  const auto numConvertedSamples = swr_convert(
      swrContext.get(),
      dstAVFrame->data,
      dstAVFrame->nb_samples,
      const_cast<const uint8_t**>(srcAVFrame->data),
      srcAVFrame->nb_samples);
  // Negative values are error codes; 0 is a valid result (see above).
  TORCH_CHECK(
      numConvertedSamples >= 0,
      "Error in swr_convert: ",
      getFFMPEGErrorStringFromErrorCode(numConvertedSamples));

  // Replace the upper bound with the real number of converted samples.
  dstAVFrame->nb_samples = numConvertedSamples;

  return dstAVFrame;
}
|
|
229
|
+
|
|
230
|
+
void setFFmpegLogLevel() {
|
|
231
|
+
auto logLevel = AV_LOG_QUIET;
|
|
232
|
+
const char* logLevelEnvPtr = std::getenv("TORCHCODEC_FFMPEG_LOG_LEVEL");
|
|
233
|
+
if (logLevelEnvPtr != nullptr) {
|
|
234
|
+
std::string logLevelEnv(logLevelEnvPtr);
|
|
235
|
+
if (logLevelEnv == "QUIET") {
|
|
236
|
+
logLevel = AV_LOG_QUIET;
|
|
237
|
+
} else if (logLevelEnv == "PANIC") {
|
|
238
|
+
logLevel = AV_LOG_PANIC;
|
|
239
|
+
} else if (logLevelEnv == "FATAL") {
|
|
240
|
+
logLevel = AV_LOG_FATAL;
|
|
241
|
+
} else if (logLevelEnv == "ERROR") {
|
|
242
|
+
logLevel = AV_LOG_ERROR;
|
|
243
|
+
} else if (logLevelEnv == "WARNING") {
|
|
244
|
+
logLevel = AV_LOG_WARNING;
|
|
245
|
+
} else if (logLevelEnv == "INFO") {
|
|
246
|
+
logLevel = AV_LOG_INFO;
|
|
247
|
+
} else if (logLevelEnv == "VERBOSE") {
|
|
248
|
+
logLevel = AV_LOG_VERBOSE;
|
|
249
|
+
} else if (logLevelEnv == "DEBUG") {
|
|
250
|
+
logLevel = AV_LOG_DEBUG;
|
|
251
|
+
} else if (logLevelEnv == "TRACE") {
|
|
252
|
+
logLevel = AV_LOG_TRACE;
|
|
253
|
+
} else {
|
|
254
|
+
TORCH_CHECK(
|
|
255
|
+
false,
|
|
256
|
+
"Invalid TORCHCODEC_FFMPEG_LOG_LEVEL: ",
|
|
257
|
+
logLevelEnv,
|
|
258
|
+
". Use e.g. 'QUIET', 'PANIC', 'VERBOSE', etc.");
|
|
259
|
+
}
|
|
260
|
+
}
|
|
261
|
+
av_log_set_level(logLevel);
|
|
262
|
+
}
|
|
263
|
+
|
|
264
|
+
} // namespace facebook::torchcodec
|