torchcodec 0.3.0__cp39-cp39-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of torchcodec might be problematic. Click here for more details.

Files changed (59) hide show
  1. torchcodec/.dylibs/libc++.1.0.dylib +0 -0
  2. torchcodec/.dylibs/libpython3.9.dylib +0 -0
  3. torchcodec/__init__.py +16 -0
  4. torchcodec/_core/AVIOBytesContext.cpp +70 -0
  5. torchcodec/_core/AVIOBytesContext.h +32 -0
  6. torchcodec/_core/AVIOContextHolder.cpp +50 -0
  7. torchcodec/_core/AVIOContextHolder.h +65 -0
  8. torchcodec/_core/AVIOFileLikeContext.cpp +80 -0
  9. torchcodec/_core/AVIOFileLikeContext.h +54 -0
  10. torchcodec/_core/CMakeLists.txt +237 -0
  11. torchcodec/_core/CudaDeviceInterface.cpp +289 -0
  12. torchcodec/_core/CudaDeviceInterface.h +34 -0
  13. torchcodec/_core/DeviceInterface.cpp +88 -0
  14. torchcodec/_core/DeviceInterface.h +66 -0
  15. torchcodec/_core/Encoder.cpp +319 -0
  16. torchcodec/_core/Encoder.h +39 -0
  17. torchcodec/_core/FFMPEGCommon.cpp +264 -0
  18. torchcodec/_core/FFMPEGCommon.h +180 -0
  19. torchcodec/_core/Frame.h +47 -0
  20. torchcodec/_core/Metadata.h +70 -0
  21. torchcodec/_core/SingleStreamDecoder.cpp +1947 -0
  22. torchcodec/_core/SingleStreamDecoder.h +462 -0
  23. torchcodec/_core/StreamOptions.h +49 -0
  24. torchcodec/_core/__init__.py +39 -0
  25. torchcodec/_core/_metadata.py +277 -0
  26. torchcodec/_core/custom_ops.cpp +681 -0
  27. torchcodec/_core/fetch_and_expose_non_gpl_ffmpeg_libs.cmake +226 -0
  28. torchcodec/_core/ops.py +381 -0
  29. torchcodec/_core/pybind_ops.cpp +45 -0
  30. torchcodec/_frame.py +145 -0
  31. torchcodec/_internally_replaced_utils.py +53 -0
  32. torchcodec/_samplers/__init__.py +7 -0
  33. torchcodec/_samplers/video_clip_sampler.py +430 -0
  34. torchcodec/decoders/__init__.py +11 -0
  35. torchcodec/decoders/_audio_decoder.py +168 -0
  36. torchcodec/decoders/_decoder_utils.py +52 -0
  37. torchcodec/decoders/_video_decoder.py +399 -0
  38. torchcodec/libtorchcodec_custom_ops4.dylib +0 -0
  39. torchcodec/libtorchcodec_custom_ops5.dylib +0 -0
  40. torchcodec/libtorchcodec_custom_ops6.dylib +0 -0
  41. torchcodec/libtorchcodec_custom_ops7.dylib +0 -0
  42. torchcodec/libtorchcodec_decoder4.dylib +0 -0
  43. torchcodec/libtorchcodec_decoder5.dylib +0 -0
  44. torchcodec/libtorchcodec_decoder6.dylib +0 -0
  45. torchcodec/libtorchcodec_decoder7.dylib +0 -0
  46. torchcodec/libtorchcodec_pybind_ops4.so +0 -0
  47. torchcodec/libtorchcodec_pybind_ops5.so +0 -0
  48. torchcodec/libtorchcodec_pybind_ops6.so +0 -0
  49. torchcodec/libtorchcodec_pybind_ops7.so +0 -0
  50. torchcodec/samplers/__init__.py +2 -0
  51. torchcodec/samplers/_common.py +84 -0
  52. torchcodec/samplers/_index_based.py +285 -0
  53. torchcodec/samplers/_time_based.py +348 -0
  54. torchcodec/version.py +2 -0
  55. torchcodec-0.3.0.dist-info/LICENSE +28 -0
  56. torchcodec-0.3.0.dist-info/METADATA +280 -0
  57. torchcodec-0.3.0.dist-info/RECORD +59 -0
  58. torchcodec-0.3.0.dist-info/WHEEL +5 -0
  59. torchcodec-0.3.0.dist-info/top_level.txt +2 -0
@@ -0,0 +1,1947 @@
1
+ // Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ // All rights reserved.
3
+ //
4
+ // This source code is licensed under the BSD-style license found in the
5
+ // LICENSE file in the root directory of this source tree.
6
+
7
+ #include "src/torchcodec/_core/SingleStreamDecoder.h"
8
+ #include <cstdint>
9
+ #include <cstdio>
10
+ #include <iostream>
11
+ #include <limits>
12
+ #include <sstream>
13
+ #include <stdexcept>
14
+ #include <string_view>
15
+ #include "torch/types.h"
16
+
17
+ extern "C" {
18
+ #include <libavfilter/buffersink.h>
19
+ #include <libavfilter/buffersrc.h>
20
+ #include <libavutil/imgutils.h>
21
+ #include <libavutil/log.h>
22
+ }
23
+
24
+ namespace facebook::torchcodec {
25
+ namespace {
26
+
27
// Converts a timestamp counted in ticks of 1/den seconds into seconds.
double ptsToSeconds(int64_t pts, int den) {
  const auto ticks = static_cast<double>(pts);
  return ticks / den;
}
30
+
31
+ double ptsToSeconds(int64_t pts, const AVRational& timeBase) {
32
+ return ptsToSeconds(pts, timeBase.den);
33
+ }
34
+
35
+ int64_t secondsToClosestPts(double seconds, const AVRational& timeBase) {
36
+ return static_cast<int64_t>(std::round(seconds * timeBase.den));
37
+ }
38
+
39
+ } // namespace
40
+
41
+ // --------------------------------------------------------------------------
42
+ // CONSTRUCTORS, INITIALIZATION, DESTRUCTORS
43
+ // --------------------------------------------------------------------------
44
+
45
+ SingleStreamDecoder::SingleStreamDecoder(
46
+ const std::string& videoFilePath,
47
+ SeekMode seekMode)
48
+ : seekMode_(seekMode) {
49
+ setFFmpegLogLevel();
50
+
51
+ AVFormatContext* rawContext = nullptr;
52
+ int status =
53
+ avformat_open_input(&rawContext, videoFilePath.c_str(), nullptr, nullptr);
54
+ TORCH_CHECK(
55
+ status == 0,
56
+ "Could not open input file: " + videoFilePath + " " +
57
+ getFFMPEGErrorStringFromErrorCode(status));
58
+ TORCH_CHECK(rawContext != nullptr);
59
+ formatContext_.reset(rawContext);
60
+
61
+ initializeDecoder();
62
+ }
63
+
64
+ SingleStreamDecoder::SingleStreamDecoder(
65
+ std::unique_ptr<AVIOContextHolder> context,
66
+ SeekMode seekMode)
67
+ : seekMode_(seekMode), avioContextHolder_(std::move(context)) {
68
+ setFFmpegLogLevel();
69
+
70
+ TORCH_CHECK(avioContextHolder_, "Context holder cannot be null");
71
+
72
+ // Because FFmpeg requires a reference to a pointer in the call to open, we
73
+ // can't use a unique pointer here. Note that means we must call free if open
74
+ // fails.
75
+ AVFormatContext* rawContext = avformat_alloc_context();
76
+ TORCH_CHECK(rawContext != nullptr, "Unable to alloc avformat context");
77
+
78
+ rawContext->pb = avioContextHolder_->getAVIOContext();
79
+ int status = avformat_open_input(&rawContext, nullptr, nullptr, nullptr);
80
+ if (status != 0) {
81
+ avformat_free_context(rawContext);
82
+ TORCH_CHECK(
83
+ false,
84
+ "Failed to open input buffer: " +
85
+ getFFMPEGErrorStringFromErrorCode(status));
86
+ }
87
+
88
+ formatContext_.reset(rawContext);
89
+
90
+ initializeDecoder();
91
+ }
92
+
93
+ void SingleStreamDecoder::initializeDecoder() {
94
+ TORCH_CHECK(!initialized_, "Attempted double initialization.");
95
+
96
+ // In principle, the AVFormatContext should be filled in by the call to
97
+ // avformat_open_input() which reads the header. However, some formats do not
98
+ // store enough info in the header, so we call avformat_find_stream_info()
99
+ // which decodes a few frames to get missing info. For more, see:
100
+ // https://ffmpeg.org/doxygen/7.0/group__lavf__decoding.html
101
+ int status = avformat_find_stream_info(formatContext_.get(), nullptr);
102
+ if (status < 0) {
103
+ throw std::runtime_error(
104
+ "Failed to find stream info: " +
105
+ getFFMPEGErrorStringFromErrorCode(status));
106
+ }
107
+
108
+ for (unsigned int i = 0; i < formatContext_->nb_streams; i++) {
109
+ AVStream* avStream = formatContext_->streams[i];
110
+ StreamMetadata streamMetadata;
111
+
112
+ TORCH_CHECK(
113
+ static_cast<int>(i) == avStream->index,
114
+ "Our stream index, " + std::to_string(i) +
115
+ ", does not match AVStream's index, " +
116
+ std::to_string(avStream->index) + ".");
117
+ streamMetadata.streamIndex = i;
118
+ streamMetadata.mediaType = avStream->codecpar->codec_type;
119
+ streamMetadata.codecName = avcodec_get_name(avStream->codecpar->codec_id);
120
+ streamMetadata.bitRate = avStream->codecpar->bit_rate;
121
+
122
+ int64_t frameCount = avStream->nb_frames;
123
+ if (frameCount > 0) {
124
+ streamMetadata.numFrames = frameCount;
125
+ }
126
+
127
+ if (avStream->duration > 0 && avStream->time_base.den > 0) {
128
+ streamMetadata.durationSeconds =
129
+ av_q2d(avStream->time_base) * avStream->duration;
130
+ }
131
+ if (avStream->start_time != AV_NOPTS_VALUE) {
132
+ streamMetadata.beginStreamFromHeader =
133
+ av_q2d(avStream->time_base) * avStream->start_time;
134
+ }
135
+
136
+ if (avStream->codecpar->codec_type == AVMEDIA_TYPE_VIDEO) {
137
+ double fps = av_q2d(avStream->r_frame_rate);
138
+ if (fps > 0) {
139
+ streamMetadata.averageFps = fps;
140
+ }
141
+ containerMetadata_.numVideoStreams++;
142
+ } else if (avStream->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) {
143
+ AVSampleFormat format =
144
+ static_cast<AVSampleFormat>(avStream->codecpar->format);
145
+
146
+ // If the AVSampleFormat is not recognized, we get back nullptr. We have
147
+ // to make sure we don't initialize a std::string with nullptr. There's
148
+ // nothing to do on the else branch because we're already using an
149
+ // optional; it'll just remain empty.
150
+ const char* rawSampleFormat = av_get_sample_fmt_name(format);
151
+ if (rawSampleFormat != nullptr) {
152
+ streamMetadata.sampleFormat = std::string(rawSampleFormat);
153
+ }
154
+ containerMetadata_.numAudioStreams++;
155
+ }
156
+
157
+ containerMetadata_.allStreamMetadata.push_back(streamMetadata);
158
+ }
159
+
160
+ if (formatContext_->duration > 0) {
161
+ containerMetadata_.durationSeconds =
162
+ ptsToSeconds(formatContext_->duration, AV_TIME_BASE);
163
+ }
164
+
165
+ if (formatContext_->bit_rate > 0) {
166
+ containerMetadata_.bitRate = formatContext_->bit_rate;
167
+ }
168
+
169
+ int bestVideoStream = getBestStreamIndex(AVMEDIA_TYPE_VIDEO);
170
+ if (bestVideoStream >= 0) {
171
+ containerMetadata_.bestVideoStreamIndex = bestVideoStream;
172
+ }
173
+
174
+ int bestAudioStream = getBestStreamIndex(AVMEDIA_TYPE_AUDIO);
175
+ if (bestAudioStream >= 0) {
176
+ containerMetadata_.bestAudioStreamIndex = bestAudioStream;
177
+ }
178
+
179
+ if (seekMode_ == SeekMode::exact) {
180
+ scanFileAndUpdateMetadataAndIndex();
181
+ }
182
+
183
+ initialized_ = true;
184
+ }
185
+
186
+ int SingleStreamDecoder::getBestStreamIndex(AVMediaType mediaType) {
187
+ AVCodecOnlyUseForCallingAVFindBestStream avCodec = nullptr;
188
+ int streamIndex =
189
+ av_find_best_stream(formatContext_.get(), mediaType, -1, -1, &avCodec, 0);
190
+ return streamIndex;
191
+ }
192
+
193
+ // --------------------------------------------------------------------------
194
+ // VIDEO METADATA QUERY API
195
+ // --------------------------------------------------------------------------
196
+
197
+ void SingleStreamDecoder::scanFileAndUpdateMetadataAndIndex() {
198
+ if (scannedAllStreams_) {
199
+ return;
200
+ }
201
+
202
+ for (unsigned int i = 0; i < formatContext_->nb_streams; ++i) {
203
+ // We want to scan and update the metadata of all streams.
204
+ TORCH_CHECK(
205
+ formatContext_->streams[i]->discard != AVDISCARD_ALL,
206
+ "Did you add a stream before you called for a scan?");
207
+ }
208
+
209
+ AutoAVPacket autoAVPacket;
210
+ while (true) {
211
+ ReferenceAVPacket packet(autoAVPacket);
212
+
213
+ // av_read_frame is a misleading name: it gets the next **packet**.
214
+ int status = av_read_frame(formatContext_.get(), packet.get());
215
+
216
+ if (status == AVERROR_EOF) {
217
+ break;
218
+ }
219
+
220
+ if (status != AVSUCCESS) {
221
+ throw std::runtime_error(
222
+ "Failed to read frame from input file: " +
223
+ getFFMPEGErrorStringFromErrorCode(status));
224
+ }
225
+
226
+ if (packet->flags & AV_PKT_FLAG_DISCARD) {
227
+ continue;
228
+ }
229
+
230
+ // We got a valid packet. Let's figure out what stream it belongs to and
231
+ // record its relevant metadata.
232
+ int streamIndex = packet->stream_index;
233
+ auto& streamMetadata = containerMetadata_.allStreamMetadata[streamIndex];
234
+ streamMetadata.minPtsFromScan = std::min(
235
+ streamMetadata.minPtsFromScan.value_or(INT64_MAX), packet->pts);
236
+ streamMetadata.maxPtsFromScan = std::max(
237
+ streamMetadata.maxPtsFromScan.value_or(INT64_MIN),
238
+ packet->pts + packet->duration);
239
+ streamMetadata.numFramesFromScan =
240
+ streamMetadata.numFramesFromScan.value_or(0) + 1;
241
+
242
+ // Note that we set the other value in this struct, nextPts, only after
243
+ // we have scanned all packets and sorted by pts.
244
+ FrameInfo frameInfo = {packet->pts};
245
+ if (packet->flags & AV_PKT_FLAG_KEY) {
246
+ frameInfo.isKeyFrame = true;
247
+ streamInfos_[streamIndex].keyFrames.push_back(frameInfo);
248
+ }
249
+ streamInfos_[streamIndex].allFrames.push_back(frameInfo);
250
+ }
251
+
252
+ // Set all per-stream metadata that requires knowing the content of all
253
+ // packets.
254
+ for (size_t streamIndex = 0;
255
+ streamIndex < containerMetadata_.allStreamMetadata.size();
256
+ ++streamIndex) {
257
+ auto& streamMetadata = containerMetadata_.allStreamMetadata[streamIndex];
258
+ auto avStream = formatContext_->streams[streamIndex];
259
+
260
+ streamMetadata.numFramesFromScan =
261
+ streamInfos_[streamIndex].allFrames.size();
262
+
263
+ if (streamMetadata.minPtsFromScan.has_value()) {
264
+ streamMetadata.minPtsSecondsFromScan =
265
+ *streamMetadata.minPtsFromScan * av_q2d(avStream->time_base);
266
+ }
267
+ if (streamMetadata.maxPtsFromScan.has_value()) {
268
+ streamMetadata.maxPtsSecondsFromScan =
269
+ *streamMetadata.maxPtsFromScan * av_q2d(avStream->time_base);
270
+ }
271
+ }
272
+
273
+ // Reset the seek-cursor back to the beginning.
274
+ int status = avformat_seek_file(formatContext_.get(), 0, INT64_MIN, 0, 0, 0);
275
+ if (status < 0) {
276
+ throw std::runtime_error(
277
+ "Could not seek file to pts=0: " +
278
+ getFFMPEGErrorStringFromErrorCode(status));
279
+ }
280
+
281
+ // Sort all frames by their pts.
282
+ for (auto& [streamIndex, streamInfo] : streamInfos_) {
283
+ std::sort(
284
+ streamInfo.keyFrames.begin(),
285
+ streamInfo.keyFrames.end(),
286
+ [](const FrameInfo& frameInfo1, const FrameInfo& frameInfo2) {
287
+ return frameInfo1.pts < frameInfo2.pts;
288
+ });
289
+ std::sort(
290
+ streamInfo.allFrames.begin(),
291
+ streamInfo.allFrames.end(),
292
+ [](const FrameInfo& frameInfo1, const FrameInfo& frameInfo2) {
293
+ return frameInfo1.pts < frameInfo2.pts;
294
+ });
295
+
296
+ size_t keyFrameIndex = 0;
297
+ for (size_t i = 0; i < streamInfo.allFrames.size(); ++i) {
298
+ streamInfo.allFrames[i].frameIndex = i;
299
+ if (streamInfo.allFrames[i].isKeyFrame) {
300
+ TORCH_CHECK(
301
+ keyFrameIndex < streamInfo.keyFrames.size(),
302
+ "The allFrames vec claims it has MORE keyFrames than the keyFrames vec. There's a bug in torchcodec.");
303
+ streamInfo.keyFrames[keyFrameIndex].frameIndex = i;
304
+ ++keyFrameIndex;
305
+ }
306
+ if (i + 1 < streamInfo.allFrames.size()) {
307
+ streamInfo.allFrames[i].nextPts = streamInfo.allFrames[i + 1].pts;
308
+ }
309
+ }
310
+ TORCH_CHECK(
311
+ keyFrameIndex == streamInfo.keyFrames.size(),
312
+ "The allFrames vec claims it has LESS keyFrames than the keyFrames vec. There's a bug in torchcodec.");
313
+ }
314
+
315
+ scannedAllStreams_ = true;
316
+ }
317
+
318
+ ContainerMetadata SingleStreamDecoder::getContainerMetadata() const {
319
+ return containerMetadata_;
320
+ }
321
+
322
+ torch::Tensor SingleStreamDecoder::getKeyFrameIndices() {
323
+ validateActiveStream(AVMEDIA_TYPE_VIDEO);
324
+ validateScannedAllStreams("getKeyFrameIndices");
325
+
326
+ const std::vector<FrameInfo>& keyFrames =
327
+ streamInfos_[activeStreamIndex_].keyFrames;
328
+ torch::Tensor keyFrameIndices =
329
+ torch::empty({static_cast<int64_t>(keyFrames.size())}, {torch::kInt64});
330
+ for (size_t i = 0; i < keyFrames.size(); ++i) {
331
+ keyFrameIndices[i] = keyFrames[i].frameIndex;
332
+ }
333
+
334
+ return keyFrameIndices;
335
+ }
336
+
337
+ // --------------------------------------------------------------------------
338
+ // ADDING STREAMS API
339
+ // --------------------------------------------------------------------------
340
+
341
+ void SingleStreamDecoder::addStream(
342
+ int streamIndex,
343
+ AVMediaType mediaType,
344
+ const torch::Device& device,
345
+ std::optional<int> ffmpegThreadCount) {
346
+ TORCH_CHECK(
347
+ activeStreamIndex_ == NO_ACTIVE_STREAM,
348
+ "Can only add one single stream.");
349
+ TORCH_CHECK(
350
+ mediaType == AVMEDIA_TYPE_VIDEO || mediaType == AVMEDIA_TYPE_AUDIO,
351
+ "Can only add video or audio streams.");
352
+ TORCH_CHECK(formatContext_.get() != nullptr);
353
+
354
+ AVCodecOnlyUseForCallingAVFindBestStream avCodec = nullptr;
355
+
356
+ activeStreamIndex_ = av_find_best_stream(
357
+ formatContext_.get(), mediaType, streamIndex, -1, &avCodec, 0);
358
+
359
+ if (activeStreamIndex_ < 0) {
360
+ throw std::invalid_argument(
361
+ "No valid stream found in input file. Is " +
362
+ std::to_string(streamIndex) + " of the desired media type?");
363
+ }
364
+
365
+ TORCH_CHECK(avCodec != nullptr);
366
+
367
+ StreamInfo& streamInfo = streamInfos_[activeStreamIndex_];
368
+ streamInfo.streamIndex = activeStreamIndex_;
369
+ streamInfo.timeBase = formatContext_->streams[activeStreamIndex_]->time_base;
370
+ streamInfo.stream = formatContext_->streams[activeStreamIndex_];
371
+ streamInfo.avMediaType = mediaType;
372
+
373
+ deviceInterface_ = createDeviceInterface(device);
374
+
375
+ // This should never happen, checking just to be safe.
376
+ TORCH_CHECK(
377
+ streamInfo.stream->codecpar->codec_type == mediaType,
378
+ "FFmpeg found stream with index ",
379
+ activeStreamIndex_,
380
+ " which is of the wrong media type.");
381
+
382
+ // TODO_CODE_QUALITY it's pretty meh to have a video-specific logic within
383
+ // addStream() which is supposed to be generic
384
+ if (mediaType == AVMEDIA_TYPE_VIDEO) {
385
+ if (deviceInterface_) {
386
+ avCodec = makeAVCodecOnlyUseForCallingAVFindBestStream(
387
+ deviceInterface_->findCodec(streamInfo.stream->codecpar->codec_id)
388
+ .value_or(avCodec));
389
+ }
390
+ }
391
+
392
+ AVCodecContext* codecContext = avcodec_alloc_context3(avCodec);
393
+ TORCH_CHECK(codecContext != nullptr);
394
+ streamInfo.codecContext.reset(codecContext);
395
+
396
+ int retVal = avcodec_parameters_to_context(
397
+ streamInfo.codecContext.get(), streamInfo.stream->codecpar);
398
+ TORCH_CHECK_EQ(retVal, AVSUCCESS);
399
+
400
+ streamInfo.codecContext->thread_count = ffmpegThreadCount.value_or(0);
401
+ streamInfo.codecContext->pkt_timebase = streamInfo.stream->time_base;
402
+
403
+ // TODO_CODE_QUALITY same as above.
404
+ if (mediaType == AVMEDIA_TYPE_VIDEO) {
405
+ if (deviceInterface_) {
406
+ deviceInterface_->initializeContext(codecContext);
407
+ }
408
+ }
409
+
410
+ retVal = avcodec_open2(streamInfo.codecContext.get(), avCodec, nullptr);
411
+ if (retVal < AVSUCCESS) {
412
+ throw std::invalid_argument(getFFMPEGErrorStringFromErrorCode(retVal));
413
+ }
414
+
415
+ codecContext->time_base = streamInfo.stream->time_base;
416
+ containerMetadata_.allStreamMetadata[activeStreamIndex_].codecName =
417
+ std::string(avcodec_get_name(codecContext->codec_id));
418
+
419
+ // We will only need packets from the active stream, so we tell FFmpeg to
420
+ // discard packets from the other streams. Note that av_read_frame() may still
421
+ // return some of those un-desired packet under some conditions, so it's still
422
+ // important to discard/demux correctly in the inner decoding loop.
423
+ for (unsigned int i = 0; i < formatContext_->nb_streams; ++i) {
424
+ if (i != static_cast<unsigned int>(activeStreamIndex_)) {
425
+ formatContext_->streams[i]->discard = AVDISCARD_ALL;
426
+ }
427
+ }
428
+ }
429
+
430
+ void SingleStreamDecoder::addVideoStream(
431
+ int streamIndex,
432
+ const VideoStreamOptions& videoStreamOptions) {
433
+ addStream(
434
+ streamIndex,
435
+ AVMEDIA_TYPE_VIDEO,
436
+ videoStreamOptions.device,
437
+ videoStreamOptions.ffmpegThreadCount);
438
+
439
+ auto& streamMetadata =
440
+ containerMetadata_.allStreamMetadata[activeStreamIndex_];
441
+
442
+ if (seekMode_ == SeekMode::approximate &&
443
+ !streamMetadata.averageFps.has_value()) {
444
+ throw std::runtime_error(
445
+ "Seek mode is approximate, but stream " +
446
+ std::to_string(activeStreamIndex_) +
447
+ " does not have an average fps in its metadata.");
448
+ }
449
+
450
+ auto& streamInfo = streamInfos_[activeStreamIndex_];
451
+ streamInfo.videoStreamOptions = videoStreamOptions;
452
+
453
+ streamMetadata.width = streamInfo.codecContext->width;
454
+ streamMetadata.height = streamInfo.codecContext->height;
455
+
456
+ // By default, we want to use swscale for color conversion because it is
457
+ // faster. However, it has width requirements, so we may need to fall back
458
+ // to filtergraph. We also need to respect what was requested from the
459
+ // options; we respect the options unconditionally, so it's possible for
460
+ // swscale's width requirements to be violated. We don't expose the ability to
461
+ // choose color conversion library publicly; we only use this ability
462
+ // internally.
463
+ int width = videoStreamOptions.width.value_or(streamInfo.codecContext->width);
464
+
465
+ // swscale requires widths to be multiples of 32:
466
+ // https://stackoverflow.com/questions/74351955/turn-off-sw-scale-conversion-to-planar-yuv-32-byte-alignment-requirements
467
+ // so we fall back to filtergraph if the width is not a multiple of 32.
468
+ auto defaultLibrary = (width % 32 == 0) ? ColorConversionLibrary::SWSCALE
469
+ : ColorConversionLibrary::FILTERGRAPH;
470
+
471
+ streamInfo.colorConversionLibrary =
472
+ videoStreamOptions.colorConversionLibrary.value_or(defaultLibrary);
473
+ }
474
+
475
+ void SingleStreamDecoder::addAudioStream(
476
+ int streamIndex,
477
+ const AudioStreamOptions& audioStreamOptions) {
478
+ TORCH_CHECK(
479
+ seekMode_ == SeekMode::approximate,
480
+ "seek_mode must be 'approximate' for audio streams.");
481
+
482
+ addStream(streamIndex, AVMEDIA_TYPE_AUDIO);
483
+
484
+ auto& streamInfo = streamInfos_[activeStreamIndex_];
485
+ streamInfo.audioStreamOptions = audioStreamOptions;
486
+
487
+ auto& streamMetadata =
488
+ containerMetadata_.allStreamMetadata[activeStreamIndex_];
489
+ streamMetadata.sampleRate =
490
+ static_cast<int64_t>(streamInfo.codecContext->sample_rate);
491
+ streamMetadata.numChannels =
492
+ static_cast<int64_t>(getNumChannels(streamInfo.codecContext));
493
+
494
+ // FFmpeg docs say that the decoder will try to decode natively in this
495
+ // format, if it can. Docs don't say what the decoder does when it doesn't
496
+ // support that format, but it looks like it does nothing, so this probably
497
+ // doesn't hurt.
498
+ streamInfo.codecContext->request_sample_fmt = AV_SAMPLE_FMT_FLTP;
499
+ }
500
+
501
+ // --------------------------------------------------------------------------
502
+ // HIGH-LEVEL DECODING ENTRY-POINTS
503
+ // --------------------------------------------------------------------------
504
+
505
+ FrameOutput SingleStreamDecoder::getNextFrame() {
506
+ auto output = getNextFrameInternal();
507
+ if (streamInfos_[activeStreamIndex_].avMediaType == AVMEDIA_TYPE_VIDEO) {
508
+ output.data = maybePermuteHWC2CHW(output.data);
509
+ }
510
+ return output;
511
+ }
512
+
513
+ FrameOutput SingleStreamDecoder::getNextFrameInternal(
514
+ std::optional<torch::Tensor> preAllocatedOutputTensor) {
515
+ validateActiveStream();
516
+ UniqueAVFrame avFrame = decodeAVFrame(
517
+ [this](const UniqueAVFrame& avFrame) { return avFrame->pts >= cursor_; });
518
+ return convertAVFrameToFrameOutput(avFrame, preAllocatedOutputTensor);
519
+ }
520
+
521
+ FrameOutput SingleStreamDecoder::getFrameAtIndex(int64_t frameIndex) {
522
+ auto frameOutput = getFrameAtIndexInternal(frameIndex);
523
+ frameOutput.data = maybePermuteHWC2CHW(frameOutput.data);
524
+ return frameOutput;
525
+ }
526
+
527
+ FrameOutput SingleStreamDecoder::getFrameAtIndexInternal(
528
+ int64_t frameIndex,
529
+ std::optional<torch::Tensor> preAllocatedOutputTensor) {
530
+ validateActiveStream(AVMEDIA_TYPE_VIDEO);
531
+
532
+ const auto& streamInfo = streamInfos_[activeStreamIndex_];
533
+ const auto& streamMetadata =
534
+ containerMetadata_.allStreamMetadata[activeStreamIndex_];
535
+ validateFrameIndex(streamMetadata, frameIndex);
536
+
537
+ int64_t pts = getPts(frameIndex);
538
+ setCursorPtsInSeconds(ptsToSeconds(pts, streamInfo.timeBase));
539
+ return getNextFrameInternal(preAllocatedOutputTensor);
540
+ }
541
+
542
+ FrameBatchOutput SingleStreamDecoder::getFramesAtIndices(
543
+ const std::vector<int64_t>& frameIndices) {
544
+ validateActiveStream(AVMEDIA_TYPE_VIDEO);
545
+
546
+ auto indicesAreSorted =
547
+ std::is_sorted(frameIndices.begin(), frameIndices.end());
548
+
549
+ std::vector<size_t> argsort;
550
+ if (!indicesAreSorted) {
551
+ // if frameIndices is [13, 10, 12, 11]
552
+ // when sorted, it's [10, 11, 12, 13] <-- this is the sorted order we want
553
+ // to use to decode the frames
554
+ // and argsort is [ 1, 3, 2, 0]
555
+ argsort.resize(frameIndices.size());
556
+ for (size_t i = 0; i < argsort.size(); ++i) {
557
+ argsort[i] = i;
558
+ }
559
+ std::sort(
560
+ argsort.begin(), argsort.end(), [&frameIndices](size_t a, size_t b) {
561
+ return frameIndices[a] < frameIndices[b];
562
+ });
563
+ }
564
+
565
+ const auto& streamMetadata =
566
+ containerMetadata_.allStreamMetadata[activeStreamIndex_];
567
+ const auto& streamInfo = streamInfos_[activeStreamIndex_];
568
+ const auto& videoStreamOptions = streamInfo.videoStreamOptions;
569
+ FrameBatchOutput frameBatchOutput(
570
+ frameIndices.size(), videoStreamOptions, streamMetadata);
571
+
572
+ auto previousIndexInVideo = -1;
573
+ for (size_t f = 0; f < frameIndices.size(); ++f) {
574
+ auto indexInOutput = indicesAreSorted ? f : argsort[f];
575
+ auto indexInVideo = frameIndices[indexInOutput];
576
+
577
+ validateFrameIndex(streamMetadata, indexInVideo);
578
+
579
+ if ((f > 0) && (indexInVideo == previousIndexInVideo)) {
580
+ // Avoid decoding the same frame twice
581
+ auto previousIndexInOutput = indicesAreSorted ? f - 1 : argsort[f - 1];
582
+ frameBatchOutput.data[indexInOutput].copy_(
583
+ frameBatchOutput.data[previousIndexInOutput]);
584
+ frameBatchOutput.ptsSeconds[indexInOutput] =
585
+ frameBatchOutput.ptsSeconds[previousIndexInOutput];
586
+ frameBatchOutput.durationSeconds[indexInOutput] =
587
+ frameBatchOutput.durationSeconds[previousIndexInOutput];
588
+ } else {
589
+ FrameOutput frameOutput = getFrameAtIndexInternal(
590
+ indexInVideo, frameBatchOutput.data[indexInOutput]);
591
+ frameBatchOutput.ptsSeconds[indexInOutput] = frameOutput.ptsSeconds;
592
+ frameBatchOutput.durationSeconds[indexInOutput] =
593
+ frameOutput.durationSeconds;
594
+ }
595
+ previousIndexInVideo = indexInVideo;
596
+ }
597
+ frameBatchOutput.data = maybePermuteHWC2CHW(frameBatchOutput.data);
598
+ return frameBatchOutput;
599
+ }
600
+
601
+ FrameBatchOutput SingleStreamDecoder::getFramesInRange(
602
+ int64_t start,
603
+ int64_t stop,
604
+ int64_t step) {
605
+ validateActiveStream(AVMEDIA_TYPE_VIDEO);
606
+
607
+ const auto& streamMetadata =
608
+ containerMetadata_.allStreamMetadata[activeStreamIndex_];
609
+ const auto& streamInfo = streamInfos_[activeStreamIndex_];
610
+ int64_t numFrames = getNumFrames(streamMetadata);
611
+ TORCH_CHECK(
612
+ start >= 0, "Range start, " + std::to_string(start) + " is less than 0.");
613
+ TORCH_CHECK(
614
+ stop <= numFrames,
615
+ "Range stop, " + std::to_string(stop) +
616
+ ", is more than the number of frames, " + std::to_string(numFrames));
617
+ TORCH_CHECK(
618
+ step > 0, "Step must be greater than 0; is " + std::to_string(step));
619
+
620
+ int64_t numOutputFrames = std::ceil((stop - start) / double(step));
621
+ const auto& videoStreamOptions = streamInfo.videoStreamOptions;
622
+ FrameBatchOutput frameBatchOutput(
623
+ numOutputFrames, videoStreamOptions, streamMetadata);
624
+
625
+ for (int64_t i = start, f = 0; i < stop; i += step, ++f) {
626
+ FrameOutput frameOutput =
627
+ getFrameAtIndexInternal(i, frameBatchOutput.data[f]);
628
+ frameBatchOutput.ptsSeconds[f] = frameOutput.ptsSeconds;
629
+ frameBatchOutput.durationSeconds[f] = frameOutput.durationSeconds;
630
+ }
631
+ frameBatchOutput.data = maybePermuteHWC2CHW(frameBatchOutput.data);
632
+ return frameBatchOutput;
633
+ }
634
+
635
+ FrameOutput SingleStreamDecoder::getFramePlayedAt(double seconds) {
636
+ validateActiveStream(AVMEDIA_TYPE_VIDEO);
637
+ StreamInfo& streamInfo = streamInfos_[activeStreamIndex_];
638
+ double frameStartTime =
639
+ ptsToSeconds(streamInfo.lastDecodedAvFramePts, streamInfo.timeBase);
640
+ double frameEndTime = ptsToSeconds(
641
+ streamInfo.lastDecodedAvFramePts + streamInfo.lastDecodedAvFrameDuration,
642
+ streamInfo.timeBase);
643
+ if (seconds >= frameStartTime && seconds < frameEndTime) {
644
+ // We are in the same frame as the one we just returned. However, since we
645
+ // don't cache it locally, we have to rewind back.
646
+ seconds = frameStartTime;
647
+ }
648
+
649
+ setCursorPtsInSeconds(seconds);
650
+ UniqueAVFrame avFrame =
651
+ decodeAVFrame([seconds, this](const UniqueAVFrame& avFrame) {
652
+ StreamInfo& streamInfo = streamInfos_[activeStreamIndex_];
653
+ double frameStartTime = ptsToSeconds(avFrame->pts, streamInfo.timeBase);
654
+ double frameEndTime = ptsToSeconds(
655
+ avFrame->pts + getDuration(avFrame), streamInfo.timeBase);
656
+ if (frameStartTime > seconds) {
657
+ // FFMPEG seeked past the frame we are looking for even though we
658
+ // set max_ts to be our needed timestamp in avformat_seek_file()
659
+ // in maybeSeekToBeforeDesiredPts().
660
+ // This could be a bug in FFMPEG: https://trac.ffmpeg.org/ticket/11137
661
+ // In this case we return the very next frame instead of throwing an
662
+ // exception.
663
+ // TODO: Maybe log to stderr for Debug builds?
664
+ return true;
665
+ }
666
+ return seconds >= frameStartTime && seconds < frameEndTime;
667
+ });
668
+
669
+ // Convert the frame to tensor.
670
+ FrameOutput frameOutput = convertAVFrameToFrameOutput(avFrame);
671
+ frameOutput.data = maybePermuteHWC2CHW(frameOutput.data);
672
+ return frameOutput;
673
+ }
674
+
675
+ FrameBatchOutput SingleStreamDecoder::getFramesPlayedAt(
676
+ const std::vector<double>& timestamps) {
677
+ validateActiveStream(AVMEDIA_TYPE_VIDEO);
678
+
679
+ const auto& streamMetadata =
680
+ containerMetadata_.allStreamMetadata[activeStreamIndex_];
681
+
682
+ double minSeconds = getMinSeconds(streamMetadata);
683
+ double maxSeconds = getMaxSeconds(streamMetadata);
684
+
685
+ // The frame played at timestamp t and the one played at timestamp `t +
686
+ // eps` are probably the same frame, with the same index. The easiest way to
687
+ // avoid decoding that unique frame twice is to convert the input timestamps
688
+ // to indices, and leverage the de-duplication logic of getFramesAtIndices.
689
+
690
+ std::vector<int64_t> frameIndices(timestamps.size());
691
+ for (size_t i = 0; i < timestamps.size(); ++i) {
692
+ auto frameSeconds = timestamps[i];
693
+ TORCH_CHECK(
694
+ frameSeconds >= minSeconds && frameSeconds < maxSeconds,
695
+ "frame pts is " + std::to_string(frameSeconds) +
696
+ "; must be in range [" + std::to_string(minSeconds) + ", " +
697
+ std::to_string(maxSeconds) + ").");
698
+
699
+ frameIndices[i] = secondsToIndexLowerBound(frameSeconds);
700
+ }
701
+
702
+ return getFramesAtIndices(frameIndices);
703
+ }
704
+
705
+ FrameBatchOutput SingleStreamDecoder::getFramesPlayedInRange(
706
+ double startSeconds,
707
+ double stopSeconds) {
708
+ validateActiveStream(AVMEDIA_TYPE_VIDEO);
709
+ const auto& streamMetadata =
710
+ containerMetadata_.allStreamMetadata[activeStreamIndex_];
711
+ TORCH_CHECK(
712
+ startSeconds <= stopSeconds,
713
+ "Start seconds (" + std::to_string(startSeconds) +
714
+ ") must be less than or equal to stop seconds (" +
715
+ std::to_string(stopSeconds) + ".");
716
+
717
+ const auto& streamInfo = streamInfos_[activeStreamIndex_];
718
+ const auto& videoStreamOptions = streamInfo.videoStreamOptions;
719
+
720
+ // Special case needed to implement a half-open range. At first glance, this
721
+ // may seem unnecessary, as our search for stopFrame can return the end, and
722
+ // we don't include stopFramIndex in our output. However, consider the
723
+ // following scenario:
724
+ //
725
+ // frame=0, pts=0.0
726
+ // frame=1, pts=0.3
727
+ //
728
+ // interval A: [0.2, 0.2)
729
+ // interval B: [0.2, 0.15)
730
+ //
731
+ // Both intervals take place between the pts values for frame 0 and frame 1,
732
+ // which by our abstract player, means that both intervals map to frame 0. By
733
+ // the definition of a half open interval, interval A should return no frames.
734
+ // Interval B should return frame 0. However, for both A and B, the individual
735
+ // values of the intervals will map to the same frame indices below. Hence, we
736
+ // need this special case below.
737
+ if (startSeconds == stopSeconds) {
738
+ FrameBatchOutput frameBatchOutput(0, videoStreamOptions, streamMetadata);
739
+ frameBatchOutput.data = maybePermuteHWC2CHW(frameBatchOutput.data);
740
+ return frameBatchOutput;
741
+ }
742
+
743
+ double minSeconds = getMinSeconds(streamMetadata);
744
+ double maxSeconds = getMaxSeconds(streamMetadata);
745
+ TORCH_CHECK(
746
+ startSeconds >= minSeconds && startSeconds < maxSeconds,
747
+ "Start seconds is " + std::to_string(startSeconds) +
748
+ "; must be in range [" + std::to_string(minSeconds) + ", " +
749
+ std::to_string(maxSeconds) + ").");
750
+ TORCH_CHECK(
751
+ stopSeconds <= maxSeconds,
752
+ "Stop seconds (" + std::to_string(stopSeconds) +
753
+ "; must be less than or equal to " + std::to_string(maxSeconds) +
754
+ ").");
755
+
756
+ // Note that we look at nextPts for a frame, and not its pts or duration.
757
+ // Our abstract player displays frames starting at the pts for that frame
758
+ // until the pts for the next frame. There are two consequences:
759
+ //
760
+ // 1. We ignore the duration for a frame. A frame is played until the
761
+ // next frame replaces it. This model is robust to durations being 0 or
762
+ // incorrect; our source of truth is the pts for frames. If duration is
763
+ // accurate, the nextPts for a frame would be equivalent to pts +
764
+ // duration.
765
+ // 2. In order to establish if the start of an interval maps to a
766
+ // particular frame, we need to figure out if it is ordered after the
767
+ // frame's pts, but before the next frames's pts.
768
+
769
+ int64_t startFrameIndex = secondsToIndexLowerBound(startSeconds);
770
+ int64_t stopFrameIndex = secondsToIndexUpperBound(stopSeconds);
771
+ int64_t numFrames = stopFrameIndex - startFrameIndex;
772
+
773
+ FrameBatchOutput frameBatchOutput(
774
+ numFrames, videoStreamOptions, streamMetadata);
775
+ for (int64_t i = startFrameIndex, f = 0; i < stopFrameIndex; ++i, ++f) {
776
+ FrameOutput frameOutput =
777
+ getFrameAtIndexInternal(i, frameBatchOutput.data[f]);
778
+ frameBatchOutput.ptsSeconds[f] = frameOutput.ptsSeconds;
779
+ frameBatchOutput.durationSeconds[f] = frameOutput.durationSeconds;
780
+ }
781
+ frameBatchOutput.data = maybePermuteHWC2CHW(frameBatchOutput.data);
782
+
783
+ return frameBatchOutput;
784
+ }
785
+
786
+ // Note [Audio Decoding Design]
787
+ // This note explains why audio decoding is implemented the way it is, and why
788
+ // it inherently differs from video decoding.
789
+ //
790
+ // Like for video, FFmpeg exposes the concept of a frame for audio streams. An
791
+ // audio frame is a contiguous sequence of samples, where a sample consists of
792
+ // `numChannels` values. An audio frame, or a sequence thereof, is always
793
+ // converted into a tensor of shape `(numChannels, numSamplesPerChannel)`.
794
+ //
795
+ // The notion of 'frame' in audio isn't what users want to interact with. Users
796
+ // want to interact with samples. The C++ and core APIs return frames, because
797
+ // we want those to be close to FFmpeg concepts, but the higher-level public
798
+ // APIs expose samples. As a result:
799
+ // - We don't expose index-based APIs for audio, because that would mean
800
+ // exposing the concept of audio frame. For now, we think exposing time-based
801
+ // APIs is more natural.
802
+ // - We never perform a scan for audio streams. We don't need to, since we won't
803
+ // be converting timestamps to indices. That's why we enforce the seek_mode
804
+ // to be "approximate" (which is slightly misleading, because technically the
805
+ // output samples will be at their exact positions. But this incongruence is
806
+ // only exposed at the C++/core private levels).
807
+ //
808
+ // Audio frames are of variable dimensions: in the same stream, a frame can
809
+ // contain 1024 samples and the next one may contain 512 [1]. This makes it
810
+ // impossible to stack audio frames in the same way we can stack video frames.
811
+ // This is one of the main reasons we cannot reuse the same pre-allocation logic
812
+ // we have for videos in getFramesPlayedInRange(): pre-allocating a batch
813
+ // requires constant (and known) frame dimensions. That's also why
814
+ // audio frames are *concatenated* along the samples dimension, not stacked.
815
+ //
816
+ // [IMPORTANT!] There is one key invariant that we must respect when decoding
817
+ // audio frames:
818
+ //
819
+ // BEFORE DECODING FRAME i, WE MUST DECODE ALL FRAMES j < i.
820
+ //
821
+ // Always. Why? We don't know. What we know is that if we don't, we get clipped,
822
+ // incorrect audio as output [2]. All other (correct) libraries like TorchAudio
823
+ // or Decord do something similar, whether it was intended or not. This has a
824
+ // few implications:
825
+ // - The **only** place we're allowed to seek to in an audio stream is the
826
+ // stream's beginning. This ensures that if we need a frame, we'll have
827
+ // decoded all previous frames.
828
+ // - Because of that, we don't allow the public APIs to seek. Public APIs can
829
+ // call next() and `getFramesPlayedInRangeAudio()`, but they cannot manually
830
+ // seek.
831
+ // - We try not to seek, when we can avoid it. Typically if the next frame we
832
+ // need is in the future, we don't seek back to the beginning, we just decode
833
+ // all the frames in-between.
834
+ //
835
+ // [2] If you're brave and curious, you can read the long "Seek offset for
836
+ // audio" note in https://github.com/pytorch/torchcodec/pull/507/files, which
837
+ // sums up past (and failed) attempts at working around this issue.
838
+ AudioFramesOutput SingleStreamDecoder::getFramesPlayedInRangeAudio(
839
+ double startSeconds,
840
+ std::optional<double> stopSecondsOptional) {
841
+ validateActiveStream(AVMEDIA_TYPE_AUDIO);
842
+
843
+ if (stopSecondsOptional.has_value()) {
844
+ TORCH_CHECK(
845
+ startSeconds <= *stopSecondsOptional,
846
+ "Start seconds (" + std::to_string(startSeconds) +
847
+ ") must be less than or equal to stop seconds (" +
848
+ std::to_string(*stopSecondsOptional) + ").");
849
+ }
850
+
851
+ if (stopSecondsOptional.has_value() && startSeconds == *stopSecondsOptional) {
852
+ // For consistency with video
853
+ return AudioFramesOutput{torch::empty({0, 0}), 0.0};
854
+ }
855
+
856
+ StreamInfo& streamInfo = streamInfos_[activeStreamIndex_];
857
+
858
+ auto startPts = secondsToClosestPts(startSeconds, streamInfo.timeBase);
859
+ if (startPts < streamInfo.lastDecodedAvFramePts +
860
+ streamInfo.lastDecodedAvFrameDuration) {
861
+ // If we need to seek backwards, then we have to seek back to the beginning
862
+ // of the stream.
863
+ // See [Audio Decoding Design].
864
+ setCursor(INT64_MIN);
865
+ }
866
+
867
+ // TODO-AUDIO Pre-allocate a long-enough tensor instead of creating a vec +
868
+ // cat(). This would save a copy. We know the duration of the output and the
869
+ // sample rate, so in theory we know the number of output samples.
870
+ std::vector<torch::Tensor> frames;
871
+
872
+ std::optional<double> firstFramePtsSeconds = std::nullopt;
873
+ auto stopPts = stopSecondsOptional.has_value()
874
+ ? secondsToClosestPts(*stopSecondsOptional, streamInfo.timeBase)
875
+ : INT64_MAX;
876
+ auto finished = false;
877
+ while (!finished) {
878
+ try {
879
+ UniqueAVFrame avFrame =
880
+ decodeAVFrame([startPts](const UniqueAVFrame& avFrame) {
881
+ return startPts < avFrame->pts + getDuration(avFrame);
882
+ });
883
+ auto frameOutput = convertAVFrameToFrameOutput(avFrame);
884
+ if (!firstFramePtsSeconds.has_value()) {
885
+ firstFramePtsSeconds = frameOutput.ptsSeconds;
886
+ }
887
+ frames.push_back(frameOutput.data);
888
+ } catch (const EndOfFileException& e) {
889
+ finished = true;
890
+ }
891
+
892
+ // If stopSeconds is in [begin, end] of the last decoded frame, we should
893
+ // stop decoding more frames. Note that if we were to use [begin, end),
894
+ // which may seem more natural, then we would decode the frame starting at
895
+ // stopSeconds, which isn't what we want!
896
+ auto lastDecodedAvFrameEnd = streamInfo.lastDecodedAvFramePts +
897
+ streamInfo.lastDecodedAvFrameDuration;
898
+ finished |= (streamInfo.lastDecodedAvFramePts) <= stopPts &&
899
+ (stopPts <= lastDecodedAvFrameEnd);
900
+ }
901
+
902
+ auto lastSamples = maybeFlushSwrBuffers();
903
+ if (lastSamples.has_value()) {
904
+ frames.push_back(*lastSamples);
905
+ }
906
+
907
+ TORCH_CHECK(
908
+ frames.size() > 0 && firstFramePtsSeconds.has_value(),
909
+ "No audio frames were decoded. ",
910
+ "This is probably because start_seconds is too high? ",
911
+ "Current value is ",
912
+ startSeconds);
913
+
914
+ return AudioFramesOutput{torch::cat(frames, 1), *firstFramePtsSeconds};
915
+ }
916
+
917
+ // --------------------------------------------------------------------------
918
+ // SEEKING APIs
919
+ // --------------------------------------------------------------------------
920
+
921
+ void SingleStreamDecoder::setCursorPtsInSeconds(double seconds) {
922
+ // We don't allow public audio decoding APIs to seek, see [Audio Decoding
923
+ // Design]
924
+ validateActiveStream(AVMEDIA_TYPE_VIDEO);
925
+ setCursor(
926
+ secondsToClosestPts(seconds, streamInfos_[activeStreamIndex_].timeBase));
927
+ }
928
+
929
+ void SingleStreamDecoder::setCursor(int64_t pts) {
930
+ cursorWasJustSet_ = true;
931
+ cursor_ = pts;
932
+ }
933
+
934
+ /*
935
+ Videos have I frames and non-I frames (P and B frames). Non-I frames need data
936
+ from the previous I frame to be decoded.
937
+
938
+ Imagine the cursor is at a random frame with PTS=lastDecodedAvFramePts (x for
939
+ brevity) and we wish to seek to a user-specified PTS=y.
940
+
941
+ If y < x, we don't have a choice but to seek backwards to the highest I frame
942
+ before y.
943
+
944
+ If y > x, we have two choices:
945
+
946
+ 1. We could keep decoding forward until we hit y. Illustrated below:
947
+
948
+ I P P P I P P P I P P I P P I P
949
+ x y
950
+
951
+ 2. We could try to jump to an I frame between x and y (indicated by j below).
952
+ And then start decoding until we encounter y. Illustrated below:
953
+
954
+ I P P P I P P P I P P I P P I P
955
+ x j y
956
+
957
+ (2) is more efficient than (1) if there is an I frame between x and y.
958
+ */
959
+ bool SingleStreamDecoder::canWeAvoidSeeking() const {
960
+ const StreamInfo& streamInfo = streamInfos_.at(activeStreamIndex_);
961
+ if (streamInfo.avMediaType == AVMEDIA_TYPE_AUDIO) {
962
+ // For audio, we only need to seek if a backwards seek was requested within
963
+ // getFramesPlayedInRangeAudio(), when setCursorPtsInSeconds() was called.
964
+ // For more context, see [Audio Decoding Design]
965
+ return !cursorWasJustSet_;
966
+ }
967
+ int64_t lastDecodedAvFramePts =
968
+ streamInfos_.at(activeStreamIndex_).lastDecodedAvFramePts;
969
+ if (cursor_ < lastDecodedAvFramePts) {
970
+ // We can never skip a seek if we are seeking backwards.
971
+ return false;
972
+ }
973
+ if (lastDecodedAvFramePts == cursor_) {
974
+ // We are seeking to the exact same frame as we are currently at. Without
975
+ // caching we have to rewind back and decode the frame again.
976
+ // TODO: https://github.com/pytorch-labs/torchcodec/issues/84 we could
977
+ // implement caching.
978
+ return false;
979
+ }
980
+ // We are seeking forwards.
981
+ // We can only skip a seek if both lastDecodedAvFramePts and
982
+ // cursor_ share the same keyframe.
983
+ int lastDecodedAvFrameIndex = getKeyFrameIndexForPts(lastDecodedAvFramePts);
984
+ int targetKeyFrameIndex = getKeyFrameIndexForPts(cursor_);
985
+ return lastDecodedAvFrameIndex >= 0 && targetKeyFrameIndex >= 0 &&
986
+ lastDecodedAvFrameIndex == targetKeyFrameIndex;
987
+ }
988
+
989
+ // This method looks at currentPts and desiredPts and seeks in the
990
+ // AVFormatContext if it is needed. We can skip seeking in certain cases. See
991
+ // the comment of canWeAvoidSeeking() for details.
992
+ void SingleStreamDecoder::maybeSeekToBeforeDesiredPts() {
993
+ validateActiveStream();
994
+ StreamInfo& streamInfo = streamInfos_[activeStreamIndex_];
995
+
996
+ decodeStats_.numSeeksAttempted++;
997
+ if (canWeAvoidSeeking()) {
998
+ decodeStats_.numSeeksSkipped++;
999
+ return;
1000
+ }
1001
+
1002
+ int64_t desiredPts = cursor_;
1003
+
1004
+ // For some encodings like H265, FFMPEG sometimes seeks past the point we
1005
+ // set as the max_ts. So we use our own index to give it the exact pts of
1006
+ // the key frame that we want to seek to.
1007
+ // See https://github.com/pytorch/torchcodec/issues/179 for more details.
1008
+ // See https://trac.ffmpeg.org/ticket/11137 for the underlying ffmpeg bug.
1009
+ if (!streamInfo.keyFrames.empty()) {
1010
+ int desiredKeyFrameIndex = getKeyFrameIndexForPtsUsingScannedIndex(
1011
+ streamInfo.keyFrames, desiredPts);
1012
+ desiredKeyFrameIndex = std::max(desiredKeyFrameIndex, 0);
1013
+ desiredPts = streamInfo.keyFrames[desiredKeyFrameIndex].pts;
1014
+ }
1015
+
1016
+ int status = avformat_seek_file(
1017
+ formatContext_.get(),
1018
+ streamInfo.streamIndex,
1019
+ INT64_MIN,
1020
+ desiredPts,
1021
+ desiredPts,
1022
+ 0);
1023
+ if (status < 0) {
1024
+ throw std::runtime_error(
1025
+ "Could not seek file to pts=" + std::to_string(desiredPts) + ": " +
1026
+ getFFMPEGErrorStringFromErrorCode(status));
1027
+ }
1028
+ decodeStats_.numFlushes++;
1029
+ avcodec_flush_buffers(streamInfo.codecContext.get());
1030
+ }
1031
+
1032
+ // --------------------------------------------------------------------------
1033
+ // LOW-LEVEL DECODING
1034
+ // --------------------------------------------------------------------------
1035
+
1036
+ UniqueAVFrame SingleStreamDecoder::decodeAVFrame(
1037
+ std::function<bool(const UniqueAVFrame&)> filterFunction) {
1038
+ validateActiveStream();
1039
+
1040
+ resetDecodeStats();
1041
+
1042
+ if (cursorWasJustSet_) {
1043
+ maybeSeekToBeforeDesiredPts();
1044
+ cursorWasJustSet_ = false;
1045
+ }
1046
+
1047
+ StreamInfo& streamInfo = streamInfos_[activeStreamIndex_];
1048
+
1049
+ // Need to get the next frame or error from PopFrame.
1050
+ UniqueAVFrame avFrame(av_frame_alloc());
1051
+ AutoAVPacket autoAVPacket;
1052
+ int status = AVSUCCESS;
1053
+ bool reachedEOF = false;
1054
+ while (true) {
1055
+ status =
1056
+ avcodec_receive_frame(streamInfo.codecContext.get(), avFrame.get());
1057
+
1058
+ if (status != AVSUCCESS && status != AVERROR(EAGAIN)) {
1059
+ // Non-retriable error
1060
+ break;
1061
+ }
1062
+
1063
+ decodeStats_.numFramesReceivedByDecoder++;
1064
+ // Is this the kind of frame we're looking for?
1065
+ if (status == AVSUCCESS && filterFunction(avFrame)) {
1066
+ // Yes, this is the frame we'll return; break out of the decoding loop.
1067
+ break;
1068
+ } else if (status == AVSUCCESS) {
1069
+ // No, but we received a valid frame - just not the kind we're looking
1070
+ // for. The logic below will read packets and send them to the decoder.
1071
+ // But since we did just receive a frame, we should skip reading more
1072
+ // packets and sending them to the decoder and just try to receive more
1073
+ // frames from the decoder.
1074
+ continue;
1075
+ }
1076
+
1077
+ if (reachedEOF) {
1078
+ // We don't have any more packets to receive. So keep on pulling frames
1079
+ // from its internal buffers.
1080
+ continue;
1081
+ }
1082
+
1083
+ // We still haven't found the frame we're looking for. So let's read more
1084
+ // packets and send them to the decoder.
1085
+ ReferenceAVPacket packet(autoAVPacket);
1086
+ do {
1087
+ status = av_read_frame(formatContext_.get(), packet.get());
1088
+ decodeStats_.numPacketsRead++;
1089
+
1090
+ if (status == AVERROR_EOF) {
1091
+ // End of file reached. We must drain the codec by sending a nullptr
1092
+ // packet.
1093
+ status = avcodec_send_packet(
1094
+ streamInfo.codecContext.get(),
1095
+ /*avpkt=*/nullptr);
1096
+ if (status < AVSUCCESS) {
1097
+ throw std::runtime_error(
1098
+ "Could not flush decoder: " +
1099
+ getFFMPEGErrorStringFromErrorCode(status));
1100
+ }
1101
+
1102
+ reachedEOF = true;
1103
+ break;
1104
+ }
1105
+
1106
+ if (status < AVSUCCESS) {
1107
+ throw std::runtime_error(
1108
+ "Could not read frame from input file: " +
1109
+ getFFMPEGErrorStringFromErrorCode(status));
1110
+ }
1111
+ } while (packet->stream_index != activeStreamIndex_);
1112
+
1113
+ if (reachedEOF) {
1114
+ // We don't have any more packets to send to the decoder. So keep on
1115
+ // pulling frames from its internal buffers.
1116
+ continue;
1117
+ }
1118
+
1119
+ // We got a valid packet. Send it to the decoder, and we'll receive it in
1120
+ // the next iteration.
1121
+ status = avcodec_send_packet(streamInfo.codecContext.get(), packet.get());
1122
+ if (status < AVSUCCESS) {
1123
+ throw std::runtime_error(
1124
+ "Could not push packet to decoder: " +
1125
+ getFFMPEGErrorStringFromErrorCode(status));
1126
+ }
1127
+
1128
+ decodeStats_.numPacketsSentToDecoder++;
1129
+ }
1130
+
1131
+ if (status < AVSUCCESS) {
1132
+ if (reachedEOF || status == AVERROR_EOF) {
1133
+ throw SingleStreamDecoder::EndOfFileException(
1134
+ "Requested next frame while there are no more frames left to "
1135
+ "decode.");
1136
+ }
1137
+ throw std::runtime_error(
1138
+ "Could not receive frame from decoder: " +
1139
+ getFFMPEGErrorStringFromErrorCode(status));
1140
+ }
1141
+
1142
+ // Note that we don't flush the decoder when we reach EOF (even though that's
1143
+ // mentioned in https://ffmpeg.org/doxygen/trunk/group__lavc__encdec.html).
1144
+ // This is because we may have packets internally in the decoder that we
1145
+ // haven't received as frames. Eventually we will either hit AVERROR_EOF from
1146
+ // av_receive_frame() or the user will have seeked to a different location in
1147
+ // the file and that will flush the decoder.
1148
+ streamInfo.lastDecodedAvFramePts = avFrame->pts;
1149
+ streamInfo.lastDecodedAvFrameDuration = getDuration(avFrame);
1150
+
1151
+ return avFrame;
1152
+ }
1153
+
1154
+ // --------------------------------------------------------------------------
1155
+ // AVFRAME <-> FRAME OUTPUT CONVERSION
1156
+ // --------------------------------------------------------------------------
1157
+
1158
+ FrameOutput SingleStreamDecoder::convertAVFrameToFrameOutput(
1159
+ UniqueAVFrame& avFrame,
1160
+ std::optional<torch::Tensor> preAllocatedOutputTensor) {
1161
+ // Convert the frame to tensor.
1162
+ FrameOutput frameOutput;
1163
+ auto& streamInfo = streamInfos_[activeStreamIndex_];
1164
+ frameOutput.ptsSeconds = ptsToSeconds(
1165
+ avFrame->pts, formatContext_->streams[activeStreamIndex_]->time_base);
1166
+ frameOutput.durationSeconds = ptsToSeconds(
1167
+ getDuration(avFrame),
1168
+ formatContext_->streams[activeStreamIndex_]->time_base);
1169
+ if (streamInfo.avMediaType == AVMEDIA_TYPE_AUDIO) {
1170
+ convertAudioAVFrameToFrameOutputOnCPU(avFrame, frameOutput);
1171
+ } else if (!deviceInterface_) {
1172
+ convertAVFrameToFrameOutputOnCPU(
1173
+ avFrame, frameOutput, preAllocatedOutputTensor);
1174
+ } else if (deviceInterface_) {
1175
+ deviceInterface_->convertAVFrameToFrameOutput(
1176
+ streamInfo.videoStreamOptions,
1177
+ avFrame,
1178
+ frameOutput,
1179
+ preAllocatedOutputTensor);
1180
+ }
1181
+ return frameOutput;
1182
+ }
1183
+
1184
+ // Note [preAllocatedOutputTensor with swscale and filtergraph]:
1185
+ // Callers may pass a pre-allocated tensor, where the output.data tensor will
1186
+ // be stored. This parameter is honored in any case, but it only leads to a
1187
+ // speed-up when swscale is used. With swscale, we can tell ffmpeg to place the
1188
+ // decoded frame directly into `preAllocatedtensor.data_ptr()`. We haven't yet
1189
+ // found a way to do that with filtegraph.
1190
+ // TODO: Figure out whether that's possible!
1191
+ // Dimension order of the preAllocatedOutputTensor must be HWC, regardless of
1192
+ // `dimension_order` parameter. It's up to callers to re-shape it if needed.
1193
+ void SingleStreamDecoder::convertAVFrameToFrameOutputOnCPU(
1194
+ UniqueAVFrame& avFrame,
1195
+ FrameOutput& frameOutput,
1196
+ std::optional<torch::Tensor> preAllocatedOutputTensor) {
1197
+ auto& streamInfo = streamInfos_[activeStreamIndex_];
1198
+
1199
+ auto frameDims = getHeightAndWidthFromOptionsOrAVFrame(
1200
+ streamInfo.videoStreamOptions, avFrame);
1201
+ int expectedOutputHeight = frameDims.height;
1202
+ int expectedOutputWidth = frameDims.width;
1203
+
1204
+ if (preAllocatedOutputTensor.has_value()) {
1205
+ auto shape = preAllocatedOutputTensor.value().sizes();
1206
+ TORCH_CHECK(
1207
+ (shape.size() == 3) && (shape[0] == expectedOutputHeight) &&
1208
+ (shape[1] == expectedOutputWidth) && (shape[2] == 3),
1209
+ "Expected pre-allocated tensor of shape ",
1210
+ expectedOutputHeight,
1211
+ "x",
1212
+ expectedOutputWidth,
1213
+ "x3, got ",
1214
+ shape);
1215
+ }
1216
+
1217
+ torch::Tensor outputTensor;
1218
+ // We need to compare the current frame context with our previous frame
1219
+ // context. If they are different, then we need to re-create our colorspace
1220
+ // conversion objects. We create our colorspace conversion objects late so
1221
+ // that we don't have to depend on the unreliable metadata in the header.
1222
+ // And we sometimes re-create them because it's possible for frame
1223
+ // resolution to change mid-stream. Finally, we want to reuse the colorspace
1224
+ // conversion objects as much as possible for performance reasons.
1225
+ enum AVPixelFormat frameFormat =
1226
+ static_cast<enum AVPixelFormat>(avFrame->format);
1227
+ auto frameContext = DecodedFrameContext{
1228
+ avFrame->width,
1229
+ avFrame->height,
1230
+ frameFormat,
1231
+ expectedOutputWidth,
1232
+ expectedOutputHeight};
1233
+
1234
+ if (streamInfo.colorConversionLibrary == ColorConversionLibrary::SWSCALE) {
1235
+ outputTensor = preAllocatedOutputTensor.value_or(allocateEmptyHWCTensor(
1236
+ expectedOutputHeight, expectedOutputWidth, torch::kCPU));
1237
+
1238
+ if (!streamInfo.swsContext || streamInfo.prevFrameContext != frameContext) {
1239
+ createSwsContext(streamInfo, frameContext, avFrame->colorspace);
1240
+ streamInfo.prevFrameContext = frameContext;
1241
+ }
1242
+ int resultHeight =
1243
+ convertAVFrameToTensorUsingSwsScale(avFrame, outputTensor);
1244
+ // If this check failed, it would mean that the frame wasn't reshaped to
1245
+ // the expected height.
1246
+ // TODO: Can we do the same check for width?
1247
+ TORCH_CHECK(
1248
+ resultHeight == expectedOutputHeight,
1249
+ "resultHeight != expectedOutputHeight: ",
1250
+ resultHeight,
1251
+ " != ",
1252
+ expectedOutputHeight);
1253
+
1254
+ frameOutput.data = outputTensor;
1255
+ } else if (
1256
+ streamInfo.colorConversionLibrary ==
1257
+ ColorConversionLibrary::FILTERGRAPH) {
1258
+ if (!streamInfo.filterGraphContext.filterGraph ||
1259
+ streamInfo.prevFrameContext != frameContext) {
1260
+ createFilterGraph(streamInfo, expectedOutputHeight, expectedOutputWidth);
1261
+ streamInfo.prevFrameContext = frameContext;
1262
+ }
1263
+ outputTensor = convertAVFrameToTensorUsingFilterGraph(avFrame);
1264
+
1265
+ // Similarly to above, if this check fails it means the frame wasn't
1266
+ // reshaped to its expected dimensions by filtergraph.
1267
+ auto shape = outputTensor.sizes();
1268
+ TORCH_CHECK(
1269
+ (shape.size() == 3) && (shape[0] == expectedOutputHeight) &&
1270
+ (shape[1] == expectedOutputWidth) && (shape[2] == 3),
1271
+ "Expected output tensor of shape ",
1272
+ expectedOutputHeight,
1273
+ "x",
1274
+ expectedOutputWidth,
1275
+ "x3, got ",
1276
+ shape);
1277
+
1278
+ if (preAllocatedOutputTensor.has_value()) {
1279
+ // We have already validated that preAllocatedOutputTensor and
1280
+ // outputTensor have the same shape.
1281
+ preAllocatedOutputTensor.value().copy_(outputTensor);
1282
+ frameOutput.data = preAllocatedOutputTensor.value();
1283
+ } else {
1284
+ frameOutput.data = outputTensor;
1285
+ }
1286
+ } else {
1287
+ throw std::runtime_error(
1288
+ "Invalid color conversion library: " +
1289
+ std::to_string(static_cast<int>(streamInfo.colorConversionLibrary)));
1290
+ }
1291
+ }
1292
+
1293
+ int SingleStreamDecoder::convertAVFrameToTensorUsingSwsScale(
1294
+ const UniqueAVFrame& avFrame,
1295
+ torch::Tensor& outputTensor) {
1296
+ StreamInfo& activeStreamInfo = streamInfos_[activeStreamIndex_];
1297
+ SwsContext* swsContext = activeStreamInfo.swsContext.get();
1298
+ uint8_t* pointers[4] = {
1299
+ outputTensor.data_ptr<uint8_t>(), nullptr, nullptr, nullptr};
1300
+ int expectedOutputWidth = outputTensor.sizes()[1];
1301
+ int linesizes[4] = {expectedOutputWidth * 3, 0, 0, 0};
1302
+ int resultHeight = sws_scale(
1303
+ swsContext,
1304
+ avFrame->data,
1305
+ avFrame->linesize,
1306
+ 0,
1307
+ avFrame->height,
1308
+ pointers,
1309
+ linesizes);
1310
+ return resultHeight;
1311
+ }
1312
+
1313
+ torch::Tensor SingleStreamDecoder::convertAVFrameToTensorUsingFilterGraph(
1314
+ const UniqueAVFrame& avFrame) {
1315
+ FilterGraphContext& filterGraphContext =
1316
+ streamInfos_[activeStreamIndex_].filterGraphContext;
1317
+ int status =
1318
+ av_buffersrc_write_frame(filterGraphContext.sourceContext, avFrame.get());
1319
+ if (status < AVSUCCESS) {
1320
+ throw std::runtime_error("Failed to add frame to buffer source context");
1321
+ }
1322
+
1323
+ UniqueAVFrame filteredAVFrame(av_frame_alloc());
1324
+ status = av_buffersink_get_frame(
1325
+ filterGraphContext.sinkContext, filteredAVFrame.get());
1326
+ TORCH_CHECK_EQ(filteredAVFrame->format, AV_PIX_FMT_RGB24);
1327
+
1328
+ auto frameDims = getHeightAndWidthFromResizedAVFrame(*filteredAVFrame.get());
1329
+ int height = frameDims.height;
1330
+ int width = frameDims.width;
1331
+ std::vector<int64_t> shape = {height, width, 3};
1332
+ std::vector<int64_t> strides = {filteredAVFrame->linesize[0], 3, 1};
1333
+ AVFrame* filteredAVFramePtr = filteredAVFrame.release();
1334
+ auto deleter = [filteredAVFramePtr](void*) {
1335
+ UniqueAVFrame avFrameToDelete(filteredAVFramePtr);
1336
+ };
1337
+ return torch::from_blob(
1338
+ filteredAVFramePtr->data[0], shape, strides, deleter, {torch::kUInt8});
1339
+ }
1340
+
1341
+ void SingleStreamDecoder::convertAudioAVFrameToFrameOutputOnCPU(
1342
+ UniqueAVFrame& srcAVFrame,
1343
+ FrameOutput& frameOutput) {
1344
+ AVSampleFormat sourceSampleFormat =
1345
+ static_cast<AVSampleFormat>(srcAVFrame->format);
1346
+ AVSampleFormat desiredSampleFormat = AV_SAMPLE_FMT_FLTP;
1347
+
1348
+ StreamInfo& streamInfo = streamInfos_[activeStreamIndex_];
1349
+ int sourceSampleRate = srcAVFrame->sample_rate;
1350
+ int desiredSampleRate =
1351
+ streamInfo.audioStreamOptions.sampleRate.value_or(sourceSampleRate);
1352
+
1353
+ bool mustConvert =
1354
+ (sourceSampleFormat != desiredSampleFormat ||
1355
+ sourceSampleRate != desiredSampleRate);
1356
+
1357
+ UniqueAVFrame convertedAVFrame;
1358
+ if (mustConvert) {
1359
+ if (!streamInfo.swrContext) {
1360
+ streamInfo.swrContext.reset(createSwrContext(
1361
+ streamInfo.codecContext,
1362
+ sourceSampleFormat,
1363
+ desiredSampleFormat,
1364
+ sourceSampleRate,
1365
+ desiredSampleRate));
1366
+ }
1367
+
1368
+ convertedAVFrame = convertAudioAVFrameSampleFormatAndSampleRate(
1369
+ streamInfo.swrContext,
1370
+ srcAVFrame,
1371
+ desiredSampleFormat,
1372
+ sourceSampleRate,
1373
+ desiredSampleRate);
1374
+ }
1375
+ const UniqueAVFrame& avFrame = mustConvert ? convertedAVFrame : srcAVFrame;
1376
+
1377
+ AVSampleFormat format = static_cast<AVSampleFormat>(avFrame->format);
1378
+ TORCH_CHECK(
1379
+ format == desiredSampleFormat,
1380
+ "Something went wrong, the frame didn't get converted to the desired format. ",
1381
+ "Desired format = ",
1382
+ av_get_sample_fmt_name(desiredSampleFormat),
1383
+ "source format = ",
1384
+ av_get_sample_fmt_name(format));
1385
+
1386
+ auto numSamples = avFrame->nb_samples; // per channel
1387
+ auto numChannels = getNumChannels(avFrame);
1388
+
1389
+ frameOutput.data = torch::empty({numChannels, numSamples}, torch::kFloat32);
1390
+
1391
+ if (numSamples > 0) {
1392
+ uint8_t* outputChannelData =
1393
+ static_cast<uint8_t*>(frameOutput.data.data_ptr());
1394
+ auto numBytesPerChannel = numSamples * av_get_bytes_per_sample(format);
1395
+ for (auto channel = 0; channel < numChannels;
1396
+ ++channel, outputChannelData += numBytesPerChannel) {
1397
+ std::memcpy(
1398
+ outputChannelData,
1399
+ avFrame->extended_data[channel],
1400
+ numBytesPerChannel);
1401
+ }
1402
+ }
1403
+ }
1404
+
1405
+ std::optional<torch::Tensor> SingleStreamDecoder::maybeFlushSwrBuffers() {
1406
+ // When sample rate conversion is involved, swresample buffers some of the
1407
+ // samples in-between calls to swr_convert (see the libswresample docs).
1408
+ // That's because the last few samples in a given frame require future samples
1409
+ // from the next frame to be properly converted. This function flushes out the
1410
+ // samples that are stored in swresample's buffers.
1411
+ auto& streamInfo = streamInfos_[activeStreamIndex_];
1412
+ if (!streamInfo.swrContext) {
1413
+ return std::nullopt;
1414
+ }
1415
+ auto numRemainingSamples = // this is an upper bound
1416
+ swr_get_out_samples(streamInfo.swrContext.get(), 0);
1417
+
1418
+ if (numRemainingSamples == 0) {
1419
+ return std::nullopt;
1420
+ }
1421
+
1422
+ auto numChannels = getNumChannels(streamInfo.codecContext);
1423
+ torch::Tensor lastSamples =
1424
+ torch::empty({numChannels, numRemainingSamples}, torch::kFloat32);
1425
+
1426
+ std::vector<uint8_t*> outputBuffers(numChannels);
1427
+ for (auto i = 0; i < numChannels; i++) {
1428
+ outputBuffers[i] = static_cast<uint8_t*>(lastSamples[i].data_ptr());
1429
+ }
1430
+
1431
+ auto actualNumRemainingSamples = swr_convert(
1432
+ streamInfo.swrContext.get(),
1433
+ outputBuffers.data(),
1434
+ numRemainingSamples,
1435
+ nullptr,
1436
+ 0);
1437
+
1438
+ return lastSamples.narrow(
1439
+ /*dim=*/1, /*start=*/0, /*length=*/actualNumRemainingSamples);
1440
+ }
1441
+
1442
+ // --------------------------------------------------------------------------
1443
+ // OUTPUT ALLOCATION AND SHAPE CONVERSION
1444
+ // --------------------------------------------------------------------------
1445
+
1446
+ FrameBatchOutput::FrameBatchOutput(
1447
+ int64_t numFrames,
1448
+ const VideoStreamOptions& videoStreamOptions,
1449
+ const StreamMetadata& streamMetadata)
1450
+ : ptsSeconds(torch::empty({numFrames}, {torch::kFloat64})),
1451
+ durationSeconds(torch::empty({numFrames}, {torch::kFloat64})) {
1452
+ auto frameDims = getHeightAndWidthFromOptionsOrMetadata(
1453
+ videoStreamOptions, streamMetadata);
1454
+ int height = frameDims.height;
1455
+ int width = frameDims.width;
1456
+ data = allocateEmptyHWCTensor(
1457
+ height, width, videoStreamOptions.device, numFrames);
1458
+ }
1459
+
1460
+ torch::Tensor allocateEmptyHWCTensor(
1461
+ int height,
1462
+ int width,
1463
+ torch::Device device,
1464
+ std::optional<int> numFrames) {
1465
+ auto tensorOptions = torch::TensorOptions()
1466
+ .dtype(torch::kUInt8)
1467
+ .layout(torch::kStrided)
1468
+ .device(device);
1469
+ TORCH_CHECK(height > 0, "height must be > 0, got: ", height);
1470
+ TORCH_CHECK(width > 0, "width must be > 0, got: ", width);
1471
+ if (numFrames.has_value()) {
1472
+ auto numFramesValue = numFrames.value();
1473
+ TORCH_CHECK(
1474
+ numFramesValue >= 0, "numFrames must be >= 0, got: ", numFramesValue);
1475
+ return torch::empty({numFramesValue, height, width, 3}, tensorOptions);
1476
+ } else {
1477
+ return torch::empty({height, width, 3}, tensorOptions);
1478
+ }
1479
+ }
1480
+
1481
+ // Returns a [N]CHW *view* of a [N]HWC input tensor, if the options require so.
1482
+ // The [N] leading batch-dimension is optional i.e. the input tensor can be 3D
1483
+ // or 4D.
1484
+ // Calling permute() is guaranteed to return a view as per the docs:
1485
+ // https://pytorch.org/docs/stable/generated/torch.permute.html
1486
+ torch::Tensor SingleStreamDecoder::maybePermuteHWC2CHW(
1487
+ torch::Tensor& hwcTensor) {
1488
+ if (streamInfos_[activeStreamIndex_].videoStreamOptions.dimensionOrder ==
1489
+ "NHWC") {
1490
+ return hwcTensor;
1491
+ }
1492
+ auto numDimensions = hwcTensor.dim();
1493
+ auto shape = hwcTensor.sizes();
1494
+ if (numDimensions == 3) {
1495
+ TORCH_CHECK(shape[2] == 3, "Not a HWC tensor: ", shape);
1496
+ return hwcTensor.permute({2, 0, 1});
1497
+ } else if (numDimensions == 4) {
1498
+ TORCH_CHECK(shape[3] == 3, "Not a NHWC tensor: ", shape);
1499
+ return hwcTensor.permute({0, 3, 1, 2});
1500
+ } else {
1501
+ TORCH_CHECK(
1502
+ false, "Expected tensor with 3 or 4 dimensions, got ", numDimensions);
1503
+ }
1504
+ }
1505
+
1506
+ // --------------------------------------------------------------------------
1507
+ // COLOR CONVERSION UTILS AND INITIALIZERS
1508
+ // --------------------------------------------------------------------------
1509
+
1510
+ bool SingleStreamDecoder::DecodedFrameContext::operator==(
1511
+ const SingleStreamDecoder::DecodedFrameContext& other) {
1512
+ return decodedWidth == other.decodedWidth &&
1513
+ decodedHeight == other.decodedHeight &&
1514
+ decodedFormat == other.decodedFormat &&
1515
+ expectedWidth == other.expectedWidth &&
1516
+ expectedHeight == other.expectedHeight;
1517
+ }
1518
+
1519
+ bool SingleStreamDecoder::DecodedFrameContext::operator!=(
1520
+ const SingleStreamDecoder::DecodedFrameContext& other) {
1521
+ return !(*this == other);
1522
+ }
1523
+
1524
+ void SingleStreamDecoder::createFilterGraph(
1525
+ StreamInfo& streamInfo,
1526
+ int expectedOutputHeight,
1527
+ int expectedOutputWidth) {
1528
+ FilterGraphContext& filterGraphContext = streamInfo.filterGraphContext;
1529
+ filterGraphContext.filterGraph.reset(avfilter_graph_alloc());
1530
+ TORCH_CHECK(filterGraphContext.filterGraph.get() != nullptr);
1531
+
1532
+ if (streamInfo.videoStreamOptions.ffmpegThreadCount.has_value()) {
1533
+ filterGraphContext.filterGraph->nb_threads =
1534
+ streamInfo.videoStreamOptions.ffmpegThreadCount.value();
1535
+ }
1536
+
1537
+ const AVFilter* buffersrc = avfilter_get_by_name("buffer");
1538
+ const AVFilter* buffersink = avfilter_get_by_name("buffersink");
1539
+ AVCodecContext* codecContext = streamInfo.codecContext.get();
1540
+
1541
+ std::stringstream filterArgs;
1542
+ filterArgs << "video_size=" << codecContext->width << "x"
1543
+ << codecContext->height;
1544
+ filterArgs << ":pix_fmt=" << codecContext->pix_fmt;
1545
+ filterArgs << ":time_base=" << streamInfo.stream->time_base.num << "/"
1546
+ << streamInfo.stream->time_base.den;
1547
+ filterArgs << ":pixel_aspect=" << codecContext->sample_aspect_ratio.num << "/"
1548
+ << codecContext->sample_aspect_ratio.den;
1549
+
1550
+ int status = avfilter_graph_create_filter(
1551
+ &filterGraphContext.sourceContext,
1552
+ buffersrc,
1553
+ "in",
1554
+ filterArgs.str().c_str(),
1555
+ nullptr,
1556
+ filterGraphContext.filterGraph.get());
1557
+ if (status < 0) {
1558
+ throw std::runtime_error(
1559
+ std::string("Failed to create filter graph: ") + filterArgs.str() +
1560
+ ": " + getFFMPEGErrorStringFromErrorCode(status));
1561
+ }
1562
+
1563
+ status = avfilter_graph_create_filter(
1564
+ &filterGraphContext.sinkContext,
1565
+ buffersink,
1566
+ "out",
1567
+ nullptr,
1568
+ nullptr,
1569
+ filterGraphContext.filterGraph.get());
1570
+ if (status < 0) {
1571
+ throw std::runtime_error(
1572
+ "Failed to create filter graph: " +
1573
+ getFFMPEGErrorStringFromErrorCode(status));
1574
+ }
1575
+
1576
+ enum AVPixelFormat pix_fmts[] = {AV_PIX_FMT_RGB24, AV_PIX_FMT_NONE};
1577
+
1578
+ status = av_opt_set_int_list(
1579
+ filterGraphContext.sinkContext,
1580
+ "pix_fmts",
1581
+ pix_fmts,
1582
+ AV_PIX_FMT_NONE,
1583
+ AV_OPT_SEARCH_CHILDREN);
1584
+ if (status < 0) {
1585
+ throw std::runtime_error(
1586
+ "Failed to set output pixel formats: " +
1587
+ getFFMPEGErrorStringFromErrorCode(status));
1588
+ }
1589
+
1590
+ UniqueAVFilterInOut outputs(avfilter_inout_alloc());
1591
+ UniqueAVFilterInOut inputs(avfilter_inout_alloc());
1592
+
1593
+ outputs->name = av_strdup("in");
1594
+ outputs->filter_ctx = filterGraphContext.sourceContext;
1595
+ outputs->pad_idx = 0;
1596
+ outputs->next = nullptr;
1597
+ inputs->name = av_strdup("out");
1598
+ inputs->filter_ctx = filterGraphContext.sinkContext;
1599
+ inputs->pad_idx = 0;
1600
+ inputs->next = nullptr;
1601
+
1602
+ std::stringstream description;
1603
+ description << "scale=" << expectedOutputWidth << ":" << expectedOutputHeight;
1604
+ description << ":sws_flags=bilinear";
1605
+
1606
+ AVFilterInOut* outputsTmp = outputs.release();
1607
+ AVFilterInOut* inputsTmp = inputs.release();
1608
+ status = avfilter_graph_parse_ptr(
1609
+ filterGraphContext.filterGraph.get(),
1610
+ description.str().c_str(),
1611
+ &inputsTmp,
1612
+ &outputsTmp,
1613
+ nullptr);
1614
+ outputs.reset(outputsTmp);
1615
+ inputs.reset(inputsTmp);
1616
+ if (status < 0) {
1617
+ throw std::runtime_error(
1618
+ "Failed to parse filter description: " +
1619
+ getFFMPEGErrorStringFromErrorCode(status));
1620
+ }
1621
+
1622
+ status = avfilter_graph_config(filterGraphContext.filterGraph.get(), nullptr);
1623
+ if (status < 0) {
1624
+ throw std::runtime_error(
1625
+ "Failed to configure filter graph: " +
1626
+ getFFMPEGErrorStringFromErrorCode(status));
1627
+ }
1628
+ }
1629
+
1630
+ void SingleStreamDecoder::createSwsContext(
1631
+ StreamInfo& streamInfo,
1632
+ const DecodedFrameContext& frameContext,
1633
+ const enum AVColorSpace colorspace) {
1634
+ SwsContext* swsContext = sws_getContext(
1635
+ frameContext.decodedWidth,
1636
+ frameContext.decodedHeight,
1637
+ frameContext.decodedFormat,
1638
+ frameContext.expectedWidth,
1639
+ frameContext.expectedHeight,
1640
+ AV_PIX_FMT_RGB24,
1641
+ SWS_BILINEAR,
1642
+ nullptr,
1643
+ nullptr,
1644
+ nullptr);
1645
+ TORCH_CHECK(swsContext, "sws_getContext() returned nullptr");
1646
+
1647
+ int* invTable = nullptr;
1648
+ int* table = nullptr;
1649
+ int srcRange, dstRange, brightness, contrast, saturation;
1650
+ int ret = sws_getColorspaceDetails(
1651
+ swsContext,
1652
+ &invTable,
1653
+ &srcRange,
1654
+ &table,
1655
+ &dstRange,
1656
+ &brightness,
1657
+ &contrast,
1658
+ &saturation);
1659
+ TORCH_CHECK(ret != -1, "sws_getColorspaceDetails returned -1");
1660
+
1661
+ const int* colorspaceTable = sws_getCoefficients(colorspace);
1662
+ ret = sws_setColorspaceDetails(
1663
+ swsContext,
1664
+ colorspaceTable,
1665
+ srcRange,
1666
+ colorspaceTable,
1667
+ dstRange,
1668
+ brightness,
1669
+ contrast,
1670
+ saturation);
1671
+ TORCH_CHECK(ret != -1, "sws_setColorspaceDetails returned -1");
1672
+
1673
+ streamInfo.swsContext.reset(swsContext);
1674
+ }
1675
+
1676
+ // --------------------------------------------------------------------------
1677
+ // PTS <-> INDEX CONVERSIONS
1678
+ // --------------------------------------------------------------------------
1679
+
1680
+ int SingleStreamDecoder::getKeyFrameIndexForPts(int64_t pts) const {
1681
+ const StreamInfo& streamInfo = streamInfos_.at(activeStreamIndex_);
1682
+ if (streamInfo.keyFrames.empty()) {
1683
+ return av_index_search_timestamp(
1684
+ streamInfo.stream, pts, AVSEEK_FLAG_BACKWARD);
1685
+ } else {
1686
+ return getKeyFrameIndexForPtsUsingScannedIndex(streamInfo.keyFrames, pts);
1687
+ }
1688
+ }
1689
+
1690
+ int SingleStreamDecoder::getKeyFrameIndexForPtsUsingScannedIndex(
1691
+ const std::vector<SingleStreamDecoder::FrameInfo>& keyFrames,
1692
+ int64_t pts) const {
1693
+ auto upperBound = std::upper_bound(
1694
+ keyFrames.begin(),
1695
+ keyFrames.end(),
1696
+ pts,
1697
+ [](int64_t pts, const SingleStreamDecoder::FrameInfo& frameInfo) {
1698
+ return pts < frameInfo.pts;
1699
+ });
1700
+ if (upperBound == keyFrames.begin()) {
1701
+ return -1;
1702
+ }
1703
+ return upperBound - 1 - keyFrames.begin();
1704
+ }
1705
+
1706
+ int64_t SingleStreamDecoder::secondsToIndexLowerBound(double seconds) {
1707
+ auto& streamInfo = streamInfos_[activeStreamIndex_];
1708
+ switch (seekMode_) {
1709
+ case SeekMode::exact: {
1710
+ auto frame = std::lower_bound(
1711
+ streamInfo.allFrames.begin(),
1712
+ streamInfo.allFrames.end(),
1713
+ seconds,
1714
+ [&streamInfo](const FrameInfo& info, double start) {
1715
+ return ptsToSeconds(info.nextPts, streamInfo.timeBase) <= start;
1716
+ });
1717
+
1718
+ return frame - streamInfo.allFrames.begin();
1719
+ }
1720
+ case SeekMode::approximate: {
1721
+ auto& streamMetadata =
1722
+ containerMetadata_.allStreamMetadata[activeStreamIndex_];
1723
+ TORCH_CHECK(
1724
+ streamMetadata.averageFps.has_value(),
1725
+ "Cannot use approximate mode since we couldn't find the average fps from the metadata.");
1726
+ return std::floor(seconds * streamMetadata.averageFps.value());
1727
+ }
1728
+ default:
1729
+ throw std::runtime_error("Unknown SeekMode");
1730
+ }
1731
+ }
1732
+
1733
+ int64_t SingleStreamDecoder::secondsToIndexUpperBound(double seconds) {
1734
+ auto& streamInfo = streamInfos_[activeStreamIndex_];
1735
+ switch (seekMode_) {
1736
+ case SeekMode::exact: {
1737
+ auto frame = std::upper_bound(
1738
+ streamInfo.allFrames.begin(),
1739
+ streamInfo.allFrames.end(),
1740
+ seconds,
1741
+ [&streamInfo](double stop, const FrameInfo& info) {
1742
+ return stop <= ptsToSeconds(info.pts, streamInfo.timeBase);
1743
+ });
1744
+
1745
+ return frame - streamInfo.allFrames.begin();
1746
+ }
1747
+ case SeekMode::approximate: {
1748
+ auto& streamMetadata =
1749
+ containerMetadata_.allStreamMetadata[activeStreamIndex_];
1750
+ TORCH_CHECK(
1751
+ streamMetadata.averageFps.has_value(),
1752
+ "Cannot use approximate mode since we couldn't find the average fps from the metadata.");
1753
+ return std::ceil(seconds * streamMetadata.averageFps.value());
1754
+ }
1755
+ default:
1756
+ throw std::runtime_error("Unknown SeekMode");
1757
+ }
1758
+ }
1759
+
1760
+ int64_t SingleStreamDecoder::getPts(int64_t frameIndex) {
1761
+ auto& streamInfo = streamInfos_[activeStreamIndex_];
1762
+ switch (seekMode_) {
1763
+ case SeekMode::exact:
1764
+ return streamInfo.allFrames[frameIndex].pts;
1765
+ case SeekMode::approximate: {
1766
+ auto& streamMetadata =
1767
+ containerMetadata_.allStreamMetadata[activeStreamIndex_];
1768
+ TORCH_CHECK(
1769
+ streamMetadata.averageFps.has_value(),
1770
+ "Cannot use approximate mode since we couldn't find the average fps from the metadata.");
1771
+ return secondsToClosestPts(
1772
+ frameIndex / streamMetadata.averageFps.value(), streamInfo.timeBase);
1773
+ }
1774
+ default:
1775
+ throw std::runtime_error("Unknown SeekMode");
1776
+ }
1777
+ }
1778
+
1779
+ // --------------------------------------------------------------------------
1780
+ // STREAM AND METADATA APIS
1781
+ // --------------------------------------------------------------------------
1782
+
1783
+ int64_t SingleStreamDecoder::getNumFrames(
1784
+ const StreamMetadata& streamMetadata) {
1785
+ switch (seekMode_) {
1786
+ case SeekMode::exact:
1787
+ return streamMetadata.numFramesFromScan.value();
1788
+ case SeekMode::approximate: {
1789
+ TORCH_CHECK(
1790
+ streamMetadata.numFrames.has_value(),
1791
+ "Cannot use approximate mode since we couldn't find the number of frames from the metadata.");
1792
+ return streamMetadata.numFrames.value();
1793
+ }
1794
+ default:
1795
+ throw std::runtime_error("Unknown SeekMode");
1796
+ }
1797
+ }
1798
+
1799
+ double SingleStreamDecoder::getMinSeconds(
1800
+ const StreamMetadata& streamMetadata) {
1801
+ switch (seekMode_) {
1802
+ case SeekMode::exact:
1803
+ return streamMetadata.minPtsSecondsFromScan.value();
1804
+ case SeekMode::approximate:
1805
+ return 0;
1806
+ default:
1807
+ throw std::runtime_error("Unknown SeekMode");
1808
+ }
1809
+ }
1810
+
1811
+ double SingleStreamDecoder::getMaxSeconds(
1812
+ const StreamMetadata& streamMetadata) {
1813
+ switch (seekMode_) {
1814
+ case SeekMode::exact:
1815
+ return streamMetadata.maxPtsSecondsFromScan.value();
1816
+ case SeekMode::approximate: {
1817
+ TORCH_CHECK(
1818
+ streamMetadata.durationSeconds.has_value(),
1819
+ "Cannot use approximate mode since we couldn't find the duration from the metadata.");
1820
+ return streamMetadata.durationSeconds.value();
1821
+ }
1822
+ default:
1823
+ throw std::runtime_error("Unknown SeekMode");
1824
+ }
1825
+ }
1826
+
1827
+ // --------------------------------------------------------------------------
1828
+ // VALIDATION UTILS
1829
+ // --------------------------------------------------------------------------
1830
+
1831
+ void SingleStreamDecoder::validateActiveStream(
1832
+ std::optional<AVMediaType> avMediaType) {
1833
+ auto errorMsg =
1834
+ "Provided stream index=" + std::to_string(activeStreamIndex_) +
1835
+ " was not previously added.";
1836
+ TORCH_CHECK(activeStreamIndex_ != NO_ACTIVE_STREAM, errorMsg);
1837
+ TORCH_CHECK(streamInfos_.count(activeStreamIndex_) > 0, errorMsg);
1838
+
1839
+ int allStreamMetadataSize =
1840
+ static_cast<int>(containerMetadata_.allStreamMetadata.size());
1841
+ TORCH_CHECK(
1842
+ activeStreamIndex_ >= 0 && activeStreamIndex_ < allStreamMetadataSize,
1843
+ "Invalid stream index=" + std::to_string(activeStreamIndex_) +
1844
+ "; valid indices are in the range [0, " +
1845
+ std::to_string(allStreamMetadataSize) + ").");
1846
+
1847
+ if (avMediaType.has_value()) {
1848
+ TORCH_CHECK(
1849
+ streamInfos_[activeStreamIndex_].avMediaType == avMediaType.value(),
1850
+ "The method you called isn't supported. ",
1851
+ "If you're seeing this error, you are probably trying to call an ",
1852
+ "unsupported method on an audio stream.");
1853
+ }
1854
+ }
1855
+
1856
+ void SingleStreamDecoder::validateScannedAllStreams(const std::string& msg) {
1857
+ if (!scannedAllStreams_) {
1858
+ throw std::runtime_error(
1859
+ "Must scan all streams to update metadata before calling " + msg);
1860
+ }
1861
+ }
1862
+
1863
+ void SingleStreamDecoder::validateFrameIndex(
1864
+ const StreamMetadata& streamMetadata,
1865
+ int64_t frameIndex) {
1866
+ int64_t numFrames = getNumFrames(streamMetadata);
1867
+ TORCH_CHECK(
1868
+ frameIndex >= 0 && frameIndex < numFrames,
1869
+ "Invalid frame index=" + std::to_string(frameIndex) +
1870
+ " for streamIndex=" + std::to_string(streamMetadata.streamIndex) +
1871
+ " numFrames=" + std::to_string(numFrames));
1872
+ }
1873
+
1874
+ // --------------------------------------------------------------------------
1875
+ // MORALLY PRIVATE UTILS
1876
+ // --------------------------------------------------------------------------
1877
+
1878
+ SingleStreamDecoder::DecodeStats SingleStreamDecoder::getDecodeStats() const {
1879
+ return decodeStats_;
1880
+ }
1881
+
1882
+ std::ostream& operator<<(
1883
+ std::ostream& os,
1884
+ const SingleStreamDecoder::DecodeStats& stats) {
1885
+ os << "DecodeStats{"
1886
+ << "numFramesReceivedByDecoder=" << stats.numFramesReceivedByDecoder
1887
+ << ", numPacketsRead=" << stats.numPacketsRead
1888
+ << ", numPacketsSentToDecoder=" << stats.numPacketsSentToDecoder
1889
+ << ", numSeeksAttempted=" << stats.numSeeksAttempted
1890
+ << ", numSeeksSkipped=" << stats.numSeeksSkipped
1891
+ << ", numFlushes=" << stats.numFlushes << "}";
1892
+
1893
+ return os;
1894
+ }
1895
+
1896
+ void SingleStreamDecoder::resetDecodeStats() {
1897
+ decodeStats_ = DecodeStats{};
1898
+ }
1899
+
1900
+ double SingleStreamDecoder::getPtsSecondsForFrame(int64_t frameIndex) {
1901
+ validateActiveStream(AVMEDIA_TYPE_VIDEO);
1902
+ validateScannedAllStreams("getPtsSecondsForFrame");
1903
+
1904
+ const auto& streamInfo = streamInfos_[activeStreamIndex_];
1905
+ const auto& streamMetadata =
1906
+ containerMetadata_.allStreamMetadata[activeStreamIndex_];
1907
+ validateFrameIndex(streamMetadata, frameIndex);
1908
+
1909
+ return ptsToSeconds(
1910
+ streamInfo.allFrames[frameIndex].pts, streamInfo.timeBase);
1911
+ }
1912
+
1913
+ // --------------------------------------------------------------------------
1914
+ // FrameDims APIs
1915
+ // --------------------------------------------------------------------------
1916
+
1917
+ FrameDims getHeightAndWidthFromResizedAVFrame(const AVFrame& resizedAVFrame) {
1918
+ return FrameDims(resizedAVFrame.height, resizedAVFrame.width);
1919
+ }
1920
+
1921
+ FrameDims getHeightAndWidthFromOptionsOrMetadata(
1922
+ const VideoStreamOptions& videoStreamOptions,
1923
+ const StreamMetadata& streamMetadata) {
1924
+ return FrameDims(
1925
+ videoStreamOptions.height.value_or(*streamMetadata.height),
1926
+ videoStreamOptions.width.value_or(*streamMetadata.width));
1927
+ }
1928
+
1929
+ FrameDims getHeightAndWidthFromOptionsOrAVFrame(
1930
+ const VideoStreamOptions& videoStreamOptions,
1931
+ const UniqueAVFrame& avFrame) {
1932
+ return FrameDims(
1933
+ videoStreamOptions.height.value_or(avFrame->height),
1934
+ videoStreamOptions.width.value_or(avFrame->width));
1935
+ }
1936
+
1937
+ SingleStreamDecoder::SeekMode seekModeFromString(std::string_view seekMode) {
1938
+ if (seekMode == "exact") {
1939
+ return SingleStreamDecoder::SeekMode::exact;
1940
+ } else if (seekMode == "approximate") {
1941
+ return SingleStreamDecoder::SeekMode::approximate;
1942
+ } else {
1943
+ TORCH_CHECK(false, "Invalid seek mode: " + std::string(seekMode));
1944
+ }
1945
+ }
1946
+
1947
+ } // namespace facebook::torchcodec