torchcodec 0.7.0__cp312-cp312-win_amd64.whl → 0.8.1__cp312-cp312-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of torchcodec might be problematic. Click here for more details.
- torchcodec/_core/AVIOTensorContext.cpp +23 -16
- torchcodec/_core/AVIOTensorContext.h +2 -1
- torchcodec/_core/BetaCudaDeviceInterface.cpp +718 -0
- torchcodec/_core/BetaCudaDeviceInterface.h +193 -0
- torchcodec/_core/CMakeLists.txt +18 -3
- torchcodec/_core/CUDACommon.cpp +330 -0
- torchcodec/_core/CUDACommon.h +51 -0
- torchcodec/_core/Cache.h +6 -20
- torchcodec/_core/CpuDeviceInterface.cpp +195 -108
- torchcodec/_core/CpuDeviceInterface.h +84 -19
- torchcodec/_core/CudaDeviceInterface.cpp +227 -376
- torchcodec/_core/CudaDeviceInterface.h +38 -6
- torchcodec/_core/DeviceInterface.cpp +57 -19
- torchcodec/_core/DeviceInterface.h +97 -16
- torchcodec/_core/Encoder.cpp +346 -9
- torchcodec/_core/Encoder.h +62 -1
- torchcodec/_core/FFMPEGCommon.cpp +190 -3
- torchcodec/_core/FFMPEGCommon.h +27 -1
- torchcodec/_core/FilterGraph.cpp +30 -22
- torchcodec/_core/FilterGraph.h +15 -1
- torchcodec/_core/Frame.cpp +22 -7
- torchcodec/_core/Frame.h +15 -61
- torchcodec/_core/Metadata.h +2 -2
- torchcodec/_core/NVCUVIDRuntimeLoader.cpp +320 -0
- torchcodec/_core/NVCUVIDRuntimeLoader.h +14 -0
- torchcodec/_core/NVDECCache.cpp +60 -0
- torchcodec/_core/NVDECCache.h +102 -0
- torchcodec/_core/SingleStreamDecoder.cpp +196 -201
- torchcodec/_core/SingleStreamDecoder.h +42 -15
- torchcodec/_core/StreamOptions.h +16 -6
- torchcodec/_core/Transform.cpp +87 -0
- torchcodec/_core/Transform.h +84 -0
- torchcodec/_core/__init__.py +4 -0
- torchcodec/_core/custom_ops.cpp +257 -32
- torchcodec/_core/fetch_and_expose_non_gpl_ffmpeg_libs.cmake +61 -1
- torchcodec/_core/nvcuvid_include/cuviddec.h +1374 -0
- torchcodec/_core/nvcuvid_include/nvcuvid.h +610 -0
- torchcodec/_core/ops.py +147 -44
- torchcodec/_core/pybind_ops.cpp +22 -59
- torchcodec/_samplers/video_clip_sampler.py +7 -19
- torchcodec/decoders/__init__.py +1 -0
- torchcodec/decoders/_decoder_utils.py +61 -1
- torchcodec/decoders/_video_decoder.py +46 -20
- torchcodec/libtorchcodec_core4.dll +0 -0
- torchcodec/libtorchcodec_core5.dll +0 -0
- torchcodec/libtorchcodec_core6.dll +0 -0
- torchcodec/libtorchcodec_core7.dll +0 -0
- torchcodec/libtorchcodec_core8.dll +0 -0
- torchcodec/libtorchcodec_custom_ops4.dll +0 -0
- torchcodec/libtorchcodec_custom_ops5.dll +0 -0
- torchcodec/libtorchcodec_custom_ops6.dll +0 -0
- torchcodec/libtorchcodec_custom_ops7.dll +0 -0
- torchcodec/libtorchcodec_custom_ops8.dll +0 -0
- torchcodec/libtorchcodec_pybind_ops4.pyd +0 -0
- torchcodec/libtorchcodec_pybind_ops5.pyd +0 -0
- torchcodec/libtorchcodec_pybind_ops6.pyd +0 -0
- torchcodec/libtorchcodec_pybind_ops7.pyd +0 -0
- torchcodec/libtorchcodec_pybind_ops8.pyd +0 -0
- torchcodec/samplers/_time_based.py +8 -0
- torchcodec/version.py +1 -1
- {torchcodec-0.7.0.dist-info → torchcodec-0.8.1.dist-info}/METADATA +29 -16
- torchcodec-0.8.1.dist-info/RECORD +82 -0
- {torchcodec-0.7.0.dist-info → torchcodec-0.8.1.dist-info}/WHEEL +1 -1
- torchcodec-0.7.0.dist-info/RECORD +0 -67
- {torchcodec-0.7.0.dist-info → torchcodec-0.8.1.dist-info}/licenses/LICENSE +0 -0
- {torchcodec-0.7.0.dist-info → torchcodec-0.8.1.dist-info}/top_level.txt +0 -0
torchcodec/_core/Encoder.cpp
CHANGED
|
@@ -33,21 +33,22 @@ torch::Tensor validateSamples(const torch::Tensor& samples) {
|
|
|
33
33
|
}
|
|
34
34
|
|
|
35
35
|
void validateSampleRate(const AVCodec& avCodec, int sampleRate) {
|
|
36
|
-
|
|
36
|
+
const int* supportedSampleRates = getSupportedSampleRates(avCodec);
|
|
37
|
+
if (supportedSampleRates == nullptr) {
|
|
37
38
|
return;
|
|
38
39
|
}
|
|
39
40
|
|
|
40
|
-
for (auto i = 0;
|
|
41
|
-
if (sampleRate ==
|
|
41
|
+
for (auto i = 0; supportedSampleRates[i] != 0; ++i) {
|
|
42
|
+
if (sampleRate == supportedSampleRates[i]) {
|
|
42
43
|
return;
|
|
43
44
|
}
|
|
44
45
|
}
|
|
45
46
|
std::stringstream supportedRates;
|
|
46
|
-
for (auto i = 0;
|
|
47
|
+
for (auto i = 0; supportedSampleRates[i] != 0; ++i) {
|
|
47
48
|
if (i > 0) {
|
|
48
49
|
supportedRates << ", ";
|
|
49
50
|
}
|
|
50
|
-
supportedRates <<
|
|
51
|
+
supportedRates << supportedSampleRates[i];
|
|
51
52
|
}
|
|
52
53
|
|
|
53
54
|
TORCH_CHECK(
|
|
@@ -73,19 +74,22 @@ static const std::vector<AVSampleFormat> preferredFormatsOrder = {
|
|
|
73
74
|
AV_SAMPLE_FMT_U8};
|
|
74
75
|
|
|
75
76
|
AVSampleFormat findBestOutputSampleFormat(const AVCodec& avCodec) {
|
|
77
|
+
const AVSampleFormat* supportedSampleFormats =
|
|
78
|
+
getSupportedOutputSampleFormats(avCodec);
|
|
79
|
+
|
|
76
80
|
// Find a sample format that the encoder supports. We prefer using FLT[P],
|
|
77
81
|
// since this is the format of the input samples. If FLTP isn't supported
|
|
78
82
|
// then we'll need to convert the AVFrame's format. Our heuristic is to encode
|
|
79
83
|
// into the format with the highest resolution.
|
|
80
|
-
if (
|
|
84
|
+
if (supportedSampleFormats == nullptr) {
|
|
81
85
|
// Can't really validate anything in this case, best we can do is hope that
|
|
82
86
|
// FLTP is supported by the encoder. If not, FFmpeg will raise.
|
|
83
87
|
return AV_SAMPLE_FMT_FLTP;
|
|
84
88
|
}
|
|
85
89
|
|
|
86
90
|
for (AVSampleFormat preferredFormat : preferredFormatsOrder) {
|
|
87
|
-
for (int i = 0;
|
|
88
|
-
if (
|
|
91
|
+
for (int i = 0; supportedSampleFormats[i] != -1; ++i) {
|
|
92
|
+
if (supportedSampleFormats[i] == preferredFormat) {
|
|
89
93
|
return preferredFormat;
|
|
90
94
|
}
|
|
91
95
|
}
|
|
@@ -93,7 +97,7 @@ AVSampleFormat findBestOutputSampleFormat(const AVCodec& avCodec) {
|
|
|
93
97
|
// We should always find a match in preferredFormatsOrder, so we should always
|
|
94
98
|
// return earlier. But in the event that a future FFmpeg version defines an
|
|
95
99
|
// additional sample format that isn't in preferredFormatsOrder, we fallback:
|
|
96
|
-
return
|
|
100
|
+
return supportedSampleFormats[0];
|
|
97
101
|
}
|
|
98
102
|
|
|
99
103
|
} // namespace
|
|
@@ -511,4 +515,337 @@ void AudioEncoder::flushBuffers() {
|
|
|
511
515
|
|
|
512
516
|
encodeFrame(autoAVPacket, UniqueAVFrame(nullptr));
|
|
513
517
|
}
|
|
518
|
+
|
|
519
|
+
namespace {
|
|
520
|
+
|
|
521
|
+
torch::Tensor validateFrames(const torch::Tensor& frames) {
|
|
522
|
+
TORCH_CHECK(
|
|
523
|
+
frames.dtype() == torch::kUInt8,
|
|
524
|
+
"frames must have uint8 dtype, got ",
|
|
525
|
+
frames.dtype());
|
|
526
|
+
TORCH_CHECK(
|
|
527
|
+
frames.dim() == 4,
|
|
528
|
+
"frames must have 4 dimensions (N, C, H, W), got ",
|
|
529
|
+
frames.dim());
|
|
530
|
+
TORCH_CHECK(
|
|
531
|
+
frames.sizes()[1] == 3,
|
|
532
|
+
"frame must have 3 channels (R, G, B), got ",
|
|
533
|
+
frames.sizes()[1]);
|
|
534
|
+
// TODO-VideoEncoder: Investigate if non-contiguous frames can be accepted
|
|
535
|
+
return frames.contiguous();
|
|
536
|
+
}
|
|
537
|
+
|
|
538
|
+
} // namespace
|
|
539
|
+
|
|
540
|
+
VideoEncoder::~VideoEncoder() {
|
|
541
|
+
// TODO-VideoEncoder: Unify destructor with ~AudioEncoder()
|
|
542
|
+
if (avFormatContext_ && avFormatContext_->pb) {
|
|
543
|
+
if (avFormatContext_->pb->error == 0) {
|
|
544
|
+
avio_flush(avFormatContext_->pb);
|
|
545
|
+
}
|
|
546
|
+
if (!avioContextHolder_) {
|
|
547
|
+
if (avFormatContext_->pb->error == 0) {
|
|
548
|
+
avio_close(avFormatContext_->pb);
|
|
549
|
+
}
|
|
550
|
+
avFormatContext_->pb = nullptr;
|
|
551
|
+
}
|
|
552
|
+
}
|
|
553
|
+
}
|
|
554
|
+
|
|
555
|
+
VideoEncoder::VideoEncoder(
|
|
556
|
+
const torch::Tensor& frames,
|
|
557
|
+
int frameRate,
|
|
558
|
+
std::string_view fileName,
|
|
559
|
+
const VideoStreamOptions& videoStreamOptions)
|
|
560
|
+
: frames_(validateFrames(frames)), inFrameRate_(frameRate) {
|
|
561
|
+
setFFmpegLogLevel();
|
|
562
|
+
|
|
563
|
+
// Allocate output format context
|
|
564
|
+
AVFormatContext* avFormatContext = nullptr;
|
|
565
|
+
int status = avformat_alloc_output_context2(
|
|
566
|
+
&avFormatContext, nullptr, nullptr, fileName.data());
|
|
567
|
+
|
|
568
|
+
TORCH_CHECK(
|
|
569
|
+
avFormatContext != nullptr,
|
|
570
|
+
"Couldn't allocate AVFormatContext. ",
|
|
571
|
+
"The destination file is ",
|
|
572
|
+
fileName,
|
|
573
|
+
", check the desired extension? ",
|
|
574
|
+
getFFMPEGErrorStringFromErrorCode(status));
|
|
575
|
+
avFormatContext_.reset(avFormatContext);
|
|
576
|
+
|
|
577
|
+
status = avio_open(&avFormatContext_->pb, fileName.data(), AVIO_FLAG_WRITE);
|
|
578
|
+
TORCH_CHECK(
|
|
579
|
+
status >= 0,
|
|
580
|
+
"avio_open failed. The destination file is ",
|
|
581
|
+
fileName,
|
|
582
|
+
", make sure it's a valid path? ",
|
|
583
|
+
getFFMPEGErrorStringFromErrorCode(status));
|
|
584
|
+
initializeEncoder(videoStreamOptions);
|
|
585
|
+
}
|
|
586
|
+
|
|
587
|
+
VideoEncoder::VideoEncoder(
|
|
588
|
+
const torch::Tensor& frames,
|
|
589
|
+
int frameRate,
|
|
590
|
+
std::string_view formatName,
|
|
591
|
+
std::unique_ptr<AVIOContextHolder> avioContextHolder,
|
|
592
|
+
const VideoStreamOptions& videoStreamOptions)
|
|
593
|
+
: frames_(validateFrames(frames)),
|
|
594
|
+
inFrameRate_(frameRate),
|
|
595
|
+
avioContextHolder_(std::move(avioContextHolder)) {
|
|
596
|
+
setFFmpegLogLevel();
|
|
597
|
+
// Map mkv -> matroska when used as format name
|
|
598
|
+
formatName = (formatName == "mkv") ? "matroska" : formatName;
|
|
599
|
+
AVFormatContext* avFormatContext = nullptr;
|
|
600
|
+
int status = avformat_alloc_output_context2(
|
|
601
|
+
&avFormatContext, nullptr, formatName.data(), nullptr);
|
|
602
|
+
|
|
603
|
+
TORCH_CHECK(
|
|
604
|
+
avFormatContext != nullptr,
|
|
605
|
+
"Couldn't allocate AVFormatContext. ",
|
|
606
|
+
"Check the desired format? Got format=",
|
|
607
|
+
formatName,
|
|
608
|
+
". ",
|
|
609
|
+
getFFMPEGErrorStringFromErrorCode(status));
|
|
610
|
+
avFormatContext_.reset(avFormatContext);
|
|
611
|
+
|
|
612
|
+
avFormatContext_->pb = avioContextHolder_->getAVIOContext();
|
|
613
|
+
|
|
614
|
+
initializeEncoder(videoStreamOptions);
|
|
615
|
+
}
|
|
616
|
+
|
|
617
|
+
void VideoEncoder::initializeEncoder(
|
|
618
|
+
const VideoStreamOptions& videoStreamOptions) {
|
|
619
|
+
const AVCodec* avCodec =
|
|
620
|
+
avcodec_find_encoder(avFormatContext_->oformat->video_codec);
|
|
621
|
+
TORCH_CHECK(avCodec != nullptr, "Video codec not found");
|
|
622
|
+
|
|
623
|
+
AVCodecContext* avCodecContext = avcodec_alloc_context3(avCodec);
|
|
624
|
+
TORCH_CHECK(avCodecContext != nullptr, "Couldn't allocate codec context.");
|
|
625
|
+
avCodecContext_.reset(avCodecContext);
|
|
626
|
+
|
|
627
|
+
// Store dimension order and input pixel format
|
|
628
|
+
// TODO-VideoEncoder: Remove assumption that tensor in NCHW format
|
|
629
|
+
auto sizes = frames_.sizes();
|
|
630
|
+
inPixelFormat_ = AV_PIX_FMT_GBRP;
|
|
631
|
+
inHeight_ = static_cast<int>(sizes[2]);
|
|
632
|
+
inWidth_ = static_cast<int>(sizes[3]);
|
|
633
|
+
|
|
634
|
+
// Use specified dimensions or input dimensions
|
|
635
|
+
// TODO-VideoEncoder: Allow height and width to be set
|
|
636
|
+
outWidth_ = inWidth_;
|
|
637
|
+
outHeight_ = inHeight_;
|
|
638
|
+
|
|
639
|
+
// TODO-VideoEncoder: Enable other pixel formats
|
|
640
|
+
// Let FFmpeg choose best pixel format to minimize loss
|
|
641
|
+
outPixelFormat_ = avcodec_find_best_pix_fmt_of_list(
|
|
642
|
+
getSupportedPixelFormats(*avCodec), // List of supported formats
|
|
643
|
+
AV_PIX_FMT_GBRP, // We reorder input to GBRP currently
|
|
644
|
+
0, // No alpha channel
|
|
645
|
+
nullptr // Discard conversion loss information
|
|
646
|
+
);
|
|
647
|
+
TORCH_CHECK(outPixelFormat_ != -1, "Failed to find best pix fmt")
|
|
648
|
+
|
|
649
|
+
// Configure codec parameters
|
|
650
|
+
avCodecContext_->codec_id = avCodec->id;
|
|
651
|
+
avCodecContext_->width = outWidth_;
|
|
652
|
+
avCodecContext_->height = outHeight_;
|
|
653
|
+
avCodecContext_->pix_fmt = outPixelFormat_;
|
|
654
|
+
// TODO-VideoEncoder: Verify that frame_rate and time_base are correct
|
|
655
|
+
avCodecContext_->time_base = {1, inFrameRate_};
|
|
656
|
+
avCodecContext_->framerate = {inFrameRate_, 1};
|
|
657
|
+
|
|
658
|
+
// Set flag for containers that require extradata to be in the codec context
|
|
659
|
+
if (avFormatContext_->oformat->flags & AVFMT_GLOBALHEADER) {
|
|
660
|
+
avCodecContext_->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
|
|
661
|
+
}
|
|
662
|
+
|
|
663
|
+
// Apply videoStreamOptions
|
|
664
|
+
AVDictionary* options = nullptr;
|
|
665
|
+
if (videoStreamOptions.crf.has_value()) {
|
|
666
|
+
av_dict_set(
|
|
667
|
+
&options,
|
|
668
|
+
"crf",
|
|
669
|
+
std::to_string(videoStreamOptions.crf.value()).c_str(),
|
|
670
|
+
0);
|
|
671
|
+
}
|
|
672
|
+
int status = avcodec_open2(avCodecContext_.get(), avCodec, &options);
|
|
673
|
+
av_dict_free(&options);
|
|
674
|
+
|
|
675
|
+
TORCH_CHECK(
|
|
676
|
+
status == AVSUCCESS,
|
|
677
|
+
"avcodec_open2 failed: ",
|
|
678
|
+
getFFMPEGErrorStringFromErrorCode(status));
|
|
679
|
+
|
|
680
|
+
avStream_ = avformat_new_stream(avFormatContext_.get(), nullptr);
|
|
681
|
+
TORCH_CHECK(avStream_ != nullptr, "Couldn't create new stream.");
|
|
682
|
+
|
|
683
|
+
// Set the stream time base to encode correct frame timestamps
|
|
684
|
+
avStream_->time_base = avCodecContext_->time_base;
|
|
685
|
+
status = avcodec_parameters_from_context(
|
|
686
|
+
avStream_->codecpar, avCodecContext_.get());
|
|
687
|
+
TORCH_CHECK(
|
|
688
|
+
status == AVSUCCESS,
|
|
689
|
+
"avcodec_parameters_from_context failed: ",
|
|
690
|
+
getFFMPEGErrorStringFromErrorCode(status));
|
|
691
|
+
}
|
|
692
|
+
|
|
693
|
+
void VideoEncoder::encode() {
|
|
694
|
+
// To be on the safe side we enforce that encode() can only be called once
|
|
695
|
+
TORCH_CHECK(!encodeWasCalled_, "Cannot call encode() twice.");
|
|
696
|
+
encodeWasCalled_ = true;
|
|
697
|
+
|
|
698
|
+
int status = avformat_write_header(avFormatContext_.get(), nullptr);
|
|
699
|
+
TORCH_CHECK(
|
|
700
|
+
status == AVSUCCESS,
|
|
701
|
+
"Error in avformat_write_header: ",
|
|
702
|
+
getFFMPEGErrorStringFromErrorCode(status));
|
|
703
|
+
|
|
704
|
+
AutoAVPacket autoAVPacket;
|
|
705
|
+
int numFrames = static_cast<int>(frames_.sizes()[0]);
|
|
706
|
+
for (int i = 0; i < numFrames; ++i) {
|
|
707
|
+
torch::Tensor currFrame = frames_[i];
|
|
708
|
+
UniqueAVFrame avFrame = convertTensorToAVFrame(currFrame, i);
|
|
709
|
+
encodeFrame(autoAVPacket, avFrame);
|
|
710
|
+
}
|
|
711
|
+
|
|
712
|
+
flushBuffers();
|
|
713
|
+
|
|
714
|
+
status = av_write_trailer(avFormatContext_.get());
|
|
715
|
+
TORCH_CHECK(
|
|
716
|
+
status == AVSUCCESS,
|
|
717
|
+
"Error in av_write_trailer: ",
|
|
718
|
+
getFFMPEGErrorStringFromErrorCode(status));
|
|
719
|
+
}
|
|
720
|
+
|
|
721
|
+
UniqueAVFrame VideoEncoder::convertTensorToAVFrame(
|
|
722
|
+
const torch::Tensor& frame,
|
|
723
|
+
int frameIndex) {
|
|
724
|
+
// Initialize and cache scaling context if it does not exist
|
|
725
|
+
if (!swsContext_) {
|
|
726
|
+
swsContext_.reset(sws_getContext(
|
|
727
|
+
inWidth_,
|
|
728
|
+
inHeight_,
|
|
729
|
+
inPixelFormat_,
|
|
730
|
+
outWidth_,
|
|
731
|
+
outHeight_,
|
|
732
|
+
outPixelFormat_,
|
|
733
|
+
SWS_BICUBIC, // Used by FFmpeg CLI
|
|
734
|
+
nullptr,
|
|
735
|
+
nullptr,
|
|
736
|
+
nullptr));
|
|
737
|
+
TORCH_CHECK(swsContext_ != nullptr, "Failed to create scaling context");
|
|
738
|
+
}
|
|
739
|
+
|
|
740
|
+
UniqueAVFrame avFrame(av_frame_alloc());
|
|
741
|
+
TORCH_CHECK(avFrame != nullptr, "Failed to allocate AVFrame");
|
|
742
|
+
|
|
743
|
+
// Set output frame properties
|
|
744
|
+
avFrame->format = outPixelFormat_;
|
|
745
|
+
avFrame->width = outWidth_;
|
|
746
|
+
avFrame->height = outHeight_;
|
|
747
|
+
avFrame->pts = frameIndex;
|
|
748
|
+
|
|
749
|
+
int status = av_frame_get_buffer(avFrame.get(), 0);
|
|
750
|
+
TORCH_CHECK(status >= 0, "Failed to allocate frame buffer");
|
|
751
|
+
|
|
752
|
+
// Need to convert/scale the frame
|
|
753
|
+
// Create temporary frame with input format
|
|
754
|
+
UniqueAVFrame inputFrame(av_frame_alloc());
|
|
755
|
+
TORCH_CHECK(inputFrame != nullptr, "Failed to allocate input AVFrame");
|
|
756
|
+
|
|
757
|
+
inputFrame->format = inPixelFormat_;
|
|
758
|
+
inputFrame->width = inWidth_;
|
|
759
|
+
inputFrame->height = inHeight_;
|
|
760
|
+
|
|
761
|
+
uint8_t* tensorData = static_cast<uint8_t*>(frame.data_ptr());
|
|
762
|
+
|
|
763
|
+
// TODO-VideoEncoder: Reorder tensor if in NHWC format
|
|
764
|
+
int channelSize = inHeight_ * inWidth_;
|
|
765
|
+
// Reorder RGB -> GBR for AV_PIX_FMT_GBRP format
|
|
766
|
+
// TODO-VideoEncoder: Determine if FFmpeg supports planar RGB input format
|
|
767
|
+
inputFrame->data[0] = tensorData + channelSize;
|
|
768
|
+
inputFrame->data[1] = tensorData + (2 * channelSize);
|
|
769
|
+
inputFrame->data[2] = tensorData;
|
|
770
|
+
|
|
771
|
+
inputFrame->linesize[0] = inWidth_;
|
|
772
|
+
inputFrame->linesize[1] = inWidth_;
|
|
773
|
+
inputFrame->linesize[2] = inWidth_;
|
|
774
|
+
|
|
775
|
+
status = sws_scale(
|
|
776
|
+
swsContext_.get(),
|
|
777
|
+
inputFrame->data,
|
|
778
|
+
inputFrame->linesize,
|
|
779
|
+
0,
|
|
780
|
+
inputFrame->height,
|
|
781
|
+
avFrame->data,
|
|
782
|
+
avFrame->linesize);
|
|
783
|
+
TORCH_CHECK(status == outHeight_, "sws_scale failed");
|
|
784
|
+
return avFrame;
|
|
785
|
+
}
|
|
786
|
+
|
|
787
|
+
torch::Tensor VideoEncoder::encodeToTensor() {
|
|
788
|
+
TORCH_CHECK(
|
|
789
|
+
avioContextHolder_ != nullptr,
|
|
790
|
+
"Cannot encode to tensor, avio tensor context doesn't exist.");
|
|
791
|
+
encode();
|
|
792
|
+
auto avioToTensorContext =
|
|
793
|
+
dynamic_cast<AVIOToTensorContext*>(avioContextHolder_.get());
|
|
794
|
+
TORCH_CHECK(avioToTensorContext != nullptr, "Invalid AVIO context holder.");
|
|
795
|
+
return avioToTensorContext->getOutputTensor();
|
|
796
|
+
}
|
|
797
|
+
|
|
798
|
+
void VideoEncoder::encodeFrame(
|
|
799
|
+
AutoAVPacket& autoAVPacket,
|
|
800
|
+
const UniqueAVFrame& avFrame) {
|
|
801
|
+
auto status = avcodec_send_frame(avCodecContext_.get(), avFrame.get());
|
|
802
|
+
TORCH_CHECK(
|
|
803
|
+
status == AVSUCCESS,
|
|
804
|
+
"Error while sending frame: ",
|
|
805
|
+
getFFMPEGErrorStringFromErrorCode(status));
|
|
806
|
+
|
|
807
|
+
while (status >= 0) {
|
|
808
|
+
ReferenceAVPacket packet(autoAVPacket);
|
|
809
|
+
status = avcodec_receive_packet(avCodecContext_.get(), packet.get());
|
|
810
|
+
if (status == AVERROR(EAGAIN) || status == AVERROR_EOF) {
|
|
811
|
+
if (status == AVERROR_EOF) {
|
|
812
|
+
// Flush remaining buffered packets
|
|
813
|
+
status = av_interleaved_write_frame(avFormatContext_.get(), nullptr);
|
|
814
|
+
TORCH_CHECK(
|
|
815
|
+
status == AVSUCCESS,
|
|
816
|
+
"Failed to flush packet: ",
|
|
817
|
+
getFFMPEGErrorStringFromErrorCode(status));
|
|
818
|
+
}
|
|
819
|
+
return;
|
|
820
|
+
}
|
|
821
|
+
TORCH_CHECK(
|
|
822
|
+
status >= 0,
|
|
823
|
+
"Error receiving packet: ",
|
|
824
|
+
getFFMPEGErrorStringFromErrorCode(status));
|
|
825
|
+
|
|
826
|
+
// The code below is borrowed from torchaudio:
|
|
827
|
+
// https://github.com/pytorch/audio/blob/b6a3368a45aaafe05f1a6a9f10c68adc5e944d9e/src/libtorio/ffmpeg/stream_writer/encoder.cpp#L46
|
|
828
|
+
// Setting packet->duration to 1 allows the last frame to be properly
|
|
829
|
+
// encoded, and needs to be set before calling av_packet_rescale_ts.
|
|
830
|
+
if (packet->duration == 0) {
|
|
831
|
+
packet->duration = 1;
|
|
832
|
+
}
|
|
833
|
+
av_packet_rescale_ts(
|
|
834
|
+
packet.get(), avCodecContext_->time_base, avStream_->time_base);
|
|
835
|
+
packet->stream_index = avStream_->index;
|
|
836
|
+
|
|
837
|
+
status = av_interleaved_write_frame(avFormatContext_.get(), packet.get());
|
|
838
|
+
TORCH_CHECK(
|
|
839
|
+
status == AVSUCCESS,
|
|
840
|
+
"Error in av_interleaved_write_frame: ",
|
|
841
|
+
getFFMPEGErrorStringFromErrorCode(status));
|
|
842
|
+
}
|
|
843
|
+
}
|
|
844
|
+
|
|
845
|
+
void VideoEncoder::flushBuffers() {
|
|
846
|
+
AutoAVPacket autoAVPacket;
|
|
847
|
+
// Send null frame to signal end of input
|
|
848
|
+
encodeFrame(autoAVPacket, UniqueAVFrame(nullptr));
|
|
849
|
+
}
|
|
850
|
+
|
|
514
851
|
} // namespace facebook::torchcodec
|
torchcodec/_core/Encoder.h
CHANGED
|
@@ -57,7 +57,6 @@ class AudioEncoder {
|
|
|
57
57
|
bool encodeWasCalled_ = false;
|
|
58
58
|
int64_t lastEncodedAVFramePts_ = 0;
|
|
59
59
|
};
|
|
60
|
-
} // namespace facebook::torchcodec
|
|
61
60
|
|
|
62
61
|
/* clang-format off */
|
|
63
62
|
//
|
|
@@ -121,3 +120,65 @@ class AudioEncoder {
|
|
|
121
120
|
//
|
|
122
121
|
//
|
|
123
122
|
/* clang-format on */
|
|
123
|
+
|
|
124
|
+
class VideoEncoder {
|
|
125
|
+
public:
|
|
126
|
+
~VideoEncoder();
|
|
127
|
+
|
|
128
|
+
// Rule of Five requires that we define copy and move
|
|
129
|
+
// constructors and assignment operators.
|
|
130
|
+
// Both are deleted because we have unique_ptr members
|
|
131
|
+
VideoEncoder(const VideoEncoder&) = delete;
|
|
132
|
+
VideoEncoder& operator=(const VideoEncoder&) = delete;
|
|
133
|
+
|
|
134
|
+
// Move assignment operator deleted since we have a const member
|
|
135
|
+
VideoEncoder(VideoEncoder&&) = default;
|
|
136
|
+
VideoEncoder& operator=(VideoEncoder&&) = delete;
|
|
137
|
+
|
|
138
|
+
VideoEncoder(
|
|
139
|
+
const torch::Tensor& frames,
|
|
140
|
+
int frameRate,
|
|
141
|
+
std::string_view fileName,
|
|
142
|
+
const VideoStreamOptions& videoStreamOptions);
|
|
143
|
+
|
|
144
|
+
VideoEncoder(
|
|
145
|
+
const torch::Tensor& frames,
|
|
146
|
+
int frameRate,
|
|
147
|
+
std::string_view formatName,
|
|
148
|
+
std::unique_ptr<AVIOContextHolder> avioContextHolder,
|
|
149
|
+
const VideoStreamOptions& videoStreamOptions);
|
|
150
|
+
|
|
151
|
+
void encode();
|
|
152
|
+
|
|
153
|
+
torch::Tensor encodeToTensor();
|
|
154
|
+
|
|
155
|
+
private:
|
|
156
|
+
void initializeEncoder(const VideoStreamOptions& videoStreamOptions);
|
|
157
|
+
UniqueAVFrame convertTensorToAVFrame(
|
|
158
|
+
const torch::Tensor& frame,
|
|
159
|
+
int frameIndex);
|
|
160
|
+
void encodeFrame(AutoAVPacket& autoAVPacket, const UniqueAVFrame& avFrame);
|
|
161
|
+
void flushBuffers();
|
|
162
|
+
|
|
163
|
+
UniqueEncodingAVFormatContext avFormatContext_;
|
|
164
|
+
UniqueAVCodecContext avCodecContext_;
|
|
165
|
+
AVStream* avStream_ = nullptr;
|
|
166
|
+
UniqueSwsContext swsContext_;
|
|
167
|
+
|
|
168
|
+
const torch::Tensor frames_;
|
|
169
|
+
int inFrameRate_;
|
|
170
|
+
|
|
171
|
+
int inWidth_ = -1;
|
|
172
|
+
int inHeight_ = -1;
|
|
173
|
+
AVPixelFormat inPixelFormat_ = AV_PIX_FMT_NONE;
|
|
174
|
+
|
|
175
|
+
int outWidth_ = -1;
|
|
176
|
+
int outHeight_ = -1;
|
|
177
|
+
AVPixelFormat outPixelFormat_ = AV_PIX_FMT_NONE;
|
|
178
|
+
|
|
179
|
+
std::unique_ptr<AVIOContextHolder> avioContextHolder_;
|
|
180
|
+
|
|
181
|
+
bool encodeWasCalled_ = false;
|
|
182
|
+
};
|
|
183
|
+
|
|
184
|
+
} // namespace facebook::torchcodec
|
|
@@ -8,6 +8,11 @@
|
|
|
8
8
|
|
|
9
9
|
#include <c10/util/Exception.h>
|
|
10
10
|
|
|
11
|
+
extern "C" {
|
|
12
|
+
#include <libavfilter/avfilter.h>
|
|
13
|
+
#include <libavfilter/buffersink.h>
|
|
14
|
+
}
|
|
15
|
+
|
|
11
16
|
namespace facebook::torchcodec {
|
|
12
17
|
|
|
13
18
|
AutoAVPacket::AutoAVPacket() : avPacket_(av_packet_alloc()) {
|
|
@@ -56,6 +61,77 @@ int64_t getDuration(const UniqueAVFrame& avFrame) {
|
|
|
56
61
|
#endif
|
|
57
62
|
}
|
|
58
63
|
|
|
64
|
+
void setDuration(const UniqueAVFrame& avFrame, int64_t duration) {
|
|
65
|
+
#if LIBAVUTIL_VERSION_MAJOR < 58
|
|
66
|
+
avFrame->pkt_duration = duration;
|
|
67
|
+
#else
|
|
68
|
+
avFrame->duration = duration;
|
|
69
|
+
#endif
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
const int* getSupportedSampleRates(const AVCodec& avCodec) {
|
|
73
|
+
const int* supportedSampleRates = nullptr;
|
|
74
|
+
#if LIBAVCODEC_VERSION_INT >= AV_VERSION_INT(61, 13, 100) // FFmpeg >= 7.1
|
|
75
|
+
int numSampleRates = 0;
|
|
76
|
+
int ret = avcodec_get_supported_config(
|
|
77
|
+
nullptr,
|
|
78
|
+
&avCodec,
|
|
79
|
+
AV_CODEC_CONFIG_SAMPLE_RATE,
|
|
80
|
+
0,
|
|
81
|
+
reinterpret_cast<const void**>(&supportedSampleRates),
|
|
82
|
+
&numSampleRates);
|
|
83
|
+
if (ret < 0 || supportedSampleRates == nullptr) {
|
|
84
|
+
// Return nullptr to skip validation in validateSampleRate.
|
|
85
|
+
return nullptr;
|
|
86
|
+
}
|
|
87
|
+
#else
|
|
88
|
+
supportedSampleRates = avCodec.supported_samplerates;
|
|
89
|
+
#endif
|
|
90
|
+
return supportedSampleRates;
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
const AVPixelFormat* getSupportedPixelFormats(const AVCodec& avCodec) {
|
|
94
|
+
const AVPixelFormat* supportedPixelFormats = nullptr;
|
|
95
|
+
#if LIBAVCODEC_VERSION_INT >= AV_VERSION_INT(61, 13, 100) // FFmpeg >= 7.1
|
|
96
|
+
int numPixelFormats = 0;
|
|
97
|
+
int ret = avcodec_get_supported_config(
|
|
98
|
+
nullptr,
|
|
99
|
+
&avCodec,
|
|
100
|
+
AV_CODEC_CONFIG_PIX_FORMAT,
|
|
101
|
+
0,
|
|
102
|
+
reinterpret_cast<const void**>(&supportedPixelFormats),
|
|
103
|
+
&numPixelFormats);
|
|
104
|
+
if (ret < 0 || supportedPixelFormats == nullptr) {
|
|
105
|
+
TORCH_CHECK(false, "Couldn't get supported pixel formats from encoder.");
|
|
106
|
+
}
|
|
107
|
+
#else
|
|
108
|
+
supportedPixelFormats = avCodec.pix_fmts;
|
|
109
|
+
#endif
|
|
110
|
+
return supportedPixelFormats;
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
const AVSampleFormat* getSupportedOutputSampleFormats(const AVCodec& avCodec) {
|
|
114
|
+
const AVSampleFormat* supportedSampleFormats = nullptr;
|
|
115
|
+
#if LIBAVCODEC_VERSION_INT >= AV_VERSION_INT(61, 13, 100) // FFmpeg >= 7.1
|
|
116
|
+
int numSampleFormats = 0;
|
|
117
|
+
int ret = avcodec_get_supported_config(
|
|
118
|
+
nullptr,
|
|
119
|
+
&avCodec,
|
|
120
|
+
AV_CODEC_CONFIG_SAMPLE_FORMAT,
|
|
121
|
+
0,
|
|
122
|
+
reinterpret_cast<const void**>(&supportedSampleFormats),
|
|
123
|
+
&numSampleFormats);
|
|
124
|
+
if (ret < 0 || supportedSampleFormats == nullptr) {
|
|
125
|
+
// Return nullptr to use default output format in
|
|
126
|
+
// findBestOutputSampleFormat.
|
|
127
|
+
return nullptr;
|
|
128
|
+
}
|
|
129
|
+
#else
|
|
130
|
+
supportedSampleFormats = avCodec.sample_fmts;
|
|
131
|
+
#endif
|
|
132
|
+
return supportedSampleFormats;
|
|
133
|
+
}
|
|
134
|
+
|
|
59
135
|
int getNumChannels(const UniqueAVFrame& avFrame) {
|
|
60
136
|
#if LIBAVFILTER_VERSION_MAJOR > 8 || \
|
|
61
137
|
(LIBAVFILTER_VERSION_MAJOR == 8 && LIBAVFILTER_VERSION_MINOR >= 44)
|
|
@@ -73,7 +149,7 @@ int getNumChannels(const UniqueAVFrame& avFrame) {
|
|
|
73
149
|
#endif
|
|
74
150
|
}
|
|
75
151
|
|
|
76
|
-
int getNumChannels(const
|
|
152
|
+
int getNumChannels(const SharedAVCodecContext& avCodecContext) {
|
|
77
153
|
#if LIBAVFILTER_VERSION_MAJOR > 8 || \
|
|
78
154
|
(LIBAVFILTER_VERSION_MAJOR == 8 && LIBAVFILTER_VERSION_MINOR >= 44)
|
|
79
155
|
return avCodecContext->ch_layout.nb_channels;
|
|
@@ -109,7 +185,32 @@ void setDefaultChannelLayout(UniqueAVFrame& avFrame, int numChannels) {
|
|
|
109
185
|
}
|
|
110
186
|
|
|
111
187
|
void validateNumChannels(const AVCodec& avCodec, int numChannels) {
|
|
112
|
-
#if
|
|
188
|
+
#if LIBAVCODEC_VERSION_INT >= AV_VERSION_INT(61, 13, 100) // FFmpeg >= 7.1
|
|
189
|
+
std::stringstream supportedNumChannels;
|
|
190
|
+
const AVChannelLayout* supportedLayouts = nullptr;
|
|
191
|
+
int numLayouts = 0;
|
|
192
|
+
int ret = avcodec_get_supported_config(
|
|
193
|
+
nullptr,
|
|
194
|
+
&avCodec,
|
|
195
|
+
AV_CODEC_CONFIG_CHANNEL_LAYOUT,
|
|
196
|
+
0,
|
|
197
|
+
reinterpret_cast<const void**>(&supportedLayouts),
|
|
198
|
+
&numLayouts);
|
|
199
|
+
if (ret < 0 || supportedLayouts == nullptr) {
|
|
200
|
+
// If we can't validate, we must assume it'll be fine. If not, FFmpeg will
|
|
201
|
+
// eventually raise.
|
|
202
|
+
return;
|
|
203
|
+
}
|
|
204
|
+
for (int i = 0; i < numLayouts; ++i) {
|
|
205
|
+
if (i > 0) {
|
|
206
|
+
supportedNumChannels << ", ";
|
|
207
|
+
}
|
|
208
|
+
supportedNumChannels << supportedLayouts[i].nb_channels;
|
|
209
|
+
if (numChannels == supportedLayouts[i].nb_channels) {
|
|
210
|
+
return;
|
|
211
|
+
}
|
|
212
|
+
}
|
|
213
|
+
#elif LIBAVFILTER_VERSION_MAJOR > 7 // FFmpeg > 4
|
|
113
214
|
if (avCodec.ch_layouts == nullptr) {
|
|
114
215
|
// If we can't validate, we must assume it'll be fine. If not, FFmpeg will
|
|
115
216
|
// eventually raise.
|
|
@@ -131,7 +232,7 @@ void validateNumChannels(const AVCodec& avCodec, int numChannels) {
|
|
|
131
232
|
}
|
|
132
233
|
supportedNumChannels << avCodec.ch_layouts[i].nb_channels;
|
|
133
234
|
}
|
|
134
|
-
#else
|
|
235
|
+
#else // FFmpeg <= 4
|
|
135
236
|
if (avCodec.channel_layouts == nullptr) {
|
|
136
237
|
// can't validate, same as above.
|
|
137
238
|
return;
|
|
@@ -298,6 +399,70 @@ SwrContext* createSwrContext(
|
|
|
298
399
|
return swrContext;
|
|
299
400
|
}
|
|
300
401
|
|
|
402
|
+
AVFilterContext* createBuffersinkFilter(
|
|
403
|
+
AVFilterGraph* filterGraph,
|
|
404
|
+
enum AVPixelFormat outputFormat) {
|
|
405
|
+
const AVFilter* buffersink = avfilter_get_by_name("buffersink");
|
|
406
|
+
TORCH_CHECK(buffersink != nullptr, "Failed to get buffersink filter.");
|
|
407
|
+
|
|
408
|
+
AVFilterContext* sinkContext = nullptr;
|
|
409
|
+
int status;
|
|
410
|
+
const char* filterName = "out";
|
|
411
|
+
|
|
412
|
+
enum AVPixelFormat pix_fmts[] = {outputFormat, AV_PIX_FMT_NONE};
|
|
413
|
+
|
|
414
|
+
// av_opt_set_int_list was replaced by av_opt_set_array() in FFmpeg 8.
|
|
415
|
+
#if LIBAVUTIL_VERSION_MAJOR >= 60 // FFmpeg >= 8
|
|
416
|
+
// Output options like pixel_formats must be set before filter init
|
|
417
|
+
sinkContext =
|
|
418
|
+
avfilter_graph_alloc_filter(filterGraph, buffersink, filterName);
|
|
419
|
+
TORCH_CHECK(
|
|
420
|
+
sinkContext != nullptr, "Failed to allocate buffersink filter context.");
|
|
421
|
+
|
|
422
|
+
// When setting pix_fmts, only the first element is used, so nb_elems = 1
|
|
423
|
+
// AV_PIX_FMT_NONE acts as a terminator for the array in av_opt_set_int_list
|
|
424
|
+
status = av_opt_set_array(
|
|
425
|
+
sinkContext,
|
|
426
|
+
"pixel_formats",
|
|
427
|
+
AV_OPT_SEARCH_CHILDREN,
|
|
428
|
+
0, // start_elem
|
|
429
|
+
1, // nb_elems
|
|
430
|
+
AV_OPT_TYPE_PIXEL_FMT,
|
|
431
|
+
pix_fmts);
|
|
432
|
+
TORCH_CHECK(
|
|
433
|
+
status >= 0,
|
|
434
|
+
"Failed to set pixel format for buffersink filter: ",
|
|
435
|
+
getFFMPEGErrorStringFromErrorCode(status));
|
|
436
|
+
|
|
437
|
+
status = avfilter_init_str(sinkContext, nullptr);
|
|
438
|
+
TORCH_CHECK(
|
|
439
|
+
status >= 0,
|
|
440
|
+
"Failed to initialize buffersink filter: ",
|
|
441
|
+
getFFMPEGErrorStringFromErrorCode(status));
|
|
442
|
+
#else // FFmpeg <= 7
|
|
443
|
+
// For older FFmpeg versions, create filter and then set options
|
|
444
|
+
status = avfilter_graph_create_filter(
|
|
445
|
+
&sinkContext, buffersink, filterName, nullptr, nullptr, filterGraph);
|
|
446
|
+
TORCH_CHECK(
|
|
447
|
+
status >= 0,
|
|
448
|
+
"Failed to create buffersink filter: ",
|
|
449
|
+
getFFMPEGErrorStringFromErrorCode(status));
|
|
450
|
+
|
|
451
|
+
status = av_opt_set_int_list(
|
|
452
|
+
sinkContext,
|
|
453
|
+
"pix_fmts",
|
|
454
|
+
pix_fmts,
|
|
455
|
+
AV_PIX_FMT_NONE,
|
|
456
|
+
AV_OPT_SEARCH_CHILDREN);
|
|
457
|
+
TORCH_CHECK(
|
|
458
|
+
status >= 0,
|
|
459
|
+
"Failed to set pixel formats for buffersink filter: ",
|
|
460
|
+
getFFMPEGErrorStringFromErrorCode(status));
|
|
461
|
+
#endif
|
|
462
|
+
|
|
463
|
+
return sinkContext;
|
|
464
|
+
}
|
|
465
|
+
|
|
301
466
|
UniqueAVFrame convertAudioAVFrameSamples(
|
|
302
467
|
const UniqueSwrContext& swrContext,
|
|
303
468
|
const UniqueAVFrame& srcAVFrame,
|
|
@@ -418,4 +583,26 @@ AVIOContext* avioAllocContext(
|
|
|
418
583
|
seek);
|
|
419
584
|
}
|
|
420
585
|
|
|
586
|
+
double ptsToSeconds(int64_t pts, const AVRational& timeBase) {
|
|
587
|
+
// To perform the multiplication before the division, av_q2d is not used
|
|
588
|
+
return static_cast<double>(pts) * timeBase.num / timeBase.den;
|
|
589
|
+
}
|
|
590
|
+
|
|
591
|
+
int64_t secondsToClosestPts(double seconds, const AVRational& timeBase) {
|
|
592
|
+
return static_cast<int64_t>(
|
|
593
|
+
std::round(seconds * timeBase.den / timeBase.num));
|
|
594
|
+
}
|
|
595
|
+
|
|
596
|
+
int64_t computeSafeDuration(
|
|
597
|
+
const AVRational& frameRate,
|
|
598
|
+
const AVRational& timeBase) {
|
|
599
|
+
if (frameRate.num <= 0 || frameRate.den <= 0 || timeBase.num <= 0 ||
|
|
600
|
+
timeBase.den <= 0) {
|
|
601
|
+
return 0;
|
|
602
|
+
} else {
|
|
603
|
+
return (static_cast<int64_t>(frameRate.den) * timeBase.den) /
|
|
604
|
+
(static_cast<int64_t>(timeBase.num) * frameRate.num);
|
|
605
|
+
}
|
|
606
|
+
}
|
|
607
|
+
|
|
421
608
|
} // namespace facebook::torchcodec
|