torchcodec 0.7.0__cp310-cp310-win_amd64.whl → 0.8.0__cp310-cp310-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of torchcodec might be problematic. Click here for more details.
- torchcodec/_core/BetaCudaDeviceInterface.cpp +636 -0
- torchcodec/_core/BetaCudaDeviceInterface.h +191 -0
- torchcodec/_core/CMakeLists.txt +36 -3
- torchcodec/_core/CUDACommon.cpp +315 -0
- torchcodec/_core/CUDACommon.h +46 -0
- torchcodec/_core/CpuDeviceInterface.cpp +189 -108
- torchcodec/_core/CpuDeviceInterface.h +81 -19
- torchcodec/_core/CudaDeviceInterface.cpp +211 -368
- torchcodec/_core/CudaDeviceInterface.h +33 -6
- torchcodec/_core/DeviceInterface.cpp +57 -19
- torchcodec/_core/DeviceInterface.h +97 -16
- torchcodec/_core/Encoder.cpp +302 -9
- torchcodec/_core/Encoder.h +51 -1
- torchcodec/_core/FFMPEGCommon.cpp +189 -2
- torchcodec/_core/FFMPEGCommon.h +18 -0
- torchcodec/_core/FilterGraph.cpp +28 -21
- torchcodec/_core/FilterGraph.h +15 -1
- torchcodec/_core/Frame.cpp +17 -7
- torchcodec/_core/Frame.h +15 -61
- torchcodec/_core/Metadata.h +2 -2
- torchcodec/_core/NVDECCache.cpp +70 -0
- torchcodec/_core/NVDECCache.h +104 -0
- torchcodec/_core/SingleStreamDecoder.cpp +202 -198
- torchcodec/_core/SingleStreamDecoder.h +39 -14
- torchcodec/_core/StreamOptions.h +16 -6
- torchcodec/_core/Transform.cpp +60 -0
- torchcodec/_core/Transform.h +59 -0
- torchcodec/_core/__init__.py +1 -0
- torchcodec/_core/custom_ops.cpp +180 -32
- torchcodec/_core/fetch_and_expose_non_gpl_ffmpeg_libs.cmake +61 -1
- torchcodec/_core/nvcuvid_include/cuviddec.h +1374 -0
- torchcodec/_core/nvcuvid_include/nvcuvid.h +610 -0
- torchcodec/_core/ops.py +86 -43
- torchcodec/_core/pybind_ops.cpp +22 -59
- torchcodec/_samplers/video_clip_sampler.py +7 -19
- torchcodec/decoders/__init__.py +1 -0
- torchcodec/decoders/_decoder_utils.py +61 -1
- torchcodec/decoders/_video_decoder.py +56 -20
- torchcodec/libtorchcodec_core4.dll +0 -0
- torchcodec/libtorchcodec_core5.dll +0 -0
- torchcodec/libtorchcodec_core6.dll +0 -0
- torchcodec/libtorchcodec_core7.dll +0 -0
- torchcodec/libtorchcodec_core8.dll +0 -0
- torchcodec/libtorchcodec_custom_ops4.dll +0 -0
- torchcodec/libtorchcodec_custom_ops5.dll +0 -0
- torchcodec/libtorchcodec_custom_ops6.dll +0 -0
- torchcodec/libtorchcodec_custom_ops7.dll +0 -0
- torchcodec/libtorchcodec_custom_ops8.dll +0 -0
- torchcodec/libtorchcodec_pybind_ops4.pyd +0 -0
- torchcodec/libtorchcodec_pybind_ops5.pyd +0 -0
- torchcodec/libtorchcodec_pybind_ops6.pyd +0 -0
- torchcodec/libtorchcodec_pybind_ops7.pyd +0 -0
- torchcodec/libtorchcodec_pybind_ops8.pyd +0 -0
- torchcodec/samplers/_time_based.py +8 -0
- torchcodec/version.py +1 -1
- {torchcodec-0.7.0.dist-info → torchcodec-0.8.0.dist-info}/METADATA +24 -13
- torchcodec-0.8.0.dist-info/RECORD +80 -0
- {torchcodec-0.7.0.dist-info → torchcodec-0.8.0.dist-info}/WHEEL +1 -1
- torchcodec-0.7.0.dist-info/RECORD +0 -67
- {torchcodec-0.7.0.dist-info → torchcodec-0.8.0.dist-info}/licenses/LICENSE +0 -0
- {torchcodec-0.7.0.dist-info → torchcodec-0.8.0.dist-info}/top_level.txt +0 -0
torchcodec/_core/Encoder.h
CHANGED
|
@@ -57,7 +57,6 @@ class AudioEncoder {
|
|
|
57
57
|
bool encodeWasCalled_ = false;
|
|
58
58
|
int64_t lastEncodedAVFramePts_ = 0;
|
|
59
59
|
};
|
|
60
|
-
} // namespace facebook::torchcodec
|
|
61
60
|
|
|
62
61
|
/* clang-format off */
|
|
63
62
|
//
|
|
@@ -121,3 +120,54 @@ class AudioEncoder {
|
|
|
121
120
|
//
|
|
122
121
|
//
|
|
123
122
|
/* clang-format on */
|
|
123
|
+
|
|
124
|
+
class VideoEncoder {
|
|
125
|
+
public:
|
|
126
|
+
~VideoEncoder();
|
|
127
|
+
|
|
128
|
+
// Rule of Five requires that we define copy and move
|
|
129
|
+
// constructors and assignment operators.
|
|
130
|
+
// Both are deleted because we have unique_ptr members
|
|
131
|
+
VideoEncoder(const VideoEncoder&) = delete;
|
|
132
|
+
VideoEncoder& operator=(const VideoEncoder&) = delete;
|
|
133
|
+
|
|
134
|
+
// Move assignment operator deleted since we have a const member
|
|
135
|
+
VideoEncoder(VideoEncoder&&) = default;
|
|
136
|
+
VideoEncoder& operator=(VideoEncoder&&) = delete;
|
|
137
|
+
|
|
138
|
+
VideoEncoder(
|
|
139
|
+
const torch::Tensor& frames,
|
|
140
|
+
int frameRate,
|
|
141
|
+
std::string_view fileName,
|
|
142
|
+
const VideoStreamOptions& videoStreamOptions);
|
|
143
|
+
|
|
144
|
+
void encode();
|
|
145
|
+
|
|
146
|
+
private:
|
|
147
|
+
void initializeEncoder(const VideoStreamOptions& videoStreamOptions);
|
|
148
|
+
UniqueAVFrame convertTensorToAVFrame(
|
|
149
|
+
const torch::Tensor& frame,
|
|
150
|
+
int frameIndex);
|
|
151
|
+
void encodeFrame(AutoAVPacket& autoAVPacket, const UniqueAVFrame& avFrame);
|
|
152
|
+
void flushBuffers();
|
|
153
|
+
|
|
154
|
+
UniqueEncodingAVFormatContext avFormatContext_;
|
|
155
|
+
UniqueAVCodecContext avCodecContext_;
|
|
156
|
+
AVStream* avStream_;
|
|
157
|
+
UniqueSwsContext swsContext_;
|
|
158
|
+
|
|
159
|
+
const torch::Tensor frames_;
|
|
160
|
+
int inFrameRate_;
|
|
161
|
+
|
|
162
|
+
int inWidth_ = -1;
|
|
163
|
+
int inHeight_ = -1;
|
|
164
|
+
AVPixelFormat inPixelFormat_ = AV_PIX_FMT_NONE;
|
|
165
|
+
|
|
166
|
+
int outWidth_ = -1;
|
|
167
|
+
int outHeight_ = -1;
|
|
168
|
+
AVPixelFormat outPixelFormat_ = AV_PIX_FMT_NONE;
|
|
169
|
+
|
|
170
|
+
bool encodeWasCalled_ = false;
|
|
171
|
+
};
|
|
172
|
+
|
|
173
|
+
} // namespace facebook::torchcodec
|
|
@@ -8,6 +8,11 @@
|
|
|
8
8
|
|
|
9
9
|
#include <c10/util/Exception.h>
|
|
10
10
|
|
|
11
|
+
extern "C" {
|
|
12
|
+
#include <libavfilter/avfilter.h>
|
|
13
|
+
#include <libavfilter/buffersink.h>
|
|
14
|
+
}
|
|
15
|
+
|
|
11
16
|
namespace facebook::torchcodec {
|
|
12
17
|
|
|
13
18
|
AutoAVPacket::AutoAVPacket() : avPacket_(av_packet_alloc()) {
|
|
@@ -56,6 +61,77 @@ int64_t getDuration(const UniqueAVFrame& avFrame) {
|
|
|
56
61
|
#endif
|
|
57
62
|
}
|
|
58
63
|
|
|
64
|
+
// Stores `duration` in the frame's duration field. The struct member that
// holds it is named differently depending on the libavutil major version this
// file is compiled against.
void setDuration(const UniqueAVFrame& avFrame, int64_t duration) {
#if LIBAVUTIL_VERSION_MAJOR >= 58
  avFrame->duration = duration;
#else
  avFrame->pkt_duration = duration;
#endif
}
|
|
71
|
+
|
|
72
|
+
// Returns the codec's list of supported sample rates, or nullptr when the list
// is unavailable. A nullptr result makes validateSampleRate skip validation.
const int* getSupportedSampleRates(const AVCodec& avCodec) {
#if LIBAVCODEC_VERSION_INT >= AV_VERSION_INT(61, 13, 100) // FFmpeg >= 7.1
  const int* sampleRates = nullptr;
  int sampleRateCount = 0;
  const int status = avcodec_get_supported_config(
      nullptr,
      &avCodec,
      AV_CODEC_CONFIG_SAMPLE_RATE,
      0,
      reinterpret_cast<const void**>(&sampleRates),
      &sampleRateCount);
  // On failure report "unknown" (nullptr); otherwise hand back whatever FFmpeg
  // gave us, which may itself be nullptr.
  return status < 0 ? nullptr : sampleRates;
#else
  return avCodec.supported_samplerates;
#endif
}
|
|
92
|
+
|
|
93
|
+
// Returns the codec's list of supported pixel formats. With FFmpeg >= 7.1 the
// list is queried through avcodec_get_supported_config and a failed or empty
// query raises via TORCH_CHECK; with older versions the codec's pix_fmts field
// is returned as-is.
const AVPixelFormat* getSupportedPixelFormats(const AVCodec& avCodec) {
#if LIBAVCODEC_VERSION_INT >= AV_VERSION_INT(61, 13, 100) // FFmpeg >= 7.1
  const AVPixelFormat* pixelFormats = nullptr;
  int pixelFormatCount = 0;
  const int status = avcodec_get_supported_config(
      nullptr,
      &avCodec,
      AV_CODEC_CONFIG_PIX_FORMAT,
      0,
      reinterpret_cast<const void**>(&pixelFormats),
      &pixelFormatCount);
  TORCH_CHECK(
      status >= 0 && pixelFormats != nullptr,
      "Couldn't get supported pixel formats from encoder.");
  return pixelFormats;
#else
  return avCodec.pix_fmts;
#endif
}
|
|
112
|
+
|
|
113
|
+
// Returns the codec's list of supported output sample formats, or nullptr when
// the list is unavailable. A nullptr result makes findBestOutputSampleFormat
// fall back to its default output format.
const AVSampleFormat* getSupportedOutputSampleFormats(const AVCodec& avCodec) {
#if LIBAVCODEC_VERSION_INT >= AV_VERSION_INT(61, 13, 100) // FFmpeg >= 7.1
  const AVSampleFormat* sampleFormats = nullptr;
  int sampleFormatCount = 0;
  const int status = avcodec_get_supported_config(
      nullptr,
      &avCodec,
      AV_CODEC_CONFIG_SAMPLE_FORMAT,
      0,
      reinterpret_cast<const void**>(&sampleFormats),
      &sampleFormatCount);
  // On failure report "unknown" (nullptr); otherwise hand back whatever FFmpeg
  // gave us, which may itself be nullptr.
  return status < 0 ? nullptr : sampleFormats;
#else
  return avCodec.sample_fmts;
#endif
}
|
|
134
|
+
|
|
59
135
|
int getNumChannels(const UniqueAVFrame& avFrame) {
|
|
60
136
|
#if LIBAVFILTER_VERSION_MAJOR > 8 || \
|
|
61
137
|
(LIBAVFILTER_VERSION_MAJOR == 8 && LIBAVFILTER_VERSION_MINOR >= 44)
|
|
@@ -109,7 +185,32 @@ void setDefaultChannelLayout(UniqueAVFrame& avFrame, int numChannels) {
|
|
|
109
185
|
}
|
|
110
186
|
|
|
111
187
|
void validateNumChannels(const AVCodec& avCodec, int numChannels) {
|
|
112
|
-
#if
|
|
188
|
+
#if LIBAVCODEC_VERSION_INT >= AV_VERSION_INT(61, 13, 100) // FFmpeg >= 7.1
|
|
189
|
+
std::stringstream supportedNumChannels;
|
|
190
|
+
const AVChannelLayout* supportedLayouts = nullptr;
|
|
191
|
+
int numLayouts = 0;
|
|
192
|
+
int ret = avcodec_get_supported_config(
|
|
193
|
+
nullptr,
|
|
194
|
+
&avCodec,
|
|
195
|
+
AV_CODEC_CONFIG_CHANNEL_LAYOUT,
|
|
196
|
+
0,
|
|
197
|
+
reinterpret_cast<const void**>(&supportedLayouts),
|
|
198
|
+
&numLayouts);
|
|
199
|
+
if (ret < 0 || supportedLayouts == nullptr) {
|
|
200
|
+
// If we can't validate, we must assume it'll be fine. If not, FFmpeg will
|
|
201
|
+
// eventually raise.
|
|
202
|
+
return;
|
|
203
|
+
}
|
|
204
|
+
for (int i = 0; i < numLayouts; ++i) {
|
|
205
|
+
if (i > 0) {
|
|
206
|
+
supportedNumChannels << ", ";
|
|
207
|
+
}
|
|
208
|
+
supportedNumChannels << supportedLayouts[i].nb_channels;
|
|
209
|
+
if (numChannels == supportedLayouts[i].nb_channels) {
|
|
210
|
+
return;
|
|
211
|
+
}
|
|
212
|
+
}
|
|
213
|
+
#elif LIBAVFILTER_VERSION_MAJOR > 7 // FFmpeg > 4
|
|
113
214
|
if (avCodec.ch_layouts == nullptr) {
|
|
114
215
|
// If we can't validate, we must assume it'll be fine. If not, FFmpeg will
|
|
115
216
|
// eventually raise.
|
|
@@ -131,7 +232,7 @@ void validateNumChannels(const AVCodec& avCodec, int numChannels) {
|
|
|
131
232
|
}
|
|
132
233
|
supportedNumChannels << avCodec.ch_layouts[i].nb_channels;
|
|
133
234
|
}
|
|
134
|
-
#else
|
|
235
|
+
#else // FFmpeg <= 4
|
|
135
236
|
if (avCodec.channel_layouts == nullptr) {
|
|
136
237
|
// can't validate, same as above.
|
|
137
238
|
return;
|
|
@@ -298,6 +399,70 @@ SwrContext* createSwrContext(
|
|
|
298
399
|
return swrContext;
|
|
299
400
|
}
|
|
300
401
|
|
|
402
|
+
// Creates a buffersink filter named "out" inside `filterGraph`, restricted to
// the single pixel format `outputFormat`, and returns its context. Every FFmpeg
// failure along the way is reported through TORCH_CHECK.
AVFilterContext* createBuffersinkFilter(
    AVFilterGraph* filterGraph,
    enum AVPixelFormat outputFormat) {
  const AVFilter* sinkFilter = avfilter_get_by_name("buffersink");
  TORCH_CHECK(sinkFilter != nullptr, "Failed to get buffersink filter.");

  const char* instanceName = "out";
  AVFilterContext* sinkContext = nullptr;

  // The trailing AV_PIX_FMT_NONE is the terminator that av_opt_set_int_list
  // looks for; the av_opt_set_array path only passes the first element.
  enum AVPixelFormat pixelFormats[] = {outputFormat, AV_PIX_FMT_NONE};

  // av_opt_set_int_list was replaced by av_opt_set_array() in FFmpeg 8.
#if LIBAVUTIL_VERSION_MAJOR >= 60 // FFmpeg >= 8
  // Output options like pixel_formats must be set before filter init, so the
  // filter is allocated, configured, and only then initialized.
  sinkContext =
      avfilter_graph_alloc_filter(filterGraph, sinkFilter, instanceName);
  TORCH_CHECK(
      sinkContext != nullptr, "Failed to allocate buffersink filter context.");

  const int setStatus = av_opt_set_array(
      sinkContext,
      "pixel_formats",
      AV_OPT_SEARCH_CHILDREN,
      0, // start_elem
      1, // nb_elems
      AV_OPT_TYPE_PIXEL_FMT,
      pixelFormats);
  TORCH_CHECK(
      setStatus >= 0,
      "Failed to set pixel format for buffersink filter: ",
      getFFMPEGErrorStringFromErrorCode(setStatus));

  const int initStatus = avfilter_init_str(sinkContext, nullptr);
  TORCH_CHECK(
      initStatus >= 0,
      "Failed to initialize buffersink filter: ",
      getFFMPEGErrorStringFromErrorCode(initStatus));
#else // FFmpeg <= 7
  // For older FFmpeg versions, create the filter first and set options after.
  const int createStatus = avfilter_graph_create_filter(
      &sinkContext, sinkFilter, instanceName, nullptr, nullptr, filterGraph);
  TORCH_CHECK(
      createStatus >= 0,
      "Failed to create buffersink filter: ",
      getFFMPEGErrorStringFromErrorCode(createStatus));

  const int setStatus = av_opt_set_int_list(
      sinkContext,
      "pix_fmts",
      pixelFormats,
      AV_PIX_FMT_NONE,
      AV_OPT_SEARCH_CHILDREN);
  TORCH_CHECK(
      setStatus >= 0,
      "Failed to set pixel formats for buffersink filter: ",
      getFFMPEGErrorStringFromErrorCode(setStatus));
#endif

  return sinkContext;
}
|
|
465
|
+
|
|
301
466
|
UniqueAVFrame convertAudioAVFrameSamples(
|
|
302
467
|
const UniqueSwrContext& swrContext,
|
|
303
468
|
const UniqueAVFrame& srcAVFrame,
|
|
@@ -418,4 +583,26 @@ AVIOContext* avioAllocContext(
|
|
|
418
583
|
seek);
|
|
419
584
|
}
|
|
420
585
|
|
|
586
|
+
double ptsToSeconds(int64_t pts, const AVRational& timeBase) {
|
|
587
|
+
// To perform the multiplication before the division, av_q2d is not used
|
|
588
|
+
return static_cast<double>(pts) * timeBase.num / timeBase.den;
|
|
589
|
+
}
|
|
590
|
+
|
|
591
|
+
int64_t secondsToClosestPts(double seconds, const AVRational& timeBase) {
|
|
592
|
+
return static_cast<int64_t>(
|
|
593
|
+
std::round(seconds * timeBase.den / timeBase.num));
|
|
594
|
+
}
|
|
595
|
+
|
|
596
|
+
int64_t computeSafeDuration(
|
|
597
|
+
const AVRational& frameRate,
|
|
598
|
+
const AVRational& timeBase) {
|
|
599
|
+
if (frameRate.num <= 0 || frameRate.den <= 0 || timeBase.num <= 0 ||
|
|
600
|
+
timeBase.den <= 0) {
|
|
601
|
+
return 0;
|
|
602
|
+
} else {
|
|
603
|
+
return (static_cast<int64_t>(frameRate.den) * timeBase.den) /
|
|
604
|
+
(static_cast<int64_t>(timeBase.num) * frameRate.num);
|
|
605
|
+
}
|
|
606
|
+
}
|
|
607
|
+
|
|
421
608
|
} // namespace facebook::torchcodec
|
torchcodec/_core/FFMPEGCommon.h
CHANGED
|
@@ -12,6 +12,7 @@
|
|
|
12
12
|
|
|
13
13
|
extern "C" {
|
|
14
14
|
#include <libavcodec/avcodec.h>
|
|
15
|
+
#include <libavcodec/bsf.h>
|
|
15
16
|
#include <libavfilter/avfilter.h>
|
|
16
17
|
#include <libavfilter/buffersrc.h>
|
|
17
18
|
#include <libavformat/avformat.h>
|
|
@@ -86,6 +87,8 @@ using UniqueSwrContext =
|
|
|
86
87
|
std::unique_ptr<SwrContext, Deleterp<SwrContext, void, swr_free>>;
|
|
87
88
|
using UniqueAVAudioFifo = std::
|
|
88
89
|
unique_ptr<AVAudioFifo, Deleter<AVAudioFifo, void, av_audio_fifo_free>>;
|
|
90
|
+
using UniqueAVBSFContext =
|
|
91
|
+
std::unique_ptr<AVBSFContext, Deleterp<AVBSFContext, void, av_bsf_free>>;
|
|
89
92
|
using UniqueAVBufferRef =
|
|
90
93
|
std::unique_ptr<AVBufferRef, Deleterp<AVBufferRef, void, av_buffer_unref>>;
|
|
91
94
|
using UniqueAVBufferSrcParameters = std::unique_ptr<
|
|
@@ -161,6 +164,11 @@ std::string getFFMPEGErrorStringFromErrorCode(int errorCode);
|
|
|
161
164
|
// struct member representing duration has changed across the versions we
|
|
162
165
|
// support.
|
|
163
166
|
int64_t getDuration(const UniqueAVFrame& frame);
|
|
167
|
+
void setDuration(const UniqueAVFrame& frame, int64_t duration);
|
|
168
|
+
|
|
169
|
+
const int* getSupportedSampleRates(const AVCodec& avCodec);
|
|
170
|
+
const AVSampleFormat* getSupportedOutputSampleFormats(const AVCodec& avCodec);
|
|
171
|
+
const AVPixelFormat* getSupportedPixelFormats(const AVCodec& avCodec);
|
|
164
172
|
|
|
165
173
|
int getNumChannels(const UniqueAVFrame& avFrame);
|
|
166
174
|
int getNumChannels(const UniqueAVCodecContext& avCodecContext);
|
|
@@ -224,4 +232,14 @@ AVIOContext* avioAllocContext(
|
|
|
224
232
|
AVIOWriteFunction write_packet,
|
|
225
233
|
AVIOSeekFunction seek);
|
|
226
234
|
|
|
235
|
+
double ptsToSeconds(int64_t pts, const AVRational& timeBase);
|
|
236
|
+
int64_t secondsToClosestPts(double seconds, const AVRational& timeBase);
|
|
237
|
+
int64_t computeSafeDuration(
|
|
238
|
+
const AVRational& frameRate,
|
|
239
|
+
const AVRational& timeBase);
|
|
240
|
+
|
|
241
|
+
AVFilterContext* createBuffersinkFilter(
|
|
242
|
+
AVFilterGraph* filterGraph,
|
|
243
|
+
enum AVPixelFormat outputFormat);
|
|
244
|
+
|
|
227
245
|
} // namespace facebook::torchcodec
|
torchcodec/_core/FilterGraph.cpp
CHANGED
|
@@ -5,6 +5,7 @@
|
|
|
5
5
|
// LICENSE file in the root directory of this source tree.
|
|
6
6
|
|
|
7
7
|
#include "src/torchcodec/_core/FilterGraph.h"
|
|
8
|
+
#include "src/torchcodec/_core/FFMPEGCommon.h"
|
|
8
9
|
|
|
9
10
|
extern "C" {
|
|
10
11
|
#include <libavfilter/buffersink.h>
|
|
@@ -13,6 +14,28 @@ extern "C" {
|
|
|
13
14
|
|
|
14
15
|
namespace facebook::torchcodec {
|
|
15
16
|
|
|
17
|
+
// Member-wise constructor: each argument is stored in the field of the same
// name.
//
// `hwFramesCtx` is a raw pointer used to initialize the UniqueAVBufferRef
// member, so the constructed FiltersContext takes over that reference and it
// will be released through the member's deleter.
FiltersContext::FiltersContext(
    int inputWidth,
    int inputHeight,
    AVPixelFormat inputFormat,
    AVRational inputAspectRatio,
    int outputWidth,
    int outputHeight,
    AVPixelFormat outputFormat,
    const std::string& filtergraphStr,
    AVRational timeBase,
    AVBufferRef* hwFramesCtx)
    // Input side.
    : inputWidth(inputWidth),
      inputHeight(inputHeight),
      inputFormat(inputFormat),
      inputAspectRatio(inputAspectRatio),
      // Output side.
      outputWidth(outputWidth),
      outputHeight(outputHeight),
      outputFormat(outputFormat),
      // Graph description, timing, and optional hardware frames context.
      filtergraphStr(filtergraphStr),
      timeBase(timeBase),
      hwFramesCtx(hwFramesCtx) {}
|
|
38
|
+
|
|
16
39
|
bool operator==(const AVRational& lhs, const AVRational& rhs) {
|
|
17
40
|
return lhs.num == rhs.num && lhs.den == rhs.den;
|
|
18
41
|
}
|
|
@@ -41,7 +64,6 @@ FilterGraph::FilterGraph(
|
|
|
41
64
|
}
|
|
42
65
|
|
|
43
66
|
const AVFilter* buffersrc = avfilter_get_by_name("buffer");
|
|
44
|
-
const AVFilter* buffersink = avfilter_get_by_name("buffersink");
|
|
45
67
|
|
|
46
68
|
UniqueAVBufferSrcParameters srcParams(av_buffersrc_parameters_alloc());
|
|
47
69
|
TORCH_CHECK(srcParams, "Failed to allocate buffersrc params");
|
|
@@ -71,26 +93,10 @@ FilterGraph::FilterGraph(
|
|
|
71
93
|
"Failed to create filter graph : ",
|
|
72
94
|
getFFMPEGErrorStringFromErrorCode(status));
|
|
73
95
|
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
TORCH_CHECK(
|
|
77
|
-
status >= 0,
|
|
78
|
-
"Failed to create filter graph: ",
|
|
79
|
-
getFFMPEGErrorStringFromErrorCode(status));
|
|
80
|
-
|
|
81
|
-
enum AVPixelFormat pix_fmts[] = {
|
|
82
|
-
filtersContext.outputFormat, AV_PIX_FMT_NONE};
|
|
83
|
-
|
|
84
|
-
status = av_opt_set_int_list(
|
|
85
|
-
sinkContext_,
|
|
86
|
-
"pix_fmts",
|
|
87
|
-
pix_fmts,
|
|
88
|
-
AV_PIX_FMT_NONE,
|
|
89
|
-
AV_OPT_SEARCH_CHILDREN);
|
|
96
|
+
sinkContext_ =
|
|
97
|
+
createBuffersinkFilter(filterGraph_.get(), filtersContext.outputFormat);
|
|
90
98
|
TORCH_CHECK(
|
|
91
|
-
|
|
92
|
-
"Failed to set output pixel formats: ",
|
|
93
|
-
getFFMPEGErrorStringFromErrorCode(status));
|
|
99
|
+
sinkContext_ != nullptr, "Failed to create and configure buffersink");
|
|
94
100
|
|
|
95
101
|
UniqueAVFilterInOut outputs(avfilter_inout_alloc());
|
|
96
102
|
UniqueAVFilterInOut inputs(avfilter_inout_alloc());
|
|
@@ -117,7 +123,8 @@ FilterGraph::FilterGraph(
|
|
|
117
123
|
TORCH_CHECK(
|
|
118
124
|
status >= 0,
|
|
119
125
|
"Failed to parse filter description: ",
|
|
120
|
-
getFFMPEGErrorStringFromErrorCode(status)
|
|
126
|
+
getFFMPEGErrorStringFromErrorCode(status),
|
|
127
|
+
", provided filters: " + filtersContext.filtergraphStr);
|
|
121
128
|
|
|
122
129
|
status = avfilter_graph_config(filterGraph_.get(), nullptr);
|
|
123
130
|
TORCH_CHECK(
|
torchcodec/_core/FilterGraph.h
CHANGED
|
@@ -19,11 +19,25 @@ struct FiltersContext {
|
|
|
19
19
|
int outputWidth = 0;
|
|
20
20
|
int outputHeight = 0;
|
|
21
21
|
AVPixelFormat outputFormat = AV_PIX_FMT_NONE;
|
|
22
|
-
|
|
23
22
|
std::string filtergraphStr;
|
|
24
23
|
AVRational timeBase = {0, 0};
|
|
25
24
|
UniqueAVBufferRef hwFramesCtx;
|
|
26
25
|
|
|
26
|
+
FiltersContext() = default;
|
|
27
|
+
FiltersContext(FiltersContext&&) = default;
|
|
28
|
+
FiltersContext& operator=(FiltersContext&&) = default;
|
|
29
|
+
FiltersContext(
|
|
30
|
+
int inputWidth,
|
|
31
|
+
int inputHeight,
|
|
32
|
+
AVPixelFormat inputFormat,
|
|
33
|
+
AVRational inputAspectRatio,
|
|
34
|
+
int outputWidth,
|
|
35
|
+
int outputHeight,
|
|
36
|
+
AVPixelFormat outputFormat,
|
|
37
|
+
const std::string& filtergraphStr,
|
|
38
|
+
AVRational timeBase,
|
|
39
|
+
AVBufferRef* hwFramesCtx = nullptr);
|
|
40
|
+
|
|
27
41
|
bool operator==(const FiltersContext&) const;
|
|
28
42
|
bool operator!=(const FiltersContext&) const;
|
|
29
43
|
};
|
torchcodec/_core/Frame.cpp
CHANGED
|
@@ -8,24 +8,34 @@
|
|
|
8
8
|
|
|
9
9
|
namespace facebook::torchcodec {
|
|
10
10
|
|
|
11
|
+
FrameBatchOutput::FrameBatchOutput(
|
|
12
|
+
int64_t numFrames,
|
|
13
|
+
const FrameDims& outputDims,
|
|
14
|
+
const torch::Device& device)
|
|
15
|
+
: ptsSeconds(torch::empty({numFrames}, {torch::kFloat64})),
|
|
16
|
+
durationSeconds(torch::empty({numFrames}, {torch::kFloat64})) {
|
|
17
|
+
data = allocateEmptyHWCTensor(outputDims, device, numFrames);
|
|
18
|
+
}
|
|
19
|
+
|
|
11
20
|
// Allocates an uninitialized uint8 strided tensor on `device`, shaped
// (height, width, 3), or (numFrames, height, width, 3) when `numFrames` is
// provided. Raises via TORCH_CHECK if height or width is not strictly positive,
// or if numFrames is negative.
torch::Tensor allocateEmptyHWCTensor(
    const FrameDims& frameDims,
    const torch::Device& device,
    std::optional<int> numFrames) {
  const auto options = torch::TensorOptions()
                           .dtype(torch::kUInt8)
                           .layout(torch::kStrided)
                           .device(device);
  TORCH_CHECK(
      frameDims.height > 0, "height must be > 0, got: ", frameDims.height);
  TORCH_CHECK(frameDims.width > 0, "width must be > 0, got: ", frameDims.width);

  // Single-frame case: no leading batch dimension.
  if (!numFrames.has_value()) {
    return torch::empty({frameDims.height, frameDims.width, 3}, options);
  }

  const int batchSize = *numFrames;
  TORCH_CHECK(batchSize >= 0, "numFrames must be >= 0, got: ", batchSize);
  return torch::empty(
      {batchSize, frameDims.height, frameDims.width, 3}, options);
}
|
|
31
41
|
|
torchcodec/_core/Frame.h
CHANGED
|
@@ -13,6 +13,15 @@
|
|
|
13
13
|
|
|
14
14
|
namespace facebook::torchcodec {
|
|
15
15
|
|
|
16
|
+
// Height and width of a frame, both defaulting to 0.
struct FrameDims {
  int height{0};
  int width{0};

  FrameDims() = default;

  // Note the argument order: height first, then width.
  FrameDims(int h, int w) : height{h}, width{w} {}
};
|
|
24
|
+
|
|
16
25
|
// All public video decoding entry points return either a FrameOutput or a
|
|
17
26
|
// FrameBatchOutput.
|
|
18
27
|
// They are the equivalent of the user-facing Frame and FrameBatch classes in
|
|
@@ -34,10 +43,10 @@ struct FrameBatchOutput {
|
|
|
34
43
|
torch::Tensor ptsSeconds; // 1D of shape (N,)
|
|
35
44
|
torch::Tensor durationSeconds; // 1D of shape (N,)
|
|
36
45
|
|
|
37
|
-
|
|
46
|
+
FrameBatchOutput(
|
|
38
47
|
int64_t numFrames,
|
|
39
|
-
const
|
|
40
|
-
const
|
|
48
|
+
const FrameDims& outputDims,
|
|
49
|
+
const torch::Device& device);
|
|
41
50
|
};
|
|
42
51
|
|
|
43
52
|
struct AudioFramesOutput {
|
|
@@ -49,70 +58,15 @@ struct AudioFramesOutput {
|
|
|
49
58
|
// FRAME TENSOR ALLOCATION APIs
|
|
50
59
|
// --------------------------------------------------------------------------
|
|
51
60
|
|
|
52
|
-
// Note [Frame Tensor allocation
|
|
61
|
+
// Note [Frame Tensor allocation]
|
|
53
62
|
//
|
|
54
63
|
// We always allocate [N]HWC tensors. The low-level decoding functions all
|
|
55
64
|
// assume HWC tensors, since this is what FFmpeg natively handles. It's up to
|
|
56
65
|
// the high-level decoding entry-points to permute that back to CHW, by calling
|
|
57
66
|
// maybePermuteHWC2CHW().
|
|
58
|
-
//
|
|
59
|
-
// Also, importantly, the way we figure out the the height and width of the
|
|
60
|
-
// output frame tensor varies, and depends on the decoding entry-point. In
|
|
61
|
-
// *decreasing order of accuracy*, we use the following sources for determining
|
|
62
|
-
// height and width:
|
|
63
|
-
// - getHeightAndWidthFromResizedAVFrame(). This is the height and width of the
|
|
64
|
-
// AVframe, *post*-resizing. This is only used for single-frame decoding APIs,
|
|
65
|
-
// on CPU, with filtergraph.
|
|
66
|
-
// - getHeightAndWidthFromOptionsOrAVFrame(). This is the height and width from
|
|
67
|
-
// the user-specified options if they exist, or the height and width of the
|
|
68
|
-
// AVFrame *before* it is resized. In theory, i.e. if there are no bugs within
|
|
69
|
-
// our code or within FFmpeg code, this should be exactly the same as
|
|
70
|
-
// getHeightAndWidthFromResizedAVFrame(). This is used by single-frame
|
|
71
|
-
// decoding APIs, on CPU with swscale, and on GPU.
|
|
72
|
-
// - getHeightAndWidthFromOptionsOrMetadata(). This is the height and width from
|
|
73
|
-
// the user-specified options if they exist, or the height and width form the
|
|
74
|
-
// stream metadata, which itself got its value from the CodecContext, when the
|
|
75
|
-
// stream was added. This is used by batch decoding APIs, for both GPU and
|
|
76
|
-
// CPU.
|
|
77
|
-
//
|
|
78
|
-
// The source of truth for height and width really is the (resized) AVFrame: it
|
|
79
|
-
// comes from the decoded ouptut of FFmpeg. The info from the metadata (i.e.
|
|
80
|
-
// from the CodecContext) may not be as accurate. However, the AVFrame is only
|
|
81
|
-
// available late in the call stack, when the frame is decoded, while the
|
|
82
|
-
// CodecContext is available early when a stream is added. This is why we use
|
|
83
|
-
// the CodecContext for pre-allocating batched output tensors (we could
|
|
84
|
-
// pre-allocate those only once we decode the first frame to get the info frame
|
|
85
|
-
// the AVFrame, but that's a more complex logic).
|
|
86
|
-
//
|
|
87
|
-
// Because the sources for height and width may disagree, we may end up with
|
|
88
|
-
// conflicts: e.g. if we pre-allocate a batch output tensor based on the
|
|
89
|
-
// metadata info, but the decoded AVFrame has a different height and width.
|
|
90
|
-
// it is very important to check the height and width assumptions where the
|
|
91
|
-
// tensors memory is used/filled in order to avoid segfaults.
|
|
92
|
-
|
|
93
|
-
struct FrameDims {
|
|
94
|
-
int height;
|
|
95
|
-
int width;
|
|
96
|
-
|
|
97
|
-
FrameDims(int h, int w) : height(h), width(w) {}
|
|
98
|
-
};
|
|
99
|
-
|
|
100
|
-
// There's nothing preventing you from calling this on a non-resized frame, but
|
|
101
|
-
// please don't.
|
|
102
|
-
FrameDims getHeightAndWidthFromResizedAVFrame(const AVFrame& resizedAVFrame);
|
|
103
|
-
|
|
104
|
-
FrameDims getHeightAndWidthFromOptionsOrMetadata(
|
|
105
|
-
const VideoStreamOptions& videoStreamOptions,
|
|
106
|
-
const StreamMetadata& streamMetadata);
|
|
107
|
-
|
|
108
|
-
FrameDims getHeightAndWidthFromOptionsOrAVFrame(
|
|
109
|
-
const VideoStreamOptions& videoStreamOptions,
|
|
110
|
-
const UniqueAVFrame& avFrame);
|
|
111
|
-
|
|
112
67
|
torch::Tensor allocateEmptyHWCTensor(
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
torch::Device device,
|
|
68
|
+
const FrameDims& frameDims,
|
|
69
|
+
const torch::Device& device,
|
|
116
70
|
std::optional<int> numFrames = std::nullopt);
|
|
117
71
|
|
|
118
72
|
} // namespace facebook::torchcodec
|
torchcodec/_core/Metadata.h
CHANGED
|
@@ -44,8 +44,8 @@ struct StreamMetadata {
|
|
|
44
44
|
std::optional<int64_t> numFramesFromContent;
|
|
45
45
|
|
|
46
46
|
// Video-only fields derived from the AVCodecContext.
|
|
47
|
-
std::optional<
|
|
48
|
-
std::optional<
|
|
47
|
+
std::optional<int> width;
|
|
48
|
+
std::optional<int> height;
|
|
49
49
|
std::optional<AVRational> sampleAspectRatio;
|
|
50
50
|
|
|
51
51
|
// Audio-only fields
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
// Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
2
|
+
// All rights reserved.
|
|
3
|
+
//
|
|
4
|
+
// This source code is licensed under the BSD-style license found in the
|
|
5
|
+
// LICENSE file in the root directory of this source tree.
|
|
6
|
+
|
|
7
|
+
#include <torch/types.h>
|
|
8
|
+
#include <mutex>
|
|
9
|
+
|
|
10
|
+
#include "src/torchcodec/_core/FFMPEGCommon.h"
|
|
11
|
+
#include "src/torchcodec/_core/NVDECCache.h"
|
|
12
|
+
|
|
13
|
+
#include <cuda_runtime.h> // For cudaGetDevice
|
|
14
|
+
|
|
15
|
+
extern "C" {
|
|
16
|
+
#include <libavutil/hwcontext_cuda.h>
|
|
17
|
+
#include <libavutil/pixdesc.h>
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
namespace facebook::torchcodec {
|
|
21
|
+
|
|
22
|
+
// Returns the cache instance associated with `deviceIndex`. One instance exists
// per device slot, held in a function-local static array. An index of -1 means
// "the current CUDA device" and is resolved through cudaGetDevice().
// Raises via TORCH_CHECK if the index (as given, or once resolved) falls
// outside the supported range, or if the current device cannot be queried.
NVDECCache& NVDECCache::getCache(int deviceIndex) {
  constexpr int MAX_CUDA_GPUS = 128;
  TORCH_CHECK(
      deviceIndex >= -1 && deviceIndex < MAX_CUDA_GPUS,
      "Invalid device index = ",
      deviceIndex);
  static NVDECCache cacheInstances[MAX_CUDA_GPUS];
  if (deviceIndex == -1) {
    // TODO NVDEC P3: Unify with existing getNonNegativeDeviceIndex()
    const cudaError_t status = cudaGetDevice(&deviceIndex);
    TORCH_CHECK(status == cudaSuccess, "Failed to get current CUDA device.");
    // The resolved index is used to subscript the array below, so it must be
    // range-checked as well; the check above only saw the -1 placeholder.
    TORCH_CHECK(
        deviceIndex >= 0 && deviceIndex < MAX_CUDA_GPUS,
        "Invalid device index = ",
        deviceIndex);
  }
  return cacheInstances[deviceIndex];
}
|
|
37
|
+
|
|
38
|
+
// Looks up a cached decoder under the key built from `videoFormat`. On a hit
// the decoder is removed from the cache and handed to the caller; on a miss a
// null handle is returned. The lookup and removal happen under cacheLock_.
UniqueCUvideodecoder NVDECCache::getDecoder(CUVIDEOFORMAT* videoFormat) {
  CacheKey lookupKey(videoFormat);
  std::lock_guard<std::mutex> guard(cacheLock_);

  auto entry = cache_.find(lookupKey);
  if (entry == cache_.end()) {
    return nullptr;
  }

  // Take the decoder out of the entry before erasing the entry itself.
  auto cachedDecoder = std::move(entry->second);
  cache_.erase(entry);
  return cachedDecoder;
}
|
|
51
|
+
|
|
52
|
+
// Offers `decoder` back to the cache under the key built from `videoFormat`.
// Returns true if the cache stored it. Returns false, without storing, when
// `decoder` is null or the cache already holds MAX_CACHE_SIZE entries.
bool NVDECCache::returnDecoder(
    CUVIDEOFORMAT* videoFormat,
    UniqueCUvideodecoder decoder) {
  if (decoder == nullptr) {
    return false;
  }

  CacheKey storeKey(videoFormat);
  std::lock_guard<std::mutex> guard(cacheLock_);

  const bool hasRoom = cache_.size() < MAX_CACHE_SIZE;
  if (hasRoom) {
    cache_[storeKey] = std::move(decoder);
  }
  return hasRoom;
}
|
|
69
|
+
|
|
70
|
+
} // namespace facebook::torchcodec
|